From: Daniel Gryniewicz Date: Tue, 23 Sep 2025 15:39:13 +0000 (-0400) Subject: RGW - Fix dedup build X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F65643%2Fhead;p=ceph.git RGW - Fix dedup build Dedup is RadosStore specific, so move it to driver/rados, and only start it if radosstore is being built. Signed-off-by: Daniel Gryniewicz --- diff --git a/src/rgw/CMakeLists.txt b/src/rgw/CMakeLists.txt index 3699ec33c318..0fd7a8bca473 100644 --- a/src/rgw/CMakeLists.txt +++ b/src/rgw/CMakeLists.txt @@ -140,11 +140,6 @@ set(librgw_common_srcs rgw_bucket_encryption.cc rgw_tracer.cc rgw_lua_background.cc - rgw_dedup.cc - rgw_dedup_table.cc - rgw_dedup_store.cc - rgw_dedup_utils.cc - rgw_dedup_cluster.cc rgw_data_access.cc rgw_realm_watcher.cc rgw_bucket_logging.cc @@ -236,7 +231,12 @@ if(WITH_RADOSGW_RADOS) driver/rados/config/realm_watcher.cc driver/rados/config/store.cc driver/rados/config/zone.cc - driver/rados/config/zonegroup.cc) + driver/rados/config/zonegroup.cc + driver/rados/rgw_dedup.cc + driver/rados/rgw_dedup_table.cc + driver/rados/rgw_dedup_store.cc + driver/rados/rgw_dedup_utils.cc + driver/rados/rgw_dedup_cluster.cc) endif() if(WITH_RADOSGW_AMQP_ENDPOINT) list(APPEND librgw_common_srcs rgw_amqp.cc) diff --git a/src/rgw/driver/rados/rgw_dedup.cc b/src/rgw/driver/rados/rgw_dedup.cc new file mode 100644 index 000000000000..7c00ddf6f2a7 --- /dev/null +++ b/src/rgw/driver/rados/rgw_dedup.cc @@ -0,0 +1,2704 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Author: Gabriel BenHanokh + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "include/rados/rados_types.hpp" +#include "include/rados/buffer.h" +#include "include/rados/librados.hpp" +#include "rgw_tools.h" +#include "svc_zone.h" +#include "common/config.h" +#include "common/Cond.h" +#include "common/debug.h" +#include "common/errno.h" +#include "rgw_common.h" +#include "rgw_sal.h" +#include "rgw_zone.h" +#include "rgw_cache.h" +#include "rgw_acl.h" +#include "rgw_acl_s3.h" /* for dumping s3policy in debug log */ +#include "rgw_aio_throttle.h" +#include "driver/rados/rgw_bucket.h" +#include "rgw_sal_config.h" +#include "rgw_lib.h" +#include "rgw_placement_types.h" +#include "driver/rados/rgw_bucket.h" +#include "driver/rados/rgw_sal_rados.h" +#include "cls/rgw/cls_rgw_ops.h" +#include "cls/rgw/cls_rgw_client.h" +#include "cls/rgw/cls_rgw_const.h" +#include "cls/refcount/cls_refcount_client.h" +#include "cls/version/cls_version_client.h" +#include "fmt/ranges.h" +#include "osd/osd_types.h" +#include "common/ceph_crypto.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +//using namespace std::chrono_literals; +using namespace librados; +using namespace std; +using namespace rgw::dedup; + +#include "rgw_dedup_remap.h" +#include "rgw_sal_rados.h" +#include "rgw_dedup_table.h" +#include "rgw_dedup_utils.h" +#include "rgw_dedup.h" +#include "rgw_dedup_store.h" +#include "rgw_dedup_cluster.h" +#include "rgw_dedup_epoch.h" +#include "rgw_perf_counters.h" +#include "include/ceph_assert.h" + +static constexpr auto dout_subsys = ceph_subsys_rgw_dedup; + +namespace rgw::dedup { + static inline constexpr unsigned MAX_STORAGE_CLASS_IDX = 128; + using storage_class_idx_t = uint8_t; + + //--------------------------------------------------------------------------- + void Background::DedupWatcher::handle_notify(uint64_t notify_id, uint64_t cookie, + uint64_t notifier_id, bufferlist &bl) + { + 
ldpp_dout(parent->dpp, 10) << __func__ << "::notify_id=" << notify_id + << "::cookie=" << cookie + << "::notifier_id=" << notifier_id << dendl; + if (parent->d_watch_handle != cookie) { + ldpp_dout(parent->dpp, 1) << __func__ << "::ERR: wrong cookie=" << cookie + << "::d_watch_handle=" << parent->d_watch_handle + << dendl; + return; + } + parent->handle_notify(notify_id, cookie, bl); + } + + //--------------------------------------------------------------------------- + void Background::DedupWatcher::handle_error(uint64_t cookie, int err) + { + if (parent->d_watch_handle != cookie) { + ldpp_dout(parent->dpp, 1) << __func__ << "::ERR: wrong cookie=" << cookie + << "::d_watch_handle=" << parent->d_watch_handle + << dendl; + return; + } + ldpp_dout(parent->dpp, 1) << __func__ << "::error=" << err << dendl; + + parent->unwatch_reload(parent->dpp); + parent->watch_reload(parent->dpp); + } + + //--------------------------------------------------------------------------- + void control_t::reset() + { + this->dedup_type = dedup_req_type_t::DEDUP_TYPE_NONE; + this->started = false; + this->dedup_exec = false; + this->shutdown_req = false; + this->shutdown_done = false; + this->local_pause_req = false; + this->local_paused = false; + this->remote_abort_req = false; + this->remote_aborted = false; + this->remote_pause_req = false; + this->remote_paused = false; + this->remote_restart_req = false; + } + + //--------------------------------------------------------------------------- + void encode(const control_t& ctl, ceph::bufferlist& bl) + { + ENCODE_START(1, 1, bl); + encode(static_cast(ctl.dedup_type), bl); + encode(ctl.started, bl); + encode(ctl.dedup_exec, bl); + encode(ctl.shutdown_req, bl); + encode(ctl.shutdown_done, bl); + encode(ctl.local_pause_req, bl); + encode(ctl.local_paused, bl); + encode(ctl.remote_abort_req, bl); + encode(ctl.remote_aborted, bl); + encode(ctl.remote_pause_req, bl); + encode(ctl.remote_paused, bl); + encode(ctl.remote_restart_req, bl); + 
ENCODE_FINISH(bl); + } + + //--------------------------------------------------------------------------- + void decode(control_t& ctl, ceph::bufferlist::const_iterator& bl) + { + DECODE_START(1, bl); + int32_t dedup_type; + decode(dedup_type, bl); + ctl.dedup_type = static_cast (dedup_type); + decode(ctl.started, bl); + decode(ctl.dedup_exec, bl); + decode(ctl.shutdown_req, bl); + decode(ctl.shutdown_done, bl); + decode(ctl.local_pause_req, bl); + decode(ctl.local_paused, bl); + decode(ctl.remote_abort_req, bl); + decode(ctl.remote_aborted, bl); + decode(ctl.remote_pause_req, bl); + decode(ctl.remote_paused, bl); + decode(ctl.remote_restart_req, bl); + DECODE_FINISH(bl); + } + + //--------------------------------------------------------------------------- + std::ostream& operator<<(std::ostream &out, const control_t &ctl) + { + out << ctl.dedup_type; + if (ctl.started) { + out << "::started"; + } + if (ctl.dedup_exec) { + out << "::dedup_exec"; + } + if (ctl.shutdown_req) { + out << "::shutdown_req"; + } + if (ctl.shutdown_done) { + out << "::shutdown_done"; + } + if (ctl.local_pause_req) { + out << "::local_pause_req"; + } + if (ctl.local_paused) { + out << "::local_paused"; + } + if (ctl.remote_abort_req) { + out << "::remote_abort_req"; + } + if (ctl.remote_aborted) { + out << "::remote_aborted"; + } + if (ctl.remote_pause_req) { + out << "::remote_pause_req"; + } + if (ctl.remote_paused) { + out << "::remote_paused"; + } + if (ctl.remote_restart_req) { + out << "::remote_restart_req"; + } + + return out; + } + + //=========================================================================== + // rgw::dedup::Background + //=========================================================================== + //--------------------------------------------------------------------------- + static void display_ioctx_state(const DoutPrefixProvider *dpp, + const librados::IoCtx &ioctx, + const char *caller) + { + if (ioctx.is_valid()) { + ldpp_dout(dpp, 5) << caller << "::valid 
ioctx, instance_id=" + << ioctx.get_instance_id() << dendl; + } + else { + ldpp_dout(dpp, 5) << caller << "::invalid ioctx" << dendl; + } + } + + //--------------------------------------------------------------------------- + static int safe_pool_delete(rgw::sal::RadosStore *store, + const DoutPrefixProvider *dpp, + int64_t expected_pool_id) + { + const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool; + auto rados_handle = store->getRados()->get_rados_handle(); + int64_t pool_id = rados_handle->pool_lookup(dedup_pool.name.c_str()); + if (pool_id < 0) { + int err = pool_id; + if (err == ENOENT) { + ldpp_dout(dpp, 10) <<__func__ << "::pool doesn't exist (probably was removed by other RGW)::" + << dedup_pool.name << "::expected_pool_id=" + << expected_pool_id << dendl; + } + else { + ldpp_dout(dpp, 5) <<__func__ << "::failed pool_lookup(" << dedup_pool.name + << ") err=" << cpp_strerror(-err) << dendl; + } + return err; + } + + if (pool_id != expected_pool_id) { + ldpp_dout(dpp, 5) << __func__ << "::ERR: pool_id was changed from: " + << expected_pool_id << " to: " << pool_id + << " abort pool_delete() request!" 
<< dendl; + // report Stale file handle + return -ESTALE; + } + + ldpp_dout(dpp, 10) <<__func__ << "::calling delete pool(" << dedup_pool.name + << ") pool_id=" << pool_id << dendl; + return rados_handle->pool_delete(dedup_pool.name.c_str()); + } + + //--------------------------------------------------------------------------- + static int64_t create_pool(rgw::sal::RadosStore *store, + const DoutPrefixProvider *dpp, + const std::string &pool_name) + { +#if 0 + // using Replica-1 for the intermediate data + // since it can be regenerated in case of a failure + std::string replica_count(std::to_string(1)); +#else + // temporary solution until we find a way to disable the health warn on replica1 + std::string replica_count(std::to_string(2)); +#endif + librados::bufferlist inbl; + std::string output; + std::string command = R"( + { + "prefix": "osd pool create", + "pool": ")" + pool_name + + R"(", + "pool_type": "replicated", + "size": )" + replica_count + + R"( + })"; + + auto rados_handle = store->getRados()->get_rados_handle(); + int ret = rados_handle->mon_command(command, inbl, nullptr, &output); + if (output.length()) { + if (output != "pool 'rgw_dedup_pool' already exists") { + ldpp_dout(dpp, 10) << __func__ << "::" << output << dendl; + } + } + if (ret != 0 && ret != -EEXIST) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to create pool " + << pool_name << " with: " + << cpp_strerror(-ret) << ", ret=" << ret << dendl; + return ret; + } + const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool; + return rados_handle->pool_lookup(dedup_pool.name.c_str()); + } + + //--------------------------------------------------------------------------- + static int init_dedup_pool_ioctx(rgw::sal::RadosStore *store, + const DoutPrefixProvider *dpp, + bool create, + librados::IoCtx &ioctx) + { + const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool; + std::string pool_name(dedup_pool.name.c_str()); + auto rados_handle = 
store->getRados()->get_rados_handle(); + int64_t pool_id = rados_handle->pool_lookup(dedup_pool.name.c_str()); + if (pool_id >= 0) { + // TBD: what to do when create option is passed + ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name + << " already exists, pool_id=" << pool_id << dendl; + } + else if (create) { + pool_id = create_pool(store, dpp, pool_name); + if (pool_id >= 0) { + ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name + << " was created, pool_id=" << pool_id << dendl; + } + else { + return pool_id; + } + } + else { + ldpp_dout(dpp, 1) << __func__ + << "::ERR: pool doesn't exist and no create option" << dendl; + return -ENOENT; + } + + int ret = rgw_init_ioctx(dpp, rados_handle, dedup_pool, ioctx); + if (unlikely(ret < 0)) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed rgw_init_ioctx() ret=" << ret + << "::" << cpp_strerror(-ret) << dendl; + return ret; + } + + ret = ioctx.application_enable("rgw_dedup", false); + if (ret == 0) { + ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name + << " was associated with dedup app" << dendl; + } + else { + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to enable pool " + << dedup_pool.name << " with: " + << cpp_strerror(-ret) << ", ret=" << ret << dendl; + } + return ret; + } + + //--------------------------------------------------------------------------- + int Background::init_rados_access_handles(bool init_pool) + { + store = dynamic_cast(driver); + if (!store) { + ldpp_dout(dpp, 0) << "ERR: failed dynamic_cast to RadosStore" << dendl; + // this is the return code used in rgw_bucket.cc + return -ENOTSUP; + } + + rados = store->getRados(); + rados_handle = rados->get_rados_handle(); + if (init_pool) { + int ret = init_dedup_pool_ioctx(store, dpp, true, d_dedup_cluster_ioctx); + display_ioctx_state(dpp, d_dedup_cluster_ioctx, __func__); + return ret; + } + return 0; + } + + //--------------------------------------------------------------------------- + 
Background::Background(rgw::sal::Driver* _driver, CephContext* _cct) : + driver(_driver), + dp(_cct, dout_subsys, "dedup background: "), + dpp(&dp), + cct(_cct), + d_cluster(dpp, cct, driver), + d_watcher_ctx(this) + { + d_min_obj_size_for_dedup = cct->_conf->rgw_max_chunk_size; + d_head_object_size = cct->_conf->rgw_max_chunk_size; + //ceph_assert(4*1024*1024 == d_head_object_size); + + int ret = init_rados_access_handles(false); + if (ret != 0) { + derr << __func__ << "::ERR: failed init_rados_access_handles() ret=" + << ret << "::" << cpp_strerror(-ret) << dendl; + throw std::runtime_error("Failed init_rados_access_handles()"); + } + + d_heart_beat_last_update = ceph_clock_now(); + d_heart_beat_max_elapsed_sec = 3; + } + + //--------------------------------------------------------------------------- + int Background::add_disk_rec_from_bucket_idx(disk_block_array_t &disk_arr, + const rgw::sal::Bucket *p_bucket, + const parsed_etag_t *p_parsed_etag, + const std::string &obj_name, + uint64_t obj_size, + const std::string &storage_class) + { + disk_record_t rec(p_bucket, obj_name, p_parsed_etag, obj_size, storage_class); + // First pass using only ETAG and size taken from bucket-index + rec.s.flags.set_fastlane(); + + auto p_disk = disk_arr.get_shard_block_seq(p_parsed_etag->md5_low); + disk_block_seq_t::record_info_t rec_info; + int ret = p_disk->add_record(d_dedup_cluster_ioctx, &rec, &rec_info); + if (unlikely(ret != 0)) { + return ret; + } + ldpp_dout(dpp, 20) << __func__ << "::" << p_bucket->get_name() << "/" + << obj_name << " was written to block_idx=" + << rec_info.block_id << " rec_id=" << rec_info.rec_id << dendl; + return 0; + } + + //--------------------------------------------------------------------------- + int Background::add_record_to_dedup_table(dedup_table_t *p_table, + const disk_record_t *p_rec, + disk_block_id_t block_id, + record_id_t rec_id, + md5_stats_t *p_stats, + remapper_t *remapper) + { + uint32_t size_4k_units = 
byte_size_to_disk_blocks(p_rec->s.obj_bytes_size); + storage_class_idx_t sc_idx = remapper->remap(p_rec->stor_class, dpp, + &p_stats->failed_map_overflow); + if (unlikely(sc_idx == remapper_t::NULL_IDX)) { + // TBD: need stat counters + return -EOVERFLOW; + } + key_t key(p_rec->s.md5_high, p_rec->s.md5_low, size_4k_units, + p_rec->s.num_parts, sc_idx); + bool has_shared_manifest = p_rec->has_shared_manifest(); + ldpp_dout(dpp, 20) << __func__ << "::bucket=" << p_rec->bucket_name + << ", obj=" << p_rec->obj_name << ", block_id=" + << (uint32_t)block_id << ", rec_id=" << (uint32_t)rec_id + << ", shared_manifest=" << has_shared_manifest + << "::num_parts=" << p_rec->s.num_parts + << "::size_4k_units=" << key.size_4k_units + << "::ETAG=" << std::hex << p_rec->s.md5_high + << p_rec->s.md5_low << std::dec << dendl; + + int ret = p_table->add_entry(&key, block_id, rec_id, has_shared_manifest); + if (ret == 0) { + p_stats->loaded_objects ++; + ldpp_dout(dpp, 20) << __func__ << "::" << p_rec->bucket_name << "/" + << p_rec->obj_name << " was added successfully to table" + << "::loaded_objects=" << p_stats->loaded_objects << dendl; + } + else { + // We allocate memory for the dedup on startup based on the existing obj count + // If the system grew significantly since that point we won't be able to + // accommodate all the objects in the hash-table. + // Please keep in mind that it is very unlikely since duplicates objects will + // consume a single entry and since we skip small objects so in reality + // I expect the allocation to be more than sufficient. 
+ // + // However, if we filled up the system there is still value in continuing + // with this process since we might find duplicates to existing object (which + // don't take extra space) + + int level = 15; + if (p_stats->failed_table_load % 0x10000 == 0) { + level = 5; + } + else if (p_stats->failed_table_load % 0x100 == 0) { + level = 10; + } + ldpp_dout(dpp, level) << __func__ << "::Failed p_table->add_entry (overflow)" + << "::loaded_objects=" << p_stats->loaded_objects + << "::failed_table_load=" << p_stats->failed_table_load + << dendl; + + p_stats->failed_table_load++; + } + return ret; + } + +#ifdef FULL_DEDUP_SUPPORT + + static constexpr uint64_t cost = 1; // 1 throttle unit per request + static constexpr uint64_t id = 0; // ids unused + //--------------------------------------------------------------------------- + // Debug helper: read the refcount tags of @oid and log each one together + // with its position in the returned list. + [[maybe_unused]]static void show_ref_tags(const DoutPrefixProvider* dpp, std::string &oid, rgw_rados_ref &obj) + { + unsigned idx = 0; + std::list refs; + std::string wildcard_tag; + int ret = cls_refcount_read(obj.ioctx, oid, &refs, true); + if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << "::ERR: manifest::failed cls_refcount_read()" + << " idx=" << idx << dendl; + return; + } + + // advance idx with the iterator so every tag is logged with its own index + for (list::iterator iter = refs.begin(); iter != refs.end(); ++iter, ++idx) { + ldpp_dout(dpp, 20) << __func__ << "::manifest::" << oid << "::" << idx + << "::TAG=" << *iter << dendl; + } + } + + //--------------------------------------------------------------------------- + int Background::free_tail_objs_by_manifest(const string &ref_tag, + const string &oid, + RGWObjManifest &tgt_manifest) + { + unsigned idx = 0; + for (auto p = tgt_manifest.obj_begin(dpp); p != tgt_manifest.obj_end(dpp); ++p, ++idx) { + rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados); + if (oid == raw_obj.oid) { + ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: " << raw_obj.oid << dendl; + continue; + } + + rgw_rados_ref obj; + int ret = rgw_get_rados_ref(dpp, 
rados_handle, raw_obj, &obj); + if (ret < 0) { + ldpp_dout(dpp, 1) << __func__ << "ERR: manifest::failed to open context " + << obj << dendl; + continue; + } + librados::IoCtx ioctx = obj.ioctx; + ldpp_dout(dpp, 20) << __func__ << "::removing tail object: " << raw_obj.oid + << dendl; + ret = ioctx.remove(raw_obj.oid); + } + + return 0; + } + + //--------------------------------------------------------------------------- + int Background::rollback_ref_by_manifest(const string &ref_tag, + const string &oid, + RGWObjManifest &manifest) + { + unsigned idx = 0; + int ret_code = 0; + std::unique_ptr aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, null_yield); + for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p, ++idx) { + rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados); + if (oid == raw_obj.oid) { + ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: " + << raw_obj.oid << dendl; + continue; + } + + rgw_rados_ref obj; + int local_ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj); + if (local_ret < 0) { + ret_code = local_ret; + ldpp_dout(dpp, 1) << __func__ << "::ERR: manifest::failed to open context " + << obj << dendl; + // skip bad objects, nothing we can do + continue; + } + + ObjectWriteOperation op; + cls_refcount_put(op, ref_tag, true); + rgw::AioResultList completed = aio->get(obj.obj, + rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield), + cost, id); + } + rgw::AioResultList completed = aio->drain(); + return ret_code; + } + + //--------------------------------------------------------------------------- + int Background::inc_ref_count_by_manifest(const string &ref_tag, + const string &oid, + RGWObjManifest &manifest) + { + std::unique_ptr aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, null_yield); + rgw::AioResultList all_results; + int ret = 0; + unsigned idx = 0; + for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p, ++idx) { + 
rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados); + if (oid == raw_obj.oid) { + ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: " << raw_obj.oid << dendl; + continue; + } + + rgw_rados_ref obj; + ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj); + if (ret < 0) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: manifest::failed to open context " + << obj << dendl; + break; + } + + ObjectWriteOperation op; + cls_refcount_get(op, ref_tag, true); + ldpp_dout(dpp, 20) << __func__ << "::inc ref-count on tail object: " << raw_obj.oid << dendl; + rgw::AioResultList completed = aio->get(obj.obj, + rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield), + cost, id); + ret = rgw::check_for_errors(completed); + all_results.splice(all_results.end(), completed); + if (ret < 0) { + ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to copy obj=" << obj + << ", the error code = " << ret << dendl; + break; + } + } + + if (ret == 0) { + rgw::AioResultList completed = aio->drain(); + int ret = rgw::check_for_errors(completed); + all_results.splice(all_results.end(), completed); + if (ret == 0) { + return 0; + } + else { + ldpp_dout(dpp, 1) << __func__ << "::ERR: manifest: failed to drain ios ret=" + << ret < rollback all ref-inc operations + /* wait all pending op done */ + rgw::AioResultList completed = aio->drain(); + all_results.splice(all_results.end(), completed); + int ret2 = 0; + for (auto& aio_res : all_results) { + if (aio_res.result < 0) { + continue; // skip errors + } + rgw_rados_ref obj; + ret2 = rgw_get_rados_ref(dpp, rados_handle, aio_res.obj, &obj); + if (ret2 < 0) { + continue; + } + + ObjectWriteOperation op; + cls_refcount_put(op, ref_tag, true); + rgw::AioResultList completed = aio->get(obj.obj, + rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield), + cost, id); + ret2 = rgw::check_for_errors(completed); + if (ret2 < 0) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: cleanup after error failed to drop reference on obj=" << 
aio_res.obj << dendl; + } + } + completed = aio->drain(); + ret2 = rgw::check_for_errors(completed); + if (ret2 < 0) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to drain rollback ios, ret=" + << ret2 < bucket; + { + rgw_bucket b{p_rec->tenant_name, p_rec->bucket_name, p_rec->bucket_id}; + int ret = driver->load_bucket(dpp, b, &bucket, null_yield); + if (unlikely(ret != 0)) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: driver->load_bucket(): " + << cpp_strerror(-ret) << dendl; + return ret; + } + } + + build_oid(p_rec->bucket_id, p_rec->obj_name, oid); + //ldpp_dout(dpp, 0) << __func__ << "::OID=" << oid << " || bucket_id=" << bucket_id << dendl; + rgw_pool data_pool; + rgw_obj obj{bucket->get_key(), *oid}; + if (!rados->get_obj_data_pool(bucket->get_placement_rule(), obj, &data_pool)) { + ldpp_dout(dpp, 1) << __func__ << "::failed to get data pool for bucket " + << bucket->get_name() << dendl; + return -EIO; + } + int ret = rgw_init_ioctx(dpp, rados->get_rados_handle(), data_pool, *p_ioctx); + if (ret < 0) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to get ioctx from data pool:" + << data_pool.to_str() << dendl; + return -EIO; + } + + return 0; + } + + //--------------------------------------------------------------------------- + static void init_cmp_pairs(const disk_record_t *p_rec, + const bufferlist &etag_bl, + bufferlist &hash_bl, // OUT PARAM + librados::ObjectWriteOperation *p_op) + { + p_op->cmpxattr(RGW_ATTR_ETAG, CEPH_OSD_CMPXATTR_OP_EQ, etag_bl); + // TBD: do we really need the secondary compare using the full manifest? + // Can replace it with something cheaper like size/version? 
+ p_op->cmpxattr(RGW_ATTR_MANIFEST, CEPH_OSD_CMPXATTR_OP_EQ, p_rec->manifest_bl); + + // BLAKE3 hash has 256 bit splitted into multiple 64bit units + const unsigned units = (256 / (sizeof(uint64_t)*8)); + static_assert(units == 4); + for (unsigned i = 0; i < units; i++) { + ceph::encode(p_rec->s.hash[i], hash_bl); + } + + if (!p_rec->s.flags.hash_calculated()) { + p_op->cmpxattr(RGW_ATTR_BLAKE3, CEPH_OSD_CMPXATTR_OP_EQ, hash_bl); + } + } + + //--------------------------------------------------------------------------- + int Background::dedup_object(const disk_record_t *p_src_rec, + const disk_record_t *p_tgt_rec, + md5_stats_t *p_stats, + bool has_shared_manifest_src) + { + RGWObjManifest src_manifest; + try { + auto bl_iter = p_src_rec->manifest_bl.cbegin(); + decode(src_manifest, bl_iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad src manifest" << dendl; + return -EINVAL; + } + RGWObjManifest tgt_manifest; + try { + auto bl_iter = p_tgt_rec->manifest_bl.cbegin(); + decode(tgt_manifest, bl_iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad tgt manifest" << dendl; + return -EINVAL; + } + ldpp_dout(dpp, 20) << __func__ << "::DEDUP From: " + << p_src_rec->bucket_name << "/" << p_src_rec->obj_name << " -> " + << p_tgt_rec->bucket_name << "/" << p_tgt_rec->obj_name << dendl; + + bufferlist etag_bl; + etag_to_bufferlist(p_tgt_rec->s.md5_high, p_tgt_rec->s.md5_low, p_tgt_rec->s.num_parts, &etag_bl); + ldpp_dout(dpp, 20) << __func__ << "::num_parts=" << p_tgt_rec->s.num_parts + << "::ETAG=" << etag_bl.to_str() << dendl; + + bufferlist hash_bl, manifest_hash_bl, tgt_hash_bl; + crypto::digest(p_src_rec->manifest_bl).encode(hash_bl); + // Use a shorter hash (64bit instead of 160bit) + hash_bl.splice(0, 8, &manifest_hash_bl); + librados::ObjectWriteOperation tgt_op; + init_cmp_pairs(p_tgt_rec, etag_bl, tgt_hash_bl, &tgt_op); + tgt_op.setxattr(RGW_ATTR_SHARE_MANIFEST, manifest_hash_bl); + 
tgt_op.setxattr(RGW_ATTR_MANIFEST, p_src_rec->manifest_bl); + if (p_tgt_rec->s.flags.hash_calculated()) { + tgt_op.setxattr(RGW_ATTR_BLAKE3, tgt_hash_bl); + p_stats->set_hash_attrs++; + } + + std::string src_oid, tgt_oid; + librados::IoCtx src_ioctx, tgt_ioctx; + int ret1 = get_ioctx(dpp, driver, rados, p_src_rec, &src_ioctx, &src_oid); + int ret2 = get_ioctx(dpp, driver, rados, p_tgt_rec, &tgt_ioctx, &tgt_oid); + if (unlikely(ret1 != 0 || ret2 != 0)) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed get_ioctx()" << dendl; + return (ret1 ? ret1 : ret2); + } + + // TBD: Do we need to remove target RGW_ATTR_TAIL_TAG?? + string ref_tag = p_tgt_rec->ref_tag; + ldpp_dout(dpp, 20) << __func__ << "::ref_tag=" << ref_tag << dendl; + int ret = inc_ref_count_by_manifest(ref_tag, src_oid, src_manifest); + if (ret == 0) { + ldpp_dout(dpp, 20) << __func__ << "::send TGT CLS (Shared_Manifest)" << dendl; + ret = tgt_ioctx.operate(tgt_oid, &tgt_op); + if (unlikely(ret != 0)) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed tgt_ioctx.operate(" + << tgt_oid << "), err is " << cpp_strerror(-ret) << dendl; + rollback_ref_by_manifest(ref_tag, src_oid, src_manifest); + return ret; + } + + // free tail objects based on TGT manifest + free_tail_objs_by_manifest(ref_tag, tgt_oid, tgt_manifest); + + if (!has_shared_manifest_src) { + // When SRC OBJ A has two or more dups (B, C) we set SHARED_MANIFEST + // after deduping B and update it in dedup_table, but don't update the + // disk-record (as require an expensive random-disk-write). 
+ // When deduping C we can trust the shared_manifest state in the table and + // skip a redundant update to SRC object attribute + bufferlist src_hash_bl; + librados::ObjectWriteOperation src_op; + init_cmp_pairs(p_src_rec, etag_bl, src_hash_bl, &src_op); + src_op.setxattr(RGW_ATTR_SHARE_MANIFEST, manifest_hash_bl); + if (p_src_rec->s.flags.hash_calculated()) { + src_op.setxattr(RGW_ATTR_BLAKE3, src_hash_bl); + p_stats->set_hash_attrs++; + } + + ldpp_dout(dpp, 20) << __func__ <<"::send SRC CLS (Shared_Manifest)"<< dendl; + ret = src_ioctx.operate(src_oid, &src_op); + if (unlikely(ret != 0)) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed src_ioctx.operate(" + << src_oid << "), err is " << cpp_strerror(-ret)<obj_name << dendl; + return -EINVAL; + } + + blake3_hasher hmac; + blake3_hasher_init(&hmac); + for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p) { + rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados); + rgw_rados_ref obj; + int ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj); + if (ret < 0) { + ldpp_dout(dpp, 1) << __func__ << "::failed rgw_get_rados_ref() for oid: " + << raw_obj.oid << ", err is " << cpp_strerror(-ret) << dendl; + return ret; + } + + bufferlist bl; + librados::IoCtx ioctx = obj.ioctx; + // read full object + ret = ioctx.read(raw_obj.oid, bl, 0, 0); + if (ret > 0) { + for (const auto& bptr : bl.buffers()) { + blake3_hasher_update(&hmac, (const unsigned char *)bptr.c_str(), bptr.length()); + } + } + else { + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to read " << raw_obj.oid + << ", error is " << cpp_strerror(-ret) << dendl; + return ret; + } + } + + blake3_hasher_finalize(&hmac, p_hash, BLAKE3_OUT_LEN); + return 0; + } + + //--------------------------------------------------------------------------- + [[maybe_unused]]static void __attribute__ ((noinline)) + print_record(const DoutPrefixProvider* dpp, + const disk_record_t *p_tgt_rec, + disk_block_id_t block_id, + record_id_t rec_id, + md5_shard_t 
md5_shard) + { + ldpp_dout(dpp, 20) << __func__ << "::bucket=" << p_tgt_rec->bucket_name + << ", obj=" << p_tgt_rec->obj_name + << ", block_id=" << block_id + << ", rec_id=" << (int)rec_id + << ", md5_shard=" << (int)md5_shard << dendl; + + ldpp_dout(dpp, 20) << __func__ << "::md5_shard=" << (int)md5_shard + << "::" << p_tgt_rec->bucket_name + << "/" << p_tgt_rec->obj_name + << "::num_parts=" << p_tgt_rec->s.num_parts + << "::ETAG=" << std::hex << p_tgt_rec->s.md5_high + << p_tgt_rec->s.md5_low << std::dec << dendl; + } + + //--------------------------------------------------------------------------- + int Background::add_obj_attrs_to_record(rgw_bucket *p_rb, + disk_record_t *p_rec, + const rgw::sal::Attrs &attrs, + dedup_table_t *p_table, + md5_stats_t *p_stats) /*IN-OUT*/ + { + // if TAIL_TAG exists -> use it as ref-tag, otherwise take ID_TAG + auto itr = attrs.find(RGW_ATTR_TAIL_TAG); + if (itr != attrs.end()) { + p_rec->ref_tag = itr->second.to_str(); + } + else { + itr = attrs.find(RGW_ATTR_ID_TAG); + if (itr != attrs.end()) { + p_rec->ref_tag = itr->second.to_str(); + } + else { + ldpp_dout(dpp, 5) << __func__ << "::No TAIL_TAG and no ID_TAG" << dendl; + return -EINVAL; + } + } + p_rec->s.ref_tag_len = p_rec->ref_tag.length(); + + // clear bufferlist first + p_rec->manifest_bl.clear(); + + itr = attrs.find(RGW_ATTR_MANIFEST); + if (itr != attrs.end()) { + const bufferlist &bl = itr->second; + RGWObjManifest manifest; + try { + auto bl_iter = bl.cbegin(); + decode(manifest, bl_iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 1) << __func__ + << "::ERROR: unable to decode manifest" << dendl; + return -EINVAL; + } + + // force explicit tail_placement as the dedup could be on another bucket + const rgw_bucket_placement& tail_placement = manifest.get_tail_placement(); + if (tail_placement.bucket.name.empty()) { + ldpp_dout(dpp, 20) << __func__ << "dedup::updating tail placement" << dendl; + manifest.set_tail_placement(tail_placement.placement_rule, 
*p_rb); + encode(manifest, p_rec->manifest_bl); + } + else { + p_rec->manifest_bl = bl; + } + p_rec->s.manifest_len = p_rec->manifest_bl.length(); + } + else { + ldpp_dout(dpp, 5) << __func__ << "::ERROR: no manifest" << dendl; + return -EINVAL; + } + + itr = attrs.find(RGW_ATTR_SHARE_MANIFEST); + if (itr != attrs.end()) { + uint64_t hash = 0; + try { + auto bl_iter = itr->second.cbegin(); + ceph::decode(hash, bl_iter); + p_rec->s.shared_manifest = hash; + } catch (buffer::error& err) { + ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad shared_manifest" << dendl; + return -EINVAL; + } + ldpp_dout(dpp, 20) << __func__ << "::Set Shared_Manifest::OBJ_NAME=" + << p_rec->obj_name << "::shared_manifest=0x" << std::hex + << p_rec->s.shared_manifest << std::dec << dendl; + p_rec->s.flags.set_shared_manifest(); + } + else { + memset(&p_rec->s.shared_manifest, 0, sizeof(p_rec->s.shared_manifest)); + } + + itr = attrs.find(RGW_ATTR_BLAKE3); + if (itr != attrs.end()) { + try { + auto bl_iter = itr->second.cbegin(); + // BLAKE3 hash 256 bit splitted into multiple 64bit units + const unsigned units = (256 / (sizeof(uint64_t)*8)); + static_assert(units == 4); + for (unsigned i = 0; i < units; i++) { + uint64_t val; + ceph::decode(val, bl_iter); + p_rec->s.hash[i] = val; + } + p_stats->valid_hash_attrs++; + return 0; + } catch (buffer::error& err) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed HASH decode" << dendl; + return -EINVAL; + } + } + + p_stats->invalid_hash_attrs++; + // TBD: redundant memset... + memset(p_rec->s.hash, 0, sizeof(p_rec->s.hash)); + // BLAKE3_OUT_LEN is 32 Bytes + int ret = calc_object_blake3(p_rec, (uint8_t*)p_rec->s.hash); + if (ret == 0) { + p_rec->s.flags.set_hash_calculated(); + } + + return ret; + } + + //--------------------------------------------------------------------------- + // We purged all entries not marked for-dedup (i.e. singleton bit is set) from the table + // so all entries left are sources of dedup with multiple copies. 
+ // Need to read attributes from the Head-Object and output them to a new SLAB + int Background::read_object_attribute(dedup_table_t *p_table, + disk_record_t *p_rec, + disk_block_id_t old_block_id, + record_id_t old_rec_id, + md5_shard_t md5_shard, + md5_stats_t *p_stats /* IN-OUT */, + disk_block_seq_t *p_disk, + remapper_t *remapper) + { + bool should_print_debug = cct->_conf->subsys.should_gather(); + if (unlikely(should_print_debug)) { + print_record(dpp, p_rec, old_block_id, old_rec_id, md5_shard); + } + p_stats->processed_objects ++; + + uint32_t size_4k_units = byte_size_to_disk_blocks(p_rec->s.obj_bytes_size); + uint64_t ondisk_byte_size = disk_blocks_to_byte_size(size_4k_units); + storage_class_idx_t sc_idx = remapper->remap(p_rec->stor_class, dpp, + &p_stats->failed_map_overflow); + if (unlikely(sc_idx == remapper_t::NULL_IDX)) { + // TBD: need stat counters + return -EOVERFLOW; + } + key_t key_from_bucket_index(p_rec->s.md5_high, p_rec->s.md5_low, size_4k_units, + p_rec->s.num_parts, sc_idx); + dedup_table_t::value_t src_val; + int ret = p_table->get_val(&key_from_bucket_index, &src_val); + if (ret != 0) { + if (ondisk_byte_size <= d_min_obj_size_for_dedup && p_rec->s.num_parts == 0) { + // record has no valid entry in table because it is a too small + // It was loaded to table for calculation and then purged + p_stats->skipped_purged_small++; + ldpp_dout(dpp, 20) << __func__ << "::skipped purged small obj::" + << p_rec->obj_name << "::" << ondisk_byte_size << dendl; + // help small object tests pass - avoid complication differentiating between + // small objects ( < 64KB, >= 64KB <= 4MB, > 4MB + p_stats->processed_objects--; + } + else { + // record has no valid entry in table because it is a singleton + p_stats->skipped_singleton++; + p_stats->skipped_singleton_bytes += ondisk_byte_size; + ldpp_dout(dpp, 20) << __func__ << "::skipped singleton::" + << p_rec->obj_name << std::dec << dendl; + } + return 0; + } + + // Every object after this point was 
counted as a dedup potential + // If we conclude that it can't be dedup it should be accounted for + rgw_bucket b{p_rec->tenant_name, p_rec->bucket_name, p_rec->bucket_id}; + unique_ptr bucket; + ret = driver->load_bucket(dpp, b, &bucket, null_yield); + if (unlikely(ret != 0)) { + // could happen when the bucket is removed between passes + p_stats->ingress_failed_load_bucket++; + ldpp_dout(dpp, 15) << __func__ << "::Failed driver->load_bucket(): " + << cpp_strerror(-ret) << dendl; + return 0; + } + + unique_ptr p_obj = bucket->get_object(p_rec->obj_name); + if (unlikely(!p_obj)) { + // could happen when the object is removed between passes + p_stats->ingress_failed_get_object++; + ldpp_dout(dpp, 15) << __func__ << "::Failed bucket->get_object(" + << p_rec->obj_name << ")" << dendl; + return 0; + } + + ret = p_obj->get_obj_attrs(null_yield, dpp); + if (unlikely(ret < 0)) { + p_stats->ingress_failed_get_obj_attrs++; + ldpp_dout(dpp, 10) << __func__ << "::ERR: failed to stat object(" << p_rec->obj_name + << "), returned error: " << cpp_strerror(-ret) << dendl; + return ret; + } + + const rgw::sal::Attrs& attrs = p_obj->get_attrs(); + if (attrs.find(RGW_ATTR_CRYPT_MODE) != attrs.end()) { + p_stats->ingress_skip_encrypted++; + p_stats->ingress_skip_encrypted_bytes += ondisk_byte_size; + ldpp_dout(dpp, 20) <<__func__ << "::Skipping encrypted object " + << p_rec->obj_name << dendl; + return 0; + } + + // TBD: We should be able to support RGW_ATTR_COMPRESSION when all copies are compressed + if (attrs.find(RGW_ATTR_COMPRESSION) != attrs.end()) { + p_stats->ingress_skip_compressed++; + p_stats->ingress_skip_compressed_bytes += ondisk_byte_size; + ldpp_dout(dpp, 20) <<__func__ << "::Skipping compressed object " + << p_rec->obj_name << dendl; + return 0; + } + + // extract ETAG and Size and compare with values taken from the bucket-index + parsed_etag_t parsed_etag; + auto itr = attrs.find(RGW_ATTR_ETAG); + if (itr != attrs.end()) { + if 
(unlikely(!parse_etag_string(itr->second.to_str(), &parsed_etag))) { + p_stats->ingress_corrupted_etag++; + ldpp_dout(dpp, 10) << __func__ << "::ERROR: corrupted etag::" << p_rec->obj_name << dendl; + return -EINVAL; + } + } + else { + p_stats->ingress_corrupted_etag++; + ldpp_dout(dpp, 10) << __func__ << "::ERROR: no etag" << p_rec->obj_name << dendl; + return -EINVAL; + } + + std::string storage_class; + itr = attrs.find(RGW_ATTR_STORAGE_CLASS); + if (itr != attrs.end()) { + storage_class = itr->second.to_str(); + } + else { + storage_class = RGW_STORAGE_CLASS_STANDARD; + } + // no need to check for remap success as we compare keys bellow + sc_idx = remapper->remap(storage_class, dpp, &p_stats->failed_map_overflow); + key_t key_from_obj(parsed_etag.md5_high, parsed_etag.md5_low, + byte_size_to_disk_blocks(p_obj->get_size()), + parsed_etag.num_parts, sc_idx); + if (unlikely(key_from_obj != key_from_bucket_index || + p_rec->s.obj_bytes_size != p_obj->get_size())) { + ldpp_dout(dpp, 15) <<__func__ << "::Skipping changed object " + << p_rec->obj_name << dendl; + p_stats->ingress_skip_changed_objs++; + return 0; + } + + // reset flags + p_rec->s.flags.clear(); + ret = add_obj_attrs_to_record(&b, p_rec, attrs, p_table, p_stats); + if (unlikely(ret != 0)) { + ldpp_dout(dpp, 5) << __func__ << "::ERR: failed add_obj_attrs_to_record() ret=" + << ret << "::" << cpp_strerror(-ret) << dendl; + return ret; + } + + disk_block_seq_t::record_info_t rec_info; + ret = p_disk->add_record(d_dedup_cluster_ioctx, p_rec, &rec_info); + if (ret == 0) { + // set the disk_block_id_t to this unless the existing disk_block_id is marked as shared-manifest + ceph_assert(rec_info.rec_id < MAX_REC_IN_BLOCK); + ldpp_dout(dpp, 20) << __func__ << "::" << p_rec->bucket_name << "/" + << p_rec->obj_name << " was written to block_idx=" + << rec_info.block_id << "::rec_id=" << (int)rec_info.rec_id + << "::shared_manifest=" << p_rec->has_shared_manifest() << dendl; + 
p_table->update_entry(&key_from_bucket_index, rec_info.block_id, + rec_info.rec_id, p_rec->has_shared_manifest()); + } + else { + ldpp_dout(dpp, 5) << __func__ << "::ERR: Failed p_disk->add_record()"<< dendl; + if (ret == -EINVAL) { + p_stats->ingress_corrupted_obj_attrs++; + } + } + return ret; + } + + //--------------------------------------------------------------------------- + static int write_blake3_object_attribute(const DoutPrefixProvider* const dpp, + rgw::sal::Driver* driver, + RGWRados* rados, + const disk_record_t *p_rec) + { + bufferlist etag_bl; + bufferlist hash_bl; + librados::ObjectWriteOperation op; + etag_to_bufferlist(p_rec->s.md5_high, p_rec->s.md5_low, p_rec->s.num_parts, + &etag_bl); + init_cmp_pairs(p_rec, etag_bl, hash_bl /*OUT PARAM*/, &op); + op.setxattr(RGW_ATTR_BLAKE3, hash_bl); + + std::string oid; + librados::IoCtx ioctx; + int ret = get_ioctx(dpp, driver, rados, p_rec, &ioctx, &oid); + if (unlikely(ret != 0)) { + ldpp_dout(dpp, 5) << __func__ << "::ERR: failed get_ioctx()" << dendl; + return ret; + } + + ret = ioctx.operate(oid, &op); + if (unlikely(ret != 0)) { + ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ioctx.operate(" + << oid << "), err is " << cpp_strerror(-ret) << dendl; + } + return ret; + } + + //--------------------------------------------------------------------------- + // We purged all entries not marked for-dedup (i.e. singleton bit is set) from the table + // so all entries left are sources of dedup with multiple copies. + // If the record is marked as Shared-Manifest-Object -> skip it + // if the record's key doesn’t exist in table -> skip it (it is a singleton and it was purged) + // If the record block-index matches the hashtable entry -> skip it (it is the SRC object) + // All other entries are Dedicated-Manifest-Objects with a valid SRC object + + // we can withstand most errors moving to the next object + // only report an error if we recived a stop scan request! 
+ // + int Background::try_deduping_record(dedup_table_t *p_table, + const disk_record_t *p_tgt_rec, + disk_block_id_t block_id, + record_id_t rec_id, + md5_shard_t md5_shard, + md5_stats_t *p_stats, /* IN-OUT */ + remapper_t *remapper) + { + bool should_print_debug = cct->_conf->subsys.should_gather(); + if (unlikely(should_print_debug)) { + print_record(dpp, p_tgt_rec, block_id, rec_id, md5_shard); + } + + uint32_t size_4k_units = byte_size_to_disk_blocks(p_tgt_rec->s.obj_bytes_size); + storage_class_idx_t sc_idx = remapper->remap(p_tgt_rec->stor_class, dpp, + &p_stats->failed_map_overflow); + ceph_assert(sc_idx != remapper_t::NULL_IDX); + key_t key(p_tgt_rec->s.md5_high, p_tgt_rec->s.md5_low, size_4k_units, + p_tgt_rec->s.num_parts, sc_idx); + dedup_table_t::value_t src_val; + int ret = p_table->get_val(&key, &src_val); + if (ret != 0) { + // record has no valid entry in table because it is a singleton + // should never happened since we purged all singletons before + ldpp_dout(dpp, 5) << __func__ << "::skipped singleton::" << p_tgt_rec->bucket_name + << "/" << p_tgt_rec->obj_name << "::num_parts=" << p_tgt_rec->s.num_parts + << "::ETAG=" << std::hex << p_tgt_rec->s.md5_high + << p_tgt_rec->s.md5_low << std::dec << dendl; + ceph_abort("Unexpcted singleton"); + return 0; + } + + disk_block_id_t src_block_id = src_val.block_idx; + record_id_t src_rec_id = src_val.rec_id; + if (block_id == src_block_id && rec_id == src_rec_id) { + // the table entry point to this record which means it is a dedup source so nothing to do + p_stats->skipped_source_record++; + ldpp_dout(dpp, 20) << __func__ << "::skipped source-record" << dendl; + return 0; + } + + // ceph store full blocks so need to round up and multiply by block_size + uint64_t ondisk_byte_size = disk_blocks_to_byte_size(size_4k_units); + uint64_t dedupable_objects_bytes = calc_deduped_bytes(d_head_object_size, + p_tgt_rec->s.num_parts, + ondisk_byte_size); + if (p_tgt_rec->s.flags.has_shared_manifest()) { + // 
record holds a shared_manifest object so can't be a dedup target + p_stats->skipped_shared_manifest++; + p_stats->shared_manifest_dedup_bytes += dedupable_objects_bytes; + ldpp_dout(dpp, 20) << __func__ << "::skipped shared_manifest" << dendl; + return 0; + } + + // This records is a dedup target with source record on source_block_id + disk_record_t src_rec; + ret = load_record(d_dedup_cluster_ioctx, p_tgt_rec, &src_rec, src_block_id, + src_rec_id, md5_shard, dpp); + if (unlikely(ret != 0)) { + p_stats->failed_src_load++; + // we can withstand most errors moving to the next object + ldpp_dout(dpp, 5) << __func__ << "::ERR: Failed load_record(" + << src_block_id << ", " << src_rec_id << ")" << dendl; + return 0; + } + + ldpp_dout(dpp, 20) << __func__ << "::SRC=" << src_rec.bucket_name + << "/" << src_rec.obj_name << dendl; + // verify that SRC and TGT records don't refer to the same physical object + // This could happen in theory if we read the same objects twice + if (src_rec.obj_name == p_tgt_rec->obj_name && src_rec.bucket_name == p_tgt_rec->bucket_name) { + p_stats->duplicate_records++; + ldpp_dout(dpp, 10) << __func__ << "::WARN: Duplicate records for object=" + << src_rec.obj_name << dendl; + return 0; + } + + // the hash table size is rounded to the nearest 4KB and will wrap after 16G + if (unlikely(src_rec.s.obj_bytes_size != p_tgt_rec->s.obj_bytes_size)) { + p_stats->size_mismatch++; + ldpp_dout(dpp, 10) << __func__ << "::WARN: different byte size for objects::" + << src_rec.obj_name << "::" << src_rec.s.obj_bytes_size + << "::" << p_tgt_rec->obj_name << "::" + << p_tgt_rec->s.obj_bytes_size << dendl; + return 0; + } + + if (memcmp(src_rec.s.hash, p_tgt_rec->s.hash, sizeof(src_rec.s.hash)) != 0) { + p_stats->hash_mismatch++; + ldpp_dout(dpp, 10) << __func__ << "::HASH mismatch" << dendl; + // TBD: set hash attributes on head objects to save calc next time + if (src_rec.s.flags.hash_calculated()) { + write_blake3_object_attribute(dpp, driver, rados, 
&src_rec); + p_stats->set_hash_attrs++; + } + if (p_tgt_rec->s.flags.hash_calculated()) { + write_blake3_object_attribute(dpp, driver, rados, p_tgt_rec); + p_stats->set_hash_attrs++; + } + return 0; + } + + ret = dedup_object(&src_rec, p_tgt_rec, p_stats, src_val.has_shared_manifest()); + if (ret == 0) { + p_stats->deduped_objects++; + p_stats->deduped_objects_bytes += dedupable_objects_bytes; + if (p_tgt_rec->s.num_parts == 0) { + // single part objects duplicate the head object when dedup is used + p_stats->dup_head_bytes += d_head_object_size; + } + + // mark the SRC object as a providor of a shared manifest + if (!src_val.has_shared_manifest()) { + p_stats->set_shared_manifest_src++; + // set the shared manifest flag in the dedup table + p_table->set_shared_manifest_src_mode(&key, src_block_id, src_rec_id); + } + else { + ldpp_dout(dpp, 20) << __func__ << "::SRC object already marked as shared_manifest" << dendl; + } + } + else { + ldpp_dout(dpp, 10) << __func__ << "::ERR: Failed dedup for " + << src_rec.bucket_name << "/" << src_rec.obj_name << dendl; + p_stats->failed_dedup++; + } + + return 0; + } + +#endif // #ifdef FULL_DEDUP_SUPPORT + //--------------------------------------------------------------------------- + const char* Background::dedup_step_name(dedup_step_t step) + { + static const char* names[] = {"STEP_NONE", + "STEP_BUCKET_INDEX_INGRESS", + "STEP_BUILD_TABLE", + "STEP_READ_ATTRIBUTES", + "STEP_REMOVE_DUPLICATES"}; + static const char* undefined_step = "UNDEFINED_STEP"; + if (step >= STEP_NONE && step <= STEP_REMOVE_DUPLICATES) { + return names[step]; + } + else { + return undefined_step; + } + } + + //--------------------------------------------------------------------------- + int Background::process_all_slabs(dedup_table_t *p_table, + dedup_step_t step, + md5_shard_t md5_shard, + work_shard_t worker_id, + uint32_t *p_slab_count, + md5_stats_t *p_stats, /* IN-OUT */ + disk_block_seq_t *p_disk_block_seq, + remapper_t *remapper) + { + char 
block_buff[sizeof(disk_block_t)]; + const int MAX_OBJ_LOAD_FAILURE = 3; + const int MAX_BAD_BLOCKS = 2; + bool has_more = true; + uint32_t seq_number = 0; + int failure_count = 0; + ldpp_dout(dpp, 20) << __func__ << "::" << dedup_step_name(step) << "::worker_id=" + << worker_id << ", md5_shard=" << md5_shard << dendl; + *p_slab_count = 0; + while (has_more) { + bufferlist bl; + int ret = load_slab(d_dedup_cluster_ioctx, bl, md5_shard, worker_id, seq_number, dpp); + if (unlikely(ret < 0)) { + ldpp_dout(dpp, 1) << __func__ << "::ERR::Failed loading object!! md5_shard=" << md5_shard + << ", worker_id=" << worker_id << ", seq_number=" << seq_number + << ", failure_count=" << failure_count << dendl; + // skip to the next SLAB stopping after 3 bad objects + if (failure_count++ < MAX_OBJ_LOAD_FAILURE) { + seq_number += DISK_BLOCK_COUNT; + continue; + } + else { + return ret; + } + } + + (*p_slab_count)++; + failure_count = 0; + unsigned slab_rec_count = 0; + auto bl_itr = bl.cbegin(); + for (uint32_t block_num = 0; block_num < DISK_BLOCK_COUNT; block_num++, seq_number++) { + disk_block_id_t disk_block_id(worker_id, seq_number); + const char *p = get_next_data_ptr(bl_itr, block_buff, sizeof(block_buff), + dpp); + disk_block_t *p_disk_block = (disk_block_t*)p; + disk_block_header_t *p_header = p_disk_block->get_header(); + p_header->deserialize(); + if (unlikely(p_header->verify(disk_block_id, dpp) != 0)) { + p_stats->failed_block_load++; + // move to next block until reaching a valid block + if (failure_count++ < MAX_BAD_BLOCKS) { + continue; + } + else { + ldpp_dout(dpp, 1) << __func__ << "::Skipping slab with too many bad blocks::" + << (int)md5_shard << ", worker_id=" << (int)worker_id + << ", seq_number=" << seq_number << dendl; + failure_count = 0; + break; + } + } + + if (p_header->rec_count == 0) { + ldpp_dout(dpp, 20) << __func__ << "::Block #" << block_num + << " has an empty header, no more blocks" << dendl; + has_more = false; + break; + } + + for (unsigned 
rec_id = 0; rec_id < p_header->rec_count; rec_id++) { + unsigned offset = p_header->rec_offsets[rec_id]; + // We deserialize the record inside the CTOR + disk_record_t rec(p + offset); + ret = rec.validate(__func__, dpp, disk_block_id, rec_id); + if (unlikely(ret != 0)) { + p_stats->failed_rec_load++; + return ret; + } + + if (step == STEP_BUILD_TABLE) { + add_record_to_dedup_table(p_table, &rec, disk_block_id, rec_id, p_stats, remapper); + slab_rec_count++; + } +#ifdef FULL_DEDUP_SUPPORT + else if (step == STEP_READ_ATTRIBUTES) { + read_object_attribute(p_table, &rec, disk_block_id, rec_id, md5_shard, + p_stats, p_disk_block_seq, remapper); + slab_rec_count++; + } + else if (step == STEP_REMOVE_DUPLICATES) { + try_deduping_record(p_table, &rec, disk_block_id, rec_id, md5_shard, + p_stats, remapper); + slab_rec_count++; + } +#endif // #ifdef FULL_DEDUP_SUPPORT + else { + ceph_abort("unexpected step"); + } + } + + check_and_update_md5_heartbeat(md5_shard, p_stats->loaded_objects, + p_stats->processed_objects); + if (unlikely(d_ctl.should_pause())) { + handle_pause_req(__func__); + } + if (unlikely(d_ctl.should_stop())) { + return -ECANCELED; + } + + has_more = (p_header->offset == BLOCK_MAGIC); + ceph_assert(p_header->offset == BLOCK_MAGIC || p_header->offset == LAST_BLOCK_MAGIC); + if (!has_more) { + ldpp_dout(dpp, 20) << __func__ << "::No more blocks! 
block_id=" << disk_block_id + << ", rec_count=" << p_header->rec_count << dendl; + break; + } + } + ldpp_dout(dpp, 20) <<__func__ << "::slab seq_number=" << seq_number + << ", rec_count=" << slab_rec_count << dendl; + } + return 0; + } + + //--------------------------------------------------------------------------- + static void __attribute__ ((noinline)) + show_ingress_bucket_idx_obj(const DoutPrefixProvider *dpp, + const parsed_etag_t &parsed_etag, + const string &bucket_name, + const string &obj_name) + { + ldpp_dout(dpp, 20) << __func__ << "::(1)::" << bucket_name << "/" << obj_name + << "::num_parts=" << parsed_etag.num_parts + << "::ETAG=" << std::hex << parsed_etag.md5_high + << parsed_etag.md5_low << std::dec << dendl; + } + + //--------------------------------------------------------------------------- + int Background::ingress_bucket_idx_single_object(disk_block_array_t &disk_arr, + const rgw::sal::Bucket *p_bucket, + const rgw_bucket_dir_entry &entry, + worker_stats_t *p_worker_stats /*IN-OUT*/) + { + // ceph store full blocks so need to round up and multiply by block_size + uint64_t ondisk_byte_size = calc_on_disk_byte_size(entry.meta.size); + // count all objects including too small and non default storage_class objs + p_worker_stats->ingress_obj++; + p_worker_stats->ingress_obj_bytes += ondisk_byte_size; + + parsed_etag_t parsed_etag; + if (unlikely(!parse_etag_string(entry.meta.etag, &parsed_etag))) { + p_worker_stats->ingress_corrupted_etag++; + ldpp_dout(dpp, 1) << __func__ << "::ERROR: corrupted etag" << dendl; + return -EINVAL; + } + + if (unlikely((cct->_conf->subsys.should_gather()))) { + show_ingress_bucket_idx_obj(dpp, parsed_etag, p_bucket->get_name(), entry.key.name); + } + + // We limit dedup to objects from the same storage_class + // TBD: + // Should we use a skip-list of storage_classes we should skip (like glacier) ? 
+ const std::string& storage_class = + rgw_placement_rule::get_canonical_storage_class(entry.meta.storage_class); + if (storage_class == RGW_STORAGE_CLASS_STANDARD) { + p_worker_stats->default_storage_class_objs++; + p_worker_stats->default_storage_class_objs_bytes += ondisk_byte_size; + } + else { + ldpp_dout(dpp, 20) << __func__ << "::" << entry.key.name + << "::storage_class:" << entry.meta.storage_class << dendl; + p_worker_stats->non_default_storage_class_objs++; + p_worker_stats->non_default_storage_class_objs_bytes += ondisk_byte_size; + } + + if (ondisk_byte_size <= d_min_obj_size_for_dedup) { + if (parsed_etag.num_parts == 0) { + // dedup only useful for objects bigger than 4MB + p_worker_stats->ingress_skip_too_small++; + p_worker_stats->ingress_skip_too_small_bytes += ondisk_byte_size; + + if (ondisk_byte_size >= 64*1024) { + p_worker_stats->ingress_skip_too_small_64KB++; + p_worker_stats->ingress_skip_too_small_64KB_bytes += ondisk_byte_size; + } + else { + return 0; + } + } + else { + // multipart objects are always good candidates for dedup + // the head object is empty and data is stored only in tail objs + p_worker_stats->small_multipart_obj++; + } + } + // multipart/single_part counters are for objects being fully processed + if (parsed_etag.num_parts > 0) { + p_worker_stats->multipart_objs++; + } + else { + p_worker_stats->single_part_objs++; + } + + return add_disk_rec_from_bucket_idx(disk_arr, p_bucket, &parsed_etag, + entry.key.name, entry.meta.size, + storage_class); + } + + //--------------------------------------------------------------------------- + void Background::check_and_update_heartbeat(unsigned shard_id, uint64_t count_a, + uint64_t count_b, const char *prefix) + { + utime_t now = ceph_clock_now(); + utime_t time_elapsed = now - d_heart_beat_last_update; + if (unlikely(time_elapsed.tv.tv_sec >= d_heart_beat_max_elapsed_sec)) { + ldpp_dout(dpp, 20) << __func__ << "::max_elapsed_sec=" + << d_heart_beat_max_elapsed_sec << dendl; + 
d_heart_beat_last_update = now; + d_cluster.update_shard_token_heartbeat(store, shard_id, count_a, count_b, + prefix); + } + } + + //--------------------------------------------------------------------------- + void Background::check_and_update_worker_heartbeat(work_shard_t worker_id, + int64_t ingress_obj_count) + { + check_and_update_heartbeat(worker_id, ingress_obj_count, 0, WORKER_SHARD_PREFIX); + } + + //--------------------------------------------------------------------------- + void Background::check_and_update_md5_heartbeat(md5_shard_t md5_id, + uint64_t load_count, + uint64_t dedup_count) + { + check_and_update_heartbeat(md5_id, load_count, dedup_count, MD5_SHARD_PREFIX); + } + + //--------------------------------------------------------------------------- + static uint32_t move_to_next_bucket_index_shard(const DoutPrefixProvider* dpp, + unsigned current_shard, + unsigned num_work_shards, + const std::string &bucket_name, + rgw_obj_index_key *p_marker /* OUT-PARAM */) + { + uint32_t next_shard = current_shard + num_work_shards; + ldpp_dout(dpp, 20) << __func__ << "::" << bucket_name << "::curr_shard=" + << current_shard << ", next shard=" << next_shard << dendl; + *p_marker = rgw_obj_index_key(); // reset marker to an empty index + return next_shard; + } + + // This function process bucket-index shards of a given @bucket + // The bucket-index-shards are stored in a group of @oids + // The @oids are using a simple map from the shard-id to the oid holding bucket-indices + // We start by processing all bucket-indices owned by this @worker-id + // Once we are done with a given bucket-index shard we skip to the next + // bucket-index-shard owned by this worker-id + // if (bucket_index_shard % work_id) == 0) -> read and process bucket_index_shard + // else -> skip bucket_index_shard and don't read it + //--------------------------------------------------------------------------- + int Background::process_bucket_shards(disk_block_array_t &disk_arr, + const 
rgw::sal::Bucket *bucket, + std::map &oids, + librados::IoCtx &ioctx, + work_shard_t worker_id, + work_shard_t num_work_shards, + worker_stats_t *p_worker_stats /*IN-OUT*/) + { + const uint32_t num_shards = oids.size(); + uint32_t current_shard = worker_id; + rgw_obj_index_key marker; // start with an empty marker + const string null_prefix, null_delimiter; + const bool list_versions = true; + const int max_entries = 1000; + uint32_t obj_count = 0; + + while (current_shard < num_shards ) { + check_and_update_worker_heartbeat(worker_id, p_worker_stats->ingress_obj); + if (unlikely(d_ctl.should_pause())) { + handle_pause_req(__func__); + } + if (unlikely(d_ctl.should_stop())) { + return -ECANCELED; + } + + const string& oid = oids[current_shard]; + rgw_cls_list_ret result; + librados::ObjectReadOperation op; + // get bucket-indices of @current_shard + cls_rgw_bucket_list_op(op, marker, null_prefix, null_delimiter, max_entries, + list_versions, &result); + int ret = rgw_rados_operate(dpp, ioctx, oid, std::move(op), nullptr, null_yield); + if (unlikely(ret < 0)) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed rgw_rados_operate() ret=" + << ret << "::" << cpp_strerror(-ret) << dendl; + current_shard = move_to_next_bucket_index_shard(dpp, current_shard, num_work_shards, + bucket->get_name(), &marker); + continue; + } + obj_count += result.dir.m.size(); + for (auto& entry : result.dir.m) { + const rgw_bucket_dir_entry& dirent = entry.second; + if (unlikely((!dirent.exists && !dirent.is_delete_marker()) || !dirent.pending_map.empty())) { + // TBD: should we bailout ??? + ldpp_dout(dpp, 1) << __func__ << "::ERR: calling check_disk_state bucket=" + << bucket->get_name() << " entry=" << dirent.key << dendl; + // make sure we're advancing marker + marker = dirent.key; + continue; + } + marker = dirent.key; + ret = ingress_bucket_idx_single_object(disk_arr, bucket, dirent, p_worker_stats); + } + // TBD: advance marker only once here! 
+ if (result.is_truncated) { + ldpp_dout(dpp, 15) << __func__ << "::[" << current_shard + << "]result.is_truncated::count=" << obj_count << dendl; + } + else { + // we reached the end of this shard -> move to the next shard + current_shard = move_to_next_bucket_index_shard(dpp, current_shard, num_work_shards, + bucket->get_name(), &marker); + ldpp_dout(dpp, 15) << __func__ << "::move_to_next_bucket_index_shard::count=" + << obj_count << "::new_shard=" << current_shard << dendl; + } + } + ldpp_dout(dpp, 15) << __func__ << "::Finished processing Bucket " + << bucket->get_name() << ", num_shards=" << num_shards + << ", obj_count=" << obj_count << dendl; + return 0; + } + + //--------------------------------------------------------------------------- + int Background::ingress_bucket_objects_single_shard(disk_block_array_t &disk_arr, + const rgw_bucket &bucket_rec, + work_shard_t worker_id, + work_shard_t num_work_shards, + worker_stats_t *p_worker_stats /*IN-OUT*/) + { + unique_ptr bucket; + int ret = driver->load_bucket(dpp, bucket_rec, &bucket, null_yield); + if (unlikely(ret != 0)) { + ldpp_dout(dpp, 1) << __func__ << "::ERROR: driver->load_bucket(): " + << cpp_strerror(-ret) << dendl; + return ret; + } + + const std::string bucket_id = bucket->get_key().get_key(); + RGWBucketInfo bucket_info; + ret = rados->get_bucket_instance_info(bucket_id, bucket_info, + nullptr, nullptr, null_yield, dpp); + if (unlikely(ret < 0)) { + if (ret == -ENOENT) { + // probably a race condition with bucket removal + ldpp_dout(dpp, 10) << __func__ << "::ret == -ENOENT" << dendl; + return 0; + } + ldpp_dout(dpp, 5) << __func__ << "::ERROR: get_bucket_instance_info(), ret=" + << ret << "::" << cpp_strerror(-ret) << dendl; + return ret; + } + const rgw::bucket_index_layout_generation idx_layout = bucket_info.layout.current_index; + librados::IoCtx ioctx; + // objects holding the bucket-listings + std::map oids; + ret = store->svc()->bi_rados->open_bucket_index(dpp, bucket_info, 
std::nullopt, + idx_layout, &ioctx, &oids, nullptr); + if (ret >= 0) { + // process all the shards in this bucket owned by the worker_id + return process_bucket_shards(disk_arr, bucket.get(), oids, ioctx, worker_id, + num_work_shards, p_worker_stats); + } + else { + ldpp_dout(dpp, 1) << __func__ << "::ERROR: open_bucket_index() ret=" + << ret << "::" << cpp_strerror(-ret) << dendl; + return ret; + } + } + + //--------------------------------------------------------------------------- + static void display_table_stat_counters(const DoutPrefixProvider* dpp, + const md5_stats_t *p_stats) + { + uint64_t obj_count_in_shard = (p_stats->big_objs_stat.singleton_count + + p_stats->big_objs_stat.unique_count + + p_stats->big_objs_stat.duplicate_count); + + ldpp_dout(dpp, 10) << "\n>>>>>" << __func__ << "::FINISHED STEP_BUILD_TABLE\n" + << "::total_count=" << obj_count_in_shard + << "::loaded_objects=" << p_stats->loaded_objects + << p_stats->big_objs_stat << dendl; + ldpp_dout(dpp, 10) << __func__ << "::small objs::" + << p_stats->small_objs_stat << dendl; + } + + //--------------------------------------------------------------------------- + int Background::objects_dedup_single_md5_shard(dedup_table_t *p_table, + md5_shard_t md5_shard, + md5_stats_t *p_stats, + work_shard_t num_work_shards) + { + remapper_t remapper(MAX_STORAGE_CLASS_IDX); + // make sure that the standard storage_class is always in the mapper! 
+ storage_class_idx_t sc_idx = remapper.remap(RGW_STORAGE_CLASS_STANDARD, dpp, + &p_stats->failed_map_overflow); + ceph_assert(sc_idx == 0); + uint32_t slab_count_arr[num_work_shards]; + // first load all etags to hashtable to find dedups + // the entries come from bucket-index and got minimal info (etag, size) + for (work_shard_t worker_id = 0; worker_id < num_work_shards; worker_id++) { + process_all_slabs(p_table, STEP_BUILD_TABLE, md5_shard, worker_id, + slab_count_arr+worker_id, p_stats, nullptr, &remapper); + if (unlikely(d_ctl.should_stop())) { + ldpp_dout(dpp, 5) << __func__ << "::STEP_BUILD_TABLE::STOPPED\n" << dendl; + return -ECANCELED; + } + } + p_table->count_duplicates(&p_stats->small_objs_stat, &p_stats->big_objs_stat, + &p_stats->dup_head_bytes_estimate); + display_table_stat_counters(dpp, p_stats); + + ldpp_dout(dpp, 10) << __func__ << "::MD5 Loop::" << d_ctl.dedup_type << dendl; + if (d_ctl.dedup_type != dedup_req_type_t::DEDUP_TYPE_FULL) { + for (work_shard_t worker_id = 0; worker_id < num_work_shards; worker_id++) { + remove_slabs(worker_id, md5_shard, slab_count_arr[worker_id]); + } + return 0; + } + +#ifndef FULL_DEDUP_SUPPORT + // we don't support full dedup with this release + return 0; +#endif + + p_table->remove_singletons_and_redistribute_keys(); + // The SLABs holds minimal data set brought from the bucket-index + // Objects participating in DEDUP need to read attributes from the Head-Object + // TBD - find a better name than num_work_shards for the combined output + { + disk_block_t arr[DISK_BLOCK_COUNT]; + worker_stats_t wstat; + disk_block_seq_t disk_block_seq(dpp, arr, num_work_shards, md5_shard, &wstat); + for (work_shard_t worker_id = 0; worker_id < num_work_shards; worker_id++) { + process_all_slabs(p_table, STEP_READ_ATTRIBUTES, md5_shard, worker_id, + slab_count_arr+worker_id, p_stats, &disk_block_seq, &remapper); + if (unlikely(d_ctl.should_stop())) { + ldpp_dout(dpp, 5) << __func__ << "::STEP_READ_ATTRIBUTES::STOPPED\n" << 
dendl; + return -ECANCELED; + } + // we finished processing output SLAB from @worker_id -> remove them + remove_slabs(worker_id, md5_shard, slab_count_arr[worker_id]); + } + disk_block_seq.flush_disk_records(d_dedup_cluster_ioctx); + } + + ldpp_dout(dpp, 10) << __func__ << "::STEP_REMOVE_DUPLICATES::started..." << dendl; + uint32_t slab_count = 0; + process_all_slabs(p_table, STEP_REMOVE_DUPLICATES, md5_shard, num_work_shards, + &slab_count, p_stats, nullptr, &remapper); + if (unlikely(d_ctl.should_stop())) { + ldpp_dout(dpp, 5) << __func__ << "::STEP_REMOVE_DUPLICATES::STOPPED\n" << dendl; + return -ECANCELED; + } + ldpp_dout(dpp, 10) << __func__ << "::STEP_REMOVE_DUPLICATES::finished..." << dendl; + // remove the special SLAB holding aggragted data + remove_slabs(num_work_shards, md5_shard, slab_count); + return 0; + } + + //--------------------------------------------------------------------------- + int Background::read_bucket_stats(const rgw_bucket &bucket_rec, + uint64_t *p_num_obj, + uint64_t *p_size) + { + unique_ptr bucket; + int ret = driver->load_bucket(dpp, bucket_rec, &bucket, null_yield); + if (unlikely(ret != 0)) { + ldpp_dout(dpp, 1) << __func__ << "::ERROR: driver->load_bucket(): " + << cpp_strerror(-ret) << dendl; + return ret; + } + + const auto& index = bucket->get_info().get_current_index(); + if (is_layout_indexless(index)) { + ldpp_dout(dpp, 1) << __func__ + << "::ERR, indexless buckets do not maintain stats; bucket=" + << bucket->get_name() << dendl; + return -EINVAL; + } + + std::map stats; + std::string bucket_ver, master_ver; + std::string max_marker; + ret = bucket->read_stats(dpp, null_yield, index, RGW_NO_SHARD, &bucket_ver, + &master_ver, stats, &max_marker); + if (ret < 0) { + ldpp_dout(dpp, 1) << __func__ << "::ERR getting bucket stats bucket=" + << bucket->get_name() << " ret=" << ret << dendl; + return ret; + } + + for (auto itr = stats.begin(); itr != stats.end(); ++itr) { + RGWStorageStats& s = itr->second; + ldpp_dout(dpp, 20) 
<< __func__ << "::" << bucket->get_name() << "::" + << to_string(itr->first) << "::num_obj=" << s.num_objects + << "::size=" << s.size << dendl; + *p_num_obj += s.num_objects; + *p_size += s.size; + } + + return 0; + } + + //--------------------------------------------------------------------------- + int Background::collect_all_buckets_stats() + { + int ret = 0; + std::string section("bucket.instance"); + std::string marker; + void *handle = nullptr; + ret = driver->meta_list_keys_init(dpp, section, marker, &handle); + if (ret < 0) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: Failed meta_list_keys_init: " + << cpp_strerror(-ret) << dendl; + return ret; + } + + d_all_buckets_obj_count = 0; + d_all_buckets_obj_size = 0; + + bool has_more = true; + while (has_more) { + std::list entries; + constexpr int max_keys = 1000; + ret = driver->meta_list_keys_next(dpp, handle, max_keys, entries, &has_more); + if (ret == 0) { + for (auto& entry : entries) { + ldpp_dout(dpp, 20) <<__func__ << "::bucket_name=" << entry << dendl; + rgw_bucket bucket; + ret = rgw_bucket_parse_bucket_key(cct, entry, &bucket, nullptr); + if (unlikely(ret < 0)) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: Failed rgw_bucket_parse_bucket_key: " + << cpp_strerror(-ret) << dendl; + goto err; + } + ldpp_dout(dpp, 20) <<__func__ << "::bucket=" << bucket << dendl; + ret = read_bucket_stats(bucket, &d_all_buckets_obj_count, + &d_all_buckets_obj_size); + if (unlikely(ret != 0)) { + goto err; + } + } + driver->meta_list_keys_complete(handle); + } + else { + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed driver->meta_list_keys_next()" << dendl; + goto err; + } + } + ldpp_dout(dpp, 10) <<__func__ + << "::all_buckets_obj_count=" << d_all_buckets_obj_count + << "::all_buckets_obj_size=" << d_all_buckets_obj_size + << dendl; + return 0; + + err: + ldpp_dout(dpp, 1) << __func__ << "::error handler" << dendl; + // reset counters to mark that we don't have the info + d_all_buckets_obj_count = 0; + 
d_all_buckets_obj_size = 0; + if (handle) { + driver->meta_list_keys_complete(handle); + } + return ret; + } + + //--------------------------------------------------------------------------- + int Background::objects_ingress_single_work_shard(work_shard_t worker_id, + work_shard_t num_work_shards, + md5_shard_t num_md5_shards, + worker_stats_t *p_worker_stats, + uint8_t *raw_mem, + uint64_t raw_mem_size) + { + int ret = 0; + std::string section("bucket.instance"); + std::string marker; + void *handle = nullptr; + ret = driver->meta_list_keys_init(dpp, section, marker, &handle); + if (ret < 0) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: Failed meta_list_keys_init: " + << cpp_strerror(-ret) << dendl; + return ret; + } + disk_block_array_t disk_arr(dpp, raw_mem, raw_mem_size, worker_id, + p_worker_stats, num_md5_shards); + bool has_more = true; + // iterate over all buckets + while (ret == 0 && has_more) { + std::list entries; + constexpr int max_keys = 1000; + ret = driver->meta_list_keys_next(dpp, handle, max_keys, entries, &has_more); + if (ret == 0) { + ldpp_dout(dpp, 20) <<__func__ << "::entries.size()=" << entries.size() << dendl; + for (auto& entry : entries) { + ldpp_dout(dpp, 20) <<__func__ << "::bucket_name=" << entry << dendl; + rgw_bucket bucket; + ret = rgw_bucket_parse_bucket_key(cct, entry, &bucket, nullptr); + if (unlikely(ret < 0)) { + // bad bucket entry, skip to the next one + ldpp_dout(dpp, 1) << __func__ << "::ERR: Failed rgw_bucket_parse_bucket_key: " + << cpp_strerror(-ret) << dendl; + continue; + } + ldpp_dout(dpp, 20) <<__func__ << "::bucket=" << bucket << dendl; + ret = ingress_bucket_objects_single_shard(disk_arr, bucket, worker_id, + num_work_shards, p_worker_stats); + if (unlikely(ret != 0)) { + if (d_ctl.should_stop()) { + driver->meta_list_keys_complete(handle); + return -ECANCELED; + } + ldpp_dout(dpp, 1) << __func__ << "::Failed ingress_bucket_objects_single_shard()" << dendl; + // skip bad bucket and move on to the next one + 
continue; + } + } + driver->meta_list_keys_complete(handle); + } + else { + ldpp_dout(dpp, 1) << __func__ << "::failed driver->meta_list_keys_next()" << dendl; + driver->meta_list_keys_complete(handle); + // TBD: what can we do here? + break; + } + } + ldpp_dout(dpp, 20) <<__func__ << "::flush_output_buffers() worker_id=" + << worker_id << dendl; + disk_arr.flush_output_buffers(dpp, d_dedup_cluster_ioctx); + return ret; + } + + //--------------------------------------------------------------------------- + int Background::remove_slabs(unsigned worker_id, unsigned md5_shard, uint32_t slab_count) + { + unsigned failure_count = 0; + + for (uint32_t slab_id = 0; slab_id < slab_count; slab_id++) { + uint32_t seq_number = disk_block_id_t::slab_id_to_seq_num(slab_id); + disk_block_id_t block_id(worker_id, seq_number); + std::string oid(block_id.get_slab_name(md5_shard)); + ldpp_dout(dpp, 20) << __func__ << "::calling ioctx->remove(" << oid << ")" << dendl; + int ret = d_dedup_cluster_ioctx.remove(oid); + if (ret != 0) { + ldpp_dout(dpp, 0) << __func__ << "::ERR Failed ioctx->remove(" << oid << ")" << dendl; + failure_count++; + } + } + + return failure_count; + } + + //--------------------------------------------------------------------------- + int Background::f_ingress_work_shard(unsigned worker_id, + uint8_t *raw_mem, + uint64_t raw_mem_size, + work_shard_t num_work_shards, + md5_shard_t num_md5_shards) + { + ldpp_dout(dpp, 20) << __func__ << "::worker_id=" << worker_id << dendl; + utime_t start_time = ceph_clock_now(); + worker_stats_t worker_stats; + int ret = objects_ingress_single_work_shard(worker_id, num_work_shards, num_md5_shards, + &worker_stats,raw_mem, raw_mem_size); + if (ret == 0) { + worker_stats.duration = ceph_clock_now() - start_time; + d_cluster.mark_work_shard_token_completed(store, worker_id, &worker_stats); + ldpp_dout(dpp, 10) << "stat counters [worker]:\n" << worker_stats << dendl; + ldpp_dout(dpp, 10) << "Shard Process Duration = " + << 
worker_stats.duration << dendl; + } + //ldpp_dout(dpp, 0) << __func__ << "::sleep for 2 seconds\n" << dendl; + //std::this_thread::sleep_for(std::chrono::seconds(2)); + return ret; + } + + //--------------------------------------------------------------------------- + int Background::f_dedup_md5_shard(unsigned md5_shard, + uint8_t *raw_mem, + uint64_t raw_mem_size, + work_shard_t num_work_shards, + md5_shard_t num_md5_shards) + { + utime_t start_time = ceph_clock_now(); + md5_stats_t md5_stats; + //DEDUP_DYN_ALLOC + dedup_table_t table(dpp, d_head_object_size, raw_mem, raw_mem_size); + int ret = objects_dedup_single_md5_shard(&table, md5_shard, &md5_stats, num_work_shards); + if (ret == 0) { + md5_stats.duration = ceph_clock_now() - start_time; + d_cluster.mark_md5_shard_token_completed(store, md5_shard, &md5_stats); + ldpp_dout(dpp, 10) << "stat counters [md5]:\n" << md5_stats << dendl; + ldpp_dout(dpp, 10) << "Shard Process Duration = " + << md5_stats.duration << dendl; + } + return ret; + } + + //--------------------------------------------------------------------------- + int Background::process_all_shards(bool ingress_work_shards, + int (Background::*func)(unsigned, uint8_t*, uint64_t, work_shard_t, md5_shard_t), + uint8_t *raw_mem, + uint64_t raw_mem_size, + work_shard_t num_work_shards, + md5_shard_t num_md5_shards) + { + while (true) { + d_heart_beat_last_update = ceph_clock_now(); + uint16_t shard_id; + if (ingress_work_shards) { + shard_id = d_cluster.get_next_work_shard_token(store, num_work_shards); + } + else { + shard_id = d_cluster.get_next_md5_shard_token(store, num_md5_shards); + } + + // start with a common error handler + if (shard_id != NULL_SHARD) { + ldpp_dout(dpp, 10) << __func__ << "::Got shard_id=" << shard_id << dendl; + int ret = (this->*func)(shard_id, raw_mem, raw_mem_size, num_work_shards, + num_md5_shards); + if (unlikely(ret != 0)) { + if (d_ctl.should_stop()) { + ldpp_dout(dpp, 5) << __func__ << "::stop execution" << dendl; + return 
-ECANCELED; + } + else { + ldpp_dout(dpp, 5) << __func__ << "::Skip shard #" << shard_id << dendl; + } + } + } + else { + ldpp_dout(dpp, 10) << __func__ << "::finished processing all shards" < vec; + vec.push_back("default.rgw.buckets.data"); + map stats; + auto rados_handle = rados->get_rados_handle(); + int ret = rados_handle->get_pool_stats(vec, stats); + if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << ":ERROR: fetching pool stats: " + << cpp_strerror(-ret) << dendl; + return ret; + } + + for (auto i = stats.begin(); i != stats.end(); ++i) { + const char *pool_name = i->first.c_str(); + librados::pool_stat_t& s = i->second; + // TBD: add support for EC + // We need to find the user byte size without the added protection + double replica_level = (double)s.num_object_copies / s.num_objects; + *p_num_objects = s.num_objects; + *p_num_objects_bytes = s.num_bytes / replica_level; + ldpp_dout(dpp, 10) <<__func__ << "::" << pool_name << "::num_objects=" + << s.num_objects << "::num_copies=" << s.num_object_copies + << "::num_bytes=" << s.num_bytes << "/" << *p_num_objects_bytes << dendl; + } + return 0; + } + + //------------------------------------------------------------------------------- + // 32B per object-entry in the hashtable + // 2MB per shard-buffer + //=============||==============||=========||===================================|| + // Obj Count || shard count || memory || calculation || + // ------------||--------------||---------||---------------------------------- || + // 1M || 4 || 8MB || 8MB/32 = 0.25M * 4 = 1M || + // 4M || 8 || 16MB || 16MB/32 = 0.50M * 8 = 4M || + //------------------------------------------------------------------------------- + // 16M || 16 || 32MB || 32MB/32 = 1.00M * 16 = 16M || + //------------------------------------------------------------------------------- + // 64M || 32 || 64MB || 64MB/32 = 2.00M * 32 = 64M || + // 256M || 64 || 128MB || 128MB/32 = 4.00M * 64 = 256M || + // 1024M( 1G) || 128 || 256MB || 256MB/32 = 8.00M * 
128 = 1024M || + // 4096M( 4G) || 256 || 512MB || 512MB/32 = 16M.00 * 256 = 4096M || + // 16384M(16G) || 512 || 1024MB || 1024MB/32 = 32M.00 * 512 = 16384M || + //-------------||--------------||---------||-----------------------------------|| + static md5_shard_t calc_num_md5_shards(uint64_t obj_count) + { + // create headroom by allocating space for a 10% bigger system + obj_count = obj_count + (obj_count/10); + + uint64_t M = 1024 * 1024; + if (obj_count < 1*M) { + // less than 1M objects -> use 4 shards (8MB) + return 4; + } + else if (obj_count < 4*M) { + // less than 4M objects -> use 8 shards (16MB) + return 8; + } + else if (obj_count < 16*M) { + // less than 16M objects -> use 16 shards (32MB) + return 16; + } + else if (obj_count < 64*M) { + // less than 64M objects -> use 32 shards (64MB) + return 32; + } + else if (obj_count < 256*M) { + // less than 256M objects -> use 64 shards (128MB) + return 64; + } + else if (obj_count < 1024*M) { + // less than 1024M objects -> use 128 shards (256MB) + return 128; + } + else if (obj_count < 4*1024*M) { + // less than 4096M objects -> use 256 shards (512MB) + return 256; + } + else { + return 512; + } + } + + //--------------------------------------------------------------------------- + int Background::setup(dedup_epoch_t *p_epoch) + { + int ret = collect_all_buckets_stats(); + if (unlikely(ret != 0)) { + return ret; + } + + md5_shard_t num_md5_shards = calc_num_md5_shards(d_all_buckets_obj_count); + num_md5_shards = std::min(num_md5_shards, MAX_MD5_SHARD); + num_md5_shards = std::max(num_md5_shards, MIN_MD5_SHARD); + work_shard_t num_work_shards = num_md5_shards; + num_work_shards = std::min(num_work_shards, MAX_WORK_SHARD); + + ldpp_dout(dpp, 5) << __func__ << "::obj_count=" <num_work_shards > MAX_WORK_SHARD)) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: p_epoch->num_work_shards=" + << p_epoch->num_work_shards + << " is larger than MAX_WORK_SHARD (" + << MAX_WORK_SHARD << ")" << dendl; + return -EOVERFLOW; + } + 
if (unlikely(p_epoch->num_md5_shards > MAX_MD5_SHARD)) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: p_epoch->num_md5_shards=" + << p_epoch->num_md5_shards + << " is larger than MAX_MD5_SHARD (" + << MAX_MD5_SHARD << ")" << dendl; + return -EOVERFLOW; + } + + ldpp_dout(dpp, 10) <<__func__ << "::" << *p_epoch << dendl; + d_ctl.dedup_type = p_epoch->dedup_type; +#ifdef FULL_DEDUP_SUPPORT + ceph_assert(d_ctl.dedup_type == dedup_req_type_t::DEDUP_TYPE_FULL || + d_ctl.dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE); +#else + ceph_assert(d_ctl.dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE); +#endif + ldpp_dout(dpp, 10) << __func__ << "::" << d_ctl.dedup_type << dendl; + + return 0; + } + + //--------------------------------------------------------------------------- + int Background::watch_reload(const DoutPrefixProvider* dpp) + { + return cluster::watch_reload(store, dpp, &d_watch_handle, &d_watcher_ctx); + } + + //--------------------------------------------------------------------------- + int Background::unwatch_reload(const DoutPrefixProvider* dpp) + { + if (d_watch_handle == 0) { + // nothing to unwatch + ldpp_dout(dpp, 1) << "dedup_bg::unwatch_reload(): nothing to watch" + << dendl; + return 0; + } + + ldpp_dout(dpp, 5) << "dedup_bg::unwatch_reload(): watch_handle=" + << d_watch_handle << dendl; + + int ret = cluster::unwatch_reload(store, dpp, d_watch_handle); + if (ret == 0) { + ldpp_dout(dpp, 5) << "dedup_bg::unwatch_reload():Stopped watching " + << "::d_watch_handle=" << d_watch_handle << dendl; + d_watch_handle = 0; + } + return ret; + } + + //--------------------------------------------------------------------------- + void Background::handle_notify(uint64_t notify_id, uint64_t cookie, bufferlist &bl) + { + int ret = 0; + int32_t urgent_msg = URGENT_MSG_NONE; + try { + auto bl_iter = bl.cbegin(); + ceph::decode(urgent_msg, bl_iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad urgent_msg" << dendl; + ret = 
-EINVAL; + } + ldpp_dout(dpp, 5) << __func__ << "::-->" << get_urgent_msg_names(urgent_msg) << dendl; + + // use lock to prevent concurrent pause/resume requests + std::unique_lock cond_lock(d_cond_mutex); // [------>open lock block + if (unlikely(d_ctl.local_urgent_req())) { + // can't operate when the system is paused/shutdown + cond_lock.unlock(); // close lock block------>] + ldpp_dout(dpp, 5) << __func__ + << "::system is paused/shutdown -> cancel notification" << dendl; + cluster::ack_notify(store, dpp, &d_ctl, notify_id, cookie, -EBUSY); + return; + } + + switch(urgent_msg) { + case URGENT_MSG_ABORT: + if (d_ctl.dedup_exec) { + d_ctl.remote_abort_req = true; + d_cond.notify_all(); + d_cond.wait(cond_lock, [this]{return d_ctl.remote_aborted || d_ctl.local_urgent_req();}); + d_ctl.remote_aborted ? ret = 0 : ret = -EBUSY; + } + else { + ldpp_dout(dpp, 5) << __func__ << "::inactive dedup->nothing to do" << dendl; + } + break; + case URGENT_MSG_RESTART: + if (!d_ctl.dedup_exec) { + d_ctl.remote_restart_req = true; + d_cond.notify_all(); + } + else { + ldpp_dout(dpp, 5) << __func__ << "::\ncan't restart active dedup\n"<< dendl; + ret = -EEXIST; + } + break; + case URGENT_MSG_PASUE: + if (d_ctl.dedup_exec && !d_ctl.remote_paused) { + d_ctl.remote_pause_req = true; + d_cond.notify_all(); + d_cond.wait(cond_lock, [this]{return d_ctl.remote_paused || d_ctl.local_urgent_req();}); + d_ctl.remote_paused ? 
ret = 0 : ret = -EBUSY; + } + else { + if (d_ctl.remote_paused) { + ldpp_dout(dpp, 5) << __func__ << "::dedup is already paused" << dendl; + } + else { + ldpp_dout(dpp, 5) << __func__ << "::inactive dedup->nothing to do" << dendl; + } + } + break; + case URGENT_MSG_RESUME: + if (d_ctl.remote_pause_req || d_ctl.remote_paused) { + d_ctl.remote_pause_req = false; + d_ctl.remote_paused = false; + d_cond.notify_all(); + } + else { + ldpp_dout(dpp, 5) << __func__ << "::dedup is not paused->nothing to do" << dendl; + } + break; + default: + ldpp_dout(dpp, 1) << __func__ << "::unexpected urgent_msg: " + << get_urgent_msg_names(urgent_msg) << dendl; + ret = -EINVAL; + } + + cond_lock.unlock(); // close lock block------>] + cluster::ack_notify(store, dpp, &d_ctl, notify_id, cookie, ret); + } + + //--------------------------------------------------------------------------- + void Background::start() + { + const DoutPrefixProvider* const dpp = &dp; + ldpp_dout(dpp, 10) << __FILE__ << "::" <<__func__ << dendl; + { + std::unique_lock pause_lock(d_cond_mutex); + if (d_ctl.started) { + // start the thread only once + ldpp_dout(dpp, 1) << "dedup_bg already started" << dendl; + return; + } + d_ctl.started = true; + } + d_runner = std::thread(&Background::run, this); + } + + //------------------------- -------------------------------------------------- + void Background::shutdown() + { + ldpp_dout(dpp, 5) <<__func__ << "::dedup_bg shutdown()" << dendl; + std::unique_lock cond_lock(d_cond_mutex); + bool nested_call = false; + if (d_ctl.shutdown_req) { + // should never happen! + ldpp_dout(dpp, 1) <<__func__ << "dedup_bg nested call" << dendl; + nested_call = true; + } + d_ctl.shutdown_req = true; + d_cond.notify_all(); + ldpp_dout(dpp, 1) <<__func__ << "dedup_bg shutdown waiting..." 
<< dendl; + d_cond.wait(cond_lock, [this]{return d_ctl.shutdown_done;}); + //cond_lock.unlock(); + + if (nested_call) { + ldpp_dout(dpp, 1) <<__func__ << "::nested call:: repeat notify" << dendl; + d_cond.notify_all(); + } + + if (d_runner.joinable()) { + ldpp_dout(dpp, 5) <<__func__ << "::dedup_bg wait join()" << dendl; + d_runner.join(); + ldpp_dout(dpp, 5) <<__func__ << "::dedup_bg finished join()" << dendl; + } + else { + ldpp_dout(dpp, 5) <<__func__ << "::dedup_bg not joinable()" << dendl; + } + + d_ctl.reset(); + } + + //--------------------------------------------------------------------------- + void Background::pause() + { + display_ioctx_state(dpp, d_dedup_cluster_ioctx, "dedup_bg->pause() request"); + std::unique_lock cond_lock(d_cond_mutex); + + if (d_ctl.local_paused || d_ctl.shutdown_done) { + cond_lock.unlock(); + ldpp_dout(dpp, 1) << __FILE__ << "::" <<__func__ + << "::dedup_bg is already paused/stopped" << dendl; + return; + } + + bool nested_call = false; + if (d_ctl.local_pause_req) { + // should never happen! + ldpp_dout(dpp, 1) <<__func__ << "::nested call" << dendl; + nested_call = true; + } + d_ctl.local_pause_req = true; + d_cond.notify_all(); + d_cond.wait(cond_lock, [this]{return d_ctl.local_paused||d_ctl.shutdown_done;}); + if (nested_call) { + ldpp_dout(dpp, 1) << "dedup_bg::nested call:: repeat notify" << dendl; + d_cond.notify_all(); + } + + // destory open watch request and pool handle before pause() is completed + unwatch_reload(dpp); + d_dedup_cluster_ioctx.close(); + ldpp_dout(dpp, 5) << "dedup_bg paused" << dendl; + } + + //--------------------------------------------------------------------------- + void Background::resume(rgw::sal::Driver* _driver) + { + ldpp_dout(dpp, 5) << "dedup_bg->resume()" << dendl; + // use lock to prevent concurrent pause/resume requests + std::unique_lock cond_lock(d_cond_mutex); + + if (!d_ctl.local_paused) { + cond_lock.unlock(); + ldpp_dout(dpp, 5) << "dedup_bg::resume thread is not paused!" 
<< dendl; + if (_driver != driver) { + ldpp_dout(dpp, 1) << "dedup_bg attempt to change driver on an active system was refused" << dendl; + } + return; + } + + driver = _driver; + // can pool change its uid between pause/resume ??? + int ret = init_rados_access_handles(false); + if (ret != 0) { + derr << "dedup_bg::resume() failed init_rados_access_handles() ret=" + << ret << "::" << cpp_strerror(-ret) << dendl; + throw std::runtime_error("Failed init_rados_access_handles()"); + } + display_ioctx_state(dpp, d_dedup_cluster_ioctx, "dedup_bg->resume() done"); + // create new watch request using the new pool handle + watch_reload(dpp); + d_ctl.local_pause_req = false; + d_ctl.local_paused = false; + + // wake up threads blocked after seeing pause state + d_cond.notify_all(); + ldpp_dout(dpp, 5) << "dedup_bg was resumed" << dendl; + } + + //--------------------------------------------------------------------------- + void Background::handle_pause_req(const char *caller) + { + ldpp_dout(dpp, 5) << __func__ << "::caller=" << caller << dendl; + ldpp_dout(dpp, 5) << __func__ << "::" << d_ctl << dendl; + while (d_ctl.local_pause_req || d_ctl.local_paused || d_ctl.remote_pause_req || d_ctl.remote_paused) { + std::unique_lock cond_lock(d_cond_mutex); + if (d_ctl.should_stop()) { + ldpp_dout(dpp, 5) << __func__ << "::should_stop!" 
<< dendl; + return; + } + + if (d_ctl.local_pause_req) { + d_ctl.local_pause_req = false; + d_ctl.local_paused = true; + } + + if (d_ctl.remote_pause_req) { + d_ctl.remote_pause_req = false; + d_ctl.remote_paused = true; + } + + d_cond.notify_all(); + + if (d_ctl.local_paused) { + ldpp_dout(dpp, 10) << __func__ << "::wait on d_ctl.local_paused" << dendl; + d_cond.wait(cond_lock, [this]{return !d_ctl.local_paused || d_ctl.should_stop() ;}); + } + + if (d_ctl.remote_paused) { + ldpp_dout(dpp, 10) << __func__ << "::wait on d_ctl.remote_paused" << dendl; + d_cond.wait(cond_lock, [this]{return !d_ctl.remote_paused || d_ctl.should_stop() || d_ctl.local_pause_req;}); + } + } // while loop + + ldpp_dout(dpp, 5) << "Dedup background thread resumed!" << dendl; + } + + //--------------------------------------------------------------------------- + void Background::work_shards_barrier(work_shard_t num_work_shards) + { + // Wait for other worker to finish ingress step + // We can move to the next step even if some token are in failed state + const unsigned MAX_WAIT_SEC = 120; // wait 2 minutes for failing members + unsigned ttl = 3; + unsigned time_elapsed = 0; + + while (true) { + int ret = d_cluster.all_work_shard_tokens_completed(store, num_work_shards); + // we start incrementing time_elapsed only after all valid tokens finish + if (ret == 0 || (time_elapsed > MAX_WAIT_SEC) ) { + break; + } + + ldpp_dout(dpp, 10) << __func__ << "::Wait for object ingress completion, ttl=" + << ttl << " seconds" << dendl; + std::unique_lock cond_lock(d_cond_mutex); + d_cond.wait_for(cond_lock, std::chrono::seconds(ttl), + [this]{return d_ctl.should_stop() || d_ctl.should_pause();}); + if (unlikely(d_ctl.should_pause())) { + handle_pause_req(__func__); + } + if (unlikely(d_ctl.should_stop())) { + return; + } + + if (ret != -EAGAIN) { + // All incomplete tokens are corrupted or in time out state + // Give them an extra 120 seconds just in case ... 
+ time_elapsed += ttl; + } + // else there are still good tokens in process, wait for them + } + + ldpp_dout(dpp, 10) << "\n\n==Object Ingress step was completed on all shards==\n" + << dendl; + if (unlikely(d_ctl.should_pause())) { + handle_pause_req(__func__); + } + } + + //--------------------------------------------------------------------------- + static bool all_md5_shards_completed(cluster *p_cluster, + rgw::sal::RadosStore *store, + md5_shard_t num_md5_shards) + { + return (p_cluster->all_md5_shard_tokens_completed(store, num_md5_shards) == 0); + } + + //--------------------------------------------------------------------------- + void Background::md5_shards_barrier(md5_shard_t num_md5_shards) + { + // Wait for others to finish step + unsigned ttl = 3; + // require that everything completed successfully before deleting the pool + while (!all_md5_shards_completed(&d_cluster, store, num_md5_shards)) { + ldpp_dout(dpp, 10) << __func__ << "::Wait for md5 completion, ttl=" + << ttl << " seconds" << dendl; + std::unique_lock cond_lock(d_cond_mutex); + d_cond.wait_for(cond_lock, std::chrono::seconds(ttl), + [this]{return d_ctl.should_stop() || d_ctl.should_pause();}); + if (unlikely(d_ctl.should_pause())) { + handle_pause_req(__func__); + } + if (unlikely(d_ctl.should_stop())) { + return; + } + } + + ldpp_dout(dpp, 10) << "\n\n==MD5 processing was completed on all shards!==\n" + << dendl; + if (unlikely(d_ctl.should_pause())) { + handle_pause_req(__func__); + } + } + + //--------------------------------------------------------------------------- + void Background::run() + { + const auto rc = ceph_pthread_setname("dedup_bg"); + ldpp_dout(dpp, 5) << __func__ << "ceph_pthread_setname() ret=" << rc << dendl; + + // 256x8KB=2MB + const uint64_t PER_SHARD_BUFFER_SIZE = DISK_BLOCK_COUNT *sizeof(disk_block_t); + ldpp_dout(dpp, 20) <<__func__ << "::dedup::main loop" << dendl; + + while (!d_ctl.shutdown_req) { + if (unlikely(d_ctl.should_pause())) { + 
handle_pause_req(__func__); + if (unlikely(d_ctl.should_stop())) { + ldpp_dout(dpp, 5) <<__func__ << "::stop req after a pause" << dendl; + d_ctl.dedup_exec = false; + } + } + + if (d_ctl.dedup_exec) { + dedup_epoch_t epoch; + if (setup(&epoch) != 0) { + ldpp_dout(dpp, 1) << __func__ << "::failed setup()" << dendl; + return; + } + const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool; + int64_t pool_id = rados_handle->pool_lookup(dedup_pool.name.c_str()); + if (pool_id < 0) { + ldpp_dout(dpp, 1) << __func__ << "::bad pool_id" << dendl; + return; + } + work_shard_t num_work_shards = epoch.num_work_shards; + md5_shard_t num_md5_shards = epoch.num_md5_shards; + const uint64_t RAW_MEM_SIZE = PER_SHARD_BUFFER_SIZE * num_md5_shards; + ldpp_dout(dpp, 5) <<__func__ << "::RAW_MEM_SIZE=" << RAW_MEM_SIZE + << "::num_work_shards=" << num_work_shards + << "::num_md5_shards=" << num_md5_shards << dendl; + // DEDUP_DYN_ALLOC + auto raw_mem = std::make_unique(RAW_MEM_SIZE); + if (raw_mem == nullptr) { + ldpp_dout(dpp, 1) << "failed slab memory allocation - size=" << RAW_MEM_SIZE << dendl; + return; + } + + process_all_shards(true, &Background::f_ingress_work_shard, raw_mem.get(), + RAW_MEM_SIZE, num_work_shards, num_md5_shards); + if (!d_ctl.should_stop()) { + // Wait for all other workers to finish ingress step + work_shards_barrier(num_work_shards); + if (!d_ctl.should_stop()) { + process_all_shards(false, &Background::f_dedup_md5_shard, raw_mem.get(), + RAW_MEM_SIZE, num_work_shards, num_md5_shards); + // Wait for all other md5 shards to finish + md5_shards_barrier(num_md5_shards); + safe_pool_delete(store, dpp, pool_id); + } + else { + ldpp_dout(dpp, 5) <<__func__ << "::stop req from barrier" << dendl; + } + } + else { + ldpp_dout(dpp, 5) <<__func__ << "::stop req from ingress_work_shard" << dendl; + } + } // dedup_exec + + std::unique_lock cond_lock(d_cond_mutex); + d_ctl.dedup_exec = false; + if (d_ctl.remote_abort_req) { + d_ctl.remote_aborted = 
true; + + d_ctl.remote_abort_req = false; + d_ctl.remote_paused = false; + d_cond.notify_all(); + ldpp_dout(dpp, 5) << __func__ << "::Dedup was aborted on a remote req" << dendl; + } + d_cond.wait(cond_lock, [this]{return d_ctl.remote_restart_req || d_ctl.should_stop() || d_ctl.should_pause();}); + if (!d_ctl.should_stop() && !d_ctl.should_pause()) { + // TBD: should we release lock here ??? + if (d_cluster.can_start_new_scan(store)) { + d_ctl.dedup_exec = true; + d_ctl.remote_aborted = false; + d_ctl.remote_paused = false; + d_ctl.remote_restart_req = false; + d_cond.notify_all(); + } + }else if (d_ctl.should_stop()) { + ldpp_dout(dpp, 5) << "main loop::should_stop::" << d_ctl << dendl; + } + else { + ldpp_dout(dpp, 5) << "main loop::should_pause::" << d_ctl << dendl; + } + } + d_ctl.shutdown_done = true; + d_cond.notify_all(); + // shutdown + ldpp_dout(dpp, 5) << __func__ << "::Dedup background thread stopped" << dendl; + } + +}; //namespace rgw::dedup diff --git a/src/rgw/driver/rados/rgw_dedup.h b/src/rgw/driver/rados/rgw_dedup.h new file mode 100644 index 000000000000..48dafe38cb1e --- /dev/null +++ b/src/rgw/driver/rados/rgw_dedup.h @@ -0,0 +1,250 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Author: Gabriel BenHanokh + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once +#include "common/dout.h" +#include "rgw_common.h" +#include "rgw_dedup_utils.h" +#include "rgw_dedup_table.h" +#include "rgw_dedup_cluster.h" +#include "rgw_realm_reloader.h" +#include +#include +#include +#include +#include + +namespace rgw::dedup { + struct dedup_epoch_t; + struct control_t { + control_t() { + reset(); + } + void reset(); + inline bool local_urgent_req() const { + return (shutdown_req || local_pause_req); + } + inline bool should_stop() const { + return (shutdown_req || remote_abort_req); + } + inline bool should_pause() const { + return (local_pause_req || remote_pause_req); + } + + // allow to start/pasue/resume/stop execution + dedup_req_type_t dedup_type = dedup_req_type_t::DEDUP_TYPE_NONE; + bool started = false; + bool dedup_exec = false; + bool shutdown_req = false; + bool shutdown_done = false; + bool local_pause_req = false; + bool local_paused = false; + bool remote_abort_req = false; + bool remote_aborted = false; + bool remote_pause_req = false; + bool remote_paused = false; + bool remote_restart_req = false; + }; + std::ostream& operator<<(std::ostream &out, const control_t &ctl); + void encode(const control_t& ctl, ceph::bufferlist& bl); + void decode(control_t& ctl, ceph::bufferlist::const_iterator& bl); + class remapper_t; + class disk_block_seq_t; + struct disk_record_t; + struct key_t; + //Interval between each execution of the script is set to 5 seconds + static inline constexpr int INIT_EXECUTE_INTERVAL = 5; + class Background : public RGWRealmReloader::Pauser { + class DedupWatcher : public librados::WatchCtx2 { + Background* const parent; + public: + DedupWatcher(Background* _parent) : parent(_parent) {} + ~DedupWatcher() override = default; + void handle_notify(uint64_t notify_id, uint64_t cookie, + uint64_t notifier_id, bufferlist& bl) override; + void handle_error(uint64_t cookie, int err) override; + }; + + public: + Background(rgw::sal::Driver* _driver, CephContext* _cct); + int 
watch_reload(const DoutPrefixProvider* dpp); + int unwatch_reload(const DoutPrefixProvider* dpp); + void handle_notify(uint64_t notify_id, uint64_t cookie, bufferlist &bl); + void start(); + void shutdown(); + void pause() override; + void resume(rgw::sal::Driver* _driver) override; + + private: + enum dedup_step_t { + STEP_NONE, + STEP_BUCKET_INDEX_INGRESS, + STEP_BUILD_TABLE, + STEP_READ_ATTRIBUTES, + STEP_REMOVE_DUPLICATES + }; + + void run(); + int setup(struct dedup_epoch_t*); + void work_shards_barrier(work_shard_t num_work_shards); + void md5_shards_barrier(md5_shard_t num_md5_shards); + void handle_pause_req(const char* caller); + const char* dedup_step_name(dedup_step_t step); + int read_buckets(); + void check_and_update_heartbeat(unsigned shard_id, uint64_t count_a, uint64_t count_b, + const char *prefix); + + inline void check_and_update_worker_heartbeat(work_shard_t worker_id, int64_t obj_count); + inline void check_and_update_md5_heartbeat(md5_shard_t md5_id, + uint64_t load_count, + uint64_t dedup_count); + int ingress_bucket_idx_single_object(disk_block_array_t &disk_arr, + const rgw::sal::Bucket *bucket, + const rgw_bucket_dir_entry &entry, + worker_stats_t *p_worker_stats /*IN-OUT*/); + int process_bucket_shards(disk_block_array_t &disk_arr, + const rgw::sal::Bucket *bucket, + std::map &oids, + librados::IoCtx &ioctx, + work_shard_t shard_id, + work_shard_t num_work_shards, + worker_stats_t *p_worker_stats /*IN-OUT*/); + int ingress_bucket_objects_single_shard(disk_block_array_t &disk_arr, + const rgw_bucket &bucket_rec, + work_shard_t worker_id, + work_shard_t num_work_shards, + worker_stats_t *p_worker_stats /*IN-OUT*/); + int objects_ingress_single_work_shard(work_shard_t worker_id, + work_shard_t num_work_shards, + md5_shard_t num_md5_shards, + worker_stats_t *p_worker_stats, + uint8_t *raw_mem, + uint64_t raw_mem_size); + int f_ingress_work_shard(unsigned shard_id, + uint8_t *raw_mem, + uint64_t raw_mem_size, + work_shard_t num_work_shards, + 
md5_shard_t num_md5_shards); + int f_dedup_md5_shard(unsigned shard_id, + uint8_t *raw_mem, + uint64_t raw_mem_size, + work_shard_t num_work_shards, + md5_shard_t num_md5_shards); + int process_all_shards(bool ingress_work_shards, + int (Background::* func)(unsigned, uint8_t*, uint64_t, work_shard_t, md5_shard_t), + uint8_t *raw_mem, + uint64_t raw_mem_size, + work_shard_t num_work_shards, + md5_shard_t num_md5_shards); + int read_bucket_stats(const rgw_bucket &bucket_rec, + uint64_t *p_num_obj, + uint64_t *p_size); + int collect_all_buckets_stats(); + int objects_dedup_single_md5_shard(dedup_table_t *p_table, + md5_shard_t md5_shard, + md5_stats_t *p_stats, + work_shard_t num_work_shards); + int add_disk_rec_from_bucket_idx(disk_block_array_t &disk_arr, + const rgw::sal::Bucket *p_bucket, + const parsed_etag_t *p_parsed_etag, + const std::string &obj_name, + uint64_t obj_size, + const std::string &storage_class); + + int add_record_to_dedup_table(dedup_table_t *p_table, + const struct disk_record_t *p_rec, + disk_block_id_t block_id, + record_id_t rec_id, + md5_stats_t *p_stats, + remapper_t *remapper); + + int process_all_slabs(dedup_table_t *p_table, + dedup_step_t step, + md5_shard_t md5_shard, + work_shard_t work_shard, + uint32_t *p_seq_count, + md5_stats_t *p_stats /* IN-OUT */, + disk_block_seq_t *p_disk_block_arr, + remapper_t *remapper); + +#ifdef FULL_DEDUP_SUPPORT + int calc_object_blake3(const disk_record_t *p_rec, uint8_t *p_hash); + int add_obj_attrs_to_record(rgw_bucket *p_rb, + disk_record_t *p_rec, + const rgw::sal::Attrs &attrs, + dedup_table_t *p_table, + md5_stats_t *p_stats); /* IN-OUT */ + + int read_object_attribute(dedup_table_t *p_table, + disk_record_t *p_rec, + disk_block_id_t block_id, + record_id_t rec_id, + md5_shard_t md5_shard, + md5_stats_t *p_stats /* IN-OUT */, + disk_block_seq_t *p_disk, + remapper_t *remapper); + int try_deduping_record(dedup_table_t *p_table, + const disk_record_t *p_rec, + disk_block_id_t block_id, + 
record_id_t rec_id, + md5_shard_t md5_shard, + md5_stats_t *p_stats, /* IN-OUT */ + remapper_t *remapper); + int inc_ref_count_by_manifest(const std::string &ref_tag, + const std::string &oid, + RGWObjManifest &manifest); + int rollback_ref_by_manifest(const std::string &ref_tag, + const std::string &oid, + RGWObjManifest &tgt_manifest); + int free_tail_objs_by_manifest(const std::string &ref_tag, + const std::string &oid, + RGWObjManifest &tgt_manifest); + int dedup_object(const disk_record_t *p_src_rec, + const disk_record_t *p_tgt_rec, + md5_stats_t *p_stats, + bool is_shared_manifest_src); +#endif + int remove_slabs(unsigned worker_id, unsigned md5_shard, uint32_t slab_count); + int init_rados_access_handles(bool init_pool); + + // private data members + rgw::sal::Driver* driver = nullptr; + rgw::sal::RadosStore* store = nullptr; + RGWRados* rados = nullptr; + librados::Rados* rados_handle = nullptr; + const DoutPrefix dp; + const DoutPrefixProvider* const dpp; + CephContext* const cct; + cluster d_cluster; + librados::IoCtx d_dedup_cluster_ioctx; + utime_t d_heart_beat_last_update; + unsigned d_heart_beat_max_elapsed_sec; + + // A pool with 6 billion objects has a 1/(2^64) chance for collison with a 128bit MD5 + uint64_t d_max_protected_objects = (6ULL * 1024 * 1024 * 1024); + uint64_t d_all_buckets_obj_count = 0; + uint64_t d_all_buckets_obj_size = 0; + // we don't benefit from deduping RGW objects smaller than head-object size + uint32_t d_min_obj_size_for_dedup = (4ULL * 1024 * 1024); + uint32_t d_head_object_size = (4ULL * 1024 * 1024); + control_t d_ctl; + uint64_t d_watch_handle = 0; + DedupWatcher d_watcher_ctx; + + std::thread d_runner; + std::mutex d_cond_mutex; + std::condition_variable d_cond; + }; + +} //namespace rgw::dedup diff --git a/src/rgw/driver/rados/rgw_dedup_cluster.cc b/src/rgw/driver/rados/rgw_dedup_cluster.cc new file mode 100644 index 000000000000..7bdb308af87c --- /dev/null +++ b/src/rgw/driver/rados/rgw_dedup_cluster.cc @@ -0,0 
+1,1346 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Author: Gabriel BenHanokh + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "rgw_dedup_cluster.h" +#include "rgw_dedup.h" +#include "rgw_dedup_epoch.h" +#include "rgw_common.h" +#include "rgw_dedup_store.h" +#include "include/rados/rados_types.hpp" +#include "include/rados/buffer.h" +#include "include/rados/librados.hpp" +#include "svc_zone.h" +#include "common/Clock.h" // for ceph_clock_now() +#include "common/config.h" +#include "common/Cond.h" +#include "common/debug.h" +#include "common/errno.h" +#include "rgw_common.h" +#include "include/denc.h" +#include "rgw_sal.h" +#include "driver/rados/rgw_sal_rados.h" +#include +#include +#include + +namespace rgw::dedup { + const char* DEDUP_EPOCH_TOKEN = "EPOCH_TOKEN"; + const char* DEDUP_WATCH_OBJ = "DEDUP_WATCH_OBJ"; + + static constexpr unsigned EPOCH_MAX_LOCK_DURATION_SEC = 30; + struct shard_progress_t; + static int collect_shard_stats(rgw::sal::RadosStore *store, + const DoutPrefixProvider *dpp, + utime_t epoch_time, + unsigned shards_count, + const char *prefix, + bufferlist bl_arr[], + struct shard_progress_t *sp_arr); + + const uint64_t SP_ALL_OBJECTS = ULLONG_MAX; + const uint64_t SP_NO_OBJECTS = 0ULL; + const char* SHARD_PROGRESS_ATTR = "shard_progress"; + + //--------------------------------------------------------------------------- + static int get_control_ioctx(rgw::sal::RadosStore *store, + const DoutPrefixProvider *dpp, + librados::IoCtx &ctl_ioctx /* OUT-PARAM */) + { + const auto& control_pool = store->svc()->zone->get_zone_params().control_pool; + auto rados_handle = store->getRados()->get_rados_handle(); + int ret = rgw_init_ioctx(dpp, rados_handle, control_pool, 
ctl_ioctx); + if (unlikely(ret < 0)) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed rgw_init_ioctx() for control_pool ret=" + << ret << "::" << cpp_strerror(-ret) << dendl; + } + return ret; + } + + //--------------------------------------------------------------------------- + static int get_epoch(rgw::sal::RadosStore *store, + const DoutPrefixProvider *dpp, + dedup_epoch_t *p_epoch, /* OUT */ + const char *caller) + { + librados::IoCtx ctl_ioctx; + int ret = get_control_ioctx(store, dpp, ctl_ioctx); + if (unlikely(ret != 0)) { + return ret; + } + + std::string oid(DEDUP_EPOCH_TOKEN); + bufferlist bl; + ret = ctl_ioctx.getxattr(oid, RGW_DEDUP_ATTR_EPOCH, bl); + if (ret > 0) { + try { + auto p = bl.cbegin(); + decode(*p_epoch, p); + }catch (const buffer::error&) { + ldpp_dout(dpp, 0) << __func__ << "::failed epoch decode!" << dendl; + return -EINVAL; + } + if (caller) { + ldpp_dout(dpp, 10) << __func__ << "::"<< caller<< "::" << *p_epoch << dendl; + } + return 0; + } + else { + // zero length read means no data + if (ret == 0) { + ret = -ENODATA; + } + ldpp_dout(dpp, 10) << __func__ << "::" << (caller ? caller : "") + << "::failed ctl_ioctx.getxattr() with: " + << cpp_strerror(-ret) << ", ret=" << ret << dendl; + return ret; + } + } + + //--------------------------------------------------------------------------- + static int set_epoch(rgw::sal::RadosStore *store, + const std::string &cluster_id, + const DoutPrefixProvider *dpp, + work_shard_t num_work_shards, + md5_shard_t num_md5_shards) + { + librados::IoCtx ctl_ioctx; + int ret = get_control_ioctx(store, dpp, ctl_ioctx); + if (unlikely(ret != 0)) { + return ret; + } + + std::string oid(DEDUP_EPOCH_TOKEN); + ldpp_dout(dpp, 10) << __func__ << "::oid=" << oid << dendl; + bool exclusive = true; // block overwrite of old objects + ret = ctl_ioctx.create(oid, exclusive); + if (ret >= 0) { + ldpp_dout(dpp, 10) << __func__ << "::successfully created Epoch object!" 
<< dendl; + // now try and take ownership + } + else if (ret == -EEXIST) { + ldpp_dout(dpp, 10) << __func__ << "::Epoch object exists -> trying to take over" << dendl; + // try and take ownership + } + else{ + ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to create " << oid + <<" with: "<< cpp_strerror(-ret) << ", ret=" << ret <serial + 1, dedup_type, + ceph_clock_now(), num_work_shards, num_md5_shards}; + bufferlist old_epoch_bl, new_epoch_bl, err_bl; + encode(*p_old_epoch, old_epoch_bl); + encode(new_epoch, new_epoch_bl); + librados::ObjectWriteOperation op; + op.cmpxattr(RGW_DEDUP_ATTR_EPOCH, CEPH_OSD_CMPXATTR_OP_EQ, old_epoch_bl); + op.setxattr(RGW_DEDUP_ATTR_EPOCH, new_epoch_bl); + + ldpp_dout(dpp, 10) << __func__ << "::send EPOCH CLS" << dendl; + std::string oid(DEDUP_EPOCH_TOKEN); + ret = ctl_ioctx.operate(oid, &op); + if (ret != 0) { + ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ctl_ioctx.operate(" + << oid << "), err is " << cpp_strerror(-ret) << dendl; + } + + return ret; + } + + //--------------------------------------------------------------------------- + struct shard_progress_t { + shard_progress_t() { + // init an empty object + this->progress_a = SP_NO_OBJECTS; + this->progress_b = SP_NO_OBJECTS; + this->completed = false; + + // set all timers to now + this->creation_time = utime_t(); + this->completion_time = utime_t(); + this->update_time = utime_t(); + + // owner and stats_bl are empty until set + } + + shard_progress_t(uint64_t _progress_a, + uint64_t _progress_b, + bool _completed, + const std::string &_owner, + const bufferlist &_stats_bl) : owner(_owner), stats_bl(_stats_bl) { + this->progress_a = _progress_a; + this->progress_b = _progress_b; + this->completed = _completed; + + utime_t now = ceph_clock_now(); + this->update_time = now; + + if (_progress_a == SP_NO_OBJECTS && _progress_b == SP_NO_OBJECTS) { + this->creation_time = now; + } + if (_completed) { + this->completion_time = now; + } + } + + bool is_completed() const { + if 
(this->progress_b == SP_ALL_OBJECTS) { + ceph_assert(this->completed); + return true; + } + else { + ceph_assert(!this->completed); + return false; + } + } + + bool was_not_started() const { + return (this->creation_time == this->update_time); + } + + uint64_t progress_a; + uint64_t progress_b; + bool completed; + utime_t update_time; + utime_t creation_time; + utime_t completion_time; + std::string owner; + bufferlist stats_bl; + }; + + //--------------------------------------------------------------------------- + std::ostream& operator<<(std::ostream &out, shard_progress_t& sp) + { + out << (sp.completed ? " + ::" : " - ::"); + out << sp.owner << "::[" << sp.progress_a << ", " << sp.progress_b << "]"; + out << "::creation: " << sp.creation_time; + out << "::update: " << sp.update_time; + out << "::completion: " << sp.completion_time; + return out; + } + + //--------------------------------------------------------------------------- + void encode(const shard_progress_t& sp, ceph::bufferlist& bl) + { + ENCODE_START(1, 1, bl); + encode(sp.progress_a, bl); + encode(sp.progress_b, bl); + encode(sp.completed, bl); + encode(sp.creation_time, bl); + encode(sp.completion_time, bl); + encode(sp.update_time, bl); + encode(sp.owner, bl); + encode(sp.stats_bl, bl); + ENCODE_FINISH(bl); + } + + //--------------------------------------------------------------------------- + void decode(shard_progress_t & sp, ceph::bufferlist::const_iterator& bl) + { + DECODE_START(1, bl); + decode(sp.progress_a, bl); + decode(sp.progress_b, bl); + decode(sp.completed, bl); + decode(sp.creation_time, bl); + decode(sp.completion_time, bl); + decode(sp.update_time, bl); + decode(sp.owner, bl); + decode(sp.stats_bl, bl); + DECODE_FINISH(bl); + } + + //========================================================================== + + //--------------------------------------------------------------------------- + void cluster::clear() + { + d_curr_md5_shard = 0; + d_curr_worker_shard = 0; + + 
d_num_completed_workers = 0; + d_num_completed_md5 = 0; + + memset(d_completed_workers, TOKEN_STATE_PENDING, sizeof(d_completed_workers)); + memset(d_completed_md5, TOKEN_STATE_PENDING, sizeof(d_completed_md5)); + } + + + static constexpr auto COOKIE_LEN = 15; + static constexpr auto CLUSTER_ID_LEN = 15; + //--------------------------------------------------------------------------- + cluster::cluster(const DoutPrefixProvider *_dpp, + CephContext *cct, + rgw::sal::Driver* driver): + dpp(_dpp), + d_lock_cookie(gen_rand_alphanumeric(cct, COOKIE_LEN)), + d_cluster_id (gen_rand_alphanumeric(cct, CLUSTER_ID_LEN)) + { + clear(); + } + + //--------------------------------------------------------------------------- + int cluster::reset(rgw::sal::RadosStore *store, + dedup_epoch_t *p_epoch, + work_shard_t num_work_shards, + md5_shard_t num_md5_shards) + { + ldpp_dout(dpp, 10) << __func__ << "::REQ num_work_shards=" << num_work_shards + << "::num_md5_shards=" << num_md5_shards << dendl; + clear(); + + while (true) { + int ret = get_epoch(store, dpp, p_epoch, __func__); + if (ret != 0) { + return ret; + } + if (p_epoch->num_work_shards && p_epoch->num_md5_shards) { + ldpp_dout(dpp, 10) << __func__ << "::ACC num_work_shards=" << p_epoch->num_work_shards + << "::num_md5_shards=" << p_epoch->num_md5_shards << dendl; + break; + } + else if (!num_work_shards && !num_md5_shards) { + ldpp_dout(dpp, 10) << __func__ << "::Init flow, no need to wait" << dendl; + break; + } + else { + ret = swap_epoch(store, dpp, p_epoch, + static_cast (p_epoch->dedup_type), + num_work_shards, num_md5_shards); + } + } + + d_epoch_time = p_epoch->time; + // retry cleanup 3 times before declaring failure + const unsigned RETRY_LIMIT = 3; + int ret = 1; + for (unsigned i = 0; i < RETRY_LIMIT && ret != 0; i++) { + ret = cleanup_prev_run(store); + } + if (ret != 0) { + return ret; + } + + create_shard_tokens(store, p_epoch->num_work_shards, WORKER_SHARD_PREFIX); + create_shard_tokens(store, 
p_epoch->num_md5_shards, MD5_SHARD_PREFIX); + + ret = verify_all_shard_tokens(store, p_epoch->num_work_shards, + WORKER_SHARD_PREFIX); + if (ret != 0) { + return ret; + } + return verify_all_shard_tokens(store, p_epoch->num_md5_shards, + MD5_SHARD_PREFIX); + } + + //--------------------------------------------------------------------------- + int cluster::cleanup_prev_run(rgw::sal::RadosStore *store) + { + librados::IoCtx ctl_ioctx; + int ret = get_control_ioctx(store, dpp, ctl_ioctx); + if (unlikely(ret != 0)) { + return ret; + } + + int error_code = 0; + constexpr uint32_t max = 100; + std::string marker; + bool truncated = false; + rgw::AccessListFilter filter{}; + unsigned deleted_count = 0, skipped_count = 0; + unsigned failed_count = 0, no_entry_count = 0; + do { + std::vector oids; + int ret = rgw_list_pool(dpp, ctl_ioctx, max, filter, marker, &oids, &truncated); + if (ret == -ENOENT) { + ldpp_dout(dpp, 10) << __func__ << "::rgw_list_pool() ret == -ENOENT"<< dendl; + break; + } + else if (ret < 0) { + ldpp_dout(dpp, 1) << "failed rgw_list_pool()! ret=" << ret + << "::" << cpp_strerror(-ret) << dendl; + return ret; + } + + for (const std::string& oid : oids) { + if (shard_token_oid::legal_oid_name(oid) == false) { + ldpp_dout(dpp, 10) << __func__ << "::skipping " << oid << dendl; + skipped_count++; + continue; + } + + uint64_t size; + struct timespec tspec; + ret = ctl_ioctx.stat2(oid, &size, &tspec); + if (ret == -ENOENT) { + ldpp_dout(dpp, 20) << __func__ << "::" << oid + << " was removed by others" << dendl; + no_entry_count++; + continue; + } + else if (ret != 0) { + ldpp_dout(dpp, 10) << __func__ << "::failed ctl_ioctx.stat( " + << oid << " )" << dendl; + error_code = ret; + failed_count++; + continue; + } + utime_t mtime(tspec); + if (d_epoch_time < mtime) { + ldpp_dout(dpp, 10) << __func__ << "::skipping new obj! 
" + << "::EPOCH={" << d_epoch_time.tv.tv_sec << ":" << d_epoch_time.tv.tv_nsec << "} " + << "::mtime={" << mtime.tv.tv_sec << ":" << mtime.tv.tv_nsec << "}" << dendl; + skipped_count++; + continue; + } + ldpp_dout(dpp, 10) << __func__ << "::removing object: " << oid << dendl; + ret = ctl_ioctx.remove(oid); + if (ret == 0) { + deleted_count++; + } + else if (ret == -ENOENT) { + ldpp_dout(dpp, 20) << __func__ << "::" << oid + << " was removed by others" << dendl; + no_entry_count++; + continue; + } + else { + error_code = ret; + failed_count++; + ldpp_dout(dpp, 10) << __func__ << "::failed ctl_ioctx.remove( " << oid + << " ), ret=" << ret << "::" << cpp_strerror(-ret) << dendl; + } + } + ldpp_dout(dpp, 10) << __func__ << "::oids.size()=" << oids.size() + << "::deleted=" << deleted_count + << "::failed=" << failed_count + << "::no entry=" << no_entry_count + << "::skipped=" << skipped_count << dendl; + } while (truncated); + + return error_code; + } + + //--------------------------------------------------------------------------- + int cluster::create_shard_tokens(rgw::sal::RadosStore *store, + unsigned shards_count, + const char *prefix) + { + librados::IoCtx ctl_ioctx; + int ret = get_control_ioctx(store, dpp, ctl_ioctx); + if (unlikely(ret != 0)) { + return ret; + } + + shard_token_oid sto(prefix); + for (unsigned shard = 0; shard < shards_count; shard++) { + sto.set_shard(shard); + std::string oid(sto.get_buff(), sto.get_buff_size()); + ldpp_dout(dpp, 15) << __func__ << "::creating object: " << oid << dendl; + bool exclusive = true; + ret = ctl_ioctx.create(oid, exclusive); + if (ret >= 0) { + ldpp_dout(dpp, 15) << __func__ << "::oid=" << oid << " was created!" << dendl; + } + else if (ret == -EEXIST) { + ldpp_dout(dpp, 15) << __func__ << "::failed ctl_ioctx.create(" + << oid << ") -EEXIST!" << dendl; + } + else { + // TBD: can it happen legally ? 
+ ldpp_dout(dpp, 1) << __func__ << "::failed ctl_ioctx.create(" << oid + << ") with: " << ret << "::" << cpp_strerror(-ret) << dendl; + } + } + + return 0; + } + + //--------------------------------------------------------------------------- + int cluster::verify_all_shard_tokens(rgw::sal::RadosStore *store, + unsigned shards_count, + const char *prefix) + { + librados::IoCtx ctl_ioctx; + int ret = get_control_ioctx(store, dpp, ctl_ioctx); + if (unlikely(ret != 0)) { + return ret; + } + + shard_token_oid sto(prefix); + for (unsigned shard = 0; shard < shards_count; shard++) { + sto.set_shard(shard); + std::string oid(sto.get_buff(), sto.get_buff_size()); + ldpp_dout(dpp, 10) << __func__ << "::checking object: " << oid << dendl; + + uint64_t size; + struct timespec tspec; + ret = ctl_ioctx.stat2(oid, &size, &tspec); + if (ret != 0) { + ldpp_dout(dpp, 5) << __func__ << "::failed ctl_ioctx.stat( " << oid << " )" + << "::shards_count=" << shards_count << dendl; + return ret; + } + } + + return 0; + } + + //--------------------------------------------------------------------------- + int cluster::update_shard_token_heartbeat(rgw::sal::RadosStore *store, + unsigned shard, + uint64_t count_a, + uint64_t count_b, + const char *prefix) + { + librados::IoCtx ctl_ioctx; + int ret = get_control_ioctx(store, dpp, ctl_ioctx); + if (unlikely(ret != 0)) { + return ret; + } + + shard_token_oid sto(prefix, shard); + std::string oid(sto.get_buff(), sto.get_buff_size()); + bufferlist empty_bl; + shard_progress_t sp(count_a, count_b, false, d_cluster_id, empty_bl); + sp.creation_time = d_token_creation_time; + bufferlist sp_bl; + encode(sp, sp_bl); + return ctl_ioctx.setxattr(oid, SHARD_PROGRESS_ATTR, sp_bl); + } + + //--------------------------------------------------------------------------- + int cluster::mark_shard_token_completed(rgw::sal::RadosStore *store, + unsigned shard, + uint64_t obj_count, + const char *prefix, + const bufferlist &bl) + { + librados::IoCtx ctl_ioctx; + int 
ret = get_control_ioctx(store, dpp, ctl_ioctx); + if (unlikely(ret != 0)) { + return ret; + } + + shard_token_oid sto(prefix, shard); + std::string oid(sto.get_buff(), sto.get_buff_size()); + ldpp_dout(dpp, 10) << __func__ << "::" << prefix << "::" << oid << dendl; + + shard_progress_t sp(obj_count, SP_ALL_OBJECTS, true, d_cluster_id, bl); + sp.creation_time = d_token_creation_time; + bufferlist sp_bl; + encode(sp, sp_bl); + ret = ctl_ioctx.setxattr(oid, SHARD_PROGRESS_ATTR, sp_bl); + if (ret == 0) { + ldpp_dout(dpp, 10) << __func__ << "::Done ctl_ioctx.setxattr(" << oid << ")" + << dendl; + } + else { + ldpp_dout(dpp, 0) << __func__ << "::Failed ctl_ioctx.setxattr(" << oid + << ") ret=" << ret << "::" << cpp_strerror(-ret) << dendl; + } + + return ret; + } + + //--------------------------------------------------------------------------- + int32_t cluster::get_next_shard_token(rgw::sal::RadosStore *store, + uint16_t start_shard, + uint16_t max_shard, + const char *prefix) + { + librados::IoCtx ctl_ioctx; + int ret = get_control_ioctx(store, dpp, ctl_ioctx); + if (unlikely(ret != 0)) { + return ret; + } + + // lock paramters: + const utime_t lock_duration; // zero duration means lock doesn't expire + const uint8_t lock_flags = 0; // no flags + const std::string lock_tag; // no tag + + shard_token_oid sto(prefix); + for (auto shard = start_shard; shard < max_shard; shard++) { + sto.set_shard(shard); + std::string oid(sto.get_buff(), sto.get_buff_size()); + ldpp_dout(dpp, 10) << __func__ << "::try garbbing " << oid << dendl; + librados::ObjectWriteOperation op; + op.assert_exists(); + rados::cls::lock::lock(&op, oid, ClsLockType::EXCLUSIVE, d_lock_cookie, + lock_tag, "dedup_shard_token", lock_duration, lock_flags); + ret = rgw_rados_operate(dpp, ctl_ioctx, oid, std::move(op), null_yield); + if (ret == -EBUSY) { + // someone else took this token -> move to the next one + ldpp_dout(dpp, 10) << __func__ << "::Failed lock. 
" << oid << + " is owned by other rgw" << dendl; + continue; + } + else if (ret == -ENOENT) { + // token is deleted - processing will stop the next time we try to read from the queue + ldpp_dout(dpp, 5) << __func__ << "::" << oid + << " token doesn't exist, fail lock!" << dendl; + continue; + } + else if (ret < 0) { + // failed to lock for another reason, continue to process other queues + ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to lock token: " << oid + << ":: ret=" << ret << "::" << cpp_strerror(-ret) << dendl; + //has_error = true; + continue; + } + ldpp_dout(dpp, 10) << __func__ << "::successfully locked " << oid << dendl; + bufferlist empty_bl; + shard_progress_t sp(SP_NO_OBJECTS, SP_NO_OBJECTS, false, d_cluster_id, empty_bl); + d_token_creation_time = sp.creation_time; + bufferlist sp_bl; + encode(sp, sp_bl); + ret = ctl_ioctx.setxattr(oid, SHARD_PROGRESS_ATTR, sp_bl); + if (ret == 0) { + ldpp_dout(dpp, 10) << __func__ << "::SUCCESS!::" << oid << dendl; + return shard; + } + } + + return NULL_SHARD; + } + + //--------------------------------------------------------------------------- + work_shard_t cluster::get_next_work_shard_token(rgw::sal::RadosStore *store, + work_shard_t num_work_shards) + { + int32_t shard = get_next_shard_token(store, d_curr_worker_shard, + num_work_shards, WORKER_SHARD_PREFIX); + if (shard >= 0 && shard < num_work_shards) { + d_curr_worker_shard = shard + 1; + return shard; + } + else { + return NULL_WORK_SHARD; + } + } + + //--------------------------------------------------------------------------- + md5_shard_t cluster::get_next_md5_shard_token(rgw::sal::RadosStore *store, + md5_shard_t num_md5_shards) + { + int32_t shard = get_next_shard_token(store, d_curr_md5_shard, num_md5_shards, + MD5_SHARD_PREFIX); + if (shard >= 0 && shard < num_md5_shards) { + d_curr_md5_shard = shard + 1; + return shard; + } + else { + return NULL_MD5_SHARD; + } + } + + 
//--------------------------------------------------------------------------- + int cluster::all_shard_tokens_completed(rgw::sal::RadosStore *store, + unsigned shards_count, + const char *prefix, + uint16_t *p_num_completed, + uint8_t completed_arr[]) + { + librados::IoCtx ctl_ioctx; + int ret = get_control_ioctx(store, dpp, ctl_ioctx); + if (unlikely(ret != 0)) { + return ret; + } + + int err_code = 0; + unsigned count = 0; + shard_token_oid sto(prefix); + for (unsigned shard = 0; shard < shards_count; shard++) { + if (completed_arr[shard] == TOKEN_STATE_COMPLETED) { + count++; + continue; + } + + sto.set_shard(shard); + std::string oid(sto.get_buff(), sto.get_buff_size()); + ldpp_dout(dpp, 10) << __func__ << "::checking object: " << oid << dendl; + bufferlist bl; + ret = ctl_ioctx.getxattr(oid, SHARD_PROGRESS_ATTR, bl); + if (unlikely(ret <= 0)) { + if (ret != -ENODATA) { + ldpp_dout(dpp, 10) << __func__ << "::failed ctl_ioctx.getxattr() ret=" + << ret << "::" << cpp_strerror(-ret) << dendl; + } + completed_arr[shard] = TOKEN_STATE_CORRUPTED; + // all failures to get valid token state return ENODATA + err_code = -ENODATA; + continue; + } + + shard_progress_t sp; + try { + auto p = bl.cbegin(); + decode(sp, p); + } + catch (const buffer::error&) { + ldpp_dout(dpp, 1) << __func__ << "::failed shard_progress_t decode!" << dendl; + completed_arr[shard] = TOKEN_STATE_CORRUPTED; + // all failures to get valid token state return ENODATA + err_code = -ENODATA; + continue; + } + + if (sp.is_completed()) { + utime_t duration = sp.completion_time - sp.creation_time; + // mark token completed; + (*p_num_completed)++; + completed_arr[shard] = TOKEN_STATE_COMPLETED; + ldpp_dout(dpp, 20) << __func__ << "::" << oid + << "::completed! duration=" << duration << dendl; + count++; + } + else if (sp.was_not_started()) { + // token was not started yet + // TBD: + // If it is not locked we can process it (by why we skipped it)?? 
+ // If locked, check when it was done and if timed-out + ldpp_dout(dpp, 10) << __func__ << "::" << oid + << "::was not started, skipping" << dendl; + return -EAGAIN; + } + else { + static const utime_t heartbeat_timeout(EPOCH_MAX_LOCK_DURATION_SEC, 0); + utime_t time_elapsed = ceph_clock_now() - sp.update_time; + if (time_elapsed > heartbeat_timeout) { + // lock expired -> try and break lock + ldpp_dout(dpp, 5) << __func__ << "::" << oid + << "::expired lock, skipping:" << time_elapsed + << "::" << sp << dendl; + completed_arr[shard] = TOKEN_STATE_TIMED_OUT; + err_code = -ETIME; + continue; + } + else { + return -EAGAIN; + } + } + } // loop + + if (count < shards_count) { + unsigned n = shards_count - count; + ldpp_dout(dpp, 10) << __func__ << "::waiting for " << n << " tokens" << dendl; + } + return err_code; + } + + //--------------------------------------------------------------------------- + static int collect_shard_stats(rgw::sal::RadosStore *store, + const DoutPrefixProvider *dpp, + utime_t epoch_time, + unsigned shards_count, + const char *prefix, + bufferlist bl_arr[], + shard_progress_t *sp_arr) + { + librados::IoCtx ctl_ioctx; + int ret = get_control_ioctx(store, dpp, ctl_ioctx); + if (unlikely(ret != 0)) { + return ret; + } + + unsigned count = 0; + cluster::shard_token_oid sto(prefix); + for (unsigned shard = 0; shard < shards_count; shard++) { + sto.set_shard(shard); + std::string oid(sto.get_buff(), sto.get_buff_size()); + ldpp_dout(dpp, 20) << __func__ << "::checking object: " << oid << dendl; + + uint64_t size; + struct timespec tspec; + if (ctl_ioctx.stat2(oid, &size, &tspec) != 0) { + ldpp_dout(dpp, 10) << __func__ << "::failed ctl_ioctx.stat( " << oid << " )" + << "::shards_count=" << shards_count << dendl; + continue; + } + utime_t mtime(tspec); + if (epoch_time > mtime) { + ldpp_dout(dpp, 10) << __func__ << "::skipping old obj! 
" + << "::EPOCH={" << epoch_time.tv.tv_sec << ":" << epoch_time.tv.tv_nsec << "} " + << "::mtime={" << mtime.tv.tv_sec << ":" << mtime.tv.tv_nsec << "}" << dendl; + continue; + } + + shard_progress_t sp; + bufferlist bl; + ret = ctl_ioctx.getxattr(oid, SHARD_PROGRESS_ATTR, bl); + if (ret > 0) { + try { + auto p = bl.cbegin(); + decode(sp, p); + sp_arr[shard] = sp; + count++; + } + catch (const buffer::error&) { + ldpp_dout(dpp, 10) << __func__ << "::(1)failed shard_progress_t decode!" << dendl; + return -EINVAL; + } + } + else if (ret != -ENODATA) { + ldpp_dout(dpp, 10) << __func__ << "::" << oid << "::failed getxattr() ret=" + << ret << "::" << cpp_strerror(-ret) << dendl; + continue; + } + bl_arr[shard] = sp.stats_bl; + } + + if (count != shards_count) { + ldpp_dout(dpp, 10) << __func__ << "::missing shards stats! we got " + << count << " / " << shards_count << dendl; + } + + return count; + } + + struct member_time_t { + utime_t start_time; + utime_t end_time; + utime_t aggregated_time; + }; + + //--------------------------------------------------------------------------- + static void collect_single_shard_stats(const DoutPrefixProvider *dpp, + std::map &owner_map, + const shard_progress_t sp_arr[], + unsigned shard, + bool *p_show_time, + const char *name) + { + const utime_t null_time; + const shard_progress_t &sp = sp_arr[shard]; + if (sp.creation_time == null_time || sp.completion_time == null_time) { + *p_show_time = false; + return; + } + + const std::string &owner = sp.owner; + utime_t duration = sp.completion_time - sp.creation_time; + if (owner_map.find(owner) != owner_map.end()) { + owner_map[owner].aggregated_time += duration; + owner_map[owner].end_time = sp.completion_time; + } + else { + owner_map[owner].start_time = sp.creation_time; + owner_map[owner].aggregated_time = duration; + owner_map[owner].end_time = sp.completion_time; + } + ldpp_dout(dpp, 10) << __func__ << "::Got " << name + << " stats for shard #" << shard << dendl; + } + + 
//--------------------------------------------------------------------------- + static void show_incomplete_shards_fmt(bool has_incomplete_shards, + unsigned num_shards, + const shard_progress_t sp_arr[], + Formatter *fmt) + + { + if (!has_incomplete_shards) { + return; + } + Formatter::ArraySection array_section{*fmt, "incomplete_shards"}; + for (unsigned shard = 0; shard < num_shards; shard++) { + if (sp_arr[shard].is_completed() ) { + continue; + } + Formatter::ObjectSection object_section{*fmt, "shard_progress"}; + fmt->dump_unsigned("shard_id", shard); + fmt->dump_string("owner", sp_arr[shard].owner); + fmt->dump_unsigned("progress_a", sp_arr[shard].progress_a); + fmt->dump_unsigned("progress_b", sp_arr[shard].progress_b); + fmt->dump_stream("last updated") << sp_arr[shard].update_time; + } + } + + //--------------------------------------------------------------------------- + static utime_t show_time_func_fmt(const utime_t &start_time, + bool show_time, + const std::map &owner_map, + Formatter *fmt) + { + member_time_t all_members_time; + all_members_time.start_time = start_time; + all_members_time.end_time = start_time; + all_members_time.aggregated_time = utime_t(); + + Formatter::ObjectSection section{*fmt, "time"}; + { + Formatter::ArraySection array_section{*fmt, "per-shard time"}; + for (const auto& [owner, value] : owner_map) { + uint32_t sec = value.end_time.tv.tv_sec - value.start_time.tv.tv_sec; + fmt->dump_stream("member time") + << owner << "::start time = [" << value.start_time.tv.tv_sec % 1000 + << ":" << value.start_time.tv.tv_nsec / (1000*1000) << "] " + << "::aggregated time = " << value.aggregated_time.tv.tv_sec + << "(" << sec << ") seconds"; + all_members_time.aggregated_time += value.aggregated_time; + if (all_members_time.end_time < value.end_time) { + all_members_time.end_time = value.end_time; + } + } + } + + if (show_time) { + uint32_t sec = all_members_time.end_time.tv.tv_sec - all_members_time.start_time.tv.tv_sec; + + 
Formatter::ObjectSection section{*fmt, "All shards time"}; + fmt->dump_stream("start time") << all_members_time.start_time; + fmt->dump_stream("end time") + << all_members_time.end_time << " (" << sec << " seconds total)"; + fmt->dump_unsigned("aggregated time (sec)", all_members_time.aggregated_time.tv.tv_sec); + } + + return all_members_time.end_time; + } + + //--------------------------------------------------------------------------- + static void show_dedup_ratio_estimate_fmt(const worker_stats_t &wrk_stats_sum, + const md5_stats_t &md5_stats_sum, + Formatter *fmt) + { + uint64_t s3_bytes_before = wrk_stats_sum.ingress_obj_bytes; + uint64_t s3_dedup_bytes = md5_stats_sum.big_objs_stat.dedup_bytes_estimate; + uint64_t s3_bytes_after = s3_bytes_before - s3_dedup_bytes; + Formatter::ObjectSection section{*fmt, "dedup_ratio_estimate"}; + fmt->dump_unsigned("s3_bytes_before", s3_bytes_before); + fmt->dump_unsigned("s3_bytes_after", s3_bytes_after); + fmt->dump_unsigned("dup_head_bytes", md5_stats_sum.dup_head_bytes_estimate); + + if (s3_bytes_before > s3_bytes_after && s3_bytes_after) { + double dedup_ratio = (double)s3_bytes_before/s3_bytes_after; + fmt->dump_float("dedup_ratio", dedup_ratio); + } + else { + fmt->dump_float("dedup_ratio", 0); + } + } + + //--------------------------------------------------------------------------- + static void show_dedup_ratio_actual_fmt(const worker_stats_t &wrk_stats_sum, + const md5_stats_t &md5_stats_sum, + Formatter *fmt) + { + uint64_t s3_bytes_before = wrk_stats_sum.ingress_obj_bytes; + uint64_t s3_dedup_bytes = (md5_stats_sum.deduped_objects_bytes + + md5_stats_sum.shared_manifest_dedup_bytes); + uint64_t s3_bytes_after = s3_bytes_before - s3_dedup_bytes; + + Formatter::ObjectSection section{*fmt, "dedup_ratio_actual"}; + fmt->dump_unsigned("s3_bytes_before", s3_bytes_before); + fmt->dump_unsigned("s3_bytes_after", s3_bytes_after); + fmt->dump_unsigned("dup_head_bytes", md5_stats_sum.dup_head_bytes); + if (s3_bytes_before 
> s3_bytes_after && s3_bytes_after) { + double dedup_ratio = (double)s3_bytes_before/s3_bytes_after; + fmt->dump_float("dedup_ratio", dedup_ratio); + } + else { + fmt->dump_float("dedup_ratio", 0); + } + } + + //--------------------------------------------------------------------------- + // command-line called from radosgw-admin.cc + int cluster::collect_all_shard_stats(rgw::sal::RadosStore *store, + Formatter *fmt, + const DoutPrefixProvider *dpp) + { + dedup_epoch_t epoch; + int ret = get_epoch(store, dpp, &epoch, nullptr); + if (ret != 0) { + return ret; + } + + Formatter::ObjectSection section{*fmt, "DEDUP STAT COUNTERS"}; + work_shard_t num_work_shards = epoch.num_work_shards; + md5_shard_t num_md5_shards = epoch.num_md5_shards; + + unsigned completed_work_shards_count = 0; + unsigned completed_md5_shards_count = 0; + utime_t md5_start_time; + worker_stats_t wrk_stats_sum; + { + std::map owner_map; + bool show_time = true; + bufferlist bl_arr[num_work_shards]; + shard_progress_t sp_arr[num_work_shards]; + int cnt = collect_shard_stats(store, dpp, epoch.time, num_work_shards, + WORKER_SHARD_PREFIX, bl_arr, sp_arr); + if (cnt != num_work_shards && 0) { + std::cerr << ">>>Partial work shard stats recived " << cnt << " / " + << num_work_shards << "\n" << std::endl; + } + bool has_incomplete_shards = false; + for (unsigned shard = 0; shard < num_work_shards; shard++) { + if (bl_arr[shard].length() == 0) { + has_incomplete_shards = true; + continue; + } + completed_work_shards_count++; + worker_stats_t stats; + try { + auto p = bl_arr[shard].cbegin(); + decode(stats, p); + wrk_stats_sum += stats; + }catch (const buffer::error&) { + // TBD: can we use std::cerr or should we use formatter ?? 
+ std::cerr << __func__ << "::(2)failed worker_stats_t decode #" << shard << std::endl; + continue; + } + collect_single_shard_stats(dpp, owner_map, sp_arr, shard, &show_time, "WORKER"); + } + Formatter::ObjectSection worker_stats(*fmt, "worker_stats"); + wrk_stats_sum.dump(fmt); + show_incomplete_shards_fmt(has_incomplete_shards, num_work_shards, sp_arr, fmt); + md5_start_time = show_time_func_fmt(epoch.time, show_time, owner_map, fmt); + } + + if (completed_work_shards_count == num_work_shards) { + std::map owner_map; + bool show_time = true; + md5_stats_t md5_stats_sum; + bufferlist bl_arr[num_md5_shards]; + shard_progress_t sp_arr[num_md5_shards]; + int cnt = collect_shard_stats(store, dpp, epoch.time, num_md5_shards, + MD5_SHARD_PREFIX, bl_arr, sp_arr); + if (cnt != num_md5_shards && 0) { + std::cerr << ">>>Partial MD5_SHARD stats recived " << cnt << " / " + << num_md5_shards << "\n" << std::endl; + } + bool has_incomplete_shards = false; + for (unsigned shard = 0; shard < num_md5_shards; shard++) { + if (bl_arr[shard].length() == 0) { + has_incomplete_shards = true; + continue; + } + completed_md5_shards_count++; + md5_stats_t stats; + try { + auto p = bl_arr[shard].cbegin(); + decode(stats, p); + md5_stats_sum += stats; + }catch (const buffer::error&) { + // TBD: can we use std::cerr or should we use formatter ?? 
+ std::cerr << __func__ << "::failed md5_stats_t decode #" << shard << std::endl; + continue; + } + collect_single_shard_stats(dpp, owner_map, sp_arr, shard, &show_time, "MD5"); + } + { + Formatter::ObjectSection outer(*fmt, "md5_stats"); + md5_stats_sum.dump(fmt); + show_incomplete_shards_fmt(has_incomplete_shards, num_md5_shards, sp_arr, fmt); + show_time_func_fmt(md5_start_time, show_time, owner_map, fmt); + } + show_dedup_ratio_estimate_fmt(wrk_stats_sum, md5_stats_sum, fmt); + show_dedup_ratio_actual_fmt(wrk_stats_sum, md5_stats_sum, fmt); + } + + fmt->dump_bool("completed", (completed_md5_shards_count == num_md5_shards)); + return 0; + } + + //--------------------------------------------------------------------------- + int cluster::watch_reload(rgw::sal::RadosStore *store, + const DoutPrefixProvider* dpp, + uint64_t *p_watch_handle, + librados::WatchCtx2 *ctx) + { + librados::IoCtx ctl_ioctx; + int ret = get_control_ioctx(store, dpp, ctl_ioctx); + if (unlikely(ret != 0)) { + return ret; + } + + const std::string & oid = DEDUP_WATCH_OBJ; + // create the object to watch (object may already exist) + bool exclusive = true; + ret = ctl_ioctx.create(oid, exclusive); + if (ret >= 0) { + ldpp_dout(dpp, 10) << "dedup_bg::watch_reload():" << oid + << " was created!" << dendl; + } + else if (ret == -EEXIST) { + ldpp_dout(dpp, 5) << __func__ << "::"<< oid << " exists" << dendl; + } + else { + ldpp_dout(dpp, 1) << "dedup_bg::watch_reload(): failed ctl_ioctx.create(" + << oid << ") ret=" << ret << "::" << cpp_strerror(-ret) << dendl; + return ret; + } + + ret = ctl_ioctx.watch2(oid, p_watch_handle, ctx); + if (ret < 0) { + ldpp_dout(dpp, 1) << "dedup_bg::watch_reload(): failed watch2() " << oid + << ". 
error: " << cpp_strerror(-ret) << dendl; + *p_watch_handle = 0; + return ret; + } + ldpp_dout(dpp, 5) << "dedup_bg::watch_reload(): Started watching " + << oid << "::watch_handle=" << *p_watch_handle << dendl; + return 0; + } + + //--------------------------------------------------------------------------- + int cluster::unwatch_reload(rgw::sal::RadosStore *store, + const DoutPrefixProvider* dpp, + uint64_t watch_handle) + { + librados::IoCtx ctl_ioctx; + int ret = get_control_ioctx(store, dpp, ctl_ioctx); + if (unlikely(ret != 0)) { + return ret; + } + + ret = ctl_ioctx.unwatch2(watch_handle); + if (ret < 0) { + ldpp_dout(dpp, 1) << "dedup_bg::unwatch_reload() failed unwatch2() " + << DEDUP_WATCH_OBJ << "::" << cpp_strerror(-ret) << dendl; + return ret; + } + return 0; + } + + //--------------------------------------------------------------------------- + int cluster::ack_notify(rgw::sal::RadosStore *store, + const DoutPrefixProvider *dpp, + const control_t *p_ctl, + uint64_t notify_id, + uint64_t cookie, + int status) + { + librados::IoCtx ctl_ioctx; + int ret = get_control_ioctx(store, dpp, ctl_ioctx); + if (unlikely(ret != 0)) { + return ret; + } + + ldpp_dout(dpp, 5) << __func__ << "::status=" << status << dendl; + bufferlist reply_bl; + ceph::encode(status, reply_bl); + encode(*p_ctl, reply_bl); + ctl_ioctx.notify_ack(DEDUP_WATCH_OBJ, notify_id, cookie, reply_bl); + + return 0; + } + + //--------------------------------------------------------------------------- + // command-line called from radosgw-admin.cc + int cluster::dedup_control(rgw::sal::RadosStore *store, + const DoutPrefixProvider *dpp, + urgent_msg_t urgent_msg) + { + ldpp_dout(dpp, 10) << __func__ << "::dedup_control req = " + << get_urgent_msg_names(urgent_msg) << dendl; + if (urgent_msg != URGENT_MSG_RESUME && + urgent_msg != URGENT_MSG_PASUE && + urgent_msg != URGENT_MSG_RESTART && + urgent_msg != URGENT_MSG_ABORT) { + ldpp_dout(dpp, 1) << __func__ << "::illegal urgent_msg="<< urgent_msg << 
dendl; + return -EINVAL; + } + + librados::IoCtx ctl_ioctx; + int ret = get_control_ioctx(store, dpp, ctl_ioctx); + if (unlikely(ret != 0)) { + return ret; + } + + // 10 seconds timeout + const uint64_t timeout_ms = 10*1000; + bufferlist reply_bl, urgent_msg_bl; + ceph::encode(urgent_msg, urgent_msg_bl); + ret = rgw_rados_notify(dpp, ctl_ioctx, DEDUP_WATCH_OBJ, urgent_msg_bl, + timeout_ms, &reply_bl, null_yield); + if (ret < 0) { + ldpp_dout(dpp, 1) << __func__ << "::failed rgw_rados_notify(" + << DEDUP_WATCH_OBJ << ")::err="< acks; + std::vector timeouts; + ctl_ioctx.decode_notify_response(reply_bl, &acks, &timeouts); + if (timeouts.size() > 0) { + ldpp_dout(dpp, 1) << __func__ << "::failed rgw_rados_notify(" + << DEDUP_WATCH_OBJ << ")::timeout error" << dendl; + return -EAGAIN; + } + + for (auto& ack : acks) { + try { + ldpp_dout(dpp, 20) << __func__ << "::ACK: notifier_id=" << ack.notifier_id + << "::cookie=" << ack.cookie << dendl; + auto iter = ack.payload_bl.cbegin(); + ceph::decode(ret, iter); + struct rgw::dedup::control_t ctl; + decode(ctl, iter); + ldpp_dout(dpp, 10) << __func__ << "::++ACK::ctl=" << ctl << "::ret=" << ret << dendl; + } catch (buffer::error& err) { + ldpp_dout(dpp, 1) << __func__ << "::failed decoding notify acks" << dendl; + return -EINVAL; + } + if (ret != 0) { + ldpp_dout(dpp, 1) << __func__ << "::Bad notify ack, ret=" << ret + << "::err=" << cpp_strerror(-ret) << dendl; + return ret; + } + } + ldpp_dout(dpp, 10) << __func__ << "::" << get_urgent_msg_names(urgent_msg) + << " finished successfully!" 
<< dendl; + return 0; + } + + //--------------------------------------------------------------------------- + // command-line called from radosgw-admin.cc + int cluster::dedup_restart_scan(rgw::sal::RadosStore *store, + dedup_req_type_t dedup_type, + const DoutPrefixProvider *dpp) + { + ldpp_dout(dpp, 1) << __func__ << "::dedup_type = " << dedup_type << dendl; + + dedup_epoch_t old_epoch; + // store the previous epoch for cmp-swap + int ret = get_epoch(store, dpp, &old_epoch, __func__); + if (ret != 0) { + // generate an empty epoch with zero counters + std::string cluster_id("NULL_CLUSTER_ID"); + ldpp_dout(dpp, 1) << __func__ << "::set empty EPOCH using cluster_id: " + << cluster_id << dendl; + set_epoch(store, cluster_id, dpp, 0, 0); + ret = get_epoch(store, dpp, &old_epoch, __func__); + if (ret) { + return ret; + } + } + + // first abort all dedup work! + ret = dedup_control(store, dpp, URGENT_MSG_ABORT); + if (ret != 0) { + return ret; + } +#if 0 + // then delete dedup-pool to ensure a clean start + const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool; + auto rados_handle = store->getRados()->get_rados_handle(); + ldpp_dout(dpp, 5) <<__func__ << "::delete pool: " << dedup_pool.name << dendl; + rados_handle->pool_delete(dedup_pool.name.c_str()); +#endif + + ldpp_dout(dpp, 10) << __func__ << dedup_type << dendl; +#ifdef FULL_DEDUP_SUPPORT + ceph_assert(dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE || + dedup_type == dedup_req_type_t::DEDUP_TYPE_FULL); +#else + ceph_assert(dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE); +#endif + ret = swap_epoch(store, dpp, &old_epoch, dedup_type, 0, 0); + if (ret == 0) { + ldpp_dout(dpp, 10) << __func__ << "::Epoch object was reset" << dendl; + return dedup_control(store, dpp, URGENT_MSG_RESTART); + } + else { + return ret; + } + } + + //--------------------------------------------------------------------------- + bool cluster::can_start_new_scan(rgw::sal::RadosStore *store) + { + 
ldpp_dout(dpp, 10) << __func__ << "::epoch=" << d_epoch_time << dendl; + dedup_epoch_t new_epoch; + if (get_epoch(store, dpp, &new_epoch, nullptr) != 0) { + ldpp_dout(dpp, 1) << __func__ << "::No Epoch Object::" + << "::scan can be restarted!\n\n\n" << dendl; + // no epoch object exists -> we should start a new scan + return true; + } + + if (new_epoch.time <= d_epoch_time) { + if (new_epoch.time == d_epoch_time) { + ldpp_dout(dpp, 10) << __func__ << "::Epoch hasn't change - > Do not restart scan!!" << dendl; + } + else { + ldpp_dout(dpp, 1) << __func__ << " ::Do not restart scan!\n epoch=" + << d_epoch_time << "\nnew_epoch="<< new_epoch.time < now = TRUE " << dendl; + } + return false; + } +} // namespace rgw::dedup diff --git a/src/rgw/driver/rados/rgw_dedup_cluster.h b/src/rgw/driver/rados/rgw_dedup_cluster.h new file mode 100644 index 000000000000..64b2c54a4fa2 --- /dev/null +++ b/src/rgw/driver/rados/rgw_dedup_cluster.h @@ -0,0 +1,193 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Author: Gabriel BenHanokh + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once +#include "common/dout.h" +#include "rgw_dedup_utils.h" +#include "rgw_dedup_store.h" +#include + +namespace rgw::dedup { + static constexpr const char* WORKER_SHARD_PREFIX = "WRK.SHRD.TK."; + static constexpr const char* MD5_SHARD_PREFIX = "MD5.SHRD.TK."; + struct control_t; + struct dedup_epoch_t; + + class cluster{ + public: + //================================================================================== + class shard_token_oid { + public: + //--------------------------------------------------------------------------- + shard_token_oid(const char *prefix) { + this->prefix_len = snprintf(this->buff, BUFF_SIZE, "%s", prefix); + this->total_len = this->prefix_len; + } + + //--------------------------------------------------------------------------- + shard_token_oid(const char *prefix, uint16_t shard) { + this->prefix_len = snprintf(this->buff, BUFF_SIZE, "%s", prefix); + set_shard(shard); + } + + //--------------------------------------------------------------------------- + void set_shard(uint16_t shard) { + int n = snprintf(this->buff + this->prefix_len, BUFF_SIZE, "%03x", shard); + this->total_len = this->prefix_len + n; + } + + //--------------------------------------------------------------------------- + static bool legal_oid_name(const std::string& oid) { + return ((oid.length() <= BUFF_SIZE) && + (oid.starts_with(WORKER_SHARD_PREFIX)||oid.starts_with(MD5_SHARD_PREFIX))); + } + inline const char* get_buff() { return this->buff; } + inline unsigned get_buff_size() { return this->total_len; } + private: + static const unsigned BUFF_SIZE = 15; + unsigned total_len = 0; + unsigned prefix_len = 0; + char buff[BUFF_SIZE]; + }; + + //================================================================================== + cluster(const DoutPrefixProvider *_dpp, + CephContext* cct, + rgw::sal::Driver* driver); + int reset(rgw::sal::RadosStore *store, + struct dedup_epoch_t*, + work_shard_t num_work_shards, + md5_shard_t num_md5_shards); + 
+ utime_t get_epoch_time() { return d_epoch_time; } + work_shard_t get_next_work_shard_token(rgw::sal::RadosStore *store, + work_shard_t num_work_shards); + md5_shard_t get_next_md5_shard_token(rgw::sal::RadosStore *store, + md5_shard_t num_md5_shards); + bool can_start_new_scan(rgw::sal::RadosStore *store); + static int collect_all_shard_stats(rgw::sal::RadosStore *store, + Formatter *p_formatter, + const DoutPrefixProvider *dpp); + static int watch_reload(rgw::sal::RadosStore *store, + const DoutPrefixProvider* dpp, + uint64_t *p_watch_handle, + librados::WatchCtx2 *ctx); + static int unwatch_reload(rgw::sal::RadosStore *store, + const DoutPrefixProvider* dpp, + uint64_t watch_handle); + static int ack_notify(rgw::sal::RadosStore *store, + const DoutPrefixProvider *dpp, + const struct control_t *p_ctl, + uint64_t notify_id, + uint64_t cookie, + int status); + static int dedup_control(rgw::sal::RadosStore *store, + const DoutPrefixProvider *dpp, + urgent_msg_t urgent_msg); + static int dedup_restart_scan(rgw::sal::RadosStore *store, + dedup_req_type_t dedup_type, + const DoutPrefixProvider *dpp); + + //--------------------------------------------------------------------------- + int mark_work_shard_token_completed(rgw::sal::RadosStore *store, + work_shard_t work_shard, + const worker_stats_t *p_stats) + { + ceph::bufferlist bl; + encode(*p_stats, bl); + d_num_completed_workers++; + d_completed_workers[work_shard] = TOKEN_STATE_COMPLETED; + + return mark_shard_token_completed(store, work_shard, p_stats->ingress_obj, + WORKER_SHARD_PREFIX, bl); + } + + //--------------------------------------------------------------------------- + int mark_md5_shard_token_completed(rgw::sal::RadosStore *store, + md5_shard_t md5_shard, + const md5_stats_t *p_stats) + { + ceph::bufferlist bl; + encode(*p_stats, bl); + d_num_completed_md5++; + d_completed_md5[md5_shard] = TOKEN_STATE_COMPLETED; + return mark_shard_token_completed(store, md5_shard, p_stats->loaded_objects, + 
MD5_SHARD_PREFIX, bl); + } + + int update_shard_token_heartbeat(rgw::sal::RadosStore *store, + unsigned shard, + uint64_t count_a, + uint64_t count_b, + const char *prefix); + + //--------------------------------------------------------------------------- + int all_work_shard_tokens_completed(rgw::sal::RadosStore *store, + work_shard_t num_work_shards) + { + return all_shard_tokens_completed(store, num_work_shards, WORKER_SHARD_PREFIX, + &d_num_completed_workers, d_completed_workers); + } + + //--------------------------------------------------------------------------- + int all_md5_shard_tokens_completed(rgw::sal::RadosStore *store, + md5_shard_t num_md5_shards) + { + return all_shard_tokens_completed(store, num_md5_shards, MD5_SHARD_PREFIX, + &d_num_completed_md5, d_completed_md5); + } + + private: + static constexpr unsigned TOKEN_STATE_PENDING = 0x00; + static constexpr unsigned TOKEN_STATE_CORRUPTED = 0xCC; + static constexpr unsigned TOKEN_STATE_TIMED_OUT = 0xDD; + static constexpr unsigned TOKEN_STATE_COMPLETED = 0xFF; + + void clear(); + int all_shard_tokens_completed(rgw::sal::RadosStore *store, + unsigned shards_count, + const char *prefix, + uint16_t *p_num_completed, + uint8_t completed_arr[]); + int cleanup_prev_run(rgw::sal::RadosStore *store); + int32_t get_next_shard_token(rgw::sal::RadosStore *store, + uint16_t start_shard, + uint16_t max_count, + const char *prefix); + int create_shard_tokens(rgw::sal::RadosStore *store, + unsigned shards_count, + const char *prefix); + int verify_all_shard_tokens(rgw::sal::RadosStore *store, + unsigned shards_count, + const char *prefix); + int mark_shard_token_completed(rgw::sal::RadosStore *store, + unsigned shard, + uint64_t obj_count, + const char *prefix, + const bufferlist &bl); + + const DoutPrefixProvider *dpp; + std::string d_lock_cookie; + std::string d_cluster_id; + md5_shard_t d_curr_md5_shard = 0; + work_shard_t d_curr_worker_shard = 0; + utime_t d_epoch_time; + utime_t d_token_creation_time; + 
uint8_t d_completed_workers[MAX_WORK_SHARD]; + uint8_t d_completed_md5[MAX_MD5_SHARD]; + uint16_t d_num_completed_workers = 0; + uint16_t d_num_completed_md5 = 0; + }; + +} //namespace rgw::dedup diff --git a/src/rgw/driver/rados/rgw_dedup_epoch.h b/src/rgw/driver/rados/rgw_dedup_epoch.h new file mode 100644 index 000000000000..84492d357392 --- /dev/null +++ b/src/rgw/driver/rados/rgw_dedup_epoch.h @@ -0,0 +1,73 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Author: Gabriel BenHanokh + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include "common/Clock.h" // for ceph_clock_now() +#include "common/dout.h" +#include "rgw_dedup_utils.h" + +#include + +namespace rgw::dedup { + constexpr const char* RGW_DEDUP_ATTR_EPOCH = "rgw.dedup.attr.epoch"; + //=========================================================================== + + struct dedup_epoch_t { + uint32_t serial; + dedup_req_type_t dedup_type; + utime_t time; + uint32_t num_work_shards = 0; + uint32_t num_md5_shards = 0; + }; + + //--------------------------------------------------------------------------- + inline void encode(const dedup_epoch_t& o, ceph::bufferlist& bl) + { + ENCODE_START(1, 1, bl); + encode(o.serial, bl); + encode(static_cast(o.dedup_type), bl); + encode(o.time, bl); + encode(o.num_work_shards, bl); + encode(o.num_md5_shards, bl); + ENCODE_FINISH(bl); + } + + //--------------------------------------------------------------------------- + inline void decode(dedup_epoch_t& o, ceph::bufferlist::const_iterator& bl) + { + DECODE_START(1, bl); + decode(o.serial, bl); + int32_t dedup_type; + decode(dedup_type, bl); + o.dedup_type = static_cast (dedup_type); + decode(o.time, bl); + decode(o.num_work_shards, bl); + 
decode(o.num_md5_shards, bl); + DECODE_FINISH(bl); + } + + //--------------------------------------------------------------------------- + inline std::ostream& operator<<(std::ostream &out, const dedup_epoch_t &ep) + { + utime_t elapsed = ceph_clock_now() - ep.time; + out << "EPOCH::Time={" << ep.time.tv.tv_sec <<":"<< ep.time.tv.tv_nsec << "}::"; + out << "Elapsed={" << elapsed.tv.tv_sec <<":"<< elapsed.tv.tv_nsec << "}::"; + out << ep.dedup_type << "::serial=" << ep.serial; + out << "::num_work_shards=" << ep.num_work_shards; + out << "::num_md5_shards=" << ep.num_md5_shards; + return out; + } + +} //namespace rgw::dedup diff --git a/src/rgw/driver/rados/rgw_dedup_remap.h b/src/rgw/driver/rados/rgw_dedup_remap.h new file mode 100644 index 000000000000..60ef66ecbe80 --- /dev/null +++ b/src/rgw/driver/rados/rgw_dedup_remap.h @@ -0,0 +1,62 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Author: Gabriel BenHanokh + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once +#include "common/dout.h" +#include +#include +#include + + +namespace rgw::dedup { + class remapper_t + { + public: + static inline constexpr uint8_t NULL_IDX = 0xFF; + remapper_t(uint32_t max_entries) : d_max_entries(max_entries) {} + uint8_t remap(const std::string &key, + const DoutPrefixProvider* dpp, + uint64_t *p_overflow_count) { // IN-OUT + uint8_t idx; + + auto itr = d_map.find(key); + if (itr != d_map.end()) { + idx = itr->second; + ldpp_dout(dpp, 20) << __func__ << "::Existing key: " << key + << " is mapped to idx=" << (int)idx << dendl; + } + else if (d_num_entries < d_max_entries) { + // assign it the next entry + idx = d_num_entries++; + d_map[key] = idx; + ldpp_dout(dpp, 20) << __func__ << "::New key: " << key + << " was mapped to idx=" << (int)idx << dendl; + } + else { + (*p_overflow_count) ++; + ldpp_dout(dpp, 10) << __func__ << "::ERR: Failed adding key: " + << key << dendl; + idx = NULL_IDX; + } + + return idx; + } + + private: + uint32_t d_num_entries = 0; + const uint32_t d_max_entries; + std::unordered_map d_map; + }; + +} //namespace rgw::dedup diff --git a/src/rgw/driver/rados/rgw_dedup_store.cc b/src/rgw/driver/rados/rgw_dedup_store.cc new file mode 100644 index 000000000000..fd15bbc372d8 --- /dev/null +++ b/src/rgw/driver/rados/rgw_dedup_store.cc @@ -0,0 +1,732 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Author: Gabriel BenHanokh + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "include/rados/rados_types.hpp" +#include "include/rados/buffer.h" +#include "include/rados/librados.hpp" +#include "svc_zone.h" +#include "common/config.h" +#include "common/Cond.h" +#include "common/debug.h" +#include "common/errno.h" +#include "rgw_common.h" +#include "include/denc.h" +#include "rgw_sal.h" +#include "driver/rados/rgw_sal_rados.h" +#include "rgw_dedup_utils.h" +#include "rgw_dedup.h" +#include "rgw_dedup_store.h" +#include "fmt/ranges.h" +#include + +namespace rgw::dedup { + + //--------------------------------------------------------------------------- + disk_record_t::disk_record_t(const rgw::sal::Bucket *p_bucket, + const std::string &obj_name, + const parsed_etag_t *p_parsed_etag, + uint64_t obj_size, + const std::string &storage_class) + { + this->s.rec_version = 0; + this->s.flags = 0; + this->s.num_parts = p_parsed_etag->num_parts; + this->obj_name = obj_name; + this->s.obj_name_len = this->obj_name.length(); + this->bucket_name = p_bucket->get_name(); + this->s.bucket_name_len = this->bucket_name.length(); + + this->s.md5_high = p_parsed_etag->md5_high; + this->s.md5_low = p_parsed_etag->md5_low; + this->s.obj_bytes_size = obj_size; + this->s.object_version = 0; + + this->bucket_id = p_bucket->get_bucket_id(); + this->s.bucket_id_len = this->bucket_id.length(); + this->tenant_name = p_bucket->get_tenant(); + this->s.tenant_name_len = this->tenant_name.length(); + this->stor_class = storage_class; + this->s.stor_class_len = storage_class.length(); + + this->s.ref_tag_len = 0; + this->s.manifest_len = 0; + + this->s.shared_manifest = 0; + memset(this->s.hash, 0, sizeof(this->s.hash)); + this->ref_tag = ""; + this->manifest_bl.clear(); + } + + //--------------------------------------------------------------------------- + disk_record_t::disk_record_t(const char *buff) + { + disk_record_t *p_rec = (disk_record_t*)buff; + this->s.rec_version = p_rec->s.rec_version; + // wrong version, bail out + if 
(unlikely(p_rec->s.rec_version != 0)) { + return; + } + + this->s.flags = p_rec->s.flags; + this->s.num_parts = CEPHTOH_16(p_rec->s.num_parts); + this->s.obj_name_len = CEPHTOH_16(p_rec->s.obj_name_len); + this->s.bucket_name_len = CEPHTOH_16(p_rec->s.bucket_name_len); + + this->s.md5_high = CEPHTOH_64(p_rec->s.md5_high); + this->s.md5_low = CEPHTOH_64(p_rec->s.md5_low); + this->s.obj_bytes_size = CEPHTOH_64(p_rec->s.obj_bytes_size); + this->s.object_version = CEPHTOH_64(p_rec->s.object_version); + + this->s.bucket_id_len = CEPHTOH_16(p_rec->s.bucket_id_len); + this->s.tenant_name_len = CEPHTOH_16(p_rec->s.tenant_name_len); + this->s.stor_class_len = CEPHTOH_16(p_rec->s.stor_class_len); + this->s.ref_tag_len = CEPHTOH_16(p_rec->s.ref_tag_len); + this->s.manifest_len = CEPHTOH_16(p_rec->s.manifest_len); + + const char *p = buff + sizeof(this->s); + this->obj_name = std::string(p, this->s.obj_name_len); + p += p_rec->s.obj_name_len; + + this->bucket_name = std::string(p, this->s.bucket_name_len); + p += p_rec->s.bucket_name_len; + + this->bucket_id = std::string(p, this->s.bucket_id_len); + p += p_rec->s.bucket_id_len; + + this->tenant_name = std::string(p, this->s.tenant_name_len); + p += p_rec->s.tenant_name_len; + + this->stor_class = std::string(p, this->s.stor_class_len); + p += p_rec->s.stor_class_len; + + if (p_rec->s.flags.is_fastlane()) { + // TBD:: remove asserts + ceph_assert(this->s.ref_tag_len == 0); + ceph_assert(this->s.manifest_len == 0); + } + else { + this->s.shared_manifest = CEPHTOH_64(p_rec->s.shared_manifest); + // BLAKE3 hash has 256 bit splitted into multiple 64bit units + const unsigned units = (256 / (sizeof(uint64_t)*8)); + static_assert(units == 4); + for (unsigned i = 0; i < units; i++) { + this->s.hash[i] = CEPHTOH_64(p_rec->s.hash[i]); + } + this->ref_tag = std::string(p, this->s.ref_tag_len); + p += p_rec->s.ref_tag_len; + + this->manifest_bl.append(p, this->s.manifest_len); + } + } + + 
+  //---------------------------------------------------------------------------
+  // Write the on-disk image of this record to @buff: the packed header with
+  // all multi-byte fields converted to little-endian, followed back to back by
+  // obj_name, bucket_name, bucket_id, tenant_name, stor_class and - unless the
+  // record is flagged fastlane - ref_tag and the manifest.
+  // The length fields are taken from the actual strings/bufferlist.
+  // @buff must have room for length() bytes.
+  // Returns the number of bytes written.
+  size_t disk_record_t::serialize(char *buff) const
+  {
+    ceph_assert(this->s.rec_version == 0);
+    // Start from an all-zero header: pad[] is never assigned below and
+    // shared_manifest/hash are skipped for fastlane records, so without this
+    // whatever bytes were in the destination buffer would be stored as well.
+    std::memset(buff, 0, sizeof(this->s));
+    packed_rec_t *p_hdr = reinterpret_cast<packed_rec_t*>(buff);
+    p_hdr->rec_version     = 0;
+    p_hdr->flags           = this->s.flags;
+    p_hdr->num_parts       = HTOCEPH_16(this->s.num_parts);
+    p_hdr->obj_name_len    = HTOCEPH_16(this->obj_name.length());
+    p_hdr->bucket_name_len = HTOCEPH_16(this->bucket_name.length());
+
+    p_hdr->md5_high        = HTOCEPH_64(this->s.md5_high);
+    p_hdr->md5_low         = HTOCEPH_64(this->s.md5_low);
+    p_hdr->obj_bytes_size  = HTOCEPH_64(this->s.obj_bytes_size);
+    p_hdr->object_version  = HTOCEPH_64(this->s.object_version);
+
+    p_hdr->bucket_id_len   = HTOCEPH_16(this->bucket_id.length());
+    p_hdr->tenant_name_len = HTOCEPH_16(this->tenant_name.length());
+    p_hdr->stor_class_len  = HTOCEPH_16(this->stor_class.length());
+    p_hdr->ref_tag_len     = HTOCEPH_16(this->ref_tag.length());
+    p_hdr->manifest_len    = HTOCEPH_16(this->manifest_bl.length());
+    char *p = buff + sizeof(this->s);
+    unsigned len = this->obj_name.length();
+    std::memcpy(p, this->obj_name.data(), len);
+    p += len;
+
+    len = this->bucket_name.length();
+    std::memcpy(p, this->bucket_name.data(), len);
+    p += len;
+
+    len = this->bucket_id.length();
+    std::memcpy(p, this->bucket_id.data(), len);
+    p += len;
+
+    len = this->tenant_name.length();
+    std::memcpy(p, this->tenant_name.data(), len);
+    p += len;
+
+    len = this->stor_class.length();
+    std::memcpy(p, this->stor_class.data(), len);
+    p += len;
+
+    if (this->s.flags.is_fastlane()) {
+      // TBD:: remove asserts
+      ceph_assert(this->s.ref_tag_len == 0);
+      ceph_assert(this->s.manifest_len == 0);
+    }
+    else {
+      p_hdr->shared_manifest = HTOCEPH_64(this->s.shared_manifest);
+      // BLAKE3 hash has 256 bit splitted into multiple 64bit units
+      const unsigned units = (256 / (sizeof(uint64_t)*8));
+      static_assert(units == 4);
+      for (unsigned i = 0; i < units; i++) {
+        p_hdr->hash[i] = HTOCEPH_64(this->s.hash[i]);
+      }
+      len = this->ref_tag.length();
+      std::memcpy(p, this->ref_tag.data(), len);
+      p += len;
+
+      len = this->manifest_bl.length();
+      // c_str() is called through a non-const pointer, hence the const_cast
+      const char *p_manifest = const_cast<disk_record_t*>(this)->manifest_bl.c_str();
+      std::memcpy(p, p_manifest, len);
+      p += len;
+    }
+    return (p - buff);
+  }
+
+  //---------------------------------------------------------------------------
+  // Size in bytes of the serialized record: the packed header plus every
+  // variable-length field (ref_tag and manifest included).
+  size_t disk_record_t::length() const
+  {
+    return (sizeof(this->s) +
+            this->obj_name.length() +
+            this->bucket_name.length() +
+            this->bucket_id.length() +
+            this->tenant_name.length() +
+            this->stor_class.length() +
+            this->ref_tag.length() +
+            this->manifest_bl.length());
+  }
+
+  //---------------------------------------------------------------------------
+  // Sanity check of a record.
+  // Returns 0 when rec_version is 0 and the record fits in MAX_REC_SIZE,
+  // -EPROTO on an unexpected rec_version and -EOVERFLOW when it is too big.
+  // @caller, @block_id and @rec_id are only used in the error messages.
+  int disk_record_t::validate(const char *caller,
+                              const DoutPrefixProvider* dpp,
+                              disk_block_id_t block_id,
+                              record_id_t rec_id) const
+  {
+    // optimistic approach
+    if (likely((this->s.rec_version == 0) && (this->length() <= MAX_REC_SIZE))) {
+      ldpp_dout(dpp, 20) << __func__ << "::success" << dendl;
+      return 0;
+    }
+
+    // wrong version
+    if (this->s.rec_version != 0) {
+      // TBD
+      //p_stats->failed_wrong_ver++;
+      ldpp_dout(dpp, 5) << __func__ << "::" << caller << "::ERR: Bad record version: "
+                        << this->s.rec_version
+                        << "::block_id=" << block_id
+                        << "::rec_id=" << rec_id
+                        << dendl;
+      return -EPROTO; // Protocol error
+    }
+
+    // if arrived here record size is too large
+    // TBD
+    //p_stats->failed_rec_overflow++;
+    ldpp_dout(dpp, 5) << __func__ << "::" << caller << "::ERR: record size too big: "
+                      << this->length()
+                      << "::block_id=" << block_id
+                      << "::rec_id=" << rec_id
+                      << dendl;
+    return -EOVERFLOW; // maybe should use -E2BIG ??
+  }
+
+  //---------------------------------------------------------------------------
+  // Debug dump of a record, one field per line.
+  // MD5 and HASH are printed as zero-padded 16-digit hex words so that the
+  // concatenation is unambiguous; the stream's format flags and fill character
+  // are restored before the remaining (decimal) fields are written.
+  std::ostream &operator<<(std::ostream &stream, const disk_record_t & rec)
+  {
+    stream << rec.obj_name    << "::" << rec.s.obj_name_len << "\n";
+    stream << rec.bucket_name << "::" << rec.s.bucket_name_len << "\n";
+    stream << rec.bucket_id   << "::" << rec.s.bucket_id_len << "\n";
+    stream << rec.tenant_name << "::" << rec.s.tenant_name_len << "\n";
+    stream << rec.stor_class  << "::" << rec.s.stor_class_len << "\n";
+    stream << rec.ref_tag     << "::" << rec.s.ref_tag_len << "\n";
+    stream << "num_parts = " << rec.s.num_parts << "\n";
+    stream << "obj_size = " << rec.s.obj_bytes_size/1024 <<" KiB" << "\n";
+
+    // std::hex is sticky: remember the caller's settings and put them back below
+    const std::ios_base::fmtflags saved_flags = stream.flags();
+    const char saved_fill = stream.fill('0');
+    stream << std::hex << std::right;
+
+    // width() only applies to the next insertion -> set it before every word
+    stream << "MD5 = ";
+    stream.width(16);
+    stream << rec.s.md5_high;
+    stream.width(16);
+    stream << rec.s.md5_low;
+    stream << "\n";
+
+    stream << "HASH = ";
+    // BLAKE3 hash has 256 bit splitted into multiple 64bit units
+    const unsigned units = (256 / (sizeof(uint64_t)*8));
+    static_assert(units == 4);
+    for (unsigned i = 0; i < units; i++) {
+      stream.width(16);
+      stream << rec.s.hash[i];
+    }
+    stream << "\n";
+
+    stream.flags(saved_flags);
+    stream.fill(saved_fill);
+
+    if (rec.has_shared_manifest()) {
+      stream << "Shared Manifest Object\n";
+    }
+    else {
+      stream << "Dedicated Manifest Object\n";
+    }
+    stream << "Manifest len=" << rec.s.manifest_len << "\n";
+    return stream;
+  }
+
+  //---------------------------------------------------------------------------
+  // Reset the block header: no records, write offset right behind the header,
+  // block id built from @worker_id and @seq_number.
+  void disk_block_t::init(work_shard_t worker_id, uint32_t seq_number)
+  {
+    disk_block_header_t *p_header = get_header();
+    p_header->offset = sizeof(disk_block_header_t);
+    p_header->rec_count = 0;
+    p_header->block_id = disk_block_id_t(worker_id, seq_number);
+  }
+
+  //---------------------------------------------------------------------------
+  int disk_block_header_t::verify(disk_block_id_t expected_block_id, const DoutPrefixProvider* dpp)
+  {
+    // a closed block carries one of the two magic values in its offset field
+    if (unlikely(offset != BLOCK_MAGIC && offset != LAST_BLOCK_MAGIC)) {
+      ldpp_dout(dpp, 1) << __func__ << "::ERR::bad magic number (0x" << std::hex << offset << std::dec << ")" << dendl;
+      return -EINVAL;
+    }
+
+    if
(unlikely(rec_count > MAX_REC_IN_BLOCK) ) { + ldpp_dout(dpp, 1) << __func__ << "::ERR::rec_count=" << rec_count << " > MAX_REC_IN_BLOCK" << dendl; + return -EINVAL; + } + + if (unlikely(this->block_id != expected_block_id)) { + ldpp_dout(dpp, 1) << __func__ << "::ERR::block_id=" << block_id + << "!= expected_block_id=" << expected_block_id << dendl; + return -EINVAL; + } + + return 0; + } + + //--------------------------------------------------------------------------- + record_id_t disk_block_t::add_record(const disk_record_t *p_rec, + const DoutPrefixProvider *dpp) + { + disk_block_header_t *p_header = get_header(); + if (unlikely(p_header->rec_count >= MAX_REC_IN_BLOCK)) { + ldpp_dout(dpp, 20) << __func__ << "::rec_count=" << p_header->rec_count + << ", MAX_REC_IN_BLOCK=" << MAX_REC_IN_BLOCK << dendl; + return MAX_REC_IN_BLOCK; + } + + if ((DISK_BLOCK_SIZE - p_header->offset) >= p_rec->length()) { + p_header->rec_offsets[p_header->rec_count] = p_header->offset; + unsigned rec_id = p_header->rec_count; + p_header->rec_count ++; + p_rec->serialize(data+p_header->offset); + p_header->offset += p_rec->length(); + return rec_id; + } + else { + return MAX_REC_IN_BLOCK; + } + } + + //--------------------------------------------------------------------------- + void disk_block_t::close_block(const DoutPrefixProvider* dpp, bool has_more) + { + disk_block_header_t *p_header = get_header(); + ldpp_dout(dpp, 20) << __func__ << "::rec_count=" << p_header->rec_count + << ", has_more=" << (has_more? 
"TRUE" : "FALSE") << dendl; + + memset(data + p_header->offset, 0, (DISK_BLOCK_SIZE - p_header->offset)); + if (has_more) { + p_header->offset = HTOCEPH_16(BLOCK_MAGIC); + } + else { + p_header->offset = HTOCEPH_16(LAST_BLOCK_MAGIC); + } + for (unsigned i = 0; i < p_header->rec_count; i++) { + p_header->rec_offsets[i] = HTOCEPH_16(p_header->rec_offsets[i]); + } + p_header->rec_count = HTOCEPH_16(p_header->rec_count); + p_header->block_id = HTOCEPH_32((uint32_t)p_header->block_id); + // TBD: CRC + } + + //--------------------------------------------------------------------------- + void disk_block_header_t::deserialize() + { + this->offset = CEPHTOH_16(this->offset); + this->rec_count = CEPHTOH_16(this->rec_count); + this->block_id = CEPHTOH_32((uint32_t)this->block_id); + for (unsigned i = 0; i < this->rec_count; i++) { + this->rec_offsets[i] = CEPHTOH_16(this->rec_offsets[i]); + } + } + + //--------------------------------------------------------------------------- + disk_block_seq_t::disk_block_seq_t(const DoutPrefixProvider* dpp_in, + disk_block_t *p_arr_in, + work_shard_t worker_id, + md5_shard_t md5_shard, + worker_stats_t *p_stats_in) + { + activate(dpp_in, p_arr_in, worker_id, md5_shard, p_stats_in); + } + + //--------------------------------------------------------------------------- + void disk_block_seq_t::activate(const DoutPrefixProvider* dpp_in, + disk_block_t *p_arr_in, + work_shard_t worker_id, + md5_shard_t md5_shard, + worker_stats_t *p_stats_in) + { + dpp = dpp_in; + p_arr = p_arr_in; + d_worker_id = worker_id; + d_md5_shard = md5_shard; + p_stats = p_stats_in; + p_curr_block = nullptr; + d_seq_number = 0; + + memset(p_arr, 0, sizeof(disk_block_t)); + slab_reset(); + } + + //--------------------------------------------------------------------------- + [[maybe_unused]]static int print_manifest(const DoutPrefixProvider *dpp, + RGWRados *rados, + const bufferlist &manifest_bl) + { + RGWObjManifest manifest; + try { + auto bl_iter = 
manifest_bl.cbegin(); + decode(manifest, bl_iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 1) << __func__ << "::ERROR: unable to decode manifest" << dendl; + return -EINVAL; + } + + unsigned idx = 0; + for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p, ++idx) { + rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados); + ldpp_dout(dpp, 20) << idx << "] " << raw_obj.oid << dendl; + } + ldpp_dout(dpp, 20) << "==============================================" << dendl; + return 0; + } + + //--------------------------------------------------------------------------- + std::ostream& operator<<(std::ostream& out, const disk_block_id_t& block_id) + { + std::ios_base::fmtflags flags = out.flags(); + out << std::hex << "0x" + << (uint32_t)block_id.get_work_shard_id() << "::" + << (uint32_t)block_id.get_slab_id() << "::" + << (uint32_t)block_id.get_block_offset(); + + if (flags & std::ios::dec) { + out << std::dec; + } + return out; + } + + //--------------------------------------------------------------------------- + std::string disk_block_id_t::get_slab_name(md5_shard_t md5_shard) const + { + // SLAB.MD5_ID.WORKER_ID.SLAB_SEQ_ID + const char *SLAB_NAME_FORMAT = "SLB.%03X.%02X.%04X"; + static constexpr uint32_t SLAB_NAME_SIZE = 16; + char name_buf[SLAB_NAME_SIZE]; + slab_id_t slab_id = get_slab_id(); + work_shard_t work_id = get_work_shard_id(); + unsigned n = snprintf(name_buf, sizeof(name_buf), SLAB_NAME_FORMAT, + md5_shard, work_id, slab_id); + std::string oid(name_buf, n); + return oid; + } + + //--------------------------------------------------------------------------- + int load_record(librados::IoCtx &ioctx, + const disk_record_t *p_tgt_rec, + disk_record_t *p_src_rec, /* OUT */ + disk_block_id_t block_id, + record_id_t rec_id, + md5_shard_t md5_shard, + const DoutPrefixProvider *dpp) + { + std::string oid(block_id.get_slab_name(md5_shard)); + int read_len = DISK_BLOCK_SIZE; + static_assert(sizeof(disk_block_t) == DISK_BLOCK_SIZE); + 
int byte_offset = block_id.get_block_offset() * DISK_BLOCK_SIZE; + bufferlist bl; + int ret = ioctx.read(oid, bl, read_len, byte_offset); + if (unlikely(ret != read_len)) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to read block from " << oid + << "::ret=" << ret << "::err=" << cpp_strerror(-ret)<egress_slabs++; + slab_reset(); + return ret; + } + + //--------------------------------------------------------------------------- + int disk_block_seq_t::flush_disk_records(librados::IoCtx &ioctx) + { + ceph_assert(p_arr); + ldpp_dout(dpp, 20) << __func__ << "::worker_id=" << (uint32_t)d_worker_id + << ", md5_shard=" << (uint32_t)d_md5_shard << dendl; + + // we need to force flush at the end of a cycle even if there was no work done + // it is used as a signal to worker in the next step + if (p_curr_block == &p_arr[0] && p_curr_block->is_empty()) { + ldpp_dout(dpp, 20) << __func__ << "::Empty buffers, generate terminating block" << dendl; + } + p_stats->egress_blocks++; + p_curr_block->close_block(dpp, false); + + int ret = flush(ioctx); + return ret; + } + + //--------------------------------------------------------------------------- + int disk_block_seq_t::add_record(librados::IoCtx &ioctx, + const disk_record_t *p_rec, // IN-OUT + record_info_t *p_rec_info) // OUT-PARAM + { + disk_block_id_t null_block_id; + int ret = p_rec->validate(__func__, dpp, null_block_id, MAX_REC_IN_BLOCK); + if (unlikely(ret != 0)) { + // TBD + //p_stats->failed_rec_store++; + return ret; + } + + p_stats->egress_records ++; + // first, try and add the record to the current open block + p_rec_info->rec_id = p_curr_block->add_record(p_rec, dpp); + if (p_rec_info->rec_id < MAX_REC_IN_BLOCK) { + p_rec_info->block_id = p_curr_block->get_block_id(); + return 0; + } + else { + // Not enough space left in current block, close it and open the next block + ldpp_dout(dpp, 20) << __func__ << "::Block is full-> close and move to next" << dendl; + p_stats->egress_blocks++; + 
p_curr_block->close_block(dpp, true); + } + + // Do we have more Blocks in the block-array ? + if (p_curr_block < last_block()) { + p_curr_block ++; + d_seq_number ++; + p_curr_block->init(d_worker_id, d_seq_number); + p_rec_info->rec_id = p_curr_block->add_record(p_rec, dpp); + } + else { + ldpp_dout(dpp, 20) << __func__ << "::calling flush()" << dendl; + ret = flush(ioctx); + p_rec_info->rec_id = p_curr_block->add_record(p_rec, dpp); + } + + p_rec_info->block_id = p_curr_block->get_block_id(); + return ret; + } + + //--------------------------------------------------------------------------- + disk_block_array_t::disk_block_array_t(const DoutPrefixProvider* dpp, + uint8_t *raw_mem, + uint64_t raw_mem_size, + work_shard_t worker_id, + worker_stats_t *p_stats, + md5_shard_t num_md5_shards) + { + d_num_md5_shards = num_md5_shards; + d_worker_id = worker_id; + disk_block_t *p = (disk_block_t *)raw_mem; + disk_block_t *p_end = (disk_block_t *)(raw_mem + raw_mem_size); + + for (unsigned md5_shard = 0; md5_shard < d_num_md5_shards; md5_shard++) { + ldpp_dout(dpp, 20) << __func__ << "::p=" << p << "::p_end=" << p_end << dendl; + if (p + DISK_BLOCK_COUNT <= p_end) { + d_disk_arr[md5_shard].activate(dpp, p, d_worker_id, md5_shard, p_stats); + p += DISK_BLOCK_COUNT; + } + else { + ldpp_dout(dpp, 1) << __func__ << "::ERR: buffer overflow! 
" + << "::md5_shard=" << md5_shard << "/" << d_num_md5_shards + << "::raw_mem_size=" << raw_mem_size << dendl; + ldpp_dout(dpp, 1) << __func__ + << "::sizeof(disk_block_t)=" << sizeof(disk_block_t) + << "::DISK_BLOCK_COUNT=" << DISK_BLOCK_COUNT << dendl; + ceph_abort(); + } + } + } + + //--------------------------------------------------------------------------- + void disk_block_array_t::flush_output_buffers(const DoutPrefixProvider* dpp, + librados::IoCtx &ioctx) + { + for (md5_shard_t md5_shard = 0; md5_shard < d_num_md5_shards; md5_shard++) { + ldpp_dout(dpp, 20) <<__func__ << "::flush buffers:: worker_id=" + << d_worker_id<< ", md5_shard=" << md5_shard << dendl; + d_disk_arr[md5_shard].flush_disk_records(ioctx); + } + } +} // namespace rgw::dedup diff --git a/src/rgw/driver/rados/rgw_dedup_store.h b/src/rgw/driver/rados/rgw_dedup_store.h new file mode 100644 index 000000000000..a89abb134206 --- /dev/null +++ b/src/rgw/driver/rados/rgw_dedup_store.h @@ -0,0 +1,304 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Author: Gabriel BenHanokh + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once +#include "common/dout.h" +#include "rgw_common.h" +#include "rgw_realm_reloader.h" +#include +#include +#include +#include +#include +#include +#include +#include "include/rados/rados_types.hpp" +#include "include/rados/buffer.h" +#include "include/rados/librados.hpp" +#include "rgw_dedup_utils.h" +#include "BLAKE3/c/blake3.h" + +namespace rgw::dedup { + struct key_t; +#define CEPHTOH_16 le16toh +#define CEPHTOH_32 le32toh +#define CEPHTOH_64 le64toh +#define HTOCEPH_16 htole16 +#define HTOCEPH_32 htole32 +#define HTOCEPH_64 htole64 + + static inline constexpr unsigned DISK_BLOCK_SIZE = 8*1024; + // we use 16 bit offset + static_assert(DISK_BLOCK_SIZE < 64*1024); + static constexpr unsigned DISK_BLOCK_COUNT = 256; + static_assert(DISK_BLOCK_COUNT <= (4*1024*1024/DISK_BLOCK_SIZE)); + static constexpr unsigned MAX_REC_IN_BLOCK = 32; + // we use 8bit record indices + static_assert(MAX_REC_IN_BLOCK < 0xFF); + using slab_id_t = uint16_t; + using block_offset_t = uint8_t; + using record_id_t = uint8_t; + + // disk_block_id_t is a 32 bits concataion of shard_id, slab_id and block_off + // ---8---- | -------16------- | ---8---- + // shard_id | slab_id | block_off + struct __attribute__ ((packed)) disk_block_id_t + { + public: + disk_block_id_t() { + block_id = 0; + } + + disk_block_id_t(work_shard_t shard_id, uint32_t seq_number) { + ceph_assert((seq_number & SEQ_NUMBER_MASK) == seq_number); + ceph_assert(shard_id <= MAX_WORK_SHARD); + block_id = (uint32_t)shard_id << OBJ_SHARD_SHIFT | seq_number; + } + + disk_block_id_t& operator =(const disk_block_id_t &other) { + this->block_id = other.block_id; + return *this; + } + + inline disk_block_id_t& operator =(uint32_t val) { + this->block_id = val; + return *this; + } + + inline bool operator ==(const disk_block_id_t &other) const { + return (this->block_id == other.block_id); + } + + inline explicit operator uint32_t() const { + return this->block_id; + } + + friend std::ostream& 
operator<<(std::ostream& os, const disk_block_id_t& block_id); + + std::string get_slab_name(md5_shard_t md5_shard) const; + + static inline slab_id_t seq_num_to_slab_id(uint32_t seq_number) { + return (seq_number & SLAB_ID_MASK) >> SLAB_ID_SHIFT; + } + + static inline uint32_t slab_id_to_seq_num(uint32_t slab_id) { + return (slab_id << SLAB_ID_SHIFT); + } + + inline block_offset_t get_block_offset() const { + return get_block_offset(get_seq_num()); + } + + inline work_shard_t get_work_shard_id() const { + return (block_id & OBJ_SHARD_MASK) >> OBJ_SHARD_SHIFT; + } + + private: + inline uint32_t get_seq_num() const { + return (block_id & SEQ_NUMBER_MASK); + } + + inline slab_id_t get_slab_id() const { + return seq_num_to_slab_id(get_seq_num()); + } + + inline block_offset_t get_block_offset(uint32_t seq_number) const { + return (seq_number & BLOCK_OFF_MASK); + } + + static constexpr uint32_t OBJ_SHARD_SHIFT = 24; + static constexpr uint32_t OBJ_SHARD_MASK = 0xFF000000; + + static constexpr uint32_t SEQ_NUMBER_SHIFT = 0; + static constexpr uint32_t SEQ_NUMBER_MASK = 0x00FFFFFF; + + static constexpr uint32_t SLAB_ID_SHIFT = 8; + static constexpr uint32_t SLAB_ID_MASK = 0x00FFFF00; + + static constexpr uint32_t BLOCK_OFF_SHIFT = 0; + static constexpr uint32_t BLOCK_OFF_MASK = 0x000000FF; + + uint32_t block_id; + }; + + struct disk_record_t + { + disk_record_t(const char *buff); + disk_record_t(const rgw::sal::Bucket *p_bucket, + const std::string &obj_name, + const parsed_etag_t *p_parsed_etag, + uint64_t obj_size, + const std::string &storage_class); + disk_record_t() {} + size_t serialize(char *buff) const; + size_t length() const; + int validate(const char *caller, + const DoutPrefixProvider* dpp, + disk_block_id_t block_id, + record_id_t rec_id) const; + inline bool has_shared_manifest() const { return s.flags.has_shared_manifest(); } + inline void set_shared_manifest() { s.flags.set_shared_manifest(); } + + struct __attribute__ ((packed)) packed_rec_t + { + 
uint8_t rec_version; // allows changing record format + dedup_flags_t flags; // 1 Byte flags + uint16_t num_parts; // For multipart upload (AWS MAX-PART is 10,000) + uint16_t obj_name_len; + uint16_t bucket_name_len; + + uint64_t md5_high; // High Bytes of the Object Data MD5 + uint64_t md5_low; // Low Bytes of the Object Data MD5 + uint64_t obj_bytes_size; + uint64_t object_version; + + uint16_t bucket_id_len; + uint16_t tenant_name_len; + uint16_t stor_class_len; + uint16_t ref_tag_len; + + uint16_t manifest_len; + uint8_t pad[6]; + + uint64_t shared_manifest; // 64bit hash of the SRC object manifest + uint64_t hash[4]; // 4 * 8 Bytes of BLAKE3 + }s; + std::string obj_name; + // TBD: find pool name making it easier to get ioctx + std::string bucket_name; + std::string bucket_id; + std::string tenant_name; + std::string ref_tag; + std::string stor_class; + bufferlist manifest_bl; + }; + static_assert(BLAKE3_OUT_LEN == sizeof(disk_record_t::packed_rec_t::hash)); + std::ostream &operator<<(std::ostream &stream, const disk_record_t & rec); + + static constexpr unsigned BLOCK_MAGIC = 0xFACE; + static constexpr unsigned LAST_BLOCK_MAGIC = 0xCAD7; + struct __attribute__ ((packed)) disk_block_header_t { + void deserialize(); + int verify(disk_block_id_t block_id, const DoutPrefixProvider* dpp); + uint16_t offset; + uint16_t rec_count; + disk_block_id_t block_id; + uint16_t rec_offsets[MAX_REC_IN_BLOCK]; + }; + static constexpr unsigned MAX_REC_SIZE = (DISK_BLOCK_SIZE - sizeof(disk_block_header_t)); + + struct __attribute__ ((packed)) disk_block_t + { + const disk_block_header_t* get_header() const { return (disk_block_header_t*)data; } + disk_block_header_t* get_header() { return (disk_block_header_t*)data; } + bool is_empty() const { return (get_header()->rec_count == 0); } + + void init(work_shard_t worker_id, uint32_t seq_number); + record_id_t add_record(const disk_record_t *p_rec, const DoutPrefixProvider *dpp); + void close_block(const DoutPrefixProvider* dpp, bool 
has_more); + disk_block_id_t get_block_id() { + disk_block_header_t *p_header = get_header(); + return p_header->block_id; + } + char data[DISK_BLOCK_SIZE]; + }; + + int load_record(librados::IoCtx &ioctx, + const disk_record_t *p_tgt_rec, + disk_record_t *p_src_rec, /* OUT */ + disk_block_id_t block_id, + record_id_t rec_id, + md5_shard_t md5_shard, + const DoutPrefixProvider *dpp); + + int load_slab(librados::IoCtx &ioctx, + bufferlist &bl, + md5_shard_t md5_shard, + work_shard_t worker_id, + uint32_t seq_number, + const DoutPrefixProvider* dpp); + + int store_slab(librados::IoCtx &ioctx, + bufferlist &bl, + md5_shard_t md5_shard, + work_shard_t worker_id, + uint32_t seq_number, + const DoutPrefixProvider* dpp); + + class disk_block_array_t; + class disk_block_seq_t + { + friend class disk_block_array_t; + public: + struct record_info_t { + disk_block_id_t block_id; + record_id_t rec_id; + }; + + disk_block_seq_t(const DoutPrefixProvider* dpp_in, + disk_block_t *p_arr_in, + work_shard_t worker_id, + md5_shard_t md5_shard, + worker_stats_t *p_stats_in); + int flush_disk_records(librados::IoCtx &ioctx); + md5_shard_t get_md5_shard() { return d_md5_shard; } + int add_record(librados::IoCtx &ioctx, + const disk_record_t *p_rec, // IN-OUT + record_info_t *p_rec_info); // OUT-PARAM + + private: + disk_block_seq_t() {;} + void activate(const DoutPrefixProvider* _dpp, + disk_block_t *_p_arr, + work_shard_t worker_id, + md5_shard_t md5_shard, + worker_stats_t *p_stats); + inline const disk_block_t* last_block() { return &p_arr[DISK_BLOCK_COUNT-1]; } + int flush(librados::IoCtx &ioctx); + void slab_reset() { + p_curr_block = p_arr; + p_curr_block->init(d_worker_id, d_seq_number); + } + + disk_block_t *p_arr = nullptr; + disk_block_t *p_curr_block = nullptr; + worker_stats_t *p_stats = nullptr; + const DoutPrefixProvider *dpp = nullptr; + uint32_t d_seq_number = 0; + work_shard_t d_worker_id = NULL_WORK_SHARD; + md5_shard_t d_md5_shard = NULL_MD5_SHARD; + }; + + class 
disk_block_array_t + { + public: + disk_block_array_t(const DoutPrefixProvider* _dpp, + uint8_t *raw_mem, + uint64_t raw_mem_size, + work_shard_t worker_id, + worker_stats_t *p_worker_stats, + md5_shard_t num_md5_shards); + void flush_output_buffers(const DoutPrefixProvider* dpp, + librados::IoCtx &ioctx); + disk_block_seq_t* get_shard_block_seq(uint64_t md5_low) { + md5_shard_t md5_shard = md5_low % d_num_md5_shards; + return d_disk_arr + md5_shard; + } + + //private: + disk_block_seq_t d_disk_arr[MAX_MD5_SHARD]; + work_shard_t d_worker_id; + md5_shard_t d_num_md5_shards; + }; +} //namespace rgw::dedup diff --git a/src/rgw/driver/rados/rgw_dedup_table.cc b/src/rgw/driver/rados/rgw_dedup_table.cc new file mode 100644 index 000000000000..09335655df62 --- /dev/null +++ b/src/rgw/driver/rados/rgw_dedup_table.cc @@ -0,0 +1,335 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Author: Gabriel BenHanokh + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "rgw_dedup_table.h" +#include "include/ceph_assert.h" +#include +#include + +namespace rgw::dedup { + + //--------------------------------------------------------------------------- + dedup_table_t::dedup_table_t(const DoutPrefixProvider* _dpp, + uint32_t _head_object_size, + uint8_t *p_slab, + uint64_t slab_size) + { + dpp = _dpp; + head_object_size = _head_object_size; + memset(p_slab, 0, slab_size); + hash_tab = (table_entry_t*)p_slab; + entries_count = slab_size/sizeof(table_entry_t); + values_count = 0; + occupied_count = 0; + } + + //--------------------------------------------------------------------------- + void dedup_table_t::remove_singletons_and_redistribute_keys() + { + for (uint32_t tab_idx = 0; tab_idx < entries_count; tab_idx++) { + if (!hash_tab[tab_idx].val.is_occupied()) { + continue; + } + + if (hash_tab[tab_idx].val.is_singleton()) { + hash_tab[tab_idx].val.clear_flags(); + redistributed_clear++; + continue; + } + + const key_t &key = hash_tab[tab_idx].key; + // This is an approximation only since size is stored in 4KB resolution + uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units); + if (!key.multipart_object() && (byte_size_approx <= head_object_size)) { + hash_tab[tab_idx].val.clear_flags(); + redistributed_clear++; + continue; + } + + uint32_t key_idx = key.hash() % entries_count; + if (key_idx != tab_idx) { + uint64_t count = 1; + redistributed_count++; + uint32_t idx = key_idx; + while (hash_tab[idx].val.is_occupied() && + !hash_tab[idx].val.is_singleton() && + (hash_tab[idx].key != key)) { + count++; + idx = (idx + 1) % entries_count; + } + + if (idx != tab_idx) { + if (hash_tab[idx].val.is_occupied() && hash_tab[idx].val.is_singleton() ) { + redistributed_clear++; + } + if (idx == key_idx) { + redistributed_perfect++; + } + hash_tab[idx] = hash_tab[tab_idx]; + hash_tab[tab_idx].val.clear_flags(); + } + else { + redistributed_loopback++; + } + + redistributed_search_max = 
std::max(redistributed_search_max, count);
+        redistributed_search_total += count;
+      }
+      else {
+        redistributed_not_needed++;
+      }
+    }
+  }
+
+  //---------------------------------------------------------------------------
+  // Linear-probing lookup of @p_key.
+  // Returns the index of the slot holding the key or, when the key is not in
+  // the table, the index of the first empty slot met while probing.
+  // The probe is bounded by entries_count: if every slot is occupied and none
+  // holds the key, the returned slot is occupied by a DIFFERENT key, so callers
+  // must compare the key before treating an occupied slot as a match.
+  uint32_t dedup_table_t::find_entry(const key_t *p_key) const
+  {
+    uint32_t idx = p_key->hash() % entries_count;
+
+    // search until we either find the key, or find an empty slot,
+    // but never go around the table more than once
+    for (uint32_t probes = 0; probes < entries_count; probes++) {
+      if (!hash_tab[idx].val.is_occupied() || (hash_tab[idx].key == *p_key)) {
+        return idx;
+      }
+      idx = (idx + 1) % entries_count;
+    }
+    // table is full and the key is not in it (idx is back at the first slot probed)
+    return idx;
+  }
+
+  //---------------------------------------------------------------------------
+  // Count one more object with key @p_key.
+  // A new key gets a fresh entry pointing at @block_id/@rec_id; for an existing
+  // key the count is bumped and the stored location is replaced only when the
+  // new record has a shared manifest and the stored one has not.
+  // Returns 0 on success, -EOVERFLOW when there is no room for a new key.
+  int dedup_table_t::add_entry(key_t *p_key,
+                               disk_block_id_t block_id,
+                               record_id_t rec_id,
+                               bool shared_manifest)
+  {
+    value_t new_val(block_id, rec_id, shared_manifest);
+    uint32_t idx = find_entry(p_key);
+    value_t &val = hash_tab[idx].val;
+    if (!val.is_occupied()) {
+      if (occupied_count < entries_count) {
+        occupied_count++;
+      }
+      else {
+        return -EOVERFLOW;
+      }
+
+      hash_tab[idx].key = *p_key;
+      hash_tab[idx].val = new_val;
+      ldpp_dout(dpp, 20) << __func__ << "::add new entry" << dendl;
+      ceph_assert(val.count == 1);
+    }
+    else {
+      if (hash_tab[idx].key != *p_key) {
+        // every slot is occupied and none of them holds this key
+        ldpp_dout(dpp, 5) << __func__ << "::ERR: table is full" << dendl;
+        return -EOVERFLOW;
+      }
+      val.count ++;
+      if (!val.has_shared_manifest() && shared_manifest) {
+        // replace value!
+        ldpp_dout(dpp, 20) << __func__ << "::Replace with shared_manifest::["
+                           << val.block_idx << "/" << (int)val.rec_id << "] -> ["
+                           << block_id << "/" << (int)rec_id << "]" << dendl;
+        new_val.count = val.count;
+        hash_tab[idx].val = new_val;
+      }
+      ceph_assert(val.count > 1);
+    }
+    values_count++;
+    ldpp_dout(dpp, 20) << __func__ << "::COUNT="<< val.count << dendl;
+    return 0;
+  }
+
+  //---------------------------------------------------------------------------
+  // Point the existing, non-singleton entry of @p_key at @block_id/@rec_id,
+  // keeping its count. Entries already flagged shared_manifest are left as is.
+  // The key must be present (asserted).
+  void dedup_table_t::update_entry(key_t *p_key,
+                                   disk_block_id_t block_id,
+                                   record_id_t rec_id,
+                                   bool shared_manifest)
+  {
+    uint32_t idx = find_entry(p_key);
+    ceph_assert(hash_tab[idx].key == *p_key);
+    value_t &val = hash_tab[idx].val;
+    ceph_assert(val.is_occupied());
+    // we only update non-singletons since we purge singletons after the first pass
+    ceph_assert(val.count > 1);
+
+    // need to overwrite the block_idx/rec_id from the first pass
+    // unless already set with shared_manifest with the correct block-id/rec-id
+    // We only set the shared_manifest flag on the second pass where we
+    // got valid block-id/rec-id
+    if (!val.has_shared_manifest()) {
+      // replace value!
+      // log first: val refers to the table slot, so after the assignment below
+      // it would already show the new location
+      ldpp_dout(dpp, 20) << __func__ << "::Replaced table entry::["
+                         << val.block_idx << "/" << (int)val.rec_id << "] -> ["
+                         << block_id << "/" << (int)rec_id << "]" << dendl;
+      value_t new_val(block_id, rec_id, shared_manifest);
+      new_val.count = val.count;
+      hash_tab[idx].val = new_val;
+    }
+  }
+
+  //---------------------------------------------------------------------------
+  // Flag the entry of @p_key as shared-manifest source, but only if it still
+  // points at @block_id/@rec_id.
+  // Returns 0 when the flag was set, -ENOENT otherwise.
+  int dedup_table_t::set_shared_manifest_src_mode(const key_t *p_key,
+                                                  disk_block_id_t block_id,
+                                                  record_id_t rec_id)
+  {
+    uint32_t idx = find_entry(p_key);
+    value_t &val = hash_tab[idx].val;
+    // an occupied slot is a match only if it holds our key (see find_entry)
+    if (val.is_occupied() && (hash_tab[idx].key == *p_key)) {
+      if (val.block_idx == block_id && val.rec_id == rec_id) {
+        val.set_shared_manifest_src();
+        return 0;
+      }
+    }
+
+    return -ENOENT;
+  }
+
+  //---------------------------------------------------------------------------
+  // Copy the value stored for @p_key into @p_val.
+  // Returns 0 on success, -ENOENT when the key is not in the table.
+  int dedup_table_t::get_val(const key_t *p_key, struct value_t *p_val /*OUT*/)
+  {
+    uint32_t idx = find_entry(p_key);
+    const value_t &val = hash_tab[idx].val;
+    // an occupied slot is a match only if it holds our key (see find_entry)
+    if (!val.is_occupied() || (hash_tab[idx].key != *p_key)) {
+      return -ENOENT;
+    }
+
+    *p_val = val;
+    return 0;
+  }
+
+  //---------------------------------------------------------------------------
+  void dedup_table_t::count_duplicates(dedup_stats_t *p_small_objs,
+                                       dedup_stats_t *p_big_objs,
+                                       uint64_t *p_duplicate_head_bytes)
+  {
+    for (uint32_t tab_idx = 0; tab_idx < entries_count; tab_idx++) {
+      if (!hash_tab[tab_idx].val.is_occupied()) {
+        continue;
+      }
+
+      const key_t &key = hash_tab[tab_idx].key;
+      // This is an approximation only since size is stored in 4KB resolution
+      uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units);
+      uint32_t duplicate_count = (hash_tab[tab_idx].val.count -1);
+
+      // skip small single part objects which we can't dedup
+      if (!key.multipart_object() && (byte_size_approx <= head_object_size)) {
+        if (hash_tab[tab_idx].val.is_singleton()) {
+          p_small_objs->singleton_count++;
+        }
+        else {
+          p_small_objs->duplicate_count += duplicate_count;
+          p_small_objs->unique_count ++;
+
p_small_objs->dedup_bytes_estimate += (duplicate_count * byte_size_approx); + } + continue; + } + + if (hash_tab[tab_idx].val.is_singleton()) { + p_big_objs->singleton_count++; + } + else { + ceph_assert(hash_tab[tab_idx].val.count > 1); + uint64_t dup_bytes_approx = calc_deduped_bytes(head_object_size, + key.num_parts, + byte_size_approx); + p_big_objs->dedup_bytes_estimate += (duplicate_count * dup_bytes_approx); + p_big_objs->duplicate_count += duplicate_count; + p_big_objs->unique_count ++; + + if (!key.multipart_object()) { + // single part objects duplicate the head object when dedup is used + uint64_t dup_head_bytes = duplicate_count * head_object_size; + *p_duplicate_head_bytes += dup_head_bytes; + } + } + } + } + +} // namespace rgw::dedup + +#if 0 +#include +#include +#include +#include +#include +#include + +//--------------------------------------------------------------------------- +int main() +{ + static constexpr unsigned MAX_ENTRIES = 1024; + rgw::dedup::key_t *key_tab = new rgw::dedup::key_t[MAX_ENTRIES]; + if (!key_tab) { + std::cerr << "faild alloc!" 
<< std::endl; + return 1; + } + rgw::dedup::key_t *p_key = key_tab; + //rgw::dedup::dedup_table_t tab(MAX_ENTRIES + MAX_ENTRIES/5); + rgw::dedup::dedup_table_t tab(MAX_ENTRIES); + + std::cout << "sizeof(key)=" << sizeof(rgw::dedup::key_t) << std::endl; + // Seed with a real random value, if available + std::random_device r; + // Choose a random mean between 1 ULLONG_MAX + std::default_random_engine e1(r()); + std::uniform_int_distribution uniform_dist(1, std::numeric_limits::max()); + + for (unsigned i = 0; i < MAX_ENTRIES; i++) { + uint64_t md5_high = uniform_dist(e1); + uint64_t md5_low = uniform_dist(e1); + uint32_t size_4k_units = std::rand(); + uint16_t num_parts = std::rand(); + //std::cout << std::hex << md5_high << "::" << md5_low << "::" << block_id << std::endl; + rgw::dedup::key_t key(md5_high, md5_low, size_4k_units, num_parts); + *p_key = key; + p_key++; + } + work_shard_t work_shard = 3; + for (unsigned i = 0; i < MAX_ENTRIES; i++) { + disk_block_id_t block_id(worker_id, std::rand()); + tab.add_entry(key_tab+i, block_id, 0, false, false); + } + double avg = (double)total / MAX_ENTRIES; + std::cout << "Insert::num entries=" << MAX_ENTRIES << ", total=" << total + << ", avg=" << avg << ", max=" << max << std::endl; + std::cout << "==========================================\n"; + + total = 0; + max = 0; + for (unsigned i = 0; i < MAX_ENTRIES; i++) { + tab.find_entry(key_tab+i); + } + avg = (double)total / MAX_ENTRIES; + std::cout << "Find::num entries=" << MAX_ENTRIES << ", total=" << total + << ", avg=" << avg << ", max=" << max << std::endl; + std::cout << "==========================================\n"; + tab.remove_singletons_and_redistribute_keys(); + tab.print_redistribute_stats(); + tab.stat_counters_reset(); + std::cout << "==========================================\n"; + total = 0; + max = 0; + uint32_t cnt = 0; + for (unsigned i = 0; i < MAX_ENTRIES; i++) { + rgw::dedup::key_t *p_key = key_tab+i; + tab.find_entry(p_key); + cnt++; +#if 0 + if 
(p_key->md5_high % 5 == 0) { + tab.find_entry(p_key); + cnt++; + } +#endif + } + avg = (double)total / cnt; + std::cout << "num entries=" << cnt << ", total=" << total + << ", avg=" << avg << ", max=" << max << std::endl; +} +#endif diff --git a/src/rgw/driver/rados/rgw_dedup_table.h b/src/rgw/driver/rados/rgw_dedup_table.h new file mode 100644 index 000000000000..51d36006944f --- /dev/null +++ b/src/rgw/driver/rados/rgw_dedup_table.h @@ -0,0 +1,148 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Author: Gabriel BenHanokh + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once +#include +#include +#include +#include "common/dout.h" +#include "rgw_dedup_store.h" +namespace rgw::dedup { + + // 24 Bytes key + struct key_t { + key_t() { ;} + key_t(uint64_t _md5_high, + uint64_t _md5_low, + uint32_t _size_4k_units, + uint16_t _num_parts, + uint8_t _stor_class_idx) { + md5_high = _md5_high; + md5_low = _md5_low; + size_4k_units = _size_4k_units; + num_parts = _num_parts; + stor_class_idx = _stor_class_idx; + pad8 = 0; + } + + bool operator==(const struct key_t& other) const { + return (memcmp(this, &other, sizeof(other)) == 0); + } + + bool operator!=(const struct key_t& other) const { + return !operator==(other); + } + + uint64_t hash() const { + // The MD5 is already a hashing function so no need for another hash + return this->md5_low; + } + + bool multipart_object() const { + return num_parts > 0; + } + + uint64_t md5_high; // High Bytes of the Object Data MD5 + uint64_t md5_low; // Low Bytes of the Object Data MD5 + uint32_t size_4k_units; // Object size in 4KB units max out at 16TB (AWS MAX-SIZE is 5TB) + uint16_t num_parts; // How many parts were used in multipart upload (AWS MAX-PART is 
10,000) + uint8_t stor_class_idx;// storage class id + uint8_t pad8; + } __attribute__((__packed__)); + static_assert(sizeof(key_t) == 24); + + class dedup_table_t { + public: + // 8 Bytes Value + struct value_t { + value_t() { + this->block_idx = 0xFFFFFFFF; + this->count = 0; + this->rec_id = 0xFF; + this->flags.clear(); + } + + value_t(disk_block_id_t block_id, record_id_t rec_id, bool shared_manifest) { + this->block_idx = block_id; + this->count = 1; + this->rec_id = rec_id; + this->flags.clear(); + this->flags.set_occupied(); + if (shared_manifest) { + flags.set_shared_manifest(); + } + } + + inline void clear_flags() { flags.clear(); } + inline bool has_shared_manifest() const {return flags.has_shared_manifest(); } + inline void set_shared_manifest_src() { this->flags.set_shared_manifest(); } + inline bool is_singleton() const { return (count == 1); } + inline bool is_occupied() const { return flags.is_occupied(); } + inline void set_occupied() { this->flags.set_occupied(); } + inline void clear_occupied() { this->flags.clear_occupied(); } + + disk_block_id_t block_idx; // 32 bits + uint16_t count; // 16 bits + record_id_t rec_id; // 8 bits + dedup_flags_t flags; // 8 bits + } __attribute__((__packed__)); + static_assert(sizeof(value_t) == 8); + + dedup_table_t(const DoutPrefixProvider* _dpp, + uint32_t _head_object_size, + uint8_t *p_slab, + uint64_t slab_size); + int add_entry(key_t *p_key, disk_block_id_t block_id, record_id_t rec_id, + bool shared_manifest); + void update_entry(key_t *p_key, disk_block_id_t block_id, record_id_t rec_id, + bool shared_manifest); + + int get_val(const key_t *p_key, struct value_t *p_val /*OUT*/); + + int set_shared_manifest_src_mode(const key_t *p_key, + disk_block_id_t block_id, + record_id_t rec_id); + + void count_duplicates(dedup_stats_t *p_small_objs_stat, + dedup_stats_t *p_big_objs_stat, + uint64_t *p_duplicate_head_bytes); + + void remove_singletons_and_redistribute_keys(); + private: + // 32 Bytes unified entries 
+ struct table_entry_t { + key_t key; + value_t val; + } __attribute__((__packed__)); + static_assert(sizeof(table_entry_t) == 32); + + uint32_t find_entry(const key_t *p_key) const; + uint32_t values_count = 0; + uint32_t entries_count = 0; + uint32_t occupied_count = 0; + uint32_t head_object_size = (4ULL * 1024 * 1024); + table_entry_t *hash_tab = nullptr; + + // stat counters + uint64_t redistributed_count = 0; + uint64_t redistributed_search_total = 0; + uint64_t redistributed_search_max = 0; + uint64_t redistributed_loopback = 0; + uint64_t redistributed_perfect = 0; + uint64_t redistributed_clear = 0; + uint64_t redistributed_not_needed = 0; + const DoutPrefixProvider* dpp; + }; + +} //namespace rgw::dedup diff --git a/src/rgw/driver/rados/rgw_dedup_utils.cc b/src/rgw/driver/rados/rgw_dedup_utils.cc new file mode 100644 index 000000000000..baadee5aeef5 --- /dev/null +++ b/src/rgw/driver/rados/rgw_dedup_utils.cc @@ -0,0 +1,697 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Author: Gabriel BenHanokh + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "rgw_dedup_utils.h" +#include "common/ceph_crypto.h" + +namespace rgw::dedup { + //--------------------------------------------------------------------------- + std::ostream& operator<<(std::ostream &out, const dedup_req_type_t& dedup_type) + { + if (dedup_type == dedup_req_type_t::DEDUP_TYPE_NONE) { + out << "DEDUP_TYPE_NONE"; + } + else if (dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE) { + out << "DEDUP_TYPE_ESTIMATE"; + } + else if (dedup_type == dedup_req_type_t::DEDUP_TYPE_FULL) { + out << "DEDUP_TYPE_FULL"; + } + else { + out << "\n*** unexpected dedup_type ***\n"; + } + + return out; + } + + //--------------------------------------------------------------------------- + dedup_stats_t& dedup_stats_t::operator+=(const dedup_stats_t& other) + { + this->singleton_count += other.singleton_count; + this->unique_count += other.unique_count; + this->duplicate_count += other.duplicate_count; + this->dedup_bytes_estimate += other.dedup_bytes_estimate; + return *this; + } + + //--------------------------------------------------------------------------- + std::ostream& operator<<(std::ostream &out, const dedup_stats_t& stats) + { + out << "::singleton_count=" << stats.singleton_count + << "::unique_count=" << stats.unique_count + << "::duplicate_count=" << stats.duplicate_count + << "::duplicated_bytes=" << stats.dedup_bytes_estimate; + return out; + } + + //--------------------------------------------------------------------------- + void encode(const dedup_stats_t& ds, ceph::bufferlist& bl) + { + ENCODE_START(1, 1, bl); + encode(ds.singleton_count, bl); + encode(ds.unique_count, bl); + encode(ds.duplicate_count, bl); + encode(ds.dedup_bytes_estimate, bl); + ENCODE_FINISH(bl); + } + + //--------------------------------------------------------------------------- + void decode(dedup_stats_t& ds, ceph::bufferlist::const_iterator& bl) + { + DECODE_START(1, bl); + decode(ds.singleton_count, bl); + decode(ds.unique_count, bl); + 
decode(ds.duplicate_count, bl); + decode(ds.dedup_bytes_estimate, bl); + DECODE_FINISH(bl); + } + + // convert a hex-string to a 64bit integer (max 16 hex digits) + //--------------------------------------------------------------------------- + bool hex2int(const char *p, const char *p_end, uint64_t *p_val) + { + if (p_end - p <= (int)(sizeof(uint64_t) * 2)) { + uint64_t val = 0; + while (p < p_end) { + // get current character then increment + uint8_t byte = *p++; + // transform hex character to the 4bit equivalent number, using the ASCII table indexes + if (byte >= '0' && byte <= '9') { + byte = byte - '0'; + } + else if (byte >= 'a' && byte <='f') { + byte = byte - 'a' + 10; + } + else if (byte >= 'A' && byte <='F') { + byte = byte - 'A' + 10; + } + else { + // terminate on the first non hex char + return false; + } + // shift 4 to make space for new digit, and add the 4 bits of the new digit + val = (val << 4) | (byte & 0xF); + } + *p_val = val; + return true; + } + else { + return false; + } + } + + //--------------------------------------------------------------------------- + bool dec2int(const char *p, const char* p_end, uint16_t *p_val) + { + uint16_t val = 0; + while (p < p_end) { + uint8_t byte = *p++; + if (byte >= '0' && byte <= '9') { + val = val * 10 + (byte - '0'); + } + else { + // terminate on the first non hex char + return false; + } + } + *p_val = val; + return true; + } + + // 16Bytes MD5 takes 32 chars + const unsigned MD5_LENGTH = 32; + + //--------------------------------------------------------------------------- + static bool get_num_parts(const std::string & etag, uint16_t *p_num_parts) + { + // Amazon S3 multipart upload Maximum number = 10,000 + const unsigned MAX_PARTS = 10000; + if (etag.length() <= MD5_LENGTH) { + // i.e. 
no multipart + *p_num_parts = 0; + return true; + } + + // Amazon S3 multipart upload Maximum number = 10,000 (5 decimal digits) + // We need 1 extra byte for the '-' delimiter and 1 extra byte for '"' at the end + // 7 Bytes should suffice, but we roundup to 8 Bytes + const unsigned MAX_PART_LEN = 8; + if (unlikely(etag.length() > MD5_LENGTH + MAX_PART_LEN)) { + // illegal ETAG + return false; + } + + std::string::size_type n = etag.find('-', etag.length() - MAX_PART_LEN); + if (n != std::string::npos) { + char buff[MAX_PART_LEN]; + // again, 1 extra byte for the '-' delimiter + unsigned copy_size = etag.length() - (n + 1); + if (copy_size <= MAX_PART_LEN) { + unsigned nbytes = etag.copy(buff, copy_size, n+1); + uint16_t num_parts; + const unsigned MAX_UINT16_DIGITS = 5; // 65536 + if (nbytes <= MAX_UINT16_DIGITS) { + if (dec2int(buff, buff+nbytes, &num_parts) && num_parts <= MAX_PARTS) { + *p_num_parts = num_parts; + return true; + } // else, not all digits are legal + } // else, more than 5 digits + } // else, copy len too large + } // else, '-' delimiter was not found + + // illegal number of parts + return false; + } + + //--------------------------------------------------------------------------- + bool parse_etag_string(const std::string& etag, parsed_etag_t *parsed_etag) + { + char buff[MD5_LENGTH*2]; + uint16_t num_parts = 0; + if (get_num_parts(etag, &num_parts)) { + etag.copy(buff, MD5_LENGTH, 0); + uint64_t high, low; + if (hex2int(buff, buff+16, &high)) { + if (hex2int(buff+16, buff+32, &low)) { + parsed_etag->md5_high = high; // High Bytes of the Object Data MD5 + parsed_etag->md5_low = low; // Low Bytes of the Object Data MD5 + parsed_etag->num_parts = num_parts; // How many parts were used in multipart upload + return true; + } + } + } + + // an illegal etag string + return false; + } + + //--------------------------------------------------------------------------- + void etag_to_bufferlist(uint64_t md5_high, uint64_t md5_low, uint16_t num_parts, + 
ceph::bufferlist *bl) + { + char buff[64]; + int n = snprintf(buff, sizeof(buff), "%016lx%016lx", md5_high, md5_low); + if (num_parts >= 1) { + n += snprintf(buff + n, sizeof(buff) - n, "-%u", num_parts); + } + bl->append(buff, n); + } + + //--------------------------------------------------------------------------- + const char* get_next_data_ptr(bufferlist::const_iterator &bl_itr, + char data_buff[], + size_t len, + const DoutPrefixProvider* dpp) + { + const char *p = nullptr; + size_t n = bl_itr.get_ptr_and_advance(len, &p); + if (n == len) { + // we got a zero-copy raw pointer to contiguous data on the buffer-list + return p; + } + + std::vector vec; + // otherwise - copy the data to the @data_buff + char *p_buff = data_buff; + do { + vec.push_back(n); + std::memcpy(p_buff, p, n); + p_buff += n; + len -= n; + if (len > 0) { + n = bl_itr.get_ptr_and_advance(len, &p); + } + } while (len > 0); + + ldpp_dout(dpp, 20) << __func__ << "::vec=" << vec << dendl; + return data_buff; + } + + static const char* s_urgent_msg_names[] = { + "URGENT_MSG_NONE", + "URGENT_MSG_ABORT", + "URGENT_MSG_PASUE", + "URGENT_MSG_RESUME", + "URGENT_MSG_RESTART", + "URGENT_MSG_INVALID" + }; + + //--------------------------------------------------------------------------- + const char* get_urgent_msg_names(int msg) + { + if (msg <= URGENT_MSG_INVALID && msg >= URGENT_MSG_NONE) { + return s_urgent_msg_names[msg]; + } + else { + return s_urgent_msg_names[URGENT_MSG_INVALID]; + } + } + + //--------------------------------------------------------------------------- + worker_stats_t& worker_stats_t::operator+=(const worker_stats_t& other) + { + this->ingress_obj += other.ingress_obj; + this->ingress_obj_bytes += other.ingress_obj_bytes; + this->egress_records += other.egress_records; + this->egress_blocks += other.egress_blocks; + this->egress_slabs += other.egress_slabs; + this->single_part_objs += other.single_part_objs; + this->multipart_objs += other.multipart_objs; + 
this->small_multipart_obj += other.small_multipart_obj; + this->default_storage_class_objs += other.default_storage_class_objs; + this->default_storage_class_objs_bytes += other.default_storage_class_objs_bytes; + this->non_default_storage_class_objs += other.non_default_storage_class_objs; + this->non_default_storage_class_objs_bytes += other.non_default_storage_class_objs_bytes; + this->ingress_corrupted_etag += other.ingress_corrupted_etag; + this->ingress_skip_too_small_bytes += other.ingress_skip_too_small_bytes; + this->ingress_skip_too_small += other.ingress_skip_too_small; + this->ingress_skip_too_small_64KB_bytes += other.ingress_skip_too_small_64KB_bytes; + this->ingress_skip_too_small_64KB += other.ingress_skip_too_small_64KB; + + return *this; + } + //--------------------------------------------------------------------------- + void worker_stats_t::dump(Formatter *f) const + { + // main section + { + Formatter::ObjectSection main(*f, "main"); + + f->dump_unsigned("Ingress Objs count", this->ingress_obj); + f->dump_unsigned("Accum byte size Ingress Objs", this->ingress_obj_bytes); + f->dump_unsigned("Egress Records count", this->egress_records); + f->dump_unsigned("Egress Blocks count", this->egress_blocks); + f->dump_unsigned("Egress Slabs count", this->egress_slabs); + f->dump_unsigned("Single part obj count", this->single_part_objs); + f->dump_unsigned("Multipart obj count", this->multipart_objs); + if (this->small_multipart_obj) { + f->dump_unsigned("Small Multipart obj count", this->small_multipart_obj); + } + } + + { + Formatter::ObjectSection notify(*f, "notify"); + + if(this->non_default_storage_class_objs) { + f->dump_unsigned("non default storage class objs", + this->non_default_storage_class_objs); + f->dump_unsigned("non default storage class objs bytes", + this->non_default_storage_class_objs_bytes); + } + else { + ceph_assert(this->default_storage_class_objs == this->ingress_obj); + ceph_assert(this->default_storage_class_objs_bytes == 
this->ingress_obj_bytes); + } + } + + { + Formatter::ObjectSection skipped(*f, "skipped"); + if(this->ingress_skip_too_small) { + f->dump_unsigned("Ingress skip: too small objs", + this->ingress_skip_too_small); + f->dump_unsigned("Ingress skip: too small bytes", + this->ingress_skip_too_small_bytes); + + if(this->ingress_skip_too_small_64KB) { + f->dump_unsigned("Ingress skip: 64KB<=size<=4MB Obj", + this->ingress_skip_too_small_64KB); + f->dump_unsigned("Ingress skip: 64KB<=size<=4MB Bytes", + this->ingress_skip_too_small_64KB_bytes); + } + } + } + + { + Formatter::ObjectSection failed(*f, "failed"); + if(this->ingress_corrupted_etag) { + f->dump_unsigned("Corrupted ETAG", this->ingress_corrupted_etag); + } + } + } + + //--------------------------------------------------------------------------- + std::ostream& operator<<(std::ostream &out, const worker_stats_t &s) + { + JSONFormatter formatter(false); + s.dump(&formatter); + std::stringstream sstream; + formatter.flush(sstream); + out << sstream.str(); + return out; + } + + //--------------------------------------------------------------------------- + void encode(const worker_stats_t& w, ceph::bufferlist& bl) + { + ENCODE_START(1, 1, bl); + encode(w.ingress_obj, bl); + encode(w.ingress_obj_bytes, bl); + encode(w.egress_records, bl); + encode(w.egress_blocks, bl); + encode(w.egress_slabs, bl); + + encode(w.single_part_objs, bl); + encode(w.multipart_objs, bl); + encode(w.small_multipart_obj, bl); + + encode(w.default_storage_class_objs, bl); + encode(w.default_storage_class_objs_bytes, bl); + encode(w.non_default_storage_class_objs, bl); + encode(w.non_default_storage_class_objs_bytes, bl); + + encode(w.ingress_corrupted_etag, bl); + + encode(w.ingress_skip_too_small_bytes, bl); + encode(w.ingress_skip_too_small, bl); + + encode(w.ingress_skip_too_small_64KB_bytes, bl); + encode(w.ingress_skip_too_small_64KB, bl); + + encode(w.duration, bl); + ENCODE_FINISH(bl); + } + + 
//--------------------------------------------------------------------------- + void decode(worker_stats_t& w, ceph::bufferlist::const_iterator& bl) + { + DECODE_START(1, bl); + decode(w.ingress_obj, bl); + decode(w.ingress_obj_bytes, bl); + decode(w.egress_records, bl); + decode(w.egress_blocks, bl); + decode(w.egress_slabs, bl); + decode(w.single_part_objs, bl); + decode(w.multipart_objs, bl); + decode(w.small_multipart_obj, bl); + decode(w.default_storage_class_objs, bl); + decode(w.default_storage_class_objs_bytes, bl); + decode(w.non_default_storage_class_objs, bl); + decode(w.non_default_storage_class_objs_bytes, bl); + decode(w.ingress_corrupted_etag, bl); + decode(w.ingress_skip_too_small_bytes, bl); + decode(w.ingress_skip_too_small, bl); + decode(w.ingress_skip_too_small_64KB_bytes, bl); + decode(w.ingress_skip_too_small_64KB, bl); + + decode(w.duration, bl); + DECODE_FINISH(bl); + } + + //--------------------------------------------------------------------------- + md5_stats_t& md5_stats_t::operator+=(const md5_stats_t& other) + { + this->small_objs_stat += other.small_objs_stat; + this->big_objs_stat += other.big_objs_stat; + this->ingress_failed_load_bucket += other.ingress_failed_load_bucket; + this->ingress_failed_get_object += other.ingress_failed_get_object; + this->ingress_failed_get_obj_attrs += other.ingress_failed_get_obj_attrs; + this->ingress_corrupted_etag += other.ingress_corrupted_etag; + this->ingress_corrupted_obj_attrs += other.ingress_corrupted_obj_attrs; + this->ingress_skip_encrypted += other.ingress_skip_encrypted; + this->ingress_skip_encrypted_bytes += other.ingress_skip_encrypted_bytes; + this->ingress_skip_compressed += other.ingress_skip_compressed; + this->ingress_skip_compressed_bytes += other.ingress_skip_compressed_bytes; + this->ingress_skip_changed_objs += other.ingress_skip_changed_objs; + this->shared_manifest_dedup_bytes += other.shared_manifest_dedup_bytes; + + this->skipped_shared_manifest += 
other.skipped_shared_manifest; + this->skipped_purged_small += other.skipped_purged_small; + this->skipped_singleton += other.skipped_singleton; + this->skipped_singleton_bytes += other.skipped_singleton_bytes; + this->skipped_source_record += other.skipped_source_record; + this->duplicate_records += other.duplicate_records; + this->size_mismatch += other.size_mismatch; + this->hash_mismatch += other.hash_mismatch; + this->failed_src_load += other.failed_src_load; + this->failed_rec_load += other.failed_rec_load; + this->failed_block_load += other.failed_block_load; + + this->valid_hash_attrs += other.valid_hash_attrs; + this->invalid_hash_attrs += other.invalid_hash_attrs; + this->set_hash_attrs += other.set_hash_attrs; + this->skip_hash_cmp += other.skip_hash_cmp; + + this->set_shared_manifest_src += other.set_shared_manifest_src; + this->loaded_objects += other.loaded_objects; + this->processed_objects += other.processed_objects; + this->dup_head_bytes_estimate += other.dup_head_bytes_estimate; + this->deduped_objects += other.deduped_objects; + this->deduped_objects_bytes += other.deduped_objects_bytes; + this->dup_head_bytes += other.dup_head_bytes; + + this->failed_dedup += other.failed_dedup; + this->failed_table_load += other.failed_table_load; + this->failed_map_overflow += other.failed_map_overflow; + return *this; + } + + //--------------------------------------------------------------------------- + std::ostream& operator<<(std::ostream &out, const md5_stats_t &s) + { + JSONFormatter formatter(false); + s.dump(&formatter); + std::stringstream sstream; + formatter.flush(sstream); + out << sstream.str(); + return out; + } + + //--------------------------------------------------------------------------- + void md5_stats_t::dump(Formatter *f) const + { + // main section + { + Formatter::ObjectSection main(*f, "main"); + + f->dump_unsigned("Total processed objects", this->processed_objects); + f->dump_unsigned("Loaded objects", this->loaded_objects); + 
f->dump_unsigned("Set Shared-Manifest SRC", this->set_shared_manifest_src); + f->dump_unsigned("Deduped Obj (this cycle)", this->deduped_objects); + f->dump_unsigned("Deduped Bytes(this cycle)", this->deduped_objects_bytes); + f->dump_unsigned("Dup head bytes (not dedup)", this->dup_head_bytes); + f->dump_unsigned("Already Deduped bytes (prev cycles)", + this->shared_manifest_dedup_bytes); + + const dedup_stats_t &ds = this->big_objs_stat; + f->dump_unsigned("Singleton Obj", ds.singleton_count); + f->dump_unsigned("Unique Obj", ds.unique_count); + f->dump_unsigned("Duplicate Obj", ds.duplicate_count); + f->dump_unsigned("Dedup Bytes Estimate", ds.dedup_bytes_estimate); + } + + // Potential Dedup Section: + // What could be gained by allowing dedup for smaller objects (64KB-4MB) + // Space wasted because of duplicated head-object (4MB) + { + Formatter::ObjectSection potential(*f, "Potential Dedup"); + const dedup_stats_t &ds = this->small_objs_stat; + f->dump_unsigned("Singleton Obj (64KB-4MB)", ds.singleton_count); + f->dump_unsigned("Unique Obj (64KB-4MB)", ds.unique_count); + f->dump_unsigned("Duplicate Obj (64KB-4MB)", ds.duplicate_count); + f->dump_unsigned("Dedup Bytes Estimate (64KB-4MB)", ds.dedup_bytes_estimate); + f->dump_unsigned("Duplicated Head Bytes Estimate", + this->dup_head_bytes_estimate); + f->dump_unsigned("Duplicated Head Bytes", this->dup_head_bytes); + } + + { + Formatter::ObjectSection notify(*f, "notify"); + if (this->failed_table_load) { + f->dump_unsigned("Failed Table Load", this->failed_table_load); + } + if (this->failed_map_overflow) { + f->dump_unsigned("Failed Remap Overflow", this->failed_map_overflow); + } + + f->dump_unsigned("Valid HASH attrs", this->valid_hash_attrs); + f->dump_unsigned("Invalid HASH attrs", this->invalid_hash_attrs); + + if (this->set_hash_attrs) { + f->dump_unsigned("Set HASH", this->set_hash_attrs); + } + + if (this->skip_hash_cmp) { + f->dump_unsigned("Can't run HASH compare", this->skip_hash_cmp); + } + } + 
+ { + Formatter::ObjectSection skipped(*f, "skipped"); + f->dump_unsigned("Skipped shared_manifest", this->skipped_shared_manifest); + f->dump_unsigned("Skipped purged small objs", this->skipped_purged_small); + f->dump_unsigned("Skipped singleton objs", this->skipped_singleton); + if (this->skipped_singleton) { + f->dump_unsigned("Skipped singleton Bytes", this->skipped_singleton_bytes); + } + f->dump_unsigned("Skipped source record", this->skipped_source_record); + + if (this->ingress_skip_encrypted) { + f->dump_unsigned("Skipped Encrypted objs", this->ingress_skip_encrypted); + f->dump_unsigned("Skipped Encrypted Bytes",this->ingress_skip_encrypted_bytes); + } + if (this->ingress_skip_compressed) { + f->dump_unsigned("Skipped Compressed objs", this->ingress_skip_compressed); + f->dump_unsigned("Skipped Compressed Bytes", this->ingress_skip_compressed_bytes); + } + if (this->ingress_skip_changed_objs) { + f->dump_unsigned("Skipped Changed Object", this->ingress_skip_changed_objs); + } + } + + { + Formatter::ObjectSection sys_failures(*f, "system failures"); + if (this->ingress_failed_load_bucket) { + f->dump_unsigned("Failed load_bucket()", this->ingress_failed_load_bucket); + } + if (this->ingress_failed_get_object) { + f->dump_unsigned("Failed get_object()", this->ingress_failed_get_object); + } + if (this->ingress_failed_get_obj_attrs) { + f->dump_unsigned("Failed get_obj_attrs", this->ingress_failed_get_obj_attrs); + } + if (this->ingress_corrupted_etag) { + f->dump_unsigned("Corrupted ETAG", this->ingress_corrupted_etag); + } + if (this->ingress_corrupted_obj_attrs) { + f->dump_unsigned("Corrupted obj attributes", this->ingress_corrupted_obj_attrs); + } + if (this->failed_src_load) { + f->dump_unsigned("Failed SRC-Load ", this->failed_src_load); + } + if (this->failed_rec_load) { + f->dump_unsigned("Failed Record-Load ", this->failed_rec_load); + } + if (this->failed_block_load) { + f->dump_unsigned("Failed Block-Load ", this->failed_block_load); + } + if 
(this->failed_dedup) { + f->dump_unsigned("Failed Dedup", this->failed_dedup); + } + } + + { + Formatter::ObjectSection logical_failures(*f, "logical failures"); + if (this->hash_mismatch) { + f->dump_unsigned("HASH mismatch", this->hash_mismatch); + } + if (this->duplicate_records) { + f->dump_unsigned("Duplicate SRC/TGT", this->duplicate_records); + } + if (this->size_mismatch) { + f->dump_unsigned("Size mismatch SRC/TGT", this->size_mismatch); + } + } + } + + //--------------------------------------------------------------------------- + void encode(const md5_stats_t& m, ceph::bufferlist& bl) + { + ENCODE_START(1, 1, bl); + + encode(m.small_objs_stat, bl); + encode(m.big_objs_stat, bl); + encode(m.ingress_failed_load_bucket, bl); + encode(m.ingress_failed_get_object, bl); + encode(m.ingress_failed_get_obj_attrs, bl); + encode(m.ingress_corrupted_etag, bl); + encode(m.ingress_corrupted_obj_attrs, bl); + encode(m.ingress_skip_encrypted, bl); + encode(m.ingress_skip_encrypted_bytes, bl); + encode(m.ingress_skip_compressed, bl); + encode(m.ingress_skip_compressed_bytes, bl); + encode(m.ingress_skip_changed_objs, bl); + encode(m.shared_manifest_dedup_bytes, bl); + + encode(m.skipped_shared_manifest, bl); + encode(m.skipped_purged_small, bl); + encode(m.skipped_singleton, bl); + encode(m.skipped_singleton_bytes, bl); + encode(m.skipped_source_record, bl); + encode(m.duplicate_records, bl); + encode(m.size_mismatch, bl); + encode(m.hash_mismatch, bl); + encode(m.failed_src_load, bl); + encode(m.failed_rec_load, bl); + encode(m.failed_block_load, bl); + + encode(m.valid_hash_attrs, bl); + encode(m.invalid_hash_attrs, bl); + encode(m.set_hash_attrs, bl); + encode(m.skip_hash_cmp, bl); + encode(m.set_shared_manifest_src, bl); + + encode(m.loaded_objects, bl); + encode(m.processed_objects, bl); + encode(m.dup_head_bytes_estimate, bl); + encode(m.deduped_objects, bl); + encode(m.deduped_objects_bytes, bl); + encode(m.dup_head_bytes, bl); + encode(m.failed_dedup, bl); + 
encode(m.failed_table_load, bl); + encode(m.failed_map_overflow, bl); + + encode(m.duration, bl); + ENCODE_FINISH(bl); + } + + //--------------------------------------------------------------------------- + void decode(md5_stats_t& m, ceph::bufferlist::const_iterator& bl) + { + DECODE_START(1, bl); + decode(m.small_objs_stat, bl); + decode(m.big_objs_stat, bl); + decode(m.ingress_failed_load_bucket, bl); + decode(m.ingress_failed_get_object, bl); + decode(m.ingress_failed_get_obj_attrs, bl); + decode(m.ingress_corrupted_etag, bl); + decode(m.ingress_corrupted_obj_attrs, bl); + decode(m.ingress_skip_encrypted, bl); + decode(m.ingress_skip_encrypted_bytes, bl); + decode(m.ingress_skip_compressed, bl); + decode(m.ingress_skip_compressed_bytes, bl); + decode(m.ingress_skip_changed_objs, bl); + decode(m.shared_manifest_dedup_bytes, bl); + + decode(m.skipped_shared_manifest, bl); + decode(m.skipped_purged_small, bl); + decode(m.skipped_singleton, bl); + decode(m.skipped_singleton_bytes, bl); + decode(m.skipped_source_record, bl); + decode(m.duplicate_records, bl); + decode(m.size_mismatch, bl); + decode(m.hash_mismatch, bl); + decode(m.failed_src_load, bl); + decode(m.failed_rec_load, bl); + decode(m.failed_block_load, bl); + + decode(m.valid_hash_attrs, bl); + decode(m.invalid_hash_attrs, bl); + decode(m.set_hash_attrs, bl); + decode(m.skip_hash_cmp, bl); + decode(m.set_shared_manifest_src, bl); + + decode(m.loaded_objects, bl); + decode(m.processed_objects, bl); + decode(m.dup_head_bytes_estimate, bl); + decode(m.deduped_objects, bl); + decode(m.deduped_objects_bytes, bl); + decode(m.dup_head_bytes, bl); + decode(m.failed_dedup, bl); + decode(m.failed_table_load, bl); + decode(m.failed_map_overflow, bl); + + decode(m.duration, bl); + DECODE_FINISH(bl); + } +} //namespace rgw::dedup diff --git a/src/rgw/driver/rados/rgw_dedup_utils.h b/src/rgw/driver/rados/rgw_dedup_utils.h new file mode 100644 index 000000000000..f008fcaba38b --- /dev/null +++ 
b/src/rgw/driver/rados/rgw_dedup_utils.h @@ -0,0 +1,267 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Author: Gabriel BenHanokh + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once +#include +#include "include/rados/buffer.h" +#include "include/encoding.h" +#include "common/Formatter.h" +#include "common/ceph_json.h" +#include +#include "include/utime.h" +#include "include/encoding.h" +#include "common/dout.h" + +#define FULL_DEDUP_SUPPORT +namespace rgw::dedup { + using work_shard_t = uint16_t; + using md5_shard_t = uint16_t; + + // settings to help debug small systems + const work_shard_t MIN_WORK_SHARD = 2; + const md5_shard_t MIN_MD5_SHARD = 4; + + // Those are the correct values for production system + const work_shard_t MAX_WORK_SHARD = 255; + const md5_shard_t MAX_MD5_SHARD = 512; + + const work_shard_t NULL_WORK_SHARD = 0xFFFF; + const md5_shard_t NULL_MD5_SHARD = 0xFFFF; + const unsigned NULL_SHARD = 0xFFFF; + + // work_shard is an 8 bits int with 255 legal values for the first iteration + // and one value (0xFF) reserved for second iteration + const unsigned WORK_SHARD_HARD_LIMIT = 0x0FF; + // md5_shard_t is a 12 bits int with 4096 possible values + const unsigned MD5_SHARD_HARD_LIMIT = 0xFFF; + + static_assert(MAX_WORK_SHARD < NULL_WORK_SHARD); + static_assert(MAX_WORK_SHARD < NULL_SHARD); + static_assert(MAX_WORK_SHARD <= WORK_SHARD_HARD_LIMIT); + static_assert(MAX_MD5_SHARD < NULL_MD5_SHARD); + static_assert(MAX_MD5_SHARD < NULL_SHARD); + static_assert(MAX_MD5_SHARD <= MD5_SHARD_HARD_LIMIT); + + //--------------------------------------------------------------------------- + enum dedup_req_type_t { + DEDUP_TYPE_NONE = 0, + DEDUP_TYPE_ESTIMATE = 1, + DEDUP_TYPE_FULL = 2 + }; + + 
std::ostream& operator<<(std::ostream &out, const dedup_req_type_t& dedup_type); + struct __attribute__ ((packed)) dedup_flags_t { + private: + static constexpr uint8_t RGW_DEDUP_FLAG_HASH_CALCULATED = 0x01; // REC + static constexpr uint8_t RGW_DEDUP_FLAG_SHARED_MANIFEST = 0x02; // REC + TAB + static constexpr uint8_t RGW_DEDUP_FLAG_OCCUPIED = 0x04; // TAB + static constexpr uint8_t RGW_DEDUP_FLAG_FASTLANE = 0x08; // REC + + public: + dedup_flags_t() : flags(0) {} + dedup_flags_t(uint8_t _flags) : flags(_flags) {} + inline void clear() { this->flags = 0; } + inline bool hash_calculated() const { return ((flags & RGW_DEDUP_FLAG_HASH_CALCULATED) != 0); } + inline void set_hash_calculated() { flags |= RGW_DEDUP_FLAG_HASH_CALCULATED; } + inline bool has_shared_manifest() const { return ((flags & RGW_DEDUP_FLAG_SHARED_MANIFEST) != 0); } + inline void set_shared_manifest() { flags |= RGW_DEDUP_FLAG_SHARED_MANIFEST; } + inline bool is_occupied() const {return ((this->flags & RGW_DEDUP_FLAG_OCCUPIED) != 0); } + inline void set_occupied() {this->flags |= RGW_DEDUP_FLAG_OCCUPIED; } + inline void clear_occupied() { this->flags &= ~RGW_DEDUP_FLAG_OCCUPIED; } + inline bool is_fastlane() const { return ((flags & RGW_DEDUP_FLAG_FASTLANE) != 0); } + inline void set_fastlane() { flags |= RGW_DEDUP_FLAG_FASTLANE; } + private: + uint8_t flags; + }; + + struct dedup_stats_t { + dedup_stats_t& operator+=(const dedup_stats_t& other); + + uint64_t singleton_count = 0; + uint64_t unique_count = 0; + uint64_t duplicate_count = 0; + uint64_t dedup_bytes_estimate = 0; + }; + + std::ostream& operator<<(std::ostream &out, const dedup_stats_t& stats); + void encode(const dedup_stats_t& ds, ceph::bufferlist& bl); + void decode(dedup_stats_t& ds, ceph::bufferlist::const_iterator& bl); + + struct worker_stats_t { + worker_stats_t& operator +=(const worker_stats_t& other); + void dump(Formatter *f) const; + + uint64_t ingress_obj = 0; + uint64_t ingress_obj_bytes = 0; + uint64_t egress_records = 
0; + uint64_t egress_blocks = 0; + uint64_t egress_slabs = 0; + + uint64_t single_part_objs = 0; + uint64_t multipart_objs = 0; + uint64_t small_multipart_obj = 0; + + uint64_t default_storage_class_objs = 0; + uint64_t default_storage_class_objs_bytes = 0; + + uint64_t non_default_storage_class_objs = 0; + uint64_t non_default_storage_class_objs_bytes = 0; + + uint64_t ingress_corrupted_etag = 0; + + uint64_t ingress_skip_too_small_bytes = 0; + uint64_t ingress_skip_too_small = 0; + + uint64_t ingress_skip_too_small_64KB_bytes = 0; + uint64_t ingress_skip_too_small_64KB = 0; + + utime_t duration = {0, 0}; + }; + std::ostream& operator<<(std::ostream &out, const worker_stats_t &s); + void encode(const worker_stats_t& w, ceph::bufferlist& bl); + void decode(worker_stats_t& w, ceph::bufferlist::const_iterator& bl); + + + struct md5_stats_t { + md5_stats_t& operator +=(const md5_stats_t& other); + void dump(Formatter *f) const; + + dedup_stats_t small_objs_stat; + dedup_stats_t big_objs_stat; + uint64_t ingress_failed_load_bucket = 0; + uint64_t ingress_failed_get_object = 0; + uint64_t ingress_failed_get_obj_attrs = 0; + uint64_t ingress_corrupted_etag = 0; + uint64_t ingress_corrupted_obj_attrs = 0; + uint64_t ingress_skip_encrypted = 0; + uint64_t ingress_skip_encrypted_bytes = 0; + uint64_t ingress_skip_compressed = 0; + uint64_t ingress_skip_compressed_bytes = 0; + uint64_t ingress_skip_changed_objs = 0; + + uint64_t shared_manifest_dedup_bytes = 0; + uint64_t skipped_shared_manifest = 0; + uint64_t skipped_purged_small = 0; + uint64_t skipped_singleton = 0; + uint64_t skipped_singleton_bytes = 0; + uint64_t skipped_source_record = 0; + uint64_t duplicate_records = 0; + uint64_t size_mismatch = 0; + uint64_t hash_mismatch = 0; + uint64_t failed_src_load = 0; + uint64_t failed_rec_load = 0; + uint64_t failed_block_load = 0; + + uint64_t valid_hash_attrs = 0; + uint64_t invalid_hash_attrs = 0; + uint64_t set_hash_attrs = 0; + uint64_t skip_hash_cmp = 0; + + 
uint64_t set_shared_manifest_src = 0; + uint64_t loaded_objects = 0; + uint64_t processed_objects = 0; + // counter is using on-disk size affected by block-size + uint64_t dup_head_bytes_estimate = 0; //duplicate_head_bytes + uint64_t deduped_objects = 0; + // counter is using s3 byte size disregarding the on-disk size affected by block-size + uint64_t deduped_objects_bytes = 0; + uint64_t dup_head_bytes = 0; + uint64_t failed_dedup = 0; + uint64_t failed_table_load = 0; + uint64_t failed_map_overflow = 0; + utime_t duration = {0, 0}; + }; + std::ostream &operator<<(std::ostream &out, const md5_stats_t &s); + void encode(const md5_stats_t& m, ceph::bufferlist& bl); + void decode(md5_stats_t& m, ceph::bufferlist::const_iterator& bl); + + struct parsed_etag_t { + uint64_t md5_high; // High Bytes of the Object Data MD5 + uint64_t md5_low; // Low Bytes of the Object Data MD5 + uint16_t num_parts; // How many parts were used in multipart upload + // Setting num_parts to zero when multipart is not used + }; + +#define DIV_UP(a, b) ( ((a)+(b-1)) / b) + // CEPH min allocation unit on disk is 4KB + // TBD: take from config + static constexpr uint64_t DISK_ALLOC_SIZE = 4*1024; + // 16 bytes hexstring -> 8 Byte uint64_t + static inline constexpr unsigned HEX_UNIT_SIZE = 16; + + //--------------------------------------------------------------------------- + static inline uint64_t byte_size_to_disk_blocks(uint64_t byte_size) { + return DIV_UP(byte_size, DISK_ALLOC_SIZE); + } + + //--------------------------------------------------------------------------- + static inline uint64_t disk_blocks_to_byte_size(uint64_t disk_blocks) { + return disk_blocks * DISK_ALLOC_SIZE; + } + + //--------------------------------------------------------------------------- + // ceph store full blocks so need to round up and multiply by block_size + static inline uint64_t calc_on_disk_byte_size(uint64_t byte_size) { + uint64_t size_4k_units = byte_size_to_disk_blocks(byte_size); + return 
disk_blocks_to_byte_size(size_4k_units); + } + + enum urgent_msg_t { + URGENT_MSG_NONE = 0, + URGENT_MSG_ABORT = 1, + URGENT_MSG_PASUE = 2, + URGENT_MSG_RESUME = 3, + URGENT_MSG_RESTART = 4, + URGENT_MSG_INVALID = 5 + }; + + const char* get_urgent_msg_names(int msg); + bool hex2int(const char *p, const char *p_end, uint64_t *p_val); + bool parse_etag_string(const std::string& etag, parsed_etag_t *parsed_etag); + void etag_to_bufferlist(uint64_t md5_high, uint64_t md5_low, uint16_t num_parts, + ceph::bufferlist *bl); + const char* get_next_data_ptr(bufferlist::const_iterator &bl_itr, + char data_buff[], + size_t len, + const DoutPrefixProvider* dpp); + + //--------------------------------------------------------------------------- + static inline void build_oid(const std::string &bucket_id, + const std::string &obj_name, + std::string *oid) + { + *oid = bucket_id + "_" + obj_name; + } + + //--------------------------------------------------------------------------- + static inline uint64_t calc_deduped_bytes(uint64_t head_obj_size, + uint16_t num_parts, + uint64_t size_bytes) + { + if (num_parts > 0) { + // multipart objects with an empty head i.e. 
we achive full dedup + return size_bytes; + } + else { + // reduce the head size + if (size_bytes > head_obj_size) { + return size_bytes - head_obj_size; + } + else { + return 0; + } + } + } + +} //namespace rgw::dedup diff --git a/src/rgw/rgw_appmain.cc b/src/rgw/rgw_appmain.cc index cb3428ea9da2..dddf3583fbe3 100644 --- a/src/rgw/rgw_appmain.cc +++ b/src/rgw/rgw_appmain.cc @@ -65,7 +65,9 @@ #include "rgw_asio_frontend.h" #include "rgw_dmclock_scheduler_ctx.h" #include "rgw_lua.h" +#ifdef WITH_RADOSGW_RADOS #include "rgw_dedup.h" +#endif #ifdef WITH_RADOSGW_DBSTORE #include "rgw_sal_dbstore.h" #endif @@ -595,6 +597,7 @@ void rgw::AppMain::init_lua() #endif } /* init_lua */ +#ifdef WITH_RADOSGW_RADOS void rgw::AppMain::init_dedup() { rgw::sal::Driver* driver = env.driver; @@ -609,6 +612,7 @@ void rgw::AppMain::init_dedup() } } } +#endif void rgw::AppMain::shutdown(std::function finalize_async_signals) { @@ -637,9 +641,11 @@ void rgw::AppMain::shutdown(std::function finalize_async_signals) ldh.reset(nullptr); // deletes ldap helper if it was created rgw_log_usage_finalize(); +#ifdef WITH_RADOSGW_RADOS if (dedup_background) { dedup_background->shutdown(); } +#endif if (lua_background) { lua_background->shutdown(); diff --git a/src/rgw/rgw_dedup.cc b/src/rgw/rgw_dedup.cc deleted file mode 100644 index 7c00ddf6f2a7..000000000000 --- a/src/rgw/rgw_dedup.cc +++ /dev/null @@ -1,2704 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Author: Gabriel BenHanokh - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "include/rados/rados_types.hpp" -#include "include/rados/buffer.h" -#include "include/rados/librados.hpp" -#include "rgw_tools.h" -#include "svc_zone.h" -#include "common/config.h" -#include "common/Cond.h" -#include "common/debug.h" -#include "common/errno.h" -#include "rgw_common.h" -#include "rgw_sal.h" -#include "rgw_zone.h" -#include "rgw_cache.h" -#include "rgw_acl.h" -#include "rgw_acl_s3.h" /* for dumping s3policy in debug log */ -#include "rgw_aio_throttle.h" -#include "driver/rados/rgw_bucket.h" -#include "rgw_sal_config.h" -#include "rgw_lib.h" -#include "rgw_placement_types.h" -#include "driver/rados/rgw_bucket.h" -#include "driver/rados/rgw_sal_rados.h" -#include "cls/rgw/cls_rgw_ops.h" -#include "cls/rgw/cls_rgw_client.h" -#include "cls/rgw/cls_rgw_const.h" -#include "cls/refcount/cls_refcount_client.h" -#include "cls/version/cls_version_client.h" -#include "fmt/ranges.h" -#include "osd/osd_types.h" -#include "common/ceph_crypto.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -//using namespace std::chrono_literals; -using namespace librados; -using namespace std; -using namespace rgw::dedup; - -#include "rgw_dedup_remap.h" -#include "rgw_sal_rados.h" -#include "rgw_dedup_table.h" -#include "rgw_dedup_utils.h" -#include "rgw_dedup.h" -#include "rgw_dedup_store.h" -#include "rgw_dedup_cluster.h" -#include "rgw_dedup_epoch.h" -#include "rgw_perf_counters.h" -#include "include/ceph_assert.h" - -static constexpr auto dout_subsys = ceph_subsys_rgw_dedup; - -namespace rgw::dedup { - static inline constexpr unsigned MAX_STORAGE_CLASS_IDX = 128; - using storage_class_idx_t = uint8_t; - - //--------------------------------------------------------------------------- - void Background::DedupWatcher::handle_notify(uint64_t notify_id, uint64_t cookie, - uint64_t notifier_id, bufferlist &bl) - { - 
ldpp_dout(parent->dpp, 10) << __func__ << "::notify_id=" << notify_id - << "::cookie=" << cookie - << "::notifier_id=" << notifier_id << dendl; - if (parent->d_watch_handle != cookie) { - ldpp_dout(parent->dpp, 1) << __func__ << "::ERR: wrong cookie=" << cookie - << "::d_watch_handle=" << parent->d_watch_handle - << dendl; - return; - } - parent->handle_notify(notify_id, cookie, bl); - } - - //--------------------------------------------------------------------------- - void Background::DedupWatcher::handle_error(uint64_t cookie, int err) - { - if (parent->d_watch_handle != cookie) { - ldpp_dout(parent->dpp, 1) << __func__ << "::ERR: wrong cookie=" << cookie - << "::d_watch_handle=" << parent->d_watch_handle - << dendl; - return; - } - ldpp_dout(parent->dpp, 1) << __func__ << "::error=" << err << dendl; - - parent->unwatch_reload(parent->dpp); - parent->watch_reload(parent->dpp); - } - - //--------------------------------------------------------------------------- - void control_t::reset() - { - this->dedup_type = dedup_req_type_t::DEDUP_TYPE_NONE; - this->started = false; - this->dedup_exec = false; - this->shutdown_req = false; - this->shutdown_done = false; - this->local_pause_req = false; - this->local_paused = false; - this->remote_abort_req = false; - this->remote_aborted = false; - this->remote_pause_req = false; - this->remote_paused = false; - this->remote_restart_req = false; - } - - //--------------------------------------------------------------------------- - void encode(const control_t& ctl, ceph::bufferlist& bl) - { - ENCODE_START(1, 1, bl); - encode(static_cast(ctl.dedup_type), bl); - encode(ctl.started, bl); - encode(ctl.dedup_exec, bl); - encode(ctl.shutdown_req, bl); - encode(ctl.shutdown_done, bl); - encode(ctl.local_pause_req, bl); - encode(ctl.local_paused, bl); - encode(ctl.remote_abort_req, bl); - encode(ctl.remote_aborted, bl); - encode(ctl.remote_pause_req, bl); - encode(ctl.remote_paused, bl); - encode(ctl.remote_restart_req, bl); - 
ENCODE_FINISH(bl); - } - - //--------------------------------------------------------------------------- - void decode(control_t& ctl, ceph::bufferlist::const_iterator& bl) - { - DECODE_START(1, bl); - int32_t dedup_type; - decode(dedup_type, bl); - ctl.dedup_type = static_cast (dedup_type); - decode(ctl.started, bl); - decode(ctl.dedup_exec, bl); - decode(ctl.shutdown_req, bl); - decode(ctl.shutdown_done, bl); - decode(ctl.local_pause_req, bl); - decode(ctl.local_paused, bl); - decode(ctl.remote_abort_req, bl); - decode(ctl.remote_aborted, bl); - decode(ctl.remote_pause_req, bl); - decode(ctl.remote_paused, bl); - decode(ctl.remote_restart_req, bl); - DECODE_FINISH(bl); - } - - //--------------------------------------------------------------------------- - std::ostream& operator<<(std::ostream &out, const control_t &ctl) - { - out << ctl.dedup_type; - if (ctl.started) { - out << "::started"; - } - if (ctl.dedup_exec) { - out << "::dedup_exec"; - } - if (ctl.shutdown_req) { - out << "::shutdown_req"; - } - if (ctl.shutdown_done) { - out << "::shutdown_done"; - } - if (ctl.local_pause_req) { - out << "::local_pause_req"; - } - if (ctl.local_paused) { - out << "::local_paused"; - } - if (ctl.remote_abort_req) { - out << "::remote_abort_req"; - } - if (ctl.remote_aborted) { - out << "::remote_aborted"; - } - if (ctl.remote_pause_req) { - out << "::remote_pause_req"; - } - if (ctl.remote_paused) { - out << "::remote_paused"; - } - if (ctl.remote_restart_req) { - out << "::remote_restart_req"; - } - - return out; - } - - //=========================================================================== - // rgw::dedup::Background - //=========================================================================== - //--------------------------------------------------------------------------- - static void display_ioctx_state(const DoutPrefixProvider *dpp, - const librados::IoCtx &ioctx, - const char *caller) - { - if (ioctx.is_valid()) { - ldpp_dout(dpp, 5) << caller << "::valid 
ioctx, instance_id=" - << ioctx.get_instance_id() << dendl; - } - else { - ldpp_dout(dpp, 5) << caller << "::invalid ioctx" << dendl; - } - } - - //--------------------------------------------------------------------------- - static int safe_pool_delete(rgw::sal::RadosStore *store, - const DoutPrefixProvider *dpp, - int64_t expected_pool_id) - { - const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool; - auto rados_handle = store->getRados()->get_rados_handle(); - int64_t pool_id = rados_handle->pool_lookup(dedup_pool.name.c_str()); - if (pool_id < 0) { - int err = pool_id; - if (err == ENOENT) { - ldpp_dout(dpp, 10) <<__func__ << "::pool doesn't exist (probably was removed by other RGW)::" - << dedup_pool.name << "::expected_pool_id=" - << expected_pool_id << dendl; - } - else { - ldpp_dout(dpp, 5) <<__func__ << "::failed pool_lookup(" << dedup_pool.name - << ") err=" << cpp_strerror(-err) << dendl; - } - return err; - } - - if (pool_id != expected_pool_id) { - ldpp_dout(dpp, 5) << __func__ << "::ERR: pool_id was changed from: " - << expected_pool_id << " to: " << pool_id - << " abort pool_delete() request!" 
<< dendl; - // report Stale file handle - return -ESTALE; - } - - ldpp_dout(dpp, 10) <<__func__ << "::calling delete pool(" << dedup_pool.name - << ") pool_id=" << pool_id << dendl; - return rados_handle->pool_delete(dedup_pool.name.c_str()); - } - - //--------------------------------------------------------------------------- - static int64_t create_pool(rgw::sal::RadosStore *store, - const DoutPrefixProvider *dpp, - const std::string &pool_name) - { -#if 0 - // using Replica-1 for the intermediate data - // since it can be regenerated in case of a failure - std::string replica_count(std::to_string(1)); -#else - // temporary solution until we find a way to disable the health warn on replica1 - std::string replica_count(std::to_string(2)); -#endif - librados::bufferlist inbl; - std::string output; - std::string command = R"( - { - "prefix": "osd pool create", - "pool": ")" + pool_name + - R"(", - "pool_type": "replicated", - "size": )" + replica_count + - R"( - })"; - - auto rados_handle = store->getRados()->get_rados_handle(); - int ret = rados_handle->mon_command(command, inbl, nullptr, &output); - if (output.length()) { - if (output != "pool 'rgw_dedup_pool' already exists") { - ldpp_dout(dpp, 10) << __func__ << "::" << output << dendl; - } - } - if (ret != 0 && ret != -EEXIST) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to create pool " - << pool_name << " with: " - << cpp_strerror(-ret) << ", ret=" << ret << dendl; - return ret; - } - const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool; - return rados_handle->pool_lookup(dedup_pool.name.c_str()); - } - - //--------------------------------------------------------------------------- - static int init_dedup_pool_ioctx(rgw::sal::RadosStore *store, - const DoutPrefixProvider *dpp, - bool create, - librados::IoCtx &ioctx) - { - const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool; - std::string pool_name(dedup_pool.name.c_str()); - auto rados_handle = 
store->getRados()->get_rados_handle(); - int64_t pool_id = rados_handle->pool_lookup(dedup_pool.name.c_str()); - if (pool_id >= 0) { - // TBD: what to do when create option is passed - ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name - << " already exists, pool_id=" << pool_id << dendl; - } - else if (create) { - pool_id = create_pool(store, dpp, pool_name); - if (pool_id >= 0) { - ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name - << " was created, pool_id=" << pool_id << dendl; - } - else { - return pool_id; - } - } - else { - ldpp_dout(dpp, 1) << __func__ - << "::ERR: pool doesn't exist and no create option" << dendl; - return -ENOENT; - } - - int ret = rgw_init_ioctx(dpp, rados_handle, dedup_pool, ioctx); - if (unlikely(ret < 0)) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: failed rgw_init_ioctx() ret=" << ret - << "::" << cpp_strerror(-ret) << dendl; - return ret; - } - - ret = ioctx.application_enable("rgw_dedup", false); - if (ret == 0) { - ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name - << " was associated with dedup app" << dendl; - } - else { - ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to enable pool " - << dedup_pool.name << " with: " - << cpp_strerror(-ret) << ", ret=" << ret << dendl; - } - return ret; - } - - //--------------------------------------------------------------------------- - int Background::init_rados_access_handles(bool init_pool) - { - store = dynamic_cast(driver); - if (!store) { - ldpp_dout(dpp, 0) << "ERR: failed dynamic_cast to RadosStore" << dendl; - // this is the return code used in rgw_bucket.cc - return -ENOTSUP; - } - - rados = store->getRados(); - rados_handle = rados->get_rados_handle(); - if (init_pool) { - int ret = init_dedup_pool_ioctx(store, dpp, true, d_dedup_cluster_ioctx); - display_ioctx_state(dpp, d_dedup_cluster_ioctx, __func__); - return ret; - } - return 0; - } - - //--------------------------------------------------------------------------- - 
Background::Background(rgw::sal::Driver* _driver, CephContext* _cct) : - driver(_driver), - dp(_cct, dout_subsys, "dedup background: "), - dpp(&dp), - cct(_cct), - d_cluster(dpp, cct, driver), - d_watcher_ctx(this) - { - d_min_obj_size_for_dedup = cct->_conf->rgw_max_chunk_size; - d_head_object_size = cct->_conf->rgw_max_chunk_size; - //ceph_assert(4*1024*1024 == d_head_object_size); - - int ret = init_rados_access_handles(false); - if (ret != 0) { - derr << __func__ << "::ERR: failed init_rados_access_handles() ret=" - << ret << "::" << cpp_strerror(-ret) << dendl; - throw std::runtime_error("Failed init_rados_access_handles()"); - } - - d_heart_beat_last_update = ceph_clock_now(); - d_heart_beat_max_elapsed_sec = 3; - } - - //--------------------------------------------------------------------------- - int Background::add_disk_rec_from_bucket_idx(disk_block_array_t &disk_arr, - const rgw::sal::Bucket *p_bucket, - const parsed_etag_t *p_parsed_etag, - const std::string &obj_name, - uint64_t obj_size, - const std::string &storage_class) - { - disk_record_t rec(p_bucket, obj_name, p_parsed_etag, obj_size, storage_class); - // First pass using only ETAG and size taken from bucket-index - rec.s.flags.set_fastlane(); - - auto p_disk = disk_arr.get_shard_block_seq(p_parsed_etag->md5_low); - disk_block_seq_t::record_info_t rec_info; - int ret = p_disk->add_record(d_dedup_cluster_ioctx, &rec, &rec_info); - if (unlikely(ret != 0)) { - return ret; - } - ldpp_dout(dpp, 20) << __func__ << "::" << p_bucket->get_name() << "/" - << obj_name << " was written to block_idx=" - << rec_info.block_id << " rec_id=" << rec_info.rec_id << dendl; - return 0; - } - - //--------------------------------------------------------------------------- - int Background::add_record_to_dedup_table(dedup_table_t *p_table, - const disk_record_t *p_rec, - disk_block_id_t block_id, - record_id_t rec_id, - md5_stats_t *p_stats, - remapper_t *remapper) - { - uint32_t size_4k_units = 
byte_size_to_disk_blocks(p_rec->s.obj_bytes_size); - storage_class_idx_t sc_idx = remapper->remap(p_rec->stor_class, dpp, - &p_stats->failed_map_overflow); - if (unlikely(sc_idx == remapper_t::NULL_IDX)) { - // TBD: need stat counters - return -EOVERFLOW; - } - key_t key(p_rec->s.md5_high, p_rec->s.md5_low, size_4k_units, - p_rec->s.num_parts, sc_idx); - bool has_shared_manifest = p_rec->has_shared_manifest(); - ldpp_dout(dpp, 20) << __func__ << "::bucket=" << p_rec->bucket_name - << ", obj=" << p_rec->obj_name << ", block_id=" - << (uint32_t)block_id << ", rec_id=" << (uint32_t)rec_id - << ", shared_manifest=" << has_shared_manifest - << "::num_parts=" << p_rec->s.num_parts - << "::size_4k_units=" << key.size_4k_units - << "::ETAG=" << std::hex << p_rec->s.md5_high - << p_rec->s.md5_low << std::dec << dendl; - - int ret = p_table->add_entry(&key, block_id, rec_id, has_shared_manifest); - if (ret == 0) { - p_stats->loaded_objects ++; - ldpp_dout(dpp, 20) << __func__ << "::" << p_rec->bucket_name << "/" - << p_rec->obj_name << " was added successfully to table" - << "::loaded_objects=" << p_stats->loaded_objects << dendl; - } - else { - // We allocate memory for the dedup on startup based on the existing obj count - // If the system grew significantly since that point we won't be able to - // accommodate all the objects in the hash-table. - // Please keep in mind that it is very unlikely since duplicates objects will - // consume a single entry and since we skip small objects so in reality - // I expect the allocation to be more than sufficient. 
- // - // However, if we filled up the system there is still value is continuing - // with this process since we might find duplicates to existing object (which - // don't take extra space) - - int level = 15; - if (p_stats->failed_table_load % 0x10000 == 0) { - level = 5; - } - else if (p_stats->failed_table_load % 0x100 == 0) { - level = 10; - } - ldpp_dout(dpp, level) << __func__ << "::Failed p_table->add_entry (overflow)" - << "::loaded_objects=" << p_stats->loaded_objects - << "::failed_table_load=" << p_stats->failed_table_load - << dendl; - - p_stats->failed_table_load++; - } - return ret; - } - -#ifdef FULL_DEDUP_SUPPORT - - static constexpr uint64_t cost = 1; // 1 throttle unit per request - static constexpr uint64_t id = 0; // ids unused - //--------------------------------------------------------------------------- - [[maybe_unused]]static void show_ref_tags(const DoutPrefixProvider* dpp, std::string &oid, rgw_rados_ref &obj) - { - unsigned idx = 0; - std::list refs; - std::string wildcard_tag; - int ret = cls_refcount_read(obj.ioctx, oid, &refs, true); - if (ret < 0) { - ldpp_dout(dpp, 0) << __func__ << "::ERR: manifest::failed cls_refcount_read()" - << " idx=" << idx << dendl; - return; - } - - for (list::iterator iter = refs.begin(); iter != refs.end(); ++iter) { - ldpp_dout(dpp, 20) << __func__ << "::manifest::" << oid << "::" << idx - << "::TAG=" << *iter << dendl; - } - } - - //--------------------------------------------------------------------------- - int Background::free_tail_objs_by_manifest(const string &ref_tag, - const string &oid, - RGWObjManifest &tgt_manifest) - { - unsigned idx = 0; - for (auto p = tgt_manifest.obj_begin(dpp); p != tgt_manifest.obj_end(dpp); ++p, ++idx) { - rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados); - if (oid == raw_obj.oid) { - ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: " << raw_obj.oid << dendl; - continue; - } - - rgw_rados_ref obj; - int ret = rgw_get_rados_ref(dpp, 
rados_handle, raw_obj, &obj); - if (ret < 0) { - ldpp_dout(dpp, 1) << __func__ << "ERR: manifest::failed to open context " - << obj << dendl; - continue; - } - librados::IoCtx ioctx = obj.ioctx; - ldpp_dout(dpp, 20) << __func__ << "::removing tail object: " << raw_obj.oid - << dendl; - ret = ioctx.remove(raw_obj.oid); - } - - return 0; - } - - //--------------------------------------------------------------------------- - int Background::rollback_ref_by_manifest(const string &ref_tag, - const string &oid, - RGWObjManifest &manifest) - { - unsigned idx = 0; - int ret_code = 0; - std::unique_ptr aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, null_yield); - for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p, ++idx) { - rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados); - if (oid == raw_obj.oid) { - ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: " - << raw_obj.oid << dendl; - continue; - } - - rgw_rados_ref obj; - int local_ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj); - if (local_ret < 0) { - ret_code = local_ret; - ldpp_dout(dpp, 1) << __func__ << "::ERR: manifest::failed to open context " - << obj << dendl; - // skip bad objects, nothing we can do - continue; - } - - ObjectWriteOperation op; - cls_refcount_put(op, ref_tag, true); - rgw::AioResultList completed = aio->get(obj.obj, - rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield), - cost, id); - } - rgw::AioResultList completed = aio->drain(); - return ret_code; - } - - //--------------------------------------------------------------------------- - int Background::inc_ref_count_by_manifest(const string &ref_tag, - const string &oid, - RGWObjManifest &manifest) - { - std::unique_ptr aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, null_yield); - rgw::AioResultList all_results; - int ret = 0; - unsigned idx = 0; - for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p, ++idx) { - 
rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados); - if (oid == raw_obj.oid) { - ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: " << raw_obj.oid << dendl; - continue; - } - - rgw_rados_ref obj; - ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj); - if (ret < 0) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: manifest::failed to open context " - << obj << dendl; - break; - } - - ObjectWriteOperation op; - cls_refcount_get(op, ref_tag, true); - ldpp_dout(dpp, 20) << __func__ << "::inc ref-count on tail object: " << raw_obj.oid << dendl; - rgw::AioResultList completed = aio->get(obj.obj, - rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield), - cost, id); - ret = rgw::check_for_errors(completed); - all_results.splice(all_results.end(), completed); - if (ret < 0) { - ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to copy obj=" << obj - << ", the error code = " << ret << dendl; - break; - } - } - - if (ret == 0) { - rgw::AioResultList completed = aio->drain(); - int ret = rgw::check_for_errors(completed); - all_results.splice(all_results.end(), completed); - if (ret == 0) { - return 0; - } - else { - ldpp_dout(dpp, 1) << __func__ << "::ERR: manifest: failed to drain ios ret=" - << ret < rollback all ref-inc operations - /* wait all pending op done */ - rgw::AioResultList completed = aio->drain(); - all_results.splice(all_results.end(), completed); - int ret2 = 0; - for (auto& aio_res : all_results) { - if (aio_res.result < 0) { - continue; // skip errors - } - rgw_rados_ref obj; - ret2 = rgw_get_rados_ref(dpp, rados_handle, aio_res.obj, &obj); - if (ret2 < 0) { - continue; - } - - ObjectWriteOperation op; - cls_refcount_put(op, ref_tag, true); - rgw::AioResultList completed = aio->get(obj.obj, - rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield), - cost, id); - ret2 = rgw::check_for_errors(completed); - if (ret2 < 0) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: cleanup after error failed to drop reference on obj=" << 
aio_res.obj << dendl; - } - } - completed = aio->drain(); - ret2 = rgw::check_for_errors(completed); - if (ret2 < 0) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to drain rollback ios, ret=" - << ret2 < bucket; - { - rgw_bucket b{p_rec->tenant_name, p_rec->bucket_name, p_rec->bucket_id}; - int ret = driver->load_bucket(dpp, b, &bucket, null_yield); - if (unlikely(ret != 0)) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: driver->load_bucket(): " - << cpp_strerror(-ret) << dendl; - return ret; - } - } - - build_oid(p_rec->bucket_id, p_rec->obj_name, oid); - //ldpp_dout(dpp, 0) << __func__ << "::OID=" << oid << " || bucket_id=" << bucket_id << dendl; - rgw_pool data_pool; - rgw_obj obj{bucket->get_key(), *oid}; - if (!rados->get_obj_data_pool(bucket->get_placement_rule(), obj, &data_pool)) { - ldpp_dout(dpp, 1) << __func__ << "::failed to get data pool for bucket " - << bucket->get_name() << dendl; - return -EIO; - } - int ret = rgw_init_ioctx(dpp, rados->get_rados_handle(), data_pool, *p_ioctx); - if (ret < 0) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to get ioctx from data pool:" - << data_pool.to_str() << dendl; - return -EIO; - } - - return 0; - } - - //--------------------------------------------------------------------------- - static void init_cmp_pairs(const disk_record_t *p_rec, - const bufferlist &etag_bl, - bufferlist &hash_bl, // OUT PARAM - librados::ObjectWriteOperation *p_op) - { - p_op->cmpxattr(RGW_ATTR_ETAG, CEPH_OSD_CMPXATTR_OP_EQ, etag_bl); - // TBD: do we really need the secondary compare using the full manifest? - // Can replace it with something cheaper like size/version? 
- p_op->cmpxattr(RGW_ATTR_MANIFEST, CEPH_OSD_CMPXATTR_OP_EQ, p_rec->manifest_bl); - - // BLAKE3 hash has 256 bit splitted into multiple 64bit units - const unsigned units = (256 / (sizeof(uint64_t)*8)); - static_assert(units == 4); - for (unsigned i = 0; i < units; i++) { - ceph::encode(p_rec->s.hash[i], hash_bl); - } - - if (!p_rec->s.flags.hash_calculated()) { - p_op->cmpxattr(RGW_ATTR_BLAKE3, CEPH_OSD_CMPXATTR_OP_EQ, hash_bl); - } - } - - //--------------------------------------------------------------------------- - int Background::dedup_object(const disk_record_t *p_src_rec, - const disk_record_t *p_tgt_rec, - md5_stats_t *p_stats, - bool has_shared_manifest_src) - { - RGWObjManifest src_manifest; - try { - auto bl_iter = p_src_rec->manifest_bl.cbegin(); - decode(src_manifest, bl_iter); - } catch (buffer::error& err) { - ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad src manifest" << dendl; - return -EINVAL; - } - RGWObjManifest tgt_manifest; - try { - auto bl_iter = p_tgt_rec->manifest_bl.cbegin(); - decode(tgt_manifest, bl_iter); - } catch (buffer::error& err) { - ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad tgt manifest" << dendl; - return -EINVAL; - } - ldpp_dout(dpp, 20) << __func__ << "::DEDUP From: " - << p_src_rec->bucket_name << "/" << p_src_rec->obj_name << " -> " - << p_tgt_rec->bucket_name << "/" << p_tgt_rec->obj_name << dendl; - - bufferlist etag_bl; - etag_to_bufferlist(p_tgt_rec->s.md5_high, p_tgt_rec->s.md5_low, p_tgt_rec->s.num_parts, &etag_bl); - ldpp_dout(dpp, 20) << __func__ << "::num_parts=" << p_tgt_rec->s.num_parts - << "::ETAG=" << etag_bl.to_str() << dendl; - - bufferlist hash_bl, manifest_hash_bl, tgt_hash_bl; - crypto::digest(p_src_rec->manifest_bl).encode(hash_bl); - // Use a shorter hash (64bit instead of 160bit) - hash_bl.splice(0, 8, &manifest_hash_bl); - librados::ObjectWriteOperation tgt_op; - init_cmp_pairs(p_tgt_rec, etag_bl, tgt_hash_bl, &tgt_op); - tgt_op.setxattr(RGW_ATTR_SHARE_MANIFEST, manifest_hash_bl); - 
tgt_op.setxattr(RGW_ATTR_MANIFEST, p_src_rec->manifest_bl); - if (p_tgt_rec->s.flags.hash_calculated()) { - tgt_op.setxattr(RGW_ATTR_BLAKE3, tgt_hash_bl); - p_stats->set_hash_attrs++; - } - - std::string src_oid, tgt_oid; - librados::IoCtx src_ioctx, tgt_ioctx; - int ret1 = get_ioctx(dpp, driver, rados, p_src_rec, &src_ioctx, &src_oid); - int ret2 = get_ioctx(dpp, driver, rados, p_tgt_rec, &tgt_ioctx, &tgt_oid); - if (unlikely(ret1 != 0 || ret2 != 0)) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: failed get_ioctx()" << dendl; - return (ret1 ? ret1 : ret2); - } - - // TBD: Do we need to remove target RGW_ATTR_TAIL_TAG?? - string ref_tag = p_tgt_rec->ref_tag; - ldpp_dout(dpp, 20) << __func__ << "::ref_tag=" << ref_tag << dendl; - int ret = inc_ref_count_by_manifest(ref_tag, src_oid, src_manifest); - if (ret == 0) { - ldpp_dout(dpp, 20) << __func__ << "::send TGT CLS (Shared_Manifest)" << dendl; - ret = tgt_ioctx.operate(tgt_oid, &tgt_op); - if (unlikely(ret != 0)) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: failed tgt_ioctx.operate(" - << tgt_oid << "), err is " << cpp_strerror(-ret) << dendl; - rollback_ref_by_manifest(ref_tag, src_oid, src_manifest); - return ret; - } - - // free tail objects based on TGT manifest - free_tail_objs_by_manifest(ref_tag, tgt_oid, tgt_manifest); - - if (!has_shared_manifest_src) { - // When SRC OBJ A has two or more dups (B, C) we set SHARED_MANIFEST - // after deduping B and update it in dedup_table, but don't update the - // disk-record (as require an expensive random-disk-write). 
- // When deduping C we can trust the shared_manifest state in the table and - // skip a redundant update to SRC object attribute - bufferlist src_hash_bl; - librados::ObjectWriteOperation src_op; - init_cmp_pairs(p_src_rec, etag_bl, src_hash_bl, &src_op); - src_op.setxattr(RGW_ATTR_SHARE_MANIFEST, manifest_hash_bl); - if (p_src_rec->s.flags.hash_calculated()) { - src_op.setxattr(RGW_ATTR_BLAKE3, src_hash_bl); - p_stats->set_hash_attrs++; - } - - ldpp_dout(dpp, 20) << __func__ <<"::send SRC CLS (Shared_Manifest)"<< dendl; - ret = src_ioctx.operate(src_oid, &src_op); - if (unlikely(ret != 0)) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: failed src_ioctx.operate(" - << src_oid << "), err is " << cpp_strerror(-ret)<obj_name << dendl; - return -EINVAL; - } - - blake3_hasher hmac; - blake3_hasher_init(&hmac); - for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p) { - rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados); - rgw_rados_ref obj; - int ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj); - if (ret < 0) { - ldpp_dout(dpp, 1) << __func__ << "::failed rgw_get_rados_ref() for oid: " - << raw_obj.oid << ", err is " << cpp_strerror(-ret) << dendl; - return ret; - } - - bufferlist bl; - librados::IoCtx ioctx = obj.ioctx; - // read full object - ret = ioctx.read(raw_obj.oid, bl, 0, 0); - if (ret > 0) { - for (const auto& bptr : bl.buffers()) { - blake3_hasher_update(&hmac, (const unsigned char *)bptr.c_str(), bptr.length()); - } - } - else { - ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to read " << raw_obj.oid - << ", error is " << cpp_strerror(-ret) << dendl; - return ret; - } - } - - blake3_hasher_finalize(&hmac, p_hash, BLAKE3_OUT_LEN); - return 0; - } - - //--------------------------------------------------------------------------- - [[maybe_unused]]static void __attribute__ ((noinline)) - print_record(const DoutPrefixProvider* dpp, - const disk_record_t *p_tgt_rec, - disk_block_id_t block_id, - record_id_t rec_id, - md5_shard_t 
md5_shard) - { - ldpp_dout(dpp, 20) << __func__ << "::bucket=" << p_tgt_rec->bucket_name - << ", obj=" << p_tgt_rec->obj_name - << ", block_id=" << block_id - << ", rec_id=" << (int)rec_id - << ", md5_shard=" << (int)md5_shard << dendl; - - ldpp_dout(dpp, 20) << __func__ << "::md5_shard=" << (int)md5_shard - << "::" << p_tgt_rec->bucket_name - << "/" << p_tgt_rec->obj_name - << "::num_parts=" << p_tgt_rec->s.num_parts - << "::ETAG=" << std::hex << p_tgt_rec->s.md5_high - << p_tgt_rec->s.md5_low << std::dec << dendl; - } - - //--------------------------------------------------------------------------- - int Background::add_obj_attrs_to_record(rgw_bucket *p_rb, - disk_record_t *p_rec, - const rgw::sal::Attrs &attrs, - dedup_table_t *p_table, - md5_stats_t *p_stats) /*IN-OUT*/ - { - // if TAIL_TAG exists -> use it as ref-tag, eitherwise take ID_TAG - auto itr = attrs.find(RGW_ATTR_TAIL_TAG); - if (itr != attrs.end()) { - p_rec->ref_tag = itr->second.to_str(); - } - else { - itr = attrs.find(RGW_ATTR_ID_TAG); - if (itr != attrs.end()) { - p_rec->ref_tag = itr->second.to_str(); - } - else { - ldpp_dout(dpp, 5) << __func__ << "::No TAIL_TAG and no ID_TAG" << dendl; - return -EINVAL; - } - } - p_rec->s.ref_tag_len = p_rec->ref_tag.length(); - - // clear bufferlist first - p_rec->manifest_bl.clear(); - - itr = attrs.find(RGW_ATTR_MANIFEST); - if (itr != attrs.end()) { - const bufferlist &bl = itr->second; - RGWObjManifest manifest; - try { - auto bl_iter = bl.cbegin(); - decode(manifest, bl_iter); - } catch (buffer::error& err) { - ldpp_dout(dpp, 1) << __func__ - << "::ERROR: unable to decode manifest" << dendl; - return -EINVAL; - } - - // force explicit tail_placement as the dedup could be on another bucket - const rgw_bucket_placement& tail_placement = manifest.get_tail_placement(); - if (tail_placement.bucket.name.empty()) { - ldpp_dout(dpp, 20) << __func__ << "dedup::updating tail placement" << dendl; - manifest.set_tail_placement(tail_placement.placement_rule, 
*p_rb); - encode(manifest, p_rec->manifest_bl); - } - else { - p_rec->manifest_bl = bl; - } - p_rec->s.manifest_len = p_rec->manifest_bl.length(); - } - else { - ldpp_dout(dpp, 5) << __func__ << "::ERROR: no manifest" << dendl; - return -EINVAL; - } - - itr = attrs.find(RGW_ATTR_SHARE_MANIFEST); - if (itr != attrs.end()) { - uint64_t hash = 0; - try { - auto bl_iter = itr->second.cbegin(); - ceph::decode(hash, bl_iter); - p_rec->s.shared_manifest = hash; - } catch (buffer::error& err) { - ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad shared_manifest" << dendl; - return -EINVAL; - } - ldpp_dout(dpp, 20) << __func__ << "::Set Shared_Manifest::OBJ_NAME=" - << p_rec->obj_name << "::shared_manifest=0x" << std::hex - << p_rec->s.shared_manifest << std::dec << dendl; - p_rec->s.flags.set_shared_manifest(); - } - else { - memset(&p_rec->s.shared_manifest, 0, sizeof(p_rec->s.shared_manifest)); - } - - itr = attrs.find(RGW_ATTR_BLAKE3); - if (itr != attrs.end()) { - try { - auto bl_iter = itr->second.cbegin(); - // BLAKE3 hash 256 bit splitted into multiple 64bit units - const unsigned units = (256 / (sizeof(uint64_t)*8)); - static_assert(units == 4); - for (unsigned i = 0; i < units; i++) { - uint64_t val; - ceph::decode(val, bl_iter); - p_rec->s.hash[i] = val; - } - p_stats->valid_hash_attrs++; - return 0; - } catch (buffer::error& err) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: failed HASH decode" << dendl; - return -EINVAL; - } - } - - p_stats->invalid_hash_attrs++; - // TBD: redundant memset... - memset(p_rec->s.hash, 0, sizeof(p_rec->s.hash)); - // BLAKE3_OUT_LEN is 32 Bytes - int ret = calc_object_blake3(p_rec, (uint8_t*)p_rec->s.hash); - if (ret == 0) { - p_rec->s.flags.set_hash_calculated(); - } - - return ret; - } - - //--------------------------------------------------------------------------- - // We purged all entries not marked for-dedup (i.e. singleton bit is set) from the table - // so all entries left are sources of dedup with multiple copies. 
- // Need to read attributes from the Head-Object and output them to a new SLAB - int Background::read_object_attribute(dedup_table_t *p_table, - disk_record_t *p_rec, - disk_block_id_t old_block_id, - record_id_t old_rec_id, - md5_shard_t md5_shard, - md5_stats_t *p_stats /* IN-OUT */, - disk_block_seq_t *p_disk, - remapper_t *remapper) - { - bool should_print_debug = cct->_conf->subsys.should_gather(); - if (unlikely(should_print_debug)) { - print_record(dpp, p_rec, old_block_id, old_rec_id, md5_shard); - } - p_stats->processed_objects ++; - - uint32_t size_4k_units = byte_size_to_disk_blocks(p_rec->s.obj_bytes_size); - uint64_t ondisk_byte_size = disk_blocks_to_byte_size(size_4k_units); - storage_class_idx_t sc_idx = remapper->remap(p_rec->stor_class, dpp, - &p_stats->failed_map_overflow); - if (unlikely(sc_idx == remapper_t::NULL_IDX)) { - // TBD: need stat counters - return -EOVERFLOW; - } - key_t key_from_bucket_index(p_rec->s.md5_high, p_rec->s.md5_low, size_4k_units, - p_rec->s.num_parts, sc_idx); - dedup_table_t::value_t src_val; - int ret = p_table->get_val(&key_from_bucket_index, &src_val); - if (ret != 0) { - if (ondisk_byte_size <= d_min_obj_size_for_dedup && p_rec->s.num_parts == 0) { - // record has no valid entry in table because it is a too small - // It was loaded to table for calculation and then purged - p_stats->skipped_purged_small++; - ldpp_dout(dpp, 20) << __func__ << "::skipped purged small obj::" - << p_rec->obj_name << "::" << ondisk_byte_size << dendl; - // help small object tests pass - avoid complication differentiating between - // small objects ( < 64KB, >= 64KB <= 4MB, > 4MB - p_stats->processed_objects--; - } - else { - // record has no valid entry in table because it is a singleton - p_stats->skipped_singleton++; - p_stats->skipped_singleton_bytes += ondisk_byte_size; - ldpp_dout(dpp, 20) << __func__ << "::skipped singleton::" - << p_rec->obj_name << std::dec << dendl; - } - return 0; - } - - // Every object after this point was 
counted as a dedup potential - // If we conclude that it can't be dedup it should be accounted for - rgw_bucket b{p_rec->tenant_name, p_rec->bucket_name, p_rec->bucket_id}; - unique_ptr bucket; - ret = driver->load_bucket(dpp, b, &bucket, null_yield); - if (unlikely(ret != 0)) { - // could happen when the bucket is removed between passes - p_stats->ingress_failed_load_bucket++; - ldpp_dout(dpp, 15) << __func__ << "::Failed driver->load_bucket(): " - << cpp_strerror(-ret) << dendl; - return 0; - } - - unique_ptr p_obj = bucket->get_object(p_rec->obj_name); - if (unlikely(!p_obj)) { - // could happen when the object is removed between passes - p_stats->ingress_failed_get_object++; - ldpp_dout(dpp, 15) << __func__ << "::Failed bucket->get_object(" - << p_rec->obj_name << ")" << dendl; - return 0; - } - - ret = p_obj->get_obj_attrs(null_yield, dpp); - if (unlikely(ret < 0)) { - p_stats->ingress_failed_get_obj_attrs++; - ldpp_dout(dpp, 10) << __func__ << "::ERR: failed to stat object(" << p_rec->obj_name - << "), returned error: " << cpp_strerror(-ret) << dendl; - return ret; - } - - const rgw::sal::Attrs& attrs = p_obj->get_attrs(); - if (attrs.find(RGW_ATTR_CRYPT_MODE) != attrs.end()) { - p_stats->ingress_skip_encrypted++; - p_stats->ingress_skip_encrypted_bytes += ondisk_byte_size; - ldpp_dout(dpp, 20) <<__func__ << "::Skipping encrypted object " - << p_rec->obj_name << dendl; - return 0; - } - - // TBD: We should be able to support RGW_ATTR_COMPRESSION when all copies are compressed - if (attrs.find(RGW_ATTR_COMPRESSION) != attrs.end()) { - p_stats->ingress_skip_compressed++; - p_stats->ingress_skip_compressed_bytes += ondisk_byte_size; - ldpp_dout(dpp, 20) <<__func__ << "::Skipping compressed object " - << p_rec->obj_name << dendl; - return 0; - } - - // extract ETAG and Size and compare with values taken from the bucket-index - parsed_etag_t parsed_etag; - auto itr = attrs.find(RGW_ATTR_ETAG); - if (itr != attrs.end()) { - if 
(unlikely(!parse_etag_string(itr->second.to_str(), &parsed_etag))) { - p_stats->ingress_corrupted_etag++; - ldpp_dout(dpp, 10) << __func__ << "::ERROR: corrupted etag::" << p_rec->obj_name << dendl; - return -EINVAL; - } - } - else { - p_stats->ingress_corrupted_etag++; - ldpp_dout(dpp, 10) << __func__ << "::ERROR: no etag" << p_rec->obj_name << dendl; - return -EINVAL; - } - - std::string storage_class; - itr = attrs.find(RGW_ATTR_STORAGE_CLASS); - if (itr != attrs.end()) { - storage_class = itr->second.to_str(); - } - else { - storage_class = RGW_STORAGE_CLASS_STANDARD; - } - // no need to check for remap success as we compare keys bellow - sc_idx = remapper->remap(storage_class, dpp, &p_stats->failed_map_overflow); - key_t key_from_obj(parsed_etag.md5_high, parsed_etag.md5_low, - byte_size_to_disk_blocks(p_obj->get_size()), - parsed_etag.num_parts, sc_idx); - if (unlikely(key_from_obj != key_from_bucket_index || - p_rec->s.obj_bytes_size != p_obj->get_size())) { - ldpp_dout(dpp, 15) <<__func__ << "::Skipping changed object " - << p_rec->obj_name << dendl; - p_stats->ingress_skip_changed_objs++; - return 0; - } - - // reset flags - p_rec->s.flags.clear(); - ret = add_obj_attrs_to_record(&b, p_rec, attrs, p_table, p_stats); - if (unlikely(ret != 0)) { - ldpp_dout(dpp, 5) << __func__ << "::ERR: failed add_obj_attrs_to_record() ret=" - << ret << "::" << cpp_strerror(-ret) << dendl; - return ret; - } - - disk_block_seq_t::record_info_t rec_info; - ret = p_disk->add_record(d_dedup_cluster_ioctx, p_rec, &rec_info); - if (ret == 0) { - // set the disk_block_id_t to this unless the existing disk_block_id is marked as shared-manifest - ceph_assert(rec_info.rec_id < MAX_REC_IN_BLOCK); - ldpp_dout(dpp, 20) << __func__ << "::" << p_rec->bucket_name << "/" - << p_rec->obj_name << " was written to block_idx=" - << rec_info.block_id << "::rec_id=" << (int)rec_info.rec_id - << "::shared_manifest=" << p_rec->has_shared_manifest() << dendl; - 
p_table->update_entry(&key_from_bucket_index, rec_info.block_id, - rec_info.rec_id, p_rec->has_shared_manifest()); - } - else { - ldpp_dout(dpp, 5) << __func__ << "::ERR: Failed p_disk->add_record()"<< dendl; - if (ret == -EINVAL) { - p_stats->ingress_corrupted_obj_attrs++; - } - } - return ret; - } - - //--------------------------------------------------------------------------- - static int write_blake3_object_attribute(const DoutPrefixProvider* const dpp, - rgw::sal::Driver* driver, - RGWRados* rados, - const disk_record_t *p_rec) - { - bufferlist etag_bl; - bufferlist hash_bl; - librados::ObjectWriteOperation op; - etag_to_bufferlist(p_rec->s.md5_high, p_rec->s.md5_low, p_rec->s.num_parts, - &etag_bl); - init_cmp_pairs(p_rec, etag_bl, hash_bl /*OUT PARAM*/, &op); - op.setxattr(RGW_ATTR_BLAKE3, hash_bl); - - std::string oid; - librados::IoCtx ioctx; - int ret = get_ioctx(dpp, driver, rados, p_rec, &ioctx, &oid); - if (unlikely(ret != 0)) { - ldpp_dout(dpp, 5) << __func__ << "::ERR: failed get_ioctx()" << dendl; - return ret; - } - - ret = ioctx.operate(oid, &op); - if (unlikely(ret != 0)) { - ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ioctx.operate(" - << oid << "), err is " << cpp_strerror(-ret) << dendl; - } - return ret; - } - - //--------------------------------------------------------------------------- - // We purged all entries not marked for-dedup (i.e. singleton bit is set) from the table - // so all entries left are sources of dedup with multiple copies. - // If the record is marked as Shared-Manifest-Object -> skip it - // if the record's key doesn’t exist in table -> skip it (it is a singleton and it was purged) - // If the record block-index matches the hashtable entry -> skip it (it is the SRC object) - // All other entries are Dedicated-Manifest-Objects with a valid SRC object - - // we can withstand most errors moving to the next object - // only report an error if we recived a stop scan request! 
- // - int Background::try_deduping_record(dedup_table_t *p_table, - const disk_record_t *p_tgt_rec, - disk_block_id_t block_id, - record_id_t rec_id, - md5_shard_t md5_shard, - md5_stats_t *p_stats, /* IN-OUT */ - remapper_t *remapper) - { - bool should_print_debug = cct->_conf->subsys.should_gather(); - if (unlikely(should_print_debug)) { - print_record(dpp, p_tgt_rec, block_id, rec_id, md5_shard); - } - - uint32_t size_4k_units = byte_size_to_disk_blocks(p_tgt_rec->s.obj_bytes_size); - storage_class_idx_t sc_idx = remapper->remap(p_tgt_rec->stor_class, dpp, - &p_stats->failed_map_overflow); - ceph_assert(sc_idx != remapper_t::NULL_IDX); - key_t key(p_tgt_rec->s.md5_high, p_tgt_rec->s.md5_low, size_4k_units, - p_tgt_rec->s.num_parts, sc_idx); - dedup_table_t::value_t src_val; - int ret = p_table->get_val(&key, &src_val); - if (ret != 0) { - // record has no valid entry in table because it is a singleton - // should never happened since we purged all singletons before - ldpp_dout(dpp, 5) << __func__ << "::skipped singleton::" << p_tgt_rec->bucket_name - << "/" << p_tgt_rec->obj_name << "::num_parts=" << p_tgt_rec->s.num_parts - << "::ETAG=" << std::hex << p_tgt_rec->s.md5_high - << p_tgt_rec->s.md5_low << std::dec << dendl; - ceph_abort("Unexpcted singleton"); - return 0; - } - - disk_block_id_t src_block_id = src_val.block_idx; - record_id_t src_rec_id = src_val.rec_id; - if (block_id == src_block_id && rec_id == src_rec_id) { - // the table entry point to this record which means it is a dedup source so nothing to do - p_stats->skipped_source_record++; - ldpp_dout(dpp, 20) << __func__ << "::skipped source-record" << dendl; - return 0; - } - - // ceph store full blocks so need to round up and multiply by block_size - uint64_t ondisk_byte_size = disk_blocks_to_byte_size(size_4k_units); - uint64_t dedupable_objects_bytes = calc_deduped_bytes(d_head_object_size, - p_tgt_rec->s.num_parts, - ondisk_byte_size); - if (p_tgt_rec->s.flags.has_shared_manifest()) { - // 
record holds a shared_manifest object so can't be a dedup target - p_stats->skipped_shared_manifest++; - p_stats->shared_manifest_dedup_bytes += dedupable_objects_bytes; - ldpp_dout(dpp, 20) << __func__ << "::skipped shared_manifest" << dendl; - return 0; - } - - // This records is a dedup target with source record on source_block_id - disk_record_t src_rec; - ret = load_record(d_dedup_cluster_ioctx, p_tgt_rec, &src_rec, src_block_id, - src_rec_id, md5_shard, dpp); - if (unlikely(ret != 0)) { - p_stats->failed_src_load++; - // we can withstand most errors moving to the next object - ldpp_dout(dpp, 5) << __func__ << "::ERR: Failed load_record(" - << src_block_id << ", " << src_rec_id << ")" << dendl; - return 0; - } - - ldpp_dout(dpp, 20) << __func__ << "::SRC=" << src_rec.bucket_name - << "/" << src_rec.obj_name << dendl; - // verify that SRC and TGT records don't refer to the same physical object - // This could happen in theory if we read the same objects twice - if (src_rec.obj_name == p_tgt_rec->obj_name && src_rec.bucket_name == p_tgt_rec->bucket_name) { - p_stats->duplicate_records++; - ldpp_dout(dpp, 10) << __func__ << "::WARN: Duplicate records for object=" - << src_rec.obj_name << dendl; - return 0; - } - - // the hash table size is rounded to the nearest 4KB and will wrap after 16G - if (unlikely(src_rec.s.obj_bytes_size != p_tgt_rec->s.obj_bytes_size)) { - p_stats->size_mismatch++; - ldpp_dout(dpp, 10) << __func__ << "::WARN: different byte size for objects::" - << src_rec.obj_name << "::" << src_rec.s.obj_bytes_size - << "::" << p_tgt_rec->obj_name << "::" - << p_tgt_rec->s.obj_bytes_size << dendl; - return 0; - } - - if (memcmp(src_rec.s.hash, p_tgt_rec->s.hash, sizeof(src_rec.s.hash)) != 0) { - p_stats->hash_mismatch++; - ldpp_dout(dpp, 10) << __func__ << "::HASH mismatch" << dendl; - // TBD: set hash attributes on head objects to save calc next time - if (src_rec.s.flags.hash_calculated()) { - write_blake3_object_attribute(dpp, driver, rados, 
&src_rec); - p_stats->set_hash_attrs++; - } - if (p_tgt_rec->s.flags.hash_calculated()) { - write_blake3_object_attribute(dpp, driver, rados, p_tgt_rec); - p_stats->set_hash_attrs++; - } - return 0; - } - - ret = dedup_object(&src_rec, p_tgt_rec, p_stats, src_val.has_shared_manifest()); - if (ret == 0) { - p_stats->deduped_objects++; - p_stats->deduped_objects_bytes += dedupable_objects_bytes; - if (p_tgt_rec->s.num_parts == 0) { - // single part objects duplicate the head object when dedup is used - p_stats->dup_head_bytes += d_head_object_size; - } - - // mark the SRC object as a providor of a shared manifest - if (!src_val.has_shared_manifest()) { - p_stats->set_shared_manifest_src++; - // set the shared manifest flag in the dedup table - p_table->set_shared_manifest_src_mode(&key, src_block_id, src_rec_id); - } - else { - ldpp_dout(dpp, 20) << __func__ << "::SRC object already marked as shared_manifest" << dendl; - } - } - else { - ldpp_dout(dpp, 10) << __func__ << "::ERR: Failed dedup for " - << src_rec.bucket_name << "/" << src_rec.obj_name << dendl; - p_stats->failed_dedup++; - } - - return 0; - } - -#endif // #ifdef FULL_DEDUP_SUPPORT - //--------------------------------------------------------------------------- - const char* Background::dedup_step_name(dedup_step_t step) - { - static const char* names[] = {"STEP_NONE", - "STEP_BUCKET_INDEX_INGRESS", - "STEP_BUILD_TABLE", - "STEP_READ_ATTRIBUTES", - "STEP_REMOVE_DUPLICATES"}; - static const char* undefined_step = "UNDEFINED_STEP"; - if (step >= STEP_NONE && step <= STEP_REMOVE_DUPLICATES) { - return names[step]; - } - else { - return undefined_step; - } - } - - //--------------------------------------------------------------------------- - int Background::process_all_slabs(dedup_table_t *p_table, - dedup_step_t step, - md5_shard_t md5_shard, - work_shard_t worker_id, - uint32_t *p_slab_count, - md5_stats_t *p_stats, /* IN-OUT */ - disk_block_seq_t *p_disk_block_seq, - remapper_t *remapper) - { - char 
block_buff[sizeof(disk_block_t)]; - const int MAX_OBJ_LOAD_FAILURE = 3; - const int MAX_BAD_BLOCKS = 2; - bool has_more = true; - uint32_t seq_number = 0; - int failure_count = 0; - ldpp_dout(dpp, 20) << __func__ << "::" << dedup_step_name(step) << "::worker_id=" - << worker_id << ", md5_shard=" << md5_shard << dendl; - *p_slab_count = 0; - while (has_more) { - bufferlist bl; - int ret = load_slab(d_dedup_cluster_ioctx, bl, md5_shard, worker_id, seq_number, dpp); - if (unlikely(ret < 0)) { - ldpp_dout(dpp, 1) << __func__ << "::ERR::Failed loading object!! md5_shard=" << md5_shard - << ", worker_id=" << worker_id << ", seq_number=" << seq_number - << ", failure_count=" << failure_count << dendl; - // skip to the next SLAB stopping after 3 bad objects - if (failure_count++ < MAX_OBJ_LOAD_FAILURE) { - seq_number += DISK_BLOCK_COUNT; - continue; - } - else { - return ret; - } - } - - (*p_slab_count)++; - failure_count = 0; - unsigned slab_rec_count = 0; - auto bl_itr = bl.cbegin(); - for (uint32_t block_num = 0; block_num < DISK_BLOCK_COUNT; block_num++, seq_number++) { - disk_block_id_t disk_block_id(worker_id, seq_number); - const char *p = get_next_data_ptr(bl_itr, block_buff, sizeof(block_buff), - dpp); - disk_block_t *p_disk_block = (disk_block_t*)p; - disk_block_header_t *p_header = p_disk_block->get_header(); - p_header->deserialize(); - if (unlikely(p_header->verify(disk_block_id, dpp) != 0)) { - p_stats->failed_block_load++; - // move to next block until reaching a valid block - if (failure_count++ < MAX_BAD_BLOCKS) { - continue; - } - else { - ldpp_dout(dpp, 1) << __func__ << "::Skipping slab with too many bad blocks::" - << (int)md5_shard << ", worker_id=" << (int)worker_id - << ", seq_number=" << seq_number << dendl; - failure_count = 0; - break; - } - } - - if (p_header->rec_count == 0) { - ldpp_dout(dpp, 20) << __func__ << "::Block #" << block_num - << " has an empty header, no more blocks" << dendl; - has_more = false; - break; - } - - for (unsigned 
rec_id = 0; rec_id < p_header->rec_count; rec_id++) { - unsigned offset = p_header->rec_offsets[rec_id]; - // We deserialize the record inside the CTOR - disk_record_t rec(p + offset); - ret = rec.validate(__func__, dpp, disk_block_id, rec_id); - if (unlikely(ret != 0)) { - p_stats->failed_rec_load++; - return ret; - } - - if (step == STEP_BUILD_TABLE) { - add_record_to_dedup_table(p_table, &rec, disk_block_id, rec_id, p_stats, remapper); - slab_rec_count++; - } -#ifdef FULL_DEDUP_SUPPORT - else if (step == STEP_READ_ATTRIBUTES) { - read_object_attribute(p_table, &rec, disk_block_id, rec_id, md5_shard, - p_stats, p_disk_block_seq, remapper); - slab_rec_count++; - } - else if (step == STEP_REMOVE_DUPLICATES) { - try_deduping_record(p_table, &rec, disk_block_id, rec_id, md5_shard, - p_stats, remapper); - slab_rec_count++; - } -#endif // #ifdef FULL_DEDUP_SUPPORT - else { - ceph_abort("unexpected step"); - } - } - - check_and_update_md5_heartbeat(md5_shard, p_stats->loaded_objects, - p_stats->processed_objects); - if (unlikely(d_ctl.should_pause())) { - handle_pause_req(__func__); - } - if (unlikely(d_ctl.should_stop())) { - return -ECANCELED; - } - - has_more = (p_header->offset == BLOCK_MAGIC); - ceph_assert(p_header->offset == BLOCK_MAGIC || p_header->offset == LAST_BLOCK_MAGIC); - if (!has_more) { - ldpp_dout(dpp, 20) << __func__ << "::No more blocks! 
block_id=" << disk_block_id - << ", rec_count=" << p_header->rec_count << dendl; - break; - } - } - ldpp_dout(dpp, 20) <<__func__ << "::slab seq_number=" << seq_number - << ", rec_count=" << slab_rec_count << dendl; - } - return 0; - } - - //--------------------------------------------------------------------------- - static void __attribute__ ((noinline)) - show_ingress_bucket_idx_obj(const DoutPrefixProvider *dpp, - const parsed_etag_t &parsed_etag, - const string &bucket_name, - const string &obj_name) - { - ldpp_dout(dpp, 20) << __func__ << "::(1)::" << bucket_name << "/" << obj_name - << "::num_parts=" << parsed_etag.num_parts - << "::ETAG=" << std::hex << parsed_etag.md5_high - << parsed_etag.md5_low << std::dec << dendl; - } - - //--------------------------------------------------------------------------- - int Background::ingress_bucket_idx_single_object(disk_block_array_t &disk_arr, - const rgw::sal::Bucket *p_bucket, - const rgw_bucket_dir_entry &entry, - worker_stats_t *p_worker_stats /*IN-OUT*/) - { - // ceph store full blocks so need to round up and multiply by block_size - uint64_t ondisk_byte_size = calc_on_disk_byte_size(entry.meta.size); - // count all objects including too small and non default storage_class objs - p_worker_stats->ingress_obj++; - p_worker_stats->ingress_obj_bytes += ondisk_byte_size; - - parsed_etag_t parsed_etag; - if (unlikely(!parse_etag_string(entry.meta.etag, &parsed_etag))) { - p_worker_stats->ingress_corrupted_etag++; - ldpp_dout(dpp, 1) << __func__ << "::ERROR: corrupted etag" << dendl; - return -EINVAL; - } - - if (unlikely((cct->_conf->subsys.should_gather()))) { - show_ingress_bucket_idx_obj(dpp, parsed_etag, p_bucket->get_name(), entry.key.name); - } - - // We limit dedup to objects from the same storage_class - // TBD: - // Should we use a skip-list of storage_classes we should skip (like glacier) ? 
- const std::string& storage_class = - rgw_placement_rule::get_canonical_storage_class(entry.meta.storage_class); - if (storage_class == RGW_STORAGE_CLASS_STANDARD) { - p_worker_stats->default_storage_class_objs++; - p_worker_stats->default_storage_class_objs_bytes += ondisk_byte_size; - } - else { - ldpp_dout(dpp, 20) << __func__ << "::" << entry.key.name - << "::storage_class:" << entry.meta.storage_class << dendl; - p_worker_stats->non_default_storage_class_objs++; - p_worker_stats->non_default_storage_class_objs_bytes += ondisk_byte_size; - } - - if (ondisk_byte_size <= d_min_obj_size_for_dedup) { - if (parsed_etag.num_parts == 0) { - // dedup only useful for objects bigger than 4MB - p_worker_stats->ingress_skip_too_small++; - p_worker_stats->ingress_skip_too_small_bytes += ondisk_byte_size; - - if (ondisk_byte_size >= 64*1024) { - p_worker_stats->ingress_skip_too_small_64KB++; - p_worker_stats->ingress_skip_too_small_64KB_bytes += ondisk_byte_size; - } - else { - return 0; - } - } - else { - // multipart objects are always good candidates for dedup - // the head object is empty and data is stored only in tail objs - p_worker_stats->small_multipart_obj++; - } - } - // multipart/single_part counters are for objects being fully processed - if (parsed_etag.num_parts > 0) { - p_worker_stats->multipart_objs++; - } - else { - p_worker_stats->single_part_objs++; - } - - return add_disk_rec_from_bucket_idx(disk_arr, p_bucket, &parsed_etag, - entry.key.name, entry.meta.size, - storage_class); - } - - //--------------------------------------------------------------------------- - void Background::check_and_update_heartbeat(unsigned shard_id, uint64_t count_a, - uint64_t count_b, const char *prefix) - { - utime_t now = ceph_clock_now(); - utime_t time_elapsed = now - d_heart_beat_last_update; - if (unlikely(time_elapsed.tv.tv_sec >= d_heart_beat_max_elapsed_sec)) { - ldpp_dout(dpp, 20) << __func__ << "::max_elapsed_sec=" - << d_heart_beat_max_elapsed_sec << dendl; - 
d_heart_beat_last_update = now; - d_cluster.update_shard_token_heartbeat(store, shard_id, count_a, count_b, - prefix); - } - } - - //--------------------------------------------------------------------------- - void Background::check_and_update_worker_heartbeat(work_shard_t worker_id, - int64_t ingress_obj_count) - { - check_and_update_heartbeat(worker_id, ingress_obj_count, 0, WORKER_SHARD_PREFIX); - } - - //--------------------------------------------------------------------------- - void Background::check_and_update_md5_heartbeat(md5_shard_t md5_id, - uint64_t load_count, - uint64_t dedup_count) - { - check_and_update_heartbeat(md5_id, load_count, dedup_count, MD5_SHARD_PREFIX); - } - - //--------------------------------------------------------------------------- - static uint32_t move_to_next_bucket_index_shard(const DoutPrefixProvider* dpp, - unsigned current_shard, - unsigned num_work_shards, - const std::string &bucket_name, - rgw_obj_index_key *p_marker /* OUT-PARAM */) - { - uint32_t next_shard = current_shard + num_work_shards; - ldpp_dout(dpp, 20) << __func__ << "::" << bucket_name << "::curr_shard=" - << current_shard << ", next shard=" << next_shard << dendl; - *p_marker = rgw_obj_index_key(); // reset marker to an empty index - return next_shard; - } - - // This function process bucket-index shards of a given @bucket - // The bucket-index-shards are stored in a group of @oids - // The @oids are using a simple map from the shard-id to the oid holding bucket-indices - // We start by processing all bucket-indices owned by this @worker-id - // Once we are done with a given bucket-index shard we skip to the next - // bucket-index-shard owned by this worker-id - // if (bucket_index_shard % work_id) == 0) -> read and process bucket_index_shard - // else -> skip bucket_index_shard and don't read it - //--------------------------------------------------------------------------- - int Background::process_bucket_shards(disk_block_array_t &disk_arr, - const 
rgw::sal::Bucket *bucket, - std::map &oids, - librados::IoCtx &ioctx, - work_shard_t worker_id, - work_shard_t num_work_shards, - worker_stats_t *p_worker_stats /*IN-OUT*/) - { - const uint32_t num_shards = oids.size(); - uint32_t current_shard = worker_id; - rgw_obj_index_key marker; // start with an empty marker - const string null_prefix, null_delimiter; - const bool list_versions = true; - const int max_entries = 1000; - uint32_t obj_count = 0; - - while (current_shard < num_shards ) { - check_and_update_worker_heartbeat(worker_id, p_worker_stats->ingress_obj); - if (unlikely(d_ctl.should_pause())) { - handle_pause_req(__func__); - } - if (unlikely(d_ctl.should_stop())) { - return -ECANCELED; - } - - const string& oid = oids[current_shard]; - rgw_cls_list_ret result; - librados::ObjectReadOperation op; - // get bucket-indices of @current_shard - cls_rgw_bucket_list_op(op, marker, null_prefix, null_delimiter, max_entries, - list_versions, &result); - int ret = rgw_rados_operate(dpp, ioctx, oid, std::move(op), nullptr, null_yield); - if (unlikely(ret < 0)) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: failed rgw_rados_operate() ret=" - << ret << "::" << cpp_strerror(-ret) << dendl; - current_shard = move_to_next_bucket_index_shard(dpp, current_shard, num_work_shards, - bucket->get_name(), &marker); - continue; - } - obj_count += result.dir.m.size(); - for (auto& entry : result.dir.m) { - const rgw_bucket_dir_entry& dirent = entry.second; - if (unlikely((!dirent.exists && !dirent.is_delete_marker()) || !dirent.pending_map.empty())) { - // TBD: should we bailout ??? - ldpp_dout(dpp, 1) << __func__ << "::ERR: calling check_disk_state bucket=" - << bucket->get_name() << " entry=" << dirent.key << dendl; - // make sure we're advancing marker - marker = dirent.key; - continue; - } - marker = dirent.key; - ret = ingress_bucket_idx_single_object(disk_arr, bucket, dirent, p_worker_stats); - } - // TBD: advance marker only once here! 
- if (result.is_truncated) { - ldpp_dout(dpp, 15) << __func__ << "::[" << current_shard - << "]result.is_truncated::count=" << obj_count << dendl; - } - else { - // we reached the end of this shard -> move to the next shard - current_shard = move_to_next_bucket_index_shard(dpp, current_shard, num_work_shards, - bucket->get_name(), &marker); - ldpp_dout(dpp, 15) << __func__ << "::move_to_next_bucket_index_shard::count=" - << obj_count << "::new_shard=" << current_shard << dendl; - } - } - ldpp_dout(dpp, 15) << __func__ << "::Finished processing Bucket " - << bucket->get_name() << ", num_shards=" << num_shards - << ", obj_count=" << obj_count << dendl; - return 0; - } - - //--------------------------------------------------------------------------- - int Background::ingress_bucket_objects_single_shard(disk_block_array_t &disk_arr, - const rgw_bucket &bucket_rec, - work_shard_t worker_id, - work_shard_t num_work_shards, - worker_stats_t *p_worker_stats /*IN-OUT*/) - { - unique_ptr bucket; - int ret = driver->load_bucket(dpp, bucket_rec, &bucket, null_yield); - if (unlikely(ret != 0)) { - ldpp_dout(dpp, 1) << __func__ << "::ERROR: driver->load_bucket(): " - << cpp_strerror(-ret) << dendl; - return ret; - } - - const std::string bucket_id = bucket->get_key().get_key(); - RGWBucketInfo bucket_info; - ret = rados->get_bucket_instance_info(bucket_id, bucket_info, - nullptr, nullptr, null_yield, dpp); - if (unlikely(ret < 0)) { - if (ret == -ENOENT) { - // probably a race condition with bucket removal - ldpp_dout(dpp, 10) << __func__ << "::ret == -ENOENT" << dendl; - return 0; - } - ldpp_dout(dpp, 5) << __func__ << "::ERROR: get_bucket_instance_info(), ret=" - << ret << "::" << cpp_strerror(-ret) << dendl; - return ret; - } - const rgw::bucket_index_layout_generation idx_layout = bucket_info.layout.current_index; - librados::IoCtx ioctx; - // objects holding the bucket-listings - std::map oids; - ret = store->svc()->bi_rados->open_bucket_index(dpp, bucket_info, 
std::nullopt, - idx_layout, &ioctx, &oids, nullptr); - if (ret >= 0) { - // process all the shards in this bucket owned by the worker_id - return process_bucket_shards(disk_arr, bucket.get(), oids, ioctx, worker_id, - num_work_shards, p_worker_stats); - } - else { - ldpp_dout(dpp, 1) << __func__ << "::ERROR: open_bucket_index() ret=" - << ret << "::" << cpp_strerror(-ret) << dendl; - return ret; - } - } - - //--------------------------------------------------------------------------- - static void display_table_stat_counters(const DoutPrefixProvider* dpp, - const md5_stats_t *p_stats) - { - uint64_t obj_count_in_shard = (p_stats->big_objs_stat.singleton_count + - p_stats->big_objs_stat.unique_count + - p_stats->big_objs_stat.duplicate_count); - - ldpp_dout(dpp, 10) << "\n>>>>>" << __func__ << "::FINISHED STEP_BUILD_TABLE\n" - << "::total_count=" << obj_count_in_shard - << "::loaded_objects=" << p_stats->loaded_objects - << p_stats->big_objs_stat << dendl; - ldpp_dout(dpp, 10) << __func__ << "::small objs::" - << p_stats->small_objs_stat << dendl; - } - - //--------------------------------------------------------------------------- - int Background::objects_dedup_single_md5_shard(dedup_table_t *p_table, - md5_shard_t md5_shard, - md5_stats_t *p_stats, - work_shard_t num_work_shards) - { - remapper_t remapper(MAX_STORAGE_CLASS_IDX); - // make sure that the standard storage_class is always in the mapper! 
- storage_class_idx_t sc_idx = remapper.remap(RGW_STORAGE_CLASS_STANDARD, dpp, - &p_stats->failed_map_overflow); - ceph_assert(sc_idx == 0); - uint32_t slab_count_arr[num_work_shards]; - // first load all etags to hashtable to find dedups - // the entries come from bucket-index and got minimal info (etag, size) - for (work_shard_t worker_id = 0; worker_id < num_work_shards; worker_id++) { - process_all_slabs(p_table, STEP_BUILD_TABLE, md5_shard, worker_id, - slab_count_arr+worker_id, p_stats, nullptr, &remapper); - if (unlikely(d_ctl.should_stop())) { - ldpp_dout(dpp, 5) << __func__ << "::STEP_BUILD_TABLE::STOPPED\n" << dendl; - return -ECANCELED; - } - } - p_table->count_duplicates(&p_stats->small_objs_stat, &p_stats->big_objs_stat, - &p_stats->dup_head_bytes_estimate); - display_table_stat_counters(dpp, p_stats); - - ldpp_dout(dpp, 10) << __func__ << "::MD5 Loop::" << d_ctl.dedup_type << dendl; - if (d_ctl.dedup_type != dedup_req_type_t::DEDUP_TYPE_FULL) { - for (work_shard_t worker_id = 0; worker_id < num_work_shards; worker_id++) { - remove_slabs(worker_id, md5_shard, slab_count_arr[worker_id]); - } - return 0; - } - -#ifndef FULL_DEDUP_SUPPORT - // we don't support full dedup with this release - return 0; -#endif - - p_table->remove_singletons_and_redistribute_keys(); - // The SLABs holds minimal data set brought from the bucket-index - // Objects participating in DEDUP need to read attributes from the Head-Object - // TBD - find a better name than num_work_shards for the combined output - { - disk_block_t arr[DISK_BLOCK_COUNT]; - worker_stats_t wstat; - disk_block_seq_t disk_block_seq(dpp, arr, num_work_shards, md5_shard, &wstat); - for (work_shard_t worker_id = 0; worker_id < num_work_shards; worker_id++) { - process_all_slabs(p_table, STEP_READ_ATTRIBUTES, md5_shard, worker_id, - slab_count_arr+worker_id, p_stats, &disk_block_seq, &remapper); - if (unlikely(d_ctl.should_stop())) { - ldpp_dout(dpp, 5) << __func__ << "::STEP_READ_ATTRIBUTES::STOPPED\n" << 
dendl; - return -ECANCELED; - } - // we finished processing output SLAB from @worker_id -> remove them - remove_slabs(worker_id, md5_shard, slab_count_arr[worker_id]); - } - disk_block_seq.flush_disk_records(d_dedup_cluster_ioctx); - } - - ldpp_dout(dpp, 10) << __func__ << "::STEP_REMOVE_DUPLICATES::started..." << dendl; - uint32_t slab_count = 0; - process_all_slabs(p_table, STEP_REMOVE_DUPLICATES, md5_shard, num_work_shards, - &slab_count, p_stats, nullptr, &remapper); - if (unlikely(d_ctl.should_stop())) { - ldpp_dout(dpp, 5) << __func__ << "::STEP_REMOVE_DUPLICATES::STOPPED\n" << dendl; - return -ECANCELED; - } - ldpp_dout(dpp, 10) << __func__ << "::STEP_REMOVE_DUPLICATES::finished..." << dendl; - // remove the special SLAB holding aggragted data - remove_slabs(num_work_shards, md5_shard, slab_count); - return 0; - } - - //--------------------------------------------------------------------------- - int Background::read_bucket_stats(const rgw_bucket &bucket_rec, - uint64_t *p_num_obj, - uint64_t *p_size) - { - unique_ptr bucket; - int ret = driver->load_bucket(dpp, bucket_rec, &bucket, null_yield); - if (unlikely(ret != 0)) { - ldpp_dout(dpp, 1) << __func__ << "::ERROR: driver->load_bucket(): " - << cpp_strerror(-ret) << dendl; - return ret; - } - - const auto& index = bucket->get_info().get_current_index(); - if (is_layout_indexless(index)) { - ldpp_dout(dpp, 1) << __func__ - << "::ERR, indexless buckets do not maintain stats; bucket=" - << bucket->get_name() << dendl; - return -EINVAL; - } - - std::map stats; - std::string bucket_ver, master_ver; - std::string max_marker; - ret = bucket->read_stats(dpp, null_yield, index, RGW_NO_SHARD, &bucket_ver, - &master_ver, stats, &max_marker); - if (ret < 0) { - ldpp_dout(dpp, 1) << __func__ << "::ERR getting bucket stats bucket=" - << bucket->get_name() << " ret=" << ret << dendl; - return ret; - } - - for (auto itr = stats.begin(); itr != stats.end(); ++itr) { - RGWStorageStats& s = itr->second; - ldpp_dout(dpp, 20) 
<< __func__ << "::" << bucket->get_name() << "::" - << to_string(itr->first) << "::num_obj=" << s.num_objects - << "::size=" << s.size << dendl; - *p_num_obj += s.num_objects; - *p_size += s.size; - } - - return 0; - } - - //--------------------------------------------------------------------------- - int Background::collect_all_buckets_stats() - { - int ret = 0; - std::string section("bucket.instance"); - std::string marker; - void *handle = nullptr; - ret = driver->meta_list_keys_init(dpp, section, marker, &handle); - if (ret < 0) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: Failed meta_list_keys_init: " - << cpp_strerror(-ret) << dendl; - return ret; - } - - d_all_buckets_obj_count = 0; - d_all_buckets_obj_size = 0; - - bool has_more = true; - while (has_more) { - std::list entries; - constexpr int max_keys = 1000; - ret = driver->meta_list_keys_next(dpp, handle, max_keys, entries, &has_more); - if (ret == 0) { - for (auto& entry : entries) { - ldpp_dout(dpp, 20) <<__func__ << "::bucket_name=" << entry << dendl; - rgw_bucket bucket; - ret = rgw_bucket_parse_bucket_key(cct, entry, &bucket, nullptr); - if (unlikely(ret < 0)) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: Failed rgw_bucket_parse_bucket_key: " - << cpp_strerror(-ret) << dendl; - goto err; - } - ldpp_dout(dpp, 20) <<__func__ << "::bucket=" << bucket << dendl; - ret = read_bucket_stats(bucket, &d_all_buckets_obj_count, - &d_all_buckets_obj_size); - if (unlikely(ret != 0)) { - goto err; - } - } - driver->meta_list_keys_complete(handle); - } - else { - ldpp_dout(dpp, 1) << __func__ << "::ERR: failed driver->meta_list_keys_next()" << dendl; - goto err; - } - } - ldpp_dout(dpp, 10) <<__func__ - << "::all_buckets_obj_count=" << d_all_buckets_obj_count - << "::all_buckets_obj_size=" << d_all_buckets_obj_size - << dendl; - return 0; - - err: - ldpp_dout(dpp, 1) << __func__ << "::error handler" << dendl; - // reset counters to mark that we don't have the info - d_all_buckets_obj_count = 0; - 
d_all_buckets_obj_size = 0; - if (handle) { - driver->meta_list_keys_complete(handle); - } - return ret; - } - - //--------------------------------------------------------------------------- - int Background::objects_ingress_single_work_shard(work_shard_t worker_id, - work_shard_t num_work_shards, - md5_shard_t num_md5_shards, - worker_stats_t *p_worker_stats, - uint8_t *raw_mem, - uint64_t raw_mem_size) - { - int ret = 0; - std::string section("bucket.instance"); - std::string marker; - void *handle = nullptr; - ret = driver->meta_list_keys_init(dpp, section, marker, &handle); - if (ret < 0) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: Failed meta_list_keys_init: " - << cpp_strerror(-ret) << dendl; - return ret; - } - disk_block_array_t disk_arr(dpp, raw_mem, raw_mem_size, worker_id, - p_worker_stats, num_md5_shards); - bool has_more = true; - // iterate over all buckets - while (ret == 0 && has_more) { - std::list entries; - constexpr int max_keys = 1000; - ret = driver->meta_list_keys_next(dpp, handle, max_keys, entries, &has_more); - if (ret == 0) { - ldpp_dout(dpp, 20) <<__func__ << "::entries.size()=" << entries.size() << dendl; - for (auto& entry : entries) { - ldpp_dout(dpp, 20) <<__func__ << "::bucket_name=" << entry << dendl; - rgw_bucket bucket; - ret = rgw_bucket_parse_bucket_key(cct, entry, &bucket, nullptr); - if (unlikely(ret < 0)) { - // bad bucket entry, skip to the next one - ldpp_dout(dpp, 1) << __func__ << "::ERR: Failed rgw_bucket_parse_bucket_key: " - << cpp_strerror(-ret) << dendl; - continue; - } - ldpp_dout(dpp, 20) <<__func__ << "::bucket=" << bucket << dendl; - ret = ingress_bucket_objects_single_shard(disk_arr, bucket, worker_id, - num_work_shards, p_worker_stats); - if (unlikely(ret != 0)) { - if (d_ctl.should_stop()) { - driver->meta_list_keys_complete(handle); - return -ECANCELED; - } - ldpp_dout(dpp, 1) << __func__ << "::Failed ingress_bucket_objects_single_shard()" << dendl; - // skip bad bucket and move on to the next one - 
continue; - } - } - driver->meta_list_keys_complete(handle); - } - else { - ldpp_dout(dpp, 1) << __func__ << "::failed driver->meta_list_keys_next()" << dendl; - driver->meta_list_keys_complete(handle); - // TBD: what can we do here? - break; - } - } - ldpp_dout(dpp, 20) <<__func__ << "::flush_output_buffers() worker_id=" - << worker_id << dendl; - disk_arr.flush_output_buffers(dpp, d_dedup_cluster_ioctx); - return ret; - } - - //--------------------------------------------------------------------------- - int Background::remove_slabs(unsigned worker_id, unsigned md5_shard, uint32_t slab_count) - { - unsigned failure_count = 0; - - for (uint32_t slab_id = 0; slab_id < slab_count; slab_id++) { - uint32_t seq_number = disk_block_id_t::slab_id_to_seq_num(slab_id); - disk_block_id_t block_id(worker_id, seq_number); - std::string oid(block_id.get_slab_name(md5_shard)); - ldpp_dout(dpp, 20) << __func__ << "::calling ioctx->remove(" << oid << ")" << dendl; - int ret = d_dedup_cluster_ioctx.remove(oid); - if (ret != 0) { - ldpp_dout(dpp, 0) << __func__ << "::ERR Failed ioctx->remove(" << oid << ")" << dendl; - failure_count++; - } - } - - return failure_count; - } - - //--------------------------------------------------------------------------- - int Background::f_ingress_work_shard(unsigned worker_id, - uint8_t *raw_mem, - uint64_t raw_mem_size, - work_shard_t num_work_shards, - md5_shard_t num_md5_shards) - { - ldpp_dout(dpp, 20) << __func__ << "::worker_id=" << worker_id << dendl; - utime_t start_time = ceph_clock_now(); - worker_stats_t worker_stats; - int ret = objects_ingress_single_work_shard(worker_id, num_work_shards, num_md5_shards, - &worker_stats,raw_mem, raw_mem_size); - if (ret == 0) { - worker_stats.duration = ceph_clock_now() - start_time; - d_cluster.mark_work_shard_token_completed(store, worker_id, &worker_stats); - ldpp_dout(dpp, 10) << "stat counters [worker]:\n" << worker_stats << dendl; - ldpp_dout(dpp, 10) << "Shard Process Duration = " - << 
worker_stats.duration << dendl; - } - //ldpp_dout(dpp, 0) << __func__ << "::sleep for 2 seconds\n" << dendl; - //std::this_thread::sleep_for(std::chrono::seconds(2)); - return ret; - } - - //--------------------------------------------------------------------------- - int Background::f_dedup_md5_shard(unsigned md5_shard, - uint8_t *raw_mem, - uint64_t raw_mem_size, - work_shard_t num_work_shards, - md5_shard_t num_md5_shards) - { - utime_t start_time = ceph_clock_now(); - md5_stats_t md5_stats; - //DEDUP_DYN_ALLOC - dedup_table_t table(dpp, d_head_object_size, raw_mem, raw_mem_size); - int ret = objects_dedup_single_md5_shard(&table, md5_shard, &md5_stats, num_work_shards); - if (ret == 0) { - md5_stats.duration = ceph_clock_now() - start_time; - d_cluster.mark_md5_shard_token_completed(store, md5_shard, &md5_stats); - ldpp_dout(dpp, 10) << "stat counters [md5]:\n" << md5_stats << dendl; - ldpp_dout(dpp, 10) << "Shard Process Duration = " - << md5_stats.duration << dendl; - } - return ret; - } - - //--------------------------------------------------------------------------- - int Background::process_all_shards(bool ingress_work_shards, - int (Background::*func)(unsigned, uint8_t*, uint64_t, work_shard_t, md5_shard_t), - uint8_t *raw_mem, - uint64_t raw_mem_size, - work_shard_t num_work_shards, - md5_shard_t num_md5_shards) - { - while (true) { - d_heart_beat_last_update = ceph_clock_now(); - uint16_t shard_id; - if (ingress_work_shards) { - shard_id = d_cluster.get_next_work_shard_token(store, num_work_shards); - } - else { - shard_id = d_cluster.get_next_md5_shard_token(store, num_md5_shards); - } - - // start with a common error handler - if (shard_id != NULL_SHARD) { - ldpp_dout(dpp, 10) << __func__ << "::Got shard_id=" << shard_id << dendl; - int ret = (this->*func)(shard_id, raw_mem, raw_mem_size, num_work_shards, - num_md5_shards); - if (unlikely(ret != 0)) { - if (d_ctl.should_stop()) { - ldpp_dout(dpp, 5) << __func__ << "::stop execution" << dendl; - return 
-ECANCELED; - } - else { - ldpp_dout(dpp, 5) << __func__ << "::Skip shard #" << shard_id << dendl; - } - } - } - else { - ldpp_dout(dpp, 10) << __func__ << "::finished processing all shards" < vec; - vec.push_back("default.rgw.buckets.data"); - map stats; - auto rados_handle = rados->get_rados_handle(); - int ret = rados_handle->get_pool_stats(vec, stats); - if (ret < 0) { - ldpp_dout(dpp, 0) << __func__ << ":ERROR: fetching pool stats: " - << cpp_strerror(-ret) << dendl; - return ret; - } - - for (auto i = stats.begin(); i != stats.end(); ++i) { - const char *pool_name = i->first.c_str(); - librados::pool_stat_t& s = i->second; - // TBD: add support for EC - // We need to find the user byte size without the added protection - double replica_level = (double)s.num_object_copies / s.num_objects; - *p_num_objects = s.num_objects; - *p_num_objects_bytes = s.num_bytes / replica_level; - ldpp_dout(dpp, 10) <<__func__ << "::" << pool_name << "::num_objects=" - << s.num_objects << "::num_copies=" << s.num_object_copies - << "::num_bytes=" << s.num_bytes << "/" << *p_num_objects_bytes << dendl; - } - return 0; - } - - //------------------------------------------------------------------------------- - // 32B per object-entry in the hashtable - // 2MB per shard-buffer - //=============||==============||=========||===================================|| - // Obj Count || shard count || memory || calculation || - // ------------||--------------||---------||---------------------------------- || - // 1M || 4 || 8MB || 8MB/32 = 0.25M * 4 = 1M || - // 4M || 8 || 16MB || 16MB/32 = 0.50M * 8 = 4M || - //------------------------------------------------------------------------------- - // 16M || 16 || 32MB || 32MB/32 = 1.00M * 16 = 16M || - //------------------------------------------------------------------------------- - // 64M || 32 || 64MB || 64MB/32 = 2.00M * 32 = 64M || - // 256M || 64 || 128MB || 128MB/32 = 4.00M * 64 = 256M || - // 1024M( 1G) || 128 || 256MB || 256MB/32 = 8.00M * 
128 = 1024M || - // 4096M( 4G) || 256 || 512MB || 512MB/32 = 16M.00 * 256 = 4096M || - // 16384M(16G) || 512 || 1024MB || 1024MB/32 = 32M.00 * 512 = 16384M || - //-------------||--------------||---------||-----------------------------------|| - static md5_shard_t calc_num_md5_shards(uint64_t obj_count) - { - // create headroom by allocating space for a 10% bigger system - obj_count = obj_count + (obj_count/10); - - uint64_t M = 1024 * 1024; - if (obj_count < 1*M) { - // less than 1M objects -> use 4 shards (8MB) - return 4; - } - else if (obj_count < 4*M) { - // less than 4M objects -> use 8 shards (16MB) - return 8; - } - else if (obj_count < 16*M) { - // less than 16M objects -> use 16 shards (32MB) - return 16; - } - else if (obj_count < 64*M) { - // less than 64M objects -> use 32 shards (64MB) - return 32; - } - else if (obj_count < 256*M) { - // less than 256M objects -> use 64 shards (128MB) - return 64; - } - else if (obj_count < 1024*M) { - // less than 1024M objects -> use 128 shards (256MB) - return 128; - } - else if (obj_count < 4*1024*M) { - // less than 4096M objects -> use 256 shards (512MB) - return 256; - } - else { - return 512; - } - } - - //--------------------------------------------------------------------------- - int Background::setup(dedup_epoch_t *p_epoch) - { - int ret = collect_all_buckets_stats(); - if (unlikely(ret != 0)) { - return ret; - } - - md5_shard_t num_md5_shards = calc_num_md5_shards(d_all_buckets_obj_count); - num_md5_shards = std::min(num_md5_shards, MAX_MD5_SHARD); - num_md5_shards = std::max(num_md5_shards, MIN_MD5_SHARD); - work_shard_t num_work_shards = num_md5_shards; - num_work_shards = std::min(num_work_shards, MAX_WORK_SHARD); - - ldpp_dout(dpp, 5) << __func__ << "::obj_count=" <num_work_shards > MAX_WORK_SHARD)) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: p_epoch->num_work_shards=" - << p_epoch->num_work_shards - << " is larger than MAX_WORK_SHARD (" - << MAX_WORK_SHARD << ")" << dendl; - return -EOVERFLOW; - } - 
if (unlikely(p_epoch->num_md5_shards > MAX_MD5_SHARD)) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: p_epoch->num_md5_shards=" - << p_epoch->num_md5_shards - << " is larger than MAX_MD5_SHARD (" - << MAX_MD5_SHARD << ")" << dendl; - return -EOVERFLOW; - } - - ldpp_dout(dpp, 10) <<__func__ << "::" << *p_epoch << dendl; - d_ctl.dedup_type = p_epoch->dedup_type; -#ifdef FULL_DEDUP_SUPPORT - ceph_assert(d_ctl.dedup_type == dedup_req_type_t::DEDUP_TYPE_FULL || - d_ctl.dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE); -#else - ceph_assert(d_ctl.dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE); -#endif - ldpp_dout(dpp, 10) << __func__ << "::" << d_ctl.dedup_type << dendl; - - return 0; - } - - //--------------------------------------------------------------------------- - int Background::watch_reload(const DoutPrefixProvider* dpp) - { - return cluster::watch_reload(store, dpp, &d_watch_handle, &d_watcher_ctx); - } - - //--------------------------------------------------------------------------- - int Background::unwatch_reload(const DoutPrefixProvider* dpp) - { - if (d_watch_handle == 0) { - // nothing to unwatch - ldpp_dout(dpp, 1) << "dedup_bg::unwatch_reload(): nothing to watch" - << dendl; - return 0; - } - - ldpp_dout(dpp, 5) << "dedup_bg::unwatch_reload(): watch_handle=" - << d_watch_handle << dendl; - - int ret = cluster::unwatch_reload(store, dpp, d_watch_handle); - if (ret == 0) { - ldpp_dout(dpp, 5) << "dedup_bg::unwatch_reload():Stopped watching " - << "::d_watch_handle=" << d_watch_handle << dendl; - d_watch_handle = 0; - } - return ret; - } - - //--------------------------------------------------------------------------- - void Background::handle_notify(uint64_t notify_id, uint64_t cookie, bufferlist &bl) - { - int ret = 0; - int32_t urgent_msg = URGENT_MSG_NONE; - try { - auto bl_iter = bl.cbegin(); - ceph::decode(urgent_msg, bl_iter); - } catch (buffer::error& err) { - ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad urgent_msg" << dendl; - ret = 
-EINVAL; - } - ldpp_dout(dpp, 5) << __func__ << "::-->" << get_urgent_msg_names(urgent_msg) << dendl; - - // use lock to prevent concurrent pause/resume requests - std::unique_lock cond_lock(d_cond_mutex); // [------>open lock block - if (unlikely(d_ctl.local_urgent_req())) { - // can't operate when the system is paused/shutdown - cond_lock.unlock(); // close lock block------>] - ldpp_dout(dpp, 5) << __func__ - << "::system is paused/shutdown -> cancel notification" << dendl; - cluster::ack_notify(store, dpp, &d_ctl, notify_id, cookie, -EBUSY); - return; - } - - switch(urgent_msg) { - case URGENT_MSG_ABORT: - if (d_ctl.dedup_exec) { - d_ctl.remote_abort_req = true; - d_cond.notify_all(); - d_cond.wait(cond_lock, [this]{return d_ctl.remote_aborted || d_ctl.local_urgent_req();}); - d_ctl.remote_aborted ? ret = 0 : ret = -EBUSY; - } - else { - ldpp_dout(dpp, 5) << __func__ << "::inactive dedup->nothing to do" << dendl; - } - break; - case URGENT_MSG_RESTART: - if (!d_ctl.dedup_exec) { - d_ctl.remote_restart_req = true; - d_cond.notify_all(); - } - else { - ldpp_dout(dpp, 5) << __func__ << "::\ncan't restart active dedup\n"<< dendl; - ret = -EEXIST; - } - break; - case URGENT_MSG_PASUE: - if (d_ctl.dedup_exec && !d_ctl.remote_paused) { - d_ctl.remote_pause_req = true; - d_cond.notify_all(); - d_cond.wait(cond_lock, [this]{return d_ctl.remote_paused || d_ctl.local_urgent_req();}); - d_ctl.remote_paused ? 
ret = 0 : ret = -EBUSY; - } - else { - if (d_ctl.remote_paused) { - ldpp_dout(dpp, 5) << __func__ << "::dedup is already paused" << dendl; - } - else { - ldpp_dout(dpp, 5) << __func__ << "::inactive dedup->nothing to do" << dendl; - } - } - break; - case URGENT_MSG_RESUME: - if (d_ctl.remote_pause_req || d_ctl.remote_paused) { - d_ctl.remote_pause_req = false; - d_ctl.remote_paused = false; - d_cond.notify_all(); - } - else { - ldpp_dout(dpp, 5) << __func__ << "::dedup is not paused->nothing to do" << dendl; - } - break; - default: - ldpp_dout(dpp, 1) << __func__ << "::unexpected urgent_msg: " - << get_urgent_msg_names(urgent_msg) << dendl; - ret = -EINVAL; - } - - cond_lock.unlock(); // close lock block------>] - cluster::ack_notify(store, dpp, &d_ctl, notify_id, cookie, ret); - } - - //--------------------------------------------------------------------------- - void Background::start() - { - const DoutPrefixProvider* const dpp = &dp; - ldpp_dout(dpp, 10) << __FILE__ << "::" <<__func__ << dendl; - { - std::unique_lock pause_lock(d_cond_mutex); - if (d_ctl.started) { - // start the thread only once - ldpp_dout(dpp, 1) << "dedup_bg already started" << dendl; - return; - } - d_ctl.started = true; - } - d_runner = std::thread(&Background::run, this); - } - - //------------------------- -------------------------------------------------- - void Background::shutdown() - { - ldpp_dout(dpp, 5) <<__func__ << "::dedup_bg shutdown()" << dendl; - std::unique_lock cond_lock(d_cond_mutex); - bool nested_call = false; - if (d_ctl.shutdown_req) { - // should never happen! - ldpp_dout(dpp, 1) <<__func__ << "dedup_bg nested call" << dendl; - nested_call = true; - } - d_ctl.shutdown_req = true; - d_cond.notify_all(); - ldpp_dout(dpp, 1) <<__func__ << "dedup_bg shutdown waiting..." 
<< dendl; - d_cond.wait(cond_lock, [this]{return d_ctl.shutdown_done;}); - //cond_lock.unlock(); - - if (nested_call) { - ldpp_dout(dpp, 1) <<__func__ << "::nested call:: repeat notify" << dendl; - d_cond.notify_all(); - } - - if (d_runner.joinable()) { - ldpp_dout(dpp, 5) <<__func__ << "::dedup_bg wait join()" << dendl; - d_runner.join(); - ldpp_dout(dpp, 5) <<__func__ << "::dedup_bg finished join()" << dendl; - } - else { - ldpp_dout(dpp, 5) <<__func__ << "::dedup_bg not joinable()" << dendl; - } - - d_ctl.reset(); - } - - //--------------------------------------------------------------------------- - void Background::pause() - { - display_ioctx_state(dpp, d_dedup_cluster_ioctx, "dedup_bg->pause() request"); - std::unique_lock cond_lock(d_cond_mutex); - - if (d_ctl.local_paused || d_ctl.shutdown_done) { - cond_lock.unlock(); - ldpp_dout(dpp, 1) << __FILE__ << "::" <<__func__ - << "::dedup_bg is already paused/stopped" << dendl; - return; - } - - bool nested_call = false; - if (d_ctl.local_pause_req) { - // should never happen! - ldpp_dout(dpp, 1) <<__func__ << "::nested call" << dendl; - nested_call = true; - } - d_ctl.local_pause_req = true; - d_cond.notify_all(); - d_cond.wait(cond_lock, [this]{return d_ctl.local_paused||d_ctl.shutdown_done;}); - if (nested_call) { - ldpp_dout(dpp, 1) << "dedup_bg::nested call:: repeat notify" << dendl; - d_cond.notify_all(); - } - - // destory open watch request and pool handle before pause() is completed - unwatch_reload(dpp); - d_dedup_cluster_ioctx.close(); - ldpp_dout(dpp, 5) << "dedup_bg paused" << dendl; - } - - //--------------------------------------------------------------------------- - void Background::resume(rgw::sal::Driver* _driver) - { - ldpp_dout(dpp, 5) << "dedup_bg->resume()" << dendl; - // use lock to prevent concurrent pause/resume requests - std::unique_lock cond_lock(d_cond_mutex); - - if (!d_ctl.local_paused) { - cond_lock.unlock(); - ldpp_dout(dpp, 5) << "dedup_bg::resume thread is not paused!" 
<< dendl; - if (_driver != driver) { - ldpp_dout(dpp, 1) << "dedup_bg attempt to change driver on an active system was refused" << dendl; - } - return; - } - - driver = _driver; - // can pool change its uid between pause/resume ??? - int ret = init_rados_access_handles(false); - if (ret != 0) { - derr << "dedup_bg::resume() failed init_rados_access_handles() ret=" - << ret << "::" << cpp_strerror(-ret) << dendl; - throw std::runtime_error("Failed init_rados_access_handles()"); - } - display_ioctx_state(dpp, d_dedup_cluster_ioctx, "dedup_bg->resume() done"); - // create new watch request using the new pool handle - watch_reload(dpp); - d_ctl.local_pause_req = false; - d_ctl.local_paused = false; - - // wake up threads blocked after seeing pause state - d_cond.notify_all(); - ldpp_dout(dpp, 5) << "dedup_bg was resumed" << dendl; - } - - //--------------------------------------------------------------------------- - void Background::handle_pause_req(const char *caller) - { - ldpp_dout(dpp, 5) << __func__ << "::caller=" << caller << dendl; - ldpp_dout(dpp, 5) << __func__ << "::" << d_ctl << dendl; - while (d_ctl.local_pause_req || d_ctl.local_paused || d_ctl.remote_pause_req || d_ctl.remote_paused) { - std::unique_lock cond_lock(d_cond_mutex); - if (d_ctl.should_stop()) { - ldpp_dout(dpp, 5) << __func__ << "::should_stop!" 
<< dendl; - return; - } - - if (d_ctl.local_pause_req) { - d_ctl.local_pause_req = false; - d_ctl.local_paused = true; - } - - if (d_ctl.remote_pause_req) { - d_ctl.remote_pause_req = false; - d_ctl.remote_paused = true; - } - - d_cond.notify_all(); - - if (d_ctl.local_paused) { - ldpp_dout(dpp, 10) << __func__ << "::wait on d_ctl.local_paused" << dendl; - d_cond.wait(cond_lock, [this]{return !d_ctl.local_paused || d_ctl.should_stop() ;}); - } - - if (d_ctl.remote_paused) { - ldpp_dout(dpp, 10) << __func__ << "::wait on d_ctl.remote_paused" << dendl; - d_cond.wait(cond_lock, [this]{return !d_ctl.remote_paused || d_ctl.should_stop() || d_ctl.local_pause_req;}); - } - } // while loop - - ldpp_dout(dpp, 5) << "Dedup background thread resumed!" << dendl; - } - - //--------------------------------------------------------------------------- - void Background::work_shards_barrier(work_shard_t num_work_shards) - { - // Wait for other worker to finish ingress step - // We can move to the next step even if some token are in failed state - const unsigned MAX_WAIT_SEC = 120; // wait 2 minutes for failing members - unsigned ttl = 3; - unsigned time_elapsed = 0; - - while (true) { - int ret = d_cluster.all_work_shard_tokens_completed(store, num_work_shards); - // we start incrementing time_elapsed only after all valid tokens finish - if (ret == 0 || (time_elapsed > MAX_WAIT_SEC) ) { - break; - } - - ldpp_dout(dpp, 10) << __func__ << "::Wait for object ingress completion, ttl=" - << ttl << " seconds" << dendl; - std::unique_lock cond_lock(d_cond_mutex); - d_cond.wait_for(cond_lock, std::chrono::seconds(ttl), - [this]{return d_ctl.should_stop() || d_ctl.should_pause();}); - if (unlikely(d_ctl.should_pause())) { - handle_pause_req(__func__); - } - if (unlikely(d_ctl.should_stop())) { - return; - } - - if (ret != -EAGAIN) { - // All incomplete tokens are corrupted or in time out state - // Give them an extra 120 seconds just in case ... 
- time_elapsed += ttl; - } - // else there are still good tokens in process, wait for them - } - - ldpp_dout(dpp, 10) << "\n\n==Object Ingress step was completed on all shards==\n" - << dendl; - if (unlikely(d_ctl.should_pause())) { - handle_pause_req(__func__); - } - } - - //--------------------------------------------------------------------------- - static bool all_md5_shards_completed(cluster *p_cluster, - rgw::sal::RadosStore *store, - md5_shard_t num_md5_shards) - { - return (p_cluster->all_md5_shard_tokens_completed(store, num_md5_shards) == 0); - } - - //--------------------------------------------------------------------------- - void Background::md5_shards_barrier(md5_shard_t num_md5_shards) - { - // Wait for others to finish step - unsigned ttl = 3; - // require that everything completed successfully before deleting the pool - while (!all_md5_shards_completed(&d_cluster, store, num_md5_shards)) { - ldpp_dout(dpp, 10) << __func__ << "::Wait for md5 completion, ttl=" - << ttl << " seconds" << dendl; - std::unique_lock cond_lock(d_cond_mutex); - d_cond.wait_for(cond_lock, std::chrono::seconds(ttl), - [this]{return d_ctl.should_stop() || d_ctl.should_pause();}); - if (unlikely(d_ctl.should_pause())) { - handle_pause_req(__func__); - } - if (unlikely(d_ctl.should_stop())) { - return; - } - } - - ldpp_dout(dpp, 10) << "\n\n==MD5 processing was completed on all shards!==\n" - << dendl; - if (unlikely(d_ctl.should_pause())) { - handle_pause_req(__func__); - } - } - - //--------------------------------------------------------------------------- - void Background::run() - { - const auto rc = ceph_pthread_setname("dedup_bg"); - ldpp_dout(dpp, 5) << __func__ << "ceph_pthread_setname() ret=" << rc << dendl; - - // 256x8KB=2MB - const uint64_t PER_SHARD_BUFFER_SIZE = DISK_BLOCK_COUNT *sizeof(disk_block_t); - ldpp_dout(dpp, 20) <<__func__ << "::dedup::main loop" << dendl; - - while (!d_ctl.shutdown_req) { - if (unlikely(d_ctl.should_pause())) { - 
handle_pause_req(__func__); - if (unlikely(d_ctl.should_stop())) { - ldpp_dout(dpp, 5) <<__func__ << "::stop req after a pause" << dendl; - d_ctl.dedup_exec = false; - } - } - - if (d_ctl.dedup_exec) { - dedup_epoch_t epoch; - if (setup(&epoch) != 0) { - ldpp_dout(dpp, 1) << __func__ << "::failed setup()" << dendl; - return; - } - const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool; - int64_t pool_id = rados_handle->pool_lookup(dedup_pool.name.c_str()); - if (pool_id < 0) { - ldpp_dout(dpp, 1) << __func__ << "::bad pool_id" << dendl; - return; - } - work_shard_t num_work_shards = epoch.num_work_shards; - md5_shard_t num_md5_shards = epoch.num_md5_shards; - const uint64_t RAW_MEM_SIZE = PER_SHARD_BUFFER_SIZE * num_md5_shards; - ldpp_dout(dpp, 5) <<__func__ << "::RAW_MEM_SIZE=" << RAW_MEM_SIZE - << "::num_work_shards=" << num_work_shards - << "::num_md5_shards=" << num_md5_shards << dendl; - // DEDUP_DYN_ALLOC - auto raw_mem = std::make_unique(RAW_MEM_SIZE); - if (raw_mem == nullptr) { - ldpp_dout(dpp, 1) << "failed slab memory allocation - size=" << RAW_MEM_SIZE << dendl; - return; - } - - process_all_shards(true, &Background::f_ingress_work_shard, raw_mem.get(), - RAW_MEM_SIZE, num_work_shards, num_md5_shards); - if (!d_ctl.should_stop()) { - // Wait for all other workers to finish ingress step - work_shards_barrier(num_work_shards); - if (!d_ctl.should_stop()) { - process_all_shards(false, &Background::f_dedup_md5_shard, raw_mem.get(), - RAW_MEM_SIZE, num_work_shards, num_md5_shards); - // Wait for all other md5 shards to finish - md5_shards_barrier(num_md5_shards); - safe_pool_delete(store, dpp, pool_id); - } - else { - ldpp_dout(dpp, 5) <<__func__ << "::stop req from barrier" << dendl; - } - } - else { - ldpp_dout(dpp, 5) <<__func__ << "::stop req from ingress_work_shard" << dendl; - } - } // dedup_exec - - std::unique_lock cond_lock(d_cond_mutex); - d_ctl.dedup_exec = false; - if (d_ctl.remote_abort_req) { - d_ctl.remote_aborted = 
true; - - d_ctl.remote_abort_req = false; - d_ctl.remote_paused = false; - d_cond.notify_all(); - ldpp_dout(dpp, 5) << __func__ << "::Dedup was aborted on a remote req" << dendl; - } - d_cond.wait(cond_lock, [this]{return d_ctl.remote_restart_req || d_ctl.should_stop() || d_ctl.should_pause();}); - if (!d_ctl.should_stop() && !d_ctl.should_pause()) { - // TBD: should we release lock here ??? - if (d_cluster.can_start_new_scan(store)) { - d_ctl.dedup_exec = true; - d_ctl.remote_aborted = false; - d_ctl.remote_paused = false; - d_ctl.remote_restart_req = false; - d_cond.notify_all(); - } - }else if (d_ctl.should_stop()) { - ldpp_dout(dpp, 5) << "main loop::should_stop::" << d_ctl << dendl; - } - else { - ldpp_dout(dpp, 5) << "main loop::should_pause::" << d_ctl << dendl; - } - } - d_ctl.shutdown_done = true; - d_cond.notify_all(); - // shutdown - ldpp_dout(dpp, 5) << __func__ << "::Dedup background thread stopped" << dendl; - } - -}; //namespace rgw::dedup diff --git a/src/rgw/rgw_dedup.h b/src/rgw/rgw_dedup.h deleted file mode 100644 index 48dafe38cb1e..000000000000 --- a/src/rgw/rgw_dedup.h +++ /dev/null @@ -1,250 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Author: Gabriel BenHanokh - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#pragma once -#include "common/dout.h" -#include "rgw_common.h" -#include "rgw_dedup_utils.h" -#include "rgw_dedup_table.h" -#include "rgw_dedup_cluster.h" -#include "rgw_realm_reloader.h" -#include -#include -#include -#include -#include - -namespace rgw::dedup { - struct dedup_epoch_t; - struct control_t { - control_t() { - reset(); - } - void reset(); - inline bool local_urgent_req() const { - return (shutdown_req || local_pause_req); - } - inline bool should_stop() const { - return (shutdown_req || remote_abort_req); - } - inline bool should_pause() const { - return (local_pause_req || remote_pause_req); - } - - // allow to start/pasue/resume/stop execution - dedup_req_type_t dedup_type = dedup_req_type_t::DEDUP_TYPE_NONE; - bool started = false; - bool dedup_exec = false; - bool shutdown_req = false; - bool shutdown_done = false; - bool local_pause_req = false; - bool local_paused = false; - bool remote_abort_req = false; - bool remote_aborted = false; - bool remote_pause_req = false; - bool remote_paused = false; - bool remote_restart_req = false; - }; - std::ostream& operator<<(std::ostream &out, const control_t &ctl); - void encode(const control_t& ctl, ceph::bufferlist& bl); - void decode(control_t& ctl, ceph::bufferlist::const_iterator& bl); - class remapper_t; - class disk_block_seq_t; - struct disk_record_t; - struct key_t; - //Interval between each execution of the script is set to 5 seconds - static inline constexpr int INIT_EXECUTE_INTERVAL = 5; - class Background : public RGWRealmReloader::Pauser { - class DedupWatcher : public librados::WatchCtx2 { - Background* const parent; - public: - DedupWatcher(Background* _parent) : parent(_parent) {} - ~DedupWatcher() override = default; - void handle_notify(uint64_t notify_id, uint64_t cookie, - uint64_t notifier_id, bufferlist& bl) override; - void handle_error(uint64_t cookie, int err) override; - }; - - public: - Background(rgw::sal::Driver* _driver, CephContext* _cct); - int 
watch_reload(const DoutPrefixProvider* dpp); - int unwatch_reload(const DoutPrefixProvider* dpp); - void handle_notify(uint64_t notify_id, uint64_t cookie, bufferlist &bl); - void start(); - void shutdown(); - void pause() override; - void resume(rgw::sal::Driver* _driver) override; - - private: - enum dedup_step_t { - STEP_NONE, - STEP_BUCKET_INDEX_INGRESS, - STEP_BUILD_TABLE, - STEP_READ_ATTRIBUTES, - STEP_REMOVE_DUPLICATES - }; - - void run(); - int setup(struct dedup_epoch_t*); - void work_shards_barrier(work_shard_t num_work_shards); - void md5_shards_barrier(md5_shard_t num_md5_shards); - void handle_pause_req(const char* caller); - const char* dedup_step_name(dedup_step_t step); - int read_buckets(); - void check_and_update_heartbeat(unsigned shard_id, uint64_t count_a, uint64_t count_b, - const char *prefix); - - inline void check_and_update_worker_heartbeat(work_shard_t worker_id, int64_t obj_count); - inline void check_and_update_md5_heartbeat(md5_shard_t md5_id, - uint64_t load_count, - uint64_t dedup_count); - int ingress_bucket_idx_single_object(disk_block_array_t &disk_arr, - const rgw::sal::Bucket *bucket, - const rgw_bucket_dir_entry &entry, - worker_stats_t *p_worker_stats /*IN-OUT*/); - int process_bucket_shards(disk_block_array_t &disk_arr, - const rgw::sal::Bucket *bucket, - std::map &oids, - librados::IoCtx &ioctx, - work_shard_t shard_id, - work_shard_t num_work_shards, - worker_stats_t *p_worker_stats /*IN-OUT*/); - int ingress_bucket_objects_single_shard(disk_block_array_t &disk_arr, - const rgw_bucket &bucket_rec, - work_shard_t worker_id, - work_shard_t num_work_shards, - worker_stats_t *p_worker_stats /*IN-OUT*/); - int objects_ingress_single_work_shard(work_shard_t worker_id, - work_shard_t num_work_shards, - md5_shard_t num_md5_shards, - worker_stats_t *p_worker_stats, - uint8_t *raw_mem, - uint64_t raw_mem_size); - int f_ingress_work_shard(unsigned shard_id, - uint8_t *raw_mem, - uint64_t raw_mem_size, - work_shard_t num_work_shards, - 
md5_shard_t num_md5_shards); - int f_dedup_md5_shard(unsigned shard_id, - uint8_t *raw_mem, - uint64_t raw_mem_size, - work_shard_t num_work_shards, - md5_shard_t num_md5_shards); - int process_all_shards(bool ingress_work_shards, - int (Background::* func)(unsigned, uint8_t*, uint64_t, work_shard_t, md5_shard_t), - uint8_t *raw_mem, - uint64_t raw_mem_size, - work_shard_t num_work_shards, - md5_shard_t num_md5_shards); - int read_bucket_stats(const rgw_bucket &bucket_rec, - uint64_t *p_num_obj, - uint64_t *p_size); - int collect_all_buckets_stats(); - int objects_dedup_single_md5_shard(dedup_table_t *p_table, - md5_shard_t md5_shard, - md5_stats_t *p_stats, - work_shard_t num_work_shards); - int add_disk_rec_from_bucket_idx(disk_block_array_t &disk_arr, - const rgw::sal::Bucket *p_bucket, - const parsed_etag_t *p_parsed_etag, - const std::string &obj_name, - uint64_t obj_size, - const std::string &storage_class); - - int add_record_to_dedup_table(dedup_table_t *p_table, - const struct disk_record_t *p_rec, - disk_block_id_t block_id, - record_id_t rec_id, - md5_stats_t *p_stats, - remapper_t *remapper); - - int process_all_slabs(dedup_table_t *p_table, - dedup_step_t step, - md5_shard_t md5_shard, - work_shard_t work_shard, - uint32_t *p_seq_count, - md5_stats_t *p_stats /* IN-OUT */, - disk_block_seq_t *p_disk_block_arr, - remapper_t *remapper); - -#ifdef FULL_DEDUP_SUPPORT - int calc_object_blake3(const disk_record_t *p_rec, uint8_t *p_hash); - int add_obj_attrs_to_record(rgw_bucket *p_rb, - disk_record_t *p_rec, - const rgw::sal::Attrs &attrs, - dedup_table_t *p_table, - md5_stats_t *p_stats); /* IN-OUT */ - - int read_object_attribute(dedup_table_t *p_table, - disk_record_t *p_rec, - disk_block_id_t block_id, - record_id_t rec_id, - md5_shard_t md5_shard, - md5_stats_t *p_stats /* IN-OUT */, - disk_block_seq_t *p_disk, - remapper_t *remapper); - int try_deduping_record(dedup_table_t *p_table, - const disk_record_t *p_rec, - disk_block_id_t block_id, - 
record_id_t rec_id, - md5_shard_t md5_shard, - md5_stats_t *p_stats, /* IN-OUT */ - remapper_t *remapper); - int inc_ref_count_by_manifest(const std::string &ref_tag, - const std::string &oid, - RGWObjManifest &manifest); - int rollback_ref_by_manifest(const std::string &ref_tag, - const std::string &oid, - RGWObjManifest &tgt_manifest); - int free_tail_objs_by_manifest(const std::string &ref_tag, - const std::string &oid, - RGWObjManifest &tgt_manifest); - int dedup_object(const disk_record_t *p_src_rec, - const disk_record_t *p_tgt_rec, - md5_stats_t *p_stats, - bool is_shared_manifest_src); -#endif - int remove_slabs(unsigned worker_id, unsigned md5_shard, uint32_t slab_count); - int init_rados_access_handles(bool init_pool); - - // private data members - rgw::sal::Driver* driver = nullptr; - rgw::sal::RadosStore* store = nullptr; - RGWRados* rados = nullptr; - librados::Rados* rados_handle = nullptr; - const DoutPrefix dp; - const DoutPrefixProvider* const dpp; - CephContext* const cct; - cluster d_cluster; - librados::IoCtx d_dedup_cluster_ioctx; - utime_t d_heart_beat_last_update; - unsigned d_heart_beat_max_elapsed_sec; - - // A pool with 6 billion objects has a 1/(2^64) chance for collison with a 128bit MD5 - uint64_t d_max_protected_objects = (6ULL * 1024 * 1024 * 1024); - uint64_t d_all_buckets_obj_count = 0; - uint64_t d_all_buckets_obj_size = 0; - // we don't benefit from deduping RGW objects smaller than head-object size - uint32_t d_min_obj_size_for_dedup = (4ULL * 1024 * 1024); - uint32_t d_head_object_size = (4ULL * 1024 * 1024); - control_t d_ctl; - uint64_t d_watch_handle = 0; - DedupWatcher d_watcher_ctx; - - std::thread d_runner; - std::mutex d_cond_mutex; - std::condition_variable d_cond; - }; - -} //namespace rgw::dedup diff --git a/src/rgw/rgw_dedup_cluster.cc b/src/rgw/rgw_dedup_cluster.cc deleted file mode 100644 index 7bdb308af87c..000000000000 --- a/src/rgw/rgw_dedup_cluster.cc +++ /dev/null @@ -1,1346 +0,0 @@ -// -*- mode:C++; 
tab-width:8; c-basic-offset:2; -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Author: Gabriel BenHanokh - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "rgw_dedup_cluster.h" -#include "rgw_dedup.h" -#include "rgw_dedup_epoch.h" -#include "rgw_common.h" -#include "rgw_dedup_store.h" -#include "include/rados/rados_types.hpp" -#include "include/rados/buffer.h" -#include "include/rados/librados.hpp" -#include "svc_zone.h" -#include "common/Clock.h" // for ceph_clock_now() -#include "common/config.h" -#include "common/Cond.h" -#include "common/debug.h" -#include "common/errno.h" -#include "rgw_common.h" -#include "include/denc.h" -#include "rgw_sal.h" -#include "driver/rados/rgw_sal_rados.h" -#include -#include -#include - -namespace rgw::dedup { - const char* DEDUP_EPOCH_TOKEN = "EPOCH_TOKEN"; - const char* DEDUP_WATCH_OBJ = "DEDUP_WATCH_OBJ"; - - static constexpr unsigned EPOCH_MAX_LOCK_DURATION_SEC = 30; - struct shard_progress_t; - static int collect_shard_stats(rgw::sal::RadosStore *store, - const DoutPrefixProvider *dpp, - utime_t epoch_time, - unsigned shards_count, - const char *prefix, - bufferlist bl_arr[], - struct shard_progress_t *sp_arr); - - const uint64_t SP_ALL_OBJECTS = ULLONG_MAX; - const uint64_t SP_NO_OBJECTS = 0ULL; - const char* SHARD_PROGRESS_ATTR = "shard_progress"; - - //--------------------------------------------------------------------------- - static int get_control_ioctx(rgw::sal::RadosStore *store, - const DoutPrefixProvider *dpp, - librados::IoCtx &ctl_ioctx /* OUT-PARAM */) - { - const auto& control_pool = store->svc()->zone->get_zone_params().control_pool; - auto rados_handle = store->getRados()->get_rados_handle(); - int ret = rgw_init_ioctx(dpp, rados_handle, control_pool, ctl_ioctx); - if (unlikely(ret < 0)) 
{ - ldpp_dout(dpp, 1) << __func__ << "::ERR: failed rgw_init_ioctx() for control_pool ret=" - << ret << "::" << cpp_strerror(-ret) << dendl; - } - return ret; - } - - //--------------------------------------------------------------------------- - static int get_epoch(rgw::sal::RadosStore *store, - const DoutPrefixProvider *dpp, - dedup_epoch_t *p_epoch, /* OUT */ - const char *caller) - { - librados::IoCtx ctl_ioctx; - int ret = get_control_ioctx(store, dpp, ctl_ioctx); - if (unlikely(ret != 0)) { - return ret; - } - - std::string oid(DEDUP_EPOCH_TOKEN); - bufferlist bl; - ret = ctl_ioctx.getxattr(oid, RGW_DEDUP_ATTR_EPOCH, bl); - if (ret > 0) { - try { - auto p = bl.cbegin(); - decode(*p_epoch, p); - }catch (const buffer::error&) { - ldpp_dout(dpp, 0) << __func__ << "::failed epoch decode!" << dendl; - return -EINVAL; - } - if (caller) { - ldpp_dout(dpp, 10) << __func__ << "::"<< caller<< "::" << *p_epoch << dendl; - } - return 0; - } - else { - // zero length read means no data - if (ret == 0) { - ret = -ENODATA; - } - ldpp_dout(dpp, 10) << __func__ << "::" << (caller ? caller : "") - << "::failed ctl_ioctx.getxattr() with: " - << cpp_strerror(-ret) << ", ret=" << ret << dendl; - return ret; - } - } - - //--------------------------------------------------------------------------- - static int set_epoch(rgw::sal::RadosStore *store, - const std::string &cluster_id, - const DoutPrefixProvider *dpp, - work_shard_t num_work_shards, - md5_shard_t num_md5_shards) - { - librados::IoCtx ctl_ioctx; - int ret = get_control_ioctx(store, dpp, ctl_ioctx); - if (unlikely(ret != 0)) { - return ret; - } - - std::string oid(DEDUP_EPOCH_TOKEN); - ldpp_dout(dpp, 10) << __func__ << "::oid=" << oid << dendl; - bool exclusive = true; // block overwrite of old objects - ret = ctl_ioctx.create(oid, exclusive); - if (ret >= 0) { - ldpp_dout(dpp, 10) << __func__ << "::successfully created Epoch object!" 
<< dendl; - // now try and take ownership - } - else if (ret == -EEXIST) { - ldpp_dout(dpp, 10) << __func__ << "::Epoch object exists -> trying to take over" << dendl; - // try and take ownership - } - else{ - ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to create " << oid - <<" with: "<< cpp_strerror(-ret) << ", ret=" << ret <serial + 1, dedup_type, - ceph_clock_now(), num_work_shards, num_md5_shards}; - bufferlist old_epoch_bl, new_epoch_bl, err_bl; - encode(*p_old_epoch, old_epoch_bl); - encode(new_epoch, new_epoch_bl); - librados::ObjectWriteOperation op; - op.cmpxattr(RGW_DEDUP_ATTR_EPOCH, CEPH_OSD_CMPXATTR_OP_EQ, old_epoch_bl); - op.setxattr(RGW_DEDUP_ATTR_EPOCH, new_epoch_bl); - - ldpp_dout(dpp, 10) << __func__ << "::send EPOCH CLS" << dendl; - std::string oid(DEDUP_EPOCH_TOKEN); - ret = ctl_ioctx.operate(oid, &op); - if (ret != 0) { - ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ctl_ioctx.operate(" - << oid << "), err is " << cpp_strerror(-ret) << dendl; - } - - return ret; - } - - //--------------------------------------------------------------------------- - struct shard_progress_t { - shard_progress_t() { - // init an empty object - this->progress_a = SP_NO_OBJECTS; - this->progress_b = SP_NO_OBJECTS; - this->completed = false; - - // set all timers to now - this->creation_time = utime_t(); - this->completion_time = utime_t(); - this->update_time = utime_t(); - - // owner and stats_bl are empty until set - } - - shard_progress_t(uint64_t _progress_a, - uint64_t _progress_b, - bool _completed, - const std::string &_owner, - const bufferlist &_stats_bl) : owner(_owner), stats_bl(_stats_bl) { - this->progress_a = _progress_a; - this->progress_b = _progress_b; - this->completed = _completed; - - utime_t now = ceph_clock_now(); - this->update_time = now; - - if (_progress_a == SP_NO_OBJECTS && _progress_b == SP_NO_OBJECTS) { - this->creation_time = now; - } - if (_completed) { - this->completion_time = now; - } - } - - bool is_completed() const { - if 
(this->progress_b == SP_ALL_OBJECTS) { - ceph_assert(this->completed); - return true; - } - else { - ceph_assert(!this->completed); - return false; - } - } - - bool was_not_started() const { - return (this->creation_time == this->update_time); - } - - uint64_t progress_a; - uint64_t progress_b; - bool completed; - utime_t update_time; - utime_t creation_time; - utime_t completion_time; - std::string owner; - bufferlist stats_bl; - }; - - //--------------------------------------------------------------------------- - std::ostream& operator<<(std::ostream &out, shard_progress_t& sp) - { - out << (sp.completed ? " + ::" : " - ::"); - out << sp.owner << "::[" << sp.progress_a << ", " << sp.progress_b << "]"; - out << "::creation: " << sp.creation_time; - out << "::update: " << sp.update_time; - out << "::completion: " << sp.completion_time; - return out; - } - - //--------------------------------------------------------------------------- - void encode(const shard_progress_t& sp, ceph::bufferlist& bl) - { - ENCODE_START(1, 1, bl); - encode(sp.progress_a, bl); - encode(sp.progress_b, bl); - encode(sp.completed, bl); - encode(sp.creation_time, bl); - encode(sp.completion_time, bl); - encode(sp.update_time, bl); - encode(sp.owner, bl); - encode(sp.stats_bl, bl); - ENCODE_FINISH(bl); - } - - //--------------------------------------------------------------------------- - void decode(shard_progress_t & sp, ceph::bufferlist::const_iterator& bl) - { - DECODE_START(1, bl); - decode(sp.progress_a, bl); - decode(sp.progress_b, bl); - decode(sp.completed, bl); - decode(sp.creation_time, bl); - decode(sp.completion_time, bl); - decode(sp.update_time, bl); - decode(sp.owner, bl); - decode(sp.stats_bl, bl); - DECODE_FINISH(bl); - } - - //========================================================================== - - //--------------------------------------------------------------------------- - void cluster::clear() - { - d_curr_md5_shard = 0; - d_curr_worker_shard = 0; - - 
d_num_completed_workers = 0; - d_num_completed_md5 = 0; - - memset(d_completed_workers, TOKEN_STATE_PENDING, sizeof(d_completed_workers)); - memset(d_completed_md5, TOKEN_STATE_PENDING, sizeof(d_completed_md5)); - } - - - static constexpr auto COOKIE_LEN = 15; - static constexpr auto CLUSTER_ID_LEN = 15; - //--------------------------------------------------------------------------- - cluster::cluster(const DoutPrefixProvider *_dpp, - CephContext *cct, - rgw::sal::Driver* driver): - dpp(_dpp), - d_lock_cookie(gen_rand_alphanumeric(cct, COOKIE_LEN)), - d_cluster_id (gen_rand_alphanumeric(cct, CLUSTER_ID_LEN)) - { - clear(); - } - - //--------------------------------------------------------------------------- - int cluster::reset(rgw::sal::RadosStore *store, - dedup_epoch_t *p_epoch, - work_shard_t num_work_shards, - md5_shard_t num_md5_shards) - { - ldpp_dout(dpp, 10) << __func__ << "::REQ num_work_shards=" << num_work_shards - << "::num_md5_shards=" << num_md5_shards << dendl; - clear(); - - while (true) { - int ret = get_epoch(store, dpp, p_epoch, __func__); - if (ret != 0) { - return ret; - } - if (p_epoch->num_work_shards && p_epoch->num_md5_shards) { - ldpp_dout(dpp, 10) << __func__ << "::ACC num_work_shards=" << p_epoch->num_work_shards - << "::num_md5_shards=" << p_epoch->num_md5_shards << dendl; - break; - } - else if (!num_work_shards && !num_md5_shards) { - ldpp_dout(dpp, 10) << __func__ << "::Init flow, no need to wait" << dendl; - break; - } - else { - ret = swap_epoch(store, dpp, p_epoch, - static_cast (p_epoch->dedup_type), - num_work_shards, num_md5_shards); - } - } - - d_epoch_time = p_epoch->time; - // retry cleanup 3 times before declaring failure - const unsigned RETRY_LIMIT = 3; - int ret = 1; - for (unsigned i = 0; i < RETRY_LIMIT && ret != 0; i++) { - ret = cleanup_prev_run(store); - } - if (ret != 0) { - return ret; - } - - create_shard_tokens(store, p_epoch->num_work_shards, WORKER_SHARD_PREFIX); - create_shard_tokens(store, 
p_epoch->num_md5_shards, MD5_SHARD_PREFIX); - - ret = verify_all_shard_tokens(store, p_epoch->num_work_shards, - WORKER_SHARD_PREFIX); - if (ret != 0) { - return ret; - } - return verify_all_shard_tokens(store, p_epoch->num_md5_shards, - MD5_SHARD_PREFIX); - } - - //--------------------------------------------------------------------------- - int cluster::cleanup_prev_run(rgw::sal::RadosStore *store) - { - librados::IoCtx ctl_ioctx; - int ret = get_control_ioctx(store, dpp, ctl_ioctx); - if (unlikely(ret != 0)) { - return ret; - } - - int error_code = 0; - constexpr uint32_t max = 100; - std::string marker; - bool truncated = false; - rgw::AccessListFilter filter{}; - unsigned deleted_count = 0, skipped_count = 0; - unsigned failed_count = 0, no_entry_count = 0; - do { - std::vector oids; - int ret = rgw_list_pool(dpp, ctl_ioctx, max, filter, marker, &oids, &truncated); - if (ret == -ENOENT) { - ldpp_dout(dpp, 10) << __func__ << "::rgw_list_pool() ret == -ENOENT"<< dendl; - break; - } - else if (ret < 0) { - ldpp_dout(dpp, 1) << "failed rgw_list_pool()! ret=" << ret - << "::" << cpp_strerror(-ret) << dendl; - return ret; - } - - for (const std::string& oid : oids) { - if (shard_token_oid::legal_oid_name(oid) == false) { - ldpp_dout(dpp, 10) << __func__ << "::skipping " << oid << dendl; - skipped_count++; - continue; - } - - uint64_t size; - struct timespec tspec; - ret = ctl_ioctx.stat2(oid, &size, &tspec); - if (ret == -ENOENT) { - ldpp_dout(dpp, 20) << __func__ << "::" << oid - << " was removed by others" << dendl; - no_entry_count++; - continue; - } - else if (ret != 0) { - ldpp_dout(dpp, 10) << __func__ << "::failed ctl_ioctx.stat( " - << oid << " )" << dendl; - error_code = ret; - failed_count++; - continue; - } - utime_t mtime(tspec); - if (d_epoch_time < mtime) { - ldpp_dout(dpp, 10) << __func__ << "::skipping new obj! 
" - << "::EPOCH={" << d_epoch_time.tv.tv_sec << ":" << d_epoch_time.tv.tv_nsec << "} " - << "::mtime={" << mtime.tv.tv_sec << ":" << mtime.tv.tv_nsec << "}" << dendl; - skipped_count++; - continue; - } - ldpp_dout(dpp, 10) << __func__ << "::removing object: " << oid << dendl; - ret = ctl_ioctx.remove(oid); - if (ret == 0) { - deleted_count++; - } - else if (ret == -ENOENT) { - ldpp_dout(dpp, 20) << __func__ << "::" << oid - << " was removed by others" << dendl; - no_entry_count++; - continue; - } - else { - error_code = ret; - failed_count++; - ldpp_dout(dpp, 10) << __func__ << "::failed ctl_ioctx.remove( " << oid - << " ), ret=" << ret << "::" << cpp_strerror(-ret) << dendl; - } - } - ldpp_dout(dpp, 10) << __func__ << "::oids.size()=" << oids.size() - << "::deleted=" << deleted_count - << "::failed=" << failed_count - << "::no entry=" << no_entry_count - << "::skipped=" << skipped_count << dendl; - } while (truncated); - - return error_code; - } - - //--------------------------------------------------------------------------- - int cluster::create_shard_tokens(rgw::sal::RadosStore *store, - unsigned shards_count, - const char *prefix) - { - librados::IoCtx ctl_ioctx; - int ret = get_control_ioctx(store, dpp, ctl_ioctx); - if (unlikely(ret != 0)) { - return ret; - } - - shard_token_oid sto(prefix); - for (unsigned shard = 0; shard < shards_count; shard++) { - sto.set_shard(shard); - std::string oid(sto.get_buff(), sto.get_buff_size()); - ldpp_dout(dpp, 15) << __func__ << "::creating object: " << oid << dendl; - bool exclusive = true; - ret = ctl_ioctx.create(oid, exclusive); - if (ret >= 0) { - ldpp_dout(dpp, 15) << __func__ << "::oid=" << oid << " was created!" << dendl; - } - else if (ret == -EEXIST) { - ldpp_dout(dpp, 15) << __func__ << "::failed ctl_ioctx.create(" - << oid << ") -EEXIST!" << dendl; - } - else { - // TBD: can it happen legally ? 
- ldpp_dout(dpp, 1) << __func__ << "::failed ctl_ioctx.create(" << oid - << ") with: " << ret << "::" << cpp_strerror(-ret) << dendl; - } - } - - return 0; - } - - //--------------------------------------------------------------------------- - int cluster::verify_all_shard_tokens(rgw::sal::RadosStore *store, - unsigned shards_count, - const char *prefix) - { - librados::IoCtx ctl_ioctx; - int ret = get_control_ioctx(store, dpp, ctl_ioctx); - if (unlikely(ret != 0)) { - return ret; - } - - shard_token_oid sto(prefix); - for (unsigned shard = 0; shard < shards_count; shard++) { - sto.set_shard(shard); - std::string oid(sto.get_buff(), sto.get_buff_size()); - ldpp_dout(dpp, 10) << __func__ << "::checking object: " << oid << dendl; - - uint64_t size; - struct timespec tspec; - ret = ctl_ioctx.stat2(oid, &size, &tspec); - if (ret != 0) { - ldpp_dout(dpp, 5) << __func__ << "::failed ctl_ioctx.stat( " << oid << " )" - << "::shards_count=" << shards_count << dendl; - return ret; - } - } - - return 0; - } - - //--------------------------------------------------------------------------- - int cluster::update_shard_token_heartbeat(rgw::sal::RadosStore *store, - unsigned shard, - uint64_t count_a, - uint64_t count_b, - const char *prefix) - { - librados::IoCtx ctl_ioctx; - int ret = get_control_ioctx(store, dpp, ctl_ioctx); - if (unlikely(ret != 0)) { - return ret; - } - - shard_token_oid sto(prefix, shard); - std::string oid(sto.get_buff(), sto.get_buff_size()); - bufferlist empty_bl; - shard_progress_t sp(count_a, count_b, false, d_cluster_id, empty_bl); - sp.creation_time = d_token_creation_time; - bufferlist sp_bl; - encode(sp, sp_bl); - return ctl_ioctx.setxattr(oid, SHARD_PROGRESS_ATTR, sp_bl); - } - - //--------------------------------------------------------------------------- - int cluster::mark_shard_token_completed(rgw::sal::RadosStore *store, - unsigned shard, - uint64_t obj_count, - const char *prefix, - const bufferlist &bl) - { - librados::IoCtx ctl_ioctx; - int 
ret = get_control_ioctx(store, dpp, ctl_ioctx); - if (unlikely(ret != 0)) { - return ret; - } - - shard_token_oid sto(prefix, shard); - std::string oid(sto.get_buff(), sto.get_buff_size()); - ldpp_dout(dpp, 10) << __func__ << "::" << prefix << "::" << oid << dendl; - - shard_progress_t sp(obj_count, SP_ALL_OBJECTS, true, d_cluster_id, bl); - sp.creation_time = d_token_creation_time; - bufferlist sp_bl; - encode(sp, sp_bl); - ret = ctl_ioctx.setxattr(oid, SHARD_PROGRESS_ATTR, sp_bl); - if (ret == 0) { - ldpp_dout(dpp, 10) << __func__ << "::Done ctl_ioctx.setxattr(" << oid << ")" - << dendl; - } - else { - ldpp_dout(dpp, 0) << __func__ << "::Failed ctl_ioctx.setxattr(" << oid - << ") ret=" << ret << "::" << cpp_strerror(-ret) << dendl; - } - - return ret; - } - - //--------------------------------------------------------------------------- - int32_t cluster::get_next_shard_token(rgw::sal::RadosStore *store, - uint16_t start_shard, - uint16_t max_shard, - const char *prefix) - { - librados::IoCtx ctl_ioctx; - int ret = get_control_ioctx(store, dpp, ctl_ioctx); - if (unlikely(ret != 0)) { - return ret; - } - - // lock paramters: - const utime_t lock_duration; // zero duration means lock doesn't expire - const uint8_t lock_flags = 0; // no flags - const std::string lock_tag; // no tag - - shard_token_oid sto(prefix); - for (auto shard = start_shard; shard < max_shard; shard++) { - sto.set_shard(shard); - std::string oid(sto.get_buff(), sto.get_buff_size()); - ldpp_dout(dpp, 10) << __func__ << "::try garbbing " << oid << dendl; - librados::ObjectWriteOperation op; - op.assert_exists(); - rados::cls::lock::lock(&op, oid, ClsLockType::EXCLUSIVE, d_lock_cookie, - lock_tag, "dedup_shard_token", lock_duration, lock_flags); - ret = rgw_rados_operate(dpp, ctl_ioctx, oid, std::move(op), null_yield); - if (ret == -EBUSY) { - // someone else took this token -> move to the next one - ldpp_dout(dpp, 10) << __func__ << "::Failed lock. 
" << oid << - " is owned by other rgw" << dendl; - continue; - } - else if (ret == -ENOENT) { - // token is deleted - processing will stop the next time we try to read from the queue - ldpp_dout(dpp, 5) << __func__ << "::" << oid - << " token doesn't exist, fail lock!" << dendl; - continue; - } - else if (ret < 0) { - // failed to lock for another reason, continue to process other queues - ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to lock token: " << oid - << ":: ret=" << ret << "::" << cpp_strerror(-ret) << dendl; - //has_error = true; - continue; - } - ldpp_dout(dpp, 10) << __func__ << "::successfully locked " << oid << dendl; - bufferlist empty_bl; - shard_progress_t sp(SP_NO_OBJECTS, SP_NO_OBJECTS, false, d_cluster_id, empty_bl); - d_token_creation_time = sp.creation_time; - bufferlist sp_bl; - encode(sp, sp_bl); - ret = ctl_ioctx.setxattr(oid, SHARD_PROGRESS_ATTR, sp_bl); - if (ret == 0) { - ldpp_dout(dpp, 10) << __func__ << "::SUCCESS!::" << oid << dendl; - return shard; - } - } - - return NULL_SHARD; - } - - //--------------------------------------------------------------------------- - work_shard_t cluster::get_next_work_shard_token(rgw::sal::RadosStore *store, - work_shard_t num_work_shards) - { - int32_t shard = get_next_shard_token(store, d_curr_worker_shard, - num_work_shards, WORKER_SHARD_PREFIX); - if (shard >= 0 && shard < num_work_shards) { - d_curr_worker_shard = shard + 1; - return shard; - } - else { - return NULL_WORK_SHARD; - } - } - - //--------------------------------------------------------------------------- - md5_shard_t cluster::get_next_md5_shard_token(rgw::sal::RadosStore *store, - md5_shard_t num_md5_shards) - { - int32_t shard = get_next_shard_token(store, d_curr_md5_shard, num_md5_shards, - MD5_SHARD_PREFIX); - if (shard >= 0 && shard < num_md5_shards) { - d_curr_md5_shard = shard + 1; - return shard; - } - else { - return NULL_MD5_SHARD; - } - } - - 
//--------------------------------------------------------------------------- - int cluster::all_shard_tokens_completed(rgw::sal::RadosStore *store, - unsigned shards_count, - const char *prefix, - uint16_t *p_num_completed, - uint8_t completed_arr[]) - { - librados::IoCtx ctl_ioctx; - int ret = get_control_ioctx(store, dpp, ctl_ioctx); - if (unlikely(ret != 0)) { - return ret; - } - - int err_code = 0; - unsigned count = 0; - shard_token_oid sto(prefix); - for (unsigned shard = 0; shard < shards_count; shard++) { - if (completed_arr[shard] == TOKEN_STATE_COMPLETED) { - count++; - continue; - } - - sto.set_shard(shard); - std::string oid(sto.get_buff(), sto.get_buff_size()); - ldpp_dout(dpp, 10) << __func__ << "::checking object: " << oid << dendl; - bufferlist bl; - ret = ctl_ioctx.getxattr(oid, SHARD_PROGRESS_ATTR, bl); - if (unlikely(ret <= 0)) { - if (ret != -ENODATA) { - ldpp_dout(dpp, 10) << __func__ << "::failed ctl_ioctx.getxattr() ret=" - << ret << "::" << cpp_strerror(-ret) << dendl; - } - completed_arr[shard] = TOKEN_STATE_CORRUPTED; - // all failures to get valid token state return ENODATA - err_code = -ENODATA; - continue; - } - - shard_progress_t sp; - try { - auto p = bl.cbegin(); - decode(sp, p); - } - catch (const buffer::error&) { - ldpp_dout(dpp, 1) << __func__ << "::failed shard_progress_t decode!" << dendl; - completed_arr[shard] = TOKEN_STATE_CORRUPTED; - // all failures to get valid token state return ENODATA - err_code = -ENODATA; - continue; - } - - if (sp.is_completed()) { - utime_t duration = sp.completion_time - sp.creation_time; - // mark token completed; - (*p_num_completed)++; - completed_arr[shard] = TOKEN_STATE_COMPLETED; - ldpp_dout(dpp, 20) << __func__ << "::" << oid - << "::completed! duration=" << duration << dendl; - count++; - } - else if (sp.was_not_started()) { - // token was not started yet - // TBD: - // If it is not locked we can process it (by why we skipped it)?? 
- // If locked, check when it was done and if timed-out - ldpp_dout(dpp, 10) << __func__ << "::" << oid - << "::was not started, skipping" << dendl; - return -EAGAIN; - } - else { - static const utime_t heartbeat_timeout(EPOCH_MAX_LOCK_DURATION_SEC, 0); - utime_t time_elapsed = ceph_clock_now() - sp.update_time; - if (time_elapsed > heartbeat_timeout) { - // lock expired -> try and break lock - ldpp_dout(dpp, 5) << __func__ << "::" << oid - << "::expired lock, skipping:" << time_elapsed - << "::" << sp << dendl; - completed_arr[shard] = TOKEN_STATE_TIMED_OUT; - err_code = -ETIME; - continue; - } - else { - return -EAGAIN; - } - } - } // loop - - if (count < shards_count) { - unsigned n = shards_count - count; - ldpp_dout(dpp, 10) << __func__ << "::waiting for " << n << " tokens" << dendl; - } - return err_code; - } - - //--------------------------------------------------------------------------- - static int collect_shard_stats(rgw::sal::RadosStore *store, - const DoutPrefixProvider *dpp, - utime_t epoch_time, - unsigned shards_count, - const char *prefix, - bufferlist bl_arr[], - shard_progress_t *sp_arr) - { - librados::IoCtx ctl_ioctx; - int ret = get_control_ioctx(store, dpp, ctl_ioctx); - if (unlikely(ret != 0)) { - return ret; - } - - unsigned count = 0; - cluster::shard_token_oid sto(prefix); - for (unsigned shard = 0; shard < shards_count; shard++) { - sto.set_shard(shard); - std::string oid(sto.get_buff(), sto.get_buff_size()); - ldpp_dout(dpp, 20) << __func__ << "::checking object: " << oid << dendl; - - uint64_t size; - struct timespec tspec; - if (ctl_ioctx.stat2(oid, &size, &tspec) != 0) { - ldpp_dout(dpp, 10) << __func__ << "::failed ctl_ioctx.stat( " << oid << " )" - << "::shards_count=" << shards_count << dendl; - continue; - } - utime_t mtime(tspec); - if (epoch_time > mtime) { - ldpp_dout(dpp, 10) << __func__ << "::skipping old obj! 
" - << "::EPOCH={" << epoch_time.tv.tv_sec << ":" << epoch_time.tv.tv_nsec << "} " - << "::mtime={" << mtime.tv.tv_sec << ":" << mtime.tv.tv_nsec << "}" << dendl; - continue; - } - - shard_progress_t sp; - bufferlist bl; - ret = ctl_ioctx.getxattr(oid, SHARD_PROGRESS_ATTR, bl); - if (ret > 0) { - try { - auto p = bl.cbegin(); - decode(sp, p); - sp_arr[shard] = sp; - count++; - } - catch (const buffer::error&) { - ldpp_dout(dpp, 10) << __func__ << "::(1)failed shard_progress_t decode!" << dendl; - return -EINVAL; - } - } - else if (ret != -ENODATA) { - ldpp_dout(dpp, 10) << __func__ << "::" << oid << "::failed getxattr() ret=" - << ret << "::" << cpp_strerror(-ret) << dendl; - continue; - } - bl_arr[shard] = sp.stats_bl; - } - - if (count != shards_count) { - ldpp_dout(dpp, 10) << __func__ << "::missing shards stats! we got " - << count << " / " << shards_count << dendl; - } - - return count; - } - - struct member_time_t { - utime_t start_time; - utime_t end_time; - utime_t aggregated_time; - }; - - //--------------------------------------------------------------------------- - static void collect_single_shard_stats(const DoutPrefixProvider *dpp, - std::map &owner_map, - const shard_progress_t sp_arr[], - unsigned shard, - bool *p_show_time, - const char *name) - { - const utime_t null_time; - const shard_progress_t &sp = sp_arr[shard]; - if (sp.creation_time == null_time || sp.completion_time == null_time) { - *p_show_time = false; - return; - } - - const std::string &owner = sp.owner; - utime_t duration = sp.completion_time - sp.creation_time; - if (owner_map.find(owner) != owner_map.end()) { - owner_map[owner].aggregated_time += duration; - owner_map[owner].end_time = sp.completion_time; - } - else { - owner_map[owner].start_time = sp.creation_time; - owner_map[owner].aggregated_time = duration; - owner_map[owner].end_time = sp.completion_time; - } - ldpp_dout(dpp, 10) << __func__ << "::Got " << name - << " stats for shard #" << shard << dendl; - } - - 
//--------------------------------------------------------------------------- - static void show_incomplete_shards_fmt(bool has_incomplete_shards, - unsigned num_shards, - const shard_progress_t sp_arr[], - Formatter *fmt) - - { - if (!has_incomplete_shards) { - return; - } - Formatter::ArraySection array_section{*fmt, "incomplete_shards"}; - for (unsigned shard = 0; shard < num_shards; shard++) { - if (sp_arr[shard].is_completed() ) { - continue; - } - Formatter::ObjectSection object_section{*fmt, "shard_progress"}; - fmt->dump_unsigned("shard_id", shard); - fmt->dump_string("owner", sp_arr[shard].owner); - fmt->dump_unsigned("progress_a", sp_arr[shard].progress_a); - fmt->dump_unsigned("progress_b", sp_arr[shard].progress_b); - fmt->dump_stream("last updated") << sp_arr[shard].update_time; - } - } - - //--------------------------------------------------------------------------- - static utime_t show_time_func_fmt(const utime_t &start_time, - bool show_time, - const std::map &owner_map, - Formatter *fmt) - { - member_time_t all_members_time; - all_members_time.start_time = start_time; - all_members_time.end_time = start_time; - all_members_time.aggregated_time = utime_t(); - - Formatter::ObjectSection section{*fmt, "time"}; - { - Formatter::ArraySection array_section{*fmt, "per-shard time"}; - for (const auto& [owner, value] : owner_map) { - uint32_t sec = value.end_time.tv.tv_sec - value.start_time.tv.tv_sec; - fmt->dump_stream("member time") - << owner << "::start time = [" << value.start_time.tv.tv_sec % 1000 - << ":" << value.start_time.tv.tv_nsec / (1000*1000) << "] " - << "::aggregated time = " << value.aggregated_time.tv.tv_sec - << "(" << sec << ") seconds"; - all_members_time.aggregated_time += value.aggregated_time; - if (all_members_time.end_time < value.end_time) { - all_members_time.end_time = value.end_time; - } - } - } - - if (show_time) { - uint32_t sec = all_members_time.end_time.tv.tv_sec - all_members_time.start_time.tv.tv_sec; - - 
Formatter::ObjectSection section{*fmt, "All shards time"}; - fmt->dump_stream("start time") << all_members_time.start_time; - fmt->dump_stream("end time") - << all_members_time.end_time << " (" << sec << " seconds total)"; - fmt->dump_unsigned("aggregated time (sec)", all_members_time.aggregated_time.tv.tv_sec); - } - - return all_members_time.end_time; - } - - //--------------------------------------------------------------------------- - static void show_dedup_ratio_estimate_fmt(const worker_stats_t &wrk_stats_sum, - const md5_stats_t &md5_stats_sum, - Formatter *fmt) - { - uint64_t s3_bytes_before = wrk_stats_sum.ingress_obj_bytes; - uint64_t s3_dedup_bytes = md5_stats_sum.big_objs_stat.dedup_bytes_estimate; - uint64_t s3_bytes_after = s3_bytes_before - s3_dedup_bytes; - Formatter::ObjectSection section{*fmt, "dedup_ratio_estimate"}; - fmt->dump_unsigned("s3_bytes_before", s3_bytes_before); - fmt->dump_unsigned("s3_bytes_after", s3_bytes_after); - fmt->dump_unsigned("dup_head_bytes", md5_stats_sum.dup_head_bytes_estimate); - - if (s3_bytes_before > s3_bytes_after && s3_bytes_after) { - double dedup_ratio = (double)s3_bytes_before/s3_bytes_after; - fmt->dump_float("dedup_ratio", dedup_ratio); - } - else { - fmt->dump_float("dedup_ratio", 0); - } - } - - //--------------------------------------------------------------------------- - static void show_dedup_ratio_actual_fmt(const worker_stats_t &wrk_stats_sum, - const md5_stats_t &md5_stats_sum, - Formatter *fmt) - { - uint64_t s3_bytes_before = wrk_stats_sum.ingress_obj_bytes; - uint64_t s3_dedup_bytes = (md5_stats_sum.deduped_objects_bytes + - md5_stats_sum.shared_manifest_dedup_bytes); - uint64_t s3_bytes_after = s3_bytes_before - s3_dedup_bytes; - - Formatter::ObjectSection section{*fmt, "dedup_ratio_actual"}; - fmt->dump_unsigned("s3_bytes_before", s3_bytes_before); - fmt->dump_unsigned("s3_bytes_after", s3_bytes_after); - fmt->dump_unsigned("dup_head_bytes", md5_stats_sum.dup_head_bytes); - if (s3_bytes_before 
> s3_bytes_after && s3_bytes_after) { - double dedup_ratio = (double)s3_bytes_before/s3_bytes_after; - fmt->dump_float("dedup_ratio", dedup_ratio); - } - else { - fmt->dump_float("dedup_ratio", 0); - } - } - - //--------------------------------------------------------------------------- - // command-line called from radosgw-admin.cc - int cluster::collect_all_shard_stats(rgw::sal::RadosStore *store, - Formatter *fmt, - const DoutPrefixProvider *dpp) - { - dedup_epoch_t epoch; - int ret = get_epoch(store, dpp, &epoch, nullptr); - if (ret != 0) { - return ret; - } - - Formatter::ObjectSection section{*fmt, "DEDUP STAT COUNTERS"}; - work_shard_t num_work_shards = epoch.num_work_shards; - md5_shard_t num_md5_shards = epoch.num_md5_shards; - - unsigned completed_work_shards_count = 0; - unsigned completed_md5_shards_count = 0; - utime_t md5_start_time; - worker_stats_t wrk_stats_sum; - { - std::map owner_map; - bool show_time = true; - bufferlist bl_arr[num_work_shards]; - shard_progress_t sp_arr[num_work_shards]; - int cnt = collect_shard_stats(store, dpp, epoch.time, num_work_shards, - WORKER_SHARD_PREFIX, bl_arr, sp_arr); - if (cnt != num_work_shards && 0) { - std::cerr << ">>>Partial work shard stats recived " << cnt << " / " - << num_work_shards << "\n" << std::endl; - } - bool has_incomplete_shards = false; - for (unsigned shard = 0; shard < num_work_shards; shard++) { - if (bl_arr[shard].length() == 0) { - has_incomplete_shards = true; - continue; - } - completed_work_shards_count++; - worker_stats_t stats; - try { - auto p = bl_arr[shard].cbegin(); - decode(stats, p); - wrk_stats_sum += stats; - }catch (const buffer::error&) { - // TBD: can we use std::cerr or should we use formatter ?? 
- std::cerr << __func__ << "::(2)failed worker_stats_t decode #" << shard << std::endl; - continue; - } - collect_single_shard_stats(dpp, owner_map, sp_arr, shard, &show_time, "WORKER"); - } - Formatter::ObjectSection worker_stats(*fmt, "worker_stats"); - wrk_stats_sum.dump(fmt); - show_incomplete_shards_fmt(has_incomplete_shards, num_work_shards, sp_arr, fmt); - md5_start_time = show_time_func_fmt(epoch.time, show_time, owner_map, fmt); - } - - if (completed_work_shards_count == num_work_shards) { - std::map owner_map; - bool show_time = true; - md5_stats_t md5_stats_sum; - bufferlist bl_arr[num_md5_shards]; - shard_progress_t sp_arr[num_md5_shards]; - int cnt = collect_shard_stats(store, dpp, epoch.time, num_md5_shards, - MD5_SHARD_PREFIX, bl_arr, sp_arr); - if (cnt != num_md5_shards && 0) { - std::cerr << ">>>Partial MD5_SHARD stats recived " << cnt << " / " - << num_md5_shards << "\n" << std::endl; - } - bool has_incomplete_shards = false; - for (unsigned shard = 0; shard < num_md5_shards; shard++) { - if (bl_arr[shard].length() == 0) { - has_incomplete_shards = true; - continue; - } - completed_md5_shards_count++; - md5_stats_t stats; - try { - auto p = bl_arr[shard].cbegin(); - decode(stats, p); - md5_stats_sum += stats; - }catch (const buffer::error&) { - // TBD: can we use std::cerr or should we use formatter ?? 
- std::cerr << __func__ << "::failed md5_stats_t decode #" << shard << std::endl; - continue; - } - collect_single_shard_stats(dpp, owner_map, sp_arr, shard, &show_time, "MD5"); - } - { - Formatter::ObjectSection outer(*fmt, "md5_stats"); - md5_stats_sum.dump(fmt); - show_incomplete_shards_fmt(has_incomplete_shards, num_md5_shards, sp_arr, fmt); - show_time_func_fmt(md5_start_time, show_time, owner_map, fmt); - } - show_dedup_ratio_estimate_fmt(wrk_stats_sum, md5_stats_sum, fmt); - show_dedup_ratio_actual_fmt(wrk_stats_sum, md5_stats_sum, fmt); - } - - fmt->dump_bool("completed", (completed_md5_shards_count == num_md5_shards)); - return 0; - } - - //--------------------------------------------------------------------------- - int cluster::watch_reload(rgw::sal::RadosStore *store, - const DoutPrefixProvider* dpp, - uint64_t *p_watch_handle, - librados::WatchCtx2 *ctx) - { - librados::IoCtx ctl_ioctx; - int ret = get_control_ioctx(store, dpp, ctl_ioctx); - if (unlikely(ret != 0)) { - return ret; - } - - const std::string & oid = DEDUP_WATCH_OBJ; - // create the object to watch (object may already exist) - bool exclusive = true; - ret = ctl_ioctx.create(oid, exclusive); - if (ret >= 0) { - ldpp_dout(dpp, 10) << "dedup_bg::watch_reload():" << oid - << " was created!" << dendl; - } - else if (ret == -EEXIST) { - ldpp_dout(dpp, 5) << __func__ << "::"<< oid << " exists" << dendl; - } - else { - ldpp_dout(dpp, 1) << "dedup_bg::watch_reload(): failed ctl_ioctx.create(" - << oid << ") ret=" << ret << "::" << cpp_strerror(-ret) << dendl; - return ret; - } - - ret = ctl_ioctx.watch2(oid, p_watch_handle, ctx); - if (ret < 0) { - ldpp_dout(dpp, 1) << "dedup_bg::watch_reload(): failed watch2() " << oid - << ". 
error: " << cpp_strerror(-ret) << dendl; - *p_watch_handle = 0; - return ret; - } - ldpp_dout(dpp, 5) << "dedup_bg::watch_reload(): Started watching " - << oid << "::watch_handle=" << *p_watch_handle << dendl; - return 0; - } - - //--------------------------------------------------------------------------- - int cluster::unwatch_reload(rgw::sal::RadosStore *store, - const DoutPrefixProvider* dpp, - uint64_t watch_handle) - { - librados::IoCtx ctl_ioctx; - int ret = get_control_ioctx(store, dpp, ctl_ioctx); - if (unlikely(ret != 0)) { - return ret; - } - - ret = ctl_ioctx.unwatch2(watch_handle); - if (ret < 0) { - ldpp_dout(dpp, 1) << "dedup_bg::unwatch_reload() failed unwatch2() " - << DEDUP_WATCH_OBJ << "::" << cpp_strerror(-ret) << dendl; - return ret; - } - return 0; - } - - //--------------------------------------------------------------------------- - int cluster::ack_notify(rgw::sal::RadosStore *store, - const DoutPrefixProvider *dpp, - const control_t *p_ctl, - uint64_t notify_id, - uint64_t cookie, - int status) - { - librados::IoCtx ctl_ioctx; - int ret = get_control_ioctx(store, dpp, ctl_ioctx); - if (unlikely(ret != 0)) { - return ret; - } - - ldpp_dout(dpp, 5) << __func__ << "::status=" << status << dendl; - bufferlist reply_bl; - ceph::encode(status, reply_bl); - encode(*p_ctl, reply_bl); - ctl_ioctx.notify_ack(DEDUP_WATCH_OBJ, notify_id, cookie, reply_bl); - - return 0; - } - - //--------------------------------------------------------------------------- - // command-line called from radosgw-admin.cc - int cluster::dedup_control(rgw::sal::RadosStore *store, - const DoutPrefixProvider *dpp, - urgent_msg_t urgent_msg) - { - ldpp_dout(dpp, 10) << __func__ << "::dedup_control req = " - << get_urgent_msg_names(urgent_msg) << dendl; - if (urgent_msg != URGENT_MSG_RESUME && - urgent_msg != URGENT_MSG_PASUE && - urgent_msg != URGENT_MSG_RESTART && - urgent_msg != URGENT_MSG_ABORT) { - ldpp_dout(dpp, 1) << __func__ << "::illegal urgent_msg="<< urgent_msg << 
dendl; - return -EINVAL; - } - - librados::IoCtx ctl_ioctx; - int ret = get_control_ioctx(store, dpp, ctl_ioctx); - if (unlikely(ret != 0)) { - return ret; - } - - // 10 seconds timeout - const uint64_t timeout_ms = 10*1000; - bufferlist reply_bl, urgent_msg_bl; - ceph::encode(urgent_msg, urgent_msg_bl); - ret = rgw_rados_notify(dpp, ctl_ioctx, DEDUP_WATCH_OBJ, urgent_msg_bl, - timeout_ms, &reply_bl, null_yield); - if (ret < 0) { - ldpp_dout(dpp, 1) << __func__ << "::failed rgw_rados_notify(" - << DEDUP_WATCH_OBJ << ")::err="< acks; - std::vector timeouts; - ctl_ioctx.decode_notify_response(reply_bl, &acks, &timeouts); - if (timeouts.size() > 0) { - ldpp_dout(dpp, 1) << __func__ << "::failed rgw_rados_notify(" - << DEDUP_WATCH_OBJ << ")::timeout error" << dendl; - return -EAGAIN; - } - - for (auto& ack : acks) { - try { - ldpp_dout(dpp, 20) << __func__ << "::ACK: notifier_id=" << ack.notifier_id - << "::cookie=" << ack.cookie << dendl; - auto iter = ack.payload_bl.cbegin(); - ceph::decode(ret, iter); - struct rgw::dedup::control_t ctl; - decode(ctl, iter); - ldpp_dout(dpp, 10) << __func__ << "::++ACK::ctl=" << ctl << "::ret=" << ret << dendl; - } catch (buffer::error& err) { - ldpp_dout(dpp, 1) << __func__ << "::failed decoding notify acks" << dendl; - return -EINVAL; - } - if (ret != 0) { - ldpp_dout(dpp, 1) << __func__ << "::Bad notify ack, ret=" << ret - << "::err=" << cpp_strerror(-ret) << dendl; - return ret; - } - } - ldpp_dout(dpp, 10) << __func__ << "::" << get_urgent_msg_names(urgent_msg) - << " finished successfully!" 
<< dendl; - return 0; - } - - //--------------------------------------------------------------------------- - // command-line called from radosgw-admin.cc - int cluster::dedup_restart_scan(rgw::sal::RadosStore *store, - dedup_req_type_t dedup_type, - const DoutPrefixProvider *dpp) - { - ldpp_dout(dpp, 1) << __func__ << "::dedup_type = " << dedup_type << dendl; - - dedup_epoch_t old_epoch; - // store the previous epoch for cmp-swap - int ret = get_epoch(store, dpp, &old_epoch, __func__); - if (ret != 0) { - // generate an empty epoch with zero counters - std::string cluster_id("NULL_CLUSTER_ID"); - ldpp_dout(dpp, 1) << __func__ << "::set empty EPOCH using cluster_id: " - << cluster_id << dendl; - set_epoch(store, cluster_id, dpp, 0, 0); - ret = get_epoch(store, dpp, &old_epoch, __func__); - if (ret) { - return ret; - } - } - - // first abort all dedup work! - ret = dedup_control(store, dpp, URGENT_MSG_ABORT); - if (ret != 0) { - return ret; - } -#if 0 - // then delete dedup-pool to ensure a clean start - const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool; - auto rados_handle = store->getRados()->get_rados_handle(); - ldpp_dout(dpp, 5) <<__func__ << "::delete pool: " << dedup_pool.name << dendl; - rados_handle->pool_delete(dedup_pool.name.c_str()); -#endif - - ldpp_dout(dpp, 10) << __func__ << dedup_type << dendl; -#ifdef FULL_DEDUP_SUPPORT - ceph_assert(dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE || - dedup_type == dedup_req_type_t::DEDUP_TYPE_FULL); -#else - ceph_assert(dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE); -#endif - ret = swap_epoch(store, dpp, &old_epoch, dedup_type, 0, 0); - if (ret == 0) { - ldpp_dout(dpp, 10) << __func__ << "::Epoch object was reset" << dendl; - return dedup_control(store, dpp, URGENT_MSG_RESTART); - } - else { - return ret; - } - } - - //--------------------------------------------------------------------------- - bool cluster::can_start_new_scan(rgw::sal::RadosStore *store) - { - 
ldpp_dout(dpp, 10) << __func__ << "::epoch=" << d_epoch_time << dendl; - dedup_epoch_t new_epoch; - if (get_epoch(store, dpp, &new_epoch, nullptr) != 0) { - ldpp_dout(dpp, 1) << __func__ << "::No Epoch Object::" - << "::scan can be restarted!\n\n\n" << dendl; - // no epoch object exists -> we should start a new scan - return true; - } - - if (new_epoch.time <= d_epoch_time) { - if (new_epoch.time == d_epoch_time) { - ldpp_dout(dpp, 10) << __func__ << "::Epoch hasn't change - > Do not restart scan!!" << dendl; - } - else { - ldpp_dout(dpp, 1) << __func__ << " ::Do not restart scan!\n epoch=" - << d_epoch_time << "\nnew_epoch="<< new_epoch.time < now = TRUE " << dendl; - } - return false; - } -} // namespace rgw::dedup diff --git a/src/rgw/rgw_dedup_cluster.h b/src/rgw/rgw_dedup_cluster.h deleted file mode 100644 index 64b2c54a4fa2..000000000000 --- a/src/rgw/rgw_dedup_cluster.h +++ /dev/null @@ -1,193 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Author: Gabriel BenHanokh - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#pragma once -#include "common/dout.h" -#include "rgw_dedup_utils.h" -#include "rgw_dedup_store.h" -#include - -namespace rgw::dedup { - static constexpr const char* WORKER_SHARD_PREFIX = "WRK.SHRD.TK."; - static constexpr const char* MD5_SHARD_PREFIX = "MD5.SHRD.TK."; - struct control_t; - struct dedup_epoch_t; - - class cluster{ - public: - //================================================================================== - class shard_token_oid { - public: - //--------------------------------------------------------------------------- - shard_token_oid(const char *prefix) { - this->prefix_len = snprintf(this->buff, BUFF_SIZE, "%s", prefix); - this->total_len = this->prefix_len; - } - - //--------------------------------------------------------------------------- - shard_token_oid(const char *prefix, uint16_t shard) { - this->prefix_len = snprintf(this->buff, BUFF_SIZE, "%s", prefix); - set_shard(shard); - } - - //--------------------------------------------------------------------------- - void set_shard(uint16_t shard) { - int n = snprintf(this->buff + this->prefix_len, BUFF_SIZE, "%03x", shard); - this->total_len = this->prefix_len + n; - } - - //--------------------------------------------------------------------------- - static bool legal_oid_name(const std::string& oid) { - return ((oid.length() <= BUFF_SIZE) && - (oid.starts_with(WORKER_SHARD_PREFIX)||oid.starts_with(MD5_SHARD_PREFIX))); - } - inline const char* get_buff() { return this->buff; } - inline unsigned get_buff_size() { return this->total_len; } - private: - static const unsigned BUFF_SIZE = 15; - unsigned total_len = 0; - unsigned prefix_len = 0; - char buff[BUFF_SIZE]; - }; - - //================================================================================== - cluster(const DoutPrefixProvider *_dpp, - CephContext* cct, - rgw::sal::Driver* driver); - int reset(rgw::sal::RadosStore *store, - struct dedup_epoch_t*, - work_shard_t num_work_shards, - md5_shard_t num_md5_shards); - 
- utime_t get_epoch_time() { return d_epoch_time; } - work_shard_t get_next_work_shard_token(rgw::sal::RadosStore *store, - work_shard_t num_work_shards); - md5_shard_t get_next_md5_shard_token(rgw::sal::RadosStore *store, - md5_shard_t num_md5_shards); - bool can_start_new_scan(rgw::sal::RadosStore *store); - static int collect_all_shard_stats(rgw::sal::RadosStore *store, - Formatter *p_formatter, - const DoutPrefixProvider *dpp); - static int watch_reload(rgw::sal::RadosStore *store, - const DoutPrefixProvider* dpp, - uint64_t *p_watch_handle, - librados::WatchCtx2 *ctx); - static int unwatch_reload(rgw::sal::RadosStore *store, - const DoutPrefixProvider* dpp, - uint64_t watch_handle); - static int ack_notify(rgw::sal::RadosStore *store, - const DoutPrefixProvider *dpp, - const struct control_t *p_ctl, - uint64_t notify_id, - uint64_t cookie, - int status); - static int dedup_control(rgw::sal::RadosStore *store, - const DoutPrefixProvider *dpp, - urgent_msg_t urgent_msg); - static int dedup_restart_scan(rgw::sal::RadosStore *store, - dedup_req_type_t dedup_type, - const DoutPrefixProvider *dpp); - - //--------------------------------------------------------------------------- - int mark_work_shard_token_completed(rgw::sal::RadosStore *store, - work_shard_t work_shard, - const worker_stats_t *p_stats) - { - ceph::bufferlist bl; - encode(*p_stats, bl); - d_num_completed_workers++; - d_completed_workers[work_shard] = TOKEN_STATE_COMPLETED; - - return mark_shard_token_completed(store, work_shard, p_stats->ingress_obj, - WORKER_SHARD_PREFIX, bl); - } - - //--------------------------------------------------------------------------- - int mark_md5_shard_token_completed(rgw::sal::RadosStore *store, - md5_shard_t md5_shard, - const md5_stats_t *p_stats) - { - ceph::bufferlist bl; - encode(*p_stats, bl); - d_num_completed_md5++; - d_completed_md5[md5_shard] = TOKEN_STATE_COMPLETED; - return mark_shard_token_completed(store, md5_shard, p_stats->loaded_objects, - 
MD5_SHARD_PREFIX, bl); - } - - int update_shard_token_heartbeat(rgw::sal::RadosStore *store, - unsigned shard, - uint64_t count_a, - uint64_t count_b, - const char *prefix); - - //--------------------------------------------------------------------------- - int all_work_shard_tokens_completed(rgw::sal::RadosStore *store, - work_shard_t num_work_shards) - { - return all_shard_tokens_completed(store, num_work_shards, WORKER_SHARD_PREFIX, - &d_num_completed_workers, d_completed_workers); - } - - //--------------------------------------------------------------------------- - int all_md5_shard_tokens_completed(rgw::sal::RadosStore *store, - md5_shard_t num_md5_shards) - { - return all_shard_tokens_completed(store, num_md5_shards, MD5_SHARD_PREFIX, - &d_num_completed_md5, d_completed_md5); - } - - private: - static constexpr unsigned TOKEN_STATE_PENDING = 0x00; - static constexpr unsigned TOKEN_STATE_CORRUPTED = 0xCC; - static constexpr unsigned TOKEN_STATE_TIMED_OUT = 0xDD; - static constexpr unsigned TOKEN_STATE_COMPLETED = 0xFF; - - void clear(); - int all_shard_tokens_completed(rgw::sal::RadosStore *store, - unsigned shards_count, - const char *prefix, - uint16_t *p_num_completed, - uint8_t completed_arr[]); - int cleanup_prev_run(rgw::sal::RadosStore *store); - int32_t get_next_shard_token(rgw::sal::RadosStore *store, - uint16_t start_shard, - uint16_t max_count, - const char *prefix); - int create_shard_tokens(rgw::sal::RadosStore *store, - unsigned shards_count, - const char *prefix); - int verify_all_shard_tokens(rgw::sal::RadosStore *store, - unsigned shards_count, - const char *prefix); - int mark_shard_token_completed(rgw::sal::RadosStore *store, - unsigned shard, - uint64_t obj_count, - const char *prefix, - const bufferlist &bl); - - const DoutPrefixProvider *dpp; - std::string d_lock_cookie; - std::string d_cluster_id; - md5_shard_t d_curr_md5_shard = 0; - work_shard_t d_curr_worker_shard = 0; - utime_t d_epoch_time; - utime_t d_token_creation_time; - 
uint8_t d_completed_workers[MAX_WORK_SHARD]; - uint8_t d_completed_md5[MAX_MD5_SHARD]; - uint16_t d_num_completed_workers = 0; - uint16_t d_num_completed_md5 = 0; - }; - -} //namespace rgw::dedup diff --git a/src/rgw/rgw_dedup_epoch.h b/src/rgw/rgw_dedup_epoch.h deleted file mode 100644 index 84492d357392..000000000000 --- a/src/rgw/rgw_dedup_epoch.h +++ /dev/null @@ -1,73 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Author: Gabriel BenHanokh - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#pragma once - -#include "common/Clock.h" // for ceph_clock_now() -#include "common/dout.h" -#include "rgw_dedup_utils.h" - -#include - -namespace rgw::dedup { - constexpr const char* RGW_DEDUP_ATTR_EPOCH = "rgw.dedup.attr.epoch"; - //=========================================================================== - - struct dedup_epoch_t { - uint32_t serial; - dedup_req_type_t dedup_type; - utime_t time; - uint32_t num_work_shards = 0; - uint32_t num_md5_shards = 0; - }; - - //--------------------------------------------------------------------------- - inline void encode(const dedup_epoch_t& o, ceph::bufferlist& bl) - { - ENCODE_START(1, 1, bl); - encode(o.serial, bl); - encode(static_cast(o.dedup_type), bl); - encode(o.time, bl); - encode(o.num_work_shards, bl); - encode(o.num_md5_shards, bl); - ENCODE_FINISH(bl); - } - - //--------------------------------------------------------------------------- - inline void decode(dedup_epoch_t& o, ceph::bufferlist::const_iterator& bl) - { - DECODE_START(1, bl); - decode(o.serial, bl); - int32_t dedup_type; - decode(dedup_type, bl); - o.dedup_type = static_cast (dedup_type); - decode(o.time, bl); - decode(o.num_work_shards, bl); - decode(o.num_md5_shards, bl); - 
DECODE_FINISH(bl); - } - - //--------------------------------------------------------------------------- - inline std::ostream& operator<<(std::ostream &out, const dedup_epoch_t &ep) - { - utime_t elapsed = ceph_clock_now() - ep.time; - out << "EPOCH::Time={" << ep.time.tv.tv_sec <<":"<< ep.time.tv.tv_nsec << "}::"; - out << "Elapsed={" << elapsed.tv.tv_sec <<":"<< elapsed.tv.tv_nsec << "}::"; - out << ep.dedup_type << "::serial=" << ep.serial; - out << "::num_work_shards=" << ep.num_work_shards; - out << "::num_md5_shards=" << ep.num_md5_shards; - return out; - } - -} //namespace rgw::dedup diff --git a/src/rgw/rgw_dedup_remap.h b/src/rgw/rgw_dedup_remap.h deleted file mode 100644 index 60ef66ecbe80..000000000000 --- a/src/rgw/rgw_dedup_remap.h +++ /dev/null @@ -1,62 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Author: Gabriel BenHanokh - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#pragma once -#include "common/dout.h" -#include -#include -#include - - -namespace rgw::dedup { - class remapper_t - { - public: - static inline constexpr uint8_t NULL_IDX = 0xFF; - remapper_t(uint32_t max_entries) : d_max_entries(max_entries) {} - uint8_t remap(const std::string &key, - const DoutPrefixProvider* dpp, - uint64_t *p_overflow_count) { // IN-OUT - uint8_t idx; - - auto itr = d_map.find(key); - if (itr != d_map.end()) { - idx = itr->second; - ldpp_dout(dpp, 20) << __func__ << "::Existing key: " << key - << " is mapped to idx=" << (int)idx << dendl; - } - else if (d_num_entries < d_max_entries) { - // assign it the next entry - idx = d_num_entries++; - d_map[key] = idx; - ldpp_dout(dpp, 20) << __func__ << "::New key: " << key - << " was mapped to idx=" << (int)idx << dendl; - } - else { - (*p_overflow_count) ++; - ldpp_dout(dpp, 10) << __func__ << "::ERR: Failed adding key: " - << key << dendl; - idx = NULL_IDX; - } - - return idx; - } - - private: - uint32_t d_num_entries = 0; - const uint32_t d_max_entries; - std::unordered_map d_map; - }; - -} //namespace rgw::dedup diff --git a/src/rgw/rgw_dedup_store.cc b/src/rgw/rgw_dedup_store.cc deleted file mode 100644 index fd15bbc372d8..000000000000 --- a/src/rgw/rgw_dedup_store.cc +++ /dev/null @@ -1,732 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Author: Gabriel BenHanokh - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "include/rados/rados_types.hpp" -#include "include/rados/buffer.h" -#include "include/rados/librados.hpp" -#include "svc_zone.h" -#include "common/config.h" -#include "common/Cond.h" -#include "common/debug.h" -#include "common/errno.h" -#include "rgw_common.h" -#include "include/denc.h" -#include "rgw_sal.h" -#include "driver/rados/rgw_sal_rados.h" -#include "rgw_dedup_utils.h" -#include "rgw_dedup.h" -#include "rgw_dedup_store.h" -#include "fmt/ranges.h" -#include - -namespace rgw::dedup { - - //--------------------------------------------------------------------------- - disk_record_t::disk_record_t(const rgw::sal::Bucket *p_bucket, - const std::string &obj_name, - const parsed_etag_t *p_parsed_etag, - uint64_t obj_size, - const std::string &storage_class) - { - this->s.rec_version = 0; - this->s.flags = 0; - this->s.num_parts = p_parsed_etag->num_parts; - this->obj_name = obj_name; - this->s.obj_name_len = this->obj_name.length(); - this->bucket_name = p_bucket->get_name(); - this->s.bucket_name_len = this->bucket_name.length(); - - this->s.md5_high = p_parsed_etag->md5_high; - this->s.md5_low = p_parsed_etag->md5_low; - this->s.obj_bytes_size = obj_size; - this->s.object_version = 0; - - this->bucket_id = p_bucket->get_bucket_id(); - this->s.bucket_id_len = this->bucket_id.length(); - this->tenant_name = p_bucket->get_tenant(); - this->s.tenant_name_len = this->tenant_name.length(); - this->stor_class = storage_class; - this->s.stor_class_len = storage_class.length(); - - this->s.ref_tag_len = 0; - this->s.manifest_len = 0; - - this->s.shared_manifest = 0; - memset(this->s.hash, 0, sizeof(this->s.hash)); - this->ref_tag = ""; - this->manifest_bl.clear(); - } - - //--------------------------------------------------------------------------- - disk_record_t::disk_record_t(const char *buff) - { - disk_record_t *p_rec = (disk_record_t*)buff; - this->s.rec_version = p_rec->s.rec_version; - // wrong version, bail out - if 
(unlikely(p_rec->s.rec_version != 0)) { - return; - } - - this->s.flags = p_rec->s.flags; - this->s.num_parts = CEPHTOH_16(p_rec->s.num_parts); - this->s.obj_name_len = CEPHTOH_16(p_rec->s.obj_name_len); - this->s.bucket_name_len = CEPHTOH_16(p_rec->s.bucket_name_len); - - this->s.md5_high = CEPHTOH_64(p_rec->s.md5_high); - this->s.md5_low = CEPHTOH_64(p_rec->s.md5_low); - this->s.obj_bytes_size = CEPHTOH_64(p_rec->s.obj_bytes_size); - this->s.object_version = CEPHTOH_64(p_rec->s.object_version); - - this->s.bucket_id_len = CEPHTOH_16(p_rec->s.bucket_id_len); - this->s.tenant_name_len = CEPHTOH_16(p_rec->s.tenant_name_len); - this->s.stor_class_len = CEPHTOH_16(p_rec->s.stor_class_len); - this->s.ref_tag_len = CEPHTOH_16(p_rec->s.ref_tag_len); - this->s.manifest_len = CEPHTOH_16(p_rec->s.manifest_len); - - const char *p = buff + sizeof(this->s); - this->obj_name = std::string(p, this->s.obj_name_len); - p += p_rec->s.obj_name_len; - - this->bucket_name = std::string(p, this->s.bucket_name_len); - p += p_rec->s.bucket_name_len; - - this->bucket_id = std::string(p, this->s.bucket_id_len); - p += p_rec->s.bucket_id_len; - - this->tenant_name = std::string(p, this->s.tenant_name_len); - p += p_rec->s.tenant_name_len; - - this->stor_class = std::string(p, this->s.stor_class_len); - p += p_rec->s.stor_class_len; - - if (p_rec->s.flags.is_fastlane()) { - // TBD:: remove asserts - ceph_assert(this->s.ref_tag_len == 0); - ceph_assert(this->s.manifest_len == 0); - } - else { - this->s.shared_manifest = CEPHTOH_64(p_rec->s.shared_manifest); - // BLAKE3 hash has 256 bit splitted into multiple 64bit units - const unsigned units = (256 / (sizeof(uint64_t)*8)); - static_assert(units == 4); - for (unsigned i = 0; i < units; i++) { - this->s.hash[i] = CEPHTOH_64(p_rec->s.hash[i]); - } - this->ref_tag = std::string(p, this->s.ref_tag_len); - p += p_rec->s.ref_tag_len; - - this->manifest_bl.append(p, this->s.manifest_len); - } - } - - 
//--------------------------------------------------------------------------- - size_t disk_record_t::serialize(char *buff) const - { - ceph_assert(this->s.rec_version == 0); - disk_record_t *p_rec = (disk_record_t*)buff; - p_rec->s.rec_version = 0; - p_rec->s.flags = this->s.flags; - p_rec->s.num_parts = HTOCEPH_16(this->s.num_parts); - p_rec->s.obj_name_len = HTOCEPH_16(this->obj_name.length()); - p_rec->s.bucket_name_len = HTOCEPH_16(this->bucket_name.length()); - - p_rec->s.md5_high = HTOCEPH_64(this->s.md5_high); - p_rec->s.md5_low = HTOCEPH_64(this->s.md5_low); - p_rec->s.obj_bytes_size = HTOCEPH_64(this->s.obj_bytes_size); - p_rec->s.object_version = HTOCEPH_64(this->s.object_version); - - p_rec->s.bucket_id_len = HTOCEPH_16(this->bucket_id.length()); - p_rec->s.tenant_name_len = HTOCEPH_16(this->tenant_name.length()); - p_rec->s.stor_class_len = HTOCEPH_16(this->stor_class.length()); - p_rec->s.ref_tag_len = HTOCEPH_16(this->ref_tag.length()); - p_rec->s.manifest_len = HTOCEPH_16(this->manifest_bl.length()); - char *p = buff + sizeof(this->s); - unsigned len = this->obj_name.length(); - std::memcpy(p, this->obj_name.data(), len); - p += len; - - len = this->bucket_name.length(); - std::memcpy(p, this->bucket_name.data(), len); - p += len; - - len = this->bucket_id.length(); - std::memcpy(p, this->bucket_id.data(), len); - p += len; - - len = this->tenant_name.length(); - std::memcpy(p, this->tenant_name.data(), len); - p += len; - - len = this->stor_class.length(); - std::memcpy(p, this->stor_class.data(), len); - p += len; - - if (this->s.flags.is_fastlane()) { - // TBD:: remove asserts - ceph_assert(this->s.ref_tag_len == 0); - ceph_assert(this->s.manifest_len == 0); - } - else { - p_rec->s.shared_manifest = HTOCEPH_64(this->s.shared_manifest); - // BLAKE3 hash has 256 bit splitted into multiple 64bit units - const unsigned units = (256 / (sizeof(uint64_t)*8)); - static_assert(units == 4); - for (unsigned i = 0; i < units; i++) { - p_rec->s.hash[i] = 
HTOCEPH_64(this->s.hash[i]); - } - len = this->ref_tag.length(); - std::memcpy(p, this->ref_tag.data(), len); - p += len; - - len = this->manifest_bl.length(); - const char *p_manifest = const_cast(this)->manifest_bl.c_str(); - std::memcpy(p, p_manifest, len); - p += len; - } - return (p - buff); - } - - //--------------------------------------------------------------------------- - size_t disk_record_t::length() const - { - return (sizeof(this->s) + - this->obj_name.length() + - this->bucket_name.length() + - this->bucket_id.length() + - this->tenant_name.length() + - this->stor_class.length() + - this->ref_tag.length() + - this->manifest_bl.length()); - } - - //--------------------------------------------------------------------------- - int disk_record_t::validate(const char *caller, - const DoutPrefixProvider* dpp, - disk_block_id_t block_id, - record_id_t rec_id) const - { - // optimistic approach - if (likely((this->s.rec_version == 0) && (this->length() <= MAX_REC_SIZE))) { - ldpp_dout(dpp, 20) << __func__ << "::success" << dendl; - return 0; - } - - // wrong version - if (this->s.rec_version != 0) { - // TBD - //p_stats->failed_wrong_ver++; - ldpp_dout(dpp, 5) << __func__ << "::" << caller << "::ERR: Bad record version: " - << this->s.rec_version - << "::block_id=" << block_id - << "::rec_id=" << rec_id - << dendl; - return -EPROTO; // Protocol error - } - - // if arrived here record size is too large - // TBD - //p_stats->failed_rec_overflow++; - ldpp_dout(dpp, 5) << __func__ << "::" << caller << "::ERR: record size too big: " - << this->length() - << "::block_id=" << block_id - << "::rec_id=" << rec_id - << dendl; - return -EOVERFLOW; // maybe should use -E2BIG ?? 
- } - - //--------------------------------------------------------------------------- - std::ostream &operator<<(std::ostream &stream, const disk_record_t & rec) - { - stream << rec.obj_name << "::" << rec.s.obj_name_len << "\n"; - stream << rec.bucket_name << "::" << rec.s.bucket_name_len << "\n"; - stream << rec.bucket_id << "::" << rec.s.bucket_id_len << "\n"; - stream << rec.tenant_name << "::" << rec.s.tenant_name_len << "\n"; - stream << rec.stor_class << "::" << rec.s.stor_class_len << "\n"; - stream << rec.ref_tag << "::" << rec.s.ref_tag_len << "\n"; - stream << "num_parts = " << rec.s.num_parts << "\n"; - stream << "obj_size = " << rec.s.obj_bytes_size/1024 <<" KiB" << "\n"; - stream << "MD5 = " << std::hex << rec.s.md5_high << rec.s.md5_low << "\n"; - stream << "HASH = "; - // BLAKE3 hash has 256 bit splitted into multiple 64bit units - const unsigned units = (256 / (sizeof(uint64_t)*8)); - static_assert(units == 4); - for (unsigned i = 0; i < units; i++) { - stream << rec.s.hash[i]; - } - stream << "\n"; - - if (rec.has_shared_manifest()) { - stream << "Shared Manifest Object\n"; - } - else { - stream << "Dedicated Manifest Object\n"; - } - stream << "Manifest len=" << rec.s.manifest_len << "\n"; - return stream; - } - - //--------------------------------------------------------------------------- - void disk_block_t::init(work_shard_t worker_id, uint32_t seq_number) - { - disk_block_header_t *p_header = get_header(); - p_header->offset = sizeof(disk_block_header_t); - p_header->rec_count = 0; - p_header->block_id = disk_block_id_t(worker_id, seq_number); - } - - //--------------------------------------------------------------------------- - int disk_block_header_t::verify(disk_block_id_t expected_block_id, const DoutPrefixProvider* dpp) - { - if (unlikely(offset != BLOCK_MAGIC && offset != LAST_BLOCK_MAGIC)) { - ldpp_dout(dpp, 1) << __func__ << "::ERR::bad magic number (0x" << std::hex << offset << std::dec << ")" << dendl; - return -EINVAL; - } - - if 
(unlikely(rec_count > MAX_REC_IN_BLOCK) ) { - ldpp_dout(dpp, 1) << __func__ << "::ERR::rec_count=" << rec_count << " > MAX_REC_IN_BLOCK" << dendl; - return -EINVAL; - } - - if (unlikely(this->block_id != expected_block_id)) { - ldpp_dout(dpp, 1) << __func__ << "::ERR::block_id=" << block_id - << "!= expected_block_id=" << expected_block_id << dendl; - return -EINVAL; - } - - return 0; - } - - //--------------------------------------------------------------------------- - record_id_t disk_block_t::add_record(const disk_record_t *p_rec, - const DoutPrefixProvider *dpp) - { - disk_block_header_t *p_header = get_header(); - if (unlikely(p_header->rec_count >= MAX_REC_IN_BLOCK)) { - ldpp_dout(dpp, 20) << __func__ << "::rec_count=" << p_header->rec_count - << ", MAX_REC_IN_BLOCK=" << MAX_REC_IN_BLOCK << dendl; - return MAX_REC_IN_BLOCK; - } - - if ((DISK_BLOCK_SIZE - p_header->offset) >= p_rec->length()) { - p_header->rec_offsets[p_header->rec_count] = p_header->offset; - unsigned rec_id = p_header->rec_count; - p_header->rec_count ++; - p_rec->serialize(data+p_header->offset); - p_header->offset += p_rec->length(); - return rec_id; - } - else { - return MAX_REC_IN_BLOCK; - } - } - - //--------------------------------------------------------------------------- - void disk_block_t::close_block(const DoutPrefixProvider* dpp, bool has_more) - { - disk_block_header_t *p_header = get_header(); - ldpp_dout(dpp, 20) << __func__ << "::rec_count=" << p_header->rec_count - << ", has_more=" << (has_more? 
"TRUE" : "FALSE") << dendl; - - memset(data + p_header->offset, 0, (DISK_BLOCK_SIZE - p_header->offset)); - if (has_more) { - p_header->offset = HTOCEPH_16(BLOCK_MAGIC); - } - else { - p_header->offset = HTOCEPH_16(LAST_BLOCK_MAGIC); - } - for (unsigned i = 0; i < p_header->rec_count; i++) { - p_header->rec_offsets[i] = HTOCEPH_16(p_header->rec_offsets[i]); - } - p_header->rec_count = HTOCEPH_16(p_header->rec_count); - p_header->block_id = HTOCEPH_32((uint32_t)p_header->block_id); - // TBD: CRC - } - - //--------------------------------------------------------------------------- - void disk_block_header_t::deserialize() - { - this->offset = CEPHTOH_16(this->offset); - this->rec_count = CEPHTOH_16(this->rec_count); - this->block_id = CEPHTOH_32((uint32_t)this->block_id); - for (unsigned i = 0; i < this->rec_count; i++) { - this->rec_offsets[i] = CEPHTOH_16(this->rec_offsets[i]); - } - } - - //--------------------------------------------------------------------------- - disk_block_seq_t::disk_block_seq_t(const DoutPrefixProvider* dpp_in, - disk_block_t *p_arr_in, - work_shard_t worker_id, - md5_shard_t md5_shard, - worker_stats_t *p_stats_in) - { - activate(dpp_in, p_arr_in, worker_id, md5_shard, p_stats_in); - } - - //--------------------------------------------------------------------------- - void disk_block_seq_t::activate(const DoutPrefixProvider* dpp_in, - disk_block_t *p_arr_in, - work_shard_t worker_id, - md5_shard_t md5_shard, - worker_stats_t *p_stats_in) - { - dpp = dpp_in; - p_arr = p_arr_in; - d_worker_id = worker_id; - d_md5_shard = md5_shard; - p_stats = p_stats_in; - p_curr_block = nullptr; - d_seq_number = 0; - - memset(p_arr, 0, sizeof(disk_block_t)); - slab_reset(); - } - - //--------------------------------------------------------------------------- - [[maybe_unused]]static int print_manifest(const DoutPrefixProvider *dpp, - RGWRados *rados, - const bufferlist &manifest_bl) - { - RGWObjManifest manifest; - try { - auto bl_iter = 
manifest_bl.cbegin(); - decode(manifest, bl_iter); - } catch (buffer::error& err) { - ldpp_dout(dpp, 1) << __func__ << "::ERROR: unable to decode manifest" << dendl; - return -EINVAL; - } - - unsigned idx = 0; - for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p, ++idx) { - rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados); - ldpp_dout(dpp, 20) << idx << "] " << raw_obj.oid << dendl; - } - ldpp_dout(dpp, 20) << "==============================================" << dendl; - return 0; - } - - //--------------------------------------------------------------------------- - std::ostream& operator<<(std::ostream& out, const disk_block_id_t& block_id) - { - std::ios_base::fmtflags flags = out.flags(); - out << std::hex << "0x" - << (uint32_t)block_id.get_work_shard_id() << "::" - << (uint32_t)block_id.get_slab_id() << "::" - << (uint32_t)block_id.get_block_offset(); - - if (flags & std::ios::dec) { - out << std::dec; - } - return out; - } - - //--------------------------------------------------------------------------- - std::string disk_block_id_t::get_slab_name(md5_shard_t md5_shard) const - { - // SLAB.MD5_ID.WORKER_ID.SLAB_SEQ_ID - const char *SLAB_NAME_FORMAT = "SLB.%03X.%02X.%04X"; - static constexpr uint32_t SLAB_NAME_SIZE = 16; - char name_buf[SLAB_NAME_SIZE]; - slab_id_t slab_id = get_slab_id(); - work_shard_t work_id = get_work_shard_id(); - unsigned n = snprintf(name_buf, sizeof(name_buf), SLAB_NAME_FORMAT, - md5_shard, work_id, slab_id); - std::string oid(name_buf, n); - return oid; - } - - //--------------------------------------------------------------------------- - int load_record(librados::IoCtx &ioctx, - const disk_record_t *p_tgt_rec, - disk_record_t *p_src_rec, /* OUT */ - disk_block_id_t block_id, - record_id_t rec_id, - md5_shard_t md5_shard, - const DoutPrefixProvider *dpp) - { - std::string oid(block_id.get_slab_name(md5_shard)); - int read_len = DISK_BLOCK_SIZE; - static_assert(sizeof(disk_block_t) == DISK_BLOCK_SIZE); - 
int byte_offset = block_id.get_block_offset() * DISK_BLOCK_SIZE; - bufferlist bl; - int ret = ioctx.read(oid, bl, read_len, byte_offset); - if (unlikely(ret != read_len)) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to read block from " << oid - << "::ret=" << ret << "::err=" << cpp_strerror(-ret)<egress_slabs++; - slab_reset(); - return ret; - } - - //--------------------------------------------------------------------------- - int disk_block_seq_t::flush_disk_records(librados::IoCtx &ioctx) - { - ceph_assert(p_arr); - ldpp_dout(dpp, 20) << __func__ << "::worker_id=" << (uint32_t)d_worker_id - << ", md5_shard=" << (uint32_t)d_md5_shard << dendl; - - // we need to force flush at the end of a cycle even if there was no work done - // it is used as a signal to worker in the next step - if (p_curr_block == &p_arr[0] && p_curr_block->is_empty()) { - ldpp_dout(dpp, 20) << __func__ << "::Empty buffers, generate terminating block" << dendl; - } - p_stats->egress_blocks++; - p_curr_block->close_block(dpp, false); - - int ret = flush(ioctx); - return ret; - } - - //--------------------------------------------------------------------------- - int disk_block_seq_t::add_record(librados::IoCtx &ioctx, - const disk_record_t *p_rec, // IN-OUT - record_info_t *p_rec_info) // OUT-PARAM - { - disk_block_id_t null_block_id; - int ret = p_rec->validate(__func__, dpp, null_block_id, MAX_REC_IN_BLOCK); - if (unlikely(ret != 0)) { - // TBD - //p_stats->failed_rec_store++; - return ret; - } - - p_stats->egress_records ++; - // first, try and add the record to the current open block - p_rec_info->rec_id = p_curr_block->add_record(p_rec, dpp); - if (p_rec_info->rec_id < MAX_REC_IN_BLOCK) { - p_rec_info->block_id = p_curr_block->get_block_id(); - return 0; - } - else { - // Not enough space left in current block, close it and open the next block - ldpp_dout(dpp, 20) << __func__ << "::Block is full-> close and move to next" << dendl; - p_stats->egress_blocks++; - 
p_curr_block->close_block(dpp, true); - } - - // Do we have more Blocks in the block-array ? - if (p_curr_block < last_block()) { - p_curr_block ++; - d_seq_number ++; - p_curr_block->init(d_worker_id, d_seq_number); - p_rec_info->rec_id = p_curr_block->add_record(p_rec, dpp); - } - else { - ldpp_dout(dpp, 20) << __func__ << "::calling flush()" << dendl; - ret = flush(ioctx); - p_rec_info->rec_id = p_curr_block->add_record(p_rec, dpp); - } - - p_rec_info->block_id = p_curr_block->get_block_id(); - return ret; - } - - //--------------------------------------------------------------------------- - disk_block_array_t::disk_block_array_t(const DoutPrefixProvider* dpp, - uint8_t *raw_mem, - uint64_t raw_mem_size, - work_shard_t worker_id, - worker_stats_t *p_stats, - md5_shard_t num_md5_shards) - { - d_num_md5_shards = num_md5_shards; - d_worker_id = worker_id; - disk_block_t *p = (disk_block_t *)raw_mem; - disk_block_t *p_end = (disk_block_t *)(raw_mem + raw_mem_size); - - for (unsigned md5_shard = 0; md5_shard < d_num_md5_shards; md5_shard++) { - ldpp_dout(dpp, 20) << __func__ << "::p=" << p << "::p_end=" << p_end << dendl; - if (p + DISK_BLOCK_COUNT <= p_end) { - d_disk_arr[md5_shard].activate(dpp, p, d_worker_id, md5_shard, p_stats); - p += DISK_BLOCK_COUNT; - } - else { - ldpp_dout(dpp, 1) << __func__ << "::ERR: buffer overflow! 
" - << "::md5_shard=" << md5_shard << "/" << d_num_md5_shards - << "::raw_mem_size=" << raw_mem_size << dendl; - ldpp_dout(dpp, 1) << __func__ - << "::sizeof(disk_block_t)=" << sizeof(disk_block_t) - << "::DISK_BLOCK_COUNT=" << DISK_BLOCK_COUNT << dendl; - ceph_abort(); - } - } - } - - //--------------------------------------------------------------------------- - void disk_block_array_t::flush_output_buffers(const DoutPrefixProvider* dpp, - librados::IoCtx &ioctx) - { - for (md5_shard_t md5_shard = 0; md5_shard < d_num_md5_shards; md5_shard++) { - ldpp_dout(dpp, 20) <<__func__ << "::flush buffers:: worker_id=" - << d_worker_id<< ", md5_shard=" << md5_shard << dendl; - d_disk_arr[md5_shard].flush_disk_records(ioctx); - } - } -} // namespace rgw::dedup diff --git a/src/rgw/rgw_dedup_store.h b/src/rgw/rgw_dedup_store.h deleted file mode 100644 index a89abb134206..000000000000 --- a/src/rgw/rgw_dedup_store.h +++ /dev/null @@ -1,304 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Author: Gabriel BenHanokh - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#pragma once -#include "common/dout.h" -#include "rgw_common.h" -#include "rgw_realm_reloader.h" -#include -#include -#include -#include -#include -#include -#include -#include "include/rados/rados_types.hpp" -#include "include/rados/buffer.h" -#include "include/rados/librados.hpp" -#include "rgw_dedup_utils.h" -#include "BLAKE3/c/blake3.h" - -namespace rgw::dedup { - struct key_t; -#define CEPHTOH_16 le16toh -#define CEPHTOH_32 le32toh -#define CEPHTOH_64 le64toh -#define HTOCEPH_16 htole16 -#define HTOCEPH_32 htole32 -#define HTOCEPH_64 htole64 - - static inline constexpr unsigned DISK_BLOCK_SIZE = 8*1024; - // we use 16 bit offset - static_assert(DISK_BLOCK_SIZE < 64*1024); - static constexpr unsigned DISK_BLOCK_COUNT = 256; - static_assert(DISK_BLOCK_COUNT <= (4*1024*1024/DISK_BLOCK_SIZE)); - static constexpr unsigned MAX_REC_IN_BLOCK = 32; - // we use 8bit record indices - static_assert(MAX_REC_IN_BLOCK < 0xFF); - using slab_id_t = uint16_t; - using block_offset_t = uint8_t; - using record_id_t = uint8_t; - - // disk_block_id_t is a 32 bits concataion of shard_id, slab_id and block_off - // ---8---- | -------16------- | ---8---- - // shard_id | slab_id | block_off - struct __attribute__ ((packed)) disk_block_id_t - { - public: - disk_block_id_t() { - block_id = 0; - } - - disk_block_id_t(work_shard_t shard_id, uint32_t seq_number) { - ceph_assert((seq_number & SEQ_NUMBER_MASK) == seq_number); - ceph_assert(shard_id <= MAX_WORK_SHARD); - block_id = (uint32_t)shard_id << OBJ_SHARD_SHIFT | seq_number; - } - - disk_block_id_t& operator =(const disk_block_id_t &other) { - this->block_id = other.block_id; - return *this; - } - - inline disk_block_id_t& operator =(uint32_t val) { - this->block_id = val; - return *this; - } - - inline bool operator ==(const disk_block_id_t &other) const { - return (this->block_id == other.block_id); - } - - inline explicit operator uint32_t() const { - return this->block_id; - } - - friend std::ostream& 
operator<<(std::ostream& os, const disk_block_id_t& block_id); - - std::string get_slab_name(md5_shard_t md5_shard) const; - - static inline slab_id_t seq_num_to_slab_id(uint32_t seq_number) { - return (seq_number & SLAB_ID_MASK) >> SLAB_ID_SHIFT; - } - - static inline uint32_t slab_id_to_seq_num(uint32_t slab_id) { - return (slab_id << SLAB_ID_SHIFT); - } - - inline block_offset_t get_block_offset() const { - return get_block_offset(get_seq_num()); - } - - inline work_shard_t get_work_shard_id() const { - return (block_id & OBJ_SHARD_MASK) >> OBJ_SHARD_SHIFT; - } - - private: - inline uint32_t get_seq_num() const { - return (block_id & SEQ_NUMBER_MASK); - } - - inline slab_id_t get_slab_id() const { - return seq_num_to_slab_id(get_seq_num()); - } - - inline block_offset_t get_block_offset(uint32_t seq_number) const { - return (seq_number & BLOCK_OFF_MASK); - } - - static constexpr uint32_t OBJ_SHARD_SHIFT = 24; - static constexpr uint32_t OBJ_SHARD_MASK = 0xFF000000; - - static constexpr uint32_t SEQ_NUMBER_SHIFT = 0; - static constexpr uint32_t SEQ_NUMBER_MASK = 0x00FFFFFF; - - static constexpr uint32_t SLAB_ID_SHIFT = 8; - static constexpr uint32_t SLAB_ID_MASK = 0x00FFFF00; - - static constexpr uint32_t BLOCK_OFF_SHIFT = 0; - static constexpr uint32_t BLOCK_OFF_MASK = 0x000000FF; - - uint32_t block_id; - }; - - struct disk_record_t - { - disk_record_t(const char *buff); - disk_record_t(const rgw::sal::Bucket *p_bucket, - const std::string &obj_name, - const parsed_etag_t *p_parsed_etag, - uint64_t obj_size, - const std::string &storage_class); - disk_record_t() {} - size_t serialize(char *buff) const; - size_t length() const; - int validate(const char *caller, - const DoutPrefixProvider* dpp, - disk_block_id_t block_id, - record_id_t rec_id) const; - inline bool has_shared_manifest() const { return s.flags.has_shared_manifest(); } - inline void set_shared_manifest() { s.flags.set_shared_manifest(); } - - struct __attribute__ ((packed)) packed_rec_t - { - 
uint8_t rec_version; // allows changing record format - dedup_flags_t flags; // 1 Byte flags - uint16_t num_parts; // For multipart upload (AWS MAX-PART is 10,000) - uint16_t obj_name_len; - uint16_t bucket_name_len; - - uint64_t md5_high; // High Bytes of the Object Data MD5 - uint64_t md5_low; // Low Bytes of the Object Data MD5 - uint64_t obj_bytes_size; - uint64_t object_version; - - uint16_t bucket_id_len; - uint16_t tenant_name_len; - uint16_t stor_class_len; - uint16_t ref_tag_len; - - uint16_t manifest_len; - uint8_t pad[6]; - - uint64_t shared_manifest; // 64bit hash of the SRC object manifest - uint64_t hash[4]; // 4 * 8 Bytes of BLAKE3 - }s; - std::string obj_name; - // TBD: find pool name making it easier to get ioctx - std::string bucket_name; - std::string bucket_id; - std::string tenant_name; - std::string ref_tag; - std::string stor_class; - bufferlist manifest_bl; - }; - static_assert(BLAKE3_OUT_LEN == sizeof(disk_record_t::packed_rec_t::hash)); - std::ostream &operator<<(std::ostream &stream, const disk_record_t & rec); - - static constexpr unsigned BLOCK_MAGIC = 0xFACE; - static constexpr unsigned LAST_BLOCK_MAGIC = 0xCAD7; - struct __attribute__ ((packed)) disk_block_header_t { - void deserialize(); - int verify(disk_block_id_t block_id, const DoutPrefixProvider* dpp); - uint16_t offset; - uint16_t rec_count; - disk_block_id_t block_id; - uint16_t rec_offsets[MAX_REC_IN_BLOCK]; - }; - static constexpr unsigned MAX_REC_SIZE = (DISK_BLOCK_SIZE - sizeof(disk_block_header_t)); - - struct __attribute__ ((packed)) disk_block_t - { - const disk_block_header_t* get_header() const { return (disk_block_header_t*)data; } - disk_block_header_t* get_header() { return (disk_block_header_t*)data; } - bool is_empty() const { return (get_header()->rec_count == 0); } - - void init(work_shard_t worker_id, uint32_t seq_number); - record_id_t add_record(const disk_record_t *p_rec, const DoutPrefixProvider *dpp); - void close_block(const DoutPrefixProvider* dpp, bool 
has_more); - disk_block_id_t get_block_id() { - disk_block_header_t *p_header = get_header(); - return p_header->block_id; - } - char data[DISK_BLOCK_SIZE]; - }; - - int load_record(librados::IoCtx &ioctx, - const disk_record_t *p_tgt_rec, - disk_record_t *p_src_rec, /* OUT */ - disk_block_id_t block_id, - record_id_t rec_id, - md5_shard_t md5_shard, - const DoutPrefixProvider *dpp); - - int load_slab(librados::IoCtx &ioctx, - bufferlist &bl, - md5_shard_t md5_shard, - work_shard_t worker_id, - uint32_t seq_number, - const DoutPrefixProvider* dpp); - - int store_slab(librados::IoCtx &ioctx, - bufferlist &bl, - md5_shard_t md5_shard, - work_shard_t worker_id, - uint32_t seq_number, - const DoutPrefixProvider* dpp); - - class disk_block_array_t; - class disk_block_seq_t - { - friend class disk_block_array_t; - public: - struct record_info_t { - disk_block_id_t block_id; - record_id_t rec_id; - }; - - disk_block_seq_t(const DoutPrefixProvider* dpp_in, - disk_block_t *p_arr_in, - work_shard_t worker_id, - md5_shard_t md5_shard, - worker_stats_t *p_stats_in); - int flush_disk_records(librados::IoCtx &ioctx); - md5_shard_t get_md5_shard() { return d_md5_shard; } - int add_record(librados::IoCtx &ioctx, - const disk_record_t *p_rec, // IN-OUT - record_info_t *p_rec_info); // OUT-PARAM - - private: - disk_block_seq_t() {;} - void activate(const DoutPrefixProvider* _dpp, - disk_block_t *_p_arr, - work_shard_t worker_id, - md5_shard_t md5_shard, - worker_stats_t *p_stats); - inline const disk_block_t* last_block() { return &p_arr[DISK_BLOCK_COUNT-1]; } - int flush(librados::IoCtx &ioctx); - void slab_reset() { - p_curr_block = p_arr; - p_curr_block->init(d_worker_id, d_seq_number); - } - - disk_block_t *p_arr = nullptr; - disk_block_t *p_curr_block = nullptr; - worker_stats_t *p_stats = nullptr; - const DoutPrefixProvider *dpp = nullptr; - uint32_t d_seq_number = 0; - work_shard_t d_worker_id = NULL_WORK_SHARD; - md5_shard_t d_md5_shard = NULL_MD5_SHARD; - }; - - class 
disk_block_array_t - { - public: - disk_block_array_t(const DoutPrefixProvider* _dpp, - uint8_t *raw_mem, - uint64_t raw_mem_size, - work_shard_t worker_id, - worker_stats_t *p_worker_stats, - md5_shard_t num_md5_shards); - void flush_output_buffers(const DoutPrefixProvider* dpp, - librados::IoCtx &ioctx); - disk_block_seq_t* get_shard_block_seq(uint64_t md5_low) { - md5_shard_t md5_shard = md5_low % d_num_md5_shards; - return d_disk_arr + md5_shard; - } - - //private: - disk_block_seq_t d_disk_arr[MAX_MD5_SHARD]; - work_shard_t d_worker_id; - md5_shard_t d_num_md5_shards; - }; -} //namespace rgw::dedup diff --git a/src/rgw/rgw_dedup_table.cc b/src/rgw/rgw_dedup_table.cc deleted file mode 100644 index 09335655df62..000000000000 --- a/src/rgw/rgw_dedup_table.cc +++ /dev/null @@ -1,335 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Author: Gabriel BenHanokh - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "rgw_dedup_table.h" -#include "include/ceph_assert.h" -#include -#include - -namespace rgw::dedup { - - //--------------------------------------------------------------------------- - dedup_table_t::dedup_table_t(const DoutPrefixProvider* _dpp, - uint32_t _head_object_size, - uint8_t *p_slab, - uint64_t slab_size) - { - dpp = _dpp; - head_object_size = _head_object_size; - memset(p_slab, 0, slab_size); - hash_tab = (table_entry_t*)p_slab; - entries_count = slab_size/sizeof(table_entry_t); - values_count = 0; - occupied_count = 0; - } - - //--------------------------------------------------------------------------- - void dedup_table_t::remove_singletons_and_redistribute_keys() - { - for (uint32_t tab_idx = 0; tab_idx < entries_count; tab_idx++) { - if (!hash_tab[tab_idx].val.is_occupied()) { - continue; - } - - if (hash_tab[tab_idx].val.is_singleton()) { - hash_tab[tab_idx].val.clear_flags(); - redistributed_clear++; - continue; - } - - const key_t &key = hash_tab[tab_idx].key; - // This is an approximation only since size is stored in 4KB resolution - uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units); - if (!key.multipart_object() && (byte_size_approx <= head_object_size)) { - hash_tab[tab_idx].val.clear_flags(); - redistributed_clear++; - continue; - } - - uint32_t key_idx = key.hash() % entries_count; - if (key_idx != tab_idx) { - uint64_t count = 1; - redistributed_count++; - uint32_t idx = key_idx; - while (hash_tab[idx].val.is_occupied() && - !hash_tab[idx].val.is_singleton() && - (hash_tab[idx].key != key)) { - count++; - idx = (idx + 1) % entries_count; - } - - if (idx != tab_idx) { - if (hash_tab[idx].val.is_occupied() && hash_tab[idx].val.is_singleton() ) { - redistributed_clear++; - } - if (idx == key_idx) { - redistributed_perfect++; - } - hash_tab[idx] = hash_tab[tab_idx]; - hash_tab[tab_idx].val.clear_flags(); - } - else { - redistributed_loopback++; - } - - redistributed_search_max = 
std::max(redistributed_search_max, count); - redistributed_search_total += count; - } - else { - redistributed_not_needed++; - } - } - } - - //--------------------------------------------------------------------------- - uint32_t dedup_table_t::find_entry(const key_t *p_key) const - { - uint32_t idx = p_key->hash() % entries_count; - - // search until we either find the key, or find an empty slot. - while (hash_tab[idx].val.is_occupied() && (hash_tab[idx].key != *p_key)) { - idx = (idx + 1) % entries_count; - } - return idx; - } - - //--------------------------------------------------------------------------- - int dedup_table_t::add_entry(key_t *p_key, - disk_block_id_t block_id, - record_id_t rec_id, - bool shared_manifest) - { - value_t new_val(block_id, rec_id, shared_manifest); - uint32_t idx = find_entry(p_key); - value_t &val = hash_tab[idx].val; - if (!val.is_occupied()) { - if (occupied_count < entries_count) { - occupied_count++; - } - else { - return -EOVERFLOW; - } - - hash_tab[idx].key = *p_key; - hash_tab[idx].val = new_val; - ldpp_dout(dpp, 20) << __func__ << "::add new entry" << dendl; - ceph_assert(val.count == 1); - } - else { - ceph_assert(hash_tab[idx].key == *p_key); - val.count ++; - if (!val.has_shared_manifest() && shared_manifest) { - // replace value! 
- ldpp_dout(dpp, 20) << __func__ << "::Replace with shared_manifest::[" - << val.block_idx << "/" << (int)val.rec_id << "] -> [" - << block_id << "/" << (int)rec_id << "]" << dendl; - new_val.count = val.count; - hash_tab[idx].val = new_val; - } - ceph_assert(val.count > 1); - } - values_count++; - ldpp_dout(dpp, 20) << __func__ << "::COUNT="<< val.count << dendl; - return 0; - } - - //--------------------------------------------------------------------------- - void dedup_table_t::update_entry(key_t *p_key, - disk_block_id_t block_id, - record_id_t rec_id, - bool shared_manifest) - { - uint32_t idx = find_entry(p_key); - ceph_assert(hash_tab[idx].key == *p_key); - value_t &val = hash_tab[idx].val; - ceph_assert(val.is_occupied()); - // we only update non-singletons since we purge singletons after the first pass - ceph_assert(val.count > 1); - - // need to overwrite the block_idx/rec_id from the first pass - // unless already set with shared_manifest with the correct block-id/rec-id - // We only set the shared_manifest flag on the second pass where we - // got valid block-id/rec-id - if (!val.has_shared_manifest()) { - // replace value! 
- value_t new_val(block_id, rec_id, shared_manifest); - new_val.count = val.count; - hash_tab[idx].val = new_val; - ldpp_dout(dpp, 20) << __func__ << "::Replaced table entry::[" - << val.block_idx << "/" << (int)val.rec_id << "] -> [" - << block_id << "/" << (int)rec_id << "]" << dendl; - } - } - - //--------------------------------------------------------------------------- - int dedup_table_t::set_shared_manifest_src_mode(const key_t *p_key, - disk_block_id_t block_id, - record_id_t rec_id) - { - uint32_t idx = find_entry(p_key); - value_t &val = hash_tab[idx].val; - if (val.is_occupied()) { - if (val.block_idx == block_id && val.rec_id == rec_id) { - val.set_shared_manifest_src(); - return 0; - } - } - - return -ENOENT; - } - - //--------------------------------------------------------------------------- - int dedup_table_t::get_val(const key_t *p_key, struct value_t *p_val /*OUT*/) - { - uint32_t idx = find_entry(p_key); - const value_t &val = hash_tab[idx].val; - if (!val.is_occupied()) { - return -ENOENT; - } - - *p_val = val; - return 0; - } - - //--------------------------------------------------------------------------- - void dedup_table_t::count_duplicates(dedup_stats_t *p_small_objs, - dedup_stats_t *p_big_objs, - uint64_t *p_duplicate_head_bytes) - { - for (uint32_t tab_idx = 0; tab_idx < entries_count; tab_idx++) { - if (!hash_tab[tab_idx].val.is_occupied()) { - continue; - } - - const key_t &key = hash_tab[tab_idx].key; - // This is an approximation only since size is stored in 4KB resolution - uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units); - uint32_t duplicate_count = (hash_tab[tab_idx].val.count -1); - - // skip small single part objects which we can't dedup - if (!key.multipart_object() && (byte_size_approx <= head_object_size)) { - if (hash_tab[tab_idx].val.is_singleton()) { - p_small_objs->singleton_count++; - } - else { - p_small_objs->duplicate_count += duplicate_count; - p_small_objs->unique_count ++; - 
p_small_objs->dedup_bytes_estimate += (duplicate_count * byte_size_approx); - } - continue; - } - - if (hash_tab[tab_idx].val.is_singleton()) { - p_big_objs->singleton_count++; - } - else { - ceph_assert(hash_tab[tab_idx].val.count > 1); - uint64_t dup_bytes_approx = calc_deduped_bytes(head_object_size, - key.num_parts, - byte_size_approx); - p_big_objs->dedup_bytes_estimate += (duplicate_count * dup_bytes_approx); - p_big_objs->duplicate_count += duplicate_count; - p_big_objs->unique_count ++; - - if (!key.multipart_object()) { - // single part objects duplicate the head object when dedup is used - uint64_t dup_head_bytes = duplicate_count * head_object_size; - *p_duplicate_head_bytes += dup_head_bytes; - } - } - } - } - -} // namespace rgw::dedup - -#if 0 -#include -#include -#include -#include -#include -#include - -//--------------------------------------------------------------------------- -int main() -{ - static constexpr unsigned MAX_ENTRIES = 1024; - rgw::dedup::key_t *key_tab = new rgw::dedup::key_t[MAX_ENTRIES]; - if (!key_tab) { - std::cerr << "faild alloc!" 
<< std::endl; - return 1; - } - rgw::dedup::key_t *p_key = key_tab; - //rgw::dedup::dedup_table_t tab(MAX_ENTRIES + MAX_ENTRIES/5); - rgw::dedup::dedup_table_t tab(MAX_ENTRIES); - - std::cout << "sizeof(key)=" << sizeof(rgw::dedup::key_t) << std::endl; - // Seed with a real random value, if available - std::random_device r; - // Choose a random mean between 1 ULLONG_MAX - std::default_random_engine e1(r()); - std::uniform_int_distribution uniform_dist(1, std::numeric_limits::max()); - - for (unsigned i = 0; i < MAX_ENTRIES; i++) { - uint64_t md5_high = uniform_dist(e1); - uint64_t md5_low = uniform_dist(e1); - uint32_t size_4k_units = std::rand(); - uint16_t num_parts = std::rand(); - //std::cout << std::hex << md5_high << "::" << md5_low << "::" << block_id << std::endl; - rgw::dedup::key_t key(md5_high, md5_low, size_4k_units, num_parts); - *p_key = key; - p_key++; - } - work_shard_t work_shard = 3; - for (unsigned i = 0; i < MAX_ENTRIES; i++) { - disk_block_id_t block_id(worker_id, std::rand()); - tab.add_entry(key_tab+i, block_id, 0, false, false); - } - double avg = (double)total / MAX_ENTRIES; - std::cout << "Insert::num entries=" << MAX_ENTRIES << ", total=" << total - << ", avg=" << avg << ", max=" << max << std::endl; - std::cout << "==========================================\n"; - - total = 0; - max = 0; - for (unsigned i = 0; i < MAX_ENTRIES; i++) { - tab.find_entry(key_tab+i); - } - avg = (double)total / MAX_ENTRIES; - std::cout << "Find::num entries=" << MAX_ENTRIES << ", total=" << total - << ", avg=" << avg << ", max=" << max << std::endl; - std::cout << "==========================================\n"; - tab.remove_singletons_and_redistribute_keys(); - tab.print_redistribute_stats(); - tab.stat_counters_reset(); - std::cout << "==========================================\n"; - total = 0; - max = 0; - uint32_t cnt = 0; - for (unsigned i = 0; i < MAX_ENTRIES; i++) { - rgw::dedup::key_t *p_key = key_tab+i; - tab.find_entry(p_key); - cnt++; -#if 0 - if 
(p_key->md5_high % 5 == 0) { - tab.find_entry(p_key); - cnt++; - } -#endif - } - avg = (double)total / cnt; - std::cout << "num entries=" << cnt << ", total=" << total - << ", avg=" << avg << ", max=" << max << std::endl; -} -#endif diff --git a/src/rgw/rgw_dedup_table.h b/src/rgw/rgw_dedup_table.h deleted file mode 100644 index 51d36006944f..000000000000 --- a/src/rgw/rgw_dedup_table.h +++ /dev/null @@ -1,148 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Author: Gabriel BenHanokh - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#pragma once -#include -#include -#include -#include "common/dout.h" -#include "rgw_dedup_store.h" -namespace rgw::dedup { - - // 24 Bytes key - struct key_t { - key_t() { ;} - key_t(uint64_t _md5_high, - uint64_t _md5_low, - uint32_t _size_4k_units, - uint16_t _num_parts, - uint8_t _stor_class_idx) { - md5_high = _md5_high; - md5_low = _md5_low; - size_4k_units = _size_4k_units; - num_parts = _num_parts; - stor_class_idx = _stor_class_idx; - pad8 = 0; - } - - bool operator==(const struct key_t& other) const { - return (memcmp(this, &other, sizeof(other)) == 0); - } - - bool operator!=(const struct key_t& other) const { - return !operator==(other); - } - - uint64_t hash() const { - // The MD5 is already a hashing function so no need for another hash - return this->md5_low; - } - - bool multipart_object() const { - return num_parts > 0; - } - - uint64_t md5_high; // High Bytes of the Object Data MD5 - uint64_t md5_low; // Low Bytes of the Object Data MD5 - uint32_t size_4k_units; // Object size in 4KB units max out at 16TB (AWS MAX-SIZE is 5TB) - uint16_t num_parts; // How many parts were used in multipart upload (AWS MAX-PART is 10,000) - uint8_t stor_class_idx;// 
storage class id - uint8_t pad8; - } __attribute__((__packed__)); - static_assert(sizeof(key_t) == 24); - - class dedup_table_t { - public: - // 8 Bytes Value - struct value_t { - value_t() { - this->block_idx = 0xFFFFFFFF; - this->count = 0; - this->rec_id = 0xFF; - this->flags.clear(); - } - - value_t(disk_block_id_t block_id, record_id_t rec_id, bool shared_manifest) { - this->block_idx = block_id; - this->count = 1; - this->rec_id = rec_id; - this->flags.clear(); - this->flags.set_occupied(); - if (shared_manifest) { - flags.set_shared_manifest(); - } - } - - inline void clear_flags() { flags.clear(); } - inline bool has_shared_manifest() const {return flags.has_shared_manifest(); } - inline void set_shared_manifest_src() { this->flags.set_shared_manifest(); } - inline bool is_singleton() const { return (count == 1); } - inline bool is_occupied() const { return flags.is_occupied(); } - inline void set_occupied() { this->flags.set_occupied(); } - inline void clear_occupied() { this->flags.clear_occupied(); } - - disk_block_id_t block_idx; // 32 bits - uint16_t count; // 16 bits - record_id_t rec_id; // 8 bits - dedup_flags_t flags; // 8 bits - } __attribute__((__packed__)); - static_assert(sizeof(value_t) == 8); - - dedup_table_t(const DoutPrefixProvider* _dpp, - uint32_t _head_object_size, - uint8_t *p_slab, - uint64_t slab_size); - int add_entry(key_t *p_key, disk_block_id_t block_id, record_id_t rec_id, - bool shared_manifest); - void update_entry(key_t *p_key, disk_block_id_t block_id, record_id_t rec_id, - bool shared_manifest); - - int get_val(const key_t *p_key, struct value_t *p_val /*OUT*/); - - int set_shared_manifest_src_mode(const key_t *p_key, - disk_block_id_t block_id, - record_id_t rec_id); - - void count_duplicates(dedup_stats_t *p_small_objs_stat, - dedup_stats_t *p_big_objs_stat, - uint64_t *p_duplicate_head_bytes); - - void remove_singletons_and_redistribute_keys(); - private: - // 32 Bytes unified entries - struct table_entry_t { - key_t 
key; - value_t val; - } __attribute__((__packed__)); - static_assert(sizeof(table_entry_t) == 32); - - uint32_t find_entry(const key_t *p_key) const; - uint32_t values_count = 0; - uint32_t entries_count = 0; - uint32_t occupied_count = 0; - uint32_t head_object_size = (4ULL * 1024 * 1024); - table_entry_t *hash_tab = nullptr; - - // stat counters - uint64_t redistributed_count = 0; - uint64_t redistributed_search_total = 0; - uint64_t redistributed_search_max = 0; - uint64_t redistributed_loopback = 0; - uint64_t redistributed_perfect = 0; - uint64_t redistributed_clear = 0; - uint64_t redistributed_not_needed = 0; - const DoutPrefixProvider* dpp; - }; - -} //namespace rgw::dedup diff --git a/src/rgw/rgw_dedup_utils.cc b/src/rgw/rgw_dedup_utils.cc deleted file mode 100644 index baadee5aeef5..000000000000 --- a/src/rgw/rgw_dedup_utils.cc +++ /dev/null @@ -1,697 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Author: Gabriel BenHanokh - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "rgw_dedup_utils.h" -#include "common/ceph_crypto.h" - -namespace rgw::dedup { - //--------------------------------------------------------------------------- - std::ostream& operator<<(std::ostream &out, const dedup_req_type_t& dedup_type) - { - if (dedup_type == dedup_req_type_t::DEDUP_TYPE_NONE) { - out << "DEDUP_TYPE_NONE"; - } - else if (dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE) { - out << "DEDUP_TYPE_ESTIMATE"; - } - else if (dedup_type == dedup_req_type_t::DEDUP_TYPE_FULL) { - out << "DEDUP_TYPE_FULL"; - } - else { - out << "\n*** unexpected dedup_type ***\n"; - } - - return out; - } - - //--------------------------------------------------------------------------- - dedup_stats_t& dedup_stats_t::operator+=(const dedup_stats_t& other) - { - this->singleton_count += other.singleton_count; - this->unique_count += other.unique_count; - this->duplicate_count += other.duplicate_count; - this->dedup_bytes_estimate += other.dedup_bytes_estimate; - return *this; - } - - //--------------------------------------------------------------------------- - std::ostream& operator<<(std::ostream &out, const dedup_stats_t& stats) - { - out << "::singleton_count=" << stats.singleton_count - << "::unique_count=" << stats.unique_count - << "::duplicate_count=" << stats.duplicate_count - << "::duplicated_bytes=" << stats.dedup_bytes_estimate; - return out; - } - - //--------------------------------------------------------------------------- - void encode(const dedup_stats_t& ds, ceph::bufferlist& bl) - { - ENCODE_START(1, 1, bl); - encode(ds.singleton_count, bl); - encode(ds.unique_count, bl); - encode(ds.duplicate_count, bl); - encode(ds.dedup_bytes_estimate, bl); - ENCODE_FINISH(bl); - } - - //--------------------------------------------------------------------------- - void decode(dedup_stats_t& ds, ceph::bufferlist::const_iterator& bl) - { - DECODE_START(1, bl); - decode(ds.singleton_count, bl); - decode(ds.unique_count, bl); - 
decode(ds.duplicate_count, bl); - decode(ds.dedup_bytes_estimate, bl); - DECODE_FINISH(bl); - } - - // convert a hex-string to a 64bit integer (max 16 hex digits) - //--------------------------------------------------------------------------- - bool hex2int(const char *p, const char *p_end, uint64_t *p_val) - { - if (p_end - p <= (int)(sizeof(uint64_t) * 2)) { - uint64_t val = 0; - while (p < p_end) { - // get current character then increment - uint8_t byte = *p++; - // transform hex character to the 4bit equivalent number, using the ASCII table indexes - if (byte >= '0' && byte <= '9') { - byte = byte - '0'; - } - else if (byte >= 'a' && byte <='f') { - byte = byte - 'a' + 10; - } - else if (byte >= 'A' && byte <='F') { - byte = byte - 'A' + 10; - } - else { - // terminate on the first non hex char - return false; - } - // shift 4 to make space for new digit, and add the 4 bits of the new digit - val = (val << 4) | (byte & 0xF); - } - *p_val = val; - return true; - } - else { - return false; - } - } - - //--------------------------------------------------------------------------- - bool dec2int(const char *p, const char* p_end, uint16_t *p_val) - { - uint16_t val = 0; - while (p < p_end) { - uint8_t byte = *p++; - if (byte >= '0' && byte <= '9') { - val = val * 10 + (byte - '0'); - } - else { - // terminate on the first non hex char - return false; - } - } - *p_val = val; - return true; - } - - // 16Bytes MD5 takes 32 chars - const unsigned MD5_LENGTH = 32; - - //--------------------------------------------------------------------------- - static bool get_num_parts(const std::string & etag, uint16_t *p_num_parts) - { - // Amazon S3 multipart upload Maximum number = 10,000 - const unsigned MAX_PARTS = 10000; - if (etag.length() <= MD5_LENGTH) { - // i.e. 
no multipart - *p_num_parts = 0; - return true; - } - - // Amazon S3 multipart upload Maximum number = 10,000 (5 decimal digits) - // We need 1 extra byte for the '-' delimiter and 1 extra byte for '"' at the end - // 7 Bytes should suffice, but we roundup to 8 Bytes - const unsigned MAX_PART_LEN = 8; - if (unlikely(etag.length() > MD5_LENGTH + MAX_PART_LEN)) { - // illegal ETAG - return false; - } - - std::string::size_type n = etag.find('-', etag.length() - MAX_PART_LEN); - if (n != std::string::npos) { - char buff[MAX_PART_LEN]; - // again, 1 extra byte for the '-' delimiter - unsigned copy_size = etag.length() - (n + 1); - if (copy_size <= MAX_PART_LEN) { - unsigned nbytes = etag.copy(buff, copy_size, n+1); - uint16_t num_parts; - const unsigned MAX_UINT16_DIGITS = 5; // 65536 - if (nbytes <= MAX_UINT16_DIGITS) { - if (dec2int(buff, buff+nbytes, &num_parts) && num_parts <= MAX_PARTS) { - *p_num_parts = num_parts; - return true; - } // else, not all digits are legal - } // else, more than 5 digits - } // else, copy len too large - } // else, '-' delimiter was not found - - // illegal number of parts - return false; - } - - //--------------------------------------------------------------------------- - bool parse_etag_string(const std::string& etag, parsed_etag_t *parsed_etag) - { - char buff[MD5_LENGTH*2]; - uint16_t num_parts = 0; - if (get_num_parts(etag, &num_parts)) { - etag.copy(buff, MD5_LENGTH, 0); - uint64_t high, low; - if (hex2int(buff, buff+16, &high)) { - if (hex2int(buff+16, buff+32, &low)) { - parsed_etag->md5_high = high; // High Bytes of the Object Data MD5 - parsed_etag->md5_low = low; // Low Bytes of the Object Data MD5 - parsed_etag->num_parts = num_parts; // How many parts were used in multipart upload - return true; - } - } - } - - // an illegal etag string - return false; - } - - //--------------------------------------------------------------------------- - void etag_to_bufferlist(uint64_t md5_high, uint64_t md5_low, uint16_t num_parts, - 
ceph::bufferlist *bl) - { - char buff[64]; - int n = snprintf(buff, sizeof(buff), "%016lx%016lx", md5_high, md5_low); - if (num_parts >= 1) { - n += snprintf(buff + n, sizeof(buff) - n, "-%u", num_parts); - } - bl->append(buff, n); - } - - //--------------------------------------------------------------------------- - const char* get_next_data_ptr(bufferlist::const_iterator &bl_itr, - char data_buff[], - size_t len, - const DoutPrefixProvider* dpp) - { - const char *p = nullptr; - size_t n = bl_itr.get_ptr_and_advance(len, &p); - if (n == len) { - // we got a zero-copy raw pointer to contiguous data on the buffer-list - return p; - } - - std::vector vec; - // otherwise - copy the data to the @data_buff - char *p_buff = data_buff; - do { - vec.push_back(n); - std::memcpy(p_buff, p, n); - p_buff += n; - len -= n; - if (len > 0) { - n = bl_itr.get_ptr_and_advance(len, &p); - } - } while (len > 0); - - ldpp_dout(dpp, 20) << __func__ << "::vec=" << vec << dendl; - return data_buff; - } - - static const char* s_urgent_msg_names[] = { - "URGENT_MSG_NONE", - "URGENT_MSG_ABORT", - "URGENT_MSG_PASUE", - "URGENT_MSG_RESUME", - "URGENT_MSG_RESTART", - "URGENT_MSG_INVALID" - }; - - //--------------------------------------------------------------------------- - const char* get_urgent_msg_names(int msg) - { - if (msg <= URGENT_MSG_INVALID && msg >= URGENT_MSG_NONE) { - return s_urgent_msg_names[msg]; - } - else { - return s_urgent_msg_names[URGENT_MSG_INVALID]; - } - } - - //--------------------------------------------------------------------------- - worker_stats_t& worker_stats_t::operator+=(const worker_stats_t& other) - { - this->ingress_obj += other.ingress_obj; - this->ingress_obj_bytes += other.ingress_obj_bytes; - this->egress_records += other.egress_records; - this->egress_blocks += other.egress_blocks; - this->egress_slabs += other.egress_slabs; - this->single_part_objs += other.single_part_objs; - this->multipart_objs += other.multipart_objs; - 
this->small_multipart_obj += other.small_multipart_obj; - this->default_storage_class_objs += other.default_storage_class_objs; - this->default_storage_class_objs_bytes += other.default_storage_class_objs_bytes; - this->non_default_storage_class_objs += other.non_default_storage_class_objs; - this->non_default_storage_class_objs_bytes += other.non_default_storage_class_objs_bytes; - this->ingress_corrupted_etag += other.ingress_corrupted_etag; - this->ingress_skip_too_small_bytes += other.ingress_skip_too_small_bytes; - this->ingress_skip_too_small += other.ingress_skip_too_small; - this->ingress_skip_too_small_64KB_bytes += other.ingress_skip_too_small_64KB_bytes; - this->ingress_skip_too_small_64KB += other.ingress_skip_too_small_64KB; - - return *this; - } - //--------------------------------------------------------------------------- - void worker_stats_t::dump(Formatter *f) const - { - // main section - { - Formatter::ObjectSection main(*f, "main"); - - f->dump_unsigned("Ingress Objs count", this->ingress_obj); - f->dump_unsigned("Accum byte size Ingress Objs", this->ingress_obj_bytes); - f->dump_unsigned("Egress Records count", this->egress_records); - f->dump_unsigned("Egress Blocks count", this->egress_blocks); - f->dump_unsigned("Egress Slabs count", this->egress_slabs); - f->dump_unsigned("Single part obj count", this->single_part_objs); - f->dump_unsigned("Multipart obj count", this->multipart_objs); - if (this->small_multipart_obj) { - f->dump_unsigned("Small Multipart obj count", this->small_multipart_obj); - } - } - - { - Formatter::ObjectSection notify(*f, "notify"); - - if(this->non_default_storage_class_objs) { - f->dump_unsigned("non default storage class objs", - this->non_default_storage_class_objs); - f->dump_unsigned("non default storage class objs bytes", - this->non_default_storage_class_objs_bytes); - } - else { - ceph_assert(this->default_storage_class_objs == this->ingress_obj); - ceph_assert(this->default_storage_class_objs_bytes == 
this->ingress_obj_bytes); - } - } - - { - Formatter::ObjectSection skipped(*f, "skipped"); - if(this->ingress_skip_too_small) { - f->dump_unsigned("Ingress skip: too small objs", - this->ingress_skip_too_small); - f->dump_unsigned("Ingress skip: too small bytes", - this->ingress_skip_too_small_bytes); - - if(this->ingress_skip_too_small_64KB) { - f->dump_unsigned("Ingress skip: 64KB<=size<=4MB Obj", - this->ingress_skip_too_small_64KB); - f->dump_unsigned("Ingress skip: 64KB<=size<=4MB Bytes", - this->ingress_skip_too_small_64KB_bytes); - } - } - } - - { - Formatter::ObjectSection failed(*f, "failed"); - if(this->ingress_corrupted_etag) { - f->dump_unsigned("Corrupted ETAG", this->ingress_corrupted_etag); - } - } - } - - //--------------------------------------------------------------------------- - std::ostream& operator<<(std::ostream &out, const worker_stats_t &s) - { - JSONFormatter formatter(false); - s.dump(&formatter); - std::stringstream sstream; - formatter.flush(sstream); - out << sstream.str(); - return out; - } - - //--------------------------------------------------------------------------- - void encode(const worker_stats_t& w, ceph::bufferlist& bl) - { - ENCODE_START(1, 1, bl); - encode(w.ingress_obj, bl); - encode(w.ingress_obj_bytes, bl); - encode(w.egress_records, bl); - encode(w.egress_blocks, bl); - encode(w.egress_slabs, bl); - - encode(w.single_part_objs, bl); - encode(w.multipart_objs, bl); - encode(w.small_multipart_obj, bl); - - encode(w.default_storage_class_objs, bl); - encode(w.default_storage_class_objs_bytes, bl); - encode(w.non_default_storage_class_objs, bl); - encode(w.non_default_storage_class_objs_bytes, bl); - - encode(w.ingress_corrupted_etag, bl); - - encode(w.ingress_skip_too_small_bytes, bl); - encode(w.ingress_skip_too_small, bl); - - encode(w.ingress_skip_too_small_64KB_bytes, bl); - encode(w.ingress_skip_too_small_64KB, bl); - - encode(w.duration, bl); - ENCODE_FINISH(bl); - } - - 
//--------------------------------------------------------------------------- - void decode(worker_stats_t& w, ceph::bufferlist::const_iterator& bl) - { - DECODE_START(1, bl); - decode(w.ingress_obj, bl); - decode(w.ingress_obj_bytes, bl); - decode(w.egress_records, bl); - decode(w.egress_blocks, bl); - decode(w.egress_slabs, bl); - decode(w.single_part_objs, bl); - decode(w.multipart_objs, bl); - decode(w.small_multipart_obj, bl); - decode(w.default_storage_class_objs, bl); - decode(w.default_storage_class_objs_bytes, bl); - decode(w.non_default_storage_class_objs, bl); - decode(w.non_default_storage_class_objs_bytes, bl); - decode(w.ingress_corrupted_etag, bl); - decode(w.ingress_skip_too_small_bytes, bl); - decode(w.ingress_skip_too_small, bl); - decode(w.ingress_skip_too_small_64KB_bytes, bl); - decode(w.ingress_skip_too_small_64KB, bl); - - decode(w.duration, bl); - DECODE_FINISH(bl); - } - - //--------------------------------------------------------------------------- - md5_stats_t& md5_stats_t::operator+=(const md5_stats_t& other) - { - this->small_objs_stat += other.small_objs_stat; - this->big_objs_stat += other.big_objs_stat; - this->ingress_failed_load_bucket += other.ingress_failed_load_bucket; - this->ingress_failed_get_object += other.ingress_failed_get_object; - this->ingress_failed_get_obj_attrs += other.ingress_failed_get_obj_attrs; - this->ingress_corrupted_etag += other.ingress_corrupted_etag; - this->ingress_corrupted_obj_attrs += other.ingress_corrupted_obj_attrs; - this->ingress_skip_encrypted += other.ingress_skip_encrypted; - this->ingress_skip_encrypted_bytes += other.ingress_skip_encrypted_bytes; - this->ingress_skip_compressed += other.ingress_skip_compressed; - this->ingress_skip_compressed_bytes += other.ingress_skip_compressed_bytes; - this->ingress_skip_changed_objs += other.ingress_skip_changed_objs; - this->shared_manifest_dedup_bytes += other.shared_manifest_dedup_bytes; - - this->skipped_shared_manifest += 
other.skipped_shared_manifest; - this->skipped_purged_small += other.skipped_purged_small; - this->skipped_singleton += other.skipped_singleton; - this->skipped_singleton_bytes += other.skipped_singleton_bytes; - this->skipped_source_record += other.skipped_source_record; - this->duplicate_records += other.duplicate_records; - this->size_mismatch += other.size_mismatch; - this->hash_mismatch += other.hash_mismatch; - this->failed_src_load += other.failed_src_load; - this->failed_rec_load += other.failed_rec_load; - this->failed_block_load += other.failed_block_load; - - this->valid_hash_attrs += other.valid_hash_attrs; - this->invalid_hash_attrs += other.invalid_hash_attrs; - this->set_hash_attrs += other.set_hash_attrs; - this->skip_hash_cmp += other.skip_hash_cmp; - - this->set_shared_manifest_src += other.set_shared_manifest_src; - this->loaded_objects += other.loaded_objects; - this->processed_objects += other.processed_objects; - this->dup_head_bytes_estimate += other.dup_head_bytes_estimate; - this->deduped_objects += other.deduped_objects; - this->deduped_objects_bytes += other.deduped_objects_bytes; - this->dup_head_bytes += other.dup_head_bytes; - - this->failed_dedup += other.failed_dedup; - this->failed_table_load += other.failed_table_load; - this->failed_map_overflow += other.failed_map_overflow; - return *this; - } - - //--------------------------------------------------------------------------- - std::ostream& operator<<(std::ostream &out, const md5_stats_t &s) - { - JSONFormatter formatter(false); - s.dump(&formatter); - std::stringstream sstream; - formatter.flush(sstream); - out << sstream.str(); - return out; - } - - //--------------------------------------------------------------------------- - void md5_stats_t::dump(Formatter *f) const - { - // main section - { - Formatter::ObjectSection main(*f, "main"); - - f->dump_unsigned("Total processed objects", this->processed_objects); - f->dump_unsigned("Loaded objects", this->loaded_objects); - 
f->dump_unsigned("Set Shared-Manifest SRC", this->set_shared_manifest_src); - f->dump_unsigned("Deduped Obj (this cycle)", this->deduped_objects); - f->dump_unsigned("Deduped Bytes(this cycle)", this->deduped_objects_bytes); - f->dump_unsigned("Dup head bytes (not dedup)", this->dup_head_bytes); - f->dump_unsigned("Already Deduped bytes (prev cycles)", - this->shared_manifest_dedup_bytes); - - const dedup_stats_t &ds = this->big_objs_stat; - f->dump_unsigned("Singleton Obj", ds.singleton_count); - f->dump_unsigned("Unique Obj", ds.unique_count); - f->dump_unsigned("Duplicate Obj", ds.duplicate_count); - f->dump_unsigned("Dedup Bytes Estimate", ds.dedup_bytes_estimate); - } - - // Potential Dedup Section: - // What could be gained by allowing dedup for smaller objects (64KB-4MB) - // Space wasted because of duplicated head-object (4MB) - { - Formatter::ObjectSection potential(*f, "Potential Dedup"); - const dedup_stats_t &ds = this->small_objs_stat; - f->dump_unsigned("Singleton Obj (64KB-4MB)", ds.singleton_count); - f->dump_unsigned("Unique Obj (64KB-4MB)", ds.unique_count); - f->dump_unsigned("Duplicate Obj (64KB-4MB)", ds.duplicate_count); - f->dump_unsigned("Dedup Bytes Estimate (64KB-4MB)", ds.dedup_bytes_estimate); - f->dump_unsigned("Duplicated Head Bytes Estimate", - this->dup_head_bytes_estimate); - f->dump_unsigned("Duplicated Head Bytes", this->dup_head_bytes); - } - - { - Formatter::ObjectSection notify(*f, "notify"); - if (this->failed_table_load) { - f->dump_unsigned("Failed Table Load", this->failed_table_load); - } - if (this->failed_map_overflow) { - f->dump_unsigned("Failed Remap Overflow", this->failed_map_overflow); - } - - f->dump_unsigned("Valid HASH attrs", this->valid_hash_attrs); - f->dump_unsigned("Invalid HASH attrs", this->invalid_hash_attrs); - - if (this->set_hash_attrs) { - f->dump_unsigned("Set HASH", this->set_hash_attrs); - } - - if (this->skip_hash_cmp) { - f->dump_unsigned("Can't run HASH compare", this->skip_hash_cmp); - } - } - 
- { - Formatter::ObjectSection skipped(*f, "skipped"); - f->dump_unsigned("Skipped shared_manifest", this->skipped_shared_manifest); - f->dump_unsigned("Skipped purged small objs", this->skipped_purged_small); - f->dump_unsigned("Skipped singleton objs", this->skipped_singleton); - if (this->skipped_singleton) { - f->dump_unsigned("Skipped singleton Bytes", this->skipped_singleton_bytes); - } - f->dump_unsigned("Skipped source record", this->skipped_source_record); - - if (this->ingress_skip_encrypted) { - f->dump_unsigned("Skipped Encrypted objs", this->ingress_skip_encrypted); - f->dump_unsigned("Skipped Encrypted Bytes",this->ingress_skip_encrypted_bytes); - } - if (this->ingress_skip_compressed) { - f->dump_unsigned("Skipped Compressed objs", this->ingress_skip_compressed); - f->dump_unsigned("Skipped Compressed Bytes", this->ingress_skip_compressed_bytes); - } - if (this->ingress_skip_changed_objs) { - f->dump_unsigned("Skipped Changed Object", this->ingress_skip_changed_objs); - } - } - - { - Formatter::ObjectSection sys_failures(*f, "system failures"); - if (this->ingress_failed_load_bucket) { - f->dump_unsigned("Failed load_bucket()", this->ingress_failed_load_bucket); - } - if (this->ingress_failed_get_object) { - f->dump_unsigned("Failed get_object()", this->ingress_failed_get_object); - } - if (this->ingress_failed_get_obj_attrs) { - f->dump_unsigned("Failed get_obj_attrs", this->ingress_failed_get_obj_attrs); - } - if (this->ingress_corrupted_etag) { - f->dump_unsigned("Corrupted ETAG", this->ingress_corrupted_etag); - } - if (this->ingress_corrupted_obj_attrs) { - f->dump_unsigned("Corrupted obj attributes", this->ingress_corrupted_obj_attrs); - } - if (this->failed_src_load) { - f->dump_unsigned("Failed SRC-Load ", this->failed_src_load); - } - if (this->failed_rec_load) { - f->dump_unsigned("Failed Record-Load ", this->failed_rec_load); - } - if (this->failed_block_load) { - f->dump_unsigned("Failed Block-Load ", this->failed_block_load); - } - if 
(this->failed_dedup) { - f->dump_unsigned("Failed Dedup", this->failed_dedup); - } - } - - { - Formatter::ObjectSection logical_failures(*f, "logical failures"); - if (this->hash_mismatch) { - f->dump_unsigned("HASH mismatch", this->hash_mismatch); - } - if (this->duplicate_records) { - f->dump_unsigned("Duplicate SRC/TGT", this->duplicate_records); - } - if (this->size_mismatch) { - f->dump_unsigned("Size mismatch SRC/TGT", this->size_mismatch); - } - } - } - - //--------------------------------------------------------------------------- - void encode(const md5_stats_t& m, ceph::bufferlist& bl) - { - ENCODE_START(1, 1, bl); - - encode(m.small_objs_stat, bl); - encode(m.big_objs_stat, bl); - encode(m.ingress_failed_load_bucket, bl); - encode(m.ingress_failed_get_object, bl); - encode(m.ingress_failed_get_obj_attrs, bl); - encode(m.ingress_corrupted_etag, bl); - encode(m.ingress_corrupted_obj_attrs, bl); - encode(m.ingress_skip_encrypted, bl); - encode(m.ingress_skip_encrypted_bytes, bl); - encode(m.ingress_skip_compressed, bl); - encode(m.ingress_skip_compressed_bytes, bl); - encode(m.ingress_skip_changed_objs, bl); - encode(m.shared_manifest_dedup_bytes, bl); - - encode(m.skipped_shared_manifest, bl); - encode(m.skipped_purged_small, bl); - encode(m.skipped_singleton, bl); - encode(m.skipped_singleton_bytes, bl); - encode(m.skipped_source_record, bl); - encode(m.duplicate_records, bl); - encode(m.size_mismatch, bl); - encode(m.hash_mismatch, bl); - encode(m.failed_src_load, bl); - encode(m.failed_rec_load, bl); - encode(m.failed_block_load, bl); - - encode(m.valid_hash_attrs, bl); - encode(m.invalid_hash_attrs, bl); - encode(m.set_hash_attrs, bl); - encode(m.skip_hash_cmp, bl); - encode(m.set_shared_manifest_src, bl); - - encode(m.loaded_objects, bl); - encode(m.processed_objects, bl); - encode(m.dup_head_bytes_estimate, bl); - encode(m.deduped_objects, bl); - encode(m.deduped_objects_bytes, bl); - encode(m.dup_head_bytes, bl); - encode(m.failed_dedup, bl); - 
encode(m.failed_table_load, bl); - encode(m.failed_map_overflow, bl); - - encode(m.duration, bl); - ENCODE_FINISH(bl); - } - - //--------------------------------------------------------------------------- - void decode(md5_stats_t& m, ceph::bufferlist::const_iterator& bl) - { - DECODE_START(1, bl); - decode(m.small_objs_stat, bl); - decode(m.big_objs_stat, bl); - decode(m.ingress_failed_load_bucket, bl); - decode(m.ingress_failed_get_object, bl); - decode(m.ingress_failed_get_obj_attrs, bl); - decode(m.ingress_corrupted_etag, bl); - decode(m.ingress_corrupted_obj_attrs, bl); - decode(m.ingress_skip_encrypted, bl); - decode(m.ingress_skip_encrypted_bytes, bl); - decode(m.ingress_skip_compressed, bl); - decode(m.ingress_skip_compressed_bytes, bl); - decode(m.ingress_skip_changed_objs, bl); - decode(m.shared_manifest_dedup_bytes, bl); - - decode(m.skipped_shared_manifest, bl); - decode(m.skipped_purged_small, bl); - decode(m.skipped_singleton, bl); - decode(m.skipped_singleton_bytes, bl); - decode(m.skipped_source_record, bl); - decode(m.duplicate_records, bl); - decode(m.size_mismatch, bl); - decode(m.hash_mismatch, bl); - decode(m.failed_src_load, bl); - decode(m.failed_rec_load, bl); - decode(m.failed_block_load, bl); - - decode(m.valid_hash_attrs, bl); - decode(m.invalid_hash_attrs, bl); - decode(m.set_hash_attrs, bl); - decode(m.skip_hash_cmp, bl); - decode(m.set_shared_manifest_src, bl); - - decode(m.loaded_objects, bl); - decode(m.processed_objects, bl); - decode(m.dup_head_bytes_estimate, bl); - decode(m.deduped_objects, bl); - decode(m.deduped_objects_bytes, bl); - decode(m.dup_head_bytes, bl); - decode(m.failed_dedup, bl); - decode(m.failed_table_load, bl); - decode(m.failed_map_overflow, bl); - - decode(m.duration, bl); - DECODE_FINISH(bl); - } -} //namespace rgw::dedup diff --git a/src/rgw/rgw_dedup_utils.h b/src/rgw/rgw_dedup_utils.h deleted file mode 100644 index f008fcaba38b..000000000000 --- a/src/rgw/rgw_dedup_utils.h +++ /dev/null @@ -1,267 +0,0 @@ 
-// -*- mode:C++; tab-width:8; c-basic-offset:2; -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Author: Gabriel BenHanokh - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#pragma once -#include -#include "include/rados/buffer.h" -#include "include/encoding.h" -#include "common/Formatter.h" -#include "common/ceph_json.h" -#include -#include "include/utime.h" -#include "include/encoding.h" -#include "common/dout.h" - -#define FULL_DEDUP_SUPPORT -namespace rgw::dedup { - using work_shard_t = uint16_t; - using md5_shard_t = uint16_t; - - // settings to help debug small systems - const work_shard_t MIN_WORK_SHARD = 2; - const md5_shard_t MIN_MD5_SHARD = 4; - - // Those are the correct values for production system - const work_shard_t MAX_WORK_SHARD = 255; - const md5_shard_t MAX_MD5_SHARD = 512; - - const work_shard_t NULL_WORK_SHARD = 0xFFFF; - const md5_shard_t NULL_MD5_SHARD = 0xFFFF; - const unsigned NULL_SHARD = 0xFFFF; - - // work_shard is an 8 bits int with 255 legal values for the first iteration - // and one value (0xFF) reserved for second iteration - const unsigned WORK_SHARD_HARD_LIMIT = 0x0FF; - // md5_shard_t is a 12 bits int with 4096 possible values - const unsigned MD5_SHARD_HARD_LIMIT = 0xFFF; - - static_assert(MAX_WORK_SHARD < NULL_WORK_SHARD); - static_assert(MAX_WORK_SHARD < NULL_SHARD); - static_assert(MAX_WORK_SHARD <= WORK_SHARD_HARD_LIMIT); - static_assert(MAX_MD5_SHARD < NULL_MD5_SHARD); - static_assert(MAX_MD5_SHARD < NULL_SHARD); - static_assert(MAX_MD5_SHARD <= MD5_SHARD_HARD_LIMIT); - - //--------------------------------------------------------------------------- - enum dedup_req_type_t { - DEDUP_TYPE_NONE = 0, - DEDUP_TYPE_ESTIMATE = 1, - DEDUP_TYPE_FULL = 2 - }; - - std::ostream& operator<<(std::ostream &out, const 
dedup_req_type_t& dedup_type); - struct __attribute__ ((packed)) dedup_flags_t { - private: - static constexpr uint8_t RGW_DEDUP_FLAG_HASH_CALCULATED = 0x01; // REC - static constexpr uint8_t RGW_DEDUP_FLAG_SHARED_MANIFEST = 0x02; // REC + TAB - static constexpr uint8_t RGW_DEDUP_FLAG_OCCUPIED = 0x04; // TAB - static constexpr uint8_t RGW_DEDUP_FLAG_FASTLANE = 0x08; // REC - - public: - dedup_flags_t() : flags(0) {} - dedup_flags_t(uint8_t _flags) : flags(_flags) {} - inline void clear() { this->flags = 0; } - inline bool hash_calculated() const { return ((flags & RGW_DEDUP_FLAG_HASH_CALCULATED) != 0); } - inline void set_hash_calculated() { flags |= RGW_DEDUP_FLAG_HASH_CALCULATED; } - inline bool has_shared_manifest() const { return ((flags & RGW_DEDUP_FLAG_SHARED_MANIFEST) != 0); } - inline void set_shared_manifest() { flags |= RGW_DEDUP_FLAG_SHARED_MANIFEST; } - inline bool is_occupied() const {return ((this->flags & RGW_DEDUP_FLAG_OCCUPIED) != 0); } - inline void set_occupied() {this->flags |= RGW_DEDUP_FLAG_OCCUPIED; } - inline void clear_occupied() { this->flags &= ~RGW_DEDUP_FLAG_OCCUPIED; } - inline bool is_fastlane() const { return ((flags & RGW_DEDUP_FLAG_FASTLANE) != 0); } - inline void set_fastlane() { flags |= RGW_DEDUP_FLAG_FASTLANE; } - private: - uint8_t flags; - }; - - struct dedup_stats_t { - dedup_stats_t& operator+=(const dedup_stats_t& other); - - uint64_t singleton_count = 0; - uint64_t unique_count = 0; - uint64_t duplicate_count = 0; - uint64_t dedup_bytes_estimate = 0; - }; - - std::ostream& operator<<(std::ostream &out, const dedup_stats_t& stats); - void encode(const dedup_stats_t& ds, ceph::bufferlist& bl); - void decode(dedup_stats_t& ds, ceph::bufferlist::const_iterator& bl); - - struct worker_stats_t { - worker_stats_t& operator +=(const worker_stats_t& other); - void dump(Formatter *f) const; - - uint64_t ingress_obj = 0; - uint64_t ingress_obj_bytes = 0; - uint64_t egress_records = 0; - uint64_t egress_blocks = 0; - uint64_t 
egress_slabs = 0; - - uint64_t single_part_objs = 0; - uint64_t multipart_objs = 0; - uint64_t small_multipart_obj = 0; - - uint64_t default_storage_class_objs = 0; - uint64_t default_storage_class_objs_bytes = 0; - - uint64_t non_default_storage_class_objs = 0; - uint64_t non_default_storage_class_objs_bytes = 0; - - uint64_t ingress_corrupted_etag = 0; - - uint64_t ingress_skip_too_small_bytes = 0; - uint64_t ingress_skip_too_small = 0; - - uint64_t ingress_skip_too_small_64KB_bytes = 0; - uint64_t ingress_skip_too_small_64KB = 0; - - utime_t duration = {0, 0}; - }; - std::ostream& operator<<(std::ostream &out, const worker_stats_t &s); - void encode(const worker_stats_t& w, ceph::bufferlist& bl); - void decode(worker_stats_t& w, ceph::bufferlist::const_iterator& bl); - - - struct md5_stats_t { - md5_stats_t& operator +=(const md5_stats_t& other); - void dump(Formatter *f) const; - - dedup_stats_t small_objs_stat; - dedup_stats_t big_objs_stat; - uint64_t ingress_failed_load_bucket = 0; - uint64_t ingress_failed_get_object = 0; - uint64_t ingress_failed_get_obj_attrs = 0; - uint64_t ingress_corrupted_etag = 0; - uint64_t ingress_corrupted_obj_attrs = 0; - uint64_t ingress_skip_encrypted = 0; - uint64_t ingress_skip_encrypted_bytes = 0; - uint64_t ingress_skip_compressed = 0; - uint64_t ingress_skip_compressed_bytes = 0; - uint64_t ingress_skip_changed_objs = 0; - - uint64_t shared_manifest_dedup_bytes = 0; - uint64_t skipped_shared_manifest = 0; - uint64_t skipped_purged_small = 0; - uint64_t skipped_singleton = 0; - uint64_t skipped_singleton_bytes = 0; - uint64_t skipped_source_record = 0; - uint64_t duplicate_records = 0; - uint64_t size_mismatch = 0; - uint64_t hash_mismatch = 0; - uint64_t failed_src_load = 0; - uint64_t failed_rec_load = 0; - uint64_t failed_block_load = 0; - - uint64_t valid_hash_attrs = 0; - uint64_t invalid_hash_attrs = 0; - uint64_t set_hash_attrs = 0; - uint64_t skip_hash_cmp = 0; - - uint64_t set_shared_manifest_src = 0; - uint64_t 
loaded_objects = 0; - uint64_t processed_objects = 0; - // counter is using on-disk size affected by block-size - uint64_t dup_head_bytes_estimate = 0; //duplicate_head_bytes - uint64_t deduped_objects = 0; - // counter is using s3 byte size disregarding the on-disk size affected by block-size - uint64_t deduped_objects_bytes = 0; - uint64_t dup_head_bytes = 0; - uint64_t failed_dedup = 0; - uint64_t failed_table_load = 0; - uint64_t failed_map_overflow = 0; - utime_t duration = {0, 0}; - }; - std::ostream &operator<<(std::ostream &out, const md5_stats_t &s); - void encode(const md5_stats_t& m, ceph::bufferlist& bl); - void decode(md5_stats_t& m, ceph::bufferlist::const_iterator& bl); - - struct parsed_etag_t { - uint64_t md5_high; // High Bytes of the Object Data MD5 - uint64_t md5_low; // Low Bytes of the Object Data MD5 - uint16_t num_parts; // How many parts were used in multipart upload - // Setting num_parts to zero when multipart is not used - }; - -#define DIV_UP(a, b) ( ((a)+(b-1)) / b) - // CEPH min allocation unit on disk is 4KB - // TBD: take from config - static constexpr uint64_t DISK_ALLOC_SIZE = 4*1024; - // 16 bytes hexstring -> 8 Byte uint64_t - static inline constexpr unsigned HEX_UNIT_SIZE = 16; - - //--------------------------------------------------------------------------- - static inline uint64_t byte_size_to_disk_blocks(uint64_t byte_size) { - return DIV_UP(byte_size, DISK_ALLOC_SIZE); - } - - //--------------------------------------------------------------------------- - static inline uint64_t disk_blocks_to_byte_size(uint64_t disk_blocks) { - return disk_blocks * DISK_ALLOC_SIZE; - } - - //--------------------------------------------------------------------------- - // ceph store full blocks so need to round up and multiply by block_size - static inline uint64_t calc_on_disk_byte_size(uint64_t byte_size) { - uint64_t size_4k_units = byte_size_to_disk_blocks(byte_size); - return disk_blocks_to_byte_size(size_4k_units); - } - - enum 
urgent_msg_t { - URGENT_MSG_NONE = 0, - URGENT_MSG_ABORT = 1, - URGENT_MSG_PASUE = 2, - URGENT_MSG_RESUME = 3, - URGENT_MSG_RESTART = 4, - URGENT_MSG_INVALID = 5 - }; - - const char* get_urgent_msg_names(int msg); - bool hex2int(const char *p, const char *p_end, uint64_t *p_val); - bool parse_etag_string(const std::string& etag, parsed_etag_t *parsed_etag); - void etag_to_bufferlist(uint64_t md5_high, uint64_t md5_low, uint16_t num_parts, - ceph::bufferlist *bl); - const char* get_next_data_ptr(bufferlist::const_iterator &bl_itr, - char data_buff[], - size_t len, - const DoutPrefixProvider* dpp); - - //--------------------------------------------------------------------------- - static inline void build_oid(const std::string &bucket_id, - const std::string &obj_name, - std::string *oid) - { - *oid = bucket_id + "_" + obj_name; - } - - //--------------------------------------------------------------------------- - static inline uint64_t calc_deduped_bytes(uint64_t head_obj_size, - uint16_t num_parts, - uint64_t size_bytes) - { - if (num_parts > 0) { - // multipart objects with an empty head i.e. 
we achive full dedup - return size_bytes; - } - else { - // reduce the head size - if (size_bytes > head_obj_size) { - return size_bytes - head_obj_size; - } - else { - return 0; - } - } - } - -} //namespace rgw::dedup diff --git a/src/rgw/rgw_lib.cc b/src/rgw/rgw_lib.cc index 2dd12dd40031..8c86bdba6456 100644 --- a/src/rgw/rgw_lib.cc +++ b/src/rgw/rgw_lib.cc @@ -545,7 +545,9 @@ namespace rgw { } main.init_lua(); +#ifdef WITH_RADOSGW_RADOS main.init_dedup(); +#endif return 0; } /* RGWLib::init() */ diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc index a47e6553ca2e..403ef6060a9e 100644 --- a/src/rgw/rgw_main.cc +++ b/src/rgw/rgw_main.cc @@ -162,7 +162,9 @@ int main(int argc, char *argv[]) main.init_opslog(); main.init_tracepoints(); main.init_lua(); +#ifdef WITH_RADOSGW_RADOS main.init_dedup(); +#endif r = main.init_frontends2(nullptr /* RGWLib */); if (r != 0) { derr << "ERROR: initialize frontend fail, r = " << r << dendl;