From: Gabriel BenHanokh Date: Mon, 15 Sep 2025 19:01:02 +0000 (+0000) Subject: rgw/dedup: split-head mechanism X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=48ba9c00caa3e2a705952259ce858cb4cf3b331b;p=ceph-ci.git rgw/dedup: split-head mechanism Split the head object into 2 objects - one with attributes and no data and a new tail-object with only data. The new tail-object will be deduped (unlike head objects, which can't be deduped) We will split the head for objects with size 16MB or less A few extra improvements: Skip objects created by server-side-copy Use reftag for compare-and-swap instead of manifest Skip shared-manifest objects after reading attributes Made max_obj_size_for_split and min_obj_size_for_dedup config values in rgw.yaml.in refined test: validate size after dedup TBD: add rados ls -l to report object size in bulk to speed up the process improved tests - verify refcounts are working, validate objects, remove duplicates and then verify the last remaining object making sure it was not deleted Signed-off-by: Gabriel BenHanokh --- diff --git a/doc/radosgw/s3_objects_dedup.rst b/doc/radosgw/s3_objects_dedup.rst index b0b83d0ddf7..b8b1ffbefa4 100644 --- a/doc/radosgw/s3_objects_dedup.rst +++ b/doc/radosgw/s3_objects_dedup.rst @@ -22,8 +22,6 @@ Admin commands Aborts an active dedup session and release all resources used by it. - ``radosgw-admin dedup stats``: Collects & displays last dedup statistics. -- ``radosgw-admin dedup estimate``: - Starts a new dedup estimate session (aborting first existing session if exists). - ``radosgw-admin dedup throttle --max-bucket-index-ops=``: Specify max bucket-index requests per second allowed for a single RGW server during dedup, 0 means unlimited. - ``radosgw-admin dedup throttle --stat``: @@ -34,13 +32,17 @@ Skipped Objects *************** Dedup Estimate process skips the following objects: -- Objects smaller than 4 MB (unless they are multipart). 
+- Objects smaller than rgw_dedup_min_obj_size_for_dedup (unless they are multipart). - Objects with different placement rules. - Objects with different pools. - Objects with different storage classes. The full dedup process skips all the above and it also skips **compressed** and **user-encrypted** objects. +The minimum size object for dedup is controlled by the following config option: + +.. confval:: rgw_dedup_min_obj_size_for_dedup + ******************* Estimate Processing ******************* @@ -85,6 +87,22 @@ If they are, we proceed with the deduplication: - copying the manifest from the source to the target. - removing all tail-objects on the target. +*************** +Split Head Mode +*************** +Dedup code can split the head object into 2 objects + +- one with attributes and no data and +- a new tail-object with only data. + +The new-tail object will be deduped (unlike the head objects which can't be deduplicated) + +The split-Head mode is controlled by the following central configuration option: + +.. confval:: rgw_dedup_max_obj_size_for_split + +We will split head for objects with size smaller or equal to rgw_dedup_max_obj_size_for_split + ************ Memory Usage ************ diff --git a/src/common/options/rgw.yaml.in b/src/common/options/rgw.yaml.in index a163117fb8b..a72a039a282 100644 --- a/src/common/options/rgw.yaml.in +++ b/src/common/options/rgw.yaml.in @@ -81,6 +81,29 @@ options: default: false services: - rgw +- name: rgw_dedup_max_obj_size_for_split + type: size + level: advanced + desc: The maximum RGW object size to split head. + A value of 0 (zero) disables the split-head functionality + long_desc: Dedup code can split head object into 2 objects - + one with attributes and no data and + a new tail-object with only data. 
+ The new-tail object will be deduped (unlike the head objects which + can't be deduplicated) + We will split head for objects with size 16MB or less + default: 16_M + services: + - rgw + with_legacy: true +- name: rgw_dedup_min_obj_size_for_dedup + type: size + level: advanced + desc: The minimum RGW object size for dedup (0 means dedup all objects). + default: 64_K + services: + - rgw + with_legacy: true - name: rgw_max_chunk_size type: size level: advanced diff --git a/src/rgw/driver/rados/rgw_dedup.cc b/src/rgw/driver/rados/rgw_dedup.cc index f841e8aad5a..c1174bc7ef4 100644 --- a/src/rgw/driver/rados/rgw_dedup.cc +++ b/src/rgw/driver/rados/rgw_dedup.cc @@ -83,6 +83,20 @@ namespace rgw::dedup { static inline constexpr unsigned MAX_STORAGE_CLASS_IDX = 128; using storage_class_idx_t = uint8_t; + //--------------------------------------------------------------------------- + [[maybe_unused]] static int print_manifest(const DoutPrefixProvider *dpp, + RGWRados *rados, + const RGWObjManifest &manifest) + { + unsigned idx = 0; + for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p, ++idx) { + rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados); + ldpp_dout(dpp, 20) << idx << "] " << raw_obj.oid << dendl; + } + ldpp_dout(dpp, 20) << "==============================================" << dendl; + return 0; + } + //--------------------------------------------------------------------------- void Background::DedupWatcher::handle_notify(uint64_t notify_id, uint64_t cookie, uint64_t notifier_id, bufferlist &bl) @@ -321,7 +335,6 @@ namespace rgw::dedup { //--------------------------------------------------------------------------- static int init_dedup_pool_ioctx(rgw::sal::RadosStore *store, const DoutPrefixProvider *dpp, - bool create, librados::IoCtx &ioctx) { const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool; @@ -329,11 +342,10 @@ namespace rgw::dedup { auto rados_handle = store->getRados()->get_rados_handle(); int64_t 
pool_id = rados_handle->pool_lookup(dedup_pool.name.c_str()); if (pool_id >= 0) { - // TBD: what to do when create option is passed ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name << " already exists, pool_id=" << pool_id << dendl; } - else if (create) { + else { pool_id = create_pool(store, dpp, pool_name); if (pool_id >= 0) { ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name @@ -343,11 +355,6 @@ namespace rgw::dedup { return pool_id; } } - else { - ldpp_dout(dpp, 1) << __func__ - << "::ERR: pool doesn't exist and no create option" << dendl; - return -ENOENT; - } int ret = rgw_init_ioctx(dpp, rados_handle, dedup_pool, ioctx); if (unlikely(ret < 0)) { @@ -382,7 +389,7 @@ namespace rgw::dedup { rados = store->getRados(); rados_handle = rados->get_rados_handle(); if (init_pool) { - int ret = init_dedup_pool_ioctx(store, dpp, true, d_dedup_cluster_ioctx); + int ret = init_dedup_pool_ioctx(store, dpp, d_dedup_cluster_ioctx); display_ioctx_state(dpp, d_dedup_cluster_ioctx, __func__); return ret; } @@ -398,9 +405,14 @@ namespace rgw::dedup { d_cluster(dpp, cct, driver), d_watcher_ctx(this) { - d_min_obj_size_for_dedup = cct->_conf->rgw_max_chunk_size; d_head_object_size = cct->_conf->rgw_max_chunk_size; - //ceph_assert(4*1024*1024 == d_head_object_size); + d_min_obj_size_for_dedup = cct->_conf->rgw_dedup_min_obj_size_for_dedup; + d_max_obj_size_for_split = cct->_conf->rgw_dedup_max_obj_size_for_split; + + ldpp_dout(dpp, 10) << "Config Vals::d_head_object_size=" << d_head_object_size + << "::d_min_obj_size_for_dedup=" << d_min_obj_size_for_dedup + << "::d_max_obj_size_for_split=" << d_max_obj_size_for_split + << dendl; int ret = init_rados_access_handles(false); if (ret != 0) { @@ -413,6 +425,16 @@ namespace rgw::dedup { d_heart_beat_max_elapsed_sec = 3; } + //------------------------------------------------------------------------------ + uint64_t Background::__calc_deduped_bytes(uint16_t num_parts, uint64_t size_bytes) + { + return 
calc_deduped_bytes(d_head_object_size, + d_min_obj_size_for_dedup, + d_max_obj_size_for_split, + num_parts, + size_bytes); + } + //--------------------------------------------------------------------------- int Background::add_disk_rec_from_bucket_idx(disk_block_array_t &disk_arr, const rgw::sal::Bucket *p_bucket, @@ -434,7 +456,8 @@ namespace rgw::dedup { } ldpp_dout(dpp, 20) << __func__ << "::" << p_bucket->get_name() << "/" << obj_name << " was written to block_idx=" - << rec_info.block_id << " rec_id=" << rec_info.rec_id << dendl; + << rec_info.block_id << " rec_id=" << (int)rec_info.rec_id + << dendl; return 0; } @@ -450,12 +473,11 @@ namespace rgw::dedup { storage_class_idx_t sc_idx = remapper->remap(p_rec->stor_class, dpp, &p_stats->failed_map_overflow); if (unlikely(sc_idx == remapper_t::NULL_IDX)) { - // TBD: need stat counters return -EOVERFLOW; } key_t key(p_rec->s.md5_high, p_rec->s.md5_low, size_4k_units, p_rec->s.num_parts, sc_idx); - bool has_shared_manifest = p_rec->has_shared_manifest(); + bool has_shared_manifest = p_rec->s.flags.has_shared_manifest(); ldpp_dout(dpp, 20) << __func__ << "::bucket=" << p_rec->bucket_name << ", obj=" << p_rec->obj_name << ", block_id=" << (uint32_t)block_id << ", rec_id=" << (uint32_t)rec_id @@ -504,6 +526,115 @@ namespace rgw::dedup { } #ifdef FULL_DEDUP_SUPPORT + //--------------------------------------------------------------------------- + static inline std::string build_oid(const std::string& bucket_id, + const std::string& obj_name) + { + std::string oid; + oid.reserve(bucket_id.size() + 1 + obj_name.size()); + oid.append(bucket_id).append("_").append(obj_name); + return oid; + } + + //--------------------------------------------------------------------------- + static int get_ioctx_internal(const DoutPrefixProvider* const dpp, + rgw::sal::Driver* driver, + rgw::sal::RadosStore* store, + const std::string &obj_name, + const std::string &instance, + const rgw_bucket &rb, + librados::IoCtx *p_ioctx, + std::string 
*p_oid) + { + unique_ptr bucket; + { + int ret = driver->load_bucket(dpp, rb, &bucket, null_yield); + if (unlikely(ret != 0)) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: driver->load_bucket(): " + << cpp_strerror(-ret) << dendl; + return ret; + } + } + + string dummy_locator; + const rgw_obj_index_key key(obj_name, instance); + rgw_obj obj(bucket->get_key(), key); + get_obj_bucket_and_oid_loc(obj, *p_oid, dummy_locator); + RGWBucketInfo& bucket_info = bucket->get_info(); + return store->get_obj_head_ioctx(dpp, bucket_info, obj, p_ioctx); + } + + //--------------------------------------------------------------------------- + static inline int get_ioctx(const DoutPrefixProvider* const dpp, + rgw::sal::Driver* driver, + rgw::sal::RadosStore* store, + const disk_record_t *p_rec, + librados::IoCtx *p_ioctx, + std::string *p_oid) + { + rgw_bucket b{p_rec->tenant_name, p_rec->bucket_name, p_rec->bucket_id}; + return get_ioctx_internal(dpp, driver, store, p_rec->obj_name, p_rec->instance, + b, p_ioctx, p_oid); + } + + //--------------------------------------------------------------------------- + static inline std::string generate_split_head_tail_name(const RGWObjManifest &manifest) + { + static constexpr std::string_view shadow_string(RGW_OBJ_NS_SHADOW); + std::string_view suffix = "0"; + const std::string &prefix = manifest.get_prefix(); + + std::string tail_name; + tail_name.reserve(shadow_string.size() + prefix.size() + suffix.size() + 1); + // TBD: + // it is unclear when RGW code pads with "_" before the shadow string + // It won't change correctness, but might look weird + //tail_name.append("_"); + tail_name.append(shadow_string); + tail_name.append("_"); + tail_name.append(prefix); + tail_name.append(suffix); + return tail_name; + } + + //--------------------------------------------------------------------------- + static void remove_created_tail_object(const DoutPrefixProvider *dpp, + librados::IoCtx &ioctx, + const std::string &tail_oid, + md5_stats_t *p_stats) 
+ { + p_stats->rollback_tail_obj++; + int ret = ioctx.remove(tail_oid); + if (ret == 0) { + ldpp_dout(dpp, 20) << __func__ << "::" << tail_oid + << " was successfully removed" << dendl; + } + else { + ldpp_dout(dpp, 10) << __func__ << "::failed ioctx.remove( " << tail_oid + << " ), ret=" << ret << "::" << cpp_strerror(-ret) < 0 && + d_max_obj_size_for_split && + obj_size <= d_max_obj_size_for_split); + } + + //--------------------------------------------------------------------------- + [[maybe_unused]] static bool empty_rgw_bucket(const rgw_bucket &b) + { + return (b.tenant.empty() && + b.name.empty() && + b.marker.empty() && + b.bucket_id.empty() && + b.explicit_placement.data_pool.empty() && + b.explicit_placement.data_extra_pool.empty() && + b.explicit_placement.index_pool.empty()); + } static constexpr uint64_t cost = 1; // 1 throttle unit per request static constexpr uint64_t id = 0; // ids unused @@ -527,15 +658,17 @@ namespace rgw::dedup { } //--------------------------------------------------------------------------- - int Background::free_tail_objs_by_manifest(const string &ref_tag, - const string &oid, - RGWObjManifest &tgt_manifest) + int Background::free_tail_objs_by_manifest(const string &ref_tag, + const string &oid, + const RGWObjManifest &manifest) { unsigned idx = 0; - for (auto p = tgt_manifest.obj_begin(dpp); p != tgt_manifest.obj_end(dpp); ++p, ++idx) { + std::unique_ptr aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, null_yield); + for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p, ++idx) { rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados); if (oid == raw_obj.oid) { - ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: " << raw_obj.oid << dendl; + ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: " + << raw_obj.oid << dendl; continue; } @@ -546,21 +679,25 @@ namespace rgw::dedup { << obj << dendl; continue; } - librados::IoCtx ioctx = obj.ioctx; - ldpp_dout(dpp, 
20) << __func__ << "::removing tail object: " << raw_obj.oid - << dendl; + ldpp_dout(dpp, 20) << __func__ << "::removing tail object: " << raw_obj.oid << dendl; d_ctl.metadata_access_throttle.acquire(); - ret = ioctx.remove(raw_obj.oid); + ObjectWriteOperation op; + rgw::AioResultList completed; + cls_refcount_put(op, ref_tag, true); + completed = aio->get(obj.obj, + rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield), + cost, id); } - + rgw::AioResultList completed = aio->drain(); return 0; } //--------------------------------------------------------------------------- - int Background::rollback_ref_by_manifest(const string &ref_tag, - const string &oid, - RGWObjManifest &manifest) + int Background::rollback_ref_by_manifest(const string &ref_tag, + const string &oid, + const RGWObjManifest &manifest) { + ldpp_dout(dpp, 20) << __func__ << "::" << oid << dendl; unsigned idx = 0; int ret_code = 0; std::unique_ptr aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, null_yield); @@ -595,9 +732,9 @@ namespace rgw::dedup { } //--------------------------------------------------------------------------- - int Background::inc_ref_count_by_manifest(const string &ref_tag, - const string &oid, - RGWObjManifest &manifest) + int Background::inc_ref_count_by_manifest(const string &ref_tag, + const string &oid, + const RGWObjManifest &manifest) { std::unique_ptr aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, null_yield); rgw::AioResultList all_results; @@ -614,14 +751,15 @@ namespace rgw::dedup { ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj); if (ret < 0) { ldpp_dout(dpp, 1) << __func__ << "::ERR: manifest::failed to open context " - << obj << dendl; + << raw_obj.oid << dendl; break; } ObjectWriteOperation op; cls_refcount_get(op, ref_tag, true); d_ctl.metadata_access_throttle.acquire(); - ldpp_dout(dpp, 20) << __func__ << "::inc ref-count on tail object: " << raw_obj.oid << dendl; + ldpp_dout(dpp, 20) << __func__ << 
"::inc ref-count on tail object: " + << raw_obj.oid << "::" << obj.obj.oid << dendl; rgw::AioResultList completed = aio->get(obj.obj, rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield), cost, id); @@ -629,14 +767,14 @@ namespace rgw::dedup { all_results.splice(all_results.end(), completed); if (ret < 0) { ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to copy obj=" << obj - << ", the error code = " << ret << dendl; + << ", ret=" << ret << " err is " << cpp_strerror(-ret) << dendl; break; } } if (ret == 0) { rgw::AioResultList completed = aio->drain(); - int ret = rgw::check_for_errors(completed); + ret = rgw::check_for_errors(completed); all_results.splice(all_results.end(), completed); if (ret == 0) { return 0; @@ -647,13 +785,14 @@ namespace rgw::dedup { } } - // if arrived here we failed somewhere -> rollback all ref-inc operations /* wait all pending op done */ rgw::AioResultList completed = aio->drain(); all_results.splice(all_results.end(), completed); int ret2 = 0; for (auto& aio_res : all_results) { if (aio_res.result < 0) { + ldpp_dout(dpp, 10) << __func__ << "::skip failed refcount inc: " + << aio_res.obj.oid << dendl; continue; // skip errors } rgw_rados_ref obj; @@ -664,219 +803,302 @@ namespace rgw::dedup { ObjectWriteOperation op; cls_refcount_put(op, ref_tag, true); + ldpp_dout(dpp, 10) << __func__ << "::rollback refcount inc on: " + << aio_res.obj.oid << dendl; rgw::AioResultList completed = aio->get(obj.obj, rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield), cost, id); ret2 = rgw::check_for_errors(completed); if (ret2 < 0) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: cleanup after error failed to drop reference on obj=" << aio_res.obj << dendl; + ldpp_dout(dpp, 1) << __func__ << "::ERR: cleanup after error failed to drop reference on obj=" + << aio_res.obj << dendl; } } completed = aio->drain(); ret2 = rgw::check_for_errors(completed); if (ret2 < 0) { ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to drain rollback ios, ret=" 
- << ret2 < bucket; - { - rgw_bucket b{p_rec->tenant_name, p_rec->bucket_name, p_rec->bucket_id}; - int ret = driver->load_bucket(dpp, b, &bucket, null_yield); - if (unlikely(ret != 0)) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: driver->load_bucket(): " - << cpp_strerror(-ret) << dendl; - return ret; - } - } + ldpp_dout(dpp, 20) << __func__ << "::DEDUP SRC:" + << p_src_rec->bucket_name << "/" << p_src_rec->obj_name + << "(" << src_head_size << ") ::TGT:" + << p_tgt_rec->bucket_name << "/" << p_tgt_rec->obj_name + << "(" << tgt_head_size << ")" << dendl; + ldpp_dout(dpp, 20) << __func__ << "::num_parts=" << p_tgt_rec->s.num_parts + << "::ETAG=" << etag_bl.to_str() << dendl; + } - string dummy_locator; - const rgw_obj_index_key key(p_rec->obj_name, p_rec->instance); - rgw_obj obj(bucket->get_key(), key); - get_obj_bucket_and_oid_loc(obj, *p_oid, dummy_locator); - RGWBucketInfo& bucket_info = bucket->get_info(); - return store->get_obj_head_ioctx(dpp, bucket_info, obj, p_ioctx); + //--------------------------------------------------------------------------- + /* The target (TGT) manifest must inherit the source (SRC) manifest, as both share + * the same tail objects. + * However, the TGT head object needs to maintain its unique identity, including + * its head-placement-rule and head-object parameters, which are stored in + * `rgw_obj`. + * + * The size of the TGT head object must be adjusted to match the SRC head size. + * This is straightforward when Split-Head is enabled, as both heads can be set to + * zero and all data is stored in the tail. + * + * A potential issue arises if the SRC and TGT have different head sizes and + * Split-Head is not used. + * While this scenario is unlikely in practice (as head-size is almost always 4MB), + * if it were to occur, we should abort the deduplication process to prevent data + * inconsistencies. 
+ */ + static void adjust_target_manifest(const RGWObjManifest &src_manifest, + const RGWObjManifest &tgt_manifest, + bufferlist &new_manifest_bl) + { + // first create new_manifest from the src_manifest + RGWObjManifest new_manifest(src_manifest); + + // then, adjust head-object parameters to match the tgt_manifest + const uint64_t src_head_size = src_manifest.get_head_size(); + const auto& tgt_placement_rule = tgt_manifest.get_head_placement_rule(); + const rgw_obj &tgt_head_obj = tgt_manifest.get_obj(); + + new_manifest.set_head(tgt_placement_rule, tgt_head_obj, src_head_size); + encode(new_manifest, new_manifest_bl); } //--------------------------------------------------------------------------- - static void init_cmp_pairs(const disk_record_t *p_rec, - const bufferlist &etag_bl, - bufferlist &hash_bl, // OUT PARAM + static void init_cmp_pairs(const DoutPrefixProvider *dpp, + const disk_record_t *p_rec, + const bufferlist &etag_bl, + bufferlist &hash_bl, // OUT PARAM librados::ObjectWriteOperation *p_op) { p_op->cmpxattr(RGW_ATTR_ETAG, CEPH_OSD_CMPXATTR_OP_EQ, etag_bl); - // TBD: do we really need the secondary compare using the full manifest? - // Can replace it with something cheaper like size/version? 
- p_op->cmpxattr(RGW_ATTR_MANIFEST, CEPH_OSD_CMPXATTR_OP_EQ, p_rec->manifest_bl); + bufferlist ref_tag_bl; + ref_tag_bl.append(p_rec->ref_tag); + if (p_rec->s.flags.is_ref_tag_from_tail()) { + p_op->cmpxattr(RGW_ATTR_TAIL_TAG, CEPH_OSD_CMPXATTR_OP_EQ, ref_tag_bl); + } + else { + p_op->cmpxattr(RGW_ATTR_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, ref_tag_bl); + } // BLAKE3 hash has 256 bit splitted into multiple 64bit units - const unsigned units = (256 / (sizeof(uint64_t)*8)); - static_assert(units == 4); - for (unsigned i = 0; i < units; i++) { + for (unsigned i = 0; i < HASH_UNITS; i++) { ceph::encode(p_rec->s.hash[i], hash_bl); } if (!p_rec->s.flags.hash_calculated()) { + ldpp_dout(dpp, 20) << __func__ << "::CMP HASH " << p_rec->obj_name << dendl; p_op->cmpxattr(RGW_ATTR_BLAKE3, CEPH_OSD_CMPXATTR_OP_EQ, hash_bl); } } //--------------------------------------------------------------------------- - int Background::dedup_object(const disk_record_t *p_src_rec, - const disk_record_t *p_tgt_rec, - md5_stats_t *p_stats, - bool has_shared_manifest_src) + static inline void build_manifest_hash_bl(const bufferlist &manifest_bl, + bufferlist &manifest_hash_bl) { - RGWObjManifest src_manifest; - try { - auto bl_iter = p_src_rec->manifest_bl.cbegin(); - decode(src_manifest, bl_iter); - } catch (buffer::error& err) { - ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad src manifest" << dendl; - return -EINVAL; - } - RGWObjManifest tgt_manifest; - try { - auto bl_iter = p_tgt_rec->manifest_bl.cbegin(); - decode(tgt_manifest, bl_iter); - } catch (buffer::error& err) { - ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad tgt manifest" << dendl; - return -EINVAL; - } - ldpp_dout(dpp, 20) << __func__ << "::DEDUP From: " - << p_src_rec->bucket_name << "/" << p_src_rec->obj_name << " -> " - << p_tgt_rec->bucket_name << "/" << p_tgt_rec->obj_name << dendl; + bufferlist hash_bl; + crypto::digest(manifest_bl).encode(hash_bl); + // Use a shorter hash (64bit instead of 160bit) + hash_bl.splice(0, 8, 
&manifest_hash_bl); + } + //--------------------------------------------------------------------------- + int Background::dedup_object(disk_record_t *p_src_rec, + disk_record_t *p_tgt_rec, + const RGWObjManifest &src_manifest, + const RGWObjManifest &tgt_manifest, + md5_stats_t *p_stats, + const dedup_table_t::value_t *p_src_val, + const std::string &tail_oid) + { + const uint64_t src_head_size = src_manifest.get_head_size(); + const uint64_t tgt_head_size = tgt_manifest.get_head_size(); bufferlist etag_bl; etag_to_bufferlist(p_tgt_rec->s.md5_high, p_tgt_rec->s.md5_low, p_tgt_rec->s.num_parts, &etag_bl); - ldpp_dout(dpp, 20) << __func__ << "::num_parts=" << p_tgt_rec->s.num_parts - << "::ETAG=" << etag_bl.to_str() << dendl; - - bufferlist hash_bl, manifest_hash_bl, tgt_hash_bl; - crypto::digest(p_src_rec->manifest_bl).encode(hash_bl); - // Use a shorter hash (64bit instead of 160bit) - hash_bl.splice(0, 8, &manifest_hash_bl); - librados::ObjectWriteOperation tgt_op; - init_cmp_pairs(p_tgt_rec, etag_bl, tgt_hash_bl, &tgt_op); - tgt_op.setxattr(RGW_ATTR_SHARE_MANIFEST, manifest_hash_bl); - tgt_op.setxattr(RGW_ATTR_MANIFEST, p_src_rec->manifest_bl); - if (p_tgt_rec->s.flags.hash_calculated()) { - tgt_op.setxattr(RGW_ATTR_BLAKE3, tgt_hash_bl); - p_stats->set_hash_attrs++; + bool should_print_debug = cct->_conf->subsys.should_gather(); + if (unlikely(should_print_debug)) { + dedup_object_log(dpp, p_src_rec, p_tgt_rec, src_head_size, tgt_head_size, etag_bl); } std::string src_oid, tgt_oid; librados::IoCtx src_ioctx, tgt_ioctx; - int ret1 = get_ioctx(dpp, driver, store, p_src_rec, &src_ioctx, &src_oid); - int ret2 = get_ioctx(dpp, driver, store, p_tgt_rec, &tgt_ioctx, &tgt_oid); - if (unlikely(ret1 != 0 || ret2 != 0)) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: failed get_ioctx()" << dendl; - return (ret1 ? 
ret1 : ret2); + int ret = get_ioctx(dpp, driver, store, p_src_rec, &src_ioctx, &src_oid); + if (unlikely(ret != 0)) { + // can't remove created tail object without an ioctx handle + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed SRC get_ioctx()" << dendl; + return ret; } - // TBD: Do we need to remove target RGW_ATTR_TAIL_TAG?? - string ref_tag = p_tgt_rec->ref_tag; + ret = get_ioctx(dpp, driver, store, p_tgt_rec, &tgt_ioctx, &tgt_oid); + if (unlikely(ret != 0)) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed TGT get_ioctx()" << dendl; + if (p_src_rec->s.flags.is_split_head()) { + remove_created_tail_object(dpp, src_ioctx, tail_oid, p_stats); + } + return ret; + } + + // we don't dedup head-objects so head-size must match (unless split-head) + // see explanation in adjust_target_manifest() + if (unlikely(src_head_size != 0 && src_head_size != tgt_head_size)) { + ldpp_dout(dpp, 5) << __func__ << "::abort! src_head_size=" << src_head_size + << "::tgt_head_size=" << tgt_head_size << dendl; + if (p_src_rec->s.flags.is_split_head()) { + remove_created_tail_object(dpp, src_ioctx, tail_oid, p_stats); + } + // TBD: can we create a test case (requires control over head-object-size)?? 
+ return -ECANCELED; + } + + const string &ref_tag = p_tgt_rec->ref_tag; ldpp_dout(dpp, 20) << __func__ << "::ref_tag=" << ref_tag << dendl; - int ret = inc_ref_count_by_manifest(ref_tag, src_oid, src_manifest); - if (ret == 0) { - d_ctl.metadata_access_throttle.acquire(); - ldpp_dout(dpp, 20) << __func__ << "::send TGT CLS (Shared_Manifest)" << dendl; - ret = tgt_ioctx.operate(tgt_oid, &tgt_op); - if (unlikely(ret != 0)) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: failed tgt_ioctx.operate(" - << tgt_oid << "), err is " << cpp_strerror(-ret) << dendl; - rollback_ref_by_manifest(ref_tag, src_oid, src_manifest); - return ret; + ret = inc_ref_count_by_manifest(ref_tag, src_oid, src_manifest); + if (unlikely(ret != 0)) { + if (p_src_rec->s.flags.is_split_head()) { + remove_created_tail_object(dpp, src_ioctx, tail_oid, p_stats); } + return ret; + } - // free tail objects based on TGT manifest - free_tail_objs_by_manifest(ref_tag, tgt_oid, tgt_manifest); + bufferlist manifest_hash_bl; + build_manifest_hash_bl(p_src_rec->manifest_bl, manifest_hash_bl); - if (!has_shared_manifest_src) { - // When SRC OBJ A has two or more dups (B, C) we set SHARED_MANIFEST - // after deduping B and update it in dedup_table, but don't update the - // disk-record (as require an expensive random-disk-write). - // When deduping C we can trust the shared_manifest state in the table and - // skip a redundant update to SRC object attribute + if (!p_src_val->has_shared_manifest()) { + // When SRC OBJ A has two or more dups (B, C) we set SHARED_MANIFEST + // after deduping B and update it in dedup_table, but don't update the + // disk-record (as require an expensive random-disk-write). 
+ // When deduping C we can trust the shared_manifest state in the table and + // skip a redundant update to SRC object attribute + librados::ObjectWriteOperation src_op; + { bufferlist src_hash_bl; - librados::ObjectWriteOperation src_op; - init_cmp_pairs(p_src_rec, etag_bl, src_hash_bl, &src_op); + init_cmp_pairs(dpp, p_src_rec, etag_bl, src_hash_bl, &src_op); src_op.setxattr(RGW_ATTR_SHARE_MANIFEST, manifest_hash_bl); - if (p_src_rec->s.flags.hash_calculated()) { + if (p_src_rec->s.flags.hash_calculated() && !p_src_val->has_valid_hash()){ src_op.setxattr(RGW_ATTR_BLAKE3, src_hash_bl); + ldpp_dout(dpp, 20) << __func__ <<"::Set SRC Strong Hash in CLS"<< dendl; p_stats->set_hash_attrs++; } + } - d_ctl.metadata_access_throttle.acquire(); - ldpp_dout(dpp, 20) << __func__ <<"::send SRC CLS (Shared_Manifest)"<< dendl; - ret = src_ioctx.operate(src_oid, &src_op); - if (unlikely(ret != 0)) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: failed src_ioctx.operate(" - << src_oid << "), err is " << cpp_strerror(-ret)<s.flags.is_split_head()) { + ldpp_dout(dpp, 20) << __func__ <<"::SRC-Split (truncate)" << dendl; + src_op.setxattr(RGW_ATTR_MANIFEST, p_src_rec->manifest_bl); + src_op.truncate(0); + p_stats->split_head_src++; + } + d_ctl.metadata_access_throttle.acquire(); + ldpp_dout(dpp, 20) << __func__ <<"::send SRC CLS"<< dendl; + ret = src_ioctx.operate(src_oid, &src_op); + if (unlikely(ret != 0)) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed src_ioctx.operate(" + << src_oid << "), err is " << cpp_strerror(-ret)<s.flags.is_split_head()) { + remove_created_tail_object(dpp, src_ioctx, tail_oid, p_stats); } + return ret; + } + } + + librados::ObjectWriteOperation tgt_op; + { + bufferlist tgt_hash_bl; + init_cmp_pairs(dpp, p_tgt_rec, etag_bl, tgt_hash_bl, &tgt_op); + tgt_op.setxattr(RGW_ATTR_SHARE_MANIFEST, manifest_hash_bl); + bufferlist new_manifest_bl; + adjust_target_manifest(src_manifest, tgt_manifest, new_manifest_bl); + tgt_op.setxattr(RGW_ATTR_MANIFEST, 
new_manifest_bl); + //tgt_op.setxattr(RGW_ATTR_MANIFEST, p_src_rec->manifest_bl); + if (p_tgt_rec->s.flags.hash_calculated()) { + tgt_op.setxattr(RGW_ATTR_BLAKE3, tgt_hash_bl); + ldpp_dout(dpp, 20) << __func__ <<"::Set TGT Strong Hash in CLS"<< dendl; + p_stats->set_hash_attrs++; } } + // If failed before this point and split-head -> remove the new tail-object + if (src_head_size == 0 && tgt_head_size > 0) { + ldpp_dout(dpp, 20) << __func__ <<"::TGT-Split OP (truncate)" << dendl; + p_tgt_rec->s.flags.set_split_head(); + tgt_op.truncate(0); + p_stats->split_head_tgt++; + } + d_ctl.metadata_access_throttle.acquire(); + ldpp_dout(dpp, 20) << __func__ << "::send TGT CLS" << dendl; + ret = tgt_ioctx.operate(tgt_oid, &tgt_op); + if (unlikely(ret != 0)) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed tgt_ioctx.operate(" + << tgt_oid << "), err is " << cpp_strerror(-ret) << dendl; + rollback_ref_by_manifest(ref_tag, src_oid, src_manifest); + return ret; + } + + // free tail objects based on TGT manifest + free_tail_objs_by_manifest(ref_tag, tgt_oid, tgt_manifest); + // do we need to set compression on the head object or is it set on tail? 
// RGW_ATTR_COMPRESSION return ret; } //--------------------------------------------------------------------------- - int Background::calc_object_blake3(const disk_record_t *p_rec, uint8_t *p_hash) + int Background::calc_object_blake3(const RGWObjManifest &manifest, + disk_record_t *p_rec, + uint8_t *p_hash, + blake3_hasher *p_pre_calc_hmac) { - ldpp_dout(dpp, 20) << __func__ << "::obj_name=" << p_rec->obj_name << dendl; - RGWObjManifest manifest; - try { - auto bl_iter = p_rec->manifest_bl.cbegin(); - decode(manifest, bl_iter); - } catch (buffer::error& err) { - ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad src manifest for: " - << p_rec->obj_name << dendl; - return -EINVAL; + ldpp_dout(dpp, 20) << __func__ << "::p_rec->obj_name=" << p_rec->obj_name << dendl; + + blake3_hasher _hmac, *p_hmac = nullptr; + if (!p_pre_calc_hmac) { + blake3_hasher_init(&_hmac); + p_hmac = &_hmac; + } + else { + p_hmac = p_pre_calc_hmac; } - blake3_hasher hmac; - blake3_hasher_init(&hmac); for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p) { - rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados); - rgw_rados_ref obj; - int ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj); - if (ret < 0) { - ldpp_dout(dpp, 1) << __func__ << "::failed rgw_get_rados_ref() for oid: " - << raw_obj.oid << ", err is " << cpp_strerror(-ret) << dendl; - return ret; - } + uint64_t offset = p.get_stripe_ofs(); + const rgw_obj_select& os = p.get_location(); + if (offset > 0 || !p_pre_calc_hmac) { + rgw_raw_obj raw_obj = os.get_raw_obj(rados); + rgw_rados_ref obj; + int ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj); + if (ret < 0) { + ldpp_dout(dpp, 1) << __func__ << "::failed rgw_get_rados_ref() for oid=" + << raw_obj.oid << ", err is " << cpp_strerror(-ret) << dendl; + return ret; + } - bufferlist bl; - librados::IoCtx ioctx = obj.ioctx; - // read full object - ret = ioctx.read(raw_obj.oid, bl, 0, 0); - if (ret > 0) { + librados::IoCtx ioctx = obj.ioctx; + 
bufferlist bl; + // read full object + ret = ioctx.read(raw_obj.oid, bl, 0, 0); + if (unlikely(ret <= 0)) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to read oid " + << raw_obj.oid << ", err is " << cpp_strerror(-ret) << dendl; + return ret; + } for (const auto& bptr : bl.buffers()) { - blake3_hasher_update(&hmac, (const unsigned char *)bptr.c_str(), bptr.length()); + blake3_hasher_update(p_hmac, (const unsigned char *)bptr.c_str(), bptr.length()); } } - else { - ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to read " << raw_obj.oid - << ", error is " << cpp_strerror(-ret) << dendl; - return ret; - } } - - blake3_hasher_finalize(&hmac, p_hash, BLAKE3_OUT_LEN); + blake3_hasher_finalize(p_hmac, p_hash, BLAKE3_OUT_LEN); + p_rec->s.flags.set_hash_calculated(); + p_rec->s.flags.set_has_valid_hash(); return 0; } @@ -890,28 +1112,58 @@ namespace rgw::dedup { { ldpp_dout(dpp, 20) << __func__ << "::bucket=" << p_tgt_rec->bucket_name << ", obj=" << p_tgt_rec->obj_name + << ", bytes_size=" << p_tgt_rec->s.obj_bytes_size << ", block_id=" << block_id - << ", rec_id=" << (int)rec_id - << ", md5_shard=" << (int)md5_shard << dendl; - - ldpp_dout(dpp, 20) << __func__ << "::md5_shard=" << (int)md5_shard - << "::" << p_tgt_rec->bucket_name - << "/" << p_tgt_rec->obj_name + << ", rec_id=" << (int)rec_id << "\n" + << ", md5_shard=" << (int)md5_shard << "::num_parts=" << p_tgt_rec->s.num_parts << "::ETAG=" << std::hex << p_tgt_rec->s.md5_high << p_tgt_rec->s.md5_low << std::dec << dendl; } //--------------------------------------------------------------------------- - int Background::add_obj_attrs_to_record(rgw_bucket *p_rb, - disk_record_t *p_rec, + static inline bool invalid_tail_placement(const rgw_bucket_placement& tail_placement) + { + return (tail_placement.bucket.name.empty() || tail_placement.placement_rule.name.empty()); + } + + //--------------------------------------------------------------------------- + static void set_explicit_tail_placement(const 
DoutPrefixProvider* dpp, + RGWObjManifest *p_manifest,// IN-OUT PARAM + md5_stats_t *p_stats) + { + p_stats->manifest_no_tail_placement++; + ldpp_dout(dpp, 20) << __func__ << "::invalid_tail_placement -> update" << dendl; + const rgw_bucket_placement& tail_placement = p_manifest->get_tail_placement(); + const rgw_bucket *p_bucket = &tail_placement.bucket; + + if (tail_placement.bucket.name.empty()) { + // bucket was not set in tail_placement, force the head bucket explicitly + const rgw_obj& head_obj = p_manifest->get_obj(); + p_bucket = &head_obj.bucket; + } + + if (tail_placement.placement_rule.name.empty()) { + // explicitly use the head_placement_rule for tail objects and update bucket + // if needed + const auto &head_placement_rule = p_manifest->get_head_placement_rule(); + p_manifest->set_tail_placement(head_placement_rule, *p_bucket); + } + else { + // otherwise, keep the tail_placement_rule in place (but still update bucket) + p_manifest->set_tail_placement(tail_placement.placement_rule, *p_bucket); + } + } + + //--------------------------------------------------------------------------- + int Background::add_obj_attrs_to_record(disk_record_t *p_rec, const rgw::sal::Attrs &attrs, - dedup_table_t *p_table, md5_stats_t *p_stats) /*IN-OUT*/ { // if TAIL_TAG exists -> use it as ref-tag, eitherwise take ID_TAG auto itr = attrs.find(RGW_ATTR_TAIL_TAG); if (itr != attrs.end()) { + p_rec->s.flags.set_ref_tag_from_tail(); p_rec->ref_tag = itr->second.to_str(); } else { @@ -929,10 +1181,11 @@ namespace rgw::dedup { // clear bufferlist first p_rec->manifest_bl.clear(); + bool need_to_split_head = false; + RGWObjManifest manifest; itr = attrs.find(RGW_ATTR_MANIFEST); if (itr != attrs.end()) { const bufferlist &bl = itr->second; - RGWObjManifest manifest; try { auto bl_iter = bl.cbegin(); decode(manifest, bl_iter); @@ -941,12 +1194,13 @@ namespace rgw::dedup { << "::ERROR: unable to decode manifest" << dendl; return -EINVAL; } + need_to_split_head = 
should_split_head(manifest.get_head_size(), + p_rec->s.obj_bytes_size); // force explicit tail_placement as the dedup could be on another bucket const rgw_bucket_placement& tail_placement = manifest.get_tail_placement(); - if (tail_placement.bucket.name.empty()) { - ldpp_dout(dpp, 20) << __func__ << "dedup::updating tail placement" << dendl; - manifest.set_tail_placement(tail_placement.placement_rule, *p_rb); + if (unlikely(invalid_tail_placement(tail_placement))) { + set_explicit_tail_placement(dpp, &manifest, p_stats); encode(manifest, p_rec->manifest_bl); } else { @@ -958,6 +1212,18 @@ namespace rgw::dedup { ldpp_dout(dpp, 5) << __func__ << "::ERROR: no manifest" << dendl; return -EINVAL; } + const auto &head_placement_rule = manifest.get_head_placement_rule(); + const std::string& storage_class = + rgw_placement_rule::get_canonical_storage_class(head_placement_rule.storage_class); + + // p_rec holds an the storage_class value taken from the bucket-index/obj-attr + if (unlikely(storage_class != p_rec->stor_class)) { + ldpp_dout(dpp, 5) << __func__ << "::ERROR::manifest storage_class=" + << storage_class << " != " << "::bucket-index storage_class=" + << p_rec->stor_class << dendl; + p_stats->different_storage_class++; + return -EINVAL; + } itr = attrs.find(RGW_ATTR_SHARE_MANIFEST); if (itr != attrs.end()) { @@ -983,14 +1249,13 @@ namespace rgw::dedup { if (itr != attrs.end()) { try { auto bl_iter = itr->second.cbegin(); - // BLAKE3 hash 256 bit splitted into multiple 64bit units - const unsigned units = (256 / (sizeof(uint64_t)*8)); - static_assert(units == 4); - for (unsigned i = 0; i < units; i++) { + // BLAKE3 hash has 256 bit splitted into multiple 64bit units + for (unsigned i = 0; i < HASH_UNITS; i++) { uint64_t val; ceph::decode(val, bl_iter); p_rec->s.hash[i] = val; } + p_rec->s.flags.set_has_valid_hash(); p_stats->valid_hash_attrs++; return 0; } catch (buffer::error& err) { @@ -999,16 +1264,17 @@ namespace rgw::dedup { } } + // if arrived here we need to 
calculate string hash p_stats->invalid_hash_attrs++; - // TBD: redundant memset... memset(p_rec->s.hash, 0, sizeof(p_rec->s.hash)); - // BLAKE3_OUT_LEN is 32 Bytes - int ret = calc_object_blake3(p_rec, (uint8_t*)p_rec->s.hash); - if (ret == 0) { - p_rec->s.flags.set_hash_calculated(); - } - return ret; + if (!need_to_split_head) { + ldpp_dout(dpp, 20) << __func__ << "::CALC Object Strong Hash::" + << p_rec->obj_name << dendl; + return calc_object_blake3(manifest, p_rec, (uint8_t*)p_rec->s.hash); + } + // else, differ strong-hash calculation for next step and piggy back split-head + return 0; } //--------------------------------------------------------------------------- @@ -1035,7 +1301,6 @@ namespace rgw::dedup { storage_class_idx_t sc_idx = remapper->remap(p_rec->stor_class, dpp, &p_stats->failed_map_overflow); if (unlikely(sc_idx == remapper_t::NULL_IDX)) { - // TBD: need stat counters return -EOVERFLOW; } key_t key_from_bucket_index(p_rec->s.md5_high, p_rec->s.md5_low, size_4k_units, @@ -1043,7 +1308,7 @@ namespace rgw::dedup { dedup_table_t::value_t src_val; int ret = p_table->get_val(&key_from_bucket_index, &src_val); if (ret != 0) { - if (ondisk_byte_size <= d_min_obj_size_for_dedup && p_rec->s.num_parts == 0) { + if (!dedupable_object(p_rec->multipart_object(), d_min_obj_size_for_dedup, ondisk_byte_size)) { // record has no valid entry in table because it is a too small // It was loaded to table for calculation and then purged p_stats->skipped_purged_small++; @@ -1113,6 +1378,19 @@ namespace rgw::dedup { } const rgw::sal::Attrs& attrs = p_obj->get_attrs(); + if (src_val.has_shared_manifest() && (attrs.find(RGW_ATTR_SHARE_MANIFEST) != attrs.end())) { + // A shared_manifest object can't be a dedup target + // We only need to keep a single shared_manifest object + // to be used as a dedup-source (which we already got) + p_stats->skipped_shared_manifest++; + uint64_t dedupable_objects_bytes = __calc_deduped_bytes(p_rec->s.num_parts, + ondisk_byte_size); + 
p_stats->shared_manifest_dedup_bytes += dedupable_objects_bytes; + ldpp_dout(dpp, 20) << __func__ << "::(1)skipped shared_manifest, SRC::block_id=" + << src_val.block_idx << "::rec_id=" << (int)src_val.rec_id << dendl; + return 0; + } + if (attrs.find(RGW_ATTR_CRYPT_MODE) != attrs.end()) { p_stats->ingress_skip_encrypted++; p_stats->ingress_skip_encrypted_bytes += ondisk_byte_size; @@ -1121,7 +1399,7 @@ namespace rgw::dedup { return 0; } - // TBD: We should be able to support RGW_ATTR_COMPRESSION when all copies are compressed + // TBD-Future: We should be able to support RGW_ATTR_COMPRESSION when all copies are compressed if (attrs.find(RGW_ATTR_COMPRESSION) != attrs.end()) { p_stats->ingress_skip_compressed++; p_stats->ingress_skip_compressed_bytes += ondisk_byte_size; @@ -1154,6 +1432,16 @@ namespace rgw::dedup { else { storage_class = RGW_STORAGE_CLASS_STANDARD; } + + // p_rec holds an the storage_class value taken from the bucket-index + if (unlikely(storage_class != p_rec->stor_class)) { + ldpp_dout(dpp, 5) << __func__ << "::ERROR::ATTR storage_class=" + << storage_class << " != " << "::bucket-index storage_class=" + << p_rec->stor_class << dendl; + p_stats->different_storage_class++; + return -EINVAL; + } + // no need to check for remap success as we compare keys bellow sc_idx = remapper->remap(storage_class, dpp, &p_stats->failed_map_overflow); key_t key_from_obj(parsed_etag.md5_high, parsed_etag.md5_low, @@ -1169,7 +1457,7 @@ namespace rgw::dedup { // reset flags p_rec->s.flags.clear(); - ret = add_obj_attrs_to_record(&b, p_rec, attrs, p_table, p_stats); + ret = add_obj_attrs_to_record(p_rec, attrs, p_stats); if (unlikely(ret != 0)) { ldpp_dout(dpp, 5) << __func__ << "::ERR: failed add_obj_attrs_to_record() ret=" << ret << "::" << cpp_strerror(-ret) << dendl; @@ -1180,13 +1468,16 @@ namespace rgw::dedup { ret = p_disk->add_record(d_dedup_cluster_ioctx, p_rec, &rec_info); if (ret == 0) { // set the disk_block_id_t to this unless the existing disk_block_id 
is marked as shared-manifest - ceph_assert(rec_info.rec_id < MAX_REC_IN_BLOCK); + if (unlikely(rec_info.rec_id >= MAX_REC_IN_BLOCK)) { + p_stats->illegal_rec_id++; + } ldpp_dout(dpp, 20) << __func__ << "::" << p_rec->bucket_name << "/" << p_rec->obj_name << " was written to block_idx=" << rec_info.block_id << "::rec_id=" << (int)rec_info.rec_id - << "::shared_manifest=" << p_rec->has_shared_manifest() << dendl; + << "::shared_manifest=" + << p_rec->s.flags.has_shared_manifest() << dendl; p_table->update_entry(&key_from_bucket_index, rec_info.block_id, - rec_info.rec_id, p_rec->has_shared_manifest()); + rec_info.rec_id, p_rec->s.flags.has_shared_manifest()); } else { ldpp_dout(dpp, 5) << __func__ << "::ERR: Failed p_disk->add_record()"<< dendl; @@ -1198,17 +1489,18 @@ namespace rgw::dedup { } //--------------------------------------------------------------------------- - static int write_blake3_object_attribute(const DoutPrefixProvider* const dpp, - rgw::sal::Driver* driver, - rgw::sal::RadosStore *store, - const disk_record_t *p_rec) + static int write_hash_object_attribute(const DoutPrefixProvider* const dpp, + rgw::sal::Driver* driver, + rgw::sal::RadosStore *store, + const disk_record_t *p_rec, + md5_stats_t *p_stats) { bufferlist etag_bl; bufferlist hash_bl; librados::ObjectWriteOperation op; etag_to_bufferlist(p_rec->s.md5_high, p_rec->s.md5_low, p_rec->s.num_parts, &etag_bl); - init_cmp_pairs(p_rec, etag_bl, hash_bl /*OUT PARAM*/, &op); + init_cmp_pairs(dpp, p_rec, etag_bl, hash_bl /*OUT PARAM*/, &op); op.setxattr(RGW_ATTR_BLAKE3, hash_bl); std::string oid; @@ -1224,9 +1516,420 @@ namespace rgw::dedup { ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ioctx.operate(" << oid << "), err is " << cpp_strerror(-ret) << dendl; } + ldpp_dout(dpp, 20) << __func__ <<"::Write Strong Hash to " << p_rec->obj_name + << dendl; + p_stats->set_hash_attrs++; return ret; } + //--------------------------------------------------------------------------- + static bool 
compare_strong_hash(const DoutPrefixProvider *const dpp, + const disk_record_t *p_src_rec, + const disk_record_t *p_tgt_rec, + md5_stats_t *p_stats) + { + if (unlikely(0 != memcmp(p_src_rec->s.hash, p_tgt_rec->s.hash, sizeof(p_src_rec->s.hash)))) { + p_stats->hash_mismatch++; + ldpp_dout(dpp, 10) << __func__ << "::HASH mismatch" << dendl; + return false; + } + ldpp_dout(dpp, 20) << __func__ << "::SRC-TGT Strong-Hash match" << dendl; + // all is good + return true; + } + + //--------------------------------------------------------------------------- + static int read_hash_and_manifest(const DoutPrefixProvider *const dpp, + rgw::sal::Driver *driver, + RGWRados *rados, + disk_record_t *p_rec) + { + librados::IoCtx ioctx; + std::string oid; + int ret = get_ioctx(dpp, driver, rados, p_rec, &ioctx, &oid); + if (unlikely(ret != 0)) { + ldpp_dout(dpp, 5) << __func__ << "::ERR: failed get_ioctx()" << dendl; + return ret; + } + + std::map attrset; + ret = ioctx.getxattrs(oid, attrset); + if (unlikely(ret < 0)) { + ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ioctx.getxattrs(" + << oid << "), err is " << cpp_strerror(-ret) << dendl; + return ret; + } + + auto itr = attrset.find(RGW_ATTR_BLAKE3); + if (itr != attrset.end()) { + try { + auto bl_iter = itr->second.cbegin(); + // BLAKE3 hash has 256 bit splitted into multiple 64bit units + for (unsigned i = 0; i < HASH_UNITS; i++) { + uint64_t val; + ceph::decode(val, bl_iter); + p_rec->s.hash[i] = val; + } + p_rec->s.flags.set_has_valid_hash(); + // the hash was taken directly from the object attributes and not calculated + p_rec->s.flags.clear_hash_calculated(); + } catch (buffer::error& err) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed HASH decode" << dendl; + return -EINVAL; + } + } + else { + ldpp_dout(dpp, 1) << __func__ << "::ERR: No HASH attribute" << dendl; + return -ENOENT; + } + + itr = attrset.find(RGW_ATTR_MANIFEST); + if (itr != attrset.end()) { + ldpp_dout(dpp, 20) << __func__ << "::Got Manifest " << 
p_rec->obj_name << dendl; + p_rec->manifest_bl = itr->second; + p_rec->s.manifest_len = p_rec->manifest_bl.length(); + } + else { + ldpp_dout(dpp, 1) << __func__ << "::ERR: No Manifest attribute" << dendl; + return -ENOENT; + } + + return 0; + } + + //--------------------------------------------------------------------------- + static void set_explicit_manifest(RGWObjManifest *p_manifest, + std::map &objs_map) + { + uint64_t obj_size = p_manifest->get_obj_size(); + p_manifest->set_head_size(0); + p_manifest->set_max_head_size(0); + p_manifest->set_prefix(""); + p_manifest->clear_rules(); + p_manifest->set_explicit(obj_size, objs_map); + } + + //--------------------------------------------------------------------------- + // This code is based on RGWObjManifest::convert_to_explicit() + static void build_explicit_objs_map(const DoutPrefixProvider *dpp, + RGWRados *rados, + const RGWObjManifest &manifest, + const rgw_bucket *p_bucket, + std::map *p_objs_map, + const std::string &tail_name, + md5_stats_t *p_stats) + { + bool manifest_raw_obj_logged = false; + unsigned idx = 0; + auto p = manifest.obj_begin(dpp); + while (p != manifest.obj_end(dpp)) { + const uint64_t offset = p.get_stripe_ofs(); + const rgw_obj_select& os = p.get_location(); + ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"]OBJ: " + << os.get_raw_obj(rados).oid << "::ofs=" << p.get_ofs() + << "::strp_offset=" << offset << dendl; + + RGWObjManifestPart& part = (*p_objs_map)[offset]; + part.loc_ofs = 0; + + if (offset == 0) { + ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] HEAD OBJ: " + << os.get_raw_obj(rados).oid << dendl; + const rgw_obj &head_obj = manifest.get_obj(); + const rgw_obj_key &head_key = head_obj.key; + // TBD: Can we have different instance/ns values for head/tail ?? + // Should we take the instance/ns from the head or tail? + // Maybe should refuse objects with different instance/ns on head/tail ? 
+ rgw_obj_key tail_key(tail_name, head_key.instance, head_key.ns); + rgw_obj tail_obj(*p_bucket, tail_key); + part.loc = tail_obj; + } + else { + // RGWObjManifest::convert_to_explicit() is assuming raw_obj, but looking + // at the RGWObjManifest::obj_iterator code it is clear the obj is not raw. + // If it happens to be raw we still handle it correctly (and inc stat-count) + std::optional obj_opt = os.get_head_obj(); + if (obj_opt.has_value()) { + part.loc = obj_opt.value(); + } + else { + // report raw object in manifest only once + if (!manifest_raw_obj_logged) { + manifest_raw_obj_logged = true; + ldpp_dout(dpp, 10) << __func__ << "::WARN: obj is_raw" << dendl; + p_stats->manifest_raw_obj++; + } + const rgw_raw_obj& raw = os.get_raw_obj(rados); + RGWSI_Tier_RADOS::raw_obj_to_obj(*p_bucket, raw, &part.loc); + } + } + + ++p; + uint64_t next_offset = p.get_stripe_ofs(); + part.size = next_offset - offset; + idx++; + } // while (p != manifest.obj_end()) + } + + //--------------------------------------------------------------------------- + int Background::split_head_object(disk_record_t *p_src_rec, // IN-OUT PARAM + RGWObjManifest &src_manifest, // IN/OUT PARAM + const disk_record_t *p_tgt_rec, + std::string &tail_oid, // OUT PARAM + md5_stats_t *p_stats) + { + ldpp_dout(dpp, 20) << __func__ << "::" << p_src_rec->obj_name << "::" + << p_src_rec->s.obj_bytes_size << dendl; + + uint64_t head_size = src_manifest.get_head_size(); + bufferlist bl; + std::string head_oid; + librados::IoCtx ioctx; + int ret = get_ioctx(dpp, driver, rados, p_src_rec, &ioctx, &head_oid); + if (unlikely(ret != 0)) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed get_ioctx()" << dendl; + return ret; + } + + // read the full rados head-object + ldpp_dout(dpp, 20) << __func__ << "::ioctx.read(" << head_oid << ")" << dendl; + ret = ioctx.read(head_oid, bl, 0, 0); + if (unlikely(ret != (int)head_size)) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to read " << head_oid + << ", ret=" << 
ret << ", error is " << cpp_strerror(-ret) << dendl; + return ret; + } + + // we might have a valid hash left from a failed dedup (mismatch SRC/TGT) + if (!p_src_rec->s.flags.has_valid_hash()) { + ldpp_dout(dpp, 20) << __func__ << "::calc BLK3 for SRC " + << p_src_rec->obj_name << dendl; + blake3_hasher hmac; + blake3_hasher_init(&hmac); + for (const auto& bptr : bl.buffers()) { + blake3_hasher_update(&hmac, (const unsigned char *)bptr.c_str(), + bptr.length()); + } + uint8_t *p_hash = (uint8_t*)p_src_rec->s.hash; + ret = calc_object_blake3(src_manifest, p_src_rec, p_hash, &hmac); + if (unlikely(ret != 0)) { + return ret; + } + + // cancel split-head operation if strong hash differ + if (unlikely(!compare_strong_hash(dpp, p_src_rec, p_tgt_rec, p_stats))) { + return -ECANCELED; + } + } + + bool exclusive = true; // block overwrite + std::string tail_name = generate_split_head_tail_name(src_manifest); + const rgw_bucket_placement &tail_placement = src_manifest.get_tail_placement(); + // Tail placement_rule was fixed before committed to SLAB, if looks bad -> abort + if (unlikely(invalid_tail_placement(tail_placement))) { + p_stats->split_head_no_tail_placement++; + ldpp_dout(dpp, 1) << __func__ << "::invalid_tail_placement -> abort" << dendl; + return -EINVAL; + } + + const rgw_bucket *p_bucket = &tail_placement.bucket; + // tail objects might be on another storage_class/pool, need another ioctx + librados::IoCtx tail_ioctx; + ret = get_ioctx_internal(dpp, driver, store, tail_name, p_src_rec->instance, + *p_bucket, &tail_ioctx, &tail_oid); + if (unlikely(ret != 0)) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed get_ioctx_internal()" << dendl; + return ret; + } + + ret = tail_ioctx.create(tail_oid, exclusive); + if (ret == 0) { + ldpp_dout(dpp, 20) << __func__ << "::successfully created: " << tail_oid << dendl; + } + else if (ret == -EEXIST) { + // should not happen as we take the prefix with unused counter 0 + // better to skip this dedup opportunity + 
ldpp_dout(dpp, 1) << __func__ << "::ERR object " << tail_oid << " exists!" << dendl; + p_stats->failed_split_head_creat++; + return ret; + } + else{ + ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to create " << tail_oid + <<" with: "<< cpp_strerror(-ret) << ", ret=" << ret <obj_name << dendl; + ret = calc_object_blake3(tgt_manifest, p_tgt_rec, (uint8_t*)p_tgt_rec->s.hash); + if (unlikely(ret != 0)) { + // Don't run dedup without a valid strong hash + return false; + } + } + + // SRC hash could have been calculated and stored in obj-attributes before + // (will happen when we got multiple targets) + if (!p_src_rec->s.flags.has_valid_hash() && p_src_val->has_valid_hash()) { + // read the manifest and strong hash from the head-object attributes + ldpp_dout(dpp, 20) << __func__ << "::Fetch SRC strong hash from head-object::" + << p_src_rec->obj_name << dendl; + if (unlikely(read_hash_and_manifest(dpp, driver, rados, p_src_rec) != 0)) { + return false; + } + try { + auto bl_iter = p_src_rec->manifest_bl.cbegin(); + decode(src_manifest, bl_iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed manifest decode" << dendl; + return false; + } + } + + // check hash before trying to split head (can be skipped if not equal) + if (p_src_rec->s.flags.has_valid_hash()) { + if (unlikely(!compare_strong_hash(dpp, p_src_rec, p_tgt_rec, p_stats))) { + return false; + } + } + + // we might still need to split-head here when hash is valid + // can happen if we failed compare before (md5-collison) and stored the src hash + // in the obj-attributes + uint64_t head_size = src_manifest.get_head_size(); + if (should_split_head(head_size, src_manifest.get_obj_size())) { + ret = split_head_object(p_src_rec, src_manifest, p_tgt_rec, tail_oid, p_stats); + // compare_strong_hash() is called internally by split_head_object() + return (ret == 0); + } + else if (!p_src_rec->s.flags.has_valid_hash()) { + // object not targeted for split_head it should have a 
valid hash -> skip it + ldpp_dout(dpp, 5) << __func__ + << "::ERR: object not targeted for split_head has no hash" << dendl; + p_stats->invalid_hash_no_split_head++; + return false; + } + + return true; + } + + //--------------------------------------------------------------------------- + static bool parse_manifests(const DoutPrefixProvider *dpp, + const disk_record_t *p_src_rec, + const disk_record_t *p_tgt_rec, + RGWObjManifest *p_src_manifest, + RGWObjManifest *p_tgt_manifest) + { + bool valid_src_manifest = false; + try { + auto bl_iter = p_src_rec->manifest_bl.cbegin(); + decode(*p_src_manifest, bl_iter); + valid_src_manifest = true; + bl_iter = p_tgt_rec->manifest_bl.cbegin(); + decode(*p_tgt_manifest, bl_iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad " + << (valid_src_manifest? "TGT" : "SRC") + << " manifest" << dendl; + return -EINVAL; + } + + return 0; + } + + //--------------------------------------------------------------------------- + static bool has_shared_tail_objects(const DoutPrefixProvider *dpp, + RGWRados *rados, + const disk_record_t *p_src_rec, + const disk_record_t *p_tgt_rec, + const RGWObjManifest &src_manifest, + const RGWObjManifest &tgt_manifest, + md5_stats_t *p_stats) + { + // Build a vector with all tail-objects on the SRC and then iterate over + // the TGT tail-objects looking for a single tail-object in both manifets. + // If found -> abort the dedup + // The only case leading to this scenario is server-side-copy + // It is probably enough to scan the first few tail-objects, but better safe... 
+ std::string src_oid = build_oid(p_src_rec->bucket_id, p_src_rec->obj_name); + std::string tgt_oid = build_oid(p_tgt_rec->bucket_id, p_tgt_rec->obj_name); + std::vector vec; + unsigned idx = 0; + for (auto p = src_manifest.obj_begin(dpp); p != src_manifest.obj_end(dpp); ++p, ++idx) { + rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados); + if (src_oid != raw_obj.oid) { + ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"]" << raw_obj.oid << dendl; + vec.push_back(raw_obj.oid); + } + else { + ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: " + << raw_obj.oid << dendl; + continue; + } + } + idx = 0; + for (auto p = tgt_manifest.obj_begin(dpp); p != tgt_manifest.obj_end(dpp); ++p, ++idx) { + rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados); + if (tgt_oid != raw_obj.oid) { + ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"]" << raw_obj.oid << dendl; + // Search for the tail_obj in the vector + // should be one of the first entries (first or second) + auto itr = std::find(vec.begin(), vec.end(), raw_obj.oid); + if (unlikely(itr != vec.end())) { + ldpp_dout(dpp, 10) << __func__ << "::tail obj " << raw_obj.oid + << " exists on both SRC and TGT Objects -> Abort DEDUP!"<< dendl; + p_stats->skip_shared_tail_objs ++; + return true; + } + } + else { + ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: " + << raw_obj.oid << dendl; + continue; + } + } + + return false; + } + //--------------------------------------------------------------------------- // We purged all entries not marked for-dedup (i.e. singleton bit is set) from the table // so all entries left are sources of dedup with multiple copies. @@ -1238,35 +1941,39 @@ namespace rgw::dedup { // we can withstand most errors moving to the next object // only report an error if we recived a stop scan request! 
// - int Background::try_deduping_record(dedup_table_t *p_table, - const disk_record_t *p_tgt_rec, - disk_block_id_t block_id, - record_id_t rec_id, - md5_shard_t md5_shard, - md5_stats_t *p_stats, /* IN-OUT */ - remapper_t *remapper) - { - bool should_print_debug = cct->_conf->subsys.should_gather(); + int Background::try_deduping_record(dedup_table_t *p_table, + disk_record_t *p_tgt_rec, + disk_block_id_t block_id, + record_id_t rec_id, + md5_shard_t md5_shard, + md5_stats_t *p_stats, /* IN-OUT */ + remapper_t *remapper) + { + bool should_print_debug = cct->_conf->subsys.should_gather(); if (unlikely(should_print_debug)) { print_record(dpp, p_tgt_rec, block_id, rec_id, md5_shard); } - uint32_t size_4k_units = byte_size_to_disk_blocks(p_tgt_rec->s.obj_bytes_size); storage_class_idx_t sc_idx = remapper->remap(p_tgt_rec->stor_class, dpp, &p_stats->failed_map_overflow); - ceph_assert(sc_idx != remapper_t::NULL_IDX); + if (unlikely(sc_idx == remapper_t::NULL_IDX)) { + ldpp_dout(dpp, 5) << __func__ << "::invalid_storage_class_mapping for " + << p_tgt_rec->stor_class << "::" << p_tgt_rec->obj_name << dendl; + p_stats->invalid_storage_class_mapping++; + return 0; + } key_t key(p_tgt_rec->s.md5_high, p_tgt_rec->s.md5_low, size_4k_units, p_tgt_rec->s.num_parts, sc_idx); dedup_table_t::value_t src_val; int ret = p_table->get_val(&key, &src_val); - if (ret != 0) { + if (unlikely(ret != 0)) { // record has no valid entry in table because it is a singleton // should never happened since we purged all singletons before ldpp_dout(dpp, 5) << __func__ << "::skipped singleton::" << p_tgt_rec->bucket_name << "/" << p_tgt_rec->obj_name << "::num_parts=" << p_tgt_rec->s.num_parts << "::ETAG=" << std::hex << p_tgt_rec->s.md5_high << p_tgt_rec->s.md5_low << std::dec << dendl; - ceph_abort("Unexpcted singleton"); + p_stats->singleton_after_purge++; return 0; } @@ -1275,86 +1982,115 @@ namespace rgw::dedup { if (block_id == src_block_id && rec_id == src_rec_id) { // the table entry point 
to this record which means it is a dedup source so nothing to do p_stats->skipped_source_record++; - ldpp_dout(dpp, 20) << __func__ << "::skipped source-record" << dendl; + ldpp_dout(dpp, 20) << __func__ << "::(2)skipped source-record, block_id=" + << block_id << "::rec_id=" << (int)rec_id << dendl; return 0; } - // ceph store full blocks so need to round up and multiply by block_size - uint64_t ondisk_byte_size = disk_blocks_to_byte_size(size_4k_units); - uint64_t dedupable_objects_bytes = calc_deduped_bytes(d_head_object_size, - p_tgt_rec->s.num_parts, - ondisk_byte_size); + // should never happen if (p_tgt_rec->s.flags.has_shared_manifest()) { // record holds a shared_manifest object so can't be a dedup target - p_stats->skipped_shared_manifest++; - p_stats->shared_manifest_dedup_bytes += dedupable_objects_bytes; - ldpp_dout(dpp, 20) << __func__ << "::skipped shared_manifest" << dendl; + ldpp_dout(dpp, 1) << __func__ << "::(3)skipped shared_manifest, block_id=" + << block_id << "::rec_id=" << (int)rec_id << dendl; + p_stats->shared_manifest_after_purge++; return 0; } + // ceph store full blocks so need to round up and multiply by block_size + uint64_t ondisk_byte_size = disk_blocks_to_byte_size(size_4k_units); + uint64_t dedupable_objects_bytes = __calc_deduped_bytes(p_tgt_rec->s.num_parts, + ondisk_byte_size); + // This records is a dedup target with source record on source_block_id - disk_record_t src_rec; - ret = load_record(d_dedup_cluster_ioctx, p_tgt_rec, &src_rec, src_block_id, + disk_record_t src_rec, *p_src_rec = &src_rec; + ret = load_record(d_dedup_cluster_ioctx, p_tgt_rec, p_src_rec, src_block_id, src_rec_id, md5_shard, dpp); if (unlikely(ret != 0)) { p_stats->failed_src_load++; // we can withstand most errors moving to the next object ldpp_dout(dpp, 5) << __func__ << "::ERR: Failed load_record(" - << src_block_id << ", " << src_rec_id << ")" << dendl; + << src_block_id << ", " << (int)src_rec_id << ")" << dendl; return 0; } - ldpp_dout(dpp, 20) << 
__func__ << "::SRC=" << src_rec.bucket_name - << "/" << src_rec.obj_name << dendl; + ldpp_dout(dpp, 20) << __func__ << "::SRC:" << p_src_rec->bucket_name << "/" + << p_src_rec->obj_name << "::TGT:" << p_tgt_rec->bucket_name + << "/" << p_tgt_rec->obj_name << dendl; // verify that SRC and TGT records don't refer to the same physical object // This could happen in theory if we read the same objects twice - if (src_rec.ref_tag == p_tgt_rec->ref_tag) { + if (p_src_rec->ref_tag == p_tgt_rec->ref_tag) { p_stats->duplicate_records++; ldpp_dout(dpp, 10) << __func__ << "::WARN::REF_TAG::Duplicate records for " - << src_rec.obj_name << "::" << src_rec.ref_tag << "::" + << p_src_rec->obj_name << "::" << p_src_rec->ref_tag <<"::" << p_tgt_rec->obj_name << dendl; return 0; } // the hash table size is rounded to the nearest 4KB and will wrap after 16G - if (unlikely(src_rec.s.obj_bytes_size != p_tgt_rec->s.obj_bytes_size)) { + if (unlikely(p_src_rec->s.obj_bytes_size != p_tgt_rec->s.obj_bytes_size)) { p_stats->size_mismatch++; ldpp_dout(dpp, 10) << __func__ << "::WARN: different byte size for objects::" - << src_rec.obj_name << "::" << src_rec.s.obj_bytes_size + << p_src_rec->obj_name << "::" << p_src_rec->s.obj_bytes_size << "::" << p_tgt_rec->obj_name << "::" << p_tgt_rec->s.obj_bytes_size << dendl; return 0; } - if (memcmp(src_rec.s.hash, p_tgt_rec->s.hash, sizeof(src_rec.s.hash)) != 0) { - p_stats->hash_mismatch++; - ldpp_dout(dpp, 10) << __func__ << "::HASH mismatch" << dendl; - // TBD: set hash attributes on head objects to save calc next time - if (src_rec.s.flags.hash_calculated()) { - write_blake3_object_attribute(dpp, driver, store, &src_rec); - p_stats->set_hash_attrs++; + ret = parse_manifests(dpp, p_src_rec, p_tgt_rec, &src_manifest, &tgt_manifest); + if (unlikely(ret != 0)) { + return 0; + } + + // make sure objects were not created by server-side-copy + if (unlikely(has_shared_tail_objects(dpp, rados, p_src_rec, p_tgt_rec, src_manifest, tgt_manifest, p_stats))) { 
+ return 0; + } + + + std::string tail_oid; + bool success = check_and_set_strong_hash(p_src_rec, p_tgt_rec, src_manifest, + tgt_manifest, &src_val, tail_oid, p_stats); + if (unlikely(!success)) { + if (p_src_rec->s.flags.hash_calculated() && !src_val.has_valid_hash()) { + // set hash attributes on head objects to save calc next time + ldpp_dout(dpp, 20) << __func__ <<"::failed: store valid SRC hash" << dendl; + ret = write_hash_object_attribute(dpp, driver, store, p_src_rec, p_stats); + if (ret == 0) { + ldpp_dout(dpp, 20) << __func__ <<"::mark valid_hash in table" << dendl; + p_table->set_src_mode(&key, src_block_id, src_rec_id, false, true); + } } if (p_tgt_rec->s.flags.hash_calculated()) { - write_blake3_object_attribute(dpp, driver, store, p_tgt_rec); - p_stats->set_hash_attrs++; + ldpp_dout(dpp, 20) << __func__ <<"::failed: store valid TGT hash" << dendl; + write_hash_object_attribute(dpp, driver, store, p_tgt_rec, p_stats); } return 0; } - ret = dedup_object(&src_rec, p_tgt_rec, p_stats, src_val.has_shared_manifest()); + ret = dedup_object(p_src_rec, p_tgt_rec, src_manifest, tgt_manifest, p_stats, + &src_val, tail_oid); if (ret == 0) { + ldpp_dout(dpp, 20) << __func__ << "::dedup success " << p_src_rec->obj_name << dendl; p_stats->deduped_objects++; p_stats->deduped_objects_bytes += dedupable_objects_bytes; - if (p_tgt_rec->s.num_parts == 0) { + if (p_tgt_rec->s.flags.is_split_head()) { + ldpp_dout(dpp, 20) << __func__ <<"::TGT-Split: dedup_bytes=" + << ondisk_byte_size << dendl; + p_stats->split_head_dedup_bytes += ondisk_byte_size; + } + else if (p_tgt_rec->s.num_parts == 0 && + // if we don't split head it will be duplicated + p_tgt_rec->s.obj_bytes_size > d_head_object_size) { // single part objects duplicate the head object when dedup is used p_stats->dup_head_bytes += d_head_object_size; } // mark the SRC object as a providor of a shared manifest if (!src_val.has_shared_manifest()) { + ldpp_dout(dpp, 20) << __func__ << "::mark 
shared_manifest+valid_hash"<< dendl; p_stats->set_shared_manifest_src++; - // set the shared manifest flag in the dedup table - p_table->set_shared_manifest_src_mode(&key, src_block_id, src_rec_id); + // We always set strong hash on SRC during dedup so mark in table! + p_table->set_src_mode(&key, src_block_id, src_rec_id, true, true); } else { ldpp_dout(dpp, 20) << __func__ << "::SRC object already marked as shared_manifest" << dendl; @@ -1362,7 +2098,7 @@ namespace rgw::dedup { } else { ldpp_dout(dpp, 10) << __func__ << "::ERR: Failed dedup for " - << src_rec.bucket_name << "/" << src_rec.obj_name << dendl; + << p_src_rec->bucket_name << "/" << p_src_rec->obj_name << dendl; p_stats->failed_dedup++; } @@ -1498,10 +2234,12 @@ namespace rgw::dedup { } has_more = (p_header->offset == BLOCK_MAGIC); - ceph_assert(p_header->offset == BLOCK_MAGIC || p_header->offset == LAST_BLOCK_MAGIC); if (!has_more) { ldpp_dout(dpp, 20) << __func__ << "::No more blocks! block_id=" << disk_block_id << ", rec_count=" << p_header->rec_count << dendl; + if (unlikely(p_header->offset != LAST_BLOCK_MAGIC)) { + p_stats->missing_last_block_marker++; + } break; } } @@ -1549,7 +2287,7 @@ namespace rgw::dedup { p_worker_stats->ingress_obj_bytes += ondisk_byte_size; // We limit dedup to objects from the same storage_class - // TBD: + // TBD-Future: // Should we use a skip-list of storage_classes we should skip (like glacier) ? const std::string& storage_class = rgw_placement_rule::get_canonical_storage_class(entry.meta.storage_class); @@ -1564,7 +2302,7 @@ namespace rgw::dedup { p_worker_stats->non_default_storage_class_objs_bytes += ondisk_byte_size; } - if (ondisk_byte_size <= d_min_obj_size_for_dedup) { + if (ondisk_byte_size < d_min_obj_size_for_dedup) { if (parsed_etag.num_parts == 0) { // dedup only useful for objects bigger than 4MB p_worker_stats->ingress_skip_too_small++; @@ -1802,7 +2540,7 @@ namespace rgw::dedup { // make sure that the standard storage_class is always in the mapper! 
storage_class_idx_t sc_idx = remapper.remap(RGW_STORAGE_CLASS_STANDARD, dpp, &p_stats->failed_map_overflow); - ceph_assert(sc_idx == 0); + ceph_assert(sc_idx != remapper_t::NULL_IDX); uint32_t slab_count_arr[num_work_shards]; // first load all etags to hashtable to find dedups // the entries come from bucket-index and got minimal info (etag, size) @@ -2095,7 +2833,8 @@ namespace rgw::dedup { utime_t start_time = ceph_clock_now(); md5_stats_t md5_stats; //DEDUP_DYN_ALLOC - dedup_table_t table(dpp, d_head_object_size, raw_mem, raw_mem_size); + dedup_table_t table(dpp, d_head_object_size, d_min_obj_size_for_dedup, + d_max_obj_size_for_split, raw_mem, raw_mem_size); int ret = objects_dedup_single_md5_shard(&table, md5_shard, &md5_stats, num_work_shards); if (ret == 0) { md5_stats.duration = ceph_clock_now() - start_time; @@ -2290,6 +3029,7 @@ namespace rgw::dedup { ldpp_dout(dpp, 10) <<__func__ << "::" << *p_epoch << dendl; d_ctl.dedup_type = p_epoch->dedup_type; + // TBD: replace with a stat-counter #ifdef FULL_DEDUP_SUPPORT ceph_assert(d_ctl.dedup_type == dedup_req_type_t::DEDUP_TYPE_EXEC || d_ctl.dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE); @@ -2755,7 +3495,6 @@ namespace rgw::dedup { } d_cond.wait(cond_lock, [this]{return d_ctl.remote_restart_req || d_ctl.should_stop() || d_ctl.should_pause();}); if (!d_ctl.should_stop() && !d_ctl.should_pause()) { - // TBD: should we release lock here ??? 
if (d_cluster.can_start_new_scan(store)) { d_ctl.dedup_exec = true; d_ctl.remote_aborted = false; diff --git a/src/rgw/driver/rados/rgw_dedup.h b/src/rgw/driver/rados/rgw_dedup.h index b1df56249e8..adca55efebc 100644 --- a/src/rgw/driver/rados/rgw_dedup.h +++ b/src/rgw/driver/rados/rgw_dedup.h @@ -97,6 +97,8 @@ namespace rgw::dedup { STEP_REMOVE_DUPLICATES }; + inline uint64_t __calc_deduped_bytes(uint16_t num_parts, uint64_t size_bytes); + inline bool should_split_head(uint64_t head_size, uint64_t obj_size); void run(); int setup(struct dedup_epoch_t*); void work_shards_barrier(work_shard_t num_work_shards); @@ -182,11 +184,18 @@ namespace rgw::dedup { remapper_t *remapper); #ifdef FULL_DEDUP_SUPPORT - int calc_object_blake3(const disk_record_t *p_rec, uint8_t *p_hash); - int add_obj_attrs_to_record(rgw_bucket *p_rb, - disk_record_t *p_rec, + int calc_object_blake3(const RGWObjManifest &manifest, + disk_record_t *p_rec, + uint8_t *p_hash, + blake3_hasher *p_pre_calc_hmac = nullptr); + int split_head_object(disk_record_t *p_src_rec, // IN/OUT PARAM + RGWObjManifest &src_manifest, // IN/OUT PARAM + const disk_record_t *p_tgt_rec, + std::string &tail_oid, // OUT PARAM + md5_stats_t *p_stats); + + int add_obj_attrs_to_record(disk_record_t *p_rec, const rgw::sal::Attrs &attrs, - dedup_table_t *p_table, md5_stats_t *p_stats); /* IN-OUT */ int read_object_attribute(dedup_table_t *p_table, @@ -197,26 +206,36 @@ namespace rgw::dedup { md5_stats_t *p_stats /* IN-OUT */, disk_block_seq_t *p_disk, remapper_t *remapper); - int try_deduping_record(dedup_table_t *p_table, - const disk_record_t *p_rec, - disk_block_id_t block_id, - record_id_t rec_id, - md5_shard_t md5_shard, - md5_stats_t *p_stats, /* IN-OUT */ - remapper_t *remapper); - int inc_ref_count_by_manifest(const std::string &ref_tag, - const std::string &oid, - RGWObjManifest &manifest); - int rollback_ref_by_manifest(const std::string &ref_tag, - const std::string &oid, - RGWObjManifest &tgt_manifest); - int 
free_tail_objs_by_manifest(const std::string &ref_tag, - const std::string &oid, - RGWObjManifest &tgt_manifest); - int dedup_object(const disk_record_t *p_src_rec, - const disk_record_t *p_tgt_rec, - md5_stats_t *p_stats, - bool is_shared_manifest_src); + bool check_and_set_strong_hash(disk_record_t *p_src_rec, // IN/OUT PARAM + disk_record_t *p_tgt_rec, // IN/OUT PARAM + RGWObjManifest &src_manifest, + const RGWObjManifest &tgt_manifest, + const dedup_table_t::value_t *p_src_val, + std::string &tail_oid, // OUT PARAM + md5_stats_t *p_stats); + int try_deduping_record(dedup_table_t *p_table, + disk_record_t *p_rec, + disk_block_id_t block_id, + record_id_t rec_id, + md5_shard_t md5_shard, + md5_stats_t *p_stats, /* IN-OUT */ + remapper_t *remapper); + int inc_ref_count_by_manifest(const std::string &ref_tag, + const std::string &oid, + const RGWObjManifest &manifest); + int rollback_ref_by_manifest(const std::string &ref_tag, + const std::string &oid, + const RGWObjManifest &tgt_manifest); + int free_tail_objs_by_manifest(const std::string &ref_tag, + const std::string &oid, + const RGWObjManifest &tgt_manifest); + int dedup_object(disk_record_t *p_src_rec, + disk_record_t *p_tgt_rec, + const RGWObjManifest &src_manifest, + const RGWObjManifest &tgt_manifest, + md5_stats_t *p_stats, + const dedup_table_t::value_t *p_src_val, + const std::string &tail_oid); #endif int remove_slabs(unsigned worker_id, unsigned md5_shard, uint32_t slab_count); int init_rados_access_handles(bool init_pool); @@ -235,8 +254,9 @@ namespace rgw::dedup { unsigned d_heart_beat_max_elapsed_sec; uint64_t d_all_buckets_obj_count = 0; uint64_t d_all_buckets_obj_size = 0; - // we don't benefit from deduping RGW objects smaller than head-object size - uint32_t d_min_obj_size_for_dedup = (4ULL * 1024 * 1024); + + uint32_t d_min_obj_size_for_dedup = (64ULL * 1024); + uint32_t d_max_obj_size_for_split = (16ULL * 1024 * 1024); uint32_t d_head_object_size = (4ULL * 1024 * 1024); control_t d_ctl; 
uint64_t d_watch_handle = 0; diff --git a/src/rgw/driver/rados/rgw_dedup_cluster.cc b/src/rgw/driver/rados/rgw_dedup_cluster.cc index ebbbec74180..fafd66176ef 100644 --- a/src/rgw/driver/rados/rgw_dedup_cluster.cc +++ b/src/rgw/driver/rados/rgw_dedup_cluster.cc @@ -124,7 +124,7 @@ namespace rgw::dedup { ldpp_dout(dpp, 10) << __func__ << "::oid=" << oid << dendl; bool exclusive = true; // block overwrite of old objects ret = ctl_ioctx.create(oid, exclusive); - if (ret >= 0) { + if (ret == 0) { ldpp_dout(dpp, 10) << __func__ << "::successfully created Epoch object!" << dendl; // now try and take ownership } @@ -495,7 +495,7 @@ namespace rgw::dedup { ldpp_dout(dpp, 15) << __func__ << "::creating object: " << oid << dendl; bool exclusive = true; ret = ctl_ioctx.create(oid, exclusive); - if (ret >= 0) { + if (ret == 0) { ldpp_dout(dpp, 15) << __func__ << "::oid=" << oid << " was created!" << dendl; } else if (ret == -EEXIST) { @@ -1124,7 +1124,7 @@ namespace rgw::dedup { // create the object to watch (object may already exist) bool exclusive = true; ret = ctl_ioctx.create(oid, exclusive); - if (ret >= 0) { + if (ret == 0) { ldpp_dout(dpp, 10) << "dedup_bg::watch_reload():" << oid << " was created!" 
<< dendl; } diff --git a/src/rgw/driver/rados/rgw_dedup_store.cc b/src/rgw/driver/rados/rgw_dedup_store.cc index d2b62651c6c..83fdfe19931 100644 --- a/src/rgw/driver/rados/rgw_dedup_store.cc +++ b/src/rgw/driver/rados/rgw_dedup_store.cc @@ -123,9 +123,7 @@ namespace rgw::dedup { else { this->s.shared_manifest = CEPHTOH_64(p_rec->s.shared_manifest); // BLAKE3 hash has 256 bit splitted into multiple 64bit units - const unsigned units = (256 / (sizeof(uint64_t)*8)); - static_assert(units == 4); - for (unsigned i = 0; i < units; i++) { + for (unsigned i = 0; i < HASH_UNITS; i++) { this->s.hash[i] = CEPHTOH_64(p_rec->s.hash[i]); } this->ref_tag = std::string(p, this->s.ref_tag_len); @@ -189,9 +187,7 @@ namespace rgw::dedup { else { p_rec->s.shared_manifest = HTOCEPH_64(this->s.shared_manifest); // BLAKE3 hash has 256 bit splitted into multiple 64bit units - const unsigned units = (256 / (sizeof(uint64_t)*8)); - static_assert(units == 4); - for (unsigned i = 0; i < units; i++) { + for (unsigned i = 0; i < HASH_UNITS; i++) { p_rec->s.hash[i] = HTOCEPH_64(this->s.hash[i]); } len = this->ref_tag.length(); @@ -228,7 +224,7 @@ namespace rgw::dedup { { // optimistic approach if (likely((this->s.rec_version == 0) && (this->length() <= MAX_REC_SIZE))) { - ldpp_dout(dpp, 20) << __func__ << "::success" << dendl; + ldpp_dout(dpp, 20) << caller << "::validate disk_record success" << dendl; return 0; } @@ -270,14 +266,12 @@ namespace rgw::dedup { stream << "MD5 = " << std::hex << rec.s.md5_high << rec.s.md5_low << "\n"; stream << "HASH = "; // BLAKE3 hash has 256 bit splitted into multiple 64bit units - const unsigned units = (256 / (sizeof(uint64_t)*8)); - static_assert(units == 4); - for (unsigned i = 0; i < units; i++) { + for (unsigned i = 0; i < HASH_UNITS; i++) { stream << rec.s.hash[i]; } stream << "\n"; - if (rec.has_shared_manifest()) { + if (rec.s.flags.has_shared_manifest()) { stream << "Shared Manifest Object\n"; } else { @@ -603,19 +597,12 @@ namespace rgw::dedup { 
ceph_assert(bl.length()); int ret = ioctx.write_full(oid, bl); - if (ret == (int)bl.length()) { - ldpp_dout(dpp, 20) << __func__ << "::wrote " << bl.length() << " bytes to " - << oid << dendl; + if (ret == 0) { + ldpp_dout(dpp, 20) << __func__ << "::SLAB was written successfully" << dendl; } else { - if (ret == 0) { - // no error reported, but we wrote nothing which should never happen - ldpp_dout(dpp, 5) << __func__ << "::ERR: No Data was written to " << oid - << ", bl.length()=" << bl.length() << dendl; - ret = -ENODATA; - } ldpp_dout(dpp, 1) << "ERROR: failed to write " << oid - << " with: " << cpp_strerror(-ret) << dendl; + << "::ret=" << ret << "::" << cpp_strerror(-ret) << dendl; } return ret; diff --git a/src/rgw/driver/rados/rgw_dedup_store.h b/src/rgw/driver/rados/rgw_dedup_store.h index 7bca5d4e70e..010e54cd454 100644 --- a/src/rgw/driver/rados/rgw_dedup_store.h +++ b/src/rgw/driver/rados/rgw_dedup_store.h @@ -38,7 +38,8 @@ namespace rgw::dedup { #define HTOCEPH_32 htole32 #define HTOCEPH_64 htole64 - static inline constexpr unsigned DISK_BLOCK_SIZE = 8*1024; + static constexpr unsigned HASH_UNITS = BLAKE3_OUT_LEN/sizeof(uint64_t); + static constexpr unsigned DISK_BLOCK_SIZE = 8*1024; // we use 16 bit offset static_assert(DISK_BLOCK_SIZE < 64*1024); static constexpr unsigned DISK_BLOCK_COUNT = 256; @@ -132,6 +133,35 @@ namespace rgw::dedup { uint32_t block_id; }; + struct __attribute__ ((packed)) record_flags_t { + private: + static constexpr uint8_t RGW_RECORD_FLAG_HAS_VALID_HASH = 0x01; + static constexpr uint8_t RGW_RECORD_FLAG_SHARED_MANIFEST = 0x02; + static constexpr uint8_t RGW_RECORD_FLAG_HASH_CALCULATED = 0x04; + static constexpr uint8_t RGW_RECORD_FLAG_FASTLANE = 0x08; + static constexpr uint8_t RGW_RECORD_FLAG_SPLIT_HEAD = 0x10; + static constexpr uint8_t RGW_RECORD_FLAG_TAIL_REFTAG = 0x20; + public: + record_flags_t() : flags(0) {} + record_flags_t(uint8_t _flags) : flags(_flags) {} + inline void clear() { this->flags = 0; } + inline bool 
hash_calculated() const { return ((flags & RGW_RECORD_FLAG_HASH_CALCULATED) != 0); } + inline void set_hash_calculated() { flags |= RGW_RECORD_FLAG_HASH_CALCULATED; } + inline void clear_hash_calculated() { flags &= ~RGW_RECORD_FLAG_HASH_CALCULATED; } + inline bool has_valid_hash() const { return ((flags & RGW_RECORD_FLAG_HAS_VALID_HASH) != 0); } + inline void set_has_valid_hash() { flags |= RGW_RECORD_FLAG_HAS_VALID_HASH; } + inline bool has_shared_manifest() const { return ((flags & RGW_RECORD_FLAG_SHARED_MANIFEST) != 0); } + inline void set_shared_manifest() { flags |= RGW_RECORD_FLAG_SHARED_MANIFEST; } + inline bool is_fastlane() const { return ((flags & RGW_RECORD_FLAG_FASTLANE) != 0); } + inline void set_fastlane() { flags |= RGW_RECORD_FLAG_FASTLANE; } + inline bool is_split_head() const { return ((flags & RGW_RECORD_FLAG_SPLIT_HEAD) != 0); } + inline void set_split_head() { flags |= RGW_RECORD_FLAG_SPLIT_HEAD; } + inline bool is_ref_tag_from_tail() const { return ((flags & RGW_RECORD_FLAG_TAIL_REFTAG) != 0); } + inline void set_ref_tag_from_tail() { flags |= RGW_RECORD_FLAG_TAIL_REFTAG; } + private: + uint8_t flags; + }; + struct disk_record_t { disk_record_t(const char *buff); @@ -148,32 +178,29 @@ namespace rgw::dedup { const DoutPrefixProvider* dpp, disk_block_id_t block_id, record_id_t rec_id) const; - inline bool has_shared_manifest() const { return s.flags.has_shared_manifest(); } - inline void set_shared_manifest() { s.flags.set_shared_manifest(); } - - struct __attribute__ ((packed)) packed_rec_t + inline bool multipart_object() { return (this->s.num_parts > 0); } + struct packed_rec_t { - uint8_t rec_version; // allows changing record format - dedup_flags_t flags; // 1 Byte flags - uint16_t num_parts; // For multipart upload (AWS MAX-PART is 10,000) - uint16_t obj_name_len; - uint16_t bucket_name_len; - + uint64_t hash[4]; // 4 * 8 Bytes of HASH + uint64_t shared_manifest; // 64bit hash of the SRC object manifest uint64_t md5_high; // High Bytes of 
the Object Data MD5 uint64_t md5_low; // Low Bytes of the Object Data MD5 uint64_t obj_bytes_size; + uint16_t num_parts; // For multipart upload (AWS MAX-PART is 10,000) + uint16_t obj_name_len; + uint16_t bucket_name_len; uint16_t bucket_id_len; + uint16_t tenant_name_len; uint16_t instance_len; uint16_t stor_class_len; uint16_t ref_tag_len; - uint16_t manifest_len; - uint8_t pad[6]; - uint64_t shared_manifest; // 64bit hash of the SRC object manifest - uint64_t hash[4]; // 4 * 8 Bytes of BLAKE3 + uint8_t rec_version; // allows changing record format + record_flags_t flags; // 1 Byte flags + uint8_t pad[6]; }s; std::string obj_name; // TBD: find pool name making it easier to get ioctx @@ -186,6 +213,7 @@ namespace rgw::dedup { bufferlist manifest_bl; }; static_assert(BLAKE3_OUT_LEN == sizeof(disk_record_t::packed_rec_t::hash)); + static_assert(sizeof(disk_record_t::packed_rec_t) == sizeof(uint64_t)*12); std::ostream &operator<<(std::ostream &stream, const disk_record_t & rec); static constexpr unsigned BLOCK_MAGIC = 0xFACE; diff --git a/src/rgw/driver/rados/rgw_dedup_table.cc b/src/rgw/driver/rados/rgw_dedup_table.cc index 4f34b27d18e..d86896473a1 100644 --- a/src/rgw/driver/rados/rgw_dedup_table.cc +++ b/src/rgw/driver/rados/rgw_dedup_table.cc @@ -22,11 +22,15 @@ namespace rgw::dedup { //--------------------------------------------------------------------------- dedup_table_t::dedup_table_t(const DoutPrefixProvider* _dpp, uint32_t _head_object_size, + uint32_t _min_obj_size_for_dedup, + uint32_t _max_obj_size_for_split, uint8_t *p_slab, uint64_t slab_size) { dpp = _dpp; head_object_size = _head_object_size; + min_obj_size_for_dedup = _min_obj_size_for_dedup; + max_obj_size_for_split = _max_obj_size_for_split; memset(p_slab, 0, slab_size); hash_tab = (table_entry_t*)p_slab; entries_count = slab_size/sizeof(table_entry_t); @@ -51,7 +55,7 @@ namespace rgw::dedup { const key_t &key = hash_tab[tab_idx].key; // This is an approximation only since size is stored in 4KB 
resolution uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units); - if (!key.multipart_object() && (byte_size_approx <= head_object_size)) { + if (!dedupable_object(key.multipart_object(), min_obj_size_for_dedup, byte_size_approx)) { hash_tab[tab_idx].val.clear_flags(); redistributed_clear++; continue; @@ -126,12 +130,16 @@ namespace rgw::dedup { } else { uint64_t dup_bytes_approx = calc_deduped_bytes(head_object_size, + min_obj_size_for_dedup, + max_obj_size_for_split, p_key->num_parts, byte_size_approx); p_big_objs->duplicate_count ++; p_big_objs->dedup_bytes_estimate += dup_bytes_approx; - if (!p_key->multipart_object()) { + // object smaller than max_obj_size_for_split will split their head + // and won't dup it + if (!key.multipart_object() && byte_size_approx > max_obj_size_for_split) { // single part objects duplicate the head object when dedup is used *p_duplicate_head_bytes += head_object_size; } @@ -206,23 +214,31 @@ namespace rgw::dedup { // replace value! value_t new_val(block_id, rec_id, shared_manifest); new_val.count = val.count; - hash_tab[idx].val = new_val; ldpp_dout(dpp, 20) << __func__ << "::Replaced table entry::[" << val.block_idx << "/" << (int)val.rec_id << "] -> [" << block_id << "/" << (int)rec_id << "]" << dendl; + + val = new_val; } } //--------------------------------------------------------------------------- - int dedup_table_t::set_shared_manifest_src_mode(const key_t *p_key, - disk_block_id_t block_id, - record_id_t rec_id) + int dedup_table_t::set_src_mode(const key_t *p_key, + disk_block_id_t block_id, + record_id_t rec_id, + bool set_shared_manifest_src, + bool set_has_valid_hash_src) { uint32_t idx = find_entry(p_key); value_t &val = hash_tab[idx].val; if (val.is_occupied()) { if (val.block_idx == block_id && val.rec_id == rec_id) { - val.set_shared_manifest_src(); + if (set_shared_manifest_src) { + val.set_shared_manifest_src(); + } + if (set_has_valid_hash_src) { + val.set_has_valid_hash_src(); + } return 0; 
} } @@ -281,7 +297,7 @@ namespace rgw::dedup { uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units); // skip small single part objects which we can't dedup - if (!key.multipart_object() && (byte_size_approx <= head_object_size)) { + if (!dedupable_object(key.multipart_object(), min_obj_size_for_dedup, byte_size_approx)) { if (hash_tab[tab_idx].val.is_singleton()) { p_small_objs->singleton_count++; } diff --git a/src/rgw/driver/rados/rgw_dedup_table.h b/src/rgw/driver/rados/rgw_dedup_table.h index 4a46db6e5b7..501cb20d9aa 100644 --- a/src/rgw/driver/rados/rgw_dedup_table.h +++ b/src/rgw/driver/rados/rgw_dedup_table.h @@ -63,6 +63,26 @@ namespace rgw::dedup { static_assert(sizeof(key_t) == 24); class dedup_table_t { + struct __attribute__ ((packed)) table_flags_t { + private: + static constexpr uint8_t RGW_TABLE_FLAG_HAS_VALID_HASH = 0x01; + static constexpr uint8_t RGW_TABLE_FLAG_SHARED_MANIFEST = 0x02; + static constexpr uint8_t RGW_TABLE_FLAG_OCCUPIED = 0x04; + public: + table_flags_t() : flags(0) {} + table_flags_t(uint8_t _flags) : flags(_flags) {} + inline void clear() { this->flags = 0; } + inline bool has_valid_hash() const { return ((flags & RGW_TABLE_FLAG_HAS_VALID_HASH) != 0); } + inline void set_has_valid_hash() { flags |= RGW_TABLE_FLAG_HAS_VALID_HASH; } + inline bool has_shared_manifest() const { return ((flags & RGW_TABLE_FLAG_SHARED_MANIFEST) != 0); } + inline void set_shared_manifest() { flags |= RGW_TABLE_FLAG_SHARED_MANIFEST; } + inline bool is_occupied() const {return ((this->flags & RGW_TABLE_FLAG_OCCUPIED) != 0); } + inline void set_occupied() {this->flags |= RGW_TABLE_FLAG_OCCUPIED; } + inline void clear_occupied() { this->flags &= ~RGW_TABLE_FLAG_OCCUPIED; } + private: + uint8_t flags; + }; + public: // 8 Bytes Value struct value_t { @@ -93,6 +113,8 @@ namespace rgw::dedup { inline void inc_count() { count ++; } inline void reset_count() { count = 0; } inline void clear_flags() { flags.clear(); } + inline bool 
has_valid_hash() const {return flags.has_valid_hash(); } + inline void set_has_valid_hash_src() { this->flags.set_has_valid_hash(); } inline bool is_singleton() const { return (count == 1); } inline bool is_occupied() const { return flags.is_occupied(); } inline void set_occupied() { this->flags.set_occupied(); } @@ -102,12 +124,14 @@ namespace rgw::dedup { disk_block_id_t block_idx; // 32 bits uint16_t count; // 16 bits record_id_t rec_id; // 8 bits - dedup_flags_t flags; // 8 bits + table_flags_t flags; // 8 bits } __attribute__((__packed__)); static_assert(sizeof(value_t) == 8); dedup_table_t(const DoutPrefixProvider* _dpp, uint32_t _head_object_size, + uint32_t _min_obj_size_for_dedup, + uint32_t _max_obj_size_for_split, uint8_t *p_slab, uint64_t slab_size); int add_entry(key_t *p_key, @@ -129,6 +153,12 @@ namespace rgw::dedup { disk_block_id_t block_id, record_id_t rec_id); + int set_src_mode(const key_t *p_key, + disk_block_id_t block_id, + record_id_t rec_id, + bool set_shared_manifest_src, + bool set_has_valid_hash_src); + void count_duplicates(dedup_stats_t *p_small_objs_stat, dedup_stats_t *p_big_objs_stat); @@ -145,7 +175,9 @@ namespace rgw::dedup { uint32_t values_count = 0; uint32_t entries_count = 0; uint32_t occupied_count = 0; - uint32_t head_object_size = (4ULL * 1024 * 1024); + uint32_t head_object_size; + uint32_t min_obj_size_for_dedup; + uint32_t max_obj_size_for_split; table_entry_t *hash_tab = nullptr; // stat counters diff --git a/src/rgw/driver/rados/rgw_dedup_utils.cc b/src/rgw/driver/rados/rgw_dedup_utils.cc index 61ad6b91c51..74252a85395 100644 --- a/src/rgw/driver/rados/rgw_dedup_utils.cc +++ b/src/rgw/driver/rados/rgw_dedup_utils.cc @@ -14,8 +14,8 @@ #include "rgw_dedup_utils.h" #include "common/ceph_crypto.h" - namespace rgw::dedup { + //--------------------------------------------------------------------------- std::ostream& operator<<(std::ostream &out, const dedup_req_type_t& dedup_type) { @@ -566,10 +566,27 @@ namespace rgw::dedup 
{ this->failed_rec_load += other.failed_rec_load; this->failed_block_load += other.failed_block_load; + this->different_storage_class += other.different_storage_class; + this->invalid_hash_no_split_head += other.invalid_hash_no_split_head; + this->invalid_storage_class_mapping += other.invalid_storage_class_mapping; + this->singleton_after_purge += other.singleton_after_purge; + this->shared_manifest_after_purge += other.shared_manifest_after_purge; + this->split_head_no_tail_placement += other.split_head_no_tail_placement; + this->illegal_rec_id += other.illegal_rec_id; + this->missing_last_block_marker += other.missing_last_block_marker; + this->valid_hash_attrs += other.valid_hash_attrs; this->invalid_hash_attrs += other.invalid_hash_attrs; this->set_hash_attrs += other.set_hash_attrs; this->skip_hash_cmp += other.skip_hash_cmp; + this->manifest_raw_obj += other.manifest_raw_obj; + this->manifest_no_tail_placement += other.manifest_no_tail_placement; + this->rollback_tail_obj += other.rollback_tail_obj; + this->failed_split_head_creat += other.failed_split_head_creat; + this->skip_shared_tail_objs += other.skip_shared_tail_objs; + this->split_head_src += other.split_head_src; + this->split_head_tgt += other.split_head_tgt; + this->split_head_dedup_bytes += other.split_head_dedup_bytes; this->set_shared_manifest_src += other.set_shared_manifest_src; this->loaded_objects += other.loaded_objects; @@ -659,9 +676,30 @@ namespace rgw::dedup { f->dump_unsigned("Set HASH", this->set_hash_attrs); } + if (this->skip_shared_tail_objs) { + f->dump_unsigned("Skip Shared Tail Objs (server-side-copy)", this->skip_shared_tail_objs); + } if (this->skip_hash_cmp) { f->dump_unsigned("Can't run HASH compare", this->skip_hash_cmp); } + if (this->manifest_raw_obj) { + f->dump_unsigned("Manifest has RAW OBJ", this->manifest_raw_obj); + } + if (this->manifest_no_tail_placement) { + f->dump_unsigned("Manifest has no tail placement", this->manifest_no_tail_placement); + } + if 
(this->rollback_tail_obj) { + f->dump_unsigned("Rollback tail obj", this->rollback_tail_obj); + } + if (this->split_head_src) { + f->dump_unsigned("Split-Head Src OBJ", this->split_head_src); + } + if (this->split_head_tgt) { + f->dump_unsigned("Split-Head Tgt OBJ", this->split_head_tgt); + } + if (this->split_head_dedup_bytes) { + f->dump_unsigned("Split-Head Dedup-Bytes", this->split_head_dedup_bytes); + } } { @@ -716,6 +754,18 @@ namespace rgw::dedup { if (this->failed_block_load) { f->dump_unsigned("Failed Block-Load ", this->failed_block_load); } + + if (this->illegal_rec_id) { + f->dump_unsigned("Failed illegal_rec_id", this->illegal_rec_id ); + } + if (this->missing_last_block_marker) { + f->dump_unsigned("Failed missing_last_block_marker in rec", + this->missing_last_block_marker); + } + + if (this->failed_split_head_creat) { + f->dump_unsigned("Failed Split-Head Create (EEXIST)", this->failed_split_head_creat); + } if (this->failed_dedup) { f->dump_unsigned("Failed Dedup", this->failed_dedup); } @@ -732,6 +782,30 @@ namespace rgw::dedup { if (this->size_mismatch) { f->dump_unsigned("Size mismatch SRC/TGT", this->size_mismatch); } + if (this->different_storage_class) { + f->dump_unsigned("different_storage_class", + this->different_storage_class); + } + if (this->invalid_hash_no_split_head) { + f->dump_unsigned("Failed rec has invalid hash w/o split-head ", + this->invalid_hash_no_split_head); + } + if (this->invalid_storage_class_mapping) { + f->dump_unsigned("Failed, invalid_storage_class_mapping", + this->invalid_storage_class_mapping); + } + if (this->singleton_after_purge) { + f->dump_unsigned("Failed, has singleton after purge", + this->singleton_after_purge); + } + if (this->shared_manifest_after_purge) { + f->dump_unsigned("Failed, has shared manifest after purge", + this->shared_manifest_after_purge); + } + if (this->split_head_no_tail_placement) { + f->dump_unsigned("No Tail Placement during Split-Head processing", + 
this->split_head_no_tail_placement); + } } } @@ -768,10 +842,27 @@ namespace rgw::dedup { encode(m.failed_rec_load, bl); encode(m.failed_block_load, bl); + encode(m.different_storage_class, bl); + encode(m.invalid_hash_no_split_head, bl); + encode(m.invalid_storage_class_mapping, bl); + encode(m.singleton_after_purge, bl); + encode(m.shared_manifest_after_purge, bl); + encode(m.split_head_no_tail_placement, bl); + encode(m.illegal_rec_id, bl); + encode(m.missing_last_block_marker, bl); + encode(m.valid_hash_attrs, bl); encode(m.invalid_hash_attrs, bl); encode(m.set_hash_attrs, bl); encode(m.skip_hash_cmp, bl); + encode(m.manifest_raw_obj, bl); + encode(m.manifest_no_tail_placement, bl); + encode(m.rollback_tail_obj, bl); + encode(m.failed_split_head_creat, bl); + encode(m.skip_shared_tail_objs, bl); + encode(m.split_head_src, bl); + encode(m.split_head_tgt, bl); + encode(m.split_head_dedup_bytes, bl); encode(m.set_shared_manifest_src, bl); encode(m.loaded_objects, bl); @@ -822,10 +913,27 @@ namespace rgw::dedup { decode(m.failed_rec_load, bl); decode(m.failed_block_load, bl); + decode(m.different_storage_class, bl); + decode(m.invalid_hash_no_split_head, bl); + decode(m.invalid_storage_class_mapping, bl); + decode(m.singleton_after_purge, bl); + decode(m.shared_manifest_after_purge, bl); + decode(m.split_head_no_tail_placement, bl); + decode(m.illegal_rec_id, bl); + decode(m.missing_last_block_marker, bl); + decode(m.valid_hash_attrs, bl); decode(m.invalid_hash_attrs, bl); decode(m.set_hash_attrs, bl); decode(m.skip_hash_cmp, bl); + decode(m.manifest_raw_obj, bl); + decode(m.manifest_no_tail_placement, bl); + decode(m.rollback_tail_obj, bl); + decode(m.failed_split_head_creat, bl); + decode(m.skip_shared_tail_objs, bl); + decode(m.split_head_src, bl); + decode(m.split_head_tgt, bl); + decode(m.split_head_dedup_bytes, bl); decode(m.set_shared_manifest_src, bl); decode(m.loaded_objects, bl); diff --git a/src/rgw/driver/rados/rgw_dedup_utils.h 
b/src/rgw/driver/rados/rgw_dedup_utils.h index abe62432122..579e048a259 100644 --- a/src/rgw/driver/rados/rgw_dedup_utils.h +++ b/src/rgw/driver/rados/rgw_dedup_utils.h @@ -25,6 +25,7 @@ #include "common/dout.h" #define FULL_DEDUP_SUPPORT + namespace rgw::dedup { using namespace std::chrono; using work_shard_t = uint16_t; @@ -68,29 +69,6 @@ namespace rgw::dedup { }; std::ostream& operator<<(std::ostream &out, const dedup_req_type_t& dedup_type); - struct __attribute__ ((packed)) dedup_flags_t { - private: - static constexpr uint8_t RGW_DEDUP_FLAG_HASH_CALCULATED = 0x01; // REC - static constexpr uint8_t RGW_DEDUP_FLAG_SHARED_MANIFEST = 0x02; // REC + TAB - static constexpr uint8_t RGW_DEDUP_FLAG_OCCUPIED = 0x04; // TAB - static constexpr uint8_t RGW_DEDUP_FLAG_FASTLANE = 0x08; // REC - - public: - dedup_flags_t() : flags(0) {} - dedup_flags_t(uint8_t _flags) : flags(_flags) {} - inline void clear() { this->flags = 0; } - inline bool hash_calculated() const { return ((flags & RGW_DEDUP_FLAG_HASH_CALCULATED) != 0); } - inline void set_hash_calculated() { flags |= RGW_DEDUP_FLAG_HASH_CALCULATED; } - inline bool has_shared_manifest() const { return ((flags & RGW_DEDUP_FLAG_SHARED_MANIFEST) != 0); } - inline void set_shared_manifest() { flags |= RGW_DEDUP_FLAG_SHARED_MANIFEST; } - inline bool is_occupied() const {return ((this->flags & RGW_DEDUP_FLAG_OCCUPIED) != 0); } - inline void set_occupied() {this->flags |= RGW_DEDUP_FLAG_OCCUPIED; } - inline void clear_occupied() { this->flags &= ~RGW_DEDUP_FLAG_OCCUPIED; } - inline bool is_fastlane() const { return ((flags & RGW_DEDUP_FLAG_FASTLANE) != 0); } - inline void set_fastlane() { flags |= RGW_DEDUP_FLAG_FASTLANE; } - private: - uint8_t flags; - }; class alignas(8) Throttle { friend void validate_max_calls_offset(); @@ -262,11 +240,27 @@ namespace rgw::dedup { uint64_t failed_rec_load = 0; uint64_t failed_block_load = 0; + uint64_t different_storage_class = 0; + uint64_t invalid_hash_no_split_head = 0; + uint64_t 
invalid_storage_class_mapping = 0; + uint64_t singleton_after_purge = 0; + uint64_t shared_manifest_after_purge = 0; + uint64_t split_head_no_tail_placement = 0; + uint64_t illegal_rec_id = 0; + uint64_t missing_last_block_marker = 0; + uint64_t valid_hash_attrs = 0; uint64_t invalid_hash_attrs = 0; uint64_t set_hash_attrs = 0; + uint64_t skip_shared_tail_objs = 0; uint64_t skip_hash_cmp = 0; - + uint64_t manifest_raw_obj = 0; + uint64_t manifest_no_tail_placement = 0; + uint64_t rollback_tail_obj = 0; + uint64_t failed_split_head_creat = 0; + uint64_t split_head_src = 0; + uint64_t split_head_tgt = 0; + uint64_t split_head_dedup_bytes = 0; uint64_t set_shared_manifest_src = 0; uint64_t loaded_objects = 0; uint64_t processed_objects = 0; @@ -362,7 +356,19 @@ namespace rgw::dedup { const DoutPrefixProvider* dpp); //--------------------------------------------------------------------------- - static inline uint64_t calc_deduped_bytes(uint64_t head_obj_size, + static inline bool dedupable_object(bool multipart_object, + uint64_t min_obj_size_for_dedup, + uint64_t object_byte_size) + { + // all multipart objects are dedupable because the head-object is empty + // otherwise make sure object_byte_size is large enough + return (multipart_object || object_byte_size >= min_obj_size_for_dedup); + } + + //--------------------------------------------------------------------------- + static inline uint64_t calc_deduped_bytes(uint32_t head_obj_size, + uint32_t min_obj_size_for_dedup, + uint32_t max_obj_size_for_split, uint16_t num_parts, uint64_t size_bytes) { @@ -372,9 +378,13 @@ namespace rgw::dedup { } else { // reduce the head size - if (size_bytes > head_obj_size) { + if (size_bytes > max_obj_size_for_split) { return size_bytes - head_obj_size; } + else if (size_bytes >= min_obj_size_for_dedup) { + // Head is splitted into an empty obj and a new tail enabling a full dedup + return size_bytes; + } else { return 0; } diff --git a/src/rgw/driver/rados/rgw_obj_manifest.h 
b/src/rgw/driver/rados/rgw_obj_manifest.h index 1e679a38b8e..4129a015c98 100644 --- a/src/rgw/driver/rados/rgw_obj_manifest.h +++ b/src/rgw/driver/rados/rgw_obj_manifest.h @@ -256,6 +256,10 @@ public: void get_implicit_location(uint64_t cur_part_id, uint64_t cur_stripe, uint64_t ofs, std::string *override_prefix, rgw_obj_select *location) const; + void clear_rules() { + rules.clear(); + } + void set_trivial_rule(uint64_t tail_ofs, uint64_t stripe_max_size) { RGWObjManifestRule rule(0, tail_ofs, 0, stripe_max_size); rules[0] = rule; @@ -467,6 +471,10 @@ public: return max_head_size; } + void set_max_head_size(uint64_t _max_head_size) { + max_head_size = _max_head_size; + } + const std::string& get_tier_type() { return tier_type; } diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h index 87219399678..40ab41c400f 100644 --- a/src/rgw/rgw_common.h +++ b/src/rgw/rgw_common.h @@ -89,7 +89,6 @@ using ceph::crypto::MD5; #define RGW_ATTR_CORS RGW_ATTR_PREFIX "cors" #define RGW_ATTR_ETAG RGW_ATTR_PREFIX "etag" #define RGW_ATTR_CKSUM RGW_ATTR_PREFIX "cksum" -#define RGW_ATTR_SHA256 RGW_ATTR_PREFIX "x-amz-content-sha256" #define RGW_ATTR_BLAKE3 RGW_ATTR_PREFIX "blake3" #define RGW_ATTR_BUCKETS RGW_ATTR_PREFIX "buckets" #define RGW_ATTR_META_PREFIX RGW_ATTR_PREFIX RGW_AMZ_META_PREFIX diff --git a/src/test/rgw/dedup/test_dedup.py b/src/test/rgw/dedup/test_dedup.py index 3f3a3d606dd..f24857e3d99 100644 --- a/src/test/rgw/dedup/test_dedup.py +++ b/src/test/rgw/dedup/test_dedup.py @@ -262,6 +262,17 @@ def gen_connections_multi(num_tenants): log.debug("gen_connections_multi: All connection and buckets are set") return (tenants, bucket_names, conns) +#------------------------------------------------------------------------------- +def create_buckets(conn, max_copies_count): + bucket_names=[] + for i in range(0, max_copies_count): + bucket_name=gen_bucket_name() + bucket_names.append(bucket_name) + log.debug("conn.create_bucket(Bucket=%s)", bucket_name) + 
conn.create_bucket(Bucket=bucket_name) + + return bucket_names + ##################### # dedup tests @@ -270,8 +281,11 @@ OUT_DIR="/tmp/dedup/" KB=(1024) MB=(1024*KB) POTENTIAL_OBJ_SIZE=(64*KB) +DEDUP_MIN_OBJ_SIZE=(64*KB) +SPLIT_HEAD_SIZE=(16*MB) RADOS_OBJ_SIZE=(4*MB) -MULTIPART_SIZE=(16*MB) +# The default multipart threshold size for S3cmd is 15 MB. +MULTIPART_SIZE=(15*MB) default_config = TransferConfig(multipart_threshold=MULTIPART_SIZE, multipart_chunksize=MULTIPART_SIZE) ETAG_ATTR="user.rgw.etag" POOLNAME="default.rgw.buckets.data" @@ -385,26 +399,41 @@ def count_space_in_all_buckets(): #------------------------------------------------------------------------------- def count_objects_in_bucket(bucket_name, conn): max_keys=1000 - marker="" + continuation_token = None obj_count=0 while True: log.debug("bucket_name=%s", bucket_name) - listing=conn.list_objects(Bucket=bucket_name, Marker=marker, MaxKeys=max_keys) + list_args = { + 'Bucket': bucket_name, + 'MaxKeys': max_keys + } + if continuation_token: + list_args['ContinuationToken'] = continuation_token + + listing=conn.list_objects_v2(**list_args) if 'Contents' not in listing or len(listing['Contents'])== 0: return 0 obj_count += len(listing['Contents']) - if listing['IsTruncated']: - marker=listing['NextMarker'] - log.debug("marker=%s, obj_count=%d", marker, obj_count) - continue + if 'NextContinuationToken' in listing: + continuation_token = listing['NextContinuationToken'] + log.debug("count_objects_in_bucket: Token=%s, count=%d", + continuation_token, obj_count) else: return obj_count #------------------------------------------------------------------------------- -def count_object_parts_in_all_buckets(verbose=False): +def copy_obj(base_bucket_name, base_key, bucket_name, key): + s3_prefix="s3://" + src = s3_prefix + base_bucket_name + "/" + base_key + dest = s3_prefix + bucket_name + "/" + key + result = bash(['s3cmd', 'cp', src, dest]) + assert result[1] == 0 + 
+#------------------------------------------------------------------------------- +def count_object_parts_in_all_buckets(verbose=False, expected_size=0): result = rados(['lspools']) assert result[1] == 0 found=False @@ -420,17 +449,40 @@ def count_object_parts_in_all_buckets(verbose=False): result = rados(['ls', '-p ', POOLNAME]) assert result[1] == 0 - names=result[0].split() - count = 0 - for name in names: - #log.debug(name) - count = count + 1 + rados_count = len(names) + if (rados_count > 1000): + ### we can only do about 10 stat call per-second!! + ### TBD: add obj_size to ls output to allow more efficient size check + log.info(">>> rados obj_count(%d) is too high -> skip stat check\n", + len(names)) + expected_size = 0 + + byte_size_total = 0 + ondisk_size_total = 0 + start_time = time.perf_counter() + for rados_name in names: + if verbose: + log.debug(rados_name) + if expected_size: + result = rados(['-p ', POOLNAME, 'stat', rados_name]) + assert result[1] == 0 + stat = result[0].split() + byte_size=int(stat[-1]) + byte_size_total += byte_size + ondisk_size_total += calc_on_disk_byte_size(byte_size) + + if expected_size: + end_time = time.perf_counter() + time_elapsed = end_time - start_time + log.info("rados_count=%d, ondisk_size_total=%d, expected_size=%d, time=%d(sec)", + rados_count, ondisk_size_total, expected_size, time_elapsed) + assert ondisk_size_total == expected_size if verbose: - log.debug("Pool has %d rados objects", count) + log.debug("Pool has %d rados objects", rados_count) - return count + return rados_count #------------------------------------------------------------------------------- @@ -443,29 +495,61 @@ def cleanup_local(): return False +#------------------------------------------------------------------------------- +def check_delete_objects_response(response): + # Check for delete failures + if 'Errors' in response and response['Errors']: + log.error("Delete failures detected:") + for error in response['Errors']: + 
log.error("delete_objects::ERROR::Key=%s, Code=%s, Message=%s", + error['Key'], error['Code'], error['Message']) + + else: + log.debug("All objects deleted successfully.") + + +#------------------------------------------------------------------------------- +def delete_objects(conn, bucket_name, object_keys): + response=conn.delete_objects(Bucket=bucket_name, + Delete={"Objects": [{"Key": key} for key in object_keys]}) + + # Check for delete failures + check_delete_objects_response(response) + + #------------------------------------------------------------------------------- def delete_bucket_with_all_objects(bucket_name, conn): max_keys=1000 - marker="" + continuation_token = None obj_count=0 while True: - listing=conn.list_objects(Bucket=bucket_name, Marker=marker, MaxKeys=max_keys) + list_args = { + 'Bucket': bucket_name, + 'MaxKeys': max_keys + } + if continuation_token: + list_args['ContinuationToken'] = continuation_token + + listing=conn.list_objects_v2(**list_args) if 'Contents' not in listing or len(listing['Contents'])== 0: log.debug("Bucket '%s' is empty, skipping...", bucket_name) return objects=[] for obj in listing['Contents']: - log.debug(obj['Key']) + log.debug("delete_bucket_with_all_objects: add obj: %s", obj['Key']) objects.append({'Key': obj['Key']}) obj_count += len(objects) # delete objects from the bucket - conn.delete_objects(Bucket=bucket_name, Delete={'Objects':objects}) - if listing['IsTruncated']: - marker=listing['NextMarker'] - log.debug("marker=%s, obj_count=%d", marker, obj_count) - continue + log.debug("delete_bucket_with_all_objects: delete %d objs", obj_count) + response=conn.delete_objects(Bucket=bucket_name, Delete={'Objects':objects}) + check_delete_objects_response(response) + + if 'NextContinuationToken' in listing: + continuation_token = listing['NextContinuationToken'] + log.debug("delete_bucket_with_all_objects: Token=%s, count=%d", + continuation_token, obj_count) else: break @@ -476,7 +560,7 @@ def 
delete_bucket_with_all_objects(bucket_name, conn): def verify_pool_is_empty(): result = admin(['gc', 'process', '--include-all']) assert result[1] == 0 - assert count_object_parts_in_all_buckets() == 0 + assert count_object_parts_in_all_buckets(False, 0) == 0 #------------------------------------------------------------------------------- @@ -538,15 +622,39 @@ def calc_rados_obj_count(num_copies, obj_size, config): return rados_obj_count +BLOCK_SIZE=4096 +#------------------------------------------------------------------------------- +def calc_on_disk_byte_size(byte_size): + return (((byte_size+BLOCK_SIZE-1)//BLOCK_SIZE)*BLOCK_SIZE) + + +#------------------------------------------------------------------------------- +def calc_head_size(obj_size, config): + on_disk_byte_size = calc_on_disk_byte_size(obj_size) + threshold = config.multipart_threshold + # Objects with size bigger than MULTIPART_SIZE are uploaded as multi-part + # multi-part objects got a zero size Head objects + if obj_size >= threshold: + head_size = 0 + else: + head_size = min(RADOS_OBJ_SIZE, on_disk_byte_size) + + return head_size + + #------------------------------------------------------------------------------- def calc_dedupable_space(obj_size, config): + on_disk_byte_size = calc_on_disk_byte_size(obj_size) + threshold = config.multipart_threshold # Objects with size bigger than MULTIPART_SIZE are uploaded as multi-part # multi-part objects got a zero size Head objects if obj_size >= threshold: - dedupable_space = obj_size - elif obj_size > RADOS_OBJ_SIZE: - dedupable_space = obj_size - RADOS_OBJ_SIZE + dedupable_space = on_disk_byte_size + elif obj_size > SPLIT_HEAD_SIZE: + dedupable_space = on_disk_byte_size - RADOS_OBJ_SIZE + elif obj_size >= DEDUP_MIN_OBJ_SIZE: + dedupable_space = on_disk_byte_size else: dedupable_space = 0 @@ -554,10 +662,18 @@ def calc_dedupable_space(obj_size, config): float(obj_size)/MB, float(dedupable_space)/MB) return dedupable_space -BLOCK_SIZE=4096 
#------------------------------------------------------------------------------- -def calc_on_disk_byte_size(byte_size): - return (((byte_size+BLOCK_SIZE-1)//BLOCK_SIZE)*BLOCK_SIZE) +def calc_split_objs_count(obj_size, num_copies, config): + threshold = config.multipart_threshold + on_disk_byte_size = calc_on_disk_byte_size(obj_size) + + if num_copies < 2 or on_disk_byte_size > SPLIT_HEAD_SIZE or obj_size >= threshold: + return 0 + + if on_disk_byte_size < DEDUP_MIN_OBJ_SIZE: + return 0 + + return 1 #------------------------------------------------------------------------------- @@ -569,7 +685,7 @@ def calc_expected_stats(dedup_stats, obj_size, num_copies, config): threshold = config.multipart_threshold dedup_stats.skip_shared_manifest = 0 dedup_stats.size_before_dedup += (on_disk_byte_size * num_copies) - if on_disk_byte_size <= RADOS_OBJ_SIZE and threshold > RADOS_OBJ_SIZE: + if on_disk_byte_size < DEDUP_MIN_OBJ_SIZE and threshold > DEDUP_MIN_OBJ_SIZE: dedup_stats.skip_too_small += num_copies dedup_stats.skip_too_small_bytes += (on_disk_byte_size * num_copies) @@ -584,8 +700,6 @@ def calc_expected_stats(dedup_stats, obj_size, num_copies, config): return dedup_stats.total_processed_objects += num_copies - #dedup_stats.loaded_objects += num_copies - if num_copies == 1: dedup_stats.singleton_obj += 1 dedup_stats.skip_singleton += 1 @@ -628,21 +742,35 @@ def calc_expected_results(files, config): #------------------------------------------------------------------------------- -def upload_objects(bucket_name, files, indices, conn, config, check_obj_count=True): +def print_files(files, config): + for idx, f in enumerate(files): + filename=f[0] + obj_size=f[1] + num_copies=f[2] + assert(obj_size) + split_head = calc_split_objs_count(obj_size, num_copies, config) + log.info("[%d]%s::size=%d, num_copies=%d, split_head=%d", + idx, filename, obj_size, num_copies, split_head); + + +#------------------------------------------------------------------------------- +def 
upload_objects(bucket_name, files, indices, conn, config, check_obj_count): dedup_stats = Dedup_Stats() total_space=0 duplicated_space=0 duplicated_tail_objs=0 rados_objects_total=0 s3_objects_total=0 + split_head_objs=0 for (f, idx) in zip(files, indices): filename=f[0] obj_size=f[1] num_copies=f[2] assert(obj_size) + split_head_objs += calc_split_objs_count(obj_size, num_copies, config) calc_expected_stats(dedup_stats, obj_size, num_copies, config) - total_space += (obj_size * num_copies) + total_space += (calc_on_disk_byte_size(obj_size) * num_copies) dedupable_space=calc_dedupable_space(obj_size, config) duplicated_space += ((num_copies-1) * dedupable_space) rados_obj_count=calc_rados_obj_count(num_copies, obj_size, config) @@ -652,10 +780,9 @@ def upload_objects(bucket_name, files, indices, conn, config, check_obj_count=Tr s3_objects_total += num_copies if s3_objects_total and (s3_objects_total % 1000 == 0): log.debug("%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB", - s3_objects_total, rados_objects_total, total_space/MB) + s3_objects_total, rados_objects_total, total_space/MB) for i in range(idx, num_copies): key = gen_object_name(filename, i) - #log.debug("upload_file %s/%s with crc32", bucket_name, key) conn.upload_file(OUT_DIR + filename, bucket_name, key, Config=config) log.debug("==========================================") @@ -665,15 +792,70 @@ def upload_objects(bucket_name, files, indices, conn, config, check_obj_count=Tr log.debug("Based on calculation we should have %d duplicated tail objs", duplicated_tail_objs) log.debug("Based on calculation we should have %.2f MiB total in pool", total_space/MB) log.debug("Based on calculation we should have %.2f MiB duplicated space in pool", duplicated_space/MB) - - expected_rados_obj_count_post_dedup=(rados_objects_total-duplicated_tail_objs) + log.info("split_head_objs=%d, rados_objects_total=%d, duplicated_tail_objs=%d", + split_head_objs, rados_objects_total, duplicated_tail_objs) + 
expected_rados_obj_count_post_dedup=(split_head_objs+rados_objects_total-duplicated_tail_objs) log.debug("Post dedup expcted rados obj count = %d", expected_rados_obj_count_post_dedup) expcted_space_post_dedup=(total_space-duplicated_space) log.debug("Post dedup expcted data in pool = %.2f MiB", expcted_space_post_dedup/MB) if check_obj_count: - assert rados_objects_total == count_object_parts_in_all_buckets() + log.debug("upload_objects: verify current Rados state - total_space=%d", total_space) + # assert rados_objects_total == count_object_parts_in_all_buckets(True, dedup_stats.size_before_dedup) + # skip size check as it is time consuming + assert rados_objects_total == count_object_parts_in_all_buckets(False, 0) + + return (expected_rados_obj_count_post_dedup, dedup_stats, s3_objects_total) + + +#------------------------------------------------------------------------------- +def upload_objects_with_copy(files, conn, bucket_names, indices, config): + dedup_stats = Dedup_Stats() + total_space=0 + rados_objects_total=0 + s3_objects_total=0 + + for (f, idx) in zip(files, indices): + filename=f[0] + obj_size=f[1] + num_copies=f[2] + assert(obj_size) + head_size = calc_head_size(obj_size, config) + obj_size_on_disk=calc_on_disk_byte_size(obj_size) + log.debug("upload_objects_with_copy:obj_size=%d, on_disk_size=%d, head_size=%d", + obj_size, obj_size_on_disk, head_size); + total_space += (obj_size_on_disk + (num_copies-1)*head_size) + rados_obj_count=calc_rados_obj_count(num_copies, obj_size, config) + tail_objs_count =rados_obj_count-1 + rados_objects_total += (tail_objs_count + num_copies) + log.debug("upload_objects::%s::size=%d, num_copies=%d", filename, obj_size, num_copies); + s3_objects_total += num_copies + if s3_objects_total and (s3_objects_total % 1000 == 0): + log.debug("%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB", + s3_objects_total, rados_objects_total, total_space/MB) + + base_obj=dict() + for i in range(idx, num_copies): + 
key = gen_object_name(filename, i) + bucket_name=bucket_names[i] + if i == 0: + base_obj = {'Bucket': bucket_name, 'Key': key} + #log.info("upload_file: %s -> %s/%s", filename, bucket_name, key) + conn.upload_file(OUT_DIR + filename, bucket_name, key, Config=config) + else: + log.debug("copy_obj: %s/%s -> %s/%s", + base_obj['Bucket'], base_obj['Key'], bucket_name, key) + conn.copy_object(CopySource=base_obj, Bucket=bucket_name, Key=key) + + + dedup_stats.deduped_obj = 0 + dedup_stats.size_before_dedup = total_space + # No change should happen since tail-objects are already de-duplicated + dedup_stats.dedup_bytes_estimate = 0 + expected_rados_obj_count_post_dedup=rados_objects_total + + log.info("upload_objects: verify current Rados state - total_space=%d", total_space) + assert rados_objects_total == count_object_parts_in_all_buckets(False, total_space) - expected_results=(expected_rados_obj_count_post_dedup, expcted_space_post_dedup) return (expected_rados_obj_count_post_dedup, dedup_stats, s3_objects_total) @@ -686,13 +868,16 @@ def upload_objects_multi(files, conns, bucket_names, indices, config, check_obj_ duplicated_tail_objs=0 rados_objects_total=0 s3_objects_total=0 + split_head_objs=0 + for (f, idx) in zip(files, indices): filename=f[0] obj_size=f[1] num_copies=f[2] assert(obj_size) + split_head_objs += calc_split_objs_count(obj_size, num_copies, config) calc_expected_stats(dedup_stats, obj_size, num_copies, config) - total_space += (obj_size * num_copies) + total_space += (calc_on_disk_byte_size(obj_size) * num_copies) dedupable_space=calc_dedupable_space(obj_size, config) duplicated_space += ((num_copies-1) * dedupable_space) rados_obj_count=calc_rados_obj_count(num_copies, obj_size, config) @@ -702,7 +887,7 @@ def upload_objects_multi(files, conns, bucket_names, indices, config, check_obj_ s3_objects_total += num_copies if s3_objects_total and (s3_objects_total % 1000 == 0): log.debug("%d S3 objects were uploaded (%d rados objects), total size = %.2f 
MiB", - s3_objects_total, rados_objects_total, total_space/MB) + s3_objects_total, rados_objects_total, total_space/MB) for i in range(idx, num_copies): ten_id = i % max_tenants key = gen_object_name(filename, i) @@ -710,8 +895,8 @@ def upload_objects_multi(files, conns, bucket_names, indices, config, check_obj_ log.debug("upload_objects::<%s/%s>", bucket_names[ten_id], key) log.debug("==========================================") - log.debug("Summary:%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB", - s3_objects_total, rados_objects_total, total_space/MB) + log.debug("Summery:%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB", + s3_objects_total, rados_objects_total, total_space/MB) log.debug("Based on calculation we should have %d rados objects", rados_objects_total) log.debug("Based on calculation we should have %d duplicated tail objs", duplicated_tail_objs) log.debug("Based on calculation we should have %.2f MiB total in pool", total_space/MB) @@ -722,15 +907,16 @@ def upload_objects_multi(files, conns, bucket_names, indices, config, check_obj_ s3_object_count += count_objects_in_bucket(bucket_name, conn) log.debug("bucket listings reported a total of %d s3 objects", s3_object_count) - expected_rados_obj_count_post_dedup=(rados_objects_total-duplicated_tail_objs) + expected_rados_obj_count_post_dedup=(split_head_objs+rados_objects_total-duplicated_tail_objs) log.debug("Post dedup expcted rados obj count = %d", expected_rados_obj_count_post_dedup) expcted_space_post_dedup=(total_space-duplicated_space) log.debug("Post dedup expcted data in pool = %.2f MiB", expcted_space_post_dedup/MB) if check_obj_count: - assert rados_objects_total == count_object_parts_in_all_buckets() + log.debug("upload_objects_multi: verify current Rados state (obj/size)") + #assert rados_objects_total == count_object_parts_in_all_buckets(True, dedup_stats.size_before_dedup) + assert rados_objects_total == count_object_parts_in_all_buckets(False, 
0) assert (s3_object_count == s3_objects_total) - expected_results=(expected_rados_obj_count_post_dedup, expcted_space_post_dedup) return (expected_rados_obj_count_post_dedup, dedup_stats, s3_objects_total) @@ -769,13 +955,16 @@ def procs_upload_objects(files, conns, bucket_names, indices, config, check_obj_ duplicated_tail_objs=0 rados_objects_total=0 s3_objects_total=0 + split_head_objs=0 + for (f, idx) in zip(files, indices): filename=f[0] obj_size=f[1] num_copies=f[2] assert(obj_size) + split_head_objs += calc_split_objs_count(obj_size, num_copies, config) calc_expected_stats(dedup_stats, obj_size, num_copies, config) - total_space += (obj_size * num_copies) + total_space += (calc_on_disk_byte_size(obj_size) * num_copies) dedupable_space=calc_dedupable_space(obj_size, config) duplicated_space += ((num_copies-1) * dedupable_space) rados_obj_count=calc_rados_obj_count(num_copies, obj_size, config) @@ -789,8 +978,8 @@ def procs_upload_objects(files, conns, bucket_names, indices, config, check_obj_ proc_list[idx].join() log.debug("==========================================") - log.debug("Summary:%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB", - s3_objects_total, rados_objects_total, total_space/MB) + log.debug("Summery:%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB", + s3_objects_total, rados_objects_total, total_space/MB) log.debug("Based on calculation we should have %d rados objects", rados_objects_total) log.debug("Based on calculation we should have %d duplicated tail objs", duplicated_tail_objs) log.debug("Based on calculation we should have %.2f MiB total in pool", total_space/MB) @@ -801,27 +990,135 @@ def procs_upload_objects(files, conns, bucket_names, indices, config, check_obj_ s3_object_count += count_objects_in_bucket(bucket_name, conn) log.debug("bucket listings reported a total of %d s3 objects", s3_object_count) - expected_rados_obj_count_post_dedup=(rados_objects_total-duplicated_tail_objs) + 
expected_rados_obj_count_post_dedup=(split_head_objs+rados_objects_total-duplicated_tail_objs) log.debug("Post dedup expcted rados obj count = %d", expected_rados_obj_count_post_dedup) expcted_space_post_dedup=(total_space-duplicated_space) log.debug("Post dedup expcted data in pool = %.2f MiB", expcted_space_post_dedup/MB) if check_obj_count: - assert rados_objects_total == count_object_parts_in_all_buckets() + log.debug("procs_upload_objects:: count_object_parts_in_all_buckets()") + #assert rados_objects_total == count_object_parts_in_all_buckets(True, dedup_stats.size_before_dedup) + assert rados_objects_total == count_object_parts_in_all_buckets(False, 0) assert (s3_object_count == s3_objects_total) - expected_results=(expected_rados_obj_count_post_dedup, expcted_space_post_dedup) return (expected_rados_obj_count_post_dedup, dedup_stats, s3_objects_total) +#------------------------------------------------------------------------------- +def check_if_any_obj_exists(bucket_name, delete_list, conn): + delete_set = set(delete_list) + max_keys=1000 + continuation_token = None + + while True: + list_args = { + 'Bucket': bucket_name, + 'MaxKeys': max_keys + } + if continuation_token: + list_args['ContinuationToken'] = continuation_token + + listing=conn.list_objects_v2(**list_args) + if 'Contents' in listing: + for obj in listing['Contents']: + key=obj['Key'] + log.debug("check_if_any_obj_exists: key=%s", key) + if obj['Key'] in delete_set: + log.info("key <%s> was found in bucket", key) + + if 'NextContinuationToken' in listing: + continuation_token = listing['NextContinuationToken'] + log.debug("check_if_any_obj_exists: Token=%s", continuation_token) + else: + break + + +#------------------------------------------------------------------------------- +def delete_objects_multi(conns, bucket_names, ten_id, object_keys): + conn = conns[ten_id] + bucket_name = bucket_names[ten_id] + delete_objects(conn, bucket_name, object_keys) 
#------------------------------------------------------------------------------- -def verify_objects(bucket_name, files, conn, expected_results, config): +def delete_dup_objects_multi(files, conns, bucket_names): + max_tenants=len(conns) + tenants_obj_lists = [[] for _ in range(max_tenants)] + + for f in files: + filename=f[0] + num_copies=f[2] + if num_copies > 1: + start_idx=1 + for i in range(start_idx, num_copies): + key = gen_object_name(filename, i) + log.debug("delete_dup_objects_multi: delete key::%s::", key); + ten_id = i % max_tenants + object_keys = tenants_obj_lists[ten_id] + object_keys.append(key) + # flush delete request after every 500 objects + if len(object_keys) >= 500: + delete_objects_multi(conns, bucket_names, ten_id, object_keys) + object_keys.clear() + + # remove leftover objects + for ten_id in range(max_tenants): + object_keys = tenants_obj_lists[ten_id] + if len(object_keys): + delete_objects_multi(conns, bucket_names, ten_id, object_keys) + + # must call garbage collection for predictable count + result = admin(['gc', 'process', '--include-all']) + assert result[1] == 0 + + +#------------------------------------------------------------------------------- +def delete_dup_objects(bucket_name, files, conn): + delete_list_total=[] + object_keys=[] + + for f in files: + filename=f[0] + num_copies=f[2] + if num_copies > 1: + start_idx=1 + for i in range(start_idx, num_copies): + key = gen_object_name(filename, i) + log.debug("delete key::%s::", key); + delete_list_total.append(key) + object_keys.append(key) + + # flush delete request after every 500 files + if len(object_keys) >= 500: + delete_objects(conn, bucket_name, object_keys) + object_keys.clear() + + + # remove leftover objects + if len(object_keys): + delete_objects(conn, bucket_name, object_keys) + + verify=True + if verify: + log.debug("delete_dup_objects: verify delete_list_total") + check_if_any_obj_exists(bucket_name, delete_list_total, conn) + + # must call garbage collection 
for predictable count + result = admin(['gc', 'process', '--include-all']) + assert result[1] == 0 + + +#------------------------------------------------------------------------------- +def verify_objects(bucket_name, files, conn, expected_results, config, delete_dups): + if expected_results: + assert expected_results == count_object_parts_in_all_buckets(True) + tmpfile = OUT_DIR + "temp" for f in files: filename=f[0] obj_size=f[1] num_copies=f[2] log.debug("comparing file=%s, size=%d, copies=%d", filename, obj_size, num_copies) - for i in range(0, num_copies): + + ### first verify duplicates at index 1 and higher + for i in range(1, num_copies): filecmp.clear_cache() key = gen_object_name(filename, i) conn.download_file(bucket_name, key, tmpfile, Config=config) @@ -829,12 +1126,28 @@ def verify_objects(bucket_name, files, conn, expected_results, config): assert equal ,"Files %s and %s differ!!" % (key, tmpfile) os.remove(tmpfile) - log.debug("verify_objects: finished reading all objects") - assert expected_results == count_object_parts_in_all_buckets(True) + ### Then delete all duplicates + if delete_dups: + delete_dup_objects(bucket_name, files, conn) + + ### Last, verify the object at index zero making sure refcount works + filecmp.clear_cache() + i = 0 + for f in files: + filename=f[0] + key = gen_object_name(filename, i) + conn.download_file(bucket_name, key, tmpfile, Config=config) + equal = filecmp.cmp(tmpfile, OUT_DIR + filename, shallow=False) + assert equal ,"Files %s and %s differ!!" 
% (key, tmpfile) + os.remove(tmpfile) + log.debug("verify_objects::completed successfully!!") #------------------------------------------------------------------------------- -def verify_objects_multi(files, conns, bucket_names, expected_results, config): +def verify_objects_multi(files, conns, bucket_names, expected_results, config, delete_dups): + if expected_results: + assert expected_results == count_object_parts_in_all_buckets(True) + max_tenants=len(conns) tmpfile = OUT_DIR + "temp" for f in files: @@ -842,18 +1155,37 @@ def verify_objects_multi(files, conns, bucket_names, expected_results, config): obj_size=f[1] num_copies=f[2] log.debug("comparing file=%s, size=%d, copies=%d", filename, obj_size, num_copies) - for i in range(0, num_copies): + ### first verify duplicates at index 1 and higher + for i in range(1, num_copies): filecmp.clear_cache() key = gen_object_name(filename, i) log.debug("comparing object %s with file %s", key, filename) ten_id = i % max_tenants - conns[ten_id].download_file(bucket_names[ten_id], key, tmpfile, Config=config) + conns[ten_id].download_file(bucket_names[ten_id], key, tmpfile, + Config=config) equal = filecmp.cmp(tmpfile, OUT_DIR + filename, shallow=False) assert equal ,"Files %s and %s differ!!" % (key, tmpfile) os.remove(tmpfile) - assert expected_results == count_object_parts_in_all_buckets(True) - log.debug("verify_objects::completed successfully!!") + ### Then delete all duplicates + if delete_dups: + delete_dup_objects_multi(files, conns, bucket_names) + + ### Last, verify the object at index zero making sure refcount works + filecmp.clear_cache() + i = 0 + for f in files: + filename=f[0] + key = gen_object_name(filename, i) + log.debug("comparing object %s with file %s", key, filename) + ten_id = i % max_tenants + conns[ten_id].download_file(bucket_names[ten_id], key, tmpfile, + Config=config) + equal = filecmp.cmp(tmpfile, OUT_DIR + filename, shallow=False) + assert equal ,"Files %s and %s differ!!" 
% (key, tmpfile) + os.remove(tmpfile) + + log.debug("verify_objects_multi::completed successfully!!") #------------------------------------------------------------------------------- @@ -893,7 +1225,7 @@ def threads_verify_objects(files, conns, bucket_names, expected_results, config) thread_list[idx].join() assert expected_results == count_object_parts_in_all_buckets(True) - log.debug("verify_objects::completed successfully!!") + log.debug("threads_verify_objects::completed successfully!!") #------------------------------------------------------------------------------- @@ -903,9 +1235,12 @@ def get_stats_line_val(line): #------------------------------------------------------------------------------- def print_dedup_stats(dedup_stats): + log.info("===============================================") + for key in dedup_stats.__dict__: - log.warning("dedup_stats[%s] = %d", key, dedup_stats.__dict__[key]) + log.info("dedup_stats[%s] = %d", key, dedup_stats.__dict__[key]) + log.info("===============================================") #------------------------------------------------------------------------------- def print_dedup_stats_diff(actual, expected): @@ -992,8 +1327,14 @@ def verify_dedup_ratio(expected_dedup_stats, dedup_ratio): else: ratio = 0 + + log.debug("skip_too_small_bytes = %d", expected_dedup_stats.skip_too_small_bytes) + if expected_dedup_stats.non_default_storage_class_objs_bytes: + log.debug("non_default_storage_class_objs_bytes= %d", + expected_dedup_stats.non_default_storage_class_objs_bytes) + log.debug("s3_bytes_before = %d/%d", s3_bytes_before, dedup_ratio.s3_bytes_before) - log.debug("s3_dedup_bytes = %d", expected_dedup_stats.dedup_bytes_estimate); + log.debug("s3_dedup_bytes = %d", s3_dedup_bytes); log.debug("s3_bytes_after = %d/%d", s3_bytes_after, dedup_ratio.s3_bytes_after) log.debug("ratio = %f/%f", ratio, dedup_ratio.ratio) @@ -1098,7 +1439,7 @@ def exec_dedup_internal(expected_dedup_stats, dry_run, max_dedup_time): 
set_bucket_index_throttling(limit) #------------------------------------------------------------------------------- -def exec_dedup(expected_dedup_stats, dry_run, verify_stats=True): +def exec_dedup(expected_dedup_stats, dry_run, verify_stats=True, post_dedup_size=0): # dedup should complete in less than 5 minutes max_dedup_time = 5*60 if expected_dedup_stats.deduped_obj > 10000: @@ -1113,7 +1454,16 @@ def exec_dedup(expected_dedup_stats, dry_run, verify_stats=True): dedup_stats = ret[1] dedup_ratio_estimate = ret[2] dedup_ratio_actual = ret[3] + log.debug("exec_dedup: verify_dedup_ratio") + verify_dedup_ratio(expected_dedup_stats, dedup_ratio_estimate) + if post_dedup_size == 0: + post_dedup_size = dedup_ratio_estimate.s3_bytes_after + + # no need to check after dry-run which doesn't change anything + if dry_run: + post_dedup_size = 0 + count_object_parts_in_all_buckets(True, post_dedup_size) if verify_stats == False: return ret @@ -1121,6 +1471,7 @@ def exec_dedup(expected_dedup_stats, dry_run, verify_stats=True): log.debug("potential_unique_obj= %d / %d ", dedup_stats.potential_unique_obj, expected_dedup_stats.potential_unique_obj) + #dedup_stats.set_hash = dedup_stats.invalid_hash if dedup_stats != expected_dedup_stats: log.debug("==================================================") @@ -1129,16 +1480,14 @@ def exec_dedup(expected_dedup_stats, dry_run, verify_stats=True): log.debug("==================================================\n") assert dedup_stats == expected_dedup_stats - verify_dedup_ratio(expected_dedup_stats, dedup_ratio_estimate) log.debug("expcted_dedup::stats check completed successfully!!") return ret - #------------------------------------------------------------------------------- def prepare_test(): cleanup_local() #make sure we are starting with all buckets empty - if count_object_parts_in_all_buckets() != 0: + if count_object_parts_in_all_buckets(False, 0) != 0: log.warning("The system was left dirty from previous run"); log.warning("Make 
sure to remove all objects before starting"); assert(0) @@ -1163,15 +1512,16 @@ def small_single_part_objs_dedup(conn, bucket_name, dry_run): prepare_test() try: files=[] - num_files = 8 - base_size = 4*KB + num_files = 5 + base_size = 1*KB log.debug("generate files: base size=%d KiB, max_size=%d KiB", base_size/KB, (pow(2, num_files) * base_size)/KB) gen_files(files, base_size, num_files) bucket = conn.create_bucket(Bucket=bucket_name) log.debug("upload objects to bucket <%s> ...", bucket_name) indices = [0] * len(files) - ret = upload_objects(bucket_name, files, indices, conn, default_config) + check_obj_count=True + ret = upload_objects(bucket_name, files, indices, conn, default_config, check_obj_count) expected_results = ret[0] dedup_stats = ret[1] s3_objects_total = ret[2] @@ -1183,13 +1533,11 @@ def small_single_part_objs_dedup(conn, bucket_name, dry_run): small_objs_dedup_stats.size_before_dedup = dedup_stats.size_before_dedup small_objs_dedup_stats.skip_too_small_bytes=dedup_stats.size_before_dedup small_objs_dedup_stats.skip_too_small = s3_objects_total - assert small_objs_dedup_stats == dedup_stats exec_dedup(dedup_stats, dry_run) if dry_run == False: log.debug("Verify all objects") - verify_objects(bucket_name, files, conn, expected_results, default_config) - + verify_objects(bucket_name, files, conn, expected_results, default_config, True) finally: # cleanup must be executed even after a failure cleanup(bucket_name, conn) @@ -1221,21 +1569,22 @@ def simple_dedup(conn, files, bucket_name, run_cleanup_after, config, dry_run): bucket = conn.create_bucket(Bucket=bucket_name) indices = [0] * len(files) log.debug("upload objects to bucket <%s> ...", bucket_name) - ret = upload_objects(bucket_name, files, indices, conn, config) + check_obj_count=True + ret = upload_objects(bucket_name, files, indices, conn, config, check_obj_count) expected_results = ret[0] dedup_stats = ret[1] log.info("%d S3 objects were uploaded", ret[2]) exec_dedup(dedup_stats, dry_run) if 
dry_run == False: log.debug("Verify all objects") - verify_objects(bucket_name, files, conn, expected_results, config) - - return ret + verify_objects(bucket_name, files, conn, expected_results, config, run_cleanup_after) finally: if run_cleanup_after: # cleanup must be executed even after a failure cleanup(bucket_name, conn) + return ret + #------------------------------------------------------------------------------- def simple_dedup_with_tenants(files, conns, bucket_names, config, dry_run=False): @@ -1246,7 +1595,8 @@ def simple_dedup_with_tenants(files, conns, bucket_names, config, dry_run=False) exec_dedup(dedup_stats, dry_run) if dry_run == False: log.debug("Verify all objects") - verify_objects_multi(files, conns, bucket_names, expected_results, config) + verify_objects_multi(files, conns, bucket_names, expected_results, config, + False) return ret @@ -1267,19 +1617,18 @@ def dedup_basic_with_tenants_common(files, max_copies_count, config, dry_run): #------------------------------------------------------------------------------- def threads_simple_dedup_with_tenants(files, conns, bucket_names, config, dry_run=False): indices=[0] * len(files) - start = time.time_ns() - upload_ret=procs_upload_objects(files, conns, bucket_names, indices, config) + check_obj_count=True + upload_ret=procs_upload_objects(files, conns, bucket_names, indices, config, check_obj_count) upload_time_sec = (time.time_ns() - start) / (1000*1000*1000) expected_results = upload_ret[0] dedup_stats = upload_ret[1] s3_objects_total = upload_ret[2] - exec_ret=exec_dedup(dedup_stats, dry_run) exec_time_sec=exec_ret[0] verify_time_sec=0 if dry_run == False: - log.debug("Verify all objects") + log.debug("threads_simple_dedup_with_tenants::Verify all objects") start = time.time_ns() threads_verify_objects(files, conns, bucket_names, expected_results, config) @@ -1578,6 +1927,7 @@ CORRUPTIONS = ("no corruption", "change_etag", "illegal_hex_value", "change_num_parts", "illegal_separator", 
"illegal_dec_val_num_parts", "illegal_num_parts_overflow") + #------------------------------------------------------------------------------ def change_object_etag(rados_name, new_etag): result = rados(['-p ', POOLNAME, 'setxattr', rados_name, ETAG_ATTR, new_etag]) @@ -1646,7 +1996,7 @@ def corrupt_etag(key, corruption, expected_dedup_stats): new_etag=gen_new_etag(old_etag, corruption, expected_dedup_stats) log.debug("Corruption:: %s\nold_etag=%s\nnew_etag=%s", - corruption, old_etag, new_etag) + corruption, old_etag, new_etag) change_object_etag(rados_name, new_etag) return (rados_name, old_etag) @@ -1670,7 +2020,8 @@ def test_dedup_etag_corruption(): bucket = conn.create_bucket(Bucket=bucket_name) indices = [0] * len(files) - ret = upload_objects(bucket_name, files, indices, conn, default_config) + check_obj_count=True + ret = upload_objects(bucket_name, files, indices, conn, default_config, check_obj_count) expected_results = ret[0] expected_dedup_stats = ret[1] s3_objects_total = ret[2] @@ -1751,7 +2102,8 @@ def test_md5_collisions(): conn=get_single_connection() bucket = conn.create_bucket(Bucket=bucket_name) indices = [0] * len(files) - upload_objects(bucket_name, files, indices, conn, config2) + check_obj_count=True + upload_objects(bucket_name, files, indices, conn, config2, check_obj_count) dedup_stats = Dedup_Stats() # we wrote 2 different small objects (BLOCK_SIZE) with the same md5 @@ -1770,33 +2122,178 @@ def test_md5_collisions(): dedup_stats.set_hash=dedup_stats.total_processed_objects dedup_stats.hash_mismatch=1 s3_bytes_before=dedup_stats.size_before_dedup - expected_ratio_actual=Dedup_Ratio() - expected_ratio_actual.s3_bytes_before=s3_bytes_before - expected_ratio_actual.s3_bytes_after=s3_bytes_before - expected_ratio_actual.ratio=0 + expected_ratio=Dedup_Ratio() + expected_ratio.s3_bytes_before=s3_bytes_before + expected_ratio.s3_bytes_after=s3_bytes_before + expected_ratio.ratio=0 dry_run=False log.debug("test_md5_collisions: first call to 
exec_dedup") - ret=exec_dedup(dedup_stats, dry_run) + ret=exec_dedup(dedup_stats, dry_run, True, 2*BLOCK_SIZE) dedup_ratio_actual=ret[3] - - assert expected_ratio_actual == dedup_ratio_actual + assert expected_ratio == dedup_ratio_actual dedup_stats.valid_hash=dedup_stats.total_processed_objects dedup_stats.invalid_hash=0 dedup_stats.set_hash=0 log.debug("test_md5_collisions: second call to exec_dedup") - ret=exec_dedup(dedup_stats, dry_run) + ret=exec_dedup(dedup_stats, dry_run, True, 2*BLOCK_SIZE) dedup_ratio_actual=ret[3] - assert expected_ratio_actual == dedup_ratio_actual + assert expected_ratio == dedup_ratio_actual finally: # cleanup must be executed even after a failure cleanup(bucket_name, conn) +#------------------------------------------------------------------------------- +def loop_dedup_split_head_with_tenants(): + prepare_test() + config=default_config + success=False + max_copies_count=4 + files=[] + num_files=11 # [16KB-32MB] + base_size = 16*KB + log.debug("generate files: base size=%d KiB, max_size=%d KiB", + base_size/KB, (pow(2, num_files) * base_size)/KB) + try: + gen_files(files, base_size, num_files, max_copies_count) + indices=[0] * len(files) + ret=gen_connections_multi2(max_copies_count) + #tenants=ret[0] + bucket_names=ret[1] + conns=ret[2] + + ret=upload_objects_multi(files, conns, bucket_names, indices, config, True) + expected_results = ret[0] + dedup_stats = ret[1] + + dry_run=False + exec_dedup(dedup_stats, dry_run, True) + log.debug("Verify all objects") + verify_objects_multi(files, conns, bucket_names, expected_results, config, + True) + success=True + finally: + cleanup_all_buckets(bucket_names, conns) + if not success: + print_files(files, config) + + +#------------------------------------------------------------------------------- +@pytest.mark.basic_test +def test_dedup_split_head_with_tenants(): + #return + + if full_dedup_is_disabled(): + return + + for idx in range(0, 9): + log.debug("test_dedup_split_head_with_tenants: 
loop #%d", idx); + loop_dedup_split_head_with_tenants() + + +#------------------------------------------------------------------------------- +def loop_dedup_split_head(): + prepare_test() + #bucket_name = gen_bucket_name() + bucket_name = "bucket1" + config=default_config + max_copies_count=4 + files=[] + num_files=11 # [16KB-32MB] + base_size = 16*KB + log.debug("generate files: base size=%d KiB, max_size=%d KiB", + base_size/KB, (pow(2, num_files) * base_size)/KB) + try: + gen_files(files, base_size, num_files, max_copies_count) + indices=[0] * len(files) + + conn=get_single_connection() + conn.create_bucket(Bucket=bucket_name) + check_obj_count=True + ret=upload_objects(bucket_name, files, indices, conn, config, check_obj_count) + expected_results = ret[0] + dedup_stats = ret[1] + dry_run=False + exec_dedup(dedup_stats, dry_run, True) + verify_objects(bucket_name, files, conn, expected_results, config, True) + finally: + cleanup(bucket_name, conn) + + +#------------------------------------------------------------------------------- +@pytest.mark.basic_test +def test_dedup_split_head(): + #return + + if full_dedup_is_disabled(): + return + + for idx in range(0, 9): + log.debug("test_dedup_split_head: loop #%d", idx); + loop_dedup_split_head() + +#------------------------------------------------------------------------------- +def dedup_copy_internal(multi_buckets): + if full_dedup_is_disabled(): + return + + prepare_test() + bucket_names=[] + config=default_config + max_copies_count=4 + files=[] + num_files=6 # [5MB-320MB] + base_size=5*MB + log.debug("generate files: base size=%d KiB, max_size=%d KiB", + base_size/KB, (pow(2, num_files) * base_size)/KB) + conn=get_single_connection() + try: + gen_files(files, base_size, num_files, max_copies_count) + indices=[0] * len(files) + if multi_buckets: + bucket_names=create_buckets(conn, max_copies_count) + else: + bucket_name = "bucket1" + conn.create_bucket(Bucket=bucket_name) + bucket_names=[bucket_name] * 
max_copies_count + + ret=upload_objects_with_copy(files, conn, bucket_names, indices, config) + expected_results = ret[0] + dedup_stats = ret[1] + dry_run=False + max_dedup_time = 5*60 + exec_dedup_internal(dedup_stats, dry_run, max_dedup_time) + + assert expected_results == count_object_parts_in_all_buckets(True, dedup_stats.size_before_dedup) + expected_results=0 # skip object_parts verification + conns=[conn] * len(bucket_names) + verify_objects_multi(files, conns, bucket_names, expected_results, config, True) + finally: + # cleanup must be executed even after a failure + if multi_buckets: + for bucket_name in bucket_names: + cleanup(bucket_name, conn) + else: + cleanup(bucket_names[0], conn) + + +#------------------------------------------------------------------------------- +@pytest.mark.basic_test +def test_dedup_copy(): + #return + dedup_copy_internal(False) + +#------------------------------------------------------------------------------- +@pytest.mark.basic_test +def test_dedup_copy_multi_buckets(): + #return + dedup_copy_internal(True) + #------------------------------------------------------------------------------- @pytest.mark.basic_test def test_dedup_small(): @@ -1822,8 +2319,8 @@ def test_dedup_small_with_tenants(): prepare_test() max_copies_count=3 files=[] - num_files=10 # [4KB-4MB] - base_size = 4*KB + num_files=5 # [1KB-32KB] + base_size = 1*KB log.debug("generate files: base size=%d KiB, max_size=%d KiB", base_size/KB, (pow(2, num_files) * base_size)/KB) try: @@ -1851,7 +2348,8 @@ def test_dedup_small_with_tenants(): dry_run=False exec_dedup(dedup_stats, dry_run) log.debug("Verify all objects") - verify_objects_multi(files, conns, bucket_names, expected_results, default_config) + verify_objects_multi(files, conns, bucket_names, expected_results, + default_config, True) finally: # cleanup must be executed even after a failure cleanup_all_buckets(bucket_names, conns) @@ -1894,7 +2392,7 @@ def test_dedup_inc_0_with_tenants(): 
dedup_stats2.set_shared_manifest_src=0 dedup_stats2.deduped_obj=0 dedup_stats2.deduped_obj_bytes=0 - dedup_stats2.valid_hash=dedup_stats.invalid_hash + dedup_stats2.valid_hash=dedup_stats.unique_obj dedup_stats2.invalid_hash=0 dedup_stats2.set_hash=0 @@ -1902,7 +2400,8 @@ def test_dedup_inc_0_with_tenants(): # run dedup again and make sure nothing has changed dry_run=False exec_dedup(dedup_stats2, dry_run) - verify_objects_multi(files, conns, bucket_names, expected_results, config) + verify_objects_multi(files, conns, bucket_names, expected_results, + config, True) finally: # cleanup must be executed even after a failure cleanup_all_buckets(bucket_names, conns) @@ -1931,7 +2430,9 @@ def test_dedup_inc_0(): num_files = 11 gen_files_in_range(files, num_files, 1*MB, 64*MB) # upload objects, dedup, verify, but don't cleanup - ret = simple_dedup(conn, files, bucket_name, False, config, False) + run_cleanup_after=False + dry_run=False + ret = simple_dedup(conn, files, bucket_name, run_cleanup_after, config, dry_run) expected_results = ret[0] dedup_stats = ret[1] s3_objects_total = ret[2] @@ -1942,7 +2443,7 @@ def test_dedup_inc_0(): dedup_stats2.set_shared_manifest_src=0 dedup_stats2.deduped_obj=0 dedup_stats2.deduped_obj_bytes=0 - dedup_stats2.valid_hash=dedup_stats.invalid_hash + dedup_stats2.valid_hash=dedup_stats.unique_obj dedup_stats2.invalid_hash=0 dedup_stats2.set_hash=0 @@ -1950,7 +2451,7 @@ def test_dedup_inc_0(): # run dedup again and make sure nothing has changed dry_run=False exec_dedup(dedup_stats2, dry_run) - verify_objects(bucket_name, files, conn, expected_results, config) + verify_objects(bucket_name, files, conn, expected_results, config, True) finally: # cleanup must be executed even after a failure cleanup(bucket_name, conn) @@ -2002,6 +2503,7 @@ def test_dedup_inc_1_with_tenants(): ret=upload_objects_multi(files_combined, conns, bucket_names, indices, config, False) expected_results=ret[0] stats_combined=ret[1] + stats_combined.skip_shared_manifest 
= stats_base.deduped_obj stats_combined.skip_src_record -= stats_base.skip_src_record stats_combined.skip_src_record += stats_base.set_shared_manifest_src @@ -2010,15 +2512,16 @@ def test_dedup_inc_1_with_tenants(): stats_combined.deduped_obj -= stats_base.deduped_obj stats_combined.deduped_obj_bytes -= stats_base.deduped_obj_bytes - stats_combined.valid_hash = stats_base.set_hash + stats_combined.valid_hash = stats_base.unique_obj stats_combined.invalid_hash -= stats_base.set_hash - stats_combined.set_hash -= stats_base.set_hash + stats_combined.set_hash = stats_combined.set_shared_manifest_src + stats_combined.deduped_obj log.debug("test_dedup_inc_1_with_tenants: incremental dedup:") # run dedup again dry_run=False exec_dedup(stats_combined, dry_run) - verify_objects_multi(files_combined, conns, bucket_names, expected_results, config) + verify_objects_multi(files_combined, conns, bucket_names, expected_results, + config, True) finally: # cleanup must be executed even after a failure cleanup_all_buckets(bucket_names, conns) @@ -2063,7 +2566,8 @@ def test_dedup_inc_1(): num_copies_combined=num_copies_to_add+num_copies_base files_combined.append((filename, obj_size, num_copies_combined)) - ret=upload_objects(bucket_name, files_combined, indices, conn, config, False) + check_obj_count=False + ret=upload_objects(bucket_name, files_combined, indices, conn, config, check_obj_count) expected_results = ret[0] stats_combined = ret[1] stats_combined.skip_shared_manifest = stats_base.deduped_obj @@ -2074,15 +2578,15 @@ def test_dedup_inc_1(): stats_combined.deduped_obj -= stats_base.deduped_obj stats_combined.deduped_obj_bytes -= stats_base.deduped_obj_bytes - stats_combined.valid_hash = stats_base.set_hash + stats_combined.valid_hash = stats_base.unique_obj stats_combined.invalid_hash -= stats_base.set_hash - stats_combined.set_hash -= stats_base.set_hash + stats_combined.set_hash = stats_combined.set_shared_manifest_src + stats_combined.deduped_obj 
log.debug("test_dedup_inc_1: incremental dedup:") # run dedup again dry_run=False exec_dedup(stats_combined, dry_run) - verify_objects(bucket_name, files_combined, conn, expected_results, config) + verify_objects(bucket_name, files_combined, conn, expected_results, config, True) finally: # cleanup must be executed even after a failure cleanup(bucket_name, conn) @@ -2151,15 +2655,16 @@ def test_dedup_inc_2_with_tenants(): stats_combined.deduped_obj -= stats_base.deduped_obj stats_combined.deduped_obj_bytes -= stats_base.deduped_obj_bytes - stats_combined.valid_hash = stats_base.set_hash + stats_combined.valid_hash = stats_base.unique_obj stats_combined.invalid_hash -= stats_base.set_hash - stats_combined.set_hash -= stats_base.set_hash + stats_combined.set_hash = stats_combined.set_shared_manifest_src + stats_combined.deduped_obj log.debug("test_dedup_inc_2_with_tenants: incremental dedup:") # run dedup again dry_run=False exec_dedup(stats_combined, dry_run) - verify_objects_multi(files_combined, conns, bucket_names, expected_results, config) + verify_objects_multi(files_combined, conns, bucket_names, expected_results, + config, True) finally: # cleanup must be executed even after a failure cleanup_all_buckets(bucket_names, conns) @@ -2212,7 +2717,8 @@ def test_dedup_inc_2(): indices.append(0) assert(len(indices) == len(files_combined)) - ret=upload_objects(bucket_name, files_combined, indices, conn, config, False) + check_obj_count=False + ret=upload_objects(bucket_name, files_combined, indices, conn, config, check_obj_count) expected_results = ret[0] stats_combined = ret[1] stats_combined.skip_shared_manifest = stats_base.deduped_obj @@ -2223,16 +2729,16 @@ def test_dedup_inc_2(): stats_combined.deduped_obj -= stats_base.deduped_obj stats_combined.deduped_obj_bytes -= stats_base.deduped_obj_bytes - stats_combined.valid_hash = stats_base.set_hash + stats_combined.valid_hash = stats_base.unique_obj stats_combined.invalid_hash -= stats_base.set_hash - 
stats_combined.set_hash -= stats_base.set_hash + stats_combined.set_hash = stats_combined.set_shared_manifest_src + stats_combined.deduped_obj log.debug("test_dedup_inc_2: incremental dedup:") # run dedup again dry_run=False exec_dedup(stats_combined, dry_run) verify_objects(bucket_name, files_combined, conn, expected_results, - config) + config, True) finally: # cleanup must be executed even after a failure cleanup(bucket_name, conn) @@ -2246,7 +2752,6 @@ def test_dedup_inc_2(): @pytest.mark.basic_test def test_dedup_inc_with_remove_multi_tenants(): #return - if full_dedup_is_disabled(): return @@ -2259,6 +2764,9 @@ def test_dedup_inc_with_remove_multi_tenants(): bucket_names=ret[1] conns=ret[2] try: + split_heads_count=0 + split_heads_removed=0 + split_heads=[] files=[] num_files = 17 # gen_files_in_range creates 2-3 copies @@ -2268,14 +2776,23 @@ def test_dedup_inc_with_remove_multi_tenants(): expected_results_base = ret[0] stats_base = ret[1] + ### find which objects got split head before remove + for f in files: + obj_size=f[1] + num_copies=f[2] + split_head = calc_split_objs_count(obj_size, num_copies, config) + split_heads.append(split_head) + if split_head: + split_heads_count += 1 + # REMOVE some objects and update stats/expected src_record=0 shared_manifest=0 - valid_sha=0 + valid_hash=0 object_keys=[] files_sub=[] dedup_stats = Dedup_Stats() - for f in files: + for idx, f in enumerate(files): filename=f[0] obj_size=f[1] num_copies=f[2] @@ -2283,13 +2800,18 @@ def test_dedup_inc_with_remove_multi_tenants(): num_copies_2=num_copies-num_remove log.debug("objects::%s::size=%d, num_copies=%d", filename, obj_size, num_copies_2); if num_copies_2: - if num_copies_2 > 1 and obj_size > RADOS_OBJ_SIZE: - valid_sha += num_copies_2 + split_head = calc_split_objs_count(obj_size, num_copies_2, config) + if num_copies_2 > 1 and (obj_size > RADOS_OBJ_SIZE or split_head): + valid_hash += 1 src_record += 1 shared_manifest += (num_copies_2 - 1) files_sub.append((filename, 
obj_size, num_copies_2)) calc_expected_stats(dedup_stats, obj_size, num_copies_2, config) + elif split_heads[idx]: + # we removed all copies of a split-head object + split_heads_count -= 1 + split_heads_removed += 1 start_idx=num_copies_2 for i in range(start_idx, num_copies): @@ -2305,9 +2827,10 @@ def test_dedup_inc_with_remove_multi_tenants(): dedup_stats.set_shared_manifest_src=0 dedup_stats.deduped_obj=0 dedup_stats.deduped_obj_bytes=0 + dedup_stats.skip_src_record=src_record dedup_stats.skip_shared_manifest=shared_manifest - dedup_stats.valid_hash=valid_sha + dedup_stats.valid_hash=valid_hash dedup_stats.invalid_hash=0 dedup_stats.set_hash=0 @@ -2315,7 +2838,9 @@ def test_dedup_inc_with_remove_multi_tenants(): dry_run=False exec_dedup(dedup_stats, dry_run) expected_results=calc_expected_results(files_sub, config) - verify_objects_multi(files_sub, conns, bucket_names, expected_results, config) + expected_results += split_heads_count + verify_objects_multi(files_sub, conns, bucket_names, expected_results, + config, True) finally: # cleanup must be executed even after a failure cleanup_all_buckets(bucket_names, conns) @@ -2329,7 +2854,6 @@ def test_dedup_inc_with_remove_multi_tenants(): @pytest.mark.basic_test def test_dedup_inc_with_remove(): #return - if full_dedup_is_disabled(): return @@ -2339,6 +2863,9 @@ def test_dedup_inc_with_remove(): log.debug("test_dedup_inc_with_remove: connect to AWS ...") conn=get_single_connection() try: + split_heads_count=0 + split_heads_removed=0 + split_heads=[] files=[] num_files = 17 gen_files_in_range(files, num_files, 1*MB, 64*MB) @@ -2347,28 +2874,41 @@ def test_dedup_inc_with_remove(): expected_results_base = ret[0] stats_base = ret[1] + ### find which objects got split head before remove + for f in files: + obj_size=f[1] + num_copies=f[2] + split_head = calc_split_objs_count(obj_size, num_copies, config) + split_heads.append(split_head) + if split_head: + split_heads_count += 1 + # REMOVE some objects and update 
stats/expected src_record=0 shared_manifest=0 - valid_sha=0 + valid_hash=0 object_keys=[] files_sub=[] dedup_stats = Dedup_Stats() - for f in files: + for idx, f in enumerate(files): filename=f[0] obj_size=f[1] num_copies=f[2] num_remove=random.randint(0, num_copies) num_copies_2=num_copies-num_remove - log.debug("objects::%s::size=%d, num_copies=%d", filename, obj_size, num_copies_2); if num_copies_2: - if num_copies_2 > 1 and obj_size > RADOS_OBJ_SIZE: - valid_sha += num_copies_2 + split_head = calc_split_objs_count(obj_size, num_copies_2, config) + if num_copies_2 > 1 and (obj_size > RADOS_OBJ_SIZE or split_head): + valid_hash += 1 src_record += 1 shared_manifest += (num_copies_2 - 1) files_sub.append((filename, obj_size, num_copies_2)) calc_expected_stats(dedup_stats, obj_size, num_copies_2, config) + elif split_heads[idx]: + # we removed all copies of a split-head object + split_heads_count -= 1 + split_heads_removed += 1 start_idx=num_copies_2 for i in range(start_idx, num_copies): @@ -2380,8 +2920,7 @@ def test_dedup_inc_with_remove(): log.debug("Skiping file=%s, num_remove=%d", filename, num_remove) continue - response=conn.delete_objects(Bucket=bucket_name, - Delete={"Objects": [{"Key": key} for key in object_keys]}) + delete_objects(conn, bucket_name, object_keys) # must call garbage collection for predictable count result = admin(['gc', 'process', '--include-all']) @@ -2393,17 +2932,17 @@ def test_dedup_inc_with_remove(): dedup_stats.deduped_obj_bytes=0 dedup_stats.skip_src_record=src_record dedup_stats.skip_shared_manifest=shared_manifest - dedup_stats.valid_hash=valid_sha + dedup_stats.valid_hash=valid_hash dedup_stats.invalid_hash=0 dedup_stats.set_hash=0 log.debug("test_dedup_inc_with_remove: incremental dedup:") log.debug("stats_base.size_before_dedup=%d", stats_base.size_before_dedup) - log.debug("dedup_stats.size_before_dedup=%d", dedup_stats.size_before_dedup) dry_run=False exec_dedup(dedup_stats, dry_run) 
expected_results=calc_expected_results(files_sub, config) - verify_objects(bucket_name, files_sub, conn, expected_results, config) + expected_results += split_heads_count + verify_objects(bucket_name, files_sub, conn, expected_results, config, True) finally: # cleanup must be executed even after a failure cleanup(bucket_name, conn) @@ -2462,7 +3001,6 @@ def test_dedup_multipart(): simple_dedup(conn, files, bucket_name, True, default_config, False) - #------------------------------------------------------------------------------- @pytest.mark.basic_test def test_dedup_basic_with_tenants(): @@ -2497,10 +3035,12 @@ def test_dedup_basic(): num_files=5 base_size = MULTIPART_SIZE log.debug("generate files: base size=%d MiB, max_size=%d MiB", - base_size/MB, (pow(2, num_files) * base_size)/MB) + base_size/MB, (pow(2, num_files) * base_size)/MB) gen_files(files, base_size, num_files) log.debug("call simple_dedup()") - simple_dedup(conn, files, bucket_name, True, default_config, False) + run_cleanup_after=True + dry_run=False + simple_dedup(conn, files, bucket_name, run_cleanup_after, default_config, dry_run) #------------------------------------------------------------------------------- @@ -2552,7 +3092,7 @@ def test_dedup_small_multipart(): #------------------------------------------------------------------------------- @pytest.mark.basic_test def test_dedup_large_scale_with_tenants(): - return + #return if full_dedup_is_disabled(): return @@ -2572,7 +3112,7 @@ def test_dedup_large_scale_with_tenants(): #------------------------------------------------------------------------------- @pytest.mark.basic_test def test_dedup_large_scale(): - return + #return if full_dedup_is_disabled(): return @@ -2592,7 +3132,7 @@ def test_dedup_large_scale(): #------------------------------------------------------------------------------- @pytest.mark.basic_test def test_empty_bucket(): - return + #return if full_dedup_is_disabled(): return @@ -2632,7 +3172,7 @@ def 
inc_step_with_tenants(stats_base, files, conns, bucket_names, config): # add new files num_files_new = 11 - gen_files_in_range(files_combined, num_files_new, 2*MB, 32*MB) + gen_files_in_range(files_combined, num_files_new, 1*MB, 32*MB) pad_count = len(files_combined) - len(files) for i in range(0, pad_count): indices.append(0) @@ -2646,7 +3186,8 @@ def inc_step_with_tenants(stats_base, files, conns, bucket_names, config): for f in files_combined: obj_size=f[1] num_copies=f[2] - if num_copies > 1 and obj_size > RADOS_OBJ_SIZE: + split_head = calc_split_objs_count(obj_size, num_copies, config) + if num_copies > 1 and (obj_size > RADOS_OBJ_SIZE or split_head): src_record += 1 stats_combined.skip_shared_manifest = stats_base.deduped_obj @@ -2655,15 +3196,15 @@ def inc_step_with_tenants(stats_base, files, conns, bucket_names, config): stats_combined.deduped_obj -= stats_base.deduped_obj stats_combined.deduped_obj_bytes -= stats_base.deduped_obj_bytes - stats_combined.valid_hash = stats_base.set_hash + stats_combined.valid_hash = stats_base.unique_obj stats_combined.invalid_hash -= stats_base.set_hash - stats_combined.set_hash -= stats_base.set_hash - + stats_combined.set_hash = stats_combined.set_shared_manifest_src + stats_combined.deduped_obj log.debug("test_dedup_inc_2_with_tenants: incremental dedup:") # run dedup again dry_run=False exec_dedup(stats_combined, dry_run) - verify_objects_multi(files_combined, conns, bucket_names, expected_results, config) + verify_objects_multi(files_combined, conns, bucket_names, expected_results, + config, False) return (files_combined, stats_combined) @@ -2689,7 +3230,7 @@ def test_dedup_inc_loop_with_tenants(): files=[] num_files = 13 # gen_files_in_range creates 2-3 copies - gen_files_in_range(files, num_files, 1*MB, 64*MB) + gen_files_in_range(files, num_files, 256*KB, 64*MB) # upload objects, dedup, verify, but don't cleanup ret=simple_dedup_with_tenants(files, conns, bucket_names, config) stats_base=ret[1] @@ -2699,9 +3240,10 
@@ def test_dedup_inc_loop_with_tenants(): files=ret[0] stats_last=ret[1] stats_base.set_shared_manifest_src += stats_last.set_shared_manifest_src - stats_base.deduped_obj += stats_last.deduped_obj - stats_base.deduped_obj_bytes += stats_last.deduped_obj_bytes - stats_base.set_hash += stats_last.set_hash + stats_base.unique_obj += stats_last.set_shared_manifest_src + stats_base.deduped_obj += stats_last.deduped_obj + stats_base.deduped_obj_bytes += stats_last.deduped_obj_bytes + stats_base.set_hash += stats_last.set_hash finally: # cleanup must be executed even after a failure cleanup_all_buckets(bucket_names, conns) @@ -2718,8 +3260,8 @@ def test_dedup_dry_small_with_tenants(): prepare_test() max_copies_count=3 files=[] - num_files=10 # [4KB-4MB] - base_size = 4*KB + num_files=5 # [1KB-32KB] + base_size = 1*KB log.debug("generate files: base size=%d KiB, max_size=%d KiB", base_size/KB, (pow(2, num_files) * base_size)/KB) try: @@ -2762,10 +3304,10 @@ def test_dedup_dry_multipart(): num_files=8 min_size=MULTIPART_SIZE - #gen_files_in_range(files, num_files, min_size, 1024*MB) + # create files in range [MULTIPART_SIZE, 128MB] aligned on RADOS_OBJ_SIZE gen_files_in_range(files, num_files, min_size, 128*MB) - # add files in range [MULTIPART_SIZE, 4*MULTIPART_SIZE] aligned on MULTIPART_SIZE + # add files in range [MULTIPART_SIZE, 8*MULTIPART_SIZE] aligned on MULTIPART_SIZE gen_files_in_range(files, num_files, min_size, min_size*8, MULTIPART_SIZE) # add file with excatly MULTIPART_SIZE @@ -2862,7 +3404,8 @@ def test_dedup_dry_small_large_mix(): conns[i].create_bucket(Bucket=bucket_names[i]) indices = [0] * len(files) - ret=procs_upload_objects(files, conns, bucket_names, indices, default_config) + check_obj_count=True + ret=procs_upload_objects(files, conns, bucket_names, indices, default_config, check_obj_count) upload_time_sec = (time.time_ns() - start) / (1000*1000*1000) expected_results = ret[0] dedup_stats = ret[1] @@ -2870,8 +3413,6 @@ def 
test_dedup_dry_small_large_mix(): log.debug("obj_count=%d, upload_time=%d(sec)", s3_objects_total, upload_time_sec) exec_dedup(dedup_stats, dry_run) - if dry_run == False: - verify_objects(bucket_name, files, conn, expected_results, default_config) finally: # cleanup must be executed even after a failure cleanup_all_buckets(bucket_names, conns) @@ -2946,9 +3487,19 @@ def test_dedup_dry_large_scale_with_tenants(): size=1*KB files=[] config=TransferConfig(multipart_threshold=size, multipart_chunksize=1*MB) - log.debug("test_dedup_dry_large_scale_with_tenants: connect to AWS ...") + log.info("test_dedup_dry_large_scale: connect to AWS ...") gen_files_fixed_size(files, num_files, size, max_copies_count) - threads_dedup_basic_with_tenants_common(files, num_threads, config, True) + conns=get_connections(num_threads) + bucket_names=get_buckets(num_threads) + for i in range(num_threads): + conns[i].create_bucket(Bucket=bucket_names[i]) + try: + threads_simple_dedup_with_tenants(files, conns, bucket_names, config, True) + except Exception: + log.warning("test_dedup_dry_large_scale: failed!!") + finally: + # cleanup must be executed even after a failure + cleanup_all_buckets(bucket_names, conns) #------------------------------------------------------------------------------- @@ -2957,25 +3508,26 @@ def test_dedup_dry_large_scale(): #return prepare_test() - max_copies_count=3 - num_threads=64 - num_files=32*1024 + bucket_name = gen_bucket_name() + max_copies_count=2 + num_files=2*1024 size=1*KB files=[] config=TransferConfig(multipart_threshold=size, multipart_chunksize=1*MB) - log.debug("test_dedup_dry_large_scale_new: connect to AWS ...") - gen_files_fixed_size(files, num_files, size, max_copies_count) - conns=get_connections(num_threads) - bucket_names=get_buckets(num_threads) - for i in range(num_threads): - conns[i].create_bucket(Bucket=bucket_names[i]) + log.info("test_dedup_dry_large_scale: connect to AWS ...") try: - threads_simple_dedup_with_tenants(files, conns, 
bucket_names, config, True) - except: - log.warning("test_dedup_dry_large_scale: failed!!") + gen_files_fixed_size(files, num_files, size, max_copies_count) + indices=[0] * len(files) + conn=get_single_connection() + conn.create_bucket(Bucket=bucket_name) + check_obj_count=True + ret=upload_objects(bucket_name, files, indices, conn, config, check_obj_count) + dedup_stats = ret[1] + dry_run=True + exec_dedup(dedup_stats, dry_run, True) finally: # cleanup must be executed even after a failure - cleanup_all_buckets(bucket_names, conns) + cleanup(bucket_name, conn) #-------------------------------------------------------------------------------