From 627cee9a28ff3c9d4284168f4569bf49b39c8651 Mon Sep 17 00:00:00 2001 From: benhanokh Date: Mon, 23 Feb 2026 11:26:17 +0200 Subject: [PATCH] rgw/dedup split-head Simplified check for shared-tail-objects. Added test for copy after dedup Use tail-ioctx when removing newly created tail-head Signed-off-by: benhanokh --- src/rgw/driver/rados/rgw_dedup.cc | 99 +++++++++++++++++++------------ src/rgw/driver/rados/rgw_dedup.h | 14 +++-- src/test/rgw/dedup/test_dedup.py | 56 +++++++++++++++-- 3 files changed, 120 insertions(+), 49 deletions(-) diff --git a/src/rgw/driver/rados/rgw_dedup.cc b/src/rgw/driver/rados/rgw_dedup.cc index 19792969b8b..330fb7e5015 100644 --- a/src/rgw/driver/rados/rgw_dedup.cc +++ b/src/rgw/driver/rados/rgw_dedup.cc @@ -605,13 +605,56 @@ namespace rgw::dedup { } //--------------------------------------------------------------------------- - static void remove_created_tail_object(const DoutPrefixProvider *dpp, - librados::IoCtx &ioctx, - const std::string &tail_oid, - md5_stats_t *p_stats) + static inline bool invalid_tail_placement(const rgw_bucket_placement& tail_placement) + { + return (tail_placement.bucket.name.empty() || tail_placement.placement_rule.name.empty()); + } + + //--------------------------------------------------------------------------- + int Background::get_tail_ioctx(const disk_record_t *p_rec, + const RGWObjManifest &manifest, + md5_stats_t *p_stats /*IN-OUT*/, + librados::IoCtx *p_ioctx /*OUT*/, + std::string *p_tail_name /*OUT*/, + std::string *p_oid /*OUT*/) + { + const rgw_bucket_placement &tail_placement = manifest.get_tail_placement(); + // Tail placement_rule was fixed before committed to SLAB, if looks bad -> abort + if (unlikely(invalid_tail_placement(tail_placement))) { + p_stats->split_head_no_tail_placement++; + ldpp_dout(dpp, 1) << __func__ << "::invalid_tail_placement -> abort" << dendl; + return -EINVAL; + } + + const rgw_bucket& bucket = tail_placement.bucket; + *p_tail_name = generate_split_head_tail_name(manifest); + // tail objects might be on another storage_class/pool, need another ioctx + int ret = get_ioctx_internal(dpp, driver, store, *p_tail_name, p_rec->instance, + bucket, p_ioctx, p_oid); + if (unlikely(ret != 0)) { + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed get_ioctx_internal()" << dendl; + return ret; + } + + return 0; + } + + //--------------------------------------------------------------------------- + void Background::remove_created_tail_object(const disk_record_t *p_rec, + const RGWObjManifest &manifest, + md5_stats_t *p_stats /*IN-OUT*/) { + librados::IoCtx tail_ioctx; + std::string tail_oid; + std::string tail_name; + int ret = get_tail_ioctx(p_rec, manifest, p_stats, &tail_ioctx, &tail_name, + &tail_oid); + if (unlikely(ret != 0)) { + return; + } + p_stats->rollback_tail_obj++; - int ret = ioctx.remove(tail_oid); + ret = tail_ioctx.remove(tail_oid); if (ret == 0) { ldpp_dout(dpp, 20) << __func__ << "::" << tail_oid << " was successfully removed" << dendl; @@ -923,8 +966,7 @@ namespace rgw::dedup { const RGWObjManifest &src_manifest, const RGWObjManifest &tgt_manifest, md5_stats_t *p_stats, - const dedup_table_t::value_t *p_src_val, - const std::string &tail_oid) + const dedup_table_t::value_t *p_src_val) { const uint64_t src_head_size = src_manifest.get_head_size(); const uint64_t tgt_head_size = tgt_manifest.get_head_size(); @@ -948,7 +990,7 @@ namespace rgw::dedup { if (unlikely(ret != 0)) { ldpp_dout(dpp, 1) << __func__ << "::ERR: failed TGT get_ioctx()" << dendl; if (p_src_rec->s.flags.is_split_head()) { - remove_created_tail_object(dpp, src_ioctx, tail_oid, p_stats); + remove_created_tail_object(p_src_rec, src_manifest, p_stats); } return ret; } @@ -959,7 +1001,7 @@ namespace rgw::dedup { ldpp_dout(dpp, 5) << __func__ << "::abort! src_head_size=" << src_head_size << "::tgt_head_size=" << tgt_head_size << dendl; if (p_src_rec->s.flags.is_split_head()) { - remove_created_tail_object(dpp, src_ioctx, tail_oid, p_stats); + remove_created_tail_object(p_src_rec, src_manifest, p_stats); } // TBD: can we create a test case (requires control over head-object-size)?? return -ECANCELED; @@ -971,7 +1013,7 @@ namespace rgw::dedup { ret = inc_ref_count_by_manifest(ref_tag, src_oid, src_manifest); if (unlikely(ret != 0)) { if (p_src_rec->s.flags.is_split_head()) { - remove_created_tail_object(dpp, src_ioctx, tail_oid, p_stats); + remove_created_tail_object(p_src_rec, src_manifest, p_stats); } return ret; } @@ -1011,7 +1053,7 @@ namespace rgw::dedup { << src_oid << "), err is " << cpp_strerror(-ret)<s.flags.is_split_head()) { - remove_created_tail_object(dpp, src_ioctx, tail_oid, p_stats); + remove_created_tail_object(p_src_rec, src_manifest, p_stats); } return ret; } @@ -1126,12 +1168,6 @@ namespace rgw::dedup { << p_tgt_rec->s.md5_low << std::dec << dendl; } - //--------------------------------------------------------------------------- - static inline bool invalid_tail_placement(const rgw_bucket_placement& tail_placement) - { - return (tail_placement.bucket.name.empty() || tail_placement.placement_rule.name.empty()); - } - //--------------------------------------------------------------------------- static void set_explicit_tail_placement(const DoutPrefixProvider* dpp, RGWObjManifest *p_manifest,// IN-OUT PARAM @@ -1636,7 +1672,6 @@ namespace rgw::dedup { int Background::split_head_object(disk_record_t *p_src_rec, // IN-OUT PARAM RGWObjManifest &src_manifest, // IN/OUT PARAM const disk_record_t *p_tgt_rec, - std::string &tail_oid, // OUT PARAM md5_stats_t *p_stats) { ldpp_dout(dpp, 20) << __func__ << "::" << p_src_rec->obj_name << "::" @@ -1683,22 +1718,11 @@ namespace rgw::dedup { } } - std::string tail_name = generate_split_head_tail_name(src_manifest); - const rgw_bucket_placement &tail_placement = src_manifest.get_tail_placement(); - // Tail placement_rule was fixed before committed to SLAB, if looks bad -> abort - if (unlikely(invalid_tail_placement(tail_placement))) { - p_stats->split_head_no_tail_placement++; - ldpp_dout(dpp, 1) << __func__ << "::invalid_tail_placement -> abort" << dendl; - return -EINVAL; - } - - const rgw_bucket *p_bucket = &tail_placement.bucket; - // tail objects might be on another storage_class/pool, need another ioctx librados::IoCtx tail_ioctx; - ret = get_ioctx_internal(dpp, driver, store, tail_name, p_src_rec->instance, - *p_bucket, &tail_ioctx, &tail_oid); + std::string tail_oid, tail_name; + ret = get_tail_ioctx(p_src_rec, src_manifest, p_stats, &tail_ioctx, &tail_name, + &tail_oid); if (unlikely(ret != 0)) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: failed get_ioctx_internal()" << dendl; return ret; } @@ -1736,7 +1760,7 @@ namespace rgw::dedup { ldpp_dout(dpp, 20) << __func__ << "::wrote tail obj:" << tail_oid << "::ret=" << ret << dendl; } - + const rgw_bucket *p_bucket = &(src_manifest.get_tail_placement().bucket); build_and_set_explicit_manifest(dpp, p_bucket, tail_name, &src_manifest); bufferlist manifest_bl; @@ -1753,7 +1777,6 @@ namespace rgw::dedup { RGWObjManifest &src_manifest, const RGWObjManifest &tgt_manifest, const dedup_table_t::value_t *p_src_val, - std::string &tail_oid, // OUT PARAM md5_stats_t *p_stats) { int ret = 0; @@ -1798,7 +1821,7 @@ namespace rgw::dedup { // in the obj-attributes uint64_t head_size = src_manifest.get_head_size(); if (should_split_head(head_size, src_manifest.get_obj_size())) { - ret = split_head_object(p_src_rec, src_manifest, p_tgt_rec, tail_oid, p_stats); + ret = split_head_object(p_src_rec, src_manifest, p_tgt_rec, p_stats); // compare_strong_hash() is called internally by split_head_object() return (ret == 0); } @@ -2027,10 +2050,8 @@ namespace rgw::dedup { return 0; } - - std::string tail_oid; bool success = check_and_set_strong_hash(p_src_rec, p_tgt_rec, src_manifest, - tgt_manifest, &src_val, tail_oid, p_stats); + tgt_manifest, &src_val, p_stats); if (unlikely(!success)) { if (p_src_rec->s.flags.hash_calculated() && !src_val.has_valid_hash()) { // set hash attributes on head objects to save calc next time @@ -2049,7 +2070,7 @@ namespace rgw::dedup { } ret = dedup_object(p_src_rec, p_tgt_rec, src_manifest, tgt_manifest, p_stats, - &src_val, tail_oid); + &src_val); if (ret == 0) { ldpp_dout(dpp, 20) << __func__ << "::dedup success " << p_src_rec->obj_name << dendl; p_stats->deduped_objects++; diff --git a/src/rgw/driver/rados/rgw_dedup.h b/src/rgw/driver/rados/rgw_dedup.h index adca55efebc..99af451cd43 100644 --- a/src/rgw/driver/rados/rgw_dedup.h +++ b/src/rgw/driver/rados/rgw_dedup.h @@ -99,6 +99,15 @@ namespace rgw::dedup { inline uint64_t __calc_deduped_bytes(uint16_t num_parts, uint64_t size_bytes); inline bool should_split_head(uint64_t head_size, uint64_t obj_size); + int get_tail_ioctx(const disk_record_t *p_rec, + const RGWObjManifest &manifest, + md5_stats_t *p_stats /*IN-OUT*/, + librados::IoCtx *p_ioctx /*OUT*/, + std::string *p_tail_name /*OUT*/, + std::string *p_oid /*OUT*/); + void remove_created_tail_object(const disk_record_t *p_rec, + const RGWObjManifest &manifest, + md5_stats_t *p_stats /*IN-OUT*/); void run(); int setup(struct dedup_epoch_t*); void work_shards_barrier(work_shard_t num_work_shards); @@ -191,7 +200,6 @@ namespace rgw::dedup { int split_head_object(disk_record_t *p_src_rec, // IN/OUT PARAM RGWObjManifest &src_manifest, // IN/OUT PARAM const disk_record_t *p_tgt_rec, - std::string &tail_oid, // OUT PARAM md5_stats_t *p_stats); int add_obj_attrs_to_record(disk_record_t *p_rec, @@ -211,7 +219,6 @@ namespace rgw::dedup { RGWObjManifest &src_manifest, const RGWObjManifest &tgt_manifest, const dedup_table_t::value_t *p_src_val, - std::string &tail_oid, // OUT PARAM md5_stats_t *p_stats); int try_deduping_record(dedup_table_t *p_table, disk_record_t *p_rec, @@ -234,8 +241,7 @@ namespace rgw::dedup { const RGWObjManifest &src_manifest, const RGWObjManifest &tgt_manifest, md5_stats_t *p_stats, - const dedup_table_t::value_t *p_src_val, - const std::string &tail_oid); + const dedup_table_t::value_t *p_src_val); #endif int remove_slabs(unsigned worker_id, unsigned md5_shard, uint32_t slab_count); int init_rados_access_handles(bool init_pool); diff --git a/src/test/rgw/dedup/test_dedup.py b/src/test/rgw/dedup/test_dedup.py index 15b746a537a..6541726b0f2 100644 --- a/src/test/rgw/dedup/test_dedup.py +++ b/src/test/rgw/dedup/test_dedup.py @@ -1137,6 +1137,28 @@ def verify_objects(bucket_name, files, conn, expected_results, config, delete_du log.debug("verify_objects::completed successfully!!") +#------------------------------------------------------------------------------- +def verify_objects_copy(bucket_name, files, conn, expected_results, config): + if expected_results: + assert expected_results == count_object_parts_in_all_buckets(True) + + tmpfile = OUT_DIR + "temp" + for f in files: + filename=f[0] + obj_size=f[1] + num_copies=f[2] + log.debug("comparing copy =%s, size=%d, copies=%d", filename, obj_size, num_copies) + + for i in range(0, num_copies): + filecmp.clear_cache() + key = gen_object_name(filename, i) + "_cp" + conn.download_file(bucket_name, key, tmpfile, Config=config) + equal = filecmp.cmp(tmpfile, OUT_DIR + filename, shallow=False) + assert equal ,"Files %s and %s differ!!" % (key, tmpfile) + os.remove(tmpfile) + + log.debug("verify_objects_copy::completed successfully!!") + #------------------------------------------------------------------------------- def verify_objects_multi(files, conns, bucket_names, expected_results, config, delete_dups): if expected_results: @@ -2288,16 +2310,19 @@ def test_copy_after_dedup(): prepare_test() log.debug("test_copy_after_dedup: connect to AWS ...") + config=default_config max_copies_count=3 num_files=8 files=[] min_size=8*MB + # create files in range [1MB, 4MB] to force split-head + # This will verify server-side-copy with split-head generated tails + gen_files_in_range(files, num_files, 1*MB, 4*MB) + # create files in range [8MB, 32MB] aligned on RADOS_OBJ_SIZE - gen_files_in_range(files, num_files, min_size, min_size*4) + gen_files_in_range(files, num_files, 8*MB, 32*MB) - # add file with excatly MULTIPART_SIZE - write_random(files, MULTIPART_SIZE, 2, 2) bucket_cp= gen_bucket_name() bucket_names=[] try: @@ -2305,11 +2330,14 @@ def test_copy_after_dedup(): conn.create_bucket(Bucket=bucket_cp) bucket_names=create_buckets(conn, max_copies_count) conns=[conn] * max_copies_count - dry_run=False - ret = simple_dedup_with_tenants(files, conns, bucket_names, default_config, - dry_run) + indices=[0] * len(files) + ret=upload_objects_multi(files, conns, bucket_names, indices, config) expected_results = ret[0] dedup_stats = ret[1] + dry_run=False + exec_dedup(dedup_stats, dry_run) + verify_objects_multi(files, conns, bucket_names, expected_results, config, + False) cp_head_count=0 for f in files: @@ -2325,7 +2353,23 @@ def test_copy_after_dedup(): conn.copy_object(CopySource=base_obj, Bucket=bucket_cp, Key=key_cp) cp_head_count += 1 + # Make sure that server-side-copy behaved as expected copying only the head + # object and linking to the existing tail-objects assert (expected_results + cp_head_count) == count_object_parts_in_all_buckets(False, 0) + # delete the original objects and verify server-side-copy objects are valid + for (bucket_name, conn) in zip(bucket_names, conns): + delete_bucket_with_all_objects(bucket_name, conn) + + result = admin(['gc', 'process', '--include-all']) + assert result[1] == 0 + bucket_names.clear() + conns.clear() + + # At this point the original obejcts are all removed + # Objects created by server-side-copy should keep the tail in place + # because of teh refcount + verify_objects_copy(bucket_cp, files, conn, expected_results, config) + finally: # cleanup must be executed even after a failure delete_bucket_with_all_objects(bucket_cp, conn) -- 2.47.3