From: benhanokh Date: Mon, 30 Mar 2026 08:22:51 +0000 (+0300) Subject: rgw/dedup: This PR extends the RGW dedup split-head feature to support objects that... X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=7b72f06c992beefe5cf5994a87250c317cc90155;p=ceph.git rgw/dedup: This PR extends the RGW dedup split-head feature to support objects that already have tail RADOS objects (i.e. objects larger than the head chunk size). Previously, split-head was restricted to objects whose entire data fit in the head (≤4 MiB). It also migrates the split-head manifest representation from the legacy explicit-objs format to the prefix+index rules-based format. Refactored should_split_head(): Now performs a larger set of eligibility checks: * d_split_head flag is set * single-part object only * non-empty head * not a legacy manifest * not an Alibaba Cloud OSS AppendObject Explicit skips for unsupported manifest types: — old-style explicit-objs manifests — OSS AppendObject manifests (detected via non-empty override_prefix) New config option: rgw_dedup_split_obj_head: Default is true (split-head enabled). Setting to false disables split-head entirely. Tail object lookup via manifest iterator: Replaces the old get_tail_ioctx() which manually constructed the tail OID via generate_split_head_tail_name(). The new function simply calls manifest.obj_begin() and resolves the first tail object location through the standard manifest iterator. Stats cleanup: Removed the "Potential Dedup" stats section (small_objs_stat, dup_head_bytes, dup_head_bytes_estimate, ingress_skip_too_small_64KB*) which tracked 64KB–4MB objects as potential-but-skipped candidates. Since split-head now covers all sizes, this distinction is no longer meaningful. calc_deduped_bytes() is simplified accordingly. 
Signed-off-by: benhanokh --- diff --git a/doc/radosgw/s3_objects_dedup.rst b/doc/radosgw/s3_objects_dedup.rst index fe83124d1549..7de91ed00071 100644 --- a/doc/radosgw/s3_objects_dedup.rst +++ b/doc/radosgw/s3_objects_dedup.rst @@ -108,19 +108,21 @@ matches. If they are, we proceed with the deduplication: - Copy the manifest from the source to the target. - Remove all tail objects on the target. - Split Head Mode =============== -Dedup code can split the head object into 2 objects +The dedup code can split a head object into 2 objects: -- one with attributes and no data and +- one with attributes and no data, and - a new tail object with only data. -The new tail object will be deduped, unlike the head objects, which cannot +The new tail object will be deduped, unlike head objects, which cannot be deduplicated. -This feature is only enabled for RGW objects without existing tail objects -(in other words, objects sized 4 MB or less). + +:confval:`rgw_dedup_split_obj_head` (default: true). Setting +this option to ``false`` disables split-head entirely. + +.. confval:: rgw_dedup_split_obj_head Memory Usage diff --git a/src/common/options/rgw.yaml.in b/src/common/options/rgw.yaml.in index 6a85b076ca5f..57000c5154b5 100644 --- a/src/common/options/rgw.yaml.in +++ b/src/common/options/rgw.yaml.in @@ -81,6 +81,18 @@ options: default: false services: - rgw +- name: rgw_dedup_split_obj_head + type: bool + level: advanced + desc: Enables the split-head functionality + long_desc: Dedup code can split head object into two objects - + one with attributes and no data and a new tail-object with only data. 
+ The new-tail object will be deduped (unlike the head objects which + can't be deduplicated) + default: true + services: + - rgw + with_legacy: true - name: rgw_dedup_min_obj_size_for_dedup type: size level: advanced diff --git a/src/rgw/driver/rados/rgw_dedup.cc b/src/rgw/driver/rados/rgw_dedup.cc index 2446d43ff92d..67d59370aa9a 100644 --- a/src/rgw/driver/rados/rgw_dedup.cc +++ b/src/rgw/driver/rados/rgw_dedup.cc @@ -413,12 +413,10 @@ namespace rgw::dedup { { d_head_object_size = cct->_conf->rgw_max_chunk_size; d_min_obj_size_for_dedup = cct->_conf->rgw_dedup_min_obj_size_for_dedup; - - // limit split head to objects without tail - d_max_obj_size_for_split = d_head_object_size; + d_split_head = cct->_conf->rgw_dedup_split_obj_head; ldpp_dout(dpp, 10) << "Config Vals::d_head_object_size=" << d_head_object_size << "::d_min_obj_size_for_dedup=" << d_min_obj_size_for_dedup - << "::d_max_obj_size_for_split=" << d_max_obj_size_for_split + << "::d_split_head=" << d_split_head << dendl; int ret = init_rados_access_handles(false); @@ -435,11 +433,8 @@ namespace rgw::dedup { //------------------------------------------------------------------------------ uint64_t Background::__calc_deduped_bytes(uint16_t num_parts, uint64_t size_bytes) { - return calc_deduped_bytes(d_head_object_size, - d_min_obj_size_for_dedup, - d_max_obj_size_for_split, - num_parts, - size_bytes); + return calc_deduped_bytes(d_head_object_size, d_min_obj_size_for_dedup, + d_split_head, num_parts, size_bytes); } //--------------------------------------------------------------------------- @@ -495,8 +490,7 @@ namespace rgw::dedup { << p_rec->s.md5_low << std::dec << dendl; int ret = p_table->add_entry(&key, block_id, rec_id, has_shared_manifest, - &p_stats->small_objs_stat, &p_stats->big_objs_stat, - &p_stats->dup_head_bytes_estimate); + &p_stats->big_objs_stat); if (ret == 0) { p_stats->loaded_objects ++; ldpp_dout(dpp, 20) << __func__ << "::" << p_rec->bucket_name << "/" @@ -544,15 +538,14 @@ 
namespace rgw::dedup { } //--------------------------------------------------------------------------- - static int get_ioctx_internal(const DoutPrefixProvider* const dpp, - rgw::sal::Driver* driver, - rgw::sal::RadosStore* store, - const std::string &obj_name, - const std::string &instance, - const rgw_bucket &rb, - librados::IoCtx *p_ioctx /*OUT*/, - std::string *p_oid /*OUT*/) + static inline int get_ioctx(const DoutPrefixProvider* const dpp, + rgw::sal::Driver* driver, + rgw::sal::RadosStore* store, + const disk_record_t *p_rec, + librados::IoCtx *p_ioctx /*OUT*/, + std::string *p_oid /*OUT*/) { + rgw_bucket rb{p_rec->tenant_name, p_rec->bucket_name, p_rec->bucket_id}; unique_ptr bucket; { int ret = driver->load_bucket(dpp, rb, &bucket, null_yield); @@ -564,44 +557,12 @@ namespace rgw::dedup { } string dummy_locator; - const rgw_obj_index_key key(obj_name, instance); + const rgw_obj_index_key key(p_rec->obj_name, p_rec->instance); rgw_obj obj(bucket->get_key(), key); get_obj_bucket_and_oid_loc(obj, *p_oid, dummy_locator); RGWBucketInfo& bucket_info = bucket->get_info(); return store->get_obj_head_ioctx(dpp, bucket_info, obj, p_ioctx); - } - //--------------------------------------------------------------------------- - static inline int get_ioctx(const DoutPrefixProvider* const dpp, - rgw::sal::Driver* driver, - rgw::sal::RadosStore* store, - const disk_record_t *p_rec, - librados::IoCtx *p_ioctx /*OUT*/, - std::string *p_oid /*OUT*/) - { - rgw_bucket b{p_rec->tenant_name, p_rec->bucket_name, p_rec->bucket_id}; - return get_ioctx_internal(dpp, driver, store, p_rec->obj_name, p_rec->instance, - b, p_ioctx, p_oid); - } - - //--------------------------------------------------------------------------- - static inline std::string generate_split_head_tail_name(const RGWObjManifest &manifest) - { - static constexpr std::string_view shadow_string(RGW_OBJ_NS_SHADOW); - std::string_view suffix = "0"; - const std::string &prefix = manifest.get_prefix(); - - std::string 
tail_name; - tail_name.reserve(shadow_string.size() + prefix.size() + suffix.size() + 1); - // TBD: - // it is unclear when RGW code pads with "_" before the shadow string - // It won't change correctness, but might look weird - //tail_name.append("_"); - tail_name.append(shadow_string); - tail_name.append("_"); - tail_name.append(prefix); - tail_name.append(suffix); - return tail_name; } //--------------------------------------------------------------------------- @@ -611,43 +572,37 @@ namespace rgw::dedup { } //--------------------------------------------------------------------------- - int Background::get_tail_ioctx(const disk_record_t *p_rec, - const RGWObjManifest &manifest, - const std::string &tail_name, - md5_stats_t *p_stats /*IN-OUT*/, - librados::IoCtx *p_ioctx /*OUT*/, - std::string *p_oid /*OUT*/) + static int get_first_tail_obj_params(const DoutPrefixProvider *dpp, + RGWRados *rados, + const RGWObjManifest &manifest, + librados::IoCtx *p_tail_ioctx, /*OUT*/ + std::string *p_tail_oid /*OUT*/) { - const rgw_bucket_placement &tail_placement = manifest.get_tail_placement(); - // Tail placement_rule was fixed before committed to SLAB, if looks bad -> abort - if (unlikely(invalid_tail_placement(tail_placement))) { - p_stats->split_head_no_tail_placement++; - ldpp_dout(dpp, 1) << __func__ << "::invalid_tail_placement -> abort" << dendl; - return -EINVAL; - } - - const rgw_bucket& bucket = tail_placement.bucket; - // tail objects might be on another storage_class/pool, need another ioctx - int ret = get_ioctx_internal(dpp, driver, store, tail_name, p_rec->instance, - bucket, p_ioctx, p_oid); - if (unlikely(ret != 0)) { - ldpp_dout(dpp, 1) << __func__ << "::ERR: failed get_ioctx_internal()" << dendl; + auto p = manifest.obj_begin(dpp); + const rgw_obj_select& os = p.get_location(); + rgw_raw_obj raw_obj = os.get_raw_obj(rados); + rgw_rados_ref obj; + int ret = rgw_get_rados_ref(dpp, rados->get_rados_handle(), raw_obj, &obj); + if (ret < 0) { + ldpp_dout(dpp, 
1) << __func__ << "::failed rgw_get_rados_ref() for oid=" + << raw_obj.oid << ", err is " << cpp_strerror(-ret) << dendl; return ret; } + *p_tail_ioctx = obj.ioctx; + *p_tail_oid = raw_obj.oid; + return 0; } //--------------------------------------------------------------------------- void Background::remove_created_tail_object(const disk_record_t *p_rec, const RGWObjManifest &manifest, - const std::string &tail_name, md5_stats_t *p_stats /*IN-OUT*/) { librados::IoCtx tail_ioctx; std::string tail_oid; - int ret = get_tail_ioctx(p_rec, manifest, tail_name, p_stats, &tail_ioctx, - &tail_oid); + int ret = get_first_tail_obj_params(dpp, rados, manifest, &tail_ioctx, &tail_oid); if (unlikely(ret != 0)) { return; } @@ -665,10 +620,41 @@ namespace rgw::dedup { } //--------------------------------------------------------------------------- - inline bool Background::should_split_head(uint64_t head_size, uint64_t obj_size) + inline bool Background::should_split_head(const RGWObjManifest& manifest) { - // Don't split RGW objects with existing tail-objects - return (head_size > 0 && head_size == obj_size); + // Split-head is only applicable for single-part objects with a non-empty head. + // To avoid issues with manifests created via append (specifically for Alibaba Cloud OSS), + // we should disable split-head whenever the manifest contains an override_prefix in the rules. + // We also reject manifests with multiple rules which are exclusively an + // artifact of the OSS AppendObject operation. + // The head_size should either represent the full object or be equal to the stripe_max_size. 
+ + bool success = false; + uint64_t head_size = manifest.get_head_size(); + uint64_t obj_size = manifest.get_obj_size(); + RGWObjManifestRule rule; + if (manifest.get_rule(0, &rule)) { + success = (d_split_head && // split-head was not disabled from yaml + rule.part_size == 0 && // not a multi-part object + head_size > 0 && // non-empty head + !manifest.has_explicit_objs() && // not an explicit manifest + rule.override_prefix.empty() && // not Alibaba Cloud OSS + manifest.get_rules().size() == 1 && // not Alibaba Cloud OSS + (head_size == rule.stripe_max_size || head_size == obj_size)); + + if (unlikely(!success)) { + ldpp_dout(dpp, 20) << __func__ << "::ERR::d_split_head=" << d_split_head + << "::obj_size=" << obj_size + << "::head_size=" << head_size + << "::rule.part_size=" << rule.part_size + << "::rule.stripe_max_size=" << rule.stripe_max_size + << "::rule.override_prefix=" << rule.override_prefix + << "::rule.override_prefix.empty()=" << rule.override_prefix.empty() + << dendl; + } + } // don't split head if can't get rule + + return success; } //--------------------------------------------------------------------------- @@ -806,14 +792,14 @@ namespace rgw::dedup { cls_refcount_get(op, ref_tag, true); d_ctl.metadata_access_throttle.acquire(); ldpp_dout(dpp, 20) << __func__ << "::inc ref-count on tail object: " - << raw_obj.oid << "::" << obj.obj.oid << dendl; + << obj.obj.oid << "::" << raw_obj.to_str() << dendl; rgw::AioResultList completed = aio->get(obj.obj, rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield), cost, id); ret = rgw::check_for_errors(completed); all_results.splice(all_results.end(), completed); if (ret < 0) { - ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to copy obj=" << obj + ldpp_dout(dpp, 1) << __func__ << "::ERR: failed refcount_get() obj=" << obj << ", ret=" << ret << " err is " << cpp_strerror(-ret) << dendl; break; } @@ -965,7 +951,6 @@ namespace rgw::dedup { const RGWObjManifest &src_manifest, const RGWObjManifest 
&tgt_manifest, md5_stats_t *p_stats, - const std::string &tail_name, const dedup_table_t::value_t *p_src_val) { const uint64_t src_head_size = src_manifest.get_head_size(); @@ -990,7 +975,7 @@ namespace rgw::dedup { if (unlikely(ret != 0)) { ldpp_dout(dpp, 1) << __func__ << "::ERR: failed TGT get_ioctx()" << dendl; if (p_src_rec->s.flags.is_split_head()) { - remove_created_tail_object(p_src_rec, src_manifest, tail_name, p_stats); + remove_created_tail_object(p_src_rec, src_manifest, p_stats); } return ret; } @@ -1001,7 +986,7 @@ namespace rgw::dedup { ldpp_dout(dpp, 5) << __func__ << "::abort! src_head_size=" << src_head_size << "::tgt_head_size=" << tgt_head_size << dendl; if (p_src_rec->s.flags.is_split_head()) { - remove_created_tail_object(p_src_rec, src_manifest, tail_name, p_stats); + remove_created_tail_object(p_src_rec, src_manifest, p_stats); } // TBD: can we create a test case (requires control over head-object-size)?? return -ECANCELED; @@ -1013,7 +998,7 @@ namespace rgw::dedup { ret = inc_ref_count_by_manifest(ref_tag, src_oid, src_manifest); if (unlikely(ret != 0)) { if (p_src_rec->s.flags.is_split_head()) { - remove_created_tail_object(p_src_rec, src_manifest, tail_name, p_stats); + remove_created_tail_object(p_src_rec, src_manifest, p_stats); } return ret; } @@ -1053,7 +1038,7 @@ namespace rgw::dedup { << src_oid << "), err is " << cpp_strerror(-ret)<s.flags.is_split_head()) { - remove_created_tail_object(p_src_rec, src_manifest, tail_name, p_stats); + remove_created_tail_object(p_src_rec, src_manifest, p_stats); } return ret; } @@ -1235,8 +1220,29 @@ namespace rgw::dedup { << "::ERROR: unable to decode manifest" << dendl; return -EINVAL; } - need_to_split_head = should_split_head(manifest.get_head_size(), - p_rec->s.obj_bytes_size); + + if (unlikely(manifest.has_explicit_objs())) { + // we don't support dedup of explicit_objs manifest + p_stats->ingress_skip_explicit_objs++; + ldpp_dout(dpp, 20) << __func__ << "::explicit_objs can't be dedup" << 
dendl; + return -ENOTSUP; + } + + RGWObjManifestRule rule; + if (!manifest.get_rule(0, &rule) || + // if not a multi-part must have exactly 1 rule + (rule.part_size == 0 && manifest.get_rules().size() != 1) || + !rule.override_prefix.empty()) { + // we don't support dedup of Alibaba Cloud OSS using AppendObject API + p_stats->ingress_skip_alibaba++; + ldpp_dout(dpp, 10) << __func__ << "::Alibaba Cloud OSS can't be dedup" + << "::rules.size()=" << manifest.get_rules().size() + << "::get_rule ret=" << manifest.get_rule(0, &rule) + << "::override_prefix=" << rule.override_prefix << dendl; + return -ENOTSUP; + } + + need_to_split_head = should_split_head(manifest); // force explicit tail_placement as the dedup could be on another bucket const rgw_bucket_placement& tail_placement = manifest.get_tail_placement(); @@ -1501,6 +1507,11 @@ namespace rgw::dedup { p_rec->s.flags.clear(); ret = add_obj_attrs_to_record(p_rec, attrs, p_stats); if (unlikely(ret != 0)) { + // don't trace errors for unsupported manifest + if (ret == -ENOTSUP) { + return 0; + } + ldpp_dout(dpp, 5) << __func__ << "::ERR: failed add_obj_attrs_to_record() ret=" << ret << "::" << cpp_strerror(-ret) << dendl; return ret; @@ -1640,39 +1651,36 @@ namespace rgw::dedup { } //--------------------------------------------------------------------------- - static void build_and_set_explicit_manifest(const DoutPrefixProvider *dpp, - const rgw_bucket *p_bucket, - const std::string &tail_name, - RGWObjManifest *p_manifest) + static int set_manifest_for_split_head(const DoutPrefixProvider *const dpp, + RGWObjManifest *p_manifest /*IN-OUT*/) { - uint64_t obj_size = p_manifest->get_obj_size(); - ceph_assert(obj_size == p_manifest->get_head_size()); - - const rgw_obj &head_obj = p_manifest->get_obj(); - const rgw_obj_key &head_key = head_obj.key; - rgw_obj_key tail_key(tail_name, head_key.instance, head_key.ns); - rgw_obj tail_obj(*p_bucket, tail_key); - - RGWObjManifestPart tail_part; - tail_part.loc = tail_obj; - 
tail_part.loc_ofs = 0; - tail_part.size = obj_size; - - std::map objs_map; - objs_map[0] = tail_part; - + // Set head-size to zero in the manifest with start_ofs set to zero + // This means no data is stored in the head-object and the first tail-object + // holds the first data byte + uint64_t tail_ofs = 0; p_manifest->set_head_size(0); p_manifest->set_max_head_size(0); - p_manifest->set_prefix(""); - p_manifest->clear_rules(); - p_manifest->set_explicit(obj_size, objs_map); + RGWObjManifestRule rule; + if (p_manifest->get_rule(0, &rule)) { + ldpp_dout(dpp, 20) << "OLD Rule::start_part_num="<< rule.start_part_num + << "::start_ofs=" << rule.start_ofs + << "::part_size=" << rule.part_size + << "::stripe_max_size=" << rule.stripe_max_size + << "::override_prefix=" << rule.override_prefix << dendl; + p_manifest->clear_rules(); + p_manifest->set_trivial_rule(tail_ofs, rule.stripe_max_size); + return 0; + } + else { + // No Rules, probably explicit_objs - should never happen (was checked before) + return -ENOENT; + } } //--------------------------------------------------------------------------- int Background::split_head_object(disk_record_t *p_src_rec, // IN-OUT PARAM RGWObjManifest &src_manifest, // IN/OUT PARAM const disk_record_t *p_tgt_rec, - std::string *p_tail_name /*OUT*/, md5_stats_t *p_stats /* IN-OUT */) { ldpp_dout(dpp, 20) << __func__ << "::" << p_src_rec->obj_name << "::" @@ -1719,11 +1727,14 @@ namespace rgw::dedup { } } - *p_tail_name = generate_split_head_tail_name(src_manifest); + ret = set_manifest_for_split_head(dpp, &src_manifest); + if (unlikely(ret != 0)) { + return ret; + } + librados::IoCtx tail_ioctx; std::string tail_oid; - ret = get_tail_ioctx(p_src_rec, src_manifest, *p_tail_name, p_stats, - &tail_ioctx, &tail_oid); + ret = get_first_tail_obj_params(dpp, rados, src_manifest, &tail_ioctx, &tail_oid); if (unlikely(ret != 0)) { return ret; } @@ -1762,8 +1773,6 @@ namespace rgw::dedup { ldpp_dout(dpp, 20) << __func__ << "::wrote tail obj:" << 
tail_oid << "::ret=" << ret << dendl; } - const rgw_bucket *p_bucket = &(src_manifest.get_tail_placement().bucket); - build_and_set_explicit_manifest(dpp, p_bucket, *p_tail_name, &src_manifest); bufferlist manifest_bl; encode(src_manifest, manifest_bl); @@ -1779,7 +1788,6 @@ namespace rgw::dedup { RGWObjManifest &src_manifest, const RGWObjManifest &tgt_manifest, const dedup_table_t::value_t *p_src_val, - std::string *p_tail_name /*OUT*/, md5_stats_t *p_stats) { int ret = 0; @@ -1822,9 +1830,8 @@ namespace rgw::dedup { // we might still need to split-head here when hash is valid // can happen if we failed compare before (md5-collison) and stored the src hash // in the obj-attributes - uint64_t head_size = src_manifest.get_head_size(); - if (should_split_head(head_size, src_manifest.get_obj_size())) { - ret = split_head_object(p_src_rec, src_manifest, p_tgt_rec, p_tail_name, p_stats); + if (should_split_head(src_manifest)) { + ret = split_head_object(p_src_rec, src_manifest, p_tgt_rec, p_stats); // compare_strong_hash() is called internally by split_head_object() return (ret == 0); } @@ -2052,9 +2059,9 @@ namespace rgw::dedup { if (unlikely(has_shared_tail_objects(dpp, rados, p_src_rec, p_tgt_rec, src_manifest, tgt_manifest, p_stats))) { return 0; } - std::string tail_name; + bool success = check_and_set_strong_hash(p_src_rec, p_tgt_rec, src_manifest, - tgt_manifest, &src_val, &tail_name, p_stats); + tgt_manifest, &src_val, p_stats); if (unlikely(!success)) { if (p_src_rec->s.flags.hash_calculated() && !src_val.has_valid_hash()) { // set hash attributes on head objects to save calc next time @@ -2073,7 +2080,7 @@ namespace rgw::dedup { } ret = dedup_object(p_src_rec, p_tgt_rec, src_manifest, tgt_manifest, p_stats, - tail_name, &src_val); + &src_val); if (ret == 0) { ldpp_dout(dpp, 20) << __func__ << "::dedup success " << p_src_rec->obj_name << dendl; p_stats->deduped_objects++; @@ -2083,12 +2090,6 @@ namespace rgw::dedup { << ondisk_byte_size << dendl; 
p_stats->split_head_dedup_bytes += ondisk_byte_size; } - else if (p_tgt_rec->s.num_parts == 0 && - // if we don't split head it will be duplicated - p_tgt_rec->s.obj_bytes_size > d_head_object_size) { - // single part objects duplicate the head object when dedup is used - p_stats->dup_head_bytes += d_head_object_size; - } // mark the SRC object as a providor of a shared manifest if (!src_val.has_shared_manifest()) { @@ -2287,7 +2288,6 @@ namespace rgw::dedup { // ceph store full blocks so need to round up and multiply by block_size uint64_t ondisk_byte_size = calc_on_disk_byte_size(entry.meta.size); - // count all objects including too small and non default storage_class objs p_worker_stats->ingress_obj++; p_worker_stats->ingress_obj_bytes += ondisk_byte_size; @@ -2309,17 +2309,11 @@ namespace rgw::dedup { if (ondisk_byte_size < d_min_obj_size_for_dedup) { if (parsed_etag.num_parts == 0) { - // dedup only useful for objects bigger than 4MB + // dedup is only applied to objects larger than the configured minimum size + // `rgw_dedup_min_obj_size_for_dedup` p_worker_stats->ingress_skip_too_small++; p_worker_stats->ingress_skip_too_small_bytes += ondisk_byte_size; - - if (ondisk_byte_size >= 64*1024) { - p_worker_stats->ingress_skip_too_small_64KB++; - p_worker_stats->ingress_skip_too_small_64KB_bytes += ondisk_byte_size; - } - else { - return 0; - } + return 0; } else { // multipart objects are always good candidates for dedup @@ -2531,8 +2525,6 @@ namespace rgw::dedup { << "::total_count=" << obj_count_in_shard << "::loaded_objects=" << p_stats->loaded_objects << p_stats->big_objs_stat << dendl; - ldpp_dout(dpp, 10) << __func__ << "::small objs::" - << p_stats->small_objs_stat << dendl; } //--------------------------------------------------------------------------- @@ -2557,7 +2549,7 @@ namespace rgw::dedup { return -ECANCELED; } } - p_table->count_duplicates(&p_stats->small_objs_stat, &p_stats->big_objs_stat); + p_table->count_duplicates(&p_stats->big_objs_stat); 
display_table_stat_counters(dpp, p_stats); ldpp_dout(dpp, 10) << __func__ << "::MD5 Loop::" << d_ctl.dedup_type << dendl; @@ -2839,7 +2831,7 @@ namespace rgw::dedup { md5_stats_t md5_stats; //DEDUP_DYN_ALLOC dedup_table_t table(dpp, d_head_object_size, d_min_obj_size_for_dedup, - d_max_obj_size_for_split, raw_mem, raw_mem_size); + d_split_head, raw_mem, raw_mem_size); int ret = objects_dedup_single_md5_shard(&table, md5_shard, &md5_stats, num_work_shards); if (ret == 0) { md5_stats.duration = ceph_clock_now() - start_time; diff --git a/src/rgw/driver/rados/rgw_dedup.h b/src/rgw/driver/rados/rgw_dedup.h index ecb1e44088b0..71d980fb58b0 100644 --- a/src/rgw/driver/rados/rgw_dedup.h +++ b/src/rgw/driver/rados/rgw_dedup.h @@ -98,16 +98,9 @@ namespace rgw::dedup { }; inline uint64_t __calc_deduped_bytes(uint16_t num_parts, uint64_t size_bytes); - inline bool should_split_head(uint64_t head_size, uint64_t obj_size); - int get_tail_ioctx(const disk_record_t *p_rec, - const RGWObjManifest &manifest, - const std::string &tail_name, - md5_stats_t *p_stats /*IN-OUT*/, - librados::IoCtx *p_ioctx /*OUT*/, - std::string *p_oid /*OUT*/); + inline bool should_split_head(const RGWObjManifest &manifest); void remove_created_tail_object(const disk_record_t *p_rec, const RGWObjManifest &manifest, - const std::string &tail_name, md5_stats_t *p_stats /*IN-OUT*/); void run(); int setup(struct dedup_epoch_t*); @@ -201,7 +194,6 @@ namespace rgw::dedup { int split_head_object(disk_record_t *p_src_rec, // IN/OUT PARAM RGWObjManifest &src_manifest, // IN/OUT PARAM const disk_record_t *p_tgt_rec, - std::string *p_tail_name /*OUT*/, md5_stats_t *p_stats /* IN-OUT */); int add_obj_attrs_to_record(disk_record_t *p_rec, @@ -221,7 +213,6 @@ namespace rgw::dedup { RGWObjManifest &src_manifest, const RGWObjManifest &tgt_manifest, const dedup_table_t::value_t *p_src_val, - std::string *p_tail_name /*OUT*/, md5_stats_t *p_stats /* IN-OUT */); int try_deduping_record(dedup_table_t *p_table, 
disk_record_t *p_rec, @@ -244,7 +235,6 @@ namespace rgw::dedup { const RGWObjManifest &src_manifest, const RGWObjManifest &tgt_manifest, md5_stats_t *p_stats, - const std::string &tail_name, const dedup_table_t::value_t *p_src_val); #endif int remove_slabs(unsigned worker_id, unsigned md5_shard, uint32_t slab_count); @@ -266,7 +256,7 @@ namespace rgw::dedup { uint64_t d_all_buckets_obj_size = 0; uint32_t d_min_obj_size_for_dedup = (64ULL * 1024); - uint32_t d_max_obj_size_for_split = (16ULL * 1024 * 1024); + bool d_split_head = true; uint32_t d_head_object_size = (4ULL * 1024 * 1024); control_t d_ctl; uint64_t d_watch_handle = 0; diff --git a/src/rgw/driver/rados/rgw_dedup_cluster.cc b/src/rgw/driver/rados/rgw_dedup_cluster.cc index fafd66176eff..150aace4ab14 100644 --- a/src/rgw/driver/rados/rgw_dedup_cluster.cc +++ b/src/rgw/driver/rados/rgw_dedup_cluster.cc @@ -973,7 +973,6 @@ namespace rgw::dedup { Formatter::ObjectSection section{*fmt, "dedup_ratio_estimate"}; fmt->dump_unsigned("s3_bytes_before", s3_bytes_before); fmt->dump_unsigned("s3_bytes_after", s3_bytes_after); - fmt->dump_unsigned("dup_head_bytes", md5_stats_sum.dup_head_bytes_estimate); if (s3_bytes_before > s3_bytes_after && s3_bytes_after) { double dedup_ratio = (double)s3_bytes_before/s3_bytes_after; @@ -997,7 +996,6 @@ namespace rgw::dedup { Formatter::ObjectSection section{*fmt, "dedup_ratio_actual"}; fmt->dump_unsigned("s3_bytes_before", s3_bytes_before); fmt->dump_unsigned("s3_bytes_after", s3_bytes_after); - fmt->dump_unsigned("dup_head_bytes", md5_stats_sum.dup_head_bytes); if (s3_bytes_before > s3_bytes_after && s3_bytes_after) { double dedup_ratio = (double)s3_bytes_before/s3_bytes_after; fmt->dump_float("dedup_ratio", dedup_ratio); diff --git a/src/rgw/driver/rados/rgw_dedup_table.cc b/src/rgw/driver/rados/rgw_dedup_table.cc index b27bf7353a6e..898a1304dfda 100644 --- a/src/rgw/driver/rados/rgw_dedup_table.cc +++ b/src/rgw/driver/rados/rgw_dedup_table.cc @@ -23,14 +23,14 @@ namespace 
rgw::dedup { dedup_table_t::dedup_table_t(const DoutPrefixProvider* _dpp, uint32_t _head_object_size, uint32_t _min_obj_size_for_dedup, - uint32_t _max_obj_size_for_split, + bool _split_head, uint8_t *p_slab, uint64_t slab_size) { dpp = _dpp; head_object_size = _head_object_size; min_obj_size_for_dedup = _min_obj_size_for_dedup; - max_obj_size_for_split = _max_obj_size_for_split; + split_head = _split_head; memset(p_slab, 0, slab_size); hash_tab = (table_entry_t*)p_slab; entries_count = slab_size/sizeof(table_entry_t); @@ -100,6 +100,9 @@ namespace rgw::dedup { } //--------------------------------------------------------------------------- + // find_entry() assumes that entries are not removed during operation + // remove_entry() is only called from remove_singletons_and_redistribute_keys() + // doing a linear pass over the array. uint32_t dedup_table_t::find_entry(const key_t *p_key) const { uint32_t idx = p_key->hash() % entries_count; @@ -113,34 +116,19 @@ namespace rgw::dedup { //--------------------------------------------------------------------------- void dedup_table_t::inc_counters(const key_t *p_key, - dedup_stats_t *p_small_objs, - dedup_stats_t *p_big_objs, - uint64_t *p_duplicate_head_bytes) + dedup_stats_t *p_dedup_stats) { // This is an approximation only since size is stored in 4KB resolution uint64_t byte_size_approx = disk_blocks_to_byte_size(p_key->size_4k_units); - // skip small single part objects which we can't dedup - if (!dedupable_object(p_key->multipart_object(), min_obj_size_for_dedup, byte_size_approx)) { - p_small_objs->duplicate_count ++; - p_small_objs->dedup_bytes_estimate += byte_size_approx; - return; - } - else { - uint64_t dup_bytes_approx = calc_deduped_bytes(head_object_size, - min_obj_size_for_dedup, - max_obj_size_for_split, - p_key->num_parts, - byte_size_approx); - p_big_objs->duplicate_count ++; - p_big_objs->dedup_bytes_estimate += dup_bytes_approx; - - // object smaller than max_obj_size_for_split will split their head - 
// and won't dup it - if (!p_key->multipart_object() && byte_size_approx > max_obj_size_for_split) { - // single part objects duplicate the head object when dedup is used - *p_duplicate_head_bytes += head_object_size; - } + uint64_t dup_bytes_approx = calc_deduped_bytes(head_object_size, + min_obj_size_for_dedup, + split_head, + p_key->num_parts, + byte_size_approx); + if (dup_bytes_approx) { + p_dedup_stats->duplicate_count ++; + p_dedup_stats->dedup_bytes_estimate += dup_bytes_approx; } } @@ -149,9 +137,7 @@ namespace rgw::dedup { disk_block_id_t block_id, record_id_t rec_id, bool shared_manifest, - dedup_stats_t *p_small_objs, - dedup_stats_t *p_big_objs, - uint64_t *p_duplicate_head_bytes) + dedup_stats_t *p_dedup_stats) { value_t new_val(block_id, rec_id, shared_manifest); uint32_t idx = find_entry(p_key); @@ -172,7 +158,7 @@ namespace rgw::dedup { else { ceph_assert(hash_tab[idx].key == *p_key); if (val.count <= MAX_COPIES_PER_OBJ) { - inc_counters(p_key, p_small_objs, p_big_objs, p_duplicate_head_bytes); + inc_counters(p_key, p_dedup_stats); } if (val.count < std::numeric_limits::max()) { val.count ++; @@ -280,35 +266,19 @@ namespace rgw::dedup { } //--------------------------------------------------------------------------- - void dedup_table_t::count_duplicates(dedup_stats_t *p_small_objs, - dedup_stats_t *p_big_objs) + void dedup_table_t::count_duplicates(dedup_stats_t *p_dedup_stats) { for (uint32_t tab_idx = 0; tab_idx < entries_count; tab_idx++) { if (!hash_tab[tab_idx].val.is_occupied()) { continue; } - const key_t &key = hash_tab[tab_idx].key; - // This is an approximation only since size is stored in 4KB resolution - uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units); - - // skip small single part objects which we can't dedup - if (!dedupable_object(key.multipart_object(), min_obj_size_for_dedup, byte_size_approx)) { - if (hash_tab[tab_idx].val.is_singleton()) { - p_small_objs->singleton_count++; - } - else { - 
p_small_objs->unique_count ++; - } + if (hash_tab[tab_idx].val.is_singleton()) { + p_dedup_stats->singleton_count++; } else { - if (hash_tab[tab_idx].val.is_singleton()) { - p_big_objs->singleton_count++; - } - else { - ceph_assert(hash_tab[tab_idx].val.count > 1); - p_big_objs->unique_count ++; - } + ceph_assert(hash_tab[tab_idx].val.count > 1); + p_dedup_stats->unique_count ++; } } } diff --git a/src/rgw/driver/rados/rgw_dedup_table.h b/src/rgw/driver/rados/rgw_dedup_table.h index 82efc03480cb..4f26972a2cd7 100644 --- a/src/rgw/driver/rados/rgw_dedup_table.h +++ b/src/rgw/driver/rados/rgw_dedup_table.h @@ -131,16 +131,14 @@ namespace rgw::dedup { dedup_table_t(const DoutPrefixProvider* _dpp, uint32_t _head_object_size, uint32_t _min_obj_size_for_dedup, - uint32_t _max_obj_size_for_split, + bool _split_head, uint8_t *p_slab, uint64_t slab_size); int add_entry(key_t *p_key, disk_block_id_t block_id, record_id_t rec_id, bool shared_manifest, - dedup_stats_t *p_small_objs_stat, - dedup_stats_t *p_big_objs_stat, - uint64_t *p_duplicate_head_bytes); + dedup_stats_t *p_dedup_stats); void update_entry(key_t *p_key, disk_block_id_t block_id, record_id_t rec_id, bool shared_manifest); @@ -159,9 +157,7 @@ namespace rgw::dedup { bool set_shared_manifest_src, bool set_has_valid_hash_src); - void count_duplicates(dedup_stats_t *p_small_objs_stat, - dedup_stats_t *p_big_objs_stat); - + void count_duplicates(dedup_stats_t *p_dedup_stats); void remove_singletons_and_redistribute_keys(); private: // 32 Bytes unified entries @@ -173,15 +169,13 @@ namespace rgw::dedup { uint32_t find_entry(const key_t *p_key) const; void inc_counters(const key_t *p_key, - dedup_stats_t *p_small_objs, - dedup_stats_t *p_big_objs, - uint64_t *p_duplicate_head_bytes); + dedup_stats_t *p_dedup_stats); uint32_t entries_count = 0; uint32_t occupied_count = 0; uint32_t head_object_size; uint32_t min_obj_size_for_dedup; - uint32_t max_obj_size_for_split; + bool split_head; table_entry_t *hash_tab = nullptr; 
// stat counters diff --git a/src/rgw/driver/rados/rgw_dedup_utils.cc b/src/rgw/driver/rados/rgw_dedup_utils.cc index 74252a853950..52fdfa2c04f3 100644 --- a/src/rgw/driver/rados/rgw_dedup_utils.cc +++ b/src/rgw/driver/rados/rgw_dedup_utils.cc @@ -382,8 +382,6 @@ namespace rgw::dedup { this->ingress_corrupted_etag += other.ingress_corrupted_etag; this->ingress_skip_too_small_bytes += other.ingress_skip_too_small_bytes; this->ingress_skip_too_small += other.ingress_skip_too_small; - this->ingress_skip_too_small_64KB_bytes += other.ingress_skip_too_small_64KB_bytes; - this->ingress_skip_too_small_64KB += other.ingress_skip_too_small_64KB; return *this; } @@ -440,13 +438,6 @@ namespace rgw::dedup { this->ingress_skip_too_small); f->dump_unsigned("Ingress skip: too small bytes", this->ingress_skip_too_small_bytes); - - if(this->ingress_skip_too_small_64KB) { - f->dump_unsigned("Ingress skip: 64KB<=size<=4MB Obj", - this->ingress_skip_too_small_64KB); - f->dump_unsigned("Ingress skip: 64KB<=size<=4MB Bytes", - this->ingress_skip_too_small_64KB_bytes); - } } } @@ -499,9 +490,6 @@ namespace rgw::dedup { encode(w.ingress_skip_too_small_bytes, bl); encode(w.ingress_skip_too_small, bl); - encode(w.ingress_skip_too_small_64KB_bytes, bl); - encode(w.ingress_skip_too_small_64KB, bl); - encode(w.duration, bl); ENCODE_FINISH(bl); } @@ -528,8 +516,6 @@ namespace rgw::dedup { decode(w.ingress_corrupted_etag, bl); decode(w.ingress_skip_too_small_bytes, bl); decode(w.ingress_skip_too_small, bl); - decode(w.ingress_skip_too_small_64KB_bytes, bl); - decode(w.ingress_skip_too_small_64KB, bl); decode(w.duration, bl); DECODE_FINISH(bl); @@ -538,7 +524,6 @@ namespace rgw::dedup { //--------------------------------------------------------------------------- md5_stats_t& md5_stats_t::operator+=(const md5_stats_t& other) { - this->small_objs_stat += other.small_objs_stat; this->big_objs_stat += other.big_objs_stat; this->ingress_slabs += other.ingress_slabs; this->ingress_failed_load_bucket 
+= other.ingress_failed_load_bucket; @@ -551,6 +536,8 @@ namespace rgw::dedup { this->ingress_skip_compressed += other.ingress_skip_compressed; this->ingress_skip_compressed_bytes += other.ingress_skip_compressed_bytes; this->ingress_skip_changed_objs += other.ingress_skip_changed_objs; + this->ingress_skip_explicit_objs += other.ingress_skip_explicit_objs; + this->ingress_skip_alibaba += other.ingress_skip_alibaba; this->shared_manifest_dedup_bytes += other.shared_manifest_dedup_bytes; this->skipped_shared_manifest += other.skipped_shared_manifest; @@ -591,10 +578,8 @@ namespace rgw::dedup { this->set_shared_manifest_src += other.set_shared_manifest_src; this->loaded_objects += other.loaded_objects; this->processed_objects += other.processed_objects; - this->dup_head_bytes_estimate += other.dup_head_bytes_estimate; this->deduped_objects += other.deduped_objects; this->deduped_objects_bytes += other.deduped_objects_bytes; - this->dup_head_bytes += other.dup_head_bytes; this->failed_dedup += other.failed_dedup; this->md_throttle_sleep_events += other.md_throttle_sleep_events; @@ -628,7 +613,6 @@ namespace rgw::dedup { f->dump_unsigned("Set Shared-Manifest SRC", this->set_shared_manifest_src); f->dump_unsigned("Deduped Obj (this cycle)", this->deduped_objects); f->dump_unsigned("Deduped Bytes(this cycle)", this->deduped_objects_bytes); - f->dump_unsigned("Dup head bytes (not dedup)", this->dup_head_bytes); f->dump_unsigned("Already Deduped bytes (prev cycles)", this->shared_manifest_dedup_bytes); @@ -639,21 +623,6 @@ namespace rgw::dedup { f->dump_unsigned("Dedup Bytes Estimate", ds.dedup_bytes_estimate); } - // Potential Dedup Section: - // What could be gained by allowing dedup for smaller objects (64KB-4MB) - // Space wasted because of duplicated head-object (4MB) - { - Formatter::ObjectSection potential(*f, "Potential Dedup"); - const dedup_stats_t &ds = this->small_objs_stat; - f->dump_unsigned("Singleton Obj (64KB-4MB)", ds.singleton_count); - 
f->dump_unsigned("Unique Obj (64KB-4MB)", ds.unique_count); - f->dump_unsigned("Duplicate Obj (64KB-4MB)", ds.duplicate_count); - f->dump_unsigned("Dedup Bytes Estimate (64KB-4MB)", ds.dedup_bytes_estimate); - f->dump_unsigned("Duplicated Head Bytes Estimate", - this->dup_head_bytes_estimate); - f->dump_unsigned("Duplicated Head Bytes", this->dup_head_bytes); - } - { Formatter::ObjectSection notify(*f, "notify"); if (this->md_throttle_sleep_events) { @@ -726,6 +695,12 @@ namespace rgw::dedup { if (this->ingress_skip_changed_objs) { f->dump_unsigned("Skipped Changed Object", this->ingress_skip_changed_objs); } + if (this->ingress_skip_explicit_objs) { + f->dump_unsigned("Skipped Explicit Objs", this->ingress_skip_explicit_objs); + } + if (this->ingress_skip_alibaba) { + f->dump_unsigned("Skipped Alibaba Cloud OSS", this->ingress_skip_alibaba); + } } { @@ -814,7 +789,6 @@ namespace rgw::dedup { { ENCODE_START(1, 1, bl); - encode(m.small_objs_stat, bl); encode(m.big_objs_stat, bl); encode(m.ingress_slabs, bl); encode(m.ingress_failed_load_bucket, bl); @@ -827,6 +801,8 @@ namespace rgw::dedup { encode(m.ingress_skip_compressed, bl); encode(m.ingress_skip_compressed_bytes, bl); encode(m.ingress_skip_changed_objs, bl); + encode(m.ingress_skip_explicit_objs, bl); + encode(m.ingress_skip_alibaba, bl); encode(m.shared_manifest_dedup_bytes, bl); encode(m.skipped_shared_manifest, bl); @@ -867,10 +843,8 @@ namespace rgw::dedup { encode(m.loaded_objects, bl); encode(m.processed_objects, bl); - encode(m.dup_head_bytes_estimate, bl); encode(m.deduped_objects, bl); encode(m.deduped_objects_bytes, bl); - encode(m.dup_head_bytes, bl); encode(m.failed_dedup, bl); encode(m.md_throttle_sleep_events, bl); encode(m.md_throttle_sleep_time_usec, bl); @@ -885,7 +859,6 @@ namespace rgw::dedup { void decode(md5_stats_t& m, ceph::bufferlist::const_iterator& bl) { DECODE_START(1, bl); - decode(m.small_objs_stat, bl); decode(m.big_objs_stat, bl); decode(m.ingress_slabs, bl); 
decode(m.ingress_failed_load_bucket, bl); @@ -898,6 +871,8 @@ namespace rgw::dedup { decode(m.ingress_skip_compressed, bl); decode(m.ingress_skip_compressed_bytes, bl); decode(m.ingress_skip_changed_objs, bl); + decode(m.ingress_skip_explicit_objs, bl); + decode(m.ingress_skip_alibaba, bl); decode(m.shared_manifest_dedup_bytes, bl); decode(m.skipped_shared_manifest, bl); @@ -938,10 +913,8 @@ namespace rgw::dedup { decode(m.loaded_objects, bl); decode(m.processed_objects, bl); - decode(m.dup_head_bytes_estimate, bl); decode(m.deduped_objects, bl); decode(m.deduped_objects_bytes, bl); - decode(m.dup_head_bytes, bl); decode(m.failed_dedup, bl); decode(m.md_throttle_sleep_events, bl); decode(m.md_throttle_sleep_time_usec, bl); diff --git a/src/rgw/driver/rados/rgw_dedup_utils.h b/src/rgw/driver/rados/rgw_dedup_utils.h index 579e048a259f..6a7f508cc36e 100644 --- a/src/rgw/driver/rados/rgw_dedup_utils.h +++ b/src/rgw/driver/rados/rgw_dedup_utils.h @@ -198,9 +198,6 @@ namespace rgw::dedup { uint64_t ingress_skip_too_small_bytes = 0; uint64_t ingress_skip_too_small = 0; - uint64_t ingress_skip_too_small_64KB_bytes = 0; - uint64_t ingress_skip_too_small_64KB = 0; - utime_t duration = {0, 0}; }; std::ostream& operator<<(std::ostream &out, const worker_stats_t &s); @@ -212,7 +209,6 @@ namespace rgw::dedup { md5_stats_t& operator +=(const md5_stats_t& other); void dump(Formatter *f) const; - dedup_stats_t small_objs_stat; dedup_stats_t big_objs_stat; uint64_t ingress_slabs = 0; uint64_t ingress_failed_load_bucket = 0; @@ -225,6 +221,8 @@ namespace rgw::dedup { uint64_t ingress_skip_compressed = 0; uint64_t ingress_skip_compressed_bytes = 0; uint64_t ingress_skip_changed_objs = 0; + uint64_t ingress_skip_explicit_objs = 0; + uint64_t ingress_skip_alibaba = 0; uint64_t shared_manifest_dedup_bytes = 0; uint64_t skipped_shared_manifest = 0; @@ -265,11 +263,9 @@ namespace rgw::dedup { uint64_t loaded_objects = 0; uint64_t processed_objects = 0; // counter is using on-disk size 
affected by block-size - uint64_t dup_head_bytes_estimate = 0; //duplicate_head_bytes uint64_t deduped_objects = 0; // counter is using s3 byte size disregarding the on-disk size affected by block-size uint64_t deduped_objects_bytes = 0; - uint64_t dup_head_bytes = 0; uint64_t failed_dedup = 0; uint64_t md_throttle_sleep_events = 0; uint64_t md_throttle_sleep_time_usec = 0; @@ -368,7 +364,7 @@ namespace rgw::dedup { //--------------------------------------------------------------------------- static inline uint64_t calc_deduped_bytes(uint32_t head_obj_size, uint32_t min_obj_size_for_dedup, - uint32_t max_obj_size_for_split, + bool split_head, uint16_t num_parts, uint64_t size_bytes) { @@ -376,18 +372,16 @@ namespace rgw::dedup { // multipart objects with an empty head i.e. we achive full dedup return size_bytes; } + else if (size_bytes < min_obj_size_for_dedup) { + return 0; + } + else if (split_head) { + // Head is splitted into an empty obj and a new tail enabling a full dedup + return size_bytes; + } else { - // reduce the head size - if (size_bytes > max_obj_size_for_split) { - return size_bytes - head_obj_size; - } - else if (size_bytes >= min_obj_size_for_dedup) { - // Head is splitted into an empty obj and a new tail enabling a full dedup - return size_bytes; - } - else { - return 0; - } + // reduce the head size which is not dedup + return size_bytes - std::min(size_bytes, (uint64_t)head_obj_size); } } diff --git a/src/rgw/driver/rados/rgw_obj_manifest.cc b/src/rgw/driver/rados/rgw_obj_manifest.cc index d423c115d4de..dd24390247ea 100644 --- a/src/rgw/driver/rados/rgw_obj_manifest.cc +++ b/src/rgw/driver/rados/rgw_obj_manifest.cc @@ -181,13 +181,13 @@ int RGWObjManifest::append_explicit(const DoutPrefixProvider *dpp, RGWObjManifes return 0; } -bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule) +bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule) const { if (rules.empty()) { return false; } - map<uint64_t, RGWObjManifestRule>::iterator iter =
rules.upper_bound(ofs); + map<uint64_t, RGWObjManifestRule>::const_iterator iter = rules.upper_bound(ofs); if (iter != rules.begin()) { --iter; } diff --git a/src/rgw/driver/rados/rgw_obj_manifest.h b/src/rgw/driver/rados/rgw_obj_manifest.h index 4129a015c98b..966db3add84f 100644 --- a/src/rgw/driver/rados/rgw_obj_manifest.h +++ b/src/rgw/driver/rados/rgw_obj_manifest.h @@ -254,7 +254,11 @@ public: } void get_implicit_location(uint64_t cur_part_id, uint64_t cur_stripe, uint64_t ofs, - std::string *override_prefix, rgw_obj_select *location) const; + const std::string *override_prefix, rgw_obj_select *location) const; + + const std::map<uint64_t, RGWObjManifestRule>& get_rules() const { + return rules; + } void clear_rules() { rules.clear(); @@ -383,7 +387,7 @@ public: int append(const DoutPrefixProvider *dpp, RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params); - bool get_rule(uint64_t ofs, RGWObjManifestRule *rule); + bool get_rule(uint64_t ofs, RGWObjManifestRule *rule) const; bool empty() const { if (explicit_objs) diff --git a/src/rgw/rgw_obj_manifest.cc b/src/rgw/rgw_obj_manifest.cc index 042c97aa13cd..418c913b82a8 100644 --- a/src/rgw/rgw_obj_manifest.cc +++ b/src/rgw/rgw_obj_manifest.cc @@ -207,7 +207,7 @@ void RGWObjManifest::obj_iterator::update_location() } void RGWObjManifest::get_implicit_location(uint64_t cur_part_id, uint64_t cur_stripe, - uint64_t ofs, string *override_prefix, rgw_obj_select *location) const + uint64_t ofs, const string *override_prefix, rgw_obj_select *location) const { rgw_obj loc; diff --git a/src/test/rgw/dedup/test_dedup.py b/src/test/rgw/dedup/test_dedup.py index 40f82b862748..25177a10ebb1 100644 --- a/src/test/rgw/dedup/test_dedup.py +++ b/src/test/rgw/dedup/test_dedup.py @@ -51,10 +51,6 @@ class Dedup_Stats: duplicate_obj : int = 0 deduped_obj_bytes : int = 0 non_default_storage_class_objs_bytes : int = 0 - potential_singleton_obj : int = 0 - potential_unique_obj : int = 0 - potential_duplicate_obj : int = 0 - potential_dedup_space : int = 0 @dataclass
class Dedup_Ratio: @@ -280,9 +276,8 @@ def create_buckets(conn, max_copies_count): OUT_DIR="/tmp/dedup/" KB=(1024) MB=(1024*KB) -POTENTIAL_OBJ_SIZE=(64*KB) DEDUP_MIN_OBJ_SIZE=(64*KB) -SPLIT_HEAD_SIZE=(4*MB) +SPLIT_HEAD=True RADOS_OBJ_SIZE=(4*MB) # The default multipart threshold size for S3cmd is 15 MB. MULTIPART_SIZE=(15*MB) @@ -638,17 +633,16 @@ def calc_head_size(obj_size, config): def calc_dedupable_space(obj_size, config): on_disk_byte_size = calc_on_disk_byte_size(obj_size) - threshold = config.multipart_threshold # Objects with size bigger than MULTIPART_SIZE are uploaded as multi-part # multi-part objects got a zero size Head objects - if obj_size >= threshold: + if obj_size >= config.multipart_threshold: dedupable_space = on_disk_byte_size - elif obj_size > SPLIT_HEAD_SIZE: - dedupable_space = on_disk_byte_size - RADOS_OBJ_SIZE - elif obj_size >= DEDUP_MIN_OBJ_SIZE: + elif obj_size < DEDUP_MIN_OBJ_SIZE: + dedupable_space = 0 + elif SPLIT_HEAD: dedupable_space = on_disk_byte_size else: - dedupable_space = 0 + dedupable_space = (on_disk_byte_size - min(on_disk_byte_size, RADOS_OBJ_SIZE)) log.debug("obj_size=%.2f MiB, dedupable_space=%.2f MiB", float(obj_size)/MB, float(dedupable_space)/MB) @@ -659,7 +653,7 @@ def calc_split_objs_count(obj_size, num_copies, config): threshold = config.multipart_threshold on_disk_byte_size = calc_on_disk_byte_size(obj_size) - if num_copies < 2 or on_disk_byte_size > SPLIT_HEAD_SIZE or obj_size >= threshold: + if num_copies < 2 or not SPLIT_HEAD or obj_size >= threshold: return 0 if on_disk_byte_size < DEDUP_MIN_OBJ_SIZE: @@ -680,15 +674,6 @@ def calc_expected_stats(dedup_stats, obj_size, num_copies, config): if on_disk_byte_size < DEDUP_MIN_OBJ_SIZE and threshold > DEDUP_MIN_OBJ_SIZE: dedup_stats.skip_too_small += num_copies dedup_stats.skip_too_small_bytes += (on_disk_byte_size * num_copies) - - if on_disk_byte_size >= POTENTIAL_OBJ_SIZE: - if num_copies == 1: - dedup_stats.potential_singleton_obj += 1 - else: - 
dedup_stats.potential_unique_obj += 1 - dedup_stats.potential_duplicate_obj += dups_count - dedup_stats.potential_dedup_space += (on_disk_byte_size * dups_count) - return dedup_stats.total_processed_objects += num_copies @@ -1399,12 +1384,6 @@ def read_dedup_stats(dry_run): dedup_stats.duplicate_obj = main['Duplicate Obj'] dedup_stats.dedup_bytes_estimate = main['Dedup Bytes Estimate'] - potential = md5_stats['Potential Dedup'] - dedup_stats.potential_singleton_obj = potential['Singleton Obj (64KB-4MB)'] - dedup_stats.potential_unique_obj = potential['Unique Obj (64KB-4MB)'] - dedup_stats.potential_duplicate_obj = potential['Duplicate Obj (64KB-4MB)'] - dedup_stats.potential_dedup_space = potential['Dedup Bytes Estimate (64KB-4MB)'] - dedup_work_was_completed=jstats['completed'] if dedup_work_was_completed: dedup_ratio_estimate=read_dedup_ratio(jstats, 'dedup_ratio_estimate') @@ -1486,11 +1465,6 @@ def exec_dedup(expected_dedup_stats, dry_run, verify_stats=True, post_dedup_size if verify_stats == False: return ret - if dedup_stats.potential_unique_obj or expected_dedup_stats.potential_unique_obj: - log.debug("potential_unique_obj= %d / %d ", dedup_stats.potential_unique_obj, - expected_dedup_stats.potential_unique_obj) - - #dedup_stats.set_hash = dedup_stats.invalid_hash if dedup_stats != expected_dedup_stats: log.debug("==================================================") @@ -1513,14 +1487,6 @@ def prepare_test(): os.mkdir(OUT_DIR) -#------------------------------------------------------------------------------- -def copy_potential_stats(new_dedup_stats, dedup_stats): - new_dedup_stats.potential_singleton_obj = dedup_stats.potential_singleton_obj - new_dedup_stats.potential_unique_obj = dedup_stats.potential_unique_obj - new_dedup_stats.potential_duplicate_obj = dedup_stats.potential_duplicate_obj - new_dedup_stats.potential_dedup_space = dedup_stats.potential_dedup_space - - #------------------------------------------------------------------------------- def 
small_single_part_objs_dedup(conn, bucket_name, dry_run): # 1) generate small random files and store them on disk @@ -1547,8 +1513,6 @@ def small_single_part_objs_dedup(conn, bucket_name, dry_run): # expected stats for small objects - all zeros except for skip_too_small small_objs_dedup_stats = Dedup_Stats() - #small_objs_dedup_stats.loaded_objects=dedup_stats.loaded_objects - copy_potential_stats(small_objs_dedup_stats, dedup_stats) small_objs_dedup_stats.size_before_dedup = dedup_stats.size_before_dedup small_objs_dedup_stats.skip_too_small_bytes=dedup_stats.size_before_dedup small_objs_dedup_stats.skip_too_small = s3_objects_total @@ -1897,6 +1861,8 @@ def test_dedup_with_versions(): min_size=1*KB max_size=MULTIPART_SIZE*2 success=False + # Declare the variable with a type hint + conn: BaseClient try: conn=get_single_connection() conn.create_bucket(Bucket=bucket_name) @@ -2415,8 +2381,6 @@ def test_dedup_small_with_tenants(): # expected stats for small objects - all zeros except for skip_too_small small_objs_dedup_stats = Dedup_Stats() - #small_objs_dedup_stats.loaded_objects=dedup_stats.loaded_objects - copy_potential_stats(small_objs_dedup_stats, dedup_stats) small_objs_dedup_stats.size_before_dedup=dedup_stats.size_before_dedup small_objs_dedup_stats.skip_too_small_bytes=dedup_stats.size_before_dedup small_objs_dedup_stats.skip_too_small=s3_objects_total @@ -3320,7 +3284,6 @@ def test_dedup_dry_small_with_tenants(): # expected stats for small objects - all zeros except for skip_too_small small_objs_dedup_stats = Dedup_Stats() - copy_potential_stats(small_objs_dedup_stats, dedup_stats) small_objs_dedup_stats.size_before_dedup=dedup_stats.size_before_dedup small_objs_dedup_stats.skip_too_small_bytes=dedup_stats.size_before_dedup small_objs_dedup_stats.skip_too_small=s3_objects_total @@ -3693,3 +3656,4 @@ def test_dedup_identical_copies_multipart_small(): force_clean=True log.info("test_dedup_identical_copies_multipart:full test") 
__test_dedup_identical_copies(files, config, dry_run, verify, force_clean) +