rgw/dedup: This PR extends the RGW dedup split-head feature to support objects that...

author benhanokh <gbenhano@redhat.com>

Mon, 30 Mar 2026 08:22:51 +0000 (11:22 +0300)

committer benhanokh <gbenhano@redhat.com>

Thu, 16 Apr 2026 08:24:10 +0000 (11:24 +0300)
author benhanokh <gbenhano@redhat.com>
Mon, 30 Mar 2026 08:22:51 +0000 (11:22 +0300)
committer benhanokh <gbenhano@redhat.com>
Thu, 16 Apr 2026 08:24:10 +0000 (11:24 +0300)
diff --git a/doc/radosgw/s3_objects_dedup.rst b/doc/radosgw/s3_objects_dedup.rst

index fe83124d1549f61752617ce1ba243225b28424b1..7de91ed00071773c9458a8f69bf95173646d98dd 100644 (file)
--- a/doc/radosgw/s3_objects_dedup.rst
+++ b/doc/radosgw/s3_objects_dedup.rst
@@ -108,19 +108,21 @@ matches. If they are, we proceed with the deduplication:
  - Copy the manifest from the source to the target.
  - Remove all tail objects on the target.
  
-
  Split Head Mode
  ===============
  
-Dedup code can split the head object into 2 objects
+The dedup code can split a head object into 2 objects:
  
-- one with attributes and no data and
+- one with attributes and no data, and
  - a new tail object with only data.
  
-The new tail object will be deduped, unlike the head objects, which cannot
+The new tail object will be deduped, unlike head objects, which cannot
  be deduplicated.
-This feature is only enabled for RGW objects without existing tail objects
-(in other words, objects sized 4 MB or less).
+
+Split-head is controlled by :confval:`rgw_dedup_split_obj_head` (default: true).
+Setting this option to ``false`` disables split-head entirely.
+
+.. confval:: rgw_dedup_split_obj_head
  
  
  Memory Usage
diff --git a/src/common/options/rgw.yaml.in b/src/common/options/rgw.yaml.in

index 6a85b076ca5f826bca6ab4782fd00d790dfc20e7..57000c5154b53618b4ddd61a8a75e90d3fa7e87c 100644 (file)
--- a/src/common/options/rgw.yaml.in
+++ b/src/common/options/rgw.yaml.in
@@ -81,6 +81,18 @@ options:
    default: false
    services:
    - rgw
+- name: rgw_dedup_split_obj_head
+  type: bool
+  level: advanced
+  desc: Enables the split-head functionality
+  long_desc: The dedup code can split a head object into two objects -
+             one with attributes and no data, and a new tail object with only data.
+             The new tail object will be deduped (unlike head objects, which
+             can't be deduplicated).
+  default: true
+  services:
+  - rgw
+  with_legacy: true
  - name: rgw_dedup_min_obj_size_for_dedup
    type: size
    level: advanced
diff --git a/src/rgw/driver/rados/rgw_dedup.cc b/src/rgw/driver/rados/rgw_dedup.cc

index 2446d43ff92d4a1bddeccd1c0d2e3f2f7aa834ac..67d59370aa9a189679ac77d011e574ef43f7b362 100644 (file)
--- a/src/rgw/driver/rados/rgw_dedup.cc
+++ b/src/rgw/driver/rados/rgw_dedup.cc
@@ -413,12 +413,10 @@ namespace rgw::dedup {
    {
      d_head_object_size = cct->_conf->rgw_max_chunk_size;
      d_min_obj_size_for_dedup = cct->_conf->rgw_dedup_min_obj_size_for_dedup;
-
-    // limit split head to objects without tail
-    d_max_obj_size_for_split = d_head_object_size;
+    d_split_head = cct->_conf->rgw_dedup_split_obj_head;
      ldpp_dout(dpp, 10) << "Config Vals::d_head_object_size=" << d_head_object_size
                         << "::d_min_obj_size_for_dedup=" << d_min_obj_size_for_dedup
-                       << "::d_max_obj_size_for_split=" << d_max_obj_size_for_split
+                       << "::d_split_head=" << d_split_head
                         << dendl;
  
      int ret = init_rados_access_handles(false);
@@ -435,11 +433,8 @@ namespace rgw::dedup {
    //------------------------------------------------------------------------------
    uint64_t Background::__calc_deduped_bytes(uint16_t num_parts, uint64_t size_bytes)
    {
-    return calc_deduped_bytes(d_head_object_size,
-                              d_min_obj_size_for_dedup,
-                              d_max_obj_size_for_split,
-                              num_parts,
-                              size_bytes);
+    return calc_deduped_bytes(d_head_object_size, d_min_obj_size_for_dedup,
+                              d_split_head, num_parts, size_bytes);
    }
  
    //---------------------------------------------------------------------------
@@ -495,8 +490,7 @@ namespace rgw::dedup {
                         << p_rec->s.md5_low << std::dec << dendl;
  
      int ret = p_table->add_entry(&key, block_id, rec_id, has_shared_manifest,
-                                 &p_stats->small_objs_stat, &p_stats->big_objs_stat,
-                                 &p_stats->dup_head_bytes_estimate);
+                                 &p_stats->big_objs_stat);
      if (ret == 0) {
        p_stats->loaded_objects ++;
        ldpp_dout(dpp, 20) << __func__ << "::" << p_rec->bucket_name << "/"
@@ -544,15 +538,14 @@ namespace rgw::dedup {
    }
  
    //---------------------------------------------------------------------------
-  static int get_ioctx_internal(const DoutPrefixProvider* const dpp,
-                                rgw::sal::Driver* driver,
-                                rgw::sal::RadosStore* store,
-                                const std::string &obj_name,
-                                const std::string &instance,
-                                const rgw_bucket &rb,
-                                librados::IoCtx *p_ioctx /*OUT*/,
-                                std::string *p_oid /*OUT*/)
+  static inline int get_ioctx(const DoutPrefixProvider* const dpp,
+                              rgw::sal::Driver* driver,
+                              rgw::sal::RadosStore* store,
+                              const disk_record_t *p_rec,
+                              librados::IoCtx *p_ioctx /*OUT*/,
+                              std::string *p_oid /*OUT*/)
    {
+    rgw_bucket rb{p_rec->tenant_name, p_rec->bucket_name, p_rec->bucket_id};
      unique_ptr<rgw::sal::Bucket> bucket;
      {
        int ret = driver->load_bucket(dpp, rb, &bucket, null_yield);
@@ -564,44 +557,12 @@ namespace rgw::dedup {
      }
  
      string dummy_locator;
-    const rgw_obj_index_key key(obj_name, instance);
+    const rgw_obj_index_key key(p_rec->obj_name, p_rec->instance);
      rgw_obj obj(bucket->get_key(), key);
      get_obj_bucket_and_oid_loc(obj, *p_oid, dummy_locator);
      RGWBucketInfo& bucket_info = bucket->get_info();
      return store->get_obj_head_ioctx(dpp, bucket_info, obj, p_ioctx);
-  }
  
-  //---------------------------------------------------------------------------
-  static inline int get_ioctx(const DoutPrefixProvider* const dpp,
-                              rgw::sal::Driver* driver,
-                              rgw::sal::RadosStore* store,
-                              const disk_record_t *p_rec,
-                              librados::IoCtx *p_ioctx /*OUT*/,
-                              std::string *p_oid /*OUT*/)
-  {
-    rgw_bucket b{p_rec->tenant_name, p_rec->bucket_name, p_rec->bucket_id};
-    return get_ioctx_internal(dpp, driver, store, p_rec->obj_name, p_rec->instance,
-                              b, p_ioctx, p_oid);
-  }
-
-  //---------------------------------------------------------------------------
-  static inline std::string generate_split_head_tail_name(const RGWObjManifest &manifest)
-  {
-    static constexpr std::string_view shadow_string(RGW_OBJ_NS_SHADOW);
-    std::string_view suffix = "0";
-    const std::string &prefix = manifest.get_prefix();
-
-    std::string tail_name;
-    tail_name.reserve(shadow_string.size() + prefix.size() + suffix.size() + 1);
-    // TBD:
-    // it is unclear when RGW code pads with "_" before the shadow string
-    // It won't change correctness, but might look weird
-    //tail_name.append("_");
-    tail_name.append(shadow_string);
-    tail_name.append("_");
-    tail_name.append(prefix);
-    tail_name.append(suffix);
-    return tail_name;
    }
  
    //---------------------------------------------------------------------------
@@ -611,43 +572,37 @@ namespace rgw::dedup {
    }
  
    //---------------------------------------------------------------------------
-  int Background::get_tail_ioctx(const disk_record_t *p_rec,
-                                 const RGWObjManifest &manifest,
-                                 const std::string &tail_name,
-                                 md5_stats_t *p_stats /*IN-OUT*/,
-                                 librados::IoCtx *p_ioctx /*OUT*/,
-                                 std::string *p_oid /*OUT*/)
+  static int get_first_tail_obj_params(const DoutPrefixProvider *dpp,
+                                       RGWRados *rados,
+                                       const RGWObjManifest &manifest,
+                                       librados::IoCtx *p_tail_ioctx, /*OUT*/
+                                       std::string *p_tail_oid /*OUT*/)
    {
-    const rgw_bucket_placement &tail_placement = manifest.get_tail_placement();
-    // Tail placement_rule was fixed before committed to SLAB, if looks bad -> abort
-    if (unlikely(invalid_tail_placement(tail_placement))) {
-      p_stats->split_head_no_tail_placement++;
-      ldpp_dout(dpp, 1) << __func__ << "::invalid_tail_placement -> abort" << dendl;
-      return -EINVAL;
-    }
-
-    const rgw_bucket& bucket = tail_placement.bucket;
-    // tail objects might be on another storage_class/pool, need another ioctx
-    int ret = get_ioctx_internal(dpp, driver, store, tail_name, p_rec->instance,
-                                 bucket, p_ioctx, p_oid);
-    if (unlikely(ret != 0)) {
-      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed get_ioctx_internal()" << dendl;
+    auto p = manifest.obj_begin(dpp);
+    const rgw_obj_select& os = p.get_location();
+    rgw_raw_obj raw_obj = os.get_raw_obj(rados);
+    rgw_rados_ref obj;
+    int ret = rgw_get_rados_ref(dpp, rados->get_rados_handle(), raw_obj, &obj);
+    if (ret < 0) {
+      ldpp_dout(dpp, 1) << __func__ << "::failed rgw_get_rados_ref() for oid="
+                        << raw_obj.oid << ", err is " << cpp_strerror(-ret) << dendl;
        return ret;
      }
  
+    *p_tail_ioctx = obj.ioctx;
+    *p_tail_oid   = raw_obj.oid;
+
      return 0;
    }
  
    //---------------------------------------------------------------------------
    void Background::remove_created_tail_object(const disk_record_t *p_rec,
                                                const RGWObjManifest &manifest,
-                                              const std::string &tail_name,
                                                md5_stats_t *p_stats /*IN-OUT*/)
    {
      librados::IoCtx tail_ioctx;
      std::string tail_oid;
-    int ret = get_tail_ioctx(p_rec, manifest, tail_name, p_stats, &tail_ioctx,
-                             &tail_oid);
+    int ret = get_first_tail_obj_params(dpp, rados, manifest, &tail_ioctx, &tail_oid);
      if (unlikely(ret != 0)) {
        return;
      }
@@ -665,10 +620,41 @@ namespace rgw::dedup {
    }
  
    //---------------------------------------------------------------------------
-  inline bool Background::should_split_head(uint64_t head_size, uint64_t obj_size)
+  inline bool Background::should_split_head(const RGWObjManifest& manifest)
    {
-    // Don't split RGW objects with existing tail-objects
-    return (head_size > 0 && head_size == obj_size);
+    // Split-head is only applicable for single-part objects with a non-empty head.
+    // To avoid issues with manifests created via append (specifically for Alibaba Cloud OSS),
+    //    we should disable split-head whenever the manifest contains an override_prefix in the rules.
+    //    We also reject manifests with multiple rules which are exclusively an
+    //    artifact of the OSS AppendObject operation.
+    // The head_size should either represent the full object or be equal to the stripe_max_size.
+
+    bool     success   = false;
+    uint64_t head_size = manifest.get_head_size();
+    uint64_t obj_size  = manifest.get_obj_size();
+    RGWObjManifestRule rule;
+    if (manifest.get_rule(0, &rule)) {
+      success = (d_split_head        && // split-head was not disabled from yaml
+                 rule.part_size == 0 && // not a multi-part object
+                 head_size > 0       && // non-empty head
+                 !manifest.has_explicit_objs()    && // not an explicit manifest
+                 rule.override_prefix.empty()     && // not Alibaba Cloud OSS
+                 manifest.get_rules().size() == 1 && // not Alibaba Cloud OSS
+                 (head_size == rule.stripe_max_size || head_size == obj_size));
+
+      if (unlikely(!success)) {
+        ldpp_dout(dpp, 20) << __func__ << "::ERR::d_split_head=" << d_split_head
+                           << "::obj_size=" << obj_size
+                           << "::head_size=" << head_size
+                           << "::rule.part_size=" << rule.part_size
+                           << "::rule.stripe_max_size=" << rule.stripe_max_size
+                           << "::rule.override_prefix=" << rule.override_prefix
+                           << "::rule.override_prefix.empty()=" << rule.override_prefix.empty()
+                           << dendl;
+      }
+    } // don't split head if can't get rule
+
+    return success;
    }
  
    //---------------------------------------------------------------------------
@@ -806,14 +792,14 @@ namespace rgw::dedup {
        cls_refcount_get(op, ref_tag, true);
        d_ctl.metadata_access_throttle.acquire();
        ldpp_dout(dpp, 20) << __func__ << "::inc ref-count on tail object: "
-                         << raw_obj.oid << "::" << obj.obj.oid << dendl;
+                         << obj.obj.oid << "::" << raw_obj.to_str() << dendl;
        rgw::AioResultList completed = aio->get(obj.obj,
                                                rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield),
                                                cost, id);
        ret = rgw::check_for_errors(completed);
        all_results.splice(all_results.end(), completed);
        if (ret < 0) {
-        ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to copy obj=" << obj
+        ldpp_dout(dpp, 1) << __func__ << "::ERR: failed refcount_get() obj=" << obj
                            << ", ret=" << ret << " err is " << cpp_strerror(-ret) << dendl;
          break;
        }
@@ -965,7 +951,6 @@ namespace rgw::dedup {
                                 const RGWObjManifest         &src_manifest,
                                 const RGWObjManifest         &tgt_manifest,
                                 md5_stats_t                  *p_stats,
-                               const std::string            &tail_name,
                                 const dedup_table_t::value_t *p_src_val)
    {
      const uint64_t src_head_size = src_manifest.get_head_size();
@@ -990,7 +975,7 @@ namespace rgw::dedup {
      if (unlikely(ret != 0)) {
        ldpp_dout(dpp, 1) << __func__ << "::ERR: failed TGT get_ioctx()" << dendl;
        if (p_src_rec->s.flags.is_split_head()) {
-        remove_created_tail_object(p_src_rec, src_manifest, tail_name, p_stats);
+        remove_created_tail_object(p_src_rec, src_manifest, p_stats);
        }
        return ret;
      }
@@ -1001,7 +986,7 @@ namespace rgw::dedup {
        ldpp_dout(dpp, 5) << __func__ << "::abort! src_head_size=" << src_head_size
                          << "::tgt_head_size=" << tgt_head_size << dendl;
        if (p_src_rec->s.flags.is_split_head()) {
-        remove_created_tail_object(p_src_rec, src_manifest, tail_name, p_stats);
+        remove_created_tail_object(p_src_rec, src_manifest, p_stats);
        }
        // TBD: can we create a test case (requires control over head-object-size)??
        return -ECANCELED;
@@ -1013,7 +998,7 @@ namespace rgw::dedup {
      ret = inc_ref_count_by_manifest(ref_tag, src_oid, src_manifest);
      if (unlikely(ret != 0)) {
        if (p_src_rec->s.flags.is_split_head()) {
-        remove_created_tail_object(p_src_rec, src_manifest, tail_name, p_stats);
+        remove_created_tail_object(p_src_rec, src_manifest, p_stats);
        }
        return ret;
      }
@@ -1053,7 +1038,7 @@ namespace rgw::dedup {
                            << src_oid << "), err is " << cpp_strerror(-ret)<<dendl;
          rollback_ref_by_manifest(ref_tag, src_oid, src_manifest);
          if (p_src_rec->s.flags.is_split_head()) {
-          remove_created_tail_object(p_src_rec, src_manifest, tail_name, p_stats);
+          remove_created_tail_object(p_src_rec, src_manifest, p_stats);
          }
          return ret;
        }
@@ -1235,8 +1220,29 @@ namespace rgw::dedup {
                             << "::ERROR: unable to decode manifest" << dendl;
          return -EINVAL;
        }
-      need_to_split_head = should_split_head(manifest.get_head_size(),
-                                             p_rec->s.obj_bytes_size);
+
+      if (unlikely(manifest.has_explicit_objs())) {
+        // we don't support dedup of explicit_objs manifest
+        p_stats->ingress_skip_explicit_objs++;
+        ldpp_dout(dpp, 20)  << __func__ << "::explicit_objs can't be dedup" << dendl;
+        return -ENOTSUP;
+      }
+
+      RGWObjManifestRule rule;
+      if (!manifest.get_rule(0, &rule)                              ||
+          // if not a multi-part must have exactly 1 rule
+          (rule.part_size == 0 && manifest.get_rules().size() != 1) ||
+          !rule.override_prefix.empty()) {
+        // we don't support dedup of Alibaba Cloud OSS using AppendObject API
+        p_stats->ingress_skip_alibaba++;
+        ldpp_dout(dpp, 10)  << __func__ << "::Alibaba Cloud OSS can't be dedup"
+                            << "::rules.size()=" << manifest.get_rules().size()
+                            << "::get_rule ret=" << manifest.get_rule(0, &rule)
+                            << "::override_prefix=" << rule.override_prefix << dendl;
+        return -ENOTSUP;
+      }
+
+      need_to_split_head = should_split_head(manifest);
  
        // force explicit tail_placement as the dedup could be on another bucket
        const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
@@ -1501,6 +1507,11 @@ namespace rgw::dedup {
      p_rec->s.flags.clear();
      ret = add_obj_attrs_to_record(p_rec, attrs, p_stats);
      if (unlikely(ret != 0)) {
+      // don't trace errors for unsupported manifest
+      if (ret == -ENOTSUP) {
+        return 0;
+      }
+
        ldpp_dout(dpp, 5) << __func__ << "::ERR: failed add_obj_attrs_to_record() ret="
                          << ret << "::" << cpp_strerror(-ret) << dendl;
        return ret;
@@ -1640,39 +1651,36 @@ namespace rgw::dedup {
    }
  
    //---------------------------------------------------------------------------
-  static void build_and_set_explicit_manifest(const DoutPrefixProvider *dpp,
-                                              const rgw_bucket *p_bucket,
-                                              const std::string &tail_name,
-                                              RGWObjManifest *p_manifest)
+  static int set_manifest_for_split_head(const DoutPrefixProvider *const dpp,
+                                         RGWObjManifest *p_manifest /*IN-OUT*/)
    {
-    uint64_t obj_size = p_manifest->get_obj_size();
-    ceph_assert(obj_size == p_manifest->get_head_size());
-
-    const rgw_obj &head_obj = p_manifest->get_obj();
-    const rgw_obj_key &head_key = head_obj.key;
-    rgw_obj_key tail_key(tail_name, head_key.instance, head_key.ns);
-    rgw_obj tail_obj(*p_bucket, tail_key);
-
-    RGWObjManifestPart tail_part;
-    tail_part.loc     = tail_obj;
-    tail_part.loc_ofs = 0;
-    tail_part.size    = obj_size;
-
-    std::map<uint64_t, RGWObjManifestPart> objs_map;
-    objs_map[0] = tail_part;
-
+    // Set head-size to zero in the manifest with start_ofs set to zero
+    // This means no data is stored in the head-object and the first tail-object
+    //      holds the first data byte
+    uint64_t tail_ofs = 0;
      p_manifest->set_head_size(0);
      p_manifest->set_max_head_size(0);
-    p_manifest->set_prefix("");
-    p_manifest->clear_rules();
-    p_manifest->set_explicit(obj_size, objs_map);
+    RGWObjManifestRule rule;
+    if (p_manifest->get_rule(0, &rule)) {
+      ldpp_dout(dpp, 20) << "OLD Rule::start_part_num="<< rule.start_part_num
+                         << "::start_ofs=" << rule.start_ofs
+                         << "::part_size=" << rule.part_size
+                         << "::stripe_max_size=" << rule.stripe_max_size
+                         << "::override_prefix=" << rule.override_prefix << dendl;
+      p_manifest->clear_rules();
+      p_manifest->set_trivial_rule(tail_ofs, rule.stripe_max_size);
+      return 0;
+    }
+    else {
+      // No Rules, probably explicit_objs - should never happen (was checked before)
+      return -ENOENT;
+    }
    }
  
    //---------------------------------------------------------------------------
    int Background::split_head_object(disk_record_t *p_src_rec, // IN-OUT PARAM
                                      RGWObjManifest &src_manifest, // IN/OUT PARAM
                                      const disk_record_t *p_tgt_rec,
-                                    std::string *p_tail_name /*OUT*/,
                                      md5_stats_t *p_stats /* IN-OUT */)
    {
      ldpp_dout(dpp, 20) << __func__ << "::" << p_src_rec->obj_name << "::"
@@ -1719,11 +1727,14 @@ namespace rgw::dedup {
        }
      }
  
-    *p_tail_name = generate_split_head_tail_name(src_manifest);
+    ret = set_manifest_for_split_head(dpp, &src_manifest);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
      librados::IoCtx tail_ioctx;
      std::string tail_oid;
-    ret = get_tail_ioctx(p_src_rec, src_manifest, *p_tail_name, p_stats,
-                         &tail_ioctx, &tail_oid);
+    ret = get_first_tail_obj_params(dpp, rados, src_manifest, &tail_ioctx, &tail_oid);
      if (unlikely(ret != 0)) {
        return ret;
      }
@@ -1762,8 +1773,6 @@ namespace rgw::dedup {
        ldpp_dout(dpp, 20) << __func__ << "::wrote tail obj:" << tail_oid << "::ret="
                           << ret << dendl;
      }
-    const rgw_bucket *p_bucket = &(src_manifest.get_tail_placement().bucket);
-    build_and_set_explicit_manifest(dpp, p_bucket, *p_tail_name, &src_manifest);
  
      bufferlist manifest_bl;
      encode(src_manifest, manifest_bl);
@@ -1779,7 +1788,6 @@ namespace rgw::dedup {
                                               RGWObjManifest &src_manifest,
                                               const RGWObjManifest &tgt_manifest,
                                               const dedup_table_t::value_t *p_src_val,
-                                             std::string *p_tail_name /*OUT*/,
                                               md5_stats_t *p_stats)
    {
      int ret = 0;
@@ -1822,9 +1830,8 @@ namespace rgw::dedup {
      // we might still need to split-head here when hash is valid
      // can happen if we failed compare before (md5-collison) and stored the src hash
      // in the obj-attributes
-    uint64_t head_size = src_manifest.get_head_size();
-    if (should_split_head(head_size, src_manifest.get_obj_size())) {
-      ret = split_head_object(p_src_rec, src_manifest, p_tgt_rec, p_tail_name, p_stats);
+    if (should_split_head(src_manifest)) {
+      ret = split_head_object(p_src_rec, src_manifest, p_tgt_rec, p_stats);
        // compare_strong_hash() is called internally by split_head_object()
        return (ret == 0);
      }
@@ -2052,9 +2059,9 @@ namespace rgw::dedup {
      if (unlikely(has_shared_tail_objects(dpp, rados, p_src_rec, p_tgt_rec, src_manifest, tgt_manifest, p_stats))) {
        return 0;
      }
-    std::string tail_name;
+
      bool success = check_and_set_strong_hash(p_src_rec, p_tgt_rec, src_manifest,
-                                             tgt_manifest, &src_val, &tail_name, p_stats);
+                                             tgt_manifest, &src_val, p_stats);
      if (unlikely(!success)) {
        if (p_src_rec->s.flags.hash_calculated() && !src_val.has_valid_hash()) {
          // set hash attributes on head objects to save calc next time
@@ -2073,7 +2080,7 @@ namespace rgw::dedup {
      }
  
      ret = dedup_object(p_src_rec, p_tgt_rec, src_manifest, tgt_manifest, p_stats,
-                       tail_name, &src_val);
+                       &src_val);
      if (ret == 0) {
        ldpp_dout(dpp, 20) << __func__ << "::dedup success " << p_src_rec->obj_name << dendl;
        p_stats->deduped_objects++;
@@ -2083,12 +2090,6 @@ namespace rgw::dedup {
                             << ondisk_byte_size << dendl;
          p_stats->split_head_dedup_bytes += ondisk_byte_size;
        }
-      else if (p_tgt_rec->s.num_parts == 0 &&
-               // if we don't split head it will be duplicated
-               p_tgt_rec->s.obj_bytes_size > d_head_object_size) {
-        // single part objects duplicate the head object when dedup is used
-        p_stats->dup_head_bytes += d_head_object_size;
-      }
  
        // mark the SRC object as a providor of a shared manifest
        if (!src_val.has_shared_manifest()) {
@@ -2287,7 +2288,6 @@ namespace rgw::dedup {
  
      // ceph store full blocks so need to round up and multiply by block_size
      uint64_t ondisk_byte_size = calc_on_disk_byte_size(entry.meta.size);
-    // count all objects including too small and non default storage_class objs
      p_worker_stats->ingress_obj++;
      p_worker_stats->ingress_obj_bytes += ondisk_byte_size;
  
@@ -2309,17 +2309,11 @@ namespace rgw::dedup {
  
      if (ondisk_byte_size < d_min_obj_size_for_dedup) {
        if (parsed_etag.num_parts == 0) {
-        // dedup only useful for objects bigger than 4MB
+        // single-part objects smaller than the configured minimum size
+        // (`rgw_dedup_min_obj_size_for_dedup`) are skipped by dedup
          p_worker_stats->ingress_skip_too_small++;
          p_worker_stats->ingress_skip_too_small_bytes += ondisk_byte_size;
-
-        if (ondisk_byte_size >= 64*1024) {
-          p_worker_stats->ingress_skip_too_small_64KB++;
-          p_worker_stats->ingress_skip_too_small_64KB_bytes += ondisk_byte_size;
-        }
-        else {
-          return 0;
-        }
+        return 0;
        }
        else {
          // multipart objects are always good candidates for dedup
@@ -2531,8 +2525,6 @@ namespace rgw::dedup {
                         << "::total_count="      << obj_count_in_shard
                         << "::loaded_objects="   << p_stats->loaded_objects
                         << p_stats->big_objs_stat << dendl;
-    ldpp_dout(dpp, 10) << __func__ << "::small objs::"
-                       << p_stats->small_objs_stat << dendl;
    }
  
    //---------------------------------------------------------------------------
@@ -2557,7 +2549,7 @@ namespace rgw::dedup {
          return -ECANCELED;
        }
      }
-    p_table->count_duplicates(&p_stats->small_objs_stat, &p_stats->big_objs_stat);
+    p_table->count_duplicates(&p_stats->big_objs_stat);
      display_table_stat_counters(dpp, p_stats);
  
      ldpp_dout(dpp, 10) << __func__ << "::MD5 Loop::" << d_ctl.dedup_type << dendl;
@@ -2839,7 +2831,7 @@ namespace rgw::dedup {
      md5_stats_t md5_stats;
      //DEDUP_DYN_ALLOC
      dedup_table_t table(dpp, d_head_object_size, d_min_obj_size_for_dedup,
-                        d_max_obj_size_for_split, raw_mem, raw_mem_size);
+                        d_split_head, raw_mem, raw_mem_size);
      int ret = objects_dedup_single_md5_shard(&table, md5_shard, &md5_stats, num_work_shards);
      if (ret == 0) {
        md5_stats.duration = ceph_clock_now() - start_time;
diff --git a/src/rgw/driver/rados/rgw_dedup.h b/src/rgw/driver/rados/rgw_dedup.h

index ecb1e44088b02378078941824019a66a62b385e4..71d980fb58b06239bd5d56fa64380959d5a23b8a 100644 (file)
--- a/src/rgw/driver/rados/rgw_dedup.h
+++ b/src/rgw/driver/rados/rgw_dedup.h
@@ -98,16 +98,9 @@ namespace rgw::dedup {
      };
  
      inline uint64_t __calc_deduped_bytes(uint16_t num_parts, uint64_t size_bytes);
-    inline bool should_split_head(uint64_t head_size, uint64_t obj_size);
-    int get_tail_ioctx(const disk_record_t *p_rec,
-                       const RGWObjManifest &manifest,
-                       const std::string &tail_name,
-                       md5_stats_t *p_stats /*IN-OUT*/,
-                       librados::IoCtx *p_ioctx /*OUT*/,
-                       std::string *p_oid /*OUT*/);
+    inline bool should_split_head(const RGWObjManifest &manifest);
      void remove_created_tail_object(const disk_record_t *p_rec,
                                      const RGWObjManifest &manifest,
-                                    const std::string &tail_name,
                                      md5_stats_t *p_stats /*IN-OUT*/);
      void run();
      int  setup(struct dedup_epoch_t*);
@@ -201,7 +194,6 @@ namespace rgw::dedup {
      int split_head_object(disk_record_t *p_src_rec,     // IN/OUT PARAM
                            RGWObjManifest &src_manifest, // IN/OUT PARAM
                            const disk_record_t *p_tgt_rec,
-                          std::string *p_tail_name /*OUT*/,
                            md5_stats_t *p_stats /* IN-OUT */);
  
      int add_obj_attrs_to_record(disk_record_t         *p_rec,
@@ -221,7 +213,6 @@ namespace rgw::dedup {
                                     RGWObjManifest &src_manifest,
                                     const RGWObjManifest &tgt_manifest,
                                     const dedup_table_t::value_t *p_src_val,
-                                   std::string *p_tail_name /*OUT*/,
                                     md5_stats_t *p_stats /* IN-OUT */);
      int try_deduping_record(dedup_table_t   *p_table,
                              disk_record_t   *p_rec,
@@ -244,7 +235,6 @@ namespace rgw::dedup {
                       const RGWObjManifest         &src_manifest,
                       const RGWObjManifest         &tgt_manifest,
                       md5_stats_t                  *p_stats,
-                     const std::string            &tail_name,
                       const dedup_table_t::value_t *p_src_val);
  #endif
      int  remove_slabs(unsigned worker_id, unsigned md5_shard, uint32_t slab_count);
@@ -266,7 +256,7 @@ namespace rgw::dedup {
      uint64_t d_all_buckets_obj_size    = 0;
  
      uint32_t d_min_obj_size_for_dedup = (64ULL * 1024);
-    uint32_t d_max_obj_size_for_split = (16ULL * 1024 * 1024);
+    bool     d_split_head             = true;
      uint32_t d_head_object_size       = (4ULL * 1024 * 1024);
      control_t d_ctl;
      uint64_t d_watch_handle = 0;
diff --git a/src/rgw/driver/rados/rgw_dedup_cluster.cc b/src/rgw/driver/rados/rgw_dedup_cluster.cc

index fafd66176eff99b17562c6ceb107b0563597aec1..150aace4ab145cd2687f51ee1000abd56f2656ba 100644 (file)
--- a/src/rgw/driver/rados/rgw_dedup_cluster.cc
+++ b/src/rgw/driver/rados/rgw_dedup_cluster.cc
@@ -973,7 +973,6 @@ namespace rgw::dedup {
      Formatter::ObjectSection section{*fmt, "dedup_ratio_estimate"};
      fmt->dump_unsigned("s3_bytes_before", s3_bytes_before);
      fmt->dump_unsigned("s3_bytes_after", s3_bytes_after);
-    fmt->dump_unsigned("dup_head_bytes", md5_stats_sum.dup_head_bytes_estimate);
  
      if (s3_bytes_before > s3_bytes_after && s3_bytes_after) {
        double dedup_ratio = (double)s3_bytes_before/s3_bytes_after;
@@ -997,7 +996,6 @@ namespace rgw::dedup {
      Formatter::ObjectSection section{*fmt, "dedup_ratio_actual"};
      fmt->dump_unsigned("s3_bytes_before", s3_bytes_before);
      fmt->dump_unsigned("s3_bytes_after", s3_bytes_after);
-    fmt->dump_unsigned("dup_head_bytes", md5_stats_sum.dup_head_bytes);
      if (s3_bytes_before > s3_bytes_after && s3_bytes_after) {
        double dedup_ratio = (double)s3_bytes_before/s3_bytes_after;
        fmt->dump_float("dedup_ratio", dedup_ratio);
diff --git a/src/rgw/driver/rados/rgw_dedup_table.cc b/src/rgw/driver/rados/rgw_dedup_table.cc

index b27bf7353a6eedf63699c9bfa8f0792cdf369353..898a1304dfdaefda30dd3db13831993ef341091b 100644 (file)
--- a/src/rgw/driver/rados/rgw_dedup_table.cc
+++ b/src/rgw/driver/rados/rgw_dedup_table.cc
@@ -23,14 +23,14 @@ namespace rgw::dedup {
    dedup_table_t::dedup_table_t(const DoutPrefixProvider* _dpp,
                                 uint32_t _head_object_size,
                                 uint32_t _min_obj_size_for_dedup,
-                               uint32_t _max_obj_size_for_split,
+                               bool     _split_head,
                                 uint8_t *p_slab,
                                 uint64_t slab_size)
    {
      dpp = _dpp;
      head_object_size = _head_object_size;
      min_obj_size_for_dedup = _min_obj_size_for_dedup;
-    max_obj_size_for_split = _max_obj_size_for_split;
+    split_head = _split_head;
      memset(p_slab, 0, slab_size);
      hash_tab = (table_entry_t*)p_slab;
      entries_count = slab_size/sizeof(table_entry_t);
@@ -100,6 +100,9 @@ namespace rgw::dedup {
    }
  
    //---------------------------------------------------------------------------
+  // find_entry() assumes that entries are not removed while the table is in use.
+  // remove_entry() is only called from remove_singletons_and_redistribute_keys(),
+  // which does a linear pass over the array.
    uint32_t dedup_table_t::find_entry(const key_t *p_key) const
    {
      uint32_t idx = p_key->hash() % entries_count;
@@ -113,34 +116,19 @@ namespace rgw::dedup {
  
    //---------------------------------------------------------------------------
    void dedup_table_t::inc_counters(const key_t *p_key,
-                                   dedup_stats_t *p_small_objs,
-                                   dedup_stats_t *p_big_objs,
-                                   uint64_t *p_duplicate_head_bytes)
+                                   dedup_stats_t *p_dedup_stats)
    {
      // This is an approximation only since size is stored in 4KB resolution
      uint64_t byte_size_approx = disk_blocks_to_byte_size(p_key->size_4k_units);
  
-    // skip small single part objects which we can't dedup
-    if (!dedupable_object(p_key->multipart_object(), min_obj_size_for_dedup, byte_size_approx)) {
-      p_small_objs->duplicate_count ++;
-      p_small_objs->dedup_bytes_estimate += byte_size_approx;
-      return;
-    }
-    else {
-      uint64_t dup_bytes_approx = calc_deduped_bytes(head_object_size,
-                                                     min_obj_size_for_dedup,
-                                                     max_obj_size_for_split,
-                                                     p_key->num_parts,
-                                                     byte_size_approx);
-      p_big_objs->duplicate_count ++;
-      p_big_objs->dedup_bytes_estimate += dup_bytes_approx;
-
-      // object smaller than max_obj_size_for_split will split their head
-      // and won't dup it
-      if (!p_key->multipart_object() && byte_size_approx > max_obj_size_for_split) {
-        // single part objects duplicate the head object when dedup is used
-        *p_duplicate_head_bytes += head_object_size;
-      }
+    uint64_t dup_bytes_approx = calc_deduped_bytes(head_object_size,
+                                                   min_obj_size_for_dedup,
+                                                   split_head,
+                                                   p_key->num_parts,
+                                                   byte_size_approx);
+    if (dup_bytes_approx) {
+      p_dedup_stats->duplicate_count ++;
+      p_dedup_stats->dedup_bytes_estimate += dup_bytes_approx;
      }
    }
  
@@ -149,9 +137,7 @@ namespace rgw::dedup {
                                 disk_block_id_t block_id,
                                 record_id_t rec_id,
                                 bool shared_manifest,
-                               dedup_stats_t *p_small_objs,
-                               dedup_stats_t *p_big_objs,
-                               uint64_t *p_duplicate_head_bytes)
+                               dedup_stats_t *p_dedup_stats)
    {
      value_t new_val(block_id, rec_id, shared_manifest);
      uint32_t idx = find_entry(p_key);
@@ -172,7 +158,7 @@ namespace rgw::dedup {
      else {
        ceph_assert(hash_tab[idx].key == *p_key);
        if (val.count <= MAX_COPIES_PER_OBJ) {
-        inc_counters(p_key, p_small_objs, p_big_objs, p_duplicate_head_bytes);
+        inc_counters(p_key, p_dedup_stats);
        }
        if (val.count < std::numeric_limits<std::uint16_t>::max()) {
          val.count ++;
@@ -280,35 +266,19 @@ namespace rgw::dedup {
    }
  
    //---------------------------------------------------------------------------
-  void dedup_table_t::count_duplicates(dedup_stats_t *p_small_objs,
-                                       dedup_stats_t *p_big_objs)
+  void dedup_table_t::count_duplicates(dedup_stats_t *p_dedup_stats)
    {
      for (uint32_t tab_idx = 0; tab_idx < entries_count; tab_idx++) {
        if (!hash_tab[tab_idx].val.is_occupied()) {
          continue;
        }
  
-      const key_t &key = hash_tab[tab_idx].key;
-      // This is an approximation only since size is stored in 4KB resolution
-      uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units);
-
-      // skip small single part objects which we can't dedup
-      if (!dedupable_object(key.multipart_object(), min_obj_size_for_dedup, byte_size_approx)) {
-        if (hash_tab[tab_idx].val.is_singleton()) {
-          p_small_objs->singleton_count++;
-        }
-        else {
-          p_small_objs->unique_count ++;
-        }
+      if (hash_tab[tab_idx].val.is_singleton()) {
+        p_dedup_stats->singleton_count++;
        }
        else {
-        if (hash_tab[tab_idx].val.is_singleton()) {
-          p_big_objs->singleton_count++;
-        }
-        else {
-          ceph_assert(hash_tab[tab_idx].val.count > 1);
-          p_big_objs->unique_count ++;
-        }
+        ceph_assert(hash_tab[tab_idx].val.count > 1);
+        p_dedup_stats->unique_count ++;
        }
      }
    }
diff --git a/src/rgw/driver/rados/rgw_dedup_table.h b/src/rgw/driver/rados/rgw_dedup_table.h

index 82efc03480cb4330a3411f4c1ef4b0d762e1457e..4f26972a2cd72b5146704049bdeff951e99849b2 100644 (file)
--- a/src/rgw/driver/rados/rgw_dedup_table.h
+++ b/src/rgw/driver/rados/rgw_dedup_table.h
@@ -131,16 +131,14 @@ namespace rgw::dedup {
      dedup_table_t(const DoutPrefixProvider* _dpp,
                    uint32_t _head_object_size,
                    uint32_t _min_obj_size_for_dedup,
-                  uint32_t _max_obj_size_for_split,
+                  bool     _split_head,
                    uint8_t *p_slab,
                    uint64_t slab_size);
      int add_entry(key_t *p_key,
                    disk_block_id_t block_id,
                    record_id_t rec_id,
                    bool shared_manifest,
-                  dedup_stats_t *p_small_objs_stat,
-                  dedup_stats_t *p_big_objs_stat,
-                  uint64_t *p_duplicate_head_bytes);
+                  dedup_stats_t *p_dedup_stats);
  
      void update_entry(key_t *p_key, disk_block_id_t block_id, record_id_t rec_id,
                        bool shared_manifest);
@@ -159,9 +157,7 @@ namespace rgw::dedup {
                       bool set_shared_manifest_src,
                       bool set_has_valid_hash_src);
  
-    void count_duplicates(dedup_stats_t *p_small_objs_stat,
-                          dedup_stats_t *p_big_objs_stat);
-
+    void count_duplicates(dedup_stats_t *p_dedup_stats);
      void remove_singletons_and_redistribute_keys();
    private:
      // 32 Bytes unified entries
@@ -173,15 +169,13 @@ namespace rgw::dedup {
  
      uint32_t find_entry(const key_t *p_key) const;
      void     inc_counters(const key_t *p_key,
-                          dedup_stats_t *p_small_objs,
-                          dedup_stats_t *p_big_objs,
-                          uint64_t *p_duplicate_head_bytes);
+                          dedup_stats_t *p_dedup_stats);
  
      uint32_t       entries_count = 0;
      uint32_t       occupied_count = 0;
      uint32_t       head_object_size;
      uint32_t       min_obj_size_for_dedup;
-    uint32_t       max_obj_size_for_split;
+    bool           split_head;
      table_entry_t *hash_tab = nullptr;
  
      // stat counters
diff --git a/src/rgw/driver/rados/rgw_dedup_utils.cc b/src/rgw/driver/rados/rgw_dedup_utils.cc

index 74252a853950346028f9eea0896acb531cec1559..52fdfa2c04f30983088c742d906a4e630ce658a4 100644 (file)
--- a/src/rgw/driver/rados/rgw_dedup_utils.cc
+++ b/src/rgw/driver/rados/rgw_dedup_utils.cc
@@ -382,8 +382,6 @@ namespace rgw::dedup {
      this->ingress_corrupted_etag += other.ingress_corrupted_etag;
      this->ingress_skip_too_small_bytes += other.ingress_skip_too_small_bytes;
      this->ingress_skip_too_small += other.ingress_skip_too_small;
-    this->ingress_skip_too_small_64KB_bytes += other.ingress_skip_too_small_64KB_bytes;
-    this->ingress_skip_too_small_64KB += other.ingress_skip_too_small_64KB;
  
      return *this;
    }
@@ -440,13 +438,6 @@ namespace rgw::dedup {
                           this->ingress_skip_too_small);
          f->dump_unsigned("Ingress skip: too small bytes",
                           this->ingress_skip_too_small_bytes);
-
-        if(this->ingress_skip_too_small_64KB) {
-          f->dump_unsigned("Ingress skip: 64KB<=size<=4MB Obj",
-                           this->ingress_skip_too_small_64KB);
-          f->dump_unsigned("Ingress skip: 64KB<=size<=4MB Bytes",
-                           this->ingress_skip_too_small_64KB_bytes);
-        }
        }
      }
  
@@ -499,9 +490,6 @@ namespace rgw::dedup {
      encode(w.ingress_skip_too_small_bytes, bl);
      encode(w.ingress_skip_too_small, bl);
  
-    encode(w.ingress_skip_too_small_64KB_bytes, bl);
-    encode(w.ingress_skip_too_small_64KB, bl);
-
      encode(w.duration, bl);
      ENCODE_FINISH(bl);
    }
@@ -528,8 +516,6 @@ namespace rgw::dedup {
      decode(w.ingress_corrupted_etag, bl);
      decode(w.ingress_skip_too_small_bytes, bl);
      decode(w.ingress_skip_too_small, bl);
-    decode(w.ingress_skip_too_small_64KB_bytes, bl);
-    decode(w.ingress_skip_too_small_64KB, bl);
  
      decode(w.duration, bl);
      DECODE_FINISH(bl);
@@ -538,7 +524,6 @@ namespace rgw::dedup {
    //---------------------------------------------------------------------------
    md5_stats_t& md5_stats_t::operator+=(const md5_stats_t& other)
    {
-    this->small_objs_stat               += other.small_objs_stat;
      this->big_objs_stat                 += other.big_objs_stat;
      this->ingress_slabs                 += other.ingress_slabs;
      this->ingress_failed_load_bucket    += other.ingress_failed_load_bucket;
@@ -551,6 +536,8 @@ namespace rgw::dedup {
      this->ingress_skip_compressed       += other.ingress_skip_compressed;
      this->ingress_skip_compressed_bytes += other.ingress_skip_compressed_bytes;
      this->ingress_skip_changed_objs     += other.ingress_skip_changed_objs;
+    this->ingress_skip_explicit_objs    += other.ingress_skip_explicit_objs;
+    this->ingress_skip_alibaba          += other.ingress_skip_alibaba;
      this->shared_manifest_dedup_bytes   += other.shared_manifest_dedup_bytes;
  
      this->skipped_shared_manifest += other.skipped_shared_manifest;
@@ -591,10 +578,8 @@ namespace rgw::dedup {
      this->set_shared_manifest_src += other.set_shared_manifest_src;
      this->loaded_objects          += other.loaded_objects;
      this->processed_objects       += other.processed_objects;
-    this->dup_head_bytes_estimate += other.dup_head_bytes_estimate;
      this->deduped_objects         += other.deduped_objects;
      this->deduped_objects_bytes   += other.deduped_objects_bytes;
-    this->dup_head_bytes          += other.dup_head_bytes;
  
      this->failed_dedup            += other.failed_dedup;
      this->md_throttle_sleep_events    += other.md_throttle_sleep_events;
@@ -628,7 +613,6 @@ namespace rgw::dedup {
        f->dump_unsigned("Set Shared-Manifest SRC", this->set_shared_manifest_src);
        f->dump_unsigned("Deduped Obj (this cycle)", this->deduped_objects);
        f->dump_unsigned("Deduped Bytes(this cycle)", this->deduped_objects_bytes);
-      f->dump_unsigned("Dup head bytes (not dedup)", this->dup_head_bytes);
        f->dump_unsigned("Already Deduped bytes (prev cycles)",
                         this->shared_manifest_dedup_bytes);
  
@@ -639,21 +623,6 @@ namespace rgw::dedup {
        f->dump_unsigned("Dedup Bytes Estimate", ds.dedup_bytes_estimate);
      }
  
-    // Potential Dedup Section:
-    // What could be gained by allowing dedup for smaller objects (64KB-4MB)
-    // Space wasted because of duplicated head-object (4MB)
-    {
-      Formatter::ObjectSection potential(*f, "Potential Dedup");
-      const dedup_stats_t &ds = this->small_objs_stat;
-      f->dump_unsigned("Singleton Obj (64KB-4MB)", ds.singleton_count);
-      f->dump_unsigned("Unique Obj (64KB-4MB)", ds.unique_count);
-      f->dump_unsigned("Duplicate Obj (64KB-4MB)", ds.duplicate_count);
-      f->dump_unsigned("Dedup Bytes Estimate (64KB-4MB)", ds.dedup_bytes_estimate);
-      f->dump_unsigned("Duplicated Head Bytes Estimate",
-                       this->dup_head_bytes_estimate);
-      f->dump_unsigned("Duplicated Head Bytes", this->dup_head_bytes);
-    }
-
      {
        Formatter::ObjectSection notify(*f, "notify");
        if (this->md_throttle_sleep_events) {
@@ -726,6 +695,12 @@ namespace rgw::dedup {
        if (this->ingress_skip_changed_objs) {
          f->dump_unsigned("Skipped Changed Object", this->ingress_skip_changed_objs);
        }
+      if (this->ingress_skip_explicit_objs) {
+        f->dump_unsigned("Skipped Explicit Objs", this->ingress_skip_explicit_objs);
+      }
+      if (this->ingress_skip_alibaba) {
+        f->dump_unsigned("Skipped Alibaba Cloud OSS", this->ingress_skip_alibaba);
+      }
      }
  
      {
@@ -814,7 +789,6 @@ namespace rgw::dedup {
    {
      ENCODE_START(1, 1, bl);
  
-    encode(m.small_objs_stat, bl);
      encode(m.big_objs_stat, bl);
      encode(m.ingress_slabs, bl);
      encode(m.ingress_failed_load_bucket, bl);
@@ -827,6 +801,8 @@ namespace rgw::dedup {
      encode(m.ingress_skip_compressed, bl);
      encode(m.ingress_skip_compressed_bytes, bl);
      encode(m.ingress_skip_changed_objs, bl);
+    encode(m.ingress_skip_explicit_objs, bl);
+    encode(m.ingress_skip_alibaba, bl);
      encode(m.shared_manifest_dedup_bytes, bl);
  
      encode(m.skipped_shared_manifest, bl);
@@ -867,10 +843,8 @@ namespace rgw::dedup {
  
      encode(m.loaded_objects, bl);
      encode(m.processed_objects, bl);
-    encode(m.dup_head_bytes_estimate, bl);
      encode(m.deduped_objects, bl);
      encode(m.deduped_objects_bytes, bl);
-    encode(m.dup_head_bytes, bl);
      encode(m.failed_dedup, bl);
      encode(m.md_throttle_sleep_events, bl);
      encode(m.md_throttle_sleep_time_usec, bl);
@@ -885,7 +859,6 @@ namespace rgw::dedup {
    void decode(md5_stats_t& m, ceph::bufferlist::const_iterator& bl)
    {
      DECODE_START(1, bl);
-    decode(m.small_objs_stat, bl);
      decode(m.big_objs_stat, bl);
      decode(m.ingress_slabs, bl);
      decode(m.ingress_failed_load_bucket, bl);
@@ -898,6 +871,8 @@ namespace rgw::dedup {
      decode(m.ingress_skip_compressed, bl);
      decode(m.ingress_skip_compressed_bytes, bl);
      decode(m.ingress_skip_changed_objs, bl);
+    decode(m.ingress_skip_explicit_objs, bl);
+    decode(m.ingress_skip_alibaba, bl);
      decode(m.shared_manifest_dedup_bytes, bl);
  
      decode(m.skipped_shared_manifest, bl);
@@ -938,10 +913,8 @@ namespace rgw::dedup {
  
      decode(m.loaded_objects, bl);
      decode(m.processed_objects, bl);
-    decode(m.dup_head_bytes_estimate, bl);
      decode(m.deduped_objects, bl);
      decode(m.deduped_objects_bytes, bl);
-    decode(m.dup_head_bytes, bl);
      decode(m.failed_dedup, bl);
      decode(m.md_throttle_sleep_events, bl);
      decode(m.md_throttle_sleep_time_usec, bl);
diff --git a/src/rgw/driver/rados/rgw_dedup_utils.h b/src/rgw/driver/rados/rgw_dedup_utils.h

index 579e048a259f0bd0f8650a8bc64ff6d5f25d005a..6a7f508cc36eee11e6c7f83e6c8dbcbe99f1d3e8 100644 (file)
--- a/src/rgw/driver/rados/rgw_dedup_utils.h
+++ b/src/rgw/driver/rados/rgw_dedup_utils.h
@@ -198,9 +198,6 @@ namespace rgw::dedup {
      uint64_t ingress_skip_too_small_bytes = 0;
      uint64_t ingress_skip_too_small = 0;
  
-    uint64_t ingress_skip_too_small_64KB_bytes = 0;
-    uint64_t ingress_skip_too_small_64KB = 0;
-
      utime_t  duration = {0, 0};
    };
    std::ostream& operator<<(std::ostream &out, const worker_stats_t &s);
@@ -212,7 +209,6 @@ namespace rgw::dedup {
      md5_stats_t& operator +=(const md5_stats_t& other);
      void dump(Formatter *f) const;
  
-    dedup_stats_t small_objs_stat;
      dedup_stats_t big_objs_stat;
      uint64_t ingress_slabs = 0;
      uint64_t ingress_failed_load_bucket = 0;
@@ -225,6 +221,8 @@ namespace rgw::dedup {
      uint64_t ingress_skip_compressed = 0;
      uint64_t ingress_skip_compressed_bytes = 0;
      uint64_t ingress_skip_changed_objs = 0;
+    uint64_t ingress_skip_explicit_objs = 0;
+    uint64_t ingress_skip_alibaba = 0;
  
      uint64_t shared_manifest_dedup_bytes = 0;
      uint64_t skipped_shared_manifest = 0;
@@ -265,11 +263,9 @@ namespace rgw::dedup {
      uint64_t loaded_objects = 0;
      uint64_t processed_objects = 0;
      // counter is using on-disk size affected by block-size
-    uint64_t dup_head_bytes_estimate = 0; //duplicate_head_bytes
      uint64_t deduped_objects = 0;
      // counter is using s3 byte size disregarding the on-disk size affected by block-size
      uint64_t deduped_objects_bytes = 0;
-    uint64_t dup_head_bytes = 0;
      uint64_t failed_dedup = 0;
      uint64_t md_throttle_sleep_events = 0;
      uint64_t md_throttle_sleep_time_usec = 0;
@@ -368,7 +364,7 @@ namespace rgw::dedup {
    //---------------------------------------------------------------------------
    static inline uint64_t calc_deduped_bytes(uint32_t head_obj_size,
                                              uint32_t min_obj_size_for_dedup,
-                                            uint32_t max_obj_size_for_split,
+                                            bool     split_head,
                                              uint16_t num_parts,
                                              uint64_t size_bytes)
    {
@@ -376,18 +372,16 @@ namespace rgw::dedup {
        // multipart objects with an empty head i.e. we achive full dedup
        return size_bytes;
      }
+    else if (size_bytes < min_obj_size_for_dedup) {
+      return 0;
+    }
+    else if (split_head) {
+      // Head is split into an empty obj and a new tail, enabling a full dedup
+      return size_bytes;
+    }
      else {
-      // reduce the head size
-      if (size_bytes > max_obj_size_for_split) {
-        return size_bytes - head_obj_size;
-      }
-      else if (size_bytes >= min_obj_size_for_dedup) {
-        // Head is splitted into an empty obj and a new tail enabling a full dedup
-        return size_bytes;
-      }
-      else {
-        return 0;
-      }
+      // subtract the head size, which is not deduped
+      return size_bytes - std::min(size_bytes, (uint64_t)head_obj_size);
      }
    }
  
diff --git a/src/rgw/driver/rados/rgw_obj_manifest.cc b/src/rgw/driver/rados/rgw_obj_manifest.cc

index d423c115d4de692713b74d458a619be5a8e142de..dd24390247ea1d4e5f740c28e6d16e64481d0f6c 100644 (file)
--- a/src/rgw/driver/rados/rgw_obj_manifest.cc
+++ b/src/rgw/driver/rados/rgw_obj_manifest.cc
@@ -181,13 +181,13 @@ int RGWObjManifest::append_explicit(const DoutPrefixProvider *dpp, RGWObjManifes
    return 0;
  }
  
-bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule)
+bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule) const
  {
    if (rules.empty()) {
      return false;
    }
  
-  map<uint64_t, RGWObjManifestRule>::iterator iter = rules.upper_bound(ofs);
+  map<uint64_t, RGWObjManifestRule>::const_iterator iter = rules.upper_bound(ofs);
    if (iter != rules.begin()) {
      --iter;
    }
diff --git a/src/rgw/driver/rados/rgw_obj_manifest.h b/src/rgw/driver/rados/rgw_obj_manifest.h

index 4129a015c98b103aad7d2625a5397b70e6e06b31..966db3add84fbaef0955bc2b706c3c9386301e6a 100644 (file)
--- a/src/rgw/driver/rados/rgw_obj_manifest.h
+++ b/src/rgw/driver/rados/rgw_obj_manifest.h
@@ -254,7 +254,11 @@ public:
    }
  
    void get_implicit_location(uint64_t cur_part_id, uint64_t cur_stripe, uint64_t ofs,
-                             std::string *override_prefix, rgw_obj_select *location) const;
+                             const std::string *override_prefix, rgw_obj_select *location) const;
+
+  const std::map<uint64_t, RGWObjManifestRule>& get_rules() const {
+    return rules;
+  }
  
    void clear_rules() {
      rules.clear();
@@ -383,7 +387,7 @@ public:
    int append(const DoutPrefixProvider *dpp, RGWObjManifest& m, const RGWZoneGroup& zonegroup,
               const RGWZoneParams& zone_params);
  
-  bool get_rule(uint64_t ofs, RGWObjManifestRule *rule);
+  bool get_rule(uint64_t ofs, RGWObjManifestRule *rule) const;
  
    bool empty() const {
      if (explicit_objs)
diff --git a/src/rgw/rgw_obj_manifest.cc b/src/rgw/rgw_obj_manifest.cc

index 042c97aa13cde16226768665df7551bdd4f4123f..418c913b82a8c765c2e7e2ebe23a60229b4c1286 100644 (file)
--- a/src/rgw/rgw_obj_manifest.cc
+++ b/src/rgw/rgw_obj_manifest.cc
@@ -207,7 +207,7 @@ void RGWObjManifest::obj_iterator::update_location()
  }
  
  void RGWObjManifest::get_implicit_location(uint64_t cur_part_id, uint64_t cur_stripe,
-                                           uint64_t ofs, string *override_prefix, rgw_obj_select *location) const
+                                           uint64_t ofs, const string *override_prefix, rgw_obj_select *location) const
  {
    rgw_obj loc;
  
diff --git a/src/test/rgw/dedup/test_dedup.py b/src/test/rgw/dedup/test_dedup.py

index 40f82b862748301a3c7575ea2d53e7633cff16f7..25177a10ebb1f39d5882db6a8f6500e9c42f7380 100644 (file)
--- a/src/test/rgw/dedup/test_dedup.py
+++ b/src/test/rgw/dedup/test_dedup.py
@@ -51,10 +51,6 @@ class Dedup_Stats:
      duplicate_obj : int = 0
      deduped_obj_bytes : int = 0
      non_default_storage_class_objs_bytes : int = 0
-    potential_singleton_obj : int = 0
-    potential_unique_obj : int = 0
-    potential_duplicate_obj : int = 0
-    potential_dedup_space : int = 0
  
  @dataclass
  class Dedup_Ratio:
@@ -280,9 +276,8 @@ def create_buckets(conn, max_copies_count):
  OUT_DIR="/tmp/dedup/"
  KB=(1024)
  MB=(1024*KB)
-POTENTIAL_OBJ_SIZE=(64*KB)
  DEDUP_MIN_OBJ_SIZE=(64*KB)
-SPLIT_HEAD_SIZE=(4*MB)
+SPLIT_HEAD=True
  RADOS_OBJ_SIZE=(4*MB)
  # The default multipart threshold size for S3cmd is 15 MB.
  MULTIPART_SIZE=(15*MB)
@@ -638,17 +633,16 @@ def calc_head_size(obj_size, config):
  def calc_dedupable_space(obj_size, config):
      on_disk_byte_size = calc_on_disk_byte_size(obj_size)
  
-    threshold = config.multipart_threshold
      # Objects with size bigger than MULTIPART_SIZE are uploaded as multi-part
      # multi-part objects got a zero size Head objects
-    if obj_size >= threshold:
+    if obj_size >= config.multipart_threshold:
          dedupable_space = on_disk_byte_size
-    elif obj_size > SPLIT_HEAD_SIZE:
-        dedupable_space = on_disk_byte_size - RADOS_OBJ_SIZE
-    elif obj_size >= DEDUP_MIN_OBJ_SIZE:
+    elif obj_size < DEDUP_MIN_OBJ_SIZE:
+        dedupable_space = 0
+    elif SPLIT_HEAD:
          dedupable_space = on_disk_byte_size
      else:
-        dedupable_space = 0
+        dedupable_space = (on_disk_byte_size - min(on_disk_byte_size, RADOS_OBJ_SIZE))
  
      log.debug("obj_size=%.2f MiB, dedupable_space=%.2f MiB",
                float(obj_size)/MB, float(dedupable_space)/MB)
@@ -659,7 +653,7 @@ def calc_split_objs_count(obj_size, num_copies, config):
      threshold = config.multipart_threshold
      on_disk_byte_size = calc_on_disk_byte_size(obj_size)
  
-    if num_copies < 2 or on_disk_byte_size > SPLIT_HEAD_SIZE or obj_size >= threshold:
+    if num_copies < 2 or not SPLIT_HEAD or obj_size >= threshold:
          return 0
  
      if on_disk_byte_size < DEDUP_MIN_OBJ_SIZE:
@@ -680,15 +674,6 @@ def calc_expected_stats(dedup_stats, obj_size, num_copies, config):
      if on_disk_byte_size < DEDUP_MIN_OBJ_SIZE and threshold > DEDUP_MIN_OBJ_SIZE:
          dedup_stats.skip_too_small += num_copies
          dedup_stats.skip_too_small_bytes += (on_disk_byte_size * num_copies)
-
-        if on_disk_byte_size >= POTENTIAL_OBJ_SIZE:
-            if num_copies == 1:
-                dedup_stats.potential_singleton_obj += 1
-            else:
-                dedup_stats.potential_unique_obj += 1
-                dedup_stats.potential_duplicate_obj += dups_count
-                dedup_stats.potential_dedup_space += (on_disk_byte_size * dups_count)
-
          return
  
      dedup_stats.total_processed_objects += num_copies
@@ -1399,12 +1384,6 @@ def read_dedup_stats(dry_run):
          dedup_stats.duplicate_obj = main['Duplicate Obj']
          dedup_stats.dedup_bytes_estimate = main['Dedup Bytes Estimate']
  
-        potential = md5_stats['Potential Dedup']
-        dedup_stats.potential_singleton_obj = potential['Singleton Obj (64KB-4MB)']
-        dedup_stats.potential_unique_obj = potential['Unique Obj (64KB-4MB)']
-        dedup_stats.potential_duplicate_obj = potential['Duplicate Obj (64KB-4MB)']
-        dedup_stats.potential_dedup_space = potential['Dedup Bytes Estimate (64KB-4MB)']
-
      dedup_work_was_completed=jstats['completed']
      if dedup_work_was_completed:
          dedup_ratio_estimate=read_dedup_ratio(jstats, 'dedup_ratio_estimate')
@@ -1486,11 +1465,6 @@ def exec_dedup(expected_dedup_stats, dry_run, verify_stats=True, post_dedup_size
      if verify_stats == False:
          return ret
  
-    if dedup_stats.potential_unique_obj or expected_dedup_stats.potential_unique_obj:
-        log.debug("potential_unique_obj= %d / %d ", dedup_stats.potential_unique_obj,
-                  expected_dedup_stats.potential_unique_obj)
-
-
      #dedup_stats.set_hash = dedup_stats.invalid_hash
      if dedup_stats != expected_dedup_stats:
          log.debug("==================================================")
@@ -1513,14 +1487,6 @@ def prepare_test():
  
      os.mkdir(OUT_DIR)
  
-#-------------------------------------------------------------------------------
-def copy_potential_stats(new_dedup_stats, dedup_stats):
-    new_dedup_stats.potential_singleton_obj = dedup_stats.potential_singleton_obj
-    new_dedup_stats.potential_unique_obj    = dedup_stats.potential_unique_obj
-    new_dedup_stats.potential_duplicate_obj = dedup_stats.potential_duplicate_obj
-    new_dedup_stats.potential_dedup_space   = dedup_stats.potential_dedup_space
-
-
  #-------------------------------------------------------------------------------
  def small_single_part_objs_dedup(conn, bucket_name, dry_run):
      # 1) generate small random files and store them on disk
@@ -1547,8 +1513,6 @@ def small_single_part_objs_dedup(conn, bucket_name, dry_run):
  
          # expected stats for small objects - all zeros except for skip_too_small
          small_objs_dedup_stats = Dedup_Stats()
-        #small_objs_dedup_stats.loaded_objects=dedup_stats.loaded_objects
-        copy_potential_stats(small_objs_dedup_stats, dedup_stats)
          small_objs_dedup_stats.size_before_dedup = dedup_stats.size_before_dedup
          small_objs_dedup_stats.skip_too_small_bytes=dedup_stats.size_before_dedup
          small_objs_dedup_stats.skip_too_small = s3_objects_total
@@ -1897,6 +1861,8 @@ def test_dedup_with_versions():
      min_size=1*KB
      max_size=MULTIPART_SIZE*2
      success=False
+    # Declare the variable with a type hint
+    conn: BaseClient
      try:
          conn=get_single_connection()
          conn.create_bucket(Bucket=bucket_name)
@@ -2415,8 +2381,6 @@ def test_dedup_small_with_tenants():
  
          # expected stats for small objects - all zeros except for skip_too_small
          small_objs_dedup_stats = Dedup_Stats()
-        #small_objs_dedup_stats.loaded_objects=dedup_stats.loaded_objects
-        copy_potential_stats(small_objs_dedup_stats, dedup_stats)
          small_objs_dedup_stats.size_before_dedup=dedup_stats.size_before_dedup
          small_objs_dedup_stats.skip_too_small_bytes=dedup_stats.size_before_dedup
          small_objs_dedup_stats.skip_too_small=s3_objects_total
@@ -3320,7 +3284,6 @@ def test_dedup_dry_small_with_tenants():
  
          # expected stats for small objects - all zeros except for skip_too_small
          small_objs_dedup_stats = Dedup_Stats()
-        copy_potential_stats(small_objs_dedup_stats, dedup_stats)
          small_objs_dedup_stats.size_before_dedup=dedup_stats.size_before_dedup
          small_objs_dedup_stats.skip_too_small_bytes=dedup_stats.size_before_dedup
          small_objs_dedup_stats.skip_too_small=s3_objects_total
@@ -3693,3 +3656,4 @@ def test_dedup_identical_copies_multipart_small():
      force_clean=True
      log.info("test_dedup_identical_copies_multipart:full test")
      __test_dedup_identical_copies(files, config, dry_run, verify, force_clean)
+
author	benhanokh <gbenhano@redhat.com>
	Mon, 30 Mar 2026 08:22:51 +0000 (11:22 +0300)
committer	benhanokh <gbenhano@redhat.com>
	Thu, 16 Apr 2026 08:24:10 +0000 (11:24 +0300)
doc/radosgw/s3_objects_dedup.rst		patch \| blob \| history
src/common/options/rgw.yaml.in		patch \| blob \| history
src/rgw/driver/rados/rgw_dedup.cc		patch \| blob \| history
src/rgw/driver/rados/rgw_dedup.h		patch \| blob \| history
src/rgw/driver/rados/rgw_dedup_cluster.cc		patch \| blob \| history
src/rgw/driver/rados/rgw_dedup_table.cc		patch \| blob \| history
src/rgw/driver/rados/rgw_dedup_table.h		patch \| blob \| history
src/rgw/driver/rados/rgw_dedup_utils.cc		patch \| blob \| history
src/rgw/driver/rados/rgw_dedup_utils.h		patch \| blob \| history
src/rgw/driver/rados/rgw_obj_manifest.cc		patch \| blob \| history
src/rgw/driver/rados/rgw_obj_manifest.h		patch \| blob \| history
src/rgw/rgw_obj_manifest.cc		patch \| blob \| history
src/test/rgw/dedup/test_dedup.py		patch \| blob \| history