git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
rgw/dedup: This PR extends the RGW dedup split-head feature to support objects that... 68113/head
author benhanokh <gbenhano@redhat.com>
Mon, 30 Mar 2026 08:22:51 +0000 (11:22 +0300)
committer benhanokh <gbenhano@redhat.com>
Thu, 16 Apr 2026 08:24:10 +0000 (11:24 +0300)
Previously, split-head was restricted to objects whose entire data fit in the head (≤4 MiB).
This change also migrates the split-head manifest representation from the legacy explicit-objs format to the prefix+index rules-based format.

Refactored should_split_head():
Now performs a larger set of eligibility checks:
 * d_split_head flag is set
 * single-part object only
 * non-empty head
 * not a legacy manifest
 * not an Alibaba Cloud OSS AppendObject

Explicit skips for unsupported manifest types:
 — old-style explicit-objs manifests
 — OSS AppendObject manifests (detected via non-empty override_prefix)

New config option: rgw_dedup_split_obj_head:
  Default is true (split-head enabled).
  Setting to false disables split-head entirely.

Tail object lookup via manifest iterator:
  Replaces the old get_tail_ioctx(), which manually constructed the tail OID via generate_split_head_tail_name().
  The new get_first_tail_obj_params() simply calls manifest.obj_begin() and resolves the first tail object location through the standard manifest iterator.

Stats cleanup:
Removed the "Potential Dedup" stats section (small_objs_stat, dup_head_bytes, dup_head_bytes_estimate, ingress_skip_too_small_64KB*)
 which tracked 64KB–4MB objects as potential-but-skipped candidates.
 Since split-head now covers all sizes, this distinction is no longer meaningful. calc_deduped_bytes() is simplified accordingly.

Signed-off-by: benhanokh <gbenhano@redhat.com>
13 files changed:
doc/radosgw/s3_objects_dedup.rst
src/common/options/rgw.yaml.in
src/rgw/driver/rados/rgw_dedup.cc
src/rgw/driver/rados/rgw_dedup.h
src/rgw/driver/rados/rgw_dedup_cluster.cc
src/rgw/driver/rados/rgw_dedup_table.cc
src/rgw/driver/rados/rgw_dedup_table.h
src/rgw/driver/rados/rgw_dedup_utils.cc
src/rgw/driver/rados/rgw_dedup_utils.h
src/rgw/driver/rados/rgw_obj_manifest.cc
src/rgw/driver/rados/rgw_obj_manifest.h
src/rgw/rgw_obj_manifest.cc
src/test/rgw/dedup/test_dedup.py

index fe83124d1549f61752617ce1ba243225b28424b1..7de91ed00071773c9458a8f69bf95173646d98dd 100644 (file)
@@ -108,19 +108,21 @@ matches. If they are, we proceed with the deduplication:
 - Copy the manifest from the source to the target.
 - Remove all tail objects on the target.
 
-
 Split Head Mode
 ===============
 
-Dedup code can split the head object into 2 objects
+The dedup code can split a head object into 2 objects:
 
-- one with attributes and no data and
+- one with attributes and no data, and
 - a new tail object with only data.
 
-The new tail object will be deduped, unlike the head objects, which cannot
+The new tail object will be deduped, unlike head objects, which cannot
 be deduplicated.
-This feature is only enabled for RGW objects without existing tail objects
-(in other words, objects sized 4 MB or less).
+
+:confval:`rgw_dedup_split_obj_head` (default: true). Setting
+this option to ``false`` disables split-head entirely.
+
+.. confval:: rgw_dedup_split_obj_head
 
 
 Memory Usage
index 6a85b076ca5f826bca6ab4782fd00d790dfc20e7..57000c5154b53618b4ddd61a8a75e90d3fa7e87c 100644 (file)
@@ -81,6 +81,18 @@ options:
   default: false
   services:
   - rgw
+- name: rgw_dedup_split_obj_head
+  type: bool
+  level: advanced
+  desc: Enables the split-head functionality
+  long_desc: Dedup code can split head object into two objects -
+             one with attributes and no data and a new tail-object with only data.
+             The new-tail object will be deduped (unlike the head objects which
+             can't be deduplicated)
+  default: true
+  services:
+  - rgw
+  with_legacy: true
 - name: rgw_dedup_min_obj_size_for_dedup
   type: size
   level: advanced
index 2446d43ff92d4a1bddeccd1c0d2e3f2f7aa834ac..67d59370aa9a189679ac77d011e574ef43f7b362 100644 (file)
@@ -413,12 +413,10 @@ namespace rgw::dedup {
   {
     d_head_object_size = cct->_conf->rgw_max_chunk_size;
     d_min_obj_size_for_dedup = cct->_conf->rgw_dedup_min_obj_size_for_dedup;
-
-    // limit split head to objects without tail
-    d_max_obj_size_for_split = d_head_object_size;
+    d_split_head = cct->_conf->rgw_dedup_split_obj_head;
     ldpp_dout(dpp, 10) << "Config Vals::d_head_object_size=" << d_head_object_size
                        << "::d_min_obj_size_for_dedup=" << d_min_obj_size_for_dedup
-                       << "::d_max_obj_size_for_split=" << d_max_obj_size_for_split
+                       << "::d_split_head=" << d_split_head
                        << dendl;
 
     int ret = init_rados_access_handles(false);
@@ -435,11 +433,8 @@ namespace rgw::dedup {
   //------------------------------------------------------------------------------
   uint64_t Background::__calc_deduped_bytes(uint16_t num_parts, uint64_t size_bytes)
   {
-    return calc_deduped_bytes(d_head_object_size,
-                              d_min_obj_size_for_dedup,
-                              d_max_obj_size_for_split,
-                              num_parts,
-                              size_bytes);
+    return calc_deduped_bytes(d_head_object_size, d_min_obj_size_for_dedup,
+                              d_split_head, num_parts, size_bytes);
   }
 
   //---------------------------------------------------------------------------
@@ -495,8 +490,7 @@ namespace rgw::dedup {
                        << p_rec->s.md5_low << std::dec << dendl;
 
     int ret = p_table->add_entry(&key, block_id, rec_id, has_shared_manifest,
-                                 &p_stats->small_objs_stat, &p_stats->big_objs_stat,
-                                 &p_stats->dup_head_bytes_estimate);
+                                 &p_stats->big_objs_stat);
     if (ret == 0) {
       p_stats->loaded_objects ++;
       ldpp_dout(dpp, 20) << __func__ << "::" << p_rec->bucket_name << "/"
@@ -544,15 +538,14 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  static int get_ioctx_internal(const DoutPrefixProvider* const dpp,
-                                rgw::sal::Driver* driver,
-                                rgw::sal::RadosStore* store,
-                                const std::string &obj_name,
-                                const std::string &instance,
-                                const rgw_bucket &rb,
-                                librados::IoCtx *p_ioctx /*OUT*/,
-                                std::string *p_oid /*OUT*/)
+  static inline int get_ioctx(const DoutPrefixProvider* const dpp,
+                              rgw::sal::Driver* driver,
+                              rgw::sal::RadosStore* store,
+                              const disk_record_t *p_rec,
+                              librados::IoCtx *p_ioctx /*OUT*/,
+                              std::string *p_oid /*OUT*/)
   {
+    rgw_bucket rb{p_rec->tenant_name, p_rec->bucket_name, p_rec->bucket_id};
     unique_ptr<rgw::sal::Bucket> bucket;
     {
       int ret = driver->load_bucket(dpp, rb, &bucket, null_yield);
@@ -564,44 +557,12 @@ namespace rgw::dedup {
     }
 
     string dummy_locator;
-    const rgw_obj_index_key key(obj_name, instance);
+    const rgw_obj_index_key key(p_rec->obj_name, p_rec->instance);
     rgw_obj obj(bucket->get_key(), key);
     get_obj_bucket_and_oid_loc(obj, *p_oid, dummy_locator);
     RGWBucketInfo& bucket_info = bucket->get_info();
     return store->get_obj_head_ioctx(dpp, bucket_info, obj, p_ioctx);
-  }
 
-  //---------------------------------------------------------------------------
-  static inline int get_ioctx(const DoutPrefixProvider* const dpp,
-                              rgw::sal::Driver* driver,
-                              rgw::sal::RadosStore* store,
-                              const disk_record_t *p_rec,
-                              librados::IoCtx *p_ioctx /*OUT*/,
-                              std::string *p_oid /*OUT*/)
-  {
-    rgw_bucket b{p_rec->tenant_name, p_rec->bucket_name, p_rec->bucket_id};
-    return get_ioctx_internal(dpp, driver, store, p_rec->obj_name, p_rec->instance,
-                              b, p_ioctx, p_oid);
-  }
-
-  //---------------------------------------------------------------------------
-  static inline std::string generate_split_head_tail_name(const RGWObjManifest &manifest)
-  {
-    static constexpr std::string_view shadow_string(RGW_OBJ_NS_SHADOW);
-    std::string_view suffix = "0";
-    const std::string &prefix = manifest.get_prefix();
-
-    std::string tail_name;
-    tail_name.reserve(shadow_string.size() + prefix.size() + suffix.size() + 1);
-    // TBD:
-    // it is unclear when RGW code pads with "_" before the shadow string
-    // It won't change correctness, but might look weird
-    //tail_name.append("_");
-    tail_name.append(shadow_string);
-    tail_name.append("_");
-    tail_name.append(prefix);
-    tail_name.append(suffix);
-    return tail_name;
   }
 
   //---------------------------------------------------------------------------
@@ -611,43 +572,37 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  int Background::get_tail_ioctx(const disk_record_t *p_rec,
-                                 const RGWObjManifest &manifest,
-                                 const std::string &tail_name,
-                                 md5_stats_t *p_stats /*IN-OUT*/,
-                                 librados::IoCtx *p_ioctx /*OUT*/,
-                                 std::string *p_oid /*OUT*/)
+  static int get_first_tail_obj_params(const DoutPrefixProvider *dpp,
+                                       RGWRados *rados,
+                                       const RGWObjManifest &manifest,
+                                       librados::IoCtx *p_tail_ioctx, /*OUT*/
+                                       std::string *p_tail_oid /*OUT*/)
   {
-    const rgw_bucket_placement &tail_placement = manifest.get_tail_placement();
-    // Tail placement_rule was fixed before committed to SLAB, if looks bad -> abort
-    if (unlikely(invalid_tail_placement(tail_placement))) {
-      p_stats->split_head_no_tail_placement++;
-      ldpp_dout(dpp, 1) << __func__ << "::invalid_tail_placement -> abort" << dendl;
-      return -EINVAL;
-    }
-
-    const rgw_bucket& bucket = tail_placement.bucket;
-    // tail objects might be on another storage_class/pool, need another ioctx
-    int ret = get_ioctx_internal(dpp, driver, store, tail_name, p_rec->instance,
-                                 bucket, p_ioctx, p_oid);
-    if (unlikely(ret != 0)) {
-      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed get_ioctx_internal()" << dendl;
+    auto p = manifest.obj_begin(dpp);
+    const rgw_obj_select& os = p.get_location();
+    rgw_raw_obj raw_obj = os.get_raw_obj(rados);
+    rgw_rados_ref obj;
+    int ret = rgw_get_rados_ref(dpp, rados->get_rados_handle(), raw_obj, &obj);
+    if (ret < 0) {
+      ldpp_dout(dpp, 1) << __func__ << "::failed rgw_get_rados_ref() for oid="
+                        << raw_obj.oid << ", err is " << cpp_strerror(-ret) << dendl;
       return ret;
     }
 
+    *p_tail_ioctx = obj.ioctx;
+    *p_tail_oid   = raw_obj.oid;
+
     return 0;
   }
 
   //---------------------------------------------------------------------------
   void Background::remove_created_tail_object(const disk_record_t *p_rec,
                                               const RGWObjManifest &manifest,
-                                              const std::string &tail_name,
                                               md5_stats_t *p_stats /*IN-OUT*/)
   {
     librados::IoCtx tail_ioctx;
     std::string tail_oid;
-    int ret = get_tail_ioctx(p_rec, manifest, tail_name, p_stats, &tail_ioctx,
-                             &tail_oid);
+    int ret = get_first_tail_obj_params(dpp, rados, manifest, &tail_ioctx, &tail_oid);
     if (unlikely(ret != 0)) {
       return;
     }
@@ -665,10 +620,41 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  inline bool Background::should_split_head(uint64_t head_size, uint64_t obj_size)
+  inline bool Background::should_split_head(const RGWObjManifest& manifest)
   {
-    // Don't split RGW objects with existing tail-objects
-    return (head_size > 0 && head_size == obj_size);
+    // Split-head is only applicable for single-part objects with a non-empty head.
+    // To avoid issues with manifests created via append (specifically for Alibaba Cloud OSS),
+    //    we should disable split-head whenever the manifest contains an override_prefix in the rules.
+    //    We also reject manifests with multiple rules which are exclusively an
+    //    artifact of the OSS AppendObject operation.
+    // The head_size should either represent the full object or be equal to the stripe_max_size.
+
+    bool     success   = false;
+    uint64_t head_size = manifest.get_head_size();
+    uint64_t obj_size  = manifest.get_obj_size();
+    RGWObjManifestRule rule;
+    if (manifest.get_rule(0, &rule)) {
+      success = (d_split_head        && // split-head was not disabled from yaml
+                 rule.part_size == 0 && // not a multi-part object
+                 head_size > 0       && // non-empty head
+                 !manifest.has_explicit_objs()    && // not an explicit manifest
+                 rule.override_prefix.empty()     && // not Alibaba Cloud OSS
+                 manifest.get_rules().size() == 1 && // not Alibaba Cloud OSS
+                 (head_size == rule.stripe_max_size || head_size == obj_size));
+
+      if (unlikely(!success)) {
+        ldpp_dout(dpp, 20) << __func__ << "::ERR::d_split_head=" << d_split_head
+                           << "::obj_size=" << obj_size
+                           << "::head_size=" << head_size
+                           << "::rule.part_size=" << rule.part_size
+                           << "::rule.stripe_max_size=" << rule.stripe_max_size
+                           << "::rule.override_prefix=" << rule.override_prefix
+                           << "::rule.override_prefix.empty()=" << rule.override_prefix.empty()
+                           << dendl;
+      }
+    } // don't split head if can't get rule
+
+    return success;
   }
 
   //---------------------------------------------------------------------------
@@ -806,14 +792,14 @@ namespace rgw::dedup {
       cls_refcount_get(op, ref_tag, true);
       d_ctl.metadata_access_throttle.acquire();
       ldpp_dout(dpp, 20) << __func__ << "::inc ref-count on tail object: "
-                         << raw_obj.oid << "::" << obj.obj.oid << dendl;
+                         << obj.obj.oid << "::" << raw_obj.to_str() << dendl;
       rgw::AioResultList completed = aio->get(obj.obj,
                                               rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield),
                                               cost, id);
       ret = rgw::check_for_errors(completed);
       all_results.splice(all_results.end(), completed);
       if (ret < 0) {
-        ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to copy obj=" << obj
+        ldpp_dout(dpp, 1) << __func__ << "::ERR: failed refcount_get() obj=" << obj
                           << ", ret=" << ret << " err is " << cpp_strerror(-ret) << dendl;
         break;
       }
@@ -965,7 +951,6 @@ namespace rgw::dedup {
                                const RGWObjManifest         &src_manifest,
                                const RGWObjManifest         &tgt_manifest,
                                md5_stats_t                  *p_stats,
-                               const std::string            &tail_name,
                                const dedup_table_t::value_t *p_src_val)
   {
     const uint64_t src_head_size = src_manifest.get_head_size();
@@ -990,7 +975,7 @@ namespace rgw::dedup {
     if (unlikely(ret != 0)) {
       ldpp_dout(dpp, 1) << __func__ << "::ERR: failed TGT get_ioctx()" << dendl;
       if (p_src_rec->s.flags.is_split_head()) {
-        remove_created_tail_object(p_src_rec, src_manifest, tail_name, p_stats);
+        remove_created_tail_object(p_src_rec, src_manifest, p_stats);
       }
       return ret;
     }
@@ -1001,7 +986,7 @@ namespace rgw::dedup {
       ldpp_dout(dpp, 5) << __func__ << "::abort! src_head_size=" << src_head_size
                         << "::tgt_head_size=" << tgt_head_size << dendl;
       if (p_src_rec->s.flags.is_split_head()) {
-        remove_created_tail_object(p_src_rec, src_manifest, tail_name, p_stats);
+        remove_created_tail_object(p_src_rec, src_manifest, p_stats);
       }
       // TBD: can we create a test case (requires control over head-object-size)??
       return -ECANCELED;
@@ -1013,7 +998,7 @@ namespace rgw::dedup {
     ret = inc_ref_count_by_manifest(ref_tag, src_oid, src_manifest);
     if (unlikely(ret != 0)) {
       if (p_src_rec->s.flags.is_split_head()) {
-        remove_created_tail_object(p_src_rec, src_manifest, tail_name, p_stats);
+        remove_created_tail_object(p_src_rec, src_manifest, p_stats);
       }
       return ret;
     }
@@ -1053,7 +1038,7 @@ namespace rgw::dedup {
                           << src_oid << "), err is " << cpp_strerror(-ret)<<dendl;
         rollback_ref_by_manifest(ref_tag, src_oid, src_manifest);
         if (p_src_rec->s.flags.is_split_head()) {
-          remove_created_tail_object(p_src_rec, src_manifest, tail_name, p_stats);
+          remove_created_tail_object(p_src_rec, src_manifest, p_stats);
         }
         return ret;
       }
@@ -1235,8 +1220,29 @@ namespace rgw::dedup {
                            << "::ERROR: unable to decode manifest" << dendl;
         return -EINVAL;
       }
-      need_to_split_head = should_split_head(manifest.get_head_size(),
-                                             p_rec->s.obj_bytes_size);
+
+      if (unlikely(manifest.has_explicit_objs())) {
+        // we don't support dedup of explicit_objs manifest
+        p_stats->ingress_skip_explicit_objs++;
+        ldpp_dout(dpp, 20)  << __func__ << "::explicit_objs can't be dedup" << dendl;
+        return -ENOTSUP;
+      }
+
+      RGWObjManifestRule rule;
+      if (!manifest.get_rule(0, &rule)                              ||
+          // if not a multi-part must have exactly 1 rule
+          (rule.part_size == 0 && manifest.get_rules().size() != 1) ||
+          !rule.override_prefix.empty()) {
+        // we don't support dedup of Alibaba Cloud OSS using AppendObject API
+        p_stats->ingress_skip_alibaba++;
+        ldpp_dout(dpp, 10)  << __func__ << "::Alibaba Cloud OSS can't be dedup"
+                            << "::rules.size()=" << manifest.get_rules().size()
+                            << "::get_rule ret=" << manifest.get_rule(0, &rule)
+                            << "::override_prefix=" << rule.override_prefix << dendl;
+        return -ENOTSUP;
+      }
+
+      need_to_split_head = should_split_head(manifest);
 
       // force explicit tail_placement as the dedup could be on another bucket
       const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
@@ -1501,6 +1507,11 @@ namespace rgw::dedup {
     p_rec->s.flags.clear();
     ret = add_obj_attrs_to_record(p_rec, attrs, p_stats);
     if (unlikely(ret != 0)) {
+      // don't trace errors for unsupported manifest
+      if (ret == -ENOTSUP) {
+        return 0;
+      }
+
       ldpp_dout(dpp, 5) << __func__ << "::ERR: failed add_obj_attrs_to_record() ret="
                         << ret << "::" << cpp_strerror(-ret) << dendl;
       return ret;
@@ -1640,39 +1651,36 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  static void build_and_set_explicit_manifest(const DoutPrefixProvider *dpp,
-                                              const rgw_bucket *p_bucket,
-                                              const std::string &tail_name,
-                                              RGWObjManifest *p_manifest)
+  static int set_manifest_for_split_head(const DoutPrefixProvider *const dpp,
+                                         RGWObjManifest *p_manifest /*IN-OUT*/)
   {
-    uint64_t obj_size = p_manifest->get_obj_size();
-    ceph_assert(obj_size == p_manifest->get_head_size());
-
-    const rgw_obj &head_obj = p_manifest->get_obj();
-    const rgw_obj_key &head_key = head_obj.key;
-    rgw_obj_key tail_key(tail_name, head_key.instance, head_key.ns);
-    rgw_obj tail_obj(*p_bucket, tail_key);
-
-    RGWObjManifestPart tail_part;
-    tail_part.loc     = tail_obj;
-    tail_part.loc_ofs = 0;
-    tail_part.size    = obj_size;
-
-    std::map<uint64_t, RGWObjManifestPart> objs_map;
-    objs_map[0] = tail_part;
-
+    // Set head-size to zero in the manifest with start_ofs set to zero
+    // This means no data is stored in the head-object and the first tail-object
+    //      holds the first data byte
+    uint64_t tail_ofs = 0;
     p_manifest->set_head_size(0);
     p_manifest->set_max_head_size(0);
-    p_manifest->set_prefix("");
-    p_manifest->clear_rules();
-    p_manifest->set_explicit(obj_size, objs_map);
+    RGWObjManifestRule rule;
+    if (p_manifest->get_rule(0, &rule)) {
+      ldpp_dout(dpp, 20) << "OLD Rule::start_part_num="<< rule.start_part_num
+                         << "::start_ofs=" << rule.start_ofs
+                         << "::part_size=" << rule.part_size
+                         << "::stripe_max_size=" << rule.stripe_max_size
+                         << "::override_prefix=" << rule.override_prefix << dendl;
+      p_manifest->clear_rules();
+      p_manifest->set_trivial_rule(tail_ofs, rule.stripe_max_size);
+      return 0;
+    }
+    else {
+      // No Rules, probably explicit_objs - should never happen (was checked before)
+      return -ENOENT;
+    }
   }
 
   //---------------------------------------------------------------------------
   int Background::split_head_object(disk_record_t *p_src_rec, // IN-OUT PARAM
                                     RGWObjManifest &src_manifest, // IN/OUT PARAM
                                     const disk_record_t *p_tgt_rec,
-                                    std::string *p_tail_name /*OUT*/,
                                     md5_stats_t *p_stats /* IN-OUT */)
   {
     ldpp_dout(dpp, 20) << __func__ << "::" << p_src_rec->obj_name << "::"
@@ -1719,11 +1727,14 @@ namespace rgw::dedup {
       }
     }
 
-    *p_tail_name = generate_split_head_tail_name(src_manifest);
+    ret = set_manifest_for_split_head(dpp, &src_manifest);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
     librados::IoCtx tail_ioctx;
     std::string tail_oid;
-    ret = get_tail_ioctx(p_src_rec, src_manifest, *p_tail_name, p_stats,
-                         &tail_ioctx, &tail_oid);
+    ret = get_first_tail_obj_params(dpp, rados, src_manifest, &tail_ioctx, &tail_oid);
     if (unlikely(ret != 0)) {
       return ret;
     }
@@ -1762,8 +1773,6 @@ namespace rgw::dedup {
       ldpp_dout(dpp, 20) << __func__ << "::wrote tail obj:" << tail_oid << "::ret="
                          << ret << dendl;
     }
-    const rgw_bucket *p_bucket = &(src_manifest.get_tail_placement().bucket);
-    build_and_set_explicit_manifest(dpp, p_bucket, *p_tail_name, &src_manifest);
 
     bufferlist manifest_bl;
     encode(src_manifest, manifest_bl);
@@ -1779,7 +1788,6 @@ namespace rgw::dedup {
                                              RGWObjManifest &src_manifest,
                                              const RGWObjManifest &tgt_manifest,
                                              const dedup_table_t::value_t *p_src_val,
-                                             std::string *p_tail_name /*OUT*/,
                                              md5_stats_t *p_stats)
   {
     int ret = 0;
@@ -1822,9 +1830,8 @@ namespace rgw::dedup {
     // we might still need to split-head here when hash is valid
     // can happen if we failed compare before (md5-collison) and stored the src hash
     // in the obj-attributes
-    uint64_t head_size = src_manifest.get_head_size();
-    if (should_split_head(head_size, src_manifest.get_obj_size())) {
-      ret = split_head_object(p_src_rec, src_manifest, p_tgt_rec, p_tail_name, p_stats);
+    if (should_split_head(src_manifest)) {
+      ret = split_head_object(p_src_rec, src_manifest, p_tgt_rec, p_stats);
       // compare_strong_hash() is called internally by split_head_object()
       return (ret == 0);
     }
@@ -2052,9 +2059,9 @@ namespace rgw::dedup {
     if (unlikely(has_shared_tail_objects(dpp, rados, p_src_rec, p_tgt_rec, src_manifest, tgt_manifest, p_stats))) {
       return 0;
     }
-    std::string tail_name;
+
     bool success = check_and_set_strong_hash(p_src_rec, p_tgt_rec, src_manifest,
-                                             tgt_manifest, &src_val, &tail_name, p_stats);
+                                             tgt_manifest, &src_val, p_stats);
     if (unlikely(!success)) {
       if (p_src_rec->s.flags.hash_calculated() && !src_val.has_valid_hash()) {
         // set hash attributes on head objects to save calc next time
@@ -2073,7 +2080,7 @@ namespace rgw::dedup {
     }
 
     ret = dedup_object(p_src_rec, p_tgt_rec, src_manifest, tgt_manifest, p_stats,
-                       tail_name, &src_val);
+                       &src_val);
     if (ret == 0) {
       ldpp_dout(dpp, 20) << __func__ << "::dedup success " << p_src_rec->obj_name << dendl;
       p_stats->deduped_objects++;
@@ -2083,12 +2090,6 @@ namespace rgw::dedup {
                            << ondisk_byte_size << dendl;
         p_stats->split_head_dedup_bytes += ondisk_byte_size;
       }
-      else if (p_tgt_rec->s.num_parts == 0 &&
-               // if we don't split head it will be duplicated
-               p_tgt_rec->s.obj_bytes_size > d_head_object_size) {
-        // single part objects duplicate the head object when dedup is used
-        p_stats->dup_head_bytes += d_head_object_size;
-      }
 
       // mark the SRC object as a providor of a shared manifest
       if (!src_val.has_shared_manifest()) {
@@ -2287,7 +2288,6 @@ namespace rgw::dedup {
 
     // ceph store full blocks so need to round up and multiply by block_size
     uint64_t ondisk_byte_size = calc_on_disk_byte_size(entry.meta.size);
-    // count all objects including too small and non default storage_class objs
     p_worker_stats->ingress_obj++;
     p_worker_stats->ingress_obj_bytes += ondisk_byte_size;
 
@@ -2309,17 +2309,11 @@ namespace rgw::dedup {
 
     if (ondisk_byte_size < d_min_obj_size_for_dedup) {
       if (parsed_etag.num_parts == 0) {
-        // dedup only useful for objects bigger than 4MB
+        // dedup is only applied to objects larger than the configured minimum size
+        // `rgw_dedup_min_obj_size_for_dedup`
         p_worker_stats->ingress_skip_too_small++;
         p_worker_stats->ingress_skip_too_small_bytes += ondisk_byte_size;
-
-        if (ondisk_byte_size >= 64*1024) {
-          p_worker_stats->ingress_skip_too_small_64KB++;
-          p_worker_stats->ingress_skip_too_small_64KB_bytes += ondisk_byte_size;
-        }
-        else {
-          return 0;
-        }
+        return 0;
       }
       else {
         // multipart objects are always good candidates for dedup
@@ -2531,8 +2525,6 @@ namespace rgw::dedup {
                        << "::total_count="      << obj_count_in_shard
                        << "::loaded_objects="   << p_stats->loaded_objects
                        << p_stats->big_objs_stat << dendl;
-    ldpp_dout(dpp, 10) << __func__ << "::small objs::"
-                       << p_stats->small_objs_stat << dendl;
   }
 
   //---------------------------------------------------------------------------
@@ -2557,7 +2549,7 @@ namespace rgw::dedup {
         return -ECANCELED;
       }
     }
-    p_table->count_duplicates(&p_stats->small_objs_stat, &p_stats->big_objs_stat);
+    p_table->count_duplicates(&p_stats->big_objs_stat);
     display_table_stat_counters(dpp, p_stats);
 
     ldpp_dout(dpp, 10) << __func__ << "::MD5 Loop::" << d_ctl.dedup_type << dendl;
@@ -2839,7 +2831,7 @@ namespace rgw::dedup {
     md5_stats_t md5_stats;
     //DEDUP_DYN_ALLOC
     dedup_table_t table(dpp, d_head_object_size, d_min_obj_size_for_dedup,
-                        d_max_obj_size_for_split, raw_mem, raw_mem_size);
+                        d_split_head, raw_mem, raw_mem_size);
     int ret = objects_dedup_single_md5_shard(&table, md5_shard, &md5_stats, num_work_shards);
     if (ret == 0) {
       md5_stats.duration = ceph_clock_now() - start_time;
index ecb1e44088b02378078941824019a66a62b385e4..71d980fb58b06239bd5d56fa64380959d5a23b8a 100644 (file)
@@ -98,16 +98,9 @@ namespace rgw::dedup {
     };
 
     inline uint64_t __calc_deduped_bytes(uint16_t num_parts, uint64_t size_bytes);
-    inline bool should_split_head(uint64_t head_size, uint64_t obj_size);
-    int get_tail_ioctx(const disk_record_t *p_rec,
-                       const RGWObjManifest &manifest,
-                       const std::string &tail_name,
-                       md5_stats_t *p_stats /*IN-OUT*/,
-                       librados::IoCtx *p_ioctx /*OUT*/,
-                       std::string *p_oid /*OUT*/);
+    inline bool should_split_head(const RGWObjManifest &manifest);
     void remove_created_tail_object(const disk_record_t *p_rec,
                                     const RGWObjManifest &manifest,
-                                    const std::string &tail_name,
                                     md5_stats_t *p_stats /*IN-OUT*/);
     void run();
     int  setup(struct dedup_epoch_t*);
@@ -201,7 +194,6 @@ namespace rgw::dedup {
     int split_head_object(disk_record_t *p_src_rec,     // IN/OUT PARAM
                           RGWObjManifest &src_manifest, // IN/OUT PARAM
                           const disk_record_t *p_tgt_rec,
-                          std::string *p_tail_name /*OUT*/,
                           md5_stats_t *p_stats /* IN-OUT */);
 
     int add_obj_attrs_to_record(disk_record_t         *p_rec,
@@ -221,7 +213,6 @@ namespace rgw::dedup {
                                    RGWObjManifest &src_manifest,
                                    const RGWObjManifest &tgt_manifest,
                                    const dedup_table_t::value_t *p_src_val,
-                                   std::string *p_tail_name /*OUT*/,
                                    md5_stats_t *p_stats /* IN-OUT */);
     int try_deduping_record(dedup_table_t   *p_table,
                             disk_record_t   *p_rec,
@@ -244,7 +235,6 @@ namespace rgw::dedup {
                      const RGWObjManifest         &src_manifest,
                      const RGWObjManifest         &tgt_manifest,
                      md5_stats_t                  *p_stats,
-                     const std::string            &tail_name,
                      const dedup_table_t::value_t *p_src_val);
 #endif
     int  remove_slabs(unsigned worker_id, unsigned md5_shard, uint32_t slab_count);
@@ -266,7 +256,7 @@ namespace rgw::dedup {
     uint64_t d_all_buckets_obj_size    = 0;
 
     uint32_t d_min_obj_size_for_dedup = (64ULL * 1024);
-    uint32_t d_max_obj_size_for_split = (16ULL * 1024 * 1024);
+    bool     d_split_head             = true;
     uint32_t d_head_object_size       = (4ULL * 1024 * 1024);
     control_t d_ctl;
     uint64_t d_watch_handle = 0;
index fafd66176eff99b17562c6ceb107b0563597aec1..150aace4ab145cd2687f51ee1000abd56f2656ba 100644 (file)
@@ -973,7 +973,6 @@ namespace rgw::dedup {
     Formatter::ObjectSection section{*fmt, "dedup_ratio_estimate"};
     fmt->dump_unsigned("s3_bytes_before", s3_bytes_before);
     fmt->dump_unsigned("s3_bytes_after", s3_bytes_after);
-    fmt->dump_unsigned("dup_head_bytes", md5_stats_sum.dup_head_bytes_estimate);
 
     if (s3_bytes_before > s3_bytes_after && s3_bytes_after) {
       double dedup_ratio = (double)s3_bytes_before/s3_bytes_after;
@@ -997,7 +996,6 @@ namespace rgw::dedup {
     Formatter::ObjectSection section{*fmt, "dedup_ratio_actual"};
     fmt->dump_unsigned("s3_bytes_before", s3_bytes_before);
     fmt->dump_unsigned("s3_bytes_after", s3_bytes_after);
-    fmt->dump_unsigned("dup_head_bytes", md5_stats_sum.dup_head_bytes);
     if (s3_bytes_before > s3_bytes_after && s3_bytes_after) {
       double dedup_ratio = (double)s3_bytes_before/s3_bytes_after;
       fmt->dump_float("dedup_ratio", dedup_ratio);
index b27bf7353a6eedf63699c9bfa8f0792cdf369353..898a1304dfdaefda30dd3db13831993ef341091b 100644 (file)
@@ -23,14 +23,14 @@ namespace rgw::dedup {
   dedup_table_t::dedup_table_t(const DoutPrefixProvider* _dpp,
                                uint32_t _head_object_size,
                                uint32_t _min_obj_size_for_dedup,
-                               uint32_t _max_obj_size_for_split,
+                               bool     _split_head,
                                uint8_t *p_slab,
                                uint64_t slab_size)
   {
     dpp = _dpp;
     head_object_size = _head_object_size;
     min_obj_size_for_dedup = _min_obj_size_for_dedup;
-    max_obj_size_for_split = _max_obj_size_for_split;
+    split_head = _split_head;
     memset(p_slab, 0, slab_size);
     hash_tab = (table_entry_t*)p_slab;
     entries_count = slab_size/sizeof(table_entry_t);
@@ -100,6 +100,9 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
+  // find_entry() assumes that entries are not removed during operation.
+  // remove_entry() is only called from remove_singletons_and_redistribute_keys(),
+  //       which does a linear pass over the array.
   uint32_t dedup_table_t::find_entry(const key_t *p_key) const
   {
     uint32_t idx = p_key->hash() % entries_count;
@@ -113,34 +116,19 @@ namespace rgw::dedup {
 
   //---------------------------------------------------------------------------
   void dedup_table_t::inc_counters(const key_t *p_key,
-                                   dedup_stats_t *p_small_objs,
-                                   dedup_stats_t *p_big_objs,
-                                   uint64_t *p_duplicate_head_bytes)
+                                   dedup_stats_t *p_dedup_stats)
   {
     // This is an approximation only since size is stored in 4KB resolution
     uint64_t byte_size_approx = disk_blocks_to_byte_size(p_key->size_4k_units);
 
-    // skip small single part objects which we can't dedup
-    if (!dedupable_object(p_key->multipart_object(), min_obj_size_for_dedup, byte_size_approx)) {
-      p_small_objs->duplicate_count ++;
-      p_small_objs->dedup_bytes_estimate += byte_size_approx;
-      return;
-    }
-    else {
-      uint64_t dup_bytes_approx = calc_deduped_bytes(head_object_size,
-                                                     min_obj_size_for_dedup,
-                                                     max_obj_size_for_split,
-                                                     p_key->num_parts,
-                                                     byte_size_approx);
-      p_big_objs->duplicate_count ++;
-      p_big_objs->dedup_bytes_estimate += dup_bytes_approx;
-
-      // object smaller than max_obj_size_for_split will split their head
-      // and won't dup it
-      if (!p_key->multipart_object() && byte_size_approx > max_obj_size_for_split) {
-        // single part objects duplicate the head object when dedup is used
-        *p_duplicate_head_bytes += head_object_size;
-      }
+    uint64_t dup_bytes_approx = calc_deduped_bytes(head_object_size,
+                                                   min_obj_size_for_dedup,
+                                                   split_head,
+                                                   p_key->num_parts,
+                                                   byte_size_approx);
+    if (dup_bytes_approx) {
+      p_dedup_stats->duplicate_count ++;
+      p_dedup_stats->dedup_bytes_estimate += dup_bytes_approx;
     }
   }
 
@@ -149,9 +137,7 @@ namespace rgw::dedup {
                                disk_block_id_t block_id,
                                record_id_t rec_id,
                                bool shared_manifest,
-                               dedup_stats_t *p_small_objs,
-                               dedup_stats_t *p_big_objs,
-                               uint64_t *p_duplicate_head_bytes)
+                               dedup_stats_t *p_dedup_stats)
   {
     value_t new_val(block_id, rec_id, shared_manifest);
     uint32_t idx = find_entry(p_key);
@@ -172,7 +158,7 @@ namespace rgw::dedup {
     else {
       ceph_assert(hash_tab[idx].key == *p_key);
       if (val.count <= MAX_COPIES_PER_OBJ) {
-        inc_counters(p_key, p_small_objs, p_big_objs, p_duplicate_head_bytes);
+        inc_counters(p_key, p_dedup_stats);
       }
       if (val.count < std::numeric_limits<std::uint16_t>::max()) {
         val.count ++;
@@ -280,35 +266,19 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  void dedup_table_t::count_duplicates(dedup_stats_t *p_small_objs,
-                                       dedup_stats_t *p_big_objs)
+  void dedup_table_t::count_duplicates(dedup_stats_t *p_dedup_stats)
   {
     for (uint32_t tab_idx = 0; tab_idx < entries_count; tab_idx++) {
       if (!hash_tab[tab_idx].val.is_occupied()) {
         continue;
       }
 
-      const key_t &key = hash_tab[tab_idx].key;
-      // This is an approximation only since size is stored in 4KB resolution
-      uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units);
-
-      // skip small single part objects which we can't dedup
-      if (!dedupable_object(key.multipart_object(), min_obj_size_for_dedup, byte_size_approx)) {
-        if (hash_tab[tab_idx].val.is_singleton()) {
-          p_small_objs->singleton_count++;
-        }
-        else {
-          p_small_objs->unique_count ++;
-        }
+      if (hash_tab[tab_idx].val.is_singleton()) {
+        p_dedup_stats->singleton_count++;
       }
       else {
-        if (hash_tab[tab_idx].val.is_singleton()) {
-          p_big_objs->singleton_count++;
-        }
-        else {
-          ceph_assert(hash_tab[tab_idx].val.count > 1);
-          p_big_objs->unique_count ++;
-        }
+        ceph_assert(hash_tab[tab_idx].val.count > 1);
+        p_dedup_stats->unique_count ++;
       }
     }
   }
index 82efc03480cb4330a3411f4c1ef4b0d762e1457e..4f26972a2cd72b5146704049bdeff951e99849b2 100644 (file)
@@ -131,16 +131,14 @@ namespace rgw::dedup {
     dedup_table_t(const DoutPrefixProvider* _dpp,
                   uint32_t _head_object_size,
                   uint32_t _min_obj_size_for_dedup,
-                  uint32_t _max_obj_size_for_split,
+                  bool     _split_head,
                   uint8_t *p_slab,
                   uint64_t slab_size);
     int add_entry(key_t *p_key,
                   disk_block_id_t block_id,
                   record_id_t rec_id,
                   bool shared_manifest,
-                  dedup_stats_t *p_small_objs_stat,
-                  dedup_stats_t *p_big_objs_stat,
-                  uint64_t *p_duplicate_head_bytes);
+                  dedup_stats_t *p_dedup_stats);
 
     void update_entry(key_t *p_key, disk_block_id_t block_id, record_id_t rec_id,
                       bool shared_manifest);
@@ -159,9 +157,7 @@ namespace rgw::dedup {
                      bool set_shared_manifest_src,
                      bool set_has_valid_hash_src);
 
-    void count_duplicates(dedup_stats_t *p_small_objs_stat,
-                          dedup_stats_t *p_big_objs_stat);
-
+    void count_duplicates(dedup_stats_t *p_dedup_stats);
     void remove_singletons_and_redistribute_keys();
   private:
     // 32 Bytes unified entries
@@ -173,15 +169,13 @@ namespace rgw::dedup {
 
     uint32_t find_entry(const key_t *p_key) const;
     void     inc_counters(const key_t *p_key,
-                          dedup_stats_t *p_small_objs,
-                          dedup_stats_t *p_big_objs,
-                          uint64_t *p_duplicate_head_bytes);
+                          dedup_stats_t *p_dedup_stats);
 
     uint32_t       entries_count = 0;
     uint32_t       occupied_count = 0;
     uint32_t       head_object_size;
     uint32_t       min_obj_size_for_dedup;
-    uint32_t       max_obj_size_for_split;
+    bool           split_head;
     table_entry_t *hash_tab = nullptr;
 
     // stat counters
index 74252a853950346028f9eea0896acb531cec1559..52fdfa2c04f30983088c742d906a4e630ce658a4 100644 (file)
@@ -382,8 +382,6 @@ namespace rgw::dedup {
     this->ingress_corrupted_etag += other.ingress_corrupted_etag;
     this->ingress_skip_too_small_bytes += other.ingress_skip_too_small_bytes;
     this->ingress_skip_too_small += other.ingress_skip_too_small;
-    this->ingress_skip_too_small_64KB_bytes += other.ingress_skip_too_small_64KB_bytes;
-    this->ingress_skip_too_small_64KB += other.ingress_skip_too_small_64KB;
 
     return *this;
   }
@@ -440,13 +438,6 @@ namespace rgw::dedup {
                          this->ingress_skip_too_small);
         f->dump_unsigned("Ingress skip: too small bytes",
                          this->ingress_skip_too_small_bytes);
-
-        if(this->ingress_skip_too_small_64KB) {
-          f->dump_unsigned("Ingress skip: 64KB<=size<=4MB Obj",
-                           this->ingress_skip_too_small_64KB);
-          f->dump_unsigned("Ingress skip: 64KB<=size<=4MB Bytes",
-                           this->ingress_skip_too_small_64KB_bytes);
-        }
       }
     }
 
@@ -499,9 +490,6 @@ namespace rgw::dedup {
     encode(w.ingress_skip_too_small_bytes, bl);
     encode(w.ingress_skip_too_small, bl);
 
-    encode(w.ingress_skip_too_small_64KB_bytes, bl);
-    encode(w.ingress_skip_too_small_64KB, bl);
-
     encode(w.duration, bl);
     ENCODE_FINISH(bl);
   }
@@ -528,8 +516,6 @@ namespace rgw::dedup {
     decode(w.ingress_corrupted_etag, bl);
     decode(w.ingress_skip_too_small_bytes, bl);
     decode(w.ingress_skip_too_small, bl);
-    decode(w.ingress_skip_too_small_64KB_bytes, bl);
-    decode(w.ingress_skip_too_small_64KB, bl);
 
     decode(w.duration, bl);
     DECODE_FINISH(bl);
@@ -538,7 +524,6 @@ namespace rgw::dedup {
   //---------------------------------------------------------------------------
   md5_stats_t& md5_stats_t::operator+=(const md5_stats_t& other)
   {
-    this->small_objs_stat               += other.small_objs_stat;
     this->big_objs_stat                 += other.big_objs_stat;
     this->ingress_slabs                 += other.ingress_slabs;
     this->ingress_failed_load_bucket    += other.ingress_failed_load_bucket;
@@ -551,6 +536,8 @@ namespace rgw::dedup {
     this->ingress_skip_compressed       += other.ingress_skip_compressed;
     this->ingress_skip_compressed_bytes += other.ingress_skip_compressed_bytes;
     this->ingress_skip_changed_objs     += other.ingress_skip_changed_objs;
+    this->ingress_skip_explicit_objs    += other.ingress_skip_explicit_objs;
+    this->ingress_skip_alibaba          += other.ingress_skip_alibaba;
     this->shared_manifest_dedup_bytes   += other.shared_manifest_dedup_bytes;
 
     this->skipped_shared_manifest += other.skipped_shared_manifest;
@@ -591,10 +578,8 @@ namespace rgw::dedup {
     this->set_shared_manifest_src += other.set_shared_manifest_src;
     this->loaded_objects          += other.loaded_objects;
     this->processed_objects       += other.processed_objects;
-    this->dup_head_bytes_estimate += other.dup_head_bytes_estimate;
     this->deduped_objects         += other.deduped_objects;
     this->deduped_objects_bytes   += other.deduped_objects_bytes;
-    this->dup_head_bytes          += other.dup_head_bytes;
 
     this->failed_dedup            += other.failed_dedup;
     this->md_throttle_sleep_events    += other.md_throttle_sleep_events;
@@ -628,7 +613,6 @@ namespace rgw::dedup {
       f->dump_unsigned("Set Shared-Manifest SRC", this->set_shared_manifest_src);
       f->dump_unsigned("Deduped Obj (this cycle)", this->deduped_objects);
       f->dump_unsigned("Deduped Bytes(this cycle)", this->deduped_objects_bytes);
-      f->dump_unsigned("Dup head bytes (not dedup)", this->dup_head_bytes);
       f->dump_unsigned("Already Deduped bytes (prev cycles)",
                        this->shared_manifest_dedup_bytes);
 
@@ -639,21 +623,6 @@ namespace rgw::dedup {
       f->dump_unsigned("Dedup Bytes Estimate", ds.dedup_bytes_estimate);
     }
 
-    // Potential Dedup Section:
-    // What could be gained by allowing dedup for smaller objects (64KB-4MB)
-    // Space wasted because of duplicated head-object (4MB)
-    {
-      Formatter::ObjectSection potential(*f, "Potential Dedup");
-      const dedup_stats_t &ds = this->small_objs_stat;
-      f->dump_unsigned("Singleton Obj (64KB-4MB)", ds.singleton_count);
-      f->dump_unsigned("Unique Obj (64KB-4MB)", ds.unique_count);
-      f->dump_unsigned("Duplicate Obj (64KB-4MB)", ds.duplicate_count);
-      f->dump_unsigned("Dedup Bytes Estimate (64KB-4MB)", ds.dedup_bytes_estimate);
-      f->dump_unsigned("Duplicated Head Bytes Estimate",
-                       this->dup_head_bytes_estimate);
-      f->dump_unsigned("Duplicated Head Bytes", this->dup_head_bytes);
-    }
-
     {
       Formatter::ObjectSection notify(*f, "notify");
       if (this->md_throttle_sleep_events) {
@@ -726,6 +695,12 @@ namespace rgw::dedup {
       if (this->ingress_skip_changed_objs) {
         f->dump_unsigned("Skipped Changed Object", this->ingress_skip_changed_objs);
       }
+      if (this->ingress_skip_explicit_objs) {
+        f->dump_unsigned("Skipped Explicit Objs", this->ingress_skip_explicit_objs);
+      }
+      if (this->ingress_skip_alibaba) {
+        f->dump_unsigned("Skipped Alibaba Cloud OSS", this->ingress_skip_alibaba);
+      }
     }
 
     {
@@ -814,7 +789,6 @@ namespace rgw::dedup {
   {
     ENCODE_START(1, 1, bl);
 
-    encode(m.small_objs_stat, bl);
     encode(m.big_objs_stat, bl);
     encode(m.ingress_slabs, bl);
     encode(m.ingress_failed_load_bucket, bl);
@@ -827,6 +801,8 @@ namespace rgw::dedup {
     encode(m.ingress_skip_compressed, bl);
     encode(m.ingress_skip_compressed_bytes, bl);
     encode(m.ingress_skip_changed_objs, bl);
+    encode(m.ingress_skip_explicit_objs, bl);
+    encode(m.ingress_skip_alibaba, bl);
     encode(m.shared_manifest_dedup_bytes, bl);
 
     encode(m.skipped_shared_manifest, bl);
@@ -867,10 +843,8 @@ namespace rgw::dedup {
 
     encode(m.loaded_objects, bl);
     encode(m.processed_objects, bl);
-    encode(m.dup_head_bytes_estimate, bl);
     encode(m.deduped_objects, bl);
     encode(m.deduped_objects_bytes, bl);
-    encode(m.dup_head_bytes, bl);
     encode(m.failed_dedup, bl);
     encode(m.md_throttle_sleep_events, bl);
     encode(m.md_throttle_sleep_time_usec, bl);
@@ -885,7 +859,6 @@ namespace rgw::dedup {
   void decode(md5_stats_t& m, ceph::bufferlist::const_iterator& bl)
   {
     DECODE_START(1, bl);
-    decode(m.small_objs_stat, bl);
     decode(m.big_objs_stat, bl);
     decode(m.ingress_slabs, bl);
     decode(m.ingress_failed_load_bucket, bl);
@@ -898,6 +871,8 @@ namespace rgw::dedup {
     decode(m.ingress_skip_compressed, bl);
     decode(m.ingress_skip_compressed_bytes, bl);
     decode(m.ingress_skip_changed_objs, bl);
+    decode(m.ingress_skip_explicit_objs, bl);
+    decode(m.ingress_skip_alibaba, bl);
     decode(m.shared_manifest_dedup_bytes, bl);
 
     decode(m.skipped_shared_manifest, bl);
@@ -938,10 +913,8 @@ namespace rgw::dedup {
 
     decode(m.loaded_objects, bl);
     decode(m.processed_objects, bl);
-    decode(m.dup_head_bytes_estimate, bl);
     decode(m.deduped_objects, bl);
     decode(m.deduped_objects_bytes, bl);
-    decode(m.dup_head_bytes, bl);
     decode(m.failed_dedup, bl);
     decode(m.md_throttle_sleep_events, bl);
     decode(m.md_throttle_sleep_time_usec, bl);
index 579e048a259f0bd0f8650a8bc64ff6d5f25d005a..6a7f508cc36eee11e6c7f83e6c8dbcbe99f1d3e8 100644 (file)
@@ -198,9 +198,6 @@ namespace rgw::dedup {
     uint64_t ingress_skip_too_small_bytes = 0;
     uint64_t ingress_skip_too_small = 0;
 
-    uint64_t ingress_skip_too_small_64KB_bytes = 0;
-    uint64_t ingress_skip_too_small_64KB = 0;
-
     utime_t  duration = {0, 0};
   };
   std::ostream& operator<<(std::ostream &out, const worker_stats_t &s);
@@ -212,7 +209,6 @@ namespace rgw::dedup {
     md5_stats_t& operator +=(const md5_stats_t& other);
     void dump(Formatter *f) const;
 
-    dedup_stats_t small_objs_stat;
     dedup_stats_t big_objs_stat;
     uint64_t ingress_slabs = 0;
     uint64_t ingress_failed_load_bucket = 0;
@@ -225,6 +221,8 @@ namespace rgw::dedup {
     uint64_t ingress_skip_compressed = 0;
     uint64_t ingress_skip_compressed_bytes = 0;
     uint64_t ingress_skip_changed_objs = 0;
+    uint64_t ingress_skip_explicit_objs = 0;
+    uint64_t ingress_skip_alibaba = 0;
 
     uint64_t shared_manifest_dedup_bytes = 0;
     uint64_t skipped_shared_manifest = 0;
@@ -265,11 +263,9 @@ namespace rgw::dedup {
     uint64_t loaded_objects = 0;
     uint64_t processed_objects = 0;
     // counter is using on-disk size affected by block-size
-    uint64_t dup_head_bytes_estimate = 0; //duplicate_head_bytes
     uint64_t deduped_objects = 0;
     // counter is using s3 byte size disregarding the on-disk size affected by block-size
     uint64_t deduped_objects_bytes = 0;
-    uint64_t dup_head_bytes = 0;
     uint64_t failed_dedup = 0;
     uint64_t md_throttle_sleep_events = 0;
     uint64_t md_throttle_sleep_time_usec = 0;
@@ -368,7 +364,7 @@ namespace rgw::dedup {
   //---------------------------------------------------------------------------
   static inline uint64_t calc_deduped_bytes(uint32_t head_obj_size,
                                             uint32_t min_obj_size_for_dedup,
-                                            uint32_t max_obj_size_for_split,
+                                            bool     split_head,
                                             uint16_t num_parts,
                                             uint64_t size_bytes)
   {
@@ -376,18 +372,16 @@ namespace rgw::dedup {
       // multipart objects with an empty head i.e. we achive full dedup
       return size_bytes;
     }
+    else if (size_bytes < min_obj_size_for_dedup) {
+      return 0;
+    }
+    else if (split_head) {
+      // Head is split into an empty obj and a new tail, enabling a full dedup
+      return size_bytes;
+    }
     else {
-      // reduce the head size
-      if (size_bytes > max_obj_size_for_split) {
-        return size_bytes - head_obj_size;
-      }
-      else if (size_bytes >= min_obj_size_for_dedup) {
-        // Head is splitted into an empty obj and a new tail enabling a full dedup
-        return size_bytes;
-      }
-      else {
-        return 0;
-      }
+      // subtract the head size, which is not deduped (clamped so the result cannot underflow)
+      return size_bytes - std::min(size_bytes, (uint64_t)head_obj_size);
     }
   }
 
index d423c115d4de692713b74d458a619be5a8e142de..dd24390247ea1d4e5f740c28e6d16e64481d0f6c 100644 (file)
@@ -181,13 +181,13 @@ int RGWObjManifest::append_explicit(const DoutPrefixProvider *dpp, RGWObjManifes
   return 0;
 }
 
-bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule)
+bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule) const
 {
   if (rules.empty()) {
     return false;
   }
 
-  map<uint64_t, RGWObjManifestRule>::iterator iter = rules.upper_bound(ofs);
+  map<uint64_t, RGWObjManifestRule>::const_iterator iter = rules.upper_bound(ofs);
   if (iter != rules.begin()) {
     --iter;
   }
index 4129a015c98b103aad7d2625a5397b70e6e06b31..966db3add84fbaef0955bc2b706c3c9386301e6a 100644 (file)
@@ -254,7 +254,11 @@ public:
   }
 
   void get_implicit_location(uint64_t cur_part_id, uint64_t cur_stripe, uint64_t ofs,
-                             std::string *override_prefix, rgw_obj_select *location) const;
+                             const std::string *override_prefix, rgw_obj_select *location) const;
+
+  const std::map<uint64_t, RGWObjManifestRule>& get_rules() const {
+    return rules;
+  }
 
   void clear_rules() {
     rules.clear();
@@ -383,7 +387,7 @@ public:
   int append(const DoutPrefixProvider *dpp, RGWObjManifest& m, const RGWZoneGroup& zonegroup,
              const RGWZoneParams& zone_params);
 
-  bool get_rule(uint64_t ofs, RGWObjManifestRule *rule);
+  bool get_rule(uint64_t ofs, RGWObjManifestRule *rule) const;
 
   bool empty() const {
     if (explicit_objs)
index 042c97aa13cde16226768665df7551bdd4f4123f..418c913b82a8c765c2e7e2ebe23a60229b4c1286 100644 (file)
@@ -207,7 +207,7 @@ void RGWObjManifest::obj_iterator::update_location()
 }
 
 void RGWObjManifest::get_implicit_location(uint64_t cur_part_id, uint64_t cur_stripe,
-                                           uint64_t ofs, string *override_prefix, rgw_obj_select *location) const
+                                           uint64_t ofs, const string *override_prefix, rgw_obj_select *location) const
 {
   rgw_obj loc;
 
index 40f82b862748301a3c7575ea2d53e7633cff16f7..25177a10ebb1f39d5882db6a8f6500e9c42f7380 100644 (file)
@@ -51,10 +51,6 @@ class Dedup_Stats:
     duplicate_obj : int = 0
     deduped_obj_bytes : int = 0
     non_default_storage_class_objs_bytes : int = 0
-    potential_singleton_obj : int = 0
-    potential_unique_obj : int = 0
-    potential_duplicate_obj : int = 0
-    potential_dedup_space : int = 0
 
 @dataclass
 class Dedup_Ratio:
@@ -280,9 +276,8 @@ def create_buckets(conn, max_copies_count):
 OUT_DIR="/tmp/dedup/"
 KB=(1024)
 MB=(1024*KB)
-POTENTIAL_OBJ_SIZE=(64*KB)
 DEDUP_MIN_OBJ_SIZE=(64*KB)
-SPLIT_HEAD_SIZE=(4*MB)
+SPLIT_HEAD=True
 RADOS_OBJ_SIZE=(4*MB)
 # The default multipart threshold size for S3cmd is 15 MB.
 MULTIPART_SIZE=(15*MB)
@@ -638,17 +633,16 @@ def calc_head_size(obj_size, config):
 def calc_dedupable_space(obj_size, config):
     on_disk_byte_size = calc_on_disk_byte_size(obj_size)
 
-    threshold = config.multipart_threshold
     # Objects with size bigger than MULTIPART_SIZE are uploaded as multi-part
     # multi-part objects got a zero size Head objects
-    if obj_size >= threshold:
+    if obj_size >= config.multipart_threshold:
         dedupable_space = on_disk_byte_size
-    elif obj_size > SPLIT_HEAD_SIZE:
-        dedupable_space = on_disk_byte_size - RADOS_OBJ_SIZE
-    elif obj_size >= DEDUP_MIN_OBJ_SIZE:
+    elif obj_size < DEDUP_MIN_OBJ_SIZE:
+        dedupable_space = 0
+    elif SPLIT_HEAD:
         dedupable_space = on_disk_byte_size
     else:
-        dedupable_space = 0
+        dedupable_space = (on_disk_byte_size - min(on_disk_byte_size, RADOS_OBJ_SIZE))
 
     log.debug("obj_size=%.2f MiB, dedupable_space=%.2f MiB",
               float(obj_size)/MB, float(dedupable_space)/MB)
@@ -659,7 +653,7 @@ def calc_split_objs_count(obj_size, num_copies, config):
     threshold = config.multipart_threshold
     on_disk_byte_size = calc_on_disk_byte_size(obj_size)
 
-    if num_copies < 2 or on_disk_byte_size > SPLIT_HEAD_SIZE or obj_size >= threshold:
+    if num_copies < 2 or not SPLIT_HEAD or obj_size >= threshold:
         return 0
 
     if on_disk_byte_size < DEDUP_MIN_OBJ_SIZE:
@@ -680,15 +674,6 @@ def calc_expected_stats(dedup_stats, obj_size, num_copies, config):
     if on_disk_byte_size < DEDUP_MIN_OBJ_SIZE and threshold > DEDUP_MIN_OBJ_SIZE:
         dedup_stats.skip_too_small += num_copies
         dedup_stats.skip_too_small_bytes += (on_disk_byte_size * num_copies)
-
-        if on_disk_byte_size >= POTENTIAL_OBJ_SIZE:
-            if num_copies == 1:
-                dedup_stats.potential_singleton_obj += 1
-            else:
-                dedup_stats.potential_unique_obj += 1
-                dedup_stats.potential_duplicate_obj += dups_count
-                dedup_stats.potential_dedup_space += (on_disk_byte_size * dups_count)
-
         return
 
     dedup_stats.total_processed_objects += num_copies
@@ -1399,12 +1384,6 @@ def read_dedup_stats(dry_run):
         dedup_stats.duplicate_obj = main['Duplicate Obj']
         dedup_stats.dedup_bytes_estimate = main['Dedup Bytes Estimate']
 
-        potential = md5_stats['Potential Dedup']
-        dedup_stats.potential_singleton_obj = potential['Singleton Obj (64KB-4MB)']
-        dedup_stats.potential_unique_obj = potential['Unique Obj (64KB-4MB)']
-        dedup_stats.potential_duplicate_obj = potential['Duplicate Obj (64KB-4MB)']
-        dedup_stats.potential_dedup_space = potential['Dedup Bytes Estimate (64KB-4MB)']
-
     dedup_work_was_completed=jstats['completed']
     if dedup_work_was_completed:
         dedup_ratio_estimate=read_dedup_ratio(jstats, 'dedup_ratio_estimate')
@@ -1486,11 +1465,6 @@ def exec_dedup(expected_dedup_stats, dry_run, verify_stats=True, post_dedup_size
     if verify_stats == False:
         return ret
 
-    if dedup_stats.potential_unique_obj or expected_dedup_stats.potential_unique_obj:
-        log.debug("potential_unique_obj= %d / %d ", dedup_stats.potential_unique_obj,
-                  expected_dedup_stats.potential_unique_obj)
-
-
     #dedup_stats.set_hash = dedup_stats.invalid_hash
     if dedup_stats != expected_dedup_stats:
         log.debug("==================================================")
@@ -1513,14 +1487,6 @@ def prepare_test():
 
     os.mkdir(OUT_DIR)
 
-#-------------------------------------------------------------------------------
-def copy_potential_stats(new_dedup_stats, dedup_stats):
-    new_dedup_stats.potential_singleton_obj = dedup_stats.potential_singleton_obj
-    new_dedup_stats.potential_unique_obj    = dedup_stats.potential_unique_obj
-    new_dedup_stats.potential_duplicate_obj = dedup_stats.potential_duplicate_obj
-    new_dedup_stats.potential_dedup_space   = dedup_stats.potential_dedup_space
-
-
 #-------------------------------------------------------------------------------
 def small_single_part_objs_dedup(conn, bucket_name, dry_run):
     # 1) generate small random files and store them on disk
@@ -1547,8 +1513,6 @@ def small_single_part_objs_dedup(conn, bucket_name, dry_run):
 
         # expected stats for small objects - all zeros except for skip_too_small
         small_objs_dedup_stats = Dedup_Stats()
-        #small_objs_dedup_stats.loaded_objects=dedup_stats.loaded_objects
-        copy_potential_stats(small_objs_dedup_stats, dedup_stats)
         small_objs_dedup_stats.size_before_dedup = dedup_stats.size_before_dedup
         small_objs_dedup_stats.skip_too_small_bytes=dedup_stats.size_before_dedup
         small_objs_dedup_stats.skip_too_small = s3_objects_total
@@ -1897,6 +1861,8 @@ def test_dedup_with_versions():
     min_size=1*KB
     max_size=MULTIPART_SIZE*2
     success=False
+    # Declare the variable with a type hint
+    conn: BaseClient
     try:
         conn=get_single_connection()
         conn.create_bucket(Bucket=bucket_name)
@@ -2415,8 +2381,6 @@ def test_dedup_small_with_tenants():
 
         # expected stats for small objects - all zeros except for skip_too_small
         small_objs_dedup_stats = Dedup_Stats()
-        #small_objs_dedup_stats.loaded_objects=dedup_stats.loaded_objects
-        copy_potential_stats(small_objs_dedup_stats, dedup_stats)
         small_objs_dedup_stats.size_before_dedup=dedup_stats.size_before_dedup
         small_objs_dedup_stats.skip_too_small_bytes=dedup_stats.size_before_dedup
         small_objs_dedup_stats.skip_too_small=s3_objects_total
@@ -3320,7 +3284,6 @@ def test_dedup_dry_small_with_tenants():
 
         # expected stats for small objects - all zeros except for skip_too_small
         small_objs_dedup_stats = Dedup_Stats()
-        copy_potential_stats(small_objs_dedup_stats, dedup_stats)
         small_objs_dedup_stats.size_before_dedup=dedup_stats.size_before_dedup
         small_objs_dedup_stats.skip_too_small_bytes=dedup_stats.size_before_dedup
         small_objs_dedup_stats.skip_too_small=s3_objects_total
@@ -3693,3 +3656,4 @@ def test_dedup_identical_copies_multipart_small():
     force_clean=True
     log.info("test_dedup_identical_copies_multipart:full test")
     __test_dedup_identical_copies(files, config, dry_run, verify, force_clean)
+