]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
rgw/dedup: split-head mechanism
authorGabriel BenHanokh <gbenhano@redhat.com>
Mon, 15 Sep 2025 19:01:02 +0000 (19:01 +0000)
committerbenhanokh <gbenhano@redhat.com>
Tue, 24 Feb 2026 19:17:38 +0000 (21:17 +0200)
Split head object into 2 objects - one with attributes and no data and
a new tail-object with only data.
The new-tail object will be deduped (unlike the head objects which can't
be deduped)
We will split head for objects with size 16MB or less

A few extra improvements:
Skip objects created by server-side-copy
Use reftag for compare-and-swap instead of manifest
Skip shared-manifest objects after reading attributes
Made max_obj_size_for_split and min_obj_size_for_dedup config value in
rgw.yaml.in

refined test: validate size after dedup
TBD: add rados ls -l to report object size in bulk to speed up the process
improved tests - verify refcounts are working, validate objects, remove
duplicates and then verify the last remaining object making sure it was
not deleted

Signed-off-by: Gabriel BenHanokh <gbenhano@redhat.com>
14 files changed:
doc/radosgw/s3_objects_dedup.rst
src/common/options/rgw.yaml.in
src/rgw/driver/rados/rgw_dedup.cc
src/rgw/driver/rados/rgw_dedup.h
src/rgw/driver/rados/rgw_dedup_cluster.cc
src/rgw/driver/rados/rgw_dedup_store.cc
src/rgw/driver/rados/rgw_dedup_store.h
src/rgw/driver/rados/rgw_dedup_table.cc
src/rgw/driver/rados/rgw_dedup_table.h
src/rgw/driver/rados/rgw_dedup_utils.cc
src/rgw/driver/rados/rgw_dedup_utils.h
src/rgw/driver/rados/rgw_obj_manifest.h
src/rgw/rgw_common.h
src/test/rgw/dedup/test_dedup.py

index b0b83d0ddf7d7c06c6952d96ddb2a7064c2a92a6..b8b1ffbefa4ab09d3a19aa409a13b6fcaa597172 100644 (file)
@@ -22,8 +22,6 @@ Admin commands
    Aborts an active dedup session and release all resources used by it.
 - ``radosgw-admin dedup stats``:
    Collects & displays last dedup statistics.
-- ``radosgw-admin dedup estimate``:
-   Starts a new dedup estimate session (aborting first existing session if exists).
 - ``radosgw-admin dedup throttle --max-bucket-index-ops=<count>``:
    Specify max bucket-index requests per second allowed for a single RGW server during dedup, 0 means unlimited.
 - ``radosgw-admin dedup throttle --stat``:
@@ -34,13 +32,17 @@ Skipped Objects
 ***************
 Dedup Estimate process skips the following objects:
 
-- Objects smaller than 4 MB (unless they are multipart).
+- Objects smaller than rgw_dedup_min_obj_size_for_dedup (unless they are multipart).
 - Objects with different placement rules.
 - Objects with different pools.
 - Objects with different storage classes.
 
 The full dedup process skips all the above and it also skips **compressed** and **user-encrypted** objects.
 
+The minimum size object for dedup is controlled by the following config option:
+
+.. confval:: rgw_dedup_min_obj_size_for_dedup
+
 *******************
 Estimate Processing
 *******************
@@ -85,6 +87,22 @@ If they are, we proceed with the deduplication:
 - copying the manifest from the source to the target.
 - removing all tail-objects on the target.
 
+***************
+Split Head Mode
+***************
+Dedup code can split the head object into 2 objects
+
+- one with attributes and no data and
+- a new tail-object with only data.
+
+The new-tail object will be deduped (unlike the head objects which can't be deduplicated)
+
+The split-Head mode is controlled by the following central configuration option:
+
+.. confval:: rgw_dedup_max_obj_size_for_split
+
+We will split head for objects with size smaller or equal to rgw_dedup_max_obj_size_for_split
+
 ************
 Memory Usage
 ************
index a163117fb8b2466daab58cf8765e88b9f964fb67..a72a039a282e2788cfed8c37f321981c2b4e3f54 100644 (file)
@@ -81,6 +81,29 @@ options:
   default: false
   services:
   - rgw
+- name: rgw_dedup_max_obj_size_for_split
+  type: size
+  level: advanced
+  desc: The maximum RGW object size to split head.
+        A value of 0 (zero) disables the split-head functionality
+  long_desc: Dedup code can split head object into 2 objects -
+             one with attributes and no data and
+             a new tail-object with only data.
+             The new-tail object will be deduped (unlike the head objects which
+             can't be deduplicated)
+             We will split head for objects with size 16MB or less
+  default: 16_M
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_dedup_min_obj_size_for_dedup
+  type: size
+  level: advanced
+  desc: The minimum RGW object size for dedup (0 means dedup all objects).
+  default: 64_K
+  services:
+  - rgw
+  with_legacy: true
 - name: rgw_max_chunk_size
   type: size
   level: advanced
index f841e8aad5a9efbbe06023c2039d0a610ce28bb7..c1174bc7ef4f8e607b46e86404ff4bf5d719c4b3 100644 (file)
@@ -83,6 +83,20 @@ namespace rgw::dedup {
   static inline constexpr unsigned MAX_STORAGE_CLASS_IDX = 128;
   using storage_class_idx_t = uint8_t;
 
+  //---------------------------------------------------------------------------
+  [[maybe_unused]] static int print_manifest(const DoutPrefixProvider *dpp,
+                                             RGWRados                 *rados,
+                                             const RGWObjManifest     &manifest)
+  {
+    unsigned idx = 0;
+    for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p, ++idx) {
+      rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
+      ldpp_dout(dpp, 20) << idx << "] " << raw_obj.oid << dendl;
+    }
+    ldpp_dout(dpp, 20) << "==============================================" << dendl;
+    return 0;
+  }
+
   //---------------------------------------------------------------------------
   void Background::DedupWatcher::handle_notify(uint64_t notify_id, uint64_t cookie,
                                                uint64_t notifier_id, bufferlist &bl)
@@ -321,7 +335,6 @@ namespace rgw::dedup {
   //---------------------------------------------------------------------------
   static int init_dedup_pool_ioctx(rgw::sal::RadosStore     *store,
                                    const DoutPrefixProvider *dpp,
-                                   bool                      create,
                                    librados::IoCtx          &ioctx)
   {
     const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool;
@@ -329,11 +342,10 @@ namespace rgw::dedup {
     auto rados_handle = store->getRados()->get_rados_handle();
     int64_t pool_id = rados_handle->pool_lookup(dedup_pool.name.c_str());
     if (pool_id >= 0) {
-      // TBD: what to do when create option is passed
       ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name
                          << " already exists, pool_id=" << pool_id << dendl;
     }
-    else if (create) {
+    else {
       pool_id = create_pool(store, dpp, pool_name);
       if (pool_id >= 0) {
         ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name
@@ -343,11 +355,6 @@ namespace rgw::dedup {
         return pool_id;
       }
     }
-    else {
-      ldpp_dout(dpp, 1) << __func__
-                        << "::ERR: pool doesn't exist and no create option" << dendl;
-      return -ENOENT;
-    }
 
     int ret = rgw_init_ioctx(dpp, rados_handle, dedup_pool, ioctx);
     if (unlikely(ret < 0)) {
@@ -382,7 +389,7 @@ namespace rgw::dedup {
     rados = store->getRados();
     rados_handle = rados->get_rados_handle();
     if (init_pool) {
-      int ret = init_dedup_pool_ioctx(store, dpp, true, d_dedup_cluster_ioctx);
+      int ret = init_dedup_pool_ioctx(store, dpp, d_dedup_cluster_ioctx);
       display_ioctx_state(dpp, d_dedup_cluster_ioctx, __func__);
       return ret;
     }
@@ -398,9 +405,14 @@ namespace rgw::dedup {
     d_cluster(dpp, cct, driver),
     d_watcher_ctx(this)
   {
-    d_min_obj_size_for_dedup = cct->_conf->rgw_max_chunk_size;
     d_head_object_size = cct->_conf->rgw_max_chunk_size;
-    //ceph_assert(4*1024*1024 == d_head_object_size);
+    d_min_obj_size_for_dedup = cct->_conf->rgw_dedup_min_obj_size_for_dedup;
+    d_max_obj_size_for_split = cct->_conf->rgw_dedup_max_obj_size_for_split;
+
+    ldpp_dout(dpp, 10) << "Config Vals::d_head_object_size=" << d_head_object_size
+                       << "::d_min_obj_size_for_dedup=" << d_min_obj_size_for_dedup
+                       << "::d_max_obj_size_for_split=" << d_max_obj_size_for_split
+                       << dendl;
 
     int ret = init_rados_access_handles(false);
     if (ret != 0) {
@@ -413,6 +425,16 @@ namespace rgw::dedup {
     d_heart_beat_max_elapsed_sec = 3;
   }
 
+  //------------------------------------------------------------------------------
+  uint64_t Background::__calc_deduped_bytes(uint16_t num_parts, uint64_t size_bytes)
+  {
+    return calc_deduped_bytes(d_head_object_size,
+                              d_min_obj_size_for_dedup,
+                              d_max_obj_size_for_split,
+                              num_parts,
+                              size_bytes);
+  }
+
   //---------------------------------------------------------------------------
   int Background::add_disk_rec_from_bucket_idx(disk_block_array_t     &disk_arr,
                                                const rgw::sal::Bucket *p_bucket,
@@ -434,7 +456,8 @@ namespace rgw::dedup {
     }
     ldpp_dout(dpp, 20) << __func__ << "::" << p_bucket->get_name() << "/"
                        << obj_name << " was written to block_idx="
-                       << rec_info.block_id << " rec_id=" << rec_info.rec_id << dendl;
+                       << rec_info.block_id << " rec_id=" << (int)rec_info.rec_id
+                       << dendl;
     return 0;
   }
 
@@ -450,12 +473,11 @@ namespace rgw::dedup {
     storage_class_idx_t sc_idx = remapper->remap(p_rec->stor_class, dpp,
                                                  &p_stats->failed_map_overflow);
     if (unlikely(sc_idx == remapper_t::NULL_IDX)) {
-      // TBD: need stat counters
       return -EOVERFLOW;
     }
     key_t key(p_rec->s.md5_high, p_rec->s.md5_low, size_4k_units,
               p_rec->s.num_parts, sc_idx);
-    bool has_shared_manifest = p_rec->has_shared_manifest();
+    bool has_shared_manifest = p_rec->s.flags.has_shared_manifest();
     ldpp_dout(dpp, 20) << __func__ << "::bucket=" << p_rec->bucket_name
                        << ", obj=" << p_rec->obj_name << ", block_id="
                        << (uint32_t)block_id << ", rec_id=" << (uint32_t)rec_id
@@ -504,6 +526,115 @@ namespace rgw::dedup {
   }
 
 #ifdef FULL_DEDUP_SUPPORT
+  //---------------------------------------------------------------------------
+  static inline std::string build_oid(const std::string& bucket_id,
+                                      const std::string& obj_name)
+  {
+    std::string oid;
+    oid.reserve(bucket_id.size() + 1 + obj_name.size());
+    oid.append(bucket_id).append("_").append(obj_name);
+    return oid;
+  }
+
+  //---------------------------------------------------------------------------
+  static int get_ioctx_internal(const DoutPrefixProvider* const dpp,
+                                rgw::sal::Driver* driver,
+                                rgw::sal::RadosStore* store,
+                                const std::string &obj_name,
+                                const std::string &instance,
+                                const rgw_bucket &rb,
+                                librados::IoCtx *p_ioctx,
+                                std::string *p_oid)
+  {
+    unique_ptr<rgw::sal::Bucket> bucket;
+    {
+      int ret = driver->load_bucket(dpp, rb, &bucket, null_yield);
+      if (unlikely(ret != 0)) {
+        ldpp_dout(dpp, 1) << __func__ << "::ERR: driver->load_bucket(): "
+                          << cpp_strerror(-ret) << dendl;
+        return ret;
+      }
+    }
+
+    string dummy_locator;
+    const rgw_obj_index_key key(obj_name, instance);
+    rgw_obj obj(bucket->get_key(), key);
+    get_obj_bucket_and_oid_loc(obj, *p_oid, dummy_locator);
+    RGWBucketInfo& bucket_info = bucket->get_info();
+    return store->get_obj_head_ioctx(dpp, bucket_info, obj, p_ioctx);
+  }
+
+  //---------------------------------------------------------------------------
+  static inline int get_ioctx(const DoutPrefixProvider* const dpp,
+                              rgw::sal::Driver* driver,
+                              rgw::sal::RadosStore* store,
+                              const disk_record_t *p_rec,
+                              librados::IoCtx *p_ioctx,
+                              std::string *p_oid)
+  {
+    rgw_bucket b{p_rec->tenant_name, p_rec->bucket_name, p_rec->bucket_id};
+    return get_ioctx_internal(dpp, driver, store, p_rec->obj_name, p_rec->instance,
+                              b, p_ioctx, p_oid);
+  }
+
+  //---------------------------------------------------------------------------
+  static inline std::string generate_split_head_tail_name(const RGWObjManifest &manifest)
+  {
+    static constexpr std::string_view shadow_string(RGW_OBJ_NS_SHADOW);
+    std::string_view suffix = "0";
+    const std::string &prefix = manifest.get_prefix();
+
+    std::string tail_name;
+    tail_name.reserve(shadow_string.size() + prefix.size() + suffix.size() + 1);
+    // TBD:
+    // it is unclear when RGW code pads with "_" before the shadow string
+    // It won't change correctness, but might look weird
+    //tail_name.append("_");
+    tail_name.append(shadow_string);
+    tail_name.append("_");
+    tail_name.append(prefix);
+    tail_name.append(suffix);
+    return tail_name;
+  }
+
+  //---------------------------------------------------------------------------
+  static void remove_created_tail_object(const DoutPrefixProvider *dpp,
+                                         librados::IoCtx &ioctx,
+                                         const std::string &tail_oid,
+                                         md5_stats_t *p_stats)
+  {
+    p_stats->rollback_tail_obj++;
+    int ret = ioctx.remove(tail_oid);
+    if (ret == 0) {
+      ldpp_dout(dpp, 20) << __func__ << "::" << tail_oid
+                         << " was successfully removed" << dendl;
+    }
+    else {
+      ldpp_dout(dpp, 10) << __func__ << "::failed ioctx.remove( " << tail_oid
+                         << " ), ret=" << ret << "::" << cpp_strerror(-ret) <<dendl;
+    }
+  }
+
+  //---------------------------------------------------------------------------
+  inline bool Background::should_split_head(uint64_t head_size, uint64_t obj_size)
+  {
+    // max_obj_size_for_split of zero means don't split!
+    return (head_size > 0            &&
+            d_max_obj_size_for_split &&
+            obj_size <= d_max_obj_size_for_split);
+  }
+
+  //---------------------------------------------------------------------------
+  [[maybe_unused]] static bool empty_rgw_bucket(const rgw_bucket &b)
+  {
+    return (b.tenant.empty()    &&
+            b.name.empty()      &&
+            b.marker.empty()    &&
+            b.bucket_id.empty() &&
+            b.explicit_placement.data_pool.empty()       &&
+            b.explicit_placement.data_extra_pool.empty() &&
+            b.explicit_placement.index_pool.empty());
+  }
 
   static constexpr uint64_t cost = 1; // 1 throttle unit per request
   static constexpr uint64_t id = 0; // ids unused
@@ -527,15 +658,17 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  int Background::free_tail_objs_by_manifest(const string   &ref_tag,
-                                             const string   &oid,
-                                             RGWObjManifest &tgt_manifest)
+  int Background::free_tail_objs_by_manifest(const string         &ref_tag,
+                                             const string         &oid,
+                                             const RGWObjManifest &manifest)
   {
     unsigned idx = 0;
-    for (auto p = tgt_manifest.obj_begin(dpp); p != tgt_manifest.obj_end(dpp); ++p, ++idx) {
+    std::unique_ptr<rgw::Aio> aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, null_yield);
+    for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p, ++idx) {
       rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
       if (oid == raw_obj.oid) {
-        ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: " << raw_obj.oid << dendl;
+        ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: "
+                           << raw_obj.oid << dendl;
         continue;
       }
 
@@ -546,21 +679,25 @@ namespace rgw::dedup {
                           << obj << dendl;
         continue;
       }
-      librados::IoCtx ioctx = obj.ioctx;
-      ldpp_dout(dpp, 20) << __func__ << "::removing tail object: " << raw_obj.oid
-                         << dendl;
+      ldpp_dout(dpp, 20) << __func__ << "::removing tail object: " << raw_obj.oid << dendl;
       d_ctl.metadata_access_throttle.acquire();
-      ret = ioctx.remove(raw_obj.oid);
+      ObjectWriteOperation op;
+      rgw::AioResultList completed;
+      cls_refcount_put(op, ref_tag, true);
+      completed = aio->get(obj.obj,
+                           rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield),
+                           cost, id);
     }
-
+    rgw::AioResultList completed = aio->drain();
     return 0;
   }
 
   //---------------------------------------------------------------------------
-  int Background::rollback_ref_by_manifest(const string   &ref_tag,
-                                           const string   &oid,
-                                           RGWObjManifest &manifest)
+  int Background::rollback_ref_by_manifest(const string         &ref_tag,
+                                           const string         &oid,
+                                           const RGWObjManifest &manifest)
   {
+    ldpp_dout(dpp, 20) << __func__ << "::" << oid << dendl;
     unsigned idx = 0;
     int ret_code = 0;
     std::unique_ptr<rgw::Aio> aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, null_yield);
@@ -595,9 +732,9 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  int Background::inc_ref_count_by_manifest(const string   &ref_tag,
-                                            const string   &oid,
-                                            RGWObjManifest &manifest)
+  int Background::inc_ref_count_by_manifest(const string         &ref_tag,
+                                            const string         &oid,
+                                            const RGWObjManifest &manifest)
   {
     std::unique_ptr<rgw::Aio> aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, null_yield);
     rgw::AioResultList all_results;
@@ -614,14 +751,15 @@ namespace rgw::dedup {
       ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj);
       if (ret < 0) {
         ldpp_dout(dpp, 1) << __func__ << "::ERR: manifest::failed to open context "
-                          << obj << dendl;
+                          << raw_obj.oid << dendl;
         break;
       }
 
       ObjectWriteOperation op;
       cls_refcount_get(op, ref_tag, true);
       d_ctl.metadata_access_throttle.acquire();
-      ldpp_dout(dpp, 20) << __func__ << "::inc ref-count on tail object: " << raw_obj.oid << dendl;
+      ldpp_dout(dpp, 20) << __func__ << "::inc ref-count on tail object: "
+                         << raw_obj.oid << "::" << obj.obj.oid << dendl;
       rgw::AioResultList completed = aio->get(obj.obj,
                                               rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield),
                                               cost, id);
@@ -629,14 +767,14 @@ namespace rgw::dedup {
       all_results.splice(all_results.end(), completed);
       if (ret < 0) {
         ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to copy obj=" << obj
-                          << ", the error code = " << ret << dendl;
+                          << ", ret=" << ret << " err is " << cpp_strerror(-ret) << dendl;
         break;
       }
     }
 
     if (ret == 0) {
       rgw::AioResultList completed = aio->drain();
-      int ret = rgw::check_for_errors(completed);
+      ret = rgw::check_for_errors(completed);
       all_results.splice(all_results.end(), completed);
       if (ret == 0) {
         return 0;
@@ -647,13 +785,14 @@ namespace rgw::dedup {
       }
     }
 
-    // if arrived here we failed somewhere -> rollback all ref-inc operations
     /* wait all pending op done */
     rgw::AioResultList completed = aio->drain();
     all_results.splice(all_results.end(), completed);
     int ret2 = 0;
     for (auto& aio_res : all_results) {
       if (aio_res.result < 0) {
+        ldpp_dout(dpp, 10) << __func__ << "::skip failed refcount inc: "
+                           << aio_res.obj.oid << dendl;
         continue; // skip errors
       }
       rgw_rados_ref obj;
@@ -664,219 +803,302 @@ namespace rgw::dedup {
 
       ObjectWriteOperation op;
       cls_refcount_put(op, ref_tag, true);
+      ldpp_dout(dpp, 10) << __func__ << "::rollback refcount inc on: "
+                         << aio_res.obj.oid << dendl;
       rgw::AioResultList completed = aio->get(obj.obj,
                                               rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield),
                                               cost, id);
       ret2 = rgw::check_for_errors(completed);
       if (ret2 < 0) {
-        ldpp_dout(dpp, 1) << __func__ << "::ERR: cleanup after error failed to drop reference on obj=" << aio_res.obj << dendl;
+        ldpp_dout(dpp, 1) << __func__ << "::ERR: cleanup after error failed to drop reference on obj="
+                          << aio_res.obj << dendl;
       }
     }
     completed = aio->drain();
     ret2 = rgw::check_for_errors(completed);
     if (ret2 < 0) {
       ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to drain rollback ios, ret="
-                        << ret2 <<dendl;
+                        << ret2 << dendl;
     }
 
     return ret;
   }
 
   //---------------------------------------------------------------------------
-  static int get_ioctx(const DoutPrefixProvider* const dpp,
-                       rgw::sal::Driver* driver,
-                       rgw::sal::RadosStore* store,
-                       const disk_record_t *p_rec,
-                       librados::IoCtx *p_ioctx,
-                       std::string *p_oid)
+  static void dedup_object_log(const DoutPrefixProvider *dpp,
+                               const disk_record_t *p_src_rec,
+                               const disk_record_t *p_tgt_rec,
+                               uint64_t             src_head_size,
+                               uint64_t             tgt_head_size,
+                               const bufferlist    &etag_bl)
   {
-    unique_ptr<rgw::sal::Bucket> bucket;
-    {
-      rgw_bucket b{p_rec->tenant_name, p_rec->bucket_name, p_rec->bucket_id};
-      int ret = driver->load_bucket(dpp, b, &bucket, null_yield);
-      if (unlikely(ret != 0)) {
-        ldpp_dout(dpp, 1) << __func__ << "::ERR: driver->load_bucket(): "
-                          << cpp_strerror(-ret) << dendl;
-        return ret;
-      }
-    }
+    ldpp_dout(dpp, 20) << __func__ << "::DEDUP SRC:"
+                       << p_src_rec->bucket_name << "/" << p_src_rec->obj_name
+                       << "(" << src_head_size << ") ::TGT:"
+                       << p_tgt_rec->bucket_name << "/" << p_tgt_rec->obj_name
+                       << "(" << tgt_head_size << ")" << dendl;
+    ldpp_dout(dpp, 20) << __func__ << "::num_parts=" << p_tgt_rec->s.num_parts
+                       << "::ETAG=" << etag_bl.to_str() << dendl;
+  }
 
-    string dummy_locator;
-    const rgw_obj_index_key key(p_rec->obj_name, p_rec->instance);
-    rgw_obj obj(bucket->get_key(), key);
-    get_obj_bucket_and_oid_loc(obj, *p_oid, dummy_locator);
-    RGWBucketInfo& bucket_info = bucket->get_info();
-    return store->get_obj_head_ioctx(dpp, bucket_info, obj, p_ioctx);
+  //---------------------------------------------------------------------------
+  /* The target (TGT) manifest must inherit the source (SRC) manifest, as both share
+   *  the same tail objects.
+   * However, the TGT head object needs to maintain its unique identity, including
+   *  its head-placement-rule and head-object parameters, which are stored in
+   * `rgw_obj`.
+   *
+   * The size of the TGT head object must be adjusted to match the SRC head size.
+   * This is straightforward when Split-Head is enabled, as both heads can be set to
+   *  zero and all data is stored in the tail.
+   *
+   * A potential issue arises if the SRC and TGT have different head sizes and
+   *  Split-Head is not used.
+   * While this scenario is unlikely in practice (as head-size is almost always 4MB),
+   *  if it were to occur, we should abort the deduplication process to prevent data
+   *  inconsistencies.
+   */
+  static void adjust_target_manifest(const RGWObjManifest &src_manifest,
+                                     const RGWObjManifest &tgt_manifest,
+                                     bufferlist           &new_manifest_bl)
+  {
+    // first create new_manifest from the src_manifest
+    RGWObjManifest new_manifest(src_manifest);
+
+    // then, adjust head-object parameters to match the tgt_manifest
+    const uint64_t src_head_size = src_manifest.get_head_size();
+    const auto& tgt_placement_rule = tgt_manifest.get_head_placement_rule();
+    const rgw_obj &tgt_head_obj = tgt_manifest.get_obj();
+
+    new_manifest.set_head(tgt_placement_rule, tgt_head_obj, src_head_size);
+    encode(new_manifest, new_manifest_bl);
   }
 
   //---------------------------------------------------------------------------
-  static void init_cmp_pairs(const disk_record_t *p_rec,
-                             const bufferlist    &etag_bl,
-                             bufferlist          &hash_bl, // OUT PARAM
+  static void init_cmp_pairs(const DoutPrefixProvider *dpp,
+                             const disk_record_t *p_rec,
+                             const bufferlist &etag_bl,
+                             bufferlist &hash_bl, // OUT PARAM
                              librados::ObjectWriteOperation *p_op)
   {
     p_op->cmpxattr(RGW_ATTR_ETAG, CEPH_OSD_CMPXATTR_OP_EQ, etag_bl);
-    // TBD: do we really need the secondary compare using the full manifest?
-    // Can replace it with something cheaper like size/version?
-    p_op->cmpxattr(RGW_ATTR_MANIFEST, CEPH_OSD_CMPXATTR_OP_EQ, p_rec->manifest_bl);
+    bufferlist ref_tag_bl;
+    ref_tag_bl.append(p_rec->ref_tag);
+    if (p_rec->s.flags.is_ref_tag_from_tail()) {
+      p_op->cmpxattr(RGW_ATTR_TAIL_TAG, CEPH_OSD_CMPXATTR_OP_EQ, ref_tag_bl);
+    }
+    else {
+      p_op->cmpxattr(RGW_ATTR_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, ref_tag_bl);
+    }
 
     // BLAKE3 hash has 256 bit splitted into multiple 64bit units
-    const unsigned units = (256 / (sizeof(uint64_t)*8));
-    static_assert(units == 4);
-    for (unsigned i = 0; i < units; i++) {
+    for (unsigned i = 0; i < HASH_UNITS; i++) {
       ceph::encode(p_rec->s.hash[i], hash_bl);
     }
 
     if (!p_rec->s.flags.hash_calculated()) {
+      ldpp_dout(dpp, 20) << __func__ << "::CMP HASH " << p_rec->obj_name << dendl;
       p_op->cmpxattr(RGW_ATTR_BLAKE3, CEPH_OSD_CMPXATTR_OP_EQ, hash_bl);
     }
   }
 
   //---------------------------------------------------------------------------
-  int Background::dedup_object(const disk_record_t *p_src_rec,
-                               const disk_record_t *p_tgt_rec,
-                               md5_stats_t         *p_stats,
-                               bool                 has_shared_manifest_src)
+  static inline void build_manifest_hash_bl(const bufferlist &manifest_bl,
+                                            bufferlist &manifest_hash_bl)
   {
-    RGWObjManifest src_manifest;
-    try {
-      auto bl_iter = p_src_rec->manifest_bl.cbegin();
-      decode(src_manifest, bl_iter);
-    } catch (buffer::error& err) {
-      ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad src manifest" << dendl;
-      return -EINVAL;
-    }
-    RGWObjManifest tgt_manifest;
-    try {
-      auto bl_iter = p_tgt_rec->manifest_bl.cbegin();
-      decode(tgt_manifest, bl_iter);
-    } catch (buffer::error& err) {
-      ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad tgt manifest" << dendl;
-      return -EINVAL;
-    }
-    ldpp_dout(dpp, 20) << __func__ << "::DEDUP From: "
-                       << p_src_rec->bucket_name << "/" << p_src_rec->obj_name << " -> "
-                       << p_tgt_rec->bucket_name << "/" << p_tgt_rec->obj_name << dendl;
+    bufferlist hash_bl;
+    crypto::digest<crypto::SHA1>(manifest_bl).encode(hash_bl);
+    // Use a shorter hash (64bit instead of 160bit)
+    hash_bl.splice(0, 8, &manifest_hash_bl);
+  }
 
+  //---------------------------------------------------------------------------
+  int Background::dedup_object(disk_record_t                *p_src_rec,
+                               disk_record_t                *p_tgt_rec,
+                               const RGWObjManifest         &src_manifest,
+                               const RGWObjManifest         &tgt_manifest,
+                               md5_stats_t                  *p_stats,
+                               const dedup_table_t::value_t *p_src_val,
+                               const std::string            &tail_oid)
+  {
+    const uint64_t src_head_size = src_manifest.get_head_size();
+    const uint64_t tgt_head_size = tgt_manifest.get_head_size();
     bufferlist etag_bl;
     etag_to_bufferlist(p_tgt_rec->s.md5_high, p_tgt_rec->s.md5_low, p_tgt_rec->s.num_parts, &etag_bl);
-    ldpp_dout(dpp, 20) << __func__ << "::num_parts=" << p_tgt_rec->s.num_parts
-                       << "::ETAG=" << etag_bl.to_str() << dendl;
-
-    bufferlist hash_bl, manifest_hash_bl, tgt_hash_bl;
-    crypto::digest<crypto::SHA1>(p_src_rec->manifest_bl).encode(hash_bl);
-    // Use a shorter hash (64bit instead of 160bit)
-    hash_bl.splice(0, 8, &manifest_hash_bl);
-    librados::ObjectWriteOperation tgt_op;
-    init_cmp_pairs(p_tgt_rec, etag_bl, tgt_hash_bl, &tgt_op);
-    tgt_op.setxattr(RGW_ATTR_SHARE_MANIFEST, manifest_hash_bl);
-    tgt_op.setxattr(RGW_ATTR_MANIFEST, p_src_rec->manifest_bl);
-    if (p_tgt_rec->s.flags.hash_calculated()) {
-      tgt_op.setxattr(RGW_ATTR_BLAKE3, tgt_hash_bl);
-      p_stats->set_hash_attrs++;
+    bool should_print_debug = cct->_conf->subsys.should_gather<ceph_subsys_rgw_dedup, 20>();
+    if (unlikely(should_print_debug)) {
+      dedup_object_log(dpp, p_src_rec, p_tgt_rec, src_head_size, tgt_head_size, etag_bl);
     }
 
     std::string src_oid, tgt_oid;
     librados::IoCtx src_ioctx, tgt_ioctx;
-    int ret1 = get_ioctx(dpp, driver, store, p_src_rec, &src_ioctx, &src_oid);
-    int ret2 = get_ioctx(dpp, driver, store, p_tgt_rec, &tgt_ioctx, &tgt_oid);
-    if (unlikely(ret1 != 0 || ret2 != 0)) {
-      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed get_ioctx()" << dendl;
-      return (ret1 ? ret1 : ret2);
+    int ret = get_ioctx(dpp, driver, store, p_src_rec, &src_ioctx, &src_oid);
+    if (unlikely(ret != 0)) {
+      // can't remove created tail object without an ioctx handle
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed SRC get_ioctx()" << dendl;
+      return ret;
     }
 
-    // TBD: Do we need to remove target RGW_ATTR_TAIL_TAG??
-    string ref_tag = p_tgt_rec->ref_tag;
+    ret = get_ioctx(dpp, driver, store, p_tgt_rec, &tgt_ioctx, &tgt_oid);
+    if (unlikely(ret != 0)) {
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed TGT get_ioctx()" << dendl;
+      if (p_src_rec->s.flags.is_split_head()) {
+        remove_created_tail_object(dpp, src_ioctx, tail_oid, p_stats);
+      }
+      return ret;
+    }
+
+    // we don't dedup head-objects so head-size must match (unless split-head)
+    // see explanation in adjust_target_manifest()
+    if (unlikely(src_head_size != 0 && src_head_size != tgt_head_size)) {
+      ldpp_dout(dpp, 5) << __func__ << "::abort! src_head_size=" << src_head_size
+                        << "::tgt_head_size=" << tgt_head_size << dendl;
+      if (p_src_rec->s.flags.is_split_head()) {
+        remove_created_tail_object(dpp, src_ioctx, tail_oid, p_stats);
+      }
+      // TBD: can we create a test case (requires control over head-object-size)??
+      return -ECANCELED;
+    }
+
+    const string &ref_tag = p_tgt_rec->ref_tag;
     ldpp_dout(dpp, 20) << __func__ << "::ref_tag=" << ref_tag << dendl;
-    int ret = inc_ref_count_by_manifest(ref_tag, src_oid, src_manifest);
-    if (ret == 0) {
-      d_ctl.metadata_access_throttle.acquire();
-      ldpp_dout(dpp, 20) << __func__ << "::send TGT CLS (Shared_Manifest)" << dendl;
-      ret = tgt_ioctx.operate(tgt_oid, &tgt_op);
-      if (unlikely(ret != 0)) {
-        ldpp_dout(dpp, 1) << __func__ << "::ERR: failed tgt_ioctx.operate("
-                          << tgt_oid << "), err is " << cpp_strerror(-ret) << dendl;
-        rollback_ref_by_manifest(ref_tag, src_oid, src_manifest);
-        return ret;
+    ret = inc_ref_count_by_manifest(ref_tag, src_oid, src_manifest);
+    if (unlikely(ret != 0)) {
+      if (p_src_rec->s.flags.is_split_head()) {
+        remove_created_tail_object(dpp, src_ioctx, tail_oid, p_stats);
       }
+      return ret;
+    }
 
-      // free tail objects based on TGT manifest
-      free_tail_objs_by_manifest(ref_tag, tgt_oid, tgt_manifest);
+    bufferlist manifest_hash_bl;
+    build_manifest_hash_bl(p_src_rec->manifest_bl, manifest_hash_bl);
 
-      if (!has_shared_manifest_src) {
-        // When SRC OBJ A has two or more dups (B, C) we set SHARED_MANIFEST
-        // after deduping B and update it in dedup_table, but don't update the
-        // disk-record (as require an expensive random-disk-write).
-        // When deduping C we can trust the shared_manifest state in the table and
-        // skip a redundant update to SRC object attribute
+    if (!p_src_val->has_shared_manifest()) {
+      // When SRC OBJ A has two or more dups (B, C) we set SHARED_MANIFEST
+      // after deduping B and update it in dedup_table, but don't update the
+      // disk-record (as require an expensive random-disk-write).
+      // When deduping C we can trust the shared_manifest state in the table and
+      // skip a redundant update to SRC object attribute
+      librados::ObjectWriteOperation src_op;
+      {
         bufferlist src_hash_bl;
-        librados::ObjectWriteOperation src_op;
-        init_cmp_pairs(p_src_rec, etag_bl, src_hash_bl, &src_op);
+        init_cmp_pairs(dpp, p_src_rec, etag_bl, src_hash_bl, &src_op);
         src_op.setxattr(RGW_ATTR_SHARE_MANIFEST, manifest_hash_bl);
-        if (p_src_rec->s.flags.hash_calculated(){
+        if (p_src_rec->s.flags.hash_calculated() && !p_src_val->has_valid_hash()){
           src_op.setxattr(RGW_ATTR_BLAKE3, src_hash_bl);
+          ldpp_dout(dpp, 20) << __func__ <<"::Set SRC Strong Hash in CLS"<< dendl;
           p_stats->set_hash_attrs++;
         }
+      }
 
-        d_ctl.metadata_access_throttle.acquire();
-        ldpp_dout(dpp, 20) << __func__ <<"::send SRC CLS (Shared_Manifest)"<< dendl;
-        ret = src_ioctx.operate(src_oid, &src_op);
-        if (unlikely(ret != 0)) {
-          ldpp_dout(dpp, 1) << __func__ << "::ERR: failed src_ioctx.operate("
-                            << src_oid << "), err is " << cpp_strerror(-ret)<<dendl;
-          return ret;
+      if (p_src_rec->s.flags.is_split_head()) {
+        ldpp_dout(dpp, 20) << __func__ <<"::SRC-Split (truncate)" << dendl;
+        src_op.setxattr(RGW_ATTR_MANIFEST, p_src_rec->manifest_bl);
+        src_op.truncate(0);
+        p_stats->split_head_src++;
+      }
+      d_ctl.metadata_access_throttle.acquire();
+      ldpp_dout(dpp, 20) << __func__ <<"::send SRC CLS"<< dendl;
+      ret = src_ioctx.operate(src_oid, &src_op);
+      if (unlikely(ret != 0)) {
+        ldpp_dout(dpp, 1) << __func__ << "::ERR: failed src_ioctx.operate("
+                          << src_oid << "), err is " << cpp_strerror(-ret)<<dendl;
+        rollback_ref_by_manifest(ref_tag, src_oid, src_manifest);
+        if (p_src_rec->s.flags.is_split_head()) {
+          remove_created_tail_object(dpp, src_ioctx, tail_oid, p_stats);
         }
+        return ret;
+      }
+    }
+
+    librados::ObjectWriteOperation tgt_op;
+    {
+      bufferlist tgt_hash_bl;
+      init_cmp_pairs(dpp, p_tgt_rec, etag_bl, tgt_hash_bl, &tgt_op);
+      tgt_op.setxattr(RGW_ATTR_SHARE_MANIFEST, manifest_hash_bl);
+      bufferlist new_manifest_bl;
+      adjust_target_manifest(src_manifest, tgt_manifest, new_manifest_bl);
+      tgt_op.setxattr(RGW_ATTR_MANIFEST, new_manifest_bl);
+      //tgt_op.setxattr(RGW_ATTR_MANIFEST, p_src_rec->manifest_bl);
+      if (p_tgt_rec->s.flags.hash_calculated()) {
+        tgt_op.setxattr(RGW_ATTR_BLAKE3, tgt_hash_bl);
+        ldpp_dout(dpp, 20) << __func__ <<"::Set TGT Strong Hash in CLS"<< dendl;
+        p_stats->set_hash_attrs++;
       }
     }
 
+    // If failed before this point and split-head -> remove the new tail-object
+    if (src_head_size == 0 && tgt_head_size > 0) {
+      ldpp_dout(dpp, 20) << __func__ <<"::TGT-Split OP (truncate)" << dendl;
+      p_tgt_rec->s.flags.set_split_head();
+      tgt_op.truncate(0);
+      p_stats->split_head_tgt++;
+    }
+    d_ctl.metadata_access_throttle.acquire();
+    ldpp_dout(dpp, 20) << __func__ << "::send TGT CLS" << dendl;
+    ret = tgt_ioctx.operate(tgt_oid, &tgt_op);
+    if (unlikely(ret != 0)) {
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed tgt_ioctx.operate("
+                        << tgt_oid << "), err is " << cpp_strerror(-ret) << dendl;
+      rollback_ref_by_manifest(ref_tag, src_oid, src_manifest);
+      return ret;
+    }
+
+    // free tail objects based on TGT manifest
+    free_tail_objs_by_manifest(ref_tag, tgt_oid, tgt_manifest);
+
     // do we need to set compression on the head object or is it set on tail?
     // RGW_ATTR_COMPRESSION
     return ret;
   }
 
   //---------------------------------------------------------------------------
-  int Background::calc_object_blake3(const disk_record_t *p_rec, uint8_t *p_hash)
+  int Background::calc_object_blake3(const RGWObjManifest &manifest,
+                                     disk_record_t *p_rec,
+                                     uint8_t *p_hash,
+                                     blake3_hasher *p_pre_calc_hmac)
   {
-    ldpp_dout(dpp, 20) << __func__ << "::obj_name=" << p_rec->obj_name << dendl;
-    RGWObjManifest manifest;
-    try {
-      auto bl_iter = p_rec->manifest_bl.cbegin();
-      decode(manifest, bl_iter);
-    } catch (buffer::error& err) {
-      ldpp_dout(dpp, 1)  << __func__ << "::ERROR: bad src manifest for: "
-                         << p_rec->obj_name << dendl;
-      return -EINVAL;
+    ldpp_dout(dpp, 20) << __func__ << "::p_rec->obj_name=" << p_rec->obj_name << dendl;
+
+    blake3_hasher _hmac, *p_hmac = nullptr;
+    if (!p_pre_calc_hmac) {
+      blake3_hasher_init(&_hmac);
+      p_hmac = &_hmac;
+    }
+    else {
+      p_hmac = p_pre_calc_hmac;
     }
 
-    blake3_hasher hmac;
-    blake3_hasher_init(&hmac);
     for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p) {
-      rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
-      rgw_rados_ref obj;
-      int ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj);
-      if (ret < 0) {
-        ldpp_dout(dpp, 1) << __func__ << "::failed rgw_get_rados_ref() for oid: "
-                          << raw_obj.oid << ", err is " << cpp_strerror(-ret) << dendl;
-        return ret;
-      }
+      uint64_t offset = p.get_stripe_ofs();
+      const rgw_obj_select& os = p.get_location();
+      if (offset > 0 || !p_pre_calc_hmac) {
+        rgw_raw_obj raw_obj = os.get_raw_obj(rados);
+        rgw_rados_ref obj;
+        int ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj);
+        if (ret < 0) {
+          ldpp_dout(dpp, 1) << __func__ << "::failed rgw_get_rados_ref() for oid="
+                            << raw_obj.oid << ", err is " << cpp_strerror(-ret) << dendl;
+          return ret;
+        }
 
-      bufferlist bl;
-      librados::IoCtx ioctx = obj.ioctx;
-      // read full object
-      ret = ioctx.read(raw_obj.oid, bl, 0, 0);
-      if (ret > 0) {
+        librados::IoCtx ioctx = obj.ioctx;
+        bufferlist bl;
+        // read full object
+        ret = ioctx.read(raw_obj.oid, bl, 0, 0);
+        if (unlikely(ret <= 0)) {
+          ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to read oid "
+                            << raw_obj.oid  << ", err is " << cpp_strerror(-ret) << dendl;
+          return ret;
+        }
         for (const auto& bptr : bl.buffers()) {
-          blake3_hasher_update(&hmac, (const unsigned char *)bptr.c_str(), bptr.length());
+          blake3_hasher_update(p_hmac, (const unsigned char *)bptr.c_str(), bptr.length());
         }
       }
-      else {
-        ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to read " << raw_obj.oid
-                          << ", error is " << cpp_strerror(-ret) << dendl;
-        return ret;
-      }
     }
-
-    blake3_hasher_finalize(&hmac, p_hash, BLAKE3_OUT_LEN);
+    blake3_hasher_finalize(p_hmac, p_hash, BLAKE3_OUT_LEN);
+    p_rec->s.flags.set_hash_calculated();
+    p_rec->s.flags.set_has_valid_hash();
     return 0;
   }
 
@@ -890,28 +1112,58 @@ namespace rgw::dedup {
   {
     ldpp_dout(dpp, 20) << __func__ << "::bucket=" << p_tgt_rec->bucket_name
                        << ", obj=" << p_tgt_rec->obj_name
+                       << ", bytes_size=" << p_tgt_rec->s.obj_bytes_size
                        << ", block_id=" << block_id
-                       << ", rec_id=" << (int)rec_id
-                       << ", md5_shard=" << (int)md5_shard << dendl;
-
-    ldpp_dout(dpp, 20) << __func__ << "::md5_shard=" << (int)md5_shard
-                       << "::" << p_tgt_rec->bucket_name
-                       << "/" << p_tgt_rec->obj_name
+                       << ", rec_id=" << (int)rec_id << "\n"
+                       << ", md5_shard=" << (int)md5_shard
                        << "::num_parts=" << p_tgt_rec->s.num_parts
                        << "::ETAG=" << std::hex << p_tgt_rec->s.md5_high
                        << p_tgt_rec->s.md5_low << std::dec << dendl;
   }
 
   //---------------------------------------------------------------------------
-  int Background::add_obj_attrs_to_record(rgw_bucket            *p_rb,
-                                          disk_record_t         *p_rec,
+  static inline bool invalid_tail_placement(const rgw_bucket_placement& tail_placement)
+  {
+    return (tail_placement.bucket.name.empty() || tail_placement.placement_rule.name.empty());
+  }
+
+  //---------------------------------------------------------------------------
+  static void set_explicit_tail_placement(const DoutPrefixProvider* dpp,
+                                          RGWObjManifest *p_manifest,// IN-OUT PARAM
+                                          md5_stats_t *p_stats)
+  {
+    p_stats->manifest_no_tail_placement++;
+    ldpp_dout(dpp, 20) << __func__ << "::invalid_tail_placement -> update" << dendl;
+    const rgw_bucket_placement& tail_placement = p_manifest->get_tail_placement();
+    const rgw_bucket *p_bucket = &tail_placement.bucket;
+
+    if (tail_placement.bucket.name.empty()) {
+      // bucket was not set in tail_placement, force the head bucket explicitly
+      const rgw_obj& head_obj = p_manifest->get_obj();
+      p_bucket = &head_obj.bucket;
+    }
+
+    if (tail_placement.placement_rule.name.empty()) {
+      // explicitly use the head_placement_rule for tail objects and update bucket
+      // if needed
+      const auto &head_placement_rule = p_manifest->get_head_placement_rule();
+      p_manifest->set_tail_placement(head_placement_rule, *p_bucket);
+    }
+    else {
+      // otherwise, keep the tail_placement_rule in place (but still update bucket)
+      p_manifest->set_tail_placement(tail_placement.placement_rule, *p_bucket);
+    }
+  }
+
+  //---------------------------------------------------------------------------
+  int Background::add_obj_attrs_to_record(disk_record_t         *p_rec,
                                           const rgw::sal::Attrs &attrs,
-                                          dedup_table_t         *p_table,
                                           md5_stats_t           *p_stats) /*IN-OUT*/
   {
     // if TAIL_TAG exists -> use it as ref-tag, eitherwise take ID_TAG
     auto itr = attrs.find(RGW_ATTR_TAIL_TAG);
     if (itr != attrs.end()) {
+      p_rec->s.flags.set_ref_tag_from_tail();
       p_rec->ref_tag = itr->second.to_str();
     }
     else {
@@ -929,10 +1181,11 @@ namespace rgw::dedup {
     // clear bufferlist first
     p_rec->manifest_bl.clear();
 
+    bool need_to_split_head = false;
+    RGWObjManifest manifest;
     itr = attrs.find(RGW_ATTR_MANIFEST);
     if (itr != attrs.end()) {
       const bufferlist &bl = itr->second;
-      RGWObjManifest manifest;
       try {
         auto bl_iter = bl.cbegin();
         decode(manifest, bl_iter);
@@ -941,12 +1194,13 @@ namespace rgw::dedup {
                            << "::ERROR: unable to decode manifest" << dendl;
         return -EINVAL;
       }
+      need_to_split_head = should_split_head(manifest.get_head_size(),
+                                             p_rec->s.obj_bytes_size);
 
       // force explicit tail_placement as the dedup could be on another bucket
       const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
-      if (tail_placement.bucket.name.empty()) {
-        ldpp_dout(dpp, 20) << __func__ << "dedup::updating tail placement" << dendl;
-        manifest.set_tail_placement(tail_placement.placement_rule, *p_rb);
+      if (unlikely(invalid_tail_placement(tail_placement))) {
+        set_explicit_tail_placement(dpp, &manifest, p_stats);
         encode(manifest, p_rec->manifest_bl);
       }
       else {
@@ -958,6 +1212,18 @@ namespace rgw::dedup {
       ldpp_dout(dpp, 5)  << __func__ << "::ERROR: no manifest" << dendl;
       return -EINVAL;
     }
+    const auto &head_placement_rule = manifest.get_head_placement_rule();
+    const std::string& storage_class =
+      rgw_placement_rule::get_canonical_storage_class(head_placement_rule.storage_class);
+
+    // p_rec holds the storage_class value taken from the bucket-index/obj-attr
+    if (unlikely(storage_class != p_rec->stor_class)) {
+      ldpp_dout(dpp, 5) << __func__ << "::ERROR::manifest storage_class="
+                        << storage_class << " != " << "::bucket-index storage_class="
+                        << p_rec->stor_class << dendl;
+      p_stats->different_storage_class++;
+      return -EINVAL;
+    }
 
     itr = attrs.find(RGW_ATTR_SHARE_MANIFEST);
     if (itr != attrs.end()) {
@@ -983,14 +1249,13 @@ namespace rgw::dedup {
     if (itr != attrs.end()) {
       try {
         auto bl_iter = itr->second.cbegin();
-        // BLAKE3 hash 256 bit splitted into multiple 64bit units
-        const unsigned units = (256 / (sizeof(uint64_t)*8));
-        static_assert(units == 4);
-        for (unsigned i = 0; i < units; i++) {
+        // BLAKE3 hash is 256 bits split into multiple 64-bit units
+        for (unsigned i = 0; i < HASH_UNITS; i++) {
           uint64_t val;
           ceph::decode(val, bl_iter);
           p_rec->s.hash[i] = val;
         }
+        p_rec->s.flags.set_has_valid_hash();
         p_stats->valid_hash_attrs++;
         return 0;
       } catch (buffer::error& err) {
@@ -999,16 +1264,17 @@ namespace rgw::dedup {
       }
     }
 
+    // if we arrived here we need to calculate the strong hash
     p_stats->invalid_hash_attrs++;
-    // TBD: redundant memset...
     memset(p_rec->s.hash, 0, sizeof(p_rec->s.hash));
-    // BLAKE3_OUT_LEN is 32 Bytes
-    int ret = calc_object_blake3(p_rec, (uint8_t*)p_rec->s.hash);
-    if (ret == 0) {
-      p_rec->s.flags.set_hash_calculated();
-    }
 
-    return ret;
+    if (!need_to_split_head) {
+      ldpp_dout(dpp, 20) << __func__ << "::CALC Object Strong Hash::"
+                         << p_rec->obj_name << dendl;
+      return calc_object_blake3(manifest, p_rec, (uint8_t*)p_rec->s.hash);
+    }
+    // else, defer strong-hash calculation to the next step and piggyback on split-head
+    return 0;
   }
 
   //---------------------------------------------------------------------------
@@ -1035,7 +1301,6 @@ namespace rgw::dedup {
     storage_class_idx_t sc_idx = remapper->remap(p_rec->stor_class, dpp,
                                                  &p_stats->failed_map_overflow);
     if (unlikely(sc_idx == remapper_t::NULL_IDX)) {
-      // TBD: need stat counters
       return -EOVERFLOW;
     }
     key_t key_from_bucket_index(p_rec->s.md5_high, p_rec->s.md5_low, size_4k_units,
@@ -1043,7 +1308,7 @@ namespace rgw::dedup {
     dedup_table_t::value_t src_val;
     int ret = p_table->get_val(&key_from_bucket_index, &src_val);
     if (ret != 0) {
-      if (ondisk_byte_size <= d_min_obj_size_for_dedup && p_rec->s.num_parts == 0) {
+      if (!dedupable_object(p_rec->multipart_object(), d_min_obj_size_for_dedup, ondisk_byte_size)) {
         // record has no valid entry in table because it is a too small
         // It was loaded to table for calculation and then purged
         p_stats->skipped_purged_small++;
@@ -1113,6 +1378,19 @@ namespace rgw::dedup {
     }
 
     const rgw::sal::Attrs& attrs = p_obj->get_attrs();
+    if (src_val.has_shared_manifest() && (attrs.find(RGW_ATTR_SHARE_MANIFEST) != attrs.end())) {
+      // A shared_manifest object can't be a dedup target
+      // We only need to keep a single shared_manifest object
+      // to be used as a dedup-source (which we already got)
+      p_stats->skipped_shared_manifest++;
+      uint64_t dedupable_objects_bytes = __calc_deduped_bytes(p_rec->s.num_parts,
+                                                              ondisk_byte_size);
+      p_stats->shared_manifest_dedup_bytes += dedupable_objects_bytes;
+      ldpp_dout(dpp, 20) << __func__ << "::(1)skipped shared_manifest, SRC::block_id="
+                         << src_val.block_idx << "::rec_id=" << (int)src_val.rec_id << dendl;
+      return 0;
+    }
+
     if (attrs.find(RGW_ATTR_CRYPT_MODE) != attrs.end()) {
       p_stats->ingress_skip_encrypted++;
       p_stats->ingress_skip_encrypted_bytes += ondisk_byte_size;
@@ -1121,7 +1399,7 @@ namespace rgw::dedup {
       return 0;
     }
 
-    // TBD: We should be able to support RGW_ATTR_COMPRESSION when all copies are compressed
+    // TBD-Future: We should be able to support RGW_ATTR_COMPRESSION when all copies are compressed
     if (attrs.find(RGW_ATTR_COMPRESSION) != attrs.end()) {
       p_stats->ingress_skip_compressed++;
       p_stats->ingress_skip_compressed_bytes += ondisk_byte_size;
@@ -1154,6 +1432,16 @@ namespace rgw::dedup {
     else {
       storage_class = RGW_STORAGE_CLASS_STANDARD;
     }
+
+    // p_rec holds the storage_class value taken from the bucket-index
+    if (unlikely(storage_class != p_rec->stor_class)) {
+      ldpp_dout(dpp, 5) << __func__ << "::ERROR::ATTR storage_class="
+                        << storage_class << " != " << "::bucket-index storage_class="
+                        << p_rec->stor_class << dendl;
+      p_stats->different_storage_class++;
+      return -EINVAL;
+    }
+
     // no need to check for remap success as we compare keys bellow
     sc_idx = remapper->remap(storage_class, dpp, &p_stats->failed_map_overflow);
     key_t key_from_obj(parsed_etag.md5_high, parsed_etag.md5_low,
@@ -1169,7 +1457,7 @@ namespace rgw::dedup {
 
     // reset flags
     p_rec->s.flags.clear();
-    ret = add_obj_attrs_to_record(&b, p_rec, attrs, p_table, p_stats);
+    ret = add_obj_attrs_to_record(p_rec, attrs, p_stats);
     if (unlikely(ret != 0)) {
       ldpp_dout(dpp, 5) << __func__ << "::ERR: failed add_obj_attrs_to_record() ret="
                         << ret << "::" << cpp_strerror(-ret) << dendl;
@@ -1180,13 +1468,16 @@ namespace rgw::dedup {
     ret = p_disk->add_record(d_dedup_cluster_ioctx, p_rec, &rec_info);
     if (ret == 0) {
       // set the disk_block_id_t to this unless the existing disk_block_id is marked as shared-manifest
-      ceph_assert(rec_info.rec_id < MAX_REC_IN_BLOCK);
+      if (unlikely(rec_info.rec_id >= MAX_REC_IN_BLOCK)) {
+        p_stats->illegal_rec_id++;
+      }
       ldpp_dout(dpp, 20)  << __func__ << "::" << p_rec->bucket_name << "/"
                           << p_rec->obj_name << " was written to block_idx="
                           << rec_info.block_id << "::rec_id=" << (int)rec_info.rec_id
-                          << "::shared_manifest=" << p_rec->has_shared_manifest() << dendl;
+                          << "::shared_manifest="
+                          << p_rec->s.flags.has_shared_manifest() << dendl;
       p_table->update_entry(&key_from_bucket_index, rec_info.block_id,
-                            rec_info.rec_id, p_rec->has_shared_manifest());
+                            rec_info.rec_id, p_rec->s.flags.has_shared_manifest());
     }
     else {
       ldpp_dout(dpp, 5) << __func__ << "::ERR: Failed p_disk->add_record()"<< dendl;
@@ -1198,17 +1489,18 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  static int write_blake3_object_attribute(const DoutPrefixProvider* const dpp,
-                                           rgw::sal::Driver* driver,
-                                           rgw::sal::RadosStore *store,
-                                           const disk_record_t *p_rec)
+  static int write_hash_object_attribute(const DoutPrefixProvider* const dpp,
+                                         rgw::sal::Driver* driver,
+                                         rgw::sal::RadosStore *store,
+                                         const disk_record_t *p_rec,
+                                         md5_stats_t *p_stats)
   {
     bufferlist etag_bl;
     bufferlist hash_bl;
     librados::ObjectWriteOperation op;
     etag_to_bufferlist(p_rec->s.md5_high, p_rec->s.md5_low, p_rec->s.num_parts,
                        &etag_bl);
-    init_cmp_pairs(p_rec, etag_bl, hash_bl /*OUT PARAM*/, &op);
+    init_cmp_pairs(dpp, p_rec, etag_bl, hash_bl /*OUT PARAM*/, &op);
     op.setxattr(RGW_ATTR_BLAKE3, hash_bl);
 
     std::string oid;
@@ -1224,9 +1516,420 @@ namespace rgw::dedup {
       ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ioctx.operate("
                         << oid << "), err is " << cpp_strerror(-ret) << dendl;
     }
+    ldpp_dout(dpp, 20) << __func__ <<"::Write Strong Hash to " << p_rec->obj_name
+                       << dendl;
+    p_stats->set_hash_attrs++;
     return ret;
   }
 
+  //---------------------------------------------------------------------------
+  static bool compare_strong_hash(const DoutPrefixProvider *const dpp,
+                                  const disk_record_t *p_src_rec,
+                                  const disk_record_t *p_tgt_rec,
+                                  md5_stats_t *p_stats)
+  {
+    if (unlikely(0 != memcmp(p_src_rec->s.hash, p_tgt_rec->s.hash, sizeof(p_src_rec->s.hash)))) {
+      p_stats->hash_mismatch++;
+      ldpp_dout(dpp, 10) << __func__ << "::HASH mismatch" << dendl;
+      return false;
+    }
+    ldpp_dout(dpp, 20) << __func__ << "::SRC-TGT Strong-Hash match" << dendl;
+    // all is good
+    return true;
+  }
+
+  //---------------------------------------------------------------------------
+  static int read_hash_and_manifest(const DoutPrefixProvider *const dpp,
+                                    rgw::sal::Driver *driver,
+                                    RGWRados *rados,
+                                    disk_record_t *p_rec)
+  {
+    librados::IoCtx ioctx;
+    std::string oid;
+    int ret = get_ioctx(dpp, driver, rados, p_rec, &ioctx, &oid);
+    if (unlikely(ret != 0)) {
+      ldpp_dout(dpp, 5) << __func__ << "::ERR: failed get_ioctx()" << dendl;
+      return ret;
+    }
+
+    std::map<std::string, bufferlist> attrset;
+    ret = ioctx.getxattrs(oid, attrset);
+    if (unlikely(ret < 0)) {
+      ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ioctx.getxattrs("
+                        << oid << "), err is " << cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+
+    auto itr = attrset.find(RGW_ATTR_BLAKE3);
+    if (itr != attrset.end()) {
+      try {
+        auto bl_iter = itr->second.cbegin();
+        // BLAKE3 hash is 256 bits split into multiple 64-bit units
+        for (unsigned i = 0; i < HASH_UNITS; i++) {
+          uint64_t val;
+          ceph::decode(val, bl_iter);
+          p_rec->s.hash[i] = val;
+        }
+        p_rec->s.flags.set_has_valid_hash();
+        // the hash was taken directly from the object attributes and not calculated
+        p_rec->s.flags.clear_hash_calculated();
+      } catch (buffer::error& err) {
+        ldpp_dout(dpp, 1) << __func__ << "::ERR: failed HASH decode" << dendl;
+        return -EINVAL;
+      }
+    }
+    else {
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: No HASH attribute" << dendl;
+      return -ENOENT;
+    }
+
+    itr = attrset.find(RGW_ATTR_MANIFEST);
+    if (itr != attrset.end()) {
+      ldpp_dout(dpp, 20) << __func__ << "::Got Manifest " << p_rec->obj_name << dendl;
+      p_rec->manifest_bl = itr->second;
+      p_rec->s.manifest_len = p_rec->manifest_bl.length();
+    }
+    else {
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: No Manifest attribute" << dendl;
+      return -ENOENT;
+    }
+
+    return 0;
+  }
+
+  //---------------------------------------------------------------------------
+  static void set_explicit_manifest(RGWObjManifest *p_manifest,
+                                    std::map<uint64_t, RGWObjManifestPart> &objs_map)
+  {
+    uint64_t obj_size = p_manifest->get_obj_size();
+    p_manifest->set_head_size(0);
+    p_manifest->set_max_head_size(0);
+    p_manifest->set_prefix("");
+    p_manifest->clear_rules();
+    p_manifest->set_explicit(obj_size, objs_map);
+  }
+
+  //---------------------------------------------------------------------------
+  // This code is based on RGWObjManifest::convert_to_explicit()
+  static void build_explicit_objs_map(const DoutPrefixProvider *dpp,
+                                      RGWRados *rados,
+                                      const RGWObjManifest &manifest,
+                                      const rgw_bucket *p_bucket,
+                                      std::map<uint64_t, RGWObjManifestPart> *p_objs_map,
+                                      const std::string &tail_name,
+                                      md5_stats_t *p_stats)
+  {
+    bool manifest_raw_obj_logged = false;
+    unsigned idx = 0;
+    auto p = manifest.obj_begin(dpp);
+    while (p != manifest.obj_end(dpp)) {
+      const uint64_t offset = p.get_stripe_ofs();
+      const rgw_obj_select& os = p.get_location();
+      ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"]OBJ: "
+                         << os.get_raw_obj(rados).oid << "::ofs=" << p.get_ofs()
+                         << "::strp_offset=" << offset << dendl;
+
+      RGWObjManifestPart& part = (*p_objs_map)[offset];
+      part.loc_ofs = 0;
+
+      if (offset == 0) {
+        ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] HEAD OBJ: "
+                           << os.get_raw_obj(rados).oid << dendl;
+        const rgw_obj &head_obj = manifest.get_obj();
+        const rgw_obj_key &head_key = head_obj.key;
+        // TBD: Can we have different instance/ns values for head/tail ??
+        // Should we take the instance/ns from the head or tail?
+        // Maybe should refuse objects with different instance/ns on head/tail ?
+        rgw_obj_key tail_key(tail_name, head_key.instance, head_key.ns);
+        rgw_obj tail_obj(*p_bucket, tail_key);
+        part.loc = tail_obj;
+      }
+      else {
+        // RGWObjManifest::convert_to_explicit() is assuming raw_obj, but looking
+        // at the RGWObjManifest::obj_iterator code it is clear the obj is not raw.
+        // If it happens to be raw we still handle it correctly (and inc stat-count)
+        std::optional<rgw_obj> obj_opt = os.get_head_obj();
+        if (obj_opt.has_value()) {
+          part.loc = obj_opt.value();
+        }
+        else {
+          // report raw object in manifest only once
+          if (!manifest_raw_obj_logged) {
+            manifest_raw_obj_logged = true;
+            ldpp_dout(dpp, 10) << __func__ << "::WARN: obj is_raw" << dendl;
+            p_stats->manifest_raw_obj++;
+          }
+          const rgw_raw_obj& raw = os.get_raw_obj(rados);
+          RGWSI_Tier_RADOS::raw_obj_to_obj(*p_bucket, raw, &part.loc);
+        }
+      }
+
+      ++p;
+      uint64_t next_offset = p.get_stripe_ofs();
+      part.size = next_offset - offset;
+      idx++;
+    } // while (p != manifest.obj_end())
+  }
+
+  //---------------------------------------------------------------------------
+  int Background::split_head_object(disk_record_t *p_src_rec, // IN-OUT PARAM
+                                    RGWObjManifest &src_manifest, // IN/OUT PARAM
+                                    const disk_record_t *p_tgt_rec,
+                                    std::string &tail_oid, // OUT PARAM
+                                    md5_stats_t *p_stats)
+  {
+    ldpp_dout(dpp, 20) << __func__ << "::" << p_src_rec->obj_name << "::"
+                       << p_src_rec->s.obj_bytes_size << dendl;
+
+    uint64_t head_size = src_manifest.get_head_size();
+    bufferlist bl;
+    std::string head_oid;
+    librados::IoCtx ioctx;
+    int ret = get_ioctx(dpp, driver, rados, p_src_rec, &ioctx, &head_oid);
+    if (unlikely(ret != 0)) {
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed get_ioctx()" << dendl;
+      return ret;
+    }
+
+    // read the full rados head-object
+    ldpp_dout(dpp, 20) << __func__ << "::ioctx.read(" << head_oid << ")" << dendl;
+    ret = ioctx.read(head_oid, bl, 0, 0);
+    if (unlikely(ret != (int)head_size)) {
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to read " << head_oid
+                        << ", ret=" << ret << ", error is " << cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+
+    // we might have a valid hash left from a failed dedup (mismatch SRC/TGT)
+    if (!p_src_rec->s.flags.has_valid_hash()) {
+      ldpp_dout(dpp, 20) << __func__ << "::calc BLK3 for SRC "
+                         << p_src_rec->obj_name << dendl;
+      blake3_hasher hmac;
+      blake3_hasher_init(&hmac);
+      for (const auto& bptr : bl.buffers()) {
+        blake3_hasher_update(&hmac, (const unsigned char *)bptr.c_str(),
+                             bptr.length());
+      }
+      uint8_t *p_hash = (uint8_t*)p_src_rec->s.hash;
+      ret = calc_object_blake3(src_manifest, p_src_rec, p_hash, &hmac);
+      if (unlikely(ret != 0)) {
+        return ret;
+      }
+
+      // cancel split-head operation if strong hash differ
+      if (unlikely(!compare_strong_hash(dpp, p_src_rec, p_tgt_rec, p_stats))) {
+        return -ECANCELED;
+      }
+    }
+
+    bool exclusive = true; // block overwrite
+    std::string tail_name = generate_split_head_tail_name(src_manifest);
+    const rgw_bucket_placement &tail_placement = src_manifest.get_tail_placement();
+    // Tail placement_rule was fixed before being committed to SLAB; if it looks bad -> abort
+    if (unlikely(invalid_tail_placement(tail_placement))) {
+      p_stats->split_head_no_tail_placement++;
+      ldpp_dout(dpp, 1) << __func__ << "::invalid_tail_placement -> abort" << dendl;
+      return -EINVAL;
+    }
+
+    const rgw_bucket *p_bucket = &tail_placement.bucket;
+    // tail objects might be on another storage_class/pool, need another ioctx
+    librados::IoCtx tail_ioctx;
+    ret = get_ioctx_internal(dpp, driver, store, tail_name, p_src_rec->instance,
+                             *p_bucket, &tail_ioctx, &tail_oid);
+    if (unlikely(ret != 0)) {
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed get_ioctx_internal()" << dendl;
+      return ret;
+    }
+
+    ret = tail_ioctx.create(tail_oid, exclusive);
+    if (ret == 0) {
+      ldpp_dout(dpp, 20) << __func__ << "::successfully created: " << tail_oid << dendl;
+    }
+    else if (ret == -EEXIST) {
+      // should not happen as we take the prefix with unused counter 0
+      // better to skip this dedup opportunity
+      ldpp_dout(dpp, 1) << __func__ << "::ERR object " << tail_oid << " exists!" << dendl;
+      p_stats->failed_split_head_creat++;
+      return ret;
+    }
+    else{
+      ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to create " << tail_oid
+                        <<" with: "<< cpp_strerror(-ret) << ", ret=" << ret <<dendl;
+      return ret;
+    }
+
+    ret = tail_ioctx.write_full(tail_oid, bl);
+    if (unlikely(ret < 0)) {
+      ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to write " << tail_oid
+                        << " with: " << cpp_strerror(-ret) << dendl;
+      // don't leave orphan object behind
+      tail_ioctx.remove(tail_oid);
+      return ret;
+    }
+    else {
+      ldpp_dout(dpp, 20) << __func__ << "::wrote tail obj:" << tail_oid << "::ret="
+                         << ret << dendl;
+    }
+
+    std::map<uint64_t, RGWObjManifestPart> objs_map;
+    build_explicit_objs_map(dpp, rados, src_manifest, p_bucket, &objs_map,
+                            tail_name, p_stats);
+    set_explicit_manifest(&src_manifest, objs_map);
+
+    bufferlist manifest_bl;
+    encode(src_manifest, manifest_bl);
+    p_src_rec->manifest_bl = manifest_bl;
+    p_src_rec->s.manifest_len = p_src_rec->manifest_bl.length();
+    p_src_rec->s.flags.set_split_head();
+    return ret;
+  }
+
+  //---------------------------------------------------------------------------
+  // Ensure both SRC and TGT records carry a valid strong (BLAKE3) hash and
+  // that the hashes match before dedup of this pair proceeds.
+  // Side effects:
+  //  * may read the TGT object data to calculate its hash
+  //  * may re-read the SRC hash/manifest from the head-object attributes,
+  //    re-decoding @src_manifest from the stored copy
+  //  * may split the SRC head-object, emitting the new tail into @tail_oid
+  // Returns true when dedup of this SRC/TGT pair may continue, false to skip.
+  bool Background::check_and_set_strong_hash(disk_record_t *p_src_rec,
+                                             disk_record_t *p_tgt_rec,
+                                             RGWObjManifest &src_manifest,
+                                             const RGWObjManifest &tgt_manifest,
+                                             const dedup_table_t::value_t *p_src_val,
+                                             std::string &tail_oid, // OUT PARAM
+                                             md5_stats_t *p_stats)
+  {
+    int ret = 0;
+    // if we don't have a valid strong hash already -> read data and calculate it!
+    if (!p_tgt_rec->s.flags.has_valid_hash()) {
+      ldpp_dout(dpp, 20) << __func__ << "::CALC TGT Strong Hash::"
+                         << p_tgt_rec->obj_name << dendl;
+      ret = calc_object_blake3(tgt_manifest, p_tgt_rec, (uint8_t*)p_tgt_rec->s.hash);
+      if (unlikely(ret != 0)) {
+        // Don't run dedup without a valid strong hash
+        return false;
+      }
+    }
+
+    // SRC hash could have been calculated and stored in obj-attributes before
+    // (will happen when we got multiple targets)
+    if (!p_src_rec->s.flags.has_valid_hash() && p_src_val->has_valid_hash()) {
+      // read the manifest and strong hash from the head-object attributes
+      ldpp_dout(dpp, 20) << __func__ << "::Fetch SRC strong hash from head-object::"
+                         << p_src_rec->obj_name << dendl;
+      if (unlikely(read_hash_and_manifest(dpp, driver, rados, p_src_rec) != 0)) {
+        return false;
+      }
+      // the manifest stored with the attrs replaces the one we were handed
+      try {
+        auto bl_iter = p_src_rec->manifest_bl.cbegin();
+        decode(src_manifest, bl_iter);
+      } catch (buffer::error& err) {
+        ldpp_dout(dpp, 1)  << __func__ << "::ERR: failed manifest decode" << dendl;
+        return false;
+      }
+    }
+
+    // check hash before trying to split head (can be skipped if not equal)
+    if (p_src_rec->s.flags.has_valid_hash()) {
+      if (unlikely(!compare_strong_hash(dpp, p_src_rec, p_tgt_rec, p_stats))) {
+        return false;
+      }
+    }
+
+    // we might still need to split-head here when hash is valid
+    // can happen if we failed compare before (md5-collision) and stored the src hash
+    // in the obj-attributes
+    uint64_t head_size = src_manifest.get_head_size();
+    if (should_split_head(head_size, src_manifest.get_obj_size())) {
+      ret = split_head_object(p_src_rec, src_manifest, p_tgt_rec, tail_oid, p_stats);
+      // compare_strong_hash() is called internally by split_head_object()
+      return (ret == 0);
+    }
+    else if (!p_src_rec->s.flags.has_valid_hash()) {
+      // an object not targeted for split_head should have a valid hash -> skip it
+      ldpp_dout(dpp, 5)  << __func__
+                         << "::ERR: object not targeted for split_head has no hash" << dendl;
+      p_stats->invalid_hash_no_split_head++;
+      return false;
+    }
+
+    return true;
+  }
+
+  //---------------------------------------------------------------------------
+  static bool parse_manifests(const DoutPrefixProvider *dpp,
+                              const disk_record_t *p_src_rec,
+                              const disk_record_t *p_tgt_rec,
+                              RGWObjManifest      *p_src_manifest,
+                              RGWObjManifest      *p_tgt_manifest)
+  {
+    bool valid_src_manifest = false;
+    try {
+      auto bl_iter = p_src_rec->manifest_bl.cbegin();
+      decode(*p_src_manifest, bl_iter);
+      valid_src_manifest = true;
+      bl_iter = p_tgt_rec->manifest_bl.cbegin();
+      decode(*p_tgt_manifest, bl_iter);
+    } catch (buffer::error& err) {
+      ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad "
+                        << (valid_src_manifest? "TGT" : "SRC")
+                        << " manifest" << dendl;
+      return -EINVAL;
+    }
+
+    return 0;
+  }
+
+  //---------------------------------------------------------------------------
+  static bool has_shared_tail_objects(const DoutPrefixProvider *dpp,
+                                      RGWRados             *rados,
+                                      const disk_record_t  *p_src_rec,
+                                      const disk_record_t  *p_tgt_rec,
+                                      const RGWObjManifest &src_manifest,
+                                      const RGWObjManifest &tgt_manifest,
+                                      md5_stats_t          *p_stats)
+  {
+    // Build a vector with all tail-objects on the SRC and then iterate over
+    // the TGT tail-objects looking for a single tail-object in both manifets.
+    // If found -> abort the dedup
+    // The only case leading to this scenario is server-side-copy
+    // It is probably enough to scan the first few tail-objects, but better safe...
+    std::string src_oid = build_oid(p_src_rec->bucket_id, p_src_rec->obj_name);
+    std::string tgt_oid = build_oid(p_tgt_rec->bucket_id, p_tgt_rec->obj_name);
+    std::vector<std::string> vec;
+    unsigned idx = 0;
+    for (auto p = src_manifest.obj_begin(dpp); p != src_manifest.obj_end(dpp); ++p, ++idx) {
+      rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
+      if (src_oid != raw_obj.oid) {
+        ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"]" << raw_obj.oid << dendl;
+        vec.push_back(raw_obj.oid);
+      }
+      else {
+        ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: "
+                           << raw_obj.oid << dendl;
+        continue;
+      }
+    }
+    idx = 0;
+    for (auto p = tgt_manifest.obj_begin(dpp); p != tgt_manifest.obj_end(dpp); ++p, ++idx) {
+      rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
+      if (tgt_oid != raw_obj.oid) {
+        ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"]" << raw_obj.oid << dendl;
+        // Search for the tail_obj in the vector
+        // should be one of the first entries (first or second)
+        auto itr = std::find(vec.begin(), vec.end(), raw_obj.oid);
+        if (unlikely(itr != vec.end())) {
+          ldpp_dout(dpp, 10) << __func__ << "::tail obj " << raw_obj.oid
+                             << " exists on both SRC and TGT Objects -> Abort DEDUP!"<< dendl;
+          p_stats->skip_shared_tail_objs ++;
+          return true;
+        }
+      }
+      else {
+        ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: "
+                           << raw_obj.oid << dendl;
+        continue;
+      }
+    }
+
+    return false;
+  }
+
   //---------------------------------------------------------------------------
   // We purged all entries not marked for-dedup (i.e. singleton bit is set) from the table
   //   so all entries left are sources of dedup with multiple copies.
@@ -1238,35 +1941,39 @@ namespace rgw::dedup {
   // we can withstand most errors moving to the next object
  // only report an error if we received a stop scan request!
   //
-  int Background::try_deduping_record(dedup_table_t       *p_table,
-                                      const disk_record_t *p_tgt_rec,
-                                      disk_block_id_t      block_id,
-                                      record_id_t          rec_id,
-                                      md5_shard_t          md5_shard,
-                                      md5_stats_t         *p_stats, /* IN-OUT */
-                                      remapper_t          *remapper)
-  {
-    bool should_print_debug = cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>();
+  int Background::try_deduping_record(dedup_table_t   *p_table,
+                                      disk_record_t   *p_tgt_rec,
+                                      disk_block_id_t  block_id,
+                                      record_id_t      rec_id,
+                                      md5_shard_t      md5_shard,
+                                      md5_stats_t     *p_stats, /* IN-OUT */
+                                      remapper_t      *remapper)
+  {
+    bool should_print_debug = cct->_conf->subsys.should_gather<ceph_subsys_rgw_dedup, 20>();
     if (unlikely(should_print_debug)) {
       print_record(dpp, p_tgt_rec, block_id, rec_id, md5_shard);
     }
-
     uint32_t size_4k_units = byte_size_to_disk_blocks(p_tgt_rec->s.obj_bytes_size);
     storage_class_idx_t sc_idx = remapper->remap(p_tgt_rec->stor_class, dpp,
                                                  &p_stats->failed_map_overflow);
-    ceph_assert(sc_idx != remapper_t::NULL_IDX);
+    if (unlikely(sc_idx == remapper_t::NULL_IDX)) {
+      ldpp_dout(dpp, 5) << __func__ << "::invalid_storage_class_mapping for "
+                        << p_tgt_rec->stor_class << "::" << p_tgt_rec->obj_name << dendl;
+      p_stats->invalid_storage_class_mapping++;
+      return 0;
+    }
     key_t key(p_tgt_rec->s.md5_high, p_tgt_rec->s.md5_low, size_4k_units,
               p_tgt_rec->s.num_parts, sc_idx);
     dedup_table_t::value_t src_val;
     int ret = p_table->get_val(&key, &src_val);
-    if (ret != 0) {
+    if (unlikely(ret != 0)) {
       // record has no valid entry in table because it is a singleton
       // should never happened since we purged all singletons before
       ldpp_dout(dpp, 5) << __func__ << "::skipped singleton::" << p_tgt_rec->bucket_name
                         << "/" << p_tgt_rec->obj_name << "::num_parts=" << p_tgt_rec->s.num_parts
                         << "::ETAG=" << std::hex << p_tgt_rec->s.md5_high
                         << p_tgt_rec->s.md5_low << std::dec << dendl;
-      ceph_abort("Unexpcted singleton");
+      p_stats->singleton_after_purge++;
       return 0;
     }
 
@@ -1275,86 +1982,115 @@ namespace rgw::dedup {
     if (block_id == src_block_id && rec_id == src_rec_id) {
       // the table entry point to this record which means it is a dedup source so nothing to do
       p_stats->skipped_source_record++;
-      ldpp_dout(dpp, 20) << __func__ << "::skipped source-record" << dendl;
+      ldpp_dout(dpp, 20) << __func__ << "::(2)skipped source-record, block_id="
+                         << block_id << "::rec_id=" << (int)rec_id << dendl;
       return 0;
     }
 
-    // ceph store full blocks so need to round up and multiply by block_size
-    uint64_t ondisk_byte_size = disk_blocks_to_byte_size(size_4k_units);
-    uint64_t dedupable_objects_bytes = calc_deduped_bytes(d_head_object_size,
-                                                          p_tgt_rec->s.num_parts,
-                                                          ondisk_byte_size);
+    // should never happen
     if (p_tgt_rec->s.flags.has_shared_manifest()) {
       // record holds a shared_manifest object so can't be a dedup target
-      p_stats->skipped_shared_manifest++;
-      p_stats->shared_manifest_dedup_bytes += dedupable_objects_bytes;
-      ldpp_dout(dpp, 20) << __func__ << "::skipped shared_manifest" << dendl;
+      ldpp_dout(dpp, 1) << __func__ << "::(3)skipped shared_manifest, block_id="
+                        << block_id << "::rec_id=" << (int)rec_id << dendl;
+      p_stats->shared_manifest_after_purge++;
       return 0;
     }
 
+    // ceph store full blocks so need to round up and multiply by block_size
+    uint64_t ondisk_byte_size = disk_blocks_to_byte_size(size_4k_units);
+    uint64_t dedupable_objects_bytes = __calc_deduped_bytes(p_tgt_rec->s.num_parts,
+                                                            ondisk_byte_size);
+
     // This records is a dedup target with source record on source_block_id
-    disk_record_t src_rec;
-    ret = load_record(d_dedup_cluster_ioctx, p_tgt_rec, &src_rec, src_block_id,
+    disk_record_t src_rec, *p_src_rec = &src_rec;
+    ret = load_record(d_dedup_cluster_ioctx, p_tgt_rec, p_src_rec, src_block_id,
                       src_rec_id, md5_shard, dpp);
     if (unlikely(ret != 0)) {
       p_stats->failed_src_load++;
       // we can withstand most errors moving to the next object
       ldpp_dout(dpp, 5) << __func__ << "::ERR: Failed load_record("
-                        << src_block_id << ", " << src_rec_id << ")" << dendl;
+                        << src_block_id << ", " << (int)src_rec_id << ")" << dendl;
       return 0;
     }
 
-    ldpp_dout(dpp, 20) << __func__ << "::SRC=" << src_rec.bucket_name
-                       << "/" << src_rec.obj_name << dendl;
+    ldpp_dout(dpp, 20) << __func__ << "::SRC:" << p_src_rec->bucket_name << "/"
+                       << p_src_rec->obj_name << "::TGT:" << p_tgt_rec->bucket_name
+                       << "/" << p_tgt_rec->obj_name << dendl;
     // verify that SRC and TGT records don't refer to the same physical object
     // This could happen in theory if we read the same objects twice
-    if (src_rec.ref_tag == p_tgt_rec->ref_tag) {
+    if (p_src_rec->ref_tag == p_tgt_rec->ref_tag) {
       p_stats->duplicate_records++;
       ldpp_dout(dpp, 10) << __func__ << "::WARN::REF_TAG::Duplicate records for "
-                         << src_rec.obj_name << "::" << src_rec.ref_tag << "::"
+                         << p_src_rec->obj_name << "::" << p_src_rec->ref_tag <<"::"
                          << p_tgt_rec->obj_name << dendl;
       return 0;
     }
 
     // the hash table size is rounded to the nearest 4KB and will wrap after 16G
-    if (unlikely(src_rec.s.obj_bytes_size != p_tgt_rec->s.obj_bytes_size)) {
+    if (unlikely(p_src_rec->s.obj_bytes_size != p_tgt_rec->s.obj_bytes_size)) {
       p_stats->size_mismatch++;
       ldpp_dout(dpp, 10) << __func__ << "::WARN: different byte size for objects::"
-                         << src_rec.obj_name << "::" << src_rec.s.obj_bytes_size
+                         << p_src_rec->obj_name << "::" << p_src_rec->s.obj_bytes_size
                          << "::" << p_tgt_rec->obj_name << "::"
                          << p_tgt_rec->s.obj_bytes_size << dendl;
       return 0;
     }
 
-    if (memcmp(src_rec.s.hash, p_tgt_rec->s.hash, sizeof(src_rec.s.hash)) != 0) {
-      p_stats->hash_mismatch++;
-      ldpp_dout(dpp, 10) << __func__ << "::HASH mismatch" << dendl;
-      // TBD: set hash attributes on head objects to save calc next time
-      if (src_rec.s.flags.hash_calculated()) {
-        write_blake3_object_attribute(dpp, driver, store, &src_rec);
-        p_stats->set_hash_attrs++;
+    ret = parse_manifests(dpp, p_src_rec, p_tgt_rec, &src_manifest, &tgt_manifest);
+    if (unlikely(ret != 0)) {
+      return 0;
+    }
+
+    // make sure objects were not created by server-side-copy
+    if (unlikely(has_shared_tail_objects(dpp, rados, p_src_rec, p_tgt_rec, src_manifest, tgt_manifest, p_stats))) {
+      return 0;
+    }
+
+
+    std::string tail_oid;
+    bool success = check_and_set_strong_hash(p_src_rec, p_tgt_rec, src_manifest,
+                                             tgt_manifest, &src_val, tail_oid, p_stats);
+    if (unlikely(!success)) {
+      if (p_src_rec->s.flags.hash_calculated() && !src_val.has_valid_hash()) {
+        // set hash attributes on head objects to save calc next time
+        ldpp_dout(dpp, 20) << __func__ <<"::failed: store valid SRC hash" << dendl;
+        ret = write_hash_object_attribute(dpp, driver, store, p_src_rec, p_stats);
+        if (ret == 0) {
+          ldpp_dout(dpp, 20) << __func__ <<"::mark valid_hash in table" << dendl;
+          p_table->set_src_mode(&key, src_block_id, src_rec_id, false, true);
+        }
       }
       if (p_tgt_rec->s.flags.hash_calculated()) {
-        write_blake3_object_attribute(dpp, driver, store, p_tgt_rec);
-        p_stats->set_hash_attrs++;
+        ldpp_dout(dpp, 20) << __func__ <<"::failed: store valid TGT hash" << dendl;
+        write_hash_object_attribute(dpp, driver, store, p_tgt_rec, p_stats);
       }
       return 0;
     }
 
-    ret = dedup_object(&src_rec, p_tgt_rec, p_stats, src_val.has_shared_manifest());
+    ret = dedup_object(p_src_rec, p_tgt_rec, src_manifest, tgt_manifest, p_stats,
+                       &src_val, tail_oid);
     if (ret == 0) {
+      ldpp_dout(dpp, 20) << __func__ << "::dedup success " << p_src_rec->obj_name << dendl;
       p_stats->deduped_objects++;
       p_stats->deduped_objects_bytes += dedupable_objects_bytes;
-      if (p_tgt_rec->s.num_parts == 0) {
+      if (p_tgt_rec->s.flags.is_split_head()) {
+        ldpp_dout(dpp, 20) << __func__ <<"::TGT-Split: dedup_bytes="
+                           << ondisk_byte_size << dendl;
+        p_stats->split_head_dedup_bytes += ondisk_byte_size;
+      }
+      else if (p_tgt_rec->s.num_parts == 0 &&
+               // if we don't split head it will be duplicated
+               p_tgt_rec->s.obj_bytes_size > d_head_object_size) {
         // single part objects duplicate the head object when dedup is used
         p_stats->dup_head_bytes += d_head_object_size;
       }
 
       // mark the SRC object as a providor of a shared manifest
       if (!src_val.has_shared_manifest()) {
+        ldpp_dout(dpp, 20) << __func__ << "::mark shared_manifest+valid_hash"<< dendl;
         p_stats->set_shared_manifest_src++;
-        // set the shared manifest flag in the dedup table
-        p_table->set_shared_manifest_src_mode(&key, src_block_id, src_rec_id);
+        // We always set strong hash on SRC during dedup so mark in table!
+        p_table->set_src_mode(&key, src_block_id, src_rec_id, true, true);
       }
       else {
         ldpp_dout(dpp, 20) << __func__ << "::SRC object already marked as shared_manifest" << dendl;
@@ -1362,7 +2098,7 @@ namespace rgw::dedup {
     }
     else {
       ldpp_dout(dpp, 10) << __func__ << "::ERR: Failed dedup for "
-                         << src_rec.bucket_name << "/" << src_rec.obj_name << dendl;
+                         << p_src_rec->bucket_name << "/" << p_src_rec->obj_name << dendl;
       p_stats->failed_dedup++;
     }
 
@@ -1498,10 +2234,12 @@ namespace rgw::dedup {
         }
 
         has_more = (p_header->offset == BLOCK_MAGIC);
-        ceph_assert(p_header->offset == BLOCK_MAGIC || p_header->offset == LAST_BLOCK_MAGIC);
         if (!has_more) {
           ldpp_dout(dpp, 20) << __func__ << "::No more blocks! block_id=" << disk_block_id
                              << ", rec_count=" << p_header->rec_count << dendl;
+          if (unlikely(p_header->offset != LAST_BLOCK_MAGIC)) {
+            p_stats->missing_last_block_marker++;
+          }
           break;
         }
       }
@@ -1549,7 +2287,7 @@ namespace rgw::dedup {
     p_worker_stats->ingress_obj_bytes += ondisk_byte_size;
 
     // We limit dedup to objects from the same storage_class
-    // TBD:
+    // TBD-Future:
     // Should we use a skip-list of storage_classes we should skip (like glacier) ?
     const std::string& storage_class =
       rgw_placement_rule::get_canonical_storage_class(entry.meta.storage_class);
@@ -1564,7 +2302,7 @@ namespace rgw::dedup {
       p_worker_stats->non_default_storage_class_objs_bytes += ondisk_byte_size;
     }
 
-    if (ondisk_byte_size <= d_min_obj_size_for_dedup) {
+    if (ondisk_byte_size < d_min_obj_size_for_dedup) {
       if (parsed_etag.num_parts == 0) {
         // dedup only useful for objects bigger than 4MB
         p_worker_stats->ingress_skip_too_small++;
@@ -1802,7 +2540,7 @@ namespace rgw::dedup {
     // make sure that the standard storage_class is always in the mapper!
     storage_class_idx_t sc_idx = remapper.remap(RGW_STORAGE_CLASS_STANDARD, dpp,
                                                 &p_stats->failed_map_overflow);
-    ceph_assert(sc_idx == 0);
+    ceph_assert(sc_idx != remapper_t::NULL_IDX);
     uint32_t slab_count_arr[num_work_shards];
     // first load all etags to hashtable to find dedups
     // the entries come from bucket-index and got minimal info (etag, size)
@@ -2095,7 +2833,8 @@ namespace rgw::dedup {
     utime_t start_time = ceph_clock_now();
     md5_stats_t md5_stats;
     //DEDUP_DYN_ALLOC
-    dedup_table_t table(dpp, d_head_object_size, raw_mem, raw_mem_size);
+    dedup_table_t table(dpp, d_head_object_size, d_min_obj_size_for_dedup,
+                        d_max_obj_size_for_split, raw_mem, raw_mem_size);
     int ret = objects_dedup_single_md5_shard(&table, md5_shard, &md5_stats, num_work_shards);
     if (ret == 0) {
       md5_stats.duration = ceph_clock_now() - start_time;
@@ -2290,6 +3029,7 @@ namespace rgw::dedup {
 
     ldpp_dout(dpp, 10) <<__func__ << "::" << *p_epoch << dendl;
     d_ctl.dedup_type = p_epoch->dedup_type;
+    // TBD: replace with a stat-counter
 #ifdef FULL_DEDUP_SUPPORT
     ceph_assert(d_ctl.dedup_type == dedup_req_type_t::DEDUP_TYPE_EXEC ||
                 d_ctl.dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE);
@@ -2755,7 +3495,6 @@ namespace rgw::dedup {
       }
       d_cond.wait(cond_lock, [this]{return d_ctl.remote_restart_req || d_ctl.should_stop() || d_ctl.should_pause();});
       if (!d_ctl.should_stop() && !d_ctl.should_pause()) {
-        // TBD: should we release lock here ???
         if (d_cluster.can_start_new_scan(store)) {
           d_ctl.dedup_exec = true;
           d_ctl.remote_aborted = false;
index b1df56249e8bfc04760b333483e92f4003c0caee..adca55efebc5c747b2811564e87caf2ac81b1379 100644 (file)
@@ -97,6 +97,8 @@ namespace rgw::dedup {
       STEP_REMOVE_DUPLICATES
     };
 
+    inline uint64_t __calc_deduped_bytes(uint16_t num_parts, uint64_t size_bytes);
+    inline bool should_split_head(uint64_t head_size, uint64_t obj_size);
     void run();
     int  setup(struct dedup_epoch_t*);
     void work_shards_barrier(work_shard_t num_work_shards);
@@ -182,11 +184,18 @@ namespace rgw::dedup {
                            remapper_t *remapper);
 
 #ifdef FULL_DEDUP_SUPPORT
-    int calc_object_blake3(const disk_record_t *p_rec, uint8_t *p_hash);
-    int add_obj_attrs_to_record(rgw_bucket            *p_rb,
-                                disk_record_t         *p_rec,
+    int calc_object_blake3(const RGWObjManifest &manifest,
+                           disk_record_t *p_rec,
+                           uint8_t *p_hash,
+                           blake3_hasher *p_pre_calc_hmac = nullptr);
+    int split_head_object(disk_record_t *p_src_rec,     // IN/OUT PARAM
+                          RGWObjManifest &src_manifest, // IN/OUT PARAM
+                          const disk_record_t *p_tgt_rec,
+                          std::string &tail_oid,        // OUT PARAM
+                          md5_stats_t *p_stats);
+
+    int add_obj_attrs_to_record(disk_record_t         *p_rec,
                                 const rgw::sal::Attrs &attrs,
-                                dedup_table_t         *p_table,
                                 md5_stats_t           *p_stats); /* IN-OUT */
 
     int read_object_attribute(dedup_table_t    *p_table,
@@ -197,26 +206,36 @@ namespace rgw::dedup {
                               md5_stats_t      *p_stats /* IN-OUT */,
                               disk_block_seq_t *p_disk,
                               remapper_t       *remapper);
-    int try_deduping_record(dedup_table_t       *p_table,
-                            const disk_record_t *p_rec,
-                            disk_block_id_t      block_id,
-                            record_id_t          rec_id,
-                            md5_shard_t          md5_shard,
-                            md5_stats_t         *p_stats, /* IN-OUT */
-                            remapper_t          *remapper);
-    int inc_ref_count_by_manifest(const std::string &ref_tag,
-                                  const std::string &oid,
-                                  RGWObjManifest    &manifest);
-    int rollback_ref_by_manifest(const std::string &ref_tag,
-                                 const std::string &oid,
-                                 RGWObjManifest    &tgt_manifest);
-    int free_tail_objs_by_manifest(const std::string &ref_tag,
-                                   const std::string &oid,
-                                   RGWObjManifest    &tgt_manifest);
-    int dedup_object(const disk_record_t *p_src_rec,
-                     const disk_record_t *p_tgt_rec,
-                     md5_stats_t         *p_stats,
-                     bool                 is_shared_manifest_src);
+    bool check_and_set_strong_hash(disk_record_t *p_src_rec, // IN/OUT PARAM
+                                   disk_record_t *p_tgt_rec, // IN/OUT PARAM
+                                   RGWObjManifest &src_manifest,
+                                   const RGWObjManifest &tgt_manifest,
+                                   const dedup_table_t::value_t *p_src_val,
+                                   std::string &tail_oid,    // OUT PARAM
+                                   md5_stats_t *p_stats);
+    int try_deduping_record(dedup_table_t   *p_table,
+                            disk_record_t   *p_rec,
+                            disk_block_id_t  block_id,
+                            record_id_t      rec_id,
+                            md5_shard_t      md5_shard,
+                            md5_stats_t     *p_stats, /* IN-OUT */
+                            remapper_t      *remapper);
+    int inc_ref_count_by_manifest(const std::string    &ref_tag,
+                                  const std::string    &oid,
+                                  const RGWObjManifest &manifest);
+    int rollback_ref_by_manifest(const std::string    &ref_tag,
+                                 const std::string    &oid,
+                                 const RGWObjManifest &tgt_manifest);
+    int free_tail_objs_by_manifest(const std::string    &ref_tag,
+                                   const std::string    &oid,
+                                   const RGWObjManifest &tgt_manifest);
+    int dedup_object(disk_record_t                *p_src_rec,
+                     disk_record_t                *p_tgt_rec,
+                     const RGWObjManifest         &src_manifest,
+                     const RGWObjManifest         &tgt_manifest,
+                     md5_stats_t                  *p_stats,
+                     const dedup_table_t::value_t *p_src_val,
+                     const std::string            &tail_oid);
 #endif
     int  remove_slabs(unsigned worker_id, unsigned md5_shard, uint32_t slab_count);
     int  init_rados_access_handles(bool init_pool);
@@ -235,8 +254,9 @@ namespace rgw::dedup {
     unsigned d_heart_beat_max_elapsed_sec;
     uint64_t d_all_buckets_obj_count   = 0;
     uint64_t d_all_buckets_obj_size    = 0;
-    // we don't benefit from deduping RGW objects smaller than head-object size
-    uint32_t d_min_obj_size_for_dedup = (4ULL * 1024 * 1024);
+
+    uint32_t d_min_obj_size_for_dedup = (64ULL * 1024);
+    uint32_t d_max_obj_size_for_split = (16ULL * 1024 * 1024);
     uint32_t d_head_object_size       = (4ULL * 1024 * 1024);
     control_t d_ctl;
     uint64_t d_watch_handle = 0;
index ebbbec741803ae9cdf5228ad922fc458081ddd71..fafd66176eff99b17562c6ceb107b0563597aec1 100644 (file)
@@ -124,7 +124,7 @@ namespace rgw::dedup {
     ldpp_dout(dpp, 10) << __func__ << "::oid=" << oid << dendl;
     bool exclusive = true; // block overwrite of old objects
     ret = ctl_ioctx.create(oid, exclusive);
-    if (ret >= 0) {
+    if (ret == 0) {
       ldpp_dout(dpp, 10) << __func__ << "::successfully created Epoch object!" << dendl;
       // now try and take ownership
     }
@@ -495,7 +495,7 @@ namespace rgw::dedup {
       ldpp_dout(dpp, 15) << __func__ << "::creating object: " << oid << dendl;
       bool exclusive = true;
       ret = ctl_ioctx.create(oid, exclusive);
-      if (ret >= 0) {
+      if (ret == 0) {
         ldpp_dout(dpp, 15) << __func__ << "::oid=" << oid << " was created!" << dendl;
       }
       else if (ret == -EEXIST) {
@@ -1124,7 +1124,7 @@ namespace rgw::dedup {
     // create the object to watch (object may already exist)
     bool exclusive = true;
     ret = ctl_ioctx.create(oid, exclusive);
-    if (ret >= 0) {
+    if (ret == 0) {
       ldpp_dout(dpp, 10) << "dedup_bg::watch_reload():" << oid
                          << " was created!" << dendl;
     }
index d2b62651c6c9c5a2f73b4e8853e533edfeef5aa3..83fdfe1993172c9984fbc31a631c42102f08b51d 100644 (file)
@@ -123,9 +123,7 @@ namespace rgw::dedup {
     else {
       this->s.shared_manifest = CEPHTOH_64(p_rec->s.shared_manifest);
       // BLAKE3 hash has 256 bit splitted into multiple 64bit units
-      const unsigned units = (256 / (sizeof(uint64_t)*8));
-      static_assert(units == 4);
-      for (unsigned i = 0; i < units; i++) {
+      for (unsigned i = 0; i < HASH_UNITS; i++) {
         this->s.hash[i] = CEPHTOH_64(p_rec->s.hash[i]);
       }
       this->ref_tag = std::string(p, this->s.ref_tag_len);
@@ -189,9 +187,7 @@ namespace rgw::dedup {
     else {
       p_rec->s.shared_manifest = HTOCEPH_64(this->s.shared_manifest);
       // BLAKE3 hash has 256 bit splitted into multiple 64bit units
-      const unsigned units = (256 / (sizeof(uint64_t)*8));
-      static_assert(units == 4);
-      for (unsigned i = 0; i < units; i++) {
+      for (unsigned i = 0; i < HASH_UNITS; i++) {
         p_rec->s.hash[i] = HTOCEPH_64(this->s.hash[i]);
       }
       len = this->ref_tag.length();
@@ -228,7 +224,7 @@ namespace rgw::dedup {
   {
     // optimistic approach
     if (likely((this->s.rec_version == 0) && (this->length() <= MAX_REC_SIZE))) {
-      ldpp_dout(dpp, 20) << __func__ << "::success" << dendl;
+      ldpp_dout(dpp, 20) << caller << "::validate disk_record success" << dendl;
       return 0;
     }
 
@@ -270,14 +266,12 @@ namespace rgw::dedup {
     stream << "MD5       = " << std::hex << rec.s.md5_high << rec.s.md5_low << "\n";
     stream << "HASH      = ";
     // BLAKE3 hash has 256 bit splitted into multiple 64bit units
-    const unsigned units = (256 / (sizeof(uint64_t)*8));
-    static_assert(units == 4);
-    for (unsigned i = 0; i < units; i++) {
+    for (unsigned i = 0; i < HASH_UNITS; i++) {
       stream << rec.s.hash[i];
     }
     stream << "\n";
 
-    if (rec.has_shared_manifest()) {
+    if (rec.s.flags.has_shared_manifest()) {
       stream << "Shared Manifest Object\n";
     }
     else {
@@ -603,19 +597,12 @@ namespace rgw::dedup {
     ceph_assert(bl.length());
 
     int ret = ioctx.write_full(oid, bl);
-    if (ret == (int)bl.length()) {
-      ldpp_dout(dpp, 20) << __func__ << "::wrote " << bl.length() << " bytes to "
-                         << oid << dendl;
+    if (ret == 0) {
+      ldpp_dout(dpp, 20) << __func__ << "::SLAB was written successfully" << dendl;
     }
     else {
-      if (ret == 0) {
-        // no error reported, but we wrote nothing which should never happen
-        ldpp_dout(dpp, 5) << __func__ << "::ERR: No Data was written to " << oid
-                          << ", bl.length()=" << bl.length() << dendl;
-        ret = -ENODATA;
-      }
       ldpp_dout(dpp, 1) << "ERROR: failed to write " << oid
-                        << " with: " << cpp_strerror(-ret) << dendl;
+                        << "::ret=" << ret << "::" << cpp_strerror(-ret) << dendl;
     }
 
     return ret;
index 7bca5d4e70ec0a3d4b655108e5dd8637f26affde..010e54cd45451107edbc3871ee37259999865802 100644 (file)
@@ -38,7 +38,8 @@ namespace rgw::dedup {
 #define HTOCEPH_32 htole32
 #define HTOCEPH_64 htole64
 
-  static inline constexpr unsigned DISK_BLOCK_SIZE  = 8*1024;
+  static constexpr unsigned HASH_UNITS = BLAKE3_OUT_LEN/sizeof(uint64_t);
+  static constexpr unsigned DISK_BLOCK_SIZE  = 8*1024;
   // we use 16 bit offset
   static_assert(DISK_BLOCK_SIZE < 64*1024);
   static constexpr unsigned DISK_BLOCK_COUNT = 256;
@@ -132,6 +133,35 @@ namespace rgw::dedup {
     uint32_t block_id;
   };
 
+  struct __attribute__ ((packed)) record_flags_t {
+  private:
+    static constexpr uint8_t RGW_RECORD_FLAG_HAS_VALID_HASH  = 0x01;
+    static constexpr uint8_t RGW_RECORD_FLAG_SHARED_MANIFEST = 0x02;
+    static constexpr uint8_t RGW_RECORD_FLAG_HASH_CALCULATED = 0x04;
+    static constexpr uint8_t RGW_RECORD_FLAG_FASTLANE        = 0x08;
+    static constexpr uint8_t RGW_RECORD_FLAG_SPLIT_HEAD      = 0x10;
+    static constexpr uint8_t RGW_RECORD_FLAG_TAIL_REFTAG     = 0x20;
+  public:
+    record_flags_t() : flags(0) {}
+    record_flags_t(uint8_t _flags) : flags(_flags) {}
+    inline void clear() { this->flags = 0; }
+    inline bool hash_calculated() const { return ((flags & RGW_RECORD_FLAG_HASH_CALCULATED) != 0); }
+    inline void set_hash_calculated()  { flags |= RGW_RECORD_FLAG_HASH_CALCULATED; }
+    inline void clear_hash_calculated()  { flags &= ~RGW_RECORD_FLAG_HASH_CALCULATED; }
+    inline bool has_valid_hash() const { return ((flags & RGW_RECORD_FLAG_HAS_VALID_HASH) != 0); }
+    inline void set_has_valid_hash()  { flags |= RGW_RECORD_FLAG_HAS_VALID_HASH; }
+    inline bool has_shared_manifest() const { return ((flags & RGW_RECORD_FLAG_SHARED_MANIFEST) != 0); }
+    inline void set_shared_manifest() { flags |= RGW_RECORD_FLAG_SHARED_MANIFEST; }
+    inline bool is_fastlane()  const { return ((flags & RGW_RECORD_FLAG_FASTLANE) != 0); }
+    inline void set_fastlane()  { flags |= RGW_RECORD_FLAG_FASTLANE; }
+    inline bool is_split_head() const { return ((flags & RGW_RECORD_FLAG_SPLIT_HEAD) != 0); }
+    inline void set_split_head() { flags |= RGW_RECORD_FLAG_SPLIT_HEAD; }
+    inline bool is_ref_tag_from_tail() const { return ((flags & RGW_RECORD_FLAG_TAIL_REFTAG) != 0); }
+    inline void set_ref_tag_from_tail() { flags |= RGW_RECORD_FLAG_TAIL_REFTAG; }
+  private:
+    uint8_t flags;
+  };
+
   struct disk_record_t
   {
     disk_record_t(const char *buff);
@@ -148,32 +178,29 @@ namespace rgw::dedup {
                  const DoutPrefixProvider* dpp,
                  disk_block_id_t block_id,
                  record_id_t rec_id) const;
-    inline bool has_shared_manifest() const { return s.flags.has_shared_manifest(); }
-    inline void set_shared_manifest() { s.flags.set_shared_manifest(); }
-
-    struct __attribute__ ((packed)) packed_rec_t
+    inline bool multipart_object() { return (this->s.num_parts > 0); }
+    struct packed_rec_t
     {
-      uint8_t       rec_version;     // allows changing record format
-      dedup_flags_t flags;           // 1 Byte flags
-      uint16_t      num_parts;       // For multipart upload (AWS MAX-PART is 10,000)
-      uint16_t      obj_name_len;
-      uint16_t      bucket_name_len;
-
+      uint64_t      hash[4];         // 4 * 8 Bytes of HASH
+      uint64_t      shared_manifest; // 64bit hash of the SRC object manifest
       uint64_t      md5_high;        // High Bytes of the Object Data MD5
       uint64_t      md5_low;         // Low  Bytes of the Object Data MD5
       uint64_t      obj_bytes_size;
 
+      uint16_t      num_parts;       // For multipart upload (AWS MAX-PART is 10,000)
+      uint16_t      obj_name_len;
+      uint16_t      bucket_name_len;
       uint16_t      bucket_id_len;
+
       uint16_t      tenant_name_len;
       uint16_t      instance_len;
       uint16_t      stor_class_len;
       uint16_t      ref_tag_len;
-
       uint16_t      manifest_len;
-      uint8_t       pad[6];
 
-      uint64_t      shared_manifest; // 64bit hash of the SRC object manifest
-      uint64_t      hash[4];       // 4 * 8 Bytes of BLAKE3
+      uint8_t       rec_version;     // allows changing record format
+      record_flags_t flags;           // 1 Byte flags
+      uint8_t       pad[6];
     }s;
     std::string obj_name;
     // TBD: find pool name making it easier to get ioctx
@@ -186,6 +213,7 @@ namespace rgw::dedup {
     bufferlist  manifest_bl;
   };
   static_assert(BLAKE3_OUT_LEN == sizeof(disk_record_t::packed_rec_t::hash));
+  static_assert(sizeof(disk_record_t::packed_rec_t) == sizeof(uint64_t)*12);
   std::ostream &operator<<(std::ostream &stream, const disk_record_t & rec);
 
   static constexpr unsigned BLOCK_MAGIC = 0xFACE;
index 4f34b27d18edaad34f8b24f682e8cedd7b533850..d86896473a1819ebf8093542a94dc67dbf48c823 100644 (file)
@@ -22,11 +22,15 @@ namespace rgw::dedup {
   //---------------------------------------------------------------------------
   dedup_table_t::dedup_table_t(const DoutPrefixProvider* _dpp,
                                uint32_t _head_object_size,
+                               uint32_t _min_obj_size_for_dedup,
+                               uint32_t _max_obj_size_for_split,
                                uint8_t *p_slab,
                                uint64_t slab_size)
   {
     dpp = _dpp;
     head_object_size = _head_object_size;
+    min_obj_size_for_dedup = _min_obj_size_for_dedup;
+    max_obj_size_for_split = _max_obj_size_for_split;
     memset(p_slab, 0, slab_size);
     hash_tab = (table_entry_t*)p_slab;
     entries_count = slab_size/sizeof(table_entry_t);
@@ -51,7 +55,7 @@ namespace rgw::dedup {
       const key_t &key = hash_tab[tab_idx].key;
       // This is an approximation only since size is stored in 4KB resolution
       uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units);
-      if (!key.multipart_object() && (byte_size_approx <= head_object_size)) {
+      if (!dedupable_object(key.multipart_object(), min_obj_size_for_dedup, byte_size_approx)) {
         hash_tab[tab_idx].val.clear_flags();
         redistributed_clear++;
         continue;
@@ -126,12 +130,16 @@ namespace rgw::dedup {
     }
     else {
       uint64_t dup_bytes_approx = calc_deduped_bytes(head_object_size,
+                                                     min_obj_size_for_dedup,
+                                                     max_obj_size_for_split,
                                                      p_key->num_parts,
                                                      byte_size_approx);
       p_big_objs->duplicate_count ++;
       p_big_objs->dedup_bytes_estimate += dup_bytes_approx;
 
-      if (!p_key->multipart_object()) {
+      // objects smaller than max_obj_size_for_split have their head split
+      // instead of duplicating it
+      if (!key.multipart_object() && byte_size_approx > max_obj_size_for_split) {
         // single part objects duplicate the head object when dedup is used
         *p_duplicate_head_bytes += head_object_size;
       }
@@ -206,23 +214,31 @@ namespace rgw::dedup {
       // replace value!
       value_t new_val(block_id, rec_id, shared_manifest);
       new_val.count = val.count;
-      hash_tab[idx].val = new_val;
       ldpp_dout(dpp, 20) << __func__ << "::Replaced table entry::["
                          << val.block_idx << "/" << (int)val.rec_id << "] -> ["
                          << block_id << "/" << (int)rec_id << "]" << dendl;
+
+      val = new_val;
     }
   }
 
   //---------------------------------------------------------------------------
-  int dedup_table_t::set_shared_manifest_src_mode(const key_t *p_key,
-                                                  disk_block_id_t block_id,
-                                                  record_id_t rec_id)
+  int dedup_table_t::set_src_mode(const key_t *p_key,
+                                  disk_block_id_t block_id,
+                                  record_id_t rec_id,
+                                  bool set_shared_manifest_src,
+                                  bool set_has_valid_hash_src)
   {
     uint32_t idx = find_entry(p_key);
     value_t &val = hash_tab[idx].val;
     if (val.is_occupied()) {
       if (val.block_idx == block_id && val.rec_id == rec_id) {
-        val.set_shared_manifest_src();
+        if (set_shared_manifest_src) {
+          val.set_shared_manifest_src();
+        }
+        if (set_has_valid_hash_src) {
+          val.set_has_valid_hash_src();
+        }
         return 0;
       }
     }
@@ -281,7 +297,7 @@ namespace rgw::dedup {
       uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units);
 
       // skip small single part objects which we can't dedup
-      if (!key.multipart_object() && (byte_size_approx <= head_object_size)) {
+      if (!dedupable_object(key.multipart_object(), min_obj_size_for_dedup, byte_size_approx)) {
         if (hash_tab[tab_idx].val.is_singleton()) {
           p_small_objs->singleton_count++;
         }
index 4a46db6e5b7912f5dd09fde275d87a054a1c7946..501cb20d9aa590951bb45923ade7745c313de9fd 100644 (file)
@@ -63,6 +63,26 @@ namespace rgw::dedup {
   static_assert(sizeof(key_t) == 24);
 
   class dedup_table_t {
+    struct __attribute__ ((packed)) table_flags_t {
+    private:
+      static constexpr uint8_t RGW_TABLE_FLAG_HAS_VALID_HASH  = 0x01;
+      static constexpr uint8_t RGW_TABLE_FLAG_SHARED_MANIFEST = 0x02;
+      static constexpr uint8_t RGW_TABLE_FLAG_OCCUPIED        = 0x04;
+    public:
+      table_flags_t() : flags(0) {}
+      table_flags_t(uint8_t _flags) : flags(_flags) {}
+      inline void clear() { this->flags = 0; }
+      inline bool has_valid_hash() const { return ((flags & RGW_TABLE_FLAG_HAS_VALID_HASH) != 0); }
+      inline void set_has_valid_hash()  { flags |= RGW_TABLE_FLAG_HAS_VALID_HASH; }
+      inline bool has_shared_manifest() const { return ((flags & RGW_TABLE_FLAG_SHARED_MANIFEST) != 0); }
+      inline void set_shared_manifest() { flags |= RGW_TABLE_FLAG_SHARED_MANIFEST; }
+      inline bool is_occupied() const {return ((this->flags & RGW_TABLE_FLAG_OCCUPIED) != 0); }
+      inline void set_occupied() {this->flags |= RGW_TABLE_FLAG_OCCUPIED; }
+      inline void clear_occupied() { this->flags &= ~RGW_TABLE_FLAG_OCCUPIED; }
+    private:
+      uint8_t flags;
+    };
+
   public:
     // 8 Bytes Value
     struct value_t {
@@ -93,6 +113,8 @@ namespace rgw::dedup {
       inline void inc_count() { count ++; }
       inline void reset_count() { count = 0; }
       inline void clear_flags() { flags.clear(); }
+      inline bool has_valid_hash() const {return flags.has_valid_hash(); }
+      inline void set_has_valid_hash_src() { this->flags.set_has_valid_hash(); }
       inline bool is_singleton() const { return (count == 1); }
       inline bool is_occupied() const { return flags.is_occupied(); }
       inline void set_occupied() { this->flags.set_occupied();  }
@@ -102,12 +124,14 @@ namespace rgw::dedup {
       disk_block_id_t block_idx; // 32 bits
       uint16_t        count;     // 16 bits
       record_id_t     rec_id;    //  8 bits
-      dedup_flags_t   flags;     //  8 bits
+      table_flags_t   flags;     //  8 bits
     } __attribute__((__packed__));
     static_assert(sizeof(value_t) == 8);
 
     dedup_table_t(const DoutPrefixProvider* _dpp,
                   uint32_t _head_object_size,
+                  uint32_t _min_obj_size_for_dedup,
+                  uint32_t _max_obj_size_for_split,
                   uint8_t *p_slab,
                   uint64_t slab_size);
     int add_entry(key_t *p_key,
@@ -129,6 +153,12 @@ namespace rgw::dedup {
                                      disk_block_id_t block_id,
                                      record_id_t rec_id);
 
+    int set_src_mode(const key_t *p_key,
+                     disk_block_id_t block_id,
+                     record_id_t rec_id,
+                     bool set_shared_manifest_src,
+                     bool set_has_valid_hash_src);
+
     void count_duplicates(dedup_stats_t *p_small_objs_stat,
                           dedup_stats_t *p_big_objs_stat);
 
@@ -145,7 +175,9 @@ namespace rgw::dedup {
     uint32_t       values_count = 0;
     uint32_t       entries_count = 0;
     uint32_t       occupied_count = 0;
-    uint32_t       head_object_size = (4ULL * 1024 * 1024);
+    uint32_t       head_object_size;
+    uint32_t       min_obj_size_for_dedup;
+    uint32_t       max_obj_size_for_split;
     table_entry_t *hash_tab = nullptr;
 
     // stat counters
index 61ad6b91c516ef6cff68dc0ddca89000fefdf44c..74252a853950346028f9eea0896acb531cec1559 100644 (file)
@@ -14,8 +14,8 @@
 
 #include "rgw_dedup_utils.h"
 #include "common/ceph_crypto.h"
-
 namespace rgw::dedup {
+
   //---------------------------------------------------------------------------
   std::ostream& operator<<(std::ostream &out, const dedup_req_type_t& dedup_type)
   {
@@ -566,10 +566,27 @@ namespace rgw::dedup {
     this->failed_rec_load         += other.failed_rec_load;
     this->failed_block_load       += other.failed_block_load;
 
+    this->different_storage_class       += other.different_storage_class;
+    this->invalid_hash_no_split_head    += other.invalid_hash_no_split_head;
+    this->invalid_storage_class_mapping += other.invalid_storage_class_mapping;
+    this->singleton_after_purge         += other.singleton_after_purge;
+    this->shared_manifest_after_purge   += other.shared_manifest_after_purge;
+    this->split_head_no_tail_placement  += other.split_head_no_tail_placement;
+    this->illegal_rec_id                += other.illegal_rec_id;
+    this->missing_last_block_marker     += other.missing_last_block_marker;
+
     this->valid_hash_attrs        += other.valid_hash_attrs;
     this->invalid_hash_attrs      += other.invalid_hash_attrs;
     this->set_hash_attrs          += other.set_hash_attrs;
     this->skip_hash_cmp           += other.skip_hash_cmp;
+    this->manifest_raw_obj        += other.manifest_raw_obj;
+    this->manifest_no_tail_placement += other.manifest_no_tail_placement;
+    this->rollback_tail_obj       += other.rollback_tail_obj;
+    this->failed_split_head_creat += other.failed_split_head_creat;
+    this->skip_shared_tail_objs   += other.skip_shared_tail_objs;
+    this->split_head_src          += other.split_head_src;
+    this->split_head_tgt          += other.split_head_tgt;
+    this->split_head_dedup_bytes  += other.split_head_dedup_bytes;
 
     this->set_shared_manifest_src += other.set_shared_manifest_src;
     this->loaded_objects          += other.loaded_objects;
@@ -659,9 +676,30 @@ namespace rgw::dedup {
         f->dump_unsigned("Set HASH", this->set_hash_attrs);
       }
 
+      if (this->skip_shared_tail_objs) {
+        f->dump_unsigned("Skip Shared Tail Objs (server-side-copy)", this->skip_shared_tail_objs);
+      }
       if (this->skip_hash_cmp) {
         f->dump_unsigned("Can't run HASH compare", this->skip_hash_cmp);
       }
+      if (this->manifest_raw_obj) {
+        f->dump_unsigned("Manifest has RAW OBJ", this->manifest_raw_obj);
+      }
+      if (this->manifest_no_tail_placement) {
+        f->dump_unsigned("Manifest has no tail placement", this->manifest_no_tail_placement);
+      }
+      if (this->rollback_tail_obj) {
+        f->dump_unsigned("Rollback tail obj", this->rollback_tail_obj);
+      }
+      if (this->split_head_src) {
+        f->dump_unsigned("Split-Head Src OBJ", this->split_head_src);
+      }
+      if (this->split_head_tgt) {
+        f->dump_unsigned("Split-Head Tgt OBJ", this->split_head_tgt);
+      }
+      if (this->split_head_dedup_bytes) {
+        f->dump_unsigned("Split-Head Dedup-Bytes", this->split_head_dedup_bytes);
+      }
     }
 
     {
@@ -716,6 +754,18 @@ namespace rgw::dedup {
       if (this->failed_block_load) {
         f->dump_unsigned("Failed Block-Load ", this->failed_block_load);
       }
+
+      if (this->illegal_rec_id) {
+        f->dump_unsigned("Failed illegal_rec_id", this->illegal_rec_id );
+      }
+      if (this->missing_last_block_marker) {
+        f->dump_unsigned("Failed missing_last_block_marker in rec",
+                         this->missing_last_block_marker);
+      }
+
+      if (this->failed_split_head_creat) {
+        f->dump_unsigned("Failed Split-Head Create (EEXIST)", this->failed_split_head_creat);
+      }
       if (this->failed_dedup) {
         f->dump_unsigned("Failed Dedup", this->failed_dedup);
       }
@@ -732,6 +782,30 @@ namespace rgw::dedup {
       if (this->size_mismatch) {
         f->dump_unsigned("Size mismatch SRC/TGT", this->size_mismatch);
       }
+      if (this->different_storage_class) {
+        f->dump_unsigned("different_storage_class",
+                         this->different_storage_class);
+      }
+      if (this->invalid_hash_no_split_head) {
+        f->dump_unsigned("Failed rec has invalid hash w/o split-head ",
+                         this->invalid_hash_no_split_head);
+      }
+      if (this->invalid_storage_class_mapping) {
+        f->dump_unsigned("Failed, invalid_storage_class_mapping",
+                         this->invalid_storage_class_mapping);
+      }
+      if (this->singleton_after_purge) {
+        f->dump_unsigned("Failed, has singleton after purge",
+                         this->singleton_after_purge);
+      }
+      if (this->shared_manifest_after_purge) {
+        f->dump_unsigned("Failed, has shared manifest after purge",
+                         this->shared_manifest_after_purge);
+      }
+      if (this->split_head_no_tail_placement) {
+        f->dump_unsigned("No Tail Placement during Split-Head processing",
+                         this->split_head_no_tail_placement);
+      }
     }
   }
 
@@ -768,10 +842,27 @@ namespace rgw::dedup {
     encode(m.failed_rec_load, bl);
     encode(m.failed_block_load, bl);
 
+    encode(m.different_storage_class, bl);
+    encode(m.invalid_hash_no_split_head, bl);
+    encode(m.invalid_storage_class_mapping, bl);
+    encode(m.singleton_after_purge, bl);
+    encode(m.shared_manifest_after_purge, bl);
+    encode(m.split_head_no_tail_placement, bl);
+    encode(m.illegal_rec_id, bl);
+    encode(m.missing_last_block_marker, bl);
+
     encode(m.valid_hash_attrs, bl);
     encode(m.invalid_hash_attrs, bl);
     encode(m.set_hash_attrs, bl);
     encode(m.skip_hash_cmp, bl);
+    encode(m.manifest_raw_obj, bl);
+    encode(m.manifest_no_tail_placement, bl);
+    encode(m.rollback_tail_obj, bl);
+    encode(m.failed_split_head_creat, bl);
+    encode(m.skip_shared_tail_objs, bl);
+    encode(m.split_head_src, bl);
+    encode(m.split_head_tgt, bl);
+    encode(m.split_head_dedup_bytes, bl);
     encode(m.set_shared_manifest_src, bl);
 
     encode(m.loaded_objects, bl);
@@ -822,10 +913,27 @@ namespace rgw::dedup {
     decode(m.failed_rec_load, bl);
     decode(m.failed_block_load, bl);
 
+    decode(m.different_storage_class, bl);
+    decode(m.invalid_hash_no_split_head, bl);
+    decode(m.invalid_storage_class_mapping, bl);
+    decode(m.singleton_after_purge, bl);
+    decode(m.shared_manifest_after_purge, bl);
+    decode(m.split_head_no_tail_placement, bl);
+    decode(m.illegal_rec_id, bl);
+    decode(m.missing_last_block_marker, bl);
+
     decode(m.valid_hash_attrs, bl);
     decode(m.invalid_hash_attrs, bl);
     decode(m.set_hash_attrs, bl);
     decode(m.skip_hash_cmp, bl);
+    decode(m.manifest_raw_obj, bl);
+    decode(m.manifest_no_tail_placement, bl);
+    decode(m.rollback_tail_obj, bl);
+    decode(m.failed_split_head_creat, bl);
+    decode(m.skip_shared_tail_objs, bl);
+    decode(m.split_head_src, bl);
+    decode(m.split_head_tgt, bl);
+    decode(m.split_head_dedup_bytes, bl);
     decode(m.set_shared_manifest_src, bl);
 
     decode(m.loaded_objects, bl);
index abe624321225b337378f4495a5eac857f47c990c..579e048a259f0bd0f8650a8bc64ff6d5f25d005a 100644 (file)
@@ -25,6 +25,7 @@
 #include "common/dout.h"
 
 #define FULL_DEDUP_SUPPORT
+
 namespace rgw::dedup {
   using namespace std::chrono;
   using work_shard_t   = uint16_t;
@@ -68,29 +69,6 @@ namespace rgw::dedup {
   };
 
   std::ostream& operator<<(std::ostream &out, const dedup_req_type_t& dedup_type);
-  struct __attribute__ ((packed)) dedup_flags_t {
-  private:
-    static constexpr uint8_t RGW_DEDUP_FLAG_HASH_CALCULATED = 0x01; // REC
-    static constexpr uint8_t RGW_DEDUP_FLAG_SHARED_MANIFEST   = 0x02; // REC + TAB
-    static constexpr uint8_t RGW_DEDUP_FLAG_OCCUPIED          = 0x04; // TAB
-    static constexpr uint8_t RGW_DEDUP_FLAG_FASTLANE          = 0x08; // REC
-
-  public:
-    dedup_flags_t() : flags(0) {}
-    dedup_flags_t(uint8_t _flags) : flags(_flags) {}
-    inline void clear() { this->flags = 0; }
-    inline bool hash_calculated() const { return ((flags & RGW_DEDUP_FLAG_HASH_CALCULATED) != 0); }
-    inline void set_hash_calculated()  { flags |= RGW_DEDUP_FLAG_HASH_CALCULATED; }
-    inline bool has_shared_manifest() const { return ((flags & RGW_DEDUP_FLAG_SHARED_MANIFEST) != 0); }
-    inline void set_shared_manifest() { flags |= RGW_DEDUP_FLAG_SHARED_MANIFEST; }
-    inline bool is_occupied() const {return ((this->flags & RGW_DEDUP_FLAG_OCCUPIED) != 0); }
-    inline void set_occupied() {this->flags |= RGW_DEDUP_FLAG_OCCUPIED; }
-    inline void clear_occupied() { this->flags &= ~RGW_DEDUP_FLAG_OCCUPIED; }
-    inline bool is_fastlane()  const { return ((flags & RGW_DEDUP_FLAG_FASTLANE) != 0); }
-    inline void set_fastlane()  { flags |= RGW_DEDUP_FLAG_FASTLANE; }
-  private:
-    uint8_t flags;
-  };
 
   class alignas(8) Throttle {
     friend void validate_max_calls_offset();
@@ -262,11 +240,27 @@ namespace rgw::dedup {
     uint64_t failed_rec_load = 0;
     uint64_t failed_block_load = 0;
 
+    uint64_t different_storage_class = 0;
+    uint64_t invalid_hash_no_split_head = 0;
+    uint64_t invalid_storage_class_mapping = 0;
+    uint64_t singleton_after_purge = 0;
+    uint64_t shared_manifest_after_purge = 0;
+    uint64_t split_head_no_tail_placement = 0;
+    uint64_t illegal_rec_id = 0;
+    uint64_t missing_last_block_marker = 0;
+
     uint64_t valid_hash_attrs = 0;
     uint64_t invalid_hash_attrs = 0;
     uint64_t set_hash_attrs = 0;
+    uint64_t skip_shared_tail_objs = 0;
     uint64_t skip_hash_cmp = 0;
-
+    uint64_t manifest_raw_obj = 0;
+    uint64_t manifest_no_tail_placement = 0;
+    uint64_t rollback_tail_obj = 0;
+    uint64_t failed_split_head_creat = 0;
+    uint64_t split_head_src = 0;
+    uint64_t split_head_tgt = 0;
+    uint64_t split_head_dedup_bytes = 0;
     uint64_t set_shared_manifest_src = 0;
     uint64_t loaded_objects = 0;
     uint64_t processed_objects = 0;
@@ -362,7 +356,19 @@ namespace rgw::dedup {
                                 const DoutPrefixProvider* dpp);
 
   //---------------------------------------------------------------------------
-  static inline uint64_t calc_deduped_bytes(uint64_t head_obj_size,
+  static inline bool dedupable_object(bool     multipart_object,
+                                      uint64_t min_obj_size_for_dedup,
+                                      uint64_t object_byte_size)
+  {
+    // all multipart objects are dedupable because the head-object is empty
+    // otherwise make sure object_byte_size is large enough
+    return (multipart_object || object_byte_size >= min_obj_size_for_dedup);
+  }
+
+  //---------------------------------------------------------------------------
+  static inline uint64_t calc_deduped_bytes(uint32_t head_obj_size,
+                                            uint32_t min_obj_size_for_dedup,
+                                            uint32_t max_obj_size_for_split,
                                             uint16_t num_parts,
                                             uint64_t size_bytes)
   {
@@ -372,9 +378,13 @@ namespace rgw::dedup {
     }
     else {
       // reduce the head size
-      if (size_bytes > head_obj_size) {
+      if (size_bytes > max_obj_size_for_split) {
         return size_bytes - head_obj_size;
       }
+      else if (size_bytes >= min_obj_size_for_dedup) {
+        // Head is split into an empty obj and a new tail enabling a full dedup
+        return size_bytes;
+      }
       else {
         return 0;
       }
index 1e679a38b8e9f60a8b2a1807db31ce90c7160103..4129a015c98b103aad7d2625a5397b70e6e06b31 100644 (file)
@@ -256,6 +256,10 @@ public:
   void get_implicit_location(uint64_t cur_part_id, uint64_t cur_stripe, uint64_t ofs,
                              std::string *override_prefix, rgw_obj_select *location) const;
 
+  void clear_rules() {
+    rules.clear();
+  }
+
   void set_trivial_rule(uint64_t tail_ofs, uint64_t stripe_max_size) {
     RGWObjManifestRule rule(0, tail_ofs, 0, stripe_max_size);
     rules[0] = rule;
@@ -467,6 +471,10 @@ public:
     return max_head_size;
   }
 
+  void set_max_head_size(uint64_t _max_head_size) {
+    max_head_size = _max_head_size;
+  }
+
   const std::string& get_tier_type() {
       return tier_type;
   }
index 87219399678dc95b428bf3f7930d802427f5c508..40ab41c400ffa2e3093ec8417e0da8d522aede14 100644 (file)
@@ -89,7 +89,6 @@ using ceph::crypto::MD5;
 #define RGW_ATTR_CORS          RGW_ATTR_PREFIX "cors"
 #define RGW_ATTR_ETAG RGW_ATTR_PREFIX "etag"
 #define RGW_ATTR_CKSUM          RGW_ATTR_PREFIX "cksum"
-#define RGW_ATTR_SHA256         RGW_ATTR_PREFIX "x-amz-content-sha256"
 #define RGW_ATTR_BLAKE3         RGW_ATTR_PREFIX "blake3"
 #define RGW_ATTR_BUCKETS       RGW_ATTR_PREFIX "buckets"
 #define RGW_ATTR_META_PREFIX   RGW_ATTR_PREFIX RGW_AMZ_META_PREFIX
index 3f3a3d606dd27f9825a065f2f83b5b460c4a695e..f24857e3d993f20d0b73c554536054bbc198b978 100644 (file)
@@ -262,6 +262,17 @@ def gen_connections_multi(num_tenants):
     log.debug("gen_connections_multi: All connection and buckets are set")
     return (tenants, bucket_names, conns)
 
+#-------------------------------------------------------------------------------
+def create_buckets(conn, max_copies_count):
+    bucket_names=[]
+    for i in range(0, max_copies_count):
+        bucket_name=gen_bucket_name()
+        bucket_names.append(bucket_name)
+        log.debug("conn.create_bucket(Bucket=%s)", bucket_name)
+        conn.create_bucket(Bucket=bucket_name)
+
+    return bucket_names
+
 
 #####################
 # dedup tests
@@ -270,8 +281,11 @@ OUT_DIR="/tmp/dedup/"
 KB=(1024)
 MB=(1024*KB)
 POTENTIAL_OBJ_SIZE=(64*KB)
+DEDUP_MIN_OBJ_SIZE=(64*KB)
+SPLIT_HEAD_SIZE=(16*MB)
 RADOS_OBJ_SIZE=(4*MB)
-MULTIPART_SIZE=(16*MB)
+# The default multipart threshold size for S3cmd is 15 MB.
+MULTIPART_SIZE=(15*MB)
 default_config = TransferConfig(multipart_threshold=MULTIPART_SIZE, multipart_chunksize=MULTIPART_SIZE)
 ETAG_ATTR="user.rgw.etag"
 POOLNAME="default.rgw.buckets.data"
@@ -385,26 +399,41 @@ def count_space_in_all_buckets():
 #-------------------------------------------------------------------------------
 def count_objects_in_bucket(bucket_name, conn):
     max_keys=1000
-    marker=""
+    continuation_token = None
     obj_count=0
     while True:
         log.debug("bucket_name=%s", bucket_name)
-        listing=conn.list_objects(Bucket=bucket_name, Marker=marker, MaxKeys=max_keys)
+        list_args = {
+            'Bucket': bucket_name,
+            'MaxKeys': max_keys
+        }
+        if continuation_token:
+            list_args['ContinuationToken'] = continuation_token
+
+        listing=conn.list_objects_v2(**list_args)
         if 'Contents' not in listing or len(listing['Contents'])== 0:
             return 0
 
         obj_count += len(listing['Contents'])
 
-        if listing['IsTruncated']:
-            marker=listing['NextMarker']
-            log.debug("marker=%s, obj_count=%d", marker, obj_count)
-            continue
+        if 'NextContinuationToken' in listing:
+            continuation_token = listing['NextContinuationToken']
+            log.debug("count_objects_in_bucket: Token=%s, count=%d",
+                      continuation_token, obj_count)
         else:
             return obj_count
 
 
 #-------------------------------------------------------------------------------
-def count_object_parts_in_all_buckets(verbose=False):
+def copy_obj(base_bucket_name, base_key, bucket_name, key):
+    s3_prefix="s3://"
+    src  = s3_prefix + base_bucket_name + "/" + base_key
+    dest = s3_prefix + bucket_name + "/" + key
+    result = bash(['s3cmd', 'cp', src, dest])
+    assert result[1] == 0
+
+#-------------------------------------------------------------------------------
+def count_object_parts_in_all_buckets(verbose=False, expected_size=0):
     result = rados(['lspools'])
     assert result[1] == 0
     found=False
@@ -420,17 +449,40 @@ def count_object_parts_in_all_buckets(verbose=False):
 
     result = rados(['ls', '-p ', POOLNAME])
     assert result[1] == 0
-
     names=result[0].split()
-    count = 0
-    for name in names:
-        #log.debug(name)
-        count = count + 1
+    rados_count = len(names)
+    if (rados_count > 1000):
+        ### we can only do about 10 stat call per-second!!
+        ### TBD: add obj_size to ls output to allow more efficient size check
+        log.info(">>> rados obj_count(%d) is too high -> skip stat check\n",
+                 len(names))
+        expected_size = 0
+
+    byte_size_total = 0
+    ondisk_size_total = 0
+    start_time = time.perf_counter()
+    for rados_name in names:
+        if verbose:
+            log.debug(rados_name)
+        if expected_size:
+            result = rados(['-p ', POOLNAME, 'stat', rados_name])
+            assert result[1] == 0
+            stat = result[0].split()
+            byte_size=int(stat[-1])
+            byte_size_total += byte_size
+            ondisk_size_total += calc_on_disk_byte_size(byte_size)
+
+    if expected_size:
+        end_time = time.perf_counter()
+        time_elapsed = end_time - start_time
+        log.info("rados_count=%d, ondisk_size_total=%d, expected_size=%d, time=%d(sec)",
+                 rados_count, ondisk_size_total, expected_size, time_elapsed)
+        assert ondisk_size_total == expected_size
 
     if verbose:
-        log.debug("Pool has %d rados objects", count)
+        log.debug("Pool has %d rados objects", rados_count)
 
-    return count
+    return rados_count
 
 
 #-------------------------------------------------------------------------------
@@ -443,29 +495,61 @@ def cleanup_local():
         return False
 
 
+#-------------------------------------------------------------------------------
+def check_delete_objects_response(response):
+    # Log per-key failures from an S3 delete_objects() response.
+    # boto3 reports partial failures in the response's 'Errors' list instead of
+    # raising, so callers must inspect it explicitly.
+    # Check for delete failures
+    if 'Errors' in response and response['Errors']:
+        log.error("Delete failures detected:")
+        for error in response['Errors']:
+            log.error("delete_objects::ERROR::Key=%s, Code=%s, Message=%s",
+                      error['Key'], error['Code'], error['Message'])
+
+    else:
+        log.debug("All objects deleted successfully.")
+
+
+#-------------------------------------------------------------------------------
+def delete_objects(conn, bucket_name, object_keys):
+    # Bulk-delete the given keys from bucket_name with a single S3
+    # DeleteObjects call, then log any per-key failures from the response.
+    response=conn.delete_objects(Bucket=bucket_name,
+                                 Delete={"Objects": [{"Key": key} for key in object_keys]})
+
+    # Check for delete failures
+    check_delete_objects_response(response)
+
+
 #-------------------------------------------------------------------------------
 def delete_bucket_with_all_objects(bucket_name, conn):
     max_keys=1000
-    marker=""
+    continuation_token = None
     obj_count=0
     while True:
-        listing=conn.list_objects(Bucket=bucket_name, Marker=marker, MaxKeys=max_keys)
+        list_args = {
+            'Bucket': bucket_name,
+            'MaxKeys': max_keys
+        }
+        if continuation_token:
+            list_args['ContinuationToken'] = continuation_token
+
+        listing=conn.list_objects_v2(**list_args)
         if 'Contents' not in listing or len(listing['Contents'])== 0:
             log.debug("Bucket '%s' is empty, skipping...", bucket_name)
             return
 
         objects=[]
         for obj in listing['Contents']:
-            log.debug(obj['Key'])
+            log.debug("delete_bucket_with_all_objects: add obj: %s", obj['Key'])
             objects.append({'Key': obj['Key']})
 
         obj_count += len(objects)
         # delete objects from the bucket
-        conn.delete_objects(Bucket=bucket_name, Delete={'Objects':objects})
-        if listing['IsTruncated']:
-            marker=listing['NextMarker']
-            log.debug("marker=%s, obj_count=%d", marker, obj_count)
-            continue
+        log.debug("delete_bucket_with_all_objects: delete %d objs", obj_count)
+        response=conn.delete_objects(Bucket=bucket_name, Delete={'Objects':objects})
+        check_delete_objects_response(response)
+
+        if 'NextContinuationToken' in listing:
+            continuation_token = listing['NextContinuationToken']
+            log.debug("delete_bucket_with_all_objects: Token=%s, count=%d",
+                      continuation_token, obj_count)
         else:
             break
 
@@ -476,7 +560,7 @@ def delete_bucket_with_all_objects(bucket_name, conn):
 def verify_pool_is_empty():
     result = admin(['gc', 'process', '--include-all'])
     assert result[1] == 0
-    assert count_object_parts_in_all_buckets() == 0
+    assert count_object_parts_in_all_buckets(False, 0) == 0
 
 
 #-------------------------------------------------------------------------------
@@ -538,15 +622,39 @@ def calc_rados_obj_count(num_copies, obj_size, config):
     return rados_obj_count
 
 
+BLOCK_SIZE=4096
+#-------------------------------------------------------------------------------
+def calc_on_disk_byte_size(byte_size):
+    # Round byte_size up to the next BLOCK_SIZE (4 KiB) multiple - the space
+    # the object actually consumes on disk.
+    return (((byte_size+BLOCK_SIZE-1)//BLOCK_SIZE)*BLOCK_SIZE)
+
+
+#-------------------------------------------------------------------------------
+def calc_head_size(obj_size, config):
+    # Return the expected on-disk size of the HEAD rados object for an S3
+    # object of obj_size bytes, given the upload config.
+    on_disk_byte_size = calc_on_disk_byte_size(obj_size)
+    threshold = config.multipart_threshold
+    # Objects with size bigger than MULTIPART_SIZE are uploaded as multi-part
+    # multi-part objects got a zero size Head objects
+    if obj_size >= threshold:
+        head_size = 0
+    else:
+        # single-part upload: the head holds at most RADOS_OBJ_SIZE bytes
+        head_size = min(RADOS_OBJ_SIZE, on_disk_byte_size)
+
+    return head_size
+
+
 #-------------------------------------------------------------------------------
 def calc_dedupable_space(obj_size, config):
+    on_disk_byte_size = calc_on_disk_byte_size(obj_size)
+
     threshold = config.multipart_threshold
     # Objects with size bigger than MULTIPART_SIZE are uploaded as multi-part
     # multi-part objects got a zero size Head objects
     if obj_size >= threshold:
-        dedupable_space = obj_size
-    elif obj_size > RADOS_OBJ_SIZE:
-        dedupable_space = obj_size - RADOS_OBJ_SIZE
+        dedupable_space = on_disk_byte_size
+    elif obj_size > SPLIT_HEAD_SIZE:
+        dedupable_space = on_disk_byte_size - RADOS_OBJ_SIZE
+    elif obj_size >= DEDUP_MIN_OBJ_SIZE:
+        dedupable_space = on_disk_byte_size
     else:
         dedupable_space = 0
 
@@ -554,10 +662,18 @@ def calc_dedupable_space(obj_size, config):
               float(obj_size)/MB, float(dedupable_space)/MB)
     return dedupable_space
 
-BLOCK_SIZE=4096
 #-------------------------------------------------------------------------------
-def calc_on_disk_byte_size(byte_size):
-    return (((byte_size+BLOCK_SIZE-1)//BLOCK_SIZE)*BLOCK_SIZE)
+def calc_split_objs_count(obj_size, num_copies, config):
+    # Return 1 if dedup is expected to split this object's head into an
+    # attribute-only head plus a new dedupable tail (adding one rados object
+    # per unique object), 0 otherwise.
+    threshold = config.multipart_threshold
+    on_disk_byte_size = calc_on_disk_byte_size(obj_size)
+
+    # no split for singletons, objects above the split limit, or multipart
+    # uploads (whose head already holds no data)
+    if num_copies < 2 or on_disk_byte_size > SPLIT_HEAD_SIZE or obj_size >= threshold:
+        return 0
+
+    # objects below the dedup minimum are skipped by dedup entirely
+    if on_disk_byte_size < DEDUP_MIN_OBJ_SIZE:
+        return 0
+
+    return 1
 
 
 #-------------------------------------------------------------------------------
@@ -569,7 +685,7 @@ def calc_expected_stats(dedup_stats, obj_size, num_copies, config):
     threshold = config.multipart_threshold
     dedup_stats.skip_shared_manifest = 0
     dedup_stats.size_before_dedup += (on_disk_byte_size * num_copies)
-    if on_disk_byte_size <= RADOS_OBJ_SIZE and threshold > RADOS_OBJ_SIZE:
+    if on_disk_byte_size < DEDUP_MIN_OBJ_SIZE and threshold > DEDUP_MIN_OBJ_SIZE:
         dedup_stats.skip_too_small += num_copies
         dedup_stats.skip_too_small_bytes += (on_disk_byte_size * num_copies)
 
@@ -584,8 +700,6 @@ def calc_expected_stats(dedup_stats, obj_size, num_copies, config):
         return
 
     dedup_stats.total_processed_objects += num_copies
-    #dedup_stats.loaded_objects += num_copies
-
     if num_copies == 1:
         dedup_stats.singleton_obj += 1
         dedup_stats.skip_singleton += 1
@@ -628,21 +742,35 @@ def calc_expected_results(files, config):
 
 
 #-------------------------------------------------------------------------------
-def upload_objects(bucket_name, files, indices, conn, config, check_obj_count=True):
+def print_files(files, config):
+    # Debug helper: log name/size/copy-count and whether a head-split is
+    # expected for every (filename, obj_size, num_copies) tuple in files.
+    for idx, f in enumerate(files):
+        filename=f[0]
+        obj_size=f[1]
+        num_copies=f[2]
+        assert(obj_size)
+        split_head = calc_split_objs_count(obj_size, num_copies, config)
+        log.info("[%d]%s::size=%d, num_copies=%d, split_head=%d",
+                 idx, filename, obj_size, num_copies, split_head);
+
+
+#-------------------------------------------------------------------------------
+def upload_objects(bucket_name, files, indices, conn, config, check_obj_count):
     dedup_stats = Dedup_Stats()
     total_space=0
     duplicated_space=0
     duplicated_tail_objs=0
     rados_objects_total=0
     s3_objects_total=0
+    split_head_objs=0
 
     for (f, idx) in zip(files, indices):
         filename=f[0]
         obj_size=f[1]
         num_copies=f[2]
         assert(obj_size)
+        split_head_objs += calc_split_objs_count(obj_size, num_copies, config)
         calc_expected_stats(dedup_stats, obj_size, num_copies, config)
-        total_space += (obj_size * num_copies)
+        total_space += (calc_on_disk_byte_size(obj_size) * num_copies)
         dedupable_space=calc_dedupable_space(obj_size, config)
         duplicated_space += ((num_copies-1) * dedupable_space)
         rados_obj_count=calc_rados_obj_count(num_copies, obj_size, config)
@@ -652,10 +780,9 @@ def upload_objects(bucket_name, files, indices, conn, config, check_obj_count=Tr
         s3_objects_total += num_copies
         if s3_objects_total and (s3_objects_total % 1000 == 0):
             log.debug("%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
-                     s3_objects_total, rados_objects_total, total_space/MB)
+                      s3_objects_total, rados_objects_total, total_space/MB)
         for i in range(idx, num_copies):
             key = gen_object_name(filename, i)
-            #log.debug("upload_file %s/%s with crc32", bucket_name, key)
             conn.upload_file(OUT_DIR + filename, bucket_name, key, Config=config)
 
     log.debug("==========================================")
@@ -665,15 +792,70 @@ def upload_objects(bucket_name, files, indices, conn, config, check_obj_count=Tr
     log.debug("Based on calculation we should have %d duplicated tail objs", duplicated_tail_objs)
     log.debug("Based on calculation we should have %.2f MiB total in pool", total_space/MB)
     log.debug("Based on calculation we should have %.2f MiB duplicated space in pool", duplicated_space/MB)
-
-    expected_rados_obj_count_post_dedup=(rados_objects_total-duplicated_tail_objs)
+    log.info("split_head_objs=%d, rados_objects_total=%d, duplicated_tail_objs=%d",
+             split_head_objs, rados_objects_total, duplicated_tail_objs)
+    expected_rados_obj_count_post_dedup=(split_head_objs+rados_objects_total-duplicated_tail_objs)
     log.debug("Post dedup expcted rados obj count = %d", expected_rados_obj_count_post_dedup)
     expcted_space_post_dedup=(total_space-duplicated_space)
     log.debug("Post dedup expcted data in pool = %.2f MiB", expcted_space_post_dedup/MB)
     if check_obj_count:
-        assert rados_objects_total == count_object_parts_in_all_buckets()
+        log.debug("upload_objects: verify current Rados state - total_space=%d", total_space)
+        # assert rados_objects_total == count_object_parts_in_all_buckets(True, dedup_stats.size_before_dedup)
+        # skip size check as it is time consuming
+        assert rados_objects_total == count_object_parts_in_all_buckets(False, 0)
+
+    return (expected_rados_obj_count_post_dedup, dedup_stats, s3_objects_total)
+
+
+#-------------------------------------------------------------------------------
+def upload_objects_with_copy(files, conn, bucket_names, indices, config):
+    # Upload one base copy of each file, then create the remaining copies via
+    # server-side copy_object. Server-side copies share the tail rados objects
+    # with the source, so only head objects are duplicated and dedup should
+    # find nothing new to dedup (dedup_bytes_estimate is forced to 0 below).
+    # Returns (expected_rados_obj_count_post_dedup, dedup_stats, s3_objects_total).
+    dedup_stats = Dedup_Stats()
+    total_space=0
+    rados_objects_total=0
+    s3_objects_total=0
+
+    for (f, idx) in zip(files, indices):
+        filename=f[0]
+        obj_size=f[1]
+        num_copies=f[2]
+        assert(obj_size)
+        head_size = calc_head_size(obj_size, config)
+        obj_size_on_disk=calc_on_disk_byte_size(obj_size)
+        log.debug("upload_objects_with_copy:obj_size=%d, on_disk_size=%d, head_size=%d",
+                  obj_size, obj_size_on_disk, head_size);
+        # one full object plus (num_copies-1) extra heads; tails are shared
+        total_space += (obj_size_on_disk + (num_copies-1)*head_size)
+        rados_obj_count=calc_rados_obj_count(num_copies, obj_size, config)
+        tail_objs_count =rados_obj_count-1
+        rados_objects_total += (tail_objs_count + num_copies)
+        log.debug("upload_objects::%s::size=%d, num_copies=%d", filename, obj_size, num_copies);
+        s3_objects_total += num_copies
+        if s3_objects_total and (s3_objects_total % 1000 == 0):
+            log.debug("%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
+                      s3_objects_total, rados_objects_total, total_space/MB)
+
+        base_obj=dict()
+        for i in range(idx, num_copies):
+            key = gen_object_name(filename, i)
+            bucket_name=bucket_names[i]
+            if i == 0:
+                # first copy is a real upload; it becomes the copy source
+                base_obj = {'Bucket': bucket_name, 'Key': key}
+                #log.info("upload_file: %s -> %s/%s", filename, bucket_name, key)
+                conn.upload_file(OUT_DIR + filename, bucket_name, key, Config=config)
+            else:
+                log.debug("copy_obj: %s/%s -> %s/%s",
+                          base_obj['Bucket'], base_obj['Key'], bucket_name, key)
+                conn.copy_object(CopySource=base_obj, Bucket=bucket_name, Key=key)
+
+
+    dedup_stats.deduped_obj = 0
+    dedup_stats.size_before_dedup = total_space
+    # No change should happen since tail-objects are already de-duplicated
+    dedup_stats.dedup_bytes_estimate = 0
+    expected_rados_obj_count_post_dedup=rados_objects_total
+
+    log.info("upload_objects: verify current Rados state - total_space=%d", total_space)
+    assert rados_objects_total == count_object_parts_in_all_buckets(False, total_space)
 
-    expected_results=(expected_rados_obj_count_post_dedup, expcted_space_post_dedup)
     return (expected_rados_obj_count_post_dedup, dedup_stats, s3_objects_total)
 
 
@@ -686,13 +868,16 @@ def upload_objects_multi(files, conns, bucket_names, indices, config, check_obj_
     duplicated_tail_objs=0
     rados_objects_total=0
     s3_objects_total=0
+    split_head_objs=0
+
     for (f, idx) in zip(files, indices):
         filename=f[0]
         obj_size=f[1]
         num_copies=f[2]
         assert(obj_size)
+        split_head_objs += calc_split_objs_count(obj_size, num_copies, config)
         calc_expected_stats(dedup_stats, obj_size, num_copies, config)
-        total_space += (obj_size * num_copies)
+        total_space += (calc_on_disk_byte_size(obj_size) * num_copies)
         dedupable_space=calc_dedupable_space(obj_size, config)
         duplicated_space += ((num_copies-1) * dedupable_space)
         rados_obj_count=calc_rados_obj_count(num_copies, obj_size, config)
@@ -702,7 +887,7 @@ def upload_objects_multi(files, conns, bucket_names, indices, config, check_obj_
         s3_objects_total += num_copies
         if s3_objects_total and (s3_objects_total % 1000 == 0):
             log.debug("%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
-                     s3_objects_total, rados_objects_total, total_space/MB)
+                      s3_objects_total, rados_objects_total, total_space/MB)
         for i in range(idx, num_copies):
             ten_id = i % max_tenants
             key = gen_object_name(filename, i)
@@ -710,8 +895,8 @@ def upload_objects_multi(files, conns, bucket_names, indices, config, check_obj_
             log.debug("upload_objects::<%s/%s>", bucket_names[ten_id], key)
 
     log.debug("==========================================")
-    log.debug("Summary:%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
-             s3_objects_total, rados_objects_total, total_space/MB)
+    log.debug("Summary:%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
+              s3_objects_total, rados_objects_total, total_space/MB)
     log.debug("Based on calculation we should have %d rados objects", rados_objects_total)
     log.debug("Based on calculation we should have %d duplicated tail objs", duplicated_tail_objs)
     log.debug("Based on calculation we should have %.2f MiB total in pool", total_space/MB)
@@ -722,15 +907,16 @@ def upload_objects_multi(files, conns, bucket_names, indices, config, check_obj_
         s3_object_count += count_objects_in_bucket(bucket_name, conn)
 
     log.debug("bucket listings reported a total of %d s3 objects", s3_object_count)
-    expected_rados_obj_count_post_dedup=(rados_objects_total-duplicated_tail_objs)
+    expected_rados_obj_count_post_dedup=(split_head_objs+rados_objects_total-duplicated_tail_objs)
     log.debug("Post dedup expcted rados obj count = %d", expected_rados_obj_count_post_dedup)
     expcted_space_post_dedup=(total_space-duplicated_space)
     log.debug("Post dedup expcted data in pool = %.2f MiB", expcted_space_post_dedup/MB)
     if check_obj_count:
-        assert rados_objects_total == count_object_parts_in_all_buckets()
+        log.debug("upload_objects_multi: verify current Rados state (obj/size)")
+        #assert rados_objects_total == count_object_parts_in_all_buckets(True, dedup_stats.size_before_dedup)
+        assert rados_objects_total == count_object_parts_in_all_buckets(False, 0)
         assert (s3_object_count == s3_objects_total)
 
-    expected_results=(expected_rados_obj_count_post_dedup, expcted_space_post_dedup)
     return (expected_rados_obj_count_post_dedup, dedup_stats, s3_objects_total)
 
 
@@ -769,13 +955,16 @@ def procs_upload_objects(files, conns, bucket_names, indices, config, check_obj_
     duplicated_tail_objs=0
     rados_objects_total=0
     s3_objects_total=0
+    split_head_objs=0
+
     for (f, idx) in zip(files, indices):
         filename=f[0]
         obj_size=f[1]
         num_copies=f[2]
         assert(obj_size)
+        split_head_objs += calc_split_objs_count(obj_size, num_copies, config)
         calc_expected_stats(dedup_stats, obj_size, num_copies, config)
-        total_space += (obj_size * num_copies)
+        total_space += (calc_on_disk_byte_size(obj_size) * num_copies)
         dedupable_space=calc_dedupable_space(obj_size, config)
         duplicated_space += ((num_copies-1) * dedupable_space)
         rados_obj_count=calc_rados_obj_count(num_copies, obj_size, config)
@@ -789,8 +978,8 @@ def procs_upload_objects(files, conns, bucket_names, indices, config, check_obj_
         proc_list[idx].join()
 
     log.debug("==========================================")
-    log.debug("Summary:%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
-             s3_objects_total, rados_objects_total, total_space/MB)
+    log.debug("Summary:%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
+              s3_objects_total, rados_objects_total, total_space/MB)
     log.debug("Based on calculation we should have %d rados objects", rados_objects_total)
     log.debug("Based on calculation we should have %d duplicated tail objs", duplicated_tail_objs)
     log.debug("Based on calculation we should have %.2f MiB total in pool", total_space/MB)
@@ -801,27 +990,135 @@ def procs_upload_objects(files, conns, bucket_names, indices, config, check_obj_
         s3_object_count += count_objects_in_bucket(bucket_name, conn)
 
     log.debug("bucket listings reported a total of %d s3 objects", s3_object_count)
-    expected_rados_obj_count_post_dedup=(rados_objects_total-duplicated_tail_objs)
+    expected_rados_obj_count_post_dedup=(split_head_objs+rados_objects_total-duplicated_tail_objs)
     log.debug("Post dedup expcted rados obj count = %d", expected_rados_obj_count_post_dedup)
     expcted_space_post_dedup=(total_space-duplicated_space)
     log.debug("Post dedup expcted data in pool = %.2f MiB", expcted_space_post_dedup/MB)
     if check_obj_count:
-        assert rados_objects_total == count_object_parts_in_all_buckets()
+        log.debug("procs_upload_objects:: count_object_parts_in_all_buckets()")
+        #assert rados_objects_total == count_object_parts_in_all_buckets(True, dedup_stats.size_before_dedup)
+        assert rados_objects_total == count_object_parts_in_all_buckets(False, 0)
         assert (s3_object_count == s3_objects_total)
 
-    expected_results=(expected_rados_obj_count_post_dedup, expcted_space_post_dedup)
     return (expected_rados_obj_count_post_dedup, dedup_stats, s3_objects_total)
 
+#-------------------------------------------------------------------------------
+def check_if_any_obj_exists(bucket_name, delete_list, conn):
+    # Scan the full bucket listing (paged via list_objects_v2 continuation
+    # tokens) and log any key from delete_list that is still present,
+    # i.e. a delete that silently failed.
+    # NOTE(review): findings are only logged, never asserted or returned -
+    # confirm this is intentional for the verification step.
+    delete_set = set(delete_list)
+    max_keys=1000
+    continuation_token = None
+
+    while True:
+        list_args = {
+            'Bucket': bucket_name,
+            'MaxKeys': max_keys
+        }
+        if continuation_token:
+            list_args['ContinuationToken'] = continuation_token
+
+        listing=conn.list_objects_v2(**list_args)
+        if 'Contents' in listing:
+            for obj in listing['Contents']:
+                key=obj['Key']
+                log.debug("check_if_any_obj_exists: key=%s", key)
+                if obj['Key'] in delete_set:
+                    log.info("key <%s> was found in bucket", key)
+
+        if 'NextContinuationToken' in listing:
+            continuation_token = listing['NextContinuationToken']
+            log.debug("check_if_any_obj_exists: Token=%s", continuation_token)
+        else:
+            break
+
+
+#-------------------------------------------------------------------------------
+def delete_objects_multi(conns, bucket_names, ten_id, object_keys):
+    # Delete object_keys from the bucket/connection that belongs to tenant
+    # ten_id (parallel arrays conns/bucket_names are indexed by tenant id).
+    conn = conns[ten_id]
+    bucket_name = bucket_names[ten_id]
+    delete_objects(conn, bucket_name, object_keys)
 
 #-------------------------------------------------------------------------------
-def verify_objects(bucket_name, files, conn, expected_results, config):
+def delete_dup_objects_multi(files, conns, bucket_names):
+    # Delete every duplicate copy (index >= 1) of every file, routing each
+    # delete to the tenant that owns that copy (copy i lives in tenant
+    # i % max_tenants), then run GC so the rados object count is predictable.
+    max_tenants=len(conns)
+    # one pending-delete batch per tenant
+    tenants_obj_lists = [[] for _ in range(max_tenants)]
+
+    for f in files:
+        filename=f[0]
+        num_copies=f[2]
+        if num_copies > 1:
+            # keep index 0 (the original); delete only the duplicates
+            start_idx=1
+            for i in range(start_idx, num_copies):
+                key = gen_object_name(filename, i)
+                log.debug("delete_dup_objects_multi: delete key::%s::", key);
+                ten_id = i % max_tenants
+                object_keys = tenants_obj_lists[ten_id]
+                object_keys.append(key)
+                # flush delete request after every 500 objects
+                if len(object_keys) >= 500:
+                    delete_objects_multi(conns, bucket_names, ten_id, object_keys)
+                    object_keys.clear()
+
+    # remove leftover objects
+    for ten_id in range(max_tenants):
+        object_keys = tenants_obj_lists[ten_id]
+        if len(object_keys):
+            delete_objects_multi(conns, bucket_names, ten_id, object_keys)
+
+    # must call garbage collection for predictable count
+    result = admin(['gc', 'process', '--include-all'])
+    assert result[1] == 0
+
+
+#-------------------------------------------------------------------------------
+def delete_dup_objects(bucket_name, files, conn):
+    # Delete every duplicate copy (index >= 1) of every file from bucket_name
+    # in batches, verify none of the deleted keys remain in the bucket, then
+    # run GC so the subsequent rados object count is predictable.
+    delete_list_total=[]
+    object_keys=[]
+
+    for f in files:
+        filename=f[0]
+        num_copies=f[2]
+        if num_copies > 1:
+            # keep index 0 (the original); delete only the duplicates
+            start_idx=1
+            for i in range(start_idx, num_copies):
+                key = gen_object_name(filename, i)
+                log.debug("delete key::%s::", key);
+                delete_list_total.append(key)
+                object_keys.append(key)
+
+            # flush delete request after every 500 files
+            # NOTE(review): this flush check runs once per file, not per key,
+            # so one file with many copies can grow a batch well past 500 -
+            # confirm it cannot exceed the S3 1000-keys-per-request limit.
+            if len(object_keys) >= 500:
+                delete_objects(conn, bucket_name, object_keys)
+                object_keys.clear()
+
+
+    # remove leftover objects
+    if len(object_keys):
+        delete_objects(conn, bucket_name, object_keys)
+
+    verify=True
+    if verify:
+        log.debug("delete_dup_objects: verify delete_list_total")
+        check_if_any_obj_exists(bucket_name, delete_list_total, conn)
+
+    # must call garbage collection for predictable count
+    result = admin(['gc', 'process', '--include-all'])
+    assert result[1] == 0
+
+
+#-------------------------------------------------------------------------------
+def verify_objects(bucket_name, files, conn, expected_results, config, delete_dups):
+    if expected_results:
+        assert expected_results == count_object_parts_in_all_buckets(True)
+
     tmpfile = OUT_DIR + "temp"
     for f in files:
         filename=f[0]
         obj_size=f[1]
         num_copies=f[2]
         log.debug("comparing file=%s, size=%d, copies=%d", filename, obj_size, num_copies)
-        for i in range(0, num_copies):
+
+        ### first verify duplicates at index 1 and higher
+        for i in range(1, num_copies):
             filecmp.clear_cache()
             key = gen_object_name(filename, i)
             conn.download_file(bucket_name, key, tmpfile, Config=config)
@@ -829,12 +1126,28 @@ def verify_objects(bucket_name, files, conn, expected_results, config):
             assert equal ,"Files %s and %s differ!!" % (key, tmpfile)
             os.remove(tmpfile)
 
-    log.debug("verify_objects: finished reading all objects")
-    assert expected_results == count_object_parts_in_all_buckets(True)
+    ### Then delete all duplicates
+    if delete_dups:
+        delete_dup_objects(bucket_name, files, conn)
+
+    ### Last, verify the object at index zero making sure refcount works
+    filecmp.clear_cache()
+    i = 0
+    for f in files:
+        filename=f[0]
+        key = gen_object_name(filename, i)
+        conn.download_file(bucket_name, key, tmpfile, Config=config)
+        equal = filecmp.cmp(tmpfile, OUT_DIR + filename, shallow=False)
+        assert equal ,"Files %s and %s differ!!" % (key, tmpfile)
+        os.remove(tmpfile)
+
     log.debug("verify_objects::completed successfully!!")
 
 #-------------------------------------------------------------------------------
-def verify_objects_multi(files, conns, bucket_names, expected_results, config):
+def verify_objects_multi(files, conns, bucket_names, expected_results, config, delete_dups):
+    if expected_results:
+        assert expected_results == count_object_parts_in_all_buckets(True)
+
     max_tenants=len(conns)
     tmpfile = OUT_DIR + "temp"
     for f in files:
@@ -842,18 +1155,37 @@ def verify_objects_multi(files, conns, bucket_names, expected_results, config):
         obj_size=f[1]
         num_copies=f[2]
         log.debug("comparing file=%s, size=%d, copies=%d", filename, obj_size, num_copies)
-        for i in range(0, num_copies):
+        ### first verify duplicates at index 1 and higher
+        for i in range(1, num_copies):
             filecmp.clear_cache()
             key = gen_object_name(filename, i)
             log.debug("comparing object %s with file %s", key, filename)
             ten_id = i % max_tenants
-            conns[ten_id].download_file(bucket_names[ten_id], key, tmpfile, Config=config)
+            conns[ten_id].download_file(bucket_names[ten_id], key, tmpfile,
+                                        Config=config)
             equal = filecmp.cmp(tmpfile, OUT_DIR + filename, shallow=False)
             assert equal ,"Files %s and %s differ!!" % (key, tmpfile)
             os.remove(tmpfile)
 
-    assert expected_results == count_object_parts_in_all_buckets(True)
-    log.debug("verify_objects::completed successfully!!")
+    ### Then delete all duplicates
+    if delete_dups:
+        delete_dup_objects_multi(files, conns, bucket_names)
+
+    ### Last, verify the object at index zero making sure refcount works
+    filecmp.clear_cache()
+    i = 0
+    for f in files:
+        filename=f[0]
+        key = gen_object_name(filename, i)
+        log.debug("comparing object %s with file %s", key, filename)
+        ten_id = i % max_tenants
+        conns[ten_id].download_file(bucket_names[ten_id], key, tmpfile,
+                                    Config=config)
+        equal = filecmp.cmp(tmpfile, OUT_DIR + filename, shallow=False)
+        assert equal ,"Files %s and %s differ!!" % (key, tmpfile)
+        os.remove(tmpfile)
+
+    log.debug("verify_objects_multi::completed successfully!!")
 
 
 #-------------------------------------------------------------------------------
@@ -893,7 +1225,7 @@ def threads_verify_objects(files, conns, bucket_names, expected_results, config)
         thread_list[idx].join()
 
     assert expected_results == count_object_parts_in_all_buckets(True)
-    log.debug("verify_objects::completed successfully!!")
+    log.debug("threads_verify_objects::completed successfully!!")
 
 
 #-------------------------------------------------------------------------------
@@ -903,9 +1235,12 @@ def get_stats_line_val(line):
 
 #-------------------------------------------------------------------------------
 def print_dedup_stats(dedup_stats):
+    log.info("===============================================")
+
     for key in dedup_stats.__dict__:
-        log.warning("dedup_stats[%s] = %d", key, dedup_stats.__dict__[key])
+        log.info("dedup_stats[%s] = %d", key, dedup_stats.__dict__[key])
 
+    log.info("===============================================")
 
 #-------------------------------------------------------------------------------
 def print_dedup_stats_diff(actual, expected):
@@ -992,8 +1327,14 @@ def verify_dedup_ratio(expected_dedup_stats, dedup_ratio):
     else:
         ratio = 0
 
+
+    log.debug("skip_too_small_bytes = %d", expected_dedup_stats.skip_too_small_bytes)
+    if expected_dedup_stats.non_default_storage_class_objs_bytes:
+        log.debug("non_default_storage_class_objs_bytes= %d",
+                 expected_dedup_stats.non_default_storage_class_objs_bytes)
+
     log.debug("s3_bytes_before = %d/%d", s3_bytes_before, dedup_ratio.s3_bytes_before)
-    log.debug("s3_dedup_bytes = %d", expected_dedup_stats.dedup_bytes_estimate);
+    log.debug("s3_dedup_bytes = %d", s3_dedup_bytes);
     log.debug("s3_bytes_after = %d/%d", s3_bytes_after, dedup_ratio.s3_bytes_after)
     log.debug("ratio = %f/%f", ratio, dedup_ratio.ratio)
 
@@ -1098,7 +1439,7 @@ def exec_dedup_internal(expected_dedup_stats, dry_run, max_dedup_time):
             set_bucket_index_throttling(limit)
 
 #-------------------------------------------------------------------------------
-def exec_dedup(expected_dedup_stats, dry_run, verify_stats=True):
+def exec_dedup(expected_dedup_stats, dry_run, verify_stats=True, post_dedup_size=0):
     # dedup should complete in less than 5 minutes
     max_dedup_time = 5*60
     if expected_dedup_stats.deduped_obj > 10000:
@@ -1113,7 +1454,16 @@ def exec_dedup(expected_dedup_stats, dry_run, verify_stats=True):
     dedup_stats = ret[1]
     dedup_ratio_estimate = ret[2]
     dedup_ratio_actual = ret[3]
+    log.debug("exec_dedup: verify_dedup_ratio")
+    verify_dedup_ratio(expected_dedup_stats, dedup_ratio_estimate)
+    if post_dedup_size == 0:
+        post_dedup_size = dedup_ratio_estimate.s3_bytes_after
+
+    # no need to check after dry-run which doesn't change anything
+    if dry_run:
+        post_dedup_size = 0
 
+    count_object_parts_in_all_buckets(True, post_dedup_size)
     if verify_stats == False:
         return ret
 
@@ -1121,6 +1471,7 @@ def exec_dedup(expected_dedup_stats, dry_run, verify_stats=True):
         log.debug("potential_unique_obj= %d / %d ", dedup_stats.potential_unique_obj,
                   expected_dedup_stats.potential_unique_obj)
 
+
     #dedup_stats.set_hash = dedup_stats.invalid_hash
     if dedup_stats != expected_dedup_stats:
         log.debug("==================================================")
@@ -1129,16 +1480,14 @@ def exec_dedup(expected_dedup_stats, dry_run, verify_stats=True):
         log.debug("==================================================\n")
         assert dedup_stats == expected_dedup_stats
 
-    verify_dedup_ratio(expected_dedup_stats, dedup_ratio_estimate)
     log.debug("expcted_dedup::stats check completed successfully!!")
     return ret
 
-
 #-------------------------------------------------------------------------------
 def prepare_test():
     cleanup_local()
     #make sure we are starting with all buckets empty
-    if count_object_parts_in_all_buckets() != 0:
+    if count_object_parts_in_all_buckets(False, 0) != 0:
         log.warning("The system was left dirty from previous run");
         log.warning("Make sure to remove all objects before starting");
         assert(0)
@@ -1163,15 +1512,16 @@ def small_single_part_objs_dedup(conn, bucket_name, dry_run):
     prepare_test()
     try:
         files=[]
-        num_files = 8
-        base_size = 4*KB
+        num_files = 5
+        base_size = 1*KB
         log.debug("generate files: base size=%d KiB, max_size=%d KiB",
                   base_size/KB, (pow(2, num_files) * base_size)/KB)
         gen_files(files, base_size, num_files)
         bucket = conn.create_bucket(Bucket=bucket_name)
         log.debug("upload objects to bucket <%s> ...", bucket_name)
         indices = [0] * len(files)
-        ret = upload_objects(bucket_name, files, indices, conn, default_config)
+        check_obj_count=True
+        ret = upload_objects(bucket_name, files, indices, conn, default_config, check_obj_count)
         expected_results = ret[0]
         dedup_stats = ret[1]
         s3_objects_total = ret[2]
@@ -1183,13 +1533,11 @@ def small_single_part_objs_dedup(conn, bucket_name, dry_run):
         small_objs_dedup_stats.size_before_dedup = dedup_stats.size_before_dedup
         small_objs_dedup_stats.skip_too_small_bytes=dedup_stats.size_before_dedup
         small_objs_dedup_stats.skip_too_small = s3_objects_total
-        assert small_objs_dedup_stats == dedup_stats
 
         exec_dedup(dedup_stats, dry_run)
         if dry_run == False:
             log.debug("Verify all objects")
-            verify_objects(bucket_name, files, conn, expected_results, default_config)
-
+            verify_objects(bucket_name, files, conn, expected_results, default_config, True)
     finally:
         # cleanup must be executed even after a failure
         cleanup(bucket_name, conn)
@@ -1221,21 +1569,22 @@ def simple_dedup(conn, files, bucket_name, run_cleanup_after, config, dry_run):
         bucket = conn.create_bucket(Bucket=bucket_name)
         indices = [0] * len(files)
         log.debug("upload objects to bucket <%s> ...", bucket_name)
-        ret = upload_objects(bucket_name, files, indices, conn, config)
+        check_obj_count=True
+        ret = upload_objects(bucket_name, files, indices, conn, config, check_obj_count)
         expected_results = ret[0]
         dedup_stats = ret[1]
         log.info("%d S3 objects were uploaded", ret[2])
         exec_dedup(dedup_stats, dry_run)
         if dry_run == False:
             log.debug("Verify all objects")
-            verify_objects(bucket_name, files, conn, expected_results, config)
-
-        return ret
+            verify_objects(bucket_name, files, conn, expected_results, config, run_cleanup_after)
     finally:
         if run_cleanup_after:
             # cleanup must be executed even after a failure
             cleanup(bucket_name, conn)
 
+        return ret
+
 
 #-------------------------------------------------------------------------------
 def simple_dedup_with_tenants(files, conns, bucket_names, config, dry_run=False):
@@ -1246,7 +1595,8 @@ def simple_dedup_with_tenants(files, conns, bucket_names, config, dry_run=False)
     exec_dedup(dedup_stats, dry_run)
     if dry_run == False:
         log.debug("Verify all objects")
-        verify_objects_multi(files, conns, bucket_names, expected_results, config)
+        verify_objects_multi(files, conns, bucket_names, expected_results, config,
+                             False)
 
     return ret
 
@@ -1267,19 +1617,18 @@ def dedup_basic_with_tenants_common(files, max_copies_count, config, dry_run):
 #-------------------------------------------------------------------------------
 def threads_simple_dedup_with_tenants(files, conns, bucket_names, config, dry_run=False):
     indices=[0] * len(files)
-
     start = time.time_ns()
-    upload_ret=procs_upload_objects(files, conns, bucket_names, indices, config)
+    check_obj_count=True
+    upload_ret=procs_upload_objects(files, conns, bucket_names, indices, config, check_obj_count)
     upload_time_sec = (time.time_ns() - start) / (1000*1000*1000)
     expected_results = upload_ret[0]
     dedup_stats = upload_ret[1]
     s3_objects_total = upload_ret[2]
-
     exec_ret=exec_dedup(dedup_stats, dry_run)
     exec_time_sec=exec_ret[0]
     verify_time_sec=0
     if dry_run == False:
-        log.debug("Verify all objects")
+        log.debug("threads_simple_dedup_with_tenants::Verify all objects")
         start = time.time_ns()
         threads_verify_objects(files, conns, bucket_names,
                                expected_results, config)
@@ -1578,6 +1927,7 @@ CORRUPTIONS = ("no corruption", "change_etag", "illegal_hex_value",
                "change_num_parts", "illegal_separator",
                "illegal_dec_val_num_parts", "illegal_num_parts_overflow")
 
+
 #------------------------------------------------------------------------------
 def change_object_etag(rados_name, new_etag):
     result = rados(['-p ', POOLNAME, 'setxattr', rados_name, ETAG_ATTR, new_etag])
@@ -1646,7 +1996,7 @@ def corrupt_etag(key, corruption, expected_dedup_stats):
     new_etag=gen_new_etag(old_etag, corruption, expected_dedup_stats)
 
     log.debug("Corruption:: %s\nold_etag=%s\nnew_etag=%s",
-             corruption, old_etag, new_etag)
+              corruption, old_etag, new_etag)
     change_object_etag(rados_name, new_etag)
     return (rados_name, old_etag)
 
@@ -1670,7 +2020,8 @@ def test_dedup_etag_corruption():
 
         bucket = conn.create_bucket(Bucket=bucket_name)
         indices = [0] * len(files)
-        ret = upload_objects(bucket_name, files, indices, conn, default_config)
+        check_obj_count=True
+        ret = upload_objects(bucket_name, files, indices, conn, default_config, check_obj_count)
         expected_results = ret[0]
         expected_dedup_stats = ret[1]
         s3_objects_total = ret[2]
@@ -1751,7 +2102,8 @@ def test_md5_collisions():
         conn=get_single_connection()
         bucket = conn.create_bucket(Bucket=bucket_name)
         indices = [0] * len(files)
-        upload_objects(bucket_name, files, indices, conn, config2)
+        check_obj_count=True
+        upload_objects(bucket_name, files, indices, conn, config2, check_obj_count)
 
         dedup_stats = Dedup_Stats()
         # we wrote 2 different small objects (BLOCK_SIZE) with the same md5
@@ -1770,33 +2122,178 @@ def test_md5_collisions():
         dedup_stats.set_hash=dedup_stats.total_processed_objects
         dedup_stats.hash_mismatch=1
         s3_bytes_before=dedup_stats.size_before_dedup
-        expected_ratio_actual=Dedup_Ratio()
-        expected_ratio_actual.s3_bytes_before=s3_bytes_before
-        expected_ratio_actual.s3_bytes_after=s3_bytes_before
-        expected_ratio_actual.ratio=0
+        expected_ratio=Dedup_Ratio()
+        expected_ratio.s3_bytes_before=s3_bytes_before
+        expected_ratio.s3_bytes_after=s3_bytes_before
+        expected_ratio.ratio=0
 
         dry_run=False
         log.debug("test_md5_collisions: first call to exec_dedup")
-        ret=exec_dedup(dedup_stats, dry_run)
+        ret=exec_dedup(dedup_stats, dry_run, True, 2*BLOCK_SIZE)
         dedup_ratio_actual=ret[3]
-
-        assert expected_ratio_actual == dedup_ratio_actual
+        assert expected_ratio == dedup_ratio_actual
 
         dedup_stats.valid_hash=dedup_stats.total_processed_objects
         dedup_stats.invalid_hash=0
         dedup_stats.set_hash=0
 
         log.debug("test_md5_collisions: second call to exec_dedup")
-        ret=exec_dedup(dedup_stats, dry_run)
+        ret=exec_dedup(dedup_stats, dry_run, True, 2*BLOCK_SIZE)
         dedup_ratio_actual=ret[3]
 
-        assert expected_ratio_actual == dedup_ratio_actual
+        assert expected_ratio == dedup_ratio_actual
 
     finally:
         # cleanup must be executed even after a failure
         cleanup(bucket_name, conn)
 
 
+#-------------------------------------------------------------------------------
+def loop_dedup_split_head_with_tenants():
+    prepare_test()
+    config=default_config
+    success=False
+    max_copies_count=4
+    files=[]
+    num_files=11 # [16KB-32MB]
+    base_size = 16*KB
+    log.debug("generate files: base size=%d KiB, max_size=%d KiB",
+              base_size/KB, (pow(2, num_files) * base_size)/KB)
+    try:
+        gen_files(files, base_size, num_files, max_copies_count)
+        indices=[0] * len(files)
+        ret=gen_connections_multi2(max_copies_count)
+        #tenants=ret[0]
+        bucket_names=ret[1]
+        conns=ret[2]
+
+        ret=upload_objects_multi(files, conns, bucket_names, indices, config, True)
+        expected_results = ret[0]
+        dedup_stats = ret[1]
+
+        dry_run=False
+        exec_dedup(dedup_stats, dry_run, True)
+        log.debug("Verify all objects")
+        verify_objects_multi(files, conns, bucket_names, expected_results, config,
+                             True)
+        success=True
+    finally:
+        cleanup_all_buckets(bucket_names, conns)
+        if not success:
+            print_files(files, config)
+
+
+#-------------------------------------------------------------------------------
+@pytest.mark.basic_test
+def test_dedup_split_head_with_tenants():
+    #return
+
+    if full_dedup_is_disabled():
+        return
+
+    for idx in range(0, 9):
+        log.debug("test_dedup_split_head_with_tenants: loop #%d", idx);
+        loop_dedup_split_head_with_tenants()
+
+
+#-------------------------------------------------------------------------------
+def loop_dedup_split_head():
+    prepare_test()
+    #bucket_name = gen_bucket_name()
+    bucket_name = "bucket1"
+    config=default_config
+    max_copies_count=4
+    files=[]
+    num_files=11 # [16KB-32MB]
+    base_size = 16*KB
+    log.debug("generate files: base size=%d KiB, max_size=%d KiB",
+              base_size/KB, (pow(2, num_files) * base_size)/KB)
+    try:
+        gen_files(files, base_size, num_files, max_copies_count)
+        indices=[0] * len(files)
+
+        conn=get_single_connection()
+        conn.create_bucket(Bucket=bucket_name)
+        check_obj_count=True
+        ret=upload_objects(bucket_name, files, indices, conn, config, check_obj_count)
+        expected_results = ret[0]
+        dedup_stats = ret[1]
+        dry_run=False
+        exec_dedup(dedup_stats, dry_run, True)
+        verify_objects(bucket_name, files, conn, expected_results, config, True)
+    finally:
+        cleanup(bucket_name, conn)
+
+
+#-------------------------------------------------------------------------------
+@pytest.mark.basic_test
+def test_dedup_split_head():
+    #return
+
+    if full_dedup_is_disabled():
+        return
+
+    for idx in range(0, 9):
+        log.debug("test_dedup_split_head: loop #%d", idx);
+        loop_dedup_split_head()
+
+#-------------------------------------------------------------------------------
+def dedup_copy_internal(multi_buckets):
+    if full_dedup_is_disabled():
+        return
+
+    prepare_test()
+    bucket_names=[]
+    config=default_config
+    max_copies_count=4
+    files=[]
+    num_files=6 # [5MB-512MB]
+    base_size=5*MB
+    log.debug("generate files: base size=%d KiB, max_size=%d KiB",
+              base_size/KB, (pow(2, num_files) * base_size)/KB)
+    conn=get_single_connection()
+    try:
+        gen_files(files, base_size, num_files, max_copies_count)
+        indices=[0] * len(files)
+        if multi_buckets:
+            bucket_names=create_buckets(conn, max_copies_count)
+        else:
+            bucket_name = "bucket1"
+            conn.create_bucket(Bucket=bucket_name)
+            bucket_names=[bucket_name] * max_copies_count
+
+        ret=upload_objects_with_copy(files, conn, bucket_names, indices, config)
+        expected_results = ret[0]
+        dedup_stats = ret[1]
+        dry_run=False
+        max_dedup_time = 5*60
+        exec_dedup_internal(dedup_stats, dry_run, max_dedup_time)
+
+        assert expected_results == count_object_parts_in_all_buckets(True, dedup_stats.size_before_dedup)
+        expected_results=0  # skip object_parts verification
+        conns=[conn] * len(bucket_names)
+        verify_objects_multi(files, conns, bucket_names, expected_results, config, True)
+    finally:
+        # cleanup must be executed even after a failure
+        if multi_buckets:
+            for bucket_name in bucket_names:
+                cleanup(bucket_name, conn)
+        else:
+            cleanup(bucket_names[0], conn)
+
+
+#-------------------------------------------------------------------------------
+@pytest.mark.basic_test
+def test_dedup_copy():
+    #return
+    dedup_copy_internal(False)
+
+#-------------------------------------------------------------------------------
+@pytest.mark.basic_test
+def test_dedup_copy_multi_buckets():
+    #return
+    dedup_copy_internal(True)
+
 #-------------------------------------------------------------------------------
 @pytest.mark.basic_test
 def test_dedup_small():
@@ -1822,8 +2319,8 @@ def test_dedup_small_with_tenants():
     prepare_test()
     max_copies_count=3
     files=[]
-    num_files=10 # [4KB-4MB]
-    base_size = 4*KB
+    num_files=5 # [1KB-32KB]
+    base_size = 1*KB
     log.debug("generate files: base size=%d KiB, max_size=%d KiB",
              base_size/KB, (pow(2, num_files) * base_size)/KB)
     try:
@@ -1851,7 +2348,8 @@ def test_dedup_small_with_tenants():
         dry_run=False
         exec_dedup(dedup_stats, dry_run)
         log.debug("Verify all objects")
-        verify_objects_multi(files, conns, bucket_names, expected_results, default_config)
+        verify_objects_multi(files, conns, bucket_names, expected_results,
+                             default_config, True)
     finally:
         # cleanup must be executed even after a failure
         cleanup_all_buckets(bucket_names, conns)
@@ -1894,7 +2392,7 @@ def test_dedup_inc_0_with_tenants():
         dedup_stats2.set_shared_manifest_src=0
         dedup_stats2.deduped_obj=0
         dedup_stats2.deduped_obj_bytes=0
-        dedup_stats2.valid_hash=dedup_stats.invalid_hash
+        dedup_stats2.valid_hash=dedup_stats.unique_obj
         dedup_stats2.invalid_hash=0
         dedup_stats2.set_hash=0
 
@@ -1902,7 +2400,8 @@ def test_dedup_inc_0_with_tenants():
         # run dedup again and make sure nothing has changed
         dry_run=False
         exec_dedup(dedup_stats2, dry_run)
-        verify_objects_multi(files, conns, bucket_names, expected_results, config)
+        verify_objects_multi(files, conns, bucket_names, expected_results,
+                             config, True)
     finally:
         # cleanup must be executed even after a failure
         cleanup_all_buckets(bucket_names, conns)
@@ -1931,7 +2430,9 @@ def test_dedup_inc_0():
         num_files = 11
         gen_files_in_range(files, num_files, 1*MB, 64*MB)
         # upload objects, dedup, verify, but don't cleanup
-        ret = simple_dedup(conn, files, bucket_name, False, config, False)
+        run_cleanup_after=False
+        dry_run=False
+        ret = simple_dedup(conn, files, bucket_name, run_cleanup_after, config, dry_run)
         expected_results = ret[0]
         dedup_stats = ret[1]
         s3_objects_total = ret[2]
@@ -1942,7 +2443,7 @@ def test_dedup_inc_0():
         dedup_stats2.set_shared_manifest_src=0
         dedup_stats2.deduped_obj=0
         dedup_stats2.deduped_obj_bytes=0
-        dedup_stats2.valid_hash=dedup_stats.invalid_hash
+        dedup_stats2.valid_hash=dedup_stats.unique_obj
         dedup_stats2.invalid_hash=0
         dedup_stats2.set_hash=0
 
@@ -1950,7 +2451,7 @@ def test_dedup_inc_0():
         # run dedup again and make sure nothing has changed
         dry_run=False
         exec_dedup(dedup_stats2, dry_run)
-        verify_objects(bucket_name, files, conn, expected_results, config)
+        verify_objects(bucket_name, files, conn, expected_results, config, True)
     finally:
         # cleanup must be executed even after a failure
         cleanup(bucket_name, conn)
@@ -2002,6 +2503,7 @@ def test_dedup_inc_1_with_tenants():
         ret=upload_objects_multi(files_combined, conns, bucket_names, indices, config, False)
         expected_results=ret[0]
         stats_combined=ret[1]
+
         stats_combined.skip_shared_manifest = stats_base.deduped_obj
         stats_combined.skip_src_record     -= stats_base.skip_src_record
         stats_combined.skip_src_record     += stats_base.set_shared_manifest_src
@@ -2010,15 +2512,16 @@ def test_dedup_inc_1_with_tenants():
         stats_combined.deduped_obj         -= stats_base.deduped_obj
         stats_combined.deduped_obj_bytes   -= stats_base.deduped_obj_bytes
 
-        stats_combined.valid_hash    = stats_base.set_hash
+        stats_combined.valid_hash    = stats_base.unique_obj
         stats_combined.invalid_hash -= stats_base.set_hash
-        stats_combined.set_hash     -= stats_base.set_hash
+        stats_combined.set_hash      = stats_combined.set_shared_manifest_src + stats_combined.deduped_obj
 
         log.debug("test_dedup_inc_1_with_tenants: incremental dedup:")
         # run dedup again
         dry_run=False
         exec_dedup(stats_combined, dry_run)
-        verify_objects_multi(files_combined, conns, bucket_names, expected_results, config)
+        verify_objects_multi(files_combined, conns, bucket_names, expected_results,
+                             config, True)
     finally:
         # cleanup must be executed even after a failure
         cleanup_all_buckets(bucket_names, conns)
@@ -2063,7 +2566,8 @@ def test_dedup_inc_1():
             num_copies_combined=num_copies_to_add+num_copies_base
             files_combined.append((filename, obj_size, num_copies_combined))
 
-        ret=upload_objects(bucket_name, files_combined, indices, conn, config, False)
+        check_obj_count=False
+        ret=upload_objects(bucket_name, files_combined, indices, conn, config, check_obj_count)
         expected_results = ret[0]
         stats_combined = ret[1]
         stats_combined.skip_shared_manifest = stats_base.deduped_obj
@@ -2074,15 +2578,15 @@ def test_dedup_inc_1():
         stats_combined.deduped_obj         -= stats_base.deduped_obj
         stats_combined.deduped_obj_bytes   -= stats_base.deduped_obj_bytes
 
-        stats_combined.valid_hash    = stats_base.set_hash
+        stats_combined.valid_hash    = stats_base.unique_obj
         stats_combined.invalid_hash -= stats_base.set_hash
-        stats_combined.set_hash     -= stats_base.set_hash
+        stats_combined.set_hash      = stats_combined.set_shared_manifest_src + stats_combined.deduped_obj
 
         log.debug("test_dedup_inc_1: incremental dedup:")
         # run dedup again
         dry_run=False
         exec_dedup(stats_combined, dry_run)
-        verify_objects(bucket_name, files_combined, conn, expected_results, config)
+        verify_objects(bucket_name, files_combined, conn, expected_results, config, True)
     finally:
         # cleanup must be executed even after a failure
         cleanup(bucket_name, conn)
@@ -2151,15 +2655,16 @@ def test_dedup_inc_2_with_tenants():
         stats_combined.deduped_obj         -= stats_base.deduped_obj
         stats_combined.deduped_obj_bytes   -= stats_base.deduped_obj_bytes
 
-        stats_combined.valid_hash    = stats_base.set_hash
+        stats_combined.valid_hash    = stats_base.unique_obj
         stats_combined.invalid_hash -= stats_base.set_hash
-        stats_combined.set_hash     -= stats_base.set_hash
+        stats_combined.set_hash      = stats_combined.set_shared_manifest_src + stats_combined.deduped_obj
 
         log.debug("test_dedup_inc_2_with_tenants: incremental dedup:")
         # run dedup again
         dry_run=False
         exec_dedup(stats_combined, dry_run)
-        verify_objects_multi(files_combined, conns, bucket_names, expected_results, config)
+        verify_objects_multi(files_combined, conns, bucket_names, expected_results,
+                             config, True)
     finally:
         # cleanup must be executed even after a failure
         cleanup_all_buckets(bucket_names, conns)
@@ -2212,7 +2717,8 @@ def test_dedup_inc_2():
             indices.append(0)
 
         assert(len(indices) == len(files_combined))
-        ret=upload_objects(bucket_name, files_combined, indices, conn, config, False)
+        check_obj_count=False
+        ret=upload_objects(bucket_name, files_combined, indices, conn, config, check_obj_count)
         expected_results = ret[0]
         stats_combined = ret[1]
         stats_combined.skip_shared_manifest = stats_base.deduped_obj
@@ -2223,16 +2729,16 @@ def test_dedup_inc_2():
         stats_combined.deduped_obj         -= stats_base.deduped_obj
         stats_combined.deduped_obj_bytes   -= stats_base.deduped_obj_bytes
 
-        stats_combined.valid_hash    = stats_base.set_hash
+        stats_combined.valid_hash    = stats_base.unique_obj
         stats_combined.invalid_hash -= stats_base.set_hash
-        stats_combined.set_hash     -= stats_base.set_hash
+        stats_combined.set_hash      = stats_combined.set_shared_manifest_src + stats_combined.deduped_obj
 
         log.debug("test_dedup_inc_2: incremental dedup:")
         # run dedup again
         dry_run=False
         exec_dedup(stats_combined, dry_run)
         verify_objects(bucket_name, files_combined, conn, expected_results,
-                       config)
+                       config, True)
     finally:
         # cleanup must be executed even after a failure
         cleanup(bucket_name, conn)
@@ -2246,7 +2752,6 @@ def test_dedup_inc_2():
 @pytest.mark.basic_test
 def test_dedup_inc_with_remove_multi_tenants():
     #return
-
     if full_dedup_is_disabled():
         return
 
@@ -2259,6 +2764,9 @@ def test_dedup_inc_with_remove_multi_tenants():
     bucket_names=ret[1]
     conns=ret[2]
     try:
+        split_heads_count=0
+        split_heads_removed=0
+        split_heads=[]
         files=[]
         num_files = 17
         # gen_files_in_range creates 2-3 copies
@@ -2268,14 +2776,23 @@ def test_dedup_inc_with_remove_multi_tenants():
         expected_results_base = ret[0]
         stats_base = ret[1]
 
+        ### find which objects got split head before remove
+        for f in files:
+            obj_size=f[1]
+            num_copies=f[2]
+            split_head = calc_split_objs_count(obj_size, num_copies, config)
+            split_heads.append(split_head)
+            if split_head:
+                split_heads_count += 1
+
         # REMOVE some objects and update stats/expected
         src_record=0
         shared_manifest=0
-        valid_sha=0
+        valid_hash=0
         object_keys=[]
         files_sub=[]
         dedup_stats = Dedup_Stats()
-        for f in files:
+        for idx, f in enumerate(files):
             filename=f[0]
             obj_size=f[1]
             num_copies=f[2]
@@ -2283,13 +2800,18 @@ def test_dedup_inc_with_remove_multi_tenants():
             num_copies_2=num_copies-num_remove
             log.debug("objects::%s::size=%d, num_copies=%d", filename, obj_size, num_copies_2);
             if num_copies_2:
-                if num_copies_2 > 1 and obj_size > RADOS_OBJ_SIZE:
-                    valid_sha += num_copies_2
+                split_head = calc_split_objs_count(obj_size, num_copies_2, config)
+                if num_copies_2 > 1 and (obj_size > RADOS_OBJ_SIZE or split_head):
+                    valid_hash += 1
                     src_record += 1
                     shared_manifest += (num_copies_2 - 1)
 
                 files_sub.append((filename, obj_size, num_copies_2))
                 calc_expected_stats(dedup_stats, obj_size, num_copies_2, config)
+            elif split_heads[idx]:
+                # we removed all copies of a split-head object
+                split_heads_count -= 1
+                split_heads_removed += 1
 
             start_idx=num_copies_2
             for i in range(start_idx, num_copies):
@@ -2305,9 +2827,10 @@ def test_dedup_inc_with_remove_multi_tenants():
         dedup_stats.set_shared_manifest_src=0
         dedup_stats.deduped_obj=0
         dedup_stats.deduped_obj_bytes=0
+
         dedup_stats.skip_src_record=src_record
         dedup_stats.skip_shared_manifest=shared_manifest
-        dedup_stats.valid_hash=valid_sha
+        dedup_stats.valid_hash=valid_hash
         dedup_stats.invalid_hash=0
         dedup_stats.set_hash=0
 
@@ -2315,7 +2838,9 @@ def test_dedup_inc_with_remove_multi_tenants():
         dry_run=False
         exec_dedup(dedup_stats, dry_run)
         expected_results=calc_expected_results(files_sub, config)
-        verify_objects_multi(files_sub, conns, bucket_names, expected_results, config)
+        expected_results += split_heads_count
+        verify_objects_multi(files_sub, conns, bucket_names, expected_results,
+                             config, True)
     finally:
         # cleanup must be executed even after a failure
         cleanup_all_buckets(bucket_names, conns)
@@ -2329,7 +2854,6 @@ def test_dedup_inc_with_remove_multi_tenants():
 @pytest.mark.basic_test
 def test_dedup_inc_with_remove():
     #return
-
     if full_dedup_is_disabled():
         return
 
@@ -2339,6 +2863,9 @@ def test_dedup_inc_with_remove():
     log.debug("test_dedup_inc_with_remove: connect to AWS ...")
     conn=get_single_connection()
     try:
+        split_heads_count=0
+        split_heads_removed=0
+        split_heads=[]
         files=[]
         num_files = 17
         gen_files_in_range(files, num_files, 1*MB, 64*MB)
@@ -2347,28 +2874,41 @@ def test_dedup_inc_with_remove():
         expected_results_base = ret[0]
         stats_base = ret[1]
 
+        ### find which objects got split head before remove
+        for f in files:
+            obj_size=f[1]
+            num_copies=f[2]
+            split_head = calc_split_objs_count(obj_size, num_copies, config)
+            split_heads.append(split_head)
+            if split_head:
+                split_heads_count += 1
+
         # REMOVE some objects and update stats/expected
         src_record=0
         shared_manifest=0
-        valid_sha=0
+        valid_hash=0
         object_keys=[]
         files_sub=[]
         dedup_stats = Dedup_Stats()
-        for f in files:
+        for idx, f in enumerate(files):
             filename=f[0]
             obj_size=f[1]
             num_copies=f[2]
             num_remove=random.randint(0, num_copies)
             num_copies_2=num_copies-num_remove
-            log.debug("objects::%s::size=%d, num_copies=%d", filename, obj_size, num_copies_2);
             if num_copies_2:
-                if num_copies_2 > 1 and obj_size > RADOS_OBJ_SIZE:
-                    valid_sha += num_copies_2
+                split_head = calc_split_objs_count(obj_size, num_copies_2, config)
+                if num_copies_2 > 1 and (obj_size > RADOS_OBJ_SIZE or split_head):
+                    valid_hash += 1
                     src_record += 1
                     shared_manifest += (num_copies_2 - 1)
 
                 files_sub.append((filename, obj_size, num_copies_2))
                 calc_expected_stats(dedup_stats, obj_size, num_copies_2, config)
+            elif split_heads[idx]:
+                # we removed all copies of a split-head object
+                split_heads_count -= 1
+                split_heads_removed += 1
 
             start_idx=num_copies_2
             for i in range(start_idx, num_copies):
@@ -2380,8 +2920,7 @@ def test_dedup_inc_with_remove():
                 log.debug("Skiping file=%s, num_remove=%d", filename, num_remove)
                 continue
 
-            response=conn.delete_objects(Bucket=bucket_name,
-                                         Delete={"Objects": [{"Key": key} for key in object_keys]})
+            delete_objects(conn, bucket_name, object_keys)
 
         # must call garbage collection for predictable count
         result = admin(['gc', 'process', '--include-all'])
@@ -2393,17 +2932,17 @@ def test_dedup_inc_with_remove():
         dedup_stats.deduped_obj_bytes=0
         dedup_stats.skip_src_record=src_record
         dedup_stats.skip_shared_manifest=shared_manifest
-        dedup_stats.valid_hash=valid_sha
+        dedup_stats.valid_hash=valid_hash
         dedup_stats.invalid_hash=0
         dedup_stats.set_hash=0
 
         log.debug("test_dedup_inc_with_remove: incremental dedup:")
         log.debug("stats_base.size_before_dedup=%d", stats_base.size_before_dedup)
-        log.debug("dedup_stats.size_before_dedup=%d", dedup_stats.size_before_dedup)
         dry_run=False
         exec_dedup(dedup_stats, dry_run)
         expected_results=calc_expected_results(files_sub, config)
-        verify_objects(bucket_name, files_sub, conn, expected_results, config)
+        expected_results += split_heads_count
+        verify_objects(bucket_name, files_sub, conn, expected_results, config, True)
     finally:
         # cleanup must be executed even after a failure
         cleanup(bucket_name, conn)
@@ -2462,7 +3001,6 @@ def test_dedup_multipart():
 
     simple_dedup(conn, files, bucket_name, True, default_config, False)
 
-
 #-------------------------------------------------------------------------------
 @pytest.mark.basic_test
 def test_dedup_basic_with_tenants():
@@ -2497,10 +3035,12 @@ def test_dedup_basic():
     num_files=5
     base_size = MULTIPART_SIZE
     log.debug("generate files: base size=%d MiB, max_size=%d MiB",
-             base_size/MB, (pow(2, num_files) * base_size)/MB)
+              base_size/MB, (pow(2, num_files) * base_size)/MB)
     gen_files(files, base_size, num_files)
     log.debug("call simple_dedup()")
-    simple_dedup(conn, files, bucket_name, True, default_config, False)
+    run_cleanup_after=True
+    dry_run=False
+    simple_dedup(conn, files, bucket_name, run_cleanup_after, default_config, dry_run)
 
 
 #-------------------------------------------------------------------------------
@@ -2552,7 +3092,7 @@ def test_dedup_small_multipart():
 #-------------------------------------------------------------------------------
 @pytest.mark.basic_test
 def test_dedup_large_scale_with_tenants():
-    return
+    #return
 
     if full_dedup_is_disabled():
         return
@@ -2572,7 +3112,7 @@ def test_dedup_large_scale_with_tenants():
 #-------------------------------------------------------------------------------
 @pytest.mark.basic_test
 def test_dedup_large_scale():
-    return
+    #return
 
     if full_dedup_is_disabled():
         return
@@ -2592,7 +3132,7 @@ def test_dedup_large_scale():
 #-------------------------------------------------------------------------------
 @pytest.mark.basic_test
 def test_empty_bucket():
-    return
+    #return
 
     if full_dedup_is_disabled():
         return
@@ -2632,7 +3172,7 @@ def inc_step_with_tenants(stats_base, files, conns, bucket_names, config):
 
     # add new files
     num_files_new = 11
-    gen_files_in_range(files_combined, num_files_new, 2*MB, 32*MB)
+    gen_files_in_range(files_combined, num_files_new, 1*MB, 32*MB)
     pad_count = len(files_combined) - len(files)
     for i in range(0, pad_count):
         indices.append(0)
@@ -2646,7 +3186,8 @@ def inc_step_with_tenants(stats_base, files, conns, bucket_names, config):
     for f in files_combined:
         obj_size=f[1]
         num_copies=f[2]
-        if num_copies > 1 and obj_size > RADOS_OBJ_SIZE:
+        split_head = calc_split_objs_count(obj_size, num_copies, config)
+        if num_copies > 1 and (obj_size > RADOS_OBJ_SIZE or split_head):
             src_record += 1
 
     stats_combined.skip_shared_manifest = stats_base.deduped_obj
@@ -2655,15 +3196,15 @@ def inc_step_with_tenants(stats_base, files, conns, bucket_names, config):
     stats_combined.deduped_obj         -= stats_base.deduped_obj
     stats_combined.deduped_obj_bytes   -= stats_base.deduped_obj_bytes
 
-    stats_combined.valid_hash    = stats_base.set_hash
+    stats_combined.valid_hash    = stats_base.unique_obj
     stats_combined.invalid_hash -= stats_base.set_hash
-    stats_combined.set_hash     -= stats_base.set_hash
-
+    stats_combined.set_hash      = stats_combined.set_shared_manifest_src + stats_combined.deduped_obj
     log.debug("test_dedup_inc_2_with_tenants: incremental dedup:")
     # run dedup again
     dry_run=False
     exec_dedup(stats_combined, dry_run)
-    verify_objects_multi(files_combined, conns, bucket_names, expected_results, config)
+    verify_objects_multi(files_combined, conns, bucket_names, expected_results,
+                         config, False)
 
     return (files_combined, stats_combined)
 
@@ -2689,7 +3230,7 @@ def test_dedup_inc_loop_with_tenants():
         files=[]
         num_files = 13
         # gen_files_in_range creates 2-3 copies
-        gen_files_in_range(files, num_files, 1*MB, 64*MB)
+        gen_files_in_range(files, num_files, 256*KB, 64*MB)
         # upload objects, dedup, verify, but don't cleanup
         ret=simple_dedup_with_tenants(files, conns, bucket_names, config)
         stats_base=ret[1]
@@ -2699,9 +3240,10 @@ def test_dedup_inc_loop_with_tenants():
             files=ret[0]
             stats_last=ret[1]
             stats_base.set_shared_manifest_src += stats_last.set_shared_manifest_src
-            stats_base.deduped_obj       += stats_last.deduped_obj
-            stats_base.deduped_obj_bytes += stats_last.deduped_obj_bytes
-            stats_base.set_hash          += stats_last.set_hash
+            stats_base.unique_obj          += stats_last.set_shared_manifest_src
+            stats_base.deduped_obj         += stats_last.deduped_obj
+            stats_base.deduped_obj_bytes   += stats_last.deduped_obj_bytes
+            stats_base.set_hash            += stats_last.set_hash
     finally:
         # cleanup must be executed even after a failure
         cleanup_all_buckets(bucket_names, conns)
@@ -2718,8 +3260,8 @@ def test_dedup_dry_small_with_tenants():
     prepare_test()
     max_copies_count=3
     files=[]
-    num_files=10 # [4KB-4MB]
-    base_size = 4*KB
+    num_files=5 # [1KB-32KB]
+    base_size = 1*KB
     log.debug("generate files: base size=%d KiB, max_size=%d KiB",
              base_size/KB, (pow(2, num_files) * base_size)/KB)
     try:
@@ -2762,10 +3304,10 @@ def test_dedup_dry_multipart():
 
     num_files=8
     min_size=MULTIPART_SIZE
-    #gen_files_in_range(files, num_files, min_size, 1024*MB)
+    # create files in range [MULTIPART_SIZE, 128MB] aligned on RADOS_OBJ_SIZE
     gen_files_in_range(files, num_files, min_size, 128*MB)
 
-    # add files in range [MULTIPART_SIZE, 4*MULTIPART_SIZE] aligned on MULTIPART_SIZE
+    # add files in range [MULTIPART_SIZE, 8*MULTIPART_SIZE] aligned on MULTIPART_SIZE
     gen_files_in_range(files, num_files, min_size, min_size*8, MULTIPART_SIZE)
 
     # add file with exactly MULTIPART_SIZE
@@ -2862,7 +3404,8 @@ def test_dedup_dry_small_large_mix():
             conns[i].create_bucket(Bucket=bucket_names[i])
 
         indices = [0] * len(files)
-        ret=procs_upload_objects(files, conns, bucket_names, indices, default_config)
+        check_obj_count=True
+        ret=procs_upload_objects(files, conns, bucket_names, indices, default_config, check_obj_count)
         upload_time_sec = (time.time_ns() - start) / (1000*1000*1000)
         expected_results = ret[0]
         dedup_stats = ret[1]
@@ -2870,8 +3413,6 @@ def test_dedup_dry_small_large_mix():
         log.debug("obj_count=%d, upload_time=%d(sec)", s3_objects_total,
                  upload_time_sec)
         exec_dedup(dedup_stats, dry_run)
-        if dry_run == False:
-            verify_objects(bucket_name, files, conn, expected_results, default_config)
     finally:
         # cleanup must be executed even after a failure
         cleanup_all_buckets(bucket_names, conns)
@@ -2946,9 +3487,19 @@ def test_dedup_dry_large_scale_with_tenants():
     size=1*KB
     files=[]
     config=TransferConfig(multipart_threshold=size, multipart_chunksize=1*MB)
-    log.debug("test_dedup_dry_large_scale_with_tenants: connect to AWS ...")
+    log.info("test_dedup_dry_large_scale: connect to AWS ...")
     gen_files_fixed_size(files, num_files, size, max_copies_count)
-    threads_dedup_basic_with_tenants_common(files, num_threads, config, True)
+    conns=get_connections(num_threads)
+    bucket_names=get_buckets(num_threads)
+    for i in range(num_threads):
+        conns[i].create_bucket(Bucket=bucket_names[i])
+    try:
+        threads_simple_dedup_with_tenants(files, conns, bucket_names, config, True)
+    except Exception:
+        log.warning("test_dedup_dry_large_scale: failed!!")
+    finally:
+        # cleanup must be executed even after a failure
+        cleanup_all_buckets(bucket_names, conns)
 
 
 #-------------------------------------------------------------------------------
@@ -2957,25 +3508,26 @@ def test_dedup_dry_large_scale():
     #return
 
     prepare_test()
-    max_copies_count=3
-    num_threads=64
-    num_files=32*1024
+    bucket_name = gen_bucket_name()
+    max_copies_count=2
+    num_files=2*1024
     size=1*KB
     files=[]
     config=TransferConfig(multipart_threshold=size, multipart_chunksize=1*MB)
-    log.debug("test_dedup_dry_large_scale_new: connect to AWS ...")
-    gen_files_fixed_size(files, num_files, size, max_copies_count)
-    conns=get_connections(num_threads)
-    bucket_names=get_buckets(num_threads)
-    for i in range(num_threads):
-        conns[i].create_bucket(Bucket=bucket_names[i])
+    log.info("test_dedup_dry_large_scale: connect to AWS ...")
     try:
-        threads_simple_dedup_with_tenants(files, conns, bucket_names, config, True)
-    except:
-        log.warning("test_dedup_dry_large_scale: failed!!")
+        gen_files_fixed_size(files, num_files, size, max_copies_count)
+        indices=[0] * len(files)
+        conn=get_single_connection()
+        conn.create_bucket(Bucket=bucket_name)
+        check_obj_count=True
+        ret=upload_objects(bucket_name, files, indices, conn, config, check_obj_count)
+        dedup_stats = ret[1]
+        dry_run=True
+        exec_dedup(dedup_stats, dry_run, True)
     finally:
         # cleanup must be executed even after a failure
-        cleanup_all_buckets(bucket_names, conns)
+        cleanup(bucket_name, conn)
 
 
 #-------------------------------------------------------------------------------