]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
rgw/dedup: split-head mechanism
authorGabriel BenHanokh <gbenhano@redhat.com>
Mon, 15 Sep 2025 19:01:02 +0000 (19:01 +0000)
committerbenhanokh <gbenhano@redhat.com>
Tue, 24 Feb 2026 19:17:38 +0000 (21:17 +0200)
Split head object into 2 objects - one with attributes and no data and
a new tail-object with only data.
The new-tail object will be deduped (unlike the head objects which can't
be deduped)
We will split head for objects with size 16MB or less

A few extra improvements:
Skip objects created by server-side-copy
Use reftag for compare-and-swap instead of manifest
Skip shared-manifest objects after reading attributes
Made max_obj_size_for_split and min_obj_size_for_dedup config value in
rgw.yaml.in

refined test: validate size after dedup
TBD: add rados ls -l to report object size in bulk to speed up the process
improved tests - verify refcounts are working, validate objects, remove
duplicates and then verify the last remaining object making sure it was
not deleted

Signed-off-by: Gabriel BenHanokh <gbenhano@redhat.com>
14 files changed:
doc/radosgw/s3_objects_dedup.rst
src/common/options/rgw.yaml.in
src/rgw/driver/rados/rgw_dedup.cc
src/rgw/driver/rados/rgw_dedup.h
src/rgw/driver/rados/rgw_dedup_cluster.cc
src/rgw/driver/rados/rgw_dedup_store.cc
src/rgw/driver/rados/rgw_dedup_store.h
src/rgw/driver/rados/rgw_dedup_table.cc
src/rgw/driver/rados/rgw_dedup_table.h
src/rgw/driver/rados/rgw_dedup_utils.cc
src/rgw/driver/rados/rgw_dedup_utils.h
src/rgw/driver/rados/rgw_obj_manifest.h
src/rgw/rgw_common.h
src/test/rgw/dedup/test_dedup.py

index b0b83d0ddf7d7c06c6952d96ddb2a7064c2a92a6..b8b1ffbefa4ab09d3a19aa409a13b6fcaa597172 100644 (file)
@@ -22,8 +22,6 @@ Admin commands
    Aborts an active dedup session and release all resources used by it.
 - ``radosgw-admin dedup stats``:
    Collects & displays last dedup statistics.
-- ``radosgw-admin dedup estimate``:
-   Starts a new dedup estimate session (aborting first existing session if exists).
 - ``radosgw-admin dedup throttle --max-bucket-index-ops=<count>``:
    Specify max bucket-index requests per second allowed for a single RGW server during dedup, 0 means unlimited.
 - ``radosgw-admin dedup throttle --stat``:
@@ -34,13 +32,17 @@ Skipped Objects
 ***************
 Dedup Estimate process skips the following objects:
 
-- Objects smaller than 4 MB (unless they are multipart).
+- Objects smaller than rgw_dedup_min_obj_size_for_dedup (unless they are multipart).
 - Objects with different placement rules.
 - Objects with different pools.
 - Objects with different storage classes.
 
 The full dedup process skips all the above and it also skips **compressed** and **user-encrypted** objects.
 
+The minimum size object for dedup is controlled by the following config option:
+
+.. confval:: rgw_dedup_min_obj_size_for_dedup
+
 *******************
 Estimate Processing
 *******************
@@ -85,6 +87,22 @@ If they are, we proceed with the deduplication:
 - copying the manifest from the source to the target.
 - removing all tail-objects on the target.
 
+***************
+Split Head Mode
+***************
+Dedup code can split the head object into 2 objects
+
+- one with attributes and no data and
+- a new tail-object with only data.
+
+The new-tail object will be deduped (unlike the head objects which can't be deduplicated)
+
+The split-Head mode is controlled by the following central configuration option:
+
+.. confval:: rgw_dedup_max_obj_size_for_split
+
+We will split head for objects with size smaller or equal to rgw_dedup_max_obj_size_for_split
+
 ************
 Memory Usage
 ************
index a163117fb8b2466daab58cf8765e88b9f964fb67..a72a039a282e2788cfed8c37f321981c2b4e3f54 100644 (file)
@@ -81,6 +81,29 @@ options:
   default: false
   services:
   - rgw
+- name: rgw_dedup_max_obj_size_for_split
+  type: size
+  level: advanced
+  desc: The maximum RGW object size to split head.
+        A value of 0 (zero) disables the split-head functionality
+  long_desc: Dedup code can split head object into 2 objects -
+             one with attributes and no data and
+             a new tail-object with only data.
+             The new-tail object will be deduped (unlike the head objects which
+             can't be deduplicated)
+             We will split head for objects with size 16MB or less
+  default: 16_M
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_dedup_min_obj_size_for_dedup
+  type: size
+  level: advanced
+  desc: The minimum RGW object size for dedup (0 means dedup all objects).
+  default: 64_K
+  services:
+  - rgw
+  with_legacy: true
 - name: rgw_max_chunk_size
   type: size
   level: advanced
index f841e8aad5a9efbbe06023c2039d0a610ce28bb7..c1174bc7ef4f8e607b46e86404ff4bf5d719c4b3 100644 (file)
@@ -83,6 +83,20 @@ namespace rgw::dedup {
   static inline constexpr unsigned MAX_STORAGE_CLASS_IDX = 128;
   using storage_class_idx_t = uint8_t;
 
+  //---------------------------------------------------------------------------
+  [[maybe_unused]] static int print_manifest(const DoutPrefixProvider *dpp,
+                                             RGWRados                 *rados,
+                                             const RGWObjManifest     &manifest)
+  {
+    unsigned idx = 0;
+    for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p, ++idx) {
+      rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
+      ldpp_dout(dpp, 20) << idx << "] " << raw_obj.oid << dendl;
+    }
+    ldpp_dout(dpp, 20) << "==============================================" << dendl;
+    return 0;
+  }
+
   //---------------------------------------------------------------------------
   void Background::DedupWatcher::handle_notify(uint64_t notify_id, uint64_t cookie,
                                                uint64_t notifier_id, bufferlist &bl)
@@ -321,7 +335,6 @@ namespace rgw::dedup {
   //---------------------------------------------------------------------------
   static int init_dedup_pool_ioctx(rgw::sal::RadosStore     *store,
                                    const DoutPrefixProvider *dpp,
-                                   bool                      create,
                                    librados::IoCtx          &ioctx)
   {
     const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool;
@@ -329,11 +342,10 @@ namespace rgw::dedup {
     auto rados_handle = store->getRados()->get_rados_handle();
     int64_t pool_id = rados_handle->pool_lookup(dedup_pool.name.c_str());
     if (pool_id >= 0) {
-      // TBD: what to do when create option is passed
       ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name
                          << " already exists, pool_id=" << pool_id << dendl;
     }
-    else if (create) {
+    else {
       pool_id = create_pool(store, dpp, pool_name);
       if (pool_id >= 0) {
         ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name
@@ -343,11 +355,6 @@ namespace rgw::dedup {
         return pool_id;
       }
     }
-    else {
-      ldpp_dout(dpp, 1) << __func__
-                        << "::ERR: pool doesn't exist and no create option" << dendl;
-      return -ENOENT;
-    }
 
     int ret = rgw_init_ioctx(dpp, rados_handle, dedup_pool, ioctx);
     if (unlikely(ret < 0)) {
@@ -382,7 +389,7 @@ namespace rgw::dedup {
     rados = store->getRados();
     rados_handle = rados->get_rados_handle();
     if (init_pool) {
-      int ret = init_dedup_pool_ioctx(store, dpp, true, d_dedup_cluster_ioctx);
+      int ret = init_dedup_pool_ioctx(store, dpp, d_dedup_cluster_ioctx);
       display_ioctx_state(dpp, d_dedup_cluster_ioctx, __func__);
       return ret;
     }
@@ -398,9 +405,14 @@ namespace rgw::dedup {
     d_cluster(dpp, cct, driver),
     d_watcher_ctx(this)
   {
-    d_min_obj_size_for_dedup = cct->_conf->rgw_max_chunk_size;
     d_head_object_size = cct->_conf->rgw_max_chunk_size;
-    //ceph_assert(4*1024*1024 == d_head_object_size);
+    d_min_obj_size_for_dedup = cct->_conf->rgw_dedup_min_obj_size_for_dedup;
+    d_max_obj_size_for_split = cct->_conf->rgw_dedup_max_obj_size_for_split;
+
+    ldpp_dout(dpp, 10) << "Config Vals::d_head_object_size=" << d_head_object_size
+                       << "::d_min_obj_size_for_dedup=" << d_min_obj_size_for_dedup
+                       << "::d_max_obj_size_for_split=" << d_max_obj_size_for_split
+                       << dendl;
 
     int ret = init_rados_access_handles(false);
     if (ret != 0) {
@@ -413,6 +425,16 @@ namespace rgw::dedup {
     d_heart_beat_max_elapsed_sec = 3;
   }
 
+  //------------------------------------------------------------------------------
+  uint64_t Background::__calc_deduped_bytes(uint16_t num_parts, uint64_t size_bytes)
+  {
+    return calc_deduped_bytes(d_head_object_size,
+                              d_min_obj_size_for_dedup,
+                              d_max_obj_size_for_split,
+                              num_parts,
+                              size_bytes);
+  }
+
   //---------------------------------------------------------------------------
   int Background::add_disk_rec_from_bucket_idx(disk_block_array_t     &disk_arr,
                                                const rgw::sal::Bucket *p_bucket,
@@ -434,7 +456,8 @@ namespace rgw::dedup {
     }
     ldpp_dout(dpp, 20) << __func__ << "::" << p_bucket->get_name() << "/"
                        << obj_name << " was written to block_idx="
-                       << rec_info.block_id << " rec_id=" << rec_info.rec_id << dendl;
+                       << rec_info.block_id << " rec_id=" << (int)rec_info.rec_id
+                       << dendl;
     return 0;
   }
 
@@ -450,12 +473,11 @@ namespace rgw::dedup {
     storage_class_idx_t sc_idx = remapper->remap(p_rec->stor_class, dpp,
                                                  &p_stats->failed_map_overflow);
     if (unlikely(sc_idx == remapper_t::NULL_IDX)) {
-      // TBD: need stat counters
       return -EOVERFLOW;
     }
     key_t key(p_rec->s.md5_high, p_rec->s.md5_low, size_4k_units,
               p_rec->s.num_parts, sc_idx);
-    bool has_shared_manifest = p_rec->has_shared_manifest();
+    bool has_shared_manifest = p_rec->s.flags.has_shared_manifest();
     ldpp_dout(dpp, 20) << __func__ << "::bucket=" << p_rec->bucket_name
                        << ", obj=" << p_rec->obj_name << ", block_id="
                        << (uint32_t)block_id << ", rec_id=" << (uint32_t)rec_id
@@ -504,6 +526,115 @@ namespace rgw::dedup {
   }
 
 #ifdef FULL_DEDUP_SUPPORT
+  //---------------------------------------------------------------------------
+  static inline std::string build_oid(const std::string& bucket_id,
+                                      const std::string& obj_name)
+  {
+    std::string oid;
+    oid.reserve(bucket_id.size() + 1 + obj_name.size());
+    oid.append(bucket_id).append("_").append(obj_name);
+    return oid;
+  }
+
+  //---------------------------------------------------------------------------
+  static int get_ioctx_internal(const DoutPrefixProvider* const dpp,
+                                rgw::sal::Driver* driver,
+                                rgw::sal::RadosStore* store,
+                                const std::string &obj_name,
+                                const std::string &instance,
+                                const rgw_bucket &rb,
+                                librados::IoCtx *p_ioctx,
+                                std::string *p_oid)
+  {
+    unique_ptr<rgw::sal::Bucket> bucket;
+    {
+      int ret = driver->load_bucket(dpp, rb, &bucket, null_yield);
+      if (unlikely(ret != 0)) {
+        ldpp_dout(dpp, 1) << __func__ << "::ERR: driver->load_bucket(): "
+                          << cpp_strerror(-ret) << dendl;
+        return ret;
+      }
+    }
+
+    string dummy_locator;
+    const rgw_obj_index_key key(obj_name, instance);
+    rgw_obj obj(bucket->get_key(), key);
+    get_obj_bucket_and_oid_loc(obj, *p_oid, dummy_locator);
+    RGWBucketInfo& bucket_info = bucket->get_info();
+    return store->get_obj_head_ioctx(dpp, bucket_info, obj, p_ioctx);
+  }
+
+  //---------------------------------------------------------------------------
+  static inline int get_ioctx(const DoutPrefixProvider* const dpp,
+                              rgw::sal::Driver* driver,
+                              rgw::sal::RadosStore* store,
+                              const disk_record_t *p_rec,
+                              librados::IoCtx *p_ioctx,
+                              std::string *p_oid)
+  {
+    rgw_bucket b{p_rec->tenant_name, p_rec->bucket_name, p_rec->bucket_id};
+    return get_ioctx_internal(dpp, driver, store, p_rec->obj_name, p_rec->instance,
+                              b, p_ioctx, p_oid);
+  }
+
+  //---------------------------------------------------------------------------
+  static inline std::string generate_split_head_tail_name(const RGWObjManifest &manifest)
+  {
+    static constexpr std::string_view shadow_string(RGW_OBJ_NS_SHADOW);
+    std::string_view suffix = "0";
+    const std::string &prefix = manifest.get_prefix();
+
+    std::string tail_name;
+    tail_name.reserve(shadow_string.size() + prefix.size() + suffix.size() + 1);
+    // TBD:
+    // it is unclear when RGW code pads with "_" before the shadow string
+    // It won't change correctness, but might look weird
+    //tail_name.append("_");
+    tail_name.append(shadow_string);
+    tail_name.append("_");
+    tail_name.append(prefix);
+    tail_name.append(suffix);
+    return tail_name;
+  }
+
+  //---------------------------------------------------------------------------
+  static void remove_created_tail_object(const DoutPrefixProvider *dpp,
+                                         librados::IoCtx &ioctx,
+                                         const std::string &tail_oid,
+                                         md5_stats_t *p_stats)
+  {
+    p_stats->rollback_tail_obj++;
+    int ret = ioctx.remove(tail_oid);
+    if (ret == 0) {
+      ldpp_dout(dpp, 20) << __func__ << "::" << tail_oid
+                         << " was successfully removed" << dendl;
+    }
+    else {
+      ldpp_dout(dpp, 10) << __func__ << "::failed ioctx.remove( " << tail_oid
+                         << " ), ret=" << ret << "::" << cpp_strerror(-ret) <<dendl;
+    }
+  }
+
+  //---------------------------------------------------------------------------
+  inline bool Background::should_split_head(uint64_t head_size, uint64_t obj_size)
+  {
+    // max_obj_size_for_split of zero means don't split!
+    return (head_size > 0            &&
+            d_max_obj_size_for_split &&
+            obj_size <= d_max_obj_size_for_split);
+  }
+
+  //---------------------------------------------------------------------------
+  [[maybe_unused]] static bool empty_rgw_bucket(const rgw_bucket &b)
+  {
+    return (b.tenant.empty()    &&
+            b.name.empty()      &&
+            b.marker.empty()    &&
+            b.bucket_id.empty() &&
+            b.explicit_placement.data_pool.empty()       &&
+            b.explicit_placement.data_extra_pool.empty() &&
+            b.explicit_placement.index_pool.empty());
+  }
 
   static constexpr uint64_t cost = 1; // 1 throttle unit per request
   static constexpr uint64_t id = 0; // ids unused
@@ -527,15 +658,17 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  int Background::free_tail_objs_by_manifest(const string   &ref_tag,
-                                             const string   &oid,
-                                             RGWObjManifest &tgt_manifest)
+  int Background::free_tail_objs_by_manifest(const string         &ref_tag,
+                                             const string         &oid,
+                                             const RGWObjManifest &manifest)
   {
     unsigned idx = 0;
-    for (auto p = tgt_manifest.obj_begin(dpp); p != tgt_manifest.obj_end(dpp); ++p, ++idx) {
+    std::unique_ptr<rgw::Aio> aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, null_yield);
+    for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p, ++idx) {
       rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
       if (oid == raw_obj.oid) {
-        ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: " << raw_obj.oid << dendl;
+        ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: "
+                           << raw_obj.oid << dendl;
         continue;
       }
 
@@ -546,21 +679,25 @@ namespace rgw::dedup {
                           << obj << dendl;
         continue;
       }
-      librados::IoCtx ioctx = obj.ioctx;
-      ldpp_dout(dpp, 20) << __func__ << "::removing tail object: " << raw_obj.oid
-                         << dendl;
+      ldpp_dout(dpp, 20) << __func__ << "::removing tail object: " << raw_obj.oid << dendl;
       d_ctl.metadata_access_throttle.acquire();
-      ret = ioctx.remove(raw_obj.oid);
+      ObjectWriteOperation op;
+      rgw::AioResultList completed;
+      cls_refcount_put(op, ref_tag, true);
+      completed = aio->get(obj.obj,
+                           rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield),
+                           cost, id);
     }
-
+    rgw::AioResultList completed = aio->drain();
     return 0;
   }
 
   //---------------------------------------------------------------------------
-  int Background::rollback_ref_by_manifest(const string   &ref_tag,
-                                           const string   &oid,
-                                           RGWObjManifest &manifest)
+  int Background::rollback_ref_by_manifest(const string         &ref_tag,
+                                           const string         &oid,
+                                           const RGWObjManifest &manifest)
   {
+    ldpp_dout(dpp, 20) << __func__ << "::" << oid << dendl;
     unsigned idx = 0;
     int ret_code = 0;
     std::unique_ptr<rgw::Aio> aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, null_yield);
@@ -595,9 +732,9 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  int Background::inc_ref_count_by_manifest(const string   &ref_tag,
-                                            const string   &oid,
-                                            RGWObjManifest &manifest)
+  int Background::inc_ref_count_by_manifest(const string         &ref_tag,
+                                            const string         &oid,
+                                            const RGWObjManifest &manifest)
   {
     std::unique_ptr<rgw::Aio> aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, null_yield);
     rgw::AioResultList all_results;
@@ -614,14 +751,15 @@ namespace rgw::dedup {
       ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj);
       if (ret < 0) {
         ldpp_dout(dpp, 1) << __func__ << "::ERR: manifest::failed to open context "
-                          << obj << dendl;
+                          << raw_obj.oid << dendl;
         break;
       }
 
       ObjectWriteOperation op;
       cls_refcount_get(op, ref_tag, true);
       d_ctl.metadata_access_throttle.acquire();
-      ldpp_dout(dpp, 20) << __func__ << "::inc ref-count on tail object: " << raw_obj.oid << dendl;
+      ldpp_dout(dpp, 20) << __func__ << "::inc ref-count on tail object: "
+                         << raw_obj.oid << "::" << obj.obj.oid << dendl;
       rgw::AioResultList completed = aio->get(obj.obj,
                                               rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield),
                                               cost, id);
@@ -629,14 +767,14 @@ namespace rgw::dedup {
       all_results.splice(all_results.end(), completed);
       if (ret < 0) {
         ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to copy obj=" << obj
-                          << ", the error code = " << ret << dendl;
+                          << ", ret=" << ret << " err is " << cpp_strerror(-ret) << dendl;
         break;
       }
     }
 
     if (ret == 0) {
       rgw::AioResultList completed = aio->drain();
-      int ret = rgw::check_for_errors(completed);
+      ret = rgw::check_for_errors(completed);
       all_results.splice(all_results.end(), completed);
       if (ret == 0) {
         return 0;
@@ -647,13 +785,14 @@ namespace rgw::dedup {
       }
     }
 
-    // if arrived here we failed somewhere -> rollback all ref-inc operations
     /* wait all pending op done */
     rgw::AioResultList completed = aio->drain();
     all_results.splice(all_results.end(), completed);
     int ret2 = 0;
     for (auto& aio_res : all_results) {
       if (aio_res.result < 0) {
+        ldpp_dout(dpp, 10) << __func__ << "::skip failed refcount inc: "
+                           << aio_res.obj.oid << dendl;
         continue; // skip errors
       }
       rgw_rados_ref obj;
@@ -664,219 +803,302 @@ namespace rgw::dedup {
 
       ObjectWriteOperation op;
       cls_refcount_put(op, ref_tag, true);
+      ldpp_dout(dpp, 10) << __func__ << "::rollback refcount inc on: "
+                         << aio_res.obj.oid << dendl;
       rgw::AioResultList completed = aio->get(obj.obj,
                                               rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield),
                                               cost, id);
       ret2 = rgw::check_for_errors(completed);
       if (ret2 < 0) {
-        ldpp_dout(dpp, 1) << __func__ << "::ERR: cleanup after error failed to drop reference on obj=" << aio_res.obj << dendl;
+        ldpp_dout(dpp, 1) << __func__ << "::ERR: cleanup after error failed to drop reference on obj="
+                          << aio_res.obj << dendl;
       }
     }
     completed = aio->drain();
     ret2 = rgw::check_for_errors(completed);
     if (ret2 < 0) {
       ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to drain rollback ios, ret="
-                        << ret2 <<dendl;
+                        << ret2 << dendl;
     }
 
     return ret;
   }
 
   //---------------------------------------------------------------------------
-  static int get_ioctx(const DoutPrefixProvider* const dpp,
-                       rgw::sal::Driver* driver,
-                       rgw::sal::RadosStore* store,
-                       const disk_record_t *p_rec,
-                       librados::IoCtx *p_ioctx,
-                       std::string *p_oid)
+  static void dedup_object_log(const DoutPrefixProvider *dpp,
+                               const disk_record_t *p_src_rec,
+                               const disk_record_t *p_tgt_rec,
+                               uint64_t             src_head_size,
+                               uint64_t             tgt_head_size,
+                               const bufferlist    &etag_bl)
   {
-    unique_ptr<rgw::sal::Bucket> bucket;
-    {
-      rgw_bucket b{p_rec->tenant_name, p_rec->bucket_name, p_rec->bucket_id};
-      int ret = driver->load_bucket(dpp, b, &bucket, null_yield);
-      if (unlikely(ret != 0)) {
-        ldpp_dout(dpp, 1) << __func__ << "::ERR: driver->load_bucket(): "
-                          << cpp_strerror(-ret) << dendl;
-        return ret;
-      }
-    }
+    ldpp_dout(dpp, 20) << __func__ << "::DEDUP SRC:"
+                       << p_src_rec->bucket_name << "/" << p_src_rec->obj_name
+                       << "(" << src_head_size << ") ::TGT:"
+                       << p_tgt_rec->bucket_name << "/" << p_tgt_rec->obj_name
+                       << "(" << tgt_head_size << ")" << dendl;
+    ldpp_dout(dpp, 20) << __func__ << "::num_parts=" << p_tgt_rec->s.num_parts
+                       << "::ETAG=" << etag_bl.to_str() << dendl;
+  }
 
-    string dummy_locator;
-    const rgw_obj_index_key key(p_rec->obj_name, p_rec->instance);
-    rgw_obj obj(bucket->get_key(), key);
-    get_obj_bucket_and_oid_loc(obj, *p_oid, dummy_locator);
-    RGWBucketInfo& bucket_info = bucket->get_info();
-    return store->get_obj_head_ioctx(dpp, bucket_info, obj, p_ioctx);
+  //---------------------------------------------------------------------------
+  /* The target (TGT) manifest must inherit the source (SRC) manifest, as both share
+   *  the same tail objects.
+   * However, the TGT head object needs to maintain its unique identity, including
+   *  its head-placement-rule and head-object parameters, which are stored in
+   * `rgw_obj`.
+   *
+   * The size of the TGT head object must be adjusted to match the SRC head size.
+   * This is straightforward when Split-Head is enabled, as both heads can be set to
+   *  zero and all data is stored in the tail.
+   *
+   * A potential issue arises if the SRC and TGT have different head sizes and
+   *  Split-Head is not used.
+   * While this scenario is unlikely in practice (as head-size is almost always 4MB),
+   *  if it were to occur, we should abort the deduplication process to prevent data
+   *  inconsistencies.
+   */
+  static void adjust_target_manifest(const RGWObjManifest &src_manifest,
+                                     const RGWObjManifest &tgt_manifest,
+                                     bufferlist           &new_manifest_bl)
+  {
+    // first create new_manifest from the src_manifest
+    RGWObjManifest new_manifest(src_manifest);
+
+    // then, adjust head-object parameters to match the tgt_manifest
+    const uint64_t src_head_size = src_manifest.get_head_size();
+    const auto& tgt_placement_rule = tgt_manifest.get_head_placement_rule();
+    const rgw_obj &tgt_head_obj = tgt_manifest.get_obj();
+
+    new_manifest.set_head(tgt_placement_rule, tgt_head_obj, src_head_size);
+    encode(new_manifest, new_manifest_bl);
   }
 
   //---------------------------------------------------------------------------
-  static void init_cmp_pairs(const disk_record_t *p_rec,
-                             const bufferlist    &etag_bl,
-                             bufferlist          &hash_bl, // OUT PARAM
+  static void init_cmp_pairs(const DoutPrefixProvider *dpp,
+                             const disk_record_t *p_rec,
+                             const bufferlist &etag_bl,
+                             bufferlist &hash_bl, // OUT PARAM
                              librados::ObjectWriteOperation *p_op)
   {
     p_op->cmpxattr(RGW_ATTR_ETAG, CEPH_OSD_CMPXATTR_OP_EQ, etag_bl);
-    // TBD: do we really need the secondary compare using the full manifest?
-    // Can replace it with something cheaper like size/version?
-    p_op->cmpxattr(RGW_ATTR_MANIFEST, CEPH_OSD_CMPXATTR_OP_EQ, p_rec->manifest_bl);
+    bufferlist ref_tag_bl;
+    ref_tag_bl.append(p_rec->ref_tag);
+    if (p_rec->s.flags.is_ref_tag_from_tail()) {
+      p_op->cmpxattr(RGW_ATTR_TAIL_TAG, CEPH_OSD_CMPXATTR_OP_EQ, ref_tag_bl);
+    }
+    else {
+      p_op->cmpxattr(RGW_ATTR_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, ref_tag_bl);
+    }
 
     // BLAKE3 hash has 256 bit splitted into multiple 64bit units
-    const unsigned units = (256 / (sizeof(uint64_t)*8));
-    static_assert(units == 4);
-    for (unsigned i = 0; i < units; i++) {
+    for (unsigned i = 0; i < HASH_UNITS; i++) {
       ceph::encode(p_rec->s.hash[i], hash_bl);
     }
 
     if (!p_rec->s.flags.hash_calculated()) {
+      ldpp_dout(dpp, 20) << __func__ << "::CMP HASH " << p_rec->obj_name << dendl;
       p_op->cmpxattr(RGW_ATTR_BLAKE3, CEPH_OSD_CMPXATTR_OP_EQ, hash_bl);
     }
   }
 
   //---------------------------------------------------------------------------
-  int Background::dedup_object(const disk_record_t *p_src_rec,
-                               const disk_record_t *p_tgt_rec,
-                               md5_stats_t         *p_stats,
-                               bool                 has_shared_manifest_src)
+  static inline void build_manifest_hash_bl(const bufferlist &manifest_bl,
+                                            bufferlist &manifest_hash_bl)
   {
-    RGWObjManifest src_manifest;
-    try {
-      auto bl_iter = p_src_rec->manifest_bl.cbegin();
-      decode(src_manifest, bl_iter);
-    } catch (buffer::error& err) {
-      ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad src manifest" << dendl;
-      return -EINVAL;
-    }
-    RGWObjManifest tgt_manifest;
-    try {
-      auto bl_iter = p_tgt_rec->manifest_bl.cbegin();
-      decode(tgt_manifest, bl_iter);
-    } catch (buffer::error& err) {
-      ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad tgt manifest" << dendl;
-      return -EINVAL;
-    }
-    ldpp_dout(dpp, 20) << __func__ << "::DEDUP From: "
-                       << p_src_rec->bucket_name << "/" << p_src_rec->obj_name << " -> "
-                       << p_tgt_rec->bucket_name << "/" << p_tgt_rec->obj_name << dendl;
+    bufferlist hash_bl;
+    crypto::digest<crypto::SHA1>(manifest_bl).encode(hash_bl);
+    // Use a shorter hash (64bit instead of 160bit)
+    hash_bl.splice(0, 8, &manifest_hash_bl);
+  }
 
+  //---------------------------------------------------------------------------
+  int Background::dedup_object(disk_record_t                *p_src_rec,
+                               disk_record_t                *p_tgt_rec,
+                               const RGWObjManifest         &src_manifest,
+                               const RGWObjManifest         &tgt_manifest,
+                               md5_stats_t                  *p_stats,
+                               const dedup_table_t::value_t *p_src_val,
+                               const std::string            &tail_oid)
+  {
+    const uint64_t src_head_size = src_manifest.get_head_size();
+    const uint64_t tgt_head_size = tgt_manifest.get_head_size();
     bufferlist etag_bl;
     etag_to_bufferlist(p_tgt_rec->s.md5_high, p_tgt_rec->s.md5_low, p_tgt_rec->s.num_parts, &etag_bl);
-    ldpp_dout(dpp, 20) << __func__ << "::num_parts=" << p_tgt_rec->s.num_parts
-                       << "::ETAG=" << etag_bl.to_str() << dendl;
-
-    bufferlist hash_bl, manifest_hash_bl, tgt_hash_bl;
-    crypto::digest<crypto::SHA1>(p_src_rec->manifest_bl).encode(hash_bl);
-    // Use a shorter hash (64bit instead of 160bit)
-    hash_bl.splice(0, 8, &manifest_hash_bl);
-    librados::ObjectWriteOperation tgt_op;
-    init_cmp_pairs(p_tgt_rec, etag_bl, tgt_hash_bl, &tgt_op);
-    tgt_op.setxattr(RGW_ATTR_SHARE_MANIFEST, manifest_hash_bl);
-    tgt_op.setxattr(RGW_ATTR_MANIFEST, p_src_rec->manifest_bl);
-    if (p_tgt_rec->s.flags.hash_calculated()) {
-      tgt_op.setxattr(RGW_ATTR_BLAKE3, tgt_hash_bl);
-      p_stats->set_hash_attrs++;
+    bool should_print_debug = cct->_conf->subsys.should_gather<ceph_subsys_rgw_dedup, 20>();
+    if (unlikely(should_print_debug)) {
+      dedup_object_log(dpp, p_src_rec, p_tgt_rec, src_head_size, tgt_head_size, etag_bl);
     }
 
     std::string src_oid, tgt_oid;
     librados::IoCtx src_ioctx, tgt_ioctx;
-    int ret1 = get_ioctx(dpp, driver, store, p_src_rec, &src_ioctx, &src_oid);
-    int ret2 = get_ioctx(dpp, driver, store, p_tgt_rec, &tgt_ioctx, &tgt_oid);
-    if (unlikely(ret1 != 0 || ret2 != 0)) {
-      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed get_ioctx()" << dendl;
-      return (ret1 ? ret1 : ret2);
+    int ret = get_ioctx(dpp, driver, store, p_src_rec, &src_ioctx, &src_oid);
+    if (unlikely(ret != 0)) {
+      // can't remove created tail object without an ioctx handle
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed SRC get_ioctx()" << dendl;
+      return ret;
     }
 
-    // TBD: Do we need to remove target RGW_ATTR_TAIL_TAG??
-    string ref_tag = p_tgt_rec->ref_tag;
+    ret = get_ioctx(dpp, driver, store, p_tgt_rec, &tgt_ioctx, &tgt_oid);
+    if (unlikely(ret != 0)) {
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed TGT get_ioctx()" << dendl;
+      if (p_src_rec->s.flags.is_split_head()) {
+        remove_created_tail_object(dpp, src_ioctx, tail_oid, p_stats);
+      }
+      return ret;
+    }
+
+    // we don't dedup head-objects so head-size must match (unless split-head)
+    // see explanation in adjust_target_manifest()
+    if (unlikely(src_head_size != 0 && src_head_size != tgt_head_size)) {
+      ldpp_dout(dpp, 5) << __func__ << "::abort! src_head_size=" << src_head_size
+                        << "::tgt_head_size=" << tgt_head_size << dendl;
+      if (p_src_rec->s.flags.is_split_head()) {
+        remove_created_tail_object(dpp, src_ioctx, tail_oid, p_stats);
+      }
+      // TBD: can we create a test case (requires control over head-object-size)??
+      return -ECANCELED;
+    }
+
+    const string &ref_tag = p_tgt_rec->ref_tag;
     ldpp_dout(dpp, 20) << __func__ << "::ref_tag=" << ref_tag << dendl;
-    int ret = inc_ref_count_by_manifest(ref_tag, src_oid, src_manifest);
-    if (ret == 0) {
-      d_ctl.metadata_access_throttle.acquire();
-      ldpp_dout(dpp, 20) << __func__ << "::send TGT CLS (Shared_Manifest)" << dendl;
-      ret = tgt_ioctx.operate(tgt_oid, &tgt_op);
-      if (unlikely(ret != 0)) {
-        ldpp_dout(dpp, 1) << __func__ << "::ERR: failed tgt_ioctx.operate("
-                          << tgt_oid << "), err is " << cpp_strerror(-ret) << dendl;
-        rollback_ref_by_manifest(ref_tag, src_oid, src_manifest);
-        return ret;
+    ret = inc_ref_count_by_manifest(ref_tag, src_oid, src_manifest);
+    if (unlikely(ret != 0)) {
+      if (p_src_rec->s.flags.is_split_head()) {
+        remove_created_tail_object(dpp, src_ioctx, tail_oid, p_stats);
       }
+      return ret;
+    }
 
-      // free tail objects based on TGT manifest
-      free_tail_objs_by_manifest(ref_tag, tgt_oid, tgt_manifest);
+    bufferlist manifest_hash_bl;
+    build_manifest_hash_bl(p_src_rec->manifest_bl, manifest_hash_bl);
 
-      if (!has_shared_manifest_src) {
-        // When SRC OBJ A has two or more dups (B, C) we set SHARED_MANIFEST
-        // after deduping B and update it in dedup_table, but don't update the
-        // disk-record (as require an expensive random-disk-write).
-        // When deduping C we can trust the shared_manifest state in the table and
-        // skip a redundant update to SRC object attribute
+    if (!p_src_val->has_shared_manifest()) {
+      // When SRC OBJ A has two or more dups (B, C) we set SHARED_MANIFEST
+      // after deduping B and update it in dedup_table, but don't update the
+      // disk-record (as require an expensive random-disk-write).
+      // When deduping C we can trust the shared_manifest state in the table and
+      // skip a redundant update to SRC object attribute
+      librados::ObjectWriteOperation src_op;
+      {
         bufferlist src_hash_bl;
-        librados::ObjectWriteOperation src_op;
-        init_cmp_pairs(p_src_rec, etag_bl, src_hash_bl, &src_op);
+        init_cmp_pairs(dpp, p_src_rec, etag_bl, src_hash_bl, &src_op);
         src_op.setxattr(RGW_ATTR_SHARE_MANIFEST, manifest_hash_bl);
-        if (p_src_rec->s.flags.hash_calculated(){
+        if (p_src_rec->s.flags.hash_calculated() && !p_src_val->has_valid_hash()){
           src_op.setxattr(RGW_ATTR_BLAKE3, src_hash_bl);
+          ldpp_dout(dpp, 20) << __func__ <<"::Set SRC Strong Hash in CLS"<< dendl;
           p_stats->set_hash_attrs++;
         }
+      }
 
-        d_ctl.metadata_access_throttle.acquire();
-        ldpp_dout(dpp, 20) << __func__ <<"::send SRC CLS (Shared_Manifest)"<< dendl;
-        ret = src_ioctx.operate(src_oid, &src_op);
-        if (unlikely(ret != 0)) {
-          ldpp_dout(dpp, 1) << __func__ << "::ERR: failed src_ioctx.operate("
-                            << src_oid << "), err is " << cpp_strerror(-ret)<<dendl;
-          return ret;
+      if (p_src_rec->s.flags.is_split_head()) {
+        ldpp_dout(dpp, 20) << __func__ <<"::SRC-Split (truncate)" << dendl;
+        src_op.setxattr(RGW_ATTR_MANIFEST, p_src_rec->manifest_bl);
+        src_op.truncate(0);
+        p_stats->split_head_src++;
+      }
+      d_ctl.metadata_access_throttle.acquire();
+      ldpp_dout(dpp, 20) << __func__ <<"::send SRC CLS"<< dendl;
+      ret = src_ioctx.operate(src_oid, &src_op);
+      if (unlikely(ret != 0)) {
+        ldpp_dout(dpp, 1) << __func__ << "::ERR: failed src_ioctx.operate("
+                          << src_oid << "), err is " << cpp_strerror(-ret)<<dendl;
+        rollback_ref_by_manifest(ref_tag, src_oid, src_manifest);
+        if (p_src_rec->s.flags.is_split_head()) {
+          remove_created_tail_object(dpp, src_ioctx, tail_oid, p_stats);
         }
+        return ret;
+      }
+    }
+
+    librados::ObjectWriteOperation tgt_op;
+    {
+      bufferlist tgt_hash_bl;
+      init_cmp_pairs(dpp, p_tgt_rec, etag_bl, tgt_hash_bl, &tgt_op);
+      tgt_op.setxattr(RGW_ATTR_SHARE_MANIFEST, manifest_hash_bl);
+      bufferlist new_manifest_bl;
+      adjust_target_manifest(src_manifest, tgt_manifest, new_manifest_bl);
+      tgt_op.setxattr(RGW_ATTR_MANIFEST, new_manifest_bl);
+      //tgt_op.setxattr(RGW_ATTR_MANIFEST, p_src_rec->manifest_bl);
+      if (p_tgt_rec->s.flags.hash_calculated()) {
+        tgt_op.setxattr(RGW_ATTR_BLAKE3, tgt_hash_bl);
+        ldpp_dout(dpp, 20) << __func__ <<"::Set TGT Strong Hash in CLS"<< dendl;
+        p_stats->set_hash_attrs++;
       }
     }
 
+    // If failed before this point and split-head -> remove the new tail-object
+    if (src_head_size == 0 && tgt_head_size > 0) {
+      ldpp_dout(dpp, 20) << __func__ <<"::TGT-Split OP (truncate)" << dendl;
+      p_tgt_rec->s.flags.set_split_head();
+      tgt_op.truncate(0);
+      p_stats->split_head_tgt++;
+    }
+    d_ctl.metadata_access_throttle.acquire();
+    ldpp_dout(dpp, 20) << __func__ << "::send TGT CLS" << dendl;
+    ret = tgt_ioctx.operate(tgt_oid, &tgt_op);
+    if (unlikely(ret != 0)) {
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed tgt_ioctx.operate("
+                        << tgt_oid << "), err is " << cpp_strerror(-ret) << dendl;
+      rollback_ref_by_manifest(ref_tag, src_oid, src_manifest);
+      return ret;
+    }
+
+    // free tail objects based on TGT manifest
+    free_tail_objs_by_manifest(ref_tag, tgt_oid, tgt_manifest);
+
     // do we need to set compression on the head object or is it set on tail?
     // RGW_ATTR_COMPRESSION
     return ret;
   }
 
   //---------------------------------------------------------------------------
-  int Background::calc_object_blake3(const disk_record_t *p_rec, uint8_t *p_hash)
+  int Background::calc_object_blake3(const RGWObjManifest &manifest,
+                                     disk_record_t *p_rec,
+                                     uint8_t *p_hash,
+                                     blake3_hasher *p_pre_calc_hmac)
   {
-    ldpp_dout(dpp, 20) << __func__ << "::obj_name=" << p_rec->obj_name << dendl;
-    RGWObjManifest manifest;
-    try {
-      auto bl_iter = p_rec->manifest_bl.cbegin();
-      decode(manifest, bl_iter);
-    } catch (buffer::error& err) {
-      ldpp_dout(dpp, 1)  << __func__ << "::ERROR: bad src manifest for: "
-                         << p_rec->obj_name << dendl;
-      return -EINVAL;
+    ldpp_dout(dpp, 20) << __func__ << "::p_rec->obj_name=" << p_rec->obj_name << dendl;
+
+    blake3_hasher _hmac, *p_hmac = nullptr;
+    if (!p_pre_calc_hmac) {
+      blake3_hasher_init(&_hmac);
+      p_hmac = &_hmac;
+    }
+    else {
+      p_hmac = p_pre_calc_hmac;
     }
 
-    blake3_hasher hmac;
-    blake3_hasher_init(&hmac);
     for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p) {
-      rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
-      rgw_rados_ref obj;
-      int ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj);
-      if (ret < 0) {
-        ldpp_dout(dpp, 1) << __func__ << "::failed rgw_get_rados_ref() for oid: "
-                          << raw_obj.oid << ", err is " << cpp_strerror(-ret) << dendl;
-        return ret;
-      }
+      uint64_t offset = p.get_stripe_ofs();
+      const rgw_obj_select& os = p.get_location();
+      if (offset > 0 || !p_pre_calc_hmac) {
+        rgw_raw_obj raw_obj = os.get_raw_obj(rados);
+        rgw_rados_ref obj;
+        int ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj);
+        if (ret < 0) {
+          ldpp_dout(dpp, 1) << __func__ << "::failed rgw_get_rados_ref() for oid="
+                            << raw_obj.oid << ", err is " << cpp_strerror(-ret) << dendl;
+          return ret;
+        }
 
-      bufferlist bl;
-      librados::IoCtx ioctx = obj.ioctx;
-      // read full object
-      ret = ioctx.read(raw_obj.oid, bl, 0, 0);
-      if (ret > 0) {
+        librados::IoCtx ioctx = obj.ioctx;
+        bufferlist bl;
+        // read full object
+        ret = ioctx.read(raw_obj.oid, bl, 0, 0);
+        if (unlikely(ret <= 0)) {
+          ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to read oid "
+                            << raw_obj.oid  << ", err is " << cpp_strerror(-ret) << dendl;
+          return ret;
+        }
         for (const auto& bptr : bl.buffers()) {
-          blake3_hasher_update(&hmac, (const unsigned char *)bptr.c_str(), bptr.length());
+          blake3_hasher_update(p_hmac, (const unsigned char *)bptr.c_str(), bptr.length());
         }
       }
-      else {
-        ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to read " << raw_obj.oid
-                          << ", error is " << cpp_strerror(-ret) << dendl;
-        return ret;
-      }
     }
-
-    blake3_hasher_finalize(&hmac, p_hash, BLAKE3_OUT_LEN);
+    blake3_hasher_finalize(p_hmac, p_hash, BLAKE3_OUT_LEN);
+    p_rec->s.flags.set_hash_calculated();
+    p_rec->s.flags.set_has_valid_hash();
     return 0;
   }
 
@@ -890,28 +1112,58 @@ namespace rgw::dedup {
   {
     ldpp_dout(dpp, 20) << __func__ << "::bucket=" << p_tgt_rec->bucket_name
                        << ", obj=" << p_tgt_rec->obj_name
+                       << ", bytes_size=" << p_tgt_rec->s.obj_bytes_size
                        << ", block_id=" << block_id
-                       << ", rec_id=" << (int)rec_id
-                       << ", md5_shard=" << (int)md5_shard << dendl;
-
-    ldpp_dout(dpp, 20) << __func__ << "::md5_shard=" << (int)md5_shard
-                       << "::" << p_tgt_rec->bucket_name
-                       << "/" << p_tgt_rec->obj_name
+                       << ", rec_id=" << (int)rec_id << "\n"
+                       << ", md5_shard=" << (int)md5_shard
                        << "::num_parts=" << p_tgt_rec->s.num_parts
                        << "::ETAG=" << std::hex << p_tgt_rec->s.md5_high
                        << p_tgt_rec->s.md5_low << std::dec << dendl;
   }
 
   //---------------------------------------------------------------------------
-  int Background::add_obj_attrs_to_record(rgw_bucket            *p_rb,
-                                          disk_record_t         *p_rec,
+  static inline bool invalid_tail_placement(const rgw_bucket_placement& tail_placement)
+  {
+    return (tail_placement.bucket.name.empty() || tail_placement.placement_rule.name.empty());
+  }
+
+  //---------------------------------------------------------------------------
+  static void set_explicit_tail_placement(const DoutPrefixProvider* dpp,
+                                          RGWObjManifest *p_manifest,// IN-OUT PARAM
+                                          md5_stats_t *p_stats)
+  {
+    p_stats->manifest_no_tail_placement++;
+    ldpp_dout(dpp, 20) << __func__ << "::invalid_tail_placement -> update" << dendl;
+    const rgw_bucket_placement& tail_placement = p_manifest->get_tail_placement();
+    const rgw_bucket *p_bucket = &tail_placement.bucket;
+
+    if (tail_placement.bucket.name.empty()) {
+      // bucket was not set in tail_placement, force the head bucket explicitly
+      const rgw_obj& head_obj = p_manifest->get_obj();
+      p_bucket = &head_obj.bucket;
+    }
+
+    if (tail_placement.placement_rule.name.empty()) {
+      // explicitly use the head_placement_rule for tail objects and update bucket
+      // if needed
+      const auto &head_placement_rule = p_manifest->get_head_placement_rule();
+      p_manifest->set_tail_placement(head_placement_rule, *p_bucket);
+    }
+    else {
+      // otherwise, keep the tail_placement_rule in place (but still update bucket)
+      p_manifest->set_tail_placement(tail_placement.placement_rule, *p_bucket);
+    }
+  }
+
+  //---------------------------------------------------------------------------
+  int Background::add_obj_attrs_to_record(disk_record_t         *p_rec,
                                           const rgw::sal::Attrs &attrs,
-                                          dedup_table_t         *p_table,
                                           md5_stats_t           *p_stats) /*IN-OUT*/
   {
     // if TAIL_TAG exists -> use it as ref-tag, eitherwise take ID_TAG
     auto itr = attrs.find(RGW_ATTR_TAIL_TAG);
     if (itr != attrs.end()) {
+      p_rec->s.flags.set_ref_tag_from_tail();
       p_rec->ref_tag = itr->second.to_str();
     }
     else {
@@ -929,10 +1181,11 @@ namespace rgw::dedup {
     // clear bufferlist first
     p_rec->manifest_bl.clear();
 
+    bool need_to_split_head = false;
+    RGWObjManifest manifest;
     itr = attrs.find(RGW_ATTR_MANIFEST);
     if (itr != attrs.end()) {
       const bufferlist &bl = itr->second;
-      RGWObjManifest manifest;
       try {
         auto bl_iter = bl.cbegin();
         decode(manifest, bl_iter);
@@ -941,12 +1194,13 @@ namespace rgw::dedup {
                            << "::ERROR: unable to decode manifest" << dendl;
         return -EINVAL;
       }
+      need_to_split_head = should_split_head(manifest.get_head_size(),
+                                             p_rec->s.obj_bytes_size);
 
       // force explicit tail_placement as the dedup could be on another bucket
       const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
-      if (tail_placement.bucket.name.empty()) {
-        ldpp_dout(dpp, 20) << __func__ << "dedup::updating tail placement" << dendl;
-        manifest.set_tail_placement(tail_placement.placement_rule, *p_rb);
+      if (unlikely(invalid_tail_placement(tail_placement))) {
+        set_explicit_tail_placement(dpp, &manifest, p_stats);
         encode(manifest, p_rec->manifest_bl);
       }
       else {
@@ -958,6 +1212,18 @@ namespace rgw::dedup {
       ldpp_dout(dpp, 5)  << __func__ << "::ERROR: no manifest" << dendl;
       return -EINVAL;
     }
+    const auto &head_placement_rule = manifest.get_head_placement_rule();
+    const std::string& storage_class =
+      rgw_placement_rule::get_canonical_storage_class(head_placement_rule.storage_class);
+
+    // p_rec holds the storage_class value taken from the bucket-index/obj-attr
+    if (unlikely(storage_class != p_rec->stor_class)) {
+      ldpp_dout(dpp, 5) << __func__ << "::ERROR::manifest storage_class="
+                        << storage_class << " != " << "::bucket-index storage_class="
+                        << p_rec->stor_class << dendl;
+      p_stats->different_storage_class++;
+      return -EINVAL;
+    }
 
     itr = attrs.find(RGW_ATTR_SHARE_MANIFEST);
     if (itr != attrs.end()) {
@@ -983,14 +1249,13 @@ namespace rgw::dedup {
     if (itr != attrs.end()) {
       try {
         auto bl_iter = itr->second.cbegin();
-        // BLAKE3 hash 256 bit splitted into multiple 64bit units
-        const unsigned units = (256 / (sizeof(uint64_t)*8));
-        static_assert(units == 4);
-        for (unsigned i = 0; i < units; i++) {
+        // BLAKE3 hash is 256 bits split into multiple 64-bit units
+        for (unsigned i = 0; i < HASH_UNITS; i++) {
           uint64_t val;
           ceph::decode(val, bl_iter);
           p_rec->s.hash[i] = val;
         }
+        p_rec->s.flags.set_has_valid_hash();
         p_stats->valid_hash_attrs++;
         return 0;
       } catch (buffer::error& err) {
@@ -999,16 +1264,17 @@ namespace rgw::dedup {
       }
     }
 
+    // if we arrived here we need to calculate the strong hash
     p_stats->invalid_hash_attrs++;
-    // TBD: redundant memset...
     memset(p_rec->s.hash, 0, sizeof(p_rec->s.hash));
-    // BLAKE3_OUT_LEN is 32 Bytes
-    int ret = calc_object_blake3(p_rec, (uint8_t*)p_rec->s.hash);
-    if (ret == 0) {
-      p_rec->s.flags.set_hash_calculated();
-    }
 
-    return ret;
+    if (!need_to_split_head) {
+      ldpp_dout(dpp, 20) << __func__ << "::CALC Object Strong Hash::"
+                         << p_rec->obj_name << dendl;
+      return calc_object_blake3(manifest, p_rec, (uint8_t*)p_rec->s.hash);
+    }
+    // else, defer strong-hash calculation to the next step and piggyback on split-head
+    return 0;
   }
 
   //---------------------------------------------------------------------------
@@ -1035,7 +1301,6 @@ namespace rgw::dedup {
     storage_class_idx_t sc_idx = remapper->remap(p_rec->stor_class, dpp,
                                                  &p_stats->failed_map_overflow);
     if (unlikely(sc_idx == remapper_t::NULL_IDX)) {
-      // TBD: need stat counters
       return -EOVERFLOW;
     }
     key_t key_from_bucket_index(p_rec->s.md5_high, p_rec->s.md5_low, size_4k_units,
@@ -1043,7 +1308,7 @@ namespace rgw::dedup {
     dedup_table_t::value_t src_val;
     int ret = p_table->get_val(&key_from_bucket_index, &src_val);
     if (ret != 0) {
-      if (ondisk_byte_size <= d_min_obj_size_for_dedup && p_rec->s.num_parts == 0) {
+      if (!dedupable_object(p_rec->multipart_object(), d_min_obj_size_for_dedup, ondisk_byte_size)) {
         // record has no valid entry in table because it is a too small
         // It was loaded to table for calculation and then purged
         p_stats->skipped_purged_small++;
@@ -1113,6 +1378,19 @@ namespace rgw::dedup {
     }
 
     const rgw::sal::Attrs& attrs = p_obj->get_attrs();
+    if (src_val.has_shared_manifest() && (attrs.find(RGW_ATTR_SHARE_MANIFEST) != attrs.end())) {
+      // A shared_manifest object can't be a dedup target
+      // We only need to keep a single shared_manifest object
+      // to be used as a dedup-source (which we already got)
+      p_stats->skipped_shared_manifest++;
+      uint64_t dedupable_objects_bytes = __calc_deduped_bytes(p_rec->s.num_parts,
+                                                              ondisk_byte_size);
+      p_stats->shared_manifest_dedup_bytes += dedupable_objects_bytes;
+      ldpp_dout(dpp, 20) << __func__ << "::(1)skipped shared_manifest, SRC::block_id="
+                         << src_val.block_idx << "::rec_id=" << (int)src_val.rec_id << dendl;
+      return 0;
+    }
+
     if (attrs.find(RGW_ATTR_CRYPT_MODE) != attrs.end()) {
       p_stats->ingress_skip_encrypted++;
       p_stats->ingress_skip_encrypted_bytes += ondisk_byte_size;
@@ -1121,7 +1399,7 @@ namespace rgw::dedup {
       return 0;
     }
 
-    // TBD: We should be able to support RGW_ATTR_COMPRESSION when all copies are compressed
+    // TBD-Future: We should be able to support RGW_ATTR_COMPRESSION when all copies are compressed
     if (attrs.find(RGW_ATTR_COMPRESSION) != attrs.end()) {
       p_stats->ingress_skip_compressed++;
       p_stats->ingress_skip_compressed_bytes += ondisk_byte_size;
@@ -1154,6 +1432,16 @@ namespace rgw::dedup {
     else {
       storage_class = RGW_STORAGE_CLASS_STANDARD;
     }
+
+    // p_rec holds the storage_class value taken from the bucket-index
+    if (unlikely(storage_class != p_rec->stor_class)) {
+      ldpp_dout(dpp, 5) << __func__ << "::ERROR::ATTR storage_class="
+                        << storage_class << " != " << "::bucket-index storage_class="
+                        << p_rec->stor_class << dendl;
+      p_stats->different_storage_class++;
+      return -EINVAL;
+    }
+
     // no need to check for remap success as we compare keys bellow
     sc_idx = remapper->remap(storage_class, dpp, &p_stats->failed_map_overflow);
     key_t key_from_obj(parsed_etag.md5_high, parsed_etag.md5_low,
@@ -1169,7 +1457,7 @@ namespace rgw::dedup {
 
     // reset flags
     p_rec->s.flags.clear();
-    ret = add_obj_attrs_to_record(&b, p_rec, attrs, p_table, p_stats);
+    ret = add_obj_attrs_to_record(p_rec, attrs, p_stats);
     if (unlikely(ret != 0)) {
       ldpp_dout(dpp, 5) << __func__ << "::ERR: failed add_obj_attrs_to_record() ret="
                         << ret << "::" << cpp_strerror(-ret) << dendl;
@@ -1180,13 +1468,16 @@ namespace rgw::dedup {
     ret = p_disk->add_record(d_dedup_cluster_ioctx, p_rec, &rec_info);
     if (ret == 0) {
       // set the disk_block_id_t to this unless the existing disk_block_id is marked as shared-manifest
-      ceph_assert(rec_info.rec_id < MAX_REC_IN_BLOCK);
+      if (unlikely(rec_info.rec_id >= MAX_REC_IN_BLOCK)) {
+        p_stats->illegal_rec_id++;
+      }
       ldpp_dout(dpp, 20)  << __func__ << "::" << p_rec->bucket_name << "/"
                           << p_rec->obj_name << " was written to block_idx="
                           << rec_info.block_id << "::rec_id=" << (int)rec_info.rec_id
-                          << "::shared_manifest=" << p_rec->has_shared_manifest() << dendl;
+                          << "::shared_manifest="
+                          << p_rec->s.flags.has_shared_manifest() << dendl;
       p_table->update_entry(&key_from_bucket_index, rec_info.block_id,
-                            rec_info.rec_id, p_rec->has_shared_manifest());
+                            rec_info.rec_id, p_rec->s.flags.has_shared_manifest());
     }
     else {
       ldpp_dout(dpp, 5) << __func__ << "::ERR: Failed p_disk->add_record()"<< dendl;
@@ -1198,17 +1489,18 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  static int write_blake3_object_attribute(const DoutPrefixProvider* const dpp,
-                                           rgw::sal::Driver* driver,
-                                           rgw::sal::RadosStore *store,
-                                           const disk_record_t *p_rec)
+  static int write_hash_object_attribute(const DoutPrefixProvider* const dpp,
+                                         rgw::sal::Driver* driver,
+                                         rgw::sal::RadosStore *store,
+                                         const disk_record_t *p_rec,
+                                         md5_stats_t *p_stats)
   {
     bufferlist etag_bl;
     bufferlist hash_bl;
     librados::ObjectWriteOperation op;
     etag_to_bufferlist(p_rec->s.md5_high, p_rec->s.md5_low, p_rec->s.num_parts,
                        &etag_bl);
-    init_cmp_pairs(p_rec, etag_bl, hash_bl /*OUT PARAM*/, &op);
+    init_cmp_pairs(dpp, p_rec, etag_bl, hash_bl /*OUT PARAM*/, &op);
     op.setxattr(RGW_ATTR_BLAKE3, hash_bl);
 
     std::string oid;
@@ -1224,9 +1516,420 @@ namespace rgw::dedup {
       ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ioctx.operate("
                         << oid << "), err is " << cpp_strerror(-ret) << dendl;
     }
+    ldpp_dout(dpp, 20) << __func__ <<"::Write Strong Hash to " << p_rec->obj_name
+                       << dendl;
+    p_stats->set_hash_attrs++;
     return ret;
   }
 
+  //---------------------------------------------------------------------------
+  static bool compare_strong_hash(const DoutPrefixProvider *const dpp,
+                                  const disk_record_t *p_src_rec,
+                                  const disk_record_t *p_tgt_rec,
+                                  md5_stats_t *p_stats)
+  {
+    if (unlikely(0 != memcmp(p_src_rec->s.hash, p_tgt_rec->s.hash, sizeof(p_src_rec->s.hash)))) {
+      p_stats->hash_mismatch++;
+      ldpp_dout(dpp, 10) << __func__ << "::HASH mismatch" << dendl;
+      return false;
+    }
+    ldpp_dout(dpp, 20) << __func__ << "::SRC-TGT Strong-Hash match" << dendl;
+    // all is good
+    return true;
+  }
+
+  //---------------------------------------------------------------------------
+  static int read_hash_and_manifest(const DoutPrefixProvider *const dpp,
+                                    rgw::sal::Driver *driver,
+                                    RGWRados *rados,
+                                    disk_record_t *p_rec)
+  {
+    librados::IoCtx ioctx;
+    std::string oid;
+    int ret = get_ioctx(dpp, driver, rados, p_rec, &ioctx, &oid);
+    if (unlikely(ret != 0)) {
+      ldpp_dout(dpp, 5) << __func__ << "::ERR: failed get_ioctx()" << dendl;
+      return ret;
+    }
+
+    std::map<std::string, bufferlist> attrset;
+    ret = ioctx.getxattrs(oid, attrset);
+    if (unlikely(ret < 0)) {
+      ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ioctx.getxattrs("
+                        << oid << "), err is " << cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+
+    auto itr = attrset.find(RGW_ATTR_BLAKE3);
+    if (itr != attrset.end()) {
+      try {
+        auto bl_iter = itr->second.cbegin();
+        // BLAKE3 hash is 256 bits split into multiple 64-bit units
+        for (unsigned i = 0; i < HASH_UNITS; i++) {
+          uint64_t val;
+          ceph::decode(val, bl_iter);
+          p_rec->s.hash[i] = val;
+        }
+        p_rec->s.flags.set_has_valid_hash();
+        // the hash was taken directly from the object attributes and not calculated
+        p_rec->s.flags.clear_hash_calculated();
+      } catch (buffer::error& err) {
+        ldpp_dout(dpp, 1) << __func__ << "::ERR: failed HASH decode" << dendl;
+        return -EINVAL;
+      }
+    }
+    else {
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: No HASH attribute" << dendl;
+      return -ENOENT;
+    }
+
+    itr = attrset.find(RGW_ATTR_MANIFEST);
+    if (itr != attrset.end()) {
+      ldpp_dout(dpp, 20) << __func__ << "::Got Manifest " << p_rec->obj_name << dendl;
+      p_rec->manifest_bl = itr->second;
+      p_rec->s.manifest_len = p_rec->manifest_bl.length();
+    }
+    else {
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: No Manifest attribute" << dendl;
+      return -ENOENT;
+    }
+
+    return 0;
+  }
+
+  //---------------------------------------------------------------------------
+  static void set_explicit_manifest(RGWObjManifest *p_manifest,
+                                    std::map<uint64_t, RGWObjManifestPart> &objs_map)
+  {
+    uint64_t obj_size = p_manifest->get_obj_size();
+    p_manifest->set_head_size(0);
+    p_manifest->set_max_head_size(0);
+    p_manifest->set_prefix("");
+    p_manifest->clear_rules();
+    p_manifest->set_explicit(obj_size, objs_map);
+  }
+
+  //---------------------------------------------------------------------------
+  // This code is based on RGWObjManifest::convert_to_explicit()
+  static void build_explicit_objs_map(const DoutPrefixProvider *dpp,
+                                      RGWRados *rados,
+                                      const RGWObjManifest &manifest,
+                                      const rgw_bucket *p_bucket,
+                                      std::map<uint64_t, RGWObjManifestPart> *p_objs_map,
+                                      const std::string &tail_name,
+                                      md5_stats_t *p_stats)
+  {
+    bool manifest_raw_obj_logged = false;
+    unsigned idx = 0;
+    auto p = manifest.obj_begin(dpp);
+    while (p != manifest.obj_end(dpp)) {
+      const uint64_t offset = p.get_stripe_ofs();
+      const rgw_obj_select& os = p.get_location();
+      ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"]OBJ: "
+                         << os.get_raw_obj(rados).oid << "::ofs=" << p.get_ofs()
+                         << "::strp_offset=" << offset << dendl;
+
+      RGWObjManifestPart& part = (*p_objs_map)[offset];
+      part.loc_ofs = 0;
+
+      if (offset == 0) {
+        ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] HEAD OBJ: "
+                           << os.get_raw_obj(rados).oid << dendl;
+        const rgw_obj &head_obj = manifest.get_obj();
+        const rgw_obj_key &head_key = head_obj.key;
+        // TBD: Can we have different instance/ns values for head/tail ??
+        // Should we take the instance/ns from the head or tail?
+        // Maybe should refuse objects with different instance/ns on head/tail ?
+        rgw_obj_key tail_key(tail_name, head_key.instance, head_key.ns);
+        rgw_obj tail_obj(*p_bucket, tail_key);
+        part.loc = tail_obj;
+      }
+      else {
+        // RGWObjManifest::convert_to_explicit() is assuming raw_obj, but looking
+        // at the RGWObjManifest::obj_iterator code it is clear the obj is not raw.
+        // If it happens to be raw we still handle it correctly (and inc stat-count)
+        std::optional<rgw_obj> obj_opt = os.get_head_obj();
+        if (obj_opt.has_value()) {
+          part.loc = obj_opt.value();
+        }
+        else {
+          // report raw object in manifest only once
+          if (!manifest_raw_obj_logged) {
+            manifest_raw_obj_logged = true;
+            ldpp_dout(dpp, 10) << __func__ << "::WARN: obj is_raw" << dendl;
+            p_stats->manifest_raw_obj++;
+          }
+          const rgw_raw_obj& raw = os.get_raw_obj(rados);
+          RGWSI_Tier_RADOS::raw_obj_to_obj(*p_bucket, raw, &part.loc);
+        }
+      }
+
+      ++p;
+      uint64_t next_offset = p.get_stripe_ofs();
+      part.size = next_offset - offset;
+      idx++;
+    } // while (p != manifest.obj_end())
+  }
+
+  //---------------------------------------------------------------------------
+  int Background::split_head_object(disk_record_t *p_src_rec, // IN-OUT PARAM
+                                    RGWObjManifest &src_manifest, // IN/OUT PARAM
+                                    const disk_record_t *p_tgt_rec,
+                                    std::string &tail_oid, // OUT PARAM
+                                    md5_stats_t *p_stats)
+  {
+    ldpp_dout(dpp, 20) << __func__ << "::" << p_src_rec->obj_name << "::"
+                       << p_src_rec->s.obj_bytes_size << dendl;
+
+    uint64_t head_size = src_manifest.get_head_size();
+    bufferlist bl;
+    std::string head_oid;
+    librados::IoCtx ioctx;
+    int ret = get_ioctx(dpp, driver, rados, p_src_rec, &ioctx, &head_oid);
+    if (unlikely(ret != 0)) {
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed get_ioctx()" << dendl;
+      return ret;
+    }
+
+    // read the full rados head-object
+    ldpp_dout(dpp, 20) << __func__ << "::ioctx.read(" << head_oid << ")" << dendl;
+    ret = ioctx.read(head_oid, bl, 0, 0);
+    if (unlikely(ret != (int)head_size)) {
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to read " << head_oid
+                        << ", ret=" << ret << ", error is " << cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+
+    // we might have a valid hash left from a failed dedup (mismatch SRC/TGT)
+    if (!p_src_rec->s.flags.has_valid_hash()) {
+      ldpp_dout(dpp, 20) << __func__ << "::calc BLK3 for SRC "
+                         << p_src_rec->obj_name << dendl;
+      blake3_hasher hmac;
+      blake3_hasher_init(&hmac);
+      for (const auto& bptr : bl.buffers()) {
+        blake3_hasher_update(&hmac, (const unsigned char *)bptr.c_str(),
+                             bptr.length());
+      }
+      uint8_t *p_hash = (uint8_t*)p_src_rec->s.hash;
+      ret = calc_object_blake3(src_manifest, p_src_rec, p_hash, &hmac);
+      if (unlikely(ret != 0)) {
+        return ret;
+      }
+
+      // cancel split-head operation if strong hash differ
+      if (unlikely(!compare_strong_hash(dpp, p_src_rec, p_tgt_rec, p_stats))) {
+        return -ECANCELED;
+      }
+    }
+
+    bool exclusive = true; // block overwrite
+    std::string tail_name = generate_split_head_tail_name(src_manifest);
+    const rgw_bucket_placement &tail_placement = src_manifest.get_tail_placement();
+    // Tail placement_rule was fixed before being committed to SLAB; if it looks bad -> abort
+    if (unlikely(invalid_tail_placement(tail_placement))) {
+      p_stats->split_head_no_tail_placement++;
+      ldpp_dout(dpp, 1) << __func__ << "::invalid_tail_placement -> abort" << dendl;
+      return -EINVAL;
+    }
+
+    const rgw_bucket *p_bucket = &tail_placement.bucket;
+    // tail objects might be on another storage_class/pool, need another ioctx
+    librados::IoCtx tail_ioctx;
+    ret = get_ioctx_internal(dpp, driver, store, tail_name, p_src_rec->instance,
+                             *p_bucket, &tail_ioctx, &tail_oid);
+    if (unlikely(ret != 0)) {
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed get_ioctx_internal()" << dendl;
+      return ret;
+    }
+
+    ret = tail_ioctx.create(tail_oid, exclusive);
+    if (ret == 0) {
+      ldpp_dout(dpp, 20) << __func__ << "::successfully created: " << tail_oid << dendl;
+    }
+    else if (ret == -EEXIST) {
+      // should not happen as we take the prefix with unused counter 0
+      // better to skip this dedup opportunity
+      ldpp_dout(dpp, 1) << __func__ << "::ERR object " << tail_oid << " exists!" << dendl;
+      p_stats->failed_split_head_creat++;
+      return ret;
+    }
+    else{
+      ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to create " << tail_oid
+                        <<" with: "<< cpp_strerror(-ret) << ", ret=" << ret <<dendl;
+      return ret;
+    }
+
+    ret = tail_ioctx.write_full(tail_oid, bl);
+    if (unlikely(ret < 0)) {
+      ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to write " << tail_oid
+                        << " with: " << cpp_strerror(-ret) << dendl;
+      // don't leave orphan object behind
+      tail_ioctx.remove(tail_oid);
+      return ret;
+    }
+    else {
+      ldpp_dout(dpp, 20) << __func__ << "::wrote tail obj:" << tail_oid << "::ret="
+                         << ret << dendl;
+    }
+
+    std::map<uint64_t, RGWObjManifestPart> objs_map;
+    build_explicit_objs_map(dpp, rados, src_manifest, p_bucket, &objs_map,
+                            tail_name, p_stats);
+    set_explicit_manifest(&src_manifest, objs_map);
+
+    bufferlist manifest_bl;
+    encode(src_manifest, manifest_bl);
+    p_src_rec->manifest_bl = manifest_bl;
+    p_src_rec->s.manifest_len = p_src_rec->manifest_bl.length();
+    p_src_rec->s.flags.set_split_head();
+    return ret;
+  }
+
+  //---------------------------------------------------------------------------
+  // Ensure both SRC and TGT records carry a valid strong (BLAKE3) hash and
+  // that the hashes match before dedup of this pair proceeds.
+  // Side effects:
+  //  * may read the TGT object data to calculate its hash
+  //  * may re-read the SRC hash/manifest from the head-object attributes,
+  //    re-decoding @src_manifest from the stored copy
+  //  * may split the SRC head-object, emitting the new tail into @tail_oid
+  // Returns true when dedup of this SRC/TGT pair may continue, false to skip.
+  bool Background::check_and_set_strong_hash(disk_record_t *p_src_rec,
+                                             disk_record_t *p_tgt_rec,
+                                             RGWObjManifest &src_manifest,
+                                             const RGWObjManifest &tgt_manifest,
+                                             const dedup_table_t::value_t *p_src_val,
+                                             std::string &tail_oid, // OUT PARAM
+                                             md5_stats_t *p_stats)
+  {
+    int ret = 0;
+    // if we don't have a valid strong hash already -> read data and calculate it!
+    if (!p_tgt_rec->s.flags.has_valid_hash()) {
+      ldpp_dout(dpp, 20) << __func__ << "::CALC TGT Strong Hash::"
+                         << p_tgt_rec->obj_name << dendl;
+      ret = calc_object_blake3(tgt_manifest, p_tgt_rec, (uint8_t*)p_tgt_rec->s.hash);
+      if (unlikely(ret != 0)) {
+        // Don't run dedup without a valid strong hash
+        return false;
+      }
+    }
+
+    // SRC hash could have been calculated and stored in obj-attributes before
+    // (will happen when we got multiple targets)
+    if (!p_src_rec->s.flags.has_valid_hash() && p_src_val->has_valid_hash()) {
+      // read the manifest and strong hash from the head-object attributes
+      ldpp_dout(dpp, 20) << __func__ << "::Fetch SRC strong hash from head-object::"
+                         << p_src_rec->obj_name << dendl;
+      if (unlikely(read_hash_and_manifest(dpp, driver, rados, p_src_rec) != 0)) {
+        return false;
+      }
+      // the manifest stored with the attrs replaces the one we were handed
+      try {
+        auto bl_iter = p_src_rec->manifest_bl.cbegin();
+        decode(src_manifest, bl_iter);
+      } catch (buffer::error& err) {
+        ldpp_dout(dpp, 1)  << __func__ << "::ERR: failed manifest decode" << dendl;
+        return false;
+      }
+    }
+
+    // check hash before trying to split head (can be skipped if not equal)
+    if (p_src_rec->s.flags.has_valid_hash()) {
+      if (unlikely(!compare_strong_hash(dpp, p_src_rec, p_tgt_rec, p_stats))) {
+        return false;
+      }
+    }
+
+    // we might still need to split-head here when hash is valid
+    // can happen if we failed compare before (md5-collision) and stored the src hash
+    // in the obj-attributes
+    uint64_t head_size = src_manifest.get_head_size();
+    if (should_split_head(head_size, src_manifest.get_obj_size())) {
+      ret = split_head_object(p_src_rec, src_manifest, p_tgt_rec, tail_oid, p_stats);
+      // compare_strong_hash() is called internally by split_head_object()
+      return (ret == 0);
+    }
+    else if (!p_src_rec->s.flags.has_valid_hash()) {
+      // an object not targeted for split_head should have a valid hash -> skip it
+      ldpp_dout(dpp, 5)  << __func__
+                         << "::ERR: object not targeted for split_head has no hash" << dendl;
+      p_stats->invalid_hash_no_split_head++;
+      return false;
+    }
+
+    return true;
+  }
+
+  //---------------------------------------------------------------------------
+  static bool parse_manifests(const DoutPrefixProvider *dpp,
+                              const disk_record_t *p_src_rec,
+                              const disk_record_t *p_tgt_rec,
+                              RGWObjManifest      *p_src_manifest,
+                              RGWObjManifest      *p_tgt_manifest)
+  {
+    bool valid_src_manifest = false;
+    try {
+      auto bl_iter = p_src_rec->manifest_bl.cbegin();
+      decode(*p_src_manifest, bl_iter);
+      valid_src_manifest = true;
+      bl_iter = p_tgt_rec->manifest_bl.cbegin();
+      decode(*p_tgt_manifest, bl_iter);
+    } catch (buffer::error& err) {
+      ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad "
+                        << (valid_src_manifest? "TGT" : "SRC")
+                        << " manifest" << dendl;
+      return -EINVAL;
+    }
+
+    return 0;
+  }
+
+  //---------------------------------------------------------------------------
+  static bool has_shared_tail_objects(const DoutPrefixProvider *dpp,
+                                      RGWRados             *rados,
+                                      const disk_record_t  *p_src_rec,
+                                      const disk_record_t  *p_tgt_rec,
+                                      const RGWObjManifest &src_manifest,
+                                      const RGWObjManifest &tgt_manifest,
+                                      md5_stats_t          *p_stats)
+  {
+    // Build a vector with all tail-objects on the SRC and then iterate over
+    // the TGT tail-objects looking for a single tail-object in both manifets.
+    // If found -> abort the dedup
+    // The only case leading to this scenario is server-side-copy
+    // It is probably enough to scan the first few tail-objects, but better safe...
+    std::string src_oid = build_oid(p_src_rec->bucket_id, p_src_rec->obj_name);
+    std::string tgt_oid = build_oid(p_tgt_rec->bucket_id, p_tgt_rec->obj_name);
+    std::vector<std::string> vec;
+    unsigned idx = 0;
+    for (auto p = src_manifest.obj_begin(dpp); p != src_manifest.obj_end(dpp); ++p, ++idx) {
+      rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
+      if (src_oid != raw_obj.oid) {
+        ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"]" << raw_obj.oid << dendl;
+        vec.push_back(raw_obj.oid);
+      }
+      else {
+        ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: "
+                           << raw_obj.oid << dendl;
+        continue;
+      }
+    }
+    idx = 0;
+    for (auto p = tgt_manifest.obj_begin(dpp); p != tgt_manifest.obj_end(dpp); ++p, ++idx) {
+      rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
+      if (tgt_oid != raw_obj.oid) {
+        ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"]" << raw_obj.oid << dendl;
+        // Search for the tail_obj in the vector
+        // should be one of the first entries (first or second)
+        auto itr = std::find(vec.begin(), vec.end(), raw_obj.oid);
+        if (unlikely(itr != vec.end())) {
+          ldpp_dout(dpp, 10) << __func__ << "::tail obj " << raw_obj.oid
+                             << " exists on both SRC and TGT Objects -> Abort DEDUP!"<< dendl;
+          p_stats->skip_shared_tail_objs ++;
+          return true;
+        }
+      }
+      else {
+        ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: "
+                           << raw_obj.oid << dendl;
+        continue;
+      }
+    }
+
+    return false;
+  }
+
   //---------------------------------------------------------------------------
   // We purged all entries not marked for-dedup (i.e. singleton bit is set) from the table
   //   so all entries left are sources of dedup with multiple copies.
@@ -1238,35 +1941,39 @@ namespace rgw::dedup {
   // we can withstand most errors moving to the next object
  // only report an error if we received a stop scan request!
   //
-  int Background::try_deduping_record(dedup_table_t       *p_table,
-                                      const disk_record_t *p_tgt_rec,
-                                      disk_block_id_t      block_id,
-                                      record_id_t          rec_id,
-                                      md5_shard_t          md5_shard,
-                                      md5_stats_t         *p_stats, /* IN-OUT */
-                                      remapper_t          *remapper)
-  {
-    bool should_print_debug = cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>();
+  int Background::try_deduping_record(dedup_table_t   *p_table,
+                                      disk_record_t   *p_tgt_rec,
+                                      disk_block_id_t  block_id,
+                                      record_id_t      rec_id,
+                                      md5_shard_t      md5_shard,
+                                      md5_stats_t     *p_stats, /* IN-OUT */
+                                      remapper_t      *remapper)
+  {
+    bool should_print_debug = cct->_conf->subsys.should_gather<ceph_subsys_rgw_dedup, 20>();
     if (unlikely(should_print_debug)) {
       print_record(dpp, p_tgt_rec, block_id, rec_id, md5_shard);
     }
-
     uint32_t size_4k_units = byte_size_to_disk_blocks(p_tgt_rec->s.obj_bytes_size);
     storage_class_idx_t sc_idx = remapper->remap(p_tgt_rec->stor_class, dpp,
                                                  &p_stats->failed_map_overflow);
-    ceph_assert(sc_idx != remapper_t::NULL_IDX);
+    if (unlikely(sc_idx == remapper_t::NULL_IDX)) {
+      ldpp_dout(dpp, 5) << __func__ << "::invalid_storage_class_mapping for "
+                        << p_tgt_rec->stor_class << "::" << p_tgt_rec->obj_name << dendl;
+      p_stats->invalid_storage_class_mapping++;
+      return 0;
+    }
     key_t key(p_tgt_rec->s.md5_high, p_tgt_rec->s.md5_low, size_4k_units,
               p_tgt_rec->s.num_parts, sc_idx);
     dedup_table_t::value_t src_val;
     int ret = p_table->get_val(&key, &src_val);
-    if (ret != 0) {
+    if (unlikely(ret != 0)) {
       // record has no valid entry in table because it is a singleton
       // should never happened since we purged all singletons before
       ldpp_dout(dpp, 5) << __func__ << "::skipped singleton::" << p_tgt_rec->bucket_name
                         << "/" << p_tgt_rec->obj_name << "::num_parts=" << p_tgt_rec->s.num_parts
                         << "::ETAG=" << std::hex << p_tgt_rec->s.md5_high
                         << p_tgt_rec->s.md5_low << std::dec << dendl;
-      ceph_abort("Unexpcted singleton");
+      p_stats->singleton_after_purge++;
       return 0;
     }
 
@@ -1275,86 +1982,115 @@ namespace rgw::dedup {
     if (block_id == src_block_id && rec_id == src_rec_id) {
       // the table entry point to this record which means it is a dedup source so nothing to do
       p_stats->skipped_source_record++;
-      ldpp_dout(dpp, 20) << __func__ << "::skipped source-record" << dendl;
+      ldpp_dout(dpp, 20) << __func__ << "::(2)skipped source-record, block_id="
+                         << block_id << "::rec_id=" << (int)rec_id << dendl;
       return 0;
     }
 
-    // ceph store full blocks so need to round up and multiply by block_size
-    uint64_t ondisk_byte_size = disk_blocks_to_byte_size(size_4k_units);
-    uint64_t dedupable_objects_bytes = calc_deduped_bytes(d_head_object_size,
-                                                          p_tgt_rec->s.num_parts,
-                                                          ondisk_byte_size);
+    // should never happen
     if (p_tgt_rec->s.flags.has_shared_manifest()) {
       // record holds a shared_manifest object so can't be a dedup target
-      p_stats->skipped_shared_manifest++;
-      p_stats->shared_manifest_dedup_bytes += dedupable_objects_bytes;
-      ldpp_dout(dpp, 20) << __func__ << "::skipped shared_manifest" << dendl;
+      ldpp_dout(dpp, 1) << __func__ << "::(3)skipped shared_manifest, block_id="
+                        << block_id << "::rec_id=" << (int)rec_id << dendl;
+      p_stats->shared_manifest_after_purge++;
       return 0;
     }
 
+    // ceph store full blocks so need to round up and multiply by block_size
+    uint64_t ondisk_byte_size = disk_blocks_to_byte_size(size_4k_units);
+    uint64_t dedupable_objects_bytes = __calc_deduped_bytes(p_tgt_rec->s.num_parts,
+                                                            ondisk_byte_size);
+
     // This records is a dedup target with source record on source_block_id
-    disk_record_t src_rec;
-    ret = load_record(d_dedup_cluster_ioctx, p_tgt_rec, &src_rec, src_block_id,
+    disk_record_t src_rec, *p_src_rec = &src_rec;
+    ret = load_record(d_dedup_cluster_ioctx, p_tgt_rec, p_src_rec, src_block_id,
                       src_rec_id, md5_shard, dpp);
     if (unlikely(ret != 0)) {
       p_stats->failed_src_load++;
       // we can withstand most errors moving to the next object
       ldpp_dout(dpp, 5) << __func__ << "::ERR: Failed load_record("
-                        << src_block_id << ", " << src_rec_id << ")" << dendl;
+                        << src_block_id << ", " << (int)src_rec_id << ")" << dendl;
       return 0;
     }
 
-    ldpp_dout(dpp, 20) << __func__ << "::SRC=" << src_rec.bucket_name
-                       << "/" << src_rec.obj_name << dendl;
+    ldpp_dout(dpp, 20) << __func__ << "::SRC:" << p_src_rec->bucket_name << "/"
+                       << p_src_rec->obj_name << "::TGT:" << p_tgt_rec->bucket_name
+                       << "/" << p_tgt_rec->obj_name << dendl;
     // verify that SRC and TGT records don't refer to the same physical object
     // This could happen in theory if we read the same objects twice
-    if (src_rec.ref_tag == p_tgt_rec->ref_tag) {
+    if (p_src_rec->ref_tag == p_tgt_rec->ref_tag) {
       p_stats->duplicate_records++;
       ldpp_dout(dpp, 10) << __func__ << "::WARN::REF_TAG::Duplicate records for "
-                         << src_rec.obj_name << "::" << src_rec.ref_tag << "::"
+                         << p_src_rec->obj_name << "::" << p_src_rec->ref_tag <<"::"
                          << p_tgt_rec->obj_name << dendl;
       return 0;
     }
 
     // the hash table size is rounded to the nearest 4KB and will wrap after 16G
-    if (unlikely(src_rec.s.obj_bytes_size != p_tgt_rec->s.obj_bytes_size)) {
+    if (unlikely(p_src_rec->s.obj_bytes_size != p_tgt_rec->s.obj_bytes_size)) {
       p_stats->size_mismatch++;
       ldpp_dout(dpp, 10) << __func__ << "::WARN: different byte size for objects::"
-                         << src_rec.obj_name << "::" << src_rec.s.obj_bytes_size
+                         << p_src_rec->obj_name << "::" << p_src_rec->s.obj_bytes_size
                          << "::" << p_tgt_rec->obj_name << "::"
                          << p_tgt_rec->s.obj_bytes_size << dendl;
       return 0;
     }
 
-    if (memcmp(src_rec.s.hash, p_tgt_rec->s.hash, sizeof(src_rec.s.hash)) != 0) {
-      p_stats->hash_mismatch++;
-      ldpp_dout(dpp, 10) << __func__ << "::HASH mismatch" << dendl;
-      // TBD: set hash attributes on head objects to save calc next time
-      if (src_rec.s.flags.hash_calculated()) {
-        write_blake3_object_attribute(dpp, driver, store, &src_rec);
-        p_stats->set_hash_attrs++;
+    ret = parse_manifests(dpp, p_src_rec, p_tgt_rec, &src_manifest, &tgt_manifest);
+    if (unlikely(ret != 0)) {
+      return 0;
+    }
+
+    // make sure objects were not created by server-side-copy
+    if (unlikely(has_shared_tail_objects(dpp, rados, p_src_rec, p_tgt_rec, src_manifest, tgt_manifest, p_stats))) {
+      return 0;
+    }
+
+
+    std::string tail_oid;
+    bool success = check_and_set_strong_hash(p_src_rec, p_tgt_rec, src_manifest,
+                                             tgt_manifest, &src_val, tail_oid, p_stats);
+    if (unlikely(!success)) {
+      if (p_src_rec->s.flags.hash_calculated() && !src_val.has_valid_hash()) {
+        // set hash attributes on head objects to save calc next time
+        ldpp_dout(dpp, 20) << __func__ <<"::failed: store valid SRC hash" << dendl;
+        ret = write_hash_object_attribute(dpp, driver, store, p_src_rec, p_stats);
+        if (ret == 0) {
+          ldpp_dout(dpp, 20) << __func__ <<"::mark valid_hash in table" << dendl;
+          p_table->set_src_mode(&key, src_block_id, src_rec_id, false, true);
+        }
       }
       if (p_tgt_rec->s.flags.hash_calculated()) {
-        write_blake3_object_attribute(dpp, driver, store, p_tgt_rec);
-        p_stats->set_hash_attrs++;
+        ldpp_dout(dpp, 20) << __func__ <<"::failed: store valid TGT hash" << dendl;
+        write_hash_object_attribute(dpp, driver, store, p_tgt_rec, p_stats);
       }
       return 0;
     }
 
-    ret = dedup_object(&src_rec, p_tgt_rec, p_stats, src_val.has_shared_manifest());
+    ret = dedup_object(p_src_rec, p_tgt_rec, src_manifest, tgt_manifest, p_stats,
+                       &src_val, tail_oid);
     if (ret == 0) {
+      ldpp_dout(dpp, 20) << __func__ << "::dedup success " << p_src_rec->obj_name << dendl;
       p_stats->deduped_objects++;
       p_stats->deduped_objects_bytes += dedupable_objects_bytes;
-      if (p_tgt_rec->s.num_parts == 0) {
+      if (p_tgt_rec->s.flags.is_split_head()) {
+        ldpp_dout(dpp, 20) << __func__ <<"::TGT-Split: dedup_bytes="
+                           << ondisk_byte_size << dendl;
+        p_stats->split_head_dedup_bytes += ondisk_byte_size;
+      }
+      else if (p_tgt_rec->s.num_parts == 0 &&
+               // if we don't split head it will be duplicated
+               p_tgt_rec->s.obj_bytes_size > d_head_object_size) {
         // single part objects duplicate the head object when dedup is used
         p_stats->dup_head_bytes += d_head_object_size;
       }
 
       // mark the SRC object as a providor of a shared manifest
       if (!src_val.has_shared_manifest()) {
+        ldpp_dout(dpp, 20) << __func__ << "::mark shared_manifest+valid_hash"<< dendl;
         p_stats->set_shared_manifest_src++;
-        // set the shared manifest flag in the dedup table
-        p_table->set_shared_manifest_src_mode(&key, src_block_id, src_rec_id);
+        // We always set strong hash on SRC during dedup so mark in table!
+        p_table->set_src_mode(&key, src_block_id, src_rec_id, true, true);
       }
       else {
         ldpp_dout(dpp, 20) << __func__ << "::SRC object already marked as shared_manifest" << dendl;
@@ -1362,7 +2098,7 @@ namespace rgw::dedup {
     }
     else {
       ldpp_dout(dpp, 10) << __func__ << "::ERR: Failed dedup for "
-                         << src_rec.bucket_name << "/" << src_rec.obj_name << dendl;
+                         << p_src_rec->bucket_name << "/" << p_src_rec->obj_name << dendl;
       p_stats->failed_dedup++;
     }
 
@@ -1498,10 +2234,12 @@ namespace rgw::dedup {
         }
 
         has_more = (p_header->offset == BLOCK_MAGIC);
-        ceph_assert(p_header->offset == BLOCK_MAGIC || p_header->offset == LAST_BLOCK_MAGIC);
         if (!has_more) {
           ldpp_dout(dpp, 20) << __func__ << "::No more blocks! block_id=" << disk_block_id
                              << ", rec_count=" << p_header->rec_count << dendl;
+          if (unlikely(p_header->offset != LAST_BLOCK_MAGIC)) {
+            p_stats->missing_last_block_marker++;
+          }
           break;
         }
       }
@@ -1549,7 +2287,7 @@ namespace rgw::dedup {
     p_worker_stats->ingress_obj_bytes += ondisk_byte_size;
 
     // We limit dedup to objects from the same storage_class
-    // TBD:
+    // TBD-Future:
     // Should we use a skip-list of storage_classes we should skip (like glacier) ?
     const std::string& storage_class =
       rgw_placement_rule::get_canonical_storage_class(entry.meta.storage_class);
@@ -1564,7 +2302,7 @@ namespace rgw::dedup {
       p_worker_stats->non_default_storage_class_objs_bytes += ondisk_byte_size;
     }
 
-    if (ondisk_byte_size <= d_min_obj_size_for_dedup) {
+    if (ondisk_byte_size < d_min_obj_size_for_dedup) {
       if (parsed_etag.num_parts == 0) {
         // dedup only useful for objects bigger than 4MB
         p_worker_stats->ingress_skip_too_small++;
@@ -1802,7 +2540,7 @@ namespace rgw::dedup {
     // make sure that the standard storage_class is always in the mapper!
     storage_class_idx_t sc_idx = remapper.remap(RGW_STORAGE_CLASS_STANDARD, dpp,
                                                 &p_stats->failed_map_overflow);
-    ceph_assert(sc_idx == 0);
+    ceph_assert(sc_idx != remapper_t::NULL_IDX);
     uint32_t slab_count_arr[num_work_shards];
     // first load all etags to hashtable to find dedups
     // the entries come from bucket-index and got minimal info (etag, size)
@@ -2095,7 +2833,8 @@ namespace rgw::dedup {
     utime_t start_time = ceph_clock_now();
     md5_stats_t md5_stats;
     //DEDUP_DYN_ALLOC
-    dedup_table_t table(dpp, d_head_object_size, raw_mem, raw_mem_size);
+    dedup_table_t table(dpp, d_head_object_size, d_min_obj_size_for_dedup,
+                        d_max_obj_size_for_split, raw_mem, raw_mem_size);
     int ret = objects_dedup_single_md5_shard(&table, md5_shard, &md5_stats, num_work_shards);
     if (ret == 0) {
       md5_stats.duration = ceph_clock_now() - start_time;
@@ -2290,6 +3029,7 @@ namespace rgw::dedup {
 
     ldpp_dout(dpp, 10) <<__func__ << "::" << *p_epoch << dendl;
     d_ctl.dedup_type = p_epoch->dedup_type;
+    // TBD: replace with a stat-counter
 #ifdef FULL_DEDUP_SUPPORT
     ceph_assert(d_ctl.dedup_type == dedup_req_type_t::DEDUP_TYPE_EXEC ||
                 d_ctl.dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE);
@@ -2755,7 +3495,6 @@ namespace rgw::dedup {
       }
       d_cond.wait(cond_lock, [this]{return d_ctl.remote_restart_req || d_ctl.should_stop() || d_ctl.should_pause();});
       if (!d_ctl.should_stop() && !d_ctl.should_pause()) {
-        // TBD: should we release lock here ???
         if (d_cluster.can_start_new_scan(store)) {
           d_ctl.dedup_exec = true;
           d_ctl.remote_aborted = false;
index b1df56249e8bfc04760b333483e92f4003c0caee..adca55efebc5c747b2811564e87caf2ac81b1379 100644 (file)
@@ -97,6 +97,8 @@ namespace rgw::dedup {
       STEP_REMOVE_DUPLICATES
     };
 
+    inline uint64_t __calc_deduped_bytes(uint16_t num_parts, uint64_t size_bytes);
+    inline bool should_split_head(uint64_t head_size, uint64_t obj_size);
     void run();
     int  setup(struct dedup_epoch_t*);
     void work_shards_barrier(work_shard_t num_work_shards);
@@ -182,11 +184,18 @@ namespace rgw::dedup {
                            remapper_t *remapper);
 
 #ifdef FULL_DEDUP_SUPPORT
-    int calc_object_blake3(const disk_record_t *p_rec, uint8_t *p_hash);
-    int add_obj_attrs_to_record(rgw_bucket            *p_rb,
-                                disk_record_t         *p_rec,
+    int calc_object_blake3(const RGWObjManifest &manifest,
+                           disk_record_t *p_rec,
+                           uint8_t *p_hash,
+                           blake3_hasher *p_pre_calc_hmac = nullptr);
+    int split_head_object(disk_record_t *p_src_rec,     // IN/OUT PARAM
+                          RGWObjManifest &src_manifest, // IN/OUT PARAM
+                          const disk_record_t *p_tgt_rec,
+                          std::string &tail_oid,        // OUT PARAM
+                          md5_stats_t *p_stats);
+
+    int add_obj_attrs_to_record(disk_record_t         *p_rec,
                                 const rgw::sal::Attrs &attrs,
-                                dedup_table_t         *p_table,
                                 md5_stats_t           *p_stats); /* IN-OUT */
 
     int read_object_attribute(dedup_table_t    *p_table,
@@ -197,26 +206,36 @@ namespace rgw::dedup {
                               md5_stats_t      *p_stats /* IN-OUT */,
                               disk_block_seq_t *p_disk,
                               remapper_t       *remapper);
-    int try_deduping_record(dedup_table_t       *p_table,
-                            const disk_record_t *p_rec,
-                            disk_block_id_t      block_id,
-                            record_id_t          rec_id,
-                            md5_shard_t          md5_shard,
-                            md5_stats_t         *p_stats, /* IN-OUT */
-                            remapper_t          *remapper);
-    int inc_ref_count_by_manifest(const std::string &ref_tag,
-                                  const std::string &oid,
-                                  RGWObjManifest    &manifest);
-    int rollback_ref_by_manifest(const std::string &ref_tag,
-                                 const std::string &oid,
-                                 RGWObjManifest    &tgt_manifest);
-    int free_tail_objs_by_manifest(const std::string &ref_tag,
-                                   const std::string &oid,
-                                   RGWObjManifest    &tgt_manifest);
-    int dedup_object(const disk_record_t *p_src_rec,
-                     const disk_record_t *p_tgt_rec,
-                     md5_stats_t         *p_stats,
-                     bool                 is_shared_manifest_src);
+    bool check_and_set_strong_hash(disk_record_t *p_src_rec, // IN/OUT PARAM
+                                   disk_record_t *p_tgt_rec, // IN/OUT PARAM
+                                   RGWObjManifest &src_manifest,
+                                   const RGWObjManifest &tgt_manifest,
+                                   const dedup_table_t::value_t *p_src_val,
+                                   std::string &tail_oid,    // OUT PARAM
+                                   md5_stats_t *p_stats);
+    int try_deduping_record(dedup_table_t   *p_table,
+                            disk_record_t   *p_rec,
+                            disk_block_id_t  block_id,
+                            record_id_t      rec_id,
+                            md5_shard_t      md5_shard,
+                            md5_stats_t     *p_stats, /* IN-OUT */
+                            remapper_t      *remapper);
+    int inc_ref_count_by_manifest(const std::string    &ref_tag,
+                                  const std::string    &oid,
+                                  const RGWObjManifest &manifest);
+    int rollback_ref_by_manifest(const std::string    &ref_tag,
+                                 const std::string    &oid,
+                                 const RGWObjManifest &tgt_manifest);
+    int free_tail_objs_by_manifest(const std::string    &ref_tag,
+                                   const std::string    &oid,
+                                   const RGWObjManifest &tgt_manifest);
+    int dedup_object(disk_record_t                *p_src_rec,
+                     disk_record_t                *p_tgt_rec,
+                     const RGWObjManifest         &src_manifest,
+                     const RGWObjManifest         &tgt_manifest,
+                     md5_stats_t                  *p_stats,
+                     const dedup_table_t::value_t *p_src_val,
+                     const std::string            &tail_oid);
 #endif
     int  remove_slabs(unsigned worker_id, unsigned md5_shard, uint32_t slab_count);
     int  init_rados_access_handles(bool init_pool);
@@ -235,8 +254,9 @@ namespace rgw::dedup {
     unsigned d_heart_beat_max_elapsed_sec;
     uint64_t d_all_buckets_obj_count   = 0;
     uint64_t d_all_buckets_obj_size    = 0;
-    // we don't benefit from deduping RGW objects smaller than head-object size
-    uint32_t d_min_obj_size_for_dedup = (4ULL * 1024 * 1024);
+
+    uint32_t d_min_obj_size_for_dedup = (64ULL * 1024);
+    uint32_t d_max_obj_size_for_split = (16ULL * 1024 * 1024);
     uint32_t d_head_object_size       = (4ULL * 1024 * 1024);
     control_t d_ctl;
     uint64_t d_watch_handle = 0;
index ebbbec741803ae9cdf5228ad922fc458081ddd71..fafd66176eff99b17562c6ceb107b0563597aec1 100644 (file)
@@ -124,7 +124,7 @@ namespace rgw::dedup {
     ldpp_dout(dpp, 10) << __func__ << "::oid=" << oid << dendl;
     bool exclusive = true; // block overwrite of old objects
     ret = ctl_ioctx.create(oid, exclusive);
-    if (ret >= 0) {
+    if (ret == 0) {
       ldpp_dout(dpp, 10) << __func__ << "::successfully created Epoch object!" << dendl;
       // now try and take ownership
     }
@@ -495,7 +495,7 @@ namespace rgw::dedup {
       ldpp_dout(dpp, 15) << __func__ << "::creating object: " << oid << dendl;
       bool exclusive = true;
       ret = ctl_ioctx.create(oid, exclusive);
-      if (ret >= 0) {
+      if (ret == 0) {
         ldpp_dout(dpp, 15) << __func__ << "::oid=" << oid << " was created!" << dendl;
       }
       else if (ret == -EEXIST) {
@@ -1124,7 +1124,7 @@ namespace rgw::dedup {
     // create the object to watch (object may already exist)
     bool exclusive = true;
     ret = ctl_ioctx.create(oid, exclusive);
-    if (ret >= 0) {
+    if (ret == 0) {
       ldpp_dout(dpp, 10) << "dedup_bg::watch_reload():" << oid
                          << " was created!" << dendl;
     }
index d2b62651c6c9c5a2f73b4e8853e533edfeef5aa3..83fdfe1993172c9984fbc31a631c42102f08b51d 100644 (file)
@@ -123,9 +123,7 @@ namespace rgw::dedup {
     else {
       this->s.shared_manifest = CEPHTOH_64(p_rec->s.shared_manifest);
       // BLAKE3 hash has 256 bit splitted into multiple 64bit units
-      const unsigned units = (256 / (sizeof(uint64_t)*8));
-      static_assert(units == 4);
-      for (unsigned i = 0; i < units; i++) {
+      for (unsigned i = 0; i < HASH_UNITS; i++) {
         this->s.hash[i] = CEPHTOH_64(p_rec->s.hash[i]);
       }
       this->ref_tag = std::string(p, this->s.ref_tag_len);
@@ -189,9 +187,7 @@ namespace rgw::dedup {
     else {
       p_rec->s.shared_manifest = HTOCEPH_64(this->s.shared_manifest);
       // BLAKE3 hash has 256 bit splitted into multiple 64bit units
-      const unsigned units = (256 / (sizeof(uint64_t)*8));
-      static_assert(units == 4);
-      for (unsigned i = 0; i < units; i++) {
+      for (unsigned i = 0; i < HASH_UNITS; i++) {
         p_rec->s.hash[i] = HTOCEPH_64(this->s.hash[i]);
       }
       len = this->ref_tag.length();
@@ -228,7 +224,7 @@ namespace rgw::dedup {
   {
     // optimistic approach
     if (likely((this->s.rec_version == 0) && (this->length() <= MAX_REC_SIZE))) {
-      ldpp_dout(dpp, 20) << __func__ << "::success" << dendl;
+      ldpp_dout(dpp, 20) << caller << "::validate disk_record success" << dendl;
       return 0;
     }
 
@@ -270,14 +266,12 @@ namespace rgw::dedup {
     stream << "MD5       = " << std::hex << rec.s.md5_high << rec.s.md5_low << "\n";
     stream << "HASH      = ";
     // BLAKE3 hash has 256 bit splitted into multiple 64bit units
-    const unsigned units = (256 / (sizeof(uint64_t)*8));
-    static_assert(units == 4);
-    for (unsigned i = 0; i < units; i++) {
+    for (unsigned i = 0; i < HASH_UNITS; i++) {
       stream << rec.s.hash[i];
     }
     stream << "\n";
 
-    if (rec.has_shared_manifest()) {
+    if (rec.s.flags.has_shared_manifest()) {
       stream << "Shared Manifest Object\n";
     }
     else {
@@ -603,19 +597,12 @@ namespace rgw::dedup {
     ceph_assert(bl.length());
 
     int ret = ioctx.write_full(oid, bl);
-    if (ret == (int)bl.length()) {
-      ldpp_dout(dpp, 20) << __func__ << "::wrote " << bl.length() << " bytes to "
-                         << oid << dendl;
+    if (ret == 0) {
+      ldpp_dout(dpp, 20) << __func__ << "::SLAB was written successfully" << dendl;
     }
     else {
-      if (ret == 0) {
-        // no error reported, but we wrote nothing which should never happen
-        ldpp_dout(dpp, 5) << __func__ << "::ERR: No Data was written to " << oid
-                          << ", bl.length()=" << bl.length() << dendl;
-        ret = -ENODATA;
-      }
       ldpp_dout(dpp, 1) << "ERROR: failed to write " << oid
-                        << " with: " << cpp_strerror(-ret) << dendl;
+                        << "::ret=" << ret << "::" << cpp_strerror(-ret) << dendl;
     }
 
     return ret;
index 7bca5d4e70ec0a3d4b655108e5dd8637f26affde..010e54cd45451107edbc3871ee37259999865802 100644 (file)
@@ -38,7 +38,8 @@ namespace rgw::dedup {
 #define HTOCEPH_32 htole32
 #define HTOCEPH_64 htole64
 
-  static inline constexpr unsigned DISK_BLOCK_SIZE  = 8*1024;
+  static constexpr unsigned HASH_UNITS = BLAKE3_OUT_LEN/sizeof(uint64_t);
+  static constexpr unsigned DISK_BLOCK_SIZE  = 8*1024;
   // we use 16 bit offset
   static_assert(DISK_BLOCK_SIZE < 64*1024);
   static constexpr unsigned DISK_BLOCK_COUNT = 256;
@@ -132,6 +133,35 @@ namespace rgw::dedup {
     uint32_t block_id;
   };
 
+  struct __attribute__ ((packed)) record_flags_t {
+  private:
+    static constexpr uint8_t RGW_RECORD_FLAG_HAS_VALID_HASH  = 0x01;
+    static constexpr uint8_t RGW_RECORD_FLAG_SHARED_MANIFEST = 0x02;
+    static constexpr uint8_t RGW_RECORD_FLAG_HASH_CALCULATED = 0x04;
+    static constexpr uint8_t RGW_RECORD_FLAG_FASTLANE        = 0x08;
+    static constexpr uint8_t RGW_RECORD_FLAG_SPLIT_HEAD      = 0x10;
+    static constexpr uint8_t RGW_RECORD_FLAG_TAIL_REFTAG     = 0x20;
+  public:
+    record_flags_t() : flags(0) {}
+    record_flags_t(uint8_t _flags) : flags(_flags) {}
+    inline void clear() { this->flags = 0; }
+    inline bool hash_calculated() const { return ((flags & RGW_RECORD_FLAG_HASH_CALCULATED) != 0); }
+    inline void set_hash_calculated()  { flags |= RGW_RECORD_FLAG_HASH_CALCULATED; }
+    inline void clear_hash_calculated()  { flags &= ~RGW_RECORD_FLAG_HASH_CALCULATED; }
+    inline bool has_valid_hash() const { return ((flags & RGW_RECORD_FLAG_HAS_VALID_HASH) != 0); }
+    inline void set_has_valid_hash()  { flags |= RGW_RECORD_FLAG_HAS_VALID_HASH; }
+    inline bool has_shared_manifest() const { return ((flags & RGW_RECORD_FLAG_SHARED_MANIFEST) != 0); }
+    inline void set_shared_manifest() { flags |= RGW_RECORD_FLAG_SHARED_MANIFEST; }
+    inline bool is_fastlane()  const { return ((flags & RGW_RECORD_FLAG_FASTLANE) != 0); }
+    inline void set_fastlane()  { flags |= RGW_RECORD_FLAG_FASTLANE; }
+    inline bool is_split_head() const { return ((flags & RGW_RECORD_FLAG_SPLIT_HEAD) != 0); }
+    inline void set_split_head() { flags |= RGW_RECORD_FLAG_SPLIT_HEAD; }
+    inline bool is_ref_tag_from_tail() const { return ((flags & RGW_RECORD_FLAG_TAIL_REFTAG) != 0); }
+    inline void set_ref_tag_from_tail() { flags |= RGW_RECORD_FLAG_TAIL_REFTAG; }
+  private:
+    uint8_t flags;
+  };
+
   struct disk_record_t
   {
     disk_record_t(const char *buff);
@@ -148,32 +178,29 @@ namespace rgw::dedup {
                  const DoutPrefixProvider* dpp,
                  disk_block_id_t block_id,
                  record_id_t rec_id) const;
-    inline bool has_shared_manifest() const { return s.flags.has_shared_manifest(); }
-    inline void set_shared_manifest() { s.flags.set_shared_manifest(); }
-
-    struct __attribute__ ((packed)) packed_rec_t
+    inline bool multipart_object() { return (this->s.num_parts > 0); }
+    struct packed_rec_t
     {
-      uint8_t       rec_version;     // allows changing record format
-      dedup_flags_t flags;           // 1 Byte flags
-      uint16_t      num_parts;       // For multipart upload (AWS MAX-PART is 10,000)
-      uint16_t      obj_name_len;
-      uint16_t      bucket_name_len;
-
+      uint64_t      hash[4];         // 4 * 8 Bytes of HASH
+      uint64_t      shared_manifest; // 64bit hash of the SRC object manifest
       uint64_t      md5_high;        // High Bytes of the Object Data MD5
       uint64_t      md5_low;         // Low  Bytes of the Object Data MD5
       uint64_t      obj_bytes_size;
 
+      uint16_t      num_parts;       // For multipart upload (AWS MAX-PART is 10,000)
+      uint16_t      obj_name_len;
+      uint16_t      bucket_name_len;
       uint16_t      bucket_id_len;
+
       uint16_t      tenant_name_len;
       uint16_t      instance_len;
       uint16_t      stor_class_len;
       uint16_t      ref_tag_len;
-
       uint16_t      manifest_len;
-      uint8_t       pad[6];
 
-      uint64_t      shared_manifest; // 64bit hash of the SRC object manifest
-      uint64_t      hash[4];       // 4 * 8 Bytes of BLAKE3
+      uint8_t       rec_version;     // allows changing record format
+      record_flags_t flags;           // 1 Byte flags
+      uint8_t       pad[6];
     }s;
     std::string obj_name;
     // TBD: find pool name making it easier to get ioctx
@@ -186,6 +213,7 @@ namespace rgw::dedup {
     bufferlist  manifest_bl;
   };
   static_assert(BLAKE3_OUT_LEN == sizeof(disk_record_t::packed_rec_t::hash));
+  static_assert(sizeof(disk_record_t::packed_rec_t) == sizeof(uint64_t)*12);
   std::ostream &operator<<(std::ostream &stream, const disk_record_t & rec);
 
   static constexpr unsigned BLOCK_MAGIC = 0xFACE;
index 4f34b27d18edaad34f8b24f682e8cedd7b533850..d86896473a1819ebf8093542a94dc67dbf48c823 100644 (file)
@@ -22,11 +22,15 @@ namespace rgw::dedup {
   //---------------------------------------------------------------------------
   dedup_table_t::dedup_table_t(const DoutPrefixProvider* _dpp,
                                uint32_t _head_object_size,
+                               uint32_t _min_obj_size_for_dedup,
+                               uint32_t _max_obj_size_for_split,
                                uint8_t *p_slab,
                                uint64_t slab_size)
   {
     dpp = _dpp;
     head_object_size = _head_object_size;
+    min_obj_size_for_dedup = _min_obj_size_for_dedup;
+    max_obj_size_for_split = _max_obj_size_for_split;
     memset(p_slab, 0, slab_size);
     hash_tab = (table_entry_t*)p_slab;
     entries_count = slab_size/sizeof(table_entry_t);
@@ -51,7 +55,7 @@ namespace rgw::dedup {
       const key_t &key = hash_tab[tab_idx].key;
       // This is an approximation only since size is stored in 4KB resolution
       uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units);
-      if (!key.multipart_object() && (byte_size_approx <= head_object_size)) {
+      if (!dedupable_object(key.multipart_object(), min_obj_size_for_dedup, byte_size_approx)) {
         hash_tab[tab_idx].val.clear_flags();
         redistributed_clear++;
         continue;
@@ -126,12 +130,16 @@ namespace rgw::dedup {
     }
     else {
       uint64_t dup_bytes_approx = calc_deduped_bytes(head_object_size,
+                                                     min_obj_size_for_dedup,
+                                                     max_obj_size_for_split,
                                                      p_key->num_parts,
                                                      byte_size_approx);
       p_big_objs->duplicate_count ++;
       p_big_objs->dedup_bytes_estimate += dup_bytes_approx;
 
-      if (!p_key->multipart_object()) {
+      // objects smaller than max_obj_size_for_split have their head split
+      // instead of duplicating it
+      if (!key.multipart_object() && byte_size_approx > max_obj_size_for_split) {
         // single part objects duplicate the head object when dedup is used
         *p_duplicate_head_bytes += head_object_size;
       }
@@ -206,23 +214,31 @@ namespace rgw::dedup {
       // replace value!
       value_t new_val(block_id, rec_id, shared_manifest);
       new_val.count = val.count;
-      hash_tab[idx].val = new_val;
       ldpp_dout(dpp, 20) << __func__ << "::Replaced table entry::["
                          << val.block_idx << "/" << (int)val.rec_id << "] -> ["
                          << block_id << "/" << (int)rec_id << "]" << dendl;
+
+      val = new_val;
     }
   }
 
   //---------------------------------------------------------------------------
-  int dedup_table_t::set_shared_manifest_src_mode(const key_t *p_key,
-                                                  disk_block_id_t block_id,
-                                                  record_id_t rec_id)
+  int dedup_table_t::set_src_mode(const key_t *p_key,
+                                  disk_block_id_t block_id,
+                                  record_id_t rec_id,
+                                  bool set_shared_manifest_src,
+                                  bool set_has_valid_hash_src)
   {
     uint32_t idx = find_entry(p_key);
     value_t &val = hash_tab[idx].val;
     if (val.is_occupied()) {
       if (val.block_idx == block_id && val.rec_id == rec_id) {
-        val.set_shared_manifest_src();
+        if (set_shared_manifest_src) {
+          val.set_shared_manifest_src();
+        }
+        if (set_has_valid_hash_src) {
+          val.set_has_valid_hash_src();
+        }
         return 0;
       }
     }
@@ -281,7 +297,7 @@ namespace rgw::dedup {
       uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units);
 
       // skip small single part objects which we can't dedup
-      if (!key.multipart_object() && (byte_size_approx <= head_object_size)) {
+      if (!dedupable_object(key.multipart_object(), min_obj_size_for_dedup, byte_size_approx)) {
         if (hash_tab[tab_idx].val.is_singleton()) {
           p_small_objs->singleton_count++;
         }
index 4a46db6e5b7912f5dd09fde275d87a054a1c7946..501cb20d9aa590951bb45923ade7745c313de9fd 100644 (file)
@@ -63,6 +63,26 @@ namespace rgw::dedup {
   static_assert(sizeof(key_t) == 24);
 
   class dedup_table_t {
+    struct __attribute__ ((packed)) table_flags_t {
+    private:
+      static constexpr uint8_t RGW_TABLE_FLAG_HAS_VALID_HASH  = 0x01;
+      static constexpr uint8_t RGW_TABLE_FLAG_SHARED_MANIFEST = 0x02;
+      static constexpr uint8_t RGW_TABLE_FLAG_OCCUPIED        = 0x04;
+    public:
+      table_flags_t() : flags(0) {}
+      table_flags_t(uint8_t _flags) : flags(_flags) {}
+      inline void clear() { this->flags = 0; }
+      inline bool has_valid_hash() const { return ((flags & RGW_TABLE_FLAG_HAS_VALID_HASH) != 0); }
+      inline void set_has_valid_hash()  { flags |= RGW_TABLE_FLAG_HAS_VALID_HASH; }
+      inline bool has_shared_manifest() const { return ((flags & RGW_TABLE_FLAG_SHARED_MANIFEST) != 0); }
+      inline void set_shared_manifest() { flags |= RGW_TABLE_FLAG_SHARED_MANIFEST; }
+      inline bool is_occupied() const {return ((this->flags & RGW_TABLE_FLAG_OCCUPIED) != 0); }
+      inline void set_occupied() {this->flags |= RGW_TABLE_FLAG_OCCUPIED; }
+      inline void clear_occupied() { this->flags &= ~RGW_TABLE_FLAG_OCCUPIED; }
+    private:
+      uint8_t flags;
+    };
+
   public:
     // 8 Bytes Value
     struct value_t {
@@ -93,6 +113,8 @@ namespace rgw::dedup {
       inline void inc_count() { count ++; }
       inline void reset_count() { count = 0; }
       inline void clear_flags() { flags.clear(); }
+      inline bool has_valid_hash() const {return flags.has_valid_hash(); }
+      inline void set_has_valid_hash_src() { this->flags.set_has_valid_hash(); }
       inline bool is_singleton() const { return (count == 1); }
       inline bool is_occupied() const { return flags.is_occupied(); }
       inline void set_occupied() { this->flags.set_occupied();  }
@@ -102,12 +124,14 @@ namespace rgw::dedup {
       disk_block_id_t block_idx; // 32 bits
       uint16_t        count;     // 16 bits
       record_id_t     rec_id;    //  8 bits
-      dedup_flags_t   flags;     //  8 bits
+      table_flags_t   flags;     //  8 bits
     } __attribute__((__packed__));
     static_assert(sizeof(value_t) == 8);
 
     dedup_table_t(const DoutPrefixProvider* _dpp,
                   uint32_t _head_object_size,
+                  uint32_t _min_obj_size_for_dedup,
+                  uint32_t _max_obj_size_for_split,
                   uint8_t *p_slab,
                   uint64_t slab_size);
     int add_entry(key_t *p_key,
@@ -129,6 +153,12 @@ namespace rgw::dedup {
                                      disk_block_id_t block_id,
                                      record_id_t rec_id);
 
+    int set_src_mode(const key_t *p_key,
+                     disk_block_id_t block_id,
+                     record_id_t rec_id,
+                     bool set_shared_manifest_src,
+                     bool set_has_valid_hash_src);
+
     void count_duplicates(dedup_stats_t *p_small_objs_stat,
                           dedup_stats_t *p_big_objs_stat);
 
@@ -145,7 +175,9 @@ namespace rgw::dedup {
     uint32_t       values_count = 0;
     uint32_t       entries_count = 0;
     uint32_t       occupied_count = 0;
-    uint32_t       head_object_size = (4ULL * 1024 * 1024);
+    uint32_t       head_object_size;
+    uint32_t       min_obj_size_for_dedup;
+    uint32_t       max_obj_size_for_split;
     table_entry_t *hash_tab = nullptr;
 
     // stat counters
index 61ad6b91c516ef6cff68dc0ddca89000fefdf44c..74252a853950346028f9eea0896acb531cec1559 100644 (file)
@@ -14,8 +14,8 @@
 
 #include "rgw_dedup_utils.h"
 #include "common/ceph_crypto.h"
-
 namespace rgw::dedup {
+
   //---------------------------------------------------------------------------
   std::ostream& operator<<(std::ostream &out, const dedup_req_type_t& dedup_type)
   {
@@ -566,10 +566,27 @@ namespace rgw::dedup {
     this->failed_rec_load         += other.failed_rec_load;
     this->failed_block_load       += other.failed_block_load;
 
+    this->different_storage_class       += other.different_storage_class;
+    this->invalid_hash_no_split_head    += other.invalid_hash_no_split_head;
+    this->invalid_storage_class_mapping += other.invalid_storage_class_mapping;
+    this->singleton_after_purge         += other.singleton_after_purge;
+    this->shared_manifest_after_purge   += other.shared_manifest_after_purge;
+    this->split_head_no_tail_placement  += other.split_head_no_tail_placement;
+    this->illegal_rec_id                += other.illegal_rec_id;
+    this->missing_last_block_marker     += other.missing_last_block_marker;
+
     this->valid_hash_attrs        += other.valid_hash_attrs;
     this->invalid_hash_attrs      += other.invalid_hash_attrs;
     this->set_hash_attrs          += other.set_hash_attrs;
     this->skip_hash_cmp           += other.skip_hash_cmp;
+    this->manifest_raw_obj        += other.manifest_raw_obj;
+    this->manifest_no_tail_placement += other.manifest_no_tail_placement;
+    this->rollback_tail_obj       += other.rollback_tail_obj;
+    this->failed_split_head_creat += other.failed_split_head_creat;
+    this->skip_shared_tail_objs   += other.skip_shared_tail_objs;
+    this->split_head_src          += other.split_head_src;
+    this->split_head_tgt          += other.split_head_tgt;
+    this->split_head_dedup_bytes  += other.split_head_dedup_bytes;
 
     this->set_shared_manifest_src += other.set_shared_manifest_src;
     this->loaded_objects          += other.loaded_objects;
@@ -659,9 +676,30 @@ namespace rgw::dedup {
         f->dump_unsigned("Set HASH", this->set_hash_attrs);
       }
 
+      if (this->skip_shared_tail_objs) {
+        f->dump_unsigned("Skip Shared Tail Objs (server-side-copy)", this->skip_shared_tail_objs);
+      }
       if (this->skip_hash_cmp) {
         f->dump_unsigned("Can't run HASH compare", this->skip_hash_cmp);
       }
+      if (this->manifest_raw_obj) {
+        f->dump_unsigned("Manifest has RAW OBJ", this->manifest_raw_obj);
+      }
+      if (this->manifest_no_tail_placement) {
+        f->dump_unsigned("Manifest has no tail placement", this->manifest_no_tail_placement);
+      }
+      if (this->rollback_tail_obj) {
+        f->dump_unsigned("Rollback tail obj", this->rollback_tail_obj);
+      }
+      if (this->split_head_src) {
+        f->dump_unsigned("Split-Head Src OBJ", this->split_head_src);
+      }
+      if (this->split_head_tgt) {
+        f->dump_unsigned("Split-Head Tgt OBJ", this->split_head_tgt);
+      }
+      if (this->split_head_dedup_bytes) {
+        f->dump_unsigned("Split-Head Dedup-Bytes", this->split_head_dedup_bytes);
+      }
     }
 
     {
@@ -716,6 +754,18 @@ namespace rgw::dedup {
       if (this->failed_block_load) {
         f->dump_unsigned("Failed Block-Load ", this->failed_block_load);
       }
+
+      if (this->illegal_rec_id) {
+        f->dump_unsigned("Failed illegal_rec_id", this->illegal_rec_id );
+      }
+      if (this->missing_last_block_marker) {
+        f->dump_unsigned("Failed missing_last_block_marker in rec",
+                         this->missing_last_block_marker);
+      }
+
+      if (this->failed_split_head_creat) {
+        f->dump_unsigned("Failed Split-Head Create (EEXIST)", this->failed_split_head_creat);
+      }
       if (this->failed_dedup) {
         f->dump_unsigned("Failed Dedup", this->failed_dedup);
       }
@@ -732,6 +782,30 @@ namespace rgw::dedup {
       if (this->size_mismatch) {
         f->dump_unsigned("Size mismatch SRC/TGT", this->size_mismatch);
       }
+      if (this->different_storage_class) {
+        f->dump_unsigned("different_storage_class",
+                         this->different_storage_class);
+      }
+      if (this->invalid_hash_no_split_head) {
+        f->dump_unsigned("Failed rec has invalid hash w/o split-head ",
+                         this->invalid_hash_no_split_head);
+      }
+      if (this->invalid_storage_class_mapping) {
+        f->dump_unsigned("Failed, invalid_storage_class_mapping",
+                         this->invalid_storage_class_mapping);
+      }
+      if (this->singleton_after_purge) {
+        f->dump_unsigned("Failed, has singleton after purge",
+                         this->singleton_after_purge);
+      }
+      if (this->shared_manifest_after_purge) {
+        f->dump_unsigned("Failed, has shared manifest after purge",
+                         this->shared_manifest_after_purge);
+      }
+      if (this->split_head_no_tail_placement) {
+        f->dump_unsigned("No Tail Placement during Split-Head processing",
+                         this->split_head_no_tail_placement);
+      }
     }
   }
 
@@ -768,10 +842,27 @@ namespace rgw::dedup {
     encode(m.failed_rec_load, bl);
     encode(m.failed_block_load, bl);
 
+    encode(m.different_storage_class, bl);
+    encode(m.invalid_hash_no_split_head, bl);
+    encode(m.invalid_storage_class_mapping, bl);
+    encode(m.singleton_after_purge, bl);
+    encode(m.shared_manifest_after_purge, bl);
+    encode(m.split_head_no_tail_placement, bl);
+    encode(m.illegal_rec_id, bl);
+    encode(m.missing_last_block_marker, bl);
+
     encode(m.valid_hash_attrs, bl);
     encode(m.invalid_hash_attrs, bl);
     encode(m.set_hash_attrs, bl);
     encode(m.skip_hash_cmp, bl);
+    encode(m.manifest_raw_obj, bl);
+    encode(m.manifest_no_tail_placement, bl);
+    encode(m.rollback_tail_obj, bl);
+    encode(m.failed_split_head_creat, bl);
+    encode(m.skip_shared_tail_objs, bl);
+    encode(m.split_head_src, bl);
+    encode(m.split_head_tgt, bl);
+    encode(m.split_head_dedup_bytes, bl);
     encode(m.set_shared_manifest_src, bl);
 
     encode(m.loaded_objects, bl);
@@ -822,10 +913,27 @@ namespace rgw::dedup {
     decode(m.failed_rec_load, bl);
     decode(m.failed_block_load, bl);
 
+    decode(m.different_storage_class, bl);
+    decode(m.invalid_hash_no_split_head, bl);
+    decode(m.invalid_storage_class_mapping, bl);
+    decode(m.singleton_after_purge, bl);
+    decode(m.shared_manifest_after_purge, bl);
+    decode(m.split_head_no_tail_placement, bl);
+    decode(m.illegal_rec_id, bl);
+    decode(m.missing_last_block_marker, bl);
+
     decode(m.valid_hash_attrs, bl);
     decode(m.invalid_hash_attrs, bl);
     decode(m.set_hash_attrs, bl);
     decode(m.skip_hash_cmp, bl);
+    decode(m.manifest_raw_obj, bl);
+    decode(m.manifest_no_tail_placement, bl);
+    decode(m.rollback_tail_obj, bl);
+    decode(m.failed_split_head_creat, bl);
+    decode(m.skip_shared_tail_objs, bl);
+    decode(m.split_head_src, bl);
+    decode(m.split_head_tgt, bl);
+    decode(m.split_head_dedup_bytes, bl);
     decode(m.set_shared_manifest_src, bl);
 
     decode(m.loaded_objects, bl);
index abe624321225b337378f4495a5eac857f47c990c..579e048a259f0bd0f8650a8bc64ff6d5f25d005a 100644 (file)
@@ -25,6 +25,7 @@
 #include "common/dout.h"
 
 #define FULL_DEDUP_SUPPORT
+
 namespace rgw::dedup {
   using namespace std::chrono;
   using work_shard_t   = uint16_t;
@@ -68,29 +69,6 @@ namespace rgw::dedup {
   };
 
   std::ostream& operator<<(std::ostream &out, const dedup_req_type_t& dedup_type);
-  struct __attribute__ ((packed)) dedup_flags_t {
-  private:
-    static constexpr uint8_t RGW_DEDUP_FLAG_HASH_CALCULATED = 0x01; // REC
-    static constexpr uint8_t RGW_DEDUP_FLAG_SHARED_MANIFEST   = 0x02; // REC + TAB
-    static constexpr uint8_t RGW_DEDUP_FLAG_OCCUPIED          = 0x04; // TAB
-    static constexpr uint8_t RGW_DEDUP_FLAG_FASTLANE          = 0x08; // REC
-
-  public:
-    dedup_flags_t() : flags(0) {}
-    dedup_flags_t(uint8_t _flags) : flags(_flags) {}
-    inline void clear() { this->flags = 0; }
-    inline bool hash_calculated() const { return ((flags & RGW_DEDUP_FLAG_HASH_CALCULATED) != 0); }
-    inline void set_hash_calculated()  { flags |= RGW_DEDUP_FLAG_HASH_CALCULATED; }
-    inline bool has_shared_manifest() const { return ((flags & RGW_DEDUP_FLAG_SHARED_MANIFEST) != 0); }
-    inline void set_shared_manifest() { flags |= RGW_DEDUP_FLAG_SHARED_MANIFEST; }
-    inline bool is_occupied() const {return ((this->flags & RGW_DEDUP_FLAG_OCCUPIED) != 0); }
-    inline void set_occupied() {this->flags |= RGW_DEDUP_FLAG_OCCUPIED; }
-    inline void clear_occupied() { this->flags &= ~RGW_DEDUP_FLAG_OCCUPIED; }
-    inline bool is_fastlane()  const { return ((flags & RGW_DEDUP_FLAG_FASTLANE) != 0); }
-    inline void set_fastlane()  { flags |= RGW_DEDUP_FLAG_FASTLANE; }
-  private:
-    uint8_t flags;
-  };
 
   class alignas(8) Throttle {
     friend void validate_max_calls_offset();
@@ -262,11 +240,27 @@ namespace rgw::dedup {
     uint64_t failed_rec_load = 0;
     uint64_t failed_block_load = 0;
 
+    uint64_t different_storage_class = 0;
+    uint64_t invalid_hash_no_split_head = 0;
+    uint64_t invalid_storage_class_mapping = 0;
+    uint64_t singleton_after_purge = 0;
+    uint64_t shared_manifest_after_purge = 0;
+    uint64_t split_head_no_tail_placement = 0;
+    uint64_t illegal_rec_id = 0;
+    uint64_t missing_last_block_marker = 0;
+
     uint64_t valid_hash_attrs = 0;
     uint64_t invalid_hash_attrs = 0;
     uint64_t set_hash_attrs = 0;
+    uint64_t skip_shared_tail_objs = 0;
     uint64_t skip_hash_cmp = 0;
-
+    uint64_t manifest_raw_obj = 0;
+    uint64_t manifest_no_tail_placement = 0;
+    uint64_t rollback_tail_obj = 0;
+    uint64_t failed_split_head_creat = 0;
+    uint64_t split_head_src = 0;
+    uint64_t split_head_tgt = 0;
+    uint64_t split_head_dedup_bytes = 0;
     uint64_t set_shared_manifest_src = 0;
     uint64_t loaded_objects = 0;
     uint64_t processed_objects = 0;
@@ -362,7 +356,19 @@ namespace rgw::dedup {
                                 const DoutPrefixProvider* dpp);
 
   //---------------------------------------------------------------------------
-  static inline uint64_t calc_deduped_bytes(uint64_t head_obj_size,
+  static inline bool dedupable_object(bool     multipart_object,
+                                      uint64_t min_obj_size_for_dedup,
+                                      uint64_t object_byte_size)
+  {
+    // all multipart objects are dedupable because the head-object is empty
+    // otherwise make sure object_byte_size is large enough
+    return (multipart_object || object_byte_size >= min_obj_size_for_dedup);
+  }
+
+  //---------------------------------------------------------------------------
+  static inline uint64_t calc_deduped_bytes(uint32_t head_obj_size,
+                                            uint32_t min_obj_size_for_dedup,
+                                            uint32_t max_obj_size_for_split,
                                             uint16_t num_parts,
                                             uint64_t size_bytes)
   {
@@ -372,9 +378,13 @@ namespace rgw::dedup {
     }
     else {
       // reduce the head size
-      if (size_bytes > head_obj_size) {
+      if (size_bytes > max_obj_size_for_split) {
         return size_bytes - head_obj_size;
       }
+      else if (size_bytes >= min_obj_size_for_dedup) {
+        // Head is split into an empty obj and a new tail enabling a full dedup
+        return size_bytes;
+      }
       else {
         return 0;
       }
index 1e679a38b8e9f60a8b2a1807db31ce90c7160103..4129a015c98b103aad7d2625a5397b70e6e06b31 100644 (file)
@@ -256,6 +256,10 @@ public:
   void get_implicit_location(uint64_t cur_part_id, uint64_t cur_stripe, uint64_t ofs,
                              std::string *override_prefix, rgw_obj_select *location) const;
 
+  void clear_rules() {
+    rules.clear();
+  }
+
   void set_trivial_rule(uint64_t tail_ofs, uint64_t stripe_max_size) {
     RGWObjManifestRule rule(0, tail_ofs, 0, stripe_max_size);
     rules[0] = rule;
@@ -467,6 +471,10 @@ public:
     return max_head_size;
   }
 
+  void set_max_head_size(uint64_t _max_head_size) {
+    max_head_size = _max_head_size;
+  }
+
   const std::string& get_tier_type() {
       return tier_type;
   }
index 87219399678dc95b428bf3f7930d802427f5c508..40ab41c400ffa2e3093ec8417e0da8d522aede14 100644 (file)
@@ -89,7 +89,6 @@ using ceph::crypto::MD5;
 #define RGW_ATTR_CORS          RGW_ATTR_PREFIX "cors"
 #define RGW_ATTR_ETAG RGW_ATTR_PREFIX "etag"
 #define RGW_ATTR_CKSUM          RGW_ATTR_PREFIX "cksum"
-#define RGW_ATTR_SHA256         RGW_ATTR_PREFIX "x-amz-content-sha256"
 #define RGW_ATTR_BLAKE3         RGW_ATTR_PREFIX "blake3"
 #define RGW_ATTR_BUCKETS       RGW_ATTR_PREFIX "buckets"
 #define RGW_ATTR_META_PREFIX   RGW_ATTR_PREFIX RGW_AMZ_META_PREFIX
index 3f3a3d606dd27f9825a065f2f83b5b460c4a695e..f24857e3d993f20d0b73c554536054bbc198b978 100644 (file)
@@ -262,6 +262,17 @@ def gen_connections_multi(num_tenants):
     log.debug("gen_connections_multi: All connection and buckets are set")
     return (tenants, bucket_names, conns)
 
+#-------------------------------------------------------------------------------
+def create_buckets(conn, max_copies_count):
+    bucket_names=[]
+    for i in range(0, max_copies_count):
+        bucket_name=gen_bucket_name()
+        bucket_names.append(bucket_name)
+        log.debug("conn.create_bucket(Bucket=%s)", bucket_name)
+        conn.create_bucket(Bucket=bucket_name)
+
+    return bucket_names
+
 
 #####################
 # dedup tests
@@ -270,8 +281,11 @@ OUT_DIR="/tmp/dedup/"
 KB=(1024)
 MB=(1024*KB)
 POTENTIAL_OBJ_SIZE=(64*KB)
+DEDUP_MIN_OBJ_SIZE=(64*KB)
+SPLIT_HEAD_SIZE=(16*MB)
 RADOS_OBJ_SIZE=(4*MB)
-MULTIPART_SIZE=(16*MB)
+# The default multipart threshold size for S3cmd is 15 MB.
+MULTIPART_SIZE=(15*MB)
 default_config = TransferConfig(multipart_threshold=MULTIPART_SIZE, multipart_chunksize=MULTIPART_SIZE)
 ETAG_ATTR="user.rgw.etag"
 POOLNAME="default.rgw.buckets.data"
@@ -385,26 +399,41 @@ def count_space_in_all_buckets():
 #-------------------------------------------------------------------------------
 def count_objects_in_bucket(bucket_name, conn):
     max_keys=1000
-    marker=""
+    continuation_token = None
     obj_count=0
     while True:
         log.debug("bucket_name=%s", bucket_name)
-        listing=conn.list_objects(Bucket=bucket_name, Marker=marker, MaxKeys=max_keys)
+        list_args = {
+            'Bucket': bucket_name,
+            'MaxKeys': max_keys
+        }
+        if continuation_token:
+            list_args['ContinuationToken'] = continuation_token
+
+        listing=conn.list_objects_v2(**list_args)
         if 'Contents' not in listing or len(listing['Contents'])== 0:
             return 0
 
         obj_count += len(listing['Contents'])
 
-        if listing['IsTruncated']:
-            marker=listing['NextMarker']
-            log.debug("marker=%s, obj_count=%d", marker, obj_count)
-            continue
+        if 'NextContinuationToken' in listing:
+            continuation_token = listing['NextContinuationToken']
+            log.debug("count_objects_in_bucket: Token=%s, count=%d",
+                      continuation_token, obj_count)
         else:
             return obj_count
 
 
 #-------------------------------------------------------------------------------
-def count_object_parts_in_all_buckets(verbose=False):
+def copy_obj(base_bucket_name, base_key, bucket_name, key):
+    s3_prefix="s3://"
+    src  = s3_prefix + base_bucket_name + "/" + base_key
+    dest = s3_prefix + bucket_name + "/" + key
+    result = bash(['s3cmd', 'cp', src, dest])
+    assert result[1] == 0
+
+#-------------------------------------------------------------------------------
+def count_object_parts_in_all_buckets(verbose=False, expected_size=0):
     result = rados(['lspools'])
     assert result[1] == 0
     found=False
@@ -420,17 +449,40 @@ def count_object_parts_in_all_buckets(verbose=False):
 
     result = rados(['ls', '-p ', POOLNAME])
     assert result[1] == 0
-
     names=result[0].split()
-    count = 0
-    for name in names:
-        #log.debug(name)
-        count = count + 1
+    rados_count = len(names)
+    if (rados_count > 1000):
+        ### we can only do about 10 stat call per-second!!
+        ### TBD: add obj_size to ls output to allow more efficient size check
+        log.info(">>> rados obj_count(%d) is too high -> skip stat check\n",
+                 len(names))
+        expected_size = 0
+
+    byte_size_total = 0
+    ondisk_size_total = 0
+    start_time = time.perf_counter()
+    for rados_name in names:
+        if verbose:
+            log.debug(rados_name)
+        if expected_size:
+            result = rados(['-p ', POOLNAME, 'stat', rados_name])
+            assert result[1] == 0
+            stat = result[0].split()
+            byte_size=int(stat[-1])
+            byte_size_total += byte_size
+            ondisk_size_total += calc_on_disk_byte_size(byte_size)
+
+    if expected_size:
+        end_time = time.perf_counter()
+        time_elapsed = end_time - start_time
+        log.info("rados_count=%d, ondisk_size_total=%d, expected_size=%d, time=%d(sec)",
+                 rados_count, ondisk_size_total, expected_size, time_elapsed)
+        assert ondisk_size_total == expected_size
 
     if verbose:
-        log.debug("Pool has %d rados objects", count)
+        log.debug("Pool has %d rados objects", rados_count)
 
-    return count
+    return rados_count
 
 
 #-------------------------------------------------------------------------------
@@ -443,29 +495,61 @@ def cleanup_local():
         return False
 
 
+#-------------------------------------------------------------------------------
+def check_delete_objects_response(response):
+    # Log per-key failures from an S3 delete_objects() response.
+    # boto3 reports partial failures in the response's 'Errors' list instead of
+    # raising, so callers must inspect it explicitly.
+    # Check for delete failures
+    if 'Errors' in response and response['Errors']:
+        log.error("Delete failures detected:")
+        for error in response['Errors']:
+            log.error("delete_objects::ERROR::Key=%s, Code=%s, Message=%s",
+                      error['Key'], error['Code'], error['Message'])
+
+    else:
+        log.debug("All objects deleted successfully.")
+
+
+#-------------------------------------------------------------------------------
+def delete_objects(conn, bucket_name, object_keys):
+    # Bulk-delete the given keys from bucket_name with a single S3
+    # DeleteObjects call, then log any per-key failures from the response.
+    response=conn.delete_objects(Bucket=bucket_name,
+                                 Delete={"Objects": [{"Key": key} for key in object_keys]})
+
+    # Check for delete failures
+    check_delete_objects_response(response)
+
+
 #-------------------------------------------------------------------------------
 def delete_bucket_with_all_objects(bucket_name, conn):
     max_keys=1000
-    marker=""
+    continuation_token = None
     obj_count=0
     while True:
-        listing=conn.list_objects(Bucket=bucket_name, Marker=marker, MaxKeys=max_keys)
+        list_args = {
+            'Bucket': bucket_name,
+            'MaxKeys': max_keys
+        }
+        if continuation_token:
+            list_args['ContinuationToken'] = continuation_token
+
+        listing=conn.list_objects_v2(**list_args)
         if 'Contents' not in listing or len(listing['Contents'])== 0:
             log.debug("Bucket '%s' is empty, skipping...", bucket_name)
             return
 
         objects=[]
         for obj in listing['Contents']:
-            log.debug(obj['Key'])
+            log.debug("delete_bucket_with_all_objects: add obj: %s", obj['Key'])
             objects.append({'Key': obj['Key']})
 
         obj_count += len(objects)
         # delete objects from the bucket
-        conn.delete_objects(Bucket=bucket_name, Delete={'Objects':objects})
-        if listing['IsTruncated']:
-            marker=listing['NextMarker']
-            log.debug("marker=%s, obj_count=%d", marker, obj_count)
-            continue
+        log.debug("delete_bucket_with_all_objects: delete %d objs", obj_count)
+        response=conn.delete_objects(Bucket=bucket_name, Delete={'Objects':objects})
+        check_delete_objects_response(response)
+
+        if 'NextContinuationToken' in listing:
+            continuation_token = listing['NextContinuationToken']
+            log.debug("delete_bucket_with_all_objects: Token=%s, count=%d",
+                      continuation_token, obj_count)
         else:
             break
 
@@ -476,7 +560,7 @@ def delete_bucket_with_all_objects(bucket_name, conn):
 def verify_pool_is_empty():
     result = admin(['gc', 'process', '--include-all'])
     assert result[1] == 0
-    assert count_object_parts_in_all_buckets() == 0
+    assert count_object_parts_in_all_buckets(False, 0) == 0
 
 
 #-------------------------------------------------------------------------------
@@ -538,15 +622,39 @@ def calc_rados_obj_count(num_copies, obj_size, config):
     return rados_obj_count
 
 
+BLOCK_SIZE=4096
+#-------------------------------------------------------------------------------
+def calc_on_disk_byte_size(byte_size):
+    # Round byte_size up to the next BLOCK_SIZE (4 KiB) multiple - the space
+    # the object actually consumes on disk.
+    return (((byte_size+BLOCK_SIZE-1)//BLOCK_SIZE)*BLOCK_SIZE)
+
+
+#-------------------------------------------------------------------------------
+def calc_head_size(obj_size, config):
+    # Return the expected on-disk size of the HEAD rados object for an S3
+    # object of obj_size bytes, given the upload config.
+    on_disk_byte_size = calc_on_disk_byte_size(obj_size)
+    threshold = config.multipart_threshold
+    # Objects with size bigger than MULTIPART_SIZE are uploaded as multi-part
+    # multi-part objects got a zero size Head objects
+    if obj_size >= threshold:
+        head_size = 0
+    else:
+        # single-part upload: the head holds at most RADOS_OBJ_SIZE bytes
+        head_size = min(RADOS_OBJ_SIZE, on_disk_byte_size)
+
+    return head_size
+
+
 #-------------------------------------------------------------------------------
 def calc_dedupable_space(obj_size, config):
+    on_disk_byte_size = calc_on_disk_byte_size(obj_size)
+
     threshold = config.multipart_threshold
     # Objects with size bigger than MULTIPART_SIZE are uploaded as multi-part
     # multi-part objects got a zero size Head objects
     if obj_size >= threshold:
-        dedupable_space = obj_size
-    elif obj_size > RADOS_OBJ_SIZE:
-        dedupable_space = obj_size - RADOS_OBJ_SIZE
+        dedupable_space = on_disk_byte_size
+    elif obj_size > SPLIT_HEAD_SIZE:
+        dedupable_space = on_disk_byte_size - RADOS_OBJ_SIZE
+    elif obj_size >= DEDUP_MIN_OBJ_SIZE:
+        dedupable_space = on_disk_byte_size
     else:
         dedupable_space = 0
 
@@ -554,10 +662,18 @@ def calc_dedupable_space(obj_size, config):
               float(obj_size)/MB, float(dedupable_space)/MB)
     return dedupable_space
 
-BLOCK_SIZE=4096
 #-------------------------------------------------------------------------------
-def calc_on_disk_byte_size(byte_size):
-    return (((byte_size+BLOCK_SIZE-1)//BLOCK_SIZE)*BLOCK_SIZE)
+def calc_split_objs_count(obj_size, num_copies, config):
+    # Return 1 if dedup is expected to split this object's head into an
+    # attribute-only head plus a new dedupable tail (adding one rados object
+    # per unique object), 0 otherwise.
+    threshold = config.multipart_threshold
+    on_disk_byte_size = calc_on_disk_byte_size(obj_size)
+
+    # no split for singletons, objects above the split limit, or multipart
+    # uploads (whose head already holds no data)
+    if num_copies < 2 or on_disk_byte_size > SPLIT_HEAD_SIZE or obj_size >= threshold:
+        return 0
+
+    # objects below the dedup minimum are skipped by dedup entirely
+    if on_disk_byte_size < DEDUP_MIN_OBJ_SIZE:
+        return 0
+
+    return 1
 
 
 #-------------------------------------------------------------------------------
@@ -569,7 +685,7 @@ def calc_expected_stats(dedup_stats, obj_size, num_copies, config):
     threshold = config.multipart_threshold
     dedup_stats.skip_shared_manifest = 0
     dedup_stats.size_before_dedup += (on_disk_byte_size * num_copies)
-    if on_disk_byte_size <= RADOS_OBJ_SIZE and threshold > RADOS_OBJ_SIZE:
+    if on_disk_byte_size < DEDUP_MIN_OBJ_SIZE and threshold > DEDUP_MIN_OBJ_SIZE:
         dedup_stats.skip_too_small += num_copies
         dedup_stats.skip_too_small_bytes += (on_disk_byte_size * num_copies)
 
@@ -584,8 +700,6 @@ def calc_expected_stats(dedup_stats, obj_size, num_copies, config):
         return
 
     dedup_stats.total_processed_objects += num_copies
-    #dedup_stats.loaded_objects += num_copies
-
     if num_copies == 1:
         dedup_stats.singleton_obj += 1
         dedup_stats.skip_singleton += 1
@@ -628,21 +742,35 @@ def calc_expected_results(files, config):
 
 
 #-------------------------------------------------------------------------------
-def upload_objects(bucket_name, files, indices, conn, config, check_obj_count=True):
+def print_files(files, config):
+    # Debug helper: log name/size/copy-count and whether a head-split is
+    # expected for every (filename, obj_size, num_copies) tuple in files.
+    for idx, f in enumerate(files):
+        filename=f[0]
+        obj_size=f[1]
+        num_copies=f[2]
+        assert(obj_size)
+        split_head = calc_split_objs_count(obj_size, num_copies, config)
+        log.info("[%d]%s::size=%d, num_copies=%d, split_head=%d",
+                 idx, filename, obj_size, num_copies, split_head);
+
+
+#-------------------------------------------------------------------------------
+def upload_objects(bucket_name, files, indices, conn, config, check_obj_count):
     dedup_stats = Dedup_Stats()
     total_space=0
     duplicated_space=0
     duplicated_tail_objs=0
     rados_objects_total=0
     s3_objects_total=0
+    split_head_objs=0
 
     for (f, idx) in zip(files, indices):
         filename=f[0]
         obj_size=f[1]
         num_copies=f[2]
         assert(obj_size)
+        split_head_objs += calc_split_objs_count(obj_size, num_copies, config)
         calc_expected_stats(dedup_stats, obj_size, num_copies, config)
-        total_space += (obj_size * num_copies)
+        total_space += (calc_on_disk_byte_size(obj_size) * num_copies)
         dedupable_space=calc_dedupable_space(obj_size, config)
         duplicated_space += ((num_copies-1) * dedupable_space)
         rados_obj_count=calc_rados_obj_count(num_copies, obj_size, config)
@@ -652,10 +780,9 @@ def upload_objects(bucket_name, files, indices, conn, config, check_obj_count=Tr
         s3_objects_total += num_copies
         if s3_objects_total and (s3_objects_total % 1000 == 0):
             log.debug("%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
-                     s3_objects_total, rados_objects_total, total_space/MB)
+                      s3_objects_total, rados_objects_total, total_space/MB)
         for i in range(idx, num_copies):
             key = gen_object_name(filename, i)
-            #log.debug("upload_file %s/%s with crc32", bucket_name, key)
             conn.upload_file(OUT_DIR + filename, bucket_name, key, Config=config)
 
     log.debug("==========================================")
@@ -665,15 +792,70 @@ def upload_objects(bucket_name, files, indices, conn, config, check_obj_count=Tr
     log.debug("Based on calculation we should have %d duplicated tail objs", duplicated_tail_objs)
     log.debug("Based on calculation we should have %.2f MiB total in pool", total_space/MB)
     log.debug("Based on calculation we should have %.2f MiB duplicated space in pool", duplicated_space/MB)
-
-    expected_rados_obj_count_post_dedup=(rados_objects_total-duplicated_tail_objs)
+    log.info("split_head_objs=%d, rados_objects_total=%d, duplicated_tail_objs=%d",
+             split_head_objs, rados_objects_total, duplicated_tail_objs)
+    expected_rados_obj_count_post_dedup=(split_head_objs+rados_objects_total-duplicated_tail_objs)
     log.debug("Post dedup expcted rados obj count = %d", expected_rados_obj_count_post_dedup)
     expcted_space_post_dedup=(total_space-duplicated_space)
     log.debug("Post dedup expcted data in pool = %.2f MiB", expcted_space_post_dedup/MB)
     if check_obj_count:
-        assert rados_objects_total == count_object_parts_in_all_buckets()
+        log.debug("upload_objects: verify current Rados state - total_space=%d", total_space)
+        # assert rados_objects_total == count_object_parts_in_all_buckets(True, dedup_stats.size_before_dedup)
+        # skip size check as it is time consuming
+        assert rados_objects_total == count_object_parts_in_all_buckets(False, 0)
+
+    return (expected_rados_obj_count_post_dedup, dedup_stats, s3_objects_total)
+
+
+#-------------------------------------------------------------------------------
+def upload_objects_with_copy(files, conn, bucket_names, indices, config):
+    # Upload one base copy of each file, then create the remaining copies via
+    # server-side copy_object. Server-side copies share the tail rados objects
+    # with the source, so only head objects are duplicated and dedup should
+    # find nothing new to dedup (dedup_bytes_estimate is forced to 0 below).
+    # Returns (expected_rados_obj_count_post_dedup, dedup_stats, s3_objects_total).
+    dedup_stats = Dedup_Stats()
+    total_space=0
+    rados_objects_total=0
+    s3_objects_total=0
+
+    for (f, idx) in zip(files, indices):
+        filename=f[0]
+        obj_size=f[1]
+        num_copies=f[2]
+        assert(obj_size)
+        head_size = calc_head_size(obj_size, config)
+        obj_size_on_disk=calc_on_disk_byte_size(obj_size)
+        log.debug("upload_objects_with_copy:obj_size=%d, on_disk_size=%d, head_size=%d",
+                  obj_size, obj_size_on_disk, head_size);
+        # one full object plus (num_copies-1) extra heads; tails are shared
+        total_space += (obj_size_on_disk + (num_copies-1)*head_size)
+        rados_obj_count=calc_rados_obj_count(num_copies, obj_size, config)
+        tail_objs_count =rados_obj_count-1
+        rados_objects_total += (tail_objs_count + num_copies)
+        log.debug("upload_objects::%s::size=%d, num_copies=%d", filename, obj_size, num_copies);
+        s3_objects_total += num_copies
+        if s3_objects_total and (s3_objects_total % 1000 == 0):
+            log.debug("%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
+                      s3_objects_total, rados_objects_total, total_space/MB)
+
+        base_obj=dict()
+        for i in range(idx, num_copies):
+            key = gen_object_name(filename, i)
+            bucket_name=bucket_names[i]
+            if i == 0:
+                # first copy is a real upload; it becomes the copy source
+                base_obj = {'Bucket': bucket_name, 'Key': key}
+                #log.info("upload_file: %s -> %s/%s", filename, bucket_name, key)
+                conn.upload_file(OUT_DIR + filename, bucket_name, key, Config=config)
+            else:
+                log.debug("copy_obj: %s/%s -> %s/%s",
+                          base_obj['Bucket'], base_obj['Key'], bucket_name, key)
+                conn.copy_object(CopySource=base_obj, Bucket=bucket_name, Key=key)
+
+
+    dedup_stats.deduped_obj = 0
+    dedup_stats.size_before_dedup = total_space
+    # No change should happen since tail-objects are already de-duplicated
+    dedup_stats.dedup_bytes_estimate = 0
+    expected_rados_obj_count_post_dedup=rados_objects_total
+
+    log.info("upload_objects: verify current Rados state - total_space=%d", total_space)
+    assert rados_objects_total == count_object_parts_in_all_buckets(False, total_space)
 
-    expected_results=(expected_rados_obj_count_post_dedup, expcted_space_post_dedup)
     return (expected_rados_obj_count_post_dedup, dedup_stats, s3_objects_total)
 
 
@@ -686,13 +868,16 @@ def upload_objects_multi(files, conns, bucket_names, indices, config, check_obj_
     duplicated_tail_objs=0
     rados_objects_total=0
     s3_objects_total=0
+    split_head_objs=0
+
     for (f, idx) in zip(files, indices):
         filename=f[0]
         obj_size=f[1]
         num_copies=f[2]
         assert(obj_size)
+        split_head_objs += calc_split_objs_count(obj_size, num_copies, config)
         calc_expected_stats(dedup_stats, obj_size, num_copies, config)
-        total_space += (obj_size * num_copies)
+        total_space += (calc_on_disk_byte_size(obj_size) * num_copies)
         dedupable_space=calc_dedupable_space(obj_size, config)
         duplicated_space += ((num_copies-1) * dedupable_space)
         rados_obj_count=calc_rados_obj_count(num_copies, obj_size, config)
@@ -702,7 +887,7 @@ def upload_objects_multi(files, conns, bucket_names, indices, config, check_obj_
         s3_objects_total += num_copies
         if s3_objects_total and (s3_objects_total % 1000 == 0):
             log.debug("%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
-                     s3_objects_total, rados_objects_total, total_space/MB)
+                      s3_objects_total, rados_objects_total, total_space/MB)
         for i in range(idx, num_copies):
             ten_id = i % max_tenants
             key = gen_object_name(filename, i)
@@ -710,8 +895,8 @@ def upload_objects_multi(files, conns, bucket_names, indices, config, check_obj_
             log.debug("upload_objects::<%s/%s>", bucket_names[ten_id], key)
 
     log.debug("==========================================")
-    log.debug("Summary:%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
-             s3_objects_total, rados_objects_total, total_space/MB)
+    log.debug("Summary:%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
+              s3_objects_total, rados_objects_total, total_space/MB)
     log.debug("Based on calculation we should have %d rados objects", rados_objects_total)
     log.debug("Based on calculation we should have %d duplicated tail objs", duplicated_tail_objs)
     log.debug("Based on calculation we should have %.2f MiB total in pool", total_space/MB)
@@ -722,15 +907,16 @@ def upload_objects_multi(files, conns, bucket_names, indices, config, check_obj_
         s3_object_count += count_objects_in_bucket(bucket_name, conn)
 
     log.debug("bucket listings reported a total of %d s3 objects", s3_object_count)
-    expected_rados_obj_count_post_dedup=(rados_objects_total-duplicated_tail_objs)
+    expected_rados_obj_count_post_dedup=(split_head_objs+rados_objects_total-duplicated_tail_objs)
     log.debug("Post dedup expcted rados obj count = %d", expected_rados_obj_count_post_dedup)
     expcted_space_post_dedup=(total_space-duplicated_space)
     log.debug("Post dedup expcted data in pool = %.2f MiB", expcted_space_post_dedup/MB)
     if check_obj_count:
-        assert rados_objects_total == count_object_parts_in_all_buckets()
+        log.debug("upload_objects_multi: verify current Rados state (obj/size)")
+        #assert rados_objects_total == count_object_parts_in_all_buckets(True, dedup_stats.size_before_dedup)
+        assert rados_objects_total == count_object_parts_in_all_buckets(False, 0)
         assert (s3_object_count == s3_objects_total)
 
-    expected_results=(expected_rados_obj_count_post_dedup, expcted_space_post_dedup)
     return (expected_rados_obj_count_post_dedup, dedup_stats, s3_objects_total)
 
 
@@ -769,13 +955,16 @@ def procs_upload_objects(files, conns, bucket_names, indices, config, check_obj_
     duplicated_tail_objs=0
     rados_objects_total=0
     s3_objects_total=0
+    split_head_objs=0
+
     for (f, idx) in zip(files, indices):
         filename=f[0]
         obj_size=f[1]
         num_copies=f[2]
         assert(obj_size)
+        split_head_objs += calc_split_objs_count(obj_size, num_copies, config)
         calc_expected_stats(dedup_stats, obj_size, num_copies, config)
-        total_space += (obj_size * num_copies)
+        total_space += (calc_on_disk_byte_size(obj_size) * num_copies)
         dedupable_space=calc_dedupable_space(obj_size, config)
         duplicated_space += ((num_copies-1) * dedupable_space)
         rados_obj_count=calc_rados_obj_count(num_copies, obj_size, config)
@@ -789,8 +978,8 @@ def procs_upload_objects(files, conns, bucket_names, indices, config, check_obj_
         proc_list[idx].join()
 
     log.debug("==========================================")
-    log.debug("Summary:%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
-             s3_objects_total, rados_objects_total, total_space/MB)
+    log.debug("Summary:%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
+              s3_objects_total, rados_objects_total, total_space/MB)
     log.debug("Based on calculation we should have %d rados objects", rados_objects_total)
     log.debug("Based on calculation we should have %d duplicated tail objs", duplicated_tail_objs)
     log.debug("Based on calculation we should have %.2f MiB total in pool", total_space/MB)
@@ -801,27 +990,135 @@ def procs_upload_objects(files, conns, bucket_names, indices, config, check_obj_
         s3_object_count += count_objects_in_bucket(bucket_name, conn)
 
     log.debug("bucket listings reported a total of %d s3 objects", s3_object_count)
-    expected_rados_obj_count_post_dedup=(rados_objects_total-duplicated_tail_objs)
+    expected_rados_obj_count_post_dedup=(split_head_objs+rados_objects_total-duplicated_tail_objs)
     log.debug("Post dedup expcted rados obj count = %d", expected_rados_obj_count_post_dedup)
     expcted_space_post_dedup=(total_space-duplicated_space)
     log.debug("Post dedup expcted data in pool = %.2f MiB", expcted_space_post_dedup/MB)
     if check_obj_count:
-        assert rados_objects_total == count_object_parts_in_all_buckets()
+        log.debug("procs_upload_objects:: count_object_parts_in_all_buckets()")
+        #assert rados_objects_total == count_object_parts_in_all_buckets(True, dedup_stats.size_before_dedup)
+        assert rados_objects_total == count_object_parts_in_all_buckets(False, 0)
         assert (s3_object_count == s3_objects_total)
 
-    expected_results=(expected_rados_obj_count_post_dedup, expcted_space_post_dedup)
     return (expected_rados_obj_count_post_dedup, dedup_stats, s3_objects_total)
 
+#-------------------------------------------------------------------------------
+def check_if_any_obj_exists(bucket_name, delete_list, conn):
+    # Scan the full bucket listing (paged via list_objects_v2 continuation
+    # tokens) and log any key from delete_list that is still present,
+    # i.e. a delete that silently failed.
+    # NOTE(review): findings are only logged, never asserted or returned -
+    # confirm this is intentional for the verification step.
+    delete_set = set(delete_list)
+    max_keys=1000
+    continuation_token = None
+
+    while True:
+        list_args = {
+            'Bucket': bucket_name,
+            'MaxKeys': max_keys
+        }
+        if continuation_token:
+            list_args['ContinuationToken'] = continuation_token
+
+        listing=conn.list_objects_v2(**list_args)
+        if 'Contents' in listing:
+            for obj in listing['Contents']:
+                key=obj['Key']
+                log.debug("check_if_any_obj_exists: key=%s", key)
+                if obj['Key'] in delete_set:
+                    log.info("key <%s> was found in bucket", key)
+
+        if 'NextContinuationToken' in listing:
+            continuation_token = listing['NextContinuationToken']
+            log.debug("check_if_any_obj_exists: Token=%s", continuation_token)
+        else:
+            break
+
+
+#-------------------------------------------------------------------------------
+def delete_objects_multi(conns, bucket_names, ten_id, object_keys):
+    # Delete object_keys from the bucket/connection that belongs to tenant
+    # ten_id (parallel arrays conns/bucket_names are indexed by tenant id).
+    conn = conns[ten_id]
+    bucket_name = bucket_names[ten_id]
+    delete_objects(conn, bucket_name, object_keys)
 
 #-------------------------------------------------------------------------------
-def verify_objects(bucket_name, files, conn, expected_results, config):
+def delete_dup_objects_multi(files, conns, bucket_names):
+    # Delete every duplicate copy (index >= 1) of every file, routing each
+    # delete to the tenant that owns that copy (copy i lives in tenant
+    # i % max_tenants), then run GC so the rados object count is predictable.
+    max_tenants=len(conns)
+    # one pending-delete batch per tenant
+    tenants_obj_lists = [[] for _ in range(max_tenants)]
+
+    for f in files:
+        filename=f[0]
+        num_copies=f[2]
+        if num_copies > 1:
+            # keep index 0 (the original); delete only the duplicates
+            start_idx=1
+            for i in range(start_idx, num_copies):
+                key = gen_object_name(filename, i)
+                log.debug("delete_dup_objects_multi: delete key::%s::", key);
+                ten_id = i % max_tenants
+                object_keys = tenants_obj_lists[ten_id]
+                object_keys.append(key)
+                # flush delete request after every 500 objects
+                if len(object_keys) >= 500:
+                    delete_objects_multi(conns, bucket_names, ten_id, object_keys)
+                    object_keys.clear()
+
+    # remove leftover objects
+    for ten_id in range(max_tenants):
+        object_keys = tenants_obj_lists[ten_id]
+        if len(object_keys):
+            delete_objects_multi(conns, bucket_names, ten_id, object_keys)
+
+    # must call garbage collection for predictable count
+    result = admin(['gc', 'process', '--include-all'])
+    assert result[1] == 0
+
+
+#-------------------------------------------------------------------------------
+def delete_dup_objects(bucket_name, files, conn):
+    # Delete every duplicate copy (index >= 1) of every file from bucket_name
+    # in batches, verify none of the deleted keys remain in the bucket, then
+    # run GC so the subsequent rados object count is predictable.
+    delete_list_total=[]
+    object_keys=[]
+
+    for f in files:
+        filename=f[0]
+        num_copies=f[2]
+        if num_copies > 1:
+            # keep index 0 (the original); delete only the duplicates
+            start_idx=1
+            for i in range(start_idx, num_copies):
+                key = gen_object_name(filename, i)
+                log.debug("delete key::%s::", key);
+                delete_list_total.append(key)
+                object_keys.append(key)
+
+            # flush delete request after every 500 files
+            # NOTE(review): this flush check runs once per file, not per key,
+            # so one file with many copies can grow a batch well past 500 -
+            # confirm it cannot exceed the S3 1000-keys-per-request limit.
+            if len(object_keys) >= 500:
+                delete_objects(conn, bucket_name, object_keys)
+                object_keys.clear()
+
+
+    # remove leftover objects
+    if len(object_keys):
+        delete_objects(conn, bucket_name, object_keys)
+
+    verify=True
+    if verify:
+        log.debug("delete_dup_objects: verify delete_list_total")
+        check_if_any_obj_exists(bucket_name, delete_list_total, conn)
+
+    # must call garbage collection for predictable count
+    result = admin(['gc', 'process', '--include-all'])
+    assert result[1] == 0
+
+
+#-------------------------------------------------------------------------------
+def verify_objects(bucket_name, files, conn, expected_results, config, delete_dups):
+    if expected_results:
+        assert expected_results == count_object_parts_in_all_buckets(True)
+
     tmpfile = OUT_DIR + "temp"
     for f in files:
         filename=f[0]
         obj_size=f[1]
         num_copies=f[2]
         log.debug("comparing file=%s, size=%d, copies=%d", filename, obj_size, num_copies)
-        for i in range(0, num_copies):
+
+        ### first verify duplicates at index 1 and higher
+        for i in range(1, num_copies):
             filecmp.clear_cache()
             key = gen_object_name(filename, i)
             conn.download_file(bucket_name, key, tmpfile, Config=config)
@@ -829,12 +1126,28 @@ def verify_objects(bucket_name, files, conn, expected_results, config):
             assert equal ,"Files %s and %s differ!!" % (key, tmpfile)
             os.remove(tmpfile)
 
-    log.debug("verify_objects: finished reading all objects")
-    assert expected_results == count_object_parts_in_all_buckets(True)
+    ### Then delete all duplicates
+    if delete_dups:
+        delete_dup_objects(bucket_name, files, conn)
+
+    ### Last, verify the object at index zero making sure refcount works
+    filecmp.clear_cache()
+    i = 0
+    for f in files:
+        filename=f[0]
+        key = gen_object_name(filename, i)
+        conn.download_file(bucket_name, key, tmpfile, Config=config)
+        equal = filecmp.cmp(tmpfile, OUT_DIR + filename, shallow=False)
+        assert equal ,"Files %s and %s differ!!" % (key, tmpfile)
+        os.remove(tmpfile)
+
     log.debug("verify_objects::completed successfully!!")
 
 #-------------------------------------------------------------------------------
-def verify_objects_multi(files, conns, bucket_names, expected_results, config):
+def verify_objects_multi(files, conns, bucket_names, expected_results, config, delete_dups):
+    if expected_results:
+        assert expected_results == count_object_parts_in_all_buckets(True)
+
     max_tenants=len(conns)
     tmpfile = OUT_DIR + "temp"
     for f in files:
@@ -842,18 +1155,37 @@ def verify_objects_multi(files, conns, bucket_names, expected_results, config):
         obj_size=f[1]
         num_copies=f[2]
         log.debug("comparing file=%s, size=%d, copies=%d", filename, obj_size, num_copies)
-        for i in range(0, num_copies):
+        ### first verify duplicates at index 1 and higher
+        for i in range(1, num_copies):
             filecmp.clear_cache()
             key = gen_object_name(filename, i)
             log.debug("comparing object %s with file %s", key, filename)
             ten_id = i % max_tenants
-            conns[ten_id].download_file(bucket_names[ten_id], key, tmpfile, Config=config)
+            conns[ten_id].download_file(bucket_names[ten_id], key, tmpfile,
+                                        Config=config)
             equal = filecmp.cmp(tmpfile, OUT_DIR + filename, shallow=False)
             assert equal ,"Files %s and %s differ!!" % (key, tmpfile)
             os.remove(tmpfile)
 
-    assert expected_results == count_object_parts_in_all_buckets(True)
-    log.debug("verify_objects::completed successfully!!")
+    ### Then delete all duplicates
+    if delete_dups:
+        delete_dup_objects_multi(files, conns, bucket_names)
+
+    ### Last, verify the object at index zero making sure refcount works
+    filecmp.clear_cache()
+    i = 0
+    for f in files:
+        filename=f[0]
+        key = gen_object_name(filename, i)
+        log.debug("comparing object %s with file %s", key, filename)
+        ten_id = i % max_tenants
+        conns[ten_id].download_file(bucket_names[ten_id], key, tmpfile,
+                                    Config=config)
+        equal = filecmp.cmp(tmpfile, OUT_DIR + filename, shallow=False)
+        assert equal ,"Files %s and %s differ!!" % (key, tmpfile)
+        os.remove(tmpfile)
+
+    log.debug("verify_objects_multi::completed successfully!!")
 
 
 #-------------------------------------------------------------------------------
@@ -893,7 +1225,7 @@ def threads_verify_objects(files, conns, bucket_names, expected_results, config)
         thread_list[idx].join()
 
     assert expected_results == count_object_parts_in_all_buckets(True)
-    log.debug("verify_objects::completed successfully!!")
+    log.debug("threads_verify_objects::completed successfully!!")
 
 
 #-------------------------------------------------------------------------------
@@ -903,9 +1235,12 @@ def get_stats_line_val(line):
 
 #-------------------------------------------------------------------------------
 def print_dedup_stats(dedup_stats):
+    log.info("===============================================")
+
     for key in dedup_stats.__dict__:
-        log.warning("dedup_stats[%s] = %d", key, dedup_stats.__dict__[key])
+        log.info("dedup_stats[%s] = %d", key, dedup_stats.__dict__[key])
 
+    log.info("===============================================")
 
 #-------------------------------------------------------------------------------
 def print_dedup_stats_diff(actual, expected):
@@ -992,8 +1327,14 @@ def verify_dedup_ratio(expected_dedup_stats, dedup_ratio):
     else:
         ratio = 0
 
+
+    log.debug("skip_too_small_bytes = %d", expected_dedup_stats.skip_too_small_bytes)
+    if expected_dedup_stats.non_default_storage_class_objs_bytes:
+        log.debug("non_default_storage_class_objs_bytes= %d",
+                 expected_dedup_stats.non_default_storage_class_objs_bytes)
+
     log.debug("s3_bytes_before = %d/%d", s3_bytes_before, dedup_ratio.s3_bytes_before)
-    log.debug("s3_dedup_bytes = %d", expected_dedup_stats.dedup_bytes_estimate);
+    log.debug("s3_dedup_bytes = %d", s3_dedup_bytes);
     log.debug("s3_bytes_after = %d/%d", s3_bytes_after, dedup_ratio.s3_bytes_after)
     log.debug("ratio = %f/%f", ratio, dedup_ratio.ratio)
 
@@ -1098,7 +1439,7 @@ def exec_dedup_internal(expected_dedup_stats, dry_run, max_dedup_time):
             set_bucket_index_throttling(limit)
 
 #-------------------------------------------------------------------------------
-def exec_dedup(expected_dedup_stats, dry_run, verify_stats=True):
+def exec_dedup(expected_dedup_stats, dry_run, verify_stats=True, post_dedup_size=0):
     # dedup should complete in less than 5 minutes
     max_dedup_time = 5*60
     if expected_dedup_stats.deduped_obj > 10000:
@@ -1113,7 +1454,16 @@ def exec_dedup(expected_dedup_stats, dry_run, verify_stats=True):
     dedup_stats = ret[1]
     dedup_ratio_estimate = ret[2]
     dedup_ratio_actual = ret[3]
+    log.debug("exec_dedup: verify_dedup_ratio")
+    verify_dedup_ratio(expected_dedup_stats, dedup_ratio_estimate)
+    if post_dedup_size == 0:
+        post_dedup_size = dedup_ratio_estimate.s3_bytes_after
+
+    # no need to check after dry-run which doesn't change anything
+    if dry_run:
+        post_dedup_size = 0
 
+    count_object_parts_in_all_buckets(True, post_dedup_size)
     if verify_stats == False:
         return ret
 
@@ -1121,6 +1471,7 @@ def exec_dedup(expected_dedup_stats, dry_run, verify_stats=True):
         log.debug("potential_unique_obj= %d / %d ", dedup_stats.potential_unique_obj,
                   expected_dedup_stats.potential_unique_obj)
 
+
     #dedup_stats.set_hash = dedup_stats.invalid_hash
     if dedup_stats != expected_dedup_stats:
         log.debug("==================================================")
@@ -1129,16 +1480,14 @@ def exec_dedup(expected_dedup_stats, dry_run, verify_stats=True):
         log.debug("==================================================\n")
         assert dedup_stats == expected_dedup_stats
 
-    verify_dedup_ratio(expected_dedup_stats, dedup_ratio_estimate)
     log.debug("expcted_dedup::stats check completed successfully!!")
     return ret
 
-
 #-------------------------------------------------------------------------------
 def prepare_test():
     cleanup_local()
     #make sure we are starting with all buckets empty
-    if count_object_parts_in_all_buckets() != 0:
+    if count_object_parts_in_all_buckets(False, 0) != 0:
         log.warning("The system was left dirty from previous run");
         log.warning("Make sure to remove all objects before starting");
         assert(0)
@@ -1163,15 +1512,16 @@ def small_single_part_objs_dedup(conn, bucket_name, dry_run):
     prepare_test()
     try:
         files=[]
-        num_files = 8
-        base_size = 4*KB
+        num_files = 5
+        base_size = 1*KB
         log.debug("generate files: base size=%d KiB, max_size=%d KiB",
                   base_size/KB, (pow(2, num_files) * base_size)/KB)
         gen_files(files, base_size, num_files)
         bucket = conn.create_bucket(Bucket=bucket_name)
         log.debug("upload objects to bucket <%s> ...", bucket_name)
         indices = [0] * len(files)
-        ret = upload_objects(bucket_name, files, indices, conn, default_config)
+        check_obj_count=True
+        ret = upload_objects(bucket_name, files, indices, conn, default_config, check_obj_count)
         expected_results = ret[0]
         dedup_stats = ret[1]
         s3_objects_total = ret[2]
@@ -1183,13 +1533,11 @@ def small_single_part_objs_dedup(conn, bucket_name, dry_run):
         small_objs_dedup_stats.size_before_dedup = dedup_stats.size_before_dedup
         small_objs_dedup_stats.skip_too_small_bytes=dedup_stats.size_before_dedup
         small_objs_dedup_stats.skip_too_small = s3_objects_total
-        assert small_objs_dedup_stats == dedup_stats
 
         exec_dedup(dedup_stats, dry_run)
         if dry_run == False:
             log.debug("Verify all objects")
-            verify_objects(bucket_name, files, conn, expected_results, default_config)
-
+            verify_objects(bucket_name, files, conn, expected_results, default_config, True)
     finally:
         # cleanup must be executed even after a failure
         cleanup(bucket_name, conn)
@@ -1221,21 +1569,22 @@ def simple_dedup(conn, files, bucket_name, run_cleanup_after, config, dry_run):
         bucket = conn.create_bucket(Bucket=bucket_name)
         indices = [0] * len(files)
         log.debug("upload objects to bucket <%s> ...", bucket_name)
-        ret = upload_objects(bucket_name, files, indices, conn, config)
+        check_obj_count=True
+        ret = upload_objects(bucket_name, files, indices, conn, config, check_obj_count)
         expected_results = ret[0]
         dedup_stats = ret[1]
         log.info("%d S3 objects were uploaded", ret[2])
         exec_dedup(dedup_stats, dry_run)
         if dry_run == False:
             log.debug("Verify all objects")
-            verify_objects(bucket_name, files, conn, expected_results, config)
-
-        return ret
+            verify_objects(bucket_name, files, conn, expected_results, config, run_cleanup_after)
     finally:
         if run_cleanup_after:
             # cleanup must be executed even after a failure
             cleanup(bucket_name, conn)
 
+        return ret
+
 
 #-------------------------------------------------------------------------------
 def simple_dedup_with_tenants(files, conns, bucket_names, config, dry_run=False):
@@ -1246,7 +1595,8 @@ def simple_dedup_with_tenants(files, conns, bucket_names, config, dry_run=False)
     exec_dedup(dedup_stats, dry_run)
     if dry_run == False:
         log.debug("Verify all objects")
-        verify_objects_multi(files, conns, bucket_names, expected_results, config)
+        verify_objects_multi(files, conns, bucket_names, expected_results, config,
+                             False)
 
     return ret
 
@@ -1267,19 +1617,18 @@ def dedup_basic_with_tenants_common(files, max_copies_count, config, dry_run):
 #-------------------------------------------------------------------------------
 def threads_simple_dedup_with_tenants(files, conns, bucket_names, config, dry_run=False):
     indices=[0] * len(files)
-
     start = time.time_ns()
-    upload_ret=procs_upload_objects(files, conns, bucket_names, indices, config)
+    check_obj_count=True
+    upload_ret=procs_upload_objects(files, conns, bucket_names, indices, config, check_obj_count)
     upload_time_sec = (time.time_ns() - start) / (1000*1000*1000)
     expected_results = upload_ret[0]
     dedup_stats = upload_ret[1]
     s3_objects_total = upload_ret[2]
-
     exec_ret=exec_dedup(dedup_stats, dry_run)
     exec_time_sec=exec_ret[0]
     verify_time_sec=0
     if dry_run == False:
-        log.debug("Verify all objects")
+        log.debug("threads_simple_dedup_with_tenants::Verify all objects")
         start = time.time_ns()
         threads_verify_objects(files, conns, bucket_names,
                                expected_results, config)
@@ -1578,6 +1927,7 @@ CORRUPTIONS = ("no corruption", "change_etag", "illegal_hex_value",
                "change_num_parts", "illegal_separator",
                "illegal_dec_val_num_parts", "illegal_num_parts_overflow")
 
+
 #------------------------------------------------------------------------------
 def change_object_etag(rados_name, new_etag):
     result = rados(['-p ', POOLNAME, 'setxattr', rados_name, ETAG_ATTR, new_etag])
@@ -1646,7 +1996,7 @@ def corrupt_etag(key, corruption, expected_dedup_stats):
     new_etag=gen_new_etag(old_etag, corruption, expected_dedup_stats)
 
     log.debug("Corruption:: %s\nold_etag=%s\nnew_etag=%s",
-             corruption, old_etag, new_etag)
+              corruption, old_etag, new_etag)
     change_object_etag(rados_name, new_etag)
     return (rados_name, old_etag)
 
@@ -1670,7 +2020,8 @@ def test_dedup_etag_corruption():
 
         bucket = conn.create_bucket(Bucket=bucket_name)
         indices = [0] * len(files)
-        ret = upload_objects(bucket_name, files, indices, conn, default_config)
+        check_obj_count=True
+        ret = upload_objects(bucket_name, files, indices, conn, default_config, check_obj_count)
         expected_results = ret[0]
         expected_dedup_stats = ret[1]
         s3_objects_total = ret[2]
@@ -1751,7 +2102,8 @@ def test_md5_collisions():
         conn=get_single_connection()
         bucket = conn.create_bucket(Bucket=bucket_name)
         indices = [0] * len(files)
-        upload_objects(bucket_name, files, indices, conn, config2)
+        check_obj_count=True
+        upload_objects(bucket_name, files, indices, conn, config2, check_obj_count)
 
         dedup_stats = Dedup_Stats()
         # we wrote 2 different small objects (BLOCK_SIZE) with the same md5
@@ -1770,33 +2122,178 @@ def test_md5_collisions():
         dedup_stats.set_hash=dedup_stats.total_processed_objects
         dedup_stats.hash_mismatch=1
         s3_bytes_before=dedup_stats.size_before_dedup
-        expected_ratio_actual=Dedup_Ratio()
-        expected_ratio_actual.s3_bytes_before=s3_bytes_before
-        expected_ratio_actual.s3_bytes_after=s3_bytes_before
-        expected_ratio_actual.ratio=0
+        expected_ratio=Dedup_Ratio()
+        expected_ratio.s3_bytes_before=s3_bytes_before
+        expected_ratio.s3_bytes_after=s3_bytes_before
+        expected_ratio.ratio=0
 
         dry_run=False
         log.debug("test_md5_collisions: first call to exec_dedup")
-        ret=exec_dedup(dedup_stats, dry_run)
+        ret=exec_dedup(dedup_stats, dry_run, True, 2*BLOCK_SIZE)
         dedup_ratio_actual=ret[3]
-
-        assert expected_ratio_actual == dedup_ratio_actual
+        assert expected_ratio == dedup_ratio_actual
 
         dedup_stats.valid_hash=dedup_stats.total_processed_objects
         dedup_stats.invalid_hash=0
         dedup_stats.set_hash=0
 
         log.debug("test_md5_collisions: second call to exec_dedup")
-        ret=exec_dedup(dedup_stats, dry_run)
+        ret=exec_dedup(dedup_stats, dry_run, True, 2*BLOCK_SIZE)
         dedup_ratio_actual=ret[3]
 
-        assert expected_ratio_actual == dedup_ratio_actual
+        assert expected_ratio == dedup_ratio_actual
 
     finally:
         # cleanup must be executed even after a failure
         cleanup(bucket_name, conn)
 
 
+#-------------------------------------------------------------------------------
+def loop_dedup_split_head_with_tenants():
+    prepare_test()
+    config=default_config
+    success=False
+    max_copies_count=4
+    files=[]
+    num_files=11 # [16KB-32MB]
+    base_size = 16*KB
+    log.debug("generate files: base size=%d KiB, max_size=%d KiB",
+              base_size/KB, (pow(2, num_files) * base_size)/KB)
+    try:
+        gen_files(files, base_size, num_files, max_copies_count)
+        indices=[0] * len(files)
+        ret=gen_connections_multi2(max_copies_count)
+        #tenants=ret[0]
+        bucket_names=ret[1]
+        conns=ret[2]
+
+        ret=upload_objects_multi(files, conns, bucket_names, indices, config, True)
+        expected_results = ret[0]
+        dedup_stats = ret[1]
+
+        dry_run=False
+        exec_dedup(dedup_stats, dry_run, True)
+        log.debug("Verify all objects")
+        verify_objects_multi(files, conns, bucket_names, expected_results, config,
+                             True)
+        success=True
+    finally:
+        cleanup_all_buckets(bucket_names, conns)
+        if not success:
+            print_files(files, config)
+
+
+#-------------------------------------------------------------------------------
+@pytest.mark.basic_test
+def test_dedup_split_head_with_tenants():
+    #return
+
+    if full_dedup_is_disabled():
+        return
+
+    for idx in range(0, 9):
+        log.debug("test_dedup_split_head_with_tenants: loop #%d", idx);
+        loop_dedup_split_head_with_tenants()
+
+
+#-------------------------------------------------------------------------------
+def loop_dedup_split_head():
+    prepare_test()
+    #bucket_name = gen_bucket_name()
+    bucket_name = "bucket1"
+    config=default_config
+    max_copies_count=4
+    files=[]
+    num_files=11 # [16KB-32MB]
+    base_size = 16*KB
+    log.debug("generate files: base size=%d KiB, max_size=%d KiB",
+              base_size/KB, (pow(2, num_files) * base_size)/KB)
+    try:
+        gen_files(files, base_size, num_files, max_copies_count)
+        indices=[0] * len(files)
+
+        conn=get_single_connection()
+        conn.create_bucket(Bucket=bucket_name)
+        check_obj_count=True
+        ret=upload_objects(bucket_name, files, indices, conn, config, check_obj_count)
+        expected_results = ret[0]
+        dedup_stats = ret[1]
+        dry_run=False
+        exec_dedup(dedup_stats, dry_run, True)
+        verify_objects(bucket_name, files, conn, expected_results, config, True)
+    finally:
+        cleanup(bucket_name, conn)
+
+
+#-------------------------------------------------------------------------------
+@pytest.mark.basic_test
+def test_dedup_split_head():
+    #return
+
+    if full_dedup_is_disabled():
+        return
+
+    for idx in range(0, 9):
+        log.debug("test_dedup_split_head: loop #%d", idx);
+        loop_dedup_split_head()
+
+#-------------------------------------------------------------------------------
+def dedup_copy_internal(multi_buckets):
+    if full_dedup_is_disabled():
+        return
+
+    prepare_test()
+    bucket_names=[]
+    config=default_config
+    max_copies_count=4
+    files=[]
+    num_files=6 # [5MB-512MB]
+    base_size=5*MB
+    log.debug("generate files: base size=%d KiB, max_size=%d KiB",
+              base_size/KB, (pow(2, num_files) * base_size)/KB)
+    conn=get_single_connection()
+    try:
+        gen_files(files, base_size, num_files, max_copies_count)
+        indices=[0] * len(files)
+        if multi_buckets:
+            bucket_names=create_buckets(conn, max_copies_count)
+        else:
+            bucket_name = "bucket1"
+            conn.create_bucket(Bucket=bucket_name)
+            bucket_names=[bucket_name] * max_copies_count
+
+        ret=upload_objects_with_copy(files, conn, bucket_names, indices, config)
+        expected_results = ret[0]
+        dedup_stats = ret[1]
+        dry_run=False
+        max_dedup_time = 5*60
+        exec_dedup_internal(dedup_stats, dry_run, max_dedup_time)
+
+        assert expected_results == count_object_parts_in_all_buckets(True, dedup_stats.size_before_dedup)
+        expected_results=0  # skip object_parts verification
+        conns=[conn] * len(bucket_names)
+        verify_objects_multi(files, conns, bucket_names, expected_results, config, True)
+    finally:
+        # cleanup must be executed even after a failure
+        if multi_buckets:
+            for bucket_name in bucket_names:
+                cleanup(bucket_name, conn)
+        else:
+            cleanup(bucket_names[0], conn)
+
+
+#-------------------------------------------------------------------------------
+@pytest.mark.basic_test
+def test_dedup_copy():
+    #return
+    dedup_copy_internal(False)
+
+#-------------------------------------------------------------------------------
+@pytest.mark.basic_test
+def test_dedup_copy_multi_buckets():
+    #return
+    dedup_copy_internal(True)
+
 #-------------------------------------------------------------------------------
 @pytest.mark.basic_test
 def test_dedup_small():
@@ -1822,8 +2319,8 @@ def test_dedup_small_with_tenants():
     prepare_test()
     max_copies_count=3
     files=[]
-    num_files=10 # [4KB-4MB]
-    base_size = 4*KB
+    num_files=5 # [1KB-32KB]
+    base_size = 1*KB
     log.debug("generate files: base size=%d KiB, max_size=%d KiB",
              base_size/KB, (pow(2, num_files) * base_size)/KB)
     try:
@@ -1851,7 +2348,8 @@ def test_dedup_small_with_tenants():
         dry_run=False
         exec_dedup(dedup_stats, dry_run)
         log.debug("Verify all objects")
-        verify_objects_multi(files, conns, bucket_names, expected_results, default_config)
+        verify_objects_multi(files, conns, bucket_names, expected_results,
+                             default_config, True)
     finally:
         # cleanup must be executed even after a failure
         cleanup_all_buckets(bucket_names, conns)
@@ -1894,7 +2392,7 @@ def test_dedup_inc_0_with_tenants():
         dedup_stats2.set_shared_manifest_src=0
         dedup_stats2.deduped_obj=0
         dedup_stats2.deduped_obj_bytes=0
-        dedup_stats2.valid_hash=dedup_stats.invalid_hash
+        dedup_stats2.valid_hash=dedup_stats.unique_obj
         dedup_stats2.invalid_hash=0
         dedup_stats2.set_hash=0
 
@@ -1902,7 +2400,8 @@ def test_dedup_inc_0_with_tenants():
         # run dedup again and make sure nothing has changed
         dry_run=False
         exec_dedup(dedup_stats2, dry_run)
-        verify_objects_multi(files, conns, bucket_names, expected_results, config)
+        verify_objects_multi(files, conns, bucket_names, expected_results,
+                             config, True)
     finally:
         # cleanup must be executed even after a failure
         cleanup_all_buckets(bucket_names, conns)
@@ -1931,7 +2430,9 @@ def test_dedup_inc_0():
         num_files = 11
         gen_files_in_range(files, num_files, 1*MB, 64*MB)
         # upload objects, dedup, verify, but don't cleanup
-        ret = simple_dedup(conn, files, bucket_name, False, config, False)
+        run_cleanup_after=False
+        dry_run=False
+        ret = simple_dedup(conn, files, bucket_name, run_cleanup_after, config, dry_run)
         expected_results = ret[0]
         dedup_stats = ret[1]
         s3_objects_total = ret[2]
@@ -1942,7 +2443,7 @@ def test_dedup_inc_0():
         dedup_stats2.set_shared_manifest_src=0
         dedup_stats2.deduped_obj=0
         dedup_stats2.deduped_obj_bytes=0
-        dedup_stats2.valid_hash=dedup_stats.invalid_hash
+        dedup_stats2.valid_hash=dedup_stats.unique_obj
         dedup_stats2.invalid_hash=0
         dedup_stats2.set_hash=0
 
@@ -1950,7 +2451,7 @@ def test_dedup_inc_0():
         # run dedup again and make sure nothing has changed
         dry_run=False
         exec_dedup(dedup_stats2, dry_run)
-        verify_objects(bucket_name, files, conn, expected_results, config)
+        verify_objects(bucket_name, files, conn, expected_results, config, True)
     finally:
         # cleanup must be executed even after a failure
         cleanup(bucket_name, conn)
@@ -2002,6 +2503,7 @@ def test_dedup_inc_1_with_tenants():
         ret=upload_objects_multi(files_combined, conns, bucket_names, indices, config, False)
         expected_results=ret[0]
         stats_combined=ret[1]
+
         stats_combined.skip_shared_manifest = stats_base.deduped_obj
         stats_combined.skip_src_record     -= stats_base.skip_src_record
         stats_combined.skip_src_record     += stats_base.set_shared_manifest_src
@@ -2010,15 +2512,16 @@ def test_dedup_inc_1_with_tenants():
         stats_combined.deduped_obj         -= stats_base.deduped_obj
         stats_combined.deduped_obj_bytes   -= stats_base.deduped_obj_bytes
 
-        stats_combined.valid_hash    = stats_base.set_hash
+        stats_combined.valid_hash    = stats_base.unique_obj
         stats_combined.invalid_hash -= stats_base.set_hash
-        stats_combined.set_hash     -= stats_base.set_hash
+        stats_combined.set_hash      = stats_combined.set_shared_manifest_src + stats_combined.deduped_obj
 
         log.debug("test_dedup_inc_1_with_tenants: incremental dedup:")
         # run dedup again
         dry_run=False
         exec_dedup(stats_combined, dry_run)
-        verify_objects_multi(files_combined, conns, bucket_names, expected_results, config)
+        verify_objects_multi(files_combined, conns, bucket_names, expected_results,
+                             config, True)
     finally:
         # cleanup must be executed even after a failure
         cleanup_all_buckets(bucket_names, conns)
@@ -2063,7 +2566,8 @@ def test_dedup_inc_1():
             num_copies_combined=num_copies_to_add+num_copies_base
             files_combined.append((filename, obj_size, num_copies_combined))
 
-        ret=upload_objects(bucket_name, files_combined, indices, conn, config, False)
+        check_obj_count=False
+        ret=upload_objects(bucket_name, files_combined, indices, conn, config, check_obj_count)
         expected_results = ret[0]
         stats_combined = ret[1]
         stats_combined.skip_shared_manifest = stats_base.deduped_obj
@@ -2074,15 +2578,15 @@ def test_dedup_inc_1():
         stats_combined.deduped_obj         -= stats_base.deduped_obj
         stats_combined.deduped_obj_bytes   -= stats_base.deduped_obj_bytes
 
-        stats_combined.valid_hash    = stats_base.set_hash
+        stats_combined.valid_hash    = stats_base.unique_obj
         stats_combined.invalid_hash -= stats_base.set_hash
-        stats_combined.set_hash     -= stats_base.set_hash
+        stats_combined.set_hash      = stats_combined.set_shared_manifest_src + stats_combined.deduped_obj
 
         log.debug("test_dedup_inc_1: incremental dedup:")
         # run dedup again
         dry_run=False
         exec_dedup(stats_combined, dry_run)
-        verify_objects(bucket_name, files_combined, conn, expected_results, config)
+        verify_objects(bucket_name, files_combined, conn, expected_results, config, True)
     finally:
         # cleanup must be executed even after a failure
         cleanup(bucket_name, conn)
@@ -2151,15 +2655,16 @@ def test_dedup_inc_2_with_tenants():
         stats_combined.deduped_obj         -= stats_base.deduped_obj
         stats_combined.deduped_obj_bytes   -= stats_base.deduped_obj_bytes
 
-        stats_combined.valid_hash    = stats_base.set_hash
+        stats_combined.valid_hash    = stats_base.unique_obj
         stats_combined.invalid_hash -= stats_base.set_hash
-        stats_combined.set_hash     -= stats_base.set_hash
+        stats_combined.set_hash      = stats_combined.set_shared_manifest_src + stats_combined.deduped_obj
 
         log.debug("test_dedup_inc_2_with_tenants: incremental dedup:")
         # run dedup again
         dry_run=False
         exec_dedup(stats_combined, dry_run)
-        verify_objects_multi(files_combined, conns, bucket_names, expected_results, config)
+        verify_objects_multi(files_combined, conns, bucket_names, expected_results,
+                             config, True)
     finally:
         # cleanup must be executed even after a failure
         cleanup_all_buckets(bucket_names, conns)
@@ -2212,7 +2717,8 @@ def test_dedup_inc_2():
             indices.append(0)
 
         assert(len(indices) == len(files_combined))
-        ret=upload_objects(bucket_name, files_combined, indices, conn, config, False)
+        check_obj_count=False
+        ret=upload_objects(bucket_name, files_combined, indices, conn, config, check_obj_count)
         expected_results = ret[0]
         stats_combined = ret[1]
         stats_combined.skip_shared_manifest = stats_base.deduped_obj
@@ -2223,16 +2729,16 @@ def test_dedup_inc_2():
         stats_combined.deduped_obj         -= stats_base.deduped_obj
         stats_combined.deduped_obj_bytes   -= stats_base.deduped_obj_bytes
 
-        stats_combined.valid_hash    = stats_base.set_hash
+        stats_combined.valid_hash    = stats_base.unique_obj
         stats_combined.invalid_hash -= stats_base.set_hash
-        stats_combined.set_hash     -= stats_base.set_hash
+        stats_combined.set_hash      = stats_combined.set_shared_manifest_src + stats_combined.deduped_obj
 
         log.debug("test_dedup_inc_2: incremental dedup:")
         # run dedup again
         dry_run=False
         exec_dedup(stats_combined, dry_run)
         verify_objects(bucket_name, files_combined, conn, expected_results,
-                       config)
+                       config, True)
     finally:
         # cleanup must be executed even after a failure
         cleanup(bucket_name, conn)
@@ -2246,7 +2752,6 @@ def test_dedup_inc_2():
 @pytest.mark.basic_test
 def test_dedup_inc_with_remove_multi_tenants():
     #return
-
     if full_dedup_is_disabled():
         return
 
@@ -2259,6 +2764,9 @@ def test_dedup_inc_with_remove_multi_tenants():
     bucket_names=ret[1]
     conns=ret[2]
     try:
+        split_heads_count=0
+        split_heads_removed=0
+        split_heads=[]
         files=[]
         num_files = 17
         # gen_files_in_range creates 2-3 copies
@@ -2268,14 +2776,23 @@ def test_dedup_inc_with_remove_multi_tenants():
         expected_results_base = ret[0]
         stats_base = ret[1]
 
+        ### find which objects got split head before remove
+        for f in files:
+            obj_size=f[1]
+            num_copies=f[2]
+            split_head = calc_split_objs_count(obj_size, num_copies, config)
+            split_heads.append(split_head)
+            if split_head:
+                split_heads_count += 1
+
         # REMOVE some objects and update stats/expected
         src_record=0
         shared_manifest=0
-        valid_sha=0
+        valid_hash=0
         object_keys=[]
         files_sub=[]
         dedup_stats = Dedup_Stats()
-        for f in files:
+        for idx, f in enumerate(files):
             filename=f[0]
             obj_size=f[1]
             num_copies=f[2]
@@ -2283,13 +2800,18 @@ def test_dedup_inc_with_remove_multi_tenants():
             num_copies_2=num_copies-num_remove
             log.debug("objects::%s::size=%d, num_copies=%d", filename, obj_size, num_copies_2);
             if num_copies_2:
-                if num_copies_2 > 1 and obj_size > RADOS_OBJ_SIZE:
-                    valid_sha += num_copies_2
+                split_head = calc_split_objs_count(obj_size, num_copies_2, config)
+                if num_copies_2 > 1 and (obj_size > RADOS_OBJ_SIZE or split_head):
+                    valid_hash += 1
                     src_record += 1
                     shared_manifest += (num_copies_2 - 1)
 
                 files_sub.append((filename, obj_size, num_copies_2))
                 calc_expected_stats(dedup_stats, obj_size, num_copies_2, config)
+            elif split_heads[idx]:
+                # we removed all copies of a split-head object
+                split_heads_count -= 1
+                split_heads_removed += 1
 
             start_idx=num_copies_2
             for i in range(start_idx, num_copies):
@@ -2305,9 +2827,10 @@ def test_dedup_inc_with_remove_multi_tenants():
         dedup_stats.set_shared_manifest_src=0
         dedup_stats.deduped_obj=0
         dedup_stats.deduped_obj_bytes=0
+
         dedup_stats.skip_src_record=src_record
         dedup_stats.skip_shared_manifest=shared_manifest
-        dedup_stats.valid_hash=valid_sha
+        dedup_stats.valid_hash=valid_hash
         dedup_stats.invalid_hash=0
         dedup_stats.set_hash=0
 
@@ -2315,7 +2838,9 @@ def test_dedup_inc_with_remove_multi_tenants():
         dry_run=False
         exec_dedup(dedup_stats, dry_run)
         expected_results=calc_expected_results(files_sub, config)
-        verify_objects_multi(files_sub, conns, bucket_names, expected_results, config)
+        expected_results += split_heads_count
+        verify_objects_multi(files_sub, conns, bucket_names, expected_results,
+                             config, True)
     finally:
         # cleanup must be executed even after a failure
         cleanup_all_buckets(bucket_names, conns)
@@ -2329,7 +2854,6 @@ def test_dedup_inc_with_remove_multi_tenants():
 @pytest.mark.basic_test
 def test_dedup_inc_with_remove():
     #return
-
     if full_dedup_is_disabled():
         return
 
@@ -2339,6 +2863,9 @@ def test_dedup_inc_with_remove():
     log.debug("test_dedup_inc_with_remove: connect to AWS ...")
     conn=get_single_connection()
     try:
+        split_heads_count=0
+        split_heads_removed=0
+        split_heads=[]
         files=[]
         num_files = 17
         gen_files_in_range(files, num_files, 1*MB, 64*MB)
@@ -2347,28 +2874,41 @@ def test_dedup_inc_with_remove():
         expected_results_base = ret[0]
         stats_base = ret[1]
 
+        ### find which objects got split head before remove
+        for f in files:
+            obj_size=f[1]
+            num_copies=f[2]
+            split_head = calc_split_objs_count(obj_size, num_copies, config)
+            split_heads.append(split_head)
+            if split_head:
+                split_heads_count += 1
+
         # REMOVE some objects and update stats/expected
         src_record=0
         shared_manifest=0
-        valid_sha=0
+        valid_hash=0
         object_keys=[]
         files_sub=[]
         dedup_stats = Dedup_Stats()
-        for f in files:
+        for idx, f in enumerate(files):
             filename=f[0]
             obj_size=f[1]
             num_copies=f[2]
             num_remove=random.randint(0, num_copies)
             num_copies_2=num_copies-num_remove
-            log.debug("objects::%s::size=%d, num_copies=%d", filename, obj_size, num_copies_2);
             if num_copies_2:
-                if num_copies_2 > 1 and obj_size > RADOS_OBJ_SIZE:
-                    valid_sha += num_copies_2
+                split_head = calc_split_objs_count(obj_size, num_copies_2, config)
+                if num_copies_2 > 1 and (obj_size > RADOS_OBJ_SIZE or split_head):
+                    valid_hash += 1
                     src_record += 1
                     shared_manifest += (num_copies_2 - 1)
 
                 files_sub.append((filename, obj_size, num_copies_2))
                 calc_expected_stats(dedup_stats, obj_size, num_copies_2, config)
+            elif split_heads[idx]:
+                # we removed all copies of a split-head object
+                split_heads_count -= 1
+                split_heads_removed += 1
 
             start_idx=num_copies_2
             for i in range(start_idx, num_copies):
@@ -2380,8 +2920,7 @@ def test_dedup_inc_with_remove():
                 log.debug("Skiping file=%s, num_remove=%d", filename, num_remove)
                 continue
 
-            response=conn.delete_objects(Bucket=bucket_name,
-                                         Delete={"Objects": [{"Key": key} for key in object_keys]})
+            delete_objects(conn, bucket_name, object_keys)
 
         # must call garbage collection for predictable count
         result = admin(['gc', 'process', '--include-all'])
@@ -2393,17 +2932,17 @@ def test_dedup_inc_with_remove():
         dedup_stats.deduped_obj_bytes=0
         dedup_stats.skip_src_record=src_record
         dedup_stats.skip_shared_manifest=shared_manifest
-        dedup_stats.valid_hash=valid_sha
+        dedup_stats.valid_hash=valid_hash
         dedup_stats.invalid_hash=0
         dedup_stats.set_hash=0
 
         log.debug("test_dedup_inc_with_remove: incremental dedup:")
         log.debug("stats_base.size_before_dedup=%d", stats_base.size_before_dedup)
-        log.debug("dedup_stats.size_before_dedup=%d", dedup_stats.size_before_dedup)
         dry_run=False
         exec_dedup(dedup_stats, dry_run)
         expected_results=calc_expected_results(files_sub, config)
-        verify_objects(bucket_name, files_sub, conn, expected_results, config)
+        expected_results += split_heads_count
+        verify_objects(bucket_name, files_sub, conn, expected_results, config, True)
     finally:
         # cleanup must be executed even after a failure
         cleanup(bucket_name, conn)
@@ -2462,7 +3001,6 @@ def test_dedup_multipart():
 
     simple_dedup(conn, files, bucket_name, True, default_config, False)
 
-
 #-------------------------------------------------------------------------------
 @pytest.mark.basic_test
 def test_dedup_basic_with_tenants():
@@ -2497,10 +3035,12 @@ def test_dedup_basic():
     num_files=5
     base_size = MULTIPART_SIZE
     log.debug("generate files: base size=%d MiB, max_size=%d MiB",
-             base_size/MB, (pow(2, num_files) * base_size)/MB)
+              base_size/MB, (pow(2, num_files) * base_size)/MB)
     gen_files(files, base_size, num_files)
     log.debug("call simple_dedup()")
-    simple_dedup(conn, files, bucket_name, True, default_config, False)
+    run_cleanup_after=True
+    dry_run=False
+    simple_dedup(conn, files, bucket_name, run_cleanup_after, default_config, dry_run)
 
 
 #-------------------------------------------------------------------------------
@@ -2552,7 +3092,7 @@ def test_dedup_small_multipart():
 #-------------------------------------------------------------------------------
 @pytest.mark.basic_test
 def test_dedup_large_scale_with_tenants():
-    return
+    #return
 
     if full_dedup_is_disabled():
         return
@@ -2572,7 +3112,7 @@ def test_dedup_large_scale_with_tenants():
 #-------------------------------------------------------------------------------
 @pytest.mark.basic_test
 def test_dedup_large_scale():
-    return
+    #return
 
     if full_dedup_is_disabled():
         return
@@ -2592,7 +3132,7 @@ def test_dedup_large_scale():
 #-------------------------------------------------------------------------------
 @pytest.mark.basic_test
 def test_empty_bucket():
-    return
+    #return
 
     if full_dedup_is_disabled():
         return
@@ -2632,7 +3172,7 @@ def inc_step_with_tenants(stats_base, files, conns, bucket_names, config):
 
     # add new files
     num_files_new = 11
-    gen_files_in_range(files_combined, num_files_new, 2*MB, 32*MB)
+    gen_files_in_range(files_combined, num_files_new, 1*MB, 32*MB)
     pad_count = len(files_combined) - len(files)
     for i in range(0, pad_count):
         indices.append(0)
@@ -2646,7 +3186,8 @@ def inc_step_with_tenants(stats_base, files, conns, bucket_names, config):
     for f in files_combined:
         obj_size=f[1]
         num_copies=f[2]
-        if num_copies > 1 and obj_size > RADOS_OBJ_SIZE:
+        split_head = calc_split_objs_count(obj_size, num_copies, config)
+        if num_copies > 1 and (obj_size > RADOS_OBJ_SIZE or split_head):
             src_record += 1
 
     stats_combined.skip_shared_manifest = stats_base.deduped_obj
@@ -2655,15 +3196,15 @@ def inc_step_with_tenants(stats_base, files, conns, bucket_names, config):
     stats_combined.deduped_obj         -= stats_base.deduped_obj
     stats_combined.deduped_obj_bytes   -= stats_base.deduped_obj_bytes
 
-    stats_combined.valid_hash    = stats_base.set_hash
+    stats_combined.valid_hash    = stats_base.unique_obj
     stats_combined.invalid_hash -= stats_base.set_hash
-    stats_combined.set_hash     -= stats_base.set_hash
-
+    stats_combined.set_hash      = stats_combined.set_shared_manifest_src + stats_combined.deduped_obj
     log.debug("test_dedup_inc_2_with_tenants: incremental dedup:")
     # run dedup again
     dry_run=False
     exec_dedup(stats_combined, dry_run)
-    verify_objects_multi(files_combined, conns, bucket_names, expected_results, config)
+    verify_objects_multi(files_combined, conns, bucket_names, expected_results,
+                         config, False)
 
     return (files_combined, stats_combined)
 
@@ -2689,7 +3230,7 @@ def test_dedup_inc_loop_with_tenants():
         files=[]
         num_files = 13
         # gen_files_in_range creates 2-3 copies
-        gen_files_in_range(files, num_files, 1*MB, 64*MB)
+        gen_files_in_range(files, num_files, 256*KB, 64*MB)
         # upload objects, dedup, verify, but don't cleanup
         ret=simple_dedup_with_tenants(files, conns, bucket_names, config)
         stats_base=ret[1]
@@ -2699,9 +3240,10 @@ def test_dedup_inc_loop_with_tenants():
             files=ret[0]
             stats_last=ret[1]
             stats_base.set_shared_manifest_src += stats_last.set_shared_manifest_src
-            stats_base.deduped_obj       += stats_last.deduped_obj
-            stats_base.deduped_obj_bytes += stats_last.deduped_obj_bytes
-            stats_base.set_hash          += stats_last.set_hash
+            stats_base.unique_obj          += stats_last.set_shared_manifest_src
+            stats_base.deduped_obj         += stats_last.deduped_obj
+            stats_base.deduped_obj_bytes   += stats_last.deduped_obj_bytes
+            stats_base.set_hash            += stats_last.set_hash
     finally:
         # cleanup must be executed even after a failure
         cleanup_all_buckets(bucket_names, conns)
@@ -2718,8 +3260,8 @@ def test_dedup_dry_small_with_tenants():
     prepare_test()
     max_copies_count=3
     files=[]
-    num_files=10 # [4KB-4MB]
-    base_size = 4*KB
+    num_files=5 # [1KB-32KB]
+    base_size = 1*KB
     log.debug("generate files: base size=%d KiB, max_size=%d KiB",
              base_size/KB, (pow(2, num_files) * base_size)/KB)
     try:
@@ -2762,10 +3304,10 @@ def test_dedup_dry_multipart():
 
     num_files=8
     min_size=MULTIPART_SIZE
-    #gen_files_in_range(files, num_files, min_size, 1024*MB)
+    # create files in range [MULTIPART_SIZE, 128MB] aligned on RADOS_OBJ_SIZE
     gen_files_in_range(files, num_files, min_size, 128*MB)
 
-    # add files in range [MULTIPART_SIZE, 4*MULTIPART_SIZE] aligned on MULTIPART_SIZE
+    # add files in range [MULTIPART_SIZE, 8*MULTIPART_SIZE] aligned on MULTIPART_SIZE
     gen_files_in_range(files, num_files, min_size, min_size*8, MULTIPART_SIZE)
 
     # add file with exactly MULTIPART_SIZE
@@ -2862,7 +3404,8 @@ def test_dedup_dry_small_large_mix():
             conns[i].create_bucket(Bucket=bucket_names[i])
 
         indices = [0] * len(files)
-        ret=procs_upload_objects(files, conns, bucket_names, indices, default_config)
+        check_obj_count=True
+        ret=procs_upload_objects(files, conns, bucket_names, indices, default_config, check_obj_count)
         upload_time_sec = (time.time_ns() - start) / (1000*1000*1000)
         expected_results = ret[0]
         dedup_stats = ret[1]
@@ -2870,8 +3413,6 @@ def test_dedup_dry_small_large_mix():
         log.debug("obj_count=%d, upload_time=%d(sec)", s3_objects_total,
                  upload_time_sec)
         exec_dedup(dedup_stats, dry_run)
-        if dry_run == False:
-            verify_objects(bucket_name, files, conn, expected_results, default_config)
     finally:
         # cleanup must be executed even after a failure
         cleanup_all_buckets(bucket_names, conns)
@@ -2946,9 +3487,19 @@ def test_dedup_dry_large_scale_with_tenants():
     size=1*KB
     files=[]
     config=TransferConfig(multipart_threshold=size, multipart_chunksize=1*MB)
-    log.debug("test_dedup_dry_large_scale_with_tenants: connect to AWS ...")
+    log.info("test_dedup_dry_large_scale: connect to AWS ...")
     gen_files_fixed_size(files, num_files, size, max_copies_count)
-    threads_dedup_basic_with_tenants_common(files, num_threads, config, True)
+    conns=get_connections(num_threads)
+    bucket_names=get_buckets(num_threads)
+    for i in range(num_threads):
+        conns[i].create_bucket(Bucket=bucket_names[i])
+    try:
+        threads_simple_dedup_with_tenants(files, conns, bucket_names, config, True)
+    except Exception:
+        log.warning("test_dedup_dry_large_scale: failed!!")
+    finally:
+        # cleanup must be executed even after a failure
+        cleanup_all_buckets(bucket_names, conns)
 
 
 #-------------------------------------------------------------------------------
@@ -2957,25 +3508,26 @@ def test_dedup_dry_large_scale():
     #return
 
     prepare_test()
-    max_copies_count=3
-    num_threads=64
-    num_files=32*1024
+    bucket_name = gen_bucket_name()
+    max_copies_count=2
+    num_files=2*1024
     size=1*KB
     files=[]
     config=TransferConfig(multipart_threshold=size, multipart_chunksize=1*MB)
-    log.debug("test_dedup_dry_large_scale_new: connect to AWS ...")
-    gen_files_fixed_size(files, num_files, size, max_copies_count)
-    conns=get_connections(num_threads)
-    bucket_names=get_buckets(num_threads)
-    for i in range(num_threads):
-        conns[i].create_bucket(Bucket=bucket_names[i])
+    log.info("test_dedup_dry_large_scale: connect to AWS ...")
     try:
-        threads_simple_dedup_with_tenants(files, conns, bucket_names, config, True)
-    except:
-        log.warning("test_dedup_dry_large_scale: failed!!")
+        gen_files_fixed_size(files, num_files, size, max_copies_count)
+        indices=[0] * len(files)
+        conn=get_single_connection()
+        conn.create_bucket(Bucket=bucket_name)
+        check_obj_count=True
+        ret=upload_objects(bucket_name, files, indices, conn, config, check_obj_count)
+        dedup_stats = ret[1]
+        dry_run=True
+        exec_dedup(dedup_stats, dry_run, True)
     finally:
         # cleanup must be executed even after a failure
-        cleanup_all_buckets(bucket_names, conns)
+        cleanup(bucket_name, conn)
 
 
 #-------------------------------------------------------------------------------