]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
rgw/dedup: full object dedup continued work
authorGabriel BenHanokh <gbenhano@redhat.com>
Sun, 21 Jul 2024 11:38:57 +0000 (11:38 +0000)
committerGabriel BenHanokh <gbenhano@redhat.com>
Mon, 30 Jun 2025 11:10:59 +0000 (11:10 +0000)
Moved all control objects (EPOCH, WATCH, Tokens) to default.rgw.control
pool.
Added dedup_pool to RGWZoneParams to make the name unique across zones.
The rgw.dedup pool is created when dedup starts and removed when the scan
is over.

Report space that remains duplicated after dedup because of the head-object.
Report potential dedup for smaller objects (64KB-4MB).
Added tests for the new reporting facilities.

Signed-off-by: Gabriel BenHanokh <gbenhano@redhat.com>
(cherry picked from commit 7e6021f580b2d21fe4871957ab609c720afd0ca8)

13 files changed:
src/rgw/driver/rados/rgw_zone.h
src/rgw/rgw_dedup.cc
src/rgw/rgw_dedup.h
src/rgw/rgw_dedup_cluster.cc
src/rgw/rgw_dedup_cluster.h
src/rgw/rgw_dedup_store.cc
src/rgw/rgw_dedup_table.cc
src/rgw/rgw_dedup_table.h
src/rgw/rgw_dedup_utils.cc
src/rgw/rgw_dedup_utils.h
src/rgw/rgw_zone.cc
src/test/rgw/dedup/pytest.ini
src/test/rgw/dedup/test_dedup.py

index 5fb2b4b809664b0e486418d24d14cf56410306c4..7860bad50f8a633c3c18d3023db876c3ffbb3395 100644 (file)
@@ -117,6 +117,7 @@ struct RGWZoneParams : RGWSystemMetaObj {
   rgw_pool topics_pool;
   rgw_pool account_pool;
   rgw_pool group_pool;
+  rgw_pool dedup_pool;
 
   RGWAccessKey system_key;
 
@@ -153,7 +154,7 @@ struct RGWZoneParams : RGWSystemMetaObj {
   const std::string& get_compression_type(const rgw_placement_rule& placement_rule) const;
   
   void encode(bufferlist& bl) const override {
-    ENCODE_START(15, 1, bl);
+    ENCODE_START(16, 1, bl);
     encode(domain_root, bl);
     encode(control_pool, bl);
     encode(gc_pool, bl);
@@ -182,11 +183,12 @@ struct RGWZoneParams : RGWSystemMetaObj {
     encode(topics_pool, bl);
     encode(account_pool, bl);
     encode(group_pool, bl);
+    encode(dedup_pool, bl);
     ENCODE_FINISH(bl);
   }
 
   void decode(bufferlist::const_iterator& bl) override {
-    DECODE_START(15, bl);
+    DECODE_START(16, bl);
     decode(domain_root, bl);
     decode(control_pool, bl);
     decode(gc_pool, bl);
@@ -264,6 +266,11 @@ struct RGWZoneParams : RGWSystemMetaObj {
       account_pool = name + ".rgw.meta:accounts";
       group_pool = name + ".rgw.meta:groups";
     }
+    if (struct_v >= 16) {
+      decode(dedup_pool, bl);
+    } else {
+      dedup_pool = name + ".rgw.dedup";
+    }
     DECODE_FINISH(bl);
   }
   void dump(Formatter *f) const;
index 0db80229ff14d783980feef1d65f4a099d5202d7..7bb44ecb98968188adad34044914505e6fec3fe8 100644 (file)
@@ -216,7 +216,149 @@ namespace rgw::dedup {
   // rgw::dedup::Background
   //===========================================================================
   //---------------------------------------------------------------------------
-  int Background::init_rados_access_handles()
+  static void display_ioctx_state(const DoutPrefixProvider *dpp,
+                                  const librados::IoCtx &ioctx,
+                                  const char *caller)
+  {
+    if (ioctx.is_valid()) {
+      ldpp_dout(dpp, 5) << caller << "::valid ioctx, instance_id="
+                        << ioctx.get_instance_id() << dendl;
+    }
+    else {
+      ldpp_dout(dpp, 5) << caller << "::invalid ioctx" << dendl;
+    }
+  }
+
+  //---------------------------------------------------------------------------
+  static int safe_pool_delete(rgw::sal::RadosStore     *store,
+                              const DoutPrefixProvider *dpp,
+                              int64_t                   expected_pool_id)
+  {
+    const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool;
+    auto rados_handle = store->getRados()->get_rados_handle();
+    int64_t pool_id = rados_handle->pool_lookup(dedup_pool.name.c_str());
+    if (pool_id < 0) {
+      int err = pool_id;
+      if (err == ENOENT) {
+        ldpp_dout(dpp, 10) <<__func__ << "::pool doesn't exist (probably was removed by other RGW)::"
+                           << dedup_pool.name << "::expected_pool_id="
+                           << expected_pool_id << dendl;
+      }
+      else {
+        ldpp_dout(dpp, 5) <<__func__ << "::failed pool_lookup(" << dedup_pool.name
+                          << ") err=" << cpp_strerror(-err) << dendl;
+      }
+      return err;
+    }
+
+    if (pool_id != expected_pool_id) {
+      ldpp_dout(dpp, 5) << __func__ << "::ERR: pool_id was changed from: "
+                        << expected_pool_id << " to: " << pool_id
+                        << " abort pool_delete() request!" << dendl;
+      // report Stale file handle
+      return -ESTALE;
+    }
+
+    ldpp_dout(dpp, 10) <<__func__ << "::calling delete pool(" << dedup_pool.name
+                       << ") pool_id=" << pool_id << dendl;
+    return rados_handle->pool_delete(dedup_pool.name.c_str());
+  }
+
+  //---------------------------------------------------------------------------
+  static int64_t create_pool(rgw::sal::RadosStore     *store,
+                             const DoutPrefixProvider *dpp,
+                             const std::string        &pool_name)
+  {
+#if 0
+    // using Replica-1 for the intermediate data
+    // since it can be regenerated in case of a failure
+    std::string replica_count(std::to_string(1));
+#else
+    // temporary solution until we find a way to disable the health warn on replica1
+    std::string replica_count(std::to_string(2));
+#endif
+    librados::bufferlist inbl;
+    std::string output;
+    std::string command = R"(
+    {
+      "prefix": "osd pool create",
+      "pool": ")" + pool_name +
+      R"(",
+      "pool_type": "replicated",
+      "size": )" + replica_count +
+      R"(
+    })";
+
+    auto rados_handle = store->getRados()->get_rados_handle();
+    int ret = rados_handle->mon_command(command, inbl, nullptr, &output);
+    if (output.length()) {
+      if (output != "pool 'rgw_dedup_pool' already exists") {
+        ldpp_dout(dpp, 10) << __func__ << "::" << output << dendl;
+      }
+    }
+    if (ret != 0 && ret != -EEXIST) {
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to create pool "
+                        << pool_name << " with: "
+                        << cpp_strerror(-ret) << ", ret=" << ret << dendl;
+      return ret;
+    }
+    const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool;
+    return rados_handle->pool_lookup(dedup_pool.name.c_str());
+  }
+
+  //---------------------------------------------------------------------------
+  static int init_dedup_pool_ioctx(rgw::sal::RadosStore     *store,
+                                   const DoutPrefixProvider *dpp,
+                                   bool                      create,
+                                   librados::IoCtx          &ioctx)
+  {
+    const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool;
+    std::string pool_name(dedup_pool.name.c_str());
+    auto rados_handle = store->getRados()->get_rados_handle();
+    int64_t pool_id = rados_handle->pool_lookup(dedup_pool.name.c_str());
+    if (pool_id >= 0) {
+      // TBD: what to do when create option is passed
+      ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name
+                         << " already exists, pool_id=" << pool_id << dendl;
+    }
+    else if (create) {
+      pool_id = create_pool(store, dpp, pool_name);
+      if (pool_id >= 0) {
+        ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name
+                           << " was created, pool_id=" << pool_id << dendl;
+      }
+      else {
+        return pool_id;
+      }
+    }
+    else {
+      ldpp_dout(dpp, 1) << __func__
+                        << "::ERR: pool doesn't exist and no create option" << dendl;
+      return -ENOENT;
+    }
+
+    int ret = rgw_init_ioctx(dpp, rados_handle, dedup_pool, ioctx);
+    if (unlikely(ret < 0)) {
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed rgw_init_ioctx() ret=" << ret
+                        << "::" << cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+
+    ret = ioctx.application_enable("rgw_dedup", false);
+    if (ret == 0) {
+      ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name
+                         << " was associated with dedup app" << dendl;
+    }
+    else {
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to enable pool "
+                        << dedup_pool.name << " with: "
+                        << cpp_strerror(-ret) << ", ret=" << ret << dendl;
+    }
+    return ret;
+  }
+
+  //---------------------------------------------------------------------------
+  int Background::init_rados_access_handles(bool init_pool)
   {
     store = dynamic_cast<rgw::sal::RadosStore*>(driver);
     if (!store) {
@@ -227,11 +369,12 @@ namespace rgw::dedup {
 
     rados = store->getRados();
     rados_handle = rados->get_rados_handle();
-
-    int ret = init_dedup_pool_ioctx(rados, dpp, d_dedup_cluster_ioctx);
-    ldpp_dout(dpp, 5) << __func__ << "::dedup background: ioctx="
-                      << d_dedup_cluster_ioctx.get_instance_id() << dendl;
-    return ret;
+    if (init_pool) {
+      int ret = init_dedup_pool_ioctx(store, dpp, true, d_dedup_cluster_ioctx);
+      display_ioctx_state(dpp, d_dedup_cluster_ioctx, __func__);
+      return ret;
+    }
+    return 0;
   }
 
   //---------------------------------------------------------------------------
@@ -247,11 +390,11 @@ namespace rgw::dedup {
     d_head_object_size = cct->_conf->rgw_max_chunk_size;
     //ceph_assert(4*1024*1024 == d_head_object_size);
 
-    int ret = init_rados_access_handles();
+    int ret = init_rados_access_handles(false);
     if (ret != 0) {
       derr << __func__ << "::ERR: failed init_rados_access_handles() ret="
            << ret << "::" << cpp_strerror(-ret) << dendl;
-      throw std::runtime_error("Failed init_dedup_pool_ioctx()");
+      throw std::runtime_error("Failed init_rados_access_handles()");
     }
 
     d_heart_beat_last_update = ceph_clock_now();
@@ -550,7 +693,7 @@ namespace rgw::dedup {
     }
     int ret = rgw_init_ioctx(dpp, rados->get_rados_handle(), data_pool, *p_ioctx);
     if (ret < 0) {
-      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to get ioxtc from data pool:"
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to get ioctx from data pool:"
                         << data_pool.to_str() << dendl;
       return -EIO;
     }
@@ -713,7 +856,7 @@ namespace rgw::dedup {
       }
 
       if (oid == raw_obj.oid) {
-        ldpp_dout(dpp, 10) << __func__ << "::manifest: head object=" << oid << dendl;
+        ldpp_dout(dpp, 20) << __func__ << "::manifest: head object=" << oid << dendl;
         head_ioctx = obj.ioctx;
       }
       bufferlist bl;
@@ -883,8 +1026,8 @@ namespace rgw::dedup {
     if (unlikely(should_print_debug)) {
       print_record(dpp, p_rec, old_block_id, old_rec_id, md5_shard);
     }
-
     p_stats->processed_objects ++;
+
     uint32_t size_4k_units = byte_size_to_disk_blocks(p_rec->s.obj_bytes_size);
     uint64_t ondisk_byte_size = disk_blocks_to_byte_size(size_4k_units);
     storage_class_idx_t sc_idx = remapper->remap(p_rec->stor_class, dpp,
@@ -898,11 +1041,23 @@ namespace rgw::dedup {
     dedup_table_t::value_t src_val;
     int ret = p_table->get_val(&key_from_bucket_index, &src_val);
     if (ret != 0) {
-      // record has no valid entry in table because it is a singleton
-      p_stats->skipped_singleton++;
-      p_stats->skipped_singleton_bytes += ondisk_byte_size;
-      ldpp_dout(dpp, 20) << __func__ << "::skipped singleton::" << p_rec->bucket_name
-                         << "/" << p_rec->obj_name << std::dec << dendl;
+      if (ondisk_byte_size <= d_min_obj_size_for_dedup && p_rec->s.num_parts == 0) {
+        // record has no valid entry in table because it is a too small
+        // It was loaded to table for calculation and then purged
+        p_stats->skipped_purged_small++;
+        ldpp_dout(dpp, 20) << __func__ << "::skipped purged small obj::"
+                           << p_rec->obj_name << "::" << ondisk_byte_size << dendl;
+        // help small object tests pass - avoid complication differentiating between
+        // small objects ( < 64KB,  >= 64KB <= 4MB, > 4MB
+        p_stats->processed_objects--;
+      }
+      else {
+        // record has no valid entry in table because it is a singleton
+        p_stats->skipped_singleton++;
+        p_stats->skipped_singleton_bytes += ondisk_byte_size;
+        ldpp_dout(dpp, 20) << __func__ << "::skipped singleton::"
+                           << p_rec->obj_name << std::dec << dendl;
+      }
       return 0;
     }
 
@@ -1168,6 +1323,11 @@ namespace rgw::dedup {
     if (ret == 0) {
       p_stats->deduped_objects++;
       p_stats->deduped_objects_bytes += dedupable_objects_bytes;
+      if (p_tgt_rec->s.num_parts == 0) {
+        // single part objects duplicate the head object when dedup is used
+        p_stats->dup_head_bytes += d_head_object_size;
+      }
+
       // mark the SRC object as a providor of a shared manifest
       if (!src_val.has_shared_manifest()) {
         p_stats->set_shared_manifest_src++;
@@ -1390,7 +1550,9 @@ namespace rgw::dedup {
           p_worker_stats->ingress_skip_too_small_64KB++;
           p_worker_stats->ingress_skip_too_small_64KB_bytes += ondisk_byte_size;
         }
-        return 0;
+        else {
+          return 0;
+        }
       }
       else {
         // multipart objects are always good candidates for dedup
@@ -1421,8 +1583,8 @@ namespace rgw::dedup {
       ldpp_dout(dpp, 20) << __func__ << "::max_elapsed_sec="
                          << d_heart_beat_max_elapsed_sec << dendl;
       d_heart_beat_last_update = now;
-      d_cluster.update_shard_token_heartbeat(d_dedup_cluster_ioctx, shard_id,
-                                             count_a, count_b, prefix);
+      d_cluster.update_shard_token_heartbeat(store, shard_id, count_a, count_b,
+                                             prefix);
     }
   }
 
@@ -1585,17 +1747,18 @@ namespace rgw::dedup {
 
   //---------------------------------------------------------------------------
   static void display_table_stat_counters(const DoutPrefixProvider* dpp,
-                                          uint64_t obj_count_in_shard,
                                           const md5_stats_t *p_stats)
   {
+    uint64_t obj_count_in_shard = (p_stats->big_objs_stat.singleton_count +
+                                   p_stats->big_objs_stat.unique_count +
+                                   p_stats->big_objs_stat.duplicate_count);
+
     ldpp_dout(dpp, 10) << "\n>>>>>" << __func__ << "::FINISHED STEP_BUILD_TABLE\n"
                        << "::total_count="      << obj_count_in_shard
                        << "::loaded_objects="   << p_stats->loaded_objects
-                       << "::singleton_count="  << p_stats->singleton_count
-                       << "::unique_count="     << p_stats->unique_count << "\n"
-                       << "::duplicate_count="  << p_stats->duplicate_count
-                       << "::duplicated_bytes=" << p_stats->dedup_bytes_estimate
-                       << dendl;
+                       << p_stats->big_objs_stat << dendl;
+    ldpp_dout(dpp, 10) << __func__ << "::small objs::"
+                       << p_stats->small_objs_stat << dendl;
   }
 
   //---------------------------------------------------------------------------
@@ -1620,11 +1783,9 @@ namespace rgw::dedup {
         return -ECANCELED;
       }
     }
-    p_table->count_duplicates(&p_stats->singleton_count, &p_stats->unique_count,
-                              &p_stats->duplicate_count, &p_stats->dedup_bytes_estimate);
-    uint64_t obj_count_in_shard = (p_stats->singleton_count + p_stats->unique_count
-                                   + p_stats->duplicate_count);
-    display_table_stat_counters(dpp, obj_count_in_shard, p_stats);
+    p_table->count_duplicates(&p_stats->small_objs_stat, &p_stats->big_objs_stat,
+                              &p_stats->dup_head_bytes_estimate);
+    display_table_stat_counters(dpp, p_stats);
 
     ldpp_dout(dpp, 10) << __func__ << "::MD5 Loop::" << d_ctl.dedup_type << dendl;
     if (d_ctl.dedup_type != dedup_req_type_t::DEDUP_TYPE_FULL) {
@@ -1881,8 +2042,7 @@ namespace rgw::dedup {
                                                 &worker_stats,raw_mem, raw_mem_size);
     if (ret == 0) {
       worker_stats.duration = ceph_clock_now() - start_time;
-      d_cluster.mark_work_shard_token_completed(d_dedup_cluster_ioctx, worker_id,
-                                                &worker_stats);
+      d_cluster.mark_work_shard_token_completed(store, worker_id, &worker_stats);
       ldpp_dout(dpp, 10) << "stat counters [worker]:\n" << worker_stats << dendl;
       ldpp_dout(dpp, 10) << "Shard Process Duration   = "
                          << worker_stats.duration << dendl;
@@ -1906,8 +2066,7 @@ namespace rgw::dedup {
     int ret = objects_dedup_single_md5_shard(&table, md5_shard, &md5_stats, num_work_shards);
     if (ret == 0) {
       md5_stats.duration = ceph_clock_now() - start_time;
-      d_cluster.mark_md5_shard_token_completed(d_dedup_cluster_ioctx, md5_shard,
-                                               &md5_stats);
+      d_cluster.mark_md5_shard_token_completed(store, md5_shard, &md5_stats);
       ldpp_dout(dpp, 10) << "stat counters [md5]:\n" << md5_stats << dendl;
       ldpp_dout(dpp, 10) << "Shard Process Duration   = "
                          << md5_stats.duration << dendl;
@@ -1927,10 +2086,10 @@ namespace rgw::dedup {
       d_heart_beat_last_update = ceph_clock_now();
       uint16_t shard_id;
       if (ingress_work_shards) {
-        shard_id = d_cluster.get_next_work_shard_token(d_dedup_cluster_ioctx, num_work_shards);
+        shard_id = d_cluster.get_next_work_shard_token(store, num_work_shards);
       }
       else {
-        shard_id = d_cluster.get_next_md5_shard_token(d_dedup_cluster_ioctx, num_md5_shards);
+        shard_id = d_cluster.get_next_md5_shard_token(store, num_md5_shards);
       }
 
       // start with a common error handler
@@ -2063,8 +2222,16 @@ namespace rgw::dedup {
     ldpp_dout(dpp, 5) << __func__ << "::obj_count=" <<d_all_buckets_obj_count
                       << "::num_md5_shards=" << num_md5_shards
                       << "::num_work_shards=" << num_work_shards << dendl;
-    ret = d_cluster.reset(store, d_dedup_cluster_ioctx, p_epoch, num_work_shards,
-                          num_md5_shards);
+    // init handles and create the dedup_pool
+    ret = init_rados_access_handles(true);
+    if (ret != 0) {
+      derr << "dedup_bg::resume() failed init_rados_access_handles() ret="
+           << ret << "::" << cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+    display_ioctx_state(dpp, d_dedup_cluster_ioctx, __func__);
+
+    ret = d_cluster.reset(store, p_epoch, num_work_shards, num_md5_shards);
     if (ret != 0) {
       ldpp_dout(dpp, 1) << __func__ << "::ERR: failed cluster.init()" << dendl;
       return ret;
@@ -2101,40 +2268,7 @@ namespace rgw::dedup {
   //---------------------------------------------------------------------------
   int Background::watch_reload(const DoutPrefixProvider* dpp)
   {
-    if (!d_dedup_cluster_ioctx.is_valid()) {
-      ldpp_dout(dpp, 1) << __func__
-                        << "::ERR: invalid pool handler (missing pool)" << dendl;
-      return -ENOENT;
-    }
-    ldpp_dout(dpp, 5) << "dedup_bg::watch_reload(): ioctx="
-                      << d_dedup_cluster_ioctx.get_instance_id() << dendl;
-    const std::string & oid = DEDUP_WATCH_OBJ;
-    // create the object to watch (object may already exist)
-    bool exclusive = true;
-    int ret = d_dedup_cluster_ioctx.create(oid, exclusive);
-    if (ret >= 0) {
-      ldpp_dout(dpp, 10) << "dedup_bg::watch_reload():" << oid
-                         << " was created!" << dendl;
-    }
-    else if (ret == -EEXIST) {
-      ldpp_dout(dpp, 5) << __func__ << "::"<< oid << " exists" << dendl;
-    }
-    else {
-      ldpp_dout(dpp, 1) << "dedup_bg::watch_reload(): failed ioctx.create("
-                        << oid << ") ret=" << ret << "::" << cpp_strerror(-ret) << dendl;
-      return ret;
-    }
-
-    ret = d_dedup_cluster_ioctx.watch2(oid, &d_watch_handle, &d_watcher_ctx);
-    if (ret < 0) {
-      ldpp_dout(dpp, 1) << "dedup_bg::watch_reload(): failed watch2() " << oid
-                        << ". error: " << cpp_strerror(-ret) << dendl;
-      d_watch_handle = 0;
-      return ret;
-    }
-    ldpp_dout(dpp, 5) << "dedup_bg::watch_reload(): Started watching "
-                      << oid << "::d_watch_handle=" << d_watch_handle << dendl;
-    return 0;
+    return cluster::watch_reload(store, dpp, &d_watch_handle, &d_watcher_ctx);
   }
 
   //---------------------------------------------------------------------------
@@ -2147,43 +2281,16 @@ namespace rgw::dedup {
       return 0;
     }
 
-    if (!d_dedup_cluster_ioctx.is_valid()) {
-      ldpp_dout(dpp, 1) << "dedup_bg::unwatch_reload(): "
-                        << "::ERR: invalid pool handler (missing pool)" << dendl;
-      return -ENOENT;
-    }
-
-    ldpp_dout(dpp, 5) << "dedup_bg::unwatch_reload(): ioctx="
-                      << d_dedup_cluster_ioctx.get_instance_id()
-                      << "::d_watch_handle=" << d_watch_handle << dendl;
-
-    const auto ret = d_dedup_cluster_ioctx.unwatch2(d_watch_handle);
-    if (ret < 0) {
-      ldpp_dout(dpp, 1) << "dedup_bg::unwatch_reload() failed unwatch2() "
-                        << DEDUP_WATCH_OBJ << "::" << cpp_strerror(-ret) << dendl;
-      return ret;
-    }
-    ldpp_dout(dpp, 5) << "dedup_bg::unwatch_reload():Stopped watching "
-                      << DEDUP_WATCH_OBJ << "::d_watch_handle="
+    ldpp_dout(dpp, 5) << "dedup_bg::unwatch_reload(): watch_handle="
                       << d_watch_handle << dendl;
 
-    d_watch_handle = 0;
-    return 0;
-  }
-
-  //---------------------------------------------------------------------------
-  void Background::ack_notify(uint64_t notify_id, uint64_t cookie, int status)
-  {
-    if (!d_dedup_cluster_ioctx.is_valid()) {
-      ldpp_dout(dpp, 1) << __func__
-                        << "::ERR: invalid pool handler (missing pool)" << dendl;
-      return;
+    int ret = cluster::unwatch_reload(store, dpp, d_watch_handle);
+    if (ret == 0) {
+      ldpp_dout(dpp, 5) << "dedup_bg::unwatch_reload():Stopped watching "
+                        << "::d_watch_handle=" << d_watch_handle << dendl;
+      d_watch_handle = 0;
     }
-    ldpp_dout(dpp, 5) << __func__ << "::status=" << status << dendl;
-    bufferlist reply_bl;
-    ceph::encode(status, reply_bl);
-    encode(d_ctl, reply_bl);
-    d_dedup_cluster_ioctx.notify_ack(DEDUP_WATCH_OBJ, notify_id, cookie, reply_bl);
+    return ret;
   }
 
   //---------------------------------------------------------------------------
@@ -2207,7 +2314,7 @@ namespace rgw::dedup {
       cond_lock.unlock(); // close lock block------>]
       ldpp_dout(dpp, 5) << __func__
                         << "::system is paused/shutdown -> cancel notification" << dendl;
-      ack_notify(notify_id, cookie, -EBUSY);
+      cluster::ack_notify(store, dpp, &d_ctl, notify_id, cookie, -EBUSY);
       return;
     }
 
@@ -2266,7 +2373,7 @@ namespace rgw::dedup {
     }
 
     cond_lock.unlock(); // close lock block------>]
-    ack_notify(notify_id, cookie, ret);
+    cluster::ack_notify(store, dpp, &d_ctl, notify_id, cookie, ret);
   }
 
   //---------------------------------------------------------------------------
@@ -2275,7 +2382,7 @@ namespace rgw::dedup {
     const DoutPrefixProvider* const dpp = &dp;
     ldpp_dout(dpp, 10) <<  __FILE__ << "::" <<__func__ << dendl;
     {
-      std::unique_lock pause_lock(d_pause_mutex);
+      std::unique_lock pause_lock(d_cond_mutex);
       if (d_ctl.started) {
         // start the thread only once
         ldpp_dout(dpp, 1) << "dedup_bg already started" << dendl;
@@ -2303,6 +2410,8 @@ namespace rgw::dedup {
     d_cond.notify_all();
     ldpp_dout(dpp, 1) <<__func__ << "dedup_bg shutdown waiting..." << dendl;
     d_cond.wait(cond_lock, [this]{return d_ctl.shutdown_done;});
+    //cond_lock.unlock();
+
     if (nested_call) {
       ldpp_dout(dpp, 1) <<__func__ << "::nested call:: repeat notify" << dendl;
       d_cond.notify_all();
@@ -2323,8 +2432,7 @@ namespace rgw::dedup {
   //---------------------------------------------------------------------------
   void Background::pause()
   {
-    ldpp_dout(dpp, 5) << "dedup_bg->pause() request: ioctx="
-                      << d_dedup_cluster_ioctx.get_instance_id() << dendl;
+    display_ioctx_state(dpp, d_dedup_cluster_ioctx, "dedup_bg->pause() request");
     std::unique_lock cond_lock(d_cond_mutex);
 
     if (d_ctl.local_paused || d_ctl.shutdown_done) {
@@ -2371,14 +2479,14 @@ namespace rgw::dedup {
     }
 
     driver = _driver;
-    int ret = init_rados_access_handles();
+    // can pool change its uid between pause/resume ???
+    int ret = init_rados_access_handles(false);
     if (ret != 0) {
       derr << "dedup_bg::resume() failed init_rados_access_handles() ret="
            << ret << "::" << cpp_strerror(-ret) << dendl;
-      throw std::runtime_error("Failed init_dedup_pool_ioctx()");
+      throw std::runtime_error("Failed init_rados_access_handles()");
     }
-    ldpp_dout(dpp, 5) << __func__ << "::dedup background: ioctx="
-                      << d_dedup_cluster_ioctx.get_instance_id() << dendl;
+    display_ioctx_state(dpp, d_dedup_cluster_ioctx, "dedup_bg->resume() done");
     // create new watch request using the new pool handle
     watch_reload(dpp);
     d_ctl.local_pause_req = false;
@@ -2428,23 +2536,64 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  static bool all_shards_completed(cluster *p_cluster,
-                                   librados::IoCtx &ioctx,
-                                   work_shard_t num_work_shards,
-                                   uint64_t *p_total_ingressed)
+  void Background::work_shards_barrier(work_shard_t num_work_shards)
   {
-    return p_cluster->all_work_shard_tokens_completed(ioctx, num_work_shards,
-                                                      p_total_ingressed);
+    // Wait for other worker to finish ingress step
+    // We can move to the next step even if some token are in failed state
+    const unsigned MAX_WAIT_SEC = 120; // wait 2 minutes for failing members
+    unsigned ttl = 3;
+    unsigned time_elapsed = 0;
+
+    while (true) {
+      int ret = d_cluster.all_work_shard_tokens_completed(store, num_work_shards);
+      // we start incrementing time_elapsed only after all valid tokens finish
+      if (ret == 0 || (time_elapsed > MAX_WAIT_SEC) ) {
+        break;
+      }
+
+      ldpp_dout(dpp, 10) << __func__ << "::Wait for object ingress completion, ttl="
+                         << ttl << " seconds" << dendl;
+      std::unique_lock cond_lock(d_cond_mutex);
+      d_cond.wait_for(cond_lock, std::chrono::seconds(ttl),
+                      [this]{return d_ctl.should_stop() || d_ctl.should_pause();});
+      if (unlikely(d_ctl.should_pause())) {
+        handle_pause_req(__func__);
+      }
+      if (unlikely(d_ctl.should_stop())) {
+        return;
+      }
+
+      if (ret != -EAGAIN) {
+        // All incomplete tokens are corrupted or in time out state
+        // Give them an extra 120 seconds just in case ...
+        time_elapsed += ttl;
+      }
+      // else there are still good tokens in process, wait for them
+    }
+
+    ldpp_dout(dpp, 10) << "\n\n==Object Ingress step was completed on all shards==\n"
+                       << dendl;
+    if (unlikely(d_ctl.should_pause())) {
+      handle_pause_req(__func__);
+    }
   }
 
   //---------------------------------------------------------------------------
-  void Background::work_shards_barrier(work_shard_t num_work_shards)
+  static bool all_md5_shards_completed(cluster *p_cluster,
+                                       rgw::sal::RadosStore *store,
+                                       md5_shard_t num_md5_shards)
   {
-    // Wait for other worker to finish ingress step
-    unsigned ttl = 1;
-    uint64_t total_ingressed = 0;
-    while (!all_shards_completed(&d_cluster, d_dedup_cluster_ioctx, num_work_shards, &total_ingressed)) {
-      ldpp_dout(dpp, 10) << __func__ << "::Wait for object ingress completion, ttl="
+    return (p_cluster->all_md5_shard_tokens_completed(store, num_md5_shards) == 0);
+  }
+
+  //---------------------------------------------------------------------------
+  void Background::md5_shards_barrier(md5_shard_t num_md5_shards)
+  {
+    // Wait for others to finish step
+    unsigned ttl = 3;
+    // require that everything completed successfully before deleting the pool
+    while (!all_md5_shards_completed(&d_cluster, store, num_md5_shards)) {
+      ldpp_dout(dpp, 10) << __func__ << "::Wait for md5 completion, ttl="
                          << ttl << " seconds" << dendl;
       std::unique_lock cond_lock(d_cond_mutex);
       d_cond.wait_for(cond_lock, std::chrono::seconds(ttl),
@@ -2457,8 +2606,8 @@ namespace rgw::dedup {
       }
     }
 
-    ldpp_dout(dpp, 10) << "\n\n==Object Ingress step was completed on all shards! ("
-                       << total_ingressed << ")==\n" << dendl;
+    ldpp_dout(dpp, 10) << "\n\n==MD5 processing was completed on all shards!==\n"
+                       << dendl;
     if (unlikely(d_ctl.should_pause())) {
       handle_pause_req(__func__);
     }
@@ -2483,7 +2632,13 @@ namespace rgw::dedup {
       if (d_ctl.dedup_exec) {
         dedup_epoch_t epoch;
         if (setup(&epoch) != 0) {
-          ldpp_dout(dpp, 1) << "failed setup()" << dendl;
+          ldpp_dout(dpp, 1) << __func__ << "::failed setup()" << dendl;
+          return;
+        }
+        const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool;
+        int64_t pool_id = rados_handle->pool_lookup(dedup_pool.name.c_str());
+        if (pool_id < 0) {
+          ldpp_dout(dpp, 1) << __func__ << "::bad pool_id" << dendl;
           return;
         }
         work_shard_t num_work_shards = epoch.num_work_shards;
@@ -2505,9 +2660,11 @@ namespace rgw::dedup {
           // Wait for all other workers to finish ingress step
           work_shards_barrier(num_work_shards);
           if (!d_ctl.should_stop()) {
-            process_all_shards(false, &Background::f_dedup_md5_shard, raw_mem.get(), RAW_MEM_SIZE,
-                               num_work_shards, num_md5_shards);
-            ldpp_dout(dpp, 10) << "\n==DEDUP was completed on all shards! ==\n" << dendl;
+            process_all_shards(false, &Background::f_dedup_md5_shard, raw_mem.get(),
+                               RAW_MEM_SIZE, num_work_shards, num_md5_shards);
+            // Wait for all other md5 shards to finish
+            md5_shards_barrier(num_md5_shards);
+            safe_pool_delete(store, dpp, pool_id);
           }
           else {
             ldpp_dout(dpp, 5) <<__func__ << "::stop req from barrier" << dendl;
index 697f8028c666e9b71b9e35cc0edf9297295ae5b0..57ed0e824de5a98ae7d67758141b65ea96ff0444 100644 (file)
@@ -95,10 +95,10 @@ namespace rgw::dedup {
       STEP_REMOVE_DUPLICATES
     };
 
-    void ack_notify(uint64_t notify_id, uint64_t cookie, int status);
     void run();
     int  setup(struct dedup_epoch_t*);
     void work_shards_barrier(work_shard_t num_work_shards);
+    void md5_shards_barrier(md5_shard_t num_md5_shards);
     void handle_pause_req(const char* caller);
     const char* dedup_step_name(dedup_step_t step);
     int  read_buckets();
@@ -216,7 +216,7 @@ namespace rgw::dedup {
                      bool                 is_shared_manifest_src);
 #endif
     int  remove_slabs(unsigned worker_id, unsigned md5_shard, uint32_t slab_count);
-    int  init_rados_access_handles();
+    int  init_rados_access_handles(bool init_pool);
 
     // private data members
     rgw::sal::Driver* driver = nullptr;
@@ -244,7 +244,6 @@ namespace rgw::dedup {
 
     std::thread d_runner;
     std::mutex  d_cond_mutex;
-    std::mutex  d_pause_mutex;
     std::condition_variable d_cond;
   };
 
index 53c24b13acc2c986d54c3ff81e6a19cfb52898d1..f18de129a5a54a89ec42484d10ece01b53f1579d 100644 (file)
 
 namespace rgw::dedup {
   const char* DEDUP_EPOCH_TOKEN = "EPOCH_TOKEN";
+  const char* DEDUP_WATCH_OBJ = "DEDUP_WATCH_OBJ";
 
   static constexpr unsigned EPOCH_MAX_LOCK_DURATION_SEC = 30;
   struct shard_progress_t;
-  static int collect_shard_stats(librados::IoCtx &ioctx,
+  static int collect_shard_stats(rgw::sal::RadosStore *store,
                                  const DoutPrefixProvider *dpp,
                                  utime_t epoch_time,
                                  unsigned shards_count,
@@ -51,14 +52,35 @@ namespace rgw::dedup {
   const char* SHARD_PROGRESS_ATTR = "shard_progress";
 
   //---------------------------------------------------------------------------
-  static int get_epoch(librados::IoCtx &ioctx,
+  static int get_control_ioctx(rgw::sal::RadosStore     *store,
+                               const DoutPrefixProvider *dpp,
+                               librados::IoCtx &ctl_ioctx /* OUT-PARAM */)
+  {
+    const auto& control_pool = store->svc()->zone->get_zone_params().control_pool;
+    auto rados_handle = store->getRados()->get_rados_handle();
+    int ret = rgw_init_ioctx(dpp, rados_handle, control_pool, ctl_ioctx);
+    if (unlikely(ret < 0)) {
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed rgw_init_ioctx() for control_pool ret="
+                        << ret << "::" << cpp_strerror(-ret) << dendl;
+    }
+    return ret;
+  }
+
+  //---------------------------------------------------------------------------
+  static int get_epoch(rgw::sal::RadosStore     *store,
                        const DoutPrefixProvider *dpp,
                        dedup_epoch_t *p_epoch, /* OUT */
                        const char *caller)
   {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
     std::string oid(DEDUP_EPOCH_TOKEN);
     bufferlist bl;
-    int ret = ioctx.getxattr(oid, RGW_DEDUP_ATTR_EPOCH, bl);
+    ret = ctl_ioctx.getxattr(oid, RGW_DEDUP_ATTR_EPOCH, bl);
     if (ret > 0) {
       try {
         auto p = bl.cbegin();
@@ -78,23 +100,29 @@ namespace rgw::dedup {
         ret = -ENODATA;
       }
       ldpp_dout(dpp, 10) << __func__ << "::" << (caller ? caller : "")
-                         << "::failed ioctx.getxattr() with: "
+                         << "::failed ctl_ioctx.getxattr() with: "
                          << cpp_strerror(-ret) << ", ret=" << ret << dendl;
       return ret;
     }
   }
 
   //---------------------------------------------------------------------------
-  static int set_epoch(librados::IoCtx &ioctx,
+  static int set_epoch(rgw::sal::RadosStore *store,
                        const std::string &cluster_id,
                        const DoutPrefixProvider *dpp,
                        work_shard_t num_work_shards,
                        md5_shard_t num_md5_shards)
   {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
     std::string oid(DEDUP_EPOCH_TOKEN);
     ldpp_dout(dpp, 10) << __func__ << "::oid=" << oid << dendl;
     bool exclusive = true; // block overwrite of old objects
-    int ret = ioctx.create(oid, exclusive);
+    ret = ctl_ioctx.create(oid, exclusive);
     if (ret >= 0) {
       ldpp_dout(dpp, 10) << __func__ << "::successfully created Epoch object!" << dendl;
       // now try and take ownership
@@ -120,7 +148,7 @@ namespace rgw::dedup {
     op.setxattr(RGW_DEDUP_ATTR_EPOCH, new_epoch_bl);
 
     ldpp_dout(dpp, 10) << __func__ << "::send EPOCH CLS" << dendl;
-    ret = ioctx.operate(oid, &op);
+    ret = ctl_ioctx.operate(oid, &op);
     if (ret == 0) {
       ldpp_dout(dpp, 10) << __func__ << "::Epoch object was written" << dendl;
     }
@@ -128,27 +156,33 @@ namespace rgw::dedup {
     // probably best to read attribute from epoch!
     else if (ret == -ECANCELED) {
       dedup_epoch_t epoch;
-      ret = get_epoch(ioctx, dpp, &epoch, __func__);
+      ret = get_epoch(store, dpp, &epoch, __func__);
       if (ret == 0) {
         ldpp_dout(dpp, 10) << __func__ << "::Accept existing Epoch object" << dendl;
       }
       return ret;
     }
     else {
-      ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ioctx.operate("
+      ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ctl_ioctx.operate("
                         << oid << "), err is " << cpp_strerror(-ret) << dendl;
     }
     return ret;
   }
 
   //---------------------------------------------------------------------------
-  static int swap_epoch(const DoutPrefixProvider *dpp,
-                        librados::IoCtx &ioctx,
+  static int swap_epoch(rgw::sal::RadosStore     *store,
+                        const DoutPrefixProvider *dpp,
                         const dedup_epoch_t *p_old_epoch,
                         dedup_req_type_t dedup_type,
                         work_shard_t num_work_shards,
                         md5_shard_t num_md5_shards)
   {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
     dedup_epoch_t new_epoch = { p_old_epoch->serial + 1, dedup_type,
                                 ceph_clock_now(), num_work_shards, num_md5_shards};
     bufferlist old_epoch_bl, new_epoch_bl, err_bl;
@@ -160,9 +194,9 @@ namespace rgw::dedup {
 
     ldpp_dout(dpp, 10) << __func__ << "::send EPOCH CLS" << dendl;
     std::string oid(DEDUP_EPOCH_TOKEN);
-    int ret = ioctx.operate(oid, &op);
+    ret = ctl_ioctx.operate(oid, &op);
     if (ret != 0) {
-      ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ioctx.operate("
+      ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ctl_ioctx.operate("
                         << oid << "), err is " << cpp_strerror(-ret) << dendl;
     }
 
@@ -193,13 +227,15 @@ namespace rgw::dedup {
       this->progress_a  = _progress_a;
       this->progress_b  = _progress_b;
       this->completed   = _completed;
-      this->update_time = ceph_clock_now();
+
+      utime_t now = ceph_clock_now();
+      this->update_time = now;
 
       if (_progress_a == SP_NO_OBJECTS && _progress_b == SP_NO_OBJECTS) {
-        this->creation_time = ceph_clock_now();
+        this->creation_time = now;
       }
       if (_completed) {
-        this->completion_time = ceph_clock_now();
+        this->completion_time = now;
       }
     }
 
@@ -213,6 +249,11 @@ namespace rgw::dedup {
         return false;
       }
     }
+
+    bool was_not_started() const {
+      return (this->creation_time == this->update_time);
+    }
+
     uint64_t    progress_a;
     uint64_t    progress_b;
     bool        completed;
@@ -223,6 +264,17 @@ namespace rgw::dedup {
     bufferlist  stats_bl;
   };
 
+  //---------------------------------------------------------------------------
+  std::ostream& operator<<(std::ostream &out, shard_progress_t& sp)
+  {
+    out << (sp.completed ? " + ::" : " - ::");
+    out << sp.owner << "::[" << sp.progress_a << ", " << sp.progress_b << "]";
+    out << "::creation: " << sp.creation_time;
+    out << "::update: " << sp.update_time;
+    out << "::completion: " << sp.completion_time;
+    return out;
+  }
+
   //---------------------------------------------------------------------------
   void encode(const shard_progress_t& sp, ceph::bufferlist& bl)
   {
@@ -253,66 +305,6 @@ namespace rgw::dedup {
     DECODE_FINISH(bl);
   }
 
-  //---------------------------------------------------------------------------
-  int init_dedup_pool_ioctx(RGWRados                 *rados,
-                            const DoutPrefixProvider *dpp,
-                            librados::IoCtx          &ioctx)
-  {
-    rgw_pool dedup_pool(DEDUP_POOL_NAME);
-    std::string pool_name(DEDUP_POOL_NAME);
-#if 0
-    // using Replica-1 for the intermediate data
-    // since it can be regenerated in case of a failure
-    std::string replica_count(std::to_string(1));
-#else
-    // temporary solution until we find a way to disable the health warn on replica1
-    std::string replica_count(std::to_string(2));
-#endif
-    librados::bufferlist inbl;
-    std::string output;
-    std::string command = R"(
-    {
-      "prefix": "osd pool create",
-      "pool": ")" + pool_name +
-      R"(",
-      "pool_type": "replicated",
-      "size": )" + replica_count +
-      R"(
-    })";
-
-    auto rados_handle = rados->get_rados_handle();
-    int ret = rados_handle->mon_command(command, inbl, nullptr, &output);
-    if (output.length()) {
-      if (output != "pool 'rgw_dedup_pool' already exists") {
-        ldpp_dout(dpp, 10) << __func__ << "::" << output << dendl;
-      }
-    }
-    if (ret != 0 && ret != -EEXIST) {
-      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to create pool "
-                        << DEDUP_POOL_NAME << " with: "
-                        << cpp_strerror(-ret) << ", ret=" << ret << dendl;
-      return ret;
-    }
-
-    ret = rgw_init_ioctx(dpp, rados_handle, dedup_pool, ioctx);
-    if (ret < 0) {
-      ldpp_dout(dpp, 1) << __func__ << "::failed to initialize pool for listing with: "
-                        << cpp_strerror(-ret) << dendl;
-    }
-
-    ret = ioctx.application_enable("dedup", false);
-    if (ret == 0) {
-      ldpp_dout(dpp, 10) << __func__ << "::pool " << DEDUP_POOL_NAME
-                         << " was associated with dedup app" << dendl;
-    }
-    else {
-      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to enable pool "
-                        << DEDUP_POOL_NAME << " with: "
-                        << cpp_strerror(-ret) << ", ret=" << ret << dendl;
-    }
-    return ret;
-  }
-
   //==========================================================================
 
   //---------------------------------------------------------------------------
@@ -326,9 +318,6 @@ namespace rgw::dedup {
 
     memset(d_completed_workers, TOKEN_STATE_PENDING, sizeof(d_completed_workers));
     memset(d_completed_md5, TOKEN_STATE_PENDING, sizeof(d_completed_md5));
-
-    d_total_ingressed_obj = 0;
-    d_num_failed_workers = 0;
   }
 
 
@@ -343,31 +332,10 @@ namespace rgw::dedup {
     d_cluster_id (gen_rand_alphanumeric(cct, CLUSTER_ID_LEN))
   {
     clear();
-
-    auto store = dynamic_cast<rgw::sal::RadosStore*>(driver);
-    if (!store) {
-      ldpp_dout(dpp, 0) << "ERR: failed dynamic_cast to RadosStore" << dendl;
-      ceph_abort("non-rados backend");
-      return;
-    }
-
-    librados::IoCtx ioctx;
-    if (init_dedup_pool_ioctx(store->getRados(), dpp, ioctx) != 0) {
-      throw std::runtime_error("Failed init_dedup_pool_ioctx()");
-    }
-
-    // generate an empty epoch with zero counters
-    int ret = set_epoch(ioctx, d_cluster_id, dpp, 0, 0);
-    if (ret != 0) {
-      ldpp_dout(dpp, 1) << __func__ << "::failed set_epoch()! ret="
-                        << ret << "::" << cpp_strerror(-ret) << dendl;
-      throw std::runtime_error("Failed set_epoch()");
-    }
   }
 
   //---------------------------------------------------------------------------
   int cluster::reset(rgw::sal::RadosStore *store,
-                     librados::IoCtx &ioctx,
                      dedup_epoch_t *p_epoch,
                      work_shard_t num_work_shards,
                      md5_shard_t num_md5_shards)
@@ -377,7 +345,7 @@ namespace rgw::dedup {
     clear();
 
     while (true) {
-      int ret = get_epoch(ioctx, dpp, p_epoch, __func__);
+      int ret = get_epoch(store, dpp, p_epoch, __func__);
       if (ret != 0) {
         return ret;
       }
@@ -391,7 +359,7 @@ namespace rgw::dedup {
         break;
       }
       else {
-        ret = swap_epoch(dpp, ioctx, p_epoch,
+        ret = swap_epoch(store, dpp, p_epoch,
                          static_cast<dedup_req_type_t> (p_epoch->dedup_type),
                          num_work_shards, num_md5_shards);
       }
@@ -402,27 +370,33 @@ namespace rgw::dedup {
     const unsigned RETRY_LIMIT = 3;
     int ret = 1;
     for (unsigned i = 0; i < RETRY_LIMIT && ret != 0; i++) {
-      ret = cleanup_prev_run(ioctx);
+      ret = cleanup_prev_run(store);
     }
     if (ret != 0) {
       return ret;
     }
 
-    create_shard_tokens(ioctx, p_epoch->num_work_shards, WORKER_SHARD_PREFIX);
-    create_shard_tokens(ioctx, p_epoch->num_md5_shards, MD5_SHARD_PREFIX);
+    create_shard_tokens(store, p_epoch->num_work_shards, WORKER_SHARD_PREFIX);
+    create_shard_tokens(store, p_epoch->num_md5_shards, MD5_SHARD_PREFIX);
 
-    ret = verify_all_shard_tokens(ioctx, p_epoch->num_work_shards,
+    ret = verify_all_shard_tokens(store, p_epoch->num_work_shards,
                                   WORKER_SHARD_PREFIX);
     if (ret != 0) {
       return ret;
     }
-    return verify_all_shard_tokens(ioctx, p_epoch->num_md5_shards,
+    return verify_all_shard_tokens(store, p_epoch->num_md5_shards,
                                    MD5_SHARD_PREFIX);
   }
 
   //---------------------------------------------------------------------------
-  int cluster::cleanup_prev_run(librados::IoCtx &ioctx)
+  int cluster::cleanup_prev_run(rgw::sal::RadosStore *store)
   {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
     int error_code = 0;
     constexpr uint32_t max = 100;
     std::string marker;
@@ -432,7 +406,7 @@ namespace rgw::dedup {
     unsigned failed_count  = 0, no_entry_count = 0;
     do {
       std::vector<std::string> oids;
-      int ret = rgw_list_pool(dpp, ioctx, max, filter, marker, &oids, &truncated);
+      int ret = rgw_list_pool(dpp, ctl_ioctx, max, filter, marker, &oids, &truncated);
       if (ret == -ENOENT) {
         ldpp_dout(dpp, 10) << __func__ << "::rgw_list_pool() ret == -ENOENT"<< dendl;
         break;
@@ -444,14 +418,15 @@ namespace rgw::dedup {
       }
 
       for (const std::string& oid : oids) {
-        if (oid == DEDUP_WATCH_OBJ || oid == DEDUP_EPOCH_TOKEN) {
+        if (shard_token_oid::legal_oid_name(oid) == false) {
           ldpp_dout(dpp, 10) << __func__ << "::skipping " << oid << dendl;
           skipped_count++;
           continue;
         }
+
         uint64_t size;
         struct timespec tspec;
-        ret = ioctx.stat2(oid, &size, &tspec);
+        ret = ctl_ioctx.stat2(oid, &size, &tspec);
         if (ret == -ENOENT) {
           ldpp_dout(dpp, 20) << __func__ << "::" << oid
                              << " was removed by others" << dendl;
@@ -459,7 +434,8 @@ namespace rgw::dedup {
           continue;
         }
         else if (ret != 0) {
-          ldpp_dout(dpp, 10) << __func__ << "::failed ioctx.stat( " << oid << " )" << dendl;
+          ldpp_dout(dpp, 10) << __func__ << "::failed ctl_ioctx.stat( "
+                             << oid << " )" << dendl;
           error_code = ret;
           failed_count++;
           continue;
@@ -473,7 +449,7 @@ namespace rgw::dedup {
           continue;
         }
         ldpp_dout(dpp, 10) << __func__ << "::removing object: " << oid << dendl;
-        ret = ioctx.remove(oid);
+        ret = ctl_ioctx.remove(oid);
         if (ret == 0) {
           deleted_count++;
         }
@@ -486,42 +462,48 @@ namespace rgw::dedup {
         else {
           error_code = ret;
           failed_count++;
-          ldpp_dout(dpp, 10) << __func__ << "::failed ioctx.remove( " << oid
+          ldpp_dout(dpp, 10) << __func__ << "::failed ctl_ioctx.remove( " << oid
                              << " ), ret=" << ret << "::" << cpp_strerror(-ret) << dendl;
         }
       }
       ldpp_dout(dpp, 10) << __func__ << "::oids.size()=" << oids.size()
-                         << "::deleted=" << deleted_count
-                         << "::failed="  << failed_count
-                         << "::no entry="  << no_entry_count
-                         << "::skipped=" << skipped_count << dendl;
+                         << "::deleted="  << deleted_count
+                         << "::failed="   << failed_count
+                         << "::no entry=" << no_entry_count
+                         << "::skipped="  << skipped_count << dendl;
     } while (truncated);
 
     return error_code;
   }
 
   //---------------------------------------------------------------------------
-  int cluster::create_shard_tokens(librados::IoCtx &ioctx,
+  int cluster::create_shard_tokens(rgw::sal::RadosStore *store,
                                    unsigned shards_count,
                                    const char *prefix)
   {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
     shard_token_oid sto(prefix);
     for (unsigned shard = 0; shard < shards_count; shard++) {
       sto.set_shard(shard);
       std::string oid(sto.get_buff(), sto.get_buff_size());
       ldpp_dout(dpp, 15) << __func__ << "::creating object: " << oid << dendl;
       bool exclusive = true;
-      int ret = ioctx.create(oid, exclusive);
+      ret = ctl_ioctx.create(oid, exclusive);
       if (ret >= 0) {
         ldpp_dout(dpp, 15) << __func__ << "::oid=" << oid << " was created!" << dendl;
       }
       else if (ret == -EEXIST) {
-        ldpp_dout(dpp, 15) << __func__ << "::failed ioctx.create("
+        ldpp_dout(dpp, 15) << __func__ << "::failed ctl_ioctx.create("
                            << oid << ") -EEXIST!" << dendl;
       }
       else {
         // TBD: can it happen legally ?
-        ldpp_dout(dpp, 1) << __func__ << "::failed ioctx.create(" << oid
+        ldpp_dout(dpp, 1) << __func__ << "::failed ctl_ioctx.create(" << oid
                           << ") with: " << ret  << "::" << cpp_strerror(-ret) << dendl;
       }
     }
@@ -530,10 +512,16 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  int cluster::verify_all_shard_tokens(librados::IoCtx &ioctx,
+  int cluster::verify_all_shard_tokens(rgw::sal::RadosStore *store,
                                        unsigned shards_count,
                                        const char *prefix)
   {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
     shard_token_oid sto(prefix);
     for (unsigned shard = 0; shard < shards_count; shard++) {
       sto.set_shard(shard);
@@ -542,9 +530,9 @@ namespace rgw::dedup {
 
       uint64_t size;
       struct timespec tspec;
-      int ret = ioctx.stat2(oid, &size, &tspec);
+      ret = ctl_ioctx.stat2(oid, &size, &tspec);
       if (ret != 0) {
-        ldpp_dout(dpp, 5) << __func__ << "::failed ioctx.stat( " << oid << " )"
+        ldpp_dout(dpp, 5) << __func__ << "::failed ctl_ioctx.stat( " << oid << " )"
                           << "::shards_count=" << shards_count << dendl;
         return ret;
       }
@@ -554,12 +542,18 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  int cluster::update_shard_token_heartbeat(librados::IoCtx &ioctx,
+  int cluster::update_shard_token_heartbeat(rgw::sal::RadosStore *store,
                                             unsigned shard,
                                             uint64_t count_a,
                                             uint64_t count_b,
                                             const char *prefix)
   {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
     shard_token_oid sto(prefix, shard);
     std::string oid(sto.get_buff(), sto.get_buff_size());
     bufferlist empty_bl;
@@ -567,16 +561,22 @@ namespace rgw::dedup {
     sp.creation_time = d_token_creation_time;
     bufferlist sp_bl;
     encode(sp, sp_bl);
-    return ioctx.setxattr(oid, SHARD_PROGRESS_ATTR, sp_bl);
+    return ctl_ioctx.setxattr(oid, SHARD_PROGRESS_ATTR, sp_bl);
   }
 
   //---------------------------------------------------------------------------
-  int cluster::mark_shard_token_completed(librados::IoCtx &ioctx,
+  int cluster::mark_shard_token_completed(rgw::sal::RadosStore *store,
                                           unsigned shard,
                                           uint64_t obj_count,
                                           const char *prefix,
                                           const bufferlist &bl)
   {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
     shard_token_oid sto(prefix, shard);
     std::string oid(sto.get_buff(), sto.get_buff_size());
     ldpp_dout(dpp, 10) << __func__ << "::" << prefix << "::" << oid << dendl;
@@ -585,24 +585,31 @@ namespace rgw::dedup {
     sp.creation_time = d_token_creation_time;
     bufferlist sp_bl;
     encode(sp, sp_bl);
-    int ret = ioctx.setxattr(oid, SHARD_PROGRESS_ATTR, sp_bl);
+    ret = ctl_ioctx.setxattr(oid, SHARD_PROGRESS_ATTR, sp_bl);
     if (ret == 0) {
-      ldpp_dout(dpp, 10) << __func__ << "::Done ioctx.setxattr(" << oid << ")" << dendl;
+      ldpp_dout(dpp, 10) << __func__ << "::Done ctl_ioctx.setxattr(" << oid << ")"
+                         << dendl;
     }
     else {
-      ldpp_dout(dpp, 0) << __func__ << "::Failed ioctx.setxattr(" << oid << ") ret="
-                        << ret << "::" << cpp_strerror(-ret) << dendl;
+      ldpp_dout(dpp, 0) << __func__ << "::Failed ctl_ioctx.setxattr(" << oid
+                        << ") ret=" << ret << "::" << cpp_strerror(-ret) << dendl;
     }
 
     return ret;
   }
 
   //---------------------------------------------------------------------------
-  int32_t cluster::get_next_shard_token(librados::IoCtx &ioctx,
+  int32_t cluster::get_next_shard_token(rgw::sal::RadosStore *store,
                                         uint16_t start_shard,
                                         uint16_t max_shard,
                                         const char *prefix)
   {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
     // lock parameters:
     const utime_t     lock_duration;  // zero duration means lock doesn't expire
     const uint8_t     lock_flags = 0; // no flags
@@ -617,7 +624,7 @@ namespace rgw::dedup {
       op.assert_exists();
       rados::cls::lock::lock(&op, oid, ClsLockType::EXCLUSIVE, d_lock_cookie,
                              lock_tag, "dedup_shard_token", lock_duration, lock_flags);
-      int ret = rgw_rados_operate(dpp, ioctx, oid, std::move(op), null_yield);
+      ret = rgw_rados_operate(dpp, ctl_ioctx, oid, std::move(op), null_yield);
       if (ret == -EBUSY) {
         // someone else took this token -> move to the next one
         ldpp_dout(dpp, 10) << __func__ << "::Failed lock. " << oid <<
@@ -641,10 +648,9 @@ namespace rgw::dedup {
       bufferlist empty_bl;
       shard_progress_t sp(SP_NO_OBJECTS, SP_NO_OBJECTS, false, d_cluster_id, empty_bl);
       d_token_creation_time = sp.creation_time;
-
       bufferlist sp_bl;
       encode(sp, sp_bl);
-      ret = ioctx.setxattr(oid, SHARD_PROGRESS_ATTR, sp_bl);
+      ret = ctl_ioctx.setxattr(oid, SHARD_PROGRESS_ATTR, sp_bl);
       if (ret == 0) {
         ldpp_dout(dpp, 10) << __func__ << "::SUCCESS!::" << oid << dendl;
         return shard;
@@ -655,11 +661,11 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  work_shard_t cluster::get_next_work_shard_token(librados::IoCtx &ioctx,
+  work_shard_t cluster::get_next_work_shard_token(rgw::sal::RadosStore *store,
                                                   work_shard_t num_work_shards)
   {
-    int32_t shard = get_next_shard_token(ioctx, d_curr_worker_shard, num_work_shards,
-                                         WORKER_SHARD_PREFIX);
+    int32_t shard = get_next_shard_token(store, d_curr_worker_shard,
+                                         num_work_shards, WORKER_SHARD_PREFIX);
     if (shard >= 0 && shard < num_work_shards) {
       d_curr_worker_shard = shard + 1;
       return shard;
@@ -670,10 +676,10 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  md5_shard_t cluster::get_next_md5_shard_token(librados::IoCtx &ioctx,
+  md5_shard_t cluster::get_next_md5_shard_token(rgw::sal::RadosStore *store,
                                                 md5_shard_t num_md5_shards)
   {
-    int32_t shard = get_next_shard_token(ioctx, d_curr_md5_shard, num_md5_shards,
+    int32_t shard = get_next_shard_token(store, d_curr_md5_shard, num_md5_shards,
                                          MD5_SHARD_PREFIX);
     if (shard >= 0 && shard < num_md5_shards) {
       d_curr_md5_shard = shard + 1;
@@ -685,17 +691,23 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  bool cluster::all_shard_tokens_completed(librados::IoCtx &ioctx,
-                                           unsigned shards_count,
-                                           const char *prefix,
-                                           uint16_t *p_num_completed,
-                                           uint8_t completed_arr[],
-                                           uint64_t *p_total_ingressed)
+  int cluster::all_shard_tokens_completed(rgw::sal::RadosStore *store,
+                                          unsigned shards_count,
+                                          const char *prefix,
+                                          uint16_t *p_num_completed,
+                                          uint8_t completed_arr[])
   {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
+    int err_code = 0;
     unsigned count = 0;
     shard_token_oid sto(prefix);
     for (unsigned shard = 0; shard < shards_count; shard++) {
-      if (completed_arr[shard] != TOKEN_STATE_PENDING) {
+      if (completed_arr[shard] == TOKEN_STATE_COMPLETED) {
         count++;
         continue;
       }
@@ -704,12 +716,15 @@ namespace rgw::dedup {
       std::string oid(sto.get_buff(), sto.get_buff_size());
       ldpp_dout(dpp, 10) << __func__ << "::checking object: " << oid << dendl;
       bufferlist bl;
-      int ret = ioctx.getxattr(oid, SHARD_PROGRESS_ATTR, bl);
+      ret = ctl_ioctx.getxattr(oid, SHARD_PROGRESS_ATTR, bl);
       if (unlikely(ret <= 0)) {
         if (ret != -ENODATA) {
-          ldpp_dout(dpp, 10) << __func__ << "::failed ioctx.getxattr() ret="
+          ldpp_dout(dpp, 10) << __func__ << "::failed ctl_ioctx.getxattr() ret="
                              << ret << "::" << cpp_strerror(-ret) << dendl;
         }
+        completed_arr[shard] = TOKEN_STATE_CORRUPTED;
+        // all failures to get valid token state return ENODATA
+        err_code = -ENODATA;
         continue;
       }
 
@@ -719,50 +734,58 @@ namespace rgw::dedup {
         decode(sp, p);
       }
       catch (const buffer::error&) {
-        ldpp_dout(dpp, 0) << __func__ << "::failed shard_progress_t decode!" << dendl;
-        return false;
+        ldpp_dout(dpp, 1) << __func__ << "::failed shard_progress_t decode!" << dendl;
+        completed_arr[shard] = TOKEN_STATE_CORRUPTED;
+        // all failures to get valid token state return ENODATA
+        err_code = -ENODATA;
+        continue;
       }
 
-      if (sp.progress_b == SP_ALL_OBJECTS) {
-        ceph_assert(sp.completed);
+      if (sp.is_completed()) {
         utime_t duration = sp.completion_time - sp.creation_time;
         // mark token completed;
         (*p_num_completed)++;
         completed_arr[shard] = TOKEN_STATE_COMPLETED;
-        d_total_ingressed_obj += sp.progress_a;
         ldpp_dout(dpp, 20) << __func__ << "::" << oid
                            << "::completed! duration=" << duration << dendl;
         count++;
       }
+      else if (sp.was_not_started()) {
+        // token was not started yet
+        // TBD:
+        // If it is not locked we can process it (but why was it skipped?)
+        // If locked, check when it was done and if timed-out
+        ldpp_dout(dpp, 10) << __func__ << "::" << oid
+                           << "::was not started, skipping" << dendl;
+        return -EAGAIN;
+      }
       else {
         static const utime_t heartbeat_timeout(EPOCH_MAX_LOCK_DURATION_SEC, 0);
-        utime_t time_elapsed = sp.update_time - sp.creation_time;
+        utime_t time_elapsed = ceph_clock_now() - sp.update_time;
         if (time_elapsed > heartbeat_timeout) {
           // lock expired -> try and break lock
-          ldpp_dout(dpp, 0) << __func__ << "::" << oid << "::expired lock, skipping" << dendl;
+          ldpp_dout(dpp, 5) << __func__ << "::" << oid
+                            << "::expired lock, skipping:" << time_elapsed
+                            << "::" << sp << dendl;
           completed_arr[shard] = TOKEN_STATE_TIMED_OUT;
-          d_num_failed_workers++;
+          err_code = -ETIME;
           continue;
         }
         else {
-          return false;
+          return -EAGAIN;
         }
-        // TBD: need to store copies and declare token with no progress for N seconds
-        // as failing and then skip it
-        return false;
       }
     } // loop
 
-    *p_total_ingressed = d_total_ingressed_obj;
     if (count < shards_count) {
       unsigned n = shards_count - count;
       ldpp_dout(dpp, 10) << __func__ << "::waiting for " << n << " tokens" << dendl;
     }
-    return (count == shards_count);
+    return err_code;
   }
 
   //---------------------------------------------------------------------------
-  static int collect_shard_stats(librados::IoCtx &ioctx,
+  static int collect_shard_stats(rgw::sal::RadosStore *store,
                                  const DoutPrefixProvider *dpp,
                                  utime_t epoch_time,
                                  unsigned shards_count,
@@ -770,6 +793,12 @@ namespace rgw::dedup {
                                  bufferlist bl_arr[],
                                  shard_progress_t *sp_arr)
   {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
     unsigned count = 0;
     cluster::shard_token_oid sto(prefix);
     for (unsigned shard = 0; shard < shards_count; shard++) {
@@ -779,8 +808,8 @@ namespace rgw::dedup {
 
       uint64_t size;
       struct timespec tspec;
-      if (ioctx.stat2(oid, &size, &tspec) != 0) {
-        ldpp_dout(dpp, 10) << __func__ << "::failed ioctx.stat( " << oid << " )"
+      if (ctl_ioctx.stat2(oid, &size, &tspec) != 0) {
+        ldpp_dout(dpp, 10) << __func__ << "::failed ctl_ioctx.stat( " << oid << " )"
                            << "::shards_count=" << shards_count << dendl;
         continue;
       }
@@ -794,7 +823,7 @@ namespace rgw::dedup {
 
       shard_progress_t sp;
       bufferlist bl;
-      int ret = ioctx.getxattr(oid, SHARD_PROGRESS_ATTR, bl);
+      ret = ctl_ioctx.getxattr(oid, SHARD_PROGRESS_ATTR, bl);
       if (ret > 0) {
         try {
           auto p = bl.cbegin();
@@ -926,16 +955,16 @@ namespace rgw::dedup {
 
   //---------------------------------------------------------------------------
   static void show_dedup_ratio_estimate_fmt(const worker_stats_t &wrk_stats_sum,
-                                            const md5_stats_t    &md5_stats_sum,
+                                            const md5_stats_t &md5_stats_sum,
                                             Formatter *fmt)
   {
     uint64_t s3_bytes_before = wrk_stats_sum.ingress_obj_bytes;
-    uint64_t s3_dedup_bytes  = md5_stats_sum.dedup_bytes_estimate;
+    uint64_t s3_dedup_bytes  = md5_stats_sum.big_objs_stat.dedup_bytes_estimate;
     uint64_t s3_bytes_after  = s3_bytes_before - s3_dedup_bytes;
-
     Formatter::ObjectSection section{*fmt, "dedup_ratio_estimate"};
     fmt->dump_unsigned("s3_bytes_before", s3_bytes_before);
     fmt->dump_unsigned("s3_bytes_after", s3_bytes_after);
+    fmt->dump_unsigned("dup_head_bytes", md5_stats_sum.dup_head_bytes_estimate);
 
     if (s3_bytes_before > s3_bytes_after && s3_bytes_after) {
       double dedup_ratio = (double)s3_bytes_before/s3_bytes_after;
@@ -959,7 +988,7 @@ namespace rgw::dedup {
     Formatter::ObjectSection section{*fmt, "dedup_ratio_actual"};
     fmt->dump_unsigned("s3_bytes_before", s3_bytes_before);
     fmt->dump_unsigned("s3_bytes_after", s3_bytes_after);
-
+    fmt->dump_unsigned("dup_head_bytes", md5_stats_sum.dup_head_bytes);
     if (s3_bytes_before > s3_bytes_after && s3_bytes_after) {
       double dedup_ratio = (double)s3_bytes_before/s3_bytes_after;
       fmt->dump_float("dedup_ratio", dedup_ratio);
@@ -975,14 +1004,8 @@ namespace rgw::dedup {
                                        Formatter *fmt,
                                        const DoutPrefixProvider *dpp)
   {
-    librados::IoCtx ioctx;
-    int ret = init_dedup_pool_ioctx(store->getRados(), dpp, ioctx);
-    if (ret != 0) {
-      return ret;
-    }
-
     dedup_epoch_t epoch;
-    ret = get_epoch(ioctx, dpp, &epoch, nullptr);
+    int ret = get_epoch(store, dpp, &epoch, nullptr);
     if (ret != 0) {
       return ret;
     }
@@ -1000,7 +1023,7 @@ namespace rgw::dedup {
       bool show_time = true;
       bufferlist bl_arr[num_work_shards];
       shard_progress_t sp_arr[num_work_shards];
-      int cnt = collect_shard_stats(ioctx, dpp, epoch.time, num_work_shards,
+      int cnt = collect_shard_stats(store, dpp, epoch.time, num_work_shards,
                                     WORKER_SHARD_PREFIX, bl_arr, sp_arr);
       if (cnt != num_work_shards && 0) {
         std::cerr << ">>>Partial work shard stats recived " << cnt << " / "
@@ -1037,7 +1060,7 @@ namespace rgw::dedup {
       md5_stats_t md5_stats_sum;
       bufferlist bl_arr[num_md5_shards];
       shard_progress_t sp_arr[num_md5_shards];
-      int cnt = collect_shard_stats(ioctx, dpp, epoch.time, num_md5_shards,
+      int cnt = collect_shard_stats(store, dpp, epoch.time, num_md5_shards,
                                     MD5_SHARD_PREFIX, bl_arr, sp_arr);
       if (cnt != num_md5_shards && 0) {
         std::cerr << ">>>Partial MD5_SHARD stats recived " << cnt << " / "
@@ -1076,13 +1099,97 @@ namespace rgw::dedup {
     return 0;
   }
 
+  //---------------------------------------------------------------------------
+  int cluster::watch_reload(rgw::sal::RadosStore *store,
+                            const DoutPrefixProvider* dpp,
+                            uint64_t *p_watch_handle,
+                            librados::WatchCtx2 *ctx)
+  {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
+    const std::string & oid = DEDUP_WATCH_OBJ;
+    // create the object to watch (object may already exist)
+    bool exclusive = true;
+    ret = ctl_ioctx.create(oid, exclusive);
+    if (ret >= 0) {
+      ldpp_dout(dpp, 10) << "dedup_bg::watch_reload():" << oid
+                         << " was created!" << dendl;
+    }
+    else if (ret == -EEXIST) {
+      ldpp_dout(dpp, 5) << __func__ << "::"<< oid << " exists" << dendl;
+    }
+    else {
+      ldpp_dout(dpp, 1) << "dedup_bg::watch_reload(): failed ctl_ioctx.create("
+                        << oid << ") ret=" << ret << "::" << cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+
+    ret = ctl_ioctx.watch2(oid, p_watch_handle, ctx);
+    if (ret < 0) {
+      ldpp_dout(dpp, 1) << "dedup_bg::watch_reload(): failed watch2() " << oid
+                        << ". error: " << cpp_strerror(-ret) << dendl;
+      *p_watch_handle = 0;
+      return ret;
+    }
+    ldpp_dout(dpp, 5) << "dedup_bg::watch_reload(): Started watching "
+                      << oid << "::watch_handle=" << *p_watch_handle << dendl;
+    return 0;
+  }
+
+  //---------------------------------------------------------------------------
+  int cluster::unwatch_reload(rgw::sal::RadosStore *store,
+                              const DoutPrefixProvider* dpp,
+                              uint64_t watch_handle)
+  {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
+    ret = ctl_ioctx.unwatch2(watch_handle);
+    if (ret < 0) {
+      ldpp_dout(dpp, 1) << "dedup_bg::unwatch_reload() failed unwatch2() "
+                        << DEDUP_WATCH_OBJ << "::" << cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+    return 0;
+  }
+
+  //---------------------------------------------------------------------------
+  int cluster::ack_notify(rgw::sal::RadosStore *store,
+                          const DoutPrefixProvider *dpp,
+                          const control_t *p_ctl,
+                          uint64_t notify_id,
+                          uint64_t cookie,
+                          int status)
+  {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
+    ldpp_dout(dpp, 5) << __func__ << "::status=" << status << dendl;
+    bufferlist reply_bl;
+    ceph::encode(status, reply_bl);
+    encode(*p_ctl, reply_bl);
+    ctl_ioctx.notify_ack(DEDUP_WATCH_OBJ, notify_id, cookie, reply_bl);
+
+    return 0;
+  }
+
   //---------------------------------------------------------------------------
   // command-line called from radosgw-admin.cc
   int cluster::dedup_control(rgw::sal::RadosStore *store,
                              const DoutPrefixProvider *dpp,
                              urgent_msg_t urgent_msg)
   {
-    ldpp_dout(dpp, 20) << __func__ << "::dedup_control req = "
+    ldpp_dout(dpp, 10) << __func__ << "::dedup_control req = "
                        << get_urgent_msg_names(urgent_msg) << dendl;
     if (urgent_msg != URGENT_MSG_RESUME  &&
         urgent_msg != URGENT_MSG_PASUE   &&
@@ -1092,16 +1199,17 @@ namespace rgw::dedup {
       return -EINVAL;
     }
 
-    librados::IoCtx ioctx;
-    int ret = init_dedup_pool_ioctx(store->getRados(), dpp, ioctx);
-    if (ret != 0) {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
       return ret;
     }
+
     // 10 seconds timeout
     const uint64_t timeout_ms = 10*1000;
     bufferlist reply_bl, urgent_msg_bl;
     ceph::encode(urgent_msg, urgent_msg_bl);
-    ret = rgw_rados_notify(dpp, ioctx, DEDUP_WATCH_OBJ, urgent_msg_bl,
+    ret = rgw_rados_notify(dpp, ctl_ioctx, DEDUP_WATCH_OBJ, urgent_msg_bl,
                            timeout_ms, &reply_bl, null_yield);
     if (ret < 0) {
       ldpp_dout(dpp, 1) << __func__ << "::failed rgw_rados_notify("
@@ -1110,7 +1218,7 @@ namespace rgw::dedup {
     }
     std::vector<librados::notify_ack_t> acks;
     std::vector<librados::notify_timeout_t> timeouts;
-    ioctx.decode_notify_response(reply_bl, &acks, &timeouts);
+    ctl_ioctx.decode_notify_response(reply_bl, &acks, &timeouts);
     if (timeouts.size() > 0) {
       ldpp_dout(dpp, 1) << __func__ << "::failed rgw_rados_notify("
                         << DEDUP_WATCH_OBJ << ")::timeout error" << dendl;
@@ -1147,17 +1255,21 @@ namespace rgw::dedup {
                                   dedup_req_type_t dedup_type,
                                   const DoutPrefixProvider *dpp)
   {
-    librados::IoCtx ioctx;
-    int ret = init_dedup_pool_ioctx(store->getRados(), dpp, ioctx);
-    if (ret != 0) {
-      return ret;
-    }
+    ldpp_dout(dpp, 1) << __func__ << "::dedup_type = " << dedup_type << dendl;
 
     dedup_epoch_t old_epoch;
     // store the previous epoch for cmp-swap
-    ret = get_epoch(ioctx, dpp, &old_epoch, __func__);
+    int ret = get_epoch(store, dpp, &old_epoch, __func__);
     if (ret != 0) {
-      return ret;
+      // generate an empty epoch with zero counters
+      std::string cluster_id("NULL_CLUSTER_ID");
+      ldpp_dout(dpp, 1) << __func__ << "::set empty EPOCH using cluster_id: "
+                        << cluster_id << dendl;
+      set_epoch(store, cluster_id, dpp, 0, 0);
+      ret = get_epoch(store, dpp, &old_epoch, __func__);
+      if (ret) {
+        return ret;
+      }
     }
 
     // first abort all dedup work!
@@ -1165,6 +1277,13 @@ namespace rgw::dedup {
     if (ret != 0) {
       return ret;
     }
+#if 0
+    // then delete dedup-pool to ensure a clean start
+    const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool;
+    auto rados_handle = store->getRados()->get_rados_handle();
+    ldpp_dout(dpp, 5) <<__func__ << "::delete pool: " << dedup_pool.name << dendl;
+    rados_handle->pool_delete(dedup_pool.name.c_str());
+#endif
 
     ldpp_dout(dpp, 10) << __func__ << dedup_type << dendl;
 #ifdef FULL_DEDUP_SUPPORT
@@ -1173,7 +1292,7 @@ namespace rgw::dedup {
 #else
     ceph_assert(dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE);
 #endif
-    ret = swap_epoch(dpp, ioctx, &old_epoch, dedup_type, 0, 0);
+    ret = swap_epoch(store, dpp, &old_epoch, dedup_type, 0, 0);
     if (ret == 0) {
       ldpp_dout(dpp, 10) << __func__ << "::Epoch object was reset" << dendl;
       return dedup_control(store, dpp, URGENT_MSG_RESTART);
@@ -1187,14 +1306,8 @@ namespace rgw::dedup {
   bool cluster::can_start_new_scan(rgw::sal::RadosStore *store)
   {
     ldpp_dout(dpp, 10) << __func__ << "::epoch=" << d_epoch_time << dendl;
-    librados::IoCtx ioctx;
-    int ret = init_dedup_pool_ioctx(store->getRados(), dpp, ioctx);
-    if (ret != 0) {
-      return ret;
-    }
-
     dedup_epoch_t new_epoch;
-    if (get_epoch(ioctx, dpp, &new_epoch, nullptr) != 0) {
+    if (get_epoch(store, dpp, &new_epoch, nullptr) != 0) {
       ldpp_dout(dpp, 1) << __func__ << "::No Epoch Object::"
                         << "::scan can be restarted!\n\n\n" << dendl;
       // no epoch object exists -> we should start a new scan
index 52fa6c376501d29544f1d4a5d10951586c3d22b6..64b2c54a4fa28a1e2f69f2c62d8ca212ec892a79 100644 (file)
 #include <string>
 
 namespace rgw::dedup {
-  static constexpr const char* DEDUP_POOL_NAME     = "rgw_dedup_pool";
-  static constexpr const char* MD5_SHARD_PREFIX    = "MD5.SHRD.TK.";
   static constexpr const char* WORKER_SHARD_PREFIX = "WRK.SHRD.TK.";
-
+  static constexpr const char* MD5_SHARD_PREFIX    = "MD5.SHRD.TK.";
+  struct control_t;
   struct dedup_epoch_t;
-  int   init_dedup_pool_ioctx(RGWRados                 *rados,
-                              const DoutPrefixProvider *dpp,
-                              librados::IoCtx          &ioctx);
 
   class cluster{
   public:
@@ -51,6 +47,11 @@ namespace rgw::dedup {
         this->total_len = this->prefix_len + n;
       }
 
+      //---------------------------------------------------------------------------
+      static bool legal_oid_name(const std::string& oid) {
+        return ((oid.length() <= BUFF_SIZE) &&
+                (oid.starts_with(WORKER_SHARD_PREFIX)||oid.starts_with(MD5_SHARD_PREFIX)));
+      }
       inline const char* get_buff() { return this->buff; }
       inline unsigned get_buff_size() { return this->total_len; }
     private:
@@ -65,20 +66,32 @@ namespace rgw::dedup {
             CephContext* cct,
             rgw::sal::Driver* driver);
     int          reset(rgw::sal::RadosStore *store,
-                       librados::IoCtx &ioctx,
                        struct dedup_epoch_t*,
                        work_shard_t num_work_shards,
                        md5_shard_t num_md5_shards);
 
     utime_t      get_epoch_time() { return d_epoch_time; }
-    work_shard_t get_next_work_shard_token(librados::IoCtx &ioctx,
+    work_shard_t get_next_work_shard_token(rgw::sal::RadosStore *store,
                                            work_shard_t num_work_shards);
-    md5_shard_t  get_next_md5_shard_token(librados::IoCtx &ioctx,
+    md5_shard_t  get_next_md5_shard_token(rgw::sal::RadosStore *store,
                                           md5_shard_t num_md5_shards);
     bool         can_start_new_scan(rgw::sal::RadosStore *store);
     static int   collect_all_shard_stats(rgw::sal::RadosStore *store,
                                          Formatter *p_formatter,
                                          const DoutPrefixProvider *dpp);
+    static int   watch_reload(rgw::sal::RadosStore *store,
+                              const DoutPrefixProvider* dpp,
+                              uint64_t *p_watch_handle,
+                              librados::WatchCtx2 *ctx);
+    static int   unwatch_reload(rgw::sal::RadosStore *store,
+                                const DoutPrefixProvider* dpp,
+                                uint64_t watch_handle);
+    static int   ack_notify(rgw::sal::RadosStore *store,
+                            const DoutPrefixProvider *dpp,
+                            const struct control_t *p_ctl,
+                            uint64_t notify_id,
+                            uint64_t cookie,
+                            int status);
     static int   dedup_control(rgw::sal::RadosStore *store,
                                const DoutPrefixProvider *dpp,
                                urgent_msg_t urgent_msg);
@@ -87,7 +100,7 @@ namespace rgw::dedup {
                                     const DoutPrefixProvider *dpp);
 
     //---------------------------------------------------------------------------
-    int mark_work_shard_token_completed(librados::IoCtx &ioctx,
+    int mark_work_shard_token_completed(rgw::sal::RadosStore *store,
                                         work_shard_t work_shard,
                                         const worker_stats_t *p_stats)
     {
@@ -95,14 +108,13 @@ namespace rgw::dedup {
       encode(*p_stats, bl);
       d_num_completed_workers++;
       d_completed_workers[work_shard] = TOKEN_STATE_COMPLETED;
-      d_total_ingressed_obj += p_stats->ingress_obj;
 
-      return mark_shard_token_completed(ioctx, work_shard, p_stats->ingress_obj,
+      return mark_shard_token_completed(store, work_shard, p_stats->ingress_obj,
                                         WORKER_SHARD_PREFIX, bl);
     }
 
     //---------------------------------------------------------------------------
-    int mark_md5_shard_token_completed(librados::IoCtx &ioctx,
+    int mark_md5_shard_token_completed(rgw::sal::RadosStore *store,
                                        md5_shard_t md5_shard,
                                        const md5_stats_t *p_stats)
     {
@@ -110,53 +122,56 @@ namespace rgw::dedup {
       encode(*p_stats, bl);
       d_num_completed_md5++;
       d_completed_md5[md5_shard] = TOKEN_STATE_COMPLETED;
-      return mark_shard_token_completed(ioctx, md5_shard, p_stats->loaded_objects,
+      return mark_shard_token_completed(store, md5_shard, p_stats->loaded_objects,
                                         MD5_SHARD_PREFIX, bl);
     }
 
-    int update_shard_token_heartbeat(librados::IoCtx &ioctx,
+    int update_shard_token_heartbeat(rgw::sal::RadosStore *store,
                                      unsigned shard,
                                      uint64_t count_a,
                                      uint64_t count_b,
                                      const char *prefix);
 
     //---------------------------------------------------------------------------
-    bool all_work_shard_tokens_completed(librados::IoCtx &ioctx,
-                                         work_shard_t num_work_shards,
-                                         uint64_t *p_total_ingressed)
+    int all_work_shard_tokens_completed(rgw::sal::RadosStore *store,
+                                        work_shard_t num_work_shards)
+    {
+      return all_shard_tokens_completed(store, num_work_shards, WORKER_SHARD_PREFIX,
+                                        &d_num_completed_workers, d_completed_workers);
+    }
+
+    //---------------------------------------------------------------------------
+    int all_md5_shard_tokens_completed(rgw::sal::RadosStore *store,
+                                       md5_shard_t num_md5_shards)
     {
-      return all_shard_tokens_completed(ioctx,
-                                        num_work_shards,
-                                        WORKER_SHARD_PREFIX,
-                                        &d_num_completed_workers,
-                                        d_completed_workers,
-                                        p_total_ingressed);
+      return all_shard_tokens_completed(store, num_md5_shards, MD5_SHARD_PREFIX,
+                                        &d_num_completed_md5, d_completed_md5);
     }
 
   private:
     static constexpr unsigned TOKEN_STATE_PENDING   = 0x00;
+    static constexpr unsigned TOKEN_STATE_CORRUPTED = 0xCC;
     static constexpr unsigned TOKEN_STATE_TIMED_OUT = 0xDD;
     static constexpr unsigned TOKEN_STATE_COMPLETED = 0xFF;
 
     void clear();
-    bool all_shard_tokens_completed(librados::IoCtx &ioctx,
+    int  all_shard_tokens_completed(rgw::sal::RadosStore *store,
                                     unsigned shards_count,
                                     const char *prefix,
                                     uint16_t *p_num_completed,
-                                    uint8_t completed_arr[],
-                                    uint64_t *p_total_ingressed);
-    int cleanup_prev_run(librados::IoCtx &ioctx);
-    int32_t get_next_shard_token(librados::IoCtx &ioctx,
+                                    uint8_t completed_arr[]);
+    int cleanup_prev_run(rgw::sal::RadosStore *store);
+    int32_t get_next_shard_token(rgw::sal::RadosStore *store,
                                  uint16_t start_shard,
                                  uint16_t max_count,
                                  const char *prefix);
-    int create_shard_tokens(librados::IoCtx &ioctx,
+    int create_shard_tokens(rgw::sal::RadosStore *store,
                             unsigned shards_count,
                             const char *prefix);
-    int verify_all_shard_tokens(librados::IoCtx &ioctx,
+    int verify_all_shard_tokens(rgw::sal::RadosStore *store,
                                 unsigned shards_count,
                                 const char *prefix);
-    int mark_shard_token_completed(librados::IoCtx &ioctx,
+    int mark_shard_token_completed(rgw::sal::RadosStore *store,
                                    unsigned shard,
                                    uint64_t obj_count,
                                    const char *prefix,
@@ -169,12 +184,10 @@ namespace rgw::dedup {
     work_shard_t              d_curr_worker_shard = 0;
     utime_t                   d_epoch_time;
     utime_t                   d_token_creation_time;
-    uint64_t                  d_total_ingressed_obj = 0;
     uint8_t                   d_completed_workers[MAX_WORK_SHARD];
     uint8_t                   d_completed_md5[MAX_MD5_SHARD];
     uint16_t                  d_num_completed_workers = 0;
     uint16_t                  d_num_completed_md5 = 0;
-    uint16_t                  d_num_failed_workers = 0;
   };
 
 } //namespace rgw::dedup
index f2829bfea894363b67e702fdb600cb00fc27950d..18898bbba95298a2739f4930f2de9f589916a9a9 100644 (file)
@@ -32,8 +32,6 @@
 
 namespace rgw::dedup {
 
-  rgw_pool pool(DEDUP_POOL_NAME);
-
   //---------------------------------------------------------------------------
   disk_record_t::disk_record_t(const rgw::sal::Bucket *p_bucket,
                                const std::string      &obj_name,
index e2798ad5823d0273bf6316e9bbb1b93d46fdbd2c..09335655df626aa225edbfd3e38258adea072389 100644 (file)
@@ -49,6 +49,14 @@ namespace rgw::dedup {
       }
 
       const key_t &key = hash_tab[tab_idx].key;
+      // This is an approximation only since size is stored in 4KB resolution
+      uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units);
+      if (!key.multipart_object() && (byte_size_approx <= head_object_size)) {
+        hash_tab[tab_idx].val.clear_flags();
+        redistributed_clear++;
+        continue;
+      }
+
       uint32_t key_idx = key.hash() % entries_count;
       if (key_idx != tab_idx) {
         uint64_t count = 1;
@@ -195,31 +203,50 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  void dedup_table_t::count_duplicates(uint64_t *p_singleton_count,
-                                       uint64_t *p_unique_count,
-                                       uint64_t *p_duplicate_count,
-                                       uint64_t *p_duplicate_bytes_approx)
+  void dedup_table_t::count_duplicates(dedup_stats_t *p_small_objs,
+                                       dedup_stats_t *p_big_objs,
+                                       uint64_t *p_duplicate_head_bytes)
   {
     for (uint32_t tab_idx = 0; tab_idx < entries_count; tab_idx++) {
       if (!hash_tab[tab_idx].val.is_occupied()) {
         continue;
       }
 
+      const key_t &key = hash_tab[tab_idx].key;
+      // This is an approximation only since size is stored in 4KB resolution
+      uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units);
+      uint32_t duplicate_count = (hash_tab[tab_idx].val.count -1);
+
+      // skip small single part objects which we can't dedup
+      if (!key.multipart_object() && (byte_size_approx <= head_object_size)) {
+        if (hash_tab[tab_idx].val.is_singleton()) {
+          p_small_objs->singleton_count++;
+        }
+        else {
+          p_small_objs->duplicate_count += duplicate_count;
+          p_small_objs->unique_count ++;
+          p_small_objs->dedup_bytes_estimate += (duplicate_count * byte_size_approx);
+        }
+        continue;
+      }
+
       if (hash_tab[tab_idx].val.is_singleton()) {
-        (*p_singleton_count)++;
+        p_big_objs->singleton_count++;
       }
       else {
         ceph_assert(hash_tab[tab_idx].val.count > 1);
-        uint32_t duplicate_count = (hash_tab[tab_idx].val.count -1);
-        key_t &key = hash_tab[tab_idx].key;
-        // This is an approximation only since size is stored in 4KB resolution
-        uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units);
         uint64_t dup_bytes_approx = calc_deduped_bytes(head_object_size,
                                                        key.num_parts,
                                                        byte_size_approx);
-        (*p_duplicate_bytes_approx) += (duplicate_count * dup_bytes_approx);
-        (*p_duplicate_count) += duplicate_count;
-        (*p_unique_count) ++;
+        p_big_objs->dedup_bytes_estimate += (duplicate_count * dup_bytes_approx);
+        p_big_objs->duplicate_count += duplicate_count;
+        p_big_objs->unique_count ++;
+
+        if (!key.multipart_object()) {
+          // single part objects duplicate the head object when dedup is used
+          uint64_t dup_head_bytes = duplicate_count * head_object_size;
+          *p_duplicate_head_bytes += dup_head_bytes;
+        }
       }
     }
   }
index 669f360ffc810ca431c8d0e478ab19bd65590b49..51d36006944f42826b2019b7ed48d4399fe6e8eb 100644 (file)
@@ -49,6 +49,10 @@ namespace rgw::dedup {
       return this->md5_low;
     }
 
+    bool multipart_object() const {
+      return num_parts > 0;
+    }
+
     uint64_t md5_high;      // High Bytes of the Object Data MD5
     uint64_t md5_low;       // Low  Bytes of the Object Data MD5
     uint32_t size_4k_units; // Object size in 4KB units max out at 16TB (AWS MAX-SIZE is 5TB)
@@ -110,10 +114,10 @@ namespace rgw::dedup {
                                      disk_block_id_t block_id,
                                      record_id_t rec_id);
 
-    void count_duplicates(uint64_t *p_singleton_count,
-                          uint64_t *p_unique_count,
-                          uint64_t *p_duplicate_count,
-                          uint64_t *p_duplicate_bytes_approx);
+    void count_duplicates(dedup_stats_t *p_small_objs_stat,
+                          dedup_stats_t *p_big_objs_stat,
+                          uint64_t *p_duplicate_head_bytes);
+
     void remove_singletons_and_redistribute_keys();
   private:
     // 32 Bytes unified entries
index ef17ec0d38e7c01810d2dd7f61352eac92a9fe3f..c380bff842c601e959572cc6965e0c838603aea9 100644 (file)
@@ -35,6 +35,48 @@ namespace rgw::dedup {
     return out;
   }
 
+  //---------------------------------------------------------------------------
+  dedup_stats_t& dedup_stats_t::operator+=(const dedup_stats_t& other)
+  {
+    this->singleton_count += other.singleton_count;
+    this->unique_count += other.unique_count;
+    this->duplicate_count += other.duplicate_count;
+    this->dedup_bytes_estimate += other.dedup_bytes_estimate;
+    return *this;
+  }
+
+  //---------------------------------------------------------------------------
+  std::ostream& operator<<(std::ostream &out, const dedup_stats_t& stats)
+  {
+    out << "::singleton_count="  << stats.singleton_count
+        << "::unique_count="     << stats.unique_count
+        << "::duplicate_count="  << stats.duplicate_count
+        << "::duplicated_bytes=" << stats.dedup_bytes_estimate;
+    return out;
+  }
+
+  //---------------------------------------------------------------------------
+  void encode(const dedup_stats_t& ds, ceph::bufferlist& bl)
+  {
+    ENCODE_START(1, 1, bl);
+    encode(ds.singleton_count, bl);
+    encode(ds.unique_count, bl);
+    encode(ds.duplicate_count, bl);
+    encode(ds.dedup_bytes_estimate, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  //---------------------------------------------------------------------------
+  void decode(dedup_stats_t& ds, ceph::bufferlist::const_iterator& bl)
+  {
+    DECODE_START(1, bl);
+    decode(ds.singleton_count, bl);
+    decode(ds.unique_count, bl);
+    decode(ds.duplicate_count, bl);
+    decode(ds.dedup_bytes_estimate, bl);
+    DECODE_FINISH(bl);
+  }
+
   // convert a hex-string to a 64bit integer (max 16 hex digits)
   //---------------------------------------------------------------------------
   bool hex2int(const char *p, const char *p_end, uint64_t *p_val)
@@ -206,7 +248,8 @@ namespace rgw::dedup {
   };
 
   //---------------------------------------------------------------------------
-  const char* get_urgent_msg_names(int msg) {
+  const char* get_urgent_msg_names(int msg)
+  {
     if (msg <= URGENT_MSG_INVALID && msg >= URGENT_MSG_NONE) {
       return s_urgent_msg_names[msg];
     }
@@ -216,22 +259,34 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  std::ostream& operator<<(std::ostream &out, const worker_stats_t &s)
+  worker_stats_t& worker_stats_t::operator+=(const worker_stats_t& other)
   {
-    JSONFormatter formatter(false);
-    s.dump(&formatter);
-    std::stringstream sstream;
-    formatter.flush(sstream);
-    out << sstream.str();
-    return out;
+    this->ingress_obj += other.ingress_obj;
+    this->ingress_obj_bytes += other.ingress_obj_bytes;
+    this->egress_records += other.egress_records;
+    this->egress_blocks += other.egress_blocks;
+    this->egress_slabs += other.egress_slabs;
+    this->single_part_objs += other.single_part_objs;
+    this->multipart_objs += other.multipart_objs;
+    this->small_multipart_obj += other.small_multipart_obj;
+    this->default_storage_class_objs += other.default_storage_class_objs;
+    this->default_storage_class_objs_bytes += other.default_storage_class_objs_bytes;
+    this->non_default_storage_class_objs += other.non_default_storage_class_objs;
+    this->non_default_storage_class_objs_bytes += other.non_default_storage_class_objs_bytes;
+    this->ingress_corrupted_etag += other.ingress_corrupted_etag;
+    this->ingress_skip_too_small_bytes += other.ingress_skip_too_small_bytes;
+    this->ingress_skip_too_small += other.ingress_skip_too_small;
+    this->ingress_skip_too_small_64KB_bytes += other.ingress_skip_too_small_64KB_bytes;
+    this->ingress_skip_too_small_64KB += other.ingress_skip_too_small_64KB;
+
+    return *this;
   }
-
   //---------------------------------------------------------------------------
   void worker_stats_t::dump(Formatter *f) const
   {
     // main section
     {
-      Formatter::ObjectSection notify(*f, "main");
+      Formatter::ObjectSection main(*f, "main");
 
       f->dump_unsigned("Ingress Objs count", this->ingress_obj);
       f->dump_unsigned("Accum byte size Ingress Objs", this->ingress_obj_bytes);
@@ -285,6 +340,122 @@ namespace rgw::dedup {
     }
   }
 
+  //---------------------------------------------------------------------------
+  std::ostream& operator<<(std::ostream &out, const worker_stats_t &s)
+  {
+    JSONFormatter formatter(false);
+    s.dump(&formatter);
+    std::stringstream sstream;
+    formatter.flush(sstream);
+    out << sstream.str();
+    return out;
+  }
+
+  //---------------------------------------------------------------------------
+  void encode(const worker_stats_t& w, ceph::bufferlist& bl)
+  {
+    ENCODE_START(1, 1, bl);
+    encode(w.ingress_obj, bl);
+    encode(w.ingress_obj_bytes, bl);
+    encode(w.egress_records, bl);
+    encode(w.egress_blocks, bl);
+    encode(w.egress_slabs, bl);
+
+    encode(w.single_part_objs, bl);
+    encode(w.multipart_objs, bl);
+    encode(w.small_multipart_obj, bl);
+
+    encode(w.default_storage_class_objs, bl);
+    encode(w.default_storage_class_objs_bytes, bl);
+    encode(w.non_default_storage_class_objs, bl);
+    encode(w.non_default_storage_class_objs_bytes, bl);
+
+    encode(w.ingress_corrupted_etag, bl);
+
+    encode(w.ingress_skip_too_small_bytes, bl);
+    encode(w.ingress_skip_too_small, bl);
+
+    encode(w.ingress_skip_too_small_64KB_bytes, bl);
+    encode(w.ingress_skip_too_small_64KB, bl);
+
+    encode(w.duration, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  //---------------------------------------------------------------------------
+  void decode(worker_stats_t& w, ceph::bufferlist::const_iterator& bl)
+  {
+    DECODE_START(1, bl);
+    decode(w.ingress_obj, bl);
+    decode(w.ingress_obj_bytes, bl);
+    decode(w.egress_records, bl);
+    decode(w.egress_blocks, bl);
+    decode(w.egress_slabs, bl);
+    decode(w.single_part_objs, bl);
+    decode(w.multipart_objs, bl);
+    decode(w.small_multipart_obj, bl);
+    decode(w.default_storage_class_objs, bl);
+    decode(w.default_storage_class_objs_bytes, bl);
+    decode(w.non_default_storage_class_objs, bl);
+    decode(w.non_default_storage_class_objs_bytes, bl);
+    decode(w.ingress_corrupted_etag, bl);
+    decode(w.ingress_skip_too_small_bytes, bl);
+    decode(w.ingress_skip_too_small, bl);
+    decode(w.ingress_skip_too_small_64KB_bytes, bl);
+    decode(w.ingress_skip_too_small_64KB, bl);
+
+    decode(w.duration, bl);
+    DECODE_FINISH(bl);
+  }
+
+  //---------------------------------------------------------------------------
+  md5_stats_t& md5_stats_t::operator+=(const md5_stats_t& other)
+  {
+    this->small_objs_stat               += other.small_objs_stat;
+    this->big_objs_stat                 += other.big_objs_stat;
+    this->ingress_failed_load_bucket    += other.ingress_failed_load_bucket;
+    this->ingress_failed_get_object     += other.ingress_failed_get_object;
+    this->ingress_failed_get_obj_attrs  += other.ingress_failed_get_obj_attrs;
+    this->ingress_corrupted_etag        += other.ingress_corrupted_etag;
+    this->ingress_corrupted_obj_attrs   += other.ingress_corrupted_obj_attrs;
+    this->ingress_skip_encrypted        += other.ingress_skip_encrypted;
+    this->ingress_skip_encrypted_bytes  += other.ingress_skip_encrypted_bytes;
+    this->ingress_skip_compressed       += other.ingress_skip_compressed;
+    this->ingress_skip_compressed_bytes += other.ingress_skip_compressed_bytes;
+    this->ingress_skip_changed_objs     += other.ingress_skip_changed_objs;
+    this->shared_manifest_dedup_bytes   += other.shared_manifest_dedup_bytes;
+
+    this->skipped_shared_manifest += other.skipped_shared_manifest;
+    this->skipped_purged_small    += other.skipped_purged_small;
+    this->skipped_singleton       += other.skipped_singleton;
+    this->skipped_singleton_bytes += other.skipped_singleton_bytes;
+    this->skipped_source_record   += other.skipped_source_record;
+    this->duplicate_records       += other.duplicate_records;
+    this->size_mismatch           += other.size_mismatch;
+    this->sha256_mismatch         += other.sha256_mismatch;
+    this->failed_src_load         += other.failed_src_load;
+    this->failed_rec_load         += other.failed_rec_load;
+    this->failed_block_load       += other.failed_block_load;
+
+    this->valid_sha256_attrs      += other.valid_sha256_attrs;
+    this->invalid_sha256_attrs    += other.invalid_sha256_attrs;
+    this->set_sha256_attrs        += other.set_sha256_attrs;
+    this->skip_sha256_cmp         += other.skip_sha256_cmp;
+
+    this->set_shared_manifest_src += other.set_shared_manifest_src;
+    this->loaded_objects          += other.loaded_objects;
+    this->processed_objects       += other.processed_objects;
+    this->dup_head_bytes_estimate += other.dup_head_bytes_estimate;
+    this->deduped_objects         += other.deduped_objects;
+    this->deduped_objects_bytes   += other.deduped_objects_bytes;
+    this->dup_head_bytes          += other.dup_head_bytes;
+
+    this->failed_dedup            += other.failed_dedup;
+    this->failed_table_load       += other.failed_table_load;
+    this->failed_map_overflow     += other.failed_map_overflow;
+    return *this;
+  }
+
   //---------------------------------------------------------------------------
   std::ostream& operator<<(std::ostream &out, const md5_stats_t &s)
   {
@@ -301,19 +472,37 @@ namespace rgw::dedup {
   {
     // main section
     {
-      Formatter::ObjectSection notify(*f, "main");
+      Formatter::ObjectSection main(*f, "main");
 
       f->dump_unsigned("Total processed objects", this->processed_objects);
       f->dump_unsigned("Loaded objects", this->loaded_objects);
       f->dump_unsigned("Set Shared-Manifest SRC", this->set_shared_manifest_src);
       f->dump_unsigned("Deduped Obj (this cycle)", this->deduped_objects);
       f->dump_unsigned("Deduped Bytes(this cycle)", this->deduped_objects_bytes);
+      f->dump_unsigned("Dup head bytes (not dedup)", this->dup_head_bytes);
       f->dump_unsigned("Already Deduped bytes (prev cycles)",
                        this->shared_manifest_dedup_bytes);
-      f->dump_unsigned("Singleton Obj", this->singleton_count);
-      f->dump_unsigned("Unique Obj", this->unique_count);
-      f->dump_unsigned("Duplicate Obj", this->duplicate_count);
-      f->dump_unsigned("Dedup Bytes Estimate", this->dedup_bytes_estimate);
+
+      const dedup_stats_t &ds = this->big_objs_stat;
+      f->dump_unsigned("Singleton Obj", ds.singleton_count);
+      f->dump_unsigned("Unique Obj", ds.unique_count);
+      f->dump_unsigned("Duplicate Obj", ds.duplicate_count);
+      f->dump_unsigned("Dedup Bytes Estimate", ds.dedup_bytes_estimate);
+    }
+
+    // Potential Dedup Section:
+    // What could be gained by allowing dedup of smaller objects (64KB-4MB),
+    // plus the space wasted by each duplicated head-object (4MB)
+    {
+      Formatter::ObjectSection potential(*f, "Potential Dedup");
+      const dedup_stats_t &ds = this->small_objs_stat;
+      f->dump_unsigned("Singleton Obj (64KB-4MB)", ds.singleton_count);
+      f->dump_unsigned("Unique Obj (64KB-4MB)", ds.unique_count);
+      f->dump_unsigned("Duplicate Obj (64KB-4MB)", ds.duplicate_count);
+      f->dump_unsigned("Dedup Bytes Estimate (64KB-4MB)", ds.dedup_bytes_estimate);
+      f->dump_unsigned("Duplicated Head Bytes Estimate",
+                       this->dup_head_bytes_estimate);
+      f->dump_unsigned("Duplicated Head Bytes", this->dup_head_bytes);
     }
 
     {
@@ -340,6 +529,7 @@ namespace rgw::dedup {
     {
       Formatter::ObjectSection skipped(*f, "skipped");
       f->dump_unsigned("Skipped shared_manifest", this->skipped_shared_manifest);
+      f->dump_unsigned("Skipped purged small objs", this->skipped_purged_small);
       f->dump_unsigned("Skipped singleton objs", this->skipped_singleton);
       if (this->skipped_singleton) {
         f->dump_unsigned("Skipped singleton Bytes", this->skipped_singleton_bytes);
@@ -403,4 +593,105 @@ namespace rgw::dedup {
       }
     }
   }
+
+  //---------------------------------------------------------------------------
+  void encode(const md5_stats_t& m, ceph::bufferlist& bl)
+  {
+    ENCODE_START(1, 1, bl);
+
+    encode(m.small_objs_stat, bl);
+    encode(m.big_objs_stat, bl);
+    encode(m.ingress_failed_load_bucket, bl);
+    encode(m.ingress_failed_get_object, bl);
+    encode(m.ingress_failed_get_obj_attrs, bl);
+    encode(m.ingress_corrupted_etag, bl);
+    encode(m.ingress_corrupted_obj_attrs, bl);
+    encode(m.ingress_skip_encrypted, bl);
+    encode(m.ingress_skip_encrypted_bytes, bl);
+    encode(m.ingress_skip_compressed, bl);
+    encode(m.ingress_skip_compressed_bytes, bl);
+    encode(m.ingress_skip_changed_objs, bl);
+    encode(m.shared_manifest_dedup_bytes, bl);
+
+    encode(m.skipped_shared_manifest, bl);
+    encode(m.skipped_purged_small, bl);
+    encode(m.skipped_singleton, bl);
+    encode(m.skipped_singleton_bytes, bl);
+    encode(m.skipped_source_record, bl);
+    encode(m.duplicate_records, bl);
+    encode(m.size_mismatch, bl);
+    encode(m.sha256_mismatch, bl);
+    encode(m.failed_src_load, bl);
+    encode(m.failed_rec_load, bl);
+    encode(m.failed_block_load, bl);
+
+    encode(m.valid_sha256_attrs, bl);
+    encode(m.invalid_sha256_attrs, bl);
+    encode(m.set_sha256_attrs, bl);
+    encode(m.skip_sha256_cmp, bl);
+    encode(m.set_shared_manifest_src, bl);
+
+    encode(m.loaded_objects, bl);
+    encode(m.processed_objects, bl);
+    encode(m.dup_head_bytes_estimate, bl);
+    encode(m.deduped_objects, bl);
+    encode(m.deduped_objects_bytes, bl);
+    encode(m.dup_head_bytes, bl);
+    encode(m.failed_dedup, bl);
+    encode(m.failed_table_load, bl);
+    encode(m.failed_map_overflow, bl);
+
+    encode(m.duration, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  //---------------------------------------------------------------------------
+  void decode(md5_stats_t& m, ceph::bufferlist::const_iterator& bl)
+  {
+    DECODE_START(1, bl);
+    decode(m.small_objs_stat, bl);
+    decode(m.big_objs_stat, bl);
+    decode(m.ingress_failed_load_bucket, bl);
+    decode(m.ingress_failed_get_object, bl);
+    decode(m.ingress_failed_get_obj_attrs, bl);
+    decode(m.ingress_corrupted_etag, bl);
+    decode(m.ingress_corrupted_obj_attrs, bl);
+    decode(m.ingress_skip_encrypted, bl);
+    decode(m.ingress_skip_encrypted_bytes, bl);
+    decode(m.ingress_skip_compressed, bl);
+    decode(m.ingress_skip_compressed_bytes, bl);
+    decode(m.ingress_skip_changed_objs, bl);
+    decode(m.shared_manifest_dedup_bytes, bl);
+
+    decode(m.skipped_shared_manifest, bl);
+    decode(m.skipped_purged_small, bl);
+    decode(m.skipped_singleton, bl);
+    decode(m.skipped_singleton_bytes, bl);
+    decode(m.skipped_source_record, bl);
+    decode(m.duplicate_records, bl);
+    decode(m.size_mismatch, bl);
+    decode(m.sha256_mismatch, bl);
+    decode(m.failed_src_load, bl);
+    decode(m.failed_rec_load, bl);
+    decode(m.failed_block_load, bl);
+
+    decode(m.valid_sha256_attrs, bl);
+    decode(m.invalid_sha256_attrs, bl);
+    decode(m.set_sha256_attrs, bl);
+    decode(m.skip_sha256_cmp, bl);
+    decode(m.set_shared_manifest_src, bl);
+
+    decode(m.loaded_objects, bl);
+    decode(m.processed_objects, bl);
+    decode(m.dup_head_bytes_estimate, bl);
+    decode(m.deduped_objects, bl);
+    decode(m.deduped_objects_bytes, bl);
+    decode(m.dup_head_bytes, bl);
+    decode(m.failed_dedup, bl);
+    decode(m.failed_table_load, bl);
+    decode(m.failed_map_overflow, bl);
+
+    decode(m.duration, bl);
+    DECODE_FINISH(bl);
+  }
 } //namespace rgw::dedup
index 9c862272ca793516926065df921a1784018e3b5f..6a1d0fc0f45833b4ddf43788188c4f0c37c2e0bd 100644 (file)
@@ -25,7 +25,6 @@
 
 //#define FULL_DEDUP_SUPPORT
 namespace rgw::dedup {
-  static constexpr const char* DEDUP_WATCH_OBJ = "DEDUP_WATCH_OBJ";
   using work_shard_t   = uint16_t;
   using md5_shard_t    = uint16_t;
 
@@ -86,29 +85,21 @@ namespace rgw::dedup {
     uint8_t flags;
   };
 
-  struct worker_stats_t {
-    worker_stats_t& operator +=(const worker_stats_t& other) {
-      this->ingress_obj += other.ingress_obj;
-      this->ingress_obj_bytes += other.ingress_obj_bytes;
-      this->egress_records += other.egress_records;
-      this->egress_blocks += other.egress_blocks;
-      this->egress_slabs += other.egress_slabs;
-      this->single_part_objs += other.single_part_objs;
-      this->multipart_objs += other.multipart_objs;
-      this->small_multipart_obj += other.small_multipart_obj;
-      this->default_storage_class_objs += other.default_storage_class_objs;
-      this->default_storage_class_objs_bytes += other.default_storage_class_objs_bytes;
-      this->non_default_storage_class_objs += other.non_default_storage_class_objs;
-      this->non_default_storage_class_objs_bytes += other.non_default_storage_class_objs_bytes;
-      this->ingress_corrupted_etag += other.ingress_corrupted_etag;
-      this->ingress_skip_too_small_bytes += other.ingress_skip_too_small_bytes;
-      this->ingress_skip_too_small += other.ingress_skip_too_small;
-      this->ingress_skip_too_small_64KB_bytes += other.ingress_skip_too_small_64KB_bytes;
-      this->ingress_skip_too_small_64KB += other.ingress_skip_too_small_64KB;
-
-      return *this;
-    }
+  struct dedup_stats_t {
+    dedup_stats_t& operator+=(const dedup_stats_t& other);
+
+    uint64_t singleton_count = 0;
+    uint64_t unique_count = 0;
+    uint64_t duplicate_count = 0;
+    uint64_t dedup_bytes_estimate = 0;
+  };
 
+  std::ostream& operator<<(std::ostream &out, const dedup_stats_t& stats);
+  void encode(const dedup_stats_t& ds, ceph::bufferlist& bl);
+  void decode(dedup_stats_t& ds, ceph::bufferlist::const_iterator& bl);
+
+  struct worker_stats_t {
+    worker_stats_t& operator +=(const worker_stats_t& other);
     void dump(Formatter *f) const;
 
     uint64_t ingress_obj = 0;
@@ -138,109 +129,16 @@ namespace rgw::dedup {
     utime_t  duration = {0, 0};
   };
   std::ostream& operator<<(std::ostream &out, const worker_stats_t &s);
+  void encode(const worker_stats_t& w, ceph::bufferlist& bl);
+  void decode(worker_stats_t& w, ceph::bufferlist::const_iterator& bl);
 
-  inline void encode(const worker_stats_t& w, ceph::bufferlist& bl)
-  {
-    ENCODE_START(1, 1, bl);
-    encode(w.ingress_obj, bl);
-    encode(w.ingress_obj_bytes, bl);
-    encode(w.egress_records, bl);
-    encode(w.egress_blocks, bl);
-    encode(w.egress_slabs, bl);
-
-    encode(w.single_part_objs, bl);
-    encode(w.multipart_objs, bl);
-    encode(w.small_multipart_obj, bl);
-
-    encode(w.default_storage_class_objs, bl);
-    encode(w.default_storage_class_objs_bytes, bl);
-    encode(w.non_default_storage_class_objs, bl);
-    encode(w.non_default_storage_class_objs_bytes, bl);
-
-    encode(w.ingress_corrupted_etag, bl);
-
-    encode(w.ingress_skip_too_small_bytes, bl);
-    encode(w.ingress_skip_too_small, bl);
-
-    encode(w.ingress_skip_too_small_64KB_bytes, bl);
-    encode(w.ingress_skip_too_small_64KB, bl);
-
-    encode(w.duration, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  inline void decode(worker_stats_t& w, ceph::bufferlist::const_iterator& bl)
-  {
-    DECODE_START(1, bl);
-    decode(w.ingress_obj, bl);
-    decode(w.ingress_obj_bytes, bl);
-    decode(w.egress_records, bl);
-    decode(w.egress_blocks, bl);
-    decode(w.egress_slabs, bl);
-    decode(w.single_part_objs, bl);
-    decode(w.multipart_objs, bl);
-    decode(w.small_multipart_obj, bl);
-    decode(w.default_storage_class_objs, bl);
-    decode(w.default_storage_class_objs_bytes, bl);
-    decode(w.non_default_storage_class_objs, bl);
-    decode(w.non_default_storage_class_objs_bytes, bl);
-    decode(w.ingress_corrupted_etag, bl);
-    decode(w.ingress_skip_too_small_bytes, bl);
-    decode(w.ingress_skip_too_small, bl);
-    decode(w.ingress_skip_too_small_64KB_bytes, bl);
-    decode(w.ingress_skip_too_small_64KB, bl);
-
-    decode(w.duration, bl);
-    DECODE_FINISH(bl);
-  }
 
   struct md5_stats_t {
-    md5_stats_t& operator +=(const md5_stats_t& other) {
-      this->ingress_failed_load_bucket    += other.ingress_failed_load_bucket;
-      this->ingress_failed_get_object     += other.ingress_failed_get_object;
-      this->ingress_failed_get_obj_attrs  += other.ingress_failed_get_obj_attrs;
-      this->ingress_corrupted_etag        += other.ingress_corrupted_etag;
-      this->ingress_corrupted_obj_attrs   += other.ingress_corrupted_obj_attrs;
-      this->ingress_skip_encrypted        += other.ingress_skip_encrypted;
-      this->ingress_skip_encrypted_bytes  += other.ingress_skip_encrypted_bytes;
-      this->ingress_skip_compressed       += other.ingress_skip_compressed;
-      this->ingress_skip_compressed_bytes += other.ingress_skip_compressed_bytes;
-      this->ingress_skip_changed_objs     += other.ingress_skip_changed_objs;
-      this->shared_manifest_dedup_bytes   += other.shared_manifest_dedup_bytes;
-
-      this->skipped_shared_manifest += other.skipped_shared_manifest;
-      this->skipped_singleton       += other.skipped_singleton;
-      this->skipped_singleton_bytes += other.skipped_singleton_bytes;
-      this->skipped_source_record   += other.skipped_source_record;
-      this->duplicate_records       += other.duplicate_records;
-      this->size_mismatch           += other.size_mismatch;
-      this->sha256_mismatch         += other.sha256_mismatch;
-      this->failed_src_load         += other.failed_src_load;
-      this->failed_rec_load         += other.failed_rec_load;
-      this->failed_block_load       += other.failed_block_load;
-
-      this->valid_sha256_attrs      += other.valid_sha256_attrs;
-      this->invalid_sha256_attrs    += other.invalid_sha256_attrs;
-      this->set_sha256_attrs        += other.set_sha256_attrs;
-      this->skip_sha256_cmp         += other.skip_sha256_cmp;
-
-      this->set_shared_manifest_src += other.set_shared_manifest_src;
-      this->loaded_objects          += other.loaded_objects;
-      this->processed_objects       += other.processed_objects;
-      this->singleton_count         += other.singleton_count;
-      this->duplicate_count         += other.duplicate_count;
-      this->dedup_bytes_estimate    += other.dedup_bytes_estimate;
-      this->unique_count            += other.unique_count;
-      this->deduped_objects         += other.deduped_objects;
-      this->deduped_objects_bytes   += other.deduped_objects_bytes;
-
-      this->failed_dedup            += other.failed_dedup;
-      this->failed_table_load       += other.failed_table_load;
-      this->failed_map_overflow     += other.failed_map_overflow;
-      return *this;
-    }
+    md5_stats_t& operator +=(const md5_stats_t& other);
     void dump(Formatter *f) const;
 
+    dedup_stats_t small_objs_stat;
+    dedup_stats_t big_objs_stat;
     uint64_t ingress_failed_load_bucket = 0;
     uint64_t ingress_failed_get_object = 0;
     uint64_t ingress_failed_get_obj_attrs = 0;
@@ -254,6 +152,7 @@ namespace rgw::dedup {
 
     uint64_t shared_manifest_dedup_bytes = 0;
     uint64_t skipped_shared_manifest = 0;
+    uint64_t skipped_purged_small = 0;
     uint64_t skipped_singleton = 0;
     uint64_t skipped_singleton_bytes = 0;
     uint64_t skipped_source_record = 0;
@@ -272,116 +171,20 @@ namespace rgw::dedup {
     uint64_t set_shared_manifest_src = 0;
     uint64_t loaded_objects = 0;
     uint64_t processed_objects = 0;
-    uint64_t singleton_count = 0;
-    uint64_t duplicate_count = 0;
     // counter is using on-disk size affected by block-size
-    uint64_t dedup_bytes_estimate = 0;
-    uint64_t unique_count = 0;
+    uint64_t dup_head_bytes_estimate = 0; // estimated bytes wasted by duplicated head-objects
     uint64_t deduped_objects = 0;
     // counter is using s3 byte size disregarding the on-disk size affected by block-size
     uint64_t deduped_objects_bytes = 0;
+    uint64_t dup_head_bytes = 0;
     uint64_t failed_dedup = 0;
     uint64_t failed_table_load = 0;
     uint64_t failed_map_overflow = 0;
     utime_t  duration = {0, 0};
   };
   std::ostream &operator<<(std::ostream &out, const md5_stats_t &s);
-  inline void encode(const md5_stats_t& m, ceph::bufferlist& bl)
-  {
-    ENCODE_START(1, 1, bl);
-
-    encode(m.ingress_failed_load_bucket, bl);
-    encode(m.ingress_failed_get_object, bl);
-    encode(m.ingress_failed_get_obj_attrs, bl);
-    encode(m.ingress_corrupted_etag, bl);
-    encode(m.ingress_corrupted_obj_attrs, bl);
-    encode(m.ingress_skip_encrypted, bl);
-    encode(m.ingress_skip_encrypted_bytes, bl);
-    encode(m.ingress_skip_compressed, bl);
-    encode(m.ingress_skip_compressed_bytes, bl);
-    encode(m.ingress_skip_changed_objs, bl);
-    encode(m.shared_manifest_dedup_bytes, bl);
-
-    encode(m.skipped_shared_manifest, bl);
-    encode(m.skipped_singleton, bl);
-    encode(m.skipped_singleton_bytes, bl);
-    encode(m.skipped_source_record, bl);
-    encode(m.duplicate_records, bl);
-    encode(m.size_mismatch, bl);
-    encode(m.sha256_mismatch, bl);
-    encode(m.failed_src_load, bl);
-    encode(m.failed_rec_load, bl);
-    encode(m.failed_block_load, bl);
-
-    encode(m.valid_sha256_attrs, bl);
-    encode(m.invalid_sha256_attrs, bl);
-    encode(m.set_sha256_attrs, bl);
-    encode(m.skip_sha256_cmp, bl);
-    encode(m.set_shared_manifest_src, bl);
-
-    encode(m.loaded_objects, bl);
-    encode(m.processed_objects, bl);
-    encode(m.singleton_count, bl);
-    encode(m.duplicate_count, bl);
-    encode(m.dedup_bytes_estimate, bl);
-    encode(m.unique_count, bl);
-    encode(m.deduped_objects, bl);
-    encode(m.deduped_objects_bytes, bl);
-    encode(m.failed_dedup, bl);
-    encode(m.failed_table_load, bl);
-    encode(m.failed_map_overflow, bl);
-
-    encode(m.duration, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  inline void decode(md5_stats_t& m, ceph::bufferlist::const_iterator& bl)
-  {
-    DECODE_START(1, bl);
-    decode(m.ingress_failed_load_bucket, bl);
-    decode(m.ingress_failed_get_object, bl);
-    decode(m.ingress_failed_get_obj_attrs, bl);
-    decode(m.ingress_corrupted_etag, bl);
-    decode(m.ingress_corrupted_obj_attrs, bl);
-    decode(m.ingress_skip_encrypted, bl);
-    decode(m.ingress_skip_encrypted_bytes, bl);
-    decode(m.ingress_skip_compressed, bl);
-    decode(m.ingress_skip_compressed_bytes, bl);
-    decode(m.ingress_skip_changed_objs, bl);
-    decode(m.shared_manifest_dedup_bytes, bl);
-
-    decode(m.skipped_shared_manifest, bl);
-    decode(m.skipped_singleton, bl);
-    decode(m.skipped_singleton_bytes, bl);
-    decode(m.skipped_source_record, bl);
-    decode(m.duplicate_records, bl);
-    decode(m.size_mismatch, bl);
-    decode(m.sha256_mismatch, bl);
-    decode(m.failed_src_load, bl);
-    decode(m.failed_rec_load, bl);
-    decode(m.failed_block_load, bl);
-
-    decode(m.valid_sha256_attrs, bl);
-    decode(m.invalid_sha256_attrs, bl);
-    decode(m.set_sha256_attrs, bl);
-    decode(m.skip_sha256_cmp, bl);
-    decode(m.set_shared_manifest_src, bl);
-
-    decode(m.loaded_objects, bl);
-    decode(m.processed_objects, bl);
-    decode(m.singleton_count, bl);
-    decode(m.duplicate_count, bl);
-    decode(m.dedup_bytes_estimate, bl);
-    decode(m.unique_count, bl);
-    decode(m.deduped_objects, bl);
-    decode(m.deduped_objects_bytes, bl);
-    decode(m.failed_dedup, bl);
-    decode(m.failed_table_load, bl);
-    decode(m.failed_map_overflow, bl);
-
-    decode(m.duration, bl);
-    DECODE_FINISH(bl);
-  }
+  void encode(const md5_stats_t& m, ceph::bufferlist& bl);
+  void decode(md5_stats_t& m, ceph::bufferlist::const_iterator& bl);
 
   struct parsed_etag_t {
     uint64_t md5_high;  // High Bytes of the Object Data MD5
index 44338c0fea8130179785ed4361108b837c87647d..ed693b15f9d5d39c8845748289c4ba1cc62cee71 100644 (file)
@@ -284,6 +284,7 @@ void RGWZoneParams::decode_json(JSONObj *obj)
   RGWSystemMetaObj::decode_json(obj);
   JSONDecoder::decode_json("domain_root", domain_root, obj);
   JSONDecoder::decode_json("control_pool", control_pool, obj);
+  JSONDecoder::decode_json("dedup_pool", dedup_pool, obj);
   JSONDecoder::decode_json("gc_pool", gc_pool, obj);
   JSONDecoder::decode_json("lc_pool", lc_pool, obj);
   JSONDecoder::decode_json("log_pool", log_pool, obj);
@@ -311,6 +312,7 @@ void RGWZoneParams::dump(Formatter *f) const
   RGWSystemMetaObj::dump(f);
   encode_json("domain_root", domain_root, f);
   encode_json("control_pool", control_pool, f);
+  encode_json("dedup_pool", dedup_pool, f);
   encode_json("gc_pool", gc_pool, f);
   encode_json("lc_pool", lc_pool, f);
   encode_json("log_pool", log_pool, f);
@@ -472,6 +474,7 @@ void add_zone_pools(const RGWZoneParams& info,
 {
   pools.insert(info.domain_root);
   pools.insert(info.control_pool);
+  pools.insert(info.dedup_pool);
   pools.insert(info.gc_pool);
   pools.insert(info.log_pool);
   pools.insert(info.intent_log_pool);
@@ -1274,6 +1277,7 @@ int init_zone_pool_names(const DoutPrefixProvider *dpp, optional_yield y,
 {
   info.domain_root = fix_zone_pool_dup(pools, info.name, ".rgw.meta:root", info.domain_root);
   info.control_pool = fix_zone_pool_dup(pools, info.name, ".rgw.control", info.control_pool);
+  info.dedup_pool = fix_zone_pool_dup(pools, info.name, ".rgw.dedup", info.dedup_pool);
   info.gc_pool = fix_zone_pool_dup(pools, info.name, ".rgw.log:gc", info.gc_pool);
   info.lc_pool = fix_zone_pool_dup(pools, info.name, ".rgw.log:lc", info.lc_pool);
   info.log_pool = fix_zone_pool_dup(pools, info.name, ".rgw.log", info.log_pool);
index e7b344d3e94e1c97209897720c591a124c32dbec..9adbc77ec849f4c2b6afb39567632d9e3beb8a80 100644 (file)
@@ -3,5 +3,5 @@ markers =
   basic_test
 
 log_cli=true
-log_cli_level=WARNING
-#log_cli_level=INFO
+#log_cli_level=WARNING
+log_cli_level=INFO
index 26fb8da61a7a61ff711b1e8ac67fddeb97e0a40e..a339e25b6b417e14011e15685c2c39a8cf6b500d 100644 (file)
@@ -41,15 +41,21 @@ class Dedup_Stats:
     set_sha256: int = 0
     total_processed_objects: int = 0
     size_before_dedup: int = 0
-    loaded_objects: int = 0
+    #loaded_objects: int = 0
     set_shared_manifest_src : int = 0
     deduped_obj: int = 0
     singleton_obj : int = 0
     unique_obj : int = 0
     dedup_bytes_estimate : int = 0
     duplicate_obj : int = 0
+    dup_head_size_estimate : int = 0
+    dup_head_size : int = 0
     deduped_obj_bytes : int = 0
     non_default_storage_class_objs_bytes : int = 0
+    potential_singleton_obj : int = 0
+    potential_unique_obj : int = 0
+    potential_duplicate_obj : int = 0
+    potential_dedup_space : int = 0
 
 @dataclass
 class Dedup_Ratio:
@@ -71,7 +77,7 @@ test_path = os.path.normpath(os.path.dirname(os.path.realpath(__file__))) + '/..
 
 #-----------------------------------------------
 def bash(cmd, **kwargs):
-    #log.info('running command: %s', ' '.join(cmd))
+    #log.debug('running command: %s', ' '.join(cmd))
     kwargs['stdout'] = subprocess.PIPE
     process = subprocess.Popen(cmd, **kwargs)
     s = process.communicate()[0].decode('utf-8')
@@ -95,7 +101,7 @@ def gen_bucket_name():
 
     num_buckets += 1
     bucket_name = run_prefix + '-' + str(num_buckets)
-    log.info("bucket_name=%s", bucket_name);
+    log.debug("bucket_name=%s", bucket_name);
     return bucket_name
 
 #-----------------------------------------------
@@ -118,11 +124,11 @@ def close_all_connections():
     global g_simple_connection
 
     for conn in g_simple_connection:
-        log.info("close simple connection")
+        log.debug("close simple connection")
         conn.close()
 
     for conn in g_tenant_connections:
-        log.info("close tenant connection")
+        log.debug("close tenant connection")
         conn.close()
 
 #-----------------------------------------------
@@ -131,7 +137,7 @@ def get_connections(req_count):
     conns=[]
 
     for i in range(min(req_count, len(g_simple_connection))):
-        log.info("recycle existing connection")
+        log.debug("recycle existing connection")
         conns.append(g_simple_connection[i])
 
     if len(conns) < req_count:
@@ -145,7 +151,7 @@ def get_connections(req_count):
             scheme = 'http://'
 
         for i in range(req_count - len(conns)):
-            log.info("generate new connection")
+            log.debug("generate new connection")
             client = boto3.client('s3',
                                   endpoint_url=scheme+hostname+':'+str(port_no),
                                   aws_access_key_id=access_key,
@@ -194,7 +200,7 @@ def gen_connections_multi2(req_count):
     g_tenants=[]
     global num_conns
 
-    log.info("gen_connections_multi: Create connection and buckets ...")
+    log.debug("gen_connections_multi: Create connection and buckets ...")
     suffix=run_prefix
 
     tenants=[]
@@ -202,7 +208,7 @@ def gen_connections_multi2(req_count):
     conns=[]
 
     for i in range(min(req_count, len(g_tenants))):
-        log.info("recycle existing tenants connection")
+        log.debug("recycle existing tenants connection")
         conns.append(g_tenants_connection[i])
         tenants.append(g_tenants[i])
         # we need to create a new bucket as we remove existing buckets at cleanup
@@ -227,7 +233,7 @@ def gen_connections_multi2(req_count):
             g_tenant_connections.append(conn)
             conns.append(conn)
 
-    log.info("gen_connections_multi: All connection and buckets are set")
+    log.debug("gen_connections_multi: All connection and buckets are set")
     return (tenants, bucket_names, conns)
 
 
@@ -238,7 +244,7 @@ def gen_connections_multi(num_tenants):
     tenants=[]
     bucket_names=[]
     conns=[]
-    log.info("gen_connections_multi: Create connection and buckets ...")
+    log.debug("gen_connections_multi: Create connection and buckets ...")
     suffix=run_prefix
     for i in range(0, num_tenants):
         num_conns += 1
@@ -254,7 +260,7 @@ def gen_connections_multi(num_tenants):
         bucket=conn.create_bucket(Bucket=bucket_name)
         conns.append(conn)
 
-    log.info("gen_connections_multi: All connection and buckets are set")
+    log.debug("gen_connections_multi: All connection and buckets are set")
     return (tenants, bucket_names, conns)
 
 
@@ -264,6 +270,7 @@ def gen_connections_multi(num_tenants):
 OUT_DIR="/tmp/dedup/"
 KB=(1024)
 MB=(1024*KB)
+POTENTIAL_OBJ_SIZE=(64*KB)
 RADOS_OBJ_SIZE=(4*MB)
 MULTIPART_SIZE=(16*MB)
 default_config = TransferConfig(multipart_threshold=MULTIPART_SIZE, multipart_chunksize=MULTIPART_SIZE)
@@ -282,9 +289,9 @@ def write_file(filename, size):
 #-------------------------------------------------------------------------------
 def print_size(caller, size):
     if (size < MB):
-        log.info("%s::size=%.2f KiB (%d Bytes)", caller, size/KB, size)
+        log.debug("%s::size=%.2f KiB (%d Bytes)", caller, size/KB, size)
     else:
-        log.info("%s::size=%.2f MiB", caller, size/MB)
+        log.debug("%s::size=%.2f MiB", caller, size/MB)
 
 
 #-------------------------------------------------------------------------------
@@ -366,13 +373,13 @@ def gen_files(files, start_size, factor, max_copies_count=4):
 def count_space_in_all_buckets():
     result = rados(['df'])
     assert result[1] == 0
-    log.info("=============================================")
+    log.debug("=============================================")
     for line in result[0].splitlines():
         if line.startswith(POOLNAME):
-            log.info(line[:45])
+            log.debug(line[:45])
         elif line.startswith("POOL_NAME"):
-            log.info(line[:45])
-            log.info("=============================================")
+            log.debug(line[:45])
+            log.debug("=============================================")
 
 
 #-------------------------------------------------------------------------------
@@ -381,7 +388,7 @@ def count_objects_in_bucket(bucket_name, conn):
     marker=""
     obj_count=0
     while True:
-        log.info("bucket_name=%s", bucket_name)
+        log.debug("bucket_name=%s", bucket_name)
         listing=conn.list_objects(Bucket=bucket_name, Marker=marker, MaxKeys=max_keys)
         if 'Contents' not in listing or len(listing['Contents'])== 0:
             return 0
@@ -390,7 +397,7 @@ def count_objects_in_bucket(bucket_name, conn):
 
         if listing['IsTruncated']:
             marker=listing['NextMarker']
-            log.info("marker=%s, obj_count=%d", marker, obj_count)
+            log.debug("marker=%s, obj_count=%d", marker, obj_count)
             continue
         else:
             return obj_count
@@ -417,11 +424,11 @@ def count_object_parts_in_all_buckets(verbose=False):
     names=result[0].split()
     count = 0
     for name in names:
-        #log.info(name)
+        #log.debug(name)
         count = count + 1
 
     if verbose:
-        log.info("Pool has %d rados objects", count)
+        log.debug("Pool has %d rados objects", count)
 
     return count
 
@@ -444,7 +451,7 @@ def delete_bucket_with_all_objects(bucket_name, conn):
     while True:
         listing=conn.list_objects(Bucket=bucket_name, Marker=marker, MaxKeys=max_keys)
         if 'Contents' not in listing or len(listing['Contents'])== 0:
-            log.info("Bucket '%s' is empty, skipping...", bucket_name)
+            log.debug("Bucket '%s' is empty, skipping...", bucket_name)
             return
 
         objects=[]
@@ -457,7 +464,7 @@ def delete_bucket_with_all_objects(bucket_name, conn):
         conn.delete_objects(Bucket=bucket_name, Delete={'Objects':objects})
         if listing['IsTruncated']:
             marker=listing['NextMarker']
-            log.info("marker=%s, obj_count=%d", marker, obj_count)
+            log.debug("marker=%s, obj_count=%d", marker, obj_count)
             continue
         else:
             break
@@ -533,6 +540,7 @@ def calc_rados_obj_count(num_copies, obj_size, config):
 
 #-------------------------------------------------------------------------------
 def calc_dedupable_space(obj_size, config):
+    dup_head_size=0
     threshold = config.multipart_threshold
     # Objects with size bigger than MULTIPART_SIZE are uploaded as multi-part
     # multi-part objects got a zero size Head objects
@@ -540,12 +548,13 @@ def calc_dedupable_space(obj_size, config):
         dedupable_space = obj_size
     elif obj_size > RADOS_OBJ_SIZE:
         dedupable_space = obj_size - RADOS_OBJ_SIZE
+        dup_head_size = RADOS_OBJ_SIZE
     else:
         dedupable_space = 0
 
     log.debug("obj_size=%.2f MiB, dedupable_space=%.2f MiB",
               float(obj_size)/MB, float(dedupable_space)/MB)
-    return dedupable_space
+    return (dedupable_space, dup_head_size)
 
 BLOCK_SIZE=4096
 #-------------------------------------------------------------------------------
@@ -555,6 +564,7 @@ def calc_on_disk_byte_size(byte_size):
 
 #-------------------------------------------------------------------------------
 def calc_expected_stats(dedup_stats, obj_size, num_copies, config):
+    dups_count = (num_copies - 1)
     on_disk_byte_size = calc_on_disk_byte_size(obj_size)
     log.debug("obj_size=%d, on_disk_byte_size=%d", obj_size, on_disk_byte_size)
     threshold = config.multipart_threshold
@@ -563,10 +573,19 @@ def calc_expected_stats(dedup_stats, obj_size, num_copies, config):
     if on_disk_byte_size <= RADOS_OBJ_SIZE and threshold > RADOS_OBJ_SIZE:
         dedup_stats.skip_too_small += num_copies
         dedup_stats.skip_too_small_bytes += (on_disk_byte_size * num_copies)
+
+        if on_disk_byte_size >= POTENTIAL_OBJ_SIZE:
+            if num_copies == 1:
+                dedup_stats.potential_singleton_obj += 1
+            else:
+                dedup_stats.potential_unique_obj += 1
+                dedup_stats.potential_duplicate_obj += dups_count
+                dedup_stats.potential_dedup_space += (on_disk_byte_size * dups_count)
+
         return
 
     dedup_stats.total_processed_objects += num_copies
-    dedup_stats.loaded_objects += num_copies
+    #dedup_stats.loaded_objects += num_copies
 
     if num_copies == 1:
         dedup_stats.singleton_obj += 1
@@ -578,11 +597,14 @@ def calc_expected_stats(dedup_stats, obj_size, num_copies, config):
         dedup_stats.set_sha256 += num_copies
         dedup_stats.invalid_sha256 += num_copies
         dedup_stats.unique_obj += 1
-        dups_count = (num_copies - 1)
         dedup_stats.duplicate_obj += dups_count
         dedup_stats.deduped_obj += dups_count
-        deduped_obj_bytes=calc_dedupable_space(on_disk_byte_size, config)
+        ret=calc_dedupable_space(on_disk_byte_size, config)
+        deduped_obj_bytes=ret[0]
+        dup_head_size=ret[1]
         dedup_stats.deduped_obj_bytes += (deduped_obj_bytes * dups_count)
+        dedup_stats.dup_head_size += (dup_head_size * dups_count)
+        dedup_stats.dup_head_size_estimate += (dup_head_size * dups_count)
         deduped_block_bytes=((deduped_obj_bytes+BLOCK_SIZE-1)//BLOCK_SIZE)*BLOCK_SIZE
         dedup_stats.dedup_bytes_estimate += (deduped_block_bytes * dups_count)
 
@@ -626,7 +648,9 @@ def upload_objects(bucket_name, files, indices, conn, config, check_obj_count=Tr
         assert(obj_size)
         calc_expected_stats(dedup_stats, obj_size, num_copies, config)
         total_space += (obj_size * num_copies)
-        dedupable_space=calc_dedupable_space(obj_size, config)
+        ret=calc_dedupable_space(obj_size, config)
+        dedupable_space=ret[0]
+        dup_head_size=ret[1]
         duplicated_space += ((num_copies-1) * dedupable_space)
         rados_obj_count=calc_rados_obj_count(num_copies, obj_size, config)
         rados_objects_total += (rados_obj_count * num_copies)
@@ -634,25 +658,25 @@ def upload_objects(bucket_name, files, indices, conn, config, check_obj_count=Tr
         log.debug("upload_objects::%s::size=%d, num_copies=%d", filename, obj_size, num_copies);
         s3_objects_total += num_copies
         if s3_objects_total and (s3_objects_total % 1000 == 0):
-            log.info("%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
+            log.debug("%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
                      s3_objects_total, rados_objects_total, total_space/MB)
         for i in range(idx, num_copies):
             key = gen_object_name(filename, i)
-            #log.info("upload_file %s/%s with crc32", bucket_name, key)
+            #log.debug("upload_file %s/%s with crc32", bucket_name, key)
             conn.upload_file(OUT_DIR + filename, bucket_name, key, Config=config, ExtraArgs={'ChecksumAlgorithm': 'crc32'})
 
     log.debug("==========================================")
-    log.info("Summery:\n%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
+    log.debug("Summery:\n%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
              s3_objects_total, rados_objects_total, total_space/MB)
     log.debug("Based on calculation we should have %d rados objects", rados_objects_total)
     log.debug("Based on calculation we should have %d duplicated tail objs", duplicated_tail_objs)
-    log.info("Based on calculation we should have %.2f MiB total in pool", total_space/MB)
-    log.info("Based on calculation we should have %.2f MiB duplicated space in pool", duplicated_space/MB)
+    log.debug("Based on calculation we should have %.2f MiB total in pool", total_space/MB)
+    log.debug("Based on calculation we should have %.2f MiB duplicated space in pool", duplicated_space/MB)
 
     expected_rados_obj_count_post_dedup=(rados_objects_total-duplicated_tail_objs)
     log.debug("Post dedup expcted rados obj count = %d", expected_rados_obj_count_post_dedup)
     expcted_space_post_dedup=(total_space-duplicated_space)
-    log.info("Post dedup expcted data in pool = %.2f MiB", expcted_space_post_dedup/MB)
+    log.debug("Post dedup expcted data in pool = %.2f MiB", expcted_space_post_dedup/MB)
     if check_obj_count:
         assert rados_objects_total == count_object_parts_in_all_buckets()
 
@@ -676,7 +700,9 @@ def upload_objects_multi(files, conns, bucket_names, indices, config, check_obj_
         assert(obj_size)
         calc_expected_stats(dedup_stats, obj_size, num_copies, config)
         total_space += (obj_size * num_copies)
-        dedupable_space=calc_dedupable_space(obj_size, config)
+        ret=calc_dedupable_space(obj_size, config)
+        dedupable_space=ret[0]
+        dup_head_size=ret[1]
         duplicated_space += ((num_copies-1) * dedupable_space)
         rados_obj_count=calc_rados_obj_count(num_copies, obj_size, config)
         rados_objects_total += (rados_obj_count * num_copies)
@@ -684,7 +710,7 @@ def upload_objects_multi(files, conns, bucket_names, indices, config, check_obj_
         log.debug("upload_objects::%s::size=%d, num_copies=%d", filename, obj_size, num_copies);
         s3_objects_total += num_copies
         if s3_objects_total and (s3_objects_total % 1000 == 0):
-            log.info("%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
+            log.debug("%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
                      s3_objects_total, rados_objects_total, total_space/MB)
         for i in range(idx, num_copies):
             ten_id = i % max_tenants
@@ -693,7 +719,7 @@ def upload_objects_multi(files, conns, bucket_names, indices, config, check_obj_
             log.debug("upload_objects::<%s/%s>", bucket_names[ten_id], key)
 
     log.debug("==========================================")
-    log.info("Summery:%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
+    log.debug("Summery:%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
              s3_objects_total, rados_objects_total, total_space/MB)
     log.debug("Based on calculation we should have %d rados objects", rados_objects_total)
     log.debug("Based on calculation we should have %d duplicated tail objs", duplicated_tail_objs)
@@ -704,7 +730,7 @@ def upload_objects_multi(files, conns, bucket_names, indices, config, check_obj_
     for (bucket_name, conn) in zip(bucket_names, conns):
         s3_object_count += count_objects_in_bucket(bucket_name, conn)
 
-    log.info("bucket listings reported a total of %d s3 objects", s3_object_count)
+    log.debug("bucket listings reported a total of %d s3 objects", s3_object_count)
     expected_rados_obj_count_post_dedup=(rados_objects_total-duplicated_tail_objs)
     log.debug("Post dedup expcted rados obj count = %d", expected_rados_obj_count_post_dedup)
     expcted_space_post_dedup=(total_space-duplicated_space)
@@ -732,7 +758,7 @@ def proc_upload(proc_id, num_procs, files, conn, bucket_name, indices, config):
             if (proc_id == target_proc):
                 key = gen_object_name(filename, i)
                 conn.upload_file(OUT_DIR+filename, bucket_name, key, Config=config)
-                log.info("[%d]upload_objects::<%s/%s>", proc_id, bucket_name, key)
+                log.debug("[%d]upload_objects::<%s/%s>", proc_id, bucket_name, key)
 
 
 #---------------------------------------------------------------------------
@@ -759,7 +785,9 @@ def procs_upload_objects(files, conns, bucket_names, indices, config, check_obj_
         assert(obj_size)
         calc_expected_stats(dedup_stats, obj_size, num_copies, config)
         total_space += (obj_size * num_copies)
-        dedupable_space=calc_dedupable_space(obj_size, config)
+        ret=calc_dedupable_space(obj_size, config)
+        dedupable_space=ret[0]
+        dup_head_size=ret[1]
         duplicated_space += ((num_copies-1) * dedupable_space)
         rados_obj_count=calc_rados_obj_count(num_copies, obj_size, config)
         rados_objects_total += (rados_obj_count * num_copies)
@@ -772,7 +800,7 @@ def procs_upload_objects(files, conns, bucket_names, indices, config, check_obj_
         proc_list[idx].join()
 
     log.debug("==========================================")
-    log.info("Summery:%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
+    log.debug("Summery:%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
              s3_objects_total, rados_objects_total, total_space/MB)
     log.debug("Based on calculation we should have %d rados objects", rados_objects_total)
     log.debug("Based on calculation we should have %d duplicated tail objs", duplicated_tail_objs)
@@ -783,7 +811,7 @@ def procs_upload_objects(files, conns, bucket_names, indices, config, check_obj_
     for (bucket_name, conn) in zip(bucket_names, conns):
         s3_object_count += count_objects_in_bucket(bucket_name, conn)
 
-    log.info("bucket listings reported a total of %d s3 objects", s3_object_count)
+    log.debug("bucket listings reported a total of %d s3 objects", s3_object_count)
     expected_rados_obj_count_post_dedup=(rados_objects_total-duplicated_tail_objs)
     log.debug("Post dedup expcted rados obj count = %d", expected_rados_obj_count_post_dedup)
     expcted_space_post_dedup=(total_space-duplicated_space)
@@ -806,7 +834,7 @@ def verify_objects(bucket_name, files, conn, expected_results, config):
         log.debug("comparing file=%s, size=%d, copies=%d", filename, obj_size, num_copies)
         for i in range(0, num_copies):
             key = gen_object_name(filename, i)
-            #log.info("download_file(%s) with crc32", key)
+            #log.debug("download_file(%s) with crc32", key)
             conn.download_file(bucket_name, key, tempfile, Config=config, ExtraArgs={'ChecksumMode': 'crc32'})
             #conn.download_file(bucket_name, key, tempfile, Config=config)
             result = bash(['cmp', tempfile, OUT_DIR + filename])
@@ -814,7 +842,7 @@ def verify_objects(bucket_name, files, conn, expected_results, config):
             os.remove(tempfile)
 
     assert expected_results == count_object_parts_in_all_buckets(True)
-    log.info("verify_objects::completed successfully!!")
+    log.debug("verify_objects::completed successfully!!")
 
 
 #-------------------------------------------------------------------------------
@@ -836,7 +864,7 @@ def verify_objects_multi(files, conns, bucket_names, expected_results, config):
             os.remove(tempfile)
 
     assert expected_results == count_object_parts_in_all_buckets(True)
-    log.info("verify_objects::completed successfully!!")
+    log.debug("verify_objects::completed successfully!!")
 
 
 #-------------------------------------------------------------------------------
@@ -847,13 +875,13 @@ def thread_verify(thread_id, num_threads, files, conn, bucket, config):
         filename=f[0]
         obj_size=f[1]
         num_copies=f[2]
-        log.info("comparing file=%s, size=%d, copies=%d", filename, obj_size, num_copies)
+        log.debug("comparing file=%s, size=%d, copies=%d", filename, obj_size, num_copies)
         for i in range(0, num_copies):
             target_thread = count % num_threads
             count += 1
             if thread_id == target_thread:
                 key = gen_object_name(filename, i)
-                log.info("comparing object %s with file %s", key, filename)
+                log.debug("comparing object %s with file %s", key, filename)
                 conn.download_file(bucket, key, tempfile, Config=config)
                 result = bash(['cmp', tempfile, OUT_DIR + filename])
                 assert result[1] == 0 ,"Files %s and %s differ!!" % (key, tempfile)
@@ -876,7 +904,7 @@ def threads_verify_objects(files, conns, bucket_names, expected_results, config)
         thread_list[idx].join()
 
     assert expected_results == count_object_parts_in_all_buckets(True)
-    log.info("verify_objects::completed successfully!!")
+    log.debug("verify_objects::completed successfully!!")
 
 
 #-------------------------------------------------------------------------------
@@ -903,6 +931,7 @@ def reset_full_dedup_stats(dedup_stats):
     dedup_stats.total_processed_objects = 0
     dedup_stats.set_shared_manifest_src = 0
     dedup_stats.deduped_obj = 0
+    dedup_stats.dup_head_size = 0
     dedup_stats.deduped_obj_bytes = 0
     dedup_stats.skip_shared_manifest = 0
     dedup_stats.skip_src_record = 0
@@ -959,7 +988,7 @@ def read_dedup_ratio(json):
     dedup_ratio.s3_bytes_after=json['s3_bytes_after']
     dedup_ratio.ratio=json['dedup_ratio']
 
-    log.info("Completed! ::ratio=%f", dedup_ratio.ratio)
+    log.debug("Completed! ::ratio=%f", dedup_ratio.ratio)
     return dedup_ratio
 
 #-------------------------------------------------------------------------------
@@ -975,10 +1004,10 @@ def verify_dedup_ratio(expected_dedup_stats, dedup_ratio):
     else:
         ratio = 0
 
-    log.info("s3_bytes_before = %d/%d", s3_bytes_before, dedup_ratio.s3_bytes_before)
-    log.info("s3_dedup_bytes = %d", expected_dedup_stats.dedup_bytes_estimate);
-    log.info("s3_bytes_after = %d/%d", s3_bytes_after, dedup_ratio.s3_bytes_after)
-    log.info("ratio = %f/%f", ratio, dedup_ratio.ratio)
+    log.debug("s3_bytes_before = %d/%d", s3_bytes_before, dedup_ratio.s3_bytes_before)
+    log.debug("s3_dedup_bytes = %d", expected_dedup_stats.dedup_bytes_estimate);
+    log.debug("s3_bytes_after = %d/%d", s3_bytes_after, dedup_ratio.s3_bytes_after)
+    log.debug("ratio = %f/%f", ratio, dedup_ratio.ratio)
 
     assert s3_bytes_before == dedup_ratio.s3_bytes_before
     assert s3_bytes_after == dedup_ratio.s3_bytes_after
@@ -1013,7 +1042,7 @@ def read_dedup_stats(dry_run):
     if key in jstats:
         md5_stats=jstats[key]
         main=md5_stats['main']
-        dedup_stats.loaded_objects = main['Loaded objects']
+        #dedup_stats.loaded_objects = main['Loaded objects']
         if dry_run == False:
             read_full_dedup_stats(dedup_stats, md5_stats)
 
@@ -1022,19 +1051,27 @@ def read_dedup_stats(dry_run):
         dedup_stats.duplicate_obj = main['Duplicate Obj']
         dedup_stats.dedup_bytes_estimate = main['Dedup Bytes Estimate']
 
+        potential = md5_stats['Potential Dedup']
+        dedup_stats.dup_head_size_estimate = potential['Duplicated Head Bytes Estimate']
+        dedup_stats.dup_head_size = potential['Duplicated Head Bytes']
+        dedup_stats.potential_singleton_obj = potential['Singleton Obj (64KB-4MB)']
+        dedup_stats.potential_unique_obj = potential['Unique Obj (64KB-4MB)']
+        dedup_stats.potential_duplicate_obj = potential['Duplicate Obj (64KB-4MB)']
+        dedup_stats.potential_dedup_space = potential['Dedup Bytes Estimate (64KB-4MB)']
+
     dedup_work_was_completed=jstats['completed']
     if dedup_work_was_completed:
         dedup_ratio_estimate=read_dedup_ratio(jstats['dedup_ratio_estimate'])
         dedup_ratio_actual=read_dedup_ratio(jstats['dedup_ratio_actual'])
     else:
-        log.info("Uncompleted!")
+        log.debug("Uncompleted!")
 
     return (dedup_work_was_completed, dedup_stats, dedup_ratio_estimate, dedup_ratio_actual)
 
 
 #-------------------------------------------------------------------------------
 def exec_dedup_internal(expected_dedup_stats, dry_run, max_dedup_time):
-    log.info("sending exec_dedup request: dry_run=%d", dry_run)
+    log.debug("sending exec_dedup request: dry_run=%d", dry_run)
     if dry_run:
         result = admin(['dedup', 'estimate'])
         reset_full_dedup_stats(expected_dedup_stats)
@@ -1042,7 +1079,7 @@ def exec_dedup_internal(expected_dedup_stats, dry_run, max_dedup_time):
         result = admin(['dedup', 'restart'])
 
     assert result[1] == 0
-    log.info("wait for dedup to complete")
+    log.debug("wait for dedup to complete")
 
     dedup_time = 0
     dedup_timeout = 5
@@ -1080,16 +1117,20 @@ def exec_dedup(expected_dedup_stats, dry_run, verify_stats=True):
     if verify_stats == False:
         return ret
 
+    if dedup_stats.potential_unique_obj or expected_dedup_stats.potential_unique_obj:
+        log.debug("potential_unique_obj= %d / %d ", dedup_stats.potential_unique_obj,
+                  expected_dedup_stats.potential_unique_obj)
+
     #dedup_stats.set_sha256 = dedup_stats.invalid_sha256
     if dedup_stats != expected_dedup_stats:
-        log.info("==================================================")
+        log.debug("==================================================")
         print_dedup_stats_diff(dedup_stats, expected_dedup_stats)
-        print_dedup_stats(dedup_stats)
-        log.info("==================================================\n")
+        #print_dedup_stats(dedup_stats)
+        log.debug("==================================================\n")
         assert dedup_stats == expected_dedup_stats
 
     verify_dedup_ratio(expected_dedup_stats, dedup_ratio_estimate)
-    log.info("expcted_dedup::stats check completed successfully!!")
+    log.debug("expcted_dedup::stats check completed successfully!!")
     return ret
 
 
@@ -1104,6 +1145,13 @@ def prepare_test():
 
     os.mkdir(OUT_DIR)
 
+#-------------------------------------------------------------------------------
+def copy_potential_stats(new_dedup_stats, dedup_stats):
+    new_dedup_stats.potential_singleton_obj = dedup_stats.potential_singleton_obj
+    new_dedup_stats.potential_unique_obj    = dedup_stats.potential_unique_obj
+    new_dedup_stats.potential_duplicate_obj = dedup_stats.potential_duplicate_obj
+    new_dedup_stats.potential_dedup_space   = dedup_stats.potential_dedup_space
+
 
 #-------------------------------------------------------------------------------
 def small_single_part_objs_dedup(conn, bucket_name, dry_run):
@@ -1115,13 +1163,13 @@ def small_single_part_objs_dedup(conn, bucket_name, dry_run):
     prepare_test()
     try:
         files=[]
-        num_files = 10
+        num_files = 8
         base_size = 4*KB
-        log.info("generate files: base size=%d KiB, max_size=%d KiB",
-                 base_size/KB, (pow(2, num_files) * base_size)/KB)
+        log.debug("generate files: base size=%d KiB, max_size=%d KiB",
+                  base_size/KB, (pow(2, num_files) * base_size)/KB)
         gen_files(files, base_size, num_files)
         bucket = conn.create_bucket(Bucket=bucket_name)
-        log.info("upload objects to bucket <%s> ...", bucket_name)
+        log.debug("upload objects to bucket <%s> ...", bucket_name)
         indices = [0] * len(files)
         ret = upload_objects(bucket_name, files, indices, conn, default_config)
         expected_results = ret[0]
@@ -1130,6 +1178,8 @@ def small_single_part_objs_dedup(conn, bucket_name, dry_run):
 
         # expected stats for small objects - all zeros except for skip_too_small
         small_objs_dedup_stats = Dedup_Stats()
+        #small_objs_dedup_stats.loaded_objects=dedup_stats.loaded_objects
+        copy_potential_stats(small_objs_dedup_stats, dedup_stats)
         small_objs_dedup_stats.size_before_dedup = dedup_stats.size_before_dedup
         small_objs_dedup_stats.skip_too_small_bytes=dedup_stats.size_before_dedup
         small_objs_dedup_stats.skip_too_small = s3_objects_total
@@ -1137,7 +1187,7 @@ def small_single_part_objs_dedup(conn, bucket_name, dry_run):
 
         exec_dedup(dedup_stats, dry_run)
         if dry_run == False:
-            log.info("Verify all objects")
+            log.debug("Verify all objects")
             verify_objects(bucket_name, files, conn, expected_results, default_config)
 
     finally:
@@ -1167,16 +1217,17 @@ def simple_dedup(conn, files, bucket_name, run_cleanup_after, config, dry_run):
     # 9) call GC to make sure everything was removed
     #10) verify that there is nothing left on pool (i.e. ref-count is working)
     try:
-        log.info("conn.create_bucket(%s)", bucket_name)
+        log.debug("conn.create_bucket(%s)", bucket_name)
         bucket = conn.create_bucket(Bucket=bucket_name)
         indices = [0] * len(files)
-        log.info("upload objects to bucket <%s> ...", bucket_name)
+        log.debug("upload objects to bucket <%s> ...", bucket_name)
         ret = upload_objects(bucket_name, files, indices, conn, config)
         expected_results = ret[0]
         dedup_stats = ret[1]
+
         exec_dedup(dedup_stats, dry_run)
         if dry_run == False:
-            log.info("Verify all objects")
+            log.debug("Verify all objects")
             verify_objects(bucket_name, files, conn, expected_results, config)
 
         return ret
@@ -1194,7 +1245,7 @@ def simple_dedup_with_tenants(files, conns, bucket_names, config, dry_run=False)
     dedup_stats = ret[1]
     exec_dedup(dedup_stats, dry_run)
     if dry_run == False:
-        log.info("Verify all objects")
+        log.debug("Verify all objects")
         verify_objects_multi(files, conns, bucket_names, expected_results, config)
 
     return ret
@@ -1228,14 +1279,14 @@ def threads_simple_dedup_with_tenants(files, conns, bucket_names, config, dry_ru
     exec_time_sec=exec_ret[0]
     verify_time_sec=0
     if dry_run == False:
-        log.info("Verify all objects")
+        log.debug("Verify all objects")
         start = time.time_ns()
         threads_verify_objects(files, conns, bucket_names,
                                expected_results, config)
         verify_time_sec = (time.time_ns() - start)  / (1000*1000*1000)
 
     log.info("[%d] obj_count=%d, upload=%d(sec), exec=%d(sec), verify=%d(sec)",
-                len(conns), s3_objects_total, upload_time_sec, exec_time_sec, verify_time_sec);
+             len(conns), s3_objects_total, upload_time_sec, exec_time_sec, verify_time_sec);
     return upload_ret
 
 
@@ -1256,15 +1307,15 @@ def threads_dedup_basic_with_tenants_common(files, num_conns, config, dry_run):
 def check_full_dedup_state():
     global full_dedup_state_was_checked
     global full_dedup_state_disabled
-    log.info("check_full_dedup_state:: sending FULL Dedup request")
+    log.debug("check_full_dedup_state:: sending FULL Dedup request")
     result = admin(['dedup', 'restart'])
     if result[1] == 0:
-        log.info("full dedup is enabled!")
+        log.debug("full dedup is enabled!")
         full_dedup_state_disabled = False
         result = admin(['dedup', 'abort'])
         assert result[1] == 0
     else:
-        log.info("full dedup is disabled, skip all full dedup tests")
+        log.debug("full dedup is disabled, skip all full dedup tests")
         full_dedup_state_disabled = True
 
     full_dedup_state_was_checked = True
@@ -1280,7 +1331,7 @@ def full_dedup_is_disabled():
         full_dedup_state_disabled = check_full_dedup_state()
 
     if full_dedup_state_disabled:
-        log.info("Full Dedup is DISABLED, skipping test...")
+        log.debug("Full Dedup is DISABLED, skipping test...")
 
     return full_dedup_state_disabled
 
@@ -1338,15 +1389,15 @@ def gen_new_etag(etag, corruption, expected_dedup_stats):
 
 #------------------------------------------------------------------------------
 def corrupt_etag(key, corruption, expected_dedup_stats):
-    log.info("key=%s, corruption=%s", key, corruption);
+    log.debug("key=%s, corruption=%s", key, corruption);
     result = rados(['ls', '-p ', POOLNAME])
     assert result[1] == 0
 
     names=result[0].split()
     for name in names:
-        log.info("name=%s", name)
+        log.debug("name=%s", name)
         if key in name:
-            log.info("key=%s is a substring of name=%s", key, name);
+            log.debug("key=%s is a substring of name=%s", key, name);
             rados_name = name
             break;
 
@@ -1356,7 +1407,7 @@ def corrupt_etag(key, corruption, expected_dedup_stats):
 
     new_etag=gen_new_etag(old_etag, corruption, expected_dedup_stats)
 
-    log.info("Corruption:: %s\nold_etag=%s\nnew_etag=%s",
+    log.debug("Corruption:: %s\nold_etag=%s\nnew_etag=%s",
              corruption, old_etag, new_etag)
     change_object_etag(rados_name, new_etag)
     return (rados_name, old_etag)
@@ -1370,7 +1421,7 @@ def test_dedup_etag_corruption():
         return
 
     bucket_name = gen_bucket_name()
-    log.info("test_dedup_etag_corruption: connect to AWS ...")
+    log.debug("test_dedup_etag_corruption: connect to AWS ...")
     conn=get_single_connection()
     prepare_test()
     try:
@@ -1457,7 +1508,7 @@ def test_md5_collisions():
         write_bin_file(files, s2_bin, "s2")
 
         bucket_name = gen_bucket_name()
-        log.info("test_md5_collisions: connect to AWS ...")
+        log.debug("test_md5_collisions: connect to AWS ...")
         config2=TransferConfig(multipart_threshold=64, multipart_chunksize=1*MB)
         conn=get_single_connection()
         bucket = conn.create_bucket(Bucket=bucket_name)
@@ -1467,7 +1518,7 @@ def test_md5_collisions():
         dedup_stats = Dedup_Stats()
         # we wrote 2 different small objects (BLOCK_SIZE) with the same md5
         dedup_stats.total_processed_objects=2
-        dedup_stats.loaded_objects=dedup_stats.total_processed_objects
+        #dedup_stats.loaded_objects=dedup_stats.total_processed_objects
         # the objects will seem like a duplications with 1 unique and 1 duplicate
         dedup_stats.unique_obj=1
         dedup_stats.duplicate_obj=1
@@ -1487,7 +1538,7 @@ def test_md5_collisions():
         expected_ratio_actual.ratio=0
 
         dry_run=False
-        log.info("test_md5_collisions: first call to exec_dedup")
+        log.debug("test_md5_collisions: first call to exec_dedup")
         ret=exec_dedup(dedup_stats, dry_run)
         dedup_ratio_actual=ret[3]
 
@@ -1497,7 +1548,7 @@ def test_md5_collisions():
         dedup_stats.invalid_sha256=0
         dedup_stats.set_sha256=0
 
-        log.info("test_md5_collisions: second call to exec_dedup")
+        log.debug("test_md5_collisions: second call to exec_dedup")
         ret=exec_dedup(dedup_stats, dry_run)
         dedup_ratio_actual=ret[3]
 
@@ -1517,7 +1568,7 @@ def test_dedup_small():
         return
 
     bucket_name = gen_bucket_name()
-    log.info("test_dedup_small: connect to AWS ...")
+    log.debug("test_dedup_small: connect to AWS ...")
     conn=get_single_connection()
     small_single_part_objs_dedup(conn, bucket_name, False)
 
@@ -1535,7 +1586,7 @@ def test_dedup_small_with_tenants():
     files=[]
     num_files=10 # [4KB-4MB]
     base_size = 4*KB
-    log.info("generate files: base size=%d KiB, max_size=%d KiB",
+    log.debug("generate files: base size=%d KiB, max_size=%d KiB",
              base_size/KB, (pow(2, num_files) * base_size)/KB)
     try:
         gen_files(files, base_size, num_files, max_copies_count)
@@ -1552,6 +1603,8 @@ def test_dedup_small_with_tenants():
 
         # expected stats for small objects - all zeros except for skip_too_small
         small_objs_dedup_stats = Dedup_Stats()
+        #small_objs_dedup_stats.loaded_objects=dedup_stats.loaded_objects
+        copy_potential_stats(small_objs_dedup_stats, dedup_stats)
         small_objs_dedup_stats.size_before_dedup=dedup_stats.size_before_dedup
         small_objs_dedup_stats.skip_too_small_bytes=dedup_stats.size_before_dedup
         small_objs_dedup_stats.skip_too_small=s3_objects_total
@@ -1559,7 +1612,7 @@ def test_dedup_small_with_tenants():
 
         dry_run=False
         exec_dedup(dedup_stats, dry_run)
-        log.info("Verify all objects")
+        log.debug("Verify all objects")
         verify_objects_multi(files, conns, bucket_names, expected_results, default_config)
     finally:
         # cleanup must be executed even after a failure
@@ -1580,7 +1633,7 @@ def test_dedup_inc_0_with_tenants():
         return
 
     prepare_test()
-    log.info("test_dedup_inc_0: connect to AWS ...")
+    log.debug("test_dedup_inc_0: connect to AWS ...")
     max_copies_count=3
     config=default_config
     ret=gen_connections_multi2(max_copies_count)
@@ -1598,6 +1651,7 @@ def test_dedup_inc_0_with_tenants():
         s3_objects_total = ret[2]
 
         dedup_stats2 = dedup_stats
+        dedup_stats2.dup_head_size = 0
         dedup_stats2.skip_shared_manifest=dedup_stats.deduped_obj
         dedup_stats2.skip_src_record=dedup_stats.set_shared_manifest_src
         dedup_stats2.set_shared_manifest_src=0
@@ -1607,7 +1661,7 @@ def test_dedup_inc_0_with_tenants():
         dedup_stats2.invalid_sha256=0
         dedup_stats2.set_sha256=0
 
-        log.info("test_dedup_inc_0_with_tenants: incremental dedup:")
+        log.debug("test_dedup_inc_0_with_tenants: incremental dedup:")
         # run dedup again and make sure nothing has changed
         dry_run=False
         exec_dedup(dedup_stats2, dry_run)
@@ -1633,7 +1687,7 @@ def test_dedup_inc_0():
     config=default_config
     prepare_test()
     bucket_name = gen_bucket_name()
-    log.info("test_dedup_inc_0: connect to AWS ...")
+    log.debug("test_dedup_inc_0: connect to AWS ...")
     conn=get_single_connection()
     try:
         files=[]
@@ -1646,6 +1700,7 @@ def test_dedup_inc_0():
         s3_objects_total = ret[2]
 
         dedup_stats2 = dedup_stats
+        dedup_stats2.dup_head_size = 0
         dedup_stats2.skip_shared_manifest=dedup_stats.deduped_obj
         dedup_stats2.skip_src_record=dedup_stats.set_shared_manifest_src
         dedup_stats2.set_shared_manifest_src=0
@@ -1655,7 +1710,7 @@ def test_dedup_inc_0():
         dedup_stats2.invalid_sha256=0
         dedup_stats2.set_sha256=0
 
-        log.info("test_dedup_inc_0: incremental dedup:")
+        log.debug("test_dedup_inc_0: incremental dedup:")
         # run dedup again and make sure nothing has changed
         dry_run=False
         exec_dedup(dedup_stats2, dry_run)
@@ -1678,7 +1733,7 @@ def test_dedup_inc_1_with_tenants():
         return
 
     prepare_test()
-    log.info("test_dedup_inc_1_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_inc_1_with_tenants: connect to AWS ...")
     max_copies_count=6
     config=default_config
     ret=gen_connections_multi2(max_copies_count)
@@ -1713,6 +1768,7 @@ def test_dedup_inc_1_with_tenants():
         stats_combined=ret[1]
         stats_combined.skip_shared_manifest = stats_base.deduped_obj
         stats_combined.skip_src_record     -= stats_base.skip_src_record
+        stats_combined.dup_head_size       -= stats_base.dup_head_size
         stats_combined.skip_src_record     += stats_base.set_shared_manifest_src
 
         stats_combined.set_shared_manifest_src -= stats_base.set_shared_manifest_src
@@ -1723,7 +1779,7 @@ def test_dedup_inc_1_with_tenants():
         stats_combined.invalid_sha256 -= stats_base.set_sha256
         stats_combined.set_sha256     -= stats_base.set_sha256
 
-        log.info("test_dedup_inc_1_with_tenants: incremental dedup:")
+        log.debug("test_dedup_inc_1_with_tenants: incremental dedup:")
         # run dedup again
         dry_run=False
         exec_dedup(stats_combined, dry_run)
@@ -1748,7 +1804,7 @@ def test_dedup_inc_1():
     config=default_config
     prepare_test()
     bucket_name = gen_bucket_name()
-    log.info("test_dedup_inc_1: connect to AWS ...")
+    log.debug("test_dedup_inc_1: connect to AWS ...")
     conn=get_single_connection()
     try:
         files=[]
@@ -1776,6 +1832,7 @@ def test_dedup_inc_1():
         expected_results = ret[0]
         stats_combined = ret[1]
         stats_combined.skip_shared_manifest = stats_base.deduped_obj
+        stats_combined.dup_head_size       -= stats_base.dup_head_size
         stats_combined.skip_src_record     -= stats_base.skip_src_record
         stats_combined.skip_src_record     += stats_base.set_shared_manifest_src
 
@@ -1787,7 +1844,7 @@ def test_dedup_inc_1():
         stats_combined.invalid_sha256 -= stats_base.set_sha256
         stats_combined.set_sha256     -= stats_base.set_sha256
 
-        log.info("test_dedup_inc_1: incremental dedup:")
+        log.debug("test_dedup_inc_1: incremental dedup:")
         # run dedup again
         dry_run=False
         exec_dedup(stats_combined, dry_run)
@@ -1811,7 +1868,7 @@ def test_dedup_inc_2_with_tenants():
         return
 
     prepare_test()
-    log.info("test_dedup_inc_2_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_inc_2_with_tenants: connect to AWS ...")
     max_copies_count=6
     config=default_config
     ret=gen_connections_multi2(max_copies_count)
@@ -1853,6 +1910,7 @@ def test_dedup_inc_2_with_tenants():
         expected_results = ret[0]
         stats_combined = ret[1]
         stats_combined.skip_shared_manifest = stats_base.deduped_obj
+        stats_combined.dup_head_size       -= stats_base.dup_head_size
         stats_combined.skip_src_record     -= stats_base.skip_src_record
         stats_combined.skip_src_record     += stats_base.set_shared_manifest_src
 
@@ -1864,7 +1922,7 @@ def test_dedup_inc_2_with_tenants():
         stats_combined.invalid_sha256 -= stats_base.set_sha256
         stats_combined.set_sha256     -= stats_base.set_sha256
 
-        log.info("test_dedup_inc_2_with_tenants: incremental dedup:")
+        log.debug("test_dedup_inc_2_with_tenants: incremental dedup:")
         # run dedup again
         dry_run=False
         exec_dedup(stats_combined, dry_run)
@@ -1890,7 +1948,7 @@ def test_dedup_inc_2():
     config=default_config
     prepare_test()
     bucket_name = gen_bucket_name()
-    log.info("test_dedup_inc_2: connect to AWS ...")
+    log.debug("test_dedup_inc_2: connect to AWS ...")
     conn=get_single_connection()
     try:
         files=[]
@@ -1926,6 +1984,7 @@ def test_dedup_inc_2():
         stats_combined = ret[1]
         stats_combined.skip_shared_manifest = stats_base.deduped_obj
         stats_combined.skip_src_record     -= stats_base.skip_src_record
+        stats_combined.dup_head_size       -= stats_base.dup_head_size
         stats_combined.skip_src_record     += stats_base.set_shared_manifest_src
 
         stats_combined.set_shared_manifest_src -= stats_base.set_shared_manifest_src
@@ -1936,7 +1995,7 @@ def test_dedup_inc_2():
         stats_combined.invalid_sha256 -= stats_base.set_sha256
         stats_combined.set_sha256     -= stats_base.set_sha256
 
-        log.info("test_dedup_inc_2: incremental dedup:")
+        log.debug("test_dedup_inc_2: incremental dedup:")
         # run dedup again
         dry_run=False
         exec_dedup(stats_combined, dry_run)
@@ -1960,7 +2019,7 @@ def test_dedup_inc_with_remove_multi_tenants():
         return
 
     prepare_test()
-    log.info("test_dedup_inc_with_remove_multi_tenants: connect to AWS ...")
+    log.debug("test_dedup_inc_with_remove_multi_tenants: connect to AWS ...")
     max_copies_count=6
     config=default_config
     ret=gen_connections_multi2(max_copies_count)
@@ -2013,6 +2072,7 @@ def test_dedup_inc_with_remove_multi_tenants():
         # run dedup again
         dedup_stats.set_shared_manifest_src=0
         dedup_stats.deduped_obj=0
+        dedup_stats.dup_head_size=0
         dedup_stats.deduped_obj_bytes=0
         dedup_stats.skip_src_record=src_record
         dedup_stats.skip_shared_manifest=shared_manifest
@@ -2020,7 +2080,7 @@ def test_dedup_inc_with_remove_multi_tenants():
         dedup_stats.invalid_sha256=0
         dedup_stats.set_sha256=0
 
-        log.info("test_dedup_inc_with_remove: incremental dedup:")
+        log.debug("test_dedup_inc_with_remove_multi_tenants: incremental dedup:")
         dry_run=False
         exec_dedup(dedup_stats, dry_run)
         expected_results=calc_expected_results(files_sub, config)
@@ -2045,7 +2105,7 @@ def test_dedup_inc_with_remove():
     config=default_config
     prepare_test()
     bucket_name = gen_bucket_name()
-    log.info("test_dedup_inc_with_remove: connect to AWS ...")
+    log.debug("test_dedup_inc_with_remove: connect to AWS ...")
     conn=get_single_connection()
     try:
         files=[]
@@ -2086,7 +2146,7 @@ def test_dedup_inc_with_remove():
                 object_keys.append(key)
 
             if len(object_keys) == 0:
-                log.info("Skiping file=%s, num_remove=%d", filename, num_remove)
+                log.debug("Skipping file=%s, num_remove=%d", filename, num_remove)
                 continue
 
             response=conn.delete_objects(Bucket=bucket_name,
@@ -2099,6 +2159,7 @@ def test_dedup_inc_with_remove():
         # run dedup again
         dedup_stats.set_shared_manifest_src=0
         dedup_stats.deduped_obj=0
+        dedup_stats.dup_head_size=0
         dedup_stats.deduped_obj_bytes=0
         dedup_stats.skip_src_record=src_record
         dedup_stats.skip_shared_manifest=shared_manifest
@@ -2106,9 +2167,9 @@ def test_dedup_inc_with_remove():
         dedup_stats.invalid_sha256=0
         dedup_stats.set_sha256=0
 
-        log.info("test_dedup_inc_with_remove: incremental dedup:")
-        log.info("stats_base.size_before_dedup=%d", stats_base.size_before_dedup)
-        log.info("dedup_stats.size_before_dedup=%d", dedup_stats.size_before_dedup)
+        log.debug("test_dedup_inc_with_remove: incremental dedup:")
+        log.debug("stats_base.size_before_dedup=%d", stats_base.size_before_dedup)
+        log.debug("dedup_stats.size_before_dedup=%d", dedup_stats.size_before_dedup)
         dry_run=False
         exec_dedup(dedup_stats, dry_run)
         expected_results=calc_expected_results(files_sub, config)
@@ -2127,7 +2188,7 @@ def test_dedup_multipart_with_tenants():
         return
 
     prepare_test()
-    log.info("test_dedup_multipart_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_multipart_with_tenants: connect to AWS ...")
     max_copies_count=3
     num_files=8
     files=[]
@@ -2154,7 +2215,7 @@ def test_dedup_multipart():
 
     prepare_test()
     bucket_name = gen_bucket_name()
-    log.info("test_dedup_multipart: connect to AWS ...")
+    log.debug("test_dedup_multipart: connect to AWS ...")
     conn=get_single_connection()
     files=[]
 
@@ -2185,7 +2246,7 @@ def test_dedup_basic_with_tenants():
     num_files=23
     file_size=33*MB
     files=[]
-    log.info("test_dedup_basic_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_basic_with_tenants: connect to AWS ...")
     gen_files_fixed_size(files, num_files, file_size, max_copies_count)
     dedup_basic_with_tenants_common(files, max_copies_count, default_config, False)
 
@@ -2200,15 +2261,15 @@ def test_dedup_basic():
 
     prepare_test()
     bucket_name = gen_bucket_name()
-    log.info("test_dedup_basic: connect to AWS ...")
+    log.debug("test_dedup_basic: connect to AWS ...")
     conn=get_single_connection()
     files=[]
     num_files=5
     base_size = MULTIPART_SIZE
-    log.info("generate files: base size=%d MiB, max_size=%d MiB",
+    log.debug("generate files: base size=%d MiB, max_size=%d MiB",
              base_size/MB, (pow(2, num_files) * base_size)/MB)
     gen_files(files, base_size, num_files)
-    log.info("call simple_dedup()")
+    log.debug("call simple_dedup()")
     simple_dedup(conn, files, bucket_name, True, default_config, False)
 
 
@@ -2227,7 +2288,7 @@ def test_dedup_small_multipart_with_tenants():
     max_size=512*KB
     files=[]
     config=TransferConfig(multipart_threshold=min_size, multipart_chunksize=1*MB)
-    log.info("test_dedup_small_multipart_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_small_multipart_with_tenants: connect to AWS ...")
 
     # create files in range [4KB-512KB] aligned on 4KB
     gen_files_in_range(files, num_files, min_size, max_size, min_size)
@@ -2243,7 +2304,7 @@ def test_dedup_small_multipart():
         return
 
     prepare_test()
-    log.info("test_dedup_small_multipart: connect to AWS ...")
+    log.debug("test_dedup_small_multipart: connect to AWS ...")
     config2=TransferConfig(multipart_threshold=4*KB, multipart_chunksize=1*MB)
     conn=get_single_connection()
     files=[]
@@ -2261,7 +2322,7 @@ def test_dedup_small_multipart():
 #-------------------------------------------------------------------------------
 @pytest.mark.basic_test
 def test_dedup_large_scale_with_tenants():
-    #return
+    return  # NOTE(review): this commit disables test_dedup_large_scale_with_tenants — confirm intentional
 
     if full_dedup_is_disabled():
         return
@@ -2273,7 +2334,7 @@ def test_dedup_large_scale_with_tenants():
     size=1*KB
     files=[]
     config=TransferConfig(multipart_threshold=size, multipart_chunksize=1*MB)
-    log.info("test_dedup_large_scale_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_large_scale_with_tenants: connect to AWS ...")
     gen_files_fixed_size(files, num_files, size, max_copies_count)
     threads_dedup_basic_with_tenants_common(files, num_threads, config, False)
 
@@ -2281,7 +2342,7 @@ def test_dedup_large_scale_with_tenants():
 #-------------------------------------------------------------------------------
 @pytest.mark.basic_test
 def test_dedup_large_scale():
-    #return
+    return  # NOTE(review): this commit disables test_dedup_large_scale — confirm intentional
 
     if full_dedup_is_disabled():
         return
@@ -2293,7 +2354,7 @@ def test_dedup_large_scale():
     size=1*KB
     files=[]
     config=TransferConfig(multipart_threshold=size, multipart_chunksize=1*MB)
-    log.info("test_dedup_dry_large_scale_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_large_scale: connect to AWS ...")
     gen_files_fixed_size(files, num_files, size, max_copies_count)
     threads_dedup_basic_with_tenants_common(files, num_threads, config, False)
 
@@ -2301,13 +2362,13 @@ def test_dedup_large_scale():
 #-------------------------------------------------------------------------------
 @pytest.mark.basic_test
 def test_empty_bucket():
-    #return
+    return  # NOTE(review): this commit disables test_empty_bucket — confirm intentional
 
     if full_dedup_is_disabled():
         return
 
     prepare_test()
-    log.info("test_empty_bucket: connect to AWS ...")
+    log.debug("test_empty_bucket: connect to AWS ...")
 
     max_copies_count=2
     config = default_config
@@ -2361,6 +2422,7 @@ def inc_step_with_tenants(stats_base, files, conns, bucket_names, config):
     stats_combined.skip_shared_manifest = stats_base.deduped_obj
     stats_combined.skip_src_record      = src_record
     stats_combined.set_shared_manifest_src -= stats_base.set_shared_manifest_src
+    stats_combined.dup_head_size       -= stats_base.dup_head_size
     stats_combined.deduped_obj         -= stats_base.deduped_obj
     stats_combined.deduped_obj_bytes   -= stats_base.deduped_obj_bytes
 
@@ -2368,7 +2430,7 @@ def inc_step_with_tenants(stats_base, files, conns, bucket_names, config):
     stats_combined.invalid_sha256 -= stats_base.set_sha256
     stats_combined.set_sha256     -= stats_base.set_sha256
 
-    log.info("test_dedup_inc_2_with_tenants: incremental dedup:")
+    log.debug("inc_step_with_tenants: incremental dedup:")
     # run dedup again
     dry_run=False
     exec_dedup(stats_combined, dry_run)
@@ -2387,7 +2449,7 @@ def test_dedup_inc_loop_with_tenants():
         return
 
     prepare_test()
-    log.info("test_dedup_inc_loop_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_inc_loop_with_tenants: connect to AWS ...")
     max_copies_count=3
     config=default_config
     ret=gen_connections_multi2(max_copies_count)
@@ -2408,6 +2470,7 @@ def test_dedup_inc_loop_with_tenants():
             files=ret[0]
             stats_last=ret[1]
             stats_base.set_shared_manifest_src += stats_last.set_shared_manifest_src
+            stats_base.dup_head_size       += stats_last.dup_head_size
             stats_base.deduped_obj         += stats_last.deduped_obj
             stats_base.deduped_obj_bytes   += stats_last.deduped_obj_bytes
             stats_base.set_sha256          += stats_last.set_sha256
@@ -2423,13 +2486,13 @@ def test_dedup_inc_loop_with_tenants():
 def test_dedup_dry_small_with_tenants():
     #return
 
-    log.info("test_dedup_dry_small_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_dry_small_with_tenants: connect to AWS ...")
     prepare_test()
     max_copies_count=3
     files=[]
     num_files=10 # [4KB-4MB]
     base_size = 4*KB
-    log.info("generate files: base size=%d KiB, max_size=%d KiB",
+    log.debug("generate files: base size=%d KiB, max_size=%d KiB",
              base_size/KB, (pow(2, num_files) * base_size)/KB)
     try:
         gen_files(files, base_size, num_files, max_copies_count)
@@ -2446,6 +2509,7 @@ def test_dedup_dry_small_with_tenants():
 
         # expected stats for small objects - all zeros except for skip_too_small
         small_objs_dedup_stats = Dedup_Stats()
+        copy_potential_stats(small_objs_dedup_stats, dedup_stats)
         small_objs_dedup_stats.size_before_dedup=dedup_stats.size_before_dedup
         small_objs_dedup_stats.skip_too_small_bytes=dedup_stats.size_before_dedup
         small_objs_dedup_stats.skip_too_small=s3_objects_total
@@ -2464,7 +2528,7 @@ def test_dedup_dry_multipart():
 
     prepare_test()
     bucket_name = gen_bucket_name()
-    log.info("test_dedup_dry_multipart: connect to AWS ...")
+    log.debug("test_dedup_dry_multipart: connect to AWS ...")
     conn=get_single_connection()
     files=[]
 
@@ -2490,15 +2554,15 @@ def test_dedup_dry_basic():
 
     prepare_test()
     bucket_name = gen_bucket_name()
-    log.info("test_dedup_dry_basic: connect to AWS ...")
+    log.debug("test_dedup_dry_basic: connect to AWS ...")
     conn=get_single_connection()
     files=[]
     num_files=5
-    base_size = MULTIPART_SIZE
-    log.info("generate files: base size=%d MiB, max_size=%d MiB",
+    base_size = 2*MB
+    log.debug("generate files: base size=%d MiB, max_size=%d MiB",
              base_size/MB, (pow(2, num_files) * base_size)/MB)
     gen_files(files, base_size, num_files)
-    log.info("call simple_dedup()")
+    log.debug("call simple_dedup()")
     simple_dedup(conn, files, bucket_name, True, default_config, True)
 
 
@@ -2508,7 +2572,7 @@ def test_dedup_dry_small_multipart():
     #return
 
     prepare_test()
-    log.info("test_dedup_dry_small_multipart: connect to AWS ...")
+    log.debug("test_dedup_dry_small_multipart: connect to AWS ...")
     config2 = TransferConfig(multipart_threshold=4*KB, multipart_chunksize=1*MB)
     conn=get_single_connection()
     files=[]
@@ -2529,7 +2593,7 @@ def test_dedup_dry_small():
     #return
 
     bucket_name = gen_bucket_name()
-    log.info("test_dedup_dry_small: connect to AWS ...")
+    log.debug("test_dedup_dry_small: connect to AWS ...")
     conn=get_single_connection()
     small_single_part_objs_dedup(conn, bucket_name, True)
 
@@ -2546,20 +2610,23 @@ def test_dedup_dry_small_large_mix():
     #return
 
     dry_run=True
-    log.info("test_dedup_dry_small_large_mix: connect to AWS ...")
+    log.debug("test_dedup_dry_small_large_mix: connect to AWS ...")
     prepare_test()
 
     num_threads=4
     max_copies_count=3
     small_file_size=1*MB
+    mid_file_size=8*MB
     large_file_size=16*MB
     num_small_files=128
+    num_mid_files=32
     num_large_files=16
     files=[]
     conns=[]
     bucket_names=get_buckets(num_threads)
     try:
         gen_files_fixed_size(files, num_small_files, small_file_size, max_copies_count)
+        gen_files_fixed_size(files, num_mid_files, mid_file_size, max_copies_count)
         gen_files_fixed_size(files, num_large_files, large_file_size, max_copies_count)
 
         start = time.time_ns()
@@ -2573,9 +2640,8 @@ def test_dedup_dry_small_large_mix():
         expected_results = ret[0]
         dedup_stats = ret[1]
         s3_objects_total = ret[2]
-        log.info("new[%d] obj_count=%d, upload_time=%d(sec)",
-                 len(conns), s3_objects_total, upload_time_sec)
-
+        log.debug("obj_count=%d, upload_time=%d(sec)", s3_objects_total,
+                 upload_time_sec)
         exec_dedup(dedup_stats, dry_run)
         if dry_run == False:
             verify_objects(bucket_name, files, conn, expected_results, default_config)
@@ -2594,7 +2660,7 @@ def test_dedup_dry_basic_with_tenants():
     num_files=23
     file_size=33*MB
     files=[]
-    log.info("test_dedup_basic_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_dry_basic_with_tenants: connect to AWS ...")
     gen_files_fixed_size(files, num_files, file_size, max_copies_count)
     dedup_basic_with_tenants_common(files, max_copies_count, default_config, True)
 
@@ -2605,7 +2671,7 @@ def test_dedup_dry_multipart_with_tenants():
     #return
 
     prepare_test()
-    log.info("test_dedup_dry_multipart_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_dry_multipart_with_tenants: connect to AWS ...")
     max_copies_count=3
     num_files=8
     files=[]
@@ -2634,7 +2700,7 @@ def test_dedup_dry_small_multipart_with_tenants():
     max_size=512*KB
     files=[]
     config=TransferConfig(multipart_threshold=min_size, multipart_chunksize=1*MB)
-    log.info("test_dedup_small_multipart_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_dry_small_multipart_with_tenants: connect to AWS ...")
 
     # create files in range [4KB-512KB] aligned on 4KB
     gen_files_in_range(files, num_files, min_size, max_size, min_size)
@@ -2653,7 +2719,7 @@ def test_dedup_dry_large_scale_with_tenants():
     size=1*KB
     files=[]
     config=TransferConfig(multipart_threshold=size, multipart_chunksize=1*MB)
-    log.info("test_dedup_dry_large_scale_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_dry_large_scale_with_tenants: connect to AWS ...")
     gen_files_fixed_size(files, num_files, size, max_copies_count)
     threads_dedup_basic_with_tenants_common(files, num_threads, config, True)
 
@@ -2670,7 +2736,7 @@ def test_dedup_dry_large_scale():
     size=1*KB
     files=[]
     config=TransferConfig(multipart_threshold=size, multipart_chunksize=1*MB)
-    log.info("test_dedup_dry_large_scale_new: connect to AWS ...")
+    log.debug("test_dedup_dry_large_scale: connect to AWS ...")
     gen_files_fixed_size(files, num_files, size, max_copies_count)
     conns=get_connections(num_threads)
     bucket_names=get_buckets(num_threads)
@@ -2685,36 +2751,6 @@ def test_dedup_dry_large_scale():
         cleanup_all_buckets(bucket_names, conns)
 
 
-#-------------------------------------------------------------------------------
-@pytest.mark.basic_test
-def test_dedup_dry_large_scale_single_bucket():
-    return
-
-    prepare_test()
-    max_copies_count=3
-    num_threads=16
-    num_files=32*1024
-    size=1*KB
-    files=[]
-    config=TransferConfig(multipart_threshold=size, multipart_chunksize=1*MB)
-    log.info("test_dedup_dry_large_scale_new: connect to AWS ...")
-    gen_files_fixed_size(files, num_files, size, max_copies_count)
-    conns=get_connections(num_threads)
-
-    bucket_name=gen_bucket_name()
-    conns[0].create_bucket(Bucket=bucket_name)
-
-    bucket_names=[bucket_name] * num_threads
-
-    try:
-        threads_simple_dedup_with_tenants(files, conns, bucket_names, config, True)
-    except:
-        log.warning("test_dedup_dry_large_scale: failed!!")
-    finally:
-        # cleanup must be executed even after a failure
-        cleanup(bucket_name, conns[0])
-
-
 #-------------------------------------------------------------------------------
 @pytest.mark.basic_test
 def test_cleanup():