]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
rgw/dedup: full object dedup continued work
authorGabriel BenHanokh <gbenhano@redhat.com>
Sun, 21 Jul 2024 11:38:57 +0000 (11:38 +0000)
committerGabriel BenHanokh <gbenhano@redhat.com>
Mon, 30 Jun 2025 11:10:59 +0000 (11:10 +0000)
Moved all control objects (EPOCH, WATCH, Tokens) to default.rgw.control
pool.
Added dedup_pool to RGWZoneParams to make the name unique across zones.
The rgw.dedup pool is created when dedup starts and removed when the scan
is over.

Report space that remains duplicated after dedup because of the head-object.
Report potential dedup for smaller objects (64KB-4MB).
Added tests for the new reporting facilities.

Signed-off-by: Gabriel BenHanokh <gbenhano@redhat.com>
(cherry picked from commit 7e6021f580b2d21fe4871957ab609c720afd0ca8)

13 files changed:
src/rgw/driver/rados/rgw_zone.h
src/rgw/rgw_dedup.cc
src/rgw/rgw_dedup.h
src/rgw/rgw_dedup_cluster.cc
src/rgw/rgw_dedup_cluster.h
src/rgw/rgw_dedup_store.cc
src/rgw/rgw_dedup_table.cc
src/rgw/rgw_dedup_table.h
src/rgw/rgw_dedup_utils.cc
src/rgw/rgw_dedup_utils.h
src/rgw/rgw_zone.cc
src/test/rgw/dedup/pytest.ini
src/test/rgw/dedup/test_dedup.py

index 5fb2b4b809664b0e486418d24d14cf56410306c4..7860bad50f8a633c3c18d3023db876c3ffbb3395 100644 (file)
@@ -117,6 +117,7 @@ struct RGWZoneParams : RGWSystemMetaObj {
   rgw_pool topics_pool;
   rgw_pool account_pool;
   rgw_pool group_pool;
+  rgw_pool dedup_pool;
 
   RGWAccessKey system_key;
 
@@ -153,7 +154,7 @@ struct RGWZoneParams : RGWSystemMetaObj {
   const std::string& get_compression_type(const rgw_placement_rule& placement_rule) const;
   
   void encode(bufferlist& bl) const override {
-    ENCODE_START(15, 1, bl);
+    ENCODE_START(16, 1, bl);
     encode(domain_root, bl);
     encode(control_pool, bl);
     encode(gc_pool, bl);
@@ -182,11 +183,12 @@ struct RGWZoneParams : RGWSystemMetaObj {
     encode(topics_pool, bl);
     encode(account_pool, bl);
     encode(group_pool, bl);
+    encode(dedup_pool, bl);
     ENCODE_FINISH(bl);
   }
 
   void decode(bufferlist::const_iterator& bl) override {
-    DECODE_START(15, bl);
+    DECODE_START(16, bl);
     decode(domain_root, bl);
     decode(control_pool, bl);
     decode(gc_pool, bl);
@@ -264,6 +266,11 @@ struct RGWZoneParams : RGWSystemMetaObj {
       account_pool = name + ".rgw.meta:accounts";
       group_pool = name + ".rgw.meta:groups";
     }
+    if (struct_v >= 16) {
+      decode(dedup_pool, bl);
+    } else {
+      dedup_pool = name + ".rgw.dedup";
+    }
     DECODE_FINISH(bl);
   }
   void dump(Formatter *f) const;
index 0db80229ff14d783980feef1d65f4a099d5202d7..7bb44ecb98968188adad34044914505e6fec3fe8 100644 (file)
@@ -216,7 +216,149 @@ namespace rgw::dedup {
   // rgw::dedup::Background
   //===========================================================================
   //---------------------------------------------------------------------------
-  int Background::init_rados_access_handles()
+  static void display_ioctx_state(const DoutPrefixProvider *dpp,
+                                  const librados::IoCtx &ioctx,
+                                  const char *caller)
+  {
+    if (ioctx.is_valid()) {
+      ldpp_dout(dpp, 5) << caller << "::valid ioctx, instance_id="
+                        << ioctx.get_instance_id() << dendl;
+    }
+    else {
+      ldpp_dout(dpp, 5) << caller << "::invalid ioctx" << dendl;
+    }
+  }
+
+  //---------------------------------------------------------------------------
+  static int safe_pool_delete(rgw::sal::RadosStore     *store,
+                              const DoutPrefixProvider *dpp,
+                              int64_t                   expected_pool_id)
+  {
+    const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool;
+    auto rados_handle = store->getRados()->get_rados_handle();
+    int64_t pool_id = rados_handle->pool_lookup(dedup_pool.name.c_str());
+    if (pool_id < 0) {
+      int err = pool_id;
+      if (err == ENOENT) {
+        ldpp_dout(dpp, 10) <<__func__ << "::pool doesn't exist (probably was removed by other RGW)::"
+                           << dedup_pool.name << "::expected_pool_id="
+                           << expected_pool_id << dendl;
+      }
+      else {
+        ldpp_dout(dpp, 5) <<__func__ << "::failed pool_lookup(" << dedup_pool.name
+                          << ") err=" << cpp_strerror(-err) << dendl;
+      }
+      return err;
+    }
+
+    if (pool_id != expected_pool_id) {
+      ldpp_dout(dpp, 5) << __func__ << "::ERR: pool_id was changed from: "
+                        << expected_pool_id << " to: " << pool_id
+                        << " abort pool_delete() request!" << dendl;
+      // report Stale file handle
+      return -ESTALE;
+    }
+
+    ldpp_dout(dpp, 10) <<__func__ << "::calling delete pool(" << dedup_pool.name
+                       << ") pool_id=" << pool_id << dendl;
+    return rados_handle->pool_delete(dedup_pool.name.c_str());
+  }
+
+  //---------------------------------------------------------------------------
+  static int64_t create_pool(rgw::sal::RadosStore     *store,
+                             const DoutPrefixProvider *dpp,
+                             const std::string        &pool_name)
+  {
+#if 0
+    // using Replica-1 for the intermediate data
+    // since it can be regenerated in case of a failure
+    std::string replica_count(std::to_string(1));
+#else
+    // temporary solution until we find a way to disable the health warn on replica1
+    std::string replica_count(std::to_string(2));
+#endif
+    librados::bufferlist inbl;
+    std::string output;
+    std::string command = R"(
+    {
+      "prefix": "osd pool create",
+      "pool": ")" + pool_name +
+      R"(",
+      "pool_type": "replicated",
+      "size": )" + replica_count +
+      R"(
+    })";
+
+    auto rados_handle = store->getRados()->get_rados_handle();
+    int ret = rados_handle->mon_command(command, inbl, nullptr, &output);
+    if (output.length()) {
+      if (output != "pool 'rgw_dedup_pool' already exists") {
+        ldpp_dout(dpp, 10) << __func__ << "::" << output << dendl;
+      }
+    }
+    if (ret != 0 && ret != -EEXIST) {
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to create pool "
+                        << pool_name << " with: "
+                        << cpp_strerror(-ret) << ", ret=" << ret << dendl;
+      return ret;
+    }
+    const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool;
+    return rados_handle->pool_lookup(dedup_pool.name.c_str());
+  }
+
+  //---------------------------------------------------------------------------
+  static int init_dedup_pool_ioctx(rgw::sal::RadosStore     *store,
+                                   const DoutPrefixProvider *dpp,
+                                   bool                      create,
+                                   librados::IoCtx          &ioctx)
+  {
+    const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool;
+    std::string pool_name(dedup_pool.name.c_str());
+    auto rados_handle = store->getRados()->get_rados_handle();
+    int64_t pool_id = rados_handle->pool_lookup(dedup_pool.name.c_str());
+    if (pool_id >= 0) {
+      // TBD: what to do when create option is passed
+      ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name
+                         << " already exists, pool_id=" << pool_id << dendl;
+    }
+    else if (create) {
+      pool_id = create_pool(store, dpp, pool_name);
+      if (pool_id >= 0) {
+        ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name
+                           << " was created, pool_id=" << pool_id << dendl;
+      }
+      else {
+        return pool_id;
+      }
+    }
+    else {
+      ldpp_dout(dpp, 1) << __func__
+                        << "::ERR: pool doesn't exist and no create option" << dendl;
+      return -ENOENT;
+    }
+
+    int ret = rgw_init_ioctx(dpp, rados_handle, dedup_pool, ioctx);
+    if (unlikely(ret < 0)) {
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed rgw_init_ioctx() ret=" << ret
+                        << "::" << cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+
+    ret = ioctx.application_enable("rgw_dedup", false);
+    if (ret == 0) {
+      ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name
+                         << " was associated with dedup app" << dendl;
+    }
+    else {
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to enable pool "
+                        << dedup_pool.name << " with: "
+                        << cpp_strerror(-ret) << ", ret=" << ret << dendl;
+    }
+    return ret;
+  }
+
+  //---------------------------------------------------------------------------
+  int Background::init_rados_access_handles(bool init_pool)
   {
     store = dynamic_cast<rgw::sal::RadosStore*>(driver);
     if (!store) {
@@ -227,11 +369,12 @@ namespace rgw::dedup {
 
     rados = store->getRados();
     rados_handle = rados->get_rados_handle();
-
-    int ret = init_dedup_pool_ioctx(rados, dpp, d_dedup_cluster_ioctx);
-    ldpp_dout(dpp, 5) << __func__ << "::dedup background: ioctx="
-                      << d_dedup_cluster_ioctx.get_instance_id() << dendl;
-    return ret;
+    if (init_pool) {
+      int ret = init_dedup_pool_ioctx(store, dpp, true, d_dedup_cluster_ioctx);
+      display_ioctx_state(dpp, d_dedup_cluster_ioctx, __func__);
+      return ret;
+    }
+    return 0;
   }
 
   //---------------------------------------------------------------------------
@@ -247,11 +390,11 @@ namespace rgw::dedup {
     d_head_object_size = cct->_conf->rgw_max_chunk_size;
     //ceph_assert(4*1024*1024 == d_head_object_size);
 
-    int ret = init_rados_access_handles();
+    int ret = init_rados_access_handles(false);
     if (ret != 0) {
       derr << __func__ << "::ERR: failed init_rados_access_handles() ret="
            << ret << "::" << cpp_strerror(-ret) << dendl;
-      throw std::runtime_error("Failed init_dedup_pool_ioctx()");
+      throw std::runtime_error("Failed init_rados_access_handles()");
     }
 
     d_heart_beat_last_update = ceph_clock_now();
@@ -550,7 +693,7 @@ namespace rgw::dedup {
     }
     int ret = rgw_init_ioctx(dpp, rados->get_rados_handle(), data_pool, *p_ioctx);
     if (ret < 0) {
-      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to get ioxtc from data pool:"
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to get ioctx from data pool:"
                         << data_pool.to_str() << dendl;
       return -EIO;
     }
@@ -713,7 +856,7 @@ namespace rgw::dedup {
       }
 
       if (oid == raw_obj.oid) {
-        ldpp_dout(dpp, 10) << __func__ << "::manifest: head object=" << oid << dendl;
+        ldpp_dout(dpp, 20) << __func__ << "::manifest: head object=" << oid << dendl;
         head_ioctx = obj.ioctx;
       }
       bufferlist bl;
@@ -883,8 +1026,8 @@ namespace rgw::dedup {
     if (unlikely(should_print_debug)) {
       print_record(dpp, p_rec, old_block_id, old_rec_id, md5_shard);
     }
-
     p_stats->processed_objects ++;
+
     uint32_t size_4k_units = byte_size_to_disk_blocks(p_rec->s.obj_bytes_size);
     uint64_t ondisk_byte_size = disk_blocks_to_byte_size(size_4k_units);
     storage_class_idx_t sc_idx = remapper->remap(p_rec->stor_class, dpp,
@@ -898,11 +1041,23 @@ namespace rgw::dedup {
     dedup_table_t::value_t src_val;
     int ret = p_table->get_val(&key_from_bucket_index, &src_val);
     if (ret != 0) {
-      // record has no valid entry in table because it is a singleton
-      p_stats->skipped_singleton++;
-      p_stats->skipped_singleton_bytes += ondisk_byte_size;
-      ldpp_dout(dpp, 20) << __func__ << "::skipped singleton::" << p_rec->bucket_name
-                         << "/" << p_rec->obj_name << std::dec << dendl;
+      if (ondisk_byte_size <= d_min_obj_size_for_dedup && p_rec->s.num_parts == 0) {
+        // record has no valid entry in table because it is a too small
+        // It was loaded to table for calculation and then purged
+        p_stats->skipped_purged_small++;
+        ldpp_dout(dpp, 20) << __func__ << "::skipped purged small obj::"
+                           << p_rec->obj_name << "::" << ondisk_byte_size << dendl;
+        // help small object tests pass - avoid complication differentiating between
+        // small objects ( < 64KB,  >= 64KB <= 4MB, > 4MB
+        p_stats->processed_objects--;
+      }
+      else {
+        // record has no valid entry in table because it is a singleton
+        p_stats->skipped_singleton++;
+        p_stats->skipped_singleton_bytes += ondisk_byte_size;
+        ldpp_dout(dpp, 20) << __func__ << "::skipped singleton::"
+                           << p_rec->obj_name << std::dec << dendl;
+      }
       return 0;
     }
 
@@ -1168,6 +1323,11 @@ namespace rgw::dedup {
     if (ret == 0) {
       p_stats->deduped_objects++;
       p_stats->deduped_objects_bytes += dedupable_objects_bytes;
+      if (p_tgt_rec->s.num_parts == 0) {
+        // single part objects duplicate the head object when dedup is used
+        p_stats->dup_head_bytes += d_head_object_size;
+      }
+
       // mark the SRC object as a providor of a shared manifest
       if (!src_val.has_shared_manifest()) {
         p_stats->set_shared_manifest_src++;
@@ -1390,7 +1550,9 @@ namespace rgw::dedup {
           p_worker_stats->ingress_skip_too_small_64KB++;
           p_worker_stats->ingress_skip_too_small_64KB_bytes += ondisk_byte_size;
         }
-        return 0;
+        else {
+          return 0;
+        }
       }
       else {
         // multipart objects are always good candidates for dedup
@@ -1421,8 +1583,8 @@ namespace rgw::dedup {
       ldpp_dout(dpp, 20) << __func__ << "::max_elapsed_sec="
                          << d_heart_beat_max_elapsed_sec << dendl;
       d_heart_beat_last_update = now;
-      d_cluster.update_shard_token_heartbeat(d_dedup_cluster_ioctx, shard_id,
-                                             count_a, count_b, prefix);
+      d_cluster.update_shard_token_heartbeat(store, shard_id, count_a, count_b,
+                                             prefix);
     }
   }
 
@@ -1585,17 +1747,18 @@ namespace rgw::dedup {
 
   //---------------------------------------------------------------------------
   static void display_table_stat_counters(const DoutPrefixProvider* dpp,
-                                          uint64_t obj_count_in_shard,
                                           const md5_stats_t *p_stats)
   {
+    uint64_t obj_count_in_shard = (p_stats->big_objs_stat.singleton_count +
+                                   p_stats->big_objs_stat.unique_count +
+                                   p_stats->big_objs_stat.duplicate_count);
+
     ldpp_dout(dpp, 10) << "\n>>>>>" << __func__ << "::FINISHED STEP_BUILD_TABLE\n"
                        << "::total_count="      << obj_count_in_shard
                        << "::loaded_objects="   << p_stats->loaded_objects
-                       << "::singleton_count="  << p_stats->singleton_count
-                       << "::unique_count="     << p_stats->unique_count << "\n"
-                       << "::duplicate_count="  << p_stats->duplicate_count
-                       << "::duplicated_bytes=" << p_stats->dedup_bytes_estimate
-                       << dendl;
+                       << p_stats->big_objs_stat << dendl;
+    ldpp_dout(dpp, 10) << __func__ << "::small objs::"
+                       << p_stats->small_objs_stat << dendl;
   }
 
   //---------------------------------------------------------------------------
@@ -1620,11 +1783,9 @@ namespace rgw::dedup {
         return -ECANCELED;
       }
     }
-    p_table->count_duplicates(&p_stats->singleton_count, &p_stats->unique_count,
-                              &p_stats->duplicate_count, &p_stats->dedup_bytes_estimate);
-    uint64_t obj_count_in_shard = (p_stats->singleton_count + p_stats->unique_count
-                                   + p_stats->duplicate_count);
-    display_table_stat_counters(dpp, obj_count_in_shard, p_stats);
+    p_table->count_duplicates(&p_stats->small_objs_stat, &p_stats->big_objs_stat,
+                              &p_stats->dup_head_bytes_estimate);
+    display_table_stat_counters(dpp, p_stats);
 
     ldpp_dout(dpp, 10) << __func__ << "::MD5 Loop::" << d_ctl.dedup_type << dendl;
     if (d_ctl.dedup_type != dedup_req_type_t::DEDUP_TYPE_FULL) {
@@ -1881,8 +2042,7 @@ namespace rgw::dedup {
                                                 &worker_stats,raw_mem, raw_mem_size);
     if (ret == 0) {
       worker_stats.duration = ceph_clock_now() - start_time;
-      d_cluster.mark_work_shard_token_completed(d_dedup_cluster_ioctx, worker_id,
-                                                &worker_stats);
+      d_cluster.mark_work_shard_token_completed(store, worker_id, &worker_stats);
       ldpp_dout(dpp, 10) << "stat counters [worker]:\n" << worker_stats << dendl;
       ldpp_dout(dpp, 10) << "Shard Process Duration   = "
                          << worker_stats.duration << dendl;
@@ -1906,8 +2066,7 @@ namespace rgw::dedup {
     int ret = objects_dedup_single_md5_shard(&table, md5_shard, &md5_stats, num_work_shards);
     if (ret == 0) {
       md5_stats.duration = ceph_clock_now() - start_time;
-      d_cluster.mark_md5_shard_token_completed(d_dedup_cluster_ioctx, md5_shard,
-                                               &md5_stats);
+      d_cluster.mark_md5_shard_token_completed(store, md5_shard, &md5_stats);
       ldpp_dout(dpp, 10) << "stat counters [md5]:\n" << md5_stats << dendl;
       ldpp_dout(dpp, 10) << "Shard Process Duration   = "
                          << md5_stats.duration << dendl;
@@ -1927,10 +2086,10 @@ namespace rgw::dedup {
       d_heart_beat_last_update = ceph_clock_now();
       uint16_t shard_id;
       if (ingress_work_shards) {
-        shard_id = d_cluster.get_next_work_shard_token(d_dedup_cluster_ioctx, num_work_shards);
+        shard_id = d_cluster.get_next_work_shard_token(store, num_work_shards);
       }
       else {
-        shard_id = d_cluster.get_next_md5_shard_token(d_dedup_cluster_ioctx, num_md5_shards);
+        shard_id = d_cluster.get_next_md5_shard_token(store, num_md5_shards);
       }
 
       // start with a common error handler
@@ -2063,8 +2222,16 @@ namespace rgw::dedup {
     ldpp_dout(dpp, 5) << __func__ << "::obj_count=" <<d_all_buckets_obj_count
                       << "::num_md5_shards=" << num_md5_shards
                       << "::num_work_shards=" << num_work_shards << dendl;
-    ret = d_cluster.reset(store, d_dedup_cluster_ioctx, p_epoch, num_work_shards,
-                          num_md5_shards);
+    // init handles and create the dedup_pool
+    ret = init_rados_access_handles(true);
+    if (ret != 0) {
+      derr << "dedup_bg::resume() failed init_rados_access_handles() ret="
+           << ret << "::" << cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+    display_ioctx_state(dpp, d_dedup_cluster_ioctx, __func__);
+
+    ret = d_cluster.reset(store, p_epoch, num_work_shards, num_md5_shards);
     if (ret != 0) {
       ldpp_dout(dpp, 1) << __func__ << "::ERR: failed cluster.init()" << dendl;
       return ret;
@@ -2101,40 +2268,7 @@ namespace rgw::dedup {
   //---------------------------------------------------------------------------
   int Background::watch_reload(const DoutPrefixProvider* dpp)
   {
-    if (!d_dedup_cluster_ioctx.is_valid()) {
-      ldpp_dout(dpp, 1) << __func__
-                        << "::ERR: invalid pool handler (missing pool)" << dendl;
-      return -ENOENT;
-    }
-    ldpp_dout(dpp, 5) << "dedup_bg::watch_reload(): ioctx="
-                      << d_dedup_cluster_ioctx.get_instance_id() << dendl;
-    const std::string & oid = DEDUP_WATCH_OBJ;
-    // create the object to watch (object may already exist)
-    bool exclusive = true;
-    int ret = d_dedup_cluster_ioctx.create(oid, exclusive);
-    if (ret >= 0) {
-      ldpp_dout(dpp, 10) << "dedup_bg::watch_reload():" << oid
-                         << " was created!" << dendl;
-    }
-    else if (ret == -EEXIST) {
-      ldpp_dout(dpp, 5) << __func__ << "::"<< oid << " exists" << dendl;
-    }
-    else {
-      ldpp_dout(dpp, 1) << "dedup_bg::watch_reload(): failed ioctx.create("
-                        << oid << ") ret=" << ret << "::" << cpp_strerror(-ret) << dendl;
-      return ret;
-    }
-
-    ret = d_dedup_cluster_ioctx.watch2(oid, &d_watch_handle, &d_watcher_ctx);
-    if (ret < 0) {
-      ldpp_dout(dpp, 1) << "dedup_bg::watch_reload(): failed watch2() " << oid
-                        << ". error: " << cpp_strerror(-ret) << dendl;
-      d_watch_handle = 0;
-      return ret;
-    }
-    ldpp_dout(dpp, 5) << "dedup_bg::watch_reload(): Started watching "
-                      << oid << "::d_watch_handle=" << d_watch_handle << dendl;
-    return 0;
+    return cluster::watch_reload(store, dpp, &d_watch_handle, &d_watcher_ctx);
   }
 
   //---------------------------------------------------------------------------
@@ -2147,43 +2281,16 @@ namespace rgw::dedup {
       return 0;
     }
 
-    if (!d_dedup_cluster_ioctx.is_valid()) {
-      ldpp_dout(dpp, 1) << "dedup_bg::unwatch_reload(): "
-                        << "::ERR: invalid pool handler (missing pool)" << dendl;
-      return -ENOENT;
-    }
-
-    ldpp_dout(dpp, 5) << "dedup_bg::unwatch_reload(): ioctx="
-                      << d_dedup_cluster_ioctx.get_instance_id()
-                      << "::d_watch_handle=" << d_watch_handle << dendl;
-
-    const auto ret = d_dedup_cluster_ioctx.unwatch2(d_watch_handle);
-    if (ret < 0) {
-      ldpp_dout(dpp, 1) << "dedup_bg::unwatch_reload() failed unwatch2() "
-                        << DEDUP_WATCH_OBJ << "::" << cpp_strerror(-ret) << dendl;
-      return ret;
-    }
-    ldpp_dout(dpp, 5) << "dedup_bg::unwatch_reload():Stopped watching "
-                      << DEDUP_WATCH_OBJ << "::d_watch_handle="
+    ldpp_dout(dpp, 5) << "dedup_bg::unwatch_reload(): watch_handle="
                       << d_watch_handle << dendl;
 
-    d_watch_handle = 0;
-    return 0;
-  }
-
-  //---------------------------------------------------------------------------
-  void Background::ack_notify(uint64_t notify_id, uint64_t cookie, int status)
-  {
-    if (!d_dedup_cluster_ioctx.is_valid()) {
-      ldpp_dout(dpp, 1) << __func__
-                        << "::ERR: invalid pool handler (missing pool)" << dendl;
-      return;
+    int ret = cluster::unwatch_reload(store, dpp, d_watch_handle);
+    if (ret == 0) {
+      ldpp_dout(dpp, 5) << "dedup_bg::unwatch_reload():Stopped watching "
+                        << "::d_watch_handle=" << d_watch_handle << dendl;
+      d_watch_handle = 0;
     }
-    ldpp_dout(dpp, 5) << __func__ << "::status=" << status << dendl;
-    bufferlist reply_bl;
-    ceph::encode(status, reply_bl);
-    encode(d_ctl, reply_bl);
-    d_dedup_cluster_ioctx.notify_ack(DEDUP_WATCH_OBJ, notify_id, cookie, reply_bl);
+    return ret;
   }
 
   //---------------------------------------------------------------------------
@@ -2207,7 +2314,7 @@ namespace rgw::dedup {
       cond_lock.unlock(); // close lock block------>]
       ldpp_dout(dpp, 5) << __func__
                         << "::system is paused/shutdown -> cancel notification" << dendl;
-      ack_notify(notify_id, cookie, -EBUSY);
+      cluster::ack_notify(store, dpp, &d_ctl, notify_id, cookie, -EBUSY);
       return;
     }
 
@@ -2266,7 +2373,7 @@ namespace rgw::dedup {
     }
 
     cond_lock.unlock(); // close lock block------>]
-    ack_notify(notify_id, cookie, ret);
+    cluster::ack_notify(store, dpp, &d_ctl, notify_id, cookie, ret);
   }
 
   //---------------------------------------------------------------------------
@@ -2275,7 +2382,7 @@ namespace rgw::dedup {
     const DoutPrefixProvider* const dpp = &dp;
     ldpp_dout(dpp, 10) <<  __FILE__ << "::" <<__func__ << dendl;
     {
-      std::unique_lock pause_lock(d_pause_mutex);
+      std::unique_lock pause_lock(d_cond_mutex);
       if (d_ctl.started) {
         // start the thread only once
         ldpp_dout(dpp, 1) << "dedup_bg already started" << dendl;
@@ -2303,6 +2410,8 @@ namespace rgw::dedup {
     d_cond.notify_all();
     ldpp_dout(dpp, 1) <<__func__ << "dedup_bg shutdown waiting..." << dendl;
     d_cond.wait(cond_lock, [this]{return d_ctl.shutdown_done;});
+    //cond_lock.unlock();
+
     if (nested_call) {
       ldpp_dout(dpp, 1) <<__func__ << "::nested call:: repeat notify" << dendl;
       d_cond.notify_all();
@@ -2323,8 +2432,7 @@ namespace rgw::dedup {
   //---------------------------------------------------------------------------
   void Background::pause()
   {
-    ldpp_dout(dpp, 5) << "dedup_bg->pause() request: ioctx="
-                      << d_dedup_cluster_ioctx.get_instance_id() << dendl;
+    display_ioctx_state(dpp, d_dedup_cluster_ioctx, "dedup_bg->pause() request");
     std::unique_lock cond_lock(d_cond_mutex);
 
     if (d_ctl.local_paused || d_ctl.shutdown_done) {
@@ -2371,14 +2479,14 @@ namespace rgw::dedup {
     }
 
     driver = _driver;
-    int ret = init_rados_access_handles();
+    // can pool change its uid between pause/resume ???
+    int ret = init_rados_access_handles(false);
     if (ret != 0) {
       derr << "dedup_bg::resume() failed init_rados_access_handles() ret="
            << ret << "::" << cpp_strerror(-ret) << dendl;
-      throw std::runtime_error("Failed init_dedup_pool_ioctx()");
+      throw std::runtime_error("Failed init_rados_access_handles()");
     }
-    ldpp_dout(dpp, 5) << __func__ << "::dedup background: ioctx="
-                      << d_dedup_cluster_ioctx.get_instance_id() << dendl;
+    display_ioctx_state(dpp, d_dedup_cluster_ioctx, "dedup_bg->resume() done");
     // create new watch request using the new pool handle
     watch_reload(dpp);
     d_ctl.local_pause_req = false;
@@ -2428,23 +2536,64 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  static bool all_shards_completed(cluster *p_cluster,
-                                   librados::IoCtx &ioctx,
-                                   work_shard_t num_work_shards,
-                                   uint64_t *p_total_ingressed)
+  void Background::work_shards_barrier(work_shard_t num_work_shards)
   {
-    return p_cluster->all_work_shard_tokens_completed(ioctx, num_work_shards,
-                                                      p_total_ingressed);
+    // Wait for other worker to finish ingress step
+    // We can move to the next step even if some token are in failed state
+    const unsigned MAX_WAIT_SEC = 120; // wait 2 minutes for failing members
+    unsigned ttl = 3;
+    unsigned time_elapsed = 0;
+
+    while (true) {
+      int ret = d_cluster.all_work_shard_tokens_completed(store, num_work_shards);
+      // we start incrementing time_elapsed only after all valid tokens finish
+      if (ret == 0 || (time_elapsed > MAX_WAIT_SEC) ) {
+        break;
+      }
+
+      ldpp_dout(dpp, 10) << __func__ << "::Wait for object ingress completion, ttl="
+                         << ttl << " seconds" << dendl;
+      std::unique_lock cond_lock(d_cond_mutex);
+      d_cond.wait_for(cond_lock, std::chrono::seconds(ttl),
+                      [this]{return d_ctl.should_stop() || d_ctl.should_pause();});
+      if (unlikely(d_ctl.should_pause())) {
+        handle_pause_req(__func__);
+      }
+      if (unlikely(d_ctl.should_stop())) {
+        return;
+      }
+
+      if (ret != -EAGAIN) {
+        // All incomplete tokens are corrupted or in time out state
+        // Give them an extra 120 seconds just in case ...
+        time_elapsed += ttl;
+      }
+      // else there are still good tokens in process, wait for them
+    }
+
+    ldpp_dout(dpp, 10) << "\n\n==Object Ingress step was completed on all shards==\n"
+                       << dendl;
+    if (unlikely(d_ctl.should_pause())) {
+      handle_pause_req(__func__);
+    }
   }
 
   //---------------------------------------------------------------------------
-  void Background::work_shards_barrier(work_shard_t num_work_shards)
+  static bool all_md5_shards_completed(cluster *p_cluster,
+                                       rgw::sal::RadosStore *store,
+                                       md5_shard_t num_md5_shards)
   {
-    // Wait for other worker to finish ingress step
-    unsigned ttl = 1;
-    uint64_t total_ingressed = 0;
-    while (!all_shards_completed(&d_cluster, d_dedup_cluster_ioctx, num_work_shards, &total_ingressed)) {
-      ldpp_dout(dpp, 10) << __func__ << "::Wait for object ingress completion, ttl="
+    return (p_cluster->all_md5_shard_tokens_completed(store, num_md5_shards) == 0);
+  }
+
+  //---------------------------------------------------------------------------
+  void Background::md5_shards_barrier(md5_shard_t num_md5_shards)
+  {
+    // Wait for others to finish step
+    unsigned ttl = 3;
+    // require that everything completed successfully before deleting the pool
+    while (!all_md5_shards_completed(&d_cluster, store, num_md5_shards)) {
+      ldpp_dout(dpp, 10) << __func__ << "::Wait for md5 completion, ttl="
                          << ttl << " seconds" << dendl;
       std::unique_lock cond_lock(d_cond_mutex);
       d_cond.wait_for(cond_lock, std::chrono::seconds(ttl),
@@ -2457,8 +2606,8 @@ namespace rgw::dedup {
       }
     }
 
-    ldpp_dout(dpp, 10) << "\n\n==Object Ingress step was completed on all shards! ("
-                       << total_ingressed << ")==\n" << dendl;
+    ldpp_dout(dpp, 10) << "\n\n==MD5 processing was completed on all shards!==\n"
+                       << dendl;
     if (unlikely(d_ctl.should_pause())) {
       handle_pause_req(__func__);
     }
@@ -2483,7 +2632,13 @@ namespace rgw::dedup {
       if (d_ctl.dedup_exec) {
         dedup_epoch_t epoch;
         if (setup(&epoch) != 0) {
-          ldpp_dout(dpp, 1) << "failed setup()" << dendl;
+          ldpp_dout(dpp, 1) << __func__ << "::failed setup()" << dendl;
+          return;
+        }
+        const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool;
+        int64_t pool_id = rados_handle->pool_lookup(dedup_pool.name.c_str());
+        if (pool_id < 0) {
+          ldpp_dout(dpp, 1) << __func__ << "::bad pool_id" << dendl;
           return;
         }
         work_shard_t num_work_shards = epoch.num_work_shards;
@@ -2505,9 +2660,11 @@ namespace rgw::dedup {
           // Wait for all other workers to finish ingress step
           work_shards_barrier(num_work_shards);
           if (!d_ctl.should_stop()) {
-            process_all_shards(false, &Background::f_dedup_md5_shard, raw_mem.get(), RAW_MEM_SIZE,
-                               num_work_shards, num_md5_shards);
-            ldpp_dout(dpp, 10) << "\n==DEDUP was completed on all shards! ==\n" << dendl;
+            process_all_shards(false, &Background::f_dedup_md5_shard, raw_mem.get(),
+                               RAW_MEM_SIZE, num_work_shards, num_md5_shards);
+            // Wait for all other md5 shards to finish
+            md5_shards_barrier(num_md5_shards);
+            safe_pool_delete(store, dpp, pool_id);
           }
           else {
             ldpp_dout(dpp, 5) <<__func__ << "::stop req from barrier" << dendl;
index 697f8028c666e9b71b9e35cc0edf9297295ae5b0..57ed0e824de5a98ae7d67758141b65ea96ff0444 100644 (file)
@@ -95,10 +95,10 @@ namespace rgw::dedup {
       STEP_REMOVE_DUPLICATES
     };
 
-    void ack_notify(uint64_t notify_id, uint64_t cookie, int status);
     void run();
     int  setup(struct dedup_epoch_t*);
     void work_shards_barrier(work_shard_t num_work_shards);
+    void md5_shards_barrier(md5_shard_t num_md5_shards);
     void handle_pause_req(const char* caller);
     const char* dedup_step_name(dedup_step_t step);
     int  read_buckets();
@@ -216,7 +216,7 @@ namespace rgw::dedup {
                      bool                 is_shared_manifest_src);
 #endif
     int  remove_slabs(unsigned worker_id, unsigned md5_shard, uint32_t slab_count);
-    int  init_rados_access_handles();
+    int  init_rados_access_handles(bool init_pool);
 
     // private data members
     rgw::sal::Driver* driver = nullptr;
@@ -244,7 +244,6 @@ namespace rgw::dedup {
 
     std::thread d_runner;
     std::mutex  d_cond_mutex;
-    std::mutex  d_pause_mutex;
     std::condition_variable d_cond;
   };
 
index 53c24b13acc2c986d54c3ff81e6a19cfb52898d1..f18de129a5a54a89ec42484d10ece01b53f1579d 100644 (file)
 
 namespace rgw::dedup {
   const char* DEDUP_EPOCH_TOKEN = "EPOCH_TOKEN";
+  const char* DEDUP_WATCH_OBJ = "DEDUP_WATCH_OBJ";
 
   static constexpr unsigned EPOCH_MAX_LOCK_DURATION_SEC = 30;
   struct shard_progress_t;
-  static int collect_shard_stats(librados::IoCtx &ioctx,
+  static int collect_shard_stats(rgw::sal::RadosStore *store,
                                  const DoutPrefixProvider *dpp,
                                  utime_t epoch_time,
                                  unsigned shards_count,
@@ -51,14 +52,35 @@ namespace rgw::dedup {
   const char* SHARD_PROGRESS_ATTR = "shard_progress";
 
   //---------------------------------------------------------------------------
-  static int get_epoch(librados::IoCtx &ioctx,
+  static int get_control_ioctx(rgw::sal::RadosStore     *store,
+                               const DoutPrefixProvider *dpp,
+                               librados::IoCtx &ctl_ioctx /* OUT-PARAM */)
+  {
+    const auto& control_pool = store->svc()->zone->get_zone_params().control_pool;
+    auto rados_handle = store->getRados()->get_rados_handle();
+    int ret = rgw_init_ioctx(dpp, rados_handle, control_pool, ctl_ioctx);
+    if (unlikely(ret < 0)) {
+      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed rgw_init_ioctx() for control_pool ret="
+                        << ret << "::" << cpp_strerror(-ret) << dendl;
+    }
+    return ret;
+  }
+
+  //---------------------------------------------------------------------------
+  static int get_epoch(rgw::sal::RadosStore     *store,
                        const DoutPrefixProvider *dpp,
                        dedup_epoch_t *p_epoch, /* OUT */
                        const char *caller)
   {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
     std::string oid(DEDUP_EPOCH_TOKEN);
     bufferlist bl;
-    int ret = ioctx.getxattr(oid, RGW_DEDUP_ATTR_EPOCH, bl);
+    ret = ctl_ioctx.getxattr(oid, RGW_DEDUP_ATTR_EPOCH, bl);
     if (ret > 0) {
       try {
         auto p = bl.cbegin();
@@ -78,23 +100,29 @@ namespace rgw::dedup {
         ret = -ENODATA;
       }
       ldpp_dout(dpp, 10) << __func__ << "::" << (caller ? caller : "")
-                         << "::failed ioctx.getxattr() with: "
+                         << "::failed ctl_ioctx.getxattr() with: "
                          << cpp_strerror(-ret) << ", ret=" << ret << dendl;
       return ret;
     }
   }
 
   //---------------------------------------------------------------------------
-  static int set_epoch(librados::IoCtx &ioctx,
+  static int set_epoch(rgw::sal::RadosStore *store,
                        const std::string &cluster_id,
                        const DoutPrefixProvider *dpp,
                        work_shard_t num_work_shards,
                        md5_shard_t num_md5_shards)
   {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
     std::string oid(DEDUP_EPOCH_TOKEN);
     ldpp_dout(dpp, 10) << __func__ << "::oid=" << oid << dendl;
     bool exclusive = true; // block overwrite of old objects
-    int ret = ioctx.create(oid, exclusive);
+    ret = ctl_ioctx.create(oid, exclusive);
     if (ret >= 0) {
       ldpp_dout(dpp, 10) << __func__ << "::successfully created Epoch object!" << dendl;
       // now try and take ownership
@@ -120,7 +148,7 @@ namespace rgw::dedup {
     op.setxattr(RGW_DEDUP_ATTR_EPOCH, new_epoch_bl);
 
     ldpp_dout(dpp, 10) << __func__ << "::send EPOCH CLS" << dendl;
-    ret = ioctx.operate(oid, &op);
+    ret = ctl_ioctx.operate(oid, &op);
     if (ret == 0) {
       ldpp_dout(dpp, 10) << __func__ << "::Epoch object was written" << dendl;
     }
@@ -128,27 +156,33 @@ namespace rgw::dedup {
     // probably best to read attribute from epoch!
     else if (ret == -ECANCELED) {
       dedup_epoch_t epoch;
-      ret = get_epoch(ioctx, dpp, &epoch, __func__);
+      ret = get_epoch(store, dpp, &epoch, __func__);
       if (ret == 0) {
         ldpp_dout(dpp, 10) << __func__ << "::Accept existing Epoch object" << dendl;
       }
       return ret;
     }
     else {
-      ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ioctx.operate("
+      ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ctl_ioctx.operate("
                         << oid << "), err is " << cpp_strerror(-ret) << dendl;
     }
     return ret;
   }
 
   //---------------------------------------------------------------------------
-  static int swap_epoch(const DoutPrefixProvider *dpp,
-                        librados::IoCtx &ioctx,
+  static int swap_epoch(rgw::sal::RadosStore     *store,
+                        const DoutPrefixProvider *dpp,
                         const dedup_epoch_t *p_old_epoch,
                         dedup_req_type_t dedup_type,
                         work_shard_t num_work_shards,
                         md5_shard_t num_md5_shards)
   {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
     dedup_epoch_t new_epoch = { p_old_epoch->serial + 1, dedup_type,
                                 ceph_clock_now(), num_work_shards, num_md5_shards};
     bufferlist old_epoch_bl, new_epoch_bl, err_bl;
@@ -160,9 +194,9 @@ namespace rgw::dedup {
 
     ldpp_dout(dpp, 10) << __func__ << "::send EPOCH CLS" << dendl;
     std::string oid(DEDUP_EPOCH_TOKEN);
-    int ret = ioctx.operate(oid, &op);
+    ret = ctl_ioctx.operate(oid, &op);
     if (ret != 0) {
-      ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ioctx.operate("
+      ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ctl_ioctx.operate("
                         << oid << "), err is " << cpp_strerror(-ret) << dendl;
     }
 
@@ -193,13 +227,15 @@ namespace rgw::dedup {
       this->progress_a  = _progress_a;
       this->progress_b  = _progress_b;
       this->completed   = _completed;
-      this->update_time = ceph_clock_now();
+
+      utime_t now = ceph_clock_now();
+      this->update_time = now;
 
       if (_progress_a == SP_NO_OBJECTS && _progress_b == SP_NO_OBJECTS) {
-        this->creation_time = ceph_clock_now();
+        this->creation_time = now;
       }
       if (_completed) {
-        this->completion_time = ceph_clock_now();
+        this->completion_time = now;
       }
     }
 
@@ -213,6 +249,11 @@ namespace rgw::dedup {
         return false;
       }
     }
+
+    bool was_not_started() const {
+      return (this->creation_time == this->update_time);
+    }
+
     uint64_t    progress_a;
     uint64_t    progress_b;
     bool        completed;
@@ -223,6 +264,17 @@ namespace rgw::dedup {
     bufferlist  stats_bl;
   };
 
+  //---------------------------------------------------------------------------
+  std::ostream& operator<<(std::ostream &out, shard_progress_t& sp)
+  {
+    out << (sp.completed ? " + ::" : " - ::");
+    out << sp.owner << "::[" << sp.progress_a << ", " << sp.progress_b << "]";
+    out << "::creation: " << sp.creation_time;
+    out << "::update: " << sp.update_time;
+    out << "::completion: " << sp.completion_time;
+    return out;
+  }
+
   //---------------------------------------------------------------------------
   void encode(const shard_progress_t& sp, ceph::bufferlist& bl)
   {
@@ -253,66 +305,6 @@ namespace rgw::dedup {
     DECODE_FINISH(bl);
   }
 
-  //---------------------------------------------------------------------------
-  int init_dedup_pool_ioctx(RGWRados                 *rados,
-                            const DoutPrefixProvider *dpp,
-                            librados::IoCtx          &ioctx)
-  {
-    rgw_pool dedup_pool(DEDUP_POOL_NAME);
-    std::string pool_name(DEDUP_POOL_NAME);
-#if 0
-    // using Replica-1 for the intermediate data
-    // since it can be regenerated in case of a failure
-    std::string replica_count(std::to_string(1));
-#else
-    // temporary solution until we find a way to disable the health warn on replica1
-    std::string replica_count(std::to_string(2));
-#endif
-    librados::bufferlist inbl;
-    std::string output;
-    std::string command = R"(
-    {
-      "prefix": "osd pool create",
-      "pool": ")" + pool_name +
-      R"(",
-      "pool_type": "replicated",
-      "size": )" + replica_count +
-      R"(
-    })";
-
-    auto rados_handle = rados->get_rados_handle();
-    int ret = rados_handle->mon_command(command, inbl, nullptr, &output);
-    if (output.length()) {
-      if (output != "pool 'rgw_dedup_pool' already exists") {
-        ldpp_dout(dpp, 10) << __func__ << "::" << output << dendl;
-      }
-    }
-    if (ret != 0 && ret != -EEXIST) {
-      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to create pool "
-                        << DEDUP_POOL_NAME << " with: "
-                        << cpp_strerror(-ret) << ", ret=" << ret << dendl;
-      return ret;
-    }
-
-    ret = rgw_init_ioctx(dpp, rados_handle, dedup_pool, ioctx);
-    if (ret < 0) {
-      ldpp_dout(dpp, 1) << __func__ << "::failed to initialize pool for listing with: "
-                        << cpp_strerror(-ret) << dendl;
-    }
-
-    ret = ioctx.application_enable("dedup", false);
-    if (ret == 0) {
-      ldpp_dout(dpp, 10) << __func__ << "::pool " << DEDUP_POOL_NAME
-                         << " was associated with dedup app" << dendl;
-    }
-    else {
-      ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to enable pool "
-                        << DEDUP_POOL_NAME << " with: "
-                        << cpp_strerror(-ret) << ", ret=" << ret << dendl;
-    }
-    return ret;
-  }
-
   //==========================================================================
 
   //---------------------------------------------------------------------------
@@ -326,9 +318,6 @@ namespace rgw::dedup {
 
     memset(d_completed_workers, TOKEN_STATE_PENDING, sizeof(d_completed_workers));
     memset(d_completed_md5, TOKEN_STATE_PENDING, sizeof(d_completed_md5));
-
-    d_total_ingressed_obj = 0;
-    d_num_failed_workers = 0;
   }
 
 
@@ -343,31 +332,10 @@ namespace rgw::dedup {
     d_cluster_id (gen_rand_alphanumeric(cct, CLUSTER_ID_LEN))
   {
     clear();
-
-    auto store = dynamic_cast<rgw::sal::RadosStore*>(driver);
-    if (!store) {
-      ldpp_dout(dpp, 0) << "ERR: failed dynamic_cast to RadosStore" << dendl;
-      ceph_abort("non-rados backend");
-      return;
-    }
-
-    librados::IoCtx ioctx;
-    if (init_dedup_pool_ioctx(store->getRados(), dpp, ioctx) != 0) {
-      throw std::runtime_error("Failed init_dedup_pool_ioctx()");
-    }
-
-    // generate an empty epoch with zero counters
-    int ret = set_epoch(ioctx, d_cluster_id, dpp, 0, 0);
-    if (ret != 0) {
-      ldpp_dout(dpp, 1) << __func__ << "::failed set_epoch()! ret="
-                        << ret << "::" << cpp_strerror(-ret) << dendl;
-      throw std::runtime_error("Failed set_epoch()");
-    }
   }
 
   //---------------------------------------------------------------------------
   int cluster::reset(rgw::sal::RadosStore *store,
-                     librados::IoCtx &ioctx,
                      dedup_epoch_t *p_epoch,
                      work_shard_t num_work_shards,
                      md5_shard_t num_md5_shards)
@@ -377,7 +345,7 @@ namespace rgw::dedup {
     clear();
 
     while (true) {
-      int ret = get_epoch(ioctx, dpp, p_epoch, __func__);
+      int ret = get_epoch(store, dpp, p_epoch, __func__);
       if (ret != 0) {
         return ret;
       }
@@ -391,7 +359,7 @@ namespace rgw::dedup {
         break;
       }
       else {
-        ret = swap_epoch(dpp, ioctx, p_epoch,
+        ret = swap_epoch(store, dpp, p_epoch,
                          static_cast<dedup_req_type_t> (p_epoch->dedup_type),
                          num_work_shards, num_md5_shards);
       }
@@ -402,27 +370,33 @@ namespace rgw::dedup {
     const unsigned RETRY_LIMIT = 3;
     int ret = 1;
     for (unsigned i = 0; i < RETRY_LIMIT && ret != 0; i++) {
-      ret = cleanup_prev_run(ioctx);
+      ret = cleanup_prev_run(store);
     }
     if (ret != 0) {
       return ret;
     }
 
-    create_shard_tokens(ioctx, p_epoch->num_work_shards, WORKER_SHARD_PREFIX);
-    create_shard_tokens(ioctx, p_epoch->num_md5_shards, MD5_SHARD_PREFIX);
+    create_shard_tokens(store, p_epoch->num_work_shards, WORKER_SHARD_PREFIX);
+    create_shard_tokens(store, p_epoch->num_md5_shards, MD5_SHARD_PREFIX);
 
-    ret = verify_all_shard_tokens(ioctx, p_epoch->num_work_shards,
+    ret = verify_all_shard_tokens(store, p_epoch->num_work_shards,
                                   WORKER_SHARD_PREFIX);
     if (ret != 0) {
       return ret;
     }
-    return verify_all_shard_tokens(ioctx, p_epoch->num_md5_shards,
+    return verify_all_shard_tokens(store, p_epoch->num_md5_shards,
                                    MD5_SHARD_PREFIX);
   }
 
   //---------------------------------------------------------------------------
-  int cluster::cleanup_prev_run(librados::IoCtx &ioctx)
+  int cluster::cleanup_prev_run(rgw::sal::RadosStore *store)
   {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
     int error_code = 0;
     constexpr uint32_t max = 100;
     std::string marker;
@@ -432,7 +406,7 @@ namespace rgw::dedup {
     unsigned failed_count  = 0, no_entry_count = 0;
     do {
       std::vector<std::string> oids;
-      int ret = rgw_list_pool(dpp, ioctx, max, filter, marker, &oids, &truncated);
+      int ret = rgw_list_pool(dpp, ctl_ioctx, max, filter, marker, &oids, &truncated);
       if (ret == -ENOENT) {
         ldpp_dout(dpp, 10) << __func__ << "::rgw_list_pool() ret == -ENOENT"<< dendl;
         break;
@@ -444,14 +418,15 @@ namespace rgw::dedup {
       }
 
       for (const std::string& oid : oids) {
-        if (oid == DEDUP_WATCH_OBJ || oid == DEDUP_EPOCH_TOKEN) {
+        if (shard_token_oid::legal_oid_name(oid) == false) {
           ldpp_dout(dpp, 10) << __func__ << "::skipping " << oid << dendl;
           skipped_count++;
           continue;
         }
+
         uint64_t size;
         struct timespec tspec;
-        ret = ioctx.stat2(oid, &size, &tspec);
+        ret = ctl_ioctx.stat2(oid, &size, &tspec);
         if (ret == -ENOENT) {
           ldpp_dout(dpp, 20) << __func__ << "::" << oid
                              << " was removed by others" << dendl;
@@ -459,7 +434,8 @@ namespace rgw::dedup {
           continue;
         }
         else if (ret != 0) {
-          ldpp_dout(dpp, 10) << __func__ << "::failed ioctx.stat( " << oid << " )" << dendl;
+          ldpp_dout(dpp, 10) << __func__ << "::failed ctl_ioctx.stat( "
+                             << oid << " )" << dendl;
           error_code = ret;
           failed_count++;
           continue;
@@ -473,7 +449,7 @@ namespace rgw::dedup {
           continue;
         }
         ldpp_dout(dpp, 10) << __func__ << "::removing object: " << oid << dendl;
-        ret = ioctx.remove(oid);
+        ret = ctl_ioctx.remove(oid);
         if (ret == 0) {
           deleted_count++;
         }
@@ -486,42 +462,48 @@ namespace rgw::dedup {
         else {
           error_code = ret;
           failed_count++;
-          ldpp_dout(dpp, 10) << __func__ << "::failed ioctx.remove( " << oid
+          ldpp_dout(dpp, 10) << __func__ << "::failed ctl_ioctx.remove( " << oid
                              << " ), ret=" << ret << "::" << cpp_strerror(-ret) << dendl;
         }
       }
       ldpp_dout(dpp, 10) << __func__ << "::oids.size()=" << oids.size()
-                         << "::deleted=" << deleted_count
-                         << "::failed="  << failed_count
-                         << "::no entry="  << no_entry_count
-                         << "::skipped=" << skipped_count << dendl;
+                         << "::deleted="  << deleted_count
+                         << "::failed="   << failed_count
+                         << "::no entry=" << no_entry_count
+                         << "::skipped="  << skipped_count << dendl;
     } while (truncated);
 
     return error_code;
   }
 
   //---------------------------------------------------------------------------
-  int cluster::create_shard_tokens(librados::IoCtx &ioctx,
+  int cluster::create_shard_tokens(rgw::sal::RadosStore *store,
                                    unsigned shards_count,
                                    const char *prefix)
   {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
     shard_token_oid sto(prefix);
     for (unsigned shard = 0; shard < shards_count; shard++) {
       sto.set_shard(shard);
       std::string oid(sto.get_buff(), sto.get_buff_size());
       ldpp_dout(dpp, 15) << __func__ << "::creating object: " << oid << dendl;
       bool exclusive = true;
-      int ret = ioctx.create(oid, exclusive);
+      ret = ctl_ioctx.create(oid, exclusive);
       if (ret >= 0) {
         ldpp_dout(dpp, 15) << __func__ << "::oid=" << oid << " was created!" << dendl;
       }
       else if (ret == -EEXIST) {
-        ldpp_dout(dpp, 15) << __func__ << "::failed ioctx.create("
+        ldpp_dout(dpp, 15) << __func__ << "::failed ctl_ioctx.create("
                            << oid << ") -EEXIST!" << dendl;
       }
       else {
         // TBD: can it happen legally ?
-        ldpp_dout(dpp, 1) << __func__ << "::failed ioctx.create(" << oid
+        ldpp_dout(dpp, 1) << __func__ << "::failed ctl_ioctx.create(" << oid
                           << ") with: " << ret  << "::" << cpp_strerror(-ret) << dendl;
       }
     }
@@ -530,10 +512,16 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  int cluster::verify_all_shard_tokens(librados::IoCtx &ioctx,
+  int cluster::verify_all_shard_tokens(rgw::sal::RadosStore *store,
                                        unsigned shards_count,
                                        const char *prefix)
   {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
     shard_token_oid sto(prefix);
     for (unsigned shard = 0; shard < shards_count; shard++) {
       sto.set_shard(shard);
@@ -542,9 +530,9 @@ namespace rgw::dedup {
 
       uint64_t size;
       struct timespec tspec;
-      int ret = ioctx.stat2(oid, &size, &tspec);
+      ret = ctl_ioctx.stat2(oid, &size, &tspec);
       if (ret != 0) {
-        ldpp_dout(dpp, 5) << __func__ << "::failed ioctx.stat( " << oid << " )"
+        ldpp_dout(dpp, 5) << __func__ << "::failed ctl_ioctx.stat( " << oid << " )"
                           << "::shards_count=" << shards_count << dendl;
         return ret;
       }
@@ -554,12 +542,18 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  int cluster::update_shard_token_heartbeat(librados::IoCtx &ioctx,
+  int cluster::update_shard_token_heartbeat(rgw::sal::RadosStore *store,
                                             unsigned shard,
                                             uint64_t count_a,
                                             uint64_t count_b,
                                             const char *prefix)
   {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
     shard_token_oid sto(prefix, shard);
     std::string oid(sto.get_buff(), sto.get_buff_size());
     bufferlist empty_bl;
@@ -567,16 +561,22 @@ namespace rgw::dedup {
     sp.creation_time = d_token_creation_time;
     bufferlist sp_bl;
     encode(sp, sp_bl);
-    return ioctx.setxattr(oid, SHARD_PROGRESS_ATTR, sp_bl);
+    return ctl_ioctx.setxattr(oid, SHARD_PROGRESS_ATTR, sp_bl);
   }
 
   //---------------------------------------------------------------------------
-  int cluster::mark_shard_token_completed(librados::IoCtx &ioctx,
+  int cluster::mark_shard_token_completed(rgw::sal::RadosStore *store,
                                           unsigned shard,
                                           uint64_t obj_count,
                                           const char *prefix,
                                           const bufferlist &bl)
   {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
     shard_token_oid sto(prefix, shard);
     std::string oid(sto.get_buff(), sto.get_buff_size());
     ldpp_dout(dpp, 10) << __func__ << "::" << prefix << "::" << oid << dendl;
@@ -585,24 +585,31 @@ namespace rgw::dedup {
     sp.creation_time = d_token_creation_time;
     bufferlist sp_bl;
     encode(sp, sp_bl);
-    int ret = ioctx.setxattr(oid, SHARD_PROGRESS_ATTR, sp_bl);
+    ret = ctl_ioctx.setxattr(oid, SHARD_PROGRESS_ATTR, sp_bl);
     if (ret == 0) {
-      ldpp_dout(dpp, 10) << __func__ << "::Done ioctx.setxattr(" << oid << ")" << dendl;
+      ldpp_dout(dpp, 10) << __func__ << "::Done ctl_ioctx.setxattr(" << oid << ")"
+                         << dendl;
     }
     else {
-      ldpp_dout(dpp, 0) << __func__ << "::Failed ioctx.setxattr(" << oid << ") ret="
-                        << ret << "::" << cpp_strerror(-ret) << dendl;
+      ldpp_dout(dpp, 0) << __func__ << "::Failed ctl_ioctx.setxattr(" << oid
+                        << ") ret=" << ret << "::" << cpp_strerror(-ret) << dendl;
     }
 
     return ret;
   }
 
   //---------------------------------------------------------------------------
-  int32_t cluster::get_next_shard_token(librados::IoCtx &ioctx,
+  int32_t cluster::get_next_shard_token(rgw::sal::RadosStore *store,
                                         uint16_t start_shard,
                                         uint16_t max_shard,
                                         const char *prefix)
   {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
     // lock parameters:
     const utime_t     lock_duration;  // zero duration means lock doesn't expire
     const uint8_t     lock_flags = 0; // no flags
@@ -617,7 +624,7 @@ namespace rgw::dedup {
       op.assert_exists();
       rados::cls::lock::lock(&op, oid, ClsLockType::EXCLUSIVE, d_lock_cookie,
                              lock_tag, "dedup_shard_token", lock_duration, lock_flags);
-      int ret = rgw_rados_operate(dpp, ioctx, oid, std::move(op), null_yield);
+      ret = rgw_rados_operate(dpp, ctl_ioctx, oid, std::move(op), null_yield);
       if (ret == -EBUSY) {
         // someone else took this token -> move to the next one
         ldpp_dout(dpp, 10) << __func__ << "::Failed lock. " << oid <<
@@ -641,10 +648,9 @@ namespace rgw::dedup {
       bufferlist empty_bl;
       shard_progress_t sp(SP_NO_OBJECTS, SP_NO_OBJECTS, false, d_cluster_id, empty_bl);
       d_token_creation_time = sp.creation_time;
-
       bufferlist sp_bl;
       encode(sp, sp_bl);
-      ret = ioctx.setxattr(oid, SHARD_PROGRESS_ATTR, sp_bl);
+      ret = ctl_ioctx.setxattr(oid, SHARD_PROGRESS_ATTR, sp_bl);
       if (ret == 0) {
         ldpp_dout(dpp, 10) << __func__ << "::SUCCESS!::" << oid << dendl;
         return shard;
@@ -655,11 +661,11 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  work_shard_t cluster::get_next_work_shard_token(librados::IoCtx &ioctx,
+  work_shard_t cluster::get_next_work_shard_token(rgw::sal::RadosStore *store,
                                                   work_shard_t num_work_shards)
   {
-    int32_t shard = get_next_shard_token(ioctx, d_curr_worker_shard, num_work_shards,
-                                         WORKER_SHARD_PREFIX);
+    int32_t shard = get_next_shard_token(store, d_curr_worker_shard,
+                                         num_work_shards, WORKER_SHARD_PREFIX);
     if (shard >= 0 && shard < num_work_shards) {
       d_curr_worker_shard = shard + 1;
       return shard;
@@ -670,10 +676,10 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  md5_shard_t cluster::get_next_md5_shard_token(librados::IoCtx &ioctx,
+  md5_shard_t cluster::get_next_md5_shard_token(rgw::sal::RadosStore *store,
                                                 md5_shard_t num_md5_shards)
   {
-    int32_t shard = get_next_shard_token(ioctx, d_curr_md5_shard, num_md5_shards,
+    int32_t shard = get_next_shard_token(store, d_curr_md5_shard, num_md5_shards,
                                          MD5_SHARD_PREFIX);
     if (shard >= 0 && shard < num_md5_shards) {
       d_curr_md5_shard = shard + 1;
@@ -685,17 +691,23 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  bool cluster::all_shard_tokens_completed(librados::IoCtx &ioctx,
-                                           unsigned shards_count,
-                                           const char *prefix,
-                                           uint16_t *p_num_completed,
-                                           uint8_t completed_arr[],
-                                           uint64_t *p_total_ingressed)
+  int cluster::all_shard_tokens_completed(rgw::sal::RadosStore *store,
+                                          unsigned shards_count,
+                                          const char *prefix,
+                                          uint16_t *p_num_completed,
+                                          uint8_t completed_arr[])
   {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
+    int err_code = 0;
     unsigned count = 0;
     shard_token_oid sto(prefix);
     for (unsigned shard = 0; shard < shards_count; shard++) {
-      if (completed_arr[shard] != TOKEN_STATE_PENDING) {
+      if (completed_arr[shard] == TOKEN_STATE_COMPLETED) {
         count++;
         continue;
       }
@@ -704,12 +716,15 @@ namespace rgw::dedup {
       std::string oid(sto.get_buff(), sto.get_buff_size());
       ldpp_dout(dpp, 10) << __func__ << "::checking object: " << oid << dendl;
       bufferlist bl;
-      int ret = ioctx.getxattr(oid, SHARD_PROGRESS_ATTR, bl);
+      ret = ctl_ioctx.getxattr(oid, SHARD_PROGRESS_ATTR, bl);
       if (unlikely(ret <= 0)) {
         if (ret != -ENODATA) {
-          ldpp_dout(dpp, 10) << __func__ << "::failed ioctx.getxattr() ret="
+          ldpp_dout(dpp, 10) << __func__ << "::failed ctl_ioctx.getxattr() ret="
                              << ret << "::" << cpp_strerror(-ret) << dendl;
         }
+        completed_arr[shard] = TOKEN_STATE_CORRUPTED;
+        // all failures to get valid token state return ENODATA
+        err_code = -ENODATA;
         continue;
       }
 
@@ -719,50 +734,58 @@ namespace rgw::dedup {
         decode(sp, p);
       }
       catch (const buffer::error&) {
-        ldpp_dout(dpp, 0) << __func__ << "::failed shard_progress_t decode!" << dendl;
-        return false;
+        ldpp_dout(dpp, 1) << __func__ << "::failed shard_progress_t decode!" << dendl;
+        completed_arr[shard] = TOKEN_STATE_CORRUPTED;
+        // all failures to get valid token state return ENODATA
+        err_code = -ENODATA;
+        continue;
       }
 
-      if (sp.progress_b == SP_ALL_OBJECTS) {
-        ceph_assert(sp.completed);
+      if (sp.is_completed()) {
         utime_t duration = sp.completion_time - sp.creation_time;
         // mark token completed;
         (*p_num_completed)++;
         completed_arr[shard] = TOKEN_STATE_COMPLETED;
-        d_total_ingressed_obj += sp.progress_a;
         ldpp_dout(dpp, 20) << __func__ << "::" << oid
                            << "::completed! duration=" << duration << dendl;
         count++;
       }
+      else if (sp.was_not_started()) {
+        // token was not started yet
+        // TBD:
+        // If it is not locked we can process it (but why was it skipped?)
+        // If locked, check when it was done and if timed-out
+        ldpp_dout(dpp, 10) << __func__ << "::" << oid
+                           << "::was not started, skipping" << dendl;
+        return -EAGAIN;
+      }
       else {
         static const utime_t heartbeat_timeout(EPOCH_MAX_LOCK_DURATION_SEC, 0);
-        utime_t time_elapsed = sp.update_time - sp.creation_time;
+        utime_t time_elapsed = ceph_clock_now() - sp.update_time;
         if (time_elapsed > heartbeat_timeout) {
           // lock expired -> try and break lock
-          ldpp_dout(dpp, 0) << __func__ << "::" << oid << "::expired lock, skipping" << dendl;
+          ldpp_dout(dpp, 5) << __func__ << "::" << oid
+                            << "::expired lock, skipping:" << time_elapsed
+                            << "::" << sp << dendl;
           completed_arr[shard] = TOKEN_STATE_TIMED_OUT;
-          d_num_failed_workers++;
+          err_code = -ETIME;
           continue;
         }
         else {
-          return false;
+          return -EAGAIN;
         }
-        // TBD: need to store copies and declare token with no progress for N seconds
-        // as failing and then skip it
-        return false;
       }
     } // loop
 
-    *p_total_ingressed = d_total_ingressed_obj;
     if (count < shards_count) {
       unsigned n = shards_count - count;
       ldpp_dout(dpp, 10) << __func__ << "::waiting for " << n << " tokens" << dendl;
     }
-    return (count == shards_count);
+    return err_code;
   }
 
   //---------------------------------------------------------------------------
-  static int collect_shard_stats(librados::IoCtx &ioctx,
+  static int collect_shard_stats(rgw::sal::RadosStore *store,
                                  const DoutPrefixProvider *dpp,
                                  utime_t epoch_time,
                                  unsigned shards_count,
@@ -770,6 +793,12 @@ namespace rgw::dedup {
                                  bufferlist bl_arr[],
                                  shard_progress_t *sp_arr)
   {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
     unsigned count = 0;
     cluster::shard_token_oid sto(prefix);
     for (unsigned shard = 0; shard < shards_count; shard++) {
@@ -779,8 +808,8 @@ namespace rgw::dedup {
 
       uint64_t size;
       struct timespec tspec;
-      if (ioctx.stat2(oid, &size, &tspec) != 0) {
-        ldpp_dout(dpp, 10) << __func__ << "::failed ioctx.stat( " << oid << " )"
+      if (ctl_ioctx.stat2(oid, &size, &tspec) != 0) {
+        ldpp_dout(dpp, 10) << __func__ << "::failed ctl_ioctx.stat( " << oid << " )"
                            << "::shards_count=" << shards_count << dendl;
         continue;
       }
@@ -794,7 +823,7 @@ namespace rgw::dedup {
 
       shard_progress_t sp;
       bufferlist bl;
-      int ret = ioctx.getxattr(oid, SHARD_PROGRESS_ATTR, bl);
+      ret = ctl_ioctx.getxattr(oid, SHARD_PROGRESS_ATTR, bl);
       if (ret > 0) {
         try {
           auto p = bl.cbegin();
@@ -926,16 +955,16 @@ namespace rgw::dedup {
 
   //---------------------------------------------------------------------------
   static void show_dedup_ratio_estimate_fmt(const worker_stats_t &wrk_stats_sum,
-                                            const md5_stats_t    &md5_stats_sum,
+                                            const md5_stats_t &md5_stats_sum,
                                             Formatter *fmt)
   {
     uint64_t s3_bytes_before = wrk_stats_sum.ingress_obj_bytes;
-    uint64_t s3_dedup_bytes  = md5_stats_sum.dedup_bytes_estimate;
+    uint64_t s3_dedup_bytes  = md5_stats_sum.big_objs_stat.dedup_bytes_estimate;
     uint64_t s3_bytes_after  = s3_bytes_before - s3_dedup_bytes;
-
     Formatter::ObjectSection section{*fmt, "dedup_ratio_estimate"};
     fmt->dump_unsigned("s3_bytes_before", s3_bytes_before);
     fmt->dump_unsigned("s3_bytes_after", s3_bytes_after);
+    fmt->dump_unsigned("dup_head_bytes", md5_stats_sum.dup_head_bytes_estimate);
 
     if (s3_bytes_before > s3_bytes_after && s3_bytes_after) {
       double dedup_ratio = (double)s3_bytes_before/s3_bytes_after;
@@ -959,7 +988,7 @@ namespace rgw::dedup {
     Formatter::ObjectSection section{*fmt, "dedup_ratio_actual"};
     fmt->dump_unsigned("s3_bytes_before", s3_bytes_before);
     fmt->dump_unsigned("s3_bytes_after", s3_bytes_after);
-
+    fmt->dump_unsigned("dup_head_bytes", md5_stats_sum.dup_head_bytes);
     if (s3_bytes_before > s3_bytes_after && s3_bytes_after) {
       double dedup_ratio = (double)s3_bytes_before/s3_bytes_after;
       fmt->dump_float("dedup_ratio", dedup_ratio);
@@ -975,14 +1004,8 @@ namespace rgw::dedup {
                                        Formatter *fmt,
                                        const DoutPrefixProvider *dpp)
   {
-    librados::IoCtx ioctx;
-    int ret = init_dedup_pool_ioctx(store->getRados(), dpp, ioctx);
-    if (ret != 0) {
-      return ret;
-    }
-
     dedup_epoch_t epoch;
-    ret = get_epoch(ioctx, dpp, &epoch, nullptr);
+    int ret = get_epoch(store, dpp, &epoch, nullptr);
     if (ret != 0) {
       return ret;
     }
@@ -1000,7 +1023,7 @@ namespace rgw::dedup {
       bool show_time = true;
       bufferlist bl_arr[num_work_shards];
       shard_progress_t sp_arr[num_work_shards];
-      int cnt = collect_shard_stats(ioctx, dpp, epoch.time, num_work_shards,
+      int cnt = collect_shard_stats(store, dpp, epoch.time, num_work_shards,
                                     WORKER_SHARD_PREFIX, bl_arr, sp_arr);
       if (cnt != num_work_shards && 0) {
         std::cerr << ">>>Partial work shard stats recived " << cnt << " / "
@@ -1037,7 +1060,7 @@ namespace rgw::dedup {
       md5_stats_t md5_stats_sum;
       bufferlist bl_arr[num_md5_shards];
       shard_progress_t sp_arr[num_md5_shards];
-      int cnt = collect_shard_stats(ioctx, dpp, epoch.time, num_md5_shards,
+      int cnt = collect_shard_stats(store, dpp, epoch.time, num_md5_shards,
                                     MD5_SHARD_PREFIX, bl_arr, sp_arr);
       if (cnt != num_md5_shards && 0) {
         std::cerr << ">>>Partial MD5_SHARD stats recived " << cnt << " / "
@@ -1076,13 +1099,97 @@ namespace rgw::dedup {
     return 0;
   }
 
+  //---------------------------------------------------------------------------
+  int cluster::watch_reload(rgw::sal::RadosStore *store,
+                            const DoutPrefixProvider* dpp,
+                            uint64_t *p_watch_handle,
+                            librados::WatchCtx2 *ctx)
+  {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
+    const std::string & oid = DEDUP_WATCH_OBJ;
+    // create the object to watch (object may already exist)
+    bool exclusive = true;
+    ret = ctl_ioctx.create(oid, exclusive);
+    if (ret >= 0) {
+      ldpp_dout(dpp, 10) << "dedup_bg::watch_reload():" << oid
+                         << " was created!" << dendl;
+    }
+    else if (ret == -EEXIST) {
+      ldpp_dout(dpp, 5) << __func__ << "::"<< oid << " exists" << dendl;
+    }
+    else {
+      ldpp_dout(dpp, 1) << "dedup_bg::watch_reload(): failed ctl_ioctx.create("
+                        << oid << ") ret=" << ret << "::" << cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+
+    ret = ctl_ioctx.watch2(oid, p_watch_handle, ctx);
+    if (ret < 0) {
+      ldpp_dout(dpp, 1) << "dedup_bg::watch_reload(): failed watch2() " << oid
+                        << ". error: " << cpp_strerror(-ret) << dendl;
+      *p_watch_handle = 0;
+      return ret;
+    }
+    ldpp_dout(dpp, 5) << "dedup_bg::watch_reload(): Started watching "
+                      << oid << "::watch_handle=" << *p_watch_handle << dendl;
+    return 0;
+  }
+
+  //---------------------------------------------------------------------------
+  int cluster::unwatch_reload(rgw::sal::RadosStore *store,
+                              const DoutPrefixProvider* dpp,
+                              uint64_t watch_handle)
+  {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
+    ret = ctl_ioctx.unwatch2(watch_handle);
+    if (ret < 0) {
+      ldpp_dout(dpp, 1) << "dedup_bg::unwatch_reload() failed unwatch2() "
+                        << DEDUP_WATCH_OBJ << "::" << cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+    return 0;
+  }
+
+  //---------------------------------------------------------------------------
+  int cluster::ack_notify(rgw::sal::RadosStore *store,
+                          const DoutPrefixProvider *dpp,
+                          const control_t *p_ctl,
+                          uint64_t notify_id,
+                          uint64_t cookie,
+                          int status)
+  {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
+      return ret;
+    }
+
+    ldpp_dout(dpp, 5) << __func__ << "::status=" << status << dendl;
+    bufferlist reply_bl;
+    ceph::encode(status, reply_bl);
+    encode(*p_ctl, reply_bl);
+    ctl_ioctx.notify_ack(DEDUP_WATCH_OBJ, notify_id, cookie, reply_bl);
+
+    return 0;
+  }
+
   //---------------------------------------------------------------------------
   // command-line called from radosgw-admin.cc
   int cluster::dedup_control(rgw::sal::RadosStore *store,
                              const DoutPrefixProvider *dpp,
                              urgent_msg_t urgent_msg)
   {
-    ldpp_dout(dpp, 20) << __func__ << "::dedup_control req = "
+    ldpp_dout(dpp, 10) << __func__ << "::dedup_control req = "
                        << get_urgent_msg_names(urgent_msg) << dendl;
     if (urgent_msg != URGENT_MSG_RESUME  &&
         urgent_msg != URGENT_MSG_PASUE   &&
@@ -1092,16 +1199,17 @@ namespace rgw::dedup {
       return -EINVAL;
     }
 
-    librados::IoCtx ioctx;
-    int ret = init_dedup_pool_ioctx(store->getRados(), dpp, ioctx);
-    if (ret != 0) {
+    librados::IoCtx ctl_ioctx;
+    int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+    if (unlikely(ret != 0)) {
       return ret;
     }
+
     // 10 seconds timeout
     const uint64_t timeout_ms = 10*1000;
     bufferlist reply_bl, urgent_msg_bl;
     ceph::encode(urgent_msg, urgent_msg_bl);
-    ret = rgw_rados_notify(dpp, ioctx, DEDUP_WATCH_OBJ, urgent_msg_bl,
+    ret = rgw_rados_notify(dpp, ctl_ioctx, DEDUP_WATCH_OBJ, urgent_msg_bl,
                            timeout_ms, &reply_bl, null_yield);
     if (ret < 0) {
       ldpp_dout(dpp, 1) << __func__ << "::failed rgw_rados_notify("
@@ -1110,7 +1218,7 @@ namespace rgw::dedup {
     }
     std::vector<librados::notify_ack_t> acks;
     std::vector<librados::notify_timeout_t> timeouts;
-    ioctx.decode_notify_response(reply_bl, &acks, &timeouts);
+    ctl_ioctx.decode_notify_response(reply_bl, &acks, &timeouts);
     if (timeouts.size() > 0) {
       ldpp_dout(dpp, 1) << __func__ << "::failed rgw_rados_notify("
                         << DEDUP_WATCH_OBJ << ")::timeout error" << dendl;
@@ -1147,17 +1255,21 @@ namespace rgw::dedup {
                                   dedup_req_type_t dedup_type,
                                   const DoutPrefixProvider *dpp)
   {
-    librados::IoCtx ioctx;
-    int ret = init_dedup_pool_ioctx(store->getRados(), dpp, ioctx);
-    if (ret != 0) {
-      return ret;
-    }
+    ldpp_dout(dpp, 1) << __func__ << "::dedup_type = " << dedup_type << dendl;
 
     dedup_epoch_t old_epoch;
     // store the previous epoch for cmp-swap
-    ret = get_epoch(ioctx, dpp, &old_epoch, __func__);
+    int ret = get_epoch(store, dpp, &old_epoch, __func__);
     if (ret != 0) {
-      return ret;
+      // generate an empty epoch with zero counters
+      std::string cluster_id("NULL_CLUSTER_ID");
+      ldpp_dout(dpp, 1) << __func__ << "::set empty EPOCH using cluster_id: "
+                        << cluster_id << dendl;
+      set_epoch(store, cluster_id, dpp, 0, 0);
+      ret = get_epoch(store, dpp, &old_epoch, __func__);
+      if (ret) {
+        return ret;
+      }
     }
 
     // first abort all dedup work!
@@ -1165,6 +1277,13 @@ namespace rgw::dedup {
     if (ret != 0) {
       return ret;
     }
+#if 0
+    // then delete dedup-pool to ensure a clean start
+    const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool;
+    auto rados_handle = store->getRados()->get_rados_handle();
+    ldpp_dout(dpp, 5) <<__func__ << "::delete pool: " << dedup_pool.name << dendl;
+    rados_handle->pool_delete(dedup_pool.name.c_str());
+#endif
 
     ldpp_dout(dpp, 10) << __func__ << dedup_type << dendl;
 #ifdef FULL_DEDUP_SUPPORT
@@ -1173,7 +1292,7 @@ namespace rgw::dedup {
 #else
     ceph_assert(dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE);
 #endif
-    ret = swap_epoch(dpp, ioctx, &old_epoch, dedup_type, 0, 0);
+    ret = swap_epoch(store, dpp, &old_epoch, dedup_type, 0, 0);
     if (ret == 0) {
       ldpp_dout(dpp, 10) << __func__ << "::Epoch object was reset" << dendl;
       return dedup_control(store, dpp, URGENT_MSG_RESTART);
@@ -1187,14 +1306,8 @@ namespace rgw::dedup {
   bool cluster::can_start_new_scan(rgw::sal::RadosStore *store)
   {
     ldpp_dout(dpp, 10) << __func__ << "::epoch=" << d_epoch_time << dendl;
-    librados::IoCtx ioctx;
-    int ret = init_dedup_pool_ioctx(store->getRados(), dpp, ioctx);
-    if (ret != 0) {
-      return ret;
-    }
-
     dedup_epoch_t new_epoch;
-    if (get_epoch(ioctx, dpp, &new_epoch, nullptr) != 0) {
+    if (get_epoch(store, dpp, &new_epoch, nullptr) != 0) {
       ldpp_dout(dpp, 1) << __func__ << "::No Epoch Object::"
                         << "::scan can be restarted!\n\n\n" << dendl;
       // no epoch object exists -> we should start a new scan
index 52fa6c376501d29544f1d4a5d10951586c3d22b6..64b2c54a4fa28a1e2f69f2c62d8ca212ec892a79 100644 (file)
 #include <string>
 
 namespace rgw::dedup {
-  static constexpr const char* DEDUP_POOL_NAME     = "rgw_dedup_pool";
-  static constexpr const char* MD5_SHARD_PREFIX    = "MD5.SHRD.TK.";
   static constexpr const char* WORKER_SHARD_PREFIX = "WRK.SHRD.TK.";
-
+  static constexpr const char* MD5_SHARD_PREFIX    = "MD5.SHRD.TK.";
+  struct control_t;
   struct dedup_epoch_t;
-  int   init_dedup_pool_ioctx(RGWRados                 *rados,
-                              const DoutPrefixProvider *dpp,
-                              librados::IoCtx          &ioctx);
 
   class cluster{
   public:
@@ -51,6 +47,11 @@ namespace rgw::dedup {
         this->total_len = this->prefix_len + n;
       }
 
+      //---------------------------------------------------------------------------
+      static bool legal_oid_name(const std::string& oid) {
+        return ((oid.length() <= BUFF_SIZE) &&
+                (oid.starts_with(WORKER_SHARD_PREFIX)||oid.starts_with(MD5_SHARD_PREFIX)));
+      }
       inline const char* get_buff() { return this->buff; }
       inline unsigned get_buff_size() { return this->total_len; }
     private:
@@ -65,20 +66,32 @@ namespace rgw::dedup {
             CephContext* cct,
             rgw::sal::Driver* driver);
     int          reset(rgw::sal::RadosStore *store,
-                       librados::IoCtx &ioctx,
                        struct dedup_epoch_t*,
                        work_shard_t num_work_shards,
                        md5_shard_t num_md5_shards);
 
     utime_t      get_epoch_time() { return d_epoch_time; }
-    work_shard_t get_next_work_shard_token(librados::IoCtx &ioctx,
+    work_shard_t get_next_work_shard_token(rgw::sal::RadosStore *store,
                                            work_shard_t num_work_shards);
-    md5_shard_t  get_next_md5_shard_token(librados::IoCtx &ioctx,
+    md5_shard_t  get_next_md5_shard_token(rgw::sal::RadosStore *store,
                                           md5_shard_t num_md5_shards);
     bool         can_start_new_scan(rgw::sal::RadosStore *store);
     static int   collect_all_shard_stats(rgw::sal::RadosStore *store,
                                          Formatter *p_formatter,
                                          const DoutPrefixProvider *dpp);
+    static int   watch_reload(rgw::sal::RadosStore *store,
+                              const DoutPrefixProvider* dpp,
+                              uint64_t *p_watch_handle,
+                              librados::WatchCtx2 *ctx);
+    static int   unwatch_reload(rgw::sal::RadosStore *store,
+                                const DoutPrefixProvider* dpp,
+                                uint64_t watch_handle);
+    static int   ack_notify(rgw::sal::RadosStore *store,
+                            const DoutPrefixProvider *dpp,
+                            const struct control_t *p_ctl,
+                            uint64_t notify_id,
+                            uint64_t cookie,
+                            int status);
     static int   dedup_control(rgw::sal::RadosStore *store,
                                const DoutPrefixProvider *dpp,
                                urgent_msg_t urgent_msg);
@@ -87,7 +100,7 @@ namespace rgw::dedup {
                                     const DoutPrefixProvider *dpp);
 
     //---------------------------------------------------------------------------
-    int mark_work_shard_token_completed(librados::IoCtx &ioctx,
+    int mark_work_shard_token_completed(rgw::sal::RadosStore *store,
                                         work_shard_t work_shard,
                                         const worker_stats_t *p_stats)
     {
@@ -95,14 +108,13 @@ namespace rgw::dedup {
       encode(*p_stats, bl);
       d_num_completed_workers++;
       d_completed_workers[work_shard] = TOKEN_STATE_COMPLETED;
-      d_total_ingressed_obj += p_stats->ingress_obj;
 
-      return mark_shard_token_completed(ioctx, work_shard, p_stats->ingress_obj,
+      return mark_shard_token_completed(store, work_shard, p_stats->ingress_obj,
                                         WORKER_SHARD_PREFIX, bl);
     }
 
     //---------------------------------------------------------------------------
-    int mark_md5_shard_token_completed(librados::IoCtx &ioctx,
+    int mark_md5_shard_token_completed(rgw::sal::RadosStore *store,
                                        md5_shard_t md5_shard,
                                        const md5_stats_t *p_stats)
     {
@@ -110,53 +122,56 @@ namespace rgw::dedup {
       encode(*p_stats, bl);
       d_num_completed_md5++;
       d_completed_md5[md5_shard] = TOKEN_STATE_COMPLETED;
-      return mark_shard_token_completed(ioctx, md5_shard, p_stats->loaded_objects,
+      return mark_shard_token_completed(store, md5_shard, p_stats->loaded_objects,
                                         MD5_SHARD_PREFIX, bl);
     }
 
-    int update_shard_token_heartbeat(librados::IoCtx &ioctx,
+    int update_shard_token_heartbeat(rgw::sal::RadosStore *store,
                                      unsigned shard,
                                      uint64_t count_a,
                                      uint64_t count_b,
                                      const char *prefix);
 
     //---------------------------------------------------------------------------
-    bool all_work_shard_tokens_completed(librados::IoCtx &ioctx,
-                                         work_shard_t num_work_shards,
-                                         uint64_t *p_total_ingressed)
+    int all_work_shard_tokens_completed(rgw::sal::RadosStore *store,
+                                        work_shard_t num_work_shards)
+    {
+      return all_shard_tokens_completed(store, num_work_shards, WORKER_SHARD_PREFIX,
+                                        &d_num_completed_workers, d_completed_workers);
+    }
+
+    //---------------------------------------------------------------------------
+    int all_md5_shard_tokens_completed(rgw::sal::RadosStore *store,
+                                       md5_shard_t num_md5_shards)
     {
-      return all_shard_tokens_completed(ioctx,
-                                        num_work_shards,
-                                        WORKER_SHARD_PREFIX,
-                                        &d_num_completed_workers,
-                                        d_completed_workers,
-                                        p_total_ingressed);
+      return all_shard_tokens_completed(store, num_md5_shards, MD5_SHARD_PREFIX,
+                                        &d_num_completed_md5, d_completed_md5);
     }
 
   private:
     static constexpr unsigned TOKEN_STATE_PENDING   = 0x00;
+    static constexpr unsigned TOKEN_STATE_CORRUPTED = 0xCC;
     static constexpr unsigned TOKEN_STATE_TIMED_OUT = 0xDD;
     static constexpr unsigned TOKEN_STATE_COMPLETED = 0xFF;
 
     void clear();
-    bool all_shard_tokens_completed(librados::IoCtx &ioctx,
+    int  all_shard_tokens_completed(rgw::sal::RadosStore *store,
                                     unsigned shards_count,
                                     const char *prefix,
                                     uint16_t *p_num_completed,
-                                    uint8_t completed_arr[],
-                                    uint64_t *p_total_ingressed);
-    int cleanup_prev_run(librados::IoCtx &ioctx);
-    int32_t get_next_shard_token(librados::IoCtx &ioctx,
+                                    uint8_t completed_arr[]);
+    int cleanup_prev_run(rgw::sal::RadosStore *store);
+    int32_t get_next_shard_token(rgw::sal::RadosStore *store,
                                  uint16_t start_shard,
                                  uint16_t max_count,
                                  const char *prefix);
-    int create_shard_tokens(librados::IoCtx &ioctx,
+    int create_shard_tokens(rgw::sal::RadosStore *store,
                             unsigned shards_count,
                             const char *prefix);
-    int verify_all_shard_tokens(librados::IoCtx &ioctx,
+    int verify_all_shard_tokens(rgw::sal::RadosStore *store,
                                 unsigned shards_count,
                                 const char *prefix);
-    int mark_shard_token_completed(librados::IoCtx &ioctx,
+    int mark_shard_token_completed(rgw::sal::RadosStore *store,
                                    unsigned shard,
                                    uint64_t obj_count,
                                    const char *prefix,
@@ -169,12 +184,10 @@ namespace rgw::dedup {
     work_shard_t              d_curr_worker_shard = 0;
     utime_t                   d_epoch_time;
     utime_t                   d_token_creation_time;
-    uint64_t                  d_total_ingressed_obj = 0;
     uint8_t                   d_completed_workers[MAX_WORK_SHARD];
     uint8_t                   d_completed_md5[MAX_MD5_SHARD];
     uint16_t                  d_num_completed_workers = 0;
     uint16_t                  d_num_completed_md5 = 0;
-    uint16_t                  d_num_failed_workers = 0;
   };
 
 } //namespace rgw::dedup
index f2829bfea894363b67e702fdb600cb00fc27950d..18898bbba95298a2739f4930f2de9f589916a9a9 100644 (file)
@@ -32,8 +32,6 @@
 
 namespace rgw::dedup {
 
-  rgw_pool pool(DEDUP_POOL_NAME);
-
   //---------------------------------------------------------------------------
   disk_record_t::disk_record_t(const rgw::sal::Bucket *p_bucket,
                                const std::string      &obj_name,
index e2798ad5823d0273bf6316e9bbb1b93d46fdbd2c..09335655df626aa225edbfd3e38258adea072389 100644 (file)
@@ -49,6 +49,14 @@ namespace rgw::dedup {
       }
 
       const key_t &key = hash_tab[tab_idx].key;
+      // This is an approximation only since size is stored in 4KB resolution
+      uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units);
+      if (!key.multipart_object() && (byte_size_approx <= head_object_size)) {
+        hash_tab[tab_idx].val.clear_flags();
+        redistributed_clear++;
+        continue;
+      }
+
       uint32_t key_idx = key.hash() % entries_count;
       if (key_idx != tab_idx) {
         uint64_t count = 1;
@@ -195,31 +203,50 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  void dedup_table_t::count_duplicates(uint64_t *p_singleton_count,
-                                       uint64_t *p_unique_count,
-                                       uint64_t *p_duplicate_count,
-                                       uint64_t *p_duplicate_bytes_approx)
+  void dedup_table_t::count_duplicates(dedup_stats_t *p_small_objs,
+                                       dedup_stats_t *p_big_objs,
+                                       uint64_t *p_duplicate_head_bytes)
   {
     for (uint32_t tab_idx = 0; tab_idx < entries_count; tab_idx++) {
       if (!hash_tab[tab_idx].val.is_occupied()) {
         continue;
       }
 
+      const key_t &key = hash_tab[tab_idx].key;
+      // This is an approximation only since size is stored in 4KB resolution
+      uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units);
+      uint32_t duplicate_count = (hash_tab[tab_idx].val.count -1);
+
+      // skip small single part objects which we can't dedup
+      if (!key.multipart_object() && (byte_size_approx <= head_object_size)) {
+        if (hash_tab[tab_idx].val.is_singleton()) {
+          p_small_objs->singleton_count++;
+        }
+        else {
+          p_small_objs->duplicate_count += duplicate_count;
+          p_small_objs->unique_count ++;
+          p_small_objs->dedup_bytes_estimate += (duplicate_count * byte_size_approx);
+        }
+        continue;
+      }
+
       if (hash_tab[tab_idx].val.is_singleton()) {
-        (*p_singleton_count)++;
+        p_big_objs->singleton_count++;
       }
       else {
         ceph_assert(hash_tab[tab_idx].val.count > 1);
-        uint32_t duplicate_count = (hash_tab[tab_idx].val.count -1);
-        key_t &key = hash_tab[tab_idx].key;
-        // This is an approximation only since size is stored in 4KB resolution
-        uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units);
         uint64_t dup_bytes_approx = calc_deduped_bytes(head_object_size,
                                                        key.num_parts,
                                                        byte_size_approx);
-        (*p_duplicate_bytes_approx) += (duplicate_count * dup_bytes_approx);
-        (*p_duplicate_count) += duplicate_count;
-        (*p_unique_count) ++;
+        p_big_objs->dedup_bytes_estimate += (duplicate_count * dup_bytes_approx);
+        p_big_objs->duplicate_count += duplicate_count;
+        p_big_objs->unique_count ++;
+
+        if (!key.multipart_object()) {
+          // single part objects duplicate the head object when dedup is used
+          uint64_t dup_head_bytes = duplicate_count * head_object_size;
+          *p_duplicate_head_bytes += dup_head_bytes;
+        }
       }
     }
   }
index 669f360ffc810ca431c8d0e478ab19bd65590b49..51d36006944f42826b2019b7ed48d4399fe6e8eb 100644 (file)
@@ -49,6 +49,10 @@ namespace rgw::dedup {
       return this->md5_low;
     }
 
+    bool multipart_object() const {
+      return num_parts > 0;
+    }
+
     uint64_t md5_high;      // High Bytes of the Object Data MD5
     uint64_t md5_low;       // Low  Bytes of the Object Data MD5
     uint32_t size_4k_units; // Object size in 4KB units max out at 16TB (AWS MAX-SIZE is 5TB)
@@ -110,10 +114,10 @@ namespace rgw::dedup {
                                      disk_block_id_t block_id,
                                      record_id_t rec_id);
 
-    void count_duplicates(uint64_t *p_singleton_count,
-                          uint64_t *p_unique_count,
-                          uint64_t *p_duplicate_count,
-                          uint64_t *p_duplicate_bytes_approx);
+    void count_duplicates(dedup_stats_t *p_small_objs_stat,
+                          dedup_stats_t *p_big_objs_stat,
+                          uint64_t *p_duplicate_head_bytes);
+
     void remove_singletons_and_redistribute_keys();
   private:
     // 32 Bytes unified entries
index ef17ec0d38e7c01810d2dd7f61352eac92a9fe3f..c380bff842c601e959572cc6965e0c838603aea9 100644 (file)
@@ -35,6 +35,48 @@ namespace rgw::dedup {
     return out;
   }
 
+  //---------------------------------------------------------------------------
+  dedup_stats_t& dedup_stats_t::operator+=(const dedup_stats_t& other)
+  {
+    this->singleton_count += other.singleton_count;
+    this->unique_count += other.unique_count;
+    this->duplicate_count += other.duplicate_count;
+    this->dedup_bytes_estimate += other.dedup_bytes_estimate;
+    return *this;
+  }
+
+  //---------------------------------------------------------------------------
+  std::ostream& operator<<(std::ostream &out, const dedup_stats_t& stats)
+  {
+    out << "::singleton_count="  << stats.singleton_count
+        << "::unique_count="     << stats.unique_count
+        << "::duplicate_count="  << stats.duplicate_count
+        << "::duplicated_bytes=" << stats.dedup_bytes_estimate;
+    return out;
+  }
+
+  //---------------------------------------------------------------------------
+  void encode(const dedup_stats_t& ds, ceph::bufferlist& bl)
+  {
+    ENCODE_START(1, 1, bl);
+    encode(ds.singleton_count, bl);
+    encode(ds.unique_count, bl);
+    encode(ds.duplicate_count, bl);
+    encode(ds.dedup_bytes_estimate, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  //---------------------------------------------------------------------------
+  void decode(dedup_stats_t& ds, ceph::bufferlist::const_iterator& bl)
+  {
+    DECODE_START(1, bl);
+    decode(ds.singleton_count, bl);
+    decode(ds.unique_count, bl);
+    decode(ds.duplicate_count, bl);
+    decode(ds.dedup_bytes_estimate, bl);
+    DECODE_FINISH(bl);
+  }
+
   // convert a hex-string to a 64bit integer (max 16 hex digits)
   //---------------------------------------------------------------------------
   bool hex2int(const char *p, const char *p_end, uint64_t *p_val)
@@ -206,7 +248,8 @@ namespace rgw::dedup {
   };
 
   //---------------------------------------------------------------------------
-  const char* get_urgent_msg_names(int msg) {
+  const char* get_urgent_msg_names(int msg)
+  {
     if (msg <= URGENT_MSG_INVALID && msg >= URGENT_MSG_NONE) {
       return s_urgent_msg_names[msg];
     }
@@ -216,22 +259,34 @@ namespace rgw::dedup {
   }
 
   //---------------------------------------------------------------------------
-  std::ostream& operator<<(std::ostream &out, const worker_stats_t &s)
+  worker_stats_t& worker_stats_t::operator+=(const worker_stats_t& other)
   {
-    JSONFormatter formatter(false);
-    s.dump(&formatter);
-    std::stringstream sstream;
-    formatter.flush(sstream);
-    out << sstream.str();
-    return out;
+    this->ingress_obj += other.ingress_obj;
+    this->ingress_obj_bytes += other.ingress_obj_bytes;
+    this->egress_records += other.egress_records;
+    this->egress_blocks += other.egress_blocks;
+    this->egress_slabs += other.egress_slabs;
+    this->single_part_objs += other.single_part_objs;
+    this->multipart_objs += other.multipart_objs;
+    this->small_multipart_obj += other.small_multipart_obj;
+    this->default_storage_class_objs += other.default_storage_class_objs;
+    this->default_storage_class_objs_bytes += other.default_storage_class_objs_bytes;
+    this->non_default_storage_class_objs += other.non_default_storage_class_objs;
+    this->non_default_storage_class_objs_bytes += other.non_default_storage_class_objs_bytes;
+    this->ingress_corrupted_etag += other.ingress_corrupted_etag;
+    this->ingress_skip_too_small_bytes += other.ingress_skip_too_small_bytes;
+    this->ingress_skip_too_small += other.ingress_skip_too_small;
+    this->ingress_skip_too_small_64KB_bytes += other.ingress_skip_too_small_64KB_bytes;
+    this->ingress_skip_too_small_64KB += other.ingress_skip_too_small_64KB;
+
+    return *this;
   }
-
   //---------------------------------------------------------------------------
   void worker_stats_t::dump(Formatter *f) const
   {
     // main section
     {
-      Formatter::ObjectSection notify(*f, "main");
+      Formatter::ObjectSection main(*f, "main");
 
       f->dump_unsigned("Ingress Objs count", this->ingress_obj);
       f->dump_unsigned("Accum byte size Ingress Objs", this->ingress_obj_bytes);
@@ -285,6 +340,122 @@ namespace rgw::dedup {
     }
   }
 
+  //---------------------------------------------------------------------------
+  std::ostream& operator<<(std::ostream &out, const worker_stats_t &s)
+  {
+    JSONFormatter formatter(false);
+    s.dump(&formatter);
+    std::stringstream sstream;
+    formatter.flush(sstream);
+    out << sstream.str();
+    return out;
+  }
+
+  //---------------------------------------------------------------------------
+  void encode(const worker_stats_t& w, ceph::bufferlist& bl)
+  {
+    ENCODE_START(1, 1, bl);
+    encode(w.ingress_obj, bl);
+    encode(w.ingress_obj_bytes, bl);
+    encode(w.egress_records, bl);
+    encode(w.egress_blocks, bl);
+    encode(w.egress_slabs, bl);
+
+    encode(w.single_part_objs, bl);
+    encode(w.multipart_objs, bl);
+    encode(w.small_multipart_obj, bl);
+
+    encode(w.default_storage_class_objs, bl);
+    encode(w.default_storage_class_objs_bytes, bl);
+    encode(w.non_default_storage_class_objs, bl);
+    encode(w.non_default_storage_class_objs_bytes, bl);
+
+    encode(w.ingress_corrupted_etag, bl);
+
+    encode(w.ingress_skip_too_small_bytes, bl);
+    encode(w.ingress_skip_too_small, bl);
+
+    encode(w.ingress_skip_too_small_64KB_bytes, bl);
+    encode(w.ingress_skip_too_small_64KB, bl);
+
+    encode(w.duration, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  //---------------------------------------------------------------------------
+  void decode(worker_stats_t& w, ceph::bufferlist::const_iterator& bl)
+  {
+    DECODE_START(1, bl);
+    decode(w.ingress_obj, bl);
+    decode(w.ingress_obj_bytes, bl);
+    decode(w.egress_records, bl);
+    decode(w.egress_blocks, bl);
+    decode(w.egress_slabs, bl);
+    decode(w.single_part_objs, bl);
+    decode(w.multipart_objs, bl);
+    decode(w.small_multipart_obj, bl);
+    decode(w.default_storage_class_objs, bl);
+    decode(w.default_storage_class_objs_bytes, bl);
+    decode(w.non_default_storage_class_objs, bl);
+    decode(w.non_default_storage_class_objs_bytes, bl);
+    decode(w.ingress_corrupted_etag, bl);
+    decode(w.ingress_skip_too_small_bytes, bl);
+    decode(w.ingress_skip_too_small, bl);
+    decode(w.ingress_skip_too_small_64KB_bytes, bl);
+    decode(w.ingress_skip_too_small_64KB, bl);
+
+    decode(w.duration, bl);
+    DECODE_FINISH(bl);
+  }
+
+  //---------------------------------------------------------------------------
+  md5_stats_t& md5_stats_t::operator+=(const md5_stats_t& other)
+  {
+    this->small_objs_stat               += other.small_objs_stat;
+    this->big_objs_stat                 += other.big_objs_stat;
+    this->ingress_failed_load_bucket    += other.ingress_failed_load_bucket;
+    this->ingress_failed_get_object     += other.ingress_failed_get_object;
+    this->ingress_failed_get_obj_attrs  += other.ingress_failed_get_obj_attrs;
+    this->ingress_corrupted_etag        += other.ingress_corrupted_etag;
+    this->ingress_corrupted_obj_attrs   += other.ingress_corrupted_obj_attrs;
+    this->ingress_skip_encrypted        += other.ingress_skip_encrypted;
+    this->ingress_skip_encrypted_bytes  += other.ingress_skip_encrypted_bytes;
+    this->ingress_skip_compressed       += other.ingress_skip_compressed;
+    this->ingress_skip_compressed_bytes += other.ingress_skip_compressed_bytes;
+    this->ingress_skip_changed_objs     += other.ingress_skip_changed_objs;
+    this->shared_manifest_dedup_bytes   += other.shared_manifest_dedup_bytes;
+
+    this->skipped_shared_manifest += other.skipped_shared_manifest;
+    this->skipped_purged_small    += other.skipped_purged_small;
+    this->skipped_singleton       += other.skipped_singleton;
+    this->skipped_singleton_bytes += other.skipped_singleton_bytes;
+    this->skipped_source_record   += other.skipped_source_record;
+    this->duplicate_records       += other.duplicate_records;
+    this->size_mismatch           += other.size_mismatch;
+    this->sha256_mismatch         += other.sha256_mismatch;
+    this->failed_src_load         += other.failed_src_load;
+    this->failed_rec_load         += other.failed_rec_load;
+    this->failed_block_load       += other.failed_block_load;
+
+    this->valid_sha256_attrs      += other.valid_sha256_attrs;
+    this->invalid_sha256_attrs    += other.invalid_sha256_attrs;
+    this->set_sha256_attrs        += other.set_sha256_attrs;
+    this->skip_sha256_cmp         += other.skip_sha256_cmp;
+
+    this->set_shared_manifest_src += other.set_shared_manifest_src;
+    this->loaded_objects          += other.loaded_objects;
+    this->processed_objects       += other.processed_objects;
+    this->dup_head_bytes_estimate += other.dup_head_bytes_estimate;
+    this->deduped_objects         += other.deduped_objects;
+    this->deduped_objects_bytes   += other.deduped_objects_bytes;
+    this->dup_head_bytes          += other.dup_head_bytes;
+
+    this->failed_dedup            += other.failed_dedup;
+    this->failed_table_load       += other.failed_table_load;
+    this->failed_map_overflow     += other.failed_map_overflow;
+    return *this;
+  }
+
   //---------------------------------------------------------------------------
   std::ostream& operator<<(std::ostream &out, const md5_stats_t &s)
   {
@@ -301,19 +472,37 @@ namespace rgw::dedup {
   {
     // main section
     {
-      Formatter::ObjectSection notify(*f, "main");
+      Formatter::ObjectSection main(*f, "main");
 
       f->dump_unsigned("Total processed objects", this->processed_objects);
       f->dump_unsigned("Loaded objects", this->loaded_objects);
       f->dump_unsigned("Set Shared-Manifest SRC", this->set_shared_manifest_src);
       f->dump_unsigned("Deduped Obj (this cycle)", this->deduped_objects);
       f->dump_unsigned("Deduped Bytes(this cycle)", this->deduped_objects_bytes);
+      f->dump_unsigned("Dup head bytes (not dedup)", this->dup_head_bytes);
       f->dump_unsigned("Already Deduped bytes (prev cycles)",
                        this->shared_manifest_dedup_bytes);
-      f->dump_unsigned("Singleton Obj", this->singleton_count);
-      f->dump_unsigned("Unique Obj", this->unique_count);
-      f->dump_unsigned("Duplicate Obj", this->duplicate_count);
-      f->dump_unsigned("Dedup Bytes Estimate", this->dedup_bytes_estimate);
+
+      const dedup_stats_t &ds = this->big_objs_stat;
+      f->dump_unsigned("Singleton Obj", ds.singleton_count);
+      f->dump_unsigned("Unique Obj", ds.unique_count);
+      f->dump_unsigned("Duplicate Obj", ds.duplicate_count);
+      f->dump_unsigned("Dedup Bytes Estimate", ds.dedup_bytes_estimate);
+    }
+
+    // Potential Dedup Section:
+    // What could be gained by allowing dedup of smaller objects (64KB-4MB),
+    // plus the space wasted by each duplicated head-object (4MB)
+    {
+      Formatter::ObjectSection potential(*f, "Potential Dedup");
+      const dedup_stats_t &ds = this->small_objs_stat;
+      f->dump_unsigned("Singleton Obj (64KB-4MB)", ds.singleton_count);
+      f->dump_unsigned("Unique Obj (64KB-4MB)", ds.unique_count);
+      f->dump_unsigned("Duplicate Obj (64KB-4MB)", ds.duplicate_count);
+      f->dump_unsigned("Dedup Bytes Estimate (64KB-4MB)", ds.dedup_bytes_estimate);
+      f->dump_unsigned("Duplicated Head Bytes Estimate",
+                       this->dup_head_bytes_estimate);
+      f->dump_unsigned("Duplicated Head Bytes", this->dup_head_bytes);
     }
 
     {
@@ -340,6 +529,7 @@ namespace rgw::dedup {
     {
       Formatter::ObjectSection skipped(*f, "skipped");
       f->dump_unsigned("Skipped shared_manifest", this->skipped_shared_manifest);
+      f->dump_unsigned("Skipped purged small objs", this->skipped_purged_small);
       f->dump_unsigned("Skipped singleton objs", this->skipped_singleton);
       if (this->skipped_singleton) {
         f->dump_unsigned("Skipped singleton Bytes", this->skipped_singleton_bytes);
@@ -403,4 +593,105 @@ namespace rgw::dedup {
       }
     }
   }
+
+  //---------------------------------------------------------------------------
+  void encode(const md5_stats_t& m, ceph::bufferlist& bl)
+  {
+    ENCODE_START(1, 1, bl);
+
+    encode(m.small_objs_stat, bl);
+    encode(m.big_objs_stat, bl);
+    encode(m.ingress_failed_load_bucket, bl);
+    encode(m.ingress_failed_get_object, bl);
+    encode(m.ingress_failed_get_obj_attrs, bl);
+    encode(m.ingress_corrupted_etag, bl);
+    encode(m.ingress_corrupted_obj_attrs, bl);
+    encode(m.ingress_skip_encrypted, bl);
+    encode(m.ingress_skip_encrypted_bytes, bl);
+    encode(m.ingress_skip_compressed, bl);
+    encode(m.ingress_skip_compressed_bytes, bl);
+    encode(m.ingress_skip_changed_objs, bl);
+    encode(m.shared_manifest_dedup_bytes, bl);
+
+    encode(m.skipped_shared_manifest, bl);
+    encode(m.skipped_purged_small, bl);
+    encode(m.skipped_singleton, bl);
+    encode(m.skipped_singleton_bytes, bl);
+    encode(m.skipped_source_record, bl);
+    encode(m.duplicate_records, bl);
+    encode(m.size_mismatch, bl);
+    encode(m.sha256_mismatch, bl);
+    encode(m.failed_src_load, bl);
+    encode(m.failed_rec_load, bl);
+    encode(m.failed_block_load, bl);
+
+    encode(m.valid_sha256_attrs, bl);
+    encode(m.invalid_sha256_attrs, bl);
+    encode(m.set_sha256_attrs, bl);
+    encode(m.skip_sha256_cmp, bl);
+    encode(m.set_shared_manifest_src, bl);
+
+    encode(m.loaded_objects, bl);
+    encode(m.processed_objects, bl);
+    encode(m.dup_head_bytes_estimate, bl);
+    encode(m.deduped_objects, bl);
+    encode(m.deduped_objects_bytes, bl);
+    encode(m.dup_head_bytes, bl);
+    encode(m.failed_dedup, bl);
+    encode(m.failed_table_load, bl);
+    encode(m.failed_map_overflow, bl);
+
+    encode(m.duration, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  //---------------------------------------------------------------------------
+  void decode(md5_stats_t& m, ceph::bufferlist::const_iterator& bl)
+  {
+    DECODE_START(1, bl);
+    decode(m.small_objs_stat, bl);
+    decode(m.big_objs_stat, bl);
+    decode(m.ingress_failed_load_bucket, bl);
+    decode(m.ingress_failed_get_object, bl);
+    decode(m.ingress_failed_get_obj_attrs, bl);
+    decode(m.ingress_corrupted_etag, bl);
+    decode(m.ingress_corrupted_obj_attrs, bl);
+    decode(m.ingress_skip_encrypted, bl);
+    decode(m.ingress_skip_encrypted_bytes, bl);
+    decode(m.ingress_skip_compressed, bl);
+    decode(m.ingress_skip_compressed_bytes, bl);
+    decode(m.ingress_skip_changed_objs, bl);
+    decode(m.shared_manifest_dedup_bytes, bl);
+
+    decode(m.skipped_shared_manifest, bl);
+    decode(m.skipped_purged_small, bl);
+    decode(m.skipped_singleton, bl);
+    decode(m.skipped_singleton_bytes, bl);
+    decode(m.skipped_source_record, bl);
+    decode(m.duplicate_records, bl);
+    decode(m.size_mismatch, bl);
+    decode(m.sha256_mismatch, bl);
+    decode(m.failed_src_load, bl);
+    decode(m.failed_rec_load, bl);
+    decode(m.failed_block_load, bl);
+
+    decode(m.valid_sha256_attrs, bl);
+    decode(m.invalid_sha256_attrs, bl);
+    decode(m.set_sha256_attrs, bl);
+    decode(m.skip_sha256_cmp, bl);
+    decode(m.set_shared_manifest_src, bl);
+
+    decode(m.loaded_objects, bl);
+    decode(m.processed_objects, bl);
+    decode(m.dup_head_bytes_estimate, bl);
+    decode(m.deduped_objects, bl);
+    decode(m.deduped_objects_bytes, bl);
+    decode(m.dup_head_bytes, bl);
+    decode(m.failed_dedup, bl);
+    decode(m.failed_table_load, bl);
+    decode(m.failed_map_overflow, bl);
+
+    decode(m.duration, bl);
+    DECODE_FINISH(bl);
+  }
 } //namespace rgw::dedup
index 9c862272ca793516926065df921a1784018e3b5f..6a1d0fc0f45833b4ddf43788188c4f0c37c2e0bd 100644 (file)
@@ -25,7 +25,6 @@
 
 //#define FULL_DEDUP_SUPPORT
 namespace rgw::dedup {
-  static constexpr const char* DEDUP_WATCH_OBJ = "DEDUP_WATCH_OBJ";
   using work_shard_t   = uint16_t;
   using md5_shard_t    = uint16_t;
 
@@ -86,29 +85,21 @@ namespace rgw::dedup {
     uint8_t flags;
   };
 
-  struct worker_stats_t {
-    worker_stats_t& operator +=(const worker_stats_t& other) {
-      this->ingress_obj += other.ingress_obj;
-      this->ingress_obj_bytes += other.ingress_obj_bytes;
-      this->egress_records += other.egress_records;
-      this->egress_blocks += other.egress_blocks;
-      this->egress_slabs += other.egress_slabs;
-      this->single_part_objs += other.single_part_objs;
-      this->multipart_objs += other.multipart_objs;
-      this->small_multipart_obj += other.small_multipart_obj;
-      this->default_storage_class_objs += other.default_storage_class_objs;
-      this->default_storage_class_objs_bytes += other.default_storage_class_objs_bytes;
-      this->non_default_storage_class_objs += other.non_default_storage_class_objs;
-      this->non_default_storage_class_objs_bytes += other.non_default_storage_class_objs_bytes;
-      this->ingress_corrupted_etag += other.ingress_corrupted_etag;
-      this->ingress_skip_too_small_bytes += other.ingress_skip_too_small_bytes;
-      this->ingress_skip_too_small += other.ingress_skip_too_small;
-      this->ingress_skip_too_small_64KB_bytes += other.ingress_skip_too_small_64KB_bytes;
-      this->ingress_skip_too_small_64KB += other.ingress_skip_too_small_64KB;
-
-      return *this;
-    }
+  struct dedup_stats_t {
+    dedup_stats_t& operator+=(const dedup_stats_t& other);
+
+    uint64_t singleton_count = 0;
+    uint64_t unique_count = 0;
+    uint64_t duplicate_count = 0;
+    uint64_t dedup_bytes_estimate = 0;
+  };
 
+  std::ostream& operator<<(std::ostream &out, const dedup_stats_t& stats);
+  void encode(const dedup_stats_t& ds, ceph::bufferlist& bl);
+  void decode(dedup_stats_t& ds, ceph::bufferlist::const_iterator& bl);
+
+  struct worker_stats_t {
+    worker_stats_t& operator +=(const worker_stats_t& other);
     void dump(Formatter *f) const;
 
     uint64_t ingress_obj = 0;
@@ -138,109 +129,16 @@ namespace rgw::dedup {
     utime_t  duration = {0, 0};
   };
   std::ostream& operator<<(std::ostream &out, const worker_stats_t &s);
+  void encode(const worker_stats_t& w, ceph::bufferlist& bl);
+  void decode(worker_stats_t& w, ceph::bufferlist::const_iterator& bl);
 
-  inline void encode(const worker_stats_t& w, ceph::bufferlist& bl)
-  {
-    ENCODE_START(1, 1, bl);
-    encode(w.ingress_obj, bl);
-    encode(w.ingress_obj_bytes, bl);
-    encode(w.egress_records, bl);
-    encode(w.egress_blocks, bl);
-    encode(w.egress_slabs, bl);
-
-    encode(w.single_part_objs, bl);
-    encode(w.multipart_objs, bl);
-    encode(w.small_multipart_obj, bl);
-
-    encode(w.default_storage_class_objs, bl);
-    encode(w.default_storage_class_objs_bytes, bl);
-    encode(w.non_default_storage_class_objs, bl);
-    encode(w.non_default_storage_class_objs_bytes, bl);
-
-    encode(w.ingress_corrupted_etag, bl);
-
-    encode(w.ingress_skip_too_small_bytes, bl);
-    encode(w.ingress_skip_too_small, bl);
-
-    encode(w.ingress_skip_too_small_64KB_bytes, bl);
-    encode(w.ingress_skip_too_small_64KB, bl);
-
-    encode(w.duration, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  inline void decode(worker_stats_t& w, ceph::bufferlist::const_iterator& bl)
-  {
-    DECODE_START(1, bl);
-    decode(w.ingress_obj, bl);
-    decode(w.ingress_obj_bytes, bl);
-    decode(w.egress_records, bl);
-    decode(w.egress_blocks, bl);
-    decode(w.egress_slabs, bl);
-    decode(w.single_part_objs, bl);
-    decode(w.multipart_objs, bl);
-    decode(w.small_multipart_obj, bl);
-    decode(w.default_storage_class_objs, bl);
-    decode(w.default_storage_class_objs_bytes, bl);
-    decode(w.non_default_storage_class_objs, bl);
-    decode(w.non_default_storage_class_objs_bytes, bl);
-    decode(w.ingress_corrupted_etag, bl);
-    decode(w.ingress_skip_too_small_bytes, bl);
-    decode(w.ingress_skip_too_small, bl);
-    decode(w.ingress_skip_too_small_64KB_bytes, bl);
-    decode(w.ingress_skip_too_small_64KB, bl);
-
-    decode(w.duration, bl);
-    DECODE_FINISH(bl);
-  }
 
   struct md5_stats_t {
-    md5_stats_t& operator +=(const md5_stats_t& other) {
-      this->ingress_failed_load_bucket    += other.ingress_failed_load_bucket;
-      this->ingress_failed_get_object     += other.ingress_failed_get_object;
-      this->ingress_failed_get_obj_attrs  += other.ingress_failed_get_obj_attrs;
-      this->ingress_corrupted_etag        += other.ingress_corrupted_etag;
-      this->ingress_corrupted_obj_attrs   += other.ingress_corrupted_obj_attrs;
-      this->ingress_skip_encrypted        += other.ingress_skip_encrypted;
-      this->ingress_skip_encrypted_bytes  += other.ingress_skip_encrypted_bytes;
-      this->ingress_skip_compressed       += other.ingress_skip_compressed;
-      this->ingress_skip_compressed_bytes += other.ingress_skip_compressed_bytes;
-      this->ingress_skip_changed_objs     += other.ingress_skip_changed_objs;
-      this->shared_manifest_dedup_bytes   += other.shared_manifest_dedup_bytes;
-
-      this->skipped_shared_manifest += other.skipped_shared_manifest;
-      this->skipped_singleton       += other.skipped_singleton;
-      this->skipped_singleton_bytes += other.skipped_singleton_bytes;
-      this->skipped_source_record   += other.skipped_source_record;
-      this->duplicate_records       += other.duplicate_records;
-      this->size_mismatch           += other.size_mismatch;
-      this->sha256_mismatch         += other.sha256_mismatch;
-      this->failed_src_load         += other.failed_src_load;
-      this->failed_rec_load         += other.failed_rec_load;
-      this->failed_block_load       += other.failed_block_load;
-
-      this->valid_sha256_attrs      += other.valid_sha256_attrs;
-      this->invalid_sha256_attrs    += other.invalid_sha256_attrs;
-      this->set_sha256_attrs        += other.set_sha256_attrs;
-      this->skip_sha256_cmp         += other.skip_sha256_cmp;
-
-      this->set_shared_manifest_src += other.set_shared_manifest_src;
-      this->loaded_objects          += other.loaded_objects;
-      this->processed_objects       += other.processed_objects;
-      this->singleton_count         += other.singleton_count;
-      this->duplicate_count         += other.duplicate_count;
-      this->dedup_bytes_estimate    += other.dedup_bytes_estimate;
-      this->unique_count            += other.unique_count;
-      this->deduped_objects         += other.deduped_objects;
-      this->deduped_objects_bytes   += other.deduped_objects_bytes;
-
-      this->failed_dedup            += other.failed_dedup;
-      this->failed_table_load       += other.failed_table_load;
-      this->failed_map_overflow     += other.failed_map_overflow;
-      return *this;
-    }
+    md5_stats_t& operator +=(const md5_stats_t& other);
     void dump(Formatter *f) const;
 
+    dedup_stats_t small_objs_stat;
+    dedup_stats_t big_objs_stat;
     uint64_t ingress_failed_load_bucket = 0;
     uint64_t ingress_failed_get_object = 0;
     uint64_t ingress_failed_get_obj_attrs = 0;
@@ -254,6 +152,7 @@ namespace rgw::dedup {
 
     uint64_t shared_manifest_dedup_bytes = 0;
     uint64_t skipped_shared_manifest = 0;
+    uint64_t skipped_purged_small = 0;
     uint64_t skipped_singleton = 0;
     uint64_t skipped_singleton_bytes = 0;
     uint64_t skipped_source_record = 0;
@@ -272,116 +171,20 @@ namespace rgw::dedup {
     uint64_t set_shared_manifest_src = 0;
     uint64_t loaded_objects = 0;
     uint64_t processed_objects = 0;
-    uint64_t singleton_count = 0;
-    uint64_t duplicate_count = 0;
     // counter is using on-disk size affected by block-size
-    uint64_t dedup_bytes_estimate = 0;
-    uint64_t unique_count = 0;
+    uint64_t dup_head_bytes_estimate = 0; // estimated bytes wasted by duplicated head-objects
     uint64_t deduped_objects = 0;
     // counter is using s3 byte size disregarding the on-disk size affected by block-size
     uint64_t deduped_objects_bytes = 0;
+    uint64_t dup_head_bytes = 0;
     uint64_t failed_dedup = 0;
     uint64_t failed_table_load = 0;
     uint64_t failed_map_overflow = 0;
     utime_t  duration = {0, 0};
   };
   std::ostream &operator<<(std::ostream &out, const md5_stats_t &s);
-  inline void encode(const md5_stats_t& m, ceph::bufferlist& bl)
-  {
-    ENCODE_START(1, 1, bl);
-
-    encode(m.ingress_failed_load_bucket, bl);
-    encode(m.ingress_failed_get_object, bl);
-    encode(m.ingress_failed_get_obj_attrs, bl);
-    encode(m.ingress_corrupted_etag, bl);
-    encode(m.ingress_corrupted_obj_attrs, bl);
-    encode(m.ingress_skip_encrypted, bl);
-    encode(m.ingress_skip_encrypted_bytes, bl);
-    encode(m.ingress_skip_compressed, bl);
-    encode(m.ingress_skip_compressed_bytes, bl);
-    encode(m.ingress_skip_changed_objs, bl);
-    encode(m.shared_manifest_dedup_bytes, bl);
-
-    encode(m.skipped_shared_manifest, bl);
-    encode(m.skipped_singleton, bl);
-    encode(m.skipped_singleton_bytes, bl);
-    encode(m.skipped_source_record, bl);
-    encode(m.duplicate_records, bl);
-    encode(m.size_mismatch, bl);
-    encode(m.sha256_mismatch, bl);
-    encode(m.failed_src_load, bl);
-    encode(m.failed_rec_load, bl);
-    encode(m.failed_block_load, bl);
-
-    encode(m.valid_sha256_attrs, bl);
-    encode(m.invalid_sha256_attrs, bl);
-    encode(m.set_sha256_attrs, bl);
-    encode(m.skip_sha256_cmp, bl);
-    encode(m.set_shared_manifest_src, bl);
-
-    encode(m.loaded_objects, bl);
-    encode(m.processed_objects, bl);
-    encode(m.singleton_count, bl);
-    encode(m.duplicate_count, bl);
-    encode(m.dedup_bytes_estimate, bl);
-    encode(m.unique_count, bl);
-    encode(m.deduped_objects, bl);
-    encode(m.deduped_objects_bytes, bl);
-    encode(m.failed_dedup, bl);
-    encode(m.failed_table_load, bl);
-    encode(m.failed_map_overflow, bl);
-
-    encode(m.duration, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  inline void decode(md5_stats_t& m, ceph::bufferlist::const_iterator& bl)
-  {
-    DECODE_START(1, bl);
-    decode(m.ingress_failed_load_bucket, bl);
-    decode(m.ingress_failed_get_object, bl);
-    decode(m.ingress_failed_get_obj_attrs, bl);
-    decode(m.ingress_corrupted_etag, bl);
-    decode(m.ingress_corrupted_obj_attrs, bl);
-    decode(m.ingress_skip_encrypted, bl);
-    decode(m.ingress_skip_encrypted_bytes, bl);
-    decode(m.ingress_skip_compressed, bl);
-    decode(m.ingress_skip_compressed_bytes, bl);
-    decode(m.ingress_skip_changed_objs, bl);
-    decode(m.shared_manifest_dedup_bytes, bl);
-
-    decode(m.skipped_shared_manifest, bl);
-    decode(m.skipped_singleton, bl);
-    decode(m.skipped_singleton_bytes, bl);
-    decode(m.skipped_source_record, bl);
-    decode(m.duplicate_records, bl);
-    decode(m.size_mismatch, bl);
-    decode(m.sha256_mismatch, bl);
-    decode(m.failed_src_load, bl);
-    decode(m.failed_rec_load, bl);
-    decode(m.failed_block_load, bl);
-
-    decode(m.valid_sha256_attrs, bl);
-    decode(m.invalid_sha256_attrs, bl);
-    decode(m.set_sha256_attrs, bl);
-    decode(m.skip_sha256_cmp, bl);
-    decode(m.set_shared_manifest_src, bl);
-
-    decode(m.loaded_objects, bl);
-    decode(m.processed_objects, bl);
-    decode(m.singleton_count, bl);
-    decode(m.duplicate_count, bl);
-    decode(m.dedup_bytes_estimate, bl);
-    decode(m.unique_count, bl);
-    decode(m.deduped_objects, bl);
-    decode(m.deduped_objects_bytes, bl);
-    decode(m.failed_dedup, bl);
-    decode(m.failed_table_load, bl);
-    decode(m.failed_map_overflow, bl);
-
-    decode(m.duration, bl);
-    DECODE_FINISH(bl);
-  }
+  void encode(const md5_stats_t& m, ceph::bufferlist& bl);
+  void decode(md5_stats_t& m, ceph::bufferlist::const_iterator& bl);
 
   struct parsed_etag_t {
     uint64_t md5_high;  // High Bytes of the Object Data MD5
index 44338c0fea8130179785ed4361108b837c87647d..ed693b15f9d5d39c8845748289c4ba1cc62cee71 100644 (file)
@@ -284,6 +284,7 @@ void RGWZoneParams::decode_json(JSONObj *obj)
   RGWSystemMetaObj::decode_json(obj);
   JSONDecoder::decode_json("domain_root", domain_root, obj);
   JSONDecoder::decode_json("control_pool", control_pool, obj);
+  JSONDecoder::decode_json("dedup_pool", dedup_pool, obj);
   JSONDecoder::decode_json("gc_pool", gc_pool, obj);
   JSONDecoder::decode_json("lc_pool", lc_pool, obj);
   JSONDecoder::decode_json("log_pool", log_pool, obj);
@@ -311,6 +312,7 @@ void RGWZoneParams::dump(Formatter *f) const
   RGWSystemMetaObj::dump(f);
   encode_json("domain_root", domain_root, f);
   encode_json("control_pool", control_pool, f);
+  encode_json("dedup_pool", dedup_pool, f);
   encode_json("gc_pool", gc_pool, f);
   encode_json("lc_pool", lc_pool, f);
   encode_json("log_pool", log_pool, f);
@@ -472,6 +474,7 @@ void add_zone_pools(const RGWZoneParams& info,
 {
   pools.insert(info.domain_root);
   pools.insert(info.control_pool);
+  pools.insert(info.dedup_pool);
   pools.insert(info.gc_pool);
   pools.insert(info.log_pool);
   pools.insert(info.intent_log_pool);
@@ -1274,6 +1277,7 @@ int init_zone_pool_names(const DoutPrefixProvider *dpp, optional_yield y,
 {
   info.domain_root = fix_zone_pool_dup(pools, info.name, ".rgw.meta:root", info.domain_root);
   info.control_pool = fix_zone_pool_dup(pools, info.name, ".rgw.control", info.control_pool);
+  info.dedup_pool = fix_zone_pool_dup(pools, info.name, ".rgw.dedup", info.dedup_pool);
   info.gc_pool = fix_zone_pool_dup(pools, info.name, ".rgw.log:gc", info.gc_pool);
   info.lc_pool = fix_zone_pool_dup(pools, info.name, ".rgw.log:lc", info.lc_pool);
   info.log_pool = fix_zone_pool_dup(pools, info.name, ".rgw.log", info.log_pool);
index e7b344d3e94e1c97209897720c591a124c32dbec..9adbc77ec849f4c2b6afb39567632d9e3beb8a80 100644 (file)
@@ -3,5 +3,5 @@ markers =
   basic_test
 
 log_cli=true
-log_cli_level=WARNING
-#log_cli_level=INFO
+#log_cli_level=WARNING
+log_cli_level=INFO
index 26fb8da61a7a61ff711b1e8ac67fddeb97e0a40e..a339e25b6b417e14011e15685c2c39a8cf6b500d 100644 (file)
@@ -41,15 +41,21 @@ class Dedup_Stats:
     set_sha256: int = 0
     total_processed_objects: int = 0
     size_before_dedup: int = 0
-    loaded_objects: int = 0
+    #loaded_objects: int = 0
     set_shared_manifest_src : int = 0
     deduped_obj: int = 0
     singleton_obj : int = 0
     unique_obj : int = 0
     dedup_bytes_estimate : int = 0
     duplicate_obj : int = 0
+    dup_head_size_estimate : int = 0
+    dup_head_size : int = 0
     deduped_obj_bytes : int = 0
     non_default_storage_class_objs_bytes : int = 0
+    potential_singleton_obj : int = 0
+    potential_unique_obj : int = 0
+    potential_duplicate_obj : int = 0
+    potential_dedup_space : int = 0
 
 @dataclass
 class Dedup_Ratio:
@@ -71,7 +77,7 @@ test_path = os.path.normpath(os.path.dirname(os.path.realpath(__file__))) + '/..
 
 #-----------------------------------------------
 def bash(cmd, **kwargs):
-    #log.info('running command: %s', ' '.join(cmd))
+    #log.debug('running command: %s', ' '.join(cmd))
     kwargs['stdout'] = subprocess.PIPE
     process = subprocess.Popen(cmd, **kwargs)
     s = process.communicate()[0].decode('utf-8')
@@ -95,7 +101,7 @@ def gen_bucket_name():
 
     num_buckets += 1
     bucket_name = run_prefix + '-' + str(num_buckets)
-    log.info("bucket_name=%s", bucket_name);
+    log.debug("bucket_name=%s", bucket_name);
     return bucket_name
 
 #-----------------------------------------------
@@ -118,11 +124,11 @@ def close_all_connections():
     global g_simple_connection
 
     for conn in g_simple_connection:
-        log.info("close simple connection")
+        log.debug("close simple connection")
         conn.close()
 
     for conn in g_tenant_connections:
-        log.info("close tenant connection")
+        log.debug("close tenant connection")
         conn.close()
 
 #-----------------------------------------------
@@ -131,7 +137,7 @@ def get_connections(req_count):
     conns=[]
 
     for i in range(min(req_count, len(g_simple_connection))):
-        log.info("recycle existing connection")
+        log.debug("recycle existing connection")
         conns.append(g_simple_connection[i])
 
     if len(conns) < req_count:
@@ -145,7 +151,7 @@ def get_connections(req_count):
             scheme = 'http://'
 
         for i in range(req_count - len(conns)):
-            log.info("generate new connection")
+            log.debug("generate new connection")
             client = boto3.client('s3',
                                   endpoint_url=scheme+hostname+':'+str(port_no),
                                   aws_access_key_id=access_key,
@@ -194,7 +200,7 @@ def gen_connections_multi2(req_count):
     g_tenants=[]
     global num_conns
 
-    log.info("gen_connections_multi: Create connection and buckets ...")
+    log.debug("gen_connections_multi: Create connection and buckets ...")
     suffix=run_prefix
 
     tenants=[]
@@ -202,7 +208,7 @@ def gen_connections_multi2(req_count):
     conns=[]
 
     for i in range(min(req_count, len(g_tenants))):
-        log.info("recycle existing tenants connection")
+        log.debug("recycle existing tenants connection")
         conns.append(g_tenants_connection[i])
         tenants.append(g_tenants[i])
         # we need to create a new bucket as we remove existing buckets at cleanup
@@ -227,7 +233,7 @@ def gen_connections_multi2(req_count):
             g_tenant_connections.append(conn)
             conns.append(conn)
 
-    log.info("gen_connections_multi: All connection and buckets are set")
+    log.debug("gen_connections_multi: All connection and buckets are set")
     return (tenants, bucket_names, conns)
 
 
@@ -238,7 +244,7 @@ def gen_connections_multi(num_tenants):
     tenants=[]
     bucket_names=[]
     conns=[]
-    log.info("gen_connections_multi: Create connection and buckets ...")
+    log.debug("gen_connections_multi: Create connection and buckets ...")
     suffix=run_prefix
     for i in range(0, num_tenants):
         num_conns += 1
@@ -254,7 +260,7 @@ def gen_connections_multi(num_tenants):
         bucket=conn.create_bucket(Bucket=bucket_name)
         conns.append(conn)
 
-    log.info("gen_connections_multi: All connection and buckets are set")
+    log.debug("gen_connections_multi: All connection and buckets are set")
     return (tenants, bucket_names, conns)
 
 
@@ -264,6 +270,7 @@ def gen_connections_multi(num_tenants):
 OUT_DIR="/tmp/dedup/"
 KB=(1024)
 MB=(1024*KB)
+POTENTIAL_OBJ_SIZE=(64*KB)
 RADOS_OBJ_SIZE=(4*MB)
 MULTIPART_SIZE=(16*MB)
 default_config = TransferConfig(multipart_threshold=MULTIPART_SIZE, multipart_chunksize=MULTIPART_SIZE)
@@ -282,9 +289,9 @@ def write_file(filename, size):
 #-------------------------------------------------------------------------------
 def print_size(caller, size):
     if (size < MB):
-        log.info("%s::size=%.2f KiB (%d Bytes)", caller, size/KB, size)
+        log.debug("%s::size=%.2f KiB (%d Bytes)", caller, size/KB, size)
     else:
-        log.info("%s::size=%.2f MiB", caller, size/MB)
+        log.debug("%s::size=%.2f MiB", caller, size/MB)
 
 
 #-------------------------------------------------------------------------------
@@ -366,13 +373,13 @@ def gen_files(files, start_size, factor, max_copies_count=4):
 def count_space_in_all_buckets():
     result = rados(['df'])
     assert result[1] == 0
-    log.info("=============================================")
+    log.debug("=============================================")
     for line in result[0].splitlines():
         if line.startswith(POOLNAME):
-            log.info(line[:45])
+            log.debug(line[:45])
         elif line.startswith("POOL_NAME"):
-            log.info(line[:45])
-            log.info("=============================================")
+            log.debug(line[:45])
+            log.debug("=============================================")
 
 
 #-------------------------------------------------------------------------------
@@ -381,7 +388,7 @@ def count_objects_in_bucket(bucket_name, conn):
     marker=""
     obj_count=0
     while True:
-        log.info("bucket_name=%s", bucket_name)
+        log.debug("bucket_name=%s", bucket_name)
         listing=conn.list_objects(Bucket=bucket_name, Marker=marker, MaxKeys=max_keys)
         if 'Contents' not in listing or len(listing['Contents'])== 0:
             return 0
@@ -390,7 +397,7 @@ def count_objects_in_bucket(bucket_name, conn):
 
         if listing['IsTruncated']:
             marker=listing['NextMarker']
-            log.info("marker=%s, obj_count=%d", marker, obj_count)
+            log.debug("marker=%s, obj_count=%d", marker, obj_count)
             continue
         else:
             return obj_count
@@ -417,11 +424,11 @@ def count_object_parts_in_all_buckets(verbose=False):
     names=result[0].split()
     count = 0
     for name in names:
-        #log.info(name)
+        #log.debug(name)
         count = count + 1
 
     if verbose:
-        log.info("Pool has %d rados objects", count)
+        log.debug("Pool has %d rados objects", count)
 
     return count
 
@@ -444,7 +451,7 @@ def delete_bucket_with_all_objects(bucket_name, conn):
     while True:
         listing=conn.list_objects(Bucket=bucket_name, Marker=marker, MaxKeys=max_keys)
         if 'Contents' not in listing or len(listing['Contents'])== 0:
-            log.info("Bucket '%s' is empty, skipping...", bucket_name)
+            log.debug("Bucket '%s' is empty, skipping...", bucket_name)
             return
 
         objects=[]
@@ -457,7 +464,7 @@ def delete_bucket_with_all_objects(bucket_name, conn):
         conn.delete_objects(Bucket=bucket_name, Delete={'Objects':objects})
         if listing['IsTruncated']:
             marker=listing['NextMarker']
-            log.info("marker=%s, obj_count=%d", marker, obj_count)
+            log.debug("marker=%s, obj_count=%d", marker, obj_count)
             continue
         else:
             break
@@ -533,6 +540,7 @@ def calc_rados_obj_count(num_copies, obj_size, config):
 
 #-------------------------------------------------------------------------------
 def calc_dedupable_space(obj_size, config):
+    dup_head_size=0
     threshold = config.multipart_threshold
     # Objects with size bigger than MULTIPART_SIZE are uploaded as multi-part
     # multi-part objects got a zero size Head objects
@@ -540,12 +548,13 @@ def calc_dedupable_space(obj_size, config):
         dedupable_space = obj_size
     elif obj_size > RADOS_OBJ_SIZE:
         dedupable_space = obj_size - RADOS_OBJ_SIZE
+        dup_head_size = RADOS_OBJ_SIZE
     else:
         dedupable_space = 0
 
     log.debug("obj_size=%.2f MiB, dedupable_space=%.2f MiB",
               float(obj_size)/MB, float(dedupable_space)/MB)
-    return dedupable_space
+    return (dedupable_space, dup_head_size)
 
 BLOCK_SIZE=4096
 #-------------------------------------------------------------------------------
@@ -555,6 +564,7 @@ def calc_on_disk_byte_size(byte_size):
 
 #-------------------------------------------------------------------------------
 def calc_expected_stats(dedup_stats, obj_size, num_copies, config):
+    dups_count = (num_copies - 1)
     on_disk_byte_size = calc_on_disk_byte_size(obj_size)
     log.debug("obj_size=%d, on_disk_byte_size=%d", obj_size, on_disk_byte_size)
     threshold = config.multipart_threshold
@@ -563,10 +573,19 @@ def calc_expected_stats(dedup_stats, obj_size, num_copies, config):
     if on_disk_byte_size <= RADOS_OBJ_SIZE and threshold > RADOS_OBJ_SIZE:
         dedup_stats.skip_too_small += num_copies
         dedup_stats.skip_too_small_bytes += (on_disk_byte_size * num_copies)
+
+        if on_disk_byte_size >= POTENTIAL_OBJ_SIZE:
+            if num_copies == 1:
+                dedup_stats.potential_singleton_obj += 1
+            else:
+                dedup_stats.potential_unique_obj += 1
+                dedup_stats.potential_duplicate_obj += dups_count
+                dedup_stats.potential_dedup_space += (on_disk_byte_size * dups_count)
+
         return
 
     dedup_stats.total_processed_objects += num_copies
-    dedup_stats.loaded_objects += num_copies
+    #dedup_stats.loaded_objects += num_copies
 
     if num_copies == 1:
         dedup_stats.singleton_obj += 1
@@ -578,11 +597,14 @@ def calc_expected_stats(dedup_stats, obj_size, num_copies, config):
         dedup_stats.set_sha256 += num_copies
         dedup_stats.invalid_sha256 += num_copies
         dedup_stats.unique_obj += 1
-        dups_count = (num_copies - 1)
         dedup_stats.duplicate_obj += dups_count
         dedup_stats.deduped_obj += dups_count
-        deduped_obj_bytes=calc_dedupable_space(on_disk_byte_size, config)
+        ret=calc_dedupable_space(on_disk_byte_size, config)
+        deduped_obj_bytes=ret[0]
+        dup_head_size=ret[1]
         dedup_stats.deduped_obj_bytes += (deduped_obj_bytes * dups_count)
+        dedup_stats.dup_head_size += (dup_head_size * dups_count)
+        dedup_stats.dup_head_size_estimate += (dup_head_size * dups_count)
         deduped_block_bytes=((deduped_obj_bytes+BLOCK_SIZE-1)//BLOCK_SIZE)*BLOCK_SIZE
         dedup_stats.dedup_bytes_estimate += (deduped_block_bytes * dups_count)
 
@@ -626,7 +648,9 @@ def upload_objects(bucket_name, files, indices, conn, config, check_obj_count=Tr
         assert(obj_size)
         calc_expected_stats(dedup_stats, obj_size, num_copies, config)
         total_space += (obj_size * num_copies)
-        dedupable_space=calc_dedupable_space(obj_size, config)
+        ret=calc_dedupable_space(obj_size, config)
+        dedupable_space=ret[0]
+        dup_head_size=ret[1]
         duplicated_space += ((num_copies-1) * dedupable_space)
         rados_obj_count=calc_rados_obj_count(num_copies, obj_size, config)
         rados_objects_total += (rados_obj_count * num_copies)
@@ -634,25 +658,25 @@ def upload_objects(bucket_name, files, indices, conn, config, check_obj_count=Tr
         log.debug("upload_objects::%s::size=%d, num_copies=%d", filename, obj_size, num_copies);
         s3_objects_total += num_copies
         if s3_objects_total and (s3_objects_total % 1000 == 0):
-            log.info("%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
+            log.debug("%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
                      s3_objects_total, rados_objects_total, total_space/MB)
         for i in range(idx, num_copies):
             key = gen_object_name(filename, i)
-            #log.info("upload_file %s/%s with crc32", bucket_name, key)
+            #log.debug("upload_file %s/%s with crc32", bucket_name, key)
             conn.upload_file(OUT_DIR + filename, bucket_name, key, Config=config, ExtraArgs={'ChecksumAlgorithm': 'crc32'})
 
     log.debug("==========================================")
-    log.info("Summery:\n%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
+    log.debug("Summery:\n%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
              s3_objects_total, rados_objects_total, total_space/MB)
     log.debug("Based on calculation we should have %d rados objects", rados_objects_total)
     log.debug("Based on calculation we should have %d duplicated tail objs", duplicated_tail_objs)
-    log.info("Based on calculation we should have %.2f MiB total in pool", total_space/MB)
-    log.info("Based on calculation we should have %.2f MiB duplicated space in pool", duplicated_space/MB)
+    log.debug("Based on calculation we should have %.2f MiB total in pool", total_space/MB)
+    log.debug("Based on calculation we should have %.2f MiB duplicated space in pool", duplicated_space/MB)
 
     expected_rados_obj_count_post_dedup=(rados_objects_total-duplicated_tail_objs)
     log.debug("Post dedup expcted rados obj count = %d", expected_rados_obj_count_post_dedup)
     expcted_space_post_dedup=(total_space-duplicated_space)
-    log.info("Post dedup expcted data in pool = %.2f MiB", expcted_space_post_dedup/MB)
+    log.debug("Post dedup expcted data in pool = %.2f MiB", expcted_space_post_dedup/MB)
     if check_obj_count:
         assert rados_objects_total == count_object_parts_in_all_buckets()
 
@@ -676,7 +700,9 @@ def upload_objects_multi(files, conns, bucket_names, indices, config, check_obj_
         assert(obj_size)
         calc_expected_stats(dedup_stats, obj_size, num_copies, config)
         total_space += (obj_size * num_copies)
-        dedupable_space=calc_dedupable_space(obj_size, config)
+        ret=calc_dedupable_space(obj_size, config)
+        dedupable_space=ret[0]
+        dup_head_size=ret[1]
         duplicated_space += ((num_copies-1) * dedupable_space)
         rados_obj_count=calc_rados_obj_count(num_copies, obj_size, config)
         rados_objects_total += (rados_obj_count * num_copies)
@@ -684,7 +710,7 @@ def upload_objects_multi(files, conns, bucket_names, indices, config, check_obj_
         log.debug("upload_objects::%s::size=%d, num_copies=%d", filename, obj_size, num_copies);
         s3_objects_total += num_copies
         if s3_objects_total and (s3_objects_total % 1000 == 0):
-            log.info("%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
+            log.debug("%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
                      s3_objects_total, rados_objects_total, total_space/MB)
         for i in range(idx, num_copies):
             ten_id = i % max_tenants
@@ -693,7 +719,7 @@ def upload_objects_multi(files, conns, bucket_names, indices, config, check_obj_
             log.debug("upload_objects::<%s/%s>", bucket_names[ten_id], key)
 
     log.debug("==========================================")
-    log.info("Summery:%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
+    log.debug("Summery:%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
              s3_objects_total, rados_objects_total, total_space/MB)
     log.debug("Based on calculation we should have %d rados objects", rados_objects_total)
     log.debug("Based on calculation we should have %d duplicated tail objs", duplicated_tail_objs)
@@ -704,7 +730,7 @@ def upload_objects_multi(files, conns, bucket_names, indices, config, check_obj_
     for (bucket_name, conn) in zip(bucket_names, conns):
         s3_object_count += count_objects_in_bucket(bucket_name, conn)
 
-    log.info("bucket listings reported a total of %d s3 objects", s3_object_count)
+    log.debug("bucket listings reported a total of %d s3 objects", s3_object_count)
     expected_rados_obj_count_post_dedup=(rados_objects_total-duplicated_tail_objs)
     log.debug("Post dedup expcted rados obj count = %d", expected_rados_obj_count_post_dedup)
     expcted_space_post_dedup=(total_space-duplicated_space)
@@ -732,7 +758,7 @@ def proc_upload(proc_id, num_procs, files, conn, bucket_name, indices, config):
             if (proc_id == target_proc):
                 key = gen_object_name(filename, i)
                 conn.upload_file(OUT_DIR+filename, bucket_name, key, Config=config)
-                log.info("[%d]upload_objects::<%s/%s>", proc_id, bucket_name, key)
+                log.debug("[%d]upload_objects::<%s/%s>", proc_id, bucket_name, key)
 
 
 #---------------------------------------------------------------------------
@@ -759,7 +785,9 @@ def procs_upload_objects(files, conns, bucket_names, indices, config, check_obj_
         assert(obj_size)
         calc_expected_stats(dedup_stats, obj_size, num_copies, config)
         total_space += (obj_size * num_copies)
-        dedupable_space=calc_dedupable_space(obj_size, config)
+        ret=calc_dedupable_space(obj_size, config)
+        dedupable_space=ret[0]
+        dup_head_size=ret[1]
         duplicated_space += ((num_copies-1) * dedupable_space)
         rados_obj_count=calc_rados_obj_count(num_copies, obj_size, config)
         rados_objects_total += (rados_obj_count * num_copies)
@@ -772,7 +800,7 @@ def procs_upload_objects(files, conns, bucket_names, indices, config, check_obj_
         proc_list[idx].join()
 
     log.debug("==========================================")
-    log.info("Summery:%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
+    log.debug("Summery:%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
              s3_objects_total, rados_objects_total, total_space/MB)
     log.debug("Based on calculation we should have %d rados objects", rados_objects_total)
     log.debug("Based on calculation we should have %d duplicated tail objs", duplicated_tail_objs)
@@ -783,7 +811,7 @@ def procs_upload_objects(files, conns, bucket_names, indices, config, check_obj_
     for (bucket_name, conn) in zip(bucket_names, conns):
         s3_object_count += count_objects_in_bucket(bucket_name, conn)
 
-    log.info("bucket listings reported a total of %d s3 objects", s3_object_count)
+    log.debug("bucket listings reported a total of %d s3 objects", s3_object_count)
     expected_rados_obj_count_post_dedup=(rados_objects_total-duplicated_tail_objs)
     log.debug("Post dedup expcted rados obj count = %d", expected_rados_obj_count_post_dedup)
     expcted_space_post_dedup=(total_space-duplicated_space)
@@ -806,7 +834,7 @@ def verify_objects(bucket_name, files, conn, expected_results, config):
         log.debug("comparing file=%s, size=%d, copies=%d", filename, obj_size, num_copies)
         for i in range(0, num_copies):
             key = gen_object_name(filename, i)
-            #log.info("download_file(%s) with crc32", key)
+            #log.debug("download_file(%s) with crc32", key)
             conn.download_file(bucket_name, key, tempfile, Config=config, ExtraArgs={'ChecksumMode': 'crc32'})
             #conn.download_file(bucket_name, key, tempfile, Config=config)
             result = bash(['cmp', tempfile, OUT_DIR + filename])
@@ -814,7 +842,7 @@ def verify_objects(bucket_name, files, conn, expected_results, config):
             os.remove(tempfile)
 
     assert expected_results == count_object_parts_in_all_buckets(True)
-    log.info("verify_objects::completed successfully!!")
+    log.debug("verify_objects::completed successfully!!")
 
 
 #-------------------------------------------------------------------------------
@@ -836,7 +864,7 @@ def verify_objects_multi(files, conns, bucket_names, expected_results, config):
             os.remove(tempfile)
 
     assert expected_results == count_object_parts_in_all_buckets(True)
-    log.info("verify_objects::completed successfully!!")
+    log.debug("verify_objects::completed successfully!!")
 
 
 #-------------------------------------------------------------------------------
@@ -847,13 +875,13 @@ def thread_verify(thread_id, num_threads, files, conn, bucket, config):
         filename=f[0]
         obj_size=f[1]
         num_copies=f[2]
-        log.info("comparing file=%s, size=%d, copies=%d", filename, obj_size, num_copies)
+        log.debug("comparing file=%s, size=%d, copies=%d", filename, obj_size, num_copies)
         for i in range(0, num_copies):
             target_thread = count % num_threads
             count += 1
             if thread_id == target_thread:
                 key = gen_object_name(filename, i)
-                log.info("comparing object %s with file %s", key, filename)
+                log.debug("comparing object %s with file %s", key, filename)
                 conn.download_file(bucket, key, tempfile, Config=config)
                 result = bash(['cmp', tempfile, OUT_DIR + filename])
                 assert result[1] == 0 ,"Files %s and %s differ!!" % (key, tempfile)
@@ -876,7 +904,7 @@ def threads_verify_objects(files, conns, bucket_names, expected_results, config)
         thread_list[idx].join()
 
     assert expected_results == count_object_parts_in_all_buckets(True)
-    log.info("verify_objects::completed successfully!!")
+    log.debug("verify_objects::completed successfully!!")
 
 
 #-------------------------------------------------------------------------------
@@ -903,6 +931,7 @@ def reset_full_dedup_stats(dedup_stats):
     dedup_stats.total_processed_objects = 0
     dedup_stats.set_shared_manifest_src = 0
     dedup_stats.deduped_obj = 0
+    dedup_stats.dup_head_size = 0
     dedup_stats.deduped_obj_bytes = 0
     dedup_stats.skip_shared_manifest = 0
     dedup_stats.skip_src_record = 0
@@ -959,7 +988,7 @@ def read_dedup_ratio(json):
     dedup_ratio.s3_bytes_after=json['s3_bytes_after']
     dedup_ratio.ratio=json['dedup_ratio']
 
-    log.info("Completed! ::ratio=%f", dedup_ratio.ratio)
+    log.debug("Completed! ::ratio=%f", dedup_ratio.ratio)
     return dedup_ratio
 
 #-------------------------------------------------------------------------------
@@ -975,10 +1004,10 @@ def verify_dedup_ratio(expected_dedup_stats, dedup_ratio):
     else:
         ratio = 0
 
-    log.info("s3_bytes_before = %d/%d", s3_bytes_before, dedup_ratio.s3_bytes_before)
-    log.info("s3_dedup_bytes = %d", expected_dedup_stats.dedup_bytes_estimate);
-    log.info("s3_bytes_after = %d/%d", s3_bytes_after, dedup_ratio.s3_bytes_after)
-    log.info("ratio = %f/%f", ratio, dedup_ratio.ratio)
+    log.debug("s3_bytes_before = %d/%d", s3_bytes_before, dedup_ratio.s3_bytes_before)
+    log.debug("s3_dedup_bytes = %d", expected_dedup_stats.dedup_bytes_estimate);
+    log.debug("s3_bytes_after = %d/%d", s3_bytes_after, dedup_ratio.s3_bytes_after)
+    log.debug("ratio = %f/%f", ratio, dedup_ratio.ratio)
 
     assert s3_bytes_before == dedup_ratio.s3_bytes_before
     assert s3_bytes_after == dedup_ratio.s3_bytes_after
@@ -1013,7 +1042,7 @@ def read_dedup_stats(dry_run):
     if key in jstats:
         md5_stats=jstats[key]
         main=md5_stats['main']
-        dedup_stats.loaded_objects = main['Loaded objects']
+        #dedup_stats.loaded_objects = main['Loaded objects']
         if dry_run == False:
             read_full_dedup_stats(dedup_stats, md5_stats)
 
@@ -1022,19 +1051,27 @@ def read_dedup_stats(dry_run):
         dedup_stats.duplicate_obj = main['Duplicate Obj']
         dedup_stats.dedup_bytes_estimate = main['Dedup Bytes Estimate']
 
+        potential = md5_stats['Potential Dedup']
+        dedup_stats.dup_head_size_estimate = potential['Duplicated Head Bytes Estimate']
+        dedup_stats.dup_head_size = potential['Duplicated Head Bytes']
+        dedup_stats.potential_singleton_obj = potential['Singleton Obj (64KB-4MB)']
+        dedup_stats.potential_unique_obj = potential['Unique Obj (64KB-4MB)']
+        dedup_stats.potential_duplicate_obj = potential['Duplicate Obj (64KB-4MB)']
+        dedup_stats.potential_dedup_space = potential['Dedup Bytes Estimate (64KB-4MB)']
+
     dedup_work_was_completed=jstats['completed']
     if dedup_work_was_completed:
         dedup_ratio_estimate=read_dedup_ratio(jstats['dedup_ratio_estimate'])
         dedup_ratio_actual=read_dedup_ratio(jstats['dedup_ratio_actual'])
     else:
-        log.info("Uncompleted!")
+        log.debug("Uncompleted!")
 
     return (dedup_work_was_completed, dedup_stats, dedup_ratio_estimate, dedup_ratio_actual)
 
 
 #-------------------------------------------------------------------------------
 def exec_dedup_internal(expected_dedup_stats, dry_run, max_dedup_time):
-    log.info("sending exec_dedup request: dry_run=%d", dry_run)
+    log.debug("sending exec_dedup request: dry_run=%d", dry_run)
     if dry_run:
         result = admin(['dedup', 'estimate'])
         reset_full_dedup_stats(expected_dedup_stats)
@@ -1042,7 +1079,7 @@ def exec_dedup_internal(expected_dedup_stats, dry_run, max_dedup_time):
         result = admin(['dedup', 'restart'])
 
     assert result[1] == 0
-    log.info("wait for dedup to complete")
+    log.debug("wait for dedup to complete")
 
     dedup_time = 0
     dedup_timeout = 5
@@ -1080,16 +1117,20 @@ def exec_dedup(expected_dedup_stats, dry_run, verify_stats=True):
     if verify_stats == False:
         return ret
 
+    if dedup_stats.potential_unique_obj or expected_dedup_stats.potential_unique_obj:
+        log.debug("potential_unique_obj= %d / %d ", dedup_stats.potential_unique_obj,
+                  expected_dedup_stats.potential_unique_obj)
+
     #dedup_stats.set_sha256 = dedup_stats.invalid_sha256
     if dedup_stats != expected_dedup_stats:
-        log.info("==================================================")
+        log.debug("==================================================")
         print_dedup_stats_diff(dedup_stats, expected_dedup_stats)
-        print_dedup_stats(dedup_stats)
-        log.info("==================================================\n")
+        #print_dedup_stats(dedup_stats)
+        log.debug("==================================================\n")
         assert dedup_stats == expected_dedup_stats
 
     verify_dedup_ratio(expected_dedup_stats, dedup_ratio_estimate)
-    log.info("expcted_dedup::stats check completed successfully!!")
+    log.debug("expcted_dedup::stats check completed successfully!!")
     return ret
 
 
@@ -1104,6 +1145,13 @@ def prepare_test():
 
     os.mkdir(OUT_DIR)
 
+#-------------------------------------------------------------------------------
+def copy_potential_stats(new_dedup_stats, dedup_stats):
+    new_dedup_stats.potential_singleton_obj = dedup_stats.potential_singleton_obj
+    new_dedup_stats.potential_unique_obj    = dedup_stats.potential_unique_obj
+    new_dedup_stats.potential_duplicate_obj = dedup_stats.potential_duplicate_obj
+    new_dedup_stats.potential_dedup_space   = dedup_stats.potential_dedup_space
+
 
 #-------------------------------------------------------------------------------
 def small_single_part_objs_dedup(conn, bucket_name, dry_run):
@@ -1115,13 +1163,13 @@ def small_single_part_objs_dedup(conn, bucket_name, dry_run):
     prepare_test()
     try:
         files=[]
-        num_files = 10
+        num_files = 8
         base_size = 4*KB
-        log.info("generate files: base size=%d KiB, max_size=%d KiB",
-                 base_size/KB, (pow(2, num_files) * base_size)/KB)
+        log.debug("generate files: base size=%d KiB, max_size=%d KiB",
+                  base_size/KB, (pow(2, num_files) * base_size)/KB)
         gen_files(files, base_size, num_files)
         bucket = conn.create_bucket(Bucket=bucket_name)
-        log.info("upload objects to bucket <%s> ...", bucket_name)
+        log.debug("upload objects to bucket <%s> ...", bucket_name)
         indices = [0] * len(files)
         ret = upload_objects(bucket_name, files, indices, conn, default_config)
         expected_results = ret[0]
@@ -1130,6 +1178,8 @@ def small_single_part_objs_dedup(conn, bucket_name, dry_run):
 
         # expected stats for small objects - all zeros except for skip_too_small
         small_objs_dedup_stats = Dedup_Stats()
+        #small_objs_dedup_stats.loaded_objects=dedup_stats.loaded_objects
+        copy_potential_stats(small_objs_dedup_stats, dedup_stats)
         small_objs_dedup_stats.size_before_dedup = dedup_stats.size_before_dedup
         small_objs_dedup_stats.skip_too_small_bytes=dedup_stats.size_before_dedup
         small_objs_dedup_stats.skip_too_small = s3_objects_total
@@ -1137,7 +1187,7 @@ def small_single_part_objs_dedup(conn, bucket_name, dry_run):
 
         exec_dedup(dedup_stats, dry_run)
         if dry_run == False:
-            log.info("Verify all objects")
+            log.debug("Verify all objects")
             verify_objects(bucket_name, files, conn, expected_results, default_config)
 
     finally:
@@ -1167,16 +1217,17 @@ def simple_dedup(conn, files, bucket_name, run_cleanup_after, config, dry_run):
     # 9) call GC to make sure everything was removed
     #10) verify that there is nothing left on pool (i.e. ref-count is working)
     try:
-        log.info("conn.create_bucket(%s)", bucket_name)
+        log.debug("conn.create_bucket(%s)", bucket_name)
         bucket = conn.create_bucket(Bucket=bucket_name)
         indices = [0] * len(files)
-        log.info("upload objects to bucket <%s> ...", bucket_name)
+        log.debug("upload objects to bucket <%s> ...", bucket_name)
         ret = upload_objects(bucket_name, files, indices, conn, config)
         expected_results = ret[0]
         dedup_stats = ret[1]
+
         exec_dedup(dedup_stats, dry_run)
         if dry_run == False:
-            log.info("Verify all objects")
+            log.debug("Verify all objects")
             verify_objects(bucket_name, files, conn, expected_results, config)
 
         return ret
@@ -1194,7 +1245,7 @@ def simple_dedup_with_tenants(files, conns, bucket_names, config, dry_run=False)
     dedup_stats = ret[1]
     exec_dedup(dedup_stats, dry_run)
     if dry_run == False:
-        log.info("Verify all objects")
+        log.debug("Verify all objects")
         verify_objects_multi(files, conns, bucket_names, expected_results, config)
 
     return ret
@@ -1228,14 +1279,14 @@ def threads_simple_dedup_with_tenants(files, conns, bucket_names, config, dry_ru
     exec_time_sec=exec_ret[0]
     verify_time_sec=0
     if dry_run == False:
-        log.info("Verify all objects")
+        log.debug("Verify all objects")
         start = time.time_ns()
         threads_verify_objects(files, conns, bucket_names,
                                expected_results, config)
         verify_time_sec = (time.time_ns() - start)  / (1000*1000*1000)
 
     log.info("[%d] obj_count=%d, upload=%d(sec), exec=%d(sec), verify=%d(sec)",
-                len(conns), s3_objects_total, upload_time_sec, exec_time_sec, verify_time_sec);
+             len(conns), s3_objects_total, upload_time_sec, exec_time_sec, verify_time_sec);
     return upload_ret
 
 
@@ -1256,15 +1307,15 @@ def threads_dedup_basic_with_tenants_common(files, num_conns, config, dry_run):
 def check_full_dedup_state():
     global full_dedup_state_was_checked
     global full_dedup_state_disabled
-    log.info("check_full_dedup_state:: sending FULL Dedup request")
+    log.debug("check_full_dedup_state:: sending FULL Dedup request")
     result = admin(['dedup', 'restart'])
     if result[1] == 0:
-        log.info("full dedup is enabled!")
+        log.debug("full dedup is enabled!")
         full_dedup_state_disabled = False
         result = admin(['dedup', 'abort'])
         assert result[1] == 0
     else:
-        log.info("full dedup is disabled, skip all full dedup tests")
+        log.debug("full dedup is disabled, skip all full dedup tests")
         full_dedup_state_disabled = True
 
     full_dedup_state_was_checked = True
@@ -1280,7 +1331,7 @@ def full_dedup_is_disabled():
         full_dedup_state_disabled = check_full_dedup_state()
 
     if full_dedup_state_disabled:
-        log.info("Full Dedup is DISABLED, skipping test...")
+        log.debug("Full Dedup is DISABLED, skipping test...")
 
     return full_dedup_state_disabled
 
@@ -1338,15 +1389,15 @@ def gen_new_etag(etag, corruption, expected_dedup_stats):
 
 #------------------------------------------------------------------------------
 def corrupt_etag(key, corruption, expected_dedup_stats):
-    log.info("key=%s, corruption=%s", key, corruption);
+    log.debug("key=%s, corruption=%s", key, corruption);
     result = rados(['ls', '-p ', POOLNAME])
     assert result[1] == 0
 
     names=result[0].split()
     for name in names:
-        log.info("name=%s", name)
+        log.debug("name=%s", name)
         if key in name:
-            log.info("key=%s is a substring of name=%s", key, name);
+            log.debug("key=%s is a substring of name=%s", key, name);
             rados_name = name
             break;
 
@@ -1356,7 +1407,7 @@ def corrupt_etag(key, corruption, expected_dedup_stats):
 
     new_etag=gen_new_etag(old_etag, corruption, expected_dedup_stats)
 
-    log.info("Corruption:: %s\nold_etag=%s\nnew_etag=%s",
+    log.debug("Corruption:: %s\nold_etag=%s\nnew_etag=%s",
              corruption, old_etag, new_etag)
     change_object_etag(rados_name, new_etag)
     return (rados_name, old_etag)
@@ -1370,7 +1421,7 @@ def test_dedup_etag_corruption():
         return
 
     bucket_name = gen_bucket_name()
-    log.info("test_dedup_etag_corruption: connect to AWS ...")
+    log.debug("test_dedup_etag_corruption: connect to AWS ...")
     conn=get_single_connection()
     prepare_test()
     try:
@@ -1457,7 +1508,7 @@ def test_md5_collisions():
         write_bin_file(files, s2_bin, "s2")
 
         bucket_name = gen_bucket_name()
-        log.info("test_md5_collisions: connect to AWS ...")
+        log.debug("test_md5_collisions: connect to AWS ...")
         config2=TransferConfig(multipart_threshold=64, multipart_chunksize=1*MB)
         conn=get_single_connection()
         bucket = conn.create_bucket(Bucket=bucket_name)
@@ -1467,7 +1518,7 @@ def test_md5_collisions():
         dedup_stats = Dedup_Stats()
         # we wrote 2 different small objects (BLOCK_SIZE) with the same md5
         dedup_stats.total_processed_objects=2
-        dedup_stats.loaded_objects=dedup_stats.total_processed_objects
+        #dedup_stats.loaded_objects=dedup_stats.total_processed_objects
         # the objects will seem like a duplications with 1 unique and 1 duplicate
         dedup_stats.unique_obj=1
         dedup_stats.duplicate_obj=1
@@ -1487,7 +1538,7 @@ def test_md5_collisions():
         expected_ratio_actual.ratio=0
 
         dry_run=False
-        log.info("test_md5_collisions: first call to exec_dedup")
+        log.debug("test_md5_collisions: first call to exec_dedup")
         ret=exec_dedup(dedup_stats, dry_run)
         dedup_ratio_actual=ret[3]
 
@@ -1497,7 +1548,7 @@ def test_md5_collisions():
         dedup_stats.invalid_sha256=0
         dedup_stats.set_sha256=0
 
-        log.info("test_md5_collisions: second call to exec_dedup")
+        log.debug("test_md5_collisions: second call to exec_dedup")
         ret=exec_dedup(dedup_stats, dry_run)
         dedup_ratio_actual=ret[3]
 
@@ -1517,7 +1568,7 @@ def test_dedup_small():
         return
 
     bucket_name = gen_bucket_name()
-    log.info("test_dedup_small: connect to AWS ...")
+    log.debug("test_dedup_small: connect to AWS ...")
     conn=get_single_connection()
     small_single_part_objs_dedup(conn, bucket_name, False)
 
@@ -1535,7 +1586,7 @@ def test_dedup_small_with_tenants():
     files=[]
     num_files=10 # [4KB-4MB]
     base_size = 4*KB
-    log.info("generate files: base size=%d KiB, max_size=%d KiB",
+    log.debug("generate files: base size=%d KiB, max_size=%d KiB",
              base_size/KB, (pow(2, num_files) * base_size)/KB)
     try:
         gen_files(files, base_size, num_files, max_copies_count)
@@ -1552,6 +1603,8 @@ def test_dedup_small_with_tenants():
 
         # expected stats for small objects - all zeros except for skip_too_small
         small_objs_dedup_stats = Dedup_Stats()
+        #small_objs_dedup_stats.loaded_objects=dedup_stats.loaded_objects
+        copy_potential_stats(small_objs_dedup_stats, dedup_stats)
         small_objs_dedup_stats.size_before_dedup=dedup_stats.size_before_dedup
         small_objs_dedup_stats.skip_too_small_bytes=dedup_stats.size_before_dedup
         small_objs_dedup_stats.skip_too_small=s3_objects_total
@@ -1559,7 +1612,7 @@ def test_dedup_small_with_tenants():
 
         dry_run=False
         exec_dedup(dedup_stats, dry_run)
-        log.info("Verify all objects")
+        log.debug("Verify all objects")
         verify_objects_multi(files, conns, bucket_names, expected_results, default_config)
     finally:
         # cleanup must be executed even after a failure
@@ -1580,7 +1633,7 @@ def test_dedup_inc_0_with_tenants():
         return
 
     prepare_test()
-    log.info("test_dedup_inc_0: connect to AWS ...")
+    log.debug("test_dedup_inc_0: connect to AWS ...")
     max_copies_count=3
     config=default_config
     ret=gen_connections_multi2(max_copies_count)
@@ -1598,6 +1651,7 @@ def test_dedup_inc_0_with_tenants():
         s3_objects_total = ret[2]
 
         dedup_stats2 = dedup_stats
+        dedup_stats2.dup_head_size = 0
         dedup_stats2.skip_shared_manifest=dedup_stats.deduped_obj
         dedup_stats2.skip_src_record=dedup_stats.set_shared_manifest_src
         dedup_stats2.set_shared_manifest_src=0
@@ -1607,7 +1661,7 @@ def test_dedup_inc_0_with_tenants():
         dedup_stats2.invalid_sha256=0
         dedup_stats2.set_sha256=0
 
-        log.info("test_dedup_inc_0_with_tenants: incremental dedup:")
+        log.debug("test_dedup_inc_0_with_tenants: incremental dedup:")
         # run dedup again and make sure nothing has changed
         dry_run=False
         exec_dedup(dedup_stats2, dry_run)
@@ -1633,7 +1687,7 @@ def test_dedup_inc_0():
     config=default_config
     prepare_test()
     bucket_name = gen_bucket_name()
-    log.info("test_dedup_inc_0: connect to AWS ...")
+    log.debug("test_dedup_inc_0: connect to AWS ...")
     conn=get_single_connection()
     try:
         files=[]
@@ -1646,6 +1700,7 @@ def test_dedup_inc_0():
         s3_objects_total = ret[2]
 
         dedup_stats2 = dedup_stats
+        dedup_stats2.dup_head_size = 0
         dedup_stats2.skip_shared_manifest=dedup_stats.deduped_obj
         dedup_stats2.skip_src_record=dedup_stats.set_shared_manifest_src
         dedup_stats2.set_shared_manifest_src=0
@@ -1655,7 +1710,7 @@ def test_dedup_inc_0():
         dedup_stats2.invalid_sha256=0
         dedup_stats2.set_sha256=0
 
-        log.info("test_dedup_inc_0: incremental dedup:")
+        log.debug("test_dedup_inc_0: incremental dedup:")
         # run dedup again and make sure nothing has changed
         dry_run=False
         exec_dedup(dedup_stats2, dry_run)
@@ -1678,7 +1733,7 @@ def test_dedup_inc_1_with_tenants():
         return
 
     prepare_test()
-    log.info("test_dedup_inc_1_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_inc_1_with_tenants: connect to AWS ...")
     max_copies_count=6
     config=default_config
     ret=gen_connections_multi2(max_copies_count)
@@ -1713,6 +1768,7 @@ def test_dedup_inc_1_with_tenants():
         stats_combined=ret[1]
         stats_combined.skip_shared_manifest = stats_base.deduped_obj
         stats_combined.skip_src_record     -= stats_base.skip_src_record
+        stats_combined.dup_head_size       -= stats_base.dup_head_size
         stats_combined.skip_src_record     += stats_base.set_shared_manifest_src
 
         stats_combined.set_shared_manifest_src -= stats_base.set_shared_manifest_src
@@ -1723,7 +1779,7 @@ def test_dedup_inc_1_with_tenants():
         stats_combined.invalid_sha256 -= stats_base.set_sha256
         stats_combined.set_sha256     -= stats_base.set_sha256
 
-        log.info("test_dedup_inc_1_with_tenants: incremental dedup:")
+        log.debug("test_dedup_inc_1_with_tenants: incremental dedup:")
         # run dedup again
         dry_run=False
         exec_dedup(stats_combined, dry_run)
@@ -1748,7 +1804,7 @@ def test_dedup_inc_1():
     config=default_config
     prepare_test()
     bucket_name = gen_bucket_name()
-    log.info("test_dedup_inc_1: connect to AWS ...")
+    log.debug("test_dedup_inc_1: connect to AWS ...")
     conn=get_single_connection()
     try:
         files=[]
@@ -1776,6 +1832,7 @@ def test_dedup_inc_1():
         expected_results = ret[0]
         stats_combined = ret[1]
         stats_combined.skip_shared_manifest = stats_base.deduped_obj
+        stats_combined.dup_head_size       -= stats_base.dup_head_size
         stats_combined.skip_src_record     -= stats_base.skip_src_record
         stats_combined.skip_src_record     += stats_base.set_shared_manifest_src
 
@@ -1787,7 +1844,7 @@ def test_dedup_inc_1():
         stats_combined.invalid_sha256 -= stats_base.set_sha256
         stats_combined.set_sha256     -= stats_base.set_sha256
 
-        log.info("test_dedup_inc_1: incremental dedup:")
+        log.debug("test_dedup_inc_1: incremental dedup:")
         # run dedup again
         dry_run=False
         exec_dedup(stats_combined, dry_run)
@@ -1811,7 +1868,7 @@ def test_dedup_inc_2_with_tenants():
         return
 
     prepare_test()
-    log.info("test_dedup_inc_2_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_inc_2_with_tenants: connect to AWS ...")
     max_copies_count=6
     config=default_config
     ret=gen_connections_multi2(max_copies_count)
@@ -1853,6 +1910,7 @@ def test_dedup_inc_2_with_tenants():
         expected_results = ret[0]
         stats_combined = ret[1]
         stats_combined.skip_shared_manifest = stats_base.deduped_obj
+        stats_combined.dup_head_size       -= stats_base.dup_head_size
         stats_combined.skip_src_record     -= stats_base.skip_src_record
         stats_combined.skip_src_record     += stats_base.set_shared_manifest_src
 
@@ -1864,7 +1922,7 @@ def test_dedup_inc_2_with_tenants():
         stats_combined.invalid_sha256 -= stats_base.set_sha256
         stats_combined.set_sha256     -= stats_base.set_sha256
 
-        log.info("test_dedup_inc_2_with_tenants: incremental dedup:")
+        log.debug("test_dedup_inc_2_with_tenants: incremental dedup:")
         # run dedup again
         dry_run=False
         exec_dedup(stats_combined, dry_run)
@@ -1890,7 +1948,7 @@ def test_dedup_inc_2():
     config=default_config
     prepare_test()
     bucket_name = gen_bucket_name()
-    log.info("test_dedup_inc_2: connect to AWS ...")
+    log.debug("test_dedup_inc_2: connect to AWS ...")
     conn=get_single_connection()
     try:
         files=[]
@@ -1926,6 +1984,7 @@ def test_dedup_inc_2():
         stats_combined = ret[1]
         stats_combined.skip_shared_manifest = stats_base.deduped_obj
         stats_combined.skip_src_record     -= stats_base.skip_src_record
+        stats_combined.dup_head_size       -= stats_base.dup_head_size
         stats_combined.skip_src_record     += stats_base.set_shared_manifest_src
 
         stats_combined.set_shared_manifest_src -= stats_base.set_shared_manifest_src
@@ -1936,7 +1995,7 @@ def test_dedup_inc_2():
         stats_combined.invalid_sha256 -= stats_base.set_sha256
         stats_combined.set_sha256     -= stats_base.set_sha256
 
-        log.info("test_dedup_inc_2: incremental dedup:")
+        log.debug("test_dedup_inc_2: incremental dedup:")
         # run dedup again
         dry_run=False
         exec_dedup(stats_combined, dry_run)
@@ -1960,7 +2019,7 @@ def test_dedup_inc_with_remove_multi_tenants():
         return
 
     prepare_test()
-    log.info("test_dedup_inc_with_remove_multi_tenants: connect to AWS ...")
+    log.debug("test_dedup_inc_with_remove_multi_tenants: connect to AWS ...")
     max_copies_count=6
     config=default_config
     ret=gen_connections_multi2(max_copies_count)
@@ -2013,6 +2072,7 @@ def test_dedup_inc_with_remove_multi_tenants():
         # run dedup again
         dedup_stats.set_shared_manifest_src=0
         dedup_stats.deduped_obj=0
+        dedup_stats.dup_head_size=0
         dedup_stats.deduped_obj_bytes=0
         dedup_stats.skip_src_record=src_record
         dedup_stats.skip_shared_manifest=shared_manifest
@@ -2020,7 +2080,7 @@ def test_dedup_inc_with_remove_multi_tenants():
         dedup_stats.invalid_sha256=0
         dedup_stats.set_sha256=0
 
-        log.info("test_dedup_inc_with_remove: incremental dedup:")
+        log.debug("test_dedup_inc_with_remove_multi_tenants: incremental dedup:")
         dry_run=False
         exec_dedup(dedup_stats, dry_run)
         expected_results=calc_expected_results(files_sub, config)
@@ -2045,7 +2105,7 @@ def test_dedup_inc_with_remove():
     config=default_config
     prepare_test()
     bucket_name = gen_bucket_name()
-    log.info("test_dedup_inc_with_remove: connect to AWS ...")
+    log.debug("test_dedup_inc_with_remove: connect to AWS ...")
     conn=get_single_connection()
     try:
         files=[]
@@ -2086,7 +2146,7 @@ def test_dedup_inc_with_remove():
                 object_keys.append(key)
 
             if len(object_keys) == 0:
-                log.info("Skiping file=%s, num_remove=%d", filename, num_remove)
+                log.debug("Skipping file=%s, num_remove=%d", filename, num_remove)
                 continue
 
             response=conn.delete_objects(Bucket=bucket_name,
@@ -2099,6 +2159,7 @@ def test_dedup_inc_with_remove():
         # run dedup again
         dedup_stats.set_shared_manifest_src=0
         dedup_stats.deduped_obj=0
+        dedup_stats.dup_head_size=0
         dedup_stats.deduped_obj_bytes=0
         dedup_stats.skip_src_record=src_record
         dedup_stats.skip_shared_manifest=shared_manifest
@@ -2106,9 +2167,9 @@ def test_dedup_inc_with_remove():
         dedup_stats.invalid_sha256=0
         dedup_stats.set_sha256=0
 
-        log.info("test_dedup_inc_with_remove: incremental dedup:")
-        log.info("stats_base.size_before_dedup=%d", stats_base.size_before_dedup)
-        log.info("dedup_stats.size_before_dedup=%d", dedup_stats.size_before_dedup)
+        log.debug("test_dedup_inc_with_remove: incremental dedup:")
+        log.debug("stats_base.size_before_dedup=%d", stats_base.size_before_dedup)
+        log.debug("dedup_stats.size_before_dedup=%d", dedup_stats.size_before_dedup)
         dry_run=False
         exec_dedup(dedup_stats, dry_run)
         expected_results=calc_expected_results(files_sub, config)
@@ -2127,7 +2188,7 @@ def test_dedup_multipart_with_tenants():
         return
 
     prepare_test()
-    log.info("test_dedup_multipart_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_multipart_with_tenants: connect to AWS ...")
     max_copies_count=3
     num_files=8
     files=[]
@@ -2154,7 +2215,7 @@ def test_dedup_multipart():
 
     prepare_test()
     bucket_name = gen_bucket_name()
-    log.info("test_dedup_multipart: connect to AWS ...")
+    log.debug("test_dedup_multipart: connect to AWS ...")
     conn=get_single_connection()
     files=[]
 
@@ -2185,7 +2246,7 @@ def test_dedup_basic_with_tenants():
     num_files=23
     file_size=33*MB
     files=[]
-    log.info("test_dedup_basic_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_basic_with_tenants: connect to AWS ...")
     gen_files_fixed_size(files, num_files, file_size, max_copies_count)
     dedup_basic_with_tenants_common(files, max_copies_count, default_config, False)
 
@@ -2200,15 +2261,15 @@ def test_dedup_basic():
 
     prepare_test()
     bucket_name = gen_bucket_name()
-    log.info("test_dedup_basic: connect to AWS ...")
+    log.debug("test_dedup_basic: connect to AWS ...")
     conn=get_single_connection()
     files=[]
     num_files=5
     base_size = MULTIPART_SIZE
-    log.info("generate files: base size=%d MiB, max_size=%d MiB",
+    log.debug("generate files: base size=%d MiB, max_size=%d MiB",
              base_size/MB, (pow(2, num_files) * base_size)/MB)
     gen_files(files, base_size, num_files)
-    log.info("call simple_dedup()")
+    log.debug("call simple_dedup()")
     simple_dedup(conn, files, bucket_name, True, default_config, False)
 
 
@@ -2227,7 +2288,7 @@ def test_dedup_small_multipart_with_tenants():
     max_size=512*KB
     files=[]
     config=TransferConfig(multipart_threshold=min_size, multipart_chunksize=1*MB)
-    log.info("test_dedup_small_multipart_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_small_multipart_with_tenants: connect to AWS ...")
 
     # create files in range [4KB-512KB] aligned on 4KB
     gen_files_in_range(files, num_files, min_size, max_size, min_size)
@@ -2243,7 +2304,7 @@ def test_dedup_small_multipart():
         return
 
     prepare_test()
-    log.info("test_dedup_small_multipart: connect to AWS ...")
+    log.debug("test_dedup_small_multipart: connect to AWS ...")
     config2=TransferConfig(multipart_threshold=4*KB, multipart_chunksize=1*MB)
     conn=get_single_connection()
     files=[]
@@ -2261,7 +2322,7 @@ def test_dedup_small_multipart():
 #-------------------------------------------------------------------------------
 @pytest.mark.basic_test
 def test_dedup_large_scale_with_tenants():
-    #return
+    return  # NOTE(review): this commit disables test_dedup_large_scale_with_tenants — confirm intentional
 
     if full_dedup_is_disabled():
         return
@@ -2273,7 +2334,7 @@ def test_dedup_large_scale_with_tenants():
     size=1*KB
     files=[]
     config=TransferConfig(multipart_threshold=size, multipart_chunksize=1*MB)
-    log.info("test_dedup_large_scale_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_large_scale_with_tenants: connect to AWS ...")
     gen_files_fixed_size(files, num_files, size, max_copies_count)
     threads_dedup_basic_with_tenants_common(files, num_threads, config, False)
 
@@ -2281,7 +2342,7 @@ def test_dedup_large_scale_with_tenants():
 #-------------------------------------------------------------------------------
 @pytest.mark.basic_test
 def test_dedup_large_scale():
-    #return
+    return  # NOTE(review): this commit disables test_dedup_large_scale — confirm intentional
 
     if full_dedup_is_disabled():
         return
@@ -2293,7 +2354,7 @@ def test_dedup_large_scale():
     size=1*KB
     files=[]
     config=TransferConfig(multipart_threshold=size, multipart_chunksize=1*MB)
-    log.info("test_dedup_dry_large_scale_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_large_scale: connect to AWS ...")
     gen_files_fixed_size(files, num_files, size, max_copies_count)
     threads_dedup_basic_with_tenants_common(files, num_threads, config, False)
 
@@ -2301,13 +2362,13 @@ def test_dedup_large_scale():
 #-------------------------------------------------------------------------------
 @pytest.mark.basic_test
 def test_empty_bucket():
-    #return
+    return  # NOTE(review): this commit disables test_empty_bucket — confirm intentional
 
     if full_dedup_is_disabled():
         return
 
     prepare_test()
-    log.info("test_empty_bucket: connect to AWS ...")
+    log.debug("test_empty_bucket: connect to AWS ...")
 
     max_copies_count=2
     config = default_config
@@ -2361,6 +2422,7 @@ def inc_step_with_tenants(stats_base, files, conns, bucket_names, config):
     stats_combined.skip_shared_manifest = stats_base.deduped_obj
     stats_combined.skip_src_record      = src_record
     stats_combined.set_shared_manifest_src -= stats_base.set_shared_manifest_src
+    stats_combined.dup_head_size       -= stats_base.dup_head_size
     stats_combined.deduped_obj         -= stats_base.deduped_obj
     stats_combined.deduped_obj_bytes   -= stats_base.deduped_obj_bytes
 
@@ -2368,7 +2430,7 @@ def inc_step_with_tenants(stats_base, files, conns, bucket_names, config):
     stats_combined.invalid_sha256 -= stats_base.set_sha256
     stats_combined.set_sha256     -= stats_base.set_sha256
 
-    log.info("test_dedup_inc_2_with_tenants: incremental dedup:")
+    log.debug("inc_step_with_tenants: incremental dedup:")
     # run dedup again
     dry_run=False
     exec_dedup(stats_combined, dry_run)
@@ -2387,7 +2449,7 @@ def test_dedup_inc_loop_with_tenants():
         return
 
     prepare_test()
-    log.info("test_dedup_inc_loop_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_inc_loop_with_tenants: connect to AWS ...")
     max_copies_count=3
     config=default_config
     ret=gen_connections_multi2(max_copies_count)
@@ -2408,6 +2470,7 @@ def test_dedup_inc_loop_with_tenants():
             files=ret[0]
             stats_last=ret[1]
             stats_base.set_shared_manifest_src += stats_last.set_shared_manifest_src
+            stats_base.dup_head_size       += stats_last.dup_head_size
             stats_base.deduped_obj         += stats_last.deduped_obj
             stats_base.deduped_obj_bytes   += stats_last.deduped_obj_bytes
             stats_base.set_sha256          += stats_last.set_sha256
@@ -2423,13 +2486,13 @@ def test_dedup_inc_loop_with_tenants():
 def test_dedup_dry_small_with_tenants():
     #return
 
-    log.info("test_dedup_dry_small_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_dry_small_with_tenants: connect to AWS ...")
     prepare_test()
     max_copies_count=3
     files=[]
     num_files=10 # [4KB-4MB]
     base_size = 4*KB
-    log.info("generate files: base size=%d KiB, max_size=%d KiB",
+    log.debug("generate files: base size=%d KiB, max_size=%d KiB",
              base_size/KB, (pow(2, num_files) * base_size)/KB)
     try:
         gen_files(files, base_size, num_files, max_copies_count)
@@ -2446,6 +2509,7 @@ def test_dedup_dry_small_with_tenants():
 
         # expected stats for small objects - all zeros except for skip_too_small
         small_objs_dedup_stats = Dedup_Stats()
+        copy_potential_stats(small_objs_dedup_stats, dedup_stats)
         small_objs_dedup_stats.size_before_dedup=dedup_stats.size_before_dedup
         small_objs_dedup_stats.skip_too_small_bytes=dedup_stats.size_before_dedup
         small_objs_dedup_stats.skip_too_small=s3_objects_total
@@ -2464,7 +2528,7 @@ def test_dedup_dry_multipart():
 
     prepare_test()
     bucket_name = gen_bucket_name()
-    log.info("test_dedup_dry_multipart: connect to AWS ...")
+    log.debug("test_dedup_dry_multipart: connect to AWS ...")
     conn=get_single_connection()
     files=[]
 
@@ -2490,15 +2554,15 @@ def test_dedup_dry_basic():
 
     prepare_test()
     bucket_name = gen_bucket_name()
-    log.info("test_dedup_dry_basic: connect to AWS ...")
+    log.debug("test_dedup_dry_basic: connect to AWS ...")
     conn=get_single_connection()
     files=[]
     num_files=5
-    base_size = MULTIPART_SIZE
-    log.info("generate files: base size=%d MiB, max_size=%d MiB",
+    base_size = 2*MB
+    log.debug("generate files: base size=%d MiB, max_size=%d MiB",
              base_size/MB, (pow(2, num_files) * base_size)/MB)
     gen_files(files, base_size, num_files)
-    log.info("call simple_dedup()")
+    log.debug("call simple_dedup()")
     simple_dedup(conn, files, bucket_name, True, default_config, True)
 
 
@@ -2508,7 +2572,7 @@ def test_dedup_dry_small_multipart():
     #return
 
     prepare_test()
-    log.info("test_dedup_dry_small_multipart: connect to AWS ...")
+    log.debug("test_dedup_dry_small_multipart: connect to AWS ...")
     config2 = TransferConfig(multipart_threshold=4*KB, multipart_chunksize=1*MB)
     conn=get_single_connection()
     files=[]
@@ -2529,7 +2593,7 @@ def test_dedup_dry_small():
     #return
 
     bucket_name = gen_bucket_name()
-    log.info("test_dedup_dry_small: connect to AWS ...")
+    log.debug("test_dedup_dry_small: connect to AWS ...")
     conn=get_single_connection()
     small_single_part_objs_dedup(conn, bucket_name, True)
 
@@ -2546,20 +2610,23 @@ def test_dedup_dry_small_large_mix():
     #return
 
     dry_run=True
-    log.info("test_dedup_dry_small_large_mix: connect to AWS ...")
+    log.debug("test_dedup_dry_small_large_mix: connect to AWS ...")
     prepare_test()
 
     num_threads=4
     max_copies_count=3
     small_file_size=1*MB
+    mid_file_size=8*MB
     large_file_size=16*MB
     num_small_files=128
+    num_mid_files=32
     num_large_files=16
     files=[]
     conns=[]
     bucket_names=get_buckets(num_threads)
     try:
         gen_files_fixed_size(files, num_small_files, small_file_size, max_copies_count)
+        gen_files_fixed_size(files, num_mid_files, mid_file_size, max_copies_count)
         gen_files_fixed_size(files, num_large_files, large_file_size, max_copies_count)
 
         start = time.time_ns()
@@ -2573,9 +2640,8 @@ def test_dedup_dry_small_large_mix():
         expected_results = ret[0]
         dedup_stats = ret[1]
         s3_objects_total = ret[2]
-        log.info("new[%d] obj_count=%d, upload_time=%d(sec)",
-                 len(conns), s3_objects_total, upload_time_sec)
-
+        log.debug("obj_count=%d, upload_time=%d(sec)", s3_objects_total,
+                 upload_time_sec)
         exec_dedup(dedup_stats, dry_run)
         if dry_run == False:
             verify_objects(bucket_name, files, conn, expected_results, default_config)
@@ -2594,7 +2660,7 @@ def test_dedup_dry_basic_with_tenants():
     num_files=23
     file_size=33*MB
     files=[]
-    log.info("test_dedup_basic_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_dry_basic_with_tenants: connect to AWS ...")
     gen_files_fixed_size(files, num_files, file_size, max_copies_count)
     dedup_basic_with_tenants_common(files, max_copies_count, default_config, True)
 
@@ -2605,7 +2671,7 @@ def test_dedup_dry_multipart_with_tenants():
     #return
 
     prepare_test()
-    log.info("test_dedup_dry_multipart_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_dry_multipart_with_tenants: connect to AWS ...")
     max_copies_count=3
     num_files=8
     files=[]
@@ -2634,7 +2700,7 @@ def test_dedup_dry_small_multipart_with_tenants():
     max_size=512*KB
     files=[]
     config=TransferConfig(multipart_threshold=min_size, multipart_chunksize=1*MB)
-    log.info("test_dedup_small_multipart_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_dry_small_multipart_with_tenants: connect to AWS ...")
 
     # create files in range [4KB-512KB] aligned on 4KB
     gen_files_in_range(files, num_files, min_size, max_size, min_size)
@@ -2653,7 +2719,7 @@ def test_dedup_dry_large_scale_with_tenants():
     size=1*KB
     files=[]
     config=TransferConfig(multipart_threshold=size, multipart_chunksize=1*MB)
-    log.info("test_dedup_dry_large_scale_with_tenants: connect to AWS ...")
+    log.debug("test_dedup_dry_large_scale_with_tenants: connect to AWS ...")
     gen_files_fixed_size(files, num_files, size, max_copies_count)
     threads_dedup_basic_with_tenants_common(files, num_threads, config, True)
 
@@ -2670,7 +2736,7 @@ def test_dedup_dry_large_scale():
     size=1*KB
     files=[]
     config=TransferConfig(multipart_threshold=size, multipart_chunksize=1*MB)
-    log.info("test_dedup_dry_large_scale_new: connect to AWS ...")
+    log.debug("test_dedup_dry_large_scale: connect to AWS ...")
     gen_files_fixed_size(files, num_files, size, max_copies_count)
     conns=get_connections(num_threads)
     bucket_names=get_buckets(num_threads)
@@ -2685,36 +2751,6 @@ def test_dedup_dry_large_scale():
         cleanup_all_buckets(bucket_names, conns)
 
 
-#-------------------------------------------------------------------------------
-@pytest.mark.basic_test
-def test_dedup_dry_large_scale_single_bucket():
-    return
-
-    prepare_test()
-    max_copies_count=3
-    num_threads=16
-    num_files=32*1024
-    size=1*KB
-    files=[]
-    config=TransferConfig(multipart_threshold=size, multipart_chunksize=1*MB)
-    log.info("test_dedup_dry_large_scale_new: connect to AWS ...")
-    gen_files_fixed_size(files, num_files, size, max_copies_count)
-    conns=get_connections(num_threads)
-
-    bucket_name=gen_bucket_name()
-    conns[0].create_bucket(Bucket=bucket_name)
-
-    bucket_names=[bucket_name] * num_threads
-
-    try:
-        threads_simple_dedup_with_tenants(files, conns, bucket_names, config, True)
-    except:
-        log.warning("test_dedup_dry_large_scale: failed!!")
-    finally:
-        # cleanup must be executed even after a failure
-        cleanup(bucket_name, conns[0])
-
-
 #-------------------------------------------------------------------------------
 @pytest.mark.basic_test
 def test_cleanup():