Aborts an active dedup session and release all resources used by it.
- ``radosgw-admin dedup stats``:
Collects & displays last dedup statistics.
-- ``radosgw-admin dedup estimate``:
- Starts a new dedup estimate session (aborting first existing session if exists).
- ``radosgw-admin dedup throttle --max-bucket-index-ops=<count>``:
Specify max bucket-index requests per second allowed for a single RGW server during dedup, 0 means unlimited.
- ``radosgw-admin dedup throttle --stat``:
***************
Dedup Estimate process skips the following objects:
-- Objects smaller than 4 MB (unless they are multipart).
+- Objects smaller than rgw_dedup_min_obj_size_for_dedup (unless they are multipart).
- Objects with different placement rules.
- Objects with different pools.
- Objects with different storage classes.
The full dedup process skips all the above and it also skips **compressed** and **user-encrypted** objects.
+The minimum object size for dedup is controlled by the following config option:
+
+.. confval:: rgw_dedup_min_obj_size_for_dedup
+
*******************
Estimate Processing
*******************
- copying the manifest from the source to the target.
- removing all tail-objects on the target.
+***************
+Split Head Mode
+***************
+Dedup code can split the head object into 2 objects:
+
+- one with attributes and no data and
+- a new tail-object with only data.
+
+The new-tail object will be deduped (unlike the head object, which can't be deduplicated).
+
+The split-Head mode is controlled by the following central configuration option:
+
+.. confval:: rgw_dedup_max_obj_size_for_split
+
+We will split the head for objects with size smaller than or equal to rgw_dedup_max_obj_size_for_split.
+
************
Memory Usage
************
default: false
services:
- rgw
+- name: rgw_dedup_max_obj_size_for_split
+ type: size
+ level: advanced
+ desc: The maximum RGW object size to split head.
+ A value of 0 (zero) disables the split-head functionality
+ long_desc: Dedup code can split head object into 2 objects -
+ one with attributes and no data and
+ a new tail-object with only data.
+ The new-tail object will be deduped (unlike the head objects which
+ can't be deduplicated)
+ We will split head for objects with size smaller than or equal to this value
+ default: 16_M
+ services:
+ - rgw
+ with_legacy: true
+- name: rgw_dedup_min_obj_size_for_dedup
+ type: size
+ level: advanced
+ desc: The minimum RGW object size for dedup (0 means dedup all objects).
+ default: 64_K
+ services:
+ - rgw
+ with_legacy: true
- name: rgw_max_chunk_size
type: size
level: advanced
static inline constexpr unsigned MAX_STORAGE_CLASS_IDX = 128;
using storage_class_idx_t = uint8_t;
+ //---------------------------------------------------------------------------
+ // Debug helper: iterates over every raw object referenced by the manifest and
+ // logs (at level 20) its oid prefixed by its index, followed by a separator
+ // line.  Always returns 0.  Marked [[maybe_unused]] so builds without a
+ // caller compiled in stay warning-free.
+ [[maybe_unused]] static int print_manifest(const DoutPrefixProvider *dpp,
+ RGWRados *rados,
+ const RGWObjManifest &manifest)
+ {
+ unsigned idx = 0;
+ for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p, ++idx) {
+ rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
+ ldpp_dout(dpp, 20) << idx << "] " << raw_obj.oid << dendl;
+ }
+ ldpp_dout(dpp, 20) << "==============================================" << dendl;
+ return 0;
+ }
+
//---------------------------------------------------------------------------
void Background::DedupWatcher::handle_notify(uint64_t notify_id, uint64_t cookie,
uint64_t notifier_id, bufferlist &bl)
//---------------------------------------------------------------------------
static int init_dedup_pool_ioctx(rgw::sal::RadosStore *store,
const DoutPrefixProvider *dpp,
- bool create,
librados::IoCtx &ioctx)
{
const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool;
auto rados_handle = store->getRados()->get_rados_handle();
int64_t pool_id = rados_handle->pool_lookup(dedup_pool.name.c_str());
if (pool_id >= 0) {
- // TBD: what to do when create option is passed
ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name
<< " already exists, pool_id=" << pool_id << dendl;
}
- else if (create) {
+ else {
pool_id = create_pool(store, dpp, pool_name);
if (pool_id >= 0) {
ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name
return pool_id;
}
}
- else {
- ldpp_dout(dpp, 1) << __func__
- << "::ERR: pool doesn't exist and no create option" << dendl;
- return -ENOENT;
- }
int ret = rgw_init_ioctx(dpp, rados_handle, dedup_pool, ioctx);
if (unlikely(ret < 0)) {
rados = store->getRados();
rados_handle = rados->get_rados_handle();
if (init_pool) {
- int ret = init_dedup_pool_ioctx(store, dpp, true, d_dedup_cluster_ioctx);
+ int ret = init_dedup_pool_ioctx(store, dpp, d_dedup_cluster_ioctx);
display_ioctx_state(dpp, d_dedup_cluster_ioctx, __func__);
return ret;
}
d_cluster(dpp, cct, driver),
d_watcher_ctx(this)
{
- d_min_obj_size_for_dedup = cct->_conf->rgw_max_chunk_size;
d_head_object_size = cct->_conf->rgw_max_chunk_size;
- //ceph_assert(4*1024*1024 == d_head_object_size);
+ d_min_obj_size_for_dedup = cct->_conf->rgw_dedup_min_obj_size_for_dedup;
+ d_max_obj_size_for_split = cct->_conf->rgw_dedup_max_obj_size_for_split;
+
+ ldpp_dout(dpp, 10) << "Config Vals::d_head_object_size=" << d_head_object_size
+ << "::d_min_obj_size_for_dedup=" << d_min_obj_size_for_dedup
+ << "::d_max_obj_size_for_split=" << d_max_obj_size_for_split
+ << dendl;
int ret = init_rados_access_handles(false);
if (ret != 0) {
d_heart_beat_max_elapsed_sec = 3;
}
+ //------------------------------------------------------------------------------
+ // Thin wrapper over the free function calc_deduped_bytes() that supplies the
+ // per-instance configuration values (head-object size and the min/max dedup
+ // size limits) cached from the conf in the Background constructor.
+ uint64_t Background::__calc_deduped_bytes(uint16_t num_parts, uint64_t size_bytes)
+ {
+ return calc_deduped_bytes(d_head_object_size,
+ d_min_obj_size_for_dedup,
+ d_max_obj_size_for_split,
+ num_parts,
+ size_bytes);
+ }
+
//---------------------------------------------------------------------------
int Background::add_disk_rec_from_bucket_idx(disk_block_array_t &disk_arr,
const rgw::sal::Bucket *p_bucket,
}
ldpp_dout(dpp, 20) << __func__ << "::" << p_bucket->get_name() << "/"
<< obj_name << " was written to block_idx="
- << rec_info.block_id << " rec_id=" << rec_info.rec_id << dendl;
+ << rec_info.block_id << " rec_id=" << (int)rec_info.rec_id
+ << dendl;
return 0;
}
storage_class_idx_t sc_idx = remapper->remap(p_rec->stor_class, dpp,
&p_stats->failed_map_overflow);
if (unlikely(sc_idx == remapper_t::NULL_IDX)) {
- // TBD: need stat counters
return -EOVERFLOW;
}
key_t key(p_rec->s.md5_high, p_rec->s.md5_low, size_4k_units,
p_rec->s.num_parts, sc_idx);
- bool has_shared_manifest = p_rec->has_shared_manifest();
+ bool has_shared_manifest = p_rec->s.flags.has_shared_manifest();
ldpp_dout(dpp, 20) << __func__ << "::bucket=" << p_rec->bucket_name
<< ", obj=" << p_rec->obj_name << ", block_id="
<< (uint32_t)block_id << ", rec_id=" << (uint32_t)rec_id
}
#ifdef FULL_DEDUP_SUPPORT
+ //---------------------------------------------------------------------------
+ // Builds the oid string "<bucket_id>_<obj_name>" using a single allocation
+ // (reserve accounts for both inputs plus the '_' separator).
+ static inline std::string build_oid(const std::string& bucket_id,
+ const std::string& obj_name)
+ {
+ std::string oid;
+ oid.reserve(bucket_id.size() + 1 + obj_name.size());
+ oid.append(bucket_id).append("_").append(obj_name);
+ return oid;
+ }
+
+ //---------------------------------------------------------------------------
+ // Loads the bucket identified by @rb, resolves the head-object oid for
+ // (obj_name, instance) into *p_oid, and opens an IoCtx for that head object
+ // into *p_ioctx.  Returns 0 on success, or the negative error from
+ // load_bucket() / get_obj_head_ioctx().
+ static int get_ioctx_internal(const DoutPrefixProvider* const dpp,
+ rgw::sal::Driver* driver,
+ rgw::sal::RadosStore* store,
+ const std::string &obj_name,
+ const std::string &instance,
+ const rgw_bucket &rb,
+ librados::IoCtx *p_ioctx,
+ std::string *p_oid)
+ {
+ unique_ptr<rgw::sal::Bucket> bucket;
+ {
+ // scoped so the temporary ret doesn't leak into the rest of the function
+ int ret = driver->load_bucket(dpp, rb, &bucket, null_yield);
+ if (unlikely(ret != 0)) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: driver->load_bucket(): "
+ << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ }
+
+ string dummy_locator;
+ const rgw_obj_index_key key(obj_name, instance);
+ rgw_obj obj(bucket->get_key(), key);
+ get_obj_bucket_and_oid_loc(obj, *p_oid, dummy_locator);
+ RGWBucketInfo& bucket_info = bucket->get_info();
+ return store->get_obj_head_ioctx(dpp, bucket_info, obj, p_ioctx);
+ }
+
+ //---------------------------------------------------------------------------
+ // Convenience wrapper: builds the rgw_bucket key from the disk record's
+ // tenant / bucket-name / bucket-id and forwards to get_ioctx_internal().
+ static inline int get_ioctx(const DoutPrefixProvider* const dpp,
+ rgw::sal::Driver* driver,
+ rgw::sal::RadosStore* store,
+ const disk_record_t *p_rec,
+ librados::IoCtx *p_ioctx,
+ std::string *p_oid)
+ {
+ rgw_bucket b{p_rec->tenant_name, p_rec->bucket_name, p_rec->bucket_id};
+ return get_ioctx_internal(dpp, driver, store, p_rec->obj_name, p_rec->instance,
+ b, p_ioctx, p_oid);
+ }
+
+ //---------------------------------------------------------------------------
+ // Builds the oid of the tail object used in split-head mode:
+ // "<RGW_OBJ_NS_SHADOW>_<manifest-prefix>0".  The extra byte passed to
+ // reserve() covers the '_' appended between the shadow namespace string and
+ // the manifest prefix.
+ static inline std::string generate_split_head_tail_name(const RGWObjManifest &manifest)
+ {
+ static constexpr std::string_view shadow_string(RGW_OBJ_NS_SHADOW);
+ std::string_view suffix = "0";
+ const std::string &prefix = manifest.get_prefix();
+
+ std::string tail_name;
+ tail_name.reserve(shadow_string.size() + prefix.size() + suffix.size() + 1);
+ // TBD:
+ // it is unclear when RGW code pads with "_" before the shadow string
+ // It won't change correctness, but might look weird
+ //tail_name.append("_");
+ tail_name.append(shadow_string);
+ tail_name.append("_");
+ tail_name.append(prefix);
+ tail_name.append(suffix);
+ return tail_name;
+ }
+
+ //---------------------------------------------------------------------------
+ // Best-effort rollback for split-head mode: removes the tail object that was
+ // created for the split and bumps the rollback_tail_obj counter.  The counter
+ // is incremented even when the remove fails; a failure is only logged and
+ // never propagated to the caller.
+ static void remove_created_tail_object(const DoutPrefixProvider *dpp,
+ librados::IoCtx &ioctx,
+ const std::string &tail_oid,
+ md5_stats_t *p_stats)
+ {
+ p_stats->rollback_tail_obj++;
+ int ret = ioctx.remove(tail_oid);
+ if (ret == 0) {
+ ldpp_dout(dpp, 20) << __func__ << "::" << tail_oid
+ << " was successfully removed" << dendl;
+ }
+ else {
+ ldpp_dout(dpp, 10) << __func__ << "::failed ioctx.remove( " << tail_oid
+ << " ), ret=" << ret << "::" << cpp_strerror(-ret) <<dendl;
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ // True when the head object should be split: the head actually holds data
+ // (head_size > 0), split-head is enabled (d_max_obj_size_for_split != 0), and
+ // the object is small enough to qualify for the split.
+ inline bool Background::should_split_head(uint64_t head_size, uint64_t obj_size)
+ {
+ // max_obj_size_for_split of zero means don't split!
+ return (head_size > 0 &&
+ d_max_obj_size_for_split &&
+ obj_size <= d_max_obj_size_for_split);
+ }
+
+ //---------------------------------------------------------------------------
+ // True when every field of the rgw_bucket (including the explicit-placement
+ // pools) is empty.  Marked [[maybe_unused]] since it is a debug/assert-style
+ // helper that may have no caller in some build configurations.
+ [[maybe_unused]] static bool empty_rgw_bucket(const rgw_bucket &b)
+ {
+ return (b.tenant.empty() &&
+ b.name.empty() &&
+ b.marker.empty() &&
+ b.bucket_id.empty() &&
+ b.explicit_placement.data_pool.empty() &&
+ b.explicit_placement.data_extra_pool.empty() &&
+ b.explicit_placement.index_pool.empty());
+ }
static constexpr uint64_t cost = 1; // 1 throttle unit per request
static constexpr uint64_t id = 0; // ids unused
}
//---------------------------------------------------------------------------
- int Background::free_tail_objs_by_manifest(const string &ref_tag,
- const string &oid,
- RGWObjManifest &tgt_manifest)
+ int Background::free_tail_objs_by_manifest(const string &ref_tag,
+ const string &oid,
+ const RGWObjManifest &manifest)
{
unsigned idx = 0;
- for (auto p = tgt_manifest.obj_begin(dpp); p != tgt_manifest.obj_end(dpp); ++p, ++idx) {
+ std::unique_ptr<rgw::Aio> aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, null_yield);
+ for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p, ++idx) {
rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
if (oid == raw_obj.oid) {
- ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: " << raw_obj.oid << dendl;
+ ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: "
+ << raw_obj.oid << dendl;
continue;
}
<< obj << dendl;
continue;
}
- librados::IoCtx ioctx = obj.ioctx;
- ldpp_dout(dpp, 20) << __func__ << "::removing tail object: " << raw_obj.oid
- << dendl;
+ ldpp_dout(dpp, 20) << __func__ << "::removing tail object: " << raw_obj.oid << dendl;
d_ctl.metadata_access_throttle.acquire();
- ret = ioctx.remove(raw_obj.oid);
+ ObjectWriteOperation op;
+ rgw::AioResultList completed;
+ cls_refcount_put(op, ref_tag, true);
+ completed = aio->get(obj.obj,
+ rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield),
+ cost, id);
}
-
+ rgw::AioResultList completed = aio->drain();
return 0;
}
//---------------------------------------------------------------------------
- int Background::rollback_ref_by_manifest(const string &ref_tag,
- const string &oid,
- RGWObjManifest &manifest)
+ int Background::rollback_ref_by_manifest(const string &ref_tag,
+ const string &oid,
+ const RGWObjManifest &manifest)
{
+ ldpp_dout(dpp, 20) << __func__ << "::" << oid << dendl;
unsigned idx = 0;
int ret_code = 0;
std::unique_ptr<rgw::Aio> aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, null_yield);
}
//---------------------------------------------------------------------------
- int Background::inc_ref_count_by_manifest(const string &ref_tag,
- const string &oid,
- RGWObjManifest &manifest)
+ int Background::inc_ref_count_by_manifest(const string &ref_tag,
+ const string &oid,
+ const RGWObjManifest &manifest)
{
std::unique_ptr<rgw::Aio> aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, null_yield);
rgw::AioResultList all_results;
ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj);
if (ret < 0) {
ldpp_dout(dpp, 1) << __func__ << "::ERR: manifest::failed to open context "
- << obj << dendl;
+ << raw_obj.oid << dendl;
break;
}
ObjectWriteOperation op;
cls_refcount_get(op, ref_tag, true);
d_ctl.metadata_access_throttle.acquire();
- ldpp_dout(dpp, 20) << __func__ << "::inc ref-count on tail object: " << raw_obj.oid << dendl;
+ ldpp_dout(dpp, 20) << __func__ << "::inc ref-count on tail object: "
+ << raw_obj.oid << "::" << obj.obj.oid << dendl;
rgw::AioResultList completed = aio->get(obj.obj,
rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield),
cost, id);
all_results.splice(all_results.end(), completed);
if (ret < 0) {
ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to copy obj=" << obj
- << ", the error code = " << ret << dendl;
+ << ", ret=" << ret << " err is " << cpp_strerror(-ret) << dendl;
break;
}
}
if (ret == 0) {
rgw::AioResultList completed = aio->drain();
- int ret = rgw::check_for_errors(completed);
+ ret = rgw::check_for_errors(completed);
all_results.splice(all_results.end(), completed);
if (ret == 0) {
return 0;
}
}
- // if arrived here we failed somewhere -> rollback all ref-inc operations
/* wait all pending op done */
rgw::AioResultList completed = aio->drain();
all_results.splice(all_results.end(), completed);
int ret2 = 0;
for (auto& aio_res : all_results) {
if (aio_res.result < 0) {
+ ldpp_dout(dpp, 10) << __func__ << "::skip failed refcount inc: "
+ << aio_res.obj.oid << dendl;
continue; // skip errors
}
rgw_rados_ref obj;
ObjectWriteOperation op;
cls_refcount_put(op, ref_tag, true);
+ ldpp_dout(dpp, 10) << __func__ << "::rollback refcount inc on: "
+ << aio_res.obj.oid << dendl;
rgw::AioResultList completed = aio->get(obj.obj,
rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield),
cost, id);
ret2 = rgw::check_for_errors(completed);
if (ret2 < 0) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: cleanup after error failed to drop reference on obj=" << aio_res.obj << dendl;
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: cleanup after error failed to drop reference on obj="
+ << aio_res.obj << dendl;
}
}
completed = aio->drain();
ret2 = rgw::check_for_errors(completed);
if (ret2 < 0) {
ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to drain rollback ios, ret="
- << ret2 <<dendl;
+ << ret2 << dendl;
}
return ret;
}
//---------------------------------------------------------------------------
- static int get_ioctx(const DoutPrefixProvider* const dpp,
- rgw::sal::Driver* driver,
- rgw::sal::RadosStore* store,
- const disk_record_t *p_rec,
- librados::IoCtx *p_ioctx,
- std::string *p_oid)
+ static void dedup_object_log(const DoutPrefixProvider *dpp,
+ const disk_record_t *p_src_rec,
+ const disk_record_t *p_tgt_rec,
+ uint64_t src_head_size,
+ uint64_t tgt_head_size,
+ const bufferlist &etag_bl)
{
- unique_ptr<rgw::sal::Bucket> bucket;
- {
- rgw_bucket b{p_rec->tenant_name, p_rec->bucket_name, p_rec->bucket_id};
- int ret = driver->load_bucket(dpp, b, &bucket, null_yield);
- if (unlikely(ret != 0)) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: driver->load_bucket(): "
- << cpp_strerror(-ret) << dendl;
- return ret;
- }
- }
+ ldpp_dout(dpp, 20) << __func__ << "::DEDUP SRC:"
+ << p_src_rec->bucket_name << "/" << p_src_rec->obj_name
+ << "(" << src_head_size << ") ::TGT:"
+ << p_tgt_rec->bucket_name << "/" << p_tgt_rec->obj_name
+ << "(" << tgt_head_size << ")" << dendl;
+ ldpp_dout(dpp, 20) << __func__ << "::num_parts=" << p_tgt_rec->s.num_parts
+ << "::ETAG=" << etag_bl.to_str() << dendl;
+ }
- string dummy_locator;
- const rgw_obj_index_key key(p_rec->obj_name, p_rec->instance);
- rgw_obj obj(bucket->get_key(), key);
- get_obj_bucket_and_oid_loc(obj, *p_oid, dummy_locator);
- RGWBucketInfo& bucket_info = bucket->get_info();
- return store->get_obj_head_ioctx(dpp, bucket_info, obj, p_ioctx);
+ //---------------------------------------------------------------------------
+ /* The target (TGT) manifest must inherit the source (SRC) manifest, as both share
+ * the same tail objects.
+ * However, the TGT head object needs to maintain its unique identity, including
+ * its head-placement-rule and head-object parameters, which are stored in
+ * `rgw_obj`.
+ *
+ * The size of the TGT head object must be adjusted to match the SRC head size.
+ * This is straightforward when Split-Head is enabled, as both heads can be set to
+ * zero and all data is stored in the tail.
+ *
+ * A potential issue arises if the SRC and TGT have different head sizes and
+ * Split-Head is not used.
+ * While this scenario is unlikely in practice (as head-size is almost always 4MB),
+ * if it were to occur, we should abort the deduplication process to prevent data
+ * inconsistencies.
+ */
+ static void adjust_target_manifest(const RGWObjManifest &src_manifest,
+ const RGWObjManifest &tgt_manifest,
+ bufferlist &new_manifest_bl)
+ {
+ // first create new_manifest from the src_manifest
+ RGWObjManifest new_manifest(src_manifest);
+
+ // then, adjust head-object parameters to match the tgt_manifest
+ const uint64_t src_head_size = src_manifest.get_head_size();
+ const auto& tgt_placement_rule = tgt_manifest.get_head_placement_rule();
+ const rgw_obj &tgt_head_obj = tgt_manifest.get_obj();
+
+ new_manifest.set_head(tgt_placement_rule, tgt_head_obj, src_head_size);
+ encode(new_manifest, new_manifest_bl);
}
//---------------------------------------------------------------------------
- static void init_cmp_pairs(const disk_record_t *p_rec,
- const bufferlist &etag_bl,
- bufferlist &hash_bl, // OUT PARAM
+ static void init_cmp_pairs(const DoutPrefixProvider *dpp,
+ const disk_record_t *p_rec,
+ const bufferlist &etag_bl,
+ bufferlist &hash_bl, // OUT PARAM
librados::ObjectWriteOperation *p_op)
{
p_op->cmpxattr(RGW_ATTR_ETAG, CEPH_OSD_CMPXATTR_OP_EQ, etag_bl);
- // TBD: do we really need the secondary compare using the full manifest?
- // Can replace it with something cheaper like size/version?
- p_op->cmpxattr(RGW_ATTR_MANIFEST, CEPH_OSD_CMPXATTR_OP_EQ, p_rec->manifest_bl);
+ bufferlist ref_tag_bl;
+ ref_tag_bl.append(p_rec->ref_tag);
+ if (p_rec->s.flags.is_ref_tag_from_tail()) {
+ p_op->cmpxattr(RGW_ATTR_TAIL_TAG, CEPH_OSD_CMPXATTR_OP_EQ, ref_tag_bl);
+ }
+ else {
+ p_op->cmpxattr(RGW_ATTR_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, ref_tag_bl);
+ }
// BLAKE3 hash has 256 bit splitted into multiple 64bit units
- const unsigned units = (256 / (sizeof(uint64_t)*8));
- static_assert(units == 4);
- for (unsigned i = 0; i < units; i++) {
+ for (unsigned i = 0; i < HASH_UNITS; i++) {
ceph::encode(p_rec->s.hash[i], hash_bl);
}
if (!p_rec->s.flags.hash_calculated()) {
+ ldpp_dout(dpp, 20) << __func__ << "::CMP HASH " << p_rec->obj_name << dendl;
p_op->cmpxattr(RGW_ATTR_BLAKE3, CEPH_OSD_CMPXATTR_OP_EQ, hash_bl);
}
}
//---------------------------------------------------------------------------
- int Background::dedup_object(const disk_record_t *p_src_rec,
- const disk_record_t *p_tgt_rec,
- md5_stats_t *p_stats,
- bool has_shared_manifest_src)
+ static inline void build_manifest_hash_bl(const bufferlist &manifest_bl,
+ bufferlist &manifest_hash_bl)
{
- RGWObjManifest src_manifest;
- try {
- auto bl_iter = p_src_rec->manifest_bl.cbegin();
- decode(src_manifest, bl_iter);
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad src manifest" << dendl;
- return -EINVAL;
- }
- RGWObjManifest tgt_manifest;
- try {
- auto bl_iter = p_tgt_rec->manifest_bl.cbegin();
- decode(tgt_manifest, bl_iter);
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad tgt manifest" << dendl;
- return -EINVAL;
- }
- ldpp_dout(dpp, 20) << __func__ << "::DEDUP From: "
- << p_src_rec->bucket_name << "/" << p_src_rec->obj_name << " -> "
- << p_tgt_rec->bucket_name << "/" << p_tgt_rec->obj_name << dendl;
+ bufferlist hash_bl;
+ crypto::digest<crypto::SHA1>(manifest_bl).encode(hash_bl);
+ // Use a shorter hash (64bit instead of 160bit)
+ hash_bl.splice(0, 8, &manifest_hash_bl);
+ }
+ //---------------------------------------------------------------------------
+ int Background::dedup_object(disk_record_t *p_src_rec,
+ disk_record_t *p_tgt_rec,
+ const RGWObjManifest &src_manifest,
+ const RGWObjManifest &tgt_manifest,
+ md5_stats_t *p_stats,
+ const dedup_table_t::value_t *p_src_val,
+ const std::string &tail_oid)
+ {
+ const uint64_t src_head_size = src_manifest.get_head_size();
+ const uint64_t tgt_head_size = tgt_manifest.get_head_size();
bufferlist etag_bl;
etag_to_bufferlist(p_tgt_rec->s.md5_high, p_tgt_rec->s.md5_low, p_tgt_rec->s.num_parts, &etag_bl);
- ldpp_dout(dpp, 20) << __func__ << "::num_parts=" << p_tgt_rec->s.num_parts
- << "::ETAG=" << etag_bl.to_str() << dendl;
-
- bufferlist hash_bl, manifest_hash_bl, tgt_hash_bl;
- crypto::digest<crypto::SHA1>(p_src_rec->manifest_bl).encode(hash_bl);
- // Use a shorter hash (64bit instead of 160bit)
- hash_bl.splice(0, 8, &manifest_hash_bl);
- librados::ObjectWriteOperation tgt_op;
- init_cmp_pairs(p_tgt_rec, etag_bl, tgt_hash_bl, &tgt_op);
- tgt_op.setxattr(RGW_ATTR_SHARE_MANIFEST, manifest_hash_bl);
- tgt_op.setxattr(RGW_ATTR_MANIFEST, p_src_rec->manifest_bl);
- if (p_tgt_rec->s.flags.hash_calculated()) {
- tgt_op.setxattr(RGW_ATTR_BLAKE3, tgt_hash_bl);
- p_stats->set_hash_attrs++;
+ bool should_print_debug = cct->_conf->subsys.should_gather<ceph_subsys_rgw_dedup, 20>();
+ if (unlikely(should_print_debug)) {
+ dedup_object_log(dpp, p_src_rec, p_tgt_rec, src_head_size, tgt_head_size, etag_bl);
}
std::string src_oid, tgt_oid;
librados::IoCtx src_ioctx, tgt_ioctx;
- int ret1 = get_ioctx(dpp, driver, store, p_src_rec, &src_ioctx, &src_oid);
- int ret2 = get_ioctx(dpp, driver, store, p_tgt_rec, &tgt_ioctx, &tgt_oid);
- if (unlikely(ret1 != 0 || ret2 != 0)) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: failed get_ioctx()" << dendl;
- return (ret1 ? ret1 : ret2);
+ int ret = get_ioctx(dpp, driver, store, p_src_rec, &src_ioctx, &src_oid);
+ if (unlikely(ret != 0)) {
+ // can't remove created tail object without an ioctx handle
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: failed SRC get_ioctx()" << dendl;
+ return ret;
}
- // TBD: Do we need to remove target RGW_ATTR_TAIL_TAG??
- string ref_tag = p_tgt_rec->ref_tag;
+ ret = get_ioctx(dpp, driver, store, p_tgt_rec, &tgt_ioctx, &tgt_oid);
+ if (unlikely(ret != 0)) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: failed TGT get_ioctx()" << dendl;
+ if (p_src_rec->s.flags.is_split_head()) {
+ remove_created_tail_object(dpp, src_ioctx, tail_oid, p_stats);
+ }
+ return ret;
+ }
+
+ // we don't dedup head-objects so head-size must match (unless split-head)
+ // see explanation in adjust_target_manifest()
+ if (unlikely(src_head_size != 0 && src_head_size != tgt_head_size)) {
+ ldpp_dout(dpp, 5) << __func__ << "::abort! src_head_size=" << src_head_size
+ << "::tgt_head_size=" << tgt_head_size << dendl;
+ if (p_src_rec->s.flags.is_split_head()) {
+ remove_created_tail_object(dpp, src_ioctx, tail_oid, p_stats);
+ }
+ // TBD: can we create a test case (requires control over head-object-size)??
+ return -ECANCELED;
+ }
+
+ const string &ref_tag = p_tgt_rec->ref_tag;
ldpp_dout(dpp, 20) << __func__ << "::ref_tag=" << ref_tag << dendl;
- int ret = inc_ref_count_by_manifest(ref_tag, src_oid, src_manifest);
- if (ret == 0) {
- d_ctl.metadata_access_throttle.acquire();
- ldpp_dout(dpp, 20) << __func__ << "::send TGT CLS (Shared_Manifest)" << dendl;
- ret = tgt_ioctx.operate(tgt_oid, &tgt_op);
- if (unlikely(ret != 0)) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: failed tgt_ioctx.operate("
- << tgt_oid << "), err is " << cpp_strerror(-ret) << dendl;
- rollback_ref_by_manifest(ref_tag, src_oid, src_manifest);
- return ret;
+ ret = inc_ref_count_by_manifest(ref_tag, src_oid, src_manifest);
+ if (unlikely(ret != 0)) {
+ if (p_src_rec->s.flags.is_split_head()) {
+ remove_created_tail_object(dpp, src_ioctx, tail_oid, p_stats);
}
+ return ret;
+ }
- // free tail objects based on TGT manifest
- free_tail_objs_by_manifest(ref_tag, tgt_oid, tgt_manifest);
+ bufferlist manifest_hash_bl;
+ build_manifest_hash_bl(p_src_rec->manifest_bl, manifest_hash_bl);
- if (!has_shared_manifest_src) {
- // When SRC OBJ A has two or more dups (B, C) we set SHARED_MANIFEST
- // after deduping B and update it in dedup_table, but don't update the
- // disk-record (as require an expensive random-disk-write).
- // When deduping C we can trust the shared_manifest state in the table and
- // skip a redundant update to SRC object attribute
+ if (!p_src_val->has_shared_manifest()) {
+ // When SRC OBJ A has two or more dups (B, C) we set SHARED_MANIFEST
+ // after deduping B and update it in dedup_table, but don't update the
+ // disk-record (as require an expensive random-disk-write).
+ // When deduping C we can trust the shared_manifest state in the table and
+ // skip a redundant update to SRC object attribute
+ librados::ObjectWriteOperation src_op;
+ {
bufferlist src_hash_bl;
- librados::ObjectWriteOperation src_op;
- init_cmp_pairs(p_src_rec, etag_bl, src_hash_bl, &src_op);
+ init_cmp_pairs(dpp, p_src_rec, etag_bl, src_hash_bl, &src_op);
src_op.setxattr(RGW_ATTR_SHARE_MANIFEST, manifest_hash_bl);
- if (p_src_rec->s.flags.hash_calculated()) {
+ if (p_src_rec->s.flags.hash_calculated() && !p_src_val->has_valid_hash()){
src_op.setxattr(RGW_ATTR_BLAKE3, src_hash_bl);
+ ldpp_dout(dpp, 20) << __func__ <<"::Set SRC Strong Hash in CLS"<< dendl;
p_stats->set_hash_attrs++;
}
+ }
- d_ctl.metadata_access_throttle.acquire();
- ldpp_dout(dpp, 20) << __func__ <<"::send SRC CLS (Shared_Manifest)"<< dendl;
- ret = src_ioctx.operate(src_oid, &src_op);
- if (unlikely(ret != 0)) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: failed src_ioctx.operate("
- << src_oid << "), err is " << cpp_strerror(-ret)<<dendl;
- return ret;
+ if (p_src_rec->s.flags.is_split_head()) {
+ ldpp_dout(dpp, 20) << __func__ <<"::SRC-Split (truncate)" << dendl;
+ src_op.setxattr(RGW_ATTR_MANIFEST, p_src_rec->manifest_bl);
+ src_op.truncate(0);
+ p_stats->split_head_src++;
+ }
+ d_ctl.metadata_access_throttle.acquire();
+ ldpp_dout(dpp, 20) << __func__ <<"::send SRC CLS"<< dendl;
+ ret = src_ioctx.operate(src_oid, &src_op);
+ if (unlikely(ret != 0)) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: failed src_ioctx.operate("
+ << src_oid << "), err is " << cpp_strerror(-ret)<<dendl;
+ rollback_ref_by_manifest(ref_tag, src_oid, src_manifest);
+ if (p_src_rec->s.flags.is_split_head()) {
+ remove_created_tail_object(dpp, src_ioctx, tail_oid, p_stats);
}
+ return ret;
+ }
+ }
+
+ librados::ObjectWriteOperation tgt_op;
+ {
+ bufferlist tgt_hash_bl;
+ init_cmp_pairs(dpp, p_tgt_rec, etag_bl, tgt_hash_bl, &tgt_op);
+ tgt_op.setxattr(RGW_ATTR_SHARE_MANIFEST, manifest_hash_bl);
+ bufferlist new_manifest_bl;
+ adjust_target_manifest(src_manifest, tgt_manifest, new_manifest_bl);
+ tgt_op.setxattr(RGW_ATTR_MANIFEST, new_manifest_bl);
+ //tgt_op.setxattr(RGW_ATTR_MANIFEST, p_src_rec->manifest_bl);
+ if (p_tgt_rec->s.flags.hash_calculated()) {
+ tgt_op.setxattr(RGW_ATTR_BLAKE3, tgt_hash_bl);
+ ldpp_dout(dpp, 20) << __func__ <<"::Set TGT Strong Hash in CLS"<< dendl;
+ p_stats->set_hash_attrs++;
}
}
+ // If failed before this point and split-head -> remove the new tail-object
+ if (src_head_size == 0 && tgt_head_size > 0) {
+ ldpp_dout(dpp, 20) << __func__ <<"::TGT-Split OP (truncate)" << dendl;
+ p_tgt_rec->s.flags.set_split_head();
+ tgt_op.truncate(0);
+ p_stats->split_head_tgt++;
+ }
+ d_ctl.metadata_access_throttle.acquire();
+ ldpp_dout(dpp, 20) << __func__ << "::send TGT CLS" << dendl;
+ ret = tgt_ioctx.operate(tgt_oid, &tgt_op);
+ if (unlikely(ret != 0)) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: failed tgt_ioctx.operate("
+ << tgt_oid << "), err is " << cpp_strerror(-ret) << dendl;
+ rollback_ref_by_manifest(ref_tag, src_oid, src_manifest);
+ return ret;
+ }
+
+ // free tail objects based on TGT manifest
+ free_tail_objs_by_manifest(ref_tag, tgt_oid, tgt_manifest);
+
// do we need to set compression on the head object or is it set on tail?
// RGW_ATTR_COMPRESSION
return ret;
}
//---------------------------------------------------------------------------
- int Background::calc_object_blake3(const disk_record_t *p_rec, uint8_t *p_hash)
+ int Background::calc_object_blake3(const RGWObjManifest &manifest,
+ disk_record_t *p_rec,
+ uint8_t *p_hash,
+ blake3_hasher *p_pre_calc_hmac)
{
- ldpp_dout(dpp, 20) << __func__ << "::obj_name=" << p_rec->obj_name << dendl;
- RGWObjManifest manifest;
- try {
- auto bl_iter = p_rec->manifest_bl.cbegin();
- decode(manifest, bl_iter);
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad src manifest for: "
- << p_rec->obj_name << dendl;
- return -EINVAL;
+ ldpp_dout(dpp, 20) << __func__ << "::p_rec->obj_name=" << p_rec->obj_name << dendl;
+
+ blake3_hasher _hmac, *p_hmac = nullptr;
+ if (!p_pre_calc_hmac) {
+ blake3_hasher_init(&_hmac);
+ p_hmac = &_hmac;
+ }
+ else {
+ p_hmac = p_pre_calc_hmac;
}
- blake3_hasher hmac;
- blake3_hasher_init(&hmac);
for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p) {
- rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
- rgw_rados_ref obj;
- int ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj);
- if (ret < 0) {
- ldpp_dout(dpp, 1) << __func__ << "::failed rgw_get_rados_ref() for oid: "
- << raw_obj.oid << ", err is " << cpp_strerror(-ret) << dendl;
- return ret;
- }
+ uint64_t offset = p.get_stripe_ofs();
+ const rgw_obj_select& os = p.get_location();
+ if (offset > 0 || !p_pre_calc_hmac) {
+ rgw_raw_obj raw_obj = os.get_raw_obj(rados);
+ rgw_rados_ref obj;
+ int ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << __func__ << "::failed rgw_get_rados_ref() for oid="
+ << raw_obj.oid << ", err is " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
- bufferlist bl;
- librados::IoCtx ioctx = obj.ioctx;
- // read full object
- ret = ioctx.read(raw_obj.oid, bl, 0, 0);
- if (ret > 0) {
+ librados::IoCtx ioctx = obj.ioctx;
+ bufferlist bl;
+ // read full object
+ ret = ioctx.read(raw_obj.oid, bl, 0, 0);
+ if (unlikely(ret <= 0)) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to read oid "
+ << raw_obj.oid << ", err is " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
for (const auto& bptr : bl.buffers()) {
- blake3_hasher_update(&hmac, (const unsigned char *)bptr.c_str(), bptr.length());
+ blake3_hasher_update(p_hmac, (const unsigned char *)bptr.c_str(), bptr.length());
}
}
- else {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to read " << raw_obj.oid
- << ", error is " << cpp_strerror(-ret) << dendl;
- return ret;
- }
}
-
- blake3_hasher_finalize(&hmac, p_hash, BLAKE3_OUT_LEN);
+ blake3_hasher_finalize(p_hmac, p_hash, BLAKE3_OUT_LEN);
+ p_rec->s.flags.set_hash_calculated();
+ p_rec->s.flags.set_has_valid_hash();
return 0;
}
{
ldpp_dout(dpp, 20) << __func__ << "::bucket=" << p_tgt_rec->bucket_name
<< ", obj=" << p_tgt_rec->obj_name
+ << ", bytes_size=" << p_tgt_rec->s.obj_bytes_size
<< ", block_id=" << block_id
- << ", rec_id=" << (int)rec_id
- << ", md5_shard=" << (int)md5_shard << dendl;
-
- ldpp_dout(dpp, 20) << __func__ << "::md5_shard=" << (int)md5_shard
- << "::" << p_tgt_rec->bucket_name
- << "/" << p_tgt_rec->obj_name
+ << ", rec_id=" << (int)rec_id << "\n"
+ << ", md5_shard=" << (int)md5_shard
<< "::num_parts=" << p_tgt_rec->s.num_parts
<< "::ETAG=" << std::hex << p_tgt_rec->s.md5_high
<< p_tgt_rec->s.md5_low << std::dec << dendl;
}
//---------------------------------------------------------------------------
- int Background::add_obj_attrs_to_record(rgw_bucket *p_rb,
- disk_record_t *p_rec,
+ // A tail placement is unusable when either its bucket name or its
+ // placement-rule name is missing.
+ static inline bool invalid_tail_placement(const rgw_bucket_placement& tail_placement)
+ {
+   const bool no_bucket = tail_placement.bucket.name.empty();
+   const bool no_rule   = tail_placement.placement_rule.name.empty();
+   return (no_bucket || no_rule);
+ }
+
+ //---------------------------------------------------------------------------
+ // Force an explicit tail placement on the manifest when the stored one is
+ // missing its bucket and/or its placement-rule name.
+ // Falls back to the head-object's bucket / head placement-rule respectively
+ // and bumps the manifest_no_tail_placement counter.
+ static void set_explicit_tail_placement(const DoutPrefixProvider* dpp,
+                                         RGWObjManifest *p_manifest,// IN-OUT PARAM
+                                         md5_stats_t *p_stats)
+ {
+   p_stats->manifest_no_tail_placement++;
+   ldpp_dout(dpp, 20) << __func__ << "::invalid_tail_placement -> update" << dendl;
+   // NOTE(review): tail_placement (and possibly *p_bucket) are references into
+   // *p_manifest, the very object mutated by set_tail_placement() below -
+   // assumes set_tail_placement() copes with self-referencing args; TODO confirm
+   const rgw_bucket_placement& tail_placement = p_manifest->get_tail_placement();
+   const rgw_bucket *p_bucket = &tail_placement.bucket;
+
+   if (tail_placement.bucket.name.empty()) {
+     // bucket was not set in tail_placement, force the head bucket explicitly
+     const rgw_obj& head_obj = p_manifest->get_obj();
+     p_bucket = &head_obj.bucket;
+   }
+
+   if (tail_placement.placement_rule.name.empty()) {
+     // explicitly use the head_placement_rule for tail objects and update bucket
+     // if needed
+     const auto &head_placement_rule = p_manifest->get_head_placement_rule();
+     p_manifest->set_tail_placement(head_placement_rule, *p_bucket);
+   }
+   else {
+     // otherwise, keep the tail_placement_rule in place (but still update bucket)
+     p_manifest->set_tail_placement(tail_placement.placement_rule, *p_bucket);
+   }
+ }
+
+ //---------------------------------------------------------------------------
+ int Background::add_obj_attrs_to_record(disk_record_t *p_rec,
const rgw::sal::Attrs &attrs,
- dedup_table_t *p_table,
md5_stats_t *p_stats) /*IN-OUT*/
{
// if TAIL_TAG exists -> use it as ref-tag, eitherwise take ID_TAG
auto itr = attrs.find(RGW_ATTR_TAIL_TAG);
if (itr != attrs.end()) {
+ p_rec->s.flags.set_ref_tag_from_tail();
p_rec->ref_tag = itr->second.to_str();
}
else {
// clear bufferlist first
p_rec->manifest_bl.clear();
+ bool need_to_split_head = false;
+ RGWObjManifest manifest;
itr = attrs.find(RGW_ATTR_MANIFEST);
if (itr != attrs.end()) {
const bufferlist &bl = itr->second;
- RGWObjManifest manifest;
try {
auto bl_iter = bl.cbegin();
decode(manifest, bl_iter);
<< "::ERROR: unable to decode manifest" << dendl;
return -EINVAL;
}
+ need_to_split_head = should_split_head(manifest.get_head_size(),
+ p_rec->s.obj_bytes_size);
// force explicit tail_placement as the dedup could be on another bucket
const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
- if (tail_placement.bucket.name.empty()) {
- ldpp_dout(dpp, 20) << __func__ << "dedup::updating tail placement" << dendl;
- manifest.set_tail_placement(tail_placement.placement_rule, *p_rb);
+ if (unlikely(invalid_tail_placement(tail_placement))) {
+ set_explicit_tail_placement(dpp, &manifest, p_stats);
encode(manifest, p_rec->manifest_bl);
}
else {
ldpp_dout(dpp, 5) << __func__ << "::ERROR: no manifest" << dendl;
return -EINVAL;
}
+ const auto &head_placement_rule = manifest.get_head_placement_rule();
+ const std::string& storage_class =
+ rgw_placement_rule::get_canonical_storage_class(head_placement_rule.storage_class);
+
+ // p_rec holds an the storage_class value taken from the bucket-index/obj-attr
+ if (unlikely(storage_class != p_rec->stor_class)) {
+ ldpp_dout(dpp, 5) << __func__ << "::ERROR::manifest storage_class="
+ << storage_class << " != " << "::bucket-index storage_class="
+ << p_rec->stor_class << dendl;
+ p_stats->different_storage_class++;
+ return -EINVAL;
+ }
itr = attrs.find(RGW_ATTR_SHARE_MANIFEST);
if (itr != attrs.end()) {
if (itr != attrs.end()) {
try {
auto bl_iter = itr->second.cbegin();
- // BLAKE3 hash 256 bit splitted into multiple 64bit units
- const unsigned units = (256 / (sizeof(uint64_t)*8));
- static_assert(units == 4);
- for (unsigned i = 0; i < units; i++) {
+ // BLAKE3 hash has 256 bit splitted into multiple 64bit units
+ for (unsigned i = 0; i < HASH_UNITS; i++) {
uint64_t val;
ceph::decode(val, bl_iter);
p_rec->s.hash[i] = val;
}
+ p_rec->s.flags.set_has_valid_hash();
p_stats->valid_hash_attrs++;
return 0;
} catch (buffer::error& err) {
}
}
+ // if arrived here we need to calculate string hash
p_stats->invalid_hash_attrs++;
- // TBD: redundant memset...
memset(p_rec->s.hash, 0, sizeof(p_rec->s.hash));
- // BLAKE3_OUT_LEN is 32 Bytes
- int ret = calc_object_blake3(p_rec, (uint8_t*)p_rec->s.hash);
- if (ret == 0) {
- p_rec->s.flags.set_hash_calculated();
- }
- return ret;
+ if (!need_to_split_head) {
+ ldpp_dout(dpp, 20) << __func__ << "::CALC Object Strong Hash::"
+ << p_rec->obj_name << dendl;
+ return calc_object_blake3(manifest, p_rec, (uint8_t*)p_rec->s.hash);
+ }
+ // else, differ strong-hash calculation for next step and piggy back split-head
+ return 0;
}
//---------------------------------------------------------------------------
storage_class_idx_t sc_idx = remapper->remap(p_rec->stor_class, dpp,
&p_stats->failed_map_overflow);
if (unlikely(sc_idx == remapper_t::NULL_IDX)) {
- // TBD: need stat counters
return -EOVERFLOW;
}
key_t key_from_bucket_index(p_rec->s.md5_high, p_rec->s.md5_low, size_4k_units,
dedup_table_t::value_t src_val;
int ret = p_table->get_val(&key_from_bucket_index, &src_val);
if (ret != 0) {
- if (ondisk_byte_size <= d_min_obj_size_for_dedup && p_rec->s.num_parts == 0) {
+ if (!dedupable_object(p_rec->multipart_object(), d_min_obj_size_for_dedup, ondisk_byte_size)) {
// record has no valid entry in table because it is a too small
// It was loaded to table for calculation and then purged
p_stats->skipped_purged_small++;
}
const rgw::sal::Attrs& attrs = p_obj->get_attrs();
+ if (src_val.has_shared_manifest() && (attrs.find(RGW_ATTR_SHARE_MANIFEST) != attrs.end())) {
+ // A shared_manifest object can't be a dedup target
+ // We only need to keep a single shared_manifest object
+ // to be used as a dedup-source (which we already got)
+ p_stats->skipped_shared_manifest++;
+ uint64_t dedupable_objects_bytes = __calc_deduped_bytes(p_rec->s.num_parts,
+ ondisk_byte_size);
+ p_stats->shared_manifest_dedup_bytes += dedupable_objects_bytes;
+ ldpp_dout(dpp, 20) << __func__ << "::(1)skipped shared_manifest, SRC::block_id="
+ << src_val.block_idx << "::rec_id=" << (int)src_val.rec_id << dendl;
+ return 0;
+ }
+
if (attrs.find(RGW_ATTR_CRYPT_MODE) != attrs.end()) {
p_stats->ingress_skip_encrypted++;
p_stats->ingress_skip_encrypted_bytes += ondisk_byte_size;
return 0;
}
- // TBD: We should be able to support RGW_ATTR_COMPRESSION when all copies are compressed
+ // TBD-Future: We should be able to support RGW_ATTR_COMPRESSION when all copies are compressed
if (attrs.find(RGW_ATTR_COMPRESSION) != attrs.end()) {
p_stats->ingress_skip_compressed++;
p_stats->ingress_skip_compressed_bytes += ondisk_byte_size;
else {
storage_class = RGW_STORAGE_CLASS_STANDARD;
}
+
+ // p_rec holds an the storage_class value taken from the bucket-index
+ if (unlikely(storage_class != p_rec->stor_class)) {
+ ldpp_dout(dpp, 5) << __func__ << "::ERROR::ATTR storage_class="
+ << storage_class << " != " << "::bucket-index storage_class="
+ << p_rec->stor_class << dendl;
+ p_stats->different_storage_class++;
+ return -EINVAL;
+ }
+
// no need to check for remap success as we compare keys bellow
sc_idx = remapper->remap(storage_class, dpp, &p_stats->failed_map_overflow);
key_t key_from_obj(parsed_etag.md5_high, parsed_etag.md5_low,
// reset flags
p_rec->s.flags.clear();
- ret = add_obj_attrs_to_record(&b, p_rec, attrs, p_table, p_stats);
+ ret = add_obj_attrs_to_record(p_rec, attrs, p_stats);
if (unlikely(ret != 0)) {
ldpp_dout(dpp, 5) << __func__ << "::ERR: failed add_obj_attrs_to_record() ret="
<< ret << "::" << cpp_strerror(-ret) << dendl;
ret = p_disk->add_record(d_dedup_cluster_ioctx, p_rec, &rec_info);
if (ret == 0) {
// set the disk_block_id_t to this unless the existing disk_block_id is marked as shared-manifest
- ceph_assert(rec_info.rec_id < MAX_REC_IN_BLOCK);
+ if (unlikely(rec_info.rec_id >= MAX_REC_IN_BLOCK)) {
+ p_stats->illegal_rec_id++;
+ }
ldpp_dout(dpp, 20) << __func__ << "::" << p_rec->bucket_name << "/"
<< p_rec->obj_name << " was written to block_idx="
<< rec_info.block_id << "::rec_id=" << (int)rec_info.rec_id
- << "::shared_manifest=" << p_rec->has_shared_manifest() << dendl;
+ << "::shared_manifest="
+ << p_rec->s.flags.has_shared_manifest() << dendl;
p_table->update_entry(&key_from_bucket_index, rec_info.block_id,
- rec_info.rec_id, p_rec->has_shared_manifest());
+ rec_info.rec_id, p_rec->s.flags.has_shared_manifest());
}
else {
ldpp_dout(dpp, 5) << __func__ << "::ERR: Failed p_disk->add_record()"<< dendl;
}
//---------------------------------------------------------------------------
- static int write_blake3_object_attribute(const DoutPrefixProvider* const dpp,
- rgw::sal::Driver* driver,
- rgw::sal::RadosStore *store,
- const disk_record_t *p_rec)
+ static int write_hash_object_attribute(const DoutPrefixProvider* const dpp,
+ rgw::sal::Driver* driver,
+ rgw::sal::RadosStore *store,
+ const disk_record_t *p_rec,
+ md5_stats_t *p_stats)
{
bufferlist etag_bl;
bufferlist hash_bl;
librados::ObjectWriteOperation op;
etag_to_bufferlist(p_rec->s.md5_high, p_rec->s.md5_low, p_rec->s.num_parts,
&etag_bl);
- init_cmp_pairs(p_rec, etag_bl, hash_bl /*OUT PARAM*/, &op);
+ init_cmp_pairs(dpp, p_rec, etag_bl, hash_bl /*OUT PARAM*/, &op);
op.setxattr(RGW_ATTR_BLAKE3, hash_bl);
std::string oid;
ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ioctx.operate("
<< oid << "), err is " << cpp_strerror(-ret) << dendl;
}
+ ldpp_dout(dpp, 20) << __func__ <<"::Write Strong Hash to " << p_rec->obj_name
+ << dendl;
+ p_stats->set_hash_attrs++;
return ret;
}
+ //---------------------------------------------------------------------------
+ // Compare the strong (BLAKE3) hashes stored on the SRC and TGT records.
+ // Returns true when they match; bumps the mismatch counter otherwise.
+ static bool compare_strong_hash(const DoutPrefixProvider *const dpp,
+                                 const disk_record_t *p_src_rec,
+                                 const disk_record_t *p_tgt_rec,
+                                 md5_stats_t *p_stats)
+ {
+   const bool hashes_match =
+     (memcmp(p_src_rec->s.hash, p_tgt_rec->s.hash, sizeof(p_src_rec->s.hash)) == 0);
+   if (likely(hashes_match)) {
+     ldpp_dout(dpp, 20) << __func__ << "::SRC-TGT Strong-Hash match" << dendl;
+     return true;
+   }
+   p_stats->hash_mismatch++;
+   ldpp_dout(dpp, 10) << __func__ << "::HASH mismatch" << dendl;
+   return false;
+ }
+
+ //---------------------------------------------------------------------------
+ // Fetch the previously stored strong hash (RGW_ATTR_BLAKE3) and the manifest
+ // (RGW_ATTR_MANIFEST) from the head-object's rados xattrs into *p_rec.
+ // Returns 0 on success, -ENOENT when either xattr is missing, -EINVAL on a
+ // corrupt hash attribute, or the error from get_ioctx()/getxattrs().
+ static int read_hash_and_manifest(const DoutPrefixProvider *const dpp,
+                                   rgw::sal::Driver *driver,
+                                   RGWRados *rados,
+                                   disk_record_t *p_rec)
+ {
+   librados::IoCtx ioctx;
+   std::string oid;
+   int ret = get_ioctx(dpp, driver, rados, p_rec, &ioctx, &oid);
+   if (unlikely(ret != 0)) {
+     ldpp_dout(dpp, 5) << __func__ << "::ERR: failed get_ioctx()" << dendl;
+     return ret;
+   }
+
+   // fetch all xattrs in one rados round-trip, then pick the two we need
+   std::map<std::string, bufferlist> attrset;
+   ret = ioctx.getxattrs(oid, attrset);
+   if (unlikely(ret < 0)) {
+     ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ioctx.getxattrs("
+                       << oid << "), err is " << cpp_strerror(-ret) << dendl;
+     return ret;
+   }
+
+   auto itr = attrset.find(RGW_ATTR_BLAKE3);
+   if (itr != attrset.end()) {
+     try {
+       auto bl_iter = itr->second.cbegin();
+       // BLAKE3 hash has 256 bit splitted into multiple 64bit units
+       for (unsigned i = 0; i < HASH_UNITS; i++) {
+         uint64_t val;
+         ceph::decode(val, bl_iter);
+         p_rec->s.hash[i] = val;
+       }
+       p_rec->s.flags.set_has_valid_hash();
+       // the hash was taken directly from the object attributes and not calculated
+       p_rec->s.flags.clear_hash_calculated();
+     } catch (buffer::error& err) {
+       ldpp_dout(dpp, 1) << __func__ << "::ERR: failed HASH decode" << dendl;
+       return -EINVAL;
+     }
+   }
+   else {
+     ldpp_dout(dpp, 1) << __func__ << "::ERR: No HASH attribute" << dendl;
+     return -ENOENT;
+   }
+
+   // keep the raw (still encoded) manifest bufferlist; callers decode it
+   itr = attrset.find(RGW_ATTR_MANIFEST);
+   if (itr != attrset.end()) {
+     ldpp_dout(dpp, 20) << __func__ << "::Got Manifest " << p_rec->obj_name << dendl;
+     p_rec->manifest_bl = itr->second;
+     p_rec->s.manifest_len = p_rec->manifest_bl.length();
+   }
+   else {
+     ldpp_dout(dpp, 1) << __func__ << "::ERR: No Manifest attribute" << dendl;
+     return -ENOENT;
+   }
+
+   return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ // Turn the manifest into an explicit-objects manifest: drop the head/prefix/
+ // rules based layout and install the supplied offset->part map.
+ static void set_explicit_manifest(RGWObjManifest *p_manifest,
+                                   std::map<uint64_t, RGWObjManifestPart> &objs_map)
+ {
+   // capture the logical object size before mutating the manifest
+   const uint64_t total_size = p_manifest->get_obj_size();
+   p_manifest->set_head_size(0);
+   p_manifest->set_max_head_size(0);
+   p_manifest->set_prefix("");
+   p_manifest->clear_rules();
+   p_manifest->set_explicit(total_size, objs_map);
+ }
+
+ //---------------------------------------------------------------------------
+ // This code is based on RGWObjManifest::convert_to_explicit()
+ // Build an explicit offset->part map for @manifest where the head entry
+ // (stripe offset 0) is redirected to a new tail object named @tail_name in
+ // @p_bucket; every other stripe keeps its existing location.
+ static void build_explicit_objs_map(const DoutPrefixProvider *dpp,
+                                     RGWRados *rados,
+                                     const RGWObjManifest &manifest,
+                                     const rgw_bucket *p_bucket,
+                                     std::map<uint64_t, RGWObjManifestPart> *p_objs_map,
+                                     const std::string &tail_name,
+                                     md5_stats_t *p_stats)
+ {
+   bool manifest_raw_obj_logged = false;
+   unsigned idx = 0;
+   auto p = manifest.obj_begin(dpp);
+   while (p != manifest.obj_end(dpp)) {
+     const uint64_t offset = p.get_stripe_ofs();
+     const rgw_obj_select& os = p.get_location();
+     ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"]OBJ: "
+                        << os.get_raw_obj(rados).oid << "::ofs=" << p.get_ofs()
+                        << "::strp_offset=" << offset << dendl;
+
+     // operator[] creates the part entry keyed by this stripe's offset
+     RGWObjManifestPart& part = (*p_objs_map)[offset];
+     part.loc_ofs = 0;
+
+     if (offset == 0) {
+       // head stripe: point it at the newly created split-head tail object
+       ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] HEAD OBJ: "
+                          << os.get_raw_obj(rados).oid << dendl;
+       const rgw_obj &head_obj = manifest.get_obj();
+       const rgw_obj_key &head_key = head_obj.key;
+       // TBD: Can we have different instance/ns values for head/tail ??
+       // Should we take the instance/ns from the head or tail?
+       // Maybe should refuse objects with different instance/ns on head/tail ?
+       rgw_obj_key tail_key(tail_name, head_key.instance, head_key.ns);
+       rgw_obj tail_obj(*p_bucket, tail_key);
+       part.loc = tail_obj;
+     }
+     else {
+       // RGWObjManifest::convert_to_explicit() is assuming raw_obj, but looking
+       // at the RGWObjManifest::obj_iterator code it is clear the obj is not raw.
+       // If it happens to be raw we still handle it correctly (and inc stat-count)
+       std::optional<rgw_obj> obj_opt = os.get_head_obj();
+       if (obj_opt.has_value()) {
+         part.loc = obj_opt.value();
+       }
+       else {
+         // report raw object in manifest only once
+         if (!manifest_raw_obj_logged) {
+           manifest_raw_obj_logged = true;
+           ldpp_dout(dpp, 10) << __func__ << "::WARN: obj is_raw" << dendl;
+           p_stats->manifest_raw_obj++;
+         }
+         const rgw_raw_obj& raw = os.get_raw_obj(rados);
+         RGWSI_Tier_RADOS::raw_obj_to_obj(*p_bucket, raw, &part.loc);
+       }
+     }
+
+     ++p;
+     // NOTE(review): for the last stripe p is now obj_end(); this relies on the
+     // end iterator's get_stripe_ofs() reporting the total object size so the
+     // final part gets a correct size (same pattern as convert_to_explicit())
+     // - TODO confirm
+     uint64_t next_offset = p.get_stripe_ofs();
+     part.size = next_offset - offset;
+     idx++;
+   } // while (p != manifest.obj_end())
+ }
+
+ //---------------------------------------------------------------------------
+ // Split the SRC head-object: copy its data into a newly created tail object
+ // (name returned via tail_oid) and rewrite src_manifest as an explicit
+ // manifest whose first part points at that tail object. On success the
+ // record's split_head flag is set and its manifest_bl is replaced with the
+ // new encoding. Returns 0 on success, -ECANCELED on SRC/TGT strong-hash
+ // mismatch, or a negative error from the rados/ioctx calls.
+ int Background::split_head_object(disk_record_t *p_src_rec, // IN-OUT PARAM
+                                   RGWObjManifest &src_manifest, // IN/OUT PARAM
+                                   const disk_record_t *p_tgt_rec,
+                                   std::string &tail_oid, // OUT PARAM
+                                   md5_stats_t *p_stats)
+ {
+   ldpp_dout(dpp, 20) << __func__ << "::" << p_src_rec->obj_name << "::"
+                      << p_src_rec->s.obj_bytes_size << dendl;
+
+   uint64_t head_size = src_manifest.get_head_size();
+   bufferlist bl;
+   std::string head_oid;
+   librados::IoCtx ioctx;
+   int ret = get_ioctx(dpp, driver, rados, p_src_rec, &ioctx, &head_oid);
+   if (unlikely(ret != 0)) {
+     ldpp_dout(dpp, 1) << __func__ << "::ERR: failed get_ioctx()" << dendl;
+     return ret;
+   }
+
+   // read the full rados head-object
+   ldpp_dout(dpp, 20) << __func__ << "::ioctx.read(" << head_oid << ")" << dendl;
+   ret = ioctx.read(head_oid, bl, 0, 0);
+   // a read shorter/longer than the manifest's head_size means the on-disk
+   // head object does not match the manifest -> bail out
+   if (unlikely(ret != (int)head_size)) {
+     ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to read " << head_oid
+                       << ", ret=" << ret << ", error is " << cpp_strerror(-ret) << dendl;
+     return ret;
+   }
+
+   // we might have a valid hash left from a failed dedup (mismatch SRC/TGT)
+   if (!p_src_rec->s.flags.has_valid_hash()) {
+     ldpp_dout(dpp, 20) << __func__ << "::calc BLK3 for SRC "
+                        << p_src_rec->obj_name << dendl;
+     // seed the hasher with the head data we just read so calc_object_blake3()
+     // can skip re-reading the head (it accepts a pre-calculated hasher)
+     blake3_hasher hmac;
+     blake3_hasher_init(&hmac);
+     for (const auto& bptr : bl.buffers()) {
+       blake3_hasher_update(&hmac, (const unsigned char *)bptr.c_str(),
+                            bptr.length());
+     }
+     uint8_t *p_hash = (uint8_t*)p_src_rec->s.hash;
+     ret = calc_object_blake3(src_manifest, p_src_rec, p_hash, &hmac);
+     if (unlikely(ret != 0)) {
+       return ret;
+     }
+
+     // cancel split-head operation if strong hash differ
+     if (unlikely(!compare_strong_hash(dpp, p_src_rec, p_tgt_rec, p_stats))) {
+       return -ECANCELED;
+     }
+   }
+
+   bool exclusive = true; // block overwrite
+   std::string tail_name = generate_split_head_tail_name(src_manifest);
+   const rgw_bucket_placement &tail_placement = src_manifest.get_tail_placement();
+   // Tail placement_rule was fixed before committed to SLAB, if looks bad -> abort
+   if (unlikely(invalid_tail_placement(tail_placement))) {
+     p_stats->split_head_no_tail_placement++;
+     ldpp_dout(dpp, 1) << __func__ << "::invalid_tail_placement -> abort" << dendl;
+     return -EINVAL;
+   }
+
+   const rgw_bucket *p_bucket = &tail_placement.bucket;
+   // tail objects might be on another storage_class/pool, need another ioctx
+   librados::IoCtx tail_ioctx;
+   ret = get_ioctx_internal(dpp, driver, store, tail_name, p_src_rec->instance,
+                            *p_bucket, &tail_ioctx, &tail_oid);
+   if (unlikely(ret != 0)) {
+     ldpp_dout(dpp, 1) << __func__ << "::ERR: failed get_ioctx_internal()" << dendl;
+     return ret;
+   }
+
+   // exclusive create so we never silently overwrite an existing tail object
+   ret = tail_ioctx.create(tail_oid, exclusive);
+   if (ret == 0) {
+     ldpp_dout(dpp, 20) << __func__ << "::successfully created: " << tail_oid << dendl;
+   }
+   else if (ret == -EEXIST) {
+     // should not happen as we take the prefix with unused counter 0
+     // better to skip this dedup opportunity
+     ldpp_dout(dpp, 1) << __func__ << "::ERR object " << tail_oid << " exists!" << dendl;
+     p_stats->failed_split_head_creat++;
+     return ret;
+   }
+   else{
+     ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to create " << tail_oid
+                       <<" with: "<< cpp_strerror(-ret) << ", ret=" << ret <<dendl;
+     return ret;
+   }
+
+   ret = tail_ioctx.write_full(tail_oid, bl);
+   if (unlikely(ret < 0)) {
+     ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to write " << tail_oid
+                       << " with: " << cpp_strerror(-ret) << dendl;
+     // don't leave orphan object behind
+     tail_ioctx.remove(tail_oid);
+     return ret;
+   }
+   else {
+     ldpp_dout(dpp, 20) << __func__ << "::wrote tail obj:" << tail_oid << "::ret="
+                        << ret << dendl;
+   }
+
+   // rewrite the manifest: head stripe now points at the new tail object
+   std::map<uint64_t, RGWObjManifestPart> objs_map;
+   build_explicit_objs_map(dpp, rados, src_manifest, p_bucket, &objs_map,
+                           tail_name, p_stats);
+   set_explicit_manifest(&src_manifest, objs_map);
+
+   bufferlist manifest_bl;
+   encode(src_manifest, manifest_bl);
+   p_src_rec->manifest_bl = manifest_bl;
+   p_src_rec->s.manifest_len = p_src_rec->manifest_bl.length();
+   p_src_rec->s.flags.set_split_head();
+   // ret here is write_full()'s result (presumably 0 on success)
+   return ret;
+ }
+
+ //---------------------------------------------------------------------------
+ // Ensure both SRC and TGT records carry a valid strong (BLAKE3) hash and
+ // that the hashes match, splitting the SRC head-object when it qualifies.
+ // Returns true when dedup may proceed, false to skip this SRC/TGT pair.
+ // tail_oid is filled (by split_head_object()) only on the split-head path.
+ bool Background::check_and_set_strong_hash(disk_record_t *p_src_rec,
+                                            disk_record_t *p_tgt_rec,
+                                            RGWObjManifest &src_manifest,
+                                            const RGWObjManifest &tgt_manifest,
+                                            const dedup_table_t::value_t *p_src_val,
+                                            std::string &tail_oid, // OUT PARAM
+                                            md5_stats_t *p_stats)
+ {
+   int ret = 0;
+   // if we don't have a valid strong hash already -> read data and calculate it!
+   if (!p_tgt_rec->s.flags.has_valid_hash()) {
+     ldpp_dout(dpp, 20) << __func__ << "::CALC TGT Strong Hash::"
+                        << p_tgt_rec->obj_name << dendl;
+     ret = calc_object_blake3(tgt_manifest, p_tgt_rec, (uint8_t*)p_tgt_rec->s.hash);
+     if (unlikely(ret != 0)) {
+       // Don't run dedup without a valid strong hash
+       return false;
+     }
+   }
+
+   // SRC hash could have been calculated and stored in obj-attributes before
+   // (will happen when we got multiple targets)
+   if (!p_src_rec->s.flags.has_valid_hash() && p_src_val->has_valid_hash()) {
+     // read the manifest and strong hash from the head-object attributes
+     ldpp_dout(dpp, 20) << __func__ << "::Fetch SRC strong hash from head-object::"
+                        << p_src_rec->obj_name << dendl;
+     if (unlikely(read_hash_and_manifest(dpp, driver, rados, p_src_rec) != 0)) {
+       return false;
+     }
+     // re-decode src_manifest since manifest_bl was just replaced from xattrs
+     try {
+       auto bl_iter = p_src_rec->manifest_bl.cbegin();
+       decode(src_manifest, bl_iter);
+     } catch (buffer::error& err) {
+       ldpp_dout(dpp, 1) << __func__ << "::ERR: failed manifest decode" << dendl;
+       return false;
+     }
+   }
+
+   // check hash before trying to split head (can be skipped if not equal)
+   if (p_src_rec->s.flags.has_valid_hash()) {
+     if (unlikely(!compare_strong_hash(dpp, p_src_rec, p_tgt_rec, p_stats))) {
+       return false;
+     }
+   }
+
+   // we might still need to split-head here when hash is valid
+   // can happen if we failed compare before (md5-collison) and stored the src hash
+   // in the obj-attributes
+   uint64_t head_size = src_manifest.get_head_size();
+   if (should_split_head(head_size, src_manifest.get_obj_size())) {
+     ret = split_head_object(p_src_rec, src_manifest, p_tgt_rec, tail_oid, p_stats);
+     // compare_strong_hash() is called internally by split_head_object()
+     return (ret == 0);
+   }
+   else if (!p_src_rec->s.flags.has_valid_hash()) {
+     // object not targeted for split_head it should have a valid hash -> skip it
+     ldpp_dout(dpp, 5) << __func__
+                       << "::ERR: object not targeted for split_head has no hash" << dendl;
+     p_stats->invalid_hash_no_split_head++;
+     return false;
+   }
+
+   return true;
+ }
+
+ //---------------------------------------------------------------------------
+ // Decode the SRC and TGT manifests from the bufferlists stored on the records.
+ // Returns 0 on success and -EINVAL when either manifest fails to decode.
+ // NOTE: was declared 'bool' while returning -EINVAL/0 error codes; the
+ // implicit bool conversion turned -EINVAL into 'true' and lost the errno.
+ static int parse_manifests(const DoutPrefixProvider *dpp,
+                            const disk_record_t *p_src_rec,
+                            const disk_record_t *p_tgt_rec,
+                            RGWObjManifest *p_src_manifest,
+                            RGWObjManifest *p_tgt_manifest)
+ {
+   // track which decode we are in so the error message names the bad manifest
+   bool valid_src_manifest = false;
+   try {
+     auto bl_iter = p_src_rec->manifest_bl.cbegin();
+     decode(*p_src_manifest, bl_iter);
+     valid_src_manifest = true;
+     bl_iter = p_tgt_rec->manifest_bl.cbegin();
+     decode(*p_tgt_manifest, bl_iter);
+   } catch (buffer::error& err) {
+     ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad "
+                       << (valid_src_manifest? "TGT" : "SRC")
+                       << " manifest" << dendl;
+     return -EINVAL;
+   }
+
+   return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ // Detect tail-objects referenced by BOTH the SRC and TGT manifests.
+ // The only case leading to this scenario is server-side-copy; deduping such
+ // a pair must be aborted, so return true when any shared tail-oid is found.
+ // We collect every SRC tail-oid and probe the list with each TGT tail-oid.
+ // It is probably enough to scan the first few tail-objects, but better safe...
+ static bool has_shared_tail_objects(const DoutPrefixProvider *dpp,
+                                     RGWRados *rados,
+                                     const disk_record_t *p_src_rec,
+                                     const disk_record_t *p_tgt_rec,
+                                     const RGWObjManifest &src_manifest,
+                                     const RGWObjManifest &tgt_manifest,
+                                     md5_stats_t *p_stats)
+ {
+   std::string src_oid = build_oid(p_src_rec->bucket_id, p_src_rec->obj_name);
+   std::string tgt_oid = build_oid(p_tgt_rec->bucket_id, p_tgt_rec->obj_name);
+   std::vector<std::string> src_tail_oids;
+   unsigned idx = 0;
+   for (auto p = src_manifest.obj_begin(dpp); p != src_manifest.obj_end(dpp); ++p, ++idx) {
+     rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
+     if (src_oid == raw_obj.oid) {
+       // the head object never holds shared tail data -> skip it
+       ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: "
+                          << raw_obj.oid << dendl;
+       continue;
+     }
+     ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"]" << raw_obj.oid << dendl;
+     src_tail_oids.push_back(raw_obj.oid);
+   }
+
+   idx = 0;
+   for (auto p = tgt_manifest.obj_begin(dpp); p != tgt_manifest.obj_end(dpp); ++p, ++idx) {
+     rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
+     if (tgt_oid == raw_obj.oid) {
+       ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: "
+                          << raw_obj.oid << dendl;
+       continue;
+     }
+     ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"]" << raw_obj.oid << dendl;
+     // a shared tail-oid should be one of the first entries (first or second)
+     auto itr = std::find(src_tail_oids.begin(), src_tail_oids.end(), raw_obj.oid);
+     if (unlikely(itr != src_tail_oids.end())) {
+       ldpp_dout(dpp, 10) << __func__ << "::tail obj " << raw_obj.oid
+                          << " exists on both SRC and TGT Objects -> Abort DEDUP!"<< dendl;
+       p_stats->skip_shared_tail_objs++;
+       return true;
+     }
+   }
+
+   return false;
+ }
+
//---------------------------------------------------------------------------
// We purged all entries not marked for-dedup (i.e. singleton bit is set) from the table
// so all entries left are sources of dedup with multiple copies.
// we can withstand most errors moving to the next object
// only report an error if we recived a stop scan request!
//
- int Background::try_deduping_record(dedup_table_t *p_table,
- const disk_record_t *p_tgt_rec,
- disk_block_id_t block_id,
- record_id_t rec_id,
- md5_shard_t md5_shard,
- md5_stats_t *p_stats, /* IN-OUT */
- remapper_t *remapper)
- {
- bool should_print_debug = cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>();
+ int Background::try_deduping_record(dedup_table_t *p_table,
+ disk_record_t *p_tgt_rec,
+ disk_block_id_t block_id,
+ record_id_t rec_id,
+ md5_shard_t md5_shard,
+ md5_stats_t *p_stats, /* IN-OUT */
+ remapper_t *remapper)
+ {
+ bool should_print_debug = cct->_conf->subsys.should_gather<ceph_subsys_rgw_dedup, 20>();
if (unlikely(should_print_debug)) {
print_record(dpp, p_tgt_rec, block_id, rec_id, md5_shard);
}
-
uint32_t size_4k_units = byte_size_to_disk_blocks(p_tgt_rec->s.obj_bytes_size);
storage_class_idx_t sc_idx = remapper->remap(p_tgt_rec->stor_class, dpp,
&p_stats->failed_map_overflow);
- ceph_assert(sc_idx != remapper_t::NULL_IDX);
+ if (unlikely(sc_idx == remapper_t::NULL_IDX)) {
+ ldpp_dout(dpp, 5) << __func__ << "::invalid_storage_class_mapping for "
+ << p_tgt_rec->stor_class << "::" << p_tgt_rec->obj_name << dendl;
+ p_stats->invalid_storage_class_mapping++;
+ return 0;
+ }
key_t key(p_tgt_rec->s.md5_high, p_tgt_rec->s.md5_low, size_4k_units,
p_tgt_rec->s.num_parts, sc_idx);
dedup_table_t::value_t src_val;
int ret = p_table->get_val(&key, &src_val);
- if (ret != 0) {
+ if (unlikely(ret != 0)) {
// record has no valid entry in table because it is a singleton
// should never happened since we purged all singletons before
ldpp_dout(dpp, 5) << __func__ << "::skipped singleton::" << p_tgt_rec->bucket_name
<< "/" << p_tgt_rec->obj_name << "::num_parts=" << p_tgt_rec->s.num_parts
<< "::ETAG=" << std::hex << p_tgt_rec->s.md5_high
<< p_tgt_rec->s.md5_low << std::dec << dendl;
- ceph_abort("Unexpcted singleton");
+ p_stats->singleton_after_purge++;
return 0;
}
if (block_id == src_block_id && rec_id == src_rec_id) {
// the table entry point to this record which means it is a dedup source so nothing to do
p_stats->skipped_source_record++;
- ldpp_dout(dpp, 20) << __func__ << "::skipped source-record" << dendl;
+ ldpp_dout(dpp, 20) << __func__ << "::(2)skipped source-record, block_id="
+ << block_id << "::rec_id=" << (int)rec_id << dendl;
return 0;
}
- // ceph store full blocks so need to round up and multiply by block_size
- uint64_t ondisk_byte_size = disk_blocks_to_byte_size(size_4k_units);
- uint64_t dedupable_objects_bytes = calc_deduped_bytes(d_head_object_size,
- p_tgt_rec->s.num_parts,
- ondisk_byte_size);
+ // should never happen
if (p_tgt_rec->s.flags.has_shared_manifest()) {
// record holds a shared_manifest object so can't be a dedup target
- p_stats->skipped_shared_manifest++;
- p_stats->shared_manifest_dedup_bytes += dedupable_objects_bytes;
- ldpp_dout(dpp, 20) << __func__ << "::skipped shared_manifest" << dendl;
+ ldpp_dout(dpp, 1) << __func__ << "::(3)skipped shared_manifest, block_id="
+ << block_id << "::rec_id=" << (int)rec_id << dendl;
+ p_stats->shared_manifest_after_purge++;
return 0;
}
+ // ceph store full blocks so need to round up and multiply by block_size
+ uint64_t ondisk_byte_size = disk_blocks_to_byte_size(size_4k_units);
+ uint64_t dedupable_objects_bytes = __calc_deduped_bytes(p_tgt_rec->s.num_parts,
+ ondisk_byte_size);
+
// This records is a dedup target with source record on source_block_id
- disk_record_t src_rec;
- ret = load_record(d_dedup_cluster_ioctx, p_tgt_rec, &src_rec, src_block_id,
+ disk_record_t src_rec, *p_src_rec = &src_rec;
+ ret = load_record(d_dedup_cluster_ioctx, p_tgt_rec, p_src_rec, src_block_id,
src_rec_id, md5_shard, dpp);
if (unlikely(ret != 0)) {
p_stats->failed_src_load++;
// we can withstand most errors moving to the next object
ldpp_dout(dpp, 5) << __func__ << "::ERR: Failed load_record("
- << src_block_id << ", " << src_rec_id << ")" << dendl;
+ << src_block_id << ", " << (int)src_rec_id << ")" << dendl;
return 0;
}
- ldpp_dout(dpp, 20) << __func__ << "::SRC=" << src_rec.bucket_name
- << "/" << src_rec.obj_name << dendl;
+ ldpp_dout(dpp, 20) << __func__ << "::SRC:" << p_src_rec->bucket_name << "/"
+ << p_src_rec->obj_name << "::TGT:" << p_tgt_rec->bucket_name
+ << "/" << p_tgt_rec->obj_name << dendl;
// verify that SRC and TGT records don't refer to the same physical object
// This could happen in theory if we read the same objects twice
- if (src_rec.ref_tag == p_tgt_rec->ref_tag) {
+ if (p_src_rec->ref_tag == p_tgt_rec->ref_tag) {
p_stats->duplicate_records++;
ldpp_dout(dpp, 10) << __func__ << "::WARN::REF_TAG::Duplicate records for "
- << src_rec.obj_name << "::" << src_rec.ref_tag << "::"
+ << p_src_rec->obj_name << "::" << p_src_rec->ref_tag <<"::"
<< p_tgt_rec->obj_name << dendl;
return 0;
}
// the hash table size is rounded to the nearest 4KB and will wrap after 16G
- if (unlikely(src_rec.s.obj_bytes_size != p_tgt_rec->s.obj_bytes_size)) {
+ if (unlikely(p_src_rec->s.obj_bytes_size != p_tgt_rec->s.obj_bytes_size)) {
p_stats->size_mismatch++;
ldpp_dout(dpp, 10) << __func__ << "::WARN: different byte size for objects::"
- << src_rec.obj_name << "::" << src_rec.s.obj_bytes_size
+ << p_src_rec->obj_name << "::" << p_src_rec->s.obj_bytes_size
<< "::" << p_tgt_rec->obj_name << "::"
<< p_tgt_rec->s.obj_bytes_size << dendl;
return 0;
}
- if (memcmp(src_rec.s.hash, p_tgt_rec->s.hash, sizeof(src_rec.s.hash)) != 0) {
- p_stats->hash_mismatch++;
- ldpp_dout(dpp, 10) << __func__ << "::HASH mismatch" << dendl;
- // TBD: set hash attributes on head objects to save calc next time
- if (src_rec.s.flags.hash_calculated()) {
- write_blake3_object_attribute(dpp, driver, store, &src_rec);
- p_stats->set_hash_attrs++;
+ ret = parse_manifests(dpp, p_src_rec, p_tgt_rec, &src_manifest, &tgt_manifest);
+ if (unlikely(ret != 0)) {
+ return 0;
+ }
+
+ // make sure objects were not created by server-side-copy
+ if (unlikely(has_shared_tail_objects(dpp, rados, p_src_rec, p_tgt_rec, src_manifest, tgt_manifest, p_stats))) {
+ return 0;
+ }
+
+
+ std::string tail_oid;
+ bool success = check_and_set_strong_hash(p_src_rec, p_tgt_rec, src_manifest,
+ tgt_manifest, &src_val, tail_oid, p_stats);
+ if (unlikely(!success)) {
+ if (p_src_rec->s.flags.hash_calculated() && !src_val.has_valid_hash()) {
+ // set hash attributes on head objects to save calc next time
+ ldpp_dout(dpp, 20) << __func__ <<"::failed: store valid SRC hash" << dendl;
+ ret = write_hash_object_attribute(dpp, driver, store, p_src_rec, p_stats);
+ if (ret == 0) {
+ ldpp_dout(dpp, 20) << __func__ <<"::mark valid_hash in table" << dendl;
+ p_table->set_src_mode(&key, src_block_id, src_rec_id, false, true);
+ }
}
if (p_tgt_rec->s.flags.hash_calculated()) {
- write_blake3_object_attribute(dpp, driver, store, p_tgt_rec);
- p_stats->set_hash_attrs++;
+ ldpp_dout(dpp, 20) << __func__ <<"::failed: store valid TGT hash" << dendl;
+ write_hash_object_attribute(dpp, driver, store, p_tgt_rec, p_stats);
}
return 0;
}
- ret = dedup_object(&src_rec, p_tgt_rec, p_stats, src_val.has_shared_manifest());
+ ret = dedup_object(p_src_rec, p_tgt_rec, src_manifest, tgt_manifest, p_stats,
+ &src_val, tail_oid);
if (ret == 0) {
+ ldpp_dout(dpp, 20) << __func__ << "::dedup success " << p_src_rec->obj_name << dendl;
p_stats->deduped_objects++;
p_stats->deduped_objects_bytes += dedupable_objects_bytes;
- if (p_tgt_rec->s.num_parts == 0) {
+ if (p_tgt_rec->s.flags.is_split_head()) {
+ ldpp_dout(dpp, 20) << __func__ <<"::TGT-Split: dedup_bytes="
+ << ondisk_byte_size << dendl;
+ p_stats->split_head_dedup_bytes += ondisk_byte_size;
+ }
+ else if (p_tgt_rec->s.num_parts == 0 &&
+ // if we don't split head it will be duplicated
+ p_tgt_rec->s.obj_bytes_size > d_head_object_size) {
// single part objects duplicate the head object when dedup is used
p_stats->dup_head_bytes += d_head_object_size;
}
// mark the SRC object as a providor of a shared manifest
if (!src_val.has_shared_manifest()) {
+ ldpp_dout(dpp, 20) << __func__ << "::mark shared_manifest+valid_hash"<< dendl;
p_stats->set_shared_manifest_src++;
- // set the shared manifest flag in the dedup table
- p_table->set_shared_manifest_src_mode(&key, src_block_id, src_rec_id);
+ // We always set strong hash on SRC during dedup so mark in table!
+ p_table->set_src_mode(&key, src_block_id, src_rec_id, true, true);
}
else {
ldpp_dout(dpp, 20) << __func__ << "::SRC object already marked as shared_manifest" << dendl;
}
else {
ldpp_dout(dpp, 10) << __func__ << "::ERR: Failed dedup for "
- << src_rec.bucket_name << "/" << src_rec.obj_name << dendl;
+ << p_src_rec->bucket_name << "/" << p_src_rec->obj_name << dendl;
p_stats->failed_dedup++;
}
}
has_more = (p_header->offset == BLOCK_MAGIC);
- ceph_assert(p_header->offset == BLOCK_MAGIC || p_header->offset == LAST_BLOCK_MAGIC);
if (!has_more) {
ldpp_dout(dpp, 20) << __func__ << "::No more blocks! block_id=" << disk_block_id
<< ", rec_count=" << p_header->rec_count << dendl;
+ if (unlikely(p_header->offset != LAST_BLOCK_MAGIC)) {
+ p_stats->missing_last_block_marker++;
+ }
break;
}
}
p_worker_stats->ingress_obj_bytes += ondisk_byte_size;
// We limit dedup to objects from the same storage_class
- // TBD:
+ // TBD-Future:
// Should we use a skip-list of storage_classes we should skip (like glacier) ?
const std::string& storage_class =
rgw_placement_rule::get_canonical_storage_class(entry.meta.storage_class);
p_worker_stats->non_default_storage_class_objs_bytes += ondisk_byte_size;
}
- if (ondisk_byte_size <= d_min_obj_size_for_dedup) {
+ if (ondisk_byte_size < d_min_obj_size_for_dedup) {
if (parsed_etag.num_parts == 0) {
// dedup only useful for objects bigger than 4MB
p_worker_stats->ingress_skip_too_small++;
// make sure that the standard storage_class is always in the mapper!
storage_class_idx_t sc_idx = remapper.remap(RGW_STORAGE_CLASS_STANDARD, dpp,
&p_stats->failed_map_overflow);
- ceph_assert(sc_idx == 0);
+ ceph_assert(sc_idx != remapper_t::NULL_IDX);
uint32_t slab_count_arr[num_work_shards];
// first load all etags to hashtable to find dedups
// the entries come from bucket-index and got minimal info (etag, size)
utime_t start_time = ceph_clock_now();
md5_stats_t md5_stats;
//DEDUP_DYN_ALLOC
- dedup_table_t table(dpp, d_head_object_size, raw_mem, raw_mem_size);
+ dedup_table_t table(dpp, d_head_object_size, d_min_obj_size_for_dedup,
+ d_max_obj_size_for_split, raw_mem, raw_mem_size);
int ret = objects_dedup_single_md5_shard(&table, md5_shard, &md5_stats, num_work_shards);
if (ret == 0) {
md5_stats.duration = ceph_clock_now() - start_time;
ldpp_dout(dpp, 10) <<__func__ << "::" << *p_epoch << dendl;
d_ctl.dedup_type = p_epoch->dedup_type;
+ // TBD: replace with a stat-counter
#ifdef FULL_DEDUP_SUPPORT
ceph_assert(d_ctl.dedup_type == dedup_req_type_t::DEDUP_TYPE_EXEC ||
d_ctl.dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE);
}
d_cond.wait(cond_lock, [this]{return d_ctl.remote_restart_req || d_ctl.should_stop() || d_ctl.should_pause();});
if (!d_ctl.should_stop() && !d_ctl.should_pause()) {
- // TBD: should we release lock here ???
if (d_cluster.can_start_new_scan(store)) {
d_ctl.dedup_exec = true;
d_ctl.remote_aborted = false;
STEP_REMOVE_DUPLICATES
};
+ inline uint64_t __calc_deduped_bytes(uint16_t num_parts, uint64_t size_bytes);
+ inline bool should_split_head(uint64_t head_size, uint64_t obj_size);
void run();
int setup(struct dedup_epoch_t*);
void work_shards_barrier(work_shard_t num_work_shards);
remapper_t *remapper);
#ifdef FULL_DEDUP_SUPPORT
- int calc_object_blake3(const disk_record_t *p_rec, uint8_t *p_hash);
- int add_obj_attrs_to_record(rgw_bucket *p_rb,
- disk_record_t *p_rec,
+ int calc_object_blake3(const RGWObjManifest &manifest,
+ disk_record_t *p_rec,
+ uint8_t *p_hash,
+ blake3_hasher *p_pre_calc_hmac = nullptr);
+ int split_head_object(disk_record_t *p_src_rec, // IN/OUT PARAM
+ RGWObjManifest &src_manifest, // IN/OUT PARAM
+ const disk_record_t *p_tgt_rec,
+ std::string &tail_oid, // OUT PARAM
+ md5_stats_t *p_stats);
+
+ int add_obj_attrs_to_record(disk_record_t *p_rec,
const rgw::sal::Attrs &attrs,
- dedup_table_t *p_table,
md5_stats_t *p_stats); /* IN-OUT */
int read_object_attribute(dedup_table_t *p_table,
md5_stats_t *p_stats /* IN-OUT */,
disk_block_seq_t *p_disk,
remapper_t *remapper);
- int try_deduping_record(dedup_table_t *p_table,
- const disk_record_t *p_rec,
- disk_block_id_t block_id,
- record_id_t rec_id,
- md5_shard_t md5_shard,
- md5_stats_t *p_stats, /* IN-OUT */
- remapper_t *remapper);
- int inc_ref_count_by_manifest(const std::string &ref_tag,
- const std::string &oid,
- RGWObjManifest &manifest);
- int rollback_ref_by_manifest(const std::string &ref_tag,
- const std::string &oid,
- RGWObjManifest &tgt_manifest);
- int free_tail_objs_by_manifest(const std::string &ref_tag,
- const std::string &oid,
- RGWObjManifest &tgt_manifest);
- int dedup_object(const disk_record_t *p_src_rec,
- const disk_record_t *p_tgt_rec,
- md5_stats_t *p_stats,
- bool is_shared_manifest_src);
+ bool check_and_set_strong_hash(disk_record_t *p_src_rec, // IN/OUT PARAM
+ disk_record_t *p_tgt_rec, // IN/OUT PARAM
+ RGWObjManifest &src_manifest,
+ const RGWObjManifest &tgt_manifest,
+ const dedup_table_t::value_t *p_src_val,
+ std::string &tail_oid, // OUT PARAM
+ md5_stats_t *p_stats);
+ int try_deduping_record(dedup_table_t *p_table,
+ disk_record_t *p_rec,
+ disk_block_id_t block_id,
+ record_id_t rec_id,
+ md5_shard_t md5_shard,
+ md5_stats_t *p_stats, /* IN-OUT */
+ remapper_t *remapper);
+ int inc_ref_count_by_manifest(const std::string &ref_tag,
+ const std::string &oid,
+ const RGWObjManifest &manifest);
+ int rollback_ref_by_manifest(const std::string &ref_tag,
+ const std::string &oid,
+ const RGWObjManifest &tgt_manifest);
+ int free_tail_objs_by_manifest(const std::string &ref_tag,
+ const std::string &oid,
+ const RGWObjManifest &tgt_manifest);
+ int dedup_object(disk_record_t *p_src_rec,
+ disk_record_t *p_tgt_rec,
+ const RGWObjManifest &src_manifest,
+ const RGWObjManifest &tgt_manifest,
+ md5_stats_t *p_stats,
+ const dedup_table_t::value_t *p_src_val,
+ const std::string &tail_oid);
#endif
int remove_slabs(unsigned worker_id, unsigned md5_shard, uint32_t slab_count);
int init_rados_access_handles(bool init_pool);
unsigned d_heart_beat_max_elapsed_sec;
uint64_t d_all_buckets_obj_count = 0;
uint64_t d_all_buckets_obj_size = 0;
- // we don't benefit from deduping RGW objects smaller than head-object size
- uint32_t d_min_obj_size_for_dedup = (4ULL * 1024 * 1024);
+
+ uint32_t d_min_obj_size_for_dedup = (64ULL * 1024);
+ uint32_t d_max_obj_size_for_split = (16ULL * 1024 * 1024);
uint32_t d_head_object_size = (4ULL * 1024 * 1024);
control_t d_ctl;
uint64_t d_watch_handle = 0;
ldpp_dout(dpp, 10) << __func__ << "::oid=" << oid << dendl;
bool exclusive = true; // block overwrite of old objects
ret = ctl_ioctx.create(oid, exclusive);
- if (ret >= 0) {
+ if (ret == 0) {
ldpp_dout(dpp, 10) << __func__ << "::successfully created Epoch object!" << dendl;
// now try and take ownership
}
ldpp_dout(dpp, 15) << __func__ << "::creating object: " << oid << dendl;
bool exclusive = true;
ret = ctl_ioctx.create(oid, exclusive);
- if (ret >= 0) {
+ if (ret == 0) {
ldpp_dout(dpp, 15) << __func__ << "::oid=" << oid << " was created!" << dendl;
}
else if (ret == -EEXIST) {
// create the object to watch (object may already exist)
bool exclusive = true;
ret = ctl_ioctx.create(oid, exclusive);
- if (ret >= 0) {
+ if (ret == 0) {
ldpp_dout(dpp, 10) << "dedup_bg::watch_reload():" << oid
<< " was created!" << dendl;
}
else {
this->s.shared_manifest = CEPHTOH_64(p_rec->s.shared_manifest);
// BLAKE3 hash has 256 bit splitted into multiple 64bit units
- const unsigned units = (256 / (sizeof(uint64_t)*8));
- static_assert(units == 4);
- for (unsigned i = 0; i < units; i++) {
+ for (unsigned i = 0; i < HASH_UNITS; i++) {
this->s.hash[i] = CEPHTOH_64(p_rec->s.hash[i]);
}
this->ref_tag = std::string(p, this->s.ref_tag_len);
else {
p_rec->s.shared_manifest = HTOCEPH_64(this->s.shared_manifest);
// BLAKE3 hash has 256 bit splitted into multiple 64bit units
- const unsigned units = (256 / (sizeof(uint64_t)*8));
- static_assert(units == 4);
- for (unsigned i = 0; i < units; i++) {
+ for (unsigned i = 0; i < HASH_UNITS; i++) {
p_rec->s.hash[i] = HTOCEPH_64(this->s.hash[i]);
}
len = this->ref_tag.length();
{
// optimistic approach
if (likely((this->s.rec_version == 0) && (this->length() <= MAX_REC_SIZE))) {
- ldpp_dout(dpp, 20) << __func__ << "::success" << dendl;
+ ldpp_dout(dpp, 20) << caller << "::validate disk_record success" << dendl;
return 0;
}
stream << "MD5 = " << std::hex << rec.s.md5_high << rec.s.md5_low << "\n";
stream << "HASH = ";
// BLAKE3 hash has 256 bit splitted into multiple 64bit units
- const unsigned units = (256 / (sizeof(uint64_t)*8));
- static_assert(units == 4);
- for (unsigned i = 0; i < units; i++) {
+ for (unsigned i = 0; i < HASH_UNITS; i++) {
stream << rec.s.hash[i];
}
stream << "\n";
- if (rec.has_shared_manifest()) {
+ if (rec.s.flags.has_shared_manifest()) {
stream << "Shared Manifest Object\n";
}
else {
ceph_assert(bl.length());
int ret = ioctx.write_full(oid, bl);
- if (ret == (int)bl.length()) {
- ldpp_dout(dpp, 20) << __func__ << "::wrote " << bl.length() << " bytes to "
- << oid << dendl;
+ if (ret == 0) {
+ ldpp_dout(dpp, 20) << __func__ << "::SLAB was written successfully" << dendl;
}
else {
- if (ret == 0) {
- // no error reported, but we wrote nothing which should never happen
- ldpp_dout(dpp, 5) << __func__ << "::ERR: No Data was written to " << oid
- << ", bl.length()=" << bl.length() << dendl;
- ret = -ENODATA;
- }
ldpp_dout(dpp, 1) << "ERROR: failed to write " << oid
- << " with: " << cpp_strerror(-ret) << dendl;
+ << "::ret=" << ret << "::" << cpp_strerror(-ret) << dendl;
}
return ret;
#define HTOCEPH_32 htole32
#define HTOCEPH_64 htole64
- static inline constexpr unsigned DISK_BLOCK_SIZE = 8*1024;
+ static constexpr unsigned HASH_UNITS = BLAKE3_OUT_LEN/sizeof(uint64_t);
+ static constexpr unsigned DISK_BLOCK_SIZE = 8*1024;
// we use 16 bit offset
static_assert(DISK_BLOCK_SIZE < 64*1024);
static constexpr unsigned DISK_BLOCK_COUNT = 256;
uint32_t block_id;
};
+ // 1-Byte packed flag-set stored per on-disk dedup record (packed_rec_t::flags).
+ // Tracks hash state (valid / freshly calculated), shared-manifest marking,
+ // fastlane handling, split-head state and the origin of the ref-tag.
+ struct __attribute__ ((packed)) record_flags_t {
+ private:
+ static constexpr uint8_t RGW_RECORD_FLAG_HAS_VALID_HASH = 0x01;
+ static constexpr uint8_t RGW_RECORD_FLAG_SHARED_MANIFEST = 0x02;
+ static constexpr uint8_t RGW_RECORD_FLAG_HASH_CALCULATED = 0x04;
+ static constexpr uint8_t RGW_RECORD_FLAG_FASTLANE = 0x08;
+ static constexpr uint8_t RGW_RECORD_FLAG_SPLIT_HEAD = 0x10;
+ static constexpr uint8_t RGW_RECORD_FLAG_TAIL_REFTAG = 0x20;
+ public:
+ record_flags_t() : flags(0) {}
+ record_flags_t(uint8_t _flags) : flags(_flags) {}
+ inline void clear() { this->flags = 0; }
+ // hash_calculated: the hash was computed during this run (callers persist it
+ // via write_hash_object_attribute() "to save calc next time")
+ inline bool hash_calculated() const { return ((flags & RGW_RECORD_FLAG_HASH_CALCULATED) != 0); }
+ inline void set_hash_calculated() { flags |= RGW_RECORD_FLAG_HASH_CALCULATED; }
+ inline void clear_hash_calculated() { flags &= ~RGW_RECORD_FLAG_HASH_CALCULATED; }
+ inline bool has_valid_hash() const { return ((flags & RGW_RECORD_FLAG_HAS_VALID_HASH) != 0); }
+ inline void set_has_valid_hash() { flags |= RGW_RECORD_FLAG_HAS_VALID_HASH; }
+ inline bool has_shared_manifest() const { return ((flags & RGW_RECORD_FLAG_SHARED_MANIFEST) != 0); }
+ inline void set_shared_manifest() { flags |= RGW_RECORD_FLAG_SHARED_MANIFEST; }
+ inline bool is_fastlane() const { return ((flags & RGW_RECORD_FLAG_FASTLANE) != 0); }
+ inline void set_fastlane() { flags |= RGW_RECORD_FLAG_FASTLANE; }
+ // split-head: the head object was split into an attributes-only head plus a
+ // data-only tail object (see the Split Head Mode documentation)
+ inline bool is_split_head() const { return ((flags & RGW_RECORD_FLAG_SPLIT_HEAD) != 0); }
+ inline void set_split_head() { flags |= RGW_RECORD_FLAG_SPLIT_HEAD; }
+ // NOTE(review): presumably marks that the ref-tag was taken from a
+ // tail object rather than the head -- confirm against the setter's caller
+ inline bool is_ref_tag_from_tail() const { return ((flags & RGW_RECORD_FLAG_TAIL_REFTAG) != 0); }
+ inline void set_ref_tag_from_tail() { flags |= RGW_RECORD_FLAG_TAIL_REFTAG; }
+ private:
+ uint8_t flags;
+ };
+
struct disk_record_t
{
disk_record_t(const char *buff);
const DoutPrefixProvider* dpp,
disk_block_id_t block_id,
record_id_t rec_id) const;
- inline bool has_shared_manifest() const { return s.flags.has_shared_manifest(); }
- inline void set_shared_manifest() { s.flags.set_shared_manifest(); }
-
- struct __attribute__ ((packed)) packed_rec_t
+ inline bool multipart_object() { return (this->s.num_parts > 0); }
+ struct packed_rec_t
{
- uint8_t rec_version; // allows changing record format
- dedup_flags_t flags; // 1 Byte flags
- uint16_t num_parts; // For multipart upload (AWS MAX-PART is 10,000)
- uint16_t obj_name_len;
- uint16_t bucket_name_len;
-
+ uint64_t hash[4]; // 4 * 8 Bytes of HASH
+ uint64_t shared_manifest; // 64bit hash of the SRC object manifest
uint64_t md5_high; // High Bytes of the Object Data MD5
uint64_t md5_low; // Low Bytes of the Object Data MD5
uint64_t obj_bytes_size;
+ uint16_t num_parts; // For multipart upload (AWS MAX-PART is 10,000)
+ uint16_t obj_name_len;
+ uint16_t bucket_name_len;
uint16_t bucket_id_len;
+
uint16_t tenant_name_len;
uint16_t instance_len;
uint16_t stor_class_len;
uint16_t ref_tag_len;
-
uint16_t manifest_len;
- uint8_t pad[6];
- uint64_t shared_manifest; // 64bit hash of the SRC object manifest
- uint64_t hash[4]; // 4 * 8 Bytes of BLAKE3
+ uint8_t rec_version; // allows changing record format
+ record_flags_t flags; // 1 Byte flags
+ uint8_t pad[6];
}s;
std::string obj_name;
// TBD: find pool name making it easier to get ioctx
bufferlist manifest_bl;
};
static_assert(BLAKE3_OUT_LEN == sizeof(disk_record_t::packed_rec_t::hash));
+ static_assert(sizeof(disk_record_t::packed_rec_t) == sizeof(uint64_t)*12);
std::ostream &operator<<(std::ostream &stream, const disk_record_t & rec);
static constexpr unsigned BLOCK_MAGIC = 0xFACE;
//---------------------------------------------------------------------------
dedup_table_t::dedup_table_t(const DoutPrefixProvider* _dpp,
uint32_t _head_object_size,
+ uint32_t _min_obj_size_for_dedup,
+ uint32_t _max_obj_size_for_split,
uint8_t *p_slab,
uint64_t slab_size)
{
dpp = _dpp;
head_object_size = _head_object_size;
+ min_obj_size_for_dedup = _min_obj_size_for_dedup;
+ max_obj_size_for_split = _max_obj_size_for_split;
memset(p_slab, 0, slab_size);
hash_tab = (table_entry_t*)p_slab;
entries_count = slab_size/sizeof(table_entry_t);
const key_t &key = hash_tab[tab_idx].key;
// This is an approximation only since size is stored in 4KB resolution
uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units);
- if (!key.multipart_object() && (byte_size_approx <= head_object_size)) {
+ if (!dedupable_object(key.multipart_object(), min_obj_size_for_dedup, byte_size_approx)) {
hash_tab[tab_idx].val.clear_flags();
redistributed_clear++;
continue;
}
else {
uint64_t dup_bytes_approx = calc_deduped_bytes(head_object_size,
+ min_obj_size_for_dedup,
+ max_obj_size_for_split,
p_key->num_parts,
byte_size_approx);
p_big_objs->duplicate_count ++;
p_big_objs->dedup_bytes_estimate += dup_bytes_approx;
- if (!p_key->multipart_object()) {
+ // objects with size <= max_obj_size_for_split split their head
+ // and therefore don't duplicate it
+ if (!key.multipart_object() && byte_size_approx > max_obj_size_for_split) {
// single part objects duplicate the head object when dedup is used
*p_duplicate_head_bytes += head_object_size;
}
// replace value!
value_t new_val(block_id, rec_id, shared_manifest);
new_val.count = val.count;
- hash_tab[idx].val = new_val;
ldpp_dout(dpp, 20) << __func__ << "::Replaced table entry::["
<< val.block_idx << "/" << (int)val.rec_id << "] -> ["
<< block_id << "/" << (int)rec_id << "]" << dendl;
+
+ val = new_val;
}
}
//---------------------------------------------------------------------------
- int dedup_table_t::set_shared_manifest_src_mode(const key_t *p_key,
- disk_block_id_t block_id,
- record_id_t rec_id)
+ int dedup_table_t::set_src_mode(const key_t *p_key,
+ disk_block_id_t block_id,
+ record_id_t rec_id,
+ bool set_shared_manifest_src,
+ bool set_has_valid_hash_src)
{
uint32_t idx = find_entry(p_key);
value_t &val = hash_tab[idx].val;
if (val.is_occupied()) {
if (val.block_idx == block_id && val.rec_id == rec_id) {
- val.set_shared_manifest_src();
+ if (set_shared_manifest_src) {
+ val.set_shared_manifest_src();
+ }
+ if (set_has_valid_hash_src) {
+ val.set_has_valid_hash_src();
+ }
return 0;
}
}
uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units);
// skip small single part objects which we can't dedup
- if (!key.multipart_object() && (byte_size_approx <= head_object_size)) {
+ if (!dedupable_object(key.multipart_object(), min_obj_size_for_dedup, byte_size_approx)) {
if (hash_tab[tab_idx].val.is_singleton()) {
p_small_objs->singleton_count++;
}
static_assert(sizeof(key_t) == 24);
class dedup_table_t {
+ // 1-Byte packed flag-set for the in-memory dedup table values (value_t::flags).
+ // Unlike record_flags_t (on-disk records), this tracks table-entry state:
+ // occupancy plus whether the SRC entry has a valid hash / shared manifest.
+ struct __attribute__ ((packed)) table_flags_t {
+ private:
+ static constexpr uint8_t RGW_TABLE_FLAG_HAS_VALID_HASH = 0x01;
+ static constexpr uint8_t RGW_TABLE_FLAG_SHARED_MANIFEST = 0x02;
+ static constexpr uint8_t RGW_TABLE_FLAG_OCCUPIED = 0x04;
+ public:
+ table_flags_t() : flags(0) {}
+ table_flags_t(uint8_t _flags) : flags(_flags) {}
+ inline void clear() { this->flags = 0; }
+ inline bool has_valid_hash() const { return ((flags & RGW_TABLE_FLAG_HAS_VALID_HASH) != 0); }
+ inline void set_has_valid_hash() { flags |= RGW_TABLE_FLAG_HAS_VALID_HASH; }
+ inline bool has_shared_manifest() const { return ((flags & RGW_TABLE_FLAG_SHARED_MANIFEST) != 0); }
+ inline void set_shared_manifest() { flags |= RGW_TABLE_FLAG_SHARED_MANIFEST; }
+ inline bool is_occupied() const {return ((this->flags & RGW_TABLE_FLAG_OCCUPIED) != 0); }
+ inline void set_occupied() {this->flags |= RGW_TABLE_FLAG_OCCUPIED; }
+ inline void clear_occupied() { this->flags &= ~RGW_TABLE_FLAG_OCCUPIED; }
+ private:
+ uint8_t flags;
+ };
+
public:
// 8 Bytes Value
struct value_t {
inline void inc_count() { count ++; }
inline void reset_count() { count = 0; }
inline void clear_flags() { flags.clear(); }
+ inline bool has_valid_hash() const {return flags.has_valid_hash(); }
+ inline void set_has_valid_hash_src() { this->flags.set_has_valid_hash(); }
inline bool is_singleton() const { return (count == 1); }
inline bool is_occupied() const { return flags.is_occupied(); }
inline void set_occupied() { this->flags.set_occupied(); }
disk_block_id_t block_idx; // 32 bits
uint16_t count; // 16 bits
record_id_t rec_id; // 8 bits
- dedup_flags_t flags; // 8 bits
+ table_flags_t flags; // 8 bits
} __attribute__((__packed__));
static_assert(sizeof(value_t) == 8);
dedup_table_t(const DoutPrefixProvider* _dpp,
uint32_t _head_object_size,
+ uint32_t _min_obj_size_for_dedup,
+ uint32_t _max_obj_size_for_split,
uint8_t *p_slab,
uint64_t slab_size);
int add_entry(key_t *p_key,
disk_block_id_t block_id,
record_id_t rec_id);
+ int set_src_mode(const key_t *p_key,
+ disk_block_id_t block_id,
+ record_id_t rec_id,
+ bool set_shared_manifest_src,
+ bool set_has_valid_hash_src);
+
void count_duplicates(dedup_stats_t *p_small_objs_stat,
dedup_stats_t *p_big_objs_stat);
uint32_t values_count = 0;
uint32_t entries_count = 0;
uint32_t occupied_count = 0;
- uint32_t head_object_size = (4ULL * 1024 * 1024);
+ uint32_t head_object_size;
+ uint32_t min_obj_size_for_dedup;
+ uint32_t max_obj_size_for_split;
table_entry_t *hash_tab = nullptr;
// stat counters
#include "rgw_dedup_utils.h"
#include "common/ceph_crypto.h"
-
namespace rgw::dedup {
+
//---------------------------------------------------------------------------
std::ostream& operator<<(std::ostream &out, const dedup_req_type_t& dedup_type)
{
this->failed_rec_load += other.failed_rec_load;
this->failed_block_load += other.failed_block_load;
+ this->different_storage_class += other.different_storage_class;
+ this->invalid_hash_no_split_head += other.invalid_hash_no_split_head;
+ this->invalid_storage_class_mapping += other.invalid_storage_class_mapping;
+ this->singleton_after_purge += other.singleton_after_purge;
+ this->shared_manifest_after_purge += other.shared_manifest_after_purge;
+ this->split_head_no_tail_placement += other.split_head_no_tail_placement;
+ this->illegal_rec_id += other.illegal_rec_id;
+ this->missing_last_block_marker += other.missing_last_block_marker;
+
this->valid_hash_attrs += other.valid_hash_attrs;
this->invalid_hash_attrs += other.invalid_hash_attrs;
this->set_hash_attrs += other.set_hash_attrs;
this->skip_hash_cmp += other.skip_hash_cmp;
+ this->manifest_raw_obj += other.manifest_raw_obj;
+ this->manifest_no_tail_placement += other.manifest_no_tail_placement;
+ this->rollback_tail_obj += other.rollback_tail_obj;
+ this->failed_split_head_creat += other.failed_split_head_creat;
+ this->skip_shared_tail_objs += other.skip_shared_tail_objs;
+ this->split_head_src += other.split_head_src;
+ this->split_head_tgt += other.split_head_tgt;
+ this->split_head_dedup_bytes += other.split_head_dedup_bytes;
this->set_shared_manifest_src += other.set_shared_manifest_src;
this->loaded_objects += other.loaded_objects;
f->dump_unsigned("Set HASH", this->set_hash_attrs);
}
+ if (this->skip_shared_tail_objs) {
+ f->dump_unsigned("Skip Shared Tail Objs (server-side-copy)", this->skip_shared_tail_objs);
+ }
if (this->skip_hash_cmp) {
f->dump_unsigned("Can't run HASH compare", this->skip_hash_cmp);
}
+ if (this->manifest_raw_obj) {
+ f->dump_unsigned("Manifest has RAW OBJ", this->manifest_raw_obj);
+ }
+ if (this->manifest_no_tail_placement) {
+ f->dump_unsigned("Manifest has no tail placement", this->manifest_no_tail_placement);
+ }
+ if (this->rollback_tail_obj) {
+ f->dump_unsigned("Rollback tail obj", this->rollback_tail_obj);
+ }
+ if (this->split_head_src) {
+ f->dump_unsigned("Split-Head Src OBJ", this->split_head_src);
+ }
+ if (this->split_head_tgt) {
+ f->dump_unsigned("Split-Head Tgt OBJ", this->split_head_tgt);
+ }
+ if (this->split_head_dedup_bytes) {
+ f->dump_unsigned("Split-Head Dedup-Bytes", this->split_head_dedup_bytes);
+ }
}
{
if (this->failed_block_load) {
f->dump_unsigned("Failed Block-Load ", this->failed_block_load);
}
+
+ if (this->illegal_rec_id) {
+ f->dump_unsigned("Failed illegal_rec_id", this->illegal_rec_id );
+ }
+ if (this->missing_last_block_marker) {
+ f->dump_unsigned("Failed missing_last_block_marker in rec",
+ this->missing_last_block_marker);
+ }
+
+ if (this->failed_split_head_creat) {
+ f->dump_unsigned("Failed Split-Head Create (EEXIST)", this->failed_split_head_creat);
+ }
if (this->failed_dedup) {
f->dump_unsigned("Failed Dedup", this->failed_dedup);
}
if (this->size_mismatch) {
f->dump_unsigned("Size mismatch SRC/TGT", this->size_mismatch);
}
+ if (this->different_storage_class) {
+ f->dump_unsigned("different_storage_class",
+ this->different_storage_class);
+ }
+ if (this->invalid_hash_no_split_head) {
+ f->dump_unsigned("Failed rec has invalid hash w/o split-head ",
+ this->invalid_hash_no_split_head);
+ }
+ if (this->invalid_storage_class_mapping) {
+ f->dump_unsigned("Failed, invalid_storage_class_mapping",
+ this->invalid_storage_class_mapping);
+ }
+ if (this->singleton_after_purge) {
+ f->dump_unsigned("Failed, has singleton after purge",
+ this->singleton_after_purge);
+ }
+ if (this->shared_manifest_after_purge) {
+ f->dump_unsigned("Failed, has shared manifest after purge",
+ this->shared_manifest_after_purge);
+ }
+ if (this->split_head_no_tail_placement) {
+ f->dump_unsigned("No Tail Placement during Split-Head processing",
+ this->split_head_no_tail_placement);
+ }
}
}
encode(m.failed_rec_load, bl);
encode(m.failed_block_load, bl);
+ encode(m.different_storage_class, bl);
+ encode(m.invalid_hash_no_split_head, bl);
+ encode(m.invalid_storage_class_mapping, bl);
+ encode(m.singleton_after_purge, bl);
+ encode(m.shared_manifest_after_purge, bl);
+ encode(m.split_head_no_tail_placement, bl);
+ encode(m.illegal_rec_id, bl);
+ encode(m.missing_last_block_marker, bl);
+
encode(m.valid_hash_attrs, bl);
encode(m.invalid_hash_attrs, bl);
encode(m.set_hash_attrs, bl);
encode(m.skip_hash_cmp, bl);
+ encode(m.manifest_raw_obj, bl);
+ encode(m.manifest_no_tail_placement, bl);
+ encode(m.rollback_tail_obj, bl);
+ encode(m.failed_split_head_creat, bl);
+ encode(m.skip_shared_tail_objs, bl);
+ encode(m.split_head_src, bl);
+ encode(m.split_head_tgt, bl);
+ encode(m.split_head_dedup_bytes, bl);
encode(m.set_shared_manifest_src, bl);
encode(m.loaded_objects, bl);
decode(m.failed_rec_load, bl);
decode(m.failed_block_load, bl);
+ decode(m.different_storage_class, bl);
+ decode(m.invalid_hash_no_split_head, bl);
+ decode(m.invalid_storage_class_mapping, bl);
+ decode(m.singleton_after_purge, bl);
+ decode(m.shared_manifest_after_purge, bl);
+ decode(m.split_head_no_tail_placement, bl);
+ decode(m.illegal_rec_id, bl);
+ decode(m.missing_last_block_marker, bl);
+
decode(m.valid_hash_attrs, bl);
decode(m.invalid_hash_attrs, bl);
decode(m.set_hash_attrs, bl);
decode(m.skip_hash_cmp, bl);
+ decode(m.manifest_raw_obj, bl);
+ decode(m.manifest_no_tail_placement, bl);
+ decode(m.rollback_tail_obj, bl);
+ decode(m.failed_split_head_creat, bl);
+ decode(m.skip_shared_tail_objs, bl);
+ decode(m.split_head_src, bl);
+ decode(m.split_head_tgt, bl);
+ decode(m.split_head_dedup_bytes, bl);
decode(m.set_shared_manifest_src, bl);
decode(m.loaded_objects, bl);
#include "common/dout.h"
#define FULL_DEDUP_SUPPORT
+
namespace rgw::dedup {
using namespace std::chrono;
using work_shard_t = uint16_t;
};
std::ostream& operator<<(std::ostream &out, const dedup_req_type_t& dedup_type);
- struct __attribute__ ((packed)) dedup_flags_t {
- private:
- static constexpr uint8_t RGW_DEDUP_FLAG_HASH_CALCULATED = 0x01; // REC
- static constexpr uint8_t RGW_DEDUP_FLAG_SHARED_MANIFEST = 0x02; // REC + TAB
- static constexpr uint8_t RGW_DEDUP_FLAG_OCCUPIED = 0x04; // TAB
- static constexpr uint8_t RGW_DEDUP_FLAG_FASTLANE = 0x08; // REC
-
- public:
- dedup_flags_t() : flags(0) {}
- dedup_flags_t(uint8_t _flags) : flags(_flags) {}
- inline void clear() { this->flags = 0; }
- inline bool hash_calculated() const { return ((flags & RGW_DEDUP_FLAG_HASH_CALCULATED) != 0); }
- inline void set_hash_calculated() { flags |= RGW_DEDUP_FLAG_HASH_CALCULATED; }
- inline bool has_shared_manifest() const { return ((flags & RGW_DEDUP_FLAG_SHARED_MANIFEST) != 0); }
- inline void set_shared_manifest() { flags |= RGW_DEDUP_FLAG_SHARED_MANIFEST; }
- inline bool is_occupied() const {return ((this->flags & RGW_DEDUP_FLAG_OCCUPIED) != 0); }
- inline void set_occupied() {this->flags |= RGW_DEDUP_FLAG_OCCUPIED; }
- inline void clear_occupied() { this->flags &= ~RGW_DEDUP_FLAG_OCCUPIED; }
- inline bool is_fastlane() const { return ((flags & RGW_DEDUP_FLAG_FASTLANE) != 0); }
- inline void set_fastlane() { flags |= RGW_DEDUP_FLAG_FASTLANE; }
- private:
- uint8_t flags;
- };
class alignas(8) Throttle {
friend void validate_max_calls_offset();
uint64_t failed_rec_load = 0;
uint64_t failed_block_load = 0;
+ uint64_t different_storage_class = 0;
+ uint64_t invalid_hash_no_split_head = 0;
+ uint64_t invalid_storage_class_mapping = 0;
+ uint64_t singleton_after_purge = 0;
+ uint64_t shared_manifest_after_purge = 0;
+ uint64_t split_head_no_tail_placement = 0;
+ uint64_t illegal_rec_id = 0;
+ uint64_t missing_last_block_marker = 0;
+
uint64_t valid_hash_attrs = 0;
uint64_t invalid_hash_attrs = 0;
uint64_t set_hash_attrs = 0;
+ uint64_t skip_shared_tail_objs = 0;
uint64_t skip_hash_cmp = 0;
-
+ uint64_t manifest_raw_obj = 0;
+ uint64_t manifest_no_tail_placement = 0;
+ uint64_t rollback_tail_obj = 0;
+ uint64_t failed_split_head_creat = 0;
+ uint64_t split_head_src = 0;
+ uint64_t split_head_tgt = 0;
+ uint64_t split_head_dedup_bytes = 0;
uint64_t set_shared_manifest_src = 0;
uint64_t loaded_objects = 0;
uint64_t processed_objects = 0;
const DoutPrefixProvider* dpp);
//---------------------------------------------------------------------------
- static inline uint64_t calc_deduped_bytes(uint64_t head_obj_size,
+ // Can this object benefit from dedup?
+ // multipart_object - true when the object was uploaded via multipart
+ // min_obj_size_for_dedup - configured minimum size for deduping single-part objects
+ // object_byte_size - the object size in bytes (may be a 4KB-resolution approximation)
+ static inline bool dedupable_object(bool multipart_object,
+ uint64_t min_obj_size_for_dedup,
+ uint64_t object_byte_size)
+ {
+ // all multipart objects are dedupable because the head-object is empty
+ // otherwise make sure object_byte_size is large enough
+ return (multipart_object || object_byte_size >= min_obj_size_for_dedup);
+ }
+
+ //---------------------------------------------------------------------------
+ static inline uint64_t calc_deduped_bytes(uint32_t head_obj_size,
+ uint32_t min_obj_size_for_dedup,
+ uint32_t max_obj_size_for_split,
uint16_t num_parts,
uint64_t size_bytes)
{
}
else {
// reduce the head size
- if (size_bytes > head_obj_size) {
+ if (size_bytes > max_obj_size_for_split) {
return size_bytes - head_obj_size;
}
+ else if (size_bytes >= min_obj_size_for_dedup) {
+ // Head is split into an empty obj and a new tail, enabling a full dedup
+ return size_bytes;
+ }
else {
return 0;
}
void get_implicit_location(uint64_t cur_part_id, uint64_t cur_stripe, uint64_t ofs,
std::string *override_prefix, rgw_obj_select *location) const;
+ void clear_rules() {
+ rules.clear();
+ }
+
void set_trivial_rule(uint64_t tail_ofs, uint64_t stripe_max_size) {
RGWObjManifestRule rule(0, tail_ofs, 0, stripe_max_size);
rules[0] = rule;
return max_head_size;
}
+ void set_max_head_size(uint64_t _max_head_size) {
+ max_head_size = _max_head_size;
+ }
+
const std::string& get_tier_type() {
return tier_type;
}
#define RGW_ATTR_CORS RGW_ATTR_PREFIX "cors"
#define RGW_ATTR_ETAG RGW_ATTR_PREFIX "etag"
#define RGW_ATTR_CKSUM RGW_ATTR_PREFIX "cksum"
-#define RGW_ATTR_SHA256 RGW_ATTR_PREFIX "x-amz-content-sha256"
#define RGW_ATTR_BLAKE3 RGW_ATTR_PREFIX "blake3"
#define RGW_ATTR_BUCKETS RGW_ATTR_PREFIX "buckets"
#define RGW_ATTR_META_PREFIX RGW_ATTR_PREFIX RGW_AMZ_META_PREFIX
log.debug("gen_connections_multi: All connection and buckets are set")
return (tenants, bucket_names, conns)
+#-------------------------------------------------------------------------------
+def create_buckets(conn, max_copies_count):
+ # Create max_copies_count buckets with freshly generated unique names on
+ # the given S3 connection; return the list of created bucket names.
+ bucket_names=[]
+ for i in range(0, max_copies_count):
+ bucket_name=gen_bucket_name()
+ bucket_names.append(bucket_name)
+ log.debug("conn.create_bucket(Bucket=%s)", bucket_name)
+ conn.create_bucket(Bucket=bucket_name)
+
+ return bucket_names
+
#####################
# dedup tests
KB=(1024)
MB=(1024*KB)
POTENTIAL_OBJ_SIZE=(64*KB)
+DEDUP_MIN_OBJ_SIZE=(64*KB)
+SPLIT_HEAD_SIZE=(16*MB)
RADOS_OBJ_SIZE=(4*MB)
-MULTIPART_SIZE=(16*MB)
+# The default multipart threshold size for S3cmd is 15 MB.
+MULTIPART_SIZE=(15*MB)
default_config = TransferConfig(multipart_threshold=MULTIPART_SIZE, multipart_chunksize=MULTIPART_SIZE)
ETAG_ATTR="user.rgw.etag"
POOLNAME="default.rgw.buckets.data"
#-------------------------------------------------------------------------------
def count_objects_in_bucket(bucket_name, conn):
max_keys=1000
- marker=""
+ continuation_token = None
obj_count=0
while True:
log.debug("bucket_name=%s", bucket_name)
- listing=conn.list_objects(Bucket=bucket_name, Marker=marker, MaxKeys=max_keys)
+ list_args = {
+ 'Bucket': bucket_name,
+ 'MaxKeys': max_keys
+ }
+ if continuation_token:
+ list_args['ContinuationToken'] = continuation_token
+
+ listing=conn.list_objects_v2(**list_args)
if 'Contents' not in listing or len(listing['Contents'])== 0:
return 0
obj_count += len(listing['Contents'])
- if listing['IsTruncated']:
- marker=listing['NextMarker']
- log.debug("marker=%s, obj_count=%d", marker, obj_count)
- continue
+ if 'NextContinuationToken' in listing:
+ continuation_token = listing['NextContinuationToken']
+ log.debug("count_objects_in_bucket: Token=%s, count=%d",
+ continuation_token, obj_count)
else:
return obj_count
#-------------------------------------------------------------------------------
-def count_object_parts_in_all_buckets(verbose=False):
+def copy_obj(base_bucket_name, base_key, bucket_name, key):
+ s3_prefix="s3://"
+ src = s3_prefix + base_bucket_name + "/" + base_key
+ dest = s3_prefix + bucket_name + "/" + key
+ result = bash(['s3cmd', 'cp', src, dest])
+ assert result[1] == 0
+
+#-------------------------------------------------------------------------------
+def count_object_parts_in_all_buckets(verbose=False, expected_size=0):
result = rados(['lspools'])
assert result[1] == 0
found=False
result = rados(['ls', '-p ', POOLNAME])
assert result[1] == 0
-
names=result[0].split()
- count = 0
- for name in names:
- #log.debug(name)
- count = count + 1
+ rados_count = len(names)
+ if (rados_count > 1000):
+ ### we can only do about 10 stat calls per second!!
+ ### TBD: add obj_size to ls output to allow more efficient size check
+ log.info(">>> rados obj_count(%d) is too high -> skip stat check\n",
+ len(names))
+ expected_size = 0
+
+ byte_size_total = 0
+ ondisk_size_total = 0
+ start_time = time.perf_counter()
+ for rados_name in names:
+ if verbose:
+ log.debug(rados_name)
+ if expected_size:
+ result = rados(['-p ', POOLNAME, 'stat', rados_name])
+ assert result[1] == 0
+ stat = result[0].split()
+ byte_size=int(stat[-1])
+ byte_size_total += byte_size
+ ondisk_size_total += calc_on_disk_byte_size(byte_size)
+
+ if expected_size:
+ end_time = time.perf_counter()
+ time_elapsed = end_time - start_time
+ log.info("rados_count=%d, ondisk_size_total=%d, expected_size=%d, time=%d(sec)",
+ rados_count, ondisk_size_total, expected_size, time_elapsed)
+ assert ondisk_size_total == expected_size
if verbose:
- log.debug("Pool has %d rados objects", count)
+ log.debug("Pool has %d rados objects", rados_count)
- return count
+ return rados_count
#-------------------------------------------------------------------------------
return False
+#-------------------------------------------------------------------------------
+def check_delete_objects_response(response):
+ # Check for delete failures
+ if 'Errors' in response and response['Errors']:
+ log.error("Delete failures detected:")
+ for error in response['Errors']:
+ log.error("delete_objects::ERROR::Key=%s, Code=%s, Message=%s",
+ error['Key'], error['Code'], error['Message'])
+
+ else:
+ log.debug("All objects deleted successfully.")
+
+
+#-------------------------------------------------------------------------------
+def delete_objects(conn, bucket_name, object_keys):
+ response=conn.delete_objects(Bucket=bucket_name,
+ Delete={"Objects": [{"Key": key} for key in object_keys]})
+
+ # Check for delete failures
+ check_delete_objects_response(response)
+
+
#-------------------------------------------------------------------------------
def delete_bucket_with_all_objects(bucket_name, conn):
max_keys=1000
- marker=""
+ continuation_token = None
obj_count=0
while True:
- listing=conn.list_objects(Bucket=bucket_name, Marker=marker, MaxKeys=max_keys)
+ list_args = {
+ 'Bucket': bucket_name,
+ 'MaxKeys': max_keys
+ }
+ if continuation_token:
+ list_args['ContinuationToken'] = continuation_token
+
+ listing=conn.list_objects_v2(**list_args)
if 'Contents' not in listing or len(listing['Contents'])== 0:
log.debug("Bucket '%s' is empty, skipping...", bucket_name)
return
objects=[]
for obj in listing['Contents']:
- log.debug(obj['Key'])
+ log.debug("delete_bucket_with_all_objects: add obj: %s", obj['Key'])
objects.append({'Key': obj['Key']})
obj_count += len(objects)
# delete objects from the bucket
- conn.delete_objects(Bucket=bucket_name, Delete={'Objects':objects})
- if listing['IsTruncated']:
- marker=listing['NextMarker']
- log.debug("marker=%s, obj_count=%d", marker, obj_count)
- continue
+ log.debug("delete_bucket_with_all_objects: delete %d objs", obj_count)
+ response=conn.delete_objects(Bucket=bucket_name, Delete={'Objects':objects})
+ check_delete_objects_response(response)
+
+ if 'NextContinuationToken' in listing:
+ continuation_token = listing['NextContinuationToken']
+ log.debug("delete_bucket_with_all_objects: Token=%s, count=%d",
+ continuation_token, obj_count)
else:
break
def verify_pool_is_empty():
result = admin(['gc', 'process', '--include-all'])
assert result[1] == 0
- assert count_object_parts_in_all_buckets() == 0
+ assert count_object_parts_in_all_buckets(False, 0) == 0
#-------------------------------------------------------------------------------
return rados_obj_count
+BLOCK_SIZE=4096
+#-------------------------------------------------------------------------------
+def calc_on_disk_byte_size(byte_size):
+ return (((byte_size+BLOCK_SIZE-1)//BLOCK_SIZE)*BLOCK_SIZE)
+
+
+#-------------------------------------------------------------------------------
+def calc_head_size(obj_size, config):
+ on_disk_byte_size = calc_on_disk_byte_size(obj_size)
+ threshold = config.multipart_threshold
+ # Objects with size bigger than MULTIPART_SIZE are uploaded as multi-part
+ # multi-part objects get a zero-size head object
+ if obj_size >= threshold:
+ head_size = 0
+ else:
+ head_size = min(RADOS_OBJ_SIZE, on_disk_byte_size)
+
+ return head_size
+
+
#-------------------------------------------------------------------------------
def calc_dedupable_space(obj_size, config):
+ on_disk_byte_size = calc_on_disk_byte_size(obj_size)
+
threshold = config.multipart_threshold
# Objects with size bigger than MULTIPART_SIZE are uploaded as multi-part
# multi-part objects got a zero size Head objects
if obj_size >= threshold:
- dedupable_space = obj_size
- elif obj_size > RADOS_OBJ_SIZE:
- dedupable_space = obj_size - RADOS_OBJ_SIZE
+ dedupable_space = on_disk_byte_size
+ elif obj_size > SPLIT_HEAD_SIZE:
+ dedupable_space = on_disk_byte_size - RADOS_OBJ_SIZE
+ elif obj_size >= DEDUP_MIN_OBJ_SIZE:
+ dedupable_space = on_disk_byte_size
else:
dedupable_space = 0
float(obj_size)/MB, float(dedupable_space)/MB)
return dedupable_space
-BLOCK_SIZE=4096
#-------------------------------------------------------------------------------
-def calc_on_disk_byte_size(byte_size):
- return (((byte_size+BLOCK_SIZE-1)//BLOCK_SIZE)*BLOCK_SIZE)
+def calc_split_objs_count(obj_size, num_copies, config):
+ threshold = config.multipart_threshold
+ on_disk_byte_size = calc_on_disk_byte_size(obj_size)
+
+ if num_copies < 2 or on_disk_byte_size > SPLIT_HEAD_SIZE or obj_size >= threshold:
+ return 0
+
+ if on_disk_byte_size < DEDUP_MIN_OBJ_SIZE:
+ return 0
+
+ return 1
#-------------------------------------------------------------------------------
threshold = config.multipart_threshold
dedup_stats.skip_shared_manifest = 0
dedup_stats.size_before_dedup += (on_disk_byte_size * num_copies)
- if on_disk_byte_size <= RADOS_OBJ_SIZE and threshold > RADOS_OBJ_SIZE:
+ if on_disk_byte_size < DEDUP_MIN_OBJ_SIZE and threshold > DEDUP_MIN_OBJ_SIZE:
dedup_stats.skip_too_small += num_copies
dedup_stats.skip_too_small_bytes += (on_disk_byte_size * num_copies)
return
dedup_stats.total_processed_objects += num_copies
- #dedup_stats.loaded_objects += num_copies
-
if num_copies == 1:
dedup_stats.singleton_obj += 1
dedup_stats.skip_singleton += 1
#-------------------------------------------------------------------------------
-def upload_objects(bucket_name, files, indices, conn, config, check_obj_count=True):
+def print_files(files, config):
+ for idx, f in enumerate(files):
+ filename=f[0]
+ obj_size=f[1]
+ num_copies=f[2]
+ assert(obj_size)
+ split_head = calc_split_objs_count(obj_size, num_copies, config)
+ log.info("[%d]%s::size=%d, num_copies=%d, split_head=%d",
+ idx, filename, obj_size, num_copies, split_head);
+
+
+#-------------------------------------------------------------------------------
+def upload_objects(bucket_name, files, indices, conn, config, check_obj_count):
dedup_stats = Dedup_Stats()
total_space=0
duplicated_space=0
duplicated_tail_objs=0
rados_objects_total=0
s3_objects_total=0
+ split_head_objs=0
for (f, idx) in zip(files, indices):
filename=f[0]
obj_size=f[1]
num_copies=f[2]
assert(obj_size)
+ split_head_objs += calc_split_objs_count(obj_size, num_copies, config)
calc_expected_stats(dedup_stats, obj_size, num_copies, config)
- total_space += (obj_size * num_copies)
+ total_space += (calc_on_disk_byte_size(obj_size) * num_copies)
dedupable_space=calc_dedupable_space(obj_size, config)
duplicated_space += ((num_copies-1) * dedupable_space)
rados_obj_count=calc_rados_obj_count(num_copies, obj_size, config)
s3_objects_total += num_copies
if s3_objects_total and (s3_objects_total % 1000 == 0):
log.debug("%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
- s3_objects_total, rados_objects_total, total_space/MB)
+ s3_objects_total, rados_objects_total, total_space/MB)
for i in range(idx, num_copies):
key = gen_object_name(filename, i)
- #log.debug("upload_file %s/%s with crc32", bucket_name, key)
conn.upload_file(OUT_DIR + filename, bucket_name, key, Config=config)
log.debug("==========================================")
log.debug("Based on calculation we should have %d duplicated tail objs", duplicated_tail_objs)
log.debug("Based on calculation we should have %.2f MiB total in pool", total_space/MB)
log.debug("Based on calculation we should have %.2f MiB duplicated space in pool", duplicated_space/MB)
-
- expected_rados_obj_count_post_dedup=(rados_objects_total-duplicated_tail_objs)
+ log.info("split_head_objs=%d, rados_objects_total=%d, duplicated_tail_objs=%d",
+ split_head_objs, rados_objects_total, duplicated_tail_objs)
+ expected_rados_obj_count_post_dedup=(split_head_objs+rados_objects_total-duplicated_tail_objs)
log.debug("Post dedup expcted rados obj count = %d", expected_rados_obj_count_post_dedup)
expcted_space_post_dedup=(total_space-duplicated_space)
log.debug("Post dedup expcted data in pool = %.2f MiB", expcted_space_post_dedup/MB)
if check_obj_count:
- assert rados_objects_total == count_object_parts_in_all_buckets()
+ log.debug("upload_objects: verify current Rados state - total_space=%d", total_space)
+ # assert rados_objects_total == count_object_parts_in_all_buckets(True, dedup_stats.size_before_dedup)
+ # skip size check as it is time consuming
+ assert rados_objects_total == count_object_parts_in_all_buckets(False, 0)
+
+ return (expected_rados_obj_count_post_dedup, dedup_stats, s3_objects_total)
+
+
+#-------------------------------------------------------------------------------
+def upload_objects_with_copy(files, conn, bucket_names, indices, config):
+ dedup_stats = Dedup_Stats()
+ total_space=0
+ rados_objects_total=0
+ s3_objects_total=0
+
+ for (f, idx) in zip(files, indices):
+ filename=f[0]
+ obj_size=f[1]
+ num_copies=f[2]
+ assert(obj_size)
+ head_size = calc_head_size(obj_size, config)
+ obj_size_on_disk=calc_on_disk_byte_size(obj_size)
+ log.debug("upload_objects_with_copy:obj_size=%d, on_disk_size=%d, head_size=%d",
+ obj_size, obj_size_on_disk, head_size);
+ total_space += (obj_size_on_disk + (num_copies-1)*head_size)
+ rados_obj_count=calc_rados_obj_count(num_copies, obj_size, config)
+ tail_objs_count =rados_obj_count-1
+ rados_objects_total += (tail_objs_count + num_copies)
+ log.debug("upload_objects_with_copy::%s::size=%d, num_copies=%d", filename, obj_size, num_copies);
+ s3_objects_total += num_copies
+ if s3_objects_total and (s3_objects_total % 1000 == 0):
+ log.debug("%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
+ s3_objects_total, rados_objects_total, total_space/MB)
+
+ base_obj=dict()
+ for i in range(idx, num_copies):
+ key = gen_object_name(filename, i)
+ bucket_name=bucket_names[i]
+ if i == 0:
+ base_obj = {'Bucket': bucket_name, 'Key': key}
+ #log.info("upload_file: %s -> %s/%s", filename, bucket_name, key)
+ conn.upload_file(OUT_DIR + filename, bucket_name, key, Config=config)
+ else:
+ log.debug("copy_obj: %s/%s -> %s/%s",
+ base_obj['Bucket'], base_obj['Key'], bucket_name, key)
+ conn.copy_object(CopySource=base_obj, Bucket=bucket_name, Key=key)
+
+
+ dedup_stats.deduped_obj = 0
+ dedup_stats.size_before_dedup = total_space
+ # No change should happen since tail-objects are already de-duplicated
+ dedup_stats.dedup_bytes_estimate = 0
+ expected_rados_obj_count_post_dedup=rados_objects_total
+
+ log.info("upload_objects_with_copy: verify current Rados state - total_space=%d", total_space)
+ assert rados_objects_total == count_object_parts_in_all_buckets(False, total_space)
- expected_results=(expected_rados_obj_count_post_dedup, expcted_space_post_dedup)
return (expected_rados_obj_count_post_dedup, dedup_stats, s3_objects_total)
duplicated_tail_objs=0
rados_objects_total=0
s3_objects_total=0
+ split_head_objs=0
+
for (f, idx) in zip(files, indices):
filename=f[0]
obj_size=f[1]
num_copies=f[2]
assert(obj_size)
+ split_head_objs += calc_split_objs_count(obj_size, num_copies, config)
calc_expected_stats(dedup_stats, obj_size, num_copies, config)
- total_space += (obj_size * num_copies)
+ total_space += (calc_on_disk_byte_size(obj_size) * num_copies)
dedupable_space=calc_dedupable_space(obj_size, config)
duplicated_space += ((num_copies-1) * dedupable_space)
rados_obj_count=calc_rados_obj_count(num_copies, obj_size, config)
s3_objects_total += num_copies
if s3_objects_total and (s3_objects_total % 1000 == 0):
log.debug("%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
- s3_objects_total, rados_objects_total, total_space/MB)
+ s3_objects_total, rados_objects_total, total_space/MB)
for i in range(idx, num_copies):
ten_id = i % max_tenants
key = gen_object_name(filename, i)
log.debug("upload_objects::<%s/%s>", bucket_names[ten_id], key)
log.debug("==========================================")
- log.debug("Summary:%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
- s3_objects_total, rados_objects_total, total_space/MB)
+ log.debug("Summary:%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
+ s3_objects_total, rados_objects_total, total_space/MB)
log.debug("Based on calculation we should have %d rados objects", rados_objects_total)
log.debug("Based on calculation we should have %d duplicated tail objs", duplicated_tail_objs)
log.debug("Based on calculation we should have %.2f MiB total in pool", total_space/MB)
s3_object_count += count_objects_in_bucket(bucket_name, conn)
log.debug("bucket listings reported a total of %d s3 objects", s3_object_count)
- expected_rados_obj_count_post_dedup=(rados_objects_total-duplicated_tail_objs)
+ expected_rados_obj_count_post_dedup=(split_head_objs+rados_objects_total-duplicated_tail_objs)
log.debug("Post dedup expcted rados obj count = %d", expected_rados_obj_count_post_dedup)
expcted_space_post_dedup=(total_space-duplicated_space)
log.debug("Post dedup expcted data in pool = %.2f MiB", expcted_space_post_dedup/MB)
if check_obj_count:
- assert rados_objects_total == count_object_parts_in_all_buckets()
+ log.debug("upload_objects_multi: verify current Rados state (obj/size)")
+ #assert rados_objects_total == count_object_parts_in_all_buckets(True, dedup_stats.size_before_dedup)
+ assert rados_objects_total == count_object_parts_in_all_buckets(False, 0)
assert (s3_object_count == s3_objects_total)
- expected_results=(expected_rados_obj_count_post_dedup, expcted_space_post_dedup)
return (expected_rados_obj_count_post_dedup, dedup_stats, s3_objects_total)
duplicated_tail_objs=0
rados_objects_total=0
s3_objects_total=0
+ split_head_objs=0
+
for (f, idx) in zip(files, indices):
filename=f[0]
obj_size=f[1]
num_copies=f[2]
assert(obj_size)
+ split_head_objs += calc_split_objs_count(obj_size, num_copies, config)
calc_expected_stats(dedup_stats, obj_size, num_copies, config)
- total_space += (obj_size * num_copies)
+ total_space += (calc_on_disk_byte_size(obj_size) * num_copies)
dedupable_space=calc_dedupable_space(obj_size, config)
duplicated_space += ((num_copies-1) * dedupable_space)
rados_obj_count=calc_rados_obj_count(num_copies, obj_size, config)
proc_list[idx].join()
log.debug("==========================================")
- log.debug("Summary:%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
- s3_objects_total, rados_objects_total, total_space/MB)
+ log.debug("Summary:%d S3 objects were uploaded (%d rados objects), total size = %.2f MiB",
+ s3_objects_total, rados_objects_total, total_space/MB)
log.debug("Based on calculation we should have %d rados objects", rados_objects_total)
log.debug("Based on calculation we should have %d duplicated tail objs", duplicated_tail_objs)
log.debug("Based on calculation we should have %.2f MiB total in pool", total_space/MB)
s3_object_count += count_objects_in_bucket(bucket_name, conn)
log.debug("bucket listings reported a total of %d s3 objects", s3_object_count)
- expected_rados_obj_count_post_dedup=(rados_objects_total-duplicated_tail_objs)
+ expected_rados_obj_count_post_dedup=(split_head_objs+rados_objects_total-duplicated_tail_objs)
log.debug("Post dedup expcted rados obj count = %d", expected_rados_obj_count_post_dedup)
expcted_space_post_dedup=(total_space-duplicated_space)
log.debug("Post dedup expcted data in pool = %.2f MiB", expcted_space_post_dedup/MB)
if check_obj_count:
- assert rados_objects_total == count_object_parts_in_all_buckets()
+ log.debug("procs_upload_objects:: count_object_parts_in_all_buckets()")
+ #assert rados_objects_total == count_object_parts_in_all_buckets(True, dedup_stats.size_before_dedup)
+ assert rados_objects_total == count_object_parts_in_all_buckets(False, 0)
assert (s3_object_count == s3_objects_total)
- expected_results=(expected_rados_obj_count_post_dedup, expcted_space_post_dedup)
return (expected_rados_obj_count_post_dedup, dedup_stats, s3_objects_total)
+#-------------------------------------------------------------------------------
+def check_if_any_obj_exists(bucket_name, delete_list, conn):
+ delete_set = set(delete_list)
+ max_keys=1000
+ continuation_token = None
+
+ while True:
+ list_args = {
+ 'Bucket': bucket_name,
+ 'MaxKeys': max_keys
+ }
+ if continuation_token:
+ list_args['ContinuationToken'] = continuation_token
+
+ listing=conn.list_objects_v2(**list_args)
+ if 'Contents' in listing:
+ for obj in listing['Contents']:
+ key=obj['Key']
+ log.debug("check_if_any_obj_exists: key=%s", key)
+ if obj['Key'] in delete_set:
+ log.info("key <%s> was found in bucket", key)
+
+ if 'NextContinuationToken' in listing:
+ continuation_token = listing['NextContinuationToken']
+ log.debug("check_if_any_obj_exists: Token=%s", continuation_token)
+ else:
+ break
+
+
+#-------------------------------------------------------------------------------
+def delete_objects_multi(conns, bucket_names, ten_id, object_keys):
+ conn = conns[ten_id]
+ bucket_name = bucket_names[ten_id]
+ delete_objects(conn, bucket_name, object_keys)
#-------------------------------------------------------------------------------
-def verify_objects(bucket_name, files, conn, expected_results, config):
+def delete_dup_objects_multi(files, conns, bucket_names):
+ max_tenants=len(conns)
+ tenants_obj_lists = [[] for _ in range(max_tenants)]
+
+ for f in files:
+ filename=f[0]
+ num_copies=f[2]
+ if num_copies > 1:
+ start_idx=1
+ for i in range(start_idx, num_copies):
+ key = gen_object_name(filename, i)
+ log.debug("delete_dup_objects_multi: delete key::%s::", key);
+ ten_id = i % max_tenants
+ object_keys = tenants_obj_lists[ten_id]
+ object_keys.append(key)
+ # flush delete request after every 500 objects
+ if len(object_keys) >= 500:
+ delete_objects_multi(conns, bucket_names, ten_id, object_keys)
+ object_keys.clear()
+
+ # remove leftover objects
+ for ten_id in range(max_tenants):
+ object_keys = tenants_obj_lists[ten_id]
+ if len(object_keys):
+ delete_objects_multi(conns, bucket_names, ten_id, object_keys)
+
+ # must call garbage collection for predictable count
+ result = admin(['gc', 'process', '--include-all'])
+ assert result[1] == 0
+
+
+#-------------------------------------------------------------------------------
+def delete_dup_objects(bucket_name, files, conn):
+ delete_list_total=[]
+ object_keys=[]
+
+ for f in files:
+ filename=f[0]
+ num_copies=f[2]
+ if num_copies > 1:
+ start_idx=1
+ for i in range(start_idx, num_copies):
+ key = gen_object_name(filename, i)
+ log.debug("delete key::%s::", key);
+ delete_list_total.append(key)
+ object_keys.append(key)
+
+ # flush delete request after every 500 objects
+ if len(object_keys) >= 500:
+ delete_objects(conn, bucket_name, object_keys)
+ object_keys.clear()
+
+
+ # remove leftover objects
+ if len(object_keys):
+ delete_objects(conn, bucket_name, object_keys)
+
+ verify=True
+ if verify:
+ log.debug("delete_dup_objects: verify delete_list_total")
+ check_if_any_obj_exists(bucket_name, delete_list_total, conn)
+
+ # must call garbage collection for predictable count
+ result = admin(['gc', 'process', '--include-all'])
+ assert result[1] == 0
+
+
+#-------------------------------------------------------------------------------
+def verify_objects(bucket_name, files, conn, expected_results, config, delete_dups):
+ if expected_results:
+ assert expected_results == count_object_parts_in_all_buckets(True)
+
tmpfile = OUT_DIR + "temp"
for f in files:
filename=f[0]
obj_size=f[1]
num_copies=f[2]
log.debug("comparing file=%s, size=%d, copies=%d", filename, obj_size, num_copies)
- for i in range(0, num_copies):
+
+ ### first verify duplicates at index 1 and higher
+ for i in range(1, num_copies):
filecmp.clear_cache()
key = gen_object_name(filename, i)
conn.download_file(bucket_name, key, tmpfile, Config=config)
assert equal ,"Files %s and %s differ!!" % (key, tmpfile)
os.remove(tmpfile)
- log.debug("verify_objects: finished reading all objects")
- assert expected_results == count_object_parts_in_all_buckets(True)
+ ### Then delete all duplicates
+ if delete_dups:
+ delete_dup_objects(bucket_name, files, conn)
+
+ ### Last, verify the object at index zero making sure refcount works
+ filecmp.clear_cache()
+ i = 0
+ for f in files:
+ filename=f[0]
+ key = gen_object_name(filename, i)
+ conn.download_file(bucket_name, key, tmpfile, Config=config)
+ equal = filecmp.cmp(tmpfile, OUT_DIR + filename, shallow=False)
+ assert equal ,"Files %s and %s differ!!" % (key, tmpfile)
+ os.remove(tmpfile)
+
log.debug("verify_objects::completed successfully!!")
#-------------------------------------------------------------------------------
-def verify_objects_multi(files, conns, bucket_names, expected_results, config):
+def verify_objects_multi(files, conns, bucket_names, expected_results, config, delete_dups):
+ if expected_results:
+ assert expected_results == count_object_parts_in_all_buckets(True)
+
max_tenants=len(conns)
tmpfile = OUT_DIR + "temp"
for f in files:
obj_size=f[1]
num_copies=f[2]
log.debug("comparing file=%s, size=%d, copies=%d", filename, obj_size, num_copies)
- for i in range(0, num_copies):
+ ### first verify duplicates at index 1 and higher
+ for i in range(1, num_copies):
filecmp.clear_cache()
key = gen_object_name(filename, i)
log.debug("comparing object %s with file %s", key, filename)
ten_id = i % max_tenants
- conns[ten_id].download_file(bucket_names[ten_id], key, tmpfile, Config=config)
+ conns[ten_id].download_file(bucket_names[ten_id], key, tmpfile,
+ Config=config)
equal = filecmp.cmp(tmpfile, OUT_DIR + filename, shallow=False)
assert equal ,"Files %s and %s differ!!" % (key, tmpfile)
os.remove(tmpfile)
- assert expected_results == count_object_parts_in_all_buckets(True)
- log.debug("verify_objects::completed successfully!!")
+ ### Then delete all duplicates
+ if delete_dups:
+ delete_dup_objects_multi(files, conns, bucket_names)
+
+ ### Last, verify the object at index zero making sure refcount works
+ filecmp.clear_cache()
+ i = 0
+ for f in files:
+ filename=f[0]
+ key = gen_object_name(filename, i)
+ log.debug("comparing object %s with file %s", key, filename)
+ ten_id = i % max_tenants
+ conns[ten_id].download_file(bucket_names[ten_id], key, tmpfile,
+ Config=config)
+ equal = filecmp.cmp(tmpfile, OUT_DIR + filename, shallow=False)
+ assert equal ,"Files %s and %s differ!!" % (key, tmpfile)
+ os.remove(tmpfile)
+
+ log.debug("verify_objects_multi::completed successfully!!")
#-------------------------------------------------------------------------------
thread_list[idx].join()
assert expected_results == count_object_parts_in_all_buckets(True)
- log.debug("verify_objects::completed successfully!!")
+ log.debug("threads_verify_objects::completed successfully!!")
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
def print_dedup_stats(dedup_stats):
+ log.info("===============================================")
+
for key in dedup_stats.__dict__:
- log.warning("dedup_stats[%s] = %d", key, dedup_stats.__dict__[key])
+ log.info("dedup_stats[%s] = %d", key, dedup_stats.__dict__[key])
+ log.info("===============================================")
#-------------------------------------------------------------------------------
def print_dedup_stats_diff(actual, expected):
else:
ratio = 0
+
+ log.debug("skip_too_small_bytes = %d", expected_dedup_stats.skip_too_small_bytes)
+ if expected_dedup_stats.non_default_storage_class_objs_bytes:
+ log.debug("non_default_storage_class_objs_bytes= %d",
+ expected_dedup_stats.non_default_storage_class_objs_bytes)
+
log.debug("s3_bytes_before = %d/%d", s3_bytes_before, dedup_ratio.s3_bytes_before)
- log.debug("s3_dedup_bytes = %d", expected_dedup_stats.dedup_bytes_estimate);
+ log.debug("s3_dedup_bytes = %d", s3_dedup_bytes);
log.debug("s3_bytes_after = %d/%d", s3_bytes_after, dedup_ratio.s3_bytes_after)
log.debug("ratio = %f/%f", ratio, dedup_ratio.ratio)
set_bucket_index_throttling(limit)
#-------------------------------------------------------------------------------
-def exec_dedup(expected_dedup_stats, dry_run, verify_stats=True):
+def exec_dedup(expected_dedup_stats, dry_run, verify_stats=True, post_dedup_size=0):
# dedup should complete in less than 5 minutes
max_dedup_time = 5*60
if expected_dedup_stats.deduped_obj > 10000:
dedup_stats = ret[1]
dedup_ratio_estimate = ret[2]
dedup_ratio_actual = ret[3]
+ log.debug("exec_dedup: verify_dedup_ratio")
+ verify_dedup_ratio(expected_dedup_stats, dedup_ratio_estimate)
+ if post_dedup_size == 0:
+ post_dedup_size = dedup_ratio_estimate.s3_bytes_after
+
+ # no need to check after dry-run which doesn't change anything
+ if dry_run:
+ post_dedup_size = 0
+ count_object_parts_in_all_buckets(True, post_dedup_size)
if verify_stats == False:
return ret
log.debug("potential_unique_obj= %d / %d ", dedup_stats.potential_unique_obj,
expected_dedup_stats.potential_unique_obj)
+
#dedup_stats.set_hash = dedup_stats.invalid_hash
if dedup_stats != expected_dedup_stats:
log.debug("==================================================")
log.debug("==================================================\n")
assert dedup_stats == expected_dedup_stats
- verify_dedup_ratio(expected_dedup_stats, dedup_ratio_estimate)
log.debug("expcted_dedup::stats check completed successfully!!")
return ret
-
#-------------------------------------------------------------------------------
def prepare_test():
cleanup_local()
#make sure we are starting with all buckets empty
- if count_object_parts_in_all_buckets() != 0:
+ if count_object_parts_in_all_buckets(False, 0) != 0:
log.warning("The system was left dirty from previous run");
log.warning("Make sure to remove all objects before starting");
assert(0)
prepare_test()
try:
files=[]
- num_files = 8
- base_size = 4*KB
+ num_files = 5
+ base_size = 1*KB
log.debug("generate files: base size=%d KiB, max_size=%d KiB",
base_size/KB, (pow(2, num_files) * base_size)/KB)
gen_files(files, base_size, num_files)
bucket = conn.create_bucket(Bucket=bucket_name)
log.debug("upload objects to bucket <%s> ...", bucket_name)
indices = [0] * len(files)
- ret = upload_objects(bucket_name, files, indices, conn, default_config)
+ check_obj_count=True
+ ret = upload_objects(bucket_name, files, indices, conn, default_config, check_obj_count)
expected_results = ret[0]
dedup_stats = ret[1]
s3_objects_total = ret[2]
small_objs_dedup_stats.size_before_dedup = dedup_stats.size_before_dedup
small_objs_dedup_stats.skip_too_small_bytes=dedup_stats.size_before_dedup
small_objs_dedup_stats.skip_too_small = s3_objects_total
- assert small_objs_dedup_stats == dedup_stats
exec_dedup(dedup_stats, dry_run)
if dry_run == False:
log.debug("Verify all objects")
- verify_objects(bucket_name, files, conn, expected_results, default_config)
-
+ verify_objects(bucket_name, files, conn, expected_results, default_config, True)
finally:
# cleanup must be executed even after a failure
cleanup(bucket_name, conn)
bucket = conn.create_bucket(Bucket=bucket_name)
indices = [0] * len(files)
log.debug("upload objects to bucket <%s> ...", bucket_name)
- ret = upload_objects(bucket_name, files, indices, conn, config)
+ check_obj_count=True
+ ret = upload_objects(bucket_name, files, indices, conn, config, check_obj_count)
expected_results = ret[0]
dedup_stats = ret[1]
log.info("%d S3 objects were uploaded", ret[2])
exec_dedup(dedup_stats, dry_run)
if dry_run == False:
log.debug("Verify all objects")
- verify_objects(bucket_name, files, conn, expected_results, config)
-
- return ret
+ verify_objects(bucket_name, files, conn, expected_results, config, run_cleanup_after)
finally:
if run_cleanup_after:
# cleanup must be executed even after a failure
cleanup(bucket_name, conn)
+ return ret
+
#-------------------------------------------------------------------------------
def simple_dedup_with_tenants(files, conns, bucket_names, config, dry_run=False):
exec_dedup(dedup_stats, dry_run)
if dry_run == False:
log.debug("Verify all objects")
- verify_objects_multi(files, conns, bucket_names, expected_results, config)
+ verify_objects_multi(files, conns, bucket_names, expected_results, config,
+ False)
return ret
#-------------------------------------------------------------------------------
def threads_simple_dedup_with_tenants(files, conns, bucket_names, config, dry_run=False):
indices=[0] * len(files)
-
start = time.time_ns()
- upload_ret=procs_upload_objects(files, conns, bucket_names, indices, config)
+ check_obj_count=True
+ upload_ret=procs_upload_objects(files, conns, bucket_names, indices, config, check_obj_count)
upload_time_sec = (time.time_ns() - start) / (1000*1000*1000)
expected_results = upload_ret[0]
dedup_stats = upload_ret[1]
s3_objects_total = upload_ret[2]
-
exec_ret=exec_dedup(dedup_stats, dry_run)
exec_time_sec=exec_ret[0]
verify_time_sec=0
if dry_run == False:
- log.debug("Verify all objects")
+ log.debug("threads_simple_dedup_with_tenants::Verify all objects")
start = time.time_ns()
threads_verify_objects(files, conns, bucket_names,
expected_results, config)
"change_num_parts", "illegal_separator",
"illegal_dec_val_num_parts", "illegal_num_parts_overflow")
+
#------------------------------------------------------------------------------
def change_object_etag(rados_name, new_etag):
result = rados(['-p ', POOLNAME, 'setxattr', rados_name, ETAG_ATTR, new_etag])
new_etag=gen_new_etag(old_etag, corruption, expected_dedup_stats)
log.debug("Corruption:: %s\nold_etag=%s\nnew_etag=%s",
- corruption, old_etag, new_etag)
+ corruption, old_etag, new_etag)
change_object_etag(rados_name, new_etag)
return (rados_name, old_etag)
bucket = conn.create_bucket(Bucket=bucket_name)
indices = [0] * len(files)
- ret = upload_objects(bucket_name, files, indices, conn, default_config)
+ check_obj_count=True
+ ret = upload_objects(bucket_name, files, indices, conn, default_config, check_obj_count)
expected_results = ret[0]
expected_dedup_stats = ret[1]
s3_objects_total = ret[2]
conn=get_single_connection()
bucket = conn.create_bucket(Bucket=bucket_name)
indices = [0] * len(files)
- upload_objects(bucket_name, files, indices, conn, config2)
+ check_obj_count=True
+ upload_objects(bucket_name, files, indices, conn, config2, check_obj_count)
dedup_stats = Dedup_Stats()
# we wrote 2 different small objects (BLOCK_SIZE) with the same md5
dedup_stats.set_hash=dedup_stats.total_processed_objects
dedup_stats.hash_mismatch=1
s3_bytes_before=dedup_stats.size_before_dedup
- expected_ratio_actual=Dedup_Ratio()
- expected_ratio_actual.s3_bytes_before=s3_bytes_before
- expected_ratio_actual.s3_bytes_after=s3_bytes_before
- expected_ratio_actual.ratio=0
+ expected_ratio=Dedup_Ratio()
+ expected_ratio.s3_bytes_before=s3_bytes_before
+ expected_ratio.s3_bytes_after=s3_bytes_before
+ expected_ratio.ratio=0
dry_run=False
log.debug("test_md5_collisions: first call to exec_dedup")
- ret=exec_dedup(dedup_stats, dry_run)
+ ret=exec_dedup(dedup_stats, dry_run, True, 2*BLOCK_SIZE)
dedup_ratio_actual=ret[3]
-
- assert expected_ratio_actual == dedup_ratio_actual
+ assert expected_ratio == dedup_ratio_actual
dedup_stats.valid_hash=dedup_stats.total_processed_objects
dedup_stats.invalid_hash=0
dedup_stats.set_hash=0
log.debug("test_md5_collisions: second call to exec_dedup")
- ret=exec_dedup(dedup_stats, dry_run)
+ ret=exec_dedup(dedup_stats, dry_run, True, 2*BLOCK_SIZE)
dedup_ratio_actual=ret[3]
- assert expected_ratio_actual == dedup_ratio_actual
+ assert expected_ratio == dedup_ratio_actual
finally:
# cleanup must be executed even after a failure
cleanup(bucket_name, conn)
+#-------------------------------------------------------------------------------
+def loop_dedup_split_head_with_tenants():
+    """Upload objects of doubling sizes (16KB-32MB) across multiple tenants,
+    run full dedup with split-head enabled and verify all objects.
+    On failure, print the generated file list for reproduction."""
+    prepare_test()
+    config=default_config
+    success=False
+    max_copies_count=4
+    files=[]
+    num_files=11 # [16KB-32MB]
+    base_size = 16*KB
+    # initialize before the try block so the finally clause never raises a
+    # NameError (masking the real failure) when gen_files()/
+    # gen_connections_multi2() fail before the assignments below
+    bucket_names=[]
+    conns=[]
+    log.debug("generate files: base size=%d KiB, max_size=%d KiB",
+              base_size/KB, (pow(2, num_files) * base_size)/KB)
+    try:
+        gen_files(files, base_size, num_files, max_copies_count)
+        indices=[0] * len(files)
+        ret=gen_connections_multi2(max_copies_count)
+        #tenants=ret[0]
+        bucket_names=ret[1]
+        conns=ret[2]
+
+        ret=upload_objects_multi(files, conns, bucket_names, indices, config, True)
+        expected_results = ret[0]
+        dedup_stats = ret[1]
+
+        dry_run=False
+        exec_dedup(dedup_stats, dry_run, True)
+        log.debug("Verify all objects")
+        verify_objects_multi(files, conns, bucket_names, expected_results, config,
+                             True)
+        success=True
+    finally:
+        # cleanup must be executed even after a failure
+        cleanup_all_buckets(bucket_names, conns)
+        if not success:
+            print_files(files, config)
+
+
+#-------------------------------------------------------------------------------
+@pytest.mark.basic_test
+def test_dedup_split_head_with_tenants():
+    """Repeat the multi-tenant split-head dedup flow several times to catch
+    intermittent failures (object sizes are randomized per iteration)."""
+    #return
+
+    if full_dedup_is_disabled():
+        return
+
+    for idx in range(0, 9):
+        # removed stray C-style trailing semicolon
+        log.debug("test_dedup_split_head_with_tenants: loop #%d", idx)
+        loop_dedup_split_head_with_tenants()
+
+
+#-------------------------------------------------------------------------------
+def loop_dedup_split_head():
+    """Upload objects of doubling sizes (16KB-32MB) to a single bucket,
+    run full dedup with split-head enabled and verify all objects."""
+    prepare_test()
+    #bucket_name = gen_bucket_name()
+    bucket_name = "bucket1"
+    config=default_config
+    max_copies_count=4
+    files=[]
+    num_files=11 # [16KB-32MB]
+    base_size = 16*KB
+    log.debug("generate files: base size=%d KiB, max_size=%d KiB",
+              base_size/KB, (pow(2, num_files) * base_size)/KB)
+    # create the connection before the try block so the finally clause can
+    # always call cleanup() without raising a NameError if gen_files() fails
+    conn=get_single_connection()
+    try:
+        gen_files(files, base_size, num_files, max_copies_count)
+        indices=[0] * len(files)
+
+        conn.create_bucket(Bucket=bucket_name)
+        check_obj_count=True
+        ret=upload_objects(bucket_name, files, indices, conn, config, check_obj_count)
+        expected_results = ret[0]
+        dedup_stats = ret[1]
+        dry_run=False
+        exec_dedup(dedup_stats, dry_run, True)
+        verify_objects(bucket_name, files, conn, expected_results, config, True)
+    finally:
+        # cleanup must be executed even after a failure
+        cleanup(bucket_name, conn)
+
+
+#-------------------------------------------------------------------------------
+@pytest.mark.basic_test
+def test_dedup_split_head():
+    """Repeat the single-bucket split-head dedup flow several times to catch
+    intermittent failures (object sizes are randomized per iteration)."""
+    #return
+
+    if full_dedup_is_disabled():
+        return
+
+    for idx in range(0, 9):
+        # removed stray C-style trailing semicolon
+        log.debug("test_dedup_split_head: loop #%d", idx)
+        loop_dedup_split_head()
+
+#-------------------------------------------------------------------------------
+def dedup_copy_internal(multi_buckets):
+    """Upload objects via server-side copy (into one bucket or several),
+    run full dedup and verify the deduped object-part counts.
+
+    :param multi_buckets: when True spread the copies over max_copies_count
+                          buckets, otherwise copy everything into "bucket1"
+    """
+    if full_dedup_is_disabled():
+        return
+
+    prepare_test()
+    bucket_names=[]
+    config=default_config
+    max_copies_count=4
+    files=[]
+    num_files=6 # [5MB-512MB]
+    base_size=5*MB
+    log.debug("generate files: base size=%d KiB, max_size=%d KiB",
+              base_size/KB, (pow(2, num_files) * base_size)/KB)
+    conn=get_single_connection()
+    try:
+        gen_files(files, base_size, num_files, max_copies_count)
+        indices=[0] * len(files)
+        if multi_buckets:
+            bucket_names=create_buckets(conn, max_copies_count)
+        else:
+            bucket_name = "bucket1"
+            conn.create_bucket(Bucket=bucket_name)
+            bucket_names=[bucket_name] * max_copies_count
+
+        ret=upload_objects_with_copy(files, conn, bucket_names, indices, config)
+        expected_results = ret[0]
+        dedup_stats = ret[1]
+        dry_run=False
+        max_dedup_time = 5*60
+        exec_dedup_internal(dedup_stats, dry_run, max_dedup_time)
+
+        assert expected_results == count_object_parts_in_all_buckets(True, dedup_stats.size_before_dedup)
+        expected_results=0 # skip object_parts verification
+        conns=[conn] * len(bucket_names)
+        verify_objects_multi(files, conns, bucket_names, expected_results, config, True)
+    finally:
+        # cleanup must be executed even after a failure;
+        # bucket_names may still be empty if we failed before creating buckets
+        if multi_buckets:
+            for bucket_name in bucket_names:
+                cleanup(bucket_name, conn)
+        elif bucket_names:
+            cleanup(bucket_names[0], conn)
+
+
+#-------------------------------------------------------------------------------
+@pytest.mark.basic_test
+def test_dedup_copy():
+    """Server-side-copy dedup test, all copies in a single bucket."""
+    dedup_copy_internal(False)
+
+#-------------------------------------------------------------------------------
+@pytest.mark.basic_test
+def test_dedup_copy_multi_buckets():
+    """Server-side-copy dedup test, copies spread across multiple buckets."""
+    dedup_copy_internal(True)
+
#-------------------------------------------------------------------------------
@pytest.mark.basic_test
def test_dedup_small():
prepare_test()
max_copies_count=3
files=[]
- num_files=10 # [4KB-4MB]
- base_size = 4*KB
+ num_files=5 # [1KB-32KB]
+ base_size = 1*KB
log.debug("generate files: base size=%d KiB, max_size=%d KiB",
base_size/KB, (pow(2, num_files) * base_size)/KB)
try:
dry_run=False
exec_dedup(dedup_stats, dry_run)
log.debug("Verify all objects")
- verify_objects_multi(files, conns, bucket_names, expected_results, default_config)
+ verify_objects_multi(files, conns, bucket_names, expected_results,
+ default_config, True)
finally:
# cleanup must be executed even after a failure
cleanup_all_buckets(bucket_names, conns)
dedup_stats2.set_shared_manifest_src=0
dedup_stats2.deduped_obj=0
dedup_stats2.deduped_obj_bytes=0
- dedup_stats2.valid_hash=dedup_stats.invalid_hash
+ dedup_stats2.valid_hash=dedup_stats.unique_obj
dedup_stats2.invalid_hash=0
dedup_stats2.set_hash=0
# run dedup again and make sure nothing has changed
dry_run=False
exec_dedup(dedup_stats2, dry_run)
- verify_objects_multi(files, conns, bucket_names, expected_results, config)
+ verify_objects_multi(files, conns, bucket_names, expected_results,
+ config, True)
finally:
# cleanup must be executed even after a failure
cleanup_all_buckets(bucket_names, conns)
num_files = 11
gen_files_in_range(files, num_files, 1*MB, 64*MB)
# upload objects, dedup, verify, but don't cleanup
- ret = simple_dedup(conn, files, bucket_name, False, config, False)
+ run_cleanup_after=False
+ dry_run=False
+ ret = simple_dedup(conn, files, bucket_name, run_cleanup_after, config, dry_run)
expected_results = ret[0]
dedup_stats = ret[1]
s3_objects_total = ret[2]
dedup_stats2.set_shared_manifest_src=0
dedup_stats2.deduped_obj=0
dedup_stats2.deduped_obj_bytes=0
- dedup_stats2.valid_hash=dedup_stats.invalid_hash
+ dedup_stats2.valid_hash=dedup_stats.unique_obj
dedup_stats2.invalid_hash=0
dedup_stats2.set_hash=0
# run dedup again and make sure nothing has changed
dry_run=False
exec_dedup(dedup_stats2, dry_run)
- verify_objects(bucket_name, files, conn, expected_results, config)
+ verify_objects(bucket_name, files, conn, expected_results, config, True)
finally:
# cleanup must be executed even after a failure
cleanup(bucket_name, conn)
ret=upload_objects_multi(files_combined, conns, bucket_names, indices, config, False)
expected_results=ret[0]
stats_combined=ret[1]
+
stats_combined.skip_shared_manifest = stats_base.deduped_obj
stats_combined.skip_src_record -= stats_base.skip_src_record
stats_combined.skip_src_record += stats_base.set_shared_manifest_src
stats_combined.deduped_obj -= stats_base.deduped_obj
stats_combined.deduped_obj_bytes -= stats_base.deduped_obj_bytes
- stats_combined.valid_hash = stats_base.set_hash
+ stats_combined.valid_hash = stats_base.unique_obj
stats_combined.invalid_hash -= stats_base.set_hash
- stats_combined.set_hash -= stats_base.set_hash
+ stats_combined.set_hash = stats_combined.set_shared_manifest_src + stats_combined.deduped_obj
log.debug("test_dedup_inc_1_with_tenants: incremental dedup:")
# run dedup again
dry_run=False
exec_dedup(stats_combined, dry_run)
- verify_objects_multi(files_combined, conns, bucket_names, expected_results, config)
+ verify_objects_multi(files_combined, conns, bucket_names, expected_results,
+ config, True)
finally:
# cleanup must be executed even after a failure
cleanup_all_buckets(bucket_names, conns)
num_copies_combined=num_copies_to_add+num_copies_base
files_combined.append((filename, obj_size, num_copies_combined))
- ret=upload_objects(bucket_name, files_combined, indices, conn, config, False)
+ check_obj_count=False
+ ret=upload_objects(bucket_name, files_combined, indices, conn, config, check_obj_count)
expected_results = ret[0]
stats_combined = ret[1]
stats_combined.skip_shared_manifest = stats_base.deduped_obj
stats_combined.deduped_obj -= stats_base.deduped_obj
stats_combined.deduped_obj_bytes -= stats_base.deduped_obj_bytes
- stats_combined.valid_hash = stats_base.set_hash
+ stats_combined.valid_hash = stats_base.unique_obj
stats_combined.invalid_hash -= stats_base.set_hash
- stats_combined.set_hash -= stats_base.set_hash
+ stats_combined.set_hash = stats_combined.set_shared_manifest_src + stats_combined.deduped_obj
log.debug("test_dedup_inc_1: incremental dedup:")
# run dedup again
dry_run=False
exec_dedup(stats_combined, dry_run)
- verify_objects(bucket_name, files_combined, conn, expected_results, config)
+ verify_objects(bucket_name, files_combined, conn, expected_results, config, True)
finally:
# cleanup must be executed even after a failure
cleanup(bucket_name, conn)
stats_combined.deduped_obj -= stats_base.deduped_obj
stats_combined.deduped_obj_bytes -= stats_base.deduped_obj_bytes
- stats_combined.valid_hash = stats_base.set_hash
+ stats_combined.valid_hash = stats_base.unique_obj
stats_combined.invalid_hash -= stats_base.set_hash
- stats_combined.set_hash -= stats_base.set_hash
+ stats_combined.set_hash = stats_combined.set_shared_manifest_src + stats_combined.deduped_obj
log.debug("test_dedup_inc_2_with_tenants: incremental dedup:")
# run dedup again
dry_run=False
exec_dedup(stats_combined, dry_run)
- verify_objects_multi(files_combined, conns, bucket_names, expected_results, config)
+ verify_objects_multi(files_combined, conns, bucket_names, expected_results,
+ config, True)
finally:
# cleanup must be executed even after a failure
cleanup_all_buckets(bucket_names, conns)
indices.append(0)
assert(len(indices) == len(files_combined))
- ret=upload_objects(bucket_name, files_combined, indices, conn, config, False)
+ check_obj_count=False
+ ret=upload_objects(bucket_name, files_combined, indices, conn, config, check_obj_count)
expected_results = ret[0]
stats_combined = ret[1]
stats_combined.skip_shared_manifest = stats_base.deduped_obj
stats_combined.deduped_obj -= stats_base.deduped_obj
stats_combined.deduped_obj_bytes -= stats_base.deduped_obj_bytes
- stats_combined.valid_hash = stats_base.set_hash
+ stats_combined.valid_hash = stats_base.unique_obj
stats_combined.invalid_hash -= stats_base.set_hash
- stats_combined.set_hash -= stats_base.set_hash
+ stats_combined.set_hash = stats_combined.set_shared_manifest_src + stats_combined.deduped_obj
log.debug("test_dedup_inc_2: incremental dedup:")
# run dedup again
dry_run=False
exec_dedup(stats_combined, dry_run)
verify_objects(bucket_name, files_combined, conn, expected_results,
- config)
+ config, True)
finally:
# cleanup must be executed even after a failure
cleanup(bucket_name, conn)
@pytest.mark.basic_test
def test_dedup_inc_with_remove_multi_tenants():
#return
-
if full_dedup_is_disabled():
return
bucket_names=ret[1]
conns=ret[2]
try:
+ split_heads_count=0
+ split_heads_removed=0
+ split_heads=[]
files=[]
num_files = 17
# gen_files_in_range creates 2-3 copies
expected_results_base = ret[0]
stats_base = ret[1]
+ ### find which objects got split head before remove
+ for f in files:
+ obj_size=f[1]
+ num_copies=f[2]
+ split_head = calc_split_objs_count(obj_size, num_copies, config)
+ split_heads.append(split_head)
+ if split_head:
+ split_heads_count += 1
+
# REMOVE some objects and update stats/expected
src_record=0
shared_manifest=0
- valid_sha=0
+ valid_hash=0
object_keys=[]
files_sub=[]
dedup_stats = Dedup_Stats()
- for f in files:
+ for idx, f in enumerate(files):
filename=f[0]
obj_size=f[1]
num_copies=f[2]
num_copies_2=num_copies-num_remove
log.debug("objects::%s::size=%d, num_copies=%d", filename, obj_size, num_copies_2);
if num_copies_2:
- if num_copies_2 > 1 and obj_size > RADOS_OBJ_SIZE:
- valid_sha += num_copies_2
+ split_head = calc_split_objs_count(obj_size, num_copies_2, config)
+ if num_copies_2 > 1 and (obj_size > RADOS_OBJ_SIZE or split_head):
+ valid_hash += 1
src_record += 1
shared_manifest += (num_copies_2 - 1)
files_sub.append((filename, obj_size, num_copies_2))
calc_expected_stats(dedup_stats, obj_size, num_copies_2, config)
+ elif split_heads[idx]:
+ # we removed all copies of a split-head object
+ split_heads_count -= 1
+ split_heads_removed += 1
start_idx=num_copies_2
for i in range(start_idx, num_copies):
dedup_stats.set_shared_manifest_src=0
dedup_stats.deduped_obj=0
dedup_stats.deduped_obj_bytes=0
+
dedup_stats.skip_src_record=src_record
dedup_stats.skip_shared_manifest=shared_manifest
- dedup_stats.valid_hash=valid_sha
+ dedup_stats.valid_hash=valid_hash
dedup_stats.invalid_hash=0
dedup_stats.set_hash=0
dry_run=False
exec_dedup(dedup_stats, dry_run)
expected_results=calc_expected_results(files_sub, config)
- verify_objects_multi(files_sub, conns, bucket_names, expected_results, config)
+ expected_results += split_heads_count
+ verify_objects_multi(files_sub, conns, bucket_names, expected_results,
+ config, True)
finally:
# cleanup must be executed even after a failure
cleanup_all_buckets(bucket_names, conns)
@pytest.mark.basic_test
def test_dedup_inc_with_remove():
#return
-
if full_dedup_is_disabled():
return
log.debug("test_dedup_inc_with_remove: connect to AWS ...")
conn=get_single_connection()
try:
+ split_heads_count=0
+ split_heads_removed=0
+ split_heads=[]
files=[]
num_files = 17
gen_files_in_range(files, num_files, 1*MB, 64*MB)
expected_results_base = ret[0]
stats_base = ret[1]
+ ### find which objects got split head before remove
+ for f in files:
+ obj_size=f[1]
+ num_copies=f[2]
+ split_head = calc_split_objs_count(obj_size, num_copies, config)
+ split_heads.append(split_head)
+ if split_head:
+ split_heads_count += 1
+
# REMOVE some objects and update stats/expected
src_record=0
shared_manifest=0
- valid_sha=0
+ valid_hash=0
object_keys=[]
files_sub=[]
dedup_stats = Dedup_Stats()
- for f in files:
+ for idx, f in enumerate(files):
filename=f[0]
obj_size=f[1]
num_copies=f[2]
num_remove=random.randint(0, num_copies)
num_copies_2=num_copies-num_remove
- log.debug("objects::%s::size=%d, num_copies=%d", filename, obj_size, num_copies_2);
if num_copies_2:
- if num_copies_2 > 1 and obj_size > RADOS_OBJ_SIZE:
- valid_sha += num_copies_2
+ split_head = calc_split_objs_count(obj_size, num_copies_2, config)
+ if num_copies_2 > 1 and (obj_size > RADOS_OBJ_SIZE or split_head):
+ valid_hash += 1
src_record += 1
shared_manifest += (num_copies_2 - 1)
files_sub.append((filename, obj_size, num_copies_2))
calc_expected_stats(dedup_stats, obj_size, num_copies_2, config)
+ elif split_heads[idx]:
+ # we removed all copies of a split-head object
+ split_heads_count -= 1
+ split_heads_removed += 1
start_idx=num_copies_2
for i in range(start_idx, num_copies):
log.debug("Skiping file=%s, num_remove=%d", filename, num_remove)
continue
- response=conn.delete_objects(Bucket=bucket_name,
- Delete={"Objects": [{"Key": key} for key in object_keys]})
+ delete_objects(conn, bucket_name, object_keys)
# must call garbage collection for predictable count
result = admin(['gc', 'process', '--include-all'])
dedup_stats.deduped_obj_bytes=0
dedup_stats.skip_src_record=src_record
dedup_stats.skip_shared_manifest=shared_manifest
- dedup_stats.valid_hash=valid_sha
+ dedup_stats.valid_hash=valid_hash
dedup_stats.invalid_hash=0
dedup_stats.set_hash=0
log.debug("test_dedup_inc_with_remove: incremental dedup:")
log.debug("stats_base.size_before_dedup=%d", stats_base.size_before_dedup)
- log.debug("dedup_stats.size_before_dedup=%d", dedup_stats.size_before_dedup)
dry_run=False
exec_dedup(dedup_stats, dry_run)
expected_results=calc_expected_results(files_sub, config)
- verify_objects(bucket_name, files_sub, conn, expected_results, config)
+ expected_results += split_heads_count
+ verify_objects(bucket_name, files_sub, conn, expected_results, config, True)
finally:
# cleanup must be executed even after a failure
cleanup(bucket_name, conn)
simple_dedup(conn, files, bucket_name, True, default_config, False)
-
#-------------------------------------------------------------------------------
@pytest.mark.basic_test
def test_dedup_basic_with_tenants():
num_files=5
base_size = MULTIPART_SIZE
log.debug("generate files: base size=%d MiB, max_size=%d MiB",
- base_size/MB, (pow(2, num_files) * base_size)/MB)
+ base_size/MB, (pow(2, num_files) * base_size)/MB)
gen_files(files, base_size, num_files)
log.debug("call simple_dedup()")
- simple_dedup(conn, files, bucket_name, True, default_config, False)
+ run_cleanup_after=True
+ dry_run=False
+ simple_dedup(conn, files, bucket_name, run_cleanup_after, default_config, dry_run)
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
@pytest.mark.basic_test
def test_dedup_large_scale_with_tenants():
- return
+ #return
if full_dedup_is_disabled():
return
#-------------------------------------------------------------------------------
@pytest.mark.basic_test
def test_dedup_large_scale():
- return
+ #return
if full_dedup_is_disabled():
return
#-------------------------------------------------------------------------------
@pytest.mark.basic_test
def test_empty_bucket():
- return
+ #return
if full_dedup_is_disabled():
return
# add new files
num_files_new = 11
- gen_files_in_range(files_combined, num_files_new, 2*MB, 32*MB)
+ gen_files_in_range(files_combined, num_files_new, 1*MB, 32*MB)
pad_count = len(files_combined) - len(files)
for i in range(0, pad_count):
indices.append(0)
for f in files_combined:
obj_size=f[1]
num_copies=f[2]
- if num_copies > 1 and obj_size > RADOS_OBJ_SIZE:
+ split_head = calc_split_objs_count(obj_size, num_copies, config)
+ if num_copies > 1 and (obj_size > RADOS_OBJ_SIZE or split_head):
src_record += 1
stats_combined.skip_shared_manifest = stats_base.deduped_obj
stats_combined.deduped_obj -= stats_base.deduped_obj
stats_combined.deduped_obj_bytes -= stats_base.deduped_obj_bytes
- stats_combined.valid_hash = stats_base.set_hash
+ stats_combined.valid_hash = stats_base.unique_obj
stats_combined.invalid_hash -= stats_base.set_hash
- stats_combined.set_hash -= stats_base.set_hash
-
+ stats_combined.set_hash = stats_combined.set_shared_manifest_src + stats_combined.deduped_obj
log.debug("test_dedup_inc_2_with_tenants: incremental dedup:")
# run dedup again
dry_run=False
exec_dedup(stats_combined, dry_run)
- verify_objects_multi(files_combined, conns, bucket_names, expected_results, config)
+ verify_objects_multi(files_combined, conns, bucket_names, expected_results,
+ config, False)
return (files_combined, stats_combined)
files=[]
num_files = 13
# gen_files_in_range creates 2-3 copies
- gen_files_in_range(files, num_files, 1*MB, 64*MB)
+ gen_files_in_range(files, num_files, 256*KB, 64*MB)
# upload objects, dedup, verify, but don't cleanup
ret=simple_dedup_with_tenants(files, conns, bucket_names, config)
stats_base=ret[1]
files=ret[0]
stats_last=ret[1]
stats_base.set_shared_manifest_src += stats_last.set_shared_manifest_src
- stats_base.deduped_obj += stats_last.deduped_obj
- stats_base.deduped_obj_bytes += stats_last.deduped_obj_bytes
- stats_base.set_hash += stats_last.set_hash
+ stats_base.unique_obj += stats_last.set_shared_manifest_src
+ stats_base.deduped_obj += stats_last.deduped_obj
+ stats_base.deduped_obj_bytes += stats_last.deduped_obj_bytes
+ stats_base.set_hash += stats_last.set_hash
finally:
# cleanup must be executed even after a failure
cleanup_all_buckets(bucket_names, conns)
prepare_test()
max_copies_count=3
files=[]
- num_files=10 # [4KB-4MB]
- base_size = 4*KB
+ num_files=5 # [1KB-32KB]
+ base_size = 1*KB
log.debug("generate files: base size=%d KiB, max_size=%d KiB",
base_size/KB, (pow(2, num_files) * base_size)/KB)
try:
num_files=8
min_size=MULTIPART_SIZE
- #gen_files_in_range(files, num_files, min_size, 1024*MB)
+ # create files in range [MULTIPART_SIZE, 128MB] aligned on RADOS_OBJ_SIZE
gen_files_in_range(files, num_files, min_size, 128*MB)
- # add files in range [MULTIPART_SIZE, 4*MULTIPART_SIZE] aligned on MULTIPART_SIZE
+ # add files in range [MULTIPART_SIZE, 8*MULTIPART_SIZE] aligned on MULTIPART_SIZE
gen_files_in_range(files, num_files, min_size, min_size*8, MULTIPART_SIZE)
# add file with excatly MULTIPART_SIZE
conns[i].create_bucket(Bucket=bucket_names[i])
indices = [0] * len(files)
- ret=procs_upload_objects(files, conns, bucket_names, indices, default_config)
+ check_obj_count=True
+ ret=procs_upload_objects(files, conns, bucket_names, indices, default_config, check_obj_count)
upload_time_sec = (time.time_ns() - start) / (1000*1000*1000)
expected_results = ret[0]
dedup_stats = ret[1]
log.debug("obj_count=%d, upload_time=%d(sec)", s3_objects_total,
upload_time_sec)
exec_dedup(dedup_stats, dry_run)
- if dry_run == False:
- verify_objects(bucket_name, files, conn, expected_results, default_config)
finally:
# cleanup must be executed even after a failure
cleanup_all_buckets(bucket_names, conns)
size=1*KB
files=[]
config=TransferConfig(multipart_threshold=size, multipart_chunksize=1*MB)
- log.debug("test_dedup_dry_large_scale_with_tenants: connect to AWS ...")
+ log.info("test_dedup_dry_large_scale: connect to AWS ...")
gen_files_fixed_size(files, num_files, size, max_copies_count)
- threads_dedup_basic_with_tenants_common(files, num_threads, config, True)
+ conns=get_connections(num_threads)
+ bucket_names=get_buckets(num_threads)
+ for i in range(num_threads):
+ conns[i].create_bucket(Bucket=bucket_names[i])
+ try:
+ threads_simple_dedup_with_tenants(files, conns, bucket_names, config, True)
+ except Exception:
+ log.warning("test_dedup_dry_large_scale: failed!!")
+ finally:
+ # cleanup must be executed even after a failure
+ cleanup_all_buckets(bucket_names, conns)
#-------------------------------------------------------------------------------
#return
prepare_test()
- max_copies_count=3
- num_threads=64
- num_files=32*1024
+ bucket_name = gen_bucket_name()
+ max_copies_count=2
+ num_files=2*1024
size=1*KB
files=[]
config=TransferConfig(multipart_threshold=size, multipart_chunksize=1*MB)
- log.debug("test_dedup_dry_large_scale_new: connect to AWS ...")
- gen_files_fixed_size(files, num_files, size, max_copies_count)
- conns=get_connections(num_threads)
- bucket_names=get_buckets(num_threads)
- for i in range(num_threads):
- conns[i].create_bucket(Bucket=bucket_names[i])
+ log.info("test_dedup_dry_large_scale: connect to AWS ...")
try:
- threads_simple_dedup_with_tenants(files, conns, bucket_names, config, True)
- except:
- log.warning("test_dedup_dry_large_scale: failed!!")
+ gen_files_fixed_size(files, num_files, size, max_copies_count)
+ indices=[0] * len(files)
+ conn=get_single_connection()
+ conn.create_bucket(Bucket=bucket_name)
+ check_obj_count=True
+ ret=upload_objects(bucket_name, files, indices, conn, config, check_obj_count)
+ dedup_stats = ret[1]
+ dry_run=True
+ exec_dedup(dedup_stats, dry_run, True)
finally:
# cleanup must be executed even after a failure
- cleanup_all_buckets(bucket_names, conns)
+ cleanup(bucket_name, conn)
#-------------------------------------------------------------------------------