From 41d532302d10fb0dacea7332904b8b9f2d29842f Mon Sep 17 00:00:00 2001
From: JinyongHa
Date: Mon, 21 Feb 2022 02:09:17 +0000
Subject: [PATCH] dedup-tool: add sampling ratio to crawling

By using a lower sampling ratio, runtime deduplication can be done with
lower overhead. The crawler tries deduplication on a few sampled objects
in the base pool, and only deduplicates objects or chunks that are highly
duplicated among the samples. The "sampling-ratio" option controls the
percentage of objects to be picked.

Signed-off-by: JinyongHa
---
 src/tools/ceph_dedup_tool.cc | 47 ++++++++++++++++++++++++++----------
 1 file changed, 34 insertions(+), 13 deletions(-)

diff --git a/src/tools/ceph_dedup_tool.cc b/src/tools/ceph_dedup_tool.cc
index 827840f211b..02d278d0acb 100644
--- a/src/tools/ceph_dedup_tool.cc
+++ b/src/tools/ceph_dedup_tool.cc
@@ -182,6 +182,7 @@ po::options_description make_usage() {
     ("debug", ": enable debug")
     ("pgid", ": set pgid")
     ("chunk-dedup-threshold", po::value<uint32_t>(), ": set the threshold for chunk dedup (number of duplication) ")
+    ("sampling-ratio", po::value<int>(), ": set the sampling ratio (percentile)")
     ;
   desc.add(op_desc);
   return desc;
@@ -588,9 +589,12 @@ public:
   struct SampleDedupGlobal {
     FpStore fp_store;
     double object_dedup_threshold = -1;
+    double sampling_ratio = -1;
     SampleDedupGlobal(
-      int chunk_threshold) :
-      fp_store(chunk_threshold) { }
+      int chunk_threshold,
+      int sampling_ratio) :
+      fp_store(chunk_threshold),
+      sampling_ratio(static_cast<double>(sampling_ratio) / 100) { }
   };

   SampleDedupWorkerThread(
@@ -663,10 +667,10 @@ void SampleDedupWorkerThread::crawl()
     // Get the list of object IDs to deduplicate
     std::tie(objects, current_object) = get_objects(current_object, end, 100);

-    // Pick few objects to be processed. Crawling mode decides how many
-    // objects to pick (sampling ratio). Lower sampling ratio makes crawler
-    // have lower crawling overhead but find less duplication.
-    std::set<size_t> sampled_indexes = sample_object(objects.size());
+    // Pick a few objects to be processed. The sampling ratio decides how
+    // many objects to pick. A lower sampling ratio gives the crawler lower
+    // crawling overhead but finds less duplication.
+    auto sampled_indexes = sample_object(objects.size());
     for (size_t index : sampled_indexes) {
       ObjectItem target = objects[index];
       try_dedup_and_accumulate_result(target);
@@ -720,12 +724,18 @@ std::tuple<std::vector<ObjectItem>, ObjectCursor> SampleDedupWorkerThread::get_o
   return std::make_tuple(objects, next);
 }

-std::set<size_t> SampleDedupWorkerThread::sample_object(size_t count)
+std::vector<size_t> SampleDedupWorkerThread::sample_object(size_t count)
 {
-  std::set<size_t> indexes;
-  for (size_t index = 0 ; index < count ; index++) {
-    indexes.insert(index);
+  std::vector<size_t> indexes(count);
+  for (size_t i = 0 ; i < count ; i++) {
+    indexes[i] = i;
   }
+  default_random_engine generator;
+  shuffle(indexes.begin(), indexes.end(), generator);
+  size_t sampling_count = static_cast<double>(count) *
+    sample_dedup_global.sampling_ratio;
+  indexes.resize(sampling_count);
+
   return indexes;
 }

@@ -1545,6 +1555,10 @@ int make_crawling_daemon(const po::variables_map &opts)
   string chunk_pool_name = get_opts_chunk_pool(opts);
   unsigned max_thread = get_opts_max_thread(opts);

+  int sampling_ratio = -1;
+  if (opts.count("sampling-ratio")) {
+    sampling_ratio = opts["sampling-ratio"].as<int>();
+  }
   size_t chunk_size = 8192;
   if (opts.count("chunk-size")) {
     chunk_size = opts["chunk-size"].as<int>();
@@ -1554,7 +1568,7 @@ int make_crawling_daemon(const po::variables_map &opts)

   uint32_t chunk_dedup_threshold = -1;
   if (opts.count("chunk-dedup-threshold")) {
-    chunk_size = opts["chunk-dedup-threshold"].as<uint32_t>();
+    chunk_dedup_threshold = opts["chunk-dedup-threshold"].as<uint32_t>();
   }

   std::string chunk_algo = get_opts_chunk_algo(opts);
@@ -1613,8 +1627,15 @@ int make_crawling_daemon(const po::variables_map &opts)
     cerr << " operate fail : " << cpp_strerror(ret) << std::endl;
     return ret;
   }
+  ret = rados.mon_command(
+    make_pool_str(base_pool_name, "dedup_tier", chunk_pool_name),
+    inbl, NULL, NULL);
+  if (ret < 0) {
+    cerr << " operate fail : " << cpp_strerror(ret) << std::endl;
+    return ret;
+  }

-  cout << "Object Dedup Threshold : " << object_dedup_threshold << std::endl
+  cout << "SampleRatio : " << sampling_ratio << std::endl
     << "Chunk Dedup Threshold : " << chunk_dedup_threshold << std::endl
     << "Chunk Size : " << chunk_size << std::endl
     << std::endl;
@@ -1641,7 +1662,7 @@ int make_crawling_daemon(const po::variables_map &opts)
     estimate_threads.clear();

     SampleDedupWorkerThread::SampleDedupGlobal sample_dedup_global(
-      chunk_dedup_threshold);
+      chunk_dedup_threshold, sampling_ratio);

     for (unsigned i = 0; i < max_thread; i++) {
       cout << " add thread.. " << std::endl;
-- 
2.39.5
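
For reference, below is a minimal standalone sketch of the shuffle-and-truncate
sampling that sample_object() implements in this patch: build the full index
list, shuffle it, and keep only the first ratio * count entries, which yields
uniform sampling without replacement in O(count). The helper name
sample_indexes(), the 10% ratio in main(), and the explicit std:: qualifiers
are illustrative assumptions, not code from the patch.

  // sampling_sketch.cc - illustrative sketch only, not part of the patch.
  #include <algorithm>
  #include <iostream>
  #include <random>
  #include <vector>

  // Return roughly sampling_ratio (0.0 - 1.0) of the indexes [0, count),
  // chosen uniformly at random by shuffling the full index list and
  // truncating it, mirroring sample_object() above.
  std::vector<size_t> sample_indexes(size_t count, double sampling_ratio)
  {
    std::vector<size_t> indexes(count);
    for (size_t i = 0; i < count; i++) {
      indexes[i] = i;
    }
    // Default-seeded engine, as in the patch; seed it explicitly if each
    // run should sample a different subset.
    std::default_random_engine generator;
    std::shuffle(indexes.begin(), indexes.end(), generator);
    size_t sampling_count = static_cast<double>(count) * sampling_ratio;
    indexes.resize(sampling_count);
    return indexes;
  }

  int main()
  {
    // A 10% sampling ratio over 100 objects keeps 10 object indexes.
    for (size_t index : sample_indexes(100, 0.1)) {
      std::cout << index << " ";
    }
    std::cout << std::endl;
    return 0;
  }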