From c828e5d77c0e10693f80c060673a42ae924f07e7 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 21 May 2020 11:01:24 -0500 Subject: [PATCH] common/rabin: remove The implementation is buggy, and slower than FastCDC. Signed-off-by: Sage Weil --- qa/workunits/rados/test_dedup_tool.sh | 40 -------- src/common/CDC.cc | 6 -- src/common/CMakeLists.txt | 1 - src/common/FastCDC.cc | 3 +- src/common/rabin.cc | 135 -------------------------- src/common/rabin.h | 110 --------------------- src/test/common/CMakeLists.txt | 5 - src/test/common/test_cdc.cc | 1 - src/tools/ceph_dedup_tool.cc | 2 +- 9 files changed, 2 insertions(+), 301 deletions(-) delete mode 100644 src/common/rabin.cc delete mode 100644 src/common/rabin.h diff --git a/qa/workunits/rados/test_dedup_tool.sh b/qa/workunits/rados/test_dedup_tool.sh index a6e404ee6af1f..c1a9888d0a75e 100755 --- a/qa/workunits/rados/test_dedup_tool.sh +++ b/qa/workunits/rados/test_dedup_tool.sh @@ -153,48 +153,8 @@ function test_dedup_chunk_scrub() $RADOS_TOOL -p $POOL rm bar } -function test_dedup_ratio_rabin() -{ - # case 1 - echo "abcdefghijklmnop" >> dedup_16 - for num in `seq 0 63` - do - dd if=./dedup_16 bs=16 count=1 >> dedup_object_1k - done - - for num in `seq 0 11` - do - dd if=dedup_object_1k bs=1K count=1 >> test_rabin_object - done - $RADOS_TOOL -p $POOL put $OBJ ./test_rabin_object - RESULT=$($DEDUP_TOOL --op estimate --pool $POOL --min-chunk 1015 --chunk-algorithm rabin --fingerprint-algorithm rabin --debug | grep result -a | awk '{print$4}') - if [ 4096 -ne $RESULT ]; - then - die "Estimate failed expecting 4096 result $RESULT" - fi - - echo "a" >> test_rabin_object_2 - dd if=./test_rabin_object bs=8K count=1 >> test_rabin_object_2 - $RADOS_TOOL -p $POOL put $OBJ"_2" ./test_rabin_object_2 - RESULT=$($DEDUP_TOOL --op estimate --pool $POOL --min-chunk 1012 --chunk-algorithm rabin --fingerprint-algorithm rabin --debug | grep result -a | awk '{print$4}') - if [ 11259 -ne $RESULT ]; - then - die "Estimate failed expecting 11259 result $RESULT" - fi - - RESULT=$($DEDUP_TOOL --op estimate --pool $POOL --min-chunk 1024 --chunk-mask-bit 3 --chunk-algorithm rabin --fingerprint-algorithm rabin --debug | grep result -a | awk '{print$4}') - if [ 7170 -ne $RESULT ]; - then - die "Estimate failed expecting 7170 result $RESULT" - fi - - rm -rf ./dedup_object_1k ./test_rabin_object ./test_rabin_object_2 ./dedup_16 - -} - test_dedup_ratio_fixed test_dedup_chunk_scrub -test_dedup_ratio_rabin $CEPH_TOOL osd pool delete $POOL $POOL --yes-i-really-really-mean-it diff --git a/src/common/CDC.cc b/src/common/CDC.cc index bf455db8a44ae..69cb978278f04 100644 --- a/src/common/CDC.cc +++ b/src/common/CDC.cc @@ -6,18 +6,12 @@ #include "CDC.h" #include "FastCDC.h" #include "FixedCDC.h" -#include "rabin.h" std::unique_ptr CDC::create( const std::string& type, int bits, int windowbits) { - if (type == "rabin") { - auto p = new RabinChunk(); - p->set_target_bits(bits, windowbits); - return std::unique_ptr(p); - } if (type == "fastcdc") { return std::unique_ptr(new FastCDC(bits, windowbits)); } diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index f3c248f290bda..fbf6915f2c6a7 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -91,7 +91,6 @@ set(common_srcs perf_counters_collection.cc perf_histogram.cc pick_address.cc - rabin.cc random_string.cc reverse.c run_cmd.cc diff --git a/src/common/FastCDC.cc b/src/common/FastCDC.cc index 70358e9782eaa..e44eb1b2ce74c 100644 --- a/src/common/FastCDC.cc +++ b/src/common/FastCDC.cc @@ -4,10 +4,9 @@ #include #include "FastCDC.h" -#include "rabin.h" -// Unlike FastCDC describe in the paper, if we are close to the +// Unlike FastCDC described in the paper, if we are close to the // target, use the target mask. If we are very small or very large, // use an adjusted mask--like the paper. This tries to keep more // cut points using the same mask, and fewer using the small or large diff --git a/src/common/rabin.cc b/src/common/rabin.cc deleted file mode 100644 index 9a6bab4100c7d..0000000000000 --- a/src/common/rabin.cc +++ /dev/null @@ -1,135 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include - -#include "include/types.h" -#include "rabin.h" - - -uint64_t RabinChunk::gen_rabin_hash(char* chunk_data, uint64_t off, uint64_t len) const { - uint64_t roll_sum = 0; - uint64_t data_len = len; - if (data_len == 0) { - data_len = window_size; - } - for (uint64_t i = off; i < data_len; i++) { - char cur_byte = *(chunk_data + i); - roll_sum = (roll_sum * rabin_prime + cur_byte ) % (mod_prime) ; - } - return roll_sum; -} - -bool RabinChunk::end_of_chunk(const uint64_t fp , int numbits) const { - return ((fp & rabin_mask[numbits]) == 0) ; -} - -/* - * Given a bufferlist of inputdata, use Rabin-fingerprint to - * chunk it and return the chunked result - * - * Arguments: - * min: min data chunk size - * max: max data chunk size - * - * Returns: - * output_chunks split by Rabin - */ - -int RabinChunk::do_rabin_chunks(ceph::buffer::list& inputdata, - std::vector>& chunks, - uint64_t min_val, uint64_t max_val) const -{ - char *ptr = inputdata.c_str(); - uint64_t data_size = inputdata.length(); - uint64_t min, max; - min = min_val; - max = max_val; - - - if (min == 0 || max == 0) { - min = this->min; - max = this->max; - } - - if (min < window_size) { - return -ERANGE; - } - - if (data_size < min) { - chunks.push_back(std::make_pair(0, data_size)); - return 0; - } - - uint64_t c_offset = 0; - uint64_t c_size = 0; - uint64_t c_start = 0; - uint64_t rabin_hash; - bool start_new_chunk = true; - bool store_chunk = false; - - - while (c_offset + window_size < data_size) { // if it is still possible to calculate rabin hash - - if (start_new_chunk) { - rabin_hash = gen_rabin_hash(ptr, c_offset); // start calculating for a new chunk - c_size = window_size; // don't forget set c_size - start_new_chunk = false; - } else { - // use existing rabin to calculate a new rabin hash - // note c_offset already increased by 1 - // old byte pointed by ptr + c_offset - 1 - // new byte pointed by ptr + c_offset + WINDOW_SIZE -1; - - char new_byte = *(ptr + c_offset + window_size - 1); - char old_byte = *(ptr + c_offset - 1); - - // TODO modulus POW_47 is too large a constant in c++ even for 64 bit unsinged int - rabin_hash = (rabin_hash * rabin_prime + new_byte - old_byte * pow) % (mod_prime); - } - - /* - Case 1 : Fingerprint Found - subcase 1 : if c_size < min -> ignore - subcase 2 : if min <= c_size <= max -> store - subcase 3 : if c_size > max -> won't happen - Case 2 : Fingerprint not Found - subcase 1 : if c_size < min -> ignore - subcase 2 : if min <= c_size < max -> ignore - subcase 3 : if c_size == max -> (force) store - */ - - if (end_of_chunk(rabin_hash, num_bits)) { - if((c_size >= min && c_size <= max)) { // a valid chunk with rabin - store_chunk = true; - } else { - store_chunk = false; - } - } else { - if (c_size == max) { - store_chunk = true; - } else { - store_chunk = false; - } - } - - if (store_chunk) { - chunks.push_back(std::make_pair(c_start, c_size)); - c_start += c_size; - c_offset = c_start; - start_new_chunk = true; - continue; - - } - - c_size++; - c_offset++; - } - - if (c_start < data_size) { - c_size = data_size - c_start; - chunks.push_back(std::make_pair(c_start, c_size)); - } - - return 0; -} diff --git a/src/common/rabin.h b/src/common/rabin.h deleted file mode 100644 index 939aa4ff846a1..0000000000000 --- a/src/common/rabin.h +++ /dev/null @@ -1,110 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Authors : Yuan-Ting Hsieh, Hsuan-Heng Wu, Myoungwon Oh - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef CEPH_COMMON_RABIN_H_ -#define CEPH_COMMON_RABIN_H_ - -#include -#include -#include - -#include "CDC.h" -#include "include/buffer_fwd.h" - -class RabinChunk : public CDC { -public: - RabinChunk(uint32_t window_size, uint32_t rabin_prime, - uint64_t mod_prime, uint64_t pow, std::vector rabin_mask, - uint64_t min, uint64_t max, uint32_t num_bits): - window_size(window_size), rabin_prime(rabin_prime), - mod_prime(mod_prime), pow(pow), rabin_mask(rabin_mask), min(min), - max(max), num_bits(num_bits) {} - RabinChunk() { - default_init_rabin_options(); - } - - void default_init_rabin_options() { - std::vector _rabin_mask; - for (size_t i = 0; i < 63; ++i) { - _rabin_mask.push_back((1ull << i) - 1); - } - window_size = 48; - rabin_prime = 3; - mod_prime = 6148914691236517051; - pow = 907234050803559263; // pow(prime, window_size) - min = 8000; - max = 16000; - num_bits = 3; - rabin_mask = _rabin_mask; - } - - void calc_chunks( - const ceph::buffer::list& inputdata, - std::vector> *chunks) const override { - bufferlist b = inputdata; - do_rabin_chunks(b, *chunks); - } - - int do_rabin_chunks(ceph::buffer::list& inputdata, - std::vector>& chunks, - uint64_t min=0, uint64_t max=0) const; - uint64_t gen_rabin_hash(char* chunk_data, uint64_t off, uint64_t len = 0) const; - void set_window_size(uint32_t size) { window_size = size; } - void set_rabin_prime(uint32_t r_prime) { rabin_prime = r_prime; } - void set_mod_prime(uint64_t m_prime) { mod_prime = m_prime; } - void set_pow(uint64_t p) { pow = p; } - void set_rabin_mask(std::vector & mask) { rabin_mask = mask; } - void set_min_chunk(uint32_t c_min) { min = c_min; } - void set_max_chunk(uint32_t c_max) { max = c_max; } - - int add_rabin_mask(uint64_t mask) { - rabin_mask.push_back(mask); - for (int i = 0; rabin_mask.size(); i++) { - if (rabin_mask[i] == mask) { - return i; - } - } - return -1; - } - void set_numbits(uint32_t bits) { - ceph_assert(bits > 0); - ceph_assert(bits < 63); - num_bits = bits; - } - - // most users should use this - void set_target_bits(int bits, int windowbits=0) override { - set_numbits(bits); - if (!windowbits) { - windowbits = 2; - } - set_min_chunk(1 << (bits - windowbits)); - set_max_chunk(1 << (bits + windowbits)); - } - -private: - bool end_of_chunk(const uint64_t fp , int numbits) const; - - uint32_t window_size; - uint32_t rabin_prime; - uint64_t mod_prime; - uint64_t pow; - std::vector rabin_mask; - uint64_t min; - uint64_t max; - uint32_t num_bits; -}; - - -#endif // CEPH_COMMON_RABIN_H_ diff --git a/src/test/common/CMakeLists.txt b/src/test/common/CMakeLists.txt index 45fc428fd7470..9ea59cc98b7b9 100644 --- a/src/test/common/CMakeLists.txt +++ b/src/test/common/CMakeLists.txt @@ -329,11 +329,6 @@ add_executable(unittest_async_shared_mutex test_async_shared_mutex.cc) add_ceph_unittest(unittest_async_shared_mutex) target_link_libraries(unittest_async_shared_mutex ceph-common Boost::system) -add_executable(unittest_rabin_chunk test_rabin_chunk.cc - $) -target_link_libraries(unittest_rabin_chunk global ceph-common) -add_ceph_unittest(unittest_rabin_chunk) - add_executable(unittest_cdc test_cdc.cc $) target_link_libraries(unittest_cdc global ceph-common) diff --git a/src/test/common/test_cdc.cc b/src/test/common/test_cdc.cc index 00187804fb179..c8754261f408a 100644 --- a/src/test/common/test_cdc.cc +++ b/src/test/common/test_cdc.cc @@ -180,5 +180,4 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( "fixed", // note: we skip most tests bc this is not content-based "fastcdc" - //, "rabin" // rabin fails insert_{front,middle} )); diff --git a/src/tools/ceph_dedup_tool.cc b/src/tools/ceph_dedup_tool.cc index d8eb160035329..48d468b157ed6 100644 --- a/src/tools/ceph_dedup_tool.cc +++ b/src/tools/ceph_dedup_tool.cc @@ -133,7 +133,7 @@ void usage() cout << " usage: [--op ] [--pool ] " << std::endl; cout << " --object " << std::endl; cout << " --chunk-size chunk-size (byte) " << std::endl; - cout << " --chunk-algorithm " << std::endl; + cout << " --chunk-algorithm " << std::endl; cout << " --fingerprint-algorithm " << std::endl; cout << " --chunk-pool " << std::endl; cout << " --max-thread " << std::endl; -- 2.39.5