The implementation is buggy, and slower than FastCDC.
Signed-off-by: Sage Weil <sage@newdream.net>
$RADOS_TOOL -p $POOL rm bar
}
-function test_dedup_ratio_rabin()
-{
- # case 1
- echo "abcdefghijklmnop" >> dedup_16
- for num in `seq 0 63`
- do
- dd if=./dedup_16 bs=16 count=1 >> dedup_object_1k
- done
-
- for num in `seq 0 11`
- do
- dd if=dedup_object_1k bs=1K count=1 >> test_rabin_object
- done
- $RADOS_TOOL -p $POOL put $OBJ ./test_rabin_object
- RESULT=$($DEDUP_TOOL --op estimate --pool $POOL --min-chunk 1015 --chunk-algorithm rabin --fingerprint-algorithm rabin --debug | grep result -a | awk '{print$4}')
- if [ 4096 -ne $RESULT ];
- then
- die "Estimate failed expecting 4096 result $RESULT"
- fi
-
- echo "a" >> test_rabin_object_2
- dd if=./test_rabin_object bs=8K count=1 >> test_rabin_object_2
- $RADOS_TOOL -p $POOL put $OBJ"_2" ./test_rabin_object_2
- RESULT=$($DEDUP_TOOL --op estimate --pool $POOL --min-chunk 1012 --chunk-algorithm rabin --fingerprint-algorithm rabin --debug | grep result -a | awk '{print$4}')
- if [ 11259 -ne $RESULT ];
- then
- die "Estimate failed expecting 11259 result $RESULT"
- fi
-
- RESULT=$($DEDUP_TOOL --op estimate --pool $POOL --min-chunk 1024 --chunk-mask-bit 3 --chunk-algorithm rabin --fingerprint-algorithm rabin --debug | grep result -a | awk '{print$4}')
- if [ 7170 -ne $RESULT ];
- then
- die "Estimate failed expecting 7170 result $RESULT"
- fi
-
- rm -rf ./dedup_object_1k ./test_rabin_object ./test_rabin_object_2 ./dedup_16
-
-}
-
test_dedup_ratio_fixed
test_dedup_chunk_scrub
-test_dedup_ratio_rabin
$CEPH_TOOL osd pool delete $POOL $POOL --yes-i-really-really-mean-it
#include "CDC.h"
#include "FastCDC.h"
#include "FixedCDC.h"
-#include "rabin.h"
std::unique_ptr<CDC> CDC::create(
const std::string& type,
int bits,
int windowbits)
{
- if (type == "rabin") {
- auto p = new RabinChunk();
- p->set_target_bits(bits, windowbits);
- return std::unique_ptr<CDC>(p);
- }
if (type == "fastcdc") {
return std::unique_ptr<CDC>(new FastCDC(bits, windowbits));
}
perf_counters_collection.cc
perf_histogram.cc
pick_address.cc
- rabin.cc
random_string.cc
reverse.c
run_cmd.cc
#include <random>
#include "FastCDC.h"
-#include "rabin.h"
-// Unlike FastCDC describe in the paper, if we are close to the
+// Unlike FastCDC described in the paper, if we are close to the
// target, use the target mask. If we are very small or very large,
// use an adjusted mask--like the paper. This tries to keep more
// cut points using the same mask, and fewer using the small or large
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include <cstring>
-
-#include "include/types.h"
-#include "rabin.h"
-
-
-uint64_t RabinChunk::gen_rabin_hash(char* chunk_data, uint64_t off, uint64_t len) const {
- uint64_t roll_sum = 0;
- uint64_t data_len = len;
- if (data_len == 0) {
- data_len = window_size;
- }
- for (uint64_t i = off; i < data_len; i++) {
- char cur_byte = *(chunk_data + i);
- roll_sum = (roll_sum * rabin_prime + cur_byte ) % (mod_prime) ;
- }
- return roll_sum;
-}
-
-bool RabinChunk::end_of_chunk(const uint64_t fp , int numbits) const {
- return ((fp & rabin_mask[numbits]) == 0) ;
-}
-
-/*
- * Given a bufferlist of inputdata, use Rabin-fingerprint to
- * chunk it and return the chunked result
- *
- * Arguments:
- * min: min data chunk size
- * max: max data chunk size
- *
- * Returns:
- * output_chunks split by Rabin
- */
-
-int RabinChunk::do_rabin_chunks(ceph::buffer::list& inputdata,
- std::vector<std::pair<uint64_t, uint64_t>>& chunks,
- uint64_t min_val, uint64_t max_val) const
-{
- char *ptr = inputdata.c_str();
- uint64_t data_size = inputdata.length();
- uint64_t min, max;
- min = min_val;
- max = max_val;
-
-
- if (min == 0 || max == 0) {
- min = this->min;
- max = this->max;
- }
-
- if (min < window_size) {
- return -ERANGE;
- }
-
- if (data_size < min) {
- chunks.push_back(std::make_pair(0, data_size));
- return 0;
- }
-
- uint64_t c_offset = 0;
- uint64_t c_size = 0;
- uint64_t c_start = 0;
- uint64_t rabin_hash;
- bool start_new_chunk = true;
- bool store_chunk = false;
-
-
- while (c_offset + window_size < data_size) { // if it is still possible to calculate rabin hash
-
- if (start_new_chunk) {
- rabin_hash = gen_rabin_hash(ptr, c_offset); // start calculating for a new chunk
- c_size = window_size; // don't forget set c_size
- start_new_chunk = false;
- } else {
- // use existing rabin to calculate a new rabin hash
- // note c_offset already increased by 1
- // old byte pointed by ptr + c_offset - 1
- // new byte pointed by ptr + c_offset + WINDOW_SIZE -1;
-
- char new_byte = *(ptr + c_offset + window_size - 1);
- char old_byte = *(ptr + c_offset - 1);
-
- // TODO modulus POW_47 is too large a constant in c++ even for 64 bit unsinged int
- rabin_hash = (rabin_hash * rabin_prime + new_byte - old_byte * pow) % (mod_prime);
- }
-
- /*
- Case 1 : Fingerprint Found
- subcase 1 : if c_size < min -> ignore
- subcase 2 : if min <= c_size <= max -> store
- subcase 3 : if c_size > max -> won't happen
- Case 2 : Fingerprint not Found
- subcase 1 : if c_size < min -> ignore
- subcase 2 : if min <= c_size < max -> ignore
- subcase 3 : if c_size == max -> (force) store
- */
-
- if (end_of_chunk(rabin_hash, num_bits)) {
- if((c_size >= min && c_size <= max)) { // a valid chunk with rabin
- store_chunk = true;
- } else {
- store_chunk = false;
- }
- } else {
- if (c_size == max) {
- store_chunk = true;
- } else {
- store_chunk = false;
- }
- }
-
- if (store_chunk) {
- chunks.push_back(std::make_pair(c_start, c_size));
- c_start += c_size;
- c_offset = c_start;
- start_new_chunk = true;
- continue;
-
- }
-
- c_size++;
- c_offset++;
- }
-
- if (c_start < data_size) {
- c_size = data_size - c_start;
- chunks.push_back(std::make_pair(c_start, c_size));
- }
-
- return 0;
-}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Authors : Yuan-Ting Hsieh, Hsuan-Heng Wu, Myoungwon Oh
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#ifndef CEPH_COMMON_RABIN_H_
-#define CEPH_COMMON_RABIN_H_
-
-#include <cstdint>
-#include <utility>
-#include <vector>
-
-#include "CDC.h"
-#include "include/buffer_fwd.h"
-
-class RabinChunk : public CDC {
-public:
- RabinChunk(uint32_t window_size, uint32_t rabin_prime,
- uint64_t mod_prime, uint64_t pow, std::vector<uint64_t> rabin_mask,
- uint64_t min, uint64_t max, uint32_t num_bits):
- window_size(window_size), rabin_prime(rabin_prime),
- mod_prime(mod_prime), pow(pow), rabin_mask(rabin_mask), min(min),
- max(max), num_bits(num_bits) {}
- RabinChunk() {
- default_init_rabin_options();
- }
-
- void default_init_rabin_options() {
- std::vector<uint64_t> _rabin_mask;
- for (size_t i = 0; i < 63; ++i) {
- _rabin_mask.push_back((1ull << i) - 1);
- }
- window_size = 48;
- rabin_prime = 3;
- mod_prime = 6148914691236517051;
- pow = 907234050803559263; // pow(prime, window_size)
- min = 8000;
- max = 16000;
- num_bits = 3;
- rabin_mask = _rabin_mask;
- }
-
- void calc_chunks(
- const ceph::buffer::list& inputdata,
- std::vector<std::pair<uint64_t, uint64_t>> *chunks) const override {
- bufferlist b = inputdata;
- do_rabin_chunks(b, *chunks);
- }
-
- int do_rabin_chunks(ceph::buffer::list& inputdata,
- std::vector<std::pair<uint64_t, uint64_t>>& chunks,
- uint64_t min=0, uint64_t max=0) const;
- uint64_t gen_rabin_hash(char* chunk_data, uint64_t off, uint64_t len = 0) const;
- void set_window_size(uint32_t size) { window_size = size; }
- void set_rabin_prime(uint32_t r_prime) { rabin_prime = r_prime; }
- void set_mod_prime(uint64_t m_prime) { mod_prime = m_prime; }
- void set_pow(uint64_t p) { pow = p; }
- void set_rabin_mask(std::vector<uint64_t> & mask) { rabin_mask = mask; }
- void set_min_chunk(uint32_t c_min) { min = c_min; }
- void set_max_chunk(uint32_t c_max) { max = c_max; }
-
- int add_rabin_mask(uint64_t mask) {
- rabin_mask.push_back(mask);
- for (int i = 0; rabin_mask.size(); i++) {
- if (rabin_mask[i] == mask) {
- return i;
- }
- }
- return -1;
- }
- void set_numbits(uint32_t bits) {
- ceph_assert(bits > 0);
- ceph_assert(bits < 63);
- num_bits = bits;
- }
-
- // most users should use this
- void set_target_bits(int bits, int windowbits=0) override {
- set_numbits(bits);
- if (!windowbits) {
- windowbits = 2;
- }
- set_min_chunk(1 << (bits - windowbits));
- set_max_chunk(1 << (bits + windowbits));
- }
-
-private:
- bool end_of_chunk(const uint64_t fp , int numbits) const;
-
- uint32_t window_size;
- uint32_t rabin_prime;
- uint64_t mod_prime;
- uint64_t pow;
- std::vector<uint64_t> rabin_mask;
- uint64_t min;
- uint64_t max;
- uint32_t num_bits;
-};
-
-
-#endif // CEPH_COMMON_RABIN_H_
add_ceph_unittest(unittest_async_shared_mutex)
target_link_libraries(unittest_async_shared_mutex ceph-common Boost::system)
-add_executable(unittest_rabin_chunk test_rabin_chunk.cc
- $<TARGET_OBJECTS:unit-main>)
-target_link_libraries(unittest_rabin_chunk global ceph-common)
-add_ceph_unittest(unittest_rabin_chunk)
-
add_executable(unittest_cdc test_cdc.cc
$<TARGET_OBJECTS:unit-main>)
target_link_libraries(unittest_cdc global ceph-common)
::testing::Values(
"fixed", // note: we skip most tests bc this is not content-based
"fastcdc"
- //, "rabin" // rabin fails insert_{front,middle}
));
cout << " usage: [--op <estimate|chunk_scrub|add_chunk_ref|get_chunk_ref>] [--pool <pool_name> ] " << std::endl;
cout << " --object <object_name> " << std::endl;
cout << " --chunk-size <size> chunk-size (byte) " << std::endl;
- cout << " --chunk-algorithm <fixed|rabin|fastcdc> " << std::endl;
+ cout << " --chunk-algorithm <fixed|fastcdc> " << std::endl;
cout << " --fingerprint-algorithm <sha1|sha256|sha512> " << std::endl;
cout << " --chunk-pool <pool name> " << std::endl;
cout << " --max-thread <threads> " << std::endl;