#include "rabin.h"
#include <string.h>
-char * const_zero = 0;
-uint64_t rabin_mask[] = {0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
-
-
-uint64_t gen_rabin_hash(char* chunk_data, uint64_t off) {
+/*
+ * Computes the polynomial hash of the window_size bytes of chunk_data
+ * that start at offset off, i.e. the bytes [off, off + window_size).
+ * Each step is (hash * rabin_prime + byte) % mod_prime.
+ * The caller must guarantee that the whole range is readable.
+ */
+uint64_t RabinChunk::gen_rabin_hash(char* chunk_data, uint64_t off) {
uint64_t roll_sum = 0;
- for (uint64_t i = off; i < WINDOW_SIZE; i++) {
+ // The upper bound must be relative to off: bounding by window_size alone
+ // hashes fewer than window_size bytes (none at all once off >= window_size).
+ for (uint64_t i = off; i < off + window_size; i++) {
char cur_byte = *(chunk_data + i);
- roll_sum = (roll_sum * RABIN_PRIME + cur_byte ) % (MOD_PRIME) ;
+ roll_sum = (roll_sum * rabin_prime + cur_byte ) % (mod_prime) ;
}
return roll_sum;
}
-bool end_of_chunk(const uint64_t fp) {
- return ((fp & RABIN_MASK) == 0) ;
-}
-
-bool end_of_chunk(const uint64_t fp , int numbits) {
+/*
+ * Returns true when fp marks a chunk boundary, i.e. when the bits of fp
+ * selected by rabin_mask[numbits] are all zero.
+ */
+bool RabinChunk::end_of_chunk(const uint64_t fp , int numbits) {
+ // numbits is used as an index into rabin_mask; never read outside the
+ // table. An out-of-range numbits is reported as "not a boundary".
+ if (numbits < 0 || static_cast<size_t>(numbits) >= rabin_mask.size()) {
+ return false;
+ }
return ((fp & rabin_mask[numbits]) == 0) ;
}
* output_chunks split by Rabin
*/
-void get_rabin_chunks(
- size_t min,
- size_t max,
- bufferlist& inputdata,
- vector<bufferlist> * out, int numbits)
+int RabinChunk::do_rabin_chunks(bufferlist & inputdata,
+ vector<pair<uint64_t, uint64_t>> & chunks,
+ uint64_t min_val = 0, uint64_t max_val = 0)
{
- char * ptr = inputdata.c_str(); // always points at the start to copy
+ // Splits inputdata into chunks and appends one (offset, length) pair per
+ // chunk to chunks; no data is copied.
+ // min_val / max_val: chunk size bounds; if either is 0 the bounds stored
+ // in this object (this->min / this->max) are used instead.
+ // Returns -ERANGE when the effective min is smaller than window_size,
+ // 0 otherwise. Input shorter than min yields a single chunk covering all
+ // of it, and the final chunk of a longer input may be shorter than min.
+ char *ptr = inputdata.c_str();
uint64_t data_size = inputdata.length();
- if(const_zero ==0 ) {
- const_zero = (char * ) malloc( sizeof(char) * (min+1));
- memset(const_zero,0,min+1);
+ uint64_t min, max;
+ min = min_val;
+ max = max_val;
+
+
+ if (min == 0 || max == 0) {
+ min = this->min;
+ max = this->max;
+ }
+
+ if (min < window_size) {
+ return -ERANGE;
}
- // Special Case, really small object that can't fit a chunk
- // or can't calculate rabin hash
- if (data_size < min || data_size < WINDOW_SIZE){
- bufferlist chunk;
- bufferptr bptr(min);
- chunk.push_back(std::move(bptr));
- chunk.copy_in(0, data_size, ptr);
- chunk.copy_in(data_size,min-data_size , const_zero);
- out->push_back(chunk);
- return;
+
+ if (data_size < min) {
+ chunks.push_back(make_pair(0, data_size));
+ return 0;
}
- uint64_t c_offset = 0; // points at where rabin hash starts calculating
- uint64_t c_size = 0; // size of currently calculating chunk
- uint64_t c_start = 0; // points to start of current chunk.
- bool start_chunk = true;
+ uint64_t c_offset = 0; // start of the window currently being hashed
+ uint64_t c_size = 0; // size of the chunk currently being built
+ uint64_t c_start = 0; // start offset of the chunk currently being built
uint64_t rabin_hash;
+ bool start_new_chunk = true;
+ bool store_chunk = false;
+
+
+ while (c_offset + window_size < data_size) { // if it is still possible to calculate rabin hash
- while (c_offset + WINDOW_SIZE < data_size) { // if it is still possible to calculate rabin hash
- assert(c_size <= max);
- if (start_chunk) {
+ if (start_new_chunk) {
rabin_hash = gen_rabin_hash(ptr, c_offset); // start calculating for a new chunk
- c_size = WINDOW_SIZE; // don't forget set c_size
- start_chunk = false;
+ c_size = window_size; // don't forget set c_size
+ start_new_chunk = false;
} else {
// use existing rabin to calculate a new rabin hash
// note c_offset already increased by 1
// old byte pointed by ptr + c_offset - 1
// new byte pointed by ptr + c_offset + WINDOW_SIZE -1;
- char new_byte = *(ptr + c_offset + WINDOW_SIZE-1);
- char old_byte = *(ptr + c_offset-1);
+ char new_byte = *(ptr + c_offset + window_size - 1);
+ char old_byte = *(ptr + c_offset - 1);
// TODO modulus POW_47 is too large a constant in c++ even for 64 bit unsinged int
- rabin_hash = (rabin_hash * RABIN_PRIME + new_byte - old_byte * POW_47) % (MOD_PRIME);
+ rabin_hash = (rabin_hash * rabin_prime + new_byte - old_byte * pow) % (mod_prime);
}
-
/*
Case 1 : Fingerprint Found
subcase 1 : if c_size < min -> ignore
subcase 3 : if c_size == max -> (force) store
*/
- if (end_of_chunk(rabin_hash,numbits)) {
+ if (end_of_chunk(rabin_hash, num_bits)) {
if((c_size >= min && c_size <= max)) { // a valid chunk with rabin
-
- bufferlist chunk;
- bufferptr bptr(c_size);
- chunk.push_back(std::move(bptr));
- chunk.copy_in(0, c_size, ptr+c_start);
-
- out->push_back(chunk);
- c_start += c_size;
- c_offset = c_start;
- start_chunk = true;
- continue;
+ store_chunk = true;
+ } else {
+ store_chunk = false;
}
} else {
if (c_size == max) {
- bufferlist chunk;
- bufferptr bptr(c_size);
- chunk.push_back(std::move(bptr));
- chunk.copy_in(0, c_size, ptr+c_start);
- out->push_back(chunk);
-
- c_start += c_size;
- c_offset = c_start;
-
- start_chunk = true;
- continue;
+ store_chunk = true;
+ } else {
+ store_chunk = false;
}
}
- c_size++;
- c_offset++;
- }
-
- /*
- Now c_offset + WINDOW_SIZE == data_size -> We can't compute rabinhash anymore
- Last chunk of data from c_offset to data_size - 1
- c_size = data_size - c_offset;
- */
-
- if (start_chunk) {
- // we need to calculate a new chunk, but there isn't enough bits to calculate rabin hash
-
- if (data_size -c_start < min) {
- bufferlist chunk;
- c_size = data_size - c_start;
- bufferptr bptr(min);
- chunk.push_back(std::move(bptr));
- chunk.copy_in(0,c_size,ptr+c_start);
- chunk.copy_in(c_size,min-c_size,const_zero);
- out->push_back(chunk);
+ if (store_chunk) {
+ chunks.push_back(make_pair(c_start, c_size));
+ c_start += c_size;
+ c_offset = c_start;
+ start_new_chunk = true;
+ continue;
}
- else if (c_start < data_size) { // if we still have data to copy
- bufferlist chunk;
- c_size = data_size - c_start;
- bufferptr bptr(c_size);
- chunk.push_back(std::move(bptr));
- chunk.copy_in(0, c_size, ptr+c_start);
- out->push_back(chunk);
- }
- } else {
- // we are in the process of calculating rabin hash, but don't have enough bits left to find a fingerprint
- if (data_size -c_start < min) {
- bufferlist chunk;
- c_size = data_size - c_start;
- bufferptr bptr(min);
- chunk.push_back(std::move(bptr));
- chunk.copy_in(0,c_size,ptr+c_start);
- chunk.copy_in(c_size,min-c_size,const_zero);
- out->push_back(chunk);
+ c_size++;
+ c_offset++;
+ }
- } else {
- bufferlist chunk;
- c_size = data_size - c_start;
- bufferptr bptr(c_size);
- chunk.push_back(std::move(bptr));
- chunk.copy_in(0, c_size, ptr+c_start);
- out->push_back(chunk);
- }
+ // Whatever is left after the last stored chunk becomes the final chunk.
+ if (c_start < data_size) {
+ c_size = data_size - c_start;
+ chunks.push_back(make_pair(c_start, c_size));
}
-}
+ // The function is declared to return int: report success explicitly
+ // instead of flowing off the end (undefined behaviour).
+ return 0;
+}
/*
* Ceph - scalable distributed file system
*
- * Copyright (C) 2018 Yuan-Ting Hsieh, Hsuan-Heng Wu
+ * Authors : Yuan-Ting Hsieh, Hsuan-Heng Wu, Myoungwon Oh
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
#ifndef CEPH_COMMON_RABIN_H_
#define CEPH_COMMON_RABIN_H_
-#define WINDOW_SIZE 48
-#define RABIN_PRIME 3
-#define RABIN_MASK ((1<<5) -1)
-#define MOD_PRIME 6148914691236517051
-#define POW_47 907234050803559263
+/*
+ * Holds the parameters of the rolling-hash chunker (window size, hash
+ * constants, boundary mask table, chunk size bounds) and splits a
+ * bufferlist into (offset, length) chunks with do_rabin_chunks().
+ */
+class RabinChunk {
+public:
+ RabinChunk(uint32_t window_size, uint32_t rabin_prime,
+ uint64_t mod_prime, uint64_t pow, vector<uint64_t> rabin_mask, uint64_t min,
+ uint64_t max, uint32_t num_bits):
+ window_size(window_size), rabin_prime(rabin_prime), mod_prime(mod_prime),
+ pow(pow), rabin_mask(rabin_mask), min(min), max(max), num_bits(num_bits) {}
+ // Default-constructed chunkers use the values set in
+ // default_init_rabin_options().
+ RabinChunk() {
+ default_init_rabin_options();
+ }
-/*
- * Given a pointer to data (start of data) and offset
- * returns a Rabin-fingerprint
- */
-uint64_t gen_rabin_hash(char* chunk_data, uint64_t off);
+ // Resets every parameter to its built-in default.
+ void default_init_rabin_options() {
+ vector<uint64_t> _rabin_mask = {0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
+ window_size = 48;
+ rabin_prime = 3;
+ mod_prime = 6148914691236517051;
+ pow = 907234050803559263; // rabin_prime^(window_size - 1) mod mod_prime, i.e. 3^47 mod mod_prime
+ min = 8000;
+ max = 16000;
+ num_bits = 3;
+ rabin_mask = _rabin_mask;
+ }
-/*
- * Given a Rabin-fingerprint, determines if it is
- * end of chunk
- */
-bool end_of_chunk(const uint64_t fp);
+ // Splits inputdata and appends one (offset, length) pair per chunk to
+ // chunks. min / max bound the chunk size.
+ int do_rabin_chunks(bufferlist & inputdata,
+ vector<pair<uint64_t, uint64_t>> & chunks,
+ uint64_t min, uint64_t max);
+ // Given a pointer to data (start of data) and an offset, returns the
+ // hash of the window starting at that offset.
+ uint64_t gen_rabin_hash(char* chunk_data, uint64_t off);
+ // Given a hash value, tells whether it marks the end of a chunk for the
+ // mask selected by numbits.
+ bool end_of_chunk(const uint64_t fp , int numbits);
+ void set_window_size(uint32_t size) { window_size = size; }
+ void set_rabin_prime(uint32_t r_prime) { rabin_prime = r_prime; }
+ void set_pow(uint64_t p) { pow = p; }
+ // Takes the table by const reference so that const vectors and
+ // temporaries are accepted; the table is copied into the object.
+ void set_rabin_mask(const vector<uint64_t> & mask) { rabin_mask = mask; }
+ void set_numbits(uint32_t bit) { num_bits = bit; }
-bool end_of_chunk(const uint64_t fp, int numbits);
+private:
-/*
- * Given a bufferlist of inputdata, use Rabin-fingerprint to
- * chunk it and return the chunked result
- *
- */
-void get_rabin_chunks(
- size_t min,
- size_t max,
- bufferlist& inputdata,
- vector<bufferlist> * output_chunks);
-
-void get_rabin_chunks(
- size_t min,
- size_t max,
- bufferlist& inputdata,
- vector<bufferlist> * output_chunks, int numbits);
+ uint32_t window_size; // number of bytes covered by the hash window
+ uint32_t rabin_prime; // multiplier of the polynomial hash
+ uint64_t mod_prime; // modulus of the polynomial hash
+ uint64_t pow; // factor applied to the byte leaving the window
+ vector<uint64_t> rabin_mask; // boundary masks, indexed by num_bits
+ uint64_t min; // default minimum chunk size
+ uint64_t max; // default maximum chunk size
+ uint32_t num_bits; // index into rabin_mask used by do_rabin_chunks
+};
#endif // CEPH_COMMON_RABIN_H_
#include "common/rabin.h"
#include "gtest/gtest.h"
-TEST(Rabin, rabin_hash_zero) {
- uint64_t expected;
- uint64_t result;
- //char data[] = "q";
- //expected = 0;
- //result = gen_rabin_hash(data, 0);
- //EXPECT_EQ(expected, result);
-
- char zero_data[1024];
- memset(zero_data, 0, 1024);
- expected = 0;
- result = gen_rabin_hash(zero_data, 0);
- ASSERT_EQ(expected, result);
- ASSERT_EQ(true, end_of_chunk(result));
-}
-
TEST(Rabin, rabin_hash_simple) {
uint64_t expected = 680425538102669423;
uint64_t result;
- unsigned int window_size = 48;
+ // const, so that the array below has a constant bound instead of relying
+ // on the variable-length-array compiler extension
+ const unsigned int window_size = 48;
char data[window_size + 1];
+ RabinChunk rabin;
memset(data, 0, window_size + 1);
for (unsigned int i = 0; i < window_size; ++i) {
data[i] = i;
}
- result = gen_rabin_hash(data, 0);
+ // hash of the first window (offset 0) of the bytes 0, 1, ..., 47
+ result = rabin.gen_rabin_hash(data, 0);
ASSERT_EQ(expected, result);
}
-TEST(Rabin, chunk_file_less_than_min) {
- // just put a small file
- const char *fname = "rabin_chunk_testfile";
- ::unlink(fname);
- int fd = ::open(fname, O_RDWR|O_CREAT|O_TRUNC, 0600);
- ASSERT_NE(fd, -1);
+TEST(Rabin, chunk_check_min_max) {
const char buf[] = "0123456789";
- for (int i = 0; i < 1; i++) {
- ASSERT_EQ((ssize_t)sizeof(buf), write(fd, buf, sizeof(buf)));
- }
- ::close(fd);
- std::string error;
bufferlist bl;
- int err = bl.read_file(fname, &error);
- ASSERT_GE(err, 0);
+ RabinChunk rabin;
+ for (int i = 0; i < 250; i++) {
+ bl.append(buf);
+ }
- std::vector<bufferlist> out;
+ vector<pair<uint64_t, uint64_t>> chunks;
size_t min_chunk = 2000;
size_t max_chunk = 8000;
- get_rabin_chunks(min_chunk, max_chunk, bl, &out);
- for (size_t i = 0; i < out.size(); ++i) {
- // test if min <= chunk <= max
- uint64_t chunk_size = out[i].length();
- ASSERT_GE(chunk_size , min_chunk);
- ASSERT_LE(chunk_size , max_chunk);
- }
- ::unlink(fname);
+ rabin.do_rabin_chunks(bl, chunks, min_chunk, max_chunk);
+ // fail cleanly instead of indexing an empty vector
+ ASSERT_FALSE(chunks.empty());
+ // the first chunk must respect the requested bounds
+ uint64_t chunk_size = chunks[0].second;
+ ASSERT_GE(chunk_size , min_chunk);
+ ASSERT_LE(chunk_size , max_chunk);
}
-TEST(Rabin, chunk_binbash) {
- const char *fname = "/bin/bash";
- std::string error;
- bufferlist bl;
-
- int err = bl.read_file(fname, &error);
- ASSERT_GE(err, 0);
-
- std::vector<bufferlist> out;
- size_t min_chunk = 2000;
- size_t max_chunk = 8000;
- int hist_size = 5;
- int hist [hist_size] = {0};
- size_t range = (max_chunk - min_chunk) / hist_size;
- get_rabin_chunks(min_chunk, max_chunk, bl, &out, 5);
- for (size_t i = 0; i < out.size(); ++i) {
- // test if min <= chunk <= max
- uint64_t chunk_size = out[i].length();
- printf(" chunk has size %zu\n", chunk_size);
- ASSERT_GE(chunk_size , min_chunk);
- ASSERT_LE(chunk_size , max_chunk);
- int bucket = (chunk_size - min_chunk) / range;
- hist[bucket] += 1;
+TEST(Rabin, test_cdc) {
+ // cmp_bl is bl with one extra byte prepended; the test checks that the
+ // chunk boundaries of both inputs line up again (shifted by that byte).
+ const char *base_str = "123456789012345678901234567890123456789012345678";
+ bufferlist bl, cmp_bl;
+ for (int i = 0; i < 100; i++) {
+ bl.append(base_str);
}
- printf("min chunk %zu, max chunk %zu", min_chunk, max_chunk);
- printf("hist size %d, range %zu\n", hist_size, range);
- for (int i = 0; i < hist_size; ++i) {
- printf(" hist %d contains %d chunks\n", i, hist[i]);
+ cmp_bl.append('a');
+ for (int i = 0; i < 100; i++) {
+ cmp_bl.append(base_str);
}
+
+ RabinChunk rabin;
+ vector<pair<uint64_t, uint64_t>> chunks;
+ vector<pair<uint64_t, uint64_t>> cmp_chunks;
+ size_t min_chunk = 200;
+ size_t max_chunk = 800;
+ rabin.do_rabin_chunks(bl, chunks, min_chunk, max_chunk);
+ rabin.do_rabin_chunks(cmp_bl, cmp_chunks, min_chunk, max_chunk);
+ // fail cleanly instead of indexing past the end of either vector
+ ASSERT_GT(chunks.size(), 4u);
+ ASSERT_GT(cmp_chunks.size(), 4u);
+ // offset, len will be the same, except in the case of first offset
+ ASSERT_EQ(chunks[4].first + 1, cmp_chunks[4].first);
+ ASSERT_EQ(chunks[4].second, cmp_chunks[4].second);
}