From: myoungwon oh Date: Sat, 2 Mar 2019 07:27:53 +0000 (+0900) Subject: src/common: add rabin fingerprint class X-Git-Tag: v15.1.0~2835^2~6 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=c332dbcf5c2690c32e08319d4b24362af66cabc6;p=ceph.git src/common: add rabin fingerprint class Signed-off-by: Myoungwon Oh --- diff --git a/src/common/rabin.cc b/src/common/rabin.cc index 68ef6a1fb07..7a4c1c1e7b8 100644 --- a/src/common/rabin.cc +++ b/src/common/rabin.cc @@ -5,25 +5,17 @@ #include "rabin.h" #include -char * const_zero = 0; -uint64_t rabin_mask[] = {0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535}; - - -uint64_t gen_rabin_hash(char* chunk_data, uint64_t off) { +uint64_t RabinChunk::gen_rabin_hash(char* chunk_data, uint64_t off) { uint64_t roll_sum = 0; - for (uint64_t i = off; i < WINDOW_SIZE; i++) { + for (uint64_t i = off; i < window_size; i++) { char cur_byte = *(chunk_data + i); - roll_sum = (roll_sum * RABIN_PRIME + cur_byte ) % (MOD_PRIME) ; + roll_sum = (roll_sum * rabin_prime + cur_byte ) % (mod_prime) ; } return roll_sum; } -bool end_of_chunk(const uint64_t fp) { - return ((fp & RABIN_MASK) == 0) ; -} - -bool end_of_chunk(const uint64_t fp , int numbits) { +bool RabinChunk::end_of_chunk(const uint64_t fp , int numbits) { return ((fp & rabin_mask[numbits]) == 0) ; } @@ -39,56 +31,58 @@ bool end_of_chunk(const uint64_t fp , int numbits) { * output_chunks split by Rabin */ -void get_rabin_chunks( - size_t min, - size_t max, - bufferlist& inputdata, - vector * out, int numbits) +int RabinChunk::do_rabin_chunks(bufferlist & inputdata, + vector> & chunks, + uint64_t min_val = 0, uint64_t max_val = 0) { - char * ptr = inputdata.c_str(); // always points at the start to copy + char *ptr = inputdata.c_str(); uint64_t data_size = inputdata.length(); - if(const_zero ==0 ) { - const_zero = (char * ) malloc( sizeof(char) * (min+1)); - memset(const_zero,0,min+1); + uint64_t min, max; + min = min_val; + max = max_val; + + + if (min == 0 || max == 0) { + min = this->min; + max = this->max; + } + + if (min < window_size) { + return -ERANGE; } - // Special Case, really small object that can't fit a chunk - // or can't calculate rabin hash - if (data_size < min || data_size < WINDOW_SIZE){ - bufferlist chunk; - bufferptr bptr(min); - chunk.push_back(std::move(bptr)); - chunk.copy_in(0, data_size, ptr); - chunk.copy_in(data_size,min-data_size , const_zero); - out->push_back(chunk); - return; + + if (data_size < min) { + chunks.push_back(make_pair(0, data_size)); + return 0; } - uint64_t c_offset = 0; // points at where rabin hash starts calculating - uint64_t c_size = 0; // size of currently calculating chunk - uint64_t c_start = 0; // points to start of current chunk. - bool start_chunk = true; + uint64_t c_offset = 0; + uint64_t c_size = 0; + uint64_t c_start = 0; uint64_t rabin_hash; + bool start_new_chunk = true; + bool store_chunk = false; + + + while (c_offset + window_size < data_size) { // if it is still possible to calculate rabin hash - while (c_offset + WINDOW_SIZE < data_size) { // if it is still possible to calculate rabin hash - assert(c_size <= max); - if (start_chunk) { + if (start_new_chunk) { rabin_hash = gen_rabin_hash(ptr, c_offset); // start calculating for a new chunk - c_size = WINDOW_SIZE; // don't forget set c_size - start_chunk = false; + c_size = window_size; // don't forget set c_size + start_new_chunk = false; } else { // use existing rabin to calculate a new rabin hash // note c_offset already increased by 1 // old byte pointed by ptr + c_offset - 1 // new byte pointed by ptr + c_offset + WINDOW_SIZE -1; - char new_byte = *(ptr + c_offset + WINDOW_SIZE-1); - char old_byte = *(ptr + c_offset-1); + char new_byte = *(ptr + c_offset + window_size - 1); + char old_byte = *(ptr + c_offset - 1); // TODO modulus POW_47 is too large a constant in c++ even for 64 bit unsinged int - rabin_hash = (rabin_hash * RABIN_PRIME + new_byte - old_byte * POW_47) % (MOD_PRIME); + rabin_hash = (rabin_hash * rabin_prime + new_byte - old_byte * pow) % (mod_prime); } - /* Case 1 : Fingerprint Found subcase 1 : if c_size < min -> ignore @@ -100,87 +94,36 @@ void get_rabin_chunks( subcase 3 : if c_size == max -> (force) store */ - if (end_of_chunk(rabin_hash,numbits)) { + if (end_of_chunk(rabin_hash, num_bits)) { if((c_size >= min && c_size <= max)) { // a valid chunk with rabin - - bufferlist chunk; - bufferptr bptr(c_size); - chunk.push_back(std::move(bptr)); - chunk.copy_in(0, c_size, ptr+c_start); - - out->push_back(chunk); - c_start += c_size; - c_offset = c_start; - start_chunk = true; - continue; + store_chunk = true; + } else { + store_chunk = false; } } else { if (c_size == max) { - bufferlist chunk; - bufferptr bptr(c_size); - chunk.push_back(std::move(bptr)); - chunk.copy_in(0, c_size, ptr+c_start); - out->push_back(chunk); - - c_start += c_size; - c_offset = c_start; - - start_chunk = true; - continue; + store_chunk = true; + } else { + store_chunk = false; } } - c_size++; - c_offset++; - } - - /* - Now c_offset + WINDOW_SIZE == data_size -> We can't compute rabinhash anymore - Last chunk of data from c_offset to data_size - 1 - c_size = data_size - c_offset; - */ - - if (start_chunk) { - // we need to calculate a new chunk, but there isn't enough bits to calculate rabin hash - - if (data_size -c_start < min) { - bufferlist chunk; - c_size = data_size - c_start; - bufferptr bptr(min); - chunk.push_back(std::move(bptr)); - chunk.copy_in(0,c_size,ptr+c_start); - chunk.copy_in(c_size,min-c_size,const_zero); - out->push_back(chunk); + if (store_chunk) { + chunks.push_back(make_pair(c_start, c_size)); + c_start += c_size; + c_offset = c_start; + start_new_chunk = true; + continue; } - else if (c_start < data_size) { // if we still have data to copy - bufferlist chunk; - c_size = data_size - c_start; - bufferptr bptr(c_size); - chunk.push_back(std::move(bptr)); - chunk.copy_in(0, c_size, ptr+c_start); - out->push_back(chunk); - } - } else { - // we are in the process of calculating rabin hash, but don't have enough bits left to find a fingerprint - if (data_size -c_start < min) { - bufferlist chunk; - c_size = data_size - c_start; - bufferptr bptr(min); - chunk.push_back(std::move(bptr)); - chunk.copy_in(0,c_size,ptr+c_start); - chunk.copy_in(c_size,min-c_size,const_zero); - out->push_back(chunk); + c_size++; + c_offset++; + } - } else { - bufferlist chunk; - c_size = data_size - c_start; - bufferptr bptr(c_size); - chunk.push_back(std::move(bptr)); - chunk.copy_in(0, c_size, ptr+c_start); - out->push_back(chunk); - } + if (c_start < data_size) { + c_size = data_size - c_start; + chunks.push_back(make_pair(c_start, c_size)); } -} +} diff --git a/src/common/rabin.h b/src/common/rabin.h index 533bfde9140..ab109b2ae79 100644 --- a/src/common/rabin.h +++ b/src/common/rabin.h @@ -3,7 +3,7 @@ /* * Ceph - scalable distributed file system * - * Copyright (C) 2018 Yuan-Ting Hsieh, Hsuan-Heng Wu + * Authors : Yuan-Ting Hsieh, Hsuan-Heng Wu, Myoungwon Oh * * This is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -15,42 +15,51 @@ #ifndef CEPH_COMMON_RABIN_H_ #define CEPH_COMMON_RABIN_H_ -#define WINDOW_SIZE 48 -#define RABIN_PRIME 3 -#define RABIN_MASK ((1<<5) -1) -#define MOD_PRIME 6148914691236517051 -#define POW_47 907234050803559263 +class RabinChunk { +public: + RabinChunk(uint32_t window_size, uint32_t rabin_prime, + uint64_t mod_prime, uint64_t pow, vector rabin_mask, uint64_t min, + uint64_t max, uint32_t num_bits): + window_size(window_size), rabin_prime(rabin_prime), mod_prime(mod_prime), + pow(pow), rabin_mask(rabin_mask), min(min), max(max), num_bits(num_bits) {} + RabinChunk() { + default_init_rabin_options(); + } -/* - * Given a pointer to data (start of data) and offset - * returns a Rabin-fingerprint - */ -uint64_t gen_rabin_hash(char* chunk_data, uint64_t off); + void default_init_rabin_options() { + vector _rabin_mask = {0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535}; + window_size = 48; + rabin_prime = 3; + mod_prime = 6148914691236517051; + pow = 907234050803559263; // pow(prime, window_size) + min = 8000; + max = 16000; + num_bits = 3; + rabin_mask = _rabin_mask; + } -/* - * Given a Rabin-fingerprint, determines if it is - * end of chunk - */ -bool end_of_chunk(const uint64_t fp); + int do_rabin_chunks(bufferlist & inputdata, + vector> & chunks, + uint64_t min, uint64_t max); + uint64_t gen_rabin_hash(char* chunk_data, uint64_t off); + bool end_of_chunk(const uint64_t fp , int numbits); + void set_window_size(uint32_t size) { window_size = size; } + void set_rabin_prime(uint32_t r_prime) { rabin_prime = r_prime; } + void set_pow(uint64_t p) { pow = p; } + void set_rabin_mask(vector & mask) { rabin_mask = mask; } + void set_numbits(uint32_t bit) { num_bits = bit; } -bool end_of_chunk(const uint64_t fp, int numbits); +private: -/* - * Given a bufferlist of inputdata, use Rabin-fingerprint to - * chunk it and return the chunked result - * - */ -void get_rabin_chunks( - size_t min, - size_t max, - bufferlist& inputdata, - vector * output_chunks); - -void get_rabin_chunks( - size_t min, - size_t max, - bufferlist& inputdata, - vector * output_chunks, int numbits); + uint32_t window_size; + uint32_t rabin_prime; + uint64_t mod_prime; + uint64_t pow; + vector rabin_mask; + uint64_t min; + uint64_t max; + uint32_t num_bits; +}; #endif // CEPH_COMMON_RABIN_H_ diff --git a/src/test/common/test_rabin_chunk.cc b/src/test/common/test_rabin_chunk.cc index 20a85cad422..78ec3376647 100644 --- a/src/test/common/test_rabin_chunk.cc +++ b/src/test/common/test_rabin_chunk.cc @@ -10,95 +10,60 @@ #include "common/rabin.h" #include "gtest/gtest.h" -TEST(Rabin, rabin_hash_zero) { - uint64_t expected; - uint64_t result; - //char data[] = "q"; - //expected = 0; - //result = gen_rabin_hash(data, 0); - //EXPECT_EQ(expected, result); - - char zero_data[1024]; - memset(zero_data, 0, 1024); - expected = 0; - result = gen_rabin_hash(zero_data, 0); - ASSERT_EQ(expected, result); - ASSERT_EQ(true, end_of_chunk(result)); -} - TEST(Rabin, rabin_hash_simple) { uint64_t expected = 680425538102669423; uint64_t result; unsigned int window_size = 48; char data[window_size + 1]; + RabinChunk rabin; memset(data, 0, window_size + 1); for (unsigned int i = 0; i < window_size; ++i) { data[i] = i; } - result = gen_rabin_hash(data, 0); + result = rabin.gen_rabin_hash(data, 0); ASSERT_EQ(expected, result); } -TEST(Rabin, chunk_file_less_than_min) { - // just put a small file - const char *fname = "rabin_chunk_testfile"; - ::unlink(fname); - int fd = ::open(fname, O_RDWR|O_CREAT|O_TRUNC, 0600); - ASSERT_NE(fd, -1); +TEST(Rabin, chunk_check_min_max) { const char buf[] = "0123456789"; - for (int i = 0; i < 1; i++) { - ASSERT_EQ((ssize_t)sizeof(buf), write(fd, buf, sizeof(buf))); - } - ::close(fd); - std::string error; bufferlist bl; - int err = bl.read_file(fname, &error); - ASSERT_GE(err, 0); + RabinChunk rabin; + for (int i = 0; i < 250; i++) { + bl.append(buf); + } - std::vector out; + vector> chunks; size_t min_chunk = 2000; size_t max_chunk = 8000; - get_rabin_chunks(min_chunk, max_chunk, bl, &out); - for (size_t i = 0; i < out.size(); ++i) { - // test if min <= chunk <= max - uint64_t chunk_size = out[i].length(); - ASSERT_GE(chunk_size , min_chunk); - ASSERT_LE(chunk_size , max_chunk); - } - ::unlink(fname); + rabin.do_rabin_chunks(bl, chunks, min_chunk, max_chunk); + uint64_t chunk_size = chunks[0].second; + ASSERT_GE(chunk_size , min_chunk); + ASSERT_LE(chunk_size , max_chunk); } -TEST(Rabin, chunk_binbash) { - const char *fname = "/bin/bash"; - std::string error; - bufferlist bl; - - int err = bl.read_file(fname, &error); - ASSERT_GE(err, 0); - - std::vector out; - size_t min_chunk = 2000; - size_t max_chunk = 8000; - int hist_size = 5; - int hist [hist_size] = {0}; - size_t range = (max_chunk - min_chunk) / hist_size; - get_rabin_chunks(min_chunk, max_chunk, bl, &out, 5); - for (size_t i = 0; i < out.size(); ++i) { - // test if min <= chunk <= max - uint64_t chunk_size = out[i].length(); - printf(" chunk has size %zu\n", chunk_size); - ASSERT_GE(chunk_size , min_chunk); - ASSERT_LE(chunk_size , max_chunk); - int bucket = (chunk_size - min_chunk) / range; - hist[bucket] += 1; +TEST(Rabin, test_cdc) { + const char *base_str = "123456789012345678901234567890123456789012345678"; + bufferlist bl, cmp_bl; + for (int i = 0; i < 100; i++) { + bl.append(base_str); } - printf("min chunk %zu, max chunk %zu", min_chunk, max_chunk); - printf("hist size %d, range %zu\n", hist_size, range); - for (int i = 0; i < hist_size; ++i) { - printf(" hist %d contains %d chunks\n", i, hist[i]); + cmp_bl.append('a'); + for (int i = 0; i < 100; i++) { + cmp_bl.append(base_str); } + + RabinChunk rabin; + vector> chunks; + vector> cmp_chunks; + size_t min_chunk = 200; + size_t max_chunk = 800; + rabin.do_rabin_chunks(bl, chunks, min_chunk, max_chunk); + rabin.do_rabin_chunks(cmp_bl, cmp_chunks, min_chunk, max_chunk); + // offset, len will be the same, except in the case of first offset + ASSERT_EQ(chunks[4].first + 1, cmp_chunks[4].first); + ASSERT_EQ(chunks[4].second, cmp_chunks[4].second); }