From: Hsuan-Heng, Wu
Date: Tue, 18 Dec 2018 09:47:51 +0000 (+0000)
Subject: Initail work for rabin fingerprint
X-Git-Tag: v15.1.0~2835^2~7
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=4a5ca610caf12dd9dfa033ddf4d9827854753b2a;p=ceph.git

Initail work for rabin fingerprint

Signed-off-by: Hsuan-Heng, Wu
Signed-off-by: Myoungwon Oh
---

diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 65ba10b0f14..4c7f8a987b8 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -80,6 +80,7 @@ set(common_srcs
   perf_counters_collection.cc
   perf_histogram.cc
   pick_address.cc
+  rabin.cc
   reverse.c
   run_cmd.cc
   scrub_types.cc
diff --git a/src/common/rabin.cc b/src/common/rabin.cc
new file mode 100644
index 00000000000..68ef6a1fb07
--- /dev/null
+++ b/src/common/rabin.cc
@@ -0,0 +1,198 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/types.h"
+#include "rabin.h"
+#include <string.h>
+
+namespace {
+
+const uint64_t rabin_mod = MOD_PRIME;
+
+// One hash step computes (hash * RABIN_PRIME + byte) in 64 bits before
+// reducing, so the modulus must leave room for that intermediate value.
+static_assert(static_cast<uint64_t>(MOD_PRIME) <=
+              (~static_cast<uint64_t>(0) - 255) / RABIN_PRIME,
+              "rabin hash step would overflow uint64_t");
+
+// Bytes are hashed as values 0..255 so the result does not depend on
+// whether plain char is signed on the target platform.
+inline uint64_t byte_value(char c) {
+  return static_cast<unsigned char>(c);
+}
+
+// term[b] == (b * POW_47) % MOD_PRIME for every byte value b.  The table is
+// filled by repeated modular addition because b * POW_47 itself does not
+// fit in 64 bits.
+struct outgoing_terms_t {
+  uint64_t term[256];
+  outgoing_terms_t() {
+    term[0] = 0;
+    for (unsigned b = 1; b < 256; ++b) {
+      term[b] = (term[b - 1] + POW_47) % rabin_mod;
+    }
+  }
+};
+
+const outgoing_terms_t& outgoing_terms() {
+  static const outgoing_terms_t table;
+  return table;
+}
+
+// Slide the window one byte to the right: drop old_byte (the first byte of
+// the previous window) and append new_byte.
+uint64_t roll_rabin_hash(uint64_t hash, char old_byte, char new_byte) {
+  const uint64_t gone = outgoing_terms().term[byte_value(old_byte)];
+  // add the modulus first so the subtraction cannot go below zero
+  hash = (hash + rabin_mod - gone) % rabin_mod;
+  return (hash * RABIN_PRIME + byte_value(new_byte)) % rabin_mod;
+}
+
+// Mask selecting the low numbits bits, defined for any int numbits.
+uint64_t low_bits_mask(int numbits) {
+  if (numbits <= 0) {
+    return 0;
+  }
+  if (numbits >= 64) {
+    return ~static_cast<uint64_t>(0);
+  }
+  return (static_cast<uint64_t>(1) << numbits) - 1;
+}
+
+// Append one chunk of padded_len bytes to out: the first len bytes are
+// copied from src, the remainder (if any) is zero.
+void push_chunk(vector<bufferlist> * out, const char * src,
+                uint64_t len, uint64_t padded_len) {
+  bufferptr bptr(padded_len);
+  if (padded_len > len) {
+    bptr.zero();
+  }
+  bufferlist chunk;
+  chunk.push_back(std::move(bptr));
+  if (len > 0) {
+    chunk.copy_in(0, len, src);
+  }
+  out->push_back(chunk);
+}
+
+} // anonymous namespace
+
+/*
+ * Hash the WINDOW_SIZE bytes starting at chunk_data + off.
+ * The caller must make sure that many bytes are readable.
+ */
+uint64_t gen_rabin_hash(char* chunk_data, uint64_t off) {
+  uint64_t roll_sum = 0;
+  for (uint64_t i = 0; i < WINDOW_SIZE; i++) {
+    const uint64_t cur_byte = byte_value(chunk_data[off + i]);
+    roll_sum = (roll_sum * RABIN_PRIME + cur_byte) % rabin_mod;
+  }
+  return roll_sum;
+}
+
+bool end_of_chunk(const uint64_t fp) {
+  return ((fp & RABIN_MASK) == 0) ;
+}
+
+/*
+ * A position ends a chunk when the low numbits bits of fp are all zero.
+ */
+bool end_of_chunk(const uint64_t fp , int numbits) {
+  return ((fp & low_bits_mask(numbits)) == 0) ;
+}
+
+/*
+ * Given a bufferlist of inputdata, use Rabin-fingerprint to
+ * chunk it and return the chunked result
+ *
+ * Arguments:
+ *   min:     min data chunk size
+ *   max:     max data chunk size; must be >= min and >= WINDOW_SIZE
+ *            whenever inputdata is large enough to be scanned
+ *   numbits: number of low fingerprint bits that must be zero at a
+ *            chunk boundary
+ *
+ * Returns:
+ *   output_chunks split by Rabin, appended to *out.  Input shorter than
+ *   min, and a final remainder shorter than min, are zero-padded up to
+ *   min bytes.
+ */
+
+void get_rabin_chunks(
+  size_t min,
+  size_t max,
+  bufferlist& inputdata,
+  vector<bufferlist> * out, int numbits)
+{
+  char * ptr = inputdata.c_str(); // always points at the start to copy
+  const uint64_t data_size = inputdata.length();
+
+  // Special Case, really small object that can't fit a chunk
+  // or can't calculate rabin hash
+  if (data_size < min || data_size < WINDOW_SIZE) {
+    // pad only when the data is shorter than min
+    push_chunk(out, ptr, data_size, data_size < min ? min : data_size);
+    return;
+  }
+
+  assert(min <= max);
+  assert(max >= WINDOW_SIZE);
+
+  uint64_t c_offset = 0; // first byte of the hashed window
+  uint64_t c_size = 0;   // size of the chunk being grown
+  uint64_t c_start = 0;  // first byte of the chunk being grown
+  bool start_chunk = true;
+  uint64_t rabin_hash = 0;
+
+  // once the hash below is (re)computed the window ends at the tentative
+  // end of the chunk: c_offset + WINDOW_SIZE == c_start + c_size
+  while (c_offset + WINDOW_SIZE < data_size) {
+    if (start_chunk) {
+      rabin_hash = gen_rabin_hash(ptr, c_offset); // first window of a new chunk
+      c_size = WINDOW_SIZE;
+      start_chunk = false;
+    } else {
+      // c_offset already moved forward by one, so the byte that left the
+      // window is at c_offset - 1 and the one that entered is the last
+      // byte of the new window
+      rabin_hash = roll_rabin_hash(rabin_hash, ptr[c_offset - 1],
+                                   ptr[c_offset + WINDOW_SIZE - 1]);
+    }
+    assert(c_size <= max);
+
+    // cut at a fingerprint once the chunk has reached min, or
+    // unconditionally when it has grown to max
+    const bool fingerprint = end_of_chunk(rabin_hash, numbits);
+    if ((fingerprint && c_size >= min) || c_size == max) {
+      push_chunk(out, ptr + c_start, c_size, c_size);
+      c_start += c_size;
+      c_offset = c_start;
+      start_chunk = true;
+      continue;
+    }
+    c_size++;
+    c_offset++;
+  }
+
+  // Not enough bytes left to move the window any further: whatever
+  // follows c_start becomes the last chunk.
+  const uint64_t remaining = data_size - c_start;
+  if (remaining > 0) {
+    push_chunk(out, ptr + c_start, remaining,
+               remaining < min ? min : remaining);
+  }
+}
+
+/*
+ * Same as above with the boundary condition of the one-argument
+ * end_of_chunk(): RABIN_MASK, i.e. the low 5 bits of the fingerprint.
+ */
+void get_rabin_chunks(
+  size_t min,
+  size_t max,
+  bufferlist& inputdata,
+  vector<bufferlist> * out)
+{
+  get_rabin_chunks(min, max, inputdata, out, 5);
+}
+
diff --git a/src/common/rabin.h b/src/common/rabin.h
new
file mode 100644
index 00000000000..533bfde9140
--- /dev/null
+++ b/src/common/rabin.h
@@ -0,0 +1,72 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Yuan-Ting Hsieh, Hsuan-Heng Wu
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_RABIN_H_
+#define CEPH_COMMON_RABIN_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "include/types.h"
+#include "include/buffer.h"
+
+// number of bytes covered by one fingerprint window
+#define WINDOW_SIZE 48
+// multiplier of the polynomial hash
+#define RABIN_PRIME 3
+// default boundary test: the low 5 bits of the fingerprint
+#define RABIN_MASK ((1<<5) -1)
+// modulus of the polynomial hash
+#define MOD_PRIME 6148914691236517051
+// RABIN_PRIME^(WINDOW_SIZE - 1) modulo MOD_PRIME
+#define POW_47 907234050803559263
+
+/*
+ * Given a pointer to data (start of data) and offset
+ * returns a Rabin-fingerprint
+ */
+uint64_t gen_rabin_hash(char* chunk_data, uint64_t off);
+
+/*
+ * Given a Rabin-fingerprint, determines if it is
+ * end of chunk
+ */
+bool end_of_chunk(const uint64_t fp);
+
+/*
+ * Same test, but against the low numbits bits of the fingerprint
+ */
+bool end_of_chunk(const uint64_t fp, int numbits);
+
+/*
+ * Given a bufferlist of inputdata, use Rabin-fingerprint to
+ * chunk it and return the chunked result
+ *
+ */
+void get_rabin_chunks(
+  size_t min,
+  size_t max,
+  bufferlist& inputdata,
+  std::vector<bufferlist> * output_chunks);
+
+void get_rabin_chunks(
+  size_t min,
+  size_t max,
+  bufferlist& inputdata,
+  std::vector<bufferlist> * output_chunks, int numbits);
+
+
+#endif // CEPH_COMMON_RABIN_H_
diff --git a/src/test/common/CMakeLists.txt b/src/test/common/CMakeLists.txt
index 71fb77cda91..3906920d9e9 100644
--- a/src/test/common/CMakeLists.txt
+++ b/src/test/common/CMakeLists.txt
@@ -320,3 +320,10 @@ target_link_libraries(unittest_async_completion Boost::system)
 add_executable(unittest_async_shared_mutex test_async_shared_mutex.cc)
 add_ceph_unittest(unittest_async_shared_mutex)
 target_link_libraries(unittest_async_shared_mutex ceph-common Boost::system)
+
+add_executable(unittest_rabin_chunk test_rabin_chunk.cc
+  $<TARGET_OBJECTS:unit-main>)
+target_link_libraries(unittest_rabin_chunk global ceph-common)
+add_ceph_unittest(unittest_rabin_chunk)
+
+
diff --git a/src/test/common/test_rabin_chunk.cc b/src/test/common/test_rabin_chunk.cc
new file mode 100644
index 00000000000..20a85cad422
--- /dev/null
+++ b/src/test/common/test_rabin_chunk.cc
@@ -0,0 +1,120 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <string>
+#include <vector>
+
+#include "include/types.h"
+#include "include/buffer.h"
+
+#include "common/rabin.h"
+#include "gtest/gtest.h"
+
+// An all-zero window must hash to zero, which is also a chunk boundary.
+TEST(Rabin, rabin_hash_zero) {
+  uint64_t expected;
+  uint64_t result;
+  //char data[] = "q";
+  //expected = 0;
+  //result = gen_rabin_hash(data, 0);
+  //EXPECT_EQ(expected, result);
+
+  char zero_data[1024];
+  memset(zero_data, 0, 1024);
+  expected = 0;
+  result = gen_rabin_hash(zero_data, 0);
+  ASSERT_EQ(expected, result);
+  ASSERT_EQ(true, end_of_chunk(result));
+}
+
+// Known-answer check for a window holding the byte values 0..47.
+TEST(Rabin, rabin_hash_simple) {
+  uint64_t expected = 680425538102669423;
+  uint64_t result;
+
+  // const so that data[] below is an ordinary array, not a VLA
+  const unsigned int window_size = 48;
+  char data[window_size + 1];
+  memset(data, 0, window_size + 1);
+  for (unsigned int i = 0; i < window_size; ++i) {
+    data[i] = i;
+  }
+  result = gen_rabin_hash(data, 0);
+  ASSERT_EQ(expected, result);
+}
+
+// Input far smaller than min_chunk must still come back within bounds.
+TEST(Rabin, chunk_file_less_than_min) {
+  // just put a small file
+  const char *fname = "rabin_chunk_testfile";
+  ::unlink(fname);
+  int fd = ::open(fname, O_RDWR|O_CREAT|O_TRUNC, 0600);
+  ASSERT_NE(fd, -1);
+  const char buf[] = "0123456789";
+  for (int i = 0; i < 1; i++) {
+    ASSERT_EQ((ssize_t)sizeof(buf), write(fd, buf, sizeof(buf)));
+  }
+  ::close(fd);
+
+  std::string error;
+  bufferlist bl;
+  int err = bl.read_file(fname, &error);
+  ASSERT_GE(err, 0);
+
+  std::vector<bufferlist> out;
+  size_t min_chunk = 2000;
+  size_t max_chunk = 8000;
+  get_rabin_chunks(min_chunk, max_chunk, bl, &out);
+  for (size_t i = 0; i < out.size(); ++i) {
+    // test if min <= chunk <= max
+    uint64_t chunk_size = out[i].length();
+    ASSERT_GE(chunk_size , min_chunk);
+    ASSERT_LE(chunk_size , max_chunk);
+  }
+
+  ::unlink(fname);
+}
+
+// Chunk a real binary and print how the chunk sizes are distributed.
+TEST(Rabin, chunk_binbash) {
+  const char *fname = "/bin/bash";
+  std::string error;
+  bufferlist bl;
+
+  int err = bl.read_file(fname, &error);
+  ASSERT_GE(err, 0);
+
+  std::vector<bufferlist> out;
+  size_t min_chunk = 2000;
+  size_t max_chunk = 8000;
+  // const so that hist[] is an ordinary array, not a VLA
+  const int hist_size = 5;
+  int hist [hist_size] = {0};
+  size_t range = (max_chunk - min_chunk) / hist_size;
+  get_rabin_chunks(min_chunk, max_chunk, bl, &out, 5);
+  for (size_t i = 0; i < out.size(); ++i) {
+    // test if min <= chunk <= max
+    size_t chunk_size = out[i].length();
+    printf(" chunk has size %zu\n", chunk_size);
+    ASSERT_GE(chunk_size , min_chunk);
+    ASSERT_LE(chunk_size , max_chunk);
+    int bucket = (chunk_size - min_chunk) / range;
+    // a chunk of exactly max_chunk bytes would otherwise land one past
+    // the last bucket
+    if (bucket >= hist_size) {
+      bucket = hist_size - 1;
+    }
+    hist[bucket] += 1;
+  }
+  printf("min chunk %zu, max chunk %zu", min_chunk, max_chunk);
+  printf("hist size %d, range %zu\n", hist_size, range);
+  for (int i = 0; i < hist_size; ++i) {
+    printf(" hist %d contains %d chunks\n", i, hist[i]);
+  }
+}
+