perf_counters_collection.cc
perf_histogram.cc
pick_address.cc
+ rabin.cc
reverse.c
run_cmd.cc
scrub_types.cc
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/types.h"
+#include "rabin.h"
+#include <string.h>
+
+// Pointer to a block of zero bytes that chunking code may use as a
+// padding source; it is null until a buffer has been allocated for it.
+char *const_zero = nullptr;
+
+// rabin_mask[n] has the low n bits set, for n = 0 .. 16.
+uint64_t rabin_mask[] = {
+  0x0,
+  0x1, 0x3, 0x7, 0xf,
+  0x1f, 0x3f, 0x7f, 0xff,
+  0x1ff, 0x3ff, 0x7ff, 0xfff,
+  0x1fff, 0x3fff, 0x7fff, 0xffff
+};
+
+
+/*
+ * Compute the polynomial hash (Rabin fingerprint) of one window of data.
+ *
+ * Arguments:
+ *   chunk_data: start of the data buffer
+ *   off:        byte offset of the first byte of the window
+ *
+ * The window is the WINDOW_SIZE bytes chunk_data[off .. off + WINDOW_SIZE - 1];
+ * the caller must make sure all of them are readable.
+ *
+ * Returns:
+ *   the hash, reduced modulo MOD_PRIME
+ */
+uint64_t gen_rabin_hash(char* chunk_data, uint64_t off) {
+  // Index relative to the window start so that exactly WINDOW_SIZE bytes
+  // are hashed for every offset, not only for off == 0.
+  const char *window = chunk_data + off;
+  uint64_t roll_sum = 0;
+  for (uint64_t i = 0; i < WINDOW_SIZE; i++) {
+    // Bytes are added as plain char, the same way the rolling update in
+    // get_rabin_chunks() adds and removes them.
+    const char cur_byte = window[i];
+    roll_sum = (roll_sum * RABIN_PRIME + cur_byte) % (MOD_PRIME);
+  }
+  return roll_sum;
+}
+
+// True when none of the fingerprint bits selected by RABIN_MASK is set,
+// i.e. the fingerprint marks a chunk boundary under the default mask.
+bool end_of_chunk(const uint64_t fp) {
+  const uint64_t masked = fp & RABIN_MASK;
+  return masked == 0;
+}
+
+/*
+ * Decide whether a fingerprint marks a chunk boundary.
+ *
+ * Arguments:
+ *   fp:      the fingerprint to test
+ *   numbits: how many of the low bits of fp must be zero
+ *
+ * The mask is computed directly instead of being read from a fixed-size
+ * table, so any numbits value is safe: values <= 0 select no bits (every
+ * fingerprint matches) and values >= 64 select all 64 bits.
+ *
+ * Returns:
+ *   true when the selected low bits of fp are all zero
+ */
+bool end_of_chunk(const uint64_t fp , int numbits) {
+  if (numbits <= 0) {
+    return true;      // empty mask
+  }
+  if (numbits >= 64) {
+    return fp == 0;   // a 64-bit shift would be undefined, so handle it here
+  }
+  const uint64_t mask = (static_cast<uint64_t>(1) << numbits) - 1;
+  return (fp & mask) == 0;
+}
+
+/*
+ * Append one chunk holding len bytes copied from src to *out.
+ * When len is smaller than min_len the chunk is min_len bytes long and
+ * the bytes after the copied data are zero.
+ */
+static void append_rabin_chunk(vector<bufferlist> *out, const char *src,
+                               uint64_t len, uint64_t min_len)
+{
+  const uint64_t alloc_len = (len < min_len) ? min_len : len;
+  bufferptr bptr(alloc_len);
+  if (alloc_len > len) {
+    // zero the padding in place rather than copying it from a shared buffer
+    bptr.zero(len, alloc_len - len);
+  }
+  bufferlist chunk;
+  chunk.push_back(std::move(bptr));
+  if (len > 0) {
+    chunk.copy_in(0, len, src);
+  }
+  out->push_back(chunk);
+}
+
+/*
+ * Given a bufferlist of inputdata, use a rolling Rabin fingerprint to
+ * chunk it and append the chunks to *out in input order.
+ *
+ * A chunk ends where end_of_chunk() accepts the fingerprint of the last
+ * WINDOW_SIZE bytes and the chunk is between min and max bytes long; a
+ * chunk that reaches max bytes without such a fingerprint is cut there
+ * anyway.  Input shorter than min or WINDOW_SIZE becomes a single chunk,
+ * and that chunk (or a trailing remainder) shorter than min is
+ * zero-padded up to min bytes.
+ *
+ * Arguments:
+ *   min:       min data chunk size
+ *   max:       max data chunk size; the loop asserts that the running
+ *              chunk size never exceeds it, so it is expected to be at
+ *              least WINDOW_SIZE
+ *   inputdata: data to chunk, read through inputdata.c_str()
+ *   out:       receives the chunks
+ *   numbits:   number of low fingerprint bits that must be zero at a
+ *              chunk boundary
+ *
+ * Returns:
+ *   nothing; output chunks split by Rabin are appended to *out
+ */
+void get_rabin_chunks(
+  size_t min,
+  size_t max,
+  bufferlist& inputdata,
+  vector<bufferlist> * out, int numbits)
+{
+  char *ptr = inputdata.c_str();
+  const uint64_t data_size = inputdata.length();
+
+  // Special case: the object is too small to fill a chunk or to compute a
+  // rabin hash.  The chunk is sized to hold all of the data even when
+  // min is smaller than data_size.
+  if (data_size < min || data_size < WINDOW_SIZE) {
+    append_rabin_chunk(out, ptr, data_size, min);
+    return;
+  }
+
+  uint64_t c_offset = 0;   // first byte of the window being hashed
+  uint64_t c_size = 0;     // size of the chunk being built
+  uint64_t c_start = 0;    // first byte of the chunk being built
+  bool start_chunk = true; // next iteration starts a fresh window
+  uint64_t rabin_hash = 0;
+
+  // loop while the window ends before the last byte of the data
+  while (c_offset + WINDOW_SIZE < data_size) {
+    assert(c_size <= max);
+    if (start_chunk) {
+      // new chunk: hash its first full window from scratch
+      rabin_hash = gen_rabin_hash(ptr, c_offset);
+      c_size = WINDOW_SIZE;
+      start_chunk = false;
+    } else {
+      // c_offset has already advanced by one: the byte that left the
+      // window is ptr[c_offset - 1], the byte that entered it is
+      // ptr[c_offset + WINDOW_SIZE - 1]
+      const char new_byte = ptr[c_offset + WINDOW_SIZE - 1];
+      const char old_byte = ptr[c_offset - 1];
+
+      // The product is formed in uint64_t so that it wraps modulo 2^64
+      // instead of overflowing a signed type.
+      // TODO: POW_47 is too large for this to be exact modular
+      // arithmetic even with 64-bit unsigned integers.
+      rabin_hash = (rabin_hash * RABIN_PRIME + new_byte
+                    - static_cast<uint64_t>(old_byte) * POW_47) % (MOD_PRIME);
+    }
+
+    /*
+      Fingerprint found:     cut only if min <= c_size <= max
+      Fingerprint not found: cut only if c_size == max (forced)
+    */
+    const bool fingerprint = end_of_chunk(rabin_hash, numbits);
+    const bool cut = fingerprint ? (c_size >= min && c_size <= max)
+                                 : (c_size == max);
+    if (cut) {
+      append_rabin_chunk(out, ptr + c_start, c_size, 0);
+      c_start += c_size;
+      c_offset = c_start;
+      start_chunk = true;
+      continue;
+    }
+    c_size++;
+    c_offset++;
+  }
+
+  // No further fingerprint is computed: everything from c_start to the
+  // end of the data forms the last chunk, padded to min if it is shorter.
+  const uint64_t tail = data_size - c_start;
+  if (tail > 0) {
+    append_rabin_chunk(out, ptr + c_start, tail, min);
+  }
+}
+
+/*
+ * Overload without numbits, as declared in rabin.h.  It uses 5 bits,
+ * which matches RABIN_MASK ((1<<5) - 1), the default mask tested by
+ * end_of_chunk(fp).
+ */
+void get_rabin_chunks(
+  size_t min,
+  size_t max,
+  bufferlist& inputdata,
+  vector<bufferlist> * out)
+{
+  get_rabin_chunks(min, max, inputdata, out, 5);
+}
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Yuan-Ting Hsieh, Hsuan-Heng Wu
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_RABIN_H_
+#define CEPH_COMMON_RABIN_H_
+
+// Constants used by the Rabin fingerprint chunking code.
+// Number of bytes in the window that gets fingerprinted.
+#define WINDOW_SIZE 48
+// Multiplier applied to the running hash before each byte is added.
+#define RABIN_PRIME 3
+// Default boundary mask: the low 5 bits of the fingerprint.
+#define RABIN_MASK ((1<<5) -1)
+// Modulus the hash is reduced by after every step.
+#define MOD_PRIME 6148914691236517051
+// Factor multiplied with the byte leaving the window in the rolling update;
+// presumably RABIN_PRIME^47 mod MOD_PRIME -- TODO confirm.
+#define POW_47 907234050803559263
+
+/*
+ * Compute the Rabin fingerprint for the data at chunk_data,
+ * starting from byte offset off.
+ */
+uint64_t gen_rabin_hash(char *chunk_data, uint64_t off);
+
+/*
+ * Tell whether the fingerprint fp marks the end of a chunk
+ * under the default mask RABIN_MASK.
+ */
+bool end_of_chunk(uint64_t fp);
+
+/*
+ * Same decision, but based on the low numbits bits of fp
+ * instead of the default mask.
+ */
+bool end_of_chunk(uint64_t fp, int numbits);
+
+/*
+ * Split inputdata into chunks with a Rabin fingerprint and append
+ * the chunks to *output_chunks.  min and max are the minimum and
+ * maximum data chunk sizes.
+ */
+void get_rabin_chunks(size_t min, size_t max, bufferlist& inputdata,
+                      vector<bufferlist> *output_chunks);
+
+/*
+ * As above; numbits is forwarded to end_of_chunk(fp, numbits) to
+ * decide where chunks end.
+ */
+void get_rabin_chunks(size_t min, size_t max, bufferlist& inputdata,
+                      vector<bufferlist> *output_chunks, int numbits);
+
+
+#endif // CEPH_COMMON_RABIN_H_
add_executable(unittest_async_shared_mutex test_async_shared_mutex.cc)
add_ceph_unittest(unittest_async_shared_mutex)
target_link_libraries(unittest_async_shared_mutex ceph-common Boost::system)
+
+# unittest_rabin_chunk: built from test_rabin_chunk.cc plus the unit-main
+# objects, linked against global and ceph-common, and registered through
+# add_ceph_unittest.
+add_executable(unittest_rabin_chunk test_rabin_chunk.cc
+ $<TARGET_OBJECTS:unit-main>)
+target_link_libraries(unittest_rabin_chunk global ceph-common)
+add_ceph_unittest(unittest_rabin_chunk)
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <vector>
+#include <cstring>
+
+#include "include/types.h"
+#include "include/buffer.h"
+
+#include "common/rabin.h"
+#include "gtest/gtest.h"
+
+TEST(Rabin, rabin_hash_zero) {
+  // An all-zero buffer must hash to zero, and a zero fingerprint must be
+  // accepted as a chunk boundary by the default mask.
+  // (A one-byte input case was sketched here earlier but is disabled.)
+  char zero_data[1024];
+  memset(zero_data, 0, sizeof(zero_data));
+
+  const uint64_t expected = 0;
+  const uint64_t result = gen_rabin_hash(zero_data, 0);
+
+  ASSERT_EQ(expected, result);
+  ASSERT_EQ(true, end_of_chunk(result));
+}
+
+TEST(Rabin, rabin_hash_simple) {
+  // Fingerprint of the byte sequence 0, 1, ..., 47 starting at offset 0.
+  const uint64_t expected = 680425538102669423;
+
+  // A compile-time constant keeps data[] a standard fixed-size array
+  // rather than a variable-length array.
+  constexpr unsigned int window_size = 48;
+  char data[window_size + 1];
+  memset(data, 0, sizeof(data));
+  for (unsigned int i = 0; i < window_size; ++i) {
+    data[i] = i;
+  }
+
+  const uint64_t result = gen_rabin_hash(data, 0);
+  ASSERT_EQ(expected, result);
+}
+
+TEST(Rabin, chunk_file_less_than_min) {
+  // Chunk a file that is much smaller than min_chunk and check that it
+  // comes back as exactly one chunk, zero-padded up to min_chunk bytes.
+  const char *fname = "rabin_chunk_testfile";
+  ::unlink(fname);
+  int fd = ::open(fname, O_RDWR|O_CREAT|O_TRUNC, 0600);
+  ASSERT_NE(fd, -1);
+  const char buf[] = "0123456789";
+  // sizeof(buf) includes the terminating NUL, so 11 bytes are written
+  ASSERT_EQ((ssize_t)sizeof(buf), write(fd, buf, sizeof(buf)));
+  ::close(fd);
+
+  std::string error;
+  bufferlist bl;
+  int err = bl.read_file(fname, &error);
+  // remove the file before asserting so a failure below does not leave it behind
+  ::unlink(fname);
+  ASSERT_GE(err, 0);
+
+  std::vector<bufferlist> out;
+  size_t min_chunk = 2000;
+  size_t max_chunk = 8000;
+  get_rabin_chunks(min_chunk, max_chunk, bl, &out);
+
+  // Without this the size checks below would pass vacuously on empty output.
+  ASSERT_EQ(1u, out.size());
+  for (size_t i = 0; i < out.size(); ++i) {
+    // test if min <= chunk <= max
+    uint64_t chunk_size = out[i].length();
+    ASSERT_GE(chunk_size , min_chunk);
+    ASSERT_LE(chunk_size , max_chunk);
+  }
+
+  // The chunk starts with the file content; the rest is zero padding.
+  ASSERT_EQ(min_chunk, (size_t)out[0].length());
+  const char *chunk = out[0].c_str();
+  ASSERT_EQ(0, memcmp(chunk, buf, sizeof(buf)));
+  for (size_t i = sizeof(buf); i < min_chunk; ++i) {
+    ASSERT_EQ(0, chunk[i]);
+  }
+}
+
+TEST(Rabin, chunk_binbash) {
+  // Chunk /bin/bash, check every chunk size lies in [min_chunk, max_chunk]
+  // and print a histogram of the chunk sizes.
+  const char *fname = "/bin/bash";
+  std::string error;
+  bufferlist bl;
+
+  int err = bl.read_file(fname, &error);
+  ASSERT_GE(err, 0);
+
+  std::vector<bufferlist> out;
+  size_t min_chunk = 2000;
+  size_t max_chunk = 8000;
+  // compile-time constant so hist[] is a standard fixed-size array
+  constexpr int hist_size = 5;
+  int hist[hist_size] = {0};
+  size_t range = (max_chunk - min_chunk) / hist_size;
+  get_rabin_chunks(min_chunk, max_chunk, bl, &out, 5);
+  for (size_t i = 0; i < out.size(); ++i) {
+    // test if min <= chunk <= max
+    const size_t chunk_size = out[i].length();
+    printf(" chunk has size %zu\n", chunk_size);
+    ASSERT_GE(chunk_size , min_chunk);
+    ASSERT_LE(chunk_size , max_chunk);
+    // A chunk of exactly max_chunk bytes would compute index hist_size,
+    // one past the end of hist[]; count it in the last bucket instead.
+    size_t bucket = (chunk_size - min_chunk) / range;
+    if (bucket >= static_cast<size_t>(hist_size)) {
+      bucket = hist_size - 1;
+    }
+    hist[bucket] += 1;
+  }
+  printf("min chunk %zu, max chunk %zu", min_chunk, max_chunk);
+  printf("hist size %d, range %zu\n", hist_size, range);
+  for (int i = 0; i < hist_size; ++i) {
+    printf(" hist %d contains %d chunks\n", i, hist[i]);
+  }
+}
+