From 2a3e56b22c5b52e4834a9fd5047b3cdff7e8859f Mon Sep 17 00:00:00 2001 From: Myna V Date: Thu, 6 Sep 2018 11:23:28 -0500 Subject: [PATCH] erasure-code: add clay codes Introducing Clay codes to ceph. These codes are optimal in terms of network, disk traffic utilized during repair of a lost chunk. This feature also provides gradual increase in network, disk traffic with respect to the number of lost chunks. Authors: Myna, Elita. Fixes: http://tracker.ceph.com/issues/19278 Signed-off-by: Myna V --- src/erasure-code/CMakeLists.txt | 4 +- src/erasure-code/ErasureCode.cc | 18 +- src/erasure-code/ErasureCode.h | 4 +- src/erasure-code/clay/CMakeLists.txt | 14 + src/erasure-code/clay/ErasureCodeClay.cc | 900 ++++++++++++++++++ src/erasure-code/clay/ErasureCodeClay.h | 128 +++ .../clay/ErasureCodePluginClay.cc | 44 + src/erasure-code/clay/ErasureCodePluginClay.h | 30 + src/test/erasure-code/CMakeLists.txt | 28 + src/test/erasure-code/TestErasureCodeClay.cc | 595 ++++++++++++ .../erasure-code/TestErasureCodePluginClay.cc | 113 +++ 11 files changed, 1872 insertions(+), 6 deletions(-) create mode 100644 src/erasure-code/clay/CMakeLists.txt create mode 100644 src/erasure-code/clay/ErasureCodeClay.cc create mode 100644 src/erasure-code/clay/ErasureCodeClay.h create mode 100644 src/erasure-code/clay/ErasureCodePluginClay.cc create mode 100644 src/erasure-code/clay/ErasureCodePluginClay.h create mode 100644 src/test/erasure-code/TestErasureCodeClay.cc create mode 100644 src/test/erasure-code/TestErasureCodePluginClay.cc diff --git a/src/erasure-code/CMakeLists.txt b/src/erasure-code/CMakeLists.txt index 9bc62129458..0c9832ebf2e 100644 --- a/src/erasure-code/CMakeLists.txt +++ b/src/erasure-code/CMakeLists.txt @@ -20,6 +20,7 @@ endif() add_subdirectory(jerasure) add_subdirectory(lrc) add_subdirectory(shec) +add_subdirectory(clay) if (HAVE_BETTER_YASM_ELF64) add_subdirectory(isa) @@ -35,4 +36,5 @@ add_custom_target(erasure_code_plugins DEPENDS ${EC_ISA_LIB} ec_lrc ec_jerasure - ec_shec) + ec_shec + ec_clay) diff --git a/src/erasure-code/ErasureCode.cc b/src/erasure-code/ErasureCode.cc index 22f9cd48d20..8ca3262dc22 100644 --- a/src/erasure-code/ErasureCode.cc +++ b/src/erasure-code/ErasureCode.cc @@ -24,13 +24,25 @@ #include "include/buffer.h" #include "crush/CrushWrapper.h" #include "osd/osd_types.h" - - -const unsigned ErasureCode::SIMD_ALIGN = 32; +#include "common/debug.h" #define DEFAULT_RULE_ROOT "default" #define DEFAULT_RULE_FAILURE_DOMAIN "host" +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix(_dout) + +using namespace std; + +static ostream& _prefix(std::ostream* _dout) +{ + return *_dout << "ErasureCode: "; +} + +const unsigned ErasureCode::SIMD_ALIGN = 32; + int ErasureCode::init( ErasureCodeProfile &profile, std::ostream *ss) diff --git a/src/erasure-code/ErasureCode.h b/src/erasure-code/ErasureCode.h index 06a3d7a5166..d3c5badb1f1 100644 --- a/src/erasure-code/ErasureCode.h +++ b/src/erasure-code/ErasureCode.h @@ -66,7 +66,7 @@ namespace ceph { int minimum_to_decode(const std::set &want_to_read, const std::set &available, - std::map>> *minimum) final override; + std::map>> *minimum) override; int minimum_to_decode_with_cost(const std::set &want_to_read, const std::map &available, @@ -84,7 +84,7 @@ namespace ceph { int decode(const std::set &want_to_read, const std::map &chunks, - std::map *decoded, int chunk_size) override final; + std::map *decoded, int chunk_size) override; virtual int _decode(const std::set &want_to_read, const std::map &chunks, diff --git a/src/erasure-code/clay/CMakeLists.txt b/src/erasure-code/clay/CMakeLists.txt new file mode 100644 index 00000000000..c5af1e22698 --- /dev/null +++ b/src/erasure-code/clay/CMakeLists.txt @@ -0,0 +1,14 @@ +# clay plugin + +set(clay_srcs + ErasureCodePluginClay.cc + ErasureCodeClay.cc + $ + $ + ${CMAKE_SOURCE_DIR}/src/common/str_map.cc +) + +add_library(ec_clay SHARED ${clay_srcs}) +set_target_properties(ec_clay PROPERTIES + INSTALL_RPATH "") +install(TARGETS ec_clay DESTINATION ${erasure_plugin_dir}) diff --git a/src/erasure-code/clay/ErasureCodeClay.cc b/src/erasure-code/clay/ErasureCodeClay.cc new file mode 100644 index 00000000000..d4f25e31d5f --- /dev/null +++ b/src/erasure-code/clay/ErasureCodeClay.cc @@ -0,0 +1,900 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Myna Vajha + * + * Author: Myna Vajha + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + */ + +#include +#include + +#include "ErasureCodeClay.h" + +#include "common/debug.h" +#include "erasure-code/ErasureCodePlugin.h" +#include "include/ceph_assert.h" +#include "include/str_map.h" +#include "include/stringify.h" +#include "osd/osd_types.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix(_dout) + +#define LARGEST_VECTOR_WORDSIZE 16 +#define talloc(type, num) (type *) malloc(sizeof(type)*(num)) + +using namespace std; + +static ostream& _prefix(std::ostream* _dout) +{ + return *_dout << "ErasureCodeClay: "; +} + +static int pow_int(int a, int x) { + int power = 1; + while (x) { + if (x & 1) power *= a; + x /= 2; + a *= a; + } + return power; +} + +ErasureCodeClay::~ErasureCodeClay() +{ + for (int i = 0; i < q*t; i++) { + if (U_buf[i].length() != 0) U_buf[i].clear(); + } +} + +int ErasureCodeClay::init(ErasureCodeProfile &profile, + ostream *ss) +{ + int r; + r = parse(profile, ss); + if (r) + return r; + + r = ErasureCode::init(profile, ss); + if (r) + return r; + ErasureCodePluginRegistry ®istry = ErasureCodePluginRegistry::instance(); + r = registry.factory(mds.profile["plugin"], + directory, + mds.profile, + &mds.erasure_code, + ss); + if (r) + return r; + r = registry.factory(pft.profile["plugin"], + directory, + pft.profile, + &pft.erasure_code, + ss); + return r; + +} + +unsigned int ErasureCodeClay::get_chunk_size(unsigned int object_size) const +{ + unsigned alignment = get_alignment(); + unsigned tail = object_size % alignment; + unsigned padded_length = object_size + ( tail ? ( alignment - tail ) : 0 ); + + ceph_assert(padded_length % (k*sub_chunk_no) == 0); + + return padded_length / k; +} + +int ErasureCodeClay::minimum_to_decode(const set &want_to_read, + const set &available, + map>> *minimum) +{ + if (is_repair(want_to_read, available)) { + return minimum_to_repair(want_to_read, available, minimum); + } else { + return ErasureCode::minimum_to_decode(want_to_read, available, minimum); + } +} + +int ErasureCodeClay::decode(const set &want_to_read, + const map &chunks, + map *decoded, int chunk_size) +{ + set avail; + for ([[maybe_unused]] auto& [node, bl] : chunks) { + avail.insert(node); + } + + if (is_repair(want_to_read, avail)) { + return repair(want_to_read, chunks, decoded, chunk_size); + } else { + return ErasureCode::_decode(want_to_read, chunks, decoded); + } +} + +void p(const set &s) { cerr << s; } // for gdb + +int ErasureCodeClay::encode_chunks(const set &want_to_encode, + map *encoded) +{ + map chunks; + set parity_chunks; + int chunk_size = (*encoded)[0].length(); + + for (int i = 0; i < k + m; i++) { + if (i < k) { + chunks[i] = (*encoded)[i]; + } else { + chunks[i+nu] = (*encoded)[i]; + parity_chunks.insert(i+nu); + } + } + + for (int i = k; i < k + nu; i++) { + bufferptr buf(buffer::create_aligned(chunk_size, SIMD_ALIGN)); + buf.zero(); + chunks[i].push_back(std::move(buf)); + } + + int res = decode_layered(parity_chunks, &chunks); + for (int i = k ; i < k + nu; i++) { + // need to clean some of the intermediate chunks here!! + chunks[i].clear(); + } + return res; +} + +int ErasureCodeClay::decode_chunks(const set &want_to_read, + const map &chunks, + map *decoded) +{ + set erasures; + map coded_chunks; + + for (int i = 0; i < k + m; i++) { + if (chunks.count(i) == 0) { + erasures.insert(i < k ? i : i+nu); + } + ceph_assert(decoded->count(i) > 0); + coded_chunks[i < k ? i : i+nu] = (*decoded)[i]; + } + int chunk_size = coded_chunks[0].length(); + + for (int i = k; i < k+nu; i++) { + bufferptr buf(buffer::create_aligned(chunk_size, SIMD_ALIGN)); + buf.zero(); + coded_chunks[i].push_back(std::move(buf)); + } + + int res = decode_layered(erasures, &coded_chunks); + for (int i = k; i < k+nu; i++) { + coded_chunks[i].clear(); + } + return res; +} + +unsigned int ErasureCodeClay::get_alignment() const +{ + unsigned alignment = k * sub_chunk_no * w * sizeof(int); + if ((w * sizeof(int)) % LARGEST_VECTOR_WORDSIZE) { + alignment = k * sub_chunk_no * w * LARGEST_VECTOR_WORDSIZE; + } + return alignment; +} + +int ErasureCodeClay::parse(ErasureCodeProfile &profile, + ostream *ss) +{ + int err = 0; + err = ErasureCode::parse(profile, ss); + err |= to_int("k", profile, &k, DEFAULT_K, ss); + err |= to_int("m", profile, &m, DEFAULT_M, ss); + + err |= sanity_check_k(k, ss); + + err |= to_int("d", profile, &d, std::to_string(k+m-1), ss); + + // check for scalar_mds in profile input + if (profile.find("scalar_mds") == profile.end() || + profile.find("scalar_mds")->second.empty()) { + mds.profile["plugin"] = "jerasure"; + pft.profile["plugin"] = "jerasure"; + } else { + std::string p = profile.find("scalar_mds")->second; + if ((p == "jerasure") || (p == "isa") || (p == "shec")) { + mds.profile["plugin"] = p; + pft.profile["plugin"] = p; + } else { + *ss << "scalar_mds " << mds.profile["plugin"] << + "is not currently supported, use one of 'jerasure',"<< + " 'isa', 'shec'" << std::endl; + err = -EINVAL; + return err; + } + } + + if (profile.find("technique") == profile.end() || + profile.find("technique")->second.empty()) { + if ((mds.profile["plugin"]=="jerasure") || (mds.profile["plugin"]=="isa") ) { + mds.profile["technique"] = "reed_sol_van"; + pft.profile["technique"] = "reed_sol_van"; + } else { + mds.profile["technique"] = "single"; + pft.profile["technique"] = "single"; + } + } else { + std::string p = profile.find("technique")->second; + if (mds.profile["plugin"] == "jerasure") { + if ( (p == "reed_sol_van") || (p == "reed_sol_r6_op") || (p == "cauchy_orig") + || (p == "cauchy_good") || (p == "liber8tion")) { + mds.profile["technique"] = p; + pft.profile["technique"] = p; + } else { + *ss << "technique " << p << "is not currently supported, use one of " + << "reed_sol_van', 'reed_sol_r6_op','cauchy_orig'," + << "'cauchy_good','liber8tion'"<< std::endl; + err = -EINVAL; + return err; + } + } else if (mds.profile["plugin"] == "isa") { + if ( (p == "reed_sol_van") || (p == "cauchy")) { + mds.profile["technique"] = p; + pft.profile["technique"] = p; + } else { + *ss << "technique " << p << "is not currently supported, use one of" + << "'reed_sol_van','cauchy'"<< std::endl; + err = -EINVAL; + return err; + } + } else { + if ( (p == "single") || (p == "multiple")) { + mds.profile["technique"] = p; + pft.profile["technique"] = p; + } else { + *ss << "technique " << p << "is not currently supported, use one of"<< + "'single','multiple'"<< std::endl; + err = -EINVAL; + return err; + } + } + } + if ((d < k) || (d > k + m - 1)) { + *ss << "value of d " << d + << " must be within [ " << k << "," << k+m-1 << "]" << std::endl; + err = -EINVAL; + return err; + } + + q = d - k + 1; + if ((k + m) % q) { + nu = q - (k + m) % q; + } else { + nu = 0; + } + + if (k+m+nu > 254) { + err = -EINVAL; + return err; + } + + if (mds.profile["plugin"] == "shec") { + mds.profile["c"] = '2'; + pft.profile["c"] = '2'; + } + mds.profile["k"] = std::to_string(k+nu); + mds.profile["m"] = std::to_string(m); + mds.profile["w"] = '8'; + + pft.profile["k"] = '2'; + pft.profile["m"] = '2'; + pft.profile["w"] = '8'; + + t = (k + m + nu) / q; + sub_chunk_no = pow_int(q, t); + + dout(10) << __func__ + << " (q,t,nu)=(" << q << "," << t << "," << nu <<")" << dendl; + + return err; +} + +int ErasureCodeClay::is_repair(const set &want_to_read, + const set &available_chunks) { + + if (includes(available_chunks.begin(), available_chunks.end(), + want_to_read.begin(), want_to_read.end())) return 0; + if (want_to_read.size() > 1) return 0; + + int i = *want_to_read.begin(); + int lost_node_id = (i < k) ? i: i+nu; + for (int x = 0; x < q; x++) { + int node = (lost_node_id/q)*q+x; + node = (node < k) ? node : node-nu; + if (node != i) { // node in the same group other than erased node + if (available_chunks.count(node) == 0) return 0; + } + } + + if (available_chunks.size() < (unsigned)d) return 0; + return 1; +} + +int ErasureCodeClay::minimum_to_repair(const set &want_to_read, + const set &available_chunks, + map>> *minimum) +{ + int i = *want_to_read.begin(); + int lost_node_index = (i < k) ? i : i+nu; + int rep_node_index = 0; + + // add all the nodes in lost node's y column. + vector> sub_chunk_ind; + get_repair_subchunks(lost_node_index, sub_chunk_ind); + if ((available_chunks.size() >= (unsigned)d)) { + for (int j = 0; j < q; j++) { + if (j != lost_node_index%q) { + rep_node_index = (lost_node_index/q)*q+j; + if (rep_node_index < k) { + minimum->insert(make_pair(rep_node_index, sub_chunk_ind)); + } else if (rep_node_index >= k+nu) { + minimum->insert(make_pair(rep_node_index-nu, sub_chunk_ind)); + } + } + } + for (auto chunk : available_chunks) { + if (minimum->size() >= (unsigned)d) { + break; + } + if (!minimum->count(chunk)) { + minimum->emplace(chunk, sub_chunk_ind); + } + } + } else { + dout(0) << "minimum_to_repair: shouldn't have come here" << dendl; + ceph_assert(0); + } + ceph_assert(minimum->size() == (unsigned)d); + return 0; +} + +void ErasureCodeClay::get_repair_subchunks(const int &lost_node, + vector> &repair_sub_chunks_ind) +{ + const int y_lost = lost_node / q; + const int x_lost = lost_node % q; + + const int seq_sc_count = pow_int(q, t-1-y_lost); + const int num_seq = pow_int(q, y_lost); + + int index = x_lost * seq_sc_count; + for (int ind_seq = 0; ind_seq < num_seq; ind_seq++) { + repair_sub_chunks_ind.push_back(make_pair(index, seq_sc_count)); + index += q * seq_sc_count; + } +} + +int ErasureCodeClay::get_repair_sub_chunk_count(const set &want_to_read) +{ + int weight_vector[t] = {0}; + for (auto to_read : want_to_read) { + weight_vector[to_read / q]++; + } + + int repair_subchunks_count = 1; + for (int y = 0; y < t; y++) { + repair_subchunks_count = repair_subchunks_count*(q-weight_vector[y]); + } + + return sub_chunk_no - repair_subchunks_count; +} + +int ErasureCodeClay::repair(const set &want_to_read, + const map &chunks, + map *repaired, int chunk_size) +{ + + ceph_assert((want_to_read.size() == 1) && (chunks.size() == (unsigned)d)); + + int repair_sub_chunk_no = get_repair_sub_chunk_count(want_to_read); + vector> repair_sub_chunks_ind; + + unsigned repair_blocksize = chunks.begin()->second.length(); + assert(repair_blocksize%repair_sub_chunk_no == 0); + + unsigned sub_chunksize = repair_blocksize/repair_sub_chunk_no; + unsigned chunksize = sub_chunk_no*sub_chunksize; + + ceph_assert(chunksize == (unsigned)chunk_size); + + map recovered_data; + map helper_data; + set aloof_nodes; + + for (int i = 0; i < k + m; i++) { + // included helper data only for d+nu nodes. + if (auto found = chunks.find(i); found != chunks.end()) { // i is a helper + if (isecond; + } else { + helper_data[i+nu] = found->second; + } + } else { + if (i != *want_to_read.begin()) { // aloof node case. + int aloof_node_id = (i < k) ? i: i+nu; + aloof_nodes.insert(aloof_node_id); + } else { + bufferptr ptr(buffer::create_aligned(chunksize, SIMD_ALIGN)); + ptr.zero(); + int lost_node_id = (i < k) ? i : i+nu; + (*repaired)[i].push_back(ptr); + recovered_data[lost_node_id] = (*repaired)[i]; + get_repair_subchunks(lost_node_id, repair_sub_chunks_ind); + } + } + } + + // this is for shortened codes i.e., when nu > 0 + for (int i=k; i < k+nu; i++) { + bufferptr ptr(buffer::create_aligned(repair_blocksize, SIMD_ALIGN)); + ptr.zero(); + helper_data[i].push_back(ptr); + } + + ceph_assert(helper_data.size()+aloof_nodes.size()+recovered_data.size() == + (unsigned) q*t); + + int r = repair_one_lost_chunk(recovered_data, aloof_nodes, + helper_data, repair_blocksize, + repair_sub_chunks_ind); + + // clear buffers created for the purpose of shortening + for (int i = k; i < k+nu; i++) { + helper_data[i].clear(); + } + + return r; +} + +int ErasureCodeClay::repair_one_lost_chunk(map &recovered_data, + set &aloof_nodes, + map &helper_data, + int repair_blocksize, + vector> &repair_sub_chunks_ind) +{ + unsigned repair_subchunks = (unsigned)sub_chunk_no / q; + unsigned sub_chunksize = repair_blocksize / repair_subchunks; + + int z_vec[t]; + map > ordered_planes; + map repair_plane_to_ind; + int count_retrieved_sub_chunks = 0; + int plane_ind = 0; + + bufferptr buf(buffer::create_aligned(sub_chunksize, SIMD_ALIGN)); + bufferlist temp_buf; + temp_buf.push_back(buf); + + for (auto [index,count] : repair_sub_chunks_ind) { + for (int j = index; j < index + count; j++) { + get_plane_vector(j, z_vec); + int order = 0; + // check across all erasures and aloof nodes + for ([[maybe_unused]] auto& [node, bl] : recovered_data) { + if (node % q == z_vec[node / q]) order++; + } + for (auto node : aloof_nodes) { + if (node % q == z_vec[node / q]) order++; + } + ceph_assert(order > 0); + ordered_planes[order].insert(j); + // to keep track of a sub chunk within helper buffer recieved + repair_plane_to_ind[j] = plane_ind; + plane_ind++; + } + } + assert((unsigned)plane_ind == repair_subchunks); + + int plane_count = 0; + for (int i = 0; i < q*t; i++) { + if (U_buf[i].length() == 0) { + bufferptr buf(buffer::create_aligned(sub_chunk_no*sub_chunksize, SIMD_ALIGN)); + buf.zero(); + U_buf[i].push_back(std::move(buf)); + } + } + + int lost_chunk; + int count = 0; + for ([[maybe_unused]] auto& [node, bl] : recovered_data) { + lost_chunk = node; + count++; + } + ceph_assert(count == 1); + + set erasures; + for (int i = 0; i < q; i++) { + erasures.insert(lost_chunk - lost_chunk % q + i); + } + for (auto node : aloof_nodes) { + erasures.insert(node); + } + + for (int order = 1; ;order++) { + if (ordered_planes.count(order) == 0) { + break; + } + plane_count += ordered_planes[order].size(); + for (auto z : ordered_planes[order]) { + get_plane_vector(z, z_vec); + + for (int y = 0; y < t; y++) { + for (int x = 0; x < q; x++) { + int node_xy = y*q + x; + map known_subchunks; + map pftsubchunks; + set pft_erasures; + if (erasures.count(node_xy) == 0) { + assert(helper_data.count(node_xy) > 0); + int z_sw = z + (x - z_vec[y])*pow_int(q,t-1-y); + int node_sw = y*q + z_vec[y]; + int i0 = 0, i1 = 1, i2 = 2, i3 = 3; + if (z_vec[y] > x) { + i0 = 1; + i1 = 0; + i2 = 3; + i3 = 2; + } + if (aloof_nodes.count(node_sw) > 0) { + assert(repair_plane_to_ind.count(z) > 0); + assert(repair_plane_to_ind.count(z_sw) > 0); + pft_erasures.insert(i2); + known_subchunks[i0].substr_of(helper_data[node_xy], repair_plane_to_ind[z]*sub_chunksize, sub_chunksize); + known_subchunks[i3].substr_of(U_buf[node_sw], z_sw*sub_chunksize, sub_chunksize); + pftsubchunks[i0] = known_subchunks[i0]; + pftsubchunks[i1] = temp_buf; + pftsubchunks[i2].substr_of(U_buf[node_xy], z*sub_chunksize, sub_chunksize); + pftsubchunks[i3] = known_subchunks[i3]; + for (int i=0; i<3; i++) { + pftsubchunks[i].rebuild_aligned(SIMD_ALIGN); + } + pft.erasure_code->decode_chunks(pft_erasures, known_subchunks, &pftsubchunks); + } else { + ceph_assert(helper_data.count(node_sw) > 0); + ceph_assert(repair_plane_to_ind.count(z) > 0); + if (z_vec[y] != x){ + pft_erasures.insert(i2); + ceph_assert(repair_plane_to_ind.count(z_sw) > 0); + known_subchunks[i0].substr_of(helper_data[node_xy], repair_plane_to_ind[z]*sub_chunksize, sub_chunksize); + known_subchunks[i1].substr_of(helper_data[node_sw], repair_plane_to_ind[z_sw]*sub_chunksize, sub_chunksize); + pftsubchunks[i0] = known_subchunks[i0]; + pftsubchunks[i1] = known_subchunks[i1]; + pftsubchunks[i2].substr_of(U_buf[node_xy], z*sub_chunksize, sub_chunksize); + pftsubchunks[i3].substr_of(temp_buf, 0, sub_chunksize); + for (int i=0; i<3; i++) { + pftsubchunks[i].rebuild_aligned(SIMD_ALIGN); + } + pft.erasure_code->decode_chunks(pft_erasures, known_subchunks, &pftsubchunks); + } else { + char* uncoupled_chunk = U_buf[node_xy].c_str(); + char* coupled_chunk = helper_data[node_xy].c_str(); + memcpy(&uncoupled_chunk[z*sub_chunksize], + &coupled_chunk[repair_plane_to_ind[z]*sub_chunksize], + sub_chunksize); + } + } + } + } // x + } // y + ceph_assert(erasures.size() <= (unsigned)m); + decode_uncoupled(erasures, z, sub_chunksize); + + for (auto i : erasures) { + int x = i % q; + int y = i / q; + int node_sw = y*q+z_vec[y]; + int z_sw = z + (x - z_vec[y]) * pow_int(q,t-1-y); + set pft_erasures; + map known_subchunks; + map pftsubchunks; + int i0 = 0, i1 = 1, i2 = 2, i3 = 3; + if (z_vec[y] > x) { + i0 = 1; + i1 = 0; + i2 = 3; + i3 = 2; + } + // make sure it is not an aloof node before you retrieve repaired_data + if (aloof_nodes.count(i) == 0) { + if (x == z_vec[y]) { // hole-dot pair (type 0) + char* coupled_chunk = recovered_data[i].c_str(); + char* uncoupled_chunk = U_buf[i].c_str(); + memcpy(&coupled_chunk[z*sub_chunksize], + &uncoupled_chunk[z*sub_chunksize], + sub_chunksize); + count_retrieved_sub_chunks++; + } else { + ceph_assert(y == lost_chunk / q); + ceph_assert(node_sw == lost_chunk); + ceph_assert(helper_data.count(i) > 0); + pft_erasures.insert(i1); + known_subchunks[i0].substr_of(helper_data[i], repair_plane_to_ind[z]*sub_chunksize, sub_chunksize); + known_subchunks[i2].substr_of(U_buf[i], z*sub_chunksize, sub_chunksize); + + pftsubchunks[i0] = known_subchunks[i0]; + pftsubchunks[i1].substr_of(recovered_data[node_sw], z_sw*sub_chunksize, sub_chunksize); + pftsubchunks[i2] = known_subchunks[i2]; + pftsubchunks[i3] = temp_buf; + for (int i=0; i<3; i++) { + pftsubchunks[i].rebuild_aligned(SIMD_ALIGN); + } + pft.erasure_code->decode_chunks(pft_erasures, known_subchunks, &pftsubchunks); + } + } + } // recover all erasures + } // planes of particular order + } // order + + return 0; +} + + +int ErasureCodeClay::decode_layered(set &erased_chunks, + map *chunks) +{ + int num_erasures = erased_chunks.size(); + + int size = (*chunks)[0].length(); + ceph_assert(size%sub_chunk_no == 0); + int sc_size = size / sub_chunk_no; + + ceph_assert(num_erasures > 0); + + for (int i = k+nu; (num_erasures < m) && (i < q*t); i++) { + if ([[maybe_unused]] auto [it, added] = erased_chunks.emplace(i); added) { + num_erasures++; + } + } + ceph_assert(num_erasures == m); + + int max_iscore = get_max_iscore(erased_chunks); + int order[sub_chunk_no]; + int z_vec[t]; + for (int i = 0; i < q*t; i++) { + if (U_buf[i].length() == 0) { + bufferptr buf(buffer::create_aligned(size, SIMD_ALIGN)); + buf.zero(); + U_buf[i].push_back(std::move(buf)); + } + } + + set_planes_sequential_decoding_order(order, erased_chunks); + + for (int iscore = 0; iscore <= max_iscore; iscore++) { + for (int z = 0; z < sub_chunk_no; z++) { + if (order[z] == iscore) { + decode_erasures(erased_chunks, z, chunks, sc_size); + } + } + + for (int z = 0; z < sub_chunk_no; z++) { + if (order[z] == iscore) { + get_plane_vector(z, z_vec); + for (auto node_xy : erased_chunks) { + int x = node_xy % q; + int y = node_xy / q; + int node_sw = y*q+z_vec[y]; + if (z_vec[y] != x) { + if (erased_chunks.count(node_sw) == 0) { + recover_type1_erasure(chunks, x, y, z, z_vec, sc_size); + } else if (z_vec[y] < x){ + ceph_assert(erased_chunks.count(node_sw) > 0); + ceph_assert(z_vec[y] != x); + get_coupled_from_uncoupled(chunks, x, y, z, z_vec, sc_size); + } + } else { + char* C = (*chunks)[node_xy].c_str(); + char* U = U_buf[node_xy].c_str(); + memcpy(&C[z*sc_size], &U[z*sc_size], sc_size); + } + } + } + } // plane + } // iscore, order + + return 0; +} + +int ErasureCodeClay::decode_erasures(const set& erased_chunks, int z, + map* chunks, int sc_size) +{ + int z_vec[t]; + + get_plane_vector(z, z_vec); + + for (int x = 0; x < q; x++) { + for (int y = 0; y < t; y++) { + int node_xy = q*y+x; + int node_sw = q*y+z_vec[y]; + if (erased_chunks.count(node_xy) == 0) { + if (z_vec[y] < x) { + get_uncoupled_from_coupled(chunks, x, y, z, z_vec, sc_size); + } else if (z_vec[y] == x) { + char* uncoupled_chunk = U_buf[node_xy].c_str(); + char* coupled_chunk = (*chunks)[node_xy].c_str(); + memcpy(&uncoupled_chunk[z*sc_size], &coupled_chunk[z*sc_size], sc_size); + } else { + if (erased_chunks.count(node_sw) > 0) { + get_uncoupled_from_coupled(chunks, x, y, z, z_vec, sc_size); + } + } + } + } + } + return decode_uncoupled(erased_chunks, z, sc_size); +} + +int ErasureCodeClay::decode_uncoupled(const set& erased_chunks, int z, int sc_size) +{ + map known_subchunks; + map all_subchunks; + + for (int i = 0; i < q*t; i++) { + if (erased_chunks.count(i) == 0) { + known_subchunks[i].substr_of(U_buf[i], z*sc_size, sc_size); + all_subchunks[i] = known_subchunks[i]; + } else { + all_subchunks[i].substr_of(U_buf[i], z*sc_size, sc_size); + } + all_subchunks[i].rebuild_aligned_size_and_memory(sc_size, SIMD_ALIGN); + assert(all_subchunks[i].is_contiguous()); + } + + mds.erasure_code->decode_chunks(erased_chunks, known_subchunks, &all_subchunks); + return 0; +} + +void ErasureCodeClay::set_planes_sequential_decoding_order(int* order, set& erasures) { + int z_vec[t]; + for (int z = 0; z < sub_chunk_no; z++) { + get_plane_vector(z,z_vec); + order[z] = 0; + for (auto i : erasures) { + if (i % q == z_vec[i / q]) { + order[z] = order[z] + 1; + } + } + } +} + +void ErasureCodeClay::recover_type1_erasure(map* chunks, + int x, int y, int z, + int* z_vec, int sc_size) +{ + set erased_chunks; + + int node_xy = y*q+x; + int node_sw = y*q+z_vec[y]; + int z_sw = z + (x - z_vec[y]) * pow_int(q,t-1-y); + + map known_subchunks; + map pftsubchunks; + bufferptr ptr(buffer::create_aligned(sc_size, SIMD_ALIGN)); + ptr.zero(); + + int i0 = 0, i1 = 1, i2 = 2, i3 = 3; + if (z_vec[y] > x) { + i0 = 1; + i1 = 0; + i2 = 3; + i3 = 2; + } + + erased_chunks.insert(i0); + pftsubchunks[i0].substr_of((*chunks)[node_xy], z * sc_size, sc_size); + known_subchunks[i1].substr_of((*chunks)[node_sw], z_sw * sc_size, sc_size); + known_subchunks[i2].substr_of(U_buf[node_xy], z * sc_size, sc_size); + pftsubchunks[i1] = known_subchunks[i1]; + pftsubchunks[i2] = known_subchunks[i2]; + pftsubchunks[i3].push_back(ptr); + + for (int i=0; i<3; i++) { + pftsubchunks[i].rebuild_aligned_size_and_memory(sc_size, SIMD_ALIGN); + } + + pft.erasure_code->decode_chunks(erased_chunks, known_subchunks, &pftsubchunks); +} + +void ErasureCodeClay::get_coupled_from_uncoupled(map* chunks, + int x, int y, int z, + int* z_vec, int sc_size) +{ + set erased_chunks = {0, 1}; + + int node_xy = y*q+x; + int node_sw = y*q+z_vec[y]; + int z_sw = z + (x - z_vec[y]) * pow_int(q,t-1-y); + + ceph_assert(z_vec[y] < x); + map uncoupled_subchunks; + uncoupled_subchunks[2].substr_of(U_buf[node_xy], z * sc_size, sc_size); + uncoupled_subchunks[3].substr_of(U_buf[node_sw], z_sw * sc_size, sc_size); + + map pftsubchunks; + pftsubchunks[0].substr_of((*chunks)[node_xy], z * sc_size, sc_size); + pftsubchunks[1].substr_of((*chunks)[node_sw], z_sw * sc_size, sc_size); + pftsubchunks[2] = uncoupled_subchunks[2]; + pftsubchunks[3] = uncoupled_subchunks[3]; + + for (int i=0; i<3; i++) { + pftsubchunks[i].rebuild_aligned_size_and_memory(sc_size, SIMD_ALIGN); + } + pft.erasure_code->decode_chunks(erased_chunks, uncoupled_subchunks, &pftsubchunks); +} + +void ErasureCodeClay::get_uncoupled_from_coupled(map* chunks, + int x, int y, int z, + int* z_vec, int sc_size) +{ + set erased_chunks = {2, 3}; + + int node_xy = y*q+x; + int node_sw = y*q+z_vec[y]; + int z_sw = z + (x - z_vec[y]) * pow_int(q,t-1-y); + + int i0 = 0, i1 = 1, i2 = 2, i3 = 3; + if (z_vec[y] > x) { + i0 = 1; + i1 = 0; + i2 = 3; + i3 = 2; + } + map coupled_subchunks; + coupled_subchunks[i0].substr_of((*chunks)[node_xy], z * sc_size, sc_size); + coupled_subchunks[i1].substr_of((*chunks)[node_sw], z_sw * sc_size, sc_size); + + map pftsubchunks; + pftsubchunks[0] = coupled_subchunks[0]; + pftsubchunks[1] = coupled_subchunks[1]; + pftsubchunks[i2].substr_of(U_buf[node_xy], z * sc_size, sc_size); + pftsubchunks[i3].substr_of(U_buf[node_sw], z_sw * sc_size, sc_size); + for (int i=0; i<3; i++) { + pftsubchunks[i].rebuild_aligned_size_and_memory(sc_size, SIMD_ALIGN); + } + pft.erasure_code->decode_chunks(erased_chunks, coupled_subchunks, &pftsubchunks); +} + +int ErasureCodeClay::get_max_iscore(set& erased_chunks) +{ + int weight_vec[t]; + int iscore = 0; + memset(weight_vec, 0, sizeof(int)*t); + + for (auto i : erased_chunks) { + if (weight_vec[i / q] == 0) { + weight_vec[i / q] = 1; + iscore++; + } + } + return iscore; +} + +void ErasureCodeClay::get_plane_vector(int z, int* z_vec) +{ + for (int i = 0; i < t; i++) { + z_vec[t-1-i] = z % q; + z = (z - z_vec[t-1-i]) / q; + } +} diff --git a/src/erasure-code/clay/ErasureCodeClay.h b/src/erasure-code/clay/ErasureCodeClay.h new file mode 100644 index 00000000000..6f0849de1e4 --- /dev/null +++ b/src/erasure-code/clay/ErasureCodeClay.h @@ -0,0 +1,128 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Myna Vajha + * + * Author: Myna Vajha + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + */ + +#ifndef CEPH_ERASURE_CODE_CLAY_H +#define CEPH_ERASURE_CODE_CLAY_H + +#include "include/err.h" +#include "include/buffer_fwd.h" +#include "erasure-code/ErasureCode.h" + +class ErasureCodeClay final : public ceph::ErasureCode { +public: + std::string DEFAULT_K{"4"}; + std::string DEFAULT_M{"2"}; + std::string DEFAULT_W{"8"}; + int k = 0, m = 0, d = 0, w = 8; + int q = 0, t = 0, nu = 0; + int sub_chunk_no = 0; + + std::map U_buf; + + struct ScalarMDS { + ceph::ErasureCodeInterfaceRef erasure_code; + ceph::ErasureCodeProfile profile; + }; + ScalarMDS mds; + ScalarMDS pft; + const std::string directory; + + explicit ErasureCodeClay(const std::string& dir) + : directory(dir) + {} + + ~ErasureCodeClay() override; + + unsigned int get_chunk_count() const override { + return k+m; + } + + unsigned int get_data_chunk_count() const override { + return k; + } + + int get_sub_chunk_count() override { + return sub_chunk_no; + } + + unsigned int get_chunk_size(unsigned int object_size) const override; + + int minimum_to_decode(const std::set &want_to_read, + const std::set &available, + std::map>> *minimum) override; + + int decode(const std::set &want_to_read, + const std::map &chunks, + std::map *decoded, int chunk_size) override; + + int encode_chunks(const std::set &want_to_encode, + std::map *encoded) override; + + int decode_chunks(const std::set &want_to_read, + const std::map &chunks, + std::map *decoded) override; + + int init(ceph::ErasureCodeProfile &profile, std::ostream *ss) override; + + int is_repair(const std::set &want_to_read, + const std::set &available_chunks); + + int get_repair_sub_chunk_count(const std::set &want_to_read); + + virtual int parse(ceph::ErasureCodeProfile &profile, std::ostream *ss); + + unsigned get_alignment() const; + +private: + int minimum_to_repair(const std::set &want_to_read, + const std::set &available_chunks, + std::map>> *minimum); + + int repair(const std::set &want_to_read, + const std::map &chunks, + std::map *recovered, int chunk_size); + + int decode_layered(std::set& erased_chunks, std::map* chunks); + + int repair_one_lost_chunk(std::map &recovered_data, std::set &aloof_nodes, + std::map &helper_data, int repair_blocksize, + std::vector> &repair_sub_chunks_ind); + + void get_repair_subchunks(const int &lost_node, + std::vector> &repair_sub_chunks_ind); + + int decode_erasures(const std::set& erased_chunks, int z, + std::map* chunks, int sc_size); + + int decode_uncoupled(const std::set& erasures, int z, int ss_size); + + void set_planes_sequential_decoding_order(int* order, std::set& erasures); + + void recover_type1_erasure(std::map* chunks, int x, int y, int z, + int* z_vec, int sc_size); + + void get_uncoupled_from_coupled(std::map* chunks, int x, int y, int z, + int* z_vec, int sc_size); + + void get_coupled_from_uncoupled(std::map* chunks, int x, int y, int z, + int* z_vec, int sc_size); + + void get_plane_vector(int z, int* z_vec); + + int get_max_iscore(std::set& erased_chunks); +}; + +#endif diff --git a/src/erasure-code/clay/ErasureCodePluginClay.cc b/src/erasure-code/clay/ErasureCodePluginClay.cc new file mode 100644 index 00000000000..ab3d1db7f27 --- /dev/null +++ b/src/erasure-code/clay/ErasureCodePluginClay.cc @@ -0,0 +1,44 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Myna Vajha + * + * Author: Myna Vajha + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + */ + +#include "ceph_ver.h" +#include "common/debug.h" +#include "ErasureCodePluginClay.h" +#include "ErasureCodeClay.h" + +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix(_dout) + +int ErasureCodePluginClay::factory(const std::string &directory, + ErasureCodeProfile &profile, + ErasureCodeInterfaceRef *erasure_code, + std::ostream *ss) { + auto interface = std::make_unique(directory); + if (int r = interface->init(profile, ss); r) { + return r; + } + *erasure_code = ErasureCodeInterfaceRef(interface.release()); + return 0; +}; + +const char *__erasure_code_version() { return CEPH_GIT_NICE_VER; } + +int __erasure_code_init(char *plugin_name, char *directory) +{ + ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance(); + return instance.add(plugin_name, new ErasureCodePluginClay()); +} diff --git a/src/erasure-code/clay/ErasureCodePluginClay.h b/src/erasure-code/clay/ErasureCodePluginClay.h new file mode 100644 index 00000000000..d6641287c2e --- /dev/null +++ b/src/erasure-code/clay/ErasureCodePluginClay.h @@ -0,0 +1,30 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph distributed storage system + * + * Copyright (C) 2018 Myna Vajha + * + * Author: Myna Vajha + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + */ + +#ifndef CEPH_ERASURE_CODE_PLUGIN_CLAY_H +#define CEPH_ERASURE_CODE_PLUGIN_CLAY_H + +#include "erasure-code/ErasureCodePlugin.h" + +class ErasureCodePluginClay : public ErasureCodePlugin { +public: + int factory(const std::string& directory, + ErasureCodeProfile &profile, + ErasureCodeInterfaceRef *erasure_code, + ostream *ss) override; +}; + +#endif diff --git a/src/test/erasure-code/CMakeLists.txt b/src/test/erasure-code/CMakeLists.txt index 2cd7cf2c983..721f6c36750 100644 --- a/src/test/erasure-code/CMakeLists.txt +++ b/src/test/erasure-code/CMakeLists.txt @@ -234,3 +234,31 @@ target_link_libraries(unittest_erasure_code_shec_arguments ceph-common ec_shec ) + +#unitest_erasure_code_clay +add_executable(unittest_erasure_code_clay + TestErasureCodeClay.cc + $) +add_ceph_unittest(unittest_erasure_code_clay) +target_link_libraries(unittest_erasure_code_clay + global + ${CMAKE_DL_LIBS} + ${UNITTEST_LIBS} + ceph-common + ec_clay + ) + +# unittest_erasure_code_plugin_clay +add_executable(unittest_erasure_code_plugin_clay + TestErasureCodePluginClay.cc + $) +add_ceph_unittest(unittest_erasure_code_plugin_clay) +add_dependencies(unittest_erasure_code_plugin_clay + ec_clay) +target_link_libraries(unittest_erasure_code_plugin_clay + GTest::Main + global + ${CMAKE_DL_LIBS} + ${UNITTEST_LIBS} + ceph-common) + diff --git a/src/test/erasure-code/TestErasureCodeClay.cc b/src/test/erasure-code/TestErasureCodeClay.cc new file mode 100644 index 00000000000..9f932c952d4 --- /dev/null +++ b/src/test/erasure-code/TestErasureCodeClay.cc @@ -0,0 +1,595 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph distributed storage system + * + * Copyright (C) 2013,2014 Cloudwatt + * Copyright (C) 2014 Red Hat + * + * Author: Loic Dachary + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + */ + +#include +#include + +#include "crush/CrushWrapper.h" +#include "include/stringify.h" +#include "erasure-code/clay/ErasureCodeClay.h" +#include "global/global_context.h" +#include "common/config_proxy.h" +#include "gtest/gtest.h" + +TEST(ErasureCodeClay, sanity_check_k) +{ + ErasureCodeClay clay(g_conf().get_val("erasure_code_dir")); + ErasureCodeProfile profile; + profile["k"] = "1"; + profile["m"] = "1"; + ostringstream errors; + EXPECT_EQ(-EINVAL, clay.init(profile, &errors)); + EXPECT_NE(std::string::npos, errors.str().find("must be >= 2")); +} + +TEST(ErasureCodeClay, encode_decode) +{ + ostringstream errors; + ErasureCodeClay clay(g_conf().get_val("erasure_code_dir")); + ErasureCodeProfile profile; + profile["k"] = "2"; + profile["m"] = "2"; + int r= clay.init(profile, &cerr); + EXPECT_EQ(0, r); + +#define LARGE_ENOUGH 2048 + bufferptr in_ptr(buffer::create_page_aligned(LARGE_ENOUGH)); + in_ptr.zero(); + in_ptr.set_length(0); + const char *payload = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; + in_ptr.append(payload, strlen(payload)); + bufferlist in; + in.push_back(in_ptr); + int want_to_encode[] = { 0, 1, 2, 3 }; + map encoded; + EXPECT_EQ(0, clay.encode(set(want_to_encode, want_to_encode+4), + in, + &encoded)); + EXPECT_EQ(4u, encoded.size()); + unsigned length = encoded[0].length(); + EXPECT_EQ(0, memcmp(encoded[0].c_str(), in.c_str(), length)); + EXPECT_EQ(0, memcmp(encoded[1].c_str(), in.c_str() + length, + in.length() - length)); + + + // all chunks are available + { + int want_to_decode[] = { 0, 1 }; + map decoded; + EXPECT_EQ(0, clay._decode(set(want_to_decode, want_to_decode+2), + encoded, + &decoded)); + EXPECT_EQ(2u, decoded.size()); + EXPECT_EQ(length, decoded[0].length()); + EXPECT_EQ(0, memcmp(decoded[0].c_str(), in.c_str(), length)); + EXPECT_EQ(0, memcmp(decoded[1].c_str(), in.c_str() + length, + in.length() - length)); + } + + // check all two chunks missing possibilities and recover them + for (int i=1; i<4; i++) { + for (int j=0; j degraded = encoded; + degraded.erase(j); + degraded.erase(i); + EXPECT_EQ(2u, degraded.size()); + int want_to_decode[] = {j,i}; + map decoded; + EXPECT_EQ(0, clay._decode(set(want_to_decode, want_to_decode+2), + degraded, + &decoded)); + EXPECT_EQ(4u, decoded.size()); + EXPECT_EQ(length, decoded[j].length()); + EXPECT_EQ(0, memcmp(decoded[j].c_str(), encoded[j].c_str(), length)); + EXPECT_EQ(0, memcmp(decoded[i].c_str(), encoded[i].c_str(), length)); + } + } + //check for all one chunk missing possibilities + int sc_size = length/clay.sub_chunk_no; + int avail[] = {0,1,2,3}; + for (int i=0; i < 4; i++) { + set want_to_read; + want_to_read.insert(i); + set available(avail, avail+4); + available.erase(i); + map>> minimum; + EXPECT_EQ(0, clay.minimum_to_decode(want_to_read, available, &minimum)); + map helper; + for (map>>::iterator h=minimum.begin(); h!= minimum.end(); ++h) { + for(vector>::iterator ind=h->second.begin(); ind != h->second.end(); ++ind) { + bufferlist temp; + temp.substr_of(encoded[h->first], ind->first*sc_size, ind->second*sc_size); + helper[h->first].append(temp); + } + } + for (map>>::iterator h=minimum.begin(); h!= minimum.end(); ++h) { + EXPECT_EQ(length/clay.q, helper[h->first].length()); + } + EXPECT_EQ(3u, helper.size()); + map decoded; + EXPECT_EQ(0, clay.decode(want_to_read, helper, &decoded, length)); + EXPECT_EQ(1u, decoded.size()); + EXPECT_EQ(0, memcmp(decoded[i].c_str(), encoded[i].c_str(), length)); + } +} + + +TEST(ErasureCodeClay, encode_decode_aloof_nodes) +{ + ostringstream errors; + ErasureCodeClay clay(g_conf().get_val("erasure_code_dir")); + ErasureCodeProfile profile; + profile["k"] = "3"; + profile["m"] = "3"; + profile["d"] = "4"; + int r= clay.init(profile, &cerr); + EXPECT_EQ(0, r); + +#define LARGE_ENOUGH 2048 + bufferptr in_ptr(buffer::create_page_aligned(LARGE_ENOUGH)); + in_ptr.zero(); + in_ptr.set_length(0); + const char *payload = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; + in_ptr.append(payload, strlen(payload)); + bufferlist in; + in.push_back(in_ptr); + int want_to_encode[] = { 0, 1, 2, 3, 4, 5 }; + map encoded; + EXPECT_EQ(0, clay.encode(set(want_to_encode, want_to_encode+6), + in, + &encoded)); + EXPECT_EQ(6u, encoded.size()); + unsigned length = encoded[0].length(); + if (in.length() < length) { + EXPECT_EQ(0, memcmp(encoded[0].c_str(), in.c_str(), in.length())); + } else if (in.length() <= 2*length ) { + EXPECT_EQ(0, memcmp(encoded[0].c_str(), in.c_str(), in.length())); + EXPECT_EQ(0, memcmp(encoded[1].c_str(), in.c_str()+length, in.length()-length)); + } else { + EXPECT_EQ(1, in.length() <= 3*length); + EXPECT_EQ(0, memcmp(encoded[0].c_str(), in.c_str(), in.length())); + EXPECT_EQ(0, memcmp(encoded[1].c_str(), in.c_str()+length, length)); + EXPECT_EQ(0, memcmp(encoded[2].c_str(), in.c_str()+2*length, in.length()-2*length)); + } + + // all chunks are available + { + int want_to_decode[] = { 0, 1, 2 }; + map decoded; + EXPECT_EQ(0, clay._decode(set(want_to_decode, want_to_decode+3), + encoded, + &decoded)); + EXPECT_EQ(3u, decoded.size()); + EXPECT_EQ(length, decoded[0].length()); + EXPECT_EQ(0, memcmp(decoded[0].c_str(), encoded[0].c_str(), length)); + EXPECT_EQ(0, memcmp(decoded[1].c_str(), encoded[1].c_str(), length)); + EXPECT_EQ(0, memcmp(decoded[2].c_str(), encoded[2].c_str(), length)); + } + + // check all three chunks missing possibilities and recover them + for (int i=2; i<6; i++) { + for (int j=1; j degraded = encoded; + degraded.erase(k); + degraded.erase(j); + degraded.erase(i); + EXPECT_EQ(3u, degraded.size()); + int want_to_decode[] = {k,j,i}; + map decoded; + EXPECT_EQ(0, clay._decode(set(want_to_decode, want_to_decode+3), + degraded, + &decoded)); + EXPECT_EQ(6u, decoded.size()); + EXPECT_EQ(length, decoded[j].length()); + EXPECT_EQ(0, memcmp(decoded[k].c_str(), encoded[k].c_str(), length)); + EXPECT_EQ(0, memcmp(decoded[j].c_str(), encoded[j].c_str(), length)); + EXPECT_EQ(0, memcmp(decoded[i].c_str(), encoded[i].c_str(), length)); + } + } + } + //check for all one chunk missing possibilities + int sc_size = length/clay.sub_chunk_no; + int avail[] = {0,1,2,3,4,5}; + for (int i=0; i < 6; i++) { + vector> repair_subchunks; + map>> minimum; + set want_to_read; + want_to_read.insert(i); + set available(avail, avail+6); + available.erase(i); + clay.minimum_to_decode(want_to_read, available, &minimum); + map helper; + for (map>>::iterator h=minimum.begin(); h!= minimum.end(); ++h) { + for(vector>::iterator ind=h->second.begin(); ind != h->second.end(); ++ind) { + bufferlist temp; + temp.substr_of(encoded[h->first], ind->first*sc_size, ind->second*sc_size); + helper[h->first].append(temp); + } + } + for (map>>::iterator h=minimum.begin(); h!= minimum.end(); ++h) { + EXPECT_EQ(length/clay.q, helper[h->first].length()); + } + EXPECT_EQ((unsigned)clay.d, helper.size()); + map decoded; + EXPECT_EQ(0, clay.decode(want_to_read, helper, &decoded, length)); + EXPECT_EQ(1u, decoded.size()); + EXPECT_EQ(0, memcmp(decoded[i].c_str(), encoded[i].c_str(), length)); + } +} + +TEST(ErasureCodeClay, encode_decode_shortening_case) +{ + ostringstream errors; + ErasureCodeClay clay(g_conf().get_val("erasure_code_dir")); + ErasureCodeProfile profile; + profile["k"] = "4"; + profile["m"] = "3"; + profile["d"] = "5"; + int r= clay.init(profile, &cerr); + EXPECT_EQ(0, r); + + EXPECT_EQ(2, clay.q); + EXPECT_EQ(4, clay.t); + EXPECT_EQ(1, clay.nu); + + bufferptr in_ptr(buffer::create_page_aligned(LARGE_ENOUGH)); + in_ptr.zero(); + in_ptr.set_length(0); + const char *payload = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; + in_ptr.append(payload, strlen(payload)); + bufferlist in; + in.push_back(in_ptr); + int want_to_encode[] = { 0, 1, 2, 3, 4, 5, 6 }; + map encoded; + EXPECT_EQ(0, clay.encode(set(want_to_encode, want_to_encode+7), + in, + &encoded)); + EXPECT_EQ(7u, encoded.size()); + unsigned length = encoded[0].length(); + if (in.length() < length) { + EXPECT_EQ(0, memcmp(encoded[0].c_str(), in.c_str(), in.length())); + } else if (in.length() <= 2*length) { + EXPECT_EQ(0, memcmp(encoded[0].c_str(), in.c_str(), in.length())); + EXPECT_EQ(0, memcmp(encoded[1].c_str(), in.c_str()+length, in.length()-length)); + } else if (in.length() <= 3*length) { + EXPECT_EQ(0, memcmp(encoded[0].c_str(), in.c_str(), in.length())); + EXPECT_EQ(0, memcmp(encoded[1].c_str(), in.c_str()+length, length)); + EXPECT_EQ(0, memcmp(encoded[2].c_str(), in.c_str()+2*length, in.length()-2*length)); + } else { + EXPECT_EQ(1, in.length() <= 4*length); + EXPECT_EQ(0, memcmp(encoded[0].c_str(), in.c_str(), in.length())); + EXPECT_EQ(0, memcmp(encoded[1].c_str(), in.c_str()+length, length)); + EXPECT_EQ(0, memcmp(encoded[2].c_str(), in.c_str()+2*length, length)); + EXPECT_EQ(0, memcmp(encoded[3].c_str(), in.c_str()+3*length, in.length()-3*length)); + } + + // all chunks are available + { + int want_to_decode[] = { 0, 1, 2, 3 }; + map decoded; + EXPECT_EQ(0, clay._decode(set(want_to_decode, want_to_decode+4), + encoded, + &decoded)); + EXPECT_EQ(4u, decoded.size()); + EXPECT_EQ(length, decoded[0].length()); + EXPECT_EQ(0, memcmp(decoded[0].c_str(), encoded[0].c_str(), length)); + EXPECT_EQ(0, memcmp(decoded[1].c_str(), encoded[1].c_str(), length)); + EXPECT_EQ(0, memcmp(decoded[2].c_str(), encoded[2].c_str(), length)); + EXPECT_EQ(0, memcmp(decoded[3].c_str(), encoded[3].c_str(), length)); + } + + // check all three chunks missing possibilities and recover them + for (int i=2; i<7; i++) { + for (int j=1; j degraded = encoded; + degraded.erase(k); + degraded.erase(j); + degraded.erase(i); + EXPECT_EQ(4u, degraded.size()); + int want_to_decode[] = {k,j,i}; + map decoded; + EXPECT_EQ(0, clay._decode(set(want_to_decode, want_to_decode+3), + degraded, + &decoded)); + EXPECT_EQ(7u, decoded.size()); + EXPECT_EQ(length, decoded[j].length()); + EXPECT_EQ(0, memcmp(decoded[k].c_str(), encoded[k].c_str(), length)); + EXPECT_EQ(0, memcmp(decoded[j].c_str(), encoded[j].c_str(), length)); + EXPECT_EQ(0, memcmp(decoded[i].c_str(), encoded[i].c_str(), length)); + } + } + } + //check for all one chunk missing possibilities + int sc_size = length/clay.sub_chunk_no; + int avail[] = {0,1,2,3,4,5,6}; + for (int i=0; i < 7; i++) { + vector> repair_subchunks; + map>> minimum; + set want_to_read; + want_to_read.insert(i); + set available(avail, avail+7); + available.erase(i); + clay.minimum_to_decode(want_to_read, available, &minimum); + map helper; + for (map>>::iterator h=minimum.begin(); h!= minimum.end(); ++h) { + for(vector>::iterator ind=h->second.begin(); ind != h->second.end(); ++ind) { + bufferlist temp; + temp.substr_of(encoded[h->first], ind->first*sc_size, ind->second*sc_size); + helper[h->first].append(temp); + } + } + for (map>>::iterator h=minimum.begin(); h!= minimum.end(); ++h) { + EXPECT_EQ(length/clay.q, helper[h->first].length()); + } + EXPECT_EQ(static_cast(clay.d), helper.size()); + map decoded; + EXPECT_EQ(0, clay.decode(want_to_read, helper, &decoded, length)); + EXPECT_EQ(1u, decoded.size()); + EXPECT_EQ(length, decoded[i].length()); + EXPECT_EQ(0, memcmp(decoded[i].c_str(), encoded[i].c_str(), length)); + } +} + +TEST(ErasureCodeClay, minimum_to_decode) +{ + ErasureCodeClay clay(g_conf().get_val("erasure_code_dir")); + ErasureCodeProfile profile; + profile["k"] = "2"; + profile["m"] = "2"; + EXPECT_EQ(0, clay.init(profile, &cerr)); + + // + // If trying to read nothing, the minimum is empty. + // + { + set want_to_read; + set available_chunks; + set minimum; + + EXPECT_EQ(0, clay._minimum_to_decode(want_to_read, + available_chunks, + &minimum)); + EXPECT_TRUE(minimum.empty()); + } + // + // There is no way to read a chunk if none are available. + // + { + set want_to_read; + set available_chunks; + set minimum; + + want_to_read.insert(0); + + EXPECT_EQ(-EIO, clay._minimum_to_decode(want_to_read, + available_chunks, + &minimum)); + } + // + // Reading a subset of the available chunks is always possible. + // + { + set want_to_read; + set available_chunks; + set minimum; + + want_to_read.insert(0); + available_chunks.insert(0); + + EXPECT_EQ(0, clay._minimum_to_decode(want_to_read, + available_chunks, + &minimum)); + EXPECT_EQ(want_to_read, minimum); + } + // + // There is no way to read a missing chunk if there is less than k + // chunks available. + // + { + set want_to_read; + set available_chunks; + set minimum; + + want_to_read.insert(0); + want_to_read.insert(1); + available_chunks.insert(0); + + EXPECT_EQ(-EIO, clay._minimum_to_decode(want_to_read, + available_chunks, + &minimum)); + } + // + // When chunks are not available, the minimum can be made of any + // chunks. For instance, to read 1 and 3 below the minimum could be + // 2 and 3 which may seem better because it contains one of the + // chunks to be read. But it won't be more efficient than retrieving + // 0 and 2 instead because, in both cases, the decode function will + // need to run the same recovery operation and use the same amount + // of CPU and memory. + // + { + set want_to_read; + set available_chunks; + set minimum; + + want_to_read.insert(1); + want_to_read.insert(3); + available_chunks.insert(0); + available_chunks.insert(2); + available_chunks.insert(3); + + EXPECT_EQ(0, clay._minimum_to_decode(want_to_read, + available_chunks, + &minimum)); + EXPECT_EQ(2u, minimum.size()); + EXPECT_EQ(0u, minimum.count(3)); + } +} + +TEST(ErasureCodeClay, encode) +{ + ErasureCodeClay clay(g_conf().get_val("erasure_code_dir")); + ErasureCodeProfile profile; + profile["k"] = "2"; + profile["m"] = "2"; + EXPECT_EQ(0, clay.init(profile, &cerr)); + + unsigned aligned_object_size = clay.get_alignment() * 2; + { + // + // When the input bufferlist needs to be padded because + // it is not properly aligned, it is padded with zeros. + // + bufferlist in; + map encoded; + int want_to_encode[] = { 0, 1, 2, 3 }; + int trail_length = 1; + in.append(string(aligned_object_size + trail_length, 'X')); + EXPECT_EQ(0, clay.encode(set(want_to_encode, want_to_encode+4), + in, + &encoded)); + EXPECT_EQ(4u, encoded.size()); + char *last_chunk = encoded[1].c_str(); + int length =encoded[1].length(); + EXPECT_EQ('X', last_chunk[0]); + EXPECT_EQ('\0', last_chunk[length - trail_length]); + } + + { + // + // When only the first chunk is required, the encoded map only + // contains the first chunk. Although the clay encode + // internally allocated a buffer because of padding requirements + // and also computes the coding chunks, they are released before + // the return of the method, as shown when running the tests thru + // valgrind (there is no leak). + // + bufferlist in; + map encoded; + set want_to_encode; + want_to_encode.insert(0); + int trail_length = 1; + in.append(string(aligned_object_size + trail_length, 'X')); + EXPECT_EQ(0, clay.encode(want_to_encode, in, &encoded)); + EXPECT_EQ(1u, encoded.size()); + } +} + +TEST(ErasureCodeClay, create_rule) +{ + std::unique_ptr c = std::make_unique(); + c->create(); + int root_type = 2; + c->set_type_name(root_type, "root"); + int host_type = 1; + c->set_type_name(host_type, "host"); + int osd_type = 0; + c->set_type_name(osd_type, "osd"); + + int rootno; + c->add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_RJENKINS1, + root_type, 0, NULL, NULL, &rootno); + c->set_item_name(rootno, "default"); + + map loc; + loc["root"] = "default"; + + int num_host = 4; + int num_osd = 5; + int osd = 0; + for (int h=0; hinsert_item(g_ceph_context, osd, 1.0, string("osd.") + stringify(osd), loc); + } + } + + c->finalize(); + + { + stringstream ss; + ErasureCodeClay clay(g_conf().get_val("erasure_code_dir")); + ErasureCodeProfile profile; + profile["k"] = "2"; + profile["m"] = "2"; + EXPECT_EQ(0, clay.init(profile, &cerr)); + int ruleset = clay.create_rule("myrule", *c, &ss); + EXPECT_EQ(0, ruleset); + EXPECT_EQ(-EEXIST, clay.create_rule("myrule", *c, &ss)); + // + // the minimum that is expected from the created ruleset is to + // successfully map get_chunk_count() devices from the crushmap, + // at least once. + // + vector<__u32> weight(c->get_max_devices(), 0x10000); + vector out; + int x = 0; + c->do_rule(ruleset, x, out, clay.get_chunk_count(), weight, 0); + ASSERT_EQ(out.size(), clay.get_chunk_count()); + for (unsigned i=0; i("erasure_code_dir")); + ErasureCodeProfile profile; + profile["k"] = "2"; + profile["m"] = "2"; + profile["crush-root"] = "BAD"; + EXPECT_EQ(0, clay.init(profile, &cerr)); + EXPECT_EQ(-ENOENT, clay.create_rule("otherrule", *c, &ss)); + EXPECT_EQ("root item BAD does not exist", ss.str()); + } + { + stringstream ss; + ErasureCodeClay clay(g_conf().get_val("erasure_code_dir")); + ErasureCodeProfile profile; + profile["k"] = "2"; + profile["m"] = "2"; + profile["crush-failure-domain"] = "WORSE"; + EXPECT_EQ(0, clay.init(profile, &cerr)); + EXPECT_EQ(-EINVAL, clay.create_rule("otherrule", *c, &ss)); + EXPECT_EQ("unknown type WORSE", ss.str()); + } +} + +/* + * Local Variables: + * compile-command: "cd ../.. ; + * make -j4 unittest_erasure_code_clay && + * valgrind --tool=memcheck \ + * ./unittest_erasure_code_clay \ + * --gtest_filter=*.* --log-to-stderr=true --debug-osd=20" + * End: + */ diff --git a/src/test/erasure-code/TestErasureCodePluginClay.cc b/src/test/erasure-code/TestErasureCodePluginClay.cc new file mode 100644 index 00000000000..6885db324e8 --- /dev/null +++ b/src/test/erasure-code/TestErasureCodePluginClay.cc @@ -0,0 +1,113 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph distributed storage system + * + * Copyright (C) 2013,2014 Cloudwatt + * Copyright (C) 2014 Red Hat + * + * Author: Loic Dachary + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include "erasure-code/ErasureCodePlugin.h" +#include "log/Log.h" +#include "global/global_context.h" +#include "common/config_proxy.h" +#include "gtest/gtest.h" + +TEST(ErasureCodePlugin, factory) +{ + ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance(); + ErasureCodeProfile profile; + { + ErasureCodeInterfaceRef erasure_code; + EXPECT_FALSE(erasure_code); + EXPECT_EQ(0, instance.factory("clay", + g_conf().get_val("erasure_code_dir"), + profile, + &erasure_code, &cerr)); + EXPECT_TRUE(erasure_code); + } + //check clay plugin with scalar_mds=jerasure + { + const char *techniques[] = { + "reed_sol_van", + "reed_sol_r6_op", + "cauchy_orig", + "cauchy_good", + "liber8tion", + 0 + }; + for(const char **technique = techniques; *technique; technique++) { + ErasureCodeInterfaceRef erasure_code; + ErasureCodeProfile profile; + profile["scalar_mds"] = "jerasure"; + profile["technique"] = *technique; + EXPECT_FALSE(erasure_code); + EXPECT_EQ(0, instance.factory("clay", + g_conf().get_val("erasure_code_dir"), + profile, + &erasure_code, &cerr)); + EXPECT_TRUE(erasure_code.get()); + } + } +#ifdef HAVE_BETTER_YASM_ELF64 + //check clay plugin with scalar_mds=isa + { + const char *techniques[] = { + "reed_sol_van", + "cauchy", + 0 + }; + for(const char **technique = techniques; *technique; technique++) { + ErasureCodeInterfaceRef erasure_code; + ErasureCodeProfile profile; + profile["scalar_mds"] = "isa"; + profile["technique"] = *technique; + EXPECT_FALSE(erasure_code); + EXPECT_EQ(0, instance.factory("clay", + g_conf().get_val("erasure_code_dir"), + profile, + &erasure_code, &cerr)); + EXPECT_TRUE(erasure_code.get()); + } + } +#endif + //check clay plugin with scalar_mds=shec + { + const char *techniques[] = { + "single", + "multiple", + 0 + }; + for(const char **technique = techniques; *technique; technique++) { + ErasureCodeInterfaceRef erasure_code; + ErasureCodeProfile profile; + profile["scalar_mds"] = "shec"; + profile["technique"] = *technique; + EXPECT_FALSE(erasure_code); + EXPECT_EQ(0, instance.factory("clay", + g_conf().get_val("erasure_code_dir"), + profile, + &erasure_code, &cerr)); + EXPECT_TRUE(erasure_code.get()); + } + } +} + +/* + * Local Variables: + * compile-command: "cd ../.. ; make -j4 && + * make unittest_erasure_code_plugin_clay && + * valgrind --tool=memcheck ./unittest_erasure_code_plugin_clay \ + * --gtest_filter=*.* --log-to-stderr=true --debug-osd=20" + * End: + */ -- 2.39.5