]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
erasure-code: add clay codes
authorMyna V <mynaramana@gmail.com>
Thu, 6 Sep 2018 16:23:28 +0000 (11:23 -0500)
committerKefu Chai <kchai@redhat.com>
Thu, 27 Sep 2018 10:56:21 +0000 (18:56 +0800)
Introducing Clay codes to ceph. These codes are optimal
in terms of network, disk traffic utilized during repair of a lost chunk.
This feature also provides gradual increase in network, disk traffic with
respect to the number of lost chunks.

Authors: Myna, Elita.
Fixes: http://tracker.ceph.com/issues/19278
Signed-off-by: Myna V <mynaramana@gmail.com>
src/erasure-code/CMakeLists.txt
src/erasure-code/ErasureCode.cc
src/erasure-code/ErasureCode.h
src/erasure-code/clay/CMakeLists.txt [new file with mode: 0644]
src/erasure-code/clay/ErasureCodeClay.cc [new file with mode: 0644]
src/erasure-code/clay/ErasureCodeClay.h [new file with mode: 0644]
src/erasure-code/clay/ErasureCodePluginClay.cc [new file with mode: 0644]
src/erasure-code/clay/ErasureCodePluginClay.h [new file with mode: 0644]
src/test/erasure-code/CMakeLists.txt
src/test/erasure-code/TestErasureCodeClay.cc [new file with mode: 0644]
src/test/erasure-code/TestErasureCodePluginClay.cc [new file with mode: 0644]

index 9bc62129458821e9a2f60ebbb87442571963701a..0c9832ebf2e393cc8605659236b2b5c15a4d350f 100644 (file)
@@ -20,6 +20,7 @@ endif()
 add_subdirectory(jerasure)
 add_subdirectory(lrc)
 add_subdirectory(shec)
+add_subdirectory(clay)
 
 if (HAVE_BETTER_YASM_ELF64)
   add_subdirectory(isa)
@@ -35,4 +36,5 @@ add_custom_target(erasure_code_plugins DEPENDS
     ${EC_ISA_LIB}
     ec_lrc
     ec_jerasure
-    ec_shec)
+    ec_shec
+    ec_clay)
index 22f9cd48d204cc1fe2916b128d8c820caccada6f..8ca3262dc22a6e8d17df33a4f4eda8da65fd9294 100644 (file)
 #include "include/buffer.h"
 #include "crush/CrushWrapper.h"
 #include "osd/osd_types.h"
-
-
-const unsigned ErasureCode::SIMD_ALIGN = 32;
+#include "common/debug.h"
 
 #define DEFAULT_RULE_ROOT "default"
 #define DEFAULT_RULE_FAILURE_DOMAIN "host"
 
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_osd 
+#undef dout_prefix
+#define dout_prefix _prefix(_dout)
+
+using namespace std;
+
+static ostream& _prefix(std::ostream* _dout)
+{
+  return *_dout << "ErasureCode: ";
+}
+
+const unsigned ErasureCode::SIMD_ALIGN = 32;
+
 int ErasureCode::init(
   ErasureCodeProfile &profile,
   std::ostream *ss)
index 06a3d7a5166883b81f2f1caec62154f93ea43631..d3c5badb1f1626678255f5e09892531838c37b33 100644 (file)
@@ -66,7 +66,7 @@ namespace ceph {
 
     int minimum_to_decode(const std::set<int> &want_to_read,
                          const std::set<int> &available,
-                         std::map<int, std::vector<std::pair<int, int>>> *minimum) final override;
+                         std::map<int, std::vector<std::pair<int, int>>> *minimum) override;
 
     int minimum_to_decode_with_cost(const std::set<int> &want_to_read,
                                             const std::map<int, int> &available,
@@ -84,7 +84,7 @@ namespace ceph {
 
     int decode(const std::set<int> &want_to_read,
                 const std::map<int, bufferlist> &chunks,
-                std::map<int, bufferlist> *decoded, int chunk_size) override final;
+                std::map<int, bufferlist> *decoded, int chunk_size) override;
 
     virtual int _decode(const std::set<int> &want_to_read,
                        const std::map<int, bufferlist> &chunks,
diff --git a/src/erasure-code/clay/CMakeLists.txt b/src/erasure-code/clay/CMakeLists.txt
new file mode 100644 (file)
index 0000000..c5af1e2
--- /dev/null
@@ -0,0 +1,14 @@
+# clay plugin
+
+set(clay_srcs
+  ErasureCodePluginClay.cc
+  ErasureCodeClay.cc
+  $<TARGET_OBJECTS:erasure_code_objs>
+  $<TARGET_OBJECTS:crush_objs>
+  ${CMAKE_SOURCE_DIR}/src/common/str_map.cc
+)
+
+add_library(ec_clay SHARED ${clay_srcs})
+set_target_properties(ec_clay PROPERTIES
+  INSTALL_RPATH "")
+install(TARGETS ec_clay DESTINATION ${erasure_plugin_dir})
diff --git a/src/erasure-code/clay/ErasureCodeClay.cc b/src/erasure-code/clay/ErasureCodeClay.cc
new file mode 100644 (file)
index 0000000..d4f25e3
--- /dev/null
@@ -0,0 +1,900 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Myna Vajha <mynaramana@gmail.com>
+ *
+ * Author: Myna Vajha <mynaramana@gmail.com>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <errno.h>
+#include <algorithm>
+
+#include "ErasureCodeClay.h"
+
+#include "common/debug.h"
+#include "erasure-code/ErasureCodePlugin.h"
+#include "include/ceph_assert.h"
+#include "include/str_map.h"
+#include "include/stringify.h"
+#include "osd/osd_types.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout)
+
+#define LARGEST_VECTOR_WORDSIZE 16
+#define talloc(type, num) (type *) malloc(sizeof(type)*(num))
+
+using namespace std;
+
+static ostream& _prefix(std::ostream* _dout)
+{
+  return *_dout << "ErasureCodeClay: ";
+}
+
+static int pow_int(int a, int x) {
+  int power = 1;
+  while (x) {
+    if (x & 1) power *= a;
+    x /= 2;
+    a *= a;
+  }
+  return power;
+}
+
+ErasureCodeClay::~ErasureCodeClay()
+{
+  for (int i = 0; i < q*t; i++) {
+    if (U_buf[i].length() != 0) U_buf[i].clear();
+  }
+}
+
+int ErasureCodeClay::init(ErasureCodeProfile &profile,
+                         ostream *ss)
+{
+  int r;
+  r = parse(profile, ss);
+  if (r)
+    return r;
+
+  r = ErasureCode::init(profile, ss);
+  if (r)
+    return r;
+  ErasureCodePluginRegistry &registry = ErasureCodePluginRegistry::instance();
+  r = registry.factory(mds.profile["plugin"],
+                      directory,
+                      mds.profile,
+                      &mds.erasure_code,
+                      ss);
+  if (r)
+    return r;
+  r = registry.factory(pft.profile["plugin"],
+                      directory,
+                      pft.profile,
+                      &pft.erasure_code,
+                      ss);
+  return r;
+
+}
+
+unsigned int ErasureCodeClay::get_chunk_size(unsigned int object_size) const
+{
+  unsigned alignment = get_alignment();
+  unsigned tail = object_size % alignment;
+  unsigned padded_length = object_size + ( tail ?  ( alignment - tail ) : 0 );
+
+  ceph_assert(padded_length % (k*sub_chunk_no) == 0);
+
+  return padded_length / k;
+}
+
+int ErasureCodeClay::minimum_to_decode(const set<int> &want_to_read,
+                                      const set<int> &available,
+                                      map<int, vector<pair<int, int>>> *minimum)
+{
+  if (is_repair(want_to_read, available)) {
+    return minimum_to_repair(want_to_read, available, minimum);
+  } else {
+    return ErasureCode::minimum_to_decode(want_to_read, available, minimum);
+  }
+}
+
+int ErasureCodeClay::decode(const set<int> &want_to_read,
+                           const map<int, bufferlist> &chunks,
+                           map<int, bufferlist> *decoded, int chunk_size)
+{
+  set<int> avail;
+  for ([[maybe_unused]] auto& [node, bl] : chunks) {
+    avail.insert(node);
+  }
+
+  if (is_repair(want_to_read, avail)) {
+    return repair(want_to_read, chunks, decoded, chunk_size);
+  } else {
+    return ErasureCode::_decode(want_to_read, chunks, decoded);
+  }
+}
+
+void p(const set<int> &s) { cerr << s; } // for gdb
+
+int ErasureCodeClay::encode_chunks(const set<int> &want_to_encode,
+                                  map<int, bufferlist> *encoded)
+{
+  map<int, bufferlist> chunks;
+  set<int> parity_chunks;
+  int chunk_size = (*encoded)[0].length();
+
+  for (int i = 0; i < k + m; i++) {
+    if (i < k) {
+      chunks[i] = (*encoded)[i];
+    } else {
+      chunks[i+nu] = (*encoded)[i];
+      parity_chunks.insert(i+nu);
+    }
+  }
+
+  for (int i = k; i < k + nu; i++) {
+    bufferptr buf(buffer::create_aligned(chunk_size, SIMD_ALIGN));
+    buf.zero();
+    chunks[i].push_back(std::move(buf));
+  }
+
+  int res = decode_layered(parity_chunks, &chunks);
+  for (int i = k ; i < k + nu; i++) {
+    // need to clean some of the intermediate chunks here!!
+    chunks[i].clear();
+  }
+  return res;
+}
+
+int ErasureCodeClay::decode_chunks(const set<int> &want_to_read,
+                                  const map<int, bufferlist> &chunks,
+                                  map<int, bufferlist> *decoded)
+{
+  set<int> erasures;
+  map<int, bufferlist> coded_chunks;
+
+  for (int i = 0; i < k + m; i++) {
+    if (chunks.count(i) == 0) {
+      erasures.insert(i < k ? i : i+nu);
+    }
+    ceph_assert(decoded->count(i) > 0);
+    coded_chunks[i < k ? i : i+nu] = (*decoded)[i];
+  }
+  int chunk_size = coded_chunks[0].length();
+
+  for (int i = k; i < k+nu; i++) {
+    bufferptr buf(buffer::create_aligned(chunk_size, SIMD_ALIGN));
+    buf.zero();
+    coded_chunks[i].push_back(std::move(buf));
+  }
+
+  int res = decode_layered(erasures, &coded_chunks);
+  for (int i = k; i < k+nu; i++) {
+    coded_chunks[i].clear();
+  }
+  return res;
+}
+
+unsigned int ErasureCodeClay::get_alignment() const
+{
+  unsigned alignment = k * sub_chunk_no * w * sizeof(int);
+  if ((w * sizeof(int)) % LARGEST_VECTOR_WORDSIZE) {
+    alignment = k * sub_chunk_no * w * LARGEST_VECTOR_WORDSIZE;
+  }
+  return alignment;
+}
+
+int ErasureCodeClay::parse(ErasureCodeProfile &profile,
+                          ostream *ss)
+{
+  int err = 0;
+  err = ErasureCode::parse(profile, ss);
+  err |= to_int("k", profile, &k, DEFAULT_K, ss);
+  err |= to_int("m", profile, &m, DEFAULT_M, ss);
+
+  err |= sanity_check_k(k, ss);
+
+  err |= to_int("d", profile, &d, std::to_string(k+m-1), ss);
+
+  // check for scalar_mds in profile input
+  if (profile.find("scalar_mds") == profile.end() ||
+      profile.find("scalar_mds")->second.empty()) {
+    mds.profile["plugin"] = "jerasure";
+    pft.profile["plugin"] = "jerasure";
+  } else {
+    std::string p = profile.find("scalar_mds")->second;
+    if ((p == "jerasure") || (p == "isa") || (p == "shec")) {
+      mds.profile["plugin"] = p;
+      pft.profile["plugin"] = p;
+    } else {
+        *ss << "scalar_mds " << mds.profile["plugin"] <<
+               "is not currently supported, use one of 'jerasure',"<<
+               " 'isa', 'shec'" << std::endl;
+        err = -EINVAL;
+        return err;
+    }
+  }
+
+  if (profile.find("technique") == profile.end() ||
+      profile.find("technique")->second.empty()) {
+    if ((mds.profile["plugin"]=="jerasure") || (mds.profile["plugin"]=="isa") ) {
+      mds.profile["technique"] = "reed_sol_van";
+      pft.profile["technique"] = "reed_sol_van";
+    } else {
+      mds.profile["technique"] = "single";
+      pft.profile["technique"] = "single";
+    }
+  } else {
+    std::string p = profile.find("technique")->second;
+    if (mds.profile["plugin"] == "jerasure") {
+      if ( (p == "reed_sol_van") || (p == "reed_sol_r6_op") || (p == "cauchy_orig")
+           || (p == "cauchy_good") || (p == "liber8tion")) {
+        mds.profile["technique"] = p;
+        pft.profile["technique"] = p;
+      } else {
+        *ss << "technique " << p << "is not currently supported, use one of "
+           << "reed_sol_van', 'reed_sol_r6_op','cauchy_orig',"
+           << "'cauchy_good','liber8tion'"<< std::endl;
+        err = -EINVAL;
+        return err;
+      }
+    } else if (mds.profile["plugin"] == "isa") {
+      if ( (p == "reed_sol_van") || (p == "cauchy")) {
+        mds.profile["technique"] = p;
+        pft.profile["technique"] = p;
+      } else {
+        *ss << "technique " << p << "is not currently supported, use one of"
+           << "'reed_sol_van','cauchy'"<< std::endl;
+        err = -EINVAL;
+        return err;
+      }
+    } else {
+      if ( (p == "single") || (p == "multiple")) {
+        mds.profile["technique"] = p;
+        pft.profile["technique"] = p;
+      } else {
+        *ss << "technique " << p << "is not currently supported, use one of"<<
+               "'single','multiple'"<< std::endl;
+        err = -EINVAL;
+        return err;
+      }
+    }
+  }
+  if ((d < k) || (d > k + m - 1)) {
+    *ss << "value of d " << d
+        << " must be within [ " << k << "," << k+m-1 << "]" << std::endl;
+    err = -EINVAL;
+    return err;
+  }
+
+  q = d - k + 1;
+  if ((k + m) % q) {
+    nu = q - (k + m) % q;
+  } else {
+    nu = 0;
+  }
+
+  if (k+m+nu > 254) {
+    err = -EINVAL;
+    return err;
+  }
+
+  if (mds.profile["plugin"] == "shec") {
+    mds.profile["c"] = '2';
+    pft.profile["c"] = '2';
+  }
+  mds.profile["k"] = std::to_string(k+nu);
+  mds.profile["m"] = std::to_string(m);
+  mds.profile["w"] = '8';
+
+  pft.profile["k"] = '2';
+  pft.profile["m"] = '2';
+  pft.profile["w"] = '8';
+
+  t = (k + m + nu) / q;
+  sub_chunk_no = pow_int(q, t);
+
+  dout(10) << __func__
+          << " (q,t,nu)=(" << q << "," << t << "," << nu <<")" << dendl;
+
+  return err;
+}
+
+int ErasureCodeClay::is_repair(const set<int> &want_to_read,
+                              const set<int> &available_chunks) {
+
+  if (includes(available_chunks.begin(), available_chunks.end(),
+               want_to_read.begin(), want_to_read.end())) return 0;
+  if (want_to_read.size() > 1) return 0;
+
+  int i = *want_to_read.begin();
+  int lost_node_id = (i < k) ? i: i+nu;
+  for (int x = 0; x < q; x++) {
+    int node = (lost_node_id/q)*q+x;
+    node = (node < k) ? node : node-nu;
+    if (node != i) { // node in the same group other than erased node
+      if (available_chunks.count(node) == 0) return 0;
+    }
+  }
+
+  if (available_chunks.size() < (unsigned)d) return 0;
+  return 1;
+}
+
+int ErasureCodeClay::minimum_to_repair(const set<int> &want_to_read,
+                                      const set<int> &available_chunks,
+                                      map<int, vector<pair<int, int>>> *minimum)
+{
+  int i = *want_to_read.begin();
+  int lost_node_index = (i < k) ? i : i+nu;
+  int rep_node_index = 0;
+
+  // add all the nodes in lost node's y column.
+  vector<pair<int, int>> sub_chunk_ind;
+  get_repair_subchunks(lost_node_index, sub_chunk_ind);
+  if ((available_chunks.size() >= (unsigned)d)) {
+    for (int j = 0; j < q; j++) {
+      if (j != lost_node_index%q) {
+        rep_node_index = (lost_node_index/q)*q+j;
+        if (rep_node_index < k) {
+          minimum->insert(make_pair(rep_node_index, sub_chunk_ind));
+        } else if (rep_node_index >= k+nu) {
+          minimum->insert(make_pair(rep_node_index-nu, sub_chunk_ind));
+        }
+      }
+    }
+    for (auto chunk : available_chunks) {
+      if (minimum->size() >= (unsigned)d) {
+       break;
+      }
+      if (!minimum->count(chunk)) {
+       minimum->emplace(chunk, sub_chunk_ind);
+      }
+    }
+  } else {
+    dout(0) << "minimum_to_repair: shouldn't have come here" << dendl;
+    ceph_assert(0);
+  }
+  ceph_assert(minimum->size() == (unsigned)d);
+  return 0;
+}
+
+void ErasureCodeClay::get_repair_subchunks(const int &lost_node,
+                                          vector<pair<int, int>> &repair_sub_chunks_ind)
+{
+  const int y_lost = lost_node / q;
+  const int x_lost = lost_node % q;
+
+  const int seq_sc_count = pow_int(q, t-1-y_lost);
+  const int num_seq = pow_int(q, y_lost);
+
+  int index = x_lost * seq_sc_count;
+  for (int ind_seq = 0; ind_seq < num_seq; ind_seq++) {
+    repair_sub_chunks_ind.push_back(make_pair(index, seq_sc_count));
+    index += q * seq_sc_count;
+  }
+}
+
+int ErasureCodeClay::get_repair_sub_chunk_count(const set<int> &want_to_read)
+{
+  int weight_vector[t] = {0};
+  for (auto to_read : want_to_read) {
+    weight_vector[to_read / q]++;
+  }
+
+  int repair_subchunks_count = 1;
+  for (int y = 0; y < t; y++) {
+    repair_subchunks_count = repair_subchunks_count*(q-weight_vector[y]);
+  }
+
+  return sub_chunk_no - repair_subchunks_count;
+}
+
+int ErasureCodeClay::repair(const set<int> &want_to_read,
+                           const map<int, bufferlist> &chunks,
+                           map<int, bufferlist> *repaired, int chunk_size)
+{
+
+  ceph_assert((want_to_read.size() == 1) && (chunks.size() == (unsigned)d));
+
+  int repair_sub_chunk_no = get_repair_sub_chunk_count(want_to_read);
+  vector<pair<int, int>> repair_sub_chunks_ind;
+
+  unsigned repair_blocksize = chunks.begin()->second.length();
+  assert(repair_blocksize%repair_sub_chunk_no == 0);
+
+  unsigned sub_chunksize = repair_blocksize/repair_sub_chunk_no;
+  unsigned chunksize = sub_chunk_no*sub_chunksize;
+
+  ceph_assert(chunksize == (unsigned)chunk_size);
+
+  map<int, bufferlist> recovered_data;
+  map<int, bufferlist> helper_data;
+  set<int> aloof_nodes;
+
+  for (int i =  0; i < k + m; i++) {
+    // included helper data only for d+nu nodes.
+    if (auto found = chunks.find(i); found != chunks.end()) { // i is a helper
+      if (i<k) {
+       helper_data[i] = found->second;
+      } else {
+       helper_data[i+nu] = found->second;
+      }
+    } else {
+      if (i != *want_to_read.begin()) { // aloof node case.
+        int aloof_node_id = (i < k) ? i: i+nu;
+        aloof_nodes.insert(aloof_node_id);
+      } else {
+        bufferptr ptr(buffer::create_aligned(chunksize, SIMD_ALIGN));
+       ptr.zero();
+        int lost_node_id = (i < k) ? i : i+nu;
+        (*repaired)[i].push_back(ptr);
+        recovered_data[lost_node_id] = (*repaired)[i];
+        get_repair_subchunks(lost_node_id, repair_sub_chunks_ind);
+      }
+    }
+  }
+
+  // this is for shortened codes i.e., when nu > 0
+  for (int i=k; i < k+nu; i++) {
+    bufferptr ptr(buffer::create_aligned(repair_blocksize, SIMD_ALIGN));
+    ptr.zero();
+    helper_data[i].push_back(ptr);
+  }
+
+  ceph_assert(helper_data.size()+aloof_nodes.size()+recovered_data.size() ==
+             (unsigned) q*t);
+
+  int r = repair_one_lost_chunk(recovered_data, aloof_nodes,
+                               helper_data, repair_blocksize,
+                               repair_sub_chunks_ind);
+
+  // clear buffers created for the purpose of shortening
+  for (int i = k; i < k+nu; i++) {
+    helper_data[i].clear();
+  }
+
+  return r;
+}
+
+int ErasureCodeClay::repair_one_lost_chunk(map<int, bufferlist> &recovered_data,
+                                          set<int> &aloof_nodes,
+                                          map<int, bufferlist> &helper_data,
+                                          int repair_blocksize,
+                                          vector<pair<int,int>> &repair_sub_chunks_ind)
+{
+  unsigned repair_subchunks = (unsigned)sub_chunk_no / q;
+  unsigned sub_chunksize = repair_blocksize / repair_subchunks;
+
+  int z_vec[t];
+  map<int, set<int> > ordered_planes;
+  map<int, int> repair_plane_to_ind;
+  int count_retrieved_sub_chunks = 0;
+  int plane_ind = 0;
+
+  bufferptr buf(buffer::create_aligned(sub_chunksize, SIMD_ALIGN));
+  bufferlist temp_buf;
+  temp_buf.push_back(buf);
+
+  for (auto [index,count] : repair_sub_chunks_ind) {
+    for (int j = index; j < index + count; j++) {
+      get_plane_vector(j, z_vec);
+      int order = 0;
+      // check across all erasures and aloof nodes
+      for ([[maybe_unused]] auto& [node, bl] : recovered_data) {
+        if (node % q == z_vec[node / q]) order++;
+      }
+      for (auto node : aloof_nodes) {
+        if (node % q == z_vec[node / q]) order++;
+      }
+      ceph_assert(order > 0);
+      ordered_planes[order].insert(j);
+      // to keep track of a sub chunk within helper buffer recieved
+      repair_plane_to_ind[j] = plane_ind;
+      plane_ind++;
+    }
+  }
+  assert((unsigned)plane_ind == repair_subchunks);
+
+  int plane_count = 0;
+  for (int i = 0; i < q*t; i++) {
+    if (U_buf[i].length() == 0) {
+      bufferptr buf(buffer::create_aligned(sub_chunk_no*sub_chunksize, SIMD_ALIGN));
+      buf.zero();
+      U_buf[i].push_back(std::move(buf));
+    }
+  }
+
+  int lost_chunk;
+  int count = 0;
+  for ([[maybe_unused]] auto& [node, bl] : recovered_data) {
+    lost_chunk = node;
+    count++;
+  }
+  ceph_assert(count == 1);
+
+  set<int> erasures;
+  for (int i = 0; i < q; i++) {
+    erasures.insert(lost_chunk - lost_chunk % q + i);
+  }
+  for (auto node : aloof_nodes) {
+    erasures.insert(node);
+  }
+
+  for (int order = 1; ;order++) {
+    if (ordered_planes.count(order) == 0) {
+      break;
+    }
+    plane_count += ordered_planes[order].size();
+    for (auto z : ordered_planes[order]) {
+      get_plane_vector(z, z_vec);
+
+      for (int y = 0; y < t; y++) {
+       for (int x = 0; x < q; x++) {
+         int node_xy = y*q + x;
+         map<int, bufferlist> known_subchunks;
+         map<int, bufferlist> pftsubchunks;
+         set<int> pft_erasures;
+         if (erasures.count(node_xy) == 0) {
+           assert(helper_data.count(node_xy) > 0);
+           int z_sw = z + (x - z_vec[y])*pow_int(q,t-1-y);
+           int node_sw = y*q + z_vec[y];
+           int i0 = 0, i1 = 1, i2 = 2, i3 = 3;
+           if (z_vec[y] > x) {
+             i0 = 1;
+             i1 = 0;
+             i2 = 3;
+             i3 = 2;
+           }
+           if (aloof_nodes.count(node_sw) > 0) {
+             assert(repair_plane_to_ind.count(z) > 0);
+             assert(repair_plane_to_ind.count(z_sw) > 0);
+             pft_erasures.insert(i2);
+             known_subchunks[i0].substr_of(helper_data[node_xy], repair_plane_to_ind[z]*sub_chunksize, sub_chunksize);
+             known_subchunks[i3].substr_of(U_buf[node_sw], z_sw*sub_chunksize, sub_chunksize);
+             pftsubchunks[i0] = known_subchunks[i0];
+             pftsubchunks[i1] = temp_buf;
+             pftsubchunks[i2].substr_of(U_buf[node_xy], z*sub_chunksize, sub_chunksize);
+             pftsubchunks[i3] = known_subchunks[i3];
+             for (int i=0; i<3; i++) {
+               pftsubchunks[i].rebuild_aligned(SIMD_ALIGN);
+             }
+             pft.erasure_code->decode_chunks(pft_erasures, known_subchunks, &pftsubchunks);
+           } else {
+             ceph_assert(helper_data.count(node_sw) > 0);
+             ceph_assert(repair_plane_to_ind.count(z) > 0);
+             if (z_vec[y] != x){
+               pft_erasures.insert(i2);
+               ceph_assert(repair_plane_to_ind.count(z_sw) > 0);
+               known_subchunks[i0].substr_of(helper_data[node_xy], repair_plane_to_ind[z]*sub_chunksize, sub_chunksize);
+               known_subchunks[i1].substr_of(helper_data[node_sw], repair_plane_to_ind[z_sw]*sub_chunksize, sub_chunksize);
+               pftsubchunks[i0] = known_subchunks[i0];
+               pftsubchunks[i1] = known_subchunks[i1];
+               pftsubchunks[i2].substr_of(U_buf[node_xy], z*sub_chunksize, sub_chunksize);
+               pftsubchunks[i3].substr_of(temp_buf, 0, sub_chunksize);
+               for (int i=0; i<3; i++) {
+                 pftsubchunks[i].rebuild_aligned(SIMD_ALIGN);
+               }
+               pft.erasure_code->decode_chunks(pft_erasures, known_subchunks, &pftsubchunks);
+             } else {
+               char* uncoupled_chunk = U_buf[node_xy].c_str();
+               char* coupled_chunk = helper_data[node_xy].c_str();
+               memcpy(&uncoupled_chunk[z*sub_chunksize],
+                      &coupled_chunk[repair_plane_to_ind[z]*sub_chunksize],
+                      sub_chunksize);
+             }
+           }
+         }
+       } // x
+      } // y
+      ceph_assert(erasures.size() <= (unsigned)m);
+      decode_uncoupled(erasures, z, sub_chunksize);
+
+      for (auto i : erasures) {
+       int x = i % q;
+       int y = i / q;
+       int node_sw = y*q+z_vec[y];
+       int z_sw = z + (x - z_vec[y]) * pow_int(q,t-1-y);
+       set<int> pft_erasures;
+       map<int, bufferlist> known_subchunks;
+       map<int, bufferlist> pftsubchunks;
+       int i0 = 0, i1 = 1, i2 = 2, i3 = 3;
+       if (z_vec[y] > x) {
+         i0 = 1;
+         i1 = 0;
+         i2 = 3;
+         i3 = 2;
+       }
+       // make sure it is not an aloof node before you retrieve repaired_data
+       if (aloof_nodes.count(i) == 0) {
+         if (x == z_vec[y]) { // hole-dot pair (type 0)
+           char* coupled_chunk = recovered_data[i].c_str();
+           char* uncoupled_chunk = U_buf[i].c_str();
+           memcpy(&coupled_chunk[z*sub_chunksize],
+                  &uncoupled_chunk[z*sub_chunksize],
+                  sub_chunksize);
+           count_retrieved_sub_chunks++;
+         } else {
+           ceph_assert(y == lost_chunk / q);
+           ceph_assert(node_sw == lost_chunk);
+           ceph_assert(helper_data.count(i) > 0);
+           pft_erasures.insert(i1);
+           known_subchunks[i0].substr_of(helper_data[i], repair_plane_to_ind[z]*sub_chunksize, sub_chunksize);
+           known_subchunks[i2].substr_of(U_buf[i], z*sub_chunksize, sub_chunksize);
+
+           pftsubchunks[i0] = known_subchunks[i0];
+           pftsubchunks[i1].substr_of(recovered_data[node_sw], z_sw*sub_chunksize, sub_chunksize);
+           pftsubchunks[i2] = known_subchunks[i2];
+           pftsubchunks[i3] = temp_buf;
+           for (int i=0; i<3; i++) {
+             pftsubchunks[i].rebuild_aligned(SIMD_ALIGN);
+           }
+           pft.erasure_code->decode_chunks(pft_erasures, known_subchunks, &pftsubchunks);
+         }
+       }
+      } // recover all erasures
+    } // planes of particular order
+  } // order
+
+  return 0;
+}
+
+
+int ErasureCodeClay::decode_layered(set<int> &erased_chunks,
+                                    map<int, bufferlist> *chunks)
+{
+  int num_erasures = erased_chunks.size();
+
+  int size = (*chunks)[0].length();
+  ceph_assert(size%sub_chunk_no == 0);
+  int sc_size = size / sub_chunk_no;
+
+  ceph_assert(num_erasures > 0);
+
+  for (int i = k+nu; (num_erasures < m) && (i < q*t); i++) {
+    if ([[maybe_unused]] auto [it, added] = erased_chunks.emplace(i); added) {
+      num_erasures++;
+    }
+  }
+  ceph_assert(num_erasures == m);
+
+  int max_iscore = get_max_iscore(erased_chunks);
+  int order[sub_chunk_no];
+  int z_vec[t];
+  for (int i = 0; i < q*t; i++) {
+    if (U_buf[i].length() == 0) {
+      bufferptr buf(buffer::create_aligned(size, SIMD_ALIGN));
+      buf.zero();
+      U_buf[i].push_back(std::move(buf));
+    }
+  }
+
+  set_planes_sequential_decoding_order(order, erased_chunks);
+
+  for (int iscore = 0; iscore <= max_iscore; iscore++) {
+   for (int z = 0; z < sub_chunk_no; z++) {
+      if (order[z] == iscore) {
+        decode_erasures(erased_chunks, z, chunks, sc_size);
+      }
+    }
+
+    for (int z = 0; z < sub_chunk_no; z++) {
+      if (order[z] == iscore) {
+       get_plane_vector(z, z_vec);
+        for (auto node_xy : erased_chunks) {
+          int x = node_xy % q;
+          int y = node_xy / q;
+         int node_sw = y*q+z_vec[y];
+          if (z_vec[y] != x) {
+            if (erased_chunks.count(node_sw) == 0) {
+             recover_type1_erasure(chunks, x, y, z, z_vec, sc_size);
+           } else if (z_vec[y] < x){
+              ceph_assert(erased_chunks.count(node_sw) > 0);
+              ceph_assert(z_vec[y] != x);
+              get_coupled_from_uncoupled(chunks, x, y, z, z_vec, sc_size);
+           }
+         } else {
+           char* C = (*chunks)[node_xy].c_str();
+            char* U = U_buf[node_xy].c_str();
+            memcpy(&C[z*sc_size], &U[z*sc_size], sc_size);
+          }
+        }
+      }
+    } // plane
+  } // iscore, order
+
+  return 0;
+}
+
+int ErasureCodeClay::decode_erasures(const set<int>& erased_chunks, int z,
+                                    map<int, bufferlist>* chunks, int sc_size)
+{
+  int z_vec[t];
+
+  get_plane_vector(z, z_vec);
+
+  for (int x = 0; x < q; x++) {
+    for (int y = 0; y < t; y++) {
+      int node_xy = q*y+x;
+      int node_sw = q*y+z_vec[y];
+      if (erased_chunks.count(node_xy) == 0) {
+       if (z_vec[y] < x) {
+         get_uncoupled_from_coupled(chunks, x, y, z, z_vec, sc_size);
+       } else if (z_vec[y] == x) {
+         char* uncoupled_chunk = U_buf[node_xy].c_str();
+         char* coupled_chunk = (*chunks)[node_xy].c_str();
+          memcpy(&uncoupled_chunk[z*sc_size], &coupled_chunk[z*sc_size], sc_size);
+        } else {
+          if (erased_chunks.count(node_sw) > 0) {
+            get_uncoupled_from_coupled(chunks, x, y, z, z_vec, sc_size);
+          }
+        }
+      }
+    }
+  }
+  return decode_uncoupled(erased_chunks, z, sc_size);
+}
+
+int ErasureCodeClay::decode_uncoupled(const set<int>& erased_chunks, int z, int sc_size)
+{
+  map<int, bufferlist> known_subchunks;
+  map<int, bufferlist> all_subchunks;
+
+  for (int i = 0; i < q*t; i++) {
+    if (erased_chunks.count(i) == 0) {
+      known_subchunks[i].substr_of(U_buf[i], z*sc_size, sc_size);
+      all_subchunks[i] = known_subchunks[i];
+    } else {
+      all_subchunks[i].substr_of(U_buf[i], z*sc_size, sc_size);
+    }
+    all_subchunks[i].rebuild_aligned_size_and_memory(sc_size, SIMD_ALIGN);
+    assert(all_subchunks[i].is_contiguous());
+  }
+
+  mds.erasure_code->decode_chunks(erased_chunks, known_subchunks, &all_subchunks);
+  return 0;
+}
+
+void ErasureCodeClay::set_planes_sequential_decoding_order(int* order, set<int>& erasures) {
+  int z_vec[t];
+  for (int z = 0; z < sub_chunk_no; z++) {
+    get_plane_vector(z,z_vec);
+    order[z] = 0;
+    for (auto i : erasures) {
+      if (i % q == z_vec[i / q]) {
+       order[z] = order[z] + 1;
+      }
+    }
+  }
+}
+
+void ErasureCodeClay::recover_type1_erasure(map<int, bufferlist>* chunks,
+                                           int x, int y, int z,
+                                           int* z_vec, int sc_size)
+{
+  set<int> erased_chunks;
+
+  int node_xy = y*q+x;
+  int node_sw = y*q+z_vec[y];
+  int z_sw = z + (x - z_vec[y]) * pow_int(q,t-1-y);
+
+  map<int, bufferlist> known_subchunks;
+  map<int, bufferlist> pftsubchunks;
+  bufferptr ptr(buffer::create_aligned(sc_size, SIMD_ALIGN));
+  ptr.zero();
+
+  int i0 = 0, i1 = 1, i2 = 2, i3 = 3;
+  if (z_vec[y] > x) {
+    i0 = 1;
+    i1 = 0;
+    i2 = 3;
+    i3 = 2;
+  }
+
+  erased_chunks.insert(i0);
+  pftsubchunks[i0].substr_of((*chunks)[node_xy], z * sc_size, sc_size);
+  known_subchunks[i1].substr_of((*chunks)[node_sw], z_sw * sc_size, sc_size);
+  known_subchunks[i2].substr_of(U_buf[node_xy], z * sc_size, sc_size);
+  pftsubchunks[i1] = known_subchunks[i1];
+  pftsubchunks[i2] = known_subchunks[i2];
+  pftsubchunks[i3].push_back(ptr);
+
+  for (int i=0; i<3; i++) {
+    pftsubchunks[i].rebuild_aligned_size_and_memory(sc_size, SIMD_ALIGN);
+  }
+
+  pft.erasure_code->decode_chunks(erased_chunks, known_subchunks, &pftsubchunks);
+}
+
+void ErasureCodeClay::get_coupled_from_uncoupled(map<int, bufferlist>* chunks,
+                                                int x, int y, int z,
+                                                int* z_vec, int sc_size)
+{
+  set<int> erased_chunks = {0, 1};
+
+  int node_xy = y*q+x;
+  int node_sw = y*q+z_vec[y];
+  int z_sw = z + (x - z_vec[y]) * pow_int(q,t-1-y);
+
+  ceph_assert(z_vec[y] < x);
+  map<int, bufferlist> uncoupled_subchunks;
+  uncoupled_subchunks[2].substr_of(U_buf[node_xy], z * sc_size, sc_size);
+  uncoupled_subchunks[3].substr_of(U_buf[node_sw], z_sw * sc_size, sc_size);
+
+  map<int, bufferlist> pftsubchunks;
+  pftsubchunks[0].substr_of((*chunks)[node_xy], z * sc_size, sc_size);
+  pftsubchunks[1].substr_of((*chunks)[node_sw], z_sw * sc_size, sc_size);
+  pftsubchunks[2] = uncoupled_subchunks[2];
+  pftsubchunks[3] = uncoupled_subchunks[3];
+
+  for (int i=0; i<3; i++) {
+    pftsubchunks[i].rebuild_aligned_size_and_memory(sc_size, SIMD_ALIGN);
+  }
+  pft.erasure_code->decode_chunks(erased_chunks, uncoupled_subchunks, &pftsubchunks);
+}
+
+void ErasureCodeClay::get_uncoupled_from_coupled(map<int, bufferlist>* chunks,
+                                                int x, int y, int z,
+                                                int* z_vec, int sc_size)
+{
+  set<int> erased_chunks = {2, 3};
+
+  int node_xy = y*q+x;
+  int node_sw = y*q+z_vec[y];
+  int z_sw = z + (x - z_vec[y]) * pow_int(q,t-1-y);
+
+  int i0 = 0, i1 = 1, i2 = 2, i3 = 3;
+  if (z_vec[y] > x) {
+    i0 = 1;
+    i1 = 0;
+    i2 = 3;
+    i3 = 2;
+  }
+  map<int, bufferlist> coupled_subchunks;
+  coupled_subchunks[i0].substr_of((*chunks)[node_xy], z * sc_size, sc_size);
+  coupled_subchunks[i1].substr_of((*chunks)[node_sw], z_sw * sc_size, sc_size);
+
+  map<int, bufferlist> pftsubchunks;
+  pftsubchunks[0] = coupled_subchunks[0];
+  pftsubchunks[1] = coupled_subchunks[1];
+  pftsubchunks[i2].substr_of(U_buf[node_xy], z * sc_size, sc_size);
+  pftsubchunks[i3].substr_of(U_buf[node_sw], z_sw * sc_size, sc_size);
+  for (int i=0; i<3; i++) {
+    pftsubchunks[i].rebuild_aligned_size_and_memory(sc_size, SIMD_ALIGN);
+  }
+  pft.erasure_code->decode_chunks(erased_chunks, coupled_subchunks, &pftsubchunks);
+}
+
+int ErasureCodeClay::get_max_iscore(set<int>& erased_chunks)
+{
+  int weight_vec[t];
+  int iscore = 0;
+  memset(weight_vec, 0, sizeof(int)*t);
+
+  for (auto i : erased_chunks) {
+    if (weight_vec[i / q] == 0) {
+      weight_vec[i / q] = 1;
+      iscore++;
+    }
+  }
+  return iscore;
+}
+
+void ErasureCodeClay::get_plane_vector(int z, int* z_vec)
+{
+  for (int i = 0; i < t; i++) {
+    z_vec[t-1-i] = z % q;
+    z = (z - z_vec[t-1-i]) / q;
+  }
+}
diff --git a/src/erasure-code/clay/ErasureCodeClay.h b/src/erasure-code/clay/ErasureCodeClay.h
new file mode 100644 (file)
index 0000000..6f0849d
--- /dev/null
@@ -0,0 +1,128 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Myna Vajha <mynaramana@gmail.com>
+ *
+ * Author: Myna Vajha <mynaramana@gmail.com>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef CEPH_ERASURE_CODE_CLAY_H
+#define CEPH_ERASURE_CODE_CLAY_H
+
+#include "include/err.h"
+#include "include/buffer_fwd.h"
+#include "erasure-code/ErasureCode.h"
+
+class ErasureCodeClay final : public ceph::ErasureCode {
+public:
+  std::string DEFAULT_K{"4"};
+  std::string DEFAULT_M{"2"};
+  std::string DEFAULT_W{"8"};
+  int k = 0, m = 0, d = 0, w = 8;
+  int q = 0, t = 0, nu = 0;
+  int sub_chunk_no = 0;
+
+  std::map<int, ceph::bufferlist> U_buf;
+
+  struct ScalarMDS {
+    ceph::ErasureCodeInterfaceRef erasure_code;
+    ceph::ErasureCodeProfile profile;
+  };
+  ScalarMDS mds;
+  ScalarMDS pft;
+  const std::string directory;
+
+  explicit ErasureCodeClay(const std::string& dir)
+    : directory(dir)
+  {}
+
+  ~ErasureCodeClay() override;
+
+  unsigned int get_chunk_count() const override {
+    return k+m;
+  }
+
+  unsigned int get_data_chunk_count() const override {
+    return k;
+  }
+
+  int get_sub_chunk_count() override {
+    return sub_chunk_no;
+  }
+
+  unsigned int get_chunk_size(unsigned int object_size) const override;
+
+  int minimum_to_decode(const std::set<int> &want_to_read,
+                       const std::set<int> &available,
+                       std::map<int, std::vector<std::pair<int, int>>> *minimum) override;
+
+  int decode(const std::set<int> &want_to_read,
+             const std::map<int, ceph::bufferlist> &chunks,
+             std::map<int, ceph::bufferlist> *decoded, int chunk_size) override;
+
+  int encode_chunks(const std::set<int> &want_to_encode,
+                   std::map<int, ceph::bufferlist> *encoded) override;
+
+  int decode_chunks(const std::set<int> &want_to_read,
+                   const std::map<int, ceph::bufferlist> &chunks,
+                   std::map<int, ceph::bufferlist> *decoded) override;
+
+  int init(ceph::ErasureCodeProfile &profile, std::ostream *ss) override;
+
+  int is_repair(const std::set<int> &want_to_read,
+                const std::set<int> &available_chunks);
+
+  int get_repair_sub_chunk_count(const std::set<int> &want_to_read);
+
+  virtual int parse(ceph::ErasureCodeProfile &profile, std::ostream *ss);
+
+  unsigned get_alignment() const;
+
+private:
+  int minimum_to_repair(const std::set<int> &want_to_read,
+                        const std::set<int> &available_chunks,
+                        std::map<int, std::vector<std::pair<int, int>>> *minimum);
+
+  int repair(const std::set<int> &want_to_read,
+             const std::map<int, ceph::bufferlist> &chunks,
+             std::map<int, ceph::bufferlist> *recovered, int chunk_size);
+
+  int decode_layered(std::set<int>& erased_chunks, std::map<int, ceph::bufferlist>* chunks);
+
+  int repair_one_lost_chunk(std::map<int, ceph::bufferlist> &recovered_data, std::set<int> &aloof_nodes,
+                            std::map<int, ceph::bufferlist> &helper_data, int repair_blocksize,
+                            std::vector<std::pair<int,int>> &repair_sub_chunks_ind);
+
+  void get_repair_subchunks(const int &lost_node,
+                           std::vector<std::pair<int, int>> &repair_sub_chunks_ind);
+
+  int decode_erasures(const std::set<int>& erased_chunks, int z,
+                      std::map<int, ceph::bufferlist>* chunks, int sc_size);
+
+  int decode_uncoupled(const std::set<int>& erasures, int z, int ss_size);
+
+  void set_planes_sequential_decoding_order(int* order, std::set<int>& erasures);
+
+  void recover_type1_erasure(std::map<int, ceph::bufferlist>* chunks, int x, int y, int z,
+                             int* z_vec, int sc_size);
+
+  void get_uncoupled_from_coupled(std::map<int, ceph::bufferlist>* chunks, int x, int y, int z,
+                                  int* z_vec, int sc_size);
+
+  void get_coupled_from_uncoupled(std::map<int, ceph::bufferlist>* chunks, int x, int y, int z,
+                                  int* z_vec, int sc_size);
+
+  void get_plane_vector(int z, int* z_vec);
+
+  int get_max_iscore(std::set<int>& erased_chunks);
+};
+
+#endif
diff --git a/src/erasure-code/clay/ErasureCodePluginClay.cc b/src/erasure-code/clay/ErasureCodePluginClay.cc
new file mode 100644 (file)
index 0000000..ab3d1db
--- /dev/null
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Myna Vajha <mynaramana@gmail.com>
+ *
+ * Author: Myna Vajha <mynaramana@gmail.com>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include "ceph_ver.h"
+#include "common/debug.h"
+#include "ErasureCodePluginClay.h"
+#include "ErasureCodeClay.h"
+
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout)
+
+int ErasureCodePluginClay::factory(const std::string &directory,
+                     ErasureCodeProfile &profile,
+                     ErasureCodeInterfaceRef *erasure_code,
+                     std::ostream *ss) {
+  auto interface = std::make_unique<ErasureCodeClay>(directory);
+  if (int r = interface->init(profile, ss); r) {
+    return r;
+  }
+  *erasure_code = ErasureCodeInterfaceRef(interface.release());
+  return 0;
+};
+
+const char *__erasure_code_version() { return CEPH_GIT_NICE_VER; }
+
+int __erasure_code_init(char *plugin_name, char *directory)
+{
+  ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
+  return instance.add(plugin_name, new ErasureCodePluginClay());
+}
diff --git a/src/erasure-code/clay/ErasureCodePluginClay.h b/src/erasure-code/clay/ErasureCodePluginClay.h
new file mode 100644 (file)
index 0000000..d664128
--- /dev/null
@@ -0,0 +1,30 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * Copyright (C) 2018 Myna Vajha <mynaramana@gmail.com>
+ *
+ * Author: Myna Vajha <mynaramana@gmail.com>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ * 
+ */
+
+#ifndef CEPH_ERASURE_CODE_PLUGIN_CLAY_H
+#define CEPH_ERASURE_CODE_PLUGIN_CLAY_H
+
+#include "erasure-code/ErasureCodePlugin.h"
+
+class ErasureCodePluginClay : public ErasureCodePlugin {
+public:
+  int factory(const std::string& directory,
+                     ErasureCodeProfile &profile,
+                     ErasureCodeInterfaceRef *erasure_code,
+                     ostream *ss) override;
+};
+
+#endif
index 2cd7cf2c98355e45f9a49ccbcd3649d4facd5f84..721f6c36750fa9bf25658bfb8bd4fb6afdcb4e49 100644 (file)
@@ -234,3 +234,31 @@ target_link_libraries(unittest_erasure_code_shec_arguments
   ceph-common
   ec_shec
   )
+
+#unitest_erasure_code_clay
+add_executable(unittest_erasure_code_clay
+  TestErasureCodeClay.cc
+  $<TARGET_OBJECTS:unit-main>)
+add_ceph_unittest(unittest_erasure_code_clay)
+target_link_libraries(unittest_erasure_code_clay
+  global
+  ${CMAKE_DL_LIBS}
+  ${UNITTEST_LIBS}
+  ceph-common
+  ec_clay
+  )
+
+# unittest_erasure_code_plugin_clay
+add_executable(unittest_erasure_code_plugin_clay
+  TestErasureCodePluginClay.cc
+  $<TARGET_OBJECTS:unit-main>)
+add_ceph_unittest(unittest_erasure_code_plugin_clay)
+add_dependencies(unittest_erasure_code_plugin_clay
+  ec_clay)
+target_link_libraries(unittest_erasure_code_plugin_clay
+  GTest::Main
+  global
+  ${CMAKE_DL_LIBS}
+  ${UNITTEST_LIBS}
+  ceph-common)
+
diff --git a/src/test/erasure-code/TestErasureCodeClay.cc b/src/test/erasure-code/TestErasureCodeClay.cc
new file mode 100644 (file)
index 0000000..9f932c9
--- /dev/null
@@ -0,0 +1,595 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ * Copyright (C) 2014 Red Hat <contact@redhat.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ * 
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+
+#include "crush/CrushWrapper.h"
+#include "include/stringify.h"
+#include "erasure-code/clay/ErasureCodeClay.h"
+#include "global/global_context.h"
+#include "common/config_proxy.h"
+#include "gtest/gtest.h"
+
+TEST(ErasureCodeClay, sanity_check_k)
+{
+  ErasureCodeClay clay(g_conf().get_val<std::string>("erasure_code_dir"));
+  ErasureCodeProfile profile;
+  profile["k"] = "1";
+  profile["m"] = "1";
+  ostringstream errors;
+  EXPECT_EQ(-EINVAL, clay.init(profile, &errors));
+  EXPECT_NE(std::string::npos, errors.str().find("must be >= 2"));
+}
+
+TEST(ErasureCodeClay, encode_decode)
+{
+  ostringstream errors;
+  ErasureCodeClay clay(g_conf().get_val<std::string>("erasure_code_dir"));
+  ErasureCodeProfile profile;
+  profile["k"] = "2";
+  profile["m"] = "2";
+  int r= clay.init(profile, &cerr);
+  EXPECT_EQ(0, r);
+
+#define LARGE_ENOUGH 2048
+  bufferptr in_ptr(buffer::create_page_aligned(LARGE_ENOUGH));
+  in_ptr.zero();
+  in_ptr.set_length(0);
+  const char *payload =
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
+  in_ptr.append(payload, strlen(payload));
+  bufferlist in;
+  in.push_back(in_ptr);
+  int want_to_encode[] = { 0, 1, 2, 3 };
+  map<int, bufferlist> encoded;
+  EXPECT_EQ(0, clay.encode(set<int>(want_to_encode, want_to_encode+4),
+                          in,
+                          &encoded));
+  EXPECT_EQ(4u, encoded.size());
+  unsigned length =  encoded[0].length();
+  EXPECT_EQ(0, memcmp(encoded[0].c_str(), in.c_str(), length));
+  EXPECT_EQ(0, memcmp(encoded[1].c_str(), in.c_str() + length,
+                      in.length() - length));
+
+
+  // all chunks are available
+  {
+    int want_to_decode[] = { 0, 1 };
+    map<int, bufferlist> decoded;
+    EXPECT_EQ(0, clay._decode(set<int>(want_to_decode, want_to_decode+2),
+                             encoded,
+                             &decoded));
+    EXPECT_EQ(2u, decoded.size()); 
+    EXPECT_EQ(length, decoded[0].length());
+    EXPECT_EQ(0, memcmp(decoded[0].c_str(), in.c_str(), length));
+    EXPECT_EQ(0, memcmp(decoded[1].c_str(), in.c_str() + length,
+                       in.length() - length));
+  }
+
+  // check all two chunks missing possibilities and recover them
+  for (int i=1; i<4; i++) {
+    for (int j=0; j<i; j++) {
+      map<int, bufferlist> degraded = encoded;
+      degraded.erase(j);
+      degraded.erase(i);
+      EXPECT_EQ(2u, degraded.size());
+      int want_to_decode[] = {j,i};
+      map<int, bufferlist> decoded;
+      EXPECT_EQ(0, clay._decode(set<int>(want_to_decode, want_to_decode+2),
+                               degraded,
+                               &decoded));
+      EXPECT_EQ(4u, decoded.size()); 
+      EXPECT_EQ(length, decoded[j].length());
+      EXPECT_EQ(0, memcmp(decoded[j].c_str(), encoded[j].c_str(), length));
+      EXPECT_EQ(0, memcmp(decoded[i].c_str(), encoded[i].c_str(), length));
+    }
+  }
+  //check for all one chunk missing possibilities
+  int sc_size = length/clay.sub_chunk_no;
+  int avail[] = {0,1,2,3};
+  for (int i=0; i < 4; i++) {
+    set<int> want_to_read;
+    want_to_read.insert(i);
+    set<int> available(avail, avail+4);
+    available.erase(i);
+    map<int, vector<pair<int,int>>> minimum;
+    EXPECT_EQ(0, clay.minimum_to_decode(want_to_read, available, &minimum));
+    map<int, bufferlist> helper;
+    for (map<int, vector<pair<int,int>>>::iterator h=minimum.begin(); h!= minimum.end(); ++h) {
+      for(vector<pair<int,int>>::iterator ind=h->second.begin(); ind != h->second.end(); ++ind) {
+       bufferlist temp;
+       temp.substr_of(encoded[h->first], ind->first*sc_size, ind->second*sc_size);
+       helper[h->first].append(temp);
+      }
+    }
+    for (map<int, vector<pair<int,int>>>::iterator h=minimum.begin(); h!= minimum.end(); ++h) {
+      EXPECT_EQ(length/clay.q, helper[h->first].length());
+    }
+    EXPECT_EQ(3u, helper.size());
+    map<int, bufferlist> decoded;
+    EXPECT_EQ(0, clay.decode(want_to_read, helper, &decoded, length));
+    EXPECT_EQ(1u, decoded.size());
+    EXPECT_EQ(0, memcmp(decoded[i].c_str(), encoded[i].c_str(), length));
+  }
+}
+
+
+TEST(ErasureCodeClay, encode_decode_aloof_nodes)
+{
+  ostringstream errors;
+  ErasureCodeClay clay(g_conf().get_val<std::string>("erasure_code_dir"));
+  ErasureCodeProfile profile;
+  profile["k"] = "3";
+  profile["m"] = "3";
+  profile["d"] = "4";
+  int r= clay.init(profile, &cerr);
+  EXPECT_EQ(0, r);
+
+#define LARGE_ENOUGH 2048
+  bufferptr in_ptr(buffer::create_page_aligned(LARGE_ENOUGH));
+  in_ptr.zero();
+  in_ptr.set_length(0);
+  const char *payload =
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
+  in_ptr.append(payload, strlen(payload));
+  bufferlist in;
+  in.push_back(in_ptr);
+  int want_to_encode[] = { 0, 1, 2, 3, 4, 5 };
+  map<int, bufferlist> encoded;
+  EXPECT_EQ(0, clay.encode(set<int>(want_to_encode, want_to_encode+6),
+                          in,
+                          &encoded));
+  EXPECT_EQ(6u, encoded.size());
+  unsigned length =  encoded[0].length();
+  if (in.length() < length) {
+    EXPECT_EQ(0, memcmp(encoded[0].c_str(), in.c_str(), in.length()));
+  } else if (in.length() <= 2*length ) {
+    EXPECT_EQ(0, memcmp(encoded[0].c_str(), in.c_str(), in.length()));
+    EXPECT_EQ(0, memcmp(encoded[1].c_str(), in.c_str()+length, in.length()-length));
+  } else {
+    EXPECT_EQ(1, in.length() <= 3*length);
+    EXPECT_EQ(0, memcmp(encoded[0].c_str(), in.c_str(), in.length()));
+    EXPECT_EQ(0, memcmp(encoded[1].c_str(), in.c_str()+length, length));
+    EXPECT_EQ(0, memcmp(encoded[2].c_str(), in.c_str()+2*length, in.length()-2*length));
+  }
+
+  // all chunks are available
+  {
+    int want_to_decode[] = { 0, 1, 2 };
+    map<int, bufferlist> decoded;
+    EXPECT_EQ(0, clay._decode(set<int>(want_to_decode, want_to_decode+3),
+                             encoded,
+                             &decoded));
+    EXPECT_EQ(3u, decoded.size()); 
+    EXPECT_EQ(length, decoded[0].length());
+    EXPECT_EQ(0, memcmp(decoded[0].c_str(), encoded[0].c_str(), length));
+    EXPECT_EQ(0, memcmp(decoded[1].c_str(), encoded[1].c_str(), length));
+    EXPECT_EQ(0, memcmp(decoded[2].c_str(), encoded[2].c_str(), length));
+  }
+
+  // check all three chunks missing possibilities and recover them
+  for (int i=2; i<6; i++) {
+    for (int j=1; j<i; j++) {
+      for(int k=0; k<j; k++) {
+       map<int, bufferlist> degraded = encoded;
+       degraded.erase(k);
+       degraded.erase(j);
+       degraded.erase(i);
+       EXPECT_EQ(3u, degraded.size());
+       int want_to_decode[] = {k,j,i};
+       map<int, bufferlist> decoded;
+       EXPECT_EQ(0, clay._decode(set<int>(want_to_decode, want_to_decode+3),
+                                 degraded,
+                                 &decoded));
+       EXPECT_EQ(6u, decoded.size()); 
+       EXPECT_EQ(length, decoded[j].length());
+       EXPECT_EQ(0, memcmp(decoded[k].c_str(), encoded[k].c_str(), length));
+       EXPECT_EQ(0, memcmp(decoded[j].c_str(), encoded[j].c_str(), length));
+       EXPECT_EQ(0, memcmp(decoded[i].c_str(), encoded[i].c_str(), length));
+      }
+    }
+  }
+  //check for all one chunk missing possibilities
+  int sc_size = length/clay.sub_chunk_no;
+  int avail[] = {0,1,2,3,4,5};
+  for (int i=0; i < 6; i++) {
+    vector<pair<int,int>> repair_subchunks;
+    map<int, vector<pair<int,int>>> minimum;
+    set<int> want_to_read;
+    want_to_read.insert(i);
+    set<int> available(avail, avail+6);
+    available.erase(i);
+    clay.minimum_to_decode(want_to_read, available, &minimum);
+    map<int, bufferlist> helper;
+    for (map<int, vector<pair<int,int>>>::iterator h=minimum.begin(); h!= minimum.end(); ++h) {
+      for(vector<pair<int,int>>::iterator ind=h->second.begin(); ind != h->second.end(); ++ind) {
+       bufferlist temp;
+       temp.substr_of(encoded[h->first], ind->first*sc_size, ind->second*sc_size);
+       helper[h->first].append(temp);
+      }
+    }  
+    for (map<int, vector<pair<int,int>>>::iterator h=minimum.begin(); h!= minimum.end(); ++h) {
+      EXPECT_EQ(length/clay.q, helper[h->first].length());
+    }
+    EXPECT_EQ((unsigned)clay.d, helper.size());
+    map<int, bufferlist> decoded;
+    EXPECT_EQ(0, clay.decode(want_to_read, helper, &decoded, length));
+    EXPECT_EQ(1u, decoded.size());
+    EXPECT_EQ(0, memcmp(decoded[i].c_str(), encoded[i].c_str(), length));
+  }
+}
+
+TEST(ErasureCodeClay, encode_decode_shortening_case)
+{
+  ostringstream errors;
+  ErasureCodeClay clay(g_conf().get_val<std::string>("erasure_code_dir"));
+  ErasureCodeProfile profile;
+  profile["k"] = "4";
+  profile["m"] = "3";
+  profile["d"] = "5";
+  int r= clay.init(profile, &cerr);
+  EXPECT_EQ(0, r);
+
+  EXPECT_EQ(2, clay.q);
+  EXPECT_EQ(4, clay.t);
+  EXPECT_EQ(1, clay.nu);
+
+  bufferptr in_ptr(buffer::create_page_aligned(LARGE_ENOUGH));
+  in_ptr.zero();
+  in_ptr.set_length(0);
+  const char *payload =
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
+  in_ptr.append(payload, strlen(payload));
+  bufferlist in;
+  in.push_back(in_ptr);
+  int want_to_encode[] = { 0, 1, 2, 3, 4, 5, 6 };
+  map<int, bufferlist> encoded;
+  EXPECT_EQ(0, clay.encode(set<int>(want_to_encode, want_to_encode+7),
+                          in,
+                          &encoded));
+  EXPECT_EQ(7u, encoded.size());
+  unsigned length =  encoded[0].length();
+  if (in.length() < length) {
+    EXPECT_EQ(0, memcmp(encoded[0].c_str(), in.c_str(), in.length()));
+  } else if (in.length() <= 2*length) {
+    EXPECT_EQ(0, memcmp(encoded[0].c_str(), in.c_str(), in.length()));
+    EXPECT_EQ(0, memcmp(encoded[1].c_str(), in.c_str()+length, in.length()-length));
+  } else if (in.length() <= 3*length) {
+    EXPECT_EQ(0, memcmp(encoded[0].c_str(), in.c_str(), in.length()));
+    EXPECT_EQ(0, memcmp(encoded[1].c_str(), in.c_str()+length, length));
+    EXPECT_EQ(0, memcmp(encoded[2].c_str(), in.c_str()+2*length, in.length()-2*length));
+  } else {
+    EXPECT_EQ(1, in.length() <= 4*length);
+    EXPECT_EQ(0, memcmp(encoded[0].c_str(), in.c_str(), in.length()));
+    EXPECT_EQ(0, memcmp(encoded[1].c_str(), in.c_str()+length, length));
+    EXPECT_EQ(0, memcmp(encoded[2].c_str(), in.c_str()+2*length, length));
+    EXPECT_EQ(0, memcmp(encoded[3].c_str(), in.c_str()+3*length, in.length()-3*length));
+  }
+
+  // all chunks are available
+  {
+    int want_to_decode[] = { 0, 1, 2, 3 };
+    map<int, bufferlist> decoded;
+    EXPECT_EQ(0, clay._decode(set<int>(want_to_decode, want_to_decode+4),
+                             encoded,
+                             &decoded));
+    EXPECT_EQ(4u, decoded.size()); 
+    EXPECT_EQ(length, decoded[0].length());
+    EXPECT_EQ(0, memcmp(decoded[0].c_str(), encoded[0].c_str(), length));
+    EXPECT_EQ(0, memcmp(decoded[1].c_str(), encoded[1].c_str(), length));
+    EXPECT_EQ(0, memcmp(decoded[2].c_str(), encoded[2].c_str(), length));
+    EXPECT_EQ(0, memcmp(decoded[3].c_str(), encoded[3].c_str(), length));
+  }
+
+  // check all three chunks missing possibilities and recover them
+  for (int i=2; i<7; i++) {
+    for (int j=1; j<i; j++) {
+      for(int k=0; k<j; k++) {
+       map<int, bufferlist> degraded = encoded;
+       degraded.erase(k);
+       degraded.erase(j);
+       degraded.erase(i);
+       EXPECT_EQ(4u, degraded.size());
+       int want_to_decode[] = {k,j,i};
+       map<int, bufferlist> decoded;
+       EXPECT_EQ(0, clay._decode(set<int>(want_to_decode, want_to_decode+3),
+                                 degraded,
+                                 &decoded));
+       EXPECT_EQ(7u, decoded.size()); 
+       EXPECT_EQ(length, decoded[j].length());
+       EXPECT_EQ(0, memcmp(decoded[k].c_str(), encoded[k].c_str(), length));
+       EXPECT_EQ(0, memcmp(decoded[j].c_str(), encoded[j].c_str(), length));
+       EXPECT_EQ(0, memcmp(decoded[i].c_str(), encoded[i].c_str(), length));
+      }
+    }
+  }
+  //check for all one chunk missing possibilities
+  int sc_size = length/clay.sub_chunk_no;
+  int avail[] = {0,1,2,3,4,5,6};
+  for (int i=0; i < 7; i++) {
+    vector<pair<int,int>> repair_subchunks;
+    map<int, vector<pair<int,int>>> minimum;
+    set<int> want_to_read;
+    want_to_read.insert(i);
+    set<int> available(avail, avail+7);
+    available.erase(i);
+    clay.minimum_to_decode(want_to_read, available, &minimum);
+    map<int, bufferlist> helper;
+    for (map<int, vector<pair<int,int>>>::iterator h=minimum.begin(); h!= minimum.end(); ++h) {
+      for(vector<pair<int,int>>::iterator ind=h->second.begin(); ind != h->second.end(); ++ind) {
+       bufferlist temp;
+       temp.substr_of(encoded[h->first], ind->first*sc_size, ind->second*sc_size);
+       helper[h->first].append(temp);
+      }
+    }  
+    for (map<int, vector<pair<int,int>>>::iterator h=minimum.begin(); h!= minimum.end(); ++h) {
+      EXPECT_EQ(length/clay.q, helper[h->first].length());
+    }
+    EXPECT_EQ(static_cast<size_t>(clay.d), helper.size());
+    map<int, bufferlist> decoded;
+    EXPECT_EQ(0, clay.decode(want_to_read, helper, &decoded, length));
+    EXPECT_EQ(1u, decoded.size());
+    EXPECT_EQ(length, decoded[i].length());
+    EXPECT_EQ(0, memcmp(decoded[i].c_str(), encoded[i].c_str(), length));
+  }
+}
+
+TEST(ErasureCodeClay, minimum_to_decode)
+{
+  ErasureCodeClay clay(g_conf().get_val<std::string>("erasure_code_dir"));
+  ErasureCodeProfile profile;
+  profile["k"] = "2";
+  profile["m"] = "2";
+  EXPECT_EQ(0, clay.init(profile, &cerr));
+
+  //
+  // If trying to read nothing, the minimum is empty.
+  //
+  {
+    set<int> want_to_read;
+    set<int> available_chunks;
+    set<int> minimum;
+
+    EXPECT_EQ(0, clay._minimum_to_decode(want_to_read,
+                                        available_chunks,
+                                        &minimum));
+    EXPECT_TRUE(minimum.empty());
+  }
+  //
+  // There is no way to read a chunk if none are available.
+  //
+  {
+    set<int> want_to_read;
+    set<int> available_chunks;
+    set<int> minimum;
+
+    want_to_read.insert(0);
+
+    EXPECT_EQ(-EIO, clay._minimum_to_decode(want_to_read,
+                                           available_chunks,
+                                           &minimum));
+  }
+  //
+  // Reading a subset of the available chunks is always possible.
+  //
+  {
+    set<int> want_to_read;
+    set<int> available_chunks;
+    set<int> minimum;
+
+    want_to_read.insert(0);
+    available_chunks.insert(0);
+
+    EXPECT_EQ(0, clay._minimum_to_decode(want_to_read,
+                                        available_chunks,
+                                        &minimum));
+    EXPECT_EQ(want_to_read, minimum);
+  }
+  //
+  // There is no way to read a missing chunk if there is less than k
+  // chunks available.
+  //
+  {
+    set<int> want_to_read;
+    set<int> available_chunks;
+    set<int> minimum;
+
+    want_to_read.insert(0);
+    want_to_read.insert(1);
+    available_chunks.insert(0);
+
+    EXPECT_EQ(-EIO, clay._minimum_to_decode(want_to_read,
+                                           available_chunks,
+                                           &minimum));
+  }
+  //
+  // When chunks are not available, the minimum can be made of any
+  // chunks. For instance, to read 1 and 3 below the minimum could be
+  // 2 and 3 which may seem better because it contains one of the
+  // chunks to be read. But it won't be more efficient than retrieving
+  // 0 and 2 instead because, in both cases, the decode function will
+  // need to run the same recovery operation and use the same amount
+  // of CPU and memory.
+  //
+  {
+    set<int> want_to_read;
+    set<int> available_chunks;
+    set<int> minimum;
+
+    want_to_read.insert(1);
+    want_to_read.insert(3);
+    available_chunks.insert(0);
+    available_chunks.insert(2);
+    available_chunks.insert(3);
+
+    EXPECT_EQ(0, clay._minimum_to_decode(want_to_read,
+                                        available_chunks,
+                                        &minimum));
+    EXPECT_EQ(2u, minimum.size());
+    EXPECT_EQ(0u, minimum.count(3));
+  }
+}
+
+TEST(ErasureCodeClay, encode)
+{
+  ErasureCodeClay clay(g_conf().get_val<std::string>("erasure_code_dir"));
+  ErasureCodeProfile profile;
+  profile["k"] = "2";
+  profile["m"] = "2";
+  EXPECT_EQ(0, clay.init(profile, &cerr));
+
+  unsigned aligned_object_size = clay.get_alignment() * 2;
+  {
+    //
+    // When the input bufferlist needs to be padded because
+    // it is not properly aligned, it is padded with zeros.
+    //
+    bufferlist in;
+    map<int,bufferlist> encoded;
+    int want_to_encode[] = { 0, 1, 2, 3 };
+    int trail_length = 1;
+    in.append(string(aligned_object_size + trail_length, 'X'));
+    EXPECT_EQ(0, clay.encode(set<int>(want_to_encode, want_to_encode+4),
+                            in,
+                            &encoded));
+    EXPECT_EQ(4u, encoded.size());
+    char *last_chunk = encoded[1].c_str();
+    int length =encoded[1].length();
+    EXPECT_EQ('X', last_chunk[0]);
+    EXPECT_EQ('\0', last_chunk[length - trail_length]);
+  }
+
+  {
+    //
+    // When only the first chunk is required, the encoded map only
+    // contains the first chunk. Although the clay encode
+    // internally allocated a buffer because of padding requirements
+    // and also computes the coding chunks, they are released before
+    // the return of the method, as shown when running the tests thru
+    // valgrind (there is no leak).
+    //
+    bufferlist in;
+    map<int,bufferlist> encoded;
+    set<int> want_to_encode;
+    want_to_encode.insert(0);
+    int trail_length = 1;
+    in.append(string(aligned_object_size + trail_length, 'X'));
+    EXPECT_EQ(0, clay.encode(want_to_encode, in, &encoded));
+    EXPECT_EQ(1u, encoded.size());
+  }
+}
+
+TEST(ErasureCodeClay, create_rule)
+{
+  std::unique_ptr<CrushWrapper> c = std::make_unique<CrushWrapper>();
+  c->create();
+  int root_type = 2;
+  c->set_type_name(root_type, "root");
+  int host_type = 1;
+  c->set_type_name(host_type, "host");
+  int osd_type = 0;
+  c->set_type_name(osd_type, "osd");
+
+  int rootno;
+  c->add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_RJENKINS1,
+               root_type, 0, NULL, NULL, &rootno);
+  c->set_item_name(rootno, "default");
+
+  map<string,string> loc;
+  loc["root"] = "default";
+
+  int num_host = 4;
+  int num_osd = 5;
+  int osd = 0;
+  for (int h=0; h<num_host; ++h) {
+    loc["host"] = string("host-") + stringify(h);
+    for (int o=0; o<num_osd; ++o, ++osd) {
+      c->insert_item(g_ceph_context, osd, 1.0, string("osd.") + stringify(osd), loc);
+    }
+  }
+
+  c->finalize();
+
+  {
+    stringstream ss;
+    ErasureCodeClay clay(g_conf().get_val<std::string>("erasure_code_dir"));
+    ErasureCodeProfile profile;
+    profile["k"] = "2";
+    profile["m"] = "2";
+    EXPECT_EQ(0, clay.init(profile, &cerr));
+    int ruleset = clay.create_rule("myrule", *c, &ss);
+    EXPECT_EQ(0, ruleset);
+    EXPECT_EQ(-EEXIST, clay.create_rule("myrule", *c, &ss));
+    //
+    // the minimum that is expected from the created ruleset is to
+    // successfully map get_chunk_count() devices from the crushmap,
+    // at least once.
+    //
+    vector<__u32> weight(c->get_max_devices(), 0x10000);
+    vector<int> out;
+    int x = 0;
+    c->do_rule(ruleset, x, out,   clay.get_chunk_count(), weight, 0);
+    ASSERT_EQ(out.size(), clay.get_chunk_count());
+    for (unsigned i=0; i<out.size(); ++i)
+      ASSERT_NE(CRUSH_ITEM_NONE, out[i]);
+  }
+  {
+    stringstream ss;
+    ErasureCodeClay clay(g_conf().get_val<std::string>("erasure_code_dir"));
+    ErasureCodeProfile profile;
+    profile["k"] = "2";
+    profile["m"] = "2";
+    profile["crush-root"] = "BAD";
+    EXPECT_EQ(0, clay.init(profile, &cerr));
+    EXPECT_EQ(-ENOENT, clay.create_rule("otherrule", *c, &ss));
+    EXPECT_EQ("root item BAD does not exist", ss.str());
+  }
+  {
+    stringstream ss;
+    ErasureCodeClay clay(g_conf().get_val<std::string>("erasure_code_dir"));
+    ErasureCodeProfile profile;
+    profile["k"] = "2";
+    profile["m"] = "2";
+    profile["crush-failure-domain"] = "WORSE";
+    EXPECT_EQ(0, clay.init(profile, &cerr));
+    EXPECT_EQ(-EINVAL, clay.create_rule("otherrule", *c, &ss));
+    EXPECT_EQ("unknown type WORSE", ss.str());
+  }
+}
+
+/* 
+ * Local Variables:
+ * compile-command: "cd ../.. ;
+ *   make -j4 unittest_erasure_code_clay &&
+ *   valgrind --tool=memcheck \
+ *      ./unittest_erasure_code_clay \
+ *      --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
+ * End:
+ */
diff --git a/src/test/erasure-code/TestErasureCodePluginClay.cc b/src/test/erasure-code/TestErasureCodePluginClay.cc
new file mode 100644 (file)
index 0000000..6885db3
--- /dev/null
@@ -0,0 +1,113 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ * Copyright (C) 2014 Red Hat <contact@redhat.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include "erasure-code/ErasureCodePlugin.h"
+#include "log/Log.h"
+#include "global/global_context.h"
+#include "common/config_proxy.h"
+#include "gtest/gtest.h"
+
+TEST(ErasureCodePlugin, factory)
+{
+  ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
+  ErasureCodeProfile profile;
+  {
+    ErasureCodeInterfaceRef erasure_code;
+    EXPECT_FALSE(erasure_code);
+    EXPECT_EQ(0, instance.factory("clay",
+                                  g_conf().get_val<std::string>("erasure_code_dir"),
+                                 profile,
+                                  &erasure_code, &cerr));
+    EXPECT_TRUE(erasure_code);
+  }
+  //check clay plugin with scalar_mds=jerasure
+  {
+    const char *techniques[] = {
+      "reed_sol_van",
+      "reed_sol_r6_op",
+      "cauchy_orig",
+      "cauchy_good",
+      "liber8tion",
+      0
+    };
+    for(const char **technique = techniques; *technique; technique++) {
+      ErasureCodeInterfaceRef erasure_code;
+      ErasureCodeProfile profile;
+      profile["scalar_mds"] = "jerasure";
+      profile["technique"] = *technique;
+      EXPECT_FALSE(erasure_code);
+      EXPECT_EQ(0, instance.factory("clay",
+                                    g_conf().get_val<std::string>("erasure_code_dir"),
+                                   profile,
+                                    &erasure_code, &cerr));
+      EXPECT_TRUE(erasure_code.get());
+    }
+  }
+#ifdef HAVE_BETTER_YASM_ELF64
+  //check clay plugin with scalar_mds=isa
+  {
+    const char *techniques[] = {
+      "reed_sol_van",
+      "cauchy",
+      0
+    };
+    for(const char **technique = techniques; *technique; technique++) {
+      ErasureCodeInterfaceRef erasure_code;
+      ErasureCodeProfile profile;
+      profile["scalar_mds"] = "isa";
+      profile["technique"] = *technique;
+      EXPECT_FALSE(erasure_code);
+      EXPECT_EQ(0, instance.factory("clay",
+                                    g_conf().get_val<std::string>("erasure_code_dir"),
+                                   profile,
+                                    &erasure_code, &cerr));
+      EXPECT_TRUE(erasure_code.get());
+    }
+  }
+#endif
+  //check clay plugin with scalar_mds=shec
+  {
+    const char *techniques[] = {
+      "single",
+      "multiple",
+      0
+    };
+    for(const char **technique = techniques; *technique; technique++) {
+      ErasureCodeInterfaceRef erasure_code;
+      ErasureCodeProfile profile;
+      profile["scalar_mds"] = "shec";
+      profile["technique"] = *technique;
+      EXPECT_FALSE(erasure_code);
+      EXPECT_EQ(0, instance.factory("clay",
+                                    g_conf().get_val<std::string>("erasure_code_dir"),
+                                   profile,
+                                    &erasure_code, &cerr));
+      EXPECT_TRUE(erasure_code.get());
+    }
+  }
+}
+
+/*
+ * Local Variables:
+ * compile-command: "cd ../.. ; make -j4 &&
+ *   make unittest_erasure_code_plugin_clay &&
+ *   valgrind --tool=memcheck ./unittest_erasure_code_plugin_clay \
+ *      --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
+ * End:
+ */