From 5813ce6c190995d304454d2437acf7fe76632685 Mon Sep 17 00:00:00 2001
From: Radoslaw Zarzynski
Date: Fri, 12 Jan 2024 15:44:51 +0000
Subject: [PATCH] osd: EC Partial Stripe Reads (Retry of #23138 and #52746)

This commit is a further resurrection of the EC partial reads concept;
this time of Mark Nelson's work sent as PR #52746. The modifications in
this commit are mostly about settling Mark's work on top of the recent
rework of `ECBackend` which shared the EC codebase with crimson-osd.

As the original description says, Mark's work is based on an earlier
attempt by Xiaofei Cui. Therefore credits go to:

* Mark Nelson (Clyso),
* Xiaofei Cui (cuixiaofei@sangfor.com.cn).

The original commit description is preserved below:

> This is a re-implementation of PR #23138 rebased on main with a couple
> of nitpicky changes to make the code a little more clear (to me at
> least). Credit goes to Xiaofei Cui
> [cuixiaofei@sangfor.com.cn](mailto:cuixiaofei@sangfor.com.cn) for the
> original implementation.
>
> Looking at the original PR's review, it does not appear that we can
> use the same technique as in 468ad4b. We don't have the ReadOp yet.
> I'm not sure if @gregsforytwo's idea to query the plugin works, but
> it's clear we are not doing the efficient thing from the get-go here.
>
> The performance and efficiency benefits for small random reads appear
> to be quite substantial, especially for large stripe widths.

Signed-off-by: Radoslaw Zarzynski
---
 src/erasure-code/ErasureCode.cc             |  26 +++-
 src/erasure-code/ErasureCode.h              |   2 +
 src/erasure-code/ErasureCodeInterface.h     |   6 +
 src/osd/ECBackend.cc                        |   9 +-
 src/osd/ECCommon.cc                         | 147 +++++++++++++++++-
 src/osd/ECCommon.h                          |  51 +++++-
 src/osd/ECUtil.cc                           |   2 +
 src/osd/ECUtil.h                            |  15 +-
 .../erasure-code/TestErasureCodeExample.cc  |   7 +-
 9 files changed, 254 insertions(+), 11 deletions(-)

diff --git a/src/erasure-code/ErasureCode.cc b/src/erasure-code/ErasureCode.cc
index 6784fa355cf7d..b0b5e414bc3c1 100644
--- a/src/erasure-code/ErasureCode.cc
+++ b/src/erasure-code/ErasureCode.cc
@@ -352,17 +352,41 @@ int ErasureCode::decode_concat(const map<int, bufferlist> &chunks,
 			      bufferlist *decoded)
 {
   set<int> want_to_read;
+  set<int> decode_chunks;
+  bool need_decode = false;
   for (unsigned int i = 0; i < get_data_chunk_count(); i++) {
     want_to_read.insert(chunk_index(i));
   }
+#if 1
+  if (chunks.size() < get_data_chunk_count()) {
+    // for partial_read
+    for (const auto& [key, bl] : chunks) {
+      if (want_to_read.contains(key)) {
+        decode_chunks.insert(key);
+      } else {
+        need_decode = true;
+        break;
+      }
+    }
+    if (!need_decode) {
+      want_to_read.swap(decode_chunks);
+    }
+  }
+#endif
   map<int, bufferlist> decoded_map;
   int r = _decode(want_to_read, chunks, &decoded_map);
   if (r == 0) {
     for (unsigned int i = 0; i < get_data_chunk_count(); i++) {
-      decoded->claim_append(decoded_map[chunk_index(i)]);
+      if (decoded_map.contains(chunk_index(i))) {
+        decoded->claim_append(decoded_map[chunk_index(i)]);
+      }
     }
   }
   return r;
 }
+
+bool ErasureCode::is_systematic() const {
+  return true;
+}
 }
diff --git a/src/erasure-code/ErasureCode.h b/src/erasure-code/ErasureCode.h
index fd6d1a41f714d..5813109b70038 100644
--- a/src/erasure-code/ErasureCode.h
+++ b/src/erasure-code/ErasureCode.h
@@ -115,6 +115,8 @@ namespace ceph {
     int decode_concat(const std::map<int, bufferlist> &chunks,
 		      bufferlist *decoded) override;
 
+    bool is_systematic() const override;
+
   protected:
     int parse(const ErasureCodeProfile &profile,
 	      std::ostream *ss);
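
To illustrate the decode_concat() fast path added above: in a systematic
code every data chunk is a verbatim slice of the original object, so when
all available chunks are wanted data chunks, want_to_read can be shrunk to
exactly what we hold and reconstruction skipped. Below is a minimal
standalone sketch of that selection logic, not the Ceph code itself:
std::string stands in for bufferlist, a fixed k replaces
get_data_chunk_count(), and chunk_index() is assumed to be the identity.

#include <iostream>
#include <map>
#include <set>
#include <string>

// Illustrative stand-in for get_data_chunk_count().
constexpr unsigned k = 4;

// Decide which chunk set to ask the decoder for, mirroring the shape of
// the patched decode_concat(): with fewer than k chunks present, and all
// of them being wanted data chunks, shrink want_to_read to exactly the
// chunks already in hand and avoid reconstruction entirely.
std::set<int> choose_want_to_read(const std::map<int, std::string>& chunks) {
  std::set<int> want_to_read;
  for (unsigned i = 0; i < k; ++i)
    want_to_read.insert(static_cast<int>(i));

  if (chunks.size() < k) {
    std::set<int> decode_chunks;
    bool need_decode = false;
    for (const auto& [id, bytes] : chunks) {
      if (want_to_read.contains(id)) {
        decode_chunks.insert(id);   // a data chunk we already hold
      } else {
        need_decode = true;         // a parity chunk: a real decode is needed
        break;
      }
    }
    if (!need_decode)
      want_to_read.swap(decode_chunks);
  }
  return want_to_read;
}

int main() {
  // Two of four data chunks present, no parity: no decode required.
  std::map<int, std::string> partial = {{0, "abcd"}, {1, "efgh"}};
  for (int id : choose_want_to_read(partial))
    std::cout << id << ' ';         // prints: 0 1
  std::cout << '\n';
}
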
diff --git a/src/erasure-code/ErasureCodeInterface.h b/src/erasure-code/ErasureCodeInterface.h
index 7107f978dd4fb..2e5c3d91e3ace 100644
--- a/src/erasure-code/ErasureCodeInterface.h
+++ b/src/erasure-code/ErasureCodeInterface.h
@@ -459,6 +459,12 @@ namespace ceph {
      */
     virtual int decode_concat(const std::map<int, bufferlist> &chunks,
 			      bufferlist *decoded) = 0;
+
+
+    /**
+     * @return **true** if the EC plugin's data placement is systematic.
+     */
+    virtual bool is_systematic() const = 0;
   };
 
   typedef std::shared_ptr<ErasureCodeInterface> ErasureCodeInterfaceRef;
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index e2de6dfee9860..4c19cc2bf3b36 100644
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -1233,10 +1233,10 @@ void ECBackend::handle_sub_read_reply(
 	 ++j, ++req_iter, ++riter) {
       ceph_assert(req_iter != rop.to_read.find(i->first)->second.to_read.end());
       ceph_assert(riter != rop.complete[i->first].returned.end());
-      pair<uint64_t, uint64_t> adjusted =
+      pair<uint64_t, uint64_t> aligned =
 	sinfo.aligned_offset_len_to_chunk(
 	  make_pair(req_iter->get<0>(), req_iter->get<1>()));
-      ceph_assert(adjusted.first == j->first);
+      ceph_assert(aligned.first == j->first);
       riter->get<2>()[from] = std::move(j->second);
     }
   }
@@ -1554,9 +1554,14 @@ void ECBackend::objects_read_async(
 	 to_read.begin();
        i != to_read.end();
        ++i) {
+#if 0
     pair<uint64_t, uint64_t> tmp =
       sinfo.offset_len_to_stripe_bounds(
 	make_pair(i->first.get<0>(), i->first.get<1>()));
+#else
+    pair<uint64_t, uint64_t> tmp =
+      make_pair(i->first.get<0>(), i->first.get<1>());
+#endif
     es.union_insert(tmp.first, tmp.second);
     flags |= i->first.get<2>();
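
The reason objects_read_async() can now pass raw extents down is that the
read pipeline picks the alignment itself: full-stripe bounds for a classic
read, chunk bounds for a partial one. Here is a rough standalone sketch of
the two roundings, assuming k = 4 data chunks of 4 KiB; the constants and
function names are illustrative only, not the actual stripe_info_t API:

#include <cstdint>
#include <iostream>
#include <utility>

constexpr uint64_t chunk_size = 4096;            // assumed chunk size
constexpr uint64_t stripe_width = 4 * chunk_size; // assumed k = 4

// Round (off, len) outward to whole stripes: what a full-stripe read does.
std::pair<uint64_t, uint64_t> to_stripe_bounds(uint64_t off, uint64_t len) {
  uint64_t aligned_off = off - (off % stripe_width);
  uint64_t tail = (off - aligned_off) + len;
  uint64_t aligned_len = (tail % stripe_width)
      ? tail - (tail % stripe_width) + stripe_width
      : tail;
  return {aligned_off, aligned_len};
}

// Round (off, len) outward to whole chunks: the partial-read alignment,
// the same arithmetic as the patch's offset_len_to_chunk_bounds().
std::pair<uint64_t, uint64_t> to_chunk_bounds(uint64_t off, uint64_t len) {
  uint64_t aligned_off = off - (off % chunk_size);
  uint64_t tail = (off - aligned_off) + len;
  uint64_t aligned_len = (tail % chunk_size)
      ? tail - (tail % chunk_size) + chunk_size
      : tail;
  return {aligned_off, aligned_len};
}

int main() {
  // A 1000-byte read at offset 5000, inside the first stripe.
  auto [s_off, s_len] = to_stripe_bounds(5000, 1000);
  auto [c_off, c_len] = to_chunk_bounds(5000, 1000);
  std::cout << "stripe-aligned: off=" << s_off << " len=" << s_len << '\n';
  // stripe-aligned: off=0 len=16384 (the whole stripe, all k shards)
  std::cout << "chunk-aligned:  off=" << c_off << " len=" << c_len << '\n';
  // chunk-aligned:  off=4096 len=4096 (one chunk, a single shard)
}
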
diff --git a/src/osd/ECCommon.cc b/src/osd/ECCommon.cc
index 6b16431e01fc0..3fc54dce9f03e 100644
--- a/src/osd/ECCommon.cc
+++ b/src/osd/ECCommon.cc
@@ -298,6 +298,30 @@ int ECCommon::ReadPipeline::get_min_avail_to_read_shards(
   return 0;
 }
 
+void ECCommon::ReadPipeline::get_min_want_to_read_shards(
+  pair<uint64_t, uint64_t> off_len,
+  set<int> *want_to_read)
+{
+  uint64_t off = off_len.first;
+  uint64_t len = off_len.second;
+  uint64_t chunk_size = sinfo.get_chunk_size();
+  int data_chunk_count = sinfo.get_stripe_width() / sinfo.get_chunk_size();
+  const vector<int> &chunk_mapping = ec_impl->get_chunk_mapping();
+
+  int total_chunks = (chunk_size - 1 + len) / chunk_size;
+  int first_chunk = (off / chunk_size) % data_chunk_count;
+
+  if (total_chunks > data_chunk_count) {
+    total_chunks = data_chunk_count;
+  }
+
+  for (int i = 0; i < total_chunks; i++) {
+    int j = (first_chunk + i) % data_chunk_count;
+    int chunk = (int)chunk_mapping.size() > j ? chunk_mapping[j] : j;
+    want_to_read->insert(chunk);
+  }
+}
+
 int ECCommon::ReadPipeline::get_remaining_shards(
   const hobject_t &hoid,
   const set<int> &avail,
@@ -478,11 +502,21 @@ struct ClientReadCompleter : ECCommon::ReadCompleter {
     ceph_assert(res.returned.size() == to_read.size());
     ceph_assert(res.errors.empty());
     for (auto &&read: to_read) {
+#if 0
      pair<uint64_t, uint64_t> adjusted =
        read_pipeline.sinfo.offset_len_to_stripe_bounds(
          make_pair(read.get<0>(), read.get<1>()));
      ceph_assert(res.returned.front().get<0>() == adjusted.first);
      ceph_assert(res.returned.front().get<1>() == adjusted.second);
+#else
+      auto bounds = make_pair(read.get<0>(), read.get<1>());
+      auto aligned = read_pipeline.sinfo.offset_len_to_stripe_bounds(bounds);
+      if (aligned.first != read.get<0>() || aligned.second != read.get<1>()) {
+        aligned = read_pipeline.sinfo.offset_len_to_chunk_bounds(bounds);
+      }
+      ceph_assert(res.returned.front().get<0>() == aligned.first);
+      ceph_assert(res.returned.front().get<1>() == aligned.second);
+#endif
       map<int, bufferlist> to_decode;
       bufferlist bl;
       for (map<pg_shard_t, bufferlist>::iterator j =
@@ -501,11 +535,19 @@ struct ClientReadCompleter : ECCommon::ReadCompleter {
         goto out;
       }
       bufferlist trimmed;
+#if 0
      trimmed.substr_of(
        bl,
        read.get<0>() - adjusted.first,
        std::min(read.get<1>(),
                 bl.length() - (read.get<0>() - adjusted.first)));
+#else
+      // XXX: hmm, just refactor, at least at first glance
+      auto off = read.get<0>() - aligned.first;
+      auto len =
+        std::min(read.get<1>(), bl.length() - (read.get<0>() - aligned.first));
+      trimmed.substr_of(bl, off, len);
+#endif
       result.insert(
         read.get<0>(), trimmed.length(), std::move(trimmed));
       res.returned.pop_front();
@@ -524,6 +566,46 @@ out:
   ECCommon::ClientAsyncReadStatus *status;
 };
 
+bool ECCommon::ReadPipeline::should_partial_read(
+  const hobject_t &hoid,
+  std::list<boost::tuple<uint64_t, uint64_t, uint32_t>> to_read,
+  const std::set<int> &want,
+  bool fast_read,
+  bool for_recovery)
+{
+  // Don't partial read if we are doing a fast_read
+  if (fast_read) {
+    return false;
+  }
+  // Don't partial read if the EC isn't systematic
+  if (!ec_impl->is_systematic()) {
+    return false;
+  }
+  // Don't partial read if we have multiple stripes
+  if (to_read.size() != 1) {
+    return false;
+  }
+  // Only partial read if the length is inside the stripe boundary
+  auto read = to_read.front();
+  auto bounds = make_pair(read.get<0>(), read.get<1>());
+  auto aligned = sinfo.offset_len_to_stripe_bounds(bounds);
+  if (sinfo.get_stripe_width() != aligned.second) {
+    return false;
+  }
+
+  set<int> have;
+  map<shard_id_t, pg_shard_t> shards;
+  set<pg_shard_t> error_shards;
+  get_all_avail_shards(hoid, error_shards, have, shards, for_recovery);
+
+  set<int> data_shards;
+  get_want_to_read_shards(&data_shards);
+
+  return includes(data_shards.begin(), data_shards.end(),
+                  want.begin(), want.end())
+    && includes(have.begin(), have.end(), want.begin(), want.end());
+}
+
 void ECCommon::ReadPipeline::objects_read_and_reconstruct(
   const map<hobject_t, std::list<boost::tuple<uint64_t, uint64_t, uint32_t>>> &reads,
   bool fast_read,
   GenContextURef<ECCommon::ec_extents_t &&> &&func)
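
should_partial_read() only green-lights the optimization;
get_min_want_to_read_shards() (added earlier in this file) then picks the
minimal shard set. A standalone sketch of that arithmetic follows, with
the stripe parameters passed in explicitly for illustration; the real code
pulls them from stripe_info_t and the plugin's get_chunk_mapping():

#include <cstdint>
#include <iostream>
#include <set>
#include <vector>

// Given a chunk-aligned extent, select only the data shards it touches.
void min_want_to_read_shards(uint64_t off, uint64_t len,
                             uint64_t chunk_size, int data_chunk_count,
                             const std::vector<int>& chunk_mapping,
                             std::set<int>* want_to_read) {
  // Number of chunks the extent touches, capped at k.
  int total_chunks = static_cast<int>((chunk_size - 1 + len) / chunk_size);
  if (total_chunks > data_chunk_count)
    total_chunks = data_chunk_count;

  // The data chunk the extent starts in; walk round-robin from there.
  int first_chunk = static_cast<int>((off / chunk_size) % data_chunk_count);
  for (int i = 0; i < total_chunks; ++i) {
    int j = (first_chunk + i) % data_chunk_count;
    // Honor the plugin's logical-to-physical chunk mapping, if it has one.
    int chunk =
        static_cast<int>(chunk_mapping.size()) > j ? chunk_mapping[j] : j;
    want_to_read->insert(chunk);
  }
}

int main() {
  // k = 4 data chunks of 4 KiB: a chunk-aligned 8 KiB read at offset 20480
  // (stripe 1, chunk 1) needs only shards 1 and 2 instead of all four.
  std::set<int> want;
  min_want_to_read_shards(20480, 8192, 4096, 4, /*chunk_mapping=*/{}, &want);
  for (int s : want)
    std::cout << s << ' ';  // prints: 1 2
  std::cout << '\n';
}
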
@@ -537,11 +619,49 @@ void ECCommon::ReadPipeline::objects_read_and_reconstruct(
   }
 
   map<hobject_t, set<int>> obj_want_to_read;
-  set<int> want_to_read;
-  get_want_to_read_shards(&want_to_read);
 
   map<hobject_t, read_request_t> for_read_op;
   for (auto &&to_read: reads) {
+    set<int> want_to_read;
+    get_want_to_read_shards(&want_to_read);
+#if 1
+    std::list<boost::tuple<uint64_t, uint64_t, uint32_t>> align_offsets;
+    bool partial_read = should_partial_read(
+      to_read.first,
+      to_read.second,
+      want_to_read,
+      fast_read,
+      false);
+
+    // Our extent set and flags
+    extent_set es;
+    uint32_t flags = 0;
+
+    for (auto read : to_read.second) {
+      auto bounds = make_pair(read.get<0>(), read.get<1>());
+
+      // By default, align to the stripe
+      auto aligned = sinfo.offset_len_to_stripe_bounds(bounds);
+      if (partial_read) {
+        // align to the chunk instead
+        aligned = sinfo.offset_len_to_chunk_bounds(bounds);
+        set<int> new_want_to_read;
+        get_min_want_to_read_shards(aligned, &new_want_to_read);
+        want_to_read = new_want_to_read;
+      }
+      // Add the new extents/flags
+      extent_set new_es;
+      new_es.insert(aligned.first, aligned.second);
+      es.union_of(new_es);
+      flags |= read.get<2>();
+    }
+    if (!es.empty()) {
+      for (auto e = es.begin(); e != es.end(); ++e) {
+        align_offsets.push_back(
+          boost::make_tuple(e.get_start(), e.get_len(), flags));
+      }
+    }
+#endif
     map<pg_shard_t, vector<pair<int, int>>> shards;
     int r = get_min_avail_to_read_shards(
       to_read.first,
@@ -555,9 +675,14 @@ void ECCommon::ReadPipeline::objects_read_and_reconstruct(
       make_pair(
 	to_read.first,
 	read_request_t(
+#if 1
+	  align_offsets,
+#else
 	  to_read.second,
+#endif
 	  shards,
-	  false)));
+	  false,
+	  partial_read)));
     obj_want_to_read.insert(make_pair(to_read.first, want_to_read));
   }
@@ -589,6 +714,19 @@ int ECCommon::ReadPipeline::send_all_remaining_reads(
   list<boost::tuple<uint64_t, uint64_t, uint32_t>> to_read =
     rop.to_read.find(hoid)->second.to_read;
 
+  bool partial_read = rop.to_read.find(hoid)->second.partial_read;
+  // realign the offset and len to make partial reads normal reads.
+  if (partial_read) {
+    list<boost::tuple<uint64_t, uint64_t, uint32_t>> new_to_read;
+    for (const auto& read : to_read) {
+      auto bounds = make_pair(read.get<0>(), read.get<1>());
+      auto aligned = sinfo.offset_len_to_stripe_bounds(bounds);
+      new_to_read.push_back(
+        boost::make_tuple(aligned.first, aligned.second, read.get<2>()));
+    }
+    to_read = new_to_read;
+  }
+
   // (Note cuixf) If we need to read attrs and we read failed, try to read again.
   bool want_attrs =
     rop.to_read.find(hoid)->second.want_attrs &&
@@ -604,6 +742,9 @@ int ECCommon::ReadPipeline::send_all_remaining_reads(
 	to_read,
 	shards,
 	want_attrs)));
+  if (partial_read) {
+    rop.refresh_complete(hoid);
+  }
   return 0;
 }
diff --git a/src/osd/ECCommon.h b/src/osd/ECCommon.h
index 7112080cb2a82..a27c2e7065958 100644
--- a/src/osd/ECCommon.h
+++ b/src/osd/ECCommon.h
@@ -231,11 +231,13 @@ struct ECCommon {
     const std::list<boost::tuple<uint64_t, uint64_t, uint32_t>> to_read;
     std::map<pg_shard_t, std::vector<std::pair<int, int>>> need;
     bool want_attrs;
+    bool partial_read;
     read_request_t(
       const std::list<boost::tuple<uint64_t, uint64_t, uint32_t>> &to_read,
       const std::map<pg_shard_t, std::vector<std::pair<int, int>>> &need,
-      bool want_attrs)
-      : to_read(to_read), need(need), want_attrs(want_attrs) {}
+      bool want_attrs,
+      bool partial_read=false)
+      : to_read(to_read), need(need), want_attrs(want_attrs), partial_read(partial_read) {}
   };
   friend std::ostream &operator<<(std::ostream &lhs, const read_request_t &rhs);
   struct ReadOp;
@@ -364,6 +366,22 @@ struct ECCommon {
     ReadOp() = delete;
     ReadOp(const ReadOp &) = delete; // due to on_complete being unique_ptr
     ReadOp(ReadOp &&) = default;
+
+    void refresh_complete(const hobject_t &hoid) {
+      std::list<
+	boost::tuple<
+	  uint64_t, uint64_t, std::map<pg_shard_t, bufferlist>>> new_returned;
+      auto returned = complete[hoid].returned;
+      auto reads = to_read.find(hoid)->second.to_read;
+
+      auto r = returned.begin();
+      for (auto read : reads) {
+	new_returned.push_back(
+	  boost::make_tuple(read.get<0>(), read.get<1>(), r->get<2>()));
+	++r;
+      }
+      complete[hoid].returned = new_returned;
+    }
   };
   struct ReadPipeline {
     void objects_read_and_reconstruct(
@@ -371,6 +389,13 @@ struct ECCommon {
       bool fast_read,
       GenContextURef<ec_extents_t &&> &&func);
 
+    bool should_partial_read(
+      const hobject_t &hoid,
+      std::list<boost::tuple<uint64_t, uint64_t, uint32_t>> to_read,
+      const std::set<int> &want,
+      bool fast_read,
+      bool for_recovery);
+
     template <typename F>
     void filter_read_op(
       const OSDMapRef& osdmap,
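
refresh_complete() above exists because a partial read that hits a shard
error is retried as a stripe-aligned read: send_all_remaining_reads()
realigns the extents in to_read, so the completion records keyed by the
old chunk-aligned extents must be re-filed under the new ones. Here is a
simplified standalone sketch of that bookkeeping, with std::tuple,
std::string, and int standing in for boost::tuple, bufferlist, and
pg_shard_t; all of these are stand-ins, not the real types:

#include <cstdint>
#include <iostream>
#include <list>
#include <map>
#include <string>
#include <tuple>

using shard_map  = std::map<int, std::string>;
using returned_t = std::list<std::tuple<uint64_t, uint64_t, shard_map>>;
using reads_t    = std::list<std::tuple<uint64_t, uint64_t, uint32_t>>;

// Re-key each completion record to the realigned extent while keeping the
// per-shard data gathered so far, as ReadOp::refresh_complete() does.
returned_t refresh_complete(const reads_t& realigned_reads,
                            const returned_t& returned) {
  returned_t out;
  auto r = returned.begin();
  for (const auto& [off, len, flags] : realigned_reads) {
    // File the buffered shard data under the extent the retry will read.
    out.emplace_back(off, len, std::get<2>(*r));
    ++r;
  }
  return out;
}

int main() {
  // One partial read of chunk (4096, 4096), realigned to stripe (0, 16384).
  returned_t before;
  before.emplace_back(4096, 4096, shard_map{{1, "chunk1-bytes"}});
  reads_t realigned;
  realigned.emplace_back(0, 16384, 0);
  for (const auto& [off, len, shards] : refresh_complete(realigned, before))
    std::cout << "extent " << off << "+" << len
              << " keeps " << shards.size() << " shard buffer(s)\n";
}
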
@@ -430,6 +455,28 @@ struct ECCommon {
       parent(parent) {
     }
 
+    /**
+     * The basic idea here is that we only call this function when there is
+     * a partial read. The criteria for performing a partial read involve
+     * checking to make sure that we don't read across multiple stripes and
+     * that the read is within the stripe boundary (see should_partial_read).
+     *
+     * While get_want_to_read_shards creates a want_to_read based on the EC
+     * plugin's get_data_chunk_count(), this instead uses the number of
+     * chunks necessary to read the length of data and only inserts those
+     * chunks. Just like in get_want_to_read_shards, we check the plugin's
+     * mapping, but the difference is that we start at first_chunk and we
+     * end when we no longer have chunks based on the read's length.
+     *
+     * The resulting want_to_read has fewer chunks than a normal read, and
+     * thus gets intercepted in ErasureCode::decode_concat to be handled
+     * differently than when get_want_to_read_shards is used and we decode
+     * all data chunks.
+     */
+    void get_min_want_to_read_shards(
+      std::pair<uint64_t, uint64_t> off_len, ///< [in]
+      std::set<int> *want_to_read            ///< [out]
+    );
+
     int get_remaining_shards(
       const hobject_t &hoid,
       const std::set<int> &avail,
diff --git a/src/osd/ECUtil.cc b/src/osd/ECUtil.cc
index 94b32845847a6..e1cb9de43e09b 100644
--- a/src/osd/ECUtil.cc
+++ b/src/osd/ECUtil.cc
@@ -41,7 +41,9 @@ int ECUtil::decode(
     bufferlist bl;
     int r = ec_impl->decode_concat(chunks, &bl);
     ceph_assert(r == 0);
+#if 0 //ndef EC_PARTIAL_READ
     ceph_assert(bl.length() == sinfo.get_stripe_width());
+#endif
     out->claim_append(bl);
   }
   return 0;
diff --git a/src/osd/ECUtil.h b/src/osd/ECUtil.h
index dce78b8a8683e..28c2f3bd69da4 100644
--- a/src/osd/ECUtil.h
+++ b/src/osd/ECUtil.h
@@ -66,9 +66,11 @@ public:
   }
   std::pair<uint64_t, uint64_t> aligned_offset_len_to_chunk(
     std::pair<uint64_t, uint64_t> in) const {
+    // we need to align to stripe again to deal with partial chunk read.
+    std::pair<uint64_t, uint64_t> aligned = offset_len_to_stripe_bounds(in);
     return std::make_pair(
-      aligned_logical_offset_to_chunk_offset(in.first),
-      aligned_logical_offset_to_chunk_offset(in.second));
+      aligned_logical_offset_to_chunk_offset(aligned.first),
+      aligned_logical_offset_to_chunk_offset(aligned.second));
   }
   std::pair<uint64_t, uint64_t> offset_len_to_stripe_bounds(
     std::pair<uint64_t, uint64_t> in) const {
@@ -77,6 +79,15 @@ public:
       (in.first - off) + in.second);
     return std::make_pair(off, len);
   }
+  std::pair<uint64_t, uint64_t> offset_len_to_chunk_bounds(
+    std::pair<uint64_t, uint64_t> in) const {
+    uint64_t off = in.first - (in.first % chunk_size);
+    uint64_t tmp_len = (in.first - off) + in.second;
+    uint64_t len = ((tmp_len % chunk_size) ?
+      (tmp_len - (tmp_len % chunk_size) + chunk_size) :
+      tmp_len);
+    return std::make_pair(off, len);
+  }
 };
 
 int decode(
diff --git a/src/test/erasure-code/TestErasureCodeExample.cc b/src/test/erasure-code/TestErasureCodeExample.cc
index b488a604b61e7..9fcf5c3632885 100644
--- a/src/test/erasure-code/TestErasureCodeExample.cc
+++ b/src/test/erasure-code/TestErasureCodeExample.cc
@@ -197,9 +197,14 @@ TEST(ErasureCodeExample, decode)
   usable.substr_of(out, 0, in.length());
   EXPECT_TRUE(usable == in);
 
+  // partial chunk decode
+  map<int, bufferlist> partial_decode;
+  partial_decode[0] = encoded[0];
+  EXPECT_EQ(0, example.decode_concat(partial_decode, &out));
+
   // cannot recover
   map<int, bufferlist> degraded;
-  degraded[0] = encoded[0];
+  degraded[2] = encoded[2];
   EXPECT_EQ(-ERANGE, example.decode_concat(degraded, &out));
 }
-- 
2.39.5