git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: Introduce optimized EC (62556/head)
author     Alex Ainscow <aainscow@uk.ibm.com>
           Thu, 3 Apr 2025 13:47:28 +0000 (14:47 +0100)
committer  Alex Ainscow <aainscow@uk.ibm.com>
           Tue, 22 Apr 2025 07:42:41 +0000 (08:42 +0100)
Signed-off-by: Alex Ainscow <aainscow@uk.ibm.com>
20 files changed:
src/osd/CMakeLists.txt
src/osd/ECBackend.cc
src/osd/ECBackend.h
src/osd/ECCommon.cc
src/osd/ECCommon.h
src/osd/ECExtentCache.cc [new file with mode: 0644]
src/osd/ECExtentCache.h
src/osd/ECTransaction.cc
src/osd/ECTransaction.h
src/osd/ECUtil.cc
src/osd/ECUtil.h
src/osd/ExtentCache.cc [deleted file]
src/osd/ExtentCache.h [deleted file]
src/test/erasure-code/TestErasureCodePluginJerasure.cc
src/test/osd/CMakeLists.txt
src/test/osd/TestECBackend.cc
src/test/osd/TestECUtil.cc [new file with mode: 0644]
src/test/osd/test_ec_transaction.cc
src/test/osd/test_extent_cache.cc
src/tools/erasure-code/ceph-erasure-code-tool.cc

index e7f579f38410cbf0a095955012c982e847e7d825..d350607353478f4e0ca18e502046ea7aa00ac724 100644 (file)
@@ -48,7 +48,7 @@ set(osd_srcs
   ECUtilL.cc
   ECCommon.cc
   ECBackend.cc
-  ExtentCache.cc
+  ECExtentCache.cc
   ECTransaction.cc
   ECUtil.cc
   ECInject.cc
index 63dfc99015d74c90e073e6530df231dd37b2f322..88e985c77af6f8e48c64682be5d1a8854fddcd86 100644 (file)
@@ -29,7 +29,6 @@
 #include "ECSwitch.h"
 
 #include "PrimaryLogPG.h"
-#include "osd_tracer.h"
 
 #define dout_context cct
 #define dout_subsys ceph_subsys_osd
@@ -56,11 +55,11 @@ using ceph::bufferptr;
 using ceph::ErasureCodeInterfaceRef;
 using ceph::Formatter;
 
-static ostream& _prefix(std::ostream *_dout, ECBackend *pgb) {
+static ostream &_prefix(std::ostream *_dout, ECBackend *pgb) {
   return pgb->get_parent()->gen_dbg_prefix(*_dout);
 }
 
-static ostream& _prefix(std::ostream *_dout, ECBackend::RecoveryBackend *pgb) {
+static ostream &_prefix(std::ostream *_dout, ECBackend::RecoveryBackend *pgb) {
   return pgb->get_parent()->gen_dbg_prefix(*_dout);
 }
 
@@ -68,50 +67,7 @@ struct ECRecoveryHandle : public PGBackend::RecoveryHandle {
   list<ECBackend::RecoveryBackend::RecoveryOp> ops;
 };
 
-static ostream &operator<<(ostream &lhs, const map<pg_shard_t, bufferlist> &rhs)
-{
-  lhs << "[";
-  for (map<pg_shard_t, bufferlist>::const_iterator i = rhs.begin();
-       i != rhs.end();
-       ++i) {
-    if (i != rhs.begin())
-      lhs << ", ";
-    lhs << make_pair(i->first, i->second.length());
-  }
-  return lhs << "]";
-}
-
-static ostream &operator<<(ostream &lhs, const map<int, bufferlist> &rhs)
-{
-  lhs << "[";
-  for (map<int, bufferlist>::const_iterator i = rhs.begin();
-       i != rhs.end();
-       ++i) {
-    if (i != rhs.begin())
-      lhs << ", ";
-    lhs << make_pair(i->first, i->second.length());
-  }
-  return lhs << "]";
-}
-
-ostream &operator<<(ostream &lhs, const ECBackend::RecoveryBackend::RecoveryOp &rhs)
-{
-  return lhs << "RecoveryOp("
-            << "hoid=" << rhs.hoid
-            << " v=" << rhs.v
-            << " missing_on=" << rhs.missing_on
-            << " missing_on_shards=" << rhs.missing_on_shards
-            << " recovery_info=" << rhs.recovery_info
-            << " recovery_progress=" << rhs.recovery_progress
-            << " obc refcount=" << rhs.obc.use_count()
-            << " state=" << ECBackend::RecoveryBackend::RecoveryOp::tostr(rhs.state)
-            << " waiting_on_pushes=" << rhs.waiting_on_pushes
-            << " extent_requested=" << rhs.extent_requested
-            << ")";
-}
-
-void ECBackend::RecoveryBackend::RecoveryOp::dump(Formatter *f) const
-{
+void ECBackend::RecoveryBackend::RecoveryOp::dump(Formatter *f) const {
   f->dump_stream("hoid") << hoid;
   f->dump_stream("v") << v;
   f->dump_stream("missing_on") << missing_on;
@@ -120,7 +76,6 @@ void ECBackend::RecoveryBackend::RecoveryOp::dump(Formatter *f) const
   f->dump_stream("recovery_progress") << recovery_progress;
   f->dump_stream("state") << tostr(state);
   f->dump_stream("waiting_on_pushes") << waiting_on_pushes;
-  f->dump_stream("extent_requested") << extent_requested;
 }
 
 ECBackend::ECBackend(
@@ -129,32 +84,38 @@ ECBackend::ECBackend(
   ErasureCodeInterfaceRef ec_impl,
   uint64_t stripe_width,
   ECSwitch *s,
-  ECExtentCache::LRU &ignored)
+  ECExtentCache::LRU &ec_extent_cache_lru)
   : parent(pg), cct(cct), switcher(s),
     read_pipeline(cct, ec_impl, this->sinfo, get_parent()->get_eclistener()),
-    rmw_pipeline(cct, ec_impl, this->sinfo, get_parent()->get_eclistener(), *this),
-    recovery_backend(cct, switcher->coll, ec_impl, this->sinfo, read_pipeline, unstable_hashinfo_registry, get_parent(), this),
+    rmw_pipeline(cct, ec_impl, this->sinfo, get_parent()->get_eclistener(),
+                 *this, ec_extent_cache_lru),
+    recovery_backend(cct, switcher->coll, ec_impl, this->sinfo, read_pipeline,
+                     unstable_hashinfo_registry, get_parent(), this),
     ec_impl(ec_impl),
-    sinfo(ec_impl, stripe_width),
+    sinfo(ec_impl, &(get_parent()->get_pool()), stripe_width),
     unstable_hashinfo_registry(cct, ec_impl) {
+
+  /* EC makes some assumptions about how the plugin organises the *data* shards:
+   * - The chunk size is constant for a particular profile.
+   * - A stripe consists of k chunks.
+   */
   ceph_assert((ec_impl->get_data_chunk_count() *
-         ec_impl->get_chunk_size(stripe_width)) == stripe_width);
+    ec_impl->get_chunk_size(stripe_width)) == stripe_width);
 }
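For context, the new constructor comment pins down the geometry the backend relies on: a stripe is k data chunks of equal size, so stripe_width == k * chunk_size. A minimal stand-alone sketch of that invariant (illustrative names only, not the plugin API):

    #include <cassert>
    #include <cstdint>

    // Illustrative only: a stripe of width W is split across k data shards,
    // each holding one chunk of W / k bytes. The constructor assert above
    // checks exactly this relationship for the loaded EC plugin.
    struct stripe_geometry {
      uint64_t stripe_width; // bytes of logical data per stripe
      unsigned k;            // data chunks per stripe

      uint64_t chunk_size() const { return stripe_width / k; }
    };

    int main() {
      stripe_geometry g{8192, 4}; // e.g. an 8 KiB stripe over k = 4
      assert(g.k * g.chunk_size() == g.stripe_width);
    }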
 
-PGBackend::RecoveryHandle *ECBackend::open_recovery_op()
-{
+PGBackend::RecoveryHandle *ECBackend::open_recovery_op() {
   return recovery_backend.open_recovery_op();
 }
 
 ECBackend::RecoveryBackend::RecoveryBackend(
-  CephContext* cct,
+  CephContext *cct,
   const coll_t &coll,
   ceph::ErasureCodeInterfaceRef ec_impl,
-  const ECUtil::stripe_info_t& sinfo,
-  ReadPipeline& read_pipeline,
-  UnstableHashInfoRegistry& unstable_hashinfo_registry,
-  ECListener* parent,
-  ECBackend* ecbackend)
+  const ECUtil::stripe_info_t &sinfo,
+  ReadPipeline &read_pipeline,
+  UnstableHashInfoRegistry &unstable_hashinfo_registry,
+  ECListener *parent,
+  ECBackend *ecbackend)
   : cct(cct),
     coll(coll),
     ec_impl(std::move(ec_impl)),
@@ -162,16 +123,14 @@ ECBackend::RecoveryBackend::RecoveryBackend(
     read_pipeline(read_pipeline),
     unstable_hashinfo_registry(unstable_hashinfo_registry),
     parent(parent),
-    ecbackend(ecbackend) {
-}
+    ecbackend(ecbackend) {}
 
-PGBackend::RecoveryHandle *ECBackend::RecoveryBackend::open_recovery_op()
-{
+PGBackend::RecoveryHandle *ECBackend::RecoveryBackend::open_recovery_op() {
   return new ECRecoveryHandle;
 }
 
-void ECBackend::RecoveryBackend::_failed_push(const hobject_t &hoid, ECCommon::read_result_t &res)
-{
+void ECBackend::RecoveryBackend::_failed_push(const hobject_t &hoid,
+                                              ECCommon::read_result_t &res) {
   dout(10) << __func__ << ": Read error " << hoid << " r="
           << res.r << " errors=" << res.errors << dendl;
   dout(10) << __func__ << ": canceling recovery op for obj " << hoid
@@ -181,70 +140,55 @@ void ECBackend::RecoveryBackend::_failed_push(const hobject_t &hoid, ECCommon::r
   recovery_ops.erase(hoid);
 
   set<pg_shard_t> fl;
-  for (auto&& i : res.errors) {
+  for (auto &&i: res.errors) {
     fl.insert(i.first);
   }
   get_parent()->on_failed_pull(fl, hoid, v);
 }
 
 struct RecoveryMessages {
-  map<hobject_t,
-      ECCommon::read_request_t> recovery_reads;
-  map<hobject_t, set<int>> want_to_read;
-
-  void recovery_read(
-    const hobject_t &hoid, uint64_t off, uint64_t len,
-    set<int> &&_want_to_read,
-    const map<pg_shard_t, vector<pair<int, int>>> &need,
-    bool attrs)
-  {
-    list<ec_align_t> to_read;
-    to_read.emplace_back(ec_align_t{off, len, 0});
+  map<hobject_t, ECCommon::read_request_t> recovery_reads;
+
+  void recovery_read(const hobject_t &hoid,
+                     const ECCommon::read_request_t &read_request) {
     ceph_assert(!recovery_reads.count(hoid));
-    want_to_read.insert(make_pair(hoid, std::move(_want_to_read)));
-    recovery_reads.insert(
-      make_pair(
-       hoid,
-       ECCommon::read_request_t(
-         to_read,
-         need,
-         attrs)));
-  }
-
-  map<pg_shard_t, vector<PushOp> > pushes;
-  map<pg_shard_t, vector<PushReplyOp> > push_replies;
+    recovery_reads.insert(make_pair(hoid, read_request));
+  }
+
+  map<pg_shard_t, vector<PushOp>> pushes;
+  map<pg_shard_t, vector<PushReplyOp>> push_replies;
   ObjectStore::Transaction t;
 };
 
 void ECBackend::handle_recovery_push(
   const PushOp &op,
   RecoveryMessages *m,
-  bool is_repair)
-{
+  bool is_repair) {
   if (get_parent()->pg_is_remote_backfilling()) {
     get_parent()->pg_add_local_num_bytes(op.data.length());
-    get_parent()->pg_add_num_bytes(op.data.length() * get_ec_data_chunk_count());
+    get_parent()->pg_add_num_bytes(op.data.length() * sinfo.get_k());
     dout(10) << __func__ << " " << op.soid
              << " add new actual data by " << op.data.length()
-             << " add new num_bytes by " << op.data.length() * get_ec_data_chunk_count()
+             << " add new num_bytes by " << op.data.length() * sinfo.get_k()
              << dendl;
   }
 
   recovery_backend.handle_recovery_push(op, m, is_repair);
 
   if (op.after_progress.data_complete &&
-     !(get_parent()->pgb_is_primary()) &&
-     get_parent()->pg_is_remote_backfilling()) {
+    !(get_parent()->pgb_is_primary()) &&
+    get_parent()->pg_is_remote_backfilling()) {
     struct stat st;
-    int r = switcher->store->stat(switcher->ch, ghobject_t(op.soid, ghobject_t::NO_GEN,
-                        get_parent()->whoami_shard().shard), &st);
+    int r = switcher->store->stat(switcher->ch, ghobject_t(
+                                    op.soid, ghobject_t::NO_GEN,
+                                    get_parent()->whoami_shard().shard), &st);
     if (r == 0) {
       get_parent()->pg_sub_local_num_bytes(st.st_size);
       // XXX: This can be way overestimated for small objects
-      get_parent()->pg_sub_num_bytes(st.st_size * get_ec_data_chunk_count());
+      get_parent()->pg_sub_num_bytes(st.st_size * sinfo.get_k());
       dout(10) << __func__ << " " << op.soid
                << " sub actual data by " << st.st_size
-               << " sub num_bytes by " << st.st_size * get_ec_data_chunk_count()
+               << " sub num_bytes by " << st.st_size * sinfo.get_k()
                << dendl;
     }
   }
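The num_bytes bookkeeping above multiplies a single shard's length by k because each data shard stores 1/k of the logical object. A worked example of the arithmetic, assuming k = 4 (in the patch k comes from sinfo.get_k()); as the XXX comment notes, the st_size * k form can overestimate small objects whose shards are padded out to a full chunk:

    #include <cstdint>

    int main() {
      // One 8 KiB PushOp to a single shard, with k = 4 data shards:
      uint64_t shard_bytes = 8 * 1024;     // op.data.length()
      uint64_t logical = shard_bytes * 4;  // what pg_add_num_bytes() records
      return logical == 32 * 1024 ? 0 : 1; // 32 KiB of logical bytes
    }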
@@ -253,10 +197,10 @@ void ECBackend::handle_recovery_push(
 void ECBackend::RecoveryBackend::handle_recovery_push(
   const PushOp &op,
   RecoveryMessages *m,
-  bool is_repair)
-{
+  bool is_repair) {
   if (get_parent()->check_failsafe_full()) {
-    dout(10) << __func__ << " Out of space (failsafe) processing push request." << dendl;
+    dout(10) << __func__ << " Out of space (failsafe) processing push request."
+             << dendl;
     ceph_abort();
   }
 
@@ -264,12 +208,12 @@ void ECBackend::RecoveryBackend::handle_recovery_push(
   ghobject_t tobj;
   if (oneshot) {
     tobj = ghobject_t(op.soid, ghobject_t::NO_GEN,
-                     get_parent()->whoami_shard().shard);
+                      get_parent()->whoami_shard().shard);
   } else {
     tobj = ghobject_t(get_parent()->get_temp_recovery_object(op.soid,
-                                                            op.version),
-                     ghobject_t::NO_GEN,
-                     get_parent()->whoami_shard().shard);
+                                                             op.version),
+                      ghobject_t::NO_GEN,
+                      get_parent()->whoami_shard().shard);
     if (op.before_progress.first) {
       dout(10) << __func__ << ": Adding oid "
               << tobj.hobj << " in the temp collection" << dendl;
@@ -310,11 +254,12 @@ void ECBackend::RecoveryBackend::handle_recovery_push(
             << tobj.hobj << " from the temp collection" << dendl;
     clear_temp_obj(tobj.hobj);
     m->t.remove(coll, ghobject_t(
-       op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+                  op.soid, ghobject_t::NO_GEN,
+                  get_parent()->whoami_shard().shard));
     m->t.collection_move_rename(
       coll, tobj,
       coll, ghobject_t(
-       op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+        op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
   }
   if (op.after_progress.data_complete) {
     if ((get_parent()->pgb_is_primary())) {
@@ -323,21 +268,21 @@ void ECBackend::RecoveryBackend::handle_recovery_push(
       if (get_parent()->pg_is_repair() || is_repair)
         get_parent()->inc_osd_stat_repaired();
       get_parent()->on_local_recover(
-       op.soid,
-       op.recovery_info,
-       recovery_ops[op.soid].obc,
-       false,
-       &m->t);
+        op.soid,
+        op.recovery_info,
+        recovery_ops[op.soid].obc,
+        false,
+        &m->t);
     } else {
       // If primary told us this is a repair, bump osd_stat_t::num_objects_repaired
       if (is_repair)
         get_parent()->inc_osd_stat_repaired();
       get_parent()->on_local_recover(
-       op.soid,
-       op.recovery_info,
-       ObjectContextRef(),
-       false,
-       &m->t);
+        op.soid,
+        op.recovery_info,
+        ObjectContextRef(),
+        false,
+        &m->t);
     }
   }
   m->push_replies[get_parent()->primary_shard()].push_back(PushReplyOp());
@@ -347,8 +292,7 @@ void ECBackend::RecoveryBackend::handle_recovery_push(
 void ECBackend::RecoveryBackend::handle_recovery_push_reply(
   const PushReplyOp &op,
   pg_shard_t from,
-  RecoveryMessages *m)
-{
+  RecoveryMessages *m) {
   if (!recovery_ops.count(op.soid))
     return;
   RecoveryOp &rop = recovery_ops[op.soid];
@@ -359,35 +303,14 @@ void ECBackend::RecoveryBackend::handle_recovery_push_reply(
 
 void ECBackend::RecoveryBackend::handle_recovery_read_complete(
   const hobject_t &hoid,
-  boost::tuple<uint64_t, uint64_t, map<pg_shard_t, bufferlist> > &to_read,
-  std::optional<map<string, bufferlist, less<>> > attrs,
-  RecoveryMessages *m)
-{
-  dout(10) << __func__ << ": returned " << hoid << " "
-          << "(" << to_read.get<0>()
-          << ", " << to_read.get<1>()
-          << ", " << to_read.get<2>()
-          << ")"
-          << dendl;
-  ceph_assert(recovery_ops.count(hoid));
+  ECUtil::shard_extent_map_t &&buffers_read,
+  std::optional<map<string, bufferlist, less<>>> attrs,
+  const ECUtil::shard_extent_set_t &want_to_read,
+  RecoveryMessages *m) {
+  dout(10) << __func__ << ": returned " << hoid << " " << buffers_read << dendl;
+  ceph_assert(recovery_ops.contains(hoid));
   RecoveryBackend::RecoveryOp &op = recovery_ops[hoid];
-  ceph_assert(op.returned_data.empty());
-  map<int, bufferlist*> target;
-  for (set<shard_id_t>::iterator i = op.missing_on_shards.begin();
-       i != op.missing_on_shards.end();
-       ++i) {
-    target[static_cast<int>(*i)] = &(op.returned_data[static_cast<int>(*i)]);
-  }
-  map<int, bufferlist> from;
-  for(map<pg_shard_t, bufferlist>::iterator i = to_read.get<2>().begin();
-      i != to_read.get<2>().end();
-      ++i) {
-    from[static_cast<int>(i->first.shard)] = std::move(i->second);
-  }
-  dout(10) << __func__ << ": " << from << dendl;
-  int r;
-  r = ECUtil::decode(sinfo, ec_impl, from, target);
-  ceph_assert(r == 0);
+
   if (attrs) {
     op.xattrs.swap(*attrs);
 
@@ -413,99 +336,145 @@ void ECBackend::RecoveryBackend::handle_recovery_read_complete(
       op.recovery_info.oi = op.obc->obs.oi;
     }
 
-    ECUtil::HashInfo hinfo(ec_impl->get_chunk_count());
-    if (op.obc->obs.oi.size > 0) {
-      ceph_assert(op.xattrs.count(ECUtil::get_hinfo_key()));
-      auto bp = op.xattrs[ECUtil::get_hinfo_key()].cbegin();
-      decode(hinfo, bp);
+    if (sinfo.require_hinfo()) {
+      ECUtil::HashInfo hinfo(sinfo.get_k_plus_m());
+      if (op.obc->obs.oi.size > 0) {
+        ceph_assert(op.xattrs.count(ECUtil::get_hinfo_key()));
+        auto bp = op.xattrs[ECUtil::get_hinfo_key()].cbegin();
+        decode(hinfo, bp);
+      }
+      op.hinfo = unstable_hashinfo_registry.maybe_put_hash_info(
+        hoid, std::move(hinfo));
     }
-    op.hinfo = unstable_hashinfo_registry.maybe_put_hash_info(hoid, std::move(hinfo));
   }
   ceph_assert(op.xattrs.size());
   ceph_assert(op.obc);
+
+  op.returned_data.emplace(std::move(buffers_read));
+
+  ECUtil::shard_extent_set_t read_mask(sinfo.get_k_plus_m());
+  sinfo.ro_size_to_read_mask(op.recovery_info.size, read_mask);
+  ECUtil::shard_extent_set_t shard_want_to_read(sinfo.get_k_plus_m());
+
+  for (auto &[shard, eset] : want_to_read) {
+    /* Read buffers do not need recovering! */
+    if (buffers_read.contains(shard)) {
+      continue;
+    }
+
+    /* Read-buffers will be truncated to the end-of-object. Do not attempt
+     * to recover off-the-end.
+     */
+    shard_want_to_read[shard].intersection_of(read_mask.get(shard), eset);
+
+    /* Some shards may be empty */
+    if (shard_want_to_read[shard].empty()) {
+      shard_want_to_read.erase(shard);
+    }
+  }
+
+  uint64_t aligned_size = ECUtil::align_page_next(op.obc->obs.oi.size);
+
+  int r = op.returned_data->decode(ec_impl, shard_want_to_read, aligned_size);
+  ceph_assert(r == 0);
+  // We are never appending here, so we never need hinfo.
+  op.returned_data->insert_parity_buffers();
+  r = op.returned_data->encode(ec_impl, NULL, 0);
+  ceph_assert(r == 0);
+
+  // Finally, we don't want to write any padding, so truncate the buffer
+  // to remove it.
+  op.returned_data->erase_after_ro_offset(aligned_size);
+
+  for (auto &&shard: op.missing_on_shards) {
+    if (read_mask.contains(shard) && op.returned_data->contains_shard(shard)) {
+      ceph_assert(read_mask.at(shard).range_end() >=
+        op.returned_data->get_extent_map(shard).get_end_off());
+    }
+  }
+
+  dout(20) << __func__ << ": oid=" << op.hoid << " "
+           << op.returned_data->debug_string(2048, 8) << dendl;
+
   continue_recovery_op(op, m);
 }
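The masking logic above has two jobs: skip shards whose buffers came back intact (read data needs no recovery) and clip the wanted ranges to what the object can actually contain, so decode never runs past end-of-object. A simplified sketch of that set algebra, using plain [off, end) pairs in place of ECUtil::shard_extent_set_t (all names illustrative):

    #include <algorithm>
    #include <cstdint>
    #include <map>
    #include <optional>
    #include <set>

    using extent = std::pair<uint64_t, uint64_t>; // [off, end), illustrative

    // Clip a wanted extent to the shard's valid read mask; an empty result
    // means there is nothing to decode for that shard.
    std::optional<extent> intersect(extent want, extent mask) {
      uint64_t off = std::max(want.first, mask.first);
      uint64_t end = std::min(want.second, mask.second);
      if (off >= end) return std::nullopt;
      return extent{off, end};
    }

    int main() {
      std::map<int, extent> want{{0, {0, 8192}}, {1, {0, 8192}}, {2, {0, 8192}}};
      std::map<int, extent> mask{{0, {0, 8192}}, {1, {0, 4096}}, {2, {0, 8192}}};
      std::set<int> already_read{0};  // shard 0 returned its data; skip it

      std::map<int, extent> to_decode;
      for (const auto& [shard, w] : want) {
        if (already_read.count(shard)) continue;     // read buffers need no recovery
        if (auto e = intersect(w, mask.at(shard))) { // clip to end-of-object
          to_decode[shard] = *e;
        }
      }
      return to_decode.size() == 2 ? 0 : 1;          // shards 1 and 2 remain
    }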
 
 struct SendPushReplies : public Context {
   PGBackend::Listener *l;
   epoch_t epoch;
-  map<int, MOSDPGPushReply*> replies;
+  std::map<int, MOSDPGPushReply*> replies;
+
   SendPushReplies(
     PGBackend::Listener *l,
     epoch_t epoch,
-    map<int, MOSDPGPushReply*> &in) : l(l), epoch(epoch) {
+    std::map<int, MOSDPGPushReply*> &in) : l(l), epoch(epoch) {
     replies.swap(in);
   }
+
   void finish(int) override {
     std::vector<std::pair<int, Message*>> messages;
     messages.reserve(replies.size());
-    for (map<int, MOSDPGPushReply*>::iterator i = replies.begin();
-        i != replies.end();
-        ++i) {
-      messages.push_back(std::make_pair(i->first, i->second));
+    for (auto & reply : replies) {
+      messages.push_back(reply);
     }
     if (!messages.empty()) {
       l->send_message_osd_cluster(messages, epoch);
     }
     replies.clear();
   }
+
   ~SendPushReplies() override {
-    for (map<int, MOSDPGPushReply*>::iterator i = replies.begin();
-        i != replies.end();
-        ++i) {
-      i->second->put();
+    for (auto & [_, reply] : replies) {
+      reply->put();
     }
     replies.clear();
   }
 };
 
 struct RecoveryReadCompleter : ECCommon::ReadCompleter {
-  RecoveryReadCompleter(ECBackend::RecoveryBackend& backend)
+  RecoveryReadCompleter(ECBackend::RecoveryBackend &backend)
     : backend(backend) {}
 
   void finish_single_request(
-    const hobject_t &hoid,
-    ECCommon::read_result_t &res,
-    list<ec_align_t>,
-    set<int> wanted_to_read) override
-  {
+      const hobject_t &hoid,
+      ECCommon::read_result_t &&res,
+      ECCommon::read_request_t &req) override {
     if (!(res.r == 0 && res.errors.empty())) {
       backend._failed_push(hoid, res);
       return;
     }
-    ceph_assert(res.returned.size() == 1);
+    ceph_assert(req.to_read.size() == 0);
     backend.handle_recovery_read_complete(
       hoid,
-      res.returned.back(),
+      std::move(res.buffers_read),
       res.attrs,
+      req.shard_want_to_read,
       &rm);
   }
 
-  void finish(int priority) && override
-  {
+  void finish(int priority) && override {
     backend.dispatch_recovery_messages(rm, priority);
   }
 
-  ECBackend::RecoveryBackend& backend;
+  ECBackend::RecoveryBackend &backend;
   RecoveryMessages rm;
 };
 
 void ECBackend::ECRecoveryBackend::commit_txn_send_replies(
-  ceph::os::Transaction&& txn,
-  std::map<int, MOSDPGPushReply*> replies)
-{
+  ceph::os::Transaction &&txn,
+  std::map<int, MOSDPGPushReply*> replies) {
   txn.register_on_complete(
-      get_parent()->bless_context(
-        new SendPushReplies(
-          get_parent(),
-          get_osdmap_epoch(),
-          replies)));
+    get_parent()->bless_context(
+      new SendPushReplies(
+        get_parent(),
+        get_osdmap_epoch(),
+        replies)));
   get_parent()->queue_transaction(std::move(txn));
 }
 
-void ECBackend::RecoveryBackend::dispatch_recovery_messages(RecoveryMessages &m, int priority)
-{
-  for (map<pg_shard_t, vector<PushOp> >::iterator i = m.pushes.begin();
+void ECBackend::RecoveryBackend::dispatch_recovery_messages(
+  RecoveryMessages &m, int priority) {
+  for (map<pg_shard_t, vector<PushOp>>::iterator i = m.pushes.begin();
        i != m.pushes.end();
        m.pushes.erase(i++)) {
     MOSDPGPush *msg = new MOSDPGPush();
@@ -517,14 +486,14 @@ void ECBackend::RecoveryBackend::dispatch_recovery_messages(RecoveryMessages &m,
     msg->pushes.swap(i->second);
     msg->compute_cost(cct);
     msg->is_repair = get_parent()->pg_is_repair();
-    std::vector wrapped_msg {
+    std::vector wrapped_msg{
       std::make_pair(i->first.osd, static_cast<Message*>(msg))
     };
     get_parent()->send_message_osd_cluster(wrapped_msg, msg->map_epoch);
   }
-  map<int, MOSDPGPushReply*> replies;
-  for (map<pg_shard_t, vector<PushReplyOp> >::iterator i =
-        m.push_replies.begin();
+  std::map<int, MOSDPGPushReply*> replies;
+  for (map<pg_shard_t, vector<PushReplyOp>>::iterator i =
+         m.push_replies.begin();
        i != m.push_replies.end();
        m.push_replies.erase(i++)) {
     MOSDPGPushReply *msg = new MOSDPGPushReply();
@@ -535,7 +504,7 @@ void ECBackend::RecoveryBackend::dispatch_recovery_messages(RecoveryMessages &m,
     msg->pgid = spg_t(get_parent()->get_info().pgid.pgid, i->first.shard);
     msg->replies.swap(i->second);
     msg->compute_cost(cct);
-    replies.insert(make_pair(i->first.osd, msg));
+    replies.insert(std::pair(i->first.osd, msg));
   }
 
 #if 1
@@ -548,9 +517,7 @@ void ECBackend::RecoveryBackend::dispatch_recovery_messages(RecoveryMessages &m,
     return;
   read_pipeline.start_read_op(
     priority,
-    m.want_to_read,
     m.recovery_reads,
-    OpRequestRef(),
     false,
     true,
     std::make_unique<RecoveryReadCompleter>(*this));
@@ -558,120 +525,128 @@ void ECBackend::RecoveryBackend::dispatch_recovery_messages(RecoveryMessages &m,
 
 void ECBackend::RecoveryBackend::continue_recovery_op(
   RecoveryBackend::RecoveryOp &op,
-  RecoveryMessages *m)
-{
+  RecoveryMessages *m) {
   dout(10) << __func__ << ": continuing " << op << dendl;
   using RecoveryOp = RecoveryBackend::RecoveryOp;
   while (1) {
     switch (op.state) {
     case RecoveryOp::IDLE: {
-      // start read
-      op.state = RecoveryOp::READING;
       ceph_assert(!op.recovery_progress.data_complete);
-      set<int> want(op.missing_on_shards.begin(), op.missing_on_shards.end());
-      uint64_t from = op.recovery_progress.data_recovered_to;
-      uint64_t amount = get_recovery_chunk_size();
+      ECUtil::shard_extent_set_t want(sinfo.get_k_plus_m());
+
+      op.state = RecoveryOp::READING;
+
+      // We always read the recovery chunk size (default 8MiB + parity). If that
+      // amount of data is not available, then the backend will truncate the
+      // response.
+      sinfo.ro_range_to_shard_extent_set_with_parity(
+        op.recovery_progress.data_recovered_to,
+        get_recovery_chunk_size(), want);
 
       if (op.recovery_progress.first && op.obc) {
-        if (auto [r, attrs, size] = ecbackend->get_attrs_n_size_from_disk(op.hoid);
-           r >= 0 || r == -ENOENT) {
-          op.hinfo = unstable_hashinfo_registry.get_hash_info(op.hoid, false, attrs, size);
-        } else {
-          derr << __func__ << ": can't stat-or-getattr on " << op.hoid << dendl;
-       }
-       if (!op.hinfo) {
-          derr << __func__ << ": " << op.hoid << " has inconsistent hinfo"
+        op.xattrs = op.obc->attr_cache;
+        if (sinfo.require_hinfo()) {
+          if (auto [r, attrs, size] = ecbackend->get_attrs_n_size_from_disk(
+              op.hoid);
+            r >= 0 || r == -ENOENT) {
+            op.hinfo = unstable_hashinfo_registry.get_hash_info(
+              op.hoid, false, attrs, size);
+          } else {
+            derr << __func__ << ": can't stat-or-getattr on " << op.hoid
+                 << dendl;
+          }
+          if (!op.hinfo) {
+            derr << __func__ << ": " << op.hoid << " has inconsistent hinfo"
                << dendl;
-          ceph_assert(recovery_ops.count(op.hoid));
-          eversion_t v = recovery_ops[op.hoid].v;
-          recovery_ops.erase(op.hoid);
-         // TODO: not in crimson yet
-          get_parent()->on_failed_pull({get_parent()->whoami_shard()},
-                                       op.hoid, v);
-          return;
+            ceph_assert(recovery_ops.count(op.hoid));
+            eversion_t v = recovery_ops[op.hoid].v;
+            recovery_ops.erase(op.hoid);
+            // TODO: not in crimson yet
+            get_parent()->on_failed_pull({get_parent()->whoami_shard()},
+                                         op.hoid, v);
+            return;
+          }
+          encode(*(op.hinfo), op.xattrs[ECUtil::get_hinfo_key()]);
         }
-       op.xattrs = op.obc->attr_cache;
-       encode(*(op.hinfo), op.xattrs[ECUtil::get_hinfo_key()]);
       }
 
-      map<pg_shard_t, vector<pair<int, int>>> to_read;
+      read_request_t read_request(std::move(want),
+                                  op.recovery_progress.first && !op.obc,
+                                  op.obc
+                                    ? op.obc->obs.oi.size
+                                    : get_recovery_chunk_size());
+
       int r = read_pipeline.get_min_avail_to_read_shards(
-       op.hoid, want, true, false, &to_read);
+        op.hoid, true, false, read_request);
+
       if (r != 0) {
-       // we must have lost a recovery source
-       ceph_assert(!op.recovery_progress.first);
-       dout(10) << __func__ << ": canceling recovery op for obj " << op.hoid
-                << dendl;
-       // in crimson
-       get_parent()->cancel_pull(op.hoid);
-       recovery_ops.erase(op.hoid);
-       return;
+        // we must have lost a recovery source
+        ceph_assert(!op.recovery_progress.first);
+        dout(10) << __func__ << ": canceling recovery op for obj " << op.hoid
+                 << dendl;
+        // in crimson
+        get_parent()->cancel_pull(op.hoid);
+        recovery_ops.erase(op.hoid);
+        return;
+      }
+      if (read_request.shard_reads.empty()) {
+        ceph_assert(op.obc);
+        ceph_assert(0 == op.obc->obs.oi.size);
+        dout(10) << __func__ << "Zero size object recovery, skipping reads."
+                 << op << dendl;
+        // Create an empty read result and fall through.
+        op.returned_data.emplace(&sinfo);
+      } else {
+        m->recovery_read(
+          op.hoid,
+          read_request);
+        dout(10) << __func__ << ": IDLE return " << op << dendl;
+        return;
       }
-      m->recovery_read(
-       op.hoid,
-       op.recovery_progress.data_recovered_to,
-       amount,
-       std::move(want),
-       to_read,
-       op.recovery_progress.first && !op.obc);
-      op.extent_requested = make_pair(
-       from,
-       amount);
-      dout(10) << __func__ << ": IDLE return " << op << dendl;
-      return;
     }
+      [[fallthrough]];
     case RecoveryOp::READING: {
       // read completed, start write
       ceph_assert(op.xattrs.size());
-      ceph_assert(op.returned_data.size());
+      ceph_assert(op.returned_data);
+      dout(20) << __func__ << ": returned_data=" << op.returned_data << dendl;
       op.state = RecoveryOp::WRITING;
       ObjectRecoveryProgress after_progress = op.recovery_progress;
-      after_progress.data_recovered_to += op.extent_requested.second;
+      after_progress.data_recovered_to = op.returned_data->get_ro_end();
       after_progress.first = false;
       if (after_progress.data_recovered_to >= op.obc->obs.oi.size) {
-       after_progress.data_recovered_to =
-         sinfo.logical_to_next_stripe_offset(
-           op.obc->obs.oi.size);
-       after_progress.data_complete = true;
+        after_progress.data_complete = true;
       }
-      for (set<pg_shard_t>::iterator mi = op.missing_on.begin();
-          mi != op.missing_on.end();
-          ++mi) {
-       ceph_assert(op.returned_data.count(static_cast<int>(mi->shard)));
-       m->pushes[*mi].push_back(PushOp());
-       PushOp &pop = m->pushes[*mi].back();
-       pop.soid = op.hoid;
-       pop.version = op.v;
-       pop.data = op.returned_data[static_cast<int>(mi->shard)];
-       dout(10) << __func__ << ": before_progress=" << op.recovery_progress
+      for (auto &&pg_shard: op.missing_on) {
+        m->pushes[pg_shard].push_back(PushOp());
+        PushOp &pop = m->pushes[pg_shard].back();
+        pop.soid = op.hoid;
+        pop.version = op.v;
+        op.returned_data->get_shard_first_buffer(pg_shard.shard, pop.data);
+        dout(10) << __func__ << ": pop shard=" << pg_shard
+                 << ", oid=" << pop.soid
+                 << ", before_progress=" << op.recovery_progress
                 << ", after_progress=" << after_progress
                 << ", pop.data.length()=" << pop.data.length()
                 << ", size=" << op.obc->obs.oi.size << dendl;
-       ceph_assert(
-         pop.data.length() ==
-         sinfo.aligned_logical_offset_to_chunk_offset(
-           after_progress.data_recovered_to -
-           op.recovery_progress.data_recovered_to)
-         );
-       if (pop.data.length())
-         pop.data_included.insert(
-           sinfo.aligned_logical_offset_to_chunk_offset(
-             op.recovery_progress.data_recovered_to),
-           pop.data.length()
-           );
-       if (op.recovery_progress.first) {
-         pop.attrset = op.xattrs;
-       }
-       pop.recovery_info = op.recovery_info;
-       pop.before_progress = op.recovery_progress;
-       pop.after_progress = after_progress;
-       if (*mi != get_parent()->primary_shard())
-         // already in crimson -- junction point with PeeringState
-         get_parent()->begin_peer_recover(
-           *mi,
-           op.hoid);
+        if (pop.data.length())
+          pop.data_included.union_insert(
+            op.returned_data->get_shard_first_offset(pg_shard.shard),
+            pop.data.length());
+        if (op.recovery_progress.first) {
+          pop.attrset = op.xattrs;
+        }
+        pop.recovery_info = op.recovery_info;
+        pop.before_progress = op.recovery_progress;
+        pop.after_progress = after_progress;
+        if (pg_shard != get_parent()->primary_shard()) {
+          // already in crimson -- junction point with PeeringState
+          get_parent()->begin_peer_recover(
+            pg_shard,
+            op.hoid);
+        }
       }
-      op.returned_data.clear();
+      op.returned_data.reset();
       op.waiting_on_pushes = op.missing_on;
       op.recovery_progress = after_progress;
       dout(10) << __func__ << ": READING return " << op << dendl;
@@ -679,37 +654,37 @@ void ECBackend::RecoveryBackend::continue_recovery_op(
     }
     case RecoveryOp::WRITING: {
       if (op.waiting_on_pushes.empty()) {
-       if (op.recovery_progress.data_complete) {
-         op.state = RecoveryOp::COMPLETE;
-         for (set<pg_shard_t>::iterator i = op.missing_on.begin();
-              i != op.missing_on.end();
-              ++i) {
-           if (*i != get_parent()->primary_shard()) {
-             dout(10) << __func__ << ": on_peer_recover on " << *i
+        if (op.recovery_progress.data_complete) {
+          op.state = RecoveryOp::COMPLETE;
+          for (set<pg_shard_t>::iterator i = op.missing_on.begin();
+               i != op.missing_on.end();
+               ++i) {
+            if (*i != get_parent()->primary_shard()) {
+              dout(10) << __func__ << ": on_peer_recover on " << *i
                       << ", obj " << op.hoid << dendl;
-             get_parent()->on_peer_recover(
-               *i,
-               op.hoid,
-               op.recovery_info);
-           }
-         }
-         object_stat_sum_t stat;
-         stat.num_bytes_recovered = op.recovery_info.size;
-         stat.num_keys_recovered = 0; // ??? op ... omap_entries.size(); ?
-         stat.num_objects_recovered = 1;
-         // TODO: not in crimson yet
-         if (get_parent()->pg_is_repair())
-           stat.num_objects_repaired = 1;
-         // pg_recovery.cc in crimson has it
-         get_parent()->on_global_recover(op.hoid, stat, false);
-         dout(10) << __func__ << ": WRITING return " << op << dendl;
-         recovery_ops.erase(op.hoid);
-         return;
-       } else {
-         op.state = RecoveryOp::IDLE;
-         dout(10) << __func__ << ": WRITING continue " << op << dendl;
-         continue;
-       }
+              get_parent()->on_peer_recover(
+                *i,
+                op.hoid,
+                op.recovery_info);
+            }
+          }
+          object_stat_sum_t stat;
+          stat.num_bytes_recovered = op.recovery_info.size;
+          stat.num_keys_recovered = 0; // ??? op ... omap_entries.size(); ?
+          stat.num_objects_recovered = 1;
+          // TODO: not in crimson yet
+          if (get_parent()->pg_is_repair())
+            stat.num_objects_repaired = 1;
+          // pg_recovery.cc in crimson has it
+          get_parent()->on_global_recover(op.hoid, stat, false);
+          dout(10) << __func__ << ": WRITING return " << op << dendl;
+          recovery_ops.erase(op.hoid);
+          return;
+        } else {
+          op.state = RecoveryOp::IDLE;
+          dout(10) << __func__ << ": WRITING continue " << op << dendl;
+          continue;
+        }
       }
       return;
     }
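Per the comment in the IDLE state above, each pass reads one recovery chunk (8 MiB by default) starting at data_recovered_to, and the want set also covers parity shards so they can be rebuilt in the same pass. A toy version of that range-to-shards mapping, a stand-in for ro_range_to_shard_extent_set_with_parity() and assuming the range is stripe-aligned, as recovery offsets are:

    #include <cstdint>
    #include <map>
    #include <utility>

    // Illustrative: map a rados-object (ro) byte range onto per-shard chunk
    // ranges, including the m parity shards, with a constant chunk_size.
    std::map<int, std::pair<uint64_t, uint64_t>>
    ro_range_to_shards(uint64_t ro_off, uint64_t ro_len,
                       uint64_t chunk_size, int k, int m) {
      uint64_t stripe = chunk_size * k;
      uint64_t first_stripe = ro_off / stripe;
      uint64_t end_stripe = (ro_off + ro_len + stripe - 1) / stripe;
      // For a stripe-aligned range, every shard (data and parity) needs the
      // same chunk range of those stripes.
      std::map<int, std::pair<uint64_t, uint64_t>> out;
      for (int shard = 0; shard < k + m; ++shard) {
        out[shard] = {first_stripe * chunk_size, end_stripe * chunk_size};
      }
      return out;
    }

    int main() {
      // One recovery pass: 8 MiB of logical data from offset 0, k=4, m=2.
      auto shards = ro_range_to_shards(0, 8 << 20, 4096, 4, 2);
      return shards.size() == 6 ? 0 : 1;
    }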
@@ -724,8 +699,7 @@ void ECBackend::RecoveryBackend::continue_recovery_op(
 
 void ECBackend::run_recovery_op(
   PGBackend::RecoveryHandle *_h,
-  int priority)
-{
+  int priority) {
   ceph_assert(_h);
   ECRecoveryHandle &h = static_cast<ECRecoveryHandle&>(*_h);
   recovery_backend.run_recovery_op(h, priority);
@@ -735,8 +709,7 @@ void ECBackend::run_recovery_op(
 
 void ECBackend::RecoveryBackend::run_recovery_op(
   ECRecoveryHandle &h,
-  int priority)
-{
+  int priority) {
   RecoveryMessages m;
   for (list<RecoveryOp>::iterator i = h.ops.begin();
        i != h.ops.end();
@@ -754,8 +727,7 @@ int ECBackend::recover_object(
   eversion_t v,
   ObjectContextRef head,
   ObjectContextRef obc,
-  PGBackend::RecoveryHandle *_h)
-{
+  PGBackend::RecoveryHandle *_h) {
   return recovery_backend.recover_object(hoid, v, head, obc, _h);
 }
 
@@ -764,8 +736,7 @@ int ECBackend::RecoveryBackend::recover_object(
   eversion_t v,
   ObjectContextRef head,
   ObjectContextRef obc,
-  PGBackend::RecoveryHandle *_h)
-{
+  PGBackend::RecoveryHandle *_h) {
   ECRecoveryHandle *h = static_cast<ECRecoveryHandle*>(_h);
   h->ops.push_back(RecoveryOp());
   h->ops.back().v = v;
@@ -790,7 +761,7 @@ int ECBackend::RecoveryBackend::recover_object(
   }
   h->ops.back().recovery_progress.omap_complete = true;
   for (set<pg_shard_t>::const_iterator i =
-        get_parent()->get_acting_recovery_backfill_shards().begin();
+         get_parent()->get_acting_recovery_backfill_shards().begin();
        i != get_parent()->get_acting_recovery_backfill_shards().end();
        ++i) {
     dout(10) << "checking " << *i << dendl;
@@ -804,14 +775,12 @@ int ECBackend::RecoveryBackend::recover_object(
 }
 
 bool ECBackend::can_handle_while_inactive(
-  OpRequestRef _op)
-{
+  OpRequestRef _op) {
   return false;
 }
 
 bool ECBackend::_handle_message(
-  OpRequestRef _op)
-{
+  OpRequestRef _op) {
   dout(10) << __func__ << ": " << *_op->get_req() << dendl;
   int priority = _op->get_req()->get_priority();
   switch (_op->get_req()->get_type()) {
@@ -822,7 +791,8 @@ bool ECBackend::_handle_message(
     MOSDECSubOpWrite *op = static_cast<MOSDECSubOpWrite*>(
       _op->get_nonconst_req());
     parent->maybe_preempt_replica_scrub(op->op.soid);
-    handle_sub_write(op->op.from, _op, op->op, _op->pg_trace, *get_parent()->get_eclistener());
+    handle_sub_write(op->op.from, _op, op->op, _op->pg_trace,
+                     *get_parent()->get_eclistener());
     return true;
   }
   case MSG_OSD_EC_WRITE_REPLY: {
@@ -857,20 +827,20 @@ bool ECBackend::_handle_message(
     auto op = _op->get_req<MOSDPGPush>();
     RecoveryMessages rm;
     for (vector<PushOp>::const_iterator i = op->pushes.begin();
-        i != op->pushes.end();
-        ++i) {
+         i != op->pushes.end();
+         ++i) {
       handle_recovery_push(*i, &rm, op->is_repair);
     }
     recovery_backend.dispatch_recovery_messages(rm, priority);
     return true;
   }
   case MSG_OSD_PG_PUSH_REPLY: {
-    const MOSDPGPushReply *op = static_cast<const MOSDPGPushReply *>(
+    const MOSDPGPushReply *op = static_cast<const MOSDPGPushReply*>(
       _op->get_req());
     RecoveryMessages rm;
     for (vector<PushReplyOp>::const_iterator i = op->replies.begin();
-        i != op->replies.end();
-        ++i) {
+         i != op->replies.end();
+         ++i) {
       recovery_backend.handle_recovery_push_reply(*i, op->from, &rm);
     }
     recovery_backend.dispatch_recovery_messages(rm, priority);
@@ -889,6 +859,7 @@ struct SubWriteCommitted : public Context {
   eversion_t version;
   eversion_t last_complete;
   const ZTracer::Trace trace;
+
   SubWriteCommitted(
     ECBackend *pg,
     OpRequestRef msg,
@@ -898,12 +869,14 @@ struct SubWriteCommitted : public Context {
     const ZTracer::Trace &trace)
     : pg(pg), msg(msg), tid(tid),
       version(version), last_complete(last_complete), trace(trace) {}
+
   void finish(int) override {
     if (msg)
       msg->mark_event("sub_op_committed");
     pg->sub_write_committed(tid, version, last_complete, trace);
   }
 };
+
 void ECBackend::sub_write_committed(
   ceph_tid_t tid, eversion_t version, eversion_t last_complete,
   const ZTracer::Trace &trace) {
@@ -941,15 +914,14 @@ void ECBackend::handle_sub_write(
   OpRequestRef msg,
   ECSubWrite &op,
   const ZTracer::Trace &trace,
-  ECListener&)
-{
+  ECListener &) {
   if (msg) {
     msg->mark_event("sub_op_started");
   }
   trace.event("handle_sub_write");
 
   if (cct->_conf->bluestore_debug_inject_read_err &&
-      ECInject::test_write_error3(op.soid)) {
+    ECInject::test_write_error3(op.soid)) {
     ceph_abort_msg("Error inject - OSD down");
   }
   if (!get_parent()->pgb_is_primary())
@@ -960,26 +932,28 @@ void ECBackend::handle_sub_write(
   }
   if (op.backfill_or_async_recovery) {
     for (set<hobject_t>::iterator i = op.temp_removed.begin();
-        i != op.temp_removed.end();
-        ++i) {
+         i != op.temp_removed.end();
+         ++i) {
       dout(10) << __func__ << ": removing object " << *i
               << " since we won't get the transaction" << dendl;
       localt.remove(
-       switcher->coll,
-       ghobject_t(
-         *i,
-         ghobject_t::NO_GEN,
-         get_parent()->whoami_shard().shard));
+        switcher->coll,
+        ghobject_t(
+          *i,
+          ghobject_t::NO_GEN,
+          get_parent()->whoami_shard().shard));
     }
   }
   switcher->clear_temp_objs(op.temp_removed);
-  dout(30) << __func__ << " missing before " << get_parent()->get_log().get_missing().get_items() << dendl;
+  dout(30) << __func__ << " missing before " <<
+    get_parent()->get_log().get_missing().get_items() << dendl;
   // flag set to true during async recovery
   bool async = false;
   pg_missing_tracker_t pmissing = get_parent()->get_local_missing();
   if (pmissing.is_missing(op.soid)) {
     async = true;
-    dout(30) << __func__ << " is_missing " << pmissing.is_missing(op.soid) << dendl;
+    dout(30) << __func__ << " is_missing " <<
+      pmissing.is_missing(op.soid) << dendl;
     for (auto &&e: op.log_entries) {
       dout(30) << " add_next_event entry " << e << dendl;
       get_parent()->add_local_next_event(e);
@@ -997,21 +971,24 @@ void ECBackend::handle_sub_write(
     async);
 
   if (!get_parent()->pg_is_undersized() &&
-      (unsigned)get_parent()->whoami_shard().shard >= sinfo.get_k())
+    get_parent()->whoami_shard().shard >= sinfo.get_k())
     op.t.set_fadvise_flag(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
 
   localt.register_on_commit(
     get_parent()->bless_context(
       new SubWriteCommitted(
-       this, msg, op.tid,
-       op.at_version,
-       get_parent()->get_info().last_complete, trace)));
+        this, msg, op.tid,
+        op.at_version,
+        get_parent()->get_info().last_complete, trace)));
   vector<ObjectStore::Transaction> tls;
   tls.reserve(2);
   tls.push_back(std::move(op.t));
   tls.push_back(std::move(localt));
   get_parent()->queue_transactions(tls, msg);
-  dout(30) << __func__ << " missing after" << get_parent()->get_log().get_missing().get_items() << dendl;
+  dout(30) << __func__ << " missing after" << get_parent()->get_log().
+                                                            get_missing().
+                                                            get_items() << dendl
+  ;
   if (op.at_version != eversion_t()) {
     // dummy rollforward transaction doesn't get at_version (and doesn't advance it)
     get_parent()->op_applied(op.at_version);
@@ -1022,26 +999,21 @@ void ECBackend::handle_sub_read(
   pg_shard_t from,
   const ECSubRead &op,
   ECSubReadReply *reply,
-  const ZTracer::Trace &trace)
-{
+  const ZTracer::Trace &trace) {
   trace.event("handle sub read");
   shard_id_t shard = get_parent()->whoami_shard().shard;
-  for(auto i = op.to_read.begin();
-      i != op.to_read.end();
-      ++i) {
+  for (auto &&[hoid, to_read]: op.to_read) {
     int r = 0;
-    for (auto j = i->second.begin(); j != i->second.end(); ++j) {
+    for (auto &&[offset, len, flags]: to_read) {
       bufferlist bl;
-      if ((op.subchunks.find(i->first)->second.size() == 1) && 
-          (op.subchunks.find(i->first)->second.front().second == 
-                                            ec_impl->get_sub_chunk_count())) {
+      auto &subchunks = op.subchunks.at(hoid);
+      if ((subchunks.size() == 1) &&
+        (subchunks.front().second == ec_impl->get_sub_chunk_count())) {
         dout(20) << __func__ << " case1: reading the complete chunk/shard." << dendl;
         r = switcher->store->read(
-         switcher->ch,
-         ghobject_t(i->first, ghobject_t::NO_GEN, shard),
-         j->get<0>(),
-         j->get<1>(),
-         bl, j->get<2>()); // Allow EIO return
+          switcher->ch,
+          ghobject_t(hoid, ghobject_t::NO_GEN, shard),
+          offset, len, bl, flags); // Allow EIO return
       } else {
         int subchunk_size =
           sinfo.get_chunk_size() / ec_impl->get_sub_chunk_count();
@@ -1049,16 +1021,16 @@ void ECBackend::handle_sub_read(
                 << " subchunk_size=" << subchunk_size
                 << " chunk_size=" << sinfo.get_chunk_size() << dendl;
         bool error = false;
-        for (int m = 0; m < (int)j->get<1>() && !error;
+        for (int m = 0; m < (int)len && !error;
              m += sinfo.get_chunk_size()) {
-          for (auto &&k:op.subchunks.find(i->first)->second) {
+          for (auto &&k: subchunks) {
             bufferlist bl0;
             r = switcher->store->read(
-                switcher->ch,
-                ghobject_t(i->first, ghobject_t::NO_GEN, shard),
-                j->get<0>() + m + (k.first)*subchunk_size,
-                (k.second)*subchunk_size,
-                bl0, j->get<2>());
+              switcher->ch,
+              ghobject_t(hoid, ghobject_t::NO_GEN, shard),
+              offset + m + (k.first) * subchunk_size,
+              (k.second) * subchunk_size,
+              bl0, flags);
             if (r < 0) {
               error = true;
               break;
@@ -1069,92 +1041,93 @@ void ECBackend::handle_sub_read(
       }
 
       if (r < 0) {
-       // if we are doing fast reads, it's possible for one of the shard
-       // reads to cross paths with another update and get a (harmless)
-       // ENOENT.  Suppress the message to the cluster log in that case.
-       if (r == -ENOENT && get_parent()->get_pool().fast_read) {
-         dout(5) << __func__ << ": Error " << r
-                 << " reading " << i->first << ", fast read, probably ok"
+        // if we are doing fast reads, it's possible for one of the shard
+        // reads to cross paths with another update and get a (harmless)
+        // ENOENT.  Suppress the message to the cluster log in that case.
+        if (r == -ENOENT && get_parent()->get_pool().fast_read) {
+          dout(5) << __func__ << ": Error " << r
+                 << " reading " << hoid << ", fast read, probably ok"
                  << dendl;
-       } else {
-         get_parent()->clog_error() << "Error " << r
-                                    << " reading object "
-                                    << i->first;
-         dout(5) << __func__ << ": Error " << r
-                 << " reading " << i->first << dendl;
-       }
-       goto error;
+        } else {
+          get_parent()->clog_error() << "Error " << r
+            << " reading object "
+            << hoid;
+          dout(5) << __func__ << ": Error " << r
+                 << " reading " << hoid << dendl;
+        }
+        goto error;
       } else {
-        dout(20) << __func__ << " read request=" << j->get<1>() << " r=" << r << " len=" << bl.length() << dendl;
-       reply->buffers_read[i->first].push_back(
-         make_pair(
-           j->get<0>(),
-           bl)
-         );
+        dout(20) << __func__ << " read request=" << len << " r=" << r << " len="
+          << bl.length() << dendl;
+        reply->buffers_read[hoid].push_back(make_pair(offset, bl));
       }
 
-      if (!get_parent()->get_pool().allows_ecoverwrites()) {
-       // This shows that we still need deep scrub because large enough files
-       // are read in sections, so the digest check here won't be done here.
-       // Do NOT check osd_read_eio_on_bad_digest here.  We need to report
-       // the state of our chunk in case other chunks could substitute.
+      if (!sinfo.supports_ec_overwrites()) {
+        // This shows that we still need deep scrub because large enough files
+        // are read in sections, so the digest check here won't be done here.
+        // Do NOT check osd_read_eio_on_bad_digest here.  We need to report
+        // the state of our chunk in case other chunks could substitute.
         ECUtil::HashInfoRef hinfo;
         map<string, bufferlist, less<>> attrs;
-       struct stat st;
-       int r = object_stat(i->first, &st);
+        struct stat st;
+        int r = object_stat(hoid, &st);
+        if (r >= 0) {
+          dout(10) << __func__ << ": found on disk, size " << st.st_size << dendl;
+          r = switcher->objects_get_attrs_with_hinfo(hoid, &attrs);
+        }
         if (r >= 0) {
-         dout(10) << __func__ << ": found on disk, size " << st.st_size << dendl;
-         r = switcher->objects_get_attrs_with_hinfo(i->first, &attrs);
-       }
-       if (r >= 0) {
-         hinfo = unstable_hashinfo_registry.get_hash_info(i->first, false, attrs, st.st_size);
-       } else {
-         derr << __func__ << ": access (attrs) on " << i->first << " failed: "
+          hinfo = unstable_hashinfo_registry.get_hash_info(
+            hoid, false, attrs, st.st_size);
+        } else {
+          derr << __func__ << ": access (attrs) on " << hoid << " failed: "
               << cpp_strerror(r) << dendl;
-       }
+        }
         if (!hinfo) {
           r = -EIO;
           get_parent()->clog_error() << "Corruption detected: object "
-                                     << i->first
-                                     << " is missing hash_info";
-          dout(5) << __func__ << ": No hinfo for " << i->first << dendl;
+            << hoid
+            << " is missing hash_info";
+          dout(5) << __func__ << ": No hinfo for " << hoid << dendl;
           goto error;
         }
-       ceph_assert(hinfo->has_chunk_hash());
-       if ((bl.length() == hinfo->get_total_chunk_size()) &&
-           (j->get<0>() == 0)) {
-         dout(20) << __func__ << ": Checking hash of " << i->first << dendl;
-         bufferhash h(-1);
-         h << bl;
-         if (h.digest() != hinfo->get_chunk_hash(shard)) {
-           get_parent()->clog_error() << "Bad hash for " << i->first << " digest 0x"
-                                      << hex << h.digest() << " expected 0x" << hinfo->get_chunk_hash(shard) << dec;
-           dout(5) << __func__ << ": Bad hash for " << i->first << " digest 0x"
-                   << hex << h.digest() << " expected 0x" << hinfo->get_chunk_hash(shard) << dec << dendl;
-           r = -EIO;
-           goto error;
-         }
-       }
+        ceph_assert(hinfo->has_chunk_hash());
+        if ((bl.length() == hinfo->get_total_chunk_size()) &&
+          (offset == 0)) {
+          dout(20) << __func__ << ": Checking hash of " << hoid << dendl;
+          bufferhash h(-1);
+          h << bl;
+          if (h.digest() != hinfo->get_chunk_hash(shard)) {
+            get_parent()->clog_error() << "Bad hash for " << hoid
+              << " digest 0x" << hex << h.digest()
+              << " expected 0x" << hinfo->get_chunk_hash(shard) << dec;
+            dout(5) << __func__ << ": Bad hash for " << hoid << " digest 0x"
+                    << hex << h.digest() << " expected 0x"
+                    << hinfo->get_chunk_hash(shard) << dec << dendl;
+            r = -EIO;
+            goto error;
+          }
+        }
       }
     }
     continue;
-error:
+  error:
     // Do NOT check osd_read_eio_on_bad_digest here.  We need to report
     // the state of our chunk in case other chunks could substitute.
-    reply->buffers_read.erase(i->first);
-    reply->errors[i->first] = r;
+    reply->buffers_read.erase(hoid);
+    reply->errors[hoid] = r;
   }
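The digest check above only runs when a read returned the whole chunk starting at offset 0, because hinfo records one hash per complete shard chunk; sectioned reads of large objects are left for deep scrub, as the comment notes. A stand-alone sketch of that guard, with a toy digest standing in for Ceph's crc32c-based bufferhash:

    #include <cstdint>
    #include <functional>
    #include <string>

    // Stand-in digest; the real code uses bufferhash (crc32c seeded with -1).
    static uint32_t toy_digest(const std::string &data) {
      return static_cast<uint32_t>(std::hash<std::string>{}(data));
    }

    // A chunk is only checkable when the read covered the entire chunk from
    // offset 0; a partial read cannot match a whole-chunk digest.
    static bool verify_chunk(const std::string &bl, uint64_t offset,
                             uint64_t total_chunk_size, uint32_t stored_digest) {
      if (offset != 0 || bl.size() != total_chunk_size)
        return true; // not verifiable here; deep scrub covers sectioned reads
      return toy_digest(bl) == stored_digest;
    }

    int main() {
      std::string chunk(4096, 'a');
      uint32_t stored = toy_digest(chunk);
      return verify_chunk(chunk, 0, 4096, stored) ? 0 : 1;
    }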
   for (set<hobject_t>::iterator i = op.attrs_to_read.begin();
        i != op.attrs_to_read.end();
        ++i) {
     dout(10) << __func__ << ": fulfilling attr request on "
             << *i << dendl;
-    if (reply->errors.count(*i))
+    if (reply->errors.contains(*i))
       continue;
     int r = switcher->store->getattrs(
       switcher->ch,
       ghobject_t(
-       *i, ghobject_t::NO_GEN, shard),
+        *i, ghobject_t::NO_GEN, shard),
       reply->attrs_read[*i]);
     if (r < 0) {
       // If we read error, we should not return the attrs too.
@@ -1169,54 +1142,43 @@ error:
 
 void ECBackend::handle_sub_write_reply(
   pg_shard_t from,
-  const ECSubWriteReply &op,
-  const ZTracer::Trace &trace)
-{
-  map<ceph_tid_t, RMWPipeline::OpRef>::iterator i = rmw_pipeline.tid_to_op_map.find(op.tid);
-  ceph_assert(i != rmw_pipeline.tid_to_op_map.end());
-  if (op.committed) {
+  const ECSubWriteReply &ec_write_reply_op,
+  const ZTracer::Trace &trace) {
+  RMWPipeline::OpRef &op = rmw_pipeline.tid_to_op_map.at(ec_write_reply_op.tid);
+  if (ec_write_reply_op.committed) {
     trace.event("sub write committed");
-    ceph_assert(i->second->pending_commit.count(from));
-    i->second->pending_commit.erase(from);
+    ceph_assert(op->pending_commits > 0);
+    op->pending_commits--;
     if (from != get_parent()->whoami_shard()) {
-      get_parent()->update_peer_last_complete_ondisk(from, op.last_complete);
+      get_parent()->update_peer_last_complete_ondisk(
+        from, ec_write_reply_op.last_complete);
     }
   }
-  if (op.applied) {
-    trace.event("sub write applied");
-    ceph_assert(i->second->pending_apply.count(from));
-    i->second->pending_apply.erase(from);
-  }
 
-  if (i->second->pending_commit.empty() &&
-      i->second->on_all_commit &&
-      // also wait for apply, to preserve ordering with luminous peers.
-      i->second->pending_apply.empty()) {
-    dout(10) << __func__ << " Calling on_all_commit on " << i->second << dendl;
-    i->second->on_all_commit->complete(0);
-    i->second->on_all_commit = 0;
-    i->second->trace.event("ec write all committed");
-  }
   if (cct->_conf->bluestore_debug_inject_read_err &&
-      (i->second->pending_commit.size() == 1) &&
-      ECInject::test_write_error2(i->second->hoid)) {
+    (op->pending_commits == 1) &&
+    ECInject::test_write_error2(op->hoid)) {
     std::string cmd =
-      "{ \"prefix\": \"osd down\", \"ids\": [\"" + std::to_string( get_parent()->whoami() ) + "\"] }";
+      "{ \"prefix\": \"osd down\", \"ids\": [\"" + std::to_string(
+        get_parent()->whoami()) + "\"] }";
     vector<std::string> vcmd{cmd};
     dout(0) << __func__ << " Error inject - marking OSD down" << dendl;
     get_parent()->start_mon_command(vcmd, {}, nullptr, nullptr, nullptr);
   }
-  rmw_pipeline.check_ops();
+
+  if (op->pending_commits == 0) {
+    rmw_pipeline.try_finish_rmw();
+  }
 }
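handle_sub_write_reply now decrements a plain pending_commits counter (the old per-shard pending_commit and pending_apply sets are gone), and the RMW completes via try_finish_rmw() once the count reaches zero. A minimal sketch of that bookkeeping, with simplified illustrative types:

    #include <cassert>

    struct rmw_op {
      int pending_commits; // one per shard the sub-write was sent to
    };

    // Called once per ECSubWriteReply with committed == true.
    bool on_commit_reply(rmw_op &op) {
      assert(op.pending_commits > 0);
      return --op.pending_commits == 0; // true -> caller runs try_finish_rmw()
    }

    int main() {
      rmw_op op{3};                 // three acting shards ack independently
      assert(!on_commit_reply(op));
      assert(!on_commit_reply(op));
      assert(on_commit_reply(op));  // last reply completes the RMW
    }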
 
 void ECBackend::handle_sub_read_reply(
   pg_shard_t from,
   ECSubReadReply &op,
-  const ZTracer::Trace &trace)
-{
+  const ZTracer::Trace &trace) {
   trace.event("ec sub read reply");
   dout(10) << __func__ << ": reply " << op << dendl;
-  map<ceph_tid_t, ReadOp>::iterator iter = read_pipeline.tid_to_read_map.find(op.tid);
+  map<ceph_tid_t, ReadOp>::iterator iter =
+    read_pipeline.tid_to_read_map.find(op.tid);
   if (iter == read_pipeline.tid_to_read_map.end()) {
     //canceled
     dout(20) << __func__ << ": dropped " << op << dendl;
@@ -1225,68 +1187,90 @@ void ECBackend::handle_sub_read_reply(
   ReadOp &rop = iter->second;
   if (cct->_conf->bluestore_debug_inject_read_err) {
     for (auto i = op.buffers_read.begin();
-        i != op.buffers_read.end();
-        ++i) {
-      if (ECInject::test_read_error0(ghobject_t(i->first, ghobject_t::NO_GEN, op.from.shard))) {
-       dout(0) << __func__ << " Error inject - EIO error for shard " << op.from.shard << dendl;
-       op.buffers_read.erase(i->first);
-       op.attrs_read.erase(i->first);
-       op.errors[i->first] = -EIO;
+         i != op.buffers_read.end();
+         ++i) {
+      if (ECInject::test_read_error0(
+        ghobject_t(i->first, ghobject_t::NO_GEN, op.from.shard))) {
+        dout(0) << __func__ << " Error inject - EIO error for shard "
+                << op.from.shard << dendl;
+        op.buffers_read.erase(i->first);
+        op.attrs_read.erase(i->first);
+        op.errors[i->first] = -EIO;
+        rop.debug_log.emplace_back(ECUtil::INJECT_EIO, op.from);
       }
-
     }
   }
-  for (auto i = op.buffers_read.begin();
-       i != op.buffers_read.end();
-       ++i) {
-    ceph_assert(!op.errors.count(i->first));   // If attribute error we better not have sent a buffer
-    if (!rop.to_read.count(i->first)) {
+  for (auto &&[hoid, offset_buffer_map]: op.buffers_read) {
+    ceph_assert(!op.errors.contains(hoid));
+    // If attribute error we better not have sent a buffer
+    if (!rop.to_read.contains(hoid)) {
+      rop.debug_log.emplace_back(ECUtil::CANCELLED, op.from);
+
       // We canceled this read! @see filter_read_op
       dout(20) << __func__ << " to_read skipping" << dendl;
       continue;
     }
-    list<ec_align_t>::const_iterator req_iter =
-      rop.to_read.find(i->first)->second.to_read.begin();
-    list<
-      boost::tuple<
-       uint64_t, uint64_t, map<pg_shard_t, bufferlist> > >::iterator riter =
-      rop.complete[i->first].returned.begin();
-    for (list<pair<uint64_t, bufferlist> >::iterator j = i->second.begin();
-        j != i->second.end();
-        ++j, ++req_iter, ++riter) {
-      ceph_assert(req_iter != rop.to_read.find(i->first)->second.to_read.end());
-      ceph_assert(riter != rop.complete[i->first].returned.end());
-      pair<uint64_t, uint64_t> aligned =
-       sinfo.chunk_aligned_offset_len_to_chunk(
-         make_pair(req_iter->offset, req_iter->size));
-      ceph_assert(aligned.first == j->first);
-      riter->get<2>()[from] = std::move(j->second);
+
+    if (!rop.complete.contains(hoid)) {
+      rop.complete.emplace(hoid, &sinfo);
+    }
+
+    auto &buffers_read = rop.complete.at(hoid).buffers_read;
+    for (auto &&[offset, buffer_list]: offset_buffer_map) {
+      buffers_read.insert_in_shard(from.shard, offset, buffer_list);
     }
+    rop.debug_log.emplace_back(ECUtil::READ_DONE, op.from, buffers_read);
   }
-  for (auto i = op.attrs_read.begin();
-       i != op.attrs_read.end();
-       ++i) {
-    ceph_assert(!op.errors.count(i->first));   // if read error better not have sent an attribute
-    if (!rop.to_read.count(i->first)) {
+  for (auto &&[hoid, req]: rop.to_read) {
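+    // Record which extents were processed on each healthy shard; the
+    // decode check below uses this to work out which shards are available.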
+    if (!rop.complete.contains(hoid)) {
+      rop.complete.emplace(hoid, &sinfo);
+    }
+    auto &complete = rop.complete.at(hoid);
+    for (auto &&[shard, read]: std::as_const(req.shard_reads)) {
+      if (complete.errors.contains(read.pg_shard)) continue;
+
+      complete.processed_read_requests[shard].union_of(read.extents);
+
+      if (!rop.complete.contains(hoid) ||
+        !complete.buffers_read.contains(shard)) {
+        if (!read.extents.empty()) continue; // Complete the actual read first.
+
+        // If we are first here, populate the completion.
+        if (!rop.complete.contains(hoid)) {
+          rop.complete.emplace(hoid, read_result_t(&sinfo));
+        }
+      }
+    }
+  }
+  for (auto &&[hoid, attr]: op.attrs_read) {
+    ceph_assert(!op.errors.count(hoid));
+    // If there was a read error, we should not have sent an attribute.
+    if (!rop.to_read.count(hoid)) {
       // We canceled this read! @see filter_read_op
       dout(20) << __func__ << " to_read skipping" << dendl;
       continue;
     }
-    rop.complete[i->first].attrs.emplace();
-    (*(rop.complete[i->first].attrs)).swap(i->second);
+    if (!rop.complete.contains(hoid)) {
+      rop.complete.emplace(hoid, &sinfo);
+    }
+    rop.complete.at(hoid).attrs.emplace();
+    (*(rop.complete.at(hoid).attrs)).swap(attr);
   }
-  for (auto i = op.errors.begin();
-       i != op.errors.end();
-       ++i) {
-    rop.complete[i->first].errors.insert(
-      make_pair(
-       from,
-       i->second));
-    dout(20) << __func__ << " shard=" << from << " error=" << i->second << dendl;
+  for (auto &&[hoid, err]: op.errors) {
+    if (!rop.complete.contains(hoid)) {
+      rop.complete.emplace(hoid, &sinfo);
+    }
+    auto &complete = rop.complete.at(hoid);
+    complete.errors.emplace(from, err);
+    rop.debug_log.emplace_back(ECUtil::ERROR, op.from, complete.buffers_read);
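+    // Data already read from the failing shard can no longer be trusted,
+    // so drop its buffers and its read-tracking state.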
+    complete.buffers_read.erase_shard(from.shard);
+    complete.processed_read_requests.erase(from.shard);
+    dout(20) << __func__ << " shard=" << from << " error=" << err << dendl;
   }
 
-  map<pg_shard_t, set<ceph_tid_t> >::iterator siter =
-                                       read_pipeline.shard_to_read_map.find(from);
+  map<pg_shard_t, set<ceph_tid_t>>::iterator siter =
+    read_pipeline.shard_to_read_map.find(from);
   ceph_assert(siter != read_pipeline.shard_to_read_map.end());
   ceph_assert(siter->second.count(op.tid));
   siter->second.erase(op.tid);
@@ -1298,96 +1282,104 @@ void ECBackend::handle_sub_read_reply(
   // For redundant reads check for completion as each shard comes in,
   // or in a non-recovery read check for completion once all the shards read.
   if (rop.do_redundant_reads || rop.in_progress.empty()) {
-    for (map<hobject_t, read_result_t>::const_iterator iter =
-        rop.complete.begin();
-      iter != rop.complete.end();
-      ++iter) {
-      set<int> have;
-      for (map<pg_shard_t, bufferlist>::const_iterator j =
-          iter->second.returned.front().get<2>().begin();
-        j != iter->second.returned.front().get<2>().end();
-        ++j) {
-        have.insert(static_cast<int>(j->first.shard));
-        dout(20) << __func__ << " have shard=" << j->first.shard << dendl;
-      }
-      map<int, vector<pair<int, int>>> dummy_minimum;
-      int err;
-      if ((err = ec_impl->minimum_to_decode(rop.want_to_read[iter->first], have, &dummy_minimum)) < 0) {
-       dout(20) << __func__ << " minimum_to_decode failed" << dendl;
+    for (auto &&[oid, read_result]: rop.complete) {
+      shard_id_set have;
+      read_result.processed_read_requests.populate_shard_id_set(have);
+      shard_id_set dummy_minimum;
+      shard_id_set want_to_read;
+      rop.to_read.at(oid).shard_want_to_read
+          .populate_shard_id_set(want_to_read);
+
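+      // Ask the plugin whether the shards read so far are sufficient to
+      // decode everything the client wants.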
+      int err = ec_impl->minimum_to_decode(want_to_read, have, dummy_minimum,
+                                            nullptr);
+      if (err) {
+        dout(20) << __func__ << " minimum_to_decode failed" << dendl;
         if (rop.in_progress.empty()) {
-         // If we don't have enough copies, try other pg_shard_ts if available.
-         // During recovery there may be multiple osds with copies of the same shard,
-         // so getting EIO from one may result in multiple passes through this code path.
-         if (!rop.do_redundant_reads) {
-           int r = read_pipeline.send_all_remaining_reads(iter->first, rop);
-           if (r == 0) {
-             // We changed the rop's to_read and not incrementing is_complete
-             need_resend = true;
-             continue;
-           }
-           // Couldn't read any additional shards so handle as completed with errors
-         }
-         // We don't want to confuse clients / RBD with objectstore error
-         // values in particular ENOENT.  We may have different error returns
-         // from different shards, so we'll return minimum_to_decode() error
-         // (usually EIO) to reader.  It is likely an error here is due to a
-         // damaged pg.
-         rop.complete[iter->first].r = err;
-         ++is_complete;
-       }
-      } else {
-        ceph_assert(rop.complete[iter->first].r == 0);
-       if (!rop.complete[iter->first].errors.empty()) {
-         if (cct->_conf->osd_read_ec_check_for_errors) {
-           dout(10) << __func__ << ": Not ignoring errors, use one shard err=" << err << dendl;
-           err = rop.complete[iter->first].errors.begin()->second;
-            rop.complete[iter->first].r = err;
-         } else {
-           get_parent()->clog_warn() << "Error(s) ignored for "
-                                      << iter->first << " enough copies available";
-           dout(10) << __func__ << " Error(s) ignored for " << iter->first
+          // If we don't have enough copies, try other pg_shard_ts if available.
+          // During recovery there may be multiple osds with copies of the same shard,
+          // so getting EIO from one may result in multiple passes through this code path.
+          if (!rop.do_redundant_reads) {
+            rop.debug_log.emplace_back(ECUtil::REQUEST_MISSING, op.from);
+            int r = read_pipeline.send_all_remaining_reads(oid, rop);
+            if (r == 0) {
+              // We found that new reads are required to do a decode.
+              need_resend = true;
+              continue;
+            } else if (r > 0) {
+              // No new reads were requested. This means that some parity
+              // shards can be assumed to be zeros.
+              err = 0;
+            }
+            // else insufficient shards are available, keep the errors.
+          }
+          // Couldn't read any additional shards so handle as completed with errors
+          // We don't want to confuse clients / RBD with objectstore error
+          // values in particular ENOENT.  We may have different error returns
+          // from different shards, so we'll return minimum_to_decode() error
+          // (usually EIO) to reader.  It is likely an error here is due to a
+          // damaged pg.
+          rop.complete.at(oid).r = err;
+          ++is_complete;
+        }
+      }
+
+      if (!err) {
+        ceph_assert(rop.complete.at(oid).r == 0);
+        if (!rop.complete.at(oid).errors.empty()) {
+          if (cct->_conf->osd_read_ec_check_for_errors) {
+            rop.debug_log.emplace_back(ECUtil::COMPLETE_ERROR, op.from);
+            dout(10) << __func__ << ": Not ignoring errors, use one shard" << dendl;
+            err = rop.complete.at(oid).errors.begin()->second;
+            rop.complete.at(oid).r = err;
+          } else {
+            get_parent()->clog_warn() << "Error(s) ignored for "
+              << oid << " enough copies available";
+            dout(10) << __func__ << " Error(s) ignored for " << oid
                     << " enough copies available" << dendl;
-           rop.complete[iter->first].errors.clear();
-         }
-       }
-       // avoid re-read for completed object as we may send remaining reads for uncopmpleted objects
-       rop.to_read.at(iter->first).need.clear();
-       rop.to_read.at(iter->first).want_attrs = false;
-       ++is_complete;
+            rop.debug_log.emplace_back(ECUtil::ERROR_CLEAR, op.from);
+            rop.complete.at(oid).errors.clear();
+          }
+        }
+        // Avoid re-reading a completed object; we may still send remaining
+        // reads for incomplete objects.
+        rop.to_read.at(oid).shard_reads.clear();
+        rop.to_read.at(oid).want_attrs = false;
+        ++is_complete;
       }
     }
   }
   if (need_resend) {
     read_pipeline.do_read_op(rop);
-  } else if (rop.in_progress.empty() || 
+  } else if (rop.in_progress.empty() ||
              is_complete == rop.complete.size()) {
     dout(20) << __func__ << " Complete: " << rop << dendl;
     rop.trace.event("ec read complete");
-    read_pipeline.complete_read_op(rop);
+    rop.debug_log.emplace_back(ECUtil::COMPLETE, op.from);
+    read_pipeline.complete_read_op(std::move(rop));
   } else {
     dout(10) << __func__ << " readop not complete: " << rop << dendl;
   }
 }
 
-void ECBackend::check_recovery_sources(const OSDMapRef& osdmap)
-{
-  struct FinishReadOp : public GenContext<ThreadPool::TPHandle&>  {
-    ECCommon::ReadPipeline& read_pipeline;
+void ECBackend::check_recovery_sources(const OSDMapRef &osdmap) {
+  struct FinishReadOp : public GenContext<ThreadPool::TPHandle&> {
+    ECCommon::ReadPipeline &read_pipeline;
     ceph_tid_t tid;
-    FinishReadOp(ECCommon::ReadPipeline& read_pipeline, ceph_tid_t tid)
+
+    FinishReadOp(ECCommon::ReadPipeline &read_pipeline, ceph_tid_t tid)
       : read_pipeline(read_pipeline), tid(tid) {}
-    void finish(ThreadPool::TPHandle&) override {
+
+    void finish(ThreadPool::TPHandle &) override {
       auto ropiter = read_pipeline.tid_to_read_map.find(tid);
       ceph_assert(ropiter != read_pipeline.tid_to_read_map.end());
-      read_pipeline.complete_read_op(ropiter->second);
+      read_pipeline.complete_read_op(std::move(ropiter->second));
     }
   };
   read_pipeline.check_recovery_sources(
     osdmap,
-    [this] (const hobject_t& obj) {
+    [this](const hobject_t &obj) {
       recovery_backend.recovery_ops.erase(obj);
     },
-    [this] (const ReadOp& op) {
+    [this](const ReadOp &op) {
       get_parent()->schedule_recovery_work(
         get_parent()->bless_unlocked_gencontext(
           new FinishReadOp(read_pipeline, op.tid)),
@@ -1395,22 +1387,21 @@ void ECBackend::check_recovery_sources(const OSDMapRef& osdmap)
     });
 }
 
-void ECBackend::on_change()
-{
+void ECBackend::on_change() {
   rmw_pipeline.on_change();
   read_pipeline.on_change();
+  rmw_pipeline.on_change2();
   clear_recovery_state();
 }
 
-void ECBackend::clear_recovery_state()
-{
+void ECBackend::clear_recovery_state() {
   recovery_backend.recovery_ops.clear();
 }
 
-void ECBackend::dump_recovery_info(Formatter *f) const
-{
+void ECBackend::dump_recovery_info(Formatter *f) const {
   f->open_array_section("recovery_ops");
-  for (map<hobject_t, RecoveryBackend::RecoveryOp>::const_iterator i = recovery_backend.recovery_ops.begin();
+  for (map<hobject_t, RecoveryBackend::RecoveryOp>::const_iterator i =
+         recovery_backend.recovery_ops.begin();
        i != recovery_backend.recovery_ops.end();
        ++i) {
     f->open_object_section("op");
@@ -1419,7 +1410,8 @@ void ECBackend::dump_recovery_info(Formatter *f) const
   }
   f->close_section();
   f->open_array_section("read_ops");
-  for (map<ceph_tid_t, ReadOp>::const_iterator i = read_pipeline.tid_to_read_map.begin();
+  for (map<ceph_tid_t, ReadOp>::const_iterator i =
+         read_pipeline.tid_to_read_map.begin();
        i != read_pipeline.tid_to_read_map.end();
        ++i) {
     f->open_object_section("read_op");
@@ -1433,43 +1425,39 @@ struct ECClassicalOp : ECCommon::RMWPipeline::Op {
   PGTransactionUPtr t;
 
   void generate_transactions(
-      ceph::ErasureCodeInterfaceRef &ecimpl,
-      pg_t pgid,
-      const ECUtil::stripe_info_t &sinfo,
-      std::map<hobject_t,extent_map> *written,
-      std::map<shard_id_t, ObjectStore::Transaction> *transactions,
-      DoutPrefixProvider *dpp,
-      const ceph_release_t require_osd_release) final
-  {
+    ceph::ErasureCodeInterfaceRef &ec_impl,
+    pg_t pgid,
+    const ECUtil::stripe_info_t &sinfo,
+    map<hobject_t, ECUtil::shard_extent_map_t> *written,
+    shard_id_map<ObjectStore::Transaction> *transactions,
+    DoutPrefixProvider *dpp,
+    const OSDMapRef &osdmap) final {
     assert(t);
     ECTransaction::generate_transactions(
       t.get(),
       plan,
-      ecimpl,
+      ec_impl,
       pgid,
       sinfo,
-      remote_read_result,
+      remote_shard_extent_map,
       log_entries,
       written,
       transactions,
       &temp_added,
       &temp_cleared,
       dpp,
-      require_osd_release);
+      osdmap);
   }
 
-  template <typename F>
-  static ECTransaction::WritePlan get_write_plan(
-    const ECUtil::stripe_info_t &sinfo,
-    PGTransaction& t,
-    F &&get_hinfo,
-    DoutPrefixProvider *dpp)
-  {
-    return ECTransaction::get_write_plan(
-      sinfo,
-      t,
-      std::forward<F>(get_hinfo),
-      dpp);
+  bool skip_transaction(
+      std::set<shard_id_t> &pending_roll_forward,
+      shard_id_t shard,
+      ceph::os::Transaction &transaction) final {
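+    // An empty transaction need not be sent to the shard; any shard that
+    // does receive one must later be rolled forward.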
+    if (transaction.empty()) {
+      return true;
+    }
+    pending_roll_forward.insert(shard);
+    return false;
   }
 };
 
@@ -1477,19 +1465,38 @@ std::tuple<
   int,
   map<string, bufferlist, less<>>,
   size_t
-> ECBackend::get_attrs_n_size_from_disk(const hobject_t& hoid)
-{
+> ECBackend::get_attrs_n_size_from_disk(const hobject_t &hoid) {
   struct stat st;
   if (int r = object_stat(hoid, &st); r < 0) {
     dout(10) << __func__ << ": stat error " << r << " on" << hoid << dendl;
-    return { r, {}, 0 };
+    return {r, {}, 0};
   }
   map<string, bufferlist, less<>> real_attrs;
   if (int r = switcher->objects_get_attrs_with_hinfo(hoid, &real_attrs); r < 0) {
     dout(10) << __func__ << ": get attr error " << r << " on" << hoid << dendl;
-    return { r, {}, 0 };
+    return {r, {}, 0};
   }
-  return { 0, real_attrs, st.st_size };
+  return {0, real_attrs, st.st_size};
+}
+
+ECUtil::HashInfoRef ECBackend::get_hinfo_from_disk(hobject_t oid) {
+  auto [r, attrs, size] = get_attrs_n_size_from_disk(oid);
+  ceph_assert(r >= 0 || r == -ENOENT);
+  ECUtil::HashInfoRef hinfo = unstable_hashinfo_registry.get_hash_info(
+    oid, true, attrs, size);
+  return hinfo;
+}
+
+std::optional<object_info_t> ECBackend::get_object_info_from_obc(
+    ObjectContextRef &obc) {
+  std::optional<object_info_t> ret;
+
+  auto attr_cache = obc->attr_cache;
+  if (!attr_cache.contains(OI_ATTR))
+    return ret;
+
+  ret.emplace(attr_cache.at(OI_ATTR));
+  return ret;
 }
 
 void ECBackend::submit_transaction(
@@ -1499,15 +1506,15 @@ void ECBackend::submit_transaction(
   PGTransactionUPtr &&t,
   const eversion_t &trim_to,
   const eversion_t &pg_committed_to,
-  vector<pg_log_entry_t>&& log_entries,
+  vector<pg_log_entry_t> &&log_entries,
   std::optional<pg_hit_set_history_t> &hset_history,
   Context *on_all_commit,
   ceph_tid_t tid,
   osd_reqid_t reqid,
   OpRequestRef client_op
-  )
-{
-  auto op = std::make_unique<ECClassicalOp>();
+) {
+  auto op = std::make_shared<ECClassicalOp>();
+  auto obc_map = t->obc_map;
   op->t = std::move(t);
   op->hoid = hoid;
   op->delta_stats = delta_stats;
@@ -1516,7 +1523,7 @@ void ECBackend::submit_transaction(
   /* We update PeeringState::pg_committed_to via the callback
    * invoked from ECBackend::handle_sub_write_reply immediately
    * before updating rmw_pipeline.commited_to via
-   * rmw_pipeline.check_ops()->try_finish_rmw(), so these will
+   * rmw_pipeline.check_ops()->finish_rmw(), so these will
    * *usually* match.  However, the PrimaryLogPG::submit_log_entries
    * pathway can perform an out-of-band log update which updates
    * PeeringState::pg_committed_to independently.  Thus, the value
@@ -1528,33 +1535,66 @@ void ECBackend::submit_transaction(
   op->tid = tid;
   op->reqid = reqid;
   op->client_op = client_op;
+  op->pipeline = &rmw_pipeline;
   if (client_op) {
     op->trace = client_op->pg_trace;
   }
-  op->plan = op->get_write_plan(
-    sinfo,
-    *(op->t),
-    [&](const hobject_t &i) {
-      dout(10) << "submit_transaction: obtaining hash info for get_write_plan" << dendl;
-      ECUtil::HashInfoRef ref;
-      if (auto [r, attrs, size] = get_attrs_n_size_from_disk(i); r >= 0 || r == -ENOENT) {
-        ref = unstable_hashinfo_registry.get_hash_info(
-         i,
-         true,
-         attrs, //op->t->obc_map[hoid]->attr_cache,
-         size); //op->t->obc_map[hoid]->obs.oi.size);
+  ECTransaction::WritePlan &plans = op->plan;
+
+  op->t->safe_create_traverse(
+    [&](std::pair<const hobject_t, PGTransaction::ObjectOperation> &i) {
+      const auto &[oid, inner_op] = i;
+      ECUtil::HashInfoRef shinfo;
+      auto &obc = obc_map.at(oid);
+      object_info_t oi = obc->obs.oi;
+      std::optional<object_info_t> soi;
+      ECUtil::HashInfoRef hinfo;
+
+      if (!sinfo.supports_ec_overwrites()) {
+        hinfo = get_hinfo_from_disk(oid);
       }
-      if (!ref) {
-       derr << __func__ << ": get_hash_info(" << i << ")"
-            << " returned a null pointer and there is no "
-            << " way to recover from such an error in this "
-            << " context" << dendl;
-       ceph_abort();
+
+      hobject_t source;
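+      // Ops with a source object (e.g. clone) also need that object's hash
+      // info on non-overwrite pools and, unless this is a rename, its
+      // object_info.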
+      if (inner_op.has_source(&source)) {
+        if (!sinfo.supports_ec_overwrites()) {
+          shinfo = get_hinfo_from_disk(source);
+        }
+        if (!inner_op.is_rename()) {
+          soi = get_object_info_from_obc(obc_map.at(source));
+        }
       }
-      return ref;
-    },
-    get_parent()->get_dpp());
-  dout(10) << __func__ << ": op " << *op << " starting" << dendl;
+
+      uint64_t old_object_size = 0;
+      bool object_in_cache = false;
+      if (rmw_pipeline.extent_cache.contains_object(oid)) {
+        /* We have a valid extent cache for this object. If we need to read, we
+         * need to behave as if the object is already the size projected by the
+         * extent cache, or we may not read enough data.
+         */
+        old_object_size = rmw_pipeline.extent_cache.get_projected_size(oid);
+        object_in_cache = true;
+      } else {
+        std::optional<object_info_t> old_oi = get_object_info_from_obc(obc);
+        if (old_oi && !inner_op.delete_first) {
+          old_object_size = old_oi->size;
+        }
+      }
+
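+      // The write plan needs to know which shards can currently be read
+      // (the acting set) and which must be written (acting and backfill).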
+      auto [readable_shards, writable_shards] =
+        read_pipeline.get_readable_writable_shard_id_sets();
+      ECTransaction::WritePlanObj plan(oid, inner_op, sinfo, readable_shards,
+                                       writable_shards,
+                                       object_in_cache, old_object_size,
+                                       oi, soi, std::move(hinfo),
+                                       std::move(shinfo),
+                                       rmw_pipeline.ec_pdw_write_mode);
+
+      if (plan.to_read) plans.want_read = true;
+      plans.plans.emplace_back(std::move(plan));
+    });
+  ldpp_dout(get_parent()->get_dpp(), 20) << __func__
+             << " plans=" << plans
+             << dendl;
   rmw_pipeline.start_rmw(std::move(op));
 }
 
@@ -1563,8 +1603,7 @@ int ECBackend::objects_read_sync(
   uint64_t off,
   uint64_t len,
   uint32_t op_flags,
-  bufferlist *bl)
-{
+  bufferlist *bl) {
   return -EOPNOTSUPP;
 }
 
@@ -1574,18 +1613,18 @@ void ECBackend::objects_read_async(
   const list<pair<ec_align_t,
                   pair<bufferlist*, Context*>>> &to_read,
   Context *on_complete,
-  bool fast_read)
-{
-  map<hobject_t,std::list<ec_align_t>> reads;
+  bool fast_read) {
+  map<hobject_t, std::list<ec_align_t>> reads;
 
   uint32_t flags = 0;
   extent_set es;
-  for (const auto& [read, ctx] : to_read) {
+  for (const auto &[read, ctx]: to_read) {
     pair<uint64_t, uint64_t> tmp;
     if (!cct->_conf->osd_ec_partial_reads || fast_read) {
-      tmp = sinfo.offset_len_to_stripe_bounds(make_pair(read.offset, read.size));
+      tmp = sinfo.ro_offset_len_to_stripe_ro_offset_len(read.offset, read.size);
     } else {
-      tmp = sinfo.offset_len_to_chunk_bounds(make_pair(read.offset, read.size));
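+      // Partial reads are permitted, so read exactly the requested range
+      // rather than widening it to stripe bounds.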
+      tmp.first = read.offset;
+      tmp.second = read.size;
     }
     es.union_insert(tmp.first, tmp.second);
     flags |= read.flags;
@@ -1593,10 +1632,8 @@ void ECBackend::objects_read_async(
 
   if (!es.empty()) {
     auto &offsets = reads[hoid];
-    for (auto j = es.begin();
-        j != es.end();
-        ++j) {
-      offsets.emplace_back(ec_align_t{j.get_start(), j.get_len(), flags});
+    for (auto [off, len]: es) {
+      offsets.emplace_back(ec_align_t{off, len, flags});
     }
   }
 
@@ -1604,69 +1641,71 @@ void ECBackend::objects_read_async(
     ECBackend *ec;
     hobject_t hoid;
     list<pair<ec_align_t,
-             pair<bufferlist*, Context*> > > to_read;
+              pair<bufferlist*, Context*>>> to_read;
     unique_ptr<Context> on_complete;
-    cb(const cb&) = delete;
+    cb(const cb &) = delete;
     cb(cb &&) = default;
+
     cb(ECBackend *ec,
        const hobject_t &hoid,
        const list<pair<ec_align_t,
-                  pair<bufferlist*, Context*> > > &to_read,
+                       pair<bufferlist*, Context*>>> &to_read,
        Context *on_complete)
       : ec(ec),
-       hoid(hoid),
-       to_read(to_read),
-       on_complete(on_complete) {}
+        hoid(hoid),
+        to_read(to_read),
+        on_complete(on_complete) {}
+
     void operator()(ECCommon::ec_extents_t &&results) {
       auto dpp = ec->get_parent()->get_dpp();
       ldpp_dout(dpp, 20) << "objects_read_async_cb: got: " << results
                         << dendl;
-      ldpp_dout(dpp, 20) << "objects_read_async_cb: cache: " << ec->rmw_pipeline.cache
-                        << dendl;
 
-      auto &got = results[hoid];
+      auto &got = results.at(hoid);
 
       int r = 0;
-      for (auto &&read: to_read) {
-       if (got.err < 0) {
-         // error handling
-         if (read.second.second) {
-           read.second.second->complete(got.err);
-         }
-         if (r == 0)
-           r = got.err;
-       } else {
-         ceph_assert(read.second.first);
-         uint64_t offset = read.first.offset;
-         uint64_t length = read.first.size;
-         auto range = got.emap.get_containing_range(offset, length);
-         uint64_t range_offset = range.first.get_off();
-         uint64_t range_length = range.first.get_len();
-         ceph_assert(range.first != range.second);
-         ceph_assert(range_offset <= offset);
+      for (auto &&[read, result]: to_read) {
+        auto &&[bufs, ctx] = result;
+        if (got.err < 0) {
+          // error handling
+          if (ctx) {
+            ctx->complete(got.err);
+          }
+          if (r == 0)
+            r = got.err;
+        } else {
+          ceph_assert(bufs);
+          uint64_t offset = read.offset;
+          uint64_t length = read.size;
+          auto range = got.emap.get_containing_range(offset, length);
+          uint64_t range_offset = range.first.get_off();
+          uint64_t range_length = range.first.get_len();
+          ceph_assert(range.first != range.second);
+          ceph_assert(range_offset <= offset);
           ldpp_dout(dpp, 20) << "offset: " << offset << dendl;
           ldpp_dout(dpp, 20) << "range offset: " << range_offset << dendl;
           ldpp_dout(dpp, 20) << "length: " << length << dendl;
           ldpp_dout(dpp, 20) << "range length: " << range_length << dendl;
-         ceph_assert(offset + length <= range_offset + range_length);
-         read.second.first->substr_of(
-           range.first.get_val(),
-           offset - range_offset,
-           length);
-         if (read.second.second) {
-           read.second.second->complete(length);
-           read.second.second = nullptr;
-         }
-       }
+          ceph_assert((offset + length) <= (range_offset + range_length));
+          bufs->substr_of(
+            range.first.get_val(),
+            offset - range_offset,
+            length);
+          if (ctx) {
+            ctx->complete(length);
+            ctx = nullptr;
+          }
+        }
       }
       to_read.clear();
       if (on_complete) {
-       on_complete.release()->complete(r);
+        on_complete.release()->complete(r);
       }
     }
+
     ~cb() {
       for (auto &&i: to_read) {
-       delete i.second.second;
+        delete i.second.second;
       }
       to_read.clear();
     }
@@ -1674,23 +1713,29 @@ void ECBackend::objects_read_async(
   objects_read_and_reconstruct(
     reads,
     fast_read,
+    object_size,
     make_gen_lambda_context<
-      ECCommon::ec_extents_t &&, cb>(
-       cb(this,
-          hoid,
-          to_read,
-          on_complete)));
+      ECCommon::ec_extents_t&&, cb>(
+      cb(this,
+         hoid,
+         to_read,
+         on_complete)));
 }
 
 void ECBackend::objects_read_and_reconstruct(
-  const map<hobject_t,
-    std::list<ec_align_t>
-  > &reads,
+  const map<hobject_t, std::list<ec_align_t>> &reads,
   bool fast_read,
-  GenContextURef<ECCommon::ec_extents_t &&> &&func)
-{
+  uint64_t object_size,
+  GenContextURef<ECCommon::ec_extents_t&&> &&func) {
   return read_pipeline.objects_read_and_reconstruct(
-    reads, fast_read, std::move(func));
+    reads, fast_read, object_size, std::move(func));
+}
+
+void ECBackend::objects_read_and_reconstruct_for_rmw(
+  map<hobject_t, read_request_t> &&to_read,
+  GenContextURef<ECCommon::ec_extents_t&&> &&func) {
+  return read_pipeline.objects_read_and_reconstruct_for_rmw(
+    std::move(to_read), std::move(func));
 }
 
 void ECBackend::kick_reads() {
@@ -1699,8 +1744,7 @@ void ECBackend::kick_reads() {
 
 int ECBackend::object_stat(
   const hobject_t &hoid,
-  struct stat* st)
-{
+  struct stat *st) {
   int r = switcher->store->stat(
     switcher->ch,
     ghobject_t{hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard},
@@ -1710,11 +1754,10 @@ int ECBackend::object_stat(
 
 int ECBackend::objects_get_attrs(
   const hobject_t &hoid,
-  map<string, bufferlist, less<>> *out)
-{
+  map<string, bufferlist, less<>> *out) {
   for (map<string, bufferlist>::iterator i = out->begin();
        i != out->end();
-       ) {
+  ) {
     if (ECUtil::is_hinfo_key_string(i->first))
       out->erase(i++);
     else
@@ -1727,14 +1770,13 @@ int ECBackend::be_deep_scrub(
   const hobject_t &poid,
   ScrubMap &map,
   ScrubMapBuilder &pos,
-  ScrubMap::object &o)
-{
+  ScrubMap::object &o) {
   dout(10) << __func__ << " " << poid << " pos " << pos << dendl;
   int r;
 
   uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
-                           CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | 
-                           CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE;
+    CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
+    CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE;
 
   utime_t sleeptime;
   sleeptime.set_from_double(cct->_conf->osd_debug_deep_scrub_sleep);
@@ -1780,36 +1822,37 @@ int ECBackend::be_deep_scrub(
     return -EINPROGRESS;
   }
 
-  ECUtil::HashInfoRef hinfo = unstable_hashinfo_registry.get_hash_info(poid, false, o.attrs, o.size);
+  ECUtil::HashInfoRef hinfo = unstable_hashinfo_registry.get_hash_info(
+    poid, false, o.attrs, o.size);
   if (!hinfo) {
     dout(0) << "_scan_list  " << poid << " could not retrieve hash info" << dendl;
     o.read_error = true;
     o.digest_present = false;
     return 0;
   } else {
-    if (!get_parent()->get_pool().allows_ecoverwrites()) {
+    if (!sinfo.supports_ec_overwrites()) {
       if (!hinfo->has_chunk_hash()) {
         dout(0) << "_scan_list  " << poid << " got invalid hash info" << dendl;
         o.ec_size_mismatch = true;
         return 0;
       }
       if (hinfo->get_total_chunk_size() != (unsigned)pos.data_pos) {
-       dout(0) << "_scan_list  " << poid << " got incorrect size on read 0x"
+        dout(0) << "_scan_list  " << poid << " got incorrect size on read 0x"
                << std::hex << pos
                << " expected 0x" << hinfo->get_total_chunk_size() << std::dec
                << dendl;
-       o.ec_size_mismatch = true;
-       return 0;
+        o.ec_size_mismatch = true;
+        return 0;
       }
 
       if (hinfo->get_chunk_hash(get_parent()->whoami_shard().shard) !=
-         pos.data_hash.digest()) {
-       dout(0) << "_scan_list  " << poid << " got incorrect hash on read 0x"
+        pos.data_hash.digest()) {
+        dout(0) << "_scan_list  " << poid << " got incorrect hash on read 0x"
                << std::hex << pos.data_hash.digest() << " !=  expected 0x"
                << hinfo->get_chunk_hash(get_parent()->whoami_shard().shard)
                << std::dec << dendl;
-       o.ec_hash_mismatch = true;
-       return 0;
+        o.ec_hash_mismatch = true;
+        return 0;
       }
 
       /* We checked above that we match our own stored hash.  We cannot
index b11b946183fbc744d4f0dbe9dcb05c1a38dcdfc5..82b89f3e4afe2e7ff2aab2b1bd4a97a0b3f51ec5 100644 (file)
 
 #pragma once
 
-#include <boost/intrusive/set.hpp>
 #include <boost/intrusive/list.hpp>
+#include <boost/intrusive/set.hpp>
 
 #include "ECCommon.h"
+#include "ECExtentCache.h"
+#include "ECListener.h"
+#include "ECTypes.h"
+#include "ECUtil.h"
 #include "OSD.h"
 #include "PGBackend.h"
 #include "erasure-code/ErasureCodeInterface.h"
-#include "ECUtil.h"
-#include "ECTransaction.h"
-#include "ExtentCache.h"
-#include "ECListener.h"
+#include "include/buffer.h"
+#include "osd/scrubber/scrub_backend.h"
 
-/* This file is soon going to be replaced (before next release), so we are going
- * to simply ignore all deprecated warnings.
- * */
-IGNORE_DEPRECATED
 
 //forward declaration
 struct ECSubWrite;
@@ -39,63 +40,61 @@ struct ECSubReadReply;
 class ECSwitch;
 
 struct RecoveryMessages;
 
 class ECBackend : public ECCommon {
-public:
-    PGBackend::RecoveryHandle *open_recovery_op();
+ public:
+  PGBackend::RecoveryHandle *open_recovery_op();
 
   void run_recovery_op(
       PGBackend::RecoveryHandle *h,
-    int priority
-      );
+      int priority
+    );
 
   int recover_object(
-    const hobject_t &hoid,
-    eversion_t v,
-    ObjectContextRef head,
-    ObjectContextRef obc,
+      const hobject_t &hoid,
+      eversion_t v,
+      ObjectContextRef head,
+      ObjectContextRef obc,
       PGBackend::RecoveryHandle *h
-      );
-
-  bool _handle_message(
-    OpRequestRef op
-      );
-  bool can_handle_while_inactive(
-    OpRequestRef op
-      );
+    );
+
+  bool _handle_message(OpRequestRef op);
+  bool can_handle_while_inactive(OpRequestRef op);
   friend struct SubWriteApplied;
   friend struct SubWriteCommitted;
   void sub_write_committed(
-    ceph_tid_t tid,
-    eversion_t version,
-    eversion_t last_complete,
-    const ZTracer::Trace &trace);
+      ceph_tid_t tid,
+      eversion_t version,
+      eversion_t last_complete,
+      const ZTracer::Trace &trace
+    );
   void handle_sub_write(
-    pg_shard_t from,
-    OpRequestRef msg,
-    ECSubWrite &op,
-    const ZTracer::Trace &trace,
-    ECListener& eclistener
+      pg_shard_t from,
+      OpRequestRef msg,
+      ECSubWrite &op,
+      const ZTracer::Trace &trace,
+      ECListener &eclistener
     ) override;
   void handle_sub_read(
-    pg_shard_t from,
-    const ECSubRead &op,
-    ECSubReadReply *reply,
-    const ZTracer::Trace &trace
+      pg_shard_t from,
+      const ECSubRead &op,
+      ECSubReadReply *reply,
+      const ZTracer::Trace &trace
     );
   void handle_sub_write_reply(
-    pg_shard_t from,
-    const ECSubWriteReply &op,
-    const ZTracer::Trace &trace
+      pg_shard_t from,
+      const ECSubWriteReply &op,
+      const ZTracer::Trace &trace
     );
   void handle_sub_read_reply(
-    pg_shard_t from,
-    ECSubReadReply &op,
-    const ZTracer::Trace &trace
+      pg_shard_t from,
+      ECSubReadReply &op,
+      const ZTracer::Trace &trace
     );
 
   /// @see ReadOp below
-  void check_recovery_sources(const OSDMapRef& osdmap);
+  void check_recovery_sources(const OSDMapRef &osdmap);
 
   void on_change();
   void clear_recovery_state();
@@ -107,26 +106,27 @@ public:
   }
 
   void submit_transaction(
-    const hobject_t &hoid,
-    const object_stat_sum_t &delta_stats,
-    const eversion_t &at_version,
-    PGTransactionUPtr &&t,
-    const eversion_t &trim_to,
-    const eversion_t &pg_committed_to,
-    std::vector<pg_log_entry_t>&& log_entries,
-    std::optional<pg_hit_set_history_t> &hset_history,
-    Context *on_all_commit,
-    ceph_tid_t tid,
-    osd_reqid_t reqid,
-    OpRequestRef op
+      const hobject_t &hoid,
+      const object_stat_sum_t &delta_stats,
+      const eversion_t &at_version,
+      PGTransactionUPtr &&t,
+      const eversion_t &trim_to,
+      const eversion_t &pg_committed_to,
+      std::vector<pg_log_entry_t> &&log_entries,
+      std::optional<pg_hit_set_history_t> &hset_history,
+      Context *on_all_commit,
+      ceph_tid_t tid,
+      osd_reqid_t reqid,
+      OpRequestRef op
     );
 
   int objects_read_sync(
-    const hobject_t &hoid,
-    uint64_t off,
-    uint64_t len,
-    uint32_t op_flags,
-    ceph::buffer::list *bl);
+      const hobject_t &hoid,
+      uint64_t off,
+      uint64_t len,
+      uint32_t op_flags,
+      ceph::buffer::list *bl
+    );
 
   /**
    * Async read mechanism
@@ -147,19 +147,35 @@ public:
    * check_recovery_sources.
    */
   void objects_read_and_reconstruct(
-    const std::map<hobject_t, std::list<ec_align_t>> &reads,
-    bool fast_read,
-    GenContextURef<ECCommon::ec_extents_t &&> &&func) override;
+      const std::map<hobject_t, std::list<ec_align_t>> &reads,
+      bool fast_read,
+      uint64_t object_size,
+      GenContextURef<ECCommon::ec_extents_t&&> &&func
+    ) override;
+
+  /**
+   * Async read mechanism for read-modify-write (RMW) code paths. Here the
+   * client already knows the set of shard reads that are required, so these
+   * can be passed in directly.  The "fast_read" mechanism is not needed.
+   *
+   * Otherwise this is the same as objects_read_and_reconstruct.
+   */
+  void objects_read_and_reconstruct_for_rmw(
+      std::map<hobject_t, read_request_t> &&reads,
+      GenContextURef<ECCommon::ec_extents_t&&> &&func
+    ) override;
 
   void objects_read_async(
-    const hobject_t &hoid,
-    uint64_t object_size,
-    const std::list<std::pair<ec_align_t,
-                              std::pair<ceph::buffer::list*, Context*>>> &to_read,
-    Context *on_complete,
-    bool fast_read = false);
-
-private:
+      const hobject_t &hoid,
+      uint64_t object_size,
+      const std::list<std::pair<ec_align_t,
+                                std::pair<ceph::buffer::list*, Context*>>> &
+      to_read,
+      Context *on_complete,
+      bool fast_read = false
+    );
+
+ private:
   friend struct ECRecoveryHandle;
 
   void kick_reads();
@@ -194,159 +210,185 @@ private:
    * Transaction, and reads in a RecoveryMessages object which is passed
    * among the recovery methods.
    */
-public:
+ public:
   struct RecoveryBackend {
-    CephContext* cct;
+    CephContext *cct;
     const coll_t &coll;
     ceph::ErasureCodeInterfaceRef ec_impl;
-    const ECUtil::stripe_info_t& sinfo;
-    ReadPipeline& read_pipeline;
-    UnstableHashInfoRegistry& unstable_hashinfo_registry;
+    const ECUtil::stripe_info_t &sinfo;
+    ReadPipeline &read_pipeline;
+    UnstableHashInfoRegistry &unstable_hashinfo_registry;
     // TODO: lay an interface down here
-    ECListener* parent;
-    ECBackend* ecbackend;
+    ECListener *parent;
+    ECBackend *ecbackend;
 
     ECListener *get_parent() const { return parent; }
-    const OSDMapRef& get_osdmap() const { return get_parent()->pgb_get_osdmap(); }
-    epoch_t get_osdmap_epoch() const { return get_parent()->pgb_get_osdmap_epoch(); }
+
+    const OSDMapRef &get_osdmap() const {
+      return get_parent()->pgb_get_osdmap();
+    }
+
+    epoch_t get_osdmap_epoch() const {
+      return get_parent()->pgb_get_osdmap_epoch();
+    }
+
     const pg_info_t &get_info() { return get_parent()->get_info(); }
     void add_temp_obj(const hobject_t &oid) { get_parent()->add_temp_obj(oid); }
-    void clear_temp_obj(const hobject_t &oid) { get_parent()->clear_temp_obj(oid); }
-
-    RecoveryBackend(CephContext* cct,
-                   const coll_t &coll,
-                   ceph::ErasureCodeInterfaceRef ec_impl,
-                   const ECUtil::stripe_info_t& sinfo,
-                   ReadPipeline& read_pipeline,
-                   UnstableHashInfoRegistry& unstable_hashinfo_registry,
-                   ECListener* parent,
-                   ECBackend* ecbackend);
-  struct RecoveryOp {
-    hobject_t hoid;
-    eversion_t v;
-    std::set<pg_shard_t> missing_on;
-    std::set<shard_id_t> missing_on_shards;
-
-    ObjectRecoveryInfo recovery_info;
-    ObjectRecoveryProgress recovery_progress;
-
-    enum state_t { IDLE, READING, WRITING, COMPLETE } state;
-
-    static const char* tostr(state_t state) {
-      switch (state) {
-      case RecoveryOp::IDLE:
-       return "IDLE";
-      case RecoveryOp::READING:
-       return "READING";
-      case RecoveryOp::WRITING:
-       return "WRITING";
-      case RecoveryOp::COMPLETE:
-       return "COMPLETE";
-      default:
-       ceph_abort();
-       return "";
-      }
-    }
 
-    // must be filled if state == WRITING
-    std::map<int, ceph::buffer::list> returned_data;
-    std::map<std::string, ceph::buffer::list, std::less<>> xattrs;
-    ECUtil::HashInfoRef hinfo;
-    ObjectContextRef obc;
-    std::set<pg_shard_t> waiting_on_pushes;
+    void clear_temp_obj(const hobject_t &oid) {
+      get_parent()->clear_temp_obj(oid);
+    }
 
-    // valid in state READING
-    std::pair<uint64_t, uint64_t> extent_requested;
+    RecoveryBackend(CephContext *cct,
+                    const coll_t &coll,
+                    ceph::ErasureCodeInterfaceRef ec_impl,
+                    const ECUtil::stripe_info_t &sinfo,
+                    ReadPipeline &read_pipeline,
+                    UnstableHashInfoRegistry &unstable_hashinfo_registry,
+                    ECListener *parent,
+                    ECBackend *ecbackend);
+
+    struct RecoveryOp {
+      hobject_t hoid;
+      eversion_t v;
+      std::set<pg_shard_t> missing_on;
+      std::set<shard_id_t> missing_on_shards;
+
+      ObjectRecoveryInfo recovery_info;
+      ObjectRecoveryProgress recovery_progress;
+
+      enum state_t { IDLE, READING, WRITING, COMPLETE } state;
+
+      static const char *tostr(state_t state) {
+        switch (state) {
+        case RecoveryOp::IDLE:
+          return "IDLE";
+        case RecoveryOp::READING:
+          return "READING";
+        case RecoveryOp::WRITING:
+          return "WRITING";
+        case RecoveryOp::COMPLETE:
+          return "COMPLETE";
+        default:
+          ceph_abort();
+          return "";
+        }
+      }
 
-    void dump(ceph::Formatter *f) const;
+      // must be filled if state == WRITING
+      std::optional<ECUtil::shard_extent_map_t> returned_data;
+      std::map<std::string, ceph::buffer::list, std::less<>> xattrs;
+      ECUtil::HashInfoRef hinfo;
+      ObjectContextRef obc;
+      std::set<pg_shard_t> waiting_on_pushes;
+
+      void dump(ceph::Formatter *f) const;
+
+      RecoveryOp() : state(IDLE) {}
+
+      void print(std::ostream &os) const {
+        os << "RecoveryOp("
+            << "hoid=" << hoid
+            << " v=" << v
+            << " missing_on=" << missing_on
+            << " missing_on_shards=" << missing_on_shards
+            << " recovery_info=" << recovery_info
+            << " recovery_progress=" << recovery_progress
+            << " obc refcount=" << obc.use_count()
+            << " state=" << ECBackend::RecoveryBackend::RecoveryOp::tostr(state)
+            << " waiting_on_pushes=" << waiting_on_pushes
+            << ")";
+      }
+    };
 
-    RecoveryOp() : state(IDLE) {}
-  };
-  friend ostream &operator<<(ostream &lhs, const RecoveryOp &rhs);
-  std::map<hobject_t, RecoveryOp> recovery_ops;
+    std::map<hobject_t, RecoveryOp> recovery_ops;
 
-  uint64_t get_recovery_chunk_size() const {
-    return round_up_to(cct->_conf->osd_recovery_max_chunk,
-                       sinfo.get_stripe_width());
-  }
+    uint64_t get_recovery_chunk_size() const {
+      return round_up_to(cct->_conf->osd_recovery_max_chunk,
+                         sinfo.get_stripe_width());
+    }
 
-  virtual ~RecoveryBackend() = default;
-  virtual void commit_txn_send_replies(
-    ceph::os::Transaction&& txn,
-    std::map<int, MOSDPGPushReply*> replies) = 0;
-  void dispatch_recovery_messages(RecoveryMessages &m, int priority);
+    virtual ~RecoveryBackend() = default;
+    virtual void commit_txn_send_replies(
+        ceph::os::Transaction &&txn,
+        std::map<int, MOSDPGPushReply*> replies) = 0;
+    void dispatch_recovery_messages(RecoveryMessages &m, int priority);
 
-  PGBackend::RecoveryHandle *open_recovery_op();
-  void run_recovery_op(
-    struct ECRecoveryHandle &h,
-    int priority);
-  int recover_object(
-    const hobject_t &hoid,
-    eversion_t v,
-    ObjectContextRef head,
-    ObjectContextRef obc,
-    PGBackend::RecoveryHandle *h);
-  void continue_recovery_op(
-    RecoveryBackend::RecoveryOp &op,
-    RecoveryMessages *m);
-  void handle_recovery_read_complete(
-    const hobject_t &hoid,
-    boost::tuple<uint64_t, uint64_t, std::map<pg_shard_t, ceph::buffer::list> > &to_read,
-    std::optional<std::map<std::string, ceph::buffer::list, std::less<>> > attrs,
-    RecoveryMessages *m);
-  void handle_recovery_push(
-    const PushOp &op,
-    RecoveryMessages *m,
-    bool is_repair);
-  void handle_recovery_push_reply(
-    const PushReplyOp &op,
-    pg_shard_t from,
-    RecoveryMessages *m);
-  friend struct RecoveryMessages;
-  int get_ec_data_chunk_count() const {
-    return ec_impl->get_data_chunk_count();
-  }
-  void _failed_push(const hobject_t &hoid, ECCommon::read_result_t &res);
+    PGBackend::RecoveryHandle *open_recovery_op();
+    void run_recovery_op(
+        struct ECRecoveryHandle &h,
+        int priority);
+    int recover_object(
+        const hobject_t &hoid,
+        eversion_t v,
+        ObjectContextRef head,
+        ObjectContextRef obc,
+        PGBackend::RecoveryHandle *h);
+    void continue_recovery_op(
+        RecoveryBackend::RecoveryOp &op,
+        RecoveryMessages *m);
+    void handle_recovery_read_complete(
+        const hobject_t &hoid,
+        ECUtil::shard_extent_map_t &&buffers_read,
+        std::optional<std::map<std::string, ceph::buffer::list, std::less<>>>
+          attrs,
+        const ECUtil::shard_extent_set_t &want_to_read,
+        RecoveryMessages *m);
+    void handle_recovery_push(
+        const PushOp &op,
+        RecoveryMessages *m,
+        bool is_repair);
+    void handle_recovery_push_reply(
+        const PushReplyOp &op,
+        pg_shard_t from,
+        RecoveryMessages *m);
+    friend struct RecoveryMessages;
+    void _failed_push(const hobject_t &hoid, ECCommon::read_result_t &res);
   };
+
   struct ECRecoveryBackend : RecoveryBackend {
-    ECRecoveryBackend(CephContext* cct,
-                     const coll_t &coll,
-                     ceph::ErasureCodeInterfaceRef ec_impl,
-                     const ECUtil::stripe_info_t& sinfo,
-                     ReadPipeline& read_pipeline,
-                     UnstableHashInfoRegistry& unstable_hashinfo_registry,
-                     PGBackend::Listener* parent,
-                     ECBackend* ecbackend)
-      : RecoveryBackend(cct, coll, std::move(ec_impl), sinfo, read_pipeline, unstable_hashinfo_registry, parent->get_eclistener(), ecbackend),
-       parent(parent) {
-    }
+    ECRecoveryBackend(CephContext *cct,
+                      const coll_t &coll,
+                      ceph::ErasureCodeInterfaceRef ec_impl,
+                      const ECUtil::stripe_info_t &sinfo,
+                      ReadPipeline &read_pipeline,
+                      UnstableHashInfoRegistry &unstable_hashinfo_registry,
+                      PGBackend::Listener *parent,
+                      ECBackend *ecbackend)
+      : RecoveryBackend(cct, coll, std::move(ec_impl), sinfo, read_pipeline,
+                        unstable_hashinfo_registry, parent->get_eclistener(),
+                        ecbackend),
+        parent(parent) {}
 
     void commit_txn_send_replies(
-      ceph::os::Transaction&& txn,
-      std::map<int, MOSDPGPushReply*> replies) override;
+        ceph::os::Transaction &&txn,
+        std::map<int, MOSDPGPushReply*> replies) override;
 
     PGBackend::Listener *get_parent() const { return parent; }
 
-  private:
+   private:
     PGBackend::Listener *parent;
   };
-  friend ostream &operator<<(ostream &lhs, const RecoveryBackend::RecoveryOp &rhs);
+
+  friend ostream &operator<<(ostream &lhs,
+                             const RecoveryBackend::RecoveryOp &rhs
+    );
   friend struct RecoveryMessages;
   friend struct OnRecoveryReadComplete;
   friend struct RecoveryReadCompleter;
 
   void handle_recovery_push(
-    const PushOp &op,
-    RecoveryMessages *m,
-    bool is_repair);
+      const PushOp &op,
+      RecoveryMessages *m,
+      bool is_repair
+    );
 
-public:
-    PGBackend::Listener *parent;
-    CephContext *cct;
-    ECSwitch *switcher;
-  struct ReadPipeline read_pipeline;
-  struct RMWPipeline rmw_pipeline;
-  struct ECRecoveryBackend recovery_backend;
+  PGBackend::Listener *parent;
+  CephContext *cct;
+  ECSwitch *switcher;
+  ReadPipeline read_pipeline;
+  RMWPipeline rmw_pipeline;
+  ECRecoveryBackend recovery_backend;
 
   ceph::ErasureCodeInterfaceRef ec_impl;
 
@@ -355,46 +397,55 @@ public:
   /**
    * ECRecPred
    *
-   * Determines the whether _have is sufficient to recover an object
+   * Determines whether _have is sufficient to recover an object
    */
   class ECRecPred : public IsPGRecoverablePredicate {
-    std::set<int> want;
+    shard_id_set want;
+    const ECUtil::stripe_info_t *sinfo;
     ceph::ErasureCodeInterfaceRef ec_impl;
-  public:
-    explicit ECRecPred(ceph::ErasureCodeInterfaceRef ec_impl) : ec_impl(ec_impl) {
-      for (unsigned i = 0; i < ec_impl->get_chunk_count(); ++i) {
-       want.insert(i);
-      }
+
+   public:
+    explicit ECRecPred(const ECUtil::stripe_info_t *sinfo,
+                       ceph::ErasureCodeInterfaceRef ec_impl) :
+      sinfo(sinfo), ec_impl(ec_impl) {
+      want.insert_range(shard_id_t(0), sinfo->get_k_plus_m());
     }
+
     bool operator()(const std::set<pg_shard_t> &_have) const override {
-      std::set<int> have;
-      for (std::set<pg_shard_t>::const_iterator i = _have.begin();
-          i != _have.end();
-          ++i) {
-       have.insert(static_cast<int>(i->shard));
+      shard_id_set have;
+      for (pg_shard_t p: _have) {
+        have.insert(p.shard);
+      }
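+      // Plugins with sub-chunks need a map to receive the per-shard
+      // sub-chunk selection; all other plugins are passed a nullptr.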
+      std::unique_ptr<shard_id_map<std::vector<std::pair<int, int>>>>
+          min_sub_chunks = nullptr;
+      if (sinfo->supports_sub_chunks()) {
+        min_sub_chunks = std::make_unique<shard_id_map<std::vector<std::pair<
+          int, int>>>>(sinfo->get_k_plus_m());
       }
-      std::map<int, std::vector<std::pair<int, int>>> min;
+      shard_id_set min;
 
-      return ec_impl->minimum_to_decode(want, have, &min) == 0;
+      return ec_impl->minimum_to_decode(want, have, min, min_sub_chunks.get())
+          == 0;
     }
   };
+
   std::unique_ptr<ECRecPred> get_is_recoverable_predicate() const {
-    return std::make_unique<ECRecPred>(ec_impl);
+    return std::make_unique<ECRecPred>(&sinfo, ec_impl);
   }
 
-    unsigned get_ec_data_chunk_count() const {
-    return ec_impl->get_data_chunk_count();
+  unsigned get_ec_data_chunk_count() const {
+    return sinfo.get_k();
   }
+
   int get_ec_stripe_chunk_size() const {
     return sinfo.get_chunk_size();
   }
-  uint64_t object_size_to_shard_size(const uint64_t size,
-                                    shard_id_t shard) const {
-    if (size == std::numeric_limits<uint64_t>::max()) {
-      return size;
-    }
-    return sinfo.logical_to_next_chunk_offset(size);
+
+  uint64_t object_size_to_shard_size(const uint64_t size, shard_id_t shard
+    ) const {
+    return sinfo.object_size_to_shard_size(size, shard);
   }
+
   /**
    * ECReadPred
    *
@@ -403,16 +454,21 @@ public:
   class ECReadPred : public IsPGReadablePredicate {
     pg_shard_t whoami;
     ECRecPred rec_pred;
-  public:
+
+   public:
     ECReadPred(
-      pg_shard_t whoami,
-      ceph::ErasureCodeInterfaceRef ec_impl) : whoami(whoami), rec_pred(ec_impl) {}
+        pg_shard_t whoami,
+        const ECUtil::stripe_info_t *sinfo,
+        ceph::ErasureCodeInterfaceRef ec_impl) : whoami(whoami), rec_pred(sinfo, ec_impl) {}
+
     bool operator()(const std::set<pg_shard_t> &_have) const override {
       return _have.count(whoami) && rec_pred(_have);
     }
   };
-  std::unique_ptr<ECReadPred> get_is_readable_predicate(pg_shard_t whoami) const {
-    return std::make_unique<ECReadPred>(whoami, ec_impl);
+
+  std::unique_ptr<ECReadPred>
+  get_is_readable_predicate(pg_shard_t whoami) const {
+    return std::make_unique<ECReadPred>(whoami, &sinfo, ec_impl);
   }
 
   const ECUtil::stripe_info_t sinfo;
@@ -424,34 +480,41 @@ public:
     int,
     std::map<std::string, ceph::bufferlist, std::less<>>,
     size_t
-  > get_attrs_n_size_from_disk(const hobject_t& hoid);
+  > get_attrs_n_size_from_disk(const hobject_t &hoid);
+
+  ECUtil::HashInfoRef get_hinfo_from_disk(hobject_t oid);
+
+  std::optional<object_info_t> get_object_info_from_obc(
+      ObjectContextRef &obc_map
+    );
 
-public:
-  int object_stat(const hobject_t &hoid, struct stat* st);
+ public:
+  int object_stat(const hobject_t &hoid, struct stat *st);
   ECBackend(
-    PGBackend::Listener *pg,
-    CephContext *cct,
-    ceph::ErasureCodeInterfaceRef ec_impl,
-    uint64_t stripe_width,
-    ECSwitch *s,
-    ECExtentCache::LRU &ignored);
+      PGBackend::Listener *pg,
+      CephContext *cct,
+      ceph::ErasureCodeInterfaceRef ec_impl,
+      uint64_t stripe_width,
+      ECSwitch *s,
+      ECExtentCache::LRU &ec_extent_cache_lru
+    );
 
   int objects_get_attrs(
-    const hobject_t &hoid,
-    std::map<std::string, ceph::buffer::list, std::less<>> *out);
+      const hobject_t &hoid,
+      std::map<std::string, ceph::buffer::list, std::less<>> *out
+    );
 
   bool auto_repair_supported() const { return true; }
 
   int be_deep_scrub(
-    const hobject_t &poid,
-    ScrubMap &map,
-    ScrubMapBuilder &pos,
-    ScrubMap::object &o);
+      const hobject_t &poid,
+      ScrubMap &map,
+      ScrubMapBuilder &pos,
+      ScrubMap::object &o
+    );
 
-  uint64_t be_get_ondisk_size(uint64_t logical_size, shard_id_t ignored) const {
-    return sinfo.logical_to_next_chunk_offset(logical_size);
+  uint64_t be_get_ondisk_size(uint64_t logical_size, shard_id_t shard_id
+    ) const {
+    return object_size_to_shard_size(logical_size, shard_id);
   }
 };
-ostream &operator<<(ostream &lhs, const ECBackend::RMWPipeline::pipeline_state_t &rhs);
-
-END_IGNORE_DEPRECATED
index 1b197284161df4f3ed07c10ae1870f721bc5791f..6a4d64ba41516b398474f35583fba5150d1195f4 100644 (file)
 
 #include <iostream>
 #include <sstream>
+#include <ranges>
+#include <fmt/ostream.h>
 
 #include "ECInject.h"
-#include "messages/MOSDPGPush.h"
-#include "messages/MOSDPGPushReply.h"
 #include "messages/MOSDECSubOpWrite.h"
-#include "messages/MOSDECSubOpWriteReply.h"
 #include "messages/MOSDECSubOpRead.h"
-#include "messages/MOSDECSubOpReadReply.h"
 #include "common/debug.h"
 #include "ECMsgTypes.h"
 #include "PGLog.h"
-
 #include "osd_tracer.h"
 
 #define dout_context cct
 #undef dout_prefix
 #define dout_prefix _prefix(_dout, this)
 
-/* This file is soon going to be replaced (before next release), so we are going
- * to simply ignore all deprecated warnings.
- * */
-IGNORE_DEPRECATED
-
 using std::dec;
 using std::hex;
 using std::less;
@@ -60,77 +52,27 @@ using ceph::bufferptr;
 using ceph::ErasureCodeInterfaceRef;
 using ceph::Formatter;
 
-static ostream& _prefix(std::ostream *_dout, ECCommon::RMWPipeline *rmw_pipeline) {
+static ostream &_prefix(std::ostream *_dout,
+                        ECCommon::RMWPipeline const *rmw_pipeline) {
   return rmw_pipeline->get_parent()->gen_dbg_prefix(*_dout);
 }
-static ostream& _prefix(std::ostream *_dout, ECCommon::ReadPipeline *read_pipeline) {
-  return read_pipeline->get_parent()->gen_dbg_prefix(*_dout);
-}
-static ostream& _prefix(std::ostream *_dout,
-                       ECCommon::UnstableHashInfoRegistry *unstable_hash_info_registry) {
-  // TODO: backref to ECListener?
-  return *_dout;
-}
-static ostream& _prefix(std::ostream *_dout, struct ClientReadCompleter *read_completer);
-
-ostream &operator<<(ostream &lhs, const ECCommon::RMWPipeline::pipeline_state_t &rhs) {
-  switch (rhs.pipeline_state) {
-  case ECCommon::RMWPipeline::pipeline_state_t::CACHE_VALID:
-    return lhs << "CACHE_VALID";
-  case ECCommon::RMWPipeline::pipeline_state_t::CACHE_INVALID:
-    return lhs << "CACHE_INVALID";
-  default:
-    ceph_abort_msg("invalid pipeline state");
-  }
-  return lhs; // unreachable
-}
 
-ostream &operator<<(ostream &lhs, const ECCommon::ec_extent_t &rhs)
-{
-  return lhs << rhs.err << ","
-            << rhs.emap;
-}
-
-ostream &operator<<(ostream &lhs, const ECCommon::read_request_t &rhs)
-{
-  return lhs << "read_request_t(to_read=[" << rhs.to_read << "]"
-            << ", need=" << rhs.need
-            << ", want_attrs=" << rhs.want_attrs
-            << ")";
+static ostream &_prefix(std::ostream *_dout,
+                        ECCommon::ReadPipeline const *read_pipeline) {
+  return read_pipeline->get_parent()->gen_dbg_prefix(*_dout);
 }
 
-ostream &operator<<(ostream &lhs, const ECCommon::read_result_t &rhs)
-{
-  lhs << "read_result_t(r=" << rhs.r
-      << ", errors=" << rhs.errors;
-  if (rhs.attrs) {
-    lhs << ", attrs=" << *(rhs.attrs);
-  } else {
-    lhs << ", noattrs";
-  }
-  return lhs << ", returned=" << rhs.returned << ")";
+static ostream &_prefix(std::ostream *_dout,
+                        ECCommon::UnstableHashInfoRegistry *
+                        unstable_hash_info_registry) {
+  return *_dout;
 }
 
-ostream &operator<<(ostream &lhs, const ECCommon::ReadOp &rhs)
-{
-  lhs << "ReadOp(tid=" << rhs.tid;
-#ifndef WITH_CRIMSON
-  if (rhs.op && rhs.op->get_req()) {
-    lhs << ", op=";
-    rhs.op->get_req()->print(lhs);
-  }
-#endif
-  return lhs << ", to_read=" << rhs.to_read
-            << ", complete=" << rhs.complete
-            << ", priority=" << rhs.priority
-            << ", obj_to_source=" << rhs.obj_to_source
-            << ", source_to_obj=" << rhs.source_to_obj
-            << ", want_to_read" << rhs.want_to_read
-            << ", in_progress=" << rhs.in_progress << ")";
-}
+static ostream &_prefix(std::ostream *_dout,
+                        struct ClientReadCompleter const *read_completer
+  );
 
-void ECCommon::ReadOp::dump(Formatter *f) const
-{
+void ECCommon::ReadOp::dump(Formatter *f) const {
   f->dump_unsigned("tid", tid);
 #ifndef WITH_CRIMSON
   if (op && op->get_req()) {
@@ -142,272 +84,287 @@ void ECCommon::ReadOp::dump(Formatter *f) const
   f->dump_int("priority", priority);
   f->dump_stream("obj_to_source") << obj_to_source;
   f->dump_stream("source_to_obj") << source_to_obj;
-  f->dump_stream("want_to_read") << want_to_read;
   f->dump_stream("in_progress") << in_progress;
 }
 
-ostream &operator<<(ostream &lhs, const ECCommon::RMWPipeline::Op &rhs)
-{
-  lhs << "Op(" << rhs.hoid
-      << " v=" << rhs.version
-      << " tt=" << rhs.trim_to
-      << " tid=" << rhs.tid
-      << " reqid=" << rhs.reqid;
-#ifndef WITH_CRIMSON
-  if (rhs.client_op && rhs.client_op->get_req()) {
-    lhs << " client_op=";
-    rhs.client_op->get_req()->print(lhs);
-  }
-#endif
-  lhs << " pg_committed_to=" << rhs.pg_committed_to
-      << " temp_added=" << rhs.temp_added
-      << " temp_cleared=" << rhs.temp_cleared
-      << " pending_read=" << rhs.pending_read
-      << " remote_read=" << rhs.remote_read
-      << " remote_read_result=" << rhs.remote_read_result
-      << " pending_apply=" << rhs.pending_apply
-      << " pending_commit=" << rhs.pending_commit
-      << " plan.to_read=" << rhs.plan.to_read
-      << " plan.will_write=" << rhs.plan.will_write
-      << ")";
-  return lhs;
-}
-
-void ECCommon::ReadPipeline::complete_read_op(ReadOp &rop)
-{
+void ECCommon::ReadPipeline::complete_read_op(ReadOp &&rop) {
   dout(20) << __func__ << " completing " << rop << dendl;
-  map<hobject_t, read_request_t>::iterator req_iter =
-    rop.to_read.begin();
-  map<hobject_t, read_result_t>::iterator resiter =
-    rop.complete.begin();
+  auto req_iter = rop.to_read.begin();
+  auto resiter = rop.complete.begin();
   ceph_assert(rop.to_read.size() == rop.complete.size());
   for (; req_iter != rop.to_read.end(); ++req_iter, ++resiter) {
-    ceph_assert(rop.want_to_read.contains(req_iter->first));
+    auto &hoid = req_iter->first;
+    read_result_t &res = resiter->second;
+    read_request_t &req = req_iter->second;
     rop.on_complete->finish_single_request(
-      req_iter->first,
-      resiter->second,
-      req_iter->second.to_read,
-      rop.want_to_read[req_iter->first]);
+      hoid, std::move(res), req);
   }
   ceph_assert(rop.on_complete);
   std::move(*rop.on_complete).finish(rop.priority);
   rop.on_complete = nullptr;
+
   // if the read op is over, clean all the data of this tid.
-  for (set<pg_shard_t>::iterator iter = rop.in_progress.begin();
-    iter != rop.in_progress.end();
-    iter++) {
-    shard_to_read_map[*iter].erase(rop.tid);
+  for (auto &pg_shard: rop.in_progress) {
+    shard_to_read_map[pg_shard].erase(rop.tid);
   }
   rop.in_progress.clear();
   tid_to_read_map.erase(rop.tid);
 }
 
-void ECCommon::ReadPipeline::on_change()
-{
-  for (map<ceph_tid_t, ReadOp>::iterator i = tid_to_read_map.begin();
-       i != tid_to_read_map.end();
-       ++i) {
-    dout(10) << __func__ << ": cancelling " << i->second << dendl;
+void ECCommon::ReadPipeline::on_change() {
+  for (auto &rop: std::views::values(tid_to_read_map)) {
+    dout(10) << __func__ << ": cancelling " << rop << dendl;
   }
   tid_to_read_map.clear();
   shard_to_read_map.clear();
   in_progress_client_reads.clear();
 }
 
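+/* Compute which shards can currently be read from and written to: readable
+ * shards come from the acting set; writable shards are the acting recovery
+ * backfill set.
+ */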
+std::pair<const shard_id_set, const shard_id_set>
+ECCommon::ReadPipeline::get_readable_writable_shard_id_sets() {
+  shard_id_set readable;
+  shard_id_set writable;
+
+  for (auto &&pg_shard: get_parent()->get_acting_shards()) {
+    readable.insert(pg_shard.shard);
+  }
+
+  writable = get_parent()->get_acting_recovery_backfill_shard_id_set();
+  return std::make_pair(std::move(readable), std::move(writable));
+}
+
 void ECCommon::ReadPipeline::get_all_avail_shards(
-  const hobject_t &hoid,
-  const set<pg_shard_t> &error_shards,
-  set<int> &have,
-  map<shard_id_t, pg_shard_t> &shards,
-  bool for_recovery)
-{
-  for (set<pg_shard_t>::const_iterator i =
-        get_parent()->get_acting_shards().begin();
-       i != get_parent()->get_acting_shards().end();
-       ++i) {
-    dout(10) << __func__ << ": checking acting " << *i << dendl;
-    const pg_missing_t &missing = get_parent()->get_shard_missing(*i);
-    if (error_shards.contains(*i)) {
+    const hobject_t &hoid,
+    shard_id_set &have,
+    shard_id_map<pg_shard_t> &shards,
+    const bool for_recovery,
+    const std::optional<set<pg_shard_t>> &error_shards) {
+  for (auto &&pg_shard: get_parent()->get_acting_shards()) {
+    dout(10) << __func__ << ": checking acting " << pg_shard << dendl;
+    const pg_missing_t &missing = get_parent()->get_shard_missing(pg_shard);
+    if (error_shards && error_shards->contains(pg_shard)) {
       continue;
     }
+    const shard_id_t &shard = pg_shard.shard;
     if (cct->_conf->bluestore_debug_inject_read_err &&
-          ECInject::test_read_error1(ghobject_t(hoid, ghobject_t::NO_GEN, i->shard))) {
-      dout(0) << __func__ << " Error inject - Missing shard " << i->shard << dendl;
+      ECInject::test_read_error1(ghobject_t(hoid, ghobject_t::NO_GEN, shard))) {
+      dout(0) << __func__ << " Error inject - Missing shard " << shard << dendl;
       continue;
     }
     if (!missing.is_missing(hoid)) {
-      ceph_assert(!have.count(static_cast<int>(i->shard)));
-      have.insert(static_cast<int>(i->shard));
-      ceph_assert(!shards.count(i->shard));
-      shards.insert(make_pair(i->shard, *i));
+      ceph_assert(!have.contains(shard));
+      have.insert(shard);
+      ceph_assert(!shards.contains(shard));
+      shards.insert(shard, pg_shard);
     }
   }
 
   if (for_recovery) {
-    for (set<pg_shard_t>::const_iterator i =
-          get_parent()->get_backfill_shards().begin();
-        i != get_parent()->get_backfill_shards().end();
-        ++i) {
-      if (error_shards.find(*i) != error_shards.end())
-       continue;
-      if (have.count(static_cast<int>(i->shard))) {
-       ceph_assert(shards.count(i->shard));
-       continue;
+    for (auto &&pg_shard: get_parent()->get_backfill_shards()) {
+      if (error_shards && error_shards->contains(pg_shard))
+        continue;
+      const shard_id_t &shard = pg_shard.shard;
+      if (have.contains(shard)) {
+        ceph_assert(shards.contains(shard));
+        continue;
       }
-      dout(10) << __func__ << ": checking backfill " << *i << dendl;
-      ceph_assert(!shards.count(i->shard));
-      const pg_info_t &info = get_parent()->get_shard_info(*i);
-      const pg_missing_t &missing = get_parent()->get_shard_missing(*i);
+      dout(10) << __func__ << ": checking backfill " << pg_shard << dendl;
+      ceph_assert(!shards.contains(shard));
+      const pg_info_t &info = get_parent()->get_shard_info(pg_shard);
       if (hoid < info.last_backfill &&
-         !missing.is_missing(hoid)) {
-       have.insert(static_cast<int>(i->shard));
-       shards.insert(make_pair(i->shard, *i));
+        !get_parent()->get_shard_missing(pg_shard).is_missing(hoid)) {
+        have.insert(shard);
+        shards.insert(shard, pg_shard);
       }
     }
 
-    map<hobject_t, set<pg_shard_t>>::const_iterator miter =
-      get_parent()->get_missing_loc_shards().find(hoid);
+    auto miter = get_parent()->get_missing_loc_shards().find(hoid);
     if (miter != get_parent()->get_missing_loc_shards().end()) {
-      for (set<pg_shard_t>::iterator i = miter->second.begin();
-          i != miter->second.end();
-          ++i) {
-       dout(10) << __func__ << ": checking missing_loc " << *i << dendl;
-       auto m = get_parent()->maybe_get_shard_missing(*i);
-       if (m) {
-         ceph_assert(!(*m).is_missing(hoid));
-       }
-       if (error_shards.find(*i) != error_shards.end())
-         continue;
-       have.insert(static_cast<int>(i->shard));
-       shards.insert(make_pair(i->shard, *i));
+      for (auto &&pg_shard: miter->second) {
+        dout(10) << __func__ << ": checking missing_loc " << pg_shard << dendl;
+        if (const auto m = get_parent()->maybe_get_shard_missing(pg_shard)) {
+          ceph_assert(!m->is_missing(hoid));
+        }
+        if (error_shards && error_shards->contains(pg_shard)) {
+          continue;
+        }
+        have.insert(pg_shard.shard);
+        shards.insert(pg_shard.shard, pg_shard);
       }
     }
   }
 }
 
 int ECCommon::ReadPipeline::get_min_avail_to_read_shards(
-  const hobject_t &hoid,
-  const set<int> &want,
-  bool for_recovery,
-  bool do_redundant_reads,
-  map<pg_shard_t, vector<pair<int, int>>> *to_read)
-{
+    const hobject_t &hoid,
+    bool for_recovery,
+    bool do_redundant_reads,
+    read_request_t &read_request,
+    const std::optional<set<pg_shard_t>> &error_shards) {
   // Make sure we don't do redundant reads for recovery
   ceph_assert(!for_recovery || !do_redundant_reads);
 
-  set<int> have;
-  map<shard_id_t, pg_shard_t> shards;
-  set<pg_shard_t> error_shards;
+  if (read_request.object_size == 0) {
+    dout(10) << __func__ << " empty read" << dendl;
+    return 0;
+  }
 
-  get_all_avail_shards(hoid, error_shards, have, shards, for_recovery);
+  shard_id_set have;
+  shard_id_map<pg_shard_t> shards(sinfo.get_k_plus_m());
 
-  map<int, vector<pair<int, int>>> need;
-  int r = ec_impl->minimum_to_decode(want, have, &need);
-  if (r < 0)
+  get_all_avail_shards(hoid, have, shards, for_recovery, error_shards);
+
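+  /* Sub-chunk maps are only needed when the plugin supports sub-chunked
+   * reads; otherwise each chunk is read whole (see do_read_op).
+   */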
+  std::unique_ptr<shard_id_map<vector<pair<int, int>>>> need_sub_chunks =
+      nullptr;
+  if (sinfo.supports_sub_chunks()) {
+    need_sub_chunks = std::make_unique<shard_id_map<vector<pair<int, int>>>>(
+      sinfo.get_k_plus_m());
+  }
+  shard_id_set need_set;
+  shard_id_set want;
+
+  read_request.shard_want_to_read.populate_shard_id_set(want);
+
+  int r = ec_impl->minimum_to_decode(want, have, need_set,
+                                     need_sub_chunks.get());
+  if (r < 0) {
+    dout(20) << "minimum_to_decode_failed r: " << r << "want: " << want
+      << " have: " << have << " need: " << need_set << dendl;
     return r;
+  }
 
   if (do_redundant_reads) {
+    if (need_sub_chunks) {
       vector<pair<int, int>> subchunks_list;
       subchunks_list.push_back(make_pair(0, ec_impl->get_sub_chunk_count()));
       for (auto &&i: have) {
-        need[i] = subchunks_list;
+        (*need_sub_chunks)[i] = subchunks_list;
       }
-  } 
+    }
+    for (auto &&i: have) {
+      need_set.insert(i);
+    }
+  }
 
-  if (!to_read)
-    return 0;
+  extent_set extra_extents;
+  ECUtil::shard_extent_set_t read_mask(sinfo.get_k_plus_m());
+  ECUtil::shard_extent_set_t zero_mask(sinfo.get_k_plus_m());
+
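+  /* read_mask marks the bytes that actually exist on each shard for this
+   * object size; zero_mask marks trailing bytes that are known to be zero.
+   */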
+  sinfo.ro_size_to_read_mask(read_request.object_size, read_mask);
+  sinfo.ro_size_to_zero_mask(read_request.object_size, zero_mask);
 
-  for (auto &&i:need) {
-    ceph_assert(shards.count(shard_id_t(i.first)));
-    to_read->insert(make_pair(shards[shard_id_t(i.first)], i.second));
+  /* First deal with missing shards */
+  for (auto &&[shard, extent_set]: read_request.shard_want_to_read) {
+    /* Work out what extra extents we need to read on each shard. If
+     * do_redundant_reads is set, we issue the same reads on every
+     * available shard. Otherwise, extra extents are only needed where
+     * the wanted shard is unavailable.
+     */
+    if (!have.contains(shard) || do_redundant_reads) {
+      extra_extents.union_of(extent_set);
+    }
   }
-  return 0;
-}
 
-// a static for the sake of unittesting
-void ECCommon::ReadPipeline::get_min_want_to_read_shards(
-  const uint64_t offset,
-  const uint64_t length,
-  const ECUtil::stripe_info_t& sinfo,
-  set<int> *want_to_read)
-{
-  const auto [left_chunk_index, right_chunk_index] =
-    sinfo.offset_length_to_data_chunk_indices(offset, length);
-  const auto distance =
-    std::min(right_chunk_index - left_chunk_index, (uint64_t)sinfo.get_k());
-  for(uint64_t i = 0; i < distance; i++) {
-    raw_shard_id_t raw_shard((left_chunk_index + i) % sinfo.get_k());
-    want_to_read->insert(static_cast<int>(sinfo.get_shard(raw_shard)));
+  for (auto &shard: need_set) {
+    if (!have.contains(shard)) {
+      continue;
+    }
+    shard_id_t shard_id(shard);
+    extent_set extents = extra_extents;
+    shard_read_t shard_read;
+    if (need_sub_chunks) {
+      shard_read.subchunk = need_sub_chunks->at(shard_id);
+    }
+    shard_read.pg_shard = shards[shard_id];
+
+    if (read_request.shard_want_to_read.contains(shard)) {
+      extents.union_of(read_request.shard_want_to_read.at(shard));
+    }
+
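+    // Round the extents out to page boundaries, then trim back to the
+    // bytes that actually exist on this shard.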
+    extents.align(CEPH_PAGE_SIZE);
+    if (read_mask.contains(shard)) {
+      shard_read.extents.intersection_of(extents, read_mask.at(shard));
+    }
+
+    if (!shard_read.extents.empty()) {
+      read_request.shard_reads[shard_id] = std::move(shard_read);
+    }
   }
+
+  dout(20) << __func__ << " for_recovery: " << for_recovery
+    << " do_redundant_reads: " << do_redundant_reads
+    << " read_request: " << read_request
+    << " error_shards: " << error_shards
+    << dendl;
+  return 0;
 }
 
 void ECCommon::ReadPipeline::get_min_want_to_read_shards(
-  const uint64_t offset,
-  const uint64_t length,
-  set<int> *want_to_read)
-{
-  get_min_want_to_read_shards(offset, length, sinfo, want_to_read);
-  dout(20) << __func__ << ": offset " << offset << " length " << length
-          << " want_to_read " << *want_to_read << dendl;
+    const ec_align_t &to_read,
+    ECUtil::shard_extent_set_t &want_shard_reads) {
+  sinfo.ro_range_to_shard_extent_set(to_read.offset, to_read.size,
+                                     want_shard_reads);
+  dout(20) << __func__ << ": to_read " << to_read
+          << " read_request " << want_shard_reads << dendl;
 }
 
 int ECCommon::ReadPipeline::get_remaining_shards(
-  const hobject_t &hoid,
-  const set<int> &avail,
-  const set<int> &want,
-  const read_result_t &result,
-  map<pg_shard_t, vector<pair<int, int>>> *to_read,
-  bool for_recovery)
-{
-  ceph_assert(to_read);
-
-  set<int> have;
-  map<shard_id_t, pg_shard_t> shards;
+    const hobject_t &hoid,
+    read_result_t &read_result,
+    read_request_t &read_request,
+    const bool for_recovery,
+    const bool fast_read) {
   set<pg_shard_t> error_shards;
-  for (auto &p : result.errors) {
-    error_shards.insert(p.first);
+  for (auto &shard: std::views::keys(read_result.errors)) {
+    error_shards.insert(shard);
   }
 
-  get_all_avail_shards(hoid, error_shards, have, shards, for_recovery);
+  const int r = get_min_avail_to_read_shards(
+    hoid,
+    for_recovery,
+    fast_read,
+    read_request,
+    error_shards);
 
-  map<int, vector<pair<int, int>>> need;
-  int r = ec_impl->minimum_to_decode(want, have, &need);
-  if (r < 0) {
+  if (r) {
     dout(0) << __func__ << " not enough shards left to try for " << hoid
-           << " read result was " << result << dendl;
+           << " read result was " << read_result << dendl;
     return -EIO;
   }
 
-  set<int> shards_left;
-  for (auto p : need) {
-    if (avail.find(p.first) == avail.end()) {
-      shards_left.insert(p.first);
+  // Rather than repeating the whole read, subtract everything we have
+  // already read; only the remainder needs to be fetched again.
+  for (auto iter = read_request.shard_reads.begin();
+       iter != read_request.shard_reads.end();) {
+    auto &&[shard_id, shard_read] = *iter;
+    bool do_erase = false;
+
+    // Ignore shards that have not been read at all.
+    if (read_result.processed_read_requests.contains(shard_id)) {
+      shard_read.extents.subtract(
+        read_result.processed_read_requests.at(shard_id));
+      do_erase = shard_read.extents.empty();
     }
-  }
 
-  vector<pair<int, int>> subchunks;
-  subchunks.push_back(make_pair(0, ec_impl->get_sub_chunk_count()));
-  for (set<int>::iterator i = shards_left.begin();
-       i != shards_left.end();
-       ++i) {
-    ceph_assert(shards.count(shard_id_t(*i)));
-    ceph_assert(avail.find(*i) == avail.end());
-    to_read->insert(make_pair(shards[shard_id_t(*i)], subchunks));
+    if (do_erase) {
+      iter = read_request.shard_reads.erase(iter);
+    } else {
+      ++iter;
+    }
   }
-  return 0;
+
+  return read_request.shard_reads.empty() ? 1 : 0;
 }
 
 void ECCommon::ReadPipeline::start_read_op(
-  int priority,
-  map<hobject_t, set<int>> &want_to_read,
-  map<hobject_t, read_request_t> &to_read,
-  OpRequestRef _op,
-  bool do_redundant_reads,
-  bool for_recovery,
-  std::unique_ptr<ECCommon::ReadCompleter> on_complete)
-{
+    const int priority,
+    map<hobject_t, read_request_t> &to_read,
+    const bool do_redundant_reads,
+    const bool for_recovery,
+    std::unique_ptr<ReadCompleter> on_complete) {
   ceph_tid_t tid = get_parent()->get_tid();
-  ceph_assert(!tid_to_read_map.count(tid));
+  ceph_assert(!tid_to_read_map.contains(tid));
   auto &op = tid_to_read_map.emplace(
     tid,
     ReadOp(
@@ -416,527 +373,419 @@ void ECCommon::ReadPipeline::start_read_op(
       do_redundant_reads,
       for_recovery,
       std::move(on_complete),
-      _op,
-      std::move(want_to_read),
       std::move(to_read))).first->second;
   dout(10) << __func__ << ": starting " << op << dendl;
-  if (_op) {
+  if (op.op) {
 #ifndef WITH_CRIMSON
-    op.trace = _op->pg_trace;
+    op.trace = op.op->pg_trace;
 #endif
     op.trace.event("start ec read");
   }
   do_read_op(op);
 }
 
-void ECCommon::ReadPipeline::do_read_op(ReadOp &op)
-{
-  int priority = op.priority;
-  ceph_tid_t tid = op.tid;
+void ECCommon::ReadPipeline::do_read_op(ReadOp &rop) {
+  const int priority = rop.priority;
+  const ceph_tid_t tid = rop.tid;
 
-  dout(10) << __func__ << ": starting read " << op << dendl;
+  dout(10) << __func__ << ": starting read " << rop << dendl;
+  ceph_assert(!rop.to_read.empty());
 
   map<pg_shard_t, ECSubRead> messages;
-  for (map<hobject_t, read_request_t>::iterator i = op.to_read.begin();
-       i != op.to_read.end();
-       ++i) {
-    bool need_attrs = i->second.want_attrs;
-
-    for (auto j = i->second.need.begin();
-        j != i->second.need.end();
-        ++j) {
-      if (need_attrs) {
-       messages[j->first].attrs_to_read.insert(i->first);
-       need_attrs = false;
+  for (auto &&[hoid, read_request]: rop.to_read) {
+    bool need_attrs = read_request.want_attrs;
+    ceph_assert(!read_request.shard_reads.empty());
+
+    for (auto &&[shard, shard_read]: read_request.shard_reads) {
+      if (need_attrs && !sinfo.is_nonprimary_shard(shard)) {
+        messages[shard_read.pg_shard].attrs_to_read.insert(hoid);
+        need_attrs = false;
       }
-      messages[j->first].subchunks[i->first] = j->second;
-      op.obj_to_source[i->first].insert(j->first);
-      op.source_to_obj[j->first].insert(i->first);
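+      // Without sub-chunk support, read each chunk as a single default
+      // sub-chunk (0, 1) spanning the whole chunk.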
+      if (shard_read.subchunk) {
+        messages[shard_read.pg_shard].subchunks[hoid] = *shard_read.subchunk;
+      } else {
+        static const std::vector default_sub_chunk = {make_pair(0, 1)};
+        messages[shard_read.pg_shard].subchunks[hoid] = default_sub_chunk;
+      }
+      rop.obj_to_source[hoid].insert(shard_read.pg_shard);
+      rop.source_to_obj[shard_read.pg_shard].insert(hoid);
     }
-    for (const auto& read : i->second.to_read) {
-      auto p = make_pair(read.offset, read.size);
-      pair<uint64_t, uint64_t> chunk_off_len = sinfo.chunk_aligned_offset_len_to_chunk(p);
-      for (auto k = i->second.need.begin();
-          k != i->second.need.end();
-          ++k) {
-       messages[k->first].to_read[i->first].push_back(
-         boost::make_tuple(
-           chunk_off_len.first,
-           chunk_off_len.second,
-           read.flags));
+    for (auto &[_, shard_read]: read_request.shard_reads) {
+      ceph_assert(!shard_read.extents.empty());
+      rop.debug_log.emplace_back(ECUtil::READ_REQUEST, shard_read.pg_shard,
+                                 shard_read.extents);
+      for (auto &[start, len]: shard_read.extents) {
+        messages[shard_read.pg_shard].to_read[hoid].emplace_back(
+          boost::make_tuple(start, len, read_request.flags));
       }
-      ceph_assert(!need_attrs);
     }
+    ceph_assert(!need_attrs);
   }
 
   std::vector<std::pair<int, Message*>> m;
   m.reserve(messages.size());
-  for (map<pg_shard_t, ECSubRead>::iterator i = messages.begin();
-       i != messages.end();
-       ++i) {
-    op.in_progress.insert(i->first);
-    shard_to_read_map[i->first].insert(op.tid);
-    i->second.tid = tid;
-    MOSDECSubOpRead *msg = new MOSDECSubOpRead;
+  for (auto &&[pg_shard, read]: messages) {
+    rop.in_progress.insert(pg_shard);
+    shard_to_read_map[pg_shard].insert(rop.tid);
+    read.tid = tid;
+    auto *msg = new MOSDECSubOpRead;
     msg->set_priority(priority);
-    msg->pgid = spg_t(
-      get_info().pgid.pgid,
-      i->first.shard);
+    msg->pgid = spg_t(get_info().pgid.pgid, pg_shard.shard);
     msg->map_epoch = get_osdmap_epoch();
     msg->min_epoch = get_parent()->get_interval_start_epoch();
-    msg->op = i->second;
+    msg->op = read;
     msg->op.from = get_parent()->whoami_shard();
     msg->op.tid = tid;
-    if (op.trace) {
+    if (rop.trace) {
       // initialize a child span for this shard
-      msg->trace.init("ec sub read", nullptr, &op.trace);
-      msg->trace.keyval("shard", i->first.shard.id);
+      msg->trace.init("ec sub read", nullptr, &rop.trace);
+      msg->trace.keyval("shard", pg_shard.shard.id);
     }
-    m.push_back(std::make_pair(i->first.osd, msg));
+    m.push_back(std::make_pair(pg_shard.osd, msg));
   }
   if (!m.empty()) {
     get_parent()->send_message_osd_cluster(m, get_osdmap_epoch());
   }
 
-  dout(10) << __func__ << ": started " << op << dendl;
+  dout(10) << __func__ << ": started " << rop << dendl;
 }
 
 void ECCommon::ReadPipeline::get_want_to_read_shards(
-  std::set<int> *want_to_read) const
-{
-  for (raw_shard_id_t i; i < (int)sinfo.get_k(); ++i) {
-    want_to_read->insert(static_cast<int>(sinfo.get_shard(i)));
+    const list<ec_align_t> &to_read,
+    ECUtil::shard_extent_set_t &want_shard_reads) {
+  if (sinfo.supports_partial_reads()) {
+    // Optimised.
+    for (const auto &single_region: to_read) {
+      get_min_want_to_read_shards(single_region, want_shard_reads);
+    }
+    return;
+  }
+
+  // Non-optimised version: read the chunk-aligned range on every data shard.
+  for (const shard_id_t shard: sinfo.get_data_shards()) {
+    for (auto &&read: to_read) {
+      auto &&[offset, len] = sinfo.chunk_aligned_ro_range_to_shard_ro_range(
+        read.offset, read.size);
+      want_shard_reads[shard].union_insert(offset, len);
+    }
   }
 }
 
-struct ClientReadCompleter : ECCommon::ReadCompleter {
+struct ClientReadCompleter final : ECCommon::ReadCompleter {
   ClientReadCompleter(ECCommon::ReadPipeline &read_pipeline,
-                      ECCommon::ClientAsyncReadStatus *status)
+                      ECCommon::ClientAsyncReadStatus *status)
     : read_pipeline(read_pipeline),
       status(status) {}
 
   void finish_single_request(
-    const hobject_t &hoid,
-    ECCommon::read_result_t &res,
-    list<ec_align_t> to_read,
-    set<int> wanted_to_read) override
-  {
-    auto* cct = read_pipeline.cct;
+      const hobject_t &hoid,
+      ECCommon::read_result_t &&res,
+      ECCommon::read_request_t &req) override {
+    auto *cct = read_pipeline.cct;
     dout(20) << __func__ << " completing hoid=" << hoid
-             << " res=" << res << " to_read="  << to_read << dendl;
+             << " res=" << res << " req=" << req << dendl;
     extent_map result;
-    if (res.r != 0)
-      goto out;
-    ceph_assert(res.returned.size() == to_read.size());
-    ceph_assert(res.errors.empty());
-    for (auto &&read: to_read) {
-      const auto bounds = make_pair(read.offset, read.size);
-      const auto aligned =
-       read_pipeline.sinfo.offset_len_to_chunk_bounds(bounds);
-      ceph_assert(res.returned.front().get<0>() == aligned.first);
-      ceph_assert(res.returned.front().get<1>() == aligned.second);
-      map<int, bufferlist> to_decode;
-      bufferlist bl;
-      for (map<pg_shard_t, bufferlist>::iterator j =
-            res.returned.front().get<2>().begin();
-          j != res.returned.front().get<2>().end();
-          ++j) {
-       to_decode[static_cast<int>(j->first.shard)] = std::move(j->second);
-      }
-      dout(20) << __func__ << " going to decode: "
-               << " wanted_to_read=" << wanted_to_read
-               << " to_decode=" << to_decode
-               << dendl;
-      int r = ECUtil::decode(
-       read_pipeline.sinfo,
-       read_pipeline.ec_impl,
-       wanted_to_read,
-       to_decode,
-       &bl);
-      if (r < 0) {
-        dout(10) << __func__ << " error on ECUtil::decode r=" << r << dendl;
-        res.r = r;
-        goto out;
-      }
-      bufferlist trimmed;
-      // If partial stripe reads are disabled aligned_offset_in_stripe will
-      // be 0 which will mean trim_offset is 0. When partial reads are enabled
-      // the shards read (wanted_to_read) is a union of the requirements for
-      // each stripe, each range being read may need to trim unneeded shards
-      uint64_t aligned_offset_in_stripe = aligned.first -
-       read_pipeline.sinfo.logical_to_prev_stripe_offset(aligned.first);
-      uint64_t chunk_size = read_pipeline.sinfo.get_chunk_size();
-      uint64_t trim_offset = 0;
-      for (auto shard : wanted_to_read) {
-        int s = static_cast<int>(read_pipeline.sinfo.get_raw_shard(shard_id_t(shard)));
-        if ( s * chunk_size < aligned_offset_in_stripe) {
-         trim_offset += chunk_size;
-       } else {
-         break;
-       }
+    if (res.r == 0) {
+      ceph_assert(res.errors.empty());
+#if DEBUG_EC_BUFFERS
+      dout(20) << __func__ << ": before decode: " << res.buffers_read.debug_string(2048, 8) << dendl;
+#endif
+      /* Decode any missing buffers */
+      int r = res.buffers_read.decode(read_pipeline.ec_impl,
+                                      req.shard_want_to_read,
+                                      req.object_size);
+      ceph_assert(r == 0);
+
+#if DEBUG_EC_BUFFERS
+      dout(20) << __func__ << ": after decode: " << res.buffers_read.debug_string(2048, 8) << dendl;
+#endif
+
+      for (auto &&read: req.to_read) {
+        result.insert(read.offset, read.size,
+                      res.buffers_read.get_ro_buffer(read.offset, read.size));
       }
-      auto off = read.offset + trim_offset - aligned.first;
-      dout(20) << __func__ << " bl.length()=" << bl.length()
-              << " off=" << off
-              << " read.offset=" << read.offset
-              << " read.size=" << read.size
-              << " trim_offset="<< trim_offset << dendl;
-      ceph_assert(read.size <= bl.length() - off);
-      trimmed.substr_of(bl, off, read.size);
-      result.insert(
-       read.offset, trimmed.length(), std::move(trimmed));
-      res.returned.pop_front();
     }
-out:
     dout(20) << __func__ << " calling complete_object with result="
              << result << dendl;
-    status->complete_object(hoid, res.r, std::move(result));
+    status->complete_object(hoid, res.r, std::move(result),
+                            std::move(res.buffers_read));
     read_pipeline.kick_reads();
   }
 
-  void finish(int priority) && override
-  {
+  void finish(int priority) && override {
     // NOP
   }
 
   ECCommon::ReadPipeline &read_pipeline;
   ECCommon::ClientAsyncReadStatus *status;
 };
-static ostream& _prefix(std::ostream *_dout, ClientReadCompleter *read_completer) {
+
+static ostream &_prefix(std::ostream *_dout,
+                        ClientReadCompleter const *read_completer) {
   return _prefix(_dout, &read_completer->read_pipeline);
 }
 
 void ECCommon::ReadPipeline::objects_read_and_reconstruct(
-  const map<hobject_t, std::list<ec_align_t>> &reads,
-  bool fast_read,
-  GenContextURef<ECCommon::ec_extents_t &&> &&func)
-{
-  in_progress_client_reads.emplace_back(
-    reads.size(), std::move(func));
+    const map<hobject_t, std::list<ec_align_t>> &reads,
+    const bool fast_read,
+    const uint64_t object_size,
+    GenContextURef<ec_extents_t&&> &&func) {
+  in_progress_client_reads.emplace_back(reads.size(), std::move(func));
   if (!reads.size()) {
     kick_reads();
     return;
   }
 
-  map<hobject_t, set<int>> obj_want_to_read;
-    
   map<hobject_t, read_request_t> for_read_op;
-  for (auto &&to_read: reads) {
-    set<int> want_to_read;
-    if (cct->_conf->osd_ec_partial_reads) {
-      for (const auto& single_region : to_read.second) {
-        get_min_want_to_read_shards(single_region.offset,
-                                   single_region.size,
-                                   &want_to_read);
-      }
-    } else {
-      get_want_to_read_shards(&want_to_read);
-    }
-    map<pg_shard_t, vector<pair<int, int>>> shards;
-    int r = get_min_avail_to_read_shards(
-      to_read.first,
-      want_to_read,
+  for (auto &&[hoid, to_read]: reads) {
+    ECUtil::shard_extent_set_t want_shard_reads(sinfo.get_k_plus_m());
+    get_want_to_read_shards(to_read, want_shard_reads);
+
+    read_request_t read_request(to_read, want_shard_reads, false, object_size);
+    const int r = get_min_avail_to_read_shards(
+      hoid,
       false,
       fast_read,
-      &shards);
+      read_request);
     ceph_assert(r == 0);
 
-    int subchunk_size =
-      sinfo.get_chunk_size() / ec_impl->get_sub_chunk_count();
+    const int subchunk_size =
+        sinfo.get_chunk_size() / ec_impl->get_sub_chunk_count();
     dout(20) << __func__
+             << " to_read=" << to_read
              << " subchunk_size=" << subchunk_size
              << " chunk_size=" << sinfo.get_chunk_size() << dendl;
 
-    for_read_op.insert(
-      make_pair(
-       to_read.first,
-       read_request_t(
-         to_read.second,
-         shards,
-         false)));
-    obj_want_to_read.insert(make_pair(to_read.first, want_to_read));
+    for_read_op.insert(make_pair(hoid, read_request));
   }
 
   start_read_op(
     CEPH_MSG_PRIO_DEFAULT,
-    obj_want_to_read,
     for_read_op,
-    OpRequestRef(),
     fast_read,
     false,
-    std::make_unique<ClientReadCompleter>(*this, &(in_progress_client_reads.back())));
+    std::make_unique<ClientReadCompleter>(
+      *this, &(in_progress_client_reads.back())));
 }
 
+void ECCommon::ReadPipeline::objects_read_and_reconstruct_for_rmw(
+    map<hobject_t, read_request_t> &&to_read,
+    GenContextURef<ec_extents_t&&> &&func) {
+  in_progress_client_reads.emplace_back(to_read.size(), std::move(func));
+  if (!to_read.size()) {
+    kick_reads();
+    return;
+  }
 
-int ECCommon::ReadPipeline::send_all_remaining_reads(
-  const hobject_t &hoid,
-  ReadOp &rop)
-{
-  set<int> already_read;
-  const set<pg_shard_t>& ots = rop.obj_to_source[hoid];
-  for (set<pg_shard_t>::iterator i = ots.begin(); i != ots.end(); ++i)
-    already_read.insert(static_cast<int>(i->shard));
-  dout(10) << __func__ << " have/error shards=" << already_read << dendl;
-  map<pg_shard_t, vector<pair<int, int>>> shards;
-  int r = get_remaining_shards(hoid, already_read, rop.want_to_read[hoid],
-                              rop.complete[hoid], &shards, rop.for_recovery);
-  if (r)
-    return r;
+  map<hobject_t, read_request_t> for_read_op;
+  for (auto &&[hoid, read_request]: to_read) {
+    const int r =
+        get_min_avail_to_read_shards(hoid, false, false, read_request);
+    ceph_assert(r == 0);
+
+    const int subchunk_size =
+        sinfo.get_chunk_size() / ec_impl->get_sub_chunk_count();
+    dout(20) << __func__
+             << " read_request=" << read_request
+             << " subchunk_size=" << subchunk_size
+             << " chunk_size=" << sinfo.get_chunk_size() << dendl;
 
-  list<ec_align_t> to_read = rop.to_read.find(hoid)->second.to_read;
+    for_read_op.insert(make_pair(hoid, read_request));
+  }
 
+  start_read_op(
+    CEPH_MSG_PRIO_DEFAULT,
+    for_read_op, false, false,
+    std::make_unique<ClientReadCompleter>(
+      *this, &(in_progress_client_reads.back())));
+}
+
+int ECCommon::ReadPipeline::send_all_remaining_reads(
+    const hobject_t &hoid,
+    ReadOp &rop) {
   // (Note cuixf) If we need to read attrs and the read failed, try to read again.
-  bool want_attrs =
-    rop.to_read.find(hoid)->second.want_attrs &&
-    (!rop.complete[hoid].attrs || rop.complete[hoid].attrs->empty());
+  const bool want_attrs =
+      rop.to_read.at(hoid).want_attrs &&
+      (!rop.complete.at(hoid).attrs || rop.complete.at(hoid).attrs->empty());
   if (want_attrs) {
     dout(10) << __func__ << " want attrs again" << dendl;
   }
 
-  rop.to_read.erase(hoid);
-  rop.to_read.insert(make_pair(
-      hoid,
-      read_request_t(
-       to_read,
-       shards,
-       want_attrs)));
-  return 0;
+  read_request_t &read_request = rop.to_read.at(hoid);
+  // Reset the old shard reads; we are going to issue them again.
+  read_request.shard_reads.clear();
+  return get_remaining_shards(hoid, rop.complete.at(hoid), read_request,
+                              rop.do_redundant_reads, want_attrs);
 }
 
-void ECCommon::ReadPipeline::kick_reads()
-{
+void ECCommon::ReadPipeline::kick_reads() {
   while (in_progress_client_reads.size() &&
          in_progress_client_reads.front().is_complete()) {
-    in_progress_client_reads.front().run();
-    in_progress_client_reads.pop_front();
+    in_progress_client_reads.front().run();
+    in_progress_client_reads.pop_front();
   }
 }
 
-
-void ECCommon::RMWPipeline::start_rmw(OpRef op)
-{
-  ceph_assert(op);
-  dout(10) << __func__ << ": " << *op << dendl;
-
-  ceph_assert(!tid_to_op_map.count(op->tid));
-  waiting_state.push_back(*op);
-  tid_to_op_map[op->tid] = std::move(op);
-  check_ops();
+bool ec_align_t::operator==(const ec_align_t &other) const {
+  return offset == other.offset && size == other.size && flags == other.flags;
 }
 
-bool ECCommon::RMWPipeline::try_state_to_reads()
-{
-  if (waiting_state.empty())
-    return false;
-
-  Op *op = &(waiting_state.front());
-  if (op->requires_rmw() && pipeline_state.cache_invalid()) {
-    ceph_assert(get_parent()->get_pool().allows_ecoverwrites());
-    dout(20) << __func__ << ": blocking " << *op
-            << " because it requires an rmw and the cache is invalid "
-            << pipeline_state
-            << dendl;
-    return false;
-  }
-
-  if (!pipeline_state.caching_enabled()) {
-    op->using_cache = false;
-  } else if (op->invalidates_cache()) {
-    dout(20) << __func__ << ": invalidating cache after this op"
-            << dendl;
-    pipeline_state.invalidate();
-  }
-
-  waiting_state.pop_front();
-  waiting_reads.push_back(*op);
-
-  if (op->using_cache) {
-    cache.open_write_pin(op->pin);
+bool ECCommon::shard_read_t::operator==(const shard_read_t &other) const {
+  return extents == other.extents &&
+      subchunk == other.subchunk &&
+      pg_shard == other.pg_shard;
+}
 
-    extent_set empty;
-    for (auto &&hpair: op->plan.will_write) {
-      auto to_read_plan_iter = op->plan.to_read.find(hpair.first);
-      const extent_set &to_read_plan =
-       to_read_plan_iter == op->plan.to_read.end() ?
-       empty :
-       to_read_plan_iter->second;
+bool ECCommon::read_request_t::operator==(const read_request_t &other) const {
+  return to_read == other.to_read &&
+      flags == other.flags &&
+      shard_want_to_read == other.shard_want_to_read &&
+      shard_reads == other.shard_reads &&
+      want_attrs == other.want_attrs;
+}
 
-      extent_set remote_read = cache.reserve_extents_for_rmw(
-       hpair.first,
-       op->pin,
-       hpair.second,
-       to_read_plan);
+void ECCommon::RMWPipeline::start_rmw(OpRef op) {
+  dout(20) << __func__ << " op=" << *op << dendl;
 
-      extent_set pending_read = to_read_plan;
-      pending_read.subtract(remote_read);
+  ceph_assert(!tid_to_op_map.contains(op->tid));
+  tid_to_op_map[op->tid] = op;
 
-      if (!remote_read.empty()) {
-       op->remote_read[hpair.first] = std::move(remote_read);
-      }
-      if (!pending_read.empty()) {
-       op->pending_read[hpair.first] = std::move(pending_read);
-      }
-    }
-  } else {
-    op->remote_read = op->plan.to_read;
-  }
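+  // One extent-cache op is prepared per planned object; the write proceeds
+  // once every cache op has called back with its read results.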
+  op->pending_cache_ops = op->plan.plans.size();
+  waiting_commit.push_back(op);
 
-  dout(10) << __func__ << ": " << *op << dendl;
-
-  if (!op->remote_read.empty()) {
-    ceph_assert(get_parent()->get_pool().allows_ecoverwrites());
-    objects_read_async_no_cache(
-      op->remote_read,
-      [op, this](ec_extents_t &&results) {
-       for (auto &&i: results) {
-         op->remote_read_result.emplace(make_pair(i.first, i.second.emap));
-       }
-       check_ops();
+  for (auto &plan: op->plan.plans) {
+    ECExtentCache::OpRef cache_op = extent_cache.prepare(plan.hoid,
+      plan.to_read,
+      plan.will_write,
+      plan.orig_size,
+      plan.projected_size,
+      plan.invalidates_cache,
+      [op](ECExtentCache::OpRef const &cop) {
+        op->cache_ready(cop->get_hoid(), cop->get_result());
       });
+    op->cache_ops.emplace_back(std::move(cache_op));
   }
-
-  return true;
+  extent_cache.execute(op->cache_ops);
 }
 
-bool ECCommon::RMWPipeline::try_reads_to_commit()
-{
-  if (waiting_reads.empty())
-    return false;
-  Op *op = &(waiting_reads.front());
-  if (op->read_in_progress())
-    return false;
-  waiting_reads.pop_front();
-  waiting_commit.push_back(*op);
-
-  dout(10) << __func__ << ": starting commit on " << *op << dendl;
-  dout(20) << __func__ << ": " << cache << dendl;
-
+void ECCommon::RMWPipeline::cache_ready(Op &op) {
   get_parent()->apply_stats(
-    op->hoid,
-    op->delta_stats);
-
-  if (op->using_cache) {
-    for (auto &&hpair: op->pending_read) {
-      op->remote_read_result[hpair.first].insert(
-       cache.get_remaining_extents_for_rmw(
-         hpair.first,
-         op->pin,
-         hpair.second));
-    }
-    op->pending_read.clear();
-  } else {
-    ceph_assert(op->pending_read.empty());
-  }
+    op.hoid,
+    op.delta_stats);
 
-  map<shard_id_t, ObjectStore::Transaction> trans;
-  for (set<pg_shard_t>::const_iterator i =
-        get_parent()->get_acting_recovery_backfill_shards().begin();
-       i != get_parent()->get_acting_recovery_backfill_shards().end();
-       ++i) {
-    trans.emplace(i->shard, get_parent()->min_peer_features());
+  shard_id_map<ObjectStore::Transaction> trans(sinfo.get_k_plus_m());
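+  // Default-construct a (possibly empty) transaction for every shard that
+  // may need to be written.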
+  for (auto &&shard:
+       get_parent()->get_acting_recovery_backfill_shard_id_set()) {
+    trans[shard];
   }
 
-  op->trace.event("start ec write");
+  op.trace.event("start ec write");
 
-  map<hobject_t,extent_map> written;
-  op->generate_transactions(
+  map<hobject_t, ECUtil::shard_extent_map_t> written;
+  op.generate_transactions(
     ec_impl,
     get_parent()->get_info().pgid.pgid,
     sinfo,
     &written,
     &trans,
     get_parent()->get_dpp(),
-    get_osdmap()->require_osd_release);
+    get_osdmap());
 
-  dout(20) << __func__ << ": " << cache << dendl;
-  dout(20) << __func__ << ": written: " << written << dendl;
-  dout(20) << __func__ << ": op: " << *op << dendl;
+  dout(20) << __func__ << ": written: " << written << ", op: " << op << dendl;
 
-  if (!get_parent()->get_pool().allows_ecoverwrites()) {
-    for (auto &&i: op->log_entries) {
+  if (!sinfo.supports_ec_overwrites()) {
+    for (auto &&i: op.log_entries) {
       if (i.requires_kraken()) {
-       derr << __func__ << ": log entry " << i << " requires kraken"
-            << " but overwrites are not enabled!" << dendl;
-       ceph_abort();
+        derr << __func__ << ": log entry " << i << " requires kraken"
+             << " but overwrites are not enabled!" << dendl;
+        ceph_abort();
       }
     }
   }
 
-  map<hobject_t,extent_set> written_set;
-  for (auto &&i: written) {
-    written_set[i.first] = i.second.get_interval_set();
-  }
-  dout(20) << __func__ << ": written_set: " << written_set << dendl;
-  ceph_assert(written_set == op->plan.will_write);
-
-  if (op->using_cache) {
-    for (auto &&hpair: written) {
-      dout(20) << __func__ << ": " << hpair << dendl;
-      cache.present_rmw_update(hpair.first, op->pin, hpair.second);
-    }
-  }
-  op->remote_read.clear();
-  op->remote_read_result.clear();
-
   ObjectStore::Transaction empty;
   bool should_write_local = false;
   ECSubWrite local_write_op;
   std::vector<std::pair<int, Message*>> messages;
   messages.reserve(get_parent()->get_acting_recovery_backfill_shards().size());
   set<pg_shard_t> backfill_shards = get_parent()->get_backfill_shards();
-  for (set<pg_shard_t>::const_iterator i =
-        get_parent()->get_acting_recovery_backfill_shards().begin();
-       i != get_parent()->get_acting_recovery_backfill_shards().end();
-       ++i) {
-    op->pending_apply.insert(*i);
-    op->pending_commit.insert(*i);
-    map<shard_id_t, ObjectStore::Transaction>::iterator iter =
-      trans.find(i->shard);
-    ceph_assert(iter != trans.end());
-    bool should_send = get_parent()->should_send_op(*i, op->hoid);
+
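+  // Track the newest version written per object; versions must never move
+  // backwards.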
+  if (op.version.version != 0) {
+    if (oid_to_version.contains(op.hoid)) {
+      ceph_assert(oid_to_version.at(op.hoid) <= op.version);
+    }
+    oid_to_version[op.hoid] = op.version;
+  }
+  for (auto &&pg_shard: get_parent()->get_acting_recovery_backfill_shards()) {
+    ObjectStore::Transaction &transaction = trans.at(pg_shard.shard);
+    shard_id_t shard = pg_shard.shard;
+    if (transaction.empty()) {
+      dout(20) << __func__ << " Transaction for osd." << pg_shard.osd << " shard " << shard << " is empty" << dendl;
+    } else {
+      dout(20) << __func__ << " Transaction for osd." << pg_shard.osd << " shard " << shard << " contents ";
+      Formatter *f = Formatter::create("json");
+      f->open_object_section("t");
+      transaction.dump(f);
+      f->close_section();
+      f->flush(*_dout);
+      delete f;
+      *_dout << dendl;
+    }
+    if (op.skip_transaction(pending_roll_forward, shard, transaction)) {
+      // Must be an empty transaction
+      ceph_assert(transaction.empty());
+      dout(20) << __func__ << " Skipping transaction for osd." << shard << dendl;
+      continue;
+    }
+    op.pending_commits++;
+    bool should_send = get_parent()->should_send_op(pg_shard, op.hoid);
     const pg_stat_t &stats =
-      (should_send || !backfill_shards.count(*i)) ?
-      get_info().stats :
-      get_parent()->get_shard_info().find(*i)->second.stats;
+        (should_send || !backfill_shards.contains(pg_shard))
+          ? get_info().stats
+          : get_parent()->get_shard_info().find(pg_shard)->second.stats;
 
     ECSubWrite sop(
       get_parent()->whoami_shard(),
-      op->tid,
-      op->reqid,
-      op->hoid,
+      op.tid,
+      op.reqid,
+      op.hoid,
       stats,
-      should_send ? iter->second : empty,
-      op->version,
-      op->trim_to,
-      op->pg_committed_to,
-      op->log_entries,
-      op->updated_hit_set_history,
-      op->temp_added,
-      op->temp_cleared,
+      should_send ? transaction : empty,
+      op.version,
+      op.trim_to,
+      op.pg_committed_to,
+      op.log_entries,
+      op.updated_hit_set_history,
+      op.temp_added,
+      op.temp_cleared,
       !should_send);
 
     ZTracer::Trace trace;
-    if (op->trace) {
+    if (op.trace) {
       // initialize a child span for this shard
-      trace.init("ec sub write", nullptr, &op->trace);
-      trace.keyval("shard", i->shard.id);
+      trace.init("ec sub write", nullptr, &op.trace);
+      trace.keyval("shard", pg_shard.shard.id);
     }
 
-    if (*i == get_parent()->whoami_shard()) {
+    if (pg_shard == get_parent()->whoami_shard()) {
       should_write_local = true;
       local_write_op.claim(sop);
     } else if (cct->_conf->bluestore_debug_inject_read_err &&
-                 ECInject::test_write_error1(ghobject_t(op->hoid,
-                   ghobject_t::NO_GEN, i->shard))) {
+      ECInject::test_write_error1(ghobject_t(op.hoid,
+                                             ghobject_t::NO_GEN,
+                                             pg_shard.shard))) {
       dout(0) << " Error inject - Dropping write message to shard " <<
-        i->shard << dendl;
+          pg_shard.shard << dendl;
     } else {
-      MOSDECSubOpWrite *r = new MOSDECSubOpWrite(sop);
-      r->pgid = spg_t(get_parent()->primary_spg_t().pgid, i->shard);
+      auto *r = new MOSDECSubOpWrite(sop);
+      r->pgid = spg_t(get_parent()->primary_spg_t().pgid, pg_shard.shard);
       r->map_epoch = get_osdmap_epoch();
       r->min_epoch = get_parent()->get_interval_start_epoch();
       r->trace = trace;
-      messages.push_back(std::make_pair(i->osd, r));
+      messages.push_back(std::make_pair(pg_shard.osd, r));
     }
   }
 
@@ -947,148 +796,151 @@ bool ECCommon::RMWPipeline::try_reads_to_commit()
   if (should_write_local) {
     handle_sub_write(
       get_parent()->whoami_shard(),
-      op->client_op,
+      op.client_op,
       local_write_op,
-      op->trace);
+      op.trace);
   }
 
-  for (auto i = op->on_write.begin();
-       i != op->on_write.end();
-       op->on_write.erase(i++)) {
-    (*i)();
-  }
 
-  return true;
+  for (auto &cop: op.cache_ops) {
+    const hobject_t &oid = cop->get_hoid();
+    if (written.contains(oid)) {
+      extent_cache.write_done(cop, std::move(written.at(oid)));
+    } else {
+      extent_cache.write_done(cop, ECUtil::shard_extent_map_t(&sinfo));
+    }
+  }
 }
 
-struct ECDummyOp : ECCommon::RMWPipeline::Op {
+struct ECDummyOp final : ECCommon::RMWPipeline::Op {
   void generate_transactions(
-      ceph::ErasureCodeInterfaceRef &ecimpl,
+      ceph::ErasureCodeInterfaceRef &ec_impl,
       pg_t pgid,
       const ECUtil::stripe_info_t &sinfo,
-      std::map<hobject_t,extent_map> *written,
-      std::map<shard_id_t, ObjectStore::Transaction> *transactions,
+      map<hobject_t, ECUtil::shard_extent_map_t> *written,
+      shard_id_map<ObjectStore::Transaction> *transactions,
       DoutPrefixProvider *dpp,
-      const ceph_release_t require_osd_release) final
-  {
+      const OSDMapRef &osdmap) override {
     // NOP, as -- in contrast to ECClassicalOp -- there is no
     // transaction involved
   }
+
+  bool skip_transaction(
+      std::set<shard_id_t> &pending_roll_forward,
+      const shard_id_t shard,
+      ceph::os::Transaction &transaction) override {
+    return !pending_roll_forward.erase(shard);
+  }
 };
 
-bool ECCommon::RMWPipeline::try_finish_rmw()
-{
-  if (waiting_commit.empty())
-    return false;
-  Op *op = &(waiting_commit.front());
-  if (op->write_in_progress())
-    return false;
-  waiting_commit.pop_front();
+void ECCommon::RMWPipeline::try_finish_rmw() {
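+  // Ops complete strictly in submission order: stop at the first op that
+  // still has outstanding shard commits or cache ops.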
+  while (!waiting_commit.empty()) {
+    OpRef op = waiting_commit.front();
+
+    if (op->pending_commits != 0 || op->pending_cache_ops != 0) {
+      return;
+    }
+
+    waiting_commit.pop_front();
+    finish_rmw(op);
+  }
+}
+
+void ECCommon::RMWPipeline::finish_rmw(OpRef const &op) {
+  dout(20) << __func__ << " op=" << *op << dendl;
 
-  dout(10) << __func__ << ": " << *op << dendl;
-  dout(20) << __func__ << ": " << cache << dendl;
+  if (op->on_all_commit) {
+    dout(10) << __func__ << " Calling on_all_commit on " << op << dendl;
+    op->on_all_commit->complete(0);
+    op->on_all_commit = nullptr;
+    op->trace.event("ec write all committed");
+  }
 
   if (op->pg_committed_to > completed_to)
     completed_to = op->pg_committed_to;
   if (op->version > committed_to)
     committed_to = op->version;
 
-  if (get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
-    if (op->version > get_parent()->get_log().get_can_rollback_to() &&
-       waiting_reads.empty() &&
-       waiting_commit.empty()) {
+  op->cache_ops.clear();
+
+  if (extent_cache.idle()) {
+    if (op->version > get_parent()->get_log().get_can_rollback_to()) {
+      const int transactions_since_last_idle =
+          extent_cache.get_and_reset_counter();
+      dout(20) << __func__ << " version=" << op->version << " ec_counter=" <<
+          transactions_since_last_idle << dendl;
       // submit a dummy, transaction-empty op to kick the rollforward
-      auto tid = get_parent()->get_tid();
-      auto nop = std::make_unique<ECDummyOp>();
+      const auto tid = get_parent()->get_tid();
+      const auto nop = std::make_shared<ECDummyOp>();
       nop->hoid = op->hoid;
       nop->trim_to = op->trim_to;
       nop->pg_committed_to = op->version;
       nop->tid = tid;
       nop->reqid = op->reqid;
-      waiting_reads.push_back(*nop);
-      tid_to_op_map[tid] = std::move(nop);
-    }
-  }
+      nop->pending_cache_ops = 1;
+      nop->pipeline = this;
 
-  if (op->using_cache) {
-    cache.release_write_pin(op->pin);
-  }
-  tid_to_op_map.erase(op->tid);
+      tid_to_op_map[tid] = nop;
 
-  if (waiting_reads.empty() &&
-      waiting_commit.empty()) {
-    pipeline_state.clear();
-    dout(20) << __func__ << ": clearing pipeline_state "
-            << pipeline_state
-            << dendl;
+      /* The cache is idle (we checked above) and this IO never blocks for
+       * reads, so we can skip the extent cache and immediately call the
+       * completion.
+       */
+      nop->cache_ready(nop->hoid, ECUtil::shard_extent_map_t(&sinfo));
+    }
   }
-  return true;
-}
 
-void ECCommon::RMWPipeline::check_ops()
-{
-  while (try_state_to_reads() ||
-        try_reads_to_commit() ||
-        try_finish_rmw());
+  tid_to_op_map.erase(op->tid);
 }
 
-void ECCommon::RMWPipeline::on_change()
-{
+void ECCommon::RMWPipeline::on_change() {
   dout(10) << __func__ << dendl;
 
   completed_to = eversion_t();
   committed_to = eversion_t();
-  pipeline_state.clear();
-  waiting_reads.clear();
-  waiting_state.clear();
-  waiting_commit.clear();
-  for (auto &&op: tid_to_op_map) {
-    cache.release_write_pin(op.second->pin);
-  }
+  extent_cache.on_change();
   tid_to_op_map.clear();
+  oid_to_version.clear();
+  waiting_commit.clear();
+}
+
+void ECCommon::RMWPipeline::on_change2() {
+  extent_cache.on_change2();
 }
 
 void ECCommon::RMWPipeline::call_write_ordered(std::function<void(void)> &&cb) {
-  if (!waiting_state.empty()) {
-    waiting_state.back().on_write.emplace_back(std::move(cb));
-  } else if (!waiting_reads.empty()) {
-    waiting_reads.back().on_write.emplace_back(std::move(cb));
-  } else {
-    // Nothing earlier in the pipeline, just call it
-    cb();
-  }
+  extent_cache.add_on_write(std::move(cb));
 }
 
 ECUtil::HashInfoRef ECCommon::UnstableHashInfoRegistry::maybe_put_hash_info(
-  const hobject_t &hoid,
-  ECUtil::HashInfo &&hinfo)
-{
+    const hobject_t &hoid,
+    ECUtil::HashInfo &&hinfo) {
   return registry.lookup_or_create(hoid, hinfo);
 }
 
 ECUtil::HashInfoRef ECCommon::UnstableHashInfoRegistry::get_hash_info(
-  const hobject_t &hoid,
-  bool create,
-  const map<string, bufferlist, less<>>& attrs,
-  uint64_t size)
-{
+    const hobject_t &hoid,
+    bool create,
+    const map<string, bufferlist, less<>> &attrs,
+    uint64_t size) {
   dout(10) << __func__ << ": Getting attr on " << hoid << dendl;
-  ECUtil::HashInfoRef ref = registry.lookup(hoid);
+  auto ref = registry.lookup(hoid);
   if (!ref) {
     dout(10) << __func__ << ": not in cache " << hoid << dendl;
     ECUtil::HashInfo hinfo(ec_impl->get_chunk_count());
     bufferlist bl;
-    map<string, bufferlist>::const_iterator k = attrs.find(ECUtil::get_hinfo_key());
-    if (k == attrs.end()) {
-      dout(5) << __func__ << " " << hoid << " missing hinfo attr" << dendl;
+    if (attrs.contains(ECUtil::get_hinfo_key())) {
+      bl = attrs.at(ECUtil::get_hinfo_key());
     } else {
-      bl = k->second;
+      dout(30) << __func__ << " " << hoid << " missing hinfo attr" << dendl;
     }
     if (bl.length() > 0) {
       auto bp = bl.cbegin();
       try {
         decode(hinfo, bp);
-      } catch(...) {
+      } catch (...) {
         dout(0) << __func__ << ": Can't decode hinfo for " << hoid << dendl;
         return ECUtil::HashInfoRef();
       }
@@ -1096,10 +948,10 @@ ECUtil::HashInfoRef ECCommon::UnstableHashInfoRegistry::get_hash_info(
         dout(0) << __func__ << ": Mismatch of total_chunk_size "
                       << hinfo.get_total_chunk_size() << dendl;
         return ECUtil::HashInfoRef();
-      } else {
-        create = true;
       }
-    } else if (size == 0) { // If empty object and no hinfo, create it
+      create = true;
+    } else if (size == 0) {
+      // If empty object and no hinfo, create it
       create = true;
     }
     if (create) {
@@ -1108,5 +960,3 @@ ECUtil::HashInfoRef ECCommon::UnstableHashInfoRegistry::get_hash_info(
   }
   return ref;
 }
-
-END_IGNORE_DEPRECATED
index 8135269274d5978d3917af9880857ef9065f671e..a17aff017ffd3f2effb3123f6114b3357a7c2cc2 100644 (file)
@@ -22,7 +22,6 @@
 #include "ECUtil.h"
 #include "ECTypes.h"
 #if WITH_CRIMSON
-#include "ExtentCache.h"
 #include "crimson/osd/object_context.h"
 #include "os/Transaction.h"
 #include "osd/OSDMap.h"
@@ -32,7 +31,7 @@ struct ECTransaction {
   struct WritePlan {
     bool invalidates_cache = false; // Yes, both are possible
     std::map<hobject_t,extent_set> to_read;
-    std::map<hobject_t,extent_set> will_write; // superset of to_read
+    std::map<hobject_t,extent_set> will_write;
 
     std::map<hobject_t,ECUtil::HashInfoRef> hash_infos;
   };
@@ -45,8 +44,9 @@ typedef crimson::osd::ObjectContextRef ObjectContextRef;
 #endif
 
 #include "ECTransaction.h"
-#include "ExtentCache.h"
+#include "ECExtentCache.h"
 #include "ECListener.h"
+#include "common/dout.h"
 
 //forward declaration
 struct ECSubWrite;
@@ -56,36 +56,86 @@ struct ECCommon {
   struct ec_extent_t {
     int err;
     extent_map emap;
+    ECUtil::shard_extent_map_t shard_extent_map;
+
+    void print(std::ostream &os) const {
+      os << err << "," << emap;
+    }
   };
-  friend std::ostream &operator<<(std::ostream &lhs, const ec_extent_t &rhs);
+
   using ec_extents_t = std::map<hobject_t, ec_extent_t>;
 
   virtual ~ECCommon() = default;
 
   virtual void handle_sub_write(
-    pg_shard_t from,
-    OpRequestRef msg,
-    ECSubWrite &op,
-    const ZTracer::Trace &trace,
-    ECListener& eclistener
-    ) = 0;
+      pg_shard_t from,
+      OpRequestRef msg,
+      ECSubWrite &op,
+      const ZTracer::Trace &trace,
+      ECListener &eclistener) = 0;
 
   virtual void objects_read_and_reconstruct(
-    const std::map<hobject_t, std::list<ec_align_t>> &reads,
-    bool fast_read,
-    GenContextURef<ec_extents_t &&> &&func) = 0;
+      const std::map<hobject_t, std::list<ec_align_t>> &reads,
+      bool fast_read,
+      uint64_t object_size,
+      GenContextURef<ec_extents_t&&> &&func) = 0;
+
+  struct shard_read_t {
+    extent_set extents;
+    std::optional<std::vector<std::pair<int, int>>> subchunk;
+    pg_shard_t pg_shard;
+    bool operator==(const shard_read_t &other) const;
+
+    void print(std::ostream &os) const {
+      os << "shard_read_t(extents=[" << extents << "]"
+          << ", subchunk=" << subchunk
+          << ", pg_shard=" << pg_shard
+          << ")";
+    }
+  };
 
   struct read_request_t {
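+    // shard_want_to_read holds the client-requested extents mapped to each
+    // shard; shard_reads holds what will actually be read, which may be
+    // larger due to page alignment and decode requirements.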
     const std::list<ec_align_t> to_read;
-    std::map<pg_shard_t, std::vector<std::pair<int, int>>> need;
-    bool want_attrs;
+    const uint32_t flags = 0;
+    const ECUtil::shard_extent_set_t shard_want_to_read;
+    shard_id_map<shard_read_t> shard_reads;
+    bool want_attrs = false;
+    uint64_t object_size;
+
     read_request_t(
-      const std::list<ec_align_t> &to_read,
-      const std::map<pg_shard_t, std::vector<std::pair<int, int>>> &need,
-      bool want_attrs)
-      : to_read(to_read), need(need), want_attrs(want_attrs) {}
+        const std::list<ec_align_t> &to_read,
+        const ECUtil::shard_extent_set_t &shard_want_to_read,
+        bool want_attrs, uint64_t object_size) :
+      to_read(to_read),
+      flags(to_read.front().flags),
+      shard_want_to_read(shard_want_to_read),
+      shard_reads(shard_want_to_read.get_max_shards()),
+      want_attrs(want_attrs),
+      object_size(object_size) {}
+
+    read_request_t(const ECUtil::shard_extent_set_t &shard_want_to_read,
+                   bool want_attrs, uint64_t object_size) :
+      shard_want_to_read(shard_want_to_read),
+      shard_reads(shard_want_to_read.get_max_shards()),
+      want_attrs(want_attrs),
+      object_size(object_size) {}
+
+    bool operator==(const read_request_t &other) const;
+
+    void print(std::ostream &os) const {
+      os << "read_request_t(to_read=[" << to_read << "]"
+          << ", flags=" << flags
+          << ", shard_want_to_read=" << shard_want_to_read
+          << ", shard_reads=" << shard_reads
+          << ", want_attrs=" << want_attrs
+          << ")";
+    }
   };
-  friend std::ostream &operator<<(std::ostream &lhs, const read_request_t &rhs);
+
+  virtual void objects_read_and_reconstruct_for_rmw(
+      std::map<hobject_t, read_request_t> &&to_read,
+      GenContextURef<ec_extents_t&&> &&func) = 0;
+
   struct ReadOp;
   /**
    * Low level async read mechanism
@@ -111,19 +161,30 @@ struct ECCommon {
   struct read_result_t {
     int r;
     std::map<pg_shard_t, int> errors;
-    std::optional<std::map<std::string, ceph::buffer::list, std::less<>> > attrs;
-    std::list<
-      boost::tuple<
-       uint64_t, uint64_t, std::map<pg_shard_t, ceph::buffer::list> > > returned;
-    read_result_t() : r(0) {}
+    std::optional<std::map<std::string, ceph::buffer::list, std::less<>>> attrs;
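+    // buffers_read accumulates the data returned so far per shard;
+    // processed_read_requests records extents already issued, so retries
+    // only fetch what is still missing.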
+    ECUtil::shard_extent_map_t buffers_read;
+    ECUtil::shard_extent_set_t processed_read_requests;
+
+    read_result_t(const ECUtil::stripe_info_t *sinfo) :
+      r(0), buffers_read(sinfo),
+      processed_read_requests(sinfo->get_k_plus_m()) {}
+
+    void print(std::ostream &os) const {
+      os << "read_result_t(r=" << r << ", errors=" << errors;
+      if (attrs) {
+        os << ", attrs=" << *(attrs);
+      } else {
+        os << ", noattrs";
+      }
+      os << ", buffers_read=" << buffers_read << ")";
+    }
   };
 
   struct ReadCompleter {
     virtual void finish_single_request(
-      const hobject_t &hoid,
-      read_result_t &res,
-      std::list<ec_align_t> to_read,
-      std::set<int> wanted_to_read) = 0;
+        const hobject_t &hoid,
+        read_result_t &&res,
+        ECCommon::read_request_t &req) = 0;
 
     virtual void finish(int priority) && = 0;
 
@@ -131,26 +192,35 @@ struct ECCommon {
   };
 
   friend struct CallClientContexts;
+
   struct ClientAsyncReadStatus {
     unsigned objects_to_read;
-    GenContextURef<ec_extents_t &&> func;
+    GenContextURef<ec_extents_t&&> func;
     ec_extents_t results;
+
     explicit ClientAsyncReadStatus(
-      unsigned objects_to_read,
-      GenContextURef<ec_extents_t &&> &&func)
+        unsigned objects_to_read,
+        GenContextURef<ec_extents_t&&> &&func)
       : objects_to_read(objects_to_read), func(std::move(func)) {}
+
     void complete_object(
-      const hobject_t &hoid,
-      int err,
-      extent_map &&buffers) {
+        const hobject_t &hoid,
+        int err,
+        extent_map &&buffers,
+        ECUtil::shard_extent_map_t &&shard_extent_map) {
       ceph_assert(objects_to_read);
       --objects_to_read;
-      ceph_assert(!results.count(hoid));
-      results.emplace(hoid, ec_extent_t{err, std::move(buffers)});
+      ceph_assert(!results.contains(hoid));
+      results.emplace(hoid, ec_extent_t{
+                        err, std::move(buffers),
+                        std::move(shard_extent_map)
+                      });
     }
+
     bool is_complete() const {
       return objects_to_read == 0;
     }
+
     void run() {
       func.release()->complete(std::move(results));
     }
@@ -171,113 +241,125 @@ struct ECCommon {
 
     ZTracer::Trace trace;
 
-    std::map<hobject_t, std::set<int>> want_to_read;
     std::map<hobject_t, read_request_t> to_read;
     std::map<hobject_t, read_result_t> complete;
 
     std::map<hobject_t, std::set<pg_shard_t>> obj_to_source;
-    std::map<pg_shard_t, std::set<hobject_t> > source_to_obj;
+    std::map<pg_shard_t, std::set<hobject_t>> source_to_obj;
 
     void dump(ceph::Formatter *f) const;
 
     std::set<pg_shard_t> in_progress;
 
+    std::list<ECUtil::log_entry_t> debug_log;
+
     ReadOp(
-      int priority,
-      ceph_tid_t tid,
-      bool do_redundant_reads,
-      bool for_recovery,
-      std::unique_ptr<ReadCompleter> _on_complete,
-      OpRequestRef op,
-      std::map<hobject_t, std::set<int>> &&_want_to_read,
-      std::map<hobject_t, read_request_t> &&_to_read)
+        int priority,
+        ceph_tid_t tid,
+        bool do_redundant_reads,
+        bool for_recovery,
+        std::unique_ptr<ReadCompleter> _on_complete,
+        std::map<hobject_t, read_request_t> &&_to_read)
       : priority(priority),
         tid(tid),
-        op(op),
         do_redundant_reads(do_redundant_reads),
         for_recovery(for_recovery),
         on_complete(std::move(_on_complete)),
-        want_to_read(std::move(_want_to_read)),
-       to_read(std::move(_to_read)) {
-      for (auto &&hpair: to_read) {
-       auto &returned = complete[hpair.first].returned;
-       for (auto &&extent: hpair.second.to_read) {
-         returned.push_back(
-           boost::make_tuple(
-             extent.offset,
-             extent.size,
-             std::map<pg_shard_t, ceph::buffer::list>()));
-       }
-      }
-    }
+        to_read(std::move(_to_read)) {}
+
     ReadOp() = delete;
     ReadOp(const ReadOp &) = delete; // due to on_complete being unique_ptr
     ReadOp(ReadOp &&) = default;
+
+    void print(std::ostream &os) const {
+      os << "ReadOp(tid=" << tid;
+#ifndef WITH_CRIMSON
+      if (op && op->get_req()) {
+        os << ", op=";
+        op->get_req()->print(os);
+      }
+#endif
+      os << ", to_read=" << to_read << ", complete=" << complete
+          << ", priority=" << priority << ", obj_to_source=" << obj_to_source
+          << ", source_to_obj=" << source_to_obj << ", in_progress=" <<
+          in_progress
+          << ", debug_log=" << debug_log << ")";
+    }
   };
+
   struct ReadPipeline {
     void objects_read_and_reconstruct(
-      const std::map<hobject_t, std::list<ec_align_t>> &reads,
-      bool fast_read,
-      GenContextURef<ec_extents_t &&> &&func);
+        const std::map<hobject_t, std::list<ec_align_t>> &reads,
+        bool fast_read,
+        uint64_t object_size,
+        GenContextURef<ec_extents_t&&> &&func);
+
+    void objects_read_and_reconstruct_for_rmw(
+        std::map<hobject_t, read_request_t> &&to_read,
+        GenContextURef<ECCommon::ec_extents_t&&> &&func);
 
     template <class F, class G>
     void filter_read_op(
-      const OSDMapRef& osdmap,
-      ReadOp &op,
-      F&& on_erase,
-      G&& on_schedule_recovery);
+        const OSDMapRef &osdmap,
+        ReadOp &op,
+        F &&on_erase,
+        G &&on_schedule_recovery);
 
     template <class F, class G>
     void check_recovery_sources(
-      const OSDMapRef& osdmap,
-      F&& on_erase,
-      G&& on_schedule_recovery);
+        const OSDMapRef &osdmap,
+        F &&on_erase,
+        G &&on_schedule_recovery);
 
-    void complete_read_op(ReadOp &rop);
+    void complete_read_op(ReadOp &&rop);
 
     void start_read_op(
-      int priority,
-      std::map<hobject_t, std::set<int>> &want_to_read,
-      std::map<hobject_t, read_request_t> &to_read,
-      OpRequestRef op,
-      bool do_redundant_reads,
-      bool for_recovery,
-      std::unique_ptr<ReadCompleter> on_complete);
+        int priority,
+        std::map<hobject_t, read_request_t> &to_read,
+        bool do_redundant_reads,
+        bool for_recovery,
+        std::unique_ptr<ReadCompleter> on_complete);
 
     void do_read_op(ReadOp &rop);
 
     int send_all_remaining_reads(
-      const hobject_t &hoid,
-      ReadOp &rop);
+        const hobject_t &hoid,
+        ReadOp &rop);
 
     void on_change();
 
     void kick_reads();
 
     std::map<ceph_tid_t, ReadOp> tid_to_read_map;
-    std::map<pg_shard_t, std::set<ceph_tid_t> > shard_to_read_map;
+    std::map<pg_shard_t, std::set<ceph_tid_t>> shard_to_read_map;
     std::list<ClientAsyncReadStatus> in_progress_client_reads;
 
-    CephContextcct;
+    CephContext *cct;
     ceph::ErasureCodeInterfaceRef ec_impl;
-    const ECUtil::stripe_info_tsinfo;
+    const ECUtil::stripe_info_t &sinfo;
     // TODO: lay an interface down here
-    ECListenerparent;
+    ECListener *parent;
 
     ECListener *get_parent() const { return parent; }
-    const OSDMapRef& get_osdmap() const { return get_parent()->pgb_get_osdmap(); }
-    epoch_t get_osdmap_epoch() const { return get_parent()->pgb_get_osdmap_epoch(); }
-    const pg_info_t &get_info() { return get_parent()->get_info(); }
 
-    ReadPipeline(CephContext* cct,
-                ceph::ErasureCodeInterfaceRef ec_impl,
-                const ECUtil::stripe_info_t& sinfo,
-                ECListener* parent)
+    const OSDMapRef &get_osdmap() const {
+      return get_parent()->pgb_get_osdmap();
+    }
+
+    epoch_t get_osdmap_epoch() const {
+      return get_parent()->pgb_get_osdmap_epoch();
+    }
+
+    const pg_info_t &get_info() const { return get_parent()->get_info(); }
+
+    ReadPipeline(CephContext *cct,
+                 ceph::ErasureCodeInterfaceRef ec_impl,
+                 const ECUtil::stripe_info_t &sinfo,
+                 ECListener *parent)
       : cct(cct),
         ec_impl(std::move(ec_impl)),
         sinfo(sinfo),
-        parent(parent) {
-    }
+        parent(parent) {}
 
     /**
      * While get_want_to_read_shards creates a want_to_read based on the EC
@@ -289,47 +371,42 @@ struct ECCommon {
      *
      */
     void get_min_want_to_read_shards(
-      uint64_t offset,                         ///< [in]
-      uint64_t length,                         ///< [in]
-      std::set<int> *want_to_read               ///< [out]
-      );
-    static void get_min_want_to_read_shards(
-      const uint64_t offset,
-      const uint64_t length,
-      const ECUtil::stripe_info_t& sinfo,
-      std::set<int> *want_to_read);
+        const ec_align_t &to_read, ///< [in]
+        ECUtil::shard_extent_set_t &want_shard_reads); ///< [out]
 
     int get_remaining_shards(
-      const hobject_t &hoid,
-      const std::set<int> &avail,
-      const std::set<int> &want,
-      const read_result_t &result,
-      std::map<pg_shard_t, std::vector<std::pair<int, int>>> *to_read,
-      bool for_recovery);
+        const hobject_t &hoid,
+        read_result_t &read_result,
+        read_request_t &read_request,
+        bool for_recovery,
+        bool fast_read);
 
     void get_all_avail_shards(
-      const hobject_t &hoid,
-      const std::set<pg_shard_t> &error_shards,
-      std::set<int> &have,
-      std::map<shard_id_t, pg_shard_t> &shards,
-      bool for_recovery);
+        const hobject_t &hoid,
+        shard_id_set &have,
+        shard_id_map<pg_shard_t> &shards,
+        bool for_recovery,
+        const std::optional<std::set<pg_shard_t>> &error_shards = std::nullopt);
+
+    std::pair<const shard_id_set, const shard_id_set> get_readable_writable_shard_id_sets();
 
-    friend std::ostream &operator<<(std::ostream &lhs, const ReadOp &rhs);
     friend struct FinishReadOp;
 
-    void get_want_to_read_shards(std::set<int> *want_to_read) const;
+    void get_want_to_read_shards(
+        const std::list<ec_align_t> &to_read,
+        ECUtil::shard_extent_set_t &want_shard_reads);
 
     /// Returns to_read replicas sufficient to reconstruct want
     int get_min_avail_to_read_shards(
-      const hobject_t &hoid,     ///< [in] object
-      const std::set<int> &want,      ///< [in] desired shards
-      bool for_recovery,         ///< [in] true if we may use non-acting replicas
-      bool do_redundant_reads,   ///< [in] true if we want to issue redundant reads to reduce latency
-      std::map<pg_shard_t, std::vector<std::pair<int, int>>> *to_read   ///< [out] shards, corresponding subchunks to read
+        const hobject_t &hoid, ///< [in] object
+        bool for_recovery, ///< [in] true if we may use non-acting replicas
+        bool do_redundant_reads,
+        ///< [in] true if we want to issue redundant reads to reduce latency
+        read_request_t &read_request,
+        ///< [out] shard_reads, corresponding subchunks / other sub reads to read
+        const std::optional<std::set<pg_shard_t>> &error_shards = std::nullopt
+        ///< [in] Shards where reads have failed (optional)
       ); ///< @return error code, 0 on success
-
-    void schedule_recovery_work();
-
   };
 
   /**
@@ -346,7 +423,7 @@ struct ECCommon {
    * on the writing std::list.
    */
 
-  struct RMWPipeline {
+  struct RMWPipeline : ECExtentCache::BackendReadListener {
     struct Op : boost::intrusive::list_base_hook<> {
       /// From submit_transaction caller, describes operation
       hobject_t hoid;
@@ -374,171 +451,176 @@ struct ECCommon {
       /// Ancillary also provided from submit_transaction caller
       std::map<hobject_t, ObjectContextRef> obc_map;
 
-      /// see call_write_ordered
-      std::list<std::function<void(void)> > on_write;
-
       /// Generated internally
       std::set<hobject_t> temp_added;
       std::set<hobject_t> temp_cleared;
 
       ECTransaction::WritePlan plan;
-      bool requires_rmw() const { return !plan.to_read.empty(); }
-      bool invalidates_cache() const { return plan.invalidates_cache; }
+      bool requires_rmw() const { return !plan.want_read; }
 
       // must be true if requires_rmw(), must be false if invalidates_cache()
       bool using_cache = true;
 
       /// In progress read state;
-      std::map<hobject_t,extent_set> pending_read; // subset already being read
-      std::map<hobject_t,extent_set> remote_read;  // subset we must read
-      std::map<hobject_t,extent_map> remote_read_result;
-      bool read_in_progress() const {
-        return !remote_read.empty() && remote_read_result.empty();
-      }
+      int pending_cache_ops = 0;
+      std::map<hobject_t, ECUtil::shard_extent_map_t> remote_shard_extent_map;
 
       /// In progress write state.
-      std::set<pg_shard_t> pending_commit;
-      // we need pending_apply for pre-mimic peers so that we don't issue a
-      // read on a remote shard before it has applied a previous write.  We can
-      // remove this after nautilus.
-      std::set<pg_shard_t> pending_apply;
+      int pending_commits = 0;
+
       bool write_in_progress() const {
-        return !pending_commit.empty() || !pending_apply.empty();
+        return pending_commits != 0;
       }
 
       /// optional, may be null, for tracking purposes
       OpRequestRef client_op;
 
       /// pin for cache
-      ExtentCache::write_pin pin;
+      std::list<ECExtentCache::OpRef> cache_ops;
+      RMWPipeline *pipeline;
+
+      Op() : tid(), plan(), pipeline(nullptr) {}
 
       /// Callbacks
       Context *on_all_commit = nullptr;
+
       virtual ~Op() {
         delete on_all_commit;
       }
 
       virtual void generate_transactions(
-        ceph::ErasureCodeInterfaceRef &ecimpl,
-        pg_t pgid,
-        const ECUtil::stripe_info_t &sinfo,
-        std::map<hobject_t,extent_map> *written,
-        std::map<shard_id_t, ceph::os::Transaction> *transactions,
-        DoutPrefixProvider *dpp,
-        const ceph_release_t require_osd_release = ceph_release_t::unknown) = 0;
-    };
-    using OpRef = std::unique_ptr<Op>;
-    using op_list = boost::intrusive::list<Op>;
-    friend std::ostream &operator<<(std::ostream &lhs, const Op &rhs);
+          ceph::ErasureCodeInterfaceRef &ec_impl,
+          pg_t pgid,
+          const ECUtil::stripe_info_t &sinfo,
+          std::map<hobject_t, ECUtil::shard_extent_map_t> *written,
+          shard_id_map<ceph::os::Transaction> *transactions,
+          DoutPrefixProvider *dpp,
+          const OSDMapRef &osdmap) = 0;
+
+      virtual bool skip_transaction(
+          std::set<shard_id_t> &pending_roll_forward,
+          shard_id_t shard,
+          ceph::os::Transaction &transaction) = 0;
+
+      void cache_ready(const hobject_t &oid, const ECUtil::shard_extent_map_t &result) {
+        if (!result.empty()) {
+          remote_shard_extent_map.insert(std::pair(oid, result));
+        }
 
-    ExtentCache cache;
-    std::map<ceph_tid_t, OpRef> tid_to_op_map; /// Owns Op structure
-    /**
-     * We model the possible rmw states as a std::set of waitlists.
-     * All writes at this time complete in order, so a write blocked
-     * at waiting_state blocks all writes behind it as well (same for
-     * other states).
-     *
-     * Future work: We can break this up into a per-object pipeline
-     * (almost).  First, provide an ordering token to submit_transaction
-     * and require that all operations within a single transaction take
-     * place on a subset of hobject_t space partitioned by that token
-     * (the hashid seem about right to me -- even works for temp objects
-     * if you recall that a temp object created for object head foo will
-     * only ever be referenced by other transactions on foo and aren't
-     * reused).  Next, factor this part into a class and maintain one per
-     * ordering token.  Next, fixup PrimaryLogPG's repop queue to be
-     * partitioned by ordering token.  Finally, refactor the op pipeline
-     * so that the log entries passed into submit_transaction aren't
-     * versioned.  We can't assign versions to them until we actually
-     * submit the operation.  That's probably going to be the hard part.
-     */
-    class pipeline_state_t {
-      enum {
-        CACHE_VALID = 0,
-        CACHE_INVALID = 1
-      } pipeline_state = CACHE_VALID;
-    public:
-      bool caching_enabled() const {
-        return pipeline_state == CACHE_VALID;
-      }
-      bool cache_invalid() const {
-        return !caching_enabled();
-      }
-      void invalidate() {
-        pipeline_state = CACHE_INVALID;
+        if (!--pending_cache_ops) {
+          pipeline->cache_ready(*this);
+        }
       }
-      void clear() {
-        pipeline_state = CACHE_VALID;
+
+      void print(std::ostream &os) const {
+        os << "Op(" << hoid << " v=" << version << " tt=" << trim_to
+            << " tid=" << tid << " reqid=" << reqid;
+#ifndef WITH_CRIMSON
+        if (client_op && client_op->get_req()) {
+          os << " client_op=";
+          client_op->get_req()->print(os);
+        }
+#endif
+        os << " pg_committed_to=" << pg_committed_to
+            << " temp_added=" << temp_added
+            << " temp_cleared=" << temp_cleared
+            << " remote_read_result=" << remote_shard_extent_map
+            << " pending_commits=" << pending_commits
+            << " plan.to_read=" << plan
+            << ")";
       }
-      friend std::ostream &operator<<(std::ostream &lhs, const pipeline_state_t &rhs);
-    } pipeline_state;
+    };
+
+    void backend_read(hobject_t oid, ECUtil::shard_extent_set_t const &request,
+                      uint64_t object_size) override {
+      std::map<hobject_t, read_request_t> to_read;
+      to_read.emplace(oid, read_request_t(request, false, object_size));
+
+      objects_read_async_no_cache(
+        std::move(to_read),
+        [this](ec_extents_t &&results) {
+          for (auto &&[oid, result]: results) {
+            extent_cache.read_done(oid, std::move(result.shard_extent_map));
+          }
+        });
+    }
 
-    op_list waiting_state;        /// writes waiting on pipe_state
-    op_list waiting_reads;        /// writes waiting on partial stripe reads
-    op_list waiting_commit;       /// writes waiting on initial commit
+    using OpRef = std::shared_ptr<Op>;
+
+    std::map<ceph_tid_t, OpRef> tid_to_op_map; /// Owns Op structure
+    std::map<hobject_t, eversion_t> oid_to_version;
+
+    std::list<OpRef> waiting_commit;
     eversion_t completed_to;
     eversion_t committed_to;
     void start_rmw(OpRef op);
-    bool try_state_to_reads();
-    bool try_reads_to_commit();
-    bool try_finish_rmw();
-    void check_ops();
+    void cache_ready(Op &op);
+    void try_finish_rmw();
+    void finish_rmw(OpRef const &op);
 
     void on_change();
+    void on_change2();
     void call_write_ordered(std::function<void(void)> &&cb);
 
-    CephContextcct;
+    CephContext *cct;
     ECListener *get_parent() const { return parent; }
-    const OSDMapRef& get_osdmap() const { return get_parent()->pgb_get_osdmap(); }
-    epoch_t get_osdmap_epoch() const { return get_parent()->pgb_get_osdmap_epoch(); }
-    const pg_info_t &get_info() { return get_parent()->get_info(); }
+
+    const OSDMapRef &get_osdmap() const {
+      return get_parent()->pgb_get_osdmap();
+    }
+
+    epoch_t get_osdmap_epoch() const {
+      return get_parent()->pgb_get_osdmap_epoch();
+    }
+
+    const pg_info_t &get_info() const { return get_parent()->get_info(); }
 
     template <typename Func>
     void objects_read_async_no_cache(
-      const std::map<hobject_t,extent_set> &to_read,
-      Func &&on_complete
-    ) {
-      std::map<hobject_t, std::list<ec_align_t>> _to_read;
-      for (auto &&hpair: to_read) {
-        auto &l = _to_read[hpair.first];
-        for (auto extent: hpair.second) {
-          l.emplace_back(ec_align_t{extent.first, extent.second, 0});
-        }
-      }
-      ec_backend.objects_read_and_reconstruct(
-        _to_read,
-        false,
+        std::map<hobject_t, read_request_t> &&to_read,
+        Func &&on_complete) {
+      ec_backend.objects_read_and_reconstruct_for_rmw(
+        std::move(to_read),
         make_gen_lambda_context<
-        ECCommon::ec_extents_t &&, Func>(
-            std::forward<Func>(on_complete)));
+          ECCommon::ec_extents_t&&, Func>(
+          std::forward<Func>(on_complete)));
     }
+
     void handle_sub_write(
-      pg_shard_t from,
-      OpRequestRef msg,
-      ECSubWrite &op,
-      const ZTracer::Trace &trace
-    ) {
-      ec_backend.handle_sub_write(from, std::move(msg), op, trace, *get_parent());
+        pg_shard_t from,
+        OpRequestRef msg,
+        ECSubWrite &op,
+        const ZTracer::Trace &trace) const {
+      ec_backend.handle_sub_write(from, std::move(msg), op, trace,
+                                  *get_parent());
     }
+
     // end of iface
 
+    // Set of shards that will need a dummy transaction for the final
+    // roll forward
+    std::set<shard_id_t> pending_roll_forward;
+
     ceph::ErasureCodeInterfaceRef ec_impl;
-    const ECUtil::stripe_info_t& sinfo;
-    ECListener* parent;
-    ECCommon& ec_backend;
+    const ECUtil::stripe_info_t &sinfo;
+    ECListener *parent;
+    ECCommon &ec_backend;
+    ECExtentCache extent_cache;
+    uint64_t ec_pdw_write_mode;
 
-    RMWPipeline(CephContextcct,
+    RMWPipeline(CephContext *cct,
                 ceph::ErasureCodeInterfaceRef ec_impl,
-                const ECUtil::stripe_info_t& sinfo,
-                ECListener* parent,
-                ECCommon& ec_backend)
+                const ECUtil::stripe_info_t &sinfo,
+                ECListener *parent,
+                ECCommon &ec_backend,
+                ECExtentCache::LRU &ec_extent_cache_lru)
       : cct(cct),
         ec_impl(std::move(ec_impl)),
         sinfo(sinfo),
         parent(parent),
-        ec_backend(ec_backend) {
-    }
+        ec_backend(ec_backend),
+        extent_cache(*this, ec_extent_cache_lru, sinfo, cct),
+        ec_pdw_write_mode(cct->_conf.get_val<uint64_t>("ec_pdw_write_mode")) {}
   };
 
   class UnstableHashInfoRegistry {
@@ -547,47 +629,35 @@ struct ECCommon {
     /// If modified, ensure that the ref is held until the update is applied
     SharedPtrRegistry<hobject_t, ECUtil::HashInfo> registry;
 
-  public:
+   public:
     UnstableHashInfoRegistry(
-      CephContext *cct,
-      ceph::ErasureCodeInterfaceRef ec_impl)
+        CephContext *cct,
+        ceph::ErasureCodeInterfaceRef ec_impl)
       : cct(cct),
-       ec_impl(std::move(ec_impl)) {}
+        ec_impl(std::move(ec_impl)) {}
 
     ECUtil::HashInfoRef maybe_put_hash_info(
-      const hobject_t &hoid,
-      ECUtil::HashInfo &&hinfo);
+        const hobject_t &hoid,
+        ECUtil::HashInfo &&hinfo);
 
     ECUtil::HashInfoRef get_hash_info(
-      const hobject_t &hoid,
-      bool create,
-      const std::map<std::string, ceph::buffer::list, std::less<>>& attr,
-      uint64_t size);
+        const hobject_t &hoid,
+        bool create,
+        const std::map<std::string, ceph::buffer::list, std::less<>> &attrs,
+        uint64_t size);
   };
 };
 
-std::ostream &operator<<(std::ostream &lhs,
-                        const ECCommon::RMWPipeline::pipeline_state_t &rhs);
-std::ostream &operator<<(std::ostream &lhs,
-                        const ECCommon::read_request_t &rhs);
-std::ostream &operator<<(std::ostream &lhs,
-                        const ECCommon::read_result_t &rhs);
-std::ostream &operator<<(std::ostream &lhs,
-                        const ECCommon::ReadOp &rhs);
-std::ostream &operator<<(std::ostream &lhs,
-                        const ECCommon::RMWPipeline::Op &rhs);
-
 template <class F, class G>
 void ECCommon::ReadPipeline::check_recovery_sources(
-  const OSDMapRef& osdmap,
-  F&& on_erase,
-  G&& on_schedule_recovery)
-{
+    const OSDMapRef &osdmap,
+    F &&on_erase,
+    G &&on_schedule_recovery
+  ) {
   std::set<ceph_tid_t> tids_to_filter;
-  for (std::map<pg_shard_t, std::set<ceph_tid_t> >::iterator 
+  for (std::map<pg_shard_t, std::set<ceph_tid_t>>::iterator
        i = shard_to_read_map.begin();
-       i != shard_to_read_map.end();
-       ) {
+       i != shard_to_read_map.end();) {
     if (osdmap->is_down(i->first.osd)) {
       tids_to_filter.insert(i->second.begin(), i->second.end());
       shard_to_read_map.erase(i++);
@@ -606,53 +676,45 @@ void ECCommon::ReadPipeline::check_recovery_sources(
 
 template <class F, class G>
 void ECCommon::ReadPipeline::filter_read_op(
-  const OSDMapRef& osdmap,
-  ReadOp &op,
-  F&& on_erase,
-  G&& on_schedule_recovery)
-{
+    const OSDMapRef &osdmap,
+    ReadOp &op,
+    F &&on_erase,
+    G &&on_schedule_recovery
+  ) {
   std::set<hobject_t> to_cancel;
-  for (std::map<pg_shard_t, std::set<hobject_t> >::iterator i = op.source_to_obj.begin();
-       i != op.source_to_obj.end();
-       ++i) {
-    if (osdmap->is_down(i->first.osd)) {
-      to_cancel.insert(i->second.begin(), i->second.end());
-      op.in_progress.erase(i->first);
-      continue;
+  for (auto &&[pg_shard, hoid_set] : op.source_to_obj) {
+    if (osdmap->is_down(pg_shard.osd)) {
+      to_cancel.insert(hoid_set.begin(), hoid_set.end());
+      op.in_progress.erase(pg_shard);
     }
   }
 
   if (to_cancel.empty())
     return;
 
-  for (std::map<pg_shard_t, std::set<hobject_t> >::iterator i = op.source_to_obj.begin();
-       i != op.source_to_obj.end();
-       ) {
-    for (std::set<hobject_t>::iterator j = i->second.begin();
-        j != i->second.end();
-        ) {
-      if (to_cancel.count(*j))
-       i->second.erase(j++);
-      else
-       ++j;
+  for (auto iter = op.source_to_obj.begin();
+       iter != op.source_to_obj.end();) {
+    auto &[pg_shard, hoid_set] = *iter;
+    for (auto &hoid : hoid_set) {
+      if (to_cancel.contains(hoid)) {
+        hoid_set.erase(hoid);
+      }
     }
-    if (i->second.empty()) {
-      op.source_to_obj.erase(i++);
+    if (hoid_set.empty()) {
+      op.source_to_obj.erase(iter++);
     } else {
-      ceph_assert(!osdmap->is_down(i->first.osd));
-      ++i;
+      ceph_assert(!osdmap->is_down(pg_shard.osd));
+      ++iter;
     }
   }
 
-  for (std::set<hobject_t>::iterator i = to_cancel.begin();
-       i != to_cancel.end();
-       ++i) {
-    get_parent()->cancel_pull(*i);
+  for (auto hoid : to_cancel) {
+    get_parent()->cancel_pull(hoid);
 
-    ceph_assert(op.to_read.count(*i));
-    op.to_read.erase(*i);
-    op.complete.erase(*i);
-    on_erase(*i);
+    ceph_assert(op.to_read.contains(hoid));
+    op.to_read.erase(hoid);
+    op.complete.erase(hoid);
+    on_erase(hoid);
   }
 
   if (op.in_progress.empty()) {
@@ -675,8 +737,14 @@ void ECCommon::ReadPipeline::filter_read_op(
   }
 }
 
-template <> struct fmt::formatter<ECCommon::RMWPipeline::pipeline_state_t> : fmt::ostream_formatter {};
-template <> struct fmt::formatter<ECCommon::read_request_t> : fmt::ostream_formatter {};
-template <> struct fmt::formatter<ECCommon::read_result_t> : fmt::ostream_formatter {};
-template <> struct fmt::formatter<ECCommon::ReadOp> : fmt::ostream_formatter {};
-template <> struct fmt::formatter<ECCommon::RMWPipeline::Op> : fmt::ostream_formatter {};
\ No newline at end of file
+template <>
+struct fmt::formatter<ECCommon::read_request_t> : fmt::ostream_formatter {};
+
+template <>
+struct fmt::formatter<ECCommon::read_result_t> : fmt::ostream_formatter {};
+
+template <>
+struct fmt::formatter<ECCommon::ReadOp> : fmt::ostream_formatter {};
+
+template <>
+struct fmt::formatter<ECCommon::RMWPipeline::Op> : fmt::ostream_formatter {};
diff --git a/src/osd/ECExtentCache.cc b/src/osd/ECExtentCache.cc
new file mode 100644 (file)
index 0000000..9ee94c2
--- /dev/null
@@ -0,0 +1,480 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ECExtentCache.h"
+#include "ECUtil.h"
+
+#include <ranges>
+
+using namespace std;
+using namespace ECUtil;
+
+void ECExtentCache::Object::request(OpRef &op) {
+  /* After a cache invalidation, we allow through a single cache-invalidating
+   * IO.
+   */
+  if (op->invalidates_cache) {
+    if (cache_invalidated) {
+      op->invalidates_cache = false;
+    } else {
+      cache_invalidate_expected = true;
+    }
+  }
+  cache_invalidated = false;
+
+  extent_set eset = op->get_pin_eset(line_size);
+
+  for (auto &&[start, len] : eset) {
+    for (uint64_t to_pin = start; to_pin < start + len; to_pin += line_size) {
+      LineRef l;
+      if (!lines.contains(to_pin)) {
+        l = make_shared<Line>(*this, to_pin);
+        if (!l->cache->empty()) {
+          l->cache->to_shard_extent_set(do_not_read);
+        }
+        lines.emplace(to_pin, weak_ptr(l));
+      } else {
+        l = lines.at(to_pin).lock();
+      }
+      op->lines.emplace_back(l);
+    }
+  }
+
+  bool read_required = false;
+
+  /* Deal with reads if there are any.
+   * If any cache invalidation ops have been added, there is no point adding any
+   * reads as they are all going to be thrown away before any of the
+   * post-invalidate ops are honoured.
+   */
+  if (op->reads && !cache_invalidate_expected) {
+    for (auto &&[shard, eset] : *(op->reads)) {
+      extent_set request = eset;
+      if (do_not_read.contains(shard)) {
+        request.subtract(do_not_read.at(shard));
+      }
+
+      if (!request.empty()) {
+        requesting[shard].union_of(request);
+        read_required = true;
+        requesting_ops.emplace_back(op);
+      }
+    }
+  }
+
+  /* Calculate the ranges of the object which no longer need to be read. This
+   * will include:
+   *  - Any reads being issued by this IO.
+   *  - Any writes being issued (these will be cached).
+   *  - Any unwritten regions in an append - these can be assumed to be zero.
+   */
+  if (read_required) {
+    do_not_read.insert(requesting);
+  }
+  do_not_read.insert(op->writes);
+  if (op->projected_size > projected_size) {
+    /* This write is growing the size of the object. This essentially counts
+     * as a write (although the cache will not get populated). Future reads
+     * to this area will be skipped, but this makes them essentially zero
+     * reads.
+     */
+    shard_extent_set_t obj_hole(pg.sinfo.get_k_plus_m());
+    shard_extent_set_t read_mask(pg.sinfo.get_k_plus_m());
+
+    pg.sinfo.ro_size_to_read_mask(op->projected_size, obj_hole);
+    pg.sinfo.ro_size_to_read_mask(projected_size, read_mask);
+    obj_hole.subtract(read_mask);
+    do_not_read.insert(obj_hole);
+  } else if (op->projected_size < projected_size) {
+    // Invalidate the object's cache when we see any object reduce in size.
+    op->invalidates_cache = true;
+  }
+
+  projected_size = op->projected_size;
+
+  if (read_required) send_reads();
+  else op->read_done = true;
+}
+
+void ECExtentCache::Object::send_reads() {
+  if (reading || requesting.empty())
+    return; // Read busy
+
+  reading_ops.swap(requesting_ops);
+  pg.backend_read.backend_read(oid, requesting, current_size);
+  requesting.clear();
+  reading = true;
+}
+
+void ECExtentCache::Object::read_done(shard_extent_map_t const &buffers) {
+  reading = false;
+  for (auto &&op : reading_ops) {
+    op->read_done = true;
+  }
+  reading_ops.clear();
+  insert(buffers);
+}
+
+uint64_t ECExtentCache::Object::line_align(uint64_t x) const {
+  return x - (x % line_size);
+}
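+
+// For example, with line_size = 32768, line_align(70000) == 65536, the start
+// of the cache line containing offset 70000.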
+
+void ECExtentCache::Object::insert(shard_extent_map_t const &buffers) const {
+  if (buffers.empty()) return;
+
+  /* The following gets quite inefficient for writes which write to the start
+   * and the end of a very large object, since we iterate over the middle.
+   * This seems like a strange use case, so currently this is not being
+   * optimised.
+   */
+  for (uint64_t slice_start = line_align(buffers.get_start_offset());
+       slice_start < buffers.get_end_offset();
+       slice_start += line_size) {
+    shard_extent_map_t slice = buffers.slice_map(slice_start, line_size);
+    if (!slice.empty()) {
+      LineRef l = lines.at(slice_start).lock();
+      /* The line should have been created already! */
+      l->cache->insert(slice);
+      uint64_t old_size = l->size;
+      l->size = l->cache->size();
+      ceph_assert(l->size >= old_size);
+      update_mempool(0, l->size - old_size);
+    }
+  }
+}
+
+void ECExtentCache::Object::write_done(shard_extent_map_t const &buffers,
+                                       uint64_t new_size) {
+  insert(buffers);
+  current_size = new_size;
+}
+
+void ECExtentCache::Object::unpin(Op &op) const {
+  op.lines.clear();
+  delete_maybe();
+}
+
+void ECExtentCache::Object::delete_maybe() const {
+  if (lines.empty() && active_ios == 0) {
+    pg.objects.erase(oid);
+  }
+}
+
+void check_seset_empty_for_range(shard_extent_set_t s, uint64_t off,
+                                 uint64_t len) {
+  for (auto &[shard, eset] : s) {
+    ceph_assert(!eset.intersects(off, len));
+  }
+}
+
+void ECExtentCache::Object::erase_line(uint64_t offset) {
+  check_seset_empty_for_range(requesting, offset, line_size);
+  do_not_read.erase_stripe(offset, line_size);
+  lines.erase(offset);
+  delete_maybe();
+}
+
+void ECExtentCache::Object::invalidate(const OpRef &invalidating_op) {
+  for (auto &l : std::views::values(lines)) {
+    auto line = l.lock();
+    line->cache->clear();
+    update_mempool(0, -line->size);
+    line->size = 0;
+  }
+
+  /* Remove all entries from the LRU */
+  pg.lru.remove_object(oid);
+
+  ceph_assert(!reading);
+  do_not_read.clear();
+  requesting.clear();
+  requesting_ops.clear();
+  reading_ops.clear();
+
+  /* Current size should reflect the actual size of the object, which was set
+   * by the previous write. We are going to replay all the writes now, so set
+   * the projected size to that of this op.
+   */
+  projected_size = invalidating_op->projected_size;
+
+  // The op has just invalidated the cache, so it must not do so again on replay.
+  invalidating_op->invalidates_cache = false;
+
+  cache_invalidated = true;
+  cache_invalidate_expected = false;
+
+  /* We now need to replay all outstanding ops, so as to regenerate the reads. */
+  for (auto &op : pg.waiting_ops) {
+    if (op->object.oid == oid) {
+      op->read_done = false;
+      request(op);
+    }
+  }
+}
+
+void ECExtentCache::cache_maybe_ready() {
+  while (!waiting_ops.empty()) {
+    OpRef op = waiting_ops.front();
+    if (op->invalidates_cache) {
+      /* We must wait for any outstanding reads to complete. The cache replans
+       * all reads as part of invalidate. If an in-flight read completes after
+       * the invalidate, it could repopulate the cache with stale data, leading
+       * to data corruption at the host.
+       */
+      if (op->object.reading) {
+        return;
+      }
+      op->object.invalidate(op);
+      ceph_assert(!op->invalidates_cache);
+    }
+    /* If complete_if_reads_cached finds all reads complete, it will call the
+     * completion callback. Typically, this will cause the client to execute
+     * the transaction and pop the front of waiting_ops. So we abort if either
+     * the reads are not ready, or the client chooses not to complete the op.
+     */
+    if (!op->complete_if_reads_cached(op)) {
+      return;
+    }
+
+    waiting_ops.pop_front();
+  }
+}
+
+ECExtentCache::OpRef ECExtentCache::prepare(GenContextURef<OpRef&> &&ctx,
+                                            hobject_t const &oid,
+                                            std::optional<shard_extent_set_t>
+                                            const &to_read,
+                                            shard_extent_set_t const &write,
+                                            uint64_t orig_size,
+                                            uint64_t projected_size,
+                                            bool invalidates_cache) {
+
+  auto object_iter = objects.find(oid);
+  if (object_iter == objects.end()) {
+    auto p = objects.emplace(oid, Object(*this, oid, orig_size));
+    object_iter = p.first;
+  }
+  OpRef op = std::make_shared<Op>(
+    std::move(ctx), object_iter->second, to_read, write, projected_size,
+    invalidates_cache);
+
+  return op;
+}
+
+void ECExtentCache::read_done(hobject_t const &oid,
+                              shard_extent_map_t const &update) {
+  objects.at(oid).read_done(update);
+  cache_maybe_ready();
+  objects.at(oid).send_reads();
+}
+
+void ECExtentCache::write_done(OpRef const &op,
+                               shard_extent_map_t const &update) {
+  op->write_done(std::move(update));
+}
+
+uint64_t ECExtentCache::get_projected_size(hobject_t const &oid) const {
+  return objects.at(oid).get_projected_size();
+}
+
+bool ECExtentCache::contains_object(hobject_t const &oid) const {
+  return objects.contains(oid);
+}
+
+ECExtentCache::Op::~Op() {
+  ceph_assert(object.active_ios > 0);
+  object.active_ios--;
+  ceph_assert(object.pg.active_ios > 0);
+  object.pg.active_ios--;
+
+  object.unpin(*this);
+}
+
+/* ECExtentCache cleanup occurs in two parts. The first performs cleanup
+ * of the ops currently managed by the extent cache. At this point, however,
+ * the cache may still be waiting for other parts of EC to clean up (for
+ * example, any outstanding reads). on_change2() executes once all of this
+ * cleanup has occurred.
+ */
+void ECExtentCache::on_change() {
+  for (auto &&o : std::views::values(objects)) {
+    o.reading_ops.clear();
+    o.requesting_ops.clear();
+    o.requesting.clear();
+  }
+  for (auto &&op : waiting_ops) {
+    op->cancel();
+  }
+  waiting_ops.clear();
+}
+
+/* This must be run toward the end of EC on_change handling. It asserts that
+ * all objects, which automatically self-destruct when idle, have done so.
+ * Additionally, it discards the entire LRU cache. This must be done after all
+ * in-flight reads/writes have completed, or we risk attempting to insert data
+ * into the cache after it has been cleared.
+ *
+ * Note that the LRU will end up being updated multiple times. With some
+ * additional code complexity this could be fixed for a small (probably
+ * insignificant) performance improvement.
+ */
+void ECExtentCache::on_change2() const {
+  lru.discard();
+  /* If this assert fires in a unit test, make sure that all ops have completed
+   * and cleared any extent cache ops they contain */
+  ceph_assert(objects.empty());
+  ceph_assert(active_ios == 0);
+  ceph_assert(idle());
+}
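+
+/* Expected calling sequence on an interval change (a sketch based on the
+ * comments above; the caller name is illustrative, not part of this change):
+ *
+ *   extent_cache.on_change();   // cancel and discard queued cache ops
+ *   // ... wait for in-flight backend reads/writes to drain ...
+ *   extent_cache.on_change2();  // discard the LRU, assert the cache is idle
+ */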
+
+void ECExtentCache::execute(list<OpRef> &op_list) {
+  for (auto &op : op_list) {
+    op->object.request(op);
+  }
+  waiting_ops.insert(waiting_ops.end(), op_list.begin(), op_list.end());
+  counter++;
+  cache_maybe_ready();
+}
+
+bool ECExtentCache::idle() const {
+  return active_ios == 0;
+}
+
+uint32_t ECExtentCache::get_and_reset_counter() {
+  uint32_t ret = counter;
+  counter = 0;
+  return ret;
+}
+
+list<ECExtentCache::LRU::Key>::iterator ECExtentCache::LRU::erase(
+    const list<Key>::iterator &it,
+    bool do_update_mempool) {
+  uint64_t size_change = map.at(*it).second->size();
+  if (do_update_mempool) {
+    update_mempool(-1, 0 - size_change);
+  }
+  size -= size_change;
+  map.erase(*it);
+  return lru.erase(it);
+}
+
+void ECExtentCache::LRU::add(const Line &line) {
+  if (line.size == 0) {
+    update_mempool(-1, 0);
+    return;
+  }
+
+  const Key k(line.offset, line.object.oid);
+
+  shared_ptr<shard_extent_map_t> cache = line.cache;
+
+  mutex.lock();
+  ceph_assert(!map.contains(k));
+  auto i = lru.insert(lru.end(), k);
+  auto j = make_pair(std::move(i), std::move(cache));
+  map.insert(std::pair(std::move(k), std::move(j)));
+  size += line.size; // This is already accounted for in mempool.
+  free_maybe();
+  mutex.unlock();
+}
+
+shared_ptr<shard_extent_map_t> ECExtentCache::LRU::find(
+    const hobject_t &oid, uint64_t offset) {
+  Key k(offset, oid);
+  shared_ptr<shard_extent_map_t> cache = nullptr;
+  mutex.lock();
+  if (map.contains(k)) {
+    auto &&[lru_iter, c] = map.at(k);
+    cache = c;
+    auto it = lru_iter; // Intentional copy.
+    erase(it, false);
+  }
+  mutex.unlock();
+  return cache;
+}
+
+void ECExtentCache::LRU::remove_object(const hobject_t &oid) {
+  mutex.lock();
+  for (auto it = lru.begin(); it != lru.end();) {
+    if (it->oid == oid) {
+      it = erase(it, true);
+    } else {
+      ++it;
+    }
+  }
+  mutex.unlock();
+}
+
+void ECExtentCache::LRU::free_maybe() {
+  while (max_size < size) {
+    auto it = lru.begin();
+    erase(it, true);
+  }
+}
+
+void ECExtentCache::LRU::discard() {
+  mutex.lock();
+  lru.clear();
+  update_mempool(0 - map.size(), 0 - size);
+  map.clear();
+  size = 0;
+  mutex.unlock();
+}
+
+const extent_set ECExtentCache::Op::get_pin_eset(uint64_t alignment) const {
+  extent_set eset = writes.get_extent_superset();
+  if (reads) {
+    reads->get_extent_superset(eset);
+  }
+  eset.align(alignment);
+
+  return eset;
+}
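+
+/* For example, with line_size = 32768, a write touching [40960, 49152) plus a
+ * read of [0, 4096) pins the lines starting at offsets 0 and 32768: the
+ * combined eset is aligned outward to line boundaries (illustrative values).
+ */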
+
+ECExtentCache::Op::Op(GenContextURef<OpRef&> &&cache_ready_cb,
+                      Object &object,
+                      std::optional<shard_extent_set_t> const &to_read,
+                      shard_extent_set_t const &write,
+                      uint64_t projected_size,
+                      bool invalidates_cache) :
+  object(object),
+  reads(to_read),
+  writes(write),
+  result(&object.pg.sinfo),
+  invalidates_cache(invalidates_cache),
+  projected_size(projected_size),
+  cache_ready_cb(std::move(cache_ready_cb)) {
+  object.active_ios++;
+  object.pg.active_ios++;
+}
+
+shard_extent_map_t ECExtentCache::Object::get_cache(
+    std::optional<shard_extent_set_t> const &set) const {
+  if (!set) {
+    return shard_extent_map_t(&pg.sinfo);
+  }
+
+  shard_id_map<extent_map> res(pg.sinfo.get_k_plus_m());
+  for (auto &&[shard, eset] : *set) {
+    for (auto [off, len] : eset) {
+      for (uint64_t slice_start = line_align(off);
+           slice_start < off + len;
+           slice_start += line_size) {
+        uint64_t offset = max(slice_start, off);
+        uint64_t length = min(slice_start + line_size, off + len) - offset;
+        // This line must exist, as it was created when the op was created.
+        LineRef l = lines.at(slice_start).lock();
+        if (l->cache->contains_shard(shard)) {
+          extent_map m = l->cache->get_extent_map(shard).intersect(
+            offset, length);
+          if (!m.empty()) {
+            if (!res.contains(shard)) res.emplace(shard, std::move(m));
+            else res.at(shard).insert(m);
+          }
+        }
+      }
+    }
+  }
+  return shard_extent_map_t(&pg.sinfo, std::move(res));
+}
index b02afec4a114cd45cf73bba8b4d940dd9bd90ffc..b4e06d6fddfd0bfb48a64c98b8e822f993795bfa 100644 (file)
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/* EC "extent" cache.  This extent cache attempts to improve performance,
+ * particularly for small sequential writes, by caching the results of recent
+ * reads and writes.
+ *
+ * The cache has two parts: The main cache which is active while an IO is
+ * outstanding to an object and an "LRU" which stashes recent IO according to
+ * a least-recently-used scheme.
+ *
+ * The cache is indexed by (shard, shard_offset). That is, it independently
+ * tracks a cache for each shard of the EC. It will keep a cache even for
+ * shards which are currently offline or missing, since the cache is formed
+ * from the result of reads and writes, which are required to always
+ * calculate missing shards.
+ *
+ * The cache allows for a single read to be outstanding per PG at a time. If
+ * multiple writes are received while a read is active, the next read will
+ * contain all necessary reads, so as to catch up. Early on in development, a
+ * more parallel read mechanism was explored but was found to have no benefit.
+ *
+ * This cache will never re-order IO.
+ *
+ * The LRU
+ *
+ * The LRU is a per-OSD-shard structure (not to be confused with an EC shard).
+ * Since the OSD-shard can have multiple threads, the LRU must have a mutex.
+ * This should not be required for crimson-based pools, since each osd shard
+ * has a single reactor. Some effort has been made to limit the frequency that
+ * this mutex is taken.
+ *
+ * The LRU has a maximum size (defined in the constructor) and will keep its
+ * usage below this amount.
+ *
+ * Cache Lines
+ *
+ * The LRU tracks extents of recent writes with cache Lines. These are
+ * simple-to-track ranges of offsets across all shards. Each line represents
+ * at least 32K of address space on each shard (more if the chunk size
+ * exceeds 32K).
+ *
+ * A cache line can be owned by:
+ * - No-one (i.e. it is not instantiated)
+ * - Object - an IO is inflight for this cache line
+ * - LRU - A recent IO touched this cache line.
+ *
+ * This simple ownership model means that the locking required for the LRU does
+ * not leak out into the wider extent cache and allows for the entire cache
+ * to be built from reference-counters.
+ *
+ * Client API
+ *
+ * The client has a number of required interactions:
+ * 1. prepare(...). This creates a cache op. All cache ops required for a single
+ *                  parent op must be prepared before any are executed.
+ * 2. execute(...). Execute an IO. This gives the cache permission to perform
+ *                  the IO. This function can (and frequently does) call back
+ *                  re-entrantly, so the caller must be aware that this can
+ *                  happen.
+ *
+ * The client must provide a mechanism for the extent cache to read. It does
+ * this by extending the ECExtentCache::BackendReadListener class.
+ *
+ * Once a read is complete, the client must call cache.read_done().
+ *
+ * When the cache is ready, it will call back the lambda passed to execute.
+ * The client is expected to populate the write data, including any parity
+ * data, by calling the cache.write_done() method.
+ *
+ * Finally, there are on_change() and on_change2() methods. The first of these
+ * instructs the extent cache to discard any ops it has queued. The second
+ * simply asserts that the cache is now idle; this ensures that the calling
+ * code has performed the cleanup required to clear the extent cache.
+ */
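+
+/* A minimal usage sketch of the API above (illustrative only: "listener",
+ * "lru_budget", "sinfo", "cct", the buffers and the callback body are
+ * assumptions, not code from this change):
+ *
+ *   ECExtentCache::LRU lru(lru_budget);           // shared, per-OSD-shard
+ *   ECExtentCache cache(listener, lru, sinfo, cct);
+ *
+ *   auto op = cache.prepare(oid, to_read, writes, orig_size, projected_size,
+ *                           false, [&cache](ECExtentCache::OpRef &op) {
+ *     // Reads are now cached; encode the write, then hand back the buffers:
+ *     //   cache.write_done(op, encoded_buffers);
+ *   });
+ *   std::list<ECExtentCache::OpRef> ops{op};
+ *   cache.execute(ops);  // may invoke the callback re-entrantly
+ *
+ * When the listener's backend_read() completes, the client delivers the data
+ * with cache.read_done(oid, buffers).
+ */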
+
 #pragma once
 
-// Temporary stubs
+#include "ECUtil.h"
+#include "include/Context.h"
+
 class ECExtentCache {
+  class Address;
+  class Line;
+  class Object;
+  typedef std::shared_ptr<Line> LineRef;
+  typedef std::list<LineRef>::iterator LineIter;
+
  public:
+  class LRU;
+  class Op;
+  typedef std::shared_ptr<Op> OpRef;
+
+  struct BackendReadListener {
+    virtual void backend_read(hobject_t oid,
+                              ECUtil::shard_extent_set_t const &request,
+                              uint64_t object_size) = 0;
+    virtual ~BackendReadListener() = default;
+  };
+
+  static void update_mempool(int items, int64_t bytes) {
+    mempool::get_pool(mempool::pool_index_t(mempool::mempool_ec_extent_cache)).
+        adjust_count(items, bytes);
+  }
+
   class LRU {
    public:
-    LRU(uint64_t) {}
+    class Key {
+     public:
+      uint64_t offset;
+      hobject_t oid;
+
+      Key(uint64_t offset, const hobject_t &oid) : offset(offset), oid(oid) {};
+
+      friend bool operator==(const Key &lhs, const Key &rhs) {
+        return lhs.offset == rhs.offset
+            && lhs.oid == rhs.oid;
+      }
+
+      friend bool operator!=(const Key &lhs, const Key &rhs) {
+        return !(lhs == rhs);
+      }
+    };
+
+    struct KeyHash {
+      std::size_t operator()(const Key &obj) const {
+        std::size_t seed = 0x625610ED;
+        seed ^= (seed << 6) + (seed >> 2) + 0x1E665363 + static_cast<
+          std::size_t>(obj.offset);
+        seed ^= (seed << 6) + (seed >> 2) + 0x51343C80 + obj.oid.get_hash();
+        return seed;
+      }
+    };
+
+   private:
+    friend class Object;
+    friend class ECExtentCache;
+    std::unordered_map<Key, std::pair<
+                         std::list<Key>::iterator, std::shared_ptr<
+                           ECUtil::shard_extent_map_t>>, KeyHash> map;
+    std::list<Key> lru;
+    uint64_t max_size = 0;
+    uint64_t size = 0;
+    ceph::mutex mutex = ceph::make_mutex("ECExtentCache::LRU");
+
+    void free_maybe();
+    void discard();
+    void add(const Line &line);
+    void erase(const Key &k);
+    std::list<Key>::iterator erase(const std::list<Key>::iterator &it,
+                                   bool update_mempool);
+    std::shared_ptr<ECUtil::shard_extent_map_t> find(
+        const hobject_t &oid, uint64_t offset);
+    void remove_object(const hobject_t &oid);
+
+   public:
+    explicit LRU(uint64_t max_size) : map(), max_size(max_size) {}
+  };
+
+  class Op {
+    friend class Object;
+    friend class ECExtentCache;
+
+    Object &object;
+    std::optional<ECUtil::shard_extent_set_t> const reads;
+    ECUtil::shard_extent_set_t const writes;
+    ECUtil::shard_extent_map_t result;
+    bool complete = false;
+    bool invalidates_cache = false;
+    bool reading = false;
+    bool read_done = false;
+    uint64_t projected_size = 0;
+    GenContextURef<OpRef&> cache_ready_cb;
+    std::list<LineRef> lines;
+
+    // List of callbacks to be executed on write completion (not commit)
+    std::list<std::function<void(void)>> on_write;
+
+    const extent_set get_pin_eset(uint64_t alignment) const;
+
+   public:
+    explicit Op(
+        GenContextURef<OpRef&> &&cache_ready_cb,
+        Object &object,
+        std::optional<ECUtil::shard_extent_set_t> const &to_read,
+        ECUtil::shard_extent_set_t const &write,
+        uint64_t projected_size,
+        bool invalidates_cache);
+
+    ~Op();
+    void cancel() { delete cache_ready_cb.release(); }
+    const ECUtil::shard_extent_set_t &get_writes() const { return writes; }
+    const Object &get_object() const { return object; }
+    const hobject_t &get_hoid() const { return object.oid; }
+    const ECUtil::shard_extent_map_t &get_result() { return result; }
+
+    void add_on_write(std::function<void(void)> &&cb) {
+      on_write.emplace_back(std::move(cb));
+    }
+
+    bool complete_if_reads_cached(OpRef &op_ref) {
+      if (!read_done) {
+        return false;
+      }
+      result = object.get_cache(reads);
+      complete = true;
+      cache_ready_cb.release()->complete(op_ref);
+      return true;
+    }
+
+    void write_done(ECUtil::shard_extent_map_t const &update) const {
+      object.write_done(update, projected_size);
+      for (auto &cb: on_write) {
+        cb();
+      }
+    }
   };
-};
+
+#define MIN_LINE_SIZE (32UL*1024UL)
+
+ private:
+  class Object {
+    friend class Op;
+    friend class LRU;
+    friend class Line;
+    friend class ECExtentCache;
+
+    ECExtentCache &pg;
+    ECUtil::shard_extent_set_t requesting;
+    ECUtil::shard_extent_set_t do_not_read;
+    std::list<OpRef> reading_ops;
+    std::list<OpRef> requesting_ops;
+    // Map of the byte-offset of the start of the line to the line.
+    std::map<uint64_t, std::weak_ptr<Line>> lines;
+    int active_ios = 0;
+    uint64_t current_size = 0;
+    uint64_t projected_size = 0;
+    uint64_t line_size = 0;
+    bool reading = false;
+    bool cache_invalidated = false;
+    bool cache_invalidate_expected = false;
+
+    void request(OpRef &op);
+    void send_reads();
+    void unpin(Op &op) const;
+    void delete_maybe() const;
+    void erase_line(uint64_t offset);
+    void invalidate(const OpRef &invalidating_op);
+
+   public:
+    hobject_t oid;
+
+    Object(ECExtentCache &pg, hobject_t const &oid, uint64_t size) :
+      pg(pg),
+      requesting(pg.sinfo.get_k_plus_m()),
+      do_not_read(pg.sinfo.get_k_plus_m()),
+      current_size(size),
+      projected_size(size),
+      oid(oid) {
+      line_size = std::max(MIN_LINE_SIZE, pg.sinfo.get_chunk_size());
+    }
+
+    void insert(ECUtil::shard_extent_map_t const &buffers) const;
+    void write_done(ECUtil::shard_extent_map_t const &buffers, uint64_t new_size);
+    void read_done(ECUtil::shard_extent_map_t const &result);
+    [[nodiscard]] uint64_t get_projected_size() const { return projected_size; }
+    ECUtil::shard_extent_map_t get_cache(
+        std::optional<ECUtil::shard_extent_set_t> const &set) const;
+    uint64_t line_align(uint64_t line) const;
+  };
+
+
+  class Line {
+   public:
+    uint64_t offset;
+    uint64_t size;
+    std::shared_ptr<ECUtil::shard_extent_map_t> cache;
+    Object &object;
+
+    Line(Object &object,
+         uint64_t offset) :
+      offset(offset),
+      object(object) {
+      std::shared_ptr<ECUtil::shard_extent_map_t> c = object.pg.lru.find(
+        object.oid, offset);
+
+      if (c == nullptr) {
+        cache = std::make_shared<ECUtil::shard_extent_map_t>(&object.pg.sinfo);
+        size = 0;
+        /* We are creating an empty cache line */
+        update_mempool(1, 0);
+      } else {
+        cache = c;
+        size = c->size();
+      }
+    }
+
+    ~Line() {
+      object.pg.lru.add(*this);
+      object.erase_line(offset);
+    }
+
+    friend bool operator==(const Line &lhs, const Line &rhs) {
+      return lhs.offset == rhs.offset
+          && lhs.object.oid == rhs.object.oid;
+    }
+
+    friend bool operator!=(const Line &lhs, const Line &rhs) {
+      return !(lhs == rhs);
+    }
+  };
+
+  std::map<hobject_t, Object> objects;
+  BackendReadListener &backend_read;
+  LRU &lru;
+  const ECUtil::stripe_info_t &sinfo;
+  std::list<OpRef> waiting_ops;
+  void cache_maybe_ready();
+  uint32_t counter = 0;
+  uint32_t active_ios = 0;
+  CephContext *cct;
+
+  OpRef prepare(GenContextURef<OpRef&> &&ctx,
+                hobject_t const &oid,
+                std::optional<ECUtil::shard_extent_set_t> const &to_read,
+                ECUtil::shard_extent_set_t const &write,
+                uint64_t orig_size,
+                uint64_t projected_size,
+                bool invalidates_cache);
+
+ public:
+  ~ECExtentCache() {
+    // This should really only be needed in failed tests, as the PG should
+    // clear up any IO before it gets destructed. However, here we make sure
+    // to clean up any outstanding IO.
+    on_change();
+    on_change2();
+  }
+
+  explicit ECExtentCache(BackendReadListener &backend_read,
+                         LRU &lru, const ECUtil::stripe_info_t &sinfo,
+                         CephContext *cct
+    ) :
+    backend_read(backend_read),
+    lru(lru),
+    sinfo(sinfo),
+    cct(cct) {}
+
+  // Insert some data into the cache.
+  void read_done(hobject_t const &oid, ECUtil::shard_extent_map_t const &update);
+  void write_done(OpRef const &op, ECUtil::shard_extent_map_t const &update);
+  void on_change();
+  void on_change2() const;
+  [[nodiscard]] bool contains_object(hobject_t const &oid) const;
+  [[nodiscard]] uint64_t get_projected_size(hobject_t const &oid) const;
+
+  template <typename CacheReadyCb>
+  OpRef prepare(hobject_t const &oid,
+                std::optional<ECUtil::shard_extent_set_t> const &to_read,
+                ECUtil::shard_extent_set_t const &write,
+                uint64_t orig_size,
+                uint64_t projected_size,
+                bool invalidates_cache,
+                CacheReadyCb &&ready_cb) {
+    GenContextURef<OpRef&> ctx =
+        make_gen_lambda_context<OpRef&, CacheReadyCb>(
+          std::forward<CacheReadyCb>(ready_cb));
+
+    return prepare(std::move(ctx), oid, to_read, write, orig_size,
+                   projected_size, invalidates_cache);
+  }
+
+  void execute(std::list<OpRef> &op_list);
+  [[nodiscard]] bool idle() const;
+  uint32_t get_and_reset_counter();
+
+  void add_on_write(std::function<void(void)> &&cb) const {
+    if (waiting_ops.empty()) {
+      cb();
+    } else {
+      waiting_ops.back()->add_on_write(std::move(cb));
+    }
+  }
+}; // ECExtentCache
index 7cfe995a4b92747db305d9642894d7263406a11a..d6de5274b8e61b863d822539743cca9159294922 100644 (file)
@@ -34,625 +34,933 @@ using ceph::decode;
 using ceph::encode;
 using ceph::ErasureCodeInterfaceRef;
 
-static void encode_and_write(
-  pg_t pgid,
-  const hobject_t &oid,
-  const ECUtil::stripe_info_t &sinfo,
-  ErasureCodeInterfaceRef &ecimpl,
-  const set<int> &want,
-  uint64_t offset,
-  bufferlist bl,
-  uint32_t flags,
-  ECUtil::HashInfoRef hinfo,
-  extent_map &written,
-  map<shard_id_t, ObjectStore::Transaction> *transactions,
-  DoutPrefixProvider *dpp)
-{
-  const uint64_t before_size = hinfo->get_total_logical_size(sinfo);
-  ceph_assert(sinfo.logical_offset_is_stripe_aligned(offset));
-  ceph_assert(sinfo.logical_offset_is_stripe_aligned(bl.length()));
-  ceph_assert(bl.length());
-
-  map<int, bufferlist> buffers;
-  int r = ECUtil::encode(
-    sinfo, ecimpl, bl, want, &buffers);
-  ceph_assert(r == 0);
+void debug(const hobject_t &oid, const std::string &str,
+           const ECUtil::shard_extent_map_t &map, DoutPrefixProvider *dpp
+  ) {
+#if DEBUG_EC_BUFFERS
+  ldpp_dout(dpp, 20)
+    << "EC_DEBUG_BUFFERS: generate_transactions: "
+    << "oid: " << oid
+    << " " << str << " " << map.debug_string(2048, 8) << dendl;
+#else
+  ldpp_dout(dpp, 20)
+    << "generate_transactions: "
+    << "oid: " << oid
+    << " " << str << " " << map << dendl;
+#endif
+}
 
-  written.insert(offset, bl.length(), bl);
+void ECTransaction::Generate::encode_and_write() {
+  // For PDW, we already have necessary parity buffers.
+  if (!plan.do_parity_delta_write) {
+    to_write.insert_parity_buffers();
+  }
 
+  // If partial writes are not supported, pad out to_write to a full stripe.
+  if (!sinfo.supports_partial_writes()) {
+    for (auto &&[shard, eset]: plan.will_write) {
+      if (sinfo.get_raw_shard(shard) >= sinfo.get_k()) continue;
+
+      for (auto [off, len]: eset) {
+        to_write.zero_pad(shard, off, len);
+      }
+    }
+  }
+
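+  /* Two ways to produce the parity: a parity delta write (PDW) applies the
+   * delta between old and new data to the old parity (read_sem holds the
+   * old data and parity), while a conventional encode recomputes the parity
+   * from the full data stripe. */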
+  int r = 0;
+  if (plan.do_parity_delta_write) {
+    /* For parity delta writes, we remove any unwanted writes before calculating
+     * the parity.
+     */
+    read_sem->zero_pad(plan.will_write);
+    to_write.pad_with_other(plan.will_write, *read_sem);
+    r = to_write.encode_parity_delta(ec_impl, *read_sem);
+  } else {
+    r = to_write.encode(ec_impl, plan.hinfo, plan.orig_size);
+  }
+  ceph_assert(r == 0);
+  // Trimming to_write down to plan.will_write is currently disabled:
+  //to_write = to_write.intersect(plan.will_write);
+
+  debug(oid, "parity", to_write, dpp);
   ldpp_dout(dpp, 20) << __func__ << ": " << oid
-                    << " new_size "
-                    << offset + bl.length()
-                    << dendl;
-
-  if (offset >= before_size) {
-    ceph_assert(offset == before_size);
-    hinfo->append(
-      sinfo.aligned_logical_offset_to_chunk_offset(offset),
-      buffers);
-  }
-
-  for (auto &&i : *transactions) {
-    ceph_assert(buffers.count(static_cast<int>(i.first)));
-    bufferlist &enc_bl = buffers[static_cast<int>(i.first)];
-    if (offset >= before_size) {
-      i.second.set_alloc_hint(
-       coll_t(spg_t(pgid, i.first)),
-       ghobject_t(oid, ghobject_t::NO_GEN, i.first),
-       0, 0,
-       CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE |
-       CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY);
+                    << " plan " << plan
+                    << dendl;
+
+  for (auto &&[shard, to_write_eset]: plan.will_write) {
+    /* Zero pad, even if we are not writing.  The extent cache requires that
+     * all shards are fully populated with write data, even if the OSDs are
+     * down. This is not a fundamental requirement of the cache, but dealing
+     * with implied zeros due to incomplete writes is difficult and would
+     * remove a level of protection against bugs.
+     */
+    for (auto &&[offset, len]: to_write_eset) {
+      to_write.zero_pad(shard, offset, len);
+    }
+
+    if (transactions.contains(shard)) {
+      auto &t = transactions.at(shard);
+      if (to_write_eset.begin().get_start() >= plan.orig_size) {
+        t.set_alloc_hint(
+          coll_t(spg_t(pgid, shard)),
+          ghobject_t(oid, ghobject_t::NO_GEN, shard),
+          0, 0,
+          CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE |
+          CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY);
+      }
+
+      for (auto &&[offset, len]: to_write_eset) {
+        buffer::list bl;
+        to_write.get_buffer(shard, offset, len, bl);
+        t.write(coll_t(spg_t(pgid, shard)),
+                ghobject_t(oid, ghobject_t::NO_GEN, shard),
+                offset, bl.length(), bl, fadvise_flags);
+      }
     }
-    i.second.write(
-      coll_t(spg_t(pgid, i.first)),
-      ghobject_t(oid, ghobject_t::NO_GEN, i.first),
-      sinfo.logical_to_prev_chunk_offset(
-       offset),
-      enc_bl.length(),
-      enc_bl,
-      flags);
   }
 }
 
-void ECTransaction::generate_transactions(
-  PGTransaction* _t,
-  WritePlan &plan,
-  ErasureCodeInterfaceRef &ecimpl,
-  pg_t pgid,
-  const ECUtil::stripe_info_t &sinfo,
-  const map<hobject_t,extent_map> &partial_extents,
-  vector<pg_log_entry_t> &entries,
-  map<hobject_t,extent_map> *written_map,
-  map<shard_id_t, ObjectStore::Transaction> *transactions,
-  set<hobject_t> *temp_added,
-  set<hobject_t> *temp_removed,
-  DoutPrefixProvider *dpp,
-  const ceph_release_t require_osd_release)
+ECTransaction::WritePlanObj::WritePlanObj(
+    const hobject_t &hoid,
+    const PGTransaction::ObjectOperation &op,
+    const ECUtil::stripe_info_t &sinfo,
+    const shard_id_set readable_shards,
+    const shard_id_set writable_shards,
+    const bool object_in_cache,
+    uint64_t orig_size,
+    const std::optional<object_info_t> &oi,
+    const std::optional<object_info_t> &soi,
+    const ECUtil::HashInfoRef &&hinfo,
+    const ECUtil::HashInfoRef &&shinfo,
+    const unsigned pdw_write_mode
+  ) :
+  hoid(hoid),
+  will_write(sinfo.get_k_plus_m()),
+  hinfo(hinfo),
+  shinfo(shinfo),
+  orig_size(orig_size) // On-disk object sizes are rounded up to the next page.
 {
-  ceph_assert(written_map);
-  ceph_assert(transactions);
-  ceph_assert(temp_added);
-  ceph_assert(temp_removed);
-  ceph_assert(_t);
-  auto &t = *_t;
+  extent_set unaligned_ro_writes;
 
-  auto &hash_infos = plan.hash_infos;
+  projected_size = oi ? oi->size : 0;
 
-  map<hobject_t, pg_log_entry_t*> obj_to_log;
-  for (auto &&i: entries) {
-    obj_to_log.insert(make_pair(i.soid, &i));
+  if (soi) {
+    projected_size = soi->size;
   }
 
-  t.safe_create_traverse(
-    [&](pair<const hobject_t, PGTransaction::ObjectOperation> &opair) {
-      const hobject_t &oid = opair.first;
-      auto &op = opair.second;
-      auto &obc_map = t.obc_map;
-      auto &written = (*written_map)[oid];
+  hobject_t source;
+  invalidates_cache = op.has_source(&source) || op.is_delete();
+
+  op.buffer_updates.to_interval_set(unaligned_ro_writes);
+  /* We can get multiple truncates/appends in a single transaction. These get
+   * simplified to two values: a minimum and a maximum. It is not guaranteed
+   * that this region has writes, so we create writes for this region to
+   * essentially write zeros (or holes) in that region.
+   */
+
+  if (op.truncate) {
+    uint64_t start = op.truncate->first;
+    uint64_t end = projected_size;
+    if (projected_size > op.truncate->second) {
+      end = op.truncate->second;
+    }
+    if (end > start) {
+      unaligned_ro_writes.insert(start, end - start);
+    }
+  }
 
-      auto iter = obj_to_log.find(oid);
-      pg_log_entry_t *entry = iter != obj_to_log.end() ? iter->second : nullptr;
+  /* Calculate any non-aligned pages. These need to be read and written */
+  extent_set aligned_ro_writes(unaligned_ro_writes);
+  aligned_ro_writes.align(CEPH_PAGE_SIZE);
+  extent_set partial_page_ro_writes(aligned_ro_writes);
+  partial_page_ro_writes.subtract(unaligned_ro_writes);
+  partial_page_ro_writes.align(CEPH_PAGE_SIZE);
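+  /* For example (4 KiB pages): an unaligned write of [4096, 9000) aligns to
+   * [4096, 12288); subtracting the write leaves [9000, 12288), which aligns
+   * back to [8192, 12288) - the one partially-written page that must be
+   * read and rewritten. */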
+
+  extent_set write_superset;
+  for (auto &&[off, len] : unaligned_ro_writes) {
+    sinfo.ro_range_to_shard_extent_set_with_superset(
+      off, len, will_write, write_superset);
+  }
+  write_superset.align(CEPH_PAGE_SIZE);
+
+  shard_id_set writable_parity_shards = shard_id_set::intersection(sinfo.get_parity_shards(), writable_shards);
+  for (auto shard : writable_parity_shards) {
+    will_write[shard].insert(write_superset);
+  }
+
+  ECUtil::shard_extent_set_t reads(sinfo.get_k_plus_m());
+  ECUtil::shard_extent_set_t read_mask(sinfo.get_k_plus_m());
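+  /* read_mask will hold, per shard, the extents that can exist on disk for
+   * an object of the original size; intersecting planned reads with it
+   * keeps us from reading past each shard's end of object. */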
 
-      ObjectContextRef obc;
-      auto obiter = t.obc_map.find(oid);
-      if (obiter != t.obc_map.end()) {
-       obc = obiter->second;
+  if (!sinfo.supports_partial_writes()) {
+    for (shard_id_t shard; shard < sinfo.get_k_plus_m(); ++shard) {
+      will_write[shard].insert(write_superset);
+    }
+    will_write.align(sinfo.get_chunk_size());
+    reads = will_write;
+    sinfo.ro_size_to_read_mask(sinfo.ro_offset_to_next_stripe_ro_offset(orig_size), read_mask);
+    reads.intersection_of(read_mask);
+    do_parity_delta_write = false;
+  } else {
+    will_write.align(CEPH_PAGE_SIZE);
+    ECUtil::shard_extent_set_t pdw_reads(will_write);
+
+    sinfo.ro_size_to_read_mask(ECUtil::align_page_next(orig_size), read_mask);
+
+    /* Next we need to add the reads required for a conventional write */
+    for (auto shard : sinfo.get_data_shards()) {
+      reads[shard].insert(write_superset);
+      if (will_write.contains(shard)) {
+        reads[shard].subtract(will_write.at(shard));
       }
-      if (entry) {
-       ceph_assert(obc);
-      } else {
-       ceph_assert(oid.is_temp());
+      if (reads[shard].empty()) {
+        reads.erase(shard);
       }
+    }
 
-      ECUtil::HashInfoRef hinfo;
-      {
-       auto iter = hash_infos.find(oid);
-       ceph_assert(iter != hash_infos.end());
-       hinfo = iter->second;
-      }
+    /* We now need to add in the partial-page ro writes. This is not
+     * particularly efficient, as there are many divs in here, but non-4k
+     * aligned writes are not very efficient anyway.
+     */
+    for (auto &&[off, len] : partial_page_ro_writes) {
+      sinfo.ro_range_to_shard_extent_set(
+        off, len, reads);
+    }
 
-      if (oid.is_temp()) {
-       if (op.is_fresh_object()) {
-         temp_added->insert(oid);
-       } else if (op.is_delete()) {
-         temp_removed->insert(oid);
-       }
-      }
+    reads.intersection_of(read_mask);
 
-      if (entry &&
-         entry->is_modify() &&
-         op.updated_snaps) {
-       bufferlist bl(op.updated_snaps->second.size() * 8 + 8);
-       encode(op.updated_snaps->second, bl);
-       entry->snaps.swap(bl);
-       entry->snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
-      }
+    /* Here we decide if we want to do a conventional write or a parity delta write. */
+    if (sinfo.supports_parity_delta_writes() && !object_in_cache &&
+        orig_size == projected_size && !reads.empty()) {
 
-      ldpp_dout(dpp, 20) << "generate_transactions: "
-                        << opair.first
-                        << ", current size is "
-                        << hinfo->get_total_logical_size(sinfo)
-                        << " buffers are "
-                        << op.buffer_updates
-                        << dendl;
-      if (op.truncate) {
-       ldpp_dout(dpp, 20) << "generate_transactions: "
-                          << " truncate is "
-                          << *(op.truncate)
-                          << dendl;
-      }
+      shard_id_set read_shards = reads.get_shard_id_set();
+      shard_id_set pdw_read_shards = pdw_reads.get_shard_id_set();
 
-      if (entry && op.updated_snaps) {
-       entry->mod_desc.update_snaps(op.updated_snaps->first);
+      if (pdw_write_mode != 0) {
+        do_parity_delta_write = (pdw_write_mode == 2);
+      } else if (!shard_id_set::difference(pdw_read_shards, readable_shards).empty()) {
+        // Some kind of reconstruct would be needed for PDW, so don't bother.
+        do_parity_delta_write = false;
+      } else if (!shard_id_set::difference(read_shards, readable_shards).empty()) {
+        // Some kind of reconstruct is needed for conventional, but NOT for PDW!
+        do_parity_delta_write = true;
+      } else {
+        /* Everything we need for both is available; opt for whichever
+         * requires fewer reads.
+         */
+        do_parity_delta_write = pdw_read_shards.size() < read_shards.size();
       }
 
-      map<string, std::optional<bufferlist> > xattr_rollback;
-      ceph_assert(hinfo);
-      bufferlist old_hinfo;
-      encode(*hinfo, old_hinfo);
-      xattr_rollback[ECUtil::get_hinfo_key()] = old_hinfo;
-
-      if (op.is_none() && op.truncate && op.truncate->first == 0) {
-       ceph_assert(entry);
-       ceph_assert(obc);
-
-       if (op.truncate->first != op.truncate->second) {
-         op.truncate->first = op.truncate->second;
-       } else {
-         op.truncate = std::nullopt;
-       }
-
-       op.delete_first = true;
-       op.init_type = PGTransaction::ObjectOperation::Init::Create();
-
-       if (obc) {
-         /* We need to reapply all of the cached xattrs.
-            * std::map insert fortunately only writes keys
-            * which don't already exist, so this should do
-            * the right thing. */
-         op.attr_updates.insert(
-           obc->attr_cache.begin(),
-           obc->attr_cache.end());
-       }
+      if (do_parity_delta_write) {
+        to_read = std::move(pdw_reads);
+        reads.clear(); // So we don't stash it at the end.
       }
+    }
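+    /* Worked example (a sketch, k=4 m=2, one data chunk fully overwritten):
+     * a conventional write reads the three untouched data chunks, while a
+     * PDW reads the old chunk plus both parities. Three reads each, so the
+     * strict '<' above keeps the conventional path. */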
 
-      if (op.delete_first) {
-       /* We also want to remove the std::nullopt entries since
-          * the keys already won't exist */
-       for (auto j = op.attr_updates.begin();
-            j != op.attr_updates.end();
-         ) {
-         if (j->second) {
-           ++j;
-         } else {
-           op.attr_updates.erase(j++);
-         }
-       }
-       /* Fill in all current entries for xattr rollback */
-       if (obc) {
-         xattr_rollback.insert(
-           obc->attr_cache.begin(),
-           obc->attr_cache.end());
-         obc->attr_cache.clear();
-       }
-       if (entry) {
-         entry->mod_desc.rmobject(entry->version.version);
-         for (auto &&st: *transactions) {
-           st.second.collection_move_rename(
-             coll_t(spg_t(pgid, st.first)),
-             ghobject_t(oid, ghobject_t::NO_GEN, st.first),
-             coll_t(spg_t(pgid, st.first)),
-             ghobject_t(oid, entry->version.version, st.first));
-         }
-       } else {
-         for (auto &&st: *transactions) {
-           st.second.remove(
-             coll_t(spg_t(pgid, st.first)),
-             ghobject_t(oid, ghobject_t::NO_GEN, st.first));
-         }
-       }
-       hinfo->clear();
-      }
+    /* NOTE: We intentionally leave un-writable shards in the write plan, as
+     * it is actually less efficient to take them out: PDWs still need to
+     * compute the deltas and conventional writes still need to calculate the
+     * parity. The transaction will be dropped by generate_transactions.
+     */
+  }
+
+  if (!reads.empty()) {
+    to_read = std::move(reads);
+  }
 
-      if (op.is_fresh_object() && entry) {
-       entry->mod_desc.create();
+  /* Validate the post condition: if we are reading from the object
+   * (to_read is set), we cannot also be renaming or cloning it (no
+   * source object_info). */
+  ceph_assert(!to_read || !soi);
+}
+
+void ECTransaction::Generate::all_shards_written() {
+  if (entry) {
+    entry->written_shards.insert_range(shard_id_t(0), sinfo.get_k_plus_m());
+  }
+}
+
+void ECTransaction::Generate::shard_written(const shard_id_t shard) {
+  if (entry) {
+    entry->written_shards.insert(shard);
+  }
+}
+
+void ECTransaction::Generate::shards_written(const shard_id_set &shards) {
+  if (entry) {
+    entry->written_shards.insert(shards);
+  }
+}
+
+void ECTransaction::Generate::zero_truncate_to_delete() {
+  ceph_assert(obc);
+
+  if (op.truncate->first != op.truncate->second) {
+    op.truncate->first = op.truncate->second;
+  } else {
+    op.truncate = std::nullopt;
+  }
+
+  op.delete_first = true;
+  op.init_type = PGTransaction::ObjectOperation::Init::Create();
+
+  if (obc) {
+    /* We need to reapply all of the cached xattrs.
+       * std::map insert fortunately only writes keys
+       * which don't already exist, so this should do
+       * the right thing. */
+    op.attr_updates.insert(
+      obc->attr_cache.begin(),
+      obc->attr_cache.end());
+  }
+}
+
+void ECTransaction::Generate::delete_first() {
+  /* We also want to remove the std::nullopt entries since
+   * the keys already won't exist */
+  for (auto j = op.attr_updates.begin();
+       j != op.attr_updates.end();
+      ) {
+    if (j->second) {
+      ++j;
+    } else {
+      j = op.attr_updates.erase(j);
+    }
+  }
+  /* Fill in all current entries for xattr rollback */
+  if (obc) {
+    xattr_rollback.insert(
+      obc->attr_cache.begin(),
+      obc->attr_cache.end());
+    obc->attr_cache.clear();
+  }
+  if (entry) {
+    entry->mod_desc.rmobject(entry->version.version);
+    all_shards_written();
+    for (auto &&[shard, t]: transactions) {
+      t.collection_move_rename(
+        coll_t(spg_t(pgid, shard)),
+        ghobject_t(oid, ghobject_t::NO_GEN, shard),
+        coll_t(spg_t(pgid, shard)),
+        ghobject_t(oid, entry->version.version, shard));
+    }
+  } else {
+    for (auto &&[shard, t]: transactions) {
+      t.remove(
+        coll_t(spg_t(pgid, shard)),
+        ghobject_t(oid, ghobject_t::NO_GEN, shard));
+    }
+  }
+  if (plan.hinfo)
+    plan.hinfo->clear();
+}
+
+void ECTransaction::Generate::process_init() {
+  match(
+    op.init_type,
+    [&](const PGTransaction::ObjectOperation::Init::None &) {},
+    [&](const PGTransaction::ObjectOperation::Init::Create &_) {
+      all_shards_written();
+      for (auto &&[shard, t]: transactions) {
+        if (osdmap->require_osd_release >= ceph_release_t::octopus) {
+          t.create(
+            coll_t(spg_t(pgid, shard)),
+            ghobject_t(oid, ghobject_t::NO_GEN, shard));
+        } else {
+          t.touch(
+            coll_t(spg_t(pgid, shard)),
+            ghobject_t(oid, ghobject_t::NO_GEN, shard));
+        }
+      }
+    },
+    [&](const PGTransaction::ObjectOperation::Init::Clone &cop) {
+      all_shards_written();
+      for (auto &&[shard, t]: transactions) {
+        t.clone(
+          coll_t(spg_t(pgid, shard)),
+          ghobject_t(cop.source, ghobject_t::NO_GEN, shard),
+          ghobject_t(oid, ghobject_t::NO_GEN, shard));
       }
 
-      match(
-       op.init_type,
-       [&](const PGTransaction::ObjectOperation::Init::None &) {},
-       [&](const PGTransaction::ObjectOperation::Init::Create &op) {
-         for (auto &&st: *transactions) {
-           if (require_osd_release >= ceph_release_t::octopus) {
-             st.second.create(
-               coll_t(spg_t(pgid, st.first)),
-               ghobject_t(oid, ghobject_t::NO_GEN, st.first));
-           } else {
-             st.second.touch(
-               coll_t(spg_t(pgid, st.first)),
-               ghobject_t(oid, ghobject_t::NO_GEN, st.first));
-           }
-         }
-       },
-       [&](const PGTransaction::ObjectOperation::Init::Clone &op) {
-         for (auto &&st: *transactions) {
-           st.second.clone(
-             coll_t(spg_t(pgid, st.first)),
-             ghobject_t(op.source, ghobject_t::NO_GEN, st.first),
-             ghobject_t(oid, ghobject_t::NO_GEN, st.first));
-         }
-
-         auto siter = hash_infos.find(op.source);
-         ceph_assert(siter != hash_infos.end());
-         hinfo->update_to(*(siter->second));
-
-         if (obc) {
-           auto cobciter = obc_map.find(op.source);
-           ceph_assert(cobciter != obc_map.end());
-           obc->attr_cache = cobciter->second->attr_cache;
-         }
-       },
-       [&](const PGTransaction::ObjectOperation::Init::Rename &op) {
-         ceph_assert(op.source.is_temp());
-         for (auto &&st: *transactions) {
-           st.second.collection_move_rename(
-             coll_t(spg_t(pgid, st.first)),
-             ghobject_t(op.source, ghobject_t::NO_GEN, st.first),
-             coll_t(spg_t(pgid, st.first)),
-             ghobject_t(oid, ghobject_t::NO_GEN, st.first));
-         }
-         auto siter = hash_infos.find(op.source);
-         ceph_assert(siter != hash_infos.end());
-         hinfo->update_to(*(siter->second));
-         if (obc) {
-           auto cobciter = obc_map.find(op.source);
-           ceph_assert(cobciter == obc_map.end());
-           obc->attr_cache.clear();
-         }
-       });
-
-      // omap not supported (except 0, handled above)
-      ceph_assert(!(op.clear_omap));
-      ceph_assert(!(op.omap_header));
-      ceph_assert(op.omap_updates.empty());
-
-      if (!op.attr_updates.empty()) {
-       map<string, bufferlist, less<>> to_set;
-       for (auto &&j: op.attr_updates) {
-         if (j.second) {
-           to_set[j.first] = *(j.second);
-         } else {
-           for (auto &&st : *transactions) {
-             st.second.rmattr(
-               coll_t(spg_t(pgid, st.first)),
-               ghobject_t(oid, ghobject_t::NO_GEN, st.first),
-               j.first);
-           }
-         }
-         if (obc) {
-           auto citer = obc->attr_cache.find(j.first);
-           if (entry) {
-             if (citer != obc->attr_cache.end()) {
-               // won't overwrite anything we put in earlier
-               xattr_rollback.insert(
-                 make_pair(
-                   j.first,
-                   std::optional<bufferlist>(citer->second)));
-             } else {
-               // won't overwrite anything we put in earlier
-               xattr_rollback.insert(
-                 make_pair(
-                   j.first,
-                   std::nullopt));
-             }
-           }
-           if (j.second) {
-             obc->attr_cache[j.first] = *(j.second);
-           } else if (citer != obc->attr_cache.end()) {
-             obc->attr_cache.erase(citer);
-           }
-         } else {
-           ceph_assert(!entry);
-         }
-       }
-       for (auto &&st : *transactions) {
-         st.second.setattrs(
-           coll_t(spg_t(pgid, st.first)),
-           ghobject_t(oid, ghobject_t::NO_GEN, st.first),
-           to_set);
-       }
-       ceph_assert(!xattr_rollback.empty());
+      if (plan.hinfo && plan.shinfo)
+        plan.hinfo->update_to(*plan.shinfo);
+
+      if (obc) {
+        auto cobciter = t.obc_map.find(cop.source);
+        ceph_assert(cobciter != t.obc_map.end());
+        obc->attr_cache = cobciter->second->attr_cache;
       }
-      if (entry && !xattr_rollback.empty()) {
-       entry->mod_desc.setattrs(xattr_rollback);
+    },
+    [&](const PGTransaction::ObjectOperation::Init::Rename &rop) {
+      ceph_assert(rop.source.is_temp());
+      all_shards_written();
+      for (auto &&[shard, t]: transactions) {
+        t.collection_move_rename(
+          coll_t(spg_t(pgid, shard)),
+          ghobject_t(rop.source, ghobject_t::NO_GEN, shard),
+          coll_t(spg_t(pgid, shard)),
+          ghobject_t(oid, ghobject_t::NO_GEN, shard));
       }
+      if (plan.hinfo && plan.shinfo)
+        plan.hinfo->update_to(*plan.shinfo);
+      if (obc) {
+        auto cobciter = t.obc_map.find(rop.source);
+        ceph_assert(cobciter == t.obc_map.end());
+        obc->attr_cache.clear();
+      }
+    });
+}
+
+void alloc_hint(PGTransaction::ObjectOperation& op,
+      shard_id_map<ObjectStore::Transaction> &transactions,
+      pg_t &pgid,
+      const hobject_t &oid,
+      const ECUtil::stripe_info_t &sinfo) {
+  /* ro_offset_to_next_chunk_offset() scales down both aligned and
+   * unaligned offsets.
+   *
+   * We don't bother to roll this back at this time for two reasons:
+   * 1) it's advisory
+   * 2) we don't track the old value */
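+  /* For example (a sketch, k=4): a 1 MiB expected_object_size becomes a
+   * 256 KiB per-shard hint, rounded up to the next chunk boundary. */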
+  uint64_t object_size = sinfo.ro_offset_to_next_chunk_offset(
+    op.alloc_hint->expected_object_size);
+  uint64_t write_size = sinfo.ro_offset_to_next_chunk_offset(
+    op.alloc_hint->expected_write_size);
+
+  for (auto &&[shard, t]: transactions) {
+    t.set_alloc_hint(
+      coll_t(spg_t(pgid, shard)),
+      ghobject_t(oid, ghobject_t::NO_GEN, shard),
+      object_size,
+      write_size,
+      op.alloc_hint->flags);
+  }
+}
+
+ECTransaction::Generate::Generate(PGTransaction &t,
+    ErasureCodeInterfaceRef &ec_impl,
+    pg_t &pgid,
+    const ECUtil::stripe_info_t &sinfo,
+    const std::map<hobject_t, ECUtil::shard_extent_map_t> &partial_extents,
+    std::map<hobject_t, ECUtil::shard_extent_map_t> *written_map,
+    shard_id_map<ceph::os::Transaction> &transactions,
+    const OSDMapRef &osdmap,
+    const hobject_t &oid,
+    PGTransaction::ObjectOperation &op,
+    WritePlanObj &plan,
+    DoutPrefixProvider *dpp,
+    pg_log_entry_t *entry)
+  : t(t),
+    ec_impl(ec_impl),
+    pgid(pgid),
+    sinfo(sinfo),
+    transactions(transactions),
+    dpp(dpp),
+    osdmap(osdmap),
+    entry(entry),
+    oid(oid),
+    op(op),
+    plan(plan),
+    read_sem(&sinfo),
+    to_write(&sinfo) {
+  auto obiter = t.obc_map.find(oid);
+  if (obiter != t.obc_map.end()) {
+    obc = obiter->second;
+  }
+
+  if (entry) {
+    ceph_assert(obc);
+  } else {
+    ceph_assert(oid.is_temp());
+  }
+
+  if (entry && entry->is_modify() && op.updated_snaps) {
+    bufferlist bl(op.updated_snaps->second.size() * 8 + 8);
+    encode(op.updated_snaps->second, bl);
+    entry->snaps.swap(bl);
+    entry->snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
+  }
+
+  ldpp_dout(dpp, 20) << __func__ << ": " << oid << " " << plan
+                     << " fresh_object: " << op.is_fresh_object()
+                     << dendl;
+  if (op.truncate) {
+    ldpp_dout(dpp, 20) << __func__ << ": truncate is " << *(op.truncate) << dendl;
+  }
+
+  if (entry && op.updated_snaps) {
+    entry->mod_desc.update_snaps(op.updated_snaps->first);
+  }
+
+  bufferlist old_hinfo;
+  if (plan.hinfo) {
+    encode(*(plan.hinfo), old_hinfo);
+    xattr_rollback[ECUtil::get_hinfo_key()] = old_hinfo;
+  }
 
-      if (op.alloc_hint) {
-       /* logical_to_next_chunk_offset() scales down both aligned and
-          * unaligned offsets
-          
-          * we don't bother to roll this back at this time for two reasons:
-          * 1) it's advisory
-          * 2) we don't track the old value */
-       uint64_t object_size = sinfo.logical_to_next_chunk_offset(
-         op.alloc_hint->expected_object_size);
-       uint64_t write_size = sinfo.logical_to_next_chunk_offset(
-         op.alloc_hint->expected_write_size);
-       
-       for (auto &&st : *transactions) {
-         st.second.set_alloc_hint(
-           coll_t(spg_t(pgid, st.first)),
-           ghobject_t(oid, ghobject_t::NO_GEN, st.first),
-           object_size,
-           write_size,
-           op.alloc_hint->flags);
-       }
+  if (op.is_none() && op.truncate && op.truncate->first == 0) {
+    zero_truncate_to_delete();
+  }
+
+  if (op.delete_first) {
+    delete_first();
+  }
+
+  if (op.is_fresh_object() && entry) {
+    entry->mod_desc.create();
+  }
+
+  process_init();
+
+  // omap not supported (except 0, handled above)
+  ceph_assert(!(op.clear_omap) && !(op.omap_header) && op.omap_updates.empty());
+
+  if (op.alloc_hint) {
+    alloc_hint(op, transactions, pgid, oid, sinfo);
+  }
+
+  auto pextiter = partial_extents.find(oid);
+  if (pextiter != partial_extents.end()) {
+    if (plan.do_parity_delta_write) {
+      read_sem = pextiter->second;
+    } else {
+      to_write = pextiter->second;
+    }
+  }
+  debug(oid, "to_write", to_write, dpp);
+  ldpp_dout(dpp, 20) << "generate_transactions: plan: " << plan << dendl;
+
+  if (op.truncate && op.truncate->first < plan.orig_size) {
+    truncate();
+  }
+
+  overlay_writes();
+  appends_and_clone_ranges();
+
+  /* The write plan is permitted to drop parity shards when the shard is
+   * missing. However, written_shards must contain all parity shards.
+   * Note that the write plan will *not* drop data shards.
+   */
+  shards_written(sinfo.get_parity_shards());
+
+  if (!to_write.empty()) {
+    encode_and_write();
+  }
+
+  written_map->emplace(oid, std::move(to_write));
+
+  if (entry && plan.hinfo) {
+    plan.hinfo->set_total_chunk_size_clear_hash(
+      sinfo.ro_offset_to_next_stripe_ro_offset(plan.projected_size));
+  }
+
+  if (entry && plan.orig_size < plan.projected_size) {
+    entry->mod_desc.append(ECUtil::align_page_next(plan.orig_size));
+  }
+
+  if (!op.attr_updates.empty()) {
+    attr_updates();
+  }
+
+  if (entry && !xattr_rollback.empty()) {
+    entry->mod_desc.setattrs(xattr_rollback);
+  }
+
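+  /* Despite its name, handle_deletes() runs for objects that survive the
+   * transaction: it re-publishes the hinfo xattr on the primary shards. */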
+  if (!op.is_delete()) {
+    handle_deletes();
+  }
+
+  written_and_present_shards();
+}
+
+void ECTransaction::Generate::truncate() {
+  ceph_assert(!op.is_fresh_object());
+  // causes encode to invent zeros
+  to_write.erase_after_ro_offset(plan.orig_size);
+  all_shards_written();
+
+  debug(oid, "truncate_erase", to_write, dpp);
+
+  if (entry && !op.is_fresh_object()) {
+    uint64_t restore_from = sinfo.ro_offset_to_prev_chunk_offset(
+      op.truncate->first);
+    uint64_t restore_len = sinfo.aligned_ro_offset_to_chunk_offset(
+      plan.orig_size -
+      sinfo.ro_offset_to_prev_stripe_ro_offset(op.truncate->first));
+    shard_id_set all_shards; // Intentionally empty: an empty set means all shards.
+    rollback_extents.emplace_back(make_pair(restore_from, restore_len));
+    rollback_shards.emplace_back(all_shards);
+    for (auto &&[shard, t]: transactions) {
+      t.touch(
+        coll_t(spg_t(pgid, shard)),
+        ghobject_t(oid, entry->version.version, shard));
+      t.clone_range(
+        coll_t(spg_t(pgid, shard)),
+        ghobject_t(oid, ghobject_t::NO_GEN, shard),
+        ghobject_t(oid, entry->version.version, shard),
+        restore_from,
+        restore_len,
+        restore_from);
+    }
+  }
+
+  for (auto &&[shard, t]: transactions) {
+    t.truncate(
+      coll_t(spg_t(pgid, shard)),
+      ghobject_t(oid, ghobject_t::NO_GEN, shard),
+      sinfo.ro_offset_to_shard_offset(plan.orig_size,
+                                      sinfo.get_raw_shard(shard)));
+  }
+}
+
+void ECTransaction::Generate::overlay_writes() {
+  for (auto &&extent: op.buffer_updates) {
+    using BufferUpdate = PGTransaction::ObjectOperation::BufferUpdate;
+    bufferlist bl;
+    match(
+      extent.get_val(),
+      [&](const BufferUpdate::Write &wop) {
+        bl = wop.buffer;
+        fadvise_flags |= wop.fadvise_flags;
+      },
+      [&](const BufferUpdate::Zero &) {
+        bl.append_zero(extent.get_len());
+      },
+      [&](const BufferUpdate::CloneRange &) {
+        ceph_abort_msg(
+          "CloneRange is not allowed, do_op should have returned ENOTSUPP");
+      });
+
+    uint64_t off = extent.get_off();
+    uint64_t len = extent.get_len();
+
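+    /* ro_range_to_shard_extent_map splits the object-logical range across
+     * the data shards; e.g. (a sketch, k=2, 4 KiB chunks) a 16 KiB write at
+     * ro offset 0 lands as [0, 8192) on each of the two data shards. */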
+    sinfo.ro_range_to_shard_extent_map(off, len, bl, to_write);
+    debug(oid, "overlay_buffer", to_write, dpp);
+  }
+}
+
+void ECTransaction::Generate::appends_and_clone_ranges() {
+
+  extent_set clone_ranges = plan.will_write.get_extent_superset();
+  uint64_t clone_max = ECUtil::align_page_next(plan.orig_size);
+
+  if (op.delete_first) {
+    clone_max = 0;
+  } else if (op.truncate && op.truncate->first < clone_max) {
+    clone_max = ECUtil::align_page_next(op.truncate->first);
+  }
+  ECUtil::shard_extent_set_t cloneable_range(sinfo.get_k_plus_m());
+  sinfo.ro_size_to_read_mask(clone_max, cloneable_range);
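+  /* Nothing beyond clone_max existed before this transaction (or survives
+   * the leading truncate/delete), so there is nothing to preserve for
+   * rollback past that point. */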
+
+  if (plan.orig_size < plan.projected_size) {
+    ECUtil::shard_extent_set_t projected_cloneable_range(sinfo.get_k_plus_m());
+    sinfo.ro_size_to_read_mask(plan.projected_size, projected_cloneable_range);
+
+    for (auto &&[shard, eset]: projected_cloneable_range) {
+      uint64_t old_shard_size = 0;
+      if (cloneable_range.contains(shard)) {
+        old_shard_size = cloneable_range.at(shard).range_end();
       }
+      uint64_t new_shard_size = eset.range_end();
 
-      extent_map to_write;
-      auto pextiter = partial_extents.find(oid);
-      if (pextiter != partial_extents.end()) {
-       to_write = pextiter->second;
+      if (new_shard_size == old_shard_size) continue;
+
+      uint64_t write_end = 0;
+      if (plan.will_write.contains(shard)) {
+        write_end = plan.will_write.at(shard).range_end();
       }
 
-      vector<pair<uint64_t, uint64_t> > rollback_extents;
-      const uint64_t orig_size = hinfo->get_total_logical_size(sinfo);
-
-      uint64_t new_size = orig_size;
-      uint64_t append_after = new_size;
-      ldpp_dout(dpp, 20) << "generate_transactions: new_size start "
-        << new_size << dendl;
-      if (op.truncate && op.truncate->first < new_size) {
-       ceph_assert(!op.is_fresh_object());
-       new_size = sinfo.logical_to_next_stripe_offset(
-         op.truncate->first);
-       ldpp_dout(dpp, 20) << "generate_transactions: new_size truncate down "
-                          << new_size << dendl;
-       if (new_size != op.truncate->first) { // 0 the unaligned part
-         bufferlist bl;
-         bl.append_zero(new_size - op.truncate->first);
-         to_write.insert(
-           op.truncate->first,
-           bl.length(),
-           bl);
-         append_after = sinfo.logical_to_prev_stripe_offset(
-           op.truncate->first);
-       } else {
-         append_after = new_size;
-       }
-       to_write.erase(
-         new_size,
-         std::numeric_limits<uint64_t>::max() - new_size);
-
-       if (entry && !op.is_fresh_object()) {
-         uint64_t restore_from = sinfo.logical_to_prev_chunk_offset(
-           op.truncate->first);
-         uint64_t restore_len = sinfo.aligned_logical_offset_to_chunk_offset(
-           orig_size -
-           sinfo.logical_to_prev_stripe_offset(op.truncate->first));
-         ceph_assert(rollback_extents.empty());
-
-         ldpp_dout(dpp, 20) << "generate_transactions: saving extent "
-                            << make_pair(restore_from, restore_len)
-                            << dendl;
-         ldpp_dout(dpp, 20) << "generate_transactions: truncating to "
-                            << new_size
-                            << dendl;
-         rollback_extents.emplace_back(
-           make_pair(restore_from, restore_len));
-         for (auto &&st : *transactions) {
-           st.second.touch(
-             coll_t(spg_t(pgid, st.first)),
-             ghobject_t(oid, entry->version.version, st.first));
-           st.second.clone_range(
-             coll_t(spg_t(pgid, st.first)),
-             ghobject_t(oid, ghobject_t::NO_GEN, st.first),
-             ghobject_t(oid, entry->version.version, st.first),
-             restore_from,
-             restore_len,
-             restore_from);
-           
-         }
-       } else {
-         ldpp_dout(dpp, 20) << "generate_transactions: not saving extents"
-                                ", fresh object" << dendl;
-       }
-       for (auto &&st : *transactions) {
-         st.second.truncate(
-           coll_t(spg_t(pgid, st.first)),
-           ghobject_t(oid, ghobject_t::NO_GEN, st.first),
-           sinfo.aligned_logical_offset_to_chunk_offset(new_size));
-       }
+      if (write_end == new_shard_size) continue;
+
+      /* If code is executing here, it means that the written part of the
+       * shard does not reflect the size that EC believes the shard to be.
+       * This is not a problem for reads (they will be truncated), but it
+       * is a problem for writes, where future writes may attempt a clone
+       * off the end of the object.
+       * To solve this, we use an interesting quirk of "truncate" where we
+       * can actually truncate to a size larger than the object!
+       */
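+      /* e.g. (illustrative sizes) a shard whose written data ends at 8 KiB
+       * while EC projects a 12 KiB shard is truncated "up" to 12 KiB, so a
+       * later clone_range has a valid source extent. */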
+      if (transactions.contains(shard)) {
+        auto &t = transactions.at(shard);
+        t.truncate(
+          coll_t(spg_t(pgid, shard)),
+          ghobject_t(oid, ghobject_t::NO_GEN, shard),
+          new_shard_size);
       }
+      // Record this shard in written_shards: the truncate must complete
+      // for the write to be considered complete.
+      shard_written(shard);
+    }
+  }
 
-      uint32_t fadvise_flags = 0;
-      for (auto &&extent: op.buffer_updates) {
-       using BufferUpdate = PGTransaction::ObjectOperation::BufferUpdate;
-       bufferlist bl;
-       match(
-         extent.get_val(),
-         [&](const BufferUpdate::Write &op) {
-           bl = op.buffer;
-           fadvise_flags |= op.fadvise_flags;
-         },
-         [&](const BufferUpdate::Zero &) {
-           bl.append_zero(extent.get_len());
-         },
-         [&](const BufferUpdate::CloneRange &) {
-           ceph_assert(
-             0 ==
-             "CloneRange is not allowed, do_op should have returned ENOTSUPP");
-         });
-
-       uint64_t off = extent.get_off();
-       uint64_t len = extent.get_len();
-       uint64_t end = off + len;
-       ldpp_dout(dpp, 20) << "generate_transactions: adding buffer_update "
-                          << make_pair(off, len)
-                          << dendl;
-       ceph_assert(len > 0);
-       if (off > new_size) {
-         ceph_assert(off > append_after);
-         bl.prepend_zero(off - new_size);
-         len += off - new_size;
-         ldpp_dout(dpp, 20) << "generate_transactions: prepending zeroes to align "
-                            << off << "->" << new_size
-                            << dendl;
-         off = new_size;
-       }
-       if (!sinfo.logical_offset_is_stripe_aligned(end) && (end > append_after)) {
-         uint64_t aligned_end = sinfo.logical_to_next_stripe_offset(
-           end);
-         uint64_t tail = aligned_end - end;
-         bl.append_zero(tail);
-         ldpp_dout(dpp, 20) << "generate_transactions: appending zeroes to align end "
-                            << end << "->" << end+tail
-                            << ", len: " << len << "->" << len+tail
-                            << dendl;
-         end += tail;
-         len += tail;
-       }
-
-       to_write.insert(off, len, bl);
-       if (end > new_size)
-         new_size = end;
+  shard_id_set touched;
+
+  for (auto &[start, len]: clone_ranges) {
+    shard_id_set to_clone_shards;
+    uint64_t clone_end = 0;
+
+    for (auto &&[shard, eset]: plan.will_write) {
+      shard_written(shard);
+
+      // If no clonable range here, then ignore.
+      if (!cloneable_range.contains(shard)) continue;
+
+      // Do not clone off the end of the old range
+      uint64_t shard_clone_max = cloneable_range.at(shard).range_end();
+      uint64_t shard_end = start + len;
+      if (shard_end > shard_clone_max) shard_end = shard_clone_max;
+
+      // clone_end needs to be the biggest shard_end.
+      if (shard_end > clone_end) clone_end = shard_end;
+
+      // Ignore pure appends on this shard.
+      if (shard_end <= start) continue;
+
+      // Ignore clones that do not intersect with the write.
+      if (!eset.intersects(start, len)) continue;
+
+      // We need a clone...
+      if (transactions.contains(shard)) {
+        auto &t = transactions.at(shard);
+
+        // Only touch once.
+        if (!touched.contains(shard)) {
+          t.touch(
+            coll_t(spg_t(pgid, shard)),
+            ghobject_t(oid, entry->version.version, shard));
+          touched.insert(shard_id_t(shard));
+        }
+        t.clone_range(
+          coll_t(spg_t(pgid, shard)),
+          ghobject_t(oid, ghobject_t::NO_GEN, shard),
+          ghobject_t(oid, entry->version.version, shard),
+          start,
+          shard_end - start,
+          start);
+
+        // We have done a clone, so record it for rollback.
+        to_clone_shards.insert(shard);
       }
+    }
 
-      if (op.truncate &&
-         op.truncate->second > new_size) {
-       ceph_assert(op.truncate->second > append_after);
-       uint64_t truncate_to =
-         sinfo.logical_to_next_stripe_offset(
-           op.truncate->second);
-       uint64_t zeroes = truncate_to - new_size;
-       bufferlist bl;
-       bl.append_zero(zeroes);
-       to_write.insert(
-         new_size,
-         zeroes,
-         bl);
-       new_size = truncate_to;
-       ldpp_dout(dpp, 20) << "generate_transactions: truncating out to "
-                          << truncate_to
-                          << dendl;
+    if (!to_clone_shards.empty()) {
+      // It is more efficient to store an empty set to represent the common
+      // all-shards case.
+      if (to_clone_shards.size() == sinfo.get_k_plus_m()) {
+        to_clone_shards.clear();
+      }
+      if (clone_end > start) {
+        rollback_extents.emplace_back(make_pair(start, clone_end - start));
+        rollback_shards.emplace_back(to_clone_shards);
       }
+    }
+  }
+}
 
-      set<int> want;
-      for (unsigned i = 0; i < ecimpl->get_chunk_count(); ++i) {
-       want.insert(i);
+void ECTransaction::Generate::written_and_present_shards() {
+  if (entry) {
+    if (!rollback_extents.empty()) {
+      entry->mod_desc.rollback_extents(
+        entry->version.version,
+        rollback_extents,
+        ECUtil::align_page_next(plan.orig_size),
+        rollback_shards);
+    }
+    if (entry->written_shards.size() == sinfo.get_k_plus_m()) {
+      // More efficient to encode an empty set for all shards
+      entry->written_shards.clear();
+    }
+    // Calculate set of present shards
+    for (auto &&[shard, t]: transactions) {
+      entry->present_shards.insert(shard);
+    }
+    if (entry->present_shards.size() == sinfo.get_k_plus_m()) {
+      // More efficient to encode an empty set for all shards
+      entry->present_shards.clear();
+    }
+
+    // Update shard_versions in object_info to record which shards are being
+    // written
+    if (op.attr_updates.contains(OI_ATTR)) {
+      object_info_t oi(*(op.attr_updates[OI_ATTR]));
+      bool update = false;
+      if (entry->written_shards.empty()) {
+        if (!oi.shard_versions.empty()) {
+          oi.shard_versions.clear();
+          update = true;
+        }
+      } else {
+        for (shard_id_t shard; shard < sinfo.get_k_plus_m(); ++shard) {
+          if (sinfo.is_nonprimary_shard(shard)) {
+            if (entry->is_written_shard(shard) ||
+                plan.orig_size != plan.projected_size) {
+              // Written - erase per-shard version
+              if (oi.shard_versions.erase(shard)) {
+                update = true;
+              }
+            } else if (!oi.shard_versions.count(shard)) {
+              // Unwritten shard, previously up to date
+              oi.shard_versions[shard] = oi.prior_version;
+              update = true;
+            } else {
+              // Unwritten shard, already out of date
+            }
+          } else {
+            // Primary shards are always written and use oi.version
+          }
+        }
       }
-      auto to_overwrite = to_write.intersect(0, append_after);
-      ldpp_dout(dpp, 20) << "generate_transactions: to_overwrite: "
-                        << to_overwrite
-                        << dendl;
-      for (auto &&extent: to_overwrite) {
-       ceph_assert(extent.get_off() + extent.get_len() <= append_after);
-       ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_off()));
-       ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_len()));
-       if (entry) {
-         uint64_t restore_from = sinfo.aligned_logical_offset_to_chunk_offset(
-           extent.get_off());
-         uint64_t restore_len = sinfo.aligned_logical_offset_to_chunk_offset(
-           extent.get_len());
-         ldpp_dout(dpp, 20) << "generate_transactions: overwriting "
-                            << restore_from << "~" << restore_len
-                            << dendl;
-         if (rollback_extents.empty()) {
-           for (auto &&st : *transactions) {
-             st.second.touch(
-               coll_t(spg_t(pgid, st.first)),
-               ghobject_t(oid, entry->version.version, st.first));
-           }
-         }
-         rollback_extents.emplace_back(make_pair(restore_from, restore_len));
-         for (auto &&st : *transactions) {
-           st.second.clone_range(
-             coll_t(spg_t(pgid, st.first)),
-             ghobject_t(oid, ghobject_t::NO_GEN, st.first),
-             ghobject_t(oid, entry->version.version, st.first),
-             restore_from,
-             restore_len,
-             restore_from);
-         }
-       }
-       encode_and_write(
-         pgid,
-         oid,
-         sinfo,
-         ecimpl,
-         want,
-         extent.get_off(),
-         extent.get_val(),
-         fadvise_flags,
-         hinfo,
-         written,
-         transactions,
-         dpp);
+      if (update) {
+        bufferlist bl;
+        oi.encode(bl, osdmap->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+        op.attr_updates[OI_ATTR] = bl;
+        // Update cached OI
+        obc->obs.oi.shard_versions = oi.shard_versions;
       }
+      ldpp_dout(dpp, 20) << __func__ << ": shard_info: version=" << entry->version
+                         << " present=" << entry->present_shards
+                         << " written=" << entry->written_shards
+                         << " shard_versions=" << oi.shard_versions << dendl;
+    }
 
-      auto to_append = to_write.intersect(
-       append_after,
-       std::numeric_limits<uint64_t>::max() - append_after);
-      ldpp_dout(dpp, 20) << "generate_transactions: to_append: "
-                        << to_append
-                        << dendl;
-      for (auto &&extent: to_append) {
-       ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_off()));
-       ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_len()));
-       ldpp_dout(dpp, 20) << "generate_transactions: appending "
-                          << extent.get_off() << "~" << extent.get_len()
-                          << dendl;
-       encode_and_write(
-         pgid,
-         oid,
-         sinfo,
-         ecimpl,
-         want,
-         extent.get_off(),
-         extent.get_val(),
-         fadvise_flags,
-         hinfo,
-         written,
-         transactions,
-         dpp);
+    /* It is essential for rollback that every shard with a non-empty
+     * transaction is recorded in written_shards. In fact, written_shards
+     * contains every shard that would have a transaction if it were present.
+     * This is why we do not simply construct written_shards here.
+     */
+    for (auto &&[shard, t] : transactions) {
+      if (entry && (!t.empty() || !sinfo.is_nonprimary_shard(shard))) {
+        ceph_assert(entry->is_written_shard(shard));
       }
+    }
+  }
+}
 
-      ldpp_dout(dpp, 20) << "generate_transactions: " << oid
-                        << " resetting hinfo to logical size "
-                        << new_size
-                        << dendl;
-      if (!rollback_extents.empty() && entry) {
-       if (entry) {
-         ldpp_dout(dpp, 20) << "generate_transactions: " << oid
-                            << " marking rollback extents "
-                            << rollback_extents
-                            << dendl;
-         entry->mod_desc.rollback_extents(
-           entry->version.version, rollback_extents);
-       }
-       hinfo->set_total_chunk_size_clear_hash(
-         sinfo.aligned_logical_offset_to_chunk_offset(new_size));
-      } else {
-       ceph_assert(hinfo->get_total_logical_size(sinfo) == new_size);
+void ECTransaction::Generate::attr_updates() {
+  map<string, bufferlist, less<>> to_set;
+  for (auto &&[attr, update]: op.attr_updates) {
+    if (update) {
+      to_set[attr] = *(update);
+    } else {
+      all_shards_written();
+      for (auto &&[shard, t]: transactions) {
+        t.rmattr(
+          coll_t(spg_t(pgid, shard)),
+          ghobject_t(oid, ghobject_t::NO_GEN, shard),
+          attr);
+      }
+    }
+    if (obc) {
+      auto citer = obc->attr_cache.find(attr);
+      if (entry) {
+        if (citer != obc->attr_cache.end()) {
+          // won't overwrite anything we put in earlier
+          xattr_rollback.insert(
+            make_pair(
+              attr,
+              std::optional<bufferlist>(citer->second)));
+        } else {
+          // won't overwrite anything we put in earlier
+          xattr_rollback.insert(
+            make_pair(
+              attr,
+              std::nullopt));
+        }
       }
+      if (update) {
+        obc->attr_cache[attr] = *(update);
+      } else if (citer != obc->attr_cache.end()) {
+        obc->attr_cache.erase(citer);
+      }
+    } else {
+      ceph_assert(!entry);
+    }
+  }
+  all_shards_written();
+  for (auto &&[shard, t]: transactions) {
+    if (!sinfo.is_nonprimary_shard(shard)) {
+      // Primary shard - Update all attributes
+      t.setattrs(
+        coll_t(spg_t(pgid, shard)),
+        ghobject_t(oid, ghobject_t::NO_GEN, shard),
+        to_set);
+    } else if (entry->is_written_shard(shard)) {
+      // Written shard - Only update object_info attribute
+      t.setattr(
+        coll_t(spg_t(pgid, shard)),
+        ghobject_t(oid, ghobject_t::NO_GEN, shard),
+        OI_ATTR,
+        to_set[OI_ATTR]);
+    } // Else: Unwritten shard - Don't update any attributes
+  }
+  ceph_assert(!xattr_rollback.empty());
+}
 
-      if (entry && !to_append.empty()) {
-       ldpp_dout(dpp, 20) << "generate_transactions: marking append "
-                          << append_after
-                          << dendl;
-       entry->mod_desc.append(append_after);
+void ECTransaction::Generate::handle_deletes() {
+  bufferlist hbuf;
+  if (plan.hinfo) {
+    encode(*plan.hinfo, hbuf);
+    for (auto &&[shard, t]: transactions) {
+      if (!sinfo.is_nonprimary_shard(shard)) {
+        shard_written(shard);
+        t.setattr(
+          coll_t(spg_t(pgid, shard)),
+          ghobject_t(oid, ghobject_t::NO_GEN, shard),
+          ECUtil::get_hinfo_key(),
+          hbuf);
       }
+    }
+  }
+}
+
+void ECTransaction::generate_transactions(
+    PGTransaction *_t,
+    WritePlan &plans,
+    ErasureCodeInterfaceRef &ec_impl,
+    pg_t pgid,
+    const ECUtil::stripe_info_t &sinfo,
+    const map<hobject_t, ECUtil::shard_extent_map_t> &partial_extents,
+    vector<pg_log_entry_t> &entries,
+    map<hobject_t, ECUtil::shard_extent_map_t> *written_map,
+    shard_id_map<ObjectStore::Transaction> *transactions,
+    set<hobject_t> *temp_added,
+    set<hobject_t> *temp_removed,
+    DoutPrefixProvider *dpp,
+    const OSDMapRef &osdmap) {
+  ceph_assert(written_map);
+  ceph_assert(transactions);
+  ceph_assert(temp_added);
+  ceph_assert(temp_removed);
+  ceph_assert(_t);
+  auto &t = *_t;
+
+  map<hobject_t, pg_log_entry_t*> obj_to_log;
+  for (auto &&i: entries) {
+    obj_to_log.insert(make_pair(i.soid, &i));
+  }
 
-      if (!op.is_delete()) {
-       bufferlist hbuf;
-       encode(*hinfo, hbuf);
-       for (auto &&i : *transactions) {
-         i.second.setattr(
-           coll_t(spg_t(pgid, i.first)),
-           ghobject_t(oid, ghobject_t::NO_GEN, i.first),
-           ECUtil::get_hinfo_key(),
-           hbuf);
-       }
+  t.safe_create_traverse(
+    [&](pair<const hobject_t, PGTransaction::ObjectOperation> &opair) {
+      auto oid = opair.first;
+      PGTransaction::ObjectOperation& op = opair.second;
+      auto iter = obj_to_log.find(oid);
+      pg_log_entry_t *entry = iter != obj_to_log.end() ? iter->second : nullptr;
+      if (oid.is_temp()) {
+        if (op.is_fresh_object()) {
+          temp_added->insert(oid);
+        } else if (op.is_delete()) {
+          temp_removed->insert(oid);
+        }
       }
-    });
+
+      // Transactions must be submitted in the same order that they were planned in.
+      ceph_assert(!plans.plans.empty());
+      ECTransaction::WritePlanObj &plan = plans.plans.front();
+      ceph_assert(plan.hoid == oid);
+
+      Generate generate(t, ec_impl, pgid, sinfo, partial_extents, written_map,
+        *transactions, osdmap, oid, op, plan, dpp, entry);
+
+      plans.plans.pop_front();
+  });
 }
index 8ca7b4da53ed0d6a3a229236639f543aaa2e9eb1..64bb8eed1b4a566f7ec19232ab85cea15d86b40d 100644 (file)
  *
  */
 
-#ifndef ECTRANSACTION_H
-#define ECTRANSACTION_H
+#pragma once
 
+#include "common/dout.h"
 #include "ECUtil.h"
-#include "ExtentCache.h"
 #include "erasure-code/ErasureCodeInterface.h"
 #include "os/Transaction.h"
 #include "PGTransaction.h"
 
 namespace ECTransaction {
-  struct WritePlan {
-    bool invalidates_cache = false; // Yes, both are possible
-    std::map<hobject_t,extent_set> to_read;
-    std::map<hobject_t,extent_set> will_write; // superset of to_read
+class WritePlanObj {
+ public:
+  const hobject_t hoid;
+  std::optional<ECUtil::shard_extent_set_t> to_read;
+  ECUtil::shard_extent_set_t will_write;
+  const ECUtil::HashInfoRef hinfo;
+  const ECUtil::HashInfoRef shinfo;
+  const uint64_t orig_size;
+  uint64_t projected_size;
+  bool invalidates_cache;
+  bool do_parity_delta_write = false;
+
+  WritePlanObj(
+      const hobject_t &hoid,
+      const PGTransaction::ObjectOperation &op,
+      const ECUtil::stripe_info_t &sinfo,
+      const shard_id_set readable_shards,
+      const shard_id_set writable_shards,
+      const bool object_in_cache,
+      uint64_t orig_size,
+      const std::optional<object_info_t> &oi,
+      const std::optional<object_info_t> &soi,
+      const ECUtil::HashInfoRef &&hinfo,
+      const ECUtil::HashInfoRef &&shinfo,
+      const unsigned pdw_write_mode);
+
+  void print(std::ostream &os) const {
+    os << "to_read: " << to_read
+       << " will_write: " << will_write
+       << " hinfo: " << hinfo
+       << " shinfo: " << shinfo
+       << " orig_size: " << orig_size
+       << " projected_size: " << projected_size
+       << " invalidates_cache: " << invalidates_cache
+       << " do_pdw: " << do_parity_delta_write;
+  }
+};
 
-    std::map<hobject_t,ECUtil::HashInfoRef> hash_infos;
-  };
+struct WritePlan {
+  bool want_read;
+  std::list<WritePlanObj> plans;
+
+  void print(std::ostream &os) const {
+    os << " { plans : ";
+    bool first = true;
+    for (auto && p : plans) {
+      if (first) {
+        first = false;
+      } else {
+        os << ", ";
+      }
+      os << p;
+    }
+    os << "}";
+  }
+};
 
-  template <typename F>
-  WritePlan get_write_plan(
+class Generate {
+  PGTransaction &t;
+  const ErasureCodeInterfaceRef &ec_impl;
+  const pg_t &pgid;
+  const ECUtil::stripe_info_t &sinfo;
+  shard_id_map<ceph::os::Transaction> &transactions;
+  DoutPrefixProvider *dpp;
+  const OSDMapRef &osdmap;
+  pg_log_entry_t *entry;
+  const hobject_t &oid;
+  PGTransaction::ObjectOperation& op;
+  ObjectContextRef obc;
+  std::map<std::string, std::optional<bufferlist>> xattr_rollback;
+  const WritePlanObj &plan;
+  std::optional<ECUtil::shard_extent_map_t> read_sem;
+  ECUtil::shard_extent_map_t to_write;
+  std::vector<std::pair<uint64_t, uint64_t>> rollback_extents;
+  std::vector<shard_id_set> rollback_shards;
+  uint32_t fadvise_flags = 0;
+
+  void all_shards_written();
+  void shard_written(const shard_id_t shard);
+  void shards_written(const shard_id_set &shards);
+  void delete_first();
+  void zero_truncate_to_delete();
+  void process_init();
+  void encode_and_write();
+  void truncate();
+  void overlay_writes();
+  void appends_and_clone_ranges();
+  void written_and_present_shards();
+  void attr_updates();
+  void handle_deletes();
+
+ public:
+  Generate(PGTransaction &t,
+    ErasureCodeInterfaceRef &ec_impl, pg_t &pgid,
     const ECUtil::stripe_info_t &sinfo,
-    PGTransaction& t,
-    F &&get_hinfo,
-    DoutPrefixProvider *dpp) {
-    WritePlan plan;
-    t.safe_create_traverse(
-      [&](std::pair<const hobject_t, PGTransaction::ObjectOperation> &i) {
-        const auto& [obj, op] = i;
-       ECUtil::HashInfoRef hinfo = get_hinfo(obj);
-       plan.hash_infos[obj] = hinfo;
-
-       uint64_t projected_size =
-         hinfo->get_projected_total_logical_size(sinfo);
-
-       if (op.deletes_first()) {
-         ldpp_dout(dpp, 20) << __func__ << ": delete, setting projected size"
-                            << " to 0" << dendl;
-         projected_size = 0;
-       }
-
-       hobject_t source;
-       if (op.has_source(&source)) {
-         // typically clone or mv
-         plan.invalidates_cache = true;
-
-         ECUtil::HashInfoRef shinfo = get_hinfo(source);
-         projected_size = shinfo->get_projected_total_logical_size(sinfo);
-         plan.hash_infos[source] = shinfo;
-       }
-
-       auto &will_write = plan.will_write[obj];
-       if (op.truncate &&
-           op.truncate->first < projected_size) {
-         if (!(sinfo.logical_offset_is_stripe_aligned(
-                 op.truncate->first))) {
-           plan.to_read[obj].union_insert(
-             sinfo.logical_to_prev_stripe_offset(op.truncate->first),
-             sinfo.get_stripe_width());
-
-           ldpp_dout(dpp, 20) << __func__ << ": unaligned truncate" << dendl;
-
-           will_write.union_insert(
-             sinfo.logical_to_prev_stripe_offset(op.truncate->first),
-             sinfo.get_stripe_width());
-         }
-         projected_size = sinfo.logical_to_next_stripe_offset(
-           op.truncate->first);
-       }
-
-       extent_set raw_write_set;
-       for (auto &&extent: op.buffer_updates) {
-         using BufferUpdate = PGTransaction::ObjectOperation::BufferUpdate;
-         if (boost::get<BufferUpdate::CloneRange>(&(extent.get_val()))) {
-           ceph_assert(
-             0 ==
-             "CloneRange is not allowed, do_op should have returned ENOTSUPP");
-         }
-         raw_write_set.insert(extent.get_off(), extent.get_len());
-       }
-
-       auto orig_size = projected_size;
-       for (auto extent = raw_write_set.begin();
-            extent != raw_write_set.end();
-            ++extent) {
-         uint64_t head_start =
-           sinfo.logical_to_prev_stripe_offset(extent.get_start());
-         uint64_t head_finish =
-           sinfo.logical_to_next_stripe_offset(extent.get_start());
-         if (head_start > projected_size) {
-           head_start = projected_size;
-         }
-         if (head_start != head_finish &&
-             head_start < orig_size) {
-           ceph_assert(head_finish <= orig_size);
-           ceph_assert(head_finish - head_start == sinfo.get_stripe_width());
-           ldpp_dout(dpp, 20) << __func__ << ": reading partial head stripe "
-                              << head_start << "~" << sinfo.get_stripe_width()
-                              << dendl;
-           plan.to_read[obj].union_insert(
-             head_start, sinfo.get_stripe_width());
-         }
-
-         uint64_t tail_start =
-           sinfo.logical_to_prev_stripe_offset(
-             extent.get_start() + extent.get_len());
-         uint64_t tail_finish =
-           sinfo.logical_to_next_stripe_offset(
-             extent.get_start() + extent.get_len());
-         if (tail_start != tail_finish &&
-             (head_start == head_finish || tail_start != head_start) &&
-             tail_start < orig_size) {
-           ceph_assert(tail_finish <= orig_size);
-           ceph_assert(tail_finish - tail_start == sinfo.get_stripe_width());
-           ldpp_dout(dpp, 20) << __func__ << ": reading partial tail stripe "
-                              << tail_start << "~" << sinfo.get_stripe_width()
-                              << dendl;
-           plan.to_read[obj].union_insert(
-             tail_start, sinfo.get_stripe_width());
-         }
-
-         if (head_start != tail_finish) {
-           ceph_assert(
-             sinfo.logical_offset_is_stripe_aligned(
-               tail_finish - head_start)
-             );
-           will_write.union_insert(
-             head_start, tail_finish - head_start);
-           if (tail_finish > projected_size)
-             projected_size = tail_finish;
-         } else {
-           ceph_assert(tail_finish <= projected_size);
-         }
-       }
-
-       if (op.truncate && op.truncate->second > projected_size) {
-         uint64_t truncating_to =
-           sinfo.logical_to_next_stripe_offset(op.truncate->second);
-         ldpp_dout(dpp, 20) << __func__ << ": truncating out to "
-                            <<  truncating_to
-                            << dendl;
-         will_write.union_insert(projected_size,
-                                 truncating_to - projected_size);
-         projected_size = truncating_to;
-       }
-
-       ldpp_dout(dpp, 20) << __func__ << ": " << obj
-                          << " projected size "
-                          << projected_size
-                          << dendl;
-       hinfo->set_projected_total_logical_size(
-         sinfo,
-         projected_size);
-
-       /* validate post conditions:
-        * to_read should have an entry for `obj` if it isn't empty
-        * and if we are reading from `obj`, we can't be renaming or
-        * cloning it */
-       ceph_assert(plan.to_read.count(obj) == 0 ||
-              (!plan.to_read.at(obj).empty() &&
-               !i.second.has_source()));
-      });
-    return plan;
-  }
+    const std::map<hobject_t, ECUtil::shard_extent_map_t> &partial_extents,
+    std::map<hobject_t, ECUtil::shard_extent_map_t> *written_map,
+    shard_id_map<ceph::os::Transaction> &transactions,
+    const OSDMapRef &osdmap,
+    const hobject_t &oid, PGTransaction::ObjectOperation &op,
+    WritePlanObj &plan,
+    DoutPrefixProvider *dpp,
+    pg_log_entry_t *entry);
+};
 
-  void generate_transactions(
-    PGTransaction *_t,
+void generate_transactions(
+    PGTransaction *_t,
     WritePlan &plan,
-    ceph::ErasureCodeInterfaceRef &ecimpl,
+    ceph::ErasureCodeInterfaceRef &ec_impl,
     pg_t pgid,
     const ECUtil::stripe_info_t &sinfo,
-    const std::map<hobject_t,extent_map> &partial_extents,
+    const std::map<hobject_t, ECUtil::shard_extent_map_t> &partial_extents,
     std::vector<pg_log_entry_t> &entries,
-    std::map<hobject_t,extent_map> *written,
-    std::map<shard_id_t, ceph::os::Transaction> *transactions,
+    std::map<hobject_t, ECUtil::shard_extent_map_t> *written_map,
+    shard_id_map<ceph::os::Transaction> *transactions,
     std::set<hobject_t> *temp_added,
     std::set<hobject_t> *temp_removed,
     DoutPrefixProvider *dpp,
-    const ceph_release_t require_osd_release = ceph_release_t::unknown);
-};
-
-#endif
+    const OSDMapRef &osdmap
+  );
+}
index d24095809f5f56e48032d0757871d68ab3460426..9baf951e8b6abe4abd9219fc3a029df211073ece 100644 (file)
 #include "global/global_context.h"
 #include "include/encoding.h"
 
-/* This file is soon going to be replaced (before next release), so we are going
- * to simply ignore all deprecated warnings.
- * */
-IGNORE_DEPRECATED
-
 using namespace std;
 using ceph::bufferlist;
 using ceph::ErasureCodeInterfaceRef;
 using ceph::Formatter;
 
-std::pair<uint64_t, uint64_t> ECUtil::stripe_info_t::chunk_aligned_offset_len_to_chunk(
-  std::pair<uint64_t, uint64_t> in) const {
-  pair<uint64_t, uint64_t> tmp = offset_len_to_stripe_bounds(in);
+std::pair<uint64_t, uint64_t>
+ECUtil::stripe_info_t::chunk_aligned_ro_range_to_shard_ro_range(
+    uint64_t _off, uint64_t _len) const {
+  auto [off, len] = ro_offset_len_to_stripe_ro_offset_len(_off, _len);
   return std::make_pair(
-    chunk_aligned_logical_offset_to_chunk_offset(tmp.first),
-    chunk_aligned_logical_size_to_chunk_size(tmp.second));
+    chunk_aligned_ro_offset_to_chunk_offset(off),
+    chunk_aligned_ro_length_to_shard_length(len));
 }
 
-int ECUtil::decode(
-  const stripe_info_t &sinfo,
-  ErasureCodeInterfaceRef &ec_impl,
-  const set<int> want_to_read,
-  map<int, bufferlist> &to_decode,
-  bufferlist *out)
-{
-  ceph_assert(to_decode.size());
+/*
+ASCII Art describing the various variables in the following function:
+                    start    end
+                      |       |
+                      |       |
+                      |       |
+           - - - - - -v- -+---+-----------+ - - - - - -
+                 start_adj|   |           |      ^
+to_read.offset - ->-------+   |           | chunk_size
+                  |           |           |      v
+           +------+ - - - - - + - - - - - + - - - - - -
+           |                  |           |
+           |                  v           |
+           |              - - - - +-------+
+           |               end_adj|
+           |              +-------+
+           |              |       |
+           +--------------+       |
+                          |       |
+                          | shard |
+
+Given an offset and size, this adds to a vector of extents describing the
+minimal IO ranges on each shard.  If passed, this method will also populate
+a superset of all extents required.
+ */
+void ECUtil::stripe_info_t::ro_range_to_shards(
+    uint64_t ro_offset,
+    uint64_t ro_size,
+    shard_extent_set_t *shard_extent_set,
+    extent_set *extent_superset,
+    buffer::list *bl,
+    shard_extent_map_t *shard_extent_map) const {
+  // Some of the maths below assumes a non-zero size.
+  if (ro_size == 0) {
+    return;
+  }
+
+  uint64_t k = get_k();
+
+  // The aim is to minimise non-power-of-two divisions (chunk_size is assumed
+  // to be a power of two), so these should be the only such divisions.
+  uint64_t begin_div = ro_offset / stripe_width;
+  uint64_t end_div = (ro_offset + ro_size + stripe_width - 1) / stripe_width - 1;
+  uint64_t start = begin_div * chunk_size;
+  uint64_t end = end_div * chunk_size;
+
+  uint64_t start_shard = (ro_offset - begin_div * stripe_width) / chunk_size;
+  uint64_t chunk_count = (ro_offset + ro_size + chunk_size - 1) / chunk_size -
+      ro_offset / chunk_size;
+
+  // The end_shard needs a modulus to calculate the actual shard, however
+  // it is convenient to store it like this for the loop.
+  auto end_shard = start_shard + std::min(chunk_count, k);
+
+  // The last shard is the raw shard index which contains the last chunk.
+  // Is it possible to calculate this without the + and %?
+  uint64_t last_shard = (start_shard + chunk_count - 1) % k;
+
+  uint64_t buffer_shard_start_offset = 0;
+
+  for (auto i = start_shard; i < end_shard; i++) {
+    raw_shard_id_t raw_shard(i >= k ? i - k : i);
+
+    // Adjust the start and end blocks if needed.
+    uint64_t start_adj = 0;
+    uint64_t end_adj = 0;
+
+    if (raw_shard < start_shard) {
+      // Shards before the start, must start on the next chunk.
+      start_adj = chunk_size;
+    } else if (int(raw_shard) == int(start_shard)) {
+      // The start shard itself needs to be moved a partial-chunk forward.
+      start_adj = ro_offset % chunk_size;
+    }
+
+    // The end is similar to the start, but the end must be rounded up.
+    if (raw_shard < last_shard) {
+      end_adj = chunk_size;
+    } else if (int(raw_shard) == int(last_shard)) {
+      end_adj = (ro_offset + ro_size - 1) % chunk_size + 1;
+    }
+
+    shard_id_t shard = get_shard(raw_shard);
+
+    uint64_t off = start + start_adj;
+    uint64_t len = end + end_adj - start - start_adj;
+    if (shard_extent_set) {
+      (*shard_extent_set)[shard].union_insert(off, len);
+    }
+
+    if (extent_superset) {
+      extent_superset->union_insert(off, len);
+    }
 
-  uint64_t total_data_size = to_decode.begin()->second.length();
-  ceph_assert(total_data_size % sinfo.get_chunk_size() == 0);
+    if (shard_extent_map) {
+      ceph_assert(bl);
+      buffer::list shard_bl;
 
-  ceph_assert(out);
-  ceph_assert(out->length() == 0);
+      uint64_t bl_offset = buffer_shard_start_offset;
 
-  for (map<int, bufferlist>::iterator i = to_decode.begin();
-       i != to_decode.end();
-       ++i) {
-    ceph_assert(i->second.length() == total_data_size);
+      // Start with any partial chunks.
+      if (chunk_size != start_adj) {
+        shard_bl.substr_of(*bl, bl_offset,
+                           min(static_cast<uint64_t>(bl->length()) - bl_offset,
+                               chunk_size - start_adj));
+        buffer_shard_start_offset += chunk_size - start_adj;
+        bl_offset += chunk_size - start_adj + (k - 1) * chunk_size;
+      } else {
+        buffer_shard_start_offset += chunk_size;
+      }
+      while (bl_offset < bl->length()) {
+        buffer::list tmp;
+        tmp.substr_of(*bl, bl_offset,
+                      min(chunk_size, bl->length() - bl_offset));
+        shard_bl.append(tmp);
+        bl_offset += k * chunk_size;
+      }
+      shard_extent_map->insert_in_shard(shard, off, shard_bl, ro_offset,
+                                          ro_offset + ro_size);
+    }
   }
+}
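To make the mapping above concrete, here is a minimal standalone sketch (not the Ceph API) of the same chunk-to-shard arithmetic, assuming k=4 data shards and a 4 KiB chunk (so a 16 KiB stripe); the chunk_mapping from raw to physical shards is ignored:

  #include <algorithm>
  #include <cstdint>
  #include <cstdio>

  int main() {
    const uint64_t k = 4, chunk = 4096;            // stripe_width = 16384
    const uint64_t ro_off = 6144, ro_len = 10240;  // rados range [6144, 16384)

    // Rados chunk c lands on raw shard (c % k) at shard offset (c / k) * chunk.
    for (uint64_t c = ro_off / chunk; c <= (ro_off + ro_len - 1) / chunk; c++) {
      uint64_t lo = std::max(ro_off, c * chunk);             // clip to request
      uint64_t hi = std::min(ro_off + ro_len, (c + 1) * chunk);
      std::printf("shard %llu: %llu~%llu\n",
                  (unsigned long long)(c % k),
                  (unsigned long long)((c / k) * chunk + lo - c * chunk),
                  (unsigned long long)(hi - lo));
    }
    // Prints shard 1: 2048~2048, shard 2: 0~4096, shard 3: 0~4096.
  }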
 
-  if (total_data_size == 0)
-    return 0;
+void ECUtil::stripe_info_t::trim_shard_extent_set_for_ro_offset(
+    uint64_t ro_offset,
+    shard_extent_set_t &shard_extent_set) const {
+  /* If the offset is within the first shard, then the remaining shards are
+   * not written, so we do not need to generate zeros for them either. */
+  int ro_offset_shard = (ro_offset / chunk_size) % k;
+  if (ro_offset_shard == 0) {
+    uint64_t shard_offset = ro_offset_to_shard_offset(
+      ro_offset, raw_shard_id_t(0));
+    for (auto &&iter = shard_extent_set.begin();
+         iter != shard_extent_set.end();) {
+      iter->second.erase_after(align_page_next(shard_offset));
+      if (iter->second.empty()) iter = shard_extent_set.erase(iter);
+      else ++iter;
+    }
+  }
+}
+
+void ECUtil::stripe_info_t::ro_size_to_stripe_aligned_read_mask(
+    uint64_t ro_size,
+    shard_extent_set_t &shard_extent_set) const {
+  ro_range_to_shard_extent_set_with_parity(
+    0, ro_offset_to_next_stripe_ro_offset(ro_size), shard_extent_set);
+  trim_shard_extent_set_for_ro_offset(ro_size, shard_extent_set);
+}
+
+void ECUtil::stripe_info_t::ro_size_to_read_mask(
+    uint64_t ro_size,
+    shard_extent_set_t &shard_extent_set) const {
+  ro_range_to_shard_extent_set_with_parity(0, align_page_next(ro_size),
+                                           shard_extent_set);
+}
+
+void ECUtil::stripe_info_t::ro_size_to_zero_mask(
+    uint64_t ro_size,
+    shard_extent_set_t &shard_extent_set) const {
+  // There should never be any zero padding on the parity.
+  ro_range_to_shard_extent_set(align_page_next(ro_size),
+                               ro_offset_to_next_stripe_ro_offset(ro_size) -
+                               align_page_next(ro_size),
+                               shard_extent_set);
+  trim_shard_extent_set_for_ro_offset(ro_size, shard_extent_set);
+}
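A quick worked example of how the read and zero masks relate, as a sketch assuming a 4 KiB page, a 16 KiB stripe and a logical object size of 6000 bytes; align_up stands in for align_page_next and the stripe rounding:

  #include <cstdint>
  #include <cstdio>

  static uint64_t align_up(uint64_t v, uint64_t a) { return (v + a - 1) / a * a; }

  int main() {
    const uint64_t page = 4096, stripe_width = 16384;  // assumed geometry
    const uint64_t ro_size = 6000;                     // logical object size

    // Bytes up to the next page boundary are read; the remainder of the last
    // stripe is supplied as zeros on the data shards only.
    uint64_t read_to = align_up(ro_size, page);          // 8192
    uint64_t zero_to = align_up(ro_size, stripe_width);  // 16384
    std::printf("read mask 0~%llu, zero mask %llu~%llu\n",
                (unsigned long long)read_to,
                (unsigned long long)read_to,
                (unsigned long long)(zero_to - read_to));
  }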
+
+namespace ECUtil {
+void shard_extent_map_t::erase_after_ro_offset(uint64_t ro_offset) {
+  /* Ignore the null case */
+  if (ro_offset >= ro_end) {
+    return;
+  }
 
-  for (uint64_t i = 0; i < total_data_size; i += sinfo.get_chunk_size()) {
-    map<int, bufferlist> chunks;
-    for (map<int, bufferlist>::iterator j = to_decode.begin();
-        j != to_decode.end();
-        ++j) {
-      chunks[j->first].substr_of(j->second, i, sinfo.get_chunk_size());
+  shard_extent_set_t ro_to_erase(sinfo->get_k_plus_m());
+  sinfo->ro_range_to_shard_extent_set(ro_offset, ro_end - ro_start,
+                                      ro_to_erase);
+  for (auto &&[shard, eset] : ro_to_erase) {
+    if (extent_maps.contains(shard)) {
+      extent_maps[shard].erase(eset.range_start(), eset.range_end());
+    }
+
+    // If the result is empty, delete the extent map.
+    if (extent_maps[shard].empty()) {
+      extent_maps.erase(shard);
     }
-    bufferlist bl;
-    int r = ec_impl->decode_concat(want_to_read, chunks, &bl);
-    ceph_assert(r == 0);
-    ceph_assert(bl.length() % sinfo.get_chunk_size() == 0);
-    out->claim_append(bl);
   }
-  return 0;
+
+  compute_ro_range();
+}
+
+shard_extent_map_t shard_extent_map_t::intersect_ro_range(
+    uint64_t ro_offset,
+    uint64_t ro_length) const {
+  // Optimise (common) use case where the overlap is everything
+  if (ro_offset <= ro_start &&
+    ro_offset + ro_length >= ro_end) {
+    return *this;
+  }
+
+  // Optimise (common) use cases where the overlap is nothing
+  if (ro_offset >= ro_end ||
+    ro_offset + ro_length <= ro_start) {
+    return shard_extent_map_t(sinfo);
+  }
+
+  shard_extent_set_t ro_to_intersect(sinfo->get_k_plus_m());
+  sinfo->ro_range_to_shard_extent_set(ro_offset, ro_length, ro_to_intersect);
+
+  return intersect(ro_to_intersect);
+}
+
+shard_extent_map_t shard_extent_map_t::intersect(
+    optional<shard_extent_set_t> const &other) const {
+  if (!other) {
+    return shard_extent_map_t(sinfo);
+  }
+
+  return intersect(*other);
+}
+
+shard_extent_map_t shard_extent_map_t::intersect(
+    shard_extent_set_t const &other) const {
+  shard_extent_map_t out(sinfo);
+  out.ro_end = 0;
+  out.end_offset = 0;
+
+  for (auto &&[shard, this_eset] : other) {
+    if (extent_maps.contains(shard)) {
+      extent_map tmp;
+      extent_set eset;
+      extent_maps.at(shard).to_interval_set(eset);
+      eset.intersection_of(this_eset);
+
+      for (auto [offset, len] : eset) {
+        bufferlist bl;
+        get_buffer(shard, offset, len, bl);
+        tmp.insert(offset, len, bl);
+      }
+      if (!tmp.empty()) {
+        uint64_t range_start = tmp.get_start_off();
+        uint64_t range_end = tmp.get_end_off();
+
+        out.start_offset = min(out.start_offset, range_start);
+        out.end_offset = max(out.end_offset, range_end);
+
+        raw_shard_id_t raw_shard = sinfo->get_raw_shard(shard);
+        if (raw_shard < sinfo->get_k()) {
+          out.ro_start = std::min(out.ro_start,
+                                  calc_ro_offset(raw_shard, range_start));
+          out.ro_end = std::max(out.ro_end, calc_ro_end(raw_shard, range_end));
+        }
+
+        out.extent_maps.emplace(shard, std::move(tmp));
+      }
+    }
+  }
+
+  if (out.ro_start == invalid_offset) {
+    out.ro_end = out.end_offset = invalid_offset;
+  }
+
+  return out;
+}
+
+void shard_extent_map_t::insert(shard_extent_map_t const &other) {
+  for (auto &&[shard, emap] : other.extent_maps) {
+    if (!extent_maps.contains(shard)) {
+      extent_maps.emplace(shard, emap);
+    } else {
+      extent_maps[shard].insert(emap);
+    }
+  }
+
+  if (ro_start == invalid_offset || other.ro_start < ro_start) {
+    ro_start = other.ro_start;
+  }
+  if (ro_end == invalid_offset || other.ro_end > ro_end) {
+    ro_end = other.ro_end;
+  }
+  if (start_offset == invalid_offset || other.start_offset < start_offset) {
+    start_offset = other.start_offset;
+  }
+  if (end_offset == invalid_offset || other.end_offset > end_offset) {
+    end_offset = other.end_offset;
+  }
+}
+
+uint64_t shard_extent_map_t::size() {
+  uint64_t size = 0;
+  for (auto &i : extent_maps) {
+    for (auto &j : i.second) {
+      size += j.get_len();
+    }
+  }
+
+  return size;
+}
+
+void shard_extent_map_t::clear() {
+  ro_start = ro_end = start_offset = end_offset = invalid_offset;
+  extent_maps.clear();
+}
+
+void shard_extent_map_t::deep_copy(shard_extent_map_t const &other) {
+  for (auto &&[shard, emap] : other.extent_maps) {
+    for (auto iter : emap) {
+      uint64_t off = iter.get_off();
+      uint64_t len = iter.get_len();
+      bufferlist bl = iter.get_val();
+      bl.rebuild();
+      extent_maps[shard].insert(off, len, bl);
+    }
+  }
+}
+
+/* Insert a buffer for a particular shard.
+ * NOTE: DO NOT CALL sinfo->get_min_want_shards()
+ */
+void shard_extent_map_t::insert_in_shard(shard_id_t shard, uint64_t off,
+                                         const buffer::list &bl) {
+  if (bl.length() == 0) {
+    return;
+  }
+
+  extent_maps[shard].insert(off, bl.length(), bl);
+  raw_shard_id_t raw_shard = sinfo->get_raw_shard(shard);
+
+  if (raw_shard >= sinfo->get_k()) {
+    return;
+  }
+
+  uint64_t new_start = calc_ro_offset(sinfo->get_raw_shard(shard), off);
+  uint64_t new_end =
+      calc_ro_end(sinfo->get_raw_shard(shard), off + bl.length());
+  if (empty()) {
+    ro_start = new_start;
+    ro_end = new_end;
+    start_offset = off;
+    end_offset = off + bl.length();
+  } else {
+    ro_start = min(ro_start, new_start);
+    ro_end = max(ro_end, new_end);
+    start_offset = min(start_offset, off);
+    end_offset = max(end_offset, off + bl.length());
+  }
 }
 
-int ECUtil::decode(
-  const stripe_info_t &sinfo,
-  ErasureCodeInterfaceRef &ec_impl,
-  map<int, bufferlist> &to_decode,
-  map<int, bufferlist*> &out) {
+/* Insert a buffer for a particular shard.
+ * If the client knows the new start and end, use this interface to improve
+ * performance.
+ */
+void shard_extent_map_t::insert_in_shard(shard_id_t shard, uint64_t off,
+                                         const buffer::list &bl,
+                                         uint64_t new_start, uint64_t new_end) {
+  if (bl.length() == 0) {
+    return;
+  }
+
+  extent_maps[shard].insert(off, bl.length(), bl);
+  if (empty()) {
+    ro_start = new_start;
+    ro_end = new_end;
+    start_offset = off;
+    end_offset = off + bl.length();
+  } else {
+    ro_start = min(ro_start, new_start);
+    ro_end = max(ro_end, new_end);
+    start_offset = min(start_offset, off);
+    end_offset = max(end_offset, off + bl.length());
+  }
+}
 
-  ceph_assert(to_decode.size());
+/* Insert a region of zeros in the rados object address space. */
+void shard_extent_map_t::insert_ro_zero_buffer(uint64_t ro_offset,
+                                               uint64_t ro_length) {
+  buffer::list zero_buffer;
+  zero_buffer.append_zero(ro_length);
+  sinfo->ro_range_to_shard_extent_map(ro_offset, ro_length, zero_buffer, *this);
+}
 
-  for (auto &&i : to_decode) {
-    if(i.second.length() == 0)
-      return 0;
+/* Append zeros to the extent maps, such that all bytes from the current end
+ * of the rados object range to the specified offset are zero.  Note that the
+ * byte at ro_offset does NOT get populated, so that this works as an
+ * addition to length.
+ */
+void shard_extent_map_t::append_zeros_to_ro_offset(uint64_t ro_offset) {
+  uint64_t _ro_end = ro_end == invalid_offset ? 0 : ro_end;
+  if (ro_offset <= _ro_end) {
+    return;
   }
+  uint64_t append_offset = _ro_end;
+  uint64_t append_length = ro_offset - _ro_end;
+  insert_ro_zero_buffer(append_offset, append_length);
+}
 
-  set<int> need;
-  for (map<int, bufferlist*>::iterator i = out.begin();
-       i != out.end();
-       ++i) {
-    ceph_assert(i->second);
-    ceph_assert(i->second->length() == 0);
-    need.insert(i->first);
+/* This method rearranges buffers from a rados object extent map into a shard
+ * extent map.  Note that it is a simple transformation, it does NOT perform
+ * any encoding of parity shards.
+ */
+void shard_extent_map_t::insert_ro_extent_map(const extent_map &host_extent_map) {
+  for (auto &&range = host_extent_map.begin();
+       range != host_extent_map.end();
+       ++range) {
+    buffer::list bl = range.get_val();
+    sinfo->ro_range_to_shard_extent_map(
+      range.get_off(),
+      range.get_len(),
+      bl,
+      *this);
   }
+}
 
-  set<int> avail;
-  for (auto &&i : to_decode) {
-    ceph_assert(i.second.length() != 0);
-    avail.insert(i.first);
+extent_set shard_extent_map_t::get_extent_superset() const {
+  extent_set eset;
+
+  for (auto &&[shard, emap] : extent_maps) {
+    emap.to_interval_set(eset);
   }
 
-  map<int, vector<pair<int, int>>> min;
-  int r = ec_impl->minimum_to_decode(need, avail, &min);
-  ceph_assert(r == 0);
+  return eset;
+}
 
-  int chunks_count = 0;
-  int repair_data_per_chunk = 0;
-  int subchunk_size = sinfo.get_chunk_size()/ec_impl->get_sub_chunk_count();
+void shard_extent_map_t::insert_parity_buffers() {
+  extent_set encode_set = get_extent_superset();
 
-  for (auto &&i : to_decode) {
-    auto found = min.find(i.first);
-    if (found != min.end()) {
-      int repair_subchunk_count = 0;
-      for (auto& subchunks : min[i.first]) {
-        repair_subchunk_count += subchunks.second;
+  /* Invent buffers for the parity coding, if they were not provided.
+   * e.g. appends will not provide parity buffers.
+   * We should EITHER have no buffers, or have the right buffers.
+   */
+  for (raw_shard_id_t raw_shard(sinfo->get_k());
+       raw_shard < sinfo->get_k_plus_m(); ++raw_shard) {
+    shard_id_t shard = sinfo->get_shard(raw_shard);
+
+    for (auto &&[offset, length] : encode_set) {
+      /* No need to recreate buffers we already have */
+      if (extent_maps.contains(shard)) {
+        extent_map emap = extent_maps.at(shard);
+        if (emap.contains(offset, length))
+          continue;
       }
-      repair_data_per_chunk = repair_subchunk_count * subchunk_size;
-      chunks_count = (int)i.second.length() / repair_data_per_chunk;
-      break;
+      bufferlist bl;
+      bl.push_back(buffer::create_aligned(length, CEPH_PAGE_SIZE));
+      extent_maps[shard].insert(offset, length, bl);
     }
   }
+}
+
+slice_iterator<shard_id_t, extent_map> shard_extent_map_t::begin_slice_iterator(
+    const shard_id_set &out) {
+  return slice_iterator(extent_maps, out);
+}
+
+/* Encode parity chunks, using the encode_chunks interface into the
+ * erasure coding. This generates all parity using full stripe writes.
+ */
+int shard_extent_map_t::_encode(const ErasureCodeInterfaceRef &ec_impl) {
+  shard_id_set out_set = sinfo->get_parity_shards();
+  bool rebuild_req = false;
+
+  for (auto iter = begin_slice_iterator(out_set); !iter.is_end(); ++iter) {
+    if (!iter.is_page_aligned()) {
+      rebuild_req = true;
+      break;
+    }
 
-  for (int i = 0; i < chunks_count; i++) {
-    map<int, bufferlist> chunks;
-    for (auto j = to_decode.begin();
-        j != to_decode.end();
-        ++j) {
-      chunks[j->first].substr_of(j->second, 
-                                 i*repair_data_per_chunk, 
-                                 repair_data_per_chunk);
+    shard_id_map<bufferptr> &in = iter.get_in_bufferptrs();
+    shard_id_map<bufferptr> &out = iter.get_out_bufferptrs();
+
+    if (int ret = ec_impl->encode_chunks(in, out)) {
+      return ret;
     }
-    map<int, bufferlist> out_bls;
-    r = ec_impl->decode(need, chunks, &out_bls, sinfo.get_chunk_size());
-    ceph_assert(r == 0);
-    for (auto j = out.begin(); j != out.end(); ++j) {
-      ceph_assert(out_bls.count(j->first));
-      ceph_assert(out_bls[j->first].length() == sinfo.get_chunk_size());
-      j->second->claim_append(out_bls[j->first]);
+  }
+
+  if (rebuild_req) {
+    pad_and_rebuild_to_page_align();
+    return _encode(ec_impl);
+  }
+
+  return 0;
+}
+
+/* Encode parity chunks, using the encode_chunks interface into the
+ * erasure coding. This generates all parity using full stripe writes.
+ */
+int shard_extent_map_t::encode(const ErasureCodeInterfaceRef &ec_impl,
+                               const HashInfoRef &hinfo,
+                               uint64_t before_ro_size) {
+  int r = _encode(ec_impl);
+
+  if (!r && hinfo && ro_start >= before_ro_size) {
+    /* NEEDS REVIEW:  The following calculates the new hinfo CRCs. This is
+     *                 currently considering ALL the buffers, including the
+     *                 parity buffers.  Is this really right?
+     *                 Also, does this really belong here? It's convenient
+     *                 because we have just built the buffer list...
+     */
+    shard_id_set full_set;
+    full_set.insert_range(shard_id_t(0), sinfo->get_k_plus_m());
+    for (auto iter = begin_slice_iterator(full_set); !iter.is_end(); ++iter) {
+      ceph_assert(ro_start == before_ro_size);
+      hinfo->append(iter.get_offset(), iter.get_in_bufferptrs());
     }
   }
-  for (auto &&i : out) {
-    ceph_assert(i.second->length() == chunks_count * sinfo.get_chunk_size());
+
+  return r;
+}
+
+/* Encode parity chunks, using the parity delta write interfaces on plugins
+ * that support them.
+ */
+int shard_extent_map_t::encode_parity_delta(
+    const ErasureCodeInterfaceRef &ec_impl,
+    shard_extent_map_t &old_sem) {
+  shard_id_set out_set = sinfo->get_parity_shards();
+
+  pad_and_rebuild_to_page_align();
+  old_sem.pad_and_rebuild_to_page_align();
+
+  for (auto data_shard : sinfo->get_data_shards()) {
+    shard_extent_map_t s(sinfo);
+    if (!contains_shard(data_shard)) {
+      continue;
+    }
+    s.extent_maps[shard_id_t(0)] = old_sem.extent_maps[data_shard];
+    s.extent_maps[shard_id_t(1)] = extent_maps[data_shard];
+    for (shard_id_t parity_shard : sinfo->get_parity_shards()) {
+      if (extent_maps.contains(parity_shard)) {
+        s.extent_maps[parity_shard] = extent_maps[parity_shard];
+      }
+    }
+
+    s.compute_ro_range();
+
+    for (auto iter = s.begin_slice_iterator(out_set); !iter.is_end(); ++iter) {
+      ceph_assert(iter.is_page_aligned());
+      shard_id_map<bufferptr> &data_shards = iter.get_in_bufferptrs();
+      shard_id_map<bufferptr> &parity_shards = iter.get_out_bufferptrs();
+
+      unsigned int size = iter.get_length();
+      ceph_assert(size % 4096 == 0);
+      ceph_assert(size > 0);
+      bufferptr delta = buffer::create_aligned(size, CEPH_PAGE_SIZE);
+
+      if (data_shards[shard_id_t(0)].length() != 0 &&
+          data_shards[shard_id_t(1)].length() != 0) {
+        ec_impl->encode_delta(data_shards[shard_id_t(0)],
+                              data_shards[shard_id_t(1)], &delta);
+        shard_id_map<bufferptr> in(sinfo->get_k_plus_m());
+        in.emplace(data_shard, delta);
+        ec_impl->apply_delta(in, parity_shards);
+      }
+    }
   }
+
+  compute_ro_range();
   return 0;
 }
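For intuition, the delta path avoids reading and re-encoding the whole stripe: for XOR-style parity, the update reduces to folding (old xor new) of the changed data chunk into the parity. A toy standalone sketch with k=2, m=1 and plain XOR, not the plugin interface:

  #include <array>
  #include <cstddef>
  #include <cstdio>

  int main() {
    // Toy k=2, m=1 XOR parity: p = d0 ^ d1.
    std::array<unsigned char, 4> d0{1, 2, 3, 4}, d1{5, 6, 7, 8}, p{};
    for (std::size_t i = 0; i < p.size(); i++) p[i] = d0[i] ^ d1[i];

    // Overwrite d0 only: delta = old ^ new, then fold the delta into parity.
    std::array<unsigned char, 4> d0_new{9, 9, 9, 9};
    for (std::size_t i = 0; i < p.size(); i++) {
      unsigned char delta = d0[i] ^ d0_new[i];  // encode_delta analogue
      p[i] ^= delta;                            // apply_delta analogue
    }

    // Parity now matches a full re-encode of (d0_new, d1).
    for (std::size_t i = 0; i < p.size(); i++)
      std::printf("%d ", p[i] == (unsigned char)(d0_new[i] ^ d1[i]));
  }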
 
-int ECUtil::encode(
-  const stripe_info_t &sinfo,
-  ErasureCodeInterfaceRef &ec_impl,
-  bufferlist &in,
-  const set<int> &want,
-  map<int, bufferlist> *out) {
+void shard_extent_map_t::pad_on_shards(const shard_extent_set_t &pad_to,
+                                       const shard_id_set &shards) {
+  for (auto &shard : shards) {
+    if (!pad_to.contains(shard)) {
+      continue;
+    }
+    for (auto &[off, length] : pad_to.at(shard)) {
+      bufferlist bl;
+      bl.push_back(buffer::create_aligned(length, CEPH_PAGE_SIZE));
+      insert_in_shard(shard, off, bl);
+    }
+  }
+}
 
-  uint64_t logical_size = in.length();
+void shard_extent_map_t::pad_on_shards(const extent_set &pad_to,
+                                       const shard_id_set &shards) {
+  for (auto &shard : shards) {
+    for (auto &[off, length] : pad_to) {
+      bufferlist bl;
+      bl.push_back(buffer::create_aligned(length, CEPH_PAGE_SIZE));
+      insert_in_shard(shard, off, bl);
+    }
+  }
+}
+
+/* Trim to the specified extent set. Note that this will panic if the shard
+ * extent set does not contain the extents described in trim_to.
+ */
+void shard_extent_map_t::trim(const shard_extent_set_t &trim_to) {
 
-  ceph_assert(logical_size % sinfo.get_stripe_width() == 0);
-  ceph_assert(out);
-  ceph_assert(out->empty());
+  // Erase any shards missing from trim_to
+  for (auto iter = extent_maps.begin(); iter != extent_maps.end();) {
+    auto && [shard, emap] = *iter;
+    if (!trim_to.contains(shard)) {
+      iter = extent_maps.erase(iter);
+    } else {
+      ++iter;
+    }
+  }
+  for (auto &&[shard, want_eset] : trim_to) {
+    extent_set tmp;
+    ceph_assert(extent_maps.contains(shard));
+    extent_map &emap = extent_maps.at(shard);
+    emap.to_interval_set(tmp);
+    ceph_assert(tmp.contains(want_eset));
+
+    // Now trim to what was requested.
+    if (tmp.size() != want_eset.size()) {
+      tmp.subtract(want_eset);
+      for (auto [off, len] : tmp) {
+        emap.erase(off, len);
+      }
+    }
+  }
 
-  if (logical_size == 0)
+  compute_ro_range();
+}
+
+int shard_extent_map_t::decode(const ErasureCodeInterfaceRef &ec_impl,
+                               const shard_extent_set_t &want,
+                               uint64_t object_size) {
+  shard_id_set want_set;
+  shard_id_set have_set;
+  want.populate_shard_id_set(want_set);
+  extent_maps.populate_bitset_set(have_set);
+
+  shard_id_set need_set = shard_id_set::difference(want_set, have_set);
+
+  /* Optimise the no-op */
+  if (need_set.empty()) {
     return 0;
+  }
+
+  if (add_zero_padding_for_decode(object_size, need_set)) {
+    // We added some zero buffers, which means our have and need sets may change.
+    extent_maps.populate_bitset_set(have_set);
+    need_set = shard_id_set::difference(want_set, have_set);
+  }
+
+  shard_id_set decode_set = shard_id_set::intersection(need_set, sinfo->get_data_shards());
+  shard_id_set encode_set = shard_id_set::intersection(need_set, sinfo->get_parity_shards());
+  int r = 0;
+  if (!decode_set.empty()) {
+    pad_on_shards(want, decode_set);
+    /* If we are going to be encoding, we need to make sure all the necessary
+     * shards are decoded. The get_min_available functions should have already
+     * worked out what needs to be read for this.
+     */
+    extent_set decode_for_parity;
+    for (auto shard : encode_set) {
+      decode_for_parity.insert(want.at(shard));
+    }
+    pad_on_shards(decode_for_parity, decode_set);
+    r = _decode(ec_impl, want_set, decode_set);
+  }
+  if (!r && !encode_set.empty()) {
+    pad_on_shards(want, encode_set);
+    r = _encode(ec_impl);
+  }
+
+  // If we failed to decode, then bail out, or the trimming below might fail.
+  if (r) {
+    return r;
+  }
+
+  /* Some of the above can invent buffers. There are some edge cases whereby
+   * they can invent buffers outside the want extent_set which are actually
+   * invalid.  So here, we trim off those buffers.
+   */
+  trim(want);
+
+  return 0;
+}
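The want/have/need bookkeeping above is plain set arithmetic. A standalone sketch with std::set standing in for shard_id_set; the shard numbers are illustrative only:

  #include <algorithm>
  #include <cstdio>
  #include <iterator>
  #include <set>

  int main() {
    // Shards the caller wants vs shards whose buffers are already present.
    std::set<int> want{0, 1, 2, 4};  // 4 is a parity shard in this sketch
    std::set<int> have{0, 2};

    std::set<int> need;
    std::set_difference(want.begin(), want.end(), have.begin(), have.end(),
                        std::inserter(need, need.begin()));

    // need = {1, 4}: shard 1 must be decoded, shard 4 re-encoded.
    for (int s : need) std::printf("%d ", s);
  }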
+
+int shard_extent_map_t::_decode(const ErasureCodeInterfaceRef &ec_impl,
+                                const shard_id_set &want_set,
+                                const shard_id_set &need_set) {
+  bool rebuild_req = false;
+  for (auto iter = begin_slice_iterator(need_set); !iter.is_end(); ++iter) {
+    if (!iter.is_page_aligned()) {
+      rebuild_req = true;
+      break;
+    }
+    shard_id_map<bufferptr> &in = iter.get_in_bufferptrs();
+    shard_id_map<bufferptr> &out = iter.get_out_bufferptrs();
 
-  for (uint64_t i = 0; i < logical_size; i += sinfo.get_stripe_width()) {
-    map<int, bufferlist> encoded;
-    bufferlist buf;
-    buf.substr_of(in, i, sinfo.get_stripe_width());
-    int r = ec_impl->encode(want, buf, &encoded);
-    ceph_assert(r == 0);
-    for (map<int, bufferlist>::iterator i = encoded.begin();
-        i != encoded.end();
-        ++i) {
-      ceph_assert(i->second.length() == sinfo.get_chunk_size());
-      (*out)[i->first].claim_append(i->second);
+    if (int ret = ec_impl->decode_chunks(want_set, in, out)) {
+      return ret;
     }
   }
 
-  for (map<int, bufferlist>::iterator i = out->begin();
-       i != out->end();
-       ++i) {
-    ceph_assert(i->second.length() % sinfo.get_chunk_size() == 0);
-    ceph_assert(
-      sinfo.aligned_chunk_offset_to_logical_offset(i->second.length()) ==
-      logical_size);
+  if (rebuild_req) {
+    pad_and_rebuild_to_page_align();
+    return _decode(ec_impl, want_set, need_set);
   }
+
+  compute_ro_range();
+
   return 0;
 }
 
+void shard_extent_map_t::pad_and_rebuild_to_page_align() {
+  bool resized = false;
+  for (auto &&[shard, emap] : extent_maps) {
+    extent_map aligned;
+
+    // Inserting while iterating is not supported in extent maps, so make the
+    // iterated-over emap const to help defend against mistakes.
+    const extent_map &cemap = emap;
+    for (auto i = cemap.begin(); i != cemap.end(); ++i) {
+      bool resized_i = false;
+      bufferlist bl = i.get_val();
+      uint64_t start = i.get_off();
+      uint64_t end = start + i.get_len();
+
+      if ((start & ~CEPH_PAGE_MASK) != 0) {
+        bl.prepend_zero(start - (start & CEPH_PAGE_MASK));
+        start = start & CEPH_PAGE_MASK;
+        resized_i = true;
+      }
+      if ((end & ~CEPH_PAGE_MASK) != 0) {
+        bl.append_zero((end & CEPH_PAGE_MASK) + CEPH_PAGE_SIZE - end);
+        end = (end & CEPH_PAGE_MASK) + CEPH_PAGE_SIZE;
+        resized_i = true;
+      }
+
+      // Perhaps we can get away without page aligning here and only SIMD
+      // align. However, typical workloads are actually page aligned already,
+      // so this should not cause problems on any sensible workload.
+      if (bl.rebuild_aligned_size_and_memory(bl.length(), CEPH_PAGE_SIZE) ||
+        resized_i) {
+        // We are not permitted to modify the emap while iterating.
+        aligned.insert(start, end - start, bl);
+      }
+      if (resized_i) resized = true;
+    }
+    emap.insert(aligned);
+  }
+
+  if (resized) {
+    compute_ro_range();
+  }
+}
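The alignment arithmetic above rounds each buffer's start down and its end up to page boundaries. A standalone sketch of the same mask logic, assuming a 4 KiB page, with MASK playing the role of CEPH_PAGE_MASK:

  #include <cstdint>
  #include <cstdio>

  int main() {
    const uint64_t PAGE = 4096, MASK = ~(PAGE - 1);  // CEPH_PAGE_MASK analogue
    uint64_t start = 5000, end = 9000;

    if (start & ~MASK) start &= MASK;            // round start down: 4096
    if (end & ~MASK) end = (end & MASK) + PAGE;  // round end up: 12288

    std::printf("%llu~%llu\n", (unsigned long long)start,
                (unsigned long long)(end - start));  // 4096~8192
  }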
+
+shard_extent_map_t shard_extent_map_t::slice_map(
+    uint64_t offset, uint64_t length) const {
+  // Range entirely contains offset - this will be common for small IO.
+  if (offset <= start_offset && offset + length >= end_offset) return *this;
+
+  shard_extent_map_t slice(sinfo);
+
+  // Null cases just generate an empty map.
+  if (offset >= end_offset) {
+    return slice;
+  }
+  if (offset + length <= start_offset) {
+    return slice;
+  }
+
+  slice.end_offset = slice.ro_end = 0;
+
+  for (auto &&[shard, emap] : extent_maps) {
+    extent_map iemap = emap.intersect(offset, length);
+
+    if (!iemap.empty()) {
+      slice.start_offset = min(slice.start_offset, iemap.get_start_off());
+      slice.end_offset = max(slice.end_offset, iemap.get_end_off());
+      slice.ro_start = min(slice.ro_start,
+                           calc_ro_offset(sinfo->get_raw_shard(shard),
+                                          iemap.get_start_off()));
+      slice.ro_end = max(slice.ro_end,
+                         calc_ro_end(sinfo->get_raw_shard(shard),
+                                     iemap.get_end_off()));
+      slice.extent_maps.emplace(shard, iemap);
+    }
+  }
+
+  if (slice.end_offset == 0) {
+    slice.end_offset = slice.ro_end = invalid_offset;
+  }
+
+  return slice;
+}
+
+void shard_extent_map_t::get_buffer(shard_id_t shard, uint64_t offset,
+                                    uint64_t length,
+                                    buffer::list &append_to) const {
+  const extent_map &emap = extent_maps.at(shard);
+  auto &&range = emap.get_lower_range(offset, length);
+
+  if (range == emap.end() || !emap.contains(offset, length)) {
+    return;
+  }
+
+  if (range.get_len() == length) {
+    buffer::list bl = range.get_val();
+    // This should be asserted on extent map insertion.
+    ceph_assert(bl.length() == length);
+    append_to.append(bl);
+  } else {
+    buffer::list bl;
+    bl.substr_of(range.get_val(), offset - range.get_off(), length);
+    append_to.append(bl);
+  }
+}
+
+void shard_extent_map_t::get_shard_first_buffer(shard_id_t shard,
+                                                buffer::list &append_to) const {
+  if (!extent_maps.contains(shard)) {
+    return;
+  }
+  const extent_map &emap = extent_maps.at(shard);
+  auto range = emap.begin();
+  if (range == emap.end()) {
+    return;
+  }
+
+  append_to.append(range.get_val());
+}
+
+uint64_t shard_extent_map_t::get_shard_first_offset(shard_id_t shard) const {
+  if (!extent_maps.contains(shard)) {
+    return invalid_offset;
+  }
+  const extent_map &emap = extent_maps.at(shard);
+  auto range = emap.begin();
+  if (range == emap.end()) {
+    return invalid_offset;
+  }
+
+  return range.get_off();
+}
+
+void shard_extent_map_t::zero_pad(shard_extent_set_t const &pad_to) {
+  for (auto &&[shard, eset] : pad_to) {
+    for (auto &&[off, len] : eset) {
+      zero_pad(shard, off, len);
+    }
+  }
+}
+
+void shard_extent_map_t::zero_pad(shard_id_t shard, uint64_t offset,
+                                  uint64_t length) {
+  const extent_map &emap = extent_maps[shard];
+  if (emap.contains(offset, length)) {
+    return;
+  }
+
+  extent_set required;
+  required.union_insert(offset, length);
+  extent_set not_required;
+  emap.to_interval_set(not_required);
+  required.subtract(not_required);
+
+  for (auto [z_off, z_len] : required) {
+    bufferlist zeros;
+    zeros.append_zero(z_len);
+    insert_in_shard(shard, z_off, zeros);
+  }
+}
+
+void shard_extent_map_t::pad_with_other(shard_extent_set_t const &pad_to,
+                                        shard_extent_map_t const &other) {
+  for (auto &&[shard, eset] : pad_to) {
+    for (auto &&[off, len] : eset) {
+      pad_with_other(shard, off, len, other);
+    }
+  }
+}
+
+void shard_extent_map_t::pad_with_other(shard_id_t shard, uint64_t offset,
+                                        uint64_t length,
+                                        shard_extent_map_t const &other) {
+  const extent_map &emap = extent_maps[shard];
+  if (emap.contains(offset, length)) return;
+
+  extent_set required;
+  required.union_insert(offset, length);
+  extent_set not_required;
+  emap.to_interval_set(not_required);
+  required.subtract(not_required);
+
+  for (auto [z_off, z_len] : required) {
+    bufferlist bl;
+    other.get_buffer(shard, z_off, z_len, bl);
+    bl.rebuild();
+    insert_in_shard(shard, z_off, bl);
+  }
+}
+
+ECUtil::shard_extent_set_t shard_extent_map_t::get_extent_set() {
+  shard_extent_set_t shard_eset(sinfo->get_k_plus_m());
+  for (auto &&[shard, emap] : extent_maps) {
+    emap.to_interval_set(shard_eset[shard]);
+  }
+
+  return shard_eset;
+}
+
+void shard_extent_map_t::erase_shard(shard_id_t shard) {
+  if (extent_maps.erase(shard)) {
+    compute_ro_range();
+  }
+}
+
+bufferlist shard_extent_map_t::get_ro_buffer(
+    uint64_t ro_offset,
+    uint64_t ro_length) const {
+  bufferlist bl;
+  uint64_t chunk_size = sinfo->get_chunk_size();
+  uint64_t stripe_size = sinfo->get_stripe_width();
+  int data_chunk_count = sinfo->get_k();
+
+  pair read_pair(ro_offset, ro_length);
+  auto chunk_aligned_read = sinfo->ro_range_to_chunk_ro_range(read_pair);
+
+  raw_shard_id_t raw_shard((ro_offset / chunk_size) % data_chunk_count);
+
+  for (uint64_t chunk_offset = chunk_aligned_read.first;
+       chunk_offset < chunk_aligned_read.first + chunk_aligned_read.second;
+       chunk_offset += chunk_size, ++raw_shard) {
+    if (int(raw_shard) == data_chunk_count) {
+      raw_shard = 0;
+    }
+
+    uint64_t sub_chunk_offset = std::max(chunk_offset, ro_offset);
+    uint64_t sub_chunk_shard_offset = (chunk_offset / stripe_size) * chunk_size
+        + sub_chunk_offset - chunk_offset;
+    uint64_t sub_chunk_len = std::min(ro_offset + ro_length,
+                                      chunk_offset + chunk_size) -
+        sub_chunk_offset;
+
+    get_buffer(sinfo->get_shard(raw_shard), sub_chunk_shard_offset,
+               sub_chunk_len, bl);
+  }
+  return bl;
+}
+
+bufferlist shard_extent_map_t::get_ro_buffer() const {
+  return get_ro_buffer(ro_start, ro_end - ro_start);
+}
+
+std::string shard_extent_map_t::debug_string(uint64_t interval, uint64_t offset) const {
+  std::stringstream str;
+  str << "shard_extent_map_t: " << *this << " bufs: [";
+
+  bool s_comma = false;
+  for (auto &&[shard, emap] : get_extent_maps()) {
+    if (s_comma) str << ", ";
+    s_comma = true;
+    str << shard << ": [";
+
+    bool comma = false;
+    for (auto &&extent : emap) {
+      bufferlist bl = extent.get_val();
+      char *buf = bl.c_str();
+      for (uint64_t i = 0; i < extent.get_len(); i += interval) {
+        int *seed = (int*)&buf[i + offset];
+        if (comma) str << ", ";
+        str << (i + extent.get_off()) << ":" << std::to_string(*seed);
+        comma = true;
+      }
+    }
+    str << "]";
+  }
+  str << "]";
+  return str.str();
+}
+
+void shard_extent_map_t::erase_stripe(uint64_t offset, uint64_t length) {
+  for (auto iter = extent_maps.begin(); iter != extent_maps.end();) {
+    auto &&[shard, emap] = *iter;
+    emap.erase(offset, length);
+    if (emap.empty()) {
+      iter = extent_maps.erase(iter);
+    } else {
+      ++iter;
+    }
+  }
+  compute_ro_range();
+}
+
+bool shard_extent_map_t::contains(shard_id_t shard) const {
+  return extent_maps.contains(shard);
+}
+
+bool shard_extent_map_t::contains(optional<shard_extent_set_t> const &other) const {
+  if (!other) {
+    return true;
+  }
+
+  return contains(*other);
+}
+
+bool shard_extent_map_t::contains(shard_extent_set_t const &other) const {
+  for (auto &&[shard, other_eset] : other) {
+    if (!extent_maps.contains(shard)) {
+      return false;
+    }
+
+    extent_set eset;
+    extent_maps.at(shard).to_interval_set(eset);
+
+    if (!eset.contains(other_eset)) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+void shard_extent_set_t::subtract(const shard_extent_set_t &other) {
+  for (auto &&[shard, eset] : other) {
+    if (!contains(shard)) {
+      continue;
+    }
+
+    at(shard).subtract(eset);
+    if (at(shard).empty()) {
+      erase(shard);
+    }
+  }
+}
+
+void shard_extent_set_t::intersection_of(const shard_extent_set_t &other) {
+  for (shard_id_t s; s < map.max_size(); ++s) {
+    if (!map.contains(s) || !other.contains(s)) {
+      erase(s);
+    } else {
+      at(s).intersection_of(other.at(s));
+      if (at(s).empty()) {
+        erase(s);
+      }
+    }
+  }
+}
+
+void shard_extent_set_t::insert(const shard_extent_set_t &other) {
+  for (auto &&[shard, eset] : other) {
+    map[shard].union_of(eset);
+  }
+}
+}
+
 void ECUtil::HashInfo::append(uint64_t old_size,
-                             map<int, bufferlist> &to_append) {
+                              shard_id_map<bufferptr> &to_append) {
   ceph_assert(old_size == total_chunk_size);
   uint64_t size_to_append = to_append.begin()->second.length();
   if (has_chunk_hash()) {
     ceph_assert(to_append.size() == cumulative_shard_hashes.size());
-    for (map<int, bufferlist>::iterator i = to_append.begin();
-        i != to_append.end();
-        ++i) {
-      ceph_assert(size_to_append == i->second.length());
-      ceph_assert((unsigned)i->first < cumulative_shard_hashes.size());
-      uint32_t new_hash = i->second.crc32c(cumulative_shard_hashes[i->first]);
-      cumulative_shard_hashes[i->first] = new_hash;
+    for (auto &&[shard, ptr] : to_append) {
+      ceph_assert(size_to_append == ptr.length());
+      ceph_assert(shard < static_cast<int>(cumulative_shard_hashes.size()));
+      cumulative_shard_hashes[int(shard)] =
+          ceph_crc32c(cumulative_shard_hashes[int(shard)],
+                      (unsigned char*)ptr.c_str(), ptr.length());
     }
   }
   total_chunk_size += size_to_append;
 }
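A minimal sketch of how the cumulative per-shard hash chains, using Ceph's ceph_crc32c from include/crc32c.h just as the appended code does; append_chunk is a hypothetical helper name, not part of ECUtil:

  #include "include/crc32c.h"

  // Each shard keeps one running CRC; the previous cumulative value seeds
  // the next call, so the hash covers every chunk ever appended, in order.
  uint32_t append_chunk(uint32_t shard_hash, const unsigned char *chunk,
                        unsigned len) {
    return ceph_crc32c(shard_hash, chunk, len);
  }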
 
-void ECUtil::HashInfo::encode(bufferlist &bl) const
-{
+void ECUtil::HashInfo::encode(bufferlist &bl) const {
   ENCODE_START(1, 1, bl);
   encode(total_chunk_size, bl);
   encode(cumulative_shard_hashes, bl);
   ENCODE_FINISH(bl);
 }
 
-void ECUtil::HashInfo::decode(bufferlist::const_iterator &bl)
-{
+void ECUtil::HashInfo::decode(bufferlist::const_iterator &bl) {
   DECODE_START(1, bl);
   decode(total_chunk_size, bl);
   decode(cumulative_shard_hashes, bl);
-  projected_total_chunk_size = total_chunk_size;
   DECODE_FINISH(bl);
 }
 
-void ECUtil::HashInfo::dump(Formatter *f) const
-{
+void ECUtil::HashInfo::dump(Formatter *f) const {
   f->dump_unsigned("total_chunk_size", total_chunk_size);
   f->open_array_section("cumulative_shard_hashes");
   for (unsigned i = 0; i != cumulative_shard_hashes.size(); ++i) {
@@ -230,25 +1115,61 @@ void ECUtil::HashInfo::dump(Formatter *f) const
 }
 
 namespace ECUtil {
-std::ostream& operator<<(std::ostream& out, const HashInfo& hi)
-{
+std::ostream &operator<<(std::ostream &out, const HashInfo &hi) {
   ostringstream hashes;
-  for (auto hash: hi.cumulative_shard_hashes)
+  for (auto hash : hi.cumulative_shard_hashes) {
     hashes << " " << hex << hash;
+  }
   return out << "tcs=" << hi.total_chunk_size << hashes.str();
 }
+
+std::ostream &operator<<(std::ostream &out, const shard_extent_map_t &rhs) {
+  // sinfo not thought to be needed for debug, as it is constant.
+  return out << "shard_extent_map: ({" << rhs.ro_start << "~"
+      << rhs.ro_end << "}, maps=" << rhs.extent_maps << ")";
 }
 
-void ECUtil::HashInfo::generate_test_instances(list<HashInfo*>& o)
-{
+std::ostream &operator<<(std::ostream &out, const log_entry_t &rhs) {
+  switch (rhs.event) {
+  case READ_REQUEST: out << "READ_REQUEST";
+    break;
+  case READ_DONE: out << "READ_DONE";
+    break;
+  case INJECT_EIO: out << "INJECT_EIO";
+    break;
+  case CANCELLED: out << "CANCELLED";
+    break;
+  case ERROR: out << "ERROR";
+    break;
+  case REQUEST_MISSING: out << "REQUEST_MISSING";
+    break;
+  case COMPLETE_ERROR: out << "COMPLETE_ERROR";
+    break;
+  case ERROR_CLEAR: out << "ERROR_CLEAR";
+    break;
+  case COMPLETE: out << "COMPLETE";
+    break;
+  default:
+    ceph_assert(false);
+  }
+  return out << "[" << rhs.shard << "]->" << rhs.io << "\n";
+}
+}
+
+void ECUtil::HashInfo::generate_test_instances(list<HashInfo*> &o) {
   o.push_back(new HashInfo(3));
   {
     bufferlist bl;
     bl.append_zero(20);
-    map<int, bufferlist> buffers;
-    buffers[0] = bl;
-    buffers[1] = bl;
-    buffers[2] = bl;
+
+    bufferptr bp = bl.begin().get_current_ptr();
+
+    // We don't have k+m here, but this is not performance-critical, so
+    // create an oversized map.
+    shard_id_map<bufferptr> buffers(128);
+    buffers[shard_id_t(0)] = bp;
+    buffers[shard_id_t(1)] = bp;
+    buffers[shard_id_t(2)] = bp;
     o.back()->append(0, buffers);
     o.back()->append(20, buffers);
   }
@@ -257,14 +1178,10 @@ void ECUtil::HashInfo::generate_test_instances(list<HashInfo*>& o)
 
 const string HINFO_KEY = "hinfo_key";
 
-bool ECUtil::is_hinfo_key_string(const string &key)
-{
+bool ECUtil::is_hinfo_key_string(const string &key) {
   return key == HINFO_KEY;
 }
 
-const string &ECUtil::get_hinfo_key()
-{
+const string &ECUtil::get_hinfo_key() {
   return HINFO_KEY;
 }
-
-END_IGNORE_DEPRECATED
index 65bdcb51994009596d8ee40fe6c8c7f3e04b5a62..f0abbb0cd3ca8e7888c23dc2de364390295d4aef 100644 (file)
 #include "include/buffer_fwd.h"
 #include "include/ceph_assert.h"
 #include "include/encoding.h"
-#include "common/Formatter.h"
+#include "common/interval_map.h"
+#include "common/mini_flat_map.h"
+
+#include "osd_types.h"
+
+/// If someone wants these types, but not ExtentCache, move to another file
+struct bl_split_merge {
+  ceph::buffer::list split(
+      uint64_t offset,
+      uint64_t length,
+      ceph::buffer::list &bl) const {
+    ceph::buffer::list out;
+    out.substr_of(bl, offset, length);
+    return out;
+  }
+
+  bool can_merge(const ceph::buffer::list &left, const ceph::buffer::list &right) const {
+    return true;
+  }
+
+  ceph::buffer::list merge(ceph::buffer::list &&left, ceph::buffer::list &&right) const {
+    ceph::buffer::list bl{std::move(left)};
+    bl.claim_append(right);
+    return bl;
+  }
+
+  uint64_t length(const ceph::buffer::list &b) const { return b.length(); }
+};
+
+using extent_set = interval_set<uint64_t, boost::container::flat_map, false>;
+using extent_map = interval_map<uint64_t, ceph::buffer::list, bl_split_merge,
+                                boost::container::flat_map>;
+
+/* Slice iterator.  This looks for contiguous buffers which are common
+ * across all shards in the out_set.
+ *
+ * It is a template, but essentially:
+ * K must be a key suitable for a mini_flat_map.
+ * T must be either an extent map or a reference to an extent map.
+ */
+template <typename K, typename T>
+class slice_iterator {
+  mini_flat_map<K, T> &input;
+  uint64_t offset = std::numeric_limits<uint64_t>::max();
+  uint64_t length = std::numeric_limits<uint64_t>::max();
+  uint64_t start = std::numeric_limits<uint64_t>::max();
+  uint64_t end = std::numeric_limits<uint64_t>::max();
+  shard_id_map<std::pair<extent_map::const_iterator,
+                         bufferlist::const_iterator>> iters;
+  shard_id_map<bufferptr> in;
+  shard_id_map<bufferptr> out;
+  const shard_id_set &out_set;
+
+  void advance() {
+    in.clear();
+    out.clear();
+    offset = start;
+    end = std::numeric_limits<uint64_t>::max();
+
+    if (iters.empty()) {
+      return;
+    }
+
+    // First, find where the current slice must end: the nearest buffer
+    // boundary or gap across all the shard iterators.
+    for (auto &&[shard, iter_pair] : iters) {
+      auto &&[emap_iter, bl_iter] = iter_pair;
+      uint64_t iter_offset = emap_iter.get_off() + bl_iter.get_off();
+      ceph_assert(iter_offset >= start);
+      // If this iterator is after the current offset, then we will ignore
+      // it for this buffer ptr. The end must move to or before this point.
+      if (iter_offset > start && iter_offset < end) {
+        end = iter_offset;
+        continue;
+      }
+
+      uint64_t iter_end = iter_offset + bl_iter.get_current_ptr().length();
+      if (iter_end < end) {
+        end = iter_end;
+      }
+    }
+
+    for (auto &&iter = iters.begin(); iter != iters.end();) {
+      auto shard = iter->first;
+      auto &&[emap_iter, bl_iter] = iter->second;
+      uint64_t iter_offset = emap_iter.get_off() + bl_iter.get_off();
+      bool erase = false;
+
+      // Only consume buffers that begin exactly at the current slice start;
+      // later buffers will be picked up by a subsequent slice.
+      if (iter_offset == start) {
+
+        // Create a new buffer pointer for the result. We don't want the client
+        // manipulating the ptr.
+        if (out_set.contains(shard)) {
+          out.emplace(
+            shard, bufferptr(bl_iter.get_current_ptr(), 0, end - start));
+        } else {
+          in.emplace(
+            shard, bufferptr(bl_iter.get_current_ptr(), 0, end - start));
+        }
+
+        // Now we need to move on the iterators.
+        bl_iter += end - start;
+
+        // If we have reached the end of the extent, we need to move that on too.
+        if (bl_iter == emap_iter.get_val().end()) {
+          ++emap_iter;
+          if (emap_iter == input[shard].end()) {
+            erase = true;
+          } else {
+            iters.at(shard).second = emap_iter.get_val().begin();
+          }
+        }
+      } else
+        ceph_assert(iter_offset > start);
+
+      if (erase) {
+        iter = iters.erase(iter);
+      } else {
+        ++iter;
+      }
+    }
+
+    // We can now move the offset on.
+    length = end - start;
+    start = end;
+
+    /* This can arise in two ways:
+     * 1. We can generate an empty buffer out of a gap, so just skip over.
+     * 2. Only the inputs contain any interesting data.  We don't need
+     *    to perform a decode/encode on a slice in that case.
+     */
+    if (out.empty()) {
+      advance();
+    }
+  }
+
+public:
+  slice_iterator(mini_flat_map<K, T> &_input, const shard_id_set &out_set) :
+    input(_input),
+    iters(input.max_size()),
+    in(input.max_size()),
+    out(input.max_size()),
+    out_set(out_set) {
+    for (auto &&[shard, emap] : input) {
+      auto emap_iter = emap.begin();
+      auto bl_iter = emap_iter.get_val().begin();
+
+      // Read the offset before the iterators are moved into the map.
+      if (emap_iter.get_off() < start) {
+        start = emap_iter.get_off();
+      }
+
+      iters.emplace(shard, std::make_pair(std::move(emap_iter),
+                                          std::move(bl_iter)));
+    }
+
+    advance();
+  }
+
+  shard_id_map<bufferptr> &get_in_bufferptrs() { return in; }
+  shard_id_map<bufferptr> &get_out_bufferptrs() { return out; }
+  uint64_t get_offset() const { return offset; }
+  uint64_t get_length() const { return length; }
+  bool is_end() const { return in.empty() && out.empty(); }
+
+  bool is_page_aligned() const {
+    for (auto &&[_, ptr] : in) {
+      uintptr_t p = (uintptr_t)ptr.c_str();
+      if (p & ~CEPH_PAGE_MASK) return false;
+      if ((p + ptr.length()) & ~CEPH_PAGE_MASK) return false;
+    }
+
+    for (auto &&[_, ptr] : out) {
+      uintptr_t p = (uintptr_t)ptr.c_str();
+      if (p & ~CEPH_PAGE_MASK) return false;
+      if ((p + ptr.length()) & ~CEPH_PAGE_MASK) return false;
+    }
+
+    return true;
+  }
+
+  slice_iterator &operator++() {
+    advance();
+    return *this;
+  }
+};
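A minimal usage fragment, mirroring how _encode() in ECUtil.cc drives this iterator; sem, parity and ec_impl are assumed to be a populated shard_extent_map_t, the parity shard_id_set and an erasure code plugin:

  for (auto iter = sem.begin_slice_iterator(parity); !iter.is_end(); ++iter) {
    if (!iter.is_page_aligned()) {
      break;  // callers pad-and-rebuild, then restart, as _encode() does
    }
    shard_id_map<bufferptr> &in = iter.get_in_bufferptrs();    // data slices
    shard_id_map<bufferptr> &out = iter.get_out_bufferptrs();  // parity slices
    // Each pass yields one contiguous, equal-length slice across all shards.
    ec_impl->encode_chunks(in, out);
  }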
+
+// Setting to 1 turns on very large amounts of level 0 debug containing the
+// contents of buffers. Even on level 20 this is not really wanted.
+#define DEBUG_EC_BUFFERS 1
 
 namespace ECUtil {
+class shard_extent_map_t;
+
+struct shard_extent_set_t {
+  // The following boilerplate is just to make this look like a map.
+  shard_id_map<extent_set> map;
+
+  shard_extent_set_t(short max_shards) : map(max_shards) {}
+
+  bool contains(shard_id_t shard) const { return map.contains(shard); }
+  bool empty() const { return map.empty(); }
+  void swap(shard_extent_set_t &other) noexcept { map.swap(other.map); }
+  void clear() { map.clear(); }
+  auto erase(shard_id_t shard) { return map.erase(shard); }
+
+  auto erase(shard_id_map<extent_set>::iterator &iter) {
+    return map.erase(iter);
+  }
+
+  void erase_stripe(uint64_t offset, uint64_t length) {
+    for (auto it = map.begin(); it != map.end();) {
+      it->second.erase(offset, length);
+      if (it->second.empty()) it = map.erase(it);
+      else ++it;
+    }
+  }
+
+  auto begin() const { return map.cbegin(); }
+  auto begin() { return map.begin(); }
+  auto end() const { return map.cend(); }
+  auto end() { return map.end(); }
+
+  void emplace(shard_id_t shard, extent_set &&set) {
+    map.emplace(shard, std::move(set));
+  }
+
+  size_t shard_count() const { return map.size(); }
+  extent_set &at(shard_id_t shard) { return map.at(shard); }
+  const extent_set &at(shard_id_t shard) const { return map.at(shard); }
+
+  extent_set get(shard_id_t shard) const {
+    if (!map.contains(shard)) {
+      return extent_set();
+    }
+    return at(shard);
+  }
+
+  extent_set &operator[](shard_id_t shard) { return map[shard]; }
+
+  bool operator==(shard_extent_set_t const &other) const {
+    return map == other.map;
+  }
+
+  friend std::ostream &operator<<(std::ostream &lhs,
+                                  const shard_extent_set_t &rhs) {
+    lhs << rhs.map;
+    return lhs;
+  }
+
+  void get_extent_superset(extent_set &eset) const {
+    for (auto &&[_, e] : map) {
+      eset.union_of(e);
+    }
+  }
+
+  extent_set get_extent_superset() const {
+    extent_set eset;
+    get_extent_superset(eset);
+    return eset;
+  }
+
+  /* Return the extent set which is common across all populated shards. */
+  extent_set get_extent_common_set() const {
+    extent_set eset;
+    bool first = true;
+    for (auto &&[_, e] : map) {
+      if (first) {
+        eset.insert(e);
+        first = false;
+      } else {
+        eset.intersection_of(e);
+      }
+    }
+    return eset;
+  }
+
+  void align(uint64_t a) {
+    for (auto &&[_, e] : map) {
+      e.align(a);
+    }
+  }
+
+  size_t get_max_shards() const { return map.max_size(); }
+
+  void subtract(const shard_extent_set_t &set);
+  void intersection_of(const shard_extent_set_t &set);
+  void insert(const shard_extent_set_t &set);
+
+  /** Return the sum of extent_set::size() over all shards. */
+  uint64_t size() const {
+    uint64_t size = 0;
+    for (auto &&[_, e] : map) size += e.size();
+
+    return size;
+  }
+
+  void populate_shard_id_set(shard_id_set &set) const {
+    map.populate_bitset_set(set);
+  }
+
+  shard_id_set get_shard_id_set() const {
+    shard_id_set r;
+    map.populate_bitset_set(r);
+    return r;
+  }
+};
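+
+/* Illustrative example (assuming the identity chunk mapping): with k=2 data
+ * shards and 4 KiB chunks (an 8 KiB stripe), a shard_extent_set_t built from
+ * the rados-object range [0, 12288) holds shard 0 -> [0, 8192) and
+ * shard 1 -> [0, 4096), since ro offsets round-robin across the data shards
+ * one chunk at a time. get_extent_superset() then returns [0, 8192). */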
+
+inline uint64_t page_mask() {
+  static const uint64_t page_mask = ((uint64_t)CEPH_PAGE_SIZE) - 1;
+  return page_mask;
+}
+
+inline uint64_t align_page_next(uint64_t val) {
+  return p2roundup(val, (uint64_t)CEPH_PAGE_SIZE);
+}
+
+inline uint64_t align_page_prev(uint64_t val) {
+  return p2align(val, (uint64_t)CEPH_PAGE_SIZE);
+}
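+
+/* E.g. with CEPH_PAGE_SIZE = 4096: align_page_next(1) == 4096,
+ * align_page_next(4096) == 4096 and align_page_prev(8191) == 4096. */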
 
 class stripe_info_t {
+  friend class shard_extent_map_t;
+
   const uint64_t stripe_width;
+  const uint64_t plugin_flags;
   const uint64_t chunk_size;
-  const unsigned int k; // Can be calculated with a division from above. Better to cache.
+  const pg_pool_t *pool;
+  const unsigned int k;
+  // k could be recomputed as stripe_width / chunk_size; caching it is cheaper.
   const unsigned int m;
   const std::vector<shard_id_t> chunk_mapping;
   const std::vector<raw_shard_id_t> chunk_mapping_reverse;
+  const shard_id_set data_shards;
+  const shard_id_set parity_shards;
+
 private:
+  void ro_range_to_shards(
+      uint64_t ro_offset,
+      uint64_t ro_size,
+      ECUtil::shard_extent_set_t *shard_extent_set,
+      extent_set *extent_superset,
+      buffer::list *bl,
+      shard_extent_map_t *shard_extent_map) const;
+
   static std::vector<shard_id_t> complete_chunk_mapping(
-    std::vector<shard_id_t> _chunk_mapping, unsigned int n)
-  {
-    unsigned int size = _chunk_mapping.size();
+      const std::vector<shard_id_t> &_chunk_mapping, unsigned int n) {
+    unsigned int size = static_cast<unsigned int>(_chunk_mapping.size());
     std::vector<shard_id_t> chunk_mapping(n);
-    for (shard_id_t i; i < n; ++i) {
+    for (unsigned int i = 0; i < n; i++) {
       if (size > i) {
-        chunk_mapping.at(static_cast<int>(i)) = _chunk_mapping.at(static_cast<int>(i));
+        chunk_mapping.at(i) = _chunk_mapping.at(i);
       } else {
-        chunk_mapping.at(static_cast<int>(i)) = i;
+        chunk_mapping.at(i) = shard_id_t(i);
       }
     }
     return chunk_mapping;
   }
+
   static std::vector<raw_shard_id_t> reverse_chunk_mapping(
-    std::vector<shard_id_t> chunk_mapping)
-  {
-    unsigned int size = chunk_mapping.size();
+      const std::vector<shard_id_t> &chunk_mapping) {
+    size_t size = chunk_mapping.size();
     std::vector<raw_shard_id_t> reverse(size);
     shard_id_set used;
-    for (raw_shard_id_t i; i < size; ++i) {
-      shard_id_t index = chunk_mapping.at(static_cast<int>(i));
+    for (raw_shard_id_t raw_shard; raw_shard < size; ++raw_shard) {
+      shard_id_t shard = chunk_mapping[int(raw_shard)];
       // Mapping must be a bijection and a permutation
-      ceph_assert(!used.contains(index));
-      used.insert(index);
-      reverse.at(static_cast<int>(index)) = i;
+      ceph_assert(!used.contains(shard));
+      used.insert(shard);
+      reverse.at(int(shard)) = raw_shard;
     }
     return reverse;
   }
+
+  static shard_id_set calc_shards(raw_shard_id_t start,
+                                  int count,
+                                  const std::vector<shard_id_t> &chunk_mapping) {
+    shard_id_set data_shards;
+    for (raw_shard_id_t raw_shard = start;
+         raw_shard < int(start) + count;
+         ++raw_shard) {
+      shard_id_t shard = chunk_mapping[int(raw_shard)];
+      data_shards.insert(shard);
+    }
+    return data_shards;
+  }
+
 public:
-  stripe_info_t(ErasureCodeInterfaceRef ec_impl, uint64_t stripe_width)
+  stripe_info_t(const ErasureCodeInterfaceRef &ec_impl, const pg_pool_t *pool,
+                uint64_t stripe_width)
     : stripe_width(stripe_width),
+      plugin_flags(ec_impl->get_supported_optimizations()),
       chunk_size(stripe_width / ec_impl->get_data_chunk_count()),
+      pool(pool),
       k(ec_impl->get_data_chunk_count()),
       m(ec_impl->get_coding_chunk_count()),
-      chunk_mapping(complete_chunk_mapping(ec_impl->get_chunk_mapping(),
-                                          k + m)),
-      chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)) {
+      chunk_mapping(
+        complete_chunk_mapping(ec_impl->get_chunk_mapping(), k + m)),
+      chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)),
+      data_shards(calc_shards(raw_shard_id_t(), k, chunk_mapping)),
+      parity_shards(calc_shards(raw_shard_id_t(k), m, chunk_mapping)) {
+    ceph_assert(stripe_width != 0);
     ceph_assert(stripe_width % k == 0);
   }
+
   // Simpler constructors for unit tests
   stripe_info_t(unsigned int k, unsigned int m, uint64_t stripe_width)
     : stripe_width(stripe_width),
+      plugin_flags(0xFFFFFFFFFFFFFFFFul),
+      // Everything enabled for test harnesses.
       chunk_size(stripe_width / k),
+      pool(nullptr),
       k(k),
       m(m),
       chunk_mapping(complete_chunk_mapping(std::vector<shard_id_t>(), k + m)),
-      chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)) {
+      chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)),
+      data_shards(calc_shards(raw_shard_id_t(), k, chunk_mapping)),
+      parity_shards(calc_shards(raw_shard_id_t(k), m, chunk_mapping)) {
+    ceph_assert(stripe_width != 0);
+    ceph_assert(stripe_width % k == 0);
+  }
+
+  stripe_info_t(unsigned int k, unsigned int m, uint64_t stripe_width,
+                const std::vector<shard_id_t> &_chunk_mapping)
+    : stripe_width(stripe_width),
+      plugin_flags(0xFFFFFFFFFFFFFFFFul),
+      // Everything enabled for test harnesses.
+      chunk_size(stripe_width / k),
+      pool(nullptr),
+      k(k),
+      m(m),
+      chunk_mapping(complete_chunk_mapping(_chunk_mapping, k + m)),
+      chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)),
+      data_shards(calc_shards(raw_shard_id_t(), k, chunk_mapping)),
+      parity_shards(calc_shards(raw_shard_id_t(k), m, chunk_mapping)) {
+    ceph_assert(stripe_width != 0);
     ceph_assert(stripe_width % k == 0);
   }
+
   stripe_info_t(unsigned int k, unsigned int m, uint64_t stripe_width,
-               std::vector<shard_id_t> _chunk_mapping)
+                const pg_pool_t *pool, const std::vector<shard_id_t> &_chunk_mapping)
     : stripe_width(stripe_width),
+      plugin_flags(0xFFFFFFFFFFFFFFFFul),
+      // Everything enabled for test harnesses.
       chunk_size(stripe_width / k),
+      pool(pool),
       k(k),
       m(m),
       chunk_mapping(complete_chunk_mapping(_chunk_mapping, k + m)),
-      chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)) {
+      chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)),
+      data_shards(calc_shards(raw_shard_id_t(), k, chunk_mapping)),
+      parity_shards(calc_shards(raw_shard_id_t(k), m, chunk_mapping)) {
+    ceph_assert(stripe_width != 0);
     ceph_assert(stripe_width % k == 0);
   }
-  bool logical_offset_is_stripe_aligned(uint64_t logical) const {
-    return (logical % stripe_width) == 0;
+
+  stripe_info_t(unsigned int k, unsigned int m, uint64_t stripe_width,
+                const pg_pool_t *pool)
+    : stripe_width(stripe_width),
+      plugin_flags(0xFFFFFFFFFFFFFFFFul),
+      // Everything enabled for test harnesses.
+      chunk_size(stripe_width / k),
+      pool(pool),
+      k(k),
+      m(m),
+      chunk_mapping(complete_chunk_mapping(std::vector<shard_id_t>(), k + m)),
+      chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)),
+      data_shards(calc_shards(raw_shard_id_t(), k, chunk_mapping)),
+      parity_shards(calc_shards(raw_shard_id_t(k), m, chunk_mapping)) {
+    ceph_assert(stripe_width != 0);
+    ceph_assert(stripe_width % k == 0);
+  }
+
+  uint64_t object_size_to_shard_size(const uint64_t size, shard_id_t shard) const {
+    uint64_t remainder = size % get_stripe_width();
+    uint64_t shard_size = (size - remainder) / k;
+    raw_shard_id_t raw_shard = get_raw_shard(shard);
+    if (raw_shard >= get_k()) {
+      // coding parity shards have same size as data shard 0
+      raw_shard = 0;
+    }
+    if (remainder > uint64_t(raw_shard) * get_chunk_size()) {
+      remainder -= uint64_t(raw_shard) * get_chunk_size();
+      if (remainder > get_chunk_size()) {
+        remainder = get_chunk_size();
+      }
+      shard_size += remainder;
+    }
+    return ECUtil::align_page_next(shard_size);
+  }
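+
+  /* Worked example (k=5, m=3, 4 KiB chunks, 20 KiB stripes): an object of
+   * 23552 bytes leaves a 3072-byte remainder past the last full stripe, all
+   * of it on raw shard 0. Raw shard 0 (and every parity shard) is therefore
+   * 4096 + 3072 = 7168 bytes, page-aligned to 8192; raw shards 1-4 stay at
+   * 4096. */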
+
+  uint64_t ro_offset_to_shard_offset(uint64_t ro_offset,
+                                     const raw_shard_id_t raw_shard) const {
+    uint64_t full_stripes = (ro_offset / stripe_width) * chunk_size;
+    int offset_shard = (ro_offset / chunk_size) % k;
+
+    if (int(raw_shard) == offset_shard) {
+      return full_stripes + ro_offset % chunk_size;
+    }
+    if (raw_shard < offset_shard) {
+      return full_stripes + chunk_size;
+    }
+    return full_stripes;
+  }
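+
+  /* E.g. with k=5 and 4 KiB chunks, ro offset 21504 falls in stripe 1,
+   * chunk 0: raw shard 0 maps to 4096 + 1024 = 5120, while raw shards 1-4
+   * (beyond the offset's chunk) map to the stripe start, 4096. */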
+
+  /**
+   * Return true if the shard does not require metadata updates.
+   */
+  bool is_nonprimary_shard(const shard_id_t shard) const {
+    return pool->is_nonprimary_shard(shard);
   }
+
+  bool supports_ec_overwrites() const {
+    return pool->allows_ecoverwrites();
+  }
+
+  bool supports_sub_chunks() const {
+    return (plugin_flags &
+      ErasureCodeInterface::FLAG_EC_PLUGIN_REQUIRE_SUB_CHUNKS) != 0;
+  }
+
+  bool require_hinfo() const {
+    return !supports_ec_overwrites();
+  }
+
+  bool supports_partial_reads() const {
+    return (plugin_flags &
+      ErasureCodeInterface::FLAG_EC_PLUGIN_PARTIAL_READ_OPTIMIZATION) != 0;
+  }
+
+  bool supports_partial_writes() const {
+    return (plugin_flags &
+      ErasureCodeInterface::FLAG_EC_PLUGIN_PARTIAL_WRITE_OPTIMIZATION) != 0;
+  }
+
+  bool supports_parity_delta_writes() const {
+    return (plugin_flags &
+      ErasureCodeInterface::FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION) != 0;
+  }
+
   uint64_t get_stripe_width() const {
     return stripe_width;
   }
+
   uint64_t get_chunk_size() const {
     return chunk_size;
   }
+
   unsigned int get_m() const {
     return m;
   }
+
   unsigned int get_k() const {
     return k;
   }
+
   unsigned int get_k_plus_m() const {
     return k + m;
   }
-  shard_id_t get_shard(raw_shard_id_t raw_shard) const {
-    return chunk_mapping[static_cast<int>(raw_shard)];
+
+  shard_id_t get_shard(const raw_shard_id_t raw_shard) const {
+    return chunk_mapping[int(raw_shard)];
   }
+
   raw_shard_id_t get_raw_shard(shard_id_t shard) const {
-    return chunk_mapping_reverse[static_cast<int>(shard)];
+    return chunk_mapping_reverse.at(int(shard));
+  }
+
+  /* Return a "span" - which can be iterated over */
+  auto get_data_shards() const {
+    return data_shards;
   }
-  uint64_t logical_to_prev_chunk_offset(uint64_t offset) const {
+
+  auto get_parity_shards() const {
+    return parity_shards;
+  }
+
+  uint64_t ro_offset_to_prev_chunk_offset(uint64_t offset) const {
     return (offset / stripe_width) * chunk_size;
   }
-  uint64_t logical_to_next_chunk_offset(uint64_t offset) const {
-    return ((offset + stripe_width - 1)/ stripe_width) * chunk_size;
+
+  uint64_t ro_offset_to_next_chunk_offset(uint64_t offset) const {
+    return ((offset + stripe_width - 1) / stripe_width) * chunk_size;
   }
-  uint64_t logical_to_prev_stripe_offset(uint64_t offset) const {
+
+  uint64_t ro_offset_to_prev_stripe_ro_offset(uint64_t offset) const {
     return offset - (offset % stripe_width);
   }
-  uint64_t logical_to_next_stripe_offset(uint64_t offset) const {
-    return ((offset % stripe_width) ?
-      (offset - (offset % stripe_width) + stripe_width) :
-      offset);
+
+  uint64_t ro_offset_to_next_stripe_ro_offset(uint64_t offset) const {
+    return ((offset % stripe_width)
+              ? (offset - (offset % stripe_width) + stripe_width)
+              : offset);
   }
-  uint64_t aligned_logical_offset_to_chunk_offset(uint64_t offset) const {
+
+  uint64_t aligned_ro_offset_to_chunk_offset(uint64_t offset) const {
     ceph_assert(offset % stripe_width == 0);
     return (offset / stripe_width) * chunk_size;
   }
-  uint64_t chunk_aligned_logical_offset_to_chunk_offset(uint64_t offset) const {
+
+  uint64_t chunk_aligned_ro_offset_to_chunk_offset(uint64_t offset) const {
     [[maybe_unused]] const auto residue_in_stripe = offset % stripe_width;
     ceph_assert(residue_in_stripe % chunk_size == 0);
     ceph_assert(stripe_width % chunk_size == 0);
     // this rounds down
     return (offset / stripe_width) * chunk_size;
   }
-  uint64_t chunk_aligned_logical_size_to_chunk_size(uint64_t len) const {
-    [[maybe_unused]] const auto residue_in_stripe = len % stripe_width;
-    ceph_assert(residue_in_stripe % chunk_size == 0);
-    ceph_assert(stripe_width % chunk_size == 0);
+
+  uint64_t chunk_aligned_ro_length_to_shard_length(uint64_t len) const {
     // this rounds up
     return ((len + stripe_width - 1) / stripe_width) * chunk_size;
   }
-  uint64_t aligned_chunk_offset_to_logical_offset(uint64_t offset) const {
+
+  uint64_t chunk_aligned_shard_offset_to_ro_offset(uint64_t offset) const {
     ceph_assert(offset % chunk_size == 0);
     return (offset / chunk_size) * stripe_width;
   }
-  std::pair<uint64_t, uint64_t> chunk_aligned_offset_len_to_chunk(
-    std::pair<uint64_t, uint64_t> in) const;
-  std::pair<uint64_t, uint64_t> offset_len_to_stripe_bounds(
-    std::pair<uint64_t, uint64_t> in) const {
-    uint64_t off = logical_to_prev_stripe_offset(in.first);
-    uint64_t len = logical_to_next_stripe_offset(
-      (in.first - off) + in.second);
+
+  std::pair<uint64_t, uint64_t> chunk_aligned_ro_range_to_shard_ro_range(
+      uint64_t off, uint64_t len) const;
+
+  std::pair<uint64_t, uint64_t> ro_offset_len_to_stripe_ro_offset_len(
+      uint64_t _off, uint64_t _len) const {
+    uint64_t off = ro_offset_to_prev_stripe_ro_offset(_off);
+    uint64_t len = ro_offset_to_next_stripe_ro_offset(
+      (_off - off) + _len);
     return std::make_pair(off, len);
   }
-  std::pair<uint64_t, uint64_t> offset_len_to_chunk_bounds(
-    std::pair<uint64_t, uint64_t> in) const {
+
+  std::pair<uint64_t, uint64_t> ro_range_to_chunk_ro_range(
+      const std::pair<uint64_t, uint64_t> &in) const {
     uint64_t off = in.first - (in.first % chunk_size);
     uint64_t tmp_len = (in.first - off) + in.second;
-    uint64_t len = ((tmp_len % chunk_size) ?
-      (tmp_len - (tmp_len % chunk_size) + chunk_size) :
-      tmp_len);
+    uint64_t len = ((tmp_len % chunk_size)
+                      ? (tmp_len - (tmp_len % chunk_size) + chunk_size)
+                      : tmp_len);
     return std::make_pair(off, len);
   }
-  std::pair<uint64_t, uint64_t> offset_length_to_data_chunk_indices(
-    uint64_t off, uint64_t len) const {
-    assert(chunk_size > 0);
-    const auto first_chunk_idx = (off / chunk_size);
-    const auto last_chunk_idx = (chunk_size - 1 + off + len) / chunk_size;
-    return {first_chunk_idx, last_chunk_idx};
-  }
-  bool offset_length_is_same_stripe(
-    uint64_t off, uint64_t len) const {
-    if (len == 0) {
-      return true;
+
+  void ro_range_to_shard_extent_set(
+      uint64_t ro_offset,
+      uint64_t ro_size,
+      ECUtil::shard_extent_set_t &shard_extent_set) const {
+    ro_range_to_shards(ro_offset, ro_size, &shard_extent_set, nullptr, nullptr, nullptr);
+  }
+
+  void ro_range_to_shard_extent_set(
+      uint64_t ro_offset,
+      uint64_t ro_size,
+      ECUtil::shard_extent_set_t &shard_extent_set,
+      extent_set &extent_superset) const {
+    ro_range_to_shards(ro_offset, ro_size, &shard_extent_set, &extent_superset,
+                       nullptr,
+                       nullptr);
+  }
+
+  void ro_range_to_shard_extent_set_with_parity(
+      uint64_t ro_offset,
+      uint64_t ro_size,
+      ECUtil::shard_extent_set_t &shard_extent_set) const {
+    extent_set parity;
+    ro_range_to_shards(ro_offset, ro_size, &shard_extent_set, &parity, nullptr,
+                       nullptr);
+
+    if (parity.empty()) return;
+
+    for (shard_id_t shard : get_parity_shards()) {
+      shard_extent_set[shard].union_of(parity);
     }
-    assert(chunk_size > 0);
-    const auto first_stripe_idx = off / stripe_width;
-    const auto last_inc_stripe_idx = (off + len - 1) / stripe_width;
-    return first_stripe_idx == last_inc_stripe_idx;
   }
-};
 
-int decode(
-  const stripe_info_t &sinfo,
-  ceph::ErasureCodeInterfaceRef &ec_impl,
-  const std::set<int> want_to_read,
-  std::map<int, ceph::buffer::list> &to_decode,
-  ceph::buffer::list *out);
-
-int decode(
-  const stripe_info_t &sinfo,
-  ceph::ErasureCodeInterfaceRef &ec_impl,
-  std::map<int, ceph::buffer::list> &to_decode,
-  std::map<int, ceph::buffer::list*> &out);
-
-int encode(
-  const stripe_info_t &sinfo,
-  ceph::ErasureCodeInterfaceRef &ec_impl,
-  ceph::buffer::list &in,
-  const std::set<int> &want,
-  std::map<int, ceph::buffer::list> *out);
+  void ro_range_to_shard_extent_set_with_superset(
+      uint64_t ro_offset,
+      uint64_t ro_size,
+      ECUtil::shard_extent_set_t &shard_extent_set,
+      extent_set &superset) const {
+    ro_range_to_shards(ro_offset, ro_size, &shard_extent_set, &superset, nullptr,
+                       nullptr);
+  }
+
+  void ro_range_to_shard_extent_map(
+      uint64_t ro_offset,
+      uint64_t ro_size,
+      buffer::list &bl,
+      shard_extent_map_t &shard_extent_map) const {
+    ro_range_to_shards(ro_offset, ro_size, nullptr, nullptr, &bl, &shard_extent_map);
+  }
+
+  void trim_shard_extent_set_for_ro_offset(uint64_t ro_offset,
+                                           ECUtil::shard_extent_set_t &
+                                           shard_extent_set) const;
+
+  void ro_size_to_stripe_aligned_read_mask(
+      uint64_t ro_size,
+      ECUtil::shard_extent_set_t &shard_extent_set) const;
+
+  void ro_size_to_read_mask(
+      uint64_t ro_size,
+      ECUtil::shard_extent_set_t &shard_extent_set) const;
+
+  void ro_size_to_zero_mask(
+      uint64_t ro_size,
+      ECUtil::shard_extent_set_t &shard_extent_set) const;
+};
 
 class HashInfo {
   uint64_t total_chunk_size = 0;
   std::vector<uint32_t> cumulative_shard_hashes;
 
-  // purely ephemeral, represents the size once all in-flight ops commit
-  uint64_t projected_total_chunk_size = 0;
 public:
   HashInfo() {}
+
   explicit HashInfo(unsigned num_chunks) :
     cumulative_shard_hashes(num_chunks, -1) {}
-  void append(uint64_t old_size, std::map<int, ceph::buffer::list> &to_append);
+
+  void append(uint64_t old_size, shard_id_map<bufferptr> &to_append);
+
   void clear() {
     total_chunk_size = 0;
     cumulative_shard_hashes = std::vector<uint32_t>(
       cumulative_shard_hashes.size(),
       -1);
   }
+
   void encode(ceph::buffer::list &bl) const;
   void decode(ceph::buffer::list::const_iterator &bl);
   void dump(ceph::Formatter *f) const;
-  static void generate_test_instances(std::list<HashInfo*>& o);
+  static void generate_test_instances(std::list<HashInfo*> &o);
+
   uint32_t get_chunk_hash(shard_id_t shard) const {
     ceph_assert(shard < cumulative_shard_hashes.size());
-    return cumulative_shard_hashes[static_cast<int>(shard)];
+    return cumulative_shard_hashes[int(shard)];
   }
+
   uint64_t get_total_chunk_size() const {
     return total_chunk_size;
   }
-  uint64_t get_projected_total_chunk_size() const {
-    return projected_total_chunk_size;
-  }
-  uint64_t get_total_logical_size(const stripe_info_t &sinfo) const {
-    return get_total_chunk_size() *
-      (sinfo.get_stripe_width()/sinfo.get_chunk_size());
-  }
-  uint64_t get_projected_total_logical_size(const stripe_info_t &sinfo) const {
-    return get_projected_total_chunk_size() *
-      (sinfo.get_stripe_width()/sinfo.get_chunk_size());
-  }
-  void set_projected_total_logical_size(
-    const stripe_info_t &sinfo,
-    uint64_t logical_size) {
-    ceph_assert(sinfo.logical_offset_is_stripe_aligned(logical_size));
-    projected_total_chunk_size = sinfo.aligned_logical_offset_to_chunk_offset(
-      logical_size);
-  }
+
   void set_total_chunk_size_clear_hash(uint64_t new_chunk_size) {
     cumulative_shard_hashes.clear();
     total_chunk_size = new_chunk_size;
   }
+
   bool has_chunk_hash() const {
     return !cumulative_shard_hashes.empty();
   }
+
   void update_to(const HashInfo &rhs) {
-    auto ptcs = projected_total_chunk_size;
     *this = rhs;
-    projected_total_chunk_size = ptcs;
   }
-  friend std::ostream& operator<<(std::ostream& out, const HashInfo& hi);
+
+  friend std::ostream &operator<<(std::ostream &out, const HashInfo &hi);
 };
 
 typedef std::shared_ptr<HashInfo> HashInfoRef;
 
+class shard_extent_map_t {
+  static const uint64_t invalid_offset = std::numeric_limits<uint64_t>::max();
+
+public:
+  const stripe_info_t *sinfo;
+  // The maximal range of all extent maps within rados object space.
+  uint64_t ro_start;
+  uint64_t ro_end;
+  uint64_t start_offset;
+  uint64_t end_offset;
+  shard_id_map<extent_map> extent_maps;
+
+  slice_iterator<shard_id_t, extent_map> begin_slice_iterator(
+      const shard_id_set &out_set);
+
+  /* This calculates the ro offset for an offset into a particular shard. */
+  uint64_t calc_ro_offset(raw_shard_id_t raw_shard, uint64_t shard_offset) const {
+    uint64_t stripes = shard_offset / sinfo->chunk_size;
+    return stripes * sinfo->stripe_width +
+        uint64_t(raw_shard) * sinfo->chunk_size +
+        shard_offset % sinfo->chunk_size;
+  }
+
+  uint64_t calc_ro_end(raw_shard_id_t raw_shard, uint64_t shard_offset) const {
+    return calc_ro_offset(raw_shard, shard_offset - 1) + 1;
+  }
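+
+  /* E.g. with k=2 and 4 KiB chunks (8 KiB stripe), shard offset 5120 on raw
+   * shard 1 is 1024 bytes into its second chunk, so calc_ro_offset() returns
+   * 1*8192 + 1*4096 + 1024 = 13312. */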
+
+  /* Recomputing the ro offset/length is relatively expensive. Ideally we
+   * would update them incrementally instead.
+   */
+  void compute_ro_range() {
+    uint64_t start = invalid_offset;
+    uint64_t end = 0;
+    uint64_t o_start = invalid_offset;
+    uint64_t o_end = 0;
+
+    for (auto &&[shard, emap] : extent_maps) {
+      raw_shard_id_t raw_shard = sinfo->get_raw_shard(shard);
+      uint64_t start_off = emap.get_start_off();
+      uint64_t end_off = emap.get_end_off();
+      o_start = std::min(o_start, start_off);
+      o_end = std::max(o_end, end_off);
+
+      if (raw_shard < sinfo->get_k()) {
+        start = std::min(start, calc_ro_offset(raw_shard, start_off));
+        end = std::max(end, calc_ro_end(raw_shard, end_off));
+      }
+    }
+    if (end != 0) {
+      ro_start = start;
+      ro_end = end;
+      start_offset = o_start;
+      end_offset = o_end;
+    } else {
+      ro_start = invalid_offset;
+      ro_end = invalid_offset;
+      start_offset = invalid_offset;
+      end_offset = invalid_offset;
+    }
+  }
+
+public:
+  shard_extent_map_t(const stripe_info_t *sinfo) :
+    sinfo(sinfo),
+    ro_start(invalid_offset),
+    ro_end(invalid_offset),
+    start_offset(invalid_offset),
+    end_offset(invalid_offset),
+    extent_maps(sinfo->get_k_plus_m()) {}
+
+  shard_extent_map_t(const stripe_info_t *sinfo,
+                     shard_id_map<extent_map> &&_extent_maps) :
+    sinfo(sinfo),
+    extent_maps(std::move(_extent_maps)) {
+    // Empty shards are not permitted, so clear them out.
+    for (auto iter = extent_maps.begin(); iter != extent_maps.end();) {
+      if (iter->second.empty()) {
+        iter = extent_maps.erase(iter);
+      } else {
+        ++iter;
+      }
+    }
+    compute_ro_range();
+  }
+
+  bool empty() const {
+    return ro_end == invalid_offset;
+  }
+
+  uint64_t get_ro_start() const {
+    return ro_start;
+  }
+
+  uint64_t get_ro_end() const {
+    return ro_end;
+  }
+
+  /* Read-only access to the extent maps. The returned reference is const
+   * because modifying it directly could:
+   *  - leave empty extent maps on shards, and
+   *  - let the cached ro offset/length fall out of sync.
+   */
+  const auto &get_extent_maps() const {
+    return extent_maps;
+  }
+
+  /* Return a particular extent map. This must be const because updating it
+   * would cause the shard_extent_map_t to become inconsistent.
+   *
+   * This method throws (std::out_of_range, via at()) if the shard has no
+   * extents.
+   */
+  const extent_map &get_extent_map(shard_id_t shard) const {
+    return extent_maps.at(shard);
+  }
+
+  extent_set get_extent_set(const shard_id_t &shard) const {
+    extent_set ret;
+    if (extent_maps.contains(shard)) {
+      extent_maps.at(shard).to_interval_set(ret);
+    }
+    return ret;
+  }
+
+  void to_shard_extent_set(shard_extent_set_t &set) const {
+    for (auto &&[shard, emap] : extent_maps) {
+      emap.to_interval_set(set[shard]);
+    }
+  }
+
+  bool contains_shard(shard_id_t shard) const {
+    return extent_maps.contains(shard);
+  }
+
+  void erase_after_ro_offset(uint64_t ro_offset);
+  shard_extent_map_t intersect_ro_range(uint64_t ro_offset, uint64_t ro_length) const;
+  shard_extent_map_t intersect(std::optional<shard_extent_set_t> const &other) const;
+  shard_extent_map_t intersect(shard_extent_set_t const &other) const;
+  void insert_in_shard(shard_id_t shard, uint64_t off, const buffer::list &bl);
+  void insert_in_shard(shard_id_t shard, uint64_t off, const buffer::list &bl,
+                       uint64_t new_start, uint64_t new_end);
+  void insert_ro_zero_buffer(uint64_t ro_offset, uint64_t ro_length);
+  void insert(shard_extent_map_t const &other);
+  void append_zeros_to_ro_offset(uint64_t ro_offset);
+  void insert_ro_extent_map(const extent_map &host_extent_map);
+  extent_set get_extent_superset() const;
+  int encode(const ErasureCodeInterfaceRef &ec_impl, const HashInfoRef &hinfo,
+             uint64_t before_ro_size);
+  int _encode(const ErasureCodeInterfaceRef &ec_impl);
+  int encode_parity_delta(const ErasureCodeInterfaceRef &ec_impl,
+                          shard_extent_map_t &old_sem);
+
+  void pad_on_shards(const shard_extent_set_t &pad_to,
+                     const shard_id_set &shards);
+  void pad_on_shards(const extent_set &pad_to,
+                     const shard_id_set &shards);
+  void trim(const shard_extent_set_t &trim_to);
+  int decode(const ErasureCodeInterfaceRef &ec_impl,
+             const shard_extent_set_t &want,
+             uint64_t object_size);
+  int _decode(const ErasureCodeInterfaceRef &ec_impl,
+              const shard_id_set &want_set,
+              const shard_id_set &need_set);
+  void get_buffer(shard_id_t shard, uint64_t offset, uint64_t length,
+                  buffer::list &append_to) const;
+  void get_shard_first_buffer(shard_id_t shard, buffer::list &append_to) const;
+  uint64_t get_shard_first_offset(shard_id_t shard) const;
+  void zero_pad(shard_extent_set_t const &pad_to);
+  void zero_pad(shard_id_t shard, uint64_t offset, uint64_t length);
+  void pad_with_other(shard_extent_set_t const &pad_to,
+                      shard_extent_map_t const &other);
+  void pad_with_other(shard_id_t shard, uint64_t offset, uint64_t length,
+                      shard_extent_map_t const &other);
+  bufferlist get_ro_buffer(uint64_t ro_offset, uint64_t ro_length) const;
+  /* Returns a buffer assuming that there is a single contiguous buffer
+   * represented by the map. */
+  bufferlist get_ro_buffer() const;
+  shard_extent_set_t get_extent_set();
+  void insert_parity_buffers();
+  void erase_shard(shard_id_t shard);
+  shard_extent_map_t slice_map(uint64_t offset, uint64_t length) const;
+  std::string debug_string(uint64_t interval, uint64_t offset) const;
+  void erase_stripe(uint64_t offset, uint64_t length);
+  bool contains(shard_id_t shard) const;
+  bool contains(std::optional<shard_extent_set_t> const &other) const;
+  bool contains(shard_extent_set_t const &other) const;
+  void pad_and_rebuild_to_page_align();
+  uint64_t size();
+  void clear();
+  uint64_t get_start_offset() const { return start_offset; }
+  uint64_t get_end_offset() const { return end_offset; }
+  void deep_copy(shard_extent_map_t const &other);
+  void swap() {}
+  size_t shard_count() { return extent_maps.size(); }
+
+  void assert_buffer_contents_equal(const shard_extent_map_t &other) const {
+    for (auto &&[shard, emap] : extent_maps) {
+      for (auto &&i : emap) {
+        bufferlist bl = i.get_val();
+        bufferlist otherbl;
+        other.get_buffer(shard, i.get_off(), i.get_len(), otherbl);
+        ceph_assert(bl.contents_equal(otherbl));
+      }
+    }
+  }
+
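+  /* Decode requires every participating shard to supply the same extents.
+   * Ranges beyond the object's logical size are known to be zeros, so insert
+   * zero buffers for them (except on excluded shards) rather than reading
+   * them. Returns true if any padding was added. */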
+  bool add_zero_padding_for_decode(uint64_t object_size, shard_id_set &exclude_set) {
+    shard_extent_set_t zeros(sinfo->get_k_plus_m());
+    sinfo->ro_size_to_zero_mask(object_size, zeros);
+    extent_set superset = get_extent_superset();
+    bool changed = false;
+    for (auto &&[shard, z] : zeros) {
+      if (exclude_set.contains(shard)) {
+        continue;
+      }
+      z.intersection_of(superset);
+      for (auto [off, len] : z) {
+        changed = true;
+        bufferlist bl;
+        bl.append_zero(len);
+        extent_maps[shard].insert(off, len, bl);
+      }
+    }
+
+    if (changed) {
+      compute_ro_range();
+    }
+
+    return changed;
+  }
+
+  friend std::ostream &operator<<(std::ostream &lhs,
+                                  const shard_extent_map_t &rhs);
+
+  friend bool operator==(const shard_extent_map_t &lhs,
+                         const shard_extent_map_t &rhs) {
+    return lhs.sinfo == rhs.sinfo
+        && lhs.ro_start == rhs.ro_start
+        && lhs.ro_end == rhs.ro_end
+        && lhs.extent_maps == rhs.extent_maps;
+  }
+};
+
+typedef enum {
+  READ_REQUEST,
+  READ_DONE,
+  INJECT_EIO,
+  CANCELLED,
+  ERROR,
+  REQUEST_MISSING,
+  COMPLETE_ERROR,
+  ERROR_CLEAR,
+  COMPLETE
+} log_event_t;
+
+struct log_entry_t {
+  const log_event_t event;
+  const pg_shard_t shard;
+  const extent_set io;
+
+  log_entry_t(
+      const log_event_t event,
+      const pg_shard_t &shard,
+      const extent_set &io) :
+    event(event), shard(shard), io(io) {}
+
+  log_entry_t(
+      const log_event_t event,
+      const pg_shard_t &shard) :
+    event(event), shard(shard) {}
+
+  log_entry_t(
+      const log_event_t event,
+      const pg_shard_t &pg_shard,
+      const shard_extent_map_t &extent_map) :
+    event(event), shard(pg_shard),
+    io(extent_map.contains(pg_shard.shard)
+         ? extent_map.get_extent_set(pg_shard.shard)
+         : extent_set()) {}
+
+  friend std::ostream &operator<<(std::ostream &out, const log_entry_t &lhs);
+};
+
 bool is_hinfo_key_string(const std::string &key);
 const std::string &get_hinfo_key();
 
diff --git a/src/osd/ExtentCache.cc b/src/osd/ExtentCache.cc
deleted file mode 100644 (file)
index 3a8bbf1..0000000
+++ /dev/null
@@ -1,245 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2016 Red Hat
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation.  See file COPYING.
- *
- */
-
-#include "ExtentCache.h"
-
-using std::ostream;
-
-using ceph::bufferlist;
-
-void ExtentCache::extent::_link_pin_state(pin_state &pin_state)
-{
-  ceph_assert(parent_extent_set);
-  ceph_assert(!parent_pin_state);
-  parent_pin_state = &pin_state;
-  pin_state.pin_list.push_back(*this);
-}
-
-void ExtentCache::extent::_unlink_pin_state()
-{
-  ceph_assert(parent_extent_set);
-  ceph_assert(parent_pin_state);
-  auto liter = pin_state::list::s_iterator_to(*this);
-  parent_pin_state->pin_list.erase(liter);
-  parent_pin_state = nullptr;
-}
-
-void ExtentCache::extent::unlink()
-{
-  ceph_assert(parent_extent_set);
-  ceph_assert(parent_pin_state);
-
-  _unlink_pin_state();
-
-  // remove from extent set
-  {
-    auto siter = object_extent_set::set::s_iterator_to(*this);
-    auto &set = object_extent_set::set::container_from_iterator(siter);
-    ceph_assert(&set == &(parent_extent_set->extent_set));
-    set.erase(siter);
-  }
-
-  parent_extent_set = nullptr;
-  ceph_assert(!parent_pin_state);
-}
-
-void ExtentCache::extent::link(
-  object_extent_set &extent_set,
-  pin_state &pin_state)
-{
-  ceph_assert(!parent_extent_set);
-  parent_extent_set = &extent_set;
-  extent_set.extent_set.insert(*this);
-
-  _link_pin_state(pin_state);
-}
-
-void ExtentCache::extent::move(
-  pin_state &to)
-{
-  _unlink_pin_state();
-  _link_pin_state(to);
-}
-
-void ExtentCache::remove_and_destroy_if_empty(object_extent_set &eset)
-{
-  if (eset.extent_set.empty()) {
-    auto siter = cache_set::s_iterator_to(eset);
-    auto &set = cache_set::container_from_iterator(siter);
-    ceph_assert(&set == &per_object_caches);
-
-    // per_object_caches owns eset
-    per_object_caches.erase(eset);
-    delete &eset;
-  }
-}
-
-ExtentCache::object_extent_set &ExtentCache::get_or_create(
-  const hobject_t &oid)
-{
-  cache_set::insert_commit_data data;
-  auto p = per_object_caches.insert_check(oid, Cmp(), data);
-  if (p.second) {
-    auto *eset = new object_extent_set(oid);
-    per_object_caches.insert_commit(*eset, data);
-    return *eset;
-  } else {
-    return *(p.first);
-  }
-}
-
-ExtentCache::object_extent_set *ExtentCache::get_if_exists(
-  const hobject_t &oid)
-{
-  cache_set::insert_commit_data data;
-  auto p = per_object_caches.insert_check(oid, Cmp(), data);
-  if (p.second) {
-    return nullptr;
-  } else {
-    return &*(p.first);
-  }
-}
-
-std::pair<
-  ExtentCache::object_extent_set::set::iterator,
-  ExtentCache::object_extent_set::set::iterator
-  > ExtentCache::object_extent_set::get_containing_range(
-    uint64_t off, uint64_t len)
-{
-  // fst is first iterator with end after off (may be end)
-  auto fst = extent_set.upper_bound(off, uint_cmp());
-  if (fst != extent_set.begin())
-    --fst;
-  if (fst != extent_set.end() && off >= (fst->offset + fst->get_length()))
-    ++fst;
-
-  // lst is first iterator with start >= off + len (may be end)
-  auto lst = extent_set.lower_bound(off + len, uint_cmp());
-  return std::make_pair(fst, lst);
-}
-
-extent_set ExtentCache::reserve_extents_for_rmw(
-  const hobject_t &oid,
-  write_pin &pin,
-  const extent_set &to_write,
-  const extent_set &to_read)
-{
-  if (to_write.empty() && to_read.empty()) {
-    return extent_set();
-  }
-  extent_set must_read;
-  auto &eset = get_or_create(oid);
-  extent_set missing;
-  for (auto &&res: to_write) {
-    eset.traverse_update(
-      pin,
-      res.first,
-      res.second,
-      [&](uint64_t off, uint64_t len,
-         extent *ext, object_extent_set::update_action *action) {
-       action->action = object_extent_set::update_action::UPDATE_PIN;
-       if (!ext) {
-         missing.insert(off, len);
-       }
-      });
-  }
-  must_read.intersection_of(
-    to_read,
-    missing);
-  return must_read;
-}
-
-extent_map ExtentCache::get_remaining_extents_for_rmw(
-  const hobject_t &oid,
-  write_pin &pin,
-  const extent_set &to_get)
-{
-  if (to_get.empty()) {
-    return extent_map();
-  }
-  extent_map ret;
-  auto &eset = get_or_create(oid);
-  for (auto &&res: to_get) {
-    bufferlist bl;
-    uint64_t cur = res.first;
-    eset.traverse_update(
-      pin,
-      res.first,
-      res.second,
-      [&](uint64_t off, uint64_t len,
-         extent *ext, object_extent_set::update_action *action) {
-       ceph_assert(off == cur);
-       cur = off + len;
-       action->action = object_extent_set::update_action::NONE;
-       ceph_assert(ext && ext->bl && ext->pinned_by_write());
-       bl.substr_of(
-         *(ext->bl),
-         off - ext->offset,
-         len);
-       ret.insert(off, len, bl);
-      });
-  }
-  return ret;
-}
-
-void ExtentCache::present_rmw_update(
-  const hobject_t &oid,
-  write_pin &pin,
-  const extent_map &extents)
-{
-  if (extents.empty()) {
-    return;
-  }
-  auto &eset = get_or_create(oid);
-  for (auto &&res: extents) {
-    eset.traverse_update(
-      pin,
-      res.get_off(),
-      res.get_len(),
-      [&](uint64_t off, uint64_t len,
-         extent *ext, object_extent_set::update_action *action) {
-       action->action = object_extent_set::update_action::NONE;
-       ceph_assert(ext && ext->pinned_by_write());
-       action->bl = bufferlist();
-       action->bl->substr_of(
-         res.get_val(),
-         off - res.get_off(),
-         len);
-      });
-  }
-}
-
-ostream &ExtentCache::print(ostream &out) const
-{
-  out << "ExtentCache(" << std::endl;
-  for (auto esiter = per_object_caches.begin();
-       esiter != per_object_caches.end();
-       ++esiter) {
-    out << "  Extents(" << esiter->oid << ")[" << std::endl;
-    for (auto exiter = esiter->extent_set.begin();
-        exiter != esiter->extent_set.end();
-        ++exiter) {
-      out << "    Extent(" << exiter->offset
-         << "~" << exiter->get_length()
-         << ":" << exiter->pin_tid()
-         << ")" << std::endl;
-    }
-  }
-  return out << ")" << std::endl;
-}
-
-ostream &operator<<(ostream &lhs, const ExtentCache &cache)
-{
-  return cache.print(lhs);
-}
diff --git a/src/osd/ExtentCache.h b/src/osd/ExtentCache.h
deleted file mode 100644 (file)
index 674ba69..0000000
+++ /dev/null
@@ -1,486 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2016 Red Hat
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation.  See file COPYING.
- *
- */
-
-#pragma once
-
-#include <map>
-#include <list>
-#include <vector>
-#include <utility>
-#include <optional>
-#include <boost/intrusive/set.hpp>
-#include <boost/intrusive/list.hpp>
-#include "include/interval_set.h"
-#include "common/interval_map.h"
-#include "include/buffer.h"
-#include "common/hobject.h"
-
-/**
-   ExtentCache
-
-   The main purpose of this cache is to ensure that we can pipeline
-   overlapping partial overwrites.
-
-   To that end we need to ensure that an extent pinned for an operation is
-   live until that operation completes.  However, a particular extent
-   might be pinned by multiple operations (several pipelined writes
-   on the same object).
-
-   1) When we complete an operation, we only look at extents owned only
-      by that operation.
-   2) Per-extent overhead is fixed size.
-   2) Per-operation metadata is fixed size.
-
-   This is simple enough to realize with two main structures:
-   - extent: contains a pointer to the pin owning it and intrusive list
-             pointers to other extents owned by the same pin
-   - pin_state: contains the list head for extents owned by it
-
-   This works as long as we only need to remember one "owner" for
-   each extent.  To make this work, we'll need to leverage some
-   invariants guaranteed by higher layers:
-
-   1) Writes on a particular object must be ordered
-   2) A particular object will have outstanding reads or writes, but not
-      both (note that you can have a read while a write is committed, but
-      not applied).
-
-   Our strategy therefore will be to have whichever in-progress op will
-   finish "last" be the owner of a particular extent.  For now, we won't
-   cache reads, so 2) simply means that we can assume that reads and
-   recovery operations imply no unstable extents on the object in
-   question.
-
-   Write: WaitRead -> WaitCommit -> Complete
-
-   Invariant 1) above actually indicates that we can't have writes
-   bypassing the WaitRead state while there are writes waiting on
-   Reads.  Thus, the set of operations pinning a particular extent
-   must always complete in order or arrival.
-
-   This suggests that a particular extent may be in only the following
-   states:
-
-
-   0) Empty (not in the map at all)
-   1) Write Pending N
-      - Some write with reqid <= N is currently fetching the data for
-        this extent
-      - The extent must persist until Write reqid N completes
-      - All ops pinning this extent are writes in the WaitRead state of
-        the Write pipeline (there must be an in progress write, so no
-       reads can be in progress).
-   2) Write Pinned N:
-      - This extent has data corresponding to some reqid M <= N
-      - The extent must persist until Write reqid N commits
-      - All ops pinning this extent are writes in some Write
-        state (all are possible).  Reads are not possible
-       in this state (or the others) due to 2).
-
-   All of the above suggests that there are 3 things users can
-   ask of the cache corresponding to the 3 Write pipelines
-   states.
- */
-
-/// If someone wants these types, but not ExtentCache, move to another file
-struct bl_split_merge {
-  ceph::buffer::list split(
-    uint64_t offset,
-    uint64_t length,
-    ceph::buffer::list &bl) const {
-    ceph::buffer::list out;
-    out.substr_of(bl, offset, length);
-    return out;
-  }
-  bool can_merge(const ceph::buffer::list &left, const ceph::buffer::list &right) const {
-    return true;
-  }
-  ceph::buffer::list merge(ceph::buffer::list &&left, ceph::buffer::list &&right) const {
-    ceph::buffer::list bl{std::move(left)};
-    bl.claim_append(right);
-    return bl;
-  }
-  uint64_t length(const ceph::buffer::list &b) const { return b.length(); }
-};
-using extent_set = interval_set<uint64_t>;
-using extent_map = interval_map<uint64_t, ceph::buffer::list, bl_split_merge>;
-
-class ExtentCache {
-  struct object_extent_set;
-  struct pin_state;
-private:
-
-  struct extent {
-    object_extent_set *parent_extent_set = nullptr;
-    pin_state *parent_pin_state = nullptr;
-    boost::intrusive::set_member_hook<> extent_set_member;
-    boost::intrusive::list_member_hook<> pin_list_member;
-
-    uint64_t offset;
-    uint64_t length;
-    std::optional<ceph::buffer::list> bl;
-
-    uint64_t get_length() const {
-      return length;
-    }
-
-    bool is_pending() const {
-      return bl == std::nullopt;
-    }
-
-    bool pinned_by_write() const {
-      ceph_assert(parent_pin_state);
-      return parent_pin_state->is_write();
-    }
-
-    uint64_t pin_tid() const {
-      ceph_assert(parent_pin_state);
-      return parent_pin_state->tid;
-    }
-
-    extent(uint64_t offset, ceph::buffer::list _bl)
-      : offset(offset), length(_bl.length()), bl(_bl) {}
-
-    extent(uint64_t offset, uint64_t length)
-      : offset(offset), length(length) {}
-
-    bool operator<(const extent &rhs) const {
-      return offset < rhs.offset;
-    }
-  private:
-    // can briefly violate the two link invariant, used in unlink() and move()
-    void _link_pin_state(pin_state &pin_state);
-    void _unlink_pin_state();
-  public:
-    void unlink();
-    void link(object_extent_set &parent_extent_set, pin_state &pin_state);
-    void move(pin_state &to);
-  };
-
-  struct object_extent_set : boost::intrusive::set_base_hook<> {
-    hobject_t oid;
-    explicit object_extent_set(const hobject_t &oid) : oid(oid) {}
-
-    using set_member_options = boost::intrusive::member_hook<
-      extent,
-      boost::intrusive::set_member_hook<>,
-      &extent::extent_set_member>;
-    using set = boost::intrusive::set<extent, set_member_options>;
-    set extent_set;
-
-    bool operator<(const object_extent_set &rhs) const {
-      return oid < rhs.oid;
-    }
-
-    struct uint_cmp {
-      bool operator()(uint64_t lhs, const extent &rhs) const {
-       return lhs < rhs.offset;
-      }
-      bool operator()(const extent &lhs, uint64_t rhs) const {
-       return lhs.offset < rhs;
-      }
-    };
-    std::pair<set::iterator, set::iterator> get_containing_range(
-      uint64_t offset, uint64_t length);
-
-    void erase(uint64_t offset, uint64_t length);
-
-    struct update_action {
-      enum type {
-       NONE,
-       UPDATE_PIN
-      };
-      type action = NONE;
-      std::optional<ceph::buffer::list> bl;
-    };
-    template <typename F>
-    void traverse_update(
-      pin_state &pin,
-      uint64_t offset,
-      uint64_t length,
-      F &&f) {
-      auto range = get_containing_range(offset, length);
-
-      if (range.first == range.second || range.first->offset > offset) {
-       uint64_t extlen = range.first == range.second ?
-         length : range.first->offset - offset;
-
-       update_action action;
-       f(offset, extlen, nullptr, &action);
-       ceph_assert(!action.bl || action.bl->length() == extlen);
-       if (action.action == update_action::UPDATE_PIN) {
-         extent *ext = action.bl ?
-           new extent(offset, *action.bl) :
-           new extent(offset, extlen);
-         ext->link(*this, pin);
-       } else {
-         ceph_assert(!action.bl);
-       }
-      }
-
-      for (auto p = range.first; p != range.second;) {
-       extent *ext = &*p;
-       ++p;
-
-       uint64_t extoff = std::max(ext->offset, offset);
-       uint64_t extlen = std::min(
-         ext->length - (extoff - ext->offset),
-         offset + length - extoff);
-
-       update_action action;
-       f(extoff, extlen, ext, &action);
-       ceph_assert(!action.bl || action.bl->length() == extlen);
-       extent *final_extent = nullptr;
-       if (action.action == update_action::NONE) {
-         final_extent = ext;
-       } else {
-         pin_state *ps = ext->parent_pin_state;
-         ext->unlink();
-         if ((ext->offset < offset) &&
-             (ext->offset + ext->get_length() > offset)) {
-           extent *head = nullptr;
-           if (ext->bl) {
-             ceph::buffer::list bl;
-             bl.substr_of(
-               *(ext->bl),
-               0,
-               offset - ext->offset);
-             head = new extent(ext->offset, bl);
-           } else {
-             head = new extent(
-               ext->offset, offset - ext->offset);
-           }
-           head->link(*this, *ps);
-         }
-         if ((ext->offset + ext->length > offset + length) &&
-             (offset + length > ext->offset)) {
-           uint64_t nlen =
-             (ext->offset + ext->get_length()) - (offset + length);
-           extent *tail = nullptr;
-           if (ext->bl) {
-             ceph::buffer::list bl;
-             bl.substr_of(
-               *(ext->bl),
-               ext->get_length() - nlen,
-               nlen);
-             tail = new extent(offset + length, bl);
-           } else {
-             tail = new extent(offset + length, nlen);
-           }
-           tail->link(*this, *ps);
-         }
-         if (action.action == update_action::UPDATE_PIN) {
-           if (ext->bl) {
-             ceph::buffer::list bl;
-             bl.substr_of(
-               *(ext->bl),
-               extoff - ext->offset,
-               extlen);
-             final_extent = new ExtentCache::extent(
-               extoff,
-               bl);
-           } else {
-             final_extent = new ExtentCache::extent(
-               extoff, extlen);
-           }
-           final_extent->link(*this, pin);
-         }
-         delete ext;
-       }
-
-       if (action.bl) {
-         ceph_assert(final_extent);
-         ceph_assert(final_extent->length == action.bl->length());
-         final_extent->bl = *(action.bl);
-       }
-
-       uint64_t next_off = p == range.second ?
-         offset + length : p->offset;
-       if (extoff + extlen < next_off) {
-         uint64_t tailoff = extoff + extlen;
-         uint64_t taillen = next_off - tailoff;
-
-         update_action action;
-         f(tailoff, taillen, nullptr, &action);
-         ceph_assert(!action.bl || action.bl->length() == taillen);
-         if (action.action == update_action::UPDATE_PIN) {
-           extent *ext = action.bl ?
-             new extent(tailoff, *action.bl) :
-             new extent(tailoff, taillen);
-           ext->link(*this, pin);
-         } else {
-           ceph_assert(!action.bl);
-         }
-       }
-      }
-    }
-  };
-  struct Cmp {
-    bool operator()(const hobject_t &oid, const object_extent_set &rhs) const {
-      return oid < rhs.oid;
-    }
-    bool operator()(const object_extent_set &lhs, const hobject_t &oid) const {
-      return lhs.oid < oid;
-    }
-  };
-
-  object_extent_set &get_or_create(const hobject_t &oid);
-  object_extent_set *get_if_exists(const hobject_t &oid);
-
-  void remove_and_destroy_if_empty(object_extent_set &set);
-  using cache_set = boost::intrusive::set<object_extent_set>;
-  cache_set per_object_caches;
-
-  uint64_t next_write_tid = 1;
-  uint64_t next_read_tid = 1;
-  struct pin_state {
-    uint64_t tid = 0;
-    enum pin_type_t {
-      NONE,
-      WRITE,
-    };
-    pin_type_t pin_type = NONE;
-    bool is_write() const { return pin_type == WRITE; }
-
-    pin_state(const pin_state &other) = delete;
-    pin_state &operator=(const pin_state &other) = delete;
-    pin_state(pin_state &&other) = delete;
-    pin_state() = default;
-
-    using list_member_options = boost::intrusive::member_hook<
-      extent,
-      boost::intrusive::list_member_hook<>,
-      &extent::pin_list_member>;
-    using list = boost::intrusive::list<extent, boost::intrusive::constant_time_size<false>, list_member_options>;
-    list pin_list;
-    ~pin_state() {
-      ceph_assert(pin_list.empty());
-      ceph_assert(tid == 0);
-      ceph_assert(pin_type == NONE);
-    }
-    void _open(uint64_t in_tid, pin_type_t in_type) {
-      ceph_assert(pin_type == NONE);
-      ceph_assert(in_tid > 0);
-      tid = in_tid;
-      pin_type = in_type;
-    }
-  };
-
-  void release_pin(pin_state &p) {
-    for (auto iter = p.pin_list.begin(); iter != p.pin_list.end(); ) {
-      std::unique_ptr<extent> extent(&*iter); // we now own this
-      iter++; // unlink will invalidate
-      ceph_assert(extent->parent_extent_set);
-      auto &eset = *(extent->parent_extent_set);
-      extent->unlink();
-      remove_and_destroy_if_empty(eset);
-    }
-    p.tid = 0;
-    p.pin_type = pin_state::NONE;
-  }
-
-public:
-  class write_pin : private pin_state {
-    friend class ExtentCache;
-  private:
-    void open(uint64_t in_tid) {
-      _open(in_tid, pin_state::WRITE);
-    }
-  public:
-    write_pin() : pin_state() {}
-  };
-
-  void open_write_pin(write_pin &pin) {
-    pin.open(next_write_tid++);
-  }
-
-  /**
-   * Reserves extents required for rmw, and learn
-   * which need to be read
-   *
-   * Pins all extents in to_write.  Returns subset of to_read not
-   * currently present in the cache.  Caller must obtain those
-   * extents before calling get_remaining_extents_for_rmw.
-   *
-   * Transition table:
-   * - Empty -> Write Pending pin.reqid
-   * - Write Pending N -> Write Pending pin.reqid
-   * - Write Pinned N -> Write Pinned pin.reqid
-   *
-   * @param oid [in] object undergoing rmw
-   * @param pin [in,out] pin to use (obtained from create_write_pin)
-   * @param to_write [in] extents which will be written
-   * @param to_read [in] extents to read prior to write (must be subset
-   *                     of to_write)
-   * @return subset of to_read which isn't already present or pending
-   */
-  extent_set reserve_extents_for_rmw(
-    const hobject_t &oid,
-    write_pin &pin,
-    const extent_set &to_write,
-    const extent_set &to_read);
-
-  /**
-   * Gets extents required for rmw not returned from
-   * reserve_extents_for_rmw
-   *
-   * Requested extents (to_get) must be the set to_read \ the set
-   * returned from reserve_extents_for_rmw.  No transition table,
-   * all extents at this point must be present and already pinned
-   * for this pin by reserve_extents_for_rmw.
-   *
-   * @param oid [in] object
-   * @param pin [in,out] pin associated with this IO
-   * @param to_get [in] extents to get (see above for restrictions)
-   * @return map of buffers from to_get
-   */
-  extent_map get_remaining_extents_for_rmw(
-    const hobject_t &oid,
-    write_pin &pin,
-    const extent_set &to_get);
-
-  /**
-   * Updates the cache to reflect the rmw write
-   *
-   * All presented extents must already have been specified in
-   * reserve_extents_for_rmw under to_write.
-   *
-   * Transition table:
-   * - Empty -> invalid, must call reserve_extents_for_rmw first
-   * - Write Pending N -> Write Pinned N, update buffer
-   *     (assert N >= pin.reqid)
-   * - Write Pinned N -> Update buffer (assert N >= pin.reqid)
-   *
-   * @param oid [in] object
-   * @param pin [in,out] pin associated with this IO
-   * @param extents [in] map of buffers to update
-   * @return void
-   */
-  void present_rmw_update(
-    const hobject_t &oid,
-    write_pin &pin,
-    const extent_map &extents);
-
-  /**
-   * Release all buffers pinned by pin
-   */
-  void release_write_pin(
-    write_pin &pin) {
-    release_pin(pin);
-  }
-
-  std::ostream &print(std::ostream &out) const;
-};
-
-std::ostream &operator <<(std::ostream &lhs, const ExtentCache &cache);
\ No newline at end of file
index ad5082bd8c39f2021edfac65b7c95727be295c74..e53c55c15cc63694bba2b7d44931d1cef00eb4a1 100644 (file)
@@ -17,6 +17,7 @@
 
 #include <errno.h>
 #include <stdlib.h>
+
 #include "erasure-code/ErasureCodePlugin.h"
 #include "log/Log.h"
 #include "global/global_context.h"
@@ -62,6 +63,80 @@ TEST(ErasureCodePlugin, factory)
   }
 }
 
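+// Test helper: build a 4 KiB zeroed buffer whose first bytes encode `value`,
+// so each shard's contents are distinguishable.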
+bufferptr create_bufferptr(uint64_t value) {
+  bufferlist bl;
+  bl.append_zero(4096);
+  memcpy(bl.c_str(), &value, sizeof(value));
+  return bl.begin().get_current_ptr();
+}
+
+TEST(ErasureCodePlugin, parity_delta_write) {
+  ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
+  ErasureCodeInterfaceRef erasure_code;
+  ErasureCodeProfile profile;
+  profile["technique"] = "reed_sol_van";
+  profile["k"] = "5";
+  int k = 5;
+  profile["m"] = "3";
+  int m = 3;
+  EXPECT_EQ(0, instance.factory("jerasure",
+                              g_conf().get_val<std::string>("erasure_code_dir"),
+                              profile,
+                              &erasure_code, &cerr));
+  shard_id_map<bufferptr> data(8);
+  shard_id_map<bufferptr> coding(8);
+  shard_id_map<bufferptr> coding2(8);
+  shard_id_map<bufferptr> decode_in(8);
+  shard_id_map<bufferptr> decode_out(8);
+
+  uint32_t seeds[] = {100, 101, 102, 103, 104};
+  uint32_t overwrite3 = 1032;
+
+  for (shard_id_t s; s < k; ++s) {
+    data[s] = create_bufferptr(seeds[int(s)]);
+  }
+  for (shard_id_t s(k); s < k + m; ++s) {
+    coding[s] = create_bufferptr(-1);
+    coding2[s] = create_bufferptr(-1);
+  }
+
+  // Do a normal encode.
+  erasure_code->encode_chunks(data, coding);
+
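+  // Parity delta write: rather than re-encoding all k data chunks, compute a
+  // delta between the old and new contents of shard 3 and apply it to the
+  // existing parity chunks. The result should match a full re-encode, which
+  // is verified below.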
+  shard_id_map<bufferptr> delta(8);
+  delta[shard_id_t(3)] = create_bufferptr(-1);
+
+  bufferptr overwrite_bp = create_bufferptr(overwrite3);
+
+  erasure_code->encode_delta(data[shard_id_t(3)], overwrite_bp, &delta[shard_id_t(3)]);
+  erasure_code->apply_delta(delta, coding);
+  data[shard_id_t(3)] = overwrite_bp;
+
+  erasure_code->encode_chunks(data, coding2);
+
+  for (shard_id_t s(k); s < k + m; ++s) {
+    ASSERT_EQ(*(uint32_t*)coding[s].c_str(), *(uint32_t*)coding2[s].c_str());
+  }
+
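+  // Now discard shard 4 and recover it by decoding from the remaining four
+  // data shards plus one parity shard, confirming the delta-updated parity
+  // is usable for recovery.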
+  data.erase(shard_id_t(4));
+  data.emplace(shard_id_t(4), ceph::buffer::create(4096));
+  shard_id_set want;
+  want.insert_range(shard_id_t(0), 5);
+  decode_in[shard_id_t(0)] = data[shard_id_t(0)];
+  decode_in[shard_id_t(1)] = data[shard_id_t(1)];
+  decode_in[shard_id_t(2)] = data[shard_id_t(2)];
+  decode_in[shard_id_t(3)] = data[shard_id_t(3)];
+  decode_out[shard_id_t(4)] = data[shard_id_t(4)];
+  decode_in[shard_id_t(6)] = coding[shard_id_t(6)];
+
+  ASSERT_EQ(0, erasure_code->decode_chunks(want, decode_in, decode_out));
+
+  seeds[3] = overwrite3;
+  for (shard_id_t s(0); s < k; ++s) {
+    ASSERT_EQ(seeds[int(s)], *(uint32_t*)data[s].c_str());
+  }
+}
+
 /*
  * Local Variables:
  * compile-command: "cd ../.. ; make -j4 &&
index bcaf9b54bf26a8353139f5161c3aa1069cac7a8b..3661d91dffea19000b578038f2aeacd2a265cf07 100644 (file)
@@ -76,6 +76,14 @@ add_executable(unittest_ecbackend
 add_ceph_unittest(unittest_ecbackend)
 target_link_libraries(unittest_ecbackend osd global)
 
+# unittest_ecutil
+add_executable(unittest_ecutil
+        TestECUtil.cc
+        $<TARGET_OBJECTS:unit-main>
+)
+add_ceph_unittest(unittest_ecutil)
+target_link_libraries(unittest_ecutil osd global)
+
 # unittest_osdscrub
 add_executable(unittest_osdscrub
   TestOSDScrub.cc
index c979e84a518e19974d0cba84183dfec7446f2e45..ec875abdb9e6910c519fd9b62a3247376d4d8a1f 100644 (file)
@@ -19,6 +19,9 @@
 #include "osd/ECCommon.h"
 #include "osd/ECBackend.h"
 #include "gtest/gtest.h"
+#include "osd/osd_types.h"
+#include "common/ceph_argparse.h"
+#include "erasure-code/ErasureCode.h"
 
 using namespace std;
 
@@ -31,216 +34,1066 @@ TEST(ECUtil, stripe_info_t)
   ECUtil::stripe_info_t s(k, m, swidth);
   ASSERT_EQ(s.get_stripe_width(), swidth);
 
-  ASSERT_EQ(s.logical_to_next_chunk_offset(0), 0u);
-  ASSERT_EQ(s.logical_to_next_chunk_offset(1), s.get_chunk_size());
-  ASSERT_EQ(s.logical_to_next_chunk_offset(swidth - 1),
+  ASSERT_EQ(s.ro_offset_to_next_chunk_offset(0), 0u);
+  ASSERT_EQ(s.ro_offset_to_next_chunk_offset(1), s.get_chunk_size());
+  ASSERT_EQ(s.ro_offset_to_next_chunk_offset(swidth - 1),
            s.get_chunk_size());
 
-  ASSERT_EQ(s.logical_to_prev_chunk_offset(0), 0u);
-  ASSERT_EQ(s.logical_to_prev_chunk_offset(swidth), s.get_chunk_size());
-  ASSERT_EQ(s.logical_to_prev_chunk_offset((swidth * 2) - 1),
+  ASSERT_EQ(s.ro_offset_to_prev_chunk_offset(0), 0u);
+  ASSERT_EQ(s.ro_offset_to_prev_chunk_offset(swidth), s.get_chunk_size());
+  ASSERT_EQ(s.ro_offset_to_prev_chunk_offset((swidth * 2) - 1),
            s.get_chunk_size());
 
-  ASSERT_EQ(s.logical_to_next_stripe_offset(0), 0u);
-  ASSERT_EQ(s.logical_to_next_stripe_offset(swidth - 1),
+  ASSERT_EQ(s.ro_offset_to_next_stripe_ro_offset(0), 0u);
+  ASSERT_EQ(s.ro_offset_to_next_stripe_ro_offset(swidth - 1),
            s.get_stripe_width());
 
-  ASSERT_EQ(s.logical_to_prev_stripe_offset(swidth), s.get_stripe_width());
-  ASSERT_EQ(s.logical_to_prev_stripe_offset(swidth), s.get_stripe_width());
-  ASSERT_EQ(s.logical_to_prev_stripe_offset((swidth * 2) - 1),
+  ASSERT_EQ(s.ro_offset_to_prev_stripe_ro_offset(swidth), s.get_stripe_width());
+  ASSERT_EQ(s.ro_offset_to_prev_stripe_ro_offset(swidth), s.get_stripe_width());
+  ASSERT_EQ(s.ro_offset_to_prev_stripe_ro_offset((swidth * 2) - 1),
            s.get_stripe_width());
 
-  ASSERT_EQ(s.aligned_logical_offset_to_chunk_offset(2*swidth),
+  ASSERT_EQ(s.aligned_ro_offset_to_chunk_offset(2*swidth),
            2*s.get_chunk_size());
-  ASSERT_EQ(s.aligned_chunk_offset_to_logical_offset(2*s.get_chunk_size()),
+  ASSERT_EQ(s.chunk_aligned_shard_offset_to_ro_offset(2*s.get_chunk_size()),
            2*s.get_stripe_width());
 
   // Stripe 1 + 1 chunk for 10 stripes needs to read 11 stripes starting
   // from 1 because there is a partial stripe at the start and end
-  ASSERT_EQ(s.chunk_aligned_offset_len_to_chunk(
-             make_pair(swidth+s.get_chunk_size(), 10*swidth)),
+  ASSERT_EQ(s.chunk_aligned_ro_range_to_shard_ro_range(swidth+s.get_chunk_size(), 10*swidth),
            make_pair(s.get_chunk_size(), 11*s.get_chunk_size()));
 
   // Stripe 1 + 0 chunks for 10 stripes needs to read 10 stripes starting
   // from 1 because there are no partial stripes
-  ASSERT_EQ(s.chunk_aligned_offset_len_to_chunk(make_pair(swidth, 10*swidth)),
+  ASSERT_EQ(s.chunk_aligned_ro_range_to_shard_ro_range(swidth, 10*swidth),
            make_pair(s.get_chunk_size(), 10*s.get_chunk_size()));
 
   // Stripe 0 + 1 chunk for 10 stripes needs to read 11 stripes starting
   // from 0 because there is a partial stripe at the start and end
-  ASSERT_EQ(s.chunk_aligned_offset_len_to_chunk(make_pair(s.get_chunk_size(), 10*swidth)),
+  ASSERT_EQ(s.chunk_aligned_ro_range_to_shard_ro_range(s.get_chunk_size(), 10*swidth),
            make_pair<uint64_t>(0, 11*s.get_chunk_size()));
 
   // Stripe 0 + 1 chunk for (10 stripes + 1 chunk) needs to read 11 stripes
   // starting from 0 because there is a partial stripe at the start and end
-  ASSERT_EQ(s.chunk_aligned_offset_len_to_chunk(make_pair(s.get_chunk_size(),
-                                                         10*swidth + s.get_chunk_size())),
+  ASSERT_EQ(s.chunk_aligned_ro_range_to_shard_ro_range(s.get_chunk_size(),
+                                                         10*swidth + s.get_chunk_size()),
            make_pair<uint64_t>(0, 11*s.get_chunk_size()));
 
   // Stripe 0 + 2 chunks for (10 stripes + 2 chunks) needs to read 11 stripes
   // starting from 0 because there is a partial stripe at the start
-  ASSERT_EQ(s.chunk_aligned_offset_len_to_chunk(make_pair(2*s.get_chunk_size(),
-                                                         10*swidth + 2*s.get_chunk_size())),
-           make_pair<uint64_t>(0, 11*s.get_chunk_size()));
+  ASSERT_EQ(s.chunk_aligned_ro_range_to_shard_ro_range(2*s.get_chunk_size(),
+    10*swidth + 2*s.get_chunk_size()),
+    make_pair<uint64_t>(0, 11*s.get_chunk_size()));
 
-  ASSERT_EQ(s.offset_len_to_stripe_bounds(make_pair(swidth-10, (uint64_t)20)),
+  ASSERT_EQ(s.ro_offset_len_to_stripe_ro_offset_len(swidth-10, (uint64_t)20),
             make_pair((uint64_t)0, 2*swidth));
 }
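+
+// Worked example for the ro_offset_* helpers above, assuming the plain
+// RAID4-style layout these tests use (no chunk remapping):
+//
+//   raw_shard    = (ro_offset % stripe_width) / chunk_size
+//   shard_offset = (ro_offset / stripe_width) * chunk_size
+//                  + (ro_offset % chunk_size)
+//
+// e.g. with k=4 and chunk_size=1024 (stripe_width=4096), ro offset 5120
+// falls in stripe 1, chunk 0, so it lands on raw shard 1's... more
+// precisely: 5120 % 4096 = 1024 -> raw shard 1, at shard offset
+// (5120 / 4096) * 1024 + (5120 % 1024) = 1024.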
 
-TEST(ECUtil, offset_length_is_same_stripe)
-{
-  const uint64_t swidth = 4096;
-  const uint64_t schunk = 1024;
-  const unsigned int k = 4;
-  const unsigned int m = 2;
+class ErasureCodeDummyImpl : public ErasureCodeInterface {
+public:
 
-  ECUtil::stripe_info_t s(k, m, swidth);
-  ASSERT_EQ(s.get_stripe_width(), swidth);
-  ASSERT_EQ(s.get_chunk_size(), schunk);
+  uint64_t get_supported_optimizations() const override {
+    return FLAG_EC_PLUGIN_PARTIAL_READ_OPTIMIZATION |
+          FLAG_EC_PLUGIN_PARTIAL_WRITE_OPTIMIZATION |
+          FLAG_EC_PLUGIN_ZERO_INPUT_ZERO_OUTPUT_OPTIMIZATION |
+          FLAG_EC_PLUGIN_ZERO_PADDING_OPTIMIZATION |
+          FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION;
+  }
 
-  // read nothing at the very beginning
-  //   +---+---+---+---+
-  //   |  0|   |   |   |
-  //   +---+---+---+---+
-  //   |   |   |   |   |
-  //   +---+---+---+---+
-  ASSERT_TRUE(s.offset_length_is_same_stripe(0, 0));
-
-  // read nothing at the stripe end
-  //   +---+---+---+---+
-  //   |   |   |   |  0|
-  //   +---+---+---+---+
-  //   |   |   |   |   |
-  //   +---+---+---+---+
-  ASSERT_TRUE(s.offset_length_is_same_stripe(swidth, 0));
-
-  // read single byte at the stripe end
-  //   +---+---+---+---+
-  //   |   |   |   | ~1|
-  //   +---+---+---+---+
-  //   |   |   |   |   |
-  //   +---+---+---+---+
-  ASSERT_TRUE(s.offset_length_is_same_stripe(swidth - 1, 1));
-
-  // read single stripe
-  //   +---+---+---+---+
-  //   | 1k| 1k| 1k| 1k|
-  //   +---+---+---+---+
-  //   |   |   |   |   |
-  //   +---+---+---+---+
-  ASSERT_TRUE(s.offset_length_is_same_stripe(0, swidth));
-
-  // read single chunk
-  //   +---+---+---+---+
-  //   | 1k|   |   |   |
-  //   +---+---+---+---+
-  //   |   |   |   |   |
-  //   +---+---+---+---+
-  ASSERT_TRUE(s.offset_length_is_same_stripe(0, schunk));
-
-  // read single stripe except its first chunk
-  //   +---+---+---+---+
-  //   |   | 1k| 1k| 1k|
-  //   +---+---+---+---+
-  //   |   |   |   |   |
-  //   +---+---+---+---+
-  ASSERT_TRUE(s.offset_length_is_same_stripe(schunk, swidth - schunk));
-
-  // read two stripes
-  //   +---+---+---+---+
-  //   | 1k| 1k| 1k| 1k|
-  //   +---+---+---+---+
-  //   | 1k| 1k| 1k| 1k|
-  //   +---+---+---+---+
-  ASSERT_FALSE(s.offset_length_is_same_stripe(0, 2*swidth));
-
-  // multistripe read: 1st stripe without 1st byte + 1st byte of 2nd stripe
-  //   +-----+---+---+---+
-  //   | 1k-1| 1k| 1k| 1k|
-  //   +-----+---+---+---+
-  //   |    1|   |   |   |
-  //   +-----+---+---+---+
-  ASSERT_FALSE(s.offset_length_is_same_stripe(1, swidth));
-}
+  ErasureCodeProfile _profile;
+  const std::vector<shard_id_t> chunk_mapping = {}; // no remapping
+  std::vector<std::pair<int, int>> default_sub_chunk = {std::pair(0,1)};
+  int data_chunk_count = 4;
+  int chunk_count = 6;
+
+  int init(ErasureCodeProfile &profile, std::ostream *ss) override {
+    return 0;
+  }
+
+  const ErasureCodeProfile &get_profile() const override {
+    return _profile;
+  }
+
+  int create_rule(const string &name, CrushWrapper &crush, std::ostream *ss) const override {
+    return 0;
+  }
+
+  unsigned int get_chunk_count() const override {
+    return chunk_count;
+  }
+
+  unsigned int get_data_chunk_count() const override {
+    return data_chunk_count;
+  }
+
+  unsigned int get_coding_chunk_count() const override {
+    return 0;
+  }
+
+  int get_sub_chunk_count() override {
+    return 1;
+  }
+
+  unsigned int get_chunk_size(unsigned int stripe_width) const override {
+    return 0;
+  }
+
+  int minimum_to_decode(const shard_id_set &want_to_read, const shard_id_set &available,
+                        shard_id_set &minimum_set,
+                       shard_id_map<std::vector<std::pair<int, int>>> *minimum_sub_chunks) override {
+    shard_id_t parity_shard_index(data_chunk_count);
+    for (shard_id_t shard : want_to_read) {
+      if (available.contains(shard)) {
+        minimum_set.insert(shard);
+      } else {
+        // Shard is missing.  Recover with every other shard and one parity
+        // for each missing shard.
+        for (shard_id_t i; i<data_chunk_count; ++i) {
+          if (available.contains(i)) {
+            minimum_set.insert(i);
+          } else {
+            minimum_set.insert(parity_shard_index);
+            ++parity_shard_index;
+          }
+
+          if (int(parity_shard_index) == chunk_count)
+            return -EIO; // Cannot recover.
+        }
+      }
+    }
+
+    for (auto &&shard : minimum_set) {
+      minimum_sub_chunks->emplace(shard, default_sub_chunk);
+    }
+    return 0;
+  }
+
+  [[deprecated]]
+  int minimum_to_decode(const std::set<int> &want_to_read,
+    const std::set<int> &available,
+    std::map<int, std::vector<std::pair<int, int>>> *minimum) override
+  {
+    ADD_FAILURE();
+    return 0;
+  }
+
+  [[deprecated]]
+  int minimum_to_decode_with_cost(const std::set<int> &want_to_read,
+      const std::map<int, int> &available, std::set<int> *minimum) override {
+    ADD_FAILURE();
+    return 0;
+  }
+
+  int minimum_to_decode_with_cost(const shard_id_set &want_to_read, const shard_id_map<int> &available,
+                                shard_id_set *minimum) override {
+    return 0;
+  }
+
+  int encode(const shard_id_set &want_to_encode, const bufferlist &in, shard_id_map<bufferlist> *encoded) override {
+    return 0;
+  }
+
+  [[deprecated]]
+  int encode(const std::set<int> &want_to_encode, const bufferlist &in
+    , std::map<int, bufferlist> *encoded) override
+  {
+    ADD_FAILURE();
+    return 0;
+  }
+
+  [[deprecated]]
+  int encode_chunks(const std::set<int> &want_to_encode,
+                    std::map<int, bufferlist> *encoded) override
+  {
+    ADD_FAILURE();
+    return 0;
+  }
+
+  int encode_chunks(const shard_id_map<bufferptr> &in, shard_id_map<bufferptr> &out) override {
+    return 0;
+  }
+
+  int decode(const shard_id_set &want_to_read, const shard_id_map<bufferlist> &chunks, shard_id_map<bufferlist> *decoded,
+            int chunk_size) override {
+    return 0;
+  }
+
+  [[deprecated]]
+  int decode(const std::set<int> &want_to_read, const std::map<int, bufferlist> &chunks,
+    std::map<int, bufferlist> *decoded, int chunk_size) override
+  {
+    ADD_FAILURE();
+    return 0;
+  }
+
+  [[deprecated]]
+  int decode_chunks(const std::set<int> &want_to_read,
+                    const std::map<int, bufferlist> &chunks,
+                    std::map<int, bufferlist> *decoded) override {
+    ADD_FAILURE();
+    return 0;
+  }
+
+  int decode_chunks(const shard_id_set &want_to_read,
+                    shard_id_map<bufferptr> &in, shard_id_map<bufferptr> &out) override
+  {
+    if (in.size() < data_chunk_count) {
+      ADD_FAILURE();
+    }
+    uint64_t len = 0;
+    for (auto &&[shard, bp] : in) {
+      if (len == 0) {
+        len = bp.length();
+      } else if (len != bp.length()) {
+        ADD_FAILURE();
+      }
+    }
+    if (len == 0) {
+      ADD_FAILURE();
+    }
+    if (out.size() == 0) {
+      ADD_FAILURE();
+    }
+    for (auto &&[shard, bp] : out) {
+      if (len != bp.length()) {
+        ADD_FAILURE();
+      }
+    }
+    return 0;
+  }
+
+  const vector<shard_id_t> &get_chunk_mapping() const override {
+    return chunk_mapping;
+  }
+
+  [[deprecated]]
+  int decode_concat(const std::set<int> &want_to_read,
+                    const std::map<int, bufferlist> &chunks, bufferlist *decoded) override {
+    ADD_FAILURE();
+    return 0;
+  }
+
+  [[deprecated]]
+  int decode_concat(const std::map<int, bufferlist> &chunks,
+                    bufferlist *decoded) override {
+    ADD_FAILURE();
+    return 0;
+  }
+
+  size_t get_minimum_granularity() override { return 0; }
+  void encode_delta(const bufferptr &old_data, const bufferptr &new_data
+    , bufferptr *delta) override {}
+  void apply_delta(const shard_id_map<bufferptr> &in
+    , shard_id_map<bufferptr> &out) override {}
+};
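+
+// Recovery policy of the dummy plugin above, by example (k=4, m=2):
+// want={0,1,2,3}, available={0,2,3,4,5} makes minimum_to_decode() return
+// {0,2,3,4} -- every surviving data shard plus one parity shard standing
+// in for the missing shard 1.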
+
+class ECListenerStub : public ECListener {
+  OSDMapRef osd_map_ref;
+  pg_info_t pg_info;
+  set<pg_shard_t> backfill_shards;
+  shard_id_set backfill_shard_id_set;
+  map<hobject_t, set<pg_shard_t>> missing_loc_shards;
+  map<pg_shard_t, pg_missing_t> shard_missing;
+  pg_missing_set<false> shard_not_missing_const;
+  pg_pool_t pg_pool;
+  set<pg_shard_t> acting_recovery_backfill_shards;
+  shard_id_set acting_recovery_backfill_shard_id_set;
+  map<pg_shard_t, pg_info_t> shard_info;
+  PGLog pg_log;
+  pg_info_t shard_pg_info;
+  std::string dbg_prefix = "stub";
+
+public:
+  set<pg_shard_t> acting_shards;
+
+  ECListenerStub()
+    : pg_log(NULL) {}
+
+  const OSDMapRef &pgb_get_osdmap() const override {
+    return osd_map_ref;
+  }
+
+  epoch_t pgb_get_osdmap_epoch() const override {
+    return 0;
+  }
+
+  const pg_info_t &get_info() const override {
+    return pg_info;
+  }
+
+  void cancel_pull(const hobject_t &soid) override {
+
+  }
+
+  pg_shard_t primary_shard() const override {
+    return pg_shard_t();
+  }
+
+  bool pgb_is_primary() const override {
+    return false;
+  }
+
+  void on_failed_pull(const set<pg_shard_t> &from, const hobject_t &soid, const eversion_t &v) override {
+
+  }
+
+  void
+  on_local_recover(const hobject_t &oid, const ObjectRecoveryInfo &recovery_info, ObjectContextRef obc, bool is_delete,
+                  ceph::os::Transaction *t) override {
+
+  }
+
+  void on_global_recover(const hobject_t &oid, const object_stat_sum_t &stat_diff, bool is_delete) override {
+
+  }
+
+  void on_peer_recover(pg_shard_t peer, const hobject_t &oid, const ObjectRecoveryInfo &recovery_info) override {
+
+  }
 
+  void begin_peer_recover(pg_shard_t peer, const hobject_t oid) override {
+
+  }
+
+  bool pg_is_repair() const override {
+    return false;
+  }
+
+  ObjectContextRef
+  get_obc(const hobject_t &hoid, const map<std::string, ceph::buffer::list, std::less<>> &attrs) override {
+    return ObjectContextRef();
+  }
+
+  bool check_failsafe_full() override {
+    return false;
+  }
+
+  hobject_t get_temp_recovery_object(const hobject_t &target, eversion_t version) override {
+    return hobject_t();
+  }
+
+  bool pg_is_remote_backfilling() override {
+    return false;
+  }
+
+  void pg_add_local_num_bytes(int64_t num_bytes) override {
+
+  }
+
+  void pg_add_num_bytes(int64_t num_bytes) override {
+
+  }
+
+  void inc_osd_stat_repaired() override {
+
+  }
+
+  void add_temp_obj(const hobject_t &oid) override {
+
+  }
+
+  void clear_temp_obj(const hobject_t &oid) override {
+
+  }
+
+  epoch_t get_last_peering_reset_epoch() const override {
+    return 0;
+  }
+
+  GenContext<ThreadPool::TPHandle &> *bless_unlocked_gencontext(GenContext<ThreadPool::TPHandle &> *c) override {
+    return nullptr;
+  }
+
+  void schedule_recovery_work(GenContext<ThreadPool::TPHandle &> *c, uint64_t cost) override {
+
+  }
+
+  epoch_t get_interval_start_epoch() const override {
+    return 0;
+  }
+
+  const set<pg_shard_t> &get_acting_shards() const override {
+    return acting_shards;
+  }
+
+  const set<pg_shard_t> &get_backfill_shards() const override {
+    return backfill_shards;
+  }
+
+  const map<hobject_t, std::set<pg_shard_t>> &get_missing_loc_shards() const override {
+    return missing_loc_shards;
+  }
+
+  const map<pg_shard_t, pg_missing_t> &get_shard_missing() const override {
+    return shard_missing;
+  }
+
+  const pg_missing_const_i &get_shard_missing(pg_shard_t peer) const override {
+    return shard_not_missing_const;
+  }
+
+  const pg_missing_const_i *maybe_get_shard_missing(pg_shard_t peer) const override {
+    return nullptr;
+  }
+
+  const pg_info_t &get_shard_info(pg_shard_t peer) const override {
+    return shard_pg_info;
+  }
+
+  ceph_tid_t get_tid() override {
+    return 0;
+  }
+
+  pg_shard_t whoami_shard() const override {
+    return pg_shard_t();
+  }
+
+  void send_message_osd_cluster(vector<std::pair<int, Message *>> &messages, epoch_t from_epoch) override {
+
+  }
+
+  ostream &gen_dbg_prefix(ostream &out) const override {
+    out << dbg_prefix;
+    return out;
+  }
+
+  const pg_pool_t &get_pool() const override {
+    return pg_pool;
+  }
+
+  const set<pg_shard_t> &get_acting_recovery_backfill_shards() const override {
+    return acting_recovery_backfill_shards;
+  }
+
+  const shard_id_set &get_acting_recovery_backfill_shard_id_set() const override {
+    return acting_recovery_backfill_shard_id_set;
+  }
+
+  bool should_send_op(pg_shard_t peer, const hobject_t &hoid) override {
+    return false;
+  }
+
+  const map<pg_shard_t, pg_info_t> &get_shard_info() const override {
+    return shard_info;
+  }
+
+  spg_t primary_spg_t() const override {
+    return spg_t();
+  }
+
+  const PGLog &get_log() const override {
+    return pg_log;
+  }
+
+  DoutPrefixProvider *get_dpp() override {
+    return nullptr;
+  }
+
+  void apply_stats(const hobject_t &soid, const object_stat_sum_t &delta_stats) override {
+
+  }
+
+  bool is_missing_object(const hobject_t &oid) const override {
+    return false;
+  }
+
+  void add_local_next_event(const pg_log_entry_t &e) override {
+
+  }
+
+  void log_operation(vector<pg_log_entry_t> &&logv, const optional<pg_hit_set_history_t> &hset_history,
+                    const eversion_t &trim_to, const eversion_t &roll_forward_to,
+                    const eversion_t &min_last_complete_ondisk, bool transaction_applied, os::Transaction &t,
+                    bool async) override {
+
+  }
+
+  void op_applied(const eversion_t &applied_version) override {
+
+  }
+
+  uint64_t min_peer_features() const {
+    return 0;
+  }
+};
 
 TEST(ECCommon, get_min_want_to_read_shards)
 {
   const uint64_t swidth = 4096;
   const unsigned int k = 4;
   const unsigned int m = 2;
+  const uint64_t csize = 1024;
 
   ECUtil::stripe_info_t s(k, m, swidth);
+  ECListenerStub listenerStub;
   ASSERT_EQ(s.get_stripe_width(), swidth);
-  ASSERT_EQ(s.get_chunk_size(), 1024);
+  ASSERT_EQ(s.get_chunk_size(), csize);
+
+  const std::vector<int> chunk_mapping = {}; // no remapping
+  ErasureCodeInterfaceRef ec_impl(new ErasureCodeDummyImpl);
+  ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub);
+
+  ECUtil::shard_extent_set_t empty_extent_set_map(s.get_k_plus_m());
 
   // read nothing at the very beginning
   {
-    std::set<int> want_to_read;
-    ECCommon::ReadPipeline::get_min_want_to_read_shards(
-      0, 0, s, &want_to_read);
-    ASSERT_TRUE(want_to_read == std::set<int>{});
+    ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+    ec_align_t to_read(0, 0, 0);
+    pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+    ASSERT_EQ(want_to_read,  empty_extent_set_map);
   }
 
   // read nothing at the middle (0-sized partial read)
   {
-    std::set<int> want_to_read;
-    ECCommon::ReadPipeline::get_min_want_to_read_shards(
-      2048, 0, s, &want_to_read);
-    ASSERT_TRUE(want_to_read == std::set<int>{});
+    ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+    ec_align_t to_read(2048, 0, 0);
+    pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+    ASSERT_EQ(want_to_read,  empty_extent_set_map);
+  }
+  // read nothing at the second stripe (0-sized partial read)
+  {
+    ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+    ec_align_t to_read(swidth, 0, 0);
+    pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+    ASSERT_EQ(want_to_read,  empty_extent_set_map);
   }
 
   // read not-so-many (< chunk_size) bytes at the middle (partial read)
   {
-    std::set<int> want_to_read;
-    ECCommon::ReadPipeline::get_min_want_to_read_shards(
-      2048, 42, s, &want_to_read);
-    ASSERT_TRUE(want_to_read == std::set<int>{2});
+    ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+    ec_align_t to_read(2048, 42, 1);
+    pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+    ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+    ref[shard_id_t(2)].insert(0, 42);
+    ASSERT_EQ(want_to_read, ref);
+  }
+
+  // read not-so-many (< chunk_size) bytes after the first stripe.
+  {
+    ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+    ec_align_t to_read(swidth+2048, 42, 1);
+    pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+    ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+    ref[shard_id_t(2)].insert(csize, 42);
+    ASSERT_EQ(want_to_read, ref);
   }
 
   // read more (> chunk_size) bytes at the middle (partial read)
   {
-    std::set<int> want_to_read;
-    ECCommon::ReadPipeline::get_min_want_to_read_shards(
-      1024, 1024+42, s, &want_to_read);
-    // extra () due to a language / macro limitation
-    ASSERT_TRUE(want_to_read == (std::set<int>{1, 2}));
+    ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+    ec_align_t to_read(csize, csize + 42, 1);
+    pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+    ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+    ref[shard_id_t(1)].insert(0, csize);
+    ref[shard_id_t(2)].insert(0, 42);
+    ASSERT_EQ(want_to_read, ref);
+  }
+
+  // read more (> chunk_size) bytes at the middle (partial read), second stripe
+  {
+    ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+    ec_align_t to_read(swidth + csize, csize + 42, 1);
+    pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+    ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+    ref[shard_id_t(1)].insert(csize, csize);
+    ref[shard_id_t(2)].insert(csize, 42);
+    ASSERT_EQ(want_to_read, ref);
   }
 
   // full stripe except last chunk
   {
-    std::set<int> want_to_read;
-    ECCommon::ReadPipeline::get_min_want_to_read_shards(
-      0, 3*1024, s, &want_to_read);
-    // extra () due to a language / macro limitation
-    ASSERT_TRUE(want_to_read == (std::set<int>{0, 1, 2}));
+    ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+    ec_align_t to_read(0, 3*csize, 1);
+    pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+    ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+    ref[shard_id_t(0)].insert(0, csize);
+    ref[shard_id_t(1)].insert(0, csize);
+    ref[shard_id_t(2)].insert(0, csize);
+    ASSERT_EQ(want_to_read, ref);
+  }
+
+  // full stripe except last chunk (second stripe)
+  {
+    ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+    ec_align_t to_read(swidth, 3*csize, 1);
+    pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+    ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+    ref[shard_id_t(0)].insert(csize, csize);
+    ref[shard_id_t(1)].insert(csize, csize);
+    ref[shard_id_t(2)].insert(csize, csize);
+    ASSERT_EQ(want_to_read, ref);
   }
 
   // full stripe except 1st chunk
   {
-    std::set<int> want_to_read;
-    ECCommon::ReadPipeline::get_min_want_to_read_shards(
-      1024, swidth-1024, s, &want_to_read);
-    // extra () due to a language / macro limitation
-    ASSERT_TRUE(want_to_read == (std::set<int>{1, 2, 3}));
+    ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+    ec_align_t to_read(csize, swidth - csize, 1);
+    pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+    ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+    ref[shard_id_t(1)].insert(0, csize);
+    ref[shard_id_t(2)].insert(0, csize);
+    ref[shard_id_t(3)].insert(0, csize);
+    ASSERT_EQ(want_to_read, ref);
+  }
+
+  // full stripe except 1st chunk (second stripe)
+  {
+    ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+    ec_align_t to_read(swidth + csize, swidth - csize, 1);
+    pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+    ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+    ref[shard_id_t(1)].insert(csize, csize);
+    ref[shard_id_t(2)].insert(csize, csize);
+    ref[shard_id_t(3)].insert(csize, csize);
+    ASSERT_EQ(want_to_read, ref);
   }
 
   // large, multi-stripe read starting just after 1st chunk
+  // 0XXX
+  // XXXX x41
+  // X000
+  {
+    ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+    ec_align_t to_read(csize, swidth * 42, 1);
+    pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+    ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+    ref[shard_id_t(0)].insert(csize, csize*42);
+    ref[shard_id_t(1)].insert(0, csize*42);
+    ref[shard_id_t(2)].insert(0, csize*42);
+    ref[shard_id_t(3)].insert(0, csize*42);
+    ASSERT_EQ(want_to_read, ref);
+  }
+
+  // large, multi-stripe read starting just after 1st chunk (second stripe)
+  // 0XXX
+  // XXXX x41
+  // X000
+  {
+    ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+    ec_align_t to_read(swidth + csize, swidth * 42, 1);
+    pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+    ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+    ref[shard_id_t(0)].insert(csize*2, csize*42);
+    ref[shard_id_t(1)].insert(csize, csize*42);
+    ref[shard_id_t(2)].insert(csize, csize*42);
+    ref[shard_id_t(3)].insert(csize, csize*42);
+    ASSERT_EQ(want_to_read, ref);
+  }
+
+  // large read from the beginning
   {
-    std::set<int> want_to_read;
-    ECCommon::ReadPipeline::get_min_want_to_read_shards(
-      1024, swidth*42, s, &want_to_read);
-    // extra () due to a language / macro limitation
-    ASSERT_TRUE(want_to_read == (std::set<int>{0, 1, 2, 3}));
+    ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+    ec_align_t to_read(0, swidth * 42, 1);
+    pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+    ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+
+    ref[shard_id_t(0)].insert(0, csize*42);
+    ref[shard_id_t(1)].insert(0, csize*42);
+    ref[shard_id_t(2)].insert(0, csize*42);
+    ref[shard_id_t(3)].insert(0, csize*42);
+    ASSERT_EQ(want_to_read, ref);
   }
 
   // large read from the beginning
   {
-    std::set<int> want_to_read;
-    ECCommon::ReadPipeline::get_min_want_to_read_shards(
-      0, swidth*42, s, &want_to_read);
-    // extra () due to a language / macro limitation
-    ASSERT_TRUE(want_to_read == (std::set<int>{0, 1, 2, 3}));
+    ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+    ec_align_t to_read(0, swidth * 42, 1);
+    pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+    ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+
+    ref[shard_id_t(0)].insert(0, csize*42);
+    ref[shard_id_t(1)].insert(0, csize*42);
+    ref[shard_id_t(2)].insert(0, csize*42);
+    ref[shard_id_t(3)].insert(0, csize*42);
+    ASSERT_EQ(want_to_read, ref);
+  }
+
+  // large read from the beginning (second stripe)
+  {
+    ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+    ec_align_t to_read(swidth, swidth * 42, 1);
+    pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+    ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+
+    ref[shard_id_t(0)].insert(csize, csize*42);
+    ref[shard_id_t(1)].insert(csize, csize*42);
+    ref[shard_id_t(2)].insert(csize, csize*42);
+    ref[shard_id_t(3)].insert(csize, csize*42);
+    ASSERT_EQ(want_to_read, ref);
+  }
+
+  // large read that starts and ends on same shard.
+  {
+    ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+    ec_align_t to_read(swidth, swidth+csize/2, 1);
+    pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+    ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+
+    ref[shard_id_t(0)].insert(csize, csize+csize/2);
+    ref[shard_id_t(1)].insert(csize, csize);
+    ref[shard_id_t(2)].insert(csize, csize);
+    ref[shard_id_t(3)].insert(csize, csize);
+    ASSERT_EQ(want_to_read, ref);
+  }
+
+  // large read that starts and ends on last shard
+  {
+    ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+    ec_align_t to_read(swidth-csize, swidth+csize/2, 1);
+    pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+    ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+
+    ref[shard_id_t(0)].insert(csize, csize);
+    ref[shard_id_t(1)].insert(csize, csize);
+    ref[shard_id_t(2)].insert(csize, csize);
+    ref[shard_id_t(3)].insert(0, csize+csize/2);
+    ASSERT_EQ(want_to_read, ref);
+  }
+  // large read that starts and ends on last shard, partial first shard.
+  {
+    ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+    ec_align_t to_read(swidth-csize/2, swidth, 1);
+    pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+    ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+
+    ref[shard_id_t(0)].insert(csize, csize);
+    ref[shard_id_t(1)].insert(csize, csize);
+    ref[shard_id_t(2)].insert(csize, csize);
+    ref[shard_id_t(3)].insert(csize/2, csize);
+    ASSERT_EQ(want_to_read, ref);
+  }
+}
+
+TEST(ECCommon, get_min_avail_to_read_shards) {
+  const uint64_t page_size = CEPH_PAGE_SIZE;
+  const uint64_t swidth = 64*page_size;
+  const unsigned int k = 4;
+  const unsigned int m = 2;
+  const int nshards = 6;
+  const uint64_t object_size = swidth * 1024;
+
+  std::vector<ECCommon::shard_read_t> empty_shard_vector(k);
+
+  ECUtil::stripe_info_t s(k, m, swidth, vector<shard_id_t>(0));
+  ECListenerStub listenerStub;
+  ASSERT_EQ(s.get_stripe_width(), swidth);
+  ASSERT_EQ(s.get_chunk_size(), swidth / k);
+
+  const std::vector<int> chunk_mapping = {}; // no remapping
+  ErasureCodeDummyImpl *ecode = new ErasureCodeDummyImpl();
+  ErasureCodeInterfaceRef ec_impl(ecode);
+  ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub);
+
+  for (int i = 0; i < nshards; i++) {
+    listenerStub.acting_shards.insert(pg_shard_t(i, shard_id_t(i)));
+  }
+
+  // read nothing
+  {
+    ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+    ECUtil::shard_extent_set_t to_read_list(s.get_k_plus_m());
+    hobject_t hoid;
+    ECCommon::read_request_t read_request(to_read_list, false, object_size);
+    pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request);
+
+    ECCommon::read_request_t ref(to_read_list, false, object_size);
+
+    ASSERT_EQ(read_request,  ref);
+  }
+
+  /* Read to every data shard. */
+  {
+    ECUtil::shard_extent_set_t to_read_list(s.get_k_plus_m());
+    hobject_t hoid;
+
+    for (shard_id_t i; i<k; ++i) {
+      to_read_list[i].insert(int(i) * 2 * page_size, page_size);
+    }
+
+    ECCommon::read_request_t read_request(to_read_list, false, object_size);
+    pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request);
+
+    ECCommon::read_request_t ref(to_read_list, false, object_size);
+    for (shard_id_t shard_id; shard_id < k; ++shard_id) {
+      ref.shard_reads[shard_id].extents = to_read_list[shard_id];
+      ref.shard_reads[shard_id].subchunk = ecode->default_sub_chunk;
+      ref.shard_reads[shard_id].pg_shard = pg_shard_t(int(shard_id), shard_id);
+    }
+    ASSERT_EQ(read_request,  ref);
+  }
+
+  /* Read to every data shard: same request, reference built in a separate loop. */
+  {
+    ECUtil::shard_extent_set_t to_read_list(s.get_k_plus_m());
+    hobject_t hoid;
+    for (shard_id_t i; i<k; ++i) {
+      to_read_list[i].insert(int(i) * 2 * page_size, page_size);
+    }
+
+    ECCommon::read_request_t read_request(to_read_list, false, object_size);
+
+    pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request);
+
+    ECCommon::read_request_t ref(to_read_list, false, object_size);
+    for (shard_id_t i; i<k; ++i) {
+      shard_id_t shard_id(i);
+      ref.shard_reads[shard_id].extents = to_read_list[i];
+      ref.shard_reads[shard_id].subchunk = ecode->default_sub_chunk;
+      ref.shard_reads[shard_id].pg_shard = pg_shard_t(int(i), shard_id);
+    }
+
+    ASSERT_EQ(read_request,  ref);
+  }
+
+  /* Read to every data shard - small read */
+  {
+    ECUtil::shard_extent_set_t to_read_list(s.get_k_plus_m());
+    hobject_t hoid;
+
+    for (shard_id_t i; i < k; ++i) {
+      to_read_list[i].insert(int(i) * 2 * page_size + int(i) + 1, int(i) + 1);
+    }
+    ECCommon::read_request_t ref(to_read_list, false, object_size);
+    ECCommon::read_request_t read_request(to_read_list, false, object_size);
+    for (int i=0; i < (int)k; i++) {
+      shard_id_t shard_id(i);
+      ECCommon::shard_read_t &ref_shard_read = ref.shard_reads[shard_id];
+      ref_shard_read.subchunk = ecode->default_sub_chunk;
+      ref_shard_read.extents.insert(i*2*page_size, page_size);
+      ref_shard_read.pg_shard = pg_shard_t(i, shard_id_t(i));
+    }
+
+    pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request);
+    ASSERT_EQ(read_request,  ref);
+  }
+
+  /* Read to every data shard, missing shard. */
+  {
+    ECUtil::shard_extent_set_t to_read_list(s.get_k_plus_m());
+    hobject_t hoid;
+
+    for (shard_id_t i; i<k; ++i) {
+      to_read_list[i].insert(int(i) * 2 * page_size, page_size);
+    }
+
+    ECCommon::read_request_t read_request(to_read_list, false, object_size);
+
+    shard_id_t missing_shard(1);
+    int parity_shard = k;
+    listenerStub.acting_shards.erase(pg_shard_t(int(missing_shard), shard_id_t(missing_shard)));
+
+    pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request);
+
+    ECCommon::read_request_t ref(to_read_list, false, object_size);
+    for (shard_id_t i; i<k; ++i) {
+      if (i != missing_shard) {
+        shard_id_t shard_id(i);
+       to_read_list[i].union_of(to_read_list[missing_shard]);
+        ref.shard_reads[shard_id].subchunk = ecode->default_sub_chunk;
+       ref.shard_reads[shard_id].extents = to_read_list[i];
+        ref.shard_reads[shard_id].pg_shard = pg_shard_t(int(i), shard_id);
+      } else {
+       ECCommon::shard_read_t parity_shard_read;
+       parity_shard_read.subchunk = ecode->default_sub_chunk;
+       parity_shard_read.extents.union_of(to_read_list[i]);
+       ref.shard_reads[shard_id_t(parity_shard)] = parity_shard_read;
+        ref.shard_reads[shard_id_t(parity_shard)].pg_shard = pg_shard_t(parity_shard, shard_id_t(parity_shard));
+      }
+    }
+
+    ASSERT_EQ(read_request,  ref);
+
+    listenerStub.acting_shards.insert(pg_shard_t(1, shard_id_t(1)));
+  }
+
+  /* Read to every data shard, with the missing shard's extents adjacent to its neighbours'. */
+  {
+    ECUtil::shard_extent_set_t to_read_list(s.get_k_plus_m());
+    hobject_t hoid;
+    unsigned int missing_shard = 1;
+
+    to_read_list[shard_id_t(0)].insert(0, page_size);
+    to_read_list[shard_id_t(1)].insert(page_size, page_size);
+    to_read_list[shard_id_t(2)].insert(2*page_size, page_size);
+    to_read_list[shard_id_t(3)].insert(3*page_size, page_size);
+    ECCommon::read_request_t read_request(to_read_list, false, object_size);
+    ECCommon::read_request_t ref(to_read_list, false, object_size);
+
+    // Populating reference manually to check that adjacent shards get correctly combined.
+    ref.shard_reads[shard_id_t(0)].extents.insert(0, page_size*2);
+    ref.shard_reads[shard_id_t(2)].extents.insert(page_size, page_size*2);
+    ref.shard_reads[shard_id_t(3)].extents.insert(page_size, page_size);
+    ref.shard_reads[shard_id_t(3)].extents.insert(3*page_size, page_size);
+    ref.shard_reads[shard_id_t(4)].extents.insert(page_size, page_size);
+    ref.shard_reads[shard_id_t(0)].pg_shard = pg_shard_t(0, shard_id_t(0));
+    ref.shard_reads[shard_id_t(2)].pg_shard = pg_shard_t(2, shard_id_t(2));
+    ref.shard_reads[shard_id_t(3)].pg_shard = pg_shard_t(3, shard_id_t(3));
+    ref.shard_reads[shard_id_t(4)].pg_shard = pg_shard_t(4, shard_id_t(4));
+    for (unsigned int i=0; i<k+1; i++) {
+      if (i==missing_shard) {
+       continue;
+      }
+      ref.shard_reads[shard_id_t(i)].subchunk = ecode->default_sub_chunk;
+    }
+
+    listenerStub.acting_shards.erase(pg_shard_t(missing_shard, shard_id_t(missing_shard)));
+
+    pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request);
+
+    ASSERT_EQ(read_request,  ref);
+
+    listenerStub.acting_shards.insert(pg_shard_t(1, shard_id_t(1)));
+  }
+
+  /* Read to every data shard, but with "fast" (redundant) reads */
+  {
+    ECUtil::shard_extent_set_t to_read_list(s.get_k_plus_m());
+    hobject_t hoid;
+
+    extent_set extents_to_read;
+    for (shard_id_t i; i<k; ++i) {
+      to_read_list[i].insert(int(i) * 2 * page_size, page_size);
+      extents_to_read.insert(int(i) * 2 * page_size, page_size);
+    }
+    ECCommon::read_request_t read_request(to_read_list, false, object_size);
+
+    pipeline.get_min_avail_to_read_shards(hoid, false, true, read_request);
+
+    ECCommon::read_request_t ref(to_read_list, false, object_size);
+    for (unsigned int i=0; i<k+2; i++) {
+      ECCommon::shard_read_t shard_read;
+      shard_read.subchunk = ecode->default_sub_chunk;
+      shard_read.extents = extents_to_read;
+      shard_read.pg_shard = pg_shard_t(i, shard_id_t(i));
+      ref.shard_reads[shard_id_t(i)] = shard_read;
+    }
+
+    ASSERT_EQ(read_request,  ref);
+  }
+
+  /* Read to every data shard, missing shard reported via error_shards. */
+  {
+    ECUtil::shard_extent_set_t to_read_list(s.get_k_plus_m());
+    hobject_t hoid;
+
+    for (shard_id_t i; i<k; ++i) {
+      to_read_list[i].insert(int(i) * 2 * page_size, page_size);
+    }
+    ECCommon::read_request_t read_request(to_read_list, false, object_size);
+
+    shard_id_t missing_shard(1);
+    int parity_shard = k;
+    std::set<pg_shard_t> error_shards;
+    error_shards.emplace(int(missing_shard), shard_id_t(missing_shard));
+    // Similar to previous tests with missing shards, but this time, emulate
+    // the shard being missing as a result of a bad read.
+    pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request, error_shards);
+
+    ECCommon::read_request_t ref(to_read_list, false, object_size);
+    std::vector<ECCommon::shard_read_t> want_to_read(empty_shard_vector);
+    for (shard_id_t i; i<k; ++i) {
+      if (i != missing_shard) {
+        want_to_read[int(i)].subchunk = ecode->default_sub_chunk;
+        want_to_read[int(i)].extents.union_of(to_read_list[missing_shard]);
+        want_to_read[int(i)].extents.union_of(to_read_list[i]);
+        want_to_read[int(i)].pg_shard = pg_shard_t(int(i), shard_id_t(i));
+        ref.shard_reads[shard_id_t(i)] = want_to_read[int(i)];
+      } else {
+        ECCommon::shard_read_t parity_shard_read;
+        parity_shard_read.subchunk = ecode->default_sub_chunk;
+        parity_shard_read.extents.union_of(to_read_list[missing_shard]);
+        parity_shard_read.pg_shard = pg_shard_t(parity_shard, shard_id_t(parity_shard));
+        ref.shard_reads[shard_id_t(parity_shard)] = parity_shard_read;
+      }
+    }
+
+    ASSERT_EQ(read_request,  ref);
+  }
+}
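+
+// What the missing-shard cases above assert: extents on an unavailable
+// shard cannot be read directly, so the pipeline widens the reads on the
+// surviving data shards to cover the missing extents and adds one parity
+// shard over the same range, giving decode its k inputs. With shard 1 out,
+// a one-page read per shard becomes reads on shards 0, 2, 3 plus parity
+// shard 4.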
+
+TEST(ECCommon, shard_read_combo_tests)
+{
+  const uint64_t page_size = CEPH_PAGE_SIZE;
+  const uint64_t swidth = 2*page_size;
+  const unsigned int k = 2;
+  const unsigned int m = 2;
+  const int nshards = 4;
+  const uint64_t object_size = swidth * 1024;
+  hobject_t hoid;
+
+  ECUtil::stripe_info_t s(k, m, swidth, vector<shard_id_t>(0));
+  ECListenerStub listenerStub;
+  ASSERT_EQ(s.get_stripe_width(), swidth);
+  ASSERT_EQ(s.get_chunk_size(), swidth/k);
+
+  const std::vector<int> chunk_mapping = {}; // no remapping
+  ErasureCodeDummyImpl *ecode = new ErasureCodeDummyImpl();
+  ErasureCodeInterfaceRef ec_impl(ecode);
+  ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub);
+
+  for (int i = 0; i < nshards; i++) {
+    listenerStub.acting_shards.insert(pg_shard_t(i, shard_id_t(i)));
+  }
+
+  {
+    ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+
+    ec_align_t to_read(36*1024, 10*1024, 1);
+    pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+    ECCommon::read_request_t read_request(want_to_read, false, object_size);
+
+    pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request);
+
+    ECCommon::read_request_t ref(want_to_read, false, object_size);
+    {
+      ECCommon::shard_read_t shard_read;
+      shard_read.subchunk = ecode->default_sub_chunk;
+      shard_read.extents.insert(20*1024, 4*1024);
+      shard_read.pg_shard = pg_shard_t(0, shard_id_t(0));
+      ref.shard_reads[shard_id_t(0)] = shard_read;
+    }
+    {
+      ECCommon::shard_read_t shard_read;
+      shard_read.subchunk = ecode->default_sub_chunk;
+      shard_read.extents.insert(16*1024, 8*1024);
+      shard_read.pg_shard = pg_shard_t(1, shard_id_t(1));
+      ref.shard_reads[shard_id_t(1)] = shard_read;
+    }
+
+    ASSERT_EQ(read_request,  ref);
+  }
+
+  {
+    ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+
+    ec_align_t to_read(12*1024, 12*1024, 1);
+    pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+    ECCommon::read_request_t read_request(want_to_read, false, object_size);
+    pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request);
+
+    ECCommon::read_request_t ref(want_to_read, false, object_size);
+    {
+      ECCommon::shard_read_t shard_read;
+      shard_read.subchunk = ecode->default_sub_chunk;
+      shard_read.extents.insert(8*1024, 4*1024);
+      shard_read.pg_shard = pg_shard_t(0, shard_id_t(0));
+      ref.shard_reads[shard_id_t(0)] = shard_read;
+    }
+    {
+      ECCommon::shard_read_t shard_read;
+      shard_read.subchunk = ecode->default_sub_chunk;
+      shard_read.extents.insert(4*1024, 8*1024);
+      shard_read.pg_shard = pg_shard_t(1, shard_id_t(1));
+      ref.shard_reads[shard_id_t(1)] = shard_read;
+    }
+
+    ASSERT_EQ(read_request,  ref);
   }
 }
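+
+// Arithmetic behind the first combo (k=2, chunk=4K, stripe=8K): the rados
+// range [36K,46K) maps to shard1 [16K,20K), shard0 [20K,24K) and shard1
+// [20K,22K); assuming reads are rounded up to CEPH_PAGE_SIZE granularity,
+// shard1's two pieces merge into the single [16K,24K) extent asserted
+// above.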
 
@@ -249,21 +1102,207 @@ TEST(ECCommon, get_min_want_to_read_shards_bug67087)
   const uint64_t swidth = 4096;
   const unsigned int k = 4;
   const unsigned int m = 2;
+  const uint64_t csize = 1024;
 
   ECUtil::stripe_info_t s(k, m, swidth);
   ASSERT_EQ(s.get_stripe_width(), swidth);
   ASSERT_EQ(s.get_chunk_size(), 1024);
 
-  std::set<int> want_to_read;
+  ECListenerStub listenerStub;
+  ASSERT_EQ(s.get_stripe_width(), swidth);
+  ASSERT_EQ(s.get_chunk_size(), csize);
+
+  const std::vector<int> chunk_mapping = {}; // no remapping
+  ErasureCodeInterfaceRef ec_impl(new ErasureCodeDummyImpl);
+  ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub);
+
+  ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+  ec_align_t to_read1(512, 512, 1);
+  ec_align_t to_read2(512 + 16*1024, 512, 1);
+
+  ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+
+  ref[shard_id_t(0)].insert(512, 512);
 
   // multiple calls with the same want_to_read can happen during
-  // multi-region reads.
+  // multi-region reads; each call accumulates extents in want_to_read.
+  {
+    pipeline.get_min_want_to_read_shards(
+     to_read1, want_to_read);
+    ASSERT_EQ(want_to_read, ref);
+
+    pipeline.get_min_want_to_read_shards(
+     to_read2, want_to_read);
+    // We have 4 data shards per stripe, so the second read lands on
+    // shard 0 again, one chunk further in.
+    ref[shard_id_t(0)].insert(512+4*1024, 512);
+    ASSERT_EQ(want_to_read, ref);
+  }
+}
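+
+// Offset check for the second read: with k=4 and 1K chunks, 16K is exactly
+// four stripes, so ro offset 512+16K lands on shard 0 again, at shard
+// offset 4*1024+512 -- the second extent accumulated in ref.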
+
+TEST(ECCommon, get_remaining_shards)
+{
+  const uint64_t page_size = CEPH_PAGE_SIZE;
+  const uint64_t swidth = 64*page_size;
+  const unsigned int k = 4;
+  const unsigned int m = 2;
+  const int nshards = 6;
+  const uint64_t chunk_size = swidth / k;
+  const uint64_t object_size = swidth * 1024;
+
+  ECUtil::stripe_info_t s(k, m, swidth, vector<shard_id_t>(0));
+  ECListenerStub listenerStub;
+  ASSERT_EQ(s.get_stripe_width(), swidth);
+  ASSERT_EQ(s.get_chunk_size(), swidth/k);
+
+  const std::vector<int> chunk_mapping = {}; // no remapping
+  ErasureCodeDummyImpl *ecode = new ErasureCodeDummyImpl();
+  ErasureCodeInterfaceRef ec_impl(ecode);
+  ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub);
+
+
+  vector<pg_shard_t> pg_shards(nshards);
+  for (int i = 0; i < nshards; i++) {
+    pg_shards[i] = pg_shard_t(i, shard_id_t(i));
+    listenerStub.acting_shards.insert(pg_shards[i]);
+  }
+
   {
-    ECCommon::ReadPipeline::get_min_want_to_read_shards(
-      512, 512, s, &want_to_read);
-    ASSERT_EQ(want_to_read, std::set<int>{0});
-    ECCommon::ReadPipeline::get_min_want_to_read_shards(
-      512+16*1024, 512, s, &want_to_read);
-    ASSERT_EQ(want_to_read, std::set<int>{0});
+    hobject_t hoid;
+
+    // Mock up a read request
+    ECUtil::shard_extent_set_t to_read(s.get_k_plus_m());
+    to_read[shard_id_t(0)].insert(0, 4096);
+    ECCommon::read_request_t read_request(to_read, false, object_size);
+    int missing_shard = 0;
+
+    // Mock up a read result.
+    ECCommon::read_result_t read_result(&s);
+    read_result.errors.emplace(pg_shards[missing_shard], -EIO);
+
+    pipeline.get_remaining_shards(hoid, read_result, read_request, false, false);
+
+    ECCommon::read_request_t ref(to_read, false, object_size);
+    int parity_shard = 4;
+    for (unsigned int i=0; i<k; i++) {
+      ECCommon::shard_read_t shard_read;
+      shard_read.subchunk = ecode->default_sub_chunk;
+      shard_read.extents.insert(0,4096);
+      unsigned int shard_id = i==missing_shard?parity_shard:i;
+      shard_read.pg_shard = pg_shard_t(shard_id, shard_id_t(shard_id));
+      ref.shard_reads[shard_id_t(shard_id)] = shard_read;
+    }
+
+    ASSERT_EQ(read_request,  ref);
+  }
+
+  // Request re-read. There is a page of overlap in what is already read.
+  {
+    hobject_t hoid;
+
+    ECUtil::shard_extent_set_t to_read(s.get_k_plus_m());
+    s.ro_range_to_shard_extent_set(chunk_size/2, chunk_size+page_size, to_read);
+    ECCommon::read_request_t read_request(to_read, false, object_size);
+    unsigned int missing_shard = 1;
+
+    // Mock up a read result.
+    ECCommon::read_result_t read_result(&s);
+    read_result.errors.emplace(pg_shards[missing_shard], -EIO);
+    buffer::list bl;
+    bl.append_zero(chunk_size/2);
+    read_result.buffers_read.insert_in_shard(shard_id_t(0), chunk_size/2, bl);
+    read_result.processed_read_requests[shard_id_t(0)].insert(chunk_size/2, bl.length());
+
+    pipeline.get_remaining_shards(hoid, read_result, read_request, false, false);
+
+    // The result should be a read request for the first chunk_size/2 bytes
+    // of shard 0, as that range is still missing after the partial read.
+    ECCommon::read_request_t ref(to_read, false, object_size);
+    int parity_shard = 4;
+    for (unsigned int i=0; i<k; i++) {
+      ECCommon::shard_read_t shard_read;
+      shard_read.subchunk = ecode->default_sub_chunk;
+      unsigned int shard_id = i==missing_shard?parity_shard:i;
+      ref.shard_reads[shard_id_t(shard_id)] = shard_read;
+    }
+    ref.shard_reads[shard_id_t(0)].extents.insert(0, chunk_size/2);
+    ref.shard_reads[shard_id_t(0)].pg_shard = pg_shards[0];
+    ref.shard_reads[shard_id_t(2)].extents.insert(0, chunk_size/2+page_size);
+    ref.shard_reads[shard_id_t(2)].pg_shard = pg_shards[2];
+    ref.shard_reads[shard_id_t(3)].extents.insert(0, chunk_size/2+page_size);
+    ref.shard_reads[shard_id_t(3)].pg_shard = pg_shards[3];
+    ref.shard_reads[shard_id_t(4)].extents.insert(0, chunk_size/2+page_size);
+    ref.shard_reads[shard_id_t(4)].pg_shard = pg_shards[4];
+    ASSERT_EQ(read_request,  ref);
   }
 }
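+
+// Note on the re-read case: extents already returned (tracked in
+// processed_read_requests) are not requested again, so the retry fetches
+// only shard 0's still-missing first half-chunk, plus matching ranges from
+// two more data shards and one parity shard so the lost shard 1 data can
+// be decoded.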
+
+TEST(ECCommon, encode)
+{
+  const uint64_t page_size = CEPH_PAGE_SIZE;
+  const uint64_t swidth = 2*page_size;
+  const unsigned int k = 2;
+  const unsigned int m = 2;
+
+  ECUtil::stripe_info_t s(k, m, swidth, vector<shard_id_t>(0));
+  ECListenerStub listenerStub;
+  ASSERT_EQ(s.get_stripe_width(), swidth);
+  ASSERT_EQ(s.get_chunk_size(), swidth/k);
+
+  const std::vector<int> chunk_mapping = {}; // no remapping
+  ErasureCodeDummyImpl *ecode = new ErasureCodeDummyImpl();
+  ErasureCodeInterfaceRef ec_impl(ecode);
+  ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub);
+
+  ECUtil::shard_extent_map_t semap(&s);
+
+  for (shard_id_t i; i<k+m; ++i) {
+    bufferlist bl;
+    bl.append_zero(i>=k?4096:2048);
+    semap.insert_in_shard(i, 12*1024, bl);
+  }
+  semap.encode(ec_impl, nullptr, 0);
+}
+
+TEST(ECCommon, decode)
+{
+  const uint64_t page_size = CEPH_PAGE_SIZE;
+  const uint64_t swidth = 3*page_size;
+  const unsigned int k = 3;
+  const unsigned int m = 2;
+
+  ECUtil::stripe_info_t s(k, m, swidth, vector<shard_id_t>(0));
+  ECListenerStub listenerStub;
+  ASSERT_EQ(s.get_stripe_width(), swidth);
+  ASSERT_EQ(s.get_chunk_size(), swidth/k);
+
+  const std::vector<int> chunk_mapping = {}; // no remapping
+  ErasureCodeDummyImpl *ecode = new ErasureCodeDummyImpl();
+  ecode->data_chunk_count = k;
+  ecode->chunk_count = k + m;
+  ErasureCodeInterfaceRef ec_impl(ecode);
+  ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub);
+
+  ECUtil::shard_extent_map_t semap(&s);
+  bufferlist bl12k;
+  bl12k.append_zero(12288);
+  bufferlist bl8k;
+  bl8k.append_zero(8192);
+  bufferlist bl16k;
+  bl16k.append_zero(16384);
+  semap.insert_in_shard(shard_id_t(1), 512000, bl12k);
+  semap.insert_in_shard(shard_id_t(1), 634880, bl12k);
+  semap.insert_in_shard(shard_id_t(2), 512000, bl12k);
+  semap.insert_in_shard(shard_id_t(2), 630784, bl16k);
+  semap.insert_in_shard(shard_id_t(3), 516096, bl8k);
+  semap.insert_in_shard(shard_id_t(3), 634880, bl12k);
+  ECUtil::shard_extent_set_t want = semap.get_extent_set();
+
+  want[shard_id_t(0)].insert(516096, 8192);
+  want[shard_id_t(0)].insert(634880, 12288);
+  want[shard_id_t(4)].insert(516096, 8192);
+  want[shard_id_t(4)].insert(634880, 12288);
+
+  ASSERT_EQ(0, semap.decode(ec_impl, want, 2*1024*1024));
+}
diff --git a/src/test/osd/TestECUtil.cc b/src/test/osd/TestECUtil.cc
new file mode 100644 (file)
index 0000000..71cf3bb
--- /dev/null
@@ -0,0 +1,1034 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <iostream>
+#include <errno.h>
+#include <signal.h>
+#include "osd/ECUtil.h"
+#include "gtest/gtest.h"
+#include "osd/osd_types.h"
+#include "common/ceph_argparse.h"
+#include "osd/ECTransaction.h"
+
+using namespace std;
+using namespace ECUtil;
+
+// FIXME: Once the related PRs are in, the other ECUtil tests should be moved here.
+
+TEST(ECUtil, stripe_info_t_chunk_mapping)
+{
+  int k=4;
+  int m=2;
+  int chunk_size = 4096;
+  vector<shard_id_t> forward_cm(k+m);
+  vector<shard_id_t> reverse_cm(k+m);
+
+  std::iota(forward_cm.begin(), forward_cm.end(), 0);
+  std::iota(reverse_cm.rbegin(), reverse_cm.rend(), 0);
+
+  stripe_info_t forward_sinfo1(k, m, chunk_size*k);
+  stripe_info_t forward_sinfo2(k, m, chunk_size*k, forward_cm);
+  stripe_info_t reverse_sinfo(k, m, chunk_size*k, reverse_cm);
+
+  for (shard_id_t shard_id : forward_cm) {
+    raw_shard_id_t raw_shard_id((int)shard_id);
+    ASSERT_EQ(shard_id, forward_sinfo1.get_shard(raw_shard_id));
+    ASSERT_EQ(raw_shard_id, forward_sinfo1.get_raw_shard(shard_id));
+    ASSERT_EQ(shard_id, forward_sinfo2.get_shard(raw_shard_id));
+    ASSERT_EQ(raw_shard_id, forward_sinfo2.get_raw_shard(shard_id));
+    ASSERT_EQ(shard_id, reverse_sinfo.get_shard(raw_shard_id_t(k + m - int(raw_shard_id) - 1)));
+    ASSERT_EQ(raw_shard_id_t(k + m - int(shard_id) - 1), reverse_sinfo.get_raw_shard(shard_id));
+  }
+
+  ASSERT_EQ(k, forward_sinfo1.get_k());
+  ASSERT_EQ(m, forward_sinfo1.get_m());
+  ASSERT_EQ(k+m, forward_sinfo1.get_k_plus_m());
+}
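+
+// Mapping convention, as the assertions above imply: chunk_mapping is
+// indexed by raw (logical) shard and yields the stored shard. With the
+// reversed mapping and k+m=6, raw shard 0 lives on shard 5, so
+// get_shard(raw_shard_id_t(0)) == shard_id_t(5) and
+// get_raw_shard(shard_id_t(5)) == raw_shard_id_t(0).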
+
+TEST(ECUtil, shard_extent_map_t)
+{
+  int k=4;
+  int m=2;
+  int chunk_size = 4096;
+  stripe_info_t sinfo(k, m, chunk_size*k, vector<shard_id_t>(0));
+
+  // insert_in_shard
+  {
+    shard_extent_map_t semap(&sinfo);
+    int new_off = 512;
+    int new_len = 1024;
+    shard_id_t shard0(0);
+    shard_id_t shard2(2);
+
+    // Empty
+    ASSERT_FALSE(semap.contains_shard(shard_id_t(0)));
+    ASSERT_FALSE(semap.contains_shard(shard_id_t(1)));
+    ASSERT_FALSE(semap.contains_shard(shard_id_t(2)));
+    ASSERT_FALSE(semap.contains_shard(shard_id_t(3)));
+    ASSERT_TRUE(semap.empty());
+    ASSERT_EQ(std::numeric_limits<uint64_t>::max(), semap.get_ro_start());
+    ASSERT_EQ(std::numeric_limits<uint64_t>::max(), semap.get_ro_end());
+    ASSERT_EQ(std::numeric_limits<uint64_t>::max(), semap.get_start_offset());
+    ASSERT_EQ(std::numeric_limits<uint64_t>::max(), semap.get_end_offset());
+
+
+    // Insert a 1k buffer in shard 2
+    buffer::list bl;
+    bl.append_zero(new_len);
+    semap.insert_in_shard(shard2, new_off, bl);
+    ASSERT_FALSE(semap.contains_shard(shard_id_t(0)));
+    ASSERT_FALSE(semap.contains_shard(shard_id_t(1)));
+    ASSERT_TRUE(semap.contains_shard(shard_id_t(2)));
+    ASSERT_FALSE(semap.contains_shard(shard_id_t(3)));
+    ASSERT_FALSE(semap.empty());
+    ASSERT_EQ(int(shard2) * chunk_size + new_off, semap.get_ro_start());
+    ASSERT_EQ(int(shard2) * chunk_size + new_off + new_len, semap.get_ro_end());
+    ASSERT_EQ(new_off, semap.get_start_offset());
+    ASSERT_EQ(new_off + bl.length(), semap.get_end_offset());
+    auto iter = semap.get_extent_map(shard2).begin();
+    ASSERT_EQ(new_off, iter.get_off());
+    ASSERT_EQ(new_len, iter.get_len());
+    ++iter;
+    ASSERT_EQ(semap.get_extent_map(shard2).end(), iter);
+
+    // Insert a 1k buffer in shard 0
+    semap.insert_in_shard(shard0, new_off, bl);
+    ASSERT_TRUE(semap.contains_shard(shard_id_t(0)));
+    ASSERT_FALSE(semap.contains_shard(shard_id_t(1)));
+    ASSERT_TRUE(semap.contains_shard(shard_id_t(2)));
+    ASSERT_FALSE(semap.contains_shard(shard_id_t(3)));
+    ASSERT_FALSE(semap.empty());
+    ASSERT_EQ(int(shard0) * chunk_size + new_off, semap.get_ro_start());
+    ASSERT_EQ(int(shard2) * chunk_size + new_off + new_len, semap.get_ro_end());
+    ASSERT_EQ(new_off, semap.get_start_offset());
+    ASSERT_EQ(new_off + bl.length(), semap.get_end_offset());
+    iter = semap.get_extent_map(shard0).begin();
+    ASSERT_EQ(new_off, iter.get_off());
+    ASSERT_EQ(new_len, iter.get_len());
+    ++iter;
+    ASSERT_EQ(semap.get_extent_map(shard0).end(), iter);
+    iter = semap.get_extent_map(shard2).begin();
+    ASSERT_EQ(new_off, iter.get_off());
+    ASSERT_EQ(new_len, iter.get_len());
+    ++iter;
+    ASSERT_EQ(semap.get_extent_map(shard2).end(), iter);
+
+    /* Insert overlapping into next stripe */
+    semap.insert_in_shard(shard2, chunk_size - 512, bl);
+    ASSERT_EQ(int(shard0) * chunk_size + new_off, semap.get_ro_start());
+    ASSERT_EQ((int(shard2) + k) * chunk_size + 512, semap.get_ro_end());
+    ASSERT_EQ(new_off, semap.get_start_offset());
+    ASSERT_EQ(chunk_size - 512 + bl.length(), semap.get_end_offset());
+
+    iter = semap.get_extent_map(shard2).begin();
+    ASSERT_EQ(new_off, iter.get_off());
+    ASSERT_EQ(new_len, iter.get_len());
+    ++iter;
+    ASSERT_EQ(chunk_size - 512, iter.get_off());
+    ASSERT_EQ(new_len, iter.get_len());
+    ++iter;
+    ASSERT_EQ(semap.get_extent_map(shard2).end(), iter);
+  }
+
+  //insert_ro_extent_map
+  //erase_after_ro_offset
+  {
+    shard_extent_map_t semap(&sinfo);
+    extent_map emap;
+    buffer::list bl1k;
+    buffer::list bl16k;
+    buffer::list bl64k;
+
+    bl1k.append_zero(1024);
+    bl16k.append_zero(chunk_size * k);
+    bl64k.append_zero(chunk_size * k * 4);
+    shard_extent_set_t ref(sinfo.get_k_plus_m());
+
+    // 1: Strangely aligned. (shard 0 [5~1024])
+    emap.insert(5, 1024, bl1k);
+    ref[shard_id_t(0)].insert(5, 1024);
+    // 2: Start of second chunk (shard 1 [0~1024])
+    emap.insert(chunk_size, 1024, bl1k);
+    ref[shard_id_t(1)].insert(0, 1024);
+    // 3: Overlap two chunks (shard1[3584~512], shard2[0~512])
+    emap.insert(chunk_size*2 - 512, 1024, bl1k);
+    ref[shard_id_t(1)].insert(3584, 512);
+    ref[shard_id_t(2)].insert(0, 512);
+    // 4: Overlap two stripes (shard3[3584~512], shard0[4096~512])
+    emap.insert(chunk_size*4 - 512, 1024, bl1k);
+    ref[shard_id_t(3)].insert(3584, 512);
+    ref[shard_id_t(0)].insert(4096, 512);
+    // 5: Full stripe (shard*[8192~4096])
+    emap.insert(chunk_size*k*2, chunk_size*k, bl16k);
+    for (auto &&[_, eset] : ref)
+      eset.insert(8192, 4096);
+    // 6: Two half stripes (shard0,1[20480~4096], shard 2,3[16384~4096])
+    emap.insert(chunk_size*k*4 + 2*chunk_size, chunk_size * k, bl16k);
+    ref[shard_id_t(0)].insert(20480, 4096);
+    ref[shard_id_t(1)].insert(20480, 4096);
+    ref[shard_id_t(2)].insert(16384, 4096);
+    ref[shard_id_t(3)].insert(16384, 4096);
+
+    // 7: Two half stripes, strange alignment (shard0,1[36864~4096], shard2[32773~4096], shard3[32768~4096])
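+    //    (ro offset chunk_size*(k*8 + 2) + 5 = 139269 falls in chunk 34, so it
+    //    starts on shard 34 % 4 = 2 at in-shard offset 8*chunk_size + 5 = 32773,
+    //    then wraps through shard 3 and into stripe 9 on shards 0, 1 and the
+    //    first 5 bytes back on shard 2.)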
+    emap.insert(chunk_size*k*8 + 2*chunk_size + 5, chunk_size * k, bl16k);
+    ref[shard_id_t(0)].insert(36864, 4096);
+    ref[shard_id_t(1)].insert(36864, 4096);
+    ref[shard_id_t(2)].insert(32773, 4096);
+    ref[shard_id_t(3)].insert(32768, 4096);
+
+    // 8: Multiple stripes (shard*[49152~16384])
+    emap.insert(chunk_size*k*12, chunk_size * k * 4, bl64k);
+    for (auto &&[_, eset] : ref)
+      eset.insert(49152, 16384);
+
+    semap.insert_ro_extent_map(emap);
+    for (auto &&[shard, eset] : ref) {
+      ASSERT_EQ(eset, semap.get_extent_set(shard)) << "shard=" << shard;
+    }
+    ASSERT_EQ(emap.get_start_off(), semap.get_ro_start());
+    ASSERT_EQ(emap.get_end_off(), semap.get_ro_end());
+    ASSERT_EQ(0, semap.get_start_offset());
+    ASSERT_EQ(chunk_size * 16, semap.get_end_offset());
+
+    /* Erase the later parts at an obscure offset. */
+    semap.erase_after_ro_offset(chunk_size * k * 8 + 2 * chunk_size + 512);
+
+    {
+      extent_set tmp;
+
+      tmp.union_insert(0, chunk_size * 8);
+      ref[shard_id_t(3)].intersection_of(tmp);
+      tmp.union_insert(0, chunk_size * 8 + 512);
+      ref[shard_id_t(2)].intersection_of(tmp);
+      tmp.union_insert(0, chunk_size * 9);
+      ref[shard_id_t(1)].intersection_of(tmp);
+      ref[shard_id_t(0)].intersection_of(tmp);
+    }
+
+    for (auto &&[shard, eset] : ref) {
+      ASSERT_EQ(eset, semap.get_extent_set(shard)) << "shard=" << shard;
+    }
+    ASSERT_EQ(5, semap.get_ro_start());
+    ASSERT_EQ(chunk_size * k * 8 + 2 * chunk_size + 512, semap.get_ro_end());
+    ASSERT_EQ(0, semap.get_start_offset());
+    ASSERT_EQ(33280, semap.get_end_offset());
+
+    /* Append again */
+    semap.append_zeros_to_ro_offset(chunk_size * k * 9 + 2 * chunk_size + 512);
+    ref[shard_id_t(0)].insert(chunk_size * 9, chunk_size);
+    ref[shard_id_t(1)].insert(chunk_size * 9, chunk_size);
+    ref[shard_id_t(2)].insert(chunk_size * 8 + 512, chunk_size);
+    ref[shard_id_t(3)].insert(chunk_size * 8, chunk_size);
+
+    for (auto &&[shard, eset] : ref) {
+      ASSERT_EQ(eset, semap.get_extent_set(shard)) << "shard=" << shard;
+    }
+    ASSERT_EQ(5, semap.get_ro_start());
+    ASSERT_EQ(chunk_size * k * 9 + 2 * chunk_size + 512, semap.get_ro_end());
+    ASSERT_EQ(0, semap.get_start_offset());
+    ASSERT_EQ(chunk_size * 10, semap.get_end_offset());
+
+    /* Append nothing */
+    semap.append_zeros_to_ro_offset(chunk_size * k * 9 + 2 * chunk_size + 512);
+    for (auto &&[shard, eset] : ref) {
+      ASSERT_EQ(eset, semap.get_extent_set(shard)) << "shard=" << shard;
+    }
+    ASSERT_EQ(5, semap.get_ro_start());
+    ASSERT_EQ(chunk_size * k * 9 + 2 * chunk_size + 512, semap.get_ro_end());
+    ASSERT_EQ(0, semap.get_start_offset());
+    ASSERT_EQ(chunk_size * 10, semap.get_end_offset());
+
+    /* Append, to an offset before the end */
+    semap.append_zeros_to_ro_offset(chunk_size * k * 8 + 2 * chunk_size + 512);
+    for (auto &&[shard, eset] : ref) {
+      ASSERT_EQ(eset, semap.get_extent_set(shard)) << "shard=" << shard;
+    }
+    ASSERT_EQ(5, semap.get_ro_start());
+    ASSERT_EQ(chunk_size * k * 9 + 2 * chunk_size + 512, semap.get_ro_end());
+    ASSERT_EQ(0, semap.get_start_offset());
+    ASSERT_EQ(chunk_size * 10, semap.get_end_offset());
+
+    /* Intersect the beginning ro range */
+    shard_extent_map_t semap2 = semap.intersect_ro_range(chunk_size * 2 - 256,
+      chunk_size * k * 8);
+
+    /* The original semap should be untouched */
+    for (auto &&[shard, eset] : ref) {
+      ASSERT_EQ(eset, semap.get_extent_set(shard)) << "shard=" << shard;
+    }
+    ASSERT_EQ(5, semap.get_ro_start());
+    ASSERT_EQ(chunk_size * k * 9 + 2 * chunk_size + 512, semap.get_ro_end());
+    ASSERT_EQ(0, semap.get_start_offset());
+    ASSERT_EQ(chunk_size * 10, semap.get_end_offset());
+    {
+      extent_set tmp;
+      tmp.insert(chunk_size, chunk_size * 8);
+      ref[shard_id_t(0)].intersection_of(tmp);
+    }
+    {
+      extent_set tmp;
+      tmp.insert(chunk_size - 256, chunk_size * 8);
+      ref[shard_id_t(1)].intersection_of(tmp);
+    }
+    {
+      extent_set tmp;
+      tmp.insert(0, chunk_size * 8);
+      ref[shard_id_t(2)].intersection_of(tmp);
+      ref[shard_id_t(3)].intersection_of(tmp);
+    }
+
+    for (auto &&[shard, eset] : ref) {
+      ASSERT_EQ(eset, semap2.get_extent_set(shard)) << "shard=" << shard;
+    }
+    ASSERT_EQ(chunk_size*2 - 256, semap2.get_ro_start());
+    ASSERT_EQ(chunk_size * (k * 5 + 2), semap2.get_ro_end())
+      << "semap2=" << semap2;
+    ASSERT_EQ(0, semap2.get_start_offset());
+    ASSERT_EQ(chunk_size * 6, semap2.get_end_offset());
+
+    // Intersect with something bigger; the result should be identical.
+    semap2 = semap2.intersect_ro_range(0, chunk_size * k * 10);
+    for (auto &&[shard, eset] : ref) {
+      ASSERT_EQ(eset, semap2.get_extent_set(shard)) << "shard=" << shard;
+    }
+    ASSERT_EQ(chunk_size * 2 - 256, semap2.get_ro_start());
+    ASSERT_EQ(chunk_size * (k * 5 + 2), semap2.get_ro_end());
+    ASSERT_EQ(0, semap2.get_start_offset());
+    ASSERT_EQ(chunk_size * 6, semap2.get_end_offset());
+
+    extent_set superset;
+    for (auto &&[_, eset] : ref)
+      superset.union_of(eset);
+
+    ASSERT_EQ(superset, semap2.get_extent_superset());
+  }
+
+  // To test "encode" we need more framework... So will leave to higher level
+  // tests.
+}
+
+// This scenario went wrong in ec transaction code in a cluster-based test.
+TEST(ECUtil, shard_extent_map_t_scenario_1)
+{
+  int k=2;
+  int m=2;
+  int chunk_size = 4096;
+  stripe_info_t sinfo(k, m,  chunk_size*k, vector<shard_id_t>(0));
+  shard_extent_map_t semap(&sinfo);
+
+  bufferlist bl;
+  bl.append_zero(chunk_size);
+  semap.insert_in_shard(shard_id_t(0), chunk_size, bl);
+  semap.insert_in_shard(shard_id_t(0), chunk_size*3, bl);
+  semap.insert_in_shard(shard_id_t(1), chunk_size, bl);
+  semap.insert_in_shard(shard_id_t(1), chunk_size*3, bl);
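+  // With k=2, shard offset X in shard s maps to ro offset
+  // (X / chunk_size) * 2 * chunk_size + s * chunk_size + (X % chunk_size),
+  // so these four buffers span ro [2*chunk_size, 8*chunk_size).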
+
+  for (int i=0; i<k; i++) {
+    auto &&iter = semap.get_extent_map(shard_id_t(i)).begin();
+    ASSERT_EQ(chunk_size, iter.get_off());
+    ASSERT_EQ(chunk_size, iter.get_len());
+    ++iter;
+    ASSERT_EQ(chunk_size*3, iter.get_off());
+    ASSERT_EQ(chunk_size, iter.get_len());
+    ++iter;
+    ASSERT_EQ(semap.get_extent_map(shard_id_t(i)).end(), iter);
+  }
+  ASSERT_FALSE(semap.contains_shard(shard_id_t(2)));
+  ASSERT_FALSE(semap.contains_shard(shard_id_t(3)));
+  ASSERT_EQ(2*chunk_size, semap.get_ro_start());
+  ASSERT_EQ(8*chunk_size, semap.get_ro_end());
+  ASSERT_EQ(chunk_size, semap.get_start_offset());
+  ASSERT_EQ(4*chunk_size, semap.get_end_offset());
+
+  bufferlist bl2;
+  bl2.append_zero(2048);
+  bl2.c_str()[0]='A';
+  ASSERT_EQ('A', bl2.c_str()[0]);
+  bufferlist bl3;
+  bl3.append_zero(2048);
+  bl3.c_str()[0]='B';
+  ASSERT_EQ('B', bl3.c_str()[0]);
+  sinfo.ro_range_to_shard_extent_map(3*chunk_size, 2048, bl2, semap);
+  sinfo.ro_range_to_shard_extent_map(6*chunk_size, 2048, bl3, semap);
+
+  for (int i=0; i<k; i++) {
+    auto &&iter = semap.get_extent_map(shard_id_t(i)).begin();
+    ASSERT_EQ(chunk_size, iter.get_off());
+    ASSERT_EQ(chunk_size, iter.get_len());
+    ++iter;
+    ASSERT_EQ(chunk_size*3, iter.get_off());
+    ASSERT_EQ(chunk_size, iter.get_len());
+    ++iter;
+    ASSERT_EQ(semap.get_extent_map(shard_id_t(i)).end(), iter);
+  }
+  ASSERT_FALSE(semap.contains_shard(shard_id_t(2)));
+  ASSERT_FALSE(semap.contains_shard(shard_id_t(3)));
+  ASSERT_EQ(2*chunk_size, semap.get_ro_start());
+  ASSERT_EQ(8*chunk_size, semap.get_ro_end());
+  ASSERT_EQ(chunk_size, semap.get_start_offset());
+  ASSERT_EQ(4*chunk_size, semap.get_end_offset());
+
+  shard_extent_map_t semap2 = semap.intersect_ro_range(0, 8*chunk_size);
+  for (int i=0; i<k; i++) {
+    auto &&iter = semap.get_extent_map(shard_id_t(i)).begin();
+    ASSERT_EQ(chunk_size, iter.get_off());
+    ASSERT_EQ(chunk_size, iter.get_len());
+    ++iter;
+    ASSERT_EQ(chunk_size*3, iter.get_off());
+    ASSERT_EQ(chunk_size, iter.get_len());
+    ++iter;
+    ASSERT_EQ(semap.get_extent_map(shard_id_t(i)).end(), iter);
+  }
+
+  ASSERT_FALSE(semap.contains_shard(shard_id_t(2)));
+  ASSERT_FALSE(semap.contains_shard(shard_id_t(3)));
+
+  for (int i=0; i<k; i++) {
+    auto &&iter = semap2.get_extent_map(shard_id_t(i)).begin();
+    ASSERT_EQ(chunk_size, iter.get_off());
+    ASSERT_EQ(chunk_size, iter.get_len());
+    ++iter;
+    ASSERT_EQ(chunk_size*3, iter.get_off());
+    ASSERT_EQ(chunk_size, iter.get_len());
+    ++iter;
+    ASSERT_EQ(semap2.get_extent_map(shard_id_t(i)).end(), iter);
+  }
+
+  ASSERT_FALSE(semap2.contains_shard(shard_id_t(2)));
+  ASSERT_FALSE(semap2.contains_shard(shard_id_t(3)));
+
+  semap2.insert_parity_buffers();
+  for (int i=0; i<(k+m); i++) {
+    auto &&iter = semap2.get_extent_map(shard_id_t(i)).begin();
+    ASSERT_EQ(chunk_size, iter.get_off());
+    ASSERT_EQ(chunk_size, iter.get_len());
+    ++iter;
+    ASSERT_EQ(chunk_size*3, iter.get_off());
+    ASSERT_EQ(chunk_size, iter.get_len());
+    ++iter;
+    ASSERT_EQ(semap2.get_extent_map(shard_id_t(i)).end(), iter);
+  }
+}
+
+// This scenario went wrong in ec transaction code in a cluster-based test.
+/*
+ * Recreate of this failure:
+-171> 2024-10-07T11:38:23.746+0100 7fa0df6f4800  0 == test 1 Random offset, random length read/write I/O with queue depth 1 (seqseed 1137522502) ==
+-170> 2024-10-07T11:38:23.746+0100 7fa0df6f4800  5 test Step 0: Create (size=44K)
+-169> 2024-10-07T11:38:23.787+0100 7fa0df6f4800  5 test Step 1: Barrier
+-168> 2024-10-07T11:38:23.787+0100 7fa0df6f4800  5 test Step 2: Write (offset=38K,length=4K)
+-167> 2024-10-07T11:38:23.829+0100 7fa0df6f4800  5 test Step 3: Barrier
+-166> 2024-10-07T11:38:23.829+0100 7fa0df6f4800  5 test Step 4: Write (offset=38K,length=4K)
+-165> 2024-10-07T11:38:23.876+0100 7fa0df6f4800  5 test Step 5: Barrier
+-164> 2024-10-07T11:38:23.876+0100 7fa0df6f4800  5 test Step 6: Write (offset=10K,length=6K)
+-163> 2024-10-07T11:38:23.963+0100 7fa0df6f4800  5 test Step 7: Barrier
+-162> 2024-10-07T11:38:23.963+0100 7fa0df6f4800  5 test Step 8: Write (offset=30K,length=2K)
+*/
+TEST(ECUtil, shard_extent_map_t_insert_ro_buffer)
+{
+  int k=2;
+  int m=2;
+  int chunk_size = 4096;
+  char c = 1;
+  stripe_info_t sinfo(k, m, chunk_size*k, vector<shard_id_t>(0));
+  shard_extent_map_t semap(&sinfo);
+
+  bufferlist bl;
+  bl.append_zero(44*1024);
+
+  char *buf = bl.c_str();
+
+  shard_extent_map_t ref_semap(&sinfo);
+  ref_semap.append_zeros_to_ro_offset(48*1024);
+
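+  /* Each 1k block at ro offset i*1024 lands in 4k chunk i/4, which maps to
+   * data shard (i/4) % k at in-shard offset (i/4/k)*4096 + (i%4)*1024.
+   */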
+  for (char i=0; i<44; i++) {
+    buf[i*1024] = c;
+    int chunk = i/4;
+    shard_id_t shard(chunk % k);
+    int offset = chunk_size * (chunk / k) + i % 4 * 1024;
+    bufferlist tmp;
+    ref_semap.get_buffer(shard, offset, 1024, tmp);
+    tmp.c_str()[0] = c++;
+  }
+
+  sinfo.ro_range_to_shard_extent_map(0, 44*1024, bl, semap);
+  semap.assert_buffer_contents_equal(ref_semap);
+  bufferlist insert_bl;
+  insert_bl.append_zero(2*1024);
+  insert_bl.c_str()[0] = c;
+  {
+    bufferlist tmp;
+    ref_semap.get_buffer(shard_id_t(1), 14*1024, 1024, tmp);
+    tmp.c_str()[0] = c++;
+  }
+  insert_bl.c_str()[1024] = c;
+  {
+    bufferlist tmp;
+    ref_semap.get_buffer(shard_id_t(1), 15*1024, 1024, tmp);
+    tmp.c_str()[0] = c++;
+  }
+
+  sinfo.ro_range_to_shard_extent_map(30*1024, 1024, insert_bl, semap);
+  semap.assert_buffer_contents_equal(ref_semap);
+}
+
+// Sanity check that k=3 buffer inserts work
+TEST(ECUtil, shard_extent_map_t_insert_ro_buffer_3)
+{
+  int k=3;
+  int m=2;
+  int chunk_size = 4096;
+  uint64_t ro_offset = 10 * 1024;
+  uint64_t ro_length = 32 * 1024;
+
+  char c = 5;
+  stripe_info_t sinfo(k, m, chunk_size*k, vector<shard_id_t>(0));
+  shard_extent_map_t semap(&sinfo);
+  bufferlist ref;
+  bufferlist in;
+  ref.append_zero(ro_length);
+  in.append_zero(ro_length);
+
+  for (uint64_t i=0; i<ro_length; i += 2048) {
+    ref.c_str()[i+8] = c;
+    in.c_str()[i+8] = c;
+    c++;
+  }
+
+  extent_map emap_in;
+  emap_in.insert(ro_offset, ro_length, in);
+  semap.insert_ro_extent_map(emap_in);
+  bufferlist out = semap.get_ro_buffer(ro_offset, ro_length);
+
+  ASSERT_TRUE(out.contents_equal(ref)) << semap.debug_string(2048, 0);
+}
+
+TEST(ECUtil, sinfo_ro_size_to_read_mask_lrc) {
+  std::vector<shard_id_t> chunk_mapping = {shard_id_t(1), shard_id_t(2), shard_id_t(0)};
+  stripe_info_t sinfo(2, 1, 2 * 4096, chunk_mapping);
+
+  {
+    shard_extent_set_t read_mask(sinfo.get_k_plus_m());
+    shard_extent_set_t zero_mask(sinfo.get_k_plus_m());
+    sinfo.ro_size_to_read_mask(1, read_mask);
+    sinfo.ro_size_to_zero_mask(1, zero_mask);
+
+    shard_extent_set_t ref_read(sinfo.get_k_plus_m());
+    shard_extent_set_t ref_zero(sinfo.get_k_plus_m());
+    ref_read[shard_id_t(1)].insert(0, 4096);
+    ref_zero[shard_id_t(2)].insert(0, 4096);
+    ref_read[shard_id_t(0)].insert(0, 4096);
+
+    ASSERT_EQ(ref_read, read_mask);
+    ASSERT_EQ(ref_zero, zero_mask);
+  }
+
+  {
+    shard_extent_set_t read_mask(sinfo.get_k_plus_m());
+    shard_extent_set_t zero_mask(sinfo.get_k_plus_m());
+    sinfo.ro_size_to_read_mask(38912, read_mask);
+    sinfo.ro_size_to_zero_mask(38912, zero_mask);
+
+    shard_extent_set_t ref_read(sinfo.get_k_plus_m());
+    shard_extent_set_t ref_zero(sinfo.get_k_plus_m());
+    ref_read[shard_id_t(1)].insert(0, 20480);
+    ref_read[shard_id_t(2)].insert(0, 20480);
+    ref_read[shard_id_t(0)].insert(0, 20480);
+
+    ASSERT_EQ(ref_read, read_mask);
+    ASSERT_EQ(ref_zero, zero_mask);
+  }
+}
+
+TEST(ECUtil, sinfo_ro_size_to_read_mask) {
+  stripe_info_t sinfo(2, 1, 16*4096);
+
+  {
+    shard_extent_set_t read_mask(sinfo.get_k_plus_m());
+    shard_extent_set_t zero_mask(sinfo.get_k_plus_m());
+    sinfo.ro_size_to_read_mask(1, read_mask);
+    sinfo.ro_size_to_zero_mask(1, zero_mask);
+
+    shard_extent_set_t ref_read(sinfo.get_k_plus_m());
+    shard_extent_set_t ref_zero(sinfo.get_k_plus_m());
+    ref_read[shard_id_t(0)].insert(0, 4096);
+    ref_zero[shard_id_t(1)].insert(0, 4096);
+    ref_read[shard_id_t(2)].insert(0, 4096);
+
+    ASSERT_EQ(ref_read, read_mask);
+    ASSERT_EQ(ref_zero, zero_mask);
+  }
+
+  {
+    shard_extent_set_t read_mask(sinfo.get_k_plus_m());
+    shard_extent_set_t zero_mask(sinfo.get_k_plus_m());
+    sinfo.ro_size_to_read_mask(4096, read_mask);
+    sinfo.ro_size_to_zero_mask(4096, zero_mask);
+
+    shard_extent_set_t ref_read(sinfo.get_k_plus_m());
+    shard_extent_set_t ref_zero(sinfo.get_k_plus_m());
+    ref_read[shard_id_t(0)].insert(0, 4096);
+    ref_zero[shard_id_t(1)].insert(0, 4096);
+    ref_read[shard_id_t(2)].insert(0, 4096);
+
+    ASSERT_EQ(ref_read, read_mask);
+    ASSERT_EQ(ref_zero, zero_mask);
+  }
+
+  {
+    shard_extent_set_t read_mask(sinfo.get_k_plus_m());
+    shard_extent_set_t zero_mask(sinfo.get_k_plus_m());
+    sinfo.ro_size_to_read_mask(4097, read_mask);
+    sinfo.ro_size_to_zero_mask(4097, zero_mask);
+
+    shard_extent_set_t ref_read(sinfo.get_k_plus_m());
+    shard_extent_set_t ref_zero(sinfo.get_k_plus_m());
+    ref_read[shard_id_t(0)].insert(0, 8192);
+    ref_zero[shard_id_t(1)].insert(0, 8192);
+    ref_read[shard_id_t(2)].insert(0, 8192);
+
+    ASSERT_EQ(ref_read, read_mask);
+    ASSERT_EQ(ref_zero, zero_mask);
+  }
+
+  {
+    shard_extent_set_t read_mask(sinfo.get_k_plus_m());
+    shard_extent_set_t zero_mask(sinfo.get_k_plus_m());
+    sinfo.ro_size_to_read_mask(8*4096+1, read_mask);
+    sinfo.ro_size_to_zero_mask(8*4096+1, zero_mask);
+
+    shard_extent_set_t ref_read(sinfo.get_k_plus_m());
+    shard_extent_set_t ref_zero(sinfo.get_k_plus_m());
+    ref_read[shard_id_t(0)].insert(0, 8*4096);
+    ref_read[shard_id_t(1)].insert(0, 4096);
+    ref_zero[shard_id_t(1)].insert(4096, 7*4096);
+    ref_read[shard_id_t(2)].insert(0, 8*4096);
+
+    ASSERT_EQ(ref_read, read_mask);
+    ASSERT_EQ(ref_zero, zero_mask);
+  }
+
+  {
+    shard_extent_set_t read_mask(sinfo.get_k_plus_m());
+    shard_extent_set_t zero_mask(sinfo.get_k_plus_m());
+    sinfo.ro_size_to_read_mask(16*4096+1, read_mask);
+    sinfo.ro_size_to_zero_mask(16*4096+1, zero_mask);
+
+    shard_extent_set_t ref_read(sinfo.get_k_plus_m());
+    shard_extent_set_t ref_zero(sinfo.get_k_plus_m());
+    ref_read[shard_id_t(0)].insert(0, 9*4096);
+    ref_read[shard_id_t(1)].insert(0, 8*4096);
+    ref_zero[shard_id_t(1)].insert(8*4096, 1*4096);
+    ref_read[shard_id_t(2)].insert(0, 9*4096);
+
+    ASSERT_EQ(ref_read, read_mask);
+    ASSERT_EQ(ref_zero, zero_mask);
+  }
+}
+
+TEST(ECUtil, slice_iterator)
+{
+  stripe_info_t sinfo(2, 1, 2*4096);
+  shard_id_set out_set;
+  out_set.insert_range(shard_id_t(0), 3);
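+  // insert_range(shard_id_t(0), 3) selects shards 0-2 as the output set.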
+  shard_extent_map_t sem(&sinfo);
+  {
+    auto iter = sem.begin_slice_iterator(out_set);
+    ASSERT_TRUE(iter.get_out_bufferptrs().empty());
+  }
+
+  bufferlist a, b;
+  a.append_zero(8192);
+  a.c_str()[0] = 'A';
+  a.c_str()[4096] = 'C';
+  b.append_zero(4096);
+  b.c_str()[0] = 'B';
+
+  sem.insert_in_shard(shard_id_t(0), 0, a);
+  sem.insert_in_shard(shard_id_t(1), 0, b);
+  {
+    auto iter = sem.begin_slice_iterator(out_set);
+
+    {
+      auto out = iter.get_out_bufferptrs();
+      ASSERT_EQ(0, iter.get_offset());
+      ASSERT_EQ(4096, iter.get_length());
+      ASSERT_EQ(2, out.size());
+      ASSERT_EQ(4096, out[shard_id_t(0)].length());
+      ASSERT_EQ(4096, out[shard_id_t(1)].length());
+      ASSERT_EQ('A', out[shard_id_t(0)].c_str()[0]);
+      ASSERT_EQ('B', out[shard_id_t(1)].c_str()[0]);
+    }
+
+    ++iter;
+    {
+      auto out = iter.get_out_bufferptrs();
+
+      ASSERT_EQ(4096, iter.get_offset());
+      ASSERT_EQ(4096, iter.get_length());
+      ASSERT_FALSE(out.empty());
+      ASSERT_EQ(1, out.size());
+      ASSERT_EQ(4096, out[shard_id_t(0)].length());
+      ASSERT_EQ('C', out[shard_id_t(0)].c_str()[0]);
+    }
+
+    ++iter;
+    ASSERT_TRUE(iter.is_end());
+  }
+
+  // Create a gap.
+  bufferlist d, e;
+  d.append_zero(4096);
+  d.c_str()[0] = 'D';
+  e.append_zero(4096);
+  e.c_str()[0] = 'E';
+  sem.insert_in_shard(shard_id_t(0), 4096*4, d);
+  sem.insert_in_shard(shard_id_t(1), 4096*4, e);
+
+  {
+    auto iter = sem.begin_slice_iterator(out_set);
+
+    {
+      auto out = iter.get_out_bufferptrs();
+      ASSERT_EQ(0, iter.get_offset());
+      ASSERT_EQ(4096, iter.get_length());
+      ASSERT_FALSE(out.empty());
+      ASSERT_EQ(2, out.size());
+      ASSERT_EQ(4096, out[shard_id_t(0)].length());
+      ASSERT_EQ(4096, out[shard_id_t(1)].length());
+      ASSERT_EQ('A', out[shard_id_t(0)].c_str()[0]);
+      ASSERT_EQ('B', out[shard_id_t(1)].c_str()[0]);
+    }
+
+    ++iter;
+    {
+      auto out = iter.get_out_bufferptrs();
+      ASSERT_EQ(4096, iter.get_offset());
+      ASSERT_EQ(4096, iter.get_length());
+      ASSERT_FALSE(out.empty());
+      ASSERT_EQ(1, out.size());
+      ASSERT_EQ(4096, out[shard_id_t(0)].length());
+      ASSERT_EQ('C', out[shard_id_t(0)].c_str()[0]);
+    }
+
+    ++iter;
+    {
+      auto out = iter.get_out_bufferptrs();
+      ASSERT_EQ(4*4096, iter.get_offset());
+      ASSERT_EQ(4096, iter.get_length());
+      ASSERT_FALSE(out.empty());
+      ASSERT_EQ(2, out.size());
+      ASSERT_EQ(4096, out[shard_id_t(0)].length());
+      ASSERT_EQ('D', out[shard_id_t(0)].c_str()[0]);
+      ASSERT_EQ('E', out[shard_id_t(1)].c_str()[0]);
+    }
+
+    ++iter;
+    ASSERT_TRUE(iter.is_end());
+  }
+
+  // Multiple buffers in each shard and gap at start.
+  sem.clear();
+  a.clear();
+  a.append_zero(4096);
+  a.c_str()[0] = 'A';
+  bufferlist c;
+  c.append_zero(4096);
+  c.c_str()[0] = 'C';
+
+  sem.insert_in_shard(shard_id_t(0), 4096*1, a);
+  sem.insert_in_shard(shard_id_t(1), 4096*1, b);
+  sem.insert_in_shard(shard_id_t(0), 4096*2, c);
+  sem.insert_in_shard(shard_id_t(1), 4096*2, d);
+
+  {
+    auto iter = sem.begin_slice_iterator(out_set);
+
+    {
+      auto out = iter.get_out_bufferptrs();
+      ASSERT_EQ(4096, iter.get_offset());
+      ASSERT_EQ(4096, iter.get_length());
+      ASSERT_FALSE(out.empty());
+      ASSERT_EQ(2, out.size());
+      ASSERT_EQ(4096, out[shard_id_t(0)].length());
+      ASSERT_EQ(4096, out[shard_id_t(1)].length());
+      ASSERT_EQ('A', out[shard_id_t(0)].c_str()[0]);
+      ASSERT_EQ('B', out[shard_id_t(1)].c_str()[0]);
+    }
+
+    ++iter;
+    {
+      auto out = iter.get_out_bufferptrs();
+      ASSERT_EQ(2*4096, iter.get_offset());
+      ASSERT_EQ(4096, iter.get_length());
+      ASSERT_FALSE(out.empty());
+      ASSERT_EQ(2, out.size());
+      ASSERT_EQ(4096, out[shard_id_t(0)].length());
+      ASSERT_EQ(4096, out[shard_id_t(1)].length());
+      ASSERT_EQ('C', out[shard_id_t(0)].c_str()[0]);
+      ASSERT_EQ('D', out[shard_id_t(1)].c_str()[0]);
+    }
+
+    ++iter;
+    ASSERT_TRUE(iter.is_end());
+  }
+}
+
+TEST(ECUtil, slice_iterator_subset_out)
+{
+  stripe_info_t sinfo(2, 1, 2*4096);
+  shard_id_set out_set;
+  out_set.insert(shard_id_t(1));
+  shard_extent_map_t sem(&sinfo);
+  {
+    auto iter = sem.begin_slice_iterator(out_set);
+    ASSERT_TRUE(iter.get_in_bufferptrs().empty());
+    ASSERT_TRUE(iter.get_out_bufferptrs().empty());
+  }
+
+  bufferlist a, b;
+  a.append_zero(8192);
+  a.c_str()[0] = 'A';
+  a.c_str()[4096] = 'C';
+  b.append_zero(4096);
+  b.c_str()[0] = 'B';
+
+  sem.insert_in_shard(shard_id_t(0), 0, a);
+  sem.insert_in_shard(shard_id_t(1), 0, b);
+  {
+    auto iter = sem.begin_slice_iterator(out_set);
+
+    {
+      auto in = iter.get_in_bufferptrs();
+      auto out = iter.get_out_bufferptrs();
+      ASSERT_EQ(0, iter.get_offset());
+      ASSERT_EQ(4096, iter.get_length());
+      ASSERT_EQ(1, in.size());
+      ASSERT_EQ(1, out.size());
+      ASSERT_EQ(4096, in[shard_id_t(0)].length());
+      ASSERT_EQ(4096, out[shard_id_t(1)].length());
+      ASSERT_EQ('A', in[shard_id_t(0)].c_str()[0]);
+      ASSERT_EQ('B', out[shard_id_t(1)].c_str()[0]);
+    }
+
+    /* The iterator only cares about outputs, so doesn't care that there is an
+     * extra 4k to go.
+     */
+    ++iter;
+    ASSERT_TRUE(iter.is_end());
+  }
+
+  // Create a gap.
+  bufferlist d, e;
+  d.append_zero(4096);
+  d.c_str()[0] = 'D';
+  e.append_zero(4096);
+  e.c_str()[0] = 'E';
+  sem.insert_in_shard(shard_id_t(0), 4096*4, d);
+  sem.insert_in_shard(shard_id_t(1), 4096*4, e);
+
+  {
+    auto iter = sem.begin_slice_iterator(out_set);
+
+    {
+      auto in = iter.get_in_bufferptrs();
+      auto out = iter.get_out_bufferptrs();
+
+      ASSERT_EQ(0, iter.get_offset());
+      ASSERT_EQ(4096, iter.get_length());
+      ASSERT_FALSE(in.empty());
+      ASSERT_FALSE(out.empty());
+      ASSERT_EQ(1, in.size());
+      ASSERT_EQ(1, out.size());
+      ASSERT_EQ(4096, in[shard_id_t(0)].length());
+      ASSERT_EQ(4096, out[shard_id_t(1)].length());
+      ASSERT_EQ('A', in[shard_id_t(0)].c_str()[0]);
+      ASSERT_EQ('B', out[shard_id_t(1)].c_str()[0]);
+    }
+
+    // Skip the next 4k, since it is not in the output buffer.
+
+    ++iter;
+    {
+      auto in = iter.get_in_bufferptrs();
+      auto out = iter.get_out_bufferptrs();
+
+      ASSERT_EQ(4*4096, iter.get_offset());
+      ASSERT_EQ(4096, iter.get_length());
+      ASSERT_FALSE(in.empty());
+      ASSERT_FALSE(out.empty());
+      ASSERT_EQ(1, in.size());
+      ASSERT_EQ(1, out.size());
+      ASSERT_EQ(4096, in[shard_id_t(0)].length());
+      ASSERT_EQ('D', in[shard_id_t(0)].c_str()[0]);
+      ASSERT_EQ('E', out[shard_id_t(1)].c_str()[0]);
+    }
+
+    ++iter;
+    ASSERT_TRUE(iter.is_end());
+  }
+
+  // Multiple buffers in each shard and gap at start.
+  sem.clear();
+  a.clear();
+  a.append_zero(4096);
+  a.c_str()[0] = 'A';
+  bufferlist c;
+  c.append_zero(4096);
+  c.c_str()[0] = 'C';
+
+  sem.insert_in_shard(shard_id_t(0), 4096*1, a);
+  sem.insert_in_shard(shard_id_t(1), 4096*1, b);
+  sem.insert_in_shard(shard_id_t(0), 4096*2, c);
+  sem.insert_in_shard(shard_id_t(1), 4096*2, d);
+
+  {
+    auto iter = sem.begin_slice_iterator(out_set);
+
+    {
+      auto in = iter.get_in_bufferptrs();
+      auto out = iter.get_out_bufferptrs();
+
+      ASSERT_EQ(4096, iter.get_offset());
+      ASSERT_EQ(4096, iter.get_length());
+      ASSERT_FALSE(in.empty());
+      ASSERT_FALSE(out.empty());
+      ASSERT_EQ(1, in.size());
+      ASSERT_EQ(1, out.size());
+      ASSERT_EQ(4096, in[shard_id_t(0)].length());
+      ASSERT_EQ(4096, out[shard_id_t(1)].length());
+      ASSERT_EQ('A', in[shard_id_t(0)].c_str()[0]);
+      ASSERT_EQ('B', out[shard_id_t(1)].c_str()[0]);
+    }
+
+    ++iter;
+    {
+      auto in = iter.get_in_bufferptrs();
+      auto out = iter.get_out_bufferptrs();
+
+      ASSERT_EQ(2*4096, iter.get_offset());
+      ASSERT_EQ(4096, iter.get_length());
+      ASSERT_FALSE(in.empty());
+      ASSERT_FALSE(out.empty());
+      ASSERT_EQ(1, in.size());
+      ASSERT_EQ(1, out.size());
+      ASSERT_EQ(4096, in[shard_id_t(0)].length());
+      ASSERT_EQ(4096, out[shard_id_t(1)].length());
+      ASSERT_EQ('C', in[shard_id_t(0)].c_str()[0]);
+      ASSERT_EQ('D', out[shard_id_t(1)].c_str()[0]);
+    }
+
+    ++iter;
+    ASSERT_TRUE(iter.is_end());
+  }
+}
+
+TEST(ECUtil, object_size_to_shard_size)
+{
+  // object_size_to_shard_size() should return page-aligned values; each of
+  // these inputs rounds up to the same next-page boundary.
+  std::vector<uint64_t> inputs = {0x4D000, 0x4CCFF, 0x4C001};
+
+  stripe_info_t sinfo(4, 2, 4*4096);
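+  // 0x4D000 is 77 4k chunks: 19 full stripes of k=4 plus one chunk, so
+  // shard 0 and both parity shards round up to 20 chunks (0x14000) while
+  // shards 1-3 keep 19 chunks (0x13000).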
+  for (uint64_t input : inputs)
+  {
+    ASSERT_EQ(0x14000, sinfo.object_size_to_shard_size(input, shard_id_t(0)));
+    ASSERT_EQ(0x13000, sinfo.object_size_to_shard_size(input, shard_id_t(1)));
+    ASSERT_EQ(0x13000, sinfo.object_size_to_shard_size(input, shard_id_t(2)));
+    ASSERT_EQ(0x13000, sinfo.object_size_to_shard_size(input, shard_id_t(3)));
+    ASSERT_EQ(0x14000, sinfo.object_size_to_shard_size(input, shard_id_t(4)));
+    ASSERT_EQ(0x14000, sinfo.object_size_to_shard_size(input, shard_id_t(5)));
+  }
+
+  // Verify +/-1 also rounds correctly
+  ASSERT_EQ(0x13000, sinfo.object_size_to_shard_size(0x4C000, shard_id_t(0)));
+  ASSERT_EQ(0x14000, sinfo.object_size_to_shard_size(0x4D001, shard_id_t(1)));
+}
+
+TEST(ECUtil, slice)
+{
+  int k=4;
+  int m=2;
+  int chunk_size = 4096;
+  stripe_info_t sinfo(k, m, k*4096);
+  shard_extent_map_t sem(&sinfo);
+
+  extent_map emap;
+  buffer::list bl1k;
+  buffer::list bl4k;
+  buffer::list bl16k;
+  buffer::list bl64k;
+
+  bl1k.append_zero(1024);
+  bl4k.append_zero(4096);
+  bl16k.append_zero(chunk_size * k);
+  bl64k.append_zero(chunk_size * k * 4);
+  shard_extent_set_t ref(sinfo.get_k_plus_m());
+
+  sem.insert_in_shard(shard_id_t(1), 512, bl1k);
+  sem.insert_in_shard(shard_id_t(2), 5, bl4k);
+  sem.insert_in_shard(shard_id_t(3), 256, bl16k);
+  sem.insert_in_shard(shard_id_t(4), 5, bl64k);
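+  // slice_map(offset, length) clips each shard's extents to [offset, offset+length).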
+
+  {
+    auto slice_map = sem.slice_map(512, 1024);
+    ASSERT_EQ(4, slice_map.get_extent_maps().size());
+    ASSERT_EQ(512, slice_map.get_start_offset());
+    ASSERT_EQ(512+1024, slice_map.get_end_offset());
+
+    for (int i=1; i<5; i++) {
+      ASSERT_EQ(512, slice_map.get_extent_map(shard_id_t(i)).get_start_off());
+      ASSERT_EQ(512+1024, slice_map.get_extent_map(shard_id_t(i)).get_end_off());
+    }
+  }
+
+  {
+    auto slice_map = sem.slice_map(0, 4096);
+    ASSERT_EQ(4, slice_map.get_extent_maps().size());
+    ASSERT_EQ(5, slice_map.get_start_offset());
+    ASSERT_EQ(4096, slice_map.get_end_offset());
+    ASSERT_EQ(512, slice_map.get_extent_map(shard_id_t(1)).get_start_off());
+    ASSERT_EQ(512 + 1024, slice_map.get_extent_map(shard_id_t(1)).get_end_off());
+    ASSERT_EQ(5, slice_map.get_extent_map(shard_id_t(2)).get_start_off());
+    ASSERT_EQ(4096, slice_map.get_extent_map(shard_id_t(2)).get_end_off());
+    ASSERT_EQ(256, slice_map.get_extent_map(shard_id_t(3)).get_start_off());
+    ASSERT_EQ(4096, slice_map.get_extent_map(shard_id_t(3)).get_end_off());
+    ASSERT_EQ(5, slice_map.get_extent_map(shard_id_t(4)).get_start_off());
+    ASSERT_EQ(4096, slice_map.get_extent_map(shard_id_t(4)).get_end_off());
+  }
+
+  {
+    auto slice_map = sem.slice_map(0, 5);
+    ASSERT_TRUE(slice_map.empty());
+  }
+
+  {
+    auto slice_map = sem.slice_map(64*1024+5, 5);
+    ASSERT_TRUE(slice_map.empty());
+  }
+
+  {
+    auto slice_map = sem.slice_map(5, 64*1024);
+    ASSERT_EQ(slice_map, sem);
+  }
+
+  {
+    auto slice_map = sem.slice_map(0, 65*1024);
+    ASSERT_EQ(slice_map, sem);
+  }
+}
\ No newline at end of file
index da089b6c2d2875804342d8fc40f526d22f8f6ae8..5c662279a5dd9990e1195aaa5db1c0b51520023e 100644 (file)
@@ -16,6 +16,7 @@
 #include "osd/PGTransaction.h"
 #include "osd/ECTransaction.h"
 #include "common/debug.h"
+#include "osd/ECBackend.h"
 
 #include "test/unit.cc"
 
@@ -27,99 +28,344 @@ struct mydpp : public DoutPrefixProvider {
 
 #define dout_context g_ceph_context
 
-TEST(ectransaction, two_writes_separated)
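+// Minimal concrete RMWPipeline::Op for these tests; it carries the
+// transaction it was built from.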
+struct ECTestOp : ECCommon::RMWPipeline::Op {
+  PGTransactionUPtr t;
+};
+
+TEST(ectransaction, two_writes_separated_append)
 {
   hobject_t h;
-  PGTransactionUPtr t(new PGTransaction);
+  PGTransaction::ObjectOperation op;
   bufferlist a, b;
-  t->create(h);
   a.append_zero(565760);
-  t->write(h, 0, a.length(), a, 0);
+  op.buffer_updates.insert(0, a.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{a, 0});
   b.append_zero(2437120);
-  t->write(h, 669856, b.length(), b, 0);
+  op.buffer_updates.insert(669856, b.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{b, 0});
 
-  ECUtil::stripe_info_t sinfo(2, 2, 8192);
-  auto plan = ECTransaction::get_write_plan(
+  pg_pool_t pool;
+  pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS);
+  ECUtil::stripe_info_t sinfo(2, 2, 8192, &pool);
+  shard_id_set shards;
+  shards.insert_range(shard_id_t(), 4);
+  ECTransaction::WritePlanObj plan(
+    h,
+    op,
     sinfo,
-    *t,
-    [&](const hobject_t &i) {
-      ECUtil::HashInfoRef ref(new ECUtil::HashInfo(1));
-      return ref;
-    },
-    &dpp);
-  generic_derr << "to_read " << plan.to_read << dendl;
-  generic_derr << "will_write " << plan.will_write << dendl;
-
-  ASSERT_EQ(0u, plan.to_read.size());
-  ASSERT_EQ(1u, plan.will_write.size());
+    shards,
+    shards,
+    false,
+    0,
+    std::nullopt,
+    std::nullopt,
+    ECUtil::HashInfoRef(new ECUtil::HashInfo(1)),
+    nullptr,
+    0);
+
+  generic_derr << "plan " << plan << dendl;
+
+  ASSERT_FALSE(plan.to_read);
+  ASSERT_EQ(4u, plan.will_write.shard_count());
 }
 
-TEST(ectransaction, two_writes_nearby)
+TEST(ectransaction, two_writes_separated_misaligned_overwrite)
 {
   hobject_t h;
-  PGTransactionUPtr t(new PGTransaction);
+  PGTransaction::ObjectOperation op;
   bufferlist a, b;
-  t->create(h);
-
-  // two nearby writes, both partly touching the same 8192-byte stripe
-  ECUtil::stripe_info_t sinfo(2, 2, 8192);
   a.append_zero(565760);
-  t->write(h, 0, a.length(), a, 0);
+  op.buffer_updates.insert(0, a.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{a, 0});
   b.append_zero(2437120);
-  t->write(h, 569856, b.length(), b, 0);
+  op.buffer_updates.insert(669856, b.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{b, 0});
 
-  auto plan = ECTransaction::get_write_plan(
+  pg_pool_t pool;
+  pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS);
+  ECUtil::stripe_info_t sinfo(2, 2, 8192, &pool, std::vector<shard_id_t>(0));
+  object_info_t oi;
+  oi.size = 3112960;
+  shard_id_set shards;
+  shards.insert_range(shard_id_t(), 4);
+
+  ECTransaction::WritePlanObj plan(
+    h,
+    op,
     sinfo,
-    *t,
-    [&](const hobject_t &i) {
-      ECUtil::HashInfoRef ref(new ECUtil::HashInfo(1));
-      return ref;
-    },
-    &dpp);
-  generic_derr << "to_read " << plan.to_read << dendl;
-  generic_derr << "will_write " << plan.will_write << dendl;
-
-  ASSERT_EQ(0u, plan.to_read.size());
-  ASSERT_EQ(1u, plan.will_write.size());
+    shards,
+    shards,
+    false,
+    oi.size,
+    oi,
+    std::nullopt,
+    ECUtil::HashInfoRef(new ECUtil::HashInfo(1)),
+    nullptr,
+    0);
+
+  generic_derr << "plan " << plan << dendl;
+
+  ASSERT_EQ(2u, (*plan.to_read).shard_count());
+  ASSERT_EQ(4u, plan.will_write.shard_count());
 }
 
-TEST(ectransaction, many_writes)
+// Test writing to an object at an offset which is beyond the end of the
+// current object.
+TEST(ectransaction, partial_write)
 {
   hobject_t h;
-  PGTransactionUPtr t(new PGTransaction);
-  bufferlist a, b;
-  a.append_zero(512);
-  b.append_zero(4096);
-  t->create(h);
-
-  ECUtil::stripe_info_t sinfo(2, 2, 8192);
-  // write 2801664~512
-  // write 2802176~512
-  // write 2802688~512
-  // write 2803200~512
-  t->write(h, 2801664, a.length(), a, 0);
-  t->write(h, 2802176, a.length(), a, 0);
-  t->write(h, 2802688, a.length(), a, 0);
-  t->write(h, 2803200, a.length(), a, 0);
-
-  // write 2805760~4096
-  // write 2809856~4096
-  // write 2813952~4096
-  t->write(h, 2805760, b.length(), b, 0);
-  t->write(h, 2809856, b.length(), b, 0);
-  t->write(h, 2813952, b.length(), b, 0);
-
-  auto plan = ECTransaction::get_write_plan(
+  PGTransaction::ObjectOperation op;
+  bufferlist a;
+
+  // Start by writing 8 bytes to the start of an object.
+  a.append_zero(8);
+  op.buffer_updates.insert(0, a.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{a, 0});
+
+  pg_pool_t pool;
+  pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS);
+  ECUtil::stripe_info_t sinfo(2, 1, 8192, &pool, std::vector<shard_id_t>(0));
+  object_info_t oi;
+  oi.size = 8;
+  shard_id_set shards;
+  shards.insert_range(shard_id_t(), 3);
+
+  ECTransaction::WritePlanObj plan(
+    h,
+    op,
+    sinfo,
+    shards,
+    shards,
+    false,
+    0,
+    oi,
+    std::nullopt,
+    ECUtil::HashInfoRef(new ECUtil::HashInfo(1)),
+    nullptr,
+    0);
+
+  generic_derr << "plan " << plan << dendl;
+
+  // The object is empty, so we expect no reads and a single 4k write to
+  // data shard 0 and to parity shard 2.
+  ASSERT_FALSE(plan.to_read);
+  extent_set ref_write;
+  ref_write.insert(0, 4096);
+  ASSERT_EQ(2u, plan.will_write.shard_count());
+  ASSERT_EQ(ref_write, plan.will_write.at(shard_id_t(0)));
+  ASSERT_EQ(ref_write, plan.will_write.at(shard_id_t(2)));
+}
+
+TEST(ectransaction, overlapping_write_non_aligned)
+{
+  hobject_t h;
+  PGTransaction::ObjectOperation op;
+  bufferlist a;
+
+  // Start by writing 8 bytes to the start of an object.
+  a.append_zero(8);
+  op.buffer_updates.insert(0, a.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{a, 0});
+
+  pg_pool_t pool;
+  pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS);
+  ECUtil::stripe_info_t sinfo(2, 1, 8192, &pool, std::vector<shard_id_t>(0));
+  object_info_t oi;
+  oi.size = 8;
+  shard_id_set shards;
+  shards.insert_range(shard_id_t(), 4);
+  ECTransaction::WritePlanObj plan(
+    h,
+    op,
+    sinfo,
+    shards,
+    shards,
+    false,
+    8,
+    oi,
+    std::nullopt,
+    ECUtil::HashInfoRef(new ECUtil::HashInfo(1)),
+    nullptr,
+    0);
+
+  generic_derr << "plan " << plan << dendl;
+
+  // There should be no overlap of this read.
+  ASSERT_EQ(1u, (*plan.to_read).shard_count());
+  extent_set ref;
+  ref.insert(0, 4096);
+  ASSERT_EQ(2u, plan.will_write.shard_count());
+  ASSERT_EQ(1u, (*plan.to_read).shard_count());
+  ASSERT_EQ(ref, plan.will_write.at(shard_id_t(0)));
+  ASSERT_EQ(ref, plan.will_write.at(shard_id_t(2)));
+}
+
+TEST(ectransaction, test_appending_write_non_aligned)
+{
+  hobject_t h;
+  PGTransaction::ObjectOperation op;
+  bufferlist a;
+
+  // Write 4k at offset 12k, well past the current end of the object.
+  a.append_zero(4096);
+  op.buffer_updates.insert(3*4096, a.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{a, 0});
+
+  pg_pool_t pool;
+  pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS);
+  ECUtil::stripe_info_t sinfo(2, 1, 8192, &pool, std::vector<shard_id_t>(0));
+  object_info_t oi;
+  oi.size = 4*4096;
+  shard_id_set shards;
+  shards.insert_range(shard_id_t(), 4);
+  ECTransaction::WritePlanObj plan(
+    h,
+    op,
+    sinfo,
+    shards,
+    shards,
+    false,
+    8,
+    oi,
+    std::nullopt,
+    ECUtil::HashInfoRef(new ECUtil::HashInfo(1)),
+    nullptr,
+    0);
+
+  generic_derr << "plan " << plan << dendl;
+
+  // We are growing an object from zero with a hole.
+  ASSERT_FALSE(plan.to_read);
+
+  // The writes will not cover the zeroed (hole) parts.
+  ECUtil::shard_extent_set_t ref_write(sinfo.get_k_plus_m());
+  ref_write[shard_id_t(1)].insert(4096, 4096);
+  ref_write[shard_id_t(2)].insert(4096, 4096);
+  ASSERT_EQ(ref_write, plan.will_write);
+}
+
+TEST(ectransaction, append_with_large_hole)
+{
+  hobject_t h;
+  PGTransaction::ObjectOperation op;
+  bufferlist a;
+
+  // We have a 4k write quite a way after the current limit of a 4k object
+  a.append_zero(4096);
+  op.buffer_updates.insert(24*4096, a.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{a, 0});
+
+  pg_pool_t pool;
+  pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS);
+  ECUtil::stripe_info_t sinfo(2, 1, 8192, &pool, std::vector<shard_id_t>(0));
+  object_info_t oi;
+  oi.size = 25*4096;
+  shard_id_set shards;
+  shards.insert_range(shard_id_t(), 4);
+  ECTransaction::WritePlanObj plan(
+    h,
+    op,
     sinfo,
-    *t,
-    [&](const hobject_t &i) {
-      ECUtil::HashInfoRef ref(new ECUtil::HashInfo(1));
-      return ref;
-    },
-    &dpp);
-  generic_derr << "to_read " << plan.to_read << dendl;
-  generic_derr << "will_write " << plan.will_write << dendl;
-
-  ASSERT_EQ(0u, plan.to_read.size());
-  ASSERT_EQ(1u, plan.will_write.size());
+    shards,
+    shards,
+    false,
+    4096,
+    oi,
+    std::nullopt,
+    ECUtil::HashInfoRef(new ECUtil::HashInfo(1)),
+    nullptr,
+    0);
+
+  generic_derr << "plan " << plan << dendl;
+
+  // Should not require any reads.
+  ASSERT_FALSE(plan.to_read);
+
+  // The writes will cover the new zero parts.
+  ECUtil::shard_extent_set_t ref_write(sinfo.get_k_plus_m());
+  ref_write[shard_id_t(0)].insert(12*4096, 4096);
+  ref_write[shard_id_t(2)].insert(12*4096, 4096);
+  ASSERT_EQ(ref_write, plan.will_write);
 }
+
+TEST(ectransaction, test_append_not_page_aligned_with_large_hole)
+{
+  hobject_t h;
+  PGTransaction::ObjectOperation op;
+  bufferlist a;
+
+  // We have a 2k write, not page aligned, quite a way past the end of a 4k object
+  a.append_zero(2048);
+  op.buffer_updates.insert(24*4096 + 1024, a.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{a, 0});
+
+  pg_pool_t pool;
+  pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS);
+  ECUtil::stripe_info_t sinfo(2, 1, 8192, &pool, std::vector<shard_id_t>(0));
+  object_info_t oi;
+  oi.size = 25*4096;
+  shard_id_set shards;
+  shards.insert_range(shard_id_t(), 3);
+  ECTransaction::WritePlanObj plan(
+    h,
+    op,
+    sinfo,
+    shards,
+    shards,
+    false,
+    4096,
+    oi,
+    std::nullopt,
+    ECUtil::HashInfoRef(new ECUtil::HashInfo(1)),
+    nullptr,
+    0);
+
+  generic_derr << "plan " << plan << dendl;
+
+  // No reads (because not yet written)
+  ASSERT_FALSE(plan.to_read);
+
+  // Writes should grow to 4k
+  ECUtil::shard_extent_set_t ref_write(sinfo.get_k_plus_m());
+  ref_write[shard_id_t(0)].insert(12*4096, 4096);
+  ref_write[shard_id_t(2)].insert(12*4096, 4096);
+  ASSERT_EQ(ref_write, plan.will_write);
+}
+
+TEST(ectransaction, test_overwrite_with_missing)
+{
+  hobject_t h;
+  PGTransaction::ObjectOperation op, op2;
+  bufferlist a;
+
+  // Overwrite the start of a 42k object with 14k while one shard is missing.
+  a.append_zero(14*1024);
+  op.buffer_updates.insert(0, a.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{a, 0});
+
+  pg_pool_t pool;
+  pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS);
+  ECUtil::stripe_info_t sinfo(2, 1, 8192, &pool, std::vector<shard_id_t>(0));
+  object_info_t oi;
+  oi.size = 42*1024;
+  shard_id_set shards;
+  shards.insert(shard_id_t(0));
+  shards.insert(shard_id_t(1));
+
+  ECTransaction::WritePlanObj plan(
+    h,
+    op,
+    sinfo,
+    shards,
+    shards,
+    false,
+    42*1024,
+    oi,
+    std::nullopt,
+    ECUtil::HashInfoRef(new ECUtil::HashInfo(1)),
+    nullptr,
+    0);
+
+  generic_derr << "plan " << plan << dendl;
+
+  // The second stripe is only partially overwritten, so shard 1 must be
+  // read for the read-modify-write.
+  ASSERT_TRUE(plan.to_read);
+  ECUtil::shard_extent_set_t ref_read(sinfo.get_k_plus_m());
+  ref_read[shard_id_t(1)].insert(4096, 4096);
+  ASSERT_EQ(ref_read, plan.to_read);
+
+  // Writes round up to cover the first two chunks (8k) on both available shards.
+  ECUtil::shard_extent_set_t ref_write(sinfo.get_k_plus_m());
+  ref_write[shard_id_t(0)].insert(0, 8192);
+  ref_write[shard_id_t(1)].insert(0, 8192);
+  ASSERT_EQ(ref_write, plan.will_write);
+}
\ No newline at end of file
index 9c789ca3274515991ac0770bf6a08e7beb9e3598..12ded85f4082ec25861ef65fff475df885b12dd2 100644 (file)
 
 
 #include <gtest/gtest.h>
-#include "osd/ExtentCache.h"
-#include <iostream>
+#include "osd/ECExtentCache.h"
 
 using namespace std;
+using namespace ECUtil;
 
-extent_map imap_from_vector(vector<pair<uint64_t, uint64_t> > &&in)
+shard_extent_map_t imap_from_vector(vector<vector<pair<uint64_t, uint64_t>>> &&in, stripe_info_t const *sinfo)
 {
-  extent_map out;
-  for (auto &&tup: in) {
-    bufferlist bl;
-    bl.append_zero(tup.second);
-    out.insert(tup.first, bl.length(), bl);
+  shard_extent_map_t out(sinfo);
+  for (int shard = 0; shard < (int)in.size(); shard++) {
+    for (auto &&tup: in[shard]) {
+      bufferlist bl;
+      bl.append_zero(tup.second);
+      out.insert_in_shard(shard_id_t(shard), tup.first, bl);
+    }
   }
   return out;
 }
 
-extent_map imap_from_iset(const extent_set &set)
+shard_extent_map_t imap_from_iset(const shard_extent_set_t &sset, stripe_info_t *sinfo)
 {
-  extent_map out;
-  for (auto &&iter: set) {
-    bufferlist bl;
-    bl.append_zero(iter.second);
-    out.insert(iter.first, iter.second, bl);
+  shard_extent_map_t out(sinfo);
+
+  for (auto &&[shard, set]: sset) {
+    for (auto &&iter: set) {
+      bufferlist bl;
+      bl.append_zero(iter.second);
+      out.insert_in_shard(shard, iter.first, bl);
+    }
   }
   return out;
 }
 
-extent_set iset_from_vector(vector<pair<uint64_t, uint64_t> > &&in)
+shard_extent_set_t iset_from_vector(vector<vector<pair<uint64_t, uint64_t>>> &&in, const stripe_info_t *sinfo)
 {
-  extent_set out;
-  for (auto &&tup: in) {
-    out.insert(tup.first, tup.second);
+  shard_extent_set_t out(sinfo->get_k_plus_m());
+  for (int shard = 0; shard < (int)in.size(); shard++) {
+    for (auto &&tup: in[shard]) {
+      out[shard_id_t(shard)].insert(tup.first, tup.second);
+    }
   }
   return out;
 }
 
-TEST(extentcache, simple_write)
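+/* Minimal test client standing in for the EC backend: backend_read() records
+ * the reads the cache requests, cache_ready() collects completed results, and
+ * complete_read()/complete_write() let a test emulate backend completions.
+ */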
+struct Client : public ECExtentCache::BackendReadListener
 {
-  hobject_t oid;
-
-  ExtentCache c;
-  ExtentCache::write_pin pin;
-  c.open_write_pin(pin);
-
-  auto to_read = iset_from_vector(
-    {{0, 2}, {8, 2}, {20, 2}});
-  auto to_write = iset_from_vector(
-    {{0, 10}, {20, 4}});
-  auto must_read = c.reserve_extents_for_rmw(
-    oid, pin, to_write, to_read);
-  ASSERT_EQ(
-    must_read,
-    to_read);
-
-  c.print(std::cerr);
-
-  auto got = imap_from_iset(must_read);
-  auto pending_read = to_read;
-  pending_read.subtract(must_read);
-
-  auto pending = c.get_remaining_extents_for_rmw(
-    oid,
-    pin,
-    pending_read);
-  ASSERT_TRUE(pending.empty());
-
-  auto write_map = imap_from_iset(to_write);
-  c.present_rmw_update(
-    oid,
-    pin,
-    write_map);
-
-  c.release_write_pin(pin);
+  hobject_t oid = hobject_t().make_temp_hobject("My first object");
+  stripe_info_t sinfo;
+  ECExtentCache::LRU lru;
+  ECExtentCache cache;
+  optional<shard_extent_set_t> active_reads;
+  list<shard_extent_map_t> results;
+
+  Client(uint64_t chunk_size, int k, int m, uint64_t cache_size) :
+    sinfo(k, m, k*chunk_size, vector<shard_id_t>(0)),
+    lru(cache_size), cache(*this, lru, sinfo, g_ceph_context) {};
+
+  void backend_read(hobject_t _oid, const shard_extent_set_t& request,
+    uint64_t object_size) override  {
+    ceph_assert(oid == _oid);
+    active_reads = request;
+  }
+
+  void cache_ready(const hobject_t& _oid, const shard_extent_map_t& _result)
+  {
+    ceph_assert(oid == _oid);
+    results.emplace_back(_result);
+  }
+
+  void complete_read()
+  {
+    auto reads_done = imap_from_iset(*active_reads, &sinfo);
+    active_reads.reset(); // set before done, as may be called back.
+    cache.read_done(oid, std::move(reads_done));
+  }
+
+  void complete_write(ECExtentCache::OpRef &op)
+  {
+    shard_extent_map_t emap = imap_from_iset(op->get_writes(), &sinfo);
+    //Fill in the parity. Parity correctness does not matter to the cache.
+    emap.insert_parity_buffers();
+    results.clear();
+    cache.write_done(op, std::move(emap));
+  }
+
+  void cache_execute(ECExtentCache::OpRef &op)
+  {
+    list<ECExtentCache::OpRef> l;
+    l.emplace_back(op);
+    cache.execute(l);
+  }
+
+  const stripe_info_t *get_stripe_info() const { return &sinfo; }
+};
+
+TEST(ECExtentCache, double_write_done)
+{
+  Client cl(32, 2, 1, 64);
+
+  auto to_write = iset_from_vector({{{0, 10}}, {{0, 10}}}, cl.get_stripe_info());
+
+  optional op = cl.cache.prepare(cl.oid, nullopt, to_write, 10, 10, false,
+  [&cl](ECExtentCache::OpRef &op)
+  {
+    cl.cache_ready(op->get_hoid(), op->get_result());
+  });
+  cl.cache_execute(*op);
+  cl.complete_write(*op);
+}
+
+TEST(ECExtentCache, simple_write)
+{
+  Client cl(32, 2, 1, 64);
+  {
+    auto to_read = iset_from_vector( {{{0, 2}}, {{0, 2}}}, cl.get_stripe_info());
+    auto to_write = iset_from_vector({{{0, 10}}, {{0, 10}}}, cl.get_stripe_info());
+
+    /*    OpRef request(hobject_t const &oid,
+      std::optional<std::shard_extent_set_t> const &to_read,
+      std::shard_extent_set_t const &write,
+      uint64_t orig_size,
+      uint64_t projected_size,
+      CacheReadyCb &&ready_cb)
+      */
+
+    optional op = cl.cache.prepare(cl.oid, to_read, to_write, 10, 10, false,
+      [&cl](ECExtentCache::OpRef &op)
+      {
+        cl.cache_ready(op->get_hoid(), op->get_result());
+      });
+    cl.cache_execute(*op);
+    ASSERT_EQ(to_read, cl.active_reads);
+    ASSERT_TRUE(cl.results.empty());
+    cl.complete_read();
+
+    ASSERT_FALSE(cl.active_reads);
+    ASSERT_EQ(1, cl.results.size());
+    ASSERT_EQ(to_read, cl.results.front().get_extent_set());
+    cl.complete_write(*op);
+
+    ASSERT_FALSE(cl.active_reads);
+    ASSERT_TRUE(cl.results.empty());
+    op.reset();
+  }
+
+  // Repeating the same read should complete without a backend read.
+  {
+    auto to_read = iset_from_vector( {{{0, 2}}, {{0, 2}}}, cl.get_stripe_info());
+    auto to_write = iset_from_vector({{{0, 10}}, {{0, 10}}}, cl.get_stripe_info());
+    optional op = cl.cache.prepare(cl.oid, to_read, to_write, 10, 10, false,
+      [&cl](ECExtentCache::OpRef &op)
+      {
+        cl.cache_ready(op->get_hoid(), op->get_result());
+      });
+    cl.cache_execute(*op);
+    ASSERT_FALSE(cl.active_reads);
+    ASSERT_FALSE(cl.results.empty());
+    ASSERT_EQ(1, cl.results.size());
+    ASSERT_EQ(to_read, cl.results.front().get_extent_set());
+    cl.complete_write(*op);
+    op.reset();
+  }
+
+  // Perform a read overlapping with the previous write, but not the previous read.
+  // This should not result in any backend reads, since the cache can be honoured
+  // from the previous write.
+  {
+    auto to_read = iset_from_vector( {{{2, 2}}, {{2, 2}}}, cl.get_stripe_info());
+    auto to_write = iset_from_vector({{{0, 10}}, {{0, 10}}}, cl.get_stripe_info());
+    optional op = cl.cache.prepare(cl.oid, to_read, to_write, 10, 10, false,
+      [&cl](ECExtentCache::OpRef &op)
+      {
+        cl.cache_ready(op->get_hoid(), op->get_result());
+      });
+    cl.cache_execute(*op);
+
+    // Should have remained in the LRU!
+    ASSERT_FALSE(cl.active_reads);
+    ASSERT_EQ(1, cl.results.size());
+    ASSERT_EQ(to_read, cl.results.front().get_extent_set());
+    cl.complete_write(*op);
+    op.reset();
+  }
+}
+
+TEST(ECExtentCache, sequential_appends) {
+  Client cl(32, 2, 1, 32);
+
+  auto to_write1 = iset_from_vector({{{0, 10}}}, cl.get_stripe_info());
+
+  // The first write...
+  optional op1 = cl.cache.prepare(cl.oid, nullopt, to_write1, 0, 10, false,
+   [&cl](ECExtentCache::OpRef &op)
+   {
+      cl.cache_ready(op->get_hoid(), op->get_result());
+   });
+  cl.cache_execute(*op1);
+
+  // Write should have been honoured immediately.
+  ASSERT_FALSE(cl.results.empty());
+  auto to_write2 = iset_from_vector({{{10, 10}}}, cl.get_stripe_info());
+  cl.complete_write(*op1);
+  ASSERT_TRUE(cl.results.empty());
+
+  // The second write, appending to the first...
+  optional op2 = cl.cache.prepare(cl.oid, nullopt, to_write2, 10, 20, false,
+   [&cl](ECExtentCache::OpRef &op)
+   {
+      cl.cache_ready(op->get_hoid(), op->get_result());
+   });
+  cl.cache_execute(*op2);
+
+  ASSERT_FALSE(cl.results.empty());
+  cl.complete_write(*op2);
+}
 
-TEST(extentcache, write_write_overlap)
+TEST(ECExtentCache, multiple_writes)
 {
-  hobject_t oid;
-
-  ExtentCache c;
-  ExtentCache::write_pin pin;
-  c.open_write_pin(pin);
-
-  // start write 1
-  auto to_read = iset_from_vector(
-    {{0, 2}, {8, 2}, {20, 2}});
-  auto to_write = iset_from_vector(
-    {{0, 10}, {20, 4}});
-  auto must_read = c.reserve_extents_for_rmw(
-    oid, pin, to_write, to_read);
-  ASSERT_EQ(
-    must_read,
-    to_read);
-
-  c.print(std::cerr);
-
-  // start write 2
-  ExtentCache::write_pin pin2;
-  c.open_write_pin(pin2);
-  auto to_read2 = iset_from_vector(
-    {{2, 4}, {10, 4}, {18, 4}});
-  auto to_write2 = iset_from_vector(
-    {{2, 12}, {18, 12}});
-  auto must_read2 = c.reserve_extents_for_rmw(
-    oid, pin2, to_write2, to_read2);
-  ASSERT_EQ(
-    must_read2,
-    iset_from_vector({{10, 4}, {18, 2}}));
-
-  c.print(std::cerr);
-
-  // complete read for write 1 and start commit
-  auto got = imap_from_iset(must_read);
-  auto pending_read = to_read;
-  pending_read.subtract(must_read);
-  auto pending = c.get_remaining_extents_for_rmw(
-    oid,
-    pin,
-    pending_read);
-  ASSERT_TRUE(pending.empty());
-
-  auto write_map = imap_from_iset(to_write);
-  c.present_rmw_update(
-    oid,
-    pin,
-    write_map);
-
-  c.print(std::cerr);
-
-  // complete read for write 2 and start commit
-  auto pending_read2 = to_read2;
-  pending_read2.subtract(must_read2);
-  auto pending2 = c.get_remaining_extents_for_rmw(
-    oid,
-    pin2,
-    pending_read2);
-  ASSERT_EQ(
-    pending2,
-    imap_from_iset(pending_read2));
-
-  auto write_map2 = imap_from_iset(to_write2);
-  c.present_rmw_update(
-    oid,
-    pin2,
-    write_map2);
-
-  c.print(std::cerr);
-
-  c.release_write_pin(pin);
-
-  c.print(std::cerr);
-
-  c.release_write_pin(pin2);
+  Client cl(32, 2, 1, 32);
+
+  auto to_read1 = iset_from_vector( {{{0, 2}}}, cl.get_stripe_info());
+  auto to_write1 = iset_from_vector({{{0, 10}}}, cl.get_stripe_info());
+
+  // This should drive a request for this IO, which we do not yet honour.
+  optional op1 = cl.cache.prepare(cl.oid, to_read1, to_write1, 10, 10, false,
+   [&cl](ECExtentCache::OpRef &op)
+   {
+      cl.cache_ready(op->get_hoid(), op->get_result());
+   });
+  cl.cache_execute(*op1);
+  ASSERT_EQ(to_read1, cl.active_reads);
+  ASSERT_TRUE(cl.results.empty());
+
+  // Perform another request. We should not see any change in the read requests.
+  auto to_read2 = iset_from_vector( {{{8, 4}}}, cl.get_stripe_info());
+  auto to_write2 = iset_from_vector({{{10, 10}}}, cl.get_stripe_info());
+  optional op2 = cl.cache.prepare(cl.oid, to_read2, to_write2, 10, 10, false,
+   [&cl](ECExtentCache::OpRef &op)
+   {
+      cl.cache_ready(op->get_hoid(), op->get_result());
+   });
+  cl.cache_execute(*op2);
+  ASSERT_EQ(to_read1, cl.active_reads);
+  ASSERT_TRUE(cl.results.empty());
+
+  // Perform another request, this to check that reads are coalesced.
+  auto to_read3 = iset_from_vector( {{{32, 6}}}, cl.get_stripe_info());
+  auto to_write3 = iset_from_vector({}, cl.get_stripe_info());
+  optional op3 = cl.cache.prepare(cl.oid, to_read3, to_write3, 10, 10, false,
+   [&cl](ECExtentCache::OpRef &op)
+   {
+      cl.cache_ready(op->get_hoid(), op->get_result());
+   });
+  cl.cache_execute(*op3);
+  ASSERT_EQ(to_read1, cl.active_reads);
+  ASSERT_TRUE(cl.results.empty());
+
+  // Finally op4, with no reads.
+  auto to_write4 = iset_from_vector({{{20, 10}}}, cl.get_stripe_info());
+  optional op4 = cl.cache.prepare(cl.oid, nullopt, to_write4, 10, 10, false,
+   [&cl](ECExtentCache::OpRef &op)
+   {
+      cl.cache_ready(op->get_hoid(), op->get_result());
+   });
+  cl.cache_execute(*op4);
+  ASSERT_EQ(to_read1, cl.active_reads);
+  ASSERT_TRUE(cl.results.empty());
+
+  // Completing the first read will allow the first write and start a batched read.
+  // Note that the cache must not read what was written in op 1.
+  cl.complete_read();
+  auto expected_read = iset_from_vector({{{10,2}, {32,6}}}, cl.get_stripe_info());
+  ASSERT_EQ(expected_read, cl.active_reads);
+  ASSERT_EQ(1, cl.results.size());
+  ASSERT_EQ(to_read1, cl.results.front().get_extent_set());
+  cl.complete_write(*op1);
+
+  // The next write requires some more reads, so it should not complete yet.
+  ASSERT_TRUE(cl.results.empty());
+
+  // All reads complete; ops 2-4 should now all be ready.
+  cl.complete_read();
+  ASSERT_FALSE(cl.active_reads);
+  ASSERT_EQ(3, cl.results.size());
+  auto result = cl.results.begin();
+  ASSERT_EQ(to_read2, result++->get_extent_set());
+  ASSERT_EQ(to_read3, result++->get_extent_set());
+  ASSERT_TRUE(result++->empty());
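+  // (op4 requested no reads, so its result is empty.)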
+
+  cl.complete_write(*op2);
+  cl.complete_write(*op3);
+  cl.complete_write(*op4);
+
+  op1.reset();
+  op2.reset();
+  op3.reset();
+  op4.reset();
 }
 
-TEST(extentcache, write_write_overlap2)
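+/* Dummy counts live instances via the global, letting the test observe when
+ * the cancelled op's lambda (and everything it captured) is destroyed. */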
+int dummies;
+struct Dummy
+{
+  Dummy() {dummies++;}
+  ~Dummy() {dummies--;}
+};
+
+TEST(ECExtentCache, on_change)
 {
-  hobject_t oid;
-
-  ExtentCache c;
-  ExtentCache::write_pin pin;
-  c.open_write_pin(pin);
-
-  // start write 1
-  auto to_read = extent_set();
-  auto to_write = iset_from_vector(
-    {{659456, 4096}});
-  auto must_read = c.reserve_extents_for_rmw(
-    oid, pin, to_write, to_read);
-  ASSERT_EQ(
-    must_read,
-    to_read);
-
-  c.print(std::cerr);
-
-  // start write 2
-  ExtentCache::write_pin pin2;
-  c.open_write_pin(pin2);
-  auto to_read2 = extent_set();
-  auto to_write2 = iset_from_vector(
-    {{663552, 4096}});
-  auto must_read2 = c.reserve_extents_for_rmw(
-    oid, pin2, to_write2, to_read2);
-  ASSERT_EQ(
-    must_read2,
-    to_read2);
-
-
-  // start write 3
-  ExtentCache::write_pin pin3;
-  c.open_write_pin(pin3);
-  auto to_read3 = iset_from_vector({{659456, 8192}});
-  auto to_write3 = iset_from_vector({{659456, 8192}});
-  auto must_read3 = c.reserve_extents_for_rmw(
-    oid, pin3, to_write3, to_read3);
-  ASSERT_EQ(
-    must_read3,
-    extent_set());
-
-  c.print(std::cerr);
-
-  // complete read for write 1 and start commit
-  auto got = imap_from_iset(must_read);
-  auto pending_read = to_read;
-  pending_read.subtract(must_read);
-  auto pending = c.get_remaining_extents_for_rmw(
-    oid,
-    pin,
-    pending_read);
-  ASSERT_TRUE(pending.empty());
-
-  auto write_map = imap_from_iset(to_write);
-  c.present_rmw_update(
-    oid,
-    pin,
-    write_map);
-
-  c.print(std::cerr);
-
-  // complete read for write 2 and start commit
-  auto pending_read2 = to_read2;
-  pending_read2.subtract(must_read2);
-  auto pending2 = c.get_remaining_extents_for_rmw(
-    oid,
-    pin2,
-    pending_read2);
-  ASSERT_EQ(
-    pending2,
-    imap_from_iset(pending_read2));
-
-  auto write_map2 = imap_from_iset(to_write2);
-  c.present_rmw_update(
-    oid,
-    pin2,
-    write_map2);
-
-  // complete read for write 2 and start commit
-  auto pending_read3 = to_read3;
-  pending_read3.subtract(must_read3);
-  auto pending3 = c.get_remaining_extents_for_rmw(
-    oid,
-    pin3,
-    pending_read3);
-  ASSERT_EQ(
-    pending3,
-    imap_from_iset(pending_read3));
-
-  auto write_map3 = imap_from_iset(to_write3);
-  c.present_rmw_update(
-    oid,
-    pin3,
-    write_map3);
-
-
-  c.print(std::cerr);
-
-  c.release_write_pin(pin);
-
-  c.print(std::cerr);
-
-  c.release_write_pin(pin2);
-
-  c.print(std::cerr);
-
-  c.release_write_pin(pin3);
+  Client cl(32, 2, 1, 64);
+  auto to_read1 = iset_from_vector( {{{0, 2}}}, cl.get_stripe_info());
+  auto to_write1 = iset_from_vector({{{0, 10}}}, cl.get_stripe_info());
+
+  optional<ECExtentCache::OpRef> op;
+  optional<shared_ptr<Dummy>> dummy;
+
+  dummy.emplace(make_shared<Dummy>());
+  ceph_assert(dummies == 1);
+  {
+    shared_ptr<Dummy> d = *dummy;
+    /* Here we generate an op that we never expect to be completed. Note that
+     * some static code analysis tools suggest deleting d here. DO NOT DO THIS
+     * as we are relying on side effects from the destruction of d in this test.
+     */
+    op.emplace(cl.cache.prepare(cl.oid, to_read1, to_write1, 10, 10, false,
+      [d](ECExtentCache::OpRef &ignored)
+      {
+        ceph_abort("Should be cancelled");
+      }));
+  }
+  cl.cache_execute(*op);
+
+  /* We now have the following graph of objects:
+   * cache -- op -- lambda -- d
+   *                 dummy --/
+   */
+  ASSERT_EQ(1, dummies);
+
+  /* Executing on_change() will "cancel" this cache op, causing it to release
+   * the lambda and reducing the graph to dummy -- d.
+   */
+  cl.cache.on_change();
+  ASSERT_EQ(1, dummies);
+
+  /* This emulates the rmw pipeline clearing outstanding IO. We now hold no
+   * references to d, so the object should have been destroyed.
+   */
+  dummy.reset();
+  ASSERT_EQ(0, dummies);
+
+  /* Keeping the op alive here emulates the dummy keeping a record of the
+   * cache op; it too is destroyed at this point by the rmw pipeline.
+   */
+  ASSERT_FALSE(cl.cache.idle());
+  op.reset();
+  ASSERT_TRUE(cl.cache.idle());
+
+  // The cache has its own asserts, which we should honour.
+  cl.cache.on_change2();
+}
+
+TEST(ECExtentCache, multiple_misaligned_writes)
+{
+  Client cl(256*1024, 2, 1, 1024*1024);
+
+  // IO 1 is really a 6k write. It is inflated to 8k, but the second 4k page is
+  // only partially overwritten, so we read that page for the RMW.
+  auto to_read1 = iset_from_vector( {{{4*1024, 4*1024}}}, cl.get_stripe_info());
+  auto to_write1 = iset_from_vector({{{0, 8*1024}}}, cl.get_stripe_info());
+
+  // IO 2 is the next 8k write, starting at 6k, so after inflation it is a 12k
+  // write which reads its first and last pages. The first page should already
+  // be in the cache.
+  auto to_read2 = iset_from_vector( {{{4*1024, 4*1024}, {12*4096, 4*4096}}}, cl.get_stripe_info());
+  auto to_read2_exec = iset_from_vector( {{{12*4096, 4*4096}}}, cl.get_stripe_info());
+  auto to_write2 = iset_from_vector({{{4*1024, 12*1024}}}, cl.get_stripe_info());
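+  // Only to_read2_exec should reach the backend: IO 1 already wrote the first
+  // page, so the cache can serve it without a read.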
+
+  // IO 3 is the next misaligned write, very similar to IO 2.
+  auto to_read3 = iset_from_vector( {{{12*1024, 4*1024}, {20*4096, 4*4096}}}, cl.get_stripe_info());
+  auto to_read3_exec = iset_from_vector( {{{20*4096, 4*4096}}}, cl.get_stripe_info());
+  auto to_write3 = iset_from_vector({{{12*1024, 12*1024}}}, cl.get_stripe_info());
+
+  // Perform the first write, which should result in a read.
+  optional op1 = cl.cache.prepare(cl.oid, to_read1, to_write1, 22*1024, 22*1024, false,
+   [&cl](ECExtentCache::OpRef &op)
+   {
+     cl.cache_ready(op->get_hoid(), op->get_result());
+   });
+  cl.cache_execute(*op1);
+  ASSERT_EQ(to_read1, cl.active_reads);
+  ASSERT_TRUE(cl.results.empty());
+
+  // Submit the second IO.
+  optional op2 = cl.cache.prepare(cl.oid, to_read2, to_write2, 22*1024, 22*1024, false,
+   [&cl](ECExtentCache::OpRef &op)
+   {
+     cl.cache_ready(op->get_hoid(), op->get_result());
+   });
+  cl.cache_execute(*op2);
+  // We should still be executing read 1.
+  ASSERT_EQ(to_read1, cl.active_reads);
+  ASSERT_TRUE(cl.results.empty());
+
+  // Allow the read to complete. We should now have op1 done...
+  cl.complete_read();
+  ASSERT_EQ(to_read2_exec, cl.active_reads);
+  ASSERT_FALSE(cl.results.empty());
+  cl.complete_write(*op1);
+
+  // And move on to op3
+  optional op3 = cl.cache.prepare(cl.oid, to_read3, to_write3, 22*1024, 22*1024, false,
+   [&cl](ECExtentCache::OpRef &op)
+   {
+     cl.cache_ready(op->get_hoid(), op->get_result());
+   });
+  cl.cache_execute(*op3);
+  // We should still be executing read 2.
+  ASSERT_EQ(to_read2_exec, cl.active_reads);
+  ASSERT_TRUE(cl.results.empty());
+
+  // Allow the read to complete. We should now have op2 done...
+  cl.complete_read();
+  ASSERT_EQ(to_read3_exec, cl.active_reads);
+  ASSERT_FALSE(cl.results.empty());
+  cl.complete_write(*op2);
+  ASSERT_EQ(to_read3_exec, cl.active_reads);
+  ASSERT_TRUE(cl.results.empty());
+  cl.complete_read();
+  ASSERT_FALSE(cl.results.empty());
+  cl.complete_write(*op3);
 }
+
+TEST(ECExtentCache, multiple_misaligned_writes2)
+{
+  Client cl(256*1024, 2, 1, 1024*1024);
+
+  // IO 1 is really a 6k write. It is inflated to 8k, but the second 4k page is
+  // only partially overwritten, so we read that page for the RMW.
+  auto to_read1 = iset_from_vector( {{{4*1024, 4*1024}}}, cl.get_stripe_info());
+  auto to_write1 = iset_from_vector({{{0, 8*1024}}}, cl.get_stripe_info());
+
+  // IO 2 is the next 8k write, starting at 6k, so after inflation it is a 12k
+  // write which reads its first and last pages. The first page should already
+  // be in the cache.
+  auto to_read2 = iset_from_vector( {{{4*1024, 4*1024}, {12*1024, 4*1024}}}, cl.get_stripe_info());
+  auto to_read2_exec = iset_from_vector( {{{12*1024, 4*1024}}}, cl.get_stripe_info());
+  auto to_write2 = iset_from_vector({{{4*1024, 12*1024}}}, cl.get_stripe_info());
+
+  // IO 3 is the next misaligned write, very similar to IO 2.
+  auto to_read3 = iset_from_vector( {{{12*1024, 4*1024}, {20*1024, 4*1024}}}, cl.get_stripe_info());
+  auto to_read3_exec = iset_from_vector( {{{20*1024, 4*1024}}}, cl.get_stripe_info());
+  auto to_write3 = iset_from_vector({{{12*1024, 12*1024}}}, cl.get_stripe_info());
+
+  // Perform the first write, which should result in a read.
+  optional op1 = cl.cache.prepare(cl.oid, to_read1, to_write1, 22*1024, 22*1024, false,
+   [&cl](ECExtentCache::OpRef &op)
+   {
+     cl.cache_ready(op->get_hoid(), op->get_result());
+   });
+  cl.cache_execute(*op1);
+  ASSERT_EQ(to_read1, cl.active_reads);
+  ASSERT_TRUE(cl.results.empty());
+
+  // Submit the second IO.
+  optional op2 = cl.cache.prepare(cl.oid, to_read2, to_write2, 22*1024, 22*1024, false,
+   [&cl](ECExtentCache::OpRef &op)
+   {
+     cl.cache_ready(op->get_hoid(), op->get_result());
+   });
+  cl.cache_execute(*op2);
+  // We should still be executing read 1.
+  ASSERT_EQ(to_read1, cl.active_reads);
+  ASSERT_TRUE(cl.results.empty());
+
+  // Allow the read to complete. We should now have op1 done...
+  cl.complete_read();
+  ASSERT_EQ(to_read2_exec, cl.active_reads);
+  ASSERT_FALSE(cl.results.empty());
+  cl.complete_write(*op1);
+
+  // And move on to op3
+  optional op3 = cl.cache.prepare(cl.oid, to_read3, to_write3, 22*1024, 22*1024, false,
+   [&cl](ECExtentCache::OpRef &op)
+   {
+     cl.cache_ready(op->get_hoid(), op->get_result());
+   });
+  cl.cache_execute(*op3);
+  // We should still be executing read 2.
+  ASSERT_EQ(to_read2_exec, cl.active_reads);
+  ASSERT_TRUE(cl.results.empty());
+
+  // Allow the read to complete. We should now have op2 done...
+  cl.complete_read();
+  ASSERT_EQ(to_read3_exec, cl.active_reads);
+  ASSERT_FALSE(cl.results.empty());
+  cl.complete_write(*op2);
+  ASSERT_EQ(to_read3_exec, cl.active_reads);
+  ASSERT_TRUE(cl.results.empty());
+  cl.complete_read();
+  ASSERT_FALSE(cl.results.empty());
+  cl.complete_write(*op3);
+}
+
+TEST(ECExtentCache, test_invalidate)
+{
+  Client cl(256*1024, 2, 1, 1024*1024);
+
+  /* First test: a write which must read, followed by an invalidating write
+   * which reads nothing. */
+  {
+    auto to_read1 = iset_from_vector( {{{0, 4096}}}, cl.get_stripe_info());
+    auto to_write1 = iset_from_vector({{{0, 4096}}}, cl.get_stripe_info());
+    optional op1 = cl.cache.prepare(cl.oid, to_read1, to_write1, 4096, 4096, false,
+      [&cl](ECExtentCache::OpRef &op)
+      {
+        cl.cache_ready(op->get_hoid(), op->get_result());
+      });
+    cl.cache_execute(*op1);
+    ASSERT_EQ(to_read1, cl.active_reads);
+    ASSERT_TRUE(cl.results.empty());
+
+    /* Now perform an invalidating cache write (a truncate to size 0, with no reads). */
+    optional op2 = cl.cache.prepare(cl.oid, nullopt, shard_extent_set_t(cl.sinfo.get_k_plus_m()), 4*1024, 0, false,
+      [&cl](ECExtentCache::OpRef &op)
+      {
+        cl.cache_ready(op->get_hoid(), op->get_result());
+      });
+    cl.cache_execute(*op2);
+
+    cl.complete_read();
+    ASSERT_EQ(2, cl.results.size());
+    auto result = cl.results.begin();
+    ASSERT_FALSE(result++->empty());
+    ASSERT_TRUE(result++->empty());
+
+    cl.complete_write(*op1);
+    ASSERT_FALSE(cl.active_reads);
+    cl.complete_write(*op2);
+
+    cl.cache.on_change();
+    op1.reset();
+    op2.reset();
+    cl.cache.on_change2();
+    ASSERT_TRUE(cl.cache.idle());
+  }
+
+  /* Second test: modifies, deletes, creates, then modifies. */
+  {
+    auto to_read1 = iset_from_vector( {{{0, 8192}}}, cl.get_stripe_info());
+    auto to_write1 = iset_from_vector({{{0, 8192}}}, cl.get_stripe_info());
+    auto to_write2 = iset_from_vector({{{4096, 4096}}}, cl.get_stripe_info());
+    auto to_read3 = iset_from_vector( {{{0, 4096}}}, cl.get_stripe_info());
+    auto to_write3 = iset_from_vector({{{0, 4096}}}, cl.get_stripe_info());
+    optional op1 = cl.cache.prepare(cl.oid, to_read1, to_write1, 8192, 8192, false,
+      [&cl](ECExtentCache::OpRef &op)
+      {
+        cl.cache_ready(op->get_hoid(), op->get_result());
+      });
+    optional op2 = cl.cache.prepare(cl.oid, nullopt, shard_extent_set_t(cl.sinfo.get_k_plus_m()), 4*1024, 0, false,
+      [&cl](ECExtentCache::OpRef &op)
+      {
+        cl.cache_ready(op->get_hoid(), op->get_result());
+      });
+    optional op3 = cl.cache.prepare(cl.oid, nullopt, to_write2, 0, 8192, false,
+      [&cl](ECExtentCache::OpRef &op)
+      {
+        cl.cache_ready(op->get_hoid(), op->get_result());
+      });
+    optional op4 = cl.cache.prepare(cl.oid, to_read3, to_write3, 8192, 8192, false,
+      [&cl](ECExtentCache::OpRef &op)
+      {
+        cl.cache_ready(op->get_hoid(), op->get_result());
+      });
+    cl.cache_execute(*op1);
+    cl.cache_execute(*op2);
+    cl.cache_execute(*op3);
+    cl.cache_execute(*op4);
+
+    /* Only the first op must actually perform a read. */
+    cl.complete_read();
+    ASSERT_EQ(4, cl.results.size());
+    auto result = cl.results.begin();
+    ASSERT_FALSE(result++->empty());
+    ASSERT_TRUE(result++->empty());
+    ASSERT_TRUE(result++->empty());
+    ASSERT_TRUE(result++->empty());
+    cl.complete_write(*op1);
+    cl.complete_write(*op2);
+    cl.complete_write(*op3);
+    cl.complete_write(*op4);
+
+    cl.cache.on_change();
+    op1.reset();
+    op2.reset();
+    op3.reset();
+    op4.reset();
+    cl.cache.on_change2();
+    ASSERT_TRUE(cl.cache.idle());
+  }
+}
+
+TEST(ECExtentCache, test_invalidate_lru)
+{
+  uint64_t c = 4096;
+  int k = 4;
+  int m = 2;
+  Client cl(c, k, m, 1024*c);
+
+  /* Populate the cache LRU and then invalidate the cache. */
+  {
+    uint64_t bs = 3767;  // A deliberately page-unaligned block size.
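+    // Each write below also covers the parity shards (shard ids k and k+1).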
+    auto io1 = iset_from_vector({{{align_page_prev(35*bs), align_page_next(36*bs) - align_page_prev(35*bs)}}}, cl.get_stripe_info());
+    io1[shard_id_t(k)].insert(io1.get_extent_superset());
+    io1[shard_id_t(k+1)].insert(io1.get_extent_superset());
+    auto io2 = iset_from_vector({{{align_page_prev(18*bs), align_page_next(19*bs) - align_page_prev(18*bs)}}}, cl.get_stripe_info());
+    io2[shard_id_t(k)].insert(io1.get_extent_superset());
+    io2[shard_id_t(k+1)].insert(io1.get_extent_superset());
+    // IO 3 is the truncate.
+    auto io3 = shard_extent_set_t(cl.sinfo.get_k_plus_m());
+    auto io4 = iset_from_vector({{{align_page_prev(30*bs), align_page_next(31*bs) - align_page_prev(30*bs)}}}, cl.get_stripe_info());
+    io3[shard_id_t(k)].insert(io1.get_extent_superset());
+    io3[shard_id_t(k+1)].insert(io1.get_extent_superset());
+    auto io5 = iset_from_vector({{{align_page_prev(18*bs), align_page_next(19*bs) - align_page_prev(18*bs)}}}, cl.get_stripe_info());
+    io4[shard_id_t(k)].insert(io1.get_extent_superset());
+    io4[shard_id_t(k+1)].insert(io1.get_extent_superset());
+
+    optional op1 = cl.cache.prepare(cl.oid, nullopt, io1, 0, align_page_next(36*bs), false,
+      [&cl](ECExtentCache::OpRef &op)
+      {
+        cl.cache_ready(op->get_hoid(), op->get_result());
+      });
+
+    cl.cache_execute(*op1);
+    ASSERT_FALSE(cl.active_reads);
+    cl.complete_write(*op1);
+    op1.reset();
+
+    optional op2 = cl.cache.prepare(cl.oid, io2, io2, align_page_next(36*bs), align_page_next(36*bs), false,
+      [&cl](ECExtentCache::OpRef &op)
+      {
+        cl.cache_ready(op->get_hoid(), op->get_result());
+      });
+    cl.cache_execute(*op2);
+    // We have active reads because the object was discarded from the cache
+    // and has forgotten about all the zero reads.
+    ASSERT_TRUE(cl.active_reads);
+    cl.complete_read();
+    cl.complete_write(*op2);
+    op2.reset();
+
+    optional op3 = cl.cache.prepare(cl.oid, nullopt, io3, align_page_next(36*bs), 0, false,
+      [&cl](ECExtentCache::OpRef &op)
+      {
+        cl.cache_ready(op->get_hoid(), op->get_result());
+      });
+    cl.cache_execute(*op3);
+    ASSERT_FALSE(cl.active_reads);
+    cl.complete_write(*op3);
+    op3.reset();
+
+    optional op4 = cl.cache.prepare(cl.oid, nullopt, io4, 0, align_page_next(30*bs), false,
+      [&cl](ECExtentCache::OpRef &op)
+      {
+        cl.cache_ready(op->get_hoid(), op->get_result());
+      });
+    cl.cache_execute(*op4);
+    ASSERT_FALSE(cl.active_reads);
+    cl.complete_write(*op4);
+    op4.reset();
+
+    optional op5 = cl.cache.prepare(cl.oid, io5, io5, align_page_next(30*bs), align_page_next(30*bs), false,
+      [&cl](ECExtentCache::OpRef &op)
+      {
+        cl.cache_ready(op->get_hoid(), op->get_result());
+      });
+    cl.cache_execute(*op5);
+    ASSERT_TRUE(cl.active_reads);
+    cl.complete_write(*op5);
+    op5.reset();
+  }
+}
\ No newline at end of file
index 9a5dc09100c4274ece937e8f6fa378fe8a20950d..3a0fb888da4b50b3323c1f628d17f13579ea3702 100644 (file)
@@ -6,7 +6,6 @@
 #include "common/ceph_argparse.h"
 #include "common/config_proxy.h"
 #include "common/errno.h"
-#include "erasure-code/ErasureCode.h"
 #include "erasure-code/ErasureCodePlugin.h"
 #include "global/global_context.h"
 #include "global/global_init.h"
@@ -96,7 +95,7 @@ int ec_init(const std::string &profile_str,
   uint64_t stripe_size = atoi(profile["k"].c_str());
   ceph_assert(stripe_size > 0);
   uint64_t stripe_width = stripe_size * stripe_unit;
-  sinfo->reset(new ECUtil::stripe_info_t(*ec_impl, stripe_width));
+  sinfo->reset(new ECUtil::stripe_info_t(*ec_impl, nullptr, stripe_width));
 
   return 0;
 }
@@ -196,37 +195,36 @@ int do_encode(const std::vector<const char*> &args) {
     return r;
   }
 
-  std::set<int> want;
+  ECUtil::shard_extent_map_t encoded_data(sinfo.get());
   std::vector<std::string> shards;
   boost::split(shards, args[2], boost::is_any_of(","));
-  for (auto &shard : shards) {
-    want.insert(atoi(shard.c_str()));
-  }
-  ceph::bufferlist decoded_data;
+  ceph::bufferlist input_data;
   std::string fname = args[3];
 
   std::string error;
-  r = decoded_data.read_file(fname.c_str(), &error);
+  r = input_data.read_file(fname.c_str(), &error);
   if (r < 0) {
     std::cerr << "failed to read " << fname << ": " << error << std::endl;
     return 1;
   }
 
   uint64_t stripe_width = sinfo->get_stripe_width();
-  if (decoded_data.length() % stripe_width != 0) {
-    uint64_t pad = stripe_width - decoded_data.length() % stripe_width;
-    decoded_data.append_zero(pad);
+  if (input_data.length() % stripe_width != 0) {
+    uint64_t pad = stripe_width - input_data.length() % stripe_width;
+    input_data.append_zero(pad);
   }
 
-  std::map<int, ceph::bufferlist> encoded_data;
-  r = ECUtil::encode(*sinfo, ec_impl, decoded_data, want, &encoded_data);
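+  // Spread the padded input across the shard extent map, then compute the
+  // parity chunks over the whole rados-object range.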
+  sinfo->ro_range_to_shard_extent_map(0, input_data.length(), input_data, encoded_data);
+  r = encoded_data.encode(ec_impl, nullptr, encoded_data.get_ro_end());
   if (r < 0) {
     std::cerr << "failed to encode: " << cpp_strerror(r) << std::endl;
     return 1;
   }
 
-  for (auto &[shard, bl] : encoded_data) {
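+  // Write each shard's chunk out to <fname>.<shard>.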
+  for (auto &[shard, _] : encoded_data.get_extent_maps()) {
     std::string name = fname + "." + stringify(shard);
+    bufferlist bl;
+    encoded_data.get_shard_first_buffer(shard, bl);
     r = bl.write_file(name.c_str());
     if (r < 0) {
       std::cerr << "failed to write " << name << ": " << cpp_strerror(r)
@@ -247,40 +245,41 @@ int do_decode(const std::vector<const char*> &args) {
   ceph::ErasureCodeInterfaceRef ec_impl;
   std::unique_ptr<ECUtil::stripe_info_t> sinfo;
   int r = ec_init(args[0], args[1], &ec_impl, &sinfo);
-  if (r < 0) {
+  if (r) {
     return r;
   }
 
-  std::map<int, ceph::bufferlist> encoded_data;
+  ECUtil::shard_extent_map_t encoded_data(sinfo.get());
   std::vector<std::string> shards;
   boost::split(shards, args[2], boost::is_any_of(","));
-  for (auto &shard : shards) {
-    encoded_data[atoi(shard.c_str())] = {};
-  }
-  ceph::bufferlist decoded_data;
   std::string fname = args[3];
 
   std::set<int> want_to_read;
   const auto chunk_mapping = ec_impl->get_chunk_mapping();
-  for (auto &[shard, bl] : encoded_data) {
-    std::string name = fname + "." + stringify(shard);
+  for (auto &shard_str : shards) {
+    std::string name = fname + "." + shard_str;
     std::string error;
+    bufferlist bl;
     r = bl.read_file(name.c_str(), &error);
     if (r < 0) {
       std::cerr << "failed to read " << name << ": " << error << std::endl;
       return 1;
     }
-    auto chunk = static_cast<ssize_t>(chunk_mapping.size()) > shard ?
-      chunk_mapping[shard] : shard_id_t(shard);
-    want_to_read.insert(static_cast<int>(chunk));
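+    // The CLI takes raw (positional) shard ids; map each to its logical shard id.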
+    shard_id_t shard = sinfo->get_shard(raw_shard_id_t(atoi(shard_str.c_str())));
+    encoded_data.insert_in_shard(shard, 0, bl);
   }
 
-  r = ECUtil::decode(*sinfo, ec_impl, want_to_read, encoded_data, &decoded_data);
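+  // Request reconstruction of every shard extent covering the full
+  // rados-object range spanned by the chunks loaded above.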
+  ECUtil::shard_extent_set_t wanted(sinfo->get_k_plus_m());
+  sinfo->ro_range_to_shard_extent_set(encoded_data.get_ro_start(),
+    encoded_data.get_ro_end() - encoded_data.get_ro_start(), wanted);
+
+  r = encoded_data.decode(ec_impl, wanted, encoded_data.get_ro_end());
   if (r < 0) {
     std::cerr << "failed to decode: " << cpp_strerror(r) << std::endl;
     return 1;
   }
 
+  bufferlist decoded_data = encoded_data.get_ro_buffer();
   r = decoded_data.write_file(fname.c_str());
   if (r < 0) {
     std::cerr << "failed to write " << fname << ": " << cpp_strerror(r)