From 9e2841ab167e0a0ac4869c4c28b2223cc29e0dbe Mon Sep 17 00:00:00 2001 From: Alex Ainscow Date: Thu, 3 Apr 2025 14:47:28 +0100 Subject: [PATCH] osd: Introduce optimized EC Signed-off-by: Alex Ainscow --- src/osd/CMakeLists.txt | 2 +- src/osd/ECBackend.cc | 1463 +++++++++-------- src/osd/ECBackend.h | 545 +++--- src/osd/ECCommon.cc | 1268 +++++++------- src/osd/ECCommon.h | 672 ++++---- src/osd/ECExtentCache.cc | 480 ++++++ src/osd/ECExtentCache.h | 379 ++++- src/osd/ECTransaction.cc | 1448 +++++++++------- src/osd/ECTransaction.h | 275 ++-- src/osd/ECUtil.cc | 1219 ++++++++++++-- src/osd/ECUtil.h | 1020 ++++++++++-- src/osd/ExtentCache.cc | 245 --- src/osd/ExtentCache.h | 486 ------ .../TestErasureCodePluginJerasure.cc | 75 + src/test/osd/CMakeLists.txt | 8 + src/test/osd/TestECBackend.cc | 1323 +++++++++++++-- src/test/osd/TestECUtil.cc | 1034 ++++++++++++ src/test/osd/test_ec_transaction.cc | 394 ++++- src/test/osd/test_extent_cache.cc | 893 +++++++--- .../erasure-code/ceph-erasure-code-tool.cc | 51 +- 20 files changed, 9101 insertions(+), 4179 deletions(-) create mode 100644 src/osd/ECExtentCache.cc delete mode 100644 src/osd/ExtentCache.cc delete mode 100644 src/osd/ExtentCache.h create mode 100644 src/test/osd/TestECUtil.cc diff --git a/src/osd/CMakeLists.txt b/src/osd/CMakeLists.txt index e7f579f38410c..d350607353478 100644 --- a/src/osd/CMakeLists.txt +++ b/src/osd/CMakeLists.txt @@ -48,7 +48,7 @@ set(osd_srcs ECUtilL.cc ECCommon.cc ECBackend.cc - ExtentCache.cc + ECExtentCache.cc ECTransaction.cc ECUtil.cc ECInject.cc diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index 63dfc99015d74..88e985c77af6f 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -29,7 +29,6 @@ #include "ECSwitch.h" #include "PrimaryLogPG.h" -#include "osd_tracer.h" #define dout_context cct #define dout_subsys ceph_subsys_osd @@ -56,11 +55,11 @@ using ceph::bufferptr; using ceph::ErasureCodeInterfaceRef; using ceph::Formatter; -static ostream& _prefix(std::ostream *_dout, ECBackend *pgb) { +static ostream &_prefix(std::ostream *_dout, ECBackend *pgb) { return pgb->get_parent()->gen_dbg_prefix(*_dout); } -static ostream& _prefix(std::ostream *_dout, ECBackend::RecoveryBackend *pgb) { +static ostream &_prefix(std::ostream *_dout, ECBackend::RecoveryBackend *pgb) { return pgb->get_parent()->gen_dbg_prefix(*_dout); } @@ -68,50 +67,7 @@ struct ECRecoveryHandle : public PGBackend::RecoveryHandle { list ops; }; -static ostream &operator<<(ostream &lhs, const map &rhs) -{ - lhs << "["; - for (map::const_iterator i = rhs.begin(); - i != rhs.end(); - ++i) { - if (i != rhs.begin()) - lhs << ", "; - lhs << make_pair(i->first, i->second.length()); - } - return lhs << "]"; -} - -static ostream &operator<<(ostream &lhs, const map &rhs) -{ - lhs << "["; - for (map::const_iterator i = rhs.begin(); - i != rhs.end(); - ++i) { - if (i != rhs.begin()) - lhs << ", "; - lhs << make_pair(i->first, i->second.length()); - } - return lhs << "]"; -} - -ostream &operator<<(ostream &lhs, const ECBackend::RecoveryBackend::RecoveryOp &rhs) -{ - return lhs << "RecoveryOp(" - << "hoid=" << rhs.hoid - << " v=" << rhs.v - << " missing_on=" << rhs.missing_on - << " missing_on_shards=" << rhs.missing_on_shards - << " recovery_info=" << rhs.recovery_info - << " recovery_progress=" << rhs.recovery_progress - << " obc refcount=" << rhs.obc.use_count() - << " state=" << ECBackend::RecoveryBackend::RecoveryOp::tostr(rhs.state) - << " waiting_on_pushes=" << rhs.waiting_on_pushes - << " extent_requested=" << rhs.extent_requested - << 
")"; -} - -void ECBackend::RecoveryBackend::RecoveryOp::dump(Formatter *f) const -{ +void ECBackend::RecoveryBackend::RecoveryOp::dump(Formatter *f) const { f->dump_stream("hoid") << hoid; f->dump_stream("v") << v; f->dump_stream("missing_on") << missing_on; @@ -120,7 +76,6 @@ void ECBackend::RecoveryBackend::RecoveryOp::dump(Formatter *f) const f->dump_stream("recovery_progress") << recovery_progress; f->dump_stream("state") << tostr(state); f->dump_stream("waiting_on_pushes") << waiting_on_pushes; - f->dump_stream("extent_requested") << extent_requested; } ECBackend::ECBackend( @@ -129,32 +84,38 @@ ECBackend::ECBackend( ErasureCodeInterfaceRef ec_impl, uint64_t stripe_width, ECSwitch *s, - ECExtentCache::LRU &ignored) + ECExtentCache::LRU &ec_extent_cache_lru) : parent(pg), cct(cct), switcher(s), read_pipeline(cct, ec_impl, this->sinfo, get_parent()->get_eclistener()), - rmw_pipeline(cct, ec_impl, this->sinfo, get_parent()->get_eclistener(), *this), - recovery_backend(cct, switcher->coll, ec_impl, this->sinfo, read_pipeline, unstable_hashinfo_registry, get_parent(), this), + rmw_pipeline(cct, ec_impl, this->sinfo, get_parent()->get_eclistener(), + *this, ec_extent_cache_lru), + recovery_backend(cct, switcher->coll, ec_impl, this->sinfo, read_pipeline, + unstable_hashinfo_registry, get_parent(), this), ec_impl(ec_impl), - sinfo(ec_impl, stripe_width), + sinfo(ec_impl, &(get_parent()->get_pool()), stripe_width), unstable_hashinfo_registry(cct, ec_impl) { + + /* EC makes some assumptions about how the plugin organises the *data* shards: + * - The chunk size is constant for a particular profile. + * - A stripe consists of k chunks. + */ ceph_assert((ec_impl->get_data_chunk_count() * - ec_impl->get_chunk_size(stripe_width)) == stripe_width); + ec_impl->get_chunk_size(stripe_width)) == stripe_width); } -PGBackend::RecoveryHandle *ECBackend::open_recovery_op() -{ +PGBackend::RecoveryHandle *ECBackend::open_recovery_op() { return recovery_backend.open_recovery_op(); } ECBackend::RecoveryBackend::RecoveryBackend( - CephContext* cct, + CephContext *cct, const coll_t &coll, ceph::ErasureCodeInterfaceRef ec_impl, - const ECUtil::stripe_info_t& sinfo, - ReadPipeline& read_pipeline, - UnstableHashInfoRegistry& unstable_hashinfo_registry, - ECListener* parent, - ECBackend* ecbackend) + const ECUtil::stripe_info_t &sinfo, + ReadPipeline &read_pipeline, + UnstableHashInfoRegistry &unstable_hashinfo_registry, + ECListener *parent, + ECBackend *ecbackend) : cct(cct), coll(coll), ec_impl(std::move(ec_impl)), @@ -162,16 +123,14 @@ ECBackend::RecoveryBackend::RecoveryBackend( read_pipeline(read_pipeline), unstable_hashinfo_registry(unstable_hashinfo_registry), parent(parent), - ecbackend(ecbackend) { -} + ecbackend(ecbackend) {} -PGBackend::RecoveryHandle *ECBackend::RecoveryBackend::open_recovery_op() -{ +PGBackend::RecoveryHandle *ECBackend::RecoveryBackend::open_recovery_op() { return new ECRecoveryHandle; } -void ECBackend::RecoveryBackend::_failed_push(const hobject_t &hoid, ECCommon::read_result_t &res) -{ +void ECBackend::RecoveryBackend::_failed_push(const hobject_t &hoid, + ECCommon::read_result_t &res) { dout(10) << __func__ << ": Read error " << hoid << " r=" << res.r << " errors=" << res.errors << dendl; dout(10) << __func__ << ": canceling recovery op for obj " << hoid @@ -181,70 +140,55 @@ void ECBackend::RecoveryBackend::_failed_push(const hobject_t &hoid, ECCommon::r recovery_ops.erase(hoid); set fl; - for (auto&& i : res.errors) { + for (auto &&i: res.errors) { fl.insert(i.first); } 
get_parent()->on_failed_pull(fl, hoid, v); } struct RecoveryMessages { - map recovery_reads; - map> want_to_read; - - void recovery_read( - const hobject_t &hoid, uint64_t off, uint64_t len, - set &&_want_to_read, - const map>> &need, - bool attrs) - { - list to_read; - to_read.emplace_back(ec_align_t{off, len, 0}); + map recovery_reads; + + void recovery_read(const hobject_t &hoid, + const ECCommon::read_request_t &read_request) { ceph_assert(!recovery_reads.count(hoid)); - want_to_read.insert(make_pair(hoid, std::move(_want_to_read))); - recovery_reads.insert( - make_pair( - hoid, - ECCommon::read_request_t( - to_read, - need, - attrs))); - } - - map > pushes; - map > push_replies; + recovery_reads.insert(make_pair(hoid, read_request)); + } + + map> pushes; + map> push_replies; ObjectStore::Transaction t; }; void ECBackend::handle_recovery_push( const PushOp &op, RecoveryMessages *m, - bool is_repair) -{ + bool is_repair) { if (get_parent()->pg_is_remote_backfilling()) { get_parent()->pg_add_local_num_bytes(op.data.length()); - get_parent()->pg_add_num_bytes(op.data.length() * get_ec_data_chunk_count()); + get_parent()->pg_add_num_bytes(op.data.length() * sinfo.get_k()); dout(10) << __func__ << " " << op.soid << " add new actual data by " << op.data.length() - << " add new num_bytes by " << op.data.length() * get_ec_data_chunk_count() + << " add new num_bytes by " << op.data.length() * sinfo.get_k() << dendl; } recovery_backend.handle_recovery_push(op, m, is_repair); if (op.after_progress.data_complete && - !(get_parent()->pgb_is_primary()) && - get_parent()->pg_is_remote_backfilling()) { + !(get_parent()->pgb_is_primary()) && + get_parent()->pg_is_remote_backfilling()) { struct stat st; - int r = switcher->store->stat(switcher->ch, ghobject_t(op.soid, ghobject_t::NO_GEN, - get_parent()->whoami_shard().shard), &st); + int r = switcher->store->stat(switcher->ch, ghobject_t( + op.soid, ghobject_t::NO_GEN, + get_parent()->whoami_shard().shard), &st); if (r == 0) { get_parent()->pg_sub_local_num_bytes(st.st_size); // XXX: This can be way overestimated for small objects - get_parent()->pg_sub_num_bytes(st.st_size * get_ec_data_chunk_count()); + get_parent()->pg_sub_num_bytes(st.st_size * sinfo.get_k()); dout(10) << __func__ << " " << op.soid << " sub actual data by " << st.st_size - << " sub num_bytes by " << st.st_size * get_ec_data_chunk_count() + << " sub num_bytes by " << st.st_size * sinfo.get_k() << dendl; } } @@ -253,10 +197,10 @@ void ECBackend::handle_recovery_push( void ECBackend::RecoveryBackend::handle_recovery_push( const PushOp &op, RecoveryMessages *m, - bool is_repair) -{ + bool is_repair) { if (get_parent()->check_failsafe_full()) { - dout(10) << __func__ << " Out of space (failsafe) processing push request." << dendl; + dout(10) << __func__ << " Out of space (failsafe) processing push request." 
+ << dendl; ceph_abort(); } @@ -264,12 +208,12 @@ void ECBackend::RecoveryBackend::handle_recovery_push( ghobject_t tobj; if (oneshot) { tobj = ghobject_t(op.soid, ghobject_t::NO_GEN, - get_parent()->whoami_shard().shard); + get_parent()->whoami_shard().shard); } else { tobj = ghobject_t(get_parent()->get_temp_recovery_object(op.soid, - op.version), - ghobject_t::NO_GEN, - get_parent()->whoami_shard().shard); + op.version), + ghobject_t::NO_GEN, + get_parent()->whoami_shard().shard); if (op.before_progress.first) { dout(10) << __func__ << ": Adding oid " << tobj.hobj << " in the temp collection" << dendl; @@ -310,11 +254,12 @@ void ECBackend::RecoveryBackend::handle_recovery_push( << tobj.hobj << " from the temp collection" << dendl; clear_temp_obj(tobj.hobj); m->t.remove(coll, ghobject_t( - op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); + op.soid, ghobject_t::NO_GEN, + get_parent()->whoami_shard().shard)); m->t.collection_move_rename( coll, tobj, coll, ghobject_t( - op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); + op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); } if (op.after_progress.data_complete) { if ((get_parent()->pgb_is_primary())) { @@ -323,21 +268,21 @@ void ECBackend::RecoveryBackend::handle_recovery_push( if (get_parent()->pg_is_repair() || is_repair) get_parent()->inc_osd_stat_repaired(); get_parent()->on_local_recover( - op.soid, - op.recovery_info, - recovery_ops[op.soid].obc, - false, - &m->t); + op.soid, + op.recovery_info, + recovery_ops[op.soid].obc, + false, + &m->t); } else { // If primary told us this is a repair, bump osd_stat_t::num_objects_repaired if (is_repair) get_parent()->inc_osd_stat_repaired(); get_parent()->on_local_recover( - op.soid, - op.recovery_info, - ObjectContextRef(), - false, - &m->t); + op.soid, + op.recovery_info, + ObjectContextRef(), + false, + &m->t); } } m->push_replies[get_parent()->primary_shard()].push_back(PushReplyOp()); @@ -347,8 +292,7 @@ void ECBackend::RecoveryBackend::handle_recovery_push( void ECBackend::RecoveryBackend::handle_recovery_push_reply( const PushReplyOp &op, pg_shard_t from, - RecoveryMessages *m) -{ + RecoveryMessages *m) { if (!recovery_ops.count(op.soid)) return; RecoveryOp &rop = recovery_ops[op.soid]; @@ -359,35 +303,14 @@ void ECBackend::RecoveryBackend::handle_recovery_push_reply( void ECBackend::RecoveryBackend::handle_recovery_read_complete( const hobject_t &hoid, - boost::tuple > &to_read, - std::optional> > attrs, - RecoveryMessages *m) -{ - dout(10) << __func__ << ": returned " << hoid << " " - << "(" << to_read.get<0>() - << ", " << to_read.get<1>() - << ", " << to_read.get<2>() - << ")" - << dendl; - ceph_assert(recovery_ops.count(hoid)); + ECUtil::shard_extent_map_t &&buffers_read, + std::optional>> attrs, + const ECUtil::shard_extent_set_t &want_to_read, + RecoveryMessages *m) { + dout(10) << __func__ << ": returned " << hoid << " " << buffers_read << dendl; + ceph_assert(recovery_ops.contains(hoid)); RecoveryBackend::RecoveryOp &op = recovery_ops[hoid]; - ceph_assert(op.returned_data.empty()); - map target; - for (set::iterator i = op.missing_on_shards.begin(); - i != op.missing_on_shards.end(); - ++i) { - target[static_cast(*i)] = &(op.returned_data[static_cast(*i)]); - } - map from; - for(map::iterator i = to_read.get<2>().begin(); - i != to_read.get<2>().end(); - ++i) { - from[static_cast(i->first.shard)] = std::move(i->second); - } - dout(10) << __func__ << ": " << from << dendl; - int r; - r = ECUtil::decode(sinfo, ec_impl, from, target); 
- ceph_assert(r == 0); + if (attrs) { op.xattrs.swap(*attrs); @@ -413,99 +336,145 @@ void ECBackend::RecoveryBackend::handle_recovery_read_complete( op.recovery_info.oi = op.obc->obs.oi; } - ECUtil::HashInfo hinfo(ec_impl->get_chunk_count()); - if (op.obc->obs.oi.size > 0) { - ceph_assert(op.xattrs.count(ECUtil::get_hinfo_key())); - auto bp = op.xattrs[ECUtil::get_hinfo_key()].cbegin(); - decode(hinfo, bp); + if (sinfo.require_hinfo()) { + ECUtil::HashInfo hinfo(sinfo.get_k_plus_m()); + if (op.obc->obs.oi.size > 0) { + ceph_assert(op.xattrs.count(ECUtil::get_hinfo_key())); + auto bp = op.xattrs[ECUtil::get_hinfo_key()].cbegin(); + decode(hinfo, bp); + } + op.hinfo = unstable_hashinfo_registry.maybe_put_hash_info( + hoid, std::move(hinfo)); } - op.hinfo = unstable_hashinfo_registry.maybe_put_hash_info(hoid, std::move(hinfo)); } ceph_assert(op.xattrs.size()); ceph_assert(op.obc); + + op.returned_data.emplace(std::move(buffers_read)); + + ECUtil::shard_extent_set_t read_mask(sinfo.get_k_plus_m()); + sinfo.ro_size_to_read_mask(op.recovery_info.size, read_mask); + ECUtil::shard_extent_set_t shard_want_to_read(sinfo.get_k_plus_m()); + + for (auto &[shard, eset] : want_to_read) { + /* Read buffers do not need recovering! */ + if (buffers_read.contains(shard)) { + continue; + } + + /* Read-buffers will be truncated to the end-of-object. Do not attempt + * to recover off-the-end. + */ + shard_want_to_read[shard].intersection_of(read_mask.get(shard),eset); + + /* Some shards may be empty */ + if (shard_want_to_read[shard].empty()) { + shard_want_to_read.erase(shard); + } + } + + uint64_t aligned_size = ECUtil::align_page_next(op.obc->obs.oi.size); + + int r = op.returned_data->decode(ec_impl, shard_want_to_read, aligned_size); + ceph_assert(r == 0); + // We are never appending here, so we never need hinfo. + op.returned_data->insert_parity_buffers(); + r = op.returned_data->encode(ec_impl, NULL, 0); + ceph_assert(r==0); + + // Finally, we don't want to write any padding, so truncate the buffer + // to remove it. 
+ op.returned_data->erase_after_ro_offset(aligned_size); + + for (auto &&shard: op.missing_on_shards) { + if (read_mask.contains(shard) && op.returned_data->contains_shard(shard)) { + ceph_assert(read_mask.at(shard).range_end() >= + op.returned_data->get_extent_map(shard).get_end_off()); + } + } + + dout(20) << __func__ << ": oid=" << op.hoid << " " + << op.returned_data->debug_string(2048, 8) << dendl; + continue_recovery_op(op, m); } struct SendPushReplies : public Context { PGBackend::Listener *l; epoch_t epoch; - map replies; + std::map replies; + SendPushReplies( PGBackend::Listener *l, epoch_t epoch, - map &in) : l(l), epoch(epoch) { + std::map &in) : l(l), epoch(epoch) { replies.swap(in); } + void finish(int) override { std::vector> messages; messages.reserve(replies.size()); - for (map::iterator i = replies.begin(); - i != replies.end(); - ++i) { - messages.push_back(std::make_pair(i->first, i->second)); + for (auto & reply : replies) { + messages.push_back(reply); } if (!messages.empty()) { l->send_message_osd_cluster(messages, epoch); } replies.clear(); } + ~SendPushReplies() override { - for (map::iterator i = replies.begin(); - i != replies.end(); - ++i) { - i->second->put(); + for (auto & [_, reply] : replies) { + reply->put(); } replies.clear(); } }; struct RecoveryReadCompleter : ECCommon::ReadCompleter { - RecoveryReadCompleter(ECBackend::RecoveryBackend& backend) + RecoveryReadCompleter(ECBackend::RecoveryBackend &backend) : backend(backend) {} void finish_single_request( - const hobject_t &hoid, - ECCommon::read_result_t &res, - list, - set wanted_to_read) override - { + const hobject_t &hoid, + ECCommon::read_result_t &&res, + ECCommon::read_request_t &req) override { if (!(res.r == 0 && res.errors.empty())) { backend._failed_push(hoid, res); return; } - ceph_assert(res.returned.size() == 1); + ceph_assert(req.to_read.size() == 0); backend.handle_recovery_read_complete( hoid, - res.returned.back(), + std::move(res.buffers_read), res.attrs, + req.shard_want_to_read, &rm); } - void finish(int priority) && override - { + void finish(int priority) && override { backend.dispatch_recovery_messages(rm, priority); } - ECBackend::RecoveryBackend& backend; + ECBackend::RecoveryBackend &backend; RecoveryMessages rm; }; void ECBackend::ECRecoveryBackend::commit_txn_send_replies( - ceph::os::Transaction&& txn, - std::map replies) -{ + ceph::os::Transaction &&txn, + std::map replies) { txn.register_on_complete( - get_parent()->bless_context( - new SendPushReplies( - get_parent(), - get_osdmap_epoch(), - replies))); + get_parent()->bless_context( + new SendPushReplies( + get_parent(), + get_osdmap_epoch(), + replies))); get_parent()->queue_transaction(std::move(txn)); } -void ECBackend::RecoveryBackend::dispatch_recovery_messages(RecoveryMessages &m, int priority) -{ - for (map >::iterator i = m.pushes.begin(); +void ECBackend::RecoveryBackend::dispatch_recovery_messages( + RecoveryMessages &m, int priority) { + for (map>::iterator i = m.pushes.begin(); i != m.pushes.end(); m.pushes.erase(i++)) { MOSDPGPush *msg = new MOSDPGPush(); @@ -517,14 +486,14 @@ void ECBackend::RecoveryBackend::dispatch_recovery_messages(RecoveryMessages &m, msg->pushes.swap(i->second); msg->compute_cost(cct); msg->is_repair = get_parent()->pg_is_repair(); - std::vector wrapped_msg { + std::vector wrapped_msg{ std::make_pair(i->first.osd, static_cast(msg)) }; get_parent()->send_message_osd_cluster(wrapped_msg, msg->map_epoch); } - map replies; - for (map >::iterator i = - m.push_replies.begin(); + std::map 
replies; + for (map>::iterator i = + m.push_replies.begin(); i != m.push_replies.end(); m.push_replies.erase(i++)) { MOSDPGPushReply *msg = new MOSDPGPushReply(); @@ -535,7 +504,7 @@ void ECBackend::RecoveryBackend::dispatch_recovery_messages(RecoveryMessages &m, msg->pgid = spg_t(get_parent()->get_info().pgid.pgid, i->first.shard); msg->replies.swap(i->second); msg->compute_cost(cct); - replies.insert(make_pair(i->first.osd, msg)); + replies.insert(std::pair(i->first.osd, msg)); } #if 1 @@ -548,9 +517,7 @@ void ECBackend::RecoveryBackend::dispatch_recovery_messages(RecoveryMessages &m, return; read_pipeline.start_read_op( priority, - m.want_to_read, m.recovery_reads, - OpRequestRef(), false, true, std::make_unique(*this)); @@ -558,120 +525,128 @@ void ECBackend::RecoveryBackend::dispatch_recovery_messages(RecoveryMessages &m, void ECBackend::RecoveryBackend::continue_recovery_op( RecoveryBackend::RecoveryOp &op, - RecoveryMessages *m) -{ + RecoveryMessages *m) { dout(10) << __func__ << ": continuing " << op << dendl; using RecoveryOp = RecoveryBackend::RecoveryOp; while (1) { switch (op.state) { case RecoveryOp::IDLE: { - // start read - op.state = RecoveryOp::READING; ceph_assert(!op.recovery_progress.data_complete); - set want(op.missing_on_shards.begin(), op.missing_on_shards.end()); - uint64_t from = op.recovery_progress.data_recovered_to; - uint64_t amount = get_recovery_chunk_size(); + ECUtil::shard_extent_set_t want(sinfo.get_k_plus_m()); + + op.state = RecoveryOp::READING; + + // We always read the recovery chunk size (default 8MiB + parity). If that + // amount of data is not available, then the backend will truncate the + // response. + sinfo.ro_range_to_shard_extent_set_with_parity( + op.recovery_progress.data_recovered_to, + get_recovery_chunk_size(), want); if (op.recovery_progress.first && op.obc) { - if (auto [r, attrs, size] = ecbackend->get_attrs_n_size_from_disk(op.hoid); - r >= 0 || r == -ENOENT) { - op.hinfo = unstable_hashinfo_registry.get_hash_info(op.hoid, false, attrs, size); - } else { - derr << __func__ << ": can't stat-or-getattr on " << op.hoid << dendl; - } - if (!op.hinfo) { - derr << __func__ << ": " << op.hoid << " has inconsistent hinfo" + op.xattrs = op.obc->attr_cache; + if (sinfo.require_hinfo()) { + if (auto [r, attrs, size] = ecbackend->get_attrs_n_size_from_disk( + op.hoid); + r >= 0 || r == -ENOENT) { + op.hinfo = unstable_hashinfo_registry.get_hash_info( + op.hoid, false, attrs, size); + } else { + derr << __func__ << ": can't stat-or-getattr on " << op.hoid << + dendl; + } + if (!op.hinfo) { + derr << __func__ << ": " << op.hoid << " has inconsistent hinfo" << dendl; - ceph_assert(recovery_ops.count(op.hoid)); - eversion_t v = recovery_ops[op.hoid].v; - recovery_ops.erase(op.hoid); - // TODO: not in crimson yet - get_parent()->on_failed_pull({get_parent()->whoami_shard()}, - op.hoid, v); - return; + ceph_assert(recovery_ops.count(op.hoid)); + eversion_t v = recovery_ops[op.hoid].v; + recovery_ops.erase(op.hoid); + // TODO: not in crimson yet + get_parent()->on_failed_pull({get_parent()->whoami_shard()}, + op.hoid, v); + return; + } + encode(*(op.hinfo), op.xattrs[ECUtil::get_hinfo_key()]); } - op.xattrs = op.obc->attr_cache; - encode(*(op.hinfo), op.xattrs[ECUtil::get_hinfo_key()]); } - map>> to_read; + read_request_t read_request(std::move(want), + op.recovery_progress.first && !op.obc, + op.obc + ? 
op.obc->obs.oi.size + : get_recovery_chunk_size()); + int r = read_pipeline.get_min_avail_to_read_shards( - op.hoid, want, true, false, &to_read); + op.hoid, true, false, read_request); + if (r != 0) { - // we must have lost a recovery source - ceph_assert(!op.recovery_progress.first); - dout(10) << __func__ << ": canceling recovery op for obj " << op.hoid - << dendl; - // in crimson - get_parent()->cancel_pull(op.hoid); - recovery_ops.erase(op.hoid); - return; + // we must have lost a recovery source + ceph_assert(!op.recovery_progress.first); + dout(10) << __func__ << ": canceling recovery op for obj " << op.hoid + << dendl; + // in crimson + get_parent()->cancel_pull(op.hoid); + recovery_ops.erase(op.hoid); + return; + } + if (read_request.shard_reads.empty()) { + ceph_assert(op.obc); + ceph_assert(0 == op.obc->obs.oi.size); + dout(10) << __func__ << "Zero size object recovery, skipping reads." + << op << dendl; + // Create an empty read result and fall through. + op.returned_data.emplace(&sinfo); + } else { + m->recovery_read( + op.hoid, + read_request); + dout(10) << __func__ << ": IDLE return " << op << dendl; + return; } - m->recovery_read( - op.hoid, - op.recovery_progress.data_recovered_to, - amount, - std::move(want), - to_read, - op.recovery_progress.first && !op.obc); - op.extent_requested = make_pair( - from, - amount); - dout(10) << __func__ << ": IDLE return " << op << dendl; - return; } + [[fallthrough]]; case RecoveryOp::READING: { // read completed, start write ceph_assert(op.xattrs.size()); - ceph_assert(op.returned_data.size()); + ceph_assert(op.returned_data); + dout(20) << __func__ << ": returned_data=" << op.returned_data << dendl; op.state = RecoveryOp::WRITING; ObjectRecoveryProgress after_progress = op.recovery_progress; - after_progress.data_recovered_to += op.extent_requested.second; + after_progress.data_recovered_to = op.returned_data->get_ro_end(); after_progress.first = false; if (after_progress.data_recovered_to >= op.obc->obs.oi.size) { - after_progress.data_recovered_to = - sinfo.logical_to_next_stripe_offset( - op.obc->obs.oi.size); - after_progress.data_complete = true; + after_progress.data_complete = true; } - for (set::iterator mi = op.missing_on.begin(); - mi != op.missing_on.end(); - ++mi) { - ceph_assert(op.returned_data.count(static_cast(mi->shard))); - m->pushes[*mi].push_back(PushOp()); - PushOp &pop = m->pushes[*mi].back(); - pop.soid = op.hoid; - pop.version = op.v; - pop.data = op.returned_data[static_cast(mi->shard)]; - dout(10) << __func__ << ": before_progress=" << op.recovery_progress + for (auto &&pg_shard: op.missing_on) { + m->pushes[pg_shard].push_back(PushOp()); + PushOp &pop = m->pushes[pg_shard].back(); + pop.soid = op.hoid; + pop.version = op.v; + op.returned_data->get_shard_first_buffer(pg_shard.shard, pop.data); + dout(10) << __func__ << ": pop shard=" << pg_shard + << ", oid=" << pop.soid + << ", before_progress=" << op.recovery_progress << ", after_progress=" << after_progress << ", pop.data.length()=" << pop.data.length() << ", size=" << op.obc->obs.oi.size << dendl; - ceph_assert( - pop.data.length() == - sinfo.aligned_logical_offset_to_chunk_offset( - after_progress.data_recovered_to - - op.recovery_progress.data_recovered_to) - ); - if (pop.data.length()) - pop.data_included.insert( - sinfo.aligned_logical_offset_to_chunk_offset( - op.recovery_progress.data_recovered_to), - pop.data.length() - ); - if (op.recovery_progress.first) { - pop.attrset = op.xattrs; - } - pop.recovery_info = op.recovery_info; - 
pop.before_progress = op.recovery_progress; - pop.after_progress = after_progress; - if (*mi != get_parent()->primary_shard()) - // already in crimson -- junction point with PeeringState - get_parent()->begin_peer_recover( - *mi, - op.hoid); + if (pop.data.length()) + pop.data_included.union_insert( + op.returned_data->get_shard_first_offset(pg_shard.shard), + pop.data.length()); + if (op.recovery_progress.first) { + pop.attrset = op.xattrs; + } + pop.recovery_info = op.recovery_info; + pop.before_progress = op.recovery_progress; + pop.after_progress = after_progress; + if (pg_shard != get_parent()->primary_shard()) { + // already in crimson -- junction point with PeeringState + get_parent()->begin_peer_recover( + pg_shard, + op.hoid); + } } - op.returned_data.clear(); + op.returned_data.reset(); op.waiting_on_pushes = op.missing_on; op.recovery_progress = after_progress; dout(10) << __func__ << ": READING return " << op << dendl; @@ -679,37 +654,37 @@ void ECBackend::RecoveryBackend::continue_recovery_op( } case RecoveryOp::WRITING: { if (op.waiting_on_pushes.empty()) { - if (op.recovery_progress.data_complete) { - op.state = RecoveryOp::COMPLETE; - for (set::iterator i = op.missing_on.begin(); - i != op.missing_on.end(); - ++i) { - if (*i != get_parent()->primary_shard()) { - dout(10) << __func__ << ": on_peer_recover on " << *i + if (op.recovery_progress.data_complete) { + op.state = RecoveryOp::COMPLETE; + for (set::iterator i = op.missing_on.begin(); + i != op.missing_on.end(); + ++i) { + if (*i != get_parent()->primary_shard()) { + dout(10) << __func__ << ": on_peer_recover on " << *i << ", obj " << op.hoid << dendl; - get_parent()->on_peer_recover( - *i, - op.hoid, - op.recovery_info); - } - } - object_stat_sum_t stat; - stat.num_bytes_recovered = op.recovery_info.size; - stat.num_keys_recovered = 0; // ??? op ... omap_entries.size(); ? - stat.num_objects_recovered = 1; - // TODO: not in crimson yet - if (get_parent()->pg_is_repair()) - stat.num_objects_repaired = 1; - // pg_recovery.cc in crimson has it - get_parent()->on_global_recover(op.hoid, stat, false); - dout(10) << __func__ << ": WRITING return " << op << dendl; - recovery_ops.erase(op.hoid); - return; - } else { - op.state = RecoveryOp::IDLE; - dout(10) << __func__ << ": WRITING continue " << op << dendl; - continue; - } + get_parent()->on_peer_recover( + *i, + op.hoid, + op.recovery_info); + } + } + object_stat_sum_t stat; + stat.num_bytes_recovered = op.recovery_info.size; + stat.num_keys_recovered = 0; // ??? op ... omap_entries.size(); ? 
+ stat.num_objects_recovered = 1; + // TODO: not in crimson yet + if (get_parent()->pg_is_repair()) + stat.num_objects_repaired = 1; + // pg_recovery.cc in crimson has it + get_parent()->on_global_recover(op.hoid, stat, false); + dout(10) << __func__ << ": WRITING return " << op << dendl; + recovery_ops.erase(op.hoid); + return; + } else { + op.state = RecoveryOp::IDLE; + dout(10) << __func__ << ": WRITING continue " << op << dendl; + continue; + } } return; } @@ -724,8 +699,7 @@ void ECBackend::RecoveryBackend::continue_recovery_op( void ECBackend::run_recovery_op( PGBackend::RecoveryHandle *_h, - int priority) -{ + int priority) { ceph_assert(_h); ECRecoveryHandle &h = static_cast(*_h); recovery_backend.run_recovery_op(h, priority); @@ -735,8 +709,7 @@ void ECBackend::run_recovery_op( void ECBackend::RecoveryBackend::run_recovery_op( ECRecoveryHandle &h, - int priority) -{ + int priority) { RecoveryMessages m; for (list::iterator i = h.ops.begin(); i != h.ops.end(); @@ -754,8 +727,7 @@ int ECBackend::recover_object( eversion_t v, ObjectContextRef head, ObjectContextRef obc, - PGBackend::RecoveryHandle *_h) -{ + PGBackend::RecoveryHandle *_h) { return recovery_backend.recover_object(hoid, v, head, obc, _h); } @@ -764,8 +736,7 @@ int ECBackend::RecoveryBackend::recover_object( eversion_t v, ObjectContextRef head, ObjectContextRef obc, - PGBackend::RecoveryHandle *_h) -{ + PGBackend::RecoveryHandle *_h) { ECRecoveryHandle *h = static_cast(_h); h->ops.push_back(RecoveryOp()); h->ops.back().v = v; @@ -790,7 +761,7 @@ int ECBackend::RecoveryBackend::recover_object( } h->ops.back().recovery_progress.omap_complete = true; for (set::const_iterator i = - get_parent()->get_acting_recovery_backfill_shards().begin(); + get_parent()->get_acting_recovery_backfill_shards().begin(); i != get_parent()->get_acting_recovery_backfill_shards().end(); ++i) { dout(10) << "checking " << *i << dendl; @@ -804,14 +775,12 @@ int ECBackend::RecoveryBackend::recover_object( } bool ECBackend::can_handle_while_inactive( - OpRequestRef _op) -{ + OpRequestRef _op) { return false; } bool ECBackend::_handle_message( - OpRequestRef _op) -{ + OpRequestRef _op) { dout(10) << __func__ << ": " << *_op->get_req() << dendl; int priority = _op->get_req()->get_priority(); switch (_op->get_req()->get_type()) { @@ -822,7 +791,8 @@ bool ECBackend::_handle_message( MOSDECSubOpWrite *op = static_cast( _op->get_nonconst_req()); parent->maybe_preempt_replica_scrub(op->op.soid); - handle_sub_write(op->op.from, _op, op->op, _op->pg_trace, *get_parent()->get_eclistener()); + handle_sub_write(op->op.from, _op, op->op, _op->pg_trace, + *get_parent()->get_eclistener()); return true; } case MSG_OSD_EC_WRITE_REPLY: { @@ -857,20 +827,20 @@ bool ECBackend::_handle_message( auto op = _op->get_req(); RecoveryMessages rm; for (vector::const_iterator i = op->pushes.begin(); - i != op->pushes.end(); - ++i) { + i != op->pushes.end(); + ++i) { handle_recovery_push(*i, &rm, op->is_repair); } recovery_backend.dispatch_recovery_messages(rm, priority); return true; } case MSG_OSD_PG_PUSH_REPLY: { - const MOSDPGPushReply *op = static_cast( + const MOSDPGPushReply *op = static_cast( _op->get_req()); RecoveryMessages rm; for (vector::const_iterator i = op->replies.begin(); - i != op->replies.end(); - ++i) { + i != op->replies.end(); + ++i) { recovery_backend.handle_recovery_push_reply(*i, op->from, &rm); } recovery_backend.dispatch_recovery_messages(rm, priority); @@ -889,6 +859,7 @@ struct SubWriteCommitted : public Context { eversion_t version; eversion_t 
last_complete; const ZTracer::Trace trace; + SubWriteCommitted( ECBackend *pg, OpRequestRef msg, @@ -898,12 +869,14 @@ struct SubWriteCommitted : public Context { const ZTracer::Trace &trace) : pg(pg), msg(msg), tid(tid), version(version), last_complete(last_complete), trace(trace) {} + void finish(int) override { if (msg) msg->mark_event("sub_op_committed"); pg->sub_write_committed(tid, version, last_complete, trace); } }; + void ECBackend::sub_write_committed( ceph_tid_t tid, eversion_t version, eversion_t last_complete, const ZTracer::Trace &trace) { @@ -941,15 +914,14 @@ void ECBackend::handle_sub_write( OpRequestRef msg, ECSubWrite &op, const ZTracer::Trace &trace, - ECListener&) -{ + ECListener &) { if (msg) { msg->mark_event("sub_op_started"); } trace.event("handle_sub_write"); if (cct->_conf->bluestore_debug_inject_read_err && - ECInject::test_write_error3(op.soid)) { + ECInject::test_write_error3(op.soid)) { ceph_abort_msg("Error inject - OSD down"); } if (!get_parent()->pgb_is_primary()) @@ -960,26 +932,28 @@ void ECBackend::handle_sub_write( } if (op.backfill_or_async_recovery) { for (set::iterator i = op.temp_removed.begin(); - i != op.temp_removed.end(); - ++i) { + i != op.temp_removed.end(); + ++i) { dout(10) << __func__ << ": removing object " << *i << " since we won't get the transaction" << dendl; localt.remove( - switcher->coll, - ghobject_t( - *i, - ghobject_t::NO_GEN, - get_parent()->whoami_shard().shard)); + switcher->coll, + ghobject_t( + *i, + ghobject_t::NO_GEN, + get_parent()->whoami_shard().shard)); } } switcher->clear_temp_objs(op.temp_removed); - dout(30) << __func__ << " missing before " << get_parent()->get_log().get_missing().get_items() << dendl; + dout(30) << __func__ << " missing before " << + get_parent()->get_log().get_missing().get_items() << dendl; // flag set to true during async recovery bool async = false; pg_missing_tracker_t pmissing = get_parent()->get_local_missing(); if (pmissing.is_missing(op.soid)) { async = true; - dout(30) << __func__ << " is_missing " << pmissing.is_missing(op.soid) << dendl; + dout(30) << __func__ << " is_missing " << + pmissing.is_missing(op.soid) << dendl; for (auto &&e: op.log_entries) { dout(30) << " add_next_event entry " << e << dendl; get_parent()->add_local_next_event(e); @@ -997,21 +971,24 @@ void ECBackend::handle_sub_write( async); if (!get_parent()->pg_is_undersized() && - (unsigned)get_parent()->whoami_shard().shard >= sinfo.get_k()) + get_parent()->whoami_shard().shard >= sinfo.get_k()) op.t.set_fadvise_flag(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED); localt.register_on_commit( get_parent()->bless_context( new SubWriteCommitted( - this, msg, op.tid, - op.at_version, - get_parent()->get_info().last_complete, trace))); + this, msg, op.tid, + op.at_version, + get_parent()->get_info().last_complete, trace))); vector tls; tls.reserve(2); tls.push_back(std::move(op.t)); tls.push_back(std::move(localt)); get_parent()->queue_transactions(tls, msg); - dout(30) << __func__ << " missing after" << get_parent()->get_log().get_missing().get_items() << dendl; + dout(30) << __func__ << " missing after" << get_parent()->get_log(). + get_missing(). 
+ get_items() << dendl + ; if (op.at_version != eversion_t()) { // dummy rollforward transaction doesn't get at_version (and doesn't advance it) get_parent()->op_applied(op.at_version); @@ -1022,26 +999,21 @@ void ECBackend::handle_sub_read( pg_shard_t from, const ECSubRead &op, ECSubReadReply *reply, - const ZTracer::Trace &trace) -{ + const ZTracer::Trace &trace) { trace.event("handle sub read"); shard_id_t shard = get_parent()->whoami_shard().shard; - for(auto i = op.to_read.begin(); - i != op.to_read.end(); - ++i) { + for (auto &&[hoid, to_read]: op.to_read) { int r = 0; - for (auto j = i->second.begin(); j != i->second.end(); ++j) { + for (auto &&[offset, len, flags]: to_read) { bufferlist bl; - if ((op.subchunks.find(i->first)->second.size() == 1) && - (op.subchunks.find(i->first)->second.front().second == - ec_impl->get_sub_chunk_count())) { + auto &subchunks = op.subchunks.at(hoid); + if ((subchunks.size() == 1) && + (subchunks.front().second == ec_impl->get_sub_chunk_count())) { dout(20) << __func__ << " case1: reading the complete chunk/shard." << dendl; r = switcher->store->read( - switcher->ch, - ghobject_t(i->first, ghobject_t::NO_GEN, shard), - j->get<0>(), - j->get<1>(), - bl, j->get<2>()); // Allow EIO return + switcher->ch, + ghobject_t(hoid, ghobject_t::NO_GEN, shard), + offset, len, bl, flags); // Allow EIO return } else { int subchunk_size = sinfo.get_chunk_size() / ec_impl->get_sub_chunk_count(); @@ -1049,16 +1021,16 @@ void ECBackend::handle_sub_read( << " subchunk_size=" << subchunk_size << " chunk_size=" << sinfo.get_chunk_size() << dendl; bool error = false; - for (int m = 0; m < (int)j->get<1>() && !error; + for (int m = 0; m < (int)len && !error; m += sinfo.get_chunk_size()) { - for (auto &&k:op.subchunks.find(i->first)->second) { + for (auto &&k: subchunks) { bufferlist bl0; r = switcher->store->read( - switcher->ch, - ghobject_t(i->first, ghobject_t::NO_GEN, shard), - j->get<0>() + m + (k.first)*subchunk_size, - (k.second)*subchunk_size, - bl0, j->get<2>()); + switcher->ch, + ghobject_t(hoid, ghobject_t::NO_GEN, shard), + offset + m + (k.first) * subchunk_size, + (k.second) * subchunk_size, + bl0, flags); if (r < 0) { error = true; break; @@ -1069,92 +1041,93 @@ void ECBackend::handle_sub_read( } if (r < 0) { - // if we are doing fast reads, it's possible for one of the shard - // reads to cross paths with another update and get a (harmless) - // ENOENT. Suppress the message to the cluster log in that case. - if (r == -ENOENT && get_parent()->get_pool().fast_read) { - dout(5) << __func__ << ": Error " << r - << " reading " << i->first << ", fast read, probably ok" + // if we are doing fast reads, it's possible for one of the shard + // reads to cross paths with another update and get a (harmless) + // ENOENT. Suppress the message to the cluster log in that case. 
+ if (r == -ENOENT && get_parent()->get_pool().fast_read) { + dout(5) << __func__ << ": Error " << r + << " reading " << hoid << ", fast read, probably ok" << dendl; - } else { - get_parent()->clog_error() << "Error " << r - << " reading object " - << i->first; - dout(5) << __func__ << ": Error " << r - << " reading " << i->first << dendl; - } - goto error; + } else { + get_parent()->clog_error() << "Error " << r + << " reading object " + << hoid; + dout(5) << __func__ << ": Error " << r + << " reading " << hoid << dendl; + } + goto error; } else { - dout(20) << __func__ << " read request=" << j->get<1>() << " r=" << r << " len=" << bl.length() << dendl; - reply->buffers_read[i->first].push_back( - make_pair( - j->get<0>(), - bl) - ); + dout(20) << __func__ << " read request=" << len << " r=" << r << " len=" + << bl.length() << dendl; + reply->buffers_read[hoid].push_back(make_pair(offset, bl)); } - if (!get_parent()->get_pool().allows_ecoverwrites()) { - // This shows that we still need deep scrub because large enough files - // are read in sections, so the digest check here won't be done here. - // Do NOT check osd_read_eio_on_bad_digest here. We need to report - // the state of our chunk in case other chunks could substitute. + if (!sinfo.supports_ec_overwrites()) { + // This shows that we still need deep scrub because large enough files + // are read in sections, so the digest check here won't be done here. + // Do NOT check osd_read_eio_on_bad_digest here. We need to report + // the state of our chunk in case other chunks could substitute. ECUtil::HashInfoRef hinfo; map> attrs; - struct stat st; - int r = object_stat(i->first, &st); + struct stat st; + int r = object_stat(hoid, &st); + if (r >= 0) { + dout(10) << __func__ << ": found on disk, size " << st.st_size << dendl; + r = switcher->objects_get_attrs_with_hinfo(hoid, &attrs); + } if (r >= 0) { - dout(10) << __func__ << ": found on disk, size " << st.st_size << dendl; - r = switcher->objects_get_attrs_with_hinfo(i->first, &attrs); - } - if (r >= 0) { - hinfo = unstable_hashinfo_registry.get_hash_info(i->first, false, attrs, st.st_size); - } else { - derr << __func__ << ": access (attrs) on " << i->first << " failed: " + hinfo = unstable_hashinfo_registry.get_hash_info( + hoid, false, attrs, st.st_size); + } else { + derr << __func__ << ": access (attrs) on " << hoid << " failed: " << cpp_strerror(r) << dendl; - } + } if (!hinfo) { r = -EIO; get_parent()->clog_error() << "Corruption detected: object " - << i->first - << " is missing hash_info"; - dout(5) << __func__ << ": No hinfo for " << i->first << dendl; + << hoid + << " is missing hash_info"; + dout(5) << __func__ << ": No hinfo for " << hoid << dendl; goto error; } - ceph_assert(hinfo->has_chunk_hash()); - if ((bl.length() == hinfo->get_total_chunk_size()) && - (j->get<0>() == 0)) { - dout(20) << __func__ << ": Checking hash of " << i->first << dendl; - bufferhash h(-1); - h << bl; - if (h.digest() != hinfo->get_chunk_hash(shard)) { - get_parent()->clog_error() << "Bad hash for " << i->first << " digest 0x" - << hex << h.digest() << " expected 0x" << hinfo->get_chunk_hash(shard) << dec; - dout(5) << __func__ << ": Bad hash for " << i->first << " digest 0x" - << hex << h.digest() << " expected 0x" << hinfo->get_chunk_hash(shard) << dec << dendl; - r = -EIO; - goto error; - } - } + ceph_assert(hinfo->has_chunk_hash()); + if ((bl.length() == hinfo->get_total_chunk_size()) && + (offset == 0)) { + dout(20) << __func__ << ": Checking hash of " << hoid << dendl; + bufferhash h(-1); 
+ h << bl; + if (h.digest() != hinfo->get_chunk_hash(shard)) { + get_parent()->clog_error() << "Bad hash for " << hoid << + " digest 0x" + << hex << h.digest() << " expected 0x" << hinfo-> + get_chunk_hash(shard) << dec; + dout(5) << __func__ << ": Bad hash for " << hoid << " digest 0x" + << hex << h.digest() << " expected 0x" << hinfo-> +get_chunk_hash(shard) << dec << dendl; + r = -EIO; + goto error; + } + } } } continue; -error: + error: // Do NOT check osd_read_eio_on_bad_digest here. We need to report // the state of our chunk in case other chunks could substitute. - reply->buffers_read.erase(i->first); - reply->errors[i->first] = r; + reply->buffers_read.erase(hoid); + reply->errors[hoid] = r; } for (set::iterator i = op.attrs_to_read.begin(); i != op.attrs_to_read.end(); ++i) { dout(10) << __func__ << ": fulfilling attr request on " << *i << dendl; - if (reply->errors.count(*i)) + if (reply->errors.contains(*i)) continue; int r = switcher->store->getattrs( switcher->ch, ghobject_t( - *i, ghobject_t::NO_GEN, shard), + *i, ghobject_t::NO_GEN, shard), reply->attrs_read[*i]); if (r < 0) { // If we read error, we should not return the attrs too. @@ -1169,54 +1142,43 @@ error: void ECBackend::handle_sub_write_reply( pg_shard_t from, - const ECSubWriteReply &op, - const ZTracer::Trace &trace) -{ - map::iterator i = rmw_pipeline.tid_to_op_map.find(op.tid); - ceph_assert(i != rmw_pipeline.tid_to_op_map.end()); - if (op.committed) { + const ECSubWriteReply &ec_write_reply_op, + const ZTracer::Trace &trace) { + RMWPipeline::OpRef &op = rmw_pipeline.tid_to_op_map.at(ec_write_reply_op.tid); + if (ec_write_reply_op.committed) { trace.event("sub write committed"); - ceph_assert(i->second->pending_commit.count(from)); - i->second->pending_commit.erase(from); + ceph_assert(op->pending_commits > 0); + op->pending_commits--; if (from != get_parent()->whoami_shard()) { - get_parent()->update_peer_last_complete_ondisk(from, op.last_complete); + get_parent()->update_peer_last_complete_ondisk( + from, ec_write_reply_op.last_complete); } } - if (op.applied) { - trace.event("sub write applied"); - ceph_assert(i->second->pending_apply.count(from)); - i->second->pending_apply.erase(from); - } - if (i->second->pending_commit.empty() && - i->second->on_all_commit && - // also wait for apply, to preserve ordering with luminous peers. 
- i->second->pending_apply.empty()) { - dout(10) << __func__ << " Calling on_all_commit on " << i->second << dendl; - i->second->on_all_commit->complete(0); - i->second->on_all_commit = 0; - i->second->trace.event("ec write all committed"); - } if (cct->_conf->bluestore_debug_inject_read_err && - (i->second->pending_commit.size() == 1) && - ECInject::test_write_error2(i->second->hoid)) { + (op->pending_commits == 1) && + ECInject::test_write_error2(op->hoid)) { std::string cmd = - "{ \"prefix\": \"osd down\", \"ids\": [\"" + std::to_string( get_parent()->whoami() ) + "\"] }"; + "{ \"prefix\": \"osd down\", \"ids\": [\"" + std::to_string( + get_parent()->whoami()) + "\"] }"; vector vcmd{cmd}; dout(0) << __func__ << " Error inject - marking OSD down" << dendl; get_parent()->start_mon_command(vcmd, {}, nullptr, nullptr, nullptr); } - rmw_pipeline.check_ops(); + + if (op->pending_commits == 0) { + rmw_pipeline.try_finish_rmw(); + } } void ECBackend::handle_sub_read_reply( pg_shard_t from, ECSubReadReply &op, - const ZTracer::Trace &trace) -{ + const ZTracer::Trace &trace) { trace.event("ec sub read reply"); dout(10) << __func__ << ": reply " << op << dendl; - map::iterator iter = read_pipeline.tid_to_read_map.find(op.tid); + map::iterator iter = read_pipeline.tid_to_read_map. + find(op.tid); if (iter == read_pipeline.tid_to_read_map.end()) { //canceled dout(20) << __func__ << ": dropped " << op << dendl; @@ -1225,68 +1187,90 @@ void ECBackend::handle_sub_read_reply( ReadOp &rop = iter->second; if (cct->_conf->bluestore_debug_inject_read_err) { for (auto i = op.buffers_read.begin(); - i != op.buffers_read.end(); - ++i) { - if (ECInject::test_read_error0(ghobject_t(i->first, ghobject_t::NO_GEN, op.from.shard))) { - dout(0) << __func__ << " Error inject - EIO error for shard " << op.from.shard << dendl; - op.buffers_read.erase(i->first); - op.attrs_read.erase(i->first); - op.errors[i->first] = -EIO; + i != op.buffers_read.end(); + ++i) { + if (ECInject::test_read_error0( + ghobject_t(i->first, ghobject_t::NO_GEN, op.from.shard))) { + dout(0) << __func__ << " Error inject - EIO error for shard " << op.from + .shard + << dendl; + op.buffers_read.erase(i->first); + op.attrs_read.erase(i->first); + op.errors[i->first] = -EIO; + rop.debug_log.emplace_back(ECUtil::INJECT_EIO, op.from); } - } } - for (auto i = op.buffers_read.begin(); - i != op.buffers_read.end(); - ++i) { - ceph_assert(!op.errors.count(i->first)); // If attribute error we better not have sent a buffer - if (!rop.to_read.count(i->first)) { + for (auto &&[hoid, offset_buffer_map]: op.buffers_read) { + ceph_assert(!op.errors.contains(hoid)); + // If attribute error we better not have sent a buffer + if (!rop.to_read.contains(hoid)) { + rop.debug_log.emplace_back(ECUtil::CANCELLED, op.from); + // We canceled this read! 
@see filter_read_op dout(20) << __func__ << " to_read skipping" << dendl; continue; } - list::const_iterator req_iter = - rop.to_read.find(i->first)->second.to_read.begin(); - list< - boost::tuple< - uint64_t, uint64_t, map > >::iterator riter = - rop.complete[i->first].returned.begin(); - for (list >::iterator j = i->second.begin(); - j != i->second.end(); - ++j, ++req_iter, ++riter) { - ceph_assert(req_iter != rop.to_read.find(i->first)->second.to_read.end()); - ceph_assert(riter != rop.complete[i->first].returned.end()); - pair aligned = - sinfo.chunk_aligned_offset_len_to_chunk( - make_pair(req_iter->offset, req_iter->size)); - ceph_assert(aligned.first == j->first); - riter->get<2>()[from] = std::move(j->second); + + if (!rop.complete.contains(hoid)) { + rop.complete.emplace(hoid, &sinfo); + } + + auto &buffers_read = rop.complete.at(hoid).buffers_read; + for (auto &&[offset, buffer_list]: offset_buffer_map) { + buffers_read.insert_in_shard(from.shard, offset, buffer_list); } + rop.debug_log.emplace_back(ECUtil::READ_DONE, op.from, buffers_read); } - for (auto i = op.attrs_read.begin(); - i != op.attrs_read.end(); - ++i) { - ceph_assert(!op.errors.count(i->first)); // if read error better not have sent an attribute - if (!rop.to_read.count(i->first)) { + for (auto &&[hoid, req]: rop.to_read) { + if (!rop.complete.contains(hoid)) { + rop.complete.emplace(hoid, &sinfo); + } + auto &complete = rop.complete.at(hoid); + for (auto &&[shard, read]: std::as_const(req.shard_reads)) { + if (complete.errors.contains(read.pg_shard)) continue; + + complete.processed_read_requests[shard].union_of(read.extents); + + if (!rop.complete.contains(hoid) || + !complete.buffers_read.contains(shard)) { + if (!read.extents.empty()) continue; // Complete the actual read first. + + // If we are first here, populate the completion. + if (!rop.complete.contains(hoid)) { + rop.complete.emplace(hoid, read_result_t(&sinfo)); + } + } + } + } + for (auto &&[hoid, attr]: op.attrs_read) { + ceph_assert(!op.errors.count(hoid)); + // if read error better not have sent an attribute + if (!rop.to_read.count(hoid)) { // We canceled this read! 
@see filter_read_op dout(20) << __func__ << " to_read skipping" << dendl; continue; } - rop.complete[i->first].attrs.emplace(); - (*(rop.complete[i->first].attrs)).swap(i->second); + if (!rop.complete.contains(hoid)) { + rop.complete.emplace(hoid, &sinfo); + } + rop.complete.at(hoid).attrs.emplace(); + (*(rop.complete.at(hoid).attrs)).swap(attr); } - for (auto i = op.errors.begin(); - i != op.errors.end(); - ++i) { - rop.complete[i->first].errors.insert( - make_pair( - from, - i->second)); - dout(20) << __func__ << " shard=" << from << " error=" << i->second << dendl; + for (auto &&[hoid, err]: op.errors) { + if (!rop.complete.contains(hoid)) { + rop.complete.emplace(hoid, &sinfo); + } + auto &complete = rop.complete.at(hoid); + complete.errors.emplace(from, err); + rop.debug_log.emplace_back(ECUtil::ERROR, op.from, complete.buffers_read); + complete.buffers_read.erase_shard(from.shard); + complete.processed_read_requests.erase(from.shard); + dout(20) << __func__ << " shard=" << from << " error=" << err << dendl; } - map >::iterator siter = - read_pipeline.shard_to_read_map.find(from); + map>::iterator siter = + read_pipeline.shard_to_read_map.find(from); ceph_assert(siter != read_pipeline.shard_to_read_map.end()); ceph_assert(siter->second.count(op.tid)); siter->second.erase(op.tid); @@ -1298,96 +1282,104 @@ void ECBackend::handle_sub_read_reply( // For redundant reads check for completion as each shard comes in, // or in a non-recovery read check for completion once all the shards read. if (rop.do_redundant_reads || rop.in_progress.empty()) { - for (map::const_iterator iter = - rop.complete.begin(); - iter != rop.complete.end(); - ++iter) { - set have; - for (map::const_iterator j = - iter->second.returned.front().get<2>().begin(); - j != iter->second.returned.front().get<2>().end(); - ++j) { - have.insert(static_cast(j->first.shard)); - dout(20) << __func__ << " have shard=" << j->first.shard << dendl; - } - map>> dummy_minimum; - int err; - if ((err = ec_impl->minimum_to_decode(rop.want_to_read[iter->first], have, &dummy_minimum)) < 0) { - dout(20) << __func__ << " minimum_to_decode failed" << dendl; + for (auto &&[oid, read_result]: rop.complete) { + shard_id_set have; + read_result.processed_read_requests.populate_shard_id_set(have); + shard_id_set dummy_minimum; + shard_id_set want_to_read; + rop.to_read.at(oid).shard_want_to_read. + populate_shard_id_set(want_to_read); + + int err = ec_impl->minimum_to_decode(want_to_read, have, dummy_minimum, + nullptr); + if (err) { + dout(20) << __func__ << " minimum_to_decode failed" << dendl; if (rop.in_progress.empty()) { - // If we don't have enough copies, try other pg_shard_ts if available. - // During recovery there may be multiple osds with copies of the same shard, - // so getting EIO from one may result in multiple passes through this code path. - if (!rop.do_redundant_reads) { - int r = read_pipeline.send_all_remaining_reads(iter->first, rop); - if (r == 0) { - // We changed the rop's to_read and not incrementing is_complete - need_resend = true; - continue; - } - // Couldn't read any additional shards so handle as completed with errors - } - // We don't want to confuse clients / RBD with objectstore error - // values in particular ENOENT. We may have different error returns - // from different shards, so we'll return minimum_to_decode() error - // (usually EIO) to reader. It is likely an error here is due to a - // damaged pg. 
- rop.complete[iter->first].r = err; - ++is_complete; - } - } else { - ceph_assert(rop.complete[iter->first].r == 0); - if (!rop.complete[iter->first].errors.empty()) { - if (cct->_conf->osd_read_ec_check_for_errors) { - dout(10) << __func__ << ": Not ignoring errors, use one shard err=" << err << dendl; - err = rop.complete[iter->first].errors.begin()->second; - rop.complete[iter->first].r = err; - } else { - get_parent()->clog_warn() << "Error(s) ignored for " - << iter->first << " enough copies available"; - dout(10) << __func__ << " Error(s) ignored for " << iter->first + // If we don't have enough copies, try other pg_shard_ts if available. + // During recovery there may be multiple osds with copies of the same shard, + // so getting EIO from one may result in multiple passes through this code path. + if (!rop.do_redundant_reads) { + rop.debug_log.emplace_back(ECUtil::REQUEST_MISSING, op.from); + int r = read_pipeline.send_all_remaining_reads(oid, rop); + if (r == 0) { + // We found that new reads are required to do a decode. + need_resend = true; + continue; + } else if (r > 0) { + // No new reads were requested. This means that some parity + // shards can be assumed to be zeros. + err = 0; + } + // else insufficient shards are available, keep the errors. + } + // Couldn't read any additional shards so handle as completed with errors + // We don't want to confuse clients / RBD with objectstore error + // values in particular ENOENT. We may have different error returns + // from different shards, so we'll return minimum_to_decode() error + // (usually EIO) to reader. It is likely an error here is due to a + // damaged pg. + rop.complete.at(oid).r = err; + ++is_complete; + } + } + + if (!err) { + ceph_assert(rop.complete.at(oid).r == 0); + if (!rop.complete.at(oid).errors.empty()) { + if (cct->_conf->osd_read_ec_check_for_errors) { + rop.debug_log.emplace_back(ECUtil::COMPLETE_ERROR, op.from); + dout(10) << __func__ << ": Not ignoring errors, use one shard" << dendl; + err = rop.complete.at(oid).errors.begin()->second; + rop.complete.at(oid).r = err; + } else { + get_parent()->clog_warn() << "Error(s) ignored for " + << iter->first << " enough copies available"; + dout(10) << __func__ << " Error(s) ignored for " << iter->first << " enough copies available" << dendl; - rop.complete[iter->first].errors.clear(); - } - } - // avoid re-read for completed object as we may send remaining reads for uncopmpleted objects - rop.to_read.at(iter->first).need.clear(); - rop.to_read.at(iter->first).want_attrs = false; - ++is_complete; + rop.debug_log.emplace_back(ECUtil::ERROR_CLEAR, op.from); + rop.complete.at(oid).errors.clear(); + } + } + // avoid re-read for completed object as we may send remaining reads for uncopmpleted objects + rop.to_read.at(oid).shard_reads.clear(); + rop.to_read.at(oid).want_attrs = false; + ++is_complete; } } } if (need_resend) { read_pipeline.do_read_op(rop); - } else if (rop.in_progress.empty() || + } else if (rop.in_progress.empty() || is_complete == rop.complete.size()) { dout(20) << __func__ << " Complete: " << rop << dendl; rop.trace.event("ec read complete"); - read_pipeline.complete_read_op(rop); + rop.debug_log.emplace_back(ECUtil::COMPLETE, op.from); + read_pipeline.complete_read_op(std::move(rop)); } else { dout(10) << __func__ << " readop not complete: " << rop << dendl; } } -void ECBackend::check_recovery_sources(const OSDMapRef& osdmap) -{ - struct FinishReadOp : public GenContext { - ECCommon::ReadPipeline& read_pipeline; +void 
ECBackend::check_recovery_sources(const OSDMapRef &osdmap) { + struct FinishReadOp : public GenContext { + ECCommon::ReadPipeline &read_pipeline; ceph_tid_t tid; - FinishReadOp(ECCommon::ReadPipeline& read_pipeline, ceph_tid_t tid) + + FinishReadOp(ECCommon::ReadPipeline &read_pipeline, ceph_tid_t tid) : read_pipeline(read_pipeline), tid(tid) {} - void finish(ThreadPool::TPHandle&) override { + + void finish(ThreadPool::TPHandle &) override { auto ropiter = read_pipeline.tid_to_read_map.find(tid); ceph_assert(ropiter != read_pipeline.tid_to_read_map.end()); - read_pipeline.complete_read_op(ropiter->second); + read_pipeline.complete_read_op(std::move(ropiter->second)); } }; read_pipeline.check_recovery_sources( osdmap, - [this] (const hobject_t& obj) { + [this](const hobject_t &obj) { recovery_backend.recovery_ops.erase(obj); }, - [this] (const ReadOp& op) { + [this](const ReadOp &op) { get_parent()->schedule_recovery_work( get_parent()->bless_unlocked_gencontext( new FinishReadOp(read_pipeline, op.tid)), @@ -1395,22 +1387,21 @@ void ECBackend::check_recovery_sources(const OSDMapRef& osdmap) }); } -void ECBackend::on_change() -{ +void ECBackend::on_change() { rmw_pipeline.on_change(); read_pipeline.on_change(); + rmw_pipeline.on_change2(); clear_recovery_state(); } -void ECBackend::clear_recovery_state() -{ +void ECBackend::clear_recovery_state() { recovery_backend.recovery_ops.clear(); } -void ECBackend::dump_recovery_info(Formatter *f) const -{ +void ECBackend::dump_recovery_info(Formatter *f) const { f->open_array_section("recovery_ops"); - for (map::const_iterator i = recovery_backend.recovery_ops.begin(); + for (map::const_iterator i = + recovery_backend.recovery_ops.begin(); i != recovery_backend.recovery_ops.end(); ++i) { f->open_object_section("op"); @@ -1419,7 +1410,8 @@ void ECBackend::dump_recovery_info(Formatter *f) const } f->close_section(); f->open_array_section("read_ops"); - for (map::const_iterator i = read_pipeline.tid_to_read_map.begin(); + for (map::const_iterator i = read_pipeline.tid_to_read_map + .begin(); i != read_pipeline.tid_to_read_map.end(); ++i) { f->open_object_section("read_op"); @@ -1433,43 +1425,39 @@ struct ECClassicalOp : ECCommon::RMWPipeline::Op { PGTransactionUPtr t; void generate_transactions( - ceph::ErasureCodeInterfaceRef &ecimpl, - pg_t pgid, - const ECUtil::stripe_info_t &sinfo, - std::map *written, - std::map *transactions, - DoutPrefixProvider *dpp, - const ceph_release_t require_osd_release) final - { + ceph::ErasureCodeInterfaceRef &ec_impl, + pg_t pgid, + const ECUtil::stripe_info_t &sinfo, + map *written, + shard_id_map *transactions, + DoutPrefixProvider *dpp, + const OSDMapRef &osdmap) final { assert(t); ECTransaction::generate_transactions( t.get(), plan, - ecimpl, + ec_impl, pgid, sinfo, - remote_read_result, + remote_shard_extent_map, log_entries, written, transactions, &temp_added, &temp_cleared, dpp, - require_osd_release); + osdmap); } - template - static ECTransaction::WritePlan get_write_plan( - const ECUtil::stripe_info_t &sinfo, - PGTransaction& t, - F &&get_hinfo, - DoutPrefixProvider *dpp) - { - return ECTransaction::get_write_plan( - sinfo, - t, - std::forward(get_hinfo), - dpp); + bool skip_transaction( + std::set &pending_roll_forward, + shard_id_t shard, + ceph::os::Transaction &transaction) final { + if (transaction.empty()) { + return true; + } + pending_roll_forward.insert(shard); + return false; } }; @@ -1477,19 +1465,38 @@ std::tuple< int, map>, size_t -> ECBackend::get_attrs_n_size_from_disk(const hobject_t& hoid) 
-{ +> ECBackend::get_attrs_n_size_from_disk(const hobject_t &hoid) { struct stat st; if (int r = object_stat(hoid, &st); r < 0) { dout(10) << __func__ << ": stat error " << r << " on" << hoid << dendl; - return { r, {}, 0 }; + return {r, {}, 0}; } map> real_attrs; if (int r = switcher->objects_get_attrs_with_hinfo(hoid, &real_attrs); r < 0) { dout(10) << __func__ << ": get attr error " << r << " on" << hoid << dendl; - return { r, {}, 0 }; + return {r, {}, 0}; } - return { 0, real_attrs, st.st_size }; + return {0, real_attrs, st.st_size}; +} + +ECUtil::HashInfoRef ECBackend::get_hinfo_from_disk(hobject_t oid) { + auto [r, attrs, size] = get_attrs_n_size_from_disk(oid); + ceph_assert(r >= 0 || r == -ENOENT); + ECUtil::HashInfoRef hinfo = unstable_hashinfo_registry.get_hash_info( + oid, true, attrs, size); + return hinfo; +} + +std::optional ECBackend::get_object_info_from_obc( + ObjectContextRef &obc) { + std::optional ret; + + auto attr_cache = obc->attr_cache; + if (!attr_cache.contains(OI_ATTR)) + return ret; + + ret.emplace(attr_cache.at(OI_ATTR)); + return ret; } void ECBackend::submit_transaction( @@ -1499,15 +1506,15 @@ void ECBackend::submit_transaction( PGTransactionUPtr &&t, const eversion_t &trim_to, const eversion_t &pg_committed_to, - vector&& log_entries, + vector &&log_entries, std::optional &hset_history, Context *on_all_commit, ceph_tid_t tid, osd_reqid_t reqid, OpRequestRef client_op - ) -{ - auto op = std::make_unique(); +) { + auto op = std::make_shared(); + auto obc_map = t->obc_map; op->t = std::move(t); op->hoid = hoid; op->delta_stats = delta_stats; @@ -1516,7 +1523,7 @@ void ECBackend::submit_transaction( /* We update PeeringState::pg_committed_to via the callback * invoked from ECBackend::handle_sub_write_reply immediately * before updating rmw_pipeline.commited_to via - * rmw_pipeline.check_ops()->try_finish_rmw(), so these will + * rmw_pipeline.check_ops()->finish_rmw(), so these will * *usually* match. However, the PrimaryLogPG::submit_log_entries * pathway can perform an out-of-band log update which updates * PeeringState::pg_committed_to independently. 
Thus, the value @@ -1528,33 +1535,66 @@ void ECBackend::submit_transaction( op->tid = tid; op->reqid = reqid; op->client_op = client_op; + op->pipeline = &rmw_pipeline; if (client_op) { op->trace = client_op->pg_trace; } - op->plan = op->get_write_plan( - sinfo, - *(op->t), - [&](const hobject_t &i) { - dout(10) << "submit_transaction: obtaining hash info for get_write_plan" << dendl; - ECUtil::HashInfoRef ref; - if (auto [r, attrs, size] = get_attrs_n_size_from_disk(i); r >= 0 || r == -ENOENT) { - ref = unstable_hashinfo_registry.get_hash_info( - i, - true, - attrs, //op->t->obc_map[hoid]->attr_cache, - size); //op->t->obc_map[hoid]->obs.oi.size); + ECTransaction::WritePlan &plans = op->plan; + + op->t->safe_create_traverse( + [&](std::pair &i) { + const auto &[oid, inner_op] = i; + ECUtil::HashInfoRef shinfo; + auto &obc = obc_map.at(oid); + object_info_t oi = obc->obs.oi; + std::optional soi; + ECUtil::HashInfoRef hinfo; + + if (!sinfo.supports_ec_overwrites()) { + hinfo = get_hinfo_from_disk(oid); } - if (!ref) { - derr << __func__ << ": get_hash_info(" << i << ")" - << " returned a null pointer and there is no " - << " way to recover from such an error in this " - << " context" << dendl; - ceph_abort(); + + hobject_t source; + if (inner_op.has_source(&source)) { + if (!sinfo.supports_ec_overwrites()) { + shinfo = get_hinfo_from_disk(source); + } + if (!inner_op.is_rename()) { + soi = get_object_info_from_obc(obc_map.at(source)); + } } - return ref; - }, - get_parent()->get_dpp()); - dout(10) << __func__ << ": op " << *op << " starting" << dendl; + + uint64_t old_object_size = 0; + bool object_in_cache = false; + if (rmw_pipeline.extent_cache.contains_object(oid)) { + /* We have a valid extent cache for this object. If we need to read, we + * need to behave as if the object is already the size projected by the + * extent cache, or we may not read enough data. 
+ */ + old_object_size = rmw_pipeline.extent_cache.get_projected_size(oid); + object_in_cache = true; + } else { + std::optional old_oi = get_object_info_from_obc(obc); + if (old_oi && !inner_op.delete_first) { + old_object_size = old_oi->size; + } + } + + auto [readable_shards, writable_shards] = + read_pipeline.get_readable_writable_shard_id_sets(); + ECTransaction::WritePlanObj plan(oid, inner_op, sinfo, readable_shards, + writable_shards, + object_in_cache, old_object_size, + oi, soi, std::move(hinfo), + std::move(shinfo), + rmw_pipeline.ec_pdw_write_mode); + + if (plan.to_read) plans.want_read = true; + plans.plans.emplace_back(std::move(plan)); + }); + ldpp_dout(get_parent()->get_dpp(), 20) << __func__ + << " plans=" << plans + << dendl; rmw_pipeline.start_rmw(std::move(op)); } @@ -1563,8 +1603,7 @@ int ECBackend::objects_read_sync( uint64_t off, uint64_t len, uint32_t op_flags, - bufferlist *bl) -{ + bufferlist *bl) { return -EOPNOTSUPP; } @@ -1574,18 +1613,18 @@ void ECBackend::objects_read_async( const list>> &to_read, Context *on_complete, - bool fast_read) -{ - map> reads; + bool fast_read) { + map> reads; uint32_t flags = 0; extent_set es; - for (const auto& [read, ctx] : to_read) { + for (const auto &[read, ctx]: to_read) { pair tmp; if (!cct->_conf->osd_ec_partial_reads || fast_read) { - tmp = sinfo.offset_len_to_stripe_bounds(make_pair(read.offset, read.size)); + tmp = sinfo.ro_offset_len_to_stripe_ro_offset_len(read.offset, read.size); } else { - tmp = sinfo.offset_len_to_chunk_bounds(make_pair(read.offset, read.size)); + tmp.first = read.offset; + tmp.second = read.size; } es.union_insert(tmp.first, tmp.second); flags |= read.flags; @@ -1593,10 +1632,8 @@ void ECBackend::objects_read_async( if (!es.empty()) { auto &offsets = reads[hoid]; - for (auto j = es.begin(); - j != es.end(); - ++j) { - offsets.emplace_back(ec_align_t{j.get_start(), j.get_len(), flags}); + for (auto [off, len]: es) { + offsets.emplace_back(ec_align_t{off, len, flags}); } } @@ -1604,69 +1641,71 @@ void ECBackend::objects_read_async( ECBackend *ec; hobject_t hoid; list > > to_read; + pair>> to_read; unique_ptr on_complete; - cb(const cb&) = delete; + cb(const cb &) = delete; cb(cb &&) = default; + cb(ECBackend *ec, const hobject_t &hoid, const list > > &to_read, + pair>> &to_read, Context *on_complete) : ec(ec), - hoid(hoid), - to_read(to_read), - on_complete(on_complete) {} + hoid(hoid), + to_read(to_read), + on_complete(on_complete) {} + void operator()(ECCommon::ec_extents_t &&results) { auto dpp = ec->get_parent()->get_dpp(); ldpp_dout(dpp, 20) << "objects_read_async_cb: got: " << results << dendl; - ldpp_dout(dpp, 20) << "objects_read_async_cb: cache: " << ec->rmw_pipeline.cache - << dendl; - auto &got = results[hoid]; + auto &got = results.at(hoid); int r = 0; - for (auto &&read: to_read) { - if (got.err < 0) { - // error handling - if (read.second.second) { - read.second.second->complete(got.err); - } - if (r == 0) - r = got.err; - } else { - ceph_assert(read.second.first); - uint64_t offset = read.first.offset; - uint64_t length = read.first.size; - auto range = got.emap.get_containing_range(offset, length); - uint64_t range_offset = range.first.get_off(); - uint64_t range_length = range.first.get_len(); - ceph_assert(range.first != range.second); - ceph_assert(range_offset <= offset); + for (auto &&[read, result]: to_read) { + auto &&[bufs, ctx] = result; + if (got.err < 0) { + // error handling + if (ctx) { + ctx->complete(got.err); + } + if (r == 0) + r = got.err; + } else { + 
ceph_assert(bufs); + uint64_t offset = read.offset; + uint64_t length = read.size; + auto range = got.emap.get_containing_range(offset, length); + uint64_t range_offset = range.first.get_off(); + uint64_t range_length = range.first.get_len(); + ceph_assert(range.first != range.second); + ceph_assert(range_offset <= offset); ldpp_dout(dpp, 20) << "offset: " << offset << dendl; ldpp_dout(dpp, 20) << "range offset: " << range_offset << dendl; ldpp_dout(dpp, 20) << "length: " << length << dendl; ldpp_dout(dpp, 20) << "range length: " << range_length << dendl; - ceph_assert(offset + length <= range_offset + range_length); - read.second.first->substr_of( - range.first.get_val(), - offset - range_offset, - length); - if (read.second.second) { - read.second.second->complete(length); - read.second.second = nullptr; - } - } + ceph_assert((offset + length) <= (range_offset + range_length)); + bufs->substr_of( + range.first.get_val(), + offset - range_offset, + length); + if (ctx) { + ctx->complete(length); + ctx = nullptr; + } + } } to_read.clear(); if (on_complete) { - on_complete.release()->complete(r); + on_complete.release()->complete(r); } } + ~cb() { for (auto &&i: to_read) { - delete i.second.second; + delete i.second.second; } to_read.clear(); } @@ -1674,23 +1713,29 @@ void ECBackend::objects_read_async( objects_read_and_reconstruct( reads, fast_read, + object_size, make_gen_lambda_context< - ECCommon::ec_extents_t &&, cb>( - cb(this, - hoid, - to_read, - on_complete))); + ECCommon::ec_extents_t&&, cb>( + cb(this, + hoid, + to_read, + on_complete))); } void ECBackend::objects_read_and_reconstruct( - const map - > &reads, + const map> &reads, bool fast_read, - GenContextURef &&func) -{ + uint64_t object_size, + GenContextURef &&func) { return read_pipeline.objects_read_and_reconstruct( - reads, fast_read, std::move(func)); + reads, fast_read, object_size, std::move(func)); +} + +void ECBackend::objects_read_and_reconstruct_for_rmw( + map &&to_read, + GenContextURef &&func) { + return read_pipeline.objects_read_and_reconstruct_for_rmw( + std::move(to_read), std::move(func)); } void ECBackend::kick_reads() { @@ -1699,8 +1744,7 @@ void ECBackend::kick_reads() { int ECBackend::object_stat( const hobject_t &hoid, - struct stat* st) -{ + struct stat *st) { int r = switcher->store->stat( switcher->ch, ghobject_t{hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard}, @@ -1710,11 +1754,10 @@ int ECBackend::object_stat( int ECBackend::objects_get_attrs( const hobject_t &hoid, - map> *out) -{ + map> *out) { for (map::iterator i = out->begin(); i != out->end(); - ) { + ) { if (ECUtil::is_hinfo_key_string(i->first)) out->erase(i++); else @@ -1727,14 +1770,13 @@ int ECBackend::be_deep_scrub( const hobject_t &poid, ScrubMap &map, ScrubMapBuilder &pos, - ScrubMap::object &o) -{ + ScrubMap::object &o) { dout(10) << __func__ << " " << poid << " pos " << pos << dendl; int r; uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | - CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | - CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE; + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | + CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE; utime_t sleeptime; sleeptime.set_from_double(cct->_conf->osd_debug_deep_scrub_sleep); @@ -1780,36 +1822,37 @@ int ECBackend::be_deep_scrub( return -EINPROGRESS; } - ECUtil::HashInfoRef hinfo = unstable_hashinfo_registry.get_hash_info(poid, false, o.attrs, o.size); + ECUtil::HashInfoRef hinfo = unstable_hashinfo_registry.get_hash_info( + poid, false, o.attrs, o.size); if (!hinfo) { dout(0) << "_scan_list " << poid << " 
could not retrieve hash info" << dendl; o.read_error = true; o.digest_present = false; return 0; } else { - if (!get_parent()->get_pool().allows_ecoverwrites()) { + if (!sinfo.supports_ec_overwrites()) { if (!hinfo->has_chunk_hash()) { dout(0) << "_scan_list " << poid << " got invalid hash info" << dendl; o.ec_size_mismatch = true; return 0; } if (hinfo->get_total_chunk_size() != (unsigned)pos.data_pos) { - dout(0) << "_scan_list " << poid << " got incorrect size on read 0x" + dout(0) << "_scan_list " << poid << " got incorrect size on read 0x" << std::hex << pos << " expected 0x" << hinfo->get_total_chunk_size() << std::dec << dendl; - o.ec_size_mismatch = true; - return 0; + o.ec_size_mismatch = true; + return 0; } if (hinfo->get_chunk_hash(get_parent()->whoami_shard().shard) != - pos.data_hash.digest()) { - dout(0) << "_scan_list " << poid << " got incorrect hash on read 0x" + pos.data_hash.digest()) { + dout(0) << "_scan_list " << poid << " got incorrect hash on read 0x" << std::hex << pos.data_hash.digest() << " != expected 0x" << hinfo->get_chunk_hash(get_parent()->whoami_shard().shard) << std::dec << dendl; - o.ec_hash_mismatch = true; - return 0; + o.ec_hash_mismatch = true; + return 0; } /* We checked above that we match our own stored hash. We cannot diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h index b11b946183fbc..82b89f3e4afe2 100644 --- a/src/osd/ECBackend.h +++ b/src/osd/ECBackend.h @@ -14,22 +14,23 @@ #pragma once -#include #include +#include #include "ECCommon.h" +#include "ECExtentCache.h" +#include "ECListener.h" +#include "ECTypes.h" +#include "ECUtil.h" #include "OSD.h" #include "PGBackend.h" #include "erasure-code/ErasureCodeInterface.h" -#include "ECUtil.h" -#include "ECTransaction.h" -#include "ExtentCache.h" -#include "ECListener.h" +#include "include/buffer.h" +#include "osd/scrubber/scrub_backend.h" /* This file is soon going to be replaced (before next release), so we are going * to simply ignore all deprecated warnings. 
* */ -IGNORE_DEPRECATED //forward declaration struct ECSubWrite; @@ -39,63 +40,61 @@ struct ECSubReadReply; class ECSwitch; struct RecoveryMessages; +class ECSwitch; class ECBackend : public ECCommon { -public: - PGBackend::RecoveryHandle *open_recovery_op(); + public: + PGBackend::RecoveryHandle *open_recovery_op(); void run_recovery_op( PGBackend::RecoveryHandle *h, - int priority - ); + int priority + ); int recover_object( - const hobject_t &hoid, - eversion_t v, - ObjectContextRef head, - ObjectContextRef obc, + const hobject_t &hoid, + eversion_t v, + ObjectContextRef head, + ObjectContextRef obc, PGBackend::RecoveryHandle *h - ); - - bool _handle_message( - OpRequestRef op - ); - bool can_handle_while_inactive( - OpRequestRef op - ); + ); + + bool _handle_message(OpRequestRef op); + bool can_handle_while_inactive(OpRequestRef op); friend struct SubWriteApplied; friend struct SubWriteCommitted; void sub_write_committed( - ceph_tid_t tid, - eversion_t version, - eversion_t last_complete, - const ZTracer::Trace &trace); + ceph_tid_t tid, + eversion_t version, + eversion_t last_complete, + const ZTracer::Trace &trace + ); void handle_sub_write( - pg_shard_t from, - OpRequestRef msg, - ECSubWrite &op, - const ZTracer::Trace &trace, - ECListener& eclistener + pg_shard_t from, + OpRequestRef msg, + ECSubWrite &op, + const ZTracer::Trace &trace, + ECListener &eclistener ) override; void handle_sub_read( - pg_shard_t from, - const ECSubRead &op, - ECSubReadReply *reply, - const ZTracer::Trace &trace + pg_shard_t from, + const ECSubRead &op, + ECSubReadReply *reply, + const ZTracer::Trace &trace ); void handle_sub_write_reply( - pg_shard_t from, - const ECSubWriteReply &op, - const ZTracer::Trace &trace + pg_shard_t from, + const ECSubWriteReply &op, + const ZTracer::Trace &trace ); void handle_sub_read_reply( - pg_shard_t from, - ECSubReadReply &op, - const ZTracer::Trace &trace + pg_shard_t from, + ECSubReadReply &op, + const ZTracer::Trace &trace ); /// @see ReadOp below - void check_recovery_sources(const OSDMapRef& osdmap); + void check_recovery_sources(const OSDMapRef &osdmap); void on_change(); void clear_recovery_state(); @@ -107,26 +106,27 @@ public: } void submit_transaction( - const hobject_t &hoid, - const object_stat_sum_t &delta_stats, - const eversion_t &at_version, - PGTransactionUPtr &&t, - const eversion_t &trim_to, - const eversion_t &pg_committed_to, - std::vector&& log_entries, - std::optional &hset_history, - Context *on_all_commit, - ceph_tid_t tid, - osd_reqid_t reqid, - OpRequestRef op + const hobject_t &hoid, + const object_stat_sum_t &delta_stats, + const eversion_t &at_version, + PGTransactionUPtr &&t, + const eversion_t &trim_to, + const eversion_t &pg_committed_to, + std::vector &&log_entries, + std::optional &hset_history, + Context *on_all_commit, + ceph_tid_t tid, + osd_reqid_t reqid, + OpRequestRef op ); int objects_read_sync( - const hobject_t &hoid, - uint64_t off, - uint64_t len, - uint32_t op_flags, - ceph::buffer::list *bl); + const hobject_t &hoid, + uint64_t off, + uint64_t len, + uint32_t op_flags, + ceph::buffer::list *bl + ); /** * Async read mechanism @@ -147,19 +147,35 @@ public: * check_recovery_sources. */ void objects_read_and_reconstruct( - const std::map> &reads, - bool fast_read, - GenContextURef &&func) override; + const std::map> &reads, + bool fast_read, + uint64_t object_size, + GenContextURef &&func + ) override; + + /** + * Async read mechanism for read-modify-write (RMW) code paths. 
Here wthe + * client already knows the set of shard reads that are required, so these + * can be passed in directly. The "fast_read" mechanism is not needed. + * + * Otherwise this is the same as objects_read_and_reconstruct. + */ + void objects_read_and_reconstruct_for_rmw( + std::map &&reads, + GenContextURef &&func + ) override; void objects_read_async( - const hobject_t &hoid, - uint64_t object_size, - const std::list>> &to_read, - Context *on_complete, - bool fast_read = false); - -private: + const hobject_t &hoid, + uint64_t object_size, + const std::list>> & + to_read, + Context *on_complete, + bool fast_read = false + ); + + private: friend struct ECRecoveryHandle; void kick_reads(); @@ -194,159 +210,185 @@ private: * Transaction, and reads in a RecoveryMessages object which is passed * among the recovery methods. */ -public: + public: struct RecoveryBackend { - CephContext* cct; + CephContext *cct; const coll_t &coll; ceph::ErasureCodeInterfaceRef ec_impl; - const ECUtil::stripe_info_t& sinfo; - ReadPipeline& read_pipeline; - UnstableHashInfoRegistry& unstable_hashinfo_registry; + const ECUtil::stripe_info_t &sinfo; + ReadPipeline &read_pipeline; + UnstableHashInfoRegistry &unstable_hashinfo_registry; // TODO: lay an interface down here - ECListener* parent; - ECBackend* ecbackend; + ECListener *parent; + ECBackend *ecbackend; ECListener *get_parent() const { return parent; } - const OSDMapRef& get_osdmap() const { return get_parent()->pgb_get_osdmap(); } - epoch_t get_osdmap_epoch() const { return get_parent()->pgb_get_osdmap_epoch(); } + + const OSDMapRef &get_osdmap() const { + return get_parent()->pgb_get_osdmap(); + } + + epoch_t get_osdmap_epoch() const { + return get_parent()->pgb_get_osdmap_epoch(); + } + const pg_info_t &get_info() { return get_parent()->get_info(); } void add_temp_obj(const hobject_t &oid) { get_parent()->add_temp_obj(oid); } - void clear_temp_obj(const hobject_t &oid) { get_parent()->clear_temp_obj(oid); } - - RecoveryBackend(CephContext* cct, - const coll_t &coll, - ceph::ErasureCodeInterfaceRef ec_impl, - const ECUtil::stripe_info_t& sinfo, - ReadPipeline& read_pipeline, - UnstableHashInfoRegistry& unstable_hashinfo_registry, - ECListener* parent, - ECBackend* ecbackend); - struct RecoveryOp { - hobject_t hoid; - eversion_t v; - std::set missing_on; - std::set missing_on_shards; - - ObjectRecoveryInfo recovery_info; - ObjectRecoveryProgress recovery_progress; - - enum state_t { IDLE, READING, WRITING, COMPLETE } state; - - static const char* tostr(state_t state) { - switch (state) { - case RecoveryOp::IDLE: - return "IDLE"; - case RecoveryOp::READING: - return "READING"; - case RecoveryOp::WRITING: - return "WRITING"; - case RecoveryOp::COMPLETE: - return "COMPLETE"; - default: - ceph_abort(); - return ""; - } - } - // must be filled if state == WRITING - std::map returned_data; - std::map> xattrs; - ECUtil::HashInfoRef hinfo; - ObjectContextRef obc; - std::set waiting_on_pushes; + void clear_temp_obj(const hobject_t &oid) { + get_parent()->clear_temp_obj(oid); + } - // valid in state READING - std::pair extent_requested; + RecoveryBackend(CephContext *cct, + const coll_t &coll, + ceph::ErasureCodeInterfaceRef ec_impl, + const ECUtil::stripe_info_t &sinfo, + ReadPipeline &read_pipeline, + UnstableHashInfoRegistry &unstable_hashinfo_registry, + ECListener *parent, + ECBackend *ecbackend); + + struct RecoveryOp { + hobject_t hoid; + eversion_t v; + std::set missing_on; + std::set missing_on_shards; + + ObjectRecoveryInfo recovery_info; + 
ObjectRecoveryProgress recovery_progress; + + enum state_t { IDLE, READING, WRITING, COMPLETE } state; + + static const char *tostr(state_t state) { + switch (state) { + case RecoveryOp::IDLE: + return "IDLE"; + case RecoveryOp::READING: + return "READING"; + case RecoveryOp::WRITING: + return "WRITING"; + case RecoveryOp::COMPLETE: + return "COMPLETE"; + default: + ceph_abort(); + return ""; + } + } - void dump(ceph::Formatter *f) const; + // must be filled if state == WRITING + std::optional returned_data; + std::map> xattrs; + ECUtil::HashInfoRef hinfo; + ObjectContextRef obc; + std::set waiting_on_pushes; + + void dump(ceph::Formatter *f) const; + + RecoveryOp() : state(IDLE) {} + + void print(std::ostream &os) const { + os << "RecoveryOp(" + << "hoid=" << hoid + << " v=" << v + << " missing_on=" << missing_on + << " missing_on_shards=" << missing_on_shards + << " recovery_info=" << recovery_info + << " recovery_progress=" << recovery_progress + << " obc refcount=" << obc.use_count() + << " state=" << ECBackend::RecoveryBackend::RecoveryOp::tostr(state) + << " waiting_on_pushes=" << waiting_on_pushes + << ")"; + } + }; - RecoveryOp() : state(IDLE) {} - }; - friend ostream &operator<<(ostream &lhs, const RecoveryOp &rhs); - std::map recovery_ops; + std::map recovery_ops; - uint64_t get_recovery_chunk_size() const { - return round_up_to(cct->_conf->osd_recovery_max_chunk, - sinfo.get_stripe_width()); - } + uint64_t get_recovery_chunk_size() const { + return round_up_to(cct->_conf->osd_recovery_max_chunk, + sinfo.get_stripe_width()); + } - virtual ~RecoveryBackend() = default; - virtual void commit_txn_send_replies( - ceph::os::Transaction&& txn, - std::map replies) = 0; - void dispatch_recovery_messages(RecoveryMessages &m, int priority); + virtual ~RecoveryBackend() = default; + virtual void commit_txn_send_replies( + ceph::os::Transaction &&txn, + std::map replies) = 0; + void dispatch_recovery_messages(RecoveryMessages &m, int priority); - PGBackend::RecoveryHandle *open_recovery_op(); - void run_recovery_op( - struct ECRecoveryHandle &h, - int priority); - int recover_object( - const hobject_t &hoid, - eversion_t v, - ObjectContextRef head, - ObjectContextRef obc, - PGBackend::RecoveryHandle *h); - void continue_recovery_op( - RecoveryBackend::RecoveryOp &op, - RecoveryMessages *m); - void handle_recovery_read_complete( - const hobject_t &hoid, - boost::tuple > &to_read, - std::optional> > attrs, - RecoveryMessages *m); - void handle_recovery_push( - const PushOp &op, - RecoveryMessages *m, - bool is_repair); - void handle_recovery_push_reply( - const PushReplyOp &op, - pg_shard_t from, - RecoveryMessages *m); - friend struct RecoveryMessages; - int get_ec_data_chunk_count() const { - return ec_impl->get_data_chunk_count(); - } - void _failed_push(const hobject_t &hoid, ECCommon::read_result_t &res); + PGBackend::RecoveryHandle *open_recovery_op(); + void run_recovery_op( + struct ECRecoveryHandle &h, + int priority); + int recover_object( + const hobject_t &hoid, + eversion_t v, + ObjectContextRef head, + ObjectContextRef obc, + PGBackend::RecoveryHandle *h); + void continue_recovery_op( + RecoveryBackend::RecoveryOp &op, + RecoveryMessages *m); + void handle_recovery_read_complete( + const hobject_t &hoid, + ECUtil::shard_extent_map_t &&buffers_read, + std::optional>> + attrs, + const ECUtil::shard_extent_set_t &want_to_read, + RecoveryMessages *m); + void handle_recovery_push( + const PushOp &op, + RecoveryMessages *m, + bool is_repair); + void handle_recovery_push_reply( + const 
PushReplyOp &op, + pg_shard_t from, + RecoveryMessages *m); + friend struct RecoveryMessages; + void _failed_push(const hobject_t &hoid, ECCommon::read_result_t &res); }; + struct ECRecoveryBackend : RecoveryBackend { - ECRecoveryBackend(CephContext* cct, - const coll_t &coll, - ceph::ErasureCodeInterfaceRef ec_impl, - const ECUtil::stripe_info_t& sinfo, - ReadPipeline& read_pipeline, - UnstableHashInfoRegistry& unstable_hashinfo_registry, - PGBackend::Listener* parent, - ECBackend* ecbackend) - : RecoveryBackend(cct, coll, std::move(ec_impl), sinfo, read_pipeline, unstable_hashinfo_registry, parent->get_eclistener(), ecbackend), - parent(parent) { - } + ECRecoveryBackend(CephContext *cct, + const coll_t &coll, + ceph::ErasureCodeInterfaceRef ec_impl, + const ECUtil::stripe_info_t &sinfo, + ReadPipeline &read_pipeline, + UnstableHashInfoRegistry &unstable_hashinfo_registry, + PGBackend::Listener *parent, + ECBackend *ecbackend) + : RecoveryBackend(cct, coll, std::move(ec_impl), sinfo, read_pipeline, + unstable_hashinfo_registry, parent->get_eclistener(), + ecbackend), + parent(parent) {} void commit_txn_send_replies( - ceph::os::Transaction&& txn, - std::map replies) override; + ceph::os::Transaction &&txn, + std::map replies) override; PGBackend::Listener *get_parent() const { return parent; } - private: + private: PGBackend::Listener *parent; }; - friend ostream &operator<<(ostream &lhs, const RecoveryBackend::RecoveryOp &rhs); + + friend ostream &operator<<(ostream &lhs, + const RecoveryBackend::RecoveryOp &rhs + ); friend struct RecoveryMessages; friend struct OnRecoveryReadComplete; friend struct RecoveryReadCompleter; void handle_recovery_push( - const PushOp &op, - RecoveryMessages *m, - bool is_repair); + const PushOp &op, + RecoveryMessages *m, + bool is_repair + ); -public: - PGBackend::Listener *parent; - CephContext *cct; - ECSwitch *switcher; - struct ReadPipeline read_pipeline; - struct RMWPipeline rmw_pipeline; - struct ECRecoveryBackend recovery_backend; + PGBackend::Listener *parent; + CephContext *cct; + ECSwitch *switcher; + ReadPipeline read_pipeline; + RMWPipeline rmw_pipeline; + ECRecoveryBackend recovery_backend; ceph::ErasureCodeInterfaceRef ec_impl; @@ -355,46 +397,55 @@ public: /** * ECRecPred * - * Determines the whether _have is sufficient to recover an object + * Determines whether _have is sufficient to recover an object */ class ECRecPred : public IsPGRecoverablePredicate { - std::set want; + shard_id_set want; + const ECUtil::stripe_info_t *sinfo; ceph::ErasureCodeInterfaceRef ec_impl; - public: - explicit ECRecPred(ceph::ErasureCodeInterfaceRef ec_impl) : ec_impl(ec_impl) { - for (unsigned i = 0; i < ec_impl->get_chunk_count(); ++i) { - want.insert(i); - } + + public: + explicit ECRecPred(const ECUtil::stripe_info_t *sinfo, + ceph::ErasureCodeInterfaceRef ec_impl) : + sinfo(sinfo), ec_impl(ec_impl) { + want.insert_range(shard_id_t(0), sinfo->get_k_plus_m()); } + bool operator()(const std::set &_have) const override { - std::set have; - for (std::set::const_iterator i = _have.begin(); - i != _have.end(); - ++i) { - have.insert(static_cast(i->shard)); + shard_id_set have; + for (pg_shard_t p: _have) { + have.insert(p.shard); + } + std::unique_ptr>>> + min_sub_chunks = nullptr; + if (sinfo->supports_sub_chunks()) { + min_sub_chunks = std::make_unique>>>(sinfo->get_k_plus_m()); } - std::map>> min; + shard_id_set min; - return ec_impl->minimum_to_decode(want, have, &min) == 0; + return ec_impl->minimum_to_decode(want, have, min, min_sub_chunks.get()) + == 0; } 
}; + std::unique_ptr get_is_recoverable_predicate() const { - return std::make_unique(ec_impl); + return std::make_unique(&sinfo, ec_impl); } - unsigned get_ec_data_chunk_count() const { - return ec_impl->get_data_chunk_count(); + unsigned get_ec_data_chunk_count() const { + return sinfo.get_k(); } + int get_ec_stripe_chunk_size() const { return sinfo.get_chunk_size(); } - uint64_t object_size_to_shard_size(const uint64_t size, - shard_id_t shard) const { - if (size == std::numeric_limits::max()) { - return size; - } - return sinfo.logical_to_next_chunk_offset(size); + + uint64_t object_size_to_shard_size(const uint64_t size, shard_id_t shard + ) const { + return sinfo.object_size_to_shard_size(size, shard); } + /** * ECReadPred * @@ -403,16 +454,21 @@ public: class ECReadPred : public IsPGReadablePredicate { pg_shard_t whoami; ECRecPred rec_pred; - public: + + public: ECReadPred( - pg_shard_t whoami, - ceph::ErasureCodeInterfaceRef ec_impl) : whoami(whoami), rec_pred(ec_impl) {} + pg_shard_t whoami, + const ECUtil::stripe_info_t *sinfo, + ceph::ErasureCodeInterfaceRef ec_impl) : whoami(whoami), rec_pred(sinfo, ec_impl) {} + bool operator()(const std::set &_have) const override { return _have.count(whoami) && rec_pred(_have); } }; - std::unique_ptr get_is_readable_predicate(pg_shard_t whoami) const { - return std::make_unique(whoami, ec_impl); + + std::unique_ptr + get_is_readable_predicate(pg_shard_t whoami) const { + return std::make_unique(whoami, &sinfo, ec_impl); } const ECUtil::stripe_info_t sinfo; @@ -424,34 +480,41 @@ public: int, std::map>, size_t - > get_attrs_n_size_from_disk(const hobject_t& hoid); + > get_attrs_n_size_from_disk(const hobject_t &hoid); + + ECUtil::HashInfoRef get_hinfo_from_disk(hobject_t oid); + + std::optional get_object_info_from_obc( + ObjectContextRef &obc_map + ); -public: - int object_stat(const hobject_t &hoid, struct stat* st); + public: + int object_stat(const hobject_t &hoid, struct stat *st); ECBackend( - PGBackend::Listener *pg, - CephContext *cct, - ceph::ErasureCodeInterfaceRef ec_impl, - uint64_t stripe_width, - ECSwitch *s, - ECExtentCache::LRU &ignored); + PGBackend::Listener *pg, + CephContext *cct, + ceph::ErasureCodeInterfaceRef ec_impl, + uint64_t stripe_width, + ECSwitch *s, + ECExtentCache::LRU &ec_extent_cache_lru + ); int objects_get_attrs( - const hobject_t &hoid, - std::map> *out); + const hobject_t &hoid, + std::map> *out + ); bool auto_repair_supported() const { return true; } int be_deep_scrub( - const hobject_t &poid, - ScrubMap &map, - ScrubMapBuilder &pos, - ScrubMap::object &o); + const hobject_t &poid, + ScrubMap &map, + ScrubMapBuilder &pos, + ScrubMap::object &o + ); - uint64_t be_get_ondisk_size(uint64_t logical_size, shard_id_t ignored) const { - return sinfo.logical_to_next_chunk_offset(logical_size); + uint64_t be_get_ondisk_size(uint64_t logical_size, shard_id_t shard_id + ) const { + return object_size_to_shard_size(logical_size, shard_id); } }; -ostream &operator<<(ostream &lhs, const ECBackend::RMWPipeline::pipeline_state_t &rhs); - -END_IGNORE_DEPRECATED diff --git a/src/osd/ECCommon.cc b/src/osd/ECCommon.cc index 1b197284161df..6a4d64ba41516 100644 --- a/src/osd/ECCommon.cc +++ b/src/osd/ECCommon.cc @@ -16,18 +16,15 @@ #include #include +#include +#include #include "ECInject.h" -#include "messages/MOSDPGPush.h" -#include "messages/MOSDPGPushReply.h" #include "messages/MOSDECSubOpWrite.h" -#include "messages/MOSDECSubOpWriteReply.h" #include "messages/MOSDECSubOpRead.h" -#include "messages/MOSDECSubOpReadReply.h" 
#include "common/debug.h" #include "ECMsgTypes.h" #include "PGLog.h" - #include "osd_tracer.h" #define dout_context cct @@ -36,11 +33,6 @@ #undef dout_prefix #define dout_prefix _prefix(_dout, this) -/* This file is soon going to be replaced (before next release), so we are going - * to simply ignore all deprecated warnings. - * */ -IGNORE_DEPRECATED - using std::dec; using std::hex; using std::less; @@ -60,77 +52,27 @@ using ceph::bufferptr; using ceph::ErasureCodeInterfaceRef; using ceph::Formatter; -static ostream& _prefix(std::ostream *_dout, ECCommon::RMWPipeline *rmw_pipeline) { +static ostream &_prefix(std::ostream *_dout, + ECCommon::RMWPipeline const *rmw_pipeline) { return rmw_pipeline->get_parent()->gen_dbg_prefix(*_dout); } -static ostream& _prefix(std::ostream *_dout, ECCommon::ReadPipeline *read_pipeline) { - return read_pipeline->get_parent()->gen_dbg_prefix(*_dout); -} -static ostream& _prefix(std::ostream *_dout, - ECCommon::UnstableHashInfoRegistry *unstable_hash_info_registry) { - // TODO: backref to ECListener? - return *_dout; -} -static ostream& _prefix(std::ostream *_dout, struct ClientReadCompleter *read_completer); - -ostream &operator<<(ostream &lhs, const ECCommon::RMWPipeline::pipeline_state_t &rhs) { - switch (rhs.pipeline_state) { - case ECCommon::RMWPipeline::pipeline_state_t::CACHE_VALID: - return lhs << "CACHE_VALID"; - case ECCommon::RMWPipeline::pipeline_state_t::CACHE_INVALID: - return lhs << "CACHE_INVALID"; - default: - ceph_abort_msg("invalid pipeline state"); - } - return lhs; // unreachable -} -ostream &operator<<(ostream &lhs, const ECCommon::ec_extent_t &rhs) -{ - return lhs << rhs.err << "," - << rhs.emap; -} - -ostream &operator<<(ostream &lhs, const ECCommon::read_request_t &rhs) -{ - return lhs << "read_request_t(to_read=[" << rhs.to_read << "]" - << ", need=" << rhs.need - << ", want_attrs=" << rhs.want_attrs - << ")"; +static ostream &_prefix(std::ostream *_dout, + ECCommon::ReadPipeline const *read_pipeline) { + return read_pipeline->get_parent()->gen_dbg_prefix(*_dout); } -ostream &operator<<(ostream &lhs, const ECCommon::read_result_t &rhs) -{ - lhs << "read_result_t(r=" << rhs.r - << ", errors=" << rhs.errors; - if (rhs.attrs) { - lhs << ", attrs=" << *(rhs.attrs); - } else { - lhs << ", noattrs"; - } - return lhs << ", returned=" << rhs.returned << ")"; +static ostream &_prefix(std::ostream *_dout, + ECCommon::UnstableHashInfoRegistry * + unstable_hash_info_registry) { + return *_dout; } -ostream &operator<<(ostream &lhs, const ECCommon::ReadOp &rhs) -{ - lhs << "ReadOp(tid=" << rhs.tid; -#ifndef WITH_CRIMSON - if (rhs.op && rhs.op->get_req()) { - lhs << ", op="; - rhs.op->get_req()->print(lhs); - } -#endif - return lhs << ", to_read=" << rhs.to_read - << ", complete=" << rhs.complete - << ", priority=" << rhs.priority - << ", obj_to_source=" << rhs.obj_to_source - << ", source_to_obj=" << rhs.source_to_obj - << ", want_to_read" << rhs.want_to_read - << ", in_progress=" << rhs.in_progress << ")"; -} +static ostream &_prefix(std::ostream *_dout, + struct ClientReadCompleter const *read_completer + ); -void ECCommon::ReadOp::dump(Formatter *f) const -{ +void ECCommon::ReadOp::dump(Formatter *f) const { f->dump_unsigned("tid", tid); #ifndef WITH_CRIMSON if (op && op->get_req()) { @@ -142,272 +84,287 @@ void ECCommon::ReadOp::dump(Formatter *f) const f->dump_int("priority", priority); f->dump_stream("obj_to_source") << obj_to_source; f->dump_stream("source_to_obj") << source_to_obj; - f->dump_stream("want_to_read") << want_to_read; 
  f->dump_stream("in_progress") << in_progress;
}

-ostream &operator<<(ostream &lhs, const ECCommon::RMWPipeline::Op &rhs)
-{
-  lhs << "Op(" << rhs.hoid
-      << " v=" << rhs.version
-      << " tt=" << rhs.trim_to
-      << " tid=" << rhs.tid
-      << " reqid=" << rhs.reqid;
-#ifndef WITH_CRIMSON
-  if (rhs.client_op && rhs.client_op->get_req()) {
-    lhs << " client_op=";
-    rhs.client_op->get_req()->print(lhs);
-  }
-#endif
-  lhs << " pg_committed_to=" << rhs.pg_committed_to
-      << " temp_added=" << rhs.temp_added
-      << " temp_cleared=" << rhs.temp_cleared
-      << " pending_read=" << rhs.pending_read
-      << " remote_read=" << rhs.remote_read
-      << " remote_read_result=" << rhs.remote_read_result
-      << " pending_apply=" << rhs.pending_apply
-      << " pending_commit=" << rhs.pending_commit
-      << " plan.to_read=" << rhs.plan.to_read
-      << " plan.will_write=" << rhs.plan.will_write
-      << ")";
-  return lhs;
-}
-
-void ECCommon::ReadPipeline::complete_read_op(ReadOp &rop)
-{
+void ECCommon::ReadPipeline::complete_read_op(ReadOp &&rop) {
  dout(20) << __func__ << " completing " << rop << dendl;
-  map::iterator req_iter =
-    rop.to_read.begin();
-  map::iterator resiter =
-    rop.complete.begin();
+  auto req_iter = rop.to_read.begin();
+  auto resiter = rop.complete.begin();
  ceph_assert(rop.to_read.size() == rop.complete.size());
  for (; req_iter != rop.to_read.end(); ++req_iter, ++resiter) {
-    ceph_assert(rop.want_to_read.contains(req_iter->first));
+    auto &hoid = req_iter->first;
+    read_result_t &res = resiter->second;
+    read_request_t &req = req_iter->second;
    rop.on_complete->finish_single_request(
-      req_iter->first,
-      resiter->second,
-      req_iter->second.to_read,
-      rop.want_to_read[req_iter->first]);
+      hoid, std::move(res), req);
  }
  ceph_assert(rop.on_complete);
  std::move(*rop.on_complete).finish(rop.priority);
  rop.on_complete = nullptr;
+  // The read op is complete; clean up all the data for this tid. 
- for (set::iterator iter = rop.in_progress.begin(); - iter != rop.in_progress.end(); - iter++) { - shard_to_read_map[*iter].erase(rop.tid); + for (auto &pg_shard: rop.in_progress) { + shard_to_read_map[pg_shard].erase(rop.tid); } rop.in_progress.clear(); tid_to_read_map.erase(rop.tid); } -void ECCommon::ReadPipeline::on_change() -{ - for (map::iterator i = tid_to_read_map.begin(); - i != tid_to_read_map.end(); - ++i) { - dout(10) << __func__ << ": cancelling " << i->second << dendl; +void ECCommon::ReadPipeline::on_change() { + for (auto &rop: std::views::keys(tid_to_read_map)) { + dout(10) << __func__ << ": cancelling " << rop << dendl; } tid_to_read_map.clear(); shard_to_read_map.clear(); in_progress_client_reads.clear(); } +std::pair +ECCommon::ReadPipeline::get_readable_writable_shard_id_sets() { + shard_id_set readable; + shard_id_set writable; + + for (auto &&pg_shard: get_parent()->get_acting_shards()) { + readable.insert(pg_shard.shard); + } + + writable = get_parent()->get_acting_recovery_backfill_shard_id_set(); + return std::make_pair(std::move(readable), std::move(writable)); +} + void ECCommon::ReadPipeline::get_all_avail_shards( - const hobject_t &hoid, - const set &error_shards, - set &have, - map &shards, - bool for_recovery) -{ - for (set::const_iterator i = - get_parent()->get_acting_shards().begin(); - i != get_parent()->get_acting_shards().end(); - ++i) { - dout(10) << __func__ << ": checking acting " << *i << dendl; - const pg_missing_t &missing = get_parent()->get_shard_missing(*i); - if (error_shards.contains(*i)) { + const hobject_t &hoid, + shard_id_set &have, + shard_id_map &shards, + const bool for_recovery, + const std::optional> &error_shards) { + for (auto &&pg_shard: get_parent()->get_acting_shards()) { + dout(10) << __func__ << ": checking acting " << pg_shard << dendl; + const pg_missing_t &missing = get_parent()->get_shard_missing(pg_shard); + if (error_shards && error_shards->contains(pg_shard)) { continue; } + const shard_id_t &shard = pg_shard.shard; if (cct->_conf->bluestore_debug_inject_read_err && - ECInject::test_read_error1(ghobject_t(hoid, ghobject_t::NO_GEN, i->shard))) { - dout(0) << __func__ << " Error inject - Missing shard " << i->shard << dendl; + ECInject::test_read_error1(ghobject_t(hoid, ghobject_t::NO_GEN, shard))) { + dout(0) << __func__ << " Error inject - Missing shard " << shard << dendl; continue; } if (!missing.is_missing(hoid)) { - ceph_assert(!have.count(static_cast(i->shard))); - have.insert(static_cast(i->shard)); - ceph_assert(!shards.count(i->shard)); - shards.insert(make_pair(i->shard, *i)); + ceph_assert(!have.contains(shard)); + have.insert(shard); + ceph_assert(!shards.contains(shard)); + shards.insert(shard, pg_shard); } } if (for_recovery) { - for (set::const_iterator i = - get_parent()->get_backfill_shards().begin(); - i != get_parent()->get_backfill_shards().end(); - ++i) { - if (error_shards.find(*i) != error_shards.end()) - continue; - if (have.count(static_cast(i->shard))) { - ceph_assert(shards.count(i->shard)); - continue; + for (auto &&pg_shard: get_parent()->get_backfill_shards()) { + if (error_shards && error_shards->contains(pg_shard)) + continue; + const shard_id_t &shard = pg_shard.shard; + if (have.contains(shard)) { + ceph_assert(shards.contains(shard)); + continue; } - dout(10) << __func__ << ": checking backfill " << *i << dendl; - ceph_assert(!shards.count(i->shard)); - const pg_info_t &info = get_parent()->get_shard_info(*i); - const pg_missing_t &missing = get_parent()->get_shard_missing(*i); + 
dout(10) << __func__ << ": checking backfill " << pg_shard << dendl; + ceph_assert(!shards.count(shard)); + const pg_info_t &info = get_parent()->get_shard_info(pg_shard); if (hoid < info.last_backfill && - !missing.is_missing(hoid)) { - have.insert(static_cast(i->shard)); - shards.insert(make_pair(i->shard, *i)); + !get_parent()->get_shard_missing(pg_shard).is_missing(hoid)) { + have.insert(shard); + shards.insert(shard, pg_shard); } } - map>::const_iterator miter = - get_parent()->get_missing_loc_shards().find(hoid); + auto miter = get_parent()->get_missing_loc_shards().find(hoid); if (miter != get_parent()->get_missing_loc_shards().end()) { - for (set::iterator i = miter->second.begin(); - i != miter->second.end(); - ++i) { - dout(10) << __func__ << ": checking missing_loc " << *i << dendl; - auto m = get_parent()->maybe_get_shard_missing(*i); - if (m) { - ceph_assert(!(*m).is_missing(hoid)); - } - if (error_shards.find(*i) != error_shards.end()) - continue; - have.insert(static_cast(i->shard)); - shards.insert(make_pair(i->shard, *i)); + for (auto &&pg_shard: miter->second) { + dout(10) << __func__ << ": checking missing_loc " << pg_shard << dendl; + if (const auto m = get_parent()->maybe_get_shard_missing(pg_shard)) { + ceph_assert(!m->is_missing(hoid)); + } + if (error_shards && error_shards->contains(pg_shard)) { + continue; + } + have.insert(pg_shard.shard); + shards.insert(pg_shard.shard, pg_shard); } } } } int ECCommon::ReadPipeline::get_min_avail_to_read_shards( - const hobject_t &hoid, - const set &want, - bool for_recovery, - bool do_redundant_reads, - map>> *to_read) -{ + const hobject_t &hoid, + bool for_recovery, + bool do_redundant_reads, + read_request_t &read_request, + const std::optional> &error_shards) { // Make sure we don't do redundant reads for recovery ceph_assert(!for_recovery || !do_redundant_reads); - set have; - map shards; - set error_shards; + if (read_request.object_size == 0) { + dout(10) << __func__ << " empty read" << dendl; + return 0; + } - get_all_avail_shards(hoid, error_shards, have, shards, for_recovery); + shard_id_set have; + shard_id_map shards(sinfo.get_k_plus_m()); - map>> need; - int r = ec_impl->minimum_to_decode(want, have, &need); - if (r < 0) + get_all_avail_shards(hoid, have, shards, for_recovery, error_shards); + + std::unique_ptr>>> need_sub_chunks = + nullptr; + if (sinfo.supports_sub_chunks()) { + need_sub_chunks = std::make_unique>>>( + sinfo.get_k_plus_m()); + } + shard_id_set need_set; + shard_id_set want; + + read_request.shard_want_to_read.populate_shard_id_set(want); + + int r = ec_impl->minimum_to_decode(want, have, need_set, + need_sub_chunks.get()); + if (r < 0) { + dout(20) << "minimum_to_decode_failed r: " << r << "want: " << want + << " have: " << have << " need: " << need_set << dendl; return r; + } if (do_redundant_reads) { + if (need_sub_chunks) { vector> subchunks_list; subchunks_list.push_back(make_pair(0, ec_impl->get_sub_chunk_count())); for (auto &&i: have) { - need[i] = subchunks_list; + (*need_sub_chunks)[i] = subchunks_list; } - } + } + for (auto &&i: have) { + need_set.insert(i); + } + } - if (!to_read) - return 0; + extent_set extra_extents; + ECUtil::shard_extent_set_t read_mask(sinfo.get_k_plus_m()); + ECUtil::shard_extent_set_t zero_mask(sinfo.get_k_plus_m()); + + sinfo.ro_size_to_read_mask(read_request.object_size, read_mask); + sinfo.ro_size_to_zero_mask(read_request.object_size, zero_mask); - for (auto &&i:need) { - ceph_assert(shards.count(shard_id_t(i.first))); - 
to_read->insert(make_pair(shards[shard_id_t(i.first)], i.second)); + /* First deal with missing shards */ + for (auto &&[shard, extent_set]: read_request.shard_want_to_read) { + /* Work out what extra extents we need to read on each shard. If do + * redundant reads is set, then we want to have the same reads on + * every extent. Otherwise, we need to read every shard only if the + * necessary shard is missing. + */ + if (!have.contains(shard) || do_redundant_reads) { + extra_extents.union_of(extent_set); + } } - return 0; -} -// a static for the sake of unittesting -void ECCommon::ReadPipeline::get_min_want_to_read_shards( - const uint64_t offset, - const uint64_t length, - const ECUtil::stripe_info_t& sinfo, - set *want_to_read) -{ - const auto [left_chunk_index, right_chunk_index] = - sinfo.offset_length_to_data_chunk_indices(offset, length); - const auto distance = - std::min(right_chunk_index - left_chunk_index, (uint64_t)sinfo.get_k()); - for(uint64_t i = 0; i < distance; i++) { - raw_shard_id_t raw_shard((left_chunk_index + i) % sinfo.get_k()); - want_to_read->insert(static_cast(sinfo.get_shard(raw_shard))); + for (auto &shard: need_set) { + if (!have.contains(shard)) { + continue; + } + shard_id_t shard_id(shard); + extent_set extents = extra_extents; + shard_read_t shard_read; + if (need_sub_chunks) { + shard_read.subchunk = need_sub_chunks->at(shard_id); + } + shard_read.pg_shard = shards[shard_id]; + + if (read_request.shard_want_to_read.contains(shard)) { + extents.union_of(read_request.shard_want_to_read.at(shard)); + } + + extents.align(CEPH_PAGE_SIZE); + if (read_mask.contains(shard)) { + shard_read.extents.intersection_of(extents, read_mask.at(shard)); + } + + if (!shard_read.extents.empty()) { + read_request.shard_reads[shard_id] = std::move(shard_read); + } } + + dout(20) << __func__ << " for_recovery: " << for_recovery + << " do_redundant_reads: " << do_redundant_reads + << " read_request: " << read_request + << " error_shards: " << error_shards + << dendl; + return 0; } + void ECCommon::ReadPipeline::get_min_want_to_read_shards( - const uint64_t offset, - const uint64_t length, - set *want_to_read) -{ - get_min_want_to_read_shards(offset, length, sinfo, want_to_read); - dout(20) << __func__ << ": offset " << offset << " length " << length - << " want_to_read " << *want_to_read << dendl; + const ec_align_t &to_read, + ECUtil::shard_extent_set_t &want_shard_reads) { + sinfo.ro_range_to_shard_extent_set(to_read.offset, to_read.size, + want_shard_reads); + dout(20) << __func__ << ": to_read " << to_read + << " read_request " << want_shard_reads << dendl; } int ECCommon::ReadPipeline::get_remaining_shards( - const hobject_t &hoid, - const set &avail, - const set &want, - const read_result_t &result, - map>> *to_read, - bool for_recovery) -{ - ceph_assert(to_read); - - set have; - map shards; + const hobject_t &hoid, + read_result_t &read_result, + read_request_t &read_request, + const bool for_recovery, + const bool fast_read) { + shard_id_map shards(sinfo.get_k_plus_m()); set error_shards; - for (auto &p : result.errors) { - error_shards.insert(p.first); + for (auto &shard: std::views::keys(read_result.errors)) { + error_shards.insert(shard); } - get_all_avail_shards(hoid, error_shards, have, shards, for_recovery); + const int r = get_min_avail_to_read_shards( + hoid, + for_recovery, + fast_read, + read_request, + error_shards); - map>> need; - int r = ec_impl->minimum_to_decode(want, have, &need); - if (r < 0) { + if (r) { dout(0) << __func__ << " not enough shards left to 
try for " << hoid - << " read result was " << result << dendl; + << " read result was " << read_result << dendl; return -EIO; } - set shards_left; - for (auto p : need) { - if (avail.find(p.first) == avail.end()) { - shards_left.insert(p.first); + // Rather than repeating whole read, we can remove everything we already have. + for (auto iter = read_request.shard_reads.begin(); + iter != read_request.shard_reads.end();) { + auto &&[shard_id, shard_read] = *iter; + bool do_erase = false; + + // Ignore where shard has not been read at all. + if (read_result.processed_read_requests.contains(shard_id)) { + shard_read.extents.subtract( + read_result.processed_read_requests.at(shard_id)); + do_erase = shard_read.extents.empty(); } - } - vector> subchunks; - subchunks.push_back(make_pair(0, ec_impl->get_sub_chunk_count())); - for (set::iterator i = shards_left.begin(); - i != shards_left.end(); - ++i) { - ceph_assert(shards.count(shard_id_t(*i))); - ceph_assert(avail.find(*i) == avail.end()); - to_read->insert(make_pair(shards[shard_id_t(*i)], subchunks)); + if (do_erase) { + iter = read_request.shard_reads.erase(iter); + } else { + ++iter; + } } - return 0; + + return read_request.shard_reads.empty()?1:0; } void ECCommon::ReadPipeline::start_read_op( - int priority, - map> &want_to_read, - map &to_read, - OpRequestRef _op, - bool do_redundant_reads, - bool for_recovery, - std::unique_ptr on_complete) -{ + const int priority, + map &to_read, + const bool do_redundant_reads, + const bool for_recovery, + std::unique_ptr on_complete) { ceph_tid_t tid = get_parent()->get_tid(); - ceph_assert(!tid_to_read_map.count(tid)); + ceph_assert(!tid_to_read_map.contains(tid)); auto &op = tid_to_read_map.emplace( tid, ReadOp( @@ -416,527 +373,419 @@ void ECCommon::ReadPipeline::start_read_op( do_redundant_reads, for_recovery, std::move(on_complete), - _op, - std::move(want_to_read), std::move(to_read))).first->second; dout(10) << __func__ << ": starting " << op << dendl; - if (_op) { + if (op.op) { #ifndef WITH_CRIMSON - op.trace = _op->pg_trace; + op.trace = op.op->pg_trace; #endif op.trace.event("start ec read"); } do_read_op(op); } -void ECCommon::ReadPipeline::do_read_op(ReadOp &op) -{ - int priority = op.priority; - ceph_tid_t tid = op.tid; +void ECCommon::ReadPipeline::do_read_op(ReadOp &rop) { + const int priority = rop.priority; + const ceph_tid_t tid = rop.tid; - dout(10) << __func__ << ": starting read " << op << dendl; + dout(10) << __func__ << ": starting read " << rop << dendl; + ceph_assert(!rop.to_read.empty()); map messages; - for (map::iterator i = op.to_read.begin(); - i != op.to_read.end(); - ++i) { - bool need_attrs = i->second.want_attrs; - - for (auto j = i->second.need.begin(); - j != i->second.need.end(); - ++j) { - if (need_attrs) { - messages[j->first].attrs_to_read.insert(i->first); - need_attrs = false; + for (auto &&[hoid, read_request]: rop.to_read) { + bool need_attrs = read_request.want_attrs; + ceph_assert(!read_request.shard_reads.empty()); + + for (auto &&[shard, shard_read]: read_request.shard_reads) { + if (need_attrs && !sinfo.is_nonprimary_shard(shard)) { + messages[shard_read.pg_shard].attrs_to_read.insert(hoid); + need_attrs = false; } - messages[j->first].subchunks[i->first] = j->second; - op.obj_to_source[i->first].insert(j->first); - op.source_to_obj[j->first].insert(i->first); + if (shard_read.subchunk) { + messages[shard_read.pg_shard].subchunks[hoid] = *shard_read.subchunk; + } else { + static const std::vector default_sub_chunk = {make_pair(0, 1)}; + 
messages[shard_read.pg_shard].subchunks[hoid] = default_sub_chunk; + } + rop.obj_to_source[hoid].insert(shard_read.pg_shard); + rop.source_to_obj[shard_read.pg_shard].insert(hoid); } - for (const auto& read : i->second.to_read) { - auto p = make_pair(read.offset, read.size); - pair chunk_off_len = sinfo.chunk_aligned_offset_len_to_chunk(p); - for (auto k = i->second.need.begin(); - k != i->second.need.end(); - ++k) { - messages[k->first].to_read[i->first].push_back( - boost::make_tuple( - chunk_off_len.first, - chunk_off_len.second, - read.flags)); + for (auto &[_, shard_read]: read_request.shard_reads) { + ceph_assert(!shard_read.extents.empty()); + rop.debug_log.emplace_back(ECUtil::READ_REQUEST, shard_read.pg_shard, + shard_read.extents); + for (auto &[start, len]: shard_read.extents) { + messages[shard_read.pg_shard].to_read[hoid].emplace_back( + boost::make_tuple(start, len, read_request.flags)); } - ceph_assert(!need_attrs); } + ceph_assert(!need_attrs); } std::vector> m; m.reserve(messages.size()); - for (map::iterator i = messages.begin(); - i != messages.end(); - ++i) { - op.in_progress.insert(i->first); - shard_to_read_map[i->first].insert(op.tid); - i->second.tid = tid; - MOSDECSubOpRead *msg = new MOSDECSubOpRead; + for (auto &&[pg_shard, read]: messages) { + rop.in_progress.insert(pg_shard); + shard_to_read_map[pg_shard].insert(rop.tid); + read.tid = tid; + auto *msg = new MOSDECSubOpRead; msg->set_priority(priority); - msg->pgid = spg_t( - get_info().pgid.pgid, - i->first.shard); + msg->pgid = spg_t(get_info().pgid.pgid, pg_shard.shard); msg->map_epoch = get_osdmap_epoch(); msg->min_epoch = get_parent()->get_interval_start_epoch(); - msg->op = i->second; + msg->op = read; msg->op.from = get_parent()->whoami_shard(); msg->op.tid = tid; - if (op.trace) { + if (rop.trace) { // initialize a child span for this shard - msg->trace.init("ec sub read", nullptr, &op.trace); - msg->trace.keyval("shard", i->first.shard.id); + msg->trace.init("ec sub read", nullptr, &rop.trace); + msg->trace.keyval("shard", pg_shard.shard.id); } - m.push_back(std::make_pair(i->first.osd, msg)); + m.push_back(std::make_pair(pg_shard.osd, msg)); } if (!m.empty()) { get_parent()->send_message_osd_cluster(m, get_osdmap_epoch()); } - dout(10) << __func__ << ": started " << op << dendl; + dout(10) << __func__ << ": started " << rop << dendl; } void ECCommon::ReadPipeline::get_want_to_read_shards( - std::set *want_to_read) const -{ - for (raw_shard_id_t i; i < (int)sinfo.get_k(); ++i) { - want_to_read->insert(static_cast(sinfo.get_shard(i))); + const list &to_read, + ECUtil::shard_extent_set_t &want_shard_reads) { + if (sinfo.supports_partial_reads()) { + // Optimised. + for (const auto &single_region: to_read) { + get_min_want_to_read_shards(single_region, want_shard_reads); + } + return; + } + + // Non-optimised version. 
+ for (const shard_id_t shard: sinfo.get_data_shards()) { + for (auto &&read: to_read) { + auto &&[offset, len] = sinfo.chunk_aligned_ro_range_to_shard_ro_range( + read.offset, read.size); + want_shard_reads[shard].union_insert(offset, len); + } } } -struct ClientReadCompleter : ECCommon::ReadCompleter { +struct ClientReadCompleter final : ECCommon::ReadCompleter { ClientReadCompleter(ECCommon::ReadPipeline &read_pipeline, - ECCommon::ClientAsyncReadStatus *status) + ECCommon::ClientAsyncReadStatus *status + ) : read_pipeline(read_pipeline), status(status) {} void finish_single_request( - const hobject_t &hoid, - ECCommon::read_result_t &res, - list to_read, - set wanted_to_read) override - { - auto* cct = read_pipeline.cct; + const hobject_t &hoid, + ECCommon::read_result_t &&res, + ECCommon::read_request_t &req) override { + auto *cct = read_pipeline.cct; dout(20) << __func__ << " completing hoid=" << hoid - << " res=" << res << " to_read=" << to_read << dendl; + << " res=" << res << " req=" << req << dendl; extent_map result; - if (res.r != 0) - goto out; - ceph_assert(res.returned.size() == to_read.size()); - ceph_assert(res.errors.empty()); - for (auto &&read: to_read) { - const auto bounds = make_pair(read.offset, read.size); - const auto aligned = - read_pipeline.sinfo.offset_len_to_chunk_bounds(bounds); - ceph_assert(res.returned.front().get<0>() == aligned.first); - ceph_assert(res.returned.front().get<1>() == aligned.second); - map to_decode; - bufferlist bl; - for (map::iterator j = - res.returned.front().get<2>().begin(); - j != res.returned.front().get<2>().end(); - ++j) { - to_decode[static_cast(j->first.shard)] = std::move(j->second); - } - dout(20) << __func__ << " going to decode: " - << " wanted_to_read=" << wanted_to_read - << " to_decode=" << to_decode - << dendl; - int r = ECUtil::decode( - read_pipeline.sinfo, - read_pipeline.ec_impl, - wanted_to_read, - to_decode, - &bl); - if (r < 0) { - dout(10) << __func__ << " error on ECUtil::decode r=" << r << dendl; - res.r = r; - goto out; - } - bufferlist trimmed; - // If partial stripe reads are disabled aligned_offset_in_stripe will - // be 0 which will mean trim_offset is 0. 
When partial reads are enabled - // the shards read (wanted_to_read) is a union of the requirements for - // each stripe, each range being read may need to trim unneeded shards - uint64_t aligned_offset_in_stripe = aligned.first - - read_pipeline.sinfo.logical_to_prev_stripe_offset(aligned.first); - uint64_t chunk_size = read_pipeline.sinfo.get_chunk_size(); - uint64_t trim_offset = 0; - for (auto shard : wanted_to_read) { - int s = static_cast(read_pipeline.sinfo.get_raw_shard(shard_id_t(shard))); - if ( s * chunk_size < aligned_offset_in_stripe) { - trim_offset += chunk_size; - } else { - break; - } + if (res.r == 0) { + ceph_assert(res.errors.empty()); +#if DEBUG_EC_BUFFERS + dout(20) << __func__ << ": before decode: " << res.buffers_read.debug_string(2048, 8) << dendl; +#endif + /* Decode any missing buffers */ + int r = res.buffers_read.decode(read_pipeline.ec_impl, + req.shard_want_to_read, + req.object_size); + ceph_assert( r == 0 ); + +#if DEBUG_EC_BUFFERS + dout(20) << __func__ << ": after decode: " << res.buffers_read.debug_string(2048, 8) << dendl; +#endif + + for (auto &&read: req.to_read) { + result.insert(read.offset, read.size, + res.buffers_read.get_ro_buffer(read.offset, read.size)); } - auto off = read.offset + trim_offset - aligned.first; - dout(20) << __func__ << " bl.length()=" << bl.length() - << " off=" << off - << " read.offset=" << read.offset - << " read.size=" << read.size - << " trim_offset="<< trim_offset << dendl; - ceph_assert(read.size <= bl.length() - off); - trimmed.substr_of(bl, off, read.size); - result.insert( - read.offset, trimmed.length(), std::move(trimmed)); - res.returned.pop_front(); } -out: dout(20) << __func__ << " calling complete_object with result=" << result << dendl; - status->complete_object(hoid, res.r, std::move(result)); + status->complete_object(hoid, res.r, std::move(result), + std::move(res.buffers_read)); read_pipeline.kick_reads(); } - void finish(int priority) && override - { + void finish(int priority) && override { // NOP } ECCommon::ReadPipeline &read_pipeline; ECCommon::ClientAsyncReadStatus *status; }; -static ostream& _prefix(std::ostream *_dout, ClientReadCompleter *read_completer) { + +static ostream &_prefix(std::ostream *_dout, + ClientReadCompleter const *read_completer) { return _prefix(_dout, &read_completer->read_pipeline); } void ECCommon::ReadPipeline::objects_read_and_reconstruct( - const map> &reads, - bool fast_read, - GenContextURef &&func) -{ - in_progress_client_reads.emplace_back( - reads.size(), std::move(func)); + const map> &reads, + const bool fast_read, + const uint64_t object_size, + GenContextURef &&func) { + in_progress_client_reads.emplace_back(reads.size(), std::move(func)); if (!reads.size()) { kick_reads(); return; } - map> obj_want_to_read; - map for_read_op; - for (auto &&to_read: reads) { - set want_to_read; - if (cct->_conf->osd_ec_partial_reads) { - for (const auto& single_region : to_read.second) { - get_min_want_to_read_shards(single_region.offset, - single_region.size, - &want_to_read); - } - } else { - get_want_to_read_shards(&want_to_read); - } - map>> shards; - int r = get_min_avail_to_read_shards( - to_read.first, - want_to_read, + for (auto &&[hoid, to_read]: reads) { + ECUtil::shard_extent_set_t want_shard_reads(sinfo.get_k_plus_m()); + get_want_to_read_shards(to_read, want_shard_reads); + + read_request_t read_request(to_read, want_shard_reads, false, object_size); + const int r = get_min_avail_to_read_shards( + hoid, false, fast_read, - &shards); + read_request); ceph_assert(r == 
0); - int subchunk_size = - sinfo.get_chunk_size() / ec_impl->get_sub_chunk_count(); + const int subchunk_size = + sinfo.get_chunk_size() / ec_impl->get_sub_chunk_count(); dout(20) << __func__ + << " to_read=" << to_read << " subchunk_size=" << subchunk_size << " chunk_size=" << sinfo.get_chunk_size() << dendl; - for_read_op.insert( - make_pair( - to_read.first, - read_request_t( - to_read.second, - shards, - false))); - obj_want_to_read.insert(make_pair(to_read.first, want_to_read)); + for_read_op.insert(make_pair(hoid, read_request)); } start_read_op( CEPH_MSG_PRIO_DEFAULT, - obj_want_to_read, for_read_op, - OpRequestRef(), fast_read, false, - std::make_unique(*this, &(in_progress_client_reads.back()))); + std::make_unique( + *this, &(in_progress_client_reads.back()))); } +void ECCommon::ReadPipeline::objects_read_and_reconstruct_for_rmw( + map &&to_read, + GenContextURef &&func) { + in_progress_client_reads.emplace_back(to_read.size(), std::move(func)); + if (!to_read.size()) { + kick_reads(); + return; + } -int ECCommon::ReadPipeline::send_all_remaining_reads( - const hobject_t &hoid, - ReadOp &rop) -{ - set already_read; - const set& ots = rop.obj_to_source[hoid]; - for (set::iterator i = ots.begin(); i != ots.end(); ++i) - already_read.insert(static_cast(i->shard)); - dout(10) << __func__ << " have/error shards=" << already_read << dendl; - map>> shards; - int r = get_remaining_shards(hoid, already_read, rop.want_to_read[hoid], - rop.complete[hoid], &shards, rop.for_recovery); - if (r) - return r; + map for_read_op; + for (auto &&[hoid, read_request]: to_read) { + const int r = + get_min_avail_to_read_shards(hoid, false, false, read_request); + ceph_assert(r == 0); + + const int subchunk_size = sinfo.get_chunk_size() / ec_impl-> + get_sub_chunk_count(); + dout(20) << __func__ + << " read_request=" << read_request + << " subchunk_size=" << subchunk_size + << " chunk_size=" << sinfo.get_chunk_size() << dendl; - list to_read = rop.to_read.find(hoid)->second.to_read; + for_read_op.insert(make_pair(hoid, read_request)); + } + start_read_op( + CEPH_MSG_PRIO_DEFAULT, + for_read_op, false, false, + std::make_unique( + *this, &(in_progress_client_reads.back()))); +} + + +int ECCommon::ReadPipeline::send_all_remaining_reads( + const hobject_t &hoid, + ReadOp &rop) { // (Note cuixf) If we need to read attrs and we read failed, try to read again. - bool want_attrs = - rop.to_read.find(hoid)->second.want_attrs && - (!rop.complete[hoid].attrs || rop.complete[hoid].attrs->empty()); + const bool want_attrs = + rop.to_read.at(hoid).want_attrs && + (!rop.complete.at(hoid).attrs || rop.complete.at(hoid).attrs->empty()); if (want_attrs) { dout(10) << __func__ << " want attrs again" << dendl; } - rop.to_read.erase(hoid); - rop.to_read.insert(make_pair( - hoid, - read_request_t( - to_read, - shards, - want_attrs))); - return 0; + read_request_t &read_request = rop.to_read.at(hoid); + // reset the old shard reads, we are going to read them again. 
+ read_request.shard_reads.clear(); + return get_remaining_shards(hoid, rop.complete.at(hoid), read_request, + rop.do_redundant_reads, want_attrs); } -void ECCommon::ReadPipeline::kick_reads() -{ +void ECCommon::ReadPipeline::kick_reads() { while (in_progress_client_reads.size() && in_progress_client_reads.front().is_complete()) { - in_progress_client_reads.front().run(); - in_progress_client_reads.pop_front(); + in_progress_client_reads.front().run(); + in_progress_client_reads.pop_front(); } } - -void ECCommon::RMWPipeline::start_rmw(OpRef op) -{ - ceph_assert(op); - dout(10) << __func__ << ": " << *op << dendl; - - ceph_assert(!tid_to_op_map.count(op->tid)); - waiting_state.push_back(*op); - tid_to_op_map[op->tid] = std::move(op); - check_ops(); +bool ec_align_t::operator==(const ec_align_t &other) const { + return offset == other.offset && size == other.size && flags == other.flags; } -bool ECCommon::RMWPipeline::try_state_to_reads() -{ - if (waiting_state.empty()) - return false; - - Op *op = &(waiting_state.front()); - if (op->requires_rmw() && pipeline_state.cache_invalid()) { - ceph_assert(get_parent()->get_pool().allows_ecoverwrites()); - dout(20) << __func__ << ": blocking " << *op - << " because it requires an rmw and the cache is invalid " - << pipeline_state - << dendl; - return false; - } - - if (!pipeline_state.caching_enabled()) { - op->using_cache = false; - } else if (op->invalidates_cache()) { - dout(20) << __func__ << ": invalidating cache after this op" - << dendl; - pipeline_state.invalidate(); - } - - waiting_state.pop_front(); - waiting_reads.push_back(*op); - - if (op->using_cache) { - cache.open_write_pin(op->pin); +bool ECCommon::shard_read_t::operator==(const shard_read_t &other) const { + return extents == other.extents && + subchunk == other.subchunk && + pg_shard == other.pg_shard; +} - extent_set empty; - for (auto &&hpair: op->plan.will_write) { - auto to_read_plan_iter = op->plan.to_read.find(hpair.first); - const extent_set &to_read_plan = - to_read_plan_iter == op->plan.to_read.end() ? 
- empty : - to_read_plan_iter->second; +bool ECCommon::read_request_t::operator==(const read_request_t &other) const { + return to_read == other.to_read && + flags == other.flags && + shard_want_to_read == other.shard_want_to_read && + shard_reads == other.shard_reads && + want_attrs == other.want_attrs; +} - extent_set remote_read = cache.reserve_extents_for_rmw( - hpair.first, - op->pin, - hpair.second, - to_read_plan); +void ECCommon::RMWPipeline::start_rmw(OpRef op) { + dout(20) << __func__ << " op=" << *op << dendl; - extent_set pending_read = to_read_plan; - pending_read.subtract(remote_read); + ceph_assert(!tid_to_op_map.contains(op->tid)); + tid_to_op_map[op->tid] = op; - if (!remote_read.empty()) { - op->remote_read[hpair.first] = std::move(remote_read); - } - if (!pending_read.empty()) { - op->pending_read[hpair.first] = std::move(pending_read); - } - } - } else { - op->remote_read = op->plan.to_read; - } + op->pending_cache_ops = op->plan.plans.size(); + waiting_commit.push_back(op); - dout(10) << __func__ << ": " << *op << dendl; - - if (!op->remote_read.empty()) { - ceph_assert(get_parent()->get_pool().allows_ecoverwrites()); - objects_read_async_no_cache( - op->remote_read, - [op, this](ec_extents_t &&results) { - for (auto &&i: results) { - op->remote_read_result.emplace(make_pair(i.first, i.second.emap)); - } - check_ops(); + for (auto &plan: op->plan.plans) { + ECExtentCache::OpRef cache_op = extent_cache.prepare(plan.hoid, + plan.to_read, + plan.will_write, + plan.orig_size, + plan.projected_size, + plan.invalidates_cache, + [op](ECExtentCache::OpRef const &cop) + { + op->cache_ready(cop->get_hoid(), cop->get_result()); }); + op->cache_ops.emplace_back(std::move(cache_op)); } - - return true; + extent_cache.execute(op->cache_ops); } -bool ECCommon::RMWPipeline::try_reads_to_commit() -{ - if (waiting_reads.empty()) - return false; - Op *op = &(waiting_reads.front()); - if (op->read_in_progress()) - return false; - waiting_reads.pop_front(); - waiting_commit.push_back(*op); - - dout(10) << __func__ << ": starting commit on " << *op << dendl; - dout(20) << __func__ << ": " << cache << dendl; - +void ECCommon::RMWPipeline::cache_ready(Op &op) { get_parent()->apply_stats( - op->hoid, - op->delta_stats); - - if (op->using_cache) { - for (auto &&hpair: op->pending_read) { - op->remote_read_result[hpair.first].insert( - cache.get_remaining_extents_for_rmw( - hpair.first, - op->pin, - hpair.second)); - } - op->pending_read.clear(); - } else { - ceph_assert(op->pending_read.empty()); - } + op.hoid, + op.delta_stats); - map trans; - for (set::const_iterator i = - get_parent()->get_acting_recovery_backfill_shards().begin(); - i != get_parent()->get_acting_recovery_backfill_shards().end(); - ++i) { - trans.emplace(i->shard, get_parent()->min_peer_features()); + shard_id_map trans(sinfo.get_k_plus_m()); + for (auto &&shard: get_parent()-> + get_acting_recovery_backfill_shard_id_set()) { + trans[shard]; } - op->trace.event("start ec write"); + op.trace.event("start ec write"); - map written; - op->generate_transactions( + map written; + op.generate_transactions( ec_impl, get_parent()->get_info().pgid.pgid, sinfo, &written, &trans, get_parent()->get_dpp(), - get_osdmap()->require_osd_release); + get_osdmap()); - dout(20) << __func__ << ": " << cache << dendl; - dout(20) << __func__ << ": written: " << written << dendl; - dout(20) << __func__ << ": op: " << *op << dendl; + dout(20) << __func__ << ": written: " << written << ", op: " << op << dendl; - if 
(!get_parent()->get_pool().allows_ecoverwrites()) { - for (auto &&i: op->log_entries) { + if (!sinfo.supports_ec_overwrites()) { + for (auto &&i: op.log_entries) { if (i.requires_kraken()) { - derr << __func__ << ": log entry " << i << " requires kraken" - << " but overwrites are not enabled!" << dendl; - ceph_abort(); + derr << __func__ << ": log entry " << i << " requires kraken" + << " but overwrites are not enabled!" << dendl; + ceph_abort(); } } } - map written_set; - for (auto &&i: written) { - written_set[i.first] = i.second.get_interval_set(); - } - dout(20) << __func__ << ": written_set: " << written_set << dendl; - ceph_assert(written_set == op->plan.will_write); - - if (op->using_cache) { - for (auto &&hpair: written) { - dout(20) << __func__ << ": " << hpair << dendl; - cache.present_rmw_update(hpair.first, op->pin, hpair.second); - } - } - op->remote_read.clear(); - op->remote_read_result.clear(); - ObjectStore::Transaction empty; bool should_write_local = false; ECSubWrite local_write_op; std::vector> messages; messages.reserve(get_parent()->get_acting_recovery_backfill_shards().size()); set backfill_shards = get_parent()->get_backfill_shards(); - for (set::const_iterator i = - get_parent()->get_acting_recovery_backfill_shards().begin(); - i != get_parent()->get_acting_recovery_backfill_shards().end(); - ++i) { - op->pending_apply.insert(*i); - op->pending_commit.insert(*i); - map::iterator iter = - trans.find(i->shard); - ceph_assert(iter != trans.end()); - bool should_send = get_parent()->should_send_op(*i, op->hoid); + + if (op.version.version != 0) { + if (oid_to_version.contains(op.hoid)) { + ceph_assert(oid_to_version.at(op.hoid) <= op.version); + } + oid_to_version[op.hoid] = op.version; + } + for (auto &&pg_shard: get_parent()->get_acting_recovery_backfill_shards()) { + ObjectStore::Transaction &transaction = trans.at(pg_shard.shard); + shard_id_t shard = pg_shard.shard; + if (transaction.empty()) { + dout(20) << __func__ << " Transaction for osd." << pg_shard.osd << " shard " << shard << " is empty" << dendl; + } else { + dout(20) << __func__ << " Transaction for osd." << pg_shard.osd << " shard " << shard << " contents "; + Formatter *f = Formatter::create("json"); + f->open_object_section("t"); + transaction.dump(f); + f->close_section(); + f->flush(*_dout); + delete f; + *_dout << dendl; + } + if (op.skip_transaction(pending_roll_forward, shard, transaction)) { + // Must be an empty transaction + ceph_assert(transaction.empty()); + dout(20) << __func__ << " Skipping transaction for osd." << shard << dendl; + continue; + } + op.pending_commits++; + bool should_send = get_parent()->should_send_op(pg_shard, op.hoid); const pg_stat_t &stats = - (should_send || !backfill_shards.count(*i)) ? - get_info().stats : - get_parent()->get_shard_info().find(*i)->second.stats; + (should_send || !backfill_shards.contains(pg_shard)) + ? get_info().stats + : get_parent()->get_shard_info().find(pg_shard)->second.stats; ECSubWrite sop( get_parent()->whoami_shard(), - op->tid, - op->reqid, - op->hoid, + op.tid, + op.reqid, + op.hoid, stats, - should_send ? iter->second : empty, - op->version, - op->trim_to, - op->pg_committed_to, - op->log_entries, - op->updated_hit_set_history, - op->temp_added, - op->temp_cleared, + should_send ? 
transaction : empty, + op.version, + op.trim_to, + op.pg_committed_to, + op.log_entries, + op.updated_hit_set_history, + op.temp_added, + op.temp_cleared, !should_send); ZTracer::Trace trace; - if (op->trace) { + if (op.trace) { // initialize a child span for this shard - trace.init("ec sub write", nullptr, &op->trace); - trace.keyval("shard", i->shard.id); + trace.init("ec sub write", nullptr, &op.trace); + trace.keyval("shard", pg_shard.shard.id); } - if (*i == get_parent()->whoami_shard()) { + if (pg_shard == get_parent()->whoami_shard()) { should_write_local = true; local_write_op.claim(sop); } else if (cct->_conf->bluestore_debug_inject_read_err && - ECInject::test_write_error1(ghobject_t(op->hoid, - ghobject_t::NO_GEN, i->shard))) { + ECInject::test_write_error1(ghobject_t(op.hoid, + ghobject_t::NO_GEN, + pg_shard.shard))) { dout(0) << " Error inject - Dropping write message to shard " << - i->shard << dendl; + pg_shard.shard << dendl; } else { - MOSDECSubOpWrite *r = new MOSDECSubOpWrite(sop); - r->pgid = spg_t(get_parent()->primary_spg_t().pgid, i->shard); + auto *r = new MOSDECSubOpWrite(sop); + r->pgid = spg_t(get_parent()->primary_spg_t().pgid, pg_shard.shard); r->map_epoch = get_osdmap_epoch(); r->min_epoch = get_parent()->get_interval_start_epoch(); r->trace = trace; - messages.push_back(std::make_pair(i->osd, r)); + messages.push_back(std::make_pair(pg_shard.osd, r)); } } @@ -947,148 +796,151 @@ bool ECCommon::RMWPipeline::try_reads_to_commit() if (should_write_local) { handle_sub_write( get_parent()->whoami_shard(), - op->client_op, + op.client_op, local_write_op, - op->trace); + op.trace); } - for (auto i = op->on_write.begin(); - i != op->on_write.end(); - op->on_write.erase(i++)) { - (*i)(); - } - return true; + for (auto &cop: op.cache_ops) { + const hobject_t &oid = cop->get_hoid(); + if (written.contains(oid)) { + extent_cache.write_done(cop, std::move(written.at(oid))); + } else { + extent_cache.write_done(cop, ECUtil::shard_extent_map_t(&sinfo)); + } + } } -struct ECDummyOp : ECCommon::RMWPipeline::Op { +struct ECDummyOp final : ECCommon::RMWPipeline::Op { void generate_transactions( - ceph::ErasureCodeInterfaceRef &ecimpl, + ceph::ErasureCodeInterfaceRef &ec_impl, pg_t pgid, const ECUtil::stripe_info_t &sinfo, - std::map *written, - std::map *transactions, + map *written, + shard_id_map *transactions, DoutPrefixProvider *dpp, - const ceph_release_t require_osd_release) final - { + const OSDMapRef &osdmap + ) override { // NOP, as -- in constrast to ECClassicalOp -- there is no // transaction involved } + + bool skip_transaction( + std::set &pending_roll_forward, + const shard_id_t shard, + ceph::os::Transaction &transaction + ) override { + return !pending_roll_forward.erase(shard); + } }; -bool ECCommon::RMWPipeline::try_finish_rmw() -{ - if (waiting_commit.empty()) - return false; - Op *op = &(waiting_commit.front()); - if (op->write_in_progress()) - return false; - waiting_commit.pop_front(); +void ECCommon::RMWPipeline::try_finish_rmw() { + while (waiting_commit.size() > 0) { + OpRef op = waiting_commit.front(); + + if (op->pending_commits != 0 || op->pending_cache_ops != 0) { + return; + } + + waiting_commit.pop_front(); + finish_rmw(op); + } +} + +void ECCommon::RMWPipeline::finish_rmw(OpRef const &op) { + dout(20) << __func__ << " op=" << *op << dendl; - dout(10) << __func__ << ": " << *op << dendl; - dout(20) << __func__ << ": " << cache << dendl; + if (op->on_all_commit) { + dout(10) << __func__ << " Calling on_all_commit on " << op << dendl; + 
op->on_all_commit->complete(0); + op->on_all_commit = nullptr; + op->trace.event("ec write all committed"); + } if (op->pg_committed_to > completed_to) completed_to = op->pg_committed_to; if (op->version > committed_to) committed_to = op->version; - if (get_osdmap()->require_osd_release >= ceph_release_t::kraken) { - if (op->version > get_parent()->get_log().get_can_rollback_to() && - waiting_reads.empty() && - waiting_commit.empty()) { + op->cache_ops.clear(); + + if (extent_cache.idle()) { + if (op->version > get_parent()->get_log().get_can_rollback_to()) { + const int transactions_since_last_idle = extent_cache. + get_and_reset_counter(); + dout(20) << __func__ << " version=" << op->version << " ec_counter=" << + transactions_since_last_idle << dendl; // submit a dummy, transaction-empty op to kick the rollforward - auto tid = get_parent()->get_tid(); - auto nop = std::make_unique(); + const auto tid = get_parent()->get_tid(); + const auto nop = std::make_shared(); nop->hoid = op->hoid; nop->trim_to = op->trim_to; nop->pg_committed_to = op->version; nop->tid = tid; nop->reqid = op->reqid; - waiting_reads.push_back(*nop); - tid_to_op_map[tid] = std::move(nop); - } - } + nop->pending_cache_ops = 1; + nop->pipeline = this; - if (op->using_cache) { - cache.release_write_pin(op->pin); - } - tid_to_op_map.erase(op->tid); + tid_to_op_map[tid] = nop; - if (waiting_reads.empty() && - waiting_commit.empty()) { - pipeline_state.clear(); - dout(20) << __func__ << ": clearing pipeline_state " - << pipeline_state - << dendl; + /* The cache is idle (we checked above) and this IO never blocks for reads + * so we can skip the extent cache and immediately call the completion. + */ + nop->cache_ready(nop->hoid, ECUtil::shard_extent_map_t(&sinfo)); + } } - return true; -} -void ECCommon::RMWPipeline::check_ops() -{ - while (try_state_to_reads() || - try_reads_to_commit() || - try_finish_rmw()); + tid_to_op_map.erase(op->tid); } -void ECCommon::RMWPipeline::on_change() -{ +void ECCommon::RMWPipeline::on_change() { dout(10) << __func__ << dendl; completed_to = eversion_t(); committed_to = eversion_t(); - pipeline_state.clear(); - waiting_reads.clear(); - waiting_state.clear(); - waiting_commit.clear(); - for (auto &&op: tid_to_op_map) { - cache.release_write_pin(op.second->pin); - } + extent_cache.on_change(); tid_to_op_map.clear(); + oid_to_version.clear(); + waiting_commit.clear(); +} + +void ECCommon::RMWPipeline::on_change2() { + extent_cache.on_change2(); } void ECCommon::RMWPipeline::call_write_ordered(std::function &&cb) { - if (!waiting_state.empty()) { - waiting_state.back().on_write.emplace_back(std::move(cb)); - } else if (!waiting_reads.empty()) { - waiting_reads.back().on_write.emplace_back(std::move(cb)); - } else { - // Nothing earlier in the pipeline, just call it - cb(); - } + extent_cache.add_on_write(std::move(cb)); } ECUtil::HashInfoRef ECCommon::UnstableHashInfoRegistry::maybe_put_hash_info( - const hobject_t &hoid, - ECUtil::HashInfo &&hinfo) -{ + const hobject_t &hoid, + ECUtil::HashInfo &&hinfo) { return registry.lookup_or_create(hoid, hinfo); } ECUtil::HashInfoRef ECCommon::UnstableHashInfoRegistry::get_hash_info( - const hobject_t &hoid, - bool create, - const map>& attrs, - uint64_t size) -{ + const hobject_t &hoid, + bool create, + const map> &attrs, + uint64_t size) { dout(10) << __func__ << ": Getting attr on " << hoid << dendl; - ECUtil::HashInfoRef ref = registry.lookup(hoid); + auto ref = registry.lookup(hoid); if (!ref) { dout(10) << __func__ << ": not in cache " << hoid << 
dendl; ECUtil::HashInfo hinfo(ec_impl->get_chunk_count()); bufferlist bl; - map::const_iterator k = attrs.find(ECUtil::get_hinfo_key()); - if (k == attrs.end()) { - dout(5) << __func__ << " " << hoid << " missing hinfo attr" << dendl; + if (attrs.contains(ECUtil::get_hinfo_key())) { + bl = attrs.at(ECUtil::get_hinfo_key()); } else { - bl = k->second; + dout(30) << __func__ << " " << hoid << " missing hinfo attr" << dendl; } if (bl.length() > 0) { auto bp = bl.cbegin(); try { decode(hinfo, bp); - } catch(...) { + } + catch (...) { dout(0) << __func__ << ": Can't decode hinfo for " << hoid << dendl; return ECUtil::HashInfoRef(); } @@ -1096,10 +948,10 @@ ECUtil::HashInfoRef ECCommon::UnstableHashInfoRegistry::get_hash_info( dout(0) << __func__ << ": Mismatch of total_chunk_size " << hinfo.get_total_chunk_size() << dendl; return ECUtil::HashInfoRef(); - } else { - create = true; } - } else if (size == 0) { // If empty object and no hinfo, create it + create = true; + } else if (size == 0) { + // If empty object and no hinfo, create it create = true; } if (create) { @@ -1108,5 +960,3 @@ ECUtil::HashInfoRef ECCommon::UnstableHashInfoRegistry::get_hash_info( } return ref; } - -END_IGNORE_DEPRECATED diff --git a/src/osd/ECCommon.h b/src/osd/ECCommon.h index 8135269274d59..a17aff017ffd3 100644 --- a/src/osd/ECCommon.h +++ b/src/osd/ECCommon.h @@ -22,7 +22,6 @@ #include "ECUtil.h" #include "ECTypes.h" #if WITH_CRIMSON -#include "ExtentCache.h" #include "crimson/osd/object_context.h" #include "os/Transaction.h" #include "osd/OSDMap.h" @@ -32,7 +31,7 @@ struct ECTransaction { struct WritePlan { bool invalidates_cache = false; // Yes, both are possible std::map to_read; - std::map will_write; // superset of to_read + std::map will_write; std::map hash_infos; }; @@ -45,8 +44,9 @@ typedef crimson::osd::ObjectContextRef ObjectContextRef; #endif #include "ECTransaction.h" -#include "ExtentCache.h" +#include "ECExtentCache.h" #include "ECListener.h" +#include "common/dout.h" //forward declaration struct ECSubWrite; @@ -56,36 +56,86 @@ struct ECCommon { struct ec_extent_t { int err; extent_map emap; + ECUtil::shard_extent_map_t shard_extent_map; + + void print(std::ostream &os) const { + os << err << "," << emap; + } }; - friend std::ostream &operator<<(std::ostream &lhs, const ec_extent_t &rhs); + using ec_extents_t = std::map; virtual ~ECCommon() = default; virtual void handle_sub_write( - pg_shard_t from, - OpRequestRef msg, - ECSubWrite &op, - const ZTracer::Trace &trace, - ECListener& eclistener - ) = 0; + pg_shard_t from, + OpRequestRef msg, + ECSubWrite &op, + const ZTracer::Trace &trace, + ECListener &eclistener) = 0; virtual void objects_read_and_reconstruct( - const std::map> &reads, - bool fast_read, - GenContextURef &&func) = 0; + const std::map> &reads, + bool fast_read, + uint64_t object_size, + GenContextURef &&func) = 0; + + struct shard_read_t { + extent_set extents; + std::optional>> subchunk; + pg_shard_t pg_shard; + bool operator==(const shard_read_t &other) const; + + void print(std::ostream &os) const { + os << "shard_read_t(extents=[" << extents << "]" + << ", subchunk=" << subchunk + << ", pg_shard=" << pg_shard + << ")"; + } + }; struct read_request_t { const std::list to_read; - std::map>> need; - bool want_attrs; + const uint32_t flags = 0; + const ECUtil::shard_extent_set_t shard_want_to_read; + shard_id_map shard_reads; + bool want_attrs = false; + uint64_t object_size; + read_request_t( - const std::list &to_read, - const std::map>> &need, - bool want_attrs) - : 
to_read(to_read), need(need), want_attrs(want_attrs) {} + const std::list &to_read, + const ECUtil::shard_extent_set_t &shard_want_to_read, + bool want_attrs, uint64_t object_size) : + to_read(to_read), + flags(to_read.front().flags), + shard_want_to_read(shard_want_to_read), + shard_reads(shard_want_to_read.get_max_shards()), + want_attrs(want_attrs), + object_size(object_size) {} + + read_request_t(const ECUtil::shard_extent_set_t &shard_want_to_read, + bool want_attrs, uint64_t object_size) : + shard_want_to_read(shard_want_to_read), + shard_reads(shard_want_to_read.get_max_shards()), + want_attrs(want_attrs), + object_size(object_size) {} + + bool operator==(const read_request_t &other) const; + + void print(std::ostream &os) const { + os << "read_request_t(to_read=[" << to_read << "]" + << ", flags=" << flags + << ", shard_want_to_read=" << shard_want_to_read + << ", shard_reads=" << shard_reads + << ", want_attrs=" << want_attrs + << ")"; + } }; - friend std::ostream &operator<<(std::ostream &lhs, const read_request_t &rhs); + + virtual void objects_read_and_reconstruct_for_rmw( + std::map &&to_read, + GenContextURef &&func) = 0; + struct ReadOp; /** * Low level async read mechanism @@ -111,19 +161,30 @@ struct ECCommon { struct read_result_t { int r; std::map errors; - std::optional> > attrs; - std::list< - boost::tuple< - uint64_t, uint64_t, std::map > > returned; - read_result_t() : r(0) {} + std::optional>> attrs; + ECUtil::shard_extent_map_t buffers_read; + ECUtil::shard_extent_set_t processed_read_requests; + + read_result_t(const ECUtil::stripe_info_t *sinfo) : + r(0), buffers_read(sinfo), + processed_read_requests(sinfo->get_k_plus_m()) {} + + void print(std::ostream &os) const { + os << "read_result_t(r=" << r << ", errors=" << errors; + if (attrs) { + os << ", attrs=" << *(attrs); + } else { + os << ", noattrs"; + } + os << ", buffers_read=" << buffers_read << ")"; + } }; struct ReadCompleter { virtual void finish_single_request( - const hobject_t &hoid, - read_result_t &res, - std::list to_read, - std::set wanted_to_read) = 0; + const hobject_t &hoid, + read_result_t &&res, + ECCommon::read_request_t &req) = 0; virtual void finish(int priority) && = 0; @@ -131,26 +192,35 @@ struct ECCommon { }; friend struct CallClientContexts; + struct ClientAsyncReadStatus { unsigned objects_to_read; - GenContextURef func; + GenContextURef func; ec_extents_t results; + explicit ClientAsyncReadStatus( - unsigned objects_to_read, - GenContextURef &&func) + unsigned objects_to_read, + GenContextURef &&func) : objects_to_read(objects_to_read), func(std::move(func)) {} + void complete_object( - const hobject_t &hoid, - int err, - extent_map &&buffers) { + const hobject_t &hoid, + int err, + extent_map &&buffers, + ECUtil::shard_extent_map_t &&shard_extent_map) { ceph_assert(objects_to_read); --objects_to_read; - ceph_assert(!results.count(hoid)); - results.emplace(hoid, ec_extent_t{err, std::move(buffers)}); + ceph_assert(!results.contains(hoid)); + results.emplace(hoid, ec_extent_t{ + err, std::move(buffers), + std::move(shard_extent_map) + }); } + bool is_complete() const { return objects_to_read == 0; } + void run() { func.release()->complete(std::move(results)); } @@ -171,113 +241,125 @@ struct ECCommon { ZTracer::Trace trace; - std::map> want_to_read; std::map to_read; std::map complete; std::map> obj_to_source; - std::map > source_to_obj; + std::map> source_to_obj; void dump(ceph::Formatter *f) const; std::set in_progress; + std::list debug_log; + ReadOp( - int priority, - ceph_tid_t 
tid, - bool do_redundant_reads, - bool for_recovery, - std::unique_ptr _on_complete, - OpRequestRef op, - std::map> &&_want_to_read, - std::map &&_to_read) + int priority, + ceph_tid_t tid, + bool do_redundant_reads, + bool for_recovery, + std::unique_ptr _on_complete, + std::map &&_to_read) : priority(priority), tid(tid), - op(op), do_redundant_reads(do_redundant_reads), for_recovery(for_recovery), on_complete(std::move(_on_complete)), - want_to_read(std::move(_want_to_read)), - to_read(std::move(_to_read)) { - for (auto &&hpair: to_read) { - auto &returned = complete[hpair.first].returned; - for (auto &&extent: hpair.second.to_read) { - returned.push_back( - boost::make_tuple( - extent.offset, - extent.size, - std::map())); - } - } - } + to_read(std::move(_to_read)) {} + ReadOp() = delete; ReadOp(const ReadOp &) = delete; // due to on_complete being unique_ptr ReadOp(ReadOp &&) = default; + + void print(std::ostream &os) const { + os << "ReadOp(tid=" << tid; +#ifndef WITH_CRIMSON + if (op && op->get_req()) { + os << ", op="; + op->get_req()->print(os); + } +#endif + os << ", to_read=" << to_read << ", complete=" << complete + << ", priority=" << priority << ", obj_to_source=" << obj_to_source + << ", source_to_obj=" << source_to_obj << ", in_progress=" << + in_progress + << ", debug_log=" << debug_log << ")"; + } }; + struct ReadPipeline { void objects_read_and_reconstruct( - const std::map> &reads, - bool fast_read, - GenContextURef &&func); + const std::map> &reads, + bool fast_read, + uint64_t object_size, + GenContextURef &&func); + + void objects_read_and_reconstruct_for_rmw( + std::map &&to_read, + GenContextURef &&func); template void filter_read_op( - const OSDMapRef& osdmap, - ReadOp &op, - F&& on_erase, - G&& on_schedule_recovery); + const OSDMapRef &osdmap, + ReadOp &op, + F &&on_erase, + G &&on_schedule_recovery); template void check_recovery_sources( - const OSDMapRef& osdmap, - F&& on_erase, - G&& on_schedule_recovery); + const OSDMapRef &osdmap, + F &&on_erase, + G &&on_schedule_recovery); - void complete_read_op(ReadOp &rop); + void complete_read_op(ReadOp &&rop); void start_read_op( - int priority, - std::map> &want_to_read, - std::map &to_read, - OpRequestRef op, - bool do_redundant_reads, - bool for_recovery, - std::unique_ptr on_complete); + int priority, + std::map &to_read, + bool do_redundant_reads, + bool for_recovery, + std::unique_ptr on_complete); void do_read_op(ReadOp &rop); int send_all_remaining_reads( - const hobject_t &hoid, - ReadOp &rop); + const hobject_t &hoid, + ReadOp &rop); void on_change(); void kick_reads(); std::map tid_to_read_map; - std::map > shard_to_read_map; + std::map> shard_to_read_map; std::list in_progress_client_reads; - CephContext* cct; + CephContext *cct; ceph::ErasureCodeInterfaceRef ec_impl; - const ECUtil::stripe_info_t& sinfo; + const ECUtil::stripe_info_t &sinfo; // TODO: lay an interface down here - ECListener* parent; + ECListener *parent; ECListener *get_parent() const { return parent; } - const OSDMapRef& get_osdmap() const { return get_parent()->pgb_get_osdmap(); } - epoch_t get_osdmap_epoch() const { return get_parent()->pgb_get_osdmap_epoch(); } - const pg_info_t &get_info() { return get_parent()->get_info(); } - ReadPipeline(CephContext* cct, - ceph::ErasureCodeInterfaceRef ec_impl, - const ECUtil::stripe_info_t& sinfo, - ECListener* parent) + const OSDMapRef &get_osdmap() const { + return get_parent()->pgb_get_osdmap(); + } + + epoch_t get_osdmap_epoch() const { + return get_parent()->pgb_get_osdmap_epoch(); + } + + 
const pg_info_t &get_info() const { return get_parent()->get_info(); } + + ReadPipeline(CephContext *cct, + ceph::ErasureCodeInterfaceRef ec_impl, + const ECUtil::stripe_info_t &sinfo, + ECListener *parent) : cct(cct), ec_impl(std::move(ec_impl)), sinfo(sinfo), - parent(parent) { - } + parent(parent) {} /** * While get_want_to_read_shards creates a want_to_read based on the EC @@ -289,47 +371,42 @@ struct ECCommon { * */ void get_min_want_to_read_shards( - uint64_t offset, ///< [in] - uint64_t length, ///< [in] - std::set *want_to_read ///< [out] - ); - static void get_min_want_to_read_shards( - const uint64_t offset, - const uint64_t length, - const ECUtil::stripe_info_t& sinfo, - std::set *want_to_read); + const ec_align_t &to_read, ///< [in] + ECUtil::shard_extent_set_t &want_shard_reads); ///< [out] int get_remaining_shards( - const hobject_t &hoid, - const std::set &avail, - const std::set &want, - const read_result_t &result, - std::map>> *to_read, - bool for_recovery); + const hobject_t &hoid, + read_result_t &read_result, + read_request_t &read_request, + bool for_recovery, + bool fast_read); void get_all_avail_shards( - const hobject_t &hoid, - const std::set &error_shards, - std::set &have, - std::map &shards, - bool for_recovery); + const hobject_t &hoid, + shard_id_set &have, + shard_id_map &shards, + bool for_recovery, + const std::optional> &error_shards = std::nullopt); + + std::pair get_readable_writable_shard_id_sets(); - friend std::ostream &operator<<(std::ostream &lhs, const ReadOp &rhs); friend struct FinishReadOp; - void get_want_to_read_shards(std::set *want_to_read) const; + void get_want_to_read_shards( + const std::list &to_read, + ECUtil::shard_extent_set_t &want_shard_reads); /// Returns to_read replicas sufficient to reconstruct want int get_min_avail_to_read_shards( - const hobject_t &hoid, ///< [in] object - const std::set &want, ///< [in] desired shards - bool for_recovery, ///< [in] true if we may use non-acting replicas - bool do_redundant_reads, ///< [in] true if we want to issue redundant reads to reduce latency - std::map>> *to_read ///< [out] shards, corresponding subchunks to read + const hobject_t &hoid, ///< [in] object + bool for_recovery, ///< [in] true if we may use non-acting replicas + bool do_redundant_reads, + ///< [in] true if we want to issue redundant reads to reduce latency + read_request_t &read_request, + ///< [out] shard_reads, corresponding subchunks / other sub reads to read + const std::optional> &error_shards = std::nullopt + //< [in] Shards where reads have failed (optional) ); ///< @return error code, 0 on success - - void schedule_recovery_work(); - }; /** @@ -346,7 +423,7 @@ struct ECCommon { * on the writing std::list. 
*/ - struct RMWPipeline { + struct RMWPipeline : ECExtentCache::BackendReadListener { struct Op : boost::intrusive::list_base_hook<> { /// From submit_transaction caller, describes operation hobject_t hoid; @@ -374,171 +451,176 @@ struct ECCommon { /// Ancillary also provided from submit_transaction caller std::map obc_map; - /// see call_write_ordered - std::list > on_write; - /// Generated internally std::set temp_added; std::set temp_cleared; ECTransaction::WritePlan plan; - bool requires_rmw() const { return !plan.to_read.empty(); } - bool invalidates_cache() const { return plan.invalidates_cache; } + bool requires_rmw() const { return !plan.want_read; } // must be true if requires_rmw(), must be false if invalidates_cache() bool using_cache = true; /// In progress read state; - std::map pending_read; // subset already being read - std::map remote_read; // subset we must read - std::map remote_read_result; - bool read_in_progress() const { - return !remote_read.empty() && remote_read_result.empty(); - } + int pending_cache_ops = 0; + std::map remote_shard_extent_map; /// In progress write state. - std::set pending_commit; - // we need pending_apply for pre-mimic peers so that we don't issue a - // read on a remote shard before it has applied a previous write. We can - // remove this after nautilus. - std::set pending_apply; + int pending_commits = 0; + bool write_in_progress() const { - return !pending_commit.empty() || !pending_apply.empty(); + return pending_commits != 0; } /// optional, may be null, for tracking purposes OpRequestRef client_op; /// pin for cache - ExtentCache::write_pin pin; + std::list cache_ops; + RMWPipeline *pipeline; + + Op() : tid(), plan(), pipeline(nullptr) {} /// Callbacks Context *on_all_commit = nullptr; + virtual ~Op() { delete on_all_commit; } virtual void generate_transactions( - ceph::ErasureCodeInterfaceRef &ecimpl, - pg_t pgid, - const ECUtil::stripe_info_t &sinfo, - std::map *written, - std::map *transactions, - DoutPrefixProvider *dpp, - const ceph_release_t require_osd_release = ceph_release_t::unknown) = 0; - }; - using OpRef = std::unique_ptr; - using op_list = boost::intrusive::list; - friend std::ostream &operator<<(std::ostream &lhs, const Op &rhs); + ceph::ErasureCodeInterfaceRef &ec_impl, + pg_t pgid, + const ECUtil::stripe_info_t &sinfo, + std::map *written, + shard_id_map *transactions, + DoutPrefixProvider *dpp, + const OSDMapRef &osdmap) = 0; + + virtual bool skip_transaction( + std::set &pending_roll_forward, + shard_id_t shard, + ceph::os::Transaction &transaction) = 0; + + void cache_ready(const hobject_t &oid, const ECUtil::shard_extent_map_t &result) { + if (!result.empty()) { + remote_shard_extent_map.insert(std::pair(oid, result)); + } - ExtentCache cache; - std::map tid_to_op_map; /// Owns Op structure - /** - * We model the possible rmw states as a std::set of waitlists. - * All writes at this time complete in order, so a write blocked - * at waiting_state blocks all writes behind it as well (same for - * other states). - * - * Future work: We can break this up into a per-object pipeline - * (almost). First, provide an ordering token to submit_transaction - * and require that all operations within a single transaction take - * place on a subset of hobject_t space partitioned by that token - * (the hashid seem about right to me -- even works for temp objects - * if you recall that a temp object created for object head foo will - * only ever be referenced by other transactions on foo and aren't - * reused). 
Next, factor this part into a class and maintain one per - * ordering token. Next, fixup PrimaryLogPG's repop queue to be - * partitioned by ordering token. Finally, refactor the op pipeline - * so that the log entries passed into submit_transaction aren't - * versioned. We can't assign versions to them until we actually - * submit the operation. That's probably going to be the hard part. - */ - class pipeline_state_t { - enum { - CACHE_VALID = 0, - CACHE_INVALID = 1 - } pipeline_state = CACHE_VALID; - public: - bool caching_enabled() const { - return pipeline_state == CACHE_VALID; - } - bool cache_invalid() const { - return !caching_enabled(); - } - void invalidate() { - pipeline_state = CACHE_INVALID; + if (!--pending_cache_ops) { + pipeline->cache_ready(*this); + } } - void clear() { - pipeline_state = CACHE_VALID; + + void print(std::ostream &os) const { + os << "Op(" << hoid << " v=" << version << " tt=" << trim_to + << " tid=" << tid << " reqid=" << reqid; +#ifndef WITH_CRIMSON + if (client_op && client_op->get_req()) { + os << " client_op="; + client_op->get_req()->print(os); + } +#endif + os << " pg_committed_to=" << pg_committed_to + << " temp_added=" << temp_added + << " temp_cleared=" << temp_cleared + << " remote_read_result=" << remote_shard_extent_map + << " pending_commits=" << pending_commits + << " plan.to_read=" << plan + << ")"; } - friend std::ostream &operator<<(std::ostream &lhs, const pipeline_state_t &rhs); - } pipeline_state; + }; + + void backend_read(hobject_t oid, ECUtil::shard_extent_set_t const &request, + uint64_t object_size) override { + std::map to_read; + to_read.emplace(oid, read_request_t(request, false, object_size)); + + objects_read_async_no_cache( + std::move(to_read), + [this](ec_extents_t &&results) { + for (auto &&[oid, result]: results) { + extent_cache.read_done(oid, std::move(result.shard_extent_map)); + } + }); + } - op_list waiting_state; /// writes waiting on pipe_state - op_list waiting_reads; /// writes waiting on partial stripe reads - op_list waiting_commit; /// writes waiting on initial commit + using OpRef = std::shared_ptr; + + std::map tid_to_op_map; /// Owns Op structure + std::map oid_to_version; + + std::list waiting_commit; eversion_t completed_to; eversion_t committed_to; void start_rmw(OpRef op); - bool try_state_to_reads(); - bool try_reads_to_commit(); - bool try_finish_rmw(); - void check_ops(); + void cache_ready(Op &op); + void try_finish_rmw(); + void finish_rmw(OpRef const &op); void on_change(); + void on_change2(); void call_write_ordered(std::function &&cb); - CephContext* cct; + CephContext *cct; ECListener *get_parent() const { return parent; } - const OSDMapRef& get_osdmap() const { return get_parent()->pgb_get_osdmap(); } - epoch_t get_osdmap_epoch() const { return get_parent()->pgb_get_osdmap_epoch(); } - const pg_info_t &get_info() { return get_parent()->get_info(); } + + const OSDMapRef &get_osdmap() const { + return get_parent()->pgb_get_osdmap(); + } + + epoch_t get_osdmap_epoch() const { + return get_parent()->pgb_get_osdmap_epoch(); + } + + const pg_info_t &get_info() const { return get_parent()->get_info(); } template void objects_read_async_no_cache( - const std::map &to_read, - Func &&on_complete - ) { - std::map> _to_read; - for (auto &&hpair: to_read) { - auto &l = _to_read[hpair.first]; - for (auto extent: hpair.second) { - l.emplace_back(ec_align_t{extent.first, extent.second, 0}); - } - } - ec_backend.objects_read_and_reconstruct( - _to_read, - false, + std::map &&to_read, + Func &&on_complete) { + 
ec_backend.objects_read_and_reconstruct_for_rmw( + std::move(to_read), make_gen_lambda_context< - ECCommon::ec_extents_t &&, Func>( - std::forward(on_complete))); + ECCommon::ec_extents_t&&, Func>( + std::forward(on_complete))); } + void handle_sub_write( - pg_shard_t from, - OpRequestRef msg, - ECSubWrite &op, - const ZTracer::Trace &trace - ) { - ec_backend.handle_sub_write(from, std::move(msg), op, trace, *get_parent()); + pg_shard_t from, + OpRequestRef msg, + ECSubWrite &op, + const ZTracer::Trace &trace) const { + ec_backend.handle_sub_write(from, std::move(msg), op, trace, + *get_parent()); } + // end of iface + // Set of shards that will need a dummy transaction for the final + // roll forward + std::set pending_roll_forward; + ceph::ErasureCodeInterfaceRef ec_impl; - const ECUtil::stripe_info_t& sinfo; - ECListener* parent; - ECCommon& ec_backend; + const ECUtil::stripe_info_t &sinfo; + ECListener *parent; + ECCommon &ec_backend; + ECExtentCache extent_cache; + uint64_t ec_pdw_write_mode; - RMWPipeline(CephContext* cct, + RMWPipeline(CephContext *cct, ceph::ErasureCodeInterfaceRef ec_impl, - const ECUtil::stripe_info_t& sinfo, - ECListener* parent, - ECCommon& ec_backend) + const ECUtil::stripe_info_t &sinfo, + ECListener *parent, + ECCommon &ec_backend, + ECExtentCache::LRU &ec_extent_cache_lru) : cct(cct), ec_impl(std::move(ec_impl)), sinfo(sinfo), parent(parent), - ec_backend(ec_backend) { - } + ec_backend(ec_backend), + extent_cache(*this, ec_extent_cache_lru, sinfo, cct), + ec_pdw_write_mode(cct->_conf.get_val("ec_pdw_write_mode")) {} }; class UnstableHashInfoRegistry { @@ -547,47 +629,35 @@ struct ECCommon { /// If modified, ensure that the ref is held until the update is applied SharedPtrRegistry registry; - public: + public: UnstableHashInfoRegistry( - CephContext *cct, - ceph::ErasureCodeInterfaceRef ec_impl) + CephContext *cct, + ceph::ErasureCodeInterfaceRef ec_impl) : cct(cct), - ec_impl(std::move(ec_impl)) {} + ec_impl(std::move(ec_impl)) {} ECUtil::HashInfoRef maybe_put_hash_info( - const hobject_t &hoid, - ECUtil::HashInfo &&hinfo); + const hobject_t &hoid, + ECUtil::HashInfo &&hinfo); ECUtil::HashInfoRef get_hash_info( - const hobject_t &hoid, - bool create, - const std::map>& attr, - uint64_t size); + const hobject_t &hoid, + bool create, + const std::map> &attrs, + uint64_t size); }; }; -std::ostream &operator<<(std::ostream &lhs, - const ECCommon::RMWPipeline::pipeline_state_t &rhs); -std::ostream &operator<<(std::ostream &lhs, - const ECCommon::read_request_t &rhs); -std::ostream &operator<<(std::ostream &lhs, - const ECCommon::read_result_t &rhs); -std::ostream &operator<<(std::ostream &lhs, - const ECCommon::ReadOp &rhs); -std::ostream &operator<<(std::ostream &lhs, - const ECCommon::RMWPipeline::Op &rhs); - template void ECCommon::ReadPipeline::check_recovery_sources( - const OSDMapRef& osdmap, - F&& on_erase, - G&& on_schedule_recovery) -{ + const OSDMapRef &osdmap, + F &&on_erase, + G &&on_schedule_recovery + ) { std::set tids_to_filter; - for (std::map >::iterator + for (std::map>::iterator i = shard_to_read_map.begin(); - i != shard_to_read_map.end(); - ) { + i != shard_to_read_map.end();) { if (osdmap->is_down(i->first.osd)) { tids_to_filter.insert(i->second.begin(), i->second.end()); shard_to_read_map.erase(i++); @@ -606,53 +676,45 @@ void ECCommon::ReadPipeline::check_recovery_sources( template void ECCommon::ReadPipeline::filter_read_op( - const OSDMapRef& osdmap, - ReadOp &op, - F&& on_erase, - G&& on_schedule_recovery) -{ + const OSDMapRef &osdmap, 
+ ReadOp &op, + F &&on_erase, + G &&on_schedule_recovery + ) { std::set to_cancel; - for (std::map >::iterator i = op.source_to_obj.begin(); - i != op.source_to_obj.end(); - ++i) { - if (osdmap->is_down(i->first.osd)) { - to_cancel.insert(i->second.begin(), i->second.end()); - op.in_progress.erase(i->first); - continue; + for (auto &&[pg_shard, hoid_set] : op.source_to_obj) { + if (osdmap->is_down(pg_shard.osd)) { + to_cancel.insert(hoid_set.begin(), hoid_set.end()); + op.in_progress.erase(pg_shard); } } if (to_cancel.empty()) return; - for (std::map >::iterator i = op.source_to_obj.begin(); - i != op.source_to_obj.end(); - ) { - for (std::set::iterator j = i->second.begin(); - j != i->second.end(); - ) { - if (to_cancel.count(*j)) - i->second.erase(j++); - else - ++j; + for (auto iter = op.source_to_obj.begin(); + iter != op.source_to_obj.end();) { + auto &[pg_shard, hoid_set] = *iter; + for (auto &hoid : hoid_set) { + if (to_cancel.contains(hoid)) { + hoid_set.erase(hoid); + } } - if (i->second.empty()) { - op.source_to_obj.erase(i++); + if (hoid_set.empty()) { + op.source_to_obj.erase(iter++); } else { - ceph_assert(!osdmap->is_down(i->first.osd)); - ++i; + ceph_assert(!osdmap->is_down(pg_shard.osd)); + ++iter; } } - for (std::set::iterator i = to_cancel.begin(); - i != to_cancel.end(); - ++i) { - get_parent()->cancel_pull(*i); + for (auto hoid : to_cancel) { + get_parent()->cancel_pull(hoid); - ceph_assert(op.to_read.count(*i)); - op.to_read.erase(*i); - op.complete.erase(*i); - on_erase(*i); + ceph_assert(op.to_read.contains(hoid)); + op.to_read.erase(hoid); + op.complete.erase(hoid); + on_erase(hoid); } if (op.in_progress.empty()) { @@ -675,8 +737,14 @@ void ECCommon::ReadPipeline::filter_read_op( } } -template <> struct fmt::formatter : fmt::ostream_formatter {}; -template <> struct fmt::formatter : fmt::ostream_formatter {}; -template <> struct fmt::formatter : fmt::ostream_formatter {}; -template <> struct fmt::formatter : fmt::ostream_formatter {}; -template <> struct fmt::formatter : fmt::ostream_formatter {}; \ No newline at end of file +template <> +struct fmt::formatter : fmt::ostream_formatter {}; + +template <> +struct fmt::formatter : fmt::ostream_formatter {}; + +template <> +struct fmt::formatter : fmt::ostream_formatter {}; + +template <> +struct fmt::formatter : fmt::ostream_formatter {}; diff --git a/src/osd/ECExtentCache.cc b/src/osd/ECExtentCache.cc new file mode 100644 index 0000000000000..9ee94c2d9dc81 --- /dev/null +++ b/src/osd/ECExtentCache.cc @@ -0,0 +1,480 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ECExtentCache.h" +#include "ECUtil.h" + +#include + +using namespace std; +using namespace ECUtil; + +void ECExtentCache::Object::request(OpRef &op) { + /* After a cache invalidation, we allow through a single cache-invalidating + * IO. 
+ */ + if (op->invalidates_cache) { + if (cache_invalidated) { + op->invalidates_cache = false; + } else { + cache_invalidate_expected = true; + } + } + cache_invalidated = false; + + extent_set eset = op->get_pin_eset(line_size); + + for (auto &&[start, len] : eset) { + for (uint64_t to_pin = start; to_pin < start + len; to_pin += line_size) { + LineRef l; + if (!lines.contains(to_pin)) { + l = make_shared(*this, to_pin); + if (!l->cache->empty()) { + l->cache->to_shard_extent_set(do_not_read); + } + lines.emplace(to_pin, weak_ptr(l)); + } else { + l = lines.at(to_pin).lock(); + } + op->lines.emplace_back(l); + } + } + + bool read_required = false; + + /* Deal with reads if there are any. + * If any cache invalidation ops have been added, there is no point adding any + * reads as they are all going to be thrown away before any of the + * post-invalidate ops are honoured. + */ + if (op->reads && !cache_invalidate_expected) { + for (auto &&[shard, eset] : *(op->reads)) { + extent_set request = eset; + if (do_not_read.contains(shard)) { + request.subtract(do_not_read.at(shard)); + } + + if (!request.empty()) { + requesting[shard].union_of(request); + read_required = true; + requesting_ops.emplace_back(op); + } + } + } + + + /* Calculate the range of the object which no longer need to be written. This + * will include: + * - Any reads being issued by this IO. + * - Any writes being issued (these will be cached) + * - any unwritten regions in an append - these can assumed to be zero. + */ + if (read_required) { + do_not_read.insert(requesting); + } + do_not_read.insert(op->writes); + if (op->projected_size > projected_size) { + /* This write is growing the size of the object. This essentially counts + * as a write (although the cache will not get populated). Future reads + * to this area will be skipped, but this makes them essentially zero + * reads. + */ + shard_extent_set_t obj_hole(pg.sinfo.get_k_plus_m()); + shard_extent_set_t read_mask(pg.sinfo.get_k_plus_m()); + + pg.sinfo.ro_size_to_read_mask(op->projected_size, obj_hole); + pg.sinfo.ro_size_to_read_mask(projected_size, read_mask); + obj_hole.subtract(read_mask); + do_not_read.insert(obj_hole); + } else if (op->projected_size < projected_size) { + // Invalidate the object's cache when we see any object reduce in size. + op->invalidates_cache = true; + } + + projected_size = op->projected_size; + + if (read_required) send_reads(); + else op->read_done = true; +} + +void ECExtentCache::Object::send_reads() { + if (reading || requesting.empty()) + return; // Read busy + + reading_ops.swap(requesting_ops); + pg.backend_read.backend_read(oid, requesting, current_size); + requesting.clear(); + reading = true; +} + +void ECExtentCache::Object::read_done(shard_extent_map_t const &buffers) { + reading = false; + for (auto &&op : reading_ops) { + op->read_done = true; + } + reading_ops.clear(); + insert(buffers); +} + +uint64_t ECExtentCache::Object::line_align(uint64_t x) const { + return x - (x % line_size); +} + +void ECExtentCache::Object::insert(shard_extent_map_t const &buffers) const { + if (buffers.empty()) return; + + /* The following gets quite inefficient for writes which write to the start + * and the end of a very large object, since we iterated over the middle. + * This seems like a strange use case, so currently this is not being + * optimised. 
+ */
+  for (uint64_t slice_start = line_align(buffers.get_start_offset());
+       slice_start < buffers.get_end_offset();
+       slice_start += line_size) {
+    shard_extent_map_t slice = buffers.slice_map(slice_start, line_size);
+    if (!slice.empty()) {
+      LineRef l = lines.at(slice_start).lock();
+      /* The line should have been created already! */
+      l->cache->insert(slice);
+      uint64_t old_size = l->size;
+      l->size = l->cache->size();
+      ceph_assert(l->size >= old_size);
+      update_mempool(0, l->size - old_size);
+    }
+  }
+}
+
+void ECExtentCache::Object::write_done(shard_extent_map_t const &buffers,
+                                       uint64_t new_size) {
+  insert(buffers);
+  current_size = new_size;
+}
+
+void ECExtentCache::Object::unpin(Op &op) const {
+  op.lines.clear();
+  delete_maybe();
+}
+
+void ECExtentCache::Object::delete_maybe() const {
+  if (lines.empty() && active_ios == 0) {
+    pg.objects.erase(oid);
+  }
+}
+
+void check_seset_empty_for_range(shard_extent_set_t s, uint64_t off,
+                                 uint64_t len) {
+  for (auto &[shard, eset] : s) {
+    ceph_assert(!eset.intersects(off, len));
+  }
+}
+
+void ECExtentCache::Object::erase_line(uint64_t offset) {
+  check_seset_empty_for_range(requesting, offset, line_size);
+  do_not_read.erase_stripe(offset, line_size);
+  lines.erase(offset);
+  delete_maybe();
+}
+
+void ECExtentCache::Object::invalidate(const OpRef &invalidating_op) {
+  for (auto &l : std::views::values(lines)) {
+    auto line = l.lock();
+    line->cache->clear();
+    update_mempool(0, -line->size);
+    line->size = 0;
+  }
+
+  /* Remove all entries from the LRU */
+  pg.lru.remove_object(oid);
+
+  ceph_assert(!reading);
+  do_not_read.clear();
+  requesting.clear();
+  requesting_ops.clear();
+  reading_ops.clear();
+
+  /* Current size should reflect the actual size of the object, which was set
+   * by the previous write. We are going to replay all the writes now, so set
+   * the projected size to that of this op.
+   */
+  projected_size = invalidating_op->projected_size;
+
+  // The cache has now been invalidated, so this op can be replayed without
+  // invalidating the cache again.
+  invalidating_op->invalidates_cache = false;
+
+  cache_invalidated = true;
+  cache_invalidate_expected = false;
+
+  /* We now need to replay all outstanding ops, so as to regenerate the reads */
+  for (auto &op : pg.waiting_ops) {
+    if (op->object.oid == oid) {
+      op->read_done = false;
+      request(op);
+    }
+  }
+}
+
+void ECExtentCache::cache_maybe_ready() {
+  while (!waiting_ops.empty()) {
+    OpRef op = waiting_ops.front();
+    if (op->invalidates_cache) {
+      /* We must wait for any outstanding reads to complete. The cache replans
+       * all reads as part of invalidate. If an in-flight read completes after
+       * the invalidate, it can potentially corrupt the cache, leading to data
+       * corruption at the host.
+       */
+      if (op->object.reading) {
+        return;
+      }
+      op->object.invalidate(op);
+      ceph_assert(!op->invalidates_cache);
+    }
+    /* If reads_done finds all reads complete it will call the completion
+     * callback. Typically, this will cause the client to execute the
+     * transaction and pop the front of waiting_ops. So we abort if either
+     * the reads are not ready, or the client chooses not to complete the op.
+     */
+    if (!op->complete_if_reads_cached(op)) {
+      return;
+    }
+
+    waiting_ops.pop_front();
+  }
+}
+
+ECExtentCache::OpRef ECExtentCache::prepare(GenContextURef &&ctx,
+                                            hobject_t const &oid,
+                                            std::optional
+                                            const &to_read,
+                                            shard_extent_set_t const &write,
+                                            uint64_t orig_size,
+                                            uint64_t projected_size,
+                                            bool invalidates_cache) {
+
+  auto object_iter = objects.find(oid);
+  if (object_iter == objects.end()) {
+    auto p = objects.emplace(oid, Object(*this, oid, orig_size));
+    object_iter = p.first;
+  }
+  OpRef op = std::make_shared(
+    std::move(ctx), object_iter->second, to_read, write, projected_size,
+    invalidates_cache);
+
+  return op;
+}
+
+void ECExtentCache::read_done(hobject_t const &oid,
+                              shard_extent_map_t const &update) {
+  objects.at(oid).read_done(update);
+  cache_maybe_ready();
+  objects.at(oid).send_reads();
+}
+
+void ECExtentCache::write_done(OpRef const &op,
+                               shard_extent_map_t const &update) {
+  op->write_done(std::move(update));
+}
+
+uint64_t ECExtentCache::get_projected_size(hobject_t const &oid) const {
+  return objects.at(oid).get_projected_size();
+}
+
+bool ECExtentCache::contains_object(hobject_t const &oid) const {
+  return objects.contains(oid);
+}
+
+ECExtentCache::Op::~Op() {
+  ceph_assert(object.active_ios > 0);
+  object.active_ios--;
+  ceph_assert(object.pg.active_ios > 0);
+  object.pg.active_ios--;
+
+  object.unpin(*this);
+}
+
+/* ECExtentCache cleanup occurs in two parts. The first performs cleanup of
+ * the ops currently managed by the extent cache. At this point, however, the
+ * cache will be waiting for other parts of EC to clean up (for example
+ * any outstanding reads). on_change2() executes once all of this cleanup has
+ * occurred.
+ */
+void ECExtentCache::on_change() {
+  for (auto &&o : std::views::values(objects)) {
+    o.reading_ops.clear();
+    o.requesting_ops.clear();
+    o.requesting.clear();
+  }
+  for (auto &&op : waiting_ops) {
+    op->cancel();
+  }
+  waiting_ops.clear();
+}
+
+/* This must be run toward the end of EC on_change handling. It asserts that
+ * every object which automatically self-destructs when idle has done so.
+ * Additionally, it discards the entire LRU cache. This must be done after all
+ * in-flight reads/writes have completed, or we risk attempting to insert data
+ * into the cache after it has been cleared.
+ *
+ * Note that the LRU will end up being called multiple times. With some
+ * additional code complexity this could be fixed for a small (probably
+ * insignificant) performance improvement.
+ */ +void ECExtentCache::on_change2() const { + lru.discard(); + /* If this assert fires in a unit test, make sure that all ops have completed + * and cleared any extent cache ops they contain */ + ceph_assert(objects.empty()); + ceph_assert(active_ios == 0); + ceph_assert(idle()); +} + +void ECExtentCache::execute(list &op_list) { + for (auto &op : op_list) { + op->object.request(op); + } + waiting_ops.insert(waiting_ops.end(), op_list.begin(), op_list.end()); + counter++; + cache_maybe_ready(); +} + +bool ECExtentCache::idle() const { + return active_ios == 0; +} + +uint32_t ECExtentCache::get_and_reset_counter() { + uint32_t ret = counter; + counter = 0; + return ret; +} + +list::iterator ECExtentCache::LRU::erase( + const list::iterator &it, + bool do_update_mempool) { + uint64_t size_change = map.at(*it).second->size(); + if (do_update_mempool) { + update_mempool(-1, 0 - size_change); + } + size -= size_change; + map.erase(*it); + return lru.erase(it); +} + +void ECExtentCache::LRU::add(const Line &line) { + if (line.size == 0) { + update_mempool(-1, 0); + return; + } + + const Key k(line.offset, line.object.oid); + + shared_ptr cache = line.cache; + + mutex.lock(); + ceph_assert(!map.contains(k)); + auto i = lru.insert(lru.end(), k); + auto j = make_pair(std::move(i), std::move(cache)); + map.insert(std::pair(std::move(k), std::move(j))); + size += line.size; // This is already accounted for in mempool. + free_maybe(); + mutex.unlock(); +} + +shared_ptr ECExtentCache::LRU::find( + const hobject_t &oid, uint64_t offset) { + Key k(offset, oid); + shared_ptr cache = nullptr; + mutex.lock(); + if (map.contains(k)) { + auto &&[lru_iter, c] = map.at(k); + cache = c; + auto it = lru_iter; // Intentional copy. + erase(it, false); + } + mutex.unlock(); + return cache; +} + +void ECExtentCache::LRU::remove_object(const hobject_t &oid) { + mutex.lock(); + for (auto it = lru.begin(); it != lru.end();) { + if (it->oid == oid) { + it = erase(it, true); + } else { + ++it; + } + } + mutex.unlock(); +} + +void ECExtentCache::LRU::free_maybe() { + while (max_size < size) { + auto it = lru.begin(); + erase(it, true); + } +} + +void ECExtentCache::LRU::discard() { + mutex.lock(); + lru.clear(); + update_mempool(0 - map.size(), 0 - size); + map.clear(); + size = 0; + mutex.unlock(); +} + +const extent_set ECExtentCache::Op::get_pin_eset(uint64_t alignment) const { + extent_set eset = writes.get_extent_superset(); + if (reads) { + reads->get_extent_superset(eset); + } + eset.align(alignment); + + return eset; +} + +ECExtentCache::Op::Op(GenContextURef &&cache_ready_cb, + Object &object, + std::optional const &to_read, + shard_extent_set_t const &write, + uint64_t projected_size, + bool invalidates_cache) : + object(object), + reads(to_read), + writes(write), + result(&object.pg.sinfo), + invalidates_cache(invalidates_cache), + projected_size(projected_size), + cache_ready_cb(std::move(cache_ready_cb)) { + object.active_ios++; + object.pg.active_ios++; +} + +shard_extent_map_t ECExtentCache::Object::get_cache( + std::optional const &set) const { + if (!set) { + return shard_extent_map_t(&pg.sinfo); + } + + shard_id_map res(pg.sinfo.get_k_plus_m()); + for (auto &&[shard, eset] : *set) { + for (auto [off, len] : eset) { + for (uint64_t slice_start = line_align(off); + slice_start < off + len; + slice_start += line_size) { + uint64_t offset = max(slice_start, off); + uint64_t length = min(slice_start + line_size, off + len) - offset; + // This line must exist, as it was created when the op was created. 
+ LineRef l = lines.at(slice_start).lock(); + if (l->cache->contains_shard(shard)) { + extent_map m = l->cache->get_extent_map(shard).intersect( + offset, length); + if (!m.empty()) { + if (!res.contains(shard)) res.emplace(shard, std::move(m)); + else res.at(shard).insert(m); + } + } + } + } + } + return shard_extent_map_t(&pg.sinfo, std::move(res)); +} diff --git a/src/osd/ECExtentCache.h b/src/osd/ECExtentCache.h index b02afec4a114c..b4e06d6fddfd0 100644 --- a/src/osd/ECExtentCache.h +++ b/src/osd/ECExtentCache.h @@ -1,10 +1,383 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* EC "extent" cache. This extent cache attempts to improve performance, + * particularly for small sequential writes, by caching the results of recent + * reads and writes. + * + * The cache has two parts: The main cache which is active while an IO is + * outstanding to an object and an "LRU" which stashes recent IO according to + * a least-recently-used scheme. + * + * The cache keeps all caches indexed by shard, shard_offset. That is it + * independently tracks caches for each shard of an EC. It will keep a cache + * even for shards which are currently offline or missing, since the cache + * is formed from the result of reads and writes, which are required to always + * calculate missing shards. + * + * The cache allows for a single read to be outstanding per PG at a time. If + * multiple writes are received while a read is active, the next read will + * contain all necessary reads, so as to catch up. Early on in development, a + * more parallel read mechanism was explored but was found to have no benefit. + * + * This cache will never re-order IO. + * + * The LRU + * + * The LRU is a per-OSD-shard (not to be confused with an EC shard). Since the + * OSD-shard can have multiple threads, the LRU must have a mutex. This should + * not be required for crimson-based pools, since each osd shard has a single + * reactor. Some effort has been made to limit the frequency that this mutex is + * taken. + * + * The LRU has a maximum size (defined in the constructor) and will keep its + * usage below this amount. + * + * Cache Lines + * + * The LRU tracks extents of recent writes with cache Lines. These are + * simple-to-track ranges of offsets across all shards. Each line represents 32K + * of address space on each shard. + * + * A cache line can be owned by: + * - No-one (i.e. it is not instantiated) + * - Object - an IO is inflight for this cache line + * - LRU - A recent IO touched this cache line. + * + * This simple ownership model means that the locking required for the LRU does + * not leak out into the wider extent cache and allows for the entire cache + * to be built from reference-counters. + * + * Client API + * + * The client has a number of required interactions: + * 1. prepare(...). This creates a cache op. All cache ops required for a single + * parent op must be prepared before any are executed. + * 2. execute(...). Execute an IO. This gives the cache permission to perform + * the IO. This function can (and frequently does) call back + * re-entrantly, so the caller must be aware that this can + * happen. + * + * The client must provide a mechanism for the extent cache to read. It does + * this by extending the ECExtentCache::BackendRead class. + * + * Once a read is complete, the client must call cache.read_done(). + * + * When the cache is ready, it will call back the lambda passed to execute. 
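+ *
+ * A rough sketch of that sequence (placeholder variables, error handling and
+ * EC-specific details omitted):
+ *
+ *   std::list<ECExtentCache::OpRef> ops;
+ *   ops.push_back(cache.prepare(oid, to_read, will_write, orig_size,
+ *                               projected_size, false,
+ *                               [&](ECExtentCache::OpRef &op) {
+ *                                 // Reads (if any) are now cached; build the
+ *                                 // full write (including parity) and return
+ *                                 // it to the cache.
+ *                                 cache.write_done(op, new_data);
+ *                               }));
+ *   cache.execute(ops);            // may invoke the lambda re-entrantly
+ *   // ...later, when the backend read completes:
+ *   cache.read_done(oid, read_result);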
+ * The client is expected to populate the write data, including any parity + * data, by calling the cache.write_done() method. + * + * Finally, there is an on_change() and on_change2() methods. The first of these + * instructs the extent cache to discard any ops it has queued. The second + * simply asserts that the cache is now idle, this is to ensure that the calling + * code has performed the required clean up to clear the extent cache. + */ + #pragma once -// Temporary stubs +#include "ECUtil.h" +#include "include/Context.h" + class ECExtentCache { + class Address; + class Line; + class Object; + typedef std::shared_ptr LineRef; + typedef std::list::iterator LineIter; + public: + class LRU; + class Op; + typedef std::shared_ptr OpRef; + + struct BackendReadListener { + virtual void backend_read(hobject_t oid, + ECUtil::shard_extent_set_t const &request, + uint64_t object_size) = 0; + virtual ~BackendReadListener() = default; + }; + + static void update_mempool(int items, int64_t bytes) { + mempool::get_pool(mempool::pool_index_t(mempool::mempool_ec_extent_cache)). + adjust_count(items, bytes); + } + class LRU { public: - LRU(uint64_t) {} + class Key { + public: + uint64_t offset; + hobject_t oid; + + Key(uint64_t offset, const hobject_t &oid) : offset(offset), oid(oid) {}; + + friend bool operator==(const Key &lhs, const Key &rhs) { + return lhs.offset == rhs.offset + && lhs.oid == rhs.oid; + } + + friend bool operator!=(const Key &lhs, const Key &rhs) { + return !(lhs == rhs); + } + }; + + struct KeyHash { + std::size_t operator()(const Key &obj) const { + std::size_t seed = 0x625610ED; + seed ^= (seed << 6) + (seed >> 2) + 0x1E665363 + static_cast< + std::size_t>(obj.offset); + seed ^= (seed << 6) + (seed >> 2) + 0x51343C80 + obj.oid.get_hash(); + return seed; + } + }; + + private: + friend class Object; + friend class ECExtentCache; + std::unordered_map::iterator, std::shared_ptr< + ECUtil::shard_extent_map_t>>, KeyHash> map; + std::list lru; + uint64_t max_size = 0; + uint64_t size = 0; + ceph::mutex mutex = ceph::make_mutex("ECExtentCache::LRU"); + + void free_maybe(); + void discard(); + void add(const Line &line); + void erase(const Key &k); + std::list::iterator erase(const std::list::iterator &it, + bool update_mempool); + std::shared_ptr find( + const hobject_t &oid, uint64_t offset); + void remove_object(const hobject_t &oid); + + public: + explicit LRU(uint64_t max_size) : map(), max_size(max_size) {} + }; + + class Op { + friend class Object; + friend class ECExtentCache; + + Object &object; + std::optional const reads; + ECUtil::shard_extent_set_t const writes; + ECUtil::shard_extent_map_t result; + bool complete = false; + bool invalidates_cache = false; + bool reading = false; + bool read_done = false; + uint64_t projected_size = 0; + GenContextURef cache_ready_cb; + std::list lines; + + // List of callbacks to be executed on write completion (not commit) + std::list> on_write; + + const extent_set get_pin_eset(uint64_t alignment) const; + + public: + explicit Op( + GenContextURef &&cache_ready_cb, + Object &object, + std::optional const &to_read, + ECUtil::shard_extent_set_t const &write, + uint64_t projected_size, + bool invalidates_cache); + + ~Op(); + void cancel() { delete cache_ready_cb.release(); } + const ECUtil::shard_extent_set_t &get_writes() const { return writes; } + const Object &get_object() const { return object; } + const hobject_t &get_hoid() const { return object.oid; } + const ECUtil::shard_extent_map_t &get_result() { return result; } + + void 
add_on_write(std::function &&cb) { + on_write.emplace_back(std::move(cb)); + } + + bool complete_if_reads_cached(OpRef &op_ref) { + if (!read_done) { + return false; + } + result = object.get_cache(reads); + complete = true; + cache_ready_cb.release()->complete(op_ref); + return true; + } + + void write_done(ECUtil::shard_extent_map_t const &update) const { + object.write_done(update, projected_size); + for (auto &cb: on_write) { + cb(); + } + } }; -}; + +#define MIN_LINE_SIZE (32UL*1024UL) + +private: + class Object { + friend class Op; + friend class LRU; + friend class Line; + friend class ECExtentCache; + + ECExtentCache &pg; + ECUtil::shard_extent_set_t requesting; + ECUtil::shard_extent_set_t do_not_read; + std::list reading_ops; + std::list requesting_ops; + // Map of the byte-offset of the start of the line to the line. + std::map> lines; + int active_ios = 0; + uint64_t current_size = 0; + uint64_t projected_size = 0; + uint64_t line_size = 0; + bool reading = false; + bool cache_invalidated = false; + bool cache_invalidate_expected = false; + + void request(OpRef &op); + void send_reads(); + void unpin(Op &op) const; + void delete_maybe() const; + void erase_line(uint64_t offset); + void invalidate(const OpRef &invalidating_op); + + public: + hobject_t oid; + + Object(ECExtentCache &pg, hobject_t const &oid, uint64_t size) : + pg(pg), + requesting(pg.sinfo.get_k_plus_m()), + do_not_read(pg.sinfo.get_k_plus_m()), + current_size(size), + projected_size(size), + oid(oid) { + line_size = std::max(MIN_LINE_SIZE, pg.sinfo.get_chunk_size()); + } + + void insert(ECUtil::shard_extent_map_t const &buffers) const; + void write_done(ECUtil::shard_extent_map_t const &buffers, uint64_t new_size); + void read_done(ECUtil::shard_extent_map_t const &result); + [[nodiscard]] uint64_t get_projected_size() const { return projected_size; } + ECUtil::shard_extent_map_t get_cache( + std::optional const &set) const; + uint64_t line_align(uint64_t line) const; + }; + + + class Line { + public: + uint64_t offset; + uint64_t size; + std::shared_ptr cache; + Object &object; + + Line(Object &object, + uint64_t offset) : + offset(offset), + object(object) { + std::shared_ptr c = object.pg.lru.find( + object.oid, offset); + + if (c == nullptr) { + cache = std::make_shared(&object.pg.sinfo); + size = 0; + /* We are creating an empty cache line */ + update_mempool(1, 0); + } else { + cache = c; + size = c->size(); + } + } + + ~Line() { + object.pg.lru.add(*this); + object.erase_line(offset); + } + + friend bool operator==(const Line &lhs, const Line &rhs) { + return lhs.offset == rhs.offset + && lhs.object.oid == rhs.object.oid; + } + + friend bool operator!=(const Line &lhs, const Line &rhs) { + return !(lhs == rhs); + } + }; + + std::map objects; + BackendReadListener &backend_read; + LRU &lru; + const ECUtil::stripe_info_t &sinfo; + std::list waiting_ops; + void cache_maybe_ready(); + uint32_t counter = 0; + uint32_t active_ios = 0; + CephContext *cct; + + OpRef prepare(GenContextURef &&ctx, + hobject_t const &oid, + std::optional const &to_read, + ECUtil::shard_extent_set_t const &write, + uint64_t orig_size, + uint64_t projected_size, + bool invalidates_cache); + + public: + ~ECExtentCache() { + // This should really only be needed in failed tests, as the PG should + // clear up any IO before it gets destructed. However, here we make sure + // to clean up any outstanding IO. 
+ on_change(); + on_change2(); + } + + explicit ECExtentCache(BackendReadListener &backend_read, + LRU &lru, const ECUtil::stripe_info_t &sinfo, + CephContext *cct + ) : + backend_read(backend_read), + lru(lru), + sinfo(sinfo), + cct(cct) {} + + // Insert some data into the cache. + void read_done(hobject_t const &oid, ECUtil::shard_extent_map_t const &update); + void write_done(OpRef const &op, ECUtil::shard_extent_map_t const &update); + void on_change(); + void on_change2() const; + [[nodiscard]] bool contains_object(hobject_t const &oid) const; + [[nodiscard]] uint64_t get_projected_size(hobject_t const &oid) const; + + template + OpRef prepare(hobject_t const &oid, + std::optional const &to_read, + ECUtil::shard_extent_set_t const &write, + uint64_t orig_size, + uint64_t projected_size, + bool invalidates_cache, + CacheReadyCb &&ready_cb) { + GenContextURef ctx = + make_gen_lambda_context( + std::forward(ready_cb)); + + return prepare(std::move(ctx), oid, to_read, write, orig_size, + projected_size, invalidates_cache); + } + + void execute(std::list &op_list); + [[nodiscard]] bool idle() const; + uint32_t get_and_reset_counter(); + + void add_on_write(std::function &&cb) const { + if (waiting_ops.empty()) { + cb(); + } else { + waiting_ops.back()->add_on_write(std::move(cb)); + } + } +}; // ECExtentCaches diff --git a/src/osd/ECTransaction.cc b/src/osd/ECTransaction.cc index 7cfe995a4b927..d6de5274b8e61 100644 --- a/src/osd/ECTransaction.cc +++ b/src/osd/ECTransaction.cc @@ -34,625 +34,933 @@ using ceph::decode; using ceph::encode; using ceph::ErasureCodeInterfaceRef; -static void encode_and_write( - pg_t pgid, - const hobject_t &oid, - const ECUtil::stripe_info_t &sinfo, - ErasureCodeInterfaceRef &ecimpl, - const set &want, - uint64_t offset, - bufferlist bl, - uint32_t flags, - ECUtil::HashInfoRef hinfo, - extent_map &written, - map *transactions, - DoutPrefixProvider *dpp) -{ - const uint64_t before_size = hinfo->get_total_logical_size(sinfo); - ceph_assert(sinfo.logical_offset_is_stripe_aligned(offset)); - ceph_assert(sinfo.logical_offset_is_stripe_aligned(bl.length())); - ceph_assert(bl.length()); - - map buffers; - int r = ECUtil::encode( - sinfo, ecimpl, bl, want, &buffers); - ceph_assert(r == 0); +void debug(const hobject_t &oid, const std::string &str, + const ECUtil::shard_extent_map_t &map, DoutPrefixProvider *dpp + ) { +#if DEBUG_EC_BUFFERS + ldpp_dout(dpp, 20) + << "EC_DEBUG_BUFFERS: generate_transactions: " + << "oid: " << oid + << " " << str << " " << map.debug_string(2048, 8) << dendl; +#else + ldpp_dout(dpp, 20) + << "generate_transactions: " + << "oid: " << oid + << str << map << dendl; +#endif +} - written.insert(offset, bl.length(), bl); +void ECTransaction::Generate::encode_and_write() { + // For PDW, we already have necessary parity buffers. + if (!plan.do_parity_delta_write) { + to_write.insert_parity_buffers(); + } + // If partial writes are not supported, pad out to_write to a full stripe. + if (!sinfo.supports_partial_writes()) { + for (auto &&[shard, eset]: plan.will_write) { + if (sinfo.get_raw_shard(shard) >= sinfo.get_k()) continue; + + for (auto [off, len]: eset) { + to_write.zero_pad(shard, off, len); + } + } + } + + int r = 0; + if (plan.do_parity_delta_write) { + /* For parity delta writes, we remove any unwanted writes before calculating + * the parity. 
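+     * Conceptually, for a simple XOR parity, a delta write computes
+     *
+     *   new_P = old_P ^ old_D ^ new_D
+     *
+     * so only the old data under the write and the old parity need to be
+     * read, rather than the rest of the stripe.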
+ */ + read_sem->zero_pad(plan.will_write); + to_write.pad_with_other(plan.will_write, *read_sem); + r = to_write.encode_parity_delta(ec_impl, *read_sem); + } else { + r = to_write.encode(ec_impl, plan.hinfo, plan.orig_size); + } + ceph_assert(r == 0); + // Remove any unnecessary writes. + //to_write = to_write.intersect(plan.will_write); + + debug(oid, "parity", to_write, dpp); ldpp_dout(dpp, 20) << __func__ << ": " << oid - << " new_size " - << offset + bl.length() - << dendl; - - if (offset >= before_size) { - ceph_assert(offset == before_size); - hinfo->append( - sinfo.aligned_logical_offset_to_chunk_offset(offset), - buffers); - } - - for (auto &&i : *transactions) { - ceph_assert(buffers.count(static_cast(i.first))); - bufferlist &enc_bl = buffers[static_cast(i.first)]; - if (offset >= before_size) { - i.second.set_alloc_hint( - coll_t(spg_t(pgid, i.first)), - ghobject_t(oid, ghobject_t::NO_GEN, i.first), - 0, 0, - CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE | - CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY); + << " plan " << plan + << dendl; + + for (auto &&[shard, to_write_eset]: plan.will_write) { + /* Zero pad, even if we are not writing. The extent cache requires that + * all shards are fully populated with write data, even if the OSDs are + * down. This is not a fundamental requirement of the cache, but dealing + * with implied zeros due to incomplete writes is both difficult and + * removes a level of protection against bugs. + */ + for (auto &&[offset, len]: to_write_eset) { + to_write.zero_pad(shard, offset, len); + } + + if (transactions.contains(shard)) { + auto &t = transactions.at(shard); + if (to_write_eset.begin().get_start() >= plan.orig_size) { + t.set_alloc_hint( + coll_t(spg_t(pgid, shard)), + ghobject_t(oid, ghobject_t::NO_GEN, shard), + 0, 0, + CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE | + CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY); + } + + for (auto &&[offset, len]: to_write_eset) { + buffer::list bl; + to_write.get_buffer(shard, offset, len, bl); + t.write(coll_t(spg_t(pgid, shard)), + ghobject_t(oid, ghobject_t::NO_GEN, shard), + offset, bl.length(), bl, fadvise_flags); + } } - i.second.write( - coll_t(spg_t(pgid, i.first)), - ghobject_t(oid, ghobject_t::NO_GEN, i.first), - sinfo.logical_to_prev_chunk_offset( - offset), - enc_bl.length(), - enc_bl, - flags); } } -void ECTransaction::generate_transactions( - PGTransaction* _t, - WritePlan &plan, - ErasureCodeInterfaceRef &ecimpl, - pg_t pgid, - const ECUtil::stripe_info_t &sinfo, - const map &partial_extents, - vector &entries, - map *written_map, - map *transactions, - set *temp_added, - set *temp_removed, - DoutPrefixProvider *dpp, - const ceph_release_t require_osd_release) +ECTransaction::WritePlanObj::WritePlanObj( + const hobject_t &hoid, + const PGTransaction::ObjectOperation &op, + const ECUtil::stripe_info_t &sinfo, + const shard_id_set readable_shards, + const shard_id_set writable_shards, + const bool object_in_cache, + uint64_t orig_size, + const std::optional &oi, + const std::optional &soi, + const ECUtil::HashInfoRef &&hinfo, + const ECUtil::HashInfoRef &&shinfo, + const unsigned pdw_write_mode + ) : + hoid(hoid), + will_write(sinfo.get_k_plus_m()), + hinfo(hinfo), + shinfo(shinfo), + orig_size(orig_size) // On-disk object sizes are rounded up to the next page. { - ceph_assert(written_map); - ceph_assert(transactions); - ceph_assert(temp_added); - ceph_assert(temp_removed); - ceph_assert(_t); - auto &t = *_t; + extent_set unaligned_ro_writes; - auto &hash_infos = plan.hash_infos; + projected_size = oi ? 
oi->size : 0; - map obj_to_log; - for (auto &&i: entries) { - obj_to_log.insert(make_pair(i.soid, &i)); + if (soi) { + projected_size = soi->size; } - t.safe_create_traverse( - [&](pair &opair) { - const hobject_t &oid = opair.first; - auto &op = opair.second; - auto &obc_map = t.obc_map; - auto &written = (*written_map)[oid]; + hobject_t source; + invalidates_cache = op.has_source(&source) || op.is_delete(); + + op.buffer_updates.to_interval_set(unaligned_ro_writes); + /* We can get multiple truncates/appends in a single tranaction. These get + * simplified to two values - a minimum and a maximum. It is not guaranteed + * that this region has writes. We create writes for this region so as to + * essentially write zeros (or holes) in that region. + */ + + if (op.truncate) { + uint64_t start = op.truncate->first; + uint64_t end = projected_size; + if (projected_size > op.truncate->second ) { + end = op.truncate->second; + } + if (end > start) { + unaligned_ro_writes.insert(start, end - start); + } + } - auto iter = obj_to_log.find(oid); - pg_log_entry_t *entry = iter != obj_to_log.end() ? iter->second : nullptr; + /* Calculate any non-aligned pages. These need to be read and written */ + extent_set aligned_ro_writes(unaligned_ro_writes); + aligned_ro_writes.align(CEPH_PAGE_SIZE); + extent_set partial_page_ro_writes(aligned_ro_writes); + partial_page_ro_writes.subtract(unaligned_ro_writes); + partial_page_ro_writes.align(CEPH_PAGE_SIZE); + + extent_set write_superset; + for (auto &&[off, len] : unaligned_ro_writes) { + sinfo.ro_range_to_shard_extent_set_with_superset( + off, len, will_write, write_superset); + } + write_superset.align(CEPH_PAGE_SIZE); + + shard_id_set writable_parity_shards = shard_id_set::intersection(sinfo.get_parity_shards(), writable_shards); + for (auto shard : writable_parity_shards) { + will_write[shard].insert(write_superset); + } + + ECUtil::shard_extent_set_t reads(sinfo.get_k_plus_m()); + ECUtil::shard_extent_set_t read_mask(sinfo.get_k_plus_m()); - ObjectContextRef obc; - auto obiter = t.obc_map.find(oid); - if (obiter != t.obc_map.end()) { - obc = obiter->second; + if (!sinfo.supports_partial_writes()) { + for (shard_id_t shard; shard < sinfo.get_k_plus_m(); ++shard) { + will_write[shard].insert(write_superset); + } + will_write.align(sinfo.get_chunk_size()); + reads = will_write; + sinfo.ro_size_to_read_mask(sinfo.ro_offset_to_next_stripe_ro_offset(orig_size), read_mask); + reads.intersection_of(read_mask); + do_parity_delta_write = false; + } else { + will_write.align(CEPH_PAGE_SIZE); + ECUtil::shard_extent_set_t pdw_reads(will_write); + + sinfo.ro_size_to_read_mask(ECUtil::align_page_next(orig_size), read_mask); + + /* Next we need to add the reads required for a conventional write */ + for (auto shard : sinfo.get_data_shards()) { + reads[shard].insert(write_superset); + if (will_write.contains(shard)) { + reads[shard].subtract(will_write.at(shard)); } - if (entry) { - ceph_assert(obc); - } else { - ceph_assert(oid.is_temp()); + if (reads[shard].empty()) { + reads.erase(shard); } + } - ECUtil::HashInfoRef hinfo; - { - auto iter = hash_infos.find(oid); - ceph_assert(iter != hash_infos.end()); - hinfo = iter->second; - } + /* We now need to add in the partial page ro writes. 
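+   * For example, a 1 KiB overwrite at RO offset 6 KiB only touches part of
+   * the 4 KiB page at [4 KiB, 8 KiB); that page is added to the reads so the
+   * unmodified bytes can be rewritten along with the new data.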
This is not particularly + * efficient as the are many divs in here, but non-4k aligned writes are + * not very efficient anyway + */ + for (auto &&[off, len] : partial_page_ro_writes) { + sinfo.ro_range_to_shard_extent_set( + off, len, reads); + } - if (oid.is_temp()) { - if (op.is_fresh_object()) { - temp_added->insert(oid); - } else if (op.is_delete()) { - temp_removed->insert(oid); - } - } + reads.intersection_of(read_mask); - if (entry && - entry->is_modify() && - op.updated_snaps) { - bufferlist bl(op.updated_snaps->second.size() * 8 + 8); - encode(op.updated_snaps->second, bl); - entry->snaps.swap(bl); - entry->snaps.reassign_to_mempool(mempool::mempool_osd_pglog); - } + /* Here we decide if we want to do a conventional write or a parity delta write. */ + if (sinfo.supports_parity_delta_writes() && !object_in_cache && + orig_size == projected_size && !reads.empty()) { - ldpp_dout(dpp, 20) << "generate_transactions: " - << opair.first - << ", current size is " - << hinfo->get_total_logical_size(sinfo) - << " buffers are " - << op.buffer_updates - << dendl; - if (op.truncate) { - ldpp_dout(dpp, 20) << "generate_transactions: " - << " truncate is " - << *(op.truncate) - << dendl; - } + shard_id_set read_shards = reads.get_shard_id_set(); + shard_id_set pdw_read_shards = pdw_reads.get_shard_id_set(); - if (entry && op.updated_snaps) { - entry->mod_desc.update_snaps(op.updated_snaps->first); + if (pdw_write_mode != 0) { + do_parity_delta_write = (pdw_write_mode == 2); + } else if (!shard_id_set::difference(pdw_read_shards, readable_shards).empty()) { + // Some kind of reconstruct would be needed for PDW, so don't bother. + do_parity_delta_write = false; + } else if (!shard_id_set::difference(read_shards, readable_shards).empty()) { + // Some kind of reconstruct is needed for conventional, but NOT for PDW! + do_parity_delta_write = true; + } else { + /* Everything we need for both is available, opt for which ever is less + * reads. + */ + do_parity_delta_write = pdw_read_shards.size() < read_shards.size(); } - map > xattr_rollback; - ceph_assert(hinfo); - bufferlist old_hinfo; - encode(*hinfo, old_hinfo); - xattr_rollback[ECUtil::get_hinfo_key()] = old_hinfo; - - if (op.is_none() && op.truncate && op.truncate->first == 0) { - ceph_assert(entry); - ceph_assert(obc); - - if (op.truncate->first != op.truncate->second) { - op.truncate->first = op.truncate->second; - } else { - op.truncate = std::nullopt; - } - - op.delete_first = true; - op.init_type = PGTransaction::ObjectOperation::Init::Create(); - - if (obc) { - /* We need to reapply all of the cached xattrs. - * std::map insert fortunately only writes keys - * which don't already exist, so this should do - * the right thing. */ - op.attr_updates.insert( - obc->attr_cache.begin(), - obc->attr_cache.end()); - } + if (do_parity_delta_write) { + to_read = std::move(pdw_reads); + reads.clear(); // So we don't stash it at the end. 
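+      // For example, with k=8 m=2 and a page-aligned overwrite of a single
+      // data chunk (all shards readable): a conventional write would have to
+      // read the seven untouched data chunks, while the parity delta write
+      // reads only the old data chunk plus the two parity chunks, which is
+      // why the delta path is taken here.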
} + } - if (op.delete_first) { - /* We also want to remove the std::nullopt entries since - * the keys already won't exist */ - for (auto j = op.attr_updates.begin(); - j != op.attr_updates.end(); - ) { - if (j->second) { - ++j; - } else { - op.attr_updates.erase(j++); - } - } - /* Fill in all current entries for xattr rollback */ - if (obc) { - xattr_rollback.insert( - obc->attr_cache.begin(), - obc->attr_cache.end()); - obc->attr_cache.clear(); - } - if (entry) { - entry->mod_desc.rmobject(entry->version.version); - for (auto &&st: *transactions) { - st.second.collection_move_rename( - coll_t(spg_t(pgid, st.first)), - ghobject_t(oid, ghobject_t::NO_GEN, st.first), - coll_t(spg_t(pgid, st.first)), - ghobject_t(oid, entry->version.version, st.first)); - } - } else { - for (auto &&st: *transactions) { - st.second.remove( - coll_t(spg_t(pgid, st.first)), - ghobject_t(oid, ghobject_t::NO_GEN, st.first)); - } - } - hinfo->clear(); - } + /* NOTE: We intentionally leave un-writable shards in the write plan. As + * it is actually less efficient to take them out:- PDWs still need to + * compute the deltas and conventional writes still need to calcualte the + * parity. The transaction will be dropped by generate_transactions. + */ + } + + if (!reads.empty()) { + to_read = std::move(reads); + } - if (op.is_fresh_object() && entry) { - entry->mod_desc.create(); + /* validate post conditions: + * to_read should have an entry for `obj` if it isn't empty + * and if we are reading from `obj`, we can't be renaming or + * cloning it */ + ceph_assert(!to_read || !soi); +} + +void ECTransaction::Generate::all_shards_written() { + if (entry) { + entry->written_shards.insert_range(shard_id_t(0), sinfo.get_k_plus_m()); + } +} + +void ECTransaction::Generate::shard_written(const shard_id_t shard) { + if (entry) { + entry->written_shards.insert(shard); + } +} + +void ECTransaction::Generate::shards_written(const shard_id_set &shards) { + if (entry) { + entry->written_shards.insert(shards); + } +} + +void ECTransaction::Generate::zero_truncate_to_delete() { + ceph_assert(obc); + + if (op.truncate->first != op.truncate->second) { + op.truncate->first = op.truncate->second; + } else { + op.truncate = std::nullopt; + } + + op.delete_first = true; + op.init_type = PGTransaction::ObjectOperation::Init::Create(); + + if (obc) { + /* We need to reapply all of the cached xattrs. + * std::map insert fortunately only writes keys + * which don't already exist, so this should do + * the right thing. 
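+       * For example, if attr_updates already has a pending value for key "a"
+       * and the cache holds entries for "a" and "b", the range insert below
+       * only adds "b"; the pending update for "a" is preserved.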
*/ + op.attr_updates.insert( + obc->attr_cache.begin(), + obc->attr_cache.end()); + } +} + +void ECTransaction::Generate::delete_first() { + /* We also want to remove the std::nullopt entries since + * the keys already won't exist */ + for (auto j = op.attr_updates.begin(); + j != op.attr_updates.end(); + ) { + if (j->second) { + ++j; + } else { + j = op.attr_updates.erase(j); + } + } + /* Fill in all current entries for xattr rollback */ + if (obc) { + xattr_rollback.insert( + obc->attr_cache.begin(), + obc->attr_cache.end()); + obc->attr_cache.clear(); + } + if (entry) { + entry->mod_desc.rmobject(entry->version.version); + all_shards_written(); + for (auto &&[shard, t]: transactions) { + t.collection_move_rename( + coll_t(spg_t(pgid, shard)), + ghobject_t(oid, ghobject_t::NO_GEN, shard), + coll_t(spg_t(pgid, shard)), + ghobject_t(oid, entry->version.version, shard)); + } + } else { + for (auto &&[shard, t]: transactions) { + t.remove( + coll_t(spg_t(pgid, shard)), + ghobject_t(oid, ghobject_t::NO_GEN, shard)); + } + } + if (plan.hinfo) + plan.hinfo->clear(); +} + +void ECTransaction::Generate::process_init() { + match( + op.init_type, + [&](const PGTransaction::ObjectOperation::Init::None &) {}, + [&](const PGTransaction::ObjectOperation::Init::Create &_) { + all_shards_written(); + for (auto &&[shard, t]: transactions) { + if (osdmap->require_osd_release >= ceph_release_t::octopus) { + t.create( + coll_t(spg_t(pgid, shard)), + ghobject_t(oid, ghobject_t::NO_GEN, shard)); + } else { + t.touch( + coll_t(spg_t(pgid, shard)), + ghobject_t(oid, ghobject_t::NO_GEN, shard)); + } + } + }, + [&](const PGTransaction::ObjectOperation::Init::Clone &cop) { + all_shards_written(); + for (auto &&[shard, t]: transactions) { + t.clone( + coll_t(spg_t(pgid, shard)), + ghobject_t(cop.source, ghobject_t::NO_GEN, shard), + ghobject_t(oid, ghobject_t::NO_GEN, shard)); } - match( - op.init_type, - [&](const PGTransaction::ObjectOperation::Init::None &) {}, - [&](const PGTransaction::ObjectOperation::Init::Create &op) { - for (auto &&st: *transactions) { - if (require_osd_release >= ceph_release_t::octopus) { - st.second.create( - coll_t(spg_t(pgid, st.first)), - ghobject_t(oid, ghobject_t::NO_GEN, st.first)); - } else { - st.second.touch( - coll_t(spg_t(pgid, st.first)), - ghobject_t(oid, ghobject_t::NO_GEN, st.first)); - } - } - }, - [&](const PGTransaction::ObjectOperation::Init::Clone &op) { - for (auto &&st: *transactions) { - st.second.clone( - coll_t(spg_t(pgid, st.first)), - ghobject_t(op.source, ghobject_t::NO_GEN, st.first), - ghobject_t(oid, ghobject_t::NO_GEN, st.first)); - } - - auto siter = hash_infos.find(op.source); - ceph_assert(siter != hash_infos.end()); - hinfo->update_to(*(siter->second)); - - if (obc) { - auto cobciter = obc_map.find(op.source); - ceph_assert(cobciter != obc_map.end()); - obc->attr_cache = cobciter->second->attr_cache; - } - }, - [&](const PGTransaction::ObjectOperation::Init::Rename &op) { - ceph_assert(op.source.is_temp()); - for (auto &&st: *transactions) { - st.second.collection_move_rename( - coll_t(spg_t(pgid, st.first)), - ghobject_t(op.source, ghobject_t::NO_GEN, st.first), - coll_t(spg_t(pgid, st.first)), - ghobject_t(oid, ghobject_t::NO_GEN, st.first)); - } - auto siter = hash_infos.find(op.source); - ceph_assert(siter != hash_infos.end()); - hinfo->update_to(*(siter->second)); - if (obc) { - auto cobciter = obc_map.find(op.source); - ceph_assert(cobciter == obc_map.end()); - obc->attr_cache.clear(); - } - }); - - // omap not supported (except 0, handled above) 
- ceph_assert(!(op.clear_omap)); - ceph_assert(!(op.omap_header)); - ceph_assert(op.omap_updates.empty()); - - if (!op.attr_updates.empty()) { - map> to_set; - for (auto &&j: op.attr_updates) { - if (j.second) { - to_set[j.first] = *(j.second); - } else { - for (auto &&st : *transactions) { - st.second.rmattr( - coll_t(spg_t(pgid, st.first)), - ghobject_t(oid, ghobject_t::NO_GEN, st.first), - j.first); - } - } - if (obc) { - auto citer = obc->attr_cache.find(j.first); - if (entry) { - if (citer != obc->attr_cache.end()) { - // won't overwrite anything we put in earlier - xattr_rollback.insert( - make_pair( - j.first, - std::optional(citer->second))); - } else { - // won't overwrite anything we put in earlier - xattr_rollback.insert( - make_pair( - j.first, - std::nullopt)); - } - } - if (j.second) { - obc->attr_cache[j.first] = *(j.second); - } else if (citer != obc->attr_cache.end()) { - obc->attr_cache.erase(citer); - } - } else { - ceph_assert(!entry); - } - } - for (auto &&st : *transactions) { - st.second.setattrs( - coll_t(spg_t(pgid, st.first)), - ghobject_t(oid, ghobject_t::NO_GEN, st.first), - to_set); - } - ceph_assert(!xattr_rollback.empty()); + if (plan.hinfo && plan.shinfo) + plan.hinfo->update_to(*plan.shinfo); + + if (obc) { + auto cobciter = t.obc_map.find(cop.source); + ceph_assert(cobciter != t.obc_map.end()); + obc->attr_cache = cobciter->second->attr_cache; } - if (entry && !xattr_rollback.empty()) { - entry->mod_desc.setattrs(xattr_rollback); + }, + [&](const PGTransaction::ObjectOperation::Init::Rename &rop) { + ceph_assert(rop.source.is_temp()); + all_shards_written(); + for (auto &&[shard, t]: transactions) { + t.collection_move_rename( + coll_t(spg_t(pgid, shard)), + ghobject_t(rop.source, ghobject_t::NO_GEN, shard), + coll_t(spg_t(pgid, shard)), + ghobject_t(oid, ghobject_t::NO_GEN, shard)); } + if (plan.hinfo && plan.shinfo) + plan.hinfo->update_to(*plan.shinfo); + if (obc) { + auto cobciter = t.obc_map.find(rop.source); + ceph_assert(cobciter == t.obc_map.end()); + obc->attr_cache.clear(); + } + }); +} + +void alloc_hint(PGTransaction::ObjectOperation& op, + shard_id_map &transactions, + pg_t &pgid, + const hobject_t &oid, + const ECUtil::stripe_info_t &sinfo) { + /* ro_offset_to_next_chunk_offset() scales down both aligned and + * unaligned offsets + + * we don't bother to roll this back at this time for two reasons: + * 1) it's advisory + * 2) we don't track the old value */ + uint64_t object_size = sinfo.ro_offset_to_next_chunk_offset( + op.alloc_hint->expected_object_size); + uint64_t write_size = sinfo.ro_offset_to_next_chunk_offset( + op.alloc_hint->expected_write_size); + + for (auto &&[shard, t]: transactions) { + t.set_alloc_hint( + coll_t(spg_t(pgid, shard)), + ghobject_t(oid, ghobject_t::NO_GEN, shard), + object_size, + write_size, + op.alloc_hint->flags); + } +} + +ECTransaction::Generate::Generate(PGTransaction &t, + ErasureCodeInterfaceRef &ec_impl, + pg_t &pgid, + const ECUtil::stripe_info_t &sinfo, + const std::map &partial_extents, + std::map *written_map, + shard_id_map &transactions, + const OSDMapRef &osdmap, + const hobject_t &oid, + PGTransaction::ObjectOperation &op, + WritePlanObj &plan, + DoutPrefixProvider *dpp, + pg_log_entry_t *entry) + : t(t), + ec_impl(ec_impl), + pgid(pgid), + sinfo(sinfo), + transactions(transactions), + dpp(dpp), + osdmap(osdmap), + entry(entry), + oid(oid), + op(op), + plan(plan), + read_sem(&sinfo), + to_write(&sinfo) { + auto obiter = t.obc_map.find(oid); + if (obiter != t.obc_map.end()) { + obc = 
obiter->second; + } + + if (entry) { + ceph_assert(obc); + } else { + ceph_assert(oid.is_temp()); + } + + if (entry && entry->is_modify() && op.updated_snaps) { + bufferlist bl(op.updated_snaps->second.size() * 8 + 8); + encode(op.updated_snaps->second, bl); + entry->snaps.swap(bl); + entry->snaps.reassign_to_mempool(mempool::mempool_osd_pglog); + } + + ldpp_dout(dpp, 20) << __func__ << ": " << oid << plan + << " fresh_object: " << op.is_fresh_object() + << dendl; + if (op.truncate) { + ldpp_dout(dpp, 20) << __func__ << ": truncate is " << *(op.truncate) << dendl; + } + + if (entry && op.updated_snaps) { + entry->mod_desc.update_snaps(op.updated_snaps->first); + } + + bufferlist old_hinfo; + if (plan.hinfo) { + encode(*(plan.hinfo), old_hinfo); + xattr_rollback[ECUtil::get_hinfo_key()] = old_hinfo; + } - if (op.alloc_hint) { - /* logical_to_next_chunk_offset() scales down both aligned and - * unaligned offsets - - * we don't bother to roll this back at this time for two reasons: - * 1) it's advisory - * 2) we don't track the old value */ - uint64_t object_size = sinfo.logical_to_next_chunk_offset( - op.alloc_hint->expected_object_size); - uint64_t write_size = sinfo.logical_to_next_chunk_offset( - op.alloc_hint->expected_write_size); - - for (auto &&st : *transactions) { - st.second.set_alloc_hint( - coll_t(spg_t(pgid, st.first)), - ghobject_t(oid, ghobject_t::NO_GEN, st.first), - object_size, - write_size, - op.alloc_hint->flags); - } + if (op.is_none() && op.truncate && op.truncate->first == 0) { + zero_truncate_to_delete(); + } + + if (op.delete_first) { + delete_first(); + } + + if (op.is_fresh_object() && entry) { + entry->mod_desc.create(); + } + + process_init(); + + // omap not supported (except 0, handled above) + ceph_assert(!(op.clear_omap) && !(op.omap_header) && op.omap_updates.empty()); + + if (op.alloc_hint) { + alloc_hint(op, transactions, pgid, oid, sinfo); + } + + auto pextiter = partial_extents.find(oid); + if (pextiter != partial_extents.end()) { + if (plan.do_parity_delta_write) { + read_sem = pextiter->second; + } else { + to_write = pextiter->second; + } + } + debug(oid, "to_write", to_write, dpp); + ldpp_dout(dpp, 20) << "generate_transactions: plan: " << plan << dendl; + + if (op.truncate && op.truncate->first < plan.orig_size) { + truncate(); + } + + overlay_writes(); + appends_and_clone_ranges(); + + /* The write plan is permitted to drop parity shards when the shard is + * missing. However, written_shards must contain all parity shards. + * Note that the write plan will *not* drop data shards. 
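+   * For example, with k=4 m=2 and parity shard 5 down, will_write may omit
+   * shard 5, but written_shards still records both parity shards (4 and 5),
+   * i.e. every shard that would carry a transaction if it were present.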
+ */ + shards_written(sinfo.get_parity_shards()); + + if (!to_write.empty()) { + encode_and_write(); + } + + written_map->emplace(oid, std::move(to_write)); + + if (entry && plan.hinfo) { + plan.hinfo->set_total_chunk_size_clear_hash( + sinfo.ro_offset_to_next_stripe_ro_offset(plan.projected_size)); + } + + if (entry && plan.orig_size < plan.projected_size) { + entry->mod_desc.append(ECUtil::align_page_next(plan.orig_size)); + } + + if (!op.attr_updates.empty()) { + attr_updates(); + } + + if (entry && !xattr_rollback.empty()) { + entry->mod_desc.setattrs(xattr_rollback); + } + + if (!op.is_delete()) { + handle_deletes(); + } + + written_and_present_shards(); +} + +void ECTransaction::Generate::truncate() { + ceph_assert(!op.is_fresh_object()); + // causes encode to invent zeros + to_write.erase_after_ro_offset(plan.orig_size); + all_shards_written(); + + debug(oid, "truncate_erase", to_write, dpp); + + if (entry && !op.is_fresh_object()) { + uint64_t restore_from = sinfo.ro_offset_to_prev_chunk_offset( + op.truncate->first); + uint64_t restore_len = sinfo.aligned_ro_offset_to_chunk_offset( + plan.orig_size - + sinfo.ro_offset_to_prev_stripe_ro_offset(op.truncate->first)); + shard_id_set all_shards; // intentionally left blank! + rollback_extents.emplace_back(make_pair(restore_from, restore_len)); + rollback_shards.emplace_back(all_shards); + for (auto &&[shard, t]: transactions) { + t.touch( + coll_t(spg_t(pgid, shard)), + ghobject_t(oid, entry->version.version, shard)); + t.clone_range( + coll_t(spg_t(pgid, shard)), + ghobject_t(oid, ghobject_t::NO_GEN, shard), + ghobject_t(oid, entry->version.version, shard), + restore_from, + restore_len, + restore_from); + } + } + + for (auto &&[shard, t]: transactions) { + t.truncate( + coll_t(spg_t(pgid, shard)), + ghobject_t(oid, ghobject_t::NO_GEN, shard), + sinfo.ro_offset_to_shard_offset(plan.orig_size, + sinfo.get_raw_shard(shard))); + } +} + +void ECTransaction::Generate::overlay_writes() { + for (auto &&extent: op.buffer_updates) { + using BufferUpdate = PGTransaction::ObjectOperation::BufferUpdate; + bufferlist bl; + match( + extent.get_val(), + [&](const BufferUpdate::Write &wop) { + bl = wop.buffer; + fadvise_flags |= wop.fadvise_flags; + }, + [&](const BufferUpdate::Zero &) { + bl.append_zero(extent.get_len()); + }, + [&](const BufferUpdate::CloneRange &) { + ceph_abort_msg( + "CloneRange is not allowed, do_op should have returned ENOTSUPP"); + }); + + uint64_t off = extent.get_off(); + uint64_t len = extent.get_len(); + + sinfo.ro_range_to_shard_extent_map(off, len, bl, to_write); + debug(oid, "overlay_buffer", to_write, dpp); + } +} + +void ECTransaction::Generate::appends_and_clone_ranges() { + + extent_set clone_ranges = plan.will_write.get_extent_superset(); + uint64_t clone_max = ECUtil::align_page_next(plan.orig_size); + + if (op.delete_first) { + clone_max = 0; + } else if (op.truncate && op.truncate->first < clone_max) { + clone_max = ECUtil::align_page_next(op.truncate->first); + } + ECUtil::shard_extent_set_t cloneable_range(sinfo.get_k_plus_m()); + sinfo.ro_size_to_read_mask(clone_max, cloneable_range); + + if (plan.orig_size < plan.projected_size) { + ECUtil::shard_extent_set_t projected_cloneable_range(sinfo.get_k_plus_m()); + sinfo.ro_size_to_read_mask(plan.projected_size,projected_cloneable_range); + + for (auto &&[shard, eset]: projected_cloneable_range) { + uint64_t old_shard_size = 0; + if (cloneable_range.contains(shard)) { + old_shard_size = cloneable_range.at(shard).range_end(); } + uint64_t new_shard_size = 
eset.range_end(); - extent_map to_write; - auto pextiter = partial_extents.find(oid); - if (pextiter != partial_extents.end()) { - to_write = pextiter->second; + if (new_shard_size == old_shard_size) continue; + + uint64_t write_end = 0; + if (plan.will_write.contains(shard)) { + write_end = plan.will_write.at(shard).range_end(); } - vector > rollback_extents; - const uint64_t orig_size = hinfo->get_total_logical_size(sinfo); - - uint64_t new_size = orig_size; - uint64_t append_after = new_size; - ldpp_dout(dpp, 20) << "generate_transactions: new_size start " - << new_size << dendl; - if (op.truncate && op.truncate->first < new_size) { - ceph_assert(!op.is_fresh_object()); - new_size = sinfo.logical_to_next_stripe_offset( - op.truncate->first); - ldpp_dout(dpp, 20) << "generate_transactions: new_size truncate down " - << new_size << dendl; - if (new_size != op.truncate->first) { // 0 the unaligned part - bufferlist bl; - bl.append_zero(new_size - op.truncate->first); - to_write.insert( - op.truncate->first, - bl.length(), - bl); - append_after = sinfo.logical_to_prev_stripe_offset( - op.truncate->first); - } else { - append_after = new_size; - } - to_write.erase( - new_size, - std::numeric_limits::max() - new_size); - - if (entry && !op.is_fresh_object()) { - uint64_t restore_from = sinfo.logical_to_prev_chunk_offset( - op.truncate->first); - uint64_t restore_len = sinfo.aligned_logical_offset_to_chunk_offset( - orig_size - - sinfo.logical_to_prev_stripe_offset(op.truncate->first)); - ceph_assert(rollback_extents.empty()); - - ldpp_dout(dpp, 20) << "generate_transactions: saving extent " - << make_pair(restore_from, restore_len) - << dendl; - ldpp_dout(dpp, 20) << "generate_transactions: truncating to " - << new_size - << dendl; - rollback_extents.emplace_back( - make_pair(restore_from, restore_len)); - for (auto &&st : *transactions) { - st.second.touch( - coll_t(spg_t(pgid, st.first)), - ghobject_t(oid, entry->version.version, st.first)); - st.second.clone_range( - coll_t(spg_t(pgid, st.first)), - ghobject_t(oid, ghobject_t::NO_GEN, st.first), - ghobject_t(oid, entry->version.version, st.first), - restore_from, - restore_len, - restore_from); - - } - } else { - ldpp_dout(dpp, 20) << "generate_transactions: not saving extents" - ", fresh object" << dendl; - } - for (auto &&st : *transactions) { - st.second.truncate( - coll_t(spg_t(pgid, st.first)), - ghobject_t(oid, ghobject_t::NO_GEN, st.first), - sinfo.aligned_logical_offset_to_chunk_offset(new_size)); - } + if (write_end == new_shard_size) continue; + + /* If code is executing here, it means that the written part of the + * shard does not reflect the size that EC believes the shard to be. + * This is not a problem for reads (they will be truncated), but it + * is a problem for writes, where future writes may attempt a clone + * off the end of the object. + * To solve this, we use an interesting quirk of "truncate" where we + * can actually truncate to a size larger than the object! 
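+         * For example, if the shard object is currently 8 KiB on disk but
+         * the new RO size implies a 16 KiB shard, truncating the shard up to
+         * 16 KiB extends it with a hole, so a later clone_range against that
+         * region stays within bounds.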
+ */ + if (transactions.contains(shard)) { + auto &t = transactions.at(shard); + t.truncate( + coll_t(spg_t(pgid, shard)), + ghobject_t(oid, ghobject_t::NO_GEN, shard), + new_shard_size); } + // Update written_shards because this must complete to consider + // the write as complete + shard_written(shard); + } + } - uint32_t fadvise_flags = 0; - for (auto &&extent: op.buffer_updates) { - using BufferUpdate = PGTransaction::ObjectOperation::BufferUpdate; - bufferlist bl; - match( - extent.get_val(), - [&](const BufferUpdate::Write &op) { - bl = op.buffer; - fadvise_flags |= op.fadvise_flags; - }, - [&](const BufferUpdate::Zero &) { - bl.append_zero(extent.get_len()); - }, - [&](const BufferUpdate::CloneRange &) { - ceph_assert( - 0 == - "CloneRange is not allowed, do_op should have returned ENOTSUPP"); - }); - - uint64_t off = extent.get_off(); - uint64_t len = extent.get_len(); - uint64_t end = off + len; - ldpp_dout(dpp, 20) << "generate_transactions: adding buffer_update " - << make_pair(off, len) - << dendl; - ceph_assert(len > 0); - if (off > new_size) { - ceph_assert(off > append_after); - bl.prepend_zero(off - new_size); - len += off - new_size; - ldpp_dout(dpp, 20) << "generate_transactions: prepending zeroes to align " - << off << "->" << new_size - << dendl; - off = new_size; - } - if (!sinfo.logical_offset_is_stripe_aligned(end) && (end > append_after)) { - uint64_t aligned_end = sinfo.logical_to_next_stripe_offset( - end); - uint64_t tail = aligned_end - end; - bl.append_zero(tail); - ldpp_dout(dpp, 20) << "generate_transactions: appending zeroes to align end " - << end << "->" << end+tail - << ", len: " << len << "->" << len+tail - << dendl; - end += tail; - len += tail; - } - - to_write.insert(off, len, bl); - if (end > new_size) - new_size = end; + shard_id_set touched; + + for (auto &[start, len]: clone_ranges) { + shard_id_set to_clone_shards; + uint64_t clone_end = 0; + + for (auto &&[shard, eset]: plan.will_write) { + shard_written(shard); + + // If no clonable range here, then ignore. + if (!cloneable_range.contains(shard)) continue; + + // Do not clone off the end of the old range + uint64_t shard_clone_max = cloneable_range.at(shard).range_end(); + uint64_t shard_end = start + len; + if (shard_end > shard_clone_max) shard_end = shard_clone_max; + + // clone_end needs to be the biggest shard_end. + if (shard_end > clone_end) clone_end = shard_end; + + // Ignore pure appends on this shard. + if (shard_end <= start) continue; + + // Ignore clones that do not intersect with the write. + if (!eset.intersects(start, len)) continue; + + // We need a clone... + if (transactions.contains(shard)) { + auto &t = transactions.at(shard); + + // Only touch once. + if (!touched.contains(shard)) { + t.touch( + coll_t(spg_t(pgid, shard)), + ghobject_t(oid, entry->version.version, shard)); + touched.insert(shard_id_t(shard)); + } + t.clone_range( + coll_t(spg_t(pgid, shard)), + ghobject_t(oid, ghobject_t::NO_GEN, shard), + ghobject_t(oid, entry->version.version, shard), + start, + shard_end - start, + start); + + // We have done a clone, so tell the rollback. 
+ to_clone_shards.insert(shard); } + } - if (op.truncate && - op.truncate->second > new_size) { - ceph_assert(op.truncate->second > append_after); - uint64_t truncate_to = - sinfo.logical_to_next_stripe_offset( - op.truncate->second); - uint64_t zeroes = truncate_to - new_size; - bufferlist bl; - bl.append_zero(zeroes); - to_write.insert( - new_size, - zeroes, - bl); - new_size = truncate_to; - ldpp_dout(dpp, 20) << "generate_transactions: truncating out to " - << truncate_to - << dendl; + if (!to_clone_shards.empty()) { + // It is more efficent to store an empty set to represent the common + // all shards case. + if (to_clone_shards.size() == sinfo.get_k_plus_m()) { + to_clone_shards.clear(); + } + if (clone_end > start) { + rollback_extents.emplace_back(make_pair(start, clone_end - start)); + rollback_shards.emplace_back(to_clone_shards); } + } + } +} - set want; - for (unsigned i = 0; i < ecimpl->get_chunk_count(); ++i) { - want.insert(i); +void ECTransaction::Generate::written_and_present_shards() { + if (entry) { + if (!rollback_extents.empty()) { + entry->mod_desc.rollback_extents( + entry->version.version, + rollback_extents, + ECUtil::align_page_next(plan.orig_size), + rollback_shards); + } + if (entry->written_shards.size() == sinfo.get_k_plus_m()) { + // More efficient to encode an empty set for all shards + entry->written_shards.clear(); + } + // Calculate set of present shards + for (auto &&[shard, t]: transactions) { + entry->present_shards.insert(shard); + } + if (entry->present_shards.size() == sinfo.get_k_plus_m()) { + // More efficient to encode an empty set for all shards + entry->present_shards.clear(); + } + + // Update shard_versions in object_info to record which shards are being + // written + if (op.attr_updates.contains(OI_ATTR)) { + object_info_t oi(*(op.attr_updates[OI_ATTR])); + bool update = false; + if (entry->written_shards.empty()) { + if (!oi.shard_versions.empty()) { + oi.shard_versions.clear(); + update = true; + } + } else { + for (shard_id_t shard; shard < sinfo.get_k_plus_m(); ++shard) { + if (sinfo.is_nonprimary_shard(shard)) { + if (entry->is_written_shard(shard) || plan.orig_size != plan. 
+ projected_size) { + // Written - erase per shard version + if (oi.shard_versions.erase(shard)) { + update = true; + } + } else if (!oi.shard_versions.count(shard)) { + // Unwritten shard, previously up to date + oi.shard_versions[shard] = oi.prior_version; + update = true; + } else { + // Unwritten shard, already out of date + } + } else { + // Primary shards are always written and use oi.version + } + } } - auto to_overwrite = to_write.intersect(0, append_after); - ldpp_dout(dpp, 20) << "generate_transactions: to_overwrite: " - << to_overwrite - << dendl; - for (auto &&extent: to_overwrite) { - ceph_assert(extent.get_off() + extent.get_len() <= append_after); - ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_off())); - ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_len())); - if (entry) { - uint64_t restore_from = sinfo.aligned_logical_offset_to_chunk_offset( - extent.get_off()); - uint64_t restore_len = sinfo.aligned_logical_offset_to_chunk_offset( - extent.get_len()); - ldpp_dout(dpp, 20) << "generate_transactions: overwriting " - << restore_from << "~" << restore_len - << dendl; - if (rollback_extents.empty()) { - for (auto &&st : *transactions) { - st.second.touch( - coll_t(spg_t(pgid, st.first)), - ghobject_t(oid, entry->version.version, st.first)); - } - } - rollback_extents.emplace_back(make_pair(restore_from, restore_len)); - for (auto &&st : *transactions) { - st.second.clone_range( - coll_t(spg_t(pgid, st.first)), - ghobject_t(oid, ghobject_t::NO_GEN, st.first), - ghobject_t(oid, entry->version.version, st.first), - restore_from, - restore_len, - restore_from); - } - } - encode_and_write( - pgid, - oid, - sinfo, - ecimpl, - want, - extent.get_off(), - extent.get_val(), - fadvise_flags, - hinfo, - written, - transactions, - dpp); + if (update) { + bufferlist bl; + oi.encode(bl, osdmap->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + op.attr_updates[OI_ATTR] = bl; + // Update cached OI + obc->obs.oi.shard_versions = oi.shard_versions; } + ldpp_dout(dpp, 20) << __func__ << "shard_info: version=" << entry->version + << " present=" << entry->present_shards + << " written=" << entry->written_shards + << " shard_versions=" << oi.shard_versions << dendl; + } - auto to_append = to_write.intersect( - append_after, - std::numeric_limits::max() - append_after); - ldpp_dout(dpp, 20) << "generate_transactions: to_append: " - << to_append - << dendl; - for (auto &&extent: to_append) { - ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_off())); - ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_len())); - ldpp_dout(dpp, 20) << "generate_transactions: appending " - << extent.get_off() << "~" << extent.get_len() - << dendl; - encode_and_write( - pgid, - oid, - sinfo, - ecimpl, - want, - extent.get_off(), - extent.get_val(), - fadvise_flags, - hinfo, - written, - transactions, - dpp); + /* It is essential for rollback that every shard with a non-empty transaction + * is recorded in written_shards. In fact written shards contains every + * shard that would have a transaction if it were present. This is why we do + * not simply construct written shards here. 
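+     * For example, a missing parity shard has no transaction at all, yet it
+     * must still appear in written_shards; deriving written_shards from the
+     * transaction map alone would silently drop it.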
+ */ + for (auto &&[shard, t] : transactions) { + if (entry && (!t.empty() || !sinfo.is_nonprimary_shard(shard))) { + ceph_assert(entry->is_written_shard(shard)); } + } + } +} - ldpp_dout(dpp, 20) << "generate_transactions: " << oid - << " resetting hinfo to logical size " - << new_size - << dendl; - if (!rollback_extents.empty() && entry) { - if (entry) { - ldpp_dout(dpp, 20) << "generate_transactions: " << oid - << " marking rollback extents " - << rollback_extents - << dendl; - entry->mod_desc.rollback_extents( - entry->version.version, rollback_extents); - } - hinfo->set_total_chunk_size_clear_hash( - sinfo.aligned_logical_offset_to_chunk_offset(new_size)); - } else { - ceph_assert(hinfo->get_total_logical_size(sinfo) == new_size); +void ECTransaction::Generate::attr_updates() { + map> to_set; + for (auto &&[attr, update]: op.attr_updates) { + if (update) { + to_set[attr] = *(update); + } else { + all_shards_written(); + for (auto &&[shard, t]: transactions) { + t.rmattr( + coll_t(spg_t(pgid, shard)), + ghobject_t(oid, ghobject_t::NO_GEN, shard), + attr); + } + } + if (obc) { + auto citer = obc->attr_cache.find(attr); + if (entry) { + if (citer != obc->attr_cache.end()) { + // won't overwrite anything we put in earlier + xattr_rollback.insert( + make_pair( + attr, + std::optional(citer->second))); + } else { + // won't overwrite anything we put in earlier + xattr_rollback.insert( + make_pair( + attr, + std::nullopt)); + } } + if (update) { + obc->attr_cache[attr] = *(update); + } else if (citer != obc->attr_cache.end()) { + obc->attr_cache.erase(citer); + } + } else { + ceph_assert(!entry); + } + } + all_shards_written(); + for (auto &&[shard, t]: transactions) { + if (!sinfo.is_nonprimary_shard(shard)) { + // Primary shard - Update all attributes + t.setattrs( + coll_t(spg_t(pgid, shard)), + ghobject_t(oid, ghobject_t::NO_GEN, shard), + to_set); + } else if (entry->is_written_shard(shard)) { + // Written shard - Only update object_info attribute + t.setattr( + coll_t(spg_t(pgid, shard)), + ghobject_t(oid, ghobject_t::NO_GEN, shard), + OI_ATTR, + to_set[OI_ATTR]); + } // Else: Unwritten shard - Don't update any attributes + } + ceph_assert(!xattr_rollback.empty()); +} - if (entry && !to_append.empty()) { - ldpp_dout(dpp, 20) << "generate_transactions: marking append " - << append_after - << dendl; - entry->mod_desc.append(append_after); +void ECTransaction::Generate::handle_deletes() { + bufferlist hbuf; + if (plan.hinfo) { + encode(*plan.hinfo, hbuf); + for (auto &&[shard, t]: transactions) { + if (!sinfo.is_nonprimary_shard(shard)) { + shard_written(shard); + t.setattr( + coll_t(spg_t(pgid, shard)), + ghobject_t(oid, ghobject_t::NO_GEN, shard), + ECUtil::get_hinfo_key(), + hbuf); } + } + } +} + +void ECTransaction::generate_transactions( + PGTransaction *_t, + WritePlan &plans, + ErasureCodeInterfaceRef &ec_impl, + pg_t pgid, + const ECUtil::stripe_info_t &sinfo, + const map &partial_extents, + vector &entries, + map *written_map, + shard_id_map *transactions, + set *temp_added, + set *temp_removed, + DoutPrefixProvider *dpp, + const OSDMapRef &osdmap) { + ceph_assert(written_map); + ceph_assert(transactions); + ceph_assert(temp_added); + ceph_assert(temp_removed); + ceph_assert(_t); + auto &t = *_t; + + map obj_to_log; + for (auto &&i: entries) { + obj_to_log.insert(make_pair(i.soid, &i)); + } - if (!op.is_delete()) { - bufferlist hbuf; - encode(*hinfo, hbuf); - for (auto &&i : *transactions) { - i.second.setattr( - coll_t(spg_t(pgid, i.first)), - ghobject_t(oid, ghobject_t::NO_GEN, 
i.first), - ECUtil::get_hinfo_key(), - hbuf); - } + t.safe_create_traverse( + [&](pair &opair) { + auto oid = opair.first; + PGTransaction::ObjectOperation& op = opair.second; + auto iter = obj_to_log.find(oid); + pg_log_entry_t *entry = iter != obj_to_log.end() ? iter->second : nullptr; + if (oid.is_temp()) { + if (op.is_fresh_object()) { + temp_added->insert(oid); + } else if (op.is_delete()) { + temp_removed->insert(oid); + } } - }); + + // Transactions must be submitted in the same order that they were planned in. + ceph_assert(!plans.plans.empty()); + ECTransaction::WritePlanObj &plan = plans.plans.front(); + ceph_assert(plan.hoid == oid); + + Generate generate(t, ec_impl, pgid, sinfo, partial_extents, written_map, + *transactions, osdmap, oid, op, plan, dpp, entry); + + plans.plans.pop_front(); + }); } diff --git a/src/osd/ECTransaction.h b/src/osd/ECTransaction.h index 8ca7b4da53ed0..64bb8eed1b4a5 100644 --- a/src/osd/ECTransaction.h +++ b/src/osd/ECTransaction.h @@ -12,184 +12,133 @@ * */ -#ifndef ECTRANSACTION_H -#define ECTRANSACTION_H +#pragma once +#include "common/dout.h" #include "ECUtil.h" -#include "ExtentCache.h" #include "erasure-code/ErasureCodeInterface.h" #include "os/Transaction.h" #include "PGTransaction.h" namespace ECTransaction { - struct WritePlan { - bool invalidates_cache = false; // Yes, both are possible - std::map to_read; - std::map will_write; // superset of to_read +class WritePlanObj { + public: + const hobject_t hoid; + std::optional to_read; + ECUtil::shard_extent_set_t will_write; + const ECUtil::HashInfoRef hinfo; + const ECUtil::HashInfoRef shinfo; + const uint64_t orig_size; + uint64_t projected_size; + bool invalidates_cache; + bool do_parity_delta_write = false; + + WritePlanObj( + const hobject_t &hoid, + const PGTransaction::ObjectOperation &op, + const ECUtil::stripe_info_t &sinfo, + const shard_id_set readable_shards, + const shard_id_set writable_shards, + const bool object_in_cache, + uint64_t orig_size, + const std::optional &oi, + const std::optional &soi, + const ECUtil::HashInfoRef &&hinfo, + const ECUtil::HashInfoRef &&shinfo, + const unsigned pdw_write_mode); + + void print(std::ostream &os) const { + os << "to_read: " << to_read + << " will_write: " << will_write + << " hinfo: " << hinfo + << " shinfo: " << shinfo + << " orig_size: " << orig_size + << " projected_size: " << projected_size + << " invalidates_cache: " << invalidates_cache + << " do_pdw: " << do_parity_delta_write; + } +}; - std::map hash_infos; - }; +struct WritePlan { + bool want_read; + std::list plans; + + void print(std::ostream &os) const { + os << " { plans : "; + bool first = true; + for (auto && p : plans) { + if (first) { + first = false; + } else { + os << ", "; + } + os << p; + } + os << "}"; + } +}; - template - WritePlan get_write_plan( +class Generate { + PGTransaction &t; + const ErasureCodeInterfaceRef &ec_impl; + const pg_t &pgid; + const ECUtil::stripe_info_t &sinfo; + shard_id_map &transactions; + DoutPrefixProvider *dpp; + const OSDMapRef &osdmap; + pg_log_entry_t *entry; + const hobject_t &oid; + PGTransaction::ObjectOperation& op; + ObjectContextRef obc; + std::map> xattr_rollback; + const WritePlanObj &plan; + std::optional read_sem; + ECUtil::shard_extent_map_t to_write; + std::vector> rollback_extents; + std::vector rollback_shards; + uint32_t fadvise_flags = 0; + + void all_shards_written(); + void shard_written(const shard_id_t shard); + void shards_written(const shard_id_set &shards); + void delete_first(); + void zero_truncate_to_delete(); 
+ void process_init(); + void encode_and_write(); + void truncate(); + void overlay_writes(); + void appends_and_clone_ranges(); + void written_and_present_shards(); + void attr_updates(); + void handle_deletes(); + + public: + Generate(PGTransaction &t, + ErasureCodeInterfaceRef &ec_impl, pg_t &pgid, const ECUtil::stripe_info_t &sinfo, - PGTransaction& t, - F &&get_hinfo, - DoutPrefixProvider *dpp) { - WritePlan plan; - t.safe_create_traverse( - [&](std::pair &i) { - const auto& [obj, op] = i; - ECUtil::HashInfoRef hinfo = get_hinfo(obj); - plan.hash_infos[obj] = hinfo; - - uint64_t projected_size = - hinfo->get_projected_total_logical_size(sinfo); - - if (op.deletes_first()) { - ldpp_dout(dpp, 20) << __func__ << ": delete, setting projected size" - << " to 0" << dendl; - projected_size = 0; - } - - hobject_t source; - if (op.has_source(&source)) { - // typically clone or mv - plan.invalidates_cache = true; - - ECUtil::HashInfoRef shinfo = get_hinfo(source); - projected_size = shinfo->get_projected_total_logical_size(sinfo); - plan.hash_infos[source] = shinfo; - } - - auto &will_write = plan.will_write[obj]; - if (op.truncate && - op.truncate->first < projected_size) { - if (!(sinfo.logical_offset_is_stripe_aligned( - op.truncate->first))) { - plan.to_read[obj].union_insert( - sinfo.logical_to_prev_stripe_offset(op.truncate->first), - sinfo.get_stripe_width()); - - ldpp_dout(dpp, 20) << __func__ << ": unaligned truncate" << dendl; - - will_write.union_insert( - sinfo.logical_to_prev_stripe_offset(op.truncate->first), - sinfo.get_stripe_width()); - } - projected_size = sinfo.logical_to_next_stripe_offset( - op.truncate->first); - } - - extent_set raw_write_set; - for (auto &&extent: op.buffer_updates) { - using BufferUpdate = PGTransaction::ObjectOperation::BufferUpdate; - if (boost::get(&(extent.get_val()))) { - ceph_assert( - 0 == - "CloneRange is not allowed, do_op should have returned ENOTSUPP"); - } - raw_write_set.insert(extent.get_off(), extent.get_len()); - } - - auto orig_size = projected_size; - for (auto extent = raw_write_set.begin(); - extent != raw_write_set.end(); - ++extent) { - uint64_t head_start = - sinfo.logical_to_prev_stripe_offset(extent.get_start()); - uint64_t head_finish = - sinfo.logical_to_next_stripe_offset(extent.get_start()); - if (head_start > projected_size) { - head_start = projected_size; - } - if (head_start != head_finish && - head_start < orig_size) { - ceph_assert(head_finish <= orig_size); - ceph_assert(head_finish - head_start == sinfo.get_stripe_width()); - ldpp_dout(dpp, 20) << __func__ << ": reading partial head stripe " - << head_start << "~" << sinfo.get_stripe_width() - << dendl; - plan.to_read[obj].union_insert( - head_start, sinfo.get_stripe_width()); - } - - uint64_t tail_start = - sinfo.logical_to_prev_stripe_offset( - extent.get_start() + extent.get_len()); - uint64_t tail_finish = - sinfo.logical_to_next_stripe_offset( - extent.get_start() + extent.get_len()); - if (tail_start != tail_finish && - (head_start == head_finish || tail_start != head_start) && - tail_start < orig_size) { - ceph_assert(tail_finish <= orig_size); - ceph_assert(tail_finish - tail_start == sinfo.get_stripe_width()); - ldpp_dout(dpp, 20) << __func__ << ": reading partial tail stripe " - << tail_start << "~" << sinfo.get_stripe_width() - << dendl; - plan.to_read[obj].union_insert( - tail_start, sinfo.get_stripe_width()); - } - - if (head_start != tail_finish) { - ceph_assert( - sinfo.logical_offset_is_stripe_aligned( - tail_finish - head_start) - ); - 
will_write.union_insert( - head_start, tail_finish - head_start); - if (tail_finish > projected_size) - projected_size = tail_finish; - } else { - ceph_assert(tail_finish <= projected_size); - } - } - - if (op.truncate && op.truncate->second > projected_size) { - uint64_t truncating_to = - sinfo.logical_to_next_stripe_offset(op.truncate->second); - ldpp_dout(dpp, 20) << __func__ << ": truncating out to " - << truncating_to - << dendl; - will_write.union_insert(projected_size, - truncating_to - projected_size); - projected_size = truncating_to; - } - - ldpp_dout(dpp, 20) << __func__ << ": " << obj - << " projected size " - << projected_size - << dendl; - hinfo->set_projected_total_logical_size( - sinfo, - projected_size); - - /* validate post conditions: - * to_read should have an entry for `obj` if it isn't empty - * and if we are reading from `obj`, we can't be renaming or - * cloning it */ - ceph_assert(plan.to_read.count(obj) == 0 || - (!plan.to_read.at(obj).empty() && - !i.second.has_source())); - }); - return plan; - } + const std::map &partial_extents, + std::map *written_map, + shard_id_map &transactions, + const OSDMapRef &osdmap, + const hobject_t &oid, PGTransaction::ObjectOperation &op, + WritePlanObj &plan, + DoutPrefixProvider *dpp, + pg_log_entry_t *entry); +}; - void generate_transactions( - PGTransaction* _t, +void generate_transactions( + PGTransaction *_t, WritePlan &plan, - ceph::ErasureCodeInterfaceRef &ecimpl, + ceph::ErasureCodeInterfaceRef &ec_impl, pg_t pgid, const ECUtil::stripe_info_t &sinfo, - const std::map &partial_extents, + const std::map &partial_extents, std::vector &entries, - std::map *written, - std::map *transactions, + std::map *written_map, + shard_id_map *transactions, std::set *temp_added, std::set *temp_removed, DoutPrefixProvider *dpp, - const ceph_release_t require_osd_release = ceph_release_t::unknown); -}; - -#endif + const OSDMapRef &osdmap + ); +} diff --git a/src/osd/ECUtil.cc b/src/osd/ECUtil.cc index d24095809f5f5..9baf951e8b6ab 100644 --- a/src/osd/ECUtil.cc +++ b/src/osd/ECUtil.cc @@ -9,215 +9,1100 @@ #include "global/global_context.h" #include "include/encoding.h" -/* This file is soon going to be replaced (before next release), so we are going - * to simply ignore all deprecated warnings. 
- * */ -IGNORE_DEPRECATED - using namespace std; using ceph::bufferlist; using ceph::ErasureCodeInterfaceRef; using ceph::Formatter; -std::pair ECUtil::stripe_info_t::chunk_aligned_offset_len_to_chunk( - std::pair in) const { - pair tmp = offset_len_to_stripe_bounds(in); +template +using shard_id_map = shard_id_map; + +std::pair +ECUtil::stripe_info_t::chunk_aligned_ro_range_to_shard_ro_range( + uint64_t _off, uint64_t _len) const { + auto [off, len] = ro_offset_len_to_stripe_ro_offset_len(_off, _len); return std::make_pair( - chunk_aligned_logical_offset_to_chunk_offset(tmp.first), - chunk_aligned_logical_size_to_chunk_size(tmp.second)); + chunk_aligned_ro_offset_to_chunk_offset(off), + chunk_aligned_ro_length_to_shard_length(len)); } -int ECUtil::decode( - const stripe_info_t &sinfo, - ErasureCodeInterfaceRef &ec_impl, - const set want_to_read, - map &to_decode, - bufferlist *out) -{ - ceph_assert(to_decode.size()); +/* +ASCII Art describing the various variables in the following function: + start end + | | + | | + | | + - - - - - -v- -+---+-----------+ - - - - - - + start_adj| | | ^ +to_read.offset - ->-------+ | | chunk_size + | | | v + +------+ - - - - - + - - - - - + - - - - - - + | | | + | v | + | - - - - +-------+ + | end_adj| + | +-------+ + | | | + +--------------+ | + | | + | shard | + +Given an offset and size, this adds to a vector of extents describing the +minimal IO ranges on each shard. If passed, this method will also populate +a superset of all extents required. + */ +void ECUtil::stripe_info_t::ro_range_to_shards( + uint64_t ro_offset, + uint64_t ro_size, + shard_extent_set_t *shard_extent_set, + extent_set *extent_superset, + buffer::list *bl, + shard_extent_map_t *shard_extent_map) const { + // Some of the maths below assumes size not zero. + if (ro_size == 0) { + return; + } + + uint64_t k = get_k(); + + // Aim is to minimise non-^2 divs (chunk_size is assumed to be a power of 2). + // These should be the only non ^2 divs. + uint64_t begin_div = ro_offset / stripe_width; + uint64_t end_div = (ro_offset + ro_size + stripe_width - 1) / stripe_width - + 1; + uint64_t start = begin_div * chunk_size; + uint64_t end = end_div * chunk_size; + + uint64_t start_shard = (ro_offset - begin_div * stripe_width) / chunk_size; + uint64_t chunk_count = (ro_offset + ro_size + chunk_size - 1) / chunk_size - + ro_offset / chunk_size;; + + // The end_shard needs a modulus to calculate the actual shard, however + // it is convenient to store it like this for the loop. + auto end_shard = start_shard + std::min(chunk_count, k); + + // The last shard is the raw shard index which contains the last chunk. + // Is it possible to calculate this without th e +%? + uint64_t last_shard = (start_shard + chunk_count - 1) % k; + + uint64_t buffer_shard_start_offset = 0; + + for (auto i = start_shard; i < end_shard; i++) { + raw_shard_id_t raw_shard(i >= k ? i - k : i); + + // Adjust the start and end blocks if needed. + uint64_t start_adj = 0; + uint64_t end_adj = 0; + + if (raw_shard < start_shard) { + // Shards before the start, must start on the next chunk. + start_adj = chunk_size; + } else if (int(raw_shard) == int(start_shard)) { + // The start shard itself needs to be moved a partial-chunk forward. + start_adj = ro_offset % chunk_size; + } + + // The end is similar to the start, but the end must be rounded up. 
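+    // For illustration, assume k=4 data shards and chunk_size=4096
+    // (stripe_width=16384). A request of ro_offset=6000, ro_size=3000
+    // gives start=0, end=0, start_shard=1, last_shard=2. Raw shard 1 gets
+    // start_adj=6000%4096=1904 and end_adj=4096 (extent 1904~2192), raw
+    // shard 2 gets start_adj=0 and end_adj=((9000-1)%4096)+1=808
+    // (extent 0~808); no other shard is touched.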
+ if (raw_shard < last_shard) { + end_adj = chunk_size; + } else if (int(raw_shard) == int(last_shard)) { + end_adj = (ro_offset + ro_size - 1) % chunk_size + 1; + } + + shard_id_t shard = get_shard(raw_shard); + + uint64_t off = start + start_adj; + uint64_t len = end + end_adj - start - start_adj; + if (shard_extent_set) { + (*shard_extent_set)[shard].union_insert(off, len); + } + + if (extent_superset) { + extent_superset->union_insert(off, len); + } - uint64_t total_data_size = to_decode.begin()->second.length(); - ceph_assert(total_data_size % sinfo.get_chunk_size() == 0); + if (shard_extent_map) { + ceph_assert(bl); + buffer::list shard_bl; - ceph_assert(out); - ceph_assert(out->length() == 0); + uint64_t bl_offset = buffer_shard_start_offset; - for (map::iterator i = to_decode.begin(); - i != to_decode.end(); - ++i) { - ceph_assert(i->second.length() == total_data_size); + // Start with any partial chunks. + if (chunk_size != start_adj) { + shard_bl.substr_of(*bl, bl_offset, + min(static_cast(bl->length()) - bl_offset, + chunk_size - start_adj)); + buffer_shard_start_offset += chunk_size - start_adj; + bl_offset += chunk_size - start_adj + (k - 1) * chunk_size; + } else { + buffer_shard_start_offset += chunk_size; + } + while (bl_offset < bl->length()) { + buffer::list tmp; + tmp.substr_of(*bl, bl_offset, + min(chunk_size, bl->length() - bl_offset)); + shard_bl.append(tmp); + bl_offset += k * chunk_size; + } + shard_extent_map->insert_in_shard(shard, off, shard_bl, ro_offset, + ro_offset + ro_size); + } } +} - if (total_data_size == 0) - return 0; +void ECUtil::stripe_info_t::trim_shard_extent_set_for_ro_offset( + uint64_t ro_offset, + shard_extent_set_t &shard_extent_set) const { + /* If the offset is within the first shard, then the remaining shards are + * not written and we don't need to generated zeros for either */ + int ro_offset_shard = (ro_offset / chunk_size) % k; + if (ro_offset_shard == 0) { + uint64_t shard_offset = ro_offset_to_shard_offset( + ro_offset, raw_shard_id_t(0)); + for (auto &&iter = shard_extent_set.begin(); iter != shard_extent_set.end() + ;) { + iter->second.erase_after(align_page_next(shard_offset)); + if (iter->second.empty()) iter = shard_extent_set.erase(iter); + else ++iter; + } + } +} + +void ECUtil::stripe_info_t::ro_size_to_stripe_aligned_read_mask( + uint64_t ro_size, + shard_extent_set_t &shard_extent_set) const { + ro_range_to_shard_extent_set_with_parity( + 0, ro_offset_to_next_stripe_ro_offset(ro_size), shard_extent_set); + trim_shard_extent_set_for_ro_offset(ro_size, shard_extent_set); +} + +void ECUtil::stripe_info_t::ro_size_to_read_mask( + uint64_t ro_size, + shard_extent_set_t &shard_extent_set) const { + ro_range_to_shard_extent_set_with_parity(0, align_page_next(ro_size), + shard_extent_set); +} + +void ECUtil::stripe_info_t::ro_size_to_zero_mask( + uint64_t ro_size, + shard_extent_set_t &shard_extent_set) const { + // There should never be any zero padding on the parity. 
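+  // For illustration, with a 4+2 profile, 4 KiB chunks and an identity
+  // shard mapping, ro_size=6000 produces a zero mask of 0~4096 on raw
+  // shards 2 and 3 (the data shards past the end of the object); the
+  // parity shards get no entry in the mask.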
+ ro_range_to_shard_extent_set(align_page_next(ro_size), + ro_offset_to_next_stripe_ro_offset(ro_size) - + align_page_next(ro_size), + shard_extent_set); + trim_shard_extent_set_for_ro_offset(ro_size, shard_extent_set); +} + +namespace ECUtil { +void shard_extent_map_t::erase_after_ro_offset(uint64_t ro_offset) { + /* Ignore the null case */ + if (ro_offset >= ro_end) { + return; + } - for (uint64_t i = 0; i < total_data_size; i += sinfo.get_chunk_size()) { - map chunks; - for (map::iterator j = to_decode.begin(); - j != to_decode.end(); - ++j) { - chunks[j->first].substr_of(j->second, i, sinfo.get_chunk_size()); + shard_extent_set_t ro_to_erase(sinfo->get_k_plus_m()); + sinfo->ro_range_to_shard_extent_set(ro_offset, ro_end - ro_start, + ro_to_erase); + for (auto &&[shard, eset] : ro_to_erase) { + if (extent_maps.contains(shard)) { + extent_maps[shard].erase(eset.range_start(), eset.range_end()); + } + + // If the result is empty, delete the extent map. + if (extent_maps[shard].empty()) { + extent_maps.erase(shard); } - bufferlist bl; - int r = ec_impl->decode_concat(want_to_read, chunks, &bl); - ceph_assert(r == 0); - ceph_assert(bl.length() % sinfo.get_chunk_size() == 0); - out->claim_append(bl); } - return 0; + + compute_ro_range(); +} + +shard_extent_map_t shard_extent_map_t::intersect_ro_range( + uint64_t ro_offset, + uint64_t ro_length) const { + // Optimise (common) use case where the overlap is everything + if (ro_offset <= ro_start && + ro_offset + ro_length >= ro_end) { + return *this; + } + + // Optimise (common) use cases where the overlap is nothing + if (ro_offset >= ro_end || + ro_offset + ro_length <= ro_start) { + return shard_extent_map_t(sinfo); + } + + shard_extent_set_t ro_to_intersect(sinfo->get_k_plus_m()); + sinfo->ro_range_to_shard_extent_set(ro_offset, ro_length, ro_to_intersect); + + return intersect(ro_to_intersect); +} + +shard_extent_map_t shard_extent_map_t::intersect( + optional const &other) const { + if (!other) { + return shard_extent_map_t(sinfo); + } + + return intersect(*other); +} + +shard_extent_map_t shard_extent_map_t::intersect( + shard_extent_set_t const &other) const { + shard_extent_map_t out(sinfo); + out.ro_end = 0; + out.end_offset = 0; + + for (auto &&[shard, this_eset] : other) { + if (extent_maps.contains(shard)) { + extent_map tmp; + extent_set eset; + extent_maps.at(shard).to_interval_set(eset); + eset.intersection_of(this_eset); + + for (auto [offset, len] : eset) { + bufferlist bl; + get_buffer(shard, offset, len, bl); + tmp.insert(offset, len, bl); + } + if (!tmp.empty()) { + uint64_t range_start = tmp.get_start_off(); + uint64_t range_end = tmp.get_end_off(); + + out.start_offset = min(out.start_offset, range_start); + out.end_offset = max(out.end_offset, range_end); + + raw_shard_id_t raw_shard = sinfo->get_raw_shard(shard); + if (raw_shard < sinfo->get_k()) { + out.ro_start = std::min(out.ro_start, + calc_ro_offset(raw_shard, range_start)); + out.ro_end = std::max(out.ro_end, calc_ro_end(raw_shard, range_end)); + } + + out.extent_maps.emplace(shard, std::move(tmp)); + } + } + } + + if (out.ro_start == invalid_offset) { + out.ro_end = out.end_offset = invalid_offset; + } + + return out; +} + +void shard_extent_map_t::insert(shard_extent_map_t const &other) { + for (auto &&[shard, emap] : other.extent_maps) { + if (!extent_maps.contains(shard)) { + extent_maps.emplace(shard, emap); + } else { + extent_maps[shard].insert(emap); + } + } + + if (ro_start == invalid_offset || other.ro_start < ro_start) { + ro_start = other.ro_start; + } 
+ if (ro_end == invalid_offset || other.ro_end > ro_end) { + ro_end = other.ro_end; + } + if (start_offset == invalid_offset || other.start_offset < start_offset) { + start_offset = other.start_offset; + } + if (end_offset == invalid_offset || other.end_offset > end_offset) { + end_offset = other.end_offset; + } +} + +uint64_t shard_extent_map_t::size() { + uint64_t size = 0; + for (auto &i : extent_maps) { + for (auto &j : i.second) { + size += j.get_len(); + } + } + + return size; +} + +void shard_extent_map_t::clear() { + ro_start = ro_end = start_offset = end_offset = invalid_offset; + extent_maps.clear(); +} + +void shard_extent_map_t::deep_copy(shard_extent_map_t const &other) { + for (auto &&[shard, emap] : other.extent_maps) { + for (auto iter : emap) { + uint64_t off = iter.get_off(); + uint64_t len = iter.get_len(); + bufferlist bl = iter.get_val(); + bl.rebuild(); + extent_maps[shard].insert(off, len, bl); + } + } +} + +/* Insert a buffer for a particular shard. + * NOTE: DO NOT CALL sinfo->get_min_want_shards() + */ +void shard_extent_map_t::insert_in_shard(shard_id_t shard, uint64_t off, + const buffer::list &bl) { + if (bl.length() == 0) { + return; + } + + extent_maps[shard].insert(off, bl.length(), bl); + raw_shard_id_t raw_shard = sinfo->get_raw_shard(shard); + + if (raw_shard > sinfo->get_k()) { + return; + } + + uint64_t new_start = calc_ro_offset(sinfo->get_raw_shard(shard), off); + uint64_t new_end = + calc_ro_end(sinfo->get_raw_shard(shard), off + bl.length()); + if (empty()) { + ro_start = new_start; + ro_end = new_end; + start_offset = off; + end_offset = off + bl.length(); + } else { + ro_start = min(ro_start, new_start); + ro_end = max(ro_end, new_end); + start_offset = min(start_offset, off); + end_offset = max(end_offset, off + bl.length()); + } } -int ECUtil::decode( - const stripe_info_t &sinfo, - ErasureCodeInterfaceRef &ec_impl, - map &to_decode, - map &out) { +/* Insert a buffer for a particular shard. + * If the client knows the new start and end, use this interface to improve + * performance. + */ +void shard_extent_map_t::insert_in_shard(shard_id_t shard, uint64_t off, + const buffer::list &bl, + uint64_t new_start, uint64_t new_end) { + if (bl.length() == 0) { + return; + } + + extent_maps[shard].insert(off, bl.length(), bl); + if (empty()) { + ro_start = new_start; + ro_end = new_end; + start_offset = off; + end_offset = off + bl.length(); + } else { + ro_start = min(ro_start, new_start); + ro_end = max(ro_end, new_end); + start_offset = min(start_offset, off); + end_offset = max(end_offset, off + bl.length()); + } +} - ceph_assert(to_decode.size()); +/* Insert a region of zeros in rados object address space.. + */ +void shard_extent_map_t::insert_ro_zero_buffer(uint64_t ro_offset, + uint64_t ro_length) { + buffer::list zero_buffer; + zero_buffer.append_zero(ro_length); + sinfo->ro_range_to_shard_extent_map(ro_offset, ro_length, zero_buffer, *this); +} - for (auto &&i : to_decode) { - if(i.second.length() == 0) - return 0; +/* Append zeros to the extent maps, such that all bytes from the current end + * of the rados object range to the specified offset are zero. Note that the + * byte at ro_offset does NOT get populated, so that this works as an + * addition to length. + */ +void shard_extent_map_t::append_zeros_to_ro_offset(uint64_t ro_offset) { + uint64_t _ro_end = ro_end == invalid_offset ? 
0 : ro_end; + if (ro_offset <= _ro_end) { + return; } + uint64_t append_offset = _ro_end; + uint64_t append_length = ro_offset - _ro_end; + insert_ro_zero_buffer(append_offset, append_length); +} - set need; - for (map::iterator i = out.begin(); - i != out.end(); - ++i) { - ceph_assert(i->second); - ceph_assert(i->second->length() == 0); - need.insert(i->first); +/* This method rearranges buffers from a rados object extent map into a shard + * extent map. Note that it is a simple transformation, it does NOT perform + * any encoding of parity shards. + */ +void shard_extent_map_t::insert_ro_extent_map(const extent_map &host_extent_map) { + for (auto &&range = host_extent_map.begin(); + range != host_extent_map.end(); + ++range) { + buffer::list bl = range.get_val(); + sinfo->ro_range_to_shard_extent_map( + range.get_off(), + range.get_len(), + bl, + *this); } +} - set avail; - for (auto &&i : to_decode) { - ceph_assert(i.second.length() != 0); - avail.insert(i.first); +extent_set shard_extent_map_t::get_extent_superset() const { + extent_set eset; + + for (auto &&[shard, emap] : extent_maps) { + emap.to_interval_set(eset); } - map>> min; - int r = ec_impl->minimum_to_decode(need, avail, &min); - ceph_assert(r == 0); + return eset; +} - int chunks_count = 0; - int repair_data_per_chunk = 0; - int subchunk_size = sinfo.get_chunk_size()/ec_impl->get_sub_chunk_count(); +void shard_extent_map_t::insert_parity_buffers() { + extent_set encode_set = get_extent_superset(); - for (auto &&i : to_decode) { - auto found = min.find(i.first); - if (found != min.end()) { - int repair_subchunk_count = 0; - for (auto& subchunks : min[i.first]) { - repair_subchunk_count += subchunks.second; + /* Invent buffers for the parity coding, if they were not provided. + * e.g. appends will not provide parity buffers. + * We should EITHER have no buffers, or have the right buffers. + */ + for (raw_shard_id_t raw_shard(sinfo->get_k()); raw_shard < sinfo-> + get_k_plus_m(); ++raw_shard) { + shard_id_t shard = sinfo->get_shard(raw_shard); + + for (auto &&[offset, length] : encode_set) { + /* No need to recreate buffers we already have */ + if (extent_maps.contains(shard)) { + extent_map emap = extent_maps.at(shard); + if (emap.contains(offset, length)) + continue; } - repair_data_per_chunk = repair_subchunk_count * subchunk_size; - chunks_count = (int)i.second.length() / repair_data_per_chunk; - break; + bufferlist bl; + bl.push_back(buffer::create_aligned(length, CEPH_PAGE_SIZE)); + extent_maps[shard].insert(offset, length, bl); } } +} + +slice_iterator shard_extent_map_t::begin_slice_iterator( + const shard_id_set &out) { + return slice_iterator(extent_maps, out); +} + +/* Encode parity chunks, using the encode_chunks interface into the + * erasure coding. This generates all parity using full stripe writes. 
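+ * Slices are taken across the populated shards; if any slice turns out
+ * not to be page aligned, the buffers are padded and rebuilt to page
+ * alignment and the encode restarts over the aligned buffers.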
+ */ +int shard_extent_map_t::_encode(const ErasureCodeInterfaceRef &ec_impl) { + shard_id_set out_set = sinfo->get_parity_shards(); + bool rebuild_req = false; + + for (auto iter = begin_slice_iterator(out_set); !iter.is_end(); ++iter) { + if (!iter.is_page_aligned()) { + rebuild_req = true; + break; + } - for (int i = 0; i < chunks_count; i++) { - map chunks; - for (auto j = to_decode.begin(); - j != to_decode.end(); - ++j) { - chunks[j->first].substr_of(j->second, - i*repair_data_per_chunk, - repair_data_per_chunk); + shard_id_map &in = iter.get_in_bufferptrs(); + shard_id_map &out = iter.get_out_bufferptrs(); + + if (int ret = ec_impl->encode_chunks(in, out)) { + return ret; } - map out_bls; - r = ec_impl->decode(need, chunks, &out_bls, sinfo.get_chunk_size()); - ceph_assert(r == 0); - for (auto j = out.begin(); j != out.end(); ++j) { - ceph_assert(out_bls.count(j->first)); - ceph_assert(out_bls[j->first].length() == sinfo.get_chunk_size()); - j->second->claim_append(out_bls[j->first]); + } + + if (rebuild_req) { + pad_and_rebuild_to_page_align(); + return _encode(ec_impl); + } + + return 0; +} + +/* Encode parity chunks, using the encode_chunks interface into the + * erasure coding. This generates all parity using full stripe writes. + */ +int shard_extent_map_t::encode(const ErasureCodeInterfaceRef &ec_impl, + const HashInfoRef &hinfo, + uint64_t before_ro_size) { + int r = _encode(ec_impl); + + if (!r && hinfo && ro_start >= before_ro_size) { + /* NEEDS REVIEW: The following calculates the new hinfo CRCs. This is + * currently considering ALL the buffers, including the + * parity buffers. Is this really right? + * Also, does this really belong here? Its convenient + * because have just built the buffer list... + */ + shard_id_set full_set; + full_set.insert_range(shard_id_t(0), sinfo->get_k_plus_m()); + for (auto iter = begin_slice_iterator(full_set); !iter.is_end(); ++iter) { + ceph_assert(ro_start == before_ro_size); + hinfo->append(iter.get_offset(), iter.get_in_bufferptrs()); } } - for (auto &&i : out) { - ceph_assert(i.second->length() == chunks_count * sinfo.get_chunk_size()); + + return r; +} + +/* Encode parity chunks, using the parity delta write interfaces on plugins + * that support them. 
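+ * For each data shard being overwritten, encode_delta() computes a delta
+ * between the old and new page-aligned buffers, and apply_delta() applies
+ * that delta to the existing parity buffers.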
+ */ +int shard_extent_map_t::encode_parity_delta( + const ErasureCodeInterfaceRef &ec_impl, + shard_extent_map_t &old_sem) { + shard_id_set out_set = sinfo->get_parity_shards(); + + pad_and_rebuild_to_page_align(); + old_sem.pad_and_rebuild_to_page_align(); + + for (auto data_shard : sinfo->get_data_shards()) { + shard_extent_map_t s(sinfo); + if (!contains_shard(data_shard)) { + continue; + } + s.extent_maps[shard_id_t(0)] = old_sem.extent_maps[data_shard]; + s.extent_maps[shard_id_t(1)] = extent_maps[data_shard]; + for (shard_id_t parity_shard : sinfo->get_parity_shards()) { + if (extent_maps.contains(parity_shard)) { + s.extent_maps[parity_shard] = extent_maps[parity_shard]; + } + } + + s.compute_ro_range(); + + for (auto iter = s.begin_slice_iterator(out_set); !iter.is_end(); ++iter) { + ceph_assert(iter.is_page_aligned()); + shard_id_map &data_shards = iter.get_in_bufferptrs(); + shard_id_map &parity_shards = iter.get_out_bufferptrs(); + + unsigned int size = iter.get_length(); + ceph_assert(size % 4096 == 0); + ceph_assert(size > 0); + bufferptr delta = buffer::create_aligned(size, CEPH_PAGE_SIZE); + + if (data_shards[shard_id_t(0)].length() != 0 && data_shards[shard_id_t(1)] + .length() != 0) { + ec_impl->encode_delta(data_shards[shard_id_t(0)], + data_shards[shard_id_t(1)], &delta); + shard_id_map in(sinfo->get_k_plus_m()); + in.emplace(data_shard, delta); + ec_impl->apply_delta(in, parity_shards); + } + } } + + compute_ro_range(); return 0; } -int ECUtil::encode( - const stripe_info_t &sinfo, - ErasureCodeInterfaceRef &ec_impl, - bufferlist &in, - const set &want, - map *out) { +void shard_extent_map_t::pad_on_shards(const shard_extent_set_t &pad_to, + const shard_id_set &shards) { + for (auto &shard : shards) { + if (!pad_to.contains(shard)) { + continue; + } + for (auto &[off, length] : pad_to.at(shard)) { + bufferlist bl; + bl.push_back(buffer::create_aligned(length, CEPH_PAGE_SIZE)); + insert_in_shard(shard, off, bl); + } + } +} - uint64_t logical_size = in.length(); +void shard_extent_map_t::pad_on_shards(const extent_set &pad_to, + const shard_id_set &shards) { + for (auto &shard : shards) { + for (auto &[off, length] : pad_to) { + bufferlist bl; + bl.push_back(buffer::create_aligned(length, CEPH_PAGE_SIZE)); + insert_in_shard(shard, off, bl); + } + } +} + +/* Trim to the specified extent set. Note that this will panic if the shard + * extent set does not contain the extents described in trim_to. + */ +void shard_extent_map_t::trim(const shard_extent_set_t &trim_to) { - ceph_assert(logical_size % sinfo.get_stripe_width() == 0); - ceph_assert(out); - ceph_assert(out->empty()); + // Erase any shards missing from trim_to + for ( auto iter = extent_maps.begin(); iter != extent_maps.end();) { + auto && [shard, emap] = *iter; + if (!trim_to.contains(shard)) { + iter = extent_maps.erase(iter); + } else { + ++iter; + } + } + for (auto &&[shard, want_eset] : trim_to) { + extent_set tmp; + ceph_assert(extent_maps.contains(shard)); + extent_map &emap = extent_maps.at(shard); + emap.to_interval_set(tmp); + ceph_assert(tmp.contains(want_eset)); + + // Now trim to what was requested. 
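+    // tmp currently holds everything this shard has; subtracting the
+    // wanted ranges leaves the surplus extents, which are then erased.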
+ if (tmp.size() != want_eset.size()) { + tmp.subtract(trim_to.at(shard)); + for (auto [off, len] : tmp) { + emap.erase(off, len); + } + } + } - if (logical_size == 0) + compute_ro_range(); +} + +int shard_extent_map_t::decode(const ErasureCodeInterfaceRef &ec_impl, + const shard_extent_set_t &want, + uint64_t object_size) { + shard_id_set want_set; + shard_id_set have_set; + want.populate_shard_id_set(want_set); + extent_maps.populate_bitset_set(have_set); + + shard_id_set need_set = shard_id_set::difference(want_set, have_set); + + /* Optimise the no-op */ + if (need_set.empty()) { return 0; + } + + if (add_zero_padding_for_decode(object_size, need_set)) { + // We added some zero buffers, which means our have and need set may change + extent_maps.populate_bitset_set(have_set); + need_set = shard_id_set::difference(want_set, have_set); + } + + shard_id_set decode_set = shard_id_set::intersection(need_set, sinfo->get_data_shards()); + shard_id_set encode_set = shard_id_set::intersection(need_set, sinfo->get_parity_shards()); + int r = 0; + if (!decode_set.empty()) { + pad_on_shards(want, decode_set); + /* If we are going to be encoding, we need to make sure all the necessary + * shards are decoded. The get_min_available functions should have already + * worked out what needs to be read for this. + */ + extent_set decode_for_parity; + for (auto shard : encode_set) { + decode_for_parity.insert(want.at(shard)); + } + pad_on_shards(decode_for_parity, decode_set); + r = _decode(ec_impl, want_set, decode_set); + } + if (!r && !encode_set.empty()) { + pad_on_shards(want, encode_set); + r = _encode(ec_impl); + } + + // If we failed to decode, then bail out, or the trimming below might fail. + if (r) { + return r; + } + + /* Some of the above can invent buffers. There are some edge cases whereby + * they can invent buffers outside the want extent_set which are actually + * invalid. So here, we trim off those buffers. + */ + trim(want); + + return 0; +} + +int shard_extent_map_t::_decode(const ErasureCodeInterfaceRef &ec_impl, + const shard_id_set &want_set, + const shard_id_set &need_set) { + bool rebuild_req = false; + for (auto iter = begin_slice_iterator(need_set); !iter.is_end(); ++iter) { + if (!iter.is_page_aligned()) { + rebuild_req = true; + break; + } + shard_id_map &in = iter.get_in_bufferptrs(); + shard_id_map &out = iter.get_out_bufferptrs(); - for (uint64_t i = 0; i < logical_size; i += sinfo.get_stripe_width()) { - map encoded; - bufferlist buf; - buf.substr_of(in, i, sinfo.get_stripe_width()); - int r = ec_impl->encode(want, buf, &encoded); - ceph_assert(r == 0); - for (map::iterator i = encoded.begin(); - i != encoded.end(); - ++i) { - ceph_assert(i->second.length() == sinfo.get_chunk_size()); - (*out)[i->first].claim_append(i->second); + if (int ret = ec_impl->decode_chunks(want_set, in, out)) { + return ret; } } - for (map::iterator i = out->begin(); - i != out->end(); - ++i) { - ceph_assert(i->second.length() % sinfo.get_chunk_size() == 0); - ceph_assert( - sinfo.aligned_chunk_offset_to_logical_offset(i->second.length()) == - logical_size); + if (rebuild_req) { + pad_and_rebuild_to_page_align(); + return _decode(ec_impl, want_set, need_set); } + + compute_ro_range(); + return 0; } +void shard_extent_map_t::pad_and_rebuild_to_page_align() { + bool resized = false; + for (auto &&[shard, emap] : extent_maps) { + extent_map aligned; + + // Inserting while iterating is not supported in extent maps, make the + // iterated-over emap const to help defend against mistakes. 
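+    // For example, with CEPH_PAGE_SIZE=4096 an extent at 6000~200 becomes
+    // 4096~4096: 1904 zero bytes are prepended and 1992 are appended so
+    // both ends land on page boundaries.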
+ const extent_map &cemap = emap; + for (auto i = cemap.begin(); i != cemap.end(); ++i) { + bool resized_i = false; + bufferlist bl = i.get_val(); + uint64_t start = i.get_off(); + uint64_t end = start + i.get_len(); + + if ((start & ~CEPH_PAGE_MASK) != 0) { + bl.prepend_zero(start - (start & CEPH_PAGE_MASK)); + start = start & CEPH_PAGE_MASK; + resized_i = true; + } + if ((end & ~CEPH_PAGE_MASK) != 0) { + bl.append_zero((end & CEPH_PAGE_MASK) + CEPH_PAGE_SIZE - end); + end = (end & CEPH_PAGE_MASK) + CEPH_PAGE_SIZE; + resized_i = true; + } + + // Perhaps we can get away without page aligning here and only SIMD + // align. However, typical workloads are actually page aligned already, + // so this should not cause problems on any sensible workload. + if (bl.rebuild_aligned_size_and_memory(bl.length(), CEPH_PAGE_SIZE) || + resized_i) { + // We are not permitted to modify the emap while iterating. + aligned.insert(start, end - start, bl); + } + if (resized_i) resized = true; + } + emap.insert(aligned); + } + + if (resized) { + compute_ro_range(); + } +} + +shard_extent_map_t shard_extent_map_t::slice_map( + uint64_t offset, uint64_t length) const { + // Range entirely contains offset - this will be common for small IO. + if (offset <= start_offset && offset + length >= end_offset) return *this; + + shard_extent_map_t slice(sinfo); + + // Null cases just generate an empty map. + if (offset >= end_offset) { + return slice; + } + if (offset + length <= start_offset) { + return slice; + } + + slice.end_offset = slice.ro_end = 0; + + for (auto &&[shard, emap] : extent_maps) { + extent_map iemap = emap.intersect(offset, length); + + if (!iemap.empty()) { + slice.start_offset = min(slice.start_offset, iemap.get_start_off()); + slice.end_offset = max(slice.start_offset, iemap.get_end_off()); + slice.ro_start = min(slice.start_offset, + calc_ro_offset(sinfo->get_raw_shard(shard), + iemap.get_start_off())); + slice.ro_end = min(slice.ro_end, + calc_ro_end(sinfo->get_raw_shard(shard), + iemap.get_end_off())); + slice.extent_maps.emplace(shard, iemap); + } + } + + if (slice.end_offset == 0) { + slice.end_offset = slice.ro_end = invalid_offset; + } + + return slice; +} + +void shard_extent_map_t::get_buffer(shard_id_t shard, uint64_t offset, + uint64_t length, + buffer::list &append_to) const { + const extent_map &emap = extent_maps.at(shard); + auto &&range = emap.get_lower_range(offset, length); + + if (range == emap.end() || !emap.contains(offset, length)) { + return; + } + + if (range.get_len() == length) { + buffer::list bl = range.get_val(); + // This should be asserted on extent map insertion. 
+ ceph_assert(bl.length() == length); + append_to.append(bl); + } else { + buffer::list bl; + bl.substr_of(range.get_val(), offset - range.get_off(), length); + append_to.append(bl); + } +} + +void shard_extent_map_t::get_shard_first_buffer(shard_id_t shard, + buffer::list &append_to) const { + if (!extent_maps.contains(shard)) { + return; + } + const extent_map &emap = extent_maps.at(shard); + auto range = emap.begin(); + if (range == emap.end()) { + return; + } + + append_to.append(range.get_val()); +} + +uint64_t shard_extent_map_t::get_shard_first_offset(shard_id_t shard) const { + if (!extent_maps.contains(shard)) { + return invalid_offset; + } + const extent_map &emap = extent_maps.at(shard); + auto range = emap.begin(); + if (range == emap.end()) { + return invalid_offset; + } + + return range.get_off(); +} + +void shard_extent_map_t::zero_pad(shard_extent_set_t const &pad_to) { + for (auto &&[shard, eset] : pad_to) { + for (auto &&[off, len] : eset) { + zero_pad(shard, off, len); + } + } +} + +void shard_extent_map_t::zero_pad(shard_id_t shard, uint64_t offset, + uint64_t length) { + const extent_map &emap = extent_maps[shard]; + if (emap.contains(offset, length)) { + return; + } + + extent_set required; + required.union_insert(offset, length); + extent_set not_required; + emap.to_interval_set(not_required); + required.subtract(not_required); + + for (auto [z_off, z_len] : required) { + bufferlist zeros; + zeros.append_zero(z_len); + insert_in_shard(shard, z_off, zeros); + } +} + +void shard_extent_map_t::pad_with_other(shard_extent_set_t const &pad_to, + shard_extent_map_t const &other) { + for (auto &&[shard, eset] : pad_to) { + for (auto &&[off, len] : eset) { + pad_with_other(shard, off, len, other); + } + } +} + +void shard_extent_map_t::pad_with_other(shard_id_t shard, uint64_t offset, + uint64_t length, + shard_extent_map_t const &other) { + const extent_map &emap = extent_maps[shard]; + if (emap.contains(offset, length)) return; + + extent_set required; + required.union_insert(offset, length); + extent_set not_required; + emap.to_interval_set(not_required); + required.subtract(not_required); + + for (auto [z_off, z_len] : required) { + bufferlist bl; + other.get_buffer(shard, z_off, z_len, bl); + bl.rebuild(); + insert_in_shard(shard, z_off, bl); + } +} + +ECUtil::shard_extent_set_t shard_extent_map_t::get_extent_set() { + shard_extent_set_t shard_eset(sinfo->get_k_plus_m()); + for (auto &&[shard, emap] : extent_maps) { + emap.to_interval_set(shard_eset[shard]); + } + + return shard_eset; +} + +void shard_extent_map_t::erase_shard(shard_id_t shard) { + if (extent_maps.erase(shard)) { + compute_ro_range(); + } +} + +bufferlist shard_extent_map_t::get_ro_buffer( + uint64_t ro_offset, + uint64_t ro_length) const { + bufferlist bl; + uint64_t chunk_size = sinfo->get_chunk_size(); + uint64_t stripe_size = sinfo->get_stripe_width(); + int data_chunk_count = sinfo->get_k(); + + pair read_pair(ro_offset, ro_length); + auto chunk_aligned_read = sinfo->ro_range_to_chunk_ro_range(read_pair); + + raw_shard_id_t raw_shard((ro_offset / chunk_size) % data_chunk_count); + + for (uint64_t chunk_offset = chunk_aligned_read.first; + chunk_offset < chunk_aligned_read.first + chunk_aligned_read.second; + chunk_offset += chunk_size, ++raw_shard) { + if ((int(raw_shard) == data_chunk_count)) { + raw_shard = 0; + } + + uint64_t sub_chunk_offset = std::max(chunk_offset, ro_offset); + uint64_t sub_chunk_shard_offset = (chunk_offset / stripe_size) * chunk_size + + sub_chunk_offset - chunk_offset; + 
uint64_t sub_chunk_len = std::min(ro_offset + ro_length, + chunk_offset + chunk_size) - + sub_chunk_offset; + + get_buffer(sinfo->get_shard(raw_shard), sub_chunk_shard_offset, + sub_chunk_len, bl); + } + return bl; +} + +bufferlist shard_extent_map_t::get_ro_buffer() const { + return get_ro_buffer(ro_start, ro_end - ro_start); +} + +std::string shard_extent_map_t::debug_string(uint64_t interval, uint64_t offset) const { + std::stringstream str; + str << "shard_extent_map_t: " << *this << " bufs: ["; + + bool s_comma = false; + for (auto &&[shard, emap] : get_extent_maps()) { + if (s_comma) str << ", "; + s_comma = true; + str << shard << ": ["; + + bool comma = false; + for (auto &&extent : emap) { + bufferlist bl = extent.get_val(); + char *buf = bl.c_str(); + for (uint64_t i = 0; i < extent.get_len(); i += interval) { + int *seed = (int*)&buf[i + offset]; + if (comma) str << ", "; + str << (i + extent.get_off()) << ":" << std::to_string(*seed); + comma = true; + } + } + str << "]"; + } + str << "]"; + return str.str(); +} + +void shard_extent_map_t::erase_stripe(uint64_t offset, uint64_t length) { + for (auto iter = extent_maps.begin(); iter != extent_maps.end();) { + auto &&[shard, emap] = *iter; + emap.erase(offset, length); + if (emap.empty()) { + iter = extent_maps.erase(iter); + } else { + ++iter; + } + } + compute_ro_range(); +} + +bool shard_extent_map_t::contains(shard_id_t shard) const { + return extent_maps.contains(shard); +} + +bool shard_extent_map_t::contains(optional const &other) const { + if (!other) { + return true; + } + + return contains(*other); +} + +bool shard_extent_map_t::contains(shard_extent_set_t const &other) const { + for (auto &&[shard, other_eset] : other) { + if (!extent_maps.contains(shard)) { + return false; + } + + extent_set eset; + extent_maps.at(shard).to_interval_set(eset); + + if (!eset.contains(other_eset)) { + return false; + } + } + + return true; +} + +void shard_extent_set_t::subtract(const shard_extent_set_t &other) { + for (auto &&[shard, eset] : other) { + if (!contains(shard)) { + continue; + } + + at(shard).subtract(eset); + if (at(shard).empty()) { + erase(shard); + } + } +} + +void shard_extent_set_t::intersection_of(const shard_extent_set_t &other) { + for (shard_id_t s; s < map.max_size(); ++s) { + if (!map.contains(s) || !other.contains(s)) { + erase(s); + } else { + at(s).intersection_of(other.at(s)); + if (at(s).empty()) { + erase(s); + } + } + } +} + +void shard_extent_set_t::insert(const shard_extent_set_t &other) { + for (auto &&[shard, eset] : other) { + map[shard].union_of(other.at(shard)); + } +} +} + void ECUtil::HashInfo::append(uint64_t old_size, - map &to_append) { + shard_id_map &to_append) { ceph_assert(old_size == total_chunk_size); uint64_t size_to_append = to_append.begin()->second.length(); if (has_chunk_hash()) { ceph_assert(to_append.size() == cumulative_shard_hashes.size()); - for (map::iterator i = to_append.begin(); - i != to_append.end(); - ++i) { - ceph_assert(size_to_append == i->second.length()); - ceph_assert((unsigned)i->first < cumulative_shard_hashes.size()); - uint32_t new_hash = i->second.crc32c(cumulative_shard_hashes[i->first]); - cumulative_shard_hashes[i->first] = new_hash; + for (auto &&[shard, ptr] : to_append) { + ceph_assert(size_to_append == ptr.length()); + ceph_assert(shard < static_cast(cumulative_shard_hashes.size())); + cumulative_shard_hashes[int(shard)] = + ceph_crc32c(cumulative_shard_hashes[int(shard)], + (unsigned char*)ptr.c_str(), ptr.length()); } } total_chunk_size += 
size_to_append; } -void ECUtil::HashInfo::encode(bufferlist &bl) const -{ +void ECUtil::HashInfo::encode(bufferlist &bl) const { ENCODE_START(1, 1, bl); encode(total_chunk_size, bl); encode(cumulative_shard_hashes, bl); ENCODE_FINISH(bl); } -void ECUtil::HashInfo::decode(bufferlist::const_iterator &bl) -{ +void ECUtil::HashInfo::decode(bufferlist::const_iterator &bl) { DECODE_START(1, bl); decode(total_chunk_size, bl); decode(cumulative_shard_hashes, bl); - projected_total_chunk_size = total_chunk_size; DECODE_FINISH(bl); } -void ECUtil::HashInfo::dump(Formatter *f) const -{ +void ECUtil::HashInfo::dump(Formatter *f) const { f->dump_unsigned("total_chunk_size", total_chunk_size); f->open_array_section("cumulative_shard_hashes"); for (unsigned i = 0; i != cumulative_shard_hashes.size(); ++i) { @@ -230,25 +1115,61 @@ void ECUtil::HashInfo::dump(Formatter *f) const } namespace ECUtil { -std::ostream& operator<<(std::ostream& out, const HashInfo& hi) -{ +std::ostream &operator<<(std::ostream &out, const HashInfo &hi) { ostringstream hashes; - for (auto hash: hi.cumulative_shard_hashes) + for (auto hash : hi.cumulative_shard_hashes) { hashes << " " << hex << hash; + } return out << "tcs=" << hi.total_chunk_size << hashes.str(); } + +std::ostream &operator<<(std::ostream &out, const shard_extent_map_t &rhs) { + // sinfo not thought to be needed for debug, as it is constant. + return out << "shard_extent_map: ({" << rhs.ro_start << "~" + << rhs.ro_end << "}, maps=" << rhs.extent_maps << ")"; } -void ECUtil::HashInfo::generate_test_instances(list& o) -{ +std::ostream &operator<<(std::ostream &out, const log_entry_t &rhs) { + switch (rhs.event) { + case READ_REQUEST: out << "READ_REQUEST"; + break; + case READ_DONE: out << "READ_DONE"; + break; + case INJECT_EIO: out << "INJECT_EIO"; + break; + case CANCELLED: out << "CANCELLED"; + break; + case ERROR: out << "ERROR"; + break; + case REQUEST_MISSING: out << "REQUEST_MISSING"; + break; + case COMPLETE_ERROR: out << "COMPLETE_ERROR"; + break; + case ERROR_CLEAR: out << "ERROR_CLEAR"; + break; + case COMPLETE: out << "COMPLETE"; + break; + default: + ceph_assert(false); + } + return out << "[" << rhs.shard << "]->" << rhs.io << "\n"; +} +} + +void ECUtil::HashInfo::generate_test_instances(list &o) { o.push_back(new HashInfo(3)); { bufferlist bl; bl.append_zero(20); - map buffers; - buffers[0] = bl; - buffers[1] = bl; - buffers[2] = bl; + + bufferptr bp = bl.begin().get_current_ptr(); + + // We don't have the k+m here, but this is not critical performance, so + // create an oversized map. 
+ shard_id_map buffers(128); + buffers[shard_id_t(0)] = bp; + buffers[shard_id_t(1)] = bp; + buffers[shard_id_t(2)] = bp; o.back()->append(0, buffers); o.back()->append(20, buffers); } @@ -257,14 +1178,10 @@ void ECUtil::HashInfo::generate_test_instances(list& o) const string HINFO_KEY = "hinfo_key"; -bool ECUtil::is_hinfo_key_string(const string &key) -{ +bool ECUtil::is_hinfo_key_string(const string &key) { return key == HINFO_KEY; } -const string &ECUtil::get_hinfo_key() -{ +const string &ECUtil::get_hinfo_key() { return HINFO_KEY; } - -END_IGNORE_DEPRECATED diff --git a/src/osd/ECUtil.h b/src/osd/ECUtil.h index 65bdcb5199400..f0abbb0cd3ca8 100644 --- a/src/osd/ECUtil.h +++ b/src/osd/ECUtil.h @@ -23,258 +23,1046 @@ #include "include/buffer_fwd.h" #include "include/ceph_assert.h" #include "include/encoding.h" -#include "common/Formatter.h" +#include "common/interval_map.h" +#include "common/mini_flat_map.h" + +#include "osd_types.h" + +/// If someone wants these types, but not ExtentCache, move to another file +struct bl_split_merge { + ceph::buffer::list split( + uint64_t offset, + uint64_t length, + ceph::buffer::list &bl) const { + ceph::buffer::list out; + out.substr_of(bl, offset, length); + return out; + } + + bool can_merge(const ceph::buffer::list &left, const ceph::buffer::list &right) const { + return true; + } + + ceph::buffer::list merge(ceph::buffer::list &&left, ceph::buffer::list &&right) const { + ceph::buffer::list bl{std::move(left)}; + bl.claim_append(right); + return bl; + } + + uint64_t length(const ceph::buffer::list &b) const { return b.length(); } +}; + +using extent_set = interval_set; +using extent_map = interval_map; + +/* Slice iterator. This looks for contiguous buffers which are common + * across all shards in the out_set. + * + * It is a template, but essentially: + * K must a key suitable for a mini_flat_map. + * T must be either an extent map or a reference to an extent map. + */ +template +class slice_iterator { + mini_flat_map &input; + uint64_t offset = std::numeric_limits::max(); + uint64_t length = std::numeric_limits::max(); + uint64_t start = std::numeric_limits::max(); + uint64_t end = std::numeric_limits::max(); + shard_id_map> iters; + shard_id_map in; + shard_id_map out; + const shard_id_set &out_set; + + void advance() { + in.clear(); + out.clear(); + offset = start; + end = std::numeric_limits::max(); + + if (iters.empty()) { + return; + } + + // First we find the last buffer in the list + for (auto &&[shard, iters] : iters) { + auto &&[emap_iter, bl_iter] = iters; + uint64_t iter_offset = emap_iter.get_off() + bl_iter.get_off(); + ceph_assert(iter_offset >= start); + // If this iterator is after the current offset, then we will ignore + // it for this buffer ptr. The end must move to or before this point. + if (iter_offset > start && iter_offset < end) { + end = iter_offset; + continue; + } + + uint64_t iter_end = iter_offset + bl_iter.get_current_ptr().length(); + if (iter_end < end) { + end = iter_end; + } + } + + for (auto &&iter = iters.begin(); iter != iters.end();) { + auto shard = iter->first; + auto &&[emap_iter, bl_iter] = iter->second; + uint64_t iter_offset = emap_iter.get_off() + bl_iter.get_off(); + bool erase = false; + + // Ignore any blank buffers. + if (iter_offset == start) { + ceph_assert(iter_offset == start); + + // Create a new buffer pointer for the result. We don't want the client + // manipulating the ptr. 
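+      // Shards listed in out_set are the encode/decode targets for this
+      // slice and are returned via 'out'; all other shards act as inputs.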
+ if (out_set.contains(shard)) { + out.emplace( + shard, bufferptr(bl_iter.get_current_ptr(), 0, end - start)); + } else { + in.emplace( + shard, bufferptr(bl_iter.get_current_ptr(), 0, end - start)); + } + + // Now we need to move on the iterators. + bl_iter += end - start; + + // If we have reached the end of the extent, we need to move that on too. + if (bl_iter == emap_iter.get_val().end()) { + ++emap_iter; + if (emap_iter == input[shard].end()) { + erase = true; + } else { + iters.at(shard).second = emap_iter.get_val().begin(); + } + } + } else + ceph_assert(iter_offset > start); + + if (erase) { + iter = iters.erase(iter); + } else { + ++iter; + } + } + + // We can now move the offset on. + length = end - start; + start = end; + + /* This can arise in two ways: + * 1. We can generate an empty buffer out of a gap, so just skip over. + * 2. Only the inputs contain any interesting data. We don't need + * to perform a decode/encode on a slice in that case. + */ + if (out.empty()) { + advance(); + } + } + +public: + slice_iterator(mini_flat_map &_input, const shard_id_set &out_set) : + input(_input), + iters(input.max_size()), + in(input.max_size()), + out(input.max_size()), + out_set(out_set) { + for (auto &&[shard, emap] : input) { + auto emap_iter = emap.begin(); + auto bl_iter = emap_iter.get_val().begin(); + auto p = std::make_pair(std::move(emap_iter), std::move(bl_iter)); + iters.emplace(shard, std::move(p)); + + if (emap_iter.get_off() < start) { + start = emap_iter.get_off(); + } + } + + advance(); + } + + shard_id_map &get_in_bufferptrs() { return in; } + shard_id_map &get_out_bufferptrs() { return out; } + uint64_t get_offset() const { return offset; } + uint64_t get_length() const { return length; } + bool is_end() const { return in.empty() && out.empty(); } + + bool is_page_aligned() const { + for (auto &&[_, ptr] : in) { + uintptr_t p = (uintptr_t)ptr.c_str(); + if (p & ~CEPH_PAGE_MASK) return false; + if ((p + ptr.length()) & ~CEPH_PAGE_MASK) return false; + } + + for (auto &&[_, ptr] : out) { + uintptr_t p = (uintptr_t)ptr.c_str(); + if (p & ~CEPH_PAGE_MASK) return false; + if ((p + ptr.length()) & ~CEPH_PAGE_MASK) return false; + } + + return true; + } + + slice_iterator &operator++() { + advance(); + return *this; + } +}; + +// Setting to 1 turns on very large amounts of level 0 debug containing the +// contents of buffers. Even on level 20 this is not really wanted. +#define DEBUG_EC_BUFFERS 1 namespace ECUtil { +class shard_extent_map_t; + +struct shard_extent_set_t { + // The following boilerplate is just to make this look like a map. 
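+  // A minimal usage sketch (k and m come from the EC profile in use):
+  //   shard_extent_set_t ses(k + m);
+  //   ses[shard_id_t(0)].union_insert(0, 4096);
+  //   ses[shard_id_t(1)].union_insert(4096, 4096);
+  //   extent_set all = ses.get_extent_superset();  // {0~8192}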
+ shard_id_map map; + + shard_extent_set_t(short max_shards) : map(max_shards) {} + + bool contains(shard_id_t shard) const { return map.contains(shard); } + bool empty() const { return map.empty(); } + void swap(shard_extent_set_t &other) noexcept { map.swap(other.map); } + void clear() { map.clear(); } + auto erase(shard_id_t shard) { return map.erase(shard); } + + auto erase(shard_id_map::iterator &iter) { + return map.erase(iter); + } + + void erase_stripe(uint64_t offset, uint64_t length) { + for (auto it = map.begin(); it != map.end();) { + it->second.erase(offset, length); + if (it->second.empty()) it = map.erase(it); + else ++it; + } + } + + auto begin() const { return map.cbegin(); } + auto begin() { return map.begin(); } + auto end() const { return map.cend(); } + auto end() { return map.end(); } + + void emplace(shard_id_t shard, extent_set &&set) { + map.emplace(shard, std::move(set)); + } + + size_t shard_count() const { return map.size(); } + extent_set &at(shard_id_t shard) { return map.at(shard); } + const extent_set &at(shard_id_t shard) const { return map.at(shard); } + + extent_set get(shard_id_t shard) const { + if (!map.contains(shard)) { + return extent_set(); + } + return at(shard); + } + + extent_set &operator[](shard_id_t shard) { return map[shard]; } + + bool operator==(shard_extent_set_t const &other) const { + return map == other.map; + } + + friend std::ostream &operator<<(std::ostream &lhs, + const shard_extent_set_t &rhs) { + lhs << rhs.map; + return lhs; + } + + void get_extent_superset(extent_set &eset) const { + for (auto &&[_, e] : map) { + eset.union_of(e); + } + } + + extent_set get_extent_superset() const { + extent_set eset; + get_extent_superset(eset); + return eset; + } + + /* Return the extent set which is common across all populated shards. */ + extent_set get_extent_common_set() const { + extent_set eset; + bool first = true; + for (auto &&[_, e] : map) { + if (first) { + eset.insert(e); + first = false; + } else { + eset.intersection_of(e); + } + } + return eset; + } + + void align(uint64_t a) { + for (auto &&[_, e] : map) { + e.align(a); + } + } + + size_t get_max_shards() const { return map.max_size(); } + + void subtract(const shard_extent_set_t &set); + void intersection_of(const shard_extent_set_t &set); + void insert(const shard_extent_set_t &set); + + /** return the sum of extent_set.size */ + uint64_t size() const { + uint64_t size = 0; + for (auto &&[_, e] : map) size += e.size(); + + return size; + } + + void populate_shard_id_set(shard_id_set &set) const { + map.populate_bitset_set(set); + } + + shard_id_set get_shard_id_set() const { + shard_id_set r; + map.populate_bitset_set(r); + return r; + } +}; + +inline uint64_t page_mask() { + static const uint64_t page_mask = ((uint64_t)CEPH_PAGE_SIZE) - 1; + return page_mask; +} + +inline uint64_t align_page_next(uint64_t val) { + return p2roundup(val, (uint64_t)CEPH_PAGE_SIZE); +} + +inline uint64_t align_page_prev(uint64_t val) { + return p2align(val, (uint64_t)CEPH_PAGE_SIZE); +} class stripe_info_t { + friend class shard_extent_map_t; + const uint64_t stripe_width; + const uint64_t plugin_flags; const uint64_t chunk_size; - const unsigned int k; // Can be calculated with a division from above. Better to cache. + const pg_pool_t *pool; + const unsigned int k; + // Can be calculated with a division from above. Better to cache. 
const unsigned int m; const std::vector chunk_mapping; const std::vector chunk_mapping_reverse; + const shard_id_set data_shards; + const shard_id_set parity_shards; + private: + void ro_range_to_shards( + uint64_t ro_offset, + uint64_t ro_size, + ECUtil::shard_extent_set_t *shard_extent_set, + extent_set *extent_superset, + buffer::list *bl, + shard_extent_map_t *shard_extent_map) const; + static std::vector complete_chunk_mapping( - std::vector _chunk_mapping, unsigned int n) - { - unsigned int size = _chunk_mapping.size(); + const std::vector &_chunk_mapping, unsigned int n) { + unsigned int size = (int)_chunk_mapping.size(); std::vector chunk_mapping(n); - for (shard_id_t i; i < n; ++i) { + for (unsigned int i = 0; i < n; i++) { if (size > i) { - chunk_mapping.at(static_cast(i)) = _chunk_mapping.at(static_cast(i)); + chunk_mapping.at(i) = _chunk_mapping.at(i); } else { - chunk_mapping.at(static_cast(i)) = i; + chunk_mapping.at(i) = static_cast(i); } } return chunk_mapping; } + static std::vector reverse_chunk_mapping( - std::vector chunk_mapping) - { - unsigned int size = chunk_mapping.size(); + const std::vector &chunk_mapping) { + size_t size = chunk_mapping.size(); std::vector reverse(size); shard_id_set used; - for (raw_shard_id_t i; i < size; ++i) { - shard_id_t index = chunk_mapping.at(static_cast(i)); + for (raw_shard_id_t raw_shard; raw_shard < size; ++raw_shard) { + shard_id_t shard = chunk_mapping[int(raw_shard)]; // Mapping must be a bijection and a permutation - ceph_assert(!used.contains(index)); - used.insert(index); - reverse.at(static_cast(index)) = i; + ceph_assert(!used.contains(shard)); + used.insert(shard); + reverse.at(int(shard)) = raw_shard; } return reverse; } + + static shard_id_set calc_shards(raw_shard_id_t start, + int count, + const std::vector &chunk_mapping) { + shard_id_set data_shards; + for (raw_shard_id_t raw_shard = start; + raw_shard < int(start) + count; + ++raw_shard) { + shard_id_t shard = chunk_mapping[int(raw_shard)]; + data_shards.insert(shard); + } + return data_shards; + } + public: - stripe_info_t(ErasureCodeInterfaceRef ec_impl, uint64_t stripe_width) + stripe_info_t(const ErasureCodeInterfaceRef &ec_impl, const pg_pool_t *pool, + uint64_t stripe_width + ) : stripe_width(stripe_width), + plugin_flags(ec_impl->get_supported_optimizations()), chunk_size(stripe_width / ec_impl->get_data_chunk_count()), + pool(pool), k(ec_impl->get_data_chunk_count()), m(ec_impl->get_coding_chunk_count()), - chunk_mapping(complete_chunk_mapping(ec_impl->get_chunk_mapping(), - k + m)), - chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)) { + chunk_mapping( + complete_chunk_mapping(ec_impl->get_chunk_mapping(), k + m)), + chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)), + data_shards(calc_shards(raw_shard_id_t(), k, chunk_mapping)), + parity_shards(calc_shards(raw_shard_id_t(k), m, chunk_mapping)) { + ceph_assert(stripe_width != 0); ceph_assert(stripe_width % k == 0); } + // Simpler constructors for unit tests stripe_info_t(unsigned int k, unsigned int m, uint64_t stripe_width) : stripe_width(stripe_width), + plugin_flags(0xFFFFFFFFFFFFFFFFul), + // Everything enabled for test harnesses. 
chunk_size(stripe_width / k), + pool(nullptr), k(k), m(m), chunk_mapping(complete_chunk_mapping(std::vector(), k + m)), - chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)) { + chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)), + data_shards(calc_shards(raw_shard_id_t(), k, chunk_mapping)), + parity_shards(calc_shards(raw_shard_id_t(k), m, chunk_mapping)) { + ceph_assert(stripe_width != 0); + ceph_assert(stripe_width % k == 0); + } + + stripe_info_t(unsigned int k, unsigned int m, uint64_t stripe_width, + const std::vector &_chunk_mapping) + : stripe_width(stripe_width), + plugin_flags(0xFFFFFFFFFFFFFFFFul), + // Everything enabled for test harnesses. + chunk_size(stripe_width / k), + pool(nullptr), + k(k), + m(m), + chunk_mapping(complete_chunk_mapping(_chunk_mapping, k + m)), + chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)), + data_shards(calc_shards(raw_shard_id_t(), k, chunk_mapping)), + parity_shards(calc_shards(raw_shard_id_t(k), m, chunk_mapping)) { + ceph_assert(stripe_width != 0); ceph_assert(stripe_width % k == 0); } + stripe_info_t(unsigned int k, unsigned int m, uint64_t stripe_width, - std::vector _chunk_mapping) + const pg_pool_t *pool, const std::vector &_chunk_mapping) : stripe_width(stripe_width), + plugin_flags(0xFFFFFFFFFFFFFFFFul), + // Everything enabled for test harnesses. chunk_size(stripe_width / k), + pool(pool), k(k), m(m), chunk_mapping(complete_chunk_mapping(_chunk_mapping, k + m)), - chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)) { + chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)), + data_shards(calc_shards(raw_shard_id_t(), k, chunk_mapping)), + parity_shards(calc_shards(raw_shard_id_t(k), m, chunk_mapping)) { + ceph_assert(stripe_width != 0); ceph_assert(stripe_width % k == 0); } - bool logical_offset_is_stripe_aligned(uint64_t logical) const { - return (logical % stripe_width) == 0; + + stripe_info_t(unsigned int k, unsigned int m, uint64_t stripe_width, + const pg_pool_t *pool) + : stripe_width(stripe_width), + plugin_flags(0xFFFFFFFFFFFFFFFFul), + // Everything enabled for test harnesses. 
+ chunk_size(stripe_width / k), + pool(pool), + k(k), + m(m), + chunk_mapping(complete_chunk_mapping(std::vector(), k + m)), + chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)), + data_shards(calc_shards(raw_shard_id_t(), k, chunk_mapping)), + parity_shards(calc_shards(raw_shard_id_t(k), m, chunk_mapping)) { + ceph_assert(stripe_width != 0); + ceph_assert(stripe_width % k == 0); + } + + uint64_t object_size_to_shard_size(const uint64_t size, shard_id_t shard) const { + uint64_t remainder = size % get_stripe_width(); + uint64_t shard_size = (size - remainder) / k; + raw_shard_id_t raw_shard = get_raw_shard(shard); + if (raw_shard >= get_k()) { + // coding parity shards have same size as data shard 0 + raw_shard = 0; + } + if (remainder > uint64_t(raw_shard) * get_chunk_size()) { + remainder -= uint64_t(raw_shard) * get_chunk_size(); + if (remainder > get_chunk_size()) { + remainder = get_chunk_size(); + } + shard_size += remainder; + } + return ECUtil::align_page_next(shard_size); + } + + uint64_t ro_offset_to_shard_offset(uint64_t ro_offset, + const raw_shard_id_t raw_shard) const { + uint64_t full_stripes = (ro_offset / stripe_width) * chunk_size; + int offset_shard = (ro_offset / chunk_size) % k; + + if (int(raw_shard) == offset_shard) { + return full_stripes + ro_offset % chunk_size; + } + if (raw_shard < offset_shard) { + return full_stripes + chunk_size; + } + return full_stripes; + } + + /** + * Return true if shard does not require metadata updates + */ + bool is_nonprimary_shard(const shard_id_t shard) const { + return pool->is_nonprimary_shard(shard); } + + bool supports_ec_overwrites() const { + return pool->allows_ecoverwrites(); + } + + bool supports_sub_chunks() const { + return (plugin_flags & + ErasureCodeInterface::FLAG_EC_PLUGIN_REQUIRE_SUB_CHUNKS) != 0; + } + + bool require_hinfo() const { + return !supports_ec_overwrites(); + } + + bool supports_partial_reads() const { + return (plugin_flags & + ErasureCodeInterface::FLAG_EC_PLUGIN_PARTIAL_READ_OPTIMIZATION) != 0; + } + + bool supports_partial_writes() const { + return (plugin_flags & + ErasureCodeInterface::FLAG_EC_PLUGIN_PARTIAL_WRITE_OPTIMIZATION) != 0; + } + + bool supports_parity_delta_writes() const { + return (plugin_flags & + ErasureCodeInterface::FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION) != 0; + } + uint64_t get_stripe_width() const { return stripe_width; } + uint64_t get_chunk_size() const { return chunk_size; } + unsigned int get_m() const { return m; } + unsigned int get_k() const { return k; } + unsigned int get_k_plus_m() const { return k + m; } - shard_id_t get_shard(raw_shard_id_t raw_shard) const { - return chunk_mapping[static_cast(raw_shard)]; + + const shard_id_t get_shard(const raw_shard_id_t raw_shard) const { + return chunk_mapping[int(raw_shard)]; } + raw_shard_id_t get_raw_shard(shard_id_t shard) const { - return chunk_mapping_reverse[static_cast(shard)]; + return chunk_mapping_reverse.at(int(shard)); + } + + /* Return a "span" - which can be iterated over */ + auto get_data_shards() const { + return data_shards; } - uint64_t logical_to_prev_chunk_offset(uint64_t offset) const { + + auto get_parity_shards() const { + return parity_shards; + } + + uint64_t ro_offset_to_prev_chunk_offset(uint64_t offset) const { return (offset / stripe_width) * chunk_size; } - uint64_t logical_to_next_chunk_offset(uint64_t offset) const { - return ((offset + stripe_width - 1)/ stripe_width) * chunk_size; + + uint64_t ro_offset_to_next_chunk_offset(uint64_t offset) const { + return ((offset + stripe_width 
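To make the shard-size arithmetic in object_size_to_shard_size concrete, here is a small standalone sketch of the same calculation for a 4+2 layout with 4 KiB chunks. It is illustrative only: it assumes a 4 KiB page for align_page_next and is not the patch's implementation. As the source comment notes, parity shards are sized like raw data shard 0.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Illustrative sketch -- not part of the patch. Assumes a 4 KiB page size.
static uint64_t align_page_next(uint64_t v) { return (v + 4095) & ~uint64_t(4095); }

static uint64_t shard_size(uint64_t object_size, unsigned raw_shard,
                           unsigned k, uint64_t chunk_size) {
  const uint64_t stripe_width = k * chunk_size;
  uint64_t remainder = object_size % stripe_width;
  uint64_t size = (object_size - remainder) / k;   // full stripes
  if (raw_shard >= k) raw_shard = 0;               // parity sized like raw shard 0
  if (remainder > uint64_t(raw_shard) * chunk_size) {
    remainder -= uint64_t(raw_shard) * chunk_size;
    size += std::min(remainder, chunk_size);
  }
  return align_page_next(size);
}

int main() {
  // A 10000-byte object striped 4+2 with 4 KiB chunks:
  assert(shard_size(10000, 0, 4, 4096) == 4096);  // full chunk
  assert(shard_size(10000, 1, 4, 4096) == 4096);  // full chunk
  assert(shard_size(10000, 2, 4, 4096) == 4096);  // 1808 bytes, page aligned
  assert(shard_size(10000, 3, 4, 4096) == 0);     // untouched shard
  assert(shard_size(10000, 4, 4, 4096) == 4096);  // parity == raw shard 0
  return 0;
}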
- 1) / stripe_width) * chunk_size; } - uint64_t logical_to_prev_stripe_offset(uint64_t offset) const { + + uint64_t ro_offset_to_prev_stripe_ro_offset(uint64_t offset) const { return offset - (offset % stripe_width); } - uint64_t logical_to_next_stripe_offset(uint64_t offset) const { - return ((offset % stripe_width) ? - (offset - (offset % stripe_width) + stripe_width) : - offset); + + uint64_t ro_offset_to_next_stripe_ro_offset(uint64_t offset) const { + return ((offset % stripe_width) + ? (offset - (offset % stripe_width) + stripe_width) + : offset); } - uint64_t aligned_logical_offset_to_chunk_offset(uint64_t offset) const { + + uint64_t aligned_ro_offset_to_chunk_offset(uint64_t offset) const { ceph_assert(offset % stripe_width == 0); return (offset / stripe_width) * chunk_size; } - uint64_t chunk_aligned_logical_offset_to_chunk_offset(uint64_t offset) const { + + uint64_t chunk_aligned_ro_offset_to_chunk_offset(uint64_t offset) const { [[maybe_unused]] const auto residue_in_stripe = offset % stripe_width; ceph_assert(residue_in_stripe % chunk_size == 0); ceph_assert(stripe_width % chunk_size == 0); // this rounds down return (offset / stripe_width) * chunk_size; } - uint64_t chunk_aligned_logical_size_to_chunk_size(uint64_t len) const { - [[maybe_unused]] const auto residue_in_stripe = len % stripe_width; - ceph_assert(residue_in_stripe % chunk_size == 0); - ceph_assert(stripe_width % chunk_size == 0); + + uint64_t chunk_aligned_ro_length_to_shard_length(uint64_t len) const { // this rounds up return ((len + stripe_width - 1) / stripe_width) * chunk_size; } - uint64_t aligned_chunk_offset_to_logical_offset(uint64_t offset) const { + + uint64_t chunk_aligned_shard_offset_to_ro_offset(uint64_t offset) const { ceph_assert(offset % chunk_size == 0); return (offset / chunk_size) * stripe_width; } - std::pair chunk_aligned_offset_len_to_chunk( - std::pair in) const; - std::pair offset_len_to_stripe_bounds( - std::pair in) const { - uint64_t off = logical_to_prev_stripe_offset(in.first); - uint64_t len = logical_to_next_stripe_offset( - (in.first - off) + in.second); + + std::pair chunk_aligned_ro_range_to_shard_ro_range( + uint64_t off, uint64_t len) const; + + std::pair ro_offset_len_to_stripe_ro_offset_len( + uint64_t _off, uint64_t _len) const { + uint64_t off = ro_offset_to_prev_stripe_ro_offset(_off); + uint64_t len = ro_offset_to_next_stripe_ro_offset( + (_off - off) + _len); return std::make_pair(off, len); } - std::pair offset_len_to_chunk_bounds( - std::pair in) const { + + std::pair ro_range_to_chunk_ro_range( + const std::pair &in) const { uint64_t off = in.first - (in.first % chunk_size); uint64_t tmp_len = (in.first - off) + in.second; - uint64_t len = ((tmp_len % chunk_size) ? - (tmp_len - (tmp_len % chunk_size) + chunk_size) : - tmp_len); + uint64_t len = ((tmp_len % chunk_size) + ? 
(tmp_len - (tmp_len % chunk_size) + chunk_size) + : tmp_len); return std::make_pair(off, len); } - std::pair offset_length_to_data_chunk_indices( - uint64_t off, uint64_t len) const { - assert(chunk_size > 0); - const auto first_chunk_idx = (off / chunk_size); - const auto last_chunk_idx = (chunk_size - 1 + off + len) / chunk_size; - return {first_chunk_idx, last_chunk_idx}; - } - bool offset_length_is_same_stripe( - uint64_t off, uint64_t len) const { - if (len == 0) { - return true; + + void ro_range_to_shard_extent_set( + uint64_t ro_offset, + uint64_t ro_size, + ECUtil::shard_extent_set_t &shard_extent_set) const { + ro_range_to_shards(ro_offset, ro_size, &shard_extent_set, nullptr, nullptr, nullptr); + } + + void ro_range_to_shard_extent_set( + uint64_t ro_offset, + uint64_t ro_size, + ECUtil::shard_extent_set_t &shard_extent_set, + extent_set &extent_superset) const { + ro_range_to_shards(ro_offset, ro_size, &shard_extent_set, &extent_superset, + nullptr, + nullptr); + } + + void ro_range_to_shard_extent_set_with_parity( + uint64_t ro_offset, + uint64_t ro_size, + ECUtil::shard_extent_set_t &shard_extent_set) const { + extent_set parity; + ro_range_to_shards(ro_offset, ro_size, &shard_extent_set, &parity, nullptr, + nullptr); + + if (parity.empty()) return; + + for (shard_id_t shard : get_parity_shards()) { + shard_extent_set[shard].union_of(parity); } - assert(chunk_size > 0); - const auto first_stripe_idx = off / stripe_width; - const auto last_inc_stripe_idx = (off + len - 1) / stripe_width; - return first_stripe_idx == last_inc_stripe_idx; } -}; -int decode( - const stripe_info_t &sinfo, - ceph::ErasureCodeInterfaceRef &ec_impl, - const std::set want_to_read, - std::map &to_decode, - ceph::buffer::list *out); - -int decode( - const stripe_info_t &sinfo, - ceph::ErasureCodeInterfaceRef &ec_impl, - std::map &to_decode, - std::map &out); - -int encode( - const stripe_info_t &sinfo, - ceph::ErasureCodeInterfaceRef &ec_impl, - ceph::buffer::list &in, - const std::set &want, - std::map *out); + void ro_range_to_shard_extent_set_with_superset( + uint64_t ro_offset, + uint64_t ro_size, + ECUtil::shard_extent_set_t &shard_extent_set, + extent_set &superset) const { + ro_range_to_shards(ro_offset, ro_size, &shard_extent_set, &superset, nullptr, + nullptr); + } + + void ro_range_to_shard_extent_map( + uint64_t ro_offset, + uint64_t ro_size, + buffer::list &bl, + shard_extent_map_t &shard_extent_map) const { + ro_range_to_shards(ro_offset, ro_size, nullptr, nullptr, &bl, &shard_extent_map); + } + + void trim_shard_extent_set_for_ro_offset(uint64_t ro_offset, + ECUtil::shard_extent_set_t & + shard_extent_set) const; + + void ro_size_to_stripe_aligned_read_mask( + uint64_t ro_size, + ECUtil::shard_extent_set_t &shard_extent_set) const; + + void ro_size_to_read_mask( + uint64_t ro_size, + ECUtil::shard_extent_set_t &shard_extent_set) const; + + void ro_size_to_zero_mask( + uint64_t ro_size, + ECUtil::shard_extent_set_t &shard_extent_set) const; +}; class HashInfo { uint64_t total_chunk_size = 0; std::vector cumulative_shard_hashes; - // purely ephemeral, represents the size once all in-flight ops commit - uint64_t projected_total_chunk_size = 0; public: HashInfo() {} + explicit HashInfo(unsigned num_chunks) : cumulative_shard_hashes(num_chunks, -1) {} - void append(uint64_t old_size, std::map &to_append); + + void append(uint64_t old_size, shard_id_map &to_append); + void clear() { total_chunk_size = 0; cumulative_shard_hashes = std::vector( cumulative_shard_hashes.size(), -1); } + void 
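The ro_range_to_shard_extent_set family above boils down to walking the rados-object range chunk by chunk: byte pos lands on raw shard (pos / chunk_size) % k at shard offset (pos / stripe_width) * chunk_size + pos % chunk_size, and the with_parity variant then unions the resulting superset onto every parity shard. A minimal sketch of that walk, assuming no chunk remapping and at most one extent per shard (illustrative only; the patch accumulates full extent_sets per shard):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <map>
#include <utility>

// Illustrative sketch -- not part of the patch.
static std::map<unsigned, std::pair<uint64_t, uint64_t>>   // shard -> {off, len}
ro_range_to_shards(uint64_t ro_off, uint64_t ro_len, unsigned k, uint64_t chunk) {
  std::map<unsigned, std::pair<uint64_t, uint64_t>> out;
  const uint64_t stripe = k * chunk;
  for (uint64_t pos = ro_off; pos < ro_off + ro_len;) {
    const unsigned shard = (pos / chunk) % k;   // no remapping: raw shard == shard
    const uint64_t shard_off = (pos / stripe) * chunk + pos % chunk;
    const uint64_t len = std::min(chunk - pos % chunk, ro_off + ro_len - pos);
    out[shard] = {shard_off, len};              // real code accumulates extent_sets
    pos += len;
  }
  return out;
}

int main() {
  // Read 2048 bytes starting 512 bytes into the second stripe (k=4, 1 KiB chunks).
  auto m = ro_range_to_shards(4096 + 512, 2048, 4, 1024);
  assert((m[0] == std::pair<uint64_t, uint64_t>(1024 + 512, 512)));  // tail of a chunk
  assert((m[1] == std::pair<uint64_t, uint64_t>(1024, 1024)));       // a whole chunk
  assert((m[2] == std::pair<uint64_t, uint64_t>(1024, 512)));        // head of a chunk
  return 0;
}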
encode(ceph::buffer::list &bl) const; void decode(ceph::buffer::list::const_iterator &bl); void dump(ceph::Formatter *f) const; - static void generate_test_instances(std::list& o); + static void generate_test_instances(std::list &o); + uint32_t get_chunk_hash(shard_id_t shard) const { ceph_assert(shard < cumulative_shard_hashes.size()); - return cumulative_shard_hashes[static_cast(shard)]; + return cumulative_shard_hashes[int(shard)]; } + uint64_t get_total_chunk_size() const { return total_chunk_size; } - uint64_t get_projected_total_chunk_size() const { - return projected_total_chunk_size; - } - uint64_t get_total_logical_size(const stripe_info_t &sinfo) const { - return get_total_chunk_size() * - (sinfo.get_stripe_width()/sinfo.get_chunk_size()); - } - uint64_t get_projected_total_logical_size(const stripe_info_t &sinfo) const { - return get_projected_total_chunk_size() * - (sinfo.get_stripe_width()/sinfo.get_chunk_size()); - } - void set_projected_total_logical_size( - const stripe_info_t &sinfo, - uint64_t logical_size) { - ceph_assert(sinfo.logical_offset_is_stripe_aligned(logical_size)); - projected_total_chunk_size = sinfo.aligned_logical_offset_to_chunk_offset( - logical_size); - } + void set_total_chunk_size_clear_hash(uint64_t new_chunk_size) { cumulative_shard_hashes.clear(); total_chunk_size = new_chunk_size; } + bool has_chunk_hash() const { return !cumulative_shard_hashes.empty(); } + void update_to(const HashInfo &rhs) { - auto ptcs = projected_total_chunk_size; *this = rhs; - projected_total_chunk_size = ptcs; } - friend std::ostream& operator<<(std::ostream& out, const HashInfo& hi); + + friend std::ostream &operator<<(std::ostream &out, const HashInfo &hi); }; typedef std::shared_ptr HashInfoRef; +class shard_extent_map_t { + static const uint64_t invalid_offset = std::numeric_limits::max(); + +public: + const stripe_info_t *sinfo; + // The maximal range of all extents maps within rados object space. + uint64_t ro_start; + uint64_t ro_end; + uint64_t start_offset; + uint64_t end_offset; + shard_id_map extent_maps; + + slice_iterator begin_slice_iterator( + const shard_id_set &out_set); + + /* This caculates the ro offset for an offset into a particular shard */ + uint64_t calc_ro_offset(raw_shard_id_t raw_shard, int shard_offset) const { + int stripes = shard_offset / sinfo->chunk_size; + return stripes * sinfo->stripe_width + uint64_t(raw_shard) * sinfo-> + chunk_size + + shard_offset % sinfo->chunk_size; + } + + uint64_t calc_ro_end(raw_shard_id_t raw_shard, int shard_offset) const { + return calc_ro_offset(raw_shard, shard_offset - 1) + 1; + } + + /* This is a relatively expensive operation to update the ro offset/length. + * Ideally, we should be able to update offset/length incrementally. 
+ */ + void compute_ro_range() { + uint64_t start = invalid_offset; + uint64_t end = 0; + uint64_t o_start = invalid_offset; + uint64_t o_end = 0; + + for (auto &&[shard, emap] : extent_maps) { + raw_shard_id_t raw_shard = sinfo->get_raw_shard(shard); + uint64_t start_off = emap.get_start_off(); + uint64_t end_off = emap.get_end_off(); + o_start = std::min(o_start, start_off); + o_end = std::max(o_end, end_off); + + if (raw_shard < sinfo->get_k()) { + start = std::min(start, calc_ro_offset(raw_shard, start_off)); + end = std::max(end, calc_ro_end(raw_shard, end_off)); + } + } + if (end != 0) { + ro_start = start; + ro_end = end; + start_offset = o_start; + end_offset = o_end; + } else { + ro_start = invalid_offset; + ro_end = invalid_offset; + start_offset = invalid_offset; + end_offset = invalid_offset; + } + } + +public: + shard_extent_map_t(const stripe_info_t *sinfo) : + sinfo(sinfo), + ro_start(invalid_offset), + ro_end(invalid_offset), + start_offset(invalid_offset), + end_offset(invalid_offset), + extent_maps(sinfo->get_k_plus_m()) {} + + shard_extent_map_t(const stripe_info_t *sinfo, + shard_id_map &&_extent_maps) : + sinfo(sinfo), + extent_maps(std::move(_extent_maps)) { + // Empty shards are not permitted, so clear them out. + for (auto iter = extent_maps.begin(); iter != extent_maps.end();) { + if (iter->second.empty()) { + iter = extent_maps.erase(iter); + } else { + ++iter; + } + } + compute_ro_range(); + } + + bool empty() const { + return ro_end == invalid_offset; + } + + uint64_t get_ro_start() const { + return ro_start; + } + + uint64_t get_ro_end() const { + return ro_end; + } + + /* Return the extent maps. For reading only, set to const as the returned + * map should not be modified. + * We want to avoid: + * - Empty extent maps on shards + * - getting the offset/length out of sync. + */ + const auto &get_extent_maps() const { + return extent_maps; + } + + /* Return a particlar extent map. This must be const because updating it + * would cause the shard_extent_map to become inconsistent. + * + * * This method will raise an exception if the shard has no extents. 
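compute_ro_range above depends on calc_ro_offset being the exact inverse of that rados-to-shard mapping: whole stripes below the shard offset contribute stripe_width bytes each, then the raw shard index selects the chunk within the stripe. A tiny standalone check of the arithmetic (illustrative only; k=4 and 1 KiB chunks assumed):

#include <cassert>
#include <cstdint>

// Illustrative sketch -- not part of the patch.
static uint64_t calc_ro_offset(unsigned raw_shard, uint64_t shard_off,
                               unsigned k, uint64_t chunk) {
  const uint64_t stripe = k * chunk;
  const uint64_t stripes = shard_off / chunk;   // whole stripes below this offset
  return stripes * stripe + raw_shard * chunk + shard_off % chunk;
}

int main() {
  // Shard offset 1536 on raw shard 2 is 512 bytes into the second stripe's
  // third chunk, i.e. rados offset 6656.
  assert(calc_ro_offset(2, 1536, 4, 1024) == 6656);
  // Round trip through the forward mapping used for reads and writes:
  assert((6656 / 1024) % 4 == 2);                      // raw shard
  assert((6656 / 4096) * 1024 + 6656 % 1024 == 1536);  // shard offset
  return 0;
}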
+ */ + const extent_map &get_extent_map(shard_id_t shard) const { + return extent_maps.at(shard); + } + + extent_set get_extent_set(const shard_id_t &shard) const { + extent_set ret; + if (extent_maps.contains(shard)) { + extent_maps.at(shard).to_interval_set(ret); + } + return ret; + } + + void to_shard_extent_set(shard_extent_set_t &set) const { + for (auto &&[shard, emap] : extent_maps) { + emap.to_interval_set(set[shard]); + } + } + + bool contains_shard(shard_id_t shard) const { + return extent_maps.contains(shard); + } + + void erase_after_ro_offset(uint64_t ro_offset); + shard_extent_map_t intersect_ro_range(uint64_t ro_offset, uint64_t ro_length) const; + shard_extent_map_t intersect(std::optional const &other) const; + shard_extent_map_t intersect(shard_extent_set_t const &other) const; + void insert_in_shard(shard_id_t shard, uint64_t off, const buffer::list &bl); + void insert_in_shard(shard_id_t shard, uint64_t off, const buffer::list &bl, + uint64_t new_start, uint64_t new_end); + void insert_ro_zero_buffer(uint64_t ro_offset, uint64_t ro_length); + void insert(shard_extent_map_t const &other); + void append_zeros_to_ro_offset(uint64_t ro_offset); + void insert_ro_extent_map(const extent_map &host_extent_map); + extent_set get_extent_superset() const; + int encode(const ErasureCodeInterfaceRef &ec_impl, const HashInfoRef &hinfo, + uint64_t before_ro_size); + int _encode(const ErasureCodeInterfaceRef &ec_impl); + int encode_parity_delta(const ErasureCodeInterfaceRef &ec_impl, + shard_extent_map_t &old_sem); + + void pad_on_shards(const shard_extent_set_t &pad_to, + const shard_id_set &shards); + void pad_on_shards(const extent_set &pad_to, + const shard_id_set &shards); + void trim(const shard_extent_set_t &trim_to); + int decode(const ErasureCodeInterfaceRef &ec_impl, + const shard_extent_set_t &want, + uint64_t object_size); + int _decode(const ErasureCodeInterfaceRef &ec_impl, + const shard_id_set &want_set, + const shard_id_set &need_set); + void get_buffer(shard_id_t shard, uint64_t offset, uint64_t length, + buffer::list &append_to) const; + void get_shard_first_buffer(shard_id_t shard, buffer::list &append_to) const; + uint64_t get_shard_first_offset(shard_id_t shard) const; + void zero_pad(shard_extent_set_t const &pad_to); + void zero_pad(shard_id_t shard, uint64_t offset, uint64_t length); + void pad_with_other(shard_extent_set_t const &pad_to, + shard_extent_map_t const &other); + void pad_with_other(shard_id_t shard, uint64_t offset, uint64_t length, + shard_extent_map_t const &other); + bufferlist get_ro_buffer(uint64_t ro_offset, uint64_t ro_length) const; + /* Returns a buffer assuming that there is a single contigious buffer + * represented by the map. 
*/ + bufferlist get_ro_buffer() const; + shard_extent_set_t get_extent_set(); + void insert_parity_buffers(); + void erase_shard(shard_id_t shard); + shard_extent_map_t slice_map(uint64_t offset, uint64_t length) const; + std::string debug_string(uint64_t inteval, uint64_t offset) const; + void erase_stripe(uint64_t offset, uint64_t length); + bool contains(shard_id_t shard) const; + bool contains(std::optional const &other) const; + bool contains(shard_extent_set_t const &other) const; + void pad_and_rebuild_to_page_align(); + uint64_t size(); + void clear(); + uint64_t get_start_offset() const { return start_offset; } + uint64_t get_end_offset() const { return end_offset; } + void deep_copy(shard_extent_map_t const &other); + void swap() {} + size_t shard_count() { return extent_maps.size(); } + + + void assert_buffer_contents_equal(shard_extent_map_t other) const { + for (auto &&[shard, emap] : extent_maps) { + for (auto &&i : emap) { + bufferlist bl = i.get_val(); + bufferlist otherbl; + other.get_buffer(shard, i.get_off(), i.get_len(), otherbl); + ceph_assert(bl.contents_equal(otherbl)); + } + } + } + + bool add_zero_padding_for_decode(uint64_t object_size, shard_id_set &exclude_set) { + shard_extent_set_t zeros(sinfo->get_k_plus_m()); + sinfo->ro_size_to_zero_mask(object_size, zeros); + extent_set superset = get_extent_superset(); + bool changed = false; + for (auto &&[shard, z] : zeros) { + if (exclude_set.contains(shard)) { + continue; + } + z.intersection_of(superset); + for (auto [off, len] : z) { + changed = true; + bufferlist bl; + bl.append_zero(len); + extent_maps[shard].insert(off, len, bl); + } + } + + if (changed) { + compute_ro_range(); + } + + return changed; + } + + friend std::ostream &operator<<(std::ostream &lhs, + const shard_extent_map_t &rhs); + + friend bool operator==(const shard_extent_map_t &lhs, + const shard_extent_map_t &rhs) { + return lhs.sinfo == rhs.sinfo + && lhs.ro_start == rhs.ro_start + && lhs.ro_end == rhs.ro_end + && lhs.extent_maps == rhs.extent_maps; + } +}; + +typedef enum { + READ_REQUEST, + READ_DONE, + INJECT_EIO, + CANCELLED, + ERROR, + REQUEST_MISSING, + COMPLETE_ERROR, + ERROR_CLEAR, + COMPLETE +} log_event_t; + +struct log_entry_t { + const log_event_t event; + const pg_shard_t shard; + const extent_set io; + + log_entry_t( + const log_event_t event, + const pg_shard_t &shard, + const extent_set &io) : + event(event), shard(shard), io(io) {} + + log_entry_t( + const log_event_t event, + const pg_shard_t &shard) : + event(event), shard(shard) {} + + log_entry_t( + const log_event_t event, + const pg_shard_t &pg_shard, + const shard_extent_map_t &extent_map) : + event(event), shard(pg_shard), + io(extent_map.contains(pg_shard.shard) + ? extent_map.get_extent_set(pg_shard.shard) + : extent_set()) {} + + friend std::ostream &operator<<(std::ostream &out, const log_entry_t &lhs); +}; + bool is_hinfo_key_string(const std::string &key); const std::string &get_hinfo_key(); diff --git a/src/osd/ExtentCache.cc b/src/osd/ExtentCache.cc deleted file mode 100644 index 3a8bbf11bb4cf..0000000000000 --- a/src/osd/ExtentCache.cc +++ /dev/null @@ -1,245 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2016 Red Hat - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "ExtentCache.h" - -using std::ostream; - -using ceph::bufferlist; - -void ExtentCache::extent::_link_pin_state(pin_state &pin_state) -{ - ceph_assert(parent_extent_set); - ceph_assert(!parent_pin_state); - parent_pin_state = &pin_state; - pin_state.pin_list.push_back(*this); -} - -void ExtentCache::extent::_unlink_pin_state() -{ - ceph_assert(parent_extent_set); - ceph_assert(parent_pin_state); - auto liter = pin_state::list::s_iterator_to(*this); - parent_pin_state->pin_list.erase(liter); - parent_pin_state = nullptr; -} - -void ExtentCache::extent::unlink() -{ - ceph_assert(parent_extent_set); - ceph_assert(parent_pin_state); - - _unlink_pin_state(); - - // remove from extent set - { - auto siter = object_extent_set::set::s_iterator_to(*this); - auto &set = object_extent_set::set::container_from_iterator(siter); - ceph_assert(&set == &(parent_extent_set->extent_set)); - set.erase(siter); - } - - parent_extent_set = nullptr; - ceph_assert(!parent_pin_state); -} - -void ExtentCache::extent::link( - object_extent_set &extent_set, - pin_state &pin_state) -{ - ceph_assert(!parent_extent_set); - parent_extent_set = &extent_set; - extent_set.extent_set.insert(*this); - - _link_pin_state(pin_state); -} - -void ExtentCache::extent::move( - pin_state &to) -{ - _unlink_pin_state(); - _link_pin_state(to); -} - -void ExtentCache::remove_and_destroy_if_empty(object_extent_set &eset) -{ - if (eset.extent_set.empty()) { - auto siter = cache_set::s_iterator_to(eset); - auto &set = cache_set::container_from_iterator(siter); - ceph_assert(&set == &per_object_caches); - - // per_object_caches owns eset - per_object_caches.erase(eset); - delete &eset; - } -} - -ExtentCache::object_extent_set &ExtentCache::get_or_create( - const hobject_t &oid) -{ - cache_set::insert_commit_data data; - auto p = per_object_caches.insert_check(oid, Cmp(), data); - if (p.second) { - auto *eset = new object_extent_set(oid); - per_object_caches.insert_commit(*eset, data); - return *eset; - } else { - return *(p.first); - } -} - -ExtentCache::object_extent_set *ExtentCache::get_if_exists( - const hobject_t &oid) -{ - cache_set::insert_commit_data data; - auto p = per_object_caches.insert_check(oid, Cmp(), data); - if (p.second) { - return nullptr; - } else { - return &*(p.first); - } -} - -std::pair< - ExtentCache::object_extent_set::set::iterator, - ExtentCache::object_extent_set::set::iterator - > ExtentCache::object_extent_set::get_containing_range( - uint64_t off, uint64_t len) -{ - // fst is first iterator with end after off (may be end) - auto fst = extent_set.upper_bound(off, uint_cmp()); - if (fst != extent_set.begin()) - --fst; - if (fst != extent_set.end() && off >= (fst->offset + fst->get_length())) - ++fst; - - // lst is first iterator with start >= off + len (may be end) - auto lst = extent_set.lower_bound(off + len, uint_cmp()); - return std::make_pair(fst, lst); -} - -extent_set ExtentCache::reserve_extents_for_rmw( - const hobject_t &oid, - write_pin &pin, - const extent_set &to_write, - const extent_set &to_read) -{ - if (to_write.empty() && to_read.empty()) { - return extent_set(); - } - extent_set must_read; - auto &eset = get_or_create(oid); - extent_set missing; - for (auto &&res: to_write) { - eset.traverse_update( - pin, - res.first, - res.second, - [&](uint64_t off, uint64_t len, - extent *ext, object_extent_set::update_action *action) { - action->action = object_extent_set::update_action::UPDATE_PIN; - if (!ext) { - missing.insert(off, len); - } - }); - } - must_read.intersection_of( - 
to_read, - missing); - return must_read; -} - -extent_map ExtentCache::get_remaining_extents_for_rmw( - const hobject_t &oid, - write_pin &pin, - const extent_set &to_get) -{ - if (to_get.empty()) { - return extent_map(); - } - extent_map ret; - auto &eset = get_or_create(oid); - for (auto &&res: to_get) { - bufferlist bl; - uint64_t cur = res.first; - eset.traverse_update( - pin, - res.first, - res.second, - [&](uint64_t off, uint64_t len, - extent *ext, object_extent_set::update_action *action) { - ceph_assert(off == cur); - cur = off + len; - action->action = object_extent_set::update_action::NONE; - ceph_assert(ext && ext->bl && ext->pinned_by_write()); - bl.substr_of( - *(ext->bl), - off - ext->offset, - len); - ret.insert(off, len, bl); - }); - } - return ret; -} - -void ExtentCache::present_rmw_update( - const hobject_t &oid, - write_pin &pin, - const extent_map &extents) -{ - if (extents.empty()) { - return; - } - auto &eset = get_or_create(oid); - for (auto &&res: extents) { - eset.traverse_update( - pin, - res.get_off(), - res.get_len(), - [&](uint64_t off, uint64_t len, - extent *ext, object_extent_set::update_action *action) { - action->action = object_extent_set::update_action::NONE; - ceph_assert(ext && ext->pinned_by_write()); - action->bl = bufferlist(); - action->bl->substr_of( - res.get_val(), - off - res.get_off(), - len); - }); - } -} - -ostream &ExtentCache::print(ostream &out) const -{ - out << "ExtentCache(" << std::endl; - for (auto esiter = per_object_caches.begin(); - esiter != per_object_caches.end(); - ++esiter) { - out << " Extents(" << esiter->oid << ")[" << std::endl; - for (auto exiter = esiter->extent_set.begin(); - exiter != esiter->extent_set.end(); - ++exiter) { - out << " Extent(" << exiter->offset - << "~" << exiter->get_length() - << ":" << exiter->pin_tid() - << ")" << std::endl; - } - } - return out << ")" << std::endl; -} - -ostream &operator<<(ostream &lhs, const ExtentCache &cache) -{ - return cache.print(lhs); -} diff --git a/src/osd/ExtentCache.h b/src/osd/ExtentCache.h deleted file mode 100644 index 674ba6991679f..0000000000000 --- a/src/osd/ExtentCache.h +++ /dev/null @@ -1,486 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2016 Red Hat - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include "include/interval_set.h" -#include "common/interval_map.h" -#include "include/buffer.h" -#include "common/hobject.h" - -/** - ExtentCache - - The main purpose of this cache is to ensure that we can pipeline - overlapping partial overwrites. - - To that end we need to ensure that an extent pinned for an operation is - live until that operation completes. However, a particular extent - might be pinned by multiple operations (several pipelined writes - on the same object). - - 1) When we complete an operation, we only look at extents owned only - by that operation. - 2) Per-extent overhead is fixed size. - 2) Per-operation metadata is fixed size. 
- - This is simple enough to realize with two main structures: - - extent: contains a pointer to the pin owning it and intrusive list - pointers to other extents owned by the same pin - - pin_state: contains the list head for extents owned by it - - This works as long as we only need to remember one "owner" for - each extent. To make this work, we'll need to leverage some - invariants guaranteed by higher layers: - - 1) Writes on a particular object must be ordered - 2) A particular object will have outstanding reads or writes, but not - both (note that you can have a read while a write is committed, but - not applied). - - Our strategy therefore will be to have whichever in-progress op will - finish "last" be the owner of a particular extent. For now, we won't - cache reads, so 2) simply means that we can assume that reads and - recovery operations imply no unstable extents on the object in - question. - - Write: WaitRead -> WaitCommit -> Complete - - Invariant 1) above actually indicates that we can't have writes - bypassing the WaitRead state while there are writes waiting on - Reads. Thus, the set of operations pinning a particular extent - must always complete in order or arrival. - - This suggests that a particular extent may be in only the following - states: - - - 0) Empty (not in the map at all) - 1) Write Pending N - - Some write with reqid <= N is currently fetching the data for - this extent - - The extent must persist until Write reqid N completes - - All ops pinning this extent are writes in the WaitRead state of - the Write pipeline (there must be an in progress write, so no - reads can be in progress). - 2) Write Pinned N: - - This extent has data corresponding to some reqid M <= N - - The extent must persist until Write reqid N commits - - All ops pinning this extent are writes in some Write - state (all are possible). Reads are not possible - in this state (or the others) due to 2). - - All of the above suggests that there are 3 things users can - ask of the cache corresponding to the 3 Write pipelines - states. 
- */ - -/// If someone wants these types, but not ExtentCache, move to another file -struct bl_split_merge { - ceph::buffer::list split( - uint64_t offset, - uint64_t length, - ceph::buffer::list &bl) const { - ceph::buffer::list out; - out.substr_of(bl, offset, length); - return out; - } - bool can_merge(const ceph::buffer::list &left, const ceph::buffer::list &right) const { - return true; - } - ceph::buffer::list merge(ceph::buffer::list &&left, ceph::buffer::list &&right) const { - ceph::buffer::list bl{std::move(left)}; - bl.claim_append(right); - return bl; - } - uint64_t length(const ceph::buffer::list &b) const { return b.length(); } -}; -using extent_set = interval_set; -using extent_map = interval_map; - -class ExtentCache { - struct object_extent_set; - struct pin_state; -private: - - struct extent { - object_extent_set *parent_extent_set = nullptr; - pin_state *parent_pin_state = nullptr; - boost::intrusive::set_member_hook<> extent_set_member; - boost::intrusive::list_member_hook<> pin_list_member; - - uint64_t offset; - uint64_t length; - std::optional bl; - - uint64_t get_length() const { - return length; - } - - bool is_pending() const { - return bl == std::nullopt; - } - - bool pinned_by_write() const { - ceph_assert(parent_pin_state); - return parent_pin_state->is_write(); - } - - uint64_t pin_tid() const { - ceph_assert(parent_pin_state); - return parent_pin_state->tid; - } - - extent(uint64_t offset, ceph::buffer::list _bl) - : offset(offset), length(_bl.length()), bl(_bl) {} - - extent(uint64_t offset, uint64_t length) - : offset(offset), length(length) {} - - bool operator<(const extent &rhs) const { - return offset < rhs.offset; - } - private: - // can briefly violate the two link invariant, used in unlink() and move() - void _link_pin_state(pin_state &pin_state); - void _unlink_pin_state(); - public: - void unlink(); - void link(object_extent_set &parent_extent_set, pin_state &pin_state); - void move(pin_state &to); - }; - - struct object_extent_set : boost::intrusive::set_base_hook<> { - hobject_t oid; - explicit object_extent_set(const hobject_t &oid) : oid(oid) {} - - using set_member_options = boost::intrusive::member_hook< - extent, - boost::intrusive::set_member_hook<>, - &extent::extent_set_member>; - using set = boost::intrusive::set; - set extent_set; - - bool operator<(const object_extent_set &rhs) const { - return oid < rhs.oid; - } - - struct uint_cmp { - bool operator()(uint64_t lhs, const extent &rhs) const { - return lhs < rhs.offset; - } - bool operator()(const extent &lhs, uint64_t rhs) const { - return lhs.offset < rhs; - } - }; - std::pair get_containing_range( - uint64_t offset, uint64_t length); - - void erase(uint64_t offset, uint64_t length); - - struct update_action { - enum type { - NONE, - UPDATE_PIN - }; - type action = NONE; - std::optional bl; - }; - template - void traverse_update( - pin_state &pin, - uint64_t offset, - uint64_t length, - F &&f) { - auto range = get_containing_range(offset, length); - - if (range.first == range.second || range.first->offset > offset) { - uint64_t extlen = range.first == range.second ? - length : range.first->offset - offset; - - update_action action; - f(offset, extlen, nullptr, &action); - ceph_assert(!action.bl || action.bl->length() == extlen); - if (action.action == update_action::UPDATE_PIN) { - extent *ext = action.bl ? 
- new extent(offset, *action.bl) : - new extent(offset, extlen); - ext->link(*this, pin); - } else { - ceph_assert(!action.bl); - } - } - - for (auto p = range.first; p != range.second;) { - extent *ext = &*p; - ++p; - - uint64_t extoff = std::max(ext->offset, offset); - uint64_t extlen = std::min( - ext->length - (extoff - ext->offset), - offset + length - extoff); - - update_action action; - f(extoff, extlen, ext, &action); - ceph_assert(!action.bl || action.bl->length() == extlen); - extent *final_extent = nullptr; - if (action.action == update_action::NONE) { - final_extent = ext; - } else { - pin_state *ps = ext->parent_pin_state; - ext->unlink(); - if ((ext->offset < offset) && - (ext->offset + ext->get_length() > offset)) { - extent *head = nullptr; - if (ext->bl) { - ceph::buffer::list bl; - bl.substr_of( - *(ext->bl), - 0, - offset - ext->offset); - head = new extent(ext->offset, bl); - } else { - head = new extent( - ext->offset, offset - ext->offset); - } - head->link(*this, *ps); - } - if ((ext->offset + ext->length > offset + length) && - (offset + length > ext->offset)) { - uint64_t nlen = - (ext->offset + ext->get_length()) - (offset + length); - extent *tail = nullptr; - if (ext->bl) { - ceph::buffer::list bl; - bl.substr_of( - *(ext->bl), - ext->get_length() - nlen, - nlen); - tail = new extent(offset + length, bl); - } else { - tail = new extent(offset + length, nlen); - } - tail->link(*this, *ps); - } - if (action.action == update_action::UPDATE_PIN) { - if (ext->bl) { - ceph::buffer::list bl; - bl.substr_of( - *(ext->bl), - extoff - ext->offset, - extlen); - final_extent = new ExtentCache::extent( - extoff, - bl); - } else { - final_extent = new ExtentCache::extent( - extoff, extlen); - } - final_extent->link(*this, pin); - } - delete ext; - } - - if (action.bl) { - ceph_assert(final_extent); - ceph_assert(final_extent->length == action.bl->length()); - final_extent->bl = *(action.bl); - } - - uint64_t next_off = p == range.second ? - offset + length : p->offset; - if (extoff + extlen < next_off) { - uint64_t tailoff = extoff + extlen; - uint64_t taillen = next_off - tailoff; - - update_action action; - f(tailoff, taillen, nullptr, &action); - ceph_assert(!action.bl || action.bl->length() == taillen); - if (action.action == update_action::UPDATE_PIN) { - extent *ext = action.bl ? 
- new extent(tailoff, *action.bl) : - new extent(tailoff, taillen); - ext->link(*this, pin); - } else { - ceph_assert(!action.bl); - } - } - } - } - }; - struct Cmp { - bool operator()(const hobject_t &oid, const object_extent_set &rhs) const { - return oid < rhs.oid; - } - bool operator()(const object_extent_set &lhs, const hobject_t &oid) const { - return lhs.oid < oid; - } - }; - - object_extent_set &get_or_create(const hobject_t &oid); - object_extent_set *get_if_exists(const hobject_t &oid); - - void remove_and_destroy_if_empty(object_extent_set &set); - using cache_set = boost::intrusive::set; - cache_set per_object_caches; - - uint64_t next_write_tid = 1; - uint64_t next_read_tid = 1; - struct pin_state { - uint64_t tid = 0; - enum pin_type_t { - NONE, - WRITE, - }; - pin_type_t pin_type = NONE; - bool is_write() const { return pin_type == WRITE; } - - pin_state(const pin_state &other) = delete; - pin_state &operator=(const pin_state &other) = delete; - pin_state(pin_state &&other) = delete; - pin_state() = default; - - using list_member_options = boost::intrusive::member_hook< - extent, - boost::intrusive::list_member_hook<>, - &extent::pin_list_member>; - using list = boost::intrusive::list, list_member_options>; - list pin_list; - ~pin_state() { - ceph_assert(pin_list.empty()); - ceph_assert(tid == 0); - ceph_assert(pin_type == NONE); - } - void _open(uint64_t in_tid, pin_type_t in_type) { - ceph_assert(pin_type == NONE); - ceph_assert(in_tid > 0); - tid = in_tid; - pin_type = in_type; - } - }; - - void release_pin(pin_state &p) { - for (auto iter = p.pin_list.begin(); iter != p.pin_list.end(); ) { - std::unique_ptr extent(&*iter); // we now own this - iter++; // unlink will invalidate - ceph_assert(extent->parent_extent_set); - auto &eset = *(extent->parent_extent_set); - extent->unlink(); - remove_and_destroy_if_empty(eset); - } - p.tid = 0; - p.pin_type = pin_state::NONE; - } - -public: - class write_pin : private pin_state { - friend class ExtentCache; - private: - void open(uint64_t in_tid) { - _open(in_tid, pin_state::WRITE); - } - public: - write_pin() : pin_state() {} - }; - - void open_write_pin(write_pin &pin) { - pin.open(next_write_tid++); - } - - /** - * Reserves extents required for rmw, and learn - * which need to be read - * - * Pins all extents in to_write. Returns subset of to_read not - * currently present in the cache. Caller must obtain those - * extents before calling get_remaining_extents_for_rmw. - * - * Transition table: - * - Empty -> Write Pending pin.reqid - * - Write Pending N -> Write Pending pin.reqid - * - Write Pinned N -> Write Pinned pin.reqid - * - * @param oid [in] object undergoing rmw - * @param pin [in,out] pin to use (obtained from create_write_pin) - * @param to_write [in] extents which will be written - * @param to_read [in] extents to read prior to write (must be subset - * of to_write) - * @return subset of to_read which isn't already present or pending - */ - extent_set reserve_extents_for_rmw( - const hobject_t &oid, - write_pin &pin, - const extent_set &to_write, - const extent_set &to_read); - - /** - * Gets extents required for rmw not returned from - * reserve_extents_for_rmw - * - * Requested extents (to_get) must be the set to_read \ the set - * returned from reserve_extents_for_rmw. No transition table, - * all extents at this point must be present and already pinned - * for this pin by reserve_extents_for_rmw. 
- * - * @param oid [in] object - * @param pin [in,out] pin associated with this IO - * @param to_get [in] extents to get (see above for restrictions) - * @return map of buffers from to_get - */ - extent_map get_remaining_extents_for_rmw( - const hobject_t &oid, - write_pin &pin, - const extent_set &to_get); - - /** - * Updates the cache to reflect the rmw write - * - * All presented extents must already have been specified in - * reserve_extents_for_rmw under to_write. - * - * Transition table: - * - Empty -> invalid, must call reserve_extents_for_rmw first - * - Write Pending N -> Write Pinned N, update buffer - * (assert N >= pin.reqid) - * - Write Pinned N -> Update buffer (assert N >= pin.reqid) - * - * @param oid [in] object - * @param pin [in,out] pin associated with this IO - * @param extents [in] map of buffers to update - * @return void - */ - void present_rmw_update( - const hobject_t &oid, - write_pin &pin, - const extent_map &extents); - - /** - * Release all buffers pinned by pin - */ - void release_write_pin( - write_pin &pin) { - release_pin(pin); - } - - std::ostream &print(std::ostream &out) const; -}; - -std::ostream &operator <<(std::ostream &lhs, const ExtentCache &cache); \ No newline at end of file diff --git a/src/test/erasure-code/TestErasureCodePluginJerasure.cc b/src/test/erasure-code/TestErasureCodePluginJerasure.cc index ad5082bd8c39f..e53c55c15cc63 100644 --- a/src/test/erasure-code/TestErasureCodePluginJerasure.cc +++ b/src/test/erasure-code/TestErasureCodePluginJerasure.cc @@ -17,6 +17,7 @@ #include #include + #include "erasure-code/ErasureCodePlugin.h" #include "log/Log.h" #include "global/global_context.h" @@ -62,6 +63,80 @@ TEST(ErasureCodePlugin, factory) } } +bufferptr create_bufferptr(uint64_t value) { + bufferlist bl; + bl.append_zero(4096); + memcpy(bl.c_str(), &value, sizeof(value)); + return bl.begin().get_current_ptr(); +} + +TEST(ErasureCodePlugin, parity_delta_write) { + ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance(); + ErasureCodeInterfaceRef erasure_code; + ErasureCodeProfile profile; + profile["technique"] = "reed_sol_van"; + profile["k"] = "5"; + int k=5; + profile["m"] = "3"; + int m=3; + EXPECT_EQ(0, instance.factory("jerasure", + g_conf().get_val("erasure_code_dir"), + profile, + &erasure_code, &cerr)); + shard_id_map data(8); + shard_id_map coding(8); + shard_id_map coding2(8); + shard_id_map decode_in(8); + shard_id_map decode_out(8); + + uint32_t seeds[] = {100, 101, 102, 103, 104}; + uint32_t overwrite3 = 1032; + + for (shard_id_t s; s < k; ++s) { + data[s] = create_bufferptr(seeds[int(s)]); + } + for (shard_id_t s(k); s < k + m; ++s) { + coding[s] = create_bufferptr(-1); + coding2[s] = create_bufferptr(-1); + } + + // Do a normal encode. 
+ erasure_code->encode_chunks(data, coding); + + shard_id_map delta(8); + delta[shard_id_t(3)] = create_bufferptr(-1); + + bufferptr overwrite_bp = create_bufferptr(overwrite3); + + erasure_code->encode_delta(data[shard_id_t(3)], overwrite_bp, &delta[shard_id_t(3)]); + erasure_code->apply_delta(delta, coding); + data[shard_id_t(3)] = overwrite_bp; + + erasure_code->encode_chunks(data, coding2); + + for (shard_id_t s(k); s < k + m; ++s) { + ASSERT_EQ(*(uint32_t*)coding[s].c_str(), *(uint32_t*)coding2[s].c_str()); + } + + data.erase(shard_id_t(4)); + data.emplace(shard_id_t(4), (char*)malloc(4096), 4096); + shard_id_set want; + want.insert_range(shard_id_t(0), 5); + decode_in[shard_id_t(0)] = data[shard_id_t(0)]; + decode_in[shard_id_t(1)] = data[shard_id_t(1)]; + decode_in[shard_id_t(2)] = data[shard_id_t(2)]; + decode_in[shard_id_t(3)] = data[shard_id_t(3)]; + decode_out[shard_id_t(4)] = data[shard_id_t(4)]; + decode_in[shard_id_t(6)] = coding[shard_id_t(6)]; + + ASSERT_EQ(0, erasure_code->decode_chunks(want, decode_in, decode_out)); + + seeds[3] = overwrite3; + for (shard_id_t s(0); s < k; ++s) { + ASSERT_EQ(seeds[int(s)], *(uint32_t*)data[s].c_str()); + } +} + /* * Local Variables: * compile-command: "cd ../.. ; make -j4 && diff --git a/src/test/osd/CMakeLists.txt b/src/test/osd/CMakeLists.txt index bcaf9b54bf26a..3661d91dffea1 100644 --- a/src/test/osd/CMakeLists.txt +++ b/src/test/osd/CMakeLists.txt @@ -76,6 +76,14 @@ add_executable(unittest_ecbackend add_ceph_unittest(unittest_ecbackend) target_link_libraries(unittest_ecbackend osd global) +# unittest_ecutil +add_executable(unittest_ecutil + TestECUtil.cc + $ +) +add_ceph_unittest(unittest_ecutil) +target_link_libraries(unittest_ecutil osd global) + # unittest_osdscrub add_executable(unittest_osdscrub TestOSDScrub.cc diff --git a/src/test/osd/TestECBackend.cc b/src/test/osd/TestECBackend.cc index c979e84a518e1..ec875abdb9e69 100644 --- a/src/test/osd/TestECBackend.cc +++ b/src/test/osd/TestECBackend.cc @@ -19,6 +19,9 @@ #include "osd/ECCommon.h" #include "osd/ECBackend.h" #include "gtest/gtest.h" +#include "osd/osd_types.h" +#include "common/ceph_argparse.h" +#include "erasure-code/ErasureCode.h" using namespace std; @@ -31,216 +34,1066 @@ TEST(ECUtil, stripe_info_t) ECUtil::stripe_info_t s(k, m, swidth); ASSERT_EQ(s.get_stripe_width(), swidth); - ASSERT_EQ(s.logical_to_next_chunk_offset(0), 0u); - ASSERT_EQ(s.logical_to_next_chunk_offset(1), s.get_chunk_size()); - ASSERT_EQ(s.logical_to_next_chunk_offset(swidth - 1), + ASSERT_EQ(s.ro_offset_to_next_chunk_offset(0), 0u); + ASSERT_EQ(s.ro_offset_to_next_chunk_offset(1), s.get_chunk_size()); + ASSERT_EQ(s.ro_offset_to_next_chunk_offset(swidth - 1), s.get_chunk_size()); - ASSERT_EQ(s.logical_to_prev_chunk_offset(0), 0u); - ASSERT_EQ(s.logical_to_prev_chunk_offset(swidth), s.get_chunk_size()); - ASSERT_EQ(s.logical_to_prev_chunk_offset((swidth * 2) - 1), + ASSERT_EQ(s.ro_offset_to_prev_chunk_offset(0), 0u); + ASSERT_EQ(s.ro_offset_to_prev_chunk_offset(swidth), s.get_chunk_size()); + ASSERT_EQ(s.ro_offset_to_prev_chunk_offset((swidth * 2) - 1), s.get_chunk_size()); - ASSERT_EQ(s.logical_to_next_stripe_offset(0), 0u); - ASSERT_EQ(s.logical_to_next_stripe_offset(swidth - 1), + ASSERT_EQ(s.ro_offset_to_next_stripe_ro_offset(0), 0u); + ASSERT_EQ(s.ro_offset_to_next_stripe_ro_offset(swidth - 1), s.get_stripe_width()); - ASSERT_EQ(s.logical_to_prev_stripe_offset(swidth), s.get_stripe_width()); - ASSERT_EQ(s.logical_to_prev_stripe_offset(swidth), s.get_stripe_width()); - 
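The parity_delta_write test above exercises encode_delta/apply_delta: instead of re-encoding the whole stripe after an overwrite, the plugin derives a delta from the old and new data chunk and applies it to the existing parity chunks. For a parity that is a plain XOR of the data chunks the identity reduces to the sketch below; Reed-Solomon parities apply the same delta through GF(2^w) arithmetic (illustrative only, not the plugin's code):

#include <cassert>
#include <cstdint>

// Illustrative sketch -- not part of the patch.
int main() {
  uint32_t d[4] = {100, 101, 102, 103};      // data chunks (one word each)
  uint32_t p = d[0] ^ d[1] ^ d[2] ^ d[3];    // full-stripe encode

  const uint32_t new_d3 = 1032;              // overwrite one data chunk
  const uint32_t delta = d[3] ^ new_d3;      // encode_delta: old ^ new
  p ^= delta;                                // apply_delta on the parity
  d[3] = new_d3;

  // Same parity as re-encoding the stripe from scratch.
  assert(p == (d[0] ^ d[1] ^ d[2] ^ d[3]));
  return 0;
}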
ASSERT_EQ(s.logical_to_prev_stripe_offset((swidth * 2) - 1), + ASSERT_EQ(s.ro_offset_to_prev_stripe_ro_offset(swidth), s.get_stripe_width()); + ASSERT_EQ(s.ro_offset_to_prev_stripe_ro_offset(swidth), s.get_stripe_width()); + ASSERT_EQ(s.ro_offset_to_prev_stripe_ro_offset((swidth * 2) - 1), s.get_stripe_width()); - ASSERT_EQ(s.aligned_logical_offset_to_chunk_offset(2*swidth), + ASSERT_EQ(s.aligned_ro_offset_to_chunk_offset(2*swidth), 2*s.get_chunk_size()); - ASSERT_EQ(s.aligned_chunk_offset_to_logical_offset(2*s.get_chunk_size()), + ASSERT_EQ(s.chunk_aligned_shard_offset_to_ro_offset(2*s.get_chunk_size()), 2*s.get_stripe_width()); // Stripe 1 + 1 chunk for 10 stripes needs to read 11 stripes starting // from 1 because there is a partial stripe at the start and end - ASSERT_EQ(s.chunk_aligned_offset_len_to_chunk( - make_pair(swidth+s.get_chunk_size(), 10*swidth)), + ASSERT_EQ(s.chunk_aligned_ro_range_to_shard_ro_range(swidth+s.get_chunk_size(), 10*swidth), make_pair(s.get_chunk_size(), 11*s.get_chunk_size())); // Stripe 1 + 0 chunks for 10 stripes needs to read 10 stripes starting // from 1 because there are no partial stripes - ASSERT_EQ(s.chunk_aligned_offset_len_to_chunk(make_pair(swidth, 10*swidth)), + ASSERT_EQ(s.chunk_aligned_ro_range_to_shard_ro_range(swidth, 10*swidth), make_pair(s.get_chunk_size(), 10*s.get_chunk_size())); // Stripe 0 + 1 chunk for 10 stripes needs to read 11 stripes starting // from 0 because there is a partial stripe at the start and end - ASSERT_EQ(s.chunk_aligned_offset_len_to_chunk(make_pair(s.get_chunk_size(), 10*swidth)), + ASSERT_EQ(s.chunk_aligned_ro_range_to_shard_ro_range(s.get_chunk_size(), 10*swidth), make_pair(0, 11*s.get_chunk_size())); // Stripe 0 + 1 chunk for (10 stripes + 1 chunk) needs to read 11 stripes // starting from 0 because there is a partial stripe at the start and end - ASSERT_EQ(s.chunk_aligned_offset_len_to_chunk(make_pair(s.get_chunk_size(), - 10*swidth + s.get_chunk_size())), + ASSERT_EQ(s.chunk_aligned_ro_range_to_shard_ro_range(s.get_chunk_size(), + 10*swidth + s.get_chunk_size()), make_pair(0, 11*s.get_chunk_size())); // Stripe 0 + 2 chunks for (10 stripes + 2 chunks) needs to read 11 stripes // starting from 0 because there is a partial stripe at the start - ASSERT_EQ(s.chunk_aligned_offset_len_to_chunk(make_pair(2*s.get_chunk_size(), - 10*swidth + 2*s.get_chunk_size())), - make_pair(0, 11*s.get_chunk_size())); + ASSERT_EQ(s.chunk_aligned_ro_range_to_shard_ro_range(2*s.get_chunk_size(), + 10*swidth + 2*s.get_chunk_size()), + make_pair(0, 11*s.get_chunk_size())); - ASSERT_EQ(s.offset_len_to_stripe_bounds(make_pair(swidth-10, (uint64_t)20)), + ASSERT_EQ(s.ro_offset_len_to_stripe_ro_offset_len(swidth-10, (uint64_t)20), make_pair((uint64_t)0, 2*swidth)); } -TEST(ECUtil, offset_length_is_same_stripe) -{ - const uint64_t swidth = 4096; - const uint64_t schunk = 1024; - const unsigned int k = 4; - const unsigned int m = 2; +class ErasureCodeDummyImpl : public ErasureCodeInterface { +public: - ECUtil::stripe_info_t s(k, m, swidth); - ASSERT_EQ(s.get_stripe_width(), swidth); - ASSERT_EQ(s.get_chunk_size(), schunk); + uint64_t get_supported_optimizations() const override { + return FLAG_EC_PLUGIN_PARTIAL_READ_OPTIMIZATION | + FLAG_EC_PLUGIN_PARTIAL_WRITE_OPTIMIZATION | + FLAG_EC_PLUGIN_ZERO_INPUT_ZERO_OUTPUT_OPTIMIZATION | + FLAG_EC_PLUGIN_ZERO_PADDING_OPTIMIZATION | + FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION; + } - // read nothing at the very beginning - // +---+---+---+---+ - // | 0| | | | - // +---+---+---+---+ - // | | | | | - // 
+---+---+---+---+ - ASSERT_TRUE(s.offset_length_is_same_stripe(0, 0)); - - // read nothing at the stripe end - // +---+---+---+---+ - // | | | | 0| - // +---+---+---+---+ - // | | | | | - // +---+---+---+---+ - ASSERT_TRUE(s.offset_length_is_same_stripe(swidth, 0)); - - // read single byte at the stripe end - // +---+---+---+---+ - // | | | | ~1| - // +---+---+---+---+ - // | | | | | - // +---+---+---+---+ - ASSERT_TRUE(s.offset_length_is_same_stripe(swidth - 1, 1)); - - // read single stripe - // +---+---+---+---+ - // | 1k| 1k| 1k| 1k| - // +---+---+---+---+ - // | | | | | - // +---+---+---+---+ - ASSERT_TRUE(s.offset_length_is_same_stripe(0, swidth)); - - // read single chunk - // +---+---+---+---+ - // | 1k| | | | - // +---+---+---+---+ - // | | | | | - // +---+---+---+---+ - ASSERT_TRUE(s.offset_length_is_same_stripe(0, schunk)); - - // read single stripe except its first chunk - // +---+---+---+---+ - // | | 1k| 1k| 1k| - // +---+---+---+---+ - // | | | | | - // +---+---+---+---+ - ASSERT_TRUE(s.offset_length_is_same_stripe(schunk, swidth - schunk)); - - // read two stripes - // +---+---+---+---+ - // | 1k| 1k| 1k| 1k| - // +---+---+---+---+ - // | 1k| 1k| 1k| 1k| - // +---+---+---+---+ - ASSERT_FALSE(s.offset_length_is_same_stripe(0, 2*swidth)); - - // multistripe read: 1st stripe without 1st byte + 1st byte of 2nd stripe - // +-----+---+---+---+ - // | 1k-1| 1k| 1k| 1k| - // +-----+---+---+---+ - // | 1| | | | - // +-----+---+---+---+ - ASSERT_FALSE(s.offset_length_is_same_stripe(1, swidth)); -} + ErasureCodeProfile _profile; + const std::vector chunk_mapping = {}; // no remapping + std::vector> default_sub_chunk = {std::pair(0,1)}; + int data_chunk_count = 4; + int chunk_count = 6; + + int init(ErasureCodeProfile &profile, std::ostream *ss) override { + return 0; + } + + const ErasureCodeProfile &get_profile() const override { + return _profile; + } + + int create_rule(const string &name, CrushWrapper &crush, std::ostream *ss) const override { + return 0; + } + + unsigned int get_chunk_count() const override { + return chunk_count; + } + + unsigned int get_data_chunk_count() const override { + return data_chunk_count; + } + + unsigned int get_coding_chunk_count() const override { + return 0; + } + + int get_sub_chunk_count() override { + return 1; + } + + unsigned int get_chunk_size(unsigned int stripe_width) const override { + return 0; + } + + int minimum_to_decode(const shard_id_set &want_to_read, const shard_id_set &available, + shard_id_set &minimum_set, + shard_id_map>> *minimum_sub_chunks) override { + shard_id_t parity_shard_index(data_chunk_count); + for (shard_id_t shard : want_to_read) { + if (available.contains(shard)) { + minimum_set.insert(shard); + } else { + // Shard is missing. Recover with every other shard and one parity + // for each missing shard. 
+ for (shard_id_t i; iemplace(shard, default_sub_chunk); + } + return 0; + } + + [[deprecated]] + int minimum_to_decode(const std::set &want_to_read, + const std::set &available, + std::map>> *minimum) override + { + ADD_FAILURE(); + return 0; + } + + [[deprecated]] + int minimum_to_decode_with_cost(const std::set &want_to_read, + const std::map &available, std::set *minimum) override { + ADD_FAILURE(); + return 0; + } + + int minimum_to_decode_with_cost(const shard_id_set &want_to_read, const shard_id_map &available, + shard_id_set *minimum) override { + return 0; + } + + int encode(const shard_id_set &want_to_encode, const bufferlist &in, shard_id_map *encoded) override { + return 0; + } + + [[deprecated]] + int encode(const std::set &want_to_encode, const bufferlist &in + , std::map *encoded) override + { + ADD_FAILURE(); + return 0; + } + + [[deprecated]] + int encode_chunks(const std::set &want_to_encode, + std::map *encoded) override + { + ADD_FAILURE(); + return 0; + } + + int encode_chunks(const shard_id_map &in, shard_id_map &out) override { + return 0; + } + + int decode(const shard_id_set &want_to_read, const shard_id_map &chunks, shard_id_map *decoded, + int chunk_size) override { + return 0; + } + + [[deprecated]] + int decode(const std::set &want_to_read, const std::map &chunks, + std::map *decoded, int chunk_size) override + { + ADD_FAILURE(); + return 0; + } + + [[deprecated]] + int decode_chunks(const std::set &want_to_read, + const std::map &chunks, + std::map *decoded) override { + ADD_FAILURE(); + return 0; + } + + int decode_chunks(const shard_id_set &want_to_read, + shard_id_map &in, shard_id_map &out) override + { + if (in.size() < data_chunk_count) { + ADD_FAILURE(); + } + uint64_t len = 0; + for (auto &&[shard, bp] : in) { + if (len == 0) { + len = bp.length(); + } else if (len != bp.length()) { + ADD_FAILURE(); + } + } + if (len == 0) { + ADD_FAILURE(); + } + if (out.size() == 0) { + ADD_FAILURE(); + } + for (auto &&[shard, bp] : out) { + if (len != bp.length()) { + ADD_FAILURE(); + } + } + return 0; + } + + const vector &get_chunk_mapping() const override { + return chunk_mapping; + } + + [[deprecated]] + int decode_concat(const std::set &want_to_read, + const std::map &chunks, bufferlist *decoded) override { + ADD_FAILURE(); + return 0; + } + + [[deprecated]] + int decode_concat(const std::map &chunks, + bufferlist *decoded) override { + ADD_FAILURE(); + return 0; + } + + size_t get_minimum_granularity() override { return 0; } + void encode_delta(const bufferptr &old_data, const bufferptr &new_data + , bufferptr *delta) override {} + void apply_delta(const shard_id_map &in + , shard_id_map &out) override {} +}; + +class ECListenerStub : public ECListener { + OSDMapRef osd_map_ref; + pg_info_t pg_info; + set backfill_shards; + shard_id_set backfill_shard_id_set; + map> missing_loc_shards; + map shard_missing; + pg_missing_set shard_not_missing_const; + pg_pool_t pg_pool; + set acting_recovery_backfill_shards; + shard_id_set acting_recovery_backfill_shard_id_set; + map shard_info; + PGLog pg_log; + pg_info_t shard_pg_info; + std::string dbg_prefix = "stub"; + +public: + set acting_shards; + + ECListenerStub() + : pg_log(NULL) {} + + const OSDMapRef &pgb_get_osdmap() const override { + return osd_map_ref; + } + + epoch_t pgb_get_osdmap_epoch() const override { + return 0; + } + + const pg_info_t &get_info() const override { + return pg_info; + } + + void cancel_pull(const hobject_t &soid) override { + + } + + pg_shard_t primary_shard() const override { + return 
pg_shard_t(); + } + + bool pgb_is_primary() const override { + return false; + } + + void on_failed_pull(const set &from, const hobject_t &soid, const eversion_t &v) override { + + } + + void + on_local_recover(const hobject_t &oid, const ObjectRecoveryInfo &recovery_info, ObjectContextRef obc, bool is_delete, + ceph::os::Transaction *t) override { + + } + + void on_global_recover(const hobject_t &oid, const object_stat_sum_t &stat_diff, bool is_delete) override { + + } + + void on_peer_recover(pg_shard_t peer, const hobject_t &oid, const ObjectRecoveryInfo &recovery_info) override { + + } + void begin_peer_recover(pg_shard_t peer, const hobject_t oid) override { + + } + + bool pg_is_repair() const override { + return false; + } + + ObjectContextRef + get_obc(const hobject_t &hoid, const map> &attrs) override { + return ObjectContextRef(); + } + + bool check_failsafe_full() override { + return false; + } + + hobject_t get_temp_recovery_object(const hobject_t &target, eversion_t version) override { + return hobject_t(); + } + + bool pg_is_remote_backfilling() override { + return false; + } + + void pg_add_local_num_bytes(int64_t num_bytes) override { + + } + + void pg_add_num_bytes(int64_t num_bytes) override { + + } + + void inc_osd_stat_repaired() override { + + } + + void add_temp_obj(const hobject_t &oid) override { + + } + + void clear_temp_obj(const hobject_t &oid) override { + + } + + epoch_t get_last_peering_reset_epoch() const override { + return 0; + } + + GenContext *bless_unlocked_gencontext(GenContext *c) override { + return nullptr; + } + + void schedule_recovery_work(GenContext *c, uint64_t cost) override { + + } + + epoch_t get_interval_start_epoch() const override { + return 0; + } + + const set &get_acting_shards() const override { + return acting_shards; + } + + const set &get_backfill_shards() const override { + return backfill_shards; + } + + const map> &get_missing_loc_shards() const override { + return missing_loc_shards; + } + + const map &get_shard_missing() const override { + return shard_missing; + } + + const pg_missing_const_i &get_shard_missing(pg_shard_t peer) const override { + return shard_not_missing_const; + } + + const pg_missing_const_i *maybe_get_shard_missing(pg_shard_t peer) const override { + return nullptr; + } + + const pg_info_t &get_shard_info(pg_shard_t peer) const override { + return shard_pg_info; + } + + ceph_tid_t get_tid() override { + return 0; + } + + pg_shard_t whoami_shard() const override { + return pg_shard_t(); + } + + void send_message_osd_cluster(vector> &messages, epoch_t from_epoch) override { + + } + + ostream &gen_dbg_prefix(ostream &out) const override { + out << dbg_prefix; + return out; + } + + const pg_pool_t &get_pool() const override { + return pg_pool; + } + + const set &get_acting_recovery_backfill_shards() const override { + return acting_recovery_backfill_shards; + } + + const shard_id_set &get_acting_recovery_backfill_shard_id_set() const override { + return acting_recovery_backfill_shard_id_set; + } + + bool should_send_op(pg_shard_t peer, const hobject_t &hoid) override { + return false; + } + + const map &get_shard_info() const override { + return shard_info; + } + + spg_t primary_spg_t() const override { + return spg_t(); + } + + const PGLog &get_log() const override { + return pg_log; + } + + DoutPrefixProvider *get_dpp() override { + return nullptr; + } + + void apply_stats(const hobject_t &soid, const object_stat_sum_t &delta_stats) override { + + } + + bool is_missing_object(const hobject_t &oid) const 
override { + return false; + } + + void add_local_next_event(const pg_log_entry_t &e) override { + + } + + void log_operation(vector &&logv, const optional &hset_history, + const eversion_t &trim_to, const eversion_t &roll_forward_to, + const eversion_t &min_last_complete_ondisk, bool transaction_applied, os::Transaction &t, + bool async) override { + + } + + void op_applied(const eversion_t &applied_version) override { + + } + + uint64_t min_peer_features() const { + return 0; + } +}; TEST(ECCommon, get_min_want_to_read_shards) { const uint64_t swidth = 4096; const unsigned int k = 4; const unsigned int m = 2; + const uint64_t csize = 1024; ECUtil::stripe_info_t s(k, m, swidth); + ECListenerStub listenerStub; ASSERT_EQ(s.get_stripe_width(), swidth); - ASSERT_EQ(s.get_chunk_size(), 1024); + ASSERT_EQ(s.get_chunk_size(), csize); + + const std::vector chunk_mapping = {}; // no remapping + ErasureCodeInterfaceRef ec_impl(new ErasureCodeDummyImpl); + ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub); + + ECUtil::shard_extent_set_t empty_extent_set_map(s.get_k_plus_m()); // read nothing at the very beginning { - std::set want_to_read; - ECCommon::ReadPipeline::get_min_want_to_read_shards( - 0, 0, s, &want_to_read); - ASSERT_TRUE(want_to_read == std::set{}); + ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m()); + ec_align_t to_read(0, 0, 0); + pipeline.get_min_want_to_read_shards(to_read, want_to_read); + ASSERT_EQ(want_to_read, empty_extent_set_map); } // read nothing at the middle (0-sized partial read) { - std::set want_to_read; - ECCommon::ReadPipeline::get_min_want_to_read_shards( - 2048, 0, s, &want_to_read); - ASSERT_TRUE(want_to_read == std::set{}); + ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m()); + ec_align_t to_read(2048, 0, 0); + pipeline.get_min_want_to_read_shards(to_read, want_to_read); + ASSERT_EQ(want_to_read, empty_extent_set_map); + } + // read nothing at the the second stripe (0-sized partial read) + { + ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m()); + ec_align_t to_read(swidth, 0, 0); + pipeline.get_min_want_to_read_shards(to_read, want_to_read); + ASSERT_EQ(want_to_read, empty_extent_set_map); } // read not-so-many (< chunk_size) bytes at the middle (partial read) { - std::set want_to_read; - ECCommon::ReadPipeline::get_min_want_to_read_shards( - 2048, 42, s, &want_to_read); - ASSERT_TRUE(want_to_read == std::set{2}); + ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m()); + ec_align_t to_read(2048, 42, 1); + pipeline.get_min_want_to_read_shards(to_read, want_to_read); + ECUtil::shard_extent_set_t ref(s.get_k_plus_m()); + ref[shard_id_t(2)].insert(0, 42); + ASSERT_EQ(want_to_read, ref); + } + + // read not-so-many (< chunk_size) bytes after the first stripe. 
+ { + ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m()); + ec_align_t to_read(swidth+2048, 42, 1); + pipeline.get_min_want_to_read_shards(to_read, want_to_read); + ECUtil::shard_extent_set_t ref(s.get_k_plus_m()); + ref[shard_id_t(2)].insert(csize, 42); + ASSERT_EQ(want_to_read, ref); } // read more (> chunk_size) bytes at the middle (partial read) { - std::set want_to_read; - ECCommon::ReadPipeline::get_min_want_to_read_shards( - 1024, 1024+42, s, &want_to_read); - // extra () due to a language / macro limitation - ASSERT_TRUE(want_to_read == (std::set{1, 2})); + ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m()); + ec_align_t to_read(csize, csize + 42, 1); + pipeline.get_min_want_to_read_shards(to_read, want_to_read); + ECUtil::shard_extent_set_t ref(s.get_k_plus_m()); + ref[shard_id_t(1)].insert(0, csize); + ref[shard_id_t(2)].insert(0, 42); + ASSERT_EQ(want_to_read, ref); + } + + // read more (> chunk_size) bytes at the middle (partial read), second stripe + { + ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m()); + ec_align_t to_read(swidth + csize, csize + 42, 1); + pipeline.get_min_want_to_read_shards(to_read, want_to_read); + ECUtil::shard_extent_set_t ref(s.get_k_plus_m()); + ref[shard_id_t(1)].insert(csize, csize); + ref[shard_id_t(2)].insert(csize, 42); + ASSERT_EQ(want_to_read, ref); } // full stripe except last chunk { - std::set want_to_read; - ECCommon::ReadPipeline::get_min_want_to_read_shards( - 0, 3*1024, s, &want_to_read); - // extra () due to a language / macro limitation - ASSERT_TRUE(want_to_read == (std::set{0, 1, 2})); + ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m()); + ec_align_t to_read(0, 3*csize, 1); + pipeline.get_min_want_to_read_shards(to_read, want_to_read); + ECUtil::shard_extent_set_t ref(s.get_k_plus_m()); + ref[shard_id_t(0)].insert(0, csize); + ref[shard_id_t(1)].insert(0, csize); + ref[shard_id_t(2)].insert(0, csize); + ASSERT_EQ(want_to_read, ref); + } + + // full stripe except last chunk (second stripe) + { + ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m()); + ec_align_t to_read(swidth, 3*csize, 1); + pipeline.get_min_want_to_read_shards(to_read, want_to_read); + ECUtil::shard_extent_set_t ref(s.get_k_plus_m()); + ref[shard_id_t(0)].insert(csize, csize); + ref[shard_id_t(1)].insert(csize, csize); + ref[shard_id_t(2)].insert(csize, csize); + ASSERT_EQ(want_to_read, ref); } // full stripe except 1st chunk { - std::set want_to_read; - ECCommon::ReadPipeline::get_min_want_to_read_shards( - 1024, swidth-1024, s, &want_to_read); - // extra () due to a language / macro limitation - ASSERT_TRUE(want_to_read == (std::set{1, 2, 3})); + ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m()); + ec_align_t to_read(csize, swidth - csize, 1); + pipeline.get_min_want_to_read_shards(to_read, want_to_read); + ECUtil::shard_extent_set_t ref(s.get_k_plus_m()); + ref[shard_id_t(1)].insert(0, csize); + ref[shard_id_t(2)].insert(0, csize); + ref[shard_id_t(3)].insert(0, csize); + ASSERT_EQ(want_to_read, ref); + } + + // full stripe except 1st chunk (second stripe) + { + ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m()); + ec_align_t to_read(swidth + csize, swidth - csize, 1); + pipeline.get_min_want_to_read_shards(to_read, want_to_read); + ECUtil::shard_extent_set_t ref(s.get_k_plus_m()); + ref[shard_id_t(1)].insert(csize, csize); + ref[shard_id_t(2)].insert(csize, csize); + ref[shard_id_t(3)].insert(csize, csize); + ASSERT_EQ(want_to_read, ref); } // large, multi-stripe read starting just after 1st chunk + // 0XXX + // 
XXXX x41 + // X000 + { + ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m()); + ec_align_t to_read(csize, swidth * 42, 1); + pipeline.get_min_want_to_read_shards(to_read, want_to_read); + ECUtil::shard_extent_set_t ref(s.get_k_plus_m()); + ref[shard_id_t(0)].insert(csize, csize*42); + ref[shard_id_t(1)].insert(0, csize*42); + ref[shard_id_t(2)].insert(0, csize*42); + ref[shard_id_t(3)].insert(0, csize*42); + ASSERT_EQ(want_to_read, ref); + } + + // large, multi-stripe read starting just after 1st chunk (second stripe) + // 0XXX + // XXXX x41 + // X000 + { + ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m()); + ec_align_t to_read(swidth + csize, swidth * 42, 1); + pipeline.get_min_want_to_read_shards(to_read, want_to_read); + ECUtil::shard_extent_set_t ref(s.get_k_plus_m()); + ref[shard_id_t(0)].insert(csize*2, csize*42); + ref[shard_id_t(1)].insert(csize, csize*42); + ref[shard_id_t(2)].insert(csize, csize*42); + ref[shard_id_t(3)].insert(csize, csize*42); + ASSERT_EQ(want_to_read, ref); + } + + // large read from the beginning { - std::set want_to_read; - ECCommon::ReadPipeline::get_min_want_to_read_shards( - 1024, swidth*42, s, &want_to_read); - // extra () due to a language / macro limitation - ASSERT_TRUE(want_to_read == (std::set{0, 1, 2, 3})); + ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m()); + ec_align_t to_read(0, swidth * 42, 1); + pipeline.get_min_want_to_read_shards(to_read, want_to_read); + ECUtil::shard_extent_set_t ref(s.get_k_plus_m()); + + ref[shard_id_t(0)].insert(0, csize*42); + ref[shard_id_t(1)].insert(0, csize*42); + ref[shard_id_t(2)].insert(0, csize*42); + ref[shard_id_t(3)].insert(0, csize*42); + ASSERT_EQ(want_to_read, ref); } // large read from the beginning { - std::set want_to_read; - ECCommon::ReadPipeline::get_min_want_to_read_shards( - 0, swidth*42, s, &want_to_read); - // extra () due to a language / macro limitation - ASSERT_TRUE(want_to_read == (std::set{0, 1, 2, 3})); + ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m()); + ec_align_t to_read(0, swidth * 42, 1); + pipeline.get_min_want_to_read_shards(to_read, want_to_read); + ECUtil::shard_extent_set_t ref(s.get_k_plus_m()); + + ref[shard_id_t(0)].insert(0, csize*42); + ref[shard_id_t(1)].insert(0, csize*42); + ref[shard_id_t(2)].insert(0, csize*42); + ref[shard_id_t(3)].insert(0, csize*42); + ASSERT_EQ(want_to_read, ref); + } + + // large read from the beginning (second stripe) + { + ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m()); + ec_align_t to_read(swidth, swidth * 42, 1); + pipeline.get_min_want_to_read_shards(to_read, want_to_read); + ECUtil::shard_extent_set_t ref(s.get_k_plus_m()); + + ref[shard_id_t(0)].insert(csize, csize*42); + ref[shard_id_t(1)].insert(csize, csize*42); + ref[shard_id_t(2)].insert(csize, csize*42); + ref[shard_id_t(3)].insert(csize, csize*42); + ASSERT_EQ(want_to_read, ref); + } + + // large read that starts and ends on same shard. 
+ { + ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m()); + ec_align_t to_read(swidth, swidth+csize/2, 1); + pipeline.get_min_want_to_read_shards(to_read, want_to_read); + ECUtil::shard_extent_set_t ref(s.get_k_plus_m()); + + ref[shard_id_t(0)].insert(csize, csize+csize/2); + ref[shard_id_t(1)].insert(csize, csize); + ref[shard_id_t(2)].insert(csize, csize); + ref[shard_id_t(3)].insert(csize, csize); + ASSERT_EQ(want_to_read, ref); + } + + // large read that starts and ends on last shard + { + ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m()); + ec_align_t to_read(swidth-csize, swidth+csize/2, 1); + pipeline.get_min_want_to_read_shards(to_read, want_to_read); + ECUtil::shard_extent_set_t ref(s.get_k_plus_m()); + + ref[shard_id_t(0)].insert(csize, csize); + ref[shard_id_t(1)].insert(csize, csize); + ref[shard_id_t(2)].insert(csize, csize); + ref[shard_id_t(3)].insert(0, csize+csize/2); + ASSERT_EQ(want_to_read, ref); + } + // large read that starts and ends on last shard, partial first shard. + { + ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m()); + ec_align_t to_read(swidth-csize/2, swidth, 1); + pipeline.get_min_want_to_read_shards(to_read, want_to_read); + ECUtil::shard_extent_set_t ref(s.get_k_plus_m()); + + ref[shard_id_t(0)].insert(csize, csize); + ref[shard_id_t(1)].insert(csize, csize); + ref[shard_id_t(2)].insert(csize, csize); + ref[shard_id_t(3)].insert(csize/2, csize); + ASSERT_EQ(want_to_read, ref); + } +} + +TEST(ECCommon, get_min_avail_to_read_shards) { + const uint64_t page_size = CEPH_PAGE_SIZE; + const uint64_t swidth = 64*page_size; + const unsigned int k = 4; + const unsigned int m = 2; + const int nshards = 6; + const uint64_t object_size = swidth * 1024; + + std::vector empty_shard_vector(k); + + ECUtil::stripe_info_t s(k, m, swidth, vector(0)); + ECListenerStub listenerStub; + ASSERT_EQ(s.get_stripe_width(), swidth); + ASSERT_EQ(s.get_chunk_size(), swidth / k); + + const std::vector chunk_mapping = {}; // no remapping + ErasureCodeDummyImpl *ecode = new ErasureCodeDummyImpl(); + ErasureCodeInterfaceRef ec_impl(ecode); + ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub); + + for (int i = 0; i < nshards; i++) { + listenerStub.acting_shards.insert(pg_shard_t(i, shard_id_t(i))); + } + + // read nothing + { + ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m()); + ECUtil::shard_extent_set_t to_read_list(s.get_k_plus_m()); + hobject_t hoid; + ECCommon::read_request_t read_request(to_read_list, false, object_size); + pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request); + + ECCommon::read_request_t ref(to_read_list, false, object_size); + + ASSERT_EQ(read_request, ref); + } + + /* Read to every data shard. */ + { + ECUtil::shard_extent_set_t to_read_list(s.get_k_plus_m()); + hobject_t hoid; + + for (shard_id_t i; idefault_sub_chunk; + ref.shard_reads[shard_id].pg_shard = pg_shard_t(int(shard_id)); + ref.shard_reads[shard_id].pg_shard = pg_shard_t(int(shard_id), shard_id); + } + ASSERT_EQ(read_request, ref); + } + + /* Read to every data shard. 
*/ + { + ECUtil::shard_extent_set_t to_read_list(s.get_k_plus_m()); + hobject_t hoid; + for (shard_id_t i; idefault_sub_chunk; + ref.shard_reads[shard_id].pg_shard = pg_shard_t(int(i), shard_id); + } + + ASSERT_EQ(read_request, ref); + } + + + /* Read to every data shard - small read */ + { + ECUtil::shard_extent_set_t to_read_list(s.get_k_plus_m()); + hobject_t hoid; + + for (shard_id_t i; i < (int)k; ++i) { + to_read_list[i].insert(int(i) * 2 * page_size + int(i) + 1, int(i) + 1); + } + ECCommon::read_request_t ref(to_read_list, false, object_size); + ECCommon::read_request_t read_request(to_read_list, false, object_size); + for (int i=0; i < (int)k; i++) { + shard_id_t shard_id(i); + ECCommon::shard_read_t &ref_shard_read = ref.shard_reads[shard_id]; + ref_shard_read.subchunk = ecode->default_sub_chunk; + ref_shard_read.extents.insert(i*2*page_size, page_size); + ref_shard_read.pg_shard = pg_shard_t(i, shard_id_t(i)); + } + + pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request); + ASSERT_EQ(read_request, ref); + } + + /* Read to every data shard, missing shard. */ + { + ECUtil::shard_extent_set_t to_read_list(s.get_k_plus_m()); + hobject_t hoid; + + for (shard_id_t i; idefault_sub_chunk; + ref.shard_reads[shard_id].extents = to_read_list[i]; + ref.shard_reads[shard_id].pg_shard = pg_shard_t(int(i), shard_id); + } else { + ECCommon::shard_read_t parity_shard_read; + parity_shard_read.subchunk = ecode->default_sub_chunk; + parity_shard_read.extents.union_of(to_read_list[i]); + ref.shard_reads[shard_id_t(parity_shard)] = parity_shard_read; + ref.shard_reads[shard_id_t(parity_shard)].pg_shard = pg_shard_t(parity_shard, shard_id_t(parity_shard)); + } + } + + ASSERT_EQ(read_request, ref); + + listenerStub.acting_shards.insert(pg_shard_t(1, shard_id_t(1))); + } + + + /* Read to every data shard, missing shard, missing shard is adjacent. */ + { + ECUtil::shard_extent_set_t to_read_list(s.get_k_plus_m()); + hobject_t hoid; + unsigned int missing_shard = 1; + + to_read_list[shard_id_t(0)].insert(0, page_size); + to_read_list[shard_id_t(1)].insert(page_size, page_size); + to_read_list[shard_id_t(2)].insert(2*page_size, page_size); + to_read_list[shard_id_t(3)].insert(3*page_size, page_size); + ECCommon::read_request_t read_request(to_read_list, false, object_size); + ECCommon::read_request_t ref(to_read_list, false, object_size); + + // Populating reference manually to check that adjacent shards get correctly combined. 
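// A worked example of the combining being checked here, illustrative only
// (page = 4K, chunk_size = 16 pages, shard 1 unavailable): shard 1's extent
// [page, 2*page) has to be rebuilt from k = 4 of the remaining shards at the
// same chunk offsets, and those recovery reads merge with each shard's own read:
//   shard 0: [0, page)        U [page, 2*page) -> [0, 2*page)
//   shard 2: [2*page, 3*page) U [page, 2*page) -> [page, 3*page)
//   shard 3: [3*page, 4*page) U [page, 2*page) -> two extents, [page, 2*page) and [3*page, 4*page)
//   shard 4 (parity):           [page, 2*page)
// The reference built below is what the test actually asserts.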
+ ref.shard_reads[shard_id_t(0)].extents.insert(0, page_size*2); + ref.shard_reads[shard_id_t(2)].extents.insert(page_size, page_size*2); + ref.shard_reads[shard_id_t(3)].extents.insert(page_size, page_size); + ref.shard_reads[shard_id_t(3)].extents.insert(3*page_size, page_size); + ref.shard_reads[shard_id_t(4)].extents.insert(page_size, page_size); + ref.shard_reads[shard_id_t(0)].pg_shard = pg_shard_t(0, shard_id_t(0)); + ref.shard_reads[shard_id_t(2)].pg_shard = pg_shard_t(2, shard_id_t(2)); + ref.shard_reads[shard_id_t(3)].pg_shard = pg_shard_t(3, shard_id_t(3)); + ref.shard_reads[shard_id_t(4)].pg_shard = pg_shard_t(4, shard_id_t(4)); + for (unsigned int i=0; idefault_sub_chunk; + } + + listenerStub.acting_shards.erase(pg_shard_t(missing_shard, shard_id_t(missing_shard))); + + pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request); + + ASSERT_EQ(read_request, ref); + + listenerStub.acting_shards.insert(pg_shard_t(1, shard_id_t(1))); + } + + /* Read to every data shard, but with "fast" (redundant) reads */ + { + ECUtil::shard_extent_set_t to_read_list(s.get_k_plus_m()); + hobject_t hoid; + + extent_set extents_to_read; + for (shard_id_t i; idefault_sub_chunk; + shard_read.extents = extents_to_read; + shard_read.pg_shard = pg_shard_t(i, shard_id_t(i)); + ref.shard_reads[shard_id_t(i)] = shard_read; + } + + ASSERT_EQ(read_request, ref); + } + + /* Read to every data shard, missing shard. */ + { + ECUtil::shard_extent_set_t to_read_list(s.get_k_plus_m()); + hobject_t hoid; + + for (shard_id_t i; i error_shards; + error_shards.emplace(int(missing_shard), shard_id_t(missing_shard)); + // Similar to previous tests with missing shards, but this time, emulate + // the shard being missing as a result of a bad read. + pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request, error_shards); + + ECCommon::read_request_t ref(to_read_list, false, object_size); + std::vector want_to_read(empty_shard_vector); + for (shard_id_t i; idefault_sub_chunk; + want_to_read[int(i)].extents.union_of(to_read_list[missing_shard]); + want_to_read[int(i)].extents.union_of(to_read_list[i]); + want_to_read[int(i)].pg_shard = pg_shard_t(int(i), shard_id_t(i)); + ref.shard_reads[shard_id_t(i)] = want_to_read[int(i)]; + } else { + ECCommon::shard_read_t parity_shard_read; + parity_shard_read.subchunk = ecode->default_sub_chunk; + parity_shard_read.extents.union_of(to_read_list[missing_shard]); + parity_shard_read.pg_shard = pg_shard_t(parity_shard, shard_id_t(parity_shard)); + ref.shard_reads[shard_id_t(parity_shard)] = parity_shard_read; + } + } + + ASSERT_EQ(read_request, ref); + + listenerStub.acting_shards.insert(pg_shard_t(1, shard_id_t(1))); + } +} + +TEST(ECCommon, shard_read_combo_tests) +{ + const uint64_t page_size = CEPH_PAGE_SIZE; + const uint64_t swidth = 2*page_size; + const unsigned int k = 2; + const unsigned int m = 2; + const int nshards = 4; + const uint64_t object_size = swidth * 1024; + hobject_t hoid; + + ECUtil::stripe_info_t s(k, m, swidth, vector(0)); + ECListenerStub listenerStub; + ASSERT_EQ(s.get_stripe_width(), swidth); + ASSERT_EQ(s.get_chunk_size(), swidth/k); + + const std::vector chunk_mapping = {}; // no remapping + ErasureCodeDummyImpl *ecode = new ErasureCodeDummyImpl(); + ErasureCodeInterfaceRef ec_impl(ecode); + ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub); + + for (int i = 0; i < nshards; i++) { + listenerStub.acting_shards.insert(pg_shard_t(i, shard_id_t(i))); + } + + { + ECUtil::shard_extent_set_t 
want_to_read(s.get_k_plus_m()); + + ec_align_t to_read(36*1024,10*1024, 1); + pipeline.get_min_want_to_read_shards(to_read, want_to_read); + ECCommon::read_request_t read_request(want_to_read, false, object_size); + + pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request); + + ECCommon::read_request_t ref(want_to_read, false, object_size); + { + ECCommon::shard_read_t shard_read; + shard_read.subchunk = ecode->default_sub_chunk; + shard_read.extents.insert(20*1024, 4*1024); + shard_read.pg_shard = pg_shard_t(0, shard_id_t(0)); + ref.shard_reads[shard_id_t(0)] = shard_read; + } + { + ECCommon::shard_read_t shard_read; + shard_read.subchunk = ecode->default_sub_chunk; + shard_read.extents.insert(16*1024, 8*1024); + shard_read.pg_shard = pg_shard_t(1, shard_id_t(1)); + ref.shard_reads[shard_id_t(1)] = shard_read; + } + + ASSERT_EQ(read_request, ref); + } + + { + ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m()); + + ec_align_t to_read(12*1024,12*1024, 1); + pipeline.get_min_want_to_read_shards(to_read, want_to_read); + ECCommon::read_request_t read_request(want_to_read, false, object_size); + pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request); + + ECCommon::read_request_t ref(want_to_read, false, object_size); + { + ECCommon::shard_read_t shard_read; + shard_read.subchunk = ecode->default_sub_chunk; + shard_read.extents.insert(8*1024, 4*1024); + shard_read.pg_shard = pg_shard_t(0, shard_id_t(0)); + ref.shard_reads[shard_id_t(0)] = shard_read; + } + { + ECCommon::shard_read_t shard_read; + shard_read.subchunk = ecode->default_sub_chunk; + shard_read.extents.insert(4*1024, 8*1024); + shard_read.pg_shard = pg_shard_t(1, shard_id_t(1)); + ref.shard_reads[shard_id_t(1)] = shard_read; + } + + ASSERT_EQ(read_request, ref); } } @@ -249,21 +1102,207 @@ TEST(ECCommon, get_min_want_to_read_shards_bug67087) const uint64_t swidth = 4096; const unsigned int k = 4; const unsigned int m = 2; + const uint64_t csize = 1024; ECUtil::stripe_info_t s(k, m, swidth); ASSERT_EQ(s.get_stripe_width(), swidth); ASSERT_EQ(s.get_chunk_size(), 1024); - std::set want_to_read; + ECListenerStub listenerStub; + ASSERT_EQ(s.get_stripe_width(), swidth); + ASSERT_EQ(s.get_chunk_size(), csize); + + const std::vector chunk_mapping = {}; // no remapping + ErasureCodeInterfaceRef ec_impl(new ErasureCodeDummyImpl); + ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub); + + ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m()); + ec_align_t to_read1(512,512, 1); + ec_align_t to_read2(512+16*1024,512, 1); + + ECUtil::shard_extent_set_t ref(s.get_k_plus_m()); + + ref[shard_id_t(0)].insert(512, 512); // multitple calls with the same want_to_read can happen during - // multi-region reads. + // multi-region reads. This will create multiple extents in want_to_read, + { + pipeline.get_min_want_to_read_shards( + to_read1, want_to_read); + ASSERT_EQ(want_to_read, ref); + + pipeline.get_min_want_to_read_shards( + to_read2, want_to_read); + // We have 4 data shards per stripe. 
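// A minimal sketch of the offset arithmetic behind the next assertion. It is
// illustrative only: the helper name is invented (not part of ECUtil) and it
// assumes the identity chunk mapping used by this test.
#include <cstdint>
#include <utility>

static std::pair<unsigned, uint64_t> ro_offset_to_shard(uint64_t ro_off,
                                                        uint64_t chunk_size,
                                                        unsigned k) {
  const uint64_t stripe_width = chunk_size * k;
  const uint64_t stripe_no = ro_off / stripe_width;   // which full stripe
  const uint64_t in_stripe = ro_off % stripe_width;   // offset inside it
  return {static_cast<unsigned>(in_stripe / chunk_size),        // data shard
          stripe_no * chunk_size + in_stripe % chunk_size};     // shard offset
}
// ro_offset_to_shard(512 + 16*1024, 1024, 4) == {0, 512 + 4*1024}: the second
// 512-byte extent lands on shard 0 at offset 4608, as the reference below expects.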
+ ref[shard_id_t(0)].insert(512+4*1024, 512); + } +} + +TEST(ECCommon, get_remaining_shards) +{ + const uint64_t page_size = CEPH_PAGE_SIZE; + const uint64_t swidth = 64*page_size; + const unsigned int k = 4; + const unsigned int m = 2; + const int nshards = 6; + const uint64_t chunk_size = swidth / k; + const uint64_t object_size = swidth * 1024; + + ECUtil::stripe_info_t s(k, m, swidth, vector(0)); + ECListenerStub listenerStub; + ASSERT_EQ(s.get_stripe_width(), swidth); + ASSERT_EQ(s.get_chunk_size(), swidth/k); + + const std::vector chunk_mapping = {}; // no remapping + ErasureCodeDummyImpl *ecode = new ErasureCodeDummyImpl(); + ErasureCodeInterfaceRef ec_impl(ecode); + ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub); + + std::vector empty_shard_vector(k); + ECCommon::shard_read_t empty_shard_read; + fill(empty_shard_vector.begin(), empty_shard_vector.end(), empty_shard_read); + + vector pg_shards(nshards); + for (int i = 0; i < nshards; i++) { + pg_shards[i] = pg_shard_t(i, shard_id_t(i)); + listenerStub.acting_shards.insert(pg_shards[i]); + } + { - ECCommon::ReadPipeline::get_min_want_to_read_shards( - 512, 512, s, &want_to_read); - ASSERT_EQ(want_to_read, std::set{0}); - ECCommon::ReadPipeline::get_min_want_to_read_shards( - 512+16*1024, 512, s, &want_to_read); - ASSERT_EQ(want_to_read, std::set{0}); + hobject_t hoid; + + // Mock up a read request + ECUtil::shard_extent_set_t to_read(s.get_k_plus_m()); + to_read[shard_id_t(0)].insert(0, 4096); + ECCommon::read_request_t read_request(to_read, false, object_size); + int missing_shard = 0; + + // Mock up a read result. + ECCommon::read_result_t read_result(&s); + read_result.errors.emplace(pg_shards[missing_shard], -EIO); + + pipeline.get_remaining_shards(hoid, read_result, read_request, false, false); + + ECCommon::read_request_t ref(to_read, false, object_size); + int parity_shard = 4; + for (unsigned int i=0; idefault_sub_chunk; + shard_read.extents.insert(0,4096); + unsigned int shard_id = i==missing_shard?parity_shard:i; + shard_read.pg_shard = pg_shard_t(shard_id, shard_id_t(shard_id)); + ref.shard_reads[shard_id_t(shard_id)] = shard_read; + } + + ASSERT_EQ(read_request, ref); + } + + // Request re-read. There is a page of overlap in what is already read. + { + hobject_t hoid; + + ECUtil::shard_extent_set_t to_read(s.get_k_plus_m()); + s.ro_range_to_shard_extent_set(chunk_size/2, chunk_size+page_size, to_read); + ECCommon::read_request_t read_request(to_read, false, object_size); + unsigned int missing_shard = 1; + + // Mock up a read result. + ECCommon::read_result_t read_result(&s); + read_result.errors.emplace(pg_shards[missing_shard], -EIO); + buffer::list bl; + bl.append_zero(chunk_size/2); + read_result.buffers_read.insert_in_shard(shard_id_t(0), chunk_size/2, bl); + read_result.processed_read_requests[shard_id_t(0)].insert(chunk_size/2, bl.length()); + + pipeline.get_remaining_shards(hoid, read_result, read_request, false, false); + + // The result should be a read request for the first 4k of shard 0, as that + // is currently missing. 
+ ECCommon::read_request_t ref(to_read, false, object_size); + int parity_shard = 4; + for (unsigned int i=0; idefault_sub_chunk; + unsigned int shard_id = i==missing_shard?parity_shard:i; + ref.shard_reads[shard_id_t(shard_id)] = shard_read; + } + ref.shard_reads[shard_id_t(0)].extents.insert(0, chunk_size/2); + ref.shard_reads[shard_id_t(0)].pg_shard = pg_shards[0]; + ref.shard_reads[shard_id_t(2)].extents.insert(0, chunk_size/2+page_size); + ref.shard_reads[shard_id_t(2)].pg_shard = pg_shards[2]; + ref.shard_reads[shard_id_t(3)].extents.insert(0, chunk_size/2+page_size); + ref.shard_reads[shard_id_t(3)].pg_shard = pg_shards[3]; + ref.shard_reads[shard_id_t(4)].extents.insert(0, chunk_size/2+page_size); + ref.shard_reads[shard_id_t(4)].pg_shard = pg_shards[4]; + ASSERT_EQ(read_request, ref); } } + +TEST(ECCommon, encode) +{ + const uint64_t page_size = CEPH_PAGE_SIZE; + const uint64_t swidth = 2*page_size; + const unsigned int k = 2; + const unsigned int m = 2; + + ECUtil::stripe_info_t s(k, m, swidth, vector(0)); + ECListenerStub listenerStub; + ASSERT_EQ(s.get_stripe_width(), swidth); + ASSERT_EQ(s.get_chunk_size(), swidth/k); + + const std::vector chunk_mapping = {}; // no remapping + ErasureCodeDummyImpl *ecode = new ErasureCodeDummyImpl(); + ErasureCodeInterfaceRef ec_impl(ecode); + ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub); + + ECUtil::shard_extent_map_t semap(&s); + + for (shard_id_t i; i=k?4096:2048); + semap.insert_in_shard(i, 12*1024, bl); + } + semap.encode(ec_impl, nullptr, 0); +} + +TEST(ECCommon, decode) +{ + const uint64_t page_size = CEPH_PAGE_SIZE; + const uint64_t swidth = 3*page_size; + const unsigned int k = 3; + const unsigned int m = 2; + + ECUtil::stripe_info_t s(k, m, swidth, vector(0)); + ECListenerStub listenerStub; + ASSERT_EQ(s.get_stripe_width(), swidth); + ASSERT_EQ(s.get_chunk_size(), swidth/k); + + const std::vector chunk_mapping = {}; // no remapping + ErasureCodeDummyImpl *ecode = new ErasureCodeDummyImpl(); + ecode->data_chunk_count = k; + ecode->chunk_count = k + m; + ErasureCodeInterfaceRef ec_impl(ecode); + ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub); + + ECUtil::shard_extent_map_t semap(&s); + bufferlist bl12k; + bl12k.append_zero(12288); + bufferlist bl8k; + bl8k.append_zero(8192); + bufferlist bl16k; + bl16k.append_zero(16384); + semap.insert_in_shard(shard_id_t(1), 512000, bl12k); + semap.insert_in_shard(shard_id_t(1), 634880, bl12k); + semap.insert_in_shard(shard_id_t(2), 512000, bl12k); + semap.insert_in_shard(shard_id_t(2), 630784, bl16k); + semap.insert_in_shard(shard_id_t(3), 516096, bl8k); + semap.insert_in_shard(shard_id_t(3), 634880, bl12k); + ECUtil::shard_extent_set_t want = semap.get_extent_set(); + + want[shard_id_t(0)].insert(516096, 8192); + want[shard_id_t(0)].insert(634880, 12288); + want[shard_id_t(4)].insert(516096, 8192); + want[shard_id_t(4)].insert(634880, 12288); + + ceph_assert(0 == semap.decode(ec_impl, want, 2*1024*1024)); +} diff --git a/src/test/osd/TestECUtil.cc b/src/test/osd/TestECUtil.cc new file mode 100644 index 0000000000000..71cf3bb614b9d --- /dev/null +++ b/src/test/osd/TestECUtil.cc @@ -0,0 +1,1034 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. 
+ * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include +#include +#include +#include "osd/ECUtil.h" +#include "gtest/gtest.h" +#include "osd/osd_types.h" +#include "common/ceph_argparse.h" +#include "osd/ECTransaction.h" + +using namespace std; +using namespace ECUtil; + +// FIXME: Once PRs are in, we should move the other ECUtil tests are moved here. + +TEST(ECUtil, stripe_info_t_chunk_mapping) +{ + int k=4; + int m=2; + int chunk_size = 4096; + vector forward_cm(k+m); + vector reverse_cm(k+m); + + std::iota(forward_cm.begin(), forward_cm.end(), 0); + std::iota(reverse_cm.rbegin(), reverse_cm.rend(), 0); + + stripe_info_t forward_sinfo1(k, m, chunk_size*k); + stripe_info_t forward_sinfo2(k, m, chunk_size*k, forward_cm); + stripe_info_t reverse_sinfo(k, m, chunk_size*k, reverse_cm); + + for (shard_id_t shard_id : forward_cm) { + raw_shard_id_t raw_shard_id((int)shard_id); + ASSERT_EQ(shard_id, forward_sinfo1.get_shard(raw_shard_id)); + ASSERT_EQ(raw_shard_id, forward_sinfo1.get_raw_shard(shard_id)); + ASSERT_EQ(shard_id, forward_sinfo2.get_shard(raw_shard_id)); + ASSERT_EQ(raw_shard_id, forward_sinfo2.get_raw_shard(shard_id)); + ASSERT_EQ(shard_id, reverse_sinfo.get_shard(raw_shard_id_t(k + m - int(raw_shard_id) - 1))); + ASSERT_EQ(raw_shard_id_t(k + m- int(shard_id) - 1), reverse_sinfo.get_raw_shard(shard_id)); + } + + ASSERT_EQ(k, forward_sinfo1.get_k()); + ASSERT_EQ(m, forward_sinfo1.get_m()); + ASSERT_EQ(k+m, forward_sinfo1.get_k_plus_m()); +} + +TEST(ECUtil, shard_extent_map_t) +{ + int k=4; + int m=2; + int chunk_size = 4096; + stripe_info_t sinfo(k, m, chunk_size*k, vector(0)); + + // insert_in_shard + { + shard_extent_map_t semap(&sinfo); + int new_off = 512; + int new_len = 1024; + shard_id_t shard0(0); + shard_id_t shard2(2); + + // Empty + ASSERT_FALSE(semap.contains_shard(shard_id_t(0))); + ASSERT_FALSE(semap.contains_shard(shard_id_t(1))); + ASSERT_FALSE(semap.contains_shard(shard_id_t(2))); + ASSERT_FALSE(semap.contains_shard(shard_id_t(3))); + ASSERT_TRUE(semap.empty()); + ASSERT_EQ(std::numeric_limits::max(), semap.get_ro_start()); + ASSERT_EQ(std::numeric_limits::max(), semap.get_ro_end()); + ASSERT_EQ(std::numeric_limits::max(), semap.get_start_offset()); + ASSERT_EQ(std::numeric_limits::max(), semap.get_end_offset()); + + + // Insert a 1k buffer in shard 2 + buffer::list bl; + bl.append_zero(new_len); + semap.insert_in_shard(shard2, new_off, bl); + ASSERT_FALSE(semap.contains_shard(shard_id_t(0))); + ASSERT_FALSE(semap.contains_shard(shard_id_t(1))); + ASSERT_TRUE(semap.contains_shard(shard_id_t(2))); + ASSERT_FALSE(semap.contains_shard(shard_id_t(3))); + ASSERT_FALSE(semap.empty()); + ASSERT_EQ(int(shard2) * chunk_size + new_off, semap.get_ro_start()); + ASSERT_EQ(int(shard2) * chunk_size + new_off + new_len, semap.get_ro_end()); + ASSERT_EQ(new_off, semap.get_start_offset()); + ASSERT_EQ(new_off + bl.length(), semap.get_end_offset()); + auto iter = semap.get_extent_map(shard2).begin(); + ASSERT_EQ(new_off, iter.get_off()); + ASSERT_EQ(new_len, iter.get_len()); + ++iter; + ASSERT_EQ(semap.get_extent_map(shard2).end(), iter); + + // Insert a 1k buffer in shard 0 + semap.insert_in_shard(shard0, new_off, bl); + ASSERT_TRUE(semap.contains_shard(shard_id_t(0))); + ASSERT_FALSE(semap.contains_shard(shard_id_t(1))); + ASSERT_TRUE(semap.contains_shard(shard_id_t(2))); + 
ASSERT_FALSE(semap.contains_shard(shard_id_t(3))); + ASSERT_FALSE(semap.empty()); + ASSERT_EQ(int(shard0) * chunk_size + new_off, semap.get_ro_start()); + ASSERT_EQ(int(shard2) * chunk_size + new_off + new_len, semap.get_ro_end()); + ASSERT_EQ(new_off, semap.get_start_offset()); + ASSERT_EQ(new_off + bl.length(), semap.get_end_offset()); + iter = semap.get_extent_map(shard0).begin(); + ASSERT_EQ(new_off, iter.get_off()); + ASSERT_EQ(new_len, iter.get_len()); + ++iter; + ASSERT_EQ(semap.get_extent_map(shard0).end(), iter); + iter = semap.get_extent_map(shard2).begin(); + ASSERT_EQ(new_off, iter.get_off()); + ASSERT_EQ(new_len, iter.get_len()); + ++iter; + ASSERT_EQ(semap.get_extent_map(shard2).end(), iter); + + /* Insert overlapping into next stripe */ + semap.insert_in_shard(shard2, chunk_size - 512, bl); + ASSERT_EQ(int(shard0) * chunk_size + new_off, semap.get_ro_start()); + ASSERT_EQ((int(shard2) + k) * chunk_size + 512, semap.get_ro_end()); + ASSERT_EQ(new_off, semap.get_start_offset()); + ASSERT_EQ(chunk_size - 512 + bl.length(), semap.get_end_offset()); + + iter = semap.get_extent_map(shard2).begin(); + ASSERT_EQ(new_off, iter.get_off()); + ASSERT_EQ(new_len, iter.get_len()); + ++iter; + ASSERT_EQ(chunk_size - 512, iter.get_off()); + ASSERT_EQ(new_len, iter.get_len()); + ++iter; + ASSERT_EQ(semap.get_extent_map(shard2).end(), iter); + } + + //insert_ro_extent_map + //erase_after_ro_offset + { + shard_extent_map_t semap(&sinfo); + extent_map emap; + buffer::list bl1k; + buffer::list bl16k; + buffer::list bl64k; + + bl1k.append_zero(1024); + bl16k.append_zero(chunk_size * k); + bl64k.append_zero(chunk_size * k * 4); + shard_extent_set_t ref(sinfo.get_k_plus_m()); + + // 1: Strangely aligned. (shard 0 [5~1024]) + emap.insert(5, 1024, bl1k); + ref[shard_id_t(0)].insert(5, 1024); + // 2: Start of second chunk (shard 1 [0~1024]) + emap.insert(chunk_size, 1024, bl1k); + ref[shard_id_t(1)].insert(0, 1024); + // 3: Overlap two chunks (shard1[3584~512], shard2[0~512]) + emap.insert(chunk_size*2 - 512, 1024, bl1k); + ref[shard_id_t(1)].insert(3584, 512); + ref[shard_id_t(2)].insert(0, 512); + // 4: Overlap two stripes (shard3[3584~512], shard0[4096~512]) + emap.insert(chunk_size*4 - 512, 1024, bl1k); + ref[shard_id_t(3)].insert(3584, 512); + ref[shard_id_t(0)].insert(4096, 512); + // 5: Full stripe (shard*[8192~4096]) + emap.insert(chunk_size*k*2, chunk_size*k, bl16k); + for (auto &&[_, eset] : ref) + eset.insert(8192, 4096); + // 6: Two half stripes (shard0,1[20480~4096], shard 2,3[16384~4096]) + emap.insert(chunk_size*k*4 + 2*chunk_size, chunk_size * k, bl16k); + ref[shard_id_t(0)].insert(20480, 4096); + ref[shard_id_t(1)].insert(20480, 4096); + ref[shard_id_t(2)].insert(16384, 4096); + ref[shard_id_t(3)].insert(16384, 4096); + + // 7: Two half stripes, strange alignment (shard0,1[36864~4096], shard2[32773~4096], shard3[32784~4096]) + emap.insert(chunk_size*k*8 + 2*chunk_size + 5, chunk_size * k, bl16k); + ref[shard_id_t(0)].insert(36864, 4096); + ref[shard_id_t(1)].insert(36864, 4096); + ref[shard_id_t(2)].insert(32773, 4096); + ref[shard_id_t(3)].insert(32768, 4096); + + // 8: Multiple stripes (shard*[49152, 16384] + emap.insert(chunk_size*k*12, chunk_size * k * 4, bl64k); + for (auto &&[_, eset] : ref) + eset.insert(49152, 16384); + + semap.insert_ro_extent_map(emap); + for (auto &&[shard, eset] : ref) { + ASSERT_EQ(eset, semap.get_extent_set(shard)) << "shard=" << shard; + } + ASSERT_EQ(emap.get_start_off(), semap.get_ro_start()); + ASSERT_EQ(emap.get_end_off(), semap.get_ro_end()); + 
ASSERT_EQ(0, semap.get_start_offset()); + ASSERT_EQ(chunk_size * 16, semap.get_end_offset()); + + /* Erase the later parts at an obscure offset. */ + semap.erase_after_ro_offset(chunk_size * k * 8 + 2 * chunk_size + 512); + + { + extent_set tmp; + + tmp.union_insert(0, chunk_size * 8); + ref[shard_id_t(3)].intersection_of(tmp); + tmp.union_insert(0, chunk_size * 8 + 512); + ref[shard_id_t(2)].intersection_of(tmp); + tmp.union_insert(0, chunk_size * 9); + ref[shard_id_t(1)].intersection_of(tmp); + ref[shard_id_t(0)].intersection_of(tmp); + } + + for (auto &&[shard, eset] : ref) { + ASSERT_EQ(eset, semap.get_extent_set(shard)) << "shard=" << shard; + } + ASSERT_EQ(5, semap.get_ro_start()); + ASSERT_EQ(chunk_size * k * 8 + 2 * chunk_size + 512, semap.get_ro_end()); + ASSERT_EQ(0, semap.get_start_offset()); + ASSERT_EQ(33280, semap.get_end_offset()); + + /* Append again */ + semap.append_zeros_to_ro_offset(chunk_size * k * 9 + 2 * chunk_size + 512); + ref[shard_id_t(0)].insert(chunk_size * 9, chunk_size); + ref[shard_id_t(1)].insert(chunk_size * 9, chunk_size); + ref[shard_id_t(2)].insert(chunk_size * 8 + 512, chunk_size); + ref[shard_id_t(3)].insert(chunk_size * 8, chunk_size); + + for (auto &&[shard, eset] : ref) { + ASSERT_EQ(eset, semap.get_extent_set(shard)) << "shard=" << shard; + } + ASSERT_EQ(5, semap.get_ro_start()); + ASSERT_EQ(chunk_size * k * 9 + 2 * chunk_size + 512, semap.get_ro_end()); + ASSERT_EQ(0, semap.get_start_offset()); + ASSERT_EQ(chunk_size * 10, semap.get_end_offset()); + + /* Append nothing */ + semap.append_zeros_to_ro_offset(chunk_size * k * 9 + 2 * chunk_size + 512); + for (auto &&[shard, eset] : ref) { + ASSERT_EQ(eset, semap.get_extent_set(shard)) << "shard=" << shard; + } + ASSERT_EQ(5, semap.get_ro_start()); + ASSERT_EQ(chunk_size * k * 9 + 2 * chunk_size + 512, semap.get_ro_end()); + ASSERT_EQ(0, semap.get_start_offset()); + ASSERT_EQ(chunk_size * 10, semap.get_end_offset()); + + /* Append, to an offset before the end */ + semap.append_zeros_to_ro_offset(chunk_size * k * 8 + 2 * chunk_size + 512); + for (auto &&[shard, eset] : ref) { + ASSERT_EQ(eset, semap.get_extent_set(shard)) << "shard=" << shard; + } + ASSERT_EQ(5, semap.get_ro_start()); + ASSERT_EQ(chunk_size * k * 9 + 2 * chunk_size + 512, semap.get_ro_end()); + ASSERT_EQ(0, semap.get_start_offset()); + ASSERT_EQ(chunk_size * 10, semap.get_end_offset()); + + /* Intersect the beginning ro range */ + shard_extent_map_t semap2 = semap.intersect_ro_range(chunk_size * 2 - 256, + chunk_size * k * 8); + + /* The original semap should be untouched */ + for (auto &&[shard, eset] : ref) { + ASSERT_EQ(eset, semap.get_extent_set(shard)) << "shard=" << shard; + } + ASSERT_EQ(5, semap.get_ro_start()); + ASSERT_EQ(chunk_size * k * 9 + 2 * chunk_size + 512, semap.get_ro_end()); + ASSERT_EQ(0, semap.get_start_offset()); + ASSERT_EQ(chunk_size * 10, semap.get_end_offset()); + { + extent_set tmp; + tmp.insert(chunk_size, chunk_size * 8); + ref[shard_id_t(0)].intersection_of(tmp); + } + { + extent_set tmp; + tmp.insert(chunk_size - 256, chunk_size * 8); + ref[shard_id_t(1)].intersection_of(tmp); + } + { + extent_set tmp; + tmp.insert(0, chunk_size * 8); + ref[shard_id_t(2)].intersection_of(tmp); + ref[shard_id_t(3)].intersection_of(tmp); + } + + for (auto &&[shard, eset] : ref) { + ASSERT_EQ(eset, semap2.get_extent_set(shard)) << "shard=" << shard; + } + ASSERT_EQ(chunk_size*2 - 256, semap2.get_ro_start()); + ASSERT_EQ(chunk_size * (k * 5 + 2), semap2.get_ro_end()) + << "semap2=" << semap2; + ASSERT_EQ(0, 
semap2.get_start_offset()); + ASSERT_EQ(chunk_size * 6, semap2.get_end_offset()); + + // intersect with somethning bigger and it should be identical + semap2 = semap2.intersect_ro_range(0, chunk_size * k * 10); + for (auto &&[shard, eset] : ref) { + ASSERT_EQ(eset, semap2.get_extent_set(shard)) << "shard=" << shard; + } + ASSERT_EQ(chunk_size * 2 - 256, semap2.get_ro_start()); + ASSERT_EQ(chunk_size * (k * 5 + 2), semap2.get_ro_end()); + ASSERT_EQ(0, semap2.get_start_offset()); + ASSERT_EQ(chunk_size * 6, semap2.get_end_offset()); + + extent_set superset; + for (auto &&[_, eset] : ref) + superset.union_of(eset); + + ASSERT_EQ(superset, semap2.get_extent_superset()); + } + + // To test "encode" we need more framework... So will leave to higher level + // tests. +} + +// This scenario went wrong in ec transaction code in a cluster-based test. +TEST(ECUtil, shard_extent_map_t_scenario_1) +{ + int k=2; + int m=2; + int chunk_size = 4096; + stripe_info_t sinfo(k, m, chunk_size*k, vector(0)); + shard_extent_map_t semap(&sinfo); + + bufferlist bl; + bl.append_zero(chunk_size); + semap.insert_in_shard(shard_id_t(0), chunk_size, bl); + semap.insert_in_shard(shard_id_t(0), chunk_size*3, bl); + semap.insert_in_shard(shard_id_t(1), chunk_size, bl); + semap.insert_in_shard(shard_id_t(1), chunk_size*3, bl); + + for (int i=0; i 2024-10-07T11:38:23.746+0100 7fa0df6f4800 0 == test 1 Random offset, random length read/write I/O with queue depth 1 (seqseed 1137522502) == +-170> 2024-10-07T11:38:23.746+0100 7fa0df6f4800 5 test Step 0: Create (size=44K) +-169> 2024-10-07T11:38:23.787+0100 7fa0df6f4800 5 test Step 1: Barrier +-168> 2024-10-07T11:38:23.787+0100 7fa0df6f4800 5 test Step 2: Write (offset=38K,length=4K) +-167> 2024-10-07T11:38:23.829+0100 7fa0df6f4800 5 test Step 3: Barrier +-166> 2024-10-07T11:38:23.829+0100 7fa0df6f4800 5 test Step 4: Write (offset=38K,length=4K) +-165> 2024-10-07T11:38:23.876+0100 7fa0df6f4800 5 test Step 5: Barrier +-164> 2024-10-07T11:38:23.876+0100 7fa0df6f4800 5 test Step 6: Write (offset=10K,length=6K) +-163> 2024-10-07T11:38:23.963+0100 7fa0df6f4800 5 test Step 7: Barrier +-162> 2024-10-07T11:38:23.963+0100 7fa0df6f4800 5 test Step 8: Write (offset=30K,length=2K) +*/ +TEST(ECUtil, shard_extent_map_t_insert_ro_buffer) +{ + int k=2; + int m=2; + int chunk_size = 4096; + char c = 1; + stripe_info_t sinfo(k, m, chunk_size*k, vector(0)); + shard_extent_map_t semap(&sinfo); + + bufferlist bl; + bl.append_zero(44*1024); + + char *buf = bl.c_str(); + + shard_extent_map_t ref_semap(&sinfo); + ref_semap.append_zeros_to_ro_offset(48*1024); + + for (char i=0; i<44; i++) { + buf[i*1024] = c; + int chunk = i/4; + shard_id_t shard(chunk % k); + int offset = chunk_size * (chunk / k) + i % 4 * 1024; + bufferlist tmp; + ref_semap.get_buffer(shard, offset, 1024, tmp); + tmp.c_str()[0] = c++; + } + + sinfo.ro_range_to_shard_extent_map(0, 44*1024, bl, semap); + semap.assert_buffer_contents_equal(ref_semap); + bufferlist insert_bl; + insert_bl.append_zero(2*1024); + insert_bl.c_str()[0] = c; + { + bufferlist tmp; + ref_semap.get_buffer(shard_id_t(1), 14*1024, 1024, tmp); + tmp.c_str()[0] = c++; + } + insert_bl.c_str()[1024] = c; + { + bufferlist tmp; + ref_semap.get_buffer(shard_id_t(1), 15*1024, 1024, tmp); + tmp.c_str()[0] = c++; + } + + sinfo.ro_range_to_shard_extent_map(30*1024, 1024, insert_bl, semap); + semap.assert_buffer_contents_equal(ref_semap); +} + +// Sanity check that k=3 buffer inserts work +TEST(ECUtil, shard_extent_map_t_insert_ro_buffer_3) +{ + int k=3; + int m=2; + int 
chunk_size = 4096; + uint64_t ro_offset = 10 * 1024; + uint64_t ro_length = 32 * 1024; + + char c = 5; + stripe_info_t sinfo(k, m, chunk_size*k, vector(0)); + shard_extent_map_t semap(&sinfo); + bufferlist ref; + bufferlist in; + ref.append_zero(ro_length); + in.append_zero(ro_length); + + for (uint64_t i=0; i chunk_mapping = {shard_id_t(1), shard_id_t(2), shard_id_t(0)}; + stripe_info_t sinfo(2, 1, 2 * 4096, chunk_mapping); + + { + shard_extent_set_t read_mask(sinfo.get_k_plus_m()); + shard_extent_set_t zero_mask(sinfo.get_k_plus_m()); + sinfo.ro_size_to_read_mask(1, read_mask); + sinfo.ro_size_to_zero_mask(1, zero_mask); + + shard_extent_set_t ref_read(sinfo.get_k_plus_m()); + shard_extent_set_t ref_zero(sinfo.get_k_plus_m()); + ref_read[shard_id_t(1)].insert(0, 4096); + ref_zero[shard_id_t(2)].insert(0, 4096); + ref_read[shard_id_t(0)].insert(0, 4096); + + ASSERT_EQ(ref_read, read_mask); + ASSERT_EQ(ref_zero, zero_mask); + } + + { + shard_extent_set_t read_mask(sinfo.get_k_plus_m()); + shard_extent_set_t zero_mask(sinfo.get_k_plus_m()); + sinfo.ro_size_to_read_mask(38912, read_mask); + sinfo.ro_size_to_zero_mask(38912, zero_mask); + + shard_extent_set_t ref_read(sinfo.get_k_plus_m()); + shard_extent_set_t ref_zero(sinfo.get_k_plus_m()); + ref_read[shard_id_t(1)].insert(0, 20480); + ref_read[shard_id_t(2)].insert(0, 20480); + ref_read[shard_id_t(0)].insert(0, 20480); + + ASSERT_EQ(ref_read, read_mask); + ASSERT_EQ(ref_zero, zero_mask); + } +} + +TEST(ECUtil, sinfo_ro_size_to_read_mask) { + stripe_info_t sinfo(2, 1, 16*4096); + + { + shard_extent_set_t read_mask(sinfo.get_k_plus_m()); + shard_extent_set_t zero_mask(sinfo.get_k_plus_m()); + sinfo.ro_size_to_read_mask(1, read_mask); + sinfo.ro_size_to_zero_mask(1, zero_mask); + + shard_extent_set_t ref_read(sinfo.get_k_plus_m()); + shard_extent_set_t ref_zero(sinfo.get_k_plus_m()); + ref_read[shard_id_t(0)].insert(0, 4096); + ref_zero[shard_id_t(1)].insert(0, 4096); + ref_read[shard_id_t(2)].insert(0, 4096); + + ASSERT_EQ(ref_read, read_mask); + ASSERT_EQ(ref_zero, zero_mask); + } + + { + shard_extent_set_t read_mask(sinfo.get_k_plus_m()); + shard_extent_set_t zero_mask(sinfo.get_k_plus_m()); + sinfo.ro_size_to_read_mask(4096, read_mask); + sinfo.ro_size_to_zero_mask(4096, zero_mask); + + shard_extent_set_t ref_read(sinfo.get_k_plus_m()); + shard_extent_set_t ref_zero(sinfo.get_k_plus_m()); + ref_read[shard_id_t(0)].insert(0, 4096); + ref_zero[shard_id_t(1)].insert(0, 4096); + ref_read[shard_id_t(2)].insert(0, 4096); + + ASSERT_EQ(ref_read, read_mask); + ASSERT_EQ(ref_zero, zero_mask); + } + + { + shard_extent_set_t read_mask(sinfo.get_k_plus_m()); + shard_extent_set_t zero_mask(sinfo.get_k_plus_m()); + sinfo.ro_size_to_read_mask(4097, read_mask); + sinfo.ro_size_to_zero_mask(4097, zero_mask); + + shard_extent_set_t ref_read(sinfo.get_k_plus_m()); + shard_extent_set_t ref_zero(sinfo.get_k_plus_m()); + ref_read[shard_id_t(0)].insert(0, 8192); + ref_zero[shard_id_t(1)].insert(0, 8192); + ref_read[shard_id_t(2)].insert(0, 8192); + + ASSERT_EQ(ref_read, read_mask); + ASSERT_EQ(ref_zero, zero_mask); + } + + { + shard_extent_set_t read_mask(sinfo.get_k_plus_m()); + shard_extent_set_t zero_mask(sinfo.get_k_plus_m()); + sinfo.ro_size_to_read_mask(8*4096+1, read_mask); + sinfo.ro_size_to_zero_mask(8*4096+1, zero_mask); + + shard_extent_set_t ref_read(sinfo.get_k_plus_m()); + shard_extent_set_t ref_zero(sinfo.get_k_plus_m()); + ref_read[shard_id_t(0)].insert(0, 8*4096); + ref_read[shard_id_t(1)].insert(0, 4096); + ref_zero[shard_id_t(1)].insert(4096, 
7*4096); + ref_read[shard_id_t(2)].insert(0, 8*4096); + + ASSERT_EQ(ref_read, read_mask); + ASSERT_EQ(ref_zero, zero_mask); + } + + { + shard_extent_set_t read_mask(sinfo.get_k_plus_m()); + shard_extent_set_t zero_mask(sinfo.get_k_plus_m()); + sinfo.ro_size_to_read_mask(16*4096+1, read_mask); + sinfo.ro_size_to_zero_mask(16*4096+1, zero_mask); + + shard_extent_set_t ref_read(sinfo.get_k_plus_m()); + shard_extent_set_t ref_zero(sinfo.get_k_plus_m()); + ref_read[shard_id_t(0)].insert(0, 9*4096); + ref_read[shard_id_t(1)].insert(0, 8*4096); + ref_zero[shard_id_t(1)].insert(8*4096, 1*4096); + ref_read[shard_id_t(2)].insert(0, 9*4096); + + ASSERT_EQ(ref_read, read_mask); + ASSERT_EQ(ref_zero, zero_mask); + } +} + +TEST(ECUtil, slice_iterator) +{ + stripe_info_t sinfo(2, 1, 2*4096); + shard_id_set out_set; + out_set.insert_range(shard_id_t(0), 3); + shard_extent_map_t sem(&sinfo); + { + auto iter = sem.begin_slice_iterator(out_set); + ASSERT_TRUE(iter.get_out_bufferptrs().empty()); + } + + bufferlist a, b; + a.append_zero(8192); + a.c_str()[0] = 'A'; + a.c_str()[4096] = 'C'; + b.append_zero(4096); + b.c_str()[0] = 'B'; + + sem.insert_in_shard(shard_id_t(0), 0, a); + sem.insert_in_shard(shard_id_t(1), 0, b); + { + auto iter = sem.begin_slice_iterator(out_set); + + { + auto out = iter.get_out_bufferptrs(); + ASSERT_EQ(0, iter.get_offset()); + ASSERT_EQ(4096, iter.get_length()); + ASSERT_EQ(2, out.size()); + ASSERT_EQ(4096, out[shard_id_t(0)].length()); + ASSERT_EQ(4096, out[shard_id_t(1)].length()); + ASSERT_EQ('A', out[shard_id_t(0)].c_str()[0]); + ASSERT_EQ('B', out[shard_id_t(1)].c_str()[0]); + } + + ++iter; + { + auto out = iter.get_out_bufferptrs(); + + ASSERT_EQ(4096, iter.get_offset()); + ASSERT_EQ(4096, iter.get_length()); + ASSERT_FALSE(out.empty()); + ASSERT_EQ(1, out.size()); + ASSERT_EQ(4096, out[shard_id_t(0)].length()); + ASSERT_EQ('C', out[shard_id_t(0)].c_str()[0]); + } + + ++iter; + ASSERT_TRUE(iter.is_end()); + } + + // Create a gap. + bufferlist d, e; + d.append_zero(4096); + d.c_str()[0] = 'D'; + e.append_zero(4096); + e.c_str()[0] = 'E'; + sem.insert_in_shard(shard_id_t(0), 4096*4, d); + sem.insert_in_shard(shard_id_t(1), 4096*4, e); + + { + auto iter = sem.begin_slice_iterator(out_set); + + { + auto out = iter.get_out_bufferptrs(); + ASSERT_EQ(0, iter.get_offset()); + ASSERT_EQ(4096, iter.get_length()); + ASSERT_FALSE(out.empty()); + ASSERT_EQ(2, out.size()); + ASSERT_EQ(4096, out[shard_id_t(0)].length()); + ASSERT_EQ(4096, out[shard_id_t(1)].length()); + ASSERT_EQ('A', out[shard_id_t(0)].c_str()[0]); + ASSERT_EQ('B', out[shard_id_t(1)].c_str()[0]); + } + + ++iter; + { + auto out = iter.get_out_bufferptrs(); + ASSERT_EQ(4096, iter.get_offset()); + ASSERT_EQ(4096, iter.get_length()); + ASSERT_FALSE(out.empty()); + ASSERT_EQ(1, out.size()); + ASSERT_EQ(4096, out[shard_id_t(0)].length()); + ASSERT_EQ('C', out[shard_id_t(0)].c_str()[0]); + } + + ++iter; + { + auto out = iter.get_out_bufferptrs(); + ASSERT_EQ(4*4096, iter.get_offset()); + ASSERT_EQ(4096, iter.get_length()); + ASSERT_FALSE(out.empty()); + ASSERT_EQ(2, out.size()); + ASSERT_EQ(4096, out[shard_id_t(0)].length()); + ASSERT_EQ('D', out[shard_id_t(0)].c_str()[0]); + ASSERT_EQ('E', out[shard_id_t(1)].c_str()[0]); + } + + ++iter; + ASSERT_TRUE(iter.is_end()); + } + + // Multiple buffers in each shard and gap at start. 
+ sem.clear(); + a.clear(); + a.append_zero(4096); + a.c_str()[0] = 'A'; + bufferlist c; + c.append_zero(4096); + c.c_str()[0] = 'C'; + + sem.insert_in_shard(shard_id_t(0), 4096*1, a); + sem.insert_in_shard(shard_id_t(1), 4096*1, b); + sem.insert_in_shard(shard_id_t(0), 4096*2, c); + sem.insert_in_shard(shard_id_t(1), 4096*2, d); + + { + auto iter = sem.begin_slice_iterator(out_set); + + { + auto out = iter.get_out_bufferptrs(); + ASSERT_EQ(4096, iter.get_offset()); + ASSERT_EQ(4096, iter.get_length()); + ASSERT_FALSE(out.empty()); + ASSERT_EQ(2, out.size()); + ASSERT_EQ(4096, out[shard_id_t(0)].length()); + ASSERT_EQ(4096, out[shard_id_t(1)].length()); + ASSERT_EQ('A', out[shard_id_t(0)].c_str()[0]); + ASSERT_EQ('B', out[shard_id_t(1)].c_str()[0]); + } + + ++iter; + { + auto out = iter.get_out_bufferptrs(); + ASSERT_EQ(2*4096, iter.get_offset()); + ASSERT_EQ(4096, iter.get_length()); + ASSERT_FALSE(out.empty()); + ASSERT_EQ(2, out.size()); + ASSERT_EQ(4096, out[shard_id_t(0)].length()); + ASSERT_EQ(4096, out[shard_id_t(1)].length()); + ASSERT_EQ('C', out[shard_id_t(0)].c_str()[0]); + ASSERT_EQ('D', out[shard_id_t(1)].c_str()[0]); + } + + ++iter; + ASSERT_TRUE(iter.is_end()); + } + +} +TEST(ECUtil, slice_iterator_subset_out) +{ + stripe_info_t sinfo(2, 1, 2*4096); + shard_id_set out_set; + out_set.insert(shard_id_t(1)); + shard_extent_map_t sem(&sinfo); + { + auto iter = sem.begin_slice_iterator(out_set); + ASSERT_TRUE(iter.get_in_bufferptrs().empty()); + ASSERT_TRUE(iter.get_out_bufferptrs().empty()); + } + + bufferlist a, b; + a.append_zero(8192); + a.c_str()[0] = 'A'; + a.c_str()[4096] = 'C'; + b.append_zero(4096); + b.c_str()[0] = 'B'; + + sem.insert_in_shard(shard_id_t(0), 0, a); + sem.insert_in_shard(shard_id_t(1), 0, b); + { + auto iter = sem.begin_slice_iterator(out_set); + + { + auto in = iter.get_in_bufferptrs(); + auto out = iter.get_out_bufferptrs(); + ASSERT_EQ(0, iter.get_offset()); + ASSERT_EQ(4096, iter.get_length()); + ASSERT_EQ(1, in.size()); + ASSERT_EQ(1, out.size()); + ASSERT_EQ(4096, in[shard_id_t(0)].length()); + ASSERT_EQ(4096, out[shard_id_t(1)].length()); + ASSERT_EQ('A', in[shard_id_t(0)].c_str()[0]); + ASSERT_EQ('B', out[shard_id_t(1)].c_str()[0]); + } + + /* The iterator only cares about outputs, so doesn't care that there is an + * extra 4k to go. + */ + ++iter; + ASSERT_TRUE(iter.is_end()); + } + + // Create a gap. + bufferlist d, e; + d.append_zero(4096); + d.c_str()[0] = 'D'; + e.append_zero(4096); + e.c_str()[0] = 'E'; + sem.insert_in_shard(shard_id_t(0), 4096*4, d); + sem.insert_in_shard(shard_id_t(1), 4096*4, e); + + { + auto iter = sem.begin_slice_iterator(out_set); + + { + auto in = iter.get_in_bufferptrs(); + auto out = iter.get_out_bufferptrs(); + + ASSERT_EQ(0, iter.get_offset()); + ASSERT_EQ(4096, iter.get_length()); + ASSERT_FALSE(in.empty()); + ASSERT_FALSE(out.empty()); + ASSERT_EQ(1, in.size()); + ASSERT_EQ(1, out.size()); + ASSERT_EQ(4096, in[shard_id_t(0)].length()); + ASSERT_EQ(4096, out[shard_id_t(1)].length()); + ASSERT_EQ('A', in[shard_id_t(0)].c_str()[0]); + ASSERT_EQ('B', out[shard_id_t(1)].c_str()[0]); + } + + // Skip the next 4k, since it is not in the output buffer. 
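// A toy model of the skip being asserted next (invented names, not the
// ECUtil iterator): a slice is only emitted at offsets covered by every shard
// in out_set, so ranges held solely by "in" shards are passed over.
#include <cstdint>
#include <map>
#include <set>
#include <utility>
#include <vector>

using ShardBufs = std::map<int, std::vector<std::pair<uint64_t, uint64_t>>>;

static bool emits_slice_at(const ShardBufs &bufs, const std::set<int> &out_set,
                           uint64_t off) {
  for (int shard : out_set) {
    bool covered = false;
    for (const auto &[boff, blen] : bufs.at(shard))
      covered |= (off >= boff && off < boff + blen);
    if (!covered)
      return false;
  }
  return true;
}
// Here out_set = {1} and shard 1 holds [0, 4K) and [16K, 20K):
// emits_slice_at(bufs, {1}, 4096) is false while emits_slice_at(bufs, {1}, 16384)
// is true, so the iterator advances straight from offset 0 to 4*4096.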
+ + ++iter; + { + auto in = iter.get_in_bufferptrs(); + auto out = iter.get_out_bufferptrs(); + + ASSERT_EQ(4*4096, iter.get_offset()); + ASSERT_EQ(4096, iter.get_length()); + ASSERT_FALSE(in.empty()); + ASSERT_FALSE(out.empty()); + ASSERT_EQ(1, in.size()); + ASSERT_EQ(1, out.size()); + ASSERT_EQ(4096, in[shard_id_t(0)].length()); + ASSERT_EQ('D', in[shard_id_t(0)].c_str()[0]); + ASSERT_EQ('E', out[shard_id_t(1)].c_str()[0]); + } + + ++iter; + ASSERT_TRUE(iter.is_end()); + } + + // Multiple buffers in each shard and gap at start. + sem.clear(); + a.clear(); + a.append_zero(4096); + a.c_str()[0] = 'A'; + bufferlist c; + c.append_zero(4096); + c.c_str()[0] = 'C'; + + sem.insert_in_shard(shard_id_t(0), 4096*1, a); + sem.insert_in_shard(shard_id_t(1), 4096*1, b); + sem.insert_in_shard(shard_id_t(0), 4096*2, c); + sem.insert_in_shard(shard_id_t(1), 4096*2, d); + + { + auto iter = sem.begin_slice_iterator(out_set); + + { + auto in = iter.get_in_bufferptrs(); + auto out = iter.get_out_bufferptrs(); + + ASSERT_EQ(4096, iter.get_offset()); + ASSERT_EQ(4096, iter.get_length()); + ASSERT_FALSE(in.empty()); + ASSERT_FALSE(out.empty()); + ASSERT_EQ(1, in.size()); + ASSERT_EQ(1, out.size()); + ASSERT_EQ(4096, in[shard_id_t(0)].length()); + ASSERT_EQ(4096, out[shard_id_t(1)].length()); + ASSERT_EQ('A', in[shard_id_t(0)].c_str()[0]); + ASSERT_EQ('B', out[shard_id_t(1)].c_str()[0]); + } + + ++iter; + { + auto in = iter.get_in_bufferptrs(); + auto out = iter.get_out_bufferptrs(); + + ASSERT_EQ(2*4096, iter.get_offset()); + ASSERT_EQ(4096, iter.get_length()); + ASSERT_FALSE(in.empty()); + ASSERT_FALSE(out.empty()); + ASSERT_EQ(1, in.size()); + ASSERT_EQ(1, out.size()); + ASSERT_EQ(4096, in[shard_id_t(0)].length()); + ASSERT_EQ(4096, out[shard_id_t(1)].length()); + ASSERT_EQ('C', in[shard_id_t(0)].c_str()[0]); + ASSERT_EQ('D', out[shard_id_t(1)].c_str()[0]); + } + + ++iter; + ASSERT_TRUE(iter.is_end()); + } + +} + + +TEST(ECUtil, object_size_to_shard_size) +{ + // This should return aligned values, inputs verifying that the result is + // aligned to the next page + std::vector inputs = {0x4D000, 0x4CCFF, 0x4C001}; + + stripe_info_t sinfo(4, 2, 4*4096); + for (uint64_t input : inputs) + { + ASSERT_EQ(0x14000, sinfo.object_size_to_shard_size(input, shard_id_t(0))); + ASSERT_EQ(0x13000, sinfo.object_size_to_shard_size(input, shard_id_t(1))); + ASSERT_EQ(0x13000, sinfo.object_size_to_shard_size(input, shard_id_t(2))); + ASSERT_EQ(0x13000, sinfo.object_size_to_shard_size(input, shard_id_t(3))); + ASSERT_EQ(0x14000, sinfo.object_size_to_shard_size(input, shard_id_t(4))); + ASSERT_EQ(0x14000, sinfo.object_size_to_shard_size(input, shard_id_t(5))); + } + + // Verify +/-1 also rounds correctly + ASSERT_EQ(0x13000, sinfo.object_size_to_shard_size(0x4C000, shard_id_t(0))); + ASSERT_EQ(0x14000, sinfo.object_size_to_shard_size(0x4D001, shard_id_t(1))); +} + +TEST(ECUtil, slice) +{ + int k=4; + int m=2; + int chunk_size = 4096; + stripe_info_t sinfo(k, m, k*4096); + shard_extent_map_t sem(&sinfo); + + extent_map emap; + buffer::list bl1k; + buffer::list bl4k; + buffer::list bl16k; + buffer::list bl64k; + + bl1k.append_zero(1024); + bl4k.append_zero(4096); + bl16k.append_zero(chunk_size * k); + bl64k.append_zero(chunk_size * k * 4); + shard_extent_set_t ref(sinfo.get_k_plus_m()); + + sem.insert_in_shard(shard_id_t(1), 512, bl1k); + sem.insert_in_shard(shard_id_t(2), 5, bl4k); + sem.insert_in_shard(shard_id_t(3), 256, bl16k); + sem.insert_in_shard(shard_id_t(4), 5, bl64k); + + { + auto slice_map = sem.slice_map(512, 1024); + 
ASSERT_EQ(4, slice_map.get_extent_maps().size()); + ASSERT_EQ(512, slice_map.get_start_offset()); + ASSERT_EQ(512+1024, slice_map.get_end_offset()); + + for (int i=1; i<5; i++) { + ASSERT_EQ(512, slice_map.get_extent_map(shard_id_t(i)).get_start_off()); + ASSERT_EQ(512+1024, slice_map.get_extent_map(shard_id_t(i)).get_end_off()); + } + } + + { + auto slice_map = sem.slice_map(0, 4096); + ASSERT_EQ(4, slice_map.get_extent_maps().size()); + ASSERT_EQ(5, slice_map.get_start_offset()); + ASSERT_EQ(4096, slice_map.get_end_offset()); + ASSERT_EQ(512, slice_map.get_extent_map(shard_id_t(1)).get_start_off()); + ASSERT_EQ(512 + 1024, slice_map.get_extent_map(shard_id_t(1)).get_end_off()); + ASSERT_EQ(5, slice_map.get_extent_map(shard_id_t(2)).get_start_off()); + ASSERT_EQ(4096, slice_map.get_extent_map(shard_id_t(2)).get_end_off()); + ASSERT_EQ(256, slice_map.get_extent_map(shard_id_t(3)).get_start_off()); + ASSERT_EQ(4096, slice_map.get_extent_map(shard_id_t(3)).get_end_off()); + ASSERT_EQ(5, slice_map.get_extent_map(shard_id_t(4)).get_start_off()); + ASSERT_EQ(4096, slice_map.get_extent_map(shard_id_t(4)).get_end_off()); + } + + { + auto slice_map = sem.slice_map(0, 5); + ASSERT_TRUE(slice_map.empty()); + } + + { + auto slice_map = sem.slice_map(64*1024+5, 5); + ASSERT_TRUE(slice_map.empty()); + } + + { + auto slice_map = sem.slice_map(5, 64*1024); + ASSERT_EQ(slice_map, sem); + } + + { + auto slice_map = sem.slice_map(0, 65*1024); + ASSERT_EQ(slice_map, sem); + } +} \ No newline at end of file diff --git a/src/test/osd/test_ec_transaction.cc b/src/test/osd/test_ec_transaction.cc index da089b6c2d287..5c662279a5dd9 100644 --- a/src/test/osd/test_ec_transaction.cc +++ b/src/test/osd/test_ec_transaction.cc @@ -16,6 +16,7 @@ #include "osd/PGTransaction.h" #include "osd/ECTransaction.h" #include "common/debug.h" +#include "osd/ECBackend.h" #include "test/unit.cc" @@ -27,99 +28,344 @@ struct mydpp : public DoutPrefixProvider { #define dout_context g_ceph_context -TEST(ectransaction, two_writes_separated) +struct ECTestOp : ECCommon::RMWPipeline::Op { + PGTransactionUPtr t; +}; + +TEST(ectransaction, two_writes_separated_append) { hobject_t h; - PGTransactionUPtr t(new PGTransaction); + PGTransaction::ObjectOperation op; bufferlist a, b; - t->create(h); a.append_zero(565760); - t->write(h, 0, a.length(), a, 0); + op.buffer_updates.insert(0, a.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{a, 0}); b.append_zero(2437120); - t->write(h, 669856, b.length(), b, 0); + op.buffer_updates.insert(669856, b.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{b, 0}); - ECUtil::stripe_info_t sinfo(2, 2, 8192); - auto plan = ECTransaction::get_write_plan( + pg_pool_t pool; + pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS); + ECUtil::stripe_info_t sinfo(2, 2, 8192, &pool); + shard_id_set shards; + shards.insert_range(shard_id_t(), 4); + ECTransaction::WritePlanObj plan( + h, + op, sinfo, - *t, - [&](const hobject_t &i) { - ECUtil::HashInfoRef ref(new ECUtil::HashInfo(1)); - return ref; - }, - &dpp); - generic_derr << "to_read " << plan.to_read << dendl; - generic_derr << "will_write " << plan.will_write << dendl; - - ASSERT_EQ(0u, plan.to_read.size()); - ASSERT_EQ(1u, plan.will_write.size()); + shards, + shards, + false, + 0, + std::nullopt, + std::nullopt, + ECUtil::HashInfoRef(new ECUtil::HashInfo(1)), + nullptr, + 0); + + generic_derr << "plan " << plan << dendl; + + ASSERT_FALSE(plan.to_read); + ASSERT_EQ(4u, plan.will_write.shard_count()); } -TEST(ectransaction, two_writes_nearby) 
+TEST(ectransaction, two_writes_separated_misaligned_overwrite) { hobject_t h; - PGTransactionUPtr t(new PGTransaction); + PGTransaction::ObjectOperation op; bufferlist a, b; - t->create(h); - - // two nearby writes, both partly touching the same 8192-byte stripe - ECUtil::stripe_info_t sinfo(2, 2, 8192); a.append_zero(565760); - t->write(h, 0, a.length(), a, 0); + op.buffer_updates.insert(0, a.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{a, 0}); b.append_zero(2437120); - t->write(h, 569856, b.length(), b, 0); + op.buffer_updates.insert(669856, b.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{b, 0}); - auto plan = ECTransaction::get_write_plan( + pg_pool_t pool; + pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS); + ECUtil::stripe_info_t sinfo(2, 2, 8192, &pool, std::vector(0)); + object_info_t oi; + oi.size = 3112960; + shard_id_set shards; + shards.insert_range(shard_id_t(), 4); + + ECTransaction::WritePlanObj plan( + h, + op, sinfo, - *t, - [&](const hobject_t &i) { - ECUtil::HashInfoRef ref(new ECUtil::HashInfo(1)); - return ref; - }, - &dpp); - generic_derr << "to_read " << plan.to_read << dendl; - generic_derr << "will_write " << plan.will_write << dendl; - - ASSERT_EQ(0u, plan.to_read.size()); - ASSERT_EQ(1u, plan.will_write.size()); + shards, + shards, + false, + oi.size, + oi, + std::nullopt, + ECUtil::HashInfoRef(new ECUtil::HashInfo(1)), + nullptr, + 0); + + generic_derr << "plan " << plan << dendl; + + ASSERT_EQ(2u, (*plan.to_read).shard_count()); + ASSERT_EQ(4u, plan.will_write.shard_count()); } -TEST(ectransaction, many_writes) +// Test writing to an object at an offset which is beyond the end of the +// current object. +TEST(ectransaction, partial_write) { hobject_t h; - PGTransactionUPtr t(new PGTransaction); - bufferlist a, b; - a.append_zero(512); - b.append_zero(4096); - t->create(h); - - ECUtil::stripe_info_t sinfo(2, 2, 8192); - // write 2801664~512 - // write 2802176~512 - // write 2802688~512 - // write 2803200~512 - t->write(h, 2801664, a.length(), a, 0); - t->write(h, 2802176, a.length(), a, 0); - t->write(h, 2802688, a.length(), a, 0); - t->write(h, 2803200, a.length(), a, 0); - - // write 2805760~4096 - // write 2809856~4096 - // write 2813952~4096 - t->write(h, 2805760, b.length(), b, 0); - t->write(h, 2809856, b.length(), b, 0); - t->write(h, 2813952, b.length(), b, 0); - - auto plan = ECTransaction::get_write_plan( + PGTransaction::ObjectOperation op; + bufferlist a; + + // Start by writing 8 bytes to the start of an object. + a.append_zero(8); + op.buffer_updates.insert(0, a.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{a, 0}); + + pg_pool_t pool; + pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS); + ECUtil::stripe_info_t sinfo(2, 1, 8192, &pool, std::vector(0)); + object_info_t oi; + oi.size = 8; + shard_id_set shards; + shards.insert_range(shard_id_t(), 3); + + ECTransaction::WritePlanObj plan( + h, + op, + sinfo, + shards, + shards, + false, + 0, + oi, + std::nullopt, + ECUtil::HashInfoRef(new ECUtil::HashInfo(1)), + nullptr, + 0); + + generic_derr << "plan " << plan << dendl; + + // The object is empty, so we should have no reads and an 4k write. 
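+  // (The 8-byte write is padded out to a single 4K page on data shard 0, and the
+  // parity shard (2) is written over the same extent.)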
+ ASSERT_FALSE(plan.to_read); + extent_set ref_write; + ref_write.insert(0, 4096); + ASSERT_EQ(2u, plan.will_write.shard_count()); + ASSERT_EQ(ref_write, plan.will_write.at(shard_id_t(0))); + ASSERT_EQ(ref_write, plan.will_write.at(shard_id_t(2))); +} + +TEST(ectransaction, overlapping_write_non_aligned) +{ + hobject_t h; + PGTransaction::ObjectOperation op; + bufferlist a; + + // Start by writing 8 bytes to the start of an object. + a.append_zero(8); + op.buffer_updates.insert(0, a.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{a, 0}); + + pg_pool_t pool; + pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS); + ECUtil::stripe_info_t sinfo(2, 1, 8192, &pool, std::vector(0)); + object_info_t oi; + oi.size = 8; + shard_id_set shards; + shards.insert_range(shard_id_t(), 4); + ECTransaction::WritePlanObj plan( + h, + op, + sinfo, + shards, + shards, + false, + 8, + oi, + std::nullopt, + ECUtil::HashInfoRef(new ECUtil::HashInfo(1)), + nullptr, + 0); + + generic_derr << "plan " << plan << dendl; + + // There should be no overlap of this read. + ASSERT_EQ(1u, (*plan.to_read).shard_count()); + extent_set ref; + ref.insert(0, 4096); + ASSERT_EQ(2u, plan.will_write.shard_count()); + ASSERT_EQ(1u, (*plan.to_read).shard_count()); + ASSERT_EQ(ref, plan.will_write.at(shard_id_t(0))); + ASSERT_EQ(ref, plan.will_write.at(shard_id_t(2))); +} + +TEST(ectransaction, test_appending_write_non_aligned) +{ + hobject_t h; + PGTransaction::ObjectOperation op; + bufferlist a; + + // Start by writing 8 bytes to the start of an object. + a.append_zero(4096); + op.buffer_updates.insert(3*4096, a.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{a, 0}); + + pg_pool_t pool; + pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS); + ECUtil::stripe_info_t sinfo(2, 1, 8192, &pool, std::vector(0)); + object_info_t oi; + oi.size = 4*4096; + shard_id_set shards; + shards.insert_range(shard_id_t(), 4); + ECTransaction::WritePlanObj plan( + h, + op, + sinfo, + shards, + shards, + false, + 8, + oi, + std::nullopt, + ECUtil::HashInfoRef(new ECUtil::HashInfo(1)), + nullptr, + 0); + + generic_derr << "plan " << plan << dendl; + + // We are growing an option from zero with a hole. 
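+  // (Object offset 3*4096 = 12288 maps to stripe 1, chunk 1 - i.e. shard 1 at
+  // shard offset 4096 - so only that data shard plus the parity shard are written.)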
+ ASSERT_FALSE(plan.to_read); + + // The writes will cover not cover the zero parts + ECUtil::shard_extent_set_t ref_write(sinfo.get_k_plus_m()); + ref_write[shard_id_t(1)].insert(4096, 4096); + ref_write[shard_id_t(2)].insert(4096, 4096); + ASSERT_EQ(ref_write, plan.will_write); +} + +TEST(ectransaction, append_with_large_hole) +{ + hobject_t h; + PGTransaction::ObjectOperation op; + bufferlist a; + + // We have a 4k write quite a way after the current limit of a 4k object + a.append_zero(4096); + op.buffer_updates.insert(24*4096, a.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{a, 0}); + + pg_pool_t pool; + pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS); + ECUtil::stripe_info_t sinfo(2, 1, 8192, &pool, std::vector(0)); + object_info_t oi; + oi.size = 25*4096; + shard_id_set shards; + shards.insert_range(shard_id_t(), 4); + ECTransaction::WritePlanObj plan( + h, + op, sinfo, - *t, - [&](const hobject_t &i) { - ECUtil::HashInfoRef ref(new ECUtil::HashInfo(1)); - return ref; - }, - &dpp); - generic_derr << "to_read " << plan.to_read << dendl; - generic_derr << "will_write " << plan.will_write << dendl; - - ASSERT_EQ(0u, plan.to_read.size()); - ASSERT_EQ(1u, plan.will_write.size()); + shards, + shards, + false, + 4096, + oi, + std::nullopt, + ECUtil::HashInfoRef(new ECUtil::HashInfo(1)), + nullptr, + 0); + + generic_derr << "plan " << plan << dendl; + + // Should not require any reads. + ASSERT_FALSE(plan.to_read); + + // The writes will cover the new zero parts. + ECUtil::shard_extent_set_t ref_write(sinfo.get_k_plus_m()); + ref_write[shard_id_t(0)].insert(12*4096, 4096); + ref_write[shard_id_t(2)].insert(12*4096, 4096); + ASSERT_EQ(ref_write, plan.will_write); } + +TEST(ectransaction, test_append_not_page_aligned_with_large_hole) +{ + hobject_t h; + PGTransaction::ObjectOperation op; + bufferlist a; + + // We have a 4k write quite a way after the current limit of a 4k object + a.append_zero(2048); + op.buffer_updates.insert(24*4096 + 1024, a.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{a, 0}); + + pg_pool_t pool; + pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS); + ECUtil::stripe_info_t sinfo(2, 1, 8192, &pool, std::vector(0)); + object_info_t oi; + oi.size = 25*4096; + shard_id_set shards; + shards.insert_range(shard_id_t(), 3); + ECTransaction::WritePlanObj plan( + h, + op, + sinfo, + shards, + shards, + false, + 4096, + oi, + std::nullopt, + ECUtil::HashInfoRef(new ECUtil::HashInfo(1)), + nullptr, + 0); + + generic_derr << "plan " << plan << dendl; + + // No reads (because not yet written) + ASSERT_FALSE(plan.to_read); + + // Writes should grow to 4k + ECUtil::shard_extent_set_t ref_write(sinfo.get_k_plus_m()); + ref_write[shard_id_t(0)].insert(12*4096, 4096); + ref_write[shard_id_t(2)].insert(12*4096, 4096); + ASSERT_EQ(ref_write, plan.will_write); +} + +TEST(ectransaction, test_overwrite_with_missing) +{ + hobject_t h; + PGTransaction::ObjectOperation op, op2; + bufferlist a; + + // We have a 4k write quite a way after the current limit of a 4k object + a.append_zero(14*1024); + op.buffer_updates.insert(0, a.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{a, 0}); + + pg_pool_t pool; + pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS); + ECUtil::stripe_info_t sinfo(2, 1, 8192, &pool, std::vector(0)); + object_info_t oi; + oi.size = 42*1024; + shard_id_set shards; + shards.insert(shard_id_t(0)); + shards.insert(shard_id_t(1)); + + ECTransaction::WritePlanObj plan( + h, + op, + sinfo, + shards, + shards, + false, + 42*1024, + oi, + 
std::nullopt, + ECUtil::HashInfoRef(new ECUtil::HashInfo(1)), + nullptr, + 0); + + generic_derr << "plan " << plan << dendl; + + // No reads (because not yet written) + ASSERT_TRUE(plan.to_read); + ECUtil::shard_extent_set_t ref_read(sinfo.get_k_plus_m()); + ref_read[shard_id_t(1)].insert(4096, 4096); + ASSERT_EQ(ref_read, plan.to_read); + + // Writes should grow to 4k + ECUtil::shard_extent_set_t ref_write(sinfo.get_k_plus_m()); + ref_write[shard_id_t(0)].insert(0, 8192); + ref_write[shard_id_t(1)].insert(0, 8192); + ASSERT_EQ(ref_write, plan.will_write); +} \ No newline at end of file diff --git a/src/test/osd/test_extent_cache.cc b/src/test/osd/test_extent_cache.cc index 9c789ca327451..12ded85f4082e 100644 --- a/src/test/osd/test_extent_cache.cc +++ b/src/test/osd/test_extent_cache.cc @@ -14,269 +14,682 @@ #include -#include "osd/ExtentCache.h" -#include +#include "osd/ECExtentCache.h" using namespace std; +using namespace ECUtil; -extent_map imap_from_vector(vector > &&in) +shard_extent_map_t imap_from_vector(vector>> &&in, stripe_info_t const *sinfo) { - extent_map out; - for (auto &&tup: in) { - bufferlist bl; - bl.append_zero(tup.second); - out.insert(tup.first, bl.length(), bl); + shard_extent_map_t out(sinfo); + for (int shard = 0; shard < (int)in.size(); shard++) { + for (auto &&tup: in[shard]) { + bufferlist bl; + bl.append_zero(tup.second); + out.insert_in_shard(shard_id_t(shard), tup.first, bl); + } } return out; } -extent_map imap_from_iset(const extent_set &set) +shard_extent_map_t imap_from_iset(const shard_extent_set_t &sset, stripe_info_t *sinfo) { - extent_map out; - for (auto &&iter: set) { - bufferlist bl; - bl.append_zero(iter.second); - out.insert(iter.first, iter.second, bl); + shard_extent_map_t out(sinfo); + + for (auto &&[shard, set]: sset) { + for (auto &&iter: set) { + bufferlist bl; + bl.append_zero(iter.second); + out.insert_in_shard(shard, iter.first, bl); + } } return out; } -extent_set iset_from_vector(vector > &&in) +shard_extent_set_t iset_from_vector(vector>> &&in, const stripe_info_t *sinfo) { - extent_set out; - for (auto &&tup: in) { - out.insert(tup.first, tup.second); + shard_extent_set_t out(sinfo->get_k_plus_m()); + for (int shard = 0; shard < (int)in.size(); shard++) { + for (auto &&tup: in[shard]) { + out[shard_id_t(shard)].insert(tup.first, tup.second); + } } return out; } -TEST(extentcache, simple_write) +struct Client : public ECExtentCache::BackendReadListener { - hobject_t oid; - - ExtentCache c; - ExtentCache::write_pin pin; - c.open_write_pin(pin); - - auto to_read = iset_from_vector( - {{0, 2}, {8, 2}, {20, 2}}); - auto to_write = iset_from_vector( - {{0, 10}, {20, 4}}); - auto must_read = c.reserve_extents_for_rmw( - oid, pin, to_write, to_read); - ASSERT_EQ( - must_read, - to_read); - - c.print(std::cerr); - - auto got = imap_from_iset(must_read); - auto pending_read = to_read; - pending_read.subtract(must_read); - - auto pending = c.get_remaining_extents_for_rmw( - oid, - pin, - pending_read); - ASSERT_TRUE(pending.empty()); - - auto write_map = imap_from_iset(to_write); - c.present_rmw_update( - oid, - pin, - write_map); - - c.release_write_pin(pin); + hobject_t oid = hobject_t().make_temp_hobject("My first object"); + stripe_info_t sinfo; + ECExtentCache::LRU lru; + ECExtentCache cache; + optional active_reads; + list results; + + Client(uint64_t chunk_size, int k, int m, uint64_t cache_size) : + sinfo(k, m, k*chunk_size, vector(0)), + lru(cache_size), cache(*this, lru, sinfo, g_ceph_context) {}; + + void backend_read(hobject_t 
_oid, const shard_extent_set_t& request, + uint64_t object_size) override { + ceph_assert(oid == _oid); + active_reads = request; + } + + void cache_ready(const hobject_t& _oid, const shard_extent_map_t& _result) + { + ceph_assert(oid == _oid); + results.emplace_back(_result); + } + + void complete_read() + { + auto reads_done = imap_from_iset(*active_reads, &sinfo); + active_reads.reset(); // set before done, as may be called back. + cache.read_done(oid, std::move(reads_done)); + } + + void complete_write(ECExtentCache::OpRef &op) + { + shard_extent_map_t emap = imap_from_iset(op->get_writes(), &sinfo); + //Fill in the parity. Parity correctness does not matter to the cache. + emap.insert_parity_buffers(); + results.clear(); + cache.write_done(op, std::move(emap)); + } + + void cache_execute(ECExtentCache::OpRef &op) + { + list l; + l.emplace_back(op); + cache.execute(l); + } + + const stripe_info_t *get_stripe_info() const { return &sinfo; } +}; + +TEST(ECExtentCache, double_write_done) +{ + Client cl(32, 2, 1, 64); + + auto to_write = iset_from_vector({{{0, 10}}, {{0, 10}}}, cl.get_stripe_info()); + + optional op = cl.cache.prepare(cl.oid, nullopt, to_write, 10, 10, false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + cl.cache_execute(*op); + cl.complete_write(*op); +} + +TEST(ECExtentCache, simple_write) +{ + Client cl(32, 2, 1, 64); + { + auto to_read = iset_from_vector( {{{0, 2}}, {{0, 2}}}, cl.get_stripe_info()); + auto to_write = iset_from_vector({{{0, 10}}, {{0, 10}}}, cl.get_stripe_info()); + + /* OpRef request(hobject_t const &oid, + std::optional const &to_read, + std::shard_extent_set_t const &write, + uint64_t orig_size, + uint64_t projected_size, + CacheReadyCb &&ready_cb) + */ + + optional op = cl.cache.prepare(cl.oid, to_read, to_write, 10, 10, false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + cl.cache_execute(*op); + ASSERT_EQ(to_read, cl.active_reads); + ASSERT_TRUE(cl.results.empty()); + cl.complete_read(); + + ASSERT_FALSE(cl.active_reads); + ASSERT_EQ(1, cl.results.size()); + ASSERT_EQ(to_read, cl.results.front().get_extent_set()); + cl.complete_write(*op); + + ASSERT_FALSE(cl.active_reads); + ASSERT_TRUE(cl.results.empty()); + op.reset(); + } + + // Repeating the same read should complete without a backend read.. + { + auto to_read = iset_from_vector( {{{0, 2}}, {{0, 2}}}, cl.get_stripe_info()); + auto to_write = iset_from_vector({{{0, 10}}, {{0, 10}}}, cl.get_stripe_info()); + optional op = cl.cache.prepare(cl.oid, to_read, to_write, 10, 10, false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + cl.cache_execute(*op); + ASSERT_FALSE(cl.active_reads); + ASSERT_FALSE(cl.results.empty()); + ASSERT_EQ(1, cl.results.size()); + ASSERT_EQ(to_read, cl.results.front().get_extent_set()); + cl.complete_write(*op); + op.reset(); + } + + // Perform a read overlapping with the previous write, but not hte previous read. + // This should not result in any backend reads, since the cache can be honoured + // from the previous write. + { + auto to_read = iset_from_vector( {{{2, 2}}, {{2, 2}}}, cl.get_stripe_info()); + auto to_write = iset_from_vector({{{0, 10}}, {{0, 10}}}, cl.get_stripe_info()); + optional op = cl.cache.prepare(cl.oid, to_read, to_write, 10, 10, false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + cl.cache_execute(*op); + + // SHould have remained in LRU! 
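+    // (The previous op's write is still resident in the LRU, so the result is
+    // delivered immediately on execute, with no backend read.)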
+ ASSERT_FALSE(cl.active_reads); + ASSERT_EQ(1, cl.results.size()); + ASSERT_EQ(to_read, cl.results.front().get_extent_set()); + cl.complete_write(*op); + op.reset(); + } +} + +TEST(ECExtentCache, sequential_appends) { + Client cl(32, 2, 1, 32); + + auto to_write1 = iset_from_vector({{{0, 10}}}, cl.get_stripe_info()); + + // The first write... + optional op1 = cl.cache.prepare(cl.oid, nullopt, to_write1, 0, 10, false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + cl.cache_execute(*op1); + + // Write should have been honoured immediately. + ASSERT_FALSE(cl.results.empty()); + auto to_write2 = iset_from_vector({{{10, 10}}}, cl.get_stripe_info()); + cl.complete_write(*op1); + ASSERT_TRUE(cl.results.empty()); + + // The first write... + optional op2 = cl.cache.prepare(cl.oid, nullopt, to_write1, 10, 20, false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + cl.cache_execute(*op2); + + ASSERT_FALSE(cl.results.empty()); + cl.complete_write(*op2); + } -TEST(extentcache, write_write_overlap) +TEST(ECExtentCache, multiple_writes) { - hobject_t oid; - - ExtentCache c; - ExtentCache::write_pin pin; - c.open_write_pin(pin); - - // start write 1 - auto to_read = iset_from_vector( - {{0, 2}, {8, 2}, {20, 2}}); - auto to_write = iset_from_vector( - {{0, 10}, {20, 4}}); - auto must_read = c.reserve_extents_for_rmw( - oid, pin, to_write, to_read); - ASSERT_EQ( - must_read, - to_read); - - c.print(std::cerr); - - // start write 2 - ExtentCache::write_pin pin2; - c.open_write_pin(pin2); - auto to_read2 = iset_from_vector( - {{2, 4}, {10, 4}, {18, 4}}); - auto to_write2 = iset_from_vector( - {{2, 12}, {18, 12}}); - auto must_read2 = c.reserve_extents_for_rmw( - oid, pin2, to_write2, to_read2); - ASSERT_EQ( - must_read2, - iset_from_vector({{10, 4}, {18, 2}})); - - c.print(std::cerr); - - // complete read for write 1 and start commit - auto got = imap_from_iset(must_read); - auto pending_read = to_read; - pending_read.subtract(must_read); - auto pending = c.get_remaining_extents_for_rmw( - oid, - pin, - pending_read); - ASSERT_TRUE(pending.empty()); - - auto write_map = imap_from_iset(to_write); - c.present_rmw_update( - oid, - pin, - write_map); - - c.print(std::cerr); - - // complete read for write 2 and start commit - auto pending_read2 = to_read2; - pending_read2.subtract(must_read2); - auto pending2 = c.get_remaining_extents_for_rmw( - oid, - pin2, - pending_read2); - ASSERT_EQ( - pending2, - imap_from_iset(pending_read2)); - - auto write_map2 = imap_from_iset(to_write2); - c.present_rmw_update( - oid, - pin2, - write_map2); - - c.print(std::cerr); - - c.release_write_pin(pin); - - c.print(std::cerr); - - c.release_write_pin(pin2); + Client cl(32, 2, 1, 32); + + auto to_read1 = iset_from_vector( {{{0, 2}}}, cl.get_stripe_info()); + auto to_write1 = iset_from_vector({{{0, 10}}}, cl.get_stripe_info()); + + // This should drive a request for this IO, which we do not yet honour. + optional op1 = cl.cache.prepare(cl.oid, to_read1, to_write1, 10, 10, false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + cl.cache_execute(*op1); + ASSERT_EQ(to_read1, cl.active_reads); + ASSERT_TRUE(cl.results.empty()); + + // Perform another request. We should not see any change in the read requests. 
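+  // (Ops against the same object queue behind op1's outstanding read. op2 asks to
+  // read [8,12), but [8,10) will be covered by op1's cached write, so only [10,12)
+  // should reach the backend later, batched with op3's read.)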
+ auto to_read2 = iset_from_vector( {{{8, 4}}}, cl.get_stripe_info()); + auto to_write2 = iset_from_vector({{{10, 10}}}, cl.get_stripe_info()); + optional op2 = cl.cache.prepare(cl.oid, to_read2, to_write2, 10, 10, false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + cl.cache_execute(*op2); + ASSERT_EQ(to_read1, cl.active_reads); + ASSERT_TRUE(cl.results.empty()); + + // Perform another request, this to check that reads are coalesced. + auto to_read3 = iset_from_vector( {{{32, 6}}}, cl.get_stripe_info()); + auto to_write3 = iset_from_vector({}, cl.get_stripe_info()); + optional op3 = cl.cache.prepare(cl.oid, to_read3, to_write3, 10, 10, false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + cl.cache_execute(*op3); + ASSERT_EQ(to_read1, cl.active_reads); + ASSERT_TRUE(cl.results.empty()); + + // Finally op4, with no reads. + auto to_write4 = iset_from_vector({{{20, 10}}}, cl.get_stripe_info()); + optional op4 = cl.cache.prepare(cl.oid, nullopt, to_write4, 10, 10, false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + cl.cache_execute(*op4); + ASSERT_EQ(to_read1, cl.active_reads); + ASSERT_TRUE(cl.results.empty()); + + // Completing the first read will allow the first write and start a batched read. + // Note that the cache must not read what was written in op 1. + cl.complete_read(); + auto expected_read = iset_from_vector({{{10,2}, {32,6}}}, cl.get_stripe_info()); + ASSERT_EQ(expected_read, cl.active_reads); + ASSERT_EQ(1, cl.results.size()); + ASSERT_EQ(to_read1, cl.results.front().get_extent_set()); + cl.complete_write(*op1); + + // The next write requires some more reads, so should not occur. + ASSERT_TRUE(cl.results.empty()); + + // All reads complete, this should allow for op2 to be ready. 
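+  // (op3 and op4 become ready at the same time: op3's extents were fetched in the
+  // same batched read, and op4 needed no reads, so its result is an empty map.)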
+ cl.complete_read(); + ASSERT_FALSE(cl.active_reads); + ASSERT_EQ(3, cl.results.size()); + auto result = cl.results.begin(); + ASSERT_EQ(to_read2, result++->get_extent_set()); + ASSERT_EQ(to_read3, result++->get_extent_set()); + ASSERT_TRUE(result++->empty()); + + cl.complete_write(*op2); + cl.complete_write(*op3); + cl.complete_write(*op4); + + op1.reset(); + op2.reset(); + op3.reset(); + op4.reset(); } -TEST(extentcache, write_write_overlap2) +int dummies; +struct Dummy +{ + Dummy() {dummies++;} + ~Dummy() {dummies--;} +}; + +TEST(ECExtentCache, on_change) { - hobject_t oid; - - ExtentCache c; - ExtentCache::write_pin pin; - c.open_write_pin(pin); - - // start write 1 - auto to_read = extent_set(); - auto to_write = iset_from_vector( - {{659456, 4096}}); - auto must_read = c.reserve_extents_for_rmw( - oid, pin, to_write, to_read); - ASSERT_EQ( - must_read, - to_read); - - c.print(std::cerr); - - // start write 2 - ExtentCache::write_pin pin2; - c.open_write_pin(pin2); - auto to_read2 = extent_set(); - auto to_write2 = iset_from_vector( - {{663552, 4096}}); - auto must_read2 = c.reserve_extents_for_rmw( - oid, pin2, to_write2, to_read2); - ASSERT_EQ( - must_read2, - to_read2); - - - // start write 3 - ExtentCache::write_pin pin3; - c.open_write_pin(pin3); - auto to_read3 = iset_from_vector({{659456, 8192}}); - auto to_write3 = iset_from_vector({{659456, 8192}}); - auto must_read3 = c.reserve_extents_for_rmw( - oid, pin3, to_write3, to_read3); - ASSERT_EQ( - must_read3, - extent_set()); - - c.print(std::cerr); - - // complete read for write 1 and start commit - auto got = imap_from_iset(must_read); - auto pending_read = to_read; - pending_read.subtract(must_read); - auto pending = c.get_remaining_extents_for_rmw( - oid, - pin, - pending_read); - ASSERT_TRUE(pending.empty()); - - auto write_map = imap_from_iset(to_write); - c.present_rmw_update( - oid, - pin, - write_map); - - c.print(std::cerr); - - // complete read for write 2 and start commit - auto pending_read2 = to_read2; - pending_read2.subtract(must_read2); - auto pending2 = c.get_remaining_extents_for_rmw( - oid, - pin2, - pending_read2); - ASSERT_EQ( - pending2, - imap_from_iset(pending_read2)); - - auto write_map2 = imap_from_iset(to_write2); - c.present_rmw_update( - oid, - pin2, - write_map2); - - // complete read for write 2 and start commit - auto pending_read3 = to_read3; - pending_read3.subtract(must_read3); - auto pending3 = c.get_remaining_extents_for_rmw( - oid, - pin3, - pending_read3); - ASSERT_EQ( - pending3, - imap_from_iset(pending_read3)); - - auto write_map3 = imap_from_iset(to_write3); - c.present_rmw_update( - oid, - pin3, - write_map3); - - - c.print(std::cerr); - - c.release_write_pin(pin); - - c.print(std::cerr); - - c.release_write_pin(pin2); - - c.print(std::cerr); - - c.release_write_pin(pin3); + Client cl(32, 2, 1, 64); + auto to_read1 = iset_from_vector( {{{0, 2}}}, cl.get_stripe_info()); + auto to_write1 = iset_from_vector({{{0, 10}}}, cl.get_stripe_info()); + + optional op; + optional> dummy; + + dummy.emplace(make_shared()); + ceph_assert(dummies == 1); + { + shared_ptr d = *dummy; + /* Here we generate an op that we never expect to be completed. Note that + * some static code analysis tools suggest deleting d here. DO NOT DO THIS + * as we are relying on side effects from the destruction of d in this test. 
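+   * The lambda's by-value capture of d, together with 'dummy', is all that keeps
+   * the Dummy alive, which lets the test observe exactly when the cache releases
+   * the lambda.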
+ */ + op.emplace(cl.cache.prepare(cl.oid, to_read1, to_write1, 10, 10, false, + [d](ECExtentCache::OpRef &ignored) + { + ceph_abort("Should be cancelled"); + })); + } + cl.cache_execute(*op); + + /* We now have the following graph of objects: + * cache -- op -- lambda -- d + * dummy --/ + */ + ASSERT_EQ(1, dummies); + + /* Executing the on_change will "cancel" this cache op. This will cause it + * to release the lambda, reducing us down to dummy -- d + */ + cl.cache.on_change(); + ASSERT_EQ(1, dummies); + + /* This emulates the rmw pipeline clearing outstanding IO. We now have no + * references to d, so we should have destructed the object. + * */ + dummy.reset(); + ASSERT_EQ(0, dummies); + + /* Keeping the op alive here is emulating the dummy keeping a record of the + * cache op. It will also be destroyed at this point by rmw pipeline. + */ + ASSERT_FALSE(cl.cache.idle()); + op.reset(); + ASSERT_TRUE(cl.cache.idle()); + + // The cache has its own asserts, which we should honour. + cl.cache.on_change2(); +} + +TEST(ECExtentCache, multiple_misaligned_writes) +{ + Client cl(256*1024, 2, 1, 1024*1024); + + // IO 1 is really a 6k write. The write is inflated to 8k, but the second 4k is + // partial, so we read the second 4k to RMW + auto to_read1 = iset_from_vector( {{{4*1024, 4*1024}}}, cl.get_stripe_info()); + auto to_write1 = iset_from_vector({{{0, 8*1024}}}, cl.get_stripe_info()); + + // IO 2 is the next 8k write, starting at 6k. So we have a 12k write, reading the + // first and last pages. The first part of this read should be in the cache. + auto to_read2 = iset_from_vector( {{{4*1024, 4*1024}, {12*4096, 4*4096}}}, cl.get_stripe_info()); + auto to_read2_exec = iset_from_vector( {{{12*4096, 4*4096}}}, cl.get_stripe_info()); + auto to_write2 = iset_from_vector({{{4*1024, 12*1024}}}, cl.get_stripe_info()); + + // IO 3 is the next misaligned 4k, very similar to IO 3. + auto to_read3 = iset_from_vector( {{{12*1024, 4*1024}, {20*4096, 4*4096}}}, cl.get_stripe_info()); + auto to_read3_exec = iset_from_vector( {{{20*4096, 4*4096}}}, cl.get_stripe_info()); + auto to_write3 = iset_from_vector({{{12*1024, 12*1024}}}, cl.get_stripe_info()); + + //Perform the first write, which should result in a read. + optional op1 = cl.cache.prepare(cl.oid, to_read1, to_write1, 22*1024, 22*1024, false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + cl.cache_execute(*op1); + ASSERT_EQ(to_read1, cl.active_reads); + ASSERT_TRUE(cl.results.empty()); + + // Submit the second IO. + optional op2 = cl.cache.prepare(cl.oid, to_read2, to_write2, 22*1024, 22*1024, false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + cl.cache_execute(*op2); + // We should still be executing read 1. + ASSERT_EQ(to_read1, cl.active_reads); + ASSERT_TRUE(cl.results.empty()); + + // Allow the read to complete. We should now have op1 done... + cl.complete_read(); + ASSERT_EQ(to_read2_exec, cl.active_reads); + ASSERT_FALSE(cl.results.empty()); + cl.complete_write(*op1); + + // And move on to op3 + optional op3 = cl.cache.prepare(cl.oid, to_read3, to_write3, 22*1024, 22*1024, false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + cl.cache_execute(*op3); + // We should still be executing read 1. + ASSERT_EQ(to_read2_exec, cl.active_reads); + ASSERT_TRUE(cl.results.empty()); + + // Allow the read to complete. We should now have op2 done... 
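+  // (Client::complete_write() clears 'results', which is why the list is expected
+  // to be empty again after each write; the next complete_read makes op2 ready and
+  // leaves op3's remaining read as the outstanding backend read.)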
+ cl.complete_read(); + ASSERT_EQ(to_read3_exec, cl.active_reads); + ASSERT_FALSE(cl.results.empty()); + cl.complete_write(*op2); + ASSERT_EQ(to_read3_exec, cl.active_reads); + ASSERT_TRUE(cl.results.empty()); + cl.complete_read(); + ASSERT_FALSE(cl.results.empty()); + cl.complete_write(*op3); + } + +TEST(ECExtentCache, multiple_misaligned_writes2) +{ + Client cl(256*1024, 2, 1, 1024*1024); + + // IO 1 is really a 6k write. The write is inflated to 8k, but the second 4k is + // partial, so we read the second 4k to RMW + auto to_read1 = iset_from_vector( {{{4*1024, 4*1024}}}, cl.get_stripe_info()); + auto to_write1 = iset_from_vector({{{0, 8*1024}}}, cl.get_stripe_info()); + + // IO 2 is the next 8k write, starting at 6k. So we have a 12k write, reading the + // first and last pages. The first part of this read should be in the cache. + auto to_read2 = iset_from_vector( {{{4*1024, 4*1024}, {12*1024, 4*1024}}}, cl.get_stripe_info()); + auto to_read2_exec = iset_from_vector( {{{12*1024, 4*1024}}}, cl.get_stripe_info()); + auto to_write2 = iset_from_vector({{{4*1024, 12*1024}}}, cl.get_stripe_info()); + + // IO 3 is the next misaligned 4k, very similar to IO 3. + auto to_read3 = iset_from_vector( {{{12*1024, 4*1024}, {20*1024, 4*1024}}}, cl.get_stripe_info()); + auto to_read3_exec = iset_from_vector( {{{20*1024, 4*1024}}}, cl.get_stripe_info()); + auto to_write3 = iset_from_vector({{{12*1024, 12*1024}}}, cl.get_stripe_info()); + + //Perform the first write, which should result in a read. + optional op1 = cl.cache.prepare(cl.oid, to_read1, to_write1, 22*1024, 22*1024, false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + cl.cache_execute(*op1); + ASSERT_EQ(to_read1, cl.active_reads); + ASSERT_TRUE(cl.results.empty()); + + // Submit the second IO. + optional op2 = cl.cache.prepare(cl.oid, to_read2, to_write2, 22*1024, 22*1024, false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + cl.cache_execute(*op2); + // We should still be executing read 1. + ASSERT_EQ(to_read1, cl.active_reads); + ASSERT_TRUE(cl.results.empty()); + + // Allow the read to complete. We should now have op1 done... + cl.complete_read(); + ASSERT_EQ(to_read2_exec, cl.active_reads); + ASSERT_FALSE(cl.results.empty()); + cl.complete_write(*op1); + + // And move on to op3 + optional op3 = cl.cache.prepare(cl.oid, to_read3, to_write3, 22*1024, 22*1024, false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + cl.cache_execute(*op3); + // We should still be executing read 1. + ASSERT_EQ(to_read2_exec, cl.active_reads); + ASSERT_TRUE(cl.results.empty()); + + // Allow the read to complete. We should now have op2 done... 
+ cl.complete_read(); + ASSERT_EQ(to_read3_exec, cl.active_reads); + ASSERT_FALSE(cl.results.empty()); + cl.complete_write(*op2); + ASSERT_EQ(to_read3_exec, cl.active_reads); + ASSERT_TRUE(cl.results.empty()); + cl.complete_read(); + ASSERT_FALSE(cl.results.empty()); + cl.complete_write(*op3); + +} + +TEST(ECExtentCache, test_invalidate) +{ + Client cl(256*1024, 2, 1, 1024*1024); + + /* First attempt a write which does not do any reads */ + { + auto to_read1 = iset_from_vector( {{{0, 4096}}}, cl.get_stripe_info()); + auto to_write1 = iset_from_vector({{{0, 4096}}}, cl.get_stripe_info()); + optional op1 = cl.cache.prepare(cl.oid, to_read1, to_write1, 4096, 4096, false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + cl.cache_execute(*op1); + ASSERT_EQ(to_read1, cl.active_reads); + ASSERT_TRUE(cl.results.empty()); + + /* Now perform an invalidating cache write */ + optional op2 = cl.cache.prepare(cl.oid, nullopt, shard_extent_set_t(cl.sinfo.get_k_plus_m()), 4*1024, 0, false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + cl.cache_execute(*op2); + + cl.complete_read(); + ASSERT_EQ(2, cl.results.size()); + auto result = cl.results.begin(); + ASSERT_FALSE(result++->empty()); + ASSERT_TRUE(result++->empty()); + + cl.complete_write(*op1); + ASSERT_FALSE(cl.active_reads); + cl.complete_write(*op2); + + cl.cache.on_change(); + op1.reset(); + op2.reset(); + cl.cache.on_change2(); + ASSERT_TRUE(cl.cache.idle()); + } + + /* Second test, modifies, deletes, creates, then modifies. */ + { + auto to_read1 = iset_from_vector( {{{0, 8192}}}, cl.get_stripe_info()); + auto to_write1 = iset_from_vector({{{0, 8192}}}, cl.get_stripe_info()); + auto to_write2 = iset_from_vector({{{4096, 4096}}}, cl.get_stripe_info()); + auto to_read3 = iset_from_vector( {{{0, 4096}}}, cl.get_stripe_info()); + auto to_write3 = iset_from_vector({{{0, 4096}}}, cl.get_stripe_info()); + optional op1 = cl.cache.prepare(cl.oid, to_read1, to_write1, 8192, 8192, false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + optional op2 = cl.cache.prepare(cl.oid, nullopt, shard_extent_set_t(cl.sinfo.get_k_plus_m()), 4*1024, 0, false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + optional op3 = cl.cache.prepare(cl.oid, nullopt, to_write2, 0, 8192, false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + optional op4 = cl.cache.prepare(cl.oid, to_read3, to_write3, 8192, 8192, false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + cl.cache_execute(*op1); + cl.cache_execute(*op2); + cl.cache_execute(*op3); + cl.cache_execute(*op4); + + /* The first result must actually read. */ + cl.complete_read(); + ASSERT_EQ(4, cl.results.size()); + auto result = cl.results.begin(); + ASSERT_FALSE(result++->empty()); + ASSERT_TRUE(result++->empty()); + ASSERT_TRUE(result++->empty()); + ASSERT_TRUE(result++->empty()); + cl.complete_write(*op1); + cl.complete_write(*op2); + cl.complete_write(*op3); + cl.complete_write(*op4); + + cl.cache.on_change(); + op1.reset(); + op2.reset(); + op3.reset(); + op4.reset(); + cl.cache.on_change2(); + ASSERT_TRUE(cl.cache.idle()); + } +} + +TEST(ECExtentCache, test_invalidate_lru) +{ + uint64_t c = 4096; + int k = 4; + int m = 2; + Client cl(c, k, m, 1024*c); + + /* Populate the cache LRU and then invalidate the cache. 
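+     The 3767-byte block size is deliberately not page aligned; assuming
+     align_page_prev/align_page_next round down/up to 4K pages, for example
+     align_page_prev(35*bs) is 131072 and align_page_next(36*bs) is 139264.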
*/ + { + uint64_t bs = 3767; + auto io1 = iset_from_vector({{{align_page_prev(35*bs), align_page_next(36*bs) - align_page_prev(35*bs)}}}, cl.get_stripe_info()); + io1[shard_id_t(k)].insert(io1.get_extent_superset()); + io1[shard_id_t(k+1)].insert(io1.get_extent_superset()); + auto io2 = iset_from_vector({{{align_page_prev(18*bs), align_page_next(19*bs) - align_page_prev(18*bs)}}}, cl.get_stripe_info()); + io2[shard_id_t(k)].insert(io1.get_extent_superset()); + io2[shard_id_t(k+1)].insert(io1.get_extent_superset()); + // io 3 is the truncate + auto io3 = shard_extent_set_t(cl.sinfo.get_k_plus_m()); + auto io4 = iset_from_vector({{{align_page_prev(30*bs), align_page_next(31*bs) - align_page_prev(30*bs)}}}, cl.get_stripe_info()); + io3[shard_id_t(k)].insert(io1.get_extent_superset()); + io3[shard_id_t(k+1)].insert(io1.get_extent_superset()); + auto io5 = iset_from_vector({{{align_page_prev(18*bs), align_page_next(19*bs) - align_page_prev(18*bs)}}}, cl.get_stripe_info()); + io4[shard_id_t(k)].insert(io1.get_extent_superset()); + io4[shard_id_t(k+1)].insert(io1.get_extent_superset()); + + optional op1 = cl.cache.prepare(cl.oid, nullopt, io1, 0, align_page_next(36*bs), false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + + cl.cache_execute(*op1); + ASSERT_FALSE(cl.active_reads); + cl.complete_write(*op1); + op1.reset(); + + optional op2 = cl.cache.prepare(cl.oid, io2, io2, align_page_next(36*bs), align_page_next(36*bs), false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + cl.cache_execute(*op2); + // We have active reads because the object was discarded fro the cache + // and has forgotten about all the zero reads. + ASSERT_TRUE(cl.active_reads); + cl.complete_read(); + cl.complete_write(*op2); + op2.reset(); + + optional op3 = cl.cache.prepare(cl.oid, nullopt, io3, align_page_next(36*bs), 0, false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + cl.cache_execute(*op3); + ASSERT_FALSE(cl.active_reads); + cl.complete_write(*op3); + op3.reset(); + + optional op4 = cl.cache.prepare(cl.oid, nullopt, io4, 0, align_page_next(30*bs), false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + cl.cache_execute(*op4); + ASSERT_FALSE(cl.active_reads); + cl.complete_write(*op4); + op4.reset(); + + optional op5 = cl.cache.prepare(cl.oid, io5, io5, align_page_next(30*bs), align_page_next(30*bs), false, + [&cl](ECExtentCache::OpRef &op) + { + cl.cache_ready(op->get_hoid(), op->get_result()); + }); + cl.cache_execute(*op5); + ASSERT_TRUE(cl.active_reads); + cl.complete_write(*op5); + op5.reset(); + } +} \ No newline at end of file diff --git a/src/tools/erasure-code/ceph-erasure-code-tool.cc b/src/tools/erasure-code/ceph-erasure-code-tool.cc index 9a5dc09100c42..3a0fb888da4b5 100644 --- a/src/tools/erasure-code/ceph-erasure-code-tool.cc +++ b/src/tools/erasure-code/ceph-erasure-code-tool.cc @@ -6,7 +6,6 @@ #include "common/ceph_argparse.h" #include "common/config_proxy.h" #include "common/errno.h" -#include "erasure-code/ErasureCode.h" #include "erasure-code/ErasureCodePlugin.h" #include "global/global_context.h" #include "global/global_init.h" @@ -96,7 +95,7 @@ int ec_init(const std::string &profile_str, uint64_t stripe_size = atoi(profile["k"].c_str()); ceph_assert(stripe_size > 0); uint64_t stripe_width = stripe_size * stripe_unit; - sinfo->reset(new ECUtil::stripe_info_t(*ec_impl, stripe_width)); + 
sinfo->reset(new ECUtil::stripe_info_t(*ec_impl, nullptr, stripe_width)); return 0; } @@ -196,37 +195,36 @@ int do_encode(const std::vector &args) { return r; } - std::set want; + ECUtil::shard_extent_map_t encoded_data(sinfo.get()); std::vector shards; boost::split(shards, args[2], boost::is_any_of(",")); - for (auto &shard : shards) { - want.insert(atoi(shard.c_str())); - } - ceph::bufferlist decoded_data; + ceph::bufferlist input_data; std::string fname = args[3]; std::string error; - r = decoded_data.read_file(fname.c_str(), &error); + r = input_data.read_file(fname.c_str(), &error); if (r < 0) { std::cerr << "failed to read " << fname << ": " << error << std::endl; return 1; } uint64_t stripe_width = sinfo->get_stripe_width(); - if (decoded_data.length() % stripe_width != 0) { - uint64_t pad = stripe_width - decoded_data.length() % stripe_width; - decoded_data.append_zero(pad); + if (input_data.length() % stripe_width != 0) { + uint64_t pad = stripe_width - input_data.length() % stripe_width; + input_data.append_zero(pad); } - std::map encoded_data; - r = ECUtil::encode(*sinfo, ec_impl, decoded_data, want, &encoded_data); + sinfo->ro_range_to_shard_extent_map(0, input_data.length(), input_data, encoded_data); + r = encoded_data.encode(ec_impl, nullptr, encoded_data.get_ro_end()); if (r < 0) { std::cerr << "failed to encode: " << cpp_strerror(r) << std::endl; return 1; } - for (auto &[shard, bl] : encoded_data) { + for (auto &[shard, _] : encoded_data.get_extent_maps()) { std::string name = fname + "." + stringify(shard); + bufferlist bl; + encoded_data.get_shard_first_buffer(shard, bl); r = bl.write_file(name.c_str()); if (r < 0) { std::cerr << "failed to write " << name << ": " << cpp_strerror(r) @@ -247,40 +245,41 @@ int do_decode(const std::vector &args) { ceph::ErasureCodeInterfaceRef ec_impl; std::unique_ptr sinfo; int r = ec_init(args[0], args[1], &ec_impl, &sinfo); - if (r < 0) { + if (r) { return r; } - std::map encoded_data; + ECUtil::shard_extent_map_t encoded_data(sinfo.get()); std::vector shards; boost::split(shards, args[2], boost::is_any_of(",")); - for (auto &shard : shards) { - encoded_data[atoi(shard.c_str())] = {}; - } - ceph::bufferlist decoded_data; std::string fname = args[3]; std::set want_to_read; const auto chunk_mapping = ec_impl->get_chunk_mapping(); - for (auto &[shard, bl] : encoded_data) { - std::string name = fname + "." + stringify(shard); + for (auto &shard_str : shards) { + std::string name = fname + "." + shard_str; std::string error; + bufferlist bl; r = bl.read_file(name.c_str(), &error); if (r < 0) { std::cerr << "failed to read " << name << ": " << error << std::endl; return 1; } - auto chunk = static_cast(chunk_mapping.size()) > shard ? 
- chunk_mapping[shard] : shard_id_t(shard); - want_to_read.insert(static_cast(chunk)); + shard_id_t shard = sinfo->get_shard(raw_shard_id_t(atoi(shard_str.c_str()))); + encoded_data.insert_in_shard(shard, 0, bl); } - r = ECUtil::decode(*sinfo, ec_impl, want_to_read, encoded_data, &decoded_data); + ECUtil::shard_extent_set_t wanted(sinfo->get_k_plus_m()); + sinfo->ro_range_to_shard_extent_set(encoded_data.get_ro_start(), + encoded_data.get_ro_end() - encoded_data.get_ro_start(), wanted); + + r = encoded_data.decode(ec_impl, wanted, encoded_data.get_ro_end()); if (r < 0) { std::cerr << "failed to decode: " << cpp_strerror(r) << std::endl; return 1; } + bufferlist decoded_data = encoded_data.get_ro_buffer(); r = decoded_data.write_file(fname.c_str()); if (r < 0) { std::cerr << "failed to write " << fname << ": " << cpp_strerror(r) -- 2.39.5