ECUtilL.cc
ECCommon.cc
ECBackend.cc
- ExtentCache.cc
+ ECExtentCache.cc
ECTransaction.cc
ECUtil.cc
ECInject.cc
#include "ECSwitch.h"
#include "PrimaryLogPG.h"
-#include "osd_tracer.h"
#define dout_context cct
#define dout_subsys ceph_subsys_osd
using ceph::ErasureCodeInterfaceRef;
using ceph::Formatter;
-static ostream& _prefix(std::ostream *_dout, ECBackend *pgb) {
+static ostream &_prefix(std::ostream *_dout, ECBackend *pgb) {
return pgb->get_parent()->gen_dbg_prefix(*_dout);
}
-static ostream& _prefix(std::ostream *_dout, ECBackend::RecoveryBackend *pgb) {
+static ostream &_prefix(std::ostream *_dout, ECBackend::RecoveryBackend *pgb) {
return pgb->get_parent()->gen_dbg_prefix(*_dout);
}
list<ECBackend::RecoveryBackend::RecoveryOp> ops;
};
-static ostream &operator<<(ostream &lhs, const map<pg_shard_t, bufferlist> &rhs)
-{
- lhs << "[";
- for (map<pg_shard_t, bufferlist>::const_iterator i = rhs.begin();
- i != rhs.end();
- ++i) {
- if (i != rhs.begin())
- lhs << ", ";
- lhs << make_pair(i->first, i->second.length());
- }
- return lhs << "]";
-}
-
-static ostream &operator<<(ostream &lhs, const map<int, bufferlist> &rhs)
-{
- lhs << "[";
- for (map<int, bufferlist>::const_iterator i = rhs.begin();
- i != rhs.end();
- ++i) {
- if (i != rhs.begin())
- lhs << ", ";
- lhs << make_pair(i->first, i->second.length());
- }
- return lhs << "]";
-}
-
-ostream &operator<<(ostream &lhs, const ECBackend::RecoveryBackend::RecoveryOp &rhs)
-{
- return lhs << "RecoveryOp("
- << "hoid=" << rhs.hoid
- << " v=" << rhs.v
- << " missing_on=" << rhs.missing_on
- << " missing_on_shards=" << rhs.missing_on_shards
- << " recovery_info=" << rhs.recovery_info
- << " recovery_progress=" << rhs.recovery_progress
- << " obc refcount=" << rhs.obc.use_count()
- << " state=" << ECBackend::RecoveryBackend::RecoveryOp::tostr(rhs.state)
- << " waiting_on_pushes=" << rhs.waiting_on_pushes
- << " extent_requested=" << rhs.extent_requested
- << ")";
-}
-
-void ECBackend::RecoveryBackend::RecoveryOp::dump(Formatter *f) const
-{
+void ECBackend::RecoveryBackend::RecoveryOp::dump(Formatter *f) const {
f->dump_stream("hoid") << hoid;
f->dump_stream("v") << v;
f->dump_stream("missing_on") << missing_on;
f->dump_stream("recovery_progress") << recovery_progress;
f->dump_stream("state") << tostr(state);
f->dump_stream("waiting_on_pushes") << waiting_on_pushes;
- f->dump_stream("extent_requested") << extent_requested;
}
ECBackend::ECBackend(
ErasureCodeInterfaceRef ec_impl,
uint64_t stripe_width,
ECSwitch *s,
- ECExtentCache::LRU &ignored)
+ ECExtentCache::LRU &ec_extent_cache_lru)
: parent(pg), cct(cct), switcher(s),
read_pipeline(cct, ec_impl, this->sinfo, get_parent()->get_eclistener()),
- rmw_pipeline(cct, ec_impl, this->sinfo, get_parent()->get_eclistener(), *this),
- recovery_backend(cct, switcher->coll, ec_impl, this->sinfo, read_pipeline, unstable_hashinfo_registry, get_parent(), this),
+ rmw_pipeline(cct, ec_impl, this->sinfo, get_parent()->get_eclistener(),
+ *this, ec_extent_cache_lru),
+ recovery_backend(cct, switcher->coll, ec_impl, this->sinfo, read_pipeline,
+ unstable_hashinfo_registry, get_parent(), this),
ec_impl(ec_impl),
- sinfo(ec_impl, stripe_width),
+ sinfo(ec_impl, &(get_parent()->get_pool()), stripe_width),
unstable_hashinfo_registry(cct, ec_impl) {
+
+ /* EC makes some assumptions about how the plugin organises the *data* shards:
+ * - The chunk size is constant for a particular profile.
+ * - A stripe consists of k chunks.
+ */
ceph_assert((ec_impl->get_data_chunk_count() *
- ec_impl->get_chunk_size(stripe_width)) == stripe_width);
+ ec_impl->get_chunk_size(stripe_width)) == stripe_width);
}
-PGBackend::RecoveryHandle *ECBackend::open_recovery_op()
-{
+PGBackend::RecoveryHandle *ECBackend::open_recovery_op() {
return recovery_backend.open_recovery_op();
}
ECBackend::RecoveryBackend::RecoveryBackend(
- CephContext* cct,
+ CephContext *cct,
const coll_t &coll,
ceph::ErasureCodeInterfaceRef ec_impl,
- const ECUtil::stripe_info_t& sinfo,
- ReadPipeline& read_pipeline,
- UnstableHashInfoRegistry& unstable_hashinfo_registry,
- ECListener* parent,
- ECBackend* ecbackend)
+ const ECUtil::stripe_info_t &sinfo,
+ ReadPipeline &read_pipeline,
+ UnstableHashInfoRegistry &unstable_hashinfo_registry,
+ ECListener *parent,
+ ECBackend *ecbackend)
: cct(cct),
coll(coll),
ec_impl(std::move(ec_impl)),
read_pipeline(read_pipeline),
unstable_hashinfo_registry(unstable_hashinfo_registry),
parent(parent),
- ecbackend(ecbackend) {
-}
+ ecbackend(ecbackend) {}
-PGBackend::RecoveryHandle *ECBackend::RecoveryBackend::open_recovery_op()
-{
+PGBackend::RecoveryHandle *ECBackend::RecoveryBackend::open_recovery_op() {
return new ECRecoveryHandle;
}
-void ECBackend::RecoveryBackend::_failed_push(const hobject_t &hoid, ECCommon::read_result_t &res)
-{
+void ECBackend::RecoveryBackend::_failed_push(const hobject_t &hoid,
+ ECCommon::read_result_t &res) {
dout(10) << __func__ << ": Read error " << hoid << " r="
<< res.r << " errors=" << res.errors << dendl;
dout(10) << __func__ << ": canceling recovery op for obj " << hoid
recovery_ops.erase(hoid);
set<pg_shard_t> fl;
- for (auto&& i : res.errors) {
+ for (auto &&i: res.errors) {
fl.insert(i.first);
}
get_parent()->on_failed_pull(fl, hoid, v);
}
struct RecoveryMessages {
- map<hobject_t,
- ECCommon::read_request_t> recovery_reads;
- map<hobject_t, set<int>> want_to_read;
-
- void recovery_read(
- const hobject_t &hoid, uint64_t off, uint64_t len,
- set<int> &&_want_to_read,
- const map<pg_shard_t, vector<pair<int, int>>> &need,
- bool attrs)
- {
- list<ec_align_t> to_read;
- to_read.emplace_back(ec_align_t{off, len, 0});
+ map<hobject_t, ECCommon::read_request_t> recovery_reads;
+
+ void recovery_read(const hobject_t &hoid,
+ const ECCommon::read_request_t &read_request) {
ceph_assert(!recovery_reads.count(hoid));
- want_to_read.insert(make_pair(hoid, std::move(_want_to_read)));
- recovery_reads.insert(
- make_pair(
- hoid,
- ECCommon::read_request_t(
- to_read,
- need,
- attrs)));
- }
-
- map<pg_shard_t, vector<PushOp> > pushes;
- map<pg_shard_t, vector<PushReplyOp> > push_replies;
+ recovery_reads.insert(make_pair(hoid, read_request));
+ }
+
+ map<pg_shard_t, vector<PushOp>> pushes;
+ map<pg_shard_t, vector<PushReplyOp>> push_replies;
ObjectStore::Transaction t;
};
void ECBackend::handle_recovery_push(
const PushOp &op,
RecoveryMessages *m,
- bool is_repair)
-{
+ bool is_repair) {
if (get_parent()->pg_is_remote_backfilling()) {
get_parent()->pg_add_local_num_bytes(op.data.length());
- get_parent()->pg_add_num_bytes(op.data.length() * get_ec_data_chunk_count());
+ get_parent()->pg_add_num_bytes(op.data.length() * sinfo.get_k());
dout(10) << __func__ << " " << op.soid
<< " add new actual data by " << op.data.length()
- << " add new num_bytes by " << op.data.length() * get_ec_data_chunk_count()
+ << " add new num_bytes by " << op.data.length() * sinfo.get_k()
<< dendl;
}
recovery_backend.handle_recovery_push(op, m, is_repair);
if (op.after_progress.data_complete &&
- !(get_parent()->pgb_is_primary()) &&
- get_parent()->pg_is_remote_backfilling()) {
+ !(get_parent()->pgb_is_primary()) &&
+ get_parent()->pg_is_remote_backfilling()) {
struct stat st;
- int r = switcher->store->stat(switcher->ch, ghobject_t(op.soid, ghobject_t::NO_GEN,
- get_parent()->whoami_shard().shard), &st);
+ int r = switcher->store->stat(switcher->ch, ghobject_t(
+ op.soid, ghobject_t::NO_GEN,
+ get_parent()->whoami_shard().shard), &st);
if (r == 0) {
get_parent()->pg_sub_local_num_bytes(st.st_size);
// XXX: This can be way overestimated for small objects
- get_parent()->pg_sub_num_bytes(st.st_size * get_ec_data_chunk_count());
+ get_parent()->pg_sub_num_bytes(st.st_size * sinfo.get_k());
dout(10) << __func__ << " " << op.soid
<< " sub actual data by " << st.st_size
- << " sub num_bytes by " << st.st_size * get_ec_data_chunk_count()
+ << " sub num_bytes by " << st.st_size * sinfo.get_k()
<< dendl;
}
}
void ECBackend::RecoveryBackend::handle_recovery_push(
const PushOp &op,
RecoveryMessages *m,
- bool is_repair)
-{
+ bool is_repair) {
if (get_parent()->check_failsafe_full()) {
- dout(10) << __func__ << " Out of space (failsafe) processing push request." << dendl;
+ dout(10) << __func__ << " Out of space (failsafe) processing push request."
+ << dendl;
ceph_abort();
}
ghobject_t tobj;
if (oneshot) {
tobj = ghobject_t(op.soid, ghobject_t::NO_GEN,
- get_parent()->whoami_shard().shard);
+ get_parent()->whoami_shard().shard);
} else {
tobj = ghobject_t(get_parent()->get_temp_recovery_object(op.soid,
- op.version),
- ghobject_t::NO_GEN,
- get_parent()->whoami_shard().shard);
+ op.version),
+ ghobject_t::NO_GEN,
+ get_parent()->whoami_shard().shard);
if (op.before_progress.first) {
dout(10) << __func__ << ": Adding oid "
<< tobj.hobj << " in the temp collection" << dendl;
<< tobj.hobj << " from the temp collection" << dendl;
clear_temp_obj(tobj.hobj);
m->t.remove(coll, ghobject_t(
- op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+ op.soid, ghobject_t::NO_GEN,
+ get_parent()->whoami_shard().shard));
m->t.collection_move_rename(
coll, tobj,
coll, ghobject_t(
- op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+ op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
}
if (op.after_progress.data_complete) {
if ((get_parent()->pgb_is_primary())) {
if (get_parent()->pg_is_repair() || is_repair)
get_parent()->inc_osd_stat_repaired();
get_parent()->on_local_recover(
- op.soid,
- op.recovery_info,
- recovery_ops[op.soid].obc,
- false,
- &m->t);
+ op.soid,
+ op.recovery_info,
+ recovery_ops[op.soid].obc,
+ false,
+ &m->t);
} else {
// If primary told us this is a repair, bump osd_stat_t::num_objects_repaired
if (is_repair)
get_parent()->inc_osd_stat_repaired();
get_parent()->on_local_recover(
- op.soid,
- op.recovery_info,
- ObjectContextRef(),
- false,
- &m->t);
+ op.soid,
+ op.recovery_info,
+ ObjectContextRef(),
+ false,
+ &m->t);
}
}
m->push_replies[get_parent()->primary_shard()].push_back(PushReplyOp());
void ECBackend::RecoveryBackend::handle_recovery_push_reply(
const PushReplyOp &op,
pg_shard_t from,
- RecoveryMessages *m)
-{
+ RecoveryMessages *m) {
if (!recovery_ops.count(op.soid))
return;
RecoveryOp &rop = recovery_ops[op.soid];
void ECBackend::RecoveryBackend::handle_recovery_read_complete(
const hobject_t &hoid,
- boost::tuple<uint64_t, uint64_t, map<pg_shard_t, bufferlist> > &to_read,
- std::optional<map<string, bufferlist, less<>> > attrs,
- RecoveryMessages *m)
-{
- dout(10) << __func__ << ": returned " << hoid << " "
- << "(" << to_read.get<0>()
- << ", " << to_read.get<1>()
- << ", " << to_read.get<2>()
- << ")"
- << dendl;
- ceph_assert(recovery_ops.count(hoid));
+ ECUtil::shard_extent_map_t &&buffers_read,
+ std::optional<map<string, bufferlist, less<>>> attrs,
+ const ECUtil::shard_extent_set_t &want_to_read,
+ RecoveryMessages *m) {
+ dout(10) << __func__ << ": returned " << hoid << " " << buffers_read << dendl;
+ ceph_assert(recovery_ops.contains(hoid));
RecoveryBackend::RecoveryOp &op = recovery_ops[hoid];
- ceph_assert(op.returned_data.empty());
- map<int, bufferlist*> target;
- for (set<shard_id_t>::iterator i = op.missing_on_shards.begin();
- i != op.missing_on_shards.end();
- ++i) {
- target[static_cast<int>(*i)] = &(op.returned_data[static_cast<int>(*i)]);
- }
- map<int, bufferlist> from;
- for(map<pg_shard_t, bufferlist>::iterator i = to_read.get<2>().begin();
- i != to_read.get<2>().end();
- ++i) {
- from[static_cast<int>(i->first.shard)] = std::move(i->second);
- }
- dout(10) << __func__ << ": " << from << dendl;
- int r;
- r = ECUtil::decode(sinfo, ec_impl, from, target);
- ceph_assert(r == 0);
+
if (attrs) {
op.xattrs.swap(*attrs);
op.recovery_info.oi = op.obc->obs.oi;
}
- ECUtil::HashInfo hinfo(ec_impl->get_chunk_count());
- if (op.obc->obs.oi.size > 0) {
- ceph_assert(op.xattrs.count(ECUtil::get_hinfo_key()));
- auto bp = op.xattrs[ECUtil::get_hinfo_key()].cbegin();
- decode(hinfo, bp);
+ if (sinfo.require_hinfo()) {
+ ECUtil::HashInfo hinfo(sinfo.get_k_plus_m());
+ if (op.obc->obs.oi.size > 0) {
+ ceph_assert(op.xattrs.count(ECUtil::get_hinfo_key()));
+ auto bp = op.xattrs[ECUtil::get_hinfo_key()].cbegin();
+ decode(hinfo, bp);
+ }
+ op.hinfo = unstable_hashinfo_registry.maybe_put_hash_info(
+ hoid, std::move(hinfo));
}
- op.hinfo = unstable_hashinfo_registry.maybe_put_hash_info(hoid, std::move(hinfo));
}
ceph_assert(op.xattrs.size());
ceph_assert(op.obc);
+
+ op.returned_data.emplace(std::move(buffers_read));
+
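+  /* Work out which shard extents actually need reconstructing: start from what
+   * the caller wanted, then drop anything that was read directly or that lies
+   * beyond the object's read mask.
+   */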
+ ECUtil::shard_extent_set_t read_mask(sinfo.get_k_plus_m());
+ sinfo.ro_size_to_read_mask(op.recovery_info.size, read_mask);
+ ECUtil::shard_extent_set_t shard_want_to_read(sinfo.get_k_plus_m());
+
+ for (auto &[shard, eset] : want_to_read) {
+ /* Read buffers do not need recovering! */
+ if (buffers_read.contains(shard)) {
+ continue;
+ }
+
+ /* Read-buffers will be truncated to the end-of-object. Do not attempt
+ * to recover off-the-end.
+ */
+    shard_want_to_read[shard].intersection_of(read_mask.get(shard), eset);
+
+ /* Some shards may be empty */
+ if (shard_want_to_read[shard].empty()) {
+ shard_want_to_read.erase(shard);
+ }
+ }
+
+ uint64_t aligned_size = ECUtil::align_page_next(op.obc->obs.oi.size);
+
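+  /* Decode the missing data extents from the shards that were read, then
+   * regenerate parity by re-encoding so every missing shard can be pushed.
+   */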
+ int r = op.returned_data->decode(ec_impl, shard_want_to_read, aligned_size);
+ ceph_assert(r == 0);
+ // We are never appending here, so we never need hinfo.
+ op.returned_data->insert_parity_buffers();
+ r = op.returned_data->encode(ec_impl, NULL, 0);
+  ceph_assert(r == 0);
+
+ // Finally, we don't want to write any padding, so truncate the buffer
+ // to remove it.
+ op.returned_data->erase_after_ro_offset(aligned_size);
+
+ for (auto &&shard: op.missing_on_shards) {
+ if (read_mask.contains(shard) && op.returned_data->contains_shard(shard)) {
+ ceph_assert(read_mask.at(shard).range_end() >=
+ op.returned_data->get_extent_map(shard).get_end_off());
+ }
+ }
+
+ dout(20) << __func__ << ": oid=" << op.hoid << " "
+ << op.returned_data->debug_string(2048, 8) << dendl;
+
continue_recovery_op(op, m);
}
struct SendPushReplies : public Context {
PGBackend::Listener *l;
epoch_t epoch;
- map<int, MOSDPGPushReply*> replies;
+ std::map<int, MOSDPGPushReply*> replies;
+
SendPushReplies(
PGBackend::Listener *l,
epoch_t epoch,
- map<int, MOSDPGPushReply*> &in) : l(l), epoch(epoch) {
+ std::map<int, MOSDPGPushReply*> &in) : l(l), epoch(epoch) {
replies.swap(in);
}
+
void finish(int) override {
std::vector<std::pair<int, Message*>> messages;
messages.reserve(replies.size());
- for (map<int, MOSDPGPushReply*>::iterator i = replies.begin();
- i != replies.end();
- ++i) {
- messages.push_back(std::make_pair(i->first, i->second));
+ for (auto & reply : replies) {
+ messages.push_back(reply);
}
if (!messages.empty()) {
l->send_message_osd_cluster(messages, epoch);
}
replies.clear();
}
+
~SendPushReplies() override {
- for (map<int, MOSDPGPushReply*>::iterator i = replies.begin();
- i != replies.end();
- ++i) {
- i->second->put();
+ for (auto & [_, reply] : replies) {
+ reply->put();
}
replies.clear();
}
};
struct RecoveryReadCompleter : ECCommon::ReadCompleter {
- RecoveryReadCompleter(ECBackend::RecoveryBackend& backend)
+ RecoveryReadCompleter(ECBackend::RecoveryBackend &backend)
: backend(backend) {}
void finish_single_request(
- const hobject_t &hoid,
- ECCommon::read_result_t &res,
- list<ec_align_t>,
- set<int> wanted_to_read) override
- {
+ const hobject_t &hoid,
+ ECCommon::read_result_t &&res,
+ ECCommon::read_request_t &req) override {
if (!(res.r == 0 && res.errors.empty())) {
backend._failed_push(hoid, res);
return;
}
- ceph_assert(res.returned.size() == 1);
+ ceph_assert(req.to_read.size() == 0);
backend.handle_recovery_read_complete(
hoid,
- res.returned.back(),
+ std::move(res.buffers_read),
res.attrs,
+ req.shard_want_to_read,
&rm);
}
- void finish(int priority) && override
- {
+ void finish(int priority) && override {
backend.dispatch_recovery_messages(rm, priority);
}
- ECBackend::RecoveryBackend& backend;
+ ECBackend::RecoveryBackend &backend;
RecoveryMessages rm;
};
void ECBackend::ECRecoveryBackend::commit_txn_send_replies(
- ceph::os::Transaction&& txn,
- std::map<int, MOSDPGPushReply*> replies)
-{
+ ceph::os::Transaction &&txn,
+ std::map<int, MOSDPGPushReply*> replies) {
txn.register_on_complete(
- get_parent()->bless_context(
- new SendPushReplies(
- get_parent(),
- get_osdmap_epoch(),
- replies)));
+ get_parent()->bless_context(
+ new SendPushReplies(
+ get_parent(),
+ get_osdmap_epoch(),
+ replies)));
get_parent()->queue_transaction(std::move(txn));
}
-void ECBackend::RecoveryBackend::dispatch_recovery_messages(RecoveryMessages &m, int priority)
-{
- for (map<pg_shard_t, vector<PushOp> >::iterator i = m.pushes.begin();
+void ECBackend::RecoveryBackend::dispatch_recovery_messages(
+ RecoveryMessages &m, int priority) {
+ for (map<pg_shard_t, vector<PushOp>>::iterator i = m.pushes.begin();
i != m.pushes.end();
m.pushes.erase(i++)) {
MOSDPGPush *msg = new MOSDPGPush();
msg->pushes.swap(i->second);
msg->compute_cost(cct);
msg->is_repair = get_parent()->pg_is_repair();
- std::vector wrapped_msg {
+ std::vector wrapped_msg{
std::make_pair(i->first.osd, static_cast<Message*>(msg))
};
get_parent()->send_message_osd_cluster(wrapped_msg, msg->map_epoch);
}
- map<int, MOSDPGPushReply*> replies;
- for (map<pg_shard_t, vector<PushReplyOp> >::iterator i =
- m.push_replies.begin();
+ std::map<int, MOSDPGPushReply*> replies;
+ for (map<pg_shard_t, vector<PushReplyOp>>::iterator i =
+ m.push_replies.begin();
i != m.push_replies.end();
m.push_replies.erase(i++)) {
MOSDPGPushReply *msg = new MOSDPGPushReply();
msg->pgid = spg_t(get_parent()->get_info().pgid.pgid, i->first.shard);
msg->replies.swap(i->second);
msg->compute_cost(cct);
- replies.insert(make_pair(i->first.osd, msg));
+ replies.insert(std::pair(i->first.osd, msg));
}
  if (m.recovery_reads.empty())
    return;
read_pipeline.start_read_op(
priority,
- m.want_to_read,
m.recovery_reads,
- OpRequestRef(),
false,
true,
std::make_unique<RecoveryReadCompleter>(*this));
void ECBackend::RecoveryBackend::continue_recovery_op(
RecoveryBackend::RecoveryOp &op,
- RecoveryMessages *m)
-{
+ RecoveryMessages *m) {
dout(10) << __func__ << ": continuing " << op << dendl;
using RecoveryOp = RecoveryBackend::RecoveryOp;
while (1) {
switch (op.state) {
case RecoveryOp::IDLE: {
- // start read
- op.state = RecoveryOp::READING;
ceph_assert(!op.recovery_progress.data_complete);
- set<int> want(op.missing_on_shards.begin(), op.missing_on_shards.end());
- uint64_t from = op.recovery_progress.data_recovered_to;
- uint64_t amount = get_recovery_chunk_size();
+ ECUtil::shard_extent_set_t want(sinfo.get_k_plus_m());
+
+ op.state = RecoveryOp::READING;
+
+ // We always read the recovery chunk size (default 8MiB + parity). If that
+ // amount of data is not available, then the backend will truncate the
+ // response.
+ sinfo.ro_range_to_shard_extent_set_with_parity(
+ op.recovery_progress.data_recovered_to,
+ get_recovery_chunk_size(), want);
if (op.recovery_progress.first && op.obc) {
- if (auto [r, attrs, size] = ecbackend->get_attrs_n_size_from_disk(op.hoid);
- r >= 0 || r == -ENOENT) {
- op.hinfo = unstable_hashinfo_registry.get_hash_info(op.hoid, false, attrs, size);
- } else {
- derr << __func__ << ": can't stat-or-getattr on " << op.hoid << dendl;
- }
- if (!op.hinfo) {
- derr << __func__ << ": " << op.hoid << " has inconsistent hinfo"
+ op.xattrs = op.obc->attr_cache;
+ if (sinfo.require_hinfo()) {
+ if (auto [r, attrs, size] = ecbackend->get_attrs_n_size_from_disk(
+ op.hoid);
+ r >= 0 || r == -ENOENT) {
+ op.hinfo = unstable_hashinfo_registry.get_hash_info(
+ op.hoid, false, attrs, size);
+ } else {
+ derr << __func__ << ": can't stat-or-getattr on " << op.hoid <<
+ dendl;
+ }
+ if (!op.hinfo) {
+ derr << __func__ << ": " << op.hoid << " has inconsistent hinfo"
<< dendl;
- ceph_assert(recovery_ops.count(op.hoid));
- eversion_t v = recovery_ops[op.hoid].v;
- recovery_ops.erase(op.hoid);
- // TODO: not in crimson yet
- get_parent()->on_failed_pull({get_parent()->whoami_shard()},
- op.hoid, v);
- return;
+ ceph_assert(recovery_ops.count(op.hoid));
+ eversion_t v = recovery_ops[op.hoid].v;
+ recovery_ops.erase(op.hoid);
+ // TODO: not in crimson yet
+ get_parent()->on_failed_pull({get_parent()->whoami_shard()},
+ op.hoid, v);
+ return;
+ }
+ encode(*(op.hinfo), op.xattrs[ECUtil::get_hinfo_key()]);
}
- op.xattrs = op.obc->attr_cache;
- encode(*(op.hinfo), op.xattrs[ECUtil::get_hinfo_key()]);
}
- map<pg_shard_t, vector<pair<int, int>>> to_read;
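+      /* Build a single read request covering the wanted shard extents. Attrs
+       * are only requested on the first pass, when no obc is available yet.
+       */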
+ read_request_t read_request(std::move(want),
+ op.recovery_progress.first && !op.obc,
+ op.obc
+ ? op.obc->obs.oi.size
+ : get_recovery_chunk_size());
+
int r = read_pipeline.get_min_avail_to_read_shards(
- op.hoid, want, true, false, &to_read);
+ op.hoid, true, false, read_request);
+
if (r != 0) {
- // we must have lost a recovery source
- ceph_assert(!op.recovery_progress.first);
- dout(10) << __func__ << ": canceling recovery op for obj " << op.hoid
- << dendl;
- // in crimson
- get_parent()->cancel_pull(op.hoid);
- recovery_ops.erase(op.hoid);
- return;
+ // we must have lost a recovery source
+ ceph_assert(!op.recovery_progress.first);
+ dout(10) << __func__ << ": canceling recovery op for obj " << op.hoid
+ << dendl;
+ // in crimson
+ get_parent()->cancel_pull(op.hoid);
+ recovery_ops.erase(op.hoid);
+ return;
+ }
+ if (read_request.shard_reads.empty()) {
+ ceph_assert(op.obc);
+ ceph_assert(0 == op.obc->obs.oi.size);
+ dout(10) << __func__ << "Zero size object recovery, skipping reads."
+ << op << dendl;
+ // Create an empty read result and fall through.
+ op.returned_data.emplace(&sinfo);
+ } else {
+ m->recovery_read(
+ op.hoid,
+ read_request);
+ dout(10) << __func__ << ": IDLE return " << op << dendl;
+ return;
}
- m->recovery_read(
- op.hoid,
- op.recovery_progress.data_recovered_to,
- amount,
- std::move(want),
- to_read,
- op.recovery_progress.first && !op.obc);
- op.extent_requested = make_pair(
- from,
- amount);
- dout(10) << __func__ << ": IDLE return " << op << dendl;
- return;
}
+ [[fallthrough]];
case RecoveryOp::READING: {
// read completed, start write
ceph_assert(op.xattrs.size());
- ceph_assert(op.returned_data.size());
+ ceph_assert(op.returned_data);
+ dout(20) << __func__ << ": returned_data=" << op.returned_data << dendl;
op.state = RecoveryOp::WRITING;
ObjectRecoveryProgress after_progress = op.recovery_progress;
- after_progress.data_recovered_to += op.extent_requested.second;
+ after_progress.data_recovered_to = op.returned_data->get_ro_end();
after_progress.first = false;
if (after_progress.data_recovered_to >= op.obc->obs.oi.size) {
- after_progress.data_recovered_to =
- sinfo.logical_to_next_stripe_offset(
- op.obc->obs.oi.size);
- after_progress.data_complete = true;
+ after_progress.data_complete = true;
}
- for (set<pg_shard_t>::iterator mi = op.missing_on.begin();
- mi != op.missing_on.end();
- ++mi) {
- ceph_assert(op.returned_data.count(static_cast<int>(mi->shard)));
- m->pushes[*mi].push_back(PushOp());
- PushOp &pop = m->pushes[*mi].back();
- pop.soid = op.hoid;
- pop.version = op.v;
- pop.data = op.returned_data[static_cast<int>(mi->shard)];
- dout(10) << __func__ << ": before_progress=" << op.recovery_progress
+ for (auto &&pg_shard: op.missing_on) {
+ m->pushes[pg_shard].push_back(PushOp());
+ PushOp &pop = m->pushes[pg_shard].back();
+ pop.soid = op.hoid;
+ pop.version = op.v;
+ op.returned_data->get_shard_first_buffer(pg_shard.shard, pop.data);
+ dout(10) << __func__ << ": pop shard=" << pg_shard
+ << ", oid=" << pop.soid
+ << ", before_progress=" << op.recovery_progress
<< ", after_progress=" << after_progress
<< ", pop.data.length()=" << pop.data.length()
<< ", size=" << op.obc->obs.oi.size << dendl;
- ceph_assert(
- pop.data.length() ==
- sinfo.aligned_logical_offset_to_chunk_offset(
- after_progress.data_recovered_to -
- op.recovery_progress.data_recovered_to)
- );
- if (pop.data.length())
- pop.data_included.insert(
- sinfo.aligned_logical_offset_to_chunk_offset(
- op.recovery_progress.data_recovered_to),
- pop.data.length()
- );
- if (op.recovery_progress.first) {
- pop.attrset = op.xattrs;
- }
- pop.recovery_info = op.recovery_info;
- pop.before_progress = op.recovery_progress;
- pop.after_progress = after_progress;
- if (*mi != get_parent()->primary_shard())
- // already in crimson -- junction point with PeeringState
- get_parent()->begin_peer_recover(
- *mi,
- op.hoid);
+ if (pop.data.length())
+ pop.data_included.union_insert(
+ op.returned_data->get_shard_first_offset(pg_shard.shard),
+ pop.data.length());
+ if (op.recovery_progress.first) {
+ pop.attrset = op.xattrs;
+ }
+ pop.recovery_info = op.recovery_info;
+ pop.before_progress = op.recovery_progress;
+ pop.after_progress = after_progress;
+ if (pg_shard != get_parent()->primary_shard()) {
+ // already in crimson -- junction point with PeeringState
+ get_parent()->begin_peer_recover(
+ pg_shard,
+ op.hoid);
+ }
}
- op.returned_data.clear();
+ op.returned_data.reset();
op.waiting_on_pushes = op.missing_on;
op.recovery_progress = after_progress;
dout(10) << __func__ << ": READING return " << op << dendl;
}
case RecoveryOp::WRITING: {
if (op.waiting_on_pushes.empty()) {
- if (op.recovery_progress.data_complete) {
- op.state = RecoveryOp::COMPLETE;
- for (set<pg_shard_t>::iterator i = op.missing_on.begin();
- i != op.missing_on.end();
- ++i) {
- if (*i != get_parent()->primary_shard()) {
- dout(10) << __func__ << ": on_peer_recover on " << *i
+ if (op.recovery_progress.data_complete) {
+ op.state = RecoveryOp::COMPLETE;
+ for (set<pg_shard_t>::iterator i = op.missing_on.begin();
+ i != op.missing_on.end();
+ ++i) {
+ if (*i != get_parent()->primary_shard()) {
+ dout(10) << __func__ << ": on_peer_recover on " << *i
<< ", obj " << op.hoid << dendl;
- get_parent()->on_peer_recover(
- *i,
- op.hoid,
- op.recovery_info);
- }
- }
- object_stat_sum_t stat;
- stat.num_bytes_recovered = op.recovery_info.size;
- stat.num_keys_recovered = 0; // ??? op ... omap_entries.size(); ?
- stat.num_objects_recovered = 1;
- // TODO: not in crimson yet
- if (get_parent()->pg_is_repair())
- stat.num_objects_repaired = 1;
- // pg_recovery.cc in crimson has it
- get_parent()->on_global_recover(op.hoid, stat, false);
- dout(10) << __func__ << ": WRITING return " << op << dendl;
- recovery_ops.erase(op.hoid);
- return;
- } else {
- op.state = RecoveryOp::IDLE;
- dout(10) << __func__ << ": WRITING continue " << op << dendl;
- continue;
- }
+ get_parent()->on_peer_recover(
+ *i,
+ op.hoid,
+ op.recovery_info);
+ }
+ }
+ object_stat_sum_t stat;
+ stat.num_bytes_recovered = op.recovery_info.size;
+ stat.num_keys_recovered = 0; // ??? op ... omap_entries.size(); ?
+ stat.num_objects_recovered = 1;
+ // TODO: not in crimson yet
+ if (get_parent()->pg_is_repair())
+ stat.num_objects_repaired = 1;
+ // pg_recovery.cc in crimson has it
+ get_parent()->on_global_recover(op.hoid, stat, false);
+ dout(10) << __func__ << ": WRITING return " << op << dendl;
+ recovery_ops.erase(op.hoid);
+ return;
+ } else {
+ op.state = RecoveryOp::IDLE;
+ dout(10) << __func__ << ": WRITING continue " << op << dendl;
+ continue;
+ }
}
return;
}
void ECBackend::run_recovery_op(
PGBackend::RecoveryHandle *_h,
- int priority)
-{
+ int priority) {
ceph_assert(_h);
ECRecoveryHandle &h = static_cast<ECRecoveryHandle&>(*_h);
recovery_backend.run_recovery_op(h, priority);
void ECBackend::RecoveryBackend::run_recovery_op(
ECRecoveryHandle &h,
- int priority)
-{
+ int priority) {
RecoveryMessages m;
for (list<RecoveryOp>::iterator i = h.ops.begin();
i != h.ops.end();
eversion_t v,
ObjectContextRef head,
ObjectContextRef obc,
- PGBackend::RecoveryHandle *_h)
-{
+ PGBackend::RecoveryHandle *_h) {
return recovery_backend.recover_object(hoid, v, head, obc, _h);
}
eversion_t v,
ObjectContextRef head,
ObjectContextRef obc,
- PGBackend::RecoveryHandle *_h)
-{
+ PGBackend::RecoveryHandle *_h) {
ECRecoveryHandle *h = static_cast<ECRecoveryHandle*>(_h);
h->ops.push_back(RecoveryOp());
h->ops.back().v = v;
}
h->ops.back().recovery_progress.omap_complete = true;
for (set<pg_shard_t>::const_iterator i =
- get_parent()->get_acting_recovery_backfill_shards().begin();
+ get_parent()->get_acting_recovery_backfill_shards().begin();
i != get_parent()->get_acting_recovery_backfill_shards().end();
++i) {
dout(10) << "checking " << *i << dendl;
}
bool ECBackend::can_handle_while_inactive(
- OpRequestRef _op)
-{
+ OpRequestRef _op) {
return false;
}
bool ECBackend::_handle_message(
- OpRequestRef _op)
-{
+ OpRequestRef _op) {
dout(10) << __func__ << ": " << *_op->get_req() << dendl;
int priority = _op->get_req()->get_priority();
switch (_op->get_req()->get_type()) {
MOSDECSubOpWrite *op = static_cast<MOSDECSubOpWrite*>(
_op->get_nonconst_req());
parent->maybe_preempt_replica_scrub(op->op.soid);
- handle_sub_write(op->op.from, _op, op->op, _op->pg_trace, *get_parent()->get_eclistener());
+ handle_sub_write(op->op.from, _op, op->op, _op->pg_trace,
+ *get_parent()->get_eclistener());
return true;
}
case MSG_OSD_EC_WRITE_REPLY: {
auto op = _op->get_req<MOSDPGPush>();
RecoveryMessages rm;
for (vector<PushOp>::const_iterator i = op->pushes.begin();
- i != op->pushes.end();
- ++i) {
+ i != op->pushes.end();
+ ++i) {
handle_recovery_push(*i, &rm, op->is_repair);
}
recovery_backend.dispatch_recovery_messages(rm, priority);
return true;
}
case MSG_OSD_PG_PUSH_REPLY: {
- const MOSDPGPushReply *op = static_cast<const MOSDPGPushReply *>(
+ const MOSDPGPushReply *op = static_cast<const MOSDPGPushReply*>(
_op->get_req());
RecoveryMessages rm;
for (vector<PushReplyOp>::const_iterator i = op->replies.begin();
- i != op->replies.end();
- ++i) {
+ i != op->replies.end();
+ ++i) {
recovery_backend.handle_recovery_push_reply(*i, op->from, &rm);
}
recovery_backend.dispatch_recovery_messages(rm, priority);
eversion_t version;
eversion_t last_complete;
const ZTracer::Trace trace;
+
SubWriteCommitted(
ECBackend *pg,
OpRequestRef msg,
const ZTracer::Trace &trace)
: pg(pg), msg(msg), tid(tid),
version(version), last_complete(last_complete), trace(trace) {}
+
void finish(int) override {
if (msg)
msg->mark_event("sub_op_committed");
pg->sub_write_committed(tid, version, last_complete, trace);
}
};
+
void ECBackend::sub_write_committed(
ceph_tid_t tid, eversion_t version, eversion_t last_complete,
const ZTracer::Trace &trace) {
OpRequestRef msg,
ECSubWrite &op,
const ZTracer::Trace &trace,
- ECListener&)
-{
+ ECListener &) {
if (msg) {
msg->mark_event("sub_op_started");
}
trace.event("handle_sub_write");
if (cct->_conf->bluestore_debug_inject_read_err &&
- ECInject::test_write_error3(op.soid)) {
+ ECInject::test_write_error3(op.soid)) {
ceph_abort_msg("Error inject - OSD down");
}
if (!get_parent()->pgb_is_primary())
}
if (op.backfill_or_async_recovery) {
for (set<hobject_t>::iterator i = op.temp_removed.begin();
- i != op.temp_removed.end();
- ++i) {
+ i != op.temp_removed.end();
+ ++i) {
dout(10) << __func__ << ": removing object " << *i
<< " since we won't get the transaction" << dendl;
localt.remove(
- switcher->coll,
- ghobject_t(
- *i,
- ghobject_t::NO_GEN,
- get_parent()->whoami_shard().shard));
+ switcher->coll,
+ ghobject_t(
+ *i,
+ ghobject_t::NO_GEN,
+ get_parent()->whoami_shard().shard));
}
}
switcher->clear_temp_objs(op.temp_removed);
- dout(30) << __func__ << " missing before " << get_parent()->get_log().get_missing().get_items() << dendl;
+ dout(30) << __func__ << " missing before " <<
+ get_parent()->get_log().get_missing().get_items() << dendl;
// flag set to true during async recovery
bool async = false;
pg_missing_tracker_t pmissing = get_parent()->get_local_missing();
if (pmissing.is_missing(op.soid)) {
async = true;
- dout(30) << __func__ << " is_missing " << pmissing.is_missing(op.soid) << dendl;
+ dout(30) << __func__ << " is_missing " <<
+ pmissing.is_missing(op.soid) << dendl;
for (auto &&e: op.log_entries) {
dout(30) << " add_next_event entry " << e << dendl;
get_parent()->add_local_next_event(e);
async);
if (!get_parent()->pg_is_undersized() &&
- (unsigned)get_parent()->whoami_shard().shard >= sinfo.get_k())
+ get_parent()->whoami_shard().shard >= sinfo.get_k())
op.t.set_fadvise_flag(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
localt.register_on_commit(
get_parent()->bless_context(
new SubWriteCommitted(
- this, msg, op.tid,
- op.at_version,
- get_parent()->get_info().last_complete, trace)));
+ this, msg, op.tid,
+ op.at_version,
+ get_parent()->get_info().last_complete, trace)));
vector<ObjectStore::Transaction> tls;
tls.reserve(2);
tls.push_back(std::move(op.t));
tls.push_back(std::move(localt));
get_parent()->queue_transactions(tls, msg);
- dout(30) << __func__ << " missing after" << get_parent()->get_log().get_missing().get_items() << dendl;
+ dout(30) << __func__ << " missing after" << get_parent()->get_log().
+ get_missing().
+ get_items() << dendl
+ ;
if (op.at_version != eversion_t()) {
// dummy rollforward transaction doesn't get at_version (and doesn't advance it)
get_parent()->op_applied(op.at_version);
pg_shard_t from,
const ECSubRead &op,
ECSubReadReply *reply,
- const ZTracer::Trace &trace)
-{
+ const ZTracer::Trace &trace) {
trace.event("handle sub read");
shard_id_t shard = get_parent()->whoami_shard().shard;
- for(auto i = op.to_read.begin();
- i != op.to_read.end();
- ++i) {
+ for (auto &&[hoid, to_read]: op.to_read) {
int r = 0;
- for (auto j = i->second.begin(); j != i->second.end(); ++j) {
+ for (auto &&[offset, len, flags]: to_read) {
bufferlist bl;
- if ((op.subchunks.find(i->first)->second.size() == 1) &&
- (op.subchunks.find(i->first)->second.front().second ==
- ec_impl->get_sub_chunk_count())) {
+ auto &subchunks = op.subchunks.at(hoid);
+ if ((subchunks.size() == 1) &&
+ (subchunks.front().second == ec_impl->get_sub_chunk_count())) {
dout(20) << __func__ << " case1: reading the complete chunk/shard." << dendl;
r = switcher->store->read(
- switcher->ch,
- ghobject_t(i->first, ghobject_t::NO_GEN, shard),
- j->get<0>(),
- j->get<1>(),
- bl, j->get<2>()); // Allow EIO return
+ switcher->ch,
+ ghobject_t(hoid, ghobject_t::NO_GEN, shard),
+ offset, len, bl, flags); // Allow EIO return
} else {
int subchunk_size =
sinfo.get_chunk_size() / ec_impl->get_sub_chunk_count();
<< " subchunk_size=" << subchunk_size
<< " chunk_size=" << sinfo.get_chunk_size() << dendl;
bool error = false;
- for (int m = 0; m < (int)j->get<1>() && !error;
+ for (int m = 0; m < (int)len && !error;
m += sinfo.get_chunk_size()) {
- for (auto &&k:op.subchunks.find(i->first)->second) {
+ for (auto &&k: subchunks) {
bufferlist bl0;
r = switcher->store->read(
- switcher->ch,
- ghobject_t(i->first, ghobject_t::NO_GEN, shard),
- j->get<0>() + m + (k.first)*subchunk_size,
- (k.second)*subchunk_size,
- bl0, j->get<2>());
+ switcher->ch,
+ ghobject_t(hoid, ghobject_t::NO_GEN, shard),
+ offset + m + (k.first) * subchunk_size,
+ (k.second) * subchunk_size,
+ bl0, flags);
if (r < 0) {
error = true;
break;
}
if (r < 0) {
- // if we are doing fast reads, it's possible for one of the shard
- // reads to cross paths with another update and get a (harmless)
- // ENOENT. Suppress the message to the cluster log in that case.
- if (r == -ENOENT && get_parent()->get_pool().fast_read) {
- dout(5) << __func__ << ": Error " << r
- << " reading " << i->first << ", fast read, probably ok"
+ // if we are doing fast reads, it's possible for one of the shard
+ // reads to cross paths with another update and get a (harmless)
+ // ENOENT. Suppress the message to the cluster log in that case.
+ if (r == -ENOENT && get_parent()->get_pool().fast_read) {
+ dout(5) << __func__ << ": Error " << r
+ << " reading " << hoid << ", fast read, probably ok"
<< dendl;
- } else {
- get_parent()->clog_error() << "Error " << r
- << " reading object "
- << i->first;
- dout(5) << __func__ << ": Error " << r
- << " reading " << i->first << dendl;
- }
- goto error;
+ } else {
+ get_parent()->clog_error() << "Error " << r
+ << " reading object "
+ << hoid;
+ dout(5) << __func__ << ": Error " << r
+ << " reading " << hoid << dendl;
+ }
+ goto error;
} else {
- dout(20) << __func__ << " read request=" << j->get<1>() << " r=" << r << " len=" << bl.length() << dendl;
- reply->buffers_read[i->first].push_back(
- make_pair(
- j->get<0>(),
- bl)
- );
+ dout(20) << __func__ << " read request=" << len << " r=" << r << " len="
+ << bl.length() << dendl;
+ reply->buffers_read[hoid].push_back(make_pair(offset, bl));
}
- if (!get_parent()->get_pool().allows_ecoverwrites()) {
- // This shows that we still need deep scrub because large enough files
- // are read in sections, so the digest check here won't be done here.
- // Do NOT check osd_read_eio_on_bad_digest here. We need to report
- // the state of our chunk in case other chunks could substitute.
+ if (!sinfo.supports_ec_overwrites()) {
+ // This shows that we still need deep scrub because large enough files
+ // are read in sections, so the digest check here won't be done here.
+ // Do NOT check osd_read_eio_on_bad_digest here. We need to report
+ // the state of our chunk in case other chunks could substitute.
ECUtil::HashInfoRef hinfo;
map<string, bufferlist, less<>> attrs;
- struct stat st;
- int r = object_stat(i->first, &st);
+ struct stat st;
+ int r = object_stat(hoid, &st);
+ if (r >= 0) {
+ dout(10) << __func__ << ": found on disk, size " << st.st_size << dendl;
+ r = switcher->objects_get_attrs_with_hinfo(hoid, &attrs);
+ }
if (r >= 0) {
- dout(10) << __func__ << ": found on disk, size " << st.st_size << dendl;
- r = switcher->objects_get_attrs_with_hinfo(i->first, &attrs);
- }
- if (r >= 0) {
- hinfo = unstable_hashinfo_registry.get_hash_info(i->first, false, attrs, st.st_size);
- } else {
- derr << __func__ << ": access (attrs) on " << i->first << " failed: "
+ hinfo = unstable_hashinfo_registry.get_hash_info(
+ hoid, false, attrs, st.st_size);
+ } else {
+ derr << __func__ << ": access (attrs) on " << hoid << " failed: "
<< cpp_strerror(r) << dendl;
- }
+ }
if (!hinfo) {
r = -EIO;
get_parent()->clog_error() << "Corruption detected: object "
- << i->first
- << " is missing hash_info";
- dout(5) << __func__ << ": No hinfo for " << i->first << dendl;
+ << hoid
+ << " is missing hash_info";
+ dout(5) << __func__ << ": No hinfo for " << hoid << dendl;
goto error;
}
- ceph_assert(hinfo->has_chunk_hash());
- if ((bl.length() == hinfo->get_total_chunk_size()) &&
- (j->get<0>() == 0)) {
- dout(20) << __func__ << ": Checking hash of " << i->first << dendl;
- bufferhash h(-1);
- h << bl;
- if (h.digest() != hinfo->get_chunk_hash(shard)) {
- get_parent()->clog_error() << "Bad hash for " << i->first << " digest 0x"
- << hex << h.digest() << " expected 0x" << hinfo->get_chunk_hash(shard) << dec;
- dout(5) << __func__ << ": Bad hash for " << i->first << " digest 0x"
- << hex << h.digest() << " expected 0x" << hinfo->get_chunk_hash(shard) << dec << dendl;
- r = -EIO;
- goto error;
- }
- }
+ ceph_assert(hinfo->has_chunk_hash());
+ if ((bl.length() == hinfo->get_total_chunk_size()) &&
+ (offset == 0)) {
+ dout(20) << __func__ << ": Checking hash of " << hoid << dendl;
+ bufferhash h(-1);
+ h << bl;
+ if (h.digest() != hinfo->get_chunk_hash(shard)) {
+ get_parent()->clog_error() << "Bad hash for " << hoid <<
+ " digest 0x"
+ << hex << h.digest() << " expected 0x" << hinfo->
+ get_chunk_hash(shard) << dec;
+ dout(5) << __func__ << ": Bad hash for " << hoid << " digest 0x"
+ << hex << h.digest() << " expected 0x" << hinfo->
+get_chunk_hash(shard) << dec << dendl;
+ r = -EIO;
+ goto error;
+ }
+ }
}
}
continue;
-error:
+ error:
// Do NOT check osd_read_eio_on_bad_digest here. We need to report
// the state of our chunk in case other chunks could substitute.
- reply->buffers_read.erase(i->first);
- reply->errors[i->first] = r;
+ reply->buffers_read.erase(hoid);
+ reply->errors[hoid] = r;
}
for (set<hobject_t>::iterator i = op.attrs_to_read.begin();
i != op.attrs_to_read.end();
++i) {
dout(10) << __func__ << ": fulfilling attr request on "
<< *i << dendl;
- if (reply->errors.count(*i))
+ if (reply->errors.contains(*i))
continue;
int r = switcher->store->getattrs(
switcher->ch,
ghobject_t(
- *i, ghobject_t::NO_GEN, shard),
+ *i, ghobject_t::NO_GEN, shard),
reply->attrs_read[*i]);
if (r < 0) {
// If we read error, we should not return the attrs too.
void ECBackend::handle_sub_write_reply(
pg_shard_t from,
- const ECSubWriteReply &op,
- const ZTracer::Trace &trace)
-{
- map<ceph_tid_t, RMWPipeline::OpRef>::iterator i = rmw_pipeline.tid_to_op_map.find(op.tid);
- ceph_assert(i != rmw_pipeline.tid_to_op_map.end());
- if (op.committed) {
+ const ECSubWriteReply &ec_write_reply_op,
+ const ZTracer::Trace &trace) {
+ RMWPipeline::OpRef &op = rmw_pipeline.tid_to_op_map.at(ec_write_reply_op.tid);
+ if (ec_write_reply_op.committed) {
trace.event("sub write committed");
- ceph_assert(i->second->pending_commit.count(from));
- i->second->pending_commit.erase(from);
+ ceph_assert(op->pending_commits > 0);
+ op->pending_commits--;
if (from != get_parent()->whoami_shard()) {
- get_parent()->update_peer_last_complete_ondisk(from, op.last_complete);
+ get_parent()->update_peer_last_complete_ondisk(
+ from, ec_write_reply_op.last_complete);
}
}
- if (op.applied) {
- trace.event("sub write applied");
- ceph_assert(i->second->pending_apply.count(from));
- i->second->pending_apply.erase(from);
- }
- if (i->second->pending_commit.empty() &&
- i->second->on_all_commit &&
- // also wait for apply, to preserve ordering with luminous peers.
- i->second->pending_apply.empty()) {
- dout(10) << __func__ << " Calling on_all_commit on " << i->second << dendl;
- i->second->on_all_commit->complete(0);
- i->second->on_all_commit = 0;
- i->second->trace.event("ec write all committed");
- }
if (cct->_conf->bluestore_debug_inject_read_err &&
- (i->second->pending_commit.size() == 1) &&
- ECInject::test_write_error2(i->second->hoid)) {
+ (op->pending_commits == 1) &&
+ ECInject::test_write_error2(op->hoid)) {
std::string cmd =
- "{ \"prefix\": \"osd down\", \"ids\": [\"" + std::to_string( get_parent()->whoami() ) + "\"] }";
+ "{ \"prefix\": \"osd down\", \"ids\": [\"" + std::to_string(
+ get_parent()->whoami()) + "\"] }";
vector<std::string> vcmd{cmd};
dout(0) << __func__ << " Error inject - marking OSD down" << dendl;
get_parent()->start_mon_command(vcmd, {}, nullptr, nullptr, nullptr);
}
- rmw_pipeline.check_ops();
+
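+  // Once every shard has committed, let the RMW pipeline try to finish the op.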
+ if (op->pending_commits == 0) {
+ rmw_pipeline.try_finish_rmw();
+ }
}
void ECBackend::handle_sub_read_reply(
pg_shard_t from,
ECSubReadReply &op,
- const ZTracer::Trace &trace)
-{
+ const ZTracer::Trace &trace) {
trace.event("ec sub read reply");
dout(10) << __func__ << ": reply " << op << dendl;
- map<ceph_tid_t, ReadOp>::iterator iter = read_pipeline.tid_to_read_map.find(op.tid);
+ map<ceph_tid_t, ReadOp>::iterator iter = read_pipeline.tid_to_read_map.
+ find(op.tid);
if (iter == read_pipeline.tid_to_read_map.end()) {
//canceled
dout(20) << __func__ << ": dropped " << op << dendl;
ReadOp &rop = iter->second;
if (cct->_conf->bluestore_debug_inject_read_err) {
for (auto i = op.buffers_read.begin();
- i != op.buffers_read.end();
- ++i) {
- if (ECInject::test_read_error0(ghobject_t(i->first, ghobject_t::NO_GEN, op.from.shard))) {
- dout(0) << __func__ << " Error inject - EIO error for shard " << op.from.shard << dendl;
- op.buffers_read.erase(i->first);
- op.attrs_read.erase(i->first);
- op.errors[i->first] = -EIO;
+ i != op.buffers_read.end();
+ ++i) {
+ if (ECInject::test_read_error0(
+ ghobject_t(i->first, ghobject_t::NO_GEN, op.from.shard))) {
+        dout(0) << __func__ << " Error inject - EIO error for shard "
+                << op.from.shard << dendl;
+ op.buffers_read.erase(i->first);
+ op.attrs_read.erase(i->first);
+ op.errors[i->first] = -EIO;
+ rop.debug_log.emplace_back(ECUtil::INJECT_EIO, op.from);
}
-
}
}
- for (auto i = op.buffers_read.begin();
- i != op.buffers_read.end();
- ++i) {
- ceph_assert(!op.errors.count(i->first)); // If attribute error we better not have sent a buffer
- if (!rop.to_read.count(i->first)) {
+ for (auto &&[hoid, offset_buffer_map]: op.buffers_read) {
+ ceph_assert(!op.errors.contains(hoid));
+ // If attribute error we better not have sent a buffer
+ if (!rop.to_read.contains(hoid)) {
+ rop.debug_log.emplace_back(ECUtil::CANCELLED, op.from);
+
// We canceled this read! @see filter_read_op
dout(20) << __func__ << " to_read skipping" << dendl;
continue;
}
- list<ec_align_t>::const_iterator req_iter =
- rop.to_read.find(i->first)->second.to_read.begin();
- list<
- boost::tuple<
- uint64_t, uint64_t, map<pg_shard_t, bufferlist> > >::iterator riter =
- rop.complete[i->first].returned.begin();
- for (list<pair<uint64_t, bufferlist> >::iterator j = i->second.begin();
- j != i->second.end();
- ++j, ++req_iter, ++riter) {
- ceph_assert(req_iter != rop.to_read.find(i->first)->second.to_read.end());
- ceph_assert(riter != rop.complete[i->first].returned.end());
- pair<uint64_t, uint64_t> aligned =
- sinfo.chunk_aligned_offset_len_to_chunk(
- make_pair(req_iter->offset, req_iter->size));
- ceph_assert(aligned.first == j->first);
- riter->get<2>()[from] = std::move(j->second);
+
+ if (!rop.complete.contains(hoid)) {
+ rop.complete.emplace(hoid, &sinfo);
+ }
+
+ auto &buffers_read = rop.complete.at(hoid).buffers_read;
+ for (auto &&[offset, buffer_list]: offset_buffer_map) {
+ buffers_read.insert_in_shard(from.shard, offset, buffer_list);
}
+ rop.debug_log.emplace_back(ECUtil::READ_DONE, op.from, buffers_read);
}
- for (auto i = op.attrs_read.begin();
- i != op.attrs_read.end();
- ++i) {
- ceph_assert(!op.errors.count(i->first)); // if read error better not have sent an attribute
- if (!rop.to_read.count(i->first)) {
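+  /* Record, per shard, which requested extents have now been processed, so the
+   * completeness check below can tell whether a decode is possible.
+   */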
+ for (auto &&[hoid, req]: rop.to_read) {
+ if (!rop.complete.contains(hoid)) {
+ rop.complete.emplace(hoid, &sinfo);
+ }
+ auto &complete = rop.complete.at(hoid);
+ for (auto &&[shard, read]: std::as_const(req.shard_reads)) {
+ if (complete.errors.contains(read.pg_shard)) continue;
+
+ complete.processed_read_requests[shard].union_of(read.extents);
+
+ if (!rop.complete.contains(hoid) ||
+ !complete.buffers_read.contains(shard)) {
+ if (!read.extents.empty()) continue; // Complete the actual read first.
+
+ // If we are first here, populate the completion.
+ if (!rop.complete.contains(hoid)) {
+ rop.complete.emplace(hoid, read_result_t(&sinfo));
+ }
+ }
+ }
+ }
+ for (auto &&[hoid, attr]: op.attrs_read) {
+ ceph_assert(!op.errors.count(hoid));
+ // if read error better not have sent an attribute
+ if (!rop.to_read.count(hoid)) {
// We canceled this read! @see filter_read_op
dout(20) << __func__ << " to_read skipping" << dendl;
continue;
}
- rop.complete[i->first].attrs.emplace();
- (*(rop.complete[i->first].attrs)).swap(i->second);
+ if (!rop.complete.contains(hoid)) {
+ rop.complete.emplace(hoid, &sinfo);
+ }
+ rop.complete.at(hoid).attrs.emplace();
+ (*(rop.complete.at(hoid).attrs)).swap(attr);
}
- for (auto i = op.errors.begin();
- i != op.errors.end();
- ++i) {
- rop.complete[i->first].errors.insert(
- make_pair(
- from,
- i->second));
- dout(20) << __func__ << " shard=" << from << " error=" << i->second << dendl;
+ for (auto &&[hoid, err]: op.errors) {
+ if (!rop.complete.contains(hoid)) {
+ rop.complete.emplace(hoid, &sinfo);
+ }
+ auto &complete = rop.complete.at(hoid);
+ complete.errors.emplace(from, err);
+ rop.debug_log.emplace_back(ECUtil::ERROR, op.from, complete.buffers_read);
+ complete.buffers_read.erase_shard(from.shard);
+ complete.processed_read_requests.erase(from.shard);
+ dout(20) << __func__ << " shard=" << from << " error=" << err << dendl;
}
- map<pg_shard_t, set<ceph_tid_t> >::iterator siter =
- read_pipeline.shard_to_read_map.find(from);
+ map<pg_shard_t, set<ceph_tid_t>>::iterator siter =
+ read_pipeline.shard_to_read_map.find(from);
ceph_assert(siter != read_pipeline.shard_to_read_map.end());
ceph_assert(siter->second.count(op.tid));
siter->second.erase(op.tid);
// For redundant reads check for completion as each shard comes in,
// or in a non-recovery read check for completion once all the shards read.
if (rop.do_redundant_reads || rop.in_progress.empty()) {
- for (map<hobject_t, read_result_t>::const_iterator iter =
- rop.complete.begin();
- iter != rop.complete.end();
- ++iter) {
- set<int> have;
- for (map<pg_shard_t, bufferlist>::const_iterator j =
- iter->second.returned.front().get<2>().begin();
- j != iter->second.returned.front().get<2>().end();
- ++j) {
- have.insert(static_cast<int>(j->first.shard));
- dout(20) << __func__ << " have shard=" << j->first.shard << dendl;
- }
- map<int, vector<pair<int, int>>> dummy_minimum;
- int err;
- if ((err = ec_impl->minimum_to_decode(rop.want_to_read[iter->first], have, &dummy_minimum)) < 0) {
- dout(20) << __func__ << " minimum_to_decode failed" << dendl;
+ for (auto &&[oid, read_result]: rop.complete) {
+ shard_id_set have;
+ read_result.processed_read_requests.populate_shard_id_set(have);
+ shard_id_set dummy_minimum;
+ shard_id_set want_to_read;
+ rop.to_read.at(oid).shard_want_to_read.
+ populate_shard_id_set(want_to_read);
+
+ int err = ec_impl->minimum_to_decode(want_to_read, have, dummy_minimum,
+ nullptr);
+ if (err) {
+ dout(20) << __func__ << " minimum_to_decode failed" << dendl;
if (rop.in_progress.empty()) {
- // If we don't have enough copies, try other pg_shard_ts if available.
- // During recovery there may be multiple osds with copies of the same shard,
- // so getting EIO from one may result in multiple passes through this code path.
- if (!rop.do_redundant_reads) {
- int r = read_pipeline.send_all_remaining_reads(iter->first, rop);
- if (r == 0) {
- // We changed the rop's to_read and not incrementing is_complete
- need_resend = true;
- continue;
- }
- // Couldn't read any additional shards so handle as completed with errors
- }
- // We don't want to confuse clients / RBD with objectstore error
- // values in particular ENOENT. We may have different error returns
- // from different shards, so we'll return minimum_to_decode() error
- // (usually EIO) to reader. It is likely an error here is due to a
- // damaged pg.
- rop.complete[iter->first].r = err;
- ++is_complete;
- }
- } else {
- ceph_assert(rop.complete[iter->first].r == 0);
- if (!rop.complete[iter->first].errors.empty()) {
- if (cct->_conf->osd_read_ec_check_for_errors) {
- dout(10) << __func__ << ": Not ignoring errors, use one shard err=" << err << dendl;
- err = rop.complete[iter->first].errors.begin()->second;
- rop.complete[iter->first].r = err;
- } else {
- get_parent()->clog_warn() << "Error(s) ignored for "
- << iter->first << " enough copies available";
- dout(10) << __func__ << " Error(s) ignored for " << iter->first
+ // If we don't have enough copies, try other pg_shard_ts if available.
+ // During recovery there may be multiple osds with copies of the same shard,
+ // so getting EIO from one may result in multiple passes through this code path.
+ if (!rop.do_redundant_reads) {
+ rop.debug_log.emplace_back(ECUtil::REQUEST_MISSING, op.from);
+ int r = read_pipeline.send_all_remaining_reads(oid, rop);
+ if (r == 0) {
+ // We found that new reads are required to do a decode.
+ need_resend = true;
+ continue;
+ } else if (r > 0) {
+ // No new reads were requested. This means that some parity
+ // shards can be assumed to be zeros.
+ err = 0;
+ }
+ // else insufficient shards are available, keep the errors.
+ }
+ // Couldn't read any additional shards so handle as completed with errors
+ // We don't want to confuse clients / RBD with objectstore error
+ // values in particular ENOENT. We may have different error returns
+ // from different shards, so we'll return minimum_to_decode() error
+ // (usually EIO) to reader. It is likely an error here is due to a
+ // damaged pg.
+ rop.complete.at(oid).r = err;
+ ++is_complete;
+ }
+ }
+
+ if (!err) {
+ ceph_assert(rop.complete.at(oid).r == 0);
+ if (!rop.complete.at(oid).errors.empty()) {
+ if (cct->_conf->osd_read_ec_check_for_errors) {
+ rop.debug_log.emplace_back(ECUtil::COMPLETE_ERROR, op.from);
+ dout(10) << __func__ << ": Not ignoring errors, use one shard" << dendl;
+ err = rop.complete.at(oid).errors.begin()->second;
+ rop.complete.at(oid).r = err;
+ } else {
+ get_parent()->clog_warn() << "Error(s) ignored for "
+                                    << oid << " enough copies available";
+          dout(10) << __func__ << " Error(s) ignored for " << oid
<< " enough copies available" << dendl;
- rop.complete[iter->first].errors.clear();
- }
- }
- // avoid re-read for completed object as we may send remaining reads for uncopmpleted objects
- rop.to_read.at(iter->first).need.clear();
- rop.to_read.at(iter->first).want_attrs = false;
- ++is_complete;
+ rop.debug_log.emplace_back(ECUtil::ERROR_CLEAR, op.from);
+ rop.complete.at(oid).errors.clear();
+ }
+ }
+          // avoid re-read for completed object as we may send remaining reads for uncompleted objects
+ rop.to_read.at(oid).shard_reads.clear();
+ rop.to_read.at(oid).want_attrs = false;
+ ++is_complete;
}
}
}
if (need_resend) {
read_pipeline.do_read_op(rop);
- } else if (rop.in_progress.empty() ||
+ } else if (rop.in_progress.empty() ||
is_complete == rop.complete.size()) {
dout(20) << __func__ << " Complete: " << rop << dendl;
rop.trace.event("ec read complete");
- read_pipeline.complete_read_op(rop);
+ rop.debug_log.emplace_back(ECUtil::COMPLETE, op.from);
+ read_pipeline.complete_read_op(std::move(rop));
} else {
dout(10) << __func__ << " readop not complete: " << rop << dendl;
}
}
-void ECBackend::check_recovery_sources(const OSDMapRef& osdmap)
-{
- struct FinishReadOp : public GenContext<ThreadPool::TPHandle&> {
- ECCommon::ReadPipeline& read_pipeline;
+void ECBackend::check_recovery_sources(const OSDMapRef &osdmap) {
+ struct FinishReadOp : public GenContext<ThreadPool::TPHandle&> {
+ ECCommon::ReadPipeline &read_pipeline;
ceph_tid_t tid;
- FinishReadOp(ECCommon::ReadPipeline& read_pipeline, ceph_tid_t tid)
+
+ FinishReadOp(ECCommon::ReadPipeline &read_pipeline, ceph_tid_t tid)
: read_pipeline(read_pipeline), tid(tid) {}
- void finish(ThreadPool::TPHandle&) override {
+
+ void finish(ThreadPool::TPHandle &) override {
auto ropiter = read_pipeline.tid_to_read_map.find(tid);
ceph_assert(ropiter != read_pipeline.tid_to_read_map.end());
- read_pipeline.complete_read_op(ropiter->second);
+ read_pipeline.complete_read_op(std::move(ropiter->second));
}
};
read_pipeline.check_recovery_sources(
osdmap,
- [this] (const hobject_t& obj) {
+ [this](const hobject_t &obj) {
recovery_backend.recovery_ops.erase(obj);
},
- [this] (const ReadOp& op) {
+ [this](const ReadOp &op) {
get_parent()->schedule_recovery_work(
get_parent()->bless_unlocked_gencontext(
new FinishReadOp(read_pipeline, op.tid)),
});
}
-void ECBackend::on_change()
-{
+void ECBackend::on_change() {
rmw_pipeline.on_change();
read_pipeline.on_change();
+ rmw_pipeline.on_change2();
clear_recovery_state();
}
-void ECBackend::clear_recovery_state()
-{
+void ECBackend::clear_recovery_state() {
recovery_backend.recovery_ops.clear();
}
-void ECBackend::dump_recovery_info(Formatter *f) const
-{
+void ECBackend::dump_recovery_info(Formatter *f) const {
f->open_array_section("recovery_ops");
- for (map<hobject_t, RecoveryBackend::RecoveryOp>::const_iterator i = recovery_backend.recovery_ops.begin();
+ for (map<hobject_t, RecoveryBackend::RecoveryOp>::const_iterator i =
+ recovery_backend.recovery_ops.begin();
i != recovery_backend.recovery_ops.end();
++i) {
f->open_object_section("op");
}
f->close_section();
f->open_array_section("read_ops");
- for (map<ceph_tid_t, ReadOp>::const_iterator i = read_pipeline.tid_to_read_map.begin();
+ for (map<ceph_tid_t, ReadOp>::const_iterator i = read_pipeline.tid_to_read_map
+ .begin();
i != read_pipeline.tid_to_read_map.end();
++i) {
f->open_object_section("read_op");
PGTransactionUPtr t;
void generate_transactions(
- ceph::ErasureCodeInterfaceRef &ecimpl,
- pg_t pgid,
- const ECUtil::stripe_info_t &sinfo,
- std::map<hobject_t,extent_map> *written,
- std::map<shard_id_t, ObjectStore::Transaction> *transactions,
- DoutPrefixProvider *dpp,
- const ceph_release_t require_osd_release) final
- {
+ ceph::ErasureCodeInterfaceRef &ec_impl,
+ pg_t pgid,
+ const ECUtil::stripe_info_t &sinfo,
+ map<hobject_t, ECUtil::shard_extent_map_t> *written,
+ shard_id_map<ObjectStore::Transaction> *transactions,
+ DoutPrefixProvider *dpp,
+ const OSDMapRef &osdmap) final {
assert(t);
ECTransaction::generate_transactions(
t.get(),
plan,
- ecimpl,
+ ec_impl,
pgid,
sinfo,
- remote_read_result,
+ remote_shard_extent_map,
log_entries,
written,
transactions,
&temp_added,
&temp_cleared,
dpp,
- require_osd_release);
+ osdmap);
}
- template <typename F>
- static ECTransaction::WritePlan get_write_plan(
- const ECUtil::stripe_info_t &sinfo,
- PGTransaction& t,
- F &&get_hinfo,
- DoutPrefixProvider *dpp)
- {
- return ECTransaction::get_write_plan(
- sinfo,
- t,
- std::forward<F>(get_hinfo),
- dpp);
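+  /* Shards whose transaction ended up empty are skipped entirely; every shard
+   * that is actually written is added to pending_roll_forward.
+   */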
+ bool skip_transaction(
+ std::set<shard_id_t> &pending_roll_forward,
+ shard_id_t shard,
+ ceph::os::Transaction &transaction) final {
+ if (transaction.empty()) {
+ return true;
+ }
+ pending_roll_forward.insert(shard);
+ return false;
}
};
int,
map<string, bufferlist, less<>>,
size_t
-> ECBackend::get_attrs_n_size_from_disk(const hobject_t& hoid)
-{
+> ECBackend::get_attrs_n_size_from_disk(const hobject_t &hoid) {
struct stat st;
if (int r = object_stat(hoid, &st); r < 0) {
dout(10) << __func__ << ": stat error " << r << " on" << hoid << dendl;
- return { r, {}, 0 };
+ return {r, {}, 0};
}
map<string, bufferlist, less<>> real_attrs;
if (int r = switcher->objects_get_attrs_with_hinfo(hoid, &real_attrs); r < 0) {
dout(10) << __func__ << ": get attr error " << r << " on" << hoid << dendl;
- return { r, {}, 0 };
+ return {r, {}, 0};
}
- return { 0, real_attrs, st.st_size };
+ return {0, real_attrs, st.st_size};
+}
+
+ECUtil::HashInfoRef ECBackend::get_hinfo_from_disk(hobject_t oid) {
+ auto [r, attrs, size] = get_attrs_n_size_from_disk(oid);
+ ceph_assert(r >= 0 || r == -ENOENT);
+ ECUtil::HashInfoRef hinfo = unstable_hashinfo_registry.get_hash_info(
+ oid, true, attrs, size);
+ return hinfo;
+}
+
+std::optional<object_info_t> ECBackend::get_object_info_from_obc(
+ ObjectContextRef &obc) {
+ std::optional<object_info_t> ret;
+
+ auto attr_cache = obc->attr_cache;
+ if (!attr_cache.contains(OI_ATTR))
+ return ret;
+
+ ret.emplace(attr_cache.at(OI_ATTR));
+ return ret;
}
void ECBackend::submit_transaction(
PGTransactionUPtr &&t,
const eversion_t &trim_to,
const eversion_t &pg_committed_to,
- vector<pg_log_entry_t>&& log_entries,
+ vector<pg_log_entry_t> &&log_entries,
std::optional<pg_hit_set_history_t> &hset_history,
Context *on_all_commit,
ceph_tid_t tid,
osd_reqid_t reqid,
OpRequestRef client_op
- )
-{
- auto op = std::make_unique<ECClassicalOp>();
+) {
+ auto op = std::make_shared<ECClassicalOp>();
+ auto obc_map = t->obc_map;
op->t = std::move(t);
op->hoid = hoid;
op->delta_stats = delta_stats;
/* We update PeeringState::pg_committed_to via the callback
* invoked from ECBackend::handle_sub_write_reply immediately
* before updating rmw_pipeline.commited_to via
- * rmw_pipeline.check_ops()->try_finish_rmw(), so these will
+ * rmw_pipeline.check_ops()->finish_rmw(), so these will
* *usually* match. However, the PrimaryLogPG::submit_log_entries
* pathway can perform an out-of-band log update which updates
* PeeringState::pg_committed_to independently. Thus, the value
op->tid = tid;
op->reqid = reqid;
op->client_op = client_op;
+ op->pipeline = &rmw_pipeline;
if (client_op) {
op->trace = client_op->pg_trace;
}
- op->plan = op->get_write_plan(
- sinfo,
- *(op->t),
- [&](const hobject_t &i) {
- dout(10) << "submit_transaction: obtaining hash info for get_write_plan" << dendl;
- ECUtil::HashInfoRef ref;
- if (auto [r, attrs, size] = get_attrs_n_size_from_disk(i); r >= 0 || r == -ENOENT) {
- ref = unstable_hashinfo_registry.get_hash_info(
- i,
- true,
- attrs, //op->t->obc_map[hoid]->attr_cache,
- size); //op->t->obc_map[hoid]->obs.oi.size);
+ ECTransaction::WritePlan &plans = op->plan;
+
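+  /* Walk every object touched by the transaction and build a per-object
+   * write plan. Pools without EC overwrite support also need the legacy
+   * hash info from disk; operations with a source object additionally
+   * capture the source's hash info (again only for non-overwrite pools)
+   * and, except for renames, its object_info.
+   */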
+ op->t->safe_create_traverse(
+ [&](std::pair<const hobject_t, PGTransaction::ObjectOperation> &i) {
+ const auto &[oid, inner_op] = i;
+ ECUtil::HashInfoRef shinfo;
+ auto &obc = obc_map.at(oid);
+ object_info_t oi = obc->obs.oi;
+ std::optional<object_info_t> soi;
+ ECUtil::HashInfoRef hinfo;
+
+ if (!sinfo.supports_ec_overwrites()) {
+ hinfo = get_hinfo_from_disk(oid);
}
- if (!ref) {
- derr << __func__ << ": get_hash_info(" << i << ")"
- << " returned a null pointer and there is no "
- << " way to recover from such an error in this "
- << " context" << dendl;
- ceph_abort();
+
+ hobject_t source;
+ if (inner_op.has_source(&source)) {
+ if (!sinfo.supports_ec_overwrites()) {
+ shinfo = get_hinfo_from_disk(source);
+ }
+ if (!inner_op.is_rename()) {
+ soi = get_object_info_from_obc(obc_map.at(source));
+ }
}
- return ref;
- },
- get_parent()->get_dpp());
- dout(10) << __func__ << ": op " << *op << " starting" << dendl;
+
+ uint64_t old_object_size = 0;
+ bool object_in_cache = false;
+ if (rmw_pipeline.extent_cache.contains_object(oid)) {
+ /* We have a valid extent cache for this object. If we need to read, we
+ * need to behave as if the object is already the size projected by the
+ * extent cache, or we may not read enough data.
+ */
+ old_object_size = rmw_pipeline.extent_cache.get_projected_size(oid);
+ object_in_cache = true;
+ } else {
+ std::optional<object_info_t> old_oi = get_object_info_from_obc(obc);
+ if (old_oi && !inner_op.delete_first) {
+ old_object_size = old_oi->size;
+ }
+ }
+
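+      /* The planner needs to know which shards are currently readable and
+       * which must be written; both feed into the per-object write plan
+       * constructed below.
+       */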
+ auto [readable_shards, writable_shards] =
+ read_pipeline.get_readable_writable_shard_id_sets();
+ ECTransaction::WritePlanObj plan(oid, inner_op, sinfo, readable_shards,
+ writable_shards,
+ object_in_cache, old_object_size,
+ oi, soi, std::move(hinfo),
+ std::move(shinfo),
+ rmw_pipeline.ec_pdw_write_mode);
+
+ if (plan.to_read) plans.want_read = true;
+ plans.plans.emplace_back(std::move(plan));
+ });
+ ldpp_dout(get_parent()->get_dpp(), 20) << __func__
+ << " plans=" << plans
+ << dendl;
rmw_pipeline.start_rmw(std::move(op));
}
uint64_t off,
uint64_t len,
uint32_t op_flags,
- bufferlist *bl)
-{
+ bufferlist *bl) {
return -EOPNOTSUPP;
}
const list<pair<ec_align_t,
pair<bufferlist*, Context*>>> &to_read,
Context *on_complete,
- bool fast_read)
-{
- map<hobject_t,std::list<ec_align_t>> reads;
+ bool fast_read) {
+ map<hobject_t, std::list<ec_align_t>> reads;
uint32_t flags = 0;
extent_set es;
- for (const auto& [read, ctx] : to_read) {
+ for (const auto &[read, ctx]: to_read) {
pair<uint64_t, uint64_t> tmp;
if (!cct->_conf->osd_ec_partial_reads || fast_read) {
- tmp = sinfo.offset_len_to_stripe_bounds(make_pair(read.offset, read.size));
+ tmp = sinfo.ro_offset_len_to_stripe_ro_offset_len(read.offset, read.size);
} else {
- tmp = sinfo.offset_len_to_chunk_bounds(make_pair(read.offset, read.size));
+ tmp.first = read.offset;
+ tmp.second = read.size;
}
es.union_insert(tmp.first, tmp.second);
flags |= read.flags;
if (!es.empty()) {
auto &offsets = reads[hoid];
- for (auto j = es.begin();
- j != es.end();
- ++j) {
- offsets.emplace_back(ec_align_t{j.get_start(), j.get_len(), flags});
+ for (auto [off, len]: es) {
+ offsets.emplace_back(ec_align_t{off, len, flags});
}
}
ECBackend *ec;
hobject_t hoid;
list<pair<ec_align_t,
- pair<bufferlist*, Context*> > > to_read;
+ pair<bufferlist*, Context*>>> to_read;
unique_ptr<Context> on_complete;
- cb(const cb&) = delete;
+ cb(const cb &) = delete;
cb(cb &&) = default;
+
cb(ECBackend *ec,
const hobject_t &hoid,
const list<pair<ec_align_t,
- pair<bufferlist*, Context*> > > &to_read,
+ pair<bufferlist*, Context*>>> &to_read,
Context *on_complete)
: ec(ec),
- hoid(hoid),
- to_read(to_read),
- on_complete(on_complete) {}
+ hoid(hoid),
+ to_read(to_read),
+ on_complete(on_complete) {}
+
void operator()(ECCommon::ec_extents_t &&results) {
auto dpp = ec->get_parent()->get_dpp();
ldpp_dout(dpp, 20) << "objects_read_async_cb: got: " << results
<< dendl;
- ldpp_dout(dpp, 20) << "objects_read_async_cb: cache: " << ec->rmw_pipeline.cache
- << dendl;
- auto &got = results[hoid];
+ auto &got = results.at(hoid);
int r = 0;
- for (auto &&read: to_read) {
- if (got.err < 0) {
- // error handling
- if (read.second.second) {
- read.second.second->complete(got.err);
- }
- if (r == 0)
- r = got.err;
- } else {
- ceph_assert(read.second.first);
- uint64_t offset = read.first.offset;
- uint64_t length = read.first.size;
- auto range = got.emap.get_containing_range(offset, length);
- uint64_t range_offset = range.first.get_off();
- uint64_t range_length = range.first.get_len();
- ceph_assert(range.first != range.second);
- ceph_assert(range_offset <= offset);
+ for (auto &&[read, result]: to_read) {
+ auto &&[bufs, ctx] = result;
+ if (got.err < 0) {
+ // error handling
+ if (ctx) {
+ ctx->complete(got.err);
+ }
+ if (r == 0)
+ r = got.err;
+ } else {
+ ceph_assert(bufs);
+ uint64_t offset = read.offset;
+ uint64_t length = read.size;
+ auto range = got.emap.get_containing_range(offset, length);
+ uint64_t range_offset = range.first.get_off();
+ uint64_t range_length = range.first.get_len();
+ ceph_assert(range.first != range.second);
+ ceph_assert(range_offset <= offset);
ldpp_dout(dpp, 20) << "offset: " << offset << dendl;
ldpp_dout(dpp, 20) << "range offset: " << range_offset << dendl;
ldpp_dout(dpp, 20) << "length: " << length << dendl;
ldpp_dout(dpp, 20) << "range length: " << range_length << dendl;
- ceph_assert(offset + length <= range_offset + range_length);
- read.second.first->substr_of(
- range.first.get_val(),
- offset - range_offset,
- length);
- if (read.second.second) {
- read.second.second->complete(length);
- read.second.second = nullptr;
- }
- }
+ ceph_assert((offset + length) <= (range_offset + range_length));
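+        // Copy out exactly the requested sub-range of the containing extent.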
+ bufs->substr_of(
+ range.first.get_val(),
+ offset - range_offset,
+ length);
+ if (ctx) {
+ ctx->complete(length);
+ ctx = nullptr;
+ }
+ }
}
to_read.clear();
if (on_complete) {
- on_complete.release()->complete(r);
+ on_complete.release()->complete(r);
}
}
+
~cb() {
for (auto &&i: to_read) {
- delete i.second.second;
+ delete i.second.second;
}
to_read.clear();
}
objects_read_and_reconstruct(
reads,
fast_read,
+ object_size,
make_gen_lambda_context<
- ECCommon::ec_extents_t &&, cb>(
- cb(this,
- hoid,
- to_read,
- on_complete)));
+ ECCommon::ec_extents_t&&, cb>(
+ cb(this,
+ hoid,
+ to_read,
+ on_complete)));
}
void ECBackend::objects_read_and_reconstruct(
- const map<hobject_t,
- std::list<ec_align_t>
- > &reads,
+ const map<hobject_t, std::list<ec_align_t>> &reads,
bool fast_read,
- GenContextURef<ECCommon::ec_extents_t &&> &&func)
-{
+ uint64_t object_size,
+ GenContextURef<ECCommon::ec_extents_t&&> &&func) {
return read_pipeline.objects_read_and_reconstruct(
- reads, fast_read, std::move(func));
+ reads, fast_read, object_size, std::move(func));
+}
+
+void ECBackend::objects_read_and_reconstruct_for_rmw(
+ map<hobject_t, read_request_t> &&to_read,
+ GenContextURef<ECCommon::ec_extents_t&&> &&func) {
+ return read_pipeline.objects_read_and_reconstruct_for_rmw(
+ std::move(to_read), std::move(func));
}
void ECBackend::kick_reads() {
int ECBackend::object_stat(
const hobject_t &hoid,
- struct stat* st)
-{
+ struct stat *st) {
int r = switcher->store->stat(
switcher->ch,
ghobject_t{hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard},
int ECBackend::objects_get_attrs(
const hobject_t &hoid,
- map<string, bufferlist, less<>> *out)
-{
+ map<string, bufferlist, less<>> *out) {
for (map<string, bufferlist>::iterator i = out->begin();
i != out->end();
- ) {
+ ) {
if (ECUtil::is_hinfo_key_string(i->first))
out->erase(i++);
else
const hobject_t &poid,
ScrubMap &map,
ScrubMapBuilder &pos,
- ScrubMap::object &o)
-{
+ ScrubMap::object &o) {
dout(10) << __func__ << " " << poid << " pos " << pos << dendl;
int r;
uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
- CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
- CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE;
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
+ CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE;
utime_t sleeptime;
sleeptime.set_from_double(cct->_conf->osd_debug_deep_scrub_sleep);
return -EINPROGRESS;
}
- ECUtil::HashInfoRef hinfo = unstable_hashinfo_registry.get_hash_info(poid, false, o.attrs, o.size);
+ ECUtil::HashInfoRef hinfo = unstable_hashinfo_registry.get_hash_info(
+ poid, false, o.attrs, o.size);
if (!hinfo) {
dout(0) << "_scan_list " << poid << " could not retrieve hash info" << dendl;
o.read_error = true;
o.digest_present = false;
return 0;
} else {
- if (!get_parent()->get_pool().allows_ecoverwrites()) {
+ if (!sinfo.supports_ec_overwrites()) {
if (!hinfo->has_chunk_hash()) {
dout(0) << "_scan_list " << poid << " got invalid hash info" << dendl;
o.ec_size_mismatch = true;
return 0;
}
if (hinfo->get_total_chunk_size() != (unsigned)pos.data_pos) {
- dout(0) << "_scan_list " << poid << " got incorrect size on read 0x"
+ dout(0) << "_scan_list " << poid << " got incorrect size on read 0x"
<< std::hex << pos
<< " expected 0x" << hinfo->get_total_chunk_size() << std::dec
<< dendl;
- o.ec_size_mismatch = true;
- return 0;
+ o.ec_size_mismatch = true;
+ return 0;
}
if (hinfo->get_chunk_hash(get_parent()->whoami_shard().shard) !=
- pos.data_hash.digest()) {
- dout(0) << "_scan_list " << poid << " got incorrect hash on read 0x"
+ pos.data_hash.digest()) {
+ dout(0) << "_scan_list " << poid << " got incorrect hash on read 0x"
<< std::hex << pos.data_hash.digest() << " != expected 0x"
<< hinfo->get_chunk_hash(get_parent()->whoami_shard().shard)
<< std::dec << dendl;
- o.ec_hash_mismatch = true;
- return 0;
+ o.ec_hash_mismatch = true;
+ return 0;
}
/* We checked above that we match our own stored hash. We cannot
#pragma once
-#include <boost/intrusive/set.hpp>
#include <boost/intrusive/list.hpp>
+#include <boost/intrusive/set.hpp>
#include "ECCommon.h"
+#include "ECExtentCache.h"
+#include "ECListener.h"
+#include "ECTypes.h"
+#include "ECUtil.h"
#include "OSD.h"
#include "PGBackend.h"
#include "erasure-code/ErasureCodeInterface.h"
-#include "ECUtil.h"
-#include "ECTransaction.h"
-#include "ExtentCache.h"
-#include "ECListener.h"
+#include "include/buffer.h"
+#include "osd/scrubber/scrub_backend.h"
/* This file is soon going to be replaced (before next release), so we are going
* to simply ignore all deprecated warnings.
* */
-IGNORE_DEPRECATED
//forward declaration
struct ECSubWrite;
class ECSwitch;
struct RecoveryMessages;
class ECBackend : public ECCommon {
-public:
- PGBackend::RecoveryHandle *open_recovery_op();
+ public:
+ PGBackend::RecoveryHandle *open_recovery_op();
void run_recovery_op(
PGBackend::RecoveryHandle *h,
- int priority
- );
+ int priority
+ );
int recover_object(
- const hobject_t &hoid,
- eversion_t v,
- ObjectContextRef head,
- ObjectContextRef obc,
+ const hobject_t &hoid,
+ eversion_t v,
+ ObjectContextRef head,
+ ObjectContextRef obc,
PGBackend::RecoveryHandle *h
- );
-
- bool _handle_message(
- OpRequestRef op
- );
- bool can_handle_while_inactive(
- OpRequestRef op
- );
+ );
+
+ bool _handle_message(OpRequestRef op);
+ bool can_handle_while_inactive(OpRequestRef op);
friend struct SubWriteApplied;
friend struct SubWriteCommitted;
void sub_write_committed(
- ceph_tid_t tid,
- eversion_t version,
- eversion_t last_complete,
- const ZTracer::Trace &trace);
+ ceph_tid_t tid,
+ eversion_t version,
+ eversion_t last_complete,
+ const ZTracer::Trace &trace
+ );
void handle_sub_write(
- pg_shard_t from,
- OpRequestRef msg,
- ECSubWrite &op,
- const ZTracer::Trace &trace,
- ECListener& eclistener
+ pg_shard_t from,
+ OpRequestRef msg,
+ ECSubWrite &op,
+ const ZTracer::Trace &trace,
+ ECListener &eclistener
) override;
void handle_sub_read(
- pg_shard_t from,
- const ECSubRead &op,
- ECSubReadReply *reply,
- const ZTracer::Trace &trace
+ pg_shard_t from,
+ const ECSubRead &op,
+ ECSubReadReply *reply,
+ const ZTracer::Trace &trace
);
void handle_sub_write_reply(
- pg_shard_t from,
- const ECSubWriteReply &op,
- const ZTracer::Trace &trace
+ pg_shard_t from,
+ const ECSubWriteReply &op,
+ const ZTracer::Trace &trace
);
void handle_sub_read_reply(
- pg_shard_t from,
- ECSubReadReply &op,
- const ZTracer::Trace &trace
+ pg_shard_t from,
+ ECSubReadReply &op,
+ const ZTracer::Trace &trace
);
/// @see ReadOp below
- void check_recovery_sources(const OSDMapRef& osdmap);
+ void check_recovery_sources(const OSDMapRef &osdmap);
void on_change();
void clear_recovery_state();
}
void submit_transaction(
- const hobject_t &hoid,
- const object_stat_sum_t &delta_stats,
- const eversion_t &at_version,
- PGTransactionUPtr &&t,
- const eversion_t &trim_to,
- const eversion_t &pg_committed_to,
- std::vector<pg_log_entry_t>&& log_entries,
- std::optional<pg_hit_set_history_t> &hset_history,
- Context *on_all_commit,
- ceph_tid_t tid,
- osd_reqid_t reqid,
- OpRequestRef op
+ const hobject_t &hoid,
+ const object_stat_sum_t &delta_stats,
+ const eversion_t &at_version,
+ PGTransactionUPtr &&t,
+ const eversion_t &trim_to,
+ const eversion_t &pg_committed_to,
+ std::vector<pg_log_entry_t> &&log_entries,
+ std::optional<pg_hit_set_history_t> &hset_history,
+ Context *on_all_commit,
+ ceph_tid_t tid,
+ osd_reqid_t reqid,
+ OpRequestRef op
);
int objects_read_sync(
- const hobject_t &hoid,
- uint64_t off,
- uint64_t len,
- uint32_t op_flags,
- ceph::buffer::list *bl);
+ const hobject_t &hoid,
+ uint64_t off,
+ uint64_t len,
+ uint32_t op_flags,
+ ceph::buffer::list *bl
+ );
/**
* Async read mechanism
* check_recovery_sources.
*/
void objects_read_and_reconstruct(
- const std::map<hobject_t, std::list<ec_align_t>> &reads,
- bool fast_read,
- GenContextURef<ECCommon::ec_extents_t &&> &&func) override;
+ const std::map<hobject_t, std::list<ec_align_t>> &reads,
+ bool fast_read,
+ uint64_t object_size,
+ GenContextURef<ECCommon::ec_extents_t&&> &&func
+ ) override;
+
+ /**
+   * Async read mechanism for read-modify-write (RMW) code paths. Here the
+ * client already knows the set of shard reads that are required, so these
+ * can be passed in directly. The "fast_read" mechanism is not needed.
+ *
+ * Otherwise this is the same as objects_read_and_reconstruct.
+ */
+ void objects_read_and_reconstruct_for_rmw(
+ std::map<hobject_t, read_request_t> &&reads,
+ GenContextURef<ECCommon::ec_extents_t&&> &&func
+ ) override;
void objects_read_async(
- const hobject_t &hoid,
- uint64_t object_size,
- const std::list<std::pair<ec_align_t,
- std::pair<ceph::buffer::list*, Context*>>> &to_read,
- Context *on_complete,
- bool fast_read = false);
-
-private:
+ const hobject_t &hoid,
+ uint64_t object_size,
+ const std::list<std::pair<ec_align_t,
+ std::pair<ceph::buffer::list*, Context*>>> &
+ to_read,
+ Context *on_complete,
+ bool fast_read = false
+ );
+
+ private:
friend struct ECRecoveryHandle;
void kick_reads();
* Transaction, and reads in a RecoveryMessages object which is passed
* among the recovery methods.
*/
-public:
+ public:
struct RecoveryBackend {
- CephContext* cct;
+ CephContext *cct;
const coll_t &coll;
ceph::ErasureCodeInterfaceRef ec_impl;
- const ECUtil::stripe_info_t& sinfo;
- ReadPipeline& read_pipeline;
- UnstableHashInfoRegistry& unstable_hashinfo_registry;
+ const ECUtil::stripe_info_t &sinfo;
+ ReadPipeline &read_pipeline;
+ UnstableHashInfoRegistry &unstable_hashinfo_registry;
// TODO: lay an interface down here
- ECListener* parent;
- ECBackend* ecbackend;
+ ECListener *parent;
+ ECBackend *ecbackend;
ECListener *get_parent() const { return parent; }
- const OSDMapRef& get_osdmap() const { return get_parent()->pgb_get_osdmap(); }
- epoch_t get_osdmap_epoch() const { return get_parent()->pgb_get_osdmap_epoch(); }
+
+ const OSDMapRef &get_osdmap() const {
+ return get_parent()->pgb_get_osdmap();
+ }
+
+ epoch_t get_osdmap_epoch() const {
+ return get_parent()->pgb_get_osdmap_epoch();
+ }
+
const pg_info_t &get_info() { return get_parent()->get_info(); }
void add_temp_obj(const hobject_t &oid) { get_parent()->add_temp_obj(oid); }
- void clear_temp_obj(const hobject_t &oid) { get_parent()->clear_temp_obj(oid); }
-
- RecoveryBackend(CephContext* cct,
- const coll_t &coll,
- ceph::ErasureCodeInterfaceRef ec_impl,
- const ECUtil::stripe_info_t& sinfo,
- ReadPipeline& read_pipeline,
- UnstableHashInfoRegistry& unstable_hashinfo_registry,
- ECListener* parent,
- ECBackend* ecbackend);
- struct RecoveryOp {
- hobject_t hoid;
- eversion_t v;
- std::set<pg_shard_t> missing_on;
- std::set<shard_id_t> missing_on_shards;
-
- ObjectRecoveryInfo recovery_info;
- ObjectRecoveryProgress recovery_progress;
-
- enum state_t { IDLE, READING, WRITING, COMPLETE } state;
-
- static const char* tostr(state_t state) {
- switch (state) {
- case RecoveryOp::IDLE:
- return "IDLE";
- case RecoveryOp::READING:
- return "READING";
- case RecoveryOp::WRITING:
- return "WRITING";
- case RecoveryOp::COMPLETE:
- return "COMPLETE";
- default:
- ceph_abort();
- return "";
- }
- }
- // must be filled if state == WRITING
- std::map<int, ceph::buffer::list> returned_data;
- std::map<std::string, ceph::buffer::list, std::less<>> xattrs;
- ECUtil::HashInfoRef hinfo;
- ObjectContextRef obc;
- std::set<pg_shard_t> waiting_on_pushes;
+ void clear_temp_obj(const hobject_t &oid) {
+ get_parent()->clear_temp_obj(oid);
+ }
- // valid in state READING
- std::pair<uint64_t, uint64_t> extent_requested;
+ RecoveryBackend(CephContext *cct,
+ const coll_t &coll,
+ ceph::ErasureCodeInterfaceRef ec_impl,
+ const ECUtil::stripe_info_t &sinfo,
+ ReadPipeline &read_pipeline,
+ UnstableHashInfoRegistry &unstable_hashinfo_registry,
+ ECListener *parent,
+ ECBackend *ecbackend);
+
+ struct RecoveryOp {
+ hobject_t hoid;
+ eversion_t v;
+ std::set<pg_shard_t> missing_on;
+ std::set<shard_id_t> missing_on_shards;
+
+ ObjectRecoveryInfo recovery_info;
+ ObjectRecoveryProgress recovery_progress;
+
+ enum state_t { IDLE, READING, WRITING, COMPLETE } state;
+
+ static const char *tostr(state_t state) {
+ switch (state) {
+ case RecoveryOp::IDLE:
+ return "IDLE";
+ case RecoveryOp::READING:
+ return "READING";
+ case RecoveryOp::WRITING:
+ return "WRITING";
+ case RecoveryOp::COMPLETE:
+ return "COMPLETE";
+ default:
+ ceph_abort();
+ return "";
+ }
+ }
- void dump(ceph::Formatter *f) const;
+ // must be filled if state == WRITING
+ std::optional<ECUtil::shard_extent_map_t> returned_data;
+ std::map<std::string, ceph::buffer::list, std::less<>> xattrs;
+ ECUtil::HashInfoRef hinfo;
+ ObjectContextRef obc;
+ std::set<pg_shard_t> waiting_on_pushes;
+
+ void dump(ceph::Formatter *f) const;
+
+ RecoveryOp() : state(IDLE) {}
+
+ void print(std::ostream &os) const {
+ os << "RecoveryOp("
+ << "hoid=" << hoid
+ << " v=" << v
+ << " missing_on=" << missing_on
+ << " missing_on_shards=" << missing_on_shards
+ << " recovery_info=" << recovery_info
+ << " recovery_progress=" << recovery_progress
+ << " obc refcount=" << obc.use_count()
+ << " state=" << ECBackend::RecoveryBackend::RecoveryOp::tostr(state)
+ << " waiting_on_pushes=" << waiting_on_pushes
+ << ")";
+ }
+ };
- RecoveryOp() : state(IDLE) {}
- };
- friend ostream &operator<<(ostream &lhs, const RecoveryOp &rhs);
- std::map<hobject_t, RecoveryOp> recovery_ops;
+ std::map<hobject_t, RecoveryOp> recovery_ops;
- uint64_t get_recovery_chunk_size() const {
- return round_up_to(cct->_conf->osd_recovery_max_chunk,
- sinfo.get_stripe_width());
- }
+ uint64_t get_recovery_chunk_size() const {
+ return round_up_to(cct->_conf->osd_recovery_max_chunk,
+ sinfo.get_stripe_width());
+ }
- virtual ~RecoveryBackend() = default;
- virtual void commit_txn_send_replies(
- ceph::os::Transaction&& txn,
- std::map<int, MOSDPGPushReply*> replies) = 0;
- void dispatch_recovery_messages(RecoveryMessages &m, int priority);
+ virtual ~RecoveryBackend() = default;
+ virtual void commit_txn_send_replies(
+ ceph::os::Transaction &&txn,
+ std::map<int, MOSDPGPushReply*> replies) = 0;
+ void dispatch_recovery_messages(RecoveryMessages &m, int priority);
- PGBackend::RecoveryHandle *open_recovery_op();
- void run_recovery_op(
- struct ECRecoveryHandle &h,
- int priority);
- int recover_object(
- const hobject_t &hoid,
- eversion_t v,
- ObjectContextRef head,
- ObjectContextRef obc,
- PGBackend::RecoveryHandle *h);
- void continue_recovery_op(
- RecoveryBackend::RecoveryOp &op,
- RecoveryMessages *m);
- void handle_recovery_read_complete(
- const hobject_t &hoid,
- boost::tuple<uint64_t, uint64_t, std::map<pg_shard_t, ceph::buffer::list> > &to_read,
- std::optional<std::map<std::string, ceph::buffer::list, std::less<>> > attrs,
- RecoveryMessages *m);
- void handle_recovery_push(
- const PushOp &op,
- RecoveryMessages *m,
- bool is_repair);
- void handle_recovery_push_reply(
- const PushReplyOp &op,
- pg_shard_t from,
- RecoveryMessages *m);
- friend struct RecoveryMessages;
- int get_ec_data_chunk_count() const {
- return ec_impl->get_data_chunk_count();
- }
- void _failed_push(const hobject_t &hoid, ECCommon::read_result_t &res);
+ PGBackend::RecoveryHandle *open_recovery_op();
+ void run_recovery_op(
+ struct ECRecoveryHandle &h,
+ int priority);
+ int recover_object(
+ const hobject_t &hoid,
+ eversion_t v,
+ ObjectContextRef head,
+ ObjectContextRef obc,
+ PGBackend::RecoveryHandle *h);
+ void continue_recovery_op(
+ RecoveryBackend::RecoveryOp &op,
+ RecoveryMessages *m);
+ void handle_recovery_read_complete(
+ const hobject_t &hoid,
+ ECUtil::shard_extent_map_t &&buffers_read,
+ std::optional<std::map<std::string, ceph::buffer::list, std::less<>>>
+ attrs,
+ const ECUtil::shard_extent_set_t &want_to_read,
+ RecoveryMessages *m);
+ void handle_recovery_push(
+ const PushOp &op,
+ RecoveryMessages *m,
+ bool is_repair);
+ void handle_recovery_push_reply(
+ const PushReplyOp &op,
+ pg_shard_t from,
+ RecoveryMessages *m);
+ friend struct RecoveryMessages;
+ void _failed_push(const hobject_t &hoid, ECCommon::read_result_t &res);
};
+
struct ECRecoveryBackend : RecoveryBackend {
- ECRecoveryBackend(CephContext* cct,
- const coll_t &coll,
- ceph::ErasureCodeInterfaceRef ec_impl,
- const ECUtil::stripe_info_t& sinfo,
- ReadPipeline& read_pipeline,
- UnstableHashInfoRegistry& unstable_hashinfo_registry,
- PGBackend::Listener* parent,
- ECBackend* ecbackend)
- : RecoveryBackend(cct, coll, std::move(ec_impl), sinfo, read_pipeline, unstable_hashinfo_registry, parent->get_eclistener(), ecbackend),
- parent(parent) {
- }
+ ECRecoveryBackend(CephContext *cct,
+ const coll_t &coll,
+ ceph::ErasureCodeInterfaceRef ec_impl,
+ const ECUtil::stripe_info_t &sinfo,
+ ReadPipeline &read_pipeline,
+ UnstableHashInfoRegistry &unstable_hashinfo_registry,
+ PGBackend::Listener *parent,
+ ECBackend *ecbackend)
+ : RecoveryBackend(cct, coll, std::move(ec_impl), sinfo, read_pipeline,
+ unstable_hashinfo_registry, parent->get_eclistener(),
+ ecbackend),
+ parent(parent) {}
void commit_txn_send_replies(
- ceph::os::Transaction&& txn,
- std::map<int, MOSDPGPushReply*> replies) override;
+ ceph::os::Transaction &&txn,
+ std::map<int, MOSDPGPushReply*> replies) override;
PGBackend::Listener *get_parent() const { return parent; }
- private:
+ private:
PGBackend::Listener *parent;
};
- friend ostream &operator<<(ostream &lhs, const RecoveryBackend::RecoveryOp &rhs);
+
+ friend ostream &operator<<(ostream &lhs,
+ const RecoveryBackend::RecoveryOp &rhs
+ );
friend struct RecoveryMessages;
friend struct OnRecoveryReadComplete;
friend struct RecoveryReadCompleter;
void handle_recovery_push(
- const PushOp &op,
- RecoveryMessages *m,
- bool is_repair);
+ const PushOp &op,
+ RecoveryMessages *m,
+ bool is_repair
+ );
-public:
- PGBackend::Listener *parent;
- CephContext *cct;
- ECSwitch *switcher;
- struct ReadPipeline read_pipeline;
- struct RMWPipeline rmw_pipeline;
- struct ECRecoveryBackend recovery_backend;
+ PGBackend::Listener *parent;
+ CephContext *cct;
+ ECSwitch *switcher;
+ ReadPipeline read_pipeline;
+ RMWPipeline rmw_pipeline;
+ ECRecoveryBackend recovery_backend;
ceph::ErasureCodeInterfaceRef ec_impl;
/**
* ECRecPred
*
- * Determines the whether _have is sufficient to recover an object
+ * Determines whether _have is sufficient to recover an object
*/
class ECRecPred : public IsPGRecoverablePredicate {
- std::set<int> want;
+ shard_id_set want;
+ const ECUtil::stripe_info_t *sinfo;
ceph::ErasureCodeInterfaceRef ec_impl;
- public:
- explicit ECRecPred(ceph::ErasureCodeInterfaceRef ec_impl) : ec_impl(ec_impl) {
- for (unsigned i = 0; i < ec_impl->get_chunk_count(); ++i) {
- want.insert(i);
- }
+
+ public:
+ explicit ECRecPred(const ECUtil::stripe_info_t *sinfo,
+ ceph::ErasureCodeInterfaceRef ec_impl) :
+ sinfo(sinfo), ec_impl(ec_impl) {
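+    // Recovery must be able to rebuild any shard, so "want" covers all
+    // k+m shards.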
+ want.insert_range(shard_id_t(0), sinfo->get_k_plus_m());
}
+
bool operator()(const std::set<pg_shard_t> &_have) const override {
- std::set<int> have;
- for (std::set<pg_shard_t>::const_iterator i = _have.begin();
- i != _have.end();
- ++i) {
- have.insert(static_cast<int>(i->shard));
+ shard_id_set have;
+ for (pg_shard_t p: _have) {
+ have.insert(p.shard);
+ }
+ std::unique_ptr<shard_id_map<std::vector<std::pair<int, int>>>>
+ min_sub_chunks = nullptr;
+ if (sinfo->supports_sub_chunks()) {
+ min_sub_chunks = std::make_unique<shard_id_map<std::vector<std::pair<
+ int, int>>>>(sinfo->get_k_plus_m());
}
- std::map<int, std::vector<std::pair<int, int>>> min;
+ shard_id_set min;
- return ec_impl->minimum_to_decode(want, have, &min) == 0;
+ return ec_impl->minimum_to_decode(want, have, min, min_sub_chunks.get())
+ == 0;
}
};
+
std::unique_ptr<ECRecPred> get_is_recoverable_predicate() const {
- return std::make_unique<ECRecPred>(ec_impl);
+ return std::make_unique<ECRecPred>(&sinfo, ec_impl);
}
- unsigned get_ec_data_chunk_count() const {
- return ec_impl->get_data_chunk_count();
+ unsigned get_ec_data_chunk_count() const {
+ return sinfo.get_k();
}
+
int get_ec_stripe_chunk_size() const {
return sinfo.get_chunk_size();
}
- uint64_t object_size_to_shard_size(const uint64_t size,
- shard_id_t shard) const {
- if (size == std::numeric_limits<uint64_t>::max()) {
- return size;
- }
- return sinfo.logical_to_next_chunk_offset(size);
+
+ uint64_t object_size_to_shard_size(const uint64_t size, shard_id_t shard
+ ) const {
+ return sinfo.object_size_to_shard_size(size, shard);
}
+
/**
* ECReadPred
*
class ECReadPred : public IsPGReadablePredicate {
pg_shard_t whoami;
ECRecPred rec_pred;
- public:
+
+ public:
ECReadPred(
- pg_shard_t whoami,
- ceph::ErasureCodeInterfaceRef ec_impl) : whoami(whoami), rec_pred(ec_impl) {}
+ pg_shard_t whoami,
+ const ECUtil::stripe_info_t *sinfo,
+ ceph::ErasureCodeInterfaceRef ec_impl) : whoami(whoami), rec_pred(sinfo, ec_impl) {}
+
bool operator()(const std::set<pg_shard_t> &_have) const override {
return _have.count(whoami) && rec_pred(_have);
}
};
- std::unique_ptr<ECReadPred> get_is_readable_predicate(pg_shard_t whoami) const {
- return std::make_unique<ECReadPred>(whoami, ec_impl);
+
+ std::unique_ptr<ECReadPred>
+ get_is_readable_predicate(pg_shard_t whoami) const {
+ return std::make_unique<ECReadPred>(whoami, &sinfo, ec_impl);
}
const ECUtil::stripe_info_t sinfo;
int,
std::map<std::string, ceph::bufferlist, std::less<>>,
size_t
- > get_attrs_n_size_from_disk(const hobject_t& hoid);
+ > get_attrs_n_size_from_disk(const hobject_t &hoid);
+
+ ECUtil::HashInfoRef get_hinfo_from_disk(hobject_t oid);
+
+ std::optional<object_info_t> get_object_info_from_obc(
+      ObjectContextRef &obc
+ );
-public:
- int object_stat(const hobject_t &hoid, struct stat* st);
+ public:
+ int object_stat(const hobject_t &hoid, struct stat *st);
ECBackend(
- PGBackend::Listener *pg,
- CephContext *cct,
- ceph::ErasureCodeInterfaceRef ec_impl,
- uint64_t stripe_width,
- ECSwitch *s,
- ECExtentCache::LRU &ignored);
+ PGBackend::Listener *pg,
+ CephContext *cct,
+ ceph::ErasureCodeInterfaceRef ec_impl,
+ uint64_t stripe_width,
+ ECSwitch *s,
+ ECExtentCache::LRU &ec_extent_cache_lru
+ );
int objects_get_attrs(
- const hobject_t &hoid,
- std::map<std::string, ceph::buffer::list, std::less<>> *out);
+ const hobject_t &hoid,
+ std::map<std::string, ceph::buffer::list, std::less<>> *out
+ );
bool auto_repair_supported() const { return true; }
int be_deep_scrub(
- const hobject_t &poid,
- ScrubMap &map,
- ScrubMapBuilder &pos,
- ScrubMap::object &o);
+ const hobject_t &poid,
+ ScrubMap &map,
+ ScrubMapBuilder &pos,
+ ScrubMap::object &o
+ );
- uint64_t be_get_ondisk_size(uint64_t logical_size, shard_id_t ignored) const {
- return sinfo.logical_to_next_chunk_offset(logical_size);
+ uint64_t be_get_ondisk_size(uint64_t logical_size, shard_id_t shard_id
+ ) const {
+ return object_size_to_shard_size(logical_size, shard_id);
}
};
-ostream &operator<<(ostream &lhs, const ECBackend::RMWPipeline::pipeline_state_t &rhs);
-
-END_IGNORE_DEPRECATED
#include <iostream>
#include <sstream>
+#include <ranges>
+#include <fmt/ostream.h>
#include "ECInject.h"
-#include "messages/MOSDPGPush.h"
-#include "messages/MOSDPGPushReply.h"
#include "messages/MOSDECSubOpWrite.h"
-#include "messages/MOSDECSubOpWriteReply.h"
#include "messages/MOSDECSubOpRead.h"
-#include "messages/MOSDECSubOpReadReply.h"
#include "common/debug.h"
#include "ECMsgTypes.h"
#include "PGLog.h"
-
#include "osd_tracer.h"
#define dout_context cct
#undef dout_prefix
#define dout_prefix _prefix(_dout, this)
-/* This file is soon going to be replaced (before next release), so we are going
- * to simply ignore all deprecated warnings.
- * */
-IGNORE_DEPRECATED
-
using std::dec;
using std::hex;
using std::less;
using ceph::ErasureCodeInterfaceRef;
using ceph::Formatter;
-static ostream& _prefix(std::ostream *_dout, ECCommon::RMWPipeline *rmw_pipeline) {
+static ostream &_prefix(std::ostream *_dout,
+ ECCommon::RMWPipeline const *rmw_pipeline) {
return rmw_pipeline->get_parent()->gen_dbg_prefix(*_dout);
}
-static ostream& _prefix(std::ostream *_dout, ECCommon::ReadPipeline *read_pipeline) {
- return read_pipeline->get_parent()->gen_dbg_prefix(*_dout);
-}
-static ostream& _prefix(std::ostream *_dout,
- ECCommon::UnstableHashInfoRegistry *unstable_hash_info_registry) {
- // TODO: backref to ECListener?
- return *_dout;
-}
-static ostream& _prefix(std::ostream *_dout, struct ClientReadCompleter *read_completer);
-
-ostream &operator<<(ostream &lhs, const ECCommon::RMWPipeline::pipeline_state_t &rhs) {
- switch (rhs.pipeline_state) {
- case ECCommon::RMWPipeline::pipeline_state_t::CACHE_VALID:
- return lhs << "CACHE_VALID";
- case ECCommon::RMWPipeline::pipeline_state_t::CACHE_INVALID:
- return lhs << "CACHE_INVALID";
- default:
- ceph_abort_msg("invalid pipeline state");
- }
- return lhs; // unreachable
-}
-ostream &operator<<(ostream &lhs, const ECCommon::ec_extent_t &rhs)
-{
- return lhs << rhs.err << ","
- << rhs.emap;
-}
-
-ostream &operator<<(ostream &lhs, const ECCommon::read_request_t &rhs)
-{
- return lhs << "read_request_t(to_read=[" << rhs.to_read << "]"
- << ", need=" << rhs.need
- << ", want_attrs=" << rhs.want_attrs
- << ")";
+static ostream &_prefix(std::ostream *_dout,
+ ECCommon::ReadPipeline const *read_pipeline) {
+ return read_pipeline->get_parent()->gen_dbg_prefix(*_dout);
}
-ostream &operator<<(ostream &lhs, const ECCommon::read_result_t &rhs)
-{
- lhs << "read_result_t(r=" << rhs.r
- << ", errors=" << rhs.errors;
- if (rhs.attrs) {
- lhs << ", attrs=" << *(rhs.attrs);
- } else {
- lhs << ", noattrs";
- }
- return lhs << ", returned=" << rhs.returned << ")";
+static ostream &_prefix(std::ostream *_dout,
+ ECCommon::UnstableHashInfoRegistry *
+ unstable_hash_info_registry) {
+ return *_dout;
}
-ostream &operator<<(ostream &lhs, const ECCommon::ReadOp &rhs)
-{
- lhs << "ReadOp(tid=" << rhs.tid;
-#ifndef WITH_CRIMSON
- if (rhs.op && rhs.op->get_req()) {
- lhs << ", op=";
- rhs.op->get_req()->print(lhs);
- }
-#endif
- return lhs << ", to_read=" << rhs.to_read
- << ", complete=" << rhs.complete
- << ", priority=" << rhs.priority
- << ", obj_to_source=" << rhs.obj_to_source
- << ", source_to_obj=" << rhs.source_to_obj
- << ", want_to_read" << rhs.want_to_read
- << ", in_progress=" << rhs.in_progress << ")";
-}
+static ostream &_prefix(std::ostream *_dout,
+ struct ClientReadCompleter const *read_completer
+ );
-void ECCommon::ReadOp::dump(Formatter *f) const
-{
+void ECCommon::ReadOp::dump(Formatter *f) const {
f->dump_unsigned("tid", tid);
#ifndef WITH_CRIMSON
if (op && op->get_req()) {
f->dump_int("priority", priority);
f->dump_stream("obj_to_source") << obj_to_source;
f->dump_stream("source_to_obj") << source_to_obj;
- f->dump_stream("want_to_read") << want_to_read;
f->dump_stream("in_progress") << in_progress;
}
-ostream &operator<<(ostream &lhs, const ECCommon::RMWPipeline::Op &rhs)
-{
- lhs << "Op(" << rhs.hoid
- << " v=" << rhs.version
- << " tt=" << rhs.trim_to
- << " tid=" << rhs.tid
- << " reqid=" << rhs.reqid;
-#ifndef WITH_CRIMSON
- if (rhs.client_op && rhs.client_op->get_req()) {
- lhs << " client_op=";
- rhs.client_op->get_req()->print(lhs);
- }
-#endif
- lhs << " pg_committed_to=" << rhs.pg_committed_to
- << " temp_added=" << rhs.temp_added
- << " temp_cleared=" << rhs.temp_cleared
- << " pending_read=" << rhs.pending_read
- << " remote_read=" << rhs.remote_read
- << " remote_read_result=" << rhs.remote_read_result
- << " pending_apply=" << rhs.pending_apply
- << " pending_commit=" << rhs.pending_commit
- << " plan.to_read=" << rhs.plan.to_read
- << " plan.will_write=" << rhs.plan.will_write
- << ")";
- return lhs;
-}
-
-void ECCommon::ReadPipeline::complete_read_op(ReadOp &rop)
-{
+void ECCommon::ReadPipeline::complete_read_op(ReadOp &&rop) {
dout(20) << __func__ << " completing " << rop << dendl;
- map<hobject_t, read_request_t>::iterator req_iter =
- rop.to_read.begin();
- map<hobject_t, read_result_t>::iterator resiter =
- rop.complete.begin();
+ auto req_iter = rop.to_read.begin();
+ auto resiter = rop.complete.begin();
ceph_assert(rop.to_read.size() == rop.complete.size());
for (; req_iter != rop.to_read.end(); ++req_iter, ++resiter) {
- ceph_assert(rop.want_to_read.contains(req_iter->first));
+ auto &hoid = req_iter->first;
+ read_result_t &res = resiter->second;
+ read_request_t &req = req_iter->second;
rop.on_complete->finish_single_request(
- req_iter->first,
- resiter->second,
- req_iter->second.to_read,
- rop.want_to_read[req_iter->first]);
+ hoid, std::move(res), req);
}
ceph_assert(rop.on_complete);
std::move(*rop.on_complete).finish(rop.priority);
rop.on_complete = nullptr;
+
  // if the read op is over, clean all the data of this tid.
- for (set<pg_shard_t>::iterator iter = rop.in_progress.begin();
- iter != rop.in_progress.end();
- iter++) {
- shard_to_read_map[*iter].erase(rop.tid);
+ for (auto &pg_shard: rop.in_progress) {
+ shard_to_read_map[pg_shard].erase(rop.tid);
}
rop.in_progress.clear();
tid_to_read_map.erase(rop.tid);
}
-void ECCommon::ReadPipeline::on_change()
-{
- for (map<ceph_tid_t, ReadOp>::iterator i = tid_to_read_map.begin();
- i != tid_to_read_map.end();
- ++i) {
- dout(10) << __func__ << ": cancelling " << i->second << dendl;
+void ECCommon::ReadPipeline::on_change() {
+  for (auto &rop: std::views::values(tid_to_read_map)) {
+ dout(10) << __func__ << ": cancelling " << rop << dendl;
}
tid_to_read_map.clear();
shard_to_read_map.clear();
in_progress_client_reads.clear();
}
+std::pair<const shard_id_set, const shard_id_set>
+ECCommon::ReadPipeline::get_readable_writable_shard_id_sets() {
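+  // Readable: shards in the acting set. Writable: the acting_recovery_backfill
+  // set, i.e. every shard that must receive writes.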
+ shard_id_set readable;
+ shard_id_set writable;
+
+ for (auto &&pg_shard: get_parent()->get_acting_shards()) {
+ readable.insert(pg_shard.shard);
+ }
+
+ writable = get_parent()->get_acting_recovery_backfill_shard_id_set();
+ return std::make_pair(std::move(readable), std::move(writable));
+}
+
void ECCommon::ReadPipeline::get_all_avail_shards(
- const hobject_t &hoid,
- const set<pg_shard_t> &error_shards,
- set<int> &have,
- map<shard_id_t, pg_shard_t> &shards,
- bool for_recovery)
-{
- for (set<pg_shard_t>::const_iterator i =
- get_parent()->get_acting_shards().begin();
- i != get_parent()->get_acting_shards().end();
- ++i) {
- dout(10) << __func__ << ": checking acting " << *i << dendl;
- const pg_missing_t &missing = get_parent()->get_shard_missing(*i);
- if (error_shards.contains(*i)) {
+ const hobject_t &hoid,
+ shard_id_set &have,
+ shard_id_map<pg_shard_t> &shards,
+ const bool for_recovery,
+ const std::optional<set<pg_shard_t>> &error_shards) {
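+  /* Gather every shard this object can currently be read from: acting shards
+   * that are not missing it, plus (for recovery) backfill shards that have
+   * already backfilled past it and any recorded missing_loc sources.
+   */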
+ for (auto &&pg_shard: get_parent()->get_acting_shards()) {
+ dout(10) << __func__ << ": checking acting " << pg_shard << dendl;
+ const pg_missing_t &missing = get_parent()->get_shard_missing(pg_shard);
+ if (error_shards && error_shards->contains(pg_shard)) {
continue;
}
+ const shard_id_t &shard = pg_shard.shard;
if (cct->_conf->bluestore_debug_inject_read_err &&
- ECInject::test_read_error1(ghobject_t(hoid, ghobject_t::NO_GEN, i->shard))) {
- dout(0) << __func__ << " Error inject - Missing shard " << i->shard << dendl;
+ ECInject::test_read_error1(ghobject_t(hoid, ghobject_t::NO_GEN, shard))) {
+ dout(0) << __func__ << " Error inject - Missing shard " << shard << dendl;
continue;
}
if (!missing.is_missing(hoid)) {
- ceph_assert(!have.count(static_cast<int>(i->shard)));
- have.insert(static_cast<int>(i->shard));
- ceph_assert(!shards.count(i->shard));
- shards.insert(make_pair(i->shard, *i));
+ ceph_assert(!have.contains(shard));
+ have.insert(shard);
+ ceph_assert(!shards.contains(shard));
+ shards.insert(shard, pg_shard);
}
}
if (for_recovery) {
- for (set<pg_shard_t>::const_iterator i =
- get_parent()->get_backfill_shards().begin();
- i != get_parent()->get_backfill_shards().end();
- ++i) {
- if (error_shards.find(*i) != error_shards.end())
- continue;
- if (have.count(static_cast<int>(i->shard))) {
- ceph_assert(shards.count(i->shard));
- continue;
+ for (auto &&pg_shard: get_parent()->get_backfill_shards()) {
+ if (error_shards && error_shards->contains(pg_shard))
+ continue;
+ const shard_id_t &shard = pg_shard.shard;
+ if (have.contains(shard)) {
+ ceph_assert(shards.contains(shard));
+ continue;
}
- dout(10) << __func__ << ": checking backfill " << *i << dendl;
- ceph_assert(!shards.count(i->shard));
- const pg_info_t &info = get_parent()->get_shard_info(*i);
- const pg_missing_t &missing = get_parent()->get_shard_missing(*i);
+ dout(10) << __func__ << ": checking backfill " << pg_shard << dendl;
+ ceph_assert(!shards.count(shard));
+ const pg_info_t &info = get_parent()->get_shard_info(pg_shard);
if (hoid < info.last_backfill &&
- !missing.is_missing(hoid)) {
- have.insert(static_cast<int>(i->shard));
- shards.insert(make_pair(i->shard, *i));
+ !get_parent()->get_shard_missing(pg_shard).is_missing(hoid)) {
+ have.insert(shard);
+ shards.insert(shard, pg_shard);
}
}
- map<hobject_t, set<pg_shard_t>>::const_iterator miter =
- get_parent()->get_missing_loc_shards().find(hoid);
+ auto miter = get_parent()->get_missing_loc_shards().find(hoid);
if (miter != get_parent()->get_missing_loc_shards().end()) {
- for (set<pg_shard_t>::iterator i = miter->second.begin();
- i != miter->second.end();
- ++i) {
- dout(10) << __func__ << ": checking missing_loc " << *i << dendl;
- auto m = get_parent()->maybe_get_shard_missing(*i);
- if (m) {
- ceph_assert(!(*m).is_missing(hoid));
- }
- if (error_shards.find(*i) != error_shards.end())
- continue;
- have.insert(static_cast<int>(i->shard));
- shards.insert(make_pair(i->shard, *i));
+ for (auto &&pg_shard: miter->second) {
+ dout(10) << __func__ << ": checking missing_loc " << pg_shard << dendl;
+ if (const auto m = get_parent()->maybe_get_shard_missing(pg_shard)) {
+ ceph_assert(!m->is_missing(hoid));
+ }
+ if (error_shards && error_shards->contains(pg_shard)) {
+ continue;
+ }
+ have.insert(pg_shard.shard);
+ shards.insert(pg_shard.shard, pg_shard);
}
}
}
}
int ECCommon::ReadPipeline::get_min_avail_to_read_shards(
- const hobject_t &hoid,
- const set<int> &want,
- bool for_recovery,
- bool do_redundant_reads,
- map<pg_shard_t, vector<pair<int, int>>> *to_read)
-{
+ const hobject_t &hoid,
+ bool for_recovery,
+ bool do_redundant_reads,
+ read_request_t &read_request,
+ const std::optional<set<pg_shard_t>> &error_shards) {
// Make sure we don't do redundant reads for recovery
ceph_assert(!for_recovery || !do_redundant_reads);
- set<int> have;
- map<shard_id_t, pg_shard_t> shards;
- set<pg_shard_t> error_shards;
+ if (read_request.object_size == 0) {
+ dout(10) << __func__ << " empty read" << dendl;
+ return 0;
+ }
- get_all_avail_shards(hoid, error_shards, have, shards, for_recovery);
+ shard_id_set have;
+ shard_id_map<pg_shard_t> shards(sinfo.get_k_plus_m());
- map<int, vector<pair<int, int>>> need;
- int r = ec_impl->minimum_to_decode(want, have, &need);
- if (r < 0)
+ get_all_avail_shards(hoid, have, shards, for_recovery, error_shards);
+
+ std::unique_ptr<shard_id_map<vector<pair<int, int>>>> need_sub_chunks =
+ nullptr;
+ if (sinfo.supports_sub_chunks()) {
+ need_sub_chunks = std::make_unique<shard_id_map<vector<pair<int, int>>>>(
+ sinfo.get_k_plus_m());
+ }
+ shard_id_set need_set;
+ shard_id_set want;
+
+ read_request.shard_want_to_read.populate_shard_id_set(want);
+
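+  // Ask the plugin which of the available shards must be read (and, for
+  // plugins with sub-chunks, which sub-chunk ranges) to produce the wanted
+  // shards.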
+ int r = ec_impl->minimum_to_decode(want, have, need_set,
+ need_sub_chunks.get());
+ if (r < 0) {
+    dout(20) << "minimum_to_decode failed r: " << r << " want: " << want
+ << " have: " << have << " need: " << need_set << dendl;
return r;
+ }
if (do_redundant_reads) {
+ if (need_sub_chunks) {
vector<pair<int, int>> subchunks_list;
subchunks_list.push_back(make_pair(0, ec_impl->get_sub_chunk_count()));
for (auto &&i: have) {
- need[i] = subchunks_list;
+ (*need_sub_chunks)[i] = subchunks_list;
}
- }
+ }
+ for (auto &&i: have) {
+ need_set.insert(i);
+ }
+ }
- if (!to_read)
- return 0;
+ extent_set extra_extents;
+ ECUtil::shard_extent_set_t read_mask(sinfo.get_k_plus_m());
+ ECUtil::shard_extent_set_t zero_mask(sinfo.get_k_plus_m());
+
+ sinfo.ro_size_to_read_mask(read_request.object_size, read_mask);
+ sinfo.ro_size_to_zero_mask(read_request.object_size, zero_mask);
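+
+  // read_mask bounds each per-shard read to data that actually exists for an
+  // object of this size; shard reads are intersected with it below.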
- for (auto &&i:need) {
- ceph_assert(shards.count(shard_id_t(i.first)));
- to_read->insert(make_pair(shards[shard_id_t(i.first)], i.second));
+ /* First deal with missing shards */
+ for (auto &&[shard, extent_set]: read_request.shard_want_to_read) {
+    /* Work out what extra extents we need to read on each shard. If
+     * do_redundant_reads is set, we issue the same reads on every shard.
+     * Otherwise, extra reads are only needed for ranges whose wanted shard
+     * is unavailable and must be reconstructed from the other shards.
+ */
+ if (!have.contains(shard) || do_redundant_reads) {
+ extra_extents.union_of(extent_set);
+ }
}
- return 0;
-}
-// a static for the sake of unittesting
-void ECCommon::ReadPipeline::get_min_want_to_read_shards(
- const uint64_t offset,
- const uint64_t length,
- const ECUtil::stripe_info_t& sinfo,
- set<int> *want_to_read)
-{
- const auto [left_chunk_index, right_chunk_index] =
- sinfo.offset_length_to_data_chunk_indices(offset, length);
- const auto distance =
- std::min(right_chunk_index - left_chunk_index, (uint64_t)sinfo.get_k());
- for(uint64_t i = 0; i < distance; i++) {
- raw_shard_id_t raw_shard((left_chunk_index + i) % sinfo.get_k());
- want_to_read->insert(static_cast<int>(sinfo.get_shard(raw_shard)));
+ for (auto &shard: need_set) {
+ if (!have.contains(shard)) {
+ continue;
+ }
+ shard_id_t shard_id(shard);
+ extent_set extents = extra_extents;
+ shard_read_t shard_read;
+ if (need_sub_chunks) {
+ shard_read.subchunk = need_sub_chunks->at(shard_id);
+ }
+ shard_read.pg_shard = shards[shard_id];
+
+ if (read_request.shard_want_to_read.contains(shard)) {
+ extents.union_of(read_request.shard_want_to_read.at(shard));
+ }
+
+ extents.align(CEPH_PAGE_SIZE);
+ if (read_mask.contains(shard)) {
+ shard_read.extents.intersection_of(extents, read_mask.at(shard));
+ }
+
+ if (!shard_read.extents.empty()) {
+ read_request.shard_reads[shard_id] = std::move(shard_read);
+ }
}
+
+ dout(20) << __func__ << " for_recovery: " << for_recovery
+ << " do_redundant_reads: " << do_redundant_reads
+ << " read_request: " << read_request
+ << " error_shards: " << error_shards
+ << dendl;
+ return 0;
}
+
void ECCommon::ReadPipeline::get_min_want_to_read_shards(
- const uint64_t offset,
- const uint64_t length,
- set<int> *want_to_read)
-{
- get_min_want_to_read_shards(offset, length, sinfo, want_to_read);
- dout(20) << __func__ << ": offset " << offset << " length " << length
- << " want_to_read " << *want_to_read << dendl;
+ const ec_align_t &to_read,
+ ECUtil::shard_extent_set_t &want_shard_reads) {
+ sinfo.ro_range_to_shard_extent_set(to_read.offset, to_read.size,
+ want_shard_reads);
+ dout(20) << __func__ << ": to_read " << to_read
+ << " read_request " << want_shard_reads << dendl;
}
int ECCommon::ReadPipeline::get_remaining_shards(
- const hobject_t &hoid,
- const set<int> &avail,
- const set<int> &want,
- const read_result_t &result,
- map<pg_shard_t, vector<pair<int, int>>> *to_read,
- bool for_recovery)
-{
- ceph_assert(to_read);
-
- set<int> have;
- map<shard_id_t, pg_shard_t> shards;
+ const hobject_t &hoid,
+ read_result_t &read_result,
+ read_request_t &read_request,
+ const bool for_recovery,
+ const bool fast_read) {
+ shard_id_map<pg_shard_t> shards(sinfo.get_k_plus_m());
set<pg_shard_t> error_shards;
- for (auto &p : result.errors) {
- error_shards.insert(p.first);
+ for (auto &shard: std::views::keys(read_result.errors)) {
+ error_shards.insert(shard);
}
- get_all_avail_shards(hoid, error_shards, have, shards, for_recovery);
+ const int r = get_min_avail_to_read_shards(
+ hoid,
+ for_recovery,
+ fast_read,
+ read_request,
+ error_shards);
- map<int, vector<pair<int, int>>> need;
- int r = ec_impl->minimum_to_decode(want, have, &need);
- if (r < 0) {
+ if (r) {
dout(0) << __func__ << " not enough shards left to try for " << hoid
- << " read result was " << result << dendl;
+ << " read result was " << read_result << dendl;
return -EIO;
}
- set<int> shards_left;
- for (auto p : need) {
- if (avail.find(p.first) == avail.end()) {
- shards_left.insert(p.first);
+  // Rather than repeating the whole read, subtract the extents that have
+  // already been read successfully.
+ for (auto iter = read_request.shard_reads.begin();
+ iter != read_request.shard_reads.end();) {
+ auto &&[shard_id, shard_read] = *iter;
+ bool do_erase = false;
+
+    // If nothing has been read from this shard yet, keep its full request.
+ if (read_result.processed_read_requests.contains(shard_id)) {
+ shard_read.extents.subtract(
+ read_result.processed_read_requests.at(shard_id));
+ do_erase = shard_read.extents.empty();
}
- }
- vector<pair<int, int>> subchunks;
- subchunks.push_back(make_pair(0, ec_impl->get_sub_chunk_count()));
- for (set<int>::iterator i = shards_left.begin();
- i != shards_left.end();
- ++i) {
- ceph_assert(shards.count(shard_id_t(*i)));
- ceph_assert(avail.find(*i) == avail.end());
- to_read->insert(make_pair(shards[shard_id_t(*i)], subchunks));
+ if (do_erase) {
+ iter = read_request.shard_reads.erase(iter);
+ } else {
+ ++iter;
+ }
}
- return 0;
+
+  return read_request.shard_reads.empty() ? 1 : 0;
}
void ECCommon::ReadPipeline::start_read_op(
- int priority,
- map<hobject_t, set<int>> &want_to_read,
- map<hobject_t, read_request_t> &to_read,
- OpRequestRef _op,
- bool do_redundant_reads,
- bool for_recovery,
- std::unique_ptr<ECCommon::ReadCompleter> on_complete)
-{
+ const int priority,
+ map<hobject_t, read_request_t> &to_read,
+ const bool do_redundant_reads,
+ const bool for_recovery,
+ std::unique_ptr<ReadCompleter> on_complete) {
ceph_tid_t tid = get_parent()->get_tid();
- ceph_assert(!tid_to_read_map.count(tid));
+ ceph_assert(!tid_to_read_map.contains(tid));
auto &op = tid_to_read_map.emplace(
tid,
ReadOp(
do_redundant_reads,
for_recovery,
std::move(on_complete),
- _op,
- std::move(want_to_read),
std::move(to_read))).first->second;
dout(10) << __func__ << ": starting " << op << dendl;
- if (_op) {
+ if (op.op) {
#ifndef WITH_CRIMSON
- op.trace = _op->pg_trace;
+ op.trace = op.op->pg_trace;
#endif
op.trace.event("start ec read");
}
do_read_op(op);
}
-void ECCommon::ReadPipeline::do_read_op(ReadOp &op)
-{
- int priority = op.priority;
- ceph_tid_t tid = op.tid;
+void ECCommon::ReadPipeline::do_read_op(ReadOp &rop) {
+ const int priority = rop.priority;
+ const ceph_tid_t tid = rop.tid;
- dout(10) << __func__ << ": starting read " << op << dendl;
+ dout(10) << __func__ << ": starting read " << rop << dendl;
+ ceph_assert(!rop.to_read.empty());
map<pg_shard_t, ECSubRead> messages;
- for (map<hobject_t, read_request_t>::iterator i = op.to_read.begin();
- i != op.to_read.end();
- ++i) {
- bool need_attrs = i->second.want_attrs;
-
- for (auto j = i->second.need.begin();
- j != i->second.need.end();
- ++j) {
- if (need_attrs) {
- messages[j->first].attrs_to_read.insert(i->first);
- need_attrs = false;
+ for (auto &&[hoid, read_request]: rop.to_read) {
+ bool need_attrs = read_request.want_attrs;
+ ceph_assert(!read_request.shard_reads.empty());
+
+ for (auto &&[shard, shard_read]: read_request.shard_reads) {
+ if (need_attrs && !sinfo.is_nonprimary_shard(shard)) {
+ messages[shard_read.pg_shard].attrs_to_read.insert(hoid);
+ need_attrs = false;
}
- messages[j->first].subchunks[i->first] = j->second;
- op.obj_to_source[i->first].insert(j->first);
- op.source_to_obj[j->first].insert(i->first);
+ if (shard_read.subchunk) {
+ messages[shard_read.pg_shard].subchunks[hoid] = *shard_read.subchunk;
+ } else {
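+        // No sub-chunk support: read the whole chunk as a single sub-chunk.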
+ static const std::vector default_sub_chunk = {make_pair(0, 1)};
+ messages[shard_read.pg_shard].subchunks[hoid] = default_sub_chunk;
+ }
+ rop.obj_to_source[hoid].insert(shard_read.pg_shard);
+ rop.source_to_obj[shard_read.pg_shard].insert(hoid);
}
- for (const auto& read : i->second.to_read) {
- auto p = make_pair(read.offset, read.size);
- pair<uint64_t, uint64_t> chunk_off_len = sinfo.chunk_aligned_offset_len_to_chunk(p);
- for (auto k = i->second.need.begin();
- k != i->second.need.end();
- ++k) {
- messages[k->first].to_read[i->first].push_back(
- boost::make_tuple(
- chunk_off_len.first,
- chunk_off_len.second,
- read.flags));
+ for (auto &[_, shard_read]: read_request.shard_reads) {
+ ceph_assert(!shard_read.extents.empty());
+ rop.debug_log.emplace_back(ECUtil::READ_REQUEST, shard_read.pg_shard,
+ shard_read.extents);
+ for (auto &[start, len]: shard_read.extents) {
+ messages[shard_read.pg_shard].to_read[hoid].emplace_back(
+ boost::make_tuple(start, len, read_request.flags));
}
- ceph_assert(!need_attrs);
}
+ ceph_assert(!need_attrs);
}
std::vector<std::pair<int, Message*>> m;
m.reserve(messages.size());
- for (map<pg_shard_t, ECSubRead>::iterator i = messages.begin();
- i != messages.end();
- ++i) {
- op.in_progress.insert(i->first);
- shard_to_read_map[i->first].insert(op.tid);
- i->second.tid = tid;
- MOSDECSubOpRead *msg = new MOSDECSubOpRead;
+ for (auto &&[pg_shard, read]: messages) {
+ rop.in_progress.insert(pg_shard);
+ shard_to_read_map[pg_shard].insert(rop.tid);
+ read.tid = tid;
+ auto *msg = new MOSDECSubOpRead;
msg->set_priority(priority);
- msg->pgid = spg_t(
- get_info().pgid.pgid,
- i->first.shard);
+ msg->pgid = spg_t(get_info().pgid.pgid, pg_shard.shard);
msg->map_epoch = get_osdmap_epoch();
msg->min_epoch = get_parent()->get_interval_start_epoch();
- msg->op = i->second;
+ msg->op = read;
msg->op.from = get_parent()->whoami_shard();
msg->op.tid = tid;
- if (op.trace) {
+ if (rop.trace) {
// initialize a child span for this shard
- msg->trace.init("ec sub read", nullptr, &op.trace);
- msg->trace.keyval("shard", i->first.shard.id);
+ msg->trace.init("ec sub read", nullptr, &rop.trace);
+ msg->trace.keyval("shard", pg_shard.shard.id);
}
- m.push_back(std::make_pair(i->first.osd, msg));
+ m.push_back(std::make_pair(pg_shard.osd, msg));
}
if (!m.empty()) {
get_parent()->send_message_osd_cluster(m, get_osdmap_epoch());
}
- dout(10) << __func__ << ": started " << op << dendl;
+ dout(10) << __func__ << ": started " << rop << dendl;
}
void ECCommon::ReadPipeline::get_want_to_read_shards(
- std::set<int> *want_to_read) const
-{
- for (raw_shard_id_t i; i < (int)sinfo.get_k(); ++i) {
- want_to_read->insert(static_cast<int>(sinfo.get_shard(i)));
+ const list<ec_align_t> &to_read,
+ ECUtil::shard_extent_set_t &want_shard_reads) {
+ if (sinfo.supports_partial_reads()) {
+ // Optimised.
+ for (const auto &single_region: to_read) {
+ get_min_want_to_read_shards(single_region, want_shard_reads);
+ }
+ return;
+ }
+
+ // Non-optimised version.
+ for (const shard_id_t shard: sinfo.get_data_shards()) {
+ for (auto &&read: to_read) {
+ auto &&[offset, len] = sinfo.chunk_aligned_ro_range_to_shard_ro_range(
+ read.offset, read.size);
+ want_shard_reads[shard].union_insert(offset, len);
+ }
}
}
-struct ClientReadCompleter : ECCommon::ReadCompleter {
+struct ClientReadCompleter final : ECCommon::ReadCompleter {
ClientReadCompleter(ECCommon::ReadPipeline &read_pipeline,
- ECCommon::ClientAsyncReadStatus *status)
+ ECCommon::ClientAsyncReadStatus *status
+ )
: read_pipeline(read_pipeline),
status(status) {}
void finish_single_request(
- const hobject_t &hoid,
- ECCommon::read_result_t &res,
- list<ec_align_t> to_read,
- set<int> wanted_to_read) override
- {
- auto* cct = read_pipeline.cct;
+ const hobject_t &hoid,
+ ECCommon::read_result_t &&res,
+ ECCommon::read_request_t &req) override {
+ auto *cct = read_pipeline.cct;
dout(20) << __func__ << " completing hoid=" << hoid
- << " res=" << res << " to_read=" << to_read << dendl;
+ << " res=" << res << " req=" << req << dendl;
extent_map result;
- if (res.r != 0)
- goto out;
- ceph_assert(res.returned.size() == to_read.size());
- ceph_assert(res.errors.empty());
- for (auto &&read: to_read) {
- const auto bounds = make_pair(read.offset, read.size);
- const auto aligned =
- read_pipeline.sinfo.offset_len_to_chunk_bounds(bounds);
- ceph_assert(res.returned.front().get<0>() == aligned.first);
- ceph_assert(res.returned.front().get<1>() == aligned.second);
- map<int, bufferlist> to_decode;
- bufferlist bl;
- for (map<pg_shard_t, bufferlist>::iterator j =
- res.returned.front().get<2>().begin();
- j != res.returned.front().get<2>().end();
- ++j) {
- to_decode[static_cast<int>(j->first.shard)] = std::move(j->second);
- }
- dout(20) << __func__ << " going to decode: "
- << " wanted_to_read=" << wanted_to_read
- << " to_decode=" << to_decode
- << dendl;
- int r = ECUtil::decode(
- read_pipeline.sinfo,
- read_pipeline.ec_impl,
- wanted_to_read,
- to_decode,
- &bl);
- if (r < 0) {
- dout(10) << __func__ << " error on ECUtil::decode r=" << r << dendl;
- res.r = r;
- goto out;
- }
- bufferlist trimmed;
- // If partial stripe reads are disabled aligned_offset_in_stripe will
- // be 0 which will mean trim_offset is 0. When partial reads are enabled
- // the shards read (wanted_to_read) is a union of the requirements for
- // each stripe, each range being read may need to trim unneeded shards
- uint64_t aligned_offset_in_stripe = aligned.first -
- read_pipeline.sinfo.logical_to_prev_stripe_offset(aligned.first);
- uint64_t chunk_size = read_pipeline.sinfo.get_chunk_size();
- uint64_t trim_offset = 0;
- for (auto shard : wanted_to_read) {
- int s = static_cast<int>(read_pipeline.sinfo.get_raw_shard(shard_id_t(shard)));
- if ( s * chunk_size < aligned_offset_in_stripe) {
- trim_offset += chunk_size;
- } else {
- break;
- }
+ if (res.r == 0) {
+ ceph_assert(res.errors.empty());
+#if DEBUG_EC_BUFFERS
+ dout(20) << __func__ << ": before decode: " << res.buffers_read.debug_string(2048, 8) << dendl;
+#endif
+ /* Decode any missing buffers */
+ int r = res.buffers_read.decode(read_pipeline.ec_impl,
+ req.shard_want_to_read,
+ req.object_size);
+ ceph_assert(r == 0);
+
+#if DEBUG_EC_BUFFERS
+ dout(20) << __func__ << ": after decode: " << res.buffers_read.debug_string(2048, 8) << dendl;
+#endif
+
+ for (auto &&read: req.to_read) {
+ result.insert(read.offset, read.size,
+ res.buffers_read.get_ro_buffer(read.offset, read.size));
}
- auto off = read.offset + trim_offset - aligned.first;
- dout(20) << __func__ << " bl.length()=" << bl.length()
- << " off=" << off
- << " read.offset=" << read.offset
- << " read.size=" << read.size
- << " trim_offset="<< trim_offset << dendl;
- ceph_assert(read.size <= bl.length() - off);
- trimmed.substr_of(bl, off, read.size);
- result.insert(
- read.offset, trimmed.length(), std::move(trimmed));
- res.returned.pop_front();
}
-out:
dout(20) << __func__ << " calling complete_object with result="
<< result << dendl;
- status->complete_object(hoid, res.r, std::move(result));
+ status->complete_object(hoid, res.r, std::move(result),
+ std::move(res.buffers_read));
read_pipeline.kick_reads();
}
- void finish(int priority) && override
- {
+ void finish(int priority) && override {
// NOP
}
ECCommon::ReadPipeline &read_pipeline;
ECCommon::ClientAsyncReadStatus *status;
};
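
finish_single_request above relies on buffers_read.decode() to rebuild whatever the surviving shards did not return before the per-extent buffers are stitched into the result map. As a minimal, self-contained illustration of the reconstruction idea only (a toy k=2 plus one XOR parity chunk, not the erasure-code plugin interface used here):

#include <cstddef>
#include <vector>

using chunk_t = std::vector<unsigned char>;

// With parity = d0 ^ d1, any single missing chunk is the XOR of the other two.
chunk_t xor_decode(const chunk_t &a, const chunk_t &b) {
  chunk_t out(a.size());
  for (std::size_t i = 0; i < a.size(); ++i) {
    out[i] = a[i] ^ b[i];
  }
  return out;
}

// e.g. if shard d1 was unavailable: chunk_t d1 = xor_decode(d0, parity);
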
-static ostream& _prefix(std::ostream *_dout, ClientReadCompleter *read_completer) {
+
+static ostream &_prefix(std::ostream *_dout,
+ ClientReadCompleter const *read_completer) {
return _prefix(_dout, &read_completer->read_pipeline);
}
void ECCommon::ReadPipeline::objects_read_and_reconstruct(
- const map<hobject_t, std::list<ec_align_t>> &reads,
- bool fast_read,
- GenContextURef<ECCommon::ec_extents_t &&> &&func)
-{
- in_progress_client_reads.emplace_back(
- reads.size(), std::move(func));
+ const map<hobject_t, std::list<ec_align_t>> &reads,
+ const bool fast_read,
+ const uint64_t object_size,
+ GenContextURef<ec_extents_t&&> &&func) {
+ in_progress_client_reads.emplace_back(reads.size(), std::move(func));
if (!reads.size()) {
kick_reads();
return;
}
- map<hobject_t, set<int>> obj_want_to_read;
-
map<hobject_t, read_request_t> for_read_op;
- for (auto &&to_read: reads) {
- set<int> want_to_read;
- if (cct->_conf->osd_ec_partial_reads) {
- for (const auto& single_region : to_read.second) {
- get_min_want_to_read_shards(single_region.offset,
- single_region.size,
- &want_to_read);
- }
- } else {
- get_want_to_read_shards(&want_to_read);
- }
- map<pg_shard_t, vector<pair<int, int>>> shards;
- int r = get_min_avail_to_read_shards(
- to_read.first,
- want_to_read,
+ for (auto &&[hoid, to_read]: reads) {
+ ECUtil::shard_extent_set_t want_shard_reads(sinfo.get_k_plus_m());
+ get_want_to_read_shards(to_read, want_shard_reads);
+
+ read_request_t read_request(to_read, want_shard_reads, false, object_size);
+ const int r = get_min_avail_to_read_shards(
+ hoid,
false,
fast_read,
- &shards);
+ read_request);
ceph_assert(r == 0);
- int subchunk_size =
- sinfo.get_chunk_size() / ec_impl->get_sub_chunk_count();
+ const int subchunk_size =
+ sinfo.get_chunk_size() / ec_impl->get_sub_chunk_count();
dout(20) << __func__
+ << " to_read=" << to_read
<< " subchunk_size=" << subchunk_size
<< " chunk_size=" << sinfo.get_chunk_size() << dendl;
- for_read_op.insert(
- make_pair(
- to_read.first,
- read_request_t(
- to_read.second,
- shards,
- false)));
- obj_want_to_read.insert(make_pair(to_read.first, want_to_read));
+ for_read_op.insert(make_pair(hoid, read_request));
}
start_read_op(
CEPH_MSG_PRIO_DEFAULT,
- obj_want_to_read,
for_read_op,
- OpRequestRef(),
fast_read,
false,
- std::make_unique<ClientReadCompleter>(*this, &(in_progress_client_reads.back())));
+ std::make_unique<ClientReadCompleter>(
+ *this, &(in_progress_client_reads.back())));
}
+void ECCommon::ReadPipeline::objects_read_and_reconstruct_for_rmw(
+ map<hobject_t, read_request_t> &&to_read,
+ GenContextURef<ec_extents_t&&> &&func) {
+ in_progress_client_reads.emplace_back(to_read.size(), std::move(func));
+ if (!to_read.size()) {
+ kick_reads();
+ return;
+ }
-int ECCommon::ReadPipeline::send_all_remaining_reads(
- const hobject_t &hoid,
- ReadOp &rop)
-{
- set<int> already_read;
- const set<pg_shard_t>& ots = rop.obj_to_source[hoid];
- for (set<pg_shard_t>::iterator i = ots.begin(); i != ots.end(); ++i)
- already_read.insert(static_cast<int>(i->shard));
- dout(10) << __func__ << " have/error shards=" << already_read << dendl;
- map<pg_shard_t, vector<pair<int, int>>> shards;
- int r = get_remaining_shards(hoid, already_read, rop.want_to_read[hoid],
- rop.complete[hoid], &shards, rop.for_recovery);
- if (r)
- return r;
+ map<hobject_t, read_request_t> for_read_op;
+ for (auto &&[hoid, read_request]: to_read) {
+ const int r =
+ get_min_avail_to_read_shards(hoid, false, false, read_request);
+ ceph_assert(r == 0);
+
+ const int subchunk_size = sinfo.get_chunk_size() / ec_impl->
+ get_sub_chunk_count();
+ dout(20) << __func__
+ << " read_request=" << read_request
+ << " subchunk_size=" << subchunk_size
+ << " chunk_size=" << sinfo.get_chunk_size() << dendl;
- list<ec_align_t> to_read = rop.to_read.find(hoid)->second.to_read;
+ for_read_op.insert(make_pair(hoid, read_request));
+ }
+ start_read_op(
+ CEPH_MSG_PRIO_DEFAULT,
+ for_read_op, false, false,
+ std::make_unique<ClientReadCompleter>(
+ *this, &(in_progress_client_reads.back())));
+}
+
+
+int ECCommon::ReadPipeline::send_all_remaining_reads(
+ const hobject_t &hoid,
+ ReadOp &rop) {
// (Note cuixf) If we need to read attrs and the read failed, try to read again.
- bool want_attrs =
- rop.to_read.find(hoid)->second.want_attrs &&
- (!rop.complete[hoid].attrs || rop.complete[hoid].attrs->empty());
+ const bool want_attrs =
+ rop.to_read.at(hoid).want_attrs &&
+ (!rop.complete.at(hoid).attrs || rop.complete.at(hoid).attrs->empty());
if (want_attrs) {
dout(10) << __func__ << " want attrs again" << dendl;
}
- rop.to_read.erase(hoid);
- rop.to_read.insert(make_pair(
- hoid,
- read_request_t(
- to_read,
- shards,
- want_attrs)));
- return 0;
+ read_request_t &read_request = rop.to_read.at(hoid);
+ // Reset the old shard reads; we are going to issue them again.
+ read_request.shard_reads.clear();
+ return get_remaining_shards(hoid, rop.complete.at(hoid), read_request,
+ rop.do_redundant_reads, want_attrs);
}
-void ECCommon::ReadPipeline::kick_reads()
-{
+void ECCommon::ReadPipeline::kick_reads() {
while (in_progress_client_reads.size() &&
in_progress_client_reads.front().is_complete()) {
- in_progress_client_reads.front().run();
- in_progress_client_reads.pop_front();
+ in_progress_client_reads.front().run();
+ in_progress_client_reads.pop_front();
}
}
-
-void ECCommon::RMWPipeline::start_rmw(OpRef op)
-{
- ceph_assert(op);
- dout(10) << __func__ << ": " << *op << dendl;
-
- ceph_assert(!tid_to_op_map.count(op->tid));
- waiting_state.push_back(*op);
- tid_to_op_map[op->tid] = std::move(op);
- check_ops();
+bool ec_align_t::operator==(const ec_align_t &other) const {
+ return offset == other.offset && size == other.size && flags == other.flags;
}
-bool ECCommon::RMWPipeline::try_state_to_reads()
-{
- if (waiting_state.empty())
- return false;
-
- Op *op = &(waiting_state.front());
- if (op->requires_rmw() && pipeline_state.cache_invalid()) {
- ceph_assert(get_parent()->get_pool().allows_ecoverwrites());
- dout(20) << __func__ << ": blocking " << *op
- << " because it requires an rmw and the cache is invalid "
- << pipeline_state
- << dendl;
- return false;
- }
-
- if (!pipeline_state.caching_enabled()) {
- op->using_cache = false;
- } else if (op->invalidates_cache()) {
- dout(20) << __func__ << ": invalidating cache after this op"
- << dendl;
- pipeline_state.invalidate();
- }
-
- waiting_state.pop_front();
- waiting_reads.push_back(*op);
-
- if (op->using_cache) {
- cache.open_write_pin(op->pin);
+bool ECCommon::shard_read_t::operator==(const shard_read_t &other) const {
+ return extents == other.extents &&
+ subchunk == other.subchunk &&
+ pg_shard == other.pg_shard;
+}
- extent_set empty;
- for (auto &&hpair: op->plan.will_write) {
- auto to_read_plan_iter = op->plan.to_read.find(hpair.first);
- const extent_set &to_read_plan =
- to_read_plan_iter == op->plan.to_read.end() ?
- empty :
- to_read_plan_iter->second;
+bool ECCommon::read_request_t::operator==(const read_request_t &other) const {
+ return to_read == other.to_read &&
+ flags == other.flags &&
+ shard_want_to_read == other.shard_want_to_read &&
+ shard_reads == other.shard_reads &&
+ want_attrs == other.want_attrs;
+}
- extent_set remote_read = cache.reserve_extents_for_rmw(
- hpair.first,
- op->pin,
- hpair.second,
- to_read_plan);
+void ECCommon::RMWPipeline::start_rmw(OpRef op) {
+ dout(20) << __func__ << " op=" << *op << dendl;
- extent_set pending_read = to_read_plan;
- pending_read.subtract(remote_read);
+ ceph_assert(!tid_to_op_map.contains(op->tid));
+ tid_to_op_map[op->tid] = op;
- if (!remote_read.empty()) {
- op->remote_read[hpair.first] = std::move(remote_read);
- }
- if (!pending_read.empty()) {
- op->pending_read[hpair.first] = std::move(pending_read);
- }
- }
- } else {
- op->remote_read = op->plan.to_read;
- }
+ op->pending_cache_ops = op->plan.plans.size();
+ waiting_commit.push_back(op);
- dout(10) << __func__ << ": " << *op << dendl;
-
- if (!op->remote_read.empty()) {
- ceph_assert(get_parent()->get_pool().allows_ecoverwrites());
- objects_read_async_no_cache(
- op->remote_read,
- [op, this](ec_extents_t &&results) {
- for (auto &&i: results) {
- op->remote_read_result.emplace(make_pair(i.first, i.second.emap));
- }
- check_ops();
+ for (auto &plan: op->plan.plans) {
+ ECExtentCache::OpRef cache_op = extent_cache.prepare(plan.hoid,
+ plan.to_read,
+ plan.will_write,
+ plan.orig_size,
+ plan.projected_size,
+ plan.invalidates_cache,
+ [op](ECExtentCache::OpRef const &cop)
+ {
+ op->cache_ready(cop->get_hoid(), cop->get_result());
});
+ op->cache_ops.emplace_back(std::move(cache_op));
}
-
- return true;
+ extent_cache.execute(op->cache_ops);
}
-bool ECCommon::RMWPipeline::try_reads_to_commit()
-{
- if (waiting_reads.empty())
- return false;
- Op *op = &(waiting_reads.front());
- if (op->read_in_progress())
- return false;
- waiting_reads.pop_front();
- waiting_commit.push_back(*op);
-
- dout(10) << __func__ << ": starting commit on " << *op << dendl;
- dout(20) << __func__ << ": " << cache << dendl;
-
+void ECCommon::RMWPipeline::cache_ready(Op &op) {
get_parent()->apply_stats(
- op->hoid,
- op->delta_stats);
-
- if (op->using_cache) {
- for (auto &&hpair: op->pending_read) {
- op->remote_read_result[hpair.first].insert(
- cache.get_remaining_extents_for_rmw(
- hpair.first,
- op->pin,
- hpair.second));
- }
- op->pending_read.clear();
- } else {
- ceph_assert(op->pending_read.empty());
- }
+ op.hoid,
+ op.delta_stats);
- map<shard_id_t, ObjectStore::Transaction> trans;
- for (set<pg_shard_t>::const_iterator i =
- get_parent()->get_acting_recovery_backfill_shards().begin();
- i != get_parent()->get_acting_recovery_backfill_shards().end();
- ++i) {
- trans.emplace(i->shard, get_parent()->min_peer_features());
+ shard_id_map<ObjectStore::Transaction> trans(sinfo.get_k_plus_m());
+ for (auto &&shard: get_parent()->
+ get_acting_recovery_backfill_shard_id_set()) {
+ trans[shard];
}
- op->trace.event("start ec write");
+ op.trace.event("start ec write");
- map<hobject_t,extent_map> written;
- op->generate_transactions(
+ map<hobject_t, ECUtil::shard_extent_map_t> written;
+ op.generate_transactions(
ec_impl,
get_parent()->get_info().pgid.pgid,
sinfo,
&written,
&trans,
get_parent()->get_dpp(),
- get_osdmap()->require_osd_release);
+ get_osdmap());
- dout(20) << __func__ << ": " << cache << dendl;
- dout(20) << __func__ << ": written: " << written << dendl;
- dout(20) << __func__ << ": op: " << *op << dendl;
+ dout(20) << __func__ << ": written: " << written << ", op: " << op << dendl;
- if (!get_parent()->get_pool().allows_ecoverwrites()) {
- for (auto &&i: op->log_entries) {
+ if (!sinfo.supports_ec_overwrites()) {
+ for (auto &&i: op.log_entries) {
if (i.requires_kraken()) {
- derr << __func__ << ": log entry " << i << " requires kraken"
- << " but overwrites are not enabled!" << dendl;
- ceph_abort();
+ derr << __func__ << ": log entry " << i << " requires kraken"
+ << " but overwrites are not enabled!" << dendl;
+ ceph_abort();
}
}
}
- map<hobject_t,extent_set> written_set;
- for (auto &&i: written) {
- written_set[i.first] = i.second.get_interval_set();
- }
- dout(20) << __func__ << ": written_set: " << written_set << dendl;
- ceph_assert(written_set == op->plan.will_write);
-
- if (op->using_cache) {
- for (auto &&hpair: written) {
- dout(20) << __func__ << ": " << hpair << dendl;
- cache.present_rmw_update(hpair.first, op->pin, hpair.second);
- }
- }
- op->remote_read.clear();
- op->remote_read_result.clear();
-
ObjectStore::Transaction empty;
bool should_write_local = false;
ECSubWrite local_write_op;
std::vector<std::pair<int, Message*>> messages;
messages.reserve(get_parent()->get_acting_recovery_backfill_shards().size());
set<pg_shard_t> backfill_shards = get_parent()->get_backfill_shards();
- for (set<pg_shard_t>::const_iterator i =
- get_parent()->get_acting_recovery_backfill_shards().begin();
- i != get_parent()->get_acting_recovery_backfill_shards().end();
- ++i) {
- op->pending_apply.insert(*i);
- op->pending_commit.insert(*i);
- map<shard_id_t, ObjectStore::Transaction>::iterator iter =
- trans.find(i->shard);
- ceph_assert(iter != trans.end());
- bool should_send = get_parent()->should_send_op(*i, op->hoid);
+
+ if (op.version.version != 0) {
+ if (oid_to_version.contains(op.hoid)) {
+ ceph_assert(oid_to_version.at(op.hoid) <= op.version);
+ }
+ oid_to_version[op.hoid] = op.version;
+ }
+ for (auto &&pg_shard: get_parent()->get_acting_recovery_backfill_shards()) {
+ ObjectStore::Transaction &transaction = trans.at(pg_shard.shard);
+ shard_id_t shard = pg_shard.shard;
+ if (transaction.empty()) {
+ dout(20) << __func__ << " Transaction for osd." << pg_shard.osd << " shard " << shard << " is empty" << dendl;
+ } else {
+ dout(20) << __func__ << " Transaction for osd." << pg_shard.osd << " shard " << shard << " contents ";
+ Formatter *f = Formatter::create("json");
+ f->open_object_section("t");
+ transaction.dump(f);
+ f->close_section();
+ f->flush(*_dout);
+ delete f;
+ *_dout << dendl;
+ }
+ if (op.skip_transaction(pending_roll_forward, shard, transaction)) {
+ // Must be an empty transaction
+ ceph_assert(transaction.empty());
+ dout(20) << __func__ << " Skipping transaction for shard " << shard << dendl;
+ continue;
+ }
+ op.pending_commits++;
+ bool should_send = get_parent()->should_send_op(pg_shard, op.hoid);
const pg_stat_t &stats =
- (should_send || !backfill_shards.count(*i)) ?
- get_info().stats :
- get_parent()->get_shard_info().find(*i)->second.stats;
+ (should_send || !backfill_shards.contains(pg_shard))
+ ? get_info().stats
+ : get_parent()->get_shard_info().find(pg_shard)->second.stats;
ECSubWrite sop(
get_parent()->whoami_shard(),
- op->tid,
- op->reqid,
- op->hoid,
+ op.tid,
+ op.reqid,
+ op.hoid,
stats,
- should_send ? iter->second : empty,
- op->version,
- op->trim_to,
- op->pg_committed_to,
- op->log_entries,
- op->updated_hit_set_history,
- op->temp_added,
- op->temp_cleared,
+ should_send ? transaction : empty,
+ op.version,
+ op.trim_to,
+ op.pg_committed_to,
+ op.log_entries,
+ op.updated_hit_set_history,
+ op.temp_added,
+ op.temp_cleared,
!should_send);
ZTracer::Trace trace;
- if (op->trace) {
+ if (op.trace) {
// initialize a child span for this shard
- trace.init("ec sub write", nullptr, &op->trace);
- trace.keyval("shard", i->shard.id);
+ trace.init("ec sub write", nullptr, &op.trace);
+ trace.keyval("shard", pg_shard.shard.id);
}
- if (*i == get_parent()->whoami_shard()) {
+ if (pg_shard == get_parent()->whoami_shard()) {
should_write_local = true;
local_write_op.claim(sop);
} else if (cct->_conf->bluestore_debug_inject_read_err &&
- ECInject::test_write_error1(ghobject_t(op->hoid,
- ghobject_t::NO_GEN, i->shard))) {
+ ECInject::test_write_error1(ghobject_t(op.hoid,
+ ghobject_t::NO_GEN,
+ pg_shard.shard))) {
dout(0) << " Error inject - Dropping write message to shard " <<
- i->shard << dendl;
+ pg_shard.shard << dendl;
} else {
- MOSDECSubOpWrite *r = new MOSDECSubOpWrite(sop);
- r->pgid = spg_t(get_parent()->primary_spg_t().pgid, i->shard);
+ auto *r = new MOSDECSubOpWrite(sop);
+ r->pgid = spg_t(get_parent()->primary_spg_t().pgid, pg_shard.shard);
r->map_epoch = get_osdmap_epoch();
r->min_epoch = get_parent()->get_interval_start_epoch();
r->trace = trace;
- messages.push_back(std::make_pair(i->osd, r));
+ messages.push_back(std::make_pair(pg_shard.osd, r));
}
}
if (should_write_local) {
handle_sub_write(
get_parent()->whoami_shard(),
- op->client_op,
+ op.client_op,
local_write_op,
- op->trace);
+ op.trace);
}
- for (auto i = op->on_write.begin();
- i != op->on_write.end();
- op->on_write.erase(i++)) {
- (*i)();
- }
- return true;
+ for (auto &cop: op.cache_ops) {
+ const hobject_t &oid = cop->get_hoid();
+ if (written.contains(oid)) {
+ extent_cache.write_done(cop, std::move(written.at(oid)));
+ } else {
+ extent_cache.write_done(cop, ECUtil::shard_extent_map_t(&sinfo));
+ }
+ }
}
-struct ECDummyOp : ECCommon::RMWPipeline::Op {
+struct ECDummyOp final : ECCommon::RMWPipeline::Op {
void generate_transactions(
- ceph::ErasureCodeInterfaceRef &ecimpl,
+ ceph::ErasureCodeInterfaceRef &ec_impl,
pg_t pgid,
const ECUtil::stripe_info_t &sinfo,
- std::map<hobject_t,extent_map> *written,
- std::map<shard_id_t, ObjectStore::Transaction> *transactions,
+ map<hobject_t, ECUtil::shard_extent_map_t> *written,
+ shard_id_map<ObjectStore::Transaction> *transactions,
DoutPrefixProvider *dpp,
- const ceph_release_t require_osd_release) final
- {
+ const OSDMapRef &osdmap
+ ) override {
// NOP, as -- in contrast to ECClassicalOp -- there is no
// transaction involved
}
+
+ bool skip_transaction(
+ std::set<shard_id_t> &pending_roll_forward,
+ const shard_id_t shard,
+ ceph::os::Transaction &transaction
+ ) override {
+ return !pending_roll_forward.erase(shard);
+ }
};
-bool ECCommon::RMWPipeline::try_finish_rmw()
-{
- if (waiting_commit.empty())
- return false;
- Op *op = &(waiting_commit.front());
- if (op->write_in_progress())
- return false;
- waiting_commit.pop_front();
+void ECCommon::RMWPipeline::try_finish_rmw() {
+ while (waiting_commit.size() > 0) {
+ OpRef op = waiting_commit.front();
+
+ if (op->pending_commits != 0 || op->pending_cache_ops != 0) {
+ return;
+ }
+
+ waiting_commit.pop_front();
+ finish_rmw(op);
+ }
+}
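
try_finish_rmw only ever retires ops from the front of waiting_commit, so a write whose sub-op commits or cache ops are still outstanding holds back everything queued behind it, preserving commit order. A minimal sketch of that head-of-line gate, assuming each op exposes simple outstanding counters (names are illustrative):

#include <deque>
#include <memory>

struct demo_op_t {
  int pending_commits = 0;
  int pending_cache_ops = 0;
  bool ready() const { return pending_commits == 0 && pending_cache_ops == 0; }
};

// Finish ops strictly in submission order; stop at the first op still waiting.
template <typename FinishFn>
void drain_in_order(std::deque<std::shared_ptr<demo_op_t>> &waiting,
                    FinishFn &&finish) {
  while (!waiting.empty() && waiting.front()->ready()) {
    auto op = waiting.front();
    waiting.pop_front();
    finish(op);
  }
}
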
+
+void ECCommon::RMWPipeline::finish_rmw(OpRef const &op) {
+ dout(20) << __func__ << " op=" << *op << dendl;
- dout(10) << __func__ << ": " << *op << dendl;
- dout(20) << __func__ << ": " << cache << dendl;
+ if (op->on_all_commit) {
+ dout(10) << __func__ << " Calling on_all_commit on " << op << dendl;
+ op->on_all_commit->complete(0);
+ op->on_all_commit = nullptr;
+ op->trace.event("ec write all committed");
+ }
if (op->pg_committed_to > completed_to)
completed_to = op->pg_committed_to;
if (op->version > committed_to)
committed_to = op->version;
- if (get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
- if (op->version > get_parent()->get_log().get_can_rollback_to() &&
- waiting_reads.empty() &&
- waiting_commit.empty()) {
+ op->cache_ops.clear();
+
+ if (extent_cache.idle()) {
+ if (op->version > get_parent()->get_log().get_can_rollback_to()) {
+ const int transactions_since_last_idle = extent_cache.
+ get_and_reset_counter();
+ dout(20) << __func__ << " version=" << op->version << " ec_counter=" <<
+ transactions_since_last_idle << dendl;
// submit a dummy, transaction-empty op to kick the rollforward
- auto tid = get_parent()->get_tid();
- auto nop = std::make_unique<ECDummyOp>();
+ const auto tid = get_parent()->get_tid();
+ const auto nop = std::make_shared<ECDummyOp>();
nop->hoid = op->hoid;
nop->trim_to = op->trim_to;
nop->pg_committed_to = op->version;
nop->tid = tid;
nop->reqid = op->reqid;
- waiting_reads.push_back(*nop);
- tid_to_op_map[tid] = std::move(nop);
- }
- }
+ nop->pending_cache_ops = 1;
+ nop->pipeline = this;
- if (op->using_cache) {
- cache.release_write_pin(op->pin);
- }
- tid_to_op_map.erase(op->tid);
+ tid_to_op_map[tid] = nop;
- if (waiting_reads.empty() &&
- waiting_commit.empty()) {
- pipeline_state.clear();
- dout(20) << __func__ << ": clearing pipeline_state "
- << pipeline_state
- << dendl;
+ /* The cache is idle (we checked above) and this IO never blocks for reads,
+ * so we can skip the extent cache and immediately call the completion.
+ */
+ nop->cache_ready(nop->hoid, ECUtil::shard_extent_map_t(&sinfo));
+ }
}
- return true;
-}
-void ECCommon::RMWPipeline::check_ops()
-{
- while (try_state_to_reads() ||
- try_reads_to_commit() ||
- try_finish_rmw());
+ tid_to_op_map.erase(op->tid);
}
-void ECCommon::RMWPipeline::on_change()
-{
+void ECCommon::RMWPipeline::on_change() {
dout(10) << __func__ << dendl;
completed_to = eversion_t();
committed_to = eversion_t();
- pipeline_state.clear();
- waiting_reads.clear();
- waiting_state.clear();
- waiting_commit.clear();
- for (auto &&op: tid_to_op_map) {
- cache.release_write_pin(op.second->pin);
- }
+ extent_cache.on_change();
tid_to_op_map.clear();
+ oid_to_version.clear();
+ waiting_commit.clear();
+}
+
+void ECCommon::RMWPipeline::on_change2() {
+ extent_cache.on_change2();
}
void ECCommon::RMWPipeline::call_write_ordered(std::function<void(void)> &&cb) {
- if (!waiting_state.empty()) {
- waiting_state.back().on_write.emplace_back(std::move(cb));
- } else if (!waiting_reads.empty()) {
- waiting_reads.back().on_write.emplace_back(std::move(cb));
- } else {
- // Nothing earlier in the pipeline, just call it
- cb();
- }
+ extent_cache.add_on_write(std::move(cb));
}
ECUtil::HashInfoRef ECCommon::UnstableHashInfoRegistry::maybe_put_hash_info(
- const hobject_t &hoid,
- ECUtil::HashInfo &&hinfo)
-{
+ const hobject_t &hoid,
+ ECUtil::HashInfo &&hinfo) {
return registry.lookup_or_create(hoid, hinfo);
}
ECUtil::HashInfoRef ECCommon::UnstableHashInfoRegistry::get_hash_info(
- const hobject_t &hoid,
- bool create,
- const map<string, bufferlist, less<>>& attrs,
- uint64_t size)
-{
+ const hobject_t &hoid,
+ bool create,
+ const map<string, bufferlist, less<>> &attrs,
+ uint64_t size) {
dout(10) << __func__ << ": Getting attr on " << hoid << dendl;
- ECUtil::HashInfoRef ref = registry.lookup(hoid);
+ auto ref = registry.lookup(hoid);
if (!ref) {
dout(10) << __func__ << ": not in cache " << hoid << dendl;
ECUtil::HashInfo hinfo(ec_impl->get_chunk_count());
bufferlist bl;
- map<string, bufferlist>::const_iterator k = attrs.find(ECUtil::get_hinfo_key());
- if (k == attrs.end()) {
- dout(5) << __func__ << " " << hoid << " missing hinfo attr" << dendl;
+ if (attrs.contains(ECUtil::get_hinfo_key())) {
+ bl = attrs.at(ECUtil::get_hinfo_key());
} else {
- bl = k->second;
+ dout(30) << __func__ << " " << hoid << " missing hinfo attr" << dendl;
}
if (bl.length() > 0) {
auto bp = bl.cbegin();
try {
decode(hinfo, bp);
- } catch(...) {
+ }
+ catch (...) {
dout(0) << __func__ << ": Can't decode hinfo for " << hoid << dendl;
return ECUtil::HashInfoRef();
}
dout(0) << __func__ << ": Mismatch of total_chunk_size "
<< hinfo.get_total_chunk_size() << dendl;
return ECUtil::HashInfoRef();
- } else {
- create = true;
}
- } else if (size == 0) { // If empty object and no hinfo, create it
+ create = true;
+ } else if (size == 0) {
+ // If empty object and no hinfo, create it
create = true;
}
if (create) {
}
return ref;
}
-
-END_IGNORE_DEPRECATED
#include "ECUtil.h"
#include "ECTypes.h"
#if WITH_CRIMSON
-#include "ExtentCache.h"
#include "crimson/osd/object_context.h"
#include "os/Transaction.h"
#include "osd/OSDMap.h"
struct WritePlan {
bool invalidates_cache = false; // Yes, both are possible
std::map<hobject_t,extent_set> to_read;
- std::map<hobject_t,extent_set> will_write; // superset of to_read
+ std::map<hobject_t,extent_set> will_write;
std::map<hobject_t,ECUtil::HashInfoRef> hash_infos;
};
#endif
#include "ECTransaction.h"
-#include "ExtentCache.h"
+#include "ECExtentCache.h"
#include "ECListener.h"
+#include "common/dout.h"
//forward declaration
struct ECSubWrite;
struct ec_extent_t {
int err;
extent_map emap;
+ ECUtil::shard_extent_map_t shard_extent_map;
+
+ void print(std::ostream &os) const {
+ os << err << "," << emap;
+ }
};
- friend std::ostream &operator<<(std::ostream &lhs, const ec_extent_t &rhs);
+
using ec_extents_t = std::map<hobject_t, ec_extent_t>;
virtual ~ECCommon() = default;
virtual void handle_sub_write(
- pg_shard_t from,
- OpRequestRef msg,
- ECSubWrite &op,
- const ZTracer::Trace &trace,
- ECListener& eclistener
- ) = 0;
+ pg_shard_t from,
+ OpRequestRef msg,
+ ECSubWrite &op,
+ const ZTracer::Trace &trace,
+ ECListener &eclistener) = 0;
virtual void objects_read_and_reconstruct(
- const std::map<hobject_t, std::list<ec_align_t>> &reads,
- bool fast_read,
- GenContextURef<ec_extents_t &&> &&func) = 0;
+ const std::map<hobject_t, std::list<ec_align_t>> &reads,
+ bool fast_read,
+ uint64_t object_size,
+ GenContextURef<ec_extents_t&&> &&func) = 0;
+
+ struct shard_read_t {
+ extent_set extents;
+ std::optional<std::vector<std::pair<int, int>>> subchunk;
+ pg_shard_t pg_shard;
+ bool operator==(const shard_read_t &other) const;
+
+ void print(std::ostream &os) const {
+ os << "shard_read_t(extents=[" << extents << "]"
+ << ", subchunk=" << subchunk
+ << ", pg_shard=" << pg_shard
+ << ")";
+ }
+ };
struct read_request_t {
const std::list<ec_align_t> to_read;
- std::map<pg_shard_t, std::vector<std::pair<int, int>>> need;
- bool want_attrs;
+ const uint32_t flags = 0;
+ const ECUtil::shard_extent_set_t shard_want_to_read;
+ shard_id_map<shard_read_t> shard_reads;
+ bool want_attrs = false;
+ uint64_t object_size;
+
read_request_t(
- const std::list<ec_align_t> &to_read,
- const std::map<pg_shard_t, std::vector<std::pair<int, int>>> &need,
- bool want_attrs)
- : to_read(to_read), need(need), want_attrs(want_attrs) {}
+ const std::list<ec_align_t> &to_read,
+ const ECUtil::shard_extent_set_t &shard_want_to_read,
+ bool want_attrs, uint64_t object_size) :
+ to_read(to_read),
+ flags(to_read.front().flags),
+ shard_want_to_read(shard_want_to_read),
+ shard_reads(shard_want_to_read.get_max_shards()),
+ want_attrs(want_attrs),
+ object_size(object_size) {}
+
+ read_request_t(const ECUtil::shard_extent_set_t &shard_want_to_read,
+ bool want_attrs, uint64_t object_size) :
+ shard_want_to_read(shard_want_to_read),
+ shard_reads(shard_want_to_read.get_max_shards()),
+ want_attrs(want_attrs),
+ object_size(object_size) {}
+
+ bool operator==(const read_request_t &other) const;
+
+ void print(std::ostream &os) const {
+ os << "read_request_t(to_read=[" << to_read << "]"
+ << ", flags=" << flags
+ << ", shard_want_to_read=" << shard_want_to_read
+ << ", shard_reads=" << shard_reads
+ << ", want_attrs=" << want_attrs
+ << ")";
+ }
};
- friend std::ostream &operator<<(std::ostream &lhs, const read_request_t &rhs);
+
+ virtual void objects_read_and_reconstruct_for_rmw(
+ std::map<hobject_t, read_request_t> &&to_read,
+ GenContextURef<ec_extents_t&&> &&func) = 0;
+
struct ReadOp;
/**
* Low level async read mechanism
struct read_result_t {
int r;
std::map<pg_shard_t, int> errors;
- std::optional<std::map<std::string, ceph::buffer::list, std::less<>> > attrs;
- std::list<
- boost::tuple<
- uint64_t, uint64_t, std::map<pg_shard_t, ceph::buffer::list> > > returned;
- read_result_t() : r(0) {}
+ std::optional<std::map<std::string, ceph::buffer::list, std::less<>>> attrs;
+ ECUtil::shard_extent_map_t buffers_read;
+ ECUtil::shard_extent_set_t processed_read_requests;
+
+ read_result_t(const ECUtil::stripe_info_t *sinfo) :
+ r(0), buffers_read(sinfo),
+ processed_read_requests(sinfo->get_k_plus_m()) {}
+
+ void print(std::ostream &os) const {
+ os << "read_result_t(r=" << r << ", errors=" << errors;
+ if (attrs) {
+ os << ", attrs=" << *(attrs);
+ } else {
+ os << ", noattrs";
+ }
+ os << ", buffers_read=" << buffers_read << ")";
+ }
};
struct ReadCompleter {
virtual void finish_single_request(
- const hobject_t &hoid,
- read_result_t &res,
- std::list<ec_align_t> to_read,
- std::set<int> wanted_to_read) = 0;
+ const hobject_t &hoid,
+ read_result_t &&res,
+ ECCommon::read_request_t &req) = 0;
virtual void finish(int priority) && = 0;
};
friend struct CallClientContexts;
+
struct ClientAsyncReadStatus {
unsigned objects_to_read;
- GenContextURef<ec_extents_t &&> func;
+ GenContextURef<ec_extents_t&&> func;
ec_extents_t results;
+
explicit ClientAsyncReadStatus(
- unsigned objects_to_read,
- GenContextURef<ec_extents_t &&> &&func)
+ unsigned objects_to_read,
+ GenContextURef<ec_extents_t&&> &&func)
: objects_to_read(objects_to_read), func(std::move(func)) {}
+
void complete_object(
- const hobject_t &hoid,
- int err,
- extent_map &&buffers) {
+ const hobject_t &hoid,
+ int err,
+ extent_map &&buffers,
+ ECUtil::shard_extent_map_t &&shard_extent_map) {
ceph_assert(objects_to_read);
--objects_to_read;
- ceph_assert(!results.count(hoid));
- results.emplace(hoid, ec_extent_t{err, std::move(buffers)});
+ ceph_assert(!results.contains(hoid));
+ results.emplace(hoid, ec_extent_t{
+ err, std::move(buffers),
+ std::move(shard_extent_map)
+ });
}
+
bool is_complete() const {
return objects_to_read == 0;
}
+
void run() {
func.release()->complete(std::move(results));
}
ZTracer::Trace trace;
- std::map<hobject_t, std::set<int>> want_to_read;
std::map<hobject_t, read_request_t> to_read;
std::map<hobject_t, read_result_t> complete;
std::map<hobject_t, std::set<pg_shard_t>> obj_to_source;
- std::map<pg_shard_t, std::set<hobject_t> > source_to_obj;
+ std::map<pg_shard_t, std::set<hobject_t>> source_to_obj;
void dump(ceph::Formatter *f) const;
std::set<pg_shard_t> in_progress;
+ std::list<ECUtil::log_entry_t> debug_log;
+
ReadOp(
- int priority,
- ceph_tid_t tid,
- bool do_redundant_reads,
- bool for_recovery,
- std::unique_ptr<ReadCompleter> _on_complete,
- OpRequestRef op,
- std::map<hobject_t, std::set<int>> &&_want_to_read,
- std::map<hobject_t, read_request_t> &&_to_read)
+ int priority,
+ ceph_tid_t tid,
+ bool do_redundant_reads,
+ bool for_recovery,
+ std::unique_ptr<ReadCompleter> _on_complete,
+ std::map<hobject_t, read_request_t> &&_to_read)
: priority(priority),
tid(tid),
- op(op),
do_redundant_reads(do_redundant_reads),
for_recovery(for_recovery),
on_complete(std::move(_on_complete)),
- want_to_read(std::move(_want_to_read)),
- to_read(std::move(_to_read)) {
- for (auto &&hpair: to_read) {
- auto &returned = complete[hpair.first].returned;
- for (auto &&extent: hpair.second.to_read) {
- returned.push_back(
- boost::make_tuple(
- extent.offset,
- extent.size,
- std::map<pg_shard_t, ceph::buffer::list>()));
- }
- }
- }
+ to_read(std::move(_to_read)) {}
+
ReadOp() = delete;
ReadOp(const ReadOp &) = delete; // due to on_complete being unique_ptr
ReadOp(ReadOp &&) = default;
+
+ void print(std::ostream &os) const {
+ os << "ReadOp(tid=" << tid;
+#ifndef WITH_CRIMSON
+ if (op && op->get_req()) {
+ os << ", op=";
+ op->get_req()->print(os);
+ }
+#endif
+ os << ", to_read=" << to_read << ", complete=" << complete
+ << ", priority=" << priority << ", obj_to_source=" << obj_to_source
+ << ", source_to_obj=" << source_to_obj << ", in_progress=" <<
+ in_progress
+ << ", debug_log=" << debug_log << ")";
+ }
};
+
struct ReadPipeline {
void objects_read_and_reconstruct(
- const std::map<hobject_t, std::list<ec_align_t>> &reads,
- bool fast_read,
- GenContextURef<ec_extents_t &&> &&func);
+ const std::map<hobject_t, std::list<ec_align_t>> &reads,
+ bool fast_read,
+ uint64_t object_size,
+ GenContextURef<ec_extents_t&&> &&func);
+
+ void objects_read_and_reconstruct_for_rmw(
+ std::map<hobject_t, read_request_t> &&to_read,
+ GenContextURef<ECCommon::ec_extents_t&&> &&func);
template <class F, class G>
void filter_read_op(
- const OSDMapRef& osdmap,
- ReadOp &op,
- F&& on_erase,
- G&& on_schedule_recovery);
+ const OSDMapRef &osdmap,
+ ReadOp &op,
+ F &&on_erase,
+ G &&on_schedule_recovery);
template <class F, class G>
void check_recovery_sources(
- const OSDMapRef& osdmap,
- F&& on_erase,
- G&& on_schedule_recovery);
+ const OSDMapRef &osdmap,
+ F &&on_erase,
+ G &&on_schedule_recovery);
- void complete_read_op(ReadOp &rop);
+ void complete_read_op(ReadOp &&rop);
void start_read_op(
- int priority,
- std::map<hobject_t, std::set<int>> &want_to_read,
- std::map<hobject_t, read_request_t> &to_read,
- OpRequestRef op,
- bool do_redundant_reads,
- bool for_recovery,
- std::unique_ptr<ReadCompleter> on_complete);
+ int priority,
+ std::map<hobject_t, read_request_t> &to_read,
+ bool do_redundant_reads,
+ bool for_recovery,
+ std::unique_ptr<ReadCompleter> on_complete);
void do_read_op(ReadOp &rop);
int send_all_remaining_reads(
- const hobject_t &hoid,
- ReadOp &rop);
+ const hobject_t &hoid,
+ ReadOp &rop);
void on_change();
void kick_reads();
std::map<ceph_tid_t, ReadOp> tid_to_read_map;
- std::map<pg_shard_t, std::set<ceph_tid_t> > shard_to_read_map;
+ std::map<pg_shard_t, std::set<ceph_tid_t>> shard_to_read_map;
std::list<ClientAsyncReadStatus> in_progress_client_reads;
- CephContext* cct;
+ CephContext *cct;
ceph::ErasureCodeInterfaceRef ec_impl;
- const ECUtil::stripe_info_t& sinfo;
+ const ECUtil::stripe_info_t &sinfo;
// TODO: lay an interface down here
- ECListener* parent;
+ ECListener *parent;
ECListener *get_parent() const { return parent; }
- const OSDMapRef& get_osdmap() const { return get_parent()->pgb_get_osdmap(); }
- epoch_t get_osdmap_epoch() const { return get_parent()->pgb_get_osdmap_epoch(); }
- const pg_info_t &get_info() { return get_parent()->get_info(); }
- ReadPipeline(CephContext* cct,
- ceph::ErasureCodeInterfaceRef ec_impl,
- const ECUtil::stripe_info_t& sinfo,
- ECListener* parent)
+ const OSDMapRef &get_osdmap() const {
+ return get_parent()->pgb_get_osdmap();
+ }
+
+ epoch_t get_osdmap_epoch() const {
+ return get_parent()->pgb_get_osdmap_epoch();
+ }
+
+ const pg_info_t &get_info() const { return get_parent()->get_info(); }
+
+ ReadPipeline(CephContext *cct,
+ ceph::ErasureCodeInterfaceRef ec_impl,
+ const ECUtil::stripe_info_t &sinfo,
+ ECListener *parent)
: cct(cct),
ec_impl(std::move(ec_impl)),
sinfo(sinfo),
- parent(parent) {
- }
+ parent(parent) {}
/**
* While get_want_to_read_shards creates a want_to_read based on the EC
*
*/
void get_min_want_to_read_shards(
- uint64_t offset, ///< [in]
- uint64_t length, ///< [in]
- std::set<int> *want_to_read ///< [out]
- );
- static void get_min_want_to_read_shards(
- const uint64_t offset,
- const uint64_t length,
- const ECUtil::stripe_info_t& sinfo,
- std::set<int> *want_to_read);
+ const ec_align_t &to_read, ///< [in]
+ ECUtil::shard_extent_set_t &want_shard_reads); ///< [out]
int get_remaining_shards(
- const hobject_t &hoid,
- const std::set<int> &avail,
- const std::set<int> &want,
- const read_result_t &result,
- std::map<pg_shard_t, std::vector<std::pair<int, int>>> *to_read,
- bool for_recovery);
+ const hobject_t &hoid,
+ read_result_t &read_result,
+ read_request_t &read_request,
+ bool for_recovery,
+ bool fast_read);
void get_all_avail_shards(
- const hobject_t &hoid,
- const std::set<pg_shard_t> &error_shards,
- std::set<int> &have,
- std::map<shard_id_t, pg_shard_t> &shards,
- bool for_recovery);
+ const hobject_t &hoid,
+ shard_id_set &have,
+ shard_id_map<pg_shard_t> &shards,
+ bool for_recovery,
+ const std::optional<std::set<pg_shard_t>> &error_shards = std::nullopt);
+
+ std::pair<const shard_id_set, const shard_id_set> get_readable_writable_shard_id_sets();
- friend std::ostream &operator<<(std::ostream &lhs, const ReadOp &rhs);
friend struct FinishReadOp;
- void get_want_to_read_shards(std::set<int> *want_to_read) const;
+ void get_want_to_read_shards(
+ const std::list<ec_align_t> &to_read,
+ ECUtil::shard_extent_set_t &want_shard_reads);
/// Returns to_read replicas sufficient to reconstruct want
int get_min_avail_to_read_shards(
- const hobject_t &hoid, ///< [in] object
- const std::set<int> &want, ///< [in] desired shards
- bool for_recovery, ///< [in] true if we may use non-acting replicas
- bool do_redundant_reads, ///< [in] true if we want to issue redundant reads to reduce latency
- std::map<pg_shard_t, std::vector<std::pair<int, int>>> *to_read ///< [out] shards, corresponding subchunks to read
+ const hobject_t &hoid, ///< [in] object
+ bool for_recovery, ///< [in] true if we may use non-acting replicas
+ bool do_redundant_reads,
+ ///< [in] true if we want to issue redundant reads to reduce latency
+ read_request_t &read_request,
+ ///< [out] shard_reads, corresponding subchunks / other sub reads to read
+ const std::optional<std::set<pg_shard_t>> &error_shards = std::nullopt
+ ///< [in] Shards where reads have failed (optional)
); ///< @return error code, 0 on success
-
- void schedule_recovery_work();
-
};
/**
* on the writing std::list.
*/
- struct RMWPipeline {
+ struct RMWPipeline : ECExtentCache::BackendReadListener {
struct Op : boost::intrusive::list_base_hook<> {
/// From submit_transaction caller, describes operation
hobject_t hoid;
/// Ancillary also provided from submit_transaction caller
std::map<hobject_t, ObjectContextRef> obc_map;
- /// see call_write_ordered
- std::list<std::function<void(void)> > on_write;
-
/// Generated internally
std::set<hobject_t> temp_added;
std::set<hobject_t> temp_cleared;
ECTransaction::WritePlan plan;
- bool requires_rmw() const { return !plan.to_read.empty(); }
- bool invalidates_cache() const { return plan.invalidates_cache; }
+ bool requires_rmw() const { return !plan.want_read; }
// must be true if requires_rmw(), must be false if invalidates_cache()
bool using_cache = true;
/// In progress read state;
- std::map<hobject_t,extent_set> pending_read; // subset already being read
- std::map<hobject_t,extent_set> remote_read; // subset we must read
- std::map<hobject_t,extent_map> remote_read_result;
- bool read_in_progress() const {
- return !remote_read.empty() && remote_read_result.empty();
- }
+ int pending_cache_ops = 0;
+ std::map<hobject_t, ECUtil::shard_extent_map_t> remote_shard_extent_map;
/// In progress write state.
- std::set<pg_shard_t> pending_commit;
- // we need pending_apply for pre-mimic peers so that we don't issue a
- // read on a remote shard before it has applied a previous write. We can
- // remove this after nautilus.
- std::set<pg_shard_t> pending_apply;
+ int pending_commits = 0;
+
bool write_in_progress() const {
- return !pending_commit.empty() || !pending_apply.empty();
+ return pending_commits != 0;
}
/// optional, may be null, for tracking purposes
OpRequestRef client_op;
/// pin for cache
- ExtentCache::write_pin pin;
+ std::list<ECExtentCache::OpRef> cache_ops;
+ RMWPipeline *pipeline;
+
+ Op() : tid(), plan(), pipeline(nullptr) {}
/// Callbacks
Context *on_all_commit = nullptr;
+
virtual ~Op() {
delete on_all_commit;
}
virtual void generate_transactions(
- ceph::ErasureCodeInterfaceRef &ecimpl,
- pg_t pgid,
- const ECUtil::stripe_info_t &sinfo,
- std::map<hobject_t,extent_map> *written,
- std::map<shard_id_t, ceph::os::Transaction> *transactions,
- DoutPrefixProvider *dpp,
- const ceph_release_t require_osd_release = ceph_release_t::unknown) = 0;
- };
- using OpRef = std::unique_ptr<Op>;
- using op_list = boost::intrusive::list<Op>;
- friend std::ostream &operator<<(std::ostream &lhs, const Op &rhs);
+ ceph::ErasureCodeInterfaceRef &ec_impl,
+ pg_t pgid,
+ const ECUtil::stripe_info_t &sinfo,
+ std::map<hobject_t, ECUtil::shard_extent_map_t> *written,
+ shard_id_map<ceph::os::Transaction> *transactions,
+ DoutPrefixProvider *dpp,
+ const OSDMapRef &osdmap) = 0;
+
+ virtual bool skip_transaction(
+ std::set<shard_id_t> &pending_roll_forward,
+ shard_id_t shard,
+ ceph::os::Transaction &transaction) = 0;
+
+ void cache_ready(const hobject_t &oid, const ECUtil::shard_extent_map_t &result) {
+ if (!result.empty()) {
+ remote_shard_extent_map.insert(std::pair(oid, result));
+ }
- ExtentCache cache;
- std::map<ceph_tid_t, OpRef> tid_to_op_map; /// Owns Op structure
- /**
- * We model the possible rmw states as a std::set of waitlists.
- * All writes at this time complete in order, so a write blocked
- * at waiting_state blocks all writes behind it as well (same for
- * other states).
- *
- * Future work: We can break this up into a per-object pipeline
- * (almost). First, provide an ordering token to submit_transaction
- * and require that all operations within a single transaction take
- * place on a subset of hobject_t space partitioned by that token
- * (the hashid seem about right to me -- even works for temp objects
- * if you recall that a temp object created for object head foo will
- * only ever be referenced by other transactions on foo and aren't
- * reused). Next, factor this part into a class and maintain one per
- * ordering token. Next, fixup PrimaryLogPG's repop queue to be
- * partitioned by ordering token. Finally, refactor the op pipeline
- * so that the log entries passed into submit_transaction aren't
- * versioned. We can't assign versions to them until we actually
- * submit the operation. That's probably going to be the hard part.
- */
- class pipeline_state_t {
- enum {
- CACHE_VALID = 0,
- CACHE_INVALID = 1
- } pipeline_state = CACHE_VALID;
- public:
- bool caching_enabled() const {
- return pipeline_state == CACHE_VALID;
- }
- bool cache_invalid() const {
- return !caching_enabled();
- }
- void invalidate() {
- pipeline_state = CACHE_INVALID;
+ if (!--pending_cache_ops) {
+ pipeline->cache_ready(*this);
+ }
}
- void clear() {
- pipeline_state = CACHE_VALID;
+
+ void print(std::ostream &os) const {
+ os << "Op(" << hoid << " v=" << version << " tt=" << trim_to
+ << " tid=" << tid << " reqid=" << reqid;
+#ifndef WITH_CRIMSON
+ if (client_op && client_op->get_req()) {
+ os << " client_op=";
+ client_op->get_req()->print(os);
+ }
+#endif
+ os << " pg_committed_to=" << pg_committed_to
+ << " temp_added=" << temp_added
+ << " temp_cleared=" << temp_cleared
+ << " remote_read_result=" << remote_shard_extent_map
+ << " pending_commits=" << pending_commits
+ << " plan.to_read=" << plan
+ << ")";
}
- friend std::ostream &operator<<(std::ostream &lhs, const pipeline_state_t &rhs);
- } pipeline_state;
+ };
+
+ void backend_read(hobject_t oid, ECUtil::shard_extent_set_t const &request,
+ uint64_t object_size) override {
+ std::map<hobject_t, read_request_t> to_read;
+ to_read.emplace(oid, read_request_t(request, false, object_size));
+
+ objects_read_async_no_cache(
+ std::move(to_read),
+ [this](ec_extents_t &&results) {
+ for (auto &&[oid, result]: results) {
+ extent_cache.read_done(oid, std::move(result.shard_extent_map));
+ }
+ });
+ }
- op_list waiting_state; /// writes waiting on pipe_state
- op_list waiting_reads; /// writes waiting on partial stripe reads
- op_list waiting_commit; /// writes waiting on initial commit
+ using OpRef = std::shared_ptr<Op>;
+
+ std::map<ceph_tid_t, OpRef> tid_to_op_map; /// Owns Op structure
+ std::map<hobject_t, eversion_t> oid_to_version;
+
+ std::list<OpRef> waiting_commit;
eversion_t completed_to;
eversion_t committed_to;
void start_rmw(OpRef op);
- bool try_state_to_reads();
- bool try_reads_to_commit();
- bool try_finish_rmw();
- void check_ops();
+ void cache_ready(Op &op);
+ void try_finish_rmw();
+ void finish_rmw(OpRef const &op);
void on_change();
+ void on_change2();
void call_write_ordered(std::function<void(void)> &&cb);
- CephContext* cct;
+ CephContext *cct;
ECListener *get_parent() const { return parent; }
- const OSDMapRef& get_osdmap() const { return get_parent()->pgb_get_osdmap(); }
- epoch_t get_osdmap_epoch() const { return get_parent()->pgb_get_osdmap_epoch(); }
- const pg_info_t &get_info() { return get_parent()->get_info(); }
+
+ const OSDMapRef &get_osdmap() const {
+ return get_parent()->pgb_get_osdmap();
+ }
+
+ epoch_t get_osdmap_epoch() const {
+ return get_parent()->pgb_get_osdmap_epoch();
+ }
+
+ const pg_info_t &get_info() const { return get_parent()->get_info(); }
template <typename Func>
void objects_read_async_no_cache(
- const std::map<hobject_t,extent_set> &to_read,
- Func &&on_complete
- ) {
- std::map<hobject_t, std::list<ec_align_t>> _to_read;
- for (auto &&hpair: to_read) {
- auto &l = _to_read[hpair.first];
- for (auto extent: hpair.second) {
- l.emplace_back(ec_align_t{extent.first, extent.second, 0});
- }
- }
- ec_backend.objects_read_and_reconstruct(
- _to_read,
- false,
+ std::map<hobject_t, read_request_t> &&to_read,
+ Func &&on_complete) {
+ ec_backend.objects_read_and_reconstruct_for_rmw(
+ std::move(to_read),
make_gen_lambda_context<
- ECCommon::ec_extents_t &&, Func>(
- std::forward<Func>(on_complete)));
+ ECCommon::ec_extents_t&&, Func>(
+ std::forward<Func>(on_complete)));
}
+
void handle_sub_write(
- pg_shard_t from,
- OpRequestRef msg,
- ECSubWrite &op,
- const ZTracer::Trace &trace
- ) {
- ec_backend.handle_sub_write(from, std::move(msg), op, trace, *get_parent());
+ pg_shard_t from,
+ OpRequestRef msg,
+ ECSubWrite &op,
+ const ZTracer::Trace &trace) const {
+ ec_backend.handle_sub_write(from, std::move(msg), op, trace,
+ *get_parent());
}
+
// end of iface
+ // Set of shards that will need a dummy transaction for the final
+ // roll forward
+ std::set<shard_id_t> pending_roll_forward;
+
ceph::ErasureCodeInterfaceRef ec_impl;
- const ECUtil::stripe_info_t& sinfo;
- ECListener* parent;
- ECCommon& ec_backend;
+ const ECUtil::stripe_info_t &sinfo;
+ ECListener *parent;
+ ECCommon &ec_backend;
+ ECExtentCache extent_cache;
+ uint64_t ec_pdw_write_mode;
- RMWPipeline(CephContext* cct,
+ RMWPipeline(CephContext *cct,
ceph::ErasureCodeInterfaceRef ec_impl,
- const ECUtil::stripe_info_t& sinfo,
- ECListener* parent,
- ECCommon& ec_backend)
+ const ECUtil::stripe_info_t &sinfo,
+ ECListener *parent,
+ ECCommon &ec_backend,
+ ECExtentCache::LRU &ec_extent_cache_lru)
: cct(cct),
ec_impl(std::move(ec_impl)),
sinfo(sinfo),
parent(parent),
- ec_backend(ec_backend) {
- }
+ ec_backend(ec_backend),
+ extent_cache(*this, ec_extent_cache_lru, sinfo, cct),
+ ec_pdw_write_mode(cct->_conf.get_val<uint64_t>("ec_pdw_write_mode")) {}
};
class UnstableHashInfoRegistry {
/// If modified, ensure that the ref is held until the update is applied
SharedPtrRegistry<hobject_t, ECUtil::HashInfo> registry;
- public:
+ public:
UnstableHashInfoRegistry(
- CephContext *cct,
- ceph::ErasureCodeInterfaceRef ec_impl)
+ CephContext *cct,
+ ceph::ErasureCodeInterfaceRef ec_impl)
: cct(cct),
- ec_impl(std::move(ec_impl)) {}
+ ec_impl(std::move(ec_impl)) {}
ECUtil::HashInfoRef maybe_put_hash_info(
- const hobject_t &hoid,
- ECUtil::HashInfo &&hinfo);
+ const hobject_t &hoid,
+ ECUtil::HashInfo &&hinfo);
ECUtil::HashInfoRef get_hash_info(
- const hobject_t &hoid,
- bool create,
- const std::map<std::string, ceph::buffer::list, std::less<>>& attr,
- uint64_t size);
+ const hobject_t &hoid,
+ bool create,
+ const std::map<std::string, ceph::buffer::list, std::less<>> &attrs,
+ uint64_t size);
};
};
-std::ostream &operator<<(std::ostream &lhs,
- const ECCommon::RMWPipeline::pipeline_state_t &rhs);
-std::ostream &operator<<(std::ostream &lhs,
- const ECCommon::read_request_t &rhs);
-std::ostream &operator<<(std::ostream &lhs,
- const ECCommon::read_result_t &rhs);
-std::ostream &operator<<(std::ostream &lhs,
- const ECCommon::ReadOp &rhs);
-std::ostream &operator<<(std::ostream &lhs,
- const ECCommon::RMWPipeline::Op &rhs);
-
template <class F, class G>
void ECCommon::ReadPipeline::check_recovery_sources(
- const OSDMapRef& osdmap,
- F&& on_erase,
- G&& on_schedule_recovery)
-{
+ const OSDMapRef &osdmap,
+ F &&on_erase,
+ G &&on_schedule_recovery
+ ) {
std::set<ceph_tid_t> tids_to_filter;
- for (std::map<pg_shard_t, std::set<ceph_tid_t> >::iterator
+ for (std::map<pg_shard_t, std::set<ceph_tid_t>>::iterator
i = shard_to_read_map.begin();
- i != shard_to_read_map.end();
- ) {
+ i != shard_to_read_map.end();) {
if (osdmap->is_down(i->first.osd)) {
tids_to_filter.insert(i->second.begin(), i->second.end());
shard_to_read_map.erase(i++);
template <class F, class G>
void ECCommon::ReadPipeline::filter_read_op(
- const OSDMapRef& osdmap,
- ReadOp &op,
- F&& on_erase,
- G&& on_schedule_recovery)
-{
+ const OSDMapRef &osdmap,
+ ReadOp &op,
+ F &&on_erase,
+ G &&on_schedule_recovery
+ ) {
std::set<hobject_t> to_cancel;
- for (std::map<pg_shard_t, std::set<hobject_t> >::iterator i = op.source_to_obj.begin();
- i != op.source_to_obj.end();
- ++i) {
- if (osdmap->is_down(i->first.osd)) {
- to_cancel.insert(i->second.begin(), i->second.end());
- op.in_progress.erase(i->first);
- continue;
+ for (auto &&[pg_shard, hoid_set] : op.source_to_obj) {
+ if (osdmap->is_down(pg_shard.osd)) {
+ to_cancel.insert(hoid_set.begin(), hoid_set.end());
+ op.in_progress.erase(pg_shard);
}
}
if (to_cancel.empty())
return;
- for (std::map<pg_shard_t, std::set<hobject_t> >::iterator i = op.source_to_obj.begin();
- i != op.source_to_obj.end();
- ) {
- for (std::set<hobject_t>::iterator j = i->second.begin();
- j != i->second.end();
- ) {
- if (to_cancel.count(*j))
- i->second.erase(j++);
- else
- ++j;
+ for (auto iter = op.source_to_obj.begin();
+ iter != op.source_to_obj.end();) {
+ auto &[pg_shard, hoid_set] = *iter;
+ for (auto &hoid : hoid_set) {
+ if (to_cancel.contains(hoid)) {
+ hoid_set.erase(hoid);
+ }
}
- if (i->second.empty()) {
- op.source_to_obj.erase(i++);
+ if (hoid_set.empty()) {
+ op.source_to_obj.erase(iter++);
} else {
- ceph_assert(!osdmap->is_down(i->first.osd));
- ++i;
+ ceph_assert(!osdmap->is_down(pg_shard.osd));
+ ++iter;
}
}
- for (std::set<hobject_t>::iterator i = to_cancel.begin();
- i != to_cancel.end();
- ++i) {
- get_parent()->cancel_pull(*i);
+ for (auto hoid : to_cancel) {
+ get_parent()->cancel_pull(hoid);
- ceph_assert(op.to_read.count(*i));
- op.to_read.erase(*i);
- op.complete.erase(*i);
- on_erase(*i);
+ ceph_assert(op.to_read.contains(hoid));
+ op.to_read.erase(hoid);
+ op.complete.erase(hoid);
+ on_erase(hoid);
}
if (op.in_progress.empty()) {
}
}
-template <> struct fmt::formatter<ECCommon::RMWPipeline::pipeline_state_t> : fmt::ostream_formatter {};
-template <> struct fmt::formatter<ECCommon::read_request_t> : fmt::ostream_formatter {};
-template <> struct fmt::formatter<ECCommon::read_result_t> : fmt::ostream_formatter {};
-template <> struct fmt::formatter<ECCommon::ReadOp> : fmt::ostream_formatter {};
-template <> struct fmt::formatter<ECCommon::RMWPipeline::Op> : fmt::ostream_formatter {};
\ No newline at end of file
+template <>
+struct fmt::formatter<ECCommon::read_request_t> : fmt::ostream_formatter {};
+
+template <>
+struct fmt::formatter<ECCommon::read_result_t> : fmt::ostream_formatter {};
+
+template <>
+struct fmt::formatter<ECCommon::ReadOp> : fmt::ostream_formatter {};
+
+template <>
+struct fmt::formatter<ECCommon::RMWPipeline::Op> : fmt::ostream_formatter {};
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ECExtentCache.h"
+#include "ECUtil.h"
+
+#include <ranges>
+
+using namespace std;
+using namespace ECUtil;
+
+void ECExtentCache::Object::request(OpRef &op) {
+ /* After a cache invalidation, we allow through a single cache-invalidating
+ * IO.
+ */
+ if (op->invalidates_cache) {
+ if (cache_invalidated) {
+ op->invalidates_cache = false;
+ } else {
+ cache_invalidate_expected = true;
+ }
+ }
+ cache_invalidated = false;
+
+ extent_set eset = op->get_pin_eset(line_size);
+
+ for (auto &&[start, len] : eset) {
+ for (uint64_t to_pin = start; to_pin < start + len; to_pin += line_size) {
+ LineRef l;
+ if (!lines.contains(to_pin)) {
+ l = make_shared<Line>(*this, to_pin);
+ if (!l->cache->empty()) {
+ l->cache->to_shard_extent_set(do_not_read);
+ }
+ lines.emplace(to_pin, weak_ptr(l));
+ } else {
+ l = lines.at(to_pin).lock();
+ }
+ op->lines.emplace_back(l);
+ }
+ }
+
+ bool read_required = false;
+
+ /* Deal with reads if there are any.
+ * If any cache invalidation ops have been added, there is no point adding any
+ * reads as they are all going to be thrown away before any of the
+ * post-invalidate ops are honoured.
+ */
+ if (op->reads && !cache_invalidate_expected) {
+ for (auto &&[shard, eset] : *(op->reads)) {
+ extent_set request = eset;
+ if (do_not_read.contains(shard)) {
+ request.subtract(do_not_read.at(shard));
+ }
+
+ if (!request.empty()) {
+ requesting[shard].union_of(request);
+ read_required = true;
+ requesting_ops.emplace_back(op);
+ }
+ }
+ }
+
+
+ /* Calculate the ranges of the object that no longer need to be read. This
+ * will include:
+ * - Any reads being issued by this IO.
+ * - Any writes being issued (these will be cached).
+ * - Any unwritten regions in an append - these can be assumed to be zero.
+ */
+ if (read_required) {
+ do_not_read.insert(requesting);
+ }
+ do_not_read.insert(op->writes);
+ if (op->projected_size > projected_size) {
+ /* This write is growing the size of the object. The newly added region
+ * counts as written (although the cache will not get populated), so future
+ * reads of this area are skipped and treated as reads of zeros.
+ */
+ shard_extent_set_t obj_hole(pg.sinfo.get_k_plus_m());
+ shard_extent_set_t read_mask(pg.sinfo.get_k_plus_m());
+
+ pg.sinfo.ro_size_to_read_mask(op->projected_size, obj_hole);
+ pg.sinfo.ro_size_to_read_mask(projected_size, read_mask);
+ obj_hole.subtract(read_mask);
+ do_not_read.insert(obj_hole);
+ } else if (op->projected_size < projected_size) {
+ // Invalidate the object's cache when we see any object reduce in size.
+ op->invalidates_cache = true;
+ }
+
+ projected_size = op->projected_size;
+
+ if (read_required) send_reads();
+ else op->read_done = true;
+}
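
request() pins every cache line in the op's pin set, where a line is a fixed, line_size-aligned slice of the object. A standalone sketch of the line-selection arithmetic only (the real code keeps the lines as shared_ptrs so the pins are reference counted):

#include <cstdint>
#include <set>

// Return the start offsets of every line_size-aligned line that the byte
// extent [off, off + len) overlaps.
std::set<uint64_t> lines_for_extent(uint64_t off, uint64_t len,
                                    uint64_t line_size) {
  std::set<uint64_t> lines;
  if (len == 0) {
    return lines;
  }
  const uint64_t first = off - (off % line_size);
  const uint64_t last = (off + len - 1) - ((off + len - 1) % line_size);
  for (uint64_t line = first; line <= last; line += line_size) {
    lines.insert(line);
  }
  return lines;
}

// e.g. lines_for_extent(5000, 9000, 4096) pins lines {4096, 8192, 12288}.
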
+
+void ECExtentCache::Object::send_reads() {
+ if (reading || requesting.empty())
+ return; // Read busy
+
+ reading_ops.swap(requesting_ops);
+ pg.backend_read.backend_read(oid, requesting, current_size);
+ requesting.clear();
+ reading = true;
+}
+
+void ECExtentCache::Object::read_done(shard_extent_map_t const &buffers) {
+ reading = false;
+ for (auto &&op : reading_ops) {
+ op->read_done = true;
+ }
+ reading_ops.clear();
+ insert(buffers);
+}
+
+uint64_t ECExtentCache::Object::line_align(uint64_t x) const {
+ return x - (x % line_size);
+}
+
+void ECExtentCache::Object::insert(shard_extent_map_t const &buffers) const {
+ if (buffers.empty()) return;
+
+ /* The following gets quite inefficient for writes that touch the start
+ * and the end of a very large object, since we iterate over the middle.
+ * This seems like a strange use case, so currently this is not being
+ * optimised.
+ */
+ for (uint64_t slice_start = line_align(buffers.get_start_offset());
+ slice_start < buffers.get_end_offset();
+ slice_start += line_size) {
+ shard_extent_map_t slice = buffers.slice_map(slice_start, line_size);
+ if (!slice.empty()) {
+ LineRef l = lines.at(slice_start).lock();
+ /* The line should have been created already! */
+ l->cache->insert(slice);
+ uint64_t old_size = l->size;
+ l->size = l->cache->size();
+ ceph_assert(l->size >= old_size);
+ update_mempool(0, l->size - old_size);
+ }
+ }
+}
+
+void ECExtentCache::Object::write_done(shard_extent_map_t const &buffers,
+ uint64_t new_size) {
+ insert(buffers);
+ current_size = new_size;
+}
+
+void ECExtentCache::Object::unpin(Op &op) const {
+ op.lines.clear();
+ delete_maybe();
+}
+
+void ECExtentCache::Object::delete_maybe() const {
+ if (lines.empty() && active_ios == 0) {
+ pg.objects.erase(oid);
+ }
+}
+
+void check_seset_empty_for_range(shard_extent_set_t s, uint64_t off,
+ uint64_t len) {
+ for (auto &[shard, eset] : s) {
+ ceph_assert(!eset.intersects(off, len));
+ }
+}
+
+void ECExtentCache::Object::erase_line(uint64_t offset) {
+ check_seset_empty_for_range(requesting, offset, line_size);
+ do_not_read.erase_stripe(offset, line_size);
+ lines.erase(offset);
+ delete_maybe();
+}
+
+void ECExtentCache::Object::invalidate(const OpRef &invalidating_op) {
+ for (auto &l : std::views::values(lines)) {
+ auto line = l.lock();
+ line->cache->clear();
+ update_mempool(0, -line->size);
+ line->size = 0;
+ }
+
+ /* Remove all entries from the LRU */
+ pg.lru.remove_object(oid);
+
+ ceph_assert(!reading);
+ do_not_read.clear();
+ requesting.clear();
+ requesting_ops.clear();
+ reading_ops.clear();
+
+ /* Current size should reflect the actual size of the object, which was set
+ * by the previous write. We are going to replay all the writes now, so set
+ * the projected size to that of this op.
+ */
+ projected_size = invalidating_op->projected_size;
+
+ // The ops can now be replayed without invalidating the cache again.
+ invalidating_op->invalidates_cache = false;
+
+ cache_invalidated = true;
+ cache_invalidate_expected = false;
+
+ /* We now need to replay all outstanding ops, so as to regenerate the reads. */
+ for (auto &op : pg.waiting_ops) {
+ if (op->object.oid == oid) {
+ op->read_done = false;
+ request(op);
+ }
+ }
+}
+
+void ECExtentCache::cache_maybe_ready() {
+ while (!waiting_ops.empty()) {
+ OpRef op = waiting_ops.front();
+ if (op->invalidates_cache) {
+ /* We must wait for any outstanding reads to complete. The cache replans
+ * all reads as part of the invalidate. If an in-flight read completes after
+ * the invalidate, it could corrupt the cache, leading to data
+ * corruption at the host.
+ */
+ if (op->object.reading) {
+ return;
+ }
+ op->object.invalidate(op);
+ ceph_assert(!op->invalidates_cache);
+ }
+ /* If complete_if_reads_cached finds all reads complete, it will call the
+ * completion callback. Typically, this will cause the client to execute the
+ * transaction and pop the front of waiting_ops. So we abort if either the
+ * reads are not ready, or the client chooses not to complete the op.
+ */
+ if (!op->complete_if_reads_cached(op)) {
+ return;
+ }
+
+ waiting_ops.pop_front();
+ }
+}
+
+ECExtentCache::OpRef ECExtentCache::prepare(GenContextURef<OpRef&> &&ctx,
+ hobject_t const &oid,
+ std::optional<shard_extent_set_t>
+ const &to_read,
+ shard_extent_set_t const &write,
+ uint64_t orig_size,
+ uint64_t projected_size,
+ bool invalidates_cache) {
+
+ auto object_iter = objects.find(oid);
+ if (object_iter == objects.end()) {
+ auto p = objects.emplace(oid, Object(*this, oid, orig_size));
+ object_iter = p.first;
+ }
+ OpRef op = std::make_shared<Op>(
+ std::move(ctx), object_iter->second, to_read, write, projected_size,
+ invalidates_cache);
+
+ return op;
+}
+
+void ECExtentCache::read_done(hobject_t const &oid,
+ shard_extent_map_t const &update) {
+ objects.at(oid).read_done(update);
+ cache_maybe_ready();
+ objects.at(oid).send_reads();
+}
+
+void ECExtentCache::write_done(OpRef const &op,
+ shard_extent_map_t const &update) {
+ op->write_done(std::move(update));
+}
+
+uint64_t ECExtentCache::get_projected_size(hobject_t const &oid) const {
+ return objects.at(oid).get_projected_size();
+}
+
+bool ECExtentCache::contains_object(hobject_t const &oid) const {
+ return objects.contains(oid);
+}
+
+ECExtentCache::Op::~Op() {
+ ceph_assert(object.active_ios > 0);
+ object.active_ios--;
+ ceph_assert(object.pg.active_ios > 0);
+ object.pg.active_ios--;
+
+ object.unpin(*this);
+}
+
+/* Extent cache cleanup occurs in two parts. The first (on_change()) performs
+ * cleanup of the ops currently managed by the extent cache. At this point,
+ * however, the cache may still be waiting for other parts of EC to clean up
+ * (for example, any outstanding reads). on_change2() executes once all of
+ * that cleanup has occurred.
+ */
+void ECExtentCache::on_change() {
+ for (auto &&o : std::views::values(objects)) {
+ o.reading_ops.clear();
+ o.requesting_ops.clear();
+ o.requesting.clear();
+ }
+ for (auto &&op : waiting_ops) {
+ op->cancel();
+ }
+ waiting_ops.clear();
+}
+
+/* This must be run toward the end of EC on_change handling. It asserts that
+ * any object which automatically self-destructs when idle has done so.
+ * Additionally, it discards the entire LRU cache. This must be done after all
+ * in-flight reads/writes have completed, or we risk attempting to insert data
+ * into the cache after it has been cleared.
+ *
+ * Note that the LRU will end up being called multiple times. With some
+ * additional code complexity this could be fixed for a small (probably
+ * insignificant) performance improvement.
+ */
+void ECExtentCache::on_change2() const {
+ lru.discard();
+ /* If this assert fires in a unit test, make sure that all ops have completed
+ * and cleared any extent cache ops they contain */
+ ceph_assert(objects.empty());
+ ceph_assert(active_ios == 0);
+ ceph_assert(idle());
+}
+
+void ECExtentCache::execute(list<OpRef> &op_list) {
+ for (auto &op : op_list) {
+ op->object.request(op);
+ }
+ waiting_ops.insert(waiting_ops.end(), op_list.begin(), op_list.end());
+ counter++;
+ cache_maybe_ready();
+}
+
+bool ECExtentCache::idle() const {
+ return active_ios == 0;
+}
+
+uint32_t ECExtentCache::get_and_reset_counter() {
+ uint32_t ret = counter;
+ counter = 0;
+ return ret;
+}
+
+list<ECExtentCache::LRU::Key>::iterator ECExtentCache::LRU::erase(
+ const list<Key>::iterator &it,
+ bool do_update_mempool) {
+ uint64_t size_change = map.at(*it).second->size();
+ if (do_update_mempool) {
+ update_mempool(-1, 0 - size_change);
+ }
+ size -= size_change;
+ map.erase(*it);
+ return lru.erase(it);
+}
+
+void ECExtentCache::LRU::add(const Line &line) {
+ if (line.size == 0) {
+ update_mempool(-1, 0);
+ return;
+ }
+
+ const Key k(line.offset, line.object.oid);
+
+ shared_ptr<shard_extent_map_t> cache = line.cache;
+
+ mutex.lock();
+ ceph_assert(!map.contains(k));
+ auto i = lru.insert(lru.end(), k);
+ auto j = make_pair(std::move(i), std::move(cache));
+ map.insert(std::pair(std::move(k), std::move(j)));
+ size += line.size; // This is already accounted for in mempool.
+ free_maybe();
+ mutex.unlock();
+}
+
+shared_ptr<shard_extent_map_t> ECExtentCache::LRU::find(
+ const hobject_t &oid, uint64_t offset) {
+ Key k(offset, oid);
+ shared_ptr<shard_extent_map_t> cache = nullptr;
+ mutex.lock();
+ if (map.contains(k)) {
+ auto &&[lru_iter, c] = map.at(k);
+ cache = c;
+ auto it = lru_iter; // Intentional copy.
+ erase(it, false);
+ }
+ mutex.unlock();
+ return cache;
+}
+
+void ECExtentCache::LRU::remove_object(const hobject_t &oid) {
+ mutex.lock();
+ for (auto it = lru.begin(); it != lru.end();) {
+ if (it->oid == oid) {
+ it = erase(it, true);
+ } else {
+ ++it;
+ }
+ }
+ mutex.unlock();
+}
+
+void ECExtentCache::LRU::free_maybe() {
+ while (max_size < size) {
+ auto it = lru.begin();
+ erase(it, true);
+ }
+}
+
+void ECExtentCache::LRU::discard() {
+ mutex.lock();
+ lru.clear();
+ update_mempool(0 - map.size(), 0 - size);
+ map.clear();
+ size = 0;
+ mutex.unlock();
+}
+
+const extent_set ECExtentCache::Op::get_pin_eset(uint64_t alignment) const {
+ extent_set eset = writes.get_extent_superset();
+ if (reads) {
+ reads->get_extent_superset(eset);
+ }
+ eset.align(alignment);
+
+ return eset;
+}
+
+ECExtentCache::Op::Op(GenContextURef<OpRef&> &&cache_ready_cb,
+ Object &object,
+ std::optional<shard_extent_set_t> const &to_read,
+ shard_extent_set_t const &write,
+ uint64_t projected_size,
+ bool invalidates_cache) :
+ object(object),
+ reads(to_read),
+ writes(write),
+ result(&object.pg.sinfo),
+ invalidates_cache(invalidates_cache),
+ projected_size(projected_size),
+ cache_ready_cb(std::move(cache_ready_cb)) {
+ object.active_ios++;
+ object.pg.active_ios++;
+}
+
+shard_extent_map_t ECExtentCache::Object::get_cache(
+ std::optional<shard_extent_set_t> const &set) const {
+ if (!set) {
+ return shard_extent_map_t(&pg.sinfo);
+ }
+
+ shard_id_map<extent_map> res(pg.sinfo.get_k_plus_m());
+ for (auto &&[shard, eset] : *set) {
+ for (auto [off, len] : eset) {
+ for (uint64_t slice_start = line_align(off);
+ slice_start < off + len;
+ slice_start += line_size) {
+ uint64_t offset = max(slice_start, off);
+ uint64_t length = min(slice_start + line_size, off + len) - offset;
+ // This line must exist, as it was created when the op was created.
+ LineRef l = lines.at(slice_start).lock();
+ if (l->cache->contains_shard(shard)) {
+ extent_map m = l->cache->get_extent_map(shard).intersect(
+ offset, length);
+ if (!m.empty()) {
+ if (!res.contains(shard)) res.emplace(shard, std::move(m));
+ else res.at(shard).insert(m);
+ }
+ }
+ }
+ }
+ }
+ return shard_extent_map_t(&pg.sinfo, std::move(res));
+}
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/* EC "extent" cache. This extent cache attempts to improve performance,
+ * particularly for small sequential writes, by caching the results of recent
+ * reads and writes.
+ *
+ * The cache has two parts: The main cache which is active while an IO is
+ * outstanding to an object and an "LRU" which stashes recent IO according to
+ * a least-recently-used scheme.
+ *
+ * The cache indexes everything by (shard, shard_offset). That is, it
+ * independently tracks a cache for each shard of the EC object. It keeps a
+ * cache even for shards which are currently offline or missing, since the
+ * cache is formed from the results of reads and writes, which are always
+ * required to calculate missing shards.
+ *
+ * The cache allows for a single read to be outstanding per PG at a time. If
+ * multiple writes are received while a read is active, the next read will
+ * contain all necessary reads, so as to catch up. Early on in development, a
+ * more parallel read mechanism was explored but was found to have no benefit.
+ *
+ * This cache will never re-order IO.
+ *
+ * The LRU
+ *
+ * There is one LRU per OSD shard (not to be confused with an EC shard). Since
+ * the OSD shard can have multiple threads, the LRU must have a mutex. This
+ * should not be required for crimson-based pools, since each OSD shard has a
+ * single reactor. Some effort has been made to limit the frequency with which
+ * this mutex is taken.
+ *
+ * The LRU has a maximum size (defined in the constructor) and will keep its
+ * usage below this amount.
+ *
+ * Cache Lines
+ *
+ * The LRU tracks extents of recent writes with cache Lines. These are
+ * simple-to-track ranges of offsets across all shards. Each line represents 32K
+ * of address space on each shard.
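+ * For example, assuming the chunk size is no larger than 32K (so the line
+ * size is 32K), a write at shard offset 40K falls in the line starting at
+ * offset 32K, i.e. line_align(40K) == 32K.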
+ *
+ * A cache line can be owned by:
+ * - No-one (i.e. it is not instantiated)
+ * - Object - an IO is inflight for this cache line
+ * - LRU - A recent IO touched this cache line.
+ *
+ * This simple ownership model means that the locking required for the LRU does
+ * not leak out into the wider extent cache and allows for the entire cache
+ * to be built from reference counting.
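+ * (In code terms: constructing a Line pulls any stashed buffers for its range
+ * back out of the LRU via LRU::find(), and the Line's destructor hands them
+ * back via LRU::add() once no op holds a reference to it.)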
+ *
+ * Client API
+ *
+ * The client has a number of required interactions:
+ * 1. prepare(...). This creates a cache op. All cache ops required for a single
+ * parent op must be prepared before any are executed.
+ * 2. execute(...). Execute an IO. This gives the cache permission to perform
+ * the IO. This function can (and frequently does) call back
+ * re-entrantly, so the caller must be aware that this can
+ * happen.
+ *
+ * The client must provide a mechanism for the extent cache to read. It does
+ * this by extending the ECExtentCache::BackendReadListener class.
+ *
+ * Once a read is complete, the client must call cache.read_done().
+ *
+ * When the cache is ready, it will call back the lambda passed to prepare.
+ * The client is expected to populate the write data, including any parity
+ * data, by calling the cache.write_done() method.
+ *
+ * Finally, there are on_change() and on_change2() methods. The first of these
+ * instructs the extent cache to discard any ops it has queued. The second
+ * discards the LRU and asserts that the cache is now idle; this is to ensure
+ * that the calling code has performed the required clean-up to clear the
+ * extent cache.
+ *
+ * A brief usage sketch follows this comment.
+ */
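+/* A minimal usage sketch (illustrative only: the listener implementation, the
+ * lambda body and the buffer variables are hypothetical, not mandated by this
+ * header):
+ *
+ *   struct Reader : ECExtentCache::BackendReadListener {
+ *     void backend_read(hobject_t oid,
+ *                       ECUtil::shard_extent_set_t const &request,
+ *                       uint64_t object_size) override {
+ *       // Issue the shard reads; once they complete, hand the buffers back
+ *       // with cache.read_done(oid, buffers).
+ *     }
+ *   };
+ *
+ *   // Prepare every cache op needed by a parent op, then execute together.
+ *   std::list<ECExtentCache::OpRef> ops;
+ *   ops.emplace_back(cache.prepare(oid, to_read, will_write, orig_size,
+ *                                  projected_size, false,
+ *     [&](ECExtentCache::OpRef &op) {
+ *       // Reads (if any) are now cached: encode and apply the write, then
+ *       // populate the cache with the written buffers via
+ *       // cache.write_done(op, written_buffers).
+ *     }));
+ *   cache.execute(ops); // may invoke the lambda re-entrantly
+ */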
+
#pragma once
-// Temporary stubs
+#include "ECUtil.h"
+#include "include/Context.h"
+
class ECExtentCache {
+ class Address;
+ class Line;
+ class Object;
+ typedef std::shared_ptr<Line> LineRef;
+ typedef std::list<LineRef>::iterator LineIter;
+
public:
+ class LRU;
+ class Op;
+ typedef std::shared_ptr<Op> OpRef;
+
+ struct BackendReadListener {
+ virtual void backend_read(hobject_t oid,
+ ECUtil::shard_extent_set_t const &request,
+ uint64_t object_size) = 0;
+ virtual ~BackendReadListener() = default;
+ };
+
+ static void update_mempool(int items, int64_t bytes) {
+ mempool::get_pool(mempool::pool_index_t(mempool::mempool_ec_extent_cache)).
+ adjust_count(items, bytes);
+ }
+
class LRU {
public:
- LRU(uint64_t) {}
+ class Key {
+ public:
+ uint64_t offset;
+ hobject_t oid;
+
+ Key(uint64_t offset, const hobject_t &oid) : offset(offset), oid(oid) {};
+
+ friend bool operator==(const Key &lhs, const Key &rhs) {
+ return lhs.offset == rhs.offset
+ && lhs.oid == rhs.oid;
+ }
+
+ friend bool operator!=(const Key &lhs, const Key &rhs) {
+ return !(lhs == rhs);
+ }
+ };
+
+ struct KeyHash {
+ std::size_t operator()(const Key &obj) const {
+ std::size_t seed = 0x625610ED;
+ seed ^= (seed << 6) + (seed >> 2) + 0x1E665363 + static_cast<
+ std::size_t>(obj.offset);
+ seed ^= (seed << 6) + (seed >> 2) + 0x51343C80 + obj.oid.get_hash();
+ return seed;
+ }
+ };
+
+ private:
+ friend class Object;
+ friend class ECExtentCache;
+ std::unordered_map<Key, std::pair<
+ std::list<Key>::iterator, std::shared_ptr<
+ ECUtil::shard_extent_map_t>>, KeyHash> map;
+ std::list<Key> lru;
+ uint64_t max_size = 0;
+ uint64_t size = 0;
+ ceph::mutex mutex = ceph::make_mutex("ECExtentCache::LRU");
+
+ void free_maybe();
+ void discard();
+ void add(const Line &line);
+ void erase(const Key &k);
+ std::list<Key>::iterator erase(const std::list<Key>::iterator &it,
+ bool update_mempool);
+ std::shared_ptr<ECUtil::shard_extent_map_t> find(
+ const hobject_t &oid, uint64_t offset);
+ void remove_object(const hobject_t &oid);
+
+ public:
+ explicit LRU(uint64_t max_size) : map(), max_size(max_size) {}
+ };
+
+ class Op {
+ friend class Object;
+ friend class ECExtentCache;
+
+ Object &object;
+ std::optional<ECUtil::shard_extent_set_t> const reads;
+ ECUtil::shard_extent_set_t const writes;
+ ECUtil::shard_extent_map_t result;
+ bool complete = false;
+ bool invalidates_cache = false;
+ bool reading = false;
+ bool read_done = false;
+ uint64_t projected_size = 0;
+ GenContextURef<OpRef&> cache_ready_cb;
+ std::list<LineRef> lines;
+
+ // List of callbacks to be executed on write completion (not commit)
+ std::list<std::function<void(void)>> on_write;
+
+ const extent_set get_pin_eset(uint64_t alignment) const;
+
+ public:
+ explicit Op(
+ GenContextURef<OpRef&> &&cache_ready_cb,
+ Object &object,
+ std::optional<ECUtil::shard_extent_set_t> const &to_read,
+ ECUtil::shard_extent_set_t const &write,
+ uint64_t projected_size,
+ bool invalidates_cache);
+
+ ~Op();
+ void cancel() { delete cache_ready_cb.release(); }
+ const ECUtil::shard_extent_set_t &get_writes() const { return writes; }
+ const Object &get_object() const { return object; }
+ const hobject_t &get_hoid() const { return object.oid; }
+ const ECUtil::shard_extent_map_t &get_result() { return result; }
+
+ void add_on_write(std::function<void(void)> &&cb) {
+ on_write.emplace_back(std::move(cb));
+ }
+
+ bool complete_if_reads_cached(OpRef &op_ref) {
+ if (!read_done) {
+ return false;
+ }
+ result = object.get_cache(reads);
+ complete = true;
+ cache_ready_cb.release()->complete(op_ref);
+ return true;
+ }
+
+ void write_done(ECUtil::shard_extent_map_t const &update) const {
+ object.write_done(update, projected_size);
+ for (auto &cb: on_write) {
+ cb();
+ }
+ }
};
-};
+
+#define MIN_LINE_SIZE (32UL*1024UL)
+
+private:
+ class Object {
+ friend class Op;
+ friend class LRU;
+ friend class Line;
+ friend class ECExtentCache;
+
+ ECExtentCache &pg;
+ ECUtil::shard_extent_set_t requesting;
+ ECUtil::shard_extent_set_t do_not_read;
+ std::list<OpRef> reading_ops;
+ std::list<OpRef> requesting_ops;
+ // Map of the byte-offset of the start of the line to the line.
+ std::map<uint64_t, std::weak_ptr<Line>> lines;
+ int active_ios = 0;
+ uint64_t current_size = 0;
+ uint64_t projected_size = 0;
+ uint64_t line_size = 0;
+ bool reading = false;
+ bool cache_invalidated = false;
+ bool cache_invalidate_expected = false;
+
+ void request(OpRef &op);
+ void send_reads();
+ void unpin(Op &op) const;
+ void delete_maybe() const;
+ void erase_line(uint64_t offset);
+ void invalidate(const OpRef &invalidating_op);
+
+ public:
+ hobject_t oid;
+
+ Object(ECExtentCache &pg, hobject_t const &oid, uint64_t size) :
+ pg(pg),
+ requesting(pg.sinfo.get_k_plus_m()),
+ do_not_read(pg.sinfo.get_k_plus_m()),
+ current_size(size),
+ projected_size(size),
+ oid(oid) {
+ line_size = std::max(MIN_LINE_SIZE, pg.sinfo.get_chunk_size());
+ }
+
+ void insert(ECUtil::shard_extent_map_t const &buffers) const;
+ void write_done(ECUtil::shard_extent_map_t const &buffers, uint64_t new_size);
+ void read_done(ECUtil::shard_extent_map_t const &result);
+ [[nodiscard]] uint64_t get_projected_size() const { return projected_size; }
+ ECUtil::shard_extent_map_t get_cache(
+ std::optional<ECUtil::shard_extent_set_t> const &set) const;
+ uint64_t line_align(uint64_t line) const;
+ };
+
+
+ class Line {
+ public:
+ uint64_t offset;
+ uint64_t size;
+ std::shared_ptr<ECUtil::shard_extent_map_t> cache;
+ Object &object;
+
+ Line(Object &object,
+ uint64_t offset) :
+ offset(offset),
+ object(object) {
+ std::shared_ptr<ECUtil::shard_extent_map_t> c = object.pg.lru.find(
+ object.oid, offset);
+
+ if (c == nullptr) {
+ cache = std::make_shared<ECUtil::shard_extent_map_t>(&object.pg.sinfo);
+ size = 0;
+ /* We are creating an empty cache line */
+ update_mempool(1, 0);
+ } else {
+ cache = c;
+ size = c->size();
+ }
+ }
+
+ ~Line() {
+ object.pg.lru.add(*this);
+ object.erase_line(offset);
+ }
+
+ friend bool operator==(const Line &lhs, const Line &rhs) {
+ return lhs.offset == rhs.offset
+ && lhs.object.oid == rhs.object.oid;
+ }
+
+ friend bool operator!=(const Line &lhs, const Line &rhs) {
+ return !(lhs == rhs);
+ }
+ };
+
+ std::map<hobject_t, Object> objects;
+ BackendReadListener &backend_read;
+ LRU &lru;
+ const ECUtil::stripe_info_t &sinfo;
+ std::list<OpRef> waiting_ops;
+ void cache_maybe_ready();
+ uint32_t counter = 0;
+ uint32_t active_ios = 0;
+ CephContext *cct;
+
+ OpRef prepare(GenContextURef<OpRef&> &&ctx,
+ hobject_t const &oid,
+ std::optional<ECUtil::shard_extent_set_t> const &to_read,
+ ECUtil::shard_extent_set_t const &write,
+ uint64_t orig_size,
+ uint64_t projected_size,
+ bool invalidates_cache);
+
+ public:
+ ~ECExtentCache() {
+ // This should really only be needed in failed tests, as the PG should
+ // clear up any IO before it gets destructed. However, here we make sure
+ // to clean up any outstanding IO.
+ on_change();
+ on_change2();
+ }
+
+ explicit ECExtentCache(BackendReadListener &backend_read,
+ LRU &lru, const ECUtil::stripe_info_t &sinfo,
+ CephContext *cct
+ ) :
+ backend_read(backend_read),
+ lru(lru),
+ sinfo(sinfo),
+ cct(cct) {}
+
+ // Insert some data into the cache.
+ void read_done(hobject_t const &oid, ECUtil::shard_extent_map_t const &update);
+ void write_done(OpRef const &op, ECUtil::shard_extent_map_t const &update);
+ void on_change();
+ void on_change2() const;
+ [[nodiscard]] bool contains_object(hobject_t const &oid) const;
+ [[nodiscard]] uint64_t get_projected_size(hobject_t const &oid) const;
+
+ template <typename CacheReadyCb>
+ OpRef prepare(hobject_t const &oid,
+ std::optional<ECUtil::shard_extent_set_t> const &to_read,
+ ECUtil::shard_extent_set_t const &write,
+ uint64_t orig_size,
+ uint64_t projected_size,
+ bool invalidates_cache,
+ CacheReadyCb &&ready_cb) {
+ GenContextURef<OpRef&> ctx =
+ make_gen_lambda_context<OpRef&, CacheReadyCb>(
+ std::forward<CacheReadyCb>(ready_cb));
+
+ return prepare(std::move(ctx), oid, to_read, write, orig_size,
+ projected_size, invalidates_cache);
+ }
+
+ void execute(std::list<OpRef> &op_list);
+ [[nodiscard]] bool idle() const;
+ uint32_t get_and_reset_counter();
+
+ void add_on_write(std::function<void(void)> &&cb) const {
+ if (waiting_ops.empty()) {
+ cb();
+ } else {
+ waiting_ops.back()->add_on_write(std::move(cb));
+ }
+ }
+}; // ECExtentCache
using ceph::encode;
using ceph::ErasureCodeInterfaceRef;
-static void encode_and_write(
- pg_t pgid,
- const hobject_t &oid,
- const ECUtil::stripe_info_t &sinfo,
- ErasureCodeInterfaceRef &ecimpl,
- const set<int> &want,
- uint64_t offset,
- bufferlist bl,
- uint32_t flags,
- ECUtil::HashInfoRef hinfo,
- extent_map &written,
- map<shard_id_t, ObjectStore::Transaction> *transactions,
- DoutPrefixProvider *dpp)
-{
- const uint64_t before_size = hinfo->get_total_logical_size(sinfo);
- ceph_assert(sinfo.logical_offset_is_stripe_aligned(offset));
- ceph_assert(sinfo.logical_offset_is_stripe_aligned(bl.length()));
- ceph_assert(bl.length());
-
- map<int, bufferlist> buffers;
- int r = ECUtil::encode(
- sinfo, ecimpl, bl, want, &buffers);
- ceph_assert(r == 0);
+void debug(const hobject_t &oid, const std::string &str,
+ const ECUtil::shard_extent_map_t &map, DoutPrefixProvider *dpp
+ ) {
+#if DEBUG_EC_BUFFERS
+ ldpp_dout(dpp, 20)
+ << "EC_DEBUG_BUFFERS: generate_transactions: "
+ << "oid: " << oid
+ << " " << str << " " << map.debug_string(2048, 8) << dendl;
+#else
+ ldpp_dout(dpp, 20)
+ << "generate_transactions: "
+ << "oid: " << oid
+ << str << map << dendl;
+#endif
+}
- written.insert(offset, bl.length(), bl);
+void ECTransaction::Generate::encode_and_write() {
+ // For PDW, we already have necessary parity buffers.
+ if (!plan.do_parity_delta_write) {
+ to_write.insert_parity_buffers();
+ }
+ // If partial writes are not supported, pad out to_write to a full stripe.
+ if (!sinfo.supports_partial_writes()) {
+ for (auto &&[shard, eset]: plan.will_write) {
+ if (sinfo.get_raw_shard(shard) >= sinfo.get_k()) continue;
+
+ for (auto [off, len]: eset) {
+ to_write.zero_pad(shard, off, len);
+ }
+ }
+ }
+
+ int r = 0;
+ if (plan.do_parity_delta_write) {
+ /* For parity delta writes, we remove any unwanted writes before calculating
+ * the parity.
+ */
+ read_sem->zero_pad(plan.will_write);
+ to_write.pad_with_other(plan.will_write, *read_sem);
+ r = to_write.encode_parity_delta(ec_impl, *read_sem);
+ } else {
+ r = to_write.encode(ec_impl, plan.hinfo, plan.orig_size);
+ }
+ ceph_assert(r == 0);
+ // Remove any unnecessary writes.
+ //to_write = to_write.intersect(plan.will_write);
+
+ debug(oid, "parity", to_write, dpp);
ldpp_dout(dpp, 20) << __func__ << ": " << oid
- << " new_size "
- << offset + bl.length()
- << dendl;
-
- if (offset >= before_size) {
- ceph_assert(offset == before_size);
- hinfo->append(
- sinfo.aligned_logical_offset_to_chunk_offset(offset),
- buffers);
- }
-
- for (auto &&i : *transactions) {
- ceph_assert(buffers.count(static_cast<int>(i.first)));
- bufferlist &enc_bl = buffers[static_cast<int>(i.first)];
- if (offset >= before_size) {
- i.second.set_alloc_hint(
- coll_t(spg_t(pgid, i.first)),
- ghobject_t(oid, ghobject_t::NO_GEN, i.first),
- 0, 0,
- CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE |
- CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY);
+ << " plan " << plan
+ << dendl;
+
+ for (auto &&[shard, to_write_eset]: plan.will_write) {
+ /* Zero pad, even if we are not writing. The extent cache requires that
+ * all shards are fully populated with write data, even if the OSDs are
+ * down. This is not a fundamental requirement of the cache, but dealing
+ * with implied zeros due to incomplete writes is both difficult and
+ * removes a level of protection against bugs.
+ */
+ for (auto &&[offset, len]: to_write_eset) {
+ to_write.zero_pad(shard, offset, len);
+ }
+
+ if (transactions.contains(shard)) {
+ auto &t = transactions.at(shard);
+ if (to_write_eset.begin().get_start() >= plan.orig_size) {
+ t.set_alloc_hint(
+ coll_t(spg_t(pgid, shard)),
+ ghobject_t(oid, ghobject_t::NO_GEN, shard),
+ 0, 0,
+ CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE |
+ CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY);
+ }
+
+ for (auto &&[offset, len]: to_write_eset) {
+ buffer::list bl;
+ to_write.get_buffer(shard, offset, len, bl);
+ t.write(coll_t(spg_t(pgid, shard)),
+ ghobject_t(oid, ghobject_t::NO_GEN, shard),
+ offset, bl.length(), bl, fadvise_flags);
+ }
}
- i.second.write(
- coll_t(spg_t(pgid, i.first)),
- ghobject_t(oid, ghobject_t::NO_GEN, i.first),
- sinfo.logical_to_prev_chunk_offset(
- offset),
- enc_bl.length(),
- enc_bl,
- flags);
}
}
-void ECTransaction::generate_transactions(
- PGTransaction* _t,
- WritePlan &plan,
- ErasureCodeInterfaceRef &ecimpl,
- pg_t pgid,
- const ECUtil::stripe_info_t &sinfo,
- const map<hobject_t,extent_map> &partial_extents,
- vector<pg_log_entry_t> &entries,
- map<hobject_t,extent_map> *written_map,
- map<shard_id_t, ObjectStore::Transaction> *transactions,
- set<hobject_t> *temp_added,
- set<hobject_t> *temp_removed,
- DoutPrefixProvider *dpp,
- const ceph_release_t require_osd_release)
+ECTransaction::WritePlanObj::WritePlanObj(
+ const hobject_t &hoid,
+ const PGTransaction::ObjectOperation &op,
+ const ECUtil::stripe_info_t &sinfo,
+ const shard_id_set readable_shards,
+ const shard_id_set writable_shards,
+ const bool object_in_cache,
+ uint64_t orig_size,
+ const std::optional<object_info_t> &oi,
+ const std::optional<object_info_t> &soi,
+ const ECUtil::HashInfoRef &&hinfo,
+ const ECUtil::HashInfoRef &&shinfo,
+ const unsigned pdw_write_mode
+ ) :
+ hoid(hoid),
+ will_write(sinfo.get_k_plus_m()),
+ hinfo(hinfo),
+ shinfo(shinfo),
+ orig_size(orig_size) // On-disk object sizes are rounded up to the next page.
{
- ceph_assert(written_map);
- ceph_assert(transactions);
- ceph_assert(temp_added);
- ceph_assert(temp_removed);
- ceph_assert(_t);
- auto &t = *_t;
+ extent_set unaligned_ro_writes;
- auto &hash_infos = plan.hash_infos;
+ projected_size = oi ? oi->size : 0;
- map<hobject_t, pg_log_entry_t*> obj_to_log;
- for (auto &&i: entries) {
- obj_to_log.insert(make_pair(i.soid, &i));
+ if (soi) {
+ projected_size = soi->size;
}
- t.safe_create_traverse(
- [&](pair<const hobject_t, PGTransaction::ObjectOperation> &opair) {
- const hobject_t &oid = opair.first;
- auto &op = opair.second;
- auto &obc_map = t.obc_map;
- auto &written = (*written_map)[oid];
+ hobject_t source;
+ invalidates_cache = op.has_source(&source) || op.is_delete();
+
+ op.buffer_updates.to_interval_set(unaligned_ro_writes);
+ /* We can get multiple truncates/appends in a single transaction. These get
+ * simplified to two values - a minimum and a maximum. It is not guaranteed
+ * that this region has writes. We create writes for this region so as to
+ * essentially write zeros (or holes) in that region.
+ */
+
+ if (op.truncate) {
+ uint64_t start = op.truncate->first;
+ uint64_t end = projected_size;
+ if (projected_size > op.truncate->second ) {
+ end = op.truncate->second;
+ }
+ if (end > start) {
+ unaligned_ro_writes.insert(start, end - start);
+ }
+ }
- auto iter = obj_to_log.find(oid);
- pg_log_entry_t *entry = iter != obj_to_log.end() ? iter->second : nullptr;
+ /* Calculate any non-aligned pages. These need to be read and written */
+ extent_set aligned_ro_writes(unaligned_ro_writes);
+ aligned_ro_writes.align(CEPH_PAGE_SIZE);
+ extent_set partial_page_ro_writes(aligned_ro_writes);
+ partial_page_ro_writes.subtract(unaligned_ro_writes);
+ partial_page_ro_writes.align(CEPH_PAGE_SIZE);
+
+ extent_set write_superset;
+ for (auto &&[off, len] : unaligned_ro_writes) {
+ sinfo.ro_range_to_shard_extent_set_with_superset(
+ off, len, will_write, write_superset);
+ }
+ write_superset.align(CEPH_PAGE_SIZE);
+
+ shard_id_set writable_parity_shards = shard_id_set::intersection(sinfo.get_parity_shards(), writable_shards);
+ for (auto shard : writable_parity_shards) {
+ will_write[shard].insert(write_superset);
+ }
+
+ ECUtil::shard_extent_set_t reads(sinfo.get_k_plus_m());
+ ECUtil::shard_extent_set_t read_mask(sinfo.get_k_plus_m());
- ObjectContextRef obc;
- auto obiter = t.obc_map.find(oid);
- if (obiter != t.obc_map.end()) {
- obc = obiter->second;
+ if (!sinfo.supports_partial_writes()) {
+ for (shard_id_t shard; shard < sinfo.get_k_plus_m(); ++shard) {
+ will_write[shard].insert(write_superset);
+ }
+ will_write.align(sinfo.get_chunk_size());
+ reads = will_write;
+ sinfo.ro_size_to_read_mask(sinfo.ro_offset_to_next_stripe_ro_offset(orig_size), read_mask);
+ reads.intersection_of(read_mask);
+ do_parity_delta_write = false;
+ } else {
+ will_write.align(CEPH_PAGE_SIZE);
+ ECUtil::shard_extent_set_t pdw_reads(will_write);
+
+ sinfo.ro_size_to_read_mask(ECUtil::align_page_next(orig_size), read_mask);
+
+ /* Next we need to add the reads required for a conventional write */
+ for (auto shard : sinfo.get_data_shards()) {
+ reads[shard].insert(write_superset);
+ if (will_write.contains(shard)) {
+ reads[shard].subtract(will_write.at(shard));
}
- if (entry) {
- ceph_assert(obc);
- } else {
- ceph_assert(oid.is_temp());
+ if (reads[shard].empty()) {
+ reads.erase(shard);
}
+ }
- ECUtil::HashInfoRef hinfo;
- {
- auto iter = hash_infos.find(oid);
- ceph_assert(iter != hash_infos.end());
- hinfo = iter->second;
- }
+ /* We now need to add in the partial-page RO writes. This is not particularly
+ * efficient as there are many divs in here, but non-4k-aligned writes are
+ * not very efficient anyway.
+ */
+ for (auto &&[off, len] : partial_page_ro_writes) {
+ sinfo.ro_range_to_shard_extent_set(
+ off, len, reads);
+ }
- if (oid.is_temp()) {
- if (op.is_fresh_object()) {
- temp_added->insert(oid);
- } else if (op.is_delete()) {
- temp_removed->insert(oid);
- }
- }
+ reads.intersection_of(read_mask);
- if (entry &&
- entry->is_modify() &&
- op.updated_snaps) {
- bufferlist bl(op.updated_snaps->second.size() * 8 + 8);
- encode(op.updated_snaps->second, bl);
- entry->snaps.swap(bl);
- entry->snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
- }
+ /* Here we decide if we want to do a conventional write or a parity delta write. */
+ if (sinfo.supports_parity_delta_writes() && !object_in_cache &&
+ orig_size == projected_size && !reads.empty()) {
- ldpp_dout(dpp, 20) << "generate_transactions: "
- << opair.first
- << ", current size is "
- << hinfo->get_total_logical_size(sinfo)
- << " buffers are "
- << op.buffer_updates
- << dendl;
- if (op.truncate) {
- ldpp_dout(dpp, 20) << "generate_transactions: "
- << " truncate is "
- << *(op.truncate)
- << dendl;
- }
+ shard_id_set read_shards = reads.get_shard_id_set();
+ shard_id_set pdw_read_shards = pdw_reads.get_shard_id_set();
- if (entry && op.updated_snaps) {
- entry->mod_desc.update_snaps(op.updated_snaps->first);
+ if (pdw_write_mode != 0) {
+ do_parity_delta_write = (pdw_write_mode == 2);
+ } else if (!shard_id_set::difference(pdw_read_shards, readable_shards).empty()) {
+ // Some kind of reconstruct would be needed for PDW, so don't bother.
+ do_parity_delta_write = false;
+ } else if (!shard_id_set::difference(read_shards, readable_shards).empty()) {
+ // Some kind of reconstruct is needed for conventional, but NOT for PDW!
+ do_parity_delta_write = true;
+ } else {
+ /* Everything we need for both is available; opt for whichever requires
+ * fewer reads.
+ */
+ do_parity_delta_write = pdw_read_shards.size() < read_shards.size();
}
- map<string, std::optional<bufferlist> > xattr_rollback;
- ceph_assert(hinfo);
- bufferlist old_hinfo;
- encode(*hinfo, old_hinfo);
- xattr_rollback[ECUtil::get_hinfo_key()] = old_hinfo;
-
- if (op.is_none() && op.truncate && op.truncate->first == 0) {
- ceph_assert(entry);
- ceph_assert(obc);
-
- if (op.truncate->first != op.truncate->second) {
- op.truncate->first = op.truncate->second;
- } else {
- op.truncate = std::nullopt;
- }
-
- op.delete_first = true;
- op.init_type = PGTransaction::ObjectOperation::Init::Create();
-
- if (obc) {
- /* We need to reapply all of the cached xattrs.
- * std::map insert fortunately only writes keys
- * which don't already exist, so this should do
- * the right thing. */
- op.attr_updates.insert(
- obc->attr_cache.begin(),
- obc->attr_cache.end());
- }
+ if (do_parity_delta_write) {
+ to_read = std::move(pdw_reads);
+ reads.clear(); // So we don't stash it at the end.
}
+ }
- if (op.delete_first) {
- /* We also want to remove the std::nullopt entries since
- * the keys already won't exist */
- for (auto j = op.attr_updates.begin();
- j != op.attr_updates.end();
- ) {
- if (j->second) {
- ++j;
- } else {
- op.attr_updates.erase(j++);
- }
- }
- /* Fill in all current entries for xattr rollback */
- if (obc) {
- xattr_rollback.insert(
- obc->attr_cache.begin(),
- obc->attr_cache.end());
- obc->attr_cache.clear();
- }
- if (entry) {
- entry->mod_desc.rmobject(entry->version.version);
- for (auto &&st: *transactions) {
- st.second.collection_move_rename(
- coll_t(spg_t(pgid, st.first)),
- ghobject_t(oid, ghobject_t::NO_GEN, st.first),
- coll_t(spg_t(pgid, st.first)),
- ghobject_t(oid, entry->version.version, st.first));
- }
- } else {
- for (auto &&st: *transactions) {
- st.second.remove(
- coll_t(spg_t(pgid, st.first)),
- ghobject_t(oid, ghobject_t::NO_GEN, st.first));
- }
- }
- hinfo->clear();
- }
+ /* NOTE: We intentionally leave unwritable shards in the write plan, as it
+ * is actually less efficient to take them out: PDWs still need to compute
+ * the deltas and conventional writes still need to calculate the parity.
+ * The transaction will be dropped by generate_transactions.
+ */
+ }
+
+ if (!reads.empty()) {
+ to_read = std::move(reads);
+ }
- if (op.is_fresh_object() && entry) {
- entry->mod_desc.create();
+ /* Validate post conditions: if we are reading from this object (to_read is
+ * set), then we cannot be renaming or cloning it (there is no source
+ * object). */
+ ceph_assert(!to_read || !soi);
+}
+
+void ECTransaction::Generate::all_shards_written() {
+ if (entry) {
+ entry->written_shards.insert_range(shard_id_t(0), sinfo.get_k_plus_m());
+ }
+}
+
+void ECTransaction::Generate::shard_written(const shard_id_t shard) {
+ if (entry) {
+ entry->written_shards.insert(shard);
+ }
+}
+
+void ECTransaction::Generate::shards_written(const shard_id_set &shards) {
+ if (entry) {
+ entry->written_shards.insert(shards);
+ }
+}
+
+void ECTransaction::Generate::zero_truncate_to_delete() {
+ ceph_assert(obc);
+
+ if (op.truncate->first != op.truncate->second) {
+ op.truncate->first = op.truncate->second;
+ } else {
+ op.truncate = std::nullopt;
+ }
+
+ op.delete_first = true;
+ op.init_type = PGTransaction::ObjectOperation::Init::Create();
+
+ if (obc) {
+ /* We need to reapply all of the cached xattrs.
+ * std::map insert fortunately only writes keys
+ * which don't already exist, so this should do
+ * the right thing. */
+ op.attr_updates.insert(
+ obc->attr_cache.begin(),
+ obc->attr_cache.end());
+ }
+}
+
+void ECTransaction::Generate::delete_first() {
+ /* We also want to remove the std::nullopt entries since
+ * the keys already won't exist */
+ for (auto j = op.attr_updates.begin();
+ j != op.attr_updates.end();
+ ) {
+ if (j->second) {
+ ++j;
+ } else {
+ j = op.attr_updates.erase(j);
+ }
+ }
+ /* Fill in all current entries for xattr rollback */
+ if (obc) {
+ xattr_rollback.insert(
+ obc->attr_cache.begin(),
+ obc->attr_cache.end());
+ obc->attr_cache.clear();
+ }
+ if (entry) {
+ entry->mod_desc.rmobject(entry->version.version);
+ all_shards_written();
+ for (auto &&[shard, t]: transactions) {
+ t.collection_move_rename(
+ coll_t(spg_t(pgid, shard)),
+ ghobject_t(oid, ghobject_t::NO_GEN, shard),
+ coll_t(spg_t(pgid, shard)),
+ ghobject_t(oid, entry->version.version, shard));
+ }
+ } else {
+ for (auto &&[shard, t]: transactions) {
+ t.remove(
+ coll_t(spg_t(pgid, shard)),
+ ghobject_t(oid, ghobject_t::NO_GEN, shard));
+ }
+ }
+ if (plan.hinfo)
+ plan.hinfo->clear();
+}
+
+void ECTransaction::Generate::process_init() {
+ match(
+ op.init_type,
+ [&](const PGTransaction::ObjectOperation::Init::None &) {},
+ [&](const PGTransaction::ObjectOperation::Init::Create &_) {
+ all_shards_written();
+ for (auto &&[shard, t]: transactions) {
+ if (osdmap->require_osd_release >= ceph_release_t::octopus) {
+ t.create(
+ coll_t(spg_t(pgid, shard)),
+ ghobject_t(oid, ghobject_t::NO_GEN, shard));
+ } else {
+ t.touch(
+ coll_t(spg_t(pgid, shard)),
+ ghobject_t(oid, ghobject_t::NO_GEN, shard));
+ }
+ }
+ },
+ [&](const PGTransaction::ObjectOperation::Init::Clone &cop) {
+ all_shards_written();
+ for (auto &&[shard, t]: transactions) {
+ t.clone(
+ coll_t(spg_t(pgid, shard)),
+ ghobject_t(cop.source, ghobject_t::NO_GEN, shard),
+ ghobject_t(oid, ghobject_t::NO_GEN, shard));
}
- match(
- op.init_type,
- [&](const PGTransaction::ObjectOperation::Init::None &) {},
- [&](const PGTransaction::ObjectOperation::Init::Create &op) {
- for (auto &&st: *transactions) {
- if (require_osd_release >= ceph_release_t::octopus) {
- st.second.create(
- coll_t(spg_t(pgid, st.first)),
- ghobject_t(oid, ghobject_t::NO_GEN, st.first));
- } else {
- st.second.touch(
- coll_t(spg_t(pgid, st.first)),
- ghobject_t(oid, ghobject_t::NO_GEN, st.first));
- }
- }
- },
- [&](const PGTransaction::ObjectOperation::Init::Clone &op) {
- for (auto &&st: *transactions) {
- st.second.clone(
- coll_t(spg_t(pgid, st.first)),
- ghobject_t(op.source, ghobject_t::NO_GEN, st.first),
- ghobject_t(oid, ghobject_t::NO_GEN, st.first));
- }
-
- auto siter = hash_infos.find(op.source);
- ceph_assert(siter != hash_infos.end());
- hinfo->update_to(*(siter->second));
-
- if (obc) {
- auto cobciter = obc_map.find(op.source);
- ceph_assert(cobciter != obc_map.end());
- obc->attr_cache = cobciter->second->attr_cache;
- }
- },
- [&](const PGTransaction::ObjectOperation::Init::Rename &op) {
- ceph_assert(op.source.is_temp());
- for (auto &&st: *transactions) {
- st.second.collection_move_rename(
- coll_t(spg_t(pgid, st.first)),
- ghobject_t(op.source, ghobject_t::NO_GEN, st.first),
- coll_t(spg_t(pgid, st.first)),
- ghobject_t(oid, ghobject_t::NO_GEN, st.first));
- }
- auto siter = hash_infos.find(op.source);
- ceph_assert(siter != hash_infos.end());
- hinfo->update_to(*(siter->second));
- if (obc) {
- auto cobciter = obc_map.find(op.source);
- ceph_assert(cobciter == obc_map.end());
- obc->attr_cache.clear();
- }
- });
-
- // omap not supported (except 0, handled above)
- ceph_assert(!(op.clear_omap));
- ceph_assert(!(op.omap_header));
- ceph_assert(op.omap_updates.empty());
-
- if (!op.attr_updates.empty()) {
- map<string, bufferlist, less<>> to_set;
- for (auto &&j: op.attr_updates) {
- if (j.second) {
- to_set[j.first] = *(j.second);
- } else {
- for (auto &&st : *transactions) {
- st.second.rmattr(
- coll_t(spg_t(pgid, st.first)),
- ghobject_t(oid, ghobject_t::NO_GEN, st.first),
- j.first);
- }
- }
- if (obc) {
- auto citer = obc->attr_cache.find(j.first);
- if (entry) {
- if (citer != obc->attr_cache.end()) {
- // won't overwrite anything we put in earlier
- xattr_rollback.insert(
- make_pair(
- j.first,
- std::optional<bufferlist>(citer->second)));
- } else {
- // won't overwrite anything we put in earlier
- xattr_rollback.insert(
- make_pair(
- j.first,
- std::nullopt));
- }
- }
- if (j.second) {
- obc->attr_cache[j.first] = *(j.second);
- } else if (citer != obc->attr_cache.end()) {
- obc->attr_cache.erase(citer);
- }
- } else {
- ceph_assert(!entry);
- }
- }
- for (auto &&st : *transactions) {
- st.second.setattrs(
- coll_t(spg_t(pgid, st.first)),
- ghobject_t(oid, ghobject_t::NO_GEN, st.first),
- to_set);
- }
- ceph_assert(!xattr_rollback.empty());
+ if (plan.hinfo && plan.shinfo)
+ plan.hinfo->update_to(*plan.shinfo);
+
+ if (obc) {
+ auto cobciter = t.obc_map.find(cop.source);
+ ceph_assert(cobciter != t.obc_map.end());
+ obc->attr_cache = cobciter->second->attr_cache;
}
- if (entry && !xattr_rollback.empty()) {
- entry->mod_desc.setattrs(xattr_rollback);
+ },
+ [&](const PGTransaction::ObjectOperation::Init::Rename &rop) {
+ ceph_assert(rop.source.is_temp());
+ all_shards_written();
+ for (auto &&[shard, t]: transactions) {
+ t.collection_move_rename(
+ coll_t(spg_t(pgid, shard)),
+ ghobject_t(rop.source, ghobject_t::NO_GEN, shard),
+ coll_t(spg_t(pgid, shard)),
+ ghobject_t(oid, ghobject_t::NO_GEN, shard));
}
+ if (plan.hinfo && plan.shinfo)
+ plan.hinfo->update_to(*plan.shinfo);
+ if (obc) {
+ auto cobciter = t.obc_map.find(rop.source);
+ ceph_assert(cobciter == t.obc_map.end());
+ obc->attr_cache.clear();
+ }
+ });
+}
+
+void alloc_hint(PGTransaction::ObjectOperation& op,
+ shard_id_map<ObjectStore::Transaction> &transactions,
+ pg_t &pgid,
+ const hobject_t &oid,
+ const ECUtil::stripe_info_t &sinfo) {
+ /* ro_offset_to_next_chunk_offset() scales down both aligned and
+ * unaligned offsets.
+ *
+ * We don't bother to roll this back at this time for two reasons:
+ * 1) it's advisory
+ * 2) we don't track the old value */
+ uint64_t object_size = sinfo.ro_offset_to_next_chunk_offset(
+ op.alloc_hint->expected_object_size);
+ uint64_t write_size = sinfo.ro_offset_to_next_chunk_offset(
+ op.alloc_hint->expected_write_size);
+
+ for (auto &&[shard, t]: transactions) {
+ t.set_alloc_hint(
+ coll_t(spg_t(pgid, shard)),
+ ghobject_t(oid, ghobject_t::NO_GEN, shard),
+ object_size,
+ write_size,
+ op.alloc_hint->flags);
+ }
+}
+
+ECTransaction::Generate::Generate(PGTransaction &t,
+ ErasureCodeInterfaceRef &ec_impl,
+ pg_t &pgid,
+ const ECUtil::stripe_info_t &sinfo,
+ const std::map<hobject_t, ECUtil::shard_extent_map_t> &partial_extents,
+ std::map<hobject_t, ECUtil::shard_extent_map_t> *written_map,
+ shard_id_map<ceph::os::Transaction> &transactions,
+ const OSDMapRef &osdmap,
+ const hobject_t &oid,
+ PGTransaction::ObjectOperation &op,
+ WritePlanObj &plan,
+ DoutPrefixProvider *dpp,
+ pg_log_entry_t *entry)
+ : t(t),
+ ec_impl(ec_impl),
+ pgid(pgid),
+ sinfo(sinfo),
+ transactions(transactions),
+ dpp(dpp),
+ osdmap(osdmap),
+ entry(entry),
+ oid(oid),
+ op(op),
+ plan(plan),
+ read_sem(&sinfo),
+ to_write(&sinfo) {
+ auto obiter = t.obc_map.find(oid);
+ if (obiter != t.obc_map.end()) {
+ obc = obiter->second;
+ }
+
+ if (entry) {
+ ceph_assert(obc);
+ } else {
+ ceph_assert(oid.is_temp());
+ }
+
+ if (entry && entry->is_modify() && op.updated_snaps) {
+ bufferlist bl(op.updated_snaps->second.size() * 8 + 8);
+ encode(op.updated_snaps->second, bl);
+ entry->snaps.swap(bl);
+ entry->snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
+ }
+
+ ldpp_dout(dpp, 20) << __func__ << ": " << oid << plan
+ << " fresh_object: " << op.is_fresh_object()
+ << dendl;
+ if (op.truncate) {
+ ldpp_dout(dpp, 20) << __func__ << ": truncate is " << *(op.truncate) << dendl;
+ }
+
+ if (entry && op.updated_snaps) {
+ entry->mod_desc.update_snaps(op.updated_snaps->first);
+ }
+
+ bufferlist old_hinfo;
+ if (plan.hinfo) {
+ encode(*(plan.hinfo), old_hinfo);
+ xattr_rollback[ECUtil::get_hinfo_key()] = old_hinfo;
+ }
- if (op.alloc_hint) {
- /* logical_to_next_chunk_offset() scales down both aligned and
- * unaligned offsets
-
- * we don't bother to roll this back at this time for two reasons:
- * 1) it's advisory
- * 2) we don't track the old value */
- uint64_t object_size = sinfo.logical_to_next_chunk_offset(
- op.alloc_hint->expected_object_size);
- uint64_t write_size = sinfo.logical_to_next_chunk_offset(
- op.alloc_hint->expected_write_size);
-
- for (auto &&st : *transactions) {
- st.second.set_alloc_hint(
- coll_t(spg_t(pgid, st.first)),
- ghobject_t(oid, ghobject_t::NO_GEN, st.first),
- object_size,
- write_size,
- op.alloc_hint->flags);
- }
+ if (op.is_none() && op.truncate && op.truncate->first == 0) {
+ zero_truncate_to_delete();
+ }
+
+ if (op.delete_first) {
+ delete_first();
+ }
+
+ if (op.is_fresh_object() && entry) {
+ entry->mod_desc.create();
+ }
+
+ process_init();
+
+ // omap not supported (except 0, handled above)
+ ceph_assert(!(op.clear_omap) && !(op.omap_header) && op.omap_updates.empty());
+
+ if (op.alloc_hint) {
+ alloc_hint(op, transactions, pgid, oid, sinfo);
+ }
+
+ auto pextiter = partial_extents.find(oid);
+ if (pextiter != partial_extents.end()) {
+ if (plan.do_parity_delta_write) {
+ read_sem = pextiter->second;
+ } else {
+ to_write = pextiter->second;
+ }
+ }
+ debug(oid, "to_write", to_write, dpp);
+ ldpp_dout(dpp, 20) << "generate_transactions: plan: " << plan << dendl;
+
+ if (op.truncate && op.truncate->first < plan.orig_size) {
+ truncate();
+ }
+
+ overlay_writes();
+ appends_and_clone_ranges();
+
+ /* The write plan is permitted to drop parity shards when the shard is
+ * missing. However, written_shards must contain all parity shards.
+ * Note that the write plan will *not* drop data shards.
+ */
+ shards_written(sinfo.get_parity_shards());
+
+ if (!to_write.empty()) {
+ encode_and_write();
+ }
+
+ written_map->emplace(oid, std::move(to_write));
+
+ if (entry && plan.hinfo) {
+ plan.hinfo->set_total_chunk_size_clear_hash(
+ sinfo.ro_offset_to_next_stripe_ro_offset(plan.projected_size));
+ }
+
+ if (entry && plan.orig_size < plan.projected_size) {
+ entry->mod_desc.append(ECUtil::align_page_next(plan.orig_size));
+ }
+
+ if (!op.attr_updates.empty()) {
+ attr_updates();
+ }
+
+ if (entry && !xattr_rollback.empty()) {
+ entry->mod_desc.setattrs(xattr_rollback);
+ }
+
+ if (!op.is_delete()) {
+ handle_deletes();
+ }
+
+ written_and_present_shards();
+}
+
+void ECTransaction::Generate::truncate() {
+ ceph_assert(!op.is_fresh_object());
+ // causes encode to invent zeros
+ to_write.erase_after_ro_offset(plan.orig_size);
+ all_shards_written();
+
+ debug(oid, "truncate_erase", to_write, dpp);
+
+ if (entry && !op.is_fresh_object()) {
+ uint64_t restore_from = sinfo.ro_offset_to_prev_chunk_offset(
+ op.truncate->first);
+ uint64_t restore_len = sinfo.aligned_ro_offset_to_chunk_offset(
+ plan.orig_size -
+ sinfo.ro_offset_to_prev_stripe_ro_offset(op.truncate->first));
+ shard_id_set all_shards; // intentionally left blank!
+ rollback_extents.emplace_back(make_pair(restore_from, restore_len));
+ rollback_shards.emplace_back(all_shards);
+ for (auto &&[shard, t]: transactions) {
+ t.touch(
+ coll_t(spg_t(pgid, shard)),
+ ghobject_t(oid, entry->version.version, shard));
+ t.clone_range(
+ coll_t(spg_t(pgid, shard)),
+ ghobject_t(oid, ghobject_t::NO_GEN, shard),
+ ghobject_t(oid, entry->version.version, shard),
+ restore_from,
+ restore_len,
+ restore_from);
+ }
+ }
+
+ for (auto &&[shard, t]: transactions) {
+ t.truncate(
+ coll_t(spg_t(pgid, shard)),
+ ghobject_t(oid, ghobject_t::NO_GEN, shard),
+ sinfo.ro_offset_to_shard_offset(plan.orig_size,
+ sinfo.get_raw_shard(shard)));
+ }
+}
+
+void ECTransaction::Generate::overlay_writes() {
+ for (auto &&extent: op.buffer_updates) {
+ using BufferUpdate = PGTransaction::ObjectOperation::BufferUpdate;
+ bufferlist bl;
+ match(
+ extent.get_val(),
+ [&](const BufferUpdate::Write &wop) {
+ bl = wop.buffer;
+ fadvise_flags |= wop.fadvise_flags;
+ },
+ [&](const BufferUpdate::Zero &) {
+ bl.append_zero(extent.get_len());
+ },
+ [&](const BufferUpdate::CloneRange &) {
+ ceph_abort_msg(
+ "CloneRange is not allowed, do_op should have returned ENOTSUPP");
+ });
+
+ uint64_t off = extent.get_off();
+ uint64_t len = extent.get_len();
+
+ sinfo.ro_range_to_shard_extent_map(off, len, bl, to_write);
+ debug(oid, "overlay_buffer", to_write, dpp);
+ }
+}
+
+void ECTransaction::Generate::appends_and_clone_ranges() {
+
+ extent_set clone_ranges = plan.will_write.get_extent_superset();
+ uint64_t clone_max = ECUtil::align_page_next(plan.orig_size);
+
+ if (op.delete_first) {
+ clone_max = 0;
+ } else if (op.truncate && op.truncate->first < clone_max) {
+ clone_max = ECUtil::align_page_next(op.truncate->first);
+ }
+ ECUtil::shard_extent_set_t cloneable_range(sinfo.get_k_plus_m());
+ sinfo.ro_size_to_read_mask(clone_max, cloneable_range);
+
+ if (plan.orig_size < plan.projected_size) {
+ ECUtil::shard_extent_set_t projected_cloneable_range(sinfo.get_k_plus_m());
+ sinfo.ro_size_to_read_mask(plan.projected_size,projected_cloneable_range);
+
+ for (auto &&[shard, eset]: projected_cloneable_range) {
+ uint64_t old_shard_size = 0;
+ if (cloneable_range.contains(shard)) {
+ old_shard_size = cloneable_range.at(shard).range_end();
}
+ uint64_t new_shard_size = eset.range_end();
- extent_map to_write;
- auto pextiter = partial_extents.find(oid);
- if (pextiter != partial_extents.end()) {
- to_write = pextiter->second;
+ if (new_shard_size == old_shard_size) continue;
+
+ uint64_t write_end = 0;
+ if (plan.will_write.contains(shard)) {
+ write_end = plan.will_write.at(shard).range_end();
}
- vector<pair<uint64_t, uint64_t> > rollback_extents;
- const uint64_t orig_size = hinfo->get_total_logical_size(sinfo);
-
- uint64_t new_size = orig_size;
- uint64_t append_after = new_size;
- ldpp_dout(dpp, 20) << "generate_transactions: new_size start "
- << new_size << dendl;
- if (op.truncate && op.truncate->first < new_size) {
- ceph_assert(!op.is_fresh_object());
- new_size = sinfo.logical_to_next_stripe_offset(
- op.truncate->first);
- ldpp_dout(dpp, 20) << "generate_transactions: new_size truncate down "
- << new_size << dendl;
- if (new_size != op.truncate->first) { // 0 the unaligned part
- bufferlist bl;
- bl.append_zero(new_size - op.truncate->first);
- to_write.insert(
- op.truncate->first,
- bl.length(),
- bl);
- append_after = sinfo.logical_to_prev_stripe_offset(
- op.truncate->first);
- } else {
- append_after = new_size;
- }
- to_write.erase(
- new_size,
- std::numeric_limits<uint64_t>::max() - new_size);
-
- if (entry && !op.is_fresh_object()) {
- uint64_t restore_from = sinfo.logical_to_prev_chunk_offset(
- op.truncate->first);
- uint64_t restore_len = sinfo.aligned_logical_offset_to_chunk_offset(
- orig_size -
- sinfo.logical_to_prev_stripe_offset(op.truncate->first));
- ceph_assert(rollback_extents.empty());
-
- ldpp_dout(dpp, 20) << "generate_transactions: saving extent "
- << make_pair(restore_from, restore_len)
- << dendl;
- ldpp_dout(dpp, 20) << "generate_transactions: truncating to "
- << new_size
- << dendl;
- rollback_extents.emplace_back(
- make_pair(restore_from, restore_len));
- for (auto &&st : *transactions) {
- st.second.touch(
- coll_t(spg_t(pgid, st.first)),
- ghobject_t(oid, entry->version.version, st.first));
- st.second.clone_range(
- coll_t(spg_t(pgid, st.first)),
- ghobject_t(oid, ghobject_t::NO_GEN, st.first),
- ghobject_t(oid, entry->version.version, st.first),
- restore_from,
- restore_len,
- restore_from);
-
- }
- } else {
- ldpp_dout(dpp, 20) << "generate_transactions: not saving extents"
- ", fresh object" << dendl;
- }
- for (auto &&st : *transactions) {
- st.second.truncate(
- coll_t(spg_t(pgid, st.first)),
- ghobject_t(oid, ghobject_t::NO_GEN, st.first),
- sinfo.aligned_logical_offset_to_chunk_offset(new_size));
- }
+ if (write_end == new_shard_size) continue;
+
+ /* If code is executing here, it means that the written part of the
+ * shard does not reflect the size that EC believes the shard to be.
+ * This is not a problem for reads (they will be truncated), but it
+ * is a problem for writes, where future writes may attempt a clone
+ * off the end of the object.
+ * To solve this, we use an interesting quirk of "truncate" where we
+ * can actually truncate to a size larger than the object!
+ */
+ if (transactions.contains(shard)) {
+ auto &t = transactions.at(shard);
+ t.truncate(
+ coll_t(spg_t(pgid, shard)),
+ ghobject_t(oid, ghobject_t::NO_GEN, shard),
+ new_shard_size);
}
+ // Update written_shards because this must complete to consider
+ // the write as complete
+ shard_written(shard);
+ }
+ }
- uint32_t fadvise_flags = 0;
- for (auto &&extent: op.buffer_updates) {
- using BufferUpdate = PGTransaction::ObjectOperation::BufferUpdate;
- bufferlist bl;
- match(
- extent.get_val(),
- [&](const BufferUpdate::Write &op) {
- bl = op.buffer;
- fadvise_flags |= op.fadvise_flags;
- },
- [&](const BufferUpdate::Zero &) {
- bl.append_zero(extent.get_len());
- },
- [&](const BufferUpdate::CloneRange &) {
- ceph_assert(
- 0 ==
- "CloneRange is not allowed, do_op should have returned ENOTSUPP");
- });
-
- uint64_t off = extent.get_off();
- uint64_t len = extent.get_len();
- uint64_t end = off + len;
- ldpp_dout(dpp, 20) << "generate_transactions: adding buffer_update "
- << make_pair(off, len)
- << dendl;
- ceph_assert(len > 0);
- if (off > new_size) {
- ceph_assert(off > append_after);
- bl.prepend_zero(off - new_size);
- len += off - new_size;
- ldpp_dout(dpp, 20) << "generate_transactions: prepending zeroes to align "
- << off << "->" << new_size
- << dendl;
- off = new_size;
- }
- if (!sinfo.logical_offset_is_stripe_aligned(end) && (end > append_after)) {
- uint64_t aligned_end = sinfo.logical_to_next_stripe_offset(
- end);
- uint64_t tail = aligned_end - end;
- bl.append_zero(tail);
- ldpp_dout(dpp, 20) << "generate_transactions: appending zeroes to align end "
- << end << "->" << end+tail
- << ", len: " << len << "->" << len+tail
- << dendl;
- end += tail;
- len += tail;
- }
-
- to_write.insert(off, len, bl);
- if (end > new_size)
- new_size = end;
+ shard_id_set touched;
+
+ for (auto &[start, len]: clone_ranges) {
+ shard_id_set to_clone_shards;
+ uint64_t clone_end = 0;
+
+ for (auto &&[shard, eset]: plan.will_write) {
+ shard_written(shard);
+
+ // If no clonable range here, then ignore.
+ if (!cloneable_range.contains(shard)) continue;
+
+ // Do not clone off the end of the old range
+ uint64_t shard_clone_max = cloneable_range.at(shard).range_end();
+ uint64_t shard_end = start + len;
+ if (shard_end > shard_clone_max) shard_end = shard_clone_max;
+
+ // clone_end needs to be the biggest shard_end.
+ if (shard_end > clone_end) clone_end = shard_end;
+
+ // Ignore pure appends on this shard.
+ if (shard_end <= start) continue;
+
+ // Ignore clones that do not intersect with the write.
+ if (!eset.intersects(start, len)) continue;
+
+ // We need a clone...
+ if (transactions.contains(shard)) {
+ auto &t = transactions.at(shard);
+
+ // Only touch once.
+ if (!touched.contains(shard)) {
+ t.touch(
+ coll_t(spg_t(pgid, shard)),
+ ghobject_t(oid, entry->version.version, shard));
+ touched.insert(shard_id_t(shard));
+ }
+ t.clone_range(
+ coll_t(spg_t(pgid, shard)),
+ ghobject_t(oid, ghobject_t::NO_GEN, shard),
+ ghobject_t(oid, entry->version.version, shard),
+ start,
+ shard_end - start,
+ start);
+
+ // We have done a clone, so tell the rollback.
+ to_clone_shards.insert(shard);
}
+ }
- if (op.truncate &&
- op.truncate->second > new_size) {
- ceph_assert(op.truncate->second > append_after);
- uint64_t truncate_to =
- sinfo.logical_to_next_stripe_offset(
- op.truncate->second);
- uint64_t zeroes = truncate_to - new_size;
- bufferlist bl;
- bl.append_zero(zeroes);
- to_write.insert(
- new_size,
- zeroes,
- bl);
- new_size = truncate_to;
- ldpp_dout(dpp, 20) << "generate_transactions: truncating out to "
- << truncate_to
- << dendl;
+ if (!to_clone_shards.empty()) {
+ // It is more efficient to store an empty set to represent the common
+ // all-shards case.
+ if (to_clone_shards.size() == sinfo.get_k_plus_m()) {
+ to_clone_shards.clear();
+ }
+ if (clone_end > start) {
+ rollback_extents.emplace_back(make_pair(start, clone_end - start));
+ rollback_shards.emplace_back(to_clone_shards);
}
+ }
+ }
+}
- set<int> want;
- for (unsigned i = 0; i < ecimpl->get_chunk_count(); ++i) {
- want.insert(i);
+void ECTransaction::Generate::written_and_present_shards() {
+ if (entry) {
+ if (!rollback_extents.empty()) {
+ entry->mod_desc.rollback_extents(
+ entry->version.version,
+ rollback_extents,
+ ECUtil::align_page_next(plan.orig_size),
+ rollback_shards);
+ }
+ if (entry->written_shards.size() == sinfo.get_k_plus_m()) {
+ // More efficient to encode an empty set for all shards
+ entry->written_shards.clear();
+ }
+ // Calculate set of present shards
+ for (auto &&[shard, t]: transactions) {
+ entry->present_shards.insert(shard);
+ }
+ if (entry->present_shards.size() == sinfo.get_k_plus_m()) {
+ // More efficient to encode an empty set for all shards
+ entry->present_shards.clear();
+ }
+
+ // Update shard_versions in object_info to record which shards are being
+ // written
+ if (op.attr_updates.contains(OI_ATTR)) {
+ object_info_t oi(*(op.attr_updates[OI_ATTR]));
+ bool update = false;
+ if (entry->written_shards.empty()) {
+ if (!oi.shard_versions.empty()) {
+ oi.shard_versions.clear();
+ update = true;
+ }
+ } else {
+ for (shard_id_t shard; shard < sinfo.get_k_plus_m(); ++shard) {
+ if (sinfo.is_nonprimary_shard(shard)) {
+ if (entry->is_written_shard(shard) || plan.orig_size != plan.
+ projected_size) {
+ // Written - erase per shard version
+ if (oi.shard_versions.erase(shard)) {
+ update = true;
+ }
+ } else if (!oi.shard_versions.count(shard)) {
+ // Unwritten shard, previously up to date
+ oi.shard_versions[shard] = oi.prior_version;
+ update = true;
+ } else {
+ // Unwritten shard, already out of date
+ }
+ } else {
+ // Primary shards are always written and use oi.version
+ }
+ }
}
- auto to_overwrite = to_write.intersect(0, append_after);
- ldpp_dout(dpp, 20) << "generate_transactions: to_overwrite: "
- << to_overwrite
- << dendl;
- for (auto &&extent: to_overwrite) {
- ceph_assert(extent.get_off() + extent.get_len() <= append_after);
- ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_off()));
- ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_len()));
- if (entry) {
- uint64_t restore_from = sinfo.aligned_logical_offset_to_chunk_offset(
- extent.get_off());
- uint64_t restore_len = sinfo.aligned_logical_offset_to_chunk_offset(
- extent.get_len());
- ldpp_dout(dpp, 20) << "generate_transactions: overwriting "
- << restore_from << "~" << restore_len
- << dendl;
- if (rollback_extents.empty()) {
- for (auto &&st : *transactions) {
- st.second.touch(
- coll_t(spg_t(pgid, st.first)),
- ghobject_t(oid, entry->version.version, st.first));
- }
- }
- rollback_extents.emplace_back(make_pair(restore_from, restore_len));
- for (auto &&st : *transactions) {
- st.second.clone_range(
- coll_t(spg_t(pgid, st.first)),
- ghobject_t(oid, ghobject_t::NO_GEN, st.first),
- ghobject_t(oid, entry->version.version, st.first),
- restore_from,
- restore_len,
- restore_from);
- }
- }
- encode_and_write(
- pgid,
- oid,
- sinfo,
- ecimpl,
- want,
- extent.get_off(),
- extent.get_val(),
- fadvise_flags,
- hinfo,
- written,
- transactions,
- dpp);
+ if (update) {
+ bufferlist bl;
+ oi.encode(bl, osdmap->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+ op.attr_updates[OI_ATTR] = bl;
+ // Update cached OI
+ obc->obs.oi.shard_versions = oi.shard_versions;
}
+ ldpp_dout(dpp, 20) << __func__ << " shard_info: version=" << entry->version
+ << " present=" << entry->present_shards
+ << " written=" << entry->written_shards
+ << " shard_versions=" << oi.shard_versions << dendl;
+ }
- auto to_append = to_write.intersect(
- append_after,
- std::numeric_limits<uint64_t>::max() - append_after);
- ldpp_dout(dpp, 20) << "generate_transactions: to_append: "
- << to_append
- << dendl;
- for (auto &&extent: to_append) {
- ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_off()));
- ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_len()));
- ldpp_dout(dpp, 20) << "generate_transactions: appending "
- << extent.get_off() << "~" << extent.get_len()
- << dendl;
- encode_and_write(
- pgid,
- oid,
- sinfo,
- ecimpl,
- want,
- extent.get_off(),
- extent.get_val(),
- fadvise_flags,
- hinfo,
- written,
- transactions,
- dpp);
+ /* It is essential for rollback that every shard with a non-empty transaction
+ * is recorded in written_shards. In fact, written_shards contains every
+ * shard that would have a transaction if that shard were present. This is
+ * why we do not simply construct written_shards here.
+ */
+ for (auto &&[shard, t] : transactions) {
+ if (entry && (!t.empty() || !sinfo.is_nonprimary_shard(shard))) {
+ ceph_assert(entry->is_written_shard(shard));
}
+ }
+ }
+}
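/* Illustrative sketch (not from the Ceph tree): the shard_versions rule the
 * loop above implements for non-primary shards. Types are simplified stand-ins
 * for eversion_t and shard_id_t; only the decision logic is mirrored.
 */
#include <cstdint>
#include <map>

struct simple_version { uint64_t epoch = 0, v = 0; };

inline void update_shard_version(std::map<int, simple_version> &shard_versions,
                                 int shard, bool written, bool size_changed,
                                 const simple_version &prior_version) {
  if (written || size_changed) {
    // Written shards follow the object's main version, so drop any stale
    // per-shard entry.
    shard_versions.erase(shard);
  } else if (!shard_versions.count(shard)) {
    // Unwritten shard that was previously up to date: record the version it
    // still holds.
    shard_versions[shard] = prior_version;
  }
  // Otherwise the shard is unwritten and already marked out of date.
}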
- ldpp_dout(dpp, 20) << "generate_transactions: " << oid
- << " resetting hinfo to logical size "
- << new_size
- << dendl;
- if (!rollback_extents.empty() && entry) {
- if (entry) {
- ldpp_dout(dpp, 20) << "generate_transactions: " << oid
- << " marking rollback extents "
- << rollback_extents
- << dendl;
- entry->mod_desc.rollback_extents(
- entry->version.version, rollback_extents);
- }
- hinfo->set_total_chunk_size_clear_hash(
- sinfo.aligned_logical_offset_to_chunk_offset(new_size));
- } else {
- ceph_assert(hinfo->get_total_logical_size(sinfo) == new_size);
+void ECTransaction::Generate::attr_updates() {
+ map<string, bufferlist, less<>> to_set;
+ for (auto &&[attr, update]: op.attr_updates) {
+ if (update) {
+ to_set[attr] = *(update);
+ } else {
+ all_shards_written();
+ for (auto &&[shard, t]: transactions) {
+ t.rmattr(
+ coll_t(spg_t(pgid, shard)),
+ ghobject_t(oid, ghobject_t::NO_GEN, shard),
+ attr);
+ }
+ }
+ if (obc) {
+ auto citer = obc->attr_cache.find(attr);
+ if (entry) {
+ if (citer != obc->attr_cache.end()) {
+ // won't overwrite anything we put in earlier
+ xattr_rollback.insert(
+ make_pair(
+ attr,
+ std::optional<bufferlist>(citer->second)));
+ } else {
+ // won't overwrite anything we put in earlier
+ xattr_rollback.insert(
+ make_pair(
+ attr,
+ std::nullopt));
+ }
}
+ if (update) {
+ obc->attr_cache[attr] = *(update);
+ } else if (citer != obc->attr_cache.end()) {
+ obc->attr_cache.erase(citer);
+ }
+ } else {
+ ceph_assert(!entry);
+ }
+ }
+ all_shards_written();
+ for (auto &&[shard, t]: transactions) {
+ if (!sinfo.is_nonprimary_shard(shard)) {
+ // Primary shard - Update all attributes
+ t.setattrs(
+ coll_t(spg_t(pgid, shard)),
+ ghobject_t(oid, ghobject_t::NO_GEN, shard),
+ to_set);
+ } else if (entry->is_written_shard(shard)) {
+ // Written shard - Only update object_info attribute
+ t.setattr(
+ coll_t(spg_t(pgid, shard)),
+ ghobject_t(oid, ghobject_t::NO_GEN, shard),
+ OI_ATTR,
+ to_set[OI_ATTR]);
+ } // Else: Unwritten shard - Don't update any attributes
+ }
+ ceph_assert(!xattr_rollback.empty());
+}
- if (entry && !to_append.empty()) {
- ldpp_dout(dpp, 20) << "generate_transactions: marking append "
- << append_after
- << dendl;
- entry->mod_desc.append(append_after);
+void ECTransaction::Generate::handle_deletes() {
+ bufferlist hbuf;
+ if (plan.hinfo) {
+ encode(*plan.hinfo, hbuf);
+ for (auto &&[shard, t]: transactions) {
+ if (!sinfo.is_nonprimary_shard(shard)) {
+ shard_written(shard);
+ t.setattr(
+ coll_t(spg_t(pgid, shard)),
+ ghobject_t(oid, ghobject_t::NO_GEN, shard),
+ ECUtil::get_hinfo_key(),
+ hbuf);
}
+ }
+ }
+}
+
+void ECTransaction::generate_transactions(
+ PGTransaction *_t,
+ WritePlan &plans,
+ ErasureCodeInterfaceRef &ec_impl,
+ pg_t pgid,
+ const ECUtil::stripe_info_t &sinfo,
+ const map<hobject_t, ECUtil::shard_extent_map_t> &partial_extents,
+ vector<pg_log_entry_t> &entries,
+ map<hobject_t, ECUtil::shard_extent_map_t> *written_map,
+ shard_id_map<ObjectStore::Transaction> *transactions,
+ set<hobject_t> *temp_added,
+ set<hobject_t> *temp_removed,
+ DoutPrefixProvider *dpp,
+ const OSDMapRef &osdmap) {
+ ceph_assert(written_map);
+ ceph_assert(transactions);
+ ceph_assert(temp_added);
+ ceph_assert(temp_removed);
+ ceph_assert(_t);
+ auto &t = *_t;
+
+ map<hobject_t, pg_log_entry_t*> obj_to_log;
+ for (auto &&i: entries) {
+ obj_to_log.insert(make_pair(i.soid, &i));
+ }
- if (!op.is_delete()) {
- bufferlist hbuf;
- encode(*hinfo, hbuf);
- for (auto &&i : *transactions) {
- i.second.setattr(
- coll_t(spg_t(pgid, i.first)),
- ghobject_t(oid, ghobject_t::NO_GEN, i.first),
- ECUtil::get_hinfo_key(),
- hbuf);
- }
+ t.safe_create_traverse(
+ [&](pair<const hobject_t, PGTransaction::ObjectOperation> &opair) {
+ auto oid = opair.first;
+ PGTransaction::ObjectOperation& op = opair.second;
+ auto iter = obj_to_log.find(oid);
+ pg_log_entry_t *entry = iter != obj_to_log.end() ? iter->second : nullptr;
+ if (oid.is_temp()) {
+ if (op.is_fresh_object()) {
+ temp_added->insert(oid);
+ } else if (op.is_delete()) {
+ temp_removed->insert(oid);
+ }
}
- });
+
+ // Transactions must be submitted in the same order that they were planned in.
+ ceph_assert(!plans.plans.empty());
+ ECTransaction::WritePlanObj &plan = plans.plans.front();
+ ceph_assert(plan.hoid == oid);
+
+ Generate generate(t, ec_impl, pgid, sinfo, partial_extents, written_map,
+ *transactions, osdmap, oid, op, plan, dpp, entry);
+
+ plans.plans.pop_front();
+ });
}
*
*/
-#ifndef ECTRANSACTION_H
-#define ECTRANSACTION_H
+#pragma once
+#include "common/dout.h"
#include "ECUtil.h"
-#include "ExtentCache.h"
#include "erasure-code/ErasureCodeInterface.h"
#include "os/Transaction.h"
#include "PGTransaction.h"
namespace ECTransaction {
- struct WritePlan {
- bool invalidates_cache = false; // Yes, both are possible
- std::map<hobject_t,extent_set> to_read;
- std::map<hobject_t,extent_set> will_write; // superset of to_read
+class WritePlanObj {
+ public:
+ const hobject_t hoid;
+ std::optional<ECUtil::shard_extent_set_t> to_read;
+ ECUtil::shard_extent_set_t will_write;
+ const ECUtil::HashInfoRef hinfo;
+ const ECUtil::HashInfoRef shinfo;
+ const uint64_t orig_size;
+ uint64_t projected_size;
+ bool invalidates_cache;
+ bool do_parity_delta_write = false;
+
+ WritePlanObj(
+ const hobject_t &hoid,
+ const PGTransaction::ObjectOperation &op,
+ const ECUtil::stripe_info_t &sinfo,
+ const shard_id_set readable_shards,
+ const shard_id_set writable_shards,
+ const bool object_in_cache,
+ uint64_t orig_size,
+ const std::optional<object_info_t> &oi,
+ const std::optional<object_info_t> &soi,
+ const ECUtil::HashInfoRef &&hinfo,
+ const ECUtil::HashInfoRef &&shinfo,
+ const unsigned pdw_write_mode);
+
+ void print(std::ostream &os) const {
+ os << "to_read: " << to_read
+ << " will_write: " << will_write
+ << " hinfo: " << hinfo
+ << " shinfo: " << shinfo
+ << " orig_size: " << orig_size
+ << " projected_size: " << projected_size
+ << " invalidates_cache: " << invalidates_cache
+ << " do_pdw: " << do_parity_delta_write;
+ }
+};
- std::map<hobject_t,ECUtil::HashInfoRef> hash_infos;
- };
+struct WritePlan {
+ bool want_read;
+ std::list<WritePlanObj> plans;
+
+ void print(std::ostream &os) const {
+ os << " { plans : ";
+ bool first = true;
+ for (auto && p : plans) {
+ if (first) {
+ first = false;
+ } else {
+ os << ", ";
+ }
+ os << p;
+ }
+ os << "}";
+ }
+};
- template <typename F>
- WritePlan get_write_plan(
+class Generate {
+ PGTransaction &t;
+ const ErasureCodeInterfaceRef &ec_impl;
+ const pg_t &pgid;
+ const ECUtil::stripe_info_t &sinfo;
+ shard_id_map<ceph::os::Transaction> &transactions;
+ DoutPrefixProvider *dpp;
+ const OSDMapRef &osdmap;
+ pg_log_entry_t *entry;
+ const hobject_t &oid;
+ PGTransaction::ObjectOperation& op;
+ ObjectContextRef obc;
+ std::map<std::string, std::optional<bufferlist>> xattr_rollback;
+ const WritePlanObj &plan;
+ std::optional<ECUtil::shard_extent_map_t> read_sem;
+ ECUtil::shard_extent_map_t to_write;
+ std::vector<std::pair<uint64_t, uint64_t>> rollback_extents;
+ std::vector<shard_id_set> rollback_shards;
+ uint32_t fadvise_flags = 0;
+
+ void all_shards_written();
+ void shard_written(const shard_id_t shard);
+ void shards_written(const shard_id_set &shards);
+ void delete_first();
+ void zero_truncate_to_delete();
+ void process_init();
+ void encode_and_write();
+ void truncate();
+ void overlay_writes();
+ void appends_and_clone_ranges();
+ void written_and_present_shards();
+ void attr_updates();
+ void handle_deletes();
+
+ public:
+ Generate(PGTransaction &t,
+ ErasureCodeInterfaceRef &ec_impl, pg_t &pgid,
const ECUtil::stripe_info_t &sinfo,
- PGTransaction& t,
- F &&get_hinfo,
- DoutPrefixProvider *dpp) {
- WritePlan plan;
- t.safe_create_traverse(
- [&](std::pair<const hobject_t, PGTransaction::ObjectOperation> &i) {
- const auto& [obj, op] = i;
- ECUtil::HashInfoRef hinfo = get_hinfo(obj);
- plan.hash_infos[obj] = hinfo;
-
- uint64_t projected_size =
- hinfo->get_projected_total_logical_size(sinfo);
-
- if (op.deletes_first()) {
- ldpp_dout(dpp, 20) << __func__ << ": delete, setting projected size"
- << " to 0" << dendl;
- projected_size = 0;
- }
-
- hobject_t source;
- if (op.has_source(&source)) {
- // typically clone or mv
- plan.invalidates_cache = true;
-
- ECUtil::HashInfoRef shinfo = get_hinfo(source);
- projected_size = shinfo->get_projected_total_logical_size(sinfo);
- plan.hash_infos[source] = shinfo;
- }
-
- auto &will_write = plan.will_write[obj];
- if (op.truncate &&
- op.truncate->first < projected_size) {
- if (!(sinfo.logical_offset_is_stripe_aligned(
- op.truncate->first))) {
- plan.to_read[obj].union_insert(
- sinfo.logical_to_prev_stripe_offset(op.truncate->first),
- sinfo.get_stripe_width());
-
- ldpp_dout(dpp, 20) << __func__ << ": unaligned truncate" << dendl;
-
- will_write.union_insert(
- sinfo.logical_to_prev_stripe_offset(op.truncate->first),
- sinfo.get_stripe_width());
- }
- projected_size = sinfo.logical_to_next_stripe_offset(
- op.truncate->first);
- }
-
- extent_set raw_write_set;
- for (auto &&extent: op.buffer_updates) {
- using BufferUpdate = PGTransaction::ObjectOperation::BufferUpdate;
- if (boost::get<BufferUpdate::CloneRange>(&(extent.get_val()))) {
- ceph_assert(
- 0 ==
- "CloneRange is not allowed, do_op should have returned ENOTSUPP");
- }
- raw_write_set.insert(extent.get_off(), extent.get_len());
- }
-
- auto orig_size = projected_size;
- for (auto extent = raw_write_set.begin();
- extent != raw_write_set.end();
- ++extent) {
- uint64_t head_start =
- sinfo.logical_to_prev_stripe_offset(extent.get_start());
- uint64_t head_finish =
- sinfo.logical_to_next_stripe_offset(extent.get_start());
- if (head_start > projected_size) {
- head_start = projected_size;
- }
- if (head_start != head_finish &&
- head_start < orig_size) {
- ceph_assert(head_finish <= orig_size);
- ceph_assert(head_finish - head_start == sinfo.get_stripe_width());
- ldpp_dout(dpp, 20) << __func__ << ": reading partial head stripe "
- << head_start << "~" << sinfo.get_stripe_width()
- << dendl;
- plan.to_read[obj].union_insert(
- head_start, sinfo.get_stripe_width());
- }
-
- uint64_t tail_start =
- sinfo.logical_to_prev_stripe_offset(
- extent.get_start() + extent.get_len());
- uint64_t tail_finish =
- sinfo.logical_to_next_stripe_offset(
- extent.get_start() + extent.get_len());
- if (tail_start != tail_finish &&
- (head_start == head_finish || tail_start != head_start) &&
- tail_start < orig_size) {
- ceph_assert(tail_finish <= orig_size);
- ceph_assert(tail_finish - tail_start == sinfo.get_stripe_width());
- ldpp_dout(dpp, 20) << __func__ << ": reading partial tail stripe "
- << tail_start << "~" << sinfo.get_stripe_width()
- << dendl;
- plan.to_read[obj].union_insert(
- tail_start, sinfo.get_stripe_width());
- }
-
- if (head_start != tail_finish) {
- ceph_assert(
- sinfo.logical_offset_is_stripe_aligned(
- tail_finish - head_start)
- );
- will_write.union_insert(
- head_start, tail_finish - head_start);
- if (tail_finish > projected_size)
- projected_size = tail_finish;
- } else {
- ceph_assert(tail_finish <= projected_size);
- }
- }
-
- if (op.truncate && op.truncate->second > projected_size) {
- uint64_t truncating_to =
- sinfo.logical_to_next_stripe_offset(op.truncate->second);
- ldpp_dout(dpp, 20) << __func__ << ": truncating out to "
- << truncating_to
- << dendl;
- will_write.union_insert(projected_size,
- truncating_to - projected_size);
- projected_size = truncating_to;
- }
-
- ldpp_dout(dpp, 20) << __func__ << ": " << obj
- << " projected size "
- << projected_size
- << dendl;
- hinfo->set_projected_total_logical_size(
- sinfo,
- projected_size);
-
- /* validate post conditions:
- * to_read should have an entry for `obj` if it isn't empty
- * and if we are reading from `obj`, we can't be renaming or
- * cloning it */
- ceph_assert(plan.to_read.count(obj) == 0 ||
- (!plan.to_read.at(obj).empty() &&
- !i.second.has_source()));
- });
- return plan;
- }
+ const std::map<hobject_t, ECUtil::shard_extent_map_t> &partial_extents,
+ std::map<hobject_t, ECUtil::shard_extent_map_t> *written_map,
+ shard_id_map<ceph::os::Transaction> &transactions,
+ const OSDMapRef &osdmap,
+ const hobject_t &oid, PGTransaction::ObjectOperation &op,
+ WritePlanObj &plan,
+ DoutPrefixProvider *dpp,
+ pg_log_entry_t *entry);
+};
- void generate_transactions(
- PGTransaction* _t,
+void generate_transactions(
+ PGTransaction *_t,
WritePlan &plan,
- ceph::ErasureCodeInterfaceRef &ecimpl,
+ ceph::ErasureCodeInterfaceRef &ec_impl,
pg_t pgid,
const ECUtil::stripe_info_t &sinfo,
- const std::map<hobject_t,extent_map> &partial_extents,
+ const std::map<hobject_t, ECUtil::shard_extent_map_t> &partial_extents,
std::vector<pg_log_entry_t> &entries,
- std::map<hobject_t,extent_map> *written,
- std::map<shard_id_t, ceph::os::Transaction> *transactions,
+ std::map<hobject_t, ECUtil::shard_extent_map_t> *written_map,
+ shard_id_map<ceph::os::Transaction> *transactions,
std::set<hobject_t> *temp_added,
std::set<hobject_t> *temp_removed,
DoutPrefixProvider *dpp,
- const ceph_release_t require_osd_release = ceph_release_t::unknown);
-};
-
-#endif
+ const OSDMapRef &osdmap
+ );
+}
#include "global/global_context.h"
#include "include/encoding.h"
-/* This file is soon going to be replaced (before next release), so we are going
- * to simply ignore all deprecated warnings.
- * */
-IGNORE_DEPRECATED
-
using namespace std;
using ceph::bufferlist;
using ceph::ErasureCodeInterfaceRef;
using ceph::Formatter;
-std::pair<uint64_t, uint64_t> ECUtil::stripe_info_t::chunk_aligned_offset_len_to_chunk(
- std::pair<uint64_t, uint64_t> in) const {
- pair<uint64_t, uint64_t> tmp = offset_len_to_stripe_bounds(in);
+template <typename T>
+using shard_id_map = shard_id_map<T>;
+
+std::pair<uint64_t, uint64_t>
+ECUtil::stripe_info_t::chunk_aligned_ro_range_to_shard_ro_range(
+ uint64_t _off, uint64_t _len) const {
+ auto [off, len] = ro_offset_len_to_stripe_ro_offset_len(_off, _len);
return std::make_pair(
- chunk_aligned_logical_offset_to_chunk_offset(tmp.first),
- chunk_aligned_logical_size_to_chunk_size(tmp.second));
+ chunk_aligned_ro_offset_to_chunk_offset(off),
+ chunk_aligned_ro_length_to_shard_length(len));
}
-int ECUtil::decode(
- const stripe_info_t &sinfo,
- ErasureCodeInterfaceRef &ec_impl,
- const set<int> want_to_read,
- map<int, bufferlist> &to_decode,
- bufferlist *out)
-{
- ceph_assert(to_decode.size());
+/*
+ASCII Art describing the various variables in the following function:
+ start end
+ | |
+ | |
+ | |
+ - - - - - -v- -+---+-----------+ - - - - - -
+ start_adj| | | ^
+to_read.offset - ->-------+ | | chunk_size
+ | | | v
+ +------+ - - - - - + - - - - - + - - - - - -
+ | | |
+ | v |
+ | - - - - +-------+
+ | end_adj|
+ | +-------+
+ | | |
+ +--------------+ |
+ | |
+ | shard |
+
+Given an offset and size, this populates a per-shard extent set describing
+the minimal IO ranges on each shard. If passed, this method will also
+populate a superset of all extents required.
+ */
+void ECUtil::stripe_info_t::ro_range_to_shards(
+ uint64_t ro_offset,
+ uint64_t ro_size,
+ shard_extent_set_t *shard_extent_set,
+ extent_set *extent_superset,
+ buffer::list *bl,
+ shard_extent_map_t *shard_extent_map) const {
+ // Some of the maths below assumes a non-zero size.
+ if (ro_size == 0) {
+ return;
+ }
+
+ uint64_t k = get_k();
+
+ // Aim is to minimise non-power-of-2 divisions (chunk_size is assumed to be
+ // a power of 2). These should be the only such divisions.
+ uint64_t begin_div = ro_offset / stripe_width;
+ uint64_t end_div = (ro_offset + ro_size + stripe_width - 1) / stripe_width -
+ 1;
+ uint64_t start = begin_div * chunk_size;
+ uint64_t end = end_div * chunk_size;
+
+ uint64_t start_shard = (ro_offset - begin_div * stripe_width) / chunk_size;
+ uint64_t chunk_count = (ro_offset + ro_size + chunk_size - 1) / chunk_size -
+ ro_offset / chunk_size;
+
+ // The end_shard needs a modulus to calculate the actual shard, however
+ // it is convenient to store it like this for the loop.
+ auto end_shard = start_shard + std::min(chunk_count, k);
+
+ // The last shard is the raw shard index which contains the last chunk.
+ // Is it possible to calculate this without the +%?
+ uint64_t last_shard = (start_shard + chunk_count - 1) % k;
+
+ uint64_t buffer_shard_start_offset = 0;
+
+ for (auto i = start_shard; i < end_shard; i++) {
+ raw_shard_id_t raw_shard(i >= k ? i - k : i);
+
+ // Adjust the start and end blocks if needed.
+ uint64_t start_adj = 0;
+ uint64_t end_adj = 0;
+
+ if (raw_shard < start_shard) {
+ // Shards before the start, must start on the next chunk.
+ start_adj = chunk_size;
+ } else if (int(raw_shard) == int(start_shard)) {
+ // The start shard itself needs to be moved a partial-chunk forward.
+ start_adj = ro_offset % chunk_size;
+ }
+
+ // The end is similar to the start, but the end must be rounded up.
+ if (raw_shard < last_shard) {
+ end_adj = chunk_size;
+ } else if (int(raw_shard) == int(last_shard)) {
+ end_adj = (ro_offset + ro_size - 1) % chunk_size + 1;
+ }
+
+ shard_id_t shard = get_shard(raw_shard);
+
+ uint64_t off = start + start_adj;
+ uint64_t len = end + end_adj - start - start_adj;
+ if (shard_extent_set) {
+ (*shard_extent_set)[shard].union_insert(off, len);
+ }
+
+ if (extent_superset) {
+ extent_superset->union_insert(off, len);
+ }
- uint64_t total_data_size = to_decode.begin()->second.length();
- ceph_assert(total_data_size % sinfo.get_chunk_size() == 0);
+ if (shard_extent_map) {
+ ceph_assert(bl);
+ buffer::list shard_bl;
- ceph_assert(out);
- ceph_assert(out->length() == 0);
+ uint64_t bl_offset = buffer_shard_start_offset;
- for (map<int, bufferlist>::iterator i = to_decode.begin();
- i != to_decode.end();
- ++i) {
- ceph_assert(i->second.length() == total_data_size);
+ // Start with any partial chunks.
+ if (chunk_size != start_adj) {
+ shard_bl.substr_of(*bl, bl_offset,
+ min(static_cast<uint64_t>(bl->length()) - bl_offset,
+ chunk_size - start_adj));
+ buffer_shard_start_offset += chunk_size - start_adj;
+ bl_offset += chunk_size - start_adj + (k - 1) * chunk_size;
+ } else {
+ buffer_shard_start_offset += chunk_size;
+ }
+ while (bl_offset < bl->length()) {
+ buffer::list tmp;
+ tmp.substr_of(*bl, bl_offset,
+ min(chunk_size, bl->length() - bl_offset));
+ shard_bl.append(tmp);
+ bl_offset += k * chunk_size;
+ }
+ shard_extent_map->insert_in_shard(shard, off, shard_bl, ro_offset,
+ ro_offset + ro_size);
+ }
}
+}
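/* Illustrative sketch (not from the Ceph tree): the same start/end arithmetic
 * as ro_range_to_shards(), reduced to plain integers as a worked example.
 * With k = 4 and chunk_size = 4096 (stripe_width = 16384), the rados range
 * 6144~14336 touches raw shards 1-3 in stripe 0 and raw shard 0 in stripe 1,
 * which this prints as per-shard offset~length pairs.
 */
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t k = 4, chunk_size = 4096, stripe_width = k * chunk_size;
  const uint64_t ro_offset = 6144, ro_size = 14336;

  uint64_t begin_div = ro_offset / stripe_width;
  uint64_t end_div = (ro_offset + ro_size + stripe_width - 1) / stripe_width - 1;
  uint64_t start = begin_div * chunk_size;
  uint64_t end = end_div * chunk_size;
  uint64_t start_shard = (ro_offset - begin_div * stripe_width) / chunk_size;
  uint64_t chunk_count =
      (ro_offset + ro_size + chunk_size - 1) / chunk_size - ro_offset / chunk_size;
  uint64_t end_shard = start_shard + std::min(chunk_count, k);
  uint64_t last_shard = (start_shard + chunk_count - 1) % k;

  for (uint64_t i = start_shard; i < end_shard; i++) {
    uint64_t raw_shard = i >= k ? i - k : i;
    uint64_t start_adj = 0, end_adj = 0;
    if (raw_shard < start_shard) {
      start_adj = chunk_size;                      // starts on the next chunk
    } else if (raw_shard == start_shard) {
      start_adj = ro_offset % chunk_size;          // partial first chunk
    }
    if (raw_shard < last_shard) {
      end_adj = chunk_size;                        // runs to the chunk end
    } else if (raw_shard == last_shard) {
      end_adj = (ro_offset + ro_size - 1) % chunk_size + 1;  // partial last chunk
    }
    std::printf("raw shard %llu: %llu~%llu\n",
                (unsigned long long)raw_shard,
                (unsigned long long)(start + start_adj),
                (unsigned long long)(end + end_adj - start - start_adj));
  }
  return 0;
}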
- if (total_data_size == 0)
- return 0;
+void ECUtil::stripe_info_t::trim_shard_extent_set_for_ro_offset(
+ uint64_t ro_offset,
+ shard_extent_set_t &shard_extent_set) const {
+ /* If the offset is within the first shard, then the remaining shards are
+ * not written and we don't need to generate zeros for them either. */
+ int ro_offset_shard = (ro_offset / chunk_size) % k;
+ if (ro_offset_shard == 0) {
+ uint64_t shard_offset = ro_offset_to_shard_offset(
+ ro_offset, raw_shard_id_t(0));
+ for (auto &&iter = shard_extent_set.begin(); iter != shard_extent_set.end()
+ ;) {
+ iter->second.erase_after(align_page_next(shard_offset));
+ if (iter->second.empty()) iter = shard_extent_set.erase(iter);
+ else ++iter;
+ }
+ }
+}
+
+void ECUtil::stripe_info_t::ro_size_to_stripe_aligned_read_mask(
+ uint64_t ro_size,
+ shard_extent_set_t &shard_extent_set) const {
+ ro_range_to_shard_extent_set_with_parity(
+ 0, ro_offset_to_next_stripe_ro_offset(ro_size), shard_extent_set);
+ trim_shard_extent_set_for_ro_offset(ro_size, shard_extent_set);
+}
+
+void ECUtil::stripe_info_t::ro_size_to_read_mask(
+ uint64_t ro_size,
+ shard_extent_set_t &shard_extent_set) const {
+ ro_range_to_shard_extent_set_with_parity(0, align_page_next(ro_size),
+ shard_extent_set);
+}
+
+void ECUtil::stripe_info_t::ro_size_to_zero_mask(
+ uint64_t ro_size,
+ shard_extent_set_t &shard_extent_set) const {
+ // There should never be any zero padding on the parity.
+ ro_range_to_shard_extent_set(align_page_next(ro_size),
+ ro_offset_to_next_stripe_ro_offset(ro_size) -
+ align_page_next(ro_size),
+ shard_extent_set);
+ trim_shard_extent_set_for_ro_offset(ro_size, shard_extent_set);
+}
+
+namespace ECUtil {
+void shard_extent_map_t::erase_after_ro_offset(uint64_t ro_offset) {
+ /* Ignore the null case */
+ if (ro_offset >= ro_end) {
+ return;
+ }
- for (uint64_t i = 0; i < total_data_size; i += sinfo.get_chunk_size()) {
- map<int, bufferlist> chunks;
- for (map<int, bufferlist>::iterator j = to_decode.begin();
- j != to_decode.end();
- ++j) {
- chunks[j->first].substr_of(j->second, i, sinfo.get_chunk_size());
+ shard_extent_set_t ro_to_erase(sinfo->get_k_plus_m());
+ sinfo->ro_range_to_shard_extent_set(ro_offset, ro_end - ro_start,
+ ro_to_erase);
+ for (auto &&[shard, eset] : ro_to_erase) {
+ if (extent_maps.contains(shard)) {
+ extent_maps[shard].erase(eset.range_start(), eset.range_end());
+ }
+
+ // If the result is empty, delete the extent map.
+ if (extent_maps[shard].empty()) {
+ extent_maps.erase(shard);
}
- bufferlist bl;
- int r = ec_impl->decode_concat(want_to_read, chunks, &bl);
- ceph_assert(r == 0);
- ceph_assert(bl.length() % sinfo.get_chunk_size() == 0);
- out->claim_append(bl);
}
- return 0;
+
+ compute_ro_range();
+}
+
+shard_extent_map_t shard_extent_map_t::intersect_ro_range(
+ uint64_t ro_offset,
+ uint64_t ro_length) const {
+ // Optimise (common) use case where the overlap is everything
+ if (ro_offset <= ro_start &&
+ ro_offset + ro_length >= ro_end) {
+ return *this;
+ }
+
+ // Optimise (common) use cases where the overlap is nothing
+ if (ro_offset >= ro_end ||
+ ro_offset + ro_length <= ro_start) {
+ return shard_extent_map_t(sinfo);
+ }
+
+ shard_extent_set_t ro_to_intersect(sinfo->get_k_plus_m());
+ sinfo->ro_range_to_shard_extent_set(ro_offset, ro_length, ro_to_intersect);
+
+ return intersect(ro_to_intersect);
+}
+
+shard_extent_map_t shard_extent_map_t::intersect(
+ optional<shard_extent_set_t> const &other) const {
+ if (!other) {
+ return shard_extent_map_t(sinfo);
+ }
+
+ return intersect(*other);
+}
+
+shard_extent_map_t shard_extent_map_t::intersect(
+ shard_extent_set_t const &other) const {
+ shard_extent_map_t out(sinfo);
+ out.ro_end = 0;
+ out.end_offset = 0;
+
+ for (auto &&[shard, this_eset] : other) {
+ if (extent_maps.contains(shard)) {
+ extent_map tmp;
+ extent_set eset;
+ extent_maps.at(shard).to_interval_set(eset);
+ eset.intersection_of(this_eset);
+
+ for (auto [offset, len] : eset) {
+ bufferlist bl;
+ get_buffer(shard, offset, len, bl);
+ tmp.insert(offset, len, bl);
+ }
+ if (!tmp.empty()) {
+ uint64_t range_start = tmp.get_start_off();
+ uint64_t range_end = tmp.get_end_off();
+
+ out.start_offset = min(out.start_offset, range_start);
+ out.end_offset = max(out.end_offset, range_end);
+
+ raw_shard_id_t raw_shard = sinfo->get_raw_shard(shard);
+ if (raw_shard < sinfo->get_k()) {
+ out.ro_start = std::min(out.ro_start,
+ calc_ro_offset(raw_shard, range_start));
+ out.ro_end = std::max(out.ro_end, calc_ro_end(raw_shard, range_end));
+ }
+
+ out.extent_maps.emplace(shard, std::move(tmp));
+ }
+ }
+ }
+
+ if (out.ro_start == invalid_offset) {
+ out.ro_end = out.end_offset = invalid_offset;
+ }
+
+ return out;
+}
+
+void shard_extent_map_t::insert(shard_extent_map_t const &other) {
+ for (auto &&[shard, emap] : other.extent_maps) {
+ if (!extent_maps.contains(shard)) {
+ extent_maps.emplace(shard, emap);
+ } else {
+ extent_maps[shard].insert(emap);
+ }
+ }
+
+ if (ro_start == invalid_offset || other.ro_start < ro_start) {
+ ro_start = other.ro_start;
+ }
+ if (ro_end == invalid_offset || other.ro_end > ro_end) {
+ ro_end = other.ro_end;
+ }
+ if (start_offset == invalid_offset || other.start_offset < start_offset) {
+ start_offset = other.start_offset;
+ }
+ if (end_offset == invalid_offset || other.end_offset > end_offset) {
+ end_offset = other.end_offset;
+ }
+}
+
+uint64_t shard_extent_map_t::size() {
+ uint64_t size = 0;
+ for (auto &i : extent_maps) {
+ for (auto &j : i.second) {
+ size += j.get_len();
+ }
+ }
+
+ return size;
+}
+
+void shard_extent_map_t::clear() {
+ ro_start = ro_end = start_offset = end_offset = invalid_offset;
+ extent_maps.clear();
+}
+
+void shard_extent_map_t::deep_copy(shard_extent_map_t const &other) {
+ for (auto &&[shard, emap] : other.extent_maps) {
+ for (auto iter : emap) {
+ uint64_t off = iter.get_off();
+ uint64_t len = iter.get_len();
+ bufferlist bl = iter.get_val();
+ bl.rebuild();
+ extent_maps[shard].insert(off, len, bl);
+ }
+ }
+}
+
+/* Insert a buffer for a particular shard.
+ * NOTE: DO NOT CALL sinfo->get_min_want_shards()
+ */
+void shard_extent_map_t::insert_in_shard(shard_id_t shard, uint64_t off,
+ const buffer::list &bl) {
+ if (bl.length() == 0) {
+ return;
+ }
+
+ extent_maps[shard].insert(off, bl.length(), bl);
+ raw_shard_id_t raw_shard = sinfo->get_raw_shard(shard);
+
+ if (raw_shard >= sinfo->get_k()) {
+ return;
+ }
+
+ uint64_t new_start = calc_ro_offset(sinfo->get_raw_shard(shard), off);
+ uint64_t new_end =
+ calc_ro_end(sinfo->get_raw_shard(shard), off + bl.length());
+ if (empty()) {
+ ro_start = new_start;
+ ro_end = new_end;
+ start_offset = off;
+ end_offset = off + bl.length();
+ } else {
+ ro_start = min(ro_start, new_start);
+ ro_end = max(ro_end, new_end);
+ start_offset = min(start_offset, off);
+ end_offset = max(end_offset, off + bl.length());
+ }
}
-int ECUtil::decode(
- const stripe_info_t &sinfo,
- ErasureCodeInterfaceRef &ec_impl,
- map<int, bufferlist> &to_decode,
- map<int, bufferlist*> &out) {
+/* Insert a buffer for a particular shard.
+ * If the client knows the new start and end, use this interface to improve
+ * performance.
+ */
+void shard_extent_map_t::insert_in_shard(shard_id_t shard, uint64_t off,
+ const buffer::list &bl,
+ uint64_t new_start, uint64_t new_end) {
+ if (bl.length() == 0) {
+ return;
+ }
+
+ extent_maps[shard].insert(off, bl.length(), bl);
+ if (empty()) {
+ ro_start = new_start;
+ ro_end = new_end;
+ start_offset = off;
+ end_offset = off + bl.length();
+ } else {
+ ro_start = min(ro_start, new_start);
+ ro_end = max(ro_end, new_end);
+ start_offset = min(start_offset, off);
+ end_offset = max(end_offset, off + bl.length());
+ }
+}
- ceph_assert(to_decode.size());
+/* Insert a region of zeros in rados object address space.
+ */
+void shard_extent_map_t::insert_ro_zero_buffer(uint64_t ro_offset,
+ uint64_t ro_length) {
+ buffer::list zero_buffer;
+ zero_buffer.append_zero(ro_length);
+ sinfo->ro_range_to_shard_extent_map(ro_offset, ro_length, zero_buffer, *this);
+}
- for (auto &&i : to_decode) {
- if(i.second.length() == 0)
- return 0;
+/* Append zeros to the extent maps, such that all bytes from the current end
+ * of the rados object range to the specified offset are zero. Note that the
+ * byte at ro_offset does NOT get populated, so that this works as an
+ * addition to length.
+ */
+void shard_extent_map_t::append_zeros_to_ro_offset(uint64_t ro_offset) {
+ uint64_t _ro_end = ro_end == invalid_offset ? 0 : ro_end;
+ if (ro_offset <= _ro_end) {
+ return;
}
+ uint64_t append_offset = _ro_end;
+ uint64_t append_length = ro_offset - _ro_end;
+ insert_ro_zero_buffer(append_offset, append_length);
+}
- set<int> need;
- for (map<int, bufferlist*>::iterator i = out.begin();
- i != out.end();
- ++i) {
- ceph_assert(i->second);
- ceph_assert(i->second->length() == 0);
- need.insert(i->first);
+/* This method rearranges buffers from a rados object extent map into a shard
+ * extent map. Note that it is a simple transformation; it does NOT perform
+ * any encoding of parity shards.
+ */
+void shard_extent_map_t::insert_ro_extent_map(const extent_map &host_extent_map) {
+ for (auto &&range = host_extent_map.begin();
+ range != host_extent_map.end();
+ ++range) {
+ buffer::list bl = range.get_val();
+ sinfo->ro_range_to_shard_extent_map(
+ range.get_off(),
+ range.get_len(),
+ bl,
+ *this);
}
+}
- set<int> avail;
- for (auto &&i : to_decode) {
- ceph_assert(i.second.length() != 0);
- avail.insert(i.first);
+extent_set shard_extent_map_t::get_extent_superset() const {
+ extent_set eset;
+
+ for (auto &&[shard, emap] : extent_maps) {
+ emap.to_interval_set(eset);
}
- map<int, vector<pair<int, int>>> min;
- int r = ec_impl->minimum_to_decode(need, avail, &min);
- ceph_assert(r == 0);
+ return eset;
+}
- int chunks_count = 0;
- int repair_data_per_chunk = 0;
- int subchunk_size = sinfo.get_chunk_size()/ec_impl->get_sub_chunk_count();
+void shard_extent_map_t::insert_parity_buffers() {
+ extent_set encode_set = get_extent_superset();
- for (auto &&i : to_decode) {
- auto found = min.find(i.first);
- if (found != min.end()) {
- int repair_subchunk_count = 0;
- for (auto& subchunks : min[i.first]) {
- repair_subchunk_count += subchunks.second;
+ /* Invent buffers for the parity coding, if they were not provided.
+ * e.g. appends will not provide parity buffers.
+ * We should EITHER have no buffers, or have the right buffers.
+ */
+ for (raw_shard_id_t raw_shard(sinfo->get_k()); raw_shard < sinfo->
+ get_k_plus_m(); ++raw_shard) {
+ shard_id_t shard = sinfo->get_shard(raw_shard);
+
+ for (auto &&[offset, length] : encode_set) {
+ /* No need to recreate buffers we already have */
+ if (extent_maps.contains(shard)) {
+ extent_map emap = extent_maps.at(shard);
+ if (emap.contains(offset, length))
+ continue;
}
- repair_data_per_chunk = repair_subchunk_count * subchunk_size;
- chunks_count = (int)i.second.length() / repair_data_per_chunk;
- break;
+ bufferlist bl;
+ bl.push_back(buffer::create_aligned(length, CEPH_PAGE_SIZE));
+ extent_maps[shard].insert(offset, length, bl);
}
}
+}
+
+slice_iterator<shard_id_t, extent_map> shard_extent_map_t::begin_slice_iterator(
+ const shard_id_set &out) {
+ return slice_iterator(extent_maps, out);
+}
+
+/* Encode parity chunks, using the encode_chunks interface into the
+ * erasure coding. This generates all parity using full stripe writes.
+ */
+int shard_extent_map_t::_encode(const ErasureCodeInterfaceRef &ec_impl) {
+ shard_id_set out_set = sinfo->get_parity_shards();
+ bool rebuild_req = false;
+
+ for (auto iter = begin_slice_iterator(out_set); !iter.is_end(); ++iter) {
+ if (!iter.is_page_aligned()) {
+ rebuild_req = true;
+ break;
+ }
- for (int i = 0; i < chunks_count; i++) {
- map<int, bufferlist> chunks;
- for (auto j = to_decode.begin();
- j != to_decode.end();
- ++j) {
- chunks[j->first].substr_of(j->second,
- i*repair_data_per_chunk,
- repair_data_per_chunk);
+ shard_id_map<bufferptr> &in = iter.get_in_bufferptrs();
+ shard_id_map<bufferptr> &out = iter.get_out_bufferptrs();
+
+ if (int ret = ec_impl->encode_chunks(in, out)) {
+ return ret;
}
- map<int, bufferlist> out_bls;
- r = ec_impl->decode(need, chunks, &out_bls, sinfo.get_chunk_size());
- ceph_assert(r == 0);
- for (auto j = out.begin(); j != out.end(); ++j) {
- ceph_assert(out_bls.count(j->first));
- ceph_assert(out_bls[j->first].length() == sinfo.get_chunk_size());
- j->second->claim_append(out_bls[j->first]);
+ }
+
+ if (rebuild_req) {
+ pad_and_rebuild_to_page_align();
+ return _encode(ec_impl);
+ }
+
+ return 0;
+}
+
+/* Encode parity chunks, using the encode_chunks interface into the
+ * erasure coding. This generates all parity using full stripe writes.
+ */
+int shard_extent_map_t::encode(const ErasureCodeInterfaceRef &ec_impl,
+ const HashInfoRef &hinfo,
+ uint64_t before_ro_size) {
+ int r = _encode(ec_impl);
+
+ if (!r && hinfo && ro_start >= before_ro_size) {
+ /* NEEDS REVIEW: The following calculates the new hinfo CRCs. This is
+ * currently considering ALL the buffers, including the
+ * parity buffers. Is this really right?
+ * Also, does this really belong here? It is convenient
+ * because we have just built the buffer list...
+ */
+ shard_id_set full_set;
+ full_set.insert_range(shard_id_t(0), sinfo->get_k_plus_m());
+ for (auto iter = begin_slice_iterator(full_set); !iter.is_end(); ++iter) {
+ ceph_assert(ro_start == before_ro_size);
+ hinfo->append(iter.get_offset(), iter.get_in_bufferptrs());
}
}
- for (auto &&i : out) {
- ceph_assert(i.second->length() == chunks_count * sinfo.get_chunk_size());
+
+ return r;
+}
+
+/* Encode parity chunks, using the parity delta write interfaces on plugins
+ * that support them.
+ */
+int shard_extent_map_t::encode_parity_delta(
+ const ErasureCodeInterfaceRef &ec_impl,
+ shard_extent_map_t &old_sem) {
+ shard_id_set out_set = sinfo->get_parity_shards();
+
+ pad_and_rebuild_to_page_align();
+ old_sem.pad_and_rebuild_to_page_align();
+
+ for (auto data_shard : sinfo->get_data_shards()) {
+ shard_extent_map_t s(sinfo);
+ if (!contains_shard(data_shard)) {
+ continue;
+ }
+ s.extent_maps[shard_id_t(0)] = old_sem.extent_maps[data_shard];
+ s.extent_maps[shard_id_t(1)] = extent_maps[data_shard];
+ for (shard_id_t parity_shard : sinfo->get_parity_shards()) {
+ if (extent_maps.contains(parity_shard)) {
+ s.extent_maps[parity_shard] = extent_maps[parity_shard];
+ }
+ }
+
+ s.compute_ro_range();
+
+ for (auto iter = s.begin_slice_iterator(out_set); !iter.is_end(); ++iter) {
+ ceph_assert(iter.is_page_aligned());
+ shard_id_map<bufferptr> &data_shards = iter.get_in_bufferptrs();
+ shard_id_map<bufferptr> &parity_shards = iter.get_out_bufferptrs();
+
+ unsigned int size = iter.get_length();
+ ceph_assert(size % 4096 == 0);
+ ceph_assert(size > 0);
+ bufferptr delta = buffer::create_aligned(size, CEPH_PAGE_SIZE);
+
+ if (data_shards[shard_id_t(0)].length() != 0 && data_shards[shard_id_t(1)]
+ .length() != 0) {
+ ec_impl->encode_delta(data_shards[shard_id_t(0)],
+ data_shards[shard_id_t(1)], &delta);
+ shard_id_map<bufferptr> in(sinfo->get_k_plus_m());
+ in.emplace(data_shard, delta);
+ ec_impl->apply_delta(in, parity_shards);
+ }
+ }
}
+
+ compute_ro_range();
return 0;
}
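/* Illustrative sketch (not from the Ceph tree): what an encode_delta /
 * apply_delta pair amounts to for a plain XOR parity. Real plugins generalise
 * this with per-parity Galois-field coefficients; the buffer handling here is
 * a deliberately minimal stand-in for bufferptr.
 */
#include <cstddef>
#include <cstdint>
#include <vector>

using chunk = std::vector<uint8_t>;

// delta = old_data XOR new_data for the single modified data shard.
inline chunk encode_delta(const chunk &old_data, const chunk &new_data) {
  chunk delta(old_data.size());
  for (size_t i = 0; i < delta.size(); ++i) {
    delta[i] = old_data[i] ^ new_data[i];
  }
  return delta;
}

// parity' = parity XOR delta: only the modified data shard and the parity
// shards need to be read and rewritten, not the whole stripe.
inline void apply_delta(const chunk &delta, chunk &parity) {
  for (size_t i = 0; i < parity.size(); ++i) {
    parity[i] ^= delta[i];
  }
}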
-int ECUtil::encode(
- const stripe_info_t &sinfo,
- ErasureCodeInterfaceRef &ec_impl,
- bufferlist &in,
- const set<int> &want,
- map<int, bufferlist> *out) {
+void shard_extent_map_t::pad_on_shards(const shard_extent_set_t &pad_to,
+ const shard_id_set &shards) {
+ for (auto &shard : shards) {
+ if (!pad_to.contains(shard)) {
+ continue;
+ }
+ for (auto &[off, length] : pad_to.at(shard)) {
+ bufferlist bl;
+ bl.push_back(buffer::create_aligned(length, CEPH_PAGE_SIZE));
+ insert_in_shard(shard, off, bl);
+ }
+ }
+}
- uint64_t logical_size = in.length();
+void shard_extent_map_t::pad_on_shards(const extent_set &pad_to,
+ const shard_id_set &shards) {
+ for (auto &shard : shards) {
+ for (auto &[off, length] : pad_to) {
+ bufferlist bl;
+ bl.push_back(buffer::create_aligned(length, CEPH_PAGE_SIZE));
+ insert_in_shard(shard, off, bl);
+ }
+ }
+}
+
+/* Trim to the specified extent set. Note that this will assert if the shard
+ * extent set does not contain the extents described in trim_to.
+ */
+void shard_extent_map_t::trim(const shard_extent_set_t &trim_to) {
- ceph_assert(logical_size % sinfo.get_stripe_width() == 0);
- ceph_assert(out);
- ceph_assert(out->empty());
+ // Erase any shards missing from trim_to
+ for (auto iter = extent_maps.begin(); iter != extent_maps.end();) {
+ auto && [shard, emap] = *iter;
+ if (!trim_to.contains(shard)) {
+ iter = extent_maps.erase(iter);
+ } else {
+ ++iter;
+ }
+ }
+ for (auto &&[shard, want_eset] : trim_to) {
+ extent_set tmp;
+ ceph_assert(extent_maps.contains(shard));
+ extent_map &emap = extent_maps.at(shard);
+ emap.to_interval_set(tmp);
+ ceph_assert(tmp.contains(want_eset));
+
+ // Now trim to what was requested.
+ if (tmp.size() != want_eset.size()) {
+ tmp.subtract(trim_to.at(shard));
+ for (auto [off, len] : tmp) {
+ emap.erase(off, len);
+ }
+ }
+ }
- if (logical_size == 0)
+ compute_ro_range();
+}
+
+int shard_extent_map_t::decode(const ErasureCodeInterfaceRef &ec_impl,
+ const shard_extent_set_t &want,
+ uint64_t object_size) {
+ shard_id_set want_set;
+ shard_id_set have_set;
+ want.populate_shard_id_set(want_set);
+ extent_maps.populate_bitset_set(have_set);
+
+ shard_id_set need_set = shard_id_set::difference(want_set, have_set);
+
+ /* Optimise the no-op */
+ if (need_set.empty()) {
return 0;
+ }
+
+ if (add_zero_padding_for_decode(object_size, need_set)) {
+ // We added some zero buffers, which means our have and need set may change
+ extent_maps.populate_bitset_set(have_set);
+ need_set = shard_id_set::difference(want_set, have_set);
+ }
+
+ shard_id_set decode_set = shard_id_set::intersection(need_set, sinfo->get_data_shards());
+ shard_id_set encode_set = shard_id_set::intersection(need_set, sinfo->get_parity_shards());
+ int r = 0;
+ if (!decode_set.empty()) {
+ pad_on_shards(want, decode_set);
+ /* If we are going to be encoding, we need to make sure all the necessary
+ * shards are decoded. The get_min_available functions should have already
+ * worked out what needs to be read for this.
+ */
+ extent_set decode_for_parity;
+ for (auto shard : encode_set) {
+ decode_for_parity.insert(want.at(shard));
+ }
+ pad_on_shards(decode_for_parity, decode_set);
+ r = _decode(ec_impl, want_set, decode_set);
+ }
+ if (!r && !encode_set.empty()) {
+ pad_on_shards(want, encode_set);
+ r = _encode(ec_impl);
+ }
+
+ // If we failed to decode, then bail out, or the trimming below might fail.
+ if (r) {
+ return r;
+ }
+
+ /* Some of the above can invent buffers. There are some edge cases whereby
+ * they can invent buffers outside the want extent_set which are actually
+ * invalid. So here, we trim off those buffers.
+ */
+ trim(want);
+
+ return 0;
+}
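/* Illustrative sketch (not from the Ceph tree): the shard bookkeeping in
 * decode() above, using std::set<int> in place of shard_id_set. Wanted data
 * shards that are missing get reconstructed with decode_chunks(); wanted
 * parity shards are then regenerated from the data with encode_chunks().
 */
#include <algorithm>
#include <iterator>
#include <set>

inline std::set<int> set_diff(const std::set<int> &a, const std::set<int> &b) {
  std::set<int> out;
  std::set_difference(a.begin(), a.end(), b.begin(), b.end(),
                      std::inserter(out, out.begin()));
  return out;
}

inline std::set<int> set_inter(const std::set<int> &a, const std::set<int> &b) {
  std::set<int> out;
  std::set_intersection(a.begin(), a.end(), b.begin(), b.end(),
                        std::inserter(out, out.begin()));
  return out;
}

inline void plan_decode(const std::set<int> &want, const std::set<int> &have,
                        const std::set<int> &data_shards,
                        const std::set<int> &parity_shards,
                        std::set<int> &decode_set, std::set<int> &encode_set) {
  std::set<int> need = set_diff(want, have);
  decode_set = set_inter(need, data_shards);    // rebuilt by decoding
  encode_set = set_inter(need, parity_shards);  // rebuilt by encoding
}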
+
+int shard_extent_map_t::_decode(const ErasureCodeInterfaceRef &ec_impl,
+ const shard_id_set &want_set,
+ const shard_id_set &need_set) {
+ bool rebuild_req = false;
+ for (auto iter = begin_slice_iterator(need_set); !iter.is_end(); ++iter) {
+ if (!iter.is_page_aligned()) {
+ rebuild_req = true;
+ break;
+ }
+ shard_id_map<bufferptr> &in = iter.get_in_bufferptrs();
+ shard_id_map<bufferptr> &out = iter.get_out_bufferptrs();
- for (uint64_t i = 0; i < logical_size; i += sinfo.get_stripe_width()) {
- map<int, bufferlist> encoded;
- bufferlist buf;
- buf.substr_of(in, i, sinfo.get_stripe_width());
- int r = ec_impl->encode(want, buf, &encoded);
- ceph_assert(r == 0);
- for (map<int, bufferlist>::iterator i = encoded.begin();
- i != encoded.end();
- ++i) {
- ceph_assert(i->second.length() == sinfo.get_chunk_size());
- (*out)[i->first].claim_append(i->second);
+ if (int ret = ec_impl->decode_chunks(want_set, in, out)) {
+ return ret;
}
}
- for (map<int, bufferlist>::iterator i = out->begin();
- i != out->end();
- ++i) {
- ceph_assert(i->second.length() % sinfo.get_chunk_size() == 0);
- ceph_assert(
- sinfo.aligned_chunk_offset_to_logical_offset(i->second.length()) ==
- logical_size);
+ if (rebuild_req) {
+ pad_and_rebuild_to_page_align();
+ return _decode(ec_impl, want_set, need_set);
}
+
+ compute_ro_range();
+
return 0;
}
+void shard_extent_map_t::pad_and_rebuild_to_page_align() {
+ bool resized = false;
+ for (auto &&[shard, emap] : extent_maps) {
+ extent_map aligned;
+
+ // Inserting while iterating is not supported in extent maps, so make the
+ // iterated-over emap const to help defend against mistakes.
+ const extent_map &cemap = emap;
+ for (auto i = cemap.begin(); i != cemap.end(); ++i) {
+ bool resized_i = false;
+ bufferlist bl = i.get_val();
+ uint64_t start = i.get_off();
+ uint64_t end = start + i.get_len();
+
+ if ((start & ~CEPH_PAGE_MASK) != 0) {
+ bl.prepend_zero(start - (start & CEPH_PAGE_MASK));
+ start = start & CEPH_PAGE_MASK;
+ resized_i = true;
+ }
+ if ((end & ~CEPH_PAGE_MASK) != 0) {
+ bl.append_zero((end & CEPH_PAGE_MASK) + CEPH_PAGE_SIZE - end);
+ end = (end & CEPH_PAGE_MASK) + CEPH_PAGE_SIZE;
+ resized_i = true;
+ }
+
+ // Perhaps we can get away without page aligning here and only SIMD
+ // align. However, typical workloads are actually page aligned already,
+ // so this should not cause problems on any sensible workload.
+ if (bl.rebuild_aligned_size_and_memory(bl.length(), CEPH_PAGE_SIZE) ||
+ resized_i) {
+ // We are not permitted to modify the emap while iterating.
+ aligned.insert(start, end - start, bl);
+ }
+ if (resized_i) resized = true;
+ }
+ emap.insert(aligned);
+ }
+
+ if (resized) {
+ compute_ro_range();
+ }
+}
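/* Illustrative sketch (not from the Ceph tree): the rounding performed above.
 * With a 4 KiB page, CEPH_PAGE_MASK is ~0xfffULL, so a buffer covering
 * [5000, 9000) gets padded out to the page-aligned range [4096, 12288).
 */
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t page_size = 4096;
  const uint64_t page_mask = ~(page_size - 1);  // stand-in for CEPH_PAGE_MASK

  uint64_t start = 5000, end = 9000;
  if ((start & ~page_mask) != 0) {
    start &= page_mask;                    // round start down to the page
  }
  if ((end & ~page_mask) != 0) {
    end = (end & page_mask) + page_size;   // round end up to the next page
  }

  assert(start == 4096 && end == 12288);
  return 0;
}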
+
+shard_extent_map_t shard_extent_map_t::slice_map(
+ uint64_t offset, uint64_t length) const {
+ // Range entirely contains offset - this will be common for small IO.
+ if (offset <= start_offset && offset + length >= end_offset) return *this;
+
+ shard_extent_map_t slice(sinfo);
+
+ // Null cases just generate an empty map.
+ if (offset >= end_offset) {
+ return slice;
+ }
+ if (offset + length <= start_offset) {
+ return slice;
+ }
+
+ slice.end_offset = slice.ro_end = 0;
+
+ for (auto &&[shard, emap] : extent_maps) {
+ extent_map iemap = emap.intersect(offset, length);
+
+ if (!iemap.empty()) {
+ slice.start_offset = min(slice.start_offset, iemap.get_start_off());
+ slice.end_offset = max(slice.end_offset, iemap.get_end_off());
+ slice.ro_start = min(slice.ro_start,
+ calc_ro_offset(sinfo->get_raw_shard(shard),
+ iemap.get_start_off()));
+ slice.ro_end = max(slice.ro_end,
+ calc_ro_end(sinfo->get_raw_shard(shard),
+ iemap.get_end_off()));
+ slice.extent_maps.emplace(shard, iemap);
+ }
+ }
+
+ if (slice.end_offset == 0) {
+ slice.end_offset = slice.ro_end = invalid_offset;
+ }
+
+ return slice;
+}
+
+void shard_extent_map_t::get_buffer(shard_id_t shard, uint64_t offset,
+ uint64_t length,
+ buffer::list &append_to) const {
+ const extent_map &emap = extent_maps.at(shard);
+ auto &&range = emap.get_lower_range(offset, length);
+
+ if (range == emap.end() || !emap.contains(offset, length)) {
+ return;
+ }
+
+ if (range.get_len() == length) {
+ buffer::list bl = range.get_val();
+ // This should be asserted on extent map insertion.
+ ceph_assert(bl.length() == length);
+ append_to.append(bl);
+ } else {
+ buffer::list bl;
+ bl.substr_of(range.get_val(), offset - range.get_off(), length);
+ append_to.append(bl);
+ }
+}
+
+void shard_extent_map_t::get_shard_first_buffer(shard_id_t shard,
+ buffer::list &append_to) const {
+ if (!extent_maps.contains(shard)) {
+ return;
+ }
+ const extent_map &emap = extent_maps.at(shard);
+ auto range = emap.begin();
+ if (range == emap.end()) {
+ return;
+ }
+
+ append_to.append(range.get_val());
+}
+
+uint64_t shard_extent_map_t::get_shard_first_offset(shard_id_t shard) const {
+ if (!extent_maps.contains(shard)) {
+ return invalid_offset;
+ }
+ const extent_map &emap = extent_maps.at(shard);
+ auto range = emap.begin();
+ if (range == emap.end()) {
+ return invalid_offset;
+ }
+
+ return range.get_off();
+}
+
+void shard_extent_map_t::zero_pad(shard_extent_set_t const &pad_to) {
+ for (auto &&[shard, eset] : pad_to) {
+ for (auto &&[off, len] : eset) {
+ zero_pad(shard, off, len);
+ }
+ }
+}
+
+void shard_extent_map_t::zero_pad(shard_id_t shard, uint64_t offset,
+ uint64_t length) {
+ const extent_map &emap = extent_maps[shard];
+ if (emap.contains(offset, length)) {
+ return;
+ }
+
+ extent_set required;
+ required.union_insert(offset, length);
+ extent_set not_required;
+ emap.to_interval_set(not_required);
+ required.subtract(not_required);
+
+ for (auto [z_off, z_len] : required) {
+ bufferlist zeros;
+ zeros.append_zero(z_len);
+ insert_in_shard(shard, z_off, zeros);
+ }
+}
+
+void shard_extent_map_t::pad_with_other(shard_extent_set_t const &pad_to,
+ shard_extent_map_t const &other) {
+ for (auto &&[shard, eset] : pad_to) {
+ for (auto &&[off, len] : eset) {
+ pad_with_other(shard, off, len, other);
+ }
+ }
+}
+
+void shard_extent_map_t::pad_with_other(shard_id_t shard, uint64_t offset,
+ uint64_t length,
+ shard_extent_map_t const &other) {
+ const extent_map &emap = extent_maps[shard];
+ if (emap.contains(offset, length)) return;
+
+ extent_set required;
+ required.union_insert(offset, length);
+ extent_set not_required;
+ emap.to_interval_set(not_required);
+ required.subtract(not_required);
+
+ for (auto [z_off, z_len] : required) {
+ bufferlist bl;
+ other.get_buffer(shard, z_off, z_len, bl);
+ bl.rebuild();
+ insert_in_shard(shard, z_off, bl);
+ }
+}
+
+ECUtil::shard_extent_set_t shard_extent_map_t::get_extent_set() {
+ shard_extent_set_t shard_eset(sinfo->get_k_plus_m());
+ for (auto &&[shard, emap] : extent_maps) {
+ emap.to_interval_set(shard_eset[shard]);
+ }
+
+ return shard_eset;
+}
+
+void shard_extent_map_t::erase_shard(shard_id_t shard) {
+ if (extent_maps.erase(shard)) {
+ compute_ro_range();
+ }
+}
+
+bufferlist shard_extent_map_t::get_ro_buffer(
+ uint64_t ro_offset,
+ uint64_t ro_length) const {
+ bufferlist bl;
+ uint64_t chunk_size = sinfo->get_chunk_size();
+ uint64_t stripe_size = sinfo->get_stripe_width();
+ int data_chunk_count = sinfo->get_k();
+
+ pair read_pair(ro_offset, ro_length);
+ auto chunk_aligned_read = sinfo->ro_range_to_chunk_ro_range(read_pair);
+
+ raw_shard_id_t raw_shard((ro_offset / chunk_size) % data_chunk_count);
+
+ for (uint64_t chunk_offset = chunk_aligned_read.first;
+ chunk_offset < chunk_aligned_read.first + chunk_aligned_read.second;
+ chunk_offset += chunk_size, ++raw_shard) {
+ if ((int(raw_shard) == data_chunk_count)) {
+ raw_shard = 0;
+ }
+
+ uint64_t sub_chunk_offset = std::max(chunk_offset, ro_offset);
+ uint64_t sub_chunk_shard_offset = (chunk_offset / stripe_size) * chunk_size
+ + sub_chunk_offset - chunk_offset;
+ uint64_t sub_chunk_len = std::min(ro_offset + ro_length,
+ chunk_offset + chunk_size) -
+ sub_chunk_offset;
+
+ get_buffer(sinfo->get_shard(raw_shard), sub_chunk_shard_offset,
+ sub_chunk_len, bl);
+ }
+ return bl;
+}
+
+bufferlist shard_extent_map_t::get_ro_buffer() const {
+ return get_ro_buffer(ro_start, ro_end - ro_start);
+}
+
+std::string shard_extent_map_t::debug_string(uint64_t interval, uint64_t offset) const {
+ std::stringstream str;
+ str << "shard_extent_map_t: " << *this << " bufs: [";
+
+ bool s_comma = false;
+ for (auto &&[shard, emap] : get_extent_maps()) {
+ if (s_comma) str << ", ";
+ s_comma = true;
+ str << shard << ": [";
+
+ bool comma = false;
+ for (auto &&extent : emap) {
+ bufferlist bl = extent.get_val();
+ char *buf = bl.c_str();
+ for (uint64_t i = 0; i < extent.get_len(); i += interval) {
+ int *seed = (int*)&buf[i + offset];
+ if (comma) str << ", ";
+ str << (i + extent.get_off()) << ":" << std::to_string(*seed);
+ comma = true;
+ }
+ }
+ str << "]";
+ }
+ str << "]";
+ return str.str();
+}
+
+void shard_extent_map_t::erase_stripe(uint64_t offset, uint64_t length) {
+ for (auto iter = extent_maps.begin(); iter != extent_maps.end();) {
+ auto &&[shard, emap] = *iter;
+ emap.erase(offset, length);
+ if (emap.empty()) {
+ iter = extent_maps.erase(iter);
+ } else {
+ ++iter;
+ }
+ }
+ compute_ro_range();
+}
+
+bool shard_extent_map_t::contains(shard_id_t shard) const {
+ return extent_maps.contains(shard);
+}
+
+bool shard_extent_map_t::contains(optional<shard_extent_set_t> const &other) const {
+ if (!other) {
+ return true;
+ }
+
+ return contains(*other);
+}
+
+bool shard_extent_map_t::contains(shard_extent_set_t const &other) const {
+ for (auto &&[shard, other_eset] : other) {
+ if (!extent_maps.contains(shard)) {
+ return false;
+ }
+
+ extent_set eset;
+ extent_maps.at(shard).to_interval_set(eset);
+
+ if (!eset.contains(other_eset)) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void shard_extent_set_t::subtract(const shard_extent_set_t &other) {
+ for (auto &&[shard, eset] : other) {
+ if (!contains(shard)) {
+ continue;
+ }
+
+ at(shard).subtract(eset);
+ if (at(shard).empty()) {
+ erase(shard);
+ }
+ }
+}
+
+void shard_extent_set_t::intersection_of(const shard_extent_set_t &other) {
+ for (shard_id_t s; s < map.max_size(); ++s) {
+ if (!map.contains(s) || !other.contains(s)) {
+ erase(s);
+ } else {
+ at(s).intersection_of(other.at(s));
+ if (at(s).empty()) {
+ erase(s);
+ }
+ }
+ }
+}
+
+void shard_extent_set_t::insert(const shard_extent_set_t &other) {
+ for (auto &&[shard, eset] : other) {
+ map[shard].union_of(eset);
+ }
+}
+}
+
void ECUtil::HashInfo::append(uint64_t old_size,
- map<int, bufferlist> &to_append) {
+ shard_id_map<bufferptr> &to_append) {
ceph_assert(old_size == total_chunk_size);
uint64_t size_to_append = to_append.begin()->second.length();
if (has_chunk_hash()) {
ceph_assert(to_append.size() == cumulative_shard_hashes.size());
- for (map<int, bufferlist>::iterator i = to_append.begin();
- i != to_append.end();
- ++i) {
- ceph_assert(size_to_append == i->second.length());
- ceph_assert((unsigned)i->first < cumulative_shard_hashes.size());
- uint32_t new_hash = i->second.crc32c(cumulative_shard_hashes[i->first]);
- cumulative_shard_hashes[i->first] = new_hash;
+ for (auto &&[shard, ptr] : to_append) {
+ ceph_assert(size_to_append == ptr.length());
+ ceph_assert(shard < static_cast<int>(cumulative_shard_hashes.size()));
+ cumulative_shard_hashes[int(shard)] =
+ ceph_crc32c(cumulative_shard_hashes[int(shard)],
+ (unsigned char*)ptr.c_str(), ptr.length());
}
}
total_chunk_size += size_to_append;
}
-void ECUtil::HashInfo::encode(bufferlist &bl) const
-{
+void ECUtil::HashInfo::encode(bufferlist &bl) const {
ENCODE_START(1, 1, bl);
encode(total_chunk_size, bl);
encode(cumulative_shard_hashes, bl);
ENCODE_FINISH(bl);
}
-void ECUtil::HashInfo::decode(bufferlist::const_iterator &bl)
-{
+void ECUtil::HashInfo::decode(bufferlist::const_iterator &bl) {
DECODE_START(1, bl);
decode(total_chunk_size, bl);
decode(cumulative_shard_hashes, bl);
- projected_total_chunk_size = total_chunk_size;
DECODE_FINISH(bl);
}
-void ECUtil::HashInfo::dump(Formatter *f) const
-{
+void ECUtil::HashInfo::dump(Formatter *f) const {
f->dump_unsigned("total_chunk_size", total_chunk_size);
f->open_array_section("cumulative_shard_hashes");
for (unsigned i = 0; i != cumulative_shard_hashes.size(); ++i) {
}
namespace ECUtil {
-std::ostream& operator<<(std::ostream& out, const HashInfo& hi)
-{
+std::ostream &operator<<(std::ostream &out, const HashInfo &hi) {
ostringstream hashes;
- for (auto hash: hi.cumulative_shard_hashes)
+ for (auto hash : hi.cumulative_shard_hashes) {
hashes << " " << hex << hash;
+ }
return out << "tcs=" << hi.total_chunk_size << hashes.str();
}
+
+std::ostream &operator<<(std::ostream &out, const shard_extent_map_t &rhs) {
+ // sinfo not thought to be needed for debug, as it is constant.
+ return out << "shard_extent_map: ({" << rhs.ro_start << "~"
+ << rhs.ro_end << "}, maps=" << rhs.extent_maps << ")";
}
-void ECUtil::HashInfo::generate_test_instances(list<HashInfo*>& o)
-{
+std::ostream &operator<<(std::ostream &out, const log_entry_t &rhs) {
+ switch (rhs.event) {
+ case READ_REQUEST: out << "READ_REQUEST";
+ break;
+ case READ_DONE: out << "READ_DONE";
+ break;
+ case INJECT_EIO: out << "INJECT_EIO";
+ break;
+ case CANCELLED: out << "CANCELLED";
+ break;
+ case ERROR: out << "ERROR";
+ break;
+ case REQUEST_MISSING: out << "REQUEST_MISSING";
+ break;
+ case COMPLETE_ERROR: out << "COMPLETE_ERROR";
+ break;
+ case ERROR_CLEAR: out << "ERROR_CLEAR";
+ break;
+ case COMPLETE: out << "COMPLETE";
+ break;
+ default:
+ ceph_assert(false);
+ }
+ return out << "[" << rhs.shard << "]->" << rhs.io << "\n";
+}
+}
+
+void ECUtil::HashInfo::generate_test_instances(list<HashInfo*> &o) {
o.push_back(new HashInfo(3));
{
bufferlist bl;
bl.append_zero(20);
- map<int, bufferlist> buffers;
- buffers[0] = bl;
- buffers[1] = bl;
- buffers[2] = bl;
+
+ bufferptr bp = bl.begin().get_current_ptr();
+
+ // We don't have the k+m here, but this is not performance critical, so
+ // create an oversized map.
+ shard_id_map<bufferptr> buffers(128);
+ buffers[shard_id_t(0)] = bp;
+ buffers[shard_id_t(1)] = bp;
+ buffers[shard_id_t(2)] = bp;
o.back()->append(0, buffers);
o.back()->append(20, buffers);
}
const string HINFO_KEY = "hinfo_key";
-bool ECUtil::is_hinfo_key_string(const string &key)
-{
+bool ECUtil::is_hinfo_key_string(const string &key) {
return key == HINFO_KEY;
}
-const string &ECUtil::get_hinfo_key()
-{
+const string &ECUtil::get_hinfo_key() {
return HINFO_KEY;
}
-
-END_IGNORE_DEPRECATED
#include "include/buffer_fwd.h"
#include "include/ceph_assert.h"
#include "include/encoding.h"
-#include "common/Formatter.h"
+#include "common/interval_map.h"
+#include "common/mini_flat_map.h"
+
+#include "osd_types.h"
+
+/// If someone wants these types, but not ExtentCache, move to another file
+struct bl_split_merge {
+ ceph::buffer::list split(
+ uint64_t offset,
+ uint64_t length,
+ ceph::buffer::list &bl) const {
+ ceph::buffer::list out;
+ out.substr_of(bl, offset, length);
+ return out;
+ }
+
+ bool can_merge(const ceph::buffer::list &left, const ceph::buffer::list &right) const {
+ return true;
+ }
+
+ ceph::buffer::list merge(ceph::buffer::list &&left, ceph::buffer::list &&right) const {
+ ceph::buffer::list bl{std::move(left)};
+ bl.claim_append(right);
+ return bl;
+ }
+
+ uint64_t length(const ceph::buffer::list &b) const { return b.length(); }
+};
+
+using extent_set = interval_set<uint64_t, boost::container::flat_map, false>;
+using extent_map = interval_map<uint64_t, ceph::buffer::list, bl_split_merge,
+ boost::container::flat_map>;
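/* Illustrative sketch (not from the Ceph tree): a hypothetical helper showing
 * the interval operations these aliases provide (union_insert, intersects,
 * contains), in the style the EC code below uses them.
 */
inline bool partially_present(const extent_set &present, uint64_t offset,
                              uint64_t length) {
  extent_set want;
  want.union_insert(offset, length);
  // True when some, but not all, of the wanted range already has data.
  return present.intersects(offset, length) && !present.contains(want);
}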
+
+/* Slice iterator. This looks for contiguous buffers which are common
+ * across all shards in the out_set.
+ *
+ * It is a template, but essentially:
+ * K must be a key suitable for a mini_flat_map.
+ * T must be either an extent map or a reference to an extent map.
+ */
+template <typename K, typename T>
+class slice_iterator {
+ mini_flat_map<K, T> &input;
+ uint64_t offset = std::numeric_limits<uint64_t>::max();
+ uint64_t length = std::numeric_limits<uint64_t>::max();
+ uint64_t start = std::numeric_limits<uint64_t>::max();
+ uint64_t end = std::numeric_limits<uint64_t>::max();
+ shard_id_map<std::pair<extent_map::const_iterator,
+ bufferlist::const_iterator>> iters;
+ shard_id_map<bufferptr> in;
+ shard_id_map<bufferptr> out;
+ const shard_id_set &out_set;
+
+ void advance() {
+ in.clear();
+ out.clear();
+ offset = start;
+ end = std::numeric_limits<uint64_t>::max();
+
+ if (iters.empty()) {
+ return;
+ }
+
+ // First, find where the current slice ends: the nearest offset at which
+ // any shard's current buffer finishes or a later buffer begins.
+ for (auto &&[shard, iters] : iters) {
+ auto &&[emap_iter, bl_iter] = iters;
+ uint64_t iter_offset = emap_iter.get_off() + bl_iter.get_off();
+ ceph_assert(iter_offset >= start);
+ // If this iterator is after the current offset, then we will ignore
+ // it for this buffer ptr. The end must move to or before this point.
+ if (iter_offset > start && iter_offset < end) {
+ end = iter_offset;
+ continue;
+ }
+
+ uint64_t iter_end = iter_offset + bl_iter.get_current_ptr().length();
+ if (iter_end < end) {
+ end = iter_end;
+ }
+ }
+
+ for (auto &&iter = iters.begin(); iter != iters.end();) {
+ auto shard = iter->first;
+ auto &&[emap_iter, bl_iter] = iter->second;
+ uint64_t iter_offset = emap_iter.get_off() + bl_iter.get_off();
+ bool erase = false;
+
+ // Ignore any blank buffers.
+ if (iter_offset == start) {
+ ceph_assert(iter_offset == start);
+
+ // Create a new buffer pointer for the result. We don't want the client
+ // manipulating the ptr.
+ if (out_set.contains(shard)) {
+ out.emplace(
+ shard, bufferptr(bl_iter.get_current_ptr(), 0, end - start));
+ } else {
+ in.emplace(
+ shard, bufferptr(bl_iter.get_current_ptr(), 0, end - start));
+ }
+
+ // Now we need to move on the iterators.
+ bl_iter += end - start;
+
+ // If we have reached the end of the extent, we need to move that on too.
+ if (bl_iter == emap_iter.get_val().end()) {
+ ++emap_iter;
+ if (emap_iter == input[shard].end()) {
+ erase = true;
+ } else {
+ iters.at(shard).second = emap_iter.get_val().begin();
+ }
+ }
+ } else
+ ceph_assert(iter_offset > start);
+
+ if (erase) {
+ iter = iters.erase(iter);
+ } else {
+ ++iter;
+ }
+ }
+
+ // We can now move the offset on.
+ length = end - start;
+ start = end;
+
+ /* This can arise in two ways:
+ * 1. We can generate an empty buffer out of a gap, so just skip over.
+ * 2. Only the inputs contain any interesting data. We don't need
+ * to perform a decode/encode on a slice in that case.
+ */
+ if (out.empty()) {
+ advance();
+ }
+ }
+
+public:
+ slice_iterator(mini_flat_map<K, T> &_input, const shard_id_set &out_set) :
+ input(_input),
+ iters(input.max_size()),
+ in(input.max_size()),
+ out(input.max_size()),
+ out_set(out_set) {
+ for (auto &&[shard, emap] : input) {
+ auto emap_iter = emap.begin();
+ auto bl_iter = emap_iter.get_val().begin();
+      // Record the start offset before the iterators are moved into the map.
+      if (emap_iter.get_off() < start) {
+        start = emap_iter.get_off();
+      }
+
+      auto p = std::make_pair(std::move(emap_iter), std::move(bl_iter));
+      iters.emplace(shard, std::move(p));
+ }
+
+ advance();
+ }
+
+ shard_id_map<bufferptr> &get_in_bufferptrs() { return in; }
+ shard_id_map<bufferptr> &get_out_bufferptrs() { return out; }
+ uint64_t get_offset() const { return offset; }
+ uint64_t get_length() const { return length; }
+ bool is_end() const { return in.empty() && out.empty(); }
+
+ bool is_page_aligned() const {
+ for (auto &&[_, ptr] : in) {
+ uintptr_t p = (uintptr_t)ptr.c_str();
+ if (p & ~CEPH_PAGE_MASK) return false;
+ if ((p + ptr.length()) & ~CEPH_PAGE_MASK) return false;
+ }
+
+ for (auto &&[_, ptr] : out) {
+ uintptr_t p = (uintptr_t)ptr.c_str();
+ if (p & ~CEPH_PAGE_MASK) return false;
+ if ((p + ptr.length()) & ~CEPH_PAGE_MASK) return false;
+ }
+
+ return true;
+ }
+
+ slice_iterator &operator++() {
+ advance();
+ return *this;
+ }
+};
+
+// Setting to 1 turns on very large amounts of level 0 debug containing the
+// contents of buffers. Even on level 20 this is not really wanted.
+#define DEBUG_EC_BUFFERS 1
namespace ECUtil {
+class shard_extent_map_t;
+
+struct shard_extent_set_t {
+ // The following boilerplate is just to make this look like a map.
+ shard_id_map<extent_set> map;
+
+ shard_extent_set_t(short max_shards) : map(max_shards) {}
+
+ bool contains(shard_id_t shard) const { return map.contains(shard); }
+ bool empty() const { return map.empty(); }
+ void swap(shard_extent_set_t &other) noexcept { map.swap(other.map); }
+ void clear() { map.clear(); }
+ auto erase(shard_id_t shard) { return map.erase(shard); }
+
+ auto erase(shard_id_map<extent_set>::iterator &iter) {
+ return map.erase(iter);
+ }
+
+ void erase_stripe(uint64_t offset, uint64_t length) {
+ for (auto it = map.begin(); it != map.end();) {
+ it->second.erase(offset, length);
+ if (it->second.empty()) it = map.erase(it);
+ else ++it;
+ }
+ }
+
+ auto begin() const { return map.cbegin(); }
+ auto begin() { return map.begin(); }
+ auto end() const { return map.cend(); }
+ auto end() { return map.end(); }
+
+ void emplace(shard_id_t shard, extent_set &&set) {
+ map.emplace(shard, std::move(set));
+ }
+
+ size_t shard_count() const { return map.size(); }
+ extent_set &at(shard_id_t shard) { return map.at(shard); }
+ const extent_set &at(shard_id_t shard) const { return map.at(shard); }
+
+ extent_set get(shard_id_t shard) const {
+ if (!map.contains(shard)) {
+ return extent_set();
+ }
+ return at(shard);
+ }
+
+ extent_set &operator[](shard_id_t shard) { return map[shard]; }
+
+ bool operator==(shard_extent_set_t const &other) const {
+ return map == other.map;
+ }
+
+ friend std::ostream &operator<<(std::ostream &lhs,
+ const shard_extent_set_t &rhs) {
+ lhs << rhs.map;
+ return lhs;
+ }
+
+ void get_extent_superset(extent_set &eset) const {
+ for (auto &&[_, e] : map) {
+ eset.union_of(e);
+ }
+ }
+
+ extent_set get_extent_superset() const {
+ extent_set eset;
+ get_extent_superset(eset);
+ return eset;
+ }
+
+ /* Return the extent set which is common across all populated shards. */
+ extent_set get_extent_common_set() const {
+ extent_set eset;
+ bool first = true;
+ for (auto &&[_, e] : map) {
+ if (first) {
+ eset.insert(e);
+ first = false;
+ } else {
+ eset.intersection_of(e);
+ }
+ }
+ return eset;
+ }
+
+ void align(uint64_t a) {
+ for (auto &&[_, e] : map) {
+ e.align(a);
+ }
+ }
+
+ size_t get_max_shards() const { return map.max_size(); }
+
+ void subtract(const shard_extent_set_t &set);
+ void intersection_of(const shard_extent_set_t &set);
+ void insert(const shard_extent_set_t &set);
+
+ /** return the sum of extent_set.size */
+ uint64_t size() const {
+ uint64_t size = 0;
+ for (auto &&[_, e] : map) size += e.size();
+
+ return size;
+ }
+
+ void populate_shard_id_set(shard_id_set &set) const {
+ map.populate_bitset_set(set);
+ }
+
+ shard_id_set get_shard_id_set() const {
+ shard_id_set r;
+ map.populate_bitset_set(r);
+ return r;
+ }
+};
+
+inline uint64_t page_mask() {
+ static const uint64_t page_mask = ((uint64_t)CEPH_PAGE_SIZE) - 1;
+ return page_mask;
+}
+
+inline uint64_t align_page_next(uint64_t val) {
+ return p2roundup(val, (uint64_t)CEPH_PAGE_SIZE);
+}
+
+inline uint64_t align_page_prev(uint64_t val) {
+ return p2align(val, (uint64_t)CEPH_PAGE_SIZE);
+}
class stripe_info_t {
+ friend class shard_extent_map_t;
+
const uint64_t stripe_width;
+ const uint64_t plugin_flags;
const uint64_t chunk_size;
- const unsigned int k; // Can be calculated with a division from above. Better to cache.
+  const pg_pool_t *pool;
+  // Can be calculated with a division from above. Better to cache.
+  const unsigned int k;
const unsigned int m;
const std::vector<shard_id_t> chunk_mapping;
const std::vector<raw_shard_id_t> chunk_mapping_reverse;
+ const shard_id_set data_shards;
+ const shard_id_set parity_shards;
+
private:
+ void ro_range_to_shards(
+ uint64_t ro_offset,
+ uint64_t ro_size,
+ ECUtil::shard_extent_set_t *shard_extent_set,
+ extent_set *extent_superset,
+ buffer::list *bl,
+ shard_extent_map_t *shard_extent_map) const;
+
static std::vector<shard_id_t> complete_chunk_mapping(
- std::vector<shard_id_t> _chunk_mapping, unsigned int n)
- {
- unsigned int size = _chunk_mapping.size();
+ const std::vector<shard_id_t> &_chunk_mapping, unsigned int n) {
+    unsigned int size = _chunk_mapping.size();
std::vector<shard_id_t> chunk_mapping(n);
- for (shard_id_t i; i < n; ++i) {
+ for (unsigned int i = 0; i < n; i++) {
if (size > i) {
- chunk_mapping.at(static_cast<int>(i)) = _chunk_mapping.at(static_cast<int>(i));
+ chunk_mapping.at(i) = _chunk_mapping.at(i);
} else {
- chunk_mapping.at(static_cast<int>(i)) = i;
+ chunk_mapping.at(i) = static_cast<int>(i);
}
}
return chunk_mapping;
}
+
static std::vector<raw_shard_id_t> reverse_chunk_mapping(
- std::vector<shard_id_t> chunk_mapping)
- {
- unsigned int size = chunk_mapping.size();
+ const std::vector<shard_id_t> &chunk_mapping) {
+ size_t size = chunk_mapping.size();
std::vector<raw_shard_id_t> reverse(size);
shard_id_set used;
- for (raw_shard_id_t i; i < size; ++i) {
- shard_id_t index = chunk_mapping.at(static_cast<int>(i));
+ for (raw_shard_id_t raw_shard; raw_shard < size; ++raw_shard) {
+ shard_id_t shard = chunk_mapping[int(raw_shard)];
// Mapping must be a bijection and a permutation
- ceph_assert(!used.contains(index));
- used.insert(index);
- reverse.at(static_cast<int>(index)) = i;
+ ceph_assert(!used.contains(shard));
+ used.insert(shard);
+ reverse.at(int(shard)) = raw_shard;
}
return reverse;
}
+
+ static shard_id_set calc_shards(raw_shard_id_t start,
+ int count,
+ const std::vector<shard_id_t> &chunk_mapping) {
+    shard_id_set shards;
+    for (raw_shard_id_t raw_shard = start;
+         raw_shard < int(start) + count;
+         ++raw_shard) {
+      shard_id_t shard = chunk_mapping[int(raw_shard)];
+      shards.insert(shard);
+    }
+    return shards;
+ }
+
public:
- stripe_info_t(ErasureCodeInterfaceRef ec_impl, uint64_t stripe_width)
+  stripe_info_t(const ErasureCodeInterfaceRef &ec_impl, const pg_pool_t *pool,
+                uint64_t stripe_width)
: stripe_width(stripe_width),
+ plugin_flags(ec_impl->get_supported_optimizations()),
chunk_size(stripe_width / ec_impl->get_data_chunk_count()),
+ pool(pool),
k(ec_impl->get_data_chunk_count()),
m(ec_impl->get_coding_chunk_count()),
- chunk_mapping(complete_chunk_mapping(ec_impl->get_chunk_mapping(),
- k + m)),
- chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)) {
+ chunk_mapping(
+ complete_chunk_mapping(ec_impl->get_chunk_mapping(), k + m)),
+ chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)),
+ data_shards(calc_shards(raw_shard_id_t(), k, chunk_mapping)),
+ parity_shards(calc_shards(raw_shard_id_t(k), m, chunk_mapping)) {
+ ceph_assert(stripe_width != 0);
ceph_assert(stripe_width % k == 0);
}
+
// Simpler constructors for unit tests
stripe_info_t(unsigned int k, unsigned int m, uint64_t stripe_width)
: stripe_width(stripe_width),
+ plugin_flags(0xFFFFFFFFFFFFFFFFul),
+ // Everything enabled for test harnesses.
chunk_size(stripe_width / k),
+ pool(nullptr),
k(k),
m(m),
chunk_mapping(complete_chunk_mapping(std::vector<shard_id_t>(), k + m)),
- chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)) {
+ chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)),
+ data_shards(calc_shards(raw_shard_id_t(), k, chunk_mapping)),
+ parity_shards(calc_shards(raw_shard_id_t(k), m, chunk_mapping)) {
+ ceph_assert(stripe_width != 0);
+ ceph_assert(stripe_width % k == 0);
+ }
+
+ stripe_info_t(unsigned int k, unsigned int m, uint64_t stripe_width,
+ const std::vector<shard_id_t> &_chunk_mapping)
+ : stripe_width(stripe_width),
+ plugin_flags(0xFFFFFFFFFFFFFFFFul),
+ // Everything enabled for test harnesses.
+ chunk_size(stripe_width / k),
+ pool(nullptr),
+ k(k),
+ m(m),
+ chunk_mapping(complete_chunk_mapping(_chunk_mapping, k + m)),
+ chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)),
+ data_shards(calc_shards(raw_shard_id_t(), k, chunk_mapping)),
+ parity_shards(calc_shards(raw_shard_id_t(k), m, chunk_mapping)) {
+ ceph_assert(stripe_width != 0);
ceph_assert(stripe_width % k == 0);
}
+
stripe_info_t(unsigned int k, unsigned int m, uint64_t stripe_width,
- std::vector<shard_id_t> _chunk_mapping)
+ const pg_pool_t *pool, const std::vector<shard_id_t> &_chunk_mapping)
: stripe_width(stripe_width),
+ plugin_flags(0xFFFFFFFFFFFFFFFFul),
+ // Everything enabled for test harnesses.
chunk_size(stripe_width / k),
+ pool(pool),
k(k),
m(m),
chunk_mapping(complete_chunk_mapping(_chunk_mapping, k + m)),
- chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)) {
+ chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)),
+ data_shards(calc_shards(raw_shard_id_t(), k, chunk_mapping)),
+ parity_shards(calc_shards(raw_shard_id_t(k), m, chunk_mapping)) {
+ ceph_assert(stripe_width != 0);
ceph_assert(stripe_width % k == 0);
}
- bool logical_offset_is_stripe_aligned(uint64_t logical) const {
- return (logical % stripe_width) == 0;
+
+ stripe_info_t(unsigned int k, unsigned int m, uint64_t stripe_width,
+ const pg_pool_t *pool)
+ : stripe_width(stripe_width),
+ plugin_flags(0xFFFFFFFFFFFFFFFFul),
+ // Everything enabled for test harnesses.
+ chunk_size(stripe_width / k),
+ pool(pool),
+ k(k),
+ m(m),
+ chunk_mapping(complete_chunk_mapping(std::vector<shard_id_t>(), k + m)),
+ chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)),
+ data_shards(calc_shards(raw_shard_id_t(), k, chunk_mapping)),
+ parity_shards(calc_shards(raw_shard_id_t(k), m, chunk_mapping)) {
+ ceph_assert(stripe_width != 0);
+ ceph_assert(stripe_width % k == 0);
+ }
+
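+  /* Map a rados object size to the size of an individual shard.
+   * Worked example (k=4, m=2, chunk_size=4096, stripe_width=16384,
+   * size=20000): the remainder is 3616, so raw shard 0 and the parity
+   * shards get align_page_next(4096 + 3616) = 8192 bytes, while raw
+   * shards 1-3 get 4096 bytes.
+   */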
+ uint64_t object_size_to_shard_size(const uint64_t size, shard_id_t shard) const {
+ uint64_t remainder = size % get_stripe_width();
+ uint64_t shard_size = (size - remainder) / k;
+ raw_shard_id_t raw_shard = get_raw_shard(shard);
+ if (raw_shard >= get_k()) {
+      // Coding (parity) shards have the same size as data shard 0.
+ raw_shard = 0;
+ }
+ if (remainder > uint64_t(raw_shard) * get_chunk_size()) {
+ remainder -= uint64_t(raw_shard) * get_chunk_size();
+ if (remainder > get_chunk_size()) {
+ remainder = get_chunk_size();
+ }
+ shard_size += remainder;
+ }
+ return ECUtil::align_page_next(shard_size);
+ }
+
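+  /* Convert a rados object offset into the corresponding offset within a
+   * single shard. For example, with k=4 and chunk_size=4096, ro_offset
+   * 10000 lands in raw shard 2: raw shards 0 and 1 map to 4096, raw shard 2
+   * maps to 1808 and raw shard 3 maps to 0.
+   */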
+ uint64_t ro_offset_to_shard_offset(uint64_t ro_offset,
+ const raw_shard_id_t raw_shard) const {
+ uint64_t full_stripes = (ro_offset / stripe_width) * chunk_size;
+ int offset_shard = (ro_offset / chunk_size) % k;
+
+ if (int(raw_shard) == offset_shard) {
+ return full_stripes + ro_offset % chunk_size;
+ }
+ if (raw_shard < offset_shard) {
+ return full_stripes + chunk_size;
+ }
+ return full_stripes;
+ }
+
+ /**
+ * Return true if shard does not require metadata updates
+ */
+ bool is_nonprimary_shard(const shard_id_t shard) const {
+ return pool->is_nonprimary_shard(shard);
}
+
+ bool supports_ec_overwrites() const {
+ return pool->allows_ecoverwrites();
+ }
+
+ bool supports_sub_chunks() const {
+ return (plugin_flags &
+ ErasureCodeInterface::FLAG_EC_PLUGIN_REQUIRE_SUB_CHUNKS) != 0;
+ }
+
+ bool require_hinfo() const {
+ return !supports_ec_overwrites();
+ }
+
+ bool supports_partial_reads() const {
+ return (plugin_flags &
+ ErasureCodeInterface::FLAG_EC_PLUGIN_PARTIAL_READ_OPTIMIZATION) != 0;
+ }
+
+ bool supports_partial_writes() const {
+ return (plugin_flags &
+ ErasureCodeInterface::FLAG_EC_PLUGIN_PARTIAL_WRITE_OPTIMIZATION) != 0;
+ }
+
+ bool supports_parity_delta_writes() const {
+ return (plugin_flags &
+ ErasureCodeInterface::FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION) != 0;
+ }
+
uint64_t get_stripe_width() const {
return stripe_width;
}
+
uint64_t get_chunk_size() const {
return chunk_size;
}
+
unsigned int get_m() const {
return m;
}
+
unsigned int get_k() const {
return k;
}
+
unsigned int get_k_plus_m() const {
return k + m;
}
- shard_id_t get_shard(raw_shard_id_t raw_shard) const {
- return chunk_mapping[static_cast<int>(raw_shard)];
+
+  shard_id_t get_shard(const raw_shard_id_t raw_shard) const {
+ return chunk_mapping[int(raw_shard)];
}
+
raw_shard_id_t get_raw_shard(shard_id_t shard) const {
- return chunk_mapping_reverse[static_cast<int>(shard)];
+ return chunk_mapping_reverse.at(int(shard));
+ }
+
+ /* Return a "span" - which can be iterated over */
+ auto get_data_shards() const {
+ return data_shards;
}
- uint64_t logical_to_prev_chunk_offset(uint64_t offset) const {
+
+ auto get_parity_shards() const {
+ return parity_shards;
+ }
+
+ uint64_t ro_offset_to_prev_chunk_offset(uint64_t offset) const {
return (offset / stripe_width) * chunk_size;
}
- uint64_t logical_to_next_chunk_offset(uint64_t offset) const {
- return ((offset + stripe_width - 1)/ stripe_width) * chunk_size;
+
+ uint64_t ro_offset_to_next_chunk_offset(uint64_t offset) const {
+ return ((offset + stripe_width - 1) / stripe_width) * chunk_size;
}
- uint64_t logical_to_prev_stripe_offset(uint64_t offset) const {
+
+ uint64_t ro_offset_to_prev_stripe_ro_offset(uint64_t offset) const {
return offset - (offset % stripe_width);
}
- uint64_t logical_to_next_stripe_offset(uint64_t offset) const {
- return ((offset % stripe_width) ?
- (offset - (offset % stripe_width) + stripe_width) :
- offset);
+
+ uint64_t ro_offset_to_next_stripe_ro_offset(uint64_t offset) const {
+ return ((offset % stripe_width)
+ ? (offset - (offset % stripe_width) + stripe_width)
+ : offset);
}
- uint64_t aligned_logical_offset_to_chunk_offset(uint64_t offset) const {
+
+ uint64_t aligned_ro_offset_to_chunk_offset(uint64_t offset) const {
ceph_assert(offset % stripe_width == 0);
return (offset / stripe_width) * chunk_size;
}
- uint64_t chunk_aligned_logical_offset_to_chunk_offset(uint64_t offset) const {
+
+ uint64_t chunk_aligned_ro_offset_to_chunk_offset(uint64_t offset) const {
[[maybe_unused]] const auto residue_in_stripe = offset % stripe_width;
ceph_assert(residue_in_stripe % chunk_size == 0);
ceph_assert(stripe_width % chunk_size == 0);
// this rounds down
return (offset / stripe_width) * chunk_size;
}
- uint64_t chunk_aligned_logical_size_to_chunk_size(uint64_t len) const {
- [[maybe_unused]] const auto residue_in_stripe = len % stripe_width;
- ceph_assert(residue_in_stripe % chunk_size == 0);
- ceph_assert(stripe_width % chunk_size == 0);
+
+ uint64_t chunk_aligned_ro_length_to_shard_length(uint64_t len) const {
// this rounds up
return ((len + stripe_width - 1) / stripe_width) * chunk_size;
}
- uint64_t aligned_chunk_offset_to_logical_offset(uint64_t offset) const {
+
+ uint64_t chunk_aligned_shard_offset_to_ro_offset(uint64_t offset) const {
ceph_assert(offset % chunk_size == 0);
return (offset / chunk_size) * stripe_width;
}
- std::pair<uint64_t, uint64_t> chunk_aligned_offset_len_to_chunk(
- std::pair<uint64_t, uint64_t> in) const;
- std::pair<uint64_t, uint64_t> offset_len_to_stripe_bounds(
- std::pair<uint64_t, uint64_t> in) const {
- uint64_t off = logical_to_prev_stripe_offset(in.first);
- uint64_t len = logical_to_next_stripe_offset(
- (in.first - off) + in.second);
+
+ std::pair<uint64_t, uint64_t> chunk_aligned_ro_range_to_shard_ro_range(
+ uint64_t off, uint64_t len) const;
+
+ std::pair<uint64_t, uint64_t> ro_offset_len_to_stripe_ro_offset_len(
+ uint64_t _off, uint64_t _len) const {
+ uint64_t off = ro_offset_to_prev_stripe_ro_offset(_off);
+ uint64_t len = ro_offset_to_next_stripe_ro_offset(
+ (_off - off) + _len);
return std::make_pair(off, len);
}
- std::pair<uint64_t, uint64_t> offset_len_to_chunk_bounds(
- std::pair<uint64_t, uint64_t> in) const {
+
+ std::pair<uint64_t, uint64_t> ro_range_to_chunk_ro_range(
+ const std::pair<uint64_t, uint64_t> &in) const {
uint64_t off = in.first - (in.first % chunk_size);
uint64_t tmp_len = (in.first - off) + in.second;
- uint64_t len = ((tmp_len % chunk_size) ?
- (tmp_len - (tmp_len % chunk_size) + chunk_size) :
- tmp_len);
+ uint64_t len = ((tmp_len % chunk_size)
+ ? (tmp_len - (tmp_len % chunk_size) + chunk_size)
+ : tmp_len);
return std::make_pair(off, len);
}
- std::pair<uint64_t, uint64_t> offset_length_to_data_chunk_indices(
- uint64_t off, uint64_t len) const {
- assert(chunk_size > 0);
- const auto first_chunk_idx = (off / chunk_size);
- const auto last_chunk_idx = (chunk_size - 1 + off + len) / chunk_size;
- return {first_chunk_idx, last_chunk_idx};
- }
- bool offset_length_is_same_stripe(
- uint64_t off, uint64_t len) const {
- if (len == 0) {
- return true;
+
+ void ro_range_to_shard_extent_set(
+ uint64_t ro_offset,
+ uint64_t ro_size,
+ ECUtil::shard_extent_set_t &shard_extent_set) const {
+ ro_range_to_shards(ro_offset, ro_size, &shard_extent_set, nullptr, nullptr, nullptr);
+ }
+
+ void ro_range_to_shard_extent_set(
+ uint64_t ro_offset,
+ uint64_t ro_size,
+ ECUtil::shard_extent_set_t &shard_extent_set,
+ extent_set &extent_superset) const {
+ ro_range_to_shards(ro_offset, ro_size, &shard_extent_set, &extent_superset,
+ nullptr,
+ nullptr);
+ }
+
+ void ro_range_to_shard_extent_set_with_parity(
+ uint64_t ro_offset,
+ uint64_t ro_size,
+ ECUtil::shard_extent_set_t &shard_extent_set) const {
+ extent_set parity;
+ ro_range_to_shards(ro_offset, ro_size, &shard_extent_set, &parity, nullptr,
+ nullptr);
+
+ if (parity.empty()) return;
+
+ for (shard_id_t shard : get_parity_shards()) {
+ shard_extent_set[shard].union_of(parity);
}
- assert(chunk_size > 0);
- const auto first_stripe_idx = off / stripe_width;
- const auto last_inc_stripe_idx = (off + len - 1) / stripe_width;
- return first_stripe_idx == last_inc_stripe_idx;
}
-};
-int decode(
- const stripe_info_t &sinfo,
- ceph::ErasureCodeInterfaceRef &ec_impl,
- const std::set<int> want_to_read,
- std::map<int, ceph::buffer::list> &to_decode,
- ceph::buffer::list *out);
-
-int decode(
- const stripe_info_t &sinfo,
- ceph::ErasureCodeInterfaceRef &ec_impl,
- std::map<int, ceph::buffer::list> &to_decode,
- std::map<int, ceph::buffer::list*> &out);
-
-int encode(
- const stripe_info_t &sinfo,
- ceph::ErasureCodeInterfaceRef &ec_impl,
- ceph::buffer::list &in,
- const std::set<int> &want,
- std::map<int, ceph::buffer::list> *out);
+ void ro_range_to_shard_extent_set_with_superset(
+ uint64_t ro_offset,
+ uint64_t ro_size,
+ ECUtil::shard_extent_set_t &shard_extent_set,
+ extent_set &superset) const {
+ ro_range_to_shards(ro_offset, ro_size, &shard_extent_set, &superset, nullptr,
+ nullptr);
+ }
+
+ void ro_range_to_shard_extent_map(
+ uint64_t ro_offset,
+ uint64_t ro_size,
+ buffer::list &bl,
+ shard_extent_map_t &shard_extent_map) const {
+ ro_range_to_shards(ro_offset, ro_size, nullptr, nullptr, &bl, &shard_extent_map);
+ }
+
+  void trim_shard_extent_set_for_ro_offset(
+      uint64_t ro_offset,
+      ECUtil::shard_extent_set_t &shard_extent_set) const;
+
+ void ro_size_to_stripe_aligned_read_mask(
+ uint64_t ro_size,
+ ECUtil::shard_extent_set_t &shard_extent_set) const;
+
+ void ro_size_to_read_mask(
+ uint64_t ro_size,
+ ECUtil::shard_extent_set_t &shard_extent_set) const;
+
+ void ro_size_to_zero_mask(
+ uint64_t ro_size,
+ ECUtil::shard_extent_set_t &shard_extent_set) const;
+};
class HashInfo {
uint64_t total_chunk_size = 0;
std::vector<uint32_t> cumulative_shard_hashes;
- // purely ephemeral, represents the size once all in-flight ops commit
- uint64_t projected_total_chunk_size = 0;
public:
HashInfo() {}
+
explicit HashInfo(unsigned num_chunks) :
cumulative_shard_hashes(num_chunks, -1) {}
- void append(uint64_t old_size, std::map<int, ceph::buffer::list> &to_append);
+
+ void append(uint64_t old_size, shard_id_map<bufferptr> &to_append);
+
void clear() {
total_chunk_size = 0;
cumulative_shard_hashes = std::vector<uint32_t>(
cumulative_shard_hashes.size(),
-1);
}
+
void encode(ceph::buffer::list &bl) const;
void decode(ceph::buffer::list::const_iterator &bl);
void dump(ceph::Formatter *f) const;
- static void generate_test_instances(std::list<HashInfo*>& o);
+ static void generate_test_instances(std::list<HashInfo*> &o);
+
uint32_t get_chunk_hash(shard_id_t shard) const {
ceph_assert(shard < cumulative_shard_hashes.size());
- return cumulative_shard_hashes[static_cast<int>(shard)];
+ return cumulative_shard_hashes[int(shard)];
}
+
uint64_t get_total_chunk_size() const {
return total_chunk_size;
}
- uint64_t get_projected_total_chunk_size() const {
- return projected_total_chunk_size;
- }
- uint64_t get_total_logical_size(const stripe_info_t &sinfo) const {
- return get_total_chunk_size() *
- (sinfo.get_stripe_width()/sinfo.get_chunk_size());
- }
- uint64_t get_projected_total_logical_size(const stripe_info_t &sinfo) const {
- return get_projected_total_chunk_size() *
- (sinfo.get_stripe_width()/sinfo.get_chunk_size());
- }
- void set_projected_total_logical_size(
- const stripe_info_t &sinfo,
- uint64_t logical_size) {
- ceph_assert(sinfo.logical_offset_is_stripe_aligned(logical_size));
- projected_total_chunk_size = sinfo.aligned_logical_offset_to_chunk_offset(
- logical_size);
- }
+
void set_total_chunk_size_clear_hash(uint64_t new_chunk_size) {
cumulative_shard_hashes.clear();
total_chunk_size = new_chunk_size;
}
+
bool has_chunk_hash() const {
return !cumulative_shard_hashes.empty();
}
+
void update_to(const HashInfo &rhs) {
- auto ptcs = projected_total_chunk_size;
*this = rhs;
- projected_total_chunk_size = ptcs;
}
- friend std::ostream& operator<<(std::ostream& out, const HashInfo& hi);
+
+ friend std::ostream &operator<<(std::ostream &out, const HashInfo &hi);
};
typedef std::shared_ptr<HashInfo> HashInfoRef;
+class shard_extent_map_t {
+ static const uint64_t invalid_offset = std::numeric_limits<uint64_t>::max();
+
+public:
+ const stripe_info_t *sinfo;
+ // The maximal range of all extents maps within rados object space.
+ uint64_t ro_start;
+ uint64_t ro_end;
+ uint64_t start_offset;
+ uint64_t end_offset;
+ shard_id_map<extent_map> extent_maps;
+
+ slice_iterator<shard_id_t, extent_map> begin_slice_iterator(
+ const shard_id_set &out_set);
+
+  /* This calculates the ro offset for an offset into a particular shard */
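+  /* For example, with k=4 and chunk_size=4096, raw shard 1 at shard offset
+   * 5000 maps to ro offset 16384 + 4096 + 904 = 21384. */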
+ uint64_t calc_ro_offset(raw_shard_id_t raw_shard, int shard_offset) const {
+ int stripes = shard_offset / sinfo->chunk_size;
+    return stripes * sinfo->stripe_width +
+           uint64_t(raw_shard) * sinfo->chunk_size +
+           shard_offset % sinfo->chunk_size;
+ }
+
+ uint64_t calc_ro_end(raw_shard_id_t raw_shard, int shard_offset) const {
+ return calc_ro_offset(raw_shard, shard_offset - 1) + 1;
+ }
+
+ /* This is a relatively expensive operation to update the ro offset/length.
+ * Ideally, we should be able to update offset/length incrementally.
+ */
+ void compute_ro_range() {
+ uint64_t start = invalid_offset;
+ uint64_t end = 0;
+ uint64_t o_start = invalid_offset;
+ uint64_t o_end = 0;
+
+ for (auto &&[shard, emap] : extent_maps) {
+ raw_shard_id_t raw_shard = sinfo->get_raw_shard(shard);
+ uint64_t start_off = emap.get_start_off();
+ uint64_t end_off = emap.get_end_off();
+ o_start = std::min(o_start, start_off);
+ o_end = std::max(o_end, end_off);
+
+ if (raw_shard < sinfo->get_k()) {
+ start = std::min(start, calc_ro_offset(raw_shard, start_off));
+ end = std::max(end, calc_ro_end(raw_shard, end_off));
+ }
+ }
+ if (end != 0) {
+ ro_start = start;
+ ro_end = end;
+ start_offset = o_start;
+ end_offset = o_end;
+ } else {
+ ro_start = invalid_offset;
+ ro_end = invalid_offset;
+ start_offset = invalid_offset;
+ end_offset = invalid_offset;
+ }
+ }
+
+public:
+ shard_extent_map_t(const stripe_info_t *sinfo) :
+ sinfo(sinfo),
+ ro_start(invalid_offset),
+ ro_end(invalid_offset),
+ start_offset(invalid_offset),
+ end_offset(invalid_offset),
+ extent_maps(sinfo->get_k_plus_m()) {}
+
+ shard_extent_map_t(const stripe_info_t *sinfo,
+ shard_id_map<extent_map> &&_extent_maps) :
+ sinfo(sinfo),
+ extent_maps(std::move(_extent_maps)) {
+ // Empty shards are not permitted, so clear them out.
+ for (auto iter = extent_maps.begin(); iter != extent_maps.end();) {
+ if (iter->second.empty()) {
+ iter = extent_maps.erase(iter);
+ } else {
+ ++iter;
+ }
+ }
+ compute_ro_range();
+ }
+
+ bool empty() const {
+ return ro_end == invalid_offset;
+ }
+
+ uint64_t get_ro_start() const {
+ return ro_start;
+ }
+
+ uint64_t get_ro_end() const {
+ return ro_end;
+ }
+
+  /* Return the extent maps. The return is const because callers must not
+   * modify the maps directly. We want to avoid:
+   * - empty extent maps on shards
+   * - the cached offset/length getting out of sync.
+   */
+ const auto &get_extent_maps() const {
+ return extent_maps;
+ }
+
+  /* Return a particular extent map. This must be const because updating it
+   * would cause the shard_extent_map to become inconsistent.
+   *
+   * This method will raise an exception if the shard has no extents.
+ */
+ const extent_map &get_extent_map(shard_id_t shard) const {
+ return extent_maps.at(shard);
+ }
+
+ extent_set get_extent_set(const shard_id_t &shard) const {
+ extent_set ret;
+ if (extent_maps.contains(shard)) {
+ extent_maps.at(shard).to_interval_set(ret);
+ }
+ return ret;
+ }
+
+ void to_shard_extent_set(shard_extent_set_t &set) const {
+ for (auto &&[shard, emap] : extent_maps) {
+ emap.to_interval_set(set[shard]);
+ }
+ }
+
+ bool contains_shard(shard_id_t shard) const {
+ return extent_maps.contains(shard);
+ }
+
+ void erase_after_ro_offset(uint64_t ro_offset);
+ shard_extent_map_t intersect_ro_range(uint64_t ro_offset, uint64_t ro_length) const;
+ shard_extent_map_t intersect(std::optional<shard_extent_set_t> const &other) const;
+ shard_extent_map_t intersect(shard_extent_set_t const &other) const;
+ void insert_in_shard(shard_id_t shard, uint64_t off, const buffer::list &bl);
+ void insert_in_shard(shard_id_t shard, uint64_t off, const buffer::list &bl,
+ uint64_t new_start, uint64_t new_end);
+ void insert_ro_zero_buffer(uint64_t ro_offset, uint64_t ro_length);
+ void insert(shard_extent_map_t const &other);
+ void append_zeros_to_ro_offset(uint64_t ro_offset);
+ void insert_ro_extent_map(const extent_map &host_extent_map);
+ extent_set get_extent_superset() const;
+ int encode(const ErasureCodeInterfaceRef &ec_impl, const HashInfoRef &hinfo,
+ uint64_t before_ro_size);
+ int _encode(const ErasureCodeInterfaceRef &ec_impl);
+ int encode_parity_delta(const ErasureCodeInterfaceRef &ec_impl,
+ shard_extent_map_t &old_sem);
+
+ void pad_on_shards(const shard_extent_set_t &pad_to,
+ const shard_id_set &shards);
+ void pad_on_shards(const extent_set &pad_to,
+ const shard_id_set &shards);
+ void trim(const shard_extent_set_t &trim_to);
+ int decode(const ErasureCodeInterfaceRef &ec_impl,
+ const shard_extent_set_t &want,
+ uint64_t object_size);
+ int _decode(const ErasureCodeInterfaceRef &ec_impl,
+ const shard_id_set &want_set,
+ const shard_id_set &need_set);
+ void get_buffer(shard_id_t shard, uint64_t offset, uint64_t length,
+ buffer::list &append_to) const;
+ void get_shard_first_buffer(shard_id_t shard, buffer::list &append_to) const;
+ uint64_t get_shard_first_offset(shard_id_t shard) const;
+ void zero_pad(shard_extent_set_t const &pad_to);
+ void zero_pad(shard_id_t shard, uint64_t offset, uint64_t length);
+ void pad_with_other(shard_extent_set_t const &pad_to,
+ shard_extent_map_t const &other);
+ void pad_with_other(shard_id_t shard, uint64_t offset, uint64_t length,
+ shard_extent_map_t const &other);
+ bufferlist get_ro_buffer(uint64_t ro_offset, uint64_t ro_length) const;
+  /* Returns a buffer assuming that there is a single contiguous buffer
+ * represented by the map. */
+ bufferlist get_ro_buffer() const;
+ shard_extent_set_t get_extent_set();
+ void insert_parity_buffers();
+ void erase_shard(shard_id_t shard);
+ shard_extent_map_t slice_map(uint64_t offset, uint64_t length) const;
+  std::string debug_string(uint64_t interval, uint64_t offset) const;
+ void erase_stripe(uint64_t offset, uint64_t length);
+ bool contains(shard_id_t shard) const;
+ bool contains(std::optional<shard_extent_set_t> const &other) const;
+ bool contains(shard_extent_set_t const &other) const;
+ void pad_and_rebuild_to_page_align();
+ uint64_t size();
+ void clear();
+ uint64_t get_start_offset() const { return start_offset; }
+ uint64_t get_end_offset() const { return end_offset; }
+ void deep_copy(shard_extent_map_t const &other);
+ void swap() {}
+ size_t shard_count() { return extent_maps.size(); }
+
+ void assert_buffer_contents_equal(shard_extent_map_t other) const {
+ for (auto &&[shard, emap] : extent_maps) {
+ for (auto &&i : emap) {
+ bufferlist bl = i.get_val();
+ bufferlist otherbl;
+ other.get_buffer(shard, i.get_off(), i.get_len(), otherbl);
+ ceph_assert(bl.contents_equal(otherbl));
+ }
+ }
+ }
+
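+  /* Pad shards with zero buffers for the ranges reported by
+   * ro_size_to_zero_mask(), clipped to the map's extent superset and
+   * skipping shards in exclude_set, so that a later decode sees fully
+   * populated inputs. Returns true if any zeros were added.
+   */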
+ bool add_zero_padding_for_decode(uint64_t object_size, shard_id_set &exclude_set) {
+ shard_extent_set_t zeros(sinfo->get_k_plus_m());
+ sinfo->ro_size_to_zero_mask(object_size, zeros);
+ extent_set superset = get_extent_superset();
+ bool changed = false;
+ for (auto &&[shard, z] : zeros) {
+ if (exclude_set.contains(shard)) {
+ continue;
+ }
+ z.intersection_of(superset);
+ for (auto [off, len] : z) {
+ changed = true;
+ bufferlist bl;
+ bl.append_zero(len);
+ extent_maps[shard].insert(off, len, bl);
+ }
+ }
+
+ if (changed) {
+ compute_ro_range();
+ }
+
+ return changed;
+ }
+
+ friend std::ostream &operator<<(std::ostream &lhs,
+ const shard_extent_map_t &rhs);
+
+ friend bool operator==(const shard_extent_map_t &lhs,
+ const shard_extent_map_t &rhs) {
+ return lhs.sinfo == rhs.sinfo
+ && lhs.ro_start == rhs.ro_start
+ && lhs.ro_end == rhs.ro_end
+ && lhs.extent_maps == rhs.extent_maps;
+ }
+};
+
+typedef enum {
+ READ_REQUEST,
+ READ_DONE,
+ INJECT_EIO,
+ CANCELLED,
+ ERROR,
+ REQUEST_MISSING,
+ COMPLETE_ERROR,
+ ERROR_CLEAR,
+ COMPLETE
+} log_event_t;
+
+struct log_entry_t {
+ const log_event_t event;
+ const pg_shard_t shard;
+ const extent_set io;
+
+ log_entry_t(
+ const log_event_t event,
+ const pg_shard_t &shard,
+ const extent_set &io) :
+ event(event), shard(shard), io(io) {}
+
+ log_entry_t(
+ const log_event_t event,
+ const pg_shard_t &shard) :
+ event(event), shard(shard) {}
+
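+  /* Records the extent set that the given shard contributes to extent_map,
+   * or an empty set if the shard is not present. */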
+ log_entry_t(
+ const log_event_t event,
+ const pg_shard_t &pg_shard,
+ const shard_extent_map_t &extent_map) :
+ event(event), shard(pg_shard),
+ io(extent_map.contains(pg_shard.shard)
+ ? extent_map.get_extent_set(pg_shard.shard)
+ : extent_set()) {}
+
+ friend std::ostream &operator<<(std::ostream &out, const log_entry_t &lhs);
+};
+
bool is_hinfo_key_string(const std::string &key);
const std::string &get_hinfo_key();
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2016 Red Hat
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "ExtentCache.h"
-
-using std::ostream;
-
-using ceph::bufferlist;
-
-void ExtentCache::extent::_link_pin_state(pin_state &pin_state)
-{
- ceph_assert(parent_extent_set);
- ceph_assert(!parent_pin_state);
- parent_pin_state = &pin_state;
- pin_state.pin_list.push_back(*this);
-}
-
-void ExtentCache::extent::_unlink_pin_state()
-{
- ceph_assert(parent_extent_set);
- ceph_assert(parent_pin_state);
- auto liter = pin_state::list::s_iterator_to(*this);
- parent_pin_state->pin_list.erase(liter);
- parent_pin_state = nullptr;
-}
-
-void ExtentCache::extent::unlink()
-{
- ceph_assert(parent_extent_set);
- ceph_assert(parent_pin_state);
-
- _unlink_pin_state();
-
- // remove from extent set
- {
- auto siter = object_extent_set::set::s_iterator_to(*this);
- auto &set = object_extent_set::set::container_from_iterator(siter);
- ceph_assert(&set == &(parent_extent_set->extent_set));
- set.erase(siter);
- }
-
- parent_extent_set = nullptr;
- ceph_assert(!parent_pin_state);
-}
-
-void ExtentCache::extent::link(
- object_extent_set &extent_set,
- pin_state &pin_state)
-{
- ceph_assert(!parent_extent_set);
- parent_extent_set = &extent_set;
- extent_set.extent_set.insert(*this);
-
- _link_pin_state(pin_state);
-}
-
-void ExtentCache::extent::move(
- pin_state &to)
-{
- _unlink_pin_state();
- _link_pin_state(to);
-}
-
-void ExtentCache::remove_and_destroy_if_empty(object_extent_set &eset)
-{
- if (eset.extent_set.empty()) {
- auto siter = cache_set::s_iterator_to(eset);
- auto &set = cache_set::container_from_iterator(siter);
- ceph_assert(&set == &per_object_caches);
-
- // per_object_caches owns eset
- per_object_caches.erase(eset);
- delete &eset;
- }
-}
-
-ExtentCache::object_extent_set &ExtentCache::get_or_create(
- const hobject_t &oid)
-{
- cache_set::insert_commit_data data;
- auto p = per_object_caches.insert_check(oid, Cmp(), data);
- if (p.second) {
- auto *eset = new object_extent_set(oid);
- per_object_caches.insert_commit(*eset, data);
- return *eset;
- } else {
- return *(p.first);
- }
-}
-
-ExtentCache::object_extent_set *ExtentCache::get_if_exists(
- const hobject_t &oid)
-{
- cache_set::insert_commit_data data;
- auto p = per_object_caches.insert_check(oid, Cmp(), data);
- if (p.second) {
- return nullptr;
- } else {
- return &*(p.first);
- }
-}
-
-std::pair<
- ExtentCache::object_extent_set::set::iterator,
- ExtentCache::object_extent_set::set::iterator
- > ExtentCache::object_extent_set::get_containing_range(
- uint64_t off, uint64_t len)
-{
- // fst is first iterator with end after off (may be end)
- auto fst = extent_set.upper_bound(off, uint_cmp());
- if (fst != extent_set.begin())
- --fst;
- if (fst != extent_set.end() && off >= (fst->offset + fst->get_length()))
- ++fst;
-
- // lst is first iterator with start >= off + len (may be end)
- auto lst = extent_set.lower_bound(off + len, uint_cmp());
- return std::make_pair(fst, lst);
-}
-
-extent_set ExtentCache::reserve_extents_for_rmw(
- const hobject_t &oid,
- write_pin &pin,
- const extent_set &to_write,
- const extent_set &to_read)
-{
- if (to_write.empty() && to_read.empty()) {
- return extent_set();
- }
- extent_set must_read;
- auto &eset = get_or_create(oid);
- extent_set missing;
- for (auto &&res: to_write) {
- eset.traverse_update(
- pin,
- res.first,
- res.second,
- [&](uint64_t off, uint64_t len,
- extent *ext, object_extent_set::update_action *action) {
- action->action = object_extent_set::update_action::UPDATE_PIN;
- if (!ext) {
- missing.insert(off, len);
- }
- });
- }
- must_read.intersection_of(
- to_read,
- missing);
- return must_read;
-}
-
-extent_map ExtentCache::get_remaining_extents_for_rmw(
- const hobject_t &oid,
- write_pin &pin,
- const extent_set &to_get)
-{
- if (to_get.empty()) {
- return extent_map();
- }
- extent_map ret;
- auto &eset = get_or_create(oid);
- for (auto &&res: to_get) {
- bufferlist bl;
- uint64_t cur = res.first;
- eset.traverse_update(
- pin,
- res.first,
- res.second,
- [&](uint64_t off, uint64_t len,
- extent *ext, object_extent_set::update_action *action) {
- ceph_assert(off == cur);
- cur = off + len;
- action->action = object_extent_set::update_action::NONE;
- ceph_assert(ext && ext->bl && ext->pinned_by_write());
- bl.substr_of(
- *(ext->bl),
- off - ext->offset,
- len);
- ret.insert(off, len, bl);
- });
- }
- return ret;
-}
-
-void ExtentCache::present_rmw_update(
- const hobject_t &oid,
- write_pin &pin,
- const extent_map &extents)
-{
- if (extents.empty()) {
- return;
- }
- auto &eset = get_or_create(oid);
- for (auto &&res: extents) {
- eset.traverse_update(
- pin,
- res.get_off(),
- res.get_len(),
- [&](uint64_t off, uint64_t len,
- extent *ext, object_extent_set::update_action *action) {
- action->action = object_extent_set::update_action::NONE;
- ceph_assert(ext && ext->pinned_by_write());
- action->bl = bufferlist();
- action->bl->substr_of(
- res.get_val(),
- off - res.get_off(),
- len);
- });
- }
-}
-
-ostream &ExtentCache::print(ostream &out) const
-{
- out << "ExtentCache(" << std::endl;
- for (auto esiter = per_object_caches.begin();
- esiter != per_object_caches.end();
- ++esiter) {
- out << " Extents(" << esiter->oid << ")[" << std::endl;
- for (auto exiter = esiter->extent_set.begin();
- exiter != esiter->extent_set.end();
- ++exiter) {
- out << " Extent(" << exiter->offset
- << "~" << exiter->get_length()
- << ":" << exiter->pin_tid()
- << ")" << std::endl;
- }
- }
- return out << ")" << std::endl;
-}
-
-ostream &operator<<(ostream &lhs, const ExtentCache &cache)
-{
- return cache.print(lhs);
-}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2016 Red Hat
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-
-#include <map>
-#include <list>
-#include <vector>
-#include <utility>
-#include <optional>
-#include <boost/intrusive/set.hpp>
-#include <boost/intrusive/list.hpp>
-#include "include/interval_set.h"
-#include "common/interval_map.h"
-#include "include/buffer.h"
-#include "common/hobject.h"
-
-/**
- ExtentCache
-
- The main purpose of this cache is to ensure that we can pipeline
- overlapping partial overwrites.
-
- To that end we need to ensure that an extent pinned for an operation is
- live until that operation completes. However, a particular extent
- might be pinned by multiple operations (several pipelined writes
- on the same object).
-
- 1) When we complete an operation, we only look at extents owned only
- by that operation.
- 2) Per-extent overhead is fixed size.
- 2) Per-operation metadata is fixed size.
-
- This is simple enough to realize with two main structures:
- - extent: contains a pointer to the pin owning it and intrusive list
- pointers to other extents owned by the same pin
- - pin_state: contains the list head for extents owned by it
-
- This works as long as we only need to remember one "owner" for
- each extent. To make this work, we'll need to leverage some
- invariants guaranteed by higher layers:
-
- 1) Writes on a particular object must be ordered
- 2) A particular object will have outstanding reads or writes, but not
- both (note that you can have a read while a write is committed, but
- not applied).
-
- Our strategy therefore will be to have whichever in-progress op will
- finish "last" be the owner of a particular extent. For now, we won't
- cache reads, so 2) simply means that we can assume that reads and
- recovery operations imply no unstable extents on the object in
- question.
-
- Write: WaitRead -> WaitCommit -> Complete
-
- Invariant 1) above actually indicates that we can't have writes
- bypassing the WaitRead state while there are writes waiting on
- Reads. Thus, the set of operations pinning a particular extent
- must always complete in order or arrival.
-
- This suggests that a particular extent may be in only the following
- states:
-
-
- 0) Empty (not in the map at all)
- 1) Write Pending N
- - Some write with reqid <= N is currently fetching the data for
- this extent
- - The extent must persist until Write reqid N completes
- - All ops pinning this extent are writes in the WaitRead state of
- the Write pipeline (there must be an in progress write, so no
- reads can be in progress).
- 2) Write Pinned N:
- - This extent has data corresponding to some reqid M <= N
- - The extent must persist until Write reqid N commits
- - All ops pinning this extent are writes in some Write
- state (all are possible). Reads are not possible
- in this state (or the others) due to 2).
-
- All of the above suggests that there are 3 things users can
- ask of the cache corresponding to the 3 Write pipelines
- states.
- */
-
-/// If someone wants these types, but not ExtentCache, move to another file
-struct bl_split_merge {
- ceph::buffer::list split(
- uint64_t offset,
- uint64_t length,
- ceph::buffer::list &bl) const {
- ceph::buffer::list out;
- out.substr_of(bl, offset, length);
- return out;
- }
- bool can_merge(const ceph::buffer::list &left, const ceph::buffer::list &right) const {
- return true;
- }
- ceph::buffer::list merge(ceph::buffer::list &&left, ceph::buffer::list &&right) const {
- ceph::buffer::list bl{std::move(left)};
- bl.claim_append(right);
- return bl;
- }
- uint64_t length(const ceph::buffer::list &b) const { return b.length(); }
-};
-using extent_set = interval_set<uint64_t>;
-using extent_map = interval_map<uint64_t, ceph::buffer::list, bl_split_merge>;
-
-class ExtentCache {
- struct object_extent_set;
- struct pin_state;
-private:
-
- struct extent {
- object_extent_set *parent_extent_set = nullptr;
- pin_state *parent_pin_state = nullptr;
- boost::intrusive::set_member_hook<> extent_set_member;
- boost::intrusive::list_member_hook<> pin_list_member;
-
- uint64_t offset;
- uint64_t length;
- std::optional<ceph::buffer::list> bl;
-
- uint64_t get_length() const {
- return length;
- }
-
- bool is_pending() const {
- return bl == std::nullopt;
- }
-
- bool pinned_by_write() const {
- ceph_assert(parent_pin_state);
- return parent_pin_state->is_write();
- }
-
- uint64_t pin_tid() const {
- ceph_assert(parent_pin_state);
- return parent_pin_state->tid;
- }
-
- extent(uint64_t offset, ceph::buffer::list _bl)
- : offset(offset), length(_bl.length()), bl(_bl) {}
-
- extent(uint64_t offset, uint64_t length)
- : offset(offset), length(length) {}
-
- bool operator<(const extent &rhs) const {
- return offset < rhs.offset;
- }
- private:
- // can briefly violate the two link invariant, used in unlink() and move()
- void _link_pin_state(pin_state &pin_state);
- void _unlink_pin_state();
- public:
- void unlink();
- void link(object_extent_set &parent_extent_set, pin_state &pin_state);
- void move(pin_state &to);
- };
-
- struct object_extent_set : boost::intrusive::set_base_hook<> {
- hobject_t oid;
- explicit object_extent_set(const hobject_t &oid) : oid(oid) {}
-
- using set_member_options = boost::intrusive::member_hook<
- extent,
- boost::intrusive::set_member_hook<>,
- &extent::extent_set_member>;
- using set = boost::intrusive::set<extent, set_member_options>;
- set extent_set;
-
- bool operator<(const object_extent_set &rhs) const {
- return oid < rhs.oid;
- }
-
- struct uint_cmp {
- bool operator()(uint64_t lhs, const extent &rhs) const {
- return lhs < rhs.offset;
- }
- bool operator()(const extent &lhs, uint64_t rhs) const {
- return lhs.offset < rhs;
- }
- };
- std::pair<set::iterator, set::iterator> get_containing_range(
- uint64_t offset, uint64_t length);
-
- void erase(uint64_t offset, uint64_t length);
-
- struct update_action {
- enum type {
- NONE,
- UPDATE_PIN
- };
- type action = NONE;
- std::optional<ceph::buffer::list> bl;
- };
- template <typename F>
- void traverse_update(
- pin_state &pin,
- uint64_t offset,
- uint64_t length,
- F &&f) {
- auto range = get_containing_range(offset, length);
-
- if (range.first == range.second || range.first->offset > offset) {
- uint64_t extlen = range.first == range.second ?
- length : range.first->offset - offset;
-
- update_action action;
- f(offset, extlen, nullptr, &action);
- ceph_assert(!action.bl || action.bl->length() == extlen);
- if (action.action == update_action::UPDATE_PIN) {
- extent *ext = action.bl ?
- new extent(offset, *action.bl) :
- new extent(offset, extlen);
- ext->link(*this, pin);
- } else {
- ceph_assert(!action.bl);
- }
- }
-
- for (auto p = range.first; p != range.second;) {
- extent *ext = &*p;
- ++p;
-
- uint64_t extoff = std::max(ext->offset, offset);
- uint64_t extlen = std::min(
- ext->length - (extoff - ext->offset),
- offset + length - extoff);
-
- update_action action;
- f(extoff, extlen, ext, &action);
- ceph_assert(!action.bl || action.bl->length() == extlen);
- extent *final_extent = nullptr;
- if (action.action == update_action::NONE) {
- final_extent = ext;
- } else {
- pin_state *ps = ext->parent_pin_state;
- ext->unlink();
- if ((ext->offset < offset) &&
- (ext->offset + ext->get_length() > offset)) {
- extent *head = nullptr;
- if (ext->bl) {
- ceph::buffer::list bl;
- bl.substr_of(
- *(ext->bl),
- 0,
- offset - ext->offset);
- head = new extent(ext->offset, bl);
- } else {
- head = new extent(
- ext->offset, offset - ext->offset);
- }
- head->link(*this, *ps);
- }
- if ((ext->offset + ext->length > offset + length) &&
- (offset + length > ext->offset)) {
- uint64_t nlen =
- (ext->offset + ext->get_length()) - (offset + length);
- extent *tail = nullptr;
- if (ext->bl) {
- ceph::buffer::list bl;
- bl.substr_of(
- *(ext->bl),
- ext->get_length() - nlen,
- nlen);
- tail = new extent(offset + length, bl);
- } else {
- tail = new extent(offset + length, nlen);
- }
- tail->link(*this, *ps);
- }
- if (action.action == update_action::UPDATE_PIN) {
- if (ext->bl) {
- ceph::buffer::list bl;
- bl.substr_of(
- *(ext->bl),
- extoff - ext->offset,
- extlen);
- final_extent = new ExtentCache::extent(
- extoff,
- bl);
- } else {
- final_extent = new ExtentCache::extent(
- extoff, extlen);
- }
- final_extent->link(*this, pin);
- }
- delete ext;
- }
-
- if (action.bl) {
- ceph_assert(final_extent);
- ceph_assert(final_extent->length == action.bl->length());
- final_extent->bl = *(action.bl);
- }
-
- uint64_t next_off = p == range.second ?
- offset + length : p->offset;
- if (extoff + extlen < next_off) {
- uint64_t tailoff = extoff + extlen;
- uint64_t taillen = next_off - tailoff;
-
- update_action action;
- f(tailoff, taillen, nullptr, &action);
- ceph_assert(!action.bl || action.bl->length() == taillen);
- if (action.action == update_action::UPDATE_PIN) {
- extent *ext = action.bl ?
- new extent(tailoff, *action.bl) :
- new extent(tailoff, taillen);
- ext->link(*this, pin);
- } else {
- ceph_assert(!action.bl);
- }
- }
- }
- }
- };
- struct Cmp {
- bool operator()(const hobject_t &oid, const object_extent_set &rhs) const {
- return oid < rhs.oid;
- }
- bool operator()(const object_extent_set &lhs, const hobject_t &oid) const {
- return lhs.oid < oid;
- }
- };
-
- object_extent_set &get_or_create(const hobject_t &oid);
- object_extent_set *get_if_exists(const hobject_t &oid);
-
- void remove_and_destroy_if_empty(object_extent_set &set);
- using cache_set = boost::intrusive::set<object_extent_set>;
- cache_set per_object_caches;
-
- uint64_t next_write_tid = 1;
- uint64_t next_read_tid = 1;
- struct pin_state {
- uint64_t tid = 0;
- enum pin_type_t {
- NONE,
- WRITE,
- };
- pin_type_t pin_type = NONE;
- bool is_write() const { return pin_type == WRITE; }
-
- pin_state(const pin_state &other) = delete;
- pin_state &operator=(const pin_state &other) = delete;
- pin_state(pin_state &&other) = delete;
- pin_state() = default;
-
- using list_member_options = boost::intrusive::member_hook<
- extent,
- boost::intrusive::list_member_hook<>,
- &extent::pin_list_member>;
- using list = boost::intrusive::list<extent, boost::intrusive::constant_time_size<false>, list_member_options>;
- list pin_list;
- ~pin_state() {
- ceph_assert(pin_list.empty());
- ceph_assert(tid == 0);
- ceph_assert(pin_type == NONE);
- }
- void _open(uint64_t in_tid, pin_type_t in_type) {
- ceph_assert(pin_type == NONE);
- ceph_assert(in_tid > 0);
- tid = in_tid;
- pin_type = in_type;
- }
- };
-
- void release_pin(pin_state &p) {
- for (auto iter = p.pin_list.begin(); iter != p.pin_list.end(); ) {
- std::unique_ptr<extent> extent(&*iter); // we now own this
- iter++; // unlink will invalidate
- ceph_assert(extent->parent_extent_set);
- auto &eset = *(extent->parent_extent_set);
- extent->unlink();
- remove_and_destroy_if_empty(eset);
- }
- p.tid = 0;
- p.pin_type = pin_state::NONE;
- }
-
-public:
- class write_pin : private pin_state {
- friend class ExtentCache;
- private:
- void open(uint64_t in_tid) {
- _open(in_tid, pin_state::WRITE);
- }
- public:
- write_pin() : pin_state() {}
- };
-
- void open_write_pin(write_pin &pin) {
- pin.open(next_write_tid++);
- }
-
- /**
- * Reserves extents required for rmw, and learn
- * which need to be read
- *
- * Pins all extents in to_write. Returns subset of to_read not
- * currently present in the cache. Caller must obtain those
- * extents before calling get_remaining_extents_for_rmw.
- *
- * Transition table:
- * - Empty -> Write Pending pin.reqid
- * - Write Pending N -> Write Pending pin.reqid
- * - Write Pinned N -> Write Pinned pin.reqid
- *
- * @param oid [in] object undergoing rmw
- * @param pin [in,out] pin to use (obtained from create_write_pin)
- * @param to_write [in] extents which will be written
- * @param to_read [in] extents to read prior to write (must be subset
- * of to_write)
- * @return subset of to_read which isn't already present or pending
- */
- extent_set reserve_extents_for_rmw(
- const hobject_t &oid,
- write_pin &pin,
- const extent_set &to_write,
- const extent_set &to_read);
-
- /**
- * Gets extents required for rmw not returned from
- * reserve_extents_for_rmw
- *
- * Requested extents (to_get) must be the set to_read \ the set
- * returned from reserve_extents_for_rmw. No transition table,
- * all extents at this point must be present and already pinned
- * for this pin by reserve_extents_for_rmw.
- *
- * @param oid [in] object
- * @param pin [in,out] pin associated with this IO
- * @param to_get [in] extents to get (see above for restrictions)
- * @return map of buffers from to_get
- */
- extent_map get_remaining_extents_for_rmw(
- const hobject_t &oid,
- write_pin &pin,
- const extent_set &to_get);
-
- /**
- * Updates the cache to reflect the rmw write
- *
- * All presented extents must already have been specified in
- * reserve_extents_for_rmw under to_write.
- *
- * Transition table:
- * - Empty -> invalid, must call reserve_extents_for_rmw first
- * - Write Pending N -> Write Pinned N, update buffer
- * (assert N >= pin.reqid)
- * - Write Pinned N -> Update buffer (assert N >= pin.reqid)
- *
- * @param oid [in] object
- * @param pin [in,out] pin associated with this IO
- * @param extents [in] map of buffers to update
- * @return void
- */
- void present_rmw_update(
- const hobject_t &oid,
- write_pin &pin,
- const extent_map &extents);
-
- /**
- * Release all buffers pinned by pin
- */
- void release_write_pin(
- write_pin &pin) {
- release_pin(pin);
- }
-
- std::ostream &print(std::ostream &out) const;
-};
-
-std::ostream &operator <<(std::ostream &lhs, const ExtentCache &cache);
\ No newline at end of file
#include <errno.h>
#include <stdlib.h>
+
#include "erasure-code/ErasureCodePlugin.h"
#include "log/Log.h"
#include "global/global_context.h"
}
}
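+// Test helper: a 4 KiB bufferptr whose leading bytes hold `value`, so each
+// shard's contents can be identified after encode/decode.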
+bufferptr create_bufferptr(uint64_t value) {
+ bufferlist bl;
+ bl.append_zero(4096);
+ memcpy(bl.c_str(), &value, sizeof(value));
+ return bl.begin().get_current_ptr();
+}
+
+TEST(ErasureCodePlugin, parity_delta_write) {
+ ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
+ ErasureCodeInterfaceRef erasure_code;
+ ErasureCodeProfile profile;
+ profile["technique"] = "reed_sol_van";
+ profile["k"] = "5";
+ int k=5;
+ profile["m"] = "3";
+ int m=3;
+ EXPECT_EQ(0, instance.factory("jerasure",
+ g_conf().get_val<std::string>("erasure_code_dir"),
+ profile,
+ &erasure_code, &cerr));
+ shard_id_map<bufferptr> data(8);
+ shard_id_map<bufferptr> coding(8);
+ shard_id_map<bufferptr> coding2(8);
+ shard_id_map<bufferptr> decode_in(8);
+ shard_id_map<bufferptr> decode_out(8);
+
+ uint32_t seeds[] = {100, 101, 102, 103, 104};
+ uint32_t overwrite3 = 1032;
+
+ for (shard_id_t s; s < k; ++s) {
+ data[s] = create_bufferptr(seeds[int(s)]);
+ }
+ for (shard_id_t s(k); s < k + m; ++s) {
+ coding[s] = create_bufferptr(-1);
+ coding2[s] = create_bufferptr(-1);
+ }
+
+ // Do a normal encode.
+ erasure_code->encode_chunks(data, coding);
+
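+  // Overwrite shard 3: compute a parity delta from the old and new chunk,
+  // apply it to the existing parity, and check it matches a full re-encode.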
+ shard_id_map<bufferptr> delta(8);
+ delta[shard_id_t(3)] = create_bufferptr(-1);
+
+ bufferptr overwrite_bp = create_bufferptr(overwrite3);
+
+ erasure_code->encode_delta(data[shard_id_t(3)], overwrite_bp, &delta[shard_id_t(3)]);
+ erasure_code->apply_delta(delta, coding);
+ data[shard_id_t(3)] = overwrite_bp;
+
+ erasure_code->encode_chunks(data, coding2);
+
+ for (shard_id_t s(k); s < k + m; ++s) {
+ ASSERT_EQ(*(uint32_t*)coding[s].c_str(), *(uint32_t*)coding2[s].c_str());
+ }
+
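+  // Simulate losing shard 4 and reconstruct it by decoding from data shards
+  // 0-3 and parity shard 6.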
+ data.erase(shard_id_t(4));
+ data.emplace(shard_id_t(4), (char*)malloc(4096), 4096);
+ shard_id_set want;
+ want.insert_range(shard_id_t(0), 5);
+ decode_in[shard_id_t(0)] = data[shard_id_t(0)];
+ decode_in[shard_id_t(1)] = data[shard_id_t(1)];
+ decode_in[shard_id_t(2)] = data[shard_id_t(2)];
+ decode_in[shard_id_t(3)] = data[shard_id_t(3)];
+ decode_out[shard_id_t(4)] = data[shard_id_t(4)];
+ decode_in[shard_id_t(6)] = coding[shard_id_t(6)];
+
+ ASSERT_EQ(0, erasure_code->decode_chunks(want, decode_in, decode_out));
+
+ seeds[3] = overwrite3;
+ for (shard_id_t s(0); s < k; ++s) {
+ ASSERT_EQ(seeds[int(s)], *(uint32_t*)data[s].c_str());
+ }
+}
+
/*
* Local Variables:
* compile-command: "cd ../.. ; make -j4 &&
add_ceph_unittest(unittest_ecbackend)
target_link_libraries(unittest_ecbackend osd global)
+# unittest_ecutil
+add_executable(unittest_ecutil
+ TestECUtil.cc
+ $<TARGET_OBJECTS:unit-main>
+)
+add_ceph_unittest(unittest_ecutil)
+target_link_libraries(unittest_ecutil osd global)
+
# unittest_osdscrub
add_executable(unittest_osdscrub
TestOSDScrub.cc
#include "osd/ECCommon.h"
#include "osd/ECBackend.h"
#include "gtest/gtest.h"
+#include "osd/osd_types.h"
+#include "common/ceph_argparse.h"
+#include "erasure-code/ErasureCode.h"
using namespace std;
ECUtil::stripe_info_t s(k, m, swidth);
ASSERT_EQ(s.get_stripe_width(), swidth);
- ASSERT_EQ(s.logical_to_next_chunk_offset(0), 0u);
- ASSERT_EQ(s.logical_to_next_chunk_offset(1), s.get_chunk_size());
- ASSERT_EQ(s.logical_to_next_chunk_offset(swidth - 1),
+ ASSERT_EQ(s.ro_offset_to_next_chunk_offset(0), 0u);
+ ASSERT_EQ(s.ro_offset_to_next_chunk_offset(1), s.get_chunk_size());
+ ASSERT_EQ(s.ro_offset_to_next_chunk_offset(swidth - 1),
s.get_chunk_size());
- ASSERT_EQ(s.logical_to_prev_chunk_offset(0), 0u);
- ASSERT_EQ(s.logical_to_prev_chunk_offset(swidth), s.get_chunk_size());
- ASSERT_EQ(s.logical_to_prev_chunk_offset((swidth * 2) - 1),
+ ASSERT_EQ(s.ro_offset_to_prev_chunk_offset(0), 0u);
+ ASSERT_EQ(s.ro_offset_to_prev_chunk_offset(swidth), s.get_chunk_size());
+ ASSERT_EQ(s.ro_offset_to_prev_chunk_offset((swidth * 2) - 1),
s.get_chunk_size());
- ASSERT_EQ(s.logical_to_next_stripe_offset(0), 0u);
- ASSERT_EQ(s.logical_to_next_stripe_offset(swidth - 1),
+ ASSERT_EQ(s.ro_offset_to_next_stripe_ro_offset(0), 0u);
+ ASSERT_EQ(s.ro_offset_to_next_stripe_ro_offset(swidth - 1),
s.get_stripe_width());
- ASSERT_EQ(s.logical_to_prev_stripe_offset(swidth), s.get_stripe_width());
- ASSERT_EQ(s.logical_to_prev_stripe_offset(swidth), s.get_stripe_width());
- ASSERT_EQ(s.logical_to_prev_stripe_offset((swidth * 2) - 1),
+ ASSERT_EQ(s.ro_offset_to_prev_stripe_ro_offset(swidth), s.get_stripe_width());
+ ASSERT_EQ(s.ro_offset_to_prev_stripe_ro_offset(swidth), s.get_stripe_width());
+ ASSERT_EQ(s.ro_offset_to_prev_stripe_ro_offset((swidth * 2) - 1),
s.get_stripe_width());
- ASSERT_EQ(s.aligned_logical_offset_to_chunk_offset(2*swidth),
+ ASSERT_EQ(s.aligned_ro_offset_to_chunk_offset(2*swidth),
2*s.get_chunk_size());
- ASSERT_EQ(s.aligned_chunk_offset_to_logical_offset(2*s.get_chunk_size()),
+ ASSERT_EQ(s.chunk_aligned_shard_offset_to_ro_offset(2*s.get_chunk_size()),
2*s.get_stripe_width());
// Stripe 1 + 1 chunk for 10 stripes needs to read 11 stripes starting
// from 1 because there is a partial stripe at the start and end
- ASSERT_EQ(s.chunk_aligned_offset_len_to_chunk(
- make_pair(swidth+s.get_chunk_size(), 10*swidth)),
+ ASSERT_EQ(s.chunk_aligned_ro_range_to_shard_ro_range(swidth+s.get_chunk_size(), 10*swidth),
make_pair(s.get_chunk_size(), 11*s.get_chunk_size()));
// Stripe 1 + 0 chunks for 10 stripes needs to read 10 stripes starting
// from 1 because there are no partial stripes
- ASSERT_EQ(s.chunk_aligned_offset_len_to_chunk(make_pair(swidth, 10*swidth)),
+ ASSERT_EQ(s.chunk_aligned_ro_range_to_shard_ro_range(swidth, 10*swidth),
make_pair(s.get_chunk_size(), 10*s.get_chunk_size()));
// Stripe 0 + 1 chunk for 10 stripes needs to read 11 stripes starting
// from 0 because there is a partial stripe at the start and end
- ASSERT_EQ(s.chunk_aligned_offset_len_to_chunk(make_pair(s.get_chunk_size(), 10*swidth)),
+ ASSERT_EQ(s.chunk_aligned_ro_range_to_shard_ro_range(s.get_chunk_size(), 10*swidth),
make_pair<uint64_t>(0, 11*s.get_chunk_size()));
// Stripe 0 + 1 chunk for (10 stripes + 1 chunk) needs to read 11 stripes
// starting from 0 because there is a partial stripe at the start and end
- ASSERT_EQ(s.chunk_aligned_offset_len_to_chunk(make_pair(s.get_chunk_size(),
- 10*swidth + s.get_chunk_size())),
+ ASSERT_EQ(s.chunk_aligned_ro_range_to_shard_ro_range(s.get_chunk_size(),
+ 10*swidth + s.get_chunk_size()),
make_pair<uint64_t>(0, 11*s.get_chunk_size()));
// Stripe 0 + 2 chunks for (10 stripes + 2 chunks) needs to read 11 stripes
// starting from 0 because there is a partial stripe at the start
- ASSERT_EQ(s.chunk_aligned_offset_len_to_chunk(make_pair(2*s.get_chunk_size(),
- 10*swidth + 2*s.get_chunk_size())),
- make_pair<uint64_t>(0, 11*s.get_chunk_size()));
+ ASSERT_EQ(s.chunk_aligned_ro_range_to_shard_ro_range(2*s.get_chunk_size(),
+ 10*swidth + 2*s.get_chunk_size()),
+ make_pair<uint64_t>(0, 11*s.get_chunk_size()));
- ASSERT_EQ(s.offset_len_to_stripe_bounds(make_pair(swidth-10, (uint64_t)20)),
+ ASSERT_EQ(s.ro_offset_len_to_stripe_ro_offset_len(swidth-10, (uint64_t)20),
make_pair((uint64_t)0, 2*swidth));
}
-TEST(ECUtil, offset_length_is_same_stripe)
-{
- const uint64_t swidth = 4096;
- const uint64_t schunk = 1024;
- const unsigned int k = 4;
- const unsigned int m = 2;
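+/*
+ * Dummy erasure code plugin: implements just enough of ErasureCodeInterface
+ * to drive the ReadPipeline tests below; it performs no real coding math.
+ */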
+class ErasureCodeDummyImpl : public ErasureCodeInterface {
+public:
- ECUtil::stripe_info_t s(k, m, swidth);
- ASSERT_EQ(s.get_stripe_width(), swidth);
- ASSERT_EQ(s.get_chunk_size(), schunk);
+ uint64_t get_supported_optimizations() const override {
+ return FLAG_EC_PLUGIN_PARTIAL_READ_OPTIMIZATION |
+ FLAG_EC_PLUGIN_PARTIAL_WRITE_OPTIMIZATION |
+ FLAG_EC_PLUGIN_ZERO_INPUT_ZERO_OUTPUT_OPTIMIZATION |
+ FLAG_EC_PLUGIN_ZERO_PADDING_OPTIMIZATION |
+ FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION;
+ }
- // read nothing at the very beginning
- // +---+---+---+---+
- // | 0| | | |
- // +---+---+---+---+
- // | | | | |
- // +---+---+---+---+
- ASSERT_TRUE(s.offset_length_is_same_stripe(0, 0));
-
- // read nothing at the stripe end
- // +---+---+---+---+
- // | | | | 0|
- // +---+---+---+---+
- // | | | | |
- // +---+---+---+---+
- ASSERT_TRUE(s.offset_length_is_same_stripe(swidth, 0));
-
- // read single byte at the stripe end
- // +---+---+---+---+
- // | | | | ~1|
- // +---+---+---+---+
- // | | | | |
- // +---+---+---+---+
- ASSERT_TRUE(s.offset_length_is_same_stripe(swidth - 1, 1));
-
- // read single stripe
- // +---+---+---+---+
- // | 1k| 1k| 1k| 1k|
- // +---+---+---+---+
- // | | | | |
- // +---+---+---+---+
- ASSERT_TRUE(s.offset_length_is_same_stripe(0, swidth));
-
- // read single chunk
- // +---+---+---+---+
- // | 1k| | | |
- // +---+---+---+---+
- // | | | | |
- // +---+---+---+---+
- ASSERT_TRUE(s.offset_length_is_same_stripe(0, schunk));
-
- // read single stripe except its first chunk
- // +---+---+---+---+
- // | | 1k| 1k| 1k|
- // +---+---+---+---+
- // | | | | |
- // +---+---+---+---+
- ASSERT_TRUE(s.offset_length_is_same_stripe(schunk, swidth - schunk));
-
- // read two stripes
- // +---+---+---+---+
- // | 1k| 1k| 1k| 1k|
- // +---+---+---+---+
- // | 1k| 1k| 1k| 1k|
- // +---+---+---+---+
- ASSERT_FALSE(s.offset_length_is_same_stripe(0, 2*swidth));
-
- // multistripe read: 1st stripe without 1st byte + 1st byte of 2nd stripe
- // +-----+---+---+---+
- // | 1k-1| 1k| 1k| 1k|
- // +-----+---+---+---+
- // | 1| | | |
- // +-----+---+---+---+
- ASSERT_FALSE(s.offset_length_is_same_stripe(1, swidth));
-}
+ ErasureCodeProfile _profile;
+ const std::vector<shard_id_t> chunk_mapping = {}; // no remapping
+ std::vector<std::pair<int, int>> default_sub_chunk = {std::pair(0,1)};
+ int data_chunk_count = 4;
+ int chunk_count = 6;
+
+ int init(ErasureCodeProfile &profile, std::ostream *ss) override {
+ return 0;
+ }
+
+ const ErasureCodeProfile &get_profile() const override {
+ return _profile;
+ }
+
+ int create_rule(const string &name, CrushWrapper &crush, std::ostream *ss) const override {
+ return 0;
+ }
+
+ unsigned int get_chunk_count() const override {
+ return chunk_count;
+ }
+
+ unsigned int get_data_chunk_count() const override {
+ return data_chunk_count;
+ }
+
+ unsigned int get_coding_chunk_count() const override {
+ return 0;
+ }
+
+ int get_sub_chunk_count() override {
+ return 1;
+ }
+
+ unsigned int get_chunk_size(unsigned int stripe_width) const override {
+ return 0;
+ }
+
+ int minimum_to_decode(const shard_id_set &want_to_read, const shard_id_set &available,
+ shard_id_set &minimum_set,
+ shard_id_map<std::vector<std::pair<int, int>>> *minimum_sub_chunks) override {
+ shard_id_t parity_shard_index(data_chunk_count);
+ for (shard_id_t shard : want_to_read) {
+ if (available.contains(shard)) {
+ minimum_set.insert(shard);
+ } else {
+ // Shard is missing. Recover with every other shard and one parity
+ // for each missing shard.
+ for (shard_id_t i; i<data_chunk_count; ++i) {
+ if (available.contains(i)) {
+ minimum_set.insert(i);
+ } else {
+ minimum_set.insert(parity_shard_index);
+ ++parity_shard_index;
+ }
+
+ if (int(parity_shard_index) == chunk_count)
+ return -EIO; // Cannot recover.
+ }
+ }
+ }
+
+ for (auto &&shard : minimum_set) {
+ minimum_sub_chunks->emplace(shard, default_sub_chunk);
+ }
+ return 0;
+ }
+
+ [[deprecated]]
+ int minimum_to_decode(const std::set<int> &want_to_read,
+ const std::set<int> &available,
+ std::map<int, std::vector<std::pair<int, int>>> *minimum) override
+ {
+ ADD_FAILURE();
+ return 0;
+ }
+
+ [[deprecated]]
+ int minimum_to_decode_with_cost(const std::set<int> &want_to_read,
+ const std::map<int, int> &available, std::set<int> *minimum) override {
+ ADD_FAILURE();
+ return 0;
+ }
+
+ int minimum_to_decode_with_cost(const shard_id_set &want_to_read, const shard_id_map<int> &available,
+ shard_id_set *minimum) override {
+ return 0;
+ }
+
+ int encode(const shard_id_set &want_to_encode, const bufferlist &in, shard_id_map<bufferlist> *encoded) override {
+ return 0;
+ }
+
+ [[deprecated]]
+ int encode(const std::set<int> &want_to_encode, const bufferlist &in
+ , std::map<int, bufferlist> *encoded) override
+ {
+ ADD_FAILURE();
+ return 0;
+ }
+
+ [[deprecated]]
+ int encode_chunks(const std::set<int> &want_to_encode,
+ std::map<int, bufferlist> *encoded) override
+ {
+ ADD_FAILURE();
+ return 0;
+ }
+
+ int encode_chunks(const shard_id_map<bufferptr> &in, shard_id_map<bufferptr> &out) override {
+ return 0;
+ }
+
+ int decode(const shard_id_set &want_to_read, const shard_id_map<bufferlist> &chunks, shard_id_map<bufferlist> *decoded,
+ int chunk_size) override {
+ return 0;
+ }
+
+ [[deprecated]]
+ int decode(const std::set<int> &want_to_read, const std::map<int, bufferlist> &chunks,
+ std::map<int, bufferlist> *decoded, int chunk_size) override
+ {
+ ADD_FAILURE();
+ return 0;
+ }
+
+ [[deprecated]]
+ int decode_chunks(const std::set<int> &want_to_read,
+ const std::map<int, bufferlist> &chunks,
+ std::map<int, bufferlist> *decoded) override {
+ ADD_FAILURE();
+ return 0;
+ }
+
+ int decode_chunks(const shard_id_set &want_to_read,
+ shard_id_map<bufferptr> &in, shard_id_map<bufferptr> &out) override
+ {
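+    // No real decode here: only sanity-check that enough equal-length input
+    // buffers and at least one output buffer were supplied.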
+ if (in.size() < data_chunk_count) {
+ ADD_FAILURE();
+ }
+ uint64_t len = 0;
+ for (auto &&[shard, bp] : in) {
+ if (len == 0) {
+ len = bp.length();
+ } else if (len != bp.length()) {
+ ADD_FAILURE();
+ }
+ }
+ if (len == 0) {
+ ADD_FAILURE();
+ }
+ if (out.size() == 0) {
+ ADD_FAILURE();
+ }
+ for (auto &&[shard, bp] : out) {
+ if (len != bp.length()) {
+ ADD_FAILURE();
+ }
+ }
+ return 0;
+ }
+
+ const vector<shard_id_t> &get_chunk_mapping() const override {
+ return chunk_mapping;
+ }
+
+ [[deprecated]]
+ int decode_concat(const std::set<int> &want_to_read,
+ const std::map<int, bufferlist> &chunks, bufferlist *decoded) override {
+ ADD_FAILURE();
+ return 0;
+ }
+
+ [[deprecated]]
+ int decode_concat(const std::map<int, bufferlist> &chunks,
+ bufferlist *decoded) override {
+ ADD_FAILURE();
+ return 0;
+ }
+
+ size_t get_minimum_granularity() override { return 0; }
+ void encode_delta(const bufferptr &old_data, const bufferptr &new_data
+ , bufferptr *delta) override {}
+ void apply_delta(const shard_id_map<bufferptr> &in
+ , shard_id_map<bufferptr> &out) override {}
+};
+
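+// Minimal ECListener stub returning default values; just enough to construct
+// and drive a ReadPipeline in unit tests.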
+class ECListenerStub : public ECListener {
+ OSDMapRef osd_map_ref;
+ pg_info_t pg_info;
+ set<pg_shard_t> backfill_shards;
+ shard_id_set backfill_shard_id_set;
+ map<hobject_t, set<pg_shard_t>> missing_loc_shards;
+ map<pg_shard_t, pg_missing_t> shard_missing;
+ pg_missing_set<false> shard_not_missing_const;
+ pg_pool_t pg_pool;
+ set<pg_shard_t> acting_recovery_backfill_shards;
+ shard_id_set acting_recovery_backfill_shard_id_set;
+ map<pg_shard_t, pg_info_t> shard_info;
+ PGLog pg_log;
+ pg_info_t shard_pg_info;
+ std::string dbg_prefix = "stub";
+
+public:
+ set<pg_shard_t> acting_shards;
+
+ ECListenerStub()
+ : pg_log(NULL) {}
+
+ const OSDMapRef &pgb_get_osdmap() const override {
+ return osd_map_ref;
+ }
+
+ epoch_t pgb_get_osdmap_epoch() const override {
+ return 0;
+ }
+
+ const pg_info_t &get_info() const override {
+ return pg_info;
+ }
+
+ void cancel_pull(const hobject_t &soid) override {
+
+ }
+
+ pg_shard_t primary_shard() const override {
+ return pg_shard_t();
+ }
+
+ bool pgb_is_primary() const override {
+ return false;
+ }
+
+ void on_failed_pull(const set<pg_shard_t> &from, const hobject_t &soid, const eversion_t &v) override {
+
+ }
+
+ void
+ on_local_recover(const hobject_t &oid, const ObjectRecoveryInfo &recovery_info, ObjectContextRef obc, bool is_delete,
+ ceph::os::Transaction *t) override {
+
+ }
+
+ void on_global_recover(const hobject_t &oid, const object_stat_sum_t &stat_diff, bool is_delete) override {
+
+ }
+
+ void on_peer_recover(pg_shard_t peer, const hobject_t &oid, const ObjectRecoveryInfo &recovery_info) override {
+
+ }
+ void begin_peer_recover(pg_shard_t peer, const hobject_t oid) override {
+
+ }
+
+ bool pg_is_repair() const override {
+ return false;
+ }
+
+ ObjectContextRef
+ get_obc(const hobject_t &hoid, const map<std::string, ceph::buffer::list, std::less<>> &attrs) override {
+ return ObjectContextRef();
+ }
+
+ bool check_failsafe_full() override {
+ return false;
+ }
+
+ hobject_t get_temp_recovery_object(const hobject_t &target, eversion_t version) override {
+ return hobject_t();
+ }
+
+ bool pg_is_remote_backfilling() override {
+ return false;
+ }
+
+ void pg_add_local_num_bytes(int64_t num_bytes) override {
+
+ }
+
+ void pg_add_num_bytes(int64_t num_bytes) override {
+
+ }
+
+ void inc_osd_stat_repaired() override {
+
+ }
+
+ void add_temp_obj(const hobject_t &oid) override {
+
+ }
+
+ void clear_temp_obj(const hobject_t &oid) override {
+
+ }
+
+ epoch_t get_last_peering_reset_epoch() const override {
+ return 0;
+ }
+
+ GenContext<ThreadPool::TPHandle &> *bless_unlocked_gencontext(GenContext<ThreadPool::TPHandle &> *c) override {
+ return nullptr;
+ }
+
+ void schedule_recovery_work(GenContext<ThreadPool::TPHandle &> *c, uint64_t cost) override {
+
+ }
+
+ epoch_t get_interval_start_epoch() const override {
+ return 0;
+ }
+
+ const set<pg_shard_t> &get_acting_shards() const override {
+ return acting_shards;
+ }
+
+ const set<pg_shard_t> &get_backfill_shards() const override {
+ return backfill_shards;
+ }
+
+ const map<hobject_t, std::set<pg_shard_t>> &get_missing_loc_shards() const override {
+ return missing_loc_shards;
+ }
+
+ const map<pg_shard_t, pg_missing_t> &get_shard_missing() const override {
+ return shard_missing;
+ }
+
+ const pg_missing_const_i &get_shard_missing(pg_shard_t peer) const override {
+ return shard_not_missing_const;
+ }
+
+ const pg_missing_const_i *maybe_get_shard_missing(pg_shard_t peer) const override {
+ return nullptr;
+ }
+
+ const pg_info_t &get_shard_info(pg_shard_t peer) const override {
+ return shard_pg_info;
+ }
+
+ ceph_tid_t get_tid() override {
+ return 0;
+ }
+
+ pg_shard_t whoami_shard() const override {
+ return pg_shard_t();
+ }
+
+ void send_message_osd_cluster(vector<std::pair<int, Message *>> &messages, epoch_t from_epoch) override {
+
+ }
+
+ ostream &gen_dbg_prefix(ostream &out) const override {
+ out << dbg_prefix;
+ return out;
+ }
+
+ const pg_pool_t &get_pool() const override {
+ return pg_pool;
+ }
+
+ const set<pg_shard_t> &get_acting_recovery_backfill_shards() const override {
+ return acting_recovery_backfill_shards;
+ }
+
+ const shard_id_set &get_acting_recovery_backfill_shard_id_set() const override {
+ return acting_recovery_backfill_shard_id_set;
+ }
+
+ bool should_send_op(pg_shard_t peer, const hobject_t &hoid) override {
+ return false;
+ }
+
+ const map<pg_shard_t, pg_info_t> &get_shard_info() const override {
+ return shard_info;
+ }
+
+ spg_t primary_spg_t() const override {
+ return spg_t();
+ }
+
+ const PGLog &get_log() const override {
+ return pg_log;
+ }
+
+ DoutPrefixProvider *get_dpp() override {
+ return nullptr;
+ }
+
+ void apply_stats(const hobject_t &soid, const object_stat_sum_t &delta_stats) override {
+
+ }
+
+ bool is_missing_object(const hobject_t &oid) const override {
+ return false;
+ }
+
+ void add_local_next_event(const pg_log_entry_t &e) override {
+
+ }
+
+ void log_operation(vector<pg_log_entry_t> &&logv, const optional<pg_hit_set_history_t> &hset_history,
+ const eversion_t &trim_to, const eversion_t &roll_forward_to,
+ const eversion_t &min_last_complete_ondisk, bool transaction_applied, os::Transaction &t,
+ bool async) override {
+
+ }
+
+ void op_applied(const eversion_t &applied_version) override {
+
+ }
+
+ uint64_t min_peer_features() const {
+ return 0;
+ }
+};
TEST(ECCommon, get_min_want_to_read_shards)
{
const uint64_t swidth = 4096;
const unsigned int k = 4;
const unsigned int m = 2;
+ const uint64_t csize = 1024;
ECUtil::stripe_info_t s(k, m, swidth);
+ ECListenerStub listenerStub;
ASSERT_EQ(s.get_stripe_width(), swidth);
- ASSERT_EQ(s.get_chunk_size(), 1024);
+ ASSERT_EQ(s.get_chunk_size(), csize);
+
+ const std::vector<int> chunk_mapping = {}; // no remapping
+ ErasureCodeInterfaceRef ec_impl(new ErasureCodeDummyImpl);
+ ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub);
+
+ ECUtil::shard_extent_set_t empty_extent_set_map(s.get_k_plus_m());
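+  // With k=4 and chunk_size=1024 (no remapping), rados offset X lands on shard
+  // (X / 1024) % 4 at shard offset (X / 4096) * 1024 + (X % 1024).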
// read nothing at the very beginning
{
- std::set<int> want_to_read;
- ECCommon::ReadPipeline::get_min_want_to_read_shards(
- 0, 0, s, &want_to_read);
- ASSERT_TRUE(want_to_read == std::set<int>{});
+ ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+ ec_align_t to_read(0, 0, 0);
+ pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+ ASSERT_EQ(want_to_read, empty_extent_set_map);
}
// read nothing at the middle (0-sized partial read)
{
- std::set<int> want_to_read;
- ECCommon::ReadPipeline::get_min_want_to_read_shards(
- 2048, 0, s, &want_to_read);
- ASSERT_TRUE(want_to_read == std::set<int>{});
+ ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+ ec_align_t to_read(2048, 0, 0);
+ pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+ ASSERT_EQ(want_to_read, empty_extent_set_map);
+ }
+  // read nothing at the second stripe (0-sized partial read)
+ {
+ ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+ ec_align_t to_read(swidth, 0, 0);
+ pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+ ASSERT_EQ(want_to_read, empty_extent_set_map);
}
// read not-so-many (< chunk_size) bytes at the middle (partial read)
{
- std::set<int> want_to_read;
- ECCommon::ReadPipeline::get_min_want_to_read_shards(
- 2048, 42, s, &want_to_read);
- ASSERT_TRUE(want_to_read == std::set<int>{2});
+ ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+ ec_align_t to_read(2048, 42, 1);
+ pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+ ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+ ref[shard_id_t(2)].insert(0, 42);
+ ASSERT_EQ(want_to_read, ref);
+ }
+
+ // read not-so-many (< chunk_size) bytes after the first stripe.
+ {
+ ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+ ec_align_t to_read(swidth+2048, 42, 1);
+ pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+ ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+ ref[shard_id_t(2)].insert(csize, 42);
+ ASSERT_EQ(want_to_read, ref);
}
// read more (> chunk_size) bytes at the middle (partial read)
{
- std::set<int> want_to_read;
- ECCommon::ReadPipeline::get_min_want_to_read_shards(
- 1024, 1024+42, s, &want_to_read);
- // extra () due to a language / macro limitation
- ASSERT_TRUE(want_to_read == (std::set<int>{1, 2}));
+ ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+ ec_align_t to_read(csize, csize + 42, 1);
+ pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+ ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+ ref[shard_id_t(1)].insert(0, csize);
+ ref[shard_id_t(2)].insert(0, 42);
+ ASSERT_EQ(want_to_read, ref);
+ }
+
+ // read more (> chunk_size) bytes at the middle (partial read), second stripe
+ {
+ ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+ ec_align_t to_read(swidth + csize, csize + 42, 1);
+ pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+ ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+ ref[shard_id_t(1)].insert(csize, csize);
+ ref[shard_id_t(2)].insert(csize, 42);
+ ASSERT_EQ(want_to_read, ref);
}
// full stripe except last chunk
{
- std::set<int> want_to_read;
- ECCommon::ReadPipeline::get_min_want_to_read_shards(
- 0, 3*1024, s, &want_to_read);
- // extra () due to a language / macro limitation
- ASSERT_TRUE(want_to_read == (std::set<int>{0, 1, 2}));
+ ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+ ec_align_t to_read(0, 3*csize, 1);
+ pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+ ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+ ref[shard_id_t(0)].insert(0, csize);
+ ref[shard_id_t(1)].insert(0, csize);
+ ref[shard_id_t(2)].insert(0, csize);
+ ASSERT_EQ(want_to_read, ref);
+ }
+
+ // full stripe except last chunk (second stripe)
+ {
+ ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+ ec_align_t to_read(swidth, 3*csize, 1);
+ pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+ ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+ ref[shard_id_t(0)].insert(csize, csize);
+ ref[shard_id_t(1)].insert(csize, csize);
+ ref[shard_id_t(2)].insert(csize, csize);
+ ASSERT_EQ(want_to_read, ref);
}
// full stripe except 1st chunk
{
- std::set<int> want_to_read;
- ECCommon::ReadPipeline::get_min_want_to_read_shards(
- 1024, swidth-1024, s, &want_to_read);
- // extra () due to a language / macro limitation
- ASSERT_TRUE(want_to_read == (std::set<int>{1, 2, 3}));
+ ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+ ec_align_t to_read(csize, swidth - csize, 1);
+ pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+ ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+ ref[shard_id_t(1)].insert(0, csize);
+ ref[shard_id_t(2)].insert(0, csize);
+ ref[shard_id_t(3)].insert(0, csize);
+ ASSERT_EQ(want_to_read, ref);
+ }
+
+ // full stripe except 1st chunk (second stripe)
+ {
+ ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+ ec_align_t to_read(swidth + csize, swidth - csize, 1);
+ pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+ ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+ ref[shard_id_t(1)].insert(csize, csize);
+ ref[shard_id_t(2)].insert(csize, csize);
+ ref[shard_id_t(3)].insert(csize, csize);
+ ASSERT_EQ(want_to_read, ref);
}
// large, multi-stripe read starting just after 1st chunk
+ // 0XXX
+ // XXXX x41
+ // X000
+ {
+ ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+ ec_align_t to_read(csize, swidth * 42, 1);
+ pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+ ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+ ref[shard_id_t(0)].insert(csize, csize*42);
+ ref[shard_id_t(1)].insert(0, csize*42);
+ ref[shard_id_t(2)].insert(0, csize*42);
+ ref[shard_id_t(3)].insert(0, csize*42);
+ ASSERT_EQ(want_to_read, ref);
+ }
+
+ // large, multi-stripe read starting just after 1st chunk (second stripe)
+ // 0XXX
+ // XXXX x41
+ // X000
+ {
+ ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+ ec_align_t to_read(swidth + csize, swidth * 42, 1);
+ pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+ ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+ ref[shard_id_t(0)].insert(csize*2, csize*42);
+ ref[shard_id_t(1)].insert(csize, csize*42);
+ ref[shard_id_t(2)].insert(csize, csize*42);
+ ref[shard_id_t(3)].insert(csize, csize*42);
+ ASSERT_EQ(want_to_read, ref);
+ }
+
+ // large read from the beginning
{
- std::set<int> want_to_read;
- ECCommon::ReadPipeline::get_min_want_to_read_shards(
- 1024, swidth*42, s, &want_to_read);
- // extra () due to a language / macro limitation
- ASSERT_TRUE(want_to_read == (std::set<int>{0, 1, 2, 3}));
+ ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+ ec_align_t to_read(0, swidth * 42, 1);
+ pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+ ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+
+ ref[shard_id_t(0)].insert(0, csize*42);
+ ref[shard_id_t(1)].insert(0, csize*42);
+ ref[shard_id_t(2)].insert(0, csize*42);
+ ref[shard_id_t(3)].insert(0, csize*42);
+ ASSERT_EQ(want_to_read, ref);
}
// large read from the beginning
{
- std::set<int> want_to_read;
- ECCommon::ReadPipeline::get_min_want_to_read_shards(
- 0, swidth*42, s, &want_to_read);
- // extra () due to a language / macro limitation
- ASSERT_TRUE(want_to_read == (std::set<int>{0, 1, 2, 3}));
+ ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+ ec_align_t to_read(0, swidth * 42, 1);
+ pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+ ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+
+ ref[shard_id_t(0)].insert(0, csize*42);
+ ref[shard_id_t(1)].insert(0, csize*42);
+ ref[shard_id_t(2)].insert(0, csize*42);
+ ref[shard_id_t(3)].insert(0, csize*42);
+ ASSERT_EQ(want_to_read, ref);
+ }
+
+ // large read from the beginning (second stripe)
+ {
+ ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+ ec_align_t to_read(swidth, swidth * 42, 1);
+ pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+ ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+
+ ref[shard_id_t(0)].insert(csize, csize*42);
+ ref[shard_id_t(1)].insert(csize, csize*42);
+ ref[shard_id_t(2)].insert(csize, csize*42);
+ ref[shard_id_t(3)].insert(csize, csize*42);
+ ASSERT_EQ(want_to_read, ref);
+ }
+
+ // large read that starts and ends on same shard.
+ {
+ ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+ ec_align_t to_read(swidth, swidth+csize/2, 1);
+ pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+ ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+
+ ref[shard_id_t(0)].insert(csize, csize+csize/2);
+ ref[shard_id_t(1)].insert(csize, csize);
+ ref[shard_id_t(2)].insert(csize, csize);
+ ref[shard_id_t(3)].insert(csize, csize);
+ ASSERT_EQ(want_to_read, ref);
+ }
+
+ // large read that starts and ends on last shard
+ {
+ ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+ ec_align_t to_read(swidth-csize, swidth+csize/2, 1);
+ pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+ ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+
+ ref[shard_id_t(0)].insert(csize, csize);
+ ref[shard_id_t(1)].insert(csize, csize);
+ ref[shard_id_t(2)].insert(csize, csize);
+ ref[shard_id_t(3)].insert(0, csize+csize/2);
+ ASSERT_EQ(want_to_read, ref);
+ }
+ // large read that starts and ends on last shard, partial first shard.
+ {
+ ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+ ec_align_t to_read(swidth-csize/2, swidth, 1);
+ pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+ ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+
+ ref[shard_id_t(0)].insert(csize, csize);
+ ref[shard_id_t(1)].insert(csize, csize);
+ ref[shard_id_t(2)].insert(csize, csize);
+ ref[shard_id_t(3)].insert(csize/2, csize);
+ ASSERT_EQ(want_to_read, ref);
+ }
+}
+
+TEST(ECCommon, get_min_avail_to_read_shards) {
+ const uint64_t page_size = CEPH_PAGE_SIZE;
+ const uint64_t swidth = 64*page_size;
+ const unsigned int k = 4;
+ const unsigned int m = 2;
+ const int nshards = 6;
+ const uint64_t object_size = swidth * 1024;
+
+ std::vector<ECCommon::shard_read_t> empty_shard_vector(k);
+
+ ECUtil::stripe_info_t s(k, m, swidth, vector<shard_id_t>(0));
+ ECListenerStub listenerStub;
+ ASSERT_EQ(s.get_stripe_width(), swidth);
+ ASSERT_EQ(s.get_chunk_size(), swidth / k);
+
+ const std::vector<int> chunk_mapping = {}; // no remapping
+ ErasureCodeDummyImpl *ecode = new ErasureCodeDummyImpl();
+ ErasureCodeInterfaceRef ec_impl(ecode);
+ ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub);
+
+ for (int i = 0; i < nshards; i++) {
+ listenerStub.acting_shards.insert(pg_shard_t(i, shard_id_t(i)));
+ }
+
+ // read nothing
+ {
+ ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+ ECUtil::shard_extent_set_t to_read_list(s.get_k_plus_m());
+ hobject_t hoid;
+ ECCommon::read_request_t read_request(to_read_list, false, object_size);
+ pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request);
+
+ ECCommon::read_request_t ref(to_read_list, false, object_size);
+
+ ASSERT_EQ(read_request, ref);
+ }
+
+ /* Read to every data shard. */
+ {
+ ECUtil::shard_extent_set_t to_read_list(s.get_k_plus_m());
+ hobject_t hoid;
+
+ for (shard_id_t i; i<k; ++i) {
+ to_read_list[i].insert(int(i) * 2 * page_size, page_size);
+ }
+
+ ECCommon::read_request_t read_request(to_read_list, false, object_size);
+ pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request);
+
+ ECCommon::read_request_t ref(to_read_list, false, object_size);
+ for (shard_id_t shard_id; shard_id < k; ++shard_id) {
+ ref.shard_reads[shard_id].extents = to_read_list[shard_id];
+ ref.shard_reads[shard_id].subchunk = ecode->default_sub_chunk;
+      ref.shard_reads[shard_id].pg_shard = pg_shard_t(int(shard_id), shard_id);
+ }
+ ASSERT_EQ(read_request, ref);
+ }
+
+ /* Read to every data shard. */
+ {
+ ECUtil::shard_extent_set_t to_read_list(s.get_k_plus_m());
+ hobject_t hoid;
+ for (shard_id_t i; i<k; ++i) {
+ to_read_list[i].insert(int(i) * 2 * page_size, page_size);
+ }
+
+ ECCommon::read_request_t read_request(to_read_list, false, object_size);
+
+
+ pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request);
+
+ ECCommon::read_request_t ref(to_read_list, false, object_size);
+ for (shard_id_t i; i<k; ++i) {
+ shard_id_t shard_id(i);
+ ref.shard_reads[shard_id].extents = to_read_list[i];
+ ref.shard_reads[shard_id].subchunk = ecode->default_sub_chunk;
+ ref.shard_reads[shard_id].pg_shard = pg_shard_t(int(i), shard_id);
+ }
+
+ ASSERT_EQ(read_request, ref);
+ }
+
+
+ /* Read to every data shard - small read */
+ {
+ ECUtil::shard_extent_set_t to_read_list(s.get_k_plus_m());
+ hobject_t hoid;
+
+ for (shard_id_t i; i < (int)k; ++i) {
+ to_read_list[i].insert(int(i) * 2 * page_size + int(i) + 1, int(i) + 1);
+ }
+ ECCommon::read_request_t ref(to_read_list, false, object_size);
+ ECCommon::read_request_t read_request(to_read_list, false, object_size);
+ for (int i=0; i < (int)k; i++) {
+ shard_id_t shard_id(i);
+ ECCommon::shard_read_t &ref_shard_read = ref.shard_reads[shard_id];
+ ref_shard_read.subchunk = ecode->default_sub_chunk;
+ ref_shard_read.extents.insert(i*2*page_size, page_size);
+ ref_shard_read.pg_shard = pg_shard_t(i, shard_id_t(i));
+ }
+
+ pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request);
+ ASSERT_EQ(read_request, ref);
+ }
+
+ /* Read to every data shard, missing shard. */
+ {
+ ECUtil::shard_extent_set_t to_read_list(s.get_k_plus_m());
+ hobject_t hoid;
+
+ for (shard_id_t i; i<k; ++i) {
+ to_read_list[i].insert(int(i) * 2 * page_size, page_size);
+ }
+
+ ECCommon::read_request_t read_request(to_read_list, false, object_size);
+
+ shard_id_t missing_shard(1);
+ int parity_shard = k;
+ listenerStub.acting_shards.erase(pg_shard_t(int(missing_shard), shard_id_t(missing_shard)));
+
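+    // With shard 1 unavailable, each surviving data shard must also read
+    // shard 1's extent, and the first parity shard reads it in shard 1's place.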
+ pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request);
+
+ ECCommon::read_request_t ref(to_read_list, false, object_size);
+ for (shard_id_t i; i<k; ++i) {
+ if (i != missing_shard) {
+ shard_id_t shard_id(i);
+ to_read_list[i].union_of(to_read_list[missing_shard]);
+ ref.shard_reads[shard_id].subchunk = ecode->default_sub_chunk;
+ ref.shard_reads[shard_id].extents = to_read_list[i];
+ ref.shard_reads[shard_id].pg_shard = pg_shard_t(int(i), shard_id);
+ } else {
+ ECCommon::shard_read_t parity_shard_read;
+ parity_shard_read.subchunk = ecode->default_sub_chunk;
+ parity_shard_read.extents.union_of(to_read_list[i]);
+ ref.shard_reads[shard_id_t(parity_shard)] = parity_shard_read;
+ ref.shard_reads[shard_id_t(parity_shard)].pg_shard = pg_shard_t(parity_shard, shard_id_t(parity_shard));
+ }
+ }
+
+ ASSERT_EQ(read_request, ref);
+
+ listenerStub.acting_shards.insert(pg_shard_t(1, shard_id_t(1)));
+ }
+
+
+  /* Read to every data shard, where the missing shard's extent is adjacent to the others' and merges with them. */
+ {
+ ECUtil::shard_extent_set_t to_read_list(s.get_k_plus_m());
+ hobject_t hoid;
+ unsigned int missing_shard = 1;
+
+ to_read_list[shard_id_t(0)].insert(0, page_size);
+ to_read_list[shard_id_t(1)].insert(page_size, page_size);
+ to_read_list[shard_id_t(2)].insert(2*page_size, page_size);
+ to_read_list[shard_id_t(3)].insert(3*page_size, page_size);
+ ECCommon::read_request_t read_request(to_read_list, false, object_size);
+ ECCommon::read_request_t ref(to_read_list, false, object_size);
+
+    // Populate the reference manually to check that extents from adjacent shards get correctly combined.
+ ref.shard_reads[shard_id_t(0)].extents.insert(0, page_size*2);
+ ref.shard_reads[shard_id_t(2)].extents.insert(page_size, page_size*2);
+ ref.shard_reads[shard_id_t(3)].extents.insert(page_size, page_size);
+ ref.shard_reads[shard_id_t(3)].extents.insert(3*page_size, page_size);
+ ref.shard_reads[shard_id_t(4)].extents.insert(page_size, page_size);
+ ref.shard_reads[shard_id_t(0)].pg_shard = pg_shard_t(0, shard_id_t(0));
+ ref.shard_reads[shard_id_t(2)].pg_shard = pg_shard_t(2, shard_id_t(2));
+ ref.shard_reads[shard_id_t(3)].pg_shard = pg_shard_t(3, shard_id_t(3));
+ ref.shard_reads[shard_id_t(4)].pg_shard = pg_shard_t(4, shard_id_t(4));
+ for (unsigned int i=0; i<k+1; i++) {
+ if (i==missing_shard) {
+ continue;
+ }
+ ref.shard_reads[shard_id_t(i)].subchunk = ecode->default_sub_chunk;
+ }
+
+ listenerStub.acting_shards.erase(pg_shard_t(missing_shard, shard_id_t(missing_shard)));
+
+ pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request);
+
+ ASSERT_EQ(read_request, ref);
+
+ listenerStub.acting_shards.insert(pg_shard_t(1, shard_id_t(1)));
+ }
+
+ /* Read to every data shard, but with "fast" (redundant) reads */
+ {
+ ECUtil::shard_extent_set_t to_read_list(s.get_k_plus_m());
+ hobject_t hoid;
+
+ extent_set extents_to_read;
+ for (shard_id_t i; i<k; ++i) {
+ to_read_list[i].insert(int(i) * 2 * page_size, page_size);
+ extents_to_read.insert(int(i) * 2 * page_size, page_size);
+ }
+ ECCommon::read_request_t read_request(to_read_list, false, object_size);
+
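+    // Fast (redundant) reads are expected to request the combined extent set
+    // from every shard, data and parity alike.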
+ pipeline.get_min_avail_to_read_shards(hoid, false, true, read_request);
+
+ ECCommon::read_request_t ref(to_read_list, false, object_size);
+ for (unsigned int i=0; i<k+2; i++) {
+ ECCommon::shard_read_t shard_read;
+ shard_read.subchunk = ecode->default_sub_chunk;
+ shard_read.extents = extents_to_read;
+ shard_read.pg_shard = pg_shard_t(i, shard_id_t(i));
+ ref.shard_reads[shard_id_t(i)] = shard_read;
+ }
+
+ ASSERT_EQ(read_request, ref);
+ }
+
+  /* Read to every data shard, with the missing shard reported via error_shards. */
+ {
+ ECUtil::shard_extent_set_t to_read_list(s.get_k_plus_m());
+ hobject_t hoid;
+
+ for (shard_id_t i; i<k; ++i) {
+ to_read_list[i].insert(int(i) * 2 * page_size, page_size);
+ }
+ ECCommon::read_request_t read_request(to_read_list, false, object_size);
+
+ shard_id_t missing_shard(1);
+ int parity_shard = k;
+ std::set<pg_shard_t> error_shards;
+ error_shards.emplace(int(missing_shard), shard_id_t(missing_shard));
+ // Similar to previous tests with missing shards, but this time, emulate
+ // the shard being missing as a result of a bad read.
+ pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request, error_shards);
+
+ ECCommon::read_request_t ref(to_read_list, false, object_size);
+ std::vector<ECCommon::shard_read_t> want_to_read(empty_shard_vector);
+ for (shard_id_t i; i<k; ++i) {
+ if (i != missing_shard) {
+ want_to_read[int(i)].subchunk = ecode->default_sub_chunk;
+ want_to_read[int(i)].extents.union_of(to_read_list[missing_shard]);
+ want_to_read[int(i)].extents.union_of(to_read_list[i]);
+ want_to_read[int(i)].pg_shard = pg_shard_t(int(i), shard_id_t(i));
+ ref.shard_reads[shard_id_t(i)] = want_to_read[int(i)];
+ } else {
+ ECCommon::shard_read_t parity_shard_read;
+ parity_shard_read.subchunk = ecode->default_sub_chunk;
+ parity_shard_read.extents.union_of(to_read_list[missing_shard]);
+ parity_shard_read.pg_shard = pg_shard_t(parity_shard, shard_id_t(parity_shard));
+ ref.shard_reads[shard_id_t(parity_shard)] = parity_shard_read;
+ }
+ }
+
+ ASSERT_EQ(read_request, ref);
+
+ listenerStub.acting_shards.insert(pg_shard_t(1, shard_id_t(1)));
+ }
+}
+
+TEST(ECCommon, shard_read_combo_tests)
+{
+ const uint64_t page_size = CEPH_PAGE_SIZE;
+ const uint64_t swidth = 2*page_size;
+ const unsigned int k = 2;
+ const unsigned int m = 2;
+ const int nshards = 4;
+ const uint64_t object_size = swidth * 1024;
+ hobject_t hoid;
+
+ ECUtil::stripe_info_t s(k, m, swidth, vector<shard_id_t>(0));
+ ECListenerStub listenerStub;
+ ASSERT_EQ(s.get_stripe_width(), swidth);
+ ASSERT_EQ(s.get_chunk_size(), swidth/k);
+
+ const std::vector<int> chunk_mapping = {}; // no remapping
+ ErasureCodeDummyImpl *ecode = new ErasureCodeDummyImpl();
+ ErasureCodeInterfaceRef ec_impl(ecode);
+ ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub);
+
+ for (int i = 0; i < nshards; i++) {
+ listenerStub.acting_shards.insert(pg_shard_t(i, shard_id_t(i)));
+ }
+
+ {
+ ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+
+    ec_align_t to_read(36*1024, 10*1024, 1);
+ pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+ ECCommon::read_request_t read_request(want_to_read, false, object_size);
+
+ pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request);
+
+ ECCommon::read_request_t ref(want_to_read, false, object_size);
+ {
+ ECCommon::shard_read_t shard_read;
+ shard_read.subchunk = ecode->default_sub_chunk;
+ shard_read.extents.insert(20*1024, 4*1024);
+ shard_read.pg_shard = pg_shard_t(0, shard_id_t(0));
+ ref.shard_reads[shard_id_t(0)] = shard_read;
+ }
+ {
+ ECCommon::shard_read_t shard_read;
+ shard_read.subchunk = ecode->default_sub_chunk;
+ shard_read.extents.insert(16*1024, 8*1024);
+ shard_read.pg_shard = pg_shard_t(1, shard_id_t(1));
+ ref.shard_reads[shard_id_t(1)] = shard_read;
+ }
+
+ ASSERT_EQ(read_request, ref);
+ }
+
+ {
+ ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+
+    ec_align_t to_read(12*1024, 12*1024, 1);
+ pipeline.get_min_want_to_read_shards(to_read, want_to_read);
+ ECCommon::read_request_t read_request(want_to_read, false, object_size);
+ pipeline.get_min_avail_to_read_shards(hoid, false, false, read_request);
+
+ ECCommon::read_request_t ref(want_to_read, false, object_size);
+ {
+ ECCommon::shard_read_t shard_read;
+ shard_read.subchunk = ecode->default_sub_chunk;
+ shard_read.extents.insert(8*1024, 4*1024);
+ shard_read.pg_shard = pg_shard_t(0, shard_id_t(0));
+ ref.shard_reads[shard_id_t(0)] = shard_read;
+ }
+ {
+ ECCommon::shard_read_t shard_read;
+ shard_read.subchunk = ecode->default_sub_chunk;
+ shard_read.extents.insert(4*1024, 8*1024);
+ shard_read.pg_shard = pg_shard_t(1, shard_id_t(1));
+ ref.shard_reads[shard_id_t(1)] = shard_read;
+ }
+
+ ASSERT_EQ(read_request, ref);
}
}
const uint64_t swidth = 4096;
const unsigned int k = 4;
const unsigned int m = 2;
+ const uint64_t csize = 1024;
ECUtil::stripe_info_t s(k, m, swidth);
ASSERT_EQ(s.get_stripe_width(), swidth);
ASSERT_EQ(s.get_chunk_size(), 1024);
- std::set<int> want_to_read;
+ ECListenerStub listenerStub;
+ ASSERT_EQ(s.get_stripe_width(), swidth);
+ ASSERT_EQ(s.get_chunk_size(), csize);
+
+ const std::vector<int> chunk_mapping = {}; // no remapping
+ ErasureCodeInterfaceRef ec_impl(new ErasureCodeDummyImpl);
+ ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub);
+
+ ECUtil::shard_extent_set_t want_to_read(s.get_k_plus_m());
+  ec_align_t to_read1(512, 512, 1);
+  ec_align_t to_read2(512+16*1024, 512, 1);
+
+ ECUtil::shard_extent_set_t ref(s.get_k_plus_m());
+
+ ref[shard_id_t(0)].insert(512, 512);
// multitple calls with the same want_to_read can happen during
- // multi-region reads.
+  // multi-region reads. This will create multiple extents in want_to_read.
+ {
+ pipeline.get_min_want_to_read_shards(
+ to_read1, want_to_read);
+ ASSERT_EQ(want_to_read, ref);
+
+ pipeline.get_min_want_to_read_shards(
+ to_read2, want_to_read);
+ // We have 4 data shards per stripe.
+    ref[shard_id_t(0)].insert(512+4*1024, 512);
+    ASSERT_EQ(want_to_read, ref);
+ }
+}
+
+TEST(ECCommon, get_remaining_shards)
+{
+ const uint64_t page_size = CEPH_PAGE_SIZE;
+ const uint64_t swidth = 64*page_size;
+ const unsigned int k = 4;
+ const unsigned int m = 2;
+ const int nshards = 6;
+ const uint64_t chunk_size = swidth / k;
+ const uint64_t object_size = swidth * 1024;
+
+ ECUtil::stripe_info_t s(k, m, swidth, vector<shard_id_t>(0));
+ ECListenerStub listenerStub;
+ ASSERT_EQ(s.get_stripe_width(), swidth);
+ ASSERT_EQ(s.get_chunk_size(), swidth/k);
+
+ const std::vector<int> chunk_mapping = {}; // no remapping
+ ErasureCodeDummyImpl *ecode = new ErasureCodeDummyImpl();
+ ErasureCodeInterfaceRef ec_impl(ecode);
+ ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub);
+
+ std::vector<ECCommon::shard_read_t> empty_shard_vector(k);
+ ECCommon::shard_read_t empty_shard_read;
+ fill(empty_shard_vector.begin(), empty_shard_vector.end(), empty_shard_read);
+
+ vector<pg_shard_t> pg_shards(nshards);
+ for (int i = 0; i < nshards; i++) {
+ pg_shards[i] = pg_shard_t(i, shard_id_t(i));
+ listenerStub.acting_shards.insert(pg_shards[i]);
+ }
+
{
- ECCommon::ReadPipeline::get_min_want_to_read_shards(
- 512, 512, s, &want_to_read);
- ASSERT_EQ(want_to_read, std::set<int>{0});
- ECCommon::ReadPipeline::get_min_want_to_read_shards(
- 512+16*1024, 512, s, &want_to_read);
- ASSERT_EQ(want_to_read, std::set<int>{0});
+ hobject_t hoid;
+
+ // Mock up a read request
+ ECUtil::shard_extent_set_t to_read(s.get_k_plus_m());
+ to_read[shard_id_t(0)].insert(0, 4096);
+ ECCommon::read_request_t read_request(to_read, false, object_size);
+ int missing_shard = 0;
+
+ // Mock up a read result.
+ ECCommon::read_result_t read_result(&s);
+ read_result.errors.emplace(pg_shards[missing_shard], -EIO);
+
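+    // Shard 0 failed with EIO, so the retry fetches the same extent from the
+    // remaining data shards and substitutes a parity shard for shard 0.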
+ pipeline.get_remaining_shards(hoid, read_result, read_request, false, false);
+
+ ECCommon::read_request_t ref(to_read, false, object_size);
+ int parity_shard = 4;
+ for (unsigned int i=0; i<k; i++) {
+ ECCommon::shard_read_t shard_read;
+ shard_read.subchunk = ecode->default_sub_chunk;
+ shard_read.extents.insert(0,4096);
+ unsigned int shard_id = i==missing_shard?parity_shard:i;
+ shard_read.pg_shard = pg_shard_t(shard_id, shard_id_t(shard_id));
+ ref.shard_reads[shard_id_t(shard_id)] = shard_read;
+ }
+
+ ASSERT_EQ(read_request, ref);
+ }
+
+ // Request re-read. There is a page of overlap in what is already read.
+ {
+ hobject_t hoid;
+
+ ECUtil::shard_extent_set_t to_read(s.get_k_plus_m());
+ s.ro_range_to_shard_extent_set(chunk_size/2, chunk_size+page_size, to_read);
+ ECCommon::read_request_t read_request(to_read, false, object_size);
+ unsigned int missing_shard = 1;
+
+ // Mock up a read result.
+ ECCommon::read_result_t read_result(&s);
+ read_result.errors.emplace(pg_shards[missing_shard], -EIO);
+ buffer::list bl;
+ bl.append_zero(chunk_size/2);
+ read_result.buffers_read.insert_in_shard(shard_id_t(0), chunk_size/2, bl);
+ read_result.processed_read_requests[shard_id_t(0)].insert(chunk_size/2, bl.length());
+
+ pipeline.get_remaining_shards(hoid, read_result, read_request, false, false);
+
+    // The re-read must reconstruct the unread part of shard 1: shard 0 only
+    // needs its first chunk_size/2 (the rest was already read), while shards
+    // 2, 3 and parity shard 4 read the full overlapping range.
+ ECCommon::read_request_t ref(to_read, false, object_size);
+ int parity_shard = 4;
+ for (unsigned int i=0; i<k; i++) {
+ ECCommon::shard_read_t shard_read;
+ shard_read.subchunk = ecode->default_sub_chunk;
+ unsigned int shard_id = i==missing_shard?parity_shard:i;
+ ref.shard_reads[shard_id_t(shard_id)] = shard_read;
+ }
+ ref.shard_reads[shard_id_t(0)].extents.insert(0, chunk_size/2);
+ ref.shard_reads[shard_id_t(0)].pg_shard = pg_shards[0];
+ ref.shard_reads[shard_id_t(2)].extents.insert(0, chunk_size/2+page_size);
+ ref.shard_reads[shard_id_t(2)].pg_shard = pg_shards[2];
+ ref.shard_reads[shard_id_t(3)].extents.insert(0, chunk_size/2+page_size);
+ ref.shard_reads[shard_id_t(3)].pg_shard = pg_shards[3];
+ ref.shard_reads[shard_id_t(4)].extents.insert(0, chunk_size/2+page_size);
+ ref.shard_reads[shard_id_t(4)].pg_shard = pg_shards[4];
+ ASSERT_EQ(read_request, ref);
}
}
+
+TEST(ECCommon, encode)
+{
+ const uint64_t page_size = CEPH_PAGE_SIZE;
+ const uint64_t swidth = 2*page_size;
+ const unsigned int k = 2;
+ const unsigned int m = 2;
+
+ ECUtil::stripe_info_t s(k, m, swidth, vector<shard_id_t>(0));
+ ECListenerStub listenerStub;
+ ASSERT_EQ(s.get_stripe_width(), swidth);
+ ASSERT_EQ(s.get_chunk_size(), swidth/k);
+
+ const std::vector<int> chunk_mapping = {}; // no remapping
+ ErasureCodeDummyImpl *ecode = new ErasureCodeDummyImpl();
+ ErasureCodeInterfaceRef ec_impl(ecode);
+ ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub);
+
+ ECUtil::shard_extent_map_t semap(&s);
+
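+  // Populate data shards with 2 KiB and parity shards with 4 KiB at shard
+  // offset 12 KiB, then run encode as a smoke test.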
+ for (shard_id_t i; i<k+m; ++i) {
+ bufferlist bl;
+ bl.append_zero(i>=k?4096:2048);
+ semap.insert_in_shard(i, 12*1024, bl);
+ }
+ semap.encode(ec_impl, nullptr, 0);
+}
+
+TEST(ECCommon, decode)
+{
+ const uint64_t page_size = CEPH_PAGE_SIZE;
+ const uint64_t swidth = 3*page_size;
+ const unsigned int k = 3;
+ const unsigned int m = 2;
+
+ ECUtil::stripe_info_t s(k, m, swidth, vector<shard_id_t>(0));
+ ECListenerStub listenerStub;
+ ASSERT_EQ(s.get_stripe_width(), swidth);
+ ASSERT_EQ(s.get_chunk_size(), swidth/k);
+
+ const std::vector<int> chunk_mapping = {}; // no remapping
+ ErasureCodeDummyImpl *ecode = new ErasureCodeDummyImpl();
+ ecode->data_chunk_count = k;
+ ecode->chunk_count = k + m;
+ ErasureCodeInterfaceRef ec_impl(ecode);
+ ECCommon::ReadPipeline pipeline(g_ceph_context, ec_impl, s, &listenerStub);
+
+ ECUtil::shard_extent_map_t semap(&s);
+ bufferlist bl12k;
+ bl12k.append_zero(12288);
+ bufferlist bl8k;
+ bl8k.append_zero(8192);
+ bufferlist bl16k;
+ bl16k.append_zero(16384);
+ semap.insert_in_shard(shard_id_t(1), 512000, bl12k);
+ semap.insert_in_shard(shard_id_t(1), 634880, bl12k);
+ semap.insert_in_shard(shard_id_t(2), 512000, bl12k);
+ semap.insert_in_shard(shard_id_t(2), 630784, bl16k);
+ semap.insert_in_shard(shard_id_t(3), 516096, bl8k);
+ semap.insert_in_shard(shard_id_t(3), 634880, bl12k);
+ ECUtil::shard_extent_set_t want = semap.get_extent_set();
+
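+  // Beyond what is already populated, ask decode to rebuild ranges on shard 0
+  // (data) and shard 4 (parity) from shards 1-3.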
+ want[shard_id_t(0)].insert(516096, 8192);
+ want[shard_id_t(0)].insert(634880, 12288);
+ want[shard_id_t(4)].insert(516096, 8192);
+ want[shard_id_t(4)].insert(634880, 12288);
+
+ ceph_assert(0 == semap.decode(ec_impl, want, 2*1024*1024));
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <iostream>
+#include <errno.h>
+#include <signal.h>
+#include "osd/ECUtil.h"
+#include "gtest/gtest.h"
+#include "osd/osd_types.h"
+#include "common/ceph_argparse.h"
+#include "osd/ECTransaction.h"
+
+using namespace std;
+using namespace ECUtil;
+
+// FIXME: Once the outstanding PRs are in, the other ECUtil tests should be moved here.
+
+TEST(ECUtil, stripe_info_t_chunk_mapping)
+{
+ int k=4;
+ int m=2;
+ int chunk_size = 4096;
+ vector<shard_id_t> forward_cm(k+m);
+ vector<shard_id_t> reverse_cm(k+m);
+
+ std::iota(forward_cm.begin(), forward_cm.end(), 0);
+ std::iota(reverse_cm.rbegin(), reverse_cm.rend(), 0);
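+  // forward_cm maps shard i to raw shard i; reverse_cm maps shard i to raw
+  // shard (k + m - 1 - i).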
+
+ stripe_info_t forward_sinfo1(k, m, chunk_size*k);
+ stripe_info_t forward_sinfo2(k, m, chunk_size*k, forward_cm);
+ stripe_info_t reverse_sinfo(k, m, chunk_size*k, reverse_cm);
+
+ for (shard_id_t shard_id : forward_cm) {
+ raw_shard_id_t raw_shard_id((int)shard_id);
+ ASSERT_EQ(shard_id, forward_sinfo1.get_shard(raw_shard_id));
+ ASSERT_EQ(raw_shard_id, forward_sinfo1.get_raw_shard(shard_id));
+ ASSERT_EQ(shard_id, forward_sinfo2.get_shard(raw_shard_id));
+ ASSERT_EQ(raw_shard_id, forward_sinfo2.get_raw_shard(shard_id));
+ ASSERT_EQ(shard_id, reverse_sinfo.get_shard(raw_shard_id_t(k + m - int(raw_shard_id) - 1)));
+    ASSERT_EQ(raw_shard_id_t(k + m - int(shard_id) - 1), reverse_sinfo.get_raw_shard(shard_id));
+ }
+
+ ASSERT_EQ(k, forward_sinfo1.get_k());
+ ASSERT_EQ(m, forward_sinfo1.get_m());
+ ASSERT_EQ(k+m, forward_sinfo1.get_k_plus_m());
+}
+
+TEST(ECUtil, shard_extent_map_t)
+{
+ int k=4;
+ int m=2;
+ int chunk_size = 4096;
+ stripe_info_t sinfo(k, m, chunk_size*k, vector<shard_id_t>(0));
+
+ // insert_in_shard
+ {
+ shard_extent_map_t semap(&sinfo);
+ int new_off = 512;
+ int new_len = 1024;
+ shard_id_t shard0(0);
+ shard_id_t shard2(2);
+
+ // Empty
+ ASSERT_FALSE(semap.contains_shard(shard_id_t(0)));
+ ASSERT_FALSE(semap.contains_shard(shard_id_t(1)));
+ ASSERT_FALSE(semap.contains_shard(shard_id_t(2)));
+ ASSERT_FALSE(semap.contains_shard(shard_id_t(3)));
+ ASSERT_TRUE(semap.empty());
+ ASSERT_EQ(std::numeric_limits<uint64_t>::max(), semap.get_ro_start());
+ ASSERT_EQ(std::numeric_limits<uint64_t>::max(), semap.get_ro_end());
+ ASSERT_EQ(std::numeric_limits<uint64_t>::max(), semap.get_start_offset());
+ ASSERT_EQ(std::numeric_limits<uint64_t>::max(), semap.get_end_offset());
+
+
+ // Insert a 1k buffer in shard 2
+ buffer::list bl;
+ bl.append_zero(new_len);
+ semap.insert_in_shard(shard2, new_off, bl);
+ ASSERT_FALSE(semap.contains_shard(shard_id_t(0)));
+ ASSERT_FALSE(semap.contains_shard(shard_id_t(1)));
+ ASSERT_TRUE(semap.contains_shard(shard_id_t(2)));
+ ASSERT_FALSE(semap.contains_shard(shard_id_t(3)));
+ ASSERT_FALSE(semap.empty());
+ ASSERT_EQ(int(shard2) * chunk_size + new_off, semap.get_ro_start());
+ ASSERT_EQ(int(shard2) * chunk_size + new_off + new_len, semap.get_ro_end());
+ ASSERT_EQ(new_off, semap.get_start_offset());
+ ASSERT_EQ(new_off + bl.length(), semap.get_end_offset());
+ auto iter = semap.get_extent_map(shard2).begin();
+ ASSERT_EQ(new_off, iter.get_off());
+ ASSERT_EQ(new_len, iter.get_len());
+ ++iter;
+ ASSERT_EQ(semap.get_extent_map(shard2).end(), iter);
+
+ // Insert a 1k buffer in shard 0
+ semap.insert_in_shard(shard0, new_off, bl);
+ ASSERT_TRUE(semap.contains_shard(shard_id_t(0)));
+ ASSERT_FALSE(semap.contains_shard(shard_id_t(1)));
+ ASSERT_TRUE(semap.contains_shard(shard_id_t(2)));
+ ASSERT_FALSE(semap.contains_shard(shard_id_t(3)));
+ ASSERT_FALSE(semap.empty());
+ ASSERT_EQ(int(shard0) * chunk_size + new_off, semap.get_ro_start());
+ ASSERT_EQ(int(shard2) * chunk_size + new_off + new_len, semap.get_ro_end());
+ ASSERT_EQ(new_off, semap.get_start_offset());
+ ASSERT_EQ(new_off + bl.length(), semap.get_end_offset());
+ iter = semap.get_extent_map(shard0).begin();
+ ASSERT_EQ(new_off, iter.get_off());
+ ASSERT_EQ(new_len, iter.get_len());
+ ++iter;
+ ASSERT_EQ(semap.get_extent_map(shard0).end(), iter);
+ iter = semap.get_extent_map(shard2).begin();
+ ASSERT_EQ(new_off, iter.get_off());
+ ASSERT_EQ(new_len, iter.get_len());
+ ++iter;
+ ASSERT_EQ(semap.get_extent_map(shard2).end(), iter);
+
+ /* Insert overlapping into next stripe */
+ semap.insert_in_shard(shard2, chunk_size - 512, bl);
+ ASSERT_EQ(int(shard0) * chunk_size + new_off, semap.get_ro_start());
+ ASSERT_EQ((int(shard2) + k) * chunk_size + 512, semap.get_ro_end());
+ ASSERT_EQ(new_off, semap.get_start_offset());
+ ASSERT_EQ(chunk_size - 512 + bl.length(), semap.get_end_offset());
+
+ iter = semap.get_extent_map(shard2).begin();
+ ASSERT_EQ(new_off, iter.get_off());
+ ASSERT_EQ(new_len, iter.get_len());
+ ++iter;
+ ASSERT_EQ(chunk_size - 512, iter.get_off());
+ ASSERT_EQ(new_len, iter.get_len());
+ ++iter;
+ ASSERT_EQ(semap.get_extent_map(shard2).end(), iter);
+ }
+
+ //insert_ro_extent_map
+ //erase_after_ro_offset
+ {
+ shard_extent_map_t semap(&sinfo);
+ extent_map emap;
+ buffer::list bl1k;
+ buffer::list bl16k;
+ buffer::list bl64k;
+
+ bl1k.append_zero(1024);
+ bl16k.append_zero(chunk_size * k);
+ bl64k.append_zero(chunk_size * k * 4);
+ shard_extent_set_t ref(sinfo.get_k_plus_m());
+
+ // 1: Strangely aligned. (shard 0 [5~1024])
+ emap.insert(5, 1024, bl1k);
+ ref[shard_id_t(0)].insert(5, 1024);
+ // 2: Start of second chunk (shard 1 [0~1024])
+ emap.insert(chunk_size, 1024, bl1k);
+ ref[shard_id_t(1)].insert(0, 1024);
+ // 3: Overlap two chunks (shard1[3584~512], shard2[0~512])
+ emap.insert(chunk_size*2 - 512, 1024, bl1k);
+ ref[shard_id_t(1)].insert(3584, 512);
+ ref[shard_id_t(2)].insert(0, 512);
+ // 4: Overlap two stripes (shard3[3584~512], shard0[4096~512])
+ emap.insert(chunk_size*4 - 512, 1024, bl1k);
+ ref[shard_id_t(3)].insert(3584, 512);
+ ref[shard_id_t(0)].insert(4096, 512);
+ // 5: Full stripe (shard*[8192~4096])
+ emap.insert(chunk_size*k*2, chunk_size*k, bl16k);
+ for (auto &&[_, eset] : ref)
+ eset.insert(8192, 4096);
+ // 6: Two half stripes (shard0,1[20480~4096], shard 2,3[16384~4096])
+ emap.insert(chunk_size*k*4 + 2*chunk_size, chunk_size * k, bl16k);
+ ref[shard_id_t(0)].insert(20480, 4096);
+ ref[shard_id_t(1)].insert(20480, 4096);
+ ref[shard_id_t(2)].insert(16384, 4096);
+ ref[shard_id_t(3)].insert(16384, 4096);
+
+    // 7: Two half stripes, strange alignment (shard0,1[36864~4096], shard2[32773~4096], shard3[32768~4096])
+ emap.insert(chunk_size*k*8 + 2*chunk_size + 5, chunk_size * k, bl16k);
+ ref[shard_id_t(0)].insert(36864, 4096);
+ ref[shard_id_t(1)].insert(36864, 4096);
+ ref[shard_id_t(2)].insert(32773, 4096);
+ ref[shard_id_t(3)].insert(32768, 4096);
+
+    // 8: Multiple stripes (shard*[49152~16384])
+ emap.insert(chunk_size*k*12, chunk_size * k * 4, bl64k);
+ for (auto &&[_, eset] : ref)
+ eset.insert(49152, 16384);
+
+ semap.insert_ro_extent_map(emap);
+ for (auto &&[shard, eset] : ref) {
+ ASSERT_EQ(eset, semap.get_extent_set(shard)) << "shard=" << shard;
+ }
+ ASSERT_EQ(emap.get_start_off(), semap.get_ro_start());
+ ASSERT_EQ(emap.get_end_off(), semap.get_ro_end());
+ ASSERT_EQ(0, semap.get_start_offset());
+ ASSERT_EQ(chunk_size * 16, semap.get_end_offset());
+
+ /* Erase the later parts at an obscure offset. */
+ semap.erase_after_ro_offset(chunk_size * k * 8 + 2 * chunk_size + 512);
+
+ {
+ extent_set tmp;
+
+ tmp.union_insert(0, chunk_size * 8);
+ ref[shard_id_t(3)].intersection_of(tmp);
+ tmp.union_insert(0, chunk_size * 8 + 512);
+ ref[shard_id_t(2)].intersection_of(tmp);
+ tmp.union_insert(0, chunk_size * 9);
+ ref[shard_id_t(1)].intersection_of(tmp);
+ ref[shard_id_t(0)].intersection_of(tmp);
+ }
+
+ for (auto &&[shard, eset] : ref) {
+ ASSERT_EQ(eset, semap.get_extent_set(shard)) << "shard=" << shard;
+ }
+ ASSERT_EQ(5, semap.get_ro_start());
+ ASSERT_EQ(chunk_size * k * 8 + 2 * chunk_size + 512, semap.get_ro_end());
+ ASSERT_EQ(0, semap.get_start_offset());
+ ASSERT_EQ(33280, semap.get_end_offset());
+
+ /* Append again */
+ semap.append_zeros_to_ro_offset(chunk_size * k * 9 + 2 * chunk_size + 512);
+ ref[shard_id_t(0)].insert(chunk_size * 9, chunk_size);
+ ref[shard_id_t(1)].insert(chunk_size * 9, chunk_size);
+ ref[shard_id_t(2)].insert(chunk_size * 8 + 512, chunk_size);
+ ref[shard_id_t(3)].insert(chunk_size * 8, chunk_size);
+
+ for (auto &&[shard, eset] : ref) {
+ ASSERT_EQ(eset, semap.get_extent_set(shard)) << "shard=" << shard;
+ }
+ ASSERT_EQ(5, semap.get_ro_start());
+ ASSERT_EQ(chunk_size * k * 9 + 2 * chunk_size + 512, semap.get_ro_end());
+ ASSERT_EQ(0, semap.get_start_offset());
+ ASSERT_EQ(chunk_size * 10, semap.get_end_offset());
+
+ /* Append nothing */
+ semap.append_zeros_to_ro_offset(chunk_size * k * 9 + 2 * chunk_size + 512);
+ for (auto &&[shard, eset] : ref) {
+ ASSERT_EQ(eset, semap.get_extent_set(shard)) << "shard=" << shard;
+ }
+ ASSERT_EQ(5, semap.get_ro_start());
+ ASSERT_EQ(chunk_size * k * 9 + 2 * chunk_size + 512, semap.get_ro_end());
+ ASSERT_EQ(0, semap.get_start_offset());
+ ASSERT_EQ(chunk_size * 10, semap.get_end_offset());
+
+ /* Append, to an offset before the end */
+ semap.append_zeros_to_ro_offset(chunk_size * k * 8 + 2 * chunk_size + 512);
+ for (auto &&[shard, eset] : ref) {
+ ASSERT_EQ(eset, semap.get_extent_set(shard)) << "shard=" << shard;
+ }
+ ASSERT_EQ(5, semap.get_ro_start());
+ ASSERT_EQ(chunk_size * k * 9 + 2 * chunk_size + 512, semap.get_ro_end());
+ ASSERT_EQ(0, semap.get_start_offset());
+ ASSERT_EQ(chunk_size * 10, semap.get_end_offset());
+
+ /* Intersect the beginning ro range */
+ shard_extent_map_t semap2 = semap.intersect_ro_range(chunk_size * 2 - 256,
+ chunk_size * k * 8);
+
+ /* The original semap should be untouched */
+ for (auto &&[shard, eset] : ref) {
+ ASSERT_EQ(eset, semap.get_extent_set(shard)) << "shard=" << shard;
+ }
+ ASSERT_EQ(5, semap.get_ro_start());
+ ASSERT_EQ(chunk_size * k * 9 + 2 * chunk_size + 512, semap.get_ro_end());
+ ASSERT_EQ(0, semap.get_start_offset());
+ ASSERT_EQ(chunk_size * 10, semap.get_end_offset());
+ {
+ extent_set tmp;
+ tmp.insert(chunk_size, chunk_size * 8);
+ ref[shard_id_t(0)].intersection_of(tmp);
+ }
+ {
+ extent_set tmp;
+ tmp.insert(chunk_size - 256, chunk_size * 8);
+ ref[shard_id_t(1)].intersection_of(tmp);
+ }
+ {
+ extent_set tmp;
+ tmp.insert(0, chunk_size * 8);
+ ref[shard_id_t(2)].intersection_of(tmp);
+ ref[shard_id_t(3)].intersection_of(tmp);
+ }
+
+ for (auto &&[shard, eset] : ref) {
+ ASSERT_EQ(eset, semap2.get_extent_set(shard)) << "shard=" << shard;
+ }
+ ASSERT_EQ(chunk_size*2 - 256, semap2.get_ro_start());
+ ASSERT_EQ(chunk_size * (k * 5 + 2), semap2.get_ro_end())
+ << "semap2=" << semap2;
+ ASSERT_EQ(0, semap2.get_start_offset());
+ ASSERT_EQ(chunk_size * 6, semap2.get_end_offset());
+
+    // intersect with something bigger and it should be identical
+ semap2 = semap2.intersect_ro_range(0, chunk_size * k * 10);
+ for (auto &&[shard, eset] : ref) {
+ ASSERT_EQ(eset, semap2.get_extent_set(shard)) << "shard=" << shard;
+ }
+ ASSERT_EQ(chunk_size * 2 - 256, semap2.get_ro_start());
+ ASSERT_EQ(chunk_size * (k * 5 + 2), semap2.get_ro_end());
+ ASSERT_EQ(0, semap2.get_start_offset());
+ ASSERT_EQ(chunk_size * 6, semap2.get_end_offset());
+
+ extent_set superset;
+ for (auto &&[_, eset] : ref)
+ superset.union_of(eset);
+
+ ASSERT_EQ(superset, semap2.get_extent_superset());
+ }
+
+ // To test "encode" we need more framework... So will leave to higher level
+ // tests.
+}
+
+// This scenario went wrong in the EC transaction code in a cluster-based test.
+TEST(ECUtil, shard_extent_map_t_scenario_1)
+{
+ int k=2;
+ int m=2;
+ int chunk_size = 4096;
+ stripe_info_t sinfo(k, m, chunk_size*k, vector<shard_id_t>(0));
+ shard_extent_map_t semap(&sinfo);
+
+ bufferlist bl;
+ bl.append_zero(chunk_size);
+ semap.insert_in_shard(shard_id_t(0), chunk_size, bl);
+ semap.insert_in_shard(shard_id_t(0), chunk_size*3, bl);
+ semap.insert_in_shard(shard_id_t(1), chunk_size, bl);
+ semap.insert_in_shard(shard_id_t(1), chunk_size*3, bl);
+
+ for (int i=0; i<k; i++) {
+ auto &&iter = semap.get_extent_map(shard_id_t(i)).begin();
+ ASSERT_EQ(chunk_size, iter.get_off());
+ ASSERT_EQ(chunk_size, iter.get_len());
+ ++iter;
+ ASSERT_EQ(chunk_size*3, iter.get_off());
+ ASSERT_EQ(chunk_size, iter.get_len());
+ ++iter;
+ ASSERT_EQ(semap.get_extent_map(shard_id_t(i)).end(), iter);
+ }
+ ASSERT_FALSE(semap.contains_shard(shard_id_t(2)));
+ ASSERT_FALSE(semap.contains_shard(shard_id_t(3)));
+ ASSERT_EQ(2*chunk_size, semap.get_ro_start());
+ ASSERT_EQ(8*chunk_size, semap.get_ro_end());
+ ASSERT_EQ(chunk_size, semap.get_start_offset());
+ ASSERT_EQ(4*chunk_size, semap.get_end_offset());
+
+ bufferlist bl2;
+ bl2.append_zero(2048);
+ bl2.c_str()[0]='A';
+ ASSERT_EQ('A', bl2.c_str()[0]);
+ bufferlist bl3;
+ bl3.append_zero(2048);
+ bl3.c_str()[0]='B';
+ ASSERT_EQ('B', bl3.c_str()[0]);
+ sinfo.ro_range_to_shard_extent_map(3*chunk_size, 2048, bl2, semap);
+ sinfo.ro_range_to_shard_extent_map(6*chunk_size, 2048, bl3, semap);
+
+ for (int i=0; i<k; i++) {
+ auto &&iter = semap.get_extent_map(shard_id_t(i)).begin();
+ ASSERT_EQ(chunk_size, iter.get_off());
+ ASSERT_EQ(chunk_size, iter.get_len());
+ ++iter;
+ ASSERT_EQ(chunk_size*3, iter.get_off());
+ ASSERT_EQ(chunk_size, iter.get_len());
+ ++iter;
+ ASSERT_EQ(semap.get_extent_map(shard_id_t(i)).end(), iter);
+ }
+ ASSERT_FALSE(semap.contains_shard(shard_id_t(2)));
+ ASSERT_FALSE(semap.contains_shard(shard_id_t(3)));
+ ASSERT_EQ(2*chunk_size, semap.get_ro_start());
+ ASSERT_EQ(8*chunk_size, semap.get_ro_end());
+ ASSERT_EQ(chunk_size, semap.get_start_offset());
+ ASSERT_EQ(4*chunk_size, semap.get_end_offset());
+
+
+ shard_extent_map_t semap2 = semap.intersect_ro_range(0, 8*chunk_size);
+ for (int i=0; i<k; i++) {
+ auto &&iter = semap.get_extent_map(shard_id_t(i)).begin();
+ ASSERT_EQ(chunk_size, iter.get_off());
+ ASSERT_EQ(chunk_size, iter.get_len());
+ ++iter;
+ ASSERT_EQ(chunk_size*3, iter.get_off());
+ ASSERT_EQ(chunk_size, iter.get_len());
+ ++iter;
+ ASSERT_EQ(semap.get_extent_map(shard_id_t(i)).end(), iter);
+ }
+
+ ASSERT_FALSE(semap.contains_shard(shard_id_t(2)));
+ ASSERT_FALSE(semap.contains_shard(shard_id_t(3)));
+
+ for (int i=0; i<k; i++) {
+ auto &&iter = semap2.get_extent_map(shard_id_t(i)).begin();
+ ASSERT_EQ(chunk_size, iter.get_off());
+ ASSERT_EQ(chunk_size, iter.get_len());
+ ++iter;
+ ASSERT_EQ(chunk_size*3, iter.get_off());
+ ASSERT_EQ(chunk_size, iter.get_len());
+ ++iter;
+ ASSERT_EQ(semap2.get_extent_map(shard_id_t(i)).end(), iter);
+ }
+
+ ASSERT_FALSE(semap2.contains_shard(shard_id_t(2)));
+ ASSERT_FALSE(semap2.contains_shard(shard_id_t(3)));
+
+ semap2.insert_parity_buffers();
+ for (int i=0; i<(k+m); i++) {
+ auto &&iter = semap2.get_extent_map(shard_id_t(i)).begin();
+ ASSERT_EQ(chunk_size, iter.get_off());
+ ASSERT_EQ(chunk_size, iter.get_len());
+ ++iter;
+ ASSERT_EQ(chunk_size*3, iter.get_off());
+ ASSERT_EQ(chunk_size, iter.get_len());
+ ++iter;
+ ASSERT_EQ(semap2.get_extent_map(shard_id_t(i)).end(), iter);
+ }
+}
+
+
+// This scenario went wrong in the EC transaction code in a cluster-based test.
+/*
+ *Recreate of this failure:
+-171> 2024-10-07T11:38:23.746+0100 7fa0df6f4800 0 == test 1 Random offset, random length read/write I/O with queue depth 1 (seqseed 1137522502) ==
+-170> 2024-10-07T11:38:23.746+0100 7fa0df6f4800 5 test Step 0: Create (size=44K)
+-169> 2024-10-07T11:38:23.787+0100 7fa0df6f4800 5 test Step 1: Barrier
+-168> 2024-10-07T11:38:23.787+0100 7fa0df6f4800 5 test Step 2: Write (offset=38K,length=4K)
+-167> 2024-10-07T11:38:23.829+0100 7fa0df6f4800 5 test Step 3: Barrier
+-166> 2024-10-07T11:38:23.829+0100 7fa0df6f4800 5 test Step 4: Write (offset=38K,length=4K)
+-165> 2024-10-07T11:38:23.876+0100 7fa0df6f4800 5 test Step 5: Barrier
+-164> 2024-10-07T11:38:23.876+0100 7fa0df6f4800 5 test Step 6: Write (offset=10K,length=6K)
+-163> 2024-10-07T11:38:23.963+0100 7fa0df6f4800 5 test Step 7: Barrier
+-162> 2024-10-07T11:38:23.963+0100 7fa0df6f4800 5 test Step 8: Write (offset=30K,length=2K)
+*/
+TEST(ECUtil, shard_extent_map_t_insert_ro_buffer)
+{
+ int k=2;
+ int m=2;
+ int chunk_size = 4096;
+ char c = 1;
+ stripe_info_t sinfo(k, m, chunk_size*k, vector<shard_id_t>(0));
+ shard_extent_map_t semap(&sinfo);
+
+ bufferlist bl;
+ bl.append_zero(44*1024);
+
+ char *buf = bl.c_str();
+
+ shard_extent_map_t ref_semap(&sinfo);
+ ref_semap.append_zeros_to_ro_offset(48*1024);
+
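+  // Build the reference map by hand: RO byte range [i*1024, (i+1)*1024) sits
+  // in 4k chunk i/4, which the test expects on shard (chunk % k) at shard
+  // offset chunk_size * (chunk / k) plus the 1k remainder, matching what
+  // ro_range_to_shard_extent_map should produce below.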
+ for (char i=0; i<44; i++) {
+ buf[i*1024] = c;
+ int chunk = i/4;
+ shard_id_t shard(chunk % k);
+ int offset = chunk_size * (chunk / k) + i % 4 * 1024;
+ bufferlist tmp;
+ ref_semap.get_buffer(shard, offset, 1024, tmp);
+ tmp.c_str()[0] = c++;
+ }
+
+ sinfo.ro_range_to_shard_extent_map(0, 44*1024, bl, semap);
+ semap.assert_buffer_contents_equal(ref_semap);
+ bufferlist insert_bl;
+ insert_bl.append_zero(2*1024);
+ insert_bl.c_str()[0] = c;
+ {
+ bufferlist tmp;
+ ref_semap.get_buffer(shard_id_t(1), 14*1024, 1024, tmp);
+ tmp.c_str()[0] = c++;
+ }
+ insert_bl.c_str()[1024] = c;
+ {
+ bufferlist tmp;
+ ref_semap.get_buffer(shard_id_t(1), 15*1024, 1024, tmp);
+ tmp.c_str()[0] = c++;
+ }
+
+ sinfo.ro_range_to_shard_extent_map(30*1024, 1024, insert_bl, semap);
+ semap.assert_buffer_contents_equal(ref_semap);
+}
+
+// Sanity check that k=3 buffer inserts work
+TEST(ECUtil, shard_extent_map_t_insert_ro_buffer_3)
+{
+ int k=3;
+ int m=2;
+ int chunk_size = 4096;
+ uint64_t ro_offset = 10 * 1024;
+ uint64_t ro_length = 32 * 1024;
+
+ char c = 5;
+ stripe_info_t sinfo(k, m, chunk_size*k, vector<shard_id_t>(0));
+ shard_extent_map_t semap(&sinfo);
+ bufferlist ref;
+ bufferlist in;
+ ref.append_zero(ro_length);
+ in.append_zero(ro_length);
+
+ for (uint64_t i=0; i<ro_length; i += 2048) {
+ ref.c_str()[i+8] = c;
+ in.c_str()[i+8] = c;
+ c++;
+ }
+
+ extent_map emap_in;
+ emap_in.insert(ro_offset, ro_length, in);
+ semap.insert_ro_extent_map(emap_in);
+ bufferlist out = semap.get_ro_buffer(ro_offset, ro_length);
+
+ ASSERT_TRUE(out.contents_equal(ref)) << semap.debug_string(2048, 0);
+}
+
+TEST(ECUtil, sinfo_ro_size_to_read_mask_lrc) {
+ std::vector<shard_id_t> chunk_mapping = {shard_id_t(1), shard_id_t(2), shard_id_t(0)};
+ stripe_info_t sinfo(2, 1, 2 * 4096, chunk_mapping);
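+  // The expected masks below imply this placement: data chunk 0 lands on
+  // shard 1, data chunk 1 on shard 2 and the parity chunk on shard 0, i.e.
+  // chunk_mapping[i] names the shard holding chunk i.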
+
+ {
+ shard_extent_set_t read_mask(sinfo.get_k_plus_m());
+ shard_extent_set_t zero_mask(sinfo.get_k_plus_m());
+ sinfo.ro_size_to_read_mask(1, read_mask);
+ sinfo.ro_size_to_zero_mask(1, zero_mask);
+
+ shard_extent_set_t ref_read(sinfo.get_k_plus_m());
+ shard_extent_set_t ref_zero(sinfo.get_k_plus_m());
+ ref_read[shard_id_t(1)].insert(0, 4096);
+ ref_zero[shard_id_t(2)].insert(0, 4096);
+ ref_read[shard_id_t(0)].insert(0, 4096);
+
+ ASSERT_EQ(ref_read, read_mask);
+ ASSERT_EQ(ref_zero, zero_mask);
+ }
+
+ {
+ shard_extent_set_t read_mask(sinfo.get_k_plus_m());
+ shard_extent_set_t zero_mask(sinfo.get_k_plus_m());
+ sinfo.ro_size_to_read_mask(38912, read_mask);
+ sinfo.ro_size_to_zero_mask(38912, zero_mask);
+
+ shard_extent_set_t ref_read(sinfo.get_k_plus_m());
+ shard_extent_set_t ref_zero(sinfo.get_k_plus_m());
+ ref_read[shard_id_t(1)].insert(0, 20480);
+ ref_read[shard_id_t(2)].insert(0, 20480);
+ ref_read[shard_id_t(0)].insert(0, 20480);
+
+ ASSERT_EQ(ref_read, read_mask);
+ ASSERT_EQ(ref_zero, zero_mask);
+ }
+}
+
+TEST(ECUtil, sinfo_ro_size_to_read_mask) {
+ stripe_info_t sinfo(2, 1, 16*4096);
+
+ {
+ shard_extent_set_t read_mask(sinfo.get_k_plus_m());
+ shard_extent_set_t zero_mask(sinfo.get_k_plus_m());
+ sinfo.ro_size_to_read_mask(1, read_mask);
+ sinfo.ro_size_to_zero_mask(1, zero_mask);
+
+ shard_extent_set_t ref_read(sinfo.get_k_plus_m());
+ shard_extent_set_t ref_zero(sinfo.get_k_plus_m());
+ ref_read[shard_id_t(0)].insert(0, 4096);
+ ref_zero[shard_id_t(1)].insert(0, 4096);
+ ref_read[shard_id_t(2)].insert(0, 4096);
+
+ ASSERT_EQ(ref_read, read_mask);
+ ASSERT_EQ(ref_zero, zero_mask);
+ }
+
+ {
+ shard_extent_set_t read_mask(sinfo.get_k_plus_m());
+ shard_extent_set_t zero_mask(sinfo.get_k_plus_m());
+ sinfo.ro_size_to_read_mask(4096, read_mask);
+ sinfo.ro_size_to_zero_mask(4096, zero_mask);
+
+ shard_extent_set_t ref_read(sinfo.get_k_plus_m());
+ shard_extent_set_t ref_zero(sinfo.get_k_plus_m());
+ ref_read[shard_id_t(0)].insert(0, 4096);
+ ref_zero[shard_id_t(1)].insert(0, 4096);
+ ref_read[shard_id_t(2)].insert(0, 4096);
+
+ ASSERT_EQ(ref_read, read_mask);
+ ASSERT_EQ(ref_zero, zero_mask);
+ }
+
+ {
+ shard_extent_set_t read_mask(sinfo.get_k_plus_m());
+ shard_extent_set_t zero_mask(sinfo.get_k_plus_m());
+ sinfo.ro_size_to_read_mask(4097, read_mask);
+ sinfo.ro_size_to_zero_mask(4097, zero_mask);
+
+ shard_extent_set_t ref_read(sinfo.get_k_plus_m());
+ shard_extent_set_t ref_zero(sinfo.get_k_plus_m());
+ ref_read[shard_id_t(0)].insert(0, 8192);
+ ref_zero[shard_id_t(1)].insert(0, 8192);
+ ref_read[shard_id_t(2)].insert(0, 8192);
+
+ ASSERT_EQ(ref_read, read_mask);
+ ASSERT_EQ(ref_zero, zero_mask);
+ }
+
+ {
+ shard_extent_set_t read_mask(sinfo.get_k_plus_m());
+ shard_extent_set_t zero_mask(sinfo.get_k_plus_m());
+ sinfo.ro_size_to_read_mask(8*4096+1, read_mask);
+ sinfo.ro_size_to_zero_mask(8*4096+1, zero_mask);
+
+ shard_extent_set_t ref_read(sinfo.get_k_plus_m());
+ shard_extent_set_t ref_zero(sinfo.get_k_plus_m());
+ ref_read[shard_id_t(0)].insert(0, 8*4096);
+ ref_read[shard_id_t(1)].insert(0, 4096);
+ ref_zero[shard_id_t(1)].insert(4096, 7*4096);
+ ref_read[shard_id_t(2)].insert(0, 8*4096);
+
+ ASSERT_EQ(ref_read, read_mask);
+ ASSERT_EQ(ref_zero, zero_mask);
+ }
+
+ {
+ shard_extent_set_t read_mask(sinfo.get_k_plus_m());
+ shard_extent_set_t zero_mask(sinfo.get_k_plus_m());
+ sinfo.ro_size_to_read_mask(16*4096+1, read_mask);
+ sinfo.ro_size_to_zero_mask(16*4096+1, zero_mask);
+
+ shard_extent_set_t ref_read(sinfo.get_k_plus_m());
+ shard_extent_set_t ref_zero(sinfo.get_k_plus_m());
+ ref_read[shard_id_t(0)].insert(0, 9*4096);
+ ref_read[shard_id_t(1)].insert(0, 8*4096);
+ ref_zero[shard_id_t(1)].insert(8*4096, 1*4096);
+ ref_read[shard_id_t(2)].insert(0, 9*4096);
+
+ ASSERT_EQ(ref_read, read_mask);
+ ASSERT_EQ(ref_zero, zero_mask);
+ }
+}
+
+TEST(ECUtil, slice_iterator)
+{
+ stripe_info_t sinfo(2, 1, 2*4096);
+ shard_id_set out_set;
+ out_set.insert_range(shard_id_t(0), 3);
+ shard_extent_map_t sem(&sinfo);
+ {
+ auto iter = sem.begin_slice_iterator(out_set);
+ ASSERT_TRUE(iter.get_out_bufferptrs().empty());
+ }
+
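+  // Once buffers are inserted below, each slice is expected to cover a
+  // contiguous offset range over which the set of populated shards does not
+  // change: first [0, 4096) on shards 0 and 1, then [4096, 8192) on shard 0
+  // only.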
+ bufferlist a, b;
+ a.append_zero(8192);
+ a.c_str()[0] = 'A';
+ a.c_str()[4096] = 'C';
+ b.append_zero(4096);
+ b.c_str()[0] = 'B';
+
+ sem.insert_in_shard(shard_id_t(0), 0, a);
+ sem.insert_in_shard(shard_id_t(1), 0, b);
+ {
+ auto iter = sem.begin_slice_iterator(out_set);
+
+ {
+ auto out = iter.get_out_bufferptrs();
+ ASSERT_EQ(0, iter.get_offset());
+ ASSERT_EQ(4096, iter.get_length());
+ ASSERT_EQ(2, out.size());
+ ASSERT_EQ(4096, out[shard_id_t(0)].length());
+ ASSERT_EQ(4096, out[shard_id_t(1)].length());
+ ASSERT_EQ('A', out[shard_id_t(0)].c_str()[0]);
+ ASSERT_EQ('B', out[shard_id_t(1)].c_str()[0]);
+ }
+
+ ++iter;
+ {
+ auto out = iter.get_out_bufferptrs();
+
+ ASSERT_EQ(4096, iter.get_offset());
+ ASSERT_EQ(4096, iter.get_length());
+ ASSERT_FALSE(out.empty());
+ ASSERT_EQ(1, out.size());
+ ASSERT_EQ(4096, out[shard_id_t(0)].length());
+ ASSERT_EQ('C', out[shard_id_t(0)].c_str()[0]);
+ }
+
+ ++iter;
+ ASSERT_TRUE(iter.is_end());
+ }
+
+ // Create a gap.
+ bufferlist d, e;
+ d.append_zero(4096);
+ d.c_str()[0] = 'D';
+ e.append_zero(4096);
+ e.c_str()[0] = 'E';
+ sem.insert_in_shard(shard_id_t(0), 4096*4, d);
+ sem.insert_in_shard(shard_id_t(1), 4096*4, e);
+
+ {
+ auto iter = sem.begin_slice_iterator(out_set);
+
+ {
+ auto out = iter.get_out_bufferptrs();
+ ASSERT_EQ(0, iter.get_offset());
+ ASSERT_EQ(4096, iter.get_length());
+ ASSERT_FALSE(out.empty());
+ ASSERT_EQ(2, out.size());
+ ASSERT_EQ(4096, out[shard_id_t(0)].length());
+ ASSERT_EQ(4096, out[shard_id_t(1)].length());
+ ASSERT_EQ('A', out[shard_id_t(0)].c_str()[0]);
+ ASSERT_EQ('B', out[shard_id_t(1)].c_str()[0]);
+ }
+
+ ++iter;
+ {
+ auto out = iter.get_out_bufferptrs();
+ ASSERT_EQ(4096, iter.get_offset());
+ ASSERT_EQ(4096, iter.get_length());
+ ASSERT_FALSE(out.empty());
+ ASSERT_EQ(1, out.size());
+ ASSERT_EQ(4096, out[shard_id_t(0)].length());
+ ASSERT_EQ('C', out[shard_id_t(0)].c_str()[0]);
+ }
+
+ ++iter;
+ {
+ auto out = iter.get_out_bufferptrs();
+ ASSERT_EQ(4*4096, iter.get_offset());
+ ASSERT_EQ(4096, iter.get_length());
+ ASSERT_FALSE(out.empty());
+ ASSERT_EQ(2, out.size());
+ ASSERT_EQ(4096, out[shard_id_t(0)].length());
+ ASSERT_EQ('D', out[shard_id_t(0)].c_str()[0]);
+ ASSERT_EQ('E', out[shard_id_t(1)].c_str()[0]);
+ }
+
+ ++iter;
+ ASSERT_TRUE(iter.is_end());
+ }
+
+ // Multiple buffers in each shard and gap at start.
+ sem.clear();
+ a.clear();
+ a.append_zero(4096);
+ a.c_str()[0] = 'A';
+ bufferlist c;
+ c.append_zero(4096);
+ c.c_str()[0] = 'C';
+
+ sem.insert_in_shard(shard_id_t(0), 4096*1, a);
+ sem.insert_in_shard(shard_id_t(1), 4096*1, b);
+ sem.insert_in_shard(shard_id_t(0), 4096*2, c);
+ sem.insert_in_shard(shard_id_t(1), 4096*2, d);
+
+ {
+ auto iter = sem.begin_slice_iterator(out_set);
+
+ {
+ auto out = iter.get_out_bufferptrs();
+ ASSERT_EQ(4096, iter.get_offset());
+ ASSERT_EQ(4096, iter.get_length());
+ ASSERT_FALSE(out.empty());
+ ASSERT_EQ(2, out.size());
+ ASSERT_EQ(4096, out[shard_id_t(0)].length());
+ ASSERT_EQ(4096, out[shard_id_t(1)].length());
+ ASSERT_EQ('A', out[shard_id_t(0)].c_str()[0]);
+ ASSERT_EQ('B', out[shard_id_t(1)].c_str()[0]);
+ }
+
+ ++iter;
+ {
+ auto out = iter.get_out_bufferptrs();
+ ASSERT_EQ(2*4096, iter.get_offset());
+ ASSERT_EQ(4096, iter.get_length());
+ ASSERT_FALSE(out.empty());
+ ASSERT_EQ(2, out.size());
+ ASSERT_EQ(4096, out[shard_id_t(0)].length());
+ ASSERT_EQ(4096, out[shard_id_t(1)].length());
+ ASSERT_EQ('C', out[shard_id_t(0)].c_str()[0]);
+ ASSERT_EQ('D', out[shard_id_t(1)].c_str()[0]);
+ }
+
+ ++iter;
+ ASSERT_TRUE(iter.is_end());
+ }
+
+}
+
+TEST(ECUtil, slice_iterator_subset_out)
+{
+ stripe_info_t sinfo(2, 1, 2*4096);
+ shard_id_set out_set;
+ out_set.insert(shard_id_t(1));
+ shard_extent_map_t sem(&sinfo);
+ {
+ auto iter = sem.begin_slice_iterator(out_set);
+ ASSERT_TRUE(iter.get_in_bufferptrs().empty());
+ ASSERT_TRUE(iter.get_out_bufferptrs().empty());
+ }
+
+ bufferlist a, b;
+ a.append_zero(8192);
+ a.c_str()[0] = 'A';
+ a.c_str()[4096] = 'C';
+ b.append_zero(4096);
+ b.c_str()[0] = 'B';
+
+ sem.insert_in_shard(shard_id_t(0), 0, a);
+ sem.insert_in_shard(shard_id_t(1), 0, b);
+ {
+ auto iter = sem.begin_slice_iterator(out_set);
+
+ {
+ auto in = iter.get_in_bufferptrs();
+ auto out = iter.get_out_bufferptrs();
+ ASSERT_EQ(0, iter.get_offset());
+ ASSERT_EQ(4096, iter.get_length());
+ ASSERT_EQ(1, in.size());
+ ASSERT_EQ(1, out.size());
+ ASSERT_EQ(4096, in[shard_id_t(0)].length());
+ ASSERT_EQ(4096, out[shard_id_t(1)].length());
+ ASSERT_EQ('A', in[shard_id_t(0)].c_str()[0]);
+ ASSERT_EQ('B', out[shard_id_t(1)].c_str()[0]);
+ }
+
+ /* The iterator only cares about outputs, so doesn't care that there is an
+ * extra 4k to go.
+ */
+ ++iter;
+ ASSERT_TRUE(iter.is_end());
+ }
+
+ // Create a gap.
+ bufferlist d, e;
+ d.append_zero(4096);
+ d.c_str()[0] = 'D';
+ e.append_zero(4096);
+ e.c_str()[0] = 'E';
+ sem.insert_in_shard(shard_id_t(0), 4096*4, d);
+ sem.insert_in_shard(shard_id_t(1), 4096*4, e);
+
+ {
+ auto iter = sem.begin_slice_iterator(out_set);
+
+ {
+ auto in = iter.get_in_bufferptrs();
+ auto out = iter.get_out_bufferptrs();
+
+ ASSERT_EQ(0, iter.get_offset());
+ ASSERT_EQ(4096, iter.get_length());
+ ASSERT_FALSE(in.empty());
+ ASSERT_FALSE(out.empty());
+ ASSERT_EQ(1, in.size());
+ ASSERT_EQ(1, out.size());
+ ASSERT_EQ(4096, in[shard_id_t(0)].length());
+ ASSERT_EQ(4096, out[shard_id_t(1)].length());
+ ASSERT_EQ('A', in[shard_id_t(0)].c_str()[0]);
+ ASSERT_EQ('B', out[shard_id_t(1)].c_str()[0]);
+ }
+
+ // Skip the next 4k, since it is not in the output buffer.
+
+ ++iter;
+ {
+ auto in = iter.get_in_bufferptrs();
+ auto out = iter.get_out_bufferptrs();
+
+ ASSERT_EQ(4*4096, iter.get_offset());
+ ASSERT_EQ(4096, iter.get_length());
+ ASSERT_FALSE(in.empty());
+ ASSERT_FALSE(out.empty());
+ ASSERT_EQ(1, in.size());
+ ASSERT_EQ(1, out.size());
+ ASSERT_EQ(4096, in[shard_id_t(0)].length());
+ ASSERT_EQ('D', in[shard_id_t(0)].c_str()[0]);
+ ASSERT_EQ('E', out[shard_id_t(1)].c_str()[0]);
+ }
+
+ ++iter;
+ ASSERT_TRUE(iter.is_end());
+ }
+
+ // Multiple buffers in each shard and gap at start.
+ sem.clear();
+ a.clear();
+ a.append_zero(4096);
+ a.c_str()[0] = 'A';
+ bufferlist c;
+ c.append_zero(4096);
+ c.c_str()[0] = 'C';
+
+ sem.insert_in_shard(shard_id_t(0), 4096*1, a);
+ sem.insert_in_shard(shard_id_t(1), 4096*1, b);
+ sem.insert_in_shard(shard_id_t(0), 4096*2, c);
+ sem.insert_in_shard(shard_id_t(1), 4096*2, d);
+
+ {
+ auto iter = sem.begin_slice_iterator(out_set);
+
+ {
+ auto in = iter.get_in_bufferptrs();
+ auto out = iter.get_out_bufferptrs();
+
+ ASSERT_EQ(4096, iter.get_offset());
+ ASSERT_EQ(4096, iter.get_length());
+ ASSERT_FALSE(in.empty());
+ ASSERT_FALSE(out.empty());
+ ASSERT_EQ(1, in.size());
+ ASSERT_EQ(1, out.size());
+ ASSERT_EQ(4096, in[shard_id_t(0)].length());
+ ASSERT_EQ(4096, out[shard_id_t(1)].length());
+ ASSERT_EQ('A', in[shard_id_t(0)].c_str()[0]);
+ ASSERT_EQ('B', out[shard_id_t(1)].c_str()[0]);
+ }
+
+ ++iter;
+ {
+ auto in = iter.get_in_bufferptrs();
+ auto out = iter.get_out_bufferptrs();
+
+ ASSERT_EQ(2*4096, iter.get_offset());
+ ASSERT_EQ(4096, iter.get_length());
+ ASSERT_FALSE(in.empty());
+ ASSERT_FALSE(out.empty());
+ ASSERT_EQ(1, in.size());
+ ASSERT_EQ(1, out.size());
+ ASSERT_EQ(4096, in[shard_id_t(0)].length());
+ ASSERT_EQ(4096, out[shard_id_t(1)].length());
+ ASSERT_EQ('C', in[shard_id_t(0)].c_str()[0]);
+ ASSERT_EQ('D', out[shard_id_t(1)].c_str()[0]);
+ }
+
+ ++iter;
+ ASSERT_TRUE(iter.is_end());
+ }
+
+}
+
+
+TEST(ECUtil, object_size_to_shard_size)
+{
+  // These inputs verify that the returned shard sizes are aligned (rounded
+  // up) to the next 4k page.
+ std::vector<uint64_t> inputs = {0x4D000, 0x4CCFF, 0x4C001};
+
+ stripe_info_t sinfo(4, 2, 4*4096);
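+  // With k=4 and 4k chunks the stripe is 16k (0x4000). All three inputs lie in
+  // (0x4C000, 0x4D000], i.e. at most one chunk past 19 full stripes, so only
+  // shard 0 grows to 20 pages (0x14000); shards 1-3 stay at 19 pages (0x13000)
+  // and the parity shards (4 and 5) track the largest data shard.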
+ for (uint64_t input : inputs)
+ {
+ ASSERT_EQ(0x14000, sinfo.object_size_to_shard_size(input, shard_id_t(0)));
+ ASSERT_EQ(0x13000, sinfo.object_size_to_shard_size(input, shard_id_t(1)));
+ ASSERT_EQ(0x13000, sinfo.object_size_to_shard_size(input, shard_id_t(2)));
+ ASSERT_EQ(0x13000, sinfo.object_size_to_shard_size(input, shard_id_t(3)));
+ ASSERT_EQ(0x14000, sinfo.object_size_to_shard_size(input, shard_id_t(4)));
+ ASSERT_EQ(0x14000, sinfo.object_size_to_shard_size(input, shard_id_t(5)));
+ }
+
+ // Verify +/-1 also rounds correctly
+ ASSERT_EQ(0x13000, sinfo.object_size_to_shard_size(0x4C000, shard_id_t(0)));
+ ASSERT_EQ(0x14000, sinfo.object_size_to_shard_size(0x4D001, shard_id_t(1)));
+}
+
+TEST(ECUtil, slice)
+{
+ int k=4;
+ int m=2;
+ int chunk_size = 4096;
+ stripe_info_t sinfo(k, m, k*4096);
+ shard_extent_map_t sem(&sinfo);
+
+ extent_map emap;
+ buffer::list bl1k;
+ buffer::list bl4k;
+ buffer::list bl16k;
+ buffer::list bl64k;
+
+ bl1k.append_zero(1024);
+ bl4k.append_zero(4096);
+ bl16k.append_zero(chunk_size * k);
+ bl64k.append_zero(chunk_size * k * 4);
+ shard_extent_set_t ref(sinfo.get_k_plus_m());
+
+ sem.insert_in_shard(shard_id_t(1), 512, bl1k);
+ sem.insert_in_shard(shard_id_t(2), 5, bl4k);
+ sem.insert_in_shard(shard_id_t(3), 256, bl16k);
+ sem.insert_in_shard(shard_id_t(4), 5, bl64k);
+
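+  // slice_map(offset, length) is expected to clip each shard's extent map to
+  // the shard-offset range [offset, offset + length) and drop shards that end
+  // up empty; the blocks below cover interior, spanning and empty slices.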
+ {
+ auto slice_map = sem.slice_map(512, 1024);
+ ASSERT_EQ(4, slice_map.get_extent_maps().size());
+ ASSERT_EQ(512, slice_map.get_start_offset());
+ ASSERT_EQ(512+1024, slice_map.get_end_offset());
+
+ for (int i=1; i<5; i++) {
+ ASSERT_EQ(512, slice_map.get_extent_map(shard_id_t(i)).get_start_off());
+ ASSERT_EQ(512+1024, slice_map.get_extent_map(shard_id_t(i)).get_end_off());
+ }
+ }
+
+ {
+ auto slice_map = sem.slice_map(0, 4096);
+ ASSERT_EQ(4, slice_map.get_extent_maps().size());
+ ASSERT_EQ(5, slice_map.get_start_offset());
+ ASSERT_EQ(4096, slice_map.get_end_offset());
+ ASSERT_EQ(512, slice_map.get_extent_map(shard_id_t(1)).get_start_off());
+ ASSERT_EQ(512 + 1024, slice_map.get_extent_map(shard_id_t(1)).get_end_off());
+ ASSERT_EQ(5, slice_map.get_extent_map(shard_id_t(2)).get_start_off());
+ ASSERT_EQ(4096, slice_map.get_extent_map(shard_id_t(2)).get_end_off());
+ ASSERT_EQ(256, slice_map.get_extent_map(shard_id_t(3)).get_start_off());
+ ASSERT_EQ(4096, slice_map.get_extent_map(shard_id_t(3)).get_end_off());
+ ASSERT_EQ(5, slice_map.get_extent_map(shard_id_t(4)).get_start_off());
+ ASSERT_EQ(4096, slice_map.get_extent_map(shard_id_t(4)).get_end_off());
+ }
+
+ {
+ auto slice_map = sem.slice_map(0, 5);
+ ASSERT_TRUE(slice_map.empty());
+ }
+
+ {
+ auto slice_map = sem.slice_map(64*1024+5, 5);
+ ASSERT_TRUE(slice_map.empty());
+ }
+
+ {
+ auto slice_map = sem.slice_map(5, 64*1024);
+ ASSERT_EQ(slice_map, sem);
+ }
+
+ {
+ auto slice_map = sem.slice_map(0, 65*1024);
+ ASSERT_EQ(slice_map, sem);
+ }
+}
\ No newline at end of file
#include "osd/PGTransaction.h"
#include "osd/ECTransaction.h"
#include "common/debug.h"
+#include "osd/ECBackend.h"
#include "test/unit.cc"
#define dout_context g_ceph_context
-TEST(ectransaction, two_writes_separated)
+struct ECTestOp : ECCommon::RMWPipeline::Op {
+ PGTransactionUPtr t;
+};
+
+TEST(ectransaction, two_writes_separated_append)
{
hobject_t h;
- PGTransactionUPtr t(new PGTransaction);
+ PGTransaction::ObjectOperation op;
bufferlist a, b;
- t->create(h);
a.append_zero(565760);
- t->write(h, 0, a.length(), a, 0);
+ op.buffer_updates.insert(0, a.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{a, 0});
b.append_zero(2437120);
- t->write(h, 669856, b.length(), b, 0);
+ op.buffer_updates.insert(669856, b.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{b, 0});
- ECUtil::stripe_info_t sinfo(2, 2, 8192);
- auto plan = ECTransaction::get_write_plan(
+ pg_pool_t pool;
+ pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS);
+ ECUtil::stripe_info_t sinfo(2, 2, 8192, &pool);
+ shard_id_set shards;
+ shards.insert_range(shard_id_t(), 4);
+ ECTransaction::WritePlanObj plan(
+ h,
+ op,
sinfo,
- *t,
- [&](const hobject_t &i) {
- ECUtil::HashInfoRef ref(new ECUtil::HashInfo(1));
- return ref;
- },
- &dpp);
- generic_derr << "to_read " << plan.to_read << dendl;
- generic_derr << "will_write " << plan.will_write << dendl;
-
- ASSERT_EQ(0u, plan.to_read.size());
- ASSERT_EQ(1u, plan.will_write.size());
+ shards,
+ shards,
+ false,
+ 0,
+ std::nullopt,
+ std::nullopt,
+ ECUtil::HashInfoRef(new ECUtil::HashInfo(1)),
+ nullptr,
+ 0);
+
+ generic_derr << "plan " << plan << dendl;
+
+ ASSERT_FALSE(plan.to_read);
+ ASSERT_EQ(4u, plan.will_write.shard_count());
}
-TEST(ectransaction, two_writes_nearby)
+TEST(ectransaction, two_writes_separated_misaligned_overwrite)
{
hobject_t h;
- PGTransactionUPtr t(new PGTransaction);
+ PGTransaction::ObjectOperation op;
bufferlist a, b;
- t->create(h);
-
- // two nearby writes, both partly touching the same 8192-byte stripe
- ECUtil::stripe_info_t sinfo(2, 2, 8192);
a.append_zero(565760);
- t->write(h, 0, a.length(), a, 0);
+ op.buffer_updates.insert(0, a.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{a, 0});
b.append_zero(2437120);
- t->write(h, 569856, b.length(), b, 0);
+ op.buffer_updates.insert(669856, b.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{b, 0});
- auto plan = ECTransaction::get_write_plan(
+ pg_pool_t pool;
+ pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS);
+ ECUtil::stripe_info_t sinfo(2, 2, 8192, &pool, std::vector<shard_id_t>(0));
+ object_info_t oi;
+ oi.size = 3112960;
+ shard_id_set shards;
+ shards.insert_range(shard_id_t(), 4);
+
+ ECTransaction::WritePlanObj plan(
+ h,
+ op,
sinfo,
- *t,
- [&](const hobject_t &i) {
- ECUtil::HashInfoRef ref(new ECUtil::HashInfo(1));
- return ref;
- },
- &dpp);
- generic_derr << "to_read " << plan.to_read << dendl;
- generic_derr << "will_write " << plan.will_write << dendl;
-
- ASSERT_EQ(0u, plan.to_read.size());
- ASSERT_EQ(1u, plan.will_write.size());
+ shards,
+ shards,
+ false,
+ oi.size,
+ oi,
+ std::nullopt,
+ ECUtil::HashInfoRef(new ECUtil::HashInfo(1)),
+ nullptr,
+ 0);
+
+ generic_derr << "plan " << plan << dendl;
+
+ ASSERT_EQ(2u, (*plan.to_read).shard_count());
+ ASSERT_EQ(4u, plan.will_write.shard_count());
}
-TEST(ectransaction, many_writes)
+// Test writing to an object at an offset which is beyond the end of the
+// current object.
+TEST(ectransaction, partial_write)
{
hobject_t h;
- PGTransactionUPtr t(new PGTransaction);
- bufferlist a, b;
- a.append_zero(512);
- b.append_zero(4096);
- t->create(h);
-
- ECUtil::stripe_info_t sinfo(2, 2, 8192);
- // write 2801664~512
- // write 2802176~512
- // write 2802688~512
- // write 2803200~512
- t->write(h, 2801664, a.length(), a, 0);
- t->write(h, 2802176, a.length(), a, 0);
- t->write(h, 2802688, a.length(), a, 0);
- t->write(h, 2803200, a.length(), a, 0);
-
- // write 2805760~4096
- // write 2809856~4096
- // write 2813952~4096
- t->write(h, 2805760, b.length(), b, 0);
- t->write(h, 2809856, b.length(), b, 0);
- t->write(h, 2813952, b.length(), b, 0);
-
- auto plan = ECTransaction::get_write_plan(
+ PGTransaction::ObjectOperation op;
+ bufferlist a;
+
+ // Start by writing 8 bytes to the start of an object.
+ a.append_zero(8);
+ op.buffer_updates.insert(0, a.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{a, 0});
+
+ pg_pool_t pool;
+ pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS);
+ ECUtil::stripe_info_t sinfo(2, 1, 8192, &pool, std::vector<shard_id_t>(0));
+ object_info_t oi;
+ oi.size = 8;
+ shard_id_set shards;
+ shards.insert_range(shard_id_t(), 3);
+
+ ECTransaction::WritePlanObj plan(
+ h,
+ op,
+ sinfo,
+ shards,
+ shards,
+ false,
+ 0,
+ oi,
+ std::nullopt,
+ ECUtil::HashInfoRef(new ECUtil::HashInfo(1)),
+ nullptr,
+ 0);
+
+ generic_derr << "plan " << plan << dendl;
+
+  // The object is empty, so we should have no reads and a 4k write.
+ ASSERT_FALSE(plan.to_read);
+ extent_set ref_write;
+ ref_write.insert(0, 4096);
+ ASSERT_EQ(2u, plan.will_write.shard_count());
+ ASSERT_EQ(ref_write, plan.will_write.at(shard_id_t(0)));
+ ASSERT_EQ(ref_write, plan.will_write.at(shard_id_t(2)));
+}
+
+TEST(ectransaction, overlapping_write_non_aligned)
+{
+ hobject_t h;
+ PGTransaction::ObjectOperation op;
+ bufferlist a;
+
+ // Start by writing 8 bytes to the start of an object.
+ a.append_zero(8);
+ op.buffer_updates.insert(0, a.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{a, 0});
+
+ pg_pool_t pool;
+ pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS);
+ ECUtil::stripe_info_t sinfo(2, 1, 8192, &pool, std::vector<shard_id_t>(0));
+ object_info_t oi;
+ oi.size = 8;
+ shard_id_set shards;
+ shards.insert_range(shard_id_t(), 4);
+ ECTransaction::WritePlanObj plan(
+ h,
+ op,
+ sinfo,
+ shards,
+ shards,
+ false,
+ 8,
+ oi,
+ std::nullopt,
+ ECUtil::HashInfoRef(new ECUtil::HashInfo(1)),
+ nullptr,
+ 0);
+
+ generic_derr << "plan " << plan << dendl;
+
+  // The single required read should not overlap the data being written.
+ ASSERT_EQ(1u, (*plan.to_read).shard_count());
+ extent_set ref;
+ ref.insert(0, 4096);
+ ASSERT_EQ(2u, plan.will_write.shard_count());
+ ASSERT_EQ(1u, (*plan.to_read).shard_count());
+ ASSERT_EQ(ref, plan.will_write.at(shard_id_t(0)));
+ ASSERT_EQ(ref, plan.will_write.at(shard_id_t(2)));
+}
+
+TEST(ectransaction, test_appending_write_non_aligned)
+{
+ hobject_t h;
+ PGTransaction::ObjectOperation op;
+ bufferlist a;
+
+  // Write a single 4k page at offset 12k, i.e. three pages into the object.
+ a.append_zero(4096);
+ op.buffer_updates.insert(3*4096, a.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{a, 0});
+
+ pg_pool_t pool;
+ pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS);
+ ECUtil::stripe_info_t sinfo(2, 1, 8192, &pool, std::vector<shard_id_t>(0));
+ object_info_t oi;
+ oi.size = 4*4096;
+ shard_id_set shards;
+ shards.insert_range(shard_id_t(), 4);
+ ECTransaction::WritePlanObj plan(
+ h,
+ op,
+ sinfo,
+ shards,
+ shards,
+ false,
+ 8,
+ oi,
+ std::nullopt,
+ ECUtil::HashInfoRef(new ECUtil::HashInfo(1)),
+ nullptr,
+ 0);
+
+ generic_derr << "plan " << plan << dendl;
+
+  // We are growing an object from zero with a hole.
+ ASSERT_FALSE(plan.to_read);
+
+  // The writes will not cover the zero parts.
+ ECUtil::shard_extent_set_t ref_write(sinfo.get_k_plus_m());
+ ref_write[shard_id_t(1)].insert(4096, 4096);
+ ref_write[shard_id_t(2)].insert(4096, 4096);
+ ASSERT_EQ(ref_write, plan.will_write);
+}
+
+TEST(ectransaction, append_with_large_hole)
+{
+ hobject_t h;
+ PGTransaction::ObjectOperation op;
+ bufferlist a;
+
+ // We have a 4k write quite a way after the current limit of a 4k object
+ a.append_zero(4096);
+ op.buffer_updates.insert(24*4096, a.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{a, 0});
+
+ pg_pool_t pool;
+ pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS);
+ ECUtil::stripe_info_t sinfo(2, 1, 8192, &pool, std::vector<shard_id_t>(0));
+ object_info_t oi;
+ oi.size = 25*4096;
+ shard_id_set shards;
+ shards.insert_range(shard_id_t(), 4);
+ ECTransaction::WritePlanObj plan(
+ h,
+ op,
sinfo,
- *t,
- [&](const hobject_t &i) {
- ECUtil::HashInfoRef ref(new ECUtil::HashInfo(1));
- return ref;
- },
- &dpp);
- generic_derr << "to_read " << plan.to_read << dendl;
- generic_derr << "will_write " << plan.will_write << dendl;
-
- ASSERT_EQ(0u, plan.to_read.size());
- ASSERT_EQ(1u, plan.will_write.size());
+ shards,
+ shards,
+ false,
+ 4096,
+ oi,
+ std::nullopt,
+ ECUtil::HashInfoRef(new ECUtil::HashInfo(1)),
+ nullptr,
+ 0);
+
+ generic_derr << "plan " << plan << dendl;
+
+ // Should not require any reads.
+ ASSERT_FALSE(plan.to_read);
+
+ // The writes will cover the new zero parts.
+ ECUtil::shard_extent_set_t ref_write(sinfo.get_k_plus_m());
+ ref_write[shard_id_t(0)].insert(12*4096, 4096);
+ ref_write[shard_id_t(2)].insert(12*4096, 4096);
+ ASSERT_EQ(ref_write, plan.will_write);
}
+
+TEST(ectransaction, test_append_not_page_aligned_with_large_hole)
+{
+ hobject_t h;
+ PGTransaction::ObjectOperation op;
+ bufferlist a;
+
+  // We have a misaligned 2k write quite a way after the current limit of a 4k object.
+ a.append_zero(2048);
+ op.buffer_updates.insert(24*4096 + 1024, a.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{a, 0});
+
+ pg_pool_t pool;
+ pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS);
+ ECUtil::stripe_info_t sinfo(2, 1, 8192, &pool, std::vector<shard_id_t>(0));
+ object_info_t oi;
+ oi.size = 25*4096;
+ shard_id_set shards;
+ shards.insert_range(shard_id_t(), 3);
+ ECTransaction::WritePlanObj plan(
+ h,
+ op,
+ sinfo,
+ shards,
+ shards,
+ false,
+ 4096,
+ oi,
+ std::nullopt,
+ ECUtil::HashInfoRef(new ECUtil::HashInfo(1)),
+ nullptr,
+ 0);
+
+ generic_derr << "plan " << plan << dendl;
+
+ // No reads (because not yet written)
+ ASSERT_FALSE(plan.to_read);
+
+ // Writes should grow to 4k
+ ECUtil::shard_extent_set_t ref_write(sinfo.get_k_plus_m());
+ ref_write[shard_id_t(0)].insert(12*4096, 4096);
+ ref_write[shard_id_t(2)].insert(12*4096, 4096);
+ ASSERT_EQ(ref_write, plan.will_write);
+}
+
+TEST(ectransaction, test_overwrite_with_missing)
+{
+ hobject_t h;
+ PGTransaction::ObjectOperation op, op2;
+ bufferlist a;
+
+  // Overwrite the first 14k of a 42k object with one shard unavailable.
+ a.append_zero(14*1024);
+ op.buffer_updates.insert(0, a.length(), PGTransaction::ObjectOperation::BufferUpdate::Write{a, 0});
+
+ pg_pool_t pool;
+ pool.set_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS);
+ ECUtil::stripe_info_t sinfo(2, 1, 8192, &pool, std::vector<shard_id_t>(0));
+ object_info_t oi;
+ oi.size = 42*1024;
+ shard_id_set shards;
+ shards.insert(shard_id_t(0));
+ shards.insert(shard_id_t(1));
+
+ ECTransaction::WritePlanObj plan(
+ h,
+ op,
+ sinfo,
+ shards,
+ shards,
+ false,
+ 42*1024,
+ oi,
+ std::nullopt,
+ ECUtil::HashInfoRef(new ECUtil::HashInfo(1)),
+ nullptr,
+ 0);
+
+ generic_derr << "plan " << plan << dendl;
+
+  // Shard 1's partially-overwritten page must be read for the RMW.
+ ASSERT_TRUE(plan.to_read);
+ ECUtil::shard_extent_set_t ref_read(sinfo.get_k_plus_m());
+ ref_read[shard_id_t(1)].insert(4096, 4096);
+ ASSERT_EQ(ref_read, plan.to_read);
+
+  // Writes cover the full 8k on both available shards.
+ ECUtil::shard_extent_set_t ref_write(sinfo.get_k_plus_m());
+ ref_write[shard_id_t(0)].insert(0, 8192);
+ ref_write[shard_id_t(1)].insert(0, 8192);
+ ASSERT_EQ(ref_write, plan.will_write);
+}
\ No newline at end of file
#include <gtest/gtest.h>
-#include "osd/ExtentCache.h"
-#include <iostream>
+#include "osd/ECExtentCache.h"
using namespace std;
+using namespace ECUtil;
-extent_map imap_from_vector(vector<pair<uint64_t, uint64_t> > &&in)
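+/* Builds a shard_extent_map_t from per-shard (offset, length) pairs: the outer
+ * vector index is the shard id and each extent is backed by a zero-filled
+ * buffer.
+ */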
+shard_extent_map_t imap_from_vector(vector<vector<pair<uint64_t, uint64_t>>> &&in, stripe_info_t const *sinfo)
{
- extent_map out;
- for (auto &&tup: in) {
- bufferlist bl;
- bl.append_zero(tup.second);
- out.insert(tup.first, bl.length(), bl);
+ shard_extent_map_t out(sinfo);
+ for (int shard = 0; shard < (int)in.size(); shard++) {
+ for (auto &&tup: in[shard]) {
+ bufferlist bl;
+ bl.append_zero(tup.second);
+ out.insert_in_shard(shard_id_t(shard), tup.first, bl);
+ }
}
return out;
}
-extent_map imap_from_iset(const extent_set &set)
+shard_extent_map_t imap_from_iset(const shard_extent_set_t &sset, stripe_info_t *sinfo)
{
- extent_map out;
- for (auto &&iter: set) {
- bufferlist bl;
- bl.append_zero(iter.second);
- out.insert(iter.first, iter.second, bl);
+ shard_extent_map_t out(sinfo);
+
+ for (auto &&[shard, set]: sset) {
+ for (auto &&iter: set) {
+ bufferlist bl;
+ bl.append_zero(iter.second);
+ out.insert_in_shard(shard, iter.first, bl);
+ }
}
return out;
}
-extent_set iset_from_vector(vector<pair<uint64_t, uint64_t> > &&in)
+shard_extent_set_t iset_from_vector(vector<vector<pair<uint64_t, uint64_t>>> &&in, const stripe_info_t *sinfo)
{
- extent_set out;
- for (auto &&tup: in) {
- out.insert(tup.first, tup.second);
+ shard_extent_set_t out(sinfo->get_k_plus_m());
+ for (int shard = 0; shard < (int)in.size(); shard++) {
+ for (auto &&tup: in[shard]) {
+ out[shard_id_t(shard)].insert(tup.first, tup.second);
+ }
}
return out;
}
-TEST(extentcache, simple_write)
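+/* Test double standing in for the cache's backend: it records the read the
+ * cache requests via backend_read, collects completed results in cache_ready,
+ * and lets the tests complete reads and writes with zero-filled buffers, since
+ * buffer contents are irrelevant to the cache logic under test.
+ */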
+struct Client : public ECExtentCache::BackendReadListener
{
- hobject_t oid;
-
- ExtentCache c;
- ExtentCache::write_pin pin;
- c.open_write_pin(pin);
-
- auto to_read = iset_from_vector(
- {{0, 2}, {8, 2}, {20, 2}});
- auto to_write = iset_from_vector(
- {{0, 10}, {20, 4}});
- auto must_read = c.reserve_extents_for_rmw(
- oid, pin, to_write, to_read);
- ASSERT_EQ(
- must_read,
- to_read);
-
- c.print(std::cerr);
-
- auto got = imap_from_iset(must_read);
- auto pending_read = to_read;
- pending_read.subtract(must_read);
-
- auto pending = c.get_remaining_extents_for_rmw(
- oid,
- pin,
- pending_read);
- ASSERT_TRUE(pending.empty());
-
- auto write_map = imap_from_iset(to_write);
- c.present_rmw_update(
- oid,
- pin,
- write_map);
-
- c.release_write_pin(pin);
+ hobject_t oid = hobject_t().make_temp_hobject("My first object");
+ stripe_info_t sinfo;
+ ECExtentCache::LRU lru;
+ ECExtentCache cache;
+ optional<shard_extent_set_t> active_reads;
+ list<shard_extent_map_t> results;
+
+ Client(uint64_t chunk_size, int k, int m, uint64_t cache_size) :
+ sinfo(k, m, k*chunk_size, vector<shard_id_t>(0)),
+ lru(cache_size), cache(*this, lru, sinfo, g_ceph_context) {};
+
+ void backend_read(hobject_t _oid, const shard_extent_set_t& request,
+ uint64_t object_size) override {
+ ceph_assert(oid == _oid);
+ active_reads = request;
+ }
+
+ void cache_ready(const hobject_t& _oid, const shard_extent_map_t& _result)
+ {
+ ceph_assert(oid == _oid);
+ results.emplace_back(_result);
+ }
+
+ void complete_read()
+ {
+ auto reads_done = imap_from_iset(*active_reads, &sinfo);
+    active_reads.reset(); // Reset before read_done, as it may call back into backend_read.
+ cache.read_done(oid, std::move(reads_done));
+ }
+
+ void complete_write(ECExtentCache::OpRef &op)
+ {
+ shard_extent_map_t emap = imap_from_iset(op->get_writes(), &sinfo);
+    // Fill in the parity. Parity correctness does not matter to the cache.
+ emap.insert_parity_buffers();
+ results.clear();
+ cache.write_done(op, std::move(emap));
+ }
+
+ void cache_execute(ECExtentCache::OpRef &op)
+ {
+ list<ECExtentCache::OpRef> l;
+ l.emplace_back(op);
+ cache.execute(l);
+ }
+
+ const stripe_info_t *get_stripe_info() const { return &sinfo; }
+};
+
+TEST(ECExtentCache, double_write_done)
+{
+ Client cl(32, 2, 1, 64);
+
+ auto to_write = iset_from_vector({{{0, 10}}, {{0, 10}}}, cl.get_stripe_info());
+
+ optional op = cl.cache.prepare(cl.oid, nullopt, to_write, 10, 10, false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ cl.cache_execute(*op);
+ cl.complete_write(*op);
+}
+
+TEST(ECExtentCache, simple_write)
+{
+ Client cl(32, 2, 1, 64);
+ {
+ auto to_read = iset_from_vector( {{{0, 2}}, {{0, 2}}}, cl.get_stripe_info());
+ auto to_write = iset_from_vector({{{0, 10}}, {{0, 10}}}, cl.get_stripe_info());
+
+    /* OpRef request(hobject_t const &oid,
+                     std::optional<shard_extent_set_t> const &to_read,
+                     shard_extent_set_t const &write,
+                     uint64_t orig_size,
+                     uint64_t projected_size,
+                     CacheReadyCb &&ready_cb)
+     */
+
+ optional op = cl.cache.prepare(cl.oid, to_read, to_write, 10, 10, false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ cl.cache_execute(*op);
+ ASSERT_EQ(to_read, cl.active_reads);
+ ASSERT_TRUE(cl.results.empty());
+ cl.complete_read();
+
+ ASSERT_FALSE(cl.active_reads);
+ ASSERT_EQ(1, cl.results.size());
+ ASSERT_EQ(to_read, cl.results.front().get_extent_set());
+ cl.complete_write(*op);
+
+ ASSERT_FALSE(cl.active_reads);
+ ASSERT_TRUE(cl.results.empty());
+ op.reset();
+ }
+
+  // Repeating the same read should complete without a backend read.
+ {
+ auto to_read = iset_from_vector( {{{0, 2}}, {{0, 2}}}, cl.get_stripe_info());
+ auto to_write = iset_from_vector({{{0, 10}}, {{0, 10}}}, cl.get_stripe_info());
+ optional op = cl.cache.prepare(cl.oid, to_read, to_write, 10, 10, false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ cl.cache_execute(*op);
+ ASSERT_FALSE(cl.active_reads);
+ ASSERT_FALSE(cl.results.empty());
+ ASSERT_EQ(1, cl.results.size());
+ ASSERT_EQ(to_read, cl.results.front().get_extent_set());
+ cl.complete_write(*op);
+ op.reset();
+ }
+
+  // Perform a read overlapping with the previous write, but not the previous
+  // read. This should not result in any backend reads, since the read can be
+  // honoured from the previously written data still held in the cache.
+ {
+ auto to_read = iset_from_vector( {{{2, 2}}, {{2, 2}}}, cl.get_stripe_info());
+ auto to_write = iset_from_vector({{{0, 10}}, {{0, 10}}}, cl.get_stripe_info());
+ optional op = cl.cache.prepare(cl.oid, to_read, to_write, 10, 10, false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ cl.cache_execute(*op);
+
+    // Should have remained in the LRU.
+ ASSERT_FALSE(cl.active_reads);
+ ASSERT_EQ(1, cl.results.size());
+ ASSERT_EQ(to_read, cl.results.front().get_extent_set());
+ cl.complete_write(*op);
+ op.reset();
+ }
+}
+
+TEST(ECExtentCache, sequential_appends) {
+ Client cl(32, 2, 1, 32);
+
+ auto to_write1 = iset_from_vector({{{0, 10}}}, cl.get_stripe_info());
+
+ // The first write...
+ optional op1 = cl.cache.prepare(cl.oid, nullopt, to_write1, 0, 10, false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ cl.cache_execute(*op1);
+
+ // Write should have been honoured immediately.
+ ASSERT_FALSE(cl.results.empty());
+ auto to_write2 = iset_from_vector({{{10, 10}}}, cl.get_stripe_info());
+ cl.complete_write(*op1);
+ ASSERT_TRUE(cl.results.empty());
+
+  // The second write, appending directly after the first.
+  optional op2 = cl.cache.prepare(cl.oid, nullopt, to_write2, 10, 20, false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ cl.cache_execute(*op2);
+
+ ASSERT_FALSE(cl.results.empty());
+ cl.complete_write(*op2);
+
}
-TEST(extentcache, write_write_overlap)
+TEST(ECExtentCache, multiple_writes)
{
- hobject_t oid;
-
- ExtentCache c;
- ExtentCache::write_pin pin;
- c.open_write_pin(pin);
-
- // start write 1
- auto to_read = iset_from_vector(
- {{0, 2}, {8, 2}, {20, 2}});
- auto to_write = iset_from_vector(
- {{0, 10}, {20, 4}});
- auto must_read = c.reserve_extents_for_rmw(
- oid, pin, to_write, to_read);
- ASSERT_EQ(
- must_read,
- to_read);
-
- c.print(std::cerr);
-
- // start write 2
- ExtentCache::write_pin pin2;
- c.open_write_pin(pin2);
- auto to_read2 = iset_from_vector(
- {{2, 4}, {10, 4}, {18, 4}});
- auto to_write2 = iset_from_vector(
- {{2, 12}, {18, 12}});
- auto must_read2 = c.reserve_extents_for_rmw(
- oid, pin2, to_write2, to_read2);
- ASSERT_EQ(
- must_read2,
- iset_from_vector({{10, 4}, {18, 2}}));
-
- c.print(std::cerr);
-
- // complete read for write 1 and start commit
- auto got = imap_from_iset(must_read);
- auto pending_read = to_read;
- pending_read.subtract(must_read);
- auto pending = c.get_remaining_extents_for_rmw(
- oid,
- pin,
- pending_read);
- ASSERT_TRUE(pending.empty());
-
- auto write_map = imap_from_iset(to_write);
- c.present_rmw_update(
- oid,
- pin,
- write_map);
-
- c.print(std::cerr);
-
- // complete read for write 2 and start commit
- auto pending_read2 = to_read2;
- pending_read2.subtract(must_read2);
- auto pending2 = c.get_remaining_extents_for_rmw(
- oid,
- pin2,
- pending_read2);
- ASSERT_EQ(
- pending2,
- imap_from_iset(pending_read2));
-
- auto write_map2 = imap_from_iset(to_write2);
- c.present_rmw_update(
- oid,
- pin2,
- write_map2);
-
- c.print(std::cerr);
-
- c.release_write_pin(pin);
-
- c.print(std::cerr);
-
- c.release_write_pin(pin2);
+ Client cl(32, 2, 1, 32);
+
+ auto to_read1 = iset_from_vector( {{{0, 2}}}, cl.get_stripe_info());
+ auto to_write1 = iset_from_vector({{{0, 10}}}, cl.get_stripe_info());
+
+ // This should drive a request for this IO, which we do not yet honour.
+ optional op1 = cl.cache.prepare(cl.oid, to_read1, to_write1, 10, 10, false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ cl.cache_execute(*op1);
+ ASSERT_EQ(to_read1, cl.active_reads);
+ ASSERT_TRUE(cl.results.empty());
+
+ // Perform another request. We should not see any change in the read requests.
+ auto to_read2 = iset_from_vector( {{{8, 4}}}, cl.get_stripe_info());
+ auto to_write2 = iset_from_vector({{{10, 10}}}, cl.get_stripe_info());
+ optional op2 = cl.cache.prepare(cl.oid, to_read2, to_write2, 10, 10, false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ cl.cache_execute(*op2);
+ ASSERT_EQ(to_read1, cl.active_reads);
+ ASSERT_TRUE(cl.results.empty());
+
+  // Perform another request, this time to check that reads are coalesced.
+ auto to_read3 = iset_from_vector( {{{32, 6}}}, cl.get_stripe_info());
+ auto to_write3 = iset_from_vector({}, cl.get_stripe_info());
+ optional op3 = cl.cache.prepare(cl.oid, to_read3, to_write3, 10, 10, false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ cl.cache_execute(*op3);
+ ASSERT_EQ(to_read1, cl.active_reads);
+ ASSERT_TRUE(cl.results.empty());
+
+ // Finally op4, with no reads.
+ auto to_write4 = iset_from_vector({{{20, 10}}}, cl.get_stripe_info());
+ optional op4 = cl.cache.prepare(cl.oid, nullopt, to_write4, 10, 10, false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ cl.cache_execute(*op4);
+ ASSERT_EQ(to_read1, cl.active_reads);
+ ASSERT_TRUE(cl.results.empty());
+
+ // Completing the first read will allow the first write and start a batched read.
+ // Note that the cache must not read what was written in op 1.
+ cl.complete_read();
+ auto expected_read = iset_from_vector({{{10,2}, {32,6}}}, cl.get_stripe_info());
+ ASSERT_EQ(expected_read, cl.active_reads);
+ ASSERT_EQ(1, cl.results.size());
+ ASSERT_EQ(to_read1, cl.results.front().get_extent_set());
+ cl.complete_write(*op1);
+
+ // The next write requires some more reads, so should not occur.
+ ASSERT_TRUE(cl.results.empty());
+
+ // All reads complete, this should allow for op2 to be ready.
+ cl.complete_read();
+ ASSERT_FALSE(cl.active_reads);
+ ASSERT_EQ(3, cl.results.size());
+ auto result = cl.results.begin();
+ ASSERT_EQ(to_read2, result++->get_extent_set());
+ ASSERT_EQ(to_read3, result++->get_extent_set());
+ ASSERT_TRUE(result++->empty());
+
+ cl.complete_write(*op2);
+ cl.complete_write(*op3);
+ cl.complete_write(*op4);
+
+ op1.reset();
+ op2.reset();
+ op3.reset();
+ op4.reset();
}
-TEST(extentcache, write_write_overlap2)
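+/* dummies counts live Dummy instances so the on_change test below can observe
+ * when a lambda capture is actually released by the cache.
+ */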
+int dummies;
+struct Dummy
+{
+ Dummy() {dummies++;}
+ ~Dummy() {dummies--;}
+};
+
+TEST(ECExtentCache, on_change)
{
- hobject_t oid;
-
- ExtentCache c;
- ExtentCache::write_pin pin;
- c.open_write_pin(pin);
-
- // start write 1
- auto to_read = extent_set();
- auto to_write = iset_from_vector(
- {{659456, 4096}});
- auto must_read = c.reserve_extents_for_rmw(
- oid, pin, to_write, to_read);
- ASSERT_EQ(
- must_read,
- to_read);
-
- c.print(std::cerr);
-
- // start write 2
- ExtentCache::write_pin pin2;
- c.open_write_pin(pin2);
- auto to_read2 = extent_set();
- auto to_write2 = iset_from_vector(
- {{663552, 4096}});
- auto must_read2 = c.reserve_extents_for_rmw(
- oid, pin2, to_write2, to_read2);
- ASSERT_EQ(
- must_read2,
- to_read2);
-
-
- // start write 3
- ExtentCache::write_pin pin3;
- c.open_write_pin(pin3);
- auto to_read3 = iset_from_vector({{659456, 8192}});
- auto to_write3 = iset_from_vector({{659456, 8192}});
- auto must_read3 = c.reserve_extents_for_rmw(
- oid, pin3, to_write3, to_read3);
- ASSERT_EQ(
- must_read3,
- extent_set());
-
- c.print(std::cerr);
-
- // complete read for write 1 and start commit
- auto got = imap_from_iset(must_read);
- auto pending_read = to_read;
- pending_read.subtract(must_read);
- auto pending = c.get_remaining_extents_for_rmw(
- oid,
- pin,
- pending_read);
- ASSERT_TRUE(pending.empty());
-
- auto write_map = imap_from_iset(to_write);
- c.present_rmw_update(
- oid,
- pin,
- write_map);
-
- c.print(std::cerr);
-
- // complete read for write 2 and start commit
- auto pending_read2 = to_read2;
- pending_read2.subtract(must_read2);
- auto pending2 = c.get_remaining_extents_for_rmw(
- oid,
- pin2,
- pending_read2);
- ASSERT_EQ(
- pending2,
- imap_from_iset(pending_read2));
-
- auto write_map2 = imap_from_iset(to_write2);
- c.present_rmw_update(
- oid,
- pin2,
- write_map2);
-
- // complete read for write 2 and start commit
- auto pending_read3 = to_read3;
- pending_read3.subtract(must_read3);
- auto pending3 = c.get_remaining_extents_for_rmw(
- oid,
- pin3,
- pending_read3);
- ASSERT_EQ(
- pending3,
- imap_from_iset(pending_read3));
-
- auto write_map3 = imap_from_iset(to_write3);
- c.present_rmw_update(
- oid,
- pin3,
- write_map3);
-
-
- c.print(std::cerr);
-
- c.release_write_pin(pin);
-
- c.print(std::cerr);
-
- c.release_write_pin(pin2);
-
- c.print(std::cerr);
-
- c.release_write_pin(pin3);
+ Client cl(32, 2, 1, 64);
+ auto to_read1 = iset_from_vector( {{{0, 2}}}, cl.get_stripe_info());
+ auto to_write1 = iset_from_vector({{{0, 10}}}, cl.get_stripe_info());
+
+ optional<ECExtentCache::OpRef> op;
+ optional<shared_ptr<Dummy>> dummy;
+
+ dummy.emplace(make_shared<Dummy>());
+ ceph_assert(dummies == 1);
+ {
+ shared_ptr<Dummy> d = *dummy;
+ /* Here we generate an op that we never expect to be completed. Note that
+ * some static code analysis tools suggest deleting d here. DO NOT DO THIS
+ * as we are relying on side effects from the destruction of d in this test.
+ */
+ op.emplace(cl.cache.prepare(cl.oid, to_read1, to_write1, 10, 10, false,
+ [d](ECExtentCache::OpRef &ignored)
+ {
+ ceph_abort("Should be cancelled");
+ }));
+ }
+ cl.cache_execute(*op);
+
+ /* We now have the following graph of objects:
+ * cache -- op -- lambda -- d
+ * dummy --/
+ */
+ ASSERT_EQ(1, dummies);
+
+ /* Executing the on_change will "cancel" this cache op. This will cause it
+ * to release the lambda, reducing us down to dummy -- d
+ */
+ cl.cache.on_change();
+ ASSERT_EQ(1, dummies);
+
+ /* This emulates the rmw pipeline clearing outstanding IO. We now have no
+ * references to d, so we should have destructed the object.
+   */
+ dummy.reset();
+ ASSERT_EQ(0, dummies);
+
+ /* Keeping the op alive here is emulating the dummy keeping a record of the
+   * cache op. It will also be destroyed at this point by the rmw pipeline.
+ */
+ ASSERT_FALSE(cl.cache.idle());
+ op.reset();
+ ASSERT_TRUE(cl.cache.idle());
+
+ // The cache has its own asserts, which we should honour.
+ cl.cache.on_change2();
+}
+
+TEST(ECExtentCache, multiple_misaligned_writes)
+{
+ Client cl(256*1024, 2, 1, 1024*1024);
+
+ // IO 1 is really a 6k write. The write is inflated to 8k, but the second 4k is
+ // partial, so we read the second 4k to RMW
+ auto to_read1 = iset_from_vector( {{{4*1024, 4*1024}}}, cl.get_stripe_info());
+ auto to_write1 = iset_from_vector({{{0, 8*1024}}}, cl.get_stripe_info());
+
+ // IO 2 is the next 8k write, starting at 6k. So we have a 12k write, reading the
+ // first and last pages. The first part of this read should be in the cache.
+ auto to_read2 = iset_from_vector( {{{4*1024, 4*1024}, {12*4096, 4*4096}}}, cl.get_stripe_info());
+ auto to_read2_exec = iset_from_vector( {{{12*4096, 4*4096}}}, cl.get_stripe_info());
+ auto to_write2 = iset_from_vector({{{4*1024, 12*1024}}}, cl.get_stripe_info());
+
+  // IO 3 is the next misaligned 4k, very similar to IO 2.
+ auto to_read3 = iset_from_vector( {{{12*1024, 4*1024}, {20*4096, 4*4096}}}, cl.get_stripe_info());
+ auto to_read3_exec = iset_from_vector( {{{20*4096, 4*4096}}}, cl.get_stripe_info());
+ auto to_write3 = iset_from_vector({{{12*1024, 12*1024}}}, cl.get_stripe_info());
+
+  // Perform the first write, which should result in a read.
+ optional op1 = cl.cache.prepare(cl.oid, to_read1, to_write1, 22*1024, 22*1024, false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ cl.cache_execute(*op1);
+ ASSERT_EQ(to_read1, cl.active_reads);
+ ASSERT_TRUE(cl.results.empty());
+
+ // Submit the second IO.
+ optional op2 = cl.cache.prepare(cl.oid, to_read2, to_write2, 22*1024, 22*1024, false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ cl.cache_execute(*op2);
+ // We should still be executing read 1.
+ ASSERT_EQ(to_read1, cl.active_reads);
+ ASSERT_TRUE(cl.results.empty());
+
+ // Allow the read to complete. We should now have op1 done...
+ cl.complete_read();
+ ASSERT_EQ(to_read2_exec, cl.active_reads);
+ ASSERT_FALSE(cl.results.empty());
+ cl.complete_write(*op1);
+
+ // And move on to op3
+ optional op3 = cl.cache.prepare(cl.oid, to_read3, to_write3, 22*1024, 22*1024, false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ cl.cache_execute(*op3);
+  // We should still be executing read 2.
+ ASSERT_EQ(to_read2_exec, cl.active_reads);
+ ASSERT_TRUE(cl.results.empty());
+
+ // Allow the read to complete. We should now have op2 done...
+ cl.complete_read();
+ ASSERT_EQ(to_read3_exec, cl.active_reads);
+ ASSERT_FALSE(cl.results.empty());
+ cl.complete_write(*op2);
+ ASSERT_EQ(to_read3_exec, cl.active_reads);
+ ASSERT_TRUE(cl.results.empty());
+ cl.complete_read();
+ ASSERT_FALSE(cl.results.empty());
+ cl.complete_write(*op3);
+
}
+
+TEST(ECExtentCache, multiple_misaligned_writes2)
+{
+ Client cl(256*1024, 2, 1, 1024*1024);
+
+ // IO 1 is really a 6k write. The write is inflated to 8k, but the second 4k is
+ // partial, so we read the second 4k to RMW
+ auto to_read1 = iset_from_vector( {{{4*1024, 4*1024}}}, cl.get_stripe_info());
+ auto to_write1 = iset_from_vector({{{0, 8*1024}}}, cl.get_stripe_info());
+
+ // IO 2 is the next 8k write, starting at 6k. So we have a 12k write, reading the
+ // first and last pages. The first part of this read should be in the cache.
+ auto to_read2 = iset_from_vector( {{{4*1024, 4*1024}, {12*1024, 4*1024}}}, cl.get_stripe_info());
+ auto to_read2_exec = iset_from_vector( {{{12*1024, 4*1024}}}, cl.get_stripe_info());
+ auto to_write2 = iset_from_vector({{{4*1024, 12*1024}}}, cl.get_stripe_info());
+
+  // IO 3 is the next misaligned 4k, very similar to IO 2.
+ auto to_read3 = iset_from_vector( {{{12*1024, 4*1024}, {20*1024, 4*1024}}}, cl.get_stripe_info());
+ auto to_read3_exec = iset_from_vector( {{{20*1024, 4*1024}}}, cl.get_stripe_info());
+ auto to_write3 = iset_from_vector({{{12*1024, 12*1024}}}, cl.get_stripe_info());
+
+  // Perform the first write, which should result in a read.
+ optional op1 = cl.cache.prepare(cl.oid, to_read1, to_write1, 22*1024, 22*1024, false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ cl.cache_execute(*op1);
+ ASSERT_EQ(to_read1, cl.active_reads);
+ ASSERT_TRUE(cl.results.empty());
+
+ // Submit the second IO.
+ optional op2 = cl.cache.prepare(cl.oid, to_read2, to_write2, 22*1024, 22*1024, false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ cl.cache_execute(*op2);
+ // We should still be executing read 1.
+ ASSERT_EQ(to_read1, cl.active_reads);
+ ASSERT_TRUE(cl.results.empty());
+
+ // Allow the read to complete. We should now have op1 done...
+ cl.complete_read();
+ ASSERT_EQ(to_read2_exec, cl.active_reads);
+ ASSERT_FALSE(cl.results.empty());
+ cl.complete_write(*op1);
+
+ // And move on to op3
+ optional op3 = cl.cache.prepare(cl.oid, to_read3, to_write3, 22*1024, 22*1024, false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ cl.cache_execute(*op3);
+  // We should still be executing read 2.
+ ASSERT_EQ(to_read2_exec, cl.active_reads);
+ ASSERT_TRUE(cl.results.empty());
+
+ // Allow the read to complete. We should now have op2 done...
+ cl.complete_read();
+ ASSERT_EQ(to_read3_exec, cl.active_reads);
+ ASSERT_FALSE(cl.results.empty());
+ cl.complete_write(*op2);
+ ASSERT_EQ(to_read3_exec, cl.active_reads);
+ ASSERT_TRUE(cl.results.empty());
+ cl.complete_read();
+ ASSERT_FALSE(cl.results.empty());
+ cl.complete_write(*op3);
+
+}
+
+TEST(ECExtentCache, test_invalidate)
+{
+ Client cl(256*1024, 2, 1, 1024*1024);
+
+ /* First attempt a write which does not do any reads */
+ {
+ auto to_read1 = iset_from_vector( {{{0, 4096}}}, cl.get_stripe_info());
+ auto to_write1 = iset_from_vector({{{0, 4096}}}, cl.get_stripe_info());
+ optional op1 = cl.cache.prepare(cl.oid, to_read1, to_write1, 4096, 4096, false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ cl.cache_execute(*op1);
+ ASSERT_EQ(to_read1, cl.active_reads);
+ ASSERT_TRUE(cl.results.empty());
+
+ /* Now perform an invalidating cache write */
+ optional op2 = cl.cache.prepare(cl.oid, nullopt, shard_extent_set_t(cl.sinfo.get_k_plus_m()), 4*1024, 0, false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ cl.cache_execute(*op2);
+
+ cl.complete_read();
+ ASSERT_EQ(2, cl.results.size());
+ auto result = cl.results.begin();
+ ASSERT_FALSE(result++->empty());
+ ASSERT_TRUE(result++->empty());
+
+ cl.complete_write(*op1);
+ ASSERT_FALSE(cl.active_reads);
+ cl.complete_write(*op2);
+
+ cl.cache.on_change();
+ op1.reset();
+ op2.reset();
+ cl.cache.on_change2();
+ ASSERT_TRUE(cl.cache.idle());
+ }
+
+ /* Second test, modifies, deletes, creates, then modifies. */
+ {
+ auto to_read1 = iset_from_vector( {{{0, 8192}}}, cl.get_stripe_info());
+ auto to_write1 = iset_from_vector({{{0, 8192}}}, cl.get_stripe_info());
+ auto to_write2 = iset_from_vector({{{4096, 4096}}}, cl.get_stripe_info());
+ auto to_read3 = iset_from_vector( {{{0, 4096}}}, cl.get_stripe_info());
+ auto to_write3 = iset_from_vector({{{0, 4096}}}, cl.get_stripe_info());
+ optional op1 = cl.cache.prepare(cl.oid, to_read1, to_write1, 8192, 8192, false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ optional op2 = cl.cache.prepare(cl.oid, nullopt, shard_extent_set_t(cl.sinfo.get_k_plus_m()), 4*1024, 0, false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ optional op3 = cl.cache.prepare(cl.oid, nullopt, to_write2, 0, 8192, false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ optional op4 = cl.cache.prepare(cl.oid, to_read3, to_write3, 8192, 8192, false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ cl.cache_execute(*op1);
+ cl.cache_execute(*op2);
+ cl.cache_execute(*op3);
+ cl.cache_execute(*op4);
+
+ /* The first result must actually read. */
+ cl.complete_read();
+ ASSERT_EQ(4, cl.results.size());
+ auto result = cl.results.begin();
+ ASSERT_FALSE(result++->empty());
+ ASSERT_TRUE(result++->empty());
+ ASSERT_TRUE(result++->empty());
+ ASSERT_TRUE(result++->empty());
+ cl.complete_write(*op1);
+ cl.complete_write(*op2);
+ cl.complete_write(*op3);
+ cl.complete_write(*op4);
+
+ cl.cache.on_change();
+ op1.reset();
+ op2.reset();
+ op3.reset();
+ op4.reset();
+ cl.cache.on_change2();
+ ASSERT_TRUE(cl.cache.idle());
+ }
+}
+
+TEST(ECExtentCache, test_invalidate_lru)
+{
+ uint64_t c = 4096;
+ int k = 4;
+ int m = 2;
+ Client cl(c, k, m, 1024*c);
+
+ /* Populate the cache LRU and then invalidate the cache. */
+ {
+ uint64_t bs = 3767;
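+ // bs is deliberately not page aligned; the IOs below are page-aligned
+ // regions around multiples of it.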
+ auto io1 = iset_from_vector({{{align_page_prev(35*bs), align_page_next(36*bs) - align_page_prev(35*bs)}}}, cl.get_stripe_info());
+ io1[shard_id_t(k)].insert(io1.get_extent_superset());
+ io1[shard_id_t(k+1)].insert(io1.get_extent_superset());
+ auto io2 = iset_from_vector({{{align_page_prev(18*bs), align_page_next(19*bs) - align_page_prev(18*bs)}}}, cl.get_stripe_info());
+ io2[shard_id_t(k)].insert(io1.get_extent_superset());
+ io2[shard_id_t(k+1)].insert(io1.get_extent_superset());
+ // io 3 is the truncate
+ auto io3 = shard_extent_set_t(cl.sinfo.get_k_plus_m());
+ auto io4 = iset_from_vector({{{align_page_prev(30*bs), align_page_next(31*bs) - align_page_prev(30*bs)}}}, cl.get_stripe_info());
+ io3[shard_id_t(k)].insert(io1.get_extent_superset());
+ io3[shard_id_t(k+1)].insert(io1.get_extent_superset());
+ auto io5 = iset_from_vector({{{align_page_prev(18*bs), align_page_next(19*bs) - align_page_prev(18*bs)}}}, cl.get_stripe_info());
+ io4[shard_id_t(k)].insert(io1.get_extent_superset());
+ io4[shard_id_t(k+1)].insert(io1.get_extent_superset());
+
+ optional op1 = cl.cache.prepare(cl.oid, nullopt, io1, 0, align_page_next(36*bs), false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+
+ cl.cache_execute(*op1);
+ ASSERT_FALSE(cl.active_reads);
+ cl.complete_write(*op1);
+ op1.reset();
+
+ optional op2 = cl.cache.prepare(cl.oid, io2, io2, align_page_next(36*bs), align_page_next(36*bs), false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ cl.cache_execute(*op2);
+ // We have active reads because the object was discarded from the cache
+ // and so has forgotten about all of the zero reads.
+ ASSERT_TRUE(cl.active_reads);
+ cl.complete_read();
+ cl.complete_write(*op2);
+ op2.reset();
+
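+ // op3 is the truncate: the projected size drops to zero.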
+ optional op3 = cl.cache.prepare(cl.oid, nullopt, io3, align_page_next(36*bs), 0, false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ cl.cache_execute(*op3);
+ ASSERT_FALSE(cl.active_reads);
+ cl.complete_write(*op3);
+ op3.reset();
+
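+ // op4 writes to the (now empty) object, growing it again.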
+ optional op4 = cl.cache.prepare(cl.oid, nullopt, io4, 0, align_page_next(30*bs), false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ cl.cache_execute(*op4);
+ ASSERT_FALSE(cl.active_reads);
+ cl.complete_write(*op4);
+ op4.reset();
+
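+ // op5 re-reads a region that is no longer in the cache, so a real read is expected.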
+ optional op5 = cl.cache.prepare(cl.oid, io5, io5, align_page_next(30*bs), align_page_next(30*bs), false,
+ [&cl](ECExtentCache::OpRef &op)
+ {
+ cl.cache_ready(op->get_hoid(), op->get_result());
+ });
+ cl.cache_execute(*op5);
+ ASSERT_TRUE(cl.active_reads);
+ cl.complete_write(*op5);
+ op5.reset();
+ }
+}
\ No newline at end of file
#include "common/ceph_argparse.h"
#include "common/config_proxy.h"
#include "common/errno.h"
-#include "erasure-code/ErasureCode.h"
#include "erasure-code/ErasureCodePlugin.h"
#include "global/global_context.h"
#include "global/global_init.h"
uint64_t stripe_size = atoi(profile["k"].c_str());
ceph_assert(stripe_size > 0);
uint64_t stripe_width = stripe_size * stripe_unit;
- sinfo->reset(new ECUtil::stripe_info_t(*ec_impl, stripe_width));
+ sinfo->reset(new ECUtil::stripe_info_t(*ec_impl, nullptr, stripe_width));
return 0;
}
return r;
}
- std::set<int> want;
+ ECUtil::shard_extent_map_t encoded_data(sinfo.get());
std::vector<std::string> shards;
boost::split(shards, args[2], boost::is_any_of(","));
- for (auto &shard : shards) {
- want.insert(atoi(shard.c_str()));
- }
- ceph::bufferlist decoded_data;
+ ceph::bufferlist input_data;
std::string fname = args[3];
std::string error;
- r = decoded_data.read_file(fname.c_str(), &error);
+ r = input_data.read_file(fname.c_str(), &error);
if (r < 0) {
std::cerr << "failed to read " << fname << ": " << error << std::endl;
return 1;
}
uint64_t stripe_width = sinfo->get_stripe_width();
- if (decoded_data.length() % stripe_width != 0) {
- uint64_t pad = stripe_width - decoded_data.length() % stripe_width;
- decoded_data.append_zero(pad);
+ if (input_data.length() % stripe_width != 0) {
+ uint64_t pad = stripe_width - input_data.length() % stripe_width;
+ input_data.append_zero(pad);
}
- std::map<int, ceph::bufferlist> encoded_data;
- r = ECUtil::encode(*sinfo, ec_impl, decoded_data, want, &encoded_data);
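+ // Split the input across the data shards by RADOS offset, then encode to
+ // generate the coding (parity) shards.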
+ sinfo->ro_range_to_shard_extent_map(0, input_data.length(), input_data, encoded_data);
+ r = encoded_data.encode(ec_impl, nullptr, encoded_data.get_ro_end());
if (r < 0) {
std::cerr << "failed to encode: " << cpp_strerror(r) << std::endl;
return 1;
}
- for (auto &[shard, bl] : encoded_data) {
+ for (auto &[shard, _] : encoded_data.get_extent_maps()) {
std::string name = fname + "." + stringify(shard);
+ bufferlist bl;
+ encoded_data.get_shard_first_buffer(shard, bl);
r = bl.write_file(name.c_str());
if (r < 0) {
std::cerr << "failed to write " << name << ": " << cpp_strerror(r)
ceph::ErasureCodeInterfaceRef ec_impl;
std::unique_ptr<ECUtil::stripe_info_t> sinfo;
int r = ec_init(args[0], args[1], &ec_impl, &sinfo);
- if (r < 0) {
+ if (r) {
return r;
}
- std::map<int, ceph::bufferlist> encoded_data;
+ ECUtil::shard_extent_map_t encoded_data(sinfo.get());
std::vector<std::string> shards;
boost::split(shards, args[2], boost::is_any_of(","));
- for (auto &shard : shards) {
- encoded_data[atoi(shard.c_str())] = {};
- }
- ceph::bufferlist decoded_data;
std::string fname = args[3];
- std::set<int> want_to_read;
- const auto chunk_mapping = ec_impl->get_chunk_mapping();
- for (auto &[shard, bl] : encoded_data) {
- std::string name = fname + "." + stringify(shard);
+ for (auto &shard_str : shards) {
+ std::string name = fname + "." + shard_str;
std::string error;
+ bufferlist bl;
r = bl.read_file(name.c_str(), &error);
if (r < 0) {
std::cerr << "failed to read " << name << ": " << error << std::endl;
return 1;
}
- auto chunk = static_cast<ssize_t>(chunk_mapping.size()) > shard ?
- chunk_mapping[shard] : shard_id_t(shard);
- want_to_read.insert(static_cast<int>(chunk));
+ shard_id_t shard = sinfo->get_shard(raw_shard_id_t(atoi(shard_str.c_str())));
+ encoded_data.insert_in_shard(shard, 0, bl);
}
- r = ECUtil::decode(*sinfo, ec_impl, want_to_read, encoded_data, &decoded_data);
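+ // Build the set of shard extents covering the object's logical range;
+ // decode() reconstructs whatever was not read in from the shard files.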
+ ECUtil::shard_extent_set_t wanted(sinfo->get_k_plus_m());
+ sinfo->ro_range_to_shard_extent_set(encoded_data.get_ro_start(),
+ encoded_data.get_ro_end() - encoded_data.get_ro_start(), wanted);
+
+ r = encoded_data.decode(ec_impl, wanted, encoded_data.get_ro_end());
if (r < 0) {
std::cerr << "failed to decode: " << cpp_strerror(r) << std::endl;
return 1;
}
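+ // Reassemble the decoded shards back into a single buffer in RADOS
+ // (object) offset order.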
+ bufferlist decoded_data = encoded_data.get_ro_buffer();
r = decoded_data.write_file(fname.c_str());
if (r < 0) {
std::cerr << "failed to write " << fname << ": " << cpp_strerror(r)