From 2587f2e3865596f9e89b7b80d0563037b9faa4c9 Mon Sep 17 00:00:00 2001 From: Alex Ainscow Date: Fri, 3 Oct 2025 14:39:03 +0100 Subject: [PATCH] osd: Implement read sync for EC direct reads When doing a direct read in EC, only a single OSD is involved and that OSD, by definition is the only OSD involved. As such we can do the more performant sync read, rather than async read. Signed-off-by: Alex Ainscow --- src/osd/ECBackend.cc | 95 +++++++++++++++++++++++++++++++++++++++++ src/osd/ECBackend.h | 8 ++++ src/osd/PrimaryLogPG.cc | 8 ++++ 3 files changed, 111 insertions(+) diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index a5f9c238f47..3b3bc75847e 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -1007,9 +1007,38 @@ int ECBackend::objects_read_sync( uint64_t len, uint32_t op_flags, bufferlist *bl) { + + if (!sinfo.supports_direct_reads()) { return -EOPNOTSUPP; } + int r = _objects_read_sync(hoid, off, len, op_flags, bl); + + if (r < 0) { + dout(20) << __func__ << " r=" << r + << " hoid=" << hoid + << " off=" << off + << " len=" << len + << " op_flags=" << op_flags + << " primary=" << switcher->is_primary() + << " shard=" << (off / sinfo.get_chunk_size()) % sinfo.get_k() + << dendl; + } else { + return r; + } + + // The above returns errors largely only interesting for tracing. Here we + // simplify this down to: + // Primary returns EIO, which causes an async read to be executed immediately. + // A non-primary returns EAGAIN which forces the client to resent to the + // primary. + if (switcher->is_primary()) { + return -EIO; + } + + return -EAGAIN; +} + std::pair ECBackend::extent_to_shard_extent(uint64_t off, uint64_t len) { // sync reads are supported for sub-chunk reads where no reconstruct is // required. @@ -1034,6 +1063,72 @@ std::pair ECBackend::extent_to_shard_extent(uint64_t off, ui return std::pair(shard_offset, shard_len); } +// NOTE: Return codes from this function are largely nonsense and translated +// to more useful values before returning to client. +int ECBackend::_objects_read_sync( + const hobject_t &hoid, + uint64_t off, + uint64_t len, + uint32_t op_flags, + bufferlist *bl) { + + if (get_parent()->get_local_missing().is_missing(hoid)) { + return -EACCES; // Permission denied (cos its missing) + } + + auto [shard_offset, shard_len] = extent_to_shard_extent(off, len); + + + dout(20) << __func__ << " Submitting sync read: " + << " hoid=" << hoid + << " shard_offset=" << shard_offset + << " shard_len=" << shard_len + << " op_flags=" << op_flags + << " primary=" << switcher->is_primary() + << dendl; + + + return switcher->store->read(switcher->ch, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + shard_offset, + shard_len, *bl, op_flags); +} + +int ECBackend::objects_readv_sync(const hobject_t &hoid, + std::map& m, + uint32_t op_flags, + ceph::buffer::list *bl) { + if (get_parent()->get_local_missing().is_missing(hoid)) { + return -EACCES; // Permission denied (cos its missing) + } + + // Not using extent set, since we need the one used by readv. + + auto shard = get_parent()->whoami_shard().shard; + interval_set im(std::move(m)); + m.clear(); // Make m safe to write to again. + auto r = switcher->store->readv(switcher->ch, ghobject_t(hoid, ghobject_t::NO_GEN, shard), im, *bl, op_flags); + if (r >= 0) { + uint64_t chunk_size = sinfo.get_chunk_size(); + for (auto [off, len] : im) { + uint64_t ro_offset = sinfo.shard_offset_to_ro_offset(shard, off); + uint64_t to_next_chunk = ((off / chunk_size) + 1) * chunk_size - off; + uint64_t ro_len = std::min(to_next_chunk, len); + while (len > 0 ) { + dout(20) << __func__ << " shard=" << shard << " extent=" << off << "~" << len << ">" << ro_offset << "~" << ro_len << dendl; + m.emplace(ro_offset, ro_len); + len -= ro_len; + ro_offset += ro_len + sinfo.get_stripe_width() - chunk_size; + ro_len = std::min(len, chunk_size); + } + } + } else { + return r; + } + + return 0; +} + void ECBackend::objects_read_async( const hobject_t &hoid, uint64_t object_size, diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h index f5d81bc00f1..e29ffa0833f 100644 --- a/src/osd/ECBackend.h +++ b/src/osd/ECBackend.h @@ -198,6 +198,14 @@ class ECBackend : public ECCommon { void kick_reads(); + int _objects_read_sync( + const hobject_t &hoid, + uint64_t off, + uint64_t len, + uint32_t op_flags, + ceph::buffer::list *bl + ); + public: struct ECRecoveryBackend : RecoveryBackend { ECRecoveryBackend(CephContext *cct, diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index b808ce95ebd..02b818ce155 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -5894,6 +5894,13 @@ int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) { if (oi.is_data_digest() && op.extent.offset == 0 && op.extent.length >= oi.size) maybe_crc = oi.data_digest; + + if (ctx->op->ec_direct_read()) { + result = pgbackend->objects_read_sync( + soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata); + + dout(20) << " EC sync read for " << soid << " result=" << result << dendl; + } else { ctx->pending_async_reads.push_back( make_pair( boost::make_tuple(op.extent.offset, op.extent.length, op.flags), @@ -5905,6 +5912,7 @@ int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) { ctx->op_finishers[ctx->current_osd_subop_num].reset( new ReadFinisher(osd_op)); + } } else { int r = pgbackend->objects_read_sync( soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata); -- 2.39.5