From 80f9c5148157de442584d0dd5943aea10cbfc5e9 Mon Sep 17 00:00:00 2001 From: Alex Ainscow Date: Fri, 3 Oct 2025 14:47:15 +0100 Subject: [PATCH] osd: Implement sparse reads for EC for direct reads only Sparse reads for EC are simple to implement, as the code is essentially identical to that of replica, with some address translation. Signed-off-by: Alex Ainscow --- src/osd/ECBackend.cc | 74 +++++++++++------------------------------ src/osd/ECBackend.h | 5 +++ src/osd/ECSwitch.h | 11 ++++++ src/osd/PrimaryLogPG.cc | 11 ++++-- 4 files changed, 43 insertions(+), 58 deletions(-) diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index 3b3bc75847e..d2e983675e0 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -1009,34 +1009,29 @@ int ECBackend::objects_read_sync( bufferlist *bl) { if (!sinfo.supports_direct_reads()) { - return -EOPNOTSUPP; -} - - int r = _objects_read_sync(hoid, off, len, op_flags, bl); - - if (r < 0) { - dout(20) << __func__ << " r=" << r - << " hoid=" << hoid - << " off=" << off - << " len=" << len - << " op_flags=" << op_flags - << " primary=" << switcher->is_primary() - << " shard=" << (off / sinfo.get_chunk_size()) % sinfo.get_k() - << dendl; - } else { - return r; + return -EOPNOTSUPP; } - // The above returns errors largely only interesting for tracing. Here we - // simplify this down to: - // Primary returns EIO, which causes an async read to be executed immediately. - // A non-primary returns EAGAIN which forces the client to resent to the - // primary. - if (switcher->is_primary()) { - return -EIO; + if (get_parent()->get_local_missing().is_missing(hoid)) { + return -EIO; // Permission denied (cos its missing) } - return -EAGAIN; + auto [shard_offset, shard_len] = extent_to_shard_extent(off, len); + + + dout(20) << __func__ << " Submitting sync read: " + << " hoid=" << hoid + << " shard_offset=" << shard_offset + << " shard_len=" << shard_len + << " op_flags=" << op_flags + << " primary=" << switcher->is_primary() + << dendl; + + + return switcher->store->read(switcher->ch, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + shard_offset, + shard_len, *bl, op_flags); } std::pair ECBackend::extent_to_shard_extent(uint64_t off, uint64_t len) { @@ -1063,37 +1058,6 @@ std::pair ECBackend::extent_to_shard_extent(uint64_t off, ui return std::pair(shard_offset, shard_len); } -// NOTE: Return codes from this function are largely nonsense and translated -// to more useful values before returning to client. -int ECBackend::_objects_read_sync( - const hobject_t &hoid, - uint64_t off, - uint64_t len, - uint32_t op_flags, - bufferlist *bl) { - - if (get_parent()->get_local_missing().is_missing(hoid)) { - return -EACCES; // Permission denied (cos its missing) - } - - auto [shard_offset, shard_len] = extent_to_shard_extent(off, len); - - - dout(20) << __func__ << " Submitting sync read: " - << " hoid=" << hoid - << " shard_offset=" << shard_offset - << " shard_len=" << shard_len - << " op_flags=" << op_flags - << " primary=" << switcher->is_primary() - << dendl; - - - return switcher->store->read(switcher->ch, - ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), - shard_offset, - shard_len, *bl, op_flags); -} - int ECBackend::objects_readv_sync(const hobject_t &hoid, std::map& m, uint32_t op_flags, diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h index e29ffa0833f..23670897eda 100644 --- a/src/osd/ECBackend.h +++ b/src/osd/ECBackend.h @@ -140,6 +140,11 @@ class ECBackend : public ECCommon { std::pair extent_to_shard_extent(uint64_t off, uint64_t len); + int objects_readv_sync(const hobject_t &hoid, + std::map& m, + uint32_t op_flags, + ceph::buffer::list *bl); + /** * Async read mechanism * diff --git a/src/osd/ECSwitch.h b/src/osd/ECSwitch.h index aa5520e9b8c..2a778f3c6b7 100644 --- a/src/osd/ECSwitch.h +++ b/src/osd/ECSwitch.h @@ -267,6 +267,17 @@ public: return legacy.objects_read_sync(hoid, off, len, op_flags, bl); } + int objects_readv_sync(const hobject_t &hoid, + std::map& m, + uint32_t op_flags, + ceph::buffer::list *bl) override + { + if (is_optimized()) { + return optimized.objects_readv_sync(hoid, m, op_flags, bl); + } + ceph_abort_msg("Sync reads legacy EC"); + } + std::pair extent_to_shard_extent( uint64_t off, uint64_t len) override { if (is_optimized()) { diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 02b818ce155..3e5de805cdf 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -5972,7 +5972,7 @@ int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) { } ++ctx->num_read; - if (pool.info.is_erasure()) { + if (pool.info.is_erasure() && !ctx->op->ec_direct_read()) { // translate sparse read to a normal one if not supported if (length > 0) { @@ -5995,9 +5995,10 @@ int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) { } else { // read into a buffer map m; + auto [shard_offset, shard_length] = pgbackend->extent_to_shard_extent(offset, length); int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN, info.pgid.shard), - offset, length, m); + shard_offset, shard_length, m); if (r < 0) { return r; } @@ -6008,6 +6009,7 @@ int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) { r = rep_repair_primary_object(soid, ctx); } if (r < 0) { + dout(10) << " sparse_read failed r=" << r << " from object " << soid << dendl; return r; } @@ -6015,7 +6017,10 @@ int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) { // Maybe at first, there is no much whole objects. With continued use, more // and more whole object exist. So from this point, for spare-read add // checksum make sense. - if ((uint64_t)r == oi.size && oi.is_data_digest()) { + // For now, we do not check the CRC for EC ever. We could implement this, + // but it would only work for very small objects and as such is probably + // not very useful. + if ((uint64_t)r == oi.size && oi.is_data_digest() && !pool.info.is_erasure()) { uint32_t crc = data_bl.crc32c(-1); if (oi.data_digest != crc) { osd->clog->error() << info.pgid << std::hex -- 2.39.5