From: Alex Ainscow Date: Fri, 3 Oct 2025 13:39:03 +0000 (+0100) Subject: osd: Implement sync reads and sparse reads for EC for direct reads X-Git-Tag: testing/wip-vshankar-testing-20260212.053105~1^2~2^2~8 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=127457a85a6dfc6d40e320f3588c7b2e2800d17d;p=ceph-ci.git osd: Implement sync reads and sparse reads for EC for direct reads Sparse reads for EC are simple to implement, as the code is essentially identical to that of replica, with some address translation. When doing a direct read in EC, only a single OSD is involved and that OSD, by definition is the only OSD involved. As such we can do the more performant sync read, rather than async read. Signed-off-by: Alex Ainscow --- diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index bf134d67591..8e462f1a061 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -1007,7 +1007,31 @@ int ECBackend::objects_read_sync( uint64_t len, uint32_t op_flags, bufferlist *bl) { - return -EOPNOTSUPP; + + if (!sinfo.supports_direct_reads()) { + return -EOPNOTSUPP; + } + + if (get_parent()->get_local_missing().is_missing(hoid)) { + return -EIO; // Permission denied (cos its missing) + } + + auto [shard_offset, shard_len] = extent_to_shard_extent(off, len); + + + dout(20) << __func__ << " Submitting sync read: " + << " hoid=" << hoid + << " shard_offset=" << shard_offset + << " shard_len=" << shard_len + << " op_flags=" << op_flags + << " primary=" << switcher->is_primary() + << dendl; + + + return switcher->store->read(switcher->ch, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + shard_offset, + shard_len, *bl, op_flags); } std::pair ECBackend::extent_to_shard_extent(uint64_t off, uint64_t len) { @@ -1034,6 +1058,41 @@ std::pair ECBackend::extent_to_shard_extent(uint64_t off, ui return std::pair(shard_offset, shard_len); } +int ECBackend::objects_readv_sync(const hobject_t &hoid, + std::map& m, + uint32_t op_flags, + ceph::buffer::list *bl) { + if (get_parent()->get_local_missing().is_missing(hoid)) { + return -EACCES; // Permission denied (cos its missing) + } + + // Not using extent set, since we need the one used by readv. + + auto shard = get_parent()->whoami_shard().shard; + interval_set im(std::move(m)); + m.clear(); // Make m safe to write to again. + auto r = switcher->store->readv(switcher->ch, ghobject_t(hoid, ghobject_t::NO_GEN, shard), im, *bl, op_flags); + if (r >= 0) { + uint64_t chunk_size = sinfo.get_chunk_size(); + for (auto [off, len] : im) { + uint64_t ro_offset = sinfo.shard_offset_to_ro_offset(shard, off); + uint64_t to_next_chunk = ((off / chunk_size) + 1) * chunk_size - off; + uint64_t ro_len = std::min(to_next_chunk, len); + while (len > 0 ) { + dout(20) << __func__ << " shard=" << shard << " extent=" << off << "~" << len << ">" << ro_offset << "~" << ro_len << dendl; + m.emplace(ro_offset, ro_len); + len -= ro_len; + ro_offset += ro_len + sinfo.get_stripe_width() - chunk_size; + ro_len = std::min(len, chunk_size); + } + } + } else { + return r; + } + + return 0; +} + void ECBackend::objects_read_async( const hobject_t &hoid, uint64_t object_size, diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h index d3c490b1581..0068dfec01c 100644 --- a/src/osd/ECBackend.h +++ b/src/osd/ECBackend.h @@ -140,6 +140,11 @@ class ECBackend : public ECCommon { std::pair extent_to_shard_extent(uint64_t off, uint64_t len); + int objects_readv_sync(const hobject_t &hoid, + std::map& m, + uint32_t op_flags, + ceph::buffer::list *bl); + /** * Async read mechanism * @@ -198,6 +203,14 @@ class ECBackend : public ECCommon { void kick_reads(); + int _objects_read_sync( + const hobject_t &hoid, + uint64_t off, + uint64_t len, + uint32_t op_flags, + ceph::buffer::list *bl + ); + public: struct ECRecoveryBackend : RecoveryBackend { ECRecoveryBackend(CephContext *cct, diff --git a/src/osd/ECSwitch.h b/src/osd/ECSwitch.h index a8b133456f6..641e939edde 100644 --- a/src/osd/ECSwitch.h +++ b/src/osd/ECSwitch.h @@ -267,6 +267,17 @@ public: return legacy.objects_read_sync(hoid, off, len, op_flags, bl); } + int objects_readv_sync(const hobject_t &hoid, + std::map& m, + uint32_t op_flags, + ceph::buffer::list *bl) override + { + if (is_optimized()) { + return optimized.objects_readv_sync(hoid, m, op_flags, bl); + } + ceph_abort_msg("Sync reads legacy EC"); + } + std::pair extent_to_shard_extent( uint64_t off, uint64_t len) override { if (is_optimized()) { diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 37c5d670cf6..d3a06e71412 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -5894,6 +5894,13 @@ int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) { if (oi.is_data_digest() && op.extent.offset == 0 && op.extent.length >= oi.size) maybe_crc = oi.data_digest; + + if (ctx->op->ec_direct_read()) { + result = pgbackend->objects_read_sync( + soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata); + + dout(20) << " EC sync read for " << soid << " result=" << result << dendl; + } else { ctx->pending_async_reads.push_back( make_pair( boost::make_tuple(op.extent.offset, op.extent.length, op.flags), @@ -5905,6 +5912,7 @@ int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) { ctx->op_finishers[ctx->current_osd_subop_num].reset( new ReadFinisher(osd_op)); + } } else { int r = pgbackend->objects_read_sync( soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata); @@ -5964,7 +5972,7 @@ int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) { } ++ctx->num_read; - if (pool.info.is_erasure()) { + if (pool.info.is_erasure() && !ctx->op->ec_direct_read()) { // translate sparse read to a normal one if not supported if (length > 0) { @@ -5987,9 +5995,10 @@ int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) { } else { // read into a buffer map m; + auto [shard_offset, shard_length] = pgbackend->extent_to_shard_extent(offset, length); int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN, info.pgid.shard), - offset, length, m); + shard_offset, shard_length, m); if (r < 0) { return r; } @@ -6000,6 +6009,7 @@ int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) { r = rep_repair_primary_object(soid, ctx); } if (r < 0) { + dout(10) << " sparse_read failed r=" << r << " from object " << soid << dendl; return r; }