From: Alex Ainscow Date: Thu, 5 Feb 2026 13:14:07 +0000 (+0000) Subject: osd: Torn write protection for Direct Reads X-Git-Tag: v21.0.1~73^2~20 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=829aced79a3c7116366f6d93b8eadb554dc7ef50;p=ceph.git osd: Torn write protection for Direct Reads It is possible for direct reads to query two seperate shards and get different versions of the object for each shard when using direct reads. To solve this we add a get_internal_version op to tell us the version of the object on that shard and submit that in the same transaction as the read so we can ensure the versions are what we expect. If we have a mismatch, we resubmit the read through the primary path. Also a couple of spelling/tidy ups Signed-off-by: Jon Bailey Signed-off-by: Alex Ainscow Signed-off-by: Callum James --- diff --git a/src/include/rados.h b/src/include/rados.h index 3b11d112fcf..14b447af04e 100644 --- a/src/include/rados.h +++ b/src/include/rados.h @@ -266,7 +266,9 @@ extern const char *ceph_osd_state_name(int s); f(LIST_WATCHERS, __CEPH_OSD_OP(RD, DATA, 9), "list-watchers") \ \ f(LIST_SNAPS, __CEPH_OSD_OP(RD, DATA, 10), "list-snaps") \ - \ + \ + f(GET_INTERNAL_VERSIONS, __CEPH_OSD_OP(RD, DATA, 33), "get-internal-versions") \ + \ /* sync */ \ f(SYNC_READ, __CEPH_OSD_OP(RD, DATA, 11), "sync_read") \ \ diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 7684fe40688..3531fb08cd9 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -2046,13 +2046,14 @@ void PrimaryLogPG::do_op_impl(OpRequestRef op) // check for op with rwordered and rebalance or localize reads if (m->has_flag(CEPH_OSD_FLAGS_DIRECT_READ) && op->rwordered()) { - dout(4) << __func__ << ": rebelance or localized reads with rwordered not allowed " + dout(4) << __func__ << ": rebalance or localized reads with rwordered not allowed " << *m << dendl; osd->reply_op_error(op, -EINVAL); return; } if (m->get_flags() & CEPH_OSD_FLAG_EC_DIRECT_READ) { + // This means "is in acting set" if (is_primary() || is_nonprimary()) { op->set_ec_direct_read(); } else { @@ -2064,6 +2065,7 @@ void PrimaryLogPG::do_op_impl(OpRequestRef op) op->may_read() && !(op->may_write() || op->may_cache())) { // balanced reads; any replica will do + // This means "is in acting set" if (!(is_primary() || is_nonprimary())) { osd->handle_misdirected_op(this, op); return; @@ -6647,6 +6649,15 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector& ops) } break; + case CEPH_OSD_OP_GET_INTERNAL_VERSIONS: { + std::map out; + result = get_internal_versions(soid, &out); + if (result >= 0) { + encode(out, osd_op.outdata); + } + } + break; + case CEPH_OSD_OP_LIST_WATCHERS: ++ctx->num_read; { @@ -16168,6 +16179,27 @@ int PrimaryLogPG::getattrs_maybe_cache( return r; } +int PrimaryLogPG::get_internal_versions(const hobject_t& soid, + std::map* out) { + ObjectContextRef obc = get_object_context(soid, false); + + if (!obc || !obc->obs.exists) { + return -ENOENT; + } + + if (is_primary() && pool.info.is_erasure()) { + for (unsigned int i = 0; i < pool.info.get_size(); ++i) { + (*out)[shard_id_t(i)] = obc->obs.oi.version; + } + for (const auto& [shard, version] : obc->obs.oi.shard_versions) { + out->at(shard) = version; + } + } else { + (*out)[pg_whoami.shard] = obc->obs.oi.version; + } + return 0; +} + bool PrimaryLogPG::check_failsafe_full() { return osd->check_failsafe_full(get_dpp()); } diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h index 8e0b29f72bc..d094d4f60ad 100644 --- a/src/osd/PrimaryLogPG.h +++ b/src/osd/PrimaryLogPG.h @@ -2021,6 +2021,8 @@ public: int getattrs_maybe_cache( ObjectContextRef obc, std::map> *out); + int get_internal_versions(const hobject_t& soid, + std::map* out); public: void set_dynamic_perf_stats_queries( diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index a40ba465fcc..d547f8469ce 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -1510,6 +1510,14 @@ struct ObjectOperation { osd_op.op.assert_ver.ver = ver; } + void get_internal_versions(boost::system::error_code* ec, + buffer::list *pbl) { + ceph::buffer::list bl; + add_op(CEPH_OSD_OP_GET_INTERNAL_VERSIONS); + out_bl.back() = pbl; + out_ec.back() = ec; + } + void cmpxattr(const char *name, const ceph::buffer::list& val, int op, int mode) { add_xattr(CEPH_OSD_OP_CMPXATTR, name, val);