From: Alex Ainscow
Date: Thu, 5 Feb 2026 13:14:07 +0000 (+0000)
Subject: Torn write protection for Direct Reads
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=4ca32bad04b002567a09e097a0c1c5ae8ae50c83;p=ceph-ci.git

Torn write protection for Direct Reads

It is possible for direct reads to query two separate shards and get a
different version of the object from each shard. To solve this we add a
get_internal_versions op that reports the version of the object on that
shard, and submit it in the same transaction as the read so we can ensure
the versions are what we expect. If there is a mismatch, we resubmit the
read through the primary path.

Signed-off-by: Jon Bailey
Signed-off-by: Alex Ainscow
---

diff --git a/src/include/rados.h b/src/include/rados.h
index 65092d42328..23529521411 100644
--- a/src/include/rados.h
+++ b/src/include/rados.h
@@ -265,7 +265,9 @@ extern const char *ceph_osd_state_name(int s);
 	f(LIST_WATCHERS,	__CEPH_OSD_OP(RD, DATA, 9),	"list-watchers")    \
 									    \
 	f(LIST_SNAPS,	__CEPH_OSD_OP(RD, DATA, 10),	"list-snaps")	    \
 									    \
+	f(GET_INTERNAL_VERSIONS, __CEPH_OSD_OP(RD, DATA, 33), "get-internal-versions") \
+									    \
 	/* sync */							    \
 	f(SYNC_READ,	__CEPH_OSD_OP(RD, DATA, 11),	"sync_read")	    \
 									    \
diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc
index 883a32a77e6..196b3eb9753 100644
--- a/src/osd/PrimaryLogPG.cc
+++ b/src/osd/PrimaryLogPG.cc
@@ -6572,6 +6572,15 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector& ops)
       }
       break;
 
+    case CEPH_OSD_OP_GET_INTERNAL_VERSIONS: {
+      std::map out;
+      result = get_internal_versions(soid, &out);
+      if (result >= 0) {
+        encode(out, osd_op.outdata);
+      }
+    }
+    break;
+
     case CEPH_OSD_OP_LIST_WATCHERS:
       ++ctx->num_read;
       {
@@ -16067,6 +16076,47 @@ int PrimaryLogPG::getattrs_maybe_cache(
   return r;
 }
 
+/**
+ * Report the version of @p soid held by the shards of this PG.
+ *
+ * On the primary, every acting shard is listed with the object's version,
+ * overridden by that shard's entry in the object info's shard_versions when
+ * one exists. On any other shard only this shard's own entry is written.
+ *
+ * @param soid object to look up
+ * @param out  map that receives the shard -> version entries
+ * @return 0 on success, -ENOENT if the object does not exist
+ */
+int PrimaryLogPG::get_internal_versions(const hobject_t& soid,
+                                        std::map* out) {
+  ObjectContextRef obc = get_object_context(soid, false);
+
+  // Treat a null context the same as a missing object instead of
+  // dereferencing it.
+  if (!obc || !obc->obs.exists) {
+    return -ENOENT;
+  }
+
+  if (!is_primary()) {
+    // A non-primary shard only reports its own entry.
+    (*out)[pg_whoami.shard] = obc->obs.oi.version;
+    return 0;
+  }
+
+  // Start every acting shard at the object's version ...
+  for (const auto& shard : get_acting_shards()) {
+    (*out)[shard.shard] = obc->obs.oi.version;
+  }
+  // ... then apply the per-shard overrides. An entry for a shard that is not
+  // in the acting set is skipped: out->at() would throw std::out_of_range.
+  for (const auto& [shard, version] : obc->obs.oi.shard_versions) {
+    if (auto it = out->find(shard); it != out->end()) {
+      it->second = version;
+    }
+  }
+  return 0;
+}
+
 bool PrimaryLogPG::check_failsafe_full() {
     return osd->check_failsafe_full(get_dpp());
 }
diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h
index b01472c09bb..dc98f272873 100644
--- a/src/osd/PrimaryLogPG.h
+++ b/src/osd/PrimaryLogPG.h
@@ -2011,6 +2011,10 @@ public:
   int getattrs_maybe_cache(
     ObjectContextRef obc,
     std::map> *out);
+  /// Write the per-shard versions of @p soid into @p out; returns 0, or
+  /// -ENOENT if the object does not exist.
+  int get_internal_versions(const hobject_t& soid,
+                            std::map* out);
 
 public:
   void set_dynamic_perf_stats_queries(
diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h
index a40ba465fcc..d547f8469ce 100644
--- a/src/osdc/Objecter.h
+++ b/src/osdc/Objecter.h
@@ -1510,6 +1510,19 @@ struct ObjectOperation {
     osd_op.op.assert_ver.ver = ver;
   }
 
+  /**
+   * Queue a CEPH_OSD_OP_GET_INTERNAL_VERSIONS op.
+   *
+   * @param ec  stored in the new op's out_ec slot
+   * @param pbl stored in the new op's out_bl slot
+   */
+  void get_internal_versions(boost::system::error_code* ec,
+                             buffer::list *pbl) {
+    add_op(CEPH_OSD_OP_GET_INTERNAL_VERSIONS);
+    out_bl.back() = pbl;
+    out_ec.back() = ec;
+  }
+
   void cmpxattr(const char *name, const ceph::buffer::list& val,
 		int op, int mode) {
     add_xattr(CEPH_OSD_OP_CMPXATTR, name, val);