]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
Torn write protection for Direct Reads
authorAlex Ainscow <aainscow@uk.ibm.com>
Thu, 5 Feb 2026 13:14:07 +0000 (13:14 +0000)
committerAlex Ainscow <aainscow@uk.ibm.com>
Fri, 6 Feb 2026 10:31:30 +0000 (10:31 +0000)
It is possible for direct reads to query two separate shards and
get a different version of the object from each shard.

To solve this we add a get_internal_versions op that reports the version
of the object on that shard, and submit it in the same transaction
as the read so we can check that the versions are what we expect. If
there is a mismatch, we resubmit the read through the primary path.

Signed-off-by: Jon Bailey <jonathan.bailey1@ibm.com>
Signed-off-by: Alex Ainscow <aainscow@uk.ibm.com>
src/include/rados.h
src/osd/PrimaryLogPG.cc
src/osd/PrimaryLogPG.h
src/osdc/Objecter.h

index 65092d42328b2c51538369c62626abacea44ddf4..23529521411016b21897ba1634be3ab1cac2bcd2 100644 (file)
@@ -265,7 +265,9 @@ extern const char *ceph_osd_state_name(int s);
        f(LIST_WATCHERS, __CEPH_OSD_OP(RD, DATA, 9),    "list-watchers")    \
                                                                            \
        f(LIST_SNAPS,   __CEPH_OSD_OP(RD, DATA, 10),    "list-snaps")       \
-                                                                           \
+                                                                               \
+       f(GET_INTERNAL_VERSIONS, __CEPH_OSD_OP(RD, DATA, 33), "get-internal-versions") \
+                                                                               \
        /* sync */                                                          \
        f(SYNC_READ,    __CEPH_OSD_OP(RD, DATA, 11),    "sync_read")        \
                                                                            \
index 883a32a77e6e433c1140b95ff9381d48c9095abd..196b3eb9753240c8a6bc2d258d73e05c927b8412 100644 (file)
@@ -6572,6 +6572,15 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
       }
       break;
 
+    case CEPH_OSD_OP_GET_INTERNAL_VERSIONS: {
+      std::map<shard_id_t, eversion_t> out;
+      result = get_internal_versions(soid, &out);
+      if (result >= 0) {
+        encode(out, osd_op.outdata);
+      }
+    }
+    break;
+
     case CEPH_OSD_OP_LIST_WATCHERS:
       ++ctx->num_read;
       {
@@ -16067,6 +16076,28 @@ int PrimaryLogPG::getattrs_maybe_cache(
   return r;
 }
 
+// Fill *out with per-shard object versions: on the primary, oi.version for each
+// acting shard, overridden by oi.shard_versions; otherwise only our own shard.
+int PrimaryLogPG::get_internal_versions(const hobject_t& soid,
+                                        std::map<shard_id_t, eversion_t>* out) {
+  ObjectContextRef obc = get_object_context(soid, false);
+  if (!obc || !obc->obs.exists) {  // guard: obc may be null if lookup failed
+    return -ENOENT;
+  }
+  if (is_primary()) {
+    for (const auto& shard : get_acting_shards()) {
+      (*out)[shard.shard] = obc->obs.oi.version;
+    }
+    for (const auto& [shard, version] : obc->obs.oi.shard_versions) {
+      // Only override acting shards; at() would throw on any other shard.
+      if (auto it = out->find(shard); it != out->end()) it->second = version;
+    }
+  } else {
+    (*out)[pg_whoami.shard] = obc->obs.oi.version;
+  }
+  return 0;
+}
+
 bool PrimaryLogPG::check_failsafe_full() {
     return osd->check_failsafe_full(get_dpp());
 }
index b01472c09bbb729b13290b1985e9b7c62b32ee64..dc98f2728734b20c9f5db2c25c4278ebd493a668 100644 (file)
@@ -2011,6 +2011,8 @@ public:
   int getattrs_maybe_cache(
     ObjectContextRef obc,
     std::map<std::string, ceph::buffer::list, std::less<>> *out);
+  int get_internal_versions(const hobject_t& soid,
+                            std::map<shard_id_t, eversion_t>* out);
 
 public:
   void set_dynamic_perf_stats_queries(
index a40ba465fcc7a037824c64122219223df94bab3c..d547f8469ce3047adeb085fcc7ec7002ee9e328f 100644 (file)
@@ -1510,6 +1510,14 @@ struct ObjectOperation {
     osd_op.op.assert_ver.ver = ver;
   }
 
+  // Append a GET_INTERNAL_VERSIONS op; register pbl/ec in its out_bl/out_ec slots.
+  void get_internal_versions(boost::system::error_code* ec,
+                             ceph::buffer::list *pbl) {
+    add_op(CEPH_OSD_OP_GET_INTERNAL_VERSIONS);
+    out_bl.back() = pbl;
+    out_ec.back() = ec;
+  }
+
   void cmpxattr(const char *name, const ceph::buffer::list& val,
                int op, int mode) {
     add_xattr(CEPH_OSD_OP_CMPXATTR, name, val);