]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: Torn write protection for Direct Reads
authorAlex Ainscow <aainscow@uk.ibm.com>
Thu, 5 Feb 2026 13:14:07 +0000 (13:14 +0000)
committerJon Bailey <jonathan.bailey1@ibm.com>
Thu, 28 May 2026 14:15:50 +0000 (15:15 +0100)
It is possible for direct reads to query two seperate shards and
get different versions of the object for each shard when using
direct reads.

To solve this we add a get_internal_version op to tell us the version
of the object on that shard and submit that in the same transaction
as the read so we can ensure the versions are what we expect. If we
have a mismatch, we resubmit the read through the primary path.

Also a couple of spelling/tidy ups

Signed-off-by: Jon Bailey <jonathan.bailey1@ibm.com>
Signed-off-by: Alex Ainscow <aainscow@uk.ibm.com>
Signed-off-by: Callum James <callum.james@ibm.com>
src/include/rados.h
src/osd/PrimaryLogPG.cc
src/osd/PrimaryLogPG.h
src/osdc/Objecter.h

index 3b11d112fcfff210528719c0ed9a29dd01831244..14b447af04e572e8e91dd09b0f7230252d784a78 100644 (file)
@@ -266,7 +266,9 @@ extern const char *ceph_osd_state_name(int s);
        f(LIST_WATCHERS, __CEPH_OSD_OP(RD, DATA, 9),    "list-watchers")    \
                                                                            \
        f(LIST_SNAPS,   __CEPH_OSD_OP(RD, DATA, 10),    "list-snaps")       \
-                                                                           \
+                                                                               \
+       f(GET_INTERNAL_VERSIONS, __CEPH_OSD_OP(RD, DATA, 33), "get-internal-versions") \
+                                                                               \
        /* sync */                                                          \
        f(SYNC_READ,    __CEPH_OSD_OP(RD, DATA, 11),    "sync_read")        \
                                                                            \
index 7684fe406887054663fc337e41ef3ff87797b879..3531fb08cd902b6a8a46542689cdec89824da173 100644 (file)
@@ -2046,13 +2046,14 @@ void PrimaryLogPG::do_op_impl(OpRequestRef op)
 
   // check for op with rwordered and rebalance or localize reads
   if (m->has_flag(CEPH_OSD_FLAGS_DIRECT_READ) && op->rwordered()) {
-    dout(4) << __func__ << ": rebelance or localized reads with rwordered not allowed "
+    dout(4) << __func__ << ": rebalance or localized reads with rwordered not allowed "
        << *m << dendl;
     osd->reply_op_error(op, -EINVAL);
     return;
   }
 
   if (m->get_flags() & CEPH_OSD_FLAG_EC_DIRECT_READ) {
+    // This means "is in acting set"
     if (is_primary() || is_nonprimary()) {
       op->set_ec_direct_read();
     } else {
@@ -2064,6 +2065,7 @@ void PrimaryLogPG::do_op_impl(OpRequestRef op)
       op->may_read() &&
       !(op->may_write() || op->may_cache())) {
     // balanced reads; any replica will do
+    // This means "is in acting set"
     if (!(is_primary() || is_nonprimary())) {
       osd->handle_misdirected_op(this, op);
       return;
@@ -6647,6 +6649,15 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
       }
       break;
 
+    case CEPH_OSD_OP_GET_INTERNAL_VERSIONS: {
+      std::map<shard_id_t, eversion_t> out;
+      result = get_internal_versions(soid, &out);
+      if (result >= 0) {
+        encode(out, osd_op.outdata);
+      }
+    }
+    break;
+
     case CEPH_OSD_OP_LIST_WATCHERS:
       ++ctx->num_read;
       {
@@ -16168,6 +16179,27 @@ int PrimaryLogPG::getattrs_maybe_cache(
   return r;
 }
 
+int PrimaryLogPG::get_internal_versions(const hobject_t& soid,
+                                        std::map<shard_id_t, eversion_t>* out) {
+  ObjectContextRef obc = get_object_context(soid, false);
+
+  if (!obc || !obc->obs.exists) {
+    return -ENOENT;
+  }
+
+  if (is_primary() && pool.info.is_erasure()) {
+    for (unsigned int i = 0; i < pool.info.get_size(); ++i) {
+      (*out)[shard_id_t(i)] = obc->obs.oi.version;
+    }
+    for (const auto& [shard, version] : obc->obs.oi.shard_versions) {
+      out->at(shard) = version;
+    }
+  } else {
+    (*out)[pg_whoami.shard] = obc->obs.oi.version;
+  }
+  return 0;
+}
+
 bool PrimaryLogPG::check_failsafe_full() {
     return osd->check_failsafe_full(get_dpp());
 }
index 8e0b29f72bccf35ea7ad181f83528c79b97bbec4..d094d4f60ad99b5411471484913b21cc1ed86aab 100644 (file)
@@ -2021,6 +2021,8 @@ public:
   int getattrs_maybe_cache(
     ObjectContextRef obc,
     std::map<std::string, ceph::buffer::list, std::less<>> *out);
+  int get_internal_versions(const hobject_t& soid,
+                            std::map<shard_id_t, eversion_t>* out);
 
 public:
   void set_dynamic_perf_stats_queries(
index a40ba465fcc7a037824c64122219223df94bab3c..d547f8469ce3047adeb085fcc7ec7002ee9e328f 100644 (file)
@@ -1510,6 +1510,14 @@ struct ObjectOperation {
     osd_op.op.assert_ver.ver = ver;
   }
 
+  void get_internal_versions(boost::system::error_code* ec,
+               buffer::list *pbl) {
+       ceph::buffer::list bl;
+       add_op(CEPH_OSD_OP_GET_INTERNAL_VERSIONS);
+       out_bl.back() = pbl;
+       out_ec.back() = ec;
+  }
+
   void cmpxattr(const char *name, const ceph::buffer::list& val,
                int op, int mode) {
     add_xattr(CEPH_OSD_OP_CMPXATTR, name, val);