osdc: Add FORCE_OSD and FAIL_ON_EAGAIN flags.

author Alex Ainscow <aainscow@uk.ibm.com>

Thu, 5 Feb 2026 13:16:25 +0000 (13:16 +0000)

committer Alex Ainscow <aainscow@uk.ibm.com>

Fri, 6 Feb 2026 10:31:30 +0000 (10:31 +0000)
author Alex Ainscow <aainscow@uk.ibm.com>
Thu, 5 Feb 2026 13:16:25 +0000 (13:16 +0000)
committer Alex Ainscow <aainscow@uk.ibm.com>
Fri, 6 Feb 2026 10:31:30 +0000 (10:31 +0000)
diff --git a/src/include/rados.h b/src/include/rados.h

index 23529521411016b21897ba1634be3ab1cac2bcd2..6e6918cbf4c106b4cade19f7588d2338fa667e71 100644 (file)
--- a/src/include/rados.h
+++ b/src/include/rados.h
@@ -483,7 +483,9 @@ enum {
         CEPH_OSD_FLAG_IGNORE_REDIRECT = 0x2000000,  /* ignore redirection */
         CEPH_OSD_FLAG_RETURNVEC = 0x4000000, /* allow overall result >= 0, and return >= 0 and buffer for each op in opvec */
         CEPH_OSD_FLAG_SUPPORTSPOOLEIO = 0x8000000,   /* client understands pool EIO flag */
-        CEPH_OSD_FLAG_EC_DIRECT_READ = 0x10000000,  /* Erasure code doing a partial read direct to OSD. */
+       CEPH_OSD_FLAG_EC_DIRECT_READ = 0x10000000,  /* Erasure code doing a partial read direct to OSD. */
+       CEPH_OSD_FLAG_FAIL_ON_EAGAIN = 0x20000000,  /* -EAGAIN will not retry, but fail IO. */
+       CEPH_OSD_FLAG_FORCE_OSD = 0x40000000,  /* osd field contains a forced target. */
  };
  
  // Indicates an IO which is direct-to-OSD and may not be on the primary.
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc

index 18d76a5cd48e7a461fca176ae6af403ad3cf619d..3e9554992c50c8249877b9a1ac6f9af8a5c2666d 100644 (file)
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -1618,7 +1618,11 @@ void Objecter::_check_op_pool_dne(Op *op, std::unique_lock<std::shared_mutex> *s
                      << " dne" << dendl;
        if (op->has_completion()) {
         num_in_flight--;
-       op->complete(make_error_code(osdc_errc::pool_dne), -ENOENT, service.get_executor());
+        // If FORCE_OSD is set, the forced OSD may be missing from the acting set.
+        // This may be transient (OSD temporarily down) or permanent (OSD removed).
+        // Return -EAGAIN instead of -ENOENT to allow the caller to retry.
+        int rc = (op->target.flags & CEPH_OSD_FLAG_FORCE_OSD) ? -EAGAIN : -ENOENT;
+       op->complete(make_error_code(osdc_errc::pool_dne), rc, service.get_executor());
        }
  
        OSDSession *s = op->session;
@@ -3120,18 +3124,40 @@ int Objecter::_calc_target(op_target_t *t, bool any_change)
      t->pg_num_mask = pg_num_mask;
      t->pg_num_pending = pg_num_pending;
      spg_t spgid(actual_pgid);
-    if (t->force_shard) {
-      t->osd = t->acting[int(*t->force_shard)];
-      // In some redrive scenarios, the acting set can change. Fail the IO
-      // and retry.
-      if (!osdmap->exists(t->osd)) {
-        t->osd = -1;
-        return RECALC_OP_TARGET_POOL_DNE;
+    if (t->flags & CEPH_OSD_FLAG_FORCE_OSD) {
+      // In some redrive scenarios, the acting set can change. If the forced
+      // OSD doesn't exist in the acting set (e.g., it disappeared from the
+      // upmap), we need to handle it appropriately.
+      bool osd_in_acting = false;
+      for (auto acting_osd : t->acting) {
+        if (acting_osd == t->osd) {
+          osd_in_acting = true;
+          break;
+        }
        }
-      if (pi->is_erasure()) {
-        spgid.reset_shard(osdmap->pgtemp_undo_primaryfirst(*pi, actual_pgid, *t->force_shard));
+      if (!osd_in_acting) {
+        // If FAIL_ON_EAGAIN is set, we must not failover - the caller expects
+        // -EAGAIN to be returned. Otherwise, clear the direct read flags and
+        // redrive to the primary OSD (similar to what happens when we get -EAGAIN).
+        if (t->flags & CEPH_OSD_FLAG_FAIL_ON_EAGAIN) {
+          ldout(cct, 10) << __func__ << " forced osd." << t->osd
+                         << " not in acting set " << t->acting
+                         << ", FAIL_ON_EAGAIN set, returning POOL_DNE to trigger -EAGAIN"
+                         << dendl;
+          t->osd = -1;
+          return RECALC_OP_TARGET_POOL_DNE;
+        } else {
+          ldout(cct, 10) << __func__ << " forced osd." << t->osd
+                         << " not in acting set " << t->acting
+                         << ", clearing direct read flags and redriving to primary"
+                         << dendl;
+          // Clear all direct read flags (EC_DIRECT_READ, BALANCE_READS, LOCALIZE_READS)
+          t->flags &= ~CEPH_OSD_FLAGS_DIRECT_READ;
+          t->flags &= ~CEPH_OSD_FLAG_FORCE_OSD;
+        }
        }
-    } else if (pi->is_erasure()) {
+    }
+    if (pi->is_erasure()) {
        // Optimized EC pools need to be careful when calculating the shard
        // because an OSD may have multiple shards and the primary shard
        // might not be the first one in the acting set. The lookup
@@ -3160,7 +3186,7 @@ int Objecter::_calc_target(op_target_t *t, bool any_change)
                    << " acting " << t->acting
                    << " primary " << acting_primary << dendl;
      t->used_replica = false;
-    if (!t->force_shard && (t->flags & (CEPH_OSD_FLAG_BALANCE_READS |
+    if (!(t->flags & CEPH_OSD_FLAG_FORCE_OSD) && (t->flags & (CEPH_OSD_FLAG_BALANCE_READS |
                       CEPH_OSD_FLAG_LOCALIZE_READS)) &&
          !is_write && pi->is_replicated() && t->acting.size() > 1) {
        int osd;
@@ -3197,7 +3223,7 @@ int Objecter::_calc_target(op_target_t *t, bool any_change)
         osd = t->acting[best];
        }
        t->osd = osd;
-    } else if (!t->force_shard) {
+    } else if (!(t->flags & CEPH_OSD_FLAG_FORCE_OSD)) {
        t->osd = acting_primary;
      }
    }
@@ -3760,7 +3786,7 @@ void Objecter::handle_osd_op_reply(MOSDOpReply *m)
      }
    }
  
-  if (rc == -EAGAIN && !op->target.force_shard) {
+  if (rc == -EAGAIN && (op->target.flags & CEPH_OSD_FLAG_FAIL_ON_EAGAIN) == 0) {
      ldout(cct, 7) << " got -EAGAIN, resubmitting" << dendl;
      if (op->has_completion())
        num_in_flight--;
@@ -3768,8 +3794,12 @@ void Objecter::handle_osd_op_reply(MOSDOpReply *m)
      sl.unlock();
  
      op->tid = 0;
-    op->target.flags &= ~(CEPH_OSD_FLAG_BALANCE_READS |
-                         CEPH_OSD_FLAG_LOCALIZE_READS);
+    op->target.flags &= ~CEPH_OSD_FLAGS_DIRECT_READ;
+
+    // If FAIL_ON_EAGAIN is not set and FORCE_OSD is set, the implication is
+    // that it is safe to redrive the IO to the primary, without any balanced
+    // read flag.
+    op->target.flags &= ~CEPH_OSD_FLAG_FORCE_OSD;
      op->target.pgid = pg_t();
      _op_submit(op, sul, NULL);
      m->put();
diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h

index d547f8469ce3047adeb085fcc7ec7002ee9e328f..4fbcb87763ea3b3edc6df76ee09a519ca132e78d 100644 (file)
--- a/src/osdc/Objecter.h
+++ b/src/osdc/Objecter.h
@@ -1891,7 +1891,6 @@ public:
      bool paused = false;
  
      int osd = -1;      ///< the final target osd, or -1
-    std::optional<shard_id_t> force_shard; // If set, only this shard may be used.
  
      epoch_t last_force_resend = 0;
author	Alex Ainscow <aainscow@uk.ibm.com>
	Thu, 5 Feb 2026 13:16:25 +0000 (13:16 +0000)
committer	Alex Ainscow <aainscow@uk.ibm.com>
	Fri, 6 Feb 2026 10:31:30 +0000 (10:31 +0000)
src/include/rados.h		patch \| blob \| history
src/osdc/Objecter.cc		patch \| blob \| history
src/osdc/Objecter.h		patch \| blob \| history