From: Alex Ainscow Date: Thu, 5 Feb 2026 13:16:25 +0000 (+0000) Subject: osdc: Add FORCE and FAIL_ON_EAGAIN flags. X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=a9f3eacc3754e7855fa9b9e9ebc38038ee13c68d;p=ceph-ci.git osdc: Add FORCE and FAIL_ON_EAGAIN flags. Previously, the lower levels of Objecter would potentially redrive ops to different OSDs when the map changed or the OSD returned -EAGAIN. These flags will be used to change this behaviour: * FORCE_OSD means that the OSD is fixed and cannot be changed. * FAIL_ON_EAGAIN means that rather than redriving, the OP should be failed (to splitops) Signed-off-by: Alex Ainscow --- diff --git a/src/include/rados.h b/src/include/rados.h index 23529521411..6e6918cbf4c 100644 --- a/src/include/rados.h +++ b/src/include/rados.h @@ -483,7 +483,9 @@ enum { CEPH_OSD_FLAG_IGNORE_REDIRECT = 0x2000000, /* ignore redirection */ CEPH_OSD_FLAG_RETURNVEC = 0x4000000, /* allow overall result >= 0, and return >= 0 and buffer for each op in opvec */ CEPH_OSD_FLAG_SUPPORTSPOOLEIO = 0x8000000, /* client understands pool EIO flag */ - CEPH_OSD_FLAG_EC_DIRECT_READ = 0x10000000, /* Erasure code doing a partial read direct to OSD. */ + CEPH_OSD_FLAG_EC_DIRECT_READ = 0x10000000, /* Erasure code doing a partial read direct to OSD. */ + CEPH_OSD_FLAG_FAIL_ON_EAGAIN = 0x20000000, /* -EAGAIN will not retry, but fail IO. */ + CEPH_OSD_FLAG_FORCE_OSD = 0x40000000, /* osd field contains a forced target. */ }; // Indicates an IO which is direct-to-OSD and may not be on the primary. 
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index 18d76a5cd48..3e9554992c5 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -1618,7 +1618,11 @@ void Objecter::_check_op_pool_dne(Op *op, std::unique_lock *s << " dne" << dendl; if (op->has_completion()) { num_in_flight--; - op->complete(make_error_code(osdc_errc::pool_dne), -ENOENT, service.get_executor()); + // If FORCE_OSD is set, the forced OSD doesn't exist in the current map. + // This may be transient (OSD temporarily down) or permanent (OSD removed). + // Return -EAGAIN instead of -ENOENT to allow caller to retry. + int rc = (op->target.flags & CEPH_OSD_FLAG_FORCE_OSD) ? -EAGAIN : -ENOENT; + op->complete(make_error_code(osdc_errc::pool_dne), rc, service.get_executor()); } OSDSession *s = op->session; @@ -3120,18 +3124,40 @@ int Objecter::_calc_target(op_target_t *t, bool any_change) t->pg_num_mask = pg_num_mask; t->pg_num_pending = pg_num_pending; spg_t spgid(actual_pgid); - if (t->force_shard) { - t->osd = t->acting[int(*t->force_shard)]; - // In some redrive scenarios, the acting set can change. Fail the IO - // and retry. - if (!osdmap->exists(t->osd)) { - t->osd = -1; - return RECALC_OP_TARGET_POOL_DNE; + if (t->flags & CEPH_OSD_FLAG_FORCE_OSD) { + // In some redrive scenarios, the acting set can change. If the forced + // OSD doesn't exist in the acting set (e.g., it disappeared from the + // upmap), we need to handle it appropriately. + bool osd_in_acting = false; + for (auto acting_osd : t->acting) { + if (acting_osd == t->osd) { + osd_in_acting = true; + break; + } } - if (pi->is_erasure()) { - spgid.reset_shard(osdmap->pgtemp_undo_primaryfirst(*pi, actual_pgid, *t->force_shard)); + if (!osd_in_acting) { + // If FAIL_ON_EAGAIN is set, we must not failover - the caller expects + // -EAGAIN to be returned. Otherwise, clear the direct read flags and + // redrive to the primary OSD (similar to what happens when we get -EAGAIN). 
+ if (t->flags & CEPH_OSD_FLAG_FAIL_ON_EAGAIN) { + ldout(cct, 10) << __func__ << " forced osd." << t->osd + << " not in acting set " << t->acting + << ", FAIL_ON_EAGAIN set, returning POOL_DNE to trigger -EAGAIN" + << dendl; + t->osd = -1; + return RECALC_OP_TARGET_POOL_DNE; + } else { + ldout(cct, 10) << __func__ << " forced osd." << t->osd + << " not in acting set " << t->acting + << ", clearing direct read flags and redriving to primary" + << dendl; + // Clear all direct read flags (EC_DIRECT_READ, BALANCE_READS, LOCALIZE_READS) + t->flags &= ~CEPH_OSD_FLAGS_DIRECT_READ; + t->flags &= ~CEPH_OSD_FLAG_FORCE_OSD; + } } - } else if (pi->is_erasure()) { + } + if (pi->is_erasure()) { // Optimized EC pools need to be careful when calculating the shard // because an OSD may have multiple shards and the primary shard // might not be the first one in the acting set. The lookup @@ -3160,7 +3186,7 @@ int Objecter::_calc_target(op_target_t *t, bool any_change) << " acting " << t->acting << " primary " << acting_primary << dendl; t->used_replica = false; - if (!t->force_shard && (t->flags & (CEPH_OSD_FLAG_BALANCE_READS | + if (!(t->flags & CEPH_OSD_FLAG_FORCE_OSD) && (t->flags & (CEPH_OSD_FLAG_BALANCE_READS | CEPH_OSD_FLAG_LOCALIZE_READS)) && !is_write && pi->is_replicated() && t->acting.size() > 1) { int osd; @@ -3197,7 +3223,7 @@ int Objecter::_calc_target(op_target_t *t, bool any_change) osd = t->acting[best]; } t->osd = osd; - } else if (!t->force_shard) { + } else if (!(t->flags & CEPH_OSD_FLAG_FORCE_OSD)) { t->osd = acting_primary; } } @@ -3760,7 +3786,7 @@ void Objecter::handle_osd_op_reply(MOSDOpReply *m) } } - if (rc == -EAGAIN && !op->target.force_shard) { + if (rc == -EAGAIN && (op->target.flags & CEPH_OSD_FLAG_FAIL_ON_EAGAIN) == 0) { ldout(cct, 7) << " got -EAGAIN, resubmitting" << dendl; if (op->has_completion()) num_in_flight--; @@ -3768,8 +3794,12 @@ void Objecter::handle_osd_op_reply(MOSDOpReply *m) sl.unlock(); op->tid = 0; - op->target.flags &= 
~(CEPH_OSD_FLAG_BALANCE_READS | - CEPH_OSD_FLAG_LOCALIZE_READS); + op->target.flags &= ~CEPH_OSD_FLAGS_DIRECT_READ; + + // If FAIL_ON_EAGAIN is not set and FORCE_OSD is set, the implication is + // that it is safe to redrive the IO to the primary, without any balanced + // read flag. + op->target.flags &= ~CEPH_OSD_FLAG_FORCE_OSD; op->target.pgid = pg_t(); _op_submit(op, sul, NULL); m->put(); diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index d547f8469ce..4fbcb87763e 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -1891,7 +1891,6 @@ public: bool paused = false; int osd = -1; ///< the final target osd, or -1 - std::optional force_shard; // If set, only this shard may be used. epoch_t last_force_resend = 0;