<< " dne" << dendl;
if (op->has_completion()) {
num_in_flight--;
- op->complete(make_error_code(osdc_errc::pool_dne), -ENOENT, service.get_executor());
+ // If FORCE_OSD is set, the forced OSD doesn't exist in the current map.
+ // This may be transient (OSD temporarily down) or permanent (OSD removed).
+ // Return -EAGAIN instead of -ENOENT to allow caller to retry.
+ int rc = (op->target.flags & CEPH_OSD_FLAG_FORCE_OSD) ? -EAGAIN : -ENOENT;
+ op->complete(make_error_code(osdc_errc::pool_dne), rc, service.get_executor());
}
OSDSession *s = op->session;
t->pg_num_mask = pg_num_mask;
t->pg_num_pending = pg_num_pending;
spg_t spgid(actual_pgid);
- if (t->force_shard) {
- t->osd = t->acting[int(*t->force_shard)];
- // In some redrive scenarios, the acting set can change. Fail the IO
- // and retry.
- if (!osdmap->exists(t->osd)) {
- t->osd = -1;
- return RECALC_OP_TARGET_POOL_DNE;
+ if (t->flags & CEPH_OSD_FLAG_FORCE_OSD) {
+ // In some redrive scenarios, the acting set can change. If the forced
+ // OSD doesn't exist in the acting set (e.g., it disappeared from the
+ // upmap), we need to handle it appropriately.
+ bool osd_in_acting = false;
+ for (auto acting_osd : t->acting) {
+ if (acting_osd == t->osd) {
+ osd_in_acting = true;
+ break;
+ }
}
- if (pi->is_erasure()) {
- spgid.reset_shard(osdmap->pgtemp_undo_primaryfirst(*pi, actual_pgid, *t->force_shard));
+ if (!osd_in_acting) {
+ // If FAIL_ON_EAGAIN is set, we must not failover - the caller expects
+ // -EAGAIN to be returned. Otherwise, clear the direct read flags and
+ // redrive to the primary OSD (similar to what happens when we get -EAGAIN).
+ if (t->flags & CEPH_OSD_FLAG_FAIL_ON_EAGAIN) {
+ ldout(cct, 10) << __func__ << " forced osd." << t->osd
+ << " not in acting set " << t->acting
+ << ", FAIL_ON_EAGAIN set, returning POOL_DNE to trigger -EAGAIN"
+ << dendl;
+ t->osd = -1;
+ return RECALC_OP_TARGET_POOL_DNE;
+ } else {
+ ldout(cct, 10) << __func__ << " forced osd." << t->osd
+ << " not in acting set " << t->acting
+ << ", clearing direct read flags and redriving to primary"
+ << dendl;
+ // Clear all direct read flags (EC_DIRECT_READ, BALANCE_READS, LOCALIZE_READS)
+ t->flags &= ~CEPH_OSD_FLAGS_DIRECT_READ;
+ t->flags &= ~CEPH_OSD_FLAG_FORCE_OSD;
+ }
}
- } else if (pi->is_erasure()) {
+ }
+ if (pi->is_erasure()) {
// Optimized EC pools need to be careful when calculating the shard
// because an OSD may have multiple shards and the primary shard
// might not be the first one in the acting set. The lookup
<< " acting " << t->acting
<< " primary " << acting_primary << dendl;
t->used_replica = false;
- if (!t->force_shard && (t->flags & (CEPH_OSD_FLAG_BALANCE_READS |
+ if (!(t->flags & CEPH_OSD_FLAG_FORCE_OSD) && (t->flags & (CEPH_OSD_FLAG_BALANCE_READS |
CEPH_OSD_FLAG_LOCALIZE_READS)) &&
!is_write && pi->is_replicated() && t->acting.size() > 1) {
int osd;
osd = t->acting[best];
}
t->osd = osd;
- } else if (!t->force_shard) {
+ } else if (!(t->flags & CEPH_OSD_FLAG_FORCE_OSD)) {
t->osd = acting_primary;
}
}
}
}
- if (rc == -EAGAIN && !op->target.force_shard) {
+ if (rc == -EAGAIN && (op->target.flags & CEPH_OSD_FLAG_FAIL_ON_EAGAIN) == 0) {
ldout(cct, 7) << " got -EAGAIN, resubmitting" << dendl;
if (op->has_completion())
num_in_flight--;
sl.unlock();
op->tid = 0;
- op->target.flags &= ~(CEPH_OSD_FLAG_BALANCE_READS |
- CEPH_OSD_FLAG_LOCALIZE_READS);
+ op->target.flags &= ~CEPH_OSD_FLAGS_DIRECT_READ;
+
+ // If FAIL_ON_EAGAIN is not set and FORCE_OSD is set, the implication is
+ // that it is safe to redrive the IO to the primary, without any direct
+ // read flags.
+ op->target.flags &= ~CEPH_OSD_FLAG_FORCE_OSD;
op->target.pgid = pg_t();
_op_submit(op, sul, NULL);
m->put();