From: David Zafman Date: Mon, 24 Apr 2017 23:54:06 +0000 (-0700) Subject: osd: On EIO from read recover the primary replica from another copy X-Git-Tag: ses5-milestone8~1^2~19^2~13 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=91a0f5cd786686021c7b896b9cea1e292f18a325;p=ceph.git osd: On EIO from read recover the primary replica from another copy Signed-off-by: David Zafman --- diff --git a/src/osd/PG.cc b/src/osd/PG.cc index e1eb150d81e4..39b373fd478a 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -6853,7 +6853,7 @@ PG::RecoveryState::Clean::Clean(my_context ctx) pg->share_pg_info(); pg->publish_stats_to_osd(); - + pg->requeue_ops(pg->waiting_for_clean_to_primary_repair); } void PG::RecoveryState::Clean::exit() diff --git a/src/osd/PG.h b/src/osd/PG.h index 6213166d7407..8b3fef6d3965 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -256,6 +256,7 @@ protected: CephContext *cct; OSDriver osdriver; SnapMapper snap_mapper; + bool eio_errors_to_process = false; virtual PGBackend *get_pgbackend() = 0; public: @@ -904,6 +905,7 @@ protected: list waiting_for_scrub; list waiting_for_cache_not_full; + list waiting_for_clean_to_primary_repair; map> waiting_for_unreadable_object, waiting_for_degraded_object, waiting_for_blocked_object; @@ -1855,6 +1857,7 @@ public: struct Recovered : boost::statechart::state< Recovered, Active >, NamedState { typedef boost::mpl::list< boost::statechart::transition< GoClean, Clean >, + boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >, boost::statechart::custom_reaction< AllReplicasActivated > > reactions; explicit Recovered(my_context ctx); diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 29354a83badd..710bcb54b71a 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -619,6 +619,15 @@ void PrimaryLogPG::block_write_on_full_cache( op->mark_delayed("waiting for cache not full"); } +void PrimaryLogPG::block_for_clean( + const hobject_t& oid, OpRequestRef op) +{ + dout(20) << __func__ << ": blocking object " << oid + << " on primary repair" << dendl; + waiting_for_clean_to_primary_repair.push_back(op); + op->mark_delayed("waiting for clean to repair"); +} + void PrimaryLogPG::block_write_on_snap_rollback( const hobject_t& oid, ObjectContextRef obc, OpRequestRef op) { @@ -4752,6 +4761,9 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector& ops) } else { int r = pgbackend->objects_read_sync( soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata); + if (r == -EIO) { + r = rep_repair_primary_object(soid, ctx->op); + } if (r >= 0) op.extent.length = r; else { @@ -4884,6 +4896,9 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector& ops) bufferlist t; uint64_t len = miter->first - last; r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t); + if (r == -EIO) { + r = rep_repair_primary_object(soid, ctx->op); + } if (r < 0) { osd->clog->error() << coll << " " << soid << " sparse-read failed to read: " @@ -10649,6 +10664,7 @@ void PrimaryLogPG::on_activate() RequestBackfill()))); } else { dout(10) << "activate all replicas clean, no recovery" << dendl; + eio_errors_to_process = false; queue_peering_event( CephPeeringEvtRef( std::make_shared( @@ -11069,6 +11085,7 @@ bool PrimaryLogPG::start_recovery_ops( RequestBackfill()))); } else { dout(10) << "recovery done, no backfill" << dendl; + eio_errors_to_process = false; queue_peering_event( CephPeeringEvtRef( std::make_shared( @@ -11079,6 +11096,7 @@ bool PrimaryLogPG::start_recovery_ops( } else { // backfilling state_clear(PG_STATE_BACKFILL); dout(10) << "recovery done, backfill done" << dendl; + eio_errors_to_process = false; queue_peering_event( CephPeeringEvtRef( std::make_shared( @@ -13820,6 +13838,77 @@ bool PrimaryLogPG::check_osdmap_full(const set &missing_on) return osd->check_osdmap_full(missing_on); } +int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpRequestRef op) +{ + // Only supports replicated pools + assert(!pool.info.require_rollback()); + assert(is_primary()); + + // Get non-primary shards + list op_shards; + for (auto&& i : actingset) { + if (i == pg_whoami) continue; // Exclude self (primary) + op_shards.push_back(i); + } + if (op_shards.empty()) { + dout(0) << __func__ << " No other replicas available for " << soid << dendl; + return -EIO; + } + + dout(10) << __func__ << " " << soid + << " peers osd.{" << op_shards << "}" << dendl; + + if (!is_clean()) { + block_for_clean(soid, op); + return -EAGAIN; + } + + assert(!pg_log.get_missing().is_missing(soid)); + bufferlist bv; + int r = get_pgbackend()->objects_get_attr(soid, OI_ATTR, &bv); + if (r < 0) + return r; + object_info_t oi; + try { + bufferlist::iterator bliter = bv.begin(); + ::decode(oi, bliter); + } catch (...) { + dout(0) << __func__ << ": bad object_info_t: " << soid << dendl; + // XXX: Too bad I can't get the version to recover, so can't repair + return -EIO; + } + + pg_log.missing_add(soid, oi.version, eversion_t()); + + pg_log.set_last_requested(0); + + missing_loc.add_missing(soid, oi.version, eversion_t()); + for (auto &&i : op_shards) + missing_loc.add_location(soid, i); + + // Restart the op after object becomes readable again + waiting_for_unreadable_object[soid].push_back(op); + op->mark_delayed("waiting for missing object"); + + if (!eio_errors_to_process) { + eio_errors_to_process = true; + assert(is_clean()); + queue_peering_event( + CephPeeringEvtRef( + std::make_shared( + get_osdmap()->get_epoch(), + get_osdmap()->get_epoch(), + DoRecovery()))); + } else { + // A prior error must have already cleared clean state and queued recovery + // or a map change has triggered re-peering. + // Not inlining the recovery by calling maybe_kick_recovery(soid); + dout(5) << __func__<< ": Read error on " << soid << ", but already seen errors" << dendl; + } + + return -EAGAIN; +} + /*---SnapTrimmer Logging---*/ #undef dout_prefix #define dout_prefix *_dout << pg->gen_prefix() diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h index f6c1b97d8a36..304e751e9d28 100644 --- a/src/osd/PrimaryLogPG.h +++ b/src/osd/PrimaryLogPG.h @@ -1731,6 +1731,8 @@ public: void block_write_on_full_cache( const hobject_t& oid, OpRequestRef op); + void block_for_clean( + const hobject_t& oid, OpRequestRef op); void block_write_on_snap_rollback( const hobject_t& oid, ObjectContextRef obc, OpRequestRef op); void block_write_on_degraded_snap(const hobject_t& oid, OpRequestRef op); @@ -1763,6 +1765,7 @@ public: void on_shutdown() override; bool check_failsafe_full(ostream &ss) override; bool check_osdmap_full(const set &missing_on) override; + int rep_repair_primary_object(const hobject_t& soid, OpRequestRef op); // attr cache handling void setattr_maybe_cache( diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc index 6a63f5415f1c..8ef6d5db90a6 100644 --- a/src/osd/ReplicatedBackend.cc +++ b/src/osd/ReplicatedBackend.cc @@ -1592,6 +1592,7 @@ void ReplicatedBackend::prep_push( &new_progress, pop, &(pi.stat), cache_dont_need); + // XXX: What can we do here? assert(r == 0); pi.recovery_progress = new_progress; } @@ -2007,9 +2008,11 @@ int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info, p != out_op->data_included.end(); ++p) { bufferlist bit; - store->read(ch, ghobject_t(recovery_info.soid), + int r = store->read(ch, ghobject_t(recovery_info.soid), p.get_start(), p.get_len(), bit, cache_dont_need ? CEPH_OSD_OP_FLAG_FADVISE_DONTNEED: 0); + if (r < 0) + return r; if (p.get_len() != bit.length()) { dout(10) << " extent " << p.get_start() << "~" << p.get_len() << " is actually " << p.get_start() << "~" << bit.length() @@ -2086,6 +2089,7 @@ bool ReplicatedBackend::handle_push_reply( pi->recovery_info, pi->recovery_progress, &new_progress, reply, &(pi->stat)); + // XXX: What can we do here? assert(r == 0); pi->recovery_progress = new_progress; return true;