From: Sage Weil Date: Sun, 17 Sep 2017 22:29:16 +0000 (-0500) Subject: osd/PG: move more recovery logic into PG X-Git-Tag: v13.0.1~634^2~31 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=6493f8ec5a1bc610e82edc4a73d0aac2862dfd2d;p=ceph.git osd/PG: move more recovery logic into PG I suspect we eventually want to move the create_context and dispatch_context into OSDService (if it isn't there already) and move even more of this logic into PG. Signed-off-by: Sage Weil --- diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 342051a1c6a0..de71ab9855f9 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -8859,55 +8859,19 @@ void OSD::do_recovery( dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl; #endif - bool more = pg->start_recovery_ops(reserved_pushes, handle, &started); + bool wip = pg->start_recovery_ops(reserved_pushes, handle, &started); dout(10) << "do_recovery started " << started << "/" << reserved_pushes << " on " << *pg << dendl; // If no recovery op is started, don't bother to manipulate the RecoveryCtx - if (!started && (more || !pg->have_unfound())) { + if (!started && (wip || !pg->have_unfound())) { goto out; } PG::RecoveryCtx rctx = create_context(); rctx.handle = &handle; - - /* - * if we couldn't start any recovery ops and things are still - * unfound, see if we can discover more missing object locations. - * It may be that our initial locations were bad and we errored - * out while trying to pull. 
- */ - if (!more && pg->have_unfound()) { - pg->discover_all_missing(*rctx.query_map); - if (rctx.query_map->empty()) { - string action; - if (pg->state_test(PG_STATE_BACKFILLING)) { - auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt( - queued, - queued, - PG::DeferBackfill(cct->_conf->osd_recovery_retry_interval))); - pg->queue_peering_event(evt); - action = "in backfill"; - } else if (pg->state_test(PG_STATE_RECOVERING)) { - auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt( - queued, - queued, - PG::DeferRecovery(cct->_conf->osd_recovery_retry_interval))); - pg->queue_peering_event(evt); - action = "in recovery"; - } else { - action = "already out of recovery/backfill"; - } - dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl; - } else { - dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl; - pg->queue_recovery(); - } - } - - pg->write_if_dirty(*rctx.transaction); - OSDMapRef curmap = pg->get_osdmap(); - dispatch_context(rctx, pg, curmap); + pg->stuck_on_unfound(queued, wip, &rctx); + dispatch_context(rctx, pg, pg->get_osdmap()); } out: diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 36058c439b64..05779f5eaf36 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -5722,6 +5722,46 @@ void PG::queue_query(epoch_t msg_epoch, MQuery(from, q, query_epoch)))); } +void PG::stuck_on_unfound(epoch_t queued, bool wip, RecoveryCtx *rctx) +{ + /* + * if we couldn't start any recovery ops and things are still + * unfound, see if we can discover more missing object locations. + * It may be that our initial locations were bad and we errored + * out while trying to pull. 
+ */ + if (!wip && have_unfound()) { + discover_all_missing(*rctx->query_map); + if (rctx->query_map->empty()) { + string action; + if (state_test(PG_STATE_BACKFILLING)) { + auto evt = PG::CephPeeringEvtRef( + new PG::CephPeeringEvt( + queued, + queued, + PG::DeferBackfill(cct->_conf->osd_recovery_retry_interval))); + queue_peering_event(evt); + action = "in backfill"; + } else if (state_test(PG_STATE_RECOVERING)) { + auto evt = PG::CephPeeringEvtRef( + new PG::CephPeeringEvt( + queued, + queued, + PG::DeferRecovery(cct->_conf->osd_recovery_retry_interval))); + queue_peering_event(evt); + action = "in recovery"; + } else { + action = "already out of recovery/backfill"; + } + dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl; + } else { + dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl; + queue_recovery(); + } + } + write_if_dirty(*rctx->transaction); +} + void PG::handle_advance_map( OSDMapRef osdmap, OSDMapRef lastmap, vector<int>& newup, int up_primary, diff --git a/src/osd/PG.h b/src/osd/PG.h index 4c64a0e60010..fe4571477cab 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -410,6 +410,18 @@ public: void handle_pg_trim(epoch_t epoch, int from, shard_id_t shard, eversion_t trim_to); + /** + * @param ops_begun returns how many recovery ops the function started + * @returns true if any useful work was accomplished; false otherwise + */ + virtual bool start_recovery_ops( + uint64_t max, + ThreadPool::TPHandle &handle, + uint64_t *ops_begun) = 0; + + // more work after the above, but with a RecoveryCtx + void stuck_on_unfound(epoch_t queued, bool wip, RecoveryCtx *rctx); + virtual void get_watchers(std::list<obj_watch_item_t> *ls) = 0; void dump_pgstate_history(Formatter *f); @@ -1342,15 +1354,6 @@ protected: virtual void check_local() = 0; - /** - * @param ops_begun returns how many recovery ops the function started - * @returns true if any useful work was accomplished; false otherwise - */ - virtual 
bool start_recovery_ops( - uint64_t max, - ThreadPool::TPHandle &handle, - uint64_t *ops_begun) = 0; - void purge_strays(); void update_heartbeat_peers();