From ff9a32d94bed8a03942b2b4e50455b3d7c5c892c Mon Sep 17 00:00:00 2001 From: =?utf8?q?Piotr=20Da=C5=82ek?= Date: Fri, 24 Feb 2017 08:45:30 +0100 Subject: [PATCH] osd, pg: implement force_recovery MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This commit implements the MOSDForceRecovery handler along with all required code to have PGs processed in desired order (PGs with force_recovery/force_backfill first). Obviously it's not going to work cluster-wide and OSDs that are not affected - but may affect affected OSDs - may cut into PG recovery queue and cause PGs with force_* flags to get recovered or backfilled later than expected, but still way earlier than without it. Signed-off-by: Piotr Dałek --- src/osd/OSD.cc | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/osd/OSD.h | 3 ++ src/osd/PG.cc | 15 +++++++++- src/osd/PG.h | 1 + 4 files changed, 92 insertions(+), 1 deletion(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 7a494f8e18d78..279e6e6a3db58 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -88,6 +88,7 @@ #include "messages/MOSDPGBackfill.h" #include "messages/MBackfillReserve.h" #include "messages/MRecoveryReserve.h" +#include "messages/MOSDForceRecovery.h" #include "messages/MOSDECSubOpWrite.h" #include "messages/MOSDECSubOpWriteReply.h" #include "messages/MOSDECSubOpRead.h" @@ -7063,6 +7064,10 @@ void OSD::_dispatch(Message *m) handle_scrub(static_cast(m)); break; + case MSG_OSD_FORCE_RECOVERY: + handle_force_recovery(m); + break; + // -- need OSDMap -- case MSG_OSD_PG_CREATE: @@ -8870,6 +8875,31 @@ void OSD::handle_pg_recovery_reserve(OpRequestRef op) pg->unlock(); } +void OSD::handle_force_recovery(Message *m) +{ + MOSDForceRecovery *msg = static_cast(m); + assert(msg->get_type() == MSG_OSD_FORCE_RECOVERY); + RWLock::RLocker l(pg_map_lock); + + vector local_pgs; + local_pgs.reserve(msg->forced_pgs.size()); + + for (auto& i : msg->forced_pgs) { + spg_t locpg; + if (osdmap->get_primary_shard(i, &locpg)) { + auto pg_map_entry = pg_map.find(locpg); + if (pg_map_entry != pg_map.end()) { + local_pgs.push_back(pg_map_entry->second); + } + } + } + + if (local_pgs.size()) { + service.adjust_pg_priorities(local_pgs, msg->options); + } + + msg->put(); +} /** PGQuery * from primary to replica | stray @@ -9124,6 +9154,50 @@ bool OSDService::_recover_now(uint64_t *available_pushes) return true; } + +void OSDService::adjust_pg_priorities(vector pgs, int newflags) +{ + if (!pgs.size() || !(newflags & (OFR_BACKFILL | OFR_RECOVERY))) + return; + int newstate = 0; + + Mutex::Locker l(recovery_lock); + + if (newflags & OFR_BACKFILL) { + newstate = PG_STATE_FORCED_BACKFILL; + } else if (newflags & OFR_RECOVERY) { + newstate = PG_STATE_FORCED_RECOVERY; + } + + // debug output here may get large, don't generate it if debug level is below + // 10 and use abbreviated pg ids otherwise + if ((cct)->_conf->subsys.should_gather(ceph_subsys_osd, 10)) { + stringstream ss; + + for (auto& i : pgs) { + ss << i->get_pgid() << " "; + } + + dout(10) << __func__ << " working on " << ss.str() << dendl; + } + + if (newflags & OFR_CANCEL) { + for (auto& i : pgs) { + i->change_recovery_force_mode(newstate, true); + } + } else { + for (auto& i : pgs) { + // make sure the PG is in correct state before forcing backfill or recovery, or + // else we'll make PG keeping FORCE_* flag forever, requiring osds restart + // or forcing somehow recovery/backfill. + int pgstate = i->get_state(); + if ( ((newstate == PG_STATE_FORCED_RECOVERY) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_RECOVERY_WAIT | PG_STATE_RECOVERING))) || + ((newstate == PG_STATE_FORCED_BACKFILL) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILL))) ) + i->change_recovery_force_mode(newstate, false); + } + } +} + void OSD::do_recovery( PG *pg, epoch_t queued, uint64_t reserved_pushes, ThreadPool::TPHandle &handle) diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 6da927e1affd7..38942c44aac47 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -962,6 +962,7 @@ public: _queue_for_recovery(make_pair(queued, pg), reserved_pushes); } + void adjust_pg_priorities(vector pgs, int newflags); // osd map cache (past osd maps) Mutex map_cache_lock; @@ -2170,6 +2171,8 @@ protected: void handle_pg_backfill_reserve(OpRequestRef op); void handle_pg_recovery_reserve(OpRequestRef op); + void handle_force_recovery(Message *m); + void handle_pg_remove(OpRequestRef op); void _remove_pg(PG *pg); diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 49eb61498b044..eb8dab61cd034 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -2037,6 +2037,19 @@ void PG::mark_clean() kick_snap_trim(); } +void PG::change_recovery_force_mode(int new_mode, bool clear) +{ + lock(true); + if (clear) { + state_clear(new_mode); + } else { + state_set(new_mode); + } + publish_stats_to_osd(); + + unlock(); +} + inline int PG::clamp_recovery_priority(int priority) { static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range"); @@ -2063,7 +2076,7 @@ unsigned PG::get_recovery_priority() pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &ret); ret = clamp_recovery_priority(OSD_RECOVERY_PRIORITY_BASE + ret); } - + dout(20) << __func__ << " recovery priority for " << *this << " is " << ret << ", state is " << state << dendl; return static_cast(ret); } diff --git a/src/osd/PG.h b/src/osd/PG.h index d986f3f473c58..924c930dd83d9 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -982,6 +982,7 @@ public: unsigned get_backfill_priority(); void mark_clean(); ///< mark an active pg clean + void change_recovery_force_mode(int new_mode, bool clear); /// return [start,end) bounds for required past_intervals static pair get_required_past_interval_bounds( -- 2.39.5