From: xie xingguo
Date: Wed, 20 Feb 2019 10:40:02 +0000 (+0800)
Subject: mgr: add per pool force-recovery/backfill commands
X-Git-Tag: v14.1.1~158^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F26560%2Fhead;p=ceph.git

mgr: add per pool force-recovery/backfill commands

For clusters with multiple storage pools sharing the same devices, it makes
much more sense to offer per-pool commands, so that high-priority pools,
e.g., pools hosting data of more importance than others, can be brought
back to normal quickly.

Fixes: http://tracker.ceph.com/issues/38456
Signed-off-by: xie xingguo
---

diff --git a/doc/rados/operations/placement-groups.rst b/doc/rados/operations/placement-groups.rst
index f1f3c9838d3f..63048cdd735a 100644
--- a/doc/rados/operations/placement-groups.rst
+++ b/doc/rados/operations/placement-groups.rst
@@ -564,6 +564,10 @@ or mismatched, and their contents are consistent.  Assuming the replicas all
 match, a final semantic sweep ensures that all of the snapshot-related object
 metadata is consistent. Errors are reported via logs.
 
+To scrub all placement groups from a specific pool, execute the following::
+
+      ceph osd pool scrub {pool-name}
+
 Prioritize backfill/recovery of a Placement Group(s)
 ====================================================
 
@@ -595,6 +599,32 @@ group, only those that are still queued.
 
 The "force" flag is cleared automatically after recovery or backfill of group
 is done.
 
+Similarly, you may use the following commands to force Ceph to perform recovery
+or backfill on all placement groups from a specified pool first::
+
+      ceph osd pool force-recovery {pool-name}
+      ceph osd pool force-backfill {pool-name}
+
+or::
+
+      ceph osd pool cancel-force-recovery {pool-name}
+      ceph osd pool cancel-force-backfill {pool-name}
+
+to restore the default recovery or backfill priority if you change your mind.
+
+Note that these commands could possibly break the ordering of Ceph's internal
+priority computations, so use them with caution!
+In particular, if you have multiple pools that currently share the same
+underlying OSDs, and some pools hold data more important than others, we
+recommend you use the following command to re-arrange the recovery/backfill
+priorities of all pools into a sensible order::
+
+      ceph osd pool set {pool-name} recovery_priority {value}
+
+For example, if you have 10 pools you could make the most important one
+priority 10, the next priority 9, and so on. Or you could leave most pools
+alone and give, say, 3 important pools all priority 1, or priorities 3, 2,
+and 1 respectively.
+
 Revert Lost
 ===========
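As a concrete sketch of the recovery_priority guidance above: only the
"ceph osd pool set {pool-name} recovery_priority {value}" form comes from the
documentation change; the pool names below are hypothetical, and a larger
value means the pool is recovered/backfilled sooner::

      # most important pool first; larger value = higher priority
      ceph osd pool set cephfs-metadata recovery_priority 3
      ceph osd pool set hot-data recovery_priority 2
      ceph osd pool set cold-archive recovery_priority 1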
diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh
index 0619e184cea7..e20d61d34e9c 100755
--- a/qa/workunits/cephtool/test.sh
+++ b/qa/workunits/cephtool/test.sh
@@ -1439,13 +1439,17 @@ function test_mon_osd()
   ceph osd deep-scrub 0
   ceph osd repair 0
 
-  # pool scrub
+  # pool scrub, force-recovery/backfill
   pool_names=`rados lspools`
   for pool_name in $pool_names
   do
     ceph osd pool scrub $pool_name
     ceph osd pool deep-scrub $pool_name
     ceph osd pool repair $pool_name
+    ceph osd pool force-recovery $pool_name
+    ceph osd pool cancel-force-recovery $pool_name
+    ceph osd pool force-backfill $pool_name
+    ceph osd pool cancel-force-backfill $pool_name
   done
 
   for f in noup nodown noin noout noscrub nodeep-scrub nobackfill norebalance norecover notieragent full
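The workunit above exercises every pool in the cluster; for a quick manual
check of the same sequence, a throwaway pool works just as well. Here
"mypool" and the pg count of 8 are arbitrary placeholders::

      ceph osd pool create mypool 8
      ceph osd pool force-recovery mypool
      ceph osd pool cancel-force-recovery mypool
      ceph osd pool force-backfill mypool
      ceph osd pool cancel-force-backfill mypool
      # cleanup; requires mon_allow_pool_delete=true
      ceph osd pool rm mypool mypool --yes-i-really-really-mean-it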
diff --git a/src/mgr/DaemonServer.cc b/src/mgr/DaemonServer.cc
index c7cc8c56565a..534de149ade9 100644
--- a/src/mgr/DaemonServer.cc
+++ b/src/mgr/DaemonServer.cc
@@ -1542,11 +1542,18 @@ bool DaemonServer::_handle_command(
     cmdctx->reply(0, ss);
     return true;
   } else if (prefix == "pg force-recovery" ||
-             prefix == "pg force-backfill" ||
-             prefix == "pg cancel-force-recovery" ||
-             prefix == "pg cancel-force-backfill") {
-    string forceop = prefix.substr(3, string::npos);
-    list<pg_t> parsed_pgs;
+             prefix == "pg force-backfill" ||
+             prefix == "pg cancel-force-recovery" ||
+             prefix == "pg cancel-force-backfill" ||
+             prefix == "osd pool force-recovery" ||
+             prefix == "osd pool force-backfill" ||
+             prefix == "osd pool cancel-force-recovery" ||
+             prefix == "osd pool cancel-force-backfill") {
+    vector<string> vs;
+    get_str_vec(prefix, vs);
+    auto& granularity = vs.front();
+    auto& forceop = vs.back();
+    vector<pg_t> pgs;
 
     // figure out actual op just once
     int actual_op = 0;
@@ -1560,89 +1567,109 @@ bool DaemonServer::_handle_command(
       actual_op = OFR_RECOVERY | OFR_CANCEL;
     }
 
-    // covnert pg names to pgs, discard any invalid ones while at it
-    {
-      // we don't want to keep pgidstr and pgidstr_nodup forever
-      vector<string> pgidstr;
-      // get pgids to process and prune duplicates
-      cmd_getval(g_ceph_context, cmdctx->cmdmap, "pgid", pgidstr);
-      set<string> pgidstr_nodup(pgidstr.begin(), pgidstr.end());
-      if (pgidstr.size() != pgidstr_nodup.size()) {
-        // move elements only when there were duplicates, as this
-        // reorders them
-        pgidstr.resize(pgidstr_nodup.size());
-        auto it = pgidstr_nodup.begin();
-        for (size_t i = 0 ; i < pgidstr_nodup.size(); i++) {
-          pgidstr[i] = std::move(*it++);
-        }
+    set<pg_t> candidates; // deduped
+    if (granularity == "pg") {
+      // convert pg names to pgs, discard any invalid ones while at it
+      vector<string> pgids;
+      cmd_getval(g_ceph_context, cmdctx->cmdmap, "pgid", pgids);
+      for (auto& i : pgids) {
+        pg_t pgid;
+        if (!pgid.parse(i.c_str())) {
+          ss << "invalid pgid '" << i << "'; ";
+          r = -EINVAL;
+          continue;
+        }
+        candidates.insert(pgid);
       }
-
-      cluster_state.with_pgmap([&](const PGMap& pg_map) {
-          for (auto& pstr : pgidstr) {
-            pg_t parsed_pg;
-            if (!parsed_pg.parse(pstr.c_str())) {
-              ss << "invalid pgid '" << pstr << "'; ";
-              r = -EINVAL;
-            } else {
-              auto workit = pg_map.pg_stat.find(parsed_pg);
-              if (workit == pg_map.pg_stat.end()) {
-                ss << "pg " << pstr << " does not exist; ";
-                r = -ENOENT;
-              } else {
-                pg_stat_t workpg = workit->second;
-
-                // discard pgs for which user requests are pointless
-                switch (actual_op)
-                {
-                  case OFR_RECOVERY:
-                    if ((workpg.state & (PG_STATE_DEGRADED | PG_STATE_RECOVERY_WAIT | PG_STATE_RECOVERING)) == 0) {
-                      // don't return error, user script may be racing with cluster. not fatal.
-                      ss << "pg " << pstr << " doesn't require recovery; ";
-                      continue;
-                    } else if (workpg.state & PG_STATE_FORCED_RECOVERY) {
-                      ss << "pg " << pstr << " recovery already forced; ";
-                      // return error, as it may be a bug in user script
-                      r = -EINVAL;
-                      continue;
-                    }
-                    break;
-                  case OFR_BACKFILL:
-                    if ((workpg.state & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILLING)) == 0) {
-                      ss << "pg " << pstr << " doesn't require backfilling; ";
-                      continue;
-                    } else if (workpg.state & PG_STATE_FORCED_BACKFILL) {
-                      ss << "pg " << pstr << " backfill already forced; ";
-                      r = -EINVAL;
-                      continue;
-                    }
-                    break;
-                  case OFR_BACKFILL | OFR_CANCEL:
-                    if ((workpg.state & PG_STATE_FORCED_BACKFILL) == 0) {
-                      ss << "pg " << pstr << " backfill not forced; ";
-                      continue;
-                    }
-                    break;
-                  case OFR_RECOVERY | OFR_CANCEL:
-                    if ((workpg.state & PG_STATE_FORCED_RECOVERY) == 0) {
-                      ss << "pg " << pstr << " recovery not forced; ";
-                      continue;
-                    }
-                    break;
-                  default:
-                    ceph_abort_msg("actual_op value is not supported");
-                }
-
-                parsed_pgs.push_back(std::move(parsed_pg));
-              }
-            }
-          }
+    } else {
+      // per pool
+      vector<string> pool_names;
+      cmd_getval(g_ceph_context, cmdctx->cmdmap, "who", pool_names);
+      if (pool_names.empty()) {
+        ss << "must specify one or more pool names";
+        cmdctx->reply(-EINVAL, ss);
+        return true;
+      }
+      cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+        for (auto& pool_name : pool_names) {
+          auto pool_id = osdmap.lookup_pg_pool_name(pool_name);
+          if (pool_id < 0) {
+            ss << "unrecognized pool '" << pool_name << "'";
+            r = -ENOENT;
+            return;
+          }
+          auto pool_pg_num = osdmap.get_pg_num(pool_id);
+          for (int i = 0; i < pool_pg_num; i++)
+            candidates.insert({(unsigned int)i, (uint64_t)pool_id});
+        }
       });
+      if (r < 0) {
+        cmdctx->reply(r, ss);
+        return true;
+      }
     }
 
+    cluster_state.with_pgmap([&](const PGMap& pg_map) {
+      for (auto& i : candidates) {
+        auto it = pg_map.pg_stat.find(i);
+        if (it == pg_map.pg_stat.end()) {
+          ss << "pg " << i << " does not exist; ";
+          r = -ENOENT;
+          continue;
+        }
+        auto state = it->second.state;
+        // discard pgs for which user requests are pointless
+        switch (actual_op) {
+        case OFR_RECOVERY:
+          if ((state & (PG_STATE_DEGRADED |
+                        PG_STATE_RECOVERY_WAIT |
+                        PG_STATE_RECOVERING)) == 0) {
+            // don't return error, user script may be racing with cluster.
+            // not fatal.
+            ss << "pg " << i << " doesn't require recovery; ";
+            continue;
+          } else if (state & PG_STATE_FORCED_RECOVERY) {
+            ss << "pg " << i << " recovery already forced; ";
+            // return error, as it may be a bug in user script
+            r = -EINVAL;
+            continue;
+          }
+          break;
+        case OFR_BACKFILL:
+          if ((state & (PG_STATE_DEGRADED |
+                        PG_STATE_BACKFILL_WAIT |
+                        PG_STATE_BACKFILLING)) == 0) {
+            ss << "pg " << i << " doesn't require backfilling; ";
+            continue;
+          } else if (state & PG_STATE_FORCED_BACKFILL) {
+            ss << "pg " << i << " backfill already forced; ";
+            r = -EINVAL;
+            continue;
+          }
+          break;
+        case OFR_BACKFILL | OFR_CANCEL:
+          if ((state & PG_STATE_FORCED_BACKFILL) == 0) {
+            ss << "pg " << i << " backfill not forced; ";
+            continue;
+          }
+          break;
+        case OFR_RECOVERY | OFR_CANCEL:
+          if ((state & PG_STATE_FORCED_RECOVERY) == 0) {
+            ss << "pg " << i << " recovery not forced; ";
+            continue;
+          }
+          break;
+        default:
+          ceph_abort_msg("actual_op value is not supported");
+        }
+        pgs.push_back(i);
+      } // for
+    });
+
     // respond with error only when no pgs are correct
     // yes, in case of mixed errors, only the last one will be emitted,
     // but the message presented will be fine
-    if (parsed_pgs.size() != 0) {
+    if (pgs.size() != 0) {
       // clear error to not confuse users/scripts
       r = 0;
     }
@@ -1652,7 +1679,7 @@ bool DaemonServer::_handle_command(
     cluster_state.with_osdmap([&](const OSDMap& osdmap) {
         // group pgs to process by osd
         map<int, vector<spg_t>> osdpgs;
-        for (auto& pgid : parsed_pgs) {
+        for (auto& pgid : pgs) {
          int primary;
          spg_t spg;
          if (osdmap.get_primary_shard(pgid, &primary, &spg)) {
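The handler above keeps the per-pg commands' asymmetry: a pool whose pgs need
no recovery is skipped without error (a script may simply be racing with the
cluster), while re-forcing a pool whose eligible pgs are all already forced
replies EINVAL, since the error is cleared only when at least one pg was
actually queued. A rough CLI check of that behavior, with "mypool" as a
placeholder::

      # first invocation marks any eligible pgs as forced
      ceph osd pool force-recovery mypool
      # repeating it while every eligible pg is already forced replies EINVAL
      ceph osd pool force-recovery mypool
      # restore the default priority
      ceph osd pool cancel-force-recovery mypool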
diff --git a/src/mgr/MgrCommands.h b/src/mgr/MgrCommands.h
index 674c1d75de92..b5dcab56555c 100644
--- a/src/mgr/MgrCommands.h
+++ b/src/mgr/MgrCommands.h
@@ -89,6 +89,22 @@ COMMAND("osd pool repair " \
         "name=who,type=CephPoolname,n=N", \
         "initiate repair on pool <who>", \
         "osd", "rw")
+COMMAND("osd pool force-recovery " \
+        "name=who,type=CephPoolname,n=N", \
+        "force recovery of specified pool <who> first", \
+        "osd", "rw")
+COMMAND("osd pool force-backfill " \
+        "name=who,type=CephPoolname,n=N", \
+        "force backfill of specified pool <who> first", \
+        "osd", "rw")
+COMMAND("osd pool cancel-force-recovery " \
+        "name=who,type=CephPoolname,n=N", \
+        "restore normal recovery priority of specified pool <who>", \
+        "osd", "rw")
+COMMAND("osd pool cancel-force-backfill " \
+        "name=who,type=CephPoolname,n=N", \
+        "restore normal backfill priority of specified pool <who>", \
+        "osd", "rw")
 COMMAND("osd reweight-by-utilization " \
         "name=oload,type=CephInt,req=false " \
         "name=max_change,type=CephFloat,req=false " \
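Because the "who" argument of each new COMMAND entry is declared with n=N,
the CLI accepts one or more pool names per invocation, matching the
pool_names loop in DaemonServer.cc. For example, with hypothetical pool
names::

      ceph osd pool force-backfill pool-a pool-b
      ceph osd pool cancel-force-backfill pool-a pool-b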