From: xie xingguo
Date: Wed, 20 Feb 2019 10:40:02 +0000 (+0800)
Subject: mgr: add per pool force-recovery/backfill commands
X-Git-Tag: v14.1.1~158^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F26560%2Fhead;p=ceph.git

mgr: add per pool force-recovery/backfill commands

For clusters with multiple storage pools sharing the same devices, it makes
much more sense to offer per-pool commands, so that high-priority pools,
e.g., pools hosting data of more importance than others, can be brought
back to normal quickly.

Fixes: http://tracker.ceph.com/issues/38456
Signed-off-by: xie xingguo
---

diff --git a/doc/rados/operations/placement-groups.rst b/doc/rados/operations/placement-groups.rst
index f1f3c9838d3f..63048cdd735a 100644
--- a/doc/rados/operations/placement-groups.rst
+++ b/doc/rados/operations/placement-groups.rst
@@ -564,6 +564,10 @@ or mismatched, and their contents are consistent.  Assuming the replicas all
 match, a final semantic sweep ensures that all of the snapshot-related object
 metadata is consistent. Errors are reported via logs.
 
+To scrub all placement groups from a specific pool, execute the following::
+
+      ceph osd pool scrub {pool-name}
+
 Prioritize backfill/recovery of a Placement Group(s)
 ====================================================
 
@@ -595,6 +599,32 @@ group, only those that are still queued.
 
 The "force" flag is cleared automatically after recovery or backfill of group
 is done.
 
+Similarly, you may use the following commands to force Ceph to perform recovery
+or backfill on all placement groups from a specified pool first::
+
+      ceph osd pool force-recovery {pool-name}
+      ceph osd pool force-backfill {pool-name}
+
+or::
+
+      ceph osd pool cancel-force-recovery {pool-name}
+      ceph osd pool cancel-force-backfill {pool-name}
+
+to restore the default recovery or backfill priority if you change your mind.
+
+Note that these commands could possibly break the ordering of Ceph's internal
+priority computations, so use them with caution!
+In particular, if you have multiple pools that currently share the same
+underlying OSDs, and some pools hold data more important than others, we
+recommend you use the following command to re-arrange the recovery/backfill
+priorities of all pools into a sensible order::
+
+      ceph osd pool set {pool-name} recovery_priority {value}
+
+For example, if you have 10 pools you could make the most important one
+priority 10, the next priority 9, and so on. Or you could leave most pools
+alone and give, say, 3 important pools all priority 1, or priorities 3, 2,
+and 1 respectively.
+
 Revert Lost
 ===========
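As a concrete sketch of the recovery_priority guidance above: only the
"ceph osd pool set {pool-name} recovery_priority {value}" form comes from the
documentation change; the pool names below are hypothetical, and a larger
value means the pool is recovered/backfilled sooner::

      # most important pool first; larger value = higher priority
      ceph osd pool set cephfs-metadata recovery_priority 3
      ceph osd pool set hot-data recovery_priority 2
      ceph osd pool set cold-archive recovery_priority 1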
diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh
index 0619e184cea7..e20d61d34e9c 100755
--- a/qa/workunits/cephtool/test.sh
+++ b/qa/workunits/cephtool/test.sh
@@ -1439,13 +1439,17 @@ function test_mon_osd()
   ceph osd deep-scrub 0
   ceph osd repair 0
 
-  # pool scrub
+  # pool scrub, force-recovery/backfill
   pool_names=`rados lspools`
   for pool_name in $pool_names
   do
     ceph osd pool scrub $pool_name
     ceph osd pool deep-scrub $pool_name
     ceph osd pool repair $pool_name
+    ceph osd pool force-recovery $pool_name
+    ceph osd pool cancel-force-recovery $pool_name
+    ceph osd pool force-backfill $pool_name
+    ceph osd pool cancel-force-backfill $pool_name
   done
 
   for f in noup nodown noin noout noscrub nodeep-scrub nobackfill norebalance norecover notieragent full
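The workunit above exercises every pool in the cluster; for a quick manual
check of the same sequence, a throwaway pool works just as well. Here
"mypool" and the pg count of 8 are arbitrary placeholders::

      ceph osd pool create mypool 8
      ceph osd pool force-recovery mypool
      ceph osd pool cancel-force-recovery mypool
      ceph osd pool force-backfill mypool
      ceph osd pool cancel-force-backfill mypool
      # cleanup; requires mon_allow_pool_delete=true
      ceph osd pool rm mypool mypool --yes-i-really-really-mean-it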
diff --git a/src/mgr/DaemonServer.cc b/src/mgr/DaemonServer.cc
index c7cc8c56565a..534de149ade9 100644
--- a/src/mgr/DaemonServer.cc
+++ b/src/mgr/DaemonServer.cc
@@ -1542,11 +1542,18 @@ bool DaemonServer::_handle_command(
     cmdctx->reply(0, ss);
     return true;
   } else if (prefix == "pg force-recovery" ||
-             prefix == "pg force-backfill" ||
-             prefix == "pg cancel-force-recovery" ||
-             prefix == "pg cancel-force-backfill") {
-    string forceop = prefix.substr(3, string::npos);
-    list<pg_t> parsed_pgs;
+             prefix == "pg force-backfill" ||
+             prefix == "pg cancel-force-recovery" ||
+             prefix == "pg cancel-force-backfill" ||
+             prefix == "osd pool force-recovery" ||
+             prefix == "osd pool force-backfill" ||
+             prefix == "osd pool cancel-force-recovery" ||
+             prefix == "osd pool cancel-force-backfill") {
+    vector<string> vs;
+    get_str_vec(prefix, vs);
+    auto& granularity = vs.front();
+    auto& forceop = vs.back();
+    vector<pg_t> pgs;
 
     // figure out actual op just once
     int actual_op = 0;
@@ -1560,89 +1567,109 @@ bool DaemonServer::_handle_command(
       actual_op = OFR_RECOVERY | OFR_CANCEL;
     }
 
-    // covnert pg names to pgs, discard any invalid ones while at it
-    {
-      // we don't want to keep pgidstr and pgidstr_nodup forever
-      vector<string> pgidstr;
-      // get pgids to process and prune duplicates
-      cmd_getval(g_ceph_context, cmdctx->cmdmap, "pgid", pgidstr);
-      set<string> pgidstr_nodup(pgidstr.begin(), pgidstr.end());
-      if (pgidstr.size() != pgidstr_nodup.size()) {
-        // move elements only when there were duplicates, as this
-        // reorders them
-        pgidstr.resize(pgidstr_nodup.size());
-        auto it = pgidstr_nodup.begin();
-        for (size_t i = 0 ; i < pgidstr_nodup.size(); i++) {
-          pgidstr[i] = std::move(*it++);
-        }
+    set<pg_t> candidates; // deduped
+    if (granularity == "pg") {
+      // convert pg names to pgs, discard any invalid ones while at it
+      vector<string> pgids;
+      cmd_getval(g_ceph_context, cmdctx->cmdmap, "pgid", pgids);
+      for (auto& i : pgids) {
+        pg_t pgid;
+        if (!pgid.parse(i.c_str())) {
+          ss << "invalid pgid '" << i << "'; ";
+          r = -EINVAL;
+          continue;
+        }
+        candidates.insert(pgid);
       }
-
-      cluster_state.with_pgmap([&](const PGMap& pg_map) {
-          for (auto& pstr : pgidstr) {
-            pg_t parsed_pg;
-            if (!parsed_pg.parse(pstr.c_str())) {
-              ss << "invalid pgid '" << pstr << "'; ";
-              r = -EINVAL;
-            } else {
-              auto workit = pg_map.pg_stat.find(parsed_pg);
-              if (workit == pg_map.pg_stat.end()) {
-                ss << "pg " << pstr << " does not exist; ";
-                r = -ENOENT;
-              } else {
-                pg_stat_t workpg = workit->second;
-
-                // discard pgs for which user requests are pointless
-                switch (actual_op)
-                {
-                  case OFR_RECOVERY:
-                    if ((workpg.state & (PG_STATE_DEGRADED | PG_STATE_RECOVERY_WAIT | PG_STATE_RECOVERING)) == 0) {
-                      // don't return error, user script may be racing with cluster. not fatal.
-                      ss << "pg " << pstr << " doesn't require recovery; ";
-                      continue;
-                    } else if (workpg.state & PG_STATE_FORCED_RECOVERY) {
-                      ss << "pg " << pstr << " recovery already forced; ";
-                      // return error, as it may be a bug in user script
-                      r = -EINVAL;
-                      continue;
-                    }
-                    break;
-                  case OFR_BACKFILL:
-                    if ((workpg.state & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILLING)) == 0) {
-                      ss << "pg " << pstr << " doesn't require backfilling; ";
-                      continue;
-                    } else if (workpg.state & PG_STATE_FORCED_BACKFILL) {
-                      ss << "pg " << pstr << " backfill already forced; ";
-                      r = -EINVAL;
-                      continue;
-                    }
-                    break;
-                  case OFR_BACKFILL | OFR_CANCEL:
-                    if ((workpg.state & PG_STATE_FORCED_BACKFILL) == 0) {
-                      ss << "pg " << pstr << " backfill not forced; ";
-                      continue;
-                    }
-                    break;
-                  case OFR_RECOVERY | OFR_CANCEL:
-                    if ((workpg.state & PG_STATE_FORCED_RECOVERY) == 0) {
-                      ss << "pg " << pstr << " recovery not forced; ";
-                      continue;
-                    }
-                    break;
-                  default:
-                    ceph_abort_msg("actual_op value is not supported");
-                }
-
-                parsed_pgs.push_back(std::move(parsed_pg));
-              }
-            }
-          }
+    } else {
+      // per pool
+      vector<string> pool_names;
+      cmd_getval(g_ceph_context, cmdctx->cmdmap, "who", pool_names);
+      if (pool_names.empty()) {
+        ss << "must specify one or more pool names";
+        cmdctx->reply(-EINVAL, ss);
+        return true;
+      }
+      cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+        for (auto& pool_name : pool_names) {
+          auto pool_id = osdmap.lookup_pg_pool_name(pool_name);
+          if (pool_id < 0) {
+            ss << "unrecognized pool '" << pool_name << "'";
+            r = -ENOENT;
+            return;
+          }
+          auto pool_pg_num = osdmap.get_pg_num(pool_id);
+          for (int i = 0; i < pool_pg_num; i++)
+            candidates.insert({(unsigned int)i, (uint64_t)pool_id});
+        }
       });
+      if (r < 0) {
+        cmdctx->reply(r, ss);
+        return true;
+      }
     }
 
+    cluster_state.with_pgmap([&](const PGMap& pg_map) {
+      for (auto& i : candidates) {
+        auto it = pg_map.pg_stat.find(i);
+        if (it == pg_map.pg_stat.end()) {
+          ss << "pg " << i << " does not exist; ";
+          r = -ENOENT;
+          continue;
+        }
+        auto state = it->second.state;
+        // discard pgs for which user requests are pointless
+        switch (actual_op) {
+        case OFR_RECOVERY:
+          if ((state & (PG_STATE_DEGRADED |
+                        PG_STATE_RECOVERY_WAIT |
+                        PG_STATE_RECOVERING)) == 0) {
+            // don't return error, user script may be racing with cluster.
+            // not fatal.
+            ss << "pg " << i << " doesn't require recovery; ";
+            continue;
+          } else if (state & PG_STATE_FORCED_RECOVERY) {
+            ss << "pg " << i << " recovery already forced; ";
+            // return error, as it may be a bug in user script
+            r = -EINVAL;
+            continue;
+          }
+          break;
+        case OFR_BACKFILL:
+          if ((state & (PG_STATE_DEGRADED |
+                        PG_STATE_BACKFILL_WAIT |
+                        PG_STATE_BACKFILLING)) == 0) {
+            ss << "pg " << i << " doesn't require backfilling; ";
+            continue;
+          } else if (state & PG_STATE_FORCED_BACKFILL) {
+            ss << "pg " << i << " backfill already forced; ";
+            r = -EINVAL;
+            continue;
+          }
+          break;
+        case OFR_BACKFILL | OFR_CANCEL:
+          if ((state & PG_STATE_FORCED_BACKFILL) == 0) {
+            ss << "pg " << i << " backfill not forced; ";
+            continue;
+          }
+          break;
+        case OFR_RECOVERY | OFR_CANCEL:
+          if ((state & PG_STATE_FORCED_RECOVERY) == 0) {
+            ss << "pg " << i << " recovery not forced; ";
+            continue;
+          }
+          break;
+        default:
+          ceph_abort_msg("actual_op value is not supported");
+        }
+        pgs.push_back(i);
+      } // for
+    });
+
     // respond with error only when no pgs are correct
     // yes, in case of mixed errors, only the last one will be emitted,
     // but the message presented will be fine
-    if (parsed_pgs.size() != 0) {
+    if (pgs.size() != 0) {
       // clear error to not confuse users/scripts
       r = 0;
     }
@@ -1652,7 +1679,7 @@ bool DaemonServer::_handle_command(
     cluster_state.with_osdmap([&](const OSDMap& osdmap) {
         // group pgs to process by osd
         map<int, vector<spg_t>> osdpgs;
-        for (auto& pgid : parsed_pgs) {
+        for (auto& pgid : pgs) {
          int primary;
          spg_t spg;
          if (osdmap.get_primary_shard(pgid, &primary, &spg)) {
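The handler above keeps the per-pg commands' asymmetry: a pool whose pgs need
no recovery is skipped without error (a script may simply be racing with the
cluster), while re-forcing a pool whose eligible pgs are all already forced
replies EINVAL, since the error is cleared only when at least one pg was
actually queued. A rough CLI check of that behavior, with "mypool" as a
placeholder::

      # first invocation marks any eligible pgs as forced
      ceph osd pool force-recovery mypool
      # repeating it while every eligible pg is already forced replies EINVAL
      ceph osd pool force-recovery mypool
      # restore the default priority
      ceph osd pool cancel-force-recovery mypool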
diff --git a/src/mgr/MgrCommands.h b/src/mgr/MgrCommands.h
index 674c1d75de92..b5dcab56555c 100644
--- a/src/mgr/MgrCommands.h
+++ b/src/mgr/MgrCommands.h
@@ -89,6 +89,22 @@ COMMAND("osd pool repair " \
         "name=who,type=CephPoolname,n=N", \
         "initiate repair on pool <who>", \
         "osd", "rw")
+COMMAND("osd pool force-recovery " \
+        "name=who,type=CephPoolname,n=N", \
+        "force recovery of specified pool <who> first", \
+        "osd", "rw")
+COMMAND("osd pool force-backfill " \
+        "name=who,type=CephPoolname,n=N", \
+        "force backfill of specified pool <who> first", \
+        "osd", "rw")
+COMMAND("osd pool cancel-force-recovery " \
+        "name=who,type=CephPoolname,n=N", \
+        "restore normal recovery priority of specified pool <who>", \
+        "osd", "rw")
+COMMAND("osd pool cancel-force-backfill " \
+        "name=who,type=CephPoolname,n=N", \
+        "restore normal backfill priority of specified pool <who>", \
+        "osd", "rw")
 COMMAND("osd reweight-by-utilization " \
         "name=oload,type=CephInt,req=false " \
         "name=max_change,type=CephFloat,req=false " \
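Because the "who" argument of each new COMMAND entry is declared with n=N,
the CLI accepts one or more pool names per invocation, matching the
pool_names loop in DaemonServer.cc. For example, with hypothetical pool
names::

      ceph osd pool force-backfill pool-a pool-b
      ceph osd pool cancel-force-backfill pool-a pool-b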