mgr: add per pool force-recovery/backfill commands 26560/head
author     xie xingguo <xie.xingguo@zte.com.cn>
           Wed, 20 Feb 2019 10:40:02 +0000 (18:40 +0800)
committer  xie xingguo <xie.xingguo@zte.com.cn>
           Sat, 23 Feb 2019 04:18:24 +0000 (12:18 +0800)
For those with multiple storage pools sharing the same devices, it makes
much more sense to offer per-pool commands, so that pools with high
priority, e.g. because they host data of more importance than others,
can be brought back to normal quickly.

Fixes: http://tracker.ceph.com/issues/38456
Signed-off-by: xie xingguo <xie.xingguo@zte.com.cn>
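
A quick usage sketch of the new commands (illustrative only, not part of the
commit; the pool names below are placeholders, and the "who" argument accepts
one or more pool names, as the command table below declares with n=N):

        # jump the recovery/backfill queue for every PG in one or more pools
        ceph osd pool force-recovery cephfs_data cephfs_metadata
        ceph osd pool force-backfill cephfs_data

        # revert to the normal priorities if you change your mind
        ceph osd pool cancel-force-recovery cephfs_data cephfs_metadata
        ceph osd pool cancel-force-backfill cephfs_data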
doc/rados/operations/placement-groups.rst
qa/workunits/cephtool/test.sh
src/mgr/DaemonServer.cc
src/mgr/MgrCommands.h

index f1f3c9838d3fb4840ce3a0ff88e1515243e09c34..63048cdd735a0c8aa39a7ba667696e32efdfff44 100644 (file)
@@ -564,6 +564,10 @@ or mismatched, and their contents are consistent.  Assuming the replicas all
 match, a final semantic sweep ensures that all of the snapshot-related object
 metadata is consistent. Errors are reported via logs.
 
+To scrub all placement groups from a specific pool, execute the following::
+
+        ceph osd pool scrub {pool-name}
+
 Prioritize backfill/recovery of a Placement Group(s)
 ====================================================
 
@@ -595,6 +599,32 @@ group, only those that are still queued.
 The "force" flag is cleared automatically after recovery or backfill of group
 is done.
 
+Similarly, you may use the following commands to force Ceph to perform recovery
+or backfill on all placement groups from a specified pool first::
+
+        ceph osd pool force-recovery {pool-name}
+        ceph osd pool force-backfill {pool-name}
+
+or::
+
+        ceph osd pool cancel-force-recovery {pool-name}
+        ceph osd pool cancel-force-backfill {pool-name}
+
+to restore the default recovery or backfill priority if you change your mind.
+
+Note that these commands could possibly break the ordering of Ceph's internal
+priority computations, so use them with caution!
+In particular, if you have multiple pools that are currently sharing the same
+underlying OSDs, and some of those pools hold data more important than others,
+we recommend using the following command to arrange the recovery/backfill
+priorities of all pools in a better order::
+
+        ceph osd pool set {pool-name} recovery_priority {value}
+
+For example, if you have 10 pools you could make the most important one priority 10,
+the next 9, and so on. Or you could leave most pools alone and give, say, 3
+important pools a priority of 1 each, or priorities 3, 2 and 1 respectively.
+
 Revert Lost
 ===========
 
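As an illustration of the recovery_priority scheme described in the hunk above
(not part of the commit; the pool names are placeholders):

        # three pools sharing the same OSDs, ranked by importance
        ceph osd pool set metadata recovery_priority 3
        ceph osd pool set images recovery_priority 2
        ceph osd pool set backups recovery_priority 1
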
index 0619e184cea70995e1dd71ea612db1cac9516ed8..e20d61d34e9c73104bef1f083a5b5ee31bac8f47 100755 (executable)
@@ -1439,13 +1439,17 @@ function test_mon_osd()
   ceph osd deep-scrub 0
   ceph osd repair 0
 
-  # pool scrub
+  # pool scrub, force-recovery/backfill
   pool_names=`rados lspools`
   for pool_name in $pool_names
   do
     ceph osd pool scrub $pool_name
     ceph osd pool deep-scrub $pool_name
     ceph osd pool repair $pool_name
+    ceph osd pool force-recovery $pool_name
+    ceph osd pool cancel-force-recovery $pool_name
+    ceph osd pool force-backfill $pool_name
+    ceph osd pool cancel-force-backfill $pool_name
   done
 
   for f in noup nodown noin noout noscrub nodeep-scrub nobackfill norebalance norecover notieragent full
index c7cc8c56565a37cbd7a296582a54a110552f4e06..534de149ade989899c727aa0fe9675438cf98868 100644 (file)
@@ -1542,11 +1542,18 @@ bool DaemonServer::_handle_command(
     cmdctx->reply(0, ss);
     return true;
   } else if (prefix == "pg force-recovery" ||
-              prefix == "pg force-backfill" ||
-              prefix == "pg cancel-force-recovery" ||
-              prefix == "pg cancel-force-backfill") {
-    string forceop = prefix.substr(3, string::npos);
-    list<pg_t> parsed_pgs;
+             prefix == "pg force-backfill" ||
+             prefix == "pg cancel-force-recovery" ||
+             prefix == "pg cancel-force-backfill" ||
+             prefix == "osd pool force-recovery" ||
+             prefix == "osd pool force-backfill" ||
+             prefix == "osd pool cancel-force-recovery" ||
+             prefix == "osd pool cancel-force-backfill") {
+    vector<string> vs;
+    get_str_vec(prefix, vs);
+    auto& granularity = vs.front();
+    auto& forceop = vs.back();
+    vector<pg_t> pgs;
 
     // figure out actual op just once
     int actual_op = 0;
@@ -1560,89 +1567,109 @@ bool DaemonServer::_handle_command(
       actual_op = OFR_RECOVERY | OFR_CANCEL;
     }
 
-    // covnert pg names to pgs, discard any invalid ones while at it
-    {
-      // we don't want to keep pgidstr and pgidstr_nodup forever
-      vector<string> pgidstr;
-      // get pgids to process and prune duplicates
-      cmd_getval(g_ceph_context, cmdctx->cmdmap, "pgid", pgidstr);
-      set<string> pgidstr_nodup(pgidstr.begin(), pgidstr.end());
-      if (pgidstr.size() != pgidstr_nodup.size()) {
-       // move elements only when there were duplicates, as this
-       // reorders them
-       pgidstr.resize(pgidstr_nodup.size());
-       auto it = pgidstr_nodup.begin();
-       for (size_t i = 0 ; i < pgidstr_nodup.size(); i++) {
-         pgidstr[i] = std::move(*it++);
-       }
+    set<pg_t> candidates; // deduped
+    if (granularity == "pg") {
+      // convert pg names to pgs, discard any invalid ones while at it
+      vector<string> pgids;
+      cmd_getval(g_ceph_context, cmdctx->cmdmap, "pgid", pgids);
+      for (auto& i : pgids) {
+        pg_t pgid;
+        if (!pgid.parse(i.c_str())) {
+          ss << "invalid pgid '" << i << "'; ";
+          r = -EINVAL;
+          continue;
+        }
+        candidates.insert(pgid);
       }
-
-      cluster_state.with_pgmap([&](const PGMap& pg_map) {
-       for (auto& pstr : pgidstr) {
-         pg_t parsed_pg;
-         if (!parsed_pg.parse(pstr.c_str())) {
-           ss << "invalid pgid '" << pstr << "'; ";
-           r = -EINVAL;
-         } else {
-           auto workit = pg_map.pg_stat.find(parsed_pg);
-           if (workit == pg_map.pg_stat.end()) {
-             ss << "pg " << pstr << " does not exist; ";
-             r = -ENOENT;
-           } else {
-             pg_stat_t workpg = workit->second;
-
-             // discard pgs for which user requests are pointless
-             switch (actual_op)
-             {
-               case OFR_RECOVERY:
-                 if ((workpg.state & (PG_STATE_DEGRADED | PG_STATE_RECOVERY_WAIT | PG_STATE_RECOVERING)) == 0) {
-                   // don't return error, user script may be racing with cluster. not fatal.
-                   ss << "pg " << pstr << " doesn't require recovery; ";
-                   continue;
-                 } else  if (workpg.state & PG_STATE_FORCED_RECOVERY) {
-                   ss << "pg " << pstr << " recovery already forced; ";
-                   // return error, as it may be a bug in user script
-                   r = -EINVAL;
-                   continue;
-                 }
-                 break;
-               case OFR_BACKFILL:
-                 if ((workpg.state & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILLING)) == 0) {
-                   ss << "pg " << pstr << " doesn't require backfilling; ";
-                   continue;
-                 } else  if (workpg.state & PG_STATE_FORCED_BACKFILL) {
-                   ss << "pg " << pstr << " backfill already forced; ";
-                   r = -EINVAL;
-                   continue;
-                 }
-                 break;
-               case OFR_BACKFILL | OFR_CANCEL:
-                 if ((workpg.state & PG_STATE_FORCED_BACKFILL) == 0) {
-                   ss << "pg " << pstr << " backfill not forced; ";
-                   continue;
-                 }
-                 break;
-               case OFR_RECOVERY | OFR_CANCEL:
-                 if ((workpg.state & PG_STATE_FORCED_RECOVERY) == 0) {
-                   ss << "pg " << pstr << " recovery not forced; ";
-                   continue;
-                 }
-                 break;
-               default:
-                 ceph_abort_msg("actual_op value is not supported");
-             }
-
-             parsed_pgs.push_back(std::move(parsed_pg));
-           }
-         }
-       }
+    } else {
+      // per pool
+      vector<string> pool_names;
+      cmd_getval(g_ceph_context, cmdctx->cmdmap, "who", pool_names);
+      if (pool_names.empty()) {
+        ss << "must specify one or more pool names";
+        cmdctx->reply(-EINVAL, ss);
+        return true;
+      }
+      cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+        for (auto& pool_name : pool_names) {
+          auto pool_id = osdmap.lookup_pg_pool_name(pool_name);
+          if (pool_id < 0) {
+            ss << "unrecognized pool '" << pool_name << "'";
+            r = -ENOENT;
+            return;
+          }
+          auto pool_pg_num = osdmap.get_pg_num(pool_id);
+          for (int i = 0; i < pool_pg_num; i++)
+            candidates.insert({(unsigned int)i, (uint64_t)pool_id});
+        }
       });
+      if (r < 0) {
+        cmdctx->reply(r, ss);
+        return true;
+      }
     }
 
+    cluster_state.with_pgmap([&](const PGMap& pg_map) {
+      for (auto& i : candidates) {
+       auto it = pg_map.pg_stat.find(i);
+       if (it == pg_map.pg_stat.end()) {
+         ss << "pg " << i << " does not exist; ";
+         r = -ENOENT;
+          continue;
+       }
+        auto state = it->second.state;
+       // discard pgs for which user requests are pointless
+       switch (actual_op) {
+        case OFR_RECOVERY:
+          if ((state & (PG_STATE_DEGRADED |
+                        PG_STATE_RECOVERY_WAIT |
+                        PG_STATE_RECOVERING)) == 0) {
+            // don't return error, user script may be racing with cluster.
+            // not fatal.
+            ss << "pg " << i << " doesn't require recovery; ";
+            continue;
+          } else if (state & PG_STATE_FORCED_RECOVERY) {
+            ss << "pg " << i << " recovery already forced; ";
+            // return error, as it may be a bug in user script
+            r = -EINVAL;
+            continue;
+          }
+          break;
+        case OFR_BACKFILL:
+          if ((state & (PG_STATE_DEGRADED |
+                        PG_STATE_BACKFILL_WAIT |
+                        PG_STATE_BACKFILLING)) == 0) {
+            ss << "pg " << i << " doesn't require backfilling; ";
+            continue;
+          } else if (state & PG_STATE_FORCED_BACKFILL) {
+            ss << "pg " << i << " backfill already forced; ";
+            r = -EINVAL;
+            continue;
+          }
+          break;
+        case OFR_BACKFILL | OFR_CANCEL:
+          if ((state & PG_STATE_FORCED_BACKFILL) == 0) {
+            ss << "pg " << i << " backfill not forced; ";
+            continue;
+          }
+          break;
+        case OFR_RECOVERY | OFR_CANCEL:
+          if ((state & PG_STATE_FORCED_RECOVERY) == 0) {
+            ss << "pg " << i << " recovery not forced; ";
+            continue;
+          }
+          break;
+        default:
+          ceph_abort_msg("actual_op value is not supported");
+        }
+       pgs.push_back(i);
+      } // for
+    });
+
     // respond with error only when no pgs are correct
     // yes, in case of mixed errors, only the last one will be emitted,
     // but the message presented will be fine
-    if (parsed_pgs.size() != 0) {
+    if (pgs.size() != 0) {
       // clear error to not confuse users/scripts
       r = 0;
     }
@@ -1652,7 +1679,7 @@ bool DaemonServer::_handle_command(
     cluster_state.with_osdmap([&](const OSDMap& osdmap) {
        // group pgs to process by osd
        map<int, vector<spg_t>> osdpgs;
-       for (auto& pgid : parsed_pgs) {
+       for (auto& pgid : pgs) {
          int primary;
          spg_t spg;
          if (osdmap.get_primary_shard(pgid, &primary, &spg)) {
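
The per-pool handling above implies the following behavior, shown here as a
hypothetical session (the pool names and the echoed output are illustrative
only; the messages themselves come from the hunk above):

        # unknown pool names are rejected with ENOENT
        ceph osd pool force-recovery no-such-pool
        # -> Error ENOENT: unrecognized pool 'no-such-pool'

        # PGs that do not currently need recovery are reported as
        # "doesn't require recovery" and skipped rather than forced
        ceph osd pool force-recovery rbd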
index 674c1d75de926d349c0fbff4f6b74a1d519c67a7..b5dcab56555c03799f98bf2437245ee4e772005d 100644 (file)
@@ -89,6 +89,22 @@ COMMAND("osd pool repair " \
         "name=who,type=CephPoolname,n=N", \
         "initiate repair on pool <who>", \
         "osd", "rw")
+COMMAND("osd pool force-recovery " \
+        "name=who,type=CephPoolname,n=N", \
+        "force recovery of specified pool <who> first", \
+        "osd", "rw")
+COMMAND("osd pool force-backfill " \
+        "name=who,type=CephPoolname,n=N", \
+        "force backfill of specified pool <who> first", \
+        "osd", "rw")
+COMMAND("osd pool cancel-force-recovery " \
+        "name=who,type=CephPoolname,n=N", \
+        "restore normal recovery priority of specified pool <who>", \
+        "osd", "rw")
+COMMAND("osd pool cancel-force-backfill " \
+        "name=who,type=CephPoolname,n=N", \
+        "restore normal backfill priority of specified pool <who>", \
+        "osd", "rw")
 COMMAND("osd reweight-by-utilization " \
        "name=oload,type=CephInt,req=false " \
        "name=max_change,type=CephFloat,req=false "                     \