git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
crush: for chooseleaf rules, retry CRUSH map descent from root if leaf is failed
author Jim Schutt <jaschut@sandia.gov>
Mon, 10 Sep 2012 21:43:19 +0000 (15:43 -0600)
committer Sage Weil <sage@inktank.com>
Tue, 27 Nov 2012 01:15:45 +0000 (17:15 -0800)
Consider the CRUSH rule
  step chooseleaf firstn 0 type <node_type>

This rule means that <n> replicas will be chosen in a manner such that
each chosen leaf's branch will contain a unique instance of <node_type>.

When an object is re-replicated after a leaf failure, if the CRUSH map uses
a chooseleaf rule, the remapped replica ends up under the <node_type> bucket
that held the failed leaf.  This causes uneven data distribution across the
storage cluster, to the point that when all the leaves but one fail under a
particular <node_type> bucket, that remaining leaf holds all the data from
its failed peers.

This behavior also limits the number of peers that can participate in the
re-replication of the data held by the failed leaf, which increases the
time required to re-replicate after a failure.

For a chooseleaf CRUSH rule, the tree descent has two steps: call them the
inner and outer descents.

If the tree descent down to <node_type> is the outer descent, and the descent
from <node_type> down to a leaf is the inner descent, the issue is that a
down leaf is detected on the inner descent, so only the inner descent is
retried.

In order to disperse re-replicated data as widely as possible across a
storage cluster after a failure, we want to retry the outer descent. So,
fix up crush_choose() to allow the inner descent to return immediately on
choosing a failed leaf.  Wire this up as a new CRUSH tunable.

Note that after this change, for a chooseleaf rule, if the primary OSD
in a placement group has failed, choosing a replacement may result in
one of the other OSDs in the PG colliding with the new primary.  In that
case, the colliding OSD's data for that PG must be moved as well.  This
seems unavoidable but should be relatively rare.

Signed-off-by: Jim Schutt <jaschut@sandia.gov>
src/crush/CrushCompiler.cc
src/crush/CrushWrapper.cc
src/crush/CrushWrapper.h
src/crush/builder.c
src/crush/crush.h
src/crush/mapper.c
src/crushtool.cc

index 5a0cdbd084863108f0eab297dfb2b93a986afc25..b290c2ecf0240025e5f703e7e3c912acbf45ca3e 100644 (file)
@@ -189,6 +189,8 @@ int CrushCompiler::decompile(ostream &out)
     out << "tunable choose_local_fallback_tries " << crush.get_choose_local_fallback_tries() << "\n";
   if (crush.get_choose_total_tries() != 19)
     out << "tunable choose_total_tries " << crush.get_choose_total_tries() << "\n";
+  if (crush.get_chooseleaf_descend_once() != 0)
+    out << "tunable chooseleaf_descend_once " << crush.get_chooseleaf_descend_once() << "\n";
 
   out << "\n# devices\n";
   for (int i=0; i<crush.get_max_devices(); i++) {
@@ -342,6 +344,8 @@ int CrushCompiler::parse_tunable(iter_t const& i)
     crush.set_choose_local_fallback_tries(val);
   else if (name == "choose_total_tries")
     crush.set_choose_total_tries(val);
+  else if (name == "chooseleaf_descend_once")
+    crush.set_chooseleaf_descend_once(val);
   else {
     err << "tunable " << name << " not recognized" << std::endl;
     return -1;
index f38a7698376792bbfdb8fc98b05ebacbdec195e6..3bae96c8689d98ccefb4ca1ff90b02311de6b0fb 100644 (file)
@@ -540,6 +540,7 @@ void CrushWrapper::encode(bufferlist& bl, bool lean) const
   ::encode(crush->choose_local_tries, bl);
   ::encode(crush->choose_local_fallback_tries, bl);
   ::encode(crush->choose_total_tries, bl);
+  ::encode(crush->chooseleaf_descend_once, bl);
 }
 
 static void decode_32_or_64_string_map(map<int32_t,string>& m, bufferlist::iterator& blp)
@@ -615,6 +616,9 @@ void CrushWrapper::decode(bufferlist::iterator& blp)
       ::decode(crush->choose_local_fallback_tries, blp);
       ::decode(crush->choose_total_tries, blp);
     }
+    if (!blp.end()) {
+      ::decode(crush->chooseleaf_descend_once, blp);
+    }
     finalize();
   }
   catch (...) {
index 5ea68b9a83d4d7b85db115ae160b54e714ea7ff6..0e495c1df1cfbfd14c14da6a14e13068ba08cb22 100644 (file)
@@ -118,6 +118,13 @@ public:
     crush->choose_total_tries = n;
   }
 
+  int get_chooseleaf_descend_once() {
+    return crush->chooseleaf_descend_once;
+  }
+  void set_chooseleaf_descend_once(int n) {
+    crush->chooseleaf_descend_once = !!n;
+  }
+
   bool has_nondefault_tunables() const {
     return
       (crush->choose_local_tries != 2 ||
index 64ad0051a6ff980d7d4fff74c4c12ae0bc8c95fa..880c80902438bfda4e489364d36202e66b50b52f 100644 (file)
@@ -23,6 +23,7 @@ struct crush_map *crush_create()
        m->choose_local_tries = 2;
        m->choose_local_fallback_tries = 5;
        m->choose_total_tries = 19;
+       m->chooseleaf_descend_once = 0;
        return m;
 }
 
index de22386ed7003148b7d7640810c54208095572ca..9fd37e9e516ee5da205960c33625b0f78ec8fce7 100644 (file)
@@ -169,6 +169,8 @@ struct crush_map {
        __u32 choose_local_fallback_tries;
        /* choose attempts before giving up */ 
        __u32 choose_total_tries;
+       /* attempt chooseleaf inner descent once; on failure retry outer descent */
+       __u32 chooseleaf_descend_once;
 
        __u32 *choose_tries;
 };
index 267b9b696c3d7825851086b8b2dacb4bdedec1fb..c4f244524a5ff35410d83d4f47768c33acbd285f 100644 (file)
@@ -287,7 +287,8 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int weight_m
  * @param out pointer to output vector
  * @param outpos our position in that vector
  * @param firstn true if choosing "first n" items, false if choosing "indep"
- * @param recurseto_leaf: true if we want one device under each item of given type
+ * @param recurse_to_leaf: true if we want one device under each item of given type
+ * @param descend_once: true if we should only try one descent before giving up
  * @param out2 second output vector for leaf items (if @a recurse_to_leaf)
  */
 static int crush_choose(const struct crush_map *map,
@@ -296,7 +297,7 @@ static int crush_choose(const struct crush_map *map,
                        int x, int numrep, int type,
                        int *out, int outpos,
                        int firstn, int recurse_to_leaf,
-                       int *out2)
+                       int descend_once, int *out2)
 {
        int rep;
        unsigned int ftotal, flocal;
@@ -400,6 +401,7 @@ static int crush_choose(const struct crush_map *map,
                                                         x, outpos+1, 0,
                                                         out2, outpos,
                                                         firstn, 0,
+                                                        map->chooseleaf_descend_once,
                                                         NULL) <= outpos)
                                                        /* didn't get leaf */
                                                        reject = 1;
@@ -423,7 +425,10 @@ reject:
                                        ftotal++;
                                        flocal++;
 
-                                       if (collide && flocal <= map->choose_local_tries)
+                                       if (reject && descend_once)
+                                               /* let outer call try again */
+                                               skip_rep = 1;
+                                       else if (collide && flocal <= map->choose_local_tries)
                                                /* retry locally a few times */
                                                retry_bucket = 1;
                                        else if (map->choose_local_fallback_tries > 0 &&
@@ -489,6 +494,7 @@ int crush_do_rule(const struct crush_map *map,
        int i, j;
        int numrep;
        int firstn;
+       const int descend_once = 0;
 
        if ((__u32)ruleno >= map->max_rules) {
                dprintk(" bad ruleno %d\n", ruleno);
@@ -548,7 +554,8 @@ int crush_do_rule(const struct crush_map *map,
                                                      curstep->arg2,
                                                      o+osize, j,
                                                      firstn,
-                                                     recurse_to_leaf, c+osize);
+                                                     recurse_to_leaf,
+                                                     descend_once, c+osize);
                        }
 
                        if (recurse_to_leaf)
index 14fc49eb9ad2cf57b15351dae14bb1dcce623eb3..9789dd0861dff31d7e8511cbb1662dbed1e9d634 100644 (file)
@@ -188,6 +188,7 @@ int main(int argc, const char **argv)
   int choose_local_tries = -1;
   int choose_local_fallback_tries = -1;
   int choose_total_tries = -1;
+  int chooseleaf_descend_once = -1;
 
   CrushWrapper crush;
 
@@ -249,6 +250,9 @@ int main(int argc, const char **argv)
     } else if (ceph_argparse_withint(args, i, &choose_total_tries, &err,
                                     "--set_choose_total_tries", (char*)NULL)) {
       adjust = true;
+    } else if (ceph_argparse_withint(args, i, &chooseleaf_descend_once, &err,
+                                    "--set_chooseleaf_descend_once", (char*)NULL)) {
+      adjust = true;
     } else if (ceph_argparse_flag(args, i, "--reweight", (char*)NULL)) {
       reweight = true;
     } else if (ceph_argparse_withint(args, i, &add_item, &err, "--add_item", (char*)NULL)) {
@@ -668,6 +672,14 @@ int main(int argc, const char **argv)
     crush.set_choose_total_tries(choose_total_tries);
     modified = true;
   }
+  if (chooseleaf_descend_once >= 0) {
+    if (!unsafe_tunables) {
+      cerr << scary_tunables_message << std::endl;
+      return -1;
+    }
+    crush.set_chooseleaf_descend_once(chooseleaf_descend_once);
+    modified = true;
+  }
   if (modified) {
     crush.finalize();