From: Jim Schutt Date: Mon, 10 Sep 2012 21:43:19 +0000 (-0600) Subject: crush: for chooseleaf rules, retry CRUSH map descent from root if leaf is failed X-Git-Tag: v0.55~40^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=88f218181a9e6d2292e2697fc93797d0f6d6e5dc;p=ceph.git crush: for chooseleaf rules, retry CRUSH map descent from root if leaf is failed Consider the CRUSH rule: step chooseleaf firstn 0 type <node_type>. This rule means that <n> replicas will be chosen in a manner such that each chosen leaf's branch will contain a unique instance of <node_type>. When an object is re-replicated after a leaf failure, if the CRUSH map uses a chooseleaf rule the remapped replica ends up under the <node_type> bucket that held the failed leaf. This causes uneven data distribution across the storage cluster, to the point that when all the leaves but one fail under a particular <node_type> bucket, that remaining leaf holds all the data from its failed peers. This behavior also limits the number of peers that can participate in the re-replication of the data held by the failed leaf, which increases the time required to re-replicate after a failure. For a chooseleaf CRUSH rule, the tree descent has two steps: call them the inner and outer descents. If the tree descent down to <node_type> is the outer descent, and the descent from <node_type> down to a leaf is the inner descent, the issue is that a down leaf is detected on the inner descent, so only the inner descent is retried. In order to disperse re-replicated data as widely as possible across a storage cluster after a failure, we want to retry the outer descent. So, fix up crush_choose() to allow the inner descent to return immediately on choosing a failed leaf. Wire this up as a new CRUSH tunable. Note that after this change, for a chooseleaf rule, if the primary OSD in a placement group has failed, choosing a replacement may result in one of the other OSDs in the PG colliding with the new primary. 
This requires that OSD's data for that PG to be moved as well. This seems unavoidable but should be relatively rare. Signed-off-by: Jim Schutt --- diff --git a/src/crush/CrushCompiler.cc b/src/crush/CrushCompiler.cc index 5a0cdbd08486..b290c2ecf024 100644 --- a/src/crush/CrushCompiler.cc +++ b/src/crush/CrushCompiler.cc @@ -189,6 +189,8 @@ int CrushCompiler::decompile(ostream &out) out << "tunable choose_local_fallback_tries " << crush.get_choose_local_fallback_tries() << "\n"; if (crush.get_choose_total_tries() != 19) out << "tunable choose_total_tries " << crush.get_choose_total_tries() << "\n"; + if (crush.get_chooseleaf_descend_once() != 0) + out << "tunable chooseleaf_descend_once " << crush.get_chooseleaf_descend_once() << "\n"; out << "\n# devices\n"; for (int i=0; ichoose_local_tries, bl); ::encode(crush->choose_local_fallback_tries, bl); ::encode(crush->choose_total_tries, bl); + ::encode(crush->chooseleaf_descend_once, bl); } static void decode_32_or_64_string_map(map& m, bufferlist::iterator& blp) @@ -615,6 +616,9 @@ void CrushWrapper::decode(bufferlist::iterator& blp) ::decode(crush->choose_local_fallback_tries, blp); ::decode(crush->choose_total_tries, blp); } + if (!blp.end()) { + ::decode(crush->chooseleaf_descend_once, blp); + } finalize(); } catch (...) 
{ diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h index 5ea68b9a83d4..0e495c1df1cf 100644 --- a/src/crush/CrushWrapper.h +++ b/src/crush/CrushWrapper.h @@ -118,6 +118,13 @@ public: crush->choose_total_tries = n; } + int get_chooseleaf_descend_once() { + return crush->chooseleaf_descend_once; + } + void set_chooseleaf_descend_once(int n) { + crush->chooseleaf_descend_once = !!n; + } + bool has_nondefault_tunables() const { return (crush->choose_local_tries != 2 || diff --git a/src/crush/builder.c b/src/crush/builder.c index 64ad0051a6ff..880c80902438 100644 --- a/src/crush/builder.c +++ b/src/crush/builder.c @@ -23,6 +23,7 @@ struct crush_map *crush_create() m->choose_local_tries = 2; m->choose_local_fallback_tries = 5; m->choose_total_tries = 19; + m->chooseleaf_descend_once = 0; return m; } diff --git a/src/crush/crush.h b/src/crush/crush.h index de22386ed700..9fd37e9e516e 100644 --- a/src/crush/crush.h +++ b/src/crush/crush.h @@ -169,6 +169,8 @@ struct crush_map { __u32 choose_local_fallback_tries; /* choose attempts before giving up */ __u32 choose_total_tries; + /* attempt chooseleaf inner descent once; on failure retry outer descent */ + __u32 chooseleaf_descend_once; __u32 *choose_tries; }; diff --git a/src/crush/mapper.c b/src/crush/mapper.c index 267b9b696c3d..c4f244524a5f 100644 --- a/src/crush/mapper.c +++ b/src/crush/mapper.c @@ -287,7 +287,8 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int weight_m * @param out pointer to output vector * @param outpos our position in that vector * @param firstn true if choosing "first n" items, false if choosing "indep" - * @param recurseto_leaf: true if we want one device under each item of given type + * @param recurse_to_leaf: true if we want one device under each item of given type + * @descend_once: true if we should only try one descent before giving up * @param out2 second output vector for leaf items (if @a recurse_to_leaf) */ static int crush_choose(const struct 
crush_map *map, @@ -296,7 +297,7 @@ static int crush_choose(const struct crush_map *map, int x, int numrep, int type, int *out, int outpos, int firstn, int recurse_to_leaf, - int *out2) + int descend_once, int *out2) { int rep; unsigned int ftotal, flocal; @@ -400,6 +401,7 @@ static int crush_choose(const struct crush_map *map, x, outpos+1, 0, out2, outpos, firstn, 0, + map->chooseleaf_descend_once, NULL) <= outpos) /* didn't get leaf */ reject = 1; @@ -423,7 +425,10 @@ reject: ftotal++; flocal++; - if (collide && flocal <= map->choose_local_tries) + if (reject && descend_once) + /* let outer call try again */ + skip_rep = 1; + else if (collide && flocal <= map->choose_local_tries) /* retry locally a few times */ retry_bucket = 1; else if (map->choose_local_fallback_tries > 0 && @@ -489,6 +494,7 @@ int crush_do_rule(const struct crush_map *map, int i, j; int numrep; int firstn; + const int descend_once = 0; if ((__u32)ruleno >= map->max_rules) { dprintk(" bad ruleno %d\n", ruleno); @@ -548,7 +554,8 @@ int crush_do_rule(const struct crush_map *map, curstep->arg2, o+osize, j, firstn, - recurse_to_leaf, c+osize); + recurse_to_leaf, + descend_once, c+osize); } if (recurse_to_leaf) diff --git a/src/crushtool.cc b/src/crushtool.cc index 14fc49eb9ad2..9789dd0861df 100644 --- a/src/crushtool.cc +++ b/src/crushtool.cc @@ -188,6 +188,7 @@ int main(int argc, const char **argv) int choose_local_tries = -1; int choose_local_fallback_tries = -1; int choose_total_tries = -1; + int chooseleaf_descend_once = -1; CrushWrapper crush; @@ -249,6 +250,9 @@ int main(int argc, const char **argv) } else if (ceph_argparse_withint(args, i, &choose_total_tries, &err, "--set_choose_total_tries", (char*)NULL)) { adjust = true; + } else if (ceph_argparse_withint(args, i, &chooseleaf_descend_once, &err, + "--set_chooseleaf_descend_once", (char*)NULL)) { + adjust = true; } else if (ceph_argparse_flag(args, i, "--reweight", (char*)NULL)) { reweight = true; } else if (ceph_argparse_withint(args, i, 
&add_item, &err, "--add_item", (char*)NULL)) { @@ -668,6 +672,14 @@ int main(int argc, const char **argv) crush.set_choose_total_tries(choose_total_tries); modified = true; } + if (chooseleaf_descend_once >= 0) { + if (!unsafe_tunables) { + cerr << scary_tunables_message << std::endl; + return -1; + } + crush.set_chooseleaf_descend_once(chooseleaf_descend_once); + modified = true; + } if (modified) { crush.finalize();