From: Sage Weil Date: Tue, 4 Feb 2014 21:38:29 +0000 (-0800) Subject: crush: add chooseleaf_vary_r tunable X-Git-Tag: v0.78~202^2~7 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=a8e6c9fbf88bad056dd05d3eb790e98a5e43451a;p=ceph.git crush: add chooseleaf_vary_r tunable The current crush_choose_firstn code will re-use the same 'r' value for the recursive call. That means that if we are hitting a collision or rejection for some reason (say, an OSD that is marked out) and need to retry, we will keep making the same (bad) choice in that recursive selection. Introduce a tunable that fixes that behavior by incorporating the parent 'r' value into the recursive starting point, so that a different path will be taken in subsequent placement attempts. Note that this was done from the get-go for the new crush_choose_indep algorithm. This was exposed by a user who was seeing PGs stuck in active+remapped after reweight-by-utilization because the up set mapped to a single OSD. Signed-off-by: Sage Weil --- diff --git a/src/crush/builder.c b/src/crush/builder.c index c524cfcf1e2..eff0bf63a52 100644 --- a/src/crush/builder.c +++ b/src/crush/builder.c @@ -26,6 +26,7 @@ struct crush_map *crush_create() m->choose_local_fallback_tries = 5; m->choose_total_tries = 19; m->chooseleaf_descend_once = 0; + m->chooseleaf_vary_r = 0; return m; } diff --git a/src/crush/crush.h b/src/crush/crush.h index 0da7180b449..249a9929107 100644 --- a/src/crush/crush.h +++ b/src/crush/crush.h @@ -182,6 +182,12 @@ struct crush_map { * to. */ __u32 chooseleaf_descend_once; + /* if non-zero, feed r into chooseleaf, bit-shifted right by (vary_r-1) + * bits. a value of 1 is best for new clusters. for legacy clusters + * that want to limit reshuffling, a value of 3 or 4 will make the + * mappings line up a bit better with previous mappings. 
*/ + __u8 chooseleaf_vary_r; + __u32 *choose_tries; }; diff --git a/src/crush/mapper.c b/src/crush/mapper.c index 0b318443bcd..89227028f72 100644 --- a/src/crush/mapper.c +++ b/src/crush/mapper.c @@ -296,7 +296,9 @@ static int is_out(const struct crush_map *map, * @local_retries: localized retries * @local_fallback_retries: localized fallback retries * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) + * @vary_r: pass r to recursive calls * @out2: second output vector for leaf items (if @recurse_to_leaf) + * @parent_r: r value passed from the parent */ static int crush_choose_firstn(const struct crush_map *map, struct crush_bucket *bucket, @@ -308,7 +310,9 @@ static int crush_choose_firstn(const struct crush_map *map, unsigned int local_retries, unsigned int local_fallback_retries, int recurse_to_leaf, - int *out2) + unsigned int vary_r, + int *out2, + int parent_r) { int rep; unsigned int ftotal, flocal; @@ -320,8 +324,11 @@ static int crush_choose_firstn(const struct crush_map *map, int itemtype; int collide, reject; - dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", - bucket->id, x, outpos, numrep); + dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", + recurse_to_leaf ? 
"_LEAF" : "", + bucket->id, x, outpos, numrep, + tries, recurse_tries, local_retries, local_fallback_retries, + parent_r); for (rep = outpos; rep < numrep; rep++) { /* keep trying until we get a non-out, non-colliding item */ @@ -336,7 +343,7 @@ static int crush_choose_firstn(const struct crush_map *map, do { collide = 0; retry_bucket = 0; - r = rep; + r = rep + parent_r; /* r' = r + f_total */ r += ftotal; @@ -388,6 +395,11 @@ static int crush_choose_firstn(const struct crush_map *map, reject = 0; if (!collide && recurse_to_leaf) { if (item < 0) { + int sub_r; + if (vary_r) + sub_r = r >> (vary_r-1); + else + sub_r = 0; if (crush_choose_firstn(map, map->buckets[-1-item], weight, weight_max, @@ -397,7 +409,9 @@ static int crush_choose_firstn(const struct crush_map *map, local_retries, local_fallback_retries, 0, - NULL) <= outpos) + vary_r, + NULL, + sub_r) <= outpos) /* didn't get leaf */ reject = 1; } else { @@ -685,6 +699,8 @@ int crush_do_rule(const struct crush_map *map, int choose_local_retries = map->choose_local_tries; int choose_local_fallback_retries = map->choose_local_fallback_tries; + int vary_r = map->chooseleaf_vary_r; + if ((__u32)ruleno >= map->max_rules) { dprintk(" bad ruleno %d\n", ruleno); return 0; @@ -777,7 +793,9 @@ int crush_do_rule(const struct crush_map *map, choose_local_retries, choose_local_fallback_retries, recurse_to_leaf, - c+osize); + vary_r, + c+osize, + 0); } else { crush_choose_indep( map,