From 19537a450fd5c5a0bb8b7830947507a76db2ceca Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Thu, 13 Apr 2017 18:14:44 +0200 Subject: [PATCH] crush: implement weight and id overrides for straw2 bucket_straw2_choose needs to use weights that may be different from weight_items. For instance to compensate for an uneven distribution caused by a low number of values. Or to fix the probability biais introduced by conditional probabilities (see http://tracker.ceph.com/issues/15653 for more information). We introduce a weight_set for each straw2 bucket to set the desired weight for a given item at a given position. The weight of a given item when picking the first replica (first position) may be different from the weight the second replica (second position). For instance the weight matrix for a given bucket containing items 3, 7 and 13 could be as follows: position 0 position 1 item 3 0x10000 0x100000 item 7 0x40000 0x10000 item 13 0x40000 0x10000 When crush_do_rule picks the first of two replicas (position 0), item 7, 3 are four times more likely to be choosen by bucket_straw2_choose than item 13. When choosing the second replica (position 1), item 3 is ten times more likely to be choosen than item 7, 13. By default the weight_set of each bucket exactly matches the content of item_weights for each position to ensure backward compatibility. bucket_straw2_choose compares items by using their id. The same ids are also used to index buckets and they must be unique. For each item in a bucket an array of ids can be provided for placement purposes and they are used instead of the ids. If no replacement ids are provided, the legacy behavior is preserved. Signed-off-by: Loic Dachary --- src/crush/builder.c | 61 +++++++++++++++++++++++++++++++++++++++ src/crush/builder.h | 2 ++ src/crush/crush.h | 55 +++++++++++++++++++++++++++++++++++ src/crush/mapper.c | 70 +++++++++++++++++++++++++++++++++------------ src/crush/mapper.h | 5 ++-- 5 files changed, 172 insertions(+), 21 deletions(-) diff --git a/src/crush/builder.c b/src/crush/builder.c index 5a401d40067..00f4b6c2ee9 100644 --- a/src/crush/builder.c +++ b/src/crush/builder.c @@ -1400,6 +1400,67 @@ int crush_reweight_bucket(struct crush_map *map, struct crush_bucket *b) } } +struct crush_choose_arg *crush_make_choose_args(struct crush_map *map, int num_positions) +{ + int b; + int sum_bucket_size = 0; + int bucket_count = 0; + for (b = 0; b < map->max_buckets; b++) { + if (map->buckets[b] == 0) + continue; + sum_bucket_size += map->buckets[b]->size; + bucket_count++; + } + dprintk("sum_bucket_size %d max_buckets %d bucket_count %d\n", + sum_bucket_size, map->max_buckets, bucket_count); + int size = (sizeof(struct crush_choose_arg) * map->max_buckets + + sizeof(struct crush_weight_set) * bucket_count * num_positions + + sizeof(__u32) * sum_bucket_size * num_positions + // weights + sizeof(__u32) * sum_bucket_size); // ids + char *space = malloc(size); + struct crush_choose_arg *arg = (struct crush_choose_arg *)space; + struct crush_weight_set *weight_set = (struct crush_weight_set *)(arg + map->max_buckets); + __u32 *weights = (__u32 *)(weight_set + bucket_count * num_positions); + char *weight_set_ends = (char*)weights; + int *ids = (int *)(weights + sum_bucket_size * num_positions); + char *weights_end = (char *)ids; + char *ids_end = (char *)(ids + sum_bucket_size); + BUG_ON(space + size != ids_end); + for (b = 0; b < map->max_buckets; b++) { + if (map->buckets[b] == 0) { + memset(&arg[b], '\0', sizeof(struct crush_choose_arg)); + continue; + } + struct crush_bucket_straw2 *bucket = (struct crush_bucket_straw2 *)map->buckets[b]; + + int position; + for (position = 0; position < num_positions; position++) { + memcpy(weights, bucket->item_weights, sizeof(__u32) * bucket->h.size); + weight_set[position].weights = weights; + weight_set[position].size = bucket->h.size; + dprintk("moving weight %d bytes forward\n", (int)((weights + bucket->h.size) - weights)); + weights += bucket->h.size; + } + arg[b].weight_set = weight_set; + arg[b].weight_set_size = num_positions; + weight_set += position; + + memcpy(ids, bucket->h.items, sizeof(int) * bucket->h.size); + arg[b].ids = ids; + arg[b].ids_size = bucket->h.size; + ids += bucket->h.size; + } + BUG_ON((char*)weight_set_ends != (char*)weight_set); + BUG_ON((char*)weights_end != (char*)weights); + BUG_ON((char*)ids != (char*)ids_end); + return arg; +} + +void crush_destroy_choose_args(struct crush_choose_arg *args) +{ + free(args); +} + /***************************/ /* methods to check for safe arithmetic operations */ diff --git a/src/crush/builder.h b/src/crush/builder.h index 05f2fe1cf12..651729ff0e0 100644 --- a/src/crush/builder.h +++ b/src/crush/builder.h @@ -208,6 +208,8 @@ extern int crush_add_bucket(struct crush_map *map, * @returns a pointer to the newly created bucket or NULL */ struct crush_bucket *crush_make_bucket(struct crush_map *map, int alg, int hash, int type, int size, int *items, int *weights); +extern struct crush_choose_arg *crush_make_choose_args(struct crush_map *map, int num_positions); +extern void crush_destroy_choose_args(struct crush_choose_arg *args); /** @ingroup API * * Add __item__ to __bucket__ with __weight__. The weight of the new diff --git a/src/crush/crush.h b/src/crush/crush.h index 4c89c2781e5..83963ef9cc0 100644 --- a/src/crush/crush.h +++ b/src/crush/crush.h @@ -238,6 +238,61 @@ struct crush_bucket { __s32 *items; /*!< array of children: < 0 are buckets, >= 0 items */ }; +/** @ingroup API + * + * Replacement weights for each item in a bucket. The size of the + * array must be exactly the size of the straw2 bucket, just as the + * item_weights array. + * + */ +struct crush_weight_set { + __u32 *weights; /*!< 16.16 fixed point weights in the same order as items */ + __u32 size; /*!< size of the __weights__ array */ +}; + +/** @ingroup API + * + * Replacement weights and ids for a given straw2 bucket, for + * placement purposes. + * + * When crush_do_rule() chooses the Nth item from a straw2 bucket, the + * replacement weights found at __weight_set[N]__ are used instead of + * the weights from __item_weights__. If __N__ is greater than + * __weight_set_size__, the weights found at __weight_set_size-1__ are + * used instead. For instance if __weight_set__ is: + * + * [ [ 0x10000, 0x20000 ], // position 0 + * [ 0x20000, 0x40000 ] ] // position 1 + * + * choosing the 0th item will use position 0 weights [ 0x10000, 0x20000 ] + * choosing the 1th item will use position 1 weights [ 0x20000, 0x40000 ] + * choosing the 2th item will use position 1 weights [ 0x20000, 0x40000 ] + * etc. + * + */ +struct crush_choose_arg { + int *ids; /*!< values to use instead of items */ + __u32 ids_size; /*!< size of the __ids__ array */ + struct crush_weight_set *weight_set; /*!< weight replacements for a given position */ + __u32 weight_set_size; /*!< size of the __weight_set__ array */ +}; + +/** @ingroup API + * + * Replacement weights and ids for each bucket in the crushmap. The + * __size__ of the __args__ array must be exactly the same as the + * __map->max_buckets__. + * + * The __crush_choose_arg__ at index N will be used when choosing + * an item from the bucket __map->buckets[N]__ bucket, provided it + * is a straw2 bucket. + * + */ +struct crush_choose_arg_map { + struct crush_choose_arg *args; /*!< replacement for each bucket in the crushmap */ + __u32 size; /*!< size of the __args__ array */ +}; + /** @ingroup API * The weight of each item in the bucket when * __h.alg__ == ::CRUSH_BUCKET_UNIFORM. diff --git a/src/crush/mapper.c b/src/crush/mapper.c index f790cb431cd..9fc06e89a31 100644 --- a/src/crush/mapper.c +++ b/src/crush/mapper.c @@ -299,19 +299,40 @@ static __u64 crush_ln(unsigned int xin) * */ +static inline __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bucket, + const struct crush_choose_arg *arg, + int position) +{ + if ((arg == NULL) || + (arg->weight_set == NULL) || + (arg->weight_set_size == 0)) + return bucket->item_weights; + if (position >= arg->weight_set_size) + position = arg->weight_set_size - 1; + return arg->weight_set[position].weights; +} + +static inline int *get_choose_arg_ids(const struct crush_bucket_straw2 *bucket, + const struct crush_choose_arg *arg) +{ + if ((arg == NULL) || (arg->ids == NULL)) + return bucket->h.items; + return arg->ids; +} + static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket, - int x, int r) + int x, int r, const struct crush_choose_arg *arg, + int position) { unsigned int i, high = 0; unsigned int u; - unsigned int w; __s64 ln, draw, high_draw = 0; - + __u32 *weights = get_choose_arg_weights(bucket, arg, position); + int *ids = get_choose_arg_ids(bucket, arg); for (i = 0; i < bucket->h.size; i++) { - w = bucket->item_weights[i]; - if (w) { - u = crush_hash32_3(bucket->h.hash, x, - bucket->h.items[i], r); + dprintk("weight 0x%x item %d\n", weights[i], ids[i]); + if (weights[i]) { + u = crush_hash32_3(bucket->h.hash, x, ids[i], r); u &= 0xffff; /* @@ -332,7 +353,7 @@ static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket, * weight means a larger (less negative) value * for draw. */ - draw = div64_s64(ln, w); + draw = div64_s64(ln, weights[i]); } else { draw = S64_MIN; } @@ -349,7 +370,9 @@ static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket, static int crush_bucket_choose(const struct crush_bucket *in, struct crush_work_bucket *work, - int x, int r) + int x, int r, + const struct crush_choose_arg *arg, + int position) { dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); BUG_ON(in->size == 0); @@ -371,7 +394,7 @@ static int crush_bucket_choose(const struct crush_bucket *in, case CRUSH_BUCKET_STRAW2: return bucket_straw2_choose( (const struct crush_bucket_straw2 *)in, - x, r); + x, r, arg, position); default: dprintk("unknown bucket %d alg %d\n", in->id, in->alg); return in->items[0]; @@ -433,7 +456,8 @@ static int crush_choose_firstn(const struct crush_map *map, unsigned int vary_r, unsigned int stable, int *out2, - int parent_r) + int parent_r, + const struct crush_choose_arg *choose_args) { int rep; unsigned int ftotal, flocal; @@ -485,7 +509,9 @@ parent_r %d stable %d\n", else item = crush_bucket_choose( in, work->work[-1-in->id], - x, r); + x, r, + (choose_args ? &choose_args[-1-in->id] : 0), + outpos); if (item >= map->max_devices) { dprintk(" bad item %d\n", item); skip_rep = 1; @@ -542,7 +568,8 @@ parent_r %d stable %d\n", vary_r, stable, NULL, - sub_r) <= outpos) + sub_r, + choose_args) <= outpos) /* didn't get leaf */ reject = 1; } else { @@ -619,7 +646,8 @@ static void crush_choose_indep(const struct crush_map *map, unsigned int recurse_tries, int recurse_to_leaf, int *out2, - int parent_r) + int parent_r, + const struct crush_choose_arg *choose_args) { const struct crush_bucket *in = bucket; int endpos = outpos + left; @@ -691,7 +719,9 @@ static void crush_choose_indep(const struct crush_map *map, item = crush_bucket_choose( in, work->work[-1-in->id], - x, r); + x, r, + (choose_args ? &choose_args[-1-in->id] : 0), + outpos); if (item >= map->max_devices) { dprintk(" bad item %d\n", item); out[rep] = CRUSH_ITEM_NONE; @@ -745,7 +775,7 @@ static void crush_choose_indep(const struct crush_map *map, x, 1, numrep, 0, out2, rep, recurse_tries, 0, - 0, NULL, r); + 0, NULL, r, choose_args); if (out2[rep] == CRUSH_ITEM_NONE) { /* placed nothing; no leaf */ break; @@ -854,7 +884,7 @@ void crush_init_workspace(const struct crush_map *m, void *v) { int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, const __u32 *weight, int weight_max, - void *cwin) + void *cwin, const struct crush_choose_arg *choose_args) { int result_len; struct crush_work *cw = cwin; @@ -1009,7 +1039,8 @@ int crush_do_rule(const struct crush_map *map, vary_r, stable, c+osize, - 0); + 0, + choose_args); } else { out_size = ((numrep < (result_max-osize)) ? numrep : (result_max-osize)); @@ -1026,7 +1057,8 @@ int crush_do_rule(const struct crush_map *map, choose_leaf_tries : 1, recurse_to_leaf, c+osize, - 0); + 0, + choose_args); osize += out_size; } } diff --git a/src/crush/mapper.h b/src/crush/mapper.h index 8de37e82928..b397070bef2 100644 --- a/src/crush/mapper.h +++ b/src/crush/mapper.h @@ -67,7 +67,8 @@ extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, i * @param result_max the size of the __result__ array * @param weights an array of weights of size __weight_max__ * @param weight_max the size of the __weights__ array - * @param cwin must be the value of crush_work_size(__map__, __result_max__) + * @param cwin must be an char array initialized by crush_init_workspace + * @param choose_args weights and ids for each known bucket * * @return 0 on error or the size of __result__ on success */ @@ -75,7 +76,7 @@ extern int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, const __u32 *weights, int weight_max, - void *cwin); + void *cwin, const struct crush_choose_arg *choose_args); /* Returns the exact amount of workspace that will need to be used for a given combination of crush_map and result_max. The caller can -- 2.39.5