From: Adam C. Emerson Date: Tue, 28 Jun 2016 21:55:39 +0000 (-0400) Subject: crush: Remove mutable part of CRUSH map X-Git-Tag: v11.1.0~328^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=cbcd039651c0569551cb90d26ce27e1432671f2a;p=ceph.git crush: Remove mutable part of CRUSH map Then add it to the working state. It would be very nice if we didn't have to take a lock to calculate a crush placement. By moving the permutation array into the working data, we can treat the CRUSH map as immutable. Signed-off-by: Adam C. Emerson --- diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc index 490c79fc97a1..afc91f5d301c 100644 --- a/src/crush/CrushWrapper.cc +++ b/src/crush/CrushWrapper.cc @@ -1365,9 +1365,6 @@ void CrushWrapper::decode_crush_bucket(crush_bucket** bptr, bufferlist::iterator ::decode(bucket->items[j], blp); } - bucket->perm = (__u32*)calloc(1, bucket->size * sizeof(__u32)); - bucket->perm_n = 0; - switch (bucket->alg) { case CRUSH_BUCKET_UNIFORM: ::decode((reinterpret_cast(bucket))->item_weight, blp); diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h index 077417d0ee4c..36013321aa25 100644 --- a/src/crush/CrushWrapper.h +++ b/src/crush/CrushWrapper.h @@ -1091,23 +1091,25 @@ public: Mutex::Locker l(mapper_lock); int rawout[maxout]; int scratch[maxout * 3]; - int numrep = crush_do_rule(crush, rule, x, rawout, maxout, &weight[0], weight.size(), scratch); + char work[crush->working_size]; + crush_init_workspace(crush, work); + int numrep = crush_do_rule(crush, rule, x, rawout, maxout, &weight[0], + weight.size(), work, scratch); if (numrep < 0) numrep = 0; out.resize(numrep); for (int i=0; imax_rules; i++) { if (crush->rules[i] && - crush->rules[i]->mask.ruleset == ruleset && - crush->rules[i]->mask.type == type) { + crush->rules[i]->mask.ruleset == ruleset && + crush->rules[i]->mask.type == type) { if (crush->rules[i]->mask.min_size <= size && crush->rules[i]->mask.max_size >= size) { diff --git 
a/src/crush/builder.c b/src/crush/builder.c index 9331f6d8bada..71a6264e76a9 100644 --- a/src/crush/builder.c +++ b/src/crush/builder.c @@ -45,6 +45,13 @@ void crush_finalize(struct crush_map *map) int b; __u32 i; + /* Calculate the needed working space while we do other + finalization tasks. */ + map->working_size = sizeof(struct crush_work); + /* Space for the array of pointers to per-bucket workspace */ + map->working_size += map->max_buckets * + sizeof(struct crush_work_bucket *); + /* calc max_devices */ map->max_devices = 0; for (b=0; bmax_buckets; b++) { @@ -53,13 +60,21 @@ void crush_finalize(struct crush_map *map) for (i=0; ibuckets[b]->size; i++) if (map->buckets[b]->items[i] >= map->max_devices) map->max_devices = map->buckets[b]->items[i] + 1; + + switch (map->buckets[b]->alg) { + default: + /* The base case, permutation variables and + the pointer to the permutation array. */ + map->working_size += sizeof(struct crush_work_bucket); + break; + } + /* Every bucket has a permutation array. 
*/ + map->working_size += map->buckets[b]->size * sizeof(__u32); } } - - /** rules **/ int crush_add_rule(struct crush_map *map, struct crush_rule *rule, int ruleno) @@ -212,16 +227,11 @@ crush_make_uniform_bucket(int hash, int type, int size, if (!bucket->h.items) goto err; - bucket->h.perm = malloc(sizeof(__u32)*size); - - if (!bucket->h.perm) - goto err; for (i=0; ih.items[i] = items[i]; return bucket; err: - free(bucket->h.perm); free(bucket->h.items); free(bucket); return NULL; @@ -251,9 +261,6 @@ crush_make_list_bucket(int hash, int type, int size, bucket->h.items = malloc(sizeof(__s32)*size); if (!bucket->h.items) goto err; - bucket->h.perm = malloc(sizeof(__u32)*size); - if (!bucket->h.perm) - goto err; bucket->item_weights = malloc(sizeof(__u32)*size); @@ -282,7 +289,6 @@ crush_make_list_bucket(int hash, int type, int size, err: free(bucket->sum_weights); free(bucket->item_weights); - free(bucket->h.perm); free(bucket->h.items); free(bucket); return NULL; @@ -347,7 +353,6 @@ crush_make_tree_bucket(int hash, int type, int size, if (size == 0) { bucket->h.items = NULL; - bucket->h.perm = NULL; bucket->h.weight = 0; bucket->node_weights = NULL; bucket->num_nodes = 0; @@ -358,9 +363,6 @@ crush_make_tree_bucket(int hash, int type, int size, bucket->h.items = malloc(sizeof(__s32)*size); if (!bucket->h.items) goto err; - bucket->h.perm = malloc(sizeof(__u32)*size); - if (!bucket->h.perm) - goto err; /* calc tree depth */ depth = calc_depth(size); @@ -399,7 +401,6 @@ crush_make_tree_bucket(int hash, int type, int size, return bucket; err: free(bucket->node_weights); - free(bucket->h.perm); free(bucket->h.items); free(bucket); return NULL; @@ -577,9 +578,6 @@ crush_make_straw_bucket(struct crush_map *map, bucket->h.items = malloc(sizeof(__s32)*size); if (!bucket->h.items) goto err; - bucket->h.perm = malloc(sizeof(__u32)*size); - if (!bucket->h.perm) - goto err; bucket->item_weights = malloc(sizeof(__u32)*size); if (!bucket->item_weights) goto err; @@ -601,7 +599,6 
@@ crush_make_straw_bucket(struct crush_map *map, err: free(bucket->straws); free(bucket->item_weights); - free(bucket->h.perm); free(bucket->h.items); free(bucket); return NULL; @@ -630,9 +627,6 @@ crush_make_straw2_bucket(struct crush_map *map, bucket->h.items = malloc(sizeof(__s32)*size); if (!bucket->h.items) goto err; - bucket->h.perm = malloc(sizeof(__u32)*size); - if (!bucket->h.perm) - goto err; bucket->item_weights = malloc(sizeof(__u32)*size); if (!bucket->item_weights) goto err; @@ -647,7 +641,6 @@ crush_make_straw2_bucket(struct crush_map *map, return bucket; err: free(bucket->item_weights); - free(bucket->h.perm); free(bucket->h.items); free(bucket); return NULL; @@ -698,11 +691,6 @@ int crush_add_uniform_bucket_item(struct crush_bucket_uniform *bucket, int item, } else { bucket->h.items = _realloc; } - if ((_realloc = realloc(bucket->h.perm, sizeof(__u32)*newsize)) == NULL) { - return -ENOMEM; - } else { - bucket->h.perm = _realloc; - } bucket->h.items[newsize-1] = item; @@ -725,11 +713,6 @@ int crush_add_list_bucket_item(struct crush_bucket_list *bucket, int item, int w } else { bucket->h.items = _realloc; } - if ((_realloc = realloc(bucket->h.perm, sizeof(__u32)*newsize)) == NULL) { - return -ENOMEM; - } else { - bucket->h.perm = _realloc; - } if ((_realloc = realloc(bucket->item_weights, sizeof(__u32)*newsize)) == NULL) { return -ENOMEM; } else { @@ -775,17 +758,12 @@ int crush_add_tree_bucket_item(struct crush_bucket_tree *bucket, int item, int w } else { bucket->h.items = _realloc; } - if ((_realloc = realloc(bucket->h.perm, sizeof(__u32)*newsize)) == NULL) { - return -ENOMEM; - } else { - bucket->h.perm = _realloc; - } if ((_realloc = realloc(bucket->node_weights, sizeof(__u32)*bucket->num_nodes)) == NULL) { return -ENOMEM; } else { bucket->node_weights = _realloc; } - + node = crush_calc_tree_node(newsize-1); bucket->node_weights[node] = weight; @@ -824,7 +802,7 @@ int crush_add_straw_bucket_item(struct crush_map *map, int item, int weight) { 
int newsize = bucket->h.size + 1; - + void *_realloc = NULL; if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) { @@ -832,11 +810,6 @@ int crush_add_straw_bucket_item(struct crush_map *map, } else { bucket->h.items = _realloc; } - if ((_realloc = realloc(bucket->h.perm, sizeof(__u32)*newsize)) == NULL) { - return -ENOMEM; - } else { - bucket->h.perm = _realloc; - } if ((_realloc = realloc(bucket->item_weights, sizeof(__u32)*newsize)) == NULL) { return -ENOMEM; } else { @@ -873,11 +846,6 @@ int crush_add_straw2_bucket_item(struct crush_map *map, } else { bucket->h.items = _realloc; } - if ((_realloc = realloc(bucket->h.perm, sizeof(__u32)*newsize)) == NULL) { - return -ENOMEM; - } else { - bucket->h.perm = _realloc; - } if ((_realloc = realloc(bucket->item_weights, sizeof(__u32)*newsize)) == NULL) { return -ENOMEM; } else { @@ -899,9 +867,6 @@ int crush_add_straw2_bucket_item(struct crush_map *map, int crush_bucket_add_item(struct crush_map *map, struct crush_bucket *b, int item, int weight) { - /* invalidate perm cache */ - b->perm_n = 0; - switch (b->alg) { case CRUSH_BUCKET_UNIFORM: return crush_add_uniform_bucket_item((struct crush_bucket_uniform *)b, item, weight); @@ -945,11 +910,6 @@ int crush_remove_uniform_bucket_item(struct crush_bucket_uniform *bucket, int it } else { bucket->h.items = _realloc; } - if ((_realloc = realloc(bucket->h.perm, sizeof(__u32)*newsize)) == NULL) { - return -ENOMEM; - } else { - bucket->h.perm = _realloc; - } return 0; } @@ -984,11 +944,6 @@ int crush_remove_list_bucket_item(struct crush_bucket_list *bucket, int item) } else { bucket->h.items = _realloc; } - if ((_realloc = realloc(bucket->h.perm, sizeof(__u32)*newsize)) == NULL) { - return -ENOMEM; - } else { - bucket->h.perm = _realloc; - } if ((_realloc = realloc(bucket->item_weights, sizeof(__u32)*newsize)) == NULL) { return -ENOMEM; } else { @@ -1053,11 +1008,6 @@ int crush_remove_tree_bucket_item(struct crush_bucket_tree *bucket, int item) } else { 
bucket->h.items = _realloc; } - if ((_realloc = realloc(bucket->h.perm, sizeof(__u32)*newsize)) == NULL) { - return -ENOMEM; - } else { - bucket->h.perm = _realloc; - } olddepth = calc_depth(bucket->h.size); newdepth = calc_depth(newsize); @@ -1106,11 +1056,6 @@ int crush_remove_straw_bucket_item(struct crush_map *map, } else { bucket->h.items = _realloc; } - if ((_realloc = realloc(bucket->h.perm, sizeof(__u32)*newsize)) == NULL) { - return -ENOMEM; - } else { - bucket->h.perm = _realloc; - } if ((_realloc = realloc(bucket->item_weights, sizeof(__u32)*newsize)) == NULL) { return -ENOMEM; } else { @@ -1155,11 +1100,6 @@ int crush_remove_straw2_bucket_item(struct crush_map *map, } else { bucket->h.items = _realloc; } - if ((_realloc = realloc(bucket->h.perm, sizeof(__u32)*newsize)) == NULL) { - return -ENOMEM; - } else { - bucket->h.perm = _realloc; - } if ((_realloc = realloc(bucket->item_weights, sizeof(__u32)*newsize)) == NULL) { return -ENOMEM; } else { @@ -1171,9 +1111,6 @@ int crush_remove_straw2_bucket_item(struct crush_map *map, int crush_bucket_remove_item(struct crush_map *map, struct crush_bucket *b, int item) { - /* invalidate perm cache */ - b->perm_n = 0; - switch (b->alg) { case CRUSH_BUCKET_UNIFORM: return crush_remove_uniform_bucket_item((struct crush_bucket_uniform *)b, item); diff --git a/src/crush/crush.c b/src/crush/crush.c index 80d7c3a97cb8..5bf94c04f645 100644 --- a/src/crush/crush.c +++ b/src/crush/crush.c @@ -45,7 +45,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) { - kfree(b->h.perm); kfree(b->h.items); kfree(b); } @@ -54,14 +53,12 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b) { kfree(b->item_weights); kfree(b->sum_weights); - kfree(b->h.perm); kfree(b->h.items); kfree(b); } void crush_destroy_bucket_tree(struct crush_bucket_tree *b) { - kfree(b->h.perm); kfree(b->h.items); kfree(b->node_weights); kfree(b); @@ -71,7 +68,6 @@ void 
crush_destroy_bucket_straw(struct crush_bucket_straw *b) { kfree(b->straws); kfree(b->item_weights); - kfree(b->h.perm); kfree(b->h.items); kfree(b); } @@ -79,7 +75,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b) void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b) { kfree(b->item_weights); - kfree(b->h.perm); kfree(b->h.items); kfree(b); } diff --git a/src/crush/crush.h b/src/crush/crush.h index be8f12b8f195..d2c235af690f 100644 --- a/src/crush/crush.h +++ b/src/crush/crush.h @@ -135,13 +135,6 @@ struct crush_bucket { __u32 size; /* num items */ __s32 *items; - /* - * cached random permutation: used for uniform bucket and for - * the linear search fallback for the other bucket types. - */ - __u32 perm_x; /* @x for which *perm is defined */ - __u32 perm_n; /* num elements of *perm that are permuted/defined */ - __u32 *perm; }; struct crush_bucket_uniform { @@ -211,6 +204,19 @@ struct crush_map { * device fails. */ __u8 chooseleaf_stable; + /* This value is calculated after decode or construction by + the builder. It is exposed here (rather than having a + 'build CRUSH working space' function) so that callers can + reserve a static buffer, allocate space on the stack, or + otherwise avoid calling into the heap allocator if they + want to. The size of the working space depends on the map, + while the size of the scratch vector passed to the mapper + depends on the size of the desired result set. + + Nothing stops the caller from allocating both in one swell + foop and passing in two points, though. */ + size_t working_size; + #ifndef __KERNEL__ /* * version 0 (original) of straw_calc has various flaws. version 1 @@ -248,4 +254,26 @@ static inline int crush_calc_tree_node(int i) return ((i+1) << 1)-1; } +/* --------------------------------------------------------------------- + Private + --------------------------------------------------------------------- */ + +/* These data structures are private to the CRUSH implementation. 
They + are exposed in this header file because builder needs their + definitions to calculate the total working size. + + Moving this out of the crush map allows us to treat the CRUSH map as + immutable within the mapper and removes the requirement for a CRUSH + map lock. */ + +struct crush_work_bucket { + __u32 perm_x; /* @x for which *perm is defined */ + __u32 perm_n; /* num elements of *perm that are permuted/defined */ + __u32 *perm; /* Permutation of the bucket's items */ +}; + +struct crush_work { + struct crush_work_bucket **work; /* Per-bucket working store */ +}; + #endif diff --git a/src/crush/mapper.c b/src/crush/mapper.c index d565a67b149b..4d37e0e9829f 100644 --- a/src/crush/mapper.c +++ b/src/crush/mapper.c @@ -52,7 +52,6 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size return -1; } - /* * bucket choose methods * @@ -70,59 +69,60 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size * Since this is expensive, we optimize for the r=0 case, which * captures the vast majority of calls. 
*/ -static int bucket_perm_choose(struct crush_bucket *bucket, +static int bucket_perm_choose(const struct crush_bucket *bucket, + struct crush_work_bucket *work, int x, int r) { unsigned int pr = r % bucket->size; unsigned int i, s; /* start a new permutation if @x has changed */ - if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) { + if (work->perm_x != (__u32)x || work->perm_n == 0) { dprintk("bucket %d new x=%d\n", bucket->id, x); - bucket->perm_x = x; + work->perm_x = x; /* optimize common r=0 case */ if (pr == 0) { s = crush_hash32_3(bucket->hash, x, bucket->id, 0) % bucket->size; - bucket->perm[0] = s; - bucket->perm_n = 0xffff; /* magic value, see below */ + work->perm[0] = s; + work->perm_n = 0xffff; /* magic value, see below */ goto out; } for (i = 0; i < bucket->size; i++) - bucket->perm[i] = i; - bucket->perm_n = 0; - } else if (bucket->perm_n == 0xffff) { + work->perm[i] = i; + work->perm_n = 0; + } else if (work->perm_n == 0xffff) { /* clean up after the r=0 case above */ for (i = 1; i < bucket->size; i++) - bucket->perm[i] = i; - bucket->perm[bucket->perm[0]] = 0; - bucket->perm_n = 1; + work->perm[i] = i; + work->perm[work->perm[0]] = 0; + work->perm_n = 1; } /* calculate permutation up to pr */ - for (i = 0; i < bucket->perm_n; i++) + for (i = 0; i < work->perm_n; i++) dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]); - while (bucket->perm_n <= pr) { - unsigned int p = bucket->perm_n; + while (work->perm_n <= pr) { + unsigned int p = work->perm_n; /* no point in swapping the final entry */ if (p < bucket->size - 1) { i = crush_hash32_3(bucket->hash, x, bucket->id, p) % (bucket->size - p); if (i) { - unsigned int t = bucket->perm[p + i]; - bucket->perm[p + i] = bucket->perm[p]; - bucket->perm[p] = t; + unsigned int t = work->perm[p + i]; + work->perm[p + i] = work->perm[p]; + work->perm[p] = t; } dprintk(" perm_choose swap %d with %d\n", p, p+i); } - bucket->perm_n++; + work->perm_n++; } for (i = 0; i < bucket->size; i++) dprintk(" 
perm_choose %d: %d\n", i, bucket->perm[i]); - s = bucket->perm[pr]; + s = work->perm[pr]; out: dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id, bucket->size, x, r, pr, s); @@ -130,14 +130,14 @@ out: } /* uniform */ -static int bucket_uniform_choose(struct crush_bucket_uniform *bucket, - int x, int r) +static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket, + struct crush_work_bucket *work, int x, int r) { - return bucket_perm_choose(&bucket->h, x, r); + return bucket_perm_choose(&bucket->h, work, x, r); } /* list */ -static int bucket_list_choose(struct crush_bucket_list *bucket, +static int bucket_list_choose(const struct crush_bucket_list *bucket, int x, int r) { int i; @@ -153,8 +153,9 @@ static int bucket_list_choose(struct crush_bucket_list *bucket, w *= bucket->sum_weights[i]; w = w >> 16; /*dprintk(" scaled %llx\n", w);*/ - if (w < bucket->item_weights[i]) + if (w < bucket->item_weights[i]) { return bucket->h.items[i]; + } } dprintk("bad list sums for bucket %d\n", bucket->h.id); @@ -190,7 +191,7 @@ static int terminal(int x) return x & 1; } -static int bucket_tree_choose(struct crush_bucket_tree *bucket, +static int bucket_tree_choose(const struct crush_bucket_tree *bucket, int x, int r) { int n; @@ -222,7 +223,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket, /* straw */ -static int bucket_straw_choose(struct crush_bucket_straw *bucket, +static int bucket_straw_choose(const struct crush_bucket_straw *bucket, int x, int r) { __u32 i; @@ -255,7 +256,7 @@ static __u64 crush_ln(unsigned int xin) iexpon = 15; // figure out number of bits we need to shift and - // do it in one step instead of iteratively + // do it in one step instead of iteratively if (!(x & 0x18000)) { int bits = __builtin_clz(x & 0x1FFFF) - 16; x <<= bits; @@ -297,7 +298,7 @@ static __u64 crush_ln(unsigned int xin) * */ -static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket, +static int bucket_straw2_choose(const struct 
crush_bucket_straw2 *bucket, int x, int r) { unsigned int i, high = 0; @@ -340,37 +341,42 @@ static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket, high_draw = draw; } } + return bucket->h.items[high]; } -static int crush_bucket_choose(struct crush_bucket *in, int x, int r) +static int crush_bucket_choose(const struct crush_bucket *in, + struct crush_work_bucket *work, + int x, int r) { dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); BUG_ON(in->size == 0); switch (in->alg) { case CRUSH_BUCKET_UNIFORM: - return bucket_uniform_choose((struct crush_bucket_uniform *)in, - x, r); + return bucket_uniform_choose( + (const struct crush_bucket_uniform *)in, + work, x, r); case CRUSH_BUCKET_LIST: - return bucket_list_choose((struct crush_bucket_list *)in, + return bucket_list_choose((const struct crush_bucket_list *)in, x, r); case CRUSH_BUCKET_TREE: - return bucket_tree_choose((struct crush_bucket_tree *)in, + return bucket_tree_choose((const struct crush_bucket_tree *)in, x, r); case CRUSH_BUCKET_STRAW: - return bucket_straw_choose((struct crush_bucket_straw *)in, - x, r); + return bucket_straw_choose( + (const struct crush_bucket_straw *)in, + x, r); case CRUSH_BUCKET_STRAW2: - return bucket_straw2_choose((struct crush_bucket_straw2 *)in, - x, r); + return bucket_straw2_choose( + (const struct crush_bucket_straw2 *)in, + x, r); default: dprintk("unknown bucket %d alg %d\n", in->id, in->alg); return in->items[0]; } } - /* * true if device is marked "out" (failed, fully offloaded) * of the cluster @@ -412,7 +418,8 @@ static int is_out(const struct crush_map *map, * @parent_r: r value passed from the parent */ static int crush_choose_firstn(const struct crush_map *map, - struct crush_bucket *bucket, + struct crush_work *work, + const struct crush_bucket *bucket, const __u32 *weight, int weight_max, int x, int numrep, int type, int *out, int outpos, @@ -430,7 +437,7 @@ static int crush_choose_firstn(const struct crush_map *map, int rep; unsigned int 
ftotal, flocal; int retry_descent, retry_bucket, skip_rep; - struct crush_bucket *in = bucket; + const struct crush_bucket *in = bucket; int r; int i; int item = 0; @@ -452,7 +459,7 @@ parent_r %d stable %d\n", skip_rep = 0; do { retry_descent = 0; - in = bucket; /* initial bucket */ + in = bucket; /* initial bucket */ /* choose through intervening buckets */ flocal = 0; @@ -471,9 +478,13 @@ parent_r %d stable %d\n", if (local_fallback_retries > 0 && flocal >= (in->size>>1) && flocal > local_fallback_retries) - item = bucket_perm_choose(in, x, r); + item = bucket_perm_choose( + in, work->work[-1-in->id], + x, r); else - item = crush_bucket_choose(in, x, r); + item = crush_bucket_choose( + in, work->work[-1-in->id], + x, r); if (item >= map->max_devices) { dprintk(" bad item %d\n", item); skip_rep = 1; @@ -516,25 +527,27 @@ parent_r %d stable %d\n", sub_r = r >> (vary_r-1); else sub_r = 0; - if (crush_choose_firstn(map, - map->buckets[-1-item], - weight, weight_max, - x, stable ? 1 : outpos+1, 0, - out2, outpos, count, - recurse_tries, 0, - local_retries, - local_fallback_retries, - 0, - vary_r, - stable, - NULL, - sub_r) <= outpos) + if (crush_choose_firstn( + map, + work, + map->buckets[-1-item], + weight, weight_max, + x, stable ? 1 : outpos+1, 0, + out2, outpos, count, + recurse_tries, 0, + local_retries, + local_fallback_retries, + 0, + vary_r, + stable, + NULL, + sub_r) <= outpos) /* didn't get leaf */ reject = 1; } else { /* we already have a leaf! 
*/ out2[outpos] = item; - } + } } if (!reject) { @@ -598,7 +611,8 @@ reject: * */ static void crush_choose_indep(const struct crush_map *map, - struct crush_bucket *bucket, + struct crush_work *work, + const struct crush_bucket *bucket, const __u32 *weight, int weight_max, int x, int left, int numrep, int type, int *out, int outpos, @@ -608,7 +622,7 @@ static void crush_choose_indep(const struct crush_map *map, int *out2, int parent_r) { - struct crush_bucket *in = bucket; + const struct crush_bucket *in = bucket; int endpos = outpos + left; int rep; unsigned int ftotal; @@ -676,7 +690,9 @@ static void crush_choose_indep(const struct crush_map *map, break; } - item = crush_bucket_choose(in, x, r); + item = crush_bucket_choose( + in, work->work[-1-in->id], + x, r); if (item >= map->max_devices) { dprintk(" bad item %d\n", item); out[rep] = CRUSH_ITEM_NONE; @@ -722,13 +738,15 @@ static void crush_choose_indep(const struct crush_map *map, if (recurse_to_leaf) { if (item < 0) { - crush_choose_indep(map, - map->buckets[-1-item], - weight, weight_max, - x, 1, numrep, 0, - out2, rep, - recurse_tries, 0, - 0, NULL, r); + crush_choose_indep( + map, + work, + map->buckets[-1-item], + weight, weight_max, + x, 1, numrep, 0, + out2, rep, + recurse_tries, 0, + 0, NULL, r); if (out2[rep] == CRUSH_ITEM_NONE) { /* placed nothing; no leaf */ break; @@ -779,6 +797,50 @@ static void crush_choose_indep(const struct crush_map *map, #endif } + +/* This takes a chunk of memory and sets it up to be a shiny new + working area for a CRUSH placement computation. It must be called + on any newly allocated memory before passing it in to + crush_do_rule. It may be used repeatedly after that, so long as the + map has not changed. If the map /has/ changed, you must make sure + the working size is no smaller than what was allocated and re-run + crush_init_workspace. + + If you do retain the working space between calls to crush, make it + thread-local. 
If you reinstitute the locking I've spent so much + time getting rid of, I will be very unhappy with you. */ + +void crush_init_workspace(const struct crush_map *m, void *v) { + /* We work by moving through the available space and setting + values and pointers as we go. + + It's a bit like Forth's use of the 'allot' word since we + set the pointer first and then reserve the space for it to + point to by incrementing the point. */ + struct crush_work *w = (struct crush_work *)v; + char *point = (char *)v; + __s32 b; + point += sizeof(struct crush_work *); + w->work = (struct crush_work_bucket **)point; + point += m->max_buckets * sizeof(struct crush_work_bucket *); + for (b = 0; b < m->max_buckets; ++b) { + if (m->buckets[b] == 0) + continue; + + w->work[b] = (struct crush_work_bucket *) point; + switch (m->buckets[b]->alg) { + default: + point += sizeof(struct crush_work_bucket); + break; + } + w->work[b]->perm_x = 0; + w->work[b]->perm_n = 0; + w->work[b]->perm = (__u32 *)point; + point += m->buckets[b]->size * sizeof(__u32); + } + BUG_ON((char *)point - (char *)w != m->working_size); +} + /** * crush_do_rule - calculate a mapping with the given input and rule * @map: the crush_map @@ -788,14 +850,16 @@ static void crush_choose_indep(const struct crush_map *map, * @result_max: maximum result size * @weight: weight vector (for map leaves) * @weight_max: size of weight vector + * @cwin: Pointer to at least map->working_size bytes of memory or NULL. 
* @scratch: scratch vector for private use; must be >= 3 * result_max */ int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, const __u32 *weight, int weight_max, - int *scratch) + void *cwin, int *scratch) { int result_len; + struct crush_work *cw = cwin; int *a = scratch; int *b = scratch + result_max; int *c = scratch + result_max*2; @@ -805,7 +869,7 @@ int crush_do_rule(const struct crush_map *map, int *o; int osize; int *tmp; - struct crush_rule *rule; + const struct crush_rule *rule; __u32 step; int i, j; int numrep; @@ -836,9 +900,10 @@ int crush_do_rule(const struct crush_map *map, w = a; o = b; + for (step = 0; step < rule->len; step++) { int firstn = 0; - struct crush_rule_step *curstep = &rule->steps[step]; + const struct crush_rule_step *curstep = &rule->steps[step]; switch (curstep->op) { case CRUSH_RULE_TAKE: @@ -934,6 +999,7 @@ int crush_do_rule(const struct crush_map *map, recurse_tries = choose_tries; osize += crush_choose_firstn( map, + cw, map->buckets[bno], weight, weight_max, x, numrep, @@ -954,6 +1020,7 @@ int crush_do_rule(const struct crush_map *map, numrep : (result_max-osize)); crush_choose_indep( map, + cw, map->buckets[bno], weight, weight_max, x, out_size, numrep, @@ -995,5 +1062,6 @@ int crush_do_rule(const struct crush_map *map, break; } } + return result_len; } diff --git a/src/crush/mapper.h b/src/crush/mapper.h index 5dfd5b1125d2..0b0f05e0cdc6 100644 --- a/src/crush/mapper.h +++ b/src/crush/mapper.h @@ -15,6 +15,8 @@ extern int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, const __u32 *weights, int weight_max, - int *scratch); + void *cwin, int *scratch); + +extern void crush_init_workspace(const struct crush_map *m, void *v); #endif diff --git a/src/test/crush/crush.cc b/src/test/crush/crush.cc index c46fa87ab540..6d659ae19b1c 100644 --- a/src/test/crush/crush.cc +++ b/src/test/crush/crush.cc @@ -68,6 +68,8 @@ CrushWrapper 
*build_indep_map(CephContext *cct, int num_rack, int num_host, assert(ret == 0); c->set_rule_name(ruleno, "data"); + c->finalize(); + if (false) { Formatter *f = Formatter::create("json-pretty"); f->open_object_section("crush_map"); @@ -291,6 +293,8 @@ TEST(CRUSH, straw_zero) { "firstn", pg_pool_t::TYPE_REPLICATED); EXPECT_EQ(1, ruleset1); + c->finalize(); + vector reweight(n, 0x10000); for (int i=0; i<10000; ++i) { vector out0, out1; @@ -382,6 +386,8 @@ TEST(CRUSH, straw_same) { jf.flush(cout); } + c->finalize(); + vector sum0(n, 0), sum1(n, 0); vector reweight(n, 0x10000); int different = 0; @@ -451,6 +457,8 @@ double calc_straw2_stddev(int *weights, int n, bool verbose) totalweight /= (double)0x10000; double avgweight = totalweight / n; + c->finalize(); + int total = 1000000; for (int i=0; i out; @@ -591,6 +599,8 @@ TEST(CRUSH, straw2_reweight) { totalweight /= (double)0x10000; double avgweight = totalweight / n; + c->finalize(); + int total = 1000000; for (int i=0; i out0, out1; diff --git a/src/test/erasure-code/TestErasureCodeIsa.cc b/src/test/erasure-code/TestErasureCodeIsa.cc index 2b794ce6c7a4..382d789a360e 100644 --- a/src/test/erasure-code/TestErasureCodeIsa.cc +++ b/src/test/erasure-code/TestErasureCodeIsa.cc @@ -905,6 +905,8 @@ TEST_F(IsaErasureCodeTest, create_ruleset) } } + c->finalize(); + { stringstream ss; ErasureCodeIsaDefault isa(tcache); diff --git a/src/test/erasure-code/TestErasureCodeJerasure.cc b/src/test/erasure-code/TestErasureCodeJerasure.cc index c8f0e37da872..01c27d95439c 100644 --- a/src/test/erasure-code/TestErasureCodeJerasure.cc +++ b/src/test/erasure-code/TestErasureCodeJerasure.cc @@ -307,6 +307,8 @@ TEST(ErasureCodeTest, create_ruleset) } } + c->finalize(); + { stringstream ss; ErasureCodeJerasureReedSolomonVandermonde jerasure; diff --git a/src/test/erasure-code/TestErasureCodeLrc.cc b/src/test/erasure-code/TestErasureCodeLrc.cc index 758fd8e9e192..5d03467805cf 100644 --- a/src/test/erasure-code/TestErasureCodeLrc.cc +++ 
b/src/test/erasure-code/TestErasureCodeLrc.cc @@ -130,6 +130,8 @@ TEST(ErasureCodeTest, create_ruleset) } } + c->finalize(); + ErasureCodeLrc lrc(g_conf->erasure_code_dir); EXPECT_EQ(0, lrc.create_ruleset("rule1", *c, &cerr));