From bbfa5bac97e624e042ba67bab4d0b8bcafefaa0c Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 16 Nov 2023 20:22:19 -0800 Subject: [PATCH] crush/mapper: add support for MSR types Signed-off-by: Samuel Just --- src/crush/mapper.c | 1087 +++++++++++++++++++++++++++++++++++++++++--- src/crush/mapper.h | 31 +- 2 files changed, 1050 insertions(+), 68 deletions(-) diff --git a/src/crush/mapper.c b/src/crush/mapper.c index 736cc6162c9..021777ef0b2 100644 --- a/src/crush/mapper.c +++ b/src/crush/mapper.c @@ -27,6 +27,9 @@ #define dprintk(args...) /* printf(args) */ +#define MIN(x, y) ((x) > (y) ? (y) : (x)) +#define MAX(y, x) ((x) < (y) ? (y) : (x)) + /* * Implement the core CRUSH mapping algorithm. */ @@ -820,65 +823,11 @@ static void crush_choose_indep(const struct crush_map *map, #endif } - -/* This takes a chunk of memory and sets it up to be a shiny new - working area for a CRUSH placement computation. It must be called - on any newly allocated memory before passing it in to - crush_do_rule. It may be used repeatedly after that, so long as the - map has not changed. If the map /has/ changed, you must make sure - the working size is no smaller than what was allocated and re-run - crush_init_workspace. - - If you do retain the working space between calls to crush, make it - thread-local. If you reinstitute the locking I've spent so much - time getting rid of, I will be very unhappy with you. */ - -void crush_init_workspace(const struct crush_map *m, void *v) { - /* We work by moving through the available space and setting - values and pointers as we go. - - It's a bit like Forth's use of the 'allot' word since we - set the pointer first and then reserve the space for it to - point to by incrementing the point. */ - struct crush_work *w = (struct crush_work *)v; - char *point = (char *)v; - __s32 b; - point += sizeof(struct crush_work); - w->work = (struct crush_work_bucket **)point; - point += m->max_buckets * sizeof(struct crush_work_bucket *); - for (b = 0; b < m->max_buckets; ++b) { - if (m->buckets[b] == 0) - continue; - - w->work[b] = (struct crush_work_bucket *) point; - switch (m->buckets[b]->alg) { - default: - point += sizeof(struct crush_work_bucket); - break; - } - w->work[b]->perm_x = 0; - w->work[b]->perm_n = 0; - w->work[b]->perm = (__u32 *)point; - point += m->buckets[b]->size * sizeof(__u32); - } - BUG_ON((char *)point - (char *)w != m->working_size); -} - -/** - * crush_do_rule - calculate a mapping with the given input and rule - * @map: the crush_map - * @ruleno: the rule id - * @x: hash input - * @result: pointer to result vector - * @result_max: maximum result size - * @weight: weight vector (for map leaves) - * @weight_max: size of weight vector - * @cwin: Pointer to at least map->working_size bytes of memory or NULL. - */ -int crush_do_rule(const struct crush_map *map, - int ruleno, int x, int *result, int result_max, - const __u32 *weight, int weight_max, - void *cwin, const struct crush_choose_arg *choose_args) +static int crush_do_rule_no_retry( + const struct crush_map *map, + int ruleno, int x, int *result, int result_max, + const __u32 *weight, int weight_max, + void *cwin, const struct crush_choose_arg *choose_args) { int result_len; struct crush_work *cw = cwin; @@ -1081,3 +1030,1023 @@ int crush_do_rule(const struct crush_map *map, return result_len; } + +/// invariant through crush_msr_do_rule invocation +struct crush_msr_input { + const struct crush_map *map; + const struct crush_rule *rule; + + const unsigned result_max; + + const unsigned weight_len; + const __u32 *weights; + + const int map_input; + const struct crush_choose_arg *choose_args; + + const unsigned msr_descents; + const unsigned msr_collision_tries; +}; + +/// encapsulates work space, invariant within an EMIT block +struct crush_msr_workspace { + const unsigned start_stepno; + const unsigned end_stepno; + + const unsigned result_len; + + const struct crush_work *crush_work; + + // step_vecs has shape int[end_stepno - start_stepno][result_len] + int **step_vecs; +}; + +/** + * crush_msr_output + * + * Encapsulates output space. Successive results through a crush_msr_do_rule + * invocation are placed into *out. + */ +struct crush_msr_output { + const unsigned result_len; + unsigned returned_so_far; + int *out; +}; + +/** + * crush_msr_scan_config_steps + * + * Scans possibly empty sequence of CRUSH_RULE_SET_MSR_*_TRIES + * steps at the start of the rule. Returns index of next step. + * Populates *msr_descents and *msr_collision_tries (if non-null) with + * last matching rule. + * @steps: steps to scan + * @step_len: length of steps + * @msr_descents: out param for CRUSH_RULE_SET_MSR_DESCENTS + * @msr_collision_tries: out param for CRUSH_RULE_SET_MSR_COLLISION_TRIES + */ +static unsigned crush_msr_scan_config_steps( + const struct crush_rule_step *steps, + unsigned step_len, + unsigned *msr_descents, + unsigned *msr_collision_tries) +{ + unsigned stepno = 0; + for (; stepno < step_len; ++stepno) { + const struct crush_rule_step *step = &steps[stepno]; + switch (step->op) { + case CRUSH_RULE_SET_MSR_DESCENTS: + if (msr_descents) *msr_descents = step->arg1; + break; + case CRUSH_RULE_SET_MSR_COLLISION_TRIES: + if (msr_collision_tries) *msr_collision_tries = step->arg1; + break; + default: + return stepno; + } + } + return stepno; +} + +/// clear workspace represented by *ws +static void crush_msr_clear_workspace( + struct crush_msr_workspace *ws) +{ + for (unsigned stepno = ws->start_stepno; stepno < ws->end_stepno; + ++stepno) { + for (unsigned i = 0; i < ws->result_len; ++i) { + ws->step_vecs[stepno - ws->start_stepno][i] = + CRUSH_ITEM_UNDEF; + } + } +} + +/** + * crush_msr_scan_next + * + * Validates an EMIT block of the form (TAKE CHOOSE_MSR* EMIT) + * If sequence is valid, populates total_children with the width + * of the mapping from the choose steps and next_emit with the + * index of the next EMIT step. + * + * @rule: rule to scan + * @result_max: max number of results to return + * @stepno: points at step at which to begin scanning, must be CRUSH_RULE_TAKE + * @total_children: output param for total fanout of EMIT block + * @next_emit: output param for ending EMIT step + * @return 0 if valid, -1 if there were validation errors + */ +static int crush_msr_scan_next( + const struct crush_rule *rule, + unsigned result_max, + unsigned stepno, + unsigned *total_children, + unsigned *next_emit) +{ + if (stepno + 1 >= rule->len) { + dprintk("stepno too large\n"); + return -1; + } + if (rule->steps[stepno].op != CRUSH_RULE_TAKE) { + dprintk("first rule not CRUSH_RULE_TAKE\n"); + return -1; + } + ++stepno; + + if (total_children) *total_children = 1; + for (; stepno < rule->len; ++stepno) { + const struct crush_rule_step *curstep = + &(rule->steps[stepno]); + if (curstep->op == CRUSH_RULE_EMIT) { + break; + } + if (rule->steps[stepno].op != CRUSH_RULE_CHOOSE_MSR) { + dprintk("found non-choose non-emit step %d\n", stepno); + return -1; + } + if (total_children) { + *total_children *= curstep->arg1 ? curstep->arg1 + : result_max; + } + } + if (stepno >= rule->len) { + dprintk("did not find emit\n"); + return -1; + } + if (next_emit) { + *next_emit = stepno; + } + return 0; +} + +/** + * crush_msr_scan_rule + * + * MSR rules must have the form: + * 1) Possibly empty sequence of CRUSH_RULE_SET_MSR_.*_TRIES steps + * 2) A sequence of EMIT blocks of the form + * (TAKE CHOOSE_MSR* EMIT)* + * + * crush_msr_scan_rule validates that the form obeys the above form and + * populates max_steps with the length of the longest sequence of CHOOSE_MSR + * steps. + * + * crush_msr_scan_rule replicates the scan behavior of crush_msr_do_rule. + * + * @rule: rule to scan + * @result_max: max number of results to return + * @max_steps: length of longest string of choosemsr steps + * @return 0 if valid, -1 otherwise + */ +static int crush_msr_scan_rule( + const struct crush_rule *rule, + unsigned result_max, + unsigned *max_steps) +{ + if (max_steps) *max_steps = 0; + unsigned next_stepno = crush_msr_scan_config_steps( + rule->steps, + rule->len, + NULL, NULL); + while (next_stepno < rule->len) { + unsigned next_emit_stepno; + int r = crush_msr_scan_next( + rule, result_max, next_stepno, + NULL, &next_emit_stepno); + if (r < 0) return r; + + if (max_steps) { + *max_steps = MAX( + *max_steps, + next_emit_stepno - (next_stepno + 1)); + } + next_stepno = next_emit_stepno + 1; + } + return 0; +} + +/// Returns true if all leaf slots in [start, end) are mapped +static int crush_msr_leaf_vec_populated( + const struct crush_msr_workspace *workspace, + const unsigned start, const unsigned end) +{ + BUG_ON(start >= end); + BUG_ON(end > workspace->result_len); + BUG_ON(workspace->end_stepno <= workspace->start_stepno); + // we check the last step vector here because output + // won't be ordered by index for FIRSTN rules + int *leaf_vec = workspace->step_vecs[ + workspace->end_stepno - workspace->start_stepno - 1]; + for (unsigned i = start; i < end; ++i) { + if (leaf_vec[i] == CRUSH_ITEM_UNDEF) { + return 0; + } + } + return 1; +} + +/// Returns try value to pass to crush based on index, tries, and local_tries +static unsigned crush_msr_get_retry_value( + const unsigned result_max, + const unsigned index, + const unsigned msr_descents, + const unsigned msr_collision_tries) +{ + const unsigned total_index = (msr_descents * result_max) + index; + return (total_index << 16) + msr_collision_tries; +} + +/** + * crush_msr_descend + * + * Descend recursively from bucket until we either hit a leaf or an + * interior node of type type. + * @input: crush input information + * @workspace: struct with working space + * @bucket: bucket from which to descend + * @type: target node type + * @tryno: top level try number, incremented with each call into crush_msr_choose + * from crush_msr_do_rule + * @local_tryno: local collision try number, incremented with each call into + * crush_msr_descend from crush_msr_choose after collision + * @index: mapping index + */ +static int crush_msr_descend( + const struct crush_msr_input *input, + const struct crush_msr_workspace *workspace, + const struct crush_bucket *bucket, + const int type, + const unsigned tryno, + const unsigned local_tryno, + const unsigned index) +{ + dprintk(" crush_msr_descend type %d tryno %d local_tryno %d index %d\n", + type, tryno, local_tryno, index); + while (1) { + const int child_bucket_candidate = crush_bucket_choose( + bucket, + workspace->crush_work->work[-1 - bucket->id], + input->map_input, + crush_msr_get_retry_value( + input->result_max, + index, tryno, local_tryno), + (input->choose_args ? + &(input->choose_args[-1 - bucket->id]) : 0), + index); + + if (child_bucket_candidate >= 0) { + return child_bucket_candidate; + } + + bucket = input->map->buckets[-1 - child_bucket_candidate]; + if (bucket->type == type) { + return child_bucket_candidate; + } + } +} + +/** + * crush_msr_valid_candidate + * + * Checks whether candidate is a valid choice given buckets already + * mapped for step stepno. + * + * If candidate has already been mapped for a position in + * [include_start, include_end), candidate is valid. + * + * Else, if candidate has already been mapped for a position in + * [exclude_start, exclude_end), candidate is invalid. + * + * Otherwise, candidate is valid. + * + * @stepno: step to check + * @exclude_start: start of exclusion range + * @exclude_end: end of exlusion range + * @include_start: start of inclusion range + * @include_end: end of inclusion range + * @candidate: bucket to check + * + * Note, [exclude_start, exclude_end) must contain [include_start, include_end). + */ +static int crush_msr_valid_candidate( + const struct crush_msr_workspace *workspace, + unsigned stepno, + unsigned exclude_start, + unsigned exclude_end, + unsigned include_start, + unsigned include_end, + int candidate) +{ + BUG_ON(stepno >= workspace->end_stepno); + BUG_ON(stepno < workspace->start_stepno); + + BUG_ON(exclude_end <= exclude_start); + BUG_ON(include_end <= include_start); + + BUG_ON(exclude_start > include_start); + BUG_ON(exclude_end < include_end); + + BUG_ON(exclude_end > workspace->result_len); + + int *vec = workspace->step_vecs[stepno - workspace->start_stepno]; + for (unsigned i = exclude_start; i < exclude_end; ++i) { + if (vec[i] == candidate) { + if (i >= include_start && i < include_end) { + dprintk(" crush_msr_valid_candidate: " + "candidate %d already chosen for " + "stride\n", + candidate); + return 1; + } else { + dprintk(" crush_msr_valid_candidate: " + "candidate %d collision\n", + candidate); + return 0; + } + } + } + dprintk(" crush_msr_valid_candidate: candidate %d no collision\n", + candidate); + return 1; +} + +/** + * crush_msr_push_used + * + * See crush_msr_choose for details, used to push bucket indicies onto collision + * set for specified stride. User is responsible for ensuring that + * [stride_start, stride_end) never holds more than stride_end - stride_start + * entries. + * @workspace: holds working space information + * @stepno: index of step + * @stride_start: start of stride + * @stride_end: one past end of stride + * @candidate: element to add to set + * @return 1 if added (not already present), 0 if not added due to already + * being present + */ +static int crush_msr_push_used( + const struct crush_msr_workspace *workspace, + unsigned stepno, + unsigned stride_start, + unsigned stride_end, + int candidate) +{ + BUG_ON(stepno >= workspace->end_stepno); + BUG_ON(stepno < workspace->start_stepno); + + BUG_ON(stride_end <= stride_start); + BUG_ON(stride_end > workspace->result_len); + int *vec = workspace->step_vecs[stepno - workspace->start_stepno]; + for (unsigned i = stride_start; i < stride_end; ++i) { + if (vec[i] == candidate) { + return 0; + } else if (vec[i] == CRUSH_ITEM_UNDEF) { + vec[i] = candidate; + return 1; + } + } + BUG_ON("impossible"); + return 0; +} + +/** + * crush_msr_pop_used + * + * See crush_msr_choose for details, used to pop bucket indicies from collision + * set for specified stride. If an element is to be popped, crush_msr_pop_used + * must be called prior to pushing another element. + * @workspace: holds working space information + * @stepno: index of step + * @stride_start: start of stride + * @stride_end: one past end of stride + * @candidate: element to pop from set + */ +static void crush_msr_pop_used( + const struct crush_msr_workspace *workspace, + unsigned stepno, + unsigned stride_start, + unsigned stride_end, + int candidate) +{ + BUG_ON(stepno >= workspace->end_stepno); + BUG_ON(stepno < workspace->start_stepno); + + BUG_ON(stride_end <= stride_start); + BUG_ON(stride_end > workspace->result_len); + int *vec = workspace->step_vecs[stepno - workspace->start_stepno]; + for (unsigned i = stride_end; i > stride_start;) { + --i; + if (vec[i] != CRUSH_ITEM_UNDEF) { + BUG_ON(vec[i] != candidate); + vec[i] = CRUSH_ITEM_UNDEF; + return; + } + } + BUG_ON(0 == "impossible"); +} + +/** + * crush_msr_emit_result + * + * Outputs mapping result from specified position. Position in output + * buffer depends on rule type -- FIRSTN outputs in output order, INDEP + * outputs into specified position. + * @output: output buffer + * @rule_type: CRUSH_RULE_TYPE_MSR_FIRSTN or CRUSH_RULE_TYPE_MSR_INDEP + * @position: mapping position + * @result: mapping value to output + */ +static void crush_msr_emit_result( + struct crush_msr_output *output, + int rule_type, + unsigned position, + int result) +{ + BUG_ON(position >= output->result_len); + BUG_ON(output->returned_so_far >= output->result_len); + if (rule_type == CRUSH_RULE_TYPE_MSR_FIRSTN) { + BUG_ON(output->out[output->returned_so_far] != CRUSH_ITEM_NONE); + output->out[(output->returned_so_far)++] = result; + } else { + BUG_ON(output->out[position] != CRUSH_ITEM_NONE); + output->out[position] = result; + ++output->returned_so_far; + } + dprintk(" emit: %d, returned_so_far: %d\n", + result, output->returned_so_far); +} + +/** + * crush_msr_choose + * + * Performs mapping for a single EMIT block containing CHOOSE steps + * [current_stepno, end_stepno) into mapping indices [start_index, end_index). + * + * Like chooseleaf, crush_msr_choose is essentially depth-first -- it chooses + * an item and all of the descendents under that item before moving to the + * next item. Each choose step in the block gets its own workspace for + * collision detection. + * + * crush_msr_choose (and its recursive calls) will locally retry any bucket + * selections that produce a collision (up to msr_collision_tries times), but + * won't retry if it hits an out osd -- that's handled by calling back into + * crush_msr_choose up to msr_descents times. + * + * @input: crush input information + * @workspace: working space for this EMIT block + * @output: crush mapping output buffer specification + * @total_children: total number of children implied by the step sequence, may + * be larger than end_index - start_index. + * @start_index: start mapping index + * @end_index: end mapping index + * @current_stepno: first choose step + * @end_stepno: one past last choose step, must be an EMIT + * @tryno: try number, see crush_msr_do_rule + */ +static unsigned crush_msr_choose( + const struct crush_msr_input *input, + const struct crush_msr_workspace *workspace, + struct crush_msr_output *output, + const struct crush_bucket *bucket, + const unsigned total_descendants, + const unsigned start_index, const unsigned end_index, + const unsigned current_stepno, const unsigned end_stepno, + const unsigned tryno) +{ + dprintk("crush_msr_choose: bucket %d, start_index %d, end_index %d\n", + bucket->id, start_index, end_index); + + BUG_ON(current_stepno >= input->rule->len); + const struct crush_rule_step *curstep = + &(input->rule->steps[current_stepno]); + BUG_ON(curstep->op != CRUSH_RULE_CHOOSE_MSR); + + /* This call into crush_msr_choose is responsible, ultimately, for + * populating indices [start_index, end_index). We do this by + * dividing that range into a set of strides specified in the + * step -- choosemsr 4 host would dictate that the range be divided + * into 4 strides. + * + * If the full rule is + * + * ... + * step take root + * step choosemsr 4 host + * step choosemsr 2 osd + * step emit + * + * total_descendants for the initial call would be 8 (4*2) with + * num_stride=4 (4 hosts) and stride_length = 2 (2 osds per host). + * For the recursive calls, total_descendants would be 2 (8 / 4), + * stride_length would be 1 and num_strides would be 2. + */ + + // choosemsr 0 host should select result_max hosts + const unsigned num_strides = curstep->arg1 ? curstep->arg1 + : input->result_max; + + // total_descendants is the product of the steps in the block + BUG_ON(total_descendants % num_strides != 0); + const unsigned stride_length = total_descendants / num_strides; + + /* MSR steps like + * + * step choosemsr 4 host + * + * guarantee that the output mapping will be divided into at least + * 4 hosts, not exactly 4 hosts. We achieve this by ensuring that + * the sets of hosts for each stride are disjoint -- a host selected + * for stride 0 will not be used for any other stride. + * + * However, a single stride might end up using more than one host. + * If an OSD on a host is marked out, crush_msr_choose will simply + * skip that index when it hits it. crush_msr_do_rule will then + * call back into crush_msr_choose and eventually find another OSD + * either on the same host or on another one not already used in + * another stride. For this reason, a single stride may need to + * remember up to stride_length entries for collision detection + * purposes. + * + * Unfortunately, we only have stride_length entries to work with + * in workspace. Thus, prior to returning from crush_msr_choose, + * we remove entries that didn't actually result in a mapping. We + * use the following undo vector to achieve this -- any strides that + * didn't result in a successful mapping are set in undo to be undone + * immediately prior to returning. + * + * Why prior to returning and not immediately? Selecting a bucket in + * a stride impacts subsequent choices as they may have collided. In + * order to limit the impact of marking an OSD out, we treat it as + * collidable until the next pass. + */ + int undo[num_strides]; + for (unsigned stride = 0; stride < num_strides; ++stride) { + undo[stride] = CRUSH_ITEM_UNDEF; + } + + dprintk("crush_msr_choose: bucket %d, start_index %d, " + "end_index %d, stride_length %d\n", + bucket->id, start_index, end_index, stride_length); + + unsigned mapped = 0; + unsigned stride_index = 0; + for (unsigned stride_start = start_index; + stride_start < end_index; + stride_start += stride_length, ++stride_index) { + const unsigned stride_end = + MIN(stride_start + stride_length, end_index); + + // all descendants for this stride have been mapped already + if (crush_msr_leaf_vec_populated( + workspace, stride_start, stride_end)) { + continue; + } + + int found = 0; + int child_bucket_candidate; + for (unsigned local_tryno = 0; + local_tryno < input->msr_collision_tries; + ++local_tryno) { + child_bucket_candidate = crush_msr_descend( + input, workspace, bucket, + curstep->arg2, tryno, local_tryno, + stride_index); + + /* candidate is valid if: + * - we already chose it for this stride + * - it hasn't been chosen for any stride */ + if (crush_msr_valid_candidate( + workspace, + current_stepno, + // Collision on elements in [start_index, end_index) + start_index, end_index, + // ...unless in [stride_start, stride_end) + stride_start, stride_end, + child_bucket_candidate)) { + found = 1; + break; + } + } + + /* failed to find non-colliding choice after msr_collision_tries + * attempts */ + if (!found) continue; + + if (curstep->arg2 == 0 /* leaf */) { + if (stride_length != 1 || + (current_stepno + 1 != end_stepno)) { + /* Either condition above implies that there's + * another step after a choosemsr step for the + * leaf type, rule is malformed, bail */ + continue; + } + if (is_out(input->map, input->weights, + input->weight_len, + child_bucket_candidate, input->map_input)) { + dprintk(" crush_msr_choose: item %d out\n", + child_bucket_candidate); + /* crush_msr_do_rule will try again, + * msr_descents permitting */ + continue; + } + // for collision detection + int pushed = crush_msr_push_used( + workspace, current_stepno, stride_start, stride_end, + child_bucket_candidate); + /* stride_length == 1, can't already be there */ + BUG_ON(!pushed); + // final output, ordering depending on input->rule->type + crush_msr_emit_result( + output, input->rule->type, + stride_start, child_bucket_candidate); + mapped++; + } else /* not leaf */ { + if (current_stepno + 1 >= end_stepno) { + /* Type isn't leaf, rule is malformed since there + * isn't another step */ + continue; + } + struct crush_bucket *child_bucket = input->map->buckets[ + -1 - child_bucket_candidate]; + unsigned child_mapped = crush_msr_choose( + input, workspace, output, + child_bucket, + // total_descendants for recursive call + stride_length, + // recursive call populates + // [stride_start, stride_end) + stride_start, stride_end, + // next step + current_stepno + 1, end_stepno, + tryno); + int pushed = crush_msr_push_used( + workspace, + current_stepno, + stride_start, + stride_end, + child_bucket_candidate); + /* pushed may be false if we already chose this bucket + * for this stride. If so, child_mapped must have been + * != 0 at the time, so we still retain it */ + if (pushed && (child_mapped == 0)) { + // no child mapped, and we didn't choose it + // before + undo[stride_index] = child_bucket_candidate; + } else { + mapped += child_mapped; + } + } + } + + // pop unused buckets + stride_index = 0; + for (unsigned stride_start = start_index; + stride_start < end_index; + stride_start += stride_length, ++stride_index) { + if (undo[stride_index] != CRUSH_ITEM_UNDEF) { + unsigned stride_end = + MIN(stride_start + stride_length, end_index); + crush_msr_pop_used( + workspace, + current_stepno, + stride_start, + stride_end, + undo[stride_index]); + } + } + + return mapped; +} + +/** + * crush_msr_do_rule - calculate a mapping with the given input and msr rule + * + * msr_firstn and msr_indep rules are intended to address a limitation of + * conventional crush rules in that they do not retry steps outside of + * a CHOOSELEAF step. In the case of a crush rule like + * + * rule replicated_rule_1 { + * ... + * step take default class hdd + * step chooseleaf firstn 3 type host + * step emit + * } + * + * the chooseleaf step will ensure that if all of the osds on a + * particular host are marked out, mappings including those OSDs would + * end up on another host (provided that there are enough hosts). + * + * However, if the rule used two choose steps instead + * + * rule replicated_rule_1 { + * ... + * step take default class hdd + * step choose firstn 3 type host + * step choose firstn 1 type osd + * step emit + * } + * + * marking an OSD down could cause it to be remapped to another on the same + * host, but not to another host. If all of the OSDs on a host are marked + * down, the PGs will simply be degraded and unable to remap until the host + * is removed from the CRUSH heirarchy or reweighted to 0. + * + * Normally, we can comfortably work around this by using a chooseleaf + * step as in the first example, but there are cases where we want to map + * multiple OSDs to each host (wide EC codes on small clusters, for + * example) which can't be handled with chooseleaf as it currently + * exists. + * + * rule ecpool-86 { + * type msr_indep + * ... + * step choosemsr 4 type host + * step choosemsr 4 type osd + * step emit + * } + * + * With an 8+6 code, this rule can tolerate a host and a single OSD down without + * becoming unavailable on 4 hosts. It relies on ensuring that no more than 4 + * OSDs are mapped to any single host, however, which can't be done with a + * conventional CRUSH rule without the drawback described above. By using + * msr_indep, this rule can deal with an OSD failure by remapping to another + * host. + * + * MSR rules have some structural differences from conventional rules: + * - The rule type determines whether the mapping is FIRSTN or INDEP. Because + * the descent can retry steps, it doesn't really make sense for steps to + * individually specify output order and I'm not really aware of any use cases + * that would benefit from it. + * - MSR rules *must* be structured as a (possibly empty) prefix of config + * steps (CRUSH_RULE_SET_MSR*) followed by a sequence of EMIT blocks + * each comprised of a TAKE step, a sequence of CHOOSE_MSR steps, and + * ended by an EMIT step. + * - MSR choose steps must be choosemsr. choose and chooseleaf are not permitted. + * + * MSR rules also have different requirements for working space. Conventional + * CRUSH requires 3 vectors of size result_max to use for working space -- two + * to alternate as it processes each rule and one, additionally, for chooseleaf. + * MSR rules need N vectors where N is the number of choosemsr in the longest + * EMIT block since it needs to retain all of the choices made as part of each + * descent. + * + * See crush_msr_choose for details. + * + * doc/dev/crush-msr.rst has an overview of the motivation behind CRUSH MSR + * rules and should be kept up to date with any changes to implementation or + * documentation in this file. + * + * @map: the crush_map + * @ruleno: the rule id + * @x: hash input + * @result: pointer to result vector + * @result_max: maximum result size + * @weight: weight vector (for map leaves) + * @weight_max: size of weight vector + * @cwin: Pointer to at least map->working_size bytes of memory or NULL. + */ +static int crush_msr_do_rule( + const struct crush_map *map, + int ruleno, int map_input, int *result, int result_max, + const __u32 *weight, int weight_max, + void *cwin, const struct crush_choose_arg *choose_args) +{ + unsigned msr_descents = map->msr_descents; + unsigned msr_collision_tries = map->msr_collision_tries; + struct crush_rule *rule = map->rules[ruleno]; + unsigned start_stepno = crush_msr_scan_config_steps( + rule->steps, rule->len, + &msr_descents, &msr_collision_tries); + + struct crush_msr_input input = { + .map = map, + .rule = map->rules[ruleno], + .result_max = result_max, + .weight_len = weight_max, + .weights = weight, + .map_input = map_input, + .choose_args = choose_args, + .msr_descents = msr_descents, + .msr_collision_tries = msr_collision_tries + }; + + struct crush_msr_output output = { + .result_len = result_max, + .returned_so_far = 0, + .out = result + }; + for (unsigned i = 0; i < output.result_len; ++i) { + output.out[i] = CRUSH_ITEM_NONE; + } + + unsigned start_index = 0; + while (start_stepno < input.rule->len) { + unsigned emit_stepno, total_children; + if (crush_msr_scan_next( + input.rule, input.result_max, + start_stepno, &total_children, + &emit_stepno) != 0) { + // invalid rule, return whatever we have + dprintk("crush_msr_scan_returned -1\n"); + return 0; + } + + const struct crush_rule_step *take_step = + &(input.rule->steps[start_stepno]); + BUG_ON(take_step->op != CRUSH_RULE_TAKE); + + if (take_step->arg1 >= 0) { + if (start_stepno + 1 != emit_stepno) { + // invalid rule + dprintk("take step specifies osd, but " + "there are subsequent choose steps\n"); + return 0; + } else { + crush_msr_emit_result( + &output, input.rule->type, + start_index, take_step->arg1); + } + } else { + dprintk("start_stepno %d\n", start_stepno); + dprintk("root bucket: %d\n", + input.rule->steps[start_stepno].arg1); + struct crush_bucket *root_bucket = input.map->buckets[ + -1 - input.rule->steps[start_stepno].arg1]; + dprintk( + "root bucket: %d %p\n", + input.rule->steps[start_stepno].arg1, + root_bucket); + + ++start_stepno; + BUG_ON(emit_stepno >= input.rule->len); + BUG_ON(emit_stepno < start_stepno); + BUG_ON(start_stepno >= input.rule->len); + + struct crush_work *cw = cwin; + int *out_vecs[input.rule->len]; + for (unsigned stepno = 0; + stepno < input.rule->len; ++stepno) { + out_vecs[stepno] = + (int*)((char*)cw + map->working_size) + + (stepno * result_max); + } + struct crush_msr_workspace workspace = { + .start_stepno = start_stepno, + .end_stepno = emit_stepno, + .result_len = result_max, + .crush_work = cw, + .step_vecs = out_vecs + }; + crush_msr_clear_workspace(&workspace); + + + unsigned tries_so_far = 0; + unsigned end_index = MIN(start_index + total_children, + input.result_max); + unsigned return_limit_for_block = + output.returned_so_far + (end_index - start_index); + while (tries_so_far < input.msr_descents && + output.returned_so_far < return_limit_for_block) { + crush_msr_choose( + &input, &workspace, &output, + root_bucket, + total_children, + start_index, + end_index, + start_stepno, emit_stepno, + tries_so_far); + dprintk("returned_so_far: %d\n", + output.returned_so_far); + ++tries_so_far; + } + start_index = end_index; + start_stepno = emit_stepno + 1; + } + } + + if (rule->type == CRUSH_RULE_TYPE_MSR_FIRSTN) { + return output.returned_so_far; + } else { + return input.result_max; + } +} + +/// Return 1 if msr, 0 otherwise +static int rule_type_is_msr(int type) +{ + return type == CRUSH_RULE_TYPE_MSR_FIRSTN || + type == CRUSH_RULE_TYPE_MSR_INDEP; +} + +size_t crush_work_size(const struct crush_map *map, + int result_max) +{ + unsigned ruleno; + unsigned out_vecs = 3; /* normal do_rule needs 3 outvecs */ + for (ruleno = 0; ruleno < map->max_rules; ++ruleno) { + const struct crush_rule *rule = map->rules[ruleno]; + if (!rule) continue; + if (!rule_type_is_msr(rule->type)) + continue; + unsigned rule_max_msr_steps; + // we ignore the return value because rule_max_msr_steps will be + // populated with the longest step sequence before hitting + // the error + (void)crush_msr_scan_rule(rule, result_max, &rule_max_msr_steps); + out_vecs = MAX(rule_max_msr_steps, out_vecs); + } + return map->working_size + result_max * out_vecs * sizeof(__u32); +} + +/* This takes a chunk of memory and sets it up to be a shiny new + working area for a CRUSH placement computation. It must be called + on any newly allocated memory before passing it in to + crush_do_rule. It may be used repeatedly after that, so long as the + map has not changed. If the map /has/ changed, you must make sure + the working size is no smaller than what was allocated and re-run + crush_init_workspace. + + If you do retain the working space between calls to crush, make it + thread-local. If you reinstitute the locking I've spent so much + time getting rid of, I will be very unhappy with you. */ + +void crush_init_workspace(const struct crush_map *m, void *v) { + /* We work by moving through the available space and setting + values and pointers as we go. + + It's a bit like Forth's use of the 'allot' word since we + set the pointer first and then reserve the space for it to + point to by incrementing the point. */ + struct crush_work *w = (struct crush_work *)v; + char *point = (char *)v; + __s32 b; + point += sizeof(struct crush_work); + w->work = (struct crush_work_bucket **)point; + point += m->max_buckets * sizeof(struct crush_work_bucket *); + for (b = 0; b < m->max_buckets; ++b) { + if (m->buckets[b] == 0) + continue; + + w->work[b] = (struct crush_work_bucket *) point; + switch (m->buckets[b]->alg) { + default: + point += sizeof(struct crush_work_bucket); + break; + } + w->work[b]->perm_x = 0; + w->work[b]->perm_n = 0; + w->work[b]->perm = (__u32 *)point; + point += m->buckets[b]->size * sizeof(__u32); + } + BUG_ON((char *)point - (char *)w != m->working_size); +} + +/** + * crush_do_rule - calculate a mapping with the given input and rule + * @map: the crush_map + * @ruleno: the rule id + * @x: hash input + * @result: pointer to result vector + * @result_max: maximum result size + * @weight: weight vector (for map leaves) + * @weight_max: size of weight vector + * @cwin: Pointer to at least map->working_size bytes of memory or NULL. + */ +int crush_do_rule(const struct crush_map *map, + int ruleno, int x, int *result, int result_max, + const __u32 *weight, int weight_max, + void *cwin, const struct crush_choose_arg *choose_args) +{ + const struct crush_rule *rule; + + if ((__u32)ruleno >= map->max_rules) { + dprintk(" bad ruleno %d\n", ruleno); + return 0; + } + + rule = map->rules[ruleno]; + if (rule_type_is_msr(rule->type)) { + return crush_msr_do_rule( + map, + ruleno, + x, + result, + result_max, + weight, + weight_max, + cwin, + choose_args); + } else { + return crush_do_rule_no_retry( + map, + ruleno, + x, + result, + result_max, + weight, + weight_max, + cwin, + choose_args); + } +} diff --git a/src/crush/mapper.h b/src/crush/mapper.h index 0ec927d9e61..840449620fb 100644 --- a/src/crush/mapper.h +++ b/src/crush/mapper.h @@ -59,6 +59,23 @@ * char __cwin__[crush_work_size(__map__, __result_max__)]; * crush_init_workspace(__map__, __cwin__); * + * There are two CRUSH variants implemented. Rules of type + * - CRUSH_RULE_TYPE_REPLICATED + * - CRUSH_RULE_TYPE_ERASURE + * use crush_do_rule_no_retry. The crush descent algorithm implemented + * there cannot retry prior steps upon hitting an out osd, so such rules + * rely on the chooseleaf variant to implement failure domains and have + * important limitations when mapping multiple OSDs per failure domain. + * See crush_msr_do_rule in mapper.c for a more detailed explanation. + * + * Rules of type + * - CRUSH_RULE_TYPE_MSR_FIRSTN + * - CRUSH_RULE_TYPE_MSR_INDEP + * use crush_msr_do_rule, which retries the full descent when it hits an + * out OSD. This extra flexibility allows it to more effectively map multiple + * OSDs per failure domain. See the comment on crush_msr_do_rule in mapper.c + * for more details. + * * @param map the crush_map * @param ruleno a positive integer < __CRUSH_MAX_RULES__ * @param x the value to map to __result_max__ items @@ -77,15 +94,11 @@ extern int crush_do_rule(const struct crush_map *map, const __u32 *weights, int weight_max, void *cwin, const struct crush_choose_arg *choose_args); -/* Returns the exact amount of workspace that will need to be used - for a given combination of crush_map and result_max. The caller can - then allocate this much on its own, either on the stack, in a - per-thread long-lived buffer, or however it likes. */ - -static inline size_t crush_work_size(const struct crush_map *map, - int result_max) { - return map->working_size + result_max * 3 * sizeof(__u32); -} +/* Returns enough workspace for any crush rule within map to generate + result_max outputs. The caller can then allocate this much on its own, + either on the stack, in a per-thread long-lived buffer, or however it likes.*/ +extern size_t crush_work_size(const struct crush_map *map, + int result_max); extern void crush_init_workspace(const struct crush_map *m, void *v); -- 2.39.5