-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
#ifndef __CRUSH_WRAPPER_H
for (std::map<int, string>::iterator p = f.begin(); p != f.end(); p++)
r[p->second] = p->first;
}
-
+
public:
CrushWrapper() : crush(0), have_rmaps(false) {}
~CrushWrapper() {
if (crush) crush_destroy(crush);
- }
+ }
/* building */
void create() {
if (name_rmap.count(name))
return name_rmap[name];
return 0; /* hrm */
- }
+ }
const char *get_item_name(int t) {
if (name_map.count(t))
return name_map[t].c_str();
if (rule_name_rmap.count(name))
return rule_name_rmap[name];
return 0; /* hrm */
- }
+ }
const char *get_rule_name(int t) {
if (rule_name_map.count(t))
return rule_name_map[t].c_str();
if (d >= crush->max_devices) return -1;
return crush->device_offload[d];
}
-
-
+
+
/*** rules ***/
private:
crush_rule *get_rule(unsigned ruleno) {
if (!crush) return (crush_rule *)(-ENOENT);
- if (ruleno >= crush->max_rules)
+ if (ruleno >= crush->max_rules)
return 0;
return crush->rules[ruleno];
}
int set_rule_step_emit(unsigned ruleno, unsigned step) {
return set_rule_step(ruleno, step, CRUSH_RULE_EMIT, 0, 0);
}
-
-
+
+
/** buckets **/
private:
crush_bucket *get_bucket(int id) {
if (!crush) return (crush_bucket *)(-ENOENT);
int pos = -1 - id;
- if ((unsigned)pos >= crush->max_buckets) return 0;
+ if (pos >= crush->max_buckets) return 0;
return crush->buckets[pos];
}
crush_bucket *b = crush_make_bucket(alg, type, size, items, weights);
return crush_add_bucket(crush, bucketno, b);
}
-
+
void finalize() {
assert(crush);
crush_finalize(crush);
}
void do_rule(int rule, int x, vector<int>& out, int maxout, int forcefeed) {
int rawout[maxout];
-
+
int numrep = crush_do_rule(crush, rule, x, rawout, maxout, forcefeed);
out.resize(numrep);
for (map<int,double>::iterator p = weights.begin(); p != weights.end(); p++)
if (p->second > max)
max = p->second;
-
+
for (map<int,double>::iterator p = weights.begin(); p != weights.end(); p++) {
unsigned w = 0x10000 - (unsigned)(p->second / max * 0x10000);
set_offload(p->first, w);
}
}
-
+
int read_from_file(const char *fn) {
::encode(crush->device_offload[i], bl);
// buckets
- for (unsigned i=0; i<crush->max_buckets; i++) {
+ for (int i=0; i<crush->max_buckets; i++) {
__u32 alg = 0;
if (crush->buckets[i]) alg = crush->buckets[i]->alg;
::encode(alg, bl);
::encode(crush->buckets[i]->size, bl);
for (unsigned j=0; j<crush->buckets[i]->size; j++)
::encode(crush->buckets[i]->items[j], bl);
-
+
switch (crush->buckets[i]->alg) {
case CRUSH_BUCKET_UNIFORM:
for (unsigned j=0; j<crush->buckets[i]->size; j++)
break;
case CRUSH_BUCKET_TREE:
- for (unsigned j=0; j<crush->buckets[i]->size; j++)
+ for (unsigned j=0; j<crush->buckets[i]->size; j++)
::encode(((crush_bucket_tree*)crush->buckets[i])->node_weights[j], bl);
break;
crush->device_offload = (__u32*)malloc(sizeof(crush->device_offload[0])*crush->max_devices);
for (int i=0; i < crush->max_devices; i++)
::decode(crush->device_offload[i], blp);
-
+
// buckets
crush->buckets = (crush_bucket**)malloc(sizeof(crush_bucket*)*crush->max_buckets);
- for (unsigned i=0; i<crush->max_buckets; i++) {
+ for (int i=0; i<crush->max_buckets; i++) {
__u32 alg;
::decode(alg, blp);
if (!alg) {
}
crush->buckets[i] = (crush_bucket*)malloc(size);
memset(crush->buckets[i], 0, size);
-
+
::decode(crush->buckets[i]->id, blp);
::decode(crush->buckets[i]->type, blp);
::decode(crush->buckets[i]->alg, blp);
switch (crush->buckets[i]->alg) {
case CRUSH_BUCKET_UNIFORM:
- ((crush_bucket_uniform*)crush->buckets[i])->primes =
+ ((crush_bucket_uniform*)crush->buckets[i])->primes =
(__u32*)malloc(crush->buckets[i]->size * sizeof(__u32));
for (unsigned j=0; j<crush->buckets[i]->size; j++)
::decode(((crush_bucket_uniform*)crush->buckets[i])->primes[j], blp);
break;
case CRUSH_BUCKET_LIST:
- ((crush_bucket_list*)crush->buckets[i])->item_weights =
+ ((crush_bucket_list*)crush->buckets[i])->item_weights =
(__u32*)malloc(crush->buckets[i]->size * sizeof(__u32));
- ((crush_bucket_list*)crush->buckets[i])->sum_weights =
+ ((crush_bucket_list*)crush->buckets[i])->sum_weights =
(__u32*)malloc(crush->buckets[i]->size * sizeof(__u32));
for (unsigned j=0; j<crush->buckets[i]->size; j++) {
break;
case CRUSH_BUCKET_TREE:
- ((crush_bucket_tree*)crush->buckets[i])->node_weights =
+ ((crush_bucket_tree*)crush->buckets[i])->node_weights =
(__u32*)malloc(crush->buckets[i]->size * sizeof(__u32));
- for (unsigned j=0; j<crush->buckets[i]->size; j++)
+ for (unsigned j=0; j<crush->buckets[i]->size; j++)
::decode(((crush_bucket_tree*)crush->buckets[i])->node_weights[j], blp);
break;
case CRUSH_BUCKET_STRAW:
- ((crush_bucket_straw*)crush->buckets[i])->straws =
+ ((crush_bucket_straw*)crush->buckets[i])->straws =
(__u32*)malloc(crush->buckets[i]->size * sizeof(__u32));
- ((crush_bucket_straw*)crush->buckets[i])->item_weights =
+ ((crush_bucket_straw*)crush->buckets[i])->item_weights =
(__u32*)malloc(crush->buckets[i]->size * sizeof(__u32));
for (unsigned j=0; j<crush->buckets[i]->size; j++) {
::decode(((crush_bucket_straw*)crush->buckets[i])->item_weights[j], blp);
#include "crush.h"
#include "hash.h"
-int
+int
void crush_finalize(struct crush_map *map)
{
int b, i;
-
+
/* calc max_devices */
for (b=0; b<map->max_buckets; b++) {
if (map->buckets[b] == 0) continue;
- for (i=0; i<map->buckets[b]->size; i++)
+ for (i=0; i<map->buckets[b]->size; i++)
if (map->buckets[b]->items[i] >= map->max_devices)
map->max_devices = map->buckets[b]->items[i] + 1;
}
-
+
/* allocate arrays */
map->device_parents = malloc(sizeof(map->device_parents[0]) * map->max_devices);
memset(map->device_parents, 0, sizeof(map->device_parents[0]) * map->max_devices);
map->bucket_parents = malloc(sizeof(map->bucket_parents[0]) * map->max_buckets);
memset(map->bucket_parents, 0, sizeof(map->bucket_parents[0]) * map->max_buckets);
-
+
/* build parent maps */
crush_calc_parents(map);
map->rules = realloc(map->rules, map->max_rules * sizeof(map->rules[0]));
memset(map->rules + oldsize, 0, (map->max_rules-oldsize) * sizeof(map->rules[0]));
}
-
+
/* add it */
map->rules[ruleno] = rule;
return ruleno;
{
int pos;
for (pos=0; pos < map->max_buckets; pos++)
- if (map->buckets[pos] == 0)
+ if (map->buckets[pos] == 0)
break;
return -1 - pos;
}
int pos;
/* find a bucket id */
- if (id == 0)
+ if (id == 0)
id = crush_get_next_bucket_id(map);
pos = -1 - id;
{
int i, j, x;
struct crush_bucket_uniform *bucket;
-
+
bucket = malloc(sizeof(*bucket));
memset(bucket, 0, sizeof(*bucket));
bucket->h.alg = CRUSH_BUCKET_UNIFORM;
bucket->h.type = type;
bucket->h.size = size;
bucket->h.weight = size * item_weight;
-
+
bucket->item_weight = item_weight;
-
+
bucket->h.items = malloc(sizeof(__u32)*size);
for (i=0; i<size; i++)
bucket->h.items[i] = items[i];
-
+
/* generate some primes */
bucket->primes = malloc(sizeof(__u32)*size);
x = size + 1;
x += crush_hash32(size) % (3*size); /* make it big */
x |= 1; /* and odd */
-
+
i=0;
while (i < size) {
- for (j=2; j*j <= x; j++)
+ for (j=2; j*j <= x; j++)
if (x % j == 0) break;
- if (j*j > x)
+ if (j*j > x)
bucket->primes[i++] = x;
x += 2;
}
bucket->h.alg = CRUSH_BUCKET_LIST;
bucket->h.type = type;
bucket->h.size = size;
-
+
bucket->h.items = malloc(sizeof(__u32)*size);
bucket->item_weights = malloc(sizeof(__u32)*size);
bucket->sum_weights = malloc(sizeof(__u32)*size);
/*
* caller will place new items at end. so, we reverse things,
* since we put new items at the beginning.
- */
+ */
for (i=0; i<size; i++) {
int pos = size - i - 1;
bucket->h.items[pos] = items[i];
/*printf("%d item %d weight %d sum %d\n",
i, items[i], weights[i], bucket->sum_weights[i]);*/
}
-
+
bucket->h.weight = w;
return bucket;
static int height(int n) {
int h = 0;
while ((n & 1) == 0) {
- h++;
+ h++;
n = n >> 1;
}
return h;
}
-static int on_right(int n, int h) {
- return n & (1 << (h+1));
+static int on_right(int n, int h) {
+ return n & (1 << (h+1));
}
-static int parent(int n)
+static int parent(int n)
{
int h = height(n);
if (on_right(n, h))
int depth;
int node;
int t, i, j;
-
+
bucket = malloc(sizeof(*bucket));
memset(bucket, 0, sizeof(*bucket));
bucket->h.alg = CRUSH_BUCKET_TREE;
memset(bucket->h.items, 0, sizeof(__u32)*bucket->h.size);
memset(bucket->node_weights, 0, sizeof(__u32)*bucket->h.size);
-
+
for (i=0; i<size; i++) {
node = ((i+1) << 1)-1;
bucket->h.items[node] = items[i];
/* straw bucket */
struct crush_bucket_straw *
-crush_make_straw_bucket(int type,
+crush_make_straw_bucket(int type,
int size,
int *items,
int *weights)
struct crush_bucket_straw *bucket;
int *reverse;
int i, j, k;
-
+
double straw, wbelow, lastw, wnext, pbelow;
int numleft;
-
+
bucket = malloc(sizeof(*bucket));
memset(bucket, 0, sizeof(*bucket));
bucket->h.alg = CRUSH_BUCKET_STRAW;
bucket->h.type = type;
bucket->h.size = size;
-
+
bucket->h.items = malloc(sizeof(__u32)*size);
bucket->item_weights = malloc(sizeof(__u32)*size);
bucket->straws = malloc(sizeof(__u32)*size);
-
+
bucket->h.weight = 0;
for (i=0; i<size; i++) {
bucket->h.items[i] = items[i];
bucket->h.weight += weights[i];
bucket->item_weights[i] = weights[i];
}
-
+
/* reverse sort by weight (simple insertion sort) */
reverse = malloc(sizeof(int) * size);
reverse[0] = 0;
if (j == i)
reverse[i] = i;
}
-
+
numleft = size;
straw = 1.0;
wbelow = 0;
lastw = 0;
-
+
i=0;
while (i < size) {
/* set this item's straw */
bucket->straws[reverse[i]] = straw * 0x10000;
- /*printf("item %d at %d weight %d straw %d (%lf)\n",
+ /*printf("item %d at %d weight %d straw %d (%lf)\n",
items[reverse[i]],
reverse[i], weights[reverse[i]], bucket->straws[reverse[i]], straw);*/
i++;
if (i == size) break;
-
+
/* same weight as previous? */
if (weights[reverse[i]] == weights[reverse[i-1]]) {
/*printf("same as previous\n");*/
continue;
}
-
+
/* adjust straw for next guy */
wbelow += ((double)weights[reverse[i-1]] - lastw) * numleft;
for (j=i; j<size; j++)
wnext = numleft * (weights[reverse[i]] - weights[reverse[i-1]]);
pbelow = wbelow / (wbelow + wnext);
/*printf("wbelow %lf wnext %lf pbelow %lf\n", wbelow, wnext, pbelow);*/
-
+
straw *= pow((double)1.0 / pbelow, (double)1.0 / (double)numleft);
-
+
lastw = weights[reverse[i-1]];
}
-
+
free(reverse);
-
+
return bucket;
}
else
item_weight = 0;
return (struct crush_bucket *)crush_make_uniform_bucket(type, size, items, item_weight);
-
+
case CRUSH_BUCKET_LIST:
return (struct crush_bucket *)crush_make_list_bucket(type, size, items, weights);
case CRUSH_BUCKET_TREE:
return (struct crush_bucket *)crush_make_tree_bucket(type, size, items, weights);
-
+
case CRUSH_BUCKET_STRAW:
return (struct crush_bucket *)crush_make_straw_bucket(type, size, items, weights);
- }
+ }
return 0;
}
#else
# include <stdlib.h>
# include <assert.h>
-# define kfree(x) free(x)
+# define kfree(x) do { if (x) free(x); } while (0)
# define BUG_ON(x) assert(!(x))
#endif
#include "crush.h"
-int crush_get_bucket_item_weight(struct crush_bucket *b, int pos)
+/**
+ * crush_get_bucket_item_weight - Get weight of an item in given bucket
+ * @b: bucket pointer
+ * @p: item index in bucket
+ */
+int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
{
- if (pos >= b->size)
+ if (p >= b->size)
return 0;
- switch (b->alg) {
+
+ switch (b->alg) {
case CRUSH_BUCKET_UNIFORM:
return ((struct crush_bucket_uniform*)b)->item_weight;
case CRUSH_BUCKET_LIST:
- return ((struct crush_bucket_list*)b)->item_weights[pos];
- case CRUSH_BUCKET_TREE:
- if (pos & 1)
- return ((struct crush_bucket_tree*)b)->node_weights[pos];
+ return ((struct crush_bucket_list*)b)->item_weights[p];
+ case CRUSH_BUCKET_TREE:
+ if (p & 1)
+ return ((struct crush_bucket_tree*)b)->node_weights[p];
return 0;
case CRUSH_BUCKET_STRAW:
- return ((struct crush_bucket_straw*)b)->item_weights[pos];
+ return ((struct crush_bucket_straw*)b)->item_weights[p];
}
return 0;
}
-
+/**
+ * crush_calc_parents - Calculate parent vectors for the given crush map.
+ * @map: crush_map pointer
+ */
void crush_calc_parents(struct crush_map *map)
{
int i, b, c;
- for (b=0; b<map->max_buckets; b++) {
- if (map->buckets[b] == NULL) continue;
- for (i=0; i<map->buckets[b]->size; i++) {
+
+ for (b = 0; b < map->max_buckets; b++) {
+ if (map->buckets[b] == NULL)
+ continue;
+ for (i = 0; i < map->buckets[b]->size; i++) {
c = map->buckets[b]->items[i];
- BUG_ON(c >= map->max_devices);
+ BUG_ON(c >= map->max_devices ||
+ c < -map->max_buckets);
if (c >= 0)
map->device_parents[c] = map->buckets[b]->id;
else
kfree(b);
}
+/**
+ * crush_destroy_bucket - Destroy a single bucket of any algorithm type
+ * @b: bucket pointer; may be NULL, in which case nothing is done
+ *
+ * Dispatches on @b->alg to the matching crush_destroy_bucket_* helper.
+ * A bucket whose alg matches none of the known types is left untouched.
+ */
+void crush_destroy_bucket(struct crush_bucket *b)
+{
+	/* tolerate NULL the same way kfree() does */
+	if (b == NULL)
+		return;
+
+	switch (b->alg) {
+	case CRUSH_BUCKET_UNIFORM:
+		crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
+		break;
+	case CRUSH_BUCKET_LIST:
+		crush_destroy_bucket_list((struct crush_bucket_list *)b);
+		break;
+	case CRUSH_BUCKET_TREE:
+		crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
+		break;
+	case CRUSH_BUCKET_STRAW:
+		crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
+		break;
+	}
+}
-/*
- * deallocate
+/**
+ * crush_destroy - Destroy a crush_map
+ * @map: crush_map pointer
*/
void crush_destroy(struct crush_map *map)
{
int b;
-
+
/* buckets */
if (map->buckets) {
- for (b=0; b<map->max_buckets; b++) {
- if (map->buckets[b] == NULL) continue;
- switch (map->buckets[b]->alg) {
- case CRUSH_BUCKET_UNIFORM:
- crush_destroy_bucket_uniform((struct crush_bucket_uniform*)map->buckets[b]);
- break;
- case CRUSH_BUCKET_LIST:
- crush_destroy_bucket_list((struct crush_bucket_list*)map->buckets[b]);
- break;
- case CRUSH_BUCKET_TREE:
- crush_destroy_bucket_tree((struct crush_bucket_tree*)map->buckets[b]);
- break;
- case CRUSH_BUCKET_STRAW:
- crush_destroy_bucket_straw((struct crush_bucket_straw*)map->buckets[b]);
- break;
- }
+ for (b = 0; b < map->max_buckets; b++) {
+ if (map->buckets[b] == NULL)
+ continue;
+ crush_destroy_bucket(map->buckets[b]);
}
kfree(map->buckets);
}
-
+
/* rules */
if (map->rules) {
- for (b=0; b<map->max_rules; b++) {
- if (map->rules[b] == NULL) continue;
+ for (b = 0; b < map->max_rules; b++)
kfree(map->rules[b]);
- }
kfree(map->rules);
}
-
- if (map->bucket_parents)
- kfree(map->bucket_parents);
- if (map->device_parents)
- kfree(map->device_parents);
- if (map->device_offload)
- kfree(map->device_offload);
+
+ kfree(map->bucket_parents);
+ kfree(map->device_parents);
+ kfree(map->device_offload);
kfree(map);
}
#include <linux/types.h>
-/*** RULES ***/
+/*
+ * CRUSH is a pseudo-random data distribution algorithm that
+ * efficiently distributes input values (typically, data objects)
+ * across a heterogeneous, structured storage cluster.
+ *
+ * The algorithm was originally described in detail in this paper
+ * (although the algorithm has evolved somewhat since then):
+ *
+ * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
+ */
+
+
+#define CRUSH_MAX_DEPTH 10
+#define CRUSH_MAX_SET 10
+
+
+/*
+ * CRUSH uses user-defined "rules" to describe how inputs should be
+ * mapped to devices. A rule consists of a sequence of steps to perform
+ * to generate the set of output devices.
+ */
+struct crush_rule_step {
+ __u32 op;
+ __s32 arg1;
+ __s32 arg2;
+};
+
+/* step op codes */
enum {
CRUSH_RULE_NOOP = 0,
CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
};
-#define CRUSH_MAX_DEPTH 10
-#define CRUSH_MAX_SET 10
-
/*
- * for specifying choose numrep relative to the max
- * parameter passed to do_rule
+ * for specifying choose num (arg1) relative to the max parameter
+ * passed to do_rule
*/
#define CRUSH_CHOOSE_N 0
#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
-struct crush_rule_step {
- __u32 op;
- __s32 arg1;
- __s32 arg2;
-};
-
+/*
+ * The rule mask is used to describe what the rule is intended for.
+ * Given a storage pool and size of output set, we search through the
+ * rule list for a matching rule_mask.
+ */
struct crush_rule_mask {
__u8 pool;
__u8 type;
-/*** BUCKETS ***/
-
-/* bucket algorithms */
+/*
+ * A bucket is a named container of other items (either devices or
+ * other buckets). Items within a bucket are chosen using one of a
+ * few different algorithms. The table summarizes how the speed of
+ * each option measures up against mapping stability when items are
+ * added or removed.
+ *
+ * Bucket Alg Speed Additions Removals
+ * ------------------------------------------------
+ * uniform O(1) poor poor
+ * list O(n) optimal poor
+ * tree O(log n) good good
+ * straw O(n) optimal optimal
+ */
enum {
CRUSH_BUCKET_UNIFORM = 1,
CRUSH_BUCKET_LIST = 2,
struct crush_bucket {
__s32 id; /* this'll be negative */
- __u16 type; /* non-zero; 0 is reserved for devices */
+ __u16 type; /* non-zero; type=0 is reserved for devices */
__u16 alg; /* one of CRUSH_BUCKET_* */
__u32 weight; /* 16-bit fixed point */
__u32 size; /* num items */
struct crush_bucket_uniform {
struct crush_bucket h;
__u32 *primes;
- __u32 item_weight; /* 16-bit fixed point */
+ __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
};
struct crush_bucket_list {
struct crush_bucket h;
__u32 *item_weights; /* 16-bit fixed point */
- __u32 *sum_weights; /* 16-bit fixed point. element i is sum of weights 0..i, inclusive */
+ __u32 *sum_weights; /* 16-bit fixed point. element i is sum
+ of weights 0..i, inclusive */
};
struct crush_bucket_tree {
- struct crush_bucket h; /* note: h.size is tree size, not number of actual items */
+ struct crush_bucket h; /* note: h.size is _tree_ size, not number of
+ actual items */
__u32 *node_weights;
};
struct crush_bucket_straw {
struct crush_bucket h;
- __u32 *item_weights;
- __u32 *straws; /* 16-bit fixed point */
+ __u32 *item_weights; /* 16-bit fixed point */
+ __u32 *straws; /* 16-bit fixed point */
};
-/*** CRUSH ***/
-
+/*
+ * CRUSH map includes all buckets, rules, etc.
+ */
struct crush_map {
struct crush_bucket **buckets;
struct crush_rule **rules;
-
- /* parent pointers */
+
+ /*
+ * Parent pointers to identify the parent bucket of a device or
+ * bucket in the hierarchy. If an item appears more than
+ * once, this is the _last_ time it appeared (where buckets
+ * are processed in bucket id order, from -1 on down to
+ * -max_buckets).
+ */
__u32 *bucket_parents;
__u32 *device_parents;
-
- /* offload
- * size max_devices, values 0...0xffff
+
+ /*
+ * device offload.
+ * size max_devices, values 0..0x10000
* 0 == normal
* 0x10000 == 100% offload (i.e. failed)
*/
- __u32 *device_offload;
-
- __u32 max_buckets;
+ __u32 *device_offload;
+
+ __s32 max_buckets;
__u32 max_rules;
__s32 max_devices;
};
-/* common */
+/* crush.c */
extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
-extern void crush_calc_parents(struct crush_map *m);
-extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *);
-extern void crush_destroy_bucket_list(struct crush_bucket_list *);
-extern void crush_destroy_bucket_tree(struct crush_bucket_tree *);
-extern void crush_destroy_bucket_straw(struct crush_bucket_straw *);
+extern void crush_calc_parents(struct crush_map *map);
+extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
+extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
+extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
+extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
+extern void crush_destroy_bucket(struct crush_bucket *b);
extern void crush_destroy(struct crush_map *map);
#endif
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
+ * License version 2.1, as published by the Free Software
* Foundation. See file COPYING.
- *
+ *
*/
#ifndef __CRUSH_GRAMMAR
rule<ScannerT, parser_context<>, parser_tag<_crushrule> > crushrule;
rule<ScannerT, parser_context<>, parser_tag<_crushmap> > crushmap;
-
+
definition(crush_grammar const& /*self*/)
{
// base types
>> !( ( str_p("offload") >> real_p ) |
( str_p("load") >> real_p ) |
str_p("down"));
-
+
// bucket types
bucket_type = str_p("type") >> posint >> name;
// buckets
bucket_id = str_p("id") >> negint;
- bucket_alg = str_p("alg") >> ( str_p("uniform") |
- str_p("list") |
- str_p("tree") |
+ bucket_alg = str_p("alg") >> ( str_p("uniform") |
+ str_p("list") |
+ str_p("tree") |
str_p("straw") );
bucket_item = str_p("item") >> name
>> !( str_p("weight") >> real_p )
>> !( str_p("pos") >> posint );
bucket = name >> name >> '{' >> !bucket_id >> bucket_alg >> *bucket_item >> '}';
-
+
// rules
step_take = str_p("take") >> name;
step_choose = str_p("choose")
>> integer
>> str_p("type") >> name;
step_emit = str_p("emit");
- step = str_p("step") >> ( step_take |
- step_choose |
- step_chooseleaf |
+ step = str_p("step") >> ( step_take |
+ step_choose |
+ step_chooseleaf |
step_emit );
- crushrule = str_p("rule") >> !name >> '{'
+ crushrule = str_p("rule") >> !name >> '{'
>> str_p("pool") >> posint
>> str_p("type") >> ( str_p("replicated") | str_p("raid4") )
>> str_p("min_size") >> posint
// the whole crush map
crushmap = *(device | bucket_type) >> *bucket >> *crushrule;
}
-
- rule<ScannerT, parser_context<>, parser_tag<_crushmap> > const&
+
+ rule<ScannerT, parser_context<>, parser_tag<_crushmap> > const&
start() const { return crushmap; }
};
};
// http://burtleburtle.net/bob/hash/evahash.html
// a, b = random bits, c = input and output
#define hashmix(a,b,c) \
- a=a-b; a=a-c; a=a^(c>>13); \
- b=b-c; b=b-a; b=b^(a<<8); \
- c=c-a; c=c-b; c=c^(b>>13); \
- a=a-b; a=a-c; a=a^(c>>12); \
- b=b-c; b=b-a; b=b^(a<<16); \
- c=c-a; c=c-b; c=c^(b>>5); \
- a=a-b; a=a-c; a=a^(c>>3); \
- b=b-c; b=b-a; b=b^(a<<10); \
- c=c-a; c=c-b; c=c^(b>>15);
+ a=a-b; a=a-c; a=a^(c>>13); \
+ b=b-c; b=b-a; b=b^(a<<8); \
+ c=c-a; c=c-b; c=c^(b>>13); \
+ a=a-b; a=a-c; a=a^(c>>12); \
+ b=b-c; b=b-a; b=b^(a<<16); \
+ c=c-a; c=c-b; c=c^(b>>5); \
+ a=a-b; a=a-c; a=a^(c>>3); \
+ b=b-c; b=b-a; b=b^(a<<10); \
+ c=c-a; c=c-b; c=c^(b>>15);
#define crush_hash_seed 1315423911
#include "hash.h"
-
+/**
+ * crush_find_rule - find a crush_rule id for a given pool, type, and size.
+ * @map: the crush_map
+ * @pool: the storage pool id (user defined)
+ * @type: storage pool type (user defined)
+ * @size: output set size
+ */
int crush_find_rule(struct crush_map *map, int pool, int type, int size)
{
int i;
+
for (i = 0; i < map->max_rules; i++) {
if (map->rules[i] &&
map->rules[i]->mask.pool == pool &&
}
-/** bucket choose methods **/
+/*
+ * bucket choose methods
+ *
+ * For each bucket algorithm, we have a "choose" method that, given a
+ * crush input @x and replica position (usually, position in output set) @r,
+ * will produce an item in the bucket.
+ */
/* uniform */
-
-static int
-crush_bucket_uniform_choose(struct crush_bucket_uniform *bucket, int x, int r)
+static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
+ int x, int r)
{
- unsigned o, p, s;
- o = crush_hash32_2(x, bucket->h.id) & 0xffff;
- p = bucket->primes[crush_hash32_2(bucket->h.id, x) % bucket->h.size];
- s = (x + o + (r+1)*p) % bucket->h.size;
+ unsigned o = crush_hash32_2(x, bucket->h.id) & 0xffff;
+ unsigned p = bucket->primes[crush_hash32_2(bucket->h.id, x) %
+ bucket->h.size];
+ unsigned s = (x + o + (r+1)*p) % bucket->h.size;
/*printf("%d %d %d %d\n", x, o, r, p);*/
return bucket->h.items[s];
}
-
/* list */
-
-static int
-crush_bucket_list_choose(struct crush_bucket_list *bucket, int x, int r)
+static int bucket_list_choose(struct crush_bucket_list *bucket,
+ int x, int r)
{
int i;
- __u64 w;
-
- for (i=0; i<bucket->h.size; i++) {
- w = crush_hash32_4(x, bucket->h.items[i], r, bucket->h.id);
+
+ for (i = 0; i < bucket->h.size; i++) {
+ __u64 w = crush_hash32_4(x, bucket->h.items[i], r,
+ bucket->h.id);
w &= 0xffff;
- /*printf("%d x %d item %d weight %d sum_weight %d r %lld",
+ /*printf("%d x %d item %d weight %d sum_weight %d r %lld",
i, x, bucket->h.items[i], bucket->item_weights[i], bucket->sum_weights[i], w);*/
w *= bucket->sum_weights[i];
w = w >> 16;
if (w < bucket->item_weights[i])
return bucket->h.items[i];
}
-
+
BUG_ON(1);
return 0;
}
/* tree */
-
static int height(int n) {
int h = 0;
while ((n & 1) == 0) {
- h++;
+ h++;
n = n >> 1;
}
return h;
}
+
static int left(int x) {
int h = height(x);
return x - (1 << (h-1));
}
+
static int right(int x) {
int h = height(x);
return x + (1 << (h-1));
}
+
static int terminal(int x) {
return x & 1;
}
-static int
-crush_bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r)
+static int bucket_tree_choose(struct crush_bucket_tree *bucket,
+ int x, int r)
{
int n, l;
__u32 w;
w = bucket->node_weights[n];
t = (__u64)crush_hash32_4(x, n, r, bucket->h.id) * (__u64)w;
t = t >> 32;
-
- /* left or right? */
+
+ /* descend to the left or right? */
l = left(n);
if (t < bucket->node_weights[l])
n = l;
/* straw */
-static int
-crush_bucket_straw_choose(struct crush_bucket_straw *bucket, int x, int r)
+static int bucket_straw_choose(struct crush_bucket_straw *bucket,
+ int x, int r)
{
int i;
int high = 0;
__u64 high_draw = 0;
__u64 draw;
-
- for (i=0; i<bucket->h.size; i++) {
+
+ for (i = 0; i < bucket->h.size; i++) {
draw = crush_hash32_3(x, bucket->h.items[i], r);
draw &= 0xffff;
draw *= bucket->straws[i];
high_draw = draw;
}
}
-
return bucket->h.items[high];
}
+static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
+{
+ switch (in->alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ return bucket_uniform_choose((struct crush_bucket_uniform *)in,
+ x, r);
+ case CRUSH_BUCKET_LIST:
+ return bucket_list_choose((struct crush_bucket_list *)in, x, r);
+ case CRUSH_BUCKET_TREE:
+ return bucket_tree_choose((struct crush_bucket_tree *)in, x, r);
+ case CRUSH_BUCKET_STRAW:
+ return bucket_straw_choose((struct crush_bucket_straw *)in,
+ x, r);
+ default:
+ BUG_ON(1);
+ return in->items[0];
+ }
+}
-
-/** crush proper **/
-
-
+/*
+ * true if device is marked "out" (failed, fully offloaded)
+ * of the cluster
+ */
static int is_out(struct crush_map *map, int item, int x)
{
if (map->device_offload[item]) {
- if (map->device_offload[item] >= 0x10000)
+ if (map->device_offload[item] >= 0x10000)
return 1;
- else if ((crush_hash32_2(x, item) & 0xffff) < map->device_offload[item])
+ else if ((crush_hash32_2(x, item) & 0xffff) <
+ map->device_offload[item])
return 1;
}
return 0;
}
-/*
- * choose numrep distinct items of given type
+/**
+ * crush_choose - choose numrep distinct items of given type
+ * @map: the crush_map
+ * @bucket: the bucket we are choosing an item from
+ * @x: crush input value
+ * @numrep: the number of items to choose
+ * @type: the type of item to choose
+ * @out: pointer to output vector
+ * @outpos: our position in that vector
+ * @firstn: true if choosing "first n" items, false if choosing "indep"
+ * @recurse_to_leaf: true if we want one device under each item of given type
+ * @out2: second output vector for leaf items (if @recurse_to_leaf)
*/
static int crush_choose(struct crush_map *map,
struct crush_bucket *bucket,
int x, int numrep, int type,
- int *out, int outpos,
+ int *out, int outpos,
int firstn, int recurse_to_leaf,
int *out2)
{
int item;
int itemtype;
int collide, reject;
-
+
for (rep = outpos; rep < numrep; rep++) {
/* keep trying until we get a non-out, non-colliding item */
ftotal = 0;
do {
retry_descent = 0;
in = bucket; /* initial bucket */
-
+
/* choose through intervening buckets */
flocal = 0;
do {
r = rep;
if (in->alg == CRUSH_BUCKET_UNIFORM) {
/* be careful */
- if (firstn || numrep >= in->size)
- r += ftotal; /* r' = r + f_total */
+ if (firstn || numrep >= in->size)
+ /* r' = r + f_total */
+ r += ftotal;
else if (in->size % numrep == 0)
- r += (numrep+1) * flocal; /* r'=r+(n+1)*f_local */
+ /* r'=r+(n+1)*f_local */
+ r += (numrep+1) * flocal;
else
- r += numrep * flocal; /* r' = r + n*f_local */
+ /* r' = r + n*f_local */
+ r += numrep * flocal;
} else {
- if (firstn)
- r += ftotal; /* r' = r + f_total */
- else
- r += numrep * flocal; /* r' = r + n*f_local */
+ if (firstn)
+ /* r' = r + f_total */
+ r += ftotal;
+ else
+ /* r' = r + n*f_local */
+ r += numrep * flocal;
}
/* bucket choose */
- switch (in->alg) {
- case CRUSH_BUCKET_UNIFORM:
- item = crush_bucket_uniform_choose((struct crush_bucket_uniform*)in, x, r);
- break;
- case CRUSH_BUCKET_LIST:
- item = crush_bucket_list_choose((struct crush_bucket_list*)in, x, r);
- break;
- case CRUSH_BUCKET_TREE:
- item = crush_bucket_tree_choose((struct crush_bucket_tree*)in, x, r);
- break;
- case CRUSH_BUCKET_STRAW:
- item = crush_bucket_straw_choose((struct crush_bucket_straw*)in, x, r);
- break;
- default:
- BUG_ON(1);
- item = in->items[0];
- }
+ item = crush_bucket_choose(in, x, r);
BUG_ON(item >= map->max_devices);
-
+
/* desired type? */
- if (item < 0)
+ if (item < 0)
itemtype = map->buckets[-1-item]->type;
- else
+ else
itemtype = 0;
-
+
/* keep going? */
if (itemtype != type) {
- BUG_ON(item >= 0 || (-1-item) >= map->max_buckets);
+ BUG_ON(item >= 0 ||
+ (-1-item) >= map->max_buckets);
in = map->buckets[-1-item];
continue;
}
-
+
/* collision? */
collide = 0;
- for (i=0; i<outpos; i++) {
+ for (i = 0; i < outpos; i++) {
if (out[i] == item) {
collide = 1;
break;
}
}
-
+
/* out? */
- if (itemtype == 0)
+ if (itemtype == 0)
reject = is_out(map, item, x);
- else
+ else
reject = 0;
if (recurse_to_leaf &&
out2+outpos, 0,
firstn, 0, NULL))
reject = 1;
-
+
if (reject || collide) {
ftotal++;
flocal++;
-
- if (collide && flocal < 3)
- retry_bucket = 1; /* retry locally a few times */
+
+ if (collide && flocal < 3)
+ /* retry locally a few times */
+ retry_bucket = 1;
else if (ftotal < 10)
- retry_descent = 1; /* then retry descent */
+ /* then retry descent */
+ retry_descent = 1;
else
- skip_rep = 1; /* else give up */
+ /* else give up */
+ skip_rep = 1;
}
} while (retry_bucket);
} while (retry_descent);
-
+
if (skip_rep) continue;
out[outpos] = item;
outpos++;
}
-
+
return outpos;
}
+/**
+ * crush_do_rule - calculate a mapping with the given input and rule
+ * @map: the crush_map
+ * @ruleno: the rule id
+ * @x: hash input
+ * @result: pointer to result vector
+ * @result_max: maximum result size
+ * @force: force initial replica choice; -1 for none
+ */
int crush_do_rule(struct crush_map *map,
int ruleno, int x, int *result, int result_max,
- int force) /* -1 for none */
+ int force)
{
int result_len;
int force_context[CRUSH_MAX_DEPTH];
int step;
int i,j;
int numrep;
-
+
BUG_ON(ruleno >= map->max_rules);
rule = map->rules[ruleno];
result_len = 0;
}
}
}
-
+
for (step = 0; step < rule->len; step++) {
switch (rule->steps[step].op) {
case CRUSH_RULE_TAKE:
}
wsize = 1;
break;
-
+
case CRUSH_RULE_CHOOSE_FIRSTN:
case CRUSH_RULE_CHOOSE_INDEP:
case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
case CRUSH_RULE_CHOOSE_LEAF_INDEP:
BUG_ON(wsize == 0);
-
+
/* reset output */
osize = 0;
-
+
recurse_to_leaf = rule->steps[step].op >=
CRUSH_RULE_CHOOSE_LEAF_FIRSTN;
for (i = 0; i < wsize; i++) {
/*
* see CRUSH_N, CRUSH_N_MINUS macros.
* basically, numrep <= 0 means relative to
- * the provided result_max
+ * the provided result_max
*/
numrep = rule->steps[step].arg1;
if (numrep <= 0) {
}
osize += crush_choose(map,
map->buckets[-1-w[i]],
- x, numrep, rule->steps[step].arg2,
- o+osize, j, rule->steps[step].op == CRUSH_RULE_CHOOSE_FIRSTN,
+ x, numrep,
+ rule->steps[step].arg2,
+ o+osize, j,
+ rule->steps[step].op ==
+ CRUSH_RULE_CHOOSE_FIRSTN,
recurse_to_leaf, c+osize);
}
o = w;
w = tmp;
wsize = osize;
- break;
-
-
+ break;
+
+
case CRUSH_RULE_EMIT:
for (i=0; i<wsize && result_len < result_max; i++) {
result[result_len] = w[i];
}
wsize = 0;
break;
-
+
default:
BUG_ON(1);
}
}
-
+
return result_len;
}
#include "crush.h"
+/*
+ * CRUSH functions for finding rules and then mapping an input to an
+ * output set.
+ */
extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
extern int crush_do_rule(struct crush_map *map,
int ruleno,
int root;
int ruleno;
int r[10];
-
+
int uw[10] = { 1000, 1000, 500, 1000, 2000, 1000, 1000, 3000, 1000, 500 };
struct crush_bucket *b;
struct crush_rule *rule;
-
+
struct crush_map *map = crush_create();
d = 0;
}
for (i=0; i<100; i += 10)
- printf("%2d : %d\n", i, o[i]);
+ printf("%2d : %d\n", i, o[i]);
return 0;
}