EXTRA_CFLAGS = #-I${HOME}/include -L${HOME}/lib
EXTRA_CFLAGS += -g
EXTRA_CFLAGS += -pg
-EXTRA_CFLAGS += -O3
+#EXTRA_CFLAGS += -O3
# base
CFLAGS = -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE ${EXTRA_CFLAGS}
extractosdmaps: extractosdmaps.cc common.o osd.o mon.o ebofs.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
-cmon: cmon.o mon.o msg/SimpleMessenger.o common.o
+cmon: cmon.o mon.o msg/SimpleMessenger.o common.o crush2/libcrush.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
cmonctl: cmonctl.cc msg/SimpleMessenger.o common.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
-cosd: cosd.o osd.o ebofs.o msg/SimpleMessenger.o common.o
+cosd: cosd.o osd.o ebofs.o msg/SimpleMessenger.o common.o crush2/libcrush.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
-cmds: cmds.o mds.o osdc.o msg/SimpleMessenger.o common.o
+cmds: cmds.o mds.o osdc.o msg/SimpleMessenger.o common.o crush2/libcrush.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
-csyn: csyn.o client.o osdc.o msg/SimpleMessenger.o common.o
+csyn: csyn.o client.o osdc.o msg/SimpleMessenger.o common.o crush2/libcrush.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
-cfuse: cfuse.o client.o osdc.o client/fuse.o client/fuse_ll.o msg/SimpleMessenger.o common.o
+cfuse: cfuse.o client.o osdc.o client/fuse.o client/fuse_ll.o msg/SimpleMessenger.o common.o crush2/libcrush.o
${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@
# fake*
-fakefuse: fakefuse.o mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o client/fuse_ll.o msg/FakeMessenger.o common.o
+fakefuse: fakefuse.o mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o client/fuse_ll.o msg/FakeMessenger.o common.o crush2/libcrush.o
${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@
-fakesyn: fakesyn.o mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o
+fakesyn: fakesyn.o mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o crush2/libcrush.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
# mpi startup
-newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/SimpleMessenger.o common.o
+newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/SimpleMessenger.o common.o crush2/libcrush.o
${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@
test_disk_bw: test/test_disk_bw.cc common.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
+# crush
+
+crush2/libcrush.o: force_look
+ cd crush2 ; make
+
+force_look:
+ true
+
# bits
common.o: ${COMMON_OBJS}
${LDINC} $@ $^
osbdb.o: ${OSBDB_OBJS}
${LDINC} $@ $^
-
# generic rules
%.so: %.cc
${CC} -shared -fPIC ${CFLAGS} $< -o $@
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef __CRUSH_WRAPPER_H
+#define __CRUSH_WRAPPER_H
+
+#include "crush.h"
+#include "builder.h"
+
+#include "include/encodable.h"
+
+#include <map>
+#include <set>
+
+class CrushWrapper {
+public:
+ struct crush_map *map;
+
+ CrushWrapper() : map(0) {}
+ ~CrushWrapper() {
+ if (map) crush_destroy(map);
+ }
+
+ void create() {
+ if (map) crush_destroy(map);
+ map = crush_create();
+ }
+ void finalize() {
+ assert(map);
+ crush_finalize(map);
+ }
+
+ void update_offload_map(std::set<int32_t>& out_osds,
+ std::map<int32_t,float>& overload_osds) {
+ for (unsigned i=0; i<map->max_devices; i++) {
+ if (out_osds.count(i))
+ map->device_offload[i] = 0x10000;
+ else if (overload_osds.count(i))
+ map->device_offload[i] = (int)(0x10000 * overload_osds[i]); // FIXME: reverse?
+ else
+ map->device_offload[i] = 0; // normal.
+ }
+ }
+
+ void do_rule(int rule, int x, vector<int>& out, int maxout, int forcefeed) {
+ int rawout[maxout];
+
+ int numrep = crush_do_rule(map, rule, x, rawout, maxout, forcefeed);
+
+ out.resize(numrep);
+ for (int i=0; i<numrep; i++)
+ out[i] = rawout[i];
+ }
+
+ void _encode(bufferlist &bl) {
+ ::_encode_simple(map->max_buckets, bl);
+ ::_encode_simple(map->max_rules, bl);
+ ::_encode_simple(map->max_devices, bl);
+
+ // simple arrays
+ bl.append((char*)map->device_offload, sizeof(map->device_offload[0]) * map->max_devices);
+
+ // buckets
+ for (unsigned i=0; i<map->max_buckets; i++) {
+ __u32 type = 0;
+ if (map->buckets[i]) type = map->buckets[i]->bucket_type;
+ ::_encode_simple(type, bl);
+ if (!type) continue;
+
+ ::_encode_simple(map->buckets[i]->id, bl);
+ ::_encode_simple(map->buckets[i]->type, bl);
+ ::_encode_simple(map->buckets[i]->bucket_type, bl);
+ ::_encode_simple(map->buckets[i]->weight, bl);
+ ::_encode_simple(map->buckets[i]->size, bl);
+ for (unsigned j=0; j<map->buckets[i]->size; j++)
+ ::_encode_simple(map->buckets[i]->items[j], bl);
+
+ switch (map->buckets[i]->type) {
+ case CRUSH_BUCKET_UNIFORM:
+ for (unsigned j=0; j<map->buckets[i]->size; j++)
+ ::_encode_simple(((crush_bucket_uniform*)map->buckets[i])->primes[j], bl);
+ ::_encode_simple(((crush_bucket_uniform*)map->buckets[i])->item_weight, bl);
+ break;
+
+ case CRUSH_BUCKET_LIST:
+ for (unsigned j=0; j<map->buckets[i]->size; j++) {
+ ::_encode_simple(((crush_bucket_list*)map->buckets[i])->item_weights[j], bl);
+ ::_encode_simple(((crush_bucket_list*)map->buckets[i])->sum_weights[j], bl);
+ }
+ break;
+
+ case CRUSH_BUCKET_TREE:
+ for (unsigned j=0; j<map->buckets[i]->size; j++)
+ ::_encode_simple(((crush_bucket_tree*)map->buckets[i])->node_weights[j], bl);
+ break;
+
+ case CRUSH_BUCKET_STRAW:
+ for (unsigned j=0; j<map->buckets[i]->size; j++)
+ ::_encode_simple(((crush_bucket_straw*)map->buckets[i])->straws[j], bl);
+ break;
+ }
+ }
+
+ // rules
+ for (unsigned i=0; i<map->max_rules; i++) {
+ __u32 yes = map->rules[i] ? 1:0;
+ ::_encode_simple(yes, bl);
+ if (!yes) continue;
+
+ ::_encode_simple(map->rules[i]->len, bl);
+ for (unsigned j=0; j<map->rules[i]->len; j++)
+ ::_encode_simple(map->rules[i]->steps[j], bl);
+ }
+ }
+
+ void _decode(bufferlist::iterator &blp) {
+ create();
+ ::_decode_simple(map->max_buckets, blp);
+ ::_decode_simple(map->max_rules, blp);
+ ::_decode_simple(map->max_devices, blp);
+
+ map->device_offload = (__u32*)malloc(sizeof(map->device_offload[0])*map->max_devices);
+ blp.copy(sizeof(map->device_offload[0])*map->max_devices, (char*)map->device_offload);
+
+ // buckets
+ map->buckets = (crush_bucket**)malloc(sizeof(crush_bucket*)*map->max_buckets);
+ for (unsigned i=0; i<map->max_buckets; i++) {
+ __u32 type;
+ ::_decode_simple(type, blp);
+ if (!type) {
+ map->buckets[i] = 0;
+ continue;
+ }
+
+ int size = 0;
+ switch (type) {
+ case CRUSH_BUCKET_UNIFORM:
+ size = sizeof(crush_bucket_uniform);
+ break;
+ case CRUSH_BUCKET_LIST:
+ size = sizeof(crush_bucket_list);
+ break;
+ case CRUSH_BUCKET_TREE:
+ size = sizeof(crush_bucket_tree);
+ break;
+ case CRUSH_BUCKET_STRAW:
+ size = sizeof(crush_bucket_straw);
+ break;
+ default:
+ assert(0);
+ }
+ map->buckets[i] = (crush_bucket*)malloc(size);
+ memset(map->buckets[i], 0, size);
+
+ ::_decode_simple(map->buckets[i]->id, blp);
+ ::_decode_simple(map->buckets[i]->type, blp);
+ ::_decode_simple(map->buckets[i]->bucket_type, blp);
+ ::_decode_simple(map->buckets[i]->weight, blp);
+ ::_decode_simple(map->buckets[i]->size, blp);
+
+ map->buckets[i]->items = (__s32*)malloc(sizeof(__s32)*map->buckets[i]->size);
+ for (unsigned j=0; j<map->buckets[i]->size; j++)
+ ::_decode_simple(map->buckets[i]->items[j], blp);
+
+ switch (map->buckets[i]->type) {
+ case CRUSH_BUCKET_UNIFORM:
+ ((crush_bucket_uniform*)map->buckets[i])->primes =
+ (__u32*)malloc(map->buckets[i]->size * sizeof(__u32));
+ for (unsigned j=0; j<map->buckets[i]->size; j++)
+ ::_decode_simple(((crush_bucket_uniform*)map->buckets[i])->primes[j], blp);
+ ::_decode_simple(((crush_bucket_uniform*)map->buckets[i])->item_weight, blp);
+ break;
+
+ case CRUSH_BUCKET_LIST:
+ ((crush_bucket_list*)map->buckets[i])->item_weights =
+ (__u32*)malloc(map->buckets[i]->size * sizeof(__u32));
+ ((crush_bucket_list*)map->buckets[i])->sum_weights =
+ (__u32*)malloc(map->buckets[i]->size * sizeof(__u32));
+
+ for (unsigned j=0; j<map->buckets[i]->size; j++) {
+ ::_decode_simple(((crush_bucket_list*)map->buckets[i])->item_weights[j], blp);
+ ::_decode_simple(((crush_bucket_list*)map->buckets[i])->sum_weights[j], blp);
+ }
+ break;
+
+ case CRUSH_BUCKET_TREE:
+ ((crush_bucket_tree*)map->buckets[i])->node_weights =
+ (__u32*)malloc(map->buckets[i]->size * sizeof(__u32));
+ for (unsigned j=0; j<map->buckets[i]->size; j++)
+ ::_decode_simple(((crush_bucket_tree*)map->buckets[i])->node_weights[j], blp);
+ break;
+
+ case CRUSH_BUCKET_STRAW:
+ ((crush_bucket_straw*)map->buckets[i])->straws =
+ (__u32*)malloc(map->buckets[i]->size * sizeof(__u32));
+ for (unsigned j=0; j<map->buckets[i]->size; j++)
+ ::_decode_simple(((crush_bucket_straw*)map->buckets[i])->straws[j], blp);
+ break;
+ }
+ }
+
+ // rules
+ map->rules = (crush_rule**)malloc(sizeof(crush_rule*)*map->max_rules);
+ for (unsigned i=0; i<map->max_rules; i++) {
+ __u32 yes;
+ ::_decode_simple(yes, blp);
+ if (!yes) {
+ map->rules[i] = 0;
+ continue;
+ }
+
+ map->rules[i] = (crush_rule*)malloc(sizeof(crush_rule));
+ memset(map->rules[i], 0, sizeof(crush_rule));
+
+ ::_decode_simple(map->rules[i]->len, blp);
+ map->rules[i]->steps = (crush_rule_step*)malloc(sizeof(crush_rule_step) * map->rules[i]->len);
+ for (unsigned j=0; j<map->rules[i]->len; j++)
+ ::_decode_simple(map->rules[i]->steps[j], blp);
+ }
+
+ finalize();
+ }
+};
+
+#endif
CC = gcc
CFLAGS = -Wall
-CFLAGS += -O3 -g
+CFLAGS += -g
+#CFLAGS += -O3
LD = ld
RM = rm
%.o: %.c
${CC} ${CFLAGS} -c $< -o $@
-libcrush.o: crush.o buckets.o
+libcrush.o: crush.o buckets.o builder.o
$(LD) -i -o $@ $^
.depend:
+#include "crush.h"
#include "hash.h"
-#include "buckets.h"
int
crush_bucket_uniform_choose(struct crush_bucket_uniform *bucket, int x, int r)
return 0;
}
+
+static int height(int n) {
+ int h = 0;
+ while ((n & 1) == 0) {
+ h++;
+ n = n >> 1;
+ }
+ return h;
+}
+static int left(int x) {
+ int h = height(x);
+ return x - (1 << (h-1));
+}
+static int right(int x) {
+ int h = height(x);
+ return x + (1 << (h-1));
+}
+static int terminal(int x) {
+ return x & 1;
+}
+
int
crush_bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r)
{
- return 0;
+ int n, l;
+ __u32 w;
+ __u64 t;
+
+ /* start at root */
+ n = bucket->h.size >> 1;
+
+ while (!terminal(n)) {
+ /* pick point in [0, w) */
+ w = bucket->node_weights[n];
+ t = (__u64)crush_hash32_4(x, n, r, bucket->h.id) * (__u64)w;
+ t = t >> 32;
+
+ /* left or right? */
+ l = left(n);
+ if (t < bucket->node_weights[l])
+ n = l;
+ else
+ n = right(n);
+ }
+
+ return bucket->h.items[n];
}
int
void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
{
- free(b->h.items);
+ free(b->node_weights);
}
void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
#ifndef _CRUSH_BUCKETS_H
#define _CRUSH_BUCKETS_H
-#include "types.h"
-
enum {
CRUSH_BUCKET_UNIFORM = 1,
CRUSH_BUCKET_LIST = 2,
};
struct crush_bucket {
- __u32 id;
- __u32 type;
+ __s32 id; /* this'll be negative */
+ __u16 type;
+ __u16 bucket_type;
__u32 weight; /* 16-bit fixed point */
__u32 size; /* num items */
__s32 *items;
struct crush_bucket_uniform {
struct crush_bucket h;
- __u32 item_weight; /* 16-bit fixed point */
__u32 *primes;
+ __u32 item_weight; /* 16-bit fixed point */
};
struct crush_bucket_list {
};
struct crush_bucket_tree {
- struct crush_bucket h;
-
+ struct crush_bucket h; /* note: h.size is tree size, not number of actual items */
+ __u32 *node_weights;
};
struct crush_bucket_straw {
#include "builder.h"
#include "hash.h"
-
-struct crush_map *crush_new()
+struct crush_map *crush_create()
{
struct crush_map *m;
m = malloc(sizeof(*m));
- memset(&m, 0, sizeof(m));
+ memset(m, 0, sizeof(*m));
return m;
}
for (b=0; b<map->max_buckets; b++) {
if (map->buckets[b] == 0) continue;
for (i=0; i<map->buckets[b]->size; i++)
- if (map->buckets[b]->items[i] > map->max_devices)
- map->max_devices = map->buckets[b]->items[i];
+ if (map->buckets[b]->items[i] >= map->max_devices)
+ map->max_devices = map->buckets[b]->items[i] + 1;
}
/* allocate arrays */
map->device_parents = malloc(sizeof(map->device_parents[0]) * map->max_devices);
- map->device_offload = malloc(sizeof(map->device_offload[0]) * map->max_devices);
memset(map->device_parents, 0, sizeof(map->device_parents[0]) * map->max_devices);
- memset(map->device_offload, 0, sizeof(map->device_offload[0]) * map->max_devices);
map->bucket_parents = malloc(sizeof(map->bucket_parents[0]) * map->max_buckets);
memset(map->bucket_parents, 0, sizeof(map->bucket_parents[0]) * map->max_buckets);
- /* build reverse map */
+ /* build parent maps */
for (b=0; b<map->max_buckets; b++) {
if (map->buckets[b] == 0) continue;
for (i=0; i<map->buckets[b]->size; i++) {
c = map->buckets[b]->items[i];
+ BUG_ON(c >= map->max_devices);
if (c >= 0)
map->device_parents[c] = map->buckets[b]->id;
else
- map->bucket_parents[-c] = map->buckets[b]->id;
+ map->bucket_parents[-1-c] = map->buckets[b]->id;
}
}
+
+ /* new device offload map? */
+ if (!map->device_offload) {
+ map->device_offload = malloc(sizeof(map->device_offload[0]) * map->max_devices);
+ memset(map->device_offload, 0, sizeof(map->device_offload[0]) * map->max_devices);
+ }
}
+
+
/*
* deallocate
*/
free(map->bucket_parents);
free(map->device_parents);
free(map->device_offload);
-
- memset(map, 0, sizeof(*map));
+ free(map);
}
/** rules **/
int crush_add_rule(struct crush_map *map,
+ int ruleno,
struct crush_rule *rule)
{
- int id;
-
- /* find a rule id */
- for (id=0; id < map->max_rules; id++)
- if (map->rules[id] == 0) break;
- if (id == map->max_rules) {
+ int oldsize;
+
+ if (ruleno < 0) {
+ for (ruleno=0; ruleno < map->max_rules; ruleno++)
+ if (map->rules[ruleno] == 0) break;
+ }
+ if (ruleno >= map->max_rules) {
/* expand array */
- if (map->max_rules)
- map->max_rules *= 2;
- else
- map->max_rules = 8;
+ oldsize = map->max_rules;
+ map->max_rules = ruleno+1;
map->rules = realloc(map->rules, map->max_rules * sizeof(map->rules[0]));
+ memset(map->rules + oldsize, 0, (map->max_rules-oldsize) * sizeof(map->rules[0]));
}
/* add it */
- map->rules[id] = rule;
- return id;
+ map->rules[ruleno] = rule;
+ return ruleno;
}
struct crush_rule *crush_make_rule()
struct crush_rule *rule;
rule = malloc(sizeof(struct crush_rule));
- memset(&rule, 0, sizeof(rule));
+ memset(rule, 0, sizeof(*rule));
return rule;
}
{
rule->len++;
if (rule->steps)
- rule->steps = malloc(sizeof(rule->steps[0])*rule->len);
- else
rule->steps = realloc(rule->steps, sizeof(rule->steps[0])*rule->len);
+ else
+ rule->steps = malloc(sizeof(rule->steps[0])*rule->len);
rule->steps[rule->len-1].op = op;
rule->steps[rule->len-1].arg1 = arg1;
rule->steps[rule->len-1].arg2 = arg2;
struct crush_bucket *bucket)
{
int id;
+ int oldsize;
/* find a bucket id */
for (id=0; id < map->max_buckets; id++)
if (map->buckets[id] == 0) break;
if (id == map->max_buckets) {
/* expand array */
+ oldsize = map->max_buckets;
if (map->max_buckets)
map->max_buckets *= 2;
else
map->max_buckets = 8;
map->buckets = realloc(map->buckets, map->max_buckets * sizeof(map->buckets[0]));
+ memset(map->buckets + oldsize, 0, (map->max_buckets-oldsize) * sizeof(map->buckets[0]));
}
/* add it */
- bucket->id = id;
+ bucket->id = -1 - id;
map->buckets[id] = bucket;
- return id;
+ return -1 - id;
}
-void crush_make_uniform_bucket(struct crush_map *map,
- struct crush_bucket_uniform *bucket,
- int size,
- int *items,
- int item_weight)
+struct crush_bucket_uniform *
+crush_make_uniform_bucket(int type, int size,
+ int *items,
+ int item_weight)
{
int i, j, x;
+ struct crush_bucket_uniform *bucket;
+ bucket = malloc(sizeof(*bucket));
memset(bucket, 0, sizeof(*bucket));
+ bucket->h.bucket_type = CRUSH_BUCKET_UNIFORM;
+ bucket->h.type = type;
bucket->h.size = size;
bucket->h.weight = size * item_weight;
bucket->primes[i++] = x;
x += 2;
}
+
+ return bucket;
}
-void crush_make_list_bucket(struct crush_map *map,
- struct crush_bucket_list *bucket,
- int size,
- int *items,
- int *weights)
+struct crush_bucket_list*
+crush_make_list_bucket(int type, int size,
+ int *items,
+ int *weights)
{
int i;
int w;
-
+ struct crush_bucket_list *bucket;
+
+ bucket = malloc(sizeof(*bucket));
memset(bucket, 0, sizeof(*bucket));
+ bucket->h.bucket_type = CRUSH_BUCKET_LIST;
+ bucket->h.type = type;
bucket->h.size = size;
bucket->h.items = malloc(sizeof(__u32)*size);
}
bucket->h.weight = w;
+
+ return bucket;
+}
+
+static int height(int n) {
+ int h = 0;
+ while ((n & 1) == 0) {
+ h++;
+ n = n >> 1;
+ }
+ return h;
+}
+static int on_right(int n, int h) {
+ return n & (1 << (h+1));
+}
+static int parent(int n)
+{
+ int h = height(n);
+ if (on_right(n, h))
+ return n - (1<<h);
+ else
+ return n + (1<<h);
+}
+
+struct crush_bucket_tree*
+crush_make_tree_bucket(int type, int size,
+ int *items, /* in leaf order */
+ int *weights)
+{
+ struct crush_bucket_tree *bucket;
+ int depth;
+ int node;
+ int t, i, j;
+
+ bucket = malloc(sizeof(*bucket));
+ memset(bucket, 0, sizeof(*bucket));
+ bucket->h.bucket_type = CRUSH_BUCKET_TREE;
+ bucket->h.type = type;
+ bucket->h.size = size;
+
+ /* calc tree depth */
+ depth = 1;
+ t = size - 1;
+ while (t) {
+ t = t >> 1;
+ depth++;
+ }
+ bucket->h.size = 1 << depth;
+
+ bucket->h.items = malloc(sizeof(__u32)*bucket->h.size);
+ bucket->node_weights = malloc(sizeof(__u32)*bucket->h.size);
+
+ memset(bucket->h.items, 0, sizeof(__u32)*bucket->h.size);
+ memset(bucket->node_weights, 0, sizeof(__u32)*bucket->h.size);
+
+ for (i=0; i<size; i++) {
+ node = ((i+1) << 1)-1;
+ bucket->h.items[i] = items[i];
+ bucket->node_weights[node] = weights[i];
+ bucket->h.weight += weights[i];
+ for (j=1; j<depth; j++) {
+ node = parent(node);
+ bucket->node_weights[node] += weights[i];
+ }
+ }
+ BUG_ON(bucket->node_weights[bucket->h.size/2] != bucket->h.weight);
+
+ return bucket;
}
-void crush_make_straw_bucket(struct crush_map *map,
- struct crush_bucket_straw *bucket,
- int size,
- int *items,
- int *weights)
+struct crush_bucket_straw *
+crush_make_straw_bucket(int type,
+ int size,
+ int *items,
+ int *weights)
{
+ struct crush_bucket_straw *bucket;
int *reverse;
int i, j, k;
double straw, wbelow, lastw, wnext, pbelow;
int numleft;
+ bucket = malloc(sizeof(*bucket));
memset(bucket, 0, sizeof(*bucket));
+ bucket->h.bucket_type = CRUSH_BUCKET_STRAW;
+ bucket->h.type = type;
bucket->h.size = size;
bucket->h.items = malloc(sizeof(__u32)*size);
free(reverse);
+ return bucket;
}
--- /dev/null
+#ifndef _CRUSH_BUILDER_H
+#define _CRUSH_BUILDER_H
+
+#include "crush.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern struct crush_map *crush_create();
+extern void crush_finalize(struct crush_map *map);
+extern void crush_destroy(struct crush_map *map);
+
+extern int crush_add_rule(struct crush_map *map,
+ int ruleno,
+ struct crush_rule *rule);
+extern struct crush_rule *crush_make_rule();
+extern void crush_rule_add_step(struct crush_rule *rule, int op, int arg1, int arg2);
+
+extern int crush_add_bucket(struct crush_map *map,
+ struct crush_bucket *bucket);
+
+struct crush_bucket_uniform *
+crush_make_uniform_bucket(int type, int size,
+ int *items,
+ int item_weight);
+struct crush_bucket_list*
+crush_make_list_bucket(int type, int size,
+ int *items,
+ int *weights);
+struct crush_bucket_tree*
+crush_make_tree_bucket(int type, int size,
+ int *items, /* in leaf order */
+ int *weights);
+struct crush_bucket_straw *
+crush_make_straw_bucket(int type, int size,
+ int *items,
+ int *weights);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
#include "crush.h"
#include "hash.h"
+#include <string.h>
+
+
/*
* choose numrep distinct items of given type
*/
{
int rep;
int ftotal, flocal;
- int retry_rep, skip_rep;
+ int retry_descent, retry_bucket, skip_rep;
struct crush_bucket *in = bucket;
int r;
int i;
for (rep = 0; rep < numrep; rep++) {
/* keep trying until we get a non-out, non-colliding item */
ftotal = 0;
- skip_rep = 0;
-
- while (1) {
+ do {
in = bucket; /* initial bucket */
/* choose through intervening buckets */
flocal = 0;
- retry_rep = 0;
-
- while (1) {
+ do {
r = rep;
if (in->type == CRUSH_BUCKET_UNIFORM) {
/* be careful */
/* desired type? */
if (item < 0)
- itemtype = map->buckets[-item]->type;
+ itemtype = map->buckets[-1-item]->type;
else
itemtype = 0;
/* keep going? */
if (itemtype != type) {
- in = map->buckets[-item];
+ in = map->buckets[-1-item];
continue;
}
bad = 1;
}
+ skip_rep = 0;
+ retry_descent = 0;
+ retry_bucket = 0;
if (bad || collide) {
ftotal++;
flocal++;
if (collide && flocal < 3)
- continue; /* locally a few times */
- if (ftotal >= 10) {
- /* give up, ignore dup, fixme */
- skip_rep = 1;
- break;
- }
- retry_rep = 1;
+ retry_bucket = 1; /* retry locally a few times */
+ else if (ftotal < 10)
+ retry_descent = 1; /* then retry descent */
+ else
+ skip_rep = 1; /* then give up */
}
- break;
- }
-
- if (retry_rep) continue;
- }
+ } while (retry_bucket);
+ } while (retry_descent);
if (skip_rep) continue;
-
+
out[outpos] = item;
outpos++;
}
if (forcefeed >= 0)
forcefeed = map->device_parents[forcefeed];
else
- forcefeed = map->bucket_parents[-forcefeed];
+ forcefeed = map->bucket_parents[-1-forcefeed];
if (forcefeed == 0) break;
}
}
for (i = 0; i < wsize; i++) {
numrep = rule->steps[step].arg1;
-
if (force_pos >= 0) {
o[osize++] = force_stack[force_pos];
force_pos--;
numrep--;
}
- if (numrep)
- crush_choose(map,
- map->buckets[-w[i]],
- x, numrep, rule->steps[step].arg2,
- o+osize, rule->steps[step].op == CRUSH_RULE_CHOOSE_FIRSTN);
+ if (!numrep) continue;
+ osize += crush_choose(map,
+ map->buckets[-1-w[i]],
+ x, numrep, rule->steps[step].arg2,
+ o+osize, rule->steps[step].op == CRUSH_RULE_CHOOSE_FIRSTN);
}
/* swap t and w arrays */
tmp = o;
o = w;
- w = o;
+ w = tmp;
wsize = osize;
break;
return result_len;
}
+
+
+void crush_index(struct crush_map *map)
+{
+ int b, i, c;
+
+ /* allocate arrays */
+ map->device_parents = malloc(sizeof(map->device_parents[0]) * map->max_devices);
+ memset(map->device_parents, 0, sizeof(map->device_parents[0]) * map->max_devices);
+ map->bucket_parents = malloc(sizeof(map->bucket_parents[0]) * map->max_buckets);
+ memset(map->bucket_parents, 0, sizeof(map->bucket_parents[0]) * map->max_buckets);
+
+ /* build parent maps */
+ for (b=0; b<map->max_buckets; b++) {
+ if (map->buckets[b] == 0) continue;
+ for (i=0; i<map->buckets[b]->size; i++) {
+ c = map->buckets[b]->items[i];
+ if (c >= 0)
+ map->device_parents[c] = map->buckets[b]->id;
+ else
+ map->bucket_parents[-1-c] = map->buckets[b]->id;
+ }
+ }
+}
#ifndef _CRUSH_CRUSH_H
#define _CRUSH_CRUSH_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
#include "types.h"
#include "buckets.h"
__u32 max_devices;
};
+
+extern void crush_index(struct crush_map *map);
+
extern int crush_do_rule(struct crush_map *map,
int ruleno,
int x, int *result, int result_max,
int forcefeed); /* -1 for none */
+
/*extern int crush_decode(struct crush_map *map, struct ceph_bufferlist *bl);*/
+#ifdef __cplusplus
+}
+#endif
+
#endif
#include "MonitorStore.h"
+#include "crush2/CrushWrapper.h"
+
#include "messages/MOSDFailure.h"
#include "messages/MOSDMap.h"
#include "messages/MOSDGetMap.h"
}
-void OSDMonitor::build_crush_map(Crush& crush,
+void OSDMonitor::build_crush_map(CrushWrapper& crush,
map<int,double>& weights)
{
+ // new
+ crush.create();
if (g_conf.num_osd >= 12) {
int ndom = g_conf.osd_max_rep;
- UniformBucket *domain[ndom];
- int domid[ndom];
- for (int i=0; i<ndom; i++) {
- domain[i] = new UniformBucket(1, 0);
- domid[i] = crush.add_bucket(domain[i]);
- }
-
- // add osds
+ crush_bucket_uniform *domain[ndom];
+
+ int ritems[ndom];
+ int rweights[ndom];
+
int nper = ((g_conf.num_osd - 1) / ndom) + 1;
derr(0) << ndom << " failure domains, " << nper << " osds each" << dendl;
- int i = 0;
- for (int dom=0; dom<ndom; dom++) {
- for (int j=0; j<nper; j++) {
- domain[dom]->add_item(i, weights[i] ? weights[i]:1.0);
- //derr(0) << "osd" << i << " in domain " << dom << dendl;
+
+ int o = 0;
+ for (int i=0; i<ndom; i++) {
+ int items[nper];
+ //int w[nper];
+ int j;
+ rweights[i] = 0;
+ for (j=0; j<nper; j++) {
+ items[j] = o;
+ //w[j] = weights[o] ? (0x10000 - (int)(weights[o] * 0x10000)):0x10000;
i++;
+ //rweights[i] += w[j];
if (i == g_conf.num_osd) break;
}
+
+ rweights[i] = 0x10000 * j;
+ domain[i] = crush_make_uniform_bucket(1, j, items, 0x10000);
+ ritems[i] = crush_add_bucket(crush.map, (crush_bucket*)domain[i]);
}
// root
- Bucket *root = new ListBucket(2);
- for (int i=0; i<ndom; i++) {
- //derr(0) << "dom " << i << " w " << domain[i]->get_weight() << dendl;
- root->add_item(domid[i], domain[i]->get_weight());
- }
- int nroot = crush.add_bucket(root);
+ crush_bucket_list *root = crush_make_list_bucket(2, ndom, ritems, rweights);
+ int nroot = crush_add_bucket(crush.map, (crush_bucket*)root);
// rules
// replication
for (int i=1; i<=ndom; i++) {
- int r = CRUSH_REP_RULE(i);
- crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot));
- crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 1));
- crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 1, 0));
- crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+ crush_rule *rule = crush_make_rule();
+ crush_rule_add_step(rule, CRUSH_RULE_TAKE, nroot, 0);
+ crush_rule_add_step(rule, CRUSH_RULE_CHOOSE_FIRSTN, i, 1);
+ crush_rule_add_step(rule, CRUSH_RULE_CHOOSE_FIRSTN, 1, 0);
+ crush_rule_add_step(rule, CRUSH_RULE_EMIT, 0, 0);
+ crush_add_rule(crush.map, CRUSH_REP_RULE(i), rule);
}
+ /*
// raid
for (int i=g_conf.osd_min_raid_width; i <= g_conf.osd_max_raid_width; i++) {
int r = CRUSH_RAID_RULE(i);
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
}
}
+ */
// test
//vector<int> out;
} else {
// one bucket
- Bucket *b = new UniformBucket(1, 0);
- int root = crush.add_bucket(b);
- for (int i=0; i<g_conf.num_osd; i++) {
- b->add_item(i, weights[i] ? weights[i]:1.0);
- }
+
+ int items[g_conf.num_osd];
+ for (int i=0; i<g_conf.num_osd; i++)
+ items[i] = i;
+
+ crush_bucket_uniform *b = crush_make_uniform_bucket(1, g_conf.num_osd, items, 0x10000);
+ int root = crush_add_bucket(crush.map, (crush_bucket*)b);
// rules
// replication
for (int i=1; i<=g_conf.osd_max_rep; i++) {
- int r = CRUSH_REP_RULE(i);
- crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
- crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 0));
- crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+ crush_rule *rule = crush_make_rule();
+ crush_rule_add_step(rule, CRUSH_RULE_TAKE, root, 0);
+ crush_rule_add_step(rule, CRUSH_RULE_CHOOSE_FIRSTN, i, 0);
+ crush_rule_add_step(rule, CRUSH_RULE_EMIT, 0, 0);
+ crush_add_rule(crush.map, CRUSH_REP_RULE(i), rule);
}
+ /*
// raid
for (int i=g_conf.osd_min_raid_width; i <= g_conf.osd_max_raid_width; i++) {
int r = CRUSH_RAID_RULE(i);
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, i, 0));
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
}
+ */
}
+ crush.finalize();
}
if (pending_inc.new_up.size() == osdmap.get_osds().size()) {
delay = 0.0;
if (g_conf.osd_auto_weight) {
- Crush crush;
+ CrushWrapper crush;
build_crush_map(crush, osd_weight);
crush._encode(pending_inc.crush);
}
map<int,double> osd_weight;
- void build_crush_map(Crush& crush,
+ void build_crush_map(CrushWrapper& crush,
map<int,double>& weights);
// svc
#include "common/Mutex.h"
#include "common/Clock.h"
-#include "crush/crush.h"
-using namespace crush;
+#include "crush2/hash.h"
+#include "crush2/CrushWrapper.h"
#include <vector>
#include <list>
map<int32_t,entity_inst_t> osd_inst;
public:
- Crush crush; // hierarchical map
+ CrushWrapper crush; // hierarchical map
friend class OSDMonitor;
friend class MDS;
void mark_down(int o, bool clean) { down_osds[o] = clean; }
void mark_up(int o) { down_osds.erase(o); }
- void mark_out(int o) { out_osds.insert(o); }
- void mark_in(int o) { out_osds.erase(o); }
-
+ void mark_out(int o) {
+ out_osds.insert(o);
+ crush.update_offload_map(out_osds, overload_osds);
+ }
+ void mark_in(int o) {
+ out_osds.erase(o);
+ crush.update_offload_map(out_osds, overload_osds);
+ }
void apply_incremental(Incremental &inc) {
assert(inc.epoch == epoch+1);
return;
}
if (inc.crush.length()) {
- int off = 0;
- crush._decode(inc.crush, off);
+ bufferlist::iterator blp = inc.crush.begin();
+ crush._decode(blp);
}
// nope, incremental.
i++) {
overload_osds[i->first] = i->second;
}
+
+ crush.update_offload_map(out_osds, overload_osds);
}
// serialize, unserialize
::_encode(overload_osds, blist);
::_encode(osd_inst, blist);
- crush._encode(blist);
+ bufferlist cbl;
+ crush._encode(cbl);
+ ::_encode(cbl, blist);
}
void decode(bufferlist& blist) {
::_decode(overload_osds, blist, off);
::_decode(osd_inst, blist, off);
- crush._decode(blist, off);
+ bufferlist cbl;
+ ::_decode(cbl, blist, off);
+ bufferlist::iterator cblp = cbl.begin();
+ crush._decode(cblp);
+
+ crush.update_offload_map(out_osds, overload_osds);
}
}
ObjectLayout make_object_layout(object_t oid, int pg_type, int pg_size, int preferred=-1, int object_stripe_unit = 0) {
- static crush::Hash H(777);
-
int num = preferred >= 0 ? localized_pg_num:pg_num;
int num_mask = preferred >= 0 ? localized_pg_num_mask:pg_num_mask;
case CEPH_OBJECT_LAYOUT_HASHINO:
//ps = stable_mod(oid.bno + H(oid.bno+oid.ino)^H(oid.ino>>32), num, num_mask);
- ps = stable_mod(oid.bno + H(oid.ino)^H(oid.ino>>32), num, num_mask);
+ ps = stable_mod(oid.bno + crush_hash32_2(oid.ino, oid.ino>>32), num, num_mask);
break;
case CEPH_OBJECT_LAYOUT_HASH:
//ps = stable_mod(H( (oid.bno & oid.ino) ^ ((oid.bno^oid.ino) >> 32) ), num, num_mask);
//ps = stable_mod(H(oid.bno) + H(oid.ino)^H(oid.ino>>32), num, num_mask);
//ps = stable_mod(oid.bno + H(oid.bno+oid.ino)^H(oid.bno+oid.ino>>32), num, num_mask);
- ps = stable_mod(oid.bno + H(oid.ino)^H(oid.ino>>32), num, num_mask);
+ ps = stable_mod(oid.bno + crush_hash32_2(oid.ino, oid.ino>>32), num, num_mask);
break;
default:
if (pg.preferred() >= 0 &&
out_osds.count(pg.preferred()) == 0)
forcefeed = pg.preferred();
- crush.do_rule(crush.rules[rule],
+ crush.do_rule(rule,
pg.ps(),
- osds,
- out_osds, overload_osds,
+ osds, pg.size(),
forcefeed);
}
break;
case CEPH_PG_LAYOUT_HYBRID:
{
- static crush::Hash H(777);
- int h = H(pg.ps());
+ int h = crush_hash32(pg.ps());
for (int i=0; i<pg.size(); i++)
osds.push_back( (h+i) % g_conf.num_osd );
}
case CEPH_PG_LAYOUT_HASH:
{
- static crush::Hash H(777);
for (int i=0; i<pg.size(); i++) {
int t = 1;
int osd = 0;
while (t++) {
- osd = H(i, pg.ps(), t) % g_conf.num_osd;
+ osd = crush_hash32_3(i, pg.ps(), t) % g_conf.num_osd;
int j = 0;
for (; j<i; j++)
if (osds[j] == osd) break;
using namespace std;
#include <ext/hash_map>
+#include <ext/hash_set>
using namespace __gnu_cxx;
// crush rule ids
-#define CRUSH_REP_RULE(nrep) (100+nrep) // replication
-#define CRUSH_RAID_RULE(num) (200+num) // raid
+#define CRUSH_REP_RULE(nrep) (nrep) // replication
+#define CRUSH_RAID_RULE(num) (10+num) // raid