From 979eb4d8de049e7e82241898cb27d8a41fcbc125 Mon Sep 17 00:00:00 2001 From: sageweil Date: Thu, 18 Oct 2007 17:18:05 +0000 Subject: [PATCH] works git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1963 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/crush/Makefile | 27 ++- branches/sage/crush/crush2/CrushWrapper.h | 225 ++++++++++++++++++++++ branches/sage/crush/crush2/Makefile | 5 +- branches/sage/crush/crush2/buckets.c | 48 ++++- branches/sage/crush/crush2/buckets.h | 13 +- branches/sage/crush/crush2/builder.c | 178 ++++++++++++----- branches/sage/crush/crush2/builder.h | 44 +++++ branches/sage/crush/crush2/crush.c | 80 +++++--- branches/sage/crush/crush2/crush.h | 12 ++ branches/sage/crush/mon/OSDMonitor.cc | 86 +++++---- branches/sage/crush/mon/OSDMonitor.h | 2 +- branches/sage/crush/osd/OSDMap.h | 51 +++-- branches/sage/crush/osd/PG.h | 1 + branches/sage/crush/osd/osd_types.h | 4 +- 14 files changed, 621 insertions(+), 155 deletions(-) create mode 100644 branches/sage/crush/crush2/CrushWrapper.h create mode 100644 branches/sage/crush/crush2/builder.h diff --git a/branches/sage/crush/Makefile b/branches/sage/crush/Makefile index f1dc7b3d4d60a..083368c608172 100644 --- a/branches/sage/crush/Makefile +++ b/branches/sage/crush/Makefile @@ -16,7 +16,7 @@ EXTRA_CFLAGS = #-I${HOME}/include -L${HOME}/lib EXTRA_CFLAGS += -g EXTRA_CFLAGS += -pg -EXTRA_CFLAGS += -O3 +#EXTRA_CFLAGS += -O3 # base CFLAGS = -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE ${EXTRA_CFLAGS} @@ -145,22 +145,22 @@ mkmonmap: mkmonmap.cc common.o extractosdmaps: extractosdmaps.cc common.o osd.o mon.o ebofs.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ -cmon: cmon.o mon.o msg/SimpleMessenger.o common.o +cmon: cmon.o mon.o msg/SimpleMessenger.o common.o crush2/libcrush.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ cmonctl: cmonctl.cc msg/SimpleMessenger.o common.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ -cosd: cosd.o osd.o ebofs.o msg/SimpleMessenger.o common.o +cosd: cosd.o osd.o ebofs.o msg/SimpleMessenger.o common.o crush2/libcrush.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ -cmds: cmds.o mds.o osdc.o msg/SimpleMessenger.o common.o +cmds: cmds.o mds.o osdc.o msg/SimpleMessenger.o common.o crush2/libcrush.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ -csyn: csyn.o client.o osdc.o msg/SimpleMessenger.o common.o +csyn: csyn.o client.o osdc.o msg/SimpleMessenger.o common.o crush2/libcrush.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ -cfuse: cfuse.o client.o osdc.o client/fuse.o client/fuse_ll.o msg/SimpleMessenger.o common.o +cfuse: cfuse.o client.o osdc.o client/fuse.o client/fuse_ll.o msg/SimpleMessenger.o common.o crush2/libcrush.o ${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@ @@ -191,15 +191,15 @@ ipc_testclient: ceph_ipc/ipc_testclient.cc ceph_ipc/ipc_client.o # fake* -fakefuse: fakefuse.o mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o client/fuse_ll.o msg/FakeMessenger.o common.o +fakefuse: fakefuse.o mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o client/fuse_ll.o msg/FakeMessenger.o common.o crush2/libcrush.o ${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@ -fakesyn: fakesyn.o mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o +fakesyn: fakesyn.o mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o crush2/libcrush.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ # mpi startup -newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/SimpleMessenger.o common.o +newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/SimpleMessenger.o common.o crush2/libcrush.o ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ @@ -243,6 +243,14 @@ gprof-helper.so: test/gprof-helper.c test_disk_bw: test/test_disk_bw.cc common.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ +# crush + +crush2/libcrush.o: force_look + cd crush2 ; make + +force_look: + true + # bits common.o: ${COMMON_OBJS} ${LDINC} $@ $^ @@ -268,7 +276,6 @@ mon.o: ${MON_OBJS} osbdb.o: ${OSBDB_OBJS} ${LDINC} $@ $^ - # generic rules %.so: %.cc ${CC} -shared -fPIC ${CFLAGS} $< -o $@ diff --git a/branches/sage/crush/crush2/CrushWrapper.h b/branches/sage/crush/crush2/CrushWrapper.h new file mode 100644 index 0000000000000..e398deb4fea89 --- /dev/null +++ b/branches/sage/crush/crush2/CrushWrapper.h @@ -0,0 +1,225 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef __CRUSH_WRAPPER_H +#define __CRUSH_WRAPPER_H + +#include "crush.h" +#include "builder.h" + +#include "include/encodable.h" + +#include +#include + +class CrushWrapper { +public: + struct crush_map *map; + + CrushWrapper() : map(0) {} + ~CrushWrapper() { + if (map) crush_destroy(map); + } + + void create() { + if (map) crush_destroy(map); + map = crush_create(); + } + void finalize() { + assert(map); + crush_finalize(map); + } + + void update_offload_map(std::set& out_osds, + std::map& overload_osds) { + for (unsigned i=0; imax_devices; i++) { + if (out_osds.count(i)) + map->device_offload[i] = 0x10000; + else if (overload_osds.count(i)) + map->device_offload[i] = (int)(0x10000 * overload_osds[i]); // FIXME: reverse? + else + map->device_offload[i] = 0; // normal. + } + } + + void do_rule(int rule, int x, vector& out, int maxout, int forcefeed) { + int rawout[maxout]; + + int numrep = crush_do_rule(map, rule, x, rawout, maxout, forcefeed); + + out.resize(numrep); + for (int i=0; imax_buckets, bl); + ::_encode_simple(map->max_rules, bl); + ::_encode_simple(map->max_devices, bl); + + // simple arrays + bl.append((char*)map->device_offload, sizeof(map->device_offload[0]) * map->max_devices); + + // buckets + for (unsigned i=0; imax_buckets; i++) { + __u32 type = 0; + if (map->buckets[i]) type = map->buckets[i]->bucket_type; + ::_encode_simple(type, bl); + if (!type) continue; + + ::_encode_simple(map->buckets[i]->id, bl); + ::_encode_simple(map->buckets[i]->type, bl); + ::_encode_simple(map->buckets[i]->bucket_type, bl); + ::_encode_simple(map->buckets[i]->weight, bl); + ::_encode_simple(map->buckets[i]->size, bl); + for (unsigned j=0; jbuckets[i]->size; j++) + ::_encode_simple(map->buckets[i]->items[j], bl); + + switch (map->buckets[i]->type) { + case CRUSH_BUCKET_UNIFORM: + for (unsigned j=0; jbuckets[i]->size; j++) + ::_encode_simple(((crush_bucket_uniform*)map->buckets[i])->primes[j], bl); + ::_encode_simple(((crush_bucket_uniform*)map->buckets[i])->item_weight, bl); + break; + + case CRUSH_BUCKET_LIST: + for (unsigned j=0; jbuckets[i]->size; j++) { + ::_encode_simple(((crush_bucket_list*)map->buckets[i])->item_weights[j], bl); + ::_encode_simple(((crush_bucket_list*)map->buckets[i])->sum_weights[j], bl); + } + break; + + case CRUSH_BUCKET_TREE: + for (unsigned j=0; jbuckets[i]->size; j++) + ::_encode_simple(((crush_bucket_tree*)map->buckets[i])->node_weights[j], bl); + break; + + case CRUSH_BUCKET_STRAW: + for (unsigned j=0; jbuckets[i]->size; j++) + ::_encode_simple(((crush_bucket_straw*)map->buckets[i])->straws[j], bl); + break; + } + } + + // rules + for (unsigned i=0; imax_rules; i++) { + __u32 yes = map->rules[i] ? 1:0; + ::_encode_simple(yes, bl); + if (!yes) continue; + + ::_encode_simple(map->rules[i]->len, bl); + for (unsigned j=0; jrules[i]->len; j++) + ::_encode_simple(map->rules[i]->steps[j], bl); + } + } + + void _decode(bufferlist::iterator &blp) { + create(); + ::_decode_simple(map->max_buckets, blp); + ::_decode_simple(map->max_rules, blp); + ::_decode_simple(map->max_devices, blp); + + map->device_offload = (__u32*)malloc(sizeof(map->device_offload[0])*map->max_devices); + blp.copy(sizeof(map->device_offload[0])*map->max_devices, (char*)map->device_offload); + + // buckets + map->buckets = (crush_bucket**)malloc(sizeof(crush_bucket*)*map->max_buckets); + for (unsigned i=0; imax_buckets; i++) { + __u32 type; + ::_decode_simple(type, blp); + if (!type) { + map->buckets[i] = 0; + continue; + } + + int size = 0; + switch (type) { + case CRUSH_BUCKET_UNIFORM: + size = sizeof(crush_bucket_uniform); + break; + case CRUSH_BUCKET_LIST: + size = sizeof(crush_bucket_list); + break; + case CRUSH_BUCKET_TREE: + size = sizeof(crush_bucket_tree); + break; + case CRUSH_BUCKET_STRAW: + size = sizeof(crush_bucket_straw); + break; + default: + assert(0); + } + map->buckets[i] = (crush_bucket*)malloc(size); + memset(map->buckets[i], 0, size); + + ::_decode_simple(map->buckets[i]->id, blp); + ::_decode_simple(map->buckets[i]->type, blp); + ::_decode_simple(map->buckets[i]->bucket_type, blp); + ::_decode_simple(map->buckets[i]->weight, blp); + ::_decode_simple(map->buckets[i]->size, blp); + + map->buckets[i]->items = (__s32*)malloc(sizeof(__s32)*map->buckets[i]->size); + for (unsigned j=0; jbuckets[i]->size; j++) + ::_decode_simple(map->buckets[i]->items[j], blp); + + switch (map->buckets[i]->type) { + case CRUSH_BUCKET_UNIFORM: + ((crush_bucket_uniform*)map->buckets[i])->primes = + (__u32*)malloc(map->buckets[i]->size * sizeof(__u32)); + for (unsigned j=0; jbuckets[i]->size; j++) + ::_decode_simple(((crush_bucket_uniform*)map->buckets[i])->primes[j], blp); + ::_decode_simple(((crush_bucket_uniform*)map->buckets[i])->item_weight, blp); + break; + + case CRUSH_BUCKET_LIST: + ((crush_bucket_list*)map->buckets[i])->item_weights = + (__u32*)malloc(map->buckets[i]->size * sizeof(__u32)); + ((crush_bucket_list*)map->buckets[i])->sum_weights = + (__u32*)malloc(map->buckets[i]->size * sizeof(__u32)); + + for (unsigned j=0; jbuckets[i]->size; j++) { + ::_decode_simple(((crush_bucket_list*)map->buckets[i])->item_weights[j], blp); + ::_decode_simple(((crush_bucket_list*)map->buckets[i])->sum_weights[j], blp); + } + break; + + case CRUSH_BUCKET_TREE: + ((crush_bucket_tree*)map->buckets[i])->node_weights = + (__u32*)malloc(map->buckets[i]->size * sizeof(__u32)); + for (unsigned j=0; jbuckets[i]->size; j++) + ::_decode_simple(((crush_bucket_tree*)map->buckets[i])->node_weights[j], blp); + break; + + case CRUSH_BUCKET_STRAW: + ((crush_bucket_straw*)map->buckets[i])->straws = + (__u32*)malloc(map->buckets[i]->size * sizeof(__u32)); + for (unsigned j=0; jbuckets[i]->size; j++) + ::_decode_simple(((crush_bucket_straw*)map->buckets[i])->straws[j], blp); + break; + } + } + + // rules + map->rules = (crush_rule**)malloc(sizeof(crush_rule*)*map->max_rules); + for (unsigned i=0; imax_rules; i++) { + __u32 yes; + ::_decode_simple(yes, blp); + if (!yes) { + map->rules[i] = 0; + continue; + } + + map->rules[i] = (crush_rule*)malloc(sizeof(crush_rule)); + memset(map->rules[i], 0, sizeof(crush_rule)); + + ::_decode_simple(map->rules[i]->len, blp); + map->rules[i]->steps = (crush_rule_step*)malloc(sizeof(crush_rule_step) * map->rules[i]->len); + for (unsigned j=0; jrules[i]->len; j++) + ::_decode_simple(map->rules[i]->steps[j], blp); + } + + finalize(); + } +}; + +#endif diff --git a/branches/sage/crush/crush2/Makefile b/branches/sage/crush/crush2/Makefile index 6a9bae06f0ee5..baba6c2e44622 100644 --- a/branches/sage/crush/crush2/Makefile +++ b/branches/sage/crush/crush2/Makefile @@ -1,7 +1,8 @@ CC = gcc CFLAGS = -Wall -CFLAGS += -O3 -g +CFLAGS += -g +#CFLAGS += -O3 LD = ld RM = rm @@ -13,7 +14,7 @@ clean: %.o: %.c ${CC} ${CFLAGS} -c $< -o $@ -libcrush.o: crush.o buckets.o +libcrush.o: crush.o buckets.o builder.o $(LD) -i -o $@ $^ .depend: diff --git a/branches/sage/crush/crush2/buckets.c b/branches/sage/crush/crush2/buckets.c index 557dc68d00778..40d6c1eda8bb5 100644 --- a/branches/sage/crush/crush2/buckets.c +++ b/branches/sage/crush/crush2/buckets.c @@ -1,6 +1,6 @@ +#include "crush.h" #include "hash.h" -#include "buckets.h" int crush_bucket_uniform_choose(struct crush_bucket_uniform *bucket, int x, int r) @@ -29,10 +29,52 @@ crush_bucket_list_choose(struct crush_bucket_list *bucket, int x, int r) return 0; } + +static int height(int n) { + int h = 0; + while ((n & 1) == 0) { + h++; + n = n >> 1; + } + return h; +} +static int left(int x) { + int h = height(x); + return x - (1 << (h-1)); +} +static int right(int x) { + int h = height(x); + return x + (1 << (h-1)); +} +static int terminal(int x) { + return x & 1; +} + int crush_bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r) { - return 0; + int n, l; + __u32 w; + __u64 t; + + /* start at root */ + n = bucket->h.size >> 1; + + while (!terminal(n)) { + /* pick point in [0, w) */ + w = bucket->node_weights[n]; + t = (__u64)crush_hash32_4(x, n, r, bucket->h.id) * (__u64)w; + t = t >> 32; + + /* left or right? */ + l = left(n); + if (t < bucket->node_weights[l]) + n = l; + else + n = right(n); + } + + return bucket->h.items[n]; } int @@ -72,7 +114,7 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b) void crush_destroy_bucket_tree(struct crush_bucket_tree *b) { - free(b->h.items); + free(b->node_weights); } void crush_destroy_bucket_straw(struct crush_bucket_straw *b) diff --git a/branches/sage/crush/crush2/buckets.h b/branches/sage/crush/crush2/buckets.h index aab536b28b2f3..e8b5b6968e8a8 100644 --- a/branches/sage/crush/crush2/buckets.h +++ b/branches/sage/crush/crush2/buckets.h @@ -1,8 +1,6 @@ #ifndef _CRUSH_BUCKETS_H #define _CRUSH_BUCKETS_H -#include "types.h" - enum { CRUSH_BUCKET_UNIFORM = 1, CRUSH_BUCKET_LIST = 2, @@ -11,8 +9,9 @@ enum { }; struct crush_bucket { - __u32 id; - __u32 type; + __s32 id; /* this'll be negative */ + __u16 type; + __u16 bucket_type; __u32 weight; /* 16-bit fixed point */ __u32 size; /* num items */ __s32 *items; @@ -20,8 +19,8 @@ struct crush_bucket { struct crush_bucket_uniform { struct crush_bucket h; - __u32 item_weight; /* 16-bit fixed point */ __u32 *primes; + __u32 item_weight; /* 16-bit fixed point */ }; struct crush_bucket_list { @@ -31,8 +30,8 @@ struct crush_bucket_list { }; struct crush_bucket_tree { - struct crush_bucket h; - + struct crush_bucket h; /* note: h.size is tree size, not number of actual items */ + __u32 *node_weights; }; struct crush_bucket_straw { diff --git a/branches/sage/crush/crush2/builder.c b/branches/sage/crush/crush2/builder.c index 803eb9d1561a0..fe4b04492a356 100644 --- a/branches/sage/crush/crush2/builder.c +++ b/branches/sage/crush/crush2/builder.c @@ -6,12 +6,11 @@ #include "builder.h" #include "hash.h" - -struct crush_map *crush_new() +struct crush_map *crush_create() { struct crush_map *m; m = malloc(sizeof(*m)); - memset(&m, 0, sizeof(m)); + memset(m, 0, sizeof(*m)); return m; } @@ -26,31 +25,38 @@ void crush_finalize(struct crush_map *map) for (b=0; bmax_buckets; b++) { if (map->buckets[b] == 0) continue; for (i=0; ibuckets[b]->size; i++) - if (map->buckets[b]->items[i] > map->max_devices) - map->max_devices = map->buckets[b]->items[i]; + if (map->buckets[b]->items[i] >= map->max_devices) + map->max_devices = map->buckets[b]->items[i] + 1; } /* allocate arrays */ map->device_parents = malloc(sizeof(map->device_parents[0]) * map->max_devices); - map->device_offload = malloc(sizeof(map->device_offload[0]) * map->max_devices); memset(map->device_parents, 0, sizeof(map->device_parents[0]) * map->max_devices); - memset(map->device_offload, 0, sizeof(map->device_offload[0]) * map->max_devices); map->bucket_parents = malloc(sizeof(map->bucket_parents[0]) * map->max_buckets); memset(map->bucket_parents, 0, sizeof(map->bucket_parents[0]) * map->max_buckets); - /* build reverse map */ + /* build parent maps */ for (b=0; bmax_buckets; b++) { if (map->buckets[b] == 0) continue; for (i=0; ibuckets[b]->size; i++) { c = map->buckets[b]->items[i]; + BUG_ON(c >= map->max_devices); if (c >= 0) map->device_parents[c] = map->buckets[b]->id; else - map->bucket_parents[-c] = map->buckets[b]->id; + map->bucket_parents[-1-c] = map->buckets[b]->id; } } + + /* new device offload map? */ + if (!map->device_offload) { + map->device_offload = malloc(sizeof(map->device_offload[0]) * map->max_devices); + memset(map->device_offload, 0, sizeof(map->device_offload[0]) * map->max_devices); + } } + + /* * deallocate */ @@ -90,8 +96,7 @@ void crush_destroy(struct crush_map *map) free(map->bucket_parents); free(map->device_parents); free(map->device_offload); - - memset(map, 0, sizeof(*map)); + free(map); } @@ -100,25 +105,26 @@ void crush_destroy(struct crush_map *map) /** rules **/ int crush_add_rule(struct crush_map *map, + int ruleno, struct crush_rule *rule) { - int id; - - /* find a rule id */ - for (id=0; id < map->max_rules; id++) - if (map->rules[id] == 0) break; - if (id == map->max_rules) { + int oldsize; + + if (ruleno < 0) { + for (ruleno=0; ruleno < map->max_rules; ruleno++) + if (map->rules[ruleno] == 0) break; + } + if (ruleno >= map->max_rules) { /* expand array */ - if (map->max_rules) - map->max_rules *= 2; - else - map->max_rules = 8; + oldsize = map->max_rules; + map->max_rules = ruleno+1; map->rules = realloc(map->rules, map->max_rules * sizeof(map->rules[0])); + memset(map->rules + oldsize, 0, (map->max_rules-oldsize) * sizeof(map->rules[0])); } /* add it */ - map->rules[id] = rule; - return id; + map->rules[ruleno] = rule; + return ruleno; } struct crush_rule *crush_make_rule() @@ -126,7 +132,7 @@ struct crush_rule *crush_make_rule() struct crush_rule *rule; rule = malloc(sizeof(struct crush_rule)); - memset(&rule, 0, sizeof(rule)); + memset(rule, 0, sizeof(*rule)); return rule; } @@ -134,9 +140,9 @@ void crush_rule_add_step(struct crush_rule *rule, int op, int arg1, int arg2) { rule->len++; if (rule->steps) - rule->steps = malloc(sizeof(rule->steps[0])*rule->len); - else rule->steps = realloc(rule->steps, sizeof(rule->steps[0])*rule->len); + else + rule->steps = malloc(sizeof(rule->steps[0])*rule->len); rule->steps[rule->len-1].op = op; rule->steps[rule->len-1].arg1 = arg1; rule->steps[rule->len-1].arg2 = arg2; @@ -149,35 +155,41 @@ int crush_add_bucket(struct crush_map *map, struct crush_bucket *bucket) { int id; + int oldsize; /* find a bucket id */ for (id=0; id < map->max_buckets; id++) if (map->buckets[id] == 0) break; if (id == map->max_buckets) { /* expand array */ + oldsize = map->max_buckets; if (map->max_buckets) map->max_buckets *= 2; else map->max_buckets = 8; map->buckets = realloc(map->buckets, map->max_buckets * sizeof(map->buckets[0])); + memset(map->buckets + oldsize, 0, (map->max_buckets-oldsize) * sizeof(map->buckets[0])); } /* add it */ - bucket->id = id; + bucket->id = -1 - id; map->buckets[id] = bucket; - return id; + return -1 - id; } -void crush_make_uniform_bucket(struct crush_map *map, - struct crush_bucket_uniform *bucket, - int size, - int *items, - int item_weight) +struct crush_bucket_uniform * +crush_make_uniform_bucket(int type, int size, + int *items, + int item_weight) { int i, j, x; + struct crush_bucket_uniform *bucket; + bucket = malloc(sizeof(*bucket)); memset(bucket, 0, sizeof(*bucket)); + bucket->h.bucket_type = CRUSH_BUCKET_UNIFORM; + bucket->h.type = type; bucket->h.size = size; bucket->h.weight = size * item_weight; @@ -202,18 +214,23 @@ void crush_make_uniform_bucket(struct crush_map *map, bucket->primes[i++] = x; x += 2; } + + return bucket; } -void crush_make_list_bucket(struct crush_map *map, - struct crush_bucket_list *bucket, - int size, - int *items, - int *weights) +struct crush_bucket_list* +crush_make_list_bucket(int type, int size, + int *items, + int *weights) { int i; int w; - + struct crush_bucket_list *bucket; + + bucket = malloc(sizeof(*bucket)); memset(bucket, 0, sizeof(*bucket)); + bucket->h.bucket_type = CRUSH_BUCKET_LIST; + bucket->h.type = type; bucket->h.size = size; bucket->h.items = malloc(sizeof(__u32)*size); @@ -228,22 +245,94 @@ void crush_make_list_bucket(struct crush_map *map, } bucket->h.weight = w; + + return bucket; +} + +static int height(int n) { + int h = 0; + while ((n & 1) == 0) { + h++; + n = n >> 1; + } + return h; +} +static int on_right(int n, int h) { + return n & (1 << (h+1)); +} +static int parent(int n) +{ + int h = height(n); + if (on_right(n, h)) + return n - (1<h.bucket_type = CRUSH_BUCKET_TREE; + bucket->h.type = type; + bucket->h.size = size; + + /* calc tree depth */ + depth = 1; + t = size - 1; + while (t) { + t = t >> 1; + depth++; + } + bucket->h.size = 1 << depth; + + bucket->h.items = malloc(sizeof(__u32)*bucket->h.size); + bucket->node_weights = malloc(sizeof(__u32)*bucket->h.size); + + memset(bucket->h.items, 0, sizeof(__u32)*bucket->h.size); + memset(bucket->node_weights, 0, sizeof(__u32)*bucket->h.size); + + for (i=0; ih.items[i] = items[i]; + bucket->node_weights[node] = weights[i]; + bucket->h.weight += weights[i]; + for (j=1; jnode_weights[node] += weights[i]; + } + } + BUG_ON(bucket->node_weights[bucket->h.size/2] != bucket->h.weight); + + return bucket; } -void crush_make_straw_bucket(struct crush_map *map, - struct crush_bucket_straw *bucket, - int size, - int *items, - int *weights) +struct crush_bucket_straw * +crush_make_straw_bucket(int type, + int size, + int *items, + int *weights) { + struct crush_bucket_straw *bucket; int *reverse; int i, j, k; double straw, wbelow, lastw, wnext, pbelow; int numleft; + bucket = malloc(sizeof(*bucket)); memset(bucket, 0, sizeof(*bucket)); + bucket->h.bucket_type = CRUSH_BUCKET_STRAW; + bucket->h.type = type; bucket->h.size = size; bucket->h.items = malloc(sizeof(__u32)*size); @@ -300,5 +389,6 @@ void crush_make_straw_bucket(struct crush_map *map, free(reverse); + return bucket; } diff --git a/branches/sage/crush/crush2/builder.h b/branches/sage/crush/crush2/builder.h new file mode 100644 index 0000000000000..80938165dd2c1 --- /dev/null +++ b/branches/sage/crush/crush2/builder.h @@ -0,0 +1,44 @@ +#ifndef _CRUSH_BUILDER_H +#define _CRUSH_BUILDER_H + +#include "crush.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern struct crush_map *crush_create(); +extern void crush_finalize(struct crush_map *map); +extern void crush_destroy(struct crush_map *map); + +extern int crush_add_rule(struct crush_map *map, + int ruleno, + struct crush_rule *rule); +extern struct crush_rule *crush_make_rule(); +extern void crush_rule_add_step(struct crush_rule *rule, int op, int arg1, int arg2); + +extern int crush_add_bucket(struct crush_map *map, + struct crush_bucket *bucket); + +struct crush_bucket_uniform * +crush_make_uniform_bucket(int type, int size, + int *items, + int item_weight); +struct crush_bucket_list* +crush_make_list_bucket(int type, int size, + int *items, + int *weights); +struct crush_bucket_tree* +crush_make_tree_bucket(int type, int size, + int *items, /* in leaf order */ + int *weights); +struct crush_bucket_straw * +crush_make_straw_bucket(int type, int size, + int *items, + int *weights); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/branches/sage/crush/crush2/crush.c b/branches/sage/crush/crush2/crush.c index 1c06ca7644399..b91ed299cc872 100644 --- a/branches/sage/crush/crush2/crush.c +++ b/branches/sage/crush/crush2/crush.c @@ -2,6 +2,9 @@ #include "crush.h" #include "hash.h" +#include + + /* * choose numrep distinct items of given type */ @@ -12,7 +15,7 @@ static int crush_choose(struct crush_map *map, { int rep; int ftotal, flocal; - int retry_rep, skip_rep; + int retry_descent, retry_bucket, skip_rep; struct crush_bucket *in = bucket; int r; int i; @@ -26,16 +29,12 @@ static int crush_choose(struct crush_map *map, for (rep = 0; rep < numrep; rep++) { /* keep trying until we get a non-out, non-colliding item */ ftotal = 0; - skip_rep = 0; - - while (1) { + do { in = bucket; /* initial bucket */ /* choose through intervening buckets */ flocal = 0; - retry_rep = 0; - - while (1) { + do { r = rep; if (in->type == CRUSH_BUCKET_UNIFORM) { /* be careful */ @@ -75,13 +74,13 @@ static int crush_choose(struct crush_map *map, /* desired type? */ if (item < 0) - itemtype = map->buckets[-item]->type; + itemtype = map->buckets[-1-item]->type; else itemtype = 0; /* keep going? */ if (itemtype != type) { - in = map->buckets[-item]; + in = map->buckets[-1-item]; continue; } @@ -103,27 +102,25 @@ static int crush_choose(struct crush_map *map, bad = 1; } + skip_rep = 0; + retry_descent = 0; + retry_bucket = 0; if (bad || collide) { ftotal++; flocal++; if (collide && flocal < 3) - continue; /* locally a few times */ - if (ftotal >= 10) { - /* give up, ignore dup, fixme */ - skip_rep = 1; - break; - } - retry_rep = 1; + retry_bucket = 1; /* retry locally a few times */ + else if (ftotal < 10) + retry_descent = 1; /* then retry descent */ + else + skip_rep = 1; /* then give up */ } - break; - } - - if (retry_rep) continue; - } + } while (retry_bucket); + } while (retry_descent); if (skip_rep) continue; - + out[outpos] = item; outpos++; } @@ -164,7 +161,7 @@ int crush_do_rule(struct crush_map *map, if (forcefeed >= 0) forcefeed = map->device_parents[forcefeed]; else - forcefeed = map->bucket_parents[-forcefeed]; + forcefeed = map->bucket_parents[-1-forcefeed]; if (forcefeed == 0) break; } } @@ -191,23 +188,22 @@ int crush_do_rule(struct crush_map *map, for (i = 0; i < wsize; i++) { numrep = rule->steps[step].arg1; - if (force_pos >= 0) { o[osize++] = force_stack[force_pos]; force_pos--; numrep--; } - if (numrep) - crush_choose(map, - map->buckets[-w[i]], - x, numrep, rule->steps[step].arg2, - o+osize, rule->steps[step].op == CRUSH_RULE_CHOOSE_FIRSTN); + if (!numrep) continue; + osize += crush_choose(map, + map->buckets[-1-w[i]], + x, numrep, rule->steps[step].arg2, + o+osize, rule->steps[step].op == CRUSH_RULE_CHOOSE_FIRSTN); } /* swap t and w arrays */ tmp = o; o = w; - w = o; + w = tmp; wsize = osize; break; @@ -229,3 +225,27 @@ int crush_do_rule(struct crush_map *map, return result_len; } + + +void crush_index(struct crush_map *map) +{ + int b, i, c; + + /* allocate arrays */ + map->device_parents = malloc(sizeof(map->device_parents[0]) * map->max_devices); + memset(map->device_parents, 0, sizeof(map->device_parents[0]) * map->max_devices); + map->bucket_parents = malloc(sizeof(map->bucket_parents[0]) * map->max_buckets); + memset(map->bucket_parents, 0, sizeof(map->bucket_parents[0]) * map->max_buckets); + + /* build parent maps */ + for (b=0; bmax_buckets; b++) { + if (map->buckets[b] == 0) continue; + for (i=0; ibuckets[b]->size; i++) { + c = map->buckets[b]->items[i]; + if (c >= 0) + map->device_parents[c] = map->buckets[b]->id; + else + map->bucket_parents[-1-c] = map->buckets[b]->id; + } + } +} diff --git a/branches/sage/crush/crush2/crush.h b/branches/sage/crush/crush2/crush.h index 9dd5a003bd591..6e1910c5b643a 100644 --- a/branches/sage/crush/crush2/crush.h +++ b/branches/sage/crush/crush2/crush.h @@ -1,6 +1,10 @@ #ifndef _CRUSH_CRUSH_H #define _CRUSH_CRUSH_H +#ifdef __cplusplus +extern "C" { +#endif + #include "types.h" #include "buckets.h" @@ -45,11 +49,19 @@ struct crush_map { __u32 max_devices; }; + +extern void crush_index(struct crush_map *map); + extern int crush_do_rule(struct crush_map *map, int ruleno, int x, int *result, int result_max, int forcefeed); /* -1 for none */ + /*extern int crush_decode(struct crush_map *map, struct ceph_bufferlist *bl);*/ +#ifdef __cplusplus +} +#endif + #endif diff --git a/branches/sage/crush/mon/OSDMonitor.cc b/branches/sage/crush/mon/OSDMonitor.cc index 200187510f698..6d77ddcce9af7 100644 --- a/branches/sage/crush/mon/OSDMonitor.cc +++ b/branches/sage/crush/mon/OSDMonitor.cc @@ -18,6 +18,8 @@ #include "MonitorStore.h" +#include "crush2/CrushWrapper.h" + #include "messages/MOSDFailure.h" #include "messages/MOSDMap.h" #include "messages/MOSDGetMap.h" @@ -158,49 +160,56 @@ void OSDMonitor::create_initial() } -void OSDMonitor::build_crush_map(Crush& crush, +void OSDMonitor::build_crush_map(CrushWrapper& crush, map& weights) { + // new + crush.create(); if (g_conf.num_osd >= 12) { int ndom = g_conf.osd_max_rep; - UniformBucket *domain[ndom]; - int domid[ndom]; - for (int i=0; iadd_item(i, weights[i] ? weights[i]:1.0); - //derr(0) << "osd" << i << " in domain " << dom << dendl; + + int o = 0; + for (int i=0; iget_weight() << dendl; - root->add_item(domid[i], domain[i]->get_weight()); - } - int nroot = crush.add_bucket(root); + crush_bucket_list *root = crush_make_list_bucket(2, ndom, ritems, rweights); + int nroot = crush_add_bucket(crush.map, (crush_bucket*)root); // rules // replication for (int i=1; i<=ndom; i++) { - int r = CRUSH_REP_RULE(i); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 1)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 1, 0)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + crush_rule *rule = crush_make_rule(); + crush_rule_add_step(rule, CRUSH_RULE_TAKE, nroot, 0); + crush_rule_add_step(rule, CRUSH_RULE_CHOOSE_FIRSTN, i, 1); + crush_rule_add_step(rule, CRUSH_RULE_CHOOSE_FIRSTN, 1, 0); + crush_rule_add_step(rule, CRUSH_RULE_EMIT, 0, 0); + crush_add_rule(crush.map, CRUSH_REP_RULE(i), rule); } + /* // raid for (int i=g_conf.osd_min_raid_width; i <= g_conf.osd_max_raid_width; i++) { int r = CRUSH_RAID_RULE(i); @@ -215,6 +224,7 @@ void OSDMonitor::build_crush_map(Crush& crush, crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); } } + */ // test //vector out; @@ -222,20 +232,24 @@ void OSDMonitor::build_crush_map(Crush& crush, } else { // one bucket - Bucket *b = new UniformBucket(1, 0); - int root = crush.add_bucket(b); - for (int i=0; iadd_item(i, weights[i] ? weights[i]:1.0); - } + + int items[g_conf.num_osd]; + for (int i=0; i osd_weight; - void build_crush_map(Crush& crush, + void build_crush_map(CrushWrapper& crush, map& weights); // svc diff --git a/branches/sage/crush/osd/OSDMap.h b/branches/sage/crush/osd/OSDMap.h index fda57d73ef99e..2a34d21bbb27f 100644 --- a/branches/sage/crush/osd/OSDMap.h +++ b/branches/sage/crush/osd/OSDMap.h @@ -28,8 +28,8 @@ #include "common/Mutex.h" #include "common/Clock.h" -#include "crush/crush.h" -using namespace crush; +#include "crush2/hash.h" +#include "crush2/CrushWrapper.h" #include #include @@ -144,7 +144,7 @@ private: map osd_inst; public: - Crush crush; // hierarchical map + CrushWrapper crush; // hierarchical map friend class OSDMonitor; friend class MDS; @@ -207,9 +207,14 @@ private: void mark_down(int o, bool clean) { down_osds[o] = clean; } void mark_up(int o) { down_osds.erase(o); } - void mark_out(int o) { out_osds.insert(o); } - void mark_in(int o) { out_osds.erase(o); } - + void mark_out(int o) { + out_osds.insert(o); + crush.update_offload_map(out_osds, overload_osds); + } + void mark_in(int o) { + out_osds.erase(o); + crush.update_offload_map(out_osds, overload_osds); + } void apply_incremental(Incremental &inc) { assert(inc.epoch == epoch+1); @@ -223,8 +228,8 @@ private: return; } if (inc.crush.length()) { - int off = 0; - crush._decode(inc.crush, off); + bufferlist::iterator blp = inc.crush.begin(); + crush._decode(blp); } // nope, incremental. @@ -272,6 +277,8 @@ private: i++) { overload_osds[i->first] = i->second; } + + crush.update_offload_map(out_osds, overload_osds); } // serialize, unserialize @@ -288,7 +295,9 @@ private: ::_encode(overload_osds, blist); ::_encode(osd_inst, blist); - crush._encode(blist); + bufferlist cbl; + crush._encode(cbl); + ::_encode(cbl, blist); } void decode(bufferlist& blist) { @@ -306,7 +315,12 @@ private: ::_decode(overload_osds, blist, off); ::_decode(osd_inst, blist, off); - crush._decode(blist, off); + bufferlist cbl; + ::_decode(cbl, blist, off); + bufferlist::iterator cblp = cbl.begin(); + crush._decode(cblp); + + crush.update_offload_map(out_osds, overload_osds); } @@ -320,8 +334,6 @@ private: } ObjectLayout make_object_layout(object_t oid, int pg_type, int pg_size, int preferred=-1, int object_stripe_unit = 0) { - static crush::Hash H(777); - int num = preferred >= 0 ? localized_pg_num:pg_num; int num_mask = preferred >= 0 ? localized_pg_num_mask:pg_num_mask; @@ -334,14 +346,14 @@ private: case CEPH_OBJECT_LAYOUT_HASHINO: //ps = stable_mod(oid.bno + H(oid.bno+oid.ino)^H(oid.ino>>32), num, num_mask); - ps = stable_mod(oid.bno + H(oid.ino)^H(oid.ino>>32), num, num_mask); + ps = stable_mod(oid.bno + crush_hash32_2(oid.ino, oid.ino>>32), num, num_mask); break; case CEPH_OBJECT_LAYOUT_HASH: //ps = stable_mod(H( (oid.bno & oid.ino) ^ ((oid.bno^oid.ino) >> 32) ), num, num_mask); //ps = stable_mod(H(oid.bno) + H(oid.ino)^H(oid.ino>>32), num, num_mask); //ps = stable_mod(oid.bno + H(oid.bno+oid.ino)^H(oid.bno+oid.ino>>32), num, num_mask); - ps = stable_mod(oid.bno + H(oid.ino)^H(oid.ino>>32), num, num_mask); + ps = stable_mod(oid.bno + crush_hash32_2(oid.ino, oid.ino>>32), num, num_mask); break; default: @@ -374,10 +386,9 @@ private: if (pg.preferred() >= 0 && out_osds.count(pg.preferred()) == 0) forcefeed = pg.preferred(); - crush.do_rule(crush.rules[rule], + crush.do_rule(rule, pg.ps(), - osds, - out_osds, overload_osds, + osds, pg.size(), forcefeed); } break; @@ -389,8 +400,7 @@ private: case CEPH_PG_LAYOUT_HYBRID: { - static crush::Hash H(777); - int h = H(pg.ps()); + int h = crush_hash32(pg.ps()); for (int i=0; i +#include using namespace __gnu_cxx; diff --git a/branches/sage/crush/osd/osd_types.h b/branches/sage/crush/osd/osd_types.h index 24dd9eca74234..0ae9d0831b0d7 100644 --- a/branches/sage/crush/osd/osd_types.h +++ b/branches/sage/crush/osd/osd_types.h @@ -74,8 +74,8 @@ typedef uint8_t pruleset_t; // crush rule ids -#define CRUSH_REP_RULE(nrep) (100+nrep) // replication -#define CRUSH_RAID_RULE(num) (200+num) // raid +#define CRUSH_REP_RULE(nrep) (nrep) // replication +#define CRUSH_RAID_RULE(num) (10+num) // raid -- 2.39.5