From dae8bec9a18d29878c8a03cdde7098f287e903be Mon Sep 17 00:00:00 2001 From: sageweil Date: Tue, 23 Oct 2007 21:53:24 +0000 Subject: [PATCH] merged r1961:1988 from ceph/branches/sage/crush back into trunk git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1989 29311d96-e01e-0410-9327-a35deaab8ce9 --- trunk/ceph/Makefile | 27 +- trunk/ceph/{crush => crush.old}/BinaryTree.h | 0 trunk/ceph/{crush => crush.old}/Bucket.h | 0 trunk/ceph/{crush => crush.old}/Hash.h | 0 trunk/ceph/crush.old/crush.h | 543 ++++++++++++++++ .../test/bucket_movement.cc | 0 .../test/bucket_variance.cc | 0 .../test/cluster_movement.cc | 0 .../test/cluster_movement_remove.cc | 0 .../test/cluster_movement_rush.cc | 0 .../test/creeping_failure.cc | 0 .../test/creeping_failure_variance.cc | 0 .../test/depth_variance.cc | 0 trunk/ceph/{crush => crush.old}/test/mixed.cc | 0 .../{crush => crush.old}/test/movement.cc | 0 .../test/movement_failed.cc | 0 .../{crush => crush.old}/test/overload.cc | 0 .../test/overload_variance.cc | 0 trunk/ceph/{crush => crush.old}/test/sizes.cc | 0 .../{crush => crush.old}/test/smallbucket.cc | 0 .../{crush => crush.old}/test/speed_bucket.cc | 0 .../{crush => crush.old}/test/speed_depth.cc | 0 .../{crush => crush.old}/test/speed_rush.cc | 0 trunk/ceph/{crush => crush.old}/test/t.cc | 0 .../{crush => crush.old}/test/testbucket.cc | 0 .../{crush => crush.old}/test/testnormal.cc | 0 trunk/ceph/crush/CrushWrapper.h | 227 +++++++ trunk/ceph/{crush2 => crush}/Makefile | 10 +- trunk/ceph/crush/buckets.c | 6 + trunk/ceph/{crush2 => crush}/builder.c | 257 +++++--- trunk/ceph/crush/builder.h | 45 ++ trunk/ceph/crush/crush.c | 81 +++ trunk/ceph/crush/crush.h | 600 +++--------------- trunk/ceph/{crush2 => crush}/hash.h | 0 trunk/ceph/{crush2/crush.c => crush/mapper.c} | 200 ++++-- trunk/ceph/crush/mapper.h | 19 + trunk/ceph/crush/test.c | 65 ++ trunk/ceph/{crush2 => crush}/types.h | 0 trunk/ceph/crush2/buckets.c | 82 --- trunk/ceph/crush2/buckets.h | 53 -- trunk/ceph/crush2/crush.h | 55 -- trunk/ceph/mon/OSDMonitor.cc | 92 +-- trunk/ceph/mon/OSDMonitor.h | 2 +- trunk/ceph/osd/OSDMap.h | 50 +- trunk/ceph/osd/PG.h | 1 + trunk/ceph/osd/osd_types.h | 4 +- 46 files changed, 1509 insertions(+), 910 deletions(-) rename trunk/ceph/{crush => crush.old}/BinaryTree.h (100%) rename trunk/ceph/{crush => crush.old}/Bucket.h (100%) rename trunk/ceph/{crush => crush.old}/Hash.h (100%) create mode 100644 trunk/ceph/crush.old/crush.h rename trunk/ceph/{crush => crush.old}/test/bucket_movement.cc (100%) rename trunk/ceph/{crush => crush.old}/test/bucket_variance.cc (100%) rename trunk/ceph/{crush => crush.old}/test/cluster_movement.cc (100%) rename trunk/ceph/{crush => crush.old}/test/cluster_movement_remove.cc (100%) rename trunk/ceph/{crush => crush.old}/test/cluster_movement_rush.cc (100%) rename trunk/ceph/{crush => crush.old}/test/creeping_failure.cc (100%) rename trunk/ceph/{crush => crush.old}/test/creeping_failure_variance.cc (100%) rename trunk/ceph/{crush => crush.old}/test/depth_variance.cc (100%) rename trunk/ceph/{crush => crush.old}/test/mixed.cc (100%) rename trunk/ceph/{crush => crush.old}/test/movement.cc (100%) rename trunk/ceph/{crush => crush.old}/test/movement_failed.cc (100%) rename trunk/ceph/{crush => crush.old}/test/overload.cc (100%) rename trunk/ceph/{crush => crush.old}/test/overload_variance.cc (100%) rename trunk/ceph/{crush => crush.old}/test/sizes.cc (100%) rename trunk/ceph/{crush => crush.old}/test/smallbucket.cc (100%) rename trunk/ceph/{crush => crush.old}/test/speed_bucket.cc (100%) rename trunk/ceph/{crush => crush.old}/test/speed_depth.cc (100%) rename trunk/ceph/{crush => crush.old}/test/speed_rush.cc (100%) rename trunk/ceph/{crush => crush.old}/test/t.cc (100%) rename trunk/ceph/{crush => crush.old}/test/testbucket.cc (100%) rename trunk/ceph/{crush => crush.old}/test/testnormal.cc (100%) create mode 100644 trunk/ceph/crush/CrushWrapper.h rename trunk/ceph/{crush2 => crush}/Makefile (63%) create mode 100644 trunk/ceph/crush/buckets.c rename trunk/ceph/{crush2 => crush}/builder.c (50%) create mode 100644 trunk/ceph/crush/builder.h create mode 100644 trunk/ceph/crush/crush.c rename trunk/ceph/{crush2 => crush}/hash.h (100%) rename trunk/ceph/{crush2/crush.c => crush/mapper.c} (52%) create mode 100644 trunk/ceph/crush/mapper.h create mode 100644 trunk/ceph/crush/test.c rename trunk/ceph/{crush2 => crush}/types.h (100%) delete mode 100644 trunk/ceph/crush2/buckets.c delete mode 100644 trunk/ceph/crush2/buckets.h delete mode 100644 trunk/ceph/crush2/crush.h diff --git a/trunk/ceph/Makefile b/trunk/ceph/Makefile index f1dc7b3d4d60a..52fc13494c3c6 100644 --- a/trunk/ceph/Makefile +++ b/trunk/ceph/Makefile @@ -16,7 +16,7 @@ EXTRA_CFLAGS = #-I${HOME}/include -L${HOME}/lib EXTRA_CFLAGS += -g EXTRA_CFLAGS += -pg -EXTRA_CFLAGS += -O3 +#EXTRA_CFLAGS += -O3 # base CFLAGS = -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE ${EXTRA_CFLAGS} @@ -145,22 +145,22 @@ mkmonmap: mkmonmap.cc common.o extractosdmaps: extractosdmaps.cc common.o osd.o mon.o ebofs.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ -cmon: cmon.o mon.o msg/SimpleMessenger.o common.o +cmon: cmon.o mon.o msg/SimpleMessenger.o common.o crush/libcrush.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ cmonctl: cmonctl.cc msg/SimpleMessenger.o common.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ -cosd: cosd.o osd.o ebofs.o msg/SimpleMessenger.o common.o +cosd: cosd.o osd.o ebofs.o msg/SimpleMessenger.o common.o crush/libcrush.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ -cmds: cmds.o mds.o osdc.o msg/SimpleMessenger.o common.o +cmds: cmds.o mds.o osdc.o msg/SimpleMessenger.o common.o crush/libcrush.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ -csyn: csyn.o client.o osdc.o msg/SimpleMessenger.o common.o +csyn: csyn.o client.o osdc.o msg/SimpleMessenger.o common.o crush/libcrush.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ -cfuse: cfuse.o client.o osdc.o client/fuse.o client/fuse_ll.o msg/SimpleMessenger.o common.o +cfuse: cfuse.o client.o osdc.o client/fuse.o client/fuse_ll.o msg/SimpleMessenger.o common.o crush/libcrush.o ${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@ @@ -191,15 +191,15 @@ ipc_testclient: ceph_ipc/ipc_testclient.cc ceph_ipc/ipc_client.o # fake* -fakefuse: fakefuse.o mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o client/fuse_ll.o msg/FakeMessenger.o common.o +fakefuse: fakefuse.o mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o client/fuse_ll.o msg/FakeMessenger.o common.o crush/libcrush.o ${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@ -fakesyn: fakesyn.o mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o +fakesyn: fakesyn.o mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o crush/libcrush.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ # mpi startup -newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/SimpleMessenger.o common.o +newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/SimpleMessenger.o common.o crush/libcrush.o ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ @@ -243,6 +243,14 @@ gprof-helper.so: test/gprof-helper.c test_disk_bw: test/test_disk_bw.cc common.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ +# crush + +crush/libcrush.o: force_look + cd crush ; make + +force_look: + true + # bits common.o: ${COMMON_OBJS} ${LDINC} $@ $^ @@ -268,7 +276,6 @@ mon.o: ${MON_OBJS} osbdb.o: ${OSBDB_OBJS} ${LDINC} $@ $^ - # generic rules %.so: %.cc ${CC} -shared -fPIC ${CFLAGS} $< -o $@ diff --git a/trunk/ceph/crush/BinaryTree.h b/trunk/ceph/crush.old/BinaryTree.h similarity index 100% rename from trunk/ceph/crush/BinaryTree.h rename to trunk/ceph/crush.old/BinaryTree.h diff --git a/trunk/ceph/crush/Bucket.h b/trunk/ceph/crush.old/Bucket.h similarity index 100% rename from trunk/ceph/crush/Bucket.h rename to trunk/ceph/crush.old/Bucket.h diff --git a/trunk/ceph/crush/Hash.h b/trunk/ceph/crush.old/Hash.h similarity index 100% rename from trunk/ceph/crush/Hash.h rename to trunk/ceph/crush.old/Hash.h diff --git a/trunk/ceph/crush.old/crush.h b/trunk/ceph/crush.old/crush.h new file mode 100644 index 0000000000000..376e7d9b3fc86 --- /dev/null +++ b/trunk/ceph/crush.old/crush.h @@ -0,0 +1,543 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __crush_CRUSH_H +#define __crush_CRUSH_H + +#include +#include +#include +#include +#include +using std::set; +using std::map; +using std::vector; +using std::list; +#include +#include +using namespace __gnu_cxx; + + +#include "Bucket.h" + +#include "include/buffer.h" + + +namespace crush { + + + // *** RULES *** + + class RuleStep { + public: + int cmd; + vector args; + + RuleStep(int c) : cmd(c) {} + RuleStep(int c, int a) : cmd(c) { + args.push_back(a); + } + RuleStep(int c, int a, int b) : cmd(c) { + args.push_back(a); + args.push_back(b); + } + RuleStep(int o, int a, int b, int c) : cmd(o) { + args.push_back(a); + args.push_back(b); + args.push_back(c); + } + + void _encode(bufferlist& bl) { + bl.append((char*)&cmd, sizeof(cmd)); + ::_encode(args, bl); + } + void _decode(bufferlist& bl, int& off) { + bl.copy(off, sizeof(cmd), (char*)&cmd); + off += sizeof(cmd); + ::_decode(args, bl, off); + } + }; + + + // Rule operations + const int CRUSH_RULE_TAKE = 0; + const int CRUSH_RULE_CHOOSE = 1; // first n by default + const int CRUSH_RULE_CHOOSE_FIRSTN = 1; + const int CRUSH_RULE_CHOOSE_INDEP = 2; + const int CRUSH_RULE_EMIT = 3; + + class Rule { + public: + vector< RuleStep > steps; + + void _encode(bufferlist& bl) { + int n = steps.size(); + bl.append((char*)&n, sizeof(n)); + for (int i=0; i buckets; + int bucketno; + Hash h; + + hash_map parent_map; // what bucket each leaf/bucket lives in + + public: + map rules; + + //map collisions; + //map bumps; + + void _encode(bufferlist& bl) { + // buckets + int n = buckets.size(); + bl.append((char*)&n, sizeof(n)); + for (map::const_iterator it = buckets.begin(); + it != buckets.end(); + it++) { + bl.append((char*)&it->first, sizeof(it->first)); + it->second->_encode(bl); + } + bl.append((char*)&bucketno, sizeof(bucketno)); + + // hash + int s = h.get_seed(); + bl.append((char*)&s, sizeof(s)); + + //::_encode(out, bl); + //::_encode(overload, bl); + + // rules + n = rules.size(); + bl.append((char*)&n, sizeof(n)); + for(map::iterator it = rules.begin(); + it != rules.end(); + it++) { + bl.append((char*)&it->first, sizeof(it->first)); + it->second._encode(bl); + } + + } + + void _decode(bufferlist& bl, int& off) { + int n; + bl.copy(off, sizeof(n), (char*)&n); + off += sizeof(n); + for (int i=0; i::iterator bp = buckets.begin(); + bp != buckets.end(); + ++bp) { + // index bucket items + vector items; + bp->second->get_items(items); + for (vector::iterator ip = items.begin(); + ip != items.end(); + ++ip) + parent_map[*ip] = bp->first; + } + } + + + + public: + Crush(int seed=123) : bucketno(-1), h(seed) {} + ~Crush() { + // hose buckets + for (map::iterator it = buckets.begin(); + it != buckets.end(); + it++) { + delete it->second; + } + } + + int print(ostream& out, int root, int indent=0) { + for (int i=0; iget_weight() << "\t" << b->get_id() << "\t"; + for (int i=0; iget_bucket_type() << ": "; + + vector items; + b->get_items(items); + + if (buckets.count(items[0])) { + out << std::endl; + for (unsigned i=0; iset_id(n); + buckets[n] = b; + return n; + } + + void add_item(int parent, int item, float w, bool back=false) { + // add item + assert(!buckets[parent]->is_uniform()); + Bucket *p = buckets[parent]; + + p->add_item(item, w, back); + + // set item's parent + Bucket *n = buckets[item]; + if (n) + n->set_parent(parent); + + // update weights + while (buckets.count(p->get_parent())) { + int child = p->get_id(); + p = buckets[p->get_parent()]; + p->adjust_item_weight(child, w); + } + } + + + /* + this is a hack, fix me! weights should be consistent throughout hierarchy! + + */ + void set_bucket_weight(int item, float w) { + Bucket *b = buckets[item]; + float adj = w - b->get_weight(); + + while (buckets.count(b->get_parent())) { + Bucket *p = buckets[b->get_parent()]; + p->adjust_item_weight(b->get_id(), adj); + b = p; + } + } + + + /* + * choose numrep distinct items of type type + */ + void choose(int x, + int numrep, + int type, + Bucket *inbucket, + vector& outvec, + bool firstn, + set& outset, map& overloadmap, + bool forcefeed=false, + int forcefeedval=-1) { + int off = outvec.size(); + + // for each replica + for (int rep=0; repis_uniform()) { + // uniform bucket; be careful! + if (firstn || numrep >= in->get_size()) { + // uniform bucket is too small; just walk thru elements + r += ftotal; // r' = r + f_total (first n) + } else { + // make sure numrep is not a multple of bucket size + int add = numrep*flocal; // r' = r + n*f_local + if (in->get_size() % numrep == 0) { + add += add/in->get_size(); // shift seq once per pass through the bucket + } + r += add; + } + } else { + // mixed bucket; just make a distinct-ish r sequence + if (firstn) + r += ftotal; // r' = r + f_total + else + r += numrep * flocal; // r' = r + n*f_local + } + + // choose + outv = in->choose_r(x, r, h); + + // did we get the type we want? + int itemtype = 0; // 0 is terminal type + Bucket *newin = 0; // remember bucket we hit + if (in->is_uniform()) { + itemtype = ((UniformBucket*)in)->get_item_type(); + } else { + if (buckets.count(outv)) { // another bucket + newin = buckets[outv]; + itemtype = newin->get_type(); + } + } + if (itemtype == type) { // this is what we want! + // collision? + bool collide = false; + for (int prep=0; prep overloadmap[outv]) + bad = true; + } + + if (collide || bad) { + ftotal++; + flocal++; + + if (collide && flocal < 3) + continue; // try locally a few times! + + if (ftotal >= 10) { + // ok fine, just ignore dup. FIXME. + skip_rep = true; + break; + } + + retry_rep = true; + } + + break; // ok then! + } + + // next + in = newin; + } + + if (retry_rep) continue; // try again + + break; + } + + // skip this rep? (e.g. too many collisions, we give up) + if (skip_rep) continue; + + // output this value + outvec.push_back(outv); + } // for rep + + // double check! + if (0) { + for (unsigned i=1; i& result, + set& outset, map& overloadmap, + int forcefeed=-1) { + //int numresult = 0; + result.clear(); + + // determine hierarchical context for forcefeed (if any) + list force_stack; + if (forcefeed >= 0 && parent_map.count(forcefeed)) { + int t = forcefeed; + while (1) { + force_stack.push_front(t); + //cout << "push " << t << " onto force_stack" << std::endl; + if (parent_map.count(t) == 0) break; // reached root, presumably. + //cout << " " << t << " parent is " << parent_map[t] << std::endl; + t = parent_map[t]; + } + } + + // working vector + vector w; // working variable + + // go through each statement + for (vector::iterator pc = rule.steps.begin(); + pc != rule.steps.end(); + pc++) { + // move input? + + // do it + switch (pc->cmd) { + case CRUSH_RULE_TAKE: + { + const int arg = pc->args[0]; + //cout << "take " << arg << std::endl; + + if (!force_stack.empty()) { + assert(force_stack.front() == arg); + force_stack.pop_front(); + } + + w.clear(); + w.push_back(arg); + } + break; + + case CRUSH_RULE_CHOOSE_FIRSTN: + case CRUSH_RULE_CHOOSE_INDEP: + { + const bool firstn = pc->cmd == CRUSH_RULE_CHOOSE_FIRSTN; + const int numrep = pc->args[0]; + const int type = pc->args[1]; + + //cout << "choose " << numrep << " of type " << type << std::endl; + + assert(!w.empty()); + + // reset output + vector out; + + // forcefeeding? + bool forcing = false; + int forceval = -1; + if (!force_stack.empty()) { + forceval = force_stack.front(); + force_stack.pop_front(); + //cout << "priming out with " << forceval << std::endl; + forcing = true; + } else if (forcefeed >= 0 && type == 0) { + //cout << "forcing context-less " << forcefeed << std::endl; + forceval = forcefeed; + forcefeed = -1; + forcing = true; + } + + // do each row independently + for (vector::iterator i = w.begin(); + i != w.end(); + i++) { + assert(buckets.count(*i)); + Bucket *b = buckets[*i]; + choose(x, numrep, type, b, out, firstn, + outset, overloadmap, + forcing, + forceval); + forcing = false; // only once + } // for inrow + + // put back into w + w.swap(out); + out.clear(); + } + break; + + case CRUSH_RULE_EMIT: + { + for (unsigned i=0; i +#include + +class CrushWrapper { +public: + struct crush_map *map; + + CrushWrapper() : map(0) {} + ~CrushWrapper() { + if (map) crush_destroy(map); + } + + void create() { + if (map) crush_destroy(map); + map = crush_create(); + } + void finalize() { + assert(map); + crush_finalize(map); + } + + void update_offload_map(std::set& out_osds, + std::map& overload_osds) { + for (int i=0; imax_devices; i++) { + if (out_osds.count(i)) + map->device_offload[i] = 0x10000; + else if (overload_osds.count(i)) + map->device_offload[i] = (int)(0x10000 * overload_osds[i]); // FIXME: reverse? + else + map->device_offload[i] = 0; // normal. + } + } + + void do_rule(int rule, int x, vector& out, int maxout, int forcefeed) { + int rawout[maxout]; + + int numrep = crush_do_rule(map, rule, x, rawout, maxout, forcefeed); + + out.resize(numrep); + for (int i=0; imax_buckets, bl); + ::_encode_simple(map->max_rules, bl); + ::_encode_simple(map->max_devices, bl); + + // simple arrays + bl.append((char*)map->device_offload, sizeof(map->device_offload[0]) * map->max_devices); + + // buckets + for (unsigned i=0; imax_buckets; i++) { + __u32 type = 0; + if (map->buckets[i]) type = map->buckets[i]->bucket_type; + ::_encode_simple(type, bl); + if (!type) continue; + + ::_encode_simple(map->buckets[i]->id, bl); + ::_encode_simple(map->buckets[i]->type, bl); + ::_encode_simple(map->buckets[i]->bucket_type, bl); + ::_encode_simple(map->buckets[i]->weight, bl); + ::_encode_simple(map->buckets[i]->size, bl); + for (unsigned j=0; jbuckets[i]->size; j++) + ::_encode_simple(map->buckets[i]->items[j], bl); + + switch (map->buckets[i]->type) { + case CRUSH_BUCKET_UNIFORM: + for (unsigned j=0; jbuckets[i]->size; j++) + ::_encode_simple(((crush_bucket_uniform*)map->buckets[i])->primes[j], bl); + ::_encode_simple(((crush_bucket_uniform*)map->buckets[i])->item_weight, bl); + break; + + case CRUSH_BUCKET_LIST: + for (unsigned j=0; jbuckets[i]->size; j++) { + ::_encode_simple(((crush_bucket_list*)map->buckets[i])->item_weights[j], bl); + ::_encode_simple(((crush_bucket_list*)map->buckets[i])->sum_weights[j], bl); + } + break; + + case CRUSH_BUCKET_TREE: + for (unsigned j=0; jbuckets[i]->size; j++) + ::_encode_simple(((crush_bucket_tree*)map->buckets[i])->node_weights[j], bl); + break; + + case CRUSH_BUCKET_STRAW: + for (unsigned j=0; jbuckets[i]->size; j++) + ::_encode_simple(((crush_bucket_straw*)map->buckets[i])->straws[j], bl); + break; + } + } + + // rules + for (unsigned i=0; imax_rules; i++) { + __u32 yes = map->rules[i] ? 1:0; + ::_encode_simple(yes, bl); + if (!yes) continue; + + ::_encode_simple(map->rules[i]->len, bl); + for (unsigned j=0; jrules[i]->len; j++) + ::_encode_simple(map->rules[i]->steps[j], bl); + } + } + + void _decode(bufferlist::iterator &blp) { + create(); + ::_decode_simple(map->max_buckets, blp); + ::_decode_simple(map->max_rules, blp); + ::_decode_simple(map->max_devices, blp); + + map->device_offload = (__u32*)malloc(sizeof(map->device_offload[0])*map->max_devices); + blp.copy(sizeof(map->device_offload[0])*map->max_devices, (char*)map->device_offload); + + // buckets + map->buckets = (crush_bucket**)malloc(sizeof(crush_bucket*)*map->max_buckets); + for (unsigned i=0; imax_buckets; i++) { + __u32 type; + ::_decode_simple(type, blp); + if (!type) { + map->buckets[i] = 0; + continue; + } + + int size = 0; + switch (type) { + case CRUSH_BUCKET_UNIFORM: + size = sizeof(crush_bucket_uniform); + break; + case CRUSH_BUCKET_LIST: + size = sizeof(crush_bucket_list); + break; + case CRUSH_BUCKET_TREE: + size = sizeof(crush_bucket_tree); + break; + case CRUSH_BUCKET_STRAW: + size = sizeof(crush_bucket_straw); + break; + default: + assert(0); + } + map->buckets[i] = (crush_bucket*)malloc(size); + memset(map->buckets[i], 0, size); + + ::_decode_simple(map->buckets[i]->id, blp); + ::_decode_simple(map->buckets[i]->type, blp); + ::_decode_simple(map->buckets[i]->bucket_type, blp); + ::_decode_simple(map->buckets[i]->weight, blp); + ::_decode_simple(map->buckets[i]->size, blp); + + map->buckets[i]->items = (__s32*)malloc(sizeof(__s32)*map->buckets[i]->size); + for (unsigned j=0; jbuckets[i]->size; j++) + ::_decode_simple(map->buckets[i]->items[j], blp); + + switch (map->buckets[i]->type) { + case CRUSH_BUCKET_UNIFORM: + ((crush_bucket_uniform*)map->buckets[i])->primes = + (__u32*)malloc(map->buckets[i]->size * sizeof(__u32)); + for (unsigned j=0; jbuckets[i]->size; j++) + ::_decode_simple(((crush_bucket_uniform*)map->buckets[i])->primes[j], blp); + ::_decode_simple(((crush_bucket_uniform*)map->buckets[i])->item_weight, blp); + break; + + case CRUSH_BUCKET_LIST: + ((crush_bucket_list*)map->buckets[i])->item_weights = + (__u32*)malloc(map->buckets[i]->size * sizeof(__u32)); + ((crush_bucket_list*)map->buckets[i])->sum_weights = + (__u32*)malloc(map->buckets[i]->size * sizeof(__u32)); + + for (unsigned j=0; jbuckets[i]->size; j++) { + ::_decode_simple(((crush_bucket_list*)map->buckets[i])->item_weights[j], blp); + ::_decode_simple(((crush_bucket_list*)map->buckets[i])->sum_weights[j], blp); + } + break; + + case CRUSH_BUCKET_TREE: + ((crush_bucket_tree*)map->buckets[i])->node_weights = + (__u32*)malloc(map->buckets[i]->size * sizeof(__u32)); + for (unsigned j=0; jbuckets[i]->size; j++) + ::_decode_simple(((crush_bucket_tree*)map->buckets[i])->node_weights[j], blp); + break; + + case CRUSH_BUCKET_STRAW: + ((crush_bucket_straw*)map->buckets[i])->straws = + (__u32*)malloc(map->buckets[i]->size * sizeof(__u32)); + for (unsigned j=0; jbuckets[i]->size; j++) + ::_decode_simple(((crush_bucket_straw*)map->buckets[i])->straws[j], blp); + break; + } + } + + // rules + map->rules = (crush_rule**)malloc(sizeof(crush_rule*)*map->max_rules); + for (unsigned i=0; imax_rules; i++) { + __u32 yes; + ::_decode_simple(yes, blp); + if (!yes) { + map->rules[i] = 0; + continue; + } + + map->rules[i] = (crush_rule*)malloc(sizeof(crush_rule)); + memset(map->rules[i], 0, sizeof(crush_rule)); + + ::_decode_simple(map->rules[i]->len, blp); + map->rules[i]->steps = (crush_rule_step*)malloc(sizeof(crush_rule_step) * map->rules[i]->len); + for (unsigned j=0; jrules[i]->len; j++) + ::_decode_simple(map->rules[i]->steps[j], blp); + } + + finalize(); + } +}; + +#endif diff --git a/trunk/ceph/crush2/Makefile b/trunk/ceph/crush/Makefile similarity index 63% rename from trunk/ceph/crush2/Makefile rename to trunk/ceph/crush/Makefile index 6a9bae06f0ee5..72d1b676bdb32 100644 --- a/trunk/ceph/crush2/Makefile +++ b/trunk/ceph/crush/Makefile @@ -1,11 +1,12 @@ CC = gcc CFLAGS = -Wall -CFLAGS += -O3 -g +CFLAGS += -g +CFLAGS += -O3 LD = ld RM = rm -all: depend builder.o libcrush.o +all: depend libcrush.o test clean: rm -f *.o libcrush.o @@ -13,9 +14,12 @@ clean: %.o: %.c ${CC} ${CFLAGS} -c $< -o $@ -libcrush.o: crush.o buckets.o +libcrush.o: builder.o crush.o mapper.o $(LD) -i -o $@ $^ +test: test.c libcrush.o + $(CC) ${CFLAGS} -lm $^ -o $@ + .depend: touch .depend diff --git a/trunk/ceph/crush/buckets.c b/trunk/ceph/crush/buckets.c new file mode 100644 index 0000000000000..72441d19792bb --- /dev/null +++ b/trunk/ceph/crush/buckets.c @@ -0,0 +1,6 @@ + +#include "crush.h" +#include "hash.h" + +int + diff --git a/trunk/ceph/crush2/builder.c b/trunk/ceph/crush/builder.c similarity index 50% rename from trunk/ceph/crush2/builder.c rename to trunk/ceph/crush/builder.c index 803eb9d1561a0..a430dbd5c6284 100644 --- a/trunk/ceph/crush2/builder.c +++ b/trunk/ceph/crush/builder.c @@ -2,16 +2,16 @@ #include #include #include +#include #include "builder.h" #include "hash.h" - -struct crush_map *crush_new() +struct crush_map *crush_create() { struct crush_map *m; m = malloc(sizeof(*m)); - memset(&m, 0, sizeof(m)); + memset(m, 0, sizeof(*m)); return m; } @@ -26,99 +26,63 @@ void crush_finalize(struct crush_map *map) for (b=0; bmax_buckets; b++) { if (map->buckets[b] == 0) continue; for (i=0; ibuckets[b]->size; i++) - if (map->buckets[b]->items[i] > map->max_devices) - map->max_devices = map->buckets[b]->items[i]; + if (map->buckets[b]->items[i] >= map->max_devices) + map->max_devices = map->buckets[b]->items[i] + 1; } /* allocate arrays */ map->device_parents = malloc(sizeof(map->device_parents[0]) * map->max_devices); - map->device_offload = malloc(sizeof(map->device_offload[0]) * map->max_devices); memset(map->device_parents, 0, sizeof(map->device_parents[0]) * map->max_devices); - memset(map->device_offload, 0, sizeof(map->device_offload[0]) * map->max_devices); map->bucket_parents = malloc(sizeof(map->bucket_parents[0]) * map->max_buckets); memset(map->bucket_parents, 0, sizeof(map->bucket_parents[0]) * map->max_buckets); - /* build reverse map */ + /* build parent maps */ for (b=0; bmax_buckets; b++) { if (map->buckets[b] == 0) continue; for (i=0; ibuckets[b]->size; i++) { c = map->buckets[b]->items[i]; + BUG_ON(c >= map->max_devices); if (c >= 0) map->device_parents[c] = map->buckets[b]->id; else - map->bucket_parents[-c] = map->buckets[b]->id; + map->bucket_parents[-1-c] = map->buckets[b]->id; } } -} -/* - * deallocate - */ -void crush_destroy(struct crush_map *map) -{ - int b; - - /* buckets */ - for (b=0; bmax_buckets; b++) { - if (map->buckets[b] == 0) continue; - switch (map->buckets[b]->type) { - case CRUSH_BUCKET_UNIFORM: - crush_destroy_bucket_uniform((struct crush_bucket_uniform*)map->buckets[b]); - break; - case CRUSH_BUCKET_LIST: - crush_destroy_bucket_list((struct crush_bucket_list*)map->buckets[b]); - break; - case CRUSH_BUCKET_TREE: - crush_destroy_bucket_tree((struct crush_bucket_tree*)map->buckets[b]); - break; - case CRUSH_BUCKET_STRAW: - crush_destroy_bucket_straw((struct crush_bucket_straw*)map->buckets[b]); - break; - } + /* new device offload map? */ + if (!map->device_offload) { + map->device_offload = malloc(sizeof(map->device_offload[0]) * map->max_devices); + memset(map->device_offload, 0, sizeof(map->device_offload[0]) * map->max_devices); } - free(map->buckets); - - /* rules */ - for (b=0; bmax_rules; b++) { - if (map->rules[b] == 0) continue; - if (map->rules[b]->steps) - free(map->rules[b]->steps); - free(map->rules[b]); - } - free(map->rules); - - free(map->bucket_parents); - free(map->device_parents); - free(map->device_offload); - - memset(map, 0, sizeof(*map)); } + /** rules **/ int crush_add_rule(struct crush_map *map, + int ruleno, struct crush_rule *rule) { - int id; - - /* find a rule id */ - for (id=0; id < map->max_rules; id++) - if (map->rules[id] == 0) break; - if (id == map->max_rules) { + int oldsize; + + if (ruleno < 0) { + for (ruleno=0; ruleno < map->max_rules; ruleno++) + if (map->rules[ruleno] == 0) break; + } + if (ruleno >= map->max_rules) { /* expand array */ - if (map->max_rules) - map->max_rules *= 2; - else - map->max_rules = 8; + oldsize = map->max_rules; + map->max_rules = ruleno+1; map->rules = realloc(map->rules, map->max_rules * sizeof(map->rules[0])); + memset(map->rules + oldsize, 0, (map->max_rules-oldsize) * sizeof(map->rules[0])); } /* add it */ - map->rules[id] = rule; - return id; + map->rules[ruleno] = rule; + return ruleno; } struct crush_rule *crush_make_rule() @@ -126,7 +90,7 @@ struct crush_rule *crush_make_rule() struct crush_rule *rule; rule = malloc(sizeof(struct crush_rule)); - memset(&rule, 0, sizeof(rule)); + memset(rule, 0, sizeof(*rule)); return rule; } @@ -134,9 +98,9 @@ void crush_rule_add_step(struct crush_rule *rule, int op, int arg1, int arg2) { rule->len++; if (rule->steps) - rule->steps = malloc(sizeof(rule->steps[0])*rule->len); - else rule->steps = realloc(rule->steps, sizeof(rule->steps[0])*rule->len); + else + rule->steps = malloc(sizeof(rule->steps[0])*rule->len); rule->steps[rule->len-1].op = op; rule->steps[rule->len-1].arg1 = arg1; rule->steps[rule->len-1].arg2 = arg2; @@ -149,35 +113,43 @@ int crush_add_bucket(struct crush_map *map, struct crush_bucket *bucket) { int id; + int oldsize; /* find a bucket id */ for (id=0; id < map->max_buckets; id++) if (map->buckets[id] == 0) break; if (id == map->max_buckets) { /* expand array */ + oldsize = map->max_buckets; if (map->max_buckets) map->max_buckets *= 2; else map->max_buckets = 8; map->buckets = realloc(map->buckets, map->max_buckets * sizeof(map->buckets[0])); + memset(map->buckets + oldsize, 0, (map->max_buckets-oldsize) * sizeof(map->buckets[0])); } /* add it */ - bucket->id = id; + bucket->id = -1 - id; map->buckets[id] = bucket; - return id; + return -1 - id; } -void crush_make_uniform_bucket(struct crush_map *map, - struct crush_bucket_uniform *bucket, - int size, - int *items, - int item_weight) +/* uniform bucket */ + +struct crush_bucket_uniform * +crush_make_uniform_bucket(int type, int size, + int *items, + int item_weight) { int i, j, x; + struct crush_bucket_uniform *bucket; + bucket = malloc(sizeof(*bucket)); memset(bucket, 0, sizeof(*bucket)); + bucket->h.bucket_type = CRUSH_BUCKET_UNIFORM; + bucket->h.type = type; bucket->h.size = size; bucket->h.weight = size * item_weight; @@ -202,48 +174,135 @@ void crush_make_uniform_bucket(struct crush_map *map, bucket->primes[i++] = x; x += 2; } + + return bucket; } -void crush_make_list_bucket(struct crush_map *map, - struct crush_bucket_list *bucket, - int size, - int *items, - int *weights) + +/* list bucket */ + +struct crush_bucket_list* +crush_make_list_bucket(int type, int size, + int *items, + int *weights) { int i; int w; - + struct crush_bucket_list *bucket; + + bucket = malloc(sizeof(*bucket)); memset(bucket, 0, sizeof(*bucket)); + bucket->h.bucket_type = CRUSH_BUCKET_LIST; + bucket->h.type = type; bucket->h.size = size; bucket->h.items = malloc(sizeof(__u32)*size); bucket->item_weights = malloc(sizeof(__u32)*size); bucket->sum_weights = malloc(sizeof(__u32)*size); w = 0; - for (i=0; i=0; i--) { bucket->h.items[i] = items[i]; bucket->item_weights[i] = weights[i]; w += weights[i]; bucket->sum_weights[i] = w; + /*printf("%d item %d weight %d sum %d\n", + i, items[i], weights[i], bucket->sum_weights[i]);*/ } bucket->h.weight = w; + + return bucket; } -void crush_make_straw_bucket(struct crush_map *map, - struct crush_bucket_straw *bucket, - int size, - int *items, - int *weights) +/* tree bucket */ + +static int height(int n) { + int h = 0; + while ((n & 1) == 0) { + h++; + n = n >> 1; + } + return h; +} +static int on_right(int n, int h) { + return n & (1 << (h+1)); +} +static int parent(int n) { + int h = height(n); + if (on_right(n, h)) + return n - (1<h.bucket_type = CRUSH_BUCKET_TREE; + bucket->h.type = type; + bucket->h.size = size; + + /* calc tree depth */ + depth = 1; + t = size - 1; + while (t) { + t = t >> 1; + depth++; + } + bucket->h.size = 1 << depth; + + bucket->h.items = malloc(sizeof(__u32)*bucket->h.size); + bucket->node_weights = malloc(sizeof(__u32)*bucket->h.size); + + memset(bucket->h.items, 0, sizeof(__u32)*bucket->h.size); + memset(bucket->node_weights, 0, sizeof(__u32)*bucket->h.size); + + for (i=0; ih.items[node] = items[i]; + bucket->node_weights[node] = weights[i]; + bucket->h.weight += weights[i]; + for (j=1; jnode_weights[node] += weights[i]; + } + } + BUG_ON(bucket->node_weights[bucket->h.size/2] != bucket->h.weight); + + return bucket; +} + + +/* straw bucket */ + +struct crush_bucket_straw * +crush_make_straw_bucket(int type, + int size, + int *items, + int *weights) +{ + struct crush_bucket_straw *bucket; int *reverse; int i, j, k; double straw, wbelow, lastw, wnext, pbelow; int numleft; + bucket = malloc(sizeof(*bucket)); memset(bucket, 0, sizeof(*bucket)); + bucket->h.bucket_type = CRUSH_BUCKET_STRAW; + bucket->h.type = type; bucket->h.size = size; bucket->h.items = malloc(sizeof(__u32)*size); @@ -255,20 +314,21 @@ void crush_make_straw_bucket(struct crush_map *map, bucket->h.weight += weights[i]; } - /* reverse sort by weight */ + /* reverse sort by weight (simple insertion sort) */ reverse = malloc(sizeof(int) * size); - reverse[0] = items[0]; + reverse[0] = 0; for (i=1; ij; k--) reverse[k] = reverse[k-1]; - reverse[j] = items[i]; + reverse[j] = i; + break; } } if (j == i) - reverse[i] = items[i]; + reverse[i] = i; } numleft = size; @@ -280,25 +340,36 @@ void crush_make_straw_bucket(struct crush_map *map, while (i < size) { /* set this item's straw */ bucket->straws[reverse[i]] = straw * 0x10000; + /*printf("item %d at %d weight %d straw %d (%lf)\n", + items[reverse[i]], + reverse[i], weights[reverse[i]], bucket->straws[reverse[i]], straw);*/ i++; if (i == size) break; /* same weight as previous? */ - if (weights[reverse[i]] == weights[reverse[i-1]]) + if (weights[reverse[i]] == weights[reverse[i-1]]) { + /*printf("same as previous\n");*/ continue; + } /* adjust straw for next guy */ - wbelow += (((double)weights[reverse[i-1]] / (double)0x10000) - lastw) * numleft; - numleft--; - wnext = numleft * ((double)(weights[reverse[i]] - weights[reverse[i-1]]) / (double)0x10000); + wbelow += ((double)weights[reverse[i-1]] - lastw) * numleft; + for (j=i; j +#endif + +#include "crush.h" + +void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) +{ + free(b->primes); + free(b->h.items); + free(b); +} + +void crush_destroy_bucket_list(struct crush_bucket_list *b) +{ + free(b->item_weights); + free(b->sum_weights); + free(b->h.items); + free(b); +} + +void crush_destroy_bucket_tree(struct crush_bucket_tree *b) +{ + free(b->node_weights); + free(b); +} + +void crush_destroy_bucket_straw(struct crush_bucket_straw *b) +{ + free(b->straws); + free(b->h.items); + free(b); +} + + +/* + * deallocate + */ +void crush_destroy(struct crush_map *map) +{ + int b; + + /* buckets */ + for (b=0; bmax_buckets; b++) { + if (map->buckets[b] == 0) continue; + switch (map->buckets[b]->type) { + case CRUSH_BUCKET_UNIFORM: + crush_destroy_bucket_uniform((struct crush_bucket_uniform*)map->buckets[b]); + break; + case CRUSH_BUCKET_LIST: + crush_destroy_bucket_list((struct crush_bucket_list*)map->buckets[b]); + break; + case CRUSH_BUCKET_TREE: + crush_destroy_bucket_tree((struct crush_bucket_tree*)map->buckets[b]); + break; + case CRUSH_BUCKET_STRAW: + crush_destroy_bucket_straw((struct crush_bucket_straw*)map->buckets[b]); + break; + } + } + free(map->buckets); + + /* rules */ + for (b=0; bmax_rules; b++) { + if (map->rules[b] == 0) continue; + if (map->rules[b]->steps) + free(map->rules[b]->steps); + free(map->rules[b]); + } + free(map->rules); + + free(map->bucket_parents); + free(map->device_parents); + free(map->device_offload); + free(map); +} + + diff --git a/trunk/ceph/crush/crush.h b/trunk/ceph/crush/crush.h index 376e7d9b3fc86..5cf6cff498f13 100644 --- a/trunk/ceph/crush/crush.h +++ b/trunk/ceph/crush/crush.h @@ -1,543 +1,117 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ +#ifndef _CRUSH_CRUSH_H +#define _CRUSH_CRUSH_H -#ifndef __crush_CRUSH_H -#define __crush_CRUSH_H - -#include -#include -#include -#include -#include -using std::set; -using std::map; -using std::vector; -using std::list; -#include -#include -using namespace __gnu_cxx; - - -#include "Bucket.h" - -#include "include/buffer.h" - - -namespace crush { - - - // *** RULES *** - - class RuleStep { - public: - int cmd; - vector args; - - RuleStep(int c) : cmd(c) {} - RuleStep(int c, int a) : cmd(c) { - args.push_back(a); - } - RuleStep(int c, int a, int b) : cmd(c) { - args.push_back(a); - args.push_back(b); - } - RuleStep(int o, int a, int b, int c) : cmd(o) { - args.push_back(a); - args.push_back(b); - args.push_back(c); - } - - void _encode(bufferlist& bl) { - bl.append((char*)&cmd, sizeof(cmd)); - ::_encode(args, bl); - } - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(cmd), (char*)&cmd); - off += sizeof(cmd); - ::_decode(args, bl, off); - } - }; - - - // Rule operations - const int CRUSH_RULE_TAKE = 0; - const int CRUSH_RULE_CHOOSE = 1; // first n by default - const int CRUSH_RULE_CHOOSE_FIRSTN = 1; - const int CRUSH_RULE_CHOOSE_INDEP = 2; - const int CRUSH_RULE_EMIT = 3; - - class Rule { - public: - vector< RuleStep > steps; - - void _encode(bufferlist& bl) { - int n = steps.size(); - bl.append((char*)&n, sizeof(n)); - for (int i=0; i buckets; - int bucketno; - Hash h; - - hash_map parent_map; // what bucket each leaf/bucket lives in - - public: - map rules; - - //map collisions; - //map bumps; - - void _encode(bufferlist& bl) { - // buckets - int n = buckets.size(); - bl.append((char*)&n, sizeof(n)); - for (map::const_iterator it = buckets.begin(); - it != buckets.end(); - it++) { - bl.append((char*)&it->first, sizeof(it->first)); - it->second->_encode(bl); - } - bl.append((char*)&bucketno, sizeof(bucketno)); - - // hash - int s = h.get_seed(); - bl.append((char*)&s, sizeof(s)); - - //::_encode(out, bl); - //::_encode(overload, bl); - - // rules - n = rules.size(); - bl.append((char*)&n, sizeof(n)); - for(map::iterator it = rules.begin(); - it != rules.end(); - it++) { - bl.append((char*)&it->first, sizeof(it->first)); - it->second._encode(bl); - } - - } - - void _decode(bufferlist& bl, int& off) { - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i /* just for int types */ - //::_decode(out, bl, off); - //::_decode(overload, bl, off); +#ifndef BUG_ON +# include +# define BUG_ON(x) assert(!(x)) +#endif - // rules - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i::iterator bp = buckets.begin(); - bp != buckets.end(); - ++bp) { - // index bucket items - vector items; - bp->second->get_items(items); - for (vector::iterator ip = items.begin(); - ip != items.end(); - ++ip) - parent_map[*ip] = bp->first; - } - } - +/*** RULES ***/ +enum { + CRUSH_RULE_TAKE, + CRUSH_RULE_CHOOSE_FIRSTN, + CRUSH_RULE_CHOOSE_INDEP, + CRUSH_RULE_EMIT +}; +#define CRUSH_MAX_DEPTH 10 +#define CRUSH_MAX_SET 10 - public: - Crush(int seed=123) : bucketno(-1), h(seed) {} - ~Crush() { - // hose buckets - for (map::iterator it = buckets.begin(); - it != buckets.end(); - it++) { - delete it->second; - } - } +struct crush_rule_step { + __u32 op; + __s32 arg1; + __s32 arg2; +}; - int print(ostream& out, int root, int indent=0) { - for (int i=0; iget_weight() << "\t" << b->get_id() << "\t"; - for (int i=0; iget_bucket_type() << ": "; +struct crush_rule { + __u32 len; + struct crush_rule_step *steps; +}; - vector items; - b->get_items(items); - if (buckets.count(items[0])) { - out << std::endl; - for (unsigned i=0; iset_id(n); - buckets[n] = b; - return n; - } +enum { + CRUSH_BUCKET_UNIFORM = 1, + CRUSH_BUCKET_LIST = 2, + CRUSH_BUCKET_TREE = 3, + CRUSH_BUCKET_STRAW = 4 +}; - void add_item(int parent, int item, float w, bool back=false) { - // add item - assert(!buckets[parent]->is_uniform()); - Bucket *p = buckets[parent]; - - p->add_item(item, w, back); +struct crush_bucket { + __s32 id; /* this'll be negative */ + __u16 type; + __u16 bucket_type; + __u32 weight; /* 16-bit fixed point */ + __u32 size; /* num items */ + __s32 *items; +}; - // set item's parent - Bucket *n = buckets[item]; - if (n) - n->set_parent(parent); +struct crush_bucket_uniform { + struct crush_bucket h; + __u32 *primes; + __u32 item_weight; /* 16-bit fixed point */ +}; - // update weights - while (buckets.count(p->get_parent())) { - int child = p->get_id(); - p = buckets[p->get_parent()]; - p->adjust_item_weight(child, w); - } - } +struct crush_bucket_list { + struct crush_bucket h; + __u32 *item_weights; /* 16-bit fixed point */ + __u32 *sum_weights; /* 16-bit fixed point. element i is sum of weights 0..i, inclusive */ +}; +struct crush_bucket_tree { + struct crush_bucket h; /* note: h.size is tree size, not number of actual items */ + __u32 *node_weights; +}; - /* - this is a hack, fix me! weights should be consistent throughout hierarchy! - - */ - void set_bucket_weight(int item, float w) { - Bucket *b = buckets[item]; - float adj = w - b->get_weight(); +struct crush_bucket_straw { + struct crush_bucket h; + __u32 *straws; /* 16-bit fixed point */ +}; - while (buckets.count(b->get_parent())) { - Bucket *p = buckets[b->get_parent()]; - p->adjust_item_weight(b->get_id(), adj); - b = p; - } - } - /* - * choose numrep distinct items of type type - */ - void choose(int x, - int numrep, - int type, - Bucket *inbucket, - vector& outvec, - bool firstn, - set& outset, map& overloadmap, - bool forcefeed=false, - int forcefeedval=-1) { - int off = outvec.size(); +/*** CRUSH ***/ - // for each replica - for (int rep=0; repis_uniform()) { - // uniform bucket; be careful! - if (firstn || numrep >= in->get_size()) { - // uniform bucket is too small; just walk thru elements - r += ftotal; // r' = r + f_total (first n) - } else { - // make sure numrep is not a multple of bucket size - int add = numrep*flocal; // r' = r + n*f_local - if (in->get_size() % numrep == 0) { - add += add/in->get_size(); // shift seq once per pass through the bucket - } - r += add; - } - } else { - // mixed bucket; just make a distinct-ish r sequence - if (firstn) - r += ftotal; // r' = r + f_total - else - r += numrep * flocal; // r' = r + n*f_local - } - - // choose - outv = in->choose_r(x, r, h); - - // did we get the type we want? - int itemtype = 0; // 0 is terminal type - Bucket *newin = 0; // remember bucket we hit - if (in->is_uniform()) { - itemtype = ((UniformBucket*)in)->get_item_type(); - } else { - if (buckets.count(outv)) { // another bucket - newin = buckets[outv]; - itemtype = newin->get_type(); - } - } - if (itemtype == type) { // this is what we want! - // collision? - bool collide = false; - for (int prep=0; prep overloadmap[outv]) - bad = true; - } - - if (collide || bad) { - ftotal++; - flocal++; - - if (collide && flocal < 3) - continue; // try locally a few times! - - if (ftotal >= 10) { - // ok fine, just ignore dup. FIXME. - skip_rep = true; - break; - } - - retry_rep = true; - } - - break; // ok then! - } - - // next - in = newin; - } - - if (retry_rep) continue; // try again - - break; - } - - // skip this rep? (e.g. too many collisions, we give up) - if (skip_rep) continue; - - // output this value - outvec.push_back(outv); - } // for rep - - // double check! - if (0) { - for (unsigned i=1; i& result, - set& outset, map& overloadmap, - int forcefeed=-1) { - //int numresult = 0; - result.clear(); - - // determine hierarchical context for forcefeed (if any) - list force_stack; - if (forcefeed >= 0 && parent_map.count(forcefeed)) { - int t = forcefeed; - while (1) { - force_stack.push_front(t); - //cout << "push " << t << " onto force_stack" << std::endl; - if (parent_map.count(t) == 0) break; // reached root, presumably. - //cout << " " << t << " parent is " << parent_map[t] << std::endl; - t = parent_map[t]; - } - } - - // working vector - vector w; // working variable - - // go through each statement - for (vector::iterator pc = rule.steps.begin(); - pc != rule.steps.end(); - pc++) { - // move input? - - // do it - switch (pc->cmd) { - case CRUSH_RULE_TAKE: - { - const int arg = pc->args[0]; - //cout << "take " << arg << std::endl; - - if (!force_stack.empty()) { - assert(force_stack.front() == arg); - force_stack.pop_front(); - } - - w.clear(); - w.push_back(arg); - } - break; - - case CRUSH_RULE_CHOOSE_FIRSTN: - case CRUSH_RULE_CHOOSE_INDEP: - { - const bool firstn = pc->cmd == CRUSH_RULE_CHOOSE_FIRSTN; - const int numrep = pc->args[0]; - const int type = pc->args[1]; - - //cout << "choose " << numrep << " of type " << type << std::endl; - - assert(!w.empty()); - - // reset output - vector out; - - // forcefeeding? - bool forcing = false; - int forceval = -1; - if (!force_stack.empty()) { - forceval = force_stack.front(); - force_stack.pop_front(); - //cout << "priming out with " << forceval << std::endl; - forcing = true; - } else if (forcefeed >= 0 && type == 0) { - //cout << "forcing context-less " << forcefeed << std::endl; - forceval = forcefeed; - forcefeed = -1; - forcing = true; - } - - // do each row independently - for (vector::iterator i = w.begin(); - i != w.end(); - i++) { - assert(buckets.count(*i)); - Bucket *b = buckets[*i]; - choose(x, numrep, type, b, out, firstn, - outset, overloadmap, - forcing, - forceval); - forcing = false; // only once - } // for inrow - - // put back into w - w.swap(out); - out.clear(); - } - break; - - case CRUSH_RULE_EMIT: - { - for (unsigned i=0; i +#include + +/** bucket choose methods **/ + +/* uniform */ + +static int +crush_bucket_uniform_choose(struct crush_bucket_uniform *bucket, int x, int r) +{ + unsigned o, p, s; + o = crush_hash32_2(x, bucket->h.id) & 0xffff; + p = bucket->primes[crush_hash32_2(bucket->h.id, x) % bucket->h.size]; + s = (x + o + (r+1)*p) % bucket->h.size; + /*printf("%d %d %d %d\n", x, o, r, p);*/ + return bucket->h.items[s]; +} + + +/* list */ + +static int +crush_bucket_list_choose(struct crush_bucket_list *bucket, int x, int r) +{ + int i; + __u64 w; + + for (i=0; ih.size; i++) { + w = crush_hash32_4(x, bucket->h.items[i], r, bucket->h.id); + w &= 0xffff; + /*printf("%d item %d weight %d sum_weight %d r %lld", + i, bucket->h.items[i], bucket->item_weights[i], bucket->sum_weights[i], w);*/ + w *= bucket->sum_weights[i]; + w = w >> 16; + /*printf(" scaled %lld\n", w);*/ + if (w < bucket->item_weights[i]) + return bucket->h.items[i]; + } + + BUG_ON(1); + return 0; +} + + +/* tree */ + +static int height(int n) { + int h = 0; + while ((n & 1) == 0) { + h++; + n = n >> 1; + } + return h; +} +static int left(int x) { + int h = height(x); + return x - (1 << (h-1)); +} +static int right(int x) { + int h = height(x); + return x + (1 << (h-1)); +} +static int terminal(int x) { + return x & 1; +} + +static int +crush_bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r) +{ + int n, l; + __u32 w; + __u64 t; + + /* start at root */ + n = bucket->h.size >> 1; + + while (!terminal(n)) { + /* pick point in [0, w) */ + w = bucket->node_weights[n]; + t = (__u64)crush_hash32_4(x, n, r, bucket->h.id) * (__u64)w; + t = t >> 32; + + /* left or right? */ + l = left(n); + if (t < bucket->node_weights[l]) + n = l; + else + n = right(n); + } + + return bucket->h.items[n]; +} + + +/* straw */ + +static int +crush_bucket_straw_choose(struct crush_bucket_straw *bucket, int x, int r) +{ + int i; + int high = 0; + __u64 high_draw = 0; + __u64 draw; + + for (i=0; ih.size; i++) { + draw = crush_hash32_3(x, bucket->h.items[i], r); + draw &= 0xffff; + draw *= bucket->straws[i]; + if (i == 0 || draw > high_draw) { + high = i; + high_draw = draw; + } + } + + return bucket->h.items[high]; +} + + + + +/** crush proper **/ + + /* * choose numrep distinct items of given type */ @@ -12,7 +135,7 @@ static int crush_choose(struct crush_map *map, { int rep; int ftotal, flocal; - int retry_rep, skip_rep; + int retry_descent, retry_bucket, skip_rep; struct crush_bucket *in = bucket; int r; int i; @@ -22,32 +145,28 @@ static int crush_choose(struct crush_map *map, int collide, bad; outpos = 0; - + for (rep = 0; rep < numrep; rep++) { /* keep trying until we get a non-out, non-colliding item */ ftotal = 0; skip_rep = 0; - - while (1) { + do { + retry_descent = 0; in = bucket; /* initial bucket */ /* choose through intervening buckets */ flocal = 0; - retry_rep = 0; - - while (1) { + do { + retry_bucket = 0; r = rep; - if (in->type == CRUSH_BUCKET_UNIFORM) { + if (in->bucket_type == CRUSH_BUCKET_UNIFORM) { /* be careful */ - if (firstn || numrep >= in->size) { + if (firstn || numrep >= in->size) r += ftotal; /* r' = r + f_total */ - } else { - r += numrep * flocal; /* r' = r + n*f_local */ - /* make sure numrep is not a multiple of bucket size */ - if (in->size % numrep == 0) - /* shift seq once per pass through the bucket */ - r += numrep * flocal / in->size; - } + else if (in->size % numrep == 0) + r += (numrep+1) * flocal; /* r'=r+(n+1)*f_local */ + else + r += numrep * flocal; /* r' = r + n*f_local */ } else { if (firstn) r += ftotal; /* r' = r + f_total */ @@ -56,7 +175,7 @@ static int crush_choose(struct crush_map *map, } /* bucket choose */ - switch (in->type) { + switch (in->bucket_type) { case CRUSH_BUCKET_UNIFORM: item = crush_bucket_uniform_choose((struct crush_bucket_uniform*)in, x, r); break; @@ -75,13 +194,14 @@ static int crush_choose(struct crush_map *map, /* desired type? */ if (item < 0) - itemtype = map->buckets[-item]->type; + itemtype = map->buckets[-1-item]->type; else itemtype = 0; /* keep going? */ if (itemtype != type) { - in = map->buckets[-item]; + BUG_ON(item >= 0 || (-1-item) >= map->max_buckets); + in = map->buckets[-1-item]; continue; } @@ -108,22 +228,17 @@ static int crush_choose(struct crush_map *map, flocal++; if (collide && flocal < 3) - continue; /* locally a few times */ - if (ftotal >= 10) { - /* give up, ignore dup, fixme */ - skip_rep = 1; - break; - } - retry_rep = 1; + retry_bucket = 1; /* retry locally a few times */ + else if (ftotal < 10) + retry_descent = 1; /* then retry descent */ + else + skip_rep = 1; /* else give up */ } - break; - } - - if (retry_rep) continue; - } + } while (retry_bucket); + } while (retry_descent); if (skip_rep) continue; - + out[outpos] = item; outpos++; } @@ -159,12 +274,17 @@ int crush_do_rule(struct crush_map *map, /* determine hierarchical context of forcefeed, if any */ if (forcefeed >= 0) { + if (map->device_parents[forcefeed] == 0) { + /*printf("CRUSH: forcefed device dne\n");*/ + return -1; /* force fed device dne */ + } while (1) { force_stack[++force_pos] = forcefeed; + /*printf("force_stack[%d] = %d\n", force_pos, forcefeed);*/ if (forcefeed >= 0) forcefeed = map->device_parents[forcefeed]; else - forcefeed = map->bucket_parents[-forcefeed]; + forcefeed = map->bucket_parents[-1-forcefeed]; if (forcefeed == 0) break; } } @@ -191,23 +311,22 @@ int crush_do_rule(struct crush_map *map, for (i = 0; i < wsize; i++) { numrep = rule->steps[step].arg1; - if (force_pos >= 0) { o[osize++] = force_stack[force_pos]; force_pos--; numrep--; } - if (numrep) - crush_choose(map, - map->buckets[-w[i]], - x, numrep, rule->steps[step].arg2, - o+osize, rule->steps[step].op == CRUSH_RULE_CHOOSE_FIRSTN); + if (!numrep) continue; + osize += crush_choose(map, + map->buckets[-1-w[i]], + x, numrep, rule->steps[step].arg2, + o+osize, rule->steps[step].op == CRUSH_RULE_CHOOSE_FIRSTN); } /* swap t and w arrays */ tmp = o; o = w; - w = o; + w = tmp; wsize = osize; break; @@ -229,3 +348,4 @@ int crush_do_rule(struct crush_map *map, return result_len; } + diff --git a/trunk/ceph/crush/mapper.h b/trunk/ceph/crush/mapper.h new file mode 100644 index 0000000000000..6c74539b634ca --- /dev/null +++ b/trunk/ceph/crush/mapper.h @@ -0,0 +1,19 @@ +#ifndef _CRUSH_MAPPER_H +#define _CRUSH_MAPPER_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "crush.h" + +extern int crush_do_rule(struct crush_map *map, + int ruleno, + int x, int *result, int result_max, + int forcefeed); /* -1 for none */ + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/trunk/ceph/crush/test.c b/trunk/ceph/crush/test.c new file mode 100644 index 0000000000000..041552bacfdc8 --- /dev/null +++ b/trunk/ceph/crush/test.c @@ -0,0 +1,65 @@ + +#include +#include +#include +#include + +#include "crush.h" +#include "mapper.h" +#include "builder.h" + + +int main() +{ + int sub[10]; + int subw[10]; + int i, j; + int d; + int o[100]; + int root; + int ruleno; + int r[10]; + + int uw[10] = { 1000, 1000, 500, 1000, 2000, 1000, 1000, 3000, 1000, 500 }; + + struct crush_bucket *b; + struct crush_rule *rule; + + struct crush_map *map = crush_create(); + + d = 0; + for (i=0; i<10; i++) { + for (j=0; j<10; j++) + o[j] = d++; + b = (struct crush_bucket*)crush_make_uniform_bucket(1, 10, o, uw[i]); + sub[i] = crush_add_bucket(map, b); + subw[i] = b->weight; + printf("make bucket %d weight %d\n", sub[i], subw[i]); + } + + root = crush_add_bucket(map, (struct crush_bucket*)crush_make_tree_bucket(2, 10, sub, subw)); + + rule = crush_make_rule(); + crush_rule_add_step(rule, CRUSH_RULE_TAKE, root, 0); + crush_rule_add_step(rule, CRUSH_RULE_CHOOSE_FIRSTN, 3, 1); + crush_rule_add_step(rule, CRUSH_RULE_CHOOSE_FIRSTN, 1, 0); + crush_rule_add_step(rule, CRUSH_RULE_EMIT, 0, 0); + ruleno = crush_add_rule(map, -1, rule); + + crush_finalize(map); + printf("built\n"); + + /* test */ + memset(o, 0, 100*sizeof(o[0])); + for (i=0; i<1000000; i++) { + crush_do_rule(map, ruleno, i, r, 3, -1); + /*printf("%d %d %d\n", r[0], r[1], r[2]);*/ + for (j=0; j<3; j++) + o[r[j]]++; + } + + for (i=0; i<100; i += 10) + printf("%2d : %d\n", i, o[i]); + + return 0; +} diff --git a/trunk/ceph/crush2/types.h b/trunk/ceph/crush/types.h similarity index 100% rename from trunk/ceph/crush2/types.h rename to trunk/ceph/crush/types.h diff --git a/trunk/ceph/crush2/buckets.c b/trunk/ceph/crush2/buckets.c deleted file mode 100644 index 557dc68d00778..0000000000000 --- a/trunk/ceph/crush2/buckets.c +++ /dev/null @@ -1,82 +0,0 @@ - -#include "hash.h" -#include "buckets.h" - -int -crush_bucket_uniform_choose(struct crush_bucket_uniform *bucket, int x, int r) -{ - unsigned o, p, s; - o = crush_hash32_2(x, bucket->h.id); - p = bucket->primes[crush_hash32_2(bucket->h.id, x) % bucket->h.size]; - s = (x + o + (r+1)*p) % bucket->h.size; - return bucket->h.items[s]; -} - -int -crush_bucket_list_choose(struct crush_bucket_list *bucket, int x, int r) -{ - int i; - __u64 w; - - for (i=0; ih.size; i++) { - w = crush_hash32_4(x, bucket->h.items[i], r, bucket->h.id) & 0xffff; - w = (w * bucket->sum_weights[i]) >> 32; - if (w < bucket->item_weights[i]) - return bucket->h.items[i]; - } - - BUG_ON(1); - return 0; -} - -int -crush_bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r) -{ - return 0; -} - -int -crush_bucket_straw_choose(struct crush_bucket_straw *bucket, int x, int r) -{ - int i; - int high = 0; - unsigned high_draw = 0; - __u64 draw; - - for (i=0; ih.size; i++) { - draw = (crush_hash32_3(x, bucket->h.items[i], r) & 0xffff) * bucket->straws[i]; - draw = draw >> 32; - if (i == 0 || draw > high_draw) { - high = i; - high_draw = draw; - } - } - - return high; -} - - - -void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) -{ - free(b->primes); - free(b->h.items); -} - -void crush_destroy_bucket_list(struct crush_bucket_list *b) -{ - free(b->item_weights); - free(b->sum_weights); - free(b->h.items); -} - -void crush_destroy_bucket_tree(struct crush_bucket_tree *b) -{ - free(b->h.items); -} - -void crush_destroy_bucket_straw(struct crush_bucket_straw *b) -{ - free(b->straws); - free(b->h.items); -} diff --git a/trunk/ceph/crush2/buckets.h b/trunk/ceph/crush2/buckets.h deleted file mode 100644 index aab536b28b2f3..0000000000000 --- a/trunk/ceph/crush2/buckets.h +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef _CRUSH_BUCKETS_H -#define _CRUSH_BUCKETS_H - -#include "types.h" - -enum { - CRUSH_BUCKET_UNIFORM = 1, - CRUSH_BUCKET_LIST = 2, - CRUSH_BUCKET_TREE = 3, - CRUSH_BUCKET_STRAW = 4 -}; - -struct crush_bucket { - __u32 id; - __u32 type; - __u32 weight; /* 16-bit fixed point */ - __u32 size; /* num items */ - __s32 *items; -}; - -struct crush_bucket_uniform { - struct crush_bucket h; - __u32 item_weight; /* 16-bit fixed point */ - __u32 *primes; -}; - -struct crush_bucket_list { - struct crush_bucket h; - __u32 *item_weights; /* 16-bit fixed point */ - __u32 *sum_weights; /* 16-bit fixed point. element i is sum of weights 0..i, inclusive */ -}; - -struct crush_bucket_tree { - struct crush_bucket h; - -}; - -struct crush_bucket_straw { - struct crush_bucket h; - __u32 *straws; /* 16-bit fixed point */ -}; - -extern int crush_bucket_uniform_choose(struct crush_bucket_uniform *bucket, int x, int r); -extern int crush_bucket_list_choose(struct crush_bucket_list *bucket, int x, int r); -extern int crush_bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r); -extern int crush_bucket_straw_choose(struct crush_bucket_straw *bucket, int x, int r); - -extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *); -extern void crush_destroy_bucket_list(struct crush_bucket_list *); -extern void crush_destroy_bucket_tree(struct crush_bucket_tree *); -extern void crush_destroy_bucket_straw(struct crush_bucket_straw *); - -#endif diff --git a/trunk/ceph/crush2/crush.h b/trunk/ceph/crush2/crush.h deleted file mode 100644 index 9dd5a003bd591..0000000000000 --- a/trunk/ceph/crush2/crush.h +++ /dev/null @@ -1,55 +0,0 @@ -#ifndef _CRUSH_CRUSH_H -#define _CRUSH_CRUSH_H - -#include "types.h" -#include "buckets.h" - -enum { - CRUSH_RULE_TAKE, - CRUSH_RULE_CHOOSE_FIRSTN, - CRUSH_RULE_CHOOSE_INDEP, - CRUSH_RULE_EMIT -}; - -#define CRUSH_MAX_DEPTH 10 -#define CRUSH_MAX_SET 10 - -struct crush_rule_step { - __u32 op; - __s32 arg1; - __s32 arg2; -}; - -struct crush_rule { - __u32 len; - struct crush_rule_step *steps; -}; - -struct crush_map { - struct crush_bucket **buckets; - struct crush_rule **rules; - - /* parent pointers */ - __u32 *bucket_parents; - __u32 *device_parents; - - /* offload - * size max_devices, values 0...0xffff - * 0 == normal - * 0x10000 == 100% offload (i.e. failed) - */ - __u32 *device_offload; - - __u32 max_buckets; - __u32 max_rules; - __u32 max_devices; -}; - -extern int crush_do_rule(struct crush_map *map, - int ruleno, - int x, int *result, int result_max, - int forcefeed); /* -1 for none */ - -/*extern int crush_decode(struct crush_map *map, struct ceph_bufferlist *bl);*/ - -#endif diff --git a/trunk/ceph/mon/OSDMonitor.cc b/trunk/ceph/mon/OSDMonitor.cc index 200187510f698..8c8fb91b2b18c 100644 --- a/trunk/ceph/mon/OSDMonitor.cc +++ b/trunk/ceph/mon/OSDMonitor.cc @@ -18,6 +18,8 @@ #include "MonitorStore.h" +#include "crush/CrushWrapper.h" + #include "messages/MOSDFailure.h" #include "messages/MOSDMap.h" #include "messages/MOSDGetMap.h" @@ -158,49 +160,55 @@ void OSDMonitor::create_initial() } -void OSDMonitor::build_crush_map(Crush& crush, +void OSDMonitor::build_crush_map(CrushWrapper& crush, map& weights) { + // new + crush.create(); if (g_conf.num_osd >= 12) { int ndom = g_conf.osd_max_rep; - UniformBucket *domain[ndom]; - int domid[ndom]; - for (int i=0; iadd_item(i, weights[i] ? weights[i]:1.0); - //derr(0) << "osd" << i << " in domain " << dom << dendl; - i++; - if (i == g_conf.num_osd) break; + + int o = 0; + for (int i=0; iget_weight() << dendl; - root->add_item(domid[i], domain[i]->get_weight()); - } - int nroot = crush.add_bucket(root); + crush_bucket_list *root = crush_make_list_bucket(2, ndom, ritems, rweights); + int rootid = crush_add_bucket(crush.map, (crush_bucket*)root); // rules // replication for (int i=1; i<=ndom; i++) { - int r = CRUSH_REP_RULE(i); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 1)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 1, 0)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + crush_rule *rule = crush_make_rule(); + crush_rule_add_step(rule, CRUSH_RULE_TAKE, rootid, 0); + crush_rule_add_step(rule, CRUSH_RULE_CHOOSE_FIRSTN, i, 1); + crush_rule_add_step(rule, CRUSH_RULE_CHOOSE_FIRSTN, 1, 0); + crush_rule_add_step(rule, CRUSH_RULE_EMIT, 0, 0); + crush_add_rule(crush.map, CRUSH_REP_RULE(i), rule); } + /* // raid for (int i=g_conf.osd_min_raid_width; i <= g_conf.osd_max_raid_width; i++) { int r = CRUSH_RAID_RULE(i); @@ -215,6 +223,7 @@ void OSDMonitor::build_crush_map(Crush& crush, crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); } } + */ // test //vector out; @@ -222,20 +231,24 @@ void OSDMonitor::build_crush_map(Crush& crush, } else { // one bucket - Bucket *b = new UniformBucket(1, 0); - int root = crush.add_bucket(b); - for (int i=0; iadd_item(i, weights[i] ? weights[i]:1.0); - } + + int items[g_conf.num_osd]; + for (int i=0; imax_devices << dendl; + //vector t; + //crush.do_rule(2, 132, t, 4, -1); } @@ -420,7 +438,7 @@ bool OSDMonitor::should_propose(double& delay) if (pending_inc.new_up.size() == osdmap.get_osds().size()) { delay = 0.0; if (g_conf.osd_auto_weight) { - Crush crush; + CrushWrapper crush; build_crush_map(crush, osd_weight); crush._encode(pending_inc.crush); } diff --git a/trunk/ceph/mon/OSDMonitor.h b/trunk/ceph/mon/OSDMonitor.h index afdd625ae04aa..c22c007f2d9b6 100644 --- a/trunk/ceph/mon/OSDMonitor.h +++ b/trunk/ceph/mon/OSDMonitor.h @@ -43,7 +43,7 @@ private: map osd_weight; - void build_crush_map(Crush& crush, + void build_crush_map(CrushWrapper& crush, map& weights); // svc diff --git a/trunk/ceph/osd/OSDMap.h b/trunk/ceph/osd/OSDMap.h index fda57d73ef99e..2b476e0456168 100644 --- a/trunk/ceph/osd/OSDMap.h +++ b/trunk/ceph/osd/OSDMap.h @@ -28,8 +28,7 @@ #include "common/Mutex.h" #include "common/Clock.h" -#include "crush/crush.h" -using namespace crush; +#include "crush/CrushWrapper.h" #include #include @@ -144,7 +143,7 @@ private: map osd_inst; public: - Crush crush; // hierarchical map + CrushWrapper crush; // hierarchical map friend class OSDMonitor; friend class MDS; @@ -207,9 +206,14 @@ private: void mark_down(int o, bool clean) { down_osds[o] = clean; } void mark_up(int o) { down_osds.erase(o); } - void mark_out(int o) { out_osds.insert(o); } - void mark_in(int o) { out_osds.erase(o); } - + void mark_out(int o) { + out_osds.insert(o); + crush.update_offload_map(out_osds, overload_osds); + } + void mark_in(int o) { + out_osds.erase(o); + crush.update_offload_map(out_osds, overload_osds); + } void apply_incremental(Incremental &inc) { assert(inc.epoch == epoch+1); @@ -223,8 +227,8 @@ private: return; } if (inc.crush.length()) { - int off = 0; - crush._decode(inc.crush, off); + bufferlist::iterator blp = inc.crush.begin(); + crush._decode(blp); } // nope, incremental. @@ -272,6 +276,8 @@ private: i++) { overload_osds[i->first] = i->second; } + + crush.update_offload_map(out_osds, overload_osds); } // serialize, unserialize @@ -288,7 +294,9 @@ private: ::_encode(overload_osds, blist); ::_encode(osd_inst, blist); - crush._encode(blist); + bufferlist cbl; + crush._encode(cbl); + ::_encode(cbl, blist); } void decode(bufferlist& blist) { @@ -306,7 +314,12 @@ private: ::_decode(overload_osds, blist, off); ::_decode(osd_inst, blist, off); - crush._decode(blist, off); + bufferlist cbl; + ::_decode(cbl, blist, off); + bufferlist::iterator cblp = cbl.begin(); + crush._decode(cblp); + + crush.update_offload_map(out_osds, overload_osds); } @@ -320,8 +333,6 @@ private: } ObjectLayout make_object_layout(object_t oid, int pg_type, int pg_size, int preferred=-1, int object_stripe_unit = 0) { - static crush::Hash H(777); - int num = preferred >= 0 ? localized_pg_num:pg_num; int num_mask = preferred >= 0 ? localized_pg_num_mask:pg_num_mask; @@ -334,14 +345,14 @@ private: case CEPH_OBJECT_LAYOUT_HASHINO: //ps = stable_mod(oid.bno + H(oid.bno+oid.ino)^H(oid.ino>>32), num, num_mask); - ps = stable_mod(oid.bno + H(oid.ino)^H(oid.ino>>32), num, num_mask); + ps = stable_mod(oid.bno + crush_hash32_2(oid.ino, oid.ino>>32), num, num_mask); break; case CEPH_OBJECT_LAYOUT_HASH: //ps = stable_mod(H( (oid.bno & oid.ino) ^ ((oid.bno^oid.ino) >> 32) ), num, num_mask); //ps = stable_mod(H(oid.bno) + H(oid.ino)^H(oid.ino>>32), num, num_mask); //ps = stable_mod(oid.bno + H(oid.bno+oid.ino)^H(oid.bno+oid.ino>>32), num, num_mask); - ps = stable_mod(oid.bno + H(oid.ino)^H(oid.ino>>32), num, num_mask); + ps = stable_mod(oid.bno + crush_hash32_2(oid.ino, oid.ino>>32), num, num_mask); break; default: @@ -374,10 +385,9 @@ private: if (pg.preferred() >= 0 && out_osds.count(pg.preferred()) == 0) forcefeed = pg.preferred(); - crush.do_rule(crush.rules[rule], + crush.do_rule(rule, pg.ps(), - osds, - out_osds, overload_osds, + osds, pg.size(), forcefeed); } break; @@ -389,8 +399,7 @@ private: case CEPH_PG_LAYOUT_HYBRID: { - static crush::Hash H(777); - int h = H(pg.ps()); + int h = crush_hash32(pg.ps()); for (int i=0; i +#include using namespace __gnu_cxx; diff --git a/trunk/ceph/osd/osd_types.h b/trunk/ceph/osd/osd_types.h index 24dd9eca74234..0ae9d0831b0d7 100644 --- a/trunk/ceph/osd/osd_types.h +++ b/trunk/ceph/osd/osd_types.h @@ -74,8 +74,8 @@ typedef uint8_t pruleset_t; // crush rule ids -#define CRUSH_REP_RULE(nrep) (100+nrep) // replication -#define CRUSH_RAID_RULE(num) (200+num) // raid +#define CRUSH_REP_RULE(nrep) (nrep) // replication +#define CRUSH_RAID_RULE(num) (10+num) // raid -- 2.39.5