From 69c3761d655cabebd6531cc7fc56da588b07eb61 Mon Sep 17 00:00:00 2001 From: sage Date: Fri, 16 Sep 2005 05:22:25 +0000 Subject: [PATCH] *** empty log message *** git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@495 29311d96-e01e-0410-9327-a35deaab8ce9 --- ceph/TODO | 53 +++++++++++-- ceph/crush/Bucket.h | 141 ++++++++++++++++++++++---------- ceph/crush/Hash.h | 50 ++++++++++++ ceph/crush/crush.h | 172 ++++++++++++++++++++++++++++++++++++++++ ceph/test/testbucket.cc | 59 ++++++++++++++ ceph/test/testtree.cc | 45 +++++++++++ 6 files changed, 472 insertions(+), 48 deletions(-) create mode 100644 ceph/crush/Hash.h create mode 100644 ceph/crush/crush.h create mode 100644 ceph/test/testbucket.cc create mode 100644 ceph/test/testtree.cc diff --git a/ceph/TODO b/ceph/TODO index f3fce49a633b9..86dbe8a94616c 100644 --- a/ceph/TODO +++ b/ceph/TODO @@ -85,16 +85,59 @@ struct bucket { +struct bucket { + int id; + int type; // disk, shelf, etc. + vector contents; + float fixed_weight; // homogenous bucket, or + map weight; // mixed bucket +} + +give each bucket "type" has a numerical id, whatever you want. eg + +0 disk +1 shelf +2 cabinet +3 row +4 room + +take(root) +choose(2, room) +take_vert +choose(2, cab) +take_vert +choose(1, disk) + + +mirror(2): +choose(1, room, root) -> [ room1 ] +choose(2, cabinet, _) -> [ cab3 cab2 ] +choose(1, disk, _) -> [ somediskincab3or2, someotherdiskincab3or2 ] + +mirror(4): +choose(2, room, root) -> [ room1 room2 ] +vert(_) -> [ room1; room2 ] +choose(2, cabinet, _) -> [ cab11 cab12; cab21 cab22 ] +vert(_) -> [ cab11; cab12; cab21; cab22 ] +choose(1, disk, _) -> [ disk; disk; disk; disk ] +horiz(_) -> [ disk disk disk disk ] + +two goals: + - separate replicas into failure domains... "floor" in hierarchy + - limit replica set to a single domain... "ceiling" in hierarchy + +additional tricks: + - explicitly choose disks from different storage pools + +mirror(3) +choose(2, disk, primary) -> [ d1 d2 ] // fast scsi (or use more complex chooser as above.) +choose(1, disk, tertiary) -> [ d1 d2 d3 ] // ide +results are appended to _, unless _ is used as an input, in which case it is cleared. -p 0 disk -t 1 shelf -t 2 cabinet -t 3 row -t 4 room R(5, 0, [ s1 s2 s3 s4 ... ]) -> [ d1 d2 d3 d4 ] = 5 disks diff --git a/ceph/crush/Bucket.h b/ceph/crush/Bucket.h index 5737a44156e9e..448b00b2393e7 100644 --- a/ceph/crush/Bucket.h +++ b/ceph/crush/Bucket.h @@ -1,81 +1,117 @@ #ifndef __crush_BUCKET_H #define __crush_BUCKET_H +#include "BinaryTree.h" +#include "Hash.h" + +#include +#include +using namespace std; + namespace crush { + /** abstract bucket **/ class Bucket { protected: int id; int type; float weight; - vector items; public: - Bucket(int _id) : id(_id), weight(0) { } - - int get_id() { return id; } - int get_type() { return type; } - float get_weight() { return weight; } - vector get_items() { return items; } - int get_size() { return items.size(); } + Bucket(int _id, + int _type, + float _weight) : + id(_id), + type(_type), + weight(_weight) { } + + int get_id() { return id; } + int get_type() { return type; } + float get_weight() { return weight; } + virtual int get_size() = 0; - float get_item_weight() = 0; - bool is_uniform() = 0; - float calc_weight() = 0; + void set_weight(float w) { weight = w; } - - void choose_r(int r, int type, vector& in, vector& out) = 0; + virtual bool is_uniform() = 0; + virtual int choose_r(int x, int r, Hash& h) = 0; }; - - class UniformBucket : public Bucket { void choose_r(int r, int type, vector& in, vector& out) = 0; + /** uniform bucket **/ + class UniformBucket : public Bucket { protected: + public: + vector items; + int item_type; float item_weight; + + vector primes; + public: - float get_item_weight(int i) { return item_weight; } + UniformBucket(int _id, int _type, int _item_type, + float _item_weight, vector& _items) : + Bucket(_id, _type, _item_weight*_items.size()), + item_type(_item_type), + item_weight(_item_weight) { + items = _items; + } + + int get_size() { return items.size(); } + //float get_item_weight(int item) { return item_weight; } bool is_uniform() { return true; } - float calc_weight() { - if (item.empty()) - weight = 0; - else - weight = items.size() * get_item_weight(item[0]); - return weight; + int get_prime(int j) { + return primes[ j % primes.size() ]; } - int choose_r(pg_t x, int r, Hash& h) { - assert(type == get_type()); - if (r >= get_size()) cour << "warning: r " << r << " >= " << get_size() << " uniformbucket.size" << endl; + void make_primes(Hash& h) { + // start with odd number > num_items + int x = items.size() + 1 + h(get_id()) % items.size(); + x |= 1; // odd + + while (primes.size() < items.size()) { + int j; + for (j=2; j*j<=x; j++) + if (x % j == 0) break; + if (j*j > x) { + primes.push_back(x); + } + x += 2; + } + } + + int choose_r(int x, int r, Hash& h) { + //if (r >= get_size()) cout << "warning: r " << r << " >= " << get_size() << " uniformbucket.size" << endl; - int v = h(x, r, get_id(), 1) * get_size(); - int pmin = get_weight(); - int p = 0; // choose a prime based on hash(x, r, get_id(), 2) - //int s = x + v + r *p(? % get_size()) - //return get_item[s % get_size()]; + int v = (h(x, get_id(), 1) % get_size()) * get_size(); + int p = get_prime( h(x, 2) ); // choose a prime based on hash(x, get_id(), 2) + int s = (x + v + (r+1)*p) % get_size(); + return items[s]; } }; - // mixed bucket.. RUSH_T + // mixed bucket, based on RUSH_T type binary tree - class MixedBucket : public Bucket { protected: vector item_weight; + public: BinaryTree tree; map node_map; // node id -> item public: - float get_item_weight(int i) { - return item_weight[i]; + MixedBucket(int _id, int _type) : Bucket(_id, _type, 0) { } - bool is_uniform() { return false; } + //float get_item_weight(int i) { return item_weight[i]; } + bool is_uniform() { return false; } + int get_size() { return node_map.size(); } + /* float calc_weight() { weight = 0; for (unsigned i=0; i& items) { + /* + void make_new_tree(vector& _items) { + assert(items.empty()); assert(tree.empty()); + items = _items; + for (unsigned i=0; i +using namespace __gnu_cxx; + +namespace crush { + + class Hash { + hash H; + int seed; + public: + Hash(int s) : seed( myhash(s) ) {} + + unsigned myhash(unsigned n) { + // ethan's rush hash? + if (0) + return (n ^ 0xdead1234) * (884811920 * 3 + 1); + + // RS Hash function, from Robert Sedgwicks Algorithms in C book, w/ some changes. + if (1) { + unsigned int b = 378551; + unsigned int a = 63689; + unsigned int hash = 0; + + for(unsigned int i=0; i<4; i++) + { + hash = hash * a + (n&0xff); + a = a * b; + n = n >> 8; + } + + return (hash & 0x7FFFFFFF); + } + + } + + + + + int operator()(int a) { + return myhash(a) ^ seed; + } + int operator()(int a, int b) { + return myhash(a) ^ myhash(b) ^ seed; + } + int operator()(int a, int b, int c) { + return myhash(a) ^ myhash(b) ^ myhash(c) ^ seed; + } + }; + +} diff --git a/ceph/crush/crush.h b/ceph/crush/crush.h new file mode 100644 index 0000000000000..53e49b4de509a --- /dev/null +++ b/ceph/crush/crush.h @@ -0,0 +1,172 @@ +#ifndef __crush_CRUSH_H +#define __crush_CRUSH_H + +namespace crush { + +#define CRUSH_RULE_TAKE 0 +#define CRUSH_RULE_CHOOSE 1 +#define CRUSH_RULE_VERT 2 + + + class RuleStep { + int cmd; + list args; + }; + + class Rule { + list< RuleStatement > steps; + }; + + + + class Crush { + protected: + map buckets; + map rules; + + public: + + void do_rule(int rno) { + assert(rules.count(rno)); + Rule& r = rules[rno]; + + // working variable + list< Bucket* > in; + list< vector > out; + + list< Bucket > temp_buckets; + + // go through each statement + for (list::iterator pc = r.steps.begin(); + pc != r.steps.end(); + pc++) { + // move input? + + // do it + switch (pc->cmd) { + case CRUSH_RULE_TAKE: + { + in.clear(); + temp_buckets.clear(); + + int arg = r.args[0]; + if (arg == 0) { + // input is old output + + for (list< vector >::iterator row = out.begin(); + row != out.end(); + row++) { + + if (row->size() == 1) { + in.push_back( &buckets[ (*row)[0] ] ); + } else { + // make a temp bucket! + temp_buckets.push_back( MixedBucket( rno, -1 ) ); + in.push_back( &temp_buckets.back() ); + + // put everything in. + temp_buckets.back().make_new_tree( *row ); + } + } + + // clear output variable + out.clear(); + + } else { + // input is some explicit existing bucket + assert(buckets.count(arg)); + + // match rows in output? + for (list< vector >::iterator row = out.begin(); + row != out.end(); + row++) + in.push_back( &buckets[arg] ); + } + + } + break; + + case CRUSH_RULE_VERT: + { + in.clear(); + temp_buckets.clear(); + + // input is (always) old output + + for (list< vector >::iterator row = out.begin(); + row != out.end(); + row++) { + for (int i=0; isize(); i++) { + in.push_back( &buckets[ (*row)[i] ] ); + } + } + }s.front(); + break; + + case CRUSH_RULE_CHOOSE: + { + int num = r.args[0]; + int type = r.args[1]; + + // reset output + out.clear(); + + // do each row independently + for (list< Bucket* >::iterator row = in.begin(); + row != in.end(); + row++) { + // make new output row + out.push_back( vector ); + vector& outrow = out.back(); + + // for each replica + for (int r=0; ris_uniform() && + ((UniformBucket*)b)->get_item_type() == type) + break; + + int next = b->choose_r(x, r, h); + int itemtype = 0; // 0 is a terminal type + if (buckets.count(next)) { + b = &buckets[next]; + itemtype = b->get_type(); + } + if (itemtype == type) break; // this is what we want! + } + + // choose in this bucket! + int item = b->choose_r(x, r, h); + outrow.push_back(item); + } + } + } + break; + + default: + assert(0); + } + + + } + + // assemble result + vector result; + list< vector >::iterator row = out.begin(); + result.swap( *row ); + while (row != out.end()) { + result.append( result.end(), row->begin(), row->end() ); + } + + + } + + } + +} + +#endif diff --git a/ceph/test/testbucket.cc b/ceph/test/testbucket.cc new file mode 100644 index 0000000000000..ed31523bbe092 --- /dev/null +++ b/ceph/test/testbucket.cc @@ -0,0 +1,59 @@ + + +#include "../crush/Bucket.h" +using namespace crush; + +#include +#include +using namespace std; + + +ostream& operator<<(ostream& out, vector& v) +{ + out << "["; + for (int i=0; i disks; + for (int i=0; i<20; i++) + disks.push_back(i); + + /* + UniformBucket b(10, 1, 0, 10, disks); + b.make_primes(h); + cout << "primes are " << ub.primes << endl; + */ + MixedBucket b(1, 1); + for (int i=0; i<20; i++) + b.add_item(i,20); + cout << b.tree << endl; + + vector ocount(disks.size()); + int numrep = 3; + + vector v(numrep); + for (int x=1; x<1000000; x++) { + //cout << H(x) << "\t" << h(x) << endl; + for (int i=0; i +#include +using namespace std; + +int main() +{ + BinaryTree t; + + vector nodes; + + for (int i=0; i<30; i++) { + cout << "adding " << i << endl; + int n = t.add_node(1); + nodes.push_back(n); + } + cout << t << endl; + + for (int k=0; k<10000; k++) { + if (rand() % 2) { + cout << "adding" << endl; + nodes.push_back( t.add_node(1) ); + } else { + if (!nodes.empty()) { + //for (int i=0; i