+struct bucket {
+ int id;
+ int type; // disk, shelf, etc.
+ vector<int> contents;
+ float fixed_weight; // homogenous bucket, or
+ map<int, float> weight; // mixed bucket
+}
+
+give each bucket "type" has a numerical id, whatever you want. eg
+
+0 disk
+1 shelf
+2 cabinet
+3 row
+4 room
+
+take(root)
+choose(2, room)
+take_vert
+choose(2, cab)
+take_vert
+choose(1, disk)
+
+
+mirror(2):
+choose(1, room, root) -> [ room1 ]
+choose(2, cabinet, _) -> [ cab3 cab2 ]
+choose(1, disk, _) -> [ somediskincab3or2, someotherdiskincab3or2 ]
+
+mirror(4):
+choose(2, room, root) -> [ room1 room2 ]
+vert(_) -> [ room1; room2 ]
+choose(2, cabinet, _) -> [ cab11 cab12; cab21 cab22 ]
+vert(_) -> [ cab11; cab12; cab21; cab22 ]
+choose(1, disk, _) -> [ disk; disk; disk; disk ]
+horiz(_) -> [ disk disk disk disk ]
+
+two goals:
+ - separate replicas into failure domains... "floor" in hierarchy
+ - limit replica set to a single domain... "ceiling" in hierarchy
+
+additional tricks:
+ - explicitly choose disks from different storage pools
+
+mirror(3)
+choose(2, disk, primary) -> [ d1 d2 ] // fast scsi (or use more complex chooser as above.)
+choose(1, disk, tertiary) -> [ d1 d2 d3 ] // ide
+results are appended to _, unless _ is used as an input, in which case it is cleared.
-p 0 disk
-t 1 shelf
-t 2 cabinet
-t 3 row
-t 4 room
R(5, 0, [ s1 s2 s3 s4 ... ]) -> [ d1 d2 d3 d4 ]
= 5 disks
#ifndef __crush_BUCKET_H
#define __crush_BUCKET_H
+#include "BinaryTree.h"
+#include "Hash.h"
+
+#include <vector>
+#include <map>
+using namespace std;
+
namespace crush {
+ /** abstract bucket **/
class Bucket {
protected:
int id;
int type;
float weight;
- vector<int> items;
public:
- Bucket(int _id) : id(_id), weight(0) { }
-
- int get_id() { return id; }
- int get_type() { return type; }
- float get_weight() { return weight; }
- vector<int> get_items() { return items; }
- int get_size() { return items.size(); }
+ Bucket(int _id,
+ int _type,
+ float _weight) :
+ id(_id),
+ type(_type),
+ weight(_weight) { }
+
+ int get_id() { return id; }
+ int get_type() { return type; }
+ float get_weight() { return weight; }
+ virtual int get_size() = 0;
- float get_item_weight() = 0;
- bool is_uniform() = 0;
- float calc_weight() = 0;
+ void set_weight(float w) { weight = w; }
-
- void choose_r(int r, int type, vector<int>& in, vector<int>& out) = 0;
+ virtual bool is_uniform() = 0;
+ virtual int choose_r(int x, int r, Hash& h) = 0;
};
-
- class UniformBucket : public Bucket { void choose_r(int r, int type, vector<int>& in, vector<int>& out) = 0;
+ /** uniform bucket **/
+ class UniformBucket : public Bucket {
protected:
+ public:
+ vector<int> items;
+ int item_type;
float item_weight;
+
+ vector<int> primes;
+
public:
- float get_item_weight(int i) { return item_weight; }
+ UniformBucket(int _id, int _type, int _item_type,
+ float _item_weight, vector<int>& _items) :
+ Bucket(_id, _type, _item_weight*_items.size()),
+ item_type(_item_type),
+ item_weight(_item_weight) {
+ items = _items;
+ }
+
+ int get_size() { return items.size(); }
+ //float get_item_weight(int item) { return item_weight; }
bool is_uniform() { return true; }
- float calc_weight() {
- if (item.empty())
- weight = 0;
- else
- weight = items.size() * get_item_weight(item[0]);
- return weight;
+ int get_prime(int j) {
+ return primes[ j % primes.size() ];
}
- int choose_r(pg_t x, int r, Hash& h) {
- assert(type == get_type());
- if (r >= get_size()) cour << "warning: r " << r << " >= " << get_size() << " uniformbucket.size" << endl;
+ void make_primes(Hash& h) {
+ // start with odd number > num_items
+ int x = items.size() + 1 + h(get_id()) % items.size();
+ x |= 1; // odd
+
+ while (primes.size() < items.size()) {
+ int j;
+ for (j=2; j*j<=x; j++)
+ if (x % j == 0) break;
+ if (j*j > x) {
+ primes.push_back(x);
+ }
+ x += 2;
+ }
+ }
+
+ int choose_r(int x, int r, Hash& h) {
+ //if (r >= get_size()) cout << "warning: r " << r << " >= " << get_size() << " uniformbucket.size" << endl;
- int v = h(x, r, get_id(), 1) * get_size();
- int pmin = get_weight();
- int p = 0; // choose a prime based on hash(x, r, get_id(), 2)
- //int s = x + v + r *p(? % get_size())
- //return get_item[s % get_size()];
+ int v = (h(x, get_id(), 1) % get_size()) * get_size();
+ int p = get_prime( h(x, 2) ); // choose a prime based on hash(x, get_id(), 2)
+ int s = (x + v + (r+1)*p) % get_size();
+ return items[s];
}
};
- // mixed bucket.. RUSH_T
+ // mixed bucket, based on RUSH_T type binary tree
-
class MixedBucket : public Bucket {
protected:
vector<float> item_weight;
+ public:
BinaryTree tree;
map<int,int> node_map; // node id -> item
public:
- float get_item_weight(int i) {
- return item_weight[i];
+ MixedBucket(int _id, int _type) : Bucket(_id, _type, 0) {
}
- bool is_uniform() { return false; }
+ //float get_item_weight(int i) { return item_weight[i]; }
+ bool is_uniform() { return false; }
+ int get_size() { return node_map.size(); }
+ /*
float calc_weight() {
weight = 0;
for (unsigned i=0; i<items.size(); i++) {
}
return weight;
}
+ */
-
- void make_new_tree(vector<int>& items) {
+ /*
+ void make_new_tree(vector<int>& _items) {
+ assert(items.empty());
assert(tree.empty());
+ items = _items;
+
for (unsigned i=0; i<items.size(); i++) {
int n = tree.add_node(item_weight[i]);
node_map[n] = items[i];
}
+
+ //calc_weight();
}
+ */
- int choose_r(pg_t x, int r, Hash& h) {
+ void add_item(int item, float w) {
+ int n = tree.add_node(w);
+ node_map[n] = item;
+ weight += w;
+ }
+
+ int choose_r(int x, int r, Hash& h) {
int n = tree.root();
while (!tree.terminal(n)) {
- float f = h(x, n, r, 0);
- if (tree.weight(tree.left(n)) <= f)
- node = tree.left(n);
+ // pick a point in [0,w)
+ float w = tree.weight(n);
+ float f = (float)(h(x, n, r) % 1000) * w / 1000.0;
+
+ // left or right?
+ int l = tree.left(n);
+ if (tree.exists(l) &&
+ f < tree.weight(l))
+ n = l;
else
- node = tree.right(n);
+ n = tree.right(n);
}
-
+ assert(node_map.count(n));
return node_map[n];
}
--- /dev/null
+
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+namespace crush {
+
+ class Hash {
+ hash<int> H;
+ int seed;
+ public:
+ Hash(int s) : seed( myhash(s) ) {}
+
+ unsigned myhash(unsigned n) {
+ // ethan's rush hash?
+ if (0)
+ return (n ^ 0xdead1234) * (884811920 * 3 + 1);
+
+ // RS Hash function, from Robert Sedgwicks Algorithms in C book, w/ some changes.
+ if (1) {
+ unsigned int b = 378551;
+ unsigned int a = 63689;
+ unsigned int hash = 0;
+
+ for(unsigned int i=0; i<4; i++)
+ {
+ hash = hash * a + (n&0xff);
+ a = a * b;
+ n = n >> 8;
+ }
+
+ return (hash & 0x7FFFFFFF);
+ }
+
+ }
+
+
+
+
+ int operator()(int a) {
+ return myhash(a) ^ seed;
+ }
+ int operator()(int a, int b) {
+ return myhash(a) ^ myhash(b) ^ seed;
+ }
+ int operator()(int a, int b, int c) {
+ return myhash(a) ^ myhash(b) ^ myhash(c) ^ seed;
+ }
+ };
+
+}
--- /dev/null
+#ifndef __crush_CRUSH_H
+#define __crush_CRUSH_H
+
+namespace crush {
+
+#define CRUSH_RULE_TAKE 0
+#define CRUSH_RULE_CHOOSE 1
+#define CRUSH_RULE_VERT 2
+
+
+ class RuleStep {
+ int cmd;
+ list<int> args;
+ };
+
+ class Rule {
+ list< RuleStatement > steps;
+ };
+
+
+
+ class Crush {
+ protected:
+ map<int, Bucket> buckets;
+ map<int, Rule> rules;
+
+ public:
+
+ void do_rule(int rno) {
+ assert(rules.count(rno));
+ Rule& r = rules[rno];
+
+ // working variable
+ list< Bucket* > in;
+ list< vector<int> > out;
+
+ list< Bucket > temp_buckets;
+
+ // go through each statement
+ for (list<RuleStatement>::iterator pc = r.steps.begin();
+ pc != r.steps.end();
+ pc++) {
+ // move input?
+
+ // do it
+ switch (pc->cmd) {
+ case CRUSH_RULE_TAKE:
+ {
+ in.clear();
+ temp_buckets.clear();
+
+ int arg = r.args[0];
+ if (arg == 0) {
+ // input is old output
+
+ for (list< vector<int> >::iterator row = out.begin();
+ row != out.end();
+ row++) {
+
+ if (row->size() == 1) {
+ in.push_back( &buckets[ (*row)[0] ] );
+ } else {
+ // make a temp bucket!
+ temp_buckets.push_back( MixedBucket( rno, -1 ) );
+ in.push_back( &temp_buckets.back() );
+
+ // put everything in.
+ temp_buckets.back().make_new_tree( *row );
+ }
+ }
+
+ // clear output variable
+ out.clear();
+
+ } else {
+ // input is some explicit existing bucket
+ assert(buckets.count(arg));
+
+ // match rows in output?
+ for (list< vector<int> >::iterator row = out.begin();
+ row != out.end();
+ row++)
+ in.push_back( &buckets[arg] );
+ }
+
+ }
+ break;
+
+ case CRUSH_RULE_VERT:
+ {
+ in.clear();
+ temp_buckets.clear();
+
+ // input is (always) old output
+
+ for (list< vector<int> >::iterator row = out.begin();
+ row != out.end();
+ row++) {
+ for (int i=0; i<row->size(); i++) {
+ in.push_back( &buckets[ (*row)[i] ] );
+ }
+ }
+ }s.front();
+ break;
+
+ case CRUSH_RULE_CHOOSE:
+ {
+ int num = r.args[0];
+ int type = r.args[1];
+
+ // reset output
+ out.clear();
+
+ // do each row independently
+ for (list< Bucket* >::iterator row = in.begin();
+ row != in.end();
+ row++) {
+ // make new output row
+ out.push_back( vector<int> );
+ vector<int>& outrow = out.back();
+
+ // for each replica
+ for (int r=0; r<num; r++) {
+ // start with input bucket
+ Bucket *b = *row;
+
+ // choose through any intervening buckets
+ while (1) {
+ if (b->is_uniform() &&
+ ((UniformBucket*)b)->get_item_type() == type)
+ break;
+
+ int next = b->choose_r(x, r, h);
+ int itemtype = 0; // 0 is a terminal type
+ if (buckets.count(next)) {
+ b = &buckets[next];
+ itemtype = b->get_type();
+ }
+ if (itemtype == type) break; // this is what we want!
+ }
+
+ // choose in this bucket!
+ int item = b->choose_r(x, r, h);
+ outrow.push_back(item);
+ }
+ }
+ }
+ break;
+
+ default:
+ assert(0);
+ }
+
+
+ }
+
+ // assemble result
+ vector<int> result;
+ list< vector<int> >::iterator row = out.begin();
+ result.swap( *row );
+ while (row != out.end()) {
+ result.append( result.end(), row->begin(), row->end() );
+ }
+
+
+ }
+
+ }
+
+}
+
+#endif
--- /dev/null
+
+
+#include "../crush/Bucket.h"
+using namespace crush;
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+ostream& operator<<(ostream& out, vector<int>& v)
+{
+ out << "[";
+ for (int i=0; i<v.size(); i++) {
+ if (i) out << " ";
+ out << v[i];
+ }
+ out << "]";
+ return out;
+}
+
+
+int main()
+{
+ Hash h(73);
+
+ vector<int> disks;
+ for (int i=0; i<20; i++)
+ disks.push_back(i);
+
+ /*
+ UniformBucket b(10, 1, 0, 10, disks);
+ b.make_primes(h);
+ cout << "primes are " << ub.primes << endl;
+ */
+ MixedBucket b(1, 1);
+ for (int i=0; i<20; i++)
+ b.add_item(i,20);
+ cout << b.tree << endl;
+
+ vector<int> ocount(disks.size());
+ int numrep = 3;
+
+ vector<int> v(numrep);
+ for (int x=1; x<1000000; x++) {
+ //cout << H(x) << "\t" << h(x) << endl;
+ for (int i=0; i<numrep; i++) {
+ int d = b.choose_r(x, i, h);
+ v[i] = d;
+ ocount[d]++;
+ }
+ //cout << v << "\t" << endl;//ocount << endl;
+ }
+
+ for (int i=0; i<ocount.size(); i++) {
+ cout << "disk " << i << " has " << ocount[i] << endl;
+ }
+
+}
--- /dev/null
+
+
+#include "../crush/BinaryTree.h"
+using namespace crush;
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+int main()
+{
+ BinaryTree t;
+
+ vector<int> nodes;
+
+ for (int i=0; i<30; i++) {
+ cout << "adding " << i << endl;
+ int n = t.add_node(1);
+ nodes.push_back(n);
+ }
+ cout << t << endl;
+
+ for (int k=0; k<10000; k++) {
+ if (rand() % 2) {
+ cout << "adding" << endl;
+ nodes.push_back( t.add_node(1) );
+ } else {
+ if (!nodes.empty()) {
+ //for (int i=0; i<nodes.size(); i++) {
+ int p = rand() % nodes.size();
+ int n = nodes[p];
+ assert(t.exists(n));
+ cout << "removing " << n << endl;
+ t.remove_node(n);
+
+ for (int j=p; j<nodes.size(); j++)
+ nodes[j] = nodes[j+1];
+ nodes.pop_back();
+ }
+ }
+ cout << t << endl;
+ }
+
+
+}