*** empty log message ***

author sage <sage@29311d96-e01e-0410-9327-a35deaab8ce9>

Fri, 16 Sep 2005 05:22:25 +0000 (05:22 +0000)

committer sage <sage@29311d96-e01e-0410-9327-a35deaab8ce9>

Fri, 16 Sep 2005 05:22:25 +0000 (05:22 +0000)
author sage <sage@29311d96-e01e-0410-9327-a35deaab8ce9>
Fri, 16 Sep 2005 05:22:25 +0000 (05:22 +0000)
committer sage <sage@29311d96-e01e-0410-9327-a35deaab8ce9>
Fri, 16 Sep 2005 05:22:25 +0000 (05:22 +0000)
diff --git a/ceph/TODO b/ceph/TODO

index f3fce49a633b9d936503a77863979ab90c6605ef..86dbe8a94616c4a70650af3106aae4b42f972a5c 100644 (file)
--- a/ceph/TODO
+++ b/ceph/TODO
@@ -85,16 +85,59 @@ struct bucket {
  
  
  
+struct bucket {
+  int             id;
+  int             type;    // disk, shelf, etc.
+  vector<int>     contents;
+  float           fixed_weight;  // homogenous bucket, or
+  map<int, float> weight;        // mixed bucket
+}
+
+give each bucket "type" has a numerical id, whatever you want.  eg
+
+0 disk
+1 shelf
+2 cabinet
+3 row
+4 room
+
+take(root)
+choose(2, room)
+take_vert
+choose(2, cab)
+take_vert
+choose(1, disk)
+
+
+mirror(2):
+choose(1, room, root)       -> [ room1 ]
+choose(2, cabinet, _)       -> [ cab3 cab2 ]
+choose(1, disk, _)          -> [ somediskincab3or2, someotherdiskincab3or2 ]
+
+mirror(4):
+choose(2, room, root)       -> [ room1 room2 ]
+vert(_)                     -> [ room1; room2 ]
+choose(2, cabinet, _)       -> [ cab11 cab12; cab21 cab22 ]
+vert(_)                     -> [ cab11; cab12; cab21; cab22 ]
+choose(1, disk, _)          -> [ disk; disk; disk; disk ]
+horiz(_)                    -> [ disk disk disk disk ]
+
+two goals:
+ - separate replicas into failure domains... "floor" in hierarchy
+ - limit replica set to a single domain... "ceiling" in hierarchy
+
+additional tricks:
+ - explicitly choose disks from different storage pools
+
+mirror(3)
+choose(2, disk, primary)    -> [ d1 d2 ]    // fast scsi (or use more complex chooser as above.)
+choose(1, disk, tertiary)   -> [ d1 d2 d3 ] // ide 
  
+results are appended to _, unless _ is used as an input, in which case it is cleared.
  
  
  
  
-p 0 disk
-t 1 shelf
-t 2 cabinet
-t 3 row
-t 4 room
  
  R(5, 0, [ s1 s2 s3 s4 ... ])  -> [ d1 d2 d3 d4 ]
  = 5 disks
diff --git a/ceph/crush/Bucket.h b/ceph/crush/Bucket.h

index 5737a44156e9ef2cbb8cfd620eda59bb32f45c52..448b00b2393e7f5f32c3b3bcaa3d23fefb1bd2f9 100644 (file)
--- a/ceph/crush/Bucket.h
+++ b/ceph/crush/Bucket.h
@@ -1,81 +1,117 @@
  #ifndef __crush_BUCKET_H
  #define __crush_BUCKET_H
  
+#include "BinaryTree.h"
+#include "Hash.h"
+
+#include <vector>
+#include <map>
+using namespace std;
+
  namespace crush {
  
+  /** abstract bucket **/
  
    class Bucket {
    protected:
         int         id;
         int         type;
         float       weight;
-       vector<int> items;
  
    public:
-       Bucket(int _id) : id(_id), weight(0) { }
-
-       int         get_id() { return id; }
-       int         get_type() { return type; }
-       float       get_weight() { return weight; }
-       vector<int> get_items() { return items; }
-       int         get_size() { return items.size(); }
+       Bucket(int _id,
+                  int _type,
+                  float _weight) :
+         id(_id), 
+         type(_type),
+         weight(_weight) { }
+       
+       int          get_id() { return id; }
+       int          get_type() { return type; }
+       float        get_weight() { return weight; }
+       virtual int  get_size() = 0;
  
-       float       get_item_weight() = 0;
-       bool        is_uniform() = 0;
-       float       calc_weight() = 0;
+       void        set_weight(float w) { weight = w; }
  
-       
-       void choose_r(int r, int type, vector<int>& in, vector<int>& out) = 0;
+       virtual bool is_uniform() = 0;
+       virtual int choose_r(int x, int r, Hash& h) = 0;
  
    };
  
  
- 
-  class UniformBucket : public Bucket {        void choose_r(int r, int type, vector<int>& in, vector<int>& out) = 0;
+  /** uniform bucket **/
+  class UniformBucket : public Bucket {        
    protected:
+  public:
+       vector<int> items;
+       int    item_type;
         float  item_weight;
+
+       vector<int> primes;
+
    public:
-       float       get_item_weight(int i) { return item_weight; }
+       UniformBucket(int _id, int _type, int _item_type,
+                                 float _item_weight, vector<int>& _items) :
+         Bucket(_id, _type, _item_weight*_items.size()),
+         item_type(_item_type),
+         item_weight(_item_weight) {
+         items = _items;
+       }
+
+       int get_size() { return items.size(); }
+       //float       get_item_weight(int item) { return item_weight; }
         bool        is_uniform() { return true; }
  
-       float calc_weight() {
-         if (item.empty())
-               weight = 0;
-         else 
-               weight = items.size() * get_item_weight(item[0]);
-         return weight;
+       int get_prime(int j) {
+         return primes[ j % primes.size() ];
         }
  
-       int choose_r(pg_t x, int r, Hash& h) {
-         assert(type == get_type());
-         if (r >= get_size()) cour << "warning: r " << r << " >= " << get_size() << " uniformbucket.size" << endl;
+       void make_primes(Hash& h) {
+         // start with odd number > num_items
+         int x = items.size() + 1 + h(get_id()) % items.size();
+         x |= 1; // odd
+
+         while (primes.size() < items.size()) {
+               int j;
+               for (j=2; j*j<=x; j++) 
+                 if (x % j == 0) break;
+               if (j*j > x) {
+                 primes.push_back(x);
+               }
+               x += 2;
+         }
+       }
+
+       int choose_r(int x, int r, Hash& h) {
+         //if (r >= get_size()) cout << "warning: r " << r << " >= " << get_size() << " uniformbucket.size" << endl;
           
-         int v = h(x, r, get_id(), 1) * get_size();
-         int pmin = get_weight();
-         int p = 0; // choose a prime based on hash(x, r, get_id(), 2)
-         //int s = x + v + r *p(? % get_size())
-         //return get_item[s % get_size()];
+         int v = (h(x, get_id(), 1) % get_size()) * get_size();
+         int p = get_prime( h(x, 2) );  // choose a prime based on hash(x, get_id(), 2)
+         int s = (x + v + (r+1)*p) % get_size();
+         return items[s];
         }
    };
  
  
  
-  // mixed bucket.. RUSH_T
+  // mixed bucket, based on RUSH_T type binary tree
    
-
    class MixedBucket : public Bucket {
    protected:
         vector<float>  item_weight;
  
+  public:
         BinaryTree     tree;
         map<int,int>   node_map;     // node id -> item
  
    public:
-       float       get_item_weight(int i) {
-         return item_weight[i];
+       MixedBucket(int _id, int _type) : Bucket(_id, _type, 0) {
         }
-       bool        is_uniform() { return false; }
  
+       //float       get_item_weight(int i) { return item_weight[i]; }
+       bool        is_uniform() { return false; }
+       int get_size() { return node_map.size(); }
+       /*
         float calc_weight() {
           weight = 0;
           for (unsigned i=0; i<items.size(); i++) {
@@ -83,27 +119,46 @@ namespace crush {
           }
           return weight;
         }
+       */
  
-
-       void make_new_tree(vector<int>& items) {
+       /*
+       void make_new_tree(vector<int>& _items) {
+         assert(items.empty());
           assert(tree.empty());
           
+         items = _items;
+         
           for (unsigned i=0; i<items.size(); i++) {
                 int n = tree.add_node(item_weight[i]);
                 node_map[n] = items[i];
           }
+
+         //calc_weight();
         }
+       */
  
-       int choose_r(pg_t x, int r, Hash& h) {
+       void add_item(int item, float w) {
+         int n = tree.add_node(w);
+         node_map[n] = item;
+         weight += w;
+       }
+
+       int choose_r(int x, int r, Hash& h) {
           int n = tree.root();
           while (!tree.terminal(n)) {
-               float f = h(x, n, r, 0);
-               if (tree.weight(tree.left(n)) <= f)
-                 node = tree.left(n);
+               // pick a point in [0,w)
+               float w = tree.weight(n);
+               float f = (float)(h(x, n, r) % 1000) * w / 1000.0;
+
+               // left or right?
+               int l = tree.left(n);
+               if (tree.exists(l) && 
+                       f < tree.weight(l))
+                 n = l;
                 else
-                 node = tree.right(n);
+                 n = tree.right(n);
           }
-       
+         assert(node_map.count(n));
           return node_map[n];
         }
  
diff --git a/ceph/crush/Hash.h b/ceph/crush/Hash.h

new file mode 100644 (file)

index 0000000..57d36fa
--- /dev/null
+++ b/ceph/crush/Hash.h
@@ -0,0 +1,50 @@
+
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+namespace crush {
+  
+  class Hash {
+       hash<int> H;
+       int seed;
+  public:
+       Hash(int s) : seed( myhash(s) ) {}
+
+       unsigned myhash(unsigned n) {
+         // ethan's rush hash?
+         if (0) 
+               return (n ^ 0xdead1234) * (884811920 * 3  + 1);
+         
+         // RS Hash function, from Robert Sedgwicks Algorithms in C book, w/ some changes.
+         if (1) {
+               unsigned int b    = 378551;
+               unsigned int a    = 63689;
+               unsigned int hash = 0;
+               
+               for(unsigned int i=0; i<4; i++)
+                 {
+                       hash = hash * a + (n&0xff);
+                       a    = a * b;
+                       n = n >> 8;
+                 }
+               
+               return (hash & 0x7FFFFFFF);
+         }
+
+       }
+       
+
+
+       
+       int operator()(int a) {
+         return myhash(a) ^ seed;
+       }
+       int operator()(int a, int b) {
+         return myhash(a) ^ myhash(b) ^ seed;
+       }
+       int operator()(int a, int b, int c) {
+         return myhash(a) ^ myhash(b) ^ myhash(c) ^ seed;
+       }
+  };
+
+}
diff --git a/ceph/crush/crush.h b/ceph/crush/crush.h

new file mode 100644 (file)

index 0000000..53e49b4
--- /dev/null
+++ b/ceph/crush/crush.h
@@ -0,0 +1,172 @@
+#ifndef __crush_CRUSH_H
+#define __crush_CRUSH_H
+
+namespace crush {
+
+#define CRUSH_RULE_TAKE    0
+#define CRUSH_RULE_CHOOSE  1
+#define CRUSH_RULE_VERT    2
+
+
+  class RuleStep {
+       int       cmd;
+       list<int> args;
+  };
+
+  class Rule {
+       list< RuleStatement > steps;
+  };
+
+
+
+  class Crush {
+  protected:
+       map<int, Bucket>  buckets;
+       map<int, Rule>    rules;
+
+  public:
+
+       void do_rule(int rno) {
+         assert(rules.count(rno));
+         Rule& r = rules[rno];
+
+         // working variable
+         list< Bucket* >       in;
+         list< vector<int>  >  out;
+
+         list< Bucket >        temp_buckets;
+
+         // go through each statement
+         for (list<RuleStatement>::iterator pc = r.steps.begin();
+                  pc != r.steps.end();
+                  pc++) {
+               // move input?
+               
+               // do it
+               switch (pc->cmd) {
+               case CRUSH_RULE_TAKE:
+                 {
+                       in.clear();
+                       temp_buckets.clear();
+                       
+                       int arg = r.args[0];
+                       if (arg == 0) {  
+                         // input is old output
+
+                         for (list< vector<int> >::iterator row = out.begin();
+                                  row != out.end();
+                                  row++) {
+                               
+                               if (row->size() == 1) {
+                                 in.push_back( &buckets[ (*row)[0] ] );
+                               } else {
+                                 // make a temp bucket!
+                                 temp_buckets.push_back( MixedBucket( rno, -1 ) );
+                                 in.push_back( &temp_buckets.back() );
+                                 
+                                 // put everything in.
+                                 temp_buckets.back().make_new_tree( *row );
+                               }
+                         }
+                         
+                         // clear output variable
+                         out.clear();
+
+                       } else {         
+                         // input is some explicit existing bucket
+                         assert(buckets.count(arg));
+
+                         // match rows in output?
+                         for (list< vector<int> >::iterator row = out.begin();
+                                  row != out.end();
+                                  row++) 
+                               in.push_back( &buckets[arg] );
+                       }
+                       
+                 }
+                 break;
+
+               case CRUSH_RULE_VERT:
+                 {
+                       in.clear();
+                       temp_buckets.clear();
+                       
+                       // input is (always) old output
+
+                       for (list< vector<int> >::iterator row = out.begin();
+                                row != out.end();
+                                row++) {
+                         for (int i=0; i<row->size(); i++) {
+                               in.push_back( &buckets[ (*row)[i] ] );
+                         }
+                       }
+                 }s.front();
+                 break;
+
+               case CRUSH_RULE_CHOOSE:
+                 {
+                       int num = r.args[0];
+                       int type = r.args[1];
+
+                       // reset output
+                       out.clear();
+                       
+                       // do each row independently
+                       for (list< Bucket* >::iterator row = in.begin();
+                                row != in.end();
+                                row++) {
+                         // make new output row
+                         out.push_back( vector<int> );
+                         vector<int>& outrow = out.back();
+                         
+                         // for each replica
+                         for (int r=0; r<num; r++) {
+                               // start with input bucket
+                               Bucket *b = *row;
+                               
+                               // choose through any intervening buckets
+                               while (1) {
+                                 if (b->is_uniform() && 
+                                         ((UniformBucket*)b)->get_item_type() == type) 
+                                       break;
+                                 
+                                 int next = b->choose_r(x, r, h);
+                                 int itemtype = 0;  // 0 is a terminal type
+                                 if (buckets.count(next)) {
+                                       b = &buckets[next];
+                                       itemtype = b->get_type();
+                                 } 
+                                 if (itemtype == type) break;  // this is what we want!
+                               }
+
+                               // choose in this bucket!
+                               int item = b->choose_r(x, r, h);
+                               outrow.push_back(item);
+                         }
+                       }
+                 }
+                 break;
+
+               default:
+                 assert(0);
+               }
+               
+               
+         }
+
+         // assemble result
+         vector<int> result;
+         list< vector<int> >::iterator row = out.begin();
+         result.swap( *row );
+         while (row != out.end()) {
+               result.append( result.end(), row->begin(), row->end() );
+         }
+         
+
+       }
+
+  }
+
+}
+
+#endif
diff --git a/ceph/test/testbucket.cc b/ceph/test/testbucket.cc

new file mode 100644 (file)

index 0000000..ed31523
--- /dev/null
+++ b/ceph/test/testbucket.cc
@@ -0,0 +1,59 @@
+
+
+#include "../crush/Bucket.h"
+using namespace crush;
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+ostream& operator<<(ostream& out, vector<int>& v)
+{
+  out << "[";
+  for (int i=0; i<v.size(); i++) {
+       if (i) out << " ";
+       out << v[i];
+  }
+  out << "]";
+  return out;
+}
+
+
+int main() 
+{
+  Hash h(73);
+
+  vector<int> disks;
+  for (int i=0; i<20; i++)
+       disks.push_back(i);
+
+  /*
+  UniformBucket b(10, 1, 0, 10, disks);
+  b.make_primes(h);
+  cout << "primes are " << ub.primes << endl;
+  */
+  MixedBucket b(1, 1);
+  for (int i=0; i<20; i++)
+       b.add_item(i,20);
+  cout << b.tree << endl;
+
+  vector<int> ocount(disks.size());
+  int numrep = 3;
+
+  vector<int> v(numrep);
+  for (int x=1; x<1000000; x++) {
+       //cout << H(x) << "\t" << h(x) << endl;
+       for (int i=0; i<numrep; i++) {
+         int d = b.choose_r(x, i, h);
+         v[i] = d;
+         ocount[d]++;
+       }
+       //cout << v << "\t" << endl;//ocount << endl;
+  }
+
+  for (int i=0; i<ocount.size(); i++) {
+       cout << "disk " << i << " has " << ocount[i] << endl;
+  }
+
+}
diff --git a/ceph/test/testtree.cc b/ceph/test/testtree.cc

new file mode 100644 (file)

index 0000000..133aa23
--- /dev/null
+++ b/ceph/test/testtree.cc
@@ -0,0 +1,45 @@
+
+
+#include "../crush/BinaryTree.h"
+using namespace crush;
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+int main() 
+{
+  BinaryTree t;
+
+  vector<int> nodes;
+
+  for (int i=0; i<30; i++) {
+       cout << "adding " << i << endl;
+       int n = t.add_node(1);
+       nodes.push_back(n);
+  }
+  cout << t << endl;
+
+  for (int k=0; k<10000; k++) {
+       if (rand() % 2) {
+         cout << "adding" << endl;
+         nodes.push_back( t.add_node(1) );
+       } else {
+         if (!nodes.empty()) {
+               //for (int i=0; i<nodes.size(); i++) {
+               int p = rand() % nodes.size();
+               int n = nodes[p];
+               assert(t.exists(n));
+               cout << "removing " << n << endl;
+               t.remove_node(n);
+               
+               for (int j=p; j<nodes.size(); j++)
+                 nodes[j] = nodes[j+1];
+               nodes.pop_back();
+         }
+       }
+       cout << t << endl;
+  }
+
+
+}
author	sage <sage@29311d96-e01e-0410-9327-a35deaab8ce9>
	Fri, 16 Sep 2005 05:22:25 +0000 (05:22 +0000)
committer	sage <sage@29311d96-e01e-0410-9327-a35deaab8ce9>
	Fri, 16 Sep 2005 05:22:25 +0000 (05:22 +0000)
ceph/TODO		patch \| blob \| history
ceph/crush/Bucket.h		patch \| blob \| history
ceph/crush/Hash.h	[new file with mode: 0644]	patch \| blob
ceph/crush/crush.h	[new file with mode: 0644]	patch \| blob
ceph/test/testbucket.cc	[new file with mode: 0644]	patch \| blob
ceph/test/testtree.cc	[new file with mode: 0644]	patch \| blob