store data and metadata in separate pg_pools; name crush rules
author    Sage Weil <sage@newdream.net>
          Thu, 12 Jun 2008 04:26:55 +0000 (21:26 -0700)
committer Sage Weil <sage@newdream.net>
          Thu, 12 Jun 2008 04:26:55 +0000 (21:26 -0700)
src/TODO
src/config.cc
src/config.h
src/osd/OSDMap.cc

index d1231af1eede5ccb0feb35636d40cc9ea1847068..7b936be8e9076b44d892591499fe003f41e5f89f 100644
--- a/src/TODO
+++ b/src/TODO
@@ -8,7 +8,6 @@ big items
 - client, user authentication
 - cas
 
-- meta vs data crush rules
 - use libuuid
 
 userspace client
@@ -20,6 +19,7 @@ userspace client
 - fix lease validation to check session ttl
 - clean up ll_ interface, now that we have leases!
 - clean up client mds session vs mdsmap behavior?
+- stop using mds's inode_t?
 
 kernel client
 - flush caps on sync, fsync, etc.
index c9a3591649027b718280617fdc1fb1b84c418530..8a560fc6f64dc0b1d87850e6366bec6eedbdbb3d 100644
--- a/src/config.cc
+++ b/src/config.cc
@@ -107,7 +107,19 @@ struct ceph_file_layout g_default_file_layout = {
  fl_pg_preferred: init_le32(-1),
  fl_pg_type: CEPH_PG_TYPE_REP,
  fl_pg_size: 2,
- fl_pg_pool: 0
+ fl_pg_pool: 1
+};
+
+struct ceph_file_layout g_default_casdata_layout = {
+ fl_stripe_unit: init_le32(1<<22),
+ fl_stripe_count: init_le32(1),
+ fl_object_size: init_le32(1<<22),
+ fl_cas_hash: init_le32(0),
+ fl_object_stripe_unit: init_le32(0),
+ fl_pg_preferred: init_le32(-1),
+ fl_pg_type: CEPH_PG_TYPE_REP,
+ fl_pg_size: 2,
+ fl_pg_pool: 2
 };
 
 struct ceph_file_layout g_default_mds_dir_layout = {
@@ -146,6 +158,16 @@ struct ceph_file_layout g_default_mds_anchortable_layout = {
  fl_pg_pool: 0
 };
 
+const char *get_pool_name(int pool) 
+{
+  switch (pool) {
+  case 0: return "metadata";
+  case 1: return "data";
+  case 2: return "casdata";
+  default: return "";
+  }
+}
+
 #include <msg/msg_types.h>
 
 // fake osd failures: osd -> time
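For illustration, a minimal standalone sketch (not part of the commit) of the default pool assignments after this change; the pool-id-to-name mapping mirrors the get_pool_name() helper added in src/config.cc, and the printed fl_pg_pool values come from the default layouts above:

#include <cstdio>

// Mirrors the get_pool_name() helper added in src/config.cc.
static const char *pool_name(int pool)
{
  switch (pool) {
  case 0: return "metadata";
  case 1: return "data";
  case 2: return "casdata";
  default: return "";
  }
}

int main()
{
  // Defaults after this commit: file data goes to pool 1, CAS data to
  // pool 2, while the MDS anchortable layout stays in pool 0.
  std::printf("g_default_file_layout.fl_pg_pool            = 1 (%s)\n", pool_name(1));
  std::printf("g_default_casdata_layout.fl_pg_pool         = 2 (%s)\n", pool_name(2));
  std::printf("g_default_mds_anchortable_layout.fl_pg_pool = 0 (%s)\n", pool_name(0));
  return 0;
}
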
index 293ec25c756ea61aed9e0ab74a0151f606e59b53..be12340b75547d0388730f9a91269d734546fd12 100644
--- a/src/config.h
+++ b/src/config.h
 #define __CEPH_CONFIG_H
 
 extern struct ceph_file_layout g_default_file_layout;
+extern struct ceph_file_layout g_default_casdata_layout;
 extern struct ceph_file_layout g_default_mds_dir_layout;
 extern struct ceph_file_layout g_default_mds_log_layout;
 extern struct ceph_file_layout g_default_mds_anchortable_layout;
 
+extern const char *get_pool_name(int pool);
+
 #include <vector>
 #include <map>
 
index 9ff5962e0b6b10fd7e566ccd83e27a7eb923a3bf..e08f9199775a5a229a81f4a8e3635cf9392e0560 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -60,6 +60,8 @@ void OSDMap::build_simple_crush_map(CrushWrapper& crush, int num_osd, map<int,do
   crush.set_type_name(1, "domain");
   crush.set_type_name(2, "pool");
 
+  int npools = 3;
+
   int minrep = g_conf.osd_min_rep;
   int ndom = MAX(g_conf.osd_max_rep, g_conf.osd_max_raid_width);
   if (num_osd >= ndom*3 &&
@@ -101,25 +103,27 @@ void OSDMap::build_simple_crush_map(CrushWrapper& crush, int num_osd, map<int,do
 
     // rules
     // replication
-    for (int pool=0; pool<1; pool++) {
+    for (int pool=0; pool<npools; pool++) {
       // size minrep..ndom
       crush_rule *rule = crush_make_rule(4, pool, CEPH_PG_TYPE_REP, minrep, ndom);
       crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
       crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_FIRSTN, CRUSH_CHOOSE_N, 1); // choose N domains
       crush_rule_set_step(rule, 2, CRUSH_RULE_CHOOSE_FIRSTN, 1, 0);  // and 1 device in each
       crush_rule_set_step(rule, 3, CRUSH_RULE_EMIT, 0, 0);
-      crush_add_rule(crush.crush, rule, -1);
+      int rno = crush_add_rule(crush.crush, rule, -1);
+      crush.set_rule_name(rno, get_pool_name(pool));
     }
 
     // raid
-    for (int pool=0; pool<1; pool++) {
-      crush_rule *rule = crush_make_rule(4, pool, CEPH_PG_TYPE_RAID4, g_conf.osd_min_raid_width, g_conf.osd_max_raid_width);
-      crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
-      crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_INDEP, CRUSH_CHOOSE_N, 1);
-      crush_rule_set_step(rule, 2, CRUSH_RULE_CHOOSE_INDEP, 1, 0);
-      crush_rule_set_step(rule, 3, CRUSH_RULE_EMIT, 0, 0);
-      crush_add_rule(crush.crush, rule, -1);
-    }
+    if (g_conf.osd_min_raid_width <= g_conf.osd_max_raid_width)
+      for (int pool=0; pool<npools; pool++) {
+       crush_rule *rule = crush_make_rule(4, pool, CEPH_PG_TYPE_RAID4, g_conf.osd_min_raid_width, g_conf.osd_max_raid_width);
+       crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
+       crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_INDEP, CRUSH_CHOOSE_N, 1);
+       crush_rule_set_step(rule, 2, CRUSH_RULE_CHOOSE_INDEP, 1, 0);
+       crush_rule_set_step(rule, 3, CRUSH_RULE_EMIT, 0, 0);
+       crush_add_rule(crush.crush, rule, -1);
+      }
     
   } else {
     // one bucket
@@ -133,22 +137,24 @@ void OSDMap::build_simple_crush_map(CrushWrapper& crush, int num_osd, map<int,do
     crush.set_item_name(rootid, "root");
 
     // replication
-    for (int pool=0; pool<1; pool++) {
+    for (int pool=0; pool<npools; pool++) {
       crush_rule *rule = crush_make_rule(3, pool, CEPH_PG_TYPE_REP, g_conf.osd_min_rep, g_conf.osd_max_rep);
       crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
       crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_FIRSTN, CRUSH_CHOOSE_N, 0);
       crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
-      crush_add_rule(crush.crush, rule, -1);
+      int rno = crush_add_rule(crush.crush, rule, -1);
+      crush.set_rule_name(rno, get_pool_name(pool));
     }
 
     // raid4
-    for (int pool=0; pool<1; pool++) {
-      crush_rule *rule = crush_make_rule(3, pool, CEPH_PG_TYPE_RAID4, g_conf.osd_min_raid_width, g_conf.osd_max_raid_width);
-      crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
-      crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_INDEP, CRUSH_CHOOSE_N, 0);
-      crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
-      crush_add_rule(crush.crush, rule, -1);
-    }
+    if (g_conf.osd_min_raid_width <= g_conf.osd_max_raid_width)
+      for (int pool=0; pool<npools; pool++) {
+       crush_rule *rule = crush_make_rule(3, pool, CEPH_PG_TYPE_RAID4, g_conf.osd_min_raid_width, g_conf.osd_max_raid_width);
+       crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
+       crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_INDEP, CRUSH_CHOOSE_N, 0);
+       crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
+       crush_add_rule(crush.crush, rule, -1);
+      }
   }
 
   crush.finalize();
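
For illustration, a rough standalone sketch of the rule-naming pattern used in OSDMap::build_simple_crush_map() above: each of the three pools now gets its own replication rule, and the rule number returned by crush_add_rule() is labeled with the pool's name. A std::map stands in for the real CRUSH rule table so the example compiles on its own:

#include <iostream>
#include <map>
#include <string>

// Same mapping as the get_pool_name() helper added in src/config.cc.
static const char *pool_name(int pool)
{
  switch (pool) {
  case 0: return "metadata";
  case 1: return "data";
  case 2: return "casdata";
  default: return "";
  }
}

int main()
{
  const int npools = 3;                       // metadata, data, casdata
  std::map<int, std::string> rule_names;      // stand-in for crush.set_rule_name()

  for (int pool = 0; pool < npools; pool++) {
    // In build_simple_crush_map() the equivalent steps are:
    //   int rno = crush_add_rule(crush.crush, rule, -1);
    //   crush.set_rule_name(rno, get_pool_name(pool));
    int rno = pool;                           // assume rules are added in pool order
    rule_names[rno] = pool_name(pool);
  }

  for (std::map<int, std::string>::const_iterator p = rule_names.begin();
       p != rule_names.end(); ++p)
    std::cout << "crush rule " << p->first << " -> " << p->second << std::endl;
  return 0;
}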