git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: make pg pools resizeable
author Sage Weil <sage@newdream.net>
Mon, 27 Apr 2009 23:31:50 +0000 (16:31 -0700)
committer Sage Weil <sage@newdream.net>
Tue, 28 Apr 2009 18:13:49 +0000 (11:13 -0700)
Move the pg size from the pgid to the pg_pool descriptor, so that
entire pools of pgs can be resized.

23 files changed:
src/client/SyntheticClient.cc
src/config.cc
src/config.h
src/crush/CrushWrapper.h
src/crush/builder.c
src/crush/crush.h
src/crush/mapper.c
src/crushtool.cc
src/include/ceph_fs.h
src/include/types.h
src/kernel/osd_client.c
src/kernel/osdmap.c
src/kernel/osdmap.h
src/mon/OSDMonitor.cc
src/mon/PGMonitor.cc
src/osd/OSD.cc
src/osd/OSD.h
src/osd/OSDMap.cc
src/osd/OSDMap.h
src/osd/PG.cc
src/osd/PG.h
src/osd/ReplicatedPG.cc
src/osd/osd_types.h

index 6c4a359de8bec0452d2d61f0f74422131d7a966c..53be64de6b27cd3c315ab3de6489b094581af404 100644 (file)
@@ -2146,8 +2146,7 @@ int SyntheticClient::create_objects(int nobj, int osize, int inflight)
     if (time_to_stop()) break;
 
     object_t oid(0x1000, i);
-    ceph_object_layout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 
-                                                                  g_default_file_layout.fl_pg_size, 0);
+    ceph_object_layout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 0);
     SnapContext snapc;
     
     if (i % inflight == 0) {
@@ -2250,8 +2249,7 @@ int SyntheticClient::object_rw(int nobj, int osize, int wrpc,
     }
     object_t oid(0x1000, o);
 
-    ceph_object_layout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 
-                                                                  g_default_file_layout.fl_pg_size, 0);
+    ceph_object_layout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 0);
     SnapContext snapc;
     
     client->client_lock.Lock();
index 42607272e9e0191b1ec87f40ffb2e8ce45bb6230..a9730b11643dec3599ce3304e3859bf809d773b2 100644 (file)
@@ -86,8 +86,7 @@ struct ceph_file_layout g_default_file_layout = {
  fl_object_stripe_unit: init_le32(0),
  fl_pg_preferred: init_le32(-1),
  fl_pg_type: CEPH_PG_TYPE_REP,
- fl_pg_size: 2,
- fl_pg_pool: 1
+ fl_pg_pool: {1},
 };
 
 struct ceph_file_layout g_default_casdata_layout = {
@@ -98,8 +97,7 @@ struct ceph_file_layout g_default_casdata_layout = {
  fl_object_stripe_unit: init_le32(0),
  fl_pg_preferred: init_le32(-1),
  fl_pg_type: CEPH_PG_TYPE_REP,
- fl_pg_size: 2,
- fl_pg_pool: 2
+ fl_pg_pool: {3},
 };
 
 struct ceph_file_layout g_default_mds_dir_layout = {
@@ -110,8 +108,7 @@ struct ceph_file_layout g_default_mds_dir_layout = {
  fl_object_stripe_unit: init_le32(0),
  fl_pg_preferred: init_le32(-1),
  fl_pg_type: CEPH_PG_TYPE_REP,
- fl_pg_size: 2,
- fl_pg_pool: 0
+ fl_pg_pool: {2},
 };
 
 struct ceph_file_layout g_default_mds_log_layout = {
@@ -122,8 +119,7 @@ struct ceph_file_layout g_default_mds_log_layout = {
  fl_object_stripe_unit: init_le32(0),
  fl_pg_preferred: init_le32(-1),
  fl_pg_type: CEPH_PG_TYPE_REP,
- fl_pg_size: 2,
- fl_pg_pool: 0
+ fl_pg_pool: {2},
 };
 
 struct ceph_file_layout g_default_mds_anchortable_layout = {
@@ -134,20 +130,9 @@ struct ceph_file_layout g_default_mds_anchortable_layout = {
  fl_object_stripe_unit: init_le32(0),
  fl_pg_preferred: init_le32(-1),
  fl_pg_type: CEPH_PG_TYPE_REP,
- fl_pg_size: 2,
- fl_pg_pool: 0
+ fl_pg_pool: {2},
 };
 
-const char *get_pool_name(int pool) 
-{
-  switch (pool) {
-  case 0: return "metadata";
-  case 1: return "data";
-  case 2: return "casdata";
-  default: return "";
-  }
-}
-
 #include <msg/msg_types.h>
 
 // fake osd failures: osd -> time
index 693a7dbc27419d43b89d7268f673417ab0fe2790..f76dd7946e0d87bdb3301b62452a7666f8f6ea52 100644 (file)
@@ -21,8 +21,6 @@ extern struct ceph_file_layout g_default_mds_dir_layout;
 extern struct ceph_file_layout g_default_mds_log_layout;
 extern struct ceph_file_layout g_default_mds_anchortable_layout;
 
-extern const char *get_pool_name(int pool);
-
 #include <vector>
 #include <map>
 
index d8aeffd67b5c31462a6dc58d719ddadc7597b0dd..50a50042920549e8be7b675f6df54f61bf5a04d1 100644 (file)
@@ -182,10 +182,10 @@ public:
     if (IS_ERR(r)) return PTR_ERR(r);
     return r->len;
   }
-  int get_rule_mask_pool(unsigned ruleno) {
+  int get_rule_mask_ruleset(unsigned ruleno) {
     crush_rule *r = get_rule(ruleno);
     if (IS_ERR(r)) return -1;
-    return r->mask.pool;
+    return r->mask.ruleset;
   }
   int get_rule_mask_type(unsigned ruleno) {
     crush_rule *r = get_rule(ruleno);
index ab5ad5200f314048c9697132e29e2364d64c39ea..7f36319da4bd8fa9db1211fe6639e1f1c0ed933e 100644 (file)
@@ -70,12 +70,12 @@ int crush_add_rule(struct crush_map *map, struct crush_rule *rule, int ruleno)
        return ruleno;
 }
 
-struct crush_rule *crush_make_rule(int len, int pool, int type, int minsize, int maxsize)
+struct crush_rule *crush_make_rule(int len, int ruleset, int type, int minsize, int maxsize)
 {
        struct crush_rule *rule;
        rule = malloc(crush_rule_size(len));
        rule->len = len;
-       rule->mask.pool = pool;
+       rule->mask.ruleset = ruleset;
        rule->mask.type = type;
        rule->mask.min_size = minsize;
        rule->mask.max_size = maxsize;
index e25976b17f6099138219007ab454c83e0fa94d04..19aba44b269d41bb448b4f1fa1f0c039ff45fc27 100644 (file)
@@ -56,11 +56,11 @@ enum {
 
 /*
  * The rule mask is used to describe what the rule is intended for.
- * Given a storage pool and size of output set, we search through the
+ * Given a ruleset and size of output set, we search through the
  * rule list for a matching rule_mask.
  */
 struct crush_rule_mask {
-       __u8 pool;
+       __u8 ruleset;
        __u8 type;
        __u8 min_size;
        __u8 max_size;
index 4752c2d73aa51be9f86519b1dd37135d31417287..81e2da99e7af5f056e816a5acd4ef1fe89df05e9 100644 (file)
 
 
 /**
- * crush_find_rule - find a crush_rule id for a given pool, type, and size.
+ * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
  * @map: the crush_map
- * @pool: the storage pool id (user defined)
- * @type: storage pool type (user defined)
+ * @ruleset: the storage ruleset id (user defined)
+ * @type: storage ruleset type (user defined)
  * @size: output set size
  */
-int crush_find_rule(struct crush_map *map, int pool, int type, int size)
+int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
 {
        int i;
 
        for (i = 0; i < map->max_rules; i++) {
                if (map->rules[i] &&
-                   map->rules[i]->mask.pool == pool &&
+                   map->rules[i]->mask.ruleset == ruleset &&
                    map->rules[i]->mask.type == type &&
                    map->rules[i]->mask.min_size <= size &&
                    map->rules[i]->mask.max_size >= size)
index 00ffbd93562abbceb07b824b86f96e0809bbc1f2..b036530e4dbdd2bf8bb47a9ff99b4f5e5783f2f9 100644 (file)
@@ -234,7 +234,7 @@ void parse_rule(iter_t const& i, CrushWrapper &crush)
     start = 3;
   }
 
-  int pool = int_node(i->children[start]);
+  int ruleset = int_node(i->children[start]);
 
   string tname = string_node(i->children[start+2]);
   int type;
@@ -251,7 +251,7 @@ void parse_rule(iter_t const& i, CrushWrapper &crush)
   int steps = i->children.size() - start - 8;
   //cout << "num steps " << steps << std::endl;
   
-  int ruleno = crush.add_rule(steps, pool, type, minsize, maxsize, -1);
+  int ruleno = crush.add_rule(steps, ruleset, type, minsize, maxsize, -1);
   if (rname.length()) {
     crush.set_rule_name(ruleno, rname.c_str());
     rule_id[rname] = ruleno;
@@ -560,7 +560,7 @@ int decompile_crush(CrushWrapper &crush, ostream &out)
     if (crush.get_rule_name(i))
       print_rule_name(out, i, crush);
     out << " {\n";
-    out << "\tpool " << crush.get_rule_mask_pool(i) << "\n";
+    out << "\truleset " << crush.get_rule_mask_ruleset(i) << "\n";
     switch (crush.get_rule_mask_type(i)) {
     case CEPH_PG_TYPE_REP: out << "\ttype replicated\n"; break;
     case CEPH_PG_TYPE_RAID4: out << "\ttype raid4\n"; break;
@@ -820,15 +820,14 @@ int main(int argc, const char **argv)
       lower_weights.swap(cur_weights);
     }
     
-    // make some generic rules
-    for (int pool=0; pool<3; pool++) {
-      crush_rule *rule = crush_make_rule(3, pool, CEPH_PG_TYPE_REP, 2, 2);
-      crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
-      crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_LEAF_FIRSTN, CRUSH_CHOOSE_N, 1);
-      crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
-      int rno = crush_add_rule(crush.crush, rule, -1);
-      crush.set_rule_name(rno, get_pool_name(pool));
-    }
+    // make a generic rules
+    int ruleset=1;
+    crush_rule *rule = crush_make_rule(3, ruleset, CEPH_PG_TYPE_REP, 2, 2);
+    crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
+    crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_LEAF_FIRSTN, CRUSH_CHOOSE_N, 1);
+    crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
+    int rno = crush_add_rule(crush.crush, rule, -1);
+    crush.set_rule_name(rno, "data");
 
     crush.finalize();
     dout(0) << "crush max_devices " << crush.crush->max_devices << dendl;
index e8beda20623282ca0fd0dd5902b907647bdfe18d..b82aa6b91b3945876c262f0f72da74e9547cfb6f 100644 (file)
@@ -264,9 +264,9 @@ struct ceph_file_layout {
 
        /* object -> pg layout */
        __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
-       __u8  fl_pg_type;       /* pg type; see PG_TYPE_* */
-       __u8  fl_pg_size;       /* pg size (num replicas, etc.) */
-       __u8  fl_pg_pool;       /* implies crush ruleset AND object namespace */
+       __u8   fl_pg_type;
+       __le16 fl_pg_pool;      /* implies crush ruleset, rep level */
+       __le16 fl_pg_ns;        /* object namespace */
 } __attribute__ ((packed));
 
 #define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
@@ -297,16 +297,21 @@ union ceph_pg {
        struct {
                __s16 preferred; /* preferred primary osd */
                __u16 ps;        /* placement seed */
-               __u8 __pad;
-               __u8 size;
-               __u8 pool;       /* implies crush ruleset */
+               __u16 pool;      /* implies crush ruleset */
                __u8 type;
+               __u8 __pad;
        } pg;
 } __attribute__ ((packed));
 
 #define ceph_pg_is_rep(pg)   ((pg).pg.type == CEPH_PG_TYPE_REP)
 #define ceph_pg_is_raid4(pg) ((pg).pg.type == CEPH_PG_TYPE_RAID4)
 
+struct ceph_pg_pool {
+       __u8 crush_ruleset;
+       __u8 size;
+       __u8 type;
+} __attribute__ ((packed));
+
 /*
  * stable_mod func is used to control number of placement groups.
  * similar to straight-up modulo, but produces a stable mapping as b
index 76b4d4e08126adc2d674988d14233a7f83ec6f98..a5dcf464c8901fe56a07d5516497c7e6de882f31 100644 (file)
@@ -199,6 +199,7 @@ struct ltstr
 
 WRITE_RAW_ENCODER(ceph_fsid_t)
 WRITE_RAW_ENCODER(ceph_file_layout)
+WRITE_RAW_ENCODER(ceph_pg_pool)
 WRITE_RAW_ENCODER(ceph_client_ticket)
 WRITE_RAW_ENCODER(ceph_mds_request_head)
 WRITE_RAW_ENCODER(ceph_mds_request_release)
index 66b034e7abe38c22d71302d24324bee21dd95d43..90ccda5fde5082c903dae464a4a712f804ba3154 100644 (file)
@@ -320,13 +320,16 @@ static int map_osds(struct ceph_osd_client *osdc,
        unsigned pps; /* placement ps */
        int osds[10], osd = -1;
        int i, num;
+       struct ceph_pg_pool *pool;
 
-       ruleno = crush_find_rule(osdc->osdmap->crush, req->r_pgid.pg.pool,
-                                req->r_pgid.pg.type, req->r_pgid.pg.size);
+       if (req->r_pgid.pg.pool >= osdc->osdmap->num_pools)
+               return -1;
+       pool = &osdc->osdmap->pg_pool[req->r_pgid.pg.pool];
+       ruleno = crush_find_rule(osdc->osdmap->crush, pool->crush_ruleset,
+                                req->r_pgid.pg.type, pool->size);
        if (ruleno < 0) {
                derr(0, "map_osds no crush rule for pool %d type %d size %d\n",
-                    req->r_pgid.pg.pool, req->r_pgid.pg.type,
-                    req->r_pgid.pg.size);
+                    req->r_pgid.pg.pool, req->r_pgid.pg.type, pool->size);
                return -1;
        }
 
@@ -339,7 +342,7 @@ static int map_osds(struct ceph_osd_client *osdc,
                                     osdc->osdmap->pgp_num,
                                     osdc->osdmap->pgp_num_mask);
        num = crush_do_rule(osdc->osdmap->crush, ruleno, pps, osds,
-                           min_t(int, req->r_pgid.pg.size, ARRAY_SIZE(osds)),
+                           min_t(int, pool->size, ARRAY_SIZE(osds)),
                            req->r_pgid.pg.preferred, osdc->osdmap->osd_weight);
 
        /* primary is first up osd */
index eb1cbeea5608a99877c807ad4c000009247d177e..e1a21fe591f0bcba3a602a51eab58e6e41f163e3 100644 (file)
@@ -294,6 +294,7 @@ void osdmap_destroy(struct ceph_osdmap *map)
                crush_destroy(map->crush);
        kfree(map->osd_state);
        kfree(map->osd_weight);
+       kfree(map->pg_pool);
        kfree(map->osd_addr);
        kfree(map);
 }
@@ -366,7 +367,24 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
        ceph_decode_32(p, map->lpg_num);
        ceph_decode_32(p, map->lpgp_num);
        ceph_decode_32(p, map->last_pg_change);
-       ceph_decode_32(p, map->flags);
+
+       ceph_decode_32(p, map->num_pools);
+       map->pg_pool = kmalloc(map->num_pools * sizeof(*map->pg_pool),
+                              GFP_NOFS);
+       if (!map->pg_pool) {
+               err = -ENOMEM;
+               goto bad;
+       }
+       ceph_decode_32_safe(p, end, max, bad);
+       while (max--) {
+               ceph_decode_need(p, end, 4+sizeof(*map->pg_pool), bad);
+               ceph_decode_32(p, i);
+               if (i >= map->num_pools)
+                       goto bad;
+               ceph_decode_copy(p, &map->pg_pool[i], sizeof(*map->pg_pool));
+       }
+
+       ceph_decode_32_safe(p, end, map->flags, bad);
 
        calc_pg_masks(map);
 
@@ -430,7 +448,7 @@ struct ceph_osdmap *apply_incremental(void **p, void *end,
        ceph_fsid_t fsid;
        u32 epoch = 0;
        struct ceph_timespec modified;
-       u32 len, x;
+       u32 len, x, pool;
        __s32 new_flags, max;
        void *start = *p;
        int err = -EINVAL;
@@ -502,6 +520,30 @@ struct ceph_osdmap *apply_incremental(void **p, void *end,
                newcrush = NULL;
        }
 
+       /* new_pool */
+       ceph_decode_32_safe(p, end, len, bad);
+       while (len--) {
+               ceph_decode_32_safe(p, end, pool, bad);
+               if (pool >= map->num_pools) {
+                       void *p = kzalloc((pool+1) * sizeof(*map->pg_pool),
+                                         GFP_NOFS);
+                       if (!p) {
+                               err = -ENOMEM;
+                               goto bad;
+                       }
+                       memcpy(p, map->pg_pool,
+                              map->num_pools * sizeof(*map->pg_pool));
+                       kfree(map->pg_pool);
+                       map->pg_pool = p;
+                       map->num_pools = pool+1;
+               }
+               ceph_decode_copy(p, &map->pg_pool[pool], sizeof(*map->pg_pool));
+       }
+
+       /* old_pool (ignore) */
+       ceph_decode_32_safe(p, end, len, bad);
+       *p += len * (sizeof(u32) + sizeof(*map->pg_pool));
+
        /* new_up */
        err = -EINVAL;
        ceph_decode_32_safe(p, end, len, bad);
@@ -633,7 +675,6 @@ void calc_object_layout(struct ceph_object_layout *ol,
        pgid.pg.ps = bno + crush_hash32_2(ino, ino>>32);
        pgid.pg.preferred = preferred;
        pgid.pg.type = fl->fl_pg_type;
-       pgid.pg.size = fl->fl_pg_size;
        pgid.pg.pool = fl->fl_pg_pool;
 
        ol->ol_pgid = cpu_to_le64(pgid.pg64);
index 8cee8268c80c44ab3680263e369a8ca36518c290..21f80aa550cf9958f0e63a3b3db2a52889fa6d8a 100644 (file)
@@ -38,6 +38,9 @@ struct ceph_osdmap {
        u32 *osd_weight;   /* 0 = failed, 0x10000 = 100% normal */
        struct ceph_entity_addr *osd_addr;
 
+       u32 num_pools;
+       struct ceph_pg_pool *pg_pool;
+
        /* the CRUSH map specifies the mapping of placement groups to
         * the list of osds that store+replicate them. */
        struct crush_map *crush;
index 79fb97c8d2565a6f3a11422ca152eacb6cdac21a..b57f0cfa9e667be92816e8807e9bd20eea5a615c 100644 (file)
@@ -846,20 +846,16 @@ void OSDMonitor::tick()
     // For all PGs that have OSD 0 as the primary,
     // switch them to use the first replca
     ps_t numps = osdmap.get_pg_num();
-    int minrep = 1; 
-    int maxrep = MIN(g_conf.num_osd, g_conf.osd_max_rep);
     for (int pool=0; pool<1; pool++)
-      for (int nrep = minrep; nrep <= maxrep; nrep++) { 
-       for (ps_t ps = 0; ps < numps; ++ps) {
-         pg_t pgid = pg_t(pg_t::TYPE_REP, nrep, ps, pool, -1);
-         vector<int> osds;
-         osdmap.pg_to_osds(pgid, osds); 
-         if (osds[0] == 0) {
-           pending_inc.new_pg_swap_primary[pgid] = osds[1];
-           dout(3) << "Changing primary for PG " << pgid << " from " << osds[0] << " to "
-                   << osds[1] << dendl;
-           do_propose = true;
-         }
+      for (ps_t ps = 0; ps < numps; ++ps) {
+       pg_t pgid = pg_t(pg_t::TYPE_REP, ps, pool, -1);
+       vector<int> osds;
+       osdmap.pg_to_osds(pgid, osds); 
+       if (osds[0] == 0) {
+         pending_inc.new_pg_swap_primary[pgid] = osds[1];
+         dout(3) << "Changing primary for PG " << pgid << " from " << osds[0] << " to "
+                 << osds[1] << dendl;
+         do_propose = true;
        }
       }
   }
@@ -1119,6 +1115,31 @@ bool OSDMonitor::prepare_command(MMonCommand *m)
        return true;
       }
     }
+    else if (m->cmd[1] == "pool" && m->cmd.size() >= 5) {
+      int pool = -1;
+      for (map<int,nstring>::iterator p = osdmap.pool_name.begin();
+          p != osdmap.pool_name.end();
+          p++) {
+       if (p->second == m->cmd[2])
+         pool = p->first;
+      }
+      if (pool >= 0) {
+       if (m->cmd[3] == "size") {
+         int s = atoi(m->cmd[4].c_str());
+         if (s) {
+           pending_inc.new_pools[pool] = osdmap.pools[pool];
+           pending_inc.new_pools[pool].size = s;
+           ss << "set pool " << pool << " size to " << s;
+           getline(ss, rs);
+           paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs));
+           return true;
+         }
+       }
+      } else {
+       ss << "unrecognized pool '" << m->cmd[2] << "'";
+       err = -ENOENT;
+      }
+    }
     else {
       ss << "unknown command " << m->cmd[1];
     }
index 7394eccc87a69cbc8c87c849e7fa189d7e1650cf..57fab6426ea05e0856382defe5dbb7f760771ccd 100644 (file)
@@ -440,57 +440,56 @@ bool PGMonitor::register_new_pgs()
 
   bool first = pg_map.pg_stat.empty(); // first pg creation
   int created = 0;
-  for (int ruleno=0; ruleno<crush->get_max_rules(); ruleno++) {
+  for (map<int,ceph_pg_pool>::iterator p = mon->osdmon()->osdmap.pools.begin();
+       p != mon->osdmon()->osdmap.pools.end();
+       p++) {
+    int pool = p->first;
+    int type = p->second.type;
+    int ruleno = p->second.crush_ruleset;
     if (!crush->rule_exists(ruleno)) 
       continue;
-    int pool = crush->get_rule_mask_pool(ruleno);
-    int type = crush->get_rule_mask_type(ruleno);
-    int min_size = crush->get_rule_mask_min_size(ruleno);
-    int max_size = crush->get_rule_mask_max_size(ruleno);
-    for (int size = min_size; size <= max_size; size++) {
-      for (ps_t ps = 0; ps < pg_num; ps++) {
-       pg_t pgid(type, size, ps, pool, -1);
-       if (pg_map.pg_stat.count(pgid)) {
-         dout(20) << "register_new_pgs have " << pgid << dendl;
-         continue;
-       }
+    for (ps_t ps = 0; ps < pg_num; ps++) {
+      pg_t pgid(type, ps, pool, -1);
+      if (pg_map.pg_stat.count(pgid)) {
+       dout(20) << "register_new_pgs have " << pgid << dendl;
+       continue;
+      }
 
-       pg_t parent;
-       int split_bits = 0;
-       if (!first) {
-         parent = pgid;
-         while (1) {
-           // remove most significant bit
-           int msb = calc_bits_of(parent.u.pg.ps);
-           if (!msb) break;
-           parent.u.pg.ps &= ~(1<<(msb-1));
-           split_bits++;
-           dout(10) << " is " << pgid << " parent " << parent << " ?" << dendl;
-           //if (parent.u.pg.ps < mon->osdmon->osdmap.get_pgp_num()) {
-           if (pg_map.pg_stat.count(parent) &&
-               pg_map.pg_stat[parent].state != PG_STATE_CREATING) {
-             dout(10) << "  parent is " << parent << dendl;
-             break;
-           }
+      pg_t parent;
+      int split_bits = 0;
+      if (!first) {
+       parent = pgid;
+       while (1) {
+         // remove most significant bit
+         int msb = calc_bits_of(parent.u.pg.ps);
+         if (!msb) break;
+         parent.u.pg.ps &= ~(1<<(msb-1));
+         split_bits++;
+         dout(10) << " is " << pgid << " parent " << parent << " ?" << dendl;
+         //if (parent.u.pg.ps < mon->osdmon->osdmap.get_pgp_num()) {
+         if (pg_map.pg_stat.count(parent) &&
+             pg_map.pg_stat[parent].state != PG_STATE_CREATING) {
+           dout(10) << "  parent is " << parent << dendl;
+           break;
          }
        }
-       
-       pending_inc.pg_stat_updates[pgid].state = PG_STATE_CREATING;
-       pending_inc.pg_stat_updates[pgid].created = epoch;
-       pending_inc.pg_stat_updates[pgid].parent = parent;
-       pending_inc.pg_stat_updates[pgid].parent_split_bits = split_bits;
-       created++;      
-
-       if (split_bits == 0) {
-         dout(10) << "register_new_pgs will create " << pgid << dendl;
-       } else {
-         dout(10) << "register_new_pgs will create " << pgid
-                  << " parent " << parent
-                  << " by " << split_bits << " bits"
-                  << dendl;
-       }
-
       }
+      
+      pending_inc.pg_stat_updates[pgid].state = PG_STATE_CREATING;
+      pending_inc.pg_stat_updates[pgid].created = epoch;
+      pending_inc.pg_stat_updates[pgid].parent = parent;
+      pending_inc.pg_stat_updates[pgid].parent_split_bits = split_bits;
+      created++;       
+      
+      if (split_bits == 0) {
+       dout(10) << "register_new_pgs will create " << pgid << dendl;
+      } else {
+       dout(10) << "register_new_pgs will create " << pgid
+                << " parent " << parent
+                << " by " << split_bits << " bits"
+                << dendl;
+      }
+      
     }
   } 
   dout(10) << "register_new_pgs registered " << created << " new pgs" << dendl;
index cfa9cbcd1df903838c76f7db618c584857560261..3e3b0ce672f3ffce7eec86c332e85af68d70b8ee 100644 (file)
@@ -2037,7 +2037,7 @@ void OSD::advance_map(ObjectStore::Transaction& t, interval_set<snapid_t>& remov
     pg->state_clear(PG_STATE_PEERING);  // we'll need to restart peering
 
     if (pg->is_primary() && 
-       pg->info.pgid.size() != pg->acting.size())
+       osdmap->get_pg_size(pg->info.pgid) != pg->acting.size())
       pg->state_set(PG_STATE_DEGRADED);
     else
       pg->state_clear(PG_STATE_DEGRADED);
@@ -2450,7 +2450,7 @@ void OSD::split_pg(PG *parent, map<pg_t,PG*>& children, ObjectStore::Transaction
 
   for (vector<pobject_t>::iterator p = olist.begin(); p != olist.end(); p++) {
     pobject_t poid = *p;
-    ceph_object_layout l = osdmap->make_object_layout(poid.oid, parentid.type(), parentid.size(),
+    ceph_object_layout l = osdmap->make_object_layout(poid.oid, parentid.type(), 
                                                      parentid.pool(), parentid.preferred());
     if (le64_to_cpu(l.ol_pgid) != parentid.u.pg64) {
       pg_t pgid(le64_to_cpu(l.ol_pgid));
index 07cd7a1a98c784a06333870870ebc87a6bd911dc..aa0d4b7d0752b2b8bcedd3da2d5861133c57afb1 100644 (file)
@@ -119,10 +119,10 @@ public:
   int get_nodeid() { return whoami; }
   
   static pobject_t get_osdmap_pobject_name(epoch_t epoch) { 
-    return pobject_t(OSD_METADATA_PG_POOL, 0, object_t(0, epoch << 1)); 
+    return pobject_t(CEPH_OSDMETADATA_NS, 0, object_t(0, epoch << 1)); 
   }
   static pobject_t get_inc_osdmap_pobject_name(epoch_t epoch) { 
-    return pobject_t(OSD_METADATA_PG_POOL, 0, object_t(0, (epoch << 1) + 1)); 
+    return pobject_t(CEPH_OSDMETADATA_NS, 0, object_t(0, (epoch << 1) + 1)); 
   }
   
 
index 79b231586b80c9fada29d286f86877338e1445c2..8aef86fcd0f69317e4f8530c70027eb9008f9257 100644 (file)
@@ -31,6 +31,14 @@ void OSDMap::print(ostream& out)
       << "lpgp_num " << get_lpgp_num() << "\n"
       << "last_pg_change " << get_last_pg_change() << "\n"
       << std::endl;
+  for (map<int,ceph_pg_pool>::iterator p = pools.begin(); p != pools.end(); p++)
+    out << "pg_pool " << p->first
+       << " '" << pool_name[p->first]
+       << "' size " << (int)p->second.size
+       << " crush_ruleset " << (int)p->second.crush_ruleset
+       << "\n";
+  out << std::endl;
+
   out << "max_osd " << get_max_osd() << "\n";
   for (int i=0; i<get_max_osd(); i++) {
     if (exists(i)) {
@@ -90,7 +98,21 @@ void OSDMap::build_simple(epoch_t e, ceph_fsid_t &fsid,
   lpg_num = lpgp_num = lpg_bits ? (1 << (lpg_bits-1)) : 0;
   
   // crush map
-  build_simple_crush_map(crush, num_osd, num_dom);
+  map<int, const char*> rulesets;
+  rulesets[CEPH_DATA_RULE] = "data";
+  rulesets[CEPH_METADATA_RULE] = "metadata";
+  rulesets[CEPH_CASDATA_RULE] = "casdata";
+  
+  int pool = 0;
+  for (map<int,const char*>::iterator p = rulesets.begin(); p != rulesets.end(); p++) {
+    pools[pool].size = 2;
+    pools[pool].crush_ruleset = p->first;
+    pools[pool].type = CEPH_PG_TYPE_REP;
+    pool_name[pool] = p->second;
+    pool++;
+  }
+
+  build_simple_crush_map(crush, rulesets, num_osd, num_dom);
 
   for (int i=0; i<num_osd; i++) {
     set_state(i, CEPH_OSD_EXISTS);
@@ -108,7 +130,7 @@ void OSDMap::build_simple(epoch_t e, ceph_fsid_t &fsid,
   }
 }
 
-void OSDMap::build_simple_crush_map(CrushWrapper& crush, int num_osd,
+void OSDMap::build_simple_crush_map(CrushWrapper& crush, map<int, const char*>& rulesets, int num_osd,
                                    int num_dom)
 {
   // new
@@ -117,8 +139,6 @@ void OSDMap::build_simple_crush_map(CrushWrapper& crush, int num_osd,
   crush.set_type_name(1, "domain");
   crush.set_type_name(2, "pool");
 
-  int npools = 3;
-
   int minrep = g_conf.osd_min_rep;
   int ndom = num_dom;
   if (!ndom)
@@ -161,26 +181,15 @@ void OSDMap::build_simple_crush_map(CrushWrapper& crush, int num_osd,
     crush.set_item_name(rootid, "root");
 
     // rules
-    // replication
-    for (int pool=0; pool<npools; pool++) {
-      // size minrep..ndom
-      crush_rule *rule = crush_make_rule(3, pool, CEPH_PG_TYPE_REP, minrep, g_conf.osd_max_rep);
+    for (map<int,const char*>::iterator p = rulesets.begin(); p != rulesets.end(); p++) {
+      int ruleset = p->first;
+      crush_rule *rule = crush_make_rule(3, ruleset, CEPH_PG_TYPE_REP, minrep, g_conf.osd_max_rep);
       crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
       crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_LEAF_FIRSTN, CRUSH_CHOOSE_N, 1); // choose N domains
       crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
       int rno = crush_add_rule(crush.crush, rule, -1);
-      crush.set_rule_name(rno, get_pool_name(pool));
+      crush.set_rule_name(rno, p->second);
     }
-
-    // raid
-    if (false && g_conf.osd_min_raid_width <= g_conf.osd_max_raid_width)
-      for (int pool=0; pool<npools; pool++) {
-       crush_rule *rule = crush_make_rule(3, pool, CEPH_PG_TYPE_RAID4, g_conf.osd_min_raid_width, g_conf.osd_max_raid_width);
-       crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
-       crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_LEAF_INDEP, CRUSH_CHOOSE_N, 1);
-       crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
-       crush_add_rule(crush.crush, rule, -1);
-      }
     
   } else {
     // one bucket
@@ -197,24 +206,16 @@ void OSDMap::build_simple_crush_map(CrushWrapper& crush, int num_osd,
     crush.set_item_name(rootid, "root");
 
     // replication
-    for (int pool=0; pool<npools; pool++) {
-      crush_rule *rule = crush_make_rule(3, pool, CEPH_PG_TYPE_REP, g_conf.osd_min_rep, g_conf.osd_max_rep);
+    for (map<int,const char*>::iterator p = rulesets.begin(); p != rulesets.end(); p++) {
+      int ruleset = p->first;
+      crush_rule *rule = crush_make_rule(3, ruleset, CEPH_PG_TYPE_REP, g_conf.osd_min_rep, g_conf.osd_max_rep);
       crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
       crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_FIRSTN, CRUSH_CHOOSE_N, 0);
       crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
       int rno = crush_add_rule(crush.crush, rule, -1);
-      crush.set_rule_name(rno, get_pool_name(pool));
+      crush.set_rule_name(rno, p->second);
     }
 
-    // raid4
-    if (false && g_conf.osd_min_raid_width <= g_conf.osd_max_raid_width)
-      for (int pool=0; pool<npools; pool++) {
-       crush_rule *rule = crush_make_rule(3, pool, CEPH_PG_TYPE_RAID4, g_conf.osd_min_raid_width, g_conf.osd_max_raid_width);
-       crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
-       crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_INDEP, CRUSH_CHOOSE_N, 0);
-       crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
-       crush_add_rule(crush.crush, rule, -1);
-      }
   }
 
   crush.finalize();
index e04e71b1724540f34acf566b19b9525ccf17d06a..abfbe5d4e1ff178d65899e7df9d6e43dfc4c7f61 100644 (file)
@@ -151,6 +151,9 @@ public:
     // incremental
     int32_t new_max_osd;
     int32_t new_pg_num, new_pgp_num, new_lpg_num, new_lpgp_num;
+    map<int32_t,ceph_pg_pool> new_pools;
+    map<int32_t,nstring> new_pool_names;
+    set<int32_t> old_pools;
     map<int32_t,entity_addr_t> new_up;
     map<int32_t,uint8_t> new_down;
     map<int32_t,uint32_t> new_weight;
@@ -180,11 +183,14 @@ public:
       ::encode(new_pgp_num, bl);
       ::encode(new_lpg_num, bl);
       ::encode(new_lpgp_num, bl);
+      ::encode(new_pools, bl);
+      ::encode(old_pools, bl);
       ::encode(new_up, bl);
       ::encode(new_down, bl);
       ::encode(new_weight, bl);
 
       // extended
+      ::encode(new_pool_names, bl);
       ::encode(new_up_thru, bl);
       ::encode(new_last_clean_interval, bl);
       ::encode(new_lost, bl);
@@ -209,11 +215,14 @@ public:
       ::decode(new_pgp_num, p);
       ::decode(new_lpg_num, p);
       ::decode(new_lpgp_num, p);
+      ::decode(new_pools, p);
+      ::decode(old_pools, p);
       ::decode(new_up, p);
       ::decode(new_down, p);
       ::decode(new_weight, p);
       
       // extended
+      ::decode(new_pool_names, p);
       ::decode(new_up_thru, p);
       ::decode(new_last_clean_interval, p);
       ::decode(new_lost, p);
@@ -276,6 +285,8 @@ private:
   vector<__u32>   osd_weight;   // 16.16 fixed point, 0x10000 = "in", 0 = "out"
   vector<osd_info_t> osd_info;
 
+  map<int,ceph_pg_pool> pools;
+  map<int,nstring> pool_name;
   map<pg_t,uint32_t> pg_swap_primary;  // force new osd to be pg primary (if already a member)
   snapid_t max_snap;
   interval_set<snapid_t> removed_snaps;
@@ -286,6 +297,7 @@ private:
   CrushWrapper     crush;       // hierarchical map
 
   friend class OSDMonitor;
+  friend class PGMonitor;
   friend class MDS;
 
  public:
@@ -528,6 +540,21 @@ private:
     if (inc.new_max_osd >= 0) 
       set_max_osd(inc.new_max_osd);
 
+    for (set<int32_t>::iterator p = inc.old_pools.begin();
+        p != inc.old_pools.end();
+        p++) {
+      pools.erase(*p);
+      pool_name.erase(*p);
+    }
+    for (map<int32_t,ceph_pg_pool>::iterator p = inc.new_pools.begin();
+        p != inc.new_pools.end();
+        p++)
+      pools[p->first] = p->second;
+    for (map<int32_t,nstring>::iterator p = inc.new_pool_names.begin();
+        p != inc.new_pool_names.end();
+        p++)
+      pool_name[p->first] = p->second;
+
     for (map<int32_t,uint32_t>::iterator i = inc.new_weight.begin();
          i != inc.new_weight.end();
          i++)
@@ -609,6 +636,13 @@ private:
     ::encode(lpg_num, blist);
     ::encode(lpgp_num, blist);
     ::encode(last_pg_change, blist);
+
+    int32_t max_pools = 0;
+    if (pools.size())
+      max_pools = pools.rbegin()->first + 1;
+    ::encode(max_pools, blist);
+    ::encode(pools, blist);
+
     ::encode(flags, blist);
     
     ::encode(max_osd, blist);
@@ -623,6 +657,7 @@ private:
 
     // extended
     ::encode(osd_info, blist);
+    ::encode(pool_name, blist);
     ::encode(pg_swap_primary, blist);
 
     ::encode(max_snap, blist);
@@ -643,6 +678,11 @@ private:
     ::decode(lpgp_num, p);
     calc_pg_masks();
     ::decode(last_pg_change, p);
+
+    int32_t max_pools;
+    ::decode(max_pools, p);
+    ::decode(pools, p);
+
     ::decode(flags, p);
 
     ::decode(max_osd, p);
@@ -658,6 +698,7 @@ private:
 
     // extended
     ::decode(osd_info, p);
+    ::decode(pool_name, p);
     ::decode(pg_swap_primary, p);
     
     ::decode(max_snap, p);
@@ -672,13 +713,13 @@ private:
 
   // oid -> pg
   ceph_object_layout file_to_object_layout(object_t oid, ceph_file_layout& layout) {
-    return make_object_layout(oid, layout.fl_pg_type, layout.fl_pg_size, 
+    return make_object_layout(oid, layout.fl_pg_type,  // forwards pg type, pool, preferred osd and stripe unit; no pg size
                              layout.fl_pg_pool,
                              ceph_file_layout_pg_preferred(layout),
                              ceph_file_layout_object_su(layout));
   }
 
-  ceph_object_layout make_object_layout(object_t oid, int pg_type, int pg_size, int pg_pool, int preferred=-1, int object_stripe_unit = 0) {
+  ceph_object_layout make_object_layout(object_t oid, int pg_type, int pg_pool, int preferred=-1, int object_stripe_unit = 0) {
     // calculate ps (placement seed)
     ps_t ps;  // NOTE: keep full precision, here!
     switch (g_conf.osd_object_layout) {
@@ -705,7 +746,7 @@ private:
     //cout << "preferred " << preferred << " num " << num << " mask " << num_mask << " ps " << ps << endl;
 
     // construct object layout
-    pg_t pgid = pg_t(pg_type, pg_size, ps, pg_pool, preferred);
+    pg_t pgid = pg_t(pg_type, ps, pg_pool, preferred);
     ceph_object_layout layout;
     layout.ol_pgid = pgid.u.pg64;
     layout.ol_stripe_unit = object_stripe_unit;
@@ -737,35 +778,39 @@ private:
   // pg -> (osd list)
   int pg_to_osds(pg_t pg, vector<int>& osds) {
     // map to osds[]
-
+    int p = pg.pool();
+    if (!pools.count(p)) {
+      return osds.size();
+    }
+    ceph_pg_pool &pool = pools[p];
     ps_t pps = raw_pg_to_pps(pg);  // placement ps
 
     switch (g_conf.osd_pg_layout) {
     case CEPH_PG_LAYOUT_CRUSH:
       {
        // what crush rule?
-       int ruleno = crush.find_rule(pg.pool(), pg.type(), pg.size());
+       int ruleno = crush.find_rule(pool.crush_ruleset, pg.type(), pool.size);
        if (ruleno >= 0)
-         crush.do_rule(ruleno, pps, osds, pg.size(), pg.preferred(), osd_weight);
+         crush.do_rule(ruleno, pps, osds, pool.size, pg.preferred(), osd_weight);
       }
       break;
       
     case CEPH_PG_LAYOUT_LINEAR:
-      for (unsigned i=0; i<pg.size(); i++) 
-       osds.push_back( (i + pps*pg.size()) % g_conf.num_osd );
+      for (unsigned i=0; i<pool.size; i++) 
+       osds.push_back( (i + pps*pool.size) % g_conf.num_osd );
       break;
       
     case CEPH_PG_LAYOUT_HYBRID:
       {
        int h = crush_hash32(pps);
-       for (unsigned i=0; i<pg.size(); i++) 
+       for (unsigned i=0; i<pool.size; i++) 
          osds.push_back( (h+i) % g_conf.num_osd );
       }
       break;
       
     case CEPH_PG_LAYOUT_HASH:
       {
-       for (unsigned i=0; i<pg.size(); i++) {
+       for (unsigned i=0; i<pool.size; i++) {
          int t = 1;
          int osd = 0;
          while (t++) {
@@ -793,8 +838,8 @@ private:
       if (osds.empty()) {
         osds.push_back(osd);
       } else {
-        assert(pg.size() > 0);
-        for (unsigned i=1; i<pg.size(); i++)
+        assert(pool.size > 0);
+        for (unsigned i=1; i<pool.size; i++)
           if (osds[i] == osd) {
             // swap with position 0
             osds[i] = osds[0];
@@ -838,6 +883,11 @@ private:
 
 
 
+  unsigned get_pg_size(pg_t pg) {  // size of the pool that pg belongs to; 0 if that pool is not in this map
+    map<int,ceph_pg_pool>::iterator q = pools.find(pg.pool());  // find() never inserts, unlike operator[]
+    return q == pools.end() ? 0 : q->second.size;  // a lookup must not create a phantom pool entry
+  }
+
   // pg -> primary osd
   int get_pg_primary(pg_t pg) {
     vector<int> group;
@@ -908,7 +958,7 @@ private:
                    int num_osd, int num_dom,
                    int pg_bits, int lpg_bits,
                    int mds_local_osd);
-  static void build_simple_crush_map(CrushWrapper& crush, int num_osd, int num_dom=0);
+  static void build_simple_crush_map(CrushWrapper& crush, map<int, const char*>& poolsets, int num_osd, int num_dom=0);
 
 
   void print(ostream& out);
index bd18420424c81337e91d833ae104b091faabdb56..a115c03c73f335b0193bde4336e5ccc02bffc12b 100644 (file)
@@ -1269,7 +1269,7 @@ void PG::activate(ObjectStore::Transaction& t,
     state_clear(PG_STATE_REPLAY);
   }
   if (is_primary() && 
-      info.pgid.size() != acting.size())
+      osd->osdmap->get_pg_size(info.pgid) != acting.size())
     state_set(PG_STATE_DEGRADED);
   else
     state_clear(PG_STATE_DEGRADED);
@@ -1548,7 +1548,7 @@ void PG::update_stats()
     pg_stats_stable.state = state;
     pg_stats_stable.acting = acting;
 
-    pg_stats_stable.num_object_copies = pg_stats_stable.num_objects * info.pgid.size();
+    pg_stats_stable.num_object_copies = pg_stats_stable.num_objects * osd->osdmap->get_pg_size(info.pgid);
     if (!is_clean() && is_active()) {
       pg_stats_stable.num_objects_missing_on_primary = missing.num_missing();
       int degraded = missing.num_missing();
index a76c470b383827219e3d72bb7731f73dce517755..b4ca4f0b2dc68259905df082222fe8b9216e5bcc 100644 (file)
@@ -805,7 +805,7 @@ public:
 
   bool  is_empty() const { return info.last_update == eversion_t(0,0); }
 
-  bool is_complete_pg() { return acting.size() == info.pgid.size(); }
+  //bool is_complete_pg() { return acting.size() == info.pgid.size(); }
 
   void add_log_entry(Log::Entry& e, bufferlist& log_bl);
 
index e27e6d8c4daffb182ce744d836256e77cd40acea..a029585f54acaf64730bed66531f4938261d8e38 100644 (file)
@@ -1531,7 +1531,7 @@ void ReplicatedPG::issue_repop(RepGather *repop, int dest, utime_t now)
   wr->snapset = repop->pinfo->oi.snapset;
   wr->snapc = repop->snapc;
   wr->get_data() = repop->op->get_data();   // _copy_ bufferlist
-  if (is_complete_pg())
+  if (osd->osdmap->get_pg_size(info.pgid) == acting.size())
     wr->pg_trim_to = peers_complete_thru;
   wr->peer_stat = osd->get_my_stat_for(now, dest);
   osd->messenger->send_message(wr, osd->osdmap->get_inst(dest));
index d621f1ae2ac9f7b25319b4fd458f943564d6b6be..90854d1b6e3168e2d09c581eacfd4143e3318cba 100644 (file)
@@ -25,7 +25,7 @@
 
 
 
-#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v010"
+#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v011"
 
 
 
@@ -90,8 +90,22 @@ namespace __gnu_cxx {
 
 typedef uint16_t ps_t;
 
-#define OSD_METADATA_PG_POOL 0xff
-#define OSD_SUPERBLOCK_POBJECT pobject_t(OSD_METADATA_PG_POOL, 0, object_t(0,0))
+// object namespaces
+#define CEPH_METADATA_NS       1
+#define CEPH_DATA_NS           2
+#define CEPH_CAS_NS            3
+#define CEPH_OSDMETADATA_NS 0xff
+
+// poolsets
+enum {
+  CEPH_DATA_RULE,      // = 0 (implicit: first enumerator)
+  CEPH_METADATA_RULE,  // = 1
+  CEPH_CASDATA_RULE,   // = 2
+};
+
+//#define CEPH_POOL(poolset, size) (((poolset) << 8) + (size))
+
+#define OSD_SUPERBLOCK_POBJECT pobject_t(CEPH_OSDMETADATA_NS, 0, object_t(0,0))
 
 // placement group id
 struct pg_t {
@@ -105,10 +119,9 @@ public:
 public:
   pg_t() { u.pg64 = 0; }
   pg_t(const pg_t& o) { u.pg64 = o.u.pg64; }
-  pg_t(int type, int size, ps_t seed, int pool, int pref) {
+  pg_t(int type, ps_t seed, int pool, int pref) {
     u.pg64 = 0;
     u.pg.type = type;
-    u.pg.size = size;
     u.pg.ps = seed;
     u.pg.pool = pool;
     u.pg.preferred = pref;   // hack: avoid negative.
@@ -123,7 +136,6 @@ public:
   bool is_rep()   { return type() == TYPE_REP; }
   bool is_raid4() { return type() == TYPE_RAID4; }
 
-  unsigned size() { return u.pg.size; }
   ps_t ps() { return u.pg.ps; }
   int pool() { return u.pg.pool; }
   int preferred() { return u.pg.preferred; }   // hack: avoid negative.
@@ -131,7 +143,7 @@ public:
   operator uint64_t() const { return u.pg64; }
 
   pobject_t to_log_pobject() const { 
-    return pobject_t(OSD_METADATA_PG_POOL,   // osd metadata 
+    return pobject_t(CEPH_OSDMETADATA_NS,
                     0,
                     object_t(u.pg64, 0));
   }
@@ -144,15 +156,13 @@ public:
   }
 
   bool parse(const char *s) {
-    int numrep;
     int pool;
     int ps;
-    int r = sscanf(s, "%dx%d.%x", &numrep, &pool, &ps);
+    int r = sscanf(s, "%d.%x", &pool, &ps);
     if (r < 3)
       return false;
     u.pg.type = TYPE_REP;
     u.pg.pool = pool;
-    u.pg.size = numrep;
     u.pg.ps = ps;
     u.pg.preferred = -1;
     return true;
@@ -170,12 +180,6 @@ inline void decode(pg_t &pgid, bufferlist::iterator& p) {
 
 inline ostream& operator<<(ostream& out, pg_t pg) 
 {
-  if (pg.is_rep()) 
-    out << pg.size() << 'x';
-  else if (pg.is_raid4()) 
-    out << pg.size() << 'r';
-  else 
-    out << pg.size() << '?';
   out << pg.pool() << '.';
   out << hex << pg.ps() << dec;