From e286a7e9518e7866703e9bb6ba6785ee09c0b537 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 27 Apr 2009 16:31:50 -0700 Subject: [PATCH] osd: make pg pools resizeable Move the pg size from the pgid to the pg_pool descriptor, so that entire pools of pgs can be resized. --- src/client/SyntheticClient.cc | 6 +-- src/config.cc | 25 ++-------- src/config.h | 2 - src/crush/CrushWrapper.h | 4 +- src/crush/builder.c | 4 +- src/crush/crush.h | 4 +- src/crush/mapper.c | 10 ++-- src/crushtool.cc | 23 +++++---- src/include/ceph_fs.h | 17 ++++--- src/include/types.h | 1 + src/kernel/osd_client.c | 13 +++-- src/kernel/osdmap.c | 47 ++++++++++++++++-- src/kernel/osdmap.h | 3 ++ src/mon/OSDMonitor.cc | 47 +++++++++++++----- src/mon/PGMonitor.cc | 89 +++++++++++++++++------------------ src/osd/OSD.cc | 4 +- src/osd/OSD.h | 4 +- src/osd/OSDMap.cc | 63 +++++++++++++------------ src/osd/OSDMap.h | 76 +++++++++++++++++++++++++----- src/osd/PG.cc | 4 +- src/osd/PG.h | 2 +- src/osd/ReplicatedPG.cc | 2 +- src/osd/osd_types.h | 36 +++++++------- 23 files changed, 297 insertions(+), 189 deletions(-) diff --git a/src/client/SyntheticClient.cc b/src/client/SyntheticClient.cc index 6c4a359de8be..53be64de6b27 100644 --- a/src/client/SyntheticClient.cc +++ b/src/client/SyntheticClient.cc @@ -2146,8 +2146,7 @@ int SyntheticClient::create_objects(int nobj, int osize, int inflight) if (time_to_stop()) break; object_t oid(0x1000, i); - ceph_object_layout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, - g_default_file_layout.fl_pg_size, 0); + ceph_object_layout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 0); SnapContext snapc; if (i % inflight == 0) { @@ -2250,8 +2249,7 @@ int SyntheticClient::object_rw(int nobj, int osize, int wrpc, } object_t oid(0x1000, o); - ceph_object_layout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, - g_default_file_layout.fl_pg_size, 0); + ceph_object_layout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 0); SnapContext snapc; client->client_lock.Lock(); diff --git a/src/config.cc b/src/config.cc index 42607272e9e0..a9730b11643d 100644 --- a/src/config.cc +++ b/src/config.cc @@ -86,8 +86,7 @@ struct ceph_file_layout g_default_file_layout = { fl_object_stripe_unit: init_le32(0), fl_pg_preferred: init_le32(-1), fl_pg_type: CEPH_PG_TYPE_REP, - fl_pg_size: 2, - fl_pg_pool: 1 + fl_pg_pool: {1}, }; struct ceph_file_layout g_default_casdata_layout = { @@ -98,8 +97,7 @@ struct ceph_file_layout g_default_casdata_layout = { fl_object_stripe_unit: init_le32(0), fl_pg_preferred: init_le32(-1), fl_pg_type: CEPH_PG_TYPE_REP, - fl_pg_size: 2, - fl_pg_pool: 2 + fl_pg_pool: {3}, }; struct ceph_file_layout g_default_mds_dir_layout = { @@ -110,8 +108,7 @@ struct ceph_file_layout g_default_mds_dir_layout = { fl_object_stripe_unit: init_le32(0), fl_pg_preferred: init_le32(-1), fl_pg_type: CEPH_PG_TYPE_REP, - fl_pg_size: 2, - fl_pg_pool: 0 + fl_pg_pool: {2}, }; struct ceph_file_layout g_default_mds_log_layout = { @@ -122,8 +119,7 @@ struct ceph_file_layout g_default_mds_log_layout = { fl_object_stripe_unit: init_le32(0), fl_pg_preferred: init_le32(-1), fl_pg_type: CEPH_PG_TYPE_REP, - fl_pg_size: 2, - fl_pg_pool: 0 + fl_pg_pool: {2}, }; struct ceph_file_layout g_default_mds_anchortable_layout = { @@ -134,20 +130,9 @@ struct ceph_file_layout g_default_mds_anchortable_layout = { fl_object_stripe_unit: init_le32(0), fl_pg_preferred: init_le32(-1), fl_pg_type: CEPH_PG_TYPE_REP, - fl_pg_size: 2, - fl_pg_pool: 0 + fl_pg_pool: {2}, }; -const char *get_pool_name(int pool) -{ - switch (pool) { - case 0: return "metadata"; - case 1: return "data"; - case 2: return "casdata"; - default: return ""; - } -} - #include // fake osd failures: osd -> time diff --git a/src/config.h b/src/config.h index 693a7dbc2741..f76dd7946e0d 100644 --- a/src/config.h +++ b/src/config.h @@ -21,8 +21,6 @@ extern struct ceph_file_layout g_default_mds_dir_layout; extern struct ceph_file_layout g_default_mds_log_layout; extern struct ceph_file_layout g_default_mds_anchortable_layout; -extern const char *get_pool_name(int pool); - #include #include diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h index d8aeffd67b5c..50a500429205 100644 --- a/src/crush/CrushWrapper.h +++ b/src/crush/CrushWrapper.h @@ -182,10 +182,10 @@ public: if (IS_ERR(r)) return PTR_ERR(r); return r->len; } - int get_rule_mask_pool(unsigned ruleno) { + int get_rule_mask_ruleset(unsigned ruleno) { crush_rule *r = get_rule(ruleno); if (IS_ERR(r)) return -1; - return r->mask.pool; + return r->mask.ruleset; } int get_rule_mask_type(unsigned ruleno) { crush_rule *r = get_rule(ruleno); diff --git a/src/crush/builder.c b/src/crush/builder.c index ab5ad5200f31..7f36319da4bd 100644 --- a/src/crush/builder.c +++ b/src/crush/builder.c @@ -70,12 +70,12 @@ int crush_add_rule(struct crush_map *map, struct crush_rule *rule, int ruleno) return ruleno; } -struct crush_rule *crush_make_rule(int len, int pool, int type, int minsize, int maxsize) +struct crush_rule *crush_make_rule(int len, int ruleset, int type, int minsize, int maxsize) { struct crush_rule *rule; rule = malloc(crush_rule_size(len)); rule->len = len; - rule->mask.pool = pool; + rule->mask.ruleset = ruleset; rule->mask.type = type; rule->mask.min_size = minsize; rule->mask.max_size = maxsize; diff --git a/src/crush/crush.h b/src/crush/crush.h index e25976b17f60..19aba44b269d 100644 --- a/src/crush/crush.h +++ b/src/crush/crush.h @@ -56,11 +56,11 @@ enum { /* * The rule mask is used to describe what the rule is intended for. - * Given a storage pool and size of output set, we search through the + * Given a ruleset and size of output set, we search through the * rule list for a matching rule_mask. */ struct crush_rule_mask { - __u8 pool; + __u8 ruleset; __u8 type; __u8 min_size; __u8 max_size; diff --git a/src/crush/mapper.c b/src/crush/mapper.c index 4752c2d73aa5..81e2da99e7af 100644 --- a/src/crush/mapper.c +++ b/src/crush/mapper.c @@ -23,19 +23,19 @@ /** - * crush_find_rule - find a crush_rule id for a given pool, type, and size. + * crush_find_rule - find a crush_rule id for a given ruleset, type, and size. * @map: the crush_map - * @pool: the storage pool id (user defined) - * @type: storage pool type (user defined) + * @ruleset: the storage ruleset id (user defined) + * @type: storage ruleset type (user defined) * @size: output set size */ -int crush_find_rule(struct crush_map *map, int pool, int type, int size) +int crush_find_rule(struct crush_map *map, int ruleset, int type, int size) { int i; for (i = 0; i < map->max_rules; i++) { if (map->rules[i] && - map->rules[i]->mask.pool == pool && + map->rules[i]->mask.ruleset == ruleset && map->rules[i]->mask.type == type && map->rules[i]->mask.min_size <= size && map->rules[i]->mask.max_size >= size) diff --git a/src/crushtool.cc b/src/crushtool.cc index 00ffbd93562a..b036530e4dbd 100644 --- a/src/crushtool.cc +++ b/src/crushtool.cc @@ -234,7 +234,7 @@ void parse_rule(iter_t const& i, CrushWrapper &crush) start = 3; } - int pool = int_node(i->children[start]); + int ruleset = int_node(i->children[start]); string tname = string_node(i->children[start+2]); int type; @@ -251,7 +251,7 @@ void parse_rule(iter_t const& i, CrushWrapper &crush) int steps = i->children.size() - start - 8; //cout << "num steps " << steps << std::endl; - int ruleno = crush.add_rule(steps, pool, type, minsize, maxsize, -1); + int ruleno = crush.add_rule(steps, ruleset, type, minsize, maxsize, -1); if (rname.length()) { crush.set_rule_name(ruleno, rname.c_str()); rule_id[rname] = ruleno; @@ -560,7 +560,7 @@ int decompile_crush(CrushWrapper &crush, ostream &out) if (crush.get_rule_name(i)) print_rule_name(out, i, crush); out << " {\n"; - out << "\tpool " << crush.get_rule_mask_pool(i) << "\n"; + out << "\truleset " << crush.get_rule_mask_ruleset(i) << "\n"; switch (crush.get_rule_mask_type(i)) { case CEPH_PG_TYPE_REP: out << "\ttype replicated\n"; break; case CEPH_PG_TYPE_RAID4: out << "\ttype raid4\n"; break; @@ -820,15 +820,14 @@ int main(int argc, const char **argv) lower_weights.swap(cur_weights); } - // make some generic rules - for (int pool=0; pool<3; pool++) { - crush_rule *rule = crush_make_rule(3, pool, CEPH_PG_TYPE_REP, 2, 2); - crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0); - crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_LEAF_FIRSTN, CRUSH_CHOOSE_N, 1); - crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0); - int rno = crush_add_rule(crush.crush, rule, -1); - crush.set_rule_name(rno, get_pool_name(pool)); - } + // make a generic rules + int ruleset=1; + crush_rule *rule = crush_make_rule(3, ruleset, CEPH_PG_TYPE_REP, 2, 2); + crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0); + crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_LEAF_FIRSTN, CRUSH_CHOOSE_N, 1); + crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0); + int rno = crush_add_rule(crush.crush, rule, -1); + crush.set_rule_name(rno, "data"); crush.finalize(); dout(0) << "crush max_devices " << crush.crush->max_devices << dendl; diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index e8beda206232..b82aa6b91b39 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -264,9 +264,9 @@ struct ceph_file_layout { /* object -> pg layout */ __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */ - __u8 fl_pg_type; /* pg type; see PG_TYPE_* */ - __u8 fl_pg_size; /* pg size (num replicas, etc.) */ - __u8 fl_pg_pool; /* implies crush ruleset AND object namespace */ + __u8 fl_pg_type; + __le16 fl_pg_pool; /* implies crush ruleset, rep level */ + __le16 fl_pg_ns; /* object namespace */ } __attribute__ ((packed)); #define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit)) @@ -297,16 +297,21 @@ union ceph_pg { struct { __s16 preferred; /* preferred primary osd */ __u16 ps; /* placement seed */ - __u8 __pad; - __u8 size; - __u8 pool; /* implies crush ruleset */ + __u16 pool; /* implies crush ruleset */ __u8 type; + __u8 __pad; } pg; } __attribute__ ((packed)); #define ceph_pg_is_rep(pg) ((pg).pg.type == CEPH_PG_TYPE_REP) #define ceph_pg_is_raid4(pg) ((pg).pg.type == CEPH_PG_TYPE_RAID4) +struct ceph_pg_pool { + __u8 crush_ruleset; + __u8 size; + __u8 type; +} __attribute__ ((packed)); + /* * stable_mod func is used to control number of placement groups. * similar to straight-up modulo, but produces a stable mapping as b diff --git a/src/include/types.h b/src/include/types.h index 76b4d4e08126..a5dcf464c890 100644 --- a/src/include/types.h +++ b/src/include/types.h @@ -199,6 +199,7 @@ struct ltstr WRITE_RAW_ENCODER(ceph_fsid_t) WRITE_RAW_ENCODER(ceph_file_layout) +WRITE_RAW_ENCODER(ceph_pg_pool) WRITE_RAW_ENCODER(ceph_client_ticket) WRITE_RAW_ENCODER(ceph_mds_request_head) WRITE_RAW_ENCODER(ceph_mds_request_release) diff --git a/src/kernel/osd_client.c b/src/kernel/osd_client.c index 66b034e7abe3..90ccda5fde50 100644 --- a/src/kernel/osd_client.c +++ b/src/kernel/osd_client.c @@ -320,13 +320,16 @@ static int map_osds(struct ceph_osd_client *osdc, unsigned pps; /* placement ps */ int osds[10], osd = -1; int i, num; + struct ceph_pg_pool *pool; - ruleno = crush_find_rule(osdc->osdmap->crush, req->r_pgid.pg.pool, - req->r_pgid.pg.type, req->r_pgid.pg.size); + if (req->r_pgid.pg.pool >= osdc->osdmap->num_pools) + return -1; + pool = &osdc->osdmap->pg_pool[req->r_pgid.pg.pool]; + ruleno = crush_find_rule(osdc->osdmap->crush, pool->crush_ruleset, + req->r_pgid.pg.type, pool->size); if (ruleno < 0) { derr(0, "map_osds no crush rule for pool %d type %d size %d\n", - req->r_pgid.pg.pool, req->r_pgid.pg.type, - req->r_pgid.pg.size); + req->r_pgid.pg.pool, req->r_pgid.pg.type, pool->size); return -1; } @@ -339,7 +342,7 @@ static int map_osds(struct ceph_osd_client *osdc, osdc->osdmap->pgp_num, osdc->osdmap->pgp_num_mask); num = crush_do_rule(osdc->osdmap->crush, ruleno, pps, osds, - min_t(int, req->r_pgid.pg.size, ARRAY_SIZE(osds)), + min_t(int, pool->size, ARRAY_SIZE(osds)), req->r_pgid.pg.preferred, osdc->osdmap->osd_weight); /* primary is first up osd */ diff --git a/src/kernel/osdmap.c b/src/kernel/osdmap.c index eb1cbeea5608..e1a21fe591f0 100644 --- a/src/kernel/osdmap.c +++ b/src/kernel/osdmap.c @@ -294,6 +294,7 @@ void osdmap_destroy(struct ceph_osdmap *map) crush_destroy(map->crush); kfree(map->osd_state); kfree(map->osd_weight); + kfree(map->pg_pool); kfree(map->osd_addr); kfree(map); } @@ -366,7 +367,24 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) ceph_decode_32(p, map->lpg_num); ceph_decode_32(p, map->lpgp_num); ceph_decode_32(p, map->last_pg_change); - ceph_decode_32(p, map->flags); + + ceph_decode_32(p, map->num_pools); + map->pg_pool = kmalloc(map->num_pools * sizeof(*map->pg_pool), + GFP_NOFS); + if (!map->pg_pool) { + err = -ENOMEM; + goto bad; + } + ceph_decode_32_safe(p, end, max, bad); + while (max--) { + ceph_decode_need(p, end, 4+sizeof(*map->pg_pool), bad); + ceph_decode_32(p, i); + if (i >= map->num_pools) + goto bad; + ceph_decode_copy(p, &map->pg_pool[i], sizeof(*map->pg_pool)); + } + + ceph_decode_32_safe(p, end, map->flags, bad); calc_pg_masks(map); @@ -430,7 +448,7 @@ struct ceph_osdmap *apply_incremental(void **p, void *end, ceph_fsid_t fsid; u32 epoch = 0; struct ceph_timespec modified; - u32 len, x; + u32 len, x, pool; __s32 new_flags, max; void *start = *p; int err = -EINVAL; @@ -502,6 +520,30 @@ struct ceph_osdmap *apply_incremental(void **p, void *end, newcrush = NULL; } + /* new_pool */ + ceph_decode_32_safe(p, end, len, bad); + while (len--) { + ceph_decode_32_safe(p, end, pool, bad); + if (pool >= map->num_pools) { + void *p = kzalloc((pool+1) * sizeof(*map->pg_pool), + GFP_NOFS); + if (!p) { + err = -ENOMEM; + goto bad; + } + memcpy(p, map->pg_pool, + map->num_pools * sizeof(*map->pg_pool)); + kfree(map->pg_pool); + map->pg_pool = p; + map->num_pools = pool+1; + } + ceph_decode_copy(p, &map->pg_pool[pool], sizeof(*map->pg_pool)); + } + + /* old_pool (ignore) */ + ceph_decode_32_safe(p, end, len, bad); + *p += len * (sizeof(u32) + sizeof(*map->pg_pool)); + /* new_up */ err = -EINVAL; ceph_decode_32_safe(p, end, len, bad); @@ -633,7 +675,6 @@ void calc_object_layout(struct ceph_object_layout *ol, pgid.pg.ps = bno + crush_hash32_2(ino, ino>>32); pgid.pg.preferred = preferred; pgid.pg.type = fl->fl_pg_type; - pgid.pg.size = fl->fl_pg_size; pgid.pg.pool = fl->fl_pg_pool; ol->ol_pgid = cpu_to_le64(pgid.pg64); diff --git a/src/kernel/osdmap.h b/src/kernel/osdmap.h index 8cee8268c80c..21f80aa550cf 100644 --- a/src/kernel/osdmap.h +++ b/src/kernel/osdmap.h @@ -38,6 +38,9 @@ struct ceph_osdmap { u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */ struct ceph_entity_addr *osd_addr; + u32 num_pools; + struct ceph_pg_pool *pg_pool; + /* the CRUSH map specifies the mapping of placement groups to * the list of osds that store+replicate them. */ struct crush_map *crush; diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 79fb97c8d256..b57f0cfa9e66 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -846,20 +846,16 @@ void OSDMonitor::tick() // For all PGs that have OSD 0 as the primary, // switch them to use the first replca ps_t numps = osdmap.get_pg_num(); - int minrep = 1; - int maxrep = MIN(g_conf.num_osd, g_conf.osd_max_rep); for (int pool=0; pool<1; pool++) - for (int nrep = minrep; nrep <= maxrep; nrep++) { - for (ps_t ps = 0; ps < numps; ++ps) { - pg_t pgid = pg_t(pg_t::TYPE_REP, nrep, ps, pool, -1); - vector osds; - osdmap.pg_to_osds(pgid, osds); - if (osds[0] == 0) { - pending_inc.new_pg_swap_primary[pgid] = osds[1]; - dout(3) << "Changing primary for PG " << pgid << " from " << osds[0] << " to " - << osds[1] << dendl; - do_propose = true; - } + for (ps_t ps = 0; ps < numps; ++ps) { + pg_t pgid = pg_t(pg_t::TYPE_REP, ps, pool, -1); + vector osds; + osdmap.pg_to_osds(pgid, osds); + if (osds[0] == 0) { + pending_inc.new_pg_swap_primary[pgid] = osds[1]; + dout(3) << "Changing primary for PG " << pgid << " from " << osds[0] << " to " + << osds[1] << dendl; + do_propose = true; } } } @@ -1119,6 +1115,31 @@ bool OSDMonitor::prepare_command(MMonCommand *m) return true; } } + else if (m->cmd[1] == "pool" && m->cmd.size() >= 5) { + int pool = -1; + for (map::iterator p = osdmap.pool_name.begin(); + p != osdmap.pool_name.end(); + p++) { + if (p->second == m->cmd[2]) + pool = p->first; + } + if (pool >= 0) { + if (m->cmd[3] == "size") { + int s = atoi(m->cmd[4].c_str()); + if (s) { + pending_inc.new_pools[pool] = osdmap.pools[pool]; + pending_inc.new_pools[pool].size = s; + ss << "set pool " << pool << " size to " << s; + getline(ss, rs); + paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs)); + return true; + } + } + } else { + ss << "unrecognized pool '" << m->cmd[2] << "'"; + err = -ENOENT; + } + } else { ss << "unknown command " << m->cmd[1]; } diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index 7394eccc87a6..57fab6426ea0 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -440,57 +440,56 @@ bool PGMonitor::register_new_pgs() bool first = pg_map.pg_stat.empty(); // first pg creation int created = 0; - for (int ruleno=0; rulenoget_max_rules(); ruleno++) { + for (map::iterator p = mon->osdmon()->osdmap.pools.begin(); + p != mon->osdmon()->osdmap.pools.end(); + p++) { + int pool = p->first; + int type = p->second.type; + int ruleno = p->second.crush_ruleset; if (!crush->rule_exists(ruleno)) continue; - int pool = crush->get_rule_mask_pool(ruleno); - int type = crush->get_rule_mask_type(ruleno); - int min_size = crush->get_rule_mask_min_size(ruleno); - int max_size = crush->get_rule_mask_max_size(ruleno); - for (int size = min_size; size <= max_size; size++) { - for (ps_t ps = 0; ps < pg_num; ps++) { - pg_t pgid(type, size, ps, pool, -1); - if (pg_map.pg_stat.count(pgid)) { - dout(20) << "register_new_pgs have " << pgid << dendl; - continue; - } + for (ps_t ps = 0; ps < pg_num; ps++) { + pg_t pgid(type, ps, pool, -1); + if (pg_map.pg_stat.count(pgid)) { + dout(20) << "register_new_pgs have " << pgid << dendl; + continue; + } - pg_t parent; - int split_bits = 0; - if (!first) { - parent = pgid; - while (1) { - // remove most significant bit - int msb = calc_bits_of(parent.u.pg.ps); - if (!msb) break; - parent.u.pg.ps &= ~(1<<(msb-1)); - split_bits++; - dout(10) << " is " << pgid << " parent " << parent << " ?" << dendl; - //if (parent.u.pg.ps < mon->osdmon->osdmap.get_pgp_num()) { - if (pg_map.pg_stat.count(parent) && - pg_map.pg_stat[parent].state != PG_STATE_CREATING) { - dout(10) << " parent is " << parent << dendl; - break; - } + pg_t parent; + int split_bits = 0; + if (!first) { + parent = pgid; + while (1) { + // remove most significant bit + int msb = calc_bits_of(parent.u.pg.ps); + if (!msb) break; + parent.u.pg.ps &= ~(1<<(msb-1)); + split_bits++; + dout(10) << " is " << pgid << " parent " << parent << " ?" << dendl; + //if (parent.u.pg.ps < mon->osdmon->osdmap.get_pgp_num()) { + if (pg_map.pg_stat.count(parent) && + pg_map.pg_stat[parent].state != PG_STATE_CREATING) { + dout(10) << " parent is " << parent << dendl; + break; } } - - pending_inc.pg_stat_updates[pgid].state = PG_STATE_CREATING; - pending_inc.pg_stat_updates[pgid].created = epoch; - pending_inc.pg_stat_updates[pgid].parent = parent; - pending_inc.pg_stat_updates[pgid].parent_split_bits = split_bits; - created++; - - if (split_bits == 0) { - dout(10) << "register_new_pgs will create " << pgid << dendl; - } else { - dout(10) << "register_new_pgs will create " << pgid - << " parent " << parent - << " by " << split_bits << " bits" - << dendl; - } - } + + pending_inc.pg_stat_updates[pgid].state = PG_STATE_CREATING; + pending_inc.pg_stat_updates[pgid].created = epoch; + pending_inc.pg_stat_updates[pgid].parent = parent; + pending_inc.pg_stat_updates[pgid].parent_split_bits = split_bits; + created++; + + if (split_bits == 0) { + dout(10) << "register_new_pgs will create " << pgid << dendl; + } else { + dout(10) << "register_new_pgs will create " << pgid + << " parent " << parent + << " by " << split_bits << " bits" + << dendl; + } + } } dout(10) << "register_new_pgs registered " << created << " new pgs" << dendl; diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index cfa9cbcd1df9..3e3b0ce672f3 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -2037,7 +2037,7 @@ void OSD::advance_map(ObjectStore::Transaction& t, interval_set& remov pg->state_clear(PG_STATE_PEERING); // we'll need to restart peering if (pg->is_primary() && - pg->info.pgid.size() != pg->acting.size()) + osdmap->get_pg_size(pg->info.pgid) != pg->acting.size()) pg->state_set(PG_STATE_DEGRADED); else pg->state_clear(PG_STATE_DEGRADED); @@ -2450,7 +2450,7 @@ void OSD::split_pg(PG *parent, map& children, ObjectStore::Transaction for (vector::iterator p = olist.begin(); p != olist.end(); p++) { pobject_t poid = *p; - ceph_object_layout l = osdmap->make_object_layout(poid.oid, parentid.type(), parentid.size(), + ceph_object_layout l = osdmap->make_object_layout(poid.oid, parentid.type(), parentid.pool(), parentid.preferred()); if (le64_to_cpu(l.ol_pgid) != parentid.u.pg64) { pg_t pgid(le64_to_cpu(l.ol_pgid)); diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 07cd7a1a98c7..aa0d4b7d0752 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -119,10 +119,10 @@ public: int get_nodeid() { return whoami; } static pobject_t get_osdmap_pobject_name(epoch_t epoch) { - return pobject_t(OSD_METADATA_PG_POOL, 0, object_t(0, epoch << 1)); + return pobject_t(CEPH_OSDMETADATA_NS, 0, object_t(0, epoch << 1)); } static pobject_t get_inc_osdmap_pobject_name(epoch_t epoch) { - return pobject_t(OSD_METADATA_PG_POOL, 0, object_t(0, (epoch << 1) + 1)); + return pobject_t(CEPH_OSDMETADATA_NS, 0, object_t(0, (epoch << 1) + 1)); } diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 79b231586b80..8aef86fcd0f6 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -31,6 +31,14 @@ void OSDMap::print(ostream& out) << "lpgp_num " << get_lpgp_num() << "\n" << "last_pg_change " << get_last_pg_change() << "\n" << std::endl; + for (map::iterator p = pools.begin(); p != pools.end(); p++) + out << "pg_pool " << p->first + << " '" << pool_name[p->first] + << "' size " << (int)p->second.size + << " crush_ruleset " << (int)p->second.crush_ruleset + << "\n"; + out << std::endl; + out << "max_osd " << get_max_osd() << "\n"; for (int i=0; i rulesets; + rulesets[CEPH_DATA_RULE] = "data"; + rulesets[CEPH_METADATA_RULE] = "metadata"; + rulesets[CEPH_CASDATA_RULE] = "casdata"; + + int pool = 0; + for (map::iterator p = rulesets.begin(); p != rulesets.end(); p++) { + pools[pool].size = 2; + pools[pool].crush_ruleset = p->first; + pools[pool].type = CEPH_PG_TYPE_REP; + pool_name[pool] = p->second; + pool++; + } + + build_simple_crush_map(crush, rulesets, num_osd, num_dom); for (int i=0; i& rulesets, int num_osd, int num_dom) { // new @@ -117,8 +139,6 @@ void OSDMap::build_simple_crush_map(CrushWrapper& crush, int num_osd, crush.set_type_name(1, "domain"); crush.set_type_name(2, "pool"); - int npools = 3; - int minrep = g_conf.osd_min_rep; int ndom = num_dom; if (!ndom) @@ -161,26 +181,15 @@ void OSDMap::build_simple_crush_map(CrushWrapper& crush, int num_osd, crush.set_item_name(rootid, "root"); // rules - // replication - for (int pool=0; pool::iterator p = rulesets.begin(); p != rulesets.end(); p++) { + int ruleset = p->first; + crush_rule *rule = crush_make_rule(3, ruleset, CEPH_PG_TYPE_REP, minrep, g_conf.osd_max_rep); crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0); crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_LEAF_FIRSTN, CRUSH_CHOOSE_N, 1); // choose N domains crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0); int rno = crush_add_rule(crush.crush, rule, -1); - crush.set_rule_name(rno, get_pool_name(pool)); + crush.set_rule_name(rno, p->second); } - - // raid - if (false && g_conf.osd_min_raid_width <= g_conf.osd_max_raid_width) - for (int pool=0; pool::iterator p = rulesets.begin(); p != rulesets.end(); p++) { + int ruleset = p->first; + crush_rule *rule = crush_make_rule(3, ruleset, CEPH_PG_TYPE_REP, g_conf.osd_min_rep, g_conf.osd_max_rep); crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0); crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_FIRSTN, CRUSH_CHOOSE_N, 0); crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0); int rno = crush_add_rule(crush.crush, rule, -1); - crush.set_rule_name(rno, get_pool_name(pool)); + crush.set_rule_name(rno, p->second); } - // raid4 - if (false && g_conf.osd_min_raid_width <= g_conf.osd_max_raid_width) - for (int pool=0; pool new_pools; + map new_pool_names; + set old_pools; map new_up; map new_down; map new_weight; @@ -180,11 +183,14 @@ public: ::encode(new_pgp_num, bl); ::encode(new_lpg_num, bl); ::encode(new_lpgp_num, bl); + ::encode(new_pools, bl); + ::encode(old_pools, bl); ::encode(new_up, bl); ::encode(new_down, bl); ::encode(new_weight, bl); // extended + ::encode(new_pool_names, bl); ::encode(new_up_thru, bl); ::encode(new_last_clean_interval, bl); ::encode(new_lost, bl); @@ -209,11 +215,14 @@ public: ::decode(new_pgp_num, p); ::decode(new_lpg_num, p); ::decode(new_lpgp_num, p); + ::decode(new_pools, p); + ::decode(old_pools, p); ::decode(new_up, p); ::decode(new_down, p); ::decode(new_weight, p); // extended + ::decode(new_pool_names, p); ::decode(new_up_thru, p); ::decode(new_last_clean_interval, p); ::decode(new_lost, p); @@ -276,6 +285,8 @@ private: vector<__u32> osd_weight; // 16.16 fixed point, 0x10000 = "in", 0 = "out" vector osd_info; + map pools; + map pool_name; map pg_swap_primary; // force new osd to be pg primary (if already a member) snapid_t max_snap; interval_set removed_snaps; @@ -286,6 +297,7 @@ private: CrushWrapper crush; // hierarchical map friend class OSDMonitor; + friend class PGMonitor; friend class MDS; public: @@ -528,6 +540,21 @@ private: if (inc.new_max_osd >= 0) set_max_osd(inc.new_max_osd); + for (set::iterator p = inc.old_pools.begin(); + p != inc.old_pools.end(); + p++) { + pools.erase(*p); + pool_name.erase(*p); + } + for (map::iterator p = inc.new_pools.begin(); + p != inc.new_pools.end(); + p++) + pools[p->first] = p->second; + for (map::iterator p = inc.new_pool_names.begin(); + p != inc.new_pool_names.end(); + p++) + pool_name[p->first] = p->second; + for (map::iterator i = inc.new_weight.begin(); i != inc.new_weight.end(); i++) @@ -609,6 +636,13 @@ private: ::encode(lpg_num, blist); ::encode(lpgp_num, blist); ::encode(last_pg_change, blist); + + int32_t max_pools = 0; + if (pools.size()) + max_pools = pools.rbegin()->first + 1; + ::encode(max_pools, blist); + ::encode(pools, blist); + ::encode(flags, blist); ::encode(max_osd, blist); @@ -623,6 +657,7 @@ private: // extended ::encode(osd_info, blist); + ::encode(pool_name, blist); ::encode(pg_swap_primary, blist); ::encode(max_snap, blist); @@ -643,6 +678,11 @@ private: ::decode(lpgp_num, p); calc_pg_masks(); ::decode(last_pg_change, p); + + int32_t max_pools; + ::decode(max_pools, p); + ::decode(pools, p); + ::decode(flags, p); ::decode(max_osd, p); @@ -658,6 +698,7 @@ private: // extended ::decode(osd_info, p); + ::decode(pool_name, p); ::decode(pg_swap_primary, p); ::decode(max_snap, p); @@ -672,13 +713,13 @@ private: // oid -> pg ceph_object_layout file_to_object_layout(object_t oid, ceph_file_layout& layout) { - return make_object_layout(oid, layout.fl_pg_type, layout.fl_pg_size, + return make_object_layout(oid, layout.fl_pg_type, layout.fl_pg_pool, ceph_file_layout_pg_preferred(layout), ceph_file_layout_object_su(layout)); } - ceph_object_layout make_object_layout(object_t oid, int pg_type, int pg_size, int pg_pool, int preferred=-1, int object_stripe_unit = 0) { + ceph_object_layout make_object_layout(object_t oid, int pg_type, int pg_pool, int preferred=-1, int object_stripe_unit = 0) { // calculate ps (placement seed) ps_t ps; // NOTE: keep full precision, here! switch (g_conf.osd_object_layout) { @@ -705,7 +746,7 @@ private: //cout << "preferred " << preferred << " num " << num << " mask " << num_mask << " ps " << ps << endl; // construct object layout - pg_t pgid = pg_t(pg_type, pg_size, ps, pg_pool, preferred); + pg_t pgid = pg_t(pg_type, ps, pg_pool, preferred); ceph_object_layout layout; layout.ol_pgid = pgid.u.pg64; layout.ol_stripe_unit = object_stripe_unit; @@ -737,35 +778,39 @@ private: // pg -> (osd list) int pg_to_osds(pg_t pg, vector& osds) { // map to osds[] - + int p = pg.pool(); + if (!pools.count(p)) { + return osds.size(); + } + ceph_pg_pool &pool = pools[p]; ps_t pps = raw_pg_to_pps(pg); // placement ps switch (g_conf.osd_pg_layout) { case CEPH_PG_LAYOUT_CRUSH: { // what crush rule? - int ruleno = crush.find_rule(pg.pool(), pg.type(), pg.size()); + int ruleno = crush.find_rule(pool.crush_ruleset, pg.type(), pool.size); if (ruleno >= 0) - crush.do_rule(ruleno, pps, osds, pg.size(), pg.preferred(), osd_weight); + crush.do_rule(ruleno, pps, osds, pool.size, pg.preferred(), osd_weight); } break; case CEPH_PG_LAYOUT_LINEAR: - for (unsigned i=0; i 0); - for (unsigned i=1; i 0); + for (unsigned i=1; i primary osd int get_pg_primary(pg_t pg) { vector group; @@ -908,7 +958,7 @@ private: int num_osd, int num_dom, int pg_bits, int lpg_bits, int mds_local_osd); - static void build_simple_crush_map(CrushWrapper& crush, int num_osd, int num_dom=0); + static void build_simple_crush_map(CrushWrapper& crush, map& poolsets, int num_osd, int num_dom=0); void print(ostream& out); diff --git a/src/osd/PG.cc b/src/osd/PG.cc index bd18420424c8..a115c03c73f3 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -1269,7 +1269,7 @@ void PG::activate(ObjectStore::Transaction& t, state_clear(PG_STATE_REPLAY); } if (is_primary() && - info.pgid.size() != acting.size()) + osd->osdmap->get_pg_size(info.pgid) != acting.size()) state_set(PG_STATE_DEGRADED); else state_clear(PG_STATE_DEGRADED); @@ -1548,7 +1548,7 @@ void PG::update_stats() pg_stats_stable.state = state; pg_stats_stable.acting = acting; - pg_stats_stable.num_object_copies = pg_stats_stable.num_objects * info.pgid.size(); + pg_stats_stable.num_object_copies = pg_stats_stable.num_objects * osd->osdmap->get_pg_size(info.pgid); if (!is_clean() && is_active()) { pg_stats_stable.num_objects_missing_on_primary = missing.num_missing(); int degraded = missing.num_missing(); diff --git a/src/osd/PG.h b/src/osd/PG.h index a76c470b3838..b4ca4f0b2dc6 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -805,7 +805,7 @@ public: bool is_empty() const { return info.last_update == eversion_t(0,0); } - bool is_complete_pg() { return acting.size() == info.pgid.size(); } + //bool is_complete_pg() { return acting.size() == info.pgid.size(); } void add_log_entry(Log::Entry& e, bufferlist& log_bl); diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index e27e6d8c4daf..a029585f54ac 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -1531,7 +1531,7 @@ void ReplicatedPG::issue_repop(RepGather *repop, int dest, utime_t now) wr->snapset = repop->pinfo->oi.snapset; wr->snapc = repop->snapc; wr->get_data() = repop->op->get_data(); // _copy_ bufferlist - if (is_complete_pg()) + if (osd->osdmap->get_pg_size(info.pgid) == acting.size()) wr->pg_trim_to = peers_complete_thru; wr->peer_stat = osd->get_my_stat_for(now, dest); osd->messenger->send_message(wr, osd->osdmap->get_inst(dest)); diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index d621f1ae2ac9..90854d1b6e31 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -25,7 +25,7 @@ -#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v010" +#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v011" @@ -90,8 +90,22 @@ namespace __gnu_cxx { typedef uint16_t ps_t; -#define OSD_METADATA_PG_POOL 0xff -#define OSD_SUPERBLOCK_POBJECT pobject_t(OSD_METADATA_PG_POOL, 0, object_t(0,0)) +// object namespaces +#define CEPH_METADATA_NS 1 +#define CEPH_DATA_NS 2 +#define CEPH_CAS_NS 3 +#define CEPH_OSDMETADATA_NS 0xff + +// poolsets +enum { + CEPH_DATA_RULE, + CEPH_METADATA_RULE, + CEPH_CASDATA_RULE, +}; + +//#define CEPH_POOL(poolset, size) (((poolset) << 8) + (size)) + +#define OSD_SUPERBLOCK_POBJECT pobject_t(CEPH_OSDMETADATA_NS, 0, object_t(0,0)) // placement group id struct pg_t { @@ -105,10 +119,9 @@ public: public: pg_t() { u.pg64 = 0; } pg_t(const pg_t& o) { u.pg64 = o.u.pg64; } - pg_t(int type, int size, ps_t seed, int pool, int pref) { + pg_t(int type, ps_t seed, int pool, int pref) { u.pg64 = 0; u.pg.type = type; - u.pg.size = size; u.pg.ps = seed; u.pg.pool = pool; u.pg.preferred = pref; // hack: avoid negative. @@ -123,7 +136,6 @@ public: bool is_rep() { return type() == TYPE_REP; } bool is_raid4() { return type() == TYPE_RAID4; } - unsigned size() { return u.pg.size; } ps_t ps() { return u.pg.ps; } int pool() { return u.pg.pool; } int preferred() { return u.pg.preferred; } // hack: avoid negative. @@ -131,7 +143,7 @@ public: operator uint64_t() const { return u.pg64; } pobject_t to_log_pobject() const { - return pobject_t(OSD_METADATA_PG_POOL, // osd metadata + return pobject_t(CEPH_OSDMETADATA_NS, 0, object_t(u.pg64, 0)); } @@ -144,15 +156,13 @@ public: } bool parse(const char *s) { - int numrep; int pool; int ps; - int r = sscanf(s, "%dx%d.%x", &numrep, &pool, &ps); + int r = sscanf(s, "%d.%x", &pool, &ps); if (r < 3) return false; u.pg.type = TYPE_REP; u.pg.pool = pool; - u.pg.size = numrep; u.pg.ps = ps; u.pg.preferred = -1; return true; @@ -170,12 +180,6 @@ inline void decode(pg_t &pgid, bufferlist::iterator& p) { inline ostream& operator<<(ostream& out, pg_t pg) { - if (pg.is_rep()) - out << pg.size() << 'x'; - else if (pg.is_raid4()) - out << pg.size() << 'r'; - else - out << pg.size() << '?'; out << pg.pool() << '.'; out << hex << pg.ps() << dec; -- 2.47.3