if (time_to_stop()) break;
object_t oid(0x1000, i);
- ceph_object_layout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP,
- g_default_file_layout.fl_pg_size, 0);
+ ceph_object_layout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 0);
SnapContext snapc;
if (i % inflight == 0) {
}
object_t oid(0x1000, o);
- ceph_object_layout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP,
- g_default_file_layout.fl_pg_size, 0);
+ ceph_object_layout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 0);
SnapContext snapc;
client->client_lock.Lock();
fl_object_stripe_unit: init_le32(0),
fl_pg_preferred: init_le32(-1),
fl_pg_type: CEPH_PG_TYPE_REP,
- fl_pg_size: 2,
- fl_pg_pool: 1
+ fl_pg_pool: {1},
};
struct ceph_file_layout g_default_casdata_layout = {
fl_object_stripe_unit: init_le32(0),
fl_pg_preferred: init_le32(-1),
fl_pg_type: CEPH_PG_TYPE_REP,
- fl_pg_size: 2,
- fl_pg_pool: 2
+ fl_pg_pool: {3}, /* NOTE(review): the simple osdmap in this change only creates pools 0..2 ("data", "metadata", "casdata"); confirm pool 3 is intended */
};
struct ceph_file_layout g_default_mds_dir_layout = {
fl_object_stripe_unit: init_le32(0),
fl_pg_preferred: init_le32(-1),
fl_pg_type: CEPH_PG_TYPE_REP,
- fl_pg_size: 2,
- fl_pg_pool: 0
+ fl_pg_pool: {2},
};
struct ceph_file_layout g_default_mds_log_layout = {
fl_object_stripe_unit: init_le32(0),
fl_pg_preferred: init_le32(-1),
fl_pg_type: CEPH_PG_TYPE_REP,
- fl_pg_size: 2,
- fl_pg_pool: 0
+ fl_pg_pool: {2},
};
struct ceph_file_layout g_default_mds_anchortable_layout = {
fl_object_stripe_unit: init_le32(0),
fl_pg_preferred: init_le32(-1),
fl_pg_type: CEPH_PG_TYPE_REP,
- fl_pg_size: 2,
- fl_pg_pool: 0
+ fl_pg_pool: {2},
};
-const char *get_pool_name(int pool)
-{
- switch (pool) {
- case 0: return "metadata";
- case 1: return "data";
- case 2: return "casdata";
- default: return "";
- }
-}
-
#include <msg/msg_types.h>
// fake osd failures: osd -> time
extern struct ceph_file_layout g_default_mds_log_layout;
extern struct ceph_file_layout g_default_mds_anchortable_layout;
-extern const char *get_pool_name(int pool);
-
#include <vector>
#include <map>
if (IS_ERR(r)) return PTR_ERR(r);
return r->len;
}
- int get_rule_mask_pool(unsigned ruleno) {
+ int get_rule_mask_ruleset(unsigned ruleno) { /* ruleset id stored in the rule's mask; -1 if get_rule() yields an error pointer */
crush_rule *r = get_rule(ruleno);
if (IS_ERR(r)) return -1;
- return r->mask.pool;
+ return r->mask.ruleset;
}
int get_rule_mask_type(unsigned ruleno) {
crush_rule *r = get_rule(ruleno);
return ruleno;
}
-struct crush_rule *crush_make_rule(int len, int pool, int type, int minsize, int maxsize)
+struct crush_rule *crush_make_rule(int len, int ruleset, int type, int minsize, int maxsize)
{
struct crush_rule *rule;
rule = malloc(crush_rule_size(len));
rule->len = len;
- rule->mask.pool = pool;
+ rule->mask.ruleset = ruleset;
rule->mask.type = type;
rule->mask.min_size = minsize;
rule->mask.max_size = maxsize;
/*
* The rule mask is used to describe what the rule is intended for.
- * Given a storage pool and size of output set, we search through the
+ * Given a ruleset and size of output set, we search through the
* rule list for a matching rule_mask.
*/
struct crush_rule_mask {
- __u8 pool;
+ __u8 ruleset;
__u8 type;
__u8 min_size;
__u8 max_size;
/**
- * crush_find_rule - find a crush_rule id for a given pool, type, and size.
+ * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
* @map: the crush_map
- * @pool: the storage pool id (user defined)
- * @type: storage pool type (user defined)
+ * @ruleset: the storage ruleset id (user defined)
+ * @type: storage ruleset type (user defined)
* @size: output set size
*/
-int crush_find_rule(struct crush_map *map, int pool, int type, int size)
+int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
{
int i;
for (i = 0; i < map->max_rules; i++) {
if (map->rules[i] &&
- map->rules[i]->mask.pool == pool &&
+ map->rules[i]->mask.ruleset == ruleset &&
map->rules[i]->mask.type == type &&
map->rules[i]->mask.min_size <= size &&
map->rules[i]->mask.max_size >= size)
start = 3;
}
- int pool = int_node(i->children[start]);
+ int ruleset = int_node(i->children[start]);
string tname = string_node(i->children[start+2]);
int type;
int steps = i->children.size() - start - 8;
//cout << "num steps " << steps << std::endl;
- int ruleno = crush.add_rule(steps, pool, type, minsize, maxsize, -1);
+ int ruleno = crush.add_rule(steps, ruleset, type, minsize, maxsize, -1);
if (rname.length()) {
crush.set_rule_name(ruleno, rname.c_str());
rule_id[rname] = ruleno;
if (crush.get_rule_name(i))
print_rule_name(out, i, crush);
out << " {\n";
- out << "\tpool " << crush.get_rule_mask_pool(i) << "\n";
+ out << "\truleset " << crush.get_rule_mask_ruleset(i) << "\n";
switch (crush.get_rule_mask_type(i)) {
case CEPH_PG_TYPE_REP: out << "\ttype replicated\n"; break;
case CEPH_PG_TYPE_RAID4: out << "\ttype raid4\n"; break;
lower_weights.swap(cur_weights);
}
- // make some generic rules
- for (int pool=0; pool<3; pool++) {
- crush_rule *rule = crush_make_rule(3, pool, CEPH_PG_TYPE_REP, 2, 2);
- crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
- crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_LEAF_FIRSTN, CRUSH_CHOOSE_N, 1);
- crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
- int rno = crush_add_rule(crush.crush, rule, -1);
- crush.set_rule_name(rno, get_pool_name(pool));
- }
+ // make a generic rule
+ int ruleset=1;
+ crush_rule *rule = crush_make_rule(3, ruleset, CEPH_PG_TYPE_REP, 2, 2);
+ crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
+ crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_LEAF_FIRSTN, CRUSH_CHOOSE_N, 1);
+ crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
+ int rno = crush_add_rule(crush.crush, rule, -1);
+ crush.set_rule_name(rno, "data");
crush.finalize();
dout(0) << "crush max_devices " << crush.crush->max_devices << dendl;
/* object -> pg layout */
__le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
- __u8 fl_pg_type; /* pg type; see PG_TYPE_* */
- __u8 fl_pg_size; /* pg size (num replicas, etc.) */
- __u8 fl_pg_pool; /* implies crush ruleset AND object namespace */
+ __u8 fl_pg_type;
+ __le16 fl_pg_pool; /* implies crush ruleset, rep level */
+ __le16 fl_pg_ns; /* object namespace */
} __attribute__ ((packed));
#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
struct {
__s16 preferred; /* preferred primary osd */
__u16 ps; /* placement seed */
- __u8 __pad;
- __u8 size;
- __u8 pool; /* implies crush ruleset */
+ __u16 pool; /* implies crush ruleset */
__u8 type;
+ __u8 __pad;
} pg;
} __attribute__ ((packed));
#define ceph_pg_is_rep(pg) ((pg).pg.type == CEPH_PG_TYPE_REP)
#define ceph_pg_is_raid4(pg) ((pg).pg.type == CEPH_PG_TYPE_RAID4)
+struct ceph_pg_pool { /* per-pool placement parameters, looked up by the pool id carried in a pg id */
+ __u8 crush_ruleset; /* ruleset id handed to crush_find_rule() */
+ __u8 size; /* number of osds requested from crush for each pg in the pool */
+ __u8 type; /* pg type, e.g. CEPH_PG_TYPE_REP */
+} __attribute__ ((packed));
+
/*
* stable_mod func is used to control number of placement groups.
* similar to straight-up modulo, but produces a stable mapping as b
WRITE_RAW_ENCODER(ceph_fsid_t)
WRITE_RAW_ENCODER(ceph_file_layout)
+WRITE_RAW_ENCODER(ceph_pg_pool)
WRITE_RAW_ENCODER(ceph_client_ticket)
WRITE_RAW_ENCODER(ceph_mds_request_head)
WRITE_RAW_ENCODER(ceph_mds_request_release)
unsigned pps; /* placement ps */
int osds[10], osd = -1;
int i, num;
+ struct ceph_pg_pool *pool;
- ruleno = crush_find_rule(osdc->osdmap->crush, req->r_pgid.pg.pool,
- req->r_pgid.pg.type, req->r_pgid.pg.size);
+ if (req->r_pgid.pg.pool >= osdc->osdmap->num_pools)
+ return -1;
+ pool = &osdc->osdmap->pg_pool[req->r_pgid.pg.pool];
+ ruleno = crush_find_rule(osdc->osdmap->crush, pool->crush_ruleset,
+ req->r_pgid.pg.type, pool->size);
if (ruleno < 0) {
derr(0, "map_osds no crush rule for pool %d type %d size %d\n",
- req->r_pgid.pg.pool, req->r_pgid.pg.type,
- req->r_pgid.pg.size);
+ req->r_pgid.pg.pool, req->r_pgid.pg.type, pool->size);
return -1;
}
osdc->osdmap->pgp_num,
osdc->osdmap->pgp_num_mask);
num = crush_do_rule(osdc->osdmap->crush, ruleno, pps, osds,
- min_t(int, req->r_pgid.pg.size, ARRAY_SIZE(osds)),
+ min_t(int, pool->size, ARRAY_SIZE(osds)),
req->r_pgid.pg.preferred, osdc->osdmap->osd_weight);
/* primary is first up osd */
crush_destroy(map->crush);
kfree(map->osd_state);
kfree(map->osd_weight);
+ kfree(map->pg_pool);
kfree(map->osd_addr);
kfree(map);
}
ceph_decode_32(p, map->lpg_num);
ceph_decode_32(p, map->lpgp_num);
ceph_decode_32(p, map->last_pg_change);
- ceph_decode_32(p, map->flags);
+
+ ceph_decode_32(p, map->num_pools);
+ /*
+  * kcalloc zeroes the table and checks the size multiplication.  Pool ids
+  * below num_pools that the map does not list must not be left as heap
+  * garbage: lookups only compare the id against num_pools.
+  */
+ map->pg_pool = kcalloc(map->num_pools, sizeof(*map->pg_pool),
+ GFP_NOFS);
+ if (!map->pg_pool) {
+ err = -ENOMEM;
+ goto bad;
+ }
+ /* "max" entries follow, each a u32 pool id plus a raw ceph_pg_pool */
+ ceph_decode_32_safe(p, end, max, bad);
+ while (max--) {
+ ceph_decode_need(p, end, 4+sizeof(*map->pg_pool), bad);
+ ceph_decode_32(p, i);
+ if (i >= map->num_pools)
+ goto bad;
+ ceph_decode_copy(p, &map->pg_pool[i], sizeof(*map->pg_pool));
+ }
+
+ ceph_decode_32_safe(p, end, map->flags, bad);
calc_pg_masks(map);
ceph_fsid_t fsid;
u32 epoch = 0;
struct ceph_timespec modified;
- u32 len, x;
+ u32 len, x, pool;
__s32 new_flags, max;
void *start = *p;
int err = -EINVAL;
newcrush = NULL;
}
+ /* new_pool: count, then (u32 pool id, raw ceph_pg_pool) pairs */
+ ceph_decode_32_safe(p, end, len, bad);
+ while (len--) {
+ ceph_decode_32_safe(p, end, pool, bad);
+ /* the pool payload must really be in the buffer before it is copied */
+ ceph_decode_need(p, end, sizeof(*map->pg_pool), bad);
+ if (pool >= map->num_pools) {
+ /* grow the table; named "pools" so the decode cursor p is not shadowed */
+ void *pools = kzalloc((pool+1) * sizeof(*map->pg_pool),
+ GFP_NOFS);
+ if (!pools) {
+ err = -ENOMEM;
+ goto bad;
+ }
+ memcpy(pools, map->pg_pool,
+ map->num_pools * sizeof(*map->pg_pool));
+ kfree(map->pg_pool);
+ map->pg_pool = pools;
+ map->num_pools = pool+1;
+ }
+ ceph_decode_copy(p, &map->pg_pool[pool], sizeof(*map->pg_pool));
+ }
+
+ /*
+  * old_pool (ignore): the sender encodes a set of 32-bit pool ids, so
+  * each entry is one u32 with no ceph_pg_pool payload behind it.
+  */
+ ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_need(p, end, len * sizeof(u32), bad);
+ *p += len * sizeof(u32);
+
/* new_up */
err = -EINVAL;
ceph_decode_32_safe(p, end, len, bad);
pgid.pg.ps = bno + crush_hash32_2(ino, ino>>32);
pgid.pg.preferred = preferred;
pgid.pg.type = fl->fl_pg_type;
- pgid.pg.size = fl->fl_pg_size;
pgid.pg.pool = fl->fl_pg_pool;
ol->ol_pgid = cpu_to_le64(pgid.pg64);
u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
struct ceph_entity_addr *osd_addr;
+ u32 num_pools;
+ struct ceph_pg_pool *pg_pool;
+
/* the CRUSH map specifies the mapping of placement groups to
* the list of osds that store+replicate them. */
struct crush_map *crush;
// For all PGs that have OSD 0 as the primary,
// switch them to use the first replca
ps_t numps = osdmap.get_pg_num();
- int minrep = 1;
- int maxrep = MIN(g_conf.num_osd, g_conf.osd_max_rep);
for (int pool=0; pool<1; pool++)
- for (int nrep = minrep; nrep <= maxrep; nrep++) {
- for (ps_t ps = 0; ps < numps; ++ps) {
- pg_t pgid = pg_t(pg_t::TYPE_REP, nrep, ps, pool, -1);
- vector<int> osds;
- osdmap.pg_to_osds(pgid, osds);
- if (osds[0] == 0) {
- pending_inc.new_pg_swap_primary[pgid] = osds[1];
- dout(3) << "Changing primary for PG " << pgid << " from " << osds[0] << " to "
- << osds[1] << dendl;
- do_propose = true;
- }
+ for (ps_t ps = 0; ps < numps; ++ps) {
+ pg_t pgid = pg_t(pg_t::TYPE_REP, ps, pool, -1);
+ vector<int> osds;
+ osdmap.pg_to_osds(pgid, osds);
+ // pg_to_osds may yield fewer than two osds (an unknown pool leaves the
+ // vector empty); only swap when there is a replica to promote.
+ if (osds.size() > 1 && osds[0] == 0) {
+ pending_inc.new_pg_swap_primary[pgid] = osds[1];
+ dout(3) << "Changing primary for PG " << pgid << " from " << osds[0] << " to "
+ << osds[1] << dendl;
+ do_propose = true;
}
}
}
return true;
}
}
+ else if (m->cmd[1] == "pool" && m->cmd.size() >= 5) {
+ // "pool <name> <field> <value>": resolve the pool id from its name
+ int pool = -1;
+ for (map<int,nstring>::iterator p = osdmap.pool_name.begin();
+ p != osdmap.pool_name.end();
+ p++) {
+ if (p->second == m->cmd[2])
+ pool = p->first;
+ }
+ if (pool >= 0) {
+ if (m->cmd[3] == "size") {
+ int s = atoi(m->cmd[4].c_str());
+ // ceph_pg_pool.size is a __u8: reject zero, negative and >255
+ // values instead of letting them wrap silently.
+ if (s > 0 && s <= 255) {
+ pending_inc.new_pools[pool] = osdmap.pools[pool];
+ pending_inc.new_pools[pool].size = s;
+ ss << "set pool " << pool << " size to " << s;
+ getline(ss, rs);
+ paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs));
+ return true;
+ }
+ ss << "invalid pool size '" << m->cmd[4] << "'";
+ err = -EINVAL;
+ } else {
+ ss << "unrecognized pool field '" << m->cmd[3] << "'";
+ err = -EINVAL;
+ }
+ } else {
+ ss << "unrecognized pool '" << m->cmd[2] << "'";
+ err = -ENOENT;
+ }
+ }
else {
ss << "unknown command " << m->cmd[1];
}
bool first = pg_map.pg_stat.empty(); // first pg creation
int created = 0;
- for (int ruleno=0; ruleno<crush->get_max_rules(); ruleno++) {
- if (!crush->rule_exists(ruleno))
- continue;
+ // walk the osdmap's pools; each one contributes pg_num placement groups
+ for (map<int,ceph_pg_pool>::iterator p = mon->osdmon()->osdmap.pools.begin();
+ p != mon->osdmon()->osdmap.pools.end();
+ p++) {
+ int pool = p->first;
+ int type = p->second.type;
+ // crush_ruleset is a rule-mask id, not a rule number, so it cannot be fed
+ // to rule_exists(); resolve it with find_rule() the way pg_to_osds does
+ // and skip pools that no crush rule can serve.
+ if (crush->find_rule(p->second.crush_ruleset, type, p->second.size) < 0)
+ continue;
- int pool = crush->get_rule_mask_pool(ruleno);
- int type = crush->get_rule_mask_type(ruleno);
- int min_size = crush->get_rule_mask_min_size(ruleno);
- int max_size = crush->get_rule_mask_max_size(ruleno);
- for (int size = min_size; size <= max_size; size++) {
- for (ps_t ps = 0; ps < pg_num; ps++) {
- pg_t pgid(type, size, ps, pool, -1);
- if (pg_map.pg_stat.count(pgid)) {
- dout(20) << "register_new_pgs have " << pgid << dendl;
- continue;
- }
+ for (ps_t ps = 0; ps < pg_num; ps++) {
+ pg_t pgid(type, ps, pool, -1);
+ if (pg_map.pg_stat.count(pgid)) {
+ dout(20) << "register_new_pgs have " << pgid << dendl;
+ continue;
+ }
- pg_t parent;
- int split_bits = 0;
- if (!first) {
- parent = pgid;
- while (1) {
- // remove most significant bit
- int msb = calc_bits_of(parent.u.pg.ps);
- if (!msb) break;
- parent.u.pg.ps &= ~(1<<(msb-1));
- split_bits++;
- dout(10) << " is " << pgid << " parent " << parent << " ?" << dendl;
- //if (parent.u.pg.ps < mon->osdmon->osdmap.get_pgp_num()) {
- if (pg_map.pg_stat.count(parent) &&
- pg_map.pg_stat[parent].state != PG_STATE_CREATING) {
- dout(10) << " parent is " << parent << dendl;
- break;
- }
+ pg_t parent;
+ int split_bits = 0;
+ if (!first) {
+ parent = pgid;
+ while (1) {
+ // remove most significant bit
+ int msb = calc_bits_of(parent.u.pg.ps);
+ if (!msb) break;
+ parent.u.pg.ps &= ~(1<<(msb-1));
+ split_bits++;
+ dout(10) << " is " << pgid << " parent " << parent << " ?" << dendl;
+ //if (parent.u.pg.ps < mon->osdmon->osdmap.get_pgp_num()) {
+ if (pg_map.pg_stat.count(parent) &&
+ pg_map.pg_stat[parent].state != PG_STATE_CREATING) {
+ dout(10) << " parent is " << parent << dendl;
+ break;
}
}
-
- pending_inc.pg_stat_updates[pgid].state = PG_STATE_CREATING;
- pending_inc.pg_stat_updates[pgid].created = epoch;
- pending_inc.pg_stat_updates[pgid].parent = parent;
- pending_inc.pg_stat_updates[pgid].parent_split_bits = split_bits;
- created++;
-
- if (split_bits == 0) {
- dout(10) << "register_new_pgs will create " << pgid << dendl;
- } else {
- dout(10) << "register_new_pgs will create " << pgid
- << " parent " << parent
- << " by " << split_bits << " bits"
- << dendl;
- }
-
}
+
+ pending_inc.pg_stat_updates[pgid].state = PG_STATE_CREATING;
+ pending_inc.pg_stat_updates[pgid].created = epoch;
+ pending_inc.pg_stat_updates[pgid].parent = parent;
+ pending_inc.pg_stat_updates[pgid].parent_split_bits = split_bits;
+ created++;
+
+ if (split_bits == 0) {
+ dout(10) << "register_new_pgs will create " << pgid << dendl;
+ } else {
+ dout(10) << "register_new_pgs will create " << pgid
+ << " parent " << parent
+ << " by " << split_bits << " bits"
+ << dendl;
+ }
+
}
}
dout(10) << "register_new_pgs registered " << created << " new pgs" << dendl;
pg->state_clear(PG_STATE_PEERING); // we'll need to restart peering
if (pg->is_primary() &&
- pg->info.pgid.size() != pg->acting.size())
+ osdmap->get_pg_size(pg->info.pgid) != pg->acting.size())
pg->state_set(PG_STATE_DEGRADED);
else
pg->state_clear(PG_STATE_DEGRADED);
for (vector<pobject_t>::iterator p = olist.begin(); p != olist.end(); p++) {
pobject_t poid = *p;
- ceph_object_layout l = osdmap->make_object_layout(poid.oid, parentid.type(), parentid.size(),
+ ceph_object_layout l = osdmap->make_object_layout(poid.oid, parentid.type(),
parentid.pool(), parentid.preferred());
if (le64_to_cpu(l.ol_pgid) != parentid.u.pg64) {
pg_t pgid(le64_to_cpu(l.ol_pgid));
int get_nodeid() { return whoami; }
static pobject_t get_osdmap_pobject_name(epoch_t epoch) {
- return pobject_t(OSD_METADATA_PG_POOL, 0, object_t(0, epoch << 1));
+ return pobject_t(CEPH_OSDMETADATA_NS, 0, object_t(0, epoch << 1));
}
static pobject_t get_inc_osdmap_pobject_name(epoch_t epoch) {
- return pobject_t(OSD_METADATA_PG_POOL, 0, object_t(0, (epoch << 1) + 1));
+ return pobject_t(CEPH_OSDMETADATA_NS, 0, object_t(0, (epoch << 1) + 1));
}
<< "lpgp_num " << get_lpgp_num() << "\n"
<< "last_pg_change " << get_last_pg_change() << "\n"
<< std::endl;
+ for (map<int,ceph_pg_pool>::iterator p = pools.begin(); p != pools.end(); p++)
+ out << "pg_pool " << p->first
+ << " '" << pool_name[p->first]
+ << "' size " << (int)p->second.size
+ << " crush_ruleset " << (int)p->second.crush_ruleset
+ << "\n";
+ out << std::endl;
+
out << "max_osd " << get_max_osd() << "\n";
for (int i=0; i<get_max_osd(); i++) {
if (exists(i)) {
lpg_num = lpgp_num = lpg_bits ? (1 << (lpg_bits-1)) : 0;
// crush map
- build_simple_crush_map(crush, num_osd, num_dom);
+ map<int, const char*> rulesets;
+ rulesets[CEPH_DATA_RULE] = "data";
+ rulesets[CEPH_METADATA_RULE] = "metadata";
+ rulesets[CEPH_CASDATA_RULE] = "casdata";
+
+ int pool = 0;
+ for (map<int,const char*>::iterator p = rulesets.begin(); p != rulesets.end(); p++) {
+ pools[pool].size = 2;
+ pools[pool].crush_ruleset = p->first;
+ pools[pool].type = CEPH_PG_TYPE_REP;
+ pool_name[pool] = p->second;
+ pool++;
+ }
+
+ build_simple_crush_map(crush, rulesets, num_osd, num_dom);
for (int i=0; i<num_osd; i++) {
set_state(i, CEPH_OSD_EXISTS);
}
}
-void OSDMap::build_simple_crush_map(CrushWrapper& crush, int num_osd,
+void OSDMap::build_simple_crush_map(CrushWrapper& crush, map<int, const char*>& rulesets, int num_osd,
int num_dom)
{
// new
crush.set_type_name(1, "domain");
crush.set_type_name(2, "pool");
- int npools = 3;
-
int minrep = g_conf.osd_min_rep;
int ndom = num_dom;
if (!ndom)
crush.set_item_name(rootid, "root");
// rules
- // replication
- for (int pool=0; pool<npools; pool++) {
- // size minrep..ndom
- crush_rule *rule = crush_make_rule(3, pool, CEPH_PG_TYPE_REP, minrep, g_conf.osd_max_rep);
+ for (map<int,const char*>::iterator p = rulesets.begin(); p != rulesets.end(); p++) {
+ int ruleset = p->first;
+ crush_rule *rule = crush_make_rule(3, ruleset, CEPH_PG_TYPE_REP, minrep, g_conf.osd_max_rep);
crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_LEAF_FIRSTN, CRUSH_CHOOSE_N, 1); // choose N domains
crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
int rno = crush_add_rule(crush.crush, rule, -1);
- crush.set_rule_name(rno, get_pool_name(pool));
+ crush.set_rule_name(rno, p->second);
}
-
- // raid
- if (false && g_conf.osd_min_raid_width <= g_conf.osd_max_raid_width)
- for (int pool=0; pool<npools; pool++) {
- crush_rule *rule = crush_make_rule(3, pool, CEPH_PG_TYPE_RAID4, g_conf.osd_min_raid_width, g_conf.osd_max_raid_width);
- crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
- crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_LEAF_INDEP, CRUSH_CHOOSE_N, 1);
- crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
- crush_add_rule(crush.crush, rule, -1);
- }
} else {
// one bucket
crush.set_item_name(rootid, "root");
// replication
- for (int pool=0; pool<npools; pool++) {
- crush_rule *rule = crush_make_rule(3, pool, CEPH_PG_TYPE_REP, g_conf.osd_min_rep, g_conf.osd_max_rep);
+ for (map<int,const char*>::iterator p = rulesets.begin(); p != rulesets.end(); p++) {
+ int ruleset = p->first;
+ crush_rule *rule = crush_make_rule(3, ruleset, CEPH_PG_TYPE_REP, g_conf.osd_min_rep, g_conf.osd_max_rep);
crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_FIRSTN, CRUSH_CHOOSE_N, 0);
crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
int rno = crush_add_rule(crush.crush, rule, -1);
- crush.set_rule_name(rno, get_pool_name(pool));
+ crush.set_rule_name(rno, p->second);
}
- // raid4
- if (false && g_conf.osd_min_raid_width <= g_conf.osd_max_raid_width)
- for (int pool=0; pool<npools; pool++) {
- crush_rule *rule = crush_make_rule(3, pool, CEPH_PG_TYPE_RAID4, g_conf.osd_min_raid_width, g_conf.osd_max_raid_width);
- crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
- crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_INDEP, CRUSH_CHOOSE_N, 0);
- crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
- crush_add_rule(crush.crush, rule, -1);
- }
}
crush.finalize();
// incremental
int32_t new_max_osd;
int32_t new_pg_num, new_pgp_num, new_lpg_num, new_lpgp_num;
+ map<int32_t,ceph_pg_pool> new_pools;
+ map<int32_t,nstring> new_pool_names;
+ set<int32_t> old_pools;
map<int32_t,entity_addr_t> new_up;
map<int32_t,uint8_t> new_down;
map<int32_t,uint32_t> new_weight;
::encode(new_pgp_num, bl);
::encode(new_lpg_num, bl);
::encode(new_lpgp_num, bl);
+ ::encode(new_pools, bl);
+ ::encode(old_pools, bl);
::encode(new_up, bl);
::encode(new_down, bl);
::encode(new_weight, bl);
// extended
+ ::encode(new_pool_names, bl);
::encode(new_up_thru, bl);
::encode(new_last_clean_interval, bl);
::encode(new_lost, bl);
::decode(new_pgp_num, p);
::decode(new_lpg_num, p);
::decode(new_lpgp_num, p);
+ ::decode(new_pools, p);
+ ::decode(old_pools, p);
::decode(new_up, p);
::decode(new_down, p);
::decode(new_weight, p);
// extended
+ ::decode(new_pool_names, p);
::decode(new_up_thru, p);
::decode(new_last_clean_interval, p);
::decode(new_lost, p);
vector<__u32> osd_weight; // 16.16 fixed point, 0x10000 = "in", 0 = "out"
vector<osd_info_t> osd_info;
+ map<int,ceph_pg_pool> pools;
+ map<int,nstring> pool_name;
map<pg_t,uint32_t> pg_swap_primary; // force new osd to be pg primary (if already a member)
snapid_t max_snap;
interval_set<snapid_t> removed_snaps;
CrushWrapper crush; // hierarchical map
friend class OSDMonitor;
+ friend class PGMonitor;
friend class MDS;
public:
if (inc.new_max_osd >= 0)
set_max_osd(inc.new_max_osd);
+ for (set<int32_t>::iterator p = inc.old_pools.begin();
+ p != inc.old_pools.end();
+ p++) {
+ pools.erase(*p);
+ pool_name.erase(*p);
+ }
+ for (map<int32_t,ceph_pg_pool>::iterator p = inc.new_pools.begin();
+ p != inc.new_pools.end();
+ p++)
+ pools[p->first] = p->second;
+ for (map<int32_t,nstring>::iterator p = inc.new_pool_names.begin();
+ p != inc.new_pool_names.end();
+ p++)
+ pool_name[p->first] = p->second;
+
for (map<int32_t,uint32_t>::iterator i = inc.new_weight.begin();
i != inc.new_weight.end();
i++)
::encode(lpg_num, blist);
::encode(lpgp_num, blist);
::encode(last_pg_change, blist);
+
+ int32_t max_pools = 0;
+ if (pools.size())
+ max_pools = pools.rbegin()->first + 1;
+ ::encode(max_pools, blist);
+ ::encode(pools, blist);
+
::encode(flags, blist);
::encode(max_osd, blist);
// extended
::encode(osd_info, blist);
+ ::encode(pool_name, blist);
::encode(pg_swap_primary, blist);
::encode(max_snap, blist);
::decode(lpgp_num, p);
calc_pg_masks();
::decode(last_pg_change, p);
+
+ int32_t max_pools;
+ ::decode(max_pools, p);
+ ::decode(pools, p);
+
::decode(flags, p);
::decode(max_osd, p);
// extended
::decode(osd_info, p);
+ ::decode(pool_name, p);
::decode(pg_swap_primary, p);
::decode(max_snap, p);
// oid -> pg
ceph_object_layout file_to_object_layout(object_t oid, ceph_file_layout& layout) {
- return make_object_layout(oid, layout.fl_pg_type, layout.fl_pg_size,
+ return make_object_layout(oid, layout.fl_pg_type,
layout.fl_pg_pool,
ceph_file_layout_pg_preferred(layout),
ceph_file_layout_object_su(layout));
}
- ceph_object_layout make_object_layout(object_t oid, int pg_type, int pg_size, int pg_pool, int preferred=-1, int object_stripe_unit = 0) {
+ ceph_object_layout make_object_layout(object_t oid, int pg_type, int pg_pool, int preferred=-1, int object_stripe_unit = 0) {
// calculate ps (placement seed)
ps_t ps; // NOTE: keep full precision, here!
switch (g_conf.osd_object_layout) {
//cout << "preferred " << preferred << " num " << num << " mask " << num_mask << " ps " << ps << endl;
// construct object layout
- pg_t pgid = pg_t(pg_type, pg_size, ps, pg_pool, preferred);
+ pg_t pgid = pg_t(pg_type, ps, pg_pool, preferred);
ceph_object_layout layout;
layout.ol_pgid = pgid.u.pg64;
layout.ol_stripe_unit = object_stripe_unit;
// pg -> (osd list)
int pg_to_osds(pg_t pg, vector<int>& osds) {
// map to osds[]
-
+ int p = pg.pool();
+ if (!pools.count(p)) {
+ return osds.size();
+ }
+ ceph_pg_pool &pool = pools[p];
ps_t pps = raw_pg_to_pps(pg); // placement ps
switch (g_conf.osd_pg_layout) {
case CEPH_PG_LAYOUT_CRUSH:
{
// what crush rule?
- int ruleno = crush.find_rule(pg.pool(), pg.type(), pg.size());
+ int ruleno = crush.find_rule(pool.crush_ruleset, pg.type(), pool.size);
if (ruleno >= 0)
- crush.do_rule(ruleno, pps, osds, pg.size(), pg.preferred(), osd_weight);
+ crush.do_rule(ruleno, pps, osds, pool.size, pg.preferred(), osd_weight);
}
break;
case CEPH_PG_LAYOUT_LINEAR:
- for (unsigned i=0; i<pg.size(); i++)
- osds.push_back( (i + pps*pg.size()) % g_conf.num_osd );
+ for (unsigned i=0; i<pool.size; i++)
+ osds.push_back( (i + pps*pool.size) % g_conf.num_osd );
break;
case CEPH_PG_LAYOUT_HYBRID:
{
int h = crush_hash32(pps);
- for (unsigned i=0; i<pg.size(); i++)
+ for (unsigned i=0; i<pool.size; i++)
osds.push_back( (h+i) % g_conf.num_osd );
}
break;
case CEPH_PG_LAYOUT_HASH:
{
- for (unsigned i=0; i<pg.size(); i++) {
+ for (unsigned i=0; i<pool.size; i++) {
int t = 1;
int osd = 0;
while (t++) {
if (osds.empty()) {
osds.push_back(osd);
} else {
- assert(pg.size() > 0);
- for (unsigned i=1; i<pg.size(); i++)
+ assert(pool.size > 0);
+ for (unsigned i=1; i<pool.size; i++)
if (osds[i] == osd) {
// swap with position 0
osds[i] = osds[0];
+ // Replica count configured for the pool that pg belongs to; 0 if the pool
+ // is not in the map.
+ unsigned get_pg_size(pg_t pg) {
+ // find() rather than operator[]: the latter would insert a phantom pool
+ // entry that pools.count() checks and encode() would then treat as real.
+ map<int,ceph_pg_pool>::iterator p = pools.find(pg.pool());
+ if (p == pools.end())
+ return 0;
+ return p->second.size;
+ }
+
// pg -> primary osd
int get_pg_primary(pg_t pg) {
vector<int> group;
int num_osd, int num_dom,
int pg_bits, int lpg_bits,
int mds_local_osd);
- static void build_simple_crush_map(CrushWrapper& crush, int num_osd, int num_dom=0);
+ // rulesets maps a crush ruleset id to the name given to the rule built for it
+ static void build_simple_crush_map(CrushWrapper& crush, map<int, const char*>& rulesets, int num_osd, int num_dom=0);
void print(ostream& out);
state_clear(PG_STATE_REPLAY);
}
if (is_primary() &&
- info.pgid.size() != acting.size())
+ osd->osdmap->get_pg_size(info.pgid) != acting.size())
state_set(PG_STATE_DEGRADED);
else
state_clear(PG_STATE_DEGRADED);
pg_stats_stable.state = state;
pg_stats_stable.acting = acting;
- pg_stats_stable.num_object_copies = pg_stats_stable.num_objects * info.pgid.size();
+ pg_stats_stable.num_object_copies = pg_stats_stable.num_objects * osd->osdmap->get_pg_size(info.pgid);
if (!is_clean() && is_active()) {
pg_stats_stable.num_objects_missing_on_primary = missing.num_missing();
int degraded = missing.num_missing();
bool is_empty() const { return info.last_update == eversion_t(0,0); }
- bool is_complete_pg() { return acting.size() == info.pgid.size(); }
+ //bool is_complete_pg() { return acting.size() == info.pgid.size(); }
void add_log_entry(Log::Entry& e, bufferlist& log_bl);
wr->snapset = repop->pinfo->oi.snapset;
wr->snapc = repop->snapc;
wr->get_data() = repop->op->get_data(); // _copy_ bufferlist
- if (is_complete_pg())
+ if (osd->osdmap->get_pg_size(info.pgid) == acting.size())
wr->pg_trim_to = peers_complete_thru;
wr->peer_stat = osd->get_my_stat_for(now, dest);
osd->messenger->send_message(wr, osd->osdmap->get_inst(dest));
-#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v010"
+#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v011"
typedef uint16_t ps_t;
-#define OSD_METADATA_PG_POOL 0xff
-#define OSD_SUPERBLOCK_POBJECT pobject_t(OSD_METADATA_PG_POOL, 0, object_t(0,0))
+// object namespaces
+#define CEPH_METADATA_NS 1
+#define CEPH_DATA_NS 2
+#define CEPH_CAS_NS 3
+#define CEPH_OSDMETADATA_NS 0xff
+
+// crush rulesets
+enum {
+ CEPH_DATA_RULE,
+ CEPH_METADATA_RULE,
+ CEPH_CASDATA_RULE,
+};
+
+//#define CEPH_POOL(poolset, size) (((poolset) << 8) + (size))
+
+#define OSD_SUPERBLOCK_POBJECT pobject_t(CEPH_OSDMETADATA_NS, 0, object_t(0,0))
// placement group id
struct pg_t {
public:
pg_t() { u.pg64 = 0; }
pg_t(const pg_t& o) { u.pg64 = o.u.pg64; }
- pg_t(int type, int size, ps_t seed, int pool, int pref) {
+ pg_t(int type, ps_t seed, int pool, int pref) {
u.pg64 = 0;
u.pg.type = type;
- u.pg.size = size;
u.pg.ps = seed;
u.pg.pool = pool;
u.pg.preferred = pref; // hack: avoid negative.
bool is_rep() { return type() == TYPE_REP; }
bool is_raid4() { return type() == TYPE_RAID4; }
- unsigned size() { return u.pg.size; }
ps_t ps() { return u.pg.ps; }
int pool() { return u.pg.pool; }
int preferred() { return u.pg.preferred; } // hack: avoid negative.
operator uint64_t() const { return u.pg64; }
pobject_t to_log_pobject() const {
- return pobject_t(OSD_METADATA_PG_POOL, // osd metadata
+ return pobject_t(CEPH_OSDMETADATA_NS,
0,
object_t(u.pg64, 0));
}
}
bool parse(const char *s) {
- int numrep;
int pool;
int ps;
- int r = sscanf(s, "%dx%d.%x", &numrep, &pool, &ps);
- if (r < 3)
+ // The pg id is now written as "<pool>.<hex ps>": two conversions, so
+ // sscanf returns at most 2 and the check must not expect 3.
+ int r = sscanf(s, "%d.%x", &pool, &ps);
+ if (r < 2)
return false;
u.pg.type = TYPE_REP;
u.pg.pool = pool;
- u.pg.size = numrep;
u.pg.ps = ps;
u.pg.preferred = -1;
return true;
inline ostream& operator<<(ostream& out, pg_t pg)
{
- if (pg.is_rep())
- out << pg.size() << 'x';
- else if (pg.is_raid4())
- out << pg.size() << 'r';
- else
- out << pg.size() << '?';
out << pg.pool() << '.';
out << hex << pg.ps() << dec;