/- async xattr
/- dcache readdir
/- confutils memory leaks
+/- osd re-up attempt when marked down
v0.9
- make mds exert memory pressure on client caps, leases
-- osd re-up attempt when marked down
- optionally separate osd interfaces (ips) for clients and osds (replication, peering, etc.)
later
kernel client
+- fix symbol names for osdmap.h (possibly others?)
+- osd client needs to recalculate layout if osdmap changes (pg_num etc may change)
- fix up mds selection, and ESTALE handling
- make cap import/export efficient
- simplify mds auth tracking?
#define CEPH_OSD_PROTOCOL 5 /* cluster internal */
#define CEPH_MDS_PROTOCOL 9 /* cluster internal */
#define CEPH_MON_PROTOCOL 4 /* cluster internal */
-#define CEPH_OSDC_PROTOCOL 6 /* public/client */
+#define CEPH_OSDC_PROTOCOL 7 /* public/client */
#define CEPH_MDSC_PROTOCOL 18 /* public/client */
#define CEPH_MONC_PROTOCOL 11 /* public/client */
#define ceph_pg_is_rep(pg) ((pg).pg.type == CEPH_PG_TYPE_REP)
#define ceph_pg_is_raid4(pg) ((pg).pg.type == CEPH_PG_TYPE_RAID4)
+/*
+ * pg_pool is a set of pgs storing a pool of objects
+ *
+ * pg_num -- base number of pseudorandomly placed pgs
+ *
+ * pgp_num -- effective number when calculating pg placement. this
+ * is used for pg_num increases: raising pg_num splits existing data
+ * into new pgs. for this to proceed smoothly, new pgs are initially
+ * colocated with their parents; that is, pgp_num doesn't increase
+ * until the new pgs have successfully split. only _then_ are the new
+ * pgs placed independently.
+ *
+ * lpg_num -- localized pg count (per device). replicas are randomly
+ * selected.
+ *
+ * lpgp_num -- as above.
+ */
struct ceph_pg_pool {
- __u8 crush_ruleset;
- __u8 size;
__u8 type;
+ __u8 size;
+ __u8 crush_ruleset;
+ __u32 pg_num, pgp_num;
+ __u32 lpg_num, lpgp_num;
+ __u32 last_change; /* most recent epoch changed */
} __attribute__ ((packed));
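
The comment above compresses the whole splitting story into a few lines. Below is a small standalone illustration (not part of the patch) of how pg_num and pgp_num interact while a split is in flight; stable_mod() is a local stand-in with the same semantics as ceph_stable_mod(), and the pool sizes are made-up sample values.

#include <stdio.h>

/* fold x into one of b bins; bmask is the containing power of 2, minus 1 */
static int stable_mod(int x, int b, int bmask)
{
	if ((x & bmask) < b)
		return x & bmask;
	return x & (bmask >> 1);
}

int main(void)
{
	int pg_num = 12, pg_num_mask = 15;   /* raised from 8 */
	int pgp_num = 8, pgp_num_mask = 7;   /* not raised yet */
	int raw_ps = 10;                     /* object's raw placement seed */

	/* which pg stores the object: one of the 12 pgs */
	int pg = stable_mod(raw_ps, pg_num, pg_num_mask);   /* -> 10 */
	/* where that pg is placed: still computed over 8 bins */
	int pps = stable_mod(pg, pgp_num, pgp_num_mask);    /* -> 2 */
	/* the pg it split from, back when pg_num was 8 */
	int parent = stable_mod(pg, 8, 7);                  /* -> 2 */

	printf("pg %d placed at pps %d, colocated with parent pg %d\n",
	       pg, pps, parent);

	/* once the split completes, pgp_num is raised to match pg_num
	 * and the new pg is placed independently */
	pgp_num = 12; pgp_num_mask = 15;
	printf("after pgp_num bump: pps %d\n",
	       stable_mod(pg, pgp_num, pgp_num_mask));      /* -> 10 */
	return 0;
}

The point: until pgp_num catches up with pg_num, a freshly split pg computes the same placement seed as its parent, so no data actually moves yet.
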
/*
if (client->osdc.osdmap == NULL)
return 0;
seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
- seq_printf(s, "pg_num %d / %d\n",
- client->osdc.osdmap->pg_num,
- client->osdc.osdmap->pg_num_mask);
- seq_printf(s, "lpg_num %d / %d\n",
- client->osdc.osdmap->lpg_num,
- client->osdc.osdmap->lpg_num_mask);
seq_printf(s, "flags%s%s\n",
- (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
- " NEARFULL":"",
- (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
- " FULL":"");
+ (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
+ " NEARFULL":"",
+ (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
+ " FULL":"");
+ for (i = 0; i < client->osdc.osdmap->num_pools; i++) {
+ struct ceph_pg_pool_info *pool =
+ &client->osdc.osdmap->pg_pool[i];
+ seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d",
+ pool->v.pg_num, pool->pg_num_mask,
+ pool->v.lpg_num, pool->lpg_num_mask);
+ }
for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
struct ceph_entity_addr *addr =
&client->osdc.osdmap->osd_addr[i];
* request accordingly. shorten extent as necessary if it crosses an
* object boundary.
*/
-static void calc_layout(struct ceph_osd_client *osdc,
- struct ceph_vino vino, struct ceph_file_layout *layout,
- u64 off, u64 *plen,
- struct ceph_osd_request *req)
+static int calc_layout(struct ceph_osd_client *osdc,
+ struct ceph_vino vino, struct ceph_file_layout *layout,
+ u64 off, u64 *plen,
+ struct ceph_osd_request *req)
{
struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
struct ceph_osd_op *op = (void *)(reqhead + 1);
u64 orig_len = *plen;
u64 objoff, objlen; /* extent in object */
+ int err;
/* object extent? */
reqhead->oid.ino = cpu_to_le64(vino.ino);
req->r_num_pages = calc_pages_for(off, *plen);
/* pgid? */
- calc_object_layout(&reqhead->layout, &reqhead->oid, layout,
- osdc->osdmap);
+ err = calc_object_layout(&reqhead->layout, &reqhead->oid, layout,
+ osdc->osdmap);
dout(10, "calc_layout %llx.%08x %llu~%llu pgid %llx (%d pages)\n",
le64_to_cpu(reqhead->oid.ino), le32_to_cpu(reqhead->oid.bno),
objoff, objlen, le64_to_cpu(reqhead->layout.ol_pgid),
req->r_num_pages);
+ return err;
}
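
The doc comment above this function mentions shortening the extent when it crosses an object boundary. Here is a simplified standalone sketch (not part of the patch) of that clipping, with striping ignored and objects treated as plain object_size-byte chunks; the real objoff/objlen come from the file-layout mapping code elided from this hunk.

#include <stdio.h>

int main(void)
{
	unsigned long long object_size = 4194304;      /* 4 MB objects */
	unsigned long long off = 4193304, len = 3000;  /* crosses a boundary */
	unsigned long long objoff = off % object_size;
	unsigned long long objlen = len;

	if (objoff + objlen > object_size)
		objlen = object_size - objoff;          /* clip to 1000 */

	printf("bno %llu objoff %llu objlen %llu (of %llu requested)\n",
	       off / object_size, objoff, objlen, len);
	return 0;
}
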
int do_trunc = truncate_seq && (off + *plen > truncate_size);
int num_op = 1 + do_sync + do_trunc;
size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
- int i;
+ int i, err;
u64 prevofs;
/* we may overallocate here, if our write extent is shortened below */
req->r_snapc = ceph_get_snap_context(snapc);
/* calculate max write size, pgid */
- calc_layout(osdc, vino, layout, off, plen, req);
+ err = calc_layout(osdc, vino, layout, off, plen, req);
+ if (err < 0) {
+ ceph_msg_put(msg);
+ kfree(req);
+ return ERR_PTR(err);
+ }
req->r_pgid.pg64 = le64_to_cpu(head->layout.ol_pgid);
if (flags & CEPH_OSD_FLAG_MODIFY) {
unsigned pps; /* placement ps */
int osds[10], osd = -1;
int i, num;
- struct ceph_pg_pool *pool;
+ struct ceph_pg_pool_info *pool;
if (req->r_pgid.pg.pool >= osdc->osdmap->num_pools)
return -1;
pool = &osdc->osdmap->pg_pool[req->r_pgid.pg.pool];
- ruleno = crush_find_rule(osdc->osdmap->crush, pool->crush_ruleset,
- req->r_pgid.pg.type, pool->size);
+ ruleno = crush_find_rule(osdc->osdmap->crush, pool->v.crush_ruleset,
+ req->r_pgid.pg.type, pool->v.size);
if (ruleno < 0) {
derr(0, "map_osds no crush rule for pool %d type %d size %d\n",
- req->r_pgid.pg.pool, req->r_pgid.pg.type, pool->size);
+ req->r_pgid.pg.pool, req->r_pgid.pg.type, pool->v.size);
return -1;
}
if (req->r_pgid.pg.preferred >= 0)
pps = ceph_stable_mod(req->r_pgid.pg.ps,
- osdc->osdmap->lpgp_num,
- osdc->osdmap->lpgp_num_mask);
+ pool->v.lpgp_num, pool->lpgp_num_mask);
else
pps = ceph_stable_mod(req->r_pgid.pg.ps,
- osdc->osdmap->pgp_num,
- osdc->osdmap->pgp_num_mask);
+ pool->v.pgp_num, pool->pgp_num_mask);
num = crush_do_rule(osdc->osdmap->crush, ruleno, pps, osds,
- min_t(int, pool->size, ARRAY_SIZE(osds)),
+ min_t(int, pool->v.size, ARRAY_SIZE(osds)),
req->r_pgid.pg.preferred, osdc->osdmap->osd_weight);
/* primary is first up osd */
/*
* foo_mask is 2^n-1, where 2^n is the smallest power of two >= foo.
*/
-static void calc_pg_masks(struct ceph_osdmap *map)
+static void calc_pg_masks(struct ceph_pg_pool_info *pi)
{
- map->pg_num_mask = (1 << calc_bits_of(map->pg_num-1)) - 1;
- map->pgp_num_mask = (1 << calc_bits_of(map->pgp_num-1)) - 1;
- map->lpg_num_mask = (1 << calc_bits_of(map->lpg_num-1)) - 1;
- map->lpgp_num_mask = (1 << calc_bits_of(map->lpgp_num-1)) - 1;
+ pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
+ pi->pgp_num_mask =
+ (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
+ pi->lpg_num_mask =
+ (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
+ pi->lpgp_num_mask =
+ (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
}
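
For reference, a tiny standalone sketch (not part of the patch) of the values calc_pg_masks() caches; calc_bits_of() is the same helper this patch moves into the pool.

#include <stdio.h>

static int calc_bits_of(int t)
{
	int b = 0;
	while (t > 0) {
		t >>= 1;
		b++;
	}
	return b;
}

int main(void)
{
	int samples[] = { 1, 8, 12, 16, 123 };
	int i;

	for (i = 0; i < 5; i++) {
		int n = samples[i];
		int mask = (1 << calc_bits_of(n - 1)) - 1;
		/* e.g. pg_num 12 -> mask 15, pg_num 16 -> mask 15,
		 * pg_num 123 -> mask 127 */
		printf("pg_num %3d -> mask %3d\n", n, mask);
	}
	return 0;
}
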
/*
if (map == NULL)
return ERR_PTR(-ENOMEM);
- ceph_decode_need(p, end, 2*sizeof(u64)+11*sizeof(u32), bad);
+ ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
ceph_decode_64_le(p, major);
__ceph_fsid_set_major(&map->fsid, major);
ceph_decode_64_le(p, minor);
ceph_decode_32_le(p, map->created.tv_nsec);
ceph_decode_32_le(p, map->modified.tv_sec);
ceph_decode_32_le(p, map->modified.tv_nsec);
- ceph_decode_32(p, map->pg_num);
- ceph_decode_32(p, map->pgp_num);
- ceph_decode_32(p, map->lpg_num);
- ceph_decode_32(p, map->lpgp_num);
- ceph_decode_32(p, map->last_pg_change);
ceph_decode_32(p, map->num_pools);
map->pg_pool = kmalloc(map->num_pools * sizeof(*map->pg_pool),
}
ceph_decode_32_safe(p, end, max, bad);
while (max--) {
- ceph_decode_need(p, end, 4+sizeof(*map->pg_pool), bad);
+ ceph_decode_need(p, end, 4+sizeof(map->pg_pool->v), bad);
ceph_decode_32(p, i);
if (i >= map->num_pools)
goto bad;
- ceph_decode_copy(p, &map->pg_pool[i], sizeof(*map->pg_pool));
+ ceph_decode_copy(p, &map->pg_pool[i].v,
+ sizeof(map->pg_pool->v));
+ calc_pg_masks(&map->pg_pool[i]);
}
ceph_decode_32_safe(p, end, map->flags, bad);
- calc_pg_masks(map);
-
ceph_decode_32(p, max);
/* (re)alloc osd arrays */
ceph_fsid_t fsid;
u32 epoch = 0;
struct ceph_timespec modified;
- u32 len, x, pool;
+ u32 len, pool;
__s32 new_flags, max;
void *start = *p;
int err = -EINVAL;
if (err < 0)
goto bad;
}
- ceph_decode_32(p, x);
- if (x)
- map->pg_num = x;
- ceph_decode_32(p, x);
- if (x)
- map->pgp_num = x;
- ceph_decode_32(p, x);
- if (x)
- map->lpg_num = x;
- ceph_decode_32(p, x);
- if (x)
- map->lpgp_num = x;
map->epoch++;
map->modified = modified;
map->pg_pool = p;
map->num_pools = pool+1;
}
- ceph_decode_copy(p, &map->pg_pool[pool], sizeof(*map->pg_pool));
+ ceph_decode_copy(p, &map->pg_pool[pool].v,
+ sizeof(map->pg_pool->v));
+ calc_pg_masks(&map->pg_pool[pool]);
}
/* old_pool (ignore) */
ceph_decode_32_safe(p, end, len, bad);
- *p += len * (sizeof(u32) + sizeof(*map->pg_pool));
+ *p += len * sizeof(u32);
/* new_up */
err = -EINVAL;
* calculate an object layout (i.e. pgid) from an oid,
* file_layout, and osdmap
*/
-void calc_object_layout(struct ceph_object_layout *ol,
- struct ceph_object *oid,
- struct ceph_file_layout *fl,
- struct ceph_osdmap *osdmap)
+int calc_object_layout(struct ceph_object_layout *ol,
+ struct ceph_object *oid,
+ struct ceph_file_layout *fl,
+ struct ceph_osdmap *osdmap)
{
unsigned num, num_mask;
union ceph_pg pgid;
u64 ino = le64_to_cpu(oid->ino);
unsigned bno = le32_to_cpu(oid->bno);
s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
+ int poolid = le16_to_cpu(fl->fl_pg_pool);
+ struct ceph_pg_pool_info *pool;
+
+ if (poolid >= osdmap->num_pools)
+ return -EIO;
+ pool = &osdmap->pg_pool[poolid];
if (preferred >= 0) {
- num = osdmap->lpg_num;
- num_mask = osdmap->lpg_num_mask;
+ num = le32_to_cpu(pool->v.lpg_num);
+ num_mask = pool->lpg_num_mask;
} else {
- num = osdmap->pg_num;
- num_mask = osdmap->pg_num_mask;
+ num = le32_to_cpu(pool->v.pg_num);
+ num_mask = pool->pg_num_mask;
}
pgid.pg64 = 0; /* start with it zeroed out */
ol->ol_pgid = cpu_to_le64(pgid.pg64);
ol->ol_stripe_unit = fl->fl_object_stripe_unit;
+
+ return 0;
}
* The map can be updated either via an incremental map (diff) describing
* the change between two successive epochs, or as a fully encoded map.
*/
+struct ceph_pg_pool_info {
+ struct ceph_pg_pool v;
+ int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
+};
+
struct ceph_osdmap {
ceph_fsid_t fsid;
u32 epoch;
u32 mkfs_epoch;
struct ceph_timespec created, modified;
- /* these parameters describe the number of placement groups
- * in the system. foo_mask is the smallest value (2**n-1) >= foo. */
- u32 pg_num, pg_num_mask;
- u32 pgp_num, pgp_num_mask;
- u32 lpg_num, lpg_num_mask;
- u32 lpgp_num, lpgp_num_mask;
- u32 last_pg_change; /* epoch of last pg count change */
-
u32 flags; /* CEPH_OSDMAP_* */
u32 max_osd; /* size of osd_state, _offload, _addr arrays */
struct ceph_entity_addr *osd_addr;
u32 num_pools;
- struct ceph_pg_pool *pg_pool;
+ struct ceph_pg_pool_info *pg_pool;
/* the CRUSH map specifies the mapping of placement groups to
* the list of osds that store+replicate them. */
u64 *oxoff, u64 *oxlen);
/* calculate mapping of object to a placement group */
-extern void calc_object_layout(struct ceph_object_layout *ol,
- struct ceph_object *oid,
- struct ceph_file_layout *fl,
- struct ceph_osdmap *osdmap);
+extern int calc_object_layout(struct ceph_object_layout *ol,
+ struct ceph_object *oid,
+ struct ceph_file_layout *fl,
+ struct ceph_osdmap *osdmap);
#endif
OSDMap newmap;
newmap.decode(bl);
- newmap.epoch = 1;
+ newmap.set_epoch(1);
newmap.set_fsid(mon->monmap->fsid);
newmap.created = newmap.modified = g_clock.now();
// ---------------
#define SWAP_PRIMARIES_AT_START 0
#define SWAP_TIME 1
+#if 0
if (SWAP_PRIMARIES_AT_START) {
// For all PGs that have OSD 0 as the primary,
// switch them to use the first replica
}
}
}
+#endif
// ---------------
if (do_propose)
epoch_t e = atoi(m->cmd[2].c_str());
if (ceph_fsid_compare(&map.fsid, &mon->monmap->fsid) == 0) {
if (pending_inc.epoch == e) {
- map.epoch = pending_inc.epoch; // make sure epoch is correct
+ map.set_epoch(pending_inc.epoch); // make sure epoch is correct
map.encode(pending_inc.fullmap);
string rs = "set osd map";
paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs));
paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs));
return true;
}
- else if (m->cmd[1] == "setpgnum" && m->cmd.size() > 2) {
- int n = atoi(m->cmd[2].c_str());
- if (n <= osdmap.get_pg_num()) {
- ss << "specified pg_num " << n << " <= current " << osdmap.get_pg_num();
- } else if (!mon->pgmon()->pg_map.creating_pgs.empty()) {
- ss << "currently creating pgs, wait";
- err = -EAGAIN;
- } else {
- ss << "set new pg_num = " << n;
- pending_inc.new_pg_num = n;
- getline(ss, rs);
- paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs));
- return true;
- }
- }
- else if (m->cmd[1] == "setpgpnum" && m->cmd.size() > 2) {
- int n = atoi(m->cmd[2].c_str());
- if (n <= osdmap.get_pgp_num()) {
- ss << "specified pgp_num " << n << " <= current " << osdmap.get_pgp_num();
- } else if (n > osdmap.get_pg_num()) {
- ss << "specified pgp_num " << n << " > pg_num " << osdmap.get_pg_num();
- } else if (!mon->pgmon()->pg_map.creating_pgs.empty()) {
- ss << "still creating pgs, wait";
- err = -EAGAIN;
- } else {
- ss << "set new pgp_num = " << n;
- pending_inc.new_pgp_num = n;
- getline(ss, rs);
- paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs));
- return true;
- }
- }
else if (m->cmd[1] == "down" && m->cmd.size() == 3) {
long osd = strtol(m->cmd[2].c_str(), 0, 10);
if (osdmap.is_down(osd)) {
}
else if (m->cmd[1] == "pool" && m->cmd.size() >= 5) {
int pool = -1;
- for (map<int,nstring>::iterator p = osdmap.pool_name.begin();
- p != osdmap.pool_name.end();
- p++) {
- if (p->second == m->cmd[2])
- pool = p->first;
+ pg_pool_t *p = 0;
+ for (map<int,nstring>::iterator i = osdmap.pool_name.begin();
+ i != osdmap.pool_name.end();
+ i++) {
+ if (i->second == m->cmd[2]) {
+ pool = i->first;
+ p = &osdmap.pools[pool];
+ }
}
if (pool >= 0) {
- if (m->cmd[3] == "size") {
- int s = atoi(m->cmd[4].c_str());
- if (s) {
- pending_inc.new_pools[pool] = osdmap.pools[pool];
- pending_inc.new_pools[pool].size = s;
- ss << "set pool " << pool << " size to " << s;
+ int n = atoi(m->cmd[4].c_str());
+ if (n) {
+ if (m->cmd[3] == "size") {
+ pending_inc.new_pools[pool] = *p;
+ pending_inc.new_pools[pool].v.size = n;
+ ss << "set pool " << pool << " size to " << n;
getline(ss, rs);
paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs));
return true;
+ } else if (m->cmd[3] == "pg_num") {
+ if (n <= p->get_pg_num()) {
+ ss << "specified pg_num " << n << " <= current " << p->get_pg_num();
+ } else if (!mon->pgmon()->pg_map.creating_pgs.empty()) {
+ ss << "currently creating pgs, wait";
+ err = -EAGAIN;
+ } else {
+ pending_inc.new_pools[pool] = osdmap.pools[pool];
+ pending_inc.new_pools[pool].v.pg_num = n;
+ ss << "set pool " << pool << " pg_num to " << n;
+ getline(ss, rs);
+ paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs));
+ return true;
+ }
+ } else if (m->cmd[3] == "pgp_num") {
+ if (n <= p->get_pgp_num()) {
+ ss << "specified pgp_num " << n << " <= current " << p->get_pgp_num();
+ } else if (n > p->get_pg_num()) {
+ ss << "specified pgp_num " << n << " > pg_num " << p->get_pg_num();
+ } else if (!mon->pgmon()->pg_map.creating_pgs.empty()) {
+ ss << "still creating pgs, wait";
+ err = -EAGAIN;
+ } else {
+ pending_inc.new_pools[pool] = osdmap.pools[pool];
+ pending_inc.new_pools[pool].v.pgp_num = n;
+ ss << "set pool " << pool << " pgp_num to " << n;
+ getline(ss, rs);
+ paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs));
+ return true;
+ }
+ } else {
+ ss << "unrecognized pool field " << m->cmd[3];
}
}
} else {
bool PGMonitor::register_new_pgs()
{
- dout(10) << "osdmap last_pg_change " << mon->osdmon()->osdmap.get_last_pg_change()
- << ", pgmap last_pg_scan " << pg_map.last_pg_scan << dendl;
- if (mon->osdmon()->osdmap.get_last_pg_change() <= pg_map.last_pg_scan ||
- mon->osdmon()->osdmap.get_last_pg_change() <= pending_inc.pg_scan) {
- dout(10) << "register_new_pgs -- i've already scanned pg space since last significant osdmap update" << dendl;
- return false;
- }
-
// iterate over crush mapspace
- dout(10) << "register_new_pgs scanning pgid space defined by crush rule masks" << dendl;
-
- CrushWrapper *crush = &mon->osdmon()->osdmap.crush;
- int pg_num = mon->osdmon()->osdmap.get_pg_num();
epoch_t epoch = mon->osdmon()->osdmap.get_epoch();
+ dout(10) << "register_new_pgs checking pg pools for osdmap epoch " << epoch
+ << ", last_pg_scan " << pg_map.last_pg_scan << dendl;
bool first = pg_map.pg_stat.empty(); // first pg creation
int created = 0;
- for (map<int,ceph_pg_pool>::iterator p = mon->osdmon()->osdmap.pools.begin();
+ for (map<int,pg_pool_t>::iterator p = mon->osdmon()->osdmap.pools.begin();
p != mon->osdmon()->osdmap.pools.end();
p++) {
- int pool = p->first;
- int type = p->second.type;
- int ruleno = p->second.crush_ruleset;
- if (!crush->rule_exists(ruleno))
+ int poolid = p->first;
+ pg_pool_t &pool = p->second;
+ int type = pool.get_type();
+ int ruleno = pool.get_crush_ruleset();
+ if (!mon->osdmon()->osdmap.crush.rule_exists(ruleno))
continue;
- for (ps_t ps = 0; ps < pg_num; ps++) {
- pg_t pgid(type, ps, pool, -1);
+
+ if (pool.get_last_change() <= pg_map.last_pg_scan ||
+ pool.get_last_change() <= pending_inc.pg_scan) {
+ dout(10) << " no change in " << pool << dendl;
+ continue;
+ }
+
+ dout(10) << "register_new_pgs scanning " << pool << dendl;
+
+ for (ps_t ps = 0; ps < pool.get_pg_num(); ps++) {
+ pg_t pgid(type, ps, poolid, -1);
if (pg_map.pg_stat.count(pgid)) {
- dout(20) << "register_new_pgs have " << pgid << dendl;
+ dout(20) << "register_new_pgs have " << pgid << dendl;
continue;
}
parent = pgid;
while (1) {
// remove most significant bit
- int msb = calc_bits_of(parent.u.pg.ps);
+ int msb = pool.calc_bits_of(parent.u.pg.ps);
if (!msb) break;
parent.u.pg.ps &= ~(1<<(msb-1));
split_bits++;
created++;
if (split_bits == 0) {
- dout(10) << "register_new_pgs will create " << pgid << dendl;
+ dout(10) << "register_new_pgs will create " << pgid << dendl;
} else {
- dout(10) << "register_new_pgs will create " << pgid
+ dout(10) << "register_new_pgs will create " << pgid
<< " parent " << parent
<< " by " << split_bits << " bits"
<< dendl;
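
A standalone sketch (not part of the patch) of the bit-clearing walk above: clearing the most significant bit of ps steps a freshly created pg back toward the pg it split from, and split_bits counts the steps. The stop condition here (a parent assumed to already exist) stands in for the monitor's real check against pg_map.

#include <stdio.h>

static int calc_bits_of(int t)
{
	int b = 0;
	while (t > 0) {
		t >>= 1;
		b++;
	}
	return b;
}

int main(void)
{
	int ps = 10;            /* new pg 10, created by raising pg_num past 8 */
	int split_bits = 0;

	while (1) {
		int msb = calc_bits_of(ps);
		if (!msb)
			break;
		ps &= ~(1 << (msb - 1));   /* 10 (1010b) -> 2 (0010b) */
		split_bits++;
		/* assume pg 2 already exists; a real scan would stop at the
		 * first candidate already present in the pg map */
		if (ps == 2)
			break;
	}
	printf("parent ps %d, split by %d bit(s)\n", ps, split_bits);
	return 0;
}
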
<< "created " << get_created() << "\n"
<< "modifed " << get_modified() << "\n"
<< std::endl;
- out << "pg_num " << get_pg_num() << "\n"
- << "pgp_num " << get_pgp_num() << "\n"
- << "lpg_num " << get_lpg_num() << "\n"
- << "lpgp_num " << get_lpgp_num() << "\n"
- << "last_pg_change " << get_last_pg_change() << "\n"
- << std::endl;
- for (map<int,ceph_pg_pool>::iterator p = pools.begin(); p != pools.end(); p++)
+ for (map<int,pg_pool_t>::iterator p = pools.begin(); p != pools.end(); p++)
out << "pg_pool " << p->first
<< " '" << pool_name[p->first]
- << "' size " << (int)p->second.size
- << " crush_ruleset " << (int)p->second.crush_ruleset
- << "\n";
+ << "' " << p->second << "\n";
out << std::endl;
out << "max_osd " << get_max_osd() << "\n";
created = modified = g_clock.now();
set_max_osd(num_osd);
- pg_num = pgp_num = num_osd << pg_bits;
- lpg_num = lpgp_num = lpg_bits ? (1 << (lpg_bits-1)) : 0;
// crush map
map<int, const char*> rulesets;
int pool = 0;
for (map<int,const char*>::iterator p = rulesets.begin(); p != rulesets.end(); p++) {
- pools[pool].size = 2;
- pools[pool].crush_ruleset = p->first;
- pools[pool].type = CEPH_PG_TYPE_REP;
+ pools[pool].v.type = CEPH_PG_TYPE_REP;
+ pools[pool].v.size = 2;
+ pools[pool].v.crush_ruleset = p->first;
+ pools[pool].v.pg_num = num_osd << pg_bits;
+ pools[pool].v.pgp_num = num_osd << pg_bits;
+ pools[pool].v.lpg_num = lpg_bits ? (1 << (lpg_bits-1)) : 0;
+ pools[pool].v.lpgp_num = lpg_bits ? (1 << (lpg_bits-1)) : 0;
+ pools[pool].v.last_change = epoch;
pool_name[pool] = p->second;
pool++;
}
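
For concreteness, the initial per-pool counts build_simple() fills in, evaluated for sample inputs (not part of the patch; the pg_bits and lpg_bits values here are arbitrary, not the configured defaults).

#include <stdio.h>

int main(void)
{
	int num_osd = 4, pg_bits = 6, lpg_bits = 2;

	int pg_num = num_osd << pg_bits;                      /* 4 << 6 = 256 */
	int pgp_num = pg_num;                                 /* placement matches */
	int lpg_num = lpg_bits ? (1 << (lpg_bits - 1)) : 0;   /* 2 local pgs per device */
	int lpgp_num = lpg_num;

	printf("pg_num %d pgp_num %d lpg_num %d lpgp_num %d\n",
	       pg_num, pgp_num, lpg_num, lpgp_num);
	return 0;
}
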
//#define PG_ROLE_TAIL 2
-inline int calc_bits_of(int t) {
- int b = 0;
- while (t > 0) {
- t = t >> 1;
- b++;
- }
- return b;
-}
-
-
/*
* we track up to two intervals during which the osd was alive and
utime_t modified;
int32_t new_flags;
+ /*
bool is_pg_change() {
return (fullmap.length() ||
crush.length() ||
new_pg_num ||
new_lpg_num);
}
+ */
// full (rare)
bufferlist fullmap; // in lieu of below.
// incremental
int32_t new_max_osd;
- int32_t new_pg_num, new_pgp_num, new_lpg_num, new_lpgp_num;
- map<int32_t,ceph_pg_pool> new_pools;
+ map<int32_t,pg_pool_t> new_pools;
map<int32_t,nstring> new_pool_names;
set<int32_t> old_pools;
map<int32_t,entity_addr_t> new_up;
::encode(crush, bl);
::encode(new_max_osd, bl);
- ::encode(new_pg_num, bl);
- ::encode(new_pgp_num, bl);
- ::encode(new_lpg_num, bl);
- ::encode(new_lpgp_num, bl);
::encode(new_pools, bl);
::encode(old_pools, bl);
::encode(new_up, bl);
::decode(crush, p);
::decode(new_max_osd, p);
- ::decode(new_pg_num, p);
- ::decode(new_pgp_num, p);
- ::decode(new_lpg_num, p);
- ::decode(new_lpgp_num, p);
::decode(new_pools, p);
::decode(old_pools, p);
::decode(new_up, p);
::decode(old_blacklist, p);
}
- Incremental(epoch_t e=0) : epoch(e), new_flags(-1), new_max_osd(-1),
- new_pg_num(0), new_pgp_num(0), new_lpg_num(0), new_lpgp_num(0) {
+ Incremental(epoch_t e=0) : epoch(e), new_flags(-1), new_max_osd(-1) {
memset(&fsid, 0, sizeof(fsid));
}
Incremental(bufferlist &bl) {
epoch_t epoch; // what epoch of the osd cluster descriptor is this
utime_t created, modified; // epoch start time
- /*
- * placement groups
- *
- * pg_num -- base number of pseudorandomly placed pgs
- *
- * pgp_num -- effective number when calculating pg placement. this
- * is used for pg_num increases. new pgs result in data being
- * "split" into new pgs. for this to proceed smoothly, new pgs are
- * intiially colocated with their parents; that is, pgp_num doesn't
- * increase until the new pgs have successfully split. only _then_
- * are the new pgs placed independently.
- *
- * lpg_num -- localized pg count (per device). replicas are
- * randomly selected.
- *
- * lpgp_num -- as above.
- */
- int32_t pg_num, pg_num_mask; // placement group count and bitmask
- int32_t pgp_num, pgp_num_mask; // pg placement num (for placing pg's.. <= pg_num)
- int32_t lpg_num, lpg_num_mask; // localized placement group count
- int32_t lpgp_num, lpgp_num_mask; // as above
-
- // new pgs
- epoch_t last_pg_change; // most recent epoch initiating possible pg creation
-
uint32_t flags;
int32_t max_osd;
vector<__u32> osd_weight; // 16.16 fixed point, 0x10000 = "in", 0 = "out"
vector<osd_info_t> osd_info;
- map<int,ceph_pg_pool> pools;
+ map<int,pg_pool_t> pools;
map<int,nstring> pool_name;
map<pg_t,uint32_t> pg_swap_primary; // force new osd to be pg primary (if already a member)
snapid_t max_snap;
public:
OSDMap() : epoch(0),
- pg_num(0), pgp_num(0), lpg_num(0), lpgp_num(0),
- last_pg_change(0),
flags(0),
max_osd(0), max_snap(0) {
memset(&fsid, 0, sizeof(fsid));
- calc_pg_masks();
}
// map info
epoch_t get_epoch() const { return epoch; }
void inc_epoch() { epoch++; }
- void set_epoch(epoch_t e) { epoch = e; }
- /* pg num / masks */
- void calc_pg_masks() {
- pg_num_mask = (1 << calc_bits_of(pg_num-1)) - 1;
- pgp_num_mask = (1 << calc_bits_of(pgp_num-1)) - 1;
- lpg_num_mask = (1 << calc_bits_of(lpg_num-1)) - 1;
- lpgp_num_mask = (1 << calc_bits_of(lpgp_num-1)) - 1;
+ void set_epoch(epoch_t e) {
+ epoch = e;
+ for (map<int,pg_pool_t>::iterator p = pools.begin();
+ p != pools.end();
+ p++)
+ p->second.v.last_change = e;
}
- int get_pg_num() const { return pg_num; }
- int get_pgp_num() const { return pgp_num; }
- int get_lpg_num() const { return lpg_num; }
- int get_lpgp_num() const { return lpgp_num; }
-
- int get_pg_num_mask() const { return pg_num_mask; }
- int get_pgp_num_mask() const { return pgp_num_mask; }
- int get_lpg_num_mask() const { return lpg_num_mask; }
- int get_lpgp_num_mask() const { return lpgp_num_mask; }
-
/* stamps etc */
const utime_t& get_created() const { return created; }
const utime_t& get_modified() const { return modified; }
- epoch_t get_last_pg_change() const {
- return last_pg_change;
- }
-
snapid_t get_max_snap() { return max_snap; }
bool is_removed_snap(snapid_t sn) {
if (sn > max_snap)
modified = inc.modified;
// full map?
- if (inc.fullmap.length())
+ if (inc.fullmap.length()) {
decode(inc.fullmap);
-
- if (inc.is_pg_change())
- last_pg_change = epoch;
-
- if (inc.fullmap.length())
return;
+ }
// nope, incremental.
if (inc.new_flags >= 0)
flags = inc.new_flags;
- if (inc.new_pg_num) {
- assert(inc.new_pg_num >= pg_num);
- pg_num = inc.new_pg_num;
- }
- if (inc.new_lpg_num) {
- assert(inc.new_lpg_num >= lpg_num);
- lpg_num = inc.new_lpg_num;
- }
- if (inc.new_pgp_num) {
- assert(inc.new_pgp_num >= pgp_num);
- pgp_num = inc.new_pgp_num;
- }
- if (inc.new_lpgp_num) {
- assert(inc.new_lpgp_num >= lpgp_num);
- lpgp_num = inc.new_lpgp_num;
- }
-
if (inc.new_max_osd >= 0)
set_max_osd(inc.new_max_osd);
pools.erase(*p);
pool_name.erase(*p);
}
- for (map<int32_t,ceph_pg_pool>::iterator p = inc.new_pools.begin();
+ for (map<int32_t,pg_pool_t>::iterator p = inc.new_pools.begin();
p != inc.new_pools.end();
p++)
pools[p->first] = p->second;
::encode(epoch, blist);
::encode(created, blist);
::encode(modified, blist);
- ::encode(pg_num, blist);
- ::encode(pgp_num, blist);
- ::encode(lpg_num, blist);
- ::encode(lpgp_num, blist);
- ::encode(last_pg_change, blist);
int32_t max_pools = 0;
if (pools.size())
::decode(epoch, p);
::decode(created, p);
::decode(modified, p);
- ::decode(pg_num, p);
- ::decode(pgp_num, p);
- ::decode(lpg_num, p);
- ::decode(lpgp_num, p);
- calc_pg_masks();
- ::decode(last_pg_change, p);
int32_t max_pools;
::decode(max_pools, p);
return layout;
}
-
- /*
- * map a raw pg (with full precision ps) into an actual pg, for storage
- */
- pg_t raw_pg_to_pg(pg_t pg) {
- if (pg.preferred() >= 0)
- pg.u.pg.ps = ceph_stable_mod(pg.ps(), lpg_num, lpg_num_mask);
- else
- pg.u.pg.ps = ceph_stable_mod(pg.ps(), pg_num, pg_num_mask);
- return pg;
- }
-
- /*
- * map raw pg (full precision ps) into a placement ps
- */
- ps_t raw_pg_to_pps(pg_t pg) {
- if (pg.preferred() >= 0)
- return ceph_stable_mod(pg.ps(), lpgp_num, lpgp_num_mask);
- else
- return ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask);
- }
-
// pg -> (osd list)
int pg_to_osds(pg_t pg, vector<int>& osds) {
// map to osds[]
if (!pools.count(p)) {
return osds.size();
}
- ceph_pg_pool &pool = pools[p];
- ps_t pps = raw_pg_to_pps(pg); // placement ps
+ pg_pool_t &pool = pools[p];
+ ps_t pps = pool.raw_pg_to_pps(pg); // placement ps
+ unsigned size = pool.get_size();
switch (g_conf.osd_pg_layout) {
case CEPH_PG_LAYOUT_CRUSH:
{
// what crush rule?
- int ruleno = crush.find_rule(pool.crush_ruleset, pg.type(), pool.size);
+ int ruleno = crush.find_rule(pool.get_crush_ruleset(), pg.type(), size);
if (ruleno >= 0)
- crush.do_rule(ruleno, pps, osds, pool.size, pg.preferred(), osd_weight);
+ crush.do_rule(ruleno, pps, osds, size, pg.preferred(), osd_weight);
}
break;
case CEPH_PG_LAYOUT_LINEAR:
- for (unsigned i=0; i<pool.size; i++)
- osds.push_back( (i + pps*pool.size) % g_conf.num_osd );
+ for (unsigned i=0; i<size; i++)
+ osds.push_back( (i + pps*size) % g_conf.num_osd );
break;
case CEPH_PG_LAYOUT_HYBRID:
{
int h = crush_hash32(pps);
- for (unsigned i=0; i<pool.size; i++)
+ for (unsigned i=0; i<size; i++)
osds.push_back( (h+i) % g_conf.num_osd );
}
break;
case CEPH_PG_LAYOUT_HASH:
{
- for (unsigned i=0; i<pool.size; i++) {
+ for (unsigned i=0; i<size; i++) {
int t = 1;
int osd = 0;
while (t++) {
if (osds.empty()) {
osds.push_back(osd);
} else {
- assert(pool.size > 0);
- for (unsigned i=1; i<pool.size; i++)
+ assert(size > 0);
+ for (unsigned i=1; i<size; i++)
if (osds[i] == osd) {
// swap with position 0
osds[i] = osds[0];
unsigned get_pg_size(pg_t pg) {
- ceph_pg_pool &pool = pools[pg.pool()];
- return pool.size;
+ assert(pools.count(pg.pool()));
+ pg_pool_t &pool = pools[pg.pool()];
+ return pool.get_size();
+ }
+
+ pg_t raw_pg_to_pg(pg_t pg) {
+ assert(pools.count(pg.pool()));
+ pg_pool_t &pool = pools[pg.pool()];
+ return pool.raw_pg_to_pg(pg);
}
// pg -> primary osd
-#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v011"
+#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v012"
return st;
}
+
+
+
+/*
+ * pg_pool
+ */
+struct pg_pool_t {
+ ceph_pg_pool v;
+ int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
+
+ int get_pg_num() const { return v.pg_num; }
+ int get_pgp_num() const { return v.pgp_num; }
+ int get_lpg_num() const { return v.lpg_num; }
+ int get_lpgp_num() const { return v.lpgp_num; }
+
+ int get_pg_num_mask() const { return pg_num_mask; }
+ int get_pgp_num_mask() const { return pgp_num_mask; }
+ int get_lpg_num_mask() const { return lpg_num_mask; }
+ int get_lpgp_num_mask() const { return lpgp_num_mask; }
+
+ int calc_bits_of(int t) {
+ int b = 0;
+ while (t > 0) {
+ t = t >> 1;
+ b++;
+ }
+ return b;
+ }
+
+ unsigned get_type() const { return v.type; }
+ unsigned get_size() const { return v.size; }
+ int get_crush_ruleset() const { return v.crush_ruleset; }
+ epoch_t get_last_change() const { return v.last_change; }
+
+ void calc_pg_masks() {
+ pg_num_mask = (1 << calc_bits_of(v.pg_num-1)) - 1;
+ pgp_num_mask = (1 << calc_bits_of(v.pgp_num-1)) - 1;
+ lpg_num_mask = (1 << calc_bits_of(v.lpg_num-1)) - 1;
+ lpgp_num_mask = (1 << calc_bits_of(v.lpgp_num-1)) - 1;
+ }
+
+ /*
+ * map a raw pg (with full precision ps) into an actual pg, for storage
+ */
+ pg_t raw_pg_to_pg(pg_t pg) const {
+ if (pg.preferred() >= 0 && v.lpg_num)
+ pg.u.pg.ps = ceph_stable_mod(pg.ps(), v.lpg_num, lpg_num_mask);
+ else
+ pg.u.pg.ps = ceph_stable_mod(pg.ps(), v.pg_num, pg_num_mask);
+ return pg;
+ }
+
+ /*
+ * map raw pg (full precision ps) into a placement ps
+ */
+ ps_t raw_pg_to_pps(pg_t pg) const {
+ if (pg.preferred() >= 0 && v.lpgp_num)
+ return ceph_stable_mod(pg.ps(), v.lpgp_num, lpgp_num_mask);
+ else
+ return ceph_stable_mod(pg.ps(), v.pgp_num, pgp_num_mask);
+ }
+
+ void encode(bufferlist& bl) const {
+ ::encode(v, bl);
+ }
+ void decode(bufferlist::iterator& bl) {
+ ::decode(v, bl);
+ calc_pg_masks();
+ }
+};
+WRITE_CLASS_ENCODER(pg_pool_t)
+
+inline ostream& operator<<(ostream& out, const pg_pool_t& p) {
+ return out << "pg_pool(type " << p.get_type()
+ << " size " << p.get_size()
+ << " ruleset " << p.get_crush_ruleset()
+ << " pg_num " << p.get_pg_num()
+ << " pgp_num " << p.get_pgp_num()
+ << " lpg_num " << p.get_lpg_num()
+ << " lpgp_num " << p.get_lpgp_num()
+ << " last_change " << p.get_last_change()
+ << ")";
+}
+
/** pg_stat
* aggregate stats for a single PG.
*/