int64_t ol = t.get_int();
object_t oid(oh, ol);
lock.Lock();
- ceph_object_layout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 2);
+ ceph_object_layout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 2, 0);
off_t size;
client->objecter->stat(oid, &size, layout, new C_SafeCond(&lock, &cond, &ack));
while (!ack) cond.Wait(lock);
int64_t len = t.get_int();
object_t oid(oh, ol);
lock.Lock();
- ceph_object_layout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 2);
+ ceph_object_layout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 2, 0);
bufferlist bl;
client->objecter->read(oid, off, len, layout, &bl, new C_SafeCond(&lock, &cond, &ack));
while (!ack) cond.Wait(lock);
int64_t len = t.get_int();
object_t oid(oh, ol);
lock.Lock();
- ceph_object_layout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 2);
+ ceph_object_layout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 2, 0);
bufferptr bp(len);
bufferlist bl;
bl.push_back(bp);
int64_t len = t.get_int();
object_t oid(oh, ol);
lock.Lock();
- ceph_object_layout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 2);
+ ceph_object_layout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 2, 0);
client->objecter->zero(oid, off, len, layout,
new C_SafeCond(&lock, &cond, &ack),
safeg->new_sub());
if (time_to_stop()) break;
object_t oid(0x1000, i);
- ceph_object_layout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, g_OSD_FileLayout.fl_pg_size);
+ ceph_object_layout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, g_OSD_FileLayout.fl_pg_size, 0);
if (i % inflight == 0) {
dout(6) << "create_objects " << i << "/" << (nobj+1) << dendl;
}
object_t oid(0x1000, o);
- ceph_object_layout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, g_OSD_FileLayout.fl_pg_size);
+ ceph_object_layout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, g_OSD_FileLayout.fl_pg_size, 0);
client->client_lock.Lock();
utime_t start = g_clock.now();
fl_stripe_unit: 1<<22,
fl_stripe_count: 1,
fl_object_size: 1<<22,
+ fl_cas_hash: 0,
fl_object_stripe_unit: 0,
fl_pg_preferred: -1,
fl_pg_type: CEPH_PG_TYPE_REP,
- fl_pg_size: 2
+ fl_pg_size: 2,
+ fl_pg_pool: 0
};
struct ceph_file_layout g_OSD_MDDirLayout = {
fl_stripe_unit: 1<<22,
fl_stripe_count: 1,
fl_object_size: 1<<22,
+ fl_cas_hash: 0,
fl_object_stripe_unit: 0,
fl_pg_preferred: -1,
fl_pg_type: CEPH_PG_TYPE_REP,
- fl_pg_size: 2
+ fl_pg_size: 2,
+ fl_pg_pool: 0
};
struct ceph_file_layout g_OSD_MDLogLayout = {
fl_stripe_unit: 1<<20,
fl_stripe_count: 1,
fl_object_size: 1<<20,
+ fl_cas_hash: 0,
fl_object_stripe_unit: 0,
fl_pg_preferred: -1,
fl_pg_type: CEPH_PG_TYPE_REP,
- fl_pg_size: 2
+ fl_pg_size: 2,
+ fl_pg_pool: 0
};
struct ceph_file_layout g_OSD_MDAnchorTableLayout = {
fl_stripe_unit: 1<<20,
fl_stripe_count: 1,
fl_object_size: 1<<20,
+ fl_cas_hash: 0,
fl_object_stripe_unit: 0,
fl_pg_preferred: -1,
fl_pg_type: CEPH_PG_TYPE_REP,
- fl_pg_size: 2
+ fl_pg_size: 2,
+ fl_pg_pool: 0
};
#include <msg/msg_types.h>
ObjectStore *store = new Ebofs(dev);
bufferlist bl;
store->mount();
- int r = store->read(object_t(0,0), 0, sizeof(sb), bl);
+ int r = store->read(OSD_SUPERBLOCK_POBJECT, 0, sizeof(sb), bl);
if (r < 0) {
cerr << "couldn't read superblock object on " << dev << std::endl;
exit(0);
__u32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple of page size. */
__u32 fl_stripe_count; /* over this many objects */
__u32 fl_object_size; /* until objects are this big, then move to new objects */
+ __u32 fl_cas_hash; /* 0 = none; 1 = sha256 */
/* pg -> disk layout */
__u32 fl_object_stripe_unit; /* for per-object parity, if any */
__s32 fl_pg_preferred; /* preferred primary for pg, if any (-1 = none) */
__u8 fl_pg_type; /* pg type; see PG_TYPE_* */
__u8 fl_pg_size; /* pg size (num replicas, raid stripe width, etc. */
+ __u8 fl_pg_pool; /* implies crush ruleset AND object namespace */
};
#define ceph_file_layout_stripe_width(l) (l.fl_stripe_unit * l.fl_stripe_count)
union ceph_pg {
__u64 pg64;
struct {
- __s32 preferred; /* preferred primary osd */
+ __s16 preferred; /* preferred primary osd */
__u16 ps; /* placement seed */
+ __u8 pool; /* implies crush ruleset */
__u8 type;
__u8 size;
+ __u8 __pad;
} pg;
};
#define ceph_pg_is_raid4(pg) (pg.pg.type == CEPH_PG_TYPE_RAID4)
/*
- * crush rule ids. fixme.
+ * crush rule ids. fixme, this static mapping to rule ids is lame.
*/
-#define CRUSH_REP_RULE(nrep) (nrep)
-#define CRUSH_RAID_RULE(num) (10+num)
+#define CRUSH_MAX_REP 8
+#define CRUSH_PG_TYPES 2
+#define CRUSH_RULE_OFFSET(p, t) (((p)*CRUSH_PG_TYPES + (t))*CRUSH_MAX_REP)
+#define CRUSH_REP_RULE(nrep, pool) (CRUSH_RULE_OFFSET(pool, 0) + (nrep))
+#define CRUSH_RAID_RULE(num, pool) (CRUSH_RULE_OFFSET(pool, 1) + (num))
/*
* stable_mod func is used to control number of placement groups
uint32_t rank; // rank/stripe id (e.g. for parity encoding)
object_t oid; // logical object
pobject_t() : volume(0), rank(0) {}
- pobject_t(object_t o) : volume(0), rank(0), oid(o) {} // this should go away eventually
+ //pobject_t(object_t o) : volume(0), rank(0), oid(o) {} // this should go away eventually
pobject_t(uint16_t v, uint16_t r, object_t o) : volume(v), rank(r), oid(o) {}
} __attribute__ ((packed));
/* choose dest */
switch (req->r_pgid.pg.type) {
case CEPH_PG_TYPE_REP:
- rule = CRUSH_REP_RULE(req->r_pgid.pg.size);
+ rule = CRUSH_REP_RULE(req->r_pgid.pg.size, req->r_pgid.pg.pool);
break;
default:
BUG_ON(1);
// rules
// replication
- for (int i=1; i<=ndom; i++) {
- crush_rule *rule = crush_make_rule(4);
- crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
- crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_FIRSTN, i, 1);
- crush_rule_set_step(rule, 2, CRUSH_RULE_CHOOSE_FIRSTN, 1, 0);
- crush_rule_set_step(rule, 3, CRUSH_RULE_EMIT, 0, 0);
- crush_add_rule(crush.crush, CRUSH_REP_RULE(i), rule);
- }
-
- // raid
- for (int i=g_conf.osd_min_raid_width; i <= g_conf.osd_max_raid_width; i++) {
- if (ndom >= i) {
+ for (int pool=0; pool<1; pool++)
+ for (int i=1; i<=ndom; i++) {
crush_rule *rule = crush_make_rule(4);
crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
- crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_INDEP, i, 1);
- crush_rule_set_step(rule, 2, CRUSH_RULE_CHOOSE_INDEP, 1, 0);
+ crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_FIRSTN, i, 1);
+ crush_rule_set_step(rule, 2, CRUSH_RULE_CHOOSE_FIRSTN, 1, 0);
crush_rule_set_step(rule, 3, CRUSH_RULE_EMIT, 0, 0);
- crush_add_rule(crush.crush, CRUSH_RAID_RULE(i), rule);
- } else {
- crush_rule *rule = crush_make_rule(3);
- crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
- crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_INDEP, i, 0);
- crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
- crush_add_rule(crush.crush, CRUSH_RAID_RULE(i), rule);
+ crush_add_rule(crush.crush, CRUSH_REP_RULE(i, pool), rule);
+ }
+
+ // raid
+ for (int pool=0; pool<1; pool++)
+ for (int i=g_conf.osd_min_raid_width; i <= g_conf.osd_max_raid_width; i++) {
+ if (ndom >= i) {
+ crush_rule *rule = crush_make_rule(4);
+ crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
+ crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_INDEP, i, 1);
+ crush_rule_set_step(rule, 2, CRUSH_RULE_CHOOSE_INDEP, 1, 0);
+ crush_rule_set_step(rule, 3, CRUSH_RULE_EMIT, 0, 0);
+ crush_add_rule(crush.crush, CRUSH_RAID_RULE(i, pool), rule);
+ } else {
+ crush_rule *rule = crush_make_rule(3);
+ crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
+ crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_INDEP, i, 0);
+ crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
+ crush_add_rule(crush.crush, CRUSH_RAID_RULE(i, pool), rule);
+ }
}
- }
} else {
// one bucket
// rules
// replication
- for (int i=1; i<=g_conf.osd_max_rep; i++) {
- crush_rule *rule = crush_make_rule(3);
- crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, root, 0);
- crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_FIRSTN, i, 0);
- crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
- crush_add_rule(crush.crush, CRUSH_REP_RULE(i), rule);
- }
+ for (int pool=0; pool<1; pool++)
+ for (int i=1; i<=g_conf.osd_max_rep; i++) {
+ crush_rule *rule = crush_make_rule(3);
+ crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, root, 0);
+ crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_FIRSTN, i, 0);
+ crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
+ crush_add_rule(crush.crush, CRUSH_REP_RULE(i, pool), rule);
+ }
+
// raid4
- for (int i=g_conf.osd_min_raid_width; i <= g_conf.osd_max_raid_width; i++) {
- crush_rule *rule = crush_make_rule(3);
- crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, root, 0);
- crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_INDEP, i, 0);
- crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
- crush_add_rule(crush.crush, CRUSH_RAID_RULE(i), rule);
- }
+ for (int pool=0; pool<1; pool++)
+ for (int i=g_conf.osd_min_raid_width; i <= g_conf.osd_max_raid_width; i++) {
+ crush_rule *rule = crush_make_rule(3);
+ crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, root, 0);
+ crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_INDEP, i, 0);
+ crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
+ crush_add_rule(crush.crush, CRUSH_RAID_RULE(i, pool), rule);
+ }
}
crush.finalize();
ps_t numps = osdmap.get_pg_num();
int minrep = 1;
int maxrep = MIN(g_conf.num_osd, g_conf.osd_max_rep);
- for (int nrep = minrep; nrep <= maxrep; nrep++) {
- for (ps_t ps = 0; ps < numps; ++ps) {
- pg_t pgid = pg_t(pg_t::TYPE_REP, nrep, ps, -1);
- vector<int> osds;
- osdmap.pg_to_osds(pgid, osds);
- if (osds[0] == 0) {
- pending_inc.new_pg_swap_primary[pgid] = osds[1];
- dout(3) << "Changing primary for PG " << pgid << " from " << osds[0] << " to "
- << osds[1] << dendl;
+ for (int pool=0; pool<1; pool++)
+ for (int nrep = minrep; nrep <= maxrep; nrep++) {
+ for (ps_t ps = 0; ps < numps; ++ps) {
+ pg_t pgid = pg_t(pg_t::TYPE_REP, nrep, ps, pool, -1);
+ vector<int> osds;
+ osdmap.pg_to_osds(pgid, osds);
+ if (osds[0] == 0) {
+ pending_inc.new_pg_swap_primary[pgid] = osds[1];
+ dout(3) << "Changing primary for PG " << pgid << " from " << osds[0] << " to "
+ << osds[1] << dendl;
+ }
}
}
- }
propose_pending();
}
}
-object_t Ager::age_get_oid() {
+pobject_t Ager::age_get_oid() {
if (!age_free_oids.empty()) {
- object_t o = age_free_oids.front();
+ pobject_t o = age_free_oids.front();
age_free_oids.pop_front();
return o;
}
- object_t last = age_cur_oid;
- ++age_cur_oid.bno;
+ pobject_t last = age_cur_oid;
+ ++age_cur_oid.oid.bno;
return last;
}
avail - free > .02)
store->sync();
- object_t oid = age_get_oid();
+ pobject_t poid = age_get_oid();
int b = myrand() % 10;
- age_objects[b].push_back(oid);
+ age_objects[b].push_back(poid);
ssize_t s = age_pick_size();
wrote += (s + 4095) / 4096;
- generic_dout(2) << "age_fill at " << free << " / " << avail << " / " << pc << " creating " << hex << oid << dec << " sz " << s << dendl;
+ generic_dout(2) << "age_fill at " << free << " / " << avail << " / " << pc << " creating " << hex << poid << dec << " sz " << s << dendl;
if (false && !g_conf.ebofs_verify && start_debug && wrote > 1000000ULL) {
ssize_t t = MIN(s, max);
bufferlist sbl;
sbl.substr_of(bl, 0, t);
- store->write(oid, off, t, sbl, false);
+ store->write(poid, off, t, sbl, false);
off += t;
s -= t;
}
- oid.bno++;
+ poid.oid.bno++;
}
return wrote*4; // KB
n = nper;
continue;
}
- object_t oid = age_objects[b].front();
+ pobject_t poid = age_objects[b].front();
age_objects[b].pop_front();
- generic_dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << " removing " << hex << oid << dec << dendl;
+ generic_dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << " removing " << hex << poid << dec << dendl;
- store->remove(oid);
- age_free_oids.push_back(oid);
+ store->remove(poid);
+ age_free_oids.push_back(poid);
}
g_conf.ebofs_verify = false;
utime_t nextfl = start;
nextfl.sec_ref() += freelist_inc;
- while (age_objects.size() < 10) age_objects.push_back( list<object_t>() );
+ while (age_objects.size() < 10) age_objects.push_back( list<pobject_t>() );
if (fake_size_mb) {
int fake_bl = fake_size_mb * 256;
// init size distn (once)
if (!did_distn) {
did_distn = true;
- age_cur_oid = object_t(0,1);
+ age_cur_oid = pobject_t(0, 0, object_t(0,1));
file_size_distn.add(1, 19.0758125+0.65434375);
file_size_distn.add(512, 35.6566);
file_size_distn.add(1024, 27.7271875);
ObjectStore *store;
private:
- list<object_t> age_free_oids;
- object_t age_cur_oid;
- vector< list<object_t> > age_objects;
+ list<pobject_t> age_free_oids;
+ pobject_t age_cur_oid;
+ vector< list<pobject_t> > age_objects;
Distribution file_size_distn; //kb
bool did_distn;
void age_empty(float pc);
uint64_t age_fill(float pc, utime_t until);
ssize_t age_pick_size();
- object_t age_get_oid();
+ pobject_t age_get_oid();
public:
Ager(ObjectStore *s) : store(s), did_distn(false) {}
const char *osd_base_path = "./osddata";
const char *ebofs_base_path = "./dev";
-static const object_t SUPERBLOCK_OBJECT(0,0);
-
// <hack> force remount hack for performance testing FakeStore
class C_Remount : public Context {
OSD *osd;
bl.push_back(bp);
utime_t start = g_clock.now();
for (int i=0; i<1000; i++)
- store->write(object_t(999,i), 0, bl.length(), bl, 0);
+ store->write(pobject_t(0, 0, object_t(999,i)), 0, bl.length(), bl, 0);
store->sync();
utime_t end = g_clock.now();
end -= start;
dout(0) << "measured " << (1000.0 / (double)end) << " mb/sec" << dendl;
for (int i=0; i<1000; i++)
- store->remove(object_t(999,i), 0);
+ store->remove(pobject_t(0, 0, object_t(999,i)), 0);
// set osd weight
superblock.weight = (1000.0 / (double)end);
bufferlist bl;
bl.append((char*)&superblock, sizeof(superblock));
- t.write(SUPERBLOCK_OBJECT, 0, sizeof(superblock), bl);
+ t.write(OSD_SUPERBLOCK_POBJECT, 0, sizeof(superblock), bl);
}
int OSD::read_superblock()
{
bufferlist bl;
- int r = store->read(SUPERBLOCK_OBJECT, 0, sizeof(superblock), bl);
+ int r = store->read(OSD_SUPERBLOCK_POBJECT, 0, sizeof(superblock), bl);
if (bl.length() != sizeof(superblock)) {
dout(10) << "read_superblock failed, r = " << r << ", i got " << bl.length() << " bytes, not " << sizeof(superblock) << dendl;
return -1;
p++)
t.remove(*p);
t.remove_collection(pgid);
- t.remove(pgid.to_object()); // log too
+ t.remove(pgid.to_pobject()); // log too
}
store->apply_transaction(t);
for (map<epoch_t,bufferlist>::iterator p = m->maps.begin();
p != m->maps.end();
p++) {
- object_t oid = get_osdmap_object_name(p->first);
- if (store->exists(oid)) {
+ pobject_t poid = get_osdmap_pobject_name(p->first);
+ if (store->exists(poid)) {
dout(10) << "handle_osd_map already had full map epoch " << p->first << dendl;
logger->inc("mapfdup");
bufferlist bl;
}
dout(10) << "handle_osd_map got full map epoch " << p->first << dendl;
- store->write(oid, 0, p->second.length(), p->second, 0); // store _outside_ transaction; activate_map reads it.
+ store->write(poid, 0, p->second.length(), p->second, 0); // store _outside_ transaction; activate_map reads it.
if (p->first > superblock.newest_map)
superblock.newest_map = p->first;
for (map<epoch_t,bufferlist>::iterator p = m->incremental_maps.begin();
p != m->incremental_maps.end();
p++) {
- object_t oid = get_inc_osdmap_object_name(p->first);
- if (store->exists(oid)) {
+ pobject_t poid = get_inc_osdmap_pobject_name(p->first);
+ if (store->exists(poid)) {
dout(10) << "handle_osd_map already had incremental map epoch " << p->first << dendl;
logger->inc("mapidup");
bufferlist bl;
}
dout(10) << "handle_osd_map got incremental map epoch " << p->first << dendl;
- store->write(oid, 0, p->second.length(), p->second, 0); // store _outside_ transaction; activate_map reads it.
+ store->write(poid, 0, p->second.length(), p->second, 0); // store _outside_ transaction; activate_map reads it.
if (p->first > superblock.newest_map)
superblock.newest_map = p->first;
dout(10) << "cur " << cur << " < newest " << superblock.newest_map << dendl;
if (m->incremental_maps.count(cur+1) ||
- store->exists(get_inc_osdmap_object_name(cur+1))) {
+ store->exists(get_inc_osdmap_pobject_name(cur+1))) {
dout(10) << "handle_osd_map decoding inc map epoch " << cur+1 << dendl;
bufferlist bl;
// archive the full map
bl.clear();
osdmap->encode(bl);
- t.write( get_osdmap_object_name(cur+1), 0, bl.length(), bl);
+ t.write( get_osdmap_pobject_name(cur+1), 0, bl.length(), bl);
// notify messenger
for (map<int32_t,uint8_t>::iterator i = inc.new_down.begin();
}
}
else if (m->maps.count(cur+1) ||
- store->exists(get_osdmap_object_name(cur+1))) {
+ store->exists(get_osdmap_pobject_name(cur+1))) {
dout(10) << "handle_osd_map decoding full map epoch " << cur+1 << dendl;
bufferlist bl;
if (m->maps.count(cur+1))
int maxrep = MIN(g_conf.num_osd, g_conf.osd_max_rep);
int minraid = g_conf.osd_min_raid_width;
int maxraid = g_conf.osd_max_raid_width;
+ int numpool = 1; // FIXME
dout(1) << "mkfs " << minrep << ".." << maxrep << " replicas, "
<< minraid << ".." << maxraid << " osd raid groups" << dendl;
// create PGs
// replicated
- for (int nrep = 1; nrep <= maxrep; nrep++) {
- for (ps_t ps = 0; ps < numps; ++ps)
- try_create_pg(pg_t(pg_t::TYPE_REP, nrep, ps, -1), t);
- for (ps_t ps = 0; ps < numlps; ++ps)
- try_create_pg(pg_t(pg_t::TYPE_REP, nrep, ps, whoami), t);
- }
+ for (int pool = 0; pool < numpool; pool++)
+ for (int nrep = 1; nrep <= maxrep; nrep++) {
+ for (ps_t ps = 0; ps < numps; ++ps)
+ try_create_pg(pg_t(pg_t::TYPE_REP, nrep, ps, pool, -1), t);
+ for (ps_t ps = 0; ps < numlps; ++ps)
+ try_create_pg(pg_t(pg_t::TYPE_REP, nrep, ps, pool, whoami), t);
+ }
// raided
- for (int size = minraid; size <= maxraid; size++) {
- for (ps_t ps = 0; ps < numps; ++ps)
- try_create_pg(pg_t(pg_t::TYPE_RAID4, size, ps, -1), t);
- for (ps_t ps = 0; ps < numlps; ++ps)
- try_create_pg(pg_t(pg_t::TYPE_RAID4, size, ps, whoami), t);
- }
+ for (int pool = 0; pool < numpool; pool++)
+ for (int size = minraid; size <= maxraid; size++) {
+ for (ps_t ps = 0; ps < numps; ++ps)
+ try_create_pg(pg_t(pg_t::TYPE_RAID4, size, ps, pool, -1), t);
+ for (ps_t ps = 0; ps < numlps; ++ps)
+ try_create_pg(pg_t(pg_t::TYPE_RAID4, size, ps, pool, whoami), t);
+ }
dout(1) << "mkfs done, created " << pg_map.size() << " pgs" << dendl;
bool OSD::get_map_bl(epoch_t e, bufferlist& bl)
{
- return store->read(get_osdmap_object_name(e), 0, 0, bl) >= 0;
+ return store->read(get_osdmap_pobject_name(e), 0, 0, bl) >= 0;
}
bool OSD::get_inc_map_bl(epoch_t e, bufferlist& bl)
{
- return store->read(get_inc_osdmap_object_name(e), 0, 0, bl) >= 0;
+ return store->read(get_inc_osdmap_pobject_name(e), 0, 0, bl) >= 0;
}
void OSD::get_map(epoch_t epoch, OSDMap &m)
public:
int get_nodeid() { return whoami; }
- static object_t get_osdmap_object_name(epoch_t epoch) { return object_t(0,epoch << 1); }
- static object_t get_inc_osdmap_object_name(epoch_t epoch) { return object_t(0, (epoch << 1) + 1); }
+ static pobject_t get_osdmap_pobject_name(epoch_t epoch) {
+ return pobject_t(OSD_METADATA_PG_POOL, 0, object_t(0, epoch << 1));
+ }
+ static pobject_t get_inc_osdmap_pobject_name(epoch_t epoch) {
+ return pobject_t(OSD_METADATA_PG_POOL, 0, object_t(0, (epoch << 1) + 1));
+ }
private:
// oid -> pg
ceph_object_layout file_to_object_layout(object_t oid, FileLayout& layout) {
- return make_object_layout(oid, layout.fl_pg_type, layout.fl_pg_size, layout.fl_pg_preferred, layout.fl_object_stripe_unit);
+ return make_object_layout(oid, layout.fl_pg_type, layout.fl_pg_size,
+ layout.fl_pg_pool,
+ layout.fl_pg_preferred,
+ layout.fl_object_stripe_unit);
}
- ceph_object_layout make_object_layout(object_t oid, int pg_type, int pg_size, int preferred=-1, int object_stripe_unit = 0) {
+ ceph_object_layout make_object_layout(object_t oid, int pg_type, int pg_size, int pg_pool, int preferred=-1, int object_stripe_unit = 0) {
int num = preferred >= 0 ? localized_pg_num:pg_num;
int num_mask = preferred >= 0 ? localized_pg_num_mask:pg_num_mask;
//cout << "preferred " << preferred << " num " << num << " mask " << num_mask << " ps " << ps << endl;
// construct object layout
- pg_t pgid = pg_t(pg_type, pg_size, ps, preferred);
+ pg_t pgid = pg_t(pg_type, pg_size, ps, pg_pool, preferred);
ceph_object_layout layout;
layout.ol_pgid = pgid.u;
layout.ol_stripe_unit = object_stripe_unit;
{
// what crush rule?
int rule;
- if (pg.is_rep()) rule = CRUSH_REP_RULE(pg.size());
- else if (pg.is_raid4()) rule = CRUSH_RAID_RULE(pg.size());
+ if (pg.is_rep()) rule = CRUSH_REP_RULE(pg.size(), pg.pool());
+ else if (pg.is_raid4()) rule = CRUSH_RAID_RULE(pg.size(), pg.pool());
else assert(0);
crush.do_rule(rule,
pg.ps(),
ondisklog.top = bl.length();
// write it
- t.remove( info.pgid.to_object() );
- t.write( info.pgid.to_object() , 0, bl.length(), bl);
+ t.remove( info.pgid.to_pobject() );
+ t.write( info.pgid.to_pobject() , 0, bl.length(), bl);
t.collection_setattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom));
t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top));
t.collection_setattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom));
t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top));
- t.zero(info.pgid.to_object(), 0, ondisklog.bottom);
+ t.zero(info.pgid.to_pobject(), 0, ondisklog.bottom);
}
bufferptr bp(4096 - sizeof(logentry));
bl.push_back(bp);
}
- t.write( info.pgid.to_object(), ondisklog.top, bl.length(), bl );
+ t.write( info.pgid.to_pobject(), ondisklog.top, bl.length(), bl );
// update block map?
if (ondisklog.top % 4096 == 0)
if (ondisklog.top > 0) {
// read
bufferlist bl;
- store->read(info.pgid.to_object(), ondisklog.bottom, ondisklog.top-ondisklog.bottom, bl);
+ store->read(info.pgid.to_pobject(), ondisklog.bottom, ondisklog.top-ondisklog.bottom, bl);
if (bl.length() < ondisklog.top-ondisklog.bottom) {
dout(0) << "read_log data doesn't match attrs" << dendl;
assert(0);
if (i->is_delete()) continue;
eversion_t v;
- int r = osd->store->getattr(i->oid, "version", &v, sizeof(v));
+ pobject_t poid(info.pgid.pool(), 0, i->oid);
+ int r = osd->store->getattr(poid, "version", &v, sizeof(v));
if (r < 0 || v < i->version)
missing.add(i->oid, i->version);
}
//
bool PG::block_if_wrlocked(MOSDOp* op)
{
- object_t oid = op->get_oid();
+ pobject_t poid(info.pgid.pool(), 0, op->get_oid());
entity_name_t source;
- int len = osd->store->getattr(oid, "wrlock", &source, sizeof(entity_name_t));
+ int len = osd->store->getattr(poid, "wrlock", &source, sizeof(entity_name_t));
//dout(0) << "getattr returns " << len << " on " << oid << dendl;
if (len == sizeof(source) &&
source != op->get_client()) {
//the object is locked for writing by someone else -- add the op to the waiting queue
- waiting_for_wr_unlock[oid].push_back(op);
+ waiting_for_wr_unlock[poid.oid].push_back(op);
return true;
}
bool PG::pick_object_rev(object_t& oid)
{
- pobject_t t = oid;
+ pobject_t t(info.pgid.pool(), 0, oid);
if (!osd->store->pick_object_revision_lt(t))
return false; // we have no revisions of this object!
{
assert(is_missing_object(oid));
+ pobject_t poid(info.pgid.pool(), 0, oid);
+
// we don't have it (yet).
eversion_t v = missing.missing[oid];
if (objects_pulling.count(oid)) {
dout(7) << "missing "
- << oid
+ << poid
<< " v " << v
<< ", already pulling"
<< dendl;
} else {
dout(7) << "missing "
- << oid
+ << poid
<< " v " << v
<< ", pulling"
<< dendl;
- pull(oid);
+ pull(poid);
}
waiting_for_missing_object[oid].push_back(m);
}
return false;
object_t oid = op->get_oid();
+ pobject_t poid(info.pgid.pool(), 0, oid);
// -- load balance reads --
if (is_primary() &&
bool b;
// *** FIXME *** this may block, and we're in the fast path! ***
if (g_conf.osd_balance_reads &&
- osd->store->getattr(pobject_t(0,0,oid), "balance-reads", &b, 1) >= 0)
+ osd->store->getattr(poid, "balance-reads", &b, 1) >= 0)
is_balanced = true;
if (!is_balanced && should_balance &&
// -- fastpath read?
// if this is a read and the data is in the cache, do an immediate read..
if ( g_conf.osd_immediate_read_from_cache ) {
- if (osd->store->is_cached( pobject_t(0,0,oid) ,
+ if (osd->store->is_cached( poid,
op->get_offset(),
op->get_length() ) == 0) {
if (!is_primary() && !op->get_source().is_osd()) {
// am i allowed?
bool v;
- if (osd->store->getattr(pobject_t(0,0,oid), "balance-reads", &v, 1) < 0) {
+ if (osd->store->getattr(poid, "balance-reads", &v, 1) < 0) {
dout(-10) << "preprocess_op in-cache but no balance-reads on " << oid
<< ", fwd to primary" << dendl;
osd->messenger->send_message(op, osd->osdmap->get_inst(get_primary()));
void ReplicatedPG::op_read(MOSDOp *op)
{
object_t oid = op->get_oid();
+ pobject_t poid(info.pgid.pool(), 0, oid);
dout(10) << "op_read " << MOSDOp::get_opname(op->get_op())
<< " " << oid
} else {
// make sure i exist and am balanced, otherwise fw back to acker.
bool b;
- if (!osd->store->exists(oid) ||
- osd->store->getattr(oid, "balance-reads", &b, 1) < 0) {
- dout(-10) << "read on replica, object " << oid
+ if (!osd->store->exists(poid) ||
+ osd->store->getattr(poid, "balance-reads", &b, 1) < 0) {
+ dout(-10) << "read on replica, object " << poid
<< " dne or no balance-reads, fw back to primary" << dendl;
osd->messenger->send_message(op, osd->osdmap->get_inst(get_acker()));
return;
{
// read into a buffer
bufferlist bl;
- r = osd->store->read(oid,
+ r = osd->store->read(poid,
op->get_offset(), op->get_length(),
bl);
reply->set_data(bl);
{
struct stat st;
memset(&st, sizeof(st), 0);
- r = osd->store->stat(oid, &st);
+ r = osd->store->stat(poid, &st);
if (r >= 0)
reply->set_length(st.st_size);
}
void ReplicatedPG::issue_repop(RepGather *repop, int dest, utime_t now)
{
- pobject_t poid = repop->op->get_oid();
+ pobject_t poid(info.pgid.pool(), 0, repop->op->get_oid());
dout(7) << " issue_repop rep_tid " << repop->rep_tid
<< " o " << poid
<< " to osd" << dest
objectrev_t ReplicatedPG::assign_version(MOSDOp *op)
{
- object_t oid = op->get_oid();
+ pobject_t poid(info.pgid.pool(), 0, op->get_oid());
// check crev
objectrev_t crev = 0;
- osd->store->getattr(oid, "crev", (char*)&crev, sizeof(crev));
+ osd->store->getattr(poid, "crev", (char*)&crev, sizeof(crev));
// assign version
eversion_t clone_version;
assert(nv > log.top);
// will clone?
- if (crev && op->get_oid().rev && op->get_oid().rev > crev) {
+ if (crev && poid.oid.rev && poid.oid.rev > crev) {
clone_version = nv;
nv.version++;
}
void ReplicatedPG::op_modify(MOSDOp *op)
{
int whoami = osd->get_nodeid();
- object_t oid = op->get_oid();
+ pobject_t poid(info.pgid.pool(), 0, op->get_oid());
+
const char *opname = MOSDOp::get_opname(op->get_op());
// make sure it looks ok
// balance-reads set?
char v;
if ((op->get_op() != CEPH_OSD_OP_BALANCEREADS && op->get_op() != CEPH_OSD_OP_UNBALANCEREADS) &&
- (osd->store->getattr(op->get_oid(), "balance-reads", &v, 1) >= 0 ||
- balancing_reads.count(op->get_oid()))) {
+ (osd->store->getattr(poid, "balance-reads", &v, 1) >= 0 ||
+ balancing_reads.count(poid.oid))) {
- if (!unbalancing_reads.count(op->get_oid())) {
+ if (!unbalancing_reads.count(poid.oid)) {
// unbalance
- dout(-10) << "preprocess_op unbalancing-reads on " << op->get_oid() << dendl;
- unbalancing_reads.insert(op->get_oid());
+ dout(-10) << "preprocess_op unbalancing-reads on " << poid.oid << dendl;
+ unbalancing_reads.insert(poid.oid);
ceph_object_layout layout;
layout.ol_pgid = info.pgid.u;
layout.ol_stripe_unit = 0;
MOSDOp *pop = new MOSDOp(osd->messenger->get_myinst(), 0, osd->get_tid(),
- op->get_oid(),
+ poid.oid,
layout,
osd->osdmap->get_epoch(),
CEPH_OSD_OP_UNBALANCEREADS);
}
// add to wait queue
- dout(-10) << "preprocess_op waiting for unbalance-reads on " << op->get_oid() << dendl;
- waiting_for_unbalanced_reads[op->get_oid()].push_back(op);
+ dout(-10) << "preprocess_op waiting for unbalance-reads on " << poid.oid << dendl;
+ waiting_for_unbalanced_reads[poid.oid].push_back(op);
return;
}
for (unsigned i=1; i<acting.size(); i++) {
int peer = acting[i];
if (peer_missing.count(peer) &&
- peer_missing[peer].is_missing(oid)) {
+ peer_missing[peer].is_missing(poid.oid)) {
// push it before this update.
// FIXME, this is probably extra much work (eg if we're about to overwrite)
- push(oid, peer);
+ push(poid, peer);
}
}
dout(10) << "op_modify " << opname
- << " " << oid
+ << " " << poid.oid
<< " v " << nv
//<< " crev " << crev
- << " rev " << op->get_oid().rev
+ << " rev " << poid.oid.rev
<< " " << op->get_offset() << "~" << op->get_length()
<< dendl;
// we are acker.
if (op->get_op() != CEPH_OSD_OP_WRNOOP) {
// log and update later.
- pobject_t poid = oid;
prepare_log_transaction(repop->t, op->get_reqid(), poid, op->get_op(), nv,
- crev, op->get_oid().rev, peers_complete_thru);
+ crev, poid.oid.rev, peers_complete_thru);
prepare_op_transaction(repop->t, op->get_reqid(),
info.pgid, op->get_op(), poid,
op->get_offset(), op->get_length(), op->get_data(),
- nv, crev, op->get_oid().rev);
+ nv, crev, poid.oid.rev);
}
// (logical) local ack.
if (latest->is_update() &&
!objects_pulling.count(latest->oid) &&
missing.is_missing(latest->oid)) {
- pull(latest->oid);
+ pobject_t poid(info.pgid.pool(), 0, latest->oid);
+ pull(poid);
return true;
}
// oldest first!
object_t oid = peer_missing[peer].rmissing.begin()->second;
+ pobject_t poid(info.pgid.pool(), 0, oid);
eversion_t v = peer_missing[peer].rmissing.begin()->first;
- push(oid, peer);
+ push(poid, peer);
// do other peers need it too?
for (i++; i<acting.size(); i++) {
int peer = acting[i];
if (peer_missing.count(peer) &&
peer_missing[peer].is_missing(oid))
- push(oid, peer);
+ push(poid, peer);
}
return;
if (p->is_delete()) {
if (s.count(p->oid)) {
- dout(10) << " deleting " << p->oid
+ pobject_t poid(info.pgid.pool(), 0, p->oid);
+ dout(10) << " deleting " << poid
<< " when " << p->version << dendl;
- t.remove(p->oid);
+ t.remove(poid);
}
s.erase(p->oid);
} else {
for (set<object_t>::iterator i = s.begin();
i != s.end();
i++) {
- dout(10) << " deleting stray " << *i << dendl;
- t.remove(*i);
+ pobject_t poid(info.pgid.pool(), 0, *i);
+ dout(10) << " deleting stray " << poid << dendl;
+ t.remove(poid);
}
} else {
did.insert(p->oid);
if (p->is_delete()) {
- dout(10) << " deleting " << p->oid
+ pobject_t poid(info.pgid.pool(), 0, p->oid);
+ dout(10) << " deleting " << poid
<< " when " << p->version << dendl;
- t.remove(p->oid);
+ t.remove(poid);
} else {
// keep old(+missing) objects, just for kicks.
}
// pg stuff
typedef uint16_t ps_t;
-typedef uint8_t pruleset_t;
-
+#define OSD_METADATA_PG_POOL 0xff
+#define OSD_SUPERBLOCK_POBJECT pobject_t(OSD_METADATA_PG_POOL, 0, object_t(0,0))
// placement group id
struct pg_t {
public:
pg_t() { u.pg64 = 0; }
pg_t(const pg_t& o) { u.pg64 = o.u.pg64; }
- pg_t(int type, int size, ps_t seed, int pref) {//, pruleset_t r=0) {
+ pg_t(int type, int size, ps_t seed, int pool, int pref) {
u.pg.type = type;
u.pg.size = size;
u.pg.ps = seed;
+ u.pg.pool = pool;
u.pg.preferred = pref; // hack: avoid negative.
- //u.pg.ruleset = r;
assert(sizeof(u.pg) == sizeof(u.pg64));
}
pg_t(uint64_t v) { u.pg64 = v; }
int size() { return u.pg.size; }
ps_t ps() { return u.pg.ps; }
- //pruleset_t ruleset() { return u.pg.ruleset; }
+ int pool() { return u.pg.pool; }
int preferred() { return u.pg.preferred; } // hack: avoid negative.
/*
*/
operator uint64_t() const { return u.pg64; }
- pobject_t to_object() const {
- return pobject_t(1, // volume 1 == osd metadata, for now
+ pobject_t to_pobject() const {
+ return pobject_t(OSD_METADATA_PG_POOL, // osd metadata
0,
object_t(u.pg64, 0));
}
else
out << pg.size() << '?';
- //if (pg.ruleset())
- //out << (int)pg.ruleset() << 's';
-
out << hex << pg.ps() << dec;
+ if (pg.pool() > 0)
+ out << 'v' << pg.pool();
if (pg.preferred() >= 0)
out << 'p' << pg.preferred();