_unlock_pg(pgid);
}
}
- osd_lock.Unlock();
+
+ // any finished messages?  take them off the list, then dispatch them after dropping osd_lock
+ if (finished.empty()) {
+ osd_lock.Unlock();
+ } else {
+ list<Message*> waiting;
+ waiting.splice(waiting.begin(), finished);
+
+ osd_lock.Unlock();
+
+ for (list<Message*>::iterator it = waiting.begin();
+ it != waiting.end();
+ it++) {
+ dispatch(*it);
+ }
+ }
// kick myself w/ a ping .. HACK
- messenger->send_message(new MPing, MSG_ADDR_OSD(whoami));
+ //messenger->send_message(new MPing, MSG_ADDR_OSD(whoami));
}
if (osdmap->is_mkfs()) {
ps_t maxps = 1ULL << osdmap->get_pg_bits();
+ ps_t maxlps = 1ULL << osdmap->get_localized_pg_bits();
dout(1) << "mkfs on " << osdmap->get_pg_bits() << " bits, " << maxps << " pgs" << endl;
assert(osdmap->get_epoch() == 1);
for (int nrep = 1;
nrep <= MIN(g_conf.num_osd, g_conf.osd_max_rep); // for low osd counts.. hackish bleh
nrep++) {
- for (pg_t ps = 0; ps < maxps; ++ps) {
- pg_t pgid = osdmap->ps_nrep_to_pg(ps, nrep);
- vector<int> acting;
- int nrep = osdmap->pg_to_acting_osds(pgid, acting);
- int role = osdmap->calc_pg_role(whoami, acting, nrep);
- if (role < 0) continue;
-
- PG *pg = create_pg(pgid, t);
- pg->set_role(role);
- pg->acting.swap(acting);
- pg->last_epoch_started_any =
- pg->info.last_epoch_started =
- pg->info.history.same_since =
- pg->info.history.same_primary_since =
- pg->info.history.same_acker_since = osdmap->get_epoch();
- pg->activate(t);
-
- dout(7) << "created " << *pg << endl;
+ for (ps_t ps = 0; ps < maxps; ++ps) {
+ vector<int> acting;
+ pg_t pgid = osdmap->ps_nrep_to_pg(ps, nrep);
+ int nrep = osdmap->pg_to_acting_osds(pgid, acting);
+ int role = osdmap->calc_pg_role(whoami, acting, nrep);
+ if (role < 0) continue;
+
+ PG *pg = create_pg(pgid, t);
+ pg->set_role(role);
+ pg->acting.swap(acting);
+ pg->last_epoch_started_any =
+ pg->info.last_epoch_started =
+ pg->info.history.same_since =
+ pg->info.history.same_primary_since =
+ pg->info.history.same_acker_since = osdmap->get_epoch();
+ pg->activate(t);
+
+ dout(7) << "created " << *pg << endl;
}
- // local PG too
- pg_t pgid = osdmap->osd_nrep_to_pg(whoami, nrep);
- vector<int> acting;
- int nrep = osdmap->pg_to_acting_osds(pgid, acting);
- int role = osdmap->calc_pg_role(whoami, acting, nrep);
-
- PG *pg = create_pg(pgid, t);
- pg->acting.swap(acting);
- pg->set_role(role);
- pg->last_epoch_started_any =
- pg->info.last_epoch_started =
- pg->info.history.same_primary_since =
- pg->info.history.same_acker_since =
- pg->info.history.same_since = osdmap->get_epoch();
- pg->activate(t);
-
- dout(7) << "created " << *pg << endl;
+ for (ps_t ps = 0; ps < maxlps; ++ps) {
+ // localized PG for this osd too (built from seed ps, osd whoami, nrep)
+ vector<int> acting;
+ pg_t pgid = osdmap->ps_osd_nrep_to_pg(ps, whoami, nrep);
+ int nrep = osdmap->pg_to_acting_osds(pgid, acting);
+ int role = osdmap->calc_pg_role(whoami, acting, nrep);
+
+ PG *pg = create_pg(pgid, t);
+ pg->acting.swap(acting);
+ pg->set_role(role);
+ pg->last_epoch_started_any =
+ pg->info.last_epoch_started =
+ pg->info.history.same_primary_since =
+ pg->info.history.same_acker_since =
+ pg->info.history.same_since = osdmap->get_epoch();
+ pg->activate(t);
+
+ dout(7) << "created " << *pg << endl;
+ }
}
dout(1) << "mkfs done, created " << pg_map.size() << " pgs" << endl;
PG *OSD::create_pg(pg_t pgid, ObjectStore::Transaction& t)
{
+ if (pg_map.count(pgid)) {
+ dout(0) << "create_pg on " << pgid << ", already have " << *pg_map[pgid] << endl;
+ }
assert(pg_map.count(pgid) == 0);
assert(!pg_exists(pgid));
*/
// from LSB to MSB,
-#define PG_PS_BITS 24 // max bits for placement seed/group portion of PG
+#define PG_PS_BITS 16 // max bits for placement seed/group portion of PG
#define PG_REP_BITS 6 // up to 64 replicas
#define PG_TYPE_BITS 2
#define PG_PS_MASK ((1LL<<PG_PS_BITS)-1)
epoch_t mon_epoch; // monitor epoch (election iteration)
utime_t ctime; // epoch start time
int pg_bits; // placement group bits
+ int localized_pg_bits; // bits for localized pgs
set<int> osds; // all osds
set<int> down_osds; // list of down disks
friend class MDS;
public:
- OSDMap() : epoch(0), mon_epoch(0), pg_bits(5) {}
+ OSDMap() : epoch(0), mon_epoch(0), pg_bits(5), localized_pg_bits(3) {}
// map info
epoch_t get_epoch() const { return epoch; }
int get_pg_bits() const { return pg_bits; }
void set_pg_bits(int b) { pg_bits = b; }
+ int get_localized_pg_bits() const { return localized_pg_bits; }
const utime_t& get_ctime() const { return ctime; }
/**** mapping facilities ****/
- // oid -> ps
- ps_t object_to_pg(object_t oid, FileLayout& layout) {
+ // oid -> pg
+ pg_t object_to_pg(object_t oid, FileLayout& layout) {
static crush::Hash H(777);
int policy = layout.object_layout;
policy = g_conf.osd_object_layout;
int type = PG_TYPE_RAND;
- pg_t ps;
+ ps_t ps;
switch (policy) {
case OBJECT_LAYOUT_LINEAR:
}
// construct final PG
- pg_t pg = type;
+ /*pg_t pg = type;
pg = (pg << PG_REP_BITS) | (pg_t)layout.num_rep;
pg = (pg << PG_PS_BITS) | ps;
+ */
//cout << "pg " << hex << pg << dec << endl;
- return pg;
+ return pg_t(ps, 0, layout.num_rep);
}
// (ps, nrep) -> pg
pg_t ps_nrep_to_pg(ps_t ps, int nrep) {
- return ((pg_t)ps & ((1ULL<<pg_bits)-1ULL))
+ /* old packed-integer encoding, kept for reference:
   return ((pg_t)ps & ((1ULL<<pg_bits)-1ULL))
| ((pg_t)nrep << PG_PS_BITS)
| ((pg_t)PG_TYPE_RAND << (PG_PS_BITS+PG_REP_BITS));
+ */
+ return pg_t(ps, 0, nrep, 0);  // build the pg from (ps, nrep); second arg 0 is presumably "no preferred osd" (pg_to_osds tests preferred > 0) -- confirm against the pg_t ctor
}
- pg_t osd_nrep_to_pg(int osd, int nrep) {
- return ((pg_t)osd)
+ pg_t ps_osd_nrep_to_pg(ps_t ps, int osd, int nrep) {  // (ps, osd, nrep) -> pg
+ /* old packed-integer encoding, kept for reference:
   return ((pg_t)osd)
| ((pg_t)nrep << PG_PS_BITS)
| ((pg_t)PG_TYPE_STARTOSD << (PG_PS_BITS+PG_REP_BITS));
-
+ */
+ return pg_t(ps, osd+1, nrep, 0);  // osd is passed as osd+1; presumably this lands in the 'preferred' field, which pg_to_osds decodes as preferred-1 when preferred > 0 -- confirm against the pg_t ctor
}
// pg -> nrep
int pg_to_nrep(pg_t pg) {
- return (pg >> PG_PS_BITS) & ((1ULL << PG_REP_BITS)-1);
+ return pg.u.fields.nrep;  // read the nrep field directly instead of shifting/masking
+ // old packed-integer decode: return (pg >> PG_PS_BITS) & ((1ULL << PG_REP_BITS)-1);
}
// pg -> ps
int pg_to_ps(pg_t pg) {
- return pg & PG_PS_MASK;
- }
-
- // pg -> pg_type
- int pg_to_type(pg_t pg) {
- return pg >> (PG_PS_BITS + PG_REP_BITS);
+ // old packed-integer decode: return pg & PG_PS_MASK;
+ return pg.u.fields.ps;  // NOTE(review): returned as int, while seeds are typed ps_t elsewhere (e.g. ps_nrep_to_pg) -- confirm this conversion is intended
}
-
// pg -> (osd list)
int pg_to_osds(pg_t pg,
pg_t ps = pg_to_ps(pg);
int num_rep = pg_to_nrep(pg);
assert(num_rep > 0);
- int type = pg_to_type(pg);
-
- // spread "on" ps bits around a bit (usually only low bits are set bc of pg_bits)
- if (num_rep > 0) {
- switch(g_conf.osd_pg_layout) {
- case PG_LAYOUT_CRUSH:
- crush.do_rule(crush.rules[num_rep],
- ps,
- osds,
- out_osds, overload_osds);
- break;
-
- case PG_LAYOUT_LINEAR:
- for (int i=0; i<num_rep; i++)
- osds.push_back( (i + ps*num_rep) % g_conf.num_osd );
- break;
-
- case PG_LAYOUT_HYBRID:
- {
- static crush::Hash H(777);
- int h = H(ps);
- for (int i=0; i<num_rep; i++)
- osds.push_back( (h+i) % g_conf.num_osd );
- }
- break;
-
- case PG_LAYOUT_HASH:
- {
- static crush::Hash H(777);
- for (int i=0; i<num_rep; i++) {
- int t = 1;
- int osd = 0;
- while (t++) {
- osd = H(i, ps, t) % g_conf.num_osd;
- int j = 0;
- for (; j<i; j++)
- if (osds[j] == osd) break;
- if (j == i) break;
- }
- osds.push_back(osd);
- }
- }
- break;
-
- default:
- assert(0);
+ // map ps to osds[], using the layout selected by g_conf.osd_pg_layout
+ switch(g_conf.osd_pg_layout) {
+ case PG_LAYOUT_CRUSH:
+ crush.do_rule(crush.rules[num_rep], // FIXME.
+ ps,
+ osds,
+ out_osds, overload_osds);
+ break;
+
+ case PG_LAYOUT_LINEAR:
+ for (int i=0; i<num_rep; i++)
+ osds.push_back( (i + ps*num_rep) % g_conf.num_osd );
+ break;
+
+ case PG_LAYOUT_HYBRID:
+ {
+ static crush::Hash H(777);
+ int h = H(ps);
+ for (int i=0; i<num_rep; i++)
+ osds.push_back( (h+i) % g_conf.num_osd );
+ }
+ break;
+
+ case PG_LAYOUT_HASH:
+ {
+ static crush::Hash H(777);
+ for (int i=0; i<num_rep; i++) {
+ int t = 1;
+ int osd = 0;
+ while (t++) {
+ osd = H(i, ps, t) % g_conf.num_osd;
+ int j = 0;
+ for (; j<i; j++)
+ if (osds[j] == osd) break;
+ if (j == i) break;
+ }
+ osds.push_back(osd);
+ }
}
+ break;
+
+ default:
+ assert(0);
}
+
+ if (pg.u.fields.preferred > 0) {
+ int osd = pg.u.fields.preferred-1;
- if (type == PG_TYPE_STARTOSD) {
// already in there?
if (osds.empty()) {
- osds.push_back((int)ps);
+ osds.push_back(osd);
} else {
assert(num_rep > 0);
for (int i=1; i<num_rep; i++)
- if (osds[i] == (int)ps) {
+ if (osds[i] == osd) {
// swap with position 0
osds[i] = osds[0];
}
- osds[0] = (int)ps;
+ osds[0] = osd;
}
- if (is_out((int)ps))
+ if (is_out(osd))
osds.erase(osds.begin()); // oops, but it's down!
}