- read+floor_lockout
osd/rados
+- pg splits.
+ - during mkpg, iterate over affected old pgs and move objects
+ - set prior set on new pg to include primaries+replicas of any possible old pgs from which objects may have been inherited.
+
- fix build_prior_set behavior. needs to not always exclude currently down nodes. e.g.,
1: A B
osd_auto_weight: false,
- osd_hack_fast_startup: false, // this breaks localized pgs.
-
// --- fakestore ---
fakestore_sync_interval: 2, // seconds
else if (strcmp(args[i], "--osd_auto_weight") == 0)
g_conf.osd_auto_weight = atoi(args[++i]);
- else if (strcmp(args[i], "--osd_hack_fast_startup") == 0)
- g_conf.osd_hack_fast_startup = atoi(args[++i]);
-
else if (strcmp(args[i], "--bdev_lock") == 0)
g_conf.bdev_lock = atoi(args[++i]);
else if (strcmp(args[i], "--bdev_el_bidir") == 0)
bool osd_auto_weight;
- bool osd_hack_fast_startup;
-
// fakestore
double fakestore_sync_interval;
bool fakestore_fake_attrs;
goto bad;
if ((err = ceph_decode_32(p, end, &map->epoch)) < 0)
goto bad;
- if ((err = ceph_decode_32(p, end, &map->mkfs_epoch)) < 0)
- goto bad;
if ((err = ceph_decode_32(p, end, &map->ctime.tv_sec)) < 0)
goto bad;
if ((err = ceph_decode_32(p, end, &map->ctime.tv_usec)) < 0)
goto bad;
if ((err = ceph_decode_32(p, end, &map->localized_pg_num)) < 0)
goto bad;
+ *p += 2*sizeof(__u32); /* skip prior_*pg_num fields */
calc_pg_masks(map);
goto bad;
if ((err = ceph_decode_32(p, end, &epoch)) < 0)
goto bad;
- (*p)++; /* skip mkfs u8 */
BUG_ON(epoch != map->epoch+1);
if ((err = ceph_decode_32(p, end, &ctime.tv_sec)) < 0)
goto bad;
/* new max? */
if ((err = ceph_decode_32(p, end, &max)) < 0)
goto bad;
+ *p += 2*sizeof(__u32); /* skip new_pg_num, for now. FIXME. */
if (max > 0) {
if ((err = osdmap_set_max_osd(map, max)) < 0)
goto bad;
pending_inc.new_offload[from] = CEPH_OSD_IN;
osd_weight[from] = m->sb.weight;
-
- if (!osdmap.post_mkfs() && !osdmap.is_mkfs())
- pending_inc.mkfs = 1; // first set of up osds, do the mkfs!
+
+ if (osdmap.pg_num == 0) {
+ // set a conservative initial pg_num
+ pending_inc.new_pg_num = osdmap.get_max_osd() << g_conf.osd_pg_bits;
+ pending_inc.new_localized_pg_num = 4; // per osd
+ dout(1) << "prepare_boot setting initial pg_num to " << pending_inc.new_pg_num << dendl;
+ }
// wait
paxos->wait_for_commit(new C_Booted(this, m));
paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs));
return true;
}
+ else if (m->cmd[1] == "setpgnum" && m->cmd.size() > 2) {
+ // "setpgnum <n>": queue a pg_num increase in the pending incremental.
+ // only growth is accepted; n <= current pg_num is rejected with -EINVAL.
+ int n = atoi(m->cmd[2].c_str());
+ if (n > osdmap.get_pg_num()) {
+ ss << "set new pg_num = " << n;
+ pending_inc.new_pg_num = n;
+ getline(ss, rs);
+ // reply to the command only once the change has committed
+ paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs));
+ return true;
+ } else {
+ // the guard above rejects n == current as well, so say "<=" rather than "<"
+ ss << "specified pg_num " << n << " <= current " << osdmap.get_pg_num();
+ getline(ss, rs);
+ mon->reply_command(m, -EINVAL, rs);
+ // NOTE(review): unlike the success path this branch does not return;
+ // confirm the code after this if/else chain does not reply to m again.
+ }
+ }
else if (m->cmd[1] == "down" && m->cmd.size() > 2) {
errno = 0;
long osd = strtol(m->cmd[2].c_str(), 0, 10);
pg->info.history.same_primary_since =
pg->info.history.same_acker_since = osdmap->get_epoch();
pg->write_log(t);
- if (g_conf.osd_hack_fast_startup)
- pg->activate(t);
dout(7) << "created " << *pg << dendl;
pg->unlock();
{
dout(7) << "advance_map epoch " << osdmap->get_epoch()
<< " " << pg_map.size() << " pgs"
- << " mkfs_peoch " << osdmap->get_mkfs_epoch()
<< dendl;
- if (osdmap->is_mkfs()) {
+ if (osdmap->is_mkpg()) {
+ // FIXME: move this bit elsewhere....
// is this okay?
+ /*
ceph_fsid nullfsid;
memset(&nullfsid, 0, sizeof(nullfsid));
if (memcmp(&nullfsid, &superblock.fsid, sizeof(nullfsid)) != 0) {
- derr(0) << "will not mkfs, my superblock fsid is not zeroed" << dendl;
+ derr(0) << "will not mkps, my superblock fsid is not zeroed" << dendl;
assert(0);
}
- superblock.fsid = osdmap->get_fsid();
- assert(g_conf.osd_mkfs); // make sure we did a mkfs!
+ */
+ superblock.fsid = osdmap->get_fsid(); // FIXME
+ //assert(g_conf.osd_mkfs); // make sure we did a mkfs!
// ok!
ps_t numps = osdmap->get_pg_num();
+ ps_t fromps = osdmap->get_prior_pg_num();
ps_t numlps = osdmap->get_localized_pg_num();
- dout(1) << "mkfs " << osdmap->get_fsid() << " on "
- << numps << " normal, "
- << numlps << " localized pg sets" << dendl;
+ // lower bound for the localized-pg creation loops: the prior *localized* count
+ ps_t fromlps = osdmap->get_prior_localized_pg_num();
+ dout(1) << "mkpg " << osdmap->get_fsid() << " on "
+ << fromps << "-" << numps << " normal, "
+ << fromlps << "-" << numlps << " localized pg sets" << dendl;
int minrep = 1;
int maxrep = MIN(g_conf.num_osd, g_conf.osd_max_rep);
int minraid = g_conf.osd_min_raid_width;
int maxraid = g_conf.osd_max_raid_width;
int numpool = 1; // FIXME
- dout(1) << "mkfs " << minrep << ".." << maxrep << " replicas, "
+ dout(1) << "mkpg " << minrep << ".." << maxrep << " replicas, "
<< minraid << ".." << maxraid << " osd raid groups" << dendl;
//derr(0) << "osdmap " << osdmap->get_ctime() << " logger start " << logger->get_start() << dendl;
// replicated
for (int pool = 0; pool < numpool; pool++)
for (int nrep = 1; nrep <= maxrep; nrep++) {
- for (ps_t ps = 0; ps < numps; ++ps)
+ for (ps_t ps = fromps; ps < numps; ++ps)
try_create_pg(pg_t(pg_t::TYPE_REP, nrep, ps, pool, -1), t);
- for (ps_t ps = 0; ps < numlps; ++ps)
+ for (ps_t ps = fromlps; ps < numlps; ++ps)
try_create_pg(pg_t(pg_t::TYPE_REP, nrep, ps, pool, whoami), t);
}
// raided
for (int pool = 0; pool < numpool; pool++)
for (int size = minraid; size <= maxraid; size++) {
- for (ps_t ps = 0; ps < numps; ++ps)
+ for (ps_t ps = fromps; ps < numps; ++ps)
try_create_pg(pg_t(pg_t::TYPE_RAID4, size, ps, pool, -1), t);
- for (ps_t ps = 0; ps < numlps; ++ps)
+ for (ps_t ps = fromlps; ps < numlps; ++ps)
try_create_pg(pg_t(pg_t::TYPE_RAID4, size, ps, pool, whoami), t);
}
- dout(1) << "mkfs done, created " << pg_map.size() << " pgs" << dendl;
+ dout(1) << "mkpg done, now i have " << pg_map.size() << " pgs" << dendl;
} else {
// scan existing pg's
pg->unlock();
}
- if (g_conf.osd_hack_fast_startup &&
- osdmap->is_mkfs()) // hack: skip the queries/summaries if it's a mkfs
- return;
-
do_notifies(notify_list); // notify? (residual|replica)
do_queries(query_map);
do_activators(activator_map);
ctime = g_clock.now();
set_max_osd(num_osd);
- set_pg_num(num_osd << pg_bits);
+ //pg_num = num_osd << pg_bits; // not here.. must be >0 'up' osds!
// crush map
map<int,double> weights;
inline int calc_bits_of(int t) {
int b = 0;
- while (t) {
+ while (t > 0) {
t = t >> 1;
b++;
}
// incremental
int32_t new_max_osd;
- __u8 mkfs;
+ int32_t new_pg_num;
+ int32_t new_localized_pg_num;
map<int32_t,entity_addr_t> new_up;
map<int32_t,uint8_t> new_down;
map<int32_t,uint32_t> new_offload;
void encode(bufferlist& bl) {
::_encode(fsid, bl);
::_encode(epoch, bl);
- ::_encode(mkfs, bl);
ctime._encode(bl);
::_encode(fullmap, bl);
::_encode(crush, bl);
::_encode(new_max_osd, bl);
+ ::_encode(new_pg_num, bl);
+ ::_encode(new_localized_pg_num, bl);
::_encode(new_up, bl);
::_encode(new_down, bl);
::_encode(new_offload, bl);
void decode(bufferlist& bl, int& off) {
::_decode(fsid, bl, off);
::_decode(epoch, bl, off);
- ::_decode(mkfs, bl, off);
ctime._decode(bl, off);
::_decode(fullmap, bl, off);
::_decode(crush, bl, off);
::_decode(new_max_osd, bl, off);
+ ::_decode(new_pg_num, bl, off);
+ ::_decode(new_localized_pg_num, bl, off);
::_decode(new_up, bl, off);
::_decode(new_down, bl, off);
::_decode(new_offload, bl, off);
::_decode(old_pg_swap_primary, bl, off);
}
- Incremental(epoch_t e=0) : epoch(e), new_max_osd(-1), mkfs(0) {
+ Incremental(epoch_t e=0) : epoch(e), new_max_osd(-1), new_pg_num(0), new_localized_pg_num(0) {
fsid.major = fsid.minor = cpu_to_le64(0);
}
};
private:
ceph_fsid fsid;
- epoch_t epoch, mkfs_epoch; // what epoch of the osd cluster descriptor is this
- utime_t ctime, mtime; // epoch start time
+ epoch_t epoch; // what epoch of the osd cluster descriptor is this
+ utime_t ctime, mtime; // epoch start time
int32_t pg_num; // placement group count
int32_t pg_num_mask; // bitmask for above
int32_t localized_pg_num; // localized place group count
int32_t localized_pg_num_mask; // ditto
+ // values from prior epoch, so we can create them
+ int32_t prior_pg_num;
+ int32_t prior_localized_pg_num;
+
int32_t max_osd;
vector<uint8_t> osd_state;
vector<entity_addr_t> osd_addr;
friend class MDS;
public:
- OSDMap() : epoch(0), mkfs_epoch((epoch_t)-1),
- pg_num(1<<5),
- localized_pg_num(1<<3),
+ OSDMap() : epoch(0),
+ pg_num(0), localized_pg_num(0),
+ prior_pg_num(0), prior_localized_pg_num(0),
max_osd(0) {
fsid.major = fsid.minor = cpu_to_le64(0);
calc_pg_masks();
}
int get_pg_num() const { return pg_num; }
- void set_pg_num(int m) { pg_num = m; calc_pg_masks(); }
+ int get_prior_pg_num() const { return prior_pg_num; }
+ //void set_pg_num(int m) { pg_num = m; calc_pg_masks(); }
int get_localized_pg_num() const { return localized_pg_num; }
+ int get_prior_localized_pg_num() const { return prior_localized_pg_num; }
/* stamps etc */
const utime_t& get_ctime() const { return ctime; }
const utime_t& get_mtime() const { return mtime; }
- bool is_mkfs() const { return epoch == mkfs_epoch; }
- bool post_mkfs() const { return epoch > mkfs_epoch; }
- epoch_t get_mkfs_epoch() const { return mkfs_epoch; }
+ // true when either pg count (normal or localized) exceeds its recorded prior_* value
+ bool is_mkpg() const {
+ return (pg_num > prior_pg_num) || (localized_pg_num > prior_localized_pg_num);
+ }
/***** cluster state *****/
/* osds */
epoch++;
ctime = inc.ctime;
+ // snapshot both current counts before they are updated below, so that
+ // is_mkpg() and the >= prior_* assertions compare against the old values
+ prior_pg_num = pg_num;
+ prior_localized_pg_num = localized_pg_num;
+
// full map?
if (inc.fullmap.length()) {
decode(inc.fullmap);
}
// nope, incremental.
- if (inc.mkfs)
- mkfs_epoch = epoch;
+ if (inc.new_pg_num) {
+ pg_num = inc.new_pg_num;
+ assert(pg_num >= prior_pg_num);
+ }
+ if (inc.new_localized_pg_num) {
+ localized_pg_num = inc.new_localized_pg_num;
+ assert(localized_pg_num >= prior_localized_pg_num);
+ }
if (inc.new_max_osd >= 0)
set_max_osd(inc.new_max_osd);
void encode(bufferlist& blist) {
::_encode(fsid, blist);
::_encode(epoch, blist);
- ::_encode(mkfs_epoch, blist);
::_encode(ctime, blist);
::_encode(mtime, blist);
::_encode(pg_num, blist);
::_encode(localized_pg_num, blist);
+ ::_encode(prior_pg_num, blist);
+ ::_encode(prior_localized_pg_num, blist);
::_encode(max_osd, blist);
::_encode(osd_state, blist);
int off = 0;
::_decode(fsid, blist, off);
::_decode(epoch, blist, off);
- ::_decode(mkfs_epoch, blist, off);
::_decode(ctime, blist, off);
::_decode(mtime, blist, off);
::_decode(pg_num, blist, off);
::_decode(localized_pg_num, blist, off);
+ ::_decode(prior_pg_num, blist, off);
+ ::_decode(prior_localized_pg_num, blist, off);
calc_pg_masks();
::_decode(max_osd, blist, off);
} else {
dout(10) << " still active from last started: " << last_started << dendl;
}
- } else if (osd->osdmap->post_mkfs()) {
+ } else if (info.pgid.u.pg.ps < osd->osdmap->get_prior_pg_num()) {
dout(10) << " crashed since epoch " << last_epoch_started_any << dendl;
state_set(STATE_CRASHED);
}
}
// if primary..
- if (role == 0 &&
- (!g_conf.osd_hack_fast_startup || osd->osdmap->post_mkfs())) {
+ if (role == 0) {
// who is clean?
uptodate_set.clear();
if (info.is_uptodate())