*c = coll_t(coll);
int64_t pool = -1;
- pg_t pg;
+ spg_t pg;
if (c->is_pg_prefix(pg))
- pool = (int64_t)pg.pool();
+ pool = (int64_t)pg.pgid.pool();
(*oid) = ghobject_t(hobject_t(name, key, snap, hash, pool, ""));
return true;
}
{
bool r = parse_object(long_name.c_str(), *out);
int64_t pool = -1;
- pg_t pg;
+ spg_t pg;
if (coll().is_pg_prefix(pg))
- pool = (int64_t)pg.pool();
+ pool = (int64_t)pg.pgid.pool();
out->hobj.pool = pool;
if (!r) return r;
string temp = lfn_generate_object_name(*out);
int64_t pool = -1;
- pg_t pg;
+ spg_t pg;
if (coll().is_pg_prefix(pg))
- pool = (int64_t)pg.pool();
+ pool = (int64_t)pg.pgid.pool();
(*out) = ghobject_t(hobject_t(name, key, snap, hash, pool, ""));
return true;
}
delete objecter;
}
-void OSDService::_start_split(pg_t parent, const set<pg_t> &children)
+void OSDService::_start_split(spg_t parent, const set<spg_t> &children)
{
- for (set<pg_t>::const_iterator i = children.begin();
+ for (set<spg_t>::const_iterator i = children.begin();
i != children.end();
++i) {
dout(10) << __func__ << ": Starting split on pg " << *i
}
}
-void OSDService::mark_split_in_progress(pg_t parent, const set<pg_t> &children)
+void OSDService::mark_split_in_progress(spg_t parent, const set<spg_t> &children)
{
Mutex::Locker l(in_progress_split_lock);
- map<pg_t, set<pg_t> >::iterator piter = rev_pending_splits.find(parent);
+ map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
assert(piter != rev_pending_splits.end());
- for (set<pg_t>::const_iterator i = children.begin();
+ for (set<spg_t>::const_iterator i = children.begin();
i != children.end();
++i) {
assert(piter->second.count(*i));
rev_pending_splits.erase(piter);
}
// Cancel any splits still pending under 'parent'.  This is the locked
// public entry point: it takes in_progress_split_lock and delegates to
// the unlocked helper _cancel_pending_splits_for_parent().  (The 'return'
// of a void call is legal and merely forwards the void result.)
-void OSDService::cancel_pending_splits_for_parent(pg_t parent)
+void OSDService::cancel_pending_splits_for_parent(spg_t parent)
{
Mutex::Locker l(in_progress_split_lock);
return _cancel_pending_splits_for_parent(parent);
}
-void OSDService::_cancel_pending_splits_for_parent(pg_t parent)
+void OSDService::_cancel_pending_splits_for_parent(spg_t parent)
{
- map<pg_t, set<pg_t> >::iterator piter = rev_pending_splits.find(parent);
+ map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
if (piter == rev_pending_splits.end())
return;
- for (set<pg_t>::iterator i = piter->second.begin();
+ for (set<spg_t>::iterator i = piter->second.begin();
i != piter->second.end();
++i) {
assert(pending_splits.count(*i));
// If 'pgid' already existed in old_map (its ps() is below the pool's old
// pg_num), compute the children it splits into between old_map and
// new_map and register them via _start_split().  NOTE(review): the bool
// result of is_split() is ignored here; when no split occurred 'children'
// stays empty and _start_split() is presumably a no-op — confirm.
void OSDService::_maybe_split_pgid(OSDMapRef old_map,
OSDMapRef new_map,
- pg_t pgid)
+ spg_t pgid)
{
assert(old_map->have_pg_pool(pgid.pool()));
if (pgid.ps() < static_cast<unsigned>(old_map->get_pg_num(pgid.pool()))) {
- set<pg_t> children;
+ set<spg_t> children;
pgid.is_split(old_map->get_pg_num(pgid.pool()),
new_map->get_pg_num(pgid.pool()), &children);
_start_split(pgid, children);
}
}
-void OSDService::init_splits_between(pg_t pgid,
+void OSDService::init_splits_between(spg_t pgid,
OSDMapRef frommap,
OSDMapRef tomap)
{
tomap->get_pg_num(pgid.pool()),
NULL)) {
// Ok, a split happened, so we need to walk the osdmaps
- set<pg_t> new_pgs; // pgs to scan on each map
+ set<spg_t> new_pgs; // pgs to scan on each map
new_pgs.insert(pgid);
OSDMapRef curmap(get_map(frommap->get_epoch()));
for (epoch_t e = frommap->get_epoch() + 1;
OSDMapRef nextmap(try_get_map(e));
if (!nextmap)
continue;
- set<pg_t> even_newer_pgs; // pgs added in this loop
- for (set<pg_t>::iterator i = new_pgs.begin(); i != new_pgs.end(); ++i) {
- set<pg_t> split_pgs;
+ set<spg_t> even_newer_pgs; // pgs added in this loop
+ for (set<spg_t>::iterator i = new_pgs.begin(); i != new_pgs.end(); ++i) {
+ set<spg_t> split_pgs;
if (i->is_split(curmap->get_pg_num(i->pool()),
nextmap->get_pg_num(i->pool()),
&split_pgs)) {
OSDMapRef new_map)
{
Mutex::Locker l(in_progress_split_lock);
- for (set<pg_t>::iterator i = in_progress_splits.begin();
+ for (set<spg_t>::iterator i = in_progress_splits.begin();
i != in_progress_splits.end();
) {
if (!new_map->have_pg_pool(i->pool())) {
++i;
}
}
- for (map<pg_t, pg_t>::iterator i = pending_splits.begin();
+ for (map<spg_t, spg_t>::iterator i = pending_splits.begin();
i != pending_splits.end();
) {
if (!new_map->have_pg_pool(i->first.pool())) {
}
}
// Is a split involving 'pgid' currently in flight?  True when the pg
// appears in either in_progress_splits or pending_splits.  Takes
// in_progress_split_lock internally, so callers need not hold it.
-bool OSDService::splitting(pg_t pgid)
+bool OSDService::splitting(spg_t pgid)
{
Mutex::Locker l(in_progress_split_lock);
return in_progress_splits.count(pgid) ||
pending_splits.count(pgid);
}
-void OSDService::complete_split(const set<pg_t> &pgs)
+void OSDService::complete_split(const set<spg_t> &pgs)
{
Mutex::Locker l(in_progress_split_lock);
- for (set<pg_t>::const_iterator i = pgs.begin();
+ for (set<spg_t>::const_iterator i = pgs.begin();
i != pgs.end();
++i) {
dout(10) << __func__ << ": Completing split on pg " << *i << dendl;
for (vector<coll_t>::iterator i = collections.begin();
i != collections.end();
++i) {
- pg_t pgid;
+ spg_t pgid;
if (i->is_temp(pgid))
recursive_remove_collection(store, *i);
else if (i->to_str() == "convertfs_temp" ||
list<obj_watch_item_t> watchers;
osd_lock.Lock();
// scan pg's
- for (ceph::unordered_map<pg_t,PG*>::iterator it = pg_map.begin();
+ for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
it != pg_map.end();
++it) {
cct->_conf->apply_changes(NULL);
// Shutdown PGs
- for (ceph::unordered_map<pg_t, PG*>::iterator p = pg_map.begin();
+ for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
p != pg_map.end();
++p) {
dout(20) << " kicking pg " << p->first << dendl;
#ifdef PG_DEBUG_REFS
service.dump_live_pgids();
#endif
- for (ceph::unordered_map<pg_t, PG*>::iterator p = pg_map.begin();
+ for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
p != pg_map.end();
++p) {
dout(20) << " kicking pg " << p->first << dendl;
PG *OSD::_open_lock_pg(
OSDMapRef createmap,
- pg_t pgid, bool no_lockdep_check, bool hold_map_lock)
+ spg_t pgid, bool no_lockdep_check, bool hold_map_lock)
{
assert(osd_lock.is_locked());
PG* OSD::_make_pg(
OSDMapRef createmap,
- pg_t pgid)
+ spg_t pgid)
{
dout(10) << "_open_lock_pg " << pgid << dendl;
PGPool pool = _get_pool(pgid.pool(), createmap);
PG *pg;
hobject_t logoid = make_pg_log_oid(pgid);
hobject_t infooid = make_pg_biginfo_oid(pgid);
- if (createmap->get_pg_type(pgid) == pg_pool_t::TYPE_REPLICATED)
+ if (createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_REPLICATED)
pg = new ReplicatedPG(&service, createmap, pool, pgid, logoid, infooid);
else
assert(0);
pg_map[pg->info.pgid] = pg;
dout(10) << "Adding newly split pg " << *pg << dendl;
vector<int> up, acting;
- pg->get_osdmap()->pg_to_up_acting_osds(pg->info.pgid, up, acting);
+ pg->get_osdmap()->pg_to_up_acting_osds(pg->info.pgid.pgid, up, acting);
int role = OSDMap::calc_pg_role(service.whoami, acting);
pg->set_role(role);
pg->reg_next_scrub();
pg->handle_loaded(rctx);
pg->write_if_dirty(*(rctx->transaction));
pg->queue_null(e, e);
- map<pg_t, list<PG::CephPeeringEvtRef> >::iterator to_wake =
+ map<spg_t, list<PG::CephPeeringEvtRef> >::iterator to_wake =
peering_wait_for_split.find(pg->info.pgid);
if (to_wake != peering_wait_for_split.end()) {
for (list<PG::CephPeeringEvtRef>::iterator i =
}
OSD::res_result OSD::_try_resurrect_pg(
- OSDMapRef curmap, pg_t pgid, pg_t *resurrected, PGRef *old_pg_state)
+ OSDMapRef curmap, spg_t pgid, spg_t *resurrected, PGRef *old_pg_state)
{
assert(resurrected);
assert(old_pg_state);
// find nearest ancestor
DeletingStateRef df;
- pg_t cur(pgid);
+ spg_t cur(pgid);
while (true) {
df = service.deleting_pgs.lookup(cur);
if (df)
OSDMapRef create_map = df->old_pg_state->get_osdmap();
df->old_pg_state->unlock();
- set<pg_t> children;
+ set<spg_t> children;
if (cur == pgid) {
if (df->try_stop_deletion()) {
dout(10) << __func__ << ": halted deletion on pg " << pgid << dendl;
PG *OSD::_create_lock_pg(
OSDMapRef createmap,
- pg_t pgid,
+ spg_t pgid,
bool newly_created,
bool hold_map_lock,
bool backfill,
- int role, vector<int>& up, vector<int>& acting, pg_history_t history,
+ int role,
+ vector<int>& up, int up_primary,
+ vector<int>& acting, int acting_primary,
+ pg_history_t history,
pg_interval_map_t& pi,
ObjectStore::Transaction& t)
{
service.init_splits_between(pgid, pg->get_osdmap(), service.get_osdmap());
- pg->init(role, up, acting, history, pi, backfill, &t);
+ pg->init(
+ role,
+ up,
+ up_primary,
+ acting,
+ acting_primary,
+ history,
+ pi,
+ backfill,
+ &t);
dout(7) << "_create_lock_pg " << *pg << dendl;
return pg;
}
// Do we have a locally instantiated PG object for 'pgid' in pg_map?
// Caller must already hold osd_lock (asserted below).
-bool OSD::_have_pg(pg_t pgid)
+bool OSD::_have_pg(spg_t pgid)
{
assert(osd_lock.is_locked());
return pg_map.count(pgid);
}
-PG *OSD::_lookup_lock_pg(pg_t pgid)
+PG *OSD::_lookup_lock_pg(spg_t pgid)
{
assert(osd_lock.is_locked());
if (!pg_map.count(pgid))
}
-PG *OSD::_lookup_pg(pg_t pgid)
+PG *OSD::_lookup_pg(spg_t pgid)
{
assert(osd_lock.is_locked());
if (!pg_map.count(pgid))
return pg;
}
-PG *OSD::_lookup_lock_pg_with_map_lock_held(pg_t pgid)
+PG *OSD::_lookup_lock_pg_with_map_lock_held(spg_t pgid)
{
assert(osd_lock.is_locked());
assert(pg_map.count(pgid));
derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
}
- set<pg_t> head_pgs;
- map<pg_t, interval_set<snapid_t> > pgs;
+ set<spg_t> head_pgs;
+ map<spg_t, interval_set<snapid_t> > pgs;
for (vector<coll_t>::iterator it = ls.begin();
it != ls.end();
++it) {
- pg_t pgid;
+ spg_t pgid;
snapid_t snap;
uint64_t seq;
}
bool has_upgraded = false;
- for (map<pg_t, interval_set<snapid_t> >::iterator i = pgs.begin();
+ for (map<spg_t, interval_set<snapid_t> >::iterator i = pgs.begin();
i != pgs.end();
++i) {
- pg_t pgid(i->first);
+ spg_t pgid(i->first);
if (!head_pgs.count(pgid)) {
dout(10) << __func__ << ": " << pgid << " has orphan snap collections " << i->second
pg->reg_next_scrub();
// generate state for PG's current mapping
- pg->get_osdmap()->pg_to_up_acting_osds(pgid, pg->up, pg->acting);
+ int primary, up_primary;
+ vector<int> acting, up;
+ pg->get_osdmap()->pg_to_up_acting_osds(
+ pgid.pgid, &up, &up_primary, &acting, &primary);
+ pg->init_primary_up_acting(
+ up,
+ acting,
+ up_primary,
+ primary);
int role = OSDMap::calc_pg_role(whoami, pg->acting);
pg->set_role(role);
// calculate union of map range
epoch_t end_epoch = superblock.oldest_map;
epoch_t cur_epoch = superblock.newest_map;
- for (ceph::unordered_map<pg_t, PG*>::iterator i = pg_map.begin();
+ for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
i != pg_map.end();
++i) {
PG *pg = i->second;
continue;
vector<int> acting, up;
- cur_map->pg_to_up_acting_osds(pg->info.pgid, up, acting);
+ cur_map->pg_to_up_acting_osds(pg->info.pgid.pgid, up, acting);
if (p.same_interval_since == 0) {
dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
assert(last_map);
std::stringstream debug;
- bool new_interval = pg_interval_t::check_new_interval(p.old_acting, acting,
- p.old_up, up,
- p.same_interval_since,
- pg->info.history.last_epoch_clean,
- cur_map, last_map,
- pg->info.pgid.pool(),
- pg->info.pgid,
- &pg->past_intervals,
- &debug);
+ bool new_interval = pg_interval_t::check_new_interval(
+ p.old_acting, acting,
+ p.old_up, up,
+ p.same_interval_since,
+ pg->info.history.last_epoch_clean,
+ cur_map, last_map,
+ pg->info.pgid.pool(),
+ pg->info.pgid.pgid,
+ &pg->past_intervals,
+ &debug);
if (new_interval) {
dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
<< " " << debug.str() << dendl;
* hasn't changed since the given epoch and we are the primary.
*/
void OSD::handle_pg_peering_evt(
+ spg_t pgid,
const pg_info_t& info,
pg_interval_map_t& pi,
epoch_t epoch,
- int from,
+ pg_shard_t from,
bool primary,
PG::CephPeeringEvtRef evt)
{
- if (service.splitting(info.pgid)) {
- peering_wait_for_split[info.pgid].push_back(evt);
+ if (service.splitting(pgid)) {
+ peering_wait_for_split[pgid].push_back(evt);
return;
}
- if (!_have_pg(info.pgid)) {
+ if (!_have_pg(pgid)) {
// same primary?
- if (!osdmap->have_pg_pool(info.pgid.pool()))
+ if (!osdmap->have_pg_pool(pgid.pool()))
return;
+ int up_primary, acting_primary;
vector<int> up, acting;
- osdmap->pg_to_up_acting_osds(info.pgid, up, acting);
+ osdmap->pg_to_up_acting_osds(
+ pgid.pgid, &up, &up_primary, &acting, &acting_primary);
int role = osdmap->calc_pg_role(whoami, acting, acting.size());
pg_history_t history = info.history;
bool valid_history = project_pg_history(
- info.pgid, history, epoch, up, acting);
+ pgid, history, epoch, up, acting);
if (!valid_history || epoch < history.same_interval_since) {
- dout(10) << "get_or_create_pg " << info.pgid << " acting changed in "
+ dout(10) << "get_or_create_pg " << pgid << " acting changed in "
<< history.same_interval_since << " (msg from " << epoch << ")" << dendl;
return;
}
- if (service.splitting(info.pgid)) {
+ if (service.splitting(pgid)) {
assert(0);
}
// DNE on source?
if (info.dne()) {
// is there a creation pending on this pg?
- if (creating_pgs.count(info.pgid)) {
- creating_pgs[info.pgid].prior.erase(from);
- if (!can_create_pg(info.pgid))
+ if (creating_pgs.count(pgid)) {
+ creating_pgs[pgid].prior.erase(from);
+ if (!can_create_pg(pgid))
return;
- history = creating_pgs[info.pgid].history;
+ history = creating_pgs[pgid].history;
create = true;
} else {
- dout(10) << "get_or_create_pg " << info.pgid
+ dout(10) << "get_or_create_pg " << pgid
<< " DNE on source, but creation probe, ignoring" << dendl;
return;
}
}
- creating_pgs.erase(info.pgid);
+ creating_pgs.erase(pgid);
} else {
assert(!info.dne()); // pg exists if we are hearing about it
}
// do we need to resurrect a deleting pg?
- pg_t resurrected;
+ spg_t resurrected;
PGRef old_pg_state;
res_result result = _try_resurrect_pg(
service.get_osdmap(),
- info.pgid,
+ pgid,
&resurrected,
&old_pg_state);
switch (result) {
case RES_NONE: {
// ok, create the pg locally using provided Info and History
- rctx.transaction->create_collection(coll_t(info.pgid));
+ rctx.transaction->create_collection(coll_t(pgid));
PG *pg = _create_lock_pg(
get_map(epoch),
- info.pgid, create, false, result == RES_SELF,
- role, up, acting, history, pi,
+ pgid, create, false, result == RES_SELF,
+ role,
+ up, up_primary,
+ acting, acting_primary,
+ history, pi,
*rctx.transaction);
pg->handle_create(&rctx);
pg->write_if_dirty(*rctx.transaction);
true,
old_pg_state->role,
old_pg_state->up,
+ old_pg_state->up_primary.osd,
old_pg_state->acting,
+ old_pg_state->primary.osd,
old_pg_state->info.history,
old_pg_state->past_intervals,
*rctx.transaction);
true,
old_pg_state->role,
old_pg_state->up,
+ old_pg_state->up_primary.osd,
old_pg_state->acting,
+ old_pg_state->primary.osd,
old_pg_state->info.history,
old_pg_state->past_intervals,
*rctx.transaction
// kick any waiters
wake_pg_waiters(parent->info.pgid);
- assert(service.splitting(info.pgid));
- peering_wait_for_split[info.pgid].push_back(evt);
+ assert(service.splitting(pgid));
+ peering_wait_for_split[pgid].push_back(evt);
//parent->queue_peering_event(evt);
parent->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
}
} else {
// already had it. did the mapping change?
- PG *pg = _lookup_lock_pg(info.pgid);
+ PG *pg = _lookup_lock_pg(pgid);
if (epoch < pg->info.history.same_interval_since) {
dout(10) << *pg << " get_or_create_pg acting changed in "
<< pg->info.history.same_interval_since
* - from each epoch, include all osds up then AND now
* - if no osds from then are up now, include them all, even tho they're not reachable now
*/
-void OSD::calc_priors_during(pg_t pgid, epoch_t start, epoch_t end, set<int>& pset)
+void OSD::calc_priors_during(
+ spg_t pgid, epoch_t start, epoch_t end, set<pg_shard_t>& pset)
{
- dout(15) << "calc_priors_during " << pgid << " [" << start << "," << end << ")" << dendl;
+ dout(15) << "calc_priors_during " << pgid << " [" << start
+ << "," << end << ")" << dendl;
for (epoch_t e = start; e < end; e++) {
OSDMapRef oldmap = get_map(e);
vector<int> acting;
- oldmap->pg_to_acting_osds(pgid, acting);
+ oldmap->pg_to_acting_osds(pgid.pgid, acting);
dout(20) << " " << pgid << " in epoch " << e << " was " << acting << dendl;
int up = 0;
for (unsigned i=0; i<acting.size(); i++)
if (osdmap->is_up(acting[i])) {
- if (acting[i] != whoami)
- pset.insert(acting[i]);
+ if (acting[i] != whoami) {
+ pset.insert(
+ pg_shard_t(
+ acting[i],
+ osdmap->pg_is_ec(pgid.pgid) ? i : ghobject_t::NO_SHARD));
+ }
up++;
}
if (!up && !acting.empty()) {
// sucky. add down osds, even tho we can't reach them right now.
for (unsigned i=0; i<acting.size(); i++)
if (acting[i] != whoami)
- pset.insert(acting[i]);
+ pset.insert(
+ pg_shard_t(
+ acting[i],
+ osdmap->pg_is_ec(pgid.pgid) ? i : ghobject_t::NO_SHARD));
}
}
dout(10) << "calc_priors_during " << pgid
* Fill in the passed history so you know same_interval_since, same_up_since,
* and same_primary_since.
*/
-bool OSD::project_pg_history(pg_t pgid, pg_history_t& h, epoch_t from,
+bool OSD::project_pg_history(spg_t pgid, pg_history_t& h, epoch_t from,
const vector<int>& currentup,
const vector<int>& currentacting)
{
assert(oldmap->have_pg_pool(pgid.pool()));
vector<int> up, acting;
- oldmap->pg_to_up_acting_osds(pgid, up, acting);
+ oldmap->pg_to_up_acting_osds(pgid.pgid, up, acting);
// acting set change?
if ((acting != currentacting || up != currentup) && e > h.same_interval_since) {
// build heartbeat from set
if (is_active()) {
- for (ceph::unordered_map<pg_t, PG*>::iterator i = pg_map.begin();
+ for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
i != pg_map.end();
++i) {
PG *pg = i->second;
command == "truncobj" || command == "injectmdataerr" ||
command == "injectdataerr"
) {
- pg_t rawpg, pgid;
+ pg_t rawpg;
int64_t pool;
OSDMapRef curmap = service->get_osdmap();
int r;
ss << "Invalid namespace/objname";
return;
}
- pgid = curmap->raw_pg_to_pg(rawpg);
+ if (curmap->pg_is_ec(rawpg)) {
+ ss << "Must not call on ec pool";
+ return;
+ }
+ spg_t pgid = spg_t(curmap->raw_pg_to_pg(rawpg), ghobject_t::no_shard());
hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
ObjectStore::Transaction t;
}
pg->pg_stats_publish_lock.Lock();
if (pg->pg_stats_publish_valid) {
- m->pg_stat[pg->info.pgid] = pg->pg_stats_publish;
+ m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
dout(25) << " sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
<< pg->pg_stats_publish.reported_seq << dendl;
} else {
PGRef _pg(pg);
++p;
- if (ack->pg_stat.count(pg->info.pgid)) {
- pair<version_t,epoch_t> acked = ack->pg_stat[pg->info.pgid];
+ if (ack->pg_stat.count(pg->info.pgid.pgid)) {
+ pair<version_t,epoch_t> acked = ack->pg_stat[pg->info.pgid.pgid];
pg->pg_stats_publish_lock.Lock();
if (acked.first == pg->pg_stats_publish.reported_seq &&
acked.second == pg->pg_stats_publish.reported_epoch) {
prefix == "mark_unfound_lost" ||
prefix == "list_missing")
)) {
- pg_t pgid;
+ spg_t pgid;
if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
ss << "no pgid specified";
goto out;
}
- std::set <pg_t> keys;
- for (ceph::unordered_map<pg_t, PG*>::const_iterator pg_map_e = pg_map.begin();
+ std::set <spg_t> keys;
+ for (ceph::unordered_map<spg_t, PG*>::const_iterator pg_map_e = pg_map.begin();
pg_map_e != pg_map.end(); ++pg_map_e) {
keys.insert(pg_map_e->first);
}
fout << "*** osd " << whoami << ": dump_missing ***" << std::endl;
- for (std::set <pg_t>::iterator p = keys.begin();
+ for (std::set <spg_t>::iterator p = keys.begin();
p != keys.end(); ++p) {
- ceph::unordered_map<pg_t, PG*>::iterator q = pg_map.find(*p);
+ ceph::unordered_map<spg_t, PG*>::iterator q = pg_map.find(*p);
assert(q != pg_map.end());
PG *pg = q->second;
pg->lock();
pg->pg_log.get_missing().missing.begin();
for (; mi != mend; ++mi) {
fout << mi->first << " -> " << mi->second << std::endl;
- map<hobject_t, set<int> >::const_iterator mli =
+ map<hobject_t, set<pg_shard_t> >::const_iterator mli =
pg->missing_loc.find(mi->first);
if (mli == pg->missing_loc.end())
continue;
- const set<int> &mls(mli->second);
+ const set<pg_shard_t> &mls(mli->second);
if (mls.empty())
continue;
fout << "missing_loc: " << mls << std::endl;
}
if (m->scrub_pgs.empty()) {
- for (ceph::unordered_map<pg_t, PG*>::iterator p = pg_map.begin();
+ for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
p != pg_map.end();
++p) {
PG *pg = p->second;
} else {
for (vector<pg_t>::iterator p = m->scrub_pgs.begin();
p != m->scrub_pgs.end();
- ++p)
- if (pg_map.count(*p)) {
- PG *pg = pg_map[*p];
+ ++p) {
+ spg_t pcand;
+ if (osdmap->get_primary_shard(*p, &pcand) &&
+ pg_map.count(pcand)) {
+ PG *pg = pg_map[pcand];
pg->lock();
if (pg->is_primary()) {
pg->unreg_next_scrub();
}
pg->unlock();
}
+ }
}
m->put();
//dout(20) << " " << last_scrub_pg << dendl;
- pair<utime_t, pg_t> pos;
+ pair<utime_t, spg_t> pos;
if (service.first_scrub_stamp(&pos)) {
do {
utime_t t = pos.first;
- pg_t pgid = pos.second;
+ spg_t pgid = pos.second;
dout(30) << "sched_scrub examine " << pgid << " at " << t << dendl;
utime_t diff = now - t;
continue;
vector<int> newup, newacting;
- nextmap->pg_to_up_acting_osds(pg->info.pgid, newup, newacting);
- pg->handle_advance_map(nextmap, lastmap, newup, newacting, rctx);
+ int up_primary, acting_primary;
+ nextmap->pg_to_up_acting_osds(
+ pg->info.pgid.pgid,
+ &newup, &up_primary,
+ &newacting, &acting_primary);
+ pg->handle_advance_map(
+ nextmap, lastmap, newup, up_primary,
+ newacting, acting_primary, rctx);
// Check for split!
- set<pg_t> children;
- if (pg->info.pgid.is_split(
+ set<spg_t> children;
+ spg_t parent(pg->info.pgid);
+ if (parent.is_split(
lastmap->get_pg_num(pg->pool.id),
nextmap->get_pg_num(pg->pool.id),
&children)) {
}
// scan pg creations
- ceph::unordered_map<pg_t, create_pg_info>::iterator n = creating_pgs.begin();
+ ceph::unordered_map<spg_t, create_pg_info>::iterator n = creating_pgs.begin();
while (n != creating_pgs.end()) {
- ceph::unordered_map<pg_t, create_pg_info>::iterator p = n++;
- pg_t pgid = p->first;
+ ceph::unordered_map<spg_t, create_pg_info>::iterator p = n++;
+ spg_t pgid = p->first;
// am i still primary?
vector<int> acting;
int primary;
- osdmap->pg_to_acting_osds(pgid, &acting, &primary);
+ osdmap->pg_to_acting_osds(pgid.pgid, &acting, &primary);
if (primary != whoami) {
dout(10) << " no longer primary for " << pgid << ", stopping creation" << dendl;
creating_pgs.erase(p);
}
// scan pgs with waiters
- map<pg_t, list<OpRequestRef> >::iterator p = waiting_for_pg.begin();
+ map<spg_t, list<OpRequestRef> >::iterator p = waiting_for_pg.begin();
while (p != waiting_for_pg.end()) {
- pg_t pgid = p->first;
+ spg_t pgid = p->first;
vector<int> acting;
- int nrep = osdmap->pg_to_acting_osds(pgid, acting);
+ int nrep = osdmap->pg_to_acting_osds(pgid.pgid, acting);
int role = osdmap->calc_pg_role(whoami, acting, nrep);
if (role >= 0) {
++p; // still me
list<PGRef> to_remove;
// scan pg's
- for (ceph::unordered_map<pg_t,PG*>::iterator it = pg_map.begin();
+ for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
it != pg_map.end();
++it) {
PG *pg = it->second;
service.publish_map(osdmap);
// scan pg's
- for (ceph::unordered_map<pg_t,PG*>::iterator it = pg_map.begin();
+ for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
it != pg_map.end();
++it) {
PG *pg = it->second;
// pg creation
-bool OSD::can_create_pg(pg_t pgid)
+bool OSD::can_create_pg(spg_t pgid)
{
assert(creating_pgs.count(pgid));
void OSD::split_pgs(
PG *parent,
- const set<pg_t> &childpgids, set<boost::intrusive_ptr<PG> > *out_pgs,
+ const set<spg_t> &childpgids, set<boost::intrusive_ptr<PG> > *out_pgs,
OSDMapRef curmap,
OSDMapRef nextmap,
PG::RecoveryCtx *rctx)
parent->update_snap_mapper_bits(
parent->info.pgid.get_split_bits(pg_num)
);
- for (set<pg_t>::const_iterator i = childpgids.begin();
+ for (set<spg_t>::const_iterator i = childpgids.begin();
i != childpgids.end();
++i) {
dout(10) << "Splitting " << *parent << " into " << *i << dendl;
parent->split_colls(
*i,
split_bits,
- i->m_seed,
+ i->ps(),
rctx->transaction);
parent->split_into(
- *i,
+ i->pgid,
child,
split_bits);
for (map<pg_t,pg_create_t>::iterator p = m->mkpg.begin();
p != m->mkpg.end();
++p) {
- pg_t pgid = p->first;
epoch_t created = p->second.created;
pg_t parent = p->second.parent;
if (p->second.split_bits) // Skip split pgs
continue;
- pg_t on = pgid;
+ pg_t on = p->first;
- if (pgid.preferred() >= 0) {
- dout(20) << "ignoring localized pg " << pgid << dendl;
+ if (on.preferred() >= 0) {
+ dout(20) << "ignoring localized pg " << on << dendl;
continue;
}
- if (!osdmap->have_pg_pool(pgid.pool())) {
- dout(20) << "ignoring pg on deleted pool " << pgid << dendl;
+
+ if (!osdmap->have_pg_pool(on.pool())) {
+ dout(20) << "ignoring pg on deleted pool " << on << dendl;
continue;
}
- dout(20) << "mkpg " << pgid << " e" << created << dendl;
+ dout(20) << "mkpg " << on << " e" << created << dendl;
// is it still ours?
vector<int> up, acting;
- int up_primary, acting_primary;
+ int up_primary = -1;
+ int acting_primary = -1;
osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
int role = osdmap->calc_pg_role(whoami, acting, acting.size());
if (up_primary != whoami) {
- dout(10) << "mkpg " << pgid << " not primary (role=" << role << "), skipping" << dendl;
+ dout(10) << "mkpg " << on << " not primary (role="
+ << role << "), skipping" << dendl;
continue;
}
if (up != acting) {
- dout(10) << "mkpg " << pgid << " up " << up << " != acting " << acting << ", ignoring" << dendl;
+ dout(10) << "mkpg " << on << " up " << up
+ << " != acting " << acting << ", ignoring" << dendl;
// we'll get a query soon anyway, since we know the pg
// must exist. we can ignore this.
continue;
}
+ spg_t pgid;
+ bool mapped = osdmap->get_primary_shard(on, &pgid);
+ assert(mapped);
+
// does it already exist?
if (_have_pg(pgid)) {
dout(10) << "mkpg " << pgid << " already exists, skipping" << dendl;
PG::RecoveryCtx rctx = create_context();
// poll priors
- set<int>& pset = creating_pgs[pgid].prior;
+ set<pg_shard_t>& pset = creating_pgs[pgid].prior;
dout(10) << "mkpg " << pgid << " e" << created
<< " h " << history
<< " : querying priors " << pset << dendl;
- for (set<int>::iterator p = pset.begin(); p != pset.end(); ++p)
- if (osdmap->is_up(*p))
- (*rctx.query_map)[*p][pgid] = pg_query_t(pg_query_t::INFO, history,
- osdmap->get_epoch());
+ for (set<pg_shard_t>::iterator p = pset.begin(); p != pset.end(); ++p)
+ if (osdmap->is_up(p->osd))
+ (*rctx.query_map)[p->osd][spg_t(pgid.pgid, p->shard)] =
+ pg_query_t(
+ pg_query_t::INFO,
+ p->shard, pgid.shard,
+ history,
+ osdmap->get_epoch());
PG *pg = NULL;
if (can_create_pg(pgid)) {
rctx.transaction->create_collection(coll_t(pgid));
pg = _create_lock_pg(
osdmap, pgid, true, false, false,
- 0, creating_pgs[pgid].acting, creating_pgs[pgid].acting,
+ 0, creating_pgs[pgid].acting, whoami,
+ creating_pgs[pgid].acting, whoami,
history, pi,
*rctx.transaction);
pg->info.last_epoch_started = pg->info.history.last_epoch_started;
ObjectStore::Transaction *t = new ObjectStore::Transaction;
C_Contexts *on_applied = new C_Contexts(cct);
C_Contexts *on_safe = new C_Contexts(cct);
- map< int, map<pg_t,pg_query_t> > *query_map =
- new map<int, map<pg_t, pg_query_t> >;
+ map<int, map<spg_t,pg_query_t> > *query_map =
+ new map<int, map<spg_t, pg_query_t> >;
map<int,vector<pair<pg_notify_t, pg_interval_map_t> > > *notify_list =
- new map<int,vector<pair<pg_notify_t, pg_interval_map_t> > >;
+ new map<int, vector<pair<pg_notify_t, pg_interval_map_t> > >;
map<int,vector<pair<pg_notify_t, pg_interval_map_t> > > *info_map =
new map<int,vector<pair<pg_notify_t, pg_interval_map_t> > >;
PG::RecoveryCtx rctx(query_map, info_map, notify_list,
bool OSD::compat_must_dispatch_immediately(PG *pg)
{
assert(pg->is_locked());
- vector<int> *tmpacting = &pg->acting;
- if (pg->actingbackfill.size() > 0)
- tmpacting = &pg->actingbackfill;
- for (vector<int>::iterator i = tmpacting->begin();
- i != tmpacting->end();
+ set<pg_shard_t> tmpacting;
+ if (pg->actingbackfill.size() > 0) {
+ tmpacting = pg->actingbackfill;
+ } else {
+ for (unsigned i = 0; i < pg->acting.size(); ++i) {
+ tmpacting.insert(
+ pg_shard_t(
+ pg->acting[i],
+ pg->pool.info.ec_pool() ? i : ghobject_t::NO_SHARD));
+ }
+ }
+
+ for (set<pg_shard_t>::iterator i = tmpacting.begin();
+ i != tmpacting.end();
++i) {
- if (*i == whoami)
+ if (i->osd == whoami)
continue;
ConnectionRef conn =
- service.get_con_osd_cluster(*i, pg->get_osdmap()->get_epoch());
+ service.get_con_osd_cluster(i->osd, pg->get_osdmap()->get_epoch());
if (conn && !conn->has_feature(CEPH_FEATURE_INDEP_PG_MAP)) {
return true;
}
*/
void OSD::do_notifies(
- map< int,vector<pair<pg_notify_t,pg_interval_map_t> > >& notify_list,
+ map<int,vector<pair<pg_notify_t,pg_interval_map_t> > >& notify_list,
OSDMapRef curmap)
{
- for (map< int, vector<pair<pg_notify_t,pg_interval_map_t> > >::iterator it = notify_list.begin();
+ for (map<int,
+ vector<pair<pg_notify_t,pg_interval_map_t> > >::iterator it =
+ notify_list.begin();
it != notify_list.end();
++it) {
- if (it->first == whoami) {
- dout(7) << "do_notify osd." << it->first << " is self, skipping" << dendl;
- continue;
- }
if (!curmap->is_up(it->first))
continue;
- ConnectionRef con = service.get_con_osd_cluster(it->first, curmap->get_epoch());
+ ConnectionRef con = service.get_con_osd_cluster(
+ it->first, curmap->get_epoch());
if (!con)
continue;
_share_map_outgoing(it->first, con.get(), curmap);
if (con->has_feature(CEPH_FEATURE_INDEP_PG_MAP)) {
- dout(7) << "do_notify osd." << it->first
+ dout(7) << "do_notify osd " << it->first
<< " on " << it->second.size() << " PGs" << dendl;
MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
it->second);
cluster_messenger->send_message(m, con.get());
} else {
- dout(7) << "do_notify osd." << it->first
+ dout(7) << "do_notify osd " << it->first
<< " sending seperate messages" << dendl;
for (vector<pair<pg_notify_t, pg_interval_map_t> >::iterator i =
it->second.begin();
/** do_queries
* send out pending queries for info | summaries
*/
-void OSD::do_queries(map< int, map<pg_t,pg_query_t> >& query_map,
+void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
OSDMapRef curmap)
{
- for (map< int, map<pg_t,pg_query_t> >::iterator pit = query_map.begin();
+ for (map<int, map<spg_t,pg_query_t> >::iterator pit = query_map.begin();
pit != query_map.end();
++pit) {
if (!curmap->is_up(pit->first))
dout(7) << "do_queries querying osd." << who
<< " sending seperate messages "
<< " on " << pit->second.size() << " PGs" << dendl;
- for (map<pg_t, pg_query_t>::iterator i = pit->second.begin();
+ for (map<spg_t, pg_query_t>::iterator i = pit->second.begin();
i != pit->second.end();
++i) {
- map<pg_t, pg_query_t> to_send;
+ map<spg_t, pg_query_t> to_send;
to_send.insert(*i);
MOSDPGQuery *m = new MOSDPGQuery(i->second.epoch_sent, to_send);
cluster_messenger->send_message(m, con.get());
}
-void OSD::do_infos(map<int,vector<pair<pg_notify_t, pg_interval_map_t> > >& info_map,
+void OSD::do_infos(map<int,
+ vector<pair<pg_notify_t, pg_interval_map_t> > >& info_map,
OSDMapRef curmap)
{
- for (map<int,vector<pair<pg_notify_t, pg_interval_map_t> > >::iterator p = info_map.begin();
+ for (map<int,
+ vector<pair<pg_notify_t, pg_interval_map_t> > >::iterator p =
+ info_map.begin();
p != info_map.end();
++p) {
if (!curmap->is_up(p->first))
for (vector<pair<pg_notify_t,pg_interval_map_t> >::iterator i = p->second.begin();
i != p->second.end();
++i) {
- dout(20) << "Sending info " << i->first.info << " to osd." << p->first << dendl;
+ dout(20) << "Sending info " << i->first.info
+ << " to shard " << p->first << dendl;
}
- ConnectionRef con = service.get_con_osd_cluster(p->first, curmap->get_epoch());
+ ConnectionRef con = service.get_con_osd_cluster(
+ p->first, curmap->get_epoch());
if (!con)
continue;
_share_map_outgoing(p->first, con.get(), curmap);
}
handle_pg_peering_evt(
+ spg_t(it->first.info.pgid.pgid, it->first.to),
it->first.info, it->second,
- it->first.query_epoch, from, true,
+ it->first.query_epoch, pg_shard_t(from, it->first.from), true,
PG::CephPeeringEvtRef(
new PG::CephPeeringEvt(
it->first.epoch_sent, it->first.query_epoch,
- PG::MNotifyRec(from, it->first)))
+ PG::MNotifyRec(pg_shard_t(from, it->first.from), it->first)))
);
}
}
op->mark_started();
handle_pg_peering_evt(
+ spg_t(m->info.pgid.pgid, m->to),
m->info, m->past_intervals, m->get_epoch(),
- from, false,
+ pg_shard_t(from, m->from), false,
PG::CephPeeringEvtRef(
new PG::CephPeeringEvt(
m->get_epoch(), m->get_query_epoch(),
- PG::MLogRec(from, m)))
+ PG::MLogRec(pg_shard_t(from, m->from), m)))
);
}
}
handle_pg_peering_evt(
+ spg_t(p->first.info.pgid.pgid, p->first.to),
p->first.info, p->second, p->first.epoch_sent,
- from, false,
+ pg_shard_t(from, p->first.from), false,
PG::CephPeeringEvtRef(
new PG::CephPeeringEvt(
p->first.epoch_sent, p->first.query_epoch,
- PG::MInfoRec(from, p->first.info, p->first.epoch_sent)))
+ PG::MInfoRec(
+ pg_shard_t(
+ from, p->first.from), p->first.info, p->first.epoch_sent)))
);
}
}
if (pg->is_primary()) {
// peer is informing us of their last_complete_ondisk
dout(10) << *pg << " replica osd." << from << " lcod " << m->trim_to << dendl;
- pg->peer_last_complete_ondisk[from] = m->trim_to;
+ pg->peer_last_complete_ondisk[pg_shard_t(from, m->pgid.shard)] =
+ m->trim_to;
if (pg->calc_min_last_complete_ondisk()) {
dout(10) << *pg << " min lcod now " << pg->min_last_complete_ondisk << dendl;
pg->trim_peers();
map< int, vector<pair<pg_notify_t, pg_interval_map_t> > > notify_list;
- for (map<pg_t,pg_query_t>::iterator it = m->pg_list.begin();
+ for (map<spg_t,pg_query_t>::iterator it = m->pg_list.begin();
it != m->pg_list.end();
++it) {
- pg_t pgid = it->first;
+ spg_t pgid = it->first;
if (pgid.preferred() >= 0) {
dout(10) << "ignoring localized pg " << pgid << dendl;
PG::CephPeeringEvtRef(
new PG::CephPeeringEvt(
it->second.epoch_sent, it->second.epoch_sent,
- PG::MQuery(from, it->second, it->second.epoch_sent))));
+ PG::MQuery(pg_shard_t(from, it->second.from),
+ it->second, it->second.epoch_sent))));
continue;
}
if (pg_map.count(pgid)) {
PG *pg = 0;
pg = _lookup_lock_pg(pgid);
- pg->queue_query(it->second.epoch_sent, it->second.epoch_sent,
- from, it->second);
+ pg->queue_query(
+ it->second.epoch_sent, it->second.epoch_sent,
+ pg_shard_t(from, it->second.from), it->second);
pg->unlock();
continue;
}
// get active crush mapping
vector<int> up, acting;
- osdmap->pg_to_up_acting_osds(pgid, up, acting);
+ osdmap->pg_to_up_acting_osds(pgid.pgid, up, acting);
// same primary?
pg_history_t history = it->second.history;
}
dout(10) << " pg " << pgid << " dne" << dendl;
- pg_info_t empty(pgid);
+ pg_info_t empty(spg_t(pgid.pgid, it->second.to));
if (it->second.type == pg_query_t::LOG ||
it->second.type == pg_query_t::FULLLOG) {
ConnectionRef con = service.get_con_osd_cluster(from, osdmap->get_epoch());
if (con) {
- MOSDPGLog *mlog = new MOSDPGLog(osdmap->get_epoch(), empty,
- it->second.epoch_sent);
+ MOSDPGLog *mlog = new MOSDPGLog(
+ it->second.from, it->second.to,
+ osdmap->get_epoch(), empty,
+ it->second.epoch_sent);
_share_map_outgoing(from, con.get(), osdmap);
cluster_messenger->send_message(mlog, con.get());
}
} else {
- notify_list[from].push_back(make_pair(pg_notify_t(it->second.epoch_sent,
- osdmap->get_epoch(),
- empty),
- pg_interval_map_t()));
+ notify_list[from].push_back(
+ make_pair(
+ pg_notify_t(
+ it->second.from, it->second.to,
+ it->second.epoch_sent,
+ osdmap->get_epoch(),
+ empty),
+ pg_interval_map_t()));
}
}
do_notifies(notify_list, osdmap);
op->mark_started();
- for (vector<pg_t>::iterator it = m->pg_list.begin();
+ for (vector<spg_t>::iterator it = m->pg_list.begin();
it != m->pg_list.end();
++it) {
- pg_t pgid = *it;
+ spg_t pgid = *it;
if (pgid.preferred() >= 0) {
dout(10) << "ignoring localized pg " << pgid << dendl;
continue;
PG *pg = _lookup_lock_pg(pgid);
pg_history_t history = pg->info.history;
vector<int> up, acting;
- osdmap->pg_to_up_acting_osds(pgid, up, acting);
+ osdmap->pg_to_up_acting_osds(pgid.pgid, up, acting);
bool valid_history =
project_pg_history(pg->info.pgid, history, pg->get_osdmap()->get_epoch(),
up, acting);
if (valid_history &&
history.same_interval_since <= m->get_epoch()) {
- assert(pg->get_primary() == m->get_source().num());
+ assert(pg->get_primary().osd == m->get_source().num());
PGRef _pg(pg);
_remove_pg(pg);
pg->unlock();
assert(osd_lock.is_locked());
utime_t now = ceph_clock_now(cct);
- list< pair<pg_t,utime_t> > pgids;
+ list< pair<spg_t,utime_t> > pgids;
replay_queue_lock.Lock();
while (!replay_queue.empty() &&
replay_queue.front().second <= now) {
}
replay_queue_lock.Unlock();
- for (list< pair<pg_t,utime_t> >::iterator p = pgids.begin(); p != pgids.end(); ++p) {
- pg_t pgid = p->first;
+ for (list< pair<spg_t,utime_t> >::iterator p = pgids.begin(); p != pgids.end(); ++p) {
+ spg_t pgid = p->first;
if (pg_map.count(pgid)) {
PG *pg = _lookup_lock_pg_with_map_lock_held(pgid);
dout(10) << "check_replay_queue " << *pg << dendl;
}
}
// calc actual pgid
- pg_t pgid = m->get_pg();
- int64_t pool = pgid.pool();
+ pg_t _pgid = m->get_pg();
+ int64_t pool = _pgid.pool();
if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0 &&
osdmap->have_pg_pool(pool))
- pgid = osdmap->raw_pg_to_pg(pgid);
+ _pgid = osdmap->raw_pg_to_pg(_pgid);
+
+ spg_t pgid;
+ if (!osdmap->get_primary_shard(_pgid, &pgid)) {
+ // missing pool or acting set empty -- drop
+ return;
+ }
// get and lock *pg.
PG *pg = _have_pg(pgid) ? _lookup_pg(pgid) : NULL;
if (!pg) {
dout(7) << "hit non-existent pg " << pgid << dendl;
- if (osdmap->get_pg_acting_role(pgid, whoami) >= 0) {
+ if (osdmap->get_pg_acting_role(pgid.pgid, whoami) >= 0) {
dout(7) << "we are valid target for op, waiting" << dendl;
waiting_for_pg[pgid].push_back(op);
op->mark_delayed("waiting for pg to exist locally");
}
OSDMapRef send_map = get_map(m->get_map_epoch());
- if (send_map->get_pg_acting_role(pgid, whoami) >= 0) {
+ if (send_map->get_pg_acting_role(pgid.pgid, whoami) >= 0) {
dout(7) << "dropping request; client will resend when they get new map" << dendl;
} else if (!send_map->have_pg_pool(pgid.pool())) {
dout(7) << "dropping request; pool did not exist" << dendl;
<< "\n";
} else {
dout(7) << "we are invalid target" << dendl;
- pgid = m->get_pg();
- if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
- pgid = send_map->raw_pg_to_pg(pgid);
clog.warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
<< " pg " << m->get_pg()
<< " to osd." << whoami
static_cast<Session*>(m->get_connection()->get_priv()));
// make sure we have the pg
- const pg_t pgid = m->pgid;
+ const spg_t pgid = m->pgid;
if (service.splitting(pgid)) {
waiting_for_pg[pgid].push_back(op);
return;
if (osd->is_stopping())
return;
PG::RecoveryCtx rctx = osd->create_context();
- set<pg_t> to_complete;
+ set<spg_t> to_complete;
for (set<boost::intrusive_ptr<PG> >::iterator i = pgs.begin();
i != pgs.end();
++i) {
} status;
bool stop_deleting;
public:
- const pg_t pgid;
+ const spg_t pgid;
const PGRef old_pg_state;
- DeletingState(const pair<pg_t, PGRef> &in) :
+ DeletingState(const pair<spg_t, PGRef> &in) :
lock("DeletingState::lock"), status(QUEUED), stop_deleting(false),
pgid(in.first), old_pg_state(in.second) {}
public:
OSD *osd;
CephContext *cct;
- SharedPtrRegistry<pg_t, ObjectStore::Sequencer> osr_registry;
- SharedPtrRegistry<pg_t, DeletingState> deleting_pgs;
+ SharedPtrRegistry<spg_t, ObjectStore::Sequencer> osr_registry;
+ SharedPtrRegistry<spg_t, DeletingState> deleting_pgs;
const int whoami;
ObjectStore *&store;
LogClient &clog;
Mutex sched_scrub_lock;
int scrubs_pending;
int scrubs_active;
- set< pair<utime_t,pg_t> > last_scrub_pg;
+ set< pair<utime_t,spg_t> > last_scrub_pg;
- void reg_last_pg_scrub(pg_t pgid, utime_t t) {
+ void reg_last_pg_scrub(spg_t pgid, utime_t t) {
Mutex::Locker l(sched_scrub_lock);
- last_scrub_pg.insert(pair<utime_t,pg_t>(t, pgid));
+ last_scrub_pg.insert(pair<utime_t,spg_t>(t, pgid));
}
- void unreg_last_pg_scrub(pg_t pgid, utime_t t) {
+ void unreg_last_pg_scrub(spg_t pgid, utime_t t) {
Mutex::Locker l(sched_scrub_lock);
- pair<utime_t,pg_t> p(t, pgid);
- set<pair<utime_t,pg_t> >::iterator it = last_scrub_pg.find(p);
+ pair<utime_t,spg_t> p(t, pgid);
+ set<pair<utime_t,spg_t> >::iterator it = last_scrub_pg.find(p);
assert(it != last_scrub_pg.end());
last_scrub_pg.erase(it);
}
- bool first_scrub_stamp(pair<utime_t, pg_t> *out) {
+ bool first_scrub_stamp(pair<utime_t, spg_t> *out) {
Mutex::Locker l(sched_scrub_lock);
if (last_scrub_pg.empty())
return false;
- set< pair<utime_t, pg_t> >::iterator iter = last_scrub_pg.begin();
+ set< pair<utime_t, spg_t> >::iterator iter = last_scrub_pg.begin();
*out = *iter;
return true;
}
- bool next_scrub_stamp(pair<utime_t, pg_t> next,
- pair<utime_t, pg_t> *out) {
+ bool next_scrub_stamp(pair<utime_t, spg_t> next,
+ pair<utime_t, spg_t> *out) {
Mutex::Locker l(sched_scrub_lock);
if (last_scrub_pg.empty())
return false;
- set< pair<utime_t, pg_t> >::iterator iter = last_scrub_pg.lower_bound(next);
+ set< pair<utime_t, spg_t> >::iterator iter = last_scrub_pg.lower_bound(next);
if (iter == last_scrub_pg.end())
return false;
++iter;
enum {
BACKFILL_LOW = 0, // backfill non-degraded PGs
BACKFILL_HIGH = 1, // backfill degraded PGs
- RECOVERY = AsyncReserver<pg_t>::MAX_PRIORITY // log based recovery
+ RECOVERY = AsyncReserver<spg_t>::MAX_PRIORITY // log based recovery
};
Finisher reserver_finisher;
- AsyncReserver<pg_t> local_reserver;
- AsyncReserver<pg_t> remote_reserver;
+ AsyncReserver<spg_t> local_reserver;
+ AsyncReserver<spg_t> remote_reserver;
// -- pg_temp --
Mutex pg_temp_lock;
// split
Mutex in_progress_split_lock;
- map<pg_t, pg_t> pending_splits; // child -> parent
- map<pg_t, set<pg_t> > rev_pending_splits; // parent -> [children]
- set<pg_t> in_progress_splits; // child
+ map<spg_t, spg_t> pending_splits; // child -> parent
+ map<spg_t, set<spg_t> > rev_pending_splits; // parent -> [children]
+ set<spg_t> in_progress_splits; // child
- void _start_split(pg_t parent, const set<pg_t> &children);
- void start_split(pg_t parent, const set<pg_t> &children) {
+ void _start_split(spg_t parent, const set<spg_t> &children);
+ void start_split(spg_t parent, const set<spg_t> &children) {
Mutex::Locker l(in_progress_split_lock);
return _start_split(parent, children);
}
- void mark_split_in_progress(pg_t parent, const set<pg_t> &pgs);
- void complete_split(const set<pg_t> &pgs);
- void cancel_pending_splits_for_parent(pg_t parent);
- void _cancel_pending_splits_for_parent(pg_t parent);
- bool splitting(pg_t pgid);
+ void mark_split_in_progress(spg_t parent, const set<spg_t> &pgs);
+ void complete_split(const set<spg_t> &pgs);
+ void cancel_pending_splits_for_parent(spg_t parent);
+ void _cancel_pending_splits_for_parent(spg_t parent);
+ bool splitting(spg_t pgid);
void expand_pg_num(OSDMapRef old_map,
OSDMapRef new_map);
void _maybe_split_pgid(OSDMapRef old_map,
OSDMapRef new_map,
- pg_t pgid);
- void init_splits_between(pg_t pgid, OSDMapRef frommap, OSDMapRef tomap);
+ spg_t pgid);
+ void init_splits_between(spg_t pgid, OSDMapRef frommap, OSDMapRef tomap);
// -- OSD Full Status --
Mutex full_status_lock;
#ifdef PG_DEBUG_REFS
Mutex pgid_lock;
- map<pg_t, int> pgid_tracker;
- map<pg_t, PG*> live_pgs;
- void add_pgid(pg_t pgid, PG *pg) {
+ map<spg_t, int> pgid_tracker;
+ map<spg_t, PG*> live_pgs;
+ void add_pgid(spg_t pgid, PG *pg) {
Mutex::Locker l(pgid_lock);
if (!pgid_tracker.count(pgid)) {
pgid_tracker[pgid] = 0;
}
pgid_tracker[pgid]++;
}
- void remove_pgid(pg_t pgid, PG *pg) {
+ void remove_pgid(spg_t pgid, PG *pg) {
Mutex::Locker l(pgid_lock);
assert(pgid_tracker.count(pgid));
assert(pgid_tracker[pgid] > 0);
void dump_live_pgids() {
Mutex::Locker l(pgid_lock);
derr << "live pgids:" << dendl;
- for (map<pg_t, int>::iterator i = pgid_tracker.begin();
+ for (map<spg_t, int>::iterator i = pgid_tracker.begin();
i != pgid_tracker.end();
++i) {
derr << "\t" << *i << dendl;
0));
}
- static hobject_t make_pg_log_oid(pg_t pg) {
+ static hobject_t make_pg_log_oid(spg_t pg) {
stringstream ss;
ss << "pglog_" << pg;
string s;
return hobject_t(sobject_t(object_t(s.c_str()), 0));
}
- static hobject_t make_pg_biginfo_oid(pg_t pg) {
+ static hobject_t make_pg_biginfo_oid(spg_t pg) {
stringstream ss;
ss << "pginfo_" << pg;
string s;
protected:
// -- placement groups --
- ceph::unordered_map<pg_t, PG*> pg_map;
- map<pg_t, list<OpRequestRef> > waiting_for_pg;
- map<pg_t, list<PG::CephPeeringEvtRef> > peering_wait_for_split;
+ ceph::unordered_map<spg_t, PG*> pg_map;
+ map<spg_t, list<OpRequestRef> > waiting_for_pg;
+ map<spg_t, list<PG::CephPeeringEvtRef> > peering_wait_for_split;
PGRecoveryStats pg_recovery_stats;
PGPool _get_pool(int id, OSDMapRef createmap);
- bool _have_pg(pg_t pgid);
- PG *_lookup_lock_pg_with_map_lock_held(pg_t pgid);
- PG *_lookup_lock_pg(pg_t pgid);
- PG *_lookup_pg(pg_t pgid);
+ bool _have_pg(spg_t pgid);
+ PG *_lookup_lock_pg_with_map_lock_held(spg_t pgid);
+ PG *_lookup_lock_pg(spg_t pgid);
+ PG *_lookup_pg(spg_t pgid);
PG *_open_lock_pg(OSDMapRef createmap,
- pg_t pg, bool no_lockdep_check=false,
+ spg_t pg, bool no_lockdep_check=false,
bool hold_map_lock=false);
enum res_result {
RES_PARENT, // resurrected a parent
RES_NONE // nothing relevant deleting
};
res_result _try_resurrect_pg(
- OSDMapRef curmap, pg_t pgid, pg_t *resurrected, PGRef *old_pg_state);
- PG *_create_lock_pg(OSDMapRef createmap,
- pg_t pgid,
- bool newly_created,
- bool hold_map_lock,
- bool backfill,
- int role,
- vector<int>& up,
- vector<int>& acting,
- pg_history_t history,
- pg_interval_map_t& pi,
- ObjectStore::Transaction& t);
- PG *_lookup_qlock_pg(pg_t pgid);
-
- PG* _make_pg(OSDMapRef createmap, pg_t pgid);
+ OSDMapRef curmap, spg_t pgid, spg_t *resurrected, PGRef *old_pg_state);
+ PG *_create_lock_pg(
+ OSDMapRef createmap,
+ spg_t pgid,
+ bool newly_created,
+ bool hold_map_lock,
+ bool backfill,
+ int role,
+ vector<int>& up, int up_primary,
+ vector<int>& acting, int acting_primary,
+ pg_history_t history,
+ pg_interval_map_t& pi,
+ ObjectStore::Transaction& t);
+ PG *_lookup_qlock_pg(spg_t pgid);
+
+ PG* _make_pg(OSDMapRef createmap, spg_t pgid);
void add_newly_split_pg(PG *pg,
PG::RecoveryCtx *rctx);
void handle_pg_peering_evt(
+ spg_t pgid,
const pg_info_t& info,
pg_interval_map_t& pi,
- epoch_t epoch, int from,
+ epoch_t epoch,
+ pg_shard_t from,
bool primary,
PG::CephPeeringEvtRef evt);
void load_pgs();
void build_past_intervals_parallel();
- void calc_priors_during(pg_t pgid, epoch_t start, epoch_t end, set<int>& pset);
+ void calc_priors_during(
+ spg_t pgid, epoch_t start, epoch_t end, set<pg_shard_t>& pset);
/// project pg history from from to now
bool project_pg_history(
- pg_t pgid, pg_history_t& h, epoch_t from,
+ spg_t pgid, pg_history_t& h, epoch_t from,
const vector<int>& lastup, const vector<int>& lastacting
); ///< @return false if there was a map gap between from and now
- void wake_pg_waiters(pg_t pgid) {
+ void wake_pg_waiters(spg_t pgid) {
if (waiting_for_pg.count(pgid)) {
take_waiters_front(waiting_for_pg[pgid]);
waiting_for_pg.erase(pgid);
}
}
void wake_all_pg_waiters() {
- for (map<pg_t, list<OpRequestRef> >::iterator p = waiting_for_pg.begin();
+ for (map<spg_t, list<OpRequestRef> >::iterator p = waiting_for_pg.begin();
p != waiting_for_pg.end();
++p)
take_waiters_front(p->second);
struct create_pg_info {
pg_history_t history;
vector<int> acting;
- set<int> prior;
+ set<pg_shard_t> prior;
pg_t parent;
};
- ceph::unordered_map<pg_t, create_pg_info> creating_pgs;
+ ceph::unordered_map<spg_t, create_pg_info> creating_pgs;
double debug_drop_pg_create_probability;
int debug_drop_pg_create_duration;
int debug_drop_pg_create_left; // 0 if we just dropped the last one, -1 if we can drop more
- bool can_create_pg(pg_t pgid);
+ bool can_create_pg(spg_t pgid);
void handle_pg_create(OpRequestRef op);
void split_pgs(
PG *parent,
- const set<pg_t> &childpgids, set<boost::intrusive_ptr<PG> > *out_pgs,
+ const set<spg_t> &childpgids, set<boost::intrusive_ptr<PG> > *out_pgs,
OSDMapRef curmap,
OSDMapRef nextmap,
PG::RecoveryCtx *rctx);
ThreadPool::TPHandle *handle = NULL);
void dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
ThreadPool::TPHandle *handle = NULL);
- void do_notifies(map< int,vector<pair<pg_notify_t, pg_interval_map_t> > >& notify_list,
+ void do_notifies(map<int,
+ vector<pair<pg_notify_t, pg_interval_map_t> > >&
+ notify_list,
OSDMapRef map);
- void do_queries(map< int, map<pg_t,pg_query_t> >& query_map,
+ void do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
OSDMapRef map);
- void do_infos(map<int, vector<pair<pg_notify_t, pg_interval_map_t> > >& info_map,
+ void do_infos(map<int,
+ vector<pair<pg_notify_t, pg_interval_map_t> > >& info_map,
OSDMapRef map);
- void repeer(PG *pg, map< int, map<pg_t,pg_query_t> >& query_map);
+ void repeer(PG *pg, map< int, map<spg_t,pg_query_t> >& query_map);
bool require_mon_peer(Message *m);
bool require_osd_peer(OpRequestRef op);
utime_t defer_recovery_until;
int recovery_ops_active;
#ifdef DEBUG_RECOVERY_OIDS
- map<pg_t, set<hobject_t> > recovery_oids;
+ map<spg_t, set<hobject_t> > recovery_oids;
#endif
struct RecoveryWQ : public ThreadPool::WorkQueue<PG> {
// replay / delayed pg activation
Mutex replay_queue_lock;
- list< pair<pg_t, utime_t > > replay_queue;
+ list< pair<spg_t, utime_t > > replay_queue;
void check_replay_queue();
}
} remove_wq;
uint64_t next_removal_seq;
- coll_t get_next_removal_coll(pg_t pgid) {
+ coll_t get_next_removal_coll(spg_t pgid) {
return coll_t::make_removal_coll(next_removal_seq++, pgid);
}
assert(up.empty() || up_primary == up.front());
assert(acting.empty() || acting_primary == acting.front());
}
+ bool pg_is_ec(pg_t pg) const {
+ map<int64_t, pg_pool_t>::const_iterator i = pools.find(pg.pool());
+ assert(i != pools.end());
+ return i->second.ec_pool();
+ }
+ bool get_primary_shard(pg_t pgid, spg_t *out) const {
+ *out = spg_t(pgid, ghobject_t::NO_SHARD); /* TODOSAM: fix -- compute real primary shard */
+ return true;
+ }
int64_t lookup_pg_pool_name(const string& name) {
if (name_pool.count(name))
}
PG::PG(OSDService *o, OSDMapRef curmap,
- const PGPool &_pool, pg_t p, const hobject_t& loid,
+ const PGPool &_pool, spg_t p, const hobject_t& loid,
const hobject_t& ioid) :
osd(o),
cct(o->cct),
osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()),
snap_mapper(
&osdriver,
- p.m_seed,
+ p.ps(),
p.get_split_bits(curmap->get_pg_num(_pool.id)),
_pool.id,
p.shard),
role(0),
state(0),
send_notify(false),
+ pg_whoami(osd->whoami, p.shard),
need_up_thru(false),
last_peering_reset(0),
heartbeat_peer_lock("PG::heartbeat_peer_lock"),
/********* PG **********/
-void PG::proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_missing_t& omissing, int from)
+void PG::proc_master_log(
+ ObjectStore::Transaction& t, pg_info_t &oinfo,
+ pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
{
- dout(10) << "proc_master_log for osd." << from << ": " << olog << " " << omissing << dendl;
+ dout(10) << "proc_master_log for osd." << from << ": "
+ << olog << " " << omissing << dendl;
assert(!is_active() && is_primary());
// merge log into our own log to build master log. no need to
peer_missing[from].swap(omissing);
}
-void PG::proc_replica_log(ObjectStore::Transaction& t,
- pg_info_t &oinfo, pg_log_t &olog, pg_missing_t& omissing, int from)
+void PG::proc_replica_log(
+ ObjectStore::Transaction& t,
+ pg_info_t &oinfo, pg_log_t &olog, pg_missing_t& omissing,
+ pg_shard_t from)
{
dout(10) << "proc_replica_log for osd." << from << ": "
<< oinfo << " " << olog << " " << omissing << dendl;
peer_missing[from].swap(omissing);
}
-bool PG::proc_replica_info(int from, const pg_info_t &oinfo)
+bool PG::proc_replica_info(pg_shard_t from, const pg_info_t &oinfo)
{
- map<int,pg_info_t>::iterator p = peer_info.find(from);
+ map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
dout(10) << " got dup osd." << from << " info " << oinfo << ", identical to ours" << dendl;
return false;
}
void PG::merge_log(
- ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, int from)
+ ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_shard_t from)
{
PGLogEntryHandler rollbacker;
pg_log.merge_log(
* TODO: if the missing set becomes very large, this could get expensive.
* Instead, we probably want to just iterate over our unfound set.
*/
-bool PG::search_for_missing(const pg_info_t &oinfo, const pg_missing_t *omissing,
- int fromosd)
+bool PG::search_for_missing(
+ const pg_info_t &oinfo, const pg_missing_t *omissing,
+ pg_shard_t fromosd)
{
bool stats_updated = false;
bool found_missing = false;
dout(10) << "search_for_missing " << soid << " " << need
<< " is on osd." << fromosd << dendl;
- map<hobject_t, set<int> >::iterator ml = missing_loc.find(soid);
+ map<hobject_t, set<pg_shard_t> >::iterator ml = missing_loc.find(soid);
if (ml == missing_loc.end()) {
map<hobject_t, list<OpRequestRef> >::iterator wmo =
waiting_for_missing_object.find(soid);
return found_missing;
}
-void PG::discover_all_missing(map< int, map<pg_t,pg_query_t> > &query_map)
+void PG::discover_all_missing(map<int, map<spg_t,pg_query_t> > &query_map)
{
const pg_missing_t &missing = pg_log.get_missing();
assert(missing.have_missing());
<< get_num_unfound() << " unfound"
<< dendl;
- std::set<int>::const_iterator m = might_have_unfound.begin();
- std::set<int>::const_iterator mend = might_have_unfound.end();
+ std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
+ std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
for (; m != mend; ++m) {
- int peer(*m);
+ pg_shard_t peer(*m);
- if (!get_osdmap()->is_up(peer)) {
+ if (!get_osdmap()->is_up(peer.osd)) {
dout(20) << __func__ << " skipping down osd." << peer << dendl;
continue;
}
- map<int, pg_info_t>::const_iterator iter = peer_info.find(peer);
+ map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
if (iter != peer_info.end() &&
(iter->second.is_empty() || iter->second.dne())) {
// ignore empty peers
dout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
<< dendl;
peer_missing_requested.insert(peer);
- query_map[peer][info.pgid] =
- pg_query_t(pg_query_t::MISSING, info.history, get_osdmap()->get_epoch());
+ query_map[peer.osd][spg_t(info.pgid.pgid, peer.shard)] =
+ pg_query_t(
+ pg_query_t::MISSING,
+ peer.shard, pg_whoami.shard,
+ info.history, get_osdmap()->get_epoch());
}
}
}
assert(actingbackfill.size() > 0);
- vector<int>::const_iterator end = actingbackfill.end();
- vector<int>::const_iterator a = actingbackfill.begin();
+ set<pg_shard_t>::const_iterator end = actingbackfill.end();
+ set<pg_shard_t>::const_iterator a = actingbackfill.begin();
assert(a != end);
- ++a;
for (; a != end; ++a) {
- int peer = *a;
- map<int, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
+ if (*a == get_primary()) continue;
+ pg_shard_t peer = *a;
+ map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
if (pm == peer_missing.end()) {
dout(10) << __func__ << " osd." << peer << " don't have missing set" << dendl;
ret = true;
// We can assume that only possible osds that need backfill
// are on the backfill_targets vector nodes.
- vector<int>::const_iterator end = backfill_targets.end();
- vector<int>::const_iterator a = backfill_targets.begin();
+ set<pg_shard_t>::const_iterator end = backfill_targets.end();
+ set<pg_shard_t>::const_iterator a = backfill_targets.begin();
for (; a != end; ++a) {
- int peer = *a;
- map<int,pg_info_t>::const_iterator pi = peer_info.find(peer);
+ pg_shard_t peer = *a;
+ map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
if (!pi->second.last_backfill.is_max()) {
dout(10) << __func__ << " osd." << peer << " has last_backfill " << pi->second.last_backfill << dendl;
ret = true;
vector<int> acting, up, old_acting, old_up;
cur_map = osd->get_map(cur_epoch);
- cur_map->pg_to_up_acting_osds(get_pgid(), up, acting);
+ cur_map->pg_to_up_acting_osds(get_pgid().pgid, up, acting);
epoch_t same_interval_since = cur_epoch;
dout(10) << __func__ << " over epochs " << cur_epoch << "-"
<< end_epoch << dendl;
old_acting.swap(acting);
cur_map = osd->get_map(cur_epoch);
- cur_map->pg_to_up_acting_osds(get_pgid(), up, acting);
+ cur_map->pg_to_up_acting_osds(get_pgid().pgid, up, acting);
std::stringstream debug;
bool new_interval = pg_interval_t::check_new_interval(
cur_map,
last_map,
info.pgid.pool(),
- info.pgid,
+ info.pgid.pgid,
&past_intervals,
&debug);
if (new_interval) {
{
// Remove any downed osds from peer_info
bool removed = false;
- map<int,pg_info_t>::iterator p = peer_info.begin();
+ map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
while (p != peer_info.end()) {
- if (!osdmap->is_up(p->first)) {
+ if (!osdmap->is_up(p->first.osd)) {
dout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
peer_missing.erase(p->first);
peer_log_requested.erase(p->first);
{
assert(is_primary());
- set<int>::const_iterator peer = might_have_unfound.begin();
- set<int>::const_iterator mend = might_have_unfound.end();
+ set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
+ set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
for (; peer != mend; ++peer) {
if (peer_missing.count(*peer))
continue;
- map<int, pg_info_t>::const_iterator iter = peer_info.find(*peer);
+ map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
if (iter != peer_info.end() &&
(iter->second.is_empty() || iter->second.dne()))
continue;
- const osd_info_t &osd_info(osdmap->get_info(*peer));
+ const osd_info_t &osd_info(osdmap->get_info(peer->osd));
if (osd_info.lost_at <= osd_info.up_from) {
// If there is even one OSD in might_have_unfound that isn't lost, we
// still might retrieve our unfound.
{
if (1) {
// sanity check
- for (map<int,pg_info_t>::iterator it = peer_info.begin();
+ for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
it != peer_info.end();
++it) {
assert(info.history.last_epoch_started >= it->second.history.last_epoch_started);
}
}
- prior_set.reset(new PriorSet(*get_osdmap(),
- past_intervals,
- up,
- acting,
- info,
- this));
+ prior_set.reset(
+ new PriorSet(
+ pool.info.ec_pool(),
+ *get_osdmap(),
+ past_intervals,
+ up,
+ acting,
+ info,
+ this));
PriorSet &prior(*prior_set.get());
if (prior.pg_down) {
* 2) Prefer longer tail if it brings another info into contiguity
* 3) Prefer current primary
*/
-map<int, pg_info_t>::const_iterator PG::find_best_info(const map<int, pg_info_t> &infos) const
+map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
+ const map<pg_shard_t, pg_info_t> &infos) const
{
eversion_t min_last_update_acceptable = eversion_t::max();
epoch_t max_last_epoch_started_found = 0;
- for (map<int, pg_info_t>::const_iterator i = infos.begin();
+ for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
i != infos.end();
++i) {
if (max_last_epoch_started_found < i->second.last_epoch_started) {
}
assert(min_last_update_acceptable != eversion_t::max());
- map<int, pg_info_t>::const_iterator best = infos.end();
+ map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
// find osd with newest last_update (oldest for ec_pool).
// if there are multiples, prefer
// - a longer tail, if it brings another peer into log contiguity
// - the current primary
- for (map<int, pg_info_t>::const_iterator p = infos.begin();
+ for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
p != infos.end();
++p) {
// Only consider peers with last_update >= min_last_update_acceptable
}
}
// Prefer longer tail if it brings another peer into contiguity
- for (map<int, pg_info_t>::const_iterator q = infos.begin();
+ for (map<pg_shard_t, pg_info_t>::const_iterator q = infos.begin();
q != infos.end();
++q) {
if (q->second.is_incomplete())
}
}
// prefer current primary (usually the caller), all things being equal
- if (p->first == acting[0]) {
+ if (p->first == pg_whoami) {
dout(10) << "calc_acting prefer osd." << p->first
<< " because it is current primary" << dendl;
best = p;
* incomplete, or another osd has a longer tail that allows us to
* bring other up nodes up to date.
*/
-bool PG::calc_acting(int& newest_update_osd_id, vector<int>& want, vector<int>& backfill) const
-{
- map<int, pg_info_t> all_info(peer_info.begin(), peer_info.end());
- all_info[osd->whoami] = info;
-
- for (map<int,pg_info_t>::iterator p = all_info.begin(); p != all_info.end(); ++p) {
+bool PG::calc_acting(
+ pg_shard_t &auth_log_shard_id,
+ vector<int> &want,
+ set<pg_shard_t> &backfill) const
+{
+// TODOSAM: fix
+#if 0
+ map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
+ all_info[pg_whoami] = info;
+
+ for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
+ p != all_info.end();
+ ++p) {
dout(10) << "calc_acting osd." << p->first << " " << p->second << dendl;
}
- map<int, pg_info_t>::const_iterator newest_update_osd = find_best_info(all_info);
+ map<pg_shard_t, pg_info_t>::const_iterator newest_update_osd =
+ find_best_info(all_info);
if (newest_update_osd == all_info.end()) {
if (up != acting) {
- dout(10) << "calc_acting no suitable info found (incomplete backfills?), reverting to up" << dendl;
+ dout(10) << "calc_acting no suitable info found (incomplete backfills?),"
+ << " reverting to up" << dendl;
want = up;
return true;
} else {
newest_update_osd_id = newest_update_osd->first;
// select primary
- map<int,pg_info_t>::const_iterator primary;
+ map<pg_shard_t,pg_info_t>::const_iterator primary;
if (up.size() &&
- !all_info[up[0]].is_incomplete() &&
- all_info[up[0]].last_update >= newest_update_osd->second.log_tail) {
- dout(10) << "up[0](osd." << up[0] << ") selected as primary" << dendl;
- primary = all_info.find(up[0]); // prefer up[0], all thing being equal
+ !all_info[up_primary].is_incomplete() &&
+ all_info[up_primary].last_update >= newest_update_osd->second.log_tail) {
+ dout(10) << "up_primary: " << up_primary << " selected as primary" << dendl;
+ primary = all_info.find(up_primary); // prefer up_primary, all things being equal
} else if (!newest_update_osd->second.is_incomplete()) {
dout(10) << "up[0] needs backfill, osd." << newest_update_osd_id
<< " selected as primary instead" << dendl;
primary = newest_update_osd;
} else {
- map<int, pg_info_t> complete_infos;
- for (map<int, pg_info_t>::iterator i = all_info.begin();
+ map<pg_shard_t, pg_info_t> complete_infos;
+ for (map<pg_shard_t, pg_info_t>::iterator i = all_info.begin();
i != all_info.end();
++i) {
if (!i->second.is_incomplete())
}
}
- for (map<int,pg_info_t>::const_iterator i = all_info.begin();
+ for (map<pg_shard_t,pg_info_t>::const_iterator i = all_info.begin();
i != all_info.end();
++i) {
if (usable >= get_osdmap()->get_pg_size(info.pgid))
}
}
+#endif
return true;
}
* calculate the desired acting, and request a change with the monitor
* if it differs from the current acting.
*/
-bool PG::choose_acting(int& newest_update_osd)
+bool PG::choose_acting(pg_shard_t &auth_log_shard)
{
- vector<int> want, backfill;
+ vector<int> want;
+ set<pg_shard_t> backfill;
- if (!calc_acting(newest_update_osd, want, backfill)) {
+ if (!calc_acting(auth_log_shard, want, backfill)) {
dout(10) << "choose_acting failed" << dendl;
assert(want_acting.empty());
return false;
// Determine if compatibility needed
bool compat_mode = !cct->_conf->osd_debug_override_acting_compat;
+ //TODOSAM: fix
+#if 0
if (compat_mode) {
bool all_support = true;
OSDMapRef osdmap = get_osdmap();
if (all_support)
compat_mode = false;
}
+#endif
if (compat_mode && !backfill.empty()) {
- backfill.resize(1);
+ pg_shard_t only_backfill(*backfill.begin());
+ backfill.clear();
+ backfill.insert(only_backfill);
}
// This might cause a problem if min_size is large
return false;
}
- if (compat_mode) {
- want.insert(want.end(), backfill.begin(), backfill.end());
+ if (compat_mode && backfill.size()) {
+ want.insert(want.end(), backfill.begin()->osd);
}
if (want != acting) {
// want is the same as crush map up OSDs.
assert(compat_mode || backfill.empty());
vector<int> empty;
- osd->queue_want_pg_temp(info.pgid, empty);
+ osd->queue_want_pg_temp(info.pgid.pgid, empty);
} else
- osd->queue_want_pg_temp(info.pgid, want);
+ osd->queue_want_pg_temp(info.pgid.pgid, want);
return false;
}
want_acting.clear();
// We can only get here when new interval has arrived and
// we've accepted the acting set. Now we can create
// actingbackfill and backfill_targets vectors.
- actingbackfill = acting;
+ actingbackfill.clear();
+ for (unsigned i = 0; i < acting.size(); ++i) {
+ if (acting[i] != -1) {
+ actingbackfill.insert(
+ pg_shard_t(
+ acting[i],
+ pool.info.ec_pool() ? i : ghobject_t::NO_SHARD));
+ }
+ }
if (!compat_mode)
- actingbackfill.insert(actingbackfill.end(), backfill.begin(), backfill.end());
+ actingbackfill.insert(backfill.begin(), backfill.end());
assert(backfill_targets.empty() || backfill_targets == backfill);
if (backfill_targets.empty()) {
backfill_targets = backfill;
- for (unsigned i = 0; i < backfill.size() ; ++i) {
- stray_set.erase(backfill[i]);
+ for (set<pg_shard_t>::iterator i = backfill.begin();
+ i != backfill.end();
+ ++i) {
+ stray_set.erase(*i);
}
} else {
// Will not change if already set because up would have had to change
assert(backfill_targets == backfill);
// Verify that nothing in backfill is in stray_set
- for (unsigned i = 0; i < backfill.size() ; ++i) {
- assert(stray_set.find(backfill[i]) == stray_set.end());
+ for (set<pg_shard_t>::iterator i = backfill.begin();
+ i != backfill.end();
+ ++i) {
+ assert(stray_set.find(*i) == stray_set.end());
}
}
dout(10) << "choose_acting want " << want << " (== acting) backfill_targets "
if (!interval.maybe_went_rw)
continue;
+ int i = 0;
std::vector<int>::const_iterator a = interval.acting.begin();
std::vector<int>::const_iterator a_end = interval.acting.end();
- for (; a != a_end; ++a) {
+ for (; a != a_end; ++a, ++i) {
if (*a != osd->whoami)
- might_have_unfound.insert(*a);
+ might_have_unfound.insert(
+ pg_shard_t(
+ *a,
+ pool.info.ec_pool() ? i : ghobject_t::NO_SHARD));
}
}
// include any (stray) peers
- for (map<int,pg_info_t>::iterator p = peer_info.begin();
+ for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
p != peer_info.end();
++p)
might_have_unfound.insert(p->first);
void PG::activate(ObjectStore::Transaction& t,
epoch_t query_epoch,
list<Context*>& tfin,
- map< int, map<pg_t,pg_query_t> >& query_map,
- map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > *activator_map)
+ map<int, map<spg_t,pg_query_t> >& query_map,
+ map<int,
+ vector<
+ pair<pg_notify_t,
+ pg_interval_map_t> > > *activator_map)
{
assert(!is_active());
assert(scrubber.callbacks.empty());
// TODOSAM: osd->osd-> is no good
osd->osd->replay_queue_lock.Lock();
- osd->osd->replay_queue.push_back(pair<pg_t,utime_t>(info.pgid, replay_until));
+ osd->osd->replay_queue.push_back(pair<spg_t,utime_t>(
+ info.pgid, replay_until));
osd->osd->replay_queue_lock.Unlock();
}
// start up replicas
assert(actingbackfill.size() > 0);
- for (unsigned i=1; i<actingbackfill.size(); i++) {
- int peer = actingbackfill[i];
+ for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+ i != actingbackfill.end();
+ ++i) {
+ if (*i == pg_whoami) continue;
+ pg_shard_t peer = *i;
assert(peer_info.count(peer));
pg_info_t& pi = peer_info[peer];
// empty log
if (!pi.is_empty() && activator_map) {
dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl;
- (*activator_map)[peer].push_back(
+ (*activator_map)[peer.osd].push_back(
make_pair(
pg_notify_t(
+ peer.shard, pg_whoami.shard,
get_osdmap()->get_epoch(),
get_osdmap()->get_epoch(),
info),
past_intervals));
} else {
dout(10) << "activate peer osd." << peer << " is up to date, but sending pg_log anyway" << dendl;
- m = new MOSDPGLog(get_osdmap()->get_epoch(), info);
+ m = new MOSDPGLog(
+ i->shard, pg_whoami.shard,
+ get_osdmap()->get_epoch(), info);
}
} else if (pg_log.get_tail() > pi.last_update || pi.last_backfill == hobject_t()) {
// backfill
pi.history = info.history;
pi.stats.stats.clear();
- m = new MOSDPGLog(get_osdmap()->get_epoch(), pi);
+ m = new MOSDPGLog(
+ i->shard, pg_whoami.shard,
+ get_osdmap()->get_epoch(), pi);
// send some recent log, so that op dup detection works well.
m->log.copy_up_to(pg_log.get_log(), cct->_conf->osd_min_pg_log_entries);
} else {
// catch up
assert(pg_log.get_tail() <= pi.last_update);
- m = new MOSDPGLog(get_osdmap()->get_epoch(), info);
+ m = new MOSDPGLog(
+ i->shard, pg_whoami.shard,
+ get_osdmap()->get_epoch(), info);
// send new stuff to append to replicas log
m->log.copy_after(pg_log.get_log(), pi.last_update);
}
if (m) {
dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl;
//m->log.print(cout);
- osd->send_message_osd_cluster(peer, m, get_osdmap()->get_epoch());
+ osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
}
// peer now has
}
// degraded?
- if (get_osdmap()->get_pg_size(info.pgid) > acting.size())
+ if (get_osdmap()->get_pg_size(info.pgid.pgid) > acting.size())
state_set(PG_STATE_DEGRADED);
// all clean?
if (pg_has_reset_since(e)) {
dout(10) << "_activate_committed " << e << ", that was an old interval" << dendl;
} else if (is_primary()) {
- peer_activated.insert(osd->whoami);
+ peer_activated.insert(pg_whoami);
dout(10) << "_activate_committed " << e << " peer_activated now " << peer_activated
<< " last_epoch_started " << info.history.last_epoch_started
<< " same_interval_since " << info.history.same_interval_since << dendl;
} else {
dout(10) << "_activate_committed " << e << " telling primary" << dendl;
MOSDPGInfo *m = new MOSDPGInfo(e);
- pg_notify_t i = pg_notify_t(get_osdmap()->get_epoch(),
- get_osdmap()->get_epoch(),
- info);
+ pg_notify_t i = pg_notify_t(
+ get_primary().shard, pg_whoami.shard,
+ get_osdmap()->get_epoch(),
+ get_osdmap()->get_epoch(),
+ info);
i.info.history.last_epoch_started = e;
m->pg_list.push_back(make_pair(i, pg_interval_map_t()));
- osd->send_message_osd_cluster(acting[0], m, get_osdmap()->get_epoch());
+ osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap()->get_epoch());
}
if (dirty_info) {
{
// only mark CLEAN if we have the desired number of replicas AND we
// are not remapped.
- if (acting.size() == get_osdmap()->get_pg_size(info.pgid) &&
+ if (acting.size() == get_osdmap()->get_pg_size(info.pgid.pgid) &&
up == acting)
state_set(PG_STATE_CLEAN);
}
void PG::split_ops(PG *child, unsigned split_bits) {
- unsigned match = child->info.pgid.m_seed;
+ unsigned match = child->info.pgid.ps();
assert(waiting_for_all_missing.empty());
assert(waiting_for_cache_not_full.empty());
assert(waiting_for_missing_object.empty());
child->snap_trimq = snap_trimq;
// There can't be recovery/backfill going on now
- get_osdmap()->pg_to_up_acting_osds(child->info.pgid, child->up, child->acting);
+ int primary, up_primary;
+ vector<int> newup, newacting;
+ get_osdmap()->pg_to_up_acting_osds(
+ child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
+ child->init_primary_up_acting(
+ newup,
+ newacting,
+ up_primary,
+ primary);
child->role = OSDMap::calc_pg_role(osd->whoami, child->acting);
if (get_primary() != child->get_primary())
child->info.history.same_primary_since = get_osdmap()->get_epoch();
dout(10) << "purge_strays " << stray_set << dendl;
bool removed = false;
- for (set<int>::iterator p = stray_set.begin();
+ for (set<pg_shard_t>::iterator p = stray_set.begin();
p != stray_set.end();
++p) {
- if (get_osdmap()->is_up(*p)) {
+ assert(!is_actingbackfill(*p));
+ if (get_osdmap()->is_up(p->osd)) {
dout(10) << "sending PGRemove to osd." << *p << dendl;
- vector<pg_t> to_remove;
- to_remove.push_back(info.pgid);
+ vector<spg_t> to_remove;
+ to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
MOSDPGRemove *m = new MOSDPGRemove(
get_osdmap()->get_epoch(),
to_remove);
- osd->send_message_osd_cluster(*p, m, get_osdmap()->get_epoch());
+ osd->send_message_osd_cluster(p->osd, m, get_osdmap()->get_epoch());
stray_purged.insert(*p);
} else {
dout(10) << "not sending PGRemove to down osd." << *p << dendl;
peer_missing_requested.clear();
}
-void PG::set_probe_targets(const set<int> &probe_set)
+void PG::set_probe_targets(const set<pg_shard_t> &probe_set)
{
Mutex::Locker l(heartbeat_peer_lock);
- probe_targets = probe_set;
+ probe_targets.clear();
+ for (set<pg_shard_t>::iterator i = probe_set.begin();
+ i != probe_set.end();
+ ++i) {
+ probe_targets.insert(i->osd);
+ }
}
void PG::clear_probe_targets()
new_peers.insert(acting[i]);
for (unsigned i=0; i<up.size(); i++)
new_peers.insert(up[i]);
- for (map<int,pg_info_t>::iterator p = peer_info.begin(); p != peer_info.end(); ++p)
- new_peers.insert(p->first);
+ for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
+ p != peer_info.end();
+ ++p)
+ new_peers.insert(p->first.osd);
}
bool need_update = false;
info.stats.ondisk_log_start = pg_log.get_tail();
// calc copies, degraded
- unsigned target = MAX(get_osdmap()->get_pg_size(info.pgid), actingbackfill.size());
+ unsigned target = MAX(
+ get_osdmap()->get_pg_size(info.pgid.pgid), actingbackfill.size());
info.stats.stats.calc_copies(target);
info.stats.stats.sum.num_objects_degraded = 0;
if ((is_degraded() || !is_clean()) && is_active()) {
degraded += pg_log.get_missing().num_missing();
assert(actingbackfill.size() > 0);
- for (unsigned i=1; i<actingbackfill.size(); i++) {
- assert(peer_missing.count(actingbackfill[i]));
+ for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+ i != actingbackfill.end();
+ ++i) {
+ if (*i == pg_whoami) continue;
+ assert(peer_missing.count(*i));
// in missing set
- degraded += peer_missing[actingbackfill[i]].num_missing();
+ degraded += peer_missing[*i].num_missing();
// not yet backfilled
- degraded += num_objects - peer_info[actingbackfill[i]].stats.stats.sum.num_objects;
+ degraded += num_objects - peer_info[*i].stats.stats.sum.num_objects;
}
info.stats.stats.sum.num_objects_degraded = degraded;
info.stats.stats.sum.num_objects_unfound = get_num_unfound();
* @param backfill true if info should be marked as backfill
* @param t transaction to write out our new state in
*/
-void PG::init(int role, vector<int>& newup, vector<int>& newacting,
- pg_history_t& history,
- pg_interval_map_t& pi,
- bool backfill,
- ObjectStore::Transaction *t)
+void PG::init(
+ int role,
+ vector<int>& newup, int new_up_primary,
+ vector<int>& newacting, int new_acting_primary,
+ pg_history_t& history,
+ pg_interval_map_t& pi,
+ bool backfill,
+ ObjectStore::Transaction *t)
{
dout(10) << "init role " << role << " up " << newup << " acting " << newacting
<< " history " << history
set_role(role);
acting = newacting;
up = newup;
+ init_primary_up_acting(
+ newup,
+ newacting,
+ new_up_primary,
+ new_acting_primary);
info.history = history;
past_intervals.swap(pi);
epoch_t PG::peek_map_epoch(ObjectStore *store, coll_t coll, hobject_t &infos_oid, bufferlist *bl)
{
assert(bl);
- pg_t pgid;
+ spg_t pgid;
snapid_t snap;
bool ok = coll.is_pg(pgid, snap);
assert(ok);
dout(10) << "trim_peers " << pg_trim_to << dendl;
if (pg_trim_to != eversion_t()) {
assert(actingbackfill.size() > 0);
- for (unsigned i=1; i<actingbackfill.size(); i++)
- osd->send_message_osd_cluster(actingbackfill[i],
- new MOSDPGTrim(get_osdmap()->get_epoch(), info.pgid,
- pg_trim_to),
- get_osdmap()->get_epoch());
+ for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+ i != actingbackfill.end();
+ ++i) {
+ if (*i == pg_whoami) continue;
+ osd->send_message_osd_cluster(
+ i->osd,
+ new MOSDPGTrim(
+ get_osdmap()->get_epoch(),
+ spg_t(info.pgid.pgid, i->shard),
+ pg_trim_to),
+ get_osdmap()->get_epoch());
+ }
}
}
dout(0) << "strftime failed" << dendl;
return "corrupt_log_unknown_time";
}
- info.pgid.print(buf + ret, MAX_BUF - ret);
- return buf;
+ string out(buf);
+ out += stringify(info.pgid);
+ return out;
}
int PG::read_info(
if (osd->inc_scrubs_pending()) {
dout(20) << "sched_scrub: reserved locally, reserving replicas" << dendl;
scrubber.reserved = true;
- scrubber.reserved_peers.insert(osd->whoami);
+ scrubber.reserved_peers.insert(pg_whoami);
scrub_reserve_replicas();
} else {
dout(20) << "sched_scrub: failed to reserve locally" << dendl;
op->mark_started();
- int from = m->get_source().num();
-
- dout(10) << " got osd." << from << " scrub map" << dendl;
+ dout(10) << " got " << m->from << " scrub map" << dendl;
bufferlist::iterator p = m->get_data().begin();
if (scrubber.is_chunky) { // chunky scrub
- scrubber.received_maps[from].decode(p, info.pgid.pool());
- dout(10) << "map version is " << scrubber.received_maps[from].valid_through << dendl;
+ scrubber.received_maps[m->from].decode(p, info.pgid.pool());
+ dout(10) << "map version is "
+ << scrubber.received_maps[m->from].valid_through
+ << dendl;
} else { // classic scrub
- if (scrubber.received_maps.count(from)) {
+ if (scrubber.received_maps.count(m->from)) {
ScrubMap incoming;
incoming.decode(p, info.pgid.pool());
- dout(10) << "from replica " << from << dendl;
+ dout(10) << "from replica " << m->from << dendl;
dout(10) << "map version is " << incoming.valid_through << dendl;
- scrubber.received_maps[from].merge_incr(incoming);
+ scrubber.received_maps[m->from].merge_incr(incoming);
} else {
- scrubber.received_maps[from].decode(p, info.pgid.pool());
+ scrubber.received_maps[m->from].decode(p, info.pgid.pool());
}
}
--scrubber.waiting_on;
- scrubber.waiting_on_whom.erase(from);
+ scrubber.waiting_on_whom.erase(m->from);
if (scrubber.waiting_on == 0) {
if (scrubber.is_chunky) { // chunky scrub
scrubber.finalizing = true;
scrub_gather_replica_maps();
++scrubber.waiting_on;
- scrubber.waiting_on_whom.insert(osd->whoami);
+ scrubber.waiting_on_whom.insert(pg_whoami);
osd->scrub_wq.queue(this);
}
}
}
// send scrub v2-compatible messages (classic scrub)
-void PG::_request_scrub_map_classic(int replica, eversion_t version)
+void PG::_request_scrub_map_classic(pg_shard_t replica, eversion_t version)
{
- assert(replica != osd->whoami);
+ assert(replica != pg_whoami);
dout(10) << "scrub requesting scrubmap from osd." << replica << dendl;
- MOSDRepScrub *repscrubop = new MOSDRepScrub(info.pgid, version,
- last_update_applied,
- get_osdmap()->get_epoch());
- osd->send_message_osd_cluster(replica, repscrubop, get_osdmap()->get_epoch());
+ MOSDRepScrub *repscrubop =
+ new MOSDRepScrub(
+ spg_t(info.pgid.pgid, replica.shard), version,
+ last_update_applied,
+ get_osdmap()->get_epoch());
+ osd->send_message_osd_cluster(
+ replica.osd, repscrubop, get_osdmap()->get_epoch());
}
// send scrub v3 messages (chunky scrub)
-void PG::_request_scrub_map(int replica, eversion_t version,
- hobject_t start, hobject_t end,
- bool deep)
+void PG::_request_scrub_map(
+ pg_shard_t replica, eversion_t version,
+ hobject_t start, hobject_t end,
+ bool deep)
{
- assert(replica != osd->whoami);
+ assert(replica != pg_whoami);
dout(10) << "scrub requesting scrubmap from osd." << replica << dendl;
- MOSDRepScrub *repscrubop = new MOSDRepScrub(info.pgid, version,
- get_osdmap()->get_epoch(),
- start, end, deep);
- osd->send_message_osd_cluster(replica, repscrubop, get_osdmap()->get_epoch());
+ MOSDRepScrub *repscrubop = new MOSDRepScrub(
+ spg_t(info.pgid.pgid, replica.shard), version,
+ get_osdmap()->get_epoch(),
+ start, end, deep);
+ osd->send_message_osd_cluster(
+ replica.osd, repscrubop, get_osdmap()->get_epoch());
}
void PG::sub_op_scrub_reserve(OpRequestRef op)
scrubber.reserved = osd->inc_scrubs_pending();
- MOSDSubOpReply *reply = new MOSDSubOpReply(m, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
+ MOSDSubOpReply *reply = new MOSDSubOpReply(
+ m, pg_whoami, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
::encode(scrubber.reserved, reply->get_data());
osd->send_message_osd_cluster(reply, m->get_connection());
}
op->mark_started();
- int from = reply->get_source().num();
+ pg_shard_t from = reply->from;
bufferlist::iterator p = reply->get_data().begin();
bool reserved;
::decode(reserved, p);
// see comment in sub_op_scrub_reserve
scrubber.reserved = false;
- MOSDSubOpReply *reply = new MOSDSubOpReply(m, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
+ MOSDSubOpReply *reply = new MOSDSubOpReply(
+ m, pg_whoami, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
osd->send_message_osd_cluster(reply, m->get_connection());
}
void PG::reject_reservation()
{
osd->send_message_osd_cluster(
- acting[0],
+ primary.osd,
new MBackfillReserve(
MBackfillReserve::REJECT,
- info.pgid,
+ spg_t(info.pgid.pgid, primary.shard),
get_osdmap()->get_epoch()),
get_osdmap()->get_epoch());
}
void PG::scrub_reserve_replicas()
{
assert(backfill_targets.empty());
- for (unsigned i=1; i<acting.size(); i++) {
- dout(10) << "scrub requesting reserve from osd." << acting[i] << dendl;
+ for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+ i != actingbackfill.end();
+ ++i) {
+ if (*i == pg_whoami) continue;
+ dout(10) << "scrub requesting reserve from osd." << *i << dendl;
vector<OSDOp> scrub(1);
scrub[0].op.op = CEPH_OSD_OP_SCRUB_RESERVE;
hobject_t poid;
eversion_t v;
osd_reqid_t reqid;
- MOSDSubOp *subop = new MOSDSubOp(reqid, info.pgid, poid, false, 0,
- get_osdmap()->get_epoch(), osd->get_tid(), v);
+ MOSDSubOp *subop = new MOSDSubOp(
+ reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, false, 0,
+ get_osdmap()->get_epoch(), osd->get_tid(), v);
subop->ops = scrub;
- osd->send_message_osd_cluster(acting[i], subop, get_osdmap()->get_epoch());
+ osd->send_message_osd_cluster(
+ i->osd, subop, get_osdmap()->get_epoch());
}
}
void PG::scrub_unreserve_replicas()
{
assert(backfill_targets.empty());
- for (unsigned i=1; i<acting.size(); i++) {
- dout(10) << "scrub requesting unreserve from osd." << acting[i] << dendl;
+ for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+ i != actingbackfill.end();
+ ++i) {
+ if (*i == pg_whoami) continue;
+ dout(10) << "scrub requesting unreserve from osd." << *i << dendl;
vector<OSDOp> scrub(1);
scrub[0].op.op = CEPH_OSD_OP_SCRUB_UNRESERVE;
hobject_t poid;
eversion_t v;
osd_reqid_t reqid;
- MOSDSubOp *subop = new MOSDSubOp(reqid, info.pgid, poid, false, 0,
- get_osdmap()->get_epoch(), osd->get_tid(), v);
+ MOSDSubOp *subop = new MOSDSubOp(
+ reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, false, 0,
+ get_osdmap()->get_epoch(), osd->get_tid(), v);
subop->ops = scrub;
- osd->send_message_osd_cluster(acting[i], subop, get_osdmap()->get_epoch());
+ osd->send_message_osd_cluster(i->osd, subop, get_osdmap()->get_epoch());
}
}
osd->store->collection_getattrs(coll, map.attrs);
}
-void PG::repair_object(const hobject_t& soid, ScrubMap::object *po, int bad_peer, int ok_peer)
+void PG::repair_object(
+ const hobject_t& soid, ScrubMap::object *po,
+ pg_shard_t bad_peer, pg_shard_t ok_peer)
{
- dout(10) << "repair_object " << soid << " bad_peer osd." << bad_peer << " ok_peer osd." << ok_peer << dendl;
+ dout(10) << "repair_object " << soid << " bad_peer osd."
+ << bad_peer << " ok_peer osd." << ok_peer << dendl;
eversion_t v;
bufferlist bv;
bv.push_back(po->attrs[OI_ATTR]);
object_info_t oi(bv);
- if (bad_peer != acting[0]) {
+ if (bad_peer != primary) {
peer_missing[bad_peer].add(soid, oi.version, eversion_t());
} else {
// We should only be scrubbing if the PG is clean.
hobject_t poid;
eversion_t v;
osd_reqid_t reqid;
- MOSDSubOp *subop = new MOSDSubOp(reqid, info.pgid, poid, false, 0,
- msg->map_epoch, osd->get_tid(), v);
+ MOSDSubOp *subop = new MOSDSubOp(
+ reqid,
+ pg_whoami,
+ spg_t(info.pgid.pgid, get_primary().shard),
+ poid,
+ false,
+ 0,
+ msg->map_epoch,
+ osd->get_tid(),
+ v);
::encode(map, subop->get_data());
subop->ops = scrub;
* last_update_applied == info.last_update)
*/
scrubber.waiting_on = acting.size();
- scrubber.waiting_on_whom.insert(acting.begin(), acting.end());
+ scrubber.waiting_on_whom.insert(
+ actingbackfill.begin(), actingbackfill.end());
+ scrubber.waiting_on_whom.erase(pg_whoami);
// request maps from replicas
- for (unsigned i=1; i<acting.size(); i++) {
- _request_scrub_map_classic(acting[i], eversion_t());
+ for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+ i != actingbackfill.end();
+ ++i) {
+ if (*i == pg_whoami) continue;
+ _request_scrub_map_classic(*i, eversion_t());
}
// Unlocks and relocks...
}
--scrubber.waiting_on;
- scrubber.waiting_on_whom.erase(osd->whoami);
+ scrubber.waiting_on_whom.erase(pg_whoami);
if (scrubber.waiting_on == 0) {
// the replicas have completed their scrub map, so lock out writes
// request incrementals from replicas
scrub_gather_replica_maps();
++scrubber.waiting_on;
- scrubber.waiting_on_whom.insert(osd->whoami);
+ scrubber.waiting_on_whom.insert(pg_whoami);
}
dout(10) << "clean up scrub" << dendl;
}
--scrubber.waiting_on;
- scrubber.waiting_on_whom.erase(osd->whoami);
+ scrubber.waiting_on_whom.erase(pg_whoami);
if (scrubber.waiting_on == 0) {
assert(last_update_applied == info.last_update);
osd->scrub_finalize_wq.queue(this);
}
// ask replicas to wait until last_update_applied >= scrubber.subset_last_update and then scan
- scrubber.waiting_on_whom.insert(osd->whoami);
+ scrubber.waiting_on_whom.insert(pg_whoami);
++scrubber.waiting_on;
// request maps from replicas
- for (unsigned i=1; i<acting.size(); i++) {
- _request_scrub_map(acting[i], scrubber.subset_last_update,
+ for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+ i != actingbackfill.end();
+ ++i) {
+ if (*i == pg_whoami) continue;
+ _request_scrub_map(*i, scrubber.subset_last_update,
scrubber.start, scrubber.end, scrubber.deep);
- scrubber.waiting_on_whom.insert(acting[i]);
+ scrubber.waiting_on_whom.insert(*i);
++scrubber.waiting_on;
}
}
--scrubber.waiting_on;
- scrubber.waiting_on_whom.erase(osd->whoami);
+ scrubber.waiting_on_whom.erase(pg_whoami);
scrubber.state = PG::Scrubber::WAIT_REPLICAS;
break;
assert(scrubber.waiting_on == 0);
assert(_lock.is_locked());
- for (map<int,ScrubMap>::iterator p = scrubber.received_maps.begin();
+ for (map<pg_shard_t, ScrubMap>::iterator p = scrubber.received_maps.begin();
p != scrubber.received_maps.end();
++p) {
}
}
-
-
void PG::scrub_compare_maps()
{
dout(10) << "scrub_compare_maps has maps, analyzing" << dendl;
stringstream ss;
// Map from object with errors to good peer
- map<hobject_t, int> authoritative;
- map<int,ScrubMap *> maps;
+ map<hobject_t, pg_shard_t> authoritative;
+ map<pg_shard_t, ScrubMap *> maps;
dout(2) << "scrub osd." << acting[0] << " has "
<< scrubber.primary_scrubmap.objects.size() << " items" << dendl;
- maps[0] = &scrubber.primary_scrubmap;
- for (unsigned i=1; i<acting.size(); i++) {
- dout(2) << "scrub osd." << acting[i] << " has "
- << scrubber.received_maps[acting[i]].objects.size() << " items" << dendl;
- maps[i] = &scrubber.received_maps[acting[i]];
+ maps[pg_whoami] = &scrubber.primary_scrubmap;
+
+ for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+ i != actingbackfill.end();
+ ++i) {
+ if (*i == pg_whoami) continue;
+ dout(2) << "scrub replica " << *i << " has "
+ << scrubber.received_maps[*i].objects.size()
+ << " items" << dendl;
+ maps[*i] = &scrubber.received_maps[*i];
}
get_pgbackend()->be_compare_scrubmaps(
osd->clog.error(ss);
}
- for (map<hobject_t, int>::iterator i = authoritative.begin();
+ for (map<hobject_t, pg_shard_t>::iterator i = authoritative.begin();
i != authoritative.end();
++i) {
scrubber.authoritative.insert(
make_pair(maps[i->second]->objects[i->first], i->second)));
}
- for (map<hobject_t, int>::iterator i = authoritative.begin();
+ for (map<hobject_t, pg_shard_t>::iterator i = authoritative.begin();
i != authoritative.end();
++i) {
authmap.objects.erase(i->first);
if (!scrubber.authoritative.empty() || !scrubber.inconsistent.empty()) {
stringstream ss;
- for (map<hobject_t, set<int> >::iterator obj =
+ for (map<hobject_t, set<pg_shard_t> >::iterator obj =
scrubber.inconsistent_snapcolls.begin();
obj != scrubber.inconsistent_snapcolls.end();
++obj) {
- for (set<int>::iterator j = obj->second.begin();
+ for (set<pg_shard_t>::iterator j = obj->second.begin();
j != obj->second.end();
++j) {
++scrubber.shallow_errors;
}
}
- ss << info.pgid << " " << mode << " " << scrubber.missing.size() << " missing, "
+ ss << info.pgid << " " << mode << " "
+ << scrubber.missing.size() << " missing, "
<< scrubber.inconsistent.size() << " inconsistent objects\n";
dout(2) << ss.str() << dendl;
osd->clog.error(ss);
if (repair) {
state_clear(PG_STATE_CLEAN);
- for (map<hobject_t, pair<ScrubMap::object, int> >::iterator i =
+ for (map<hobject_t, pair<ScrubMap::object, pg_shard_t> >::iterator i =
scrubber.authoritative.begin();
i != scrubber.authoritative.end();
++i) {
- set<int>::iterator j;
+ set<pg_shard_t>::iterator j;
if (scrubber.missing.count(i->first)) {
for (j = scrubber.missing[i->first].begin();
j != scrubber.missing[i->first].end();
++j) {
- repair_object(i->first,
+ repair_object(
+ i->first,
&(i->second.first),
- acting[*j],
- acting[i->second.second]);
+ *j,
+ i->second.second);
++scrubber.fixed;
}
}
++j) {
repair_object(i->first,
&(i->second.first),
- acting[*j],
- acting[i->second.second]);
+ *j,
+ i->second.second);
++scrubber.fixed;
}
}
// share new pg_info_t with replicas
assert(actingbackfill.size() > 0);
- for (unsigned i=1; i<actingbackfill.size(); i++) {
- int peer = actingbackfill[i];
- if (peer_info.count(i)) {
- peer_info[i].last_epoch_started = info.last_epoch_started;
- peer_info[i].history.merge(info.history);
+ for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+ i != actingbackfill.end();
+ ++i) {
+ if (*i == pg_whoami) continue;
+ pg_shard_t peer = *i;
+ if (peer_info.count(peer)) {
+ peer_info[peer].last_epoch_started = info.last_epoch_started;
+ peer_info[peer].history.merge(info.history);
}
MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch());
m->pg_list.push_back(
make_pair(
pg_notify_t(
+ peer.shard, pg_whoami.shard,
get_osdmap()->get_epoch(),
get_osdmap()->get_epoch(),
info),
pg_interval_map_t()));
- osd->send_message_osd_cluster(peer, m, get_osdmap()->get_epoch());
+ osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
}
}
dout(10) << __func__ << dendl;
assert(is_primary());
- vector<int>::const_iterator a = actingbackfill.begin();
+ set<pg_shard_t>::const_iterator a = actingbackfill.begin();
assert(a != actingbackfill.end());
- vector<int>::const_iterator end = actingbackfill.end();
- while (++a != end) {
- int peer(*a);
+ set<pg_shard_t>::const_iterator end = actingbackfill.end();
+ while (a != end) {
+ pg_shard_t peer(*a);
+ ++a;
+ if (peer == pg_whoami) continue;
pg_missing_t& pmissing(peer_missing[peer]);
pg_info_t& pinfo(peer_info[peer]);
- MOSDPGLog *m = new MOSDPGLog(info.last_update.epoch, info);
+ MOSDPGLog *m = new MOSDPGLog(
+ peer.shard, pg_whoami.shard,
+ info.last_update.epoch, info);
m->log.copy_after(pg_log.get_log(), pinfo.last_update);
for (list<pg_log_entry_t>::const_iterator i = m->log.log.begin();
}
pinfo.last_update = m->log.head;
- osd->send_message_osd_cluster(peer, m, get_osdmap()->get_epoch());
+ osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
}
}
reg_next_scrub();
}
-void PG::fulfill_info(int from, const pg_query_t &query,
- pair<int, pg_info_t> ¬ify_info)
+void PG::fulfill_info(
+ pg_shard_t from, const pg_query_t &query,
+ pair<pg_shard_t, pg_info_t> ¬ify_info)
{
- assert(!acting.empty());
- assert(from == acting[0]);
+ assert(from == primary);
assert(query.type == pg_query_t::INFO);
// info
notify_info = make_pair(from, info);
}
-void PG::fulfill_log(int from, const pg_query_t &query, epoch_t query_epoch)
+void PG::fulfill_log(
+ pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
{
- assert(!acting.empty());
- assert(from == acting[0]);
+ dout(10) << "log request from " << from << dendl;
+ assert(from == primary);
assert(query.type != pg_query_t::INFO);
- MOSDPGLog *mlog = new MOSDPGLog(get_osdmap()->get_epoch(),
- info, query_epoch);
+ MOSDPGLog *mlog = new MOSDPGLog(
+ from.shard, pg_whoami.shard,
+ get_osdmap()->get_epoch(),
+ info, query_epoch);
mlog->missing = pg_log.get_missing();
// primary -> other, when building master log
dout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
- ConnectionRef con = osd->get_con_osd_cluster(from, get_osdmap()->get_epoch());
+ ConnectionRef con = osd->get_con_osd_cluster(
+ from.osd, get_osdmap()->get_epoch());
if (con) {
- osd->osd->_share_map_outgoing(from, con.get(), get_osdmap());
+ osd->osd->_share_map_outgoing(from.osd, con.get(), get_osdmap());
osd->send_message_osd_cluster(mlog, con.get());
} else {
mlog->put();
}
/* Called before initializing peering during advance_map */
-void PG::start_peering_interval(const OSDMapRef lastmap,
- const vector<int>& newup,
- const vector<int>& newacting,
- ObjectStore::Transaction *t)
+void PG::start_peering_interval(
+ const OSDMapRef lastmap,
+ const vector<int>& newup, int new_up_primary,
+ const vector<int>& newacting, int new_acting_primary,
+ ObjectStore::Transaction *t)
{
const OSDMapRef osdmap = get_osdmap();
vector<int> oldacting, oldup;
int oldrole = get_role();
- int oldprimary = get_primary();
+
+ pg_shard_t oldprimary = get_primary();
bool was_old_primary = is_primary();
+
acting.swap(oldacting);
up.swap(oldup);
-
- up = newup;
- acting = newacting;
+ init_primary_up_acting(
+ newup,
+ newacting,
+ new_up_primary,
+ new_acting_primary);
if (info.stats.up != up ||
info.stats.acting != acting) {
state_clear(PG_STATE_REMAPPED);
int role = osdmap->calc_pg_role(osd->whoami, acting, acting.size());
- set_role(role);
+ if (role == pg_whoami.shard)
+ set_role(role);
+ else
+ set_role(-1);
// did acting, up, primary|acker change?
if (!lastmap) {
osdmap,
lastmap,
info.pgid.pool(),
- info.pgid,
+ info.pgid.pgid,
&past_intervals,
&debug);
dout(10) << __func__ << ": check_new_interval output: "
}
}
// make sure we clear out any pg_temp change requests
- osd->remove_want_pg_temp(info.pgid);
+ osd->remove_want_pg_temp(info.pgid.pgid);
cancel_recovery();
- if (acting.empty() && !up.empty() && up[0] == osd->whoami) {
+ if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
- osd->queue_want_pg_temp(info.pgid, acting);
+ osd->queue_want_pg_temp(info.pgid.pgid, acting);
}
}
void PG::queue_notify(epoch_t msg_epoch,
epoch_t query_epoch,
- int from, pg_notify_t& i)
+ pg_shard_t from, pg_notify_t& i)
{
- dout(10) << "notify " << i << " from osd." << from << dendl;
+ dout(10) << "notify " << i << " from replica " << from << dendl;
queue_peering_event(
CephPeeringEvtRef(new CephPeeringEvt(msg_epoch, query_epoch,
MNotifyRec(from, i))));
void PG::queue_info(epoch_t msg_epoch,
epoch_t query_epoch,
- int from, pg_info_t& i)
+ pg_shard_t from, pg_info_t& i)
{
- dout(10) << "info " << i << " from osd." << from << dendl;
+ dout(10) << "info " << i << " from replica " << from << dendl;
queue_peering_event(
CephPeeringEvtRef(new CephPeeringEvt(msg_epoch, query_epoch,
MInfoRec(from, i, msg_epoch))));
void PG::queue_log(epoch_t msg_epoch,
epoch_t query_epoch,
- int from,
+ pg_shard_t from,
MOSDPGLog *msg)
{
- dout(10) << "log " << *msg << " from osd." << from << dendl;
+ dout(10) << "log " << *msg << " from replica " << from << dendl;
queue_peering_event(
CephPeeringEvtRef(new CephPeeringEvt(msg_epoch, query_epoch,
MLogRec(from, msg))));
void PG::queue_query(epoch_t msg_epoch,
epoch_t query_epoch,
- int from, const pg_query_t& q)
+ pg_shard_t from, const pg_query_t& q)
{
- dout(10) << "handle_query " << q << " from osd." << from << dendl;
+ dout(10) << "handle_query " << q << " from replica " << from << dendl;
queue_peering_event(
CephPeeringEvtRef(new CephPeeringEvt(msg_epoch, query_epoch,
MQuery(from, q, query_epoch))));
}
-void PG::handle_advance_map(OSDMapRef osdmap, OSDMapRef lastmap,
- vector<int>& newup, vector<int>& newacting,
- RecoveryCtx *rctx)
+void PG::handle_advance_map(
+ OSDMapRef osdmap, OSDMapRef lastmap,
+ vector<int>& newup, int up_primary,
+ vector<int>& newacting, int acting_primary,
+ RecoveryCtx *rctx)
{
assert(lastmap->get_epoch() == osdmap_ref->get_epoch());
assert(lastmap == osdmap_ref);
- dout(10) << "handle_advance_map " << newup << "/" << newacting << dendl;
+ dout(10) << "handle_advance_map "
+ << newup << "/" << newacting
+ << " -- " << up_primary << "/" << acting_primary
+ << dendl;
update_osdmap_ref(osdmap);
pool.update(osdmap);
if (pool.info.last_change == osdmap_ref->get_epoch())
on_pool_change();
- AdvMap evt(osdmap, lastmap, newup, newacting);
+ AdvMap evt(
+ osdmap, lastmap, newup, up_primary,
+ newacting, acting_primary);
recovery_state.handle_event(evt, rctx);
}
// _before_ we are active.
pg->generate_past_intervals();
- pg->remove_down_peer_info(advmap.osdmap);
if (pg->acting_up_affected(advmap.newup, advmap.newacting) ||
pg->is_split(advmap.lastmap, advmap.osdmap)) {
dout(10) << "up or acting affected, calling start_peering_interval again"
<< dendl;
- pg->start_peering_interval(advmap.lastmap, advmap.newup, advmap.newacting,
- context< RecoveryMachine >().get_cur_transaction());
+ pg->start_peering_interval(
+ advmap.lastmap,
+ advmap.newup, advmap.up_primary,
+ advmap.newacting, advmap.acting_primary,
+ context< RecoveryMachine >().get_cur_transaction());
}
+ pg->remove_down_peer_info(advmap.osdmap);
return discard_event();
}
boost::statechart::result PG::RecoveryState::Reset::react(const ActMap&)
{
PG *pg = context< RecoveryMachine >().pg;
- if (pg->should_send_notify() && pg->get_primary() >= 0) {
- context< RecoveryMachine >().send_notify(pg->get_primary(),
- pg_notify_t(pg->get_osdmap()->get_epoch(),
- pg->get_osdmap()->get_epoch(),
- pg->info),
- pg->past_intervals);
+ if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
+ context< RecoveryMachine >().send_notify(
+ pg->get_primary(),
+ pg_notify_t(
+ pg->get_primary().shard, pg->pg_whoami.shard,
+ pg->get_osdmap()->get_epoch(),
+ pg->get_osdmap()->get_epoch(),
+ pg->info),
+ pg->past_intervals);
}
pg->update_heartbeat_peers();
assert(pg->want_acting.empty());
}
-boost::statechart::result PG::RecoveryState::Primary::react(const AdvMap &advmap)
-{
- PG *pg = context< RecoveryMachine >().pg;
- pg->remove_down_peer_info(advmap.osdmap);
- return forward_event();
-}
-
boost::statechart::result PG::RecoveryState::Primary::react(const MNotifyRec& notevt)
{
dout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
q.f->close_section();
q.f->open_array_section("probing_osds");
- for (set<int>::iterator p = prior_set->probe.begin(); p != prior_set->probe.end(); ++p)
- q.f->dump_int("osd", *p);
+ for (set<pg_shard_t>::iterator p = prior_set->probe.begin();
+ p != prior_set->probe.end();
+ ++p)
+ q.f->dump_stream("osd") << *p;
q.f->close_section();
if (prior_set->pg_down)
q.f->dump_string("blocked", "peering is blocked due to down osds");
q.f->open_array_section("down_osds_we_would_probe");
- for (set<int>::iterator p = prior_set->down.begin(); p != prior_set->down.end(); ++p)
+ for (set<int>::iterator p = prior_set->down.begin();
+ p != prior_set->down.end();
+ ++p)
q.f->dump_int("osd", *p);
q.f->close_section();
if (backfill_osd_it != context< Active >().sorted_backfill_set.end()) {
//The primary never backfills itself
- assert(*backfill_osd_it != pg->osd->whoami);
- ConnectionRef con = pg->osd->get_con_osd_cluster(*backfill_osd_it, pg->get_osdmap()->get_epoch());
+ assert(*backfill_osd_it != pg->pg_whoami);
+ ConnectionRef con = pg->osd->get_con_osd_cluster(
+ backfill_osd_it->osd, pg->get_osdmap()->get_epoch());
if (con) {
if (con->has_feature(CEPH_FEATURE_BACKFILL_RESERVATION)) {
unsigned priority = pg->is_degraded() ? OSDService::BACKFILL_HIGH
pg->osd->send_message_osd_cluster(
new MBackfillReserve(
MBackfillReserve::REQUEST,
- pg->info.pgid,
+ spg_t(pg->info.pgid.pgid, backfill_osd_it->shard),
pg->get_osdmap()->get_epoch(), priority),
con.get());
} else {
{
PG *pg = context< RecoveryMachine >().pg;
pg->osd->send_message_osd_cluster(
- pg->acting[0],
+ pg->primary.osd,
new MRecoveryReserve(
MRecoveryReserve::GRANT,
- pg->info.pgid,
+ spg_t(pg->info.pgid.pgid, pg->primary.shard),
pg->get_osdmap()->get_epoch()),
pg->get_osdmap()->get_epoch());
return transit<RepRecovering>();
{
PG *pg = context< RecoveryMachine >().pg;
pg->osd->send_message_osd_cluster(
- pg->acting[0],
+ pg->primary.osd,
new MBackfillReserve(
MBackfillReserve::GRANT,
- pg->info.pgid,
+ spg_t(pg->info.pgid.pgid, pg->primary.shard),
pg->get_osdmap()->get_epoch()),
pg->get_osdmap()->get_epoch());
return transit<RepRecovering>();
PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
: my_base(ctx),
NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
- acting_osd_it(context< Active >().sorted_acting_set.begin())
+ acting_osd_it(context< Active >().sorted_actingbackfill_set.begin())
{
context< RecoveryMachine >().log_enter(state_name);
post_event(RemoteRecoveryReserved());
PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
PG *pg = context< RecoveryMachine >().pg;
- if (acting_osd_it != context< Active >().sorted_acting_set.end()) {
+ if (acting_osd_it != context< Active >().sorted_actingbackfill_set.end()) {
// skip myself
- if (*acting_osd_it == pg->osd->whoami)
+ if (*acting_osd_it == pg->pg_whoami)
++acting_osd_it;
}
- if (acting_osd_it != context< Active >().sorted_acting_set.end()) {
- ConnectionRef con = pg->osd->get_con_osd_cluster(*acting_osd_it, pg->get_osdmap()->get_epoch());
+ if (acting_osd_it != context< Active >().sorted_actingbackfill_set.end()) {
+ ConnectionRef con = pg->osd->get_con_osd_cluster(
+ acting_osd_it->osd, pg->get_osdmap()->get_epoch());
if (con) {
if (con->has_feature(CEPH_FEATURE_RECOVERY_RESERVATION)) {
pg->osd->send_message_osd_cluster(
- new MRecoveryReserve(MRecoveryReserve::REQUEST,
- pg->info.pgid,
- pg->get_osdmap()->get_epoch()),
+ new MRecoveryReserve(
+ MRecoveryReserve::REQUEST,
+ spg_t(pg->info.pgid.pgid, acting_osd_it->shard),
+ pg->get_osdmap()->get_epoch()),
con.get());
} else {
post_event(RemoteRecoveryReserved());
assert(!pg->pg_log.get_missing().have_missing());
// release remote reservations
- for (set<int>::const_iterator i = context< Active >().sorted_acting_set.begin();
- i != context< Active >().sorted_acting_set.end();
+ for (set<pg_shard_t>::const_iterator i =
+ context< Active >().sorted_actingbackfill_set.begin();
+ i != context< Active >().sorted_actingbackfill_set.end();
++i) {
- if (*i == pg->osd->whoami) // skip myself
+ if (*i == pg->pg_whoami) // skip myself
continue;
- ConnectionRef con = pg->osd->get_con_osd_cluster(*i, pg->get_osdmap()->get_epoch());
+ ConnectionRef con = pg->osd->get_con_osd_cluster(
+ i->osd, pg->get_osdmap()->get_epoch());
if (con) {
if (con->has_feature(CEPH_FEATURE_RECOVERY_RESERVATION)) {
pg->osd->send_message_osd_cluster(
- new MRecoveryReserve(MRecoveryReserve::RELEASE,
- pg->info.pgid,
- pg->get_osdmap()->get_epoch()),
+ new MRecoveryReserve(
+ MRecoveryReserve::RELEASE,
+ spg_t(pg->info.pgid.pgid, i->shard),
+ pg->get_osdmap()->get_epoch()),
con.get());
}
}
: my_base(ctx),
NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/Recovered")
{
- int newest_update_osd;
+ pg_shard_t auth_log_shard;
context< RecoveryMachine >().log_enter(state_name);
// if we finished backfill, all acting are active; recheck if
// DEGRADED is appropriate.
assert(pg->actingbackfill.size() > 0);
- if (pg->get_osdmap()->get_pg_size(pg->info.pgid) <= pg->actingbackfill.size())
+ if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
+ pg->actingbackfill.size())
pg->state_clear(PG_STATE_DEGRADED);
// adjust acting set? (e.g. because backfill completed...)
- if (pg->acting != pg->up && !pg->choose_acting(newest_update_osd))
+ if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard))
assert(pg->want_acting.size());
assert(!pg->needs_recovery());
PG::RecoveryState::Active::Active(my_context ctx)
: my_base(ctx),
NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active"),
- sorted_acting_set(context< RecoveryMachine >().pg->actingbackfill.begin(),
- context< RecoveryMachine >().pg->actingbackfill.end()),
- sorted_backfill_set(context< RecoveryMachine >().pg->backfill_targets.begin(),
- context< RecoveryMachine >().pg->backfill_targets.end()),
+ sorted_actingbackfill_set(
+ context< RecoveryMachine >().pg->actingbackfill.begin(),
+ context< RecoveryMachine >().pg->actingbackfill.end()),
+ sorted_backfill_set(
+ context< RecoveryMachine >().pg->backfill_targets.begin(),
+ context< RecoveryMachine >().pg->backfill_targets.end()),
all_replicas_activated(false)
{
context< RecoveryMachine >().log_enter(state_name);
/* Check for changes in pool size (if the acting set changed as a result,
* this does not matter) */
- if (advmap.lastmap->get_pg_size(pg->info.pgid) !=
- pg->get_osdmap()->get_pg_size(pg->info.pgid)) {
- if (pg->get_osdmap()->get_pg_size(pg->info.pgid) <= pg->acting.size())
+ if (advmap.lastmap->get_pg_size(pg->info.pgid.pgid) !=
+ pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) {
+ if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->acting.size())
pg->state_clear(PG_STATE_DEGRADED);
else
pg->state_set(PG_STATE_DEGRADED);
{
q.f->open_array_section("might_have_unfound");
- for (set<int>::iterator p = pg->might_have_unfound.begin();
+ for (set<pg_shard_t>::iterator p = pg->might_have_unfound.begin();
p != pg->might_have_unfound.end();
++p) {
q.f->open_object_section("osd");
- q.f->dump_int("osd", *p);
+ q.f->dump_stream("osd") << *p;
if (pg->peer_missing.count(*p)) {
q.f->dump_string("status", "already probed");
} else if (pg->peer_missing_requested.count(*p)) {
q.f->dump_string("status", "querying");
- } else if (!pg->get_osdmap()->is_up(*p)) {
+ } else if (!pg->get_osdmap()->is_up(p->osd)) {
q.f->dump_string("status", "osd is down");
} else {
q.f->dump_string("status", "not queried");
q.f->dump_int("scrubber.waiting_on", pg->scrubber.waiting_on);
{
q.f->open_array_section("scrubber.waiting_on_whom");
- for (set<int>::iterator p = pg->scrubber.waiting_on_whom.begin();
+ for (set<pg_shard_t>::iterator p = pg->scrubber.waiting_on_whom.begin();
p != pg->scrubber.waiting_on_whom.end();
++p) {
- q.f->dump_int("osd", *p);
+ q.f->dump_stream("shard") << *p;
}
q.f->close_section();
}
const Activate& actevt) {
dout(10) << "In ReplicaActive, about to call activate" << dendl;
PG *pg = context< RecoveryMachine >().pg;
- map< int, map< pg_t, pg_query_t> > query_map;
+ map<int, map<spg_t, pg_query_t> > query_map;
pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
actevt.query_epoch,
*context< RecoveryMachine >().get_on_safe_context_list(),
boost::statechart::result PG::RecoveryState::ReplicaActive::react(const ActMap&)
{
PG *pg = context< RecoveryMachine >().pg;
- if (pg->should_send_notify() && pg->get_primary() >= 0) {
- context< RecoveryMachine >().send_notify(pg->get_primary(),
- pg_notify_t(pg->get_osdmap()->get_epoch(),
- pg->get_osdmap()->get_epoch(),
- pg->info),
- pg->past_intervals);
+ if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
+ context< RecoveryMachine >().send_notify(
+ pg->get_primary(),
+ pg_notify_t(
+ pg->get_primary().shard, pg->pg_whoami.shard,
+ pg->get_osdmap()->get_epoch(),
+ pg->get_osdmap()->get_epoch(),
+ pg->info),
+ pg->past_intervals);
}
pg->take_waiters();
return discard_event();
{
PG *pg = context< RecoveryMachine >().pg;
if (query.query.type == pg_query_t::INFO) {
- pair<int, pg_info_t> notify_info;
+ pair<pg_shard_t, pg_info_t> notify_info;
pg->update_history_from_master(query.query.history);
pg->fulfill_info(query.from, query.query, notify_info);
- context< RecoveryMachine >().send_notify(notify_info.first,
- pg_notify_t(query.query_epoch,
- pg->get_osdmap()->get_epoch(),
- notify_info.second),
- pg->past_intervals);
+ context< RecoveryMachine >().send_notify(
+ notify_info.first,
+ pg_notify_t(
+ notify_info.first.shard, pg->pg_whoami.shard,
+ query.query_epoch,
+ pg->get_osdmap()->get_epoch(),
+ notify_info.second),
+ pg->past_intervals);
} else {
pg->fulfill_log(query.from, query.query, query.query_epoch);
}
boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&)
{
PG *pg = context< RecoveryMachine >().pg;
- if (pg->should_send_notify() && pg->get_primary() >= 0) {
- context< RecoveryMachine >().send_notify(pg->get_primary(),
- pg_notify_t(pg->get_osdmap()->get_epoch(),
- pg->get_osdmap()->get_epoch(),
- pg->info),
- pg->past_intervals);
+ if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
+ context< RecoveryMachine >().send_notify(
+ pg->get_primary(),
+ pg_notify_t(
+ pg->get_primary().shard, pg->pg_whoami.shard,
+ pg->get_osdmap()->get_epoch(),
+ pg->get_osdmap()->get_epoch(),
+ pg->info),
+ pg->past_intervals);
}
pg->take_waiters();
return discard_event();
PG *pg = context< RecoveryMachine >().pg;
auto_ptr<PriorSet> &prior_set = context< Peering >().prior_set;
- for (set<int>::const_iterator it = prior_set->probe.begin();
+ for (set<pg_shard_t>::const_iterator it = prior_set->probe.begin();
it != prior_set->probe.end();
++it) {
- int peer = *it;
- if (peer == pg->osd->whoami) {
+ pg_shard_t peer = *it;
+ if (peer == pg->pg_whoami) {
continue;
}
if (pg->peer_info.count(peer)) {
}
if (peer_info_requested.count(peer)) {
dout(10) << " already requested info from osd." << peer << dendl;
- } else if (!pg->get_osdmap()->is_up(peer)) {
+ } else if (!pg->get_osdmap()->is_up(peer.osd)) {
dout(10) << " not querying info from down osd." << peer << dendl;
} else {
dout(10) << " querying info from osd." << peer << dendl;
context< RecoveryMachine >().send_query(
peer, pg_query_t(pg_query_t::INFO,
+ it->shard, pg->pg_whoami.shard,
pg->info.history,
pg->get_osdmap()->get_epoch()));
peer_info_requested.insert(peer);
boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& infoevt)
{
- set<int>::iterator p = peer_info_requested.find(infoevt.from);
+ set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
if (p != peer_info_requested.end())
peer_info_requested.erase(p);
// filter out any osds that got dropped from the probe set from
// peer_info_requested. this is less expensive than restarting
// peering (which would re-probe everyone).
- set<int>::iterator p = peer_info_requested.begin();
+ set<pg_shard_t>::iterator p = peer_info_requested.begin();
while (p != peer_info_requested.end()) {
if (prior_set->probe.count(*p) == 0) {
dout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
bool any_down_now = false;
for (unsigned i=0; i<interval.acting.size(); i++) {
int o = interval.acting[i];
+ pg_shard_t so(o, pg->pool.info.ec_pool() ? i : ghobject_t::NO_SHARD);
if (!osdmap->exists(o) || osdmap->get_info(o).lost_at > interval.first)
continue; // dne or lost
if (osdmap->is_up(o)) {
pg_info_t *pinfo;
- if (o == pg->osd->whoami) {
+ if (so == pg->pg_whoami) {
pinfo = &pg->info;
} else {
- assert(pg->peer_info.count(o));
- pinfo = &pg->peer_info[o];
+ assert(pg->peer_info.count(so));
+ pinfo = &pg->peer_info[so];
}
if (!pinfo->is_incomplete())
any_up_complete_now = true;
q.f->dump_stream("enter_time") << enter_time;
q.f->open_array_section("requested_info_from");
- for (set<int>::iterator p = peer_info_requested.begin(); p != peer_info_requested.end(); ++p) {
+ for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
+ p != peer_info_requested.end();
+ ++p) {
q.f->open_object_section("osd");
- q.f->dump_int("osd", *p);
+ q.f->dump_stream("osd") << *p;
if (pg->peer_info.count(*p)) {
q.f->open_object_section("got_info");
pg->peer_info[*p].dump(q.f);
/*------GetLog------------*/
PG::RecoveryState::GetLog::GetLog(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Peering/GetLog"),
- newest_update_osd(-1), msg(0)
+ NamedState(
+ context< RecoveryMachine >().pg->cct, "Started/Primary/Peering/GetLog"),
+ msg(0)
{
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
// adjust acting?
- if (!pg->choose_acting(newest_update_osd)) {
+ if (!pg->choose_acting(auth_log_shard)) {
if (!pg->want_acting.empty()) {
post_event(NeedActingChange());
} else {
}
// am i the best?
- if (newest_update_osd == pg->osd->whoami) {
+ if (auth_log_shard == pg->pg_whoami) {
post_event(GotLog());
return;
}
- const pg_info_t& best = pg->peer_info[newest_update_osd];
+ const pg_info_t& best = pg->peer_info[auth_log_shard];
// am i broken?
if (pg->info.last_update < best.log_tail) {
- dout(10) << " not contiguous with osd." << newest_update_osd << ", down" << dendl;
+ dout(10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
post_event(IsIncomplete());
return;
}
// how much log to request?
eversion_t request_log_from = pg->info.last_update;
assert(pg->actingbackfill.size() > 0);
- for (vector<int>::iterator p = pg->actingbackfill.begin() + 1;
- p != pg->actingbackfill.end(); ++p) {
+ for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
+ p != pg->actingbackfill.end();
+ ++p) {
+ if (*p == pg->pg_whoami) continue;
pg_info_t& ri = pg->peer_info[*p];
if (ri.last_update >= best.log_tail && ri.last_update < request_log_from)
request_log_from = ri.last_update;
}
// how much?
- dout(10) << " requesting log from osd." << newest_update_osd << dendl;
+ dout(10) << " requesting log from osd." << auth_log_shard << dendl;
context<RecoveryMachine>().send_query(
- newest_update_osd,
- pg_query_t(pg_query_t::LOG, request_log_from, pg->info.history,
- pg->get_osdmap()->get_epoch()));
+ auth_log_shard,
+ pg_query_t(
+ pg_query_t::LOG,
+ auth_log_shard.shard, pg->pg_whoami.shard,
+ request_log_from, pg->info.history,
+ pg->get_osdmap()->get_epoch()));
}
boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap)
// make sure our log source didn't go down. we need to check
// explicitly because it may not be part of the prior set, which
// means the Peering state check won't catch it going down.
- if (!advmap.osdmap->is_up(newest_update_osd)) {
- dout(10) << "GetLog: newest_update_osd osd." << newest_update_osd << " went down" << dendl;
+ if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
+ dout(10) << "GetLog: auth_log_shard osd."
+ << auth_log_shard.osd << " went down" << dendl;
post_event(advmap);
return transit< Reset >();
}
boost::statechart::result PG::RecoveryState::GetLog::react(const MLogRec& logevt)
{
assert(!msg);
- if (logevt.from != newest_update_osd) {
+ if (logevt.from != auth_log_shard) {
dout(10) << "GetLog: discarding log from "
- << "non-newest_update_osd osd." << logevt.from << dendl;
+ << "non-auth_log_shard osd." << logevt.from << dendl;
return discard_event();
}
dout(10) << "GetLog: recieved master log from osd"
dout(10) << "processing master log" << dendl;
pg->proc_master_log(*context<RecoveryMachine>().get_cur_transaction(),
msg->info, msg->log, msg->missing,
- newest_update_osd);
+ auth_log_shard);
}
pg->start_flush(
context< RecoveryMachine >().get_cur_transaction(),
q.f->open_object_section("state");
q.f->dump_string("name", state_name);
q.f->dump_stream("enter_time") << enter_time;
- q.f->dump_int("newest_update_osd", newest_update_osd);
+ q.f->dump_stream("auth_log_shard") << auth_log_shard;
q.f->close_section();
return forward_event();
}
PG *pg = context< RecoveryMachine >().pg;
assert(pg->actingbackfill.size() > 0);
- for (vector<int>::iterator i = pg->actingbackfill.begin() + 1;
+ for (set<pg_shard_t>::iterator i = pg->actingbackfill.begin();
i != pg->actingbackfill.end();
++i) {
+ if (*i == pg->get_primary()) continue;
const pg_info_t& pi = pg->peer_info[*i];
if (pi.is_empty())
dout(10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
context< RecoveryMachine >().send_query(
*i,
- pg_query_t(pg_query_t::LOG, since, pg->info.history,
- pg->get_osdmap()->get_epoch()));
+ pg_query_t(
+ pg_query_t::LOG,
+ i->shard, pg->pg_whoami.shard,
+ since, pg->info.history,
+ pg->get_osdmap()->get_epoch()));
} else {
dout(10) << " requesting fulllog+missing from osd." << *i
<< " (want since " << since << " < log.tail " << pi.log_tail << ")"
<< dendl;
context< RecoveryMachine >().send_query(
- *i, pg_query_t(pg_query_t::FULLLOG,
- pg->info.history, pg->get_osdmap()->get_epoch()));
+ *i, pg_query_t(
+ pg_query_t::FULLLOG,
+ i->shard, pg->pg_whoami.shard,
+ pg->info.history, pg->get_osdmap()->get_epoch()));
}
peer_missing_requested.insert(*i);
}
q.f->dump_stream("enter_time") << enter_time;
q.f->open_array_section("peer_missing_requested");
- for (set<int>::iterator p = peer_missing_requested.begin(); p != peer_missing_requested.end(); ++p) {
+ for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
+ p != peer_missing_requested.end();
+ ++p) {
q.f->open_object_section("osd");
- q.f->dump_int("osd", *p);
+ q.f->dump_stream("osd") << *p;
if (pg->peer_missing.count(*p)) {
q.f->open_object_section("got_missing");
pg->peer_missing[*p].dump(q.f);
#undef dout_prefix
#define dout_prefix (*_dout << (debug_pg ? debug_pg->gen_prefix() : string()) << " PriorSet: ")
-PG::PriorSet::PriorSet(const OSDMap &osdmap,
+PG::PriorSet::PriorSet(bool ec_pool,
+ const OSDMap &osdmap,
const map<epoch_t, pg_interval_t> &past_intervals,
const vector<int> &up,
const vector<int> &acting,
const pg_info_t &info,
const PG *debug_pg)
- : pg_down(false)
+ : ec_pool(ec_pool), pg_down(false)
{
/*
* We have to be careful to gracefully deal with situations like
// so that we know what they do/do not have explicitly before
// sending them any new info/logs/whatever.
for (unsigned i=0; i<acting.size(); i++)
- probe.insert(acting[i]);
+ probe.insert(pg_shard_t(acting[i], ec_pool ? i : ghobject_t::NO_SHARD));
  // It may be possible to exclude the up nodes, but let's keep them in
// there for now.
for (unsigned i=0; i<up.size(); i++)
- probe.insert(up[i]);
+ probe.insert(pg_shard_t(up[i], ec_pool ? i : ghobject_t::NO_SHARD));
for (map<epoch_t,pg_interval_t>::const_reverse_iterator p = past_intervals.rbegin();
p != past_intervals.rend();
// consider ACTING osds
for (unsigned i=0; i<interval.acting.size(); i++) {
int o = interval.acting[i];
+ pg_shard_t so(o, ec_pool ? i : ghobject_t::NO_SHARD);
const osd_info_t *pinfo = 0;
if (osdmap.exists(o))
if (osdmap.is_up(o)) {
// include past acting osds if they are up.
- probe.insert(o);
+ probe.insert(so);
any_up_now = true;
} else if (!pinfo) {
dout(10) << "build_prior prior osd." << o << " no longer exists" << dendl;
++i) {
if (osdmap.exists(*i) && // if it doesn't exist, we already consider it lost.
osdmap.is_down(*i)) {
- probe.insert(*i);
pg_down = true;
// make note of when any down osd in the cur set was lost, so that
// true if the given map affects the prior set
bool PG::PriorSet::affected_by_map(const OSDMapRef osdmap, const PG *debug_pg) const
{
- for (set<int>::iterator p = probe.begin();
+ for (set<pg_shard_t>::iterator p = probe.begin();
p != probe.end();
++p) {
- int o = *p;
+ int o = p->osd;
// did someone in the prior set go down?
if (osdmap->is_down(o) && down.count(o) == 0) {
}
// did a down osd in cur get (re)marked as lost?
- map<int,epoch_t>::const_iterator r = blocked_by.find(o);
+ map<int, epoch_t>::const_iterator r = blocked_by.find(o);
if (r != blocked_by.end()) {
if (!osdmap->exists(o)) {
dout(10) << "affected_by_map osd." << o << " no longer exists" << dendl;
const coll_t coll;
PGLog pg_log;
- static string get_info_key(pg_t pgid) {
+ static string get_info_key(spg_t pgid) {
return stringify(pgid) + "_info";
}
- static string get_biginfo_key(pg_t pgid) {
+ static string get_biginfo_key(spg_t pgid) {
return stringify(pgid) + "_biginfo";
}
- static string get_epoch_key(pg_t pgid) {
+ static string get_epoch_key(spg_t pgid) {
return stringify(pgid) + "_epoch";
}
hobject_t log_oid;
hobject_t biginfo_oid;
- map<hobject_t, set<int> > missing_loc;
- set<int> missing_loc_sources; // superset of missing_loc locations
+ map<hobject_t, set<pg_shard_t> > missing_loc;
+ set<pg_shard_t> missing_loc_sources; // superset of missing_loc locations
interval_set<snapid_t> snap_collections; // obsolete
map<epoch_t,pg_interval_t> past_intervals;
* (if they have one) */
xlist<PG*>::item recovery_item, scrub_item, scrub_finalize_item, snap_trim_item, stat_queue_item;
int recovery_ops_active;
- set<int> waiting_on_backfill;
+ set<pg_shard_t> waiting_on_backfill;
#ifdef DEBUG_RECOVERY_OIDS
set<hobject_t> recovering_oids;
#endif
// primary state
public:
- vector<int> up, acting, want_acting, actingbackfill;
- map<int,eversion_t> peer_last_complete_ondisk;
+ pg_shard_t primary;
+ pg_shard_t pg_whoami;
+ pg_shard_t up_primary;
+ vector<int> up, acting, want_acting;
+ set<pg_shard_t> actingbackfill;
+ map<pg_shard_t,eversion_t> peer_last_complete_ondisk;
eversion_t min_last_complete_ondisk; // up: min over last_complete_ondisk, peer_last_complete_ondisk
eversion_t pg_trim_to;
// [primary only] content recovery state
protected:
struct PriorSet {
- set<int> probe; /// current+prior OSDs we need to probe.
+ const bool ec_pool;
+ set<pg_shard_t> probe; /// current+prior OSDs we need to probe.
set<int> down; /// down osds that would normally be in @a probe and might be interesting.
- map<int,epoch_t> blocked_by; /// current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
+ map<int, epoch_t> blocked_by; /// current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
bool pg_down; /// some down osds are included in @a cur; the DOWN pg state bit should be set.
- PriorSet(const OSDMap &osdmap,
+ PriorSet(bool ec_pool,
+ const OSDMap &osdmap,
const map<epoch_t, pg_interval_t> &past_intervals,
const vector<int> &up,
const vector<int> &acting,
public:
struct RecoveryCtx {
utime_t start_time;
- map< int, map<pg_t, pg_query_t> > *query_map;
- map< int, vector<pair<pg_notify_t, pg_interval_map_t> > > *info_map;
- map< int, vector<pair<pg_notify_t, pg_interval_map_t> > > *notify_list;
+ map<int, map<spg_t, pg_query_t> > *query_map;
+ map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > *info_map;
+ map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > *notify_list;
C_Contexts *on_applied;
C_Contexts *on_safe;
ObjectStore::Transaction *transaction;
- RecoveryCtx(map< int, map<pg_t, pg_query_t> > *query_map,
- map< int, vector<pair<pg_notify_t, pg_interval_map_t> > > *info_map,
- map< int, vector<pair<pg_notify_t, pg_interval_map_t> > > *notify_list,
+ RecoveryCtx(map<int, map<spg_t, pg_query_t> > *query_map,
+ map<int,
+ vector<pair<pg_notify_t, pg_interval_map_t> > > *info_map,
+ map<int,
+ vector<pair<pg_notify_t, pg_interval_map_t> > > *notify_list,
C_Contexts *on_applied,
C_Contexts *on_safe,
ObjectStore::Transaction *transaction)
*/
bool need_up_thru;
- set<int> stray_set; // non-acting osds that have PG data.
+ set<pg_shard_t> stray_set; // non-acting osds that have PG data.
eversion_t oldest_update; // acting: lowest (valid) last_update in active set
- map<int,pg_info_t> peer_info; // info from peers (stray or prior)
- set<int> peer_purged; // peers purged
- map<int,pg_missing_t> peer_missing;
- set<int> peer_log_requested; // logs i've requested (and start stamps)
- set<int> peer_missing_requested;
- set<int> stray_purged; // i deleted these strays; ignore racing PGInfo from them
- set<int> peer_activated;
+ map<pg_shard_t, pg_info_t> peer_info; // info from peers (stray or prior)
+ set<pg_shard_t> peer_purged; // peers purged
+ map<pg_shard_t, pg_missing_t> peer_missing;
+ set<pg_shard_t> peer_log_requested; // logs i've requested (and start stamps)
+ set<pg_shard_t> peer_missing_requested;
+
+ // i deleted these strays; ignore racing PGInfo from them
+ set<pg_shard_t> stray_purged;
+ set<pg_shard_t> peer_activated;
// primary-only, recovery-only state
- set<int> might_have_unfound; // These osds might have objects on them
- // which are unfound on the primary
+ set<pg_shard_t> might_have_unfound; // These osds might have objects on them
+ // which are unfound on the primary
epoch_t last_peering_reset;
/* heartbeat peers */
- void set_probe_targets(const set<int> &probe_set);
+ void set_probe_targets(const set<pg_shard_t> &probe_set);
void clear_probe_targets();
public:
Mutex heartbeat_peer_lock;
};
BackfillInterval backfill_info;
- map<int, BackfillInterval> peer_backfill_info;
+ map<pg_shard_t, BackfillInterval> peer_backfill_info;
bool backfill_reserved;
bool backfill_reserving;
friend class OSD;
public:
- vector<int> backfill_targets;
+ set<pg_shard_t> backfill_targets;
- bool is_backfill_targets(int osd) {
- if (std::find(backfill_targets.begin(), backfill_targets.end(), osd)
- != backfill_targets.end())
- return true;
- else
- return false;
+ bool is_backfill_targets(pg_shard_t osd) {
+ return backfill_targets.count(osd);
}
protected:
void clear_primary_state();
public:
- bool is_acting(int osd) const {
- for (unsigned i=0; i<acting.size(); i++)
- if (acting[i] == osd) return true;
- return false;
+ bool is_actingbackfill(pg_shard_t osd) const {
+ return actingbackfill.count(osd);
}
- bool is_up(int osd) const {
- for (unsigned i=0; i<up.size(); i++)
- if (up[i] == osd) return true;
- return false;
- }
- bool is_actingbackfill(int osd) const {
- for (unsigned i=0; i<actingbackfill.size(); i++)
- if (actingbackfill[i] == osd) return true;
- return false;
+ bool is_acting(pg_shard_t osd) const {
+ if (pool.info.ec_pool()) {
+ return acting.size() > osd.shard && acting[osd.shard] == osd.osd;
+ } else {
+ return std::find(acting.begin(), acting.end(), osd.osd) != acting.end();
+ }
}
bool needs_recovery() const;
bool calc_min_last_complete_ondisk() {
eversion_t min = last_complete_ondisk;
assert(actingbackfill.size() > 0);
- for (unsigned i=1; i<actingbackfill.size(); i++) {
- if (peer_last_complete_ondisk.count(actingbackfill[i]) == 0)
+ for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+ i != actingbackfill.end();
+ ++i) {
+ if (*i == get_primary()) continue;
+ if (peer_last_complete_ondisk.count(*i) == 0)
return false; // we don't have complete info
- eversion_t a = peer_last_complete_ondisk[actingbackfill[i]];
+ eversion_t a = peer_last_complete_ondisk[*i];
if (a < min)
min = a;
}
virtual void calc_trim_to() = 0;
void proc_replica_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
- pg_missing_t& omissing, int from);
+ pg_missing_t& omissing, pg_shard_t from);
void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
- pg_missing_t& omissing, int from);
- bool proc_replica_info(int from, const pg_info_t &info);
+ pg_missing_t& omissing, pg_shard_t from);
+ bool proc_replica_info(pg_shard_t from, const pg_info_t &info);
struct LogEntryTrimmer : public ObjectModDesc::Visitor {
ObjectStore::Transaction *t, const hobject_t &soid);
void remove_snap_mapped_object(
ObjectStore::Transaction& t, const hobject_t& soid);
- void merge_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, int from);
+ void merge_log(
+ ObjectStore::Transaction& t, pg_info_t &oinfo,
+ pg_log_t &olog, pg_shard_t from);
void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead);
- bool search_for_missing(const pg_info_t &oinfo, const pg_missing_t *omissing,
- int fromosd);
+ bool search_for_missing(
+ const pg_info_t &oinfo, const pg_missing_t *omissing,
+ pg_shard_t fromosd);
void check_for_lost_objects();
void forget_lost_objects();
- void discover_all_missing(std::map< int, map<pg_t,pg_query_t> > &query_map);
+ void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map);
void trim_write_ahead();
- map<int, pg_info_t>::const_iterator find_best_info(const map<int, pg_info_t> &infos) const;
- bool calc_acting(int& newest_update_osd, vector<int>& want, vector<int>& backfill) const;
- bool choose_acting(int& newest_update_osd);
+ map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
+ const map<pg_shard_t, pg_info_t> &infos) const;
+ bool calc_acting(
+ pg_shard_t &auth_log_shard,
+ vector<int> &want,
+ set<pg_shard_t> &backfill) const;
+ bool choose_acting(pg_shard_t &auth_log_shard);
void build_might_have_unfound();
void replay_queued_ops();
- void activate(ObjectStore::Transaction& t,
- epoch_t query_epoch,
- list<Context*>& tfin,
- map< int, map<pg_t,pg_query_t> >& query_map,
- map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > *activator_map=0);
+ void activate(
+ ObjectStore::Transaction& t,
+ epoch_t query_epoch,
+ list<Context*>& tfin,
+ map<int, map<spg_t,pg_query_t> >& query_map,
+ map<int,
+ vector<pair<pg_notify_t, pg_interval_map_t> > > *activator_map=0);
void _activate_committed(epoch_t e);
void all_activated_and_committed();
}
// metadata
- set<int> reserved_peers;
+ set<pg_shard_t> reserved_peers;
bool reserved, reserve_failed;
epoch_t epoch_start;
bool active;
bool queue_snap_trim;
int waiting_on;
- set<int> waiting_on_whom;
+ set<pg_shard_t> waiting_on_whom;
int shallow_errors;
int deep_errors;
int fixed;
ScrubMap primary_scrubmap;
- map<int,ScrubMap> received_maps;
+ map<pg_shard_t, ScrubMap> received_maps;
MOSDRepScrub *active_rep_scrub;
utime_t scrub_reg_stamp; // stamp we registered for
bool must_scrub, must_deep_scrub, must_repair;
// Maps from objects with errors to missing/inconsistent peers
- map<hobject_t, set<int> > missing;
- map<hobject_t, set<int> > inconsistent;
- map<hobject_t, set<int> > inconsistent_snapcolls;
+ map<hobject_t, set<pg_shard_t> > missing;
+ map<hobject_t, set<pg_shard_t> > inconsistent;
+ map<hobject_t, set<pg_shard_t> > inconsistent_snapcolls;
// Map from object with errors to good peer
- map<hobject_t, pair<ScrubMap::object, int> > authoritative;
+ map<hobject_t, pair<ScrubMap::object, pg_shard_t> > authoritative;
// classic scrub
bool classic;
int active_pushes;
- void repair_object(const hobject_t& soid, ScrubMap::object *po, int bad_peer, int ok_peer);
+ void repair_object(
+ const hobject_t& soid, ScrubMap::object *po,
+ pg_shard_t bad_peer,
+ pg_shard_t ok_peer);
+
void scrub(ThreadPool::TPHandle &handle);
void classic_scrub(ThreadPool::TPHandle &handle);
void chunky_scrub(ThreadPool::TPHandle &handle);
void scrub_clear_state();
bool scrub_gather_replica_maps();
void _scan_snaps(ScrubMap &map);
- void _request_scrub_map_classic(int replica, eversion_t version);
- void _request_scrub_map(int replica, eversion_t version,
+ void _request_scrub_map_classic(pg_shard_t replica, eversion_t version);
+ void _request_scrub_map(pg_shard_t replica, eversion_t version,
hobject_t start, hobject_t end, bool deep);
int build_scrub_map_chunk(
ScrubMap &map,
virtual void _scrub_finish() { }
virtual void get_colls(list<coll_t> *out) = 0;
virtual void split_colls(
- pg_t child,
+ spg_t child,
int split_bits,
int seed,
ObjectStore::Transaction *t) = 0;
virtual bool _report_snap_collection_errors(
const hobject_t &hoid,
const map<string, bufferptr> &attrs,
- int osd,
+ pg_shard_t osd,
ostream &out) { return false; };
void clear_scrub_reserved();
void scrub_reserve_replicas();
};
struct MInfoRec : boost::statechart::event< MInfoRec > {
- int from;
+ pg_shard_t from;
pg_info_t info;
epoch_t msg_epoch;
- MInfoRec(int from, pg_info_t &info, epoch_t msg_epoch) :
+ MInfoRec(pg_shard_t from, pg_info_t &info, epoch_t msg_epoch) :
from(from), info(info), msg_epoch(msg_epoch) {}
void print(std::ostream *out) const {
*out << "MInfoRec from " << from << " info: " << info;
};
struct MLogRec : boost::statechart::event< MLogRec > {
- int from;
+ pg_shard_t from;
boost::intrusive_ptr<MOSDPGLog> msg;
- MLogRec(int from, MOSDPGLog *msg) :
+ MLogRec(pg_shard_t from, MOSDPGLog *msg) :
from(from), msg(msg) {}
void print(std::ostream *out) const {
*out << "MLogRec from " << from;
};
struct MNotifyRec : boost::statechart::event< MNotifyRec > {
- int from;
+ pg_shard_t from;
pg_notify_t notify;
- MNotifyRec(int from, pg_notify_t &notify) :
+ MNotifyRec(pg_shard_t from, pg_notify_t &notify) :
from(from), notify(notify) {}
void print(std::ostream *out) const {
*out << "MNotifyRec from " << from << " notify: " << notify;
};
struct MQuery : boost::statechart::event< MQuery > {
- int from;
+ pg_shard_t from;
pg_query_t query;
epoch_t query_epoch;
- MQuery(int from, const pg_query_t &query, epoch_t query_epoch):
+ MQuery(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch):
from(from), query(query), query_epoch(query_epoch) {}
void print(std::ostream *out) const {
*out << "MQuery from " << from
OSDMapRef osdmap;
OSDMapRef lastmap;
vector<int> newup, newacting;
- AdvMap(OSDMapRef osdmap, OSDMapRef lastmap, vector<int>& newup, vector<int>& newacting):
- osdmap(osdmap), lastmap(lastmap), newup(newup), newacting(newacting) {}
+ int up_primary, acting_primary;
+ AdvMap(
+ OSDMapRef osdmap, OSDMapRef lastmap,
+ vector<int>& newup, int up_primary,
+ vector<int>& newacting, int acting_primary):
+ osdmap(osdmap), lastmap(lastmap),
+ newup(newup),
+ newacting(newacting),
+ up_primary(up_primary),
+ acting_primary(acting_primary) {}
void print(std::ostream *out) const {
*out << "AdvMap";
}
return state->rctx->transaction;
}
- void send_query(int to, const pg_query_t &query) {
+ void send_query(pg_shard_t to, const pg_query_t &query) {
assert(state->rctx->query_map);
- (*state->rctx->query_map)[to][pg->info.pgid] = query;
+ (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
+ query;
}
- map<int, map<pg_t, pg_query_t> > *get_query_map() {
+ map<int, map<spg_t, pg_query_t> > *get_query_map() {
assert(state->rctx->query_map);
return state->rctx->query_map;
}
return &(state->rctx->on_applied->contexts);
}
- void send_notify(int to, const pg_notify_t &info, const pg_interval_map_t &pi) {
+ void send_notify(pg_shard_t to,
+ const pg_notify_t &info, const pg_interval_map_t &pi) {
assert(state->rctx->notify_list);
- (*state->rctx->notify_list)[to].push_back(make_pair(info, pi));
+ (*state->rctx->notify_list)[to.osd].push_back(make_pair(info, pi));
}
};
friend class RecoveryMachine;
typedef boost::mpl::list <
boost::statechart::custom_reaction< ActMap >,
boost::statechart::custom_reaction< MNotifyRec >,
- boost::statechart::transition< NeedActingChange, WaitActingChange >,
- boost::statechart::custom_reaction< AdvMap>
+ boost::statechart::transition< NeedActingChange, WaitActingChange >
> reactions;
- boost::statechart::result react(const AdvMap&);
boost::statechart::result react(const ActMap&);
boost::statechart::result react(const MNotifyRec&);
};
Active(my_context ctx);
void exit();
- const set<int> sorted_acting_set;
- const set<int> sorted_backfill_set;
+ const set<pg_shard_t> sorted_actingbackfill_set;
+ const set<pg_shard_t> sorted_backfill_set;
bool all_replicas_activated;
typedef boost::mpl::list <
boost::statechart::custom_reaction< RemoteReservationRejected >,
boost::statechart::transition< AllBackfillsReserved, Backfilling >
> reactions;
- set<int>::const_iterator backfill_osd_it;
+ set<pg_shard_t>::const_iterator backfill_osd_it;
WaitRemoteBackfillReserved(my_context ctx);
void exit();
boost::statechart::result react(const RemoteBackfillReserved& evt);
boost::statechart::custom_reaction< RemoteRecoveryReserved >,
boost::statechart::transition< AllRemotesReserved, Recovering >
> reactions;
- set<int>::const_iterator acting_osd_it;
+ set<pg_shard_t>::const_iterator acting_osd_it;
WaitRemoteRecoveryReserved(my_context ctx);
boost::statechart::result react(const RemoteRecoveryReserved &evt);
void exit();
struct GetLog;
struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
- set<int> peer_info_requested;
+ set<pg_shard_t> peer_info_requested;
GetInfo(my_context ctx);
void exit();
};
struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
- int newest_update_osd;
+ pg_shard_t auth_log_shard;
boost::intrusive_ptr<MOSDPGLog> msg;
GetLog(my_context ctx);
struct WaitFlushedPeering;
struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
- set<int> peer_missing_requested;
+ set<pg_shard_t> peer_missing_requested;
GetMissing(my_context ctx);
void exit();
public:
PG(OSDService *o, OSDMapRef curmap,
- const PGPool &pool, pg_t p, const hobject_t& loid, const hobject_t& ioid);
+ const PGPool &pool, spg_t p, const hobject_t& loid, const hobject_t& ioid);
virtual ~PG();
private:
PG& operator=(const PG& rhs);
public:
- pg_t get_pgid() const { return info.pgid; }
+ spg_t get_pgid() const { return info.pgid; }
int get_nrep() const { return acting.size(); }
- int get_primary() { return acting.empty() ? -1:acting[0]; }
+ void init_primary_up_acting(
+ const vector<int> &newup,
+ const vector<int> &newacting,
+ int new_up_primary,
+ int new_acting_primary) {
+ actingset.clear();
+ acting = newacting;
+ for (shard_id_t i = 0; i < acting.size(); ++i) {
+ if (acting[i] != CRUSH_ITEM_NONE)
+ actingset.insert(
+ pg_shard_t(
+ acting[i],
+ pool.info.ec_pool() ? i : ghobject_t::NO_SHARD));
+ }
+ up = newup;
+ if (!pool.info.ec_pool()) {
+ up_primary = pg_shard_t(new_up_primary, ghobject_t::no_shard());
+ primary = pg_shard_t(new_acting_primary, ghobject_t::no_shard());
+ return;
+ }
+ up_primary = pg_shard_t();
+ primary = pg_shard_t();
+ for (shard_id_t i = 0; i < up.size(); ++i) {
+ if (up[i] == new_up_primary) {
+ up_primary = pg_shard_t(up[i], i);
+ break;
+ }
+ }
+ for (shard_id_t i = 0; i < acting.size(); ++i) {
+ if (acting[i] == new_acting_primary) {
+ primary = pg_shard_t(acting[i], i);
+ break;
+ }
+ }
+ assert(up_primary.osd == new_up_primary);
+ assert(primary.osd == new_acting_primary);
+ }
+ pg_shard_t get_primary() const { return primary; }
int get_role() const { return role; }
void set_role(int r) { role = r; }
- bool is_primary() const { return role == 0; }
+ bool is_primary() const { return pg_whoami == primary; }
bool is_replica() const { return role > 0; }
epoch_t get_last_peering_reset() const { return last_peering_reset; }
void init(
int role,
vector<int>& up,
+ int up_primary,
vector<int>& acting,
+ int acting_primary,
pg_history_t& history,
pg_interval_map_t& pim,
bool backfill,
/// share new pg log entries after a pg is active
void share_pg_log();
- void start_peering_interval(const OSDMapRef lastmap,
- const vector<int>& newup,
- const vector<int>& newacting,
- ObjectStore::Transaction *t);
+ void start_peering_interval(
+ const OSDMapRef lastmap,
+ const vector<int>& newup, int up_primary,
+ const vector<int>& newacting, int acting_primary,
+ ObjectStore::Transaction *t);
void start_flush(ObjectStore::Transaction *t,
list<Context *> *on_applied,
list<Context *> *on_safe);
}
void update_history_from_master(pg_history_t new_history);
- void fulfill_info(int from, const pg_query_t &query,
- pair<int, pg_info_t> &notify_info);
- void fulfill_log(int from, const pg_query_t &query, epoch_t query_epoch);
+ void fulfill_info(pg_shard_t from, const pg_query_t &query,
+ pair<pg_shard_t, pg_info_t> &notify_info);
+ void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
bool is_split(OSDMapRef lastmap, OSDMapRef nextmap);
bool acting_up_affected(const vector<int>& newup, const vector<int>& newacting);
void queue_peering_event(CephPeeringEvtRef evt);
void handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx);
void queue_notify(epoch_t msg_epoch, epoch_t query_epoch,
- int from, pg_notify_t& i);
+ pg_shard_t from, pg_notify_t& i);
void queue_info(epoch_t msg_epoch, epoch_t query_epoch,
- int from, pg_info_t& i);
- void queue_log(epoch_t msg_epoch, epoch_t query_epoch, int from,
+ pg_shard_t from, pg_info_t& i);
+ void queue_log(epoch_t msg_epoch, epoch_t query_epoch, pg_shard_t from,
MOSDPGLog *msg);
void queue_query(epoch_t msg_epoch, epoch_t query_epoch,
- int from, const pg_query_t& q);
+ pg_shard_t from, const pg_query_t& q);
void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
void queue_flushed(epoch_t started_at);
- void handle_advance_map(OSDMapRef osdmap, OSDMapRef lastmap,
- vector<int>& newup, vector<int>& newacting,
- RecoveryCtx *rctx);
+ void handle_advance_map(
+ OSDMapRef osdmap, OSDMapRef lastmap,
+ vector<int>& newup, int up_primary,
+ vector<int>& newacting, int acting_primary,
+ RecoveryCtx *rctx);
void handle_activate_map(RecoveryCtx *rctx);
void handle_create(RecoveryCtx *rctx);
void handle_loaded(RecoveryCtx *rctx);
* Called when peer is recovered
*/
virtual void on_peer_recover(
- int peer,
+ pg_shard_t peer,
const hobject_t &oid,
const ObjectRecoveryInfo &recovery_info,
const object_stat_sum_t &stat
) = 0;
virtual void begin_peer_recover(
- int peer,
+ pg_shard_t peer,
const hobject_t oid) = 0;
- virtual void failed_push(int from, const hobject_t &soid) = 0;
-
+ virtual void failed_push(pg_shard_t from, const hobject_t &soid) = 0;
virtual void cancel_pull(const hobject_t &soid) = 0;
ObjectStore::Transaction *t,
OpRequestRef op = OpRequestRef()
) = 0;
- virtual epoch_t get_epoch() = 0;
- virtual const vector<int> &get_actingbackfill() = 0;
+ virtual epoch_t get_epoch() const = 0;
+
+ virtual const set<pg_shard_t> &get_actingbackfill_shards() const = 0;
+
virtual std::string gen_dbg_prefix() const = 0;
- virtual const map<hobject_t, set<int> > &get_missing_loc() = 0;
- virtual const map<int, pg_missing_t> &get_peer_missing() = 0;
- virtual const map<int, pg_info_t> &get_peer_info() = 0;
- virtual const pg_missing_t &get_local_missing() = 0;
- virtual const PGLog &get_log() = 0;
+ virtual const map<hobject_t, set<pg_shard_t> > &get_missing_loc_shards()
+ const = 0;
+
+ virtual const map<pg_shard_t, pg_missing_t> &get_shard_missing()
+ const = 0;
+
+ virtual const map<pg_shard_t, pg_info_t> &get_shard_info() const = 0;
+
+ virtual const pg_missing_t &get_local_missing() const = 0;
+ virtual const PGLog &get_log() const = 0;
virtual bool pgb_is_primary() const = 0;
virtual OSDMapRef pgb_get_osdmap() const = 0;
virtual const pg_info_t &get_info() const = 0;
const eversion_t &applied_version) = 0;
virtual bool should_send_op(
- int peer,
+ pg_shard_t peer,
const hobject_t &hoid) = 0;
virtual void log_operation(
ObjectStore::Transaction *t) = 0;
virtual void update_peer_last_complete_ondisk(
- int fromosd,
+ pg_shard_t fromosd,
eversion_t lcod) = 0;
virtual void update_last_complete_ondisk(
virtual void schedule_work(
GenContext<ThreadPool::TPHandle&> *c) = 0;
- virtual int whoami() const = 0;
+ virtual pg_shard_t whoami_shard() const = 0;
+ int whoami() const {
+ return whoami_shard().osd;
+ }
+ spg_t whoami_spg_t() const {
+ return get_info().pgid;
+ }
+
+ virtual spg_t primary_spg_t() const = 0;
+ virtual pg_shard_t primary_shard() const = 0;
virtual void send_message_osd_cluster(
int peer, Message *m, epoch_t from_epoch) = 0;
out->push_back(temp_coll);
}
void split_colls(
- pg_t child,
+ spg_t child,
int split_bits,
int seed,
ObjectStore::Transaction *t) {
Context *on_complete) = 0;
virtual bool scrub_supported() { return false; }
- virtual void be_scan_list(ScrubMap &map, const vector<hobject_t> &ls, bool deep,
+ virtual void be_scan_list(
+ ScrubMap &map, const vector<hobject_t> &ls, bool deep,
ThreadPool::TPHandle &handle) { assert(0); }
virtual enum scrub_error_type be_compare_scrub_objects(
- const ScrubMap::object &auth,
- const ScrubMap::object &candidate,
- ostream &errorstream) { assert(0); }
- virtual map<int, ScrubMap *>::const_iterator be_select_auth_object(
+ const ScrubMap::object &auth,
+ const ScrubMap::object &candidate,
+ ostream &errorstream) { assert(0); }
+ virtual map<pg_shard_t, ScrubMap *>::const_iterator be_select_auth_object(
const hobject_t &obj,
- const map<int,ScrubMap*> &maps) { assert(0); }
- virtual void be_compare_scrubmaps(const map<int,ScrubMap*> &maps,
- map<hobject_t, set<int> > &missing,
- map<hobject_t, set<int> > &inconsistent,
- map<hobject_t, int> &authoritative,
- map<hobject_t, set<int> > &invalid_snapcolls,
- int &shallow_errors, int &deep_errors,
- const pg_t pgid,
- const vector<int> &acting,
- ostream &errorstream) { assert(0); }
+ const map<pg_shard_t,ScrubMap*> &maps) { assert(0); }
+ virtual void be_compare_scrubmaps(
+ const map<pg_shard_t,ScrubMap*> &maps,
+ map<hobject_t, set<pg_shard_t> > &missing,
+ map<hobject_t, set<pg_shard_t> > &inconsistent,
+ map<hobject_t, pg_shard_t> &authoritative,
+ map<hobject_t, set<pg_shard_t> > &invalid_snapcolls,
+ int &shallow_errors, int &deep_errors,
+ const spg_t pgid,
+ const vector<int> &acting,
+ ostream &errorstream) { assert(0); }
};
struct PG_SendMessageOnConn: public Context {
}
void PGLog::clear_info_log(
- pg_t pgid,
+ spg_t pgid,
const hobject_t &infos_oid,
const hobject_t &log_oid,
ObjectStore::Transaction *t) {
}
}
-void PGLog::proc_replica_log(ObjectStore::Transaction& t,
- pg_info_t &oinfo, const pg_log_t &olog, pg_missing_t& omissing, int from) const
+void PGLog::proc_replica_log(
+ ObjectStore::Transaction& t,
+ pg_info_t &oinfo, const pg_log_t &olog, pg_missing_t& omissing,
+ pg_shard_t from) const
{
dout(10) << "proc_replica_log for osd." << from << ": "
<< oinfo << " " << olog << " " << omissing << dendl;
}
void PGLog::merge_log(ObjectStore::Transaction& t,
- pg_info_t &oinfo, pg_log_t &olog, int fromosd,
+ pg_info_t &oinfo, pg_log_t &olog, pg_shard_t fromosd,
pg_info_t &info, LogEntryHandler *rollbacker,
bool &dirty_info, bool &dirty_big_info)
{
void reset_recovery_pointers() { log.reset_recovery_pointers(); }
static void clear_info_log(
- pg_t pgid,
+ spg_t pgid,
const hobject_t &infos_oid,
const hobject_t &log_oid,
ObjectStore::Transaction *t);
}
void proc_replica_log(ObjectStore::Transaction& t, pg_info_t &oinfo, const pg_log_t &olog,
- pg_missing_t& omissing, int from) const;
+ pg_missing_t& omissing, pg_shard_t from) const;
protected:
bool _merge_old_entry(
pg_info_t &info, LogEntryHandler *rollbacker,
bool &dirty_info, bool &dirty_big_info);
- void merge_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, int from,
+ void merge_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
+ pg_shard_t from,
pg_info_t &info, LogEntryHandler *rollbacker,
bool &dirty_info, bool &dirty_big_info);
void ReplicatedBackend::check_recovery_sources(const OSDMapRef osdmap)
{
- for(map<int, set<hobject_t> >::iterator i = pull_from_peer.begin();
+ for(map<pg_shard_t, set<hobject_t> >::iterator i = pull_from_peer.begin();
i != pull_from_peer.end();
) {
- if (osdmap->is_down(i->first)) {
+ if (osdmap->is_down(i->first.osd)) {
dout(10) << "check_recovery_sources resetting pulls from osd." << i->first
<< ", osdmap has it marked down" << dendl;
for (set<hobject_t>::iterator j = i->second.begin();
)
).first->second;
+ op.waiting_for_applied.insert(
+ parent->get_actingbackfill_shards().begin(),
+ parent->get_actingbackfill_shards().end());
+ op.waiting_for_commit.insert(
+ parent->get_actingbackfill_shards().begin(),
+ parent->get_actingbackfill_shards().end());
+
+
issue_op(
soid,
at_version,
&op,
op_t);
- // add myself to gather set
- op.waiting_for_applied.insert(osd->whoami);
- op.waiting_for_commit.insert(osd->whoami);
-
ObjectStore::Transaction local_t;
if (t->get_temp_added().size()) {
get_temp_coll(&local_t);
if (op->op)
op->op->mark_event("op_applied");
- op->waiting_for_applied.erase(get_parent()->whoami());
+ op->waiting_for_applied.erase(get_parent()->whoami_shard());
parent->op_applied(op->v);
if (op->waiting_for_applied.empty()) {
if (op->op)
op->op->mark_event("op_commit");
- op->waiting_for_commit.erase(get_parent()->whoami());
+ op->waiting_for_commit.erase(get_parent()->whoami_shard());
if (op->waiting_for_commit.empty()) {
op->on_commit->complete(0);
// must be replication.
tid_t rep_tid = r->get_tid();
- int fromosd = r->get_source().num();
+ pg_shard_t from = r->from;
if (in_progress_ops.count(rep_tid)) {
map<tid_t, InProgressOp>::iterator iter =
if (m)
dout(7) << __func__ << ": tid " << ip_op.tid << " op " //<< *m
<< " ack_type " << (int)r->ack_type
- << " from osd." << fromosd
+ << " from " << from
<< dendl;
else
dout(7) << __func__ << ": tid " << ip_op.tid << " (no op) "
<< " ack_type " << (int)r->ack_type
- << " from osd." << fromosd
+ << " from " << from
<< dendl;
// oh, good.
if (r->ack_type & CEPH_OSD_FLAG_ONDISK) {
- assert(ip_op.waiting_for_commit.count(fromosd));
- ip_op.waiting_for_commit.erase(fromosd);
+ assert(ip_op.waiting_for_commit.count(from));
+ ip_op.waiting_for_commit.erase(from);
if (ip_op.op)
ip_op.op->mark_event("sub_op_commit_rec");
} else {
- assert(ip_op.waiting_for_applied.count(fromosd));
+ assert(ip_op.waiting_for_applied.count(from));
if (ip_op.op)
ip_op.op->mark_event("sub_op_applied_rec");
}
- ip_op.waiting_for_applied.erase(fromosd);
+ ip_op.waiting_for_applied.erase(from);
parent->update_peer_last_complete_ondisk(
- fromosd,
+ from,
r->get_last_complete_ondisk());
if (ip_op.waiting_for_applied.empty() &&
hobject_t poid = *p;
struct stat st;
- int r = osd->store->stat(coll, poid, &st, true);
+ int r = store->stat(
+ coll,
+ ghobject_t(
+ poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ &st,
+ true);
if (r == 0) {
ScrubMap::object &o = map.objects[poid];
o.size = st.st_size;
assert(!o.negative);
- osd->store->getattrs(coll, poid, o.attrs);
+ store->getattrs(
+ coll,
+ ghobject_t(
+ poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ o.attrs);
// calculate the CRC32 on deep scrubs
if (deep) {
bufferlist bl, hdrbl;
int r;
__u64 pos = 0;
- while ( (r = osd->store->read(coll, poid, pos,
- cct->_conf->osd_deep_scrub_stride, bl,
- true)) > 0) {
+ while ( (
+ r = store->read(
+ coll,
+ ghobject_t(
+ poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ pos,
+ cct->_conf->osd_deep_scrub_stride, bl,
+ true)) > 0) {
handle.reset_tp_timeout();
h << bl;
pos += bl.length();
o.digest_present = true;
bl.clear();
- r = osd->store->omap_get_header(coll, poid, &hdrbl, true);
+ r = store->omap_get_header(
+ coll,
+ ghobject_t(
+ poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ &hdrbl, true);
if (r == 0) {
dout(25) << "CRC header " << string(hdrbl.c_str(), hdrbl.length())
<< dendl;
o.read_error = true;
}
- ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
- coll, poid);
+ ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(
+ coll,
+ ghobject_t(
+ poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
assert(iter);
uint64_t keys_scanned = 0;
for (iter->seek_to_first(); iter->valid() ; iter->next()) {
}
enum scrub_error_type ReplicatedBackend::be_compare_scrub_objects(
- const ScrubMap::object &auth,
- const ScrubMap::object &candidate,
- ostream &errorstream)
+ const ScrubMap::object &auth,
+ const ScrubMap::object &candidate,
+ ostream &errorstream)
{
enum scrub_error_type error = CLEAN;
if (candidate.read_error) {
return error;
}
-map<int, ScrubMap *>::const_iterator ReplicatedBackend::be_select_auth_object(
+map<pg_shard_t, ScrubMap *>::const_iterator
+ ReplicatedBackend::be_select_auth_object(
const hobject_t &obj,
- const map<int,ScrubMap*> &maps)
+ const map<pg_shard_t,ScrubMap*> &maps)
{
- map<int, ScrubMap *>::const_iterator auth = maps.end();
- for (map<int, ScrubMap *>::const_iterator j = maps.begin();
+ map<pg_shard_t, ScrubMap *>::const_iterator auth = maps.end();
+ for (map<pg_shard_t, ScrubMap *>::const_iterator j = maps.begin();
j != maps.end();
++j) {
map<hobject_t, ScrubMap::object>::iterator i =
return auth;
}
-void ReplicatedBackend::be_compare_scrubmaps(const map<int,ScrubMap*> &maps,
- map<hobject_t, set<int> > &missing,
- map<hobject_t, set<int> > &inconsistent,
- map<hobject_t, int> &authoritative,
- map<hobject_t, set<int> > &invalid_snapcolls,
- int &shallow_errors,
- int &deep_errors,
- const pg_t pgid,
- const vector<int> &acting,
- ostream &errorstream)
+void ReplicatedBackend::be_compare_scrubmaps(
+ const map<pg_shard_t,ScrubMap*> &maps,
+ map<hobject_t, set<pg_shard_t> > &missing,
+ map<hobject_t, set<pg_shard_t> > &inconsistent,
+ map<hobject_t, pg_shard_t> &authoritative,
+ map<hobject_t, set<pg_shard_t> > &invalid_snapcolls,
+ int &shallow_errors, int &deep_errors,
+ const spg_t pgid,
+ const vector<int> &acting,
+ ostream &errorstream)
{
map<hobject_t,ScrubMap::object>::const_iterator i;
- map<int, ScrubMap *>::const_iterator j;
+ map<pg_shard_t, ScrubMap *>::const_iterator j;
set<hobject_t> master_set;
// Construct master set
for (set<hobject_t>::const_iterator k = master_set.begin();
k != master_set.end();
++k) {
- map<int, ScrubMap *>::const_iterator auth = be_select_auth_object(*k, maps);
+ map<pg_shard_t, ScrubMap *>::const_iterator auth =
+ be_select_auth_object(*k, maps);
assert(auth != maps.end());
- set<int> cur_missing;
- set<int> cur_inconsistent;
+ set<pg_shard_t> cur_missing;
+ set<pg_shard_t> cur_inconsistent;
for (j = maps.begin(); j != maps.end(); ++j) {
if (j == auth)
continue;
++shallow_errors;
else
++deep_errors;
- errorstream << pgid << " osd." << acting[j->first]
+ errorstream << pgid << " shard " << j->first
<< ": soid " << *k << " " << ss.str() << std::endl;
}
} else {
cur_missing.insert(j->first);
++shallow_errors;
- errorstream << pgid
- << " osd." << acting[j->first]
+ errorstream << pgid << " shard " << j->first
<< " missing " << *k << std::endl;
}
}
struct C_ReplicatedBackend_OnPullComplete;
class ReplicatedBackend : public PGBackend {
struct RPGHandle : public PGBackend::RecoveryHandle {
- map<int, vector<PushOp> > pushes;
- map<int, vector<PullOp> > pulls;
+ map<pg_shard_t, vector<PushOp> > pushes;
+ map<pg_shard_t, vector<PullOp> > pulls;
};
friend struct C_ReplicatedBackend_OnPullComplete;
public:
virtual void dump_recovery_info(Formatter *f) const {
{
f->open_array_section("pull_from_peer");
- for (map<int, set<hobject_t> >::const_iterator i = pull_from_peer.begin();
+ for (map<pg_shard_t, set<hobject_t> >::const_iterator i = pull_from_peer.begin();
i != pull_from_peer.end();
++i) {
f->open_object_section("pulling_from");
- f->dump_int("pull_from", i->first);
+ f->dump_stream("pull_from") << i->first;
{
f->open_array_section("pulls");
for (set<hobject_t>::const_iterator j = i->second.begin();
}
{
f->open_array_section("pushing");
- for (map<hobject_t, map<int, PushInfo> >::const_iterator i =
+ for (map<hobject_t, map<pg_shard_t, PushInfo> >::const_iterator i =
pushing.begin();
i != pushing.end();
++i) {
f->dump_stream("pushing") << i->first;
{
f->open_array_section("pushing_to");
- for (map<int, PushInfo>::const_iterator j = i->second.begin();
+ for (map<pg_shard_t, PushInfo>::const_iterator j = i->second.begin();
j != i->second.end();
++j) {
f->open_object_section("push_progress");
- f->dump_stream("object_pushing") << j->first;
+ f->dump_stream("pushing_to") << j->first;
{
f->open_object_section("push_info");
j->second.dump(f);
}
}
};
- map<hobject_t, map<int, PushInfo> > pushing;
+ map<hobject_t, map<pg_shard_t, PushInfo> > pushing;
// pull
struct PullInfo {
map<hobject_t, PullInfo> pulling;
// Reverse mapping from osd peer to objects beging pulled from that peer
- map<int, set<hobject_t> > pull_from_peer;
+ map<pg_shard_t, set<hobject_t> > pull_from_peer;
void sub_op_push(OpRequestRef op);
void sub_op_push_reply(OpRequestRef op);
void do_pull(OpRequestRef op);
void do_push_reply(OpRequestRef op);
- bool handle_push_reply(int peer, PushReplyOp &op, PushOp *reply);
- void handle_pull(int peer, PullOp &op, PushOp *reply);
+ bool handle_push_reply(pg_shard_t peer, PushReplyOp &op, PushOp *reply);
+ void handle_pull(pg_shard_t peer, PullOp &op, PushOp *reply);
bool handle_pull_response(
- int from, PushOp &op, PullOp *response,
+ pg_shard_t from, PushOp &op, PullOp *response,
list<hobject_t> *to_continue,
ObjectStore::Transaction *t);
- void handle_push(int from, PushOp &op, PushReplyOp *response,
+ void handle_push(pg_shard_t from, PushOp &op, PushReplyOp *response,
ObjectStore::Transaction *t);
static void trim_pushed_data(const interval_set<uint64_t> ©_subset,
bufferlist data_received,
interval_set<uint64_t> *intervals_usable,
bufferlist *data_usable);
- void _failed_push(int from, const hobject_t &soid);
+ void _failed_push(pg_shard_t from, const hobject_t &soid);
- void send_pushes(int prio, map<int, vector<PushOp> > &pushes);
+ void send_pushes(int prio, map<pg_shard_t, vector<PushOp> > &pushes);
void prep_push_op_blank(const hobject_t& soid, PushOp *op);
- int send_push_op_legacy(int priority, int peer,
+ int send_push_op_legacy(int priority, pg_shard_t peer,
PushOp &pop);
- int send_pull_legacy(int priority, int peer,
+ int send_pull_legacy(int priority, pg_shard_t peer,
const ObjectRecoveryInfo& recovery_info,
ObjectRecoveryProgress progress);
void send_pulls(
int priority,
- map<int, vector<PullOp> > &pulls);
+ map<pg_shard_t, vector<PullOp> > &pulls);
int build_push_op(const ObjectRecoveryInfo &recovery_info,
const ObjectRecoveryProgress &progress,
ObjectContextRef obj,
RPGHandle *h);
void prep_push_to_replica(
- ObjectContextRef obc, const hobject_t& soid, int peer,
+ ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer,
PushOp *pop);
void prep_push(ObjectContextRef obc,
- const hobject_t& oid, int dest,
+ const hobject_t& oid, pg_shard_t dest,
PushOp *op);
void prep_push(ObjectContextRef obc,
- const hobject_t& soid, int peer,
+ const hobject_t& soid, pg_shard_t peer,
eversion_t version,
interval_set<uint64_t> &data_subset,
map<hobject_t, interval_set<uint64_t> >& clone_subsets,
*/
struct InProgressOp {
tid_t tid;
- set<int> waiting_for_commit;
- set<int> waiting_for_applied;
+ set<pg_shard_t> waiting_for_commit;
+ set<pg_shard_t> waiting_for_applied;
Context *on_commit;
Context *on_applied;
OpRequestRef op;
void sub_op_modify_applied(RepModifyRef rm);
void sub_op_modify_commit(RepModifyRef rm);
bool scrub_supported() { return true; }
- void be_scan_list(ScrubMap &map, const vector<hobject_t> &ls, bool deep,
+
+ void be_scan_list(
+ ScrubMap &map, const vector<hobject_t> &ls, bool deep,
ThreadPool::TPHandle &handle);
enum scrub_error_type be_compare_scrub_objects(
- const ScrubMap::object &auth,
- const ScrubMap::object &candidate,
- ostream &errorstream);
- map<int, ScrubMap *>::const_iterator be_select_auth_object(
+ const ScrubMap::object &auth,
+ const ScrubMap::object &candidate,
+ ostream &errorstream);
+ map<pg_shard_t, ScrubMap *>::const_iterator be_select_auth_object(
const hobject_t &obj,
- const map<int,ScrubMap*> &maps);
- void be_compare_scrubmaps(const map<int,ScrubMap*> &maps,
- map<hobject_t, set<int> > &missing,
- map<hobject_t, set<int> > &inconsistent,
- map<hobject_t, int> &authoritative,
- map<hobject_t, set<int> > &invalid_snapcolls,
- int &shallow_errors, int &deep_errors,
- const pg_t pgid,
- const vector<int> &acting,
- ostream &errorstream);
+ const map<pg_shard_t,ScrubMap*> &maps);
+ void be_compare_scrubmaps(
+ const map<pg_shard_t,ScrubMap*> &maps,
+ map<hobject_t, set<pg_shard_t> > &missing,
+ map<hobject_t, set<pg_shard_t> > &inconsistent,
+ map<hobject_t, pg_shard_t> &authoritative,
+ map<hobject_t, set<pg_shard_t> > &invalid_snapcolls,
+ int &shallow_errors, int &deep_errors,
+ const spg_t pgid,
+ const vector<int> &acting,
+ ostream &errorstream);
};
#endif
}
void ReplicatedPG::on_peer_recover(
- int peer,
+ pg_shard_t peer,
const hobject_t &soid,
const ObjectRecoveryInfo &recovery_info,
const object_stat_sum_t &stat)
}
void ReplicatedPG::begin_peer_recover(
- int peer,
+ pg_shard_t peer,
const hobject_t soid)
{
peer_missing[peer].revise_have(soid, eversion_t());
{
if (pg_log.get_missing().missing.count(soid))
return true;
- for (unsigned i = 1; i < actingbackfill.size(); i++) {
- int peer = actingbackfill[i];
+ assert(actingbackfill.size() > 0);
+ for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+ i != actingbackfill.end();
+ ++i) {
+ if (*i == get_primary()) continue;
+ pg_shard_t peer = *i;
if (peer_missing.count(peer) &&
peer_missing[peer].missing.count(soid))
return true;
<< dendl;
eversion_t v;
assert(actingbackfill.size() > 0);
- for (unsigned i = 1; i < actingbackfill.size(); i++) {
- int peer = actingbackfill[i];
+ for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+ i != actingbackfill.end();
+ ++i) {
+ if (*i == get_primary()) continue;
+ pg_shard_t peer = *i;
if (peer_missing.count(peer) &&
peer_missing[peer].missing.count(soid)) {
v = peer_missing[peer].missing[soid].need;
f->close_section();
if (backfill_targets.size() > 0) {
f->open_array_section("backfill_targets");
- for (vector<int>::iterator p = backfill_targets.begin(); p != backfill_targets.end(); ++p)
- f->dump_unsigned("osd", *p);
+ for (set<pg_shard_t>::iterator p = backfill_targets.begin();
+ p != backfill_targets.end();
+ ++p)
+ f->dump_stream("shard") << *p;
f->close_section();
}
if (actingbackfill.size() > 0) {
f->open_array_section("actingbackfill");
- for (vector<int>::iterator p = actingbackfill.begin(); p != actingbackfill.end(); ++p)
- f->dump_unsigned("osd", *p);
+ for (set<pg_shard_t>::iterator p = actingbackfill.begin();
+ p != actingbackfill.end();
+ ++p)
+ f->dump_stream("shard") << *p;
f->close_section();
}
f->open_object_section("info");
f->close_section();
f->open_array_section("peer_info");
- for (map<int,pg_info_t>::iterator p = peer_info.begin();
+ for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
p != peer_info.end();
++p) {
f->open_object_section("info");
- f->dump_unsigned("peer", p->first);
+ f->dump_stream("peer") << p->first;
p->second.dump(f.get());
f->close_section();
}
p->second.dump(f.get()); // have, need keys
{
f->open_array_section("locations");
- map<hobject_t,set<int> >::iterator q = missing_loc.find(p->first);
+ map<hobject_t,set<pg_shard_t> >::iterator q =
+ missing_loc.find(p->first);
if (q != missing_loc.end())
- for (set<int>::iterator r = q->second.begin(); r != q->second.end(); ++r)
- f->dump_int("osd", *r);
+ for (set<pg_shard_t>::iterator r = q->second.begin();
+ r != q->second.end();
+ ++r)
+ f->dump_stream("shard") << *r;
f->close_section();
}
f->close_section();
// fall through
case CEPH_OSD_OP_PGLS:
- if (m->get_pg() != info.pgid) {
+ if (m->get_pg() != info.pgid.pgid) {
dout(10) << " pgls pg=" << m->get_pg() << " != " << info.pgid << dendl;
result = 0; // hmm?
} else {
}
ReplicatedPG::ReplicatedPG(OSDService *o, OSDMapRef curmap,
- const PGPool &_pool, pg_t p, const hobject_t& oid,
+ const PGPool &_pool, spg_t p, const hobject_t& oid,
const hobject_t& ioid) :
PG(o, curmap, _pool, p, oid, ioid),
pgbackend(new ReplicatedBackend(this, coll_t(p), o->store, cct)),
hobject_t ReplicatedPG::earliest_backfill() const
{
hobject_t e = hobject_t::get_max();
- for (unsigned i = 0; i < backfill_targets.size(); ++i) {
- int bt = backfill_targets[i];
- map<int, pg_info_t>::const_iterator iter = peer_info.find(bt);
+ for (set<pg_shard_t>::iterator i = backfill_targets.begin();
+ i != backfill_targets.end();
+ ++i) {
+ pg_shard_t bt = *i;
+ map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(bt);
assert(iter != peer_info.end());
if (iter->second.last_backfill < e)
e = iter->second.last_backfill;
// take the larger of last_backfill_started and the replicas last_backfill.
bool ReplicatedPG::check_src_targ(const hobject_t& soid, const hobject_t& toid) const
{
- for (unsigned i = 0; i < backfill_targets.size(); ++i) {
- int bt = backfill_targets[i];
- map<int, pg_info_t>::const_iterator iter = peer_info.find(bt);
+ for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+ i != actingbackfill.end();
+ ++i) {
+ if (*i == get_primary()) continue;
+ pg_shard_t bt = *i;
+ map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(bt);
assert(iter != peer_info.end());
if (toid <= MAX(last_backfill_started, iter->second.last_backfill) &&
cct->_conf->osd_backfill_scan_max,
&bi,
handle);
- MOSDPGScan *reply = new MOSDPGScan(MOSDPGScan::OP_SCAN_DIGEST,
- get_osdmap()->get_epoch(), m->query_epoch,
- info.pgid, bi.begin, bi.end);
+ MOSDPGScan *reply = new MOSDPGScan(
+ MOSDPGScan::OP_SCAN_DIGEST,
+ pg_whoami,
+ get_osdmap()->get_epoch(), m->query_epoch,
+ spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
::encode(bi.objects, reply->get_data());
osd->send_message_osd_cluster(reply, m->get_connection());
}
case MOSDPGScan::OP_SCAN_DIGEST:
{
- int from = m->get_source().num();
+ pg_shard_t from = m->from;
// Check that from is in backfill_targets vector
assert(is_backfill_targets(from));
{
MOSDPGPush *m = static_cast<MOSDPGPush *>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_PUSH);
- int from = m->get_source().num();
+ pg_shard_t from = m->from;
vector<PushReplyOp> replies;
ObjectStore::Transaction *t = new ObjectStore::Transaction;
}
MOSDPGPushReply *reply = new MOSDPGPushReply;
+ reply->from = get_parent()->whoami_shard();
reply->set_priority(m->get_priority());
reply->pgid = get_info().pgid;
reply->map_epoch = m->map_epoch;
{
MOSDPGPush *m = static_cast<MOSDPGPush *>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_PUSH);
- int from = m->get_source().num();
+ pg_shard_t from = m->from;
vector<PullOp> replies(1);
ObjectStore::Transaction *t = new ObjectStore::Transaction;
if (replies.size()) {
MOSDPGPull *reply = new MOSDPGPull;
+ reply->from = parent->whoami_shard();
reply->set_priority(m->get_priority());
reply->pgid = get_info().pgid;
reply->map_epoch = m->map_epoch;
{
MOSDPGPull *m = static_cast<MOSDPGPull *>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_PULL);
- int from = m->get_source().num();
+ pg_shard_t from = m->from;
- map<int, vector<PushOp> > replies;
+ map<pg_shard_t, vector<PushOp> > replies;
for (vector<PullOp>::iterator i = m->pulls.begin();
i != m->pulls.end();
++i) {
{
MOSDPGPushReply *m = static_cast<MOSDPGPushReply *>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_PUSH_REPLY);
- int from = m->get_source().num();
+ pg_shard_t from = m->from;
vector<PushOp> replies(1);
for (vector<PushReplyOp>::iterator i = m->replies.begin();
}
replies.erase(replies.end() - 1);
- map<int, vector<PushOp> > _replies;
+ map<pg_shard_t, vector<PushOp> > _replies;
_replies[from].swap(replies);
send_pushes(m->get_priority(), _replies);
}
{
assert(cct->_conf->osd_kill_backfill_at != 1);
- MOSDPGBackfill *reply = new MOSDPGBackfill(MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
- get_osdmap()->get_epoch(), m->query_epoch,
- info.pgid);
+ MOSDPGBackfill *reply = new MOSDPGBackfill(
+ MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
+ get_osdmap()->get_epoch(),
+ m->query_epoch,
+ spg_t(info.pgid.pgid, primary.shard));
reply->set_priority(cct->_conf->osd_recovery_op_priority);
osd->send_message_osd_cluster(reply, m->get_connection());
queue_peering_event(
ctx->obc->ssc->snapset = ctx->new_snapset;
info.stats.stats.add(ctx->delta_stats, ctx->obs->oi.category);
- for (unsigned i = 0; i < backfill_targets.size() ; ++i) {
- int bt = backfill_targets[i];
+ for (set<pg_shard_t>::iterator i = backfill_targets.begin();
+ i != backfill_targets.end();
+ ++i) {
+ pg_shard_t bt = *i;
pg_info_t& pinfo = peer_info[bt];
if (soid <= pinfo.last_backfill)
pinfo.stats.stats.add(ctx->delta_stats, ctx->obs->oi.category);
scrubber.finalizing = true;
scrub_gather_replica_maps();
++scrubber.waiting_on;
- scrubber.waiting_on_whom.insert(osd->whoami);
+ scrubber.waiting_on_whom.insert(pg_whoami);
osd->scrub_wq.queue(this);
}
} else {
repop->v = ctx->at_version;
- for (vector<int>::iterator i = actingbackfill.begin() + 1;
+ for (set<pg_shard_t>::iterator i = actingbackfill.begin();
i != actingbackfill.end();
++i) {
+ if (*i == get_primary()) continue;
pg_info_t &pinfo = peer_info[*i];
// keep peer_info up to date
if (pinfo.last_complete == pinfo.last_update)
{
int acks_wanted = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
- if (parent->get_actingbackfill().size() > 1) {
+ if (parent->get_actingbackfill_shards().size() > 1) {
ostringstream ss;
- ss << "waiting for subops from " <<
- vector<int>(
- parent->get_actingbackfill().begin() + 1,
- parent->get_actingbackfill().end());
+ set<pg_shard_t> replicas = parent->get_actingbackfill_shards();
+ replicas.erase(parent->whoami_shard());
+ ss << "waiting for subops from " << replicas;
if (op->op)
op->op->mark_sub_op_sent(ss.str());
}
- for (unsigned i=1; i<parent->get_actingbackfill().size(); i++) {
- int peer = parent->get_actingbackfill()[i];
- const pg_info_t &pinfo = parent->get_peer_info().find(peer)->second;
-
- op->waiting_for_applied.insert(peer);
- op->waiting_for_commit.insert(peer);
+ for (set<pg_shard_t>::const_iterator i =
+ parent->get_actingbackfill_shards().begin();
+ i != parent->get_actingbackfill_shards().end();
+ ++i) {
+ if (*i == parent->whoami_shard()) continue;
+ pg_shard_t peer = *i;
+ const pg_info_t &pinfo = parent->get_shard_info().find(peer)->second;
// forward the write/update/whatever
MOSDSubOp *wr = new MOSDSubOp(
- reqid, get_info().pgid, soid,
+ reqid, parent->whoami_shard(),
+ spg_t(get_info().pgid.pgid, i->shard),
+ soid,
false, acks_wanted,
get_osdmap()->get_epoch(),
tid, at_version);
wr->discard_temp_oid = discard_temp_oid;
get_parent()->send_message_osd_cluster(
- peer, wr, get_osdmap()->get_epoch());
+ peer.osd, wr, get_osdmap()->get_epoch());
}
}
if (!rm->committed) {
// send ack to acker only if we haven't sent a commit already
- MOSDSubOpReply *ack = new MOSDSubOpReply(m, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
+ MOSDSubOpReply *ack = new MOSDSubOpReply(
+ m, parent->whoami_shard(),
+ 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
ack->set_priority(CEPH_MSG_PRIO_HIGH); // this better match commit priority!
get_parent()->send_message_osd_cluster(
rm->ackerosd, ack, get_osdmap()->get_epoch());
assert(get_osdmap()->is_up(rm->ackerosd));
get_parent()->update_last_complete_ondisk(rm->last_complete);
- MOSDSubOpReply *commit = new MOSDSubOpReply(static_cast<MOSDSubOp*>(rm->op->get_req()), 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ONDISK);
+ MOSDSubOpReply *commit = new MOSDSubOpReply(
+ static_cast<MOSDSubOp*>(rm->op->get_req()),
+ get_parent()->whoami_shard(),
+ 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ONDISK);
commit->set_last_complete_ondisk(rm->last_complete);
commit->set_priority(CEPH_MSG_PRIO_HIGH); // this better match ack priority!
get_parent()->send_message_osd_cluster(
eversion_t _v = get_parent()->get_local_missing().missing.find(
soid)->second.need;
assert(_v == v);
- const map<hobject_t, set<int> > &missing_loc(
- get_parent()->get_missing_loc());
- const map<int, pg_missing_t > &peer_missing(
- get_parent()->get_peer_missing());
- int fromosd = -1;
- map<hobject_t,set<int> >::const_iterator q = missing_loc.find(soid);
+ const map<hobject_t, set<pg_shard_t> > &missing_loc(
+ get_parent()->get_missing_loc_shards());
+ const map<pg_shard_t, pg_missing_t > &peer_missing(
+ get_parent()->get_shard_missing());
+ map<hobject_t, set<pg_shard_t> >::const_iterator q = missing_loc.find(soid);
assert(q != missing_loc.end());
assert(!q->second.empty());
// pick a pullee
- vector<int> shuffle(q->second.begin(), q->second.end());
+ vector<pg_shard_t> shuffle(q->second.begin(), q->second.end());
random_shuffle(shuffle.begin(), shuffle.end());
- vector<int>::iterator p = shuffle.begin();
- assert(get_osdmap()->is_up(*p));
- fromosd = *p;
- assert(fromosd >= 0);
+ vector<pg_shard_t>::iterator p = shuffle.begin();
+ assert(get_osdmap()->is_up(p->osd));
+ pg_shard_t fromshard = *p;
dout(7) << "pull " << soid
<< "v " << v
<< " on osds " << *p
- << " from osd." << fromosd
+ << " from osd." << fromshard
<< dendl;
- assert(peer_missing.count(fromosd));
- const pg_missing_t &pmissing = peer_missing.find(fromosd)->second;
+ assert(peer_missing.count(fromshard));
+ const pg_missing_t &pmissing = peer_missing.find(fromshard)->second;
if (pmissing.is_missing(soid, v)) {
assert(pmissing.missing.find(soid)->second.have != v);
- dout(10) << "pulling soid " << soid << " from osd " << fromosd
+ dout(10) << "pulling soid " << soid << " from osd " << fromshard
<< " at version " << pmissing.missing.find(soid)->second.have
<< " rather than at version " << v << dendl;
v = pmissing.missing.find(soid)->second.have;
recovery_info.size = ((uint64_t)-1);
}
- h->pulls[fromosd].push_back(PullOp());
- PullOp &op = h->pulls[fromosd].back();
+ h->pulls[fromshard].push_back(PullOp());
+ PullOp &op = h->pulls[fromshard].back();
op.soid = soid;
op.recovery_info = recovery_info;
op.recovery_progress.first = true;
assert(!pulling.count(soid));
- pull_from_peer[fromosd].insert(soid);
+ pull_from_peer[fromshard].insert(soid);
PullInfo &pi = pulling[soid];
pi.head_ctx = headctx;
pi.recovery_info = op.recovery_info;
int priority,
PGBackend::RecoveryHandle *h)
{
- map<hobject_t,set<int> >::iterator q = missing_loc.find(soid);
+ map<hobject_t, set<pg_shard_t> >::iterator q = missing_loc.find(soid);
if (q == missing_loc.end()) {
dout(7) << "pull " << soid
<< " v " << v
return PULL_YES;
}
-void ReplicatedPG::send_remove_op(const hobject_t& oid, eversion_t v, int peer)
+void ReplicatedPG::send_remove_op(
+ const hobject_t& oid, eversion_t v, pg_shard_t peer)
{
tid_t tid = osd->get_tid();
osd_reqid_t rid(osd->get_cluster_msgr_name(), 0, tid);
dout(10) << "send_remove_op " << oid << " from osd." << peer
<< " tid " << tid << dendl;
- MOSDSubOp *subop = new MOSDSubOp(rid, info.pgid, oid, false, CEPH_OSD_FLAG_ACK,
- get_osdmap()->get_epoch(), tid, v);
+ MOSDSubOp *subop = new MOSDSubOp(
+ rid, pg_whoami, spg_t(info.pgid.pgid, peer.shard),
+ oid, false, CEPH_OSD_FLAG_ACK,
+ get_osdmap()->get_epoch(), tid, v);
subop->ops = vector<OSDOp>(1);
subop->ops[0].op.op = CEPH_OSD_OP_DELETE;
- osd->send_message_osd_cluster(peer, subop, get_osdmap()->get_epoch());
+ osd->send_message_osd_cluster(peer.osd, subop, get_osdmap()->get_epoch());
}
/*
* clones/heads and dup data ranges where possible.
*/
void ReplicatedBackend::prep_push_to_replica(
- ObjectContextRef obc, const hobject_t& soid, int peer,
+ ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer,
PushOp *pop)
{
const object_info_t& oi = obc->obs.oi;
SnapSetContext *ssc = obc->ssc;
assert(ssc);
dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl;
- map<int, pg_missing_t>::const_iterator pm =
- get_parent()->get_peer_missing().find(peer);
- assert(pm != get_parent()->get_peer_missing().end());
- map<int, pg_info_t>::const_iterator pi =
- get_parent()->get_peer_info().find(peer);
- assert(pi != get_parent()->get_peer_info().end());
+ map<pg_shard_t, pg_missing_t>::const_iterator pm =
+ get_parent()->get_shard_missing().find(peer);
+ assert(pm != get_parent()->get_shard_missing().end());
+ map<pg_shard_t, pg_info_t>::const_iterator pi =
+ get_parent()->get_shard_info().find(peer);
+ assert(pi != get_parent()->get_shard_info().end());
calc_clone_subsets(ssc->snapset, soid,
pm->second,
pi->second.last_backfill,
dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl;
calc_head_subsets(
obc,
- ssc->snapset, soid, get_parent()->get_peer_missing().find(peer)->second,
- get_parent()->get_peer_info().find(peer)->second.last_backfill,
+ ssc->snapset, soid, get_parent()->get_shard_missing().find(peer)->second,
+ get_parent()->get_shard_info().find(peer)->second.last_backfill,
data_subset, clone_subsets);
}
}
void ReplicatedBackend::prep_push(ObjectContextRef obc,
- const hobject_t& soid, int peer,
+ const hobject_t& soid, pg_shard_t peer,
PushOp *pop)
{
interval_set<uint64_t> data_subset;
void ReplicatedBackend::prep_push(
ObjectContextRef obc,
- const hobject_t& soid, int peer,
+ const hobject_t& soid, pg_shard_t peer,
eversion_t version,
interval_set<uint64_t> &data_subset,
map<hobject_t, interval_set<uint64_t> >& clone_subsets,
pi.recovery_progress = new_progress;
}
-int ReplicatedBackend::send_pull_legacy(int prio, int peer,
+int ReplicatedBackend::send_pull_legacy(int prio, pg_shard_t peer,
const ObjectRecoveryInfo &recovery_info,
ObjectRecoveryProgress progress)
{
<< " from osd." << peer
<< " tid " << tid << dendl;
- MOSDSubOp *subop = new MOSDSubOp(rid, get_info().pgid, recovery_info.soid,
- false, CEPH_OSD_FLAG_ACK,
- get_osdmap()->get_epoch(), tid,
- recovery_info.version);
+ MOSDSubOp *subop = new MOSDSubOp(
+ rid, parent->whoami_shard(),
+ get_info().pgid, recovery_info.soid,
+ false, CEPH_OSD_FLAG_ACK,
+ get_osdmap()->get_epoch(), tid,
+ recovery_info.version);
subop->set_priority(prio);
subop->ops = vector<OSDOp>(1);
subop->ops[0].op.op = CEPH_OSD_OP_PULL;
subop->recovery_progress = progress;
get_parent()->send_message_osd_cluster(
- peer, subop, get_osdmap()->get_epoch());
+ peer.osd, subop, get_osdmap()->get_epoch());
get_parent()->get_logger()->inc(l_osd_pull);
return 0;
}
bool ReplicatedBackend::handle_pull_response(
- int from, PushOp &pop, PullOp *response,
+ pg_shard_t from, PushOp &pop, PullOp *response,
list<hobject_t> *to_continue,
ObjectStore::Transaction *t
)
};
void ReplicatedBackend::handle_push(
- int from, PushOp &pop, PushReplyOp *response,
+ pg_shard_t from, PushOp &pop, PushReplyOp *response,
ObjectStore::Transaction *t)
{
dout(10) << "handle_push "
t);
}
-void ReplicatedBackend::send_pushes(int prio, map<int, vector<PushOp> > &pushes)
+void ReplicatedBackend::send_pushes(int prio, map<pg_shard_t, vector<PushOp> > &pushes)
{
- for (map<int, vector<PushOp> >::iterator i = pushes.begin();
+ for (map<pg_shard_t, vector<PushOp> >::iterator i = pushes.begin();
i != pushes.end();
++i) {
ConnectionRef con = get_parent()->get_con_osd_cluster(
- i->first,
+ i->first.osd,
get_osdmap()->get_epoch());
if (!con)
continue;
uint64_t cost = 0;
uint64_t pushes = 0;
MOSDPGPush *msg = new MOSDPGPush();
- msg->pgid = get_info().pgid;
+ msg->from = get_parent()->whoami_shard();
+ msg->pgid = get_parent()->primary_spg_t();
msg->map_epoch = get_osdmap()->get_epoch();
msg->set_priority(prio);
for (;
}
}
-void ReplicatedBackend::send_pulls(int prio, map<int, vector<PullOp> > &pulls)
+void ReplicatedBackend::send_pulls(int prio, map<pg_shard_t, vector<PullOp> > &pulls)
{
- for (map<int, vector<PullOp> >::iterator i = pulls.begin();
+ for (map<pg_shard_t, vector<PullOp> >::iterator i = pulls.begin();
i != pulls.end();
++i) {
ConnectionRef con = get_parent()->get_con_osd_cluster(
- i->first,
+ i->first.osd,
get_osdmap()->get_epoch());
if (!con)
continue;
dout(20) << __func__ << ": sending pulls " << i->second
<< " to osd." << i->first << dendl;
MOSDPGPull *msg = new MOSDPGPull();
+ msg->from = parent->whoami_shard();
msg->set_priority(prio);
- msg->pgid = get_info().pgid;
+ msg->pgid = get_parent()->primary_spg_t();
msg->map_epoch = get_osdmap()->get_epoch();
msg->pulls.swap(i->second);
msg->compute_cost(cct);
return 0;
}
-int ReplicatedBackend::send_push_op_legacy(int prio, int peer, PushOp &pop)
+int ReplicatedBackend::send_push_op_legacy(int prio, pg_shard_t peer, PushOp &pop)
{
tid_t tid = get_parent()->get_tid();
osd_reqid_t rid(get_parent()->get_cluster_msgr_name(), 0, tid);
- MOSDSubOp *subop = new MOSDSubOp(rid, get_info().pgid, pop.soid,
- false, 0, get_osdmap()->get_epoch(),
- tid, pop.recovery_info.version);
+ MOSDSubOp *subop = new MOSDSubOp(
+ rid, parent->whoami_shard(),
+ spg_t(get_info().pgid.pgid, peer.shard), pop.soid,
+ false, 0, get_osdmap()->get_epoch(),
+ tid, pop.recovery_info.version);
subop->ops = vector<OSDOp>(1);
subop->ops[0].op.op = CEPH_OSD_OP_PUSH;
subop->current_progress = pop.before_progress;
subop->recovery_progress = pop.after_progress;
- get_parent()->send_message_osd_cluster(peer, subop, get_osdmap()->get_epoch());
+ get_parent()->send_message_osd_cluster(peer.osd, subop, get_osdmap()->get_epoch());
return 0;
}
const hobject_t& soid = reply->get_poid();
assert(reply->get_header().type == MSG_OSD_SUBOPREPLY);
dout(10) << "sub_op_push_reply from " << reply->get_source() << " " << *reply << dendl;
- int peer = reply->get_source().num();
+ pg_shard_t peer = reply->from;
op->mark_started();
send_push_op_legacy(op->get_req()->get_priority(), peer, pop);
}
-bool ReplicatedBackend::handle_push_reply(int peer, PushReplyOp &op, PushOp *reply)
+bool ReplicatedBackend::handle_push_reply(pg_shard_t peer, PushReplyOp &op, PushOp *reply)
{
const hobject_t &soid = op.soid;
if (pushing.count(soid) == 0) {
pop.recovery_progress = m->recovery_progress;
PushOp reply;
- handle_pull(m->get_source().num(), pop, &reply);
+ handle_pull(m->from, pop, &reply);
send_push_op_legacy(
m->get_priority(),
- m->get_source().num(),
+ m->from,
reply);
log_subop_stats(get_parent()->get_logger(), op, 0, l_osd_sop_pull_lat);
}
-void ReplicatedBackend::handle_pull(int peer, PullOp &op, PushOp *reply)
+void ReplicatedBackend::handle_pull(pg_shard_t peer, PullOp &op, PushOp *reply)
{
const hobject_t &soid = op.soid;
struct stat st;
if (!is_primary()) {
// Either we are a replica or backfill target.
// we are fully up to date. tell the primary!
- osd->send_message_osd_cluster(get_primary(),
- new MOSDPGTrim(get_osdmap()->get_epoch(), info.pgid,
- last_complete_ondisk),
- get_osdmap()->get_epoch());
+ osd->send_message_osd_cluster(
+ get_primary().osd,
+ new MOSDPGTrim(
+ get_osdmap()->get_epoch(),
+ spg_t(info.pgid.pgid, primary.shard),
+ last_complete_ondisk),
+ get_osdmap()->get_epoch());
} else {
// we are the primary. tell replicas to trim?
if (calc_min_last_complete_ondisk())
RPGHandle *h = _open_recovery_op();
list<hobject_t> to_continue;
bool more = handle_pull_response(
- m->get_source().num(), pop, &resp,
+ m->from, pop, &resp,
&to_continue, t);
if (more) {
send_pull_legacy(
m->get_priority(),
- m->get_source().num(),
+ m->from,
resp.recovery_info,
resp.recovery_progress);
} else {
} else {
PushReplyOp resp;
MOSDSubOpReply *reply = new MOSDSubOpReply(
- m, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
+ m, parent->whoami_shard(), 0,
+ get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
reply->set_priority(m->get_priority());
assert(entity_name_t::TYPE_OSD == m->get_connection()->peer_type);
- handle_push(m->get_source().num(), pop, &resp, t);
+ handle_push(m->from, pop, &resp, t);
t->register_on_complete(new PG_SendMessageOnConn(
get_parent(), reply, m->get_connection()));
}
return;
}
-void ReplicatedPG::failed_push(int from, const hobject_t &soid)
+void ReplicatedPG::failed_push(pg_shard_t from, const hobject_t &soid)
{
assert(recovering.count(soid));
recovering.erase(soid);
- map<hobject_t,set<int> >::iterator p = missing_loc.find(soid);
+ map<hobject_t,set<pg_shard_t> >::iterator p = missing_loc.find(soid);
if (p != missing_loc.end()) {
- dout(0) << "_failed_push " << soid << " from osd." << from
+ dout(0) << "_failed_push " << soid << " from shard " << from
<< ", reps on " << p->second << dendl;
p->second.erase(from); // forget about this (bad) peer replica
if (p->second.empty())
missing_loc.erase(p);
} else {
- dout(0) << "_failed_push " << soid << " from osd." << from
+ dout(0) << "_failed_push " << soid << " from shard " << from
<< " but not in missing_loc ???" << dendl;
}
finish_recovery_op(soid); // close out this attempt,
}
-void ReplicatedBackend::_failed_push(int from, const hobject_t &soid)
+void ReplicatedBackend::_failed_push(pg_shard_t from, const hobject_t &soid)
{
get_parent()->failed_push(from, soid);
pull_from_peer[from].erase(soid);
dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
assert(actingbackfill.size() > 0);
- for (unsigned i=1; i<actingbackfill.size(); ++i) {
- int peer = actingbackfill[i];
+ for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+ i != actingbackfill.end();
+ ++i) {
+ if (*i == get_primary()) continue;
+ pg_shard_t peer = *i;
if (!peer_missing[peer].is_missing(oid)) {
assert(is_backfill_targets(peer));
continue;
osd->local_reserver.cancel_reservation(info.pgid);
clear_primary_state();
- osd->remove_want_pg_temp(info.pgid);
+ osd->remove_want_pg_temp(info.pgid.pgid);
cancel_recovery();
}
assert(!last_backfill_started.is_max());
dout(5) << "on activate: bft=" << backfill_targets
<< " from " << last_backfill_started << dendl;
- for (unsigned i = 0; i < backfill_targets.size(); ++i) {
- dout(5) << "target osd." << backfill_targets[i]
- << " from " << peer_info[backfill_targets[i]].last_backfill
+ for (set<pg_shard_t>::iterator i = backfill_targets.begin();
+ i != backfill_targets.end();
+ ++i) {
+ dout(5) << "target shard " << *i
+ << " from " << peer_info[*i].last_backfill
<< dendl;
}
}
* check that any peers we are planning to (or currently) pulling
* objects from are dealt with.
*/
- set<int> now_down;
- for (set<int>::iterator p = missing_loc_sources.begin();
+ set<pg_shard_t> now_down;
+ for (set<pg_shard_t>::iterator p = missing_loc_sources.begin();
p != missing_loc_sources.end();
) {
- if (osdmap->is_up(*p)) {
+ if (osdmap->is_up(p->osd)) {
++p;
continue;
}
<< missing_loc_sources << dendl;
// filter missing_loc
- map<hobject_t, set<int> >::iterator p = missing_loc.begin();
+ map<hobject_t, set<pg_shard_t> >::iterator p = missing_loc.begin();
while (p != missing_loc.end()) {
- set<int>::iterator q = p->second.begin();
+ set<pg_shard_t>::iterator q = p->second.begin();
while (q != p->second.end())
if (now_down.count(*q)) {
p->second.erase(q++);
}
}
- for (set<int>::iterator i = peer_log_requested.begin();
+ for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
i != peer_log_requested.end();
) {
- if (!osdmap->is_up(*i)) {
+ if (!osdmap->is_up(i->osd)) {
dout(10) << "peer_log_requested removing " << *i << dendl;
peer_log_requested.erase(i++);
} else {
}
}
- for (set<int>::iterator i = peer_missing_requested.begin();
+ for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
i != peer_missing_requested.end();
) {
- if (!osdmap->is_up(*i)) {
+ if (!osdmap->is_up(i->osd)) {
dout(10) << "peer_missing_requested removing " << *i << dendl;
peer_missing_requested.erase(i++);
} else {
eversion_t alternate_need = latest->reverting_to;
dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
- set<int>& loc = missing_loc[soid];
- for (map<int,pg_missing_t>::iterator p = peer_missing.begin(); p != peer_missing.end(); ++p)
+ set<pg_shard_t>& loc = missing_loc[soid];
+ for (map<pg_shard_t, pg_missing_t>::iterator p = peer_missing.begin();
+ p != peer_missing.end();
+ ++p)
if (p->second.is_missing(soid, need) &&
p->second.missing[soid].have == alternate_need) {
missing_loc_sources.insert(p->first);
pg_log.missing_add(soid, v, eversion_t());
bool uhoh = true;
assert(actingbackfill.size() > 0);
- for (unsigned i=1; i<actingbackfill.size(); i++) {
- int peer = actingbackfill[i];
+ for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+ i != actingbackfill.end();
+ ++i) {
+ if (*i == get_primary()) continue;
+ pg_shard_t peer = *i;
if (!peer_missing[peer].is_missing(soid, v)) {
missing_loc[soid].insert(peer);
missing_loc_sources.insert(peer);
dout(10) << info.pgid << " unexpectedly missing " << soid << " v" << v
- << ", there should be a copy on osd." << peer << dendl;
+ << ", there should be a copy on shard " << peer << dendl;
uhoh = false;
}
}
{
int pushes = 0;
// who needs it?
- assert(get_parent()->get_actingbackfill().size() > 0);
- for (unsigned i=1; i<get_parent()->get_actingbackfill().size(); i++) {
- int peer = get_parent()->get_actingbackfill()[i];
- map<int, pg_missing_t>::const_iterator j =
- get_parent()->get_peer_missing().find(peer);
- assert(j != get_parent()->get_peer_missing().end());
+ assert(get_parent()->get_actingbackfill_shards().size() > 0);
+ for (set<pg_shard_t>::iterator i =
+ get_parent()->get_actingbackfill_shards().begin();
+ i != get_parent()->get_actingbackfill_shards().end();
+ ++i) {
+ if (*i == get_parent()->whoami_shard()) continue;
+ pg_shard_t peer = *i;
+ map<pg_shard_t, pg_missing_t>::const_iterator j =
+ get_parent()->get_shard_missing().find(peer);
+ assert(j != get_parent()->get_shard_missing().end());
if (j->second.is_missing(soid)) {
++pushes;
h->pushes[peer].push_back(PushOp());
// this is FAR from an optimal recovery order. pretty lame, really.
assert(actingbackfill.size() > 0);
- for (unsigned i=1; i<actingbackfill.size(); i++) {
- int peer = actingbackfill[i];
- map<int, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
+ for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+ i != actingbackfill.end();
+ ++i) {
+ if (*i == get_primary()) continue;
+ pg_shard_t peer = *i;
+ map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
assert(pm != peer_missing.end());
- map<int, pg_info_t>::const_iterator pi = peer_info.find(peer);
+ map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
assert(pi != peer_info.end());
size_t m_sz = pm->second.num_missing();
hobject_t ReplicatedPG::earliest_peer_backfill() const
{
hobject_t e = hobject_t::get_max();
- for (unsigned i = 0; i < backfill_targets.size(); ++i) {
- int peer = backfill_targets[i];
- map<int, BackfillInterval>::const_iterator iter =
+ for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
+ i != backfill_targets.end();
+ ++i) {
+ pg_shard_t peer = *i;
+ map<pg_shard_t, BackfillInterval>::const_iterator iter =
peer_backfill_info.find(peer);
assert(iter != peer_backfill_info.end());
if (iter->second.begin < e)
// Primary hasn't got any more objects
assert(backfill_info.empty());
- for (unsigned i = 0; i < backfill_targets.size(); ++i) {
- int bt = backfill_targets[i];
- map<int, BackfillInterval>::const_iterator piter =
+ for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
+ i != backfill_targets.end();
+ ++i) {
+ pg_shard_t bt = *i;
+ map<pg_shard_t, BackfillInterval>::const_iterator piter =
peer_backfill_info.find(bt);
assert(piter != peer_backfill_info.end());
const BackfillInterval& pbi = piter->second;
// on_activate() was called prior to getting here
assert(last_backfill_started == earliest_backfill());
new_backfill = false;
- for (unsigned i = 0; i < backfill_targets.size(); ++i) {
- int bt = backfill_targets[i];
- peer_backfill_info[bt].reset(peer_info[bt].last_backfill);
+ for (set<pg_shard_t>::iterator i = backfill_targets.begin();
+ i != backfill_targets.end();
+ ++i) {
+ peer_backfill_info[*i].reset(peer_info[*i].last_backfill);
}
backfill_info.reset(last_backfill_started);
}
- for (unsigned i = 0; i < backfill_targets.size(); ++i) {
- int bt = backfill_targets[i];
- dout(10) << "peer osd." << bt
- << " info " << peer_info[bt]
- << " interval " << peer_backfill_info[bt].begin
- << "-" << peer_backfill_info[bt].end
- << " " << peer_backfill_info[bt].objects.size() << " objects"
+ for (set<pg_shard_t>::iterator i = backfill_targets.begin();
+ i != backfill_targets.end();
+ ++i) {
+ dout(10) << "peer osd." << *i
+ << " info " << peer_info[*i]
+ << " interval " << peer_backfill_info[*i].begin
+ << "-" << peer_backfill_info[*i].end
+ << " " << peer_backfill_info[*i].objects.size() << " objects"
<< dendl;
}
int ops = 0;
vector<boost::tuple<hobject_t, eversion_t,
- ObjectContextRef, vector<int> > > to_push;
- vector<boost::tuple<hobject_t, eversion_t, int> > to_remove;
+ ObjectContextRef, vector<pg_shard_t> > > to_push;
+ vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
set<hobject_t> add_to_stat;
- for (unsigned i = 0; i < backfill_targets.size(); ++i) {
- int bt = backfill_targets[i];
- peer_backfill_info[bt].trim_to(last_backfill_started);
+ for (set<pg_shard_t>::iterator i = backfill_targets.begin();
+ i != backfill_targets.end();
+ ++i) {
+ peer_backfill_info[*i].trim_to(last_backfill_started);
}
backfill_info.trim_to(last_backfill_started);
<< dendl;
bool sent_scan = false;
- for (unsigned i = 0; i < backfill_targets.size(); ++i) {
- int bt = backfill_targets[i];
+ for (set<pg_shard_t>::iterator i = backfill_targets.begin();
+ i != backfill_targets.end();
+ ++i) {
+ pg_shard_t bt = *i;
BackfillInterval& pbi = peer_backfill_info[bt];
- dout(20) << " peer osd." << bt << " backfill " << pbi.begin << "-"
+ dout(20) << " peer shard " << bt << " backfill " << pbi.begin << "-"
<< pbi.end << " " << pbi.objects << dendl;
if (pbi.begin <= backfill_info.begin &&
!pbi.extends_to_end() && pbi.empty()) {
dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
epoch_t e = get_osdmap()->get_epoch();
- MOSDPGScan *m = new MOSDPGScan(MOSDPGScan::OP_SCAN_GET_DIGEST, e, e, info.pgid,
- pbi.end, hobject_t());
- osd->send_message_osd_cluster(bt, m, get_osdmap()->get_epoch());
+ MOSDPGScan *m = new MOSDPGScan(
+ MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, e,
+ spg_t(info.pgid.pgid, bt.shard),
+ pbi.end, hobject_t());
+ osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
waiting_on_backfill.insert(bt);
sent_scan = true;
if (check < backfill_info.begin) {
- vector<int> check_targets;
- for (unsigned i = 0; i < backfill_targets.size(); ++i) {
- int bt = backfill_targets[i];
+ set<pg_shard_t> check_targets;
+ for (set<pg_shard_t>::iterator i = backfill_targets.begin();
+ i != backfill_targets.end();
+ ++i) {
+ pg_shard_t bt = *i;
BackfillInterval& pbi = peer_backfill_info[bt];
if (pbi.begin == check)
- check_targets.push_back(bt);
+ check_targets.insert(bt);
}
assert(!check_targets.empty());
dout(20) << " BACKFILL removing " << check
<< " from peers " << check_targets << dendl;
- for (unsigned i = 0; i < check_targets.size(); ++i) {
- int bt = check_targets[i];
+ for (set<pg_shard_t>::iterator i = check_targets.begin();
+ i != check_targets.end();
+ ++i) {
+ pg_shard_t bt = *i;
BackfillInterval& pbi = peer_backfill_info[bt];
assert(pbi.begin == check);
} else {
eversion_t& obj_v = backfill_info.objects.begin()->second;
- vector<int> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
- for (unsigned i = 0; i < backfill_targets.size(); ++i) {
- int bt = backfill_targets[i];
+ vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
+ for (set<pg_shard_t>::iterator i = backfill_targets.begin();
+ i != backfill_targets.end();
+ ++i) {
+ pg_shard_t bt = *i;
BackfillInterval& pbi = peer_backfill_info[bt];
// Find all check peers that have the wrong version
if (check == backfill_info.begin && check == pbi.begin) {
<< " with ver " << obj_v
<< " to peers " << missing_targs << dendl;
}
- vector<int> all_push = need_ver_targs;
+ vector<pg_shard_t> all_push = need_ver_targs;
all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
to_push.push_back(
- boost::tuple<hobject_t, eversion_t, ObjectContextRef, vector<int> >
+ boost::tuple<hobject_t, eversion_t, ObjectContextRef, vector<pg_shard_t> >
(backfill_info.begin, obj_v, obc, all_push));
// Count all simultaneous pushes of the same object as a single op
ops++;
last_backfill_started = backfill_info.begin;
add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
backfill_info.pop_front();
- vector<int> check_targets = need_ver_targs;
+ vector<pg_shard_t> check_targets = need_ver_targs;
check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
- for (unsigned i = 0; i < check_targets.size(); ++i) {
- int bt = check_targets[i];
+ for (vector<pg_shard_t>::iterator i = check_targets.begin();
+ i != check_targets.end();
+ ++i) {
+ pg_shard_t bt = *i;
BackfillInterval& pbi = peer_backfill_info[bt];
pbi.pop_front();
}
i->first < next_backfill_to_complete;
pending_backfill_updates.erase(i++)) {
assert(i->first > new_last_backfill);
- for (unsigned j = 0; j < backfill_targets.size(); ++j) {
- int bt = backfill_targets[j];
+ for (set<pg_shard_t>::iterator j = backfill_targets.begin();
+ j != backfill_targets.end();
+ ++j) {
+ pg_shard_t bt = *j;
pg_info_t& pinfo = peer_info[bt];
//Add stats to all peers that were missing object
if (i->first > pinfo.last_backfill)
// If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
// all the backfill targets. Otherwise, we will move last_backfill up on
// those targets need it and send OP_BACKFILL_PROGRESS to them.
- for (unsigned i = 0; i < backfill_targets.size(); ++i) {
- int bt = backfill_targets[i];
+ for (set<pg_shard_t>::iterator i = backfill_targets.begin();
+ i != backfill_targets.end();
+ ++i) {
+ pg_shard_t bt = *i;
pg_info_t& pinfo = peer_info[bt];
if (new_last_backfill > pinfo.last_backfill) {
epoch_t e = get_osdmap()->get_epoch();
MOSDPGBackfill *m = NULL;
if (pinfo.last_backfill.is_max()) {
- m = new MOSDPGBackfill(MOSDPGBackfill::OP_BACKFILL_FINISH, e, e, info.pgid);
+ m = new MOSDPGBackfill(
+ MOSDPGBackfill::OP_BACKFILL_FINISH,
+ e,
+ e,
+ spg_t(info.pgid.pgid, bt.shard));
// Use default priority here, must match sub_op priority
/* pinfo.stats might be wrong if we did log-based recovery on the
* backfilled portion in addition to continuing backfill.
pinfo.stats = info.stats;
start_recovery_op(hobject_t::get_max());
} else {
- m = new MOSDPGBackfill(MOSDPGBackfill::OP_BACKFILL_PROGRESS, e, e, info.pgid);
+ m = new MOSDPGBackfill(
+ MOSDPGBackfill::OP_BACKFILL_PROGRESS,
+ e,
+ e,
+ spg_t(info.pgid.pgid, bt.shard));
// Use default priority here, must match sub_op priority
}
m->last_backfill = pinfo.last_backfill;
m->stats = pinfo.stats;
- osd->send_message_osd_cluster(bt, m, get_osdmap()->get_epoch());
- dout(10) << " peer osd." << bt
+ osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
+ dout(10) << " peer " << bt
<< " num_objects now " << pinfo.stats.stats.sum.num_objects
<< " / " << info.stats.stats.sum.num_objects << dendl;
}
void ReplicatedPG::prep_backfill_object_push(
hobject_t oid, eversion_t v,
ObjectContextRef obc,
- vector<int> peers,
+ vector<pg_shard_t> peers,
PGBackend::RecoveryHandle *h)
{
dout(10) << "push_backfill_object " << oid << " v " << v << " to peers " << peers << dendl;
backfills_in_flight.insert(oid);
for (unsigned int i = 0 ; i < peers.size(); ++i) {
- map<int, pg_missing_t>::iterator bpm = peer_missing.find(peers[i]);
+ map<pg_shard_t, pg_missing_t>::iterator bpm = peer_missing.find(peers[i]);
assert(bpm != peer_missing.end());
bpm->second.add(oid, eversion_t(), eversion_t());
}
// choose random starting position
agent_state->position = hobject_t();
agent_state->position.pool = info.pgid.pool();
- agent_state->position.hash = pool.info.get_random_pg_position(info.pgid,
- rand());
+ agent_state->position.hash = pool.info.get_random_pg_position(
+ info.pgid.pgid,
+ rand());
dout(10) << __func__ << " allocated new state, position "
<< agent_state->position << dendl;
void ReplicatedPG::agent_choose_mode()
{
- uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid);
+ uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
// get dirty, full ratios
uint64_t dirty_micro = 0;
ObjectStore::Transaction *t
);
void on_peer_recover(
- int peer,
+ pg_shard_t peer,
const hobject_t &oid,
const ObjectRecoveryInfo &recovery_info,
const object_stat_sum_t &stat
);
void begin_peer_recover(
- int peer,
+ pg_shard_t peer,
const hobject_t oid);
void on_global_recover(
const hobject_t &oid);
- void failed_push(int from, const hobject_t &soid);
+ void failed_push(pg_shard_t from, const hobject_t &soid);
void cancel_pull(const hobject_t &soid);
template <typename T>
tls.push_back(t);
osd->store->queue_transaction(osr.get(), t, 0, 0, 0, op);
}
- epoch_t get_epoch() {
+ epoch_t get_epoch() const {
return get_osdmap()->get_epoch();
}
- const vector<int> &get_actingbackfill() {
+ const set<pg_shard_t> &get_actingbackfill_shards() const {
return actingbackfill;
}
std::string gen_dbg_prefix() const { return gen_prefix(); }
- const map<hobject_t, set<int> > &get_missing_loc() {
+ const map<hobject_t, set<pg_shard_t> > &get_missing_loc_shards() const {
return missing_loc;
}
- const map<int, pg_missing_t> &get_peer_missing() {
+ const map<pg_shard_t, pg_missing_t> &get_shard_missing() const {
return peer_missing;
}
- const map<int, pg_info_t> &get_peer_info() {
+ const map<pg_shard_t, pg_info_t> &get_shard_info() const {
return peer_info;
}
- const pg_missing_t &get_local_missing() {
+ const pg_missing_t &get_local_missing() const {
return pg_log.get_missing();
}
- const PGLog &get_log() {
+ const PGLog &get_log() const {
return pg_log;
}
bool pgb_is_primary() const {
const eversion_t &applied_version);
bool should_send_op(
- int peer,
+ pg_shard_t peer,
const hobject_t &hoid) {
+ if (peer == get_primary())
+ return true;
assert(peer_info.count(peer));
bool should_send = hoid.pool != (int64_t)info.pgid.pool() ||
hoid <= MAX(last_backfill_started, peer_info[peer].last_backfill);
}
void update_peer_last_complete_ondisk(
- int fromosd,
+ pg_shard_t fromosd,
eversion_t lcod) {
peer_last_complete_ondisk[fromosd] = lcod;
}
void schedule_work(
GenContext<ThreadPool::TPHandle&> *c);
- int whoami() const {
- return osd->whoami;
+ pg_shard_t whoami_shard() const {
+ return pg_whoami;
+ }
+ spg_t primary_spg_t() const {
+ return spg_t(info.pgid.pgid, primary.shard);
+ }
+ pg_shard_t primary_shard() const {
+ return primary;
}
void send_message_osd_cluster(
void dump_recovery_info(Formatter *f) const {
f->open_array_section("backfill_targets");
- for (vector<int>::const_iterator p = backfill_targets.begin();
+ for (set<pg_shard_t>::const_iterator p = backfill_targets.begin();
p != backfill_targets.end(); ++p)
- f->dump_int("osd", *p);
+ f->dump_stream("replica") << *p;
f->close_section();
f->open_array_section("waiting_on_backfill");
- for (set<int>::const_iterator p = waiting_on_backfill.begin();
+ for (set<pg_shard_t>::const_iterator p = waiting_on_backfill.begin();
p != waiting_on_backfill.end(); ++p)
- f->dump_int("osd", *p);
+ f->dump_stream("osd") << *p;
f->close_section();
f->dump_stream("last_backfill_started") << last_backfill_started;
{
}
{
f->open_array_section("peer_backfill_info");
- for (map<int, BackfillInterval>::const_iterator pbi = peer_backfill_info.begin();
+ for (map<pg_shard_t, BackfillInterval>::const_iterator pbi =
+ peer_backfill_info.begin();
pbi != peer_backfill_info.end(); ++pbi) {
- f->dump_int("osd", pbi->first);
+ f->dump_stream("osd") << pbi->first;
f->open_object_section("BackfillInterval");
pbi->second.dump(f);
f->close_section();
void prep_backfill_object_push(
hobject_t oid, eversion_t v, ObjectContextRef obc,
- vector<int> peer,
+ vector<pg_shard_t> peers,
PGBackend::RecoveryHandle *h);
- void send_remove_op(const hobject_t& oid, eversion_t v, int peer);
+ void send_remove_op(const hobject_t& oid, eversion_t v, pg_shard_t peer);
struct C_OSD_OndiskWriteUnlock : public Context {
public:
ReplicatedPG(OSDService *o, OSDMapRef curmap,
- const PGPool &_pool, pg_t p, const hobject_t& oid,
+ const PGPool &_pool, spg_t p, const hobject_t& oid,
const hobject_t& ioid);
~ReplicatedPG() {}
return pgbackend->temp_colls(out);
}
void split_colls(
- pg_t child,
+ spg_t child,
int split_bits,
int seed,
ObjectStore::Transaction *t) {
const coll_t coll_t::META_COLL("meta");
-bool coll_t::is_temp(pg_t& pgid) const
+bool coll_t::is_temp(spg_t& pgid) const
{
const char *cstr(str.c_str());
if (!pgid.parse(cstr))
return false;
}
-bool coll_t::is_pg(pg_t& pgid, snapid_t& snap) const
+bool coll_t::is_pg(spg_t& pgid, snapid_t& snap) const
{
const char *cstr(str.c_str());
return true;
}
-bool coll_t::is_pg_prefix(pg_t& pgid) const
+bool coll_t::is_pg_prefix(spg_t& pgid) const
{
const char *cstr(str.c_str());
return true;
}
-bool coll_t::is_removal(uint64_t *seq, pg_t *pgid) const
+bool coll_t::is_removal(uint64_t *seq, spg_t *pgid) const
{
if (str.substr(0, 11) != string("FORREMOVAL_"))
return false;
::decode(struct_v, bl);
switch (struct_v) {
case 1: {
- pg_t pgid;
+ spg_t pgid;
snapid_t snap;
::decode(pgid, bl);
::decode(snap, bl);
// infer the type
- if (pgid == pg_t() && snap == 0)
+ if (pgid == spg_t() && snap == 0)
str = "meta";
else
str = pg_and_snap_to_str(pgid, snap);
case 2: {
__u8 type;
- pg_t pgid;
+ spg_t pgid;
snapid_t snap;
::decode(type, bl);
void pg_info_t::encode(bufferlist &bl) const
{
- ENCODE_START(29, 26, bl);
- ::encode(pgid, bl);
+ ENCODE_START(30, 26, bl);
+ ::encode(pgid.pgid, bl);
::encode(last_update, bl);
::encode(last_complete, bl);
::encode(log_tail, bl);
::encode(last_epoch_started, bl);
::encode(last_user_version, bl);
::encode(hit_set, bl);
+ ::encode(pgid.shard, bl);
ENCODE_FINISH(bl);
}
if (struct_v < 23) {
old_pg_t opgid;
::decode(opgid, bl);
- pgid = opgid;
+ pgid.pgid = opgid;
} else {
- ::decode(pgid, bl);
+ ::decode(pgid.pgid, bl);
}
::decode(last_update, bl);
::decode(last_complete, bl);
last_user_version = last_update.version;
if (struct_v >= 29)
::decode(hit_set, bl);
+ if (struct_v >= 30)
+ ::decode(pgid.shard, bl);
+ else
+ pgid.shard = ghobject_t::no_shard();
DECODE_FINISH(bl);
}
list<pg_history_t*> h;
pg_history_t::generate_test_instances(h);
o.back()->history = *h.back();
- o.back()->pgid = pg_t(1, 2, -1);
+ o.back()->pgid = spg_t(pg_t(1, 2, -1), ghobject_t::no_shard());
o.back()->last_update = eversion_t(3, 4);
o.back()->last_complete = eversion_t(5, 6);
o.back()->last_user_version = 2;
// -- pg_notify_t --
void pg_notify_t::encode(bufferlist &bl) const
{
- ENCODE_START(1, 1, bl);
+ ENCODE_START(2, 1, bl);
::encode(query_epoch, bl);
::encode(epoch_sent, bl);
::encode(info, bl);
+ ::encode(to, bl);
+ ::encode(from, bl);
ENCODE_FINISH(bl);
}
void pg_notify_t::decode(bufferlist::iterator &bl)
{
- DECODE_START(1, bl);
+ DECODE_START(2, bl);
::decode(query_epoch, bl);
::decode(epoch_sent, bl);
::decode(info, bl);
+ if (struct_v >= 2) {
+ ::decode(to, bl);
+ ::decode(from, bl);
+ } else {
+ to = ghobject_t::NO_SHARD;
+ from = ghobject_t::NO_SHARD;
+ }
DECODE_FINISH(bl);
}
void pg_notify_t::dump(Formatter *f) const
{
+ f->dump_int("from", from);
+ f->dump_int("to", to);
f->dump_stream("query_epoch") << query_epoch;
f->dump_stream("epoch_sent") << epoch_sent;
{
void pg_notify_t::generate_test_instances(list<pg_notify_t*>& o)
{
- o.push_back(new pg_notify_t(1,1,pg_info_t()));
- o.push_back(new pg_notify_t(3,10,pg_info_t()));
+ o.push_back(new pg_notify_t(3, ghobject_t::NO_SHARD, 1, 1, pg_info_t()));
+ o.push_back(new pg_notify_t(0, 0, 3, 10, pg_info_t()));
}
ostream &operator<<(ostream &lhs, const pg_notify_t ¬ify)
{
- return lhs << "(query_epoch:" << notify.query_epoch
- << ", epoch_sent:" << notify.epoch_sent
- << ", info:" << notify.info << ")";
+ lhs << "(query_epoch:" << notify.query_epoch
+ << ", epoch_sent:" << notify.epoch_sent
+ << ", info:" << notify.info;
+ if (notify.from != ghobject_t::NO_SHARD ||
+ notify.to != ghobject_t::NO_SHARD)
+ lhs << " " << (unsigned)notify.from
+ << "->" << (unsigned)notify.to;
+ return lhs << ")";
}
// -- pg_interval_t --
void pg_query_t::encode(bufferlist &bl, uint64_t features) const {
if (features & CEPH_FEATURE_QUERY_T) {
- ENCODE_START(2, 2, bl);
+ ENCODE_START(3, 2, bl);
::encode(type, bl);
::encode(since, bl);
history.encode(bl);
::encode(epoch_sent, bl);
+ ::encode(to, bl);
+ ::encode(from, bl);
ENCODE_FINISH(bl);
} else {
::encode(type, bl);
void pg_query_t::decode(bufferlist::iterator &bl) {
bufferlist::iterator bl2 = bl;
try {
- DECODE_START(2, bl);
+ DECODE_START(3, bl);
::decode(type, bl);
::decode(since, bl);
history.decode(bl);
::decode(epoch_sent, bl);
+ if (struct_v >= 3) {
+ ::decode(to, bl);
+ ::decode(from, bl);
+ } else {
+ to = ghobject_t::NO_SHARD;
+ from = ghobject_t::NO_SHARD;
+ }
DECODE_FINISH(bl);
} catch (...) {
bl = bl2;
void pg_query_t::dump(Formatter *f) const
{
+ f->dump_int("from", from);
+ f->dump_int("to", to);
f->dump_string("type", get_type_name());
f->dump_stream("since") << since;
f->dump_stream("epoch_sent") << epoch_sent;
o.push_back(new pg_query_t());
list<pg_history_t*> h;
pg_history_t::generate_test_instances(h);
- o.push_back(new pg_query_t(pg_query_t::INFO, *h.back(), 4));
- o.push_back(new pg_query_t(pg_query_t::MISSING, *h.back(), 4));
- o.push_back(new pg_query_t(pg_query_t::LOG, eversion_t(4, 5), *h.back(), 4));
- o.push_back(new pg_query_t(pg_query_t::FULLLOG, *h.back(), 5));
+ o.push_back(new pg_query_t(pg_query_t::INFO, 1, 2, *h.back(), 4));
+ o.push_back(new pg_query_t(pg_query_t::MISSING, 2, 3, *h.back(), 4));
+ o.push_back(new pg_query_t(pg_query_t::LOG, 0, 0,
+ eversion_t(4, 5), *h.back(), 4));
+ o.push_back(new pg_query_t(pg_query_t::FULLLOG,
+ ghobject_t::NO_SHARD, ghobject_t::NO_SHARD,
+ *h.back(), 5));
}
// -- ObjectModDesc --
: str(str_)
{ }
- explicit coll_t(pg_t pgid, snapid_t snap = CEPH_NOSNAP)
+ explicit coll_t(spg_t pgid, snapid_t snap = CEPH_NOSNAP)
: str(pg_and_snap_to_str(pgid, snap))
{ }
- static coll_t make_temp_coll(pg_t pgid) {
+ static coll_t make_temp_coll(spg_t pgid) {
return coll_t(pg_to_tmp_str(pgid));
}
- static coll_t make_removal_coll(uint64_t seq, pg_t pgid) {
+ static coll_t make_removal_coll(uint64_t seq, spg_t pgid) {
return coll_t(seq_to_removal_str(seq, pgid));
}
return str < rhs.str;
}
- bool is_pg_prefix(pg_t& pgid) const;
- bool is_pg(pg_t& pgid, snapid_t& snap) const;
- bool is_temp(pg_t& pgid) const;
- bool is_removal(uint64_t *seq, pg_t *pgid) const;
+ bool is_pg_prefix(spg_t& pgid) const;
+ bool is_pg(spg_t& pgid, snapid_t& snap) const;
+ bool is_temp(spg_t& pgid) const;
+ bool is_removal(uint64_t *seq, spg_t *pgid) const;
void encode(bufferlist& bl) const;
void decode(bufferlist::iterator& bl);
inline bool operator==(const coll_t& rhs) const {
static void generate_test_instances(list<coll_t*>& o);
private:
- static std::string pg_and_snap_to_str(pg_t p, snapid_t s) {
+ static std::string pg_and_snap_to_str(spg_t p, snapid_t s) {
std::ostringstream oss;
oss << p << "_" << s;
return oss.str();
}
- static std::string pg_to_tmp_str(pg_t p) {
+ static std::string pg_to_tmp_str(spg_t p) {
std::ostringstream oss;
oss << p << "_TEMP";
return oss.str();
}
- static std::string seq_to_removal_str(uint64_t seq, pg_t pgid) {
+ static std::string seq_to_removal_str(uint64_t seq, spg_t pgid) {
std::ostringstream oss;
oss << "FORREMOVAL_" << seq << "_" << pgid;
return oss.str();
* otherwise, we have no idea what the pg is supposed to contain.
*/
struct pg_info_t {
- pg_t pgid;
+ spg_t pgid;
eversion_t last_update; // last object version applied to store.
eversion_t last_complete; // last version pg was complete through.
epoch_t last_epoch_started;// last epoch at which this pg started on this osd
: last_epoch_started(0), last_user_version(0),
last_backfill(hobject_t::get_max())
{ }
- pg_info_t(pg_t p)
+ pg_info_t(spg_t p)
: pgid(p),
last_epoch_started(0), last_user_version(0),
last_backfill(hobject_t::get_max())
void encode(bufferlist& bl) const;
void decode(bufferlist::iterator& p);
void dump(Formatter *f) const;
+ bool overlaps_with(const pg_info_t &oinfo) const {
+ return last_update > oinfo.log_tail ?
+ oinfo.last_update >= log_tail :
+ last_update >= oinfo.log_tail;
+ }
static void generate_test_instances(list<pg_info_t*>& o);
};
WRITE_CLASS_ENCODER(pg_info_t)
epoch_t query_epoch;
epoch_t epoch_sent;
pg_info_t info;
- pg_notify_t() : query_epoch(0), epoch_sent(0) {}
- pg_notify_t(epoch_t query_epoch,
- epoch_t epoch_sent,
- const pg_info_t &info)
+ shard_id_t to;
+ shard_id_t from;
+ pg_notify_t() :
+ query_epoch(0), epoch_sent(0), to(ghobject_t::no_shard()),
+ from(ghobject_t::no_shard()) {}
+ pg_notify_t(
+ shard_id_t to,
+ shard_id_t from,
+ epoch_t query_epoch,
+ epoch_t epoch_sent,
+ const pg_info_t &info)
: query_epoch(query_epoch),
epoch_sent(epoch_sent),
- info(info) {}
+ info(info), to(to), from(from) {
+ assert(from == info.pgid.shard);
+ }
void encode(bufferlist &bl) const;
void decode(bufferlist::iterator &p);
void dump(Formatter *f) const;
eversion_t since;
pg_history_t history;
epoch_t epoch_sent;
-
- pg_query_t() : type(-1), epoch_sent(0) {}
- pg_query_t(int t, const pg_history_t& h,
- epoch_t epoch_sent)
- : type(t), history(h),
- epoch_sent(epoch_sent) {
+ shard_id_t to;
+ shard_id_t from;
+
+ pg_query_t() : type(-1), epoch_sent(0), to(ghobject_t::NO_SHARD),
+ from(ghobject_t::NO_SHARD) {}
+ pg_query_t(
+ int t,
+ shard_id_t to,
+ shard_id_t from,
+ const pg_history_t& h,
+ epoch_t epoch_sent)
+ : type(t),
+ history(h),
+ epoch_sent(epoch_sent),
+ to(to), from(from) {
assert(t != LOG);
}
- pg_query_t(int t, eversion_t s, const pg_history_t& h,
- epoch_t epoch_sent)
+ pg_query_t(
+ int t,
+ shard_id_t to,
+ shard_id_t from,
+ eversion_t s,
+ const pg_history_t& h,
+ epoch_t epoch_sent)
: type(t), since(s), history(h),
- epoch_sent(epoch_sent) {
+ epoch_sent(epoch_sent), to(to), from(from) {
assert(t == LOG);
}
ObjectStore::Transaction t;
pg_log_t olog;
pg_info_t oinfo;
- int fromosd = -1;
+ pg_shard_t fromosd;
pg_info_t info;
list<hobject_t> remove_snap;
bool dirty_info = false;
ObjectStore::Transaction t;
pg_log_t olog;
pg_info_t oinfo;
- int fromosd = -1;
+ pg_shard_t fromosd;
pg_info_t info;
list<hobject_t> remove_snap;
bool dirty_info = false;
ObjectStore::Transaction t;
pg_log_t olog;
pg_info_t oinfo;
- int fromosd = -1;
+ pg_shard_t fromosd;
pg_info_t info;
list<hobject_t> remove_snap;
bool dirty_info = false;
ObjectStore::Transaction t;
pg_log_t olog;
pg_info_t oinfo;
- int fromosd = -1;
+ pg_shard_t fromosd;
pg_info_t info;
list<hobject_t> remove_snap;
bool dirty_info = false;
ObjectStore::Transaction t;
pg_log_t olog;
pg_info_t oinfo;
- int fromosd = -1;
+ pg_shard_t fromosd;
pg_info_t info;
list<hobject_t> remove_snap;
bool dirty_info = false;
ObjectStore::Transaction t;
pg_log_t olog;
pg_info_t oinfo;
- int fromosd = -1;
+ pg_shard_t fromosd;
pg_info_t info;
list<hobject_t> remove_snap;
bool dirty_info = false;
ObjectStore::Transaction t;
pg_log_t olog;
pg_info_t oinfo;
- int fromosd = -1;
+ pg_shard_t fromosd;
pg_info_t info;
list<hobject_t> remove_snap;
bool dirty_info = false;
pg_log_t olog;
pg_info_t oinfo;
pg_missing_t omissing;
- int from = -1;
+ pg_shard_t from;
eversion_t last_update(1, 1);
oinfo.last_update = last_update;
pg_log_t olog;
pg_info_t oinfo;
pg_missing_t omissing;
- int from = -1;
+ pg_shard_t from;
{
pg_log_entry_t e;
pg_log_t olog;
pg_info_t oinfo;
pg_missing_t omissing;
- int from = -1;
+ pg_shard_t from;
hobject_t divergent_object;
pg_log_t olog;
pg_info_t oinfo;
pg_missing_t omissing;
- int from = -1;
+ pg_shard_t from;
eversion_t last_update(1, 2);
pg_log_t olog;
pg_info_t oinfo;
pg_missing_t omissing;
- int from = -1;
+ pg_shard_t from;
eversion_t last_update(1, 2);
hobject_t divergent_object;
pg_log_t olog;
pg_info_t oinfo;
pg_missing_t omissing;
- int from = -1;
+ pg_shard_t from;
eversion_t last_update(1, 2);
hobject_t divergent_object;
for (vector<coll_t>::iterator it = ls.begin();
it != ls.end();
++it) {
- pg_t pgid;
+ spg_t pgid;
snapid_t snap;
if (it->is_temp(pgid)) {
{
ObjectStore::Transaction *rmt = new ObjectStore::Transaction;
- if (store->collection_exists(coll_t(r_pgid))) {
+ if (store->collection_exists(coll_t(spg_t(r_pgid, ghobject_t::no_shard())))) {
coll_t to_remove = coll_t::make_removal_coll((*next_removal_seq)++,
- r_pgid);
- cout << "collection rename " << coll_t(r_pgid) << " to " << to_remove
+ spg_t(r_pgid, ghobject_t::no_shard()));
+ cout << "collection rename " << coll_t(spg_t(r_pgid, ghobject_t::no_shard()))
+ << " to " << to_remove
<< std::endl;
- rmt->collection_rename(coll_t(r_pgid), to_remove);
+ rmt->collection_rename(coll_t(spg_t(r_pgid, ghobject_t::no_shard())), to_remove);
} else {
delete rmt;
return ENOENT;
return 1;
}
- log_oid = OSD::make_pg_log_oid(pgid);
- biginfo_oid = OSD::make_pg_biginfo_oid(pgid);
+ log_oid = OSD::make_pg_log_oid(spg_t(pgid, ghobject_t::no_shard()));
+ biginfo_oid = OSD::make_pg_biginfo_oid(spg_t(pgid, ghobject_t::no_shard()));
//Check for PG already present.
- coll_t coll(pgid);
+ coll_t coll(spg_t(pgid, ghobject_t::no_shard()));
if (store->collection_exists(coll)) {
cout << "pgid " << pgid << " already exists" << std::endl;
return 1;
//Switch to collection which will be removed automatically if
//this program is interupted.
- coll_t rmcoll = coll_t::make_removal_coll(next_removal_seq, pgid);
+ coll_t rmcoll = coll_t::make_removal_coll(
+ next_removal_seq, spg_t(pgid, ghobject_t::no_shard()));
ObjectStore::Transaction *t = new ObjectStore::Transaction;
t->create_collection(rmcoll);
store->apply_transaction(*t);
goto out;
}
- log_oid = OSD::make_pg_log_oid(pgid);
- biginfo_oid = OSD::make_pg_biginfo_oid(pgid);
+ log_oid = OSD::make_pg_log_oid(spg_t(pgid, ghobject_t::no_shard()));
+ biginfo_oid = OSD::make_pg_biginfo_oid(spg_t(pgid, ghobject_t::no_shard()));
if (type == "remove") {
uint64_t next_removal_seq = 0; //My local seq
for (it = ls.begin(); it != ls.end(); ++it) {
snapid_t snap;
- pg_t tmppgid;
+ spg_t tmppgid;
if (!it->is_pg(tmppgid, snap)) {
continue;
}
- if (tmppgid != pgid) {
+ if (tmppgid.pgid != pgid) {
continue;
}
if (snap != CEPH_NOSNAP && debug) {
if (debug)
cerr << "map_epoch " << map_epoch << std::endl;
- pg_info_t info(pgid);
+ pg_info_t info(spg_t(pgid, ghobject_t::no_shard()));
map<epoch_t,pg_interval_t> past_intervals;
- hobject_t biginfo_oid = OSD::make_pg_biginfo_oid(pgid);
+ hobject_t biginfo_oid = OSD::make_pg_biginfo_oid(
+ spg_t(pgid, ghobject_t::no_shard()));
interval_set<snapid_t> snap_collections;
__u8 struct_ver;
vector<coll_t> colls_to_check;
if (pgidstr.length()) {
- pg_t pgid;
+ spg_t pgid;
if (!pgid.parse(pgidstr.c_str())) {
cout << "Invalid pgid '" << pgidstr << "' specified" << std::endl;
exit(1);
for (vector<coll_t>::iterator i = candidates.begin();
i != candidates.end();
++i) {
- pg_t pgid;
+ spg_t pgid;
snapid_t snap;
if (i->is_pg(pgid, snap)) {
colls_to_check.push_back(*i);