From: Samuel Just Date: Tue, 28 Jan 2014 20:50:05 +0000 (-0800) Subject: osd/: refer to pg by spg_t and replica by pg_shard_t X-Git-Tag: v0.78~163^2~32 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=035d6cc2a315eab19d00f9d064edda6d2c9fa0a1;p=ceph.git osd/: refer to pg by spg_t and replica by pg_shard_t We may have multiple pg shards on the same osd for an ec pool. Thus, replicas must be referred to by (osd, shard) and pgs by (pgid, shard). Signed-off-by: Samuel Just --- diff --git a/src/os/DBObjectMap.cc b/src/os/DBObjectMap.cc index 886658f9aa7..f23815a4298 100644 --- a/src/os/DBObjectMap.cc +++ b/src/os/DBObjectMap.cc @@ -248,9 +248,9 @@ bool DBObjectMap::parse_ghobject_key_v0(const string &in, coll_t *c, *c = coll_t(coll); int64_t pool = -1; - pg_t pg; + spg_t pg; if (c->is_pg_prefix(pg)) - pool = (int64_t)pg.pool(); + pool = (int64_t)pg.pgid.pool(); (*oid) = ghobject_t(hobject_t(name, key, snap, hash, pool, "")); return true; } diff --git a/src/os/LFNIndex.cc b/src/os/LFNIndex.cc index a250016e8e9..92ccebf0249 100644 --- a/src/os/LFNIndex.cc +++ b/src/os/LFNIndex.cc @@ -950,9 +950,9 @@ bool LFNIndex::lfn_parse_object_name_keyless(const string &long_name, ghobject_t { bool r = parse_object(long_name.c_str(), *out); int64_t pool = -1; - pg_t pg; + spg_t pg; if (coll().is_pg_prefix(pg)) - pool = (int64_t)pg.pool(); + pool = (int64_t)pg.pgid.pool(); out->hobj.pool = pool; if (!r) return r; string temp = lfn_generate_object_name(*out); @@ -1043,9 +1043,9 @@ bool LFNIndex::lfn_parse_object_name_poolless(const string &long_name, int64_t pool = -1; - pg_t pg; + spg_t pg; if (coll().is_pg_prefix(pg)) - pool = (int64_t)pg.pool(); + pool = (int64_t)pg.pgid.pool(); (*out) = ghobject_t(hobject_t(name, key, snap, hash, pool, "")); return true; } diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 22b1ad4215b..15c38833ade 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -235,9 +235,9 @@ OSDService::~OSDService() delete objecter; } -void 
OSDService::_start_split(pg_t parent, const set &children) +void OSDService::_start_split(spg_t parent, const set &children) { - for (set::const_iterator i = children.begin(); + for (set::const_iterator i = children.begin(); i != children.end(); ++i) { dout(10) << __func__ << ": Starting split on pg " << *i @@ -251,12 +251,12 @@ void OSDService::_start_split(pg_t parent, const set &children) } } -void OSDService::mark_split_in_progress(pg_t parent, const set &children) +void OSDService::mark_split_in_progress(spg_t parent, const set &children) { Mutex::Locker l(in_progress_split_lock); - map >::iterator piter = rev_pending_splits.find(parent); + map >::iterator piter = rev_pending_splits.find(parent); assert(piter != rev_pending_splits.end()); - for (set::const_iterator i = children.begin(); + for (set::const_iterator i = children.begin(); i != children.end(); ++i) { assert(piter->second.count(*i)); @@ -272,19 +272,19 @@ void OSDService::mark_split_in_progress(pg_t parent, const set &children) rev_pending_splits.erase(piter); } -void OSDService::cancel_pending_splits_for_parent(pg_t parent) +void OSDService::cancel_pending_splits_for_parent(spg_t parent) { Mutex::Locker l(in_progress_split_lock); return _cancel_pending_splits_for_parent(parent); } -void OSDService::_cancel_pending_splits_for_parent(pg_t parent) +void OSDService::_cancel_pending_splits_for_parent(spg_t parent) { - map >::iterator piter = rev_pending_splits.find(parent); + map >::iterator piter = rev_pending_splits.find(parent); if (piter == rev_pending_splits.end()) return; - for (set::iterator i = piter->second.begin(); + for (set::iterator i = piter->second.begin(); i != piter->second.end(); ++i) { assert(pending_splits.count(*i)); @@ -299,11 +299,11 @@ void OSDService::_cancel_pending_splits_for_parent(pg_t parent) void OSDService::_maybe_split_pgid(OSDMapRef old_map, OSDMapRef new_map, - pg_t pgid) + spg_t pgid) { assert(old_map->have_pg_pool(pgid.pool())); if (pgid.ps() < 
static_cast(old_map->get_pg_num(pgid.pool()))) { - set children; + set children; pgid.is_split(old_map->get_pg_num(pgid.pool()), new_map->get_pg_num(pgid.pool()), &children); _start_split(pgid, children); @@ -312,7 +312,7 @@ void OSDService::_maybe_split_pgid(OSDMapRef old_map, } } -void OSDService::init_splits_between(pg_t pgid, +void OSDService::init_splits_between(spg_t pgid, OSDMapRef frommap, OSDMapRef tomap) { @@ -323,7 +323,7 @@ void OSDService::init_splits_between(pg_t pgid, tomap->get_pg_num(pgid.pool()), NULL)) { // Ok, a split happened, so we need to walk the osdmaps - set new_pgs; // pgs to scan on each map + set new_pgs; // pgs to scan on each map new_pgs.insert(pgid); OSDMapRef curmap(get_map(frommap->get_epoch())); for (epoch_t e = frommap->get_epoch() + 1; @@ -332,9 +332,9 @@ void OSDService::init_splits_between(pg_t pgid, OSDMapRef nextmap(try_get_map(e)); if (!nextmap) continue; - set even_newer_pgs; // pgs added in this loop - for (set::iterator i = new_pgs.begin(); i != new_pgs.end(); ++i) { - set split_pgs; + set even_newer_pgs; // pgs added in this loop + for (set::iterator i = new_pgs.begin(); i != new_pgs.end(); ++i) { + set split_pgs; if (i->is_split(curmap->get_pg_num(i->pool()), nextmap->get_pg_num(i->pool()), &split_pgs)) { @@ -353,7 +353,7 @@ void OSDService::expand_pg_num(OSDMapRef old_map, OSDMapRef new_map) { Mutex::Locker l(in_progress_split_lock); - for (set::iterator i = in_progress_splits.begin(); + for (set::iterator i = in_progress_splits.begin(); i != in_progress_splits.end(); ) { if (!new_map->have_pg_pool(i->pool())) { @@ -363,7 +363,7 @@ void OSDService::expand_pg_num(OSDMapRef old_map, ++i; } } - for (map::iterator i = pending_splits.begin(); + for (map::iterator i = pending_splits.begin(); i != pending_splits.end(); ) { if (!new_map->have_pg_pool(i->first.pool())) { @@ -376,17 +376,17 @@ void OSDService::expand_pg_num(OSDMapRef old_map, } } -bool OSDService::splitting(pg_t pgid) +bool OSDService::splitting(spg_t pgid) { 
Mutex::Locker l(in_progress_split_lock); return in_progress_splits.count(pgid) || pending_splits.count(pgid); } -void OSDService::complete_split(const set &pgs) +void OSDService::complete_split(const set &pgs) { Mutex::Locker l(in_progress_split_lock); - for (set::const_iterator i = pgs.begin(); + for (set::const_iterator i = pgs.begin(); i != pgs.end(); ++i) { dout(10) << __func__ << ": Completing split on pg " << *i << dendl; @@ -598,7 +598,7 @@ int OSD::do_convertfs(ObjectStore *store) for (vector::iterator i = collections.begin(); i != collections.end(); ++i) { - pg_t pgid; + spg_t pgid; if (i->is_temp(pgid)) recursive_remove_collection(store, *i); else if (i->to_str() == "convertfs_temp" || @@ -1011,7 +1011,7 @@ bool OSD::asok_command(string command, cmdmap_t& cmdmap, string format, list watchers; osd_lock.Lock(); // scan pg's - for (ceph::unordered_map::iterator it = pg_map.begin(); + for (ceph::unordered_map::iterator it = pg_map.begin(); it != pg_map.end(); ++it) { @@ -1525,7 +1525,7 @@ int OSD::shutdown() cct->_conf->apply_changes(NULL); // Shutdown PGs - for (ceph::unordered_map::iterator p = pg_map.begin(); + for (ceph::unordered_map::iterator p = pg_map.begin(); p != pg_map.end(); ++p) { dout(20) << " kicking pg " << p->first << dendl; @@ -1624,7 +1624,7 @@ int OSD::shutdown() #ifdef PG_DEBUG_REFS service.dump_live_pgids(); #endif - for (ceph::unordered_map::iterator p = pg_map.begin(); + for (ceph::unordered_map::iterator p = pg_map.begin(); p != pg_map.end(); ++p) { dout(20) << " kicking pg " << p->first << dendl; @@ -1757,7 +1757,7 @@ PGPool OSD::_get_pool(int id, OSDMapRef createmap) PG *OSD::_open_lock_pg( OSDMapRef createmap, - pg_t pgid, bool no_lockdep_check, bool hold_map_lock) + spg_t pgid, bool no_lockdep_check, bool hold_map_lock) { assert(osd_lock.is_locked()); @@ -1772,7 +1772,7 @@ PG *OSD::_open_lock_pg( PG* OSD::_make_pg( OSDMapRef createmap, - pg_t pgid) + spg_t pgid) { dout(10) << "_open_lock_pg " << pgid << dendl; PGPool pool = 
_get_pool(pgid.pool(), createmap); @@ -1781,7 +1781,7 @@ PG* OSD::_make_pg( PG *pg; hobject_t logoid = make_pg_log_oid(pgid); hobject_t infooid = make_pg_biginfo_oid(pgid); - if (createmap->get_pg_type(pgid) == pg_pool_t::TYPE_REPLICATED) + if (createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_REPLICATED) pg = new ReplicatedPG(&service, createmap, pool, pgid, logoid, infooid); else assert(0); @@ -1797,14 +1797,14 @@ void OSD::add_newly_split_pg(PG *pg, PG::RecoveryCtx *rctx) pg_map[pg->info.pgid] = pg; dout(10) << "Adding newly split pg " << *pg << dendl; vector up, acting; - pg->get_osdmap()->pg_to_up_acting_osds(pg->info.pgid, up, acting); + pg->get_osdmap()->pg_to_up_acting_osds(pg->info.pgid.pgid, up, acting); int role = OSDMap::calc_pg_role(service.whoami, acting); pg->set_role(role); pg->reg_next_scrub(); pg->handle_loaded(rctx); pg->write_if_dirty(*(rctx->transaction)); pg->queue_null(e, e); - map >::iterator to_wake = + map >::iterator to_wake = peering_wait_for_split.find(pg->info.pgid); if (to_wake != peering_wait_for_split.end()) { for (list::iterator i = @@ -1821,13 +1821,13 @@ void OSD::add_newly_split_pg(PG *pg, PG::RecoveryCtx *rctx) } OSD::res_result OSD::_try_resurrect_pg( - OSDMapRef curmap, pg_t pgid, pg_t *resurrected, PGRef *old_pg_state) + OSDMapRef curmap, spg_t pgid, spg_t *resurrected, PGRef *old_pg_state) { assert(resurrected); assert(old_pg_state); // find nearest ancestor DeletingStateRef df; - pg_t cur(pgid); + spg_t cur(pgid); while (true) { df = service.deleting_pgs.lookup(cur); if (df) @@ -1843,7 +1843,7 @@ OSD::res_result OSD::_try_resurrect_pg( OSDMapRef create_map = df->old_pg_state->get_osdmap(); df->old_pg_state->unlock(); - set children; + set children; if (cur == pgid) { if (df->try_stop_deletion()) { dout(10) << __func__ << ": halted deletion on pg " << pgid << dendl; @@ -1880,11 +1880,14 @@ OSD::res_result OSD::_try_resurrect_pg( PG *OSD::_create_lock_pg( OSDMapRef createmap, - pg_t pgid, + spg_t pgid, bool newly_created, 
bool hold_map_lock, bool backfill, - int role, vector& up, vector& acting, pg_history_t history, + int role, + vector& up, int up_primary, + vector& acting, int acting_primary, + pg_history_t history, pg_interval_map_t& pi, ObjectStore::Transaction& t) { @@ -1895,20 +1898,29 @@ PG *OSD::_create_lock_pg( service.init_splits_between(pgid, pg->get_osdmap(), service.get_osdmap()); - pg->init(role, up, acting, history, pi, backfill, &t); + pg->init( + role, + up, + up_primary, + acting, + acting_primary, + history, + pi, + backfill, + &t); dout(7) << "_create_lock_pg " << *pg << dendl; return pg; } -bool OSD::_have_pg(pg_t pgid) +bool OSD::_have_pg(spg_t pgid) { assert(osd_lock.is_locked()); return pg_map.count(pgid); } -PG *OSD::_lookup_lock_pg(pg_t pgid) +PG *OSD::_lookup_lock_pg(spg_t pgid) { assert(osd_lock.is_locked()); if (!pg_map.count(pgid)) @@ -1919,7 +1931,7 @@ PG *OSD::_lookup_lock_pg(pg_t pgid) } -PG *OSD::_lookup_pg(pg_t pgid) +PG *OSD::_lookup_pg(spg_t pgid) { assert(osd_lock.is_locked()); if (!pg_map.count(pgid)) @@ -1928,7 +1940,7 @@ PG *OSD::_lookup_pg(pg_t pgid) return pg; } -PG *OSD::_lookup_lock_pg_with_map_lock_held(pg_t pgid) +PG *OSD::_lookup_lock_pg_with_map_lock_held(spg_t pgid) { assert(osd_lock.is_locked()); assert(pg_map.count(pgid)); @@ -1949,12 +1961,12 @@ void OSD::load_pgs() derr << "failed to list pgs: " << cpp_strerror(-r) << dendl; } - set head_pgs; - map > pgs; + set head_pgs; + map > pgs; for (vector::iterator it = ls.begin(); it != ls.end(); ++it) { - pg_t pgid; + spg_t pgid; snapid_t snap; uint64_t seq; @@ -1981,10 +1993,10 @@ void OSD::load_pgs() } bool has_upgraded = false; - for (map >::iterator i = pgs.begin(); + for (map >::iterator i = pgs.begin(); i != pgs.end(); ++i) { - pg_t pgid(i->first); + spg_t pgid(i->first); if (!head_pgs.count(pgid)) { dout(10) << __func__ << ": " << pgid << " has orphan snap collections " << i->second @@ -2051,7 +2063,15 @@ void OSD::load_pgs() pg->reg_next_scrub(); // generate state for PG's 
current mapping - pg->get_osdmap()->pg_to_up_acting_osds(pgid, pg->up, pg->acting); + int primary, up_primary; + vector acting, up; + pg->get_osdmap()->pg_to_up_acting_osds( + pgid.pgid, &up, &up_primary, &acting, &primary); + pg->init_primary_up_acting( + up, + acting, + up_primary, + primary); int role = OSDMap::calc_pg_role(whoami, pg->acting); pg->set_role(role); @@ -2089,7 +2109,7 @@ void OSD::build_past_intervals_parallel() // calculate untion of map range epoch_t end_epoch = superblock.oldest_map; epoch_t cur_epoch = superblock.newest_map; - for (ceph::unordered_map::iterator i = pg_map.begin(); + for (ceph::unordered_map::iterator i = pg_map.begin(); i != pg_map.end(); ++i) { PG *pg = i->second; @@ -2131,7 +2151,7 @@ void OSD::build_past_intervals_parallel() continue; vector acting, up; - cur_map->pg_to_up_acting_osds(pg->info.pgid, up, acting); + cur_map->pg_to_up_acting_osds(pg->info.pgid.pgid, up, acting); if (p.same_interval_since == 0) { dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid @@ -2145,15 +2165,16 @@ void OSD::build_past_intervals_parallel() assert(last_map); std::stringstream debug; - bool new_interval = pg_interval_t::check_new_interval(p.old_acting, acting, - p.old_up, up, - p.same_interval_since, - pg->info.history.last_epoch_clean, - cur_map, last_map, - pg->info.pgid.pool(), - pg->info.pgid, - &pg->past_intervals, - &debug); + bool new_interval = pg_interval_t::check_new_interval( + p.old_acting, acting, + p.old_up, up, + p.same_interval_since, + pg->info.history.last_epoch_clean, + cur_map, last_map, + pg->info.pgid.pool(), + pg->info.pgid.pgid, + &pg->past_intervals, + &debug); if (new_interval) { dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid << " " << debug.str() << dendl; @@ -2195,37 +2216,40 @@ void OSD::build_past_intervals_parallel() * hasn't changed since the given epoch and we are the primary. 
*/ void OSD::handle_pg_peering_evt( + spg_t pgid, const pg_info_t& info, pg_interval_map_t& pi, epoch_t epoch, - int from, + pg_shard_t from, bool primary, PG::CephPeeringEvtRef evt) { - if (service.splitting(info.pgid)) { - peering_wait_for_split[info.pgid].push_back(evt); + if (service.splitting(pgid)) { + peering_wait_for_split[pgid].push_back(evt); return; } - if (!_have_pg(info.pgid)) { + if (!_have_pg(pgid)) { // same primary? - if (!osdmap->have_pg_pool(info.pgid.pool())) + if (!osdmap->have_pg_pool(pgid.pool())) return; + int up_primary, acting_primary; vector up, acting; - osdmap->pg_to_up_acting_osds(info.pgid, up, acting); + osdmap->pg_to_up_acting_osds( + pgid.pgid, &up, &up_primary, &acting, &acting_primary); int role = osdmap->calc_pg_role(whoami, acting, acting.size()); pg_history_t history = info.history; bool valid_history = project_pg_history( - info.pgid, history, epoch, up, acting); + pgid, history, epoch, up, acting); if (!valid_history || epoch < history.same_interval_since) { - dout(10) << "get_or_create_pg " << info.pgid << " acting changed in " + dout(10) << "get_or_create_pg " << pgid << " acting changed in " << history.same_interval_since << " (msg from " << epoch << ")" << dendl; return; } - if (service.splitting(info.pgid)) { + if (service.splitting(pgid)) { assert(0); } @@ -2234,29 +2258,29 @@ void OSD::handle_pg_peering_evt( // DNE on source? if (info.dne()) { // is there a creation pending on this pg? 
- if (creating_pgs.count(info.pgid)) { - creating_pgs[info.pgid].prior.erase(from); - if (!can_create_pg(info.pgid)) + if (creating_pgs.count(pgid)) { + creating_pgs[pgid].prior.erase(from); + if (!can_create_pg(pgid)) return; - history = creating_pgs[info.pgid].history; + history = creating_pgs[pgid].history; create = true; } else { - dout(10) << "get_or_create_pg " << info.pgid + dout(10) << "get_or_create_pg " << pgid << " DNE on source, but creation probe, ignoring" << dendl; return; } } - creating_pgs.erase(info.pgid); + creating_pgs.erase(pgid); } else { assert(!info.dne()); // pg exists if we are hearing about it } // do we need to resurrect a deleting pg? - pg_t resurrected; + spg_t resurrected; PGRef old_pg_state; res_result result = _try_resurrect_pg( service.get_osdmap(), - info.pgid, + pgid, &resurrected, &old_pg_state); @@ -2264,11 +2288,14 @@ void OSD::handle_pg_peering_evt( switch (result) { case RES_NONE: { // ok, create the pg locally using provided Info and History - rctx.transaction->create_collection(coll_t(info.pgid)); + rctx.transaction->create_collection(coll_t(pgid)); PG *pg = _create_lock_pg( get_map(epoch), - info.pgid, create, false, result == RES_SELF, - role, up, acting, history, pi, + pgid, create, false, result == RES_SELF, + role, + up, up_primary, + acting, acting_primary, + history, pi, *rctx.transaction); pg->handle_create(&rctx); pg->write_if_dirty(*rctx.transaction); @@ -2293,7 +2320,9 @@ void OSD::handle_pg_peering_evt( true, old_pg_state->role, old_pg_state->up, + old_pg_state->up_primary.osd, old_pg_state->acting, + old_pg_state->primary.osd, old_pg_state->info.history, old_pg_state->past_intervals, *rctx.transaction); @@ -2322,7 +2351,9 @@ void OSD::handle_pg_peering_evt( true, old_pg_state->role, old_pg_state->up, + old_pg_state->up_primary.osd, old_pg_state->acting, + old_pg_state->primary.osd, old_pg_state->info.history, old_pg_state->past_intervals, *rctx.transaction @@ -2337,8 +2368,8 @@ void OSD::handle_pg_peering_evt( 
// kick any waiters wake_pg_waiters(parent->info.pgid); - assert(service.splitting(info.pgid)); - peering_wait_for_split[info.pgid].push_back(evt); + assert(service.splitting(pgid)); + peering_wait_for_split[pgid].push_back(evt); //parent->queue_peering_event(evt); parent->queue_null(osdmap->get_epoch(), osdmap->get_epoch()); @@ -2348,7 +2379,7 @@ void OSD::handle_pg_peering_evt( } } else { // already had it. did the mapping change? - PG *pg = _lookup_lock_pg(info.pgid); + PG *pg = _lookup_lock_pg(pgid); if (epoch < pg->info.history.same_interval_since) { dout(10) << *pg << " get_or_create_pg acting changed in " << pg->info.history.same_interval_since @@ -2368,27 +2399,36 @@ void OSD::handle_pg_peering_evt( * - from each epoch, include all osds up then AND now * - if no osds from then are up now, include them all, even tho they're not reachable now */ -void OSD::calc_priors_during(pg_t pgid, epoch_t start, epoch_t end, set& pset) +void OSD::calc_priors_during( + spg_t pgid, epoch_t start, epoch_t end, set& pset) { - dout(15) << "calc_priors_during " << pgid << " [" << start << "," << end << ")" << dendl; + dout(15) << "calc_priors_during " << pgid << " [" << start + << "," << end << ")" << dendl; for (epoch_t e = start; e < end; e++) { OSDMapRef oldmap = get_map(e); vector acting; - oldmap->pg_to_acting_osds(pgid, acting); + oldmap->pg_to_acting_osds(pgid.pgid, acting); dout(20) << " " << pgid << " in epoch " << e << " was " << acting << dendl; int up = 0; for (unsigned i=0; iis_up(acting[i])) { - if (acting[i] != whoami) - pset.insert(acting[i]); + if (acting[i] != whoami) { + pset.insert( + pg_shard_t( + acting[i], + osdmap->pg_is_ec(pgid.pgid) ? i : ghobject_t::NO_SHARD)); + } up++; } if (!up && !acting.empty()) { // sucky. add down osds, even tho we can't reach them right now. for (unsigned i=0; ipg_is_ec(pgid.pgid) ? 
i : ghobject_t::NO_SHARD)); } } dout(10) << "calc_priors_during " << pgid @@ -2401,7 +2441,7 @@ void OSD::calc_priors_during(pg_t pgid, epoch_t start, epoch_t end, set& ps * Fill in the passed history so you know same_interval_since, same_up_since, * and same_primary_since. */ -bool OSD::project_pg_history(pg_t pgid, pg_history_t& h, epoch_t from, +bool OSD::project_pg_history(spg_t pgid, pg_history_t& h, epoch_t from, const vector& currentup, const vector& currentacting) { @@ -2423,7 +2463,7 @@ bool OSD::project_pg_history(pg_t pgid, pg_history_t& h, epoch_t from, assert(oldmap->have_pg_pool(pgid.pool())); vector up, acting; - oldmap->pg_to_up_acting_osds(pgid, up, acting); + oldmap->pg_to_up_acting_osds(pgid.pgid, up, acting); // acting set change? if ((acting != currentacting || up != currentup) && e > h.same_interval_since) { @@ -2667,7 +2707,7 @@ void OSD::maybe_update_heartbeat_peers() // build heartbeat from set if (is_active()) { - for (ceph::unordered_map::iterator i = pg_map.begin(); + for (ceph::unordered_map::iterator i = pg_map.begin(); i != pg_map.end(); ++i) { PG *pg = i->second; @@ -3177,7 +3217,7 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store, command == "truncobj" || command == "injectmdataerr" || command == "injectdataerr" ) { - pg_t rawpg, pgid; + pg_t rawpg; int64_t pool; OSDMapRef curmap = service->get_osdmap(); int r; @@ -3208,7 +3248,11 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store, ss << "Invalid namespace/objname"; return; } - pgid = curmap->raw_pg_to_pg(rawpg); + if (curmap->pg_is_ec(rawpg)) { + ss << "Must not call on ec pool"; + return; + } + spg_t pgid = spg_t(curmap->raw_pg_to_pg(rawpg), ghobject_t::no_shard()); hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace); ObjectStore::Transaction t; @@ -3844,7 +3888,7 @@ void OSD::send_pg_stats(const utime_t &now) } pg->pg_stats_publish_lock.Lock(); if (pg->pg_stats_publish_valid) { - 
m->pg_stat[pg->info.pgid] = pg->pg_stats_publish; + m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish; dout(25) << " sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":" << pg->pg_stats_publish.reported_seq << dendl; } else { @@ -3888,8 +3932,8 @@ void OSD::handle_pg_stats_ack(MPGStatsAck *ack) PGRef _pg(pg); ++p; - if (ack->pg_stat.count(pg->info.pgid)) { - pair acked = ack->pg_stat[pg->info.pgid]; + if (ack->pg_stat.count(pg->info.pgid.pgid)) { + pair acked = ack->pg_stat[pg->info.pgid.pgid]; pg->pg_stats_publish_lock.Lock(); if (acked.first == pg->pg_stats_publish.reported_seq && acked.second == pg->pg_stats_publish.reported_epoch) { @@ -4137,7 +4181,7 @@ void OSD::do_command(Connection *con, tid_t tid, vector& cmd, bufferlist prefix == "mark_unfound_lost" || prefix == "list_missing") )) { - pg_t pgid; + spg_t pgid; if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) { ss << "no pgid specified"; @@ -4235,16 +4279,16 @@ void OSD::do_command(Connection *con, tid_t tid, vector& cmd, bufferlist goto out; } - std::set keys; - for (ceph::unordered_map::const_iterator pg_map_e = pg_map.begin(); + std::set keys; + for (ceph::unordered_map::const_iterator pg_map_e = pg_map.begin(); pg_map_e != pg_map.end(); ++pg_map_e) { keys.insert(pg_map_e->first); } fout << "*** osd " << whoami << ": dump_missing ***" << std::endl; - for (std::set ::iterator p = keys.begin(); + for (std::set ::iterator p = keys.begin(); p != keys.end(); ++p) { - ceph::unordered_map::iterator q = pg_map.find(*p); + ceph::unordered_map::iterator q = pg_map.find(*p); assert(q != pg_map.end()); PG *pg = q->second; pg->lock(); @@ -4256,11 +4300,11 @@ void OSD::do_command(Connection *con, tid_t tid, vector& cmd, bufferlist pg->pg_log.get_missing().missing.begin(); for (; mi != mend; ++mi) { fout << mi->first << " -> " << mi->second << std::endl; - map >::const_iterator mli = + map >::const_iterator mli = pg->missing_loc.find(mi->first); if (mli == pg->missing_loc.end()) continue; - 
const set &mls(mli->second); + const set &mls(mli->second); if (mls.empty()) continue; fout << "missing_loc: " << mls << std::endl; @@ -4816,7 +4860,7 @@ void OSD::handle_scrub(MOSDScrub *m) } if (m->scrub_pgs.empty()) { - for (ceph::unordered_map::iterator p = pg_map.begin(); + for (ceph::unordered_map::iterator p = pg_map.begin(); p != pg_map.end(); ++p) { PG *pg = p->second; @@ -4834,9 +4878,11 @@ void OSD::handle_scrub(MOSDScrub *m) } else { for (vector::iterator p = m->scrub_pgs.begin(); p != m->scrub_pgs.end(); - ++p) - if (pg_map.count(*p)) { - PG *pg = pg_map[*p]; + ++p) { + spg_t pcand; + if (osdmap->get_primary_shard(*p, &pcand) && + pg_map.count(pcand)) { + PG *pg = pg_map[pcand]; pg->lock(); if (pg->is_primary()) { pg->unreg_next_scrub(); @@ -4848,6 +4894,7 @@ void OSD::handle_scrub(MOSDScrub *m) } pg->unlock(); } + } } m->put(); @@ -4896,11 +4943,11 @@ void OSD::sched_scrub() //dout(20) << " " << last_scrub_pg << dendl; - pair pos; + pair pos; if (service.first_scrub_stamp(&pos)) { do { utime_t t = pos.first; - pg_t pgid = pos.second; + spg_t pgid = pos.second; dout(30) << "sched_scrub examine " << pgid << " at " << t << dendl; utime_t diff = now - t; @@ -5496,12 +5543,19 @@ void OSD::advance_pg( continue; vector newup, newacting; - nextmap->pg_to_up_acting_osds(pg->info.pgid, newup, newacting); - pg->handle_advance_map(nextmap, lastmap, newup, newacting, rctx); + int up_primary, acting_primary; + nextmap->pg_to_up_acting_osds( + pg->info.pgid.pgid, + &newup, &up_primary, + &newacting, &acting_primary); + pg->handle_advance_map( + nextmap, lastmap, newup, up_primary, + newacting, acting_primary, rctx); // Check for split! 
- set children; - if (pg->info.pgid.is_split( + set children; + spg_t parent(pg->info.pgid); + if (parent.is_split( lastmap->get_pg_num(pg->pool.id), nextmap->get_pg_num(pg->pool.id), &children)) { @@ -5541,15 +5595,15 @@ void OSD::advance_map(ObjectStore::Transaction& t, C_Contexts *tfin) } // scan pg creations - ceph::unordered_map::iterator n = creating_pgs.begin(); + ceph::unordered_map::iterator n = creating_pgs.begin(); while (n != creating_pgs.end()) { - ceph::unordered_map::iterator p = n++; - pg_t pgid = p->first; + ceph::unordered_map::iterator p = n++; + spg_t pgid = p->first; // am i still primary? vector acting; int primary; - osdmap->pg_to_acting_osds(pgid, &acting, &primary); + osdmap->pg_to_acting_osds(pgid.pgid, &acting, &primary); if (primary != whoami) { dout(10) << " no longer primary for " << pgid << ", stopping creation" << dendl; creating_pgs.erase(p); @@ -5563,12 +5617,12 @@ void OSD::advance_map(ObjectStore::Transaction& t, C_Contexts *tfin) } // scan pgs with waiters - map >::iterator p = waiting_for_pg.begin(); + map >::iterator p = waiting_for_pg.begin(); while (p != waiting_for_pg.end()) { - pg_t pgid = p->first; + spg_t pgid = p->first; vector acting; - int nrep = osdmap->pg_to_acting_osds(pgid, acting); + int nrep = osdmap->pg_to_acting_osds(pgid.pgid, acting); int role = osdmap->calc_pg_role(whoami, acting, nrep); if (role >= 0) { ++p; // still me @@ -5591,7 +5645,7 @@ void OSD::consume_map() list to_remove; // scan pg's - for (ceph::unordered_map::iterator it = pg_map.begin(); + for (ceph::unordered_map::iterator it = pg_map.begin(); it != pg_map.end(); ++it) { PG *pg = it->second; @@ -5628,7 +5682,7 @@ void OSD::consume_map() service.publish_map(osdmap); // scan pg's - for (ceph::unordered_map::iterator it = pg_map.begin(); + for (ceph::unordered_map::iterator it = pg_map.begin(); it != pg_map.end(); ++it) { PG *pg = it->second; @@ -5912,7 +5966,7 @@ bool OSD::require_same_or_newer_map(OpRequestRef op, epoch_t epoch) // pg creation 
-bool OSD::can_create_pg(pg_t pgid) +bool OSD::can_create_pg(spg_t pgid) { assert(creating_pgs.count(pgid)); @@ -5929,7 +5983,7 @@ bool OSD::can_create_pg(pg_t pgid) void OSD::split_pgs( PG *parent, - const set &childpgids, set > *out_pgs, + const set &childpgids, set > *out_pgs, OSDMapRef curmap, OSDMapRef nextmap, PG::RecoveryCtx *rctx) @@ -5939,7 +5993,7 @@ void OSD::split_pgs( parent->update_snap_mapper_bits( parent->info.pgid.get_split_bits(pg_num) ); - for (set::const_iterator i = childpgids.begin(); + for (set::const_iterator i = childpgids.begin(); i != childpgids.end(); ++i) { dout(10) << "Splitting " << *parent << " into " << *i << dendl; @@ -5956,10 +6010,10 @@ void OSD::split_pgs( parent->split_colls( *i, split_bits, - i->m_seed, + i->ps(), rctx->transaction); parent->split_into( - *i, + i->pgid, child, split_bits); @@ -6014,41 +6068,48 @@ void OSD::handle_pg_create(OpRequestRef op) for (map::iterator p = m->mkpg.begin(); p != m->mkpg.end(); ++p) { - pg_t pgid = p->first; epoch_t created = p->second.created; pg_t parent = p->second.parent; if (p->second.split_bits) // Skip split pgs continue; - pg_t on = pgid; + pg_t on = p->first; - if (pgid.preferred() >= 0) { - dout(20) << "ignoring localized pg " << pgid << dendl; + if (on.preferred() >= 0) { + dout(20) << "ignoring localized pg " << on << dendl; continue; } - if (!osdmap->have_pg_pool(pgid.pool())) { - dout(20) << "ignoring pg on deleted pool " << pgid << dendl; + + if (!osdmap->have_pg_pool(on.pool())) { + dout(20) << "ignoring pg on deleted pool " << on << dendl; continue; } - dout(20) << "mkpg " << pgid << " e" << created << dendl; + dout(20) << "mkpg " << on << " e" << created << dendl; // is it still ours? 
vector up, acting; - int up_primary, acting_primary; + int up_primary = -1; + int acting_primary = -1; osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary); int role = osdmap->calc_pg_role(whoami, acting, acting.size()); if (up_primary != whoami) { - dout(10) << "mkpg " << pgid << " not primary (role=" << role << "), skipping" << dendl; + dout(10) << "mkpg " << on << " not primary (role=" + << role << "), skipping" << dendl; continue; } if (up != acting) { - dout(10) << "mkpg " << pgid << " up " << up << " != acting " << acting << ", ignoring" << dendl; + dout(10) << "mkpg " << on << " up " << up + << " != acting " << acting << ", ignoring" << dendl; // we'll get a query soon anyway, since we know the pg // must exist. we can ignore this. continue; } + spg_t pgid; + bool mapped = osdmap->get_primary_shard(on, &pgid); + assert(mapped); + // does it already exist? if (_have_pg(pgid)) { dout(10) << "mkpg " << pgid << " already exists, skipping" << dendl; @@ -6080,14 +6141,18 @@ void OSD::handle_pg_create(OpRequestRef op) PG::RecoveryCtx rctx = create_context(); // poll priors - set& pset = creating_pgs[pgid].prior; + set& pset = creating_pgs[pgid].prior; dout(10) << "mkpg " << pgid << " e" << created << " h " << history << " : querying priors " << pset << dendl; - for (set::iterator p = pset.begin(); p != pset.end(); ++p) - if (osdmap->is_up(*p)) - (*rctx.query_map)[*p][pgid] = pg_query_t(pg_query_t::INFO, history, - osdmap->get_epoch()); + for (set::iterator p = pset.begin(); p != pset.end(); ++p) + if (osdmap->is_up(p->osd)) + (*rctx.query_map)[p->osd][spg_t(pgid.pgid, p->shard)] = + pg_query_t( + pg_query_t::INFO, + p->shard, pgid.shard, + history, + osdmap->get_epoch()); PG *pg = NULL; if (can_create_pg(pgid)) { @@ -6095,7 +6160,8 @@ void OSD::handle_pg_create(OpRequestRef op) rctx.transaction->create_collection(coll_t(pgid)); pg = _create_lock_pg( osdmap, pgid, true, false, false, - 0, creating_pgs[pgid].acting, creating_pgs[pgid].acting, 
+ 0, creating_pgs[pgid].acting, whoami, + creating_pgs[pgid].acting, whoami, history, pi, *rctx.transaction); pg->info.last_epoch_started = pg->info.history.last_epoch_started; @@ -6122,10 +6188,10 @@ PG::RecoveryCtx OSD::create_context() ObjectStore::Transaction *t = new ObjectStore::Transaction; C_Contexts *on_applied = new C_Contexts(cct); C_Contexts *on_safe = new C_Contexts(cct); - map< int, map > *query_map = - new map >; + map > *query_map = + new map >; map > > *notify_list = - new map > >; + new map > >; map > > *info_map = new map > >; PG::RecoveryCtx rctx(query_map, info_map, notify_list, @@ -6152,16 +6218,25 @@ void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg, bool OSD::compat_must_dispatch_immediately(PG *pg) { assert(pg->is_locked()); - vector *tmpacting = &pg->acting; - if (pg->actingbackfill.size() > 0) - tmpacting = &pg->actingbackfill; - for (vector::iterator i = tmpacting->begin(); - i != tmpacting->end(); + set tmpacting; + if (pg->actingbackfill.size() > 0) { + tmpacting = pg->actingbackfill; + } else { + for (unsigned i = 0; i < pg->acting.size(); ++i) { + tmpacting.insert( + pg_shard_t( + pg->acting[i], + pg->pool.info.ec_pool() ? 
i : ghobject_t::NO_SHARD)); + } + } + + for (set::iterator i = tmpacting.begin(); + i != tmpacting.end(); ++i) { - if (*i == whoami) + if (i->osd == whoami) continue; ConnectionRef conn = - service.get_con_osd_cluster(*i, pg->get_osdmap()->get_epoch()); + service.get_con_osd_cluster(i->osd, pg->get_osdmap()->get_epoch()); if (conn && !conn->has_feature(CEPH_FEATURE_INDEP_PG_MAP)) { return true; } @@ -6203,30 +6278,29 @@ void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap, */ void OSD::do_notifies( - map< int,vector > >& notify_list, + map > >& notify_list, OSDMapRef curmap) { - for (map< int, vector > >::iterator it = notify_list.begin(); + for (map > >::iterator it = + notify_list.begin(); it != notify_list.end(); ++it) { - if (it->first == whoami) { - dout(7) << "do_notify osd." << it->first << " is self, skipping" << dendl; - continue; - } if (!curmap->is_up(it->first)) continue; - ConnectionRef con = service.get_con_osd_cluster(it->first, curmap->get_epoch()); + ConnectionRef con = service.get_con_osd_cluster( + it->first, curmap->get_epoch()); if (!con) continue; _share_map_outgoing(it->first, con.get(), curmap); if (con->has_feature(CEPH_FEATURE_INDEP_PG_MAP)) { - dout(7) << "do_notify osd." << it->first + dout(7) << "do_notify osd " << it->first << " on " << it->second.size() << " PGs" << dendl; MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(), it->second); cluster_messenger->send_message(m, con.get()); } else { - dout(7) << "do_notify osd." 
<< it->first + dout(7) << "do_notify osd " << it->first << " sending seperate messages" << dendl; for (vector >::iterator i = it->second.begin(); @@ -6246,10 +6320,10 @@ void OSD::do_notifies( /** do_queries * send out pending queries for info | summaries */ -void OSD::do_queries(map< int, map >& query_map, +void OSD::do_queries(map >& query_map, OSDMapRef curmap) { - for (map< int, map >::iterator pit = query_map.begin(); + for (map >::iterator pit = query_map.begin(); pit != query_map.end(); ++pit) { if (!curmap->is_up(pit->first)) @@ -6268,10 +6342,10 @@ void OSD::do_queries(map< int, map >& query_map, dout(7) << "do_queries querying osd." << who << " sending seperate messages " << " on " << pit->second.size() << " PGs" << dendl; - for (map::iterator i = pit->second.begin(); + for (map::iterator i = pit->second.begin(); i != pit->second.end(); ++i) { - map to_send; + map to_send; to_send.insert(*i); MOSDPGQuery *m = new MOSDPGQuery(i->second.epoch_sent, to_send); cluster_messenger->send_message(m, con.get()); @@ -6281,10 +6355,13 @@ void OSD::do_queries(map< int, map >& query_map, } -void OSD::do_infos(map > >& info_map, +void OSD::do_infos(map > >& info_map, OSDMapRef curmap) { - for (map > >::iterator p = info_map.begin(); + for (map > >::iterator p = + info_map.begin(); p != info_map.end(); ++p) { if (!curmap->is_up(p->first)) @@ -6292,9 +6369,11 @@ void OSD::do_infos(map > >& info for (vector >::iterator i = p->second.begin(); i != p->second.end(); ++i) { - dout(20) << "Sending info " << i->first.info << " to osd." 
<< p->first << dendl; + dout(20) << "Sending info " << i->first.info + << " to shard " << p->first << dendl; } - ConnectionRef con = service.get_con_osd_cluster(p->first, curmap->get_epoch()); + ConnectionRef con = service.get_con_osd_cluster( + p->first, curmap->get_epoch()); if (!con) continue; _share_map_outgoing(p->first, con.get(), curmap); @@ -6349,12 +6428,13 @@ void OSD::handle_pg_notify(OpRequestRef op) } handle_pg_peering_evt( + spg_t(it->first.info.pgid.pgid, it->first.to), it->first.info, it->second, - it->first.query_epoch, from, true, + it->first.query_epoch, pg_shard_t(from, it->first.from), true, PG::CephPeeringEvtRef( new PG::CephPeeringEvt( it->first.epoch_sent, it->first.query_epoch, - PG::MNotifyRec(from, it->first))) + PG::MNotifyRec(pg_shard_t(from, it->first.from), it->first))) ); } } @@ -6378,12 +6458,13 @@ void OSD::handle_pg_log(OpRequestRef op) op->mark_started(); handle_pg_peering_evt( + spg_t(m->info.pgid.pgid, m->to), m->info, m->past_intervals, m->get_epoch(), - from, false, + pg_shard_t(from, m->from), false, PG::CephPeeringEvtRef( new PG::CephPeeringEvt( m->get_epoch(), m->get_query_epoch(), - PG::MLogRec(from, m))) + PG::MLogRec(pg_shard_t(from, m->from), m))) ); } @@ -6410,12 +6491,15 @@ void OSD::handle_pg_info(OpRequestRef op) } handle_pg_peering_evt( + spg_t(p->first.info.pgid.pgid, p->first.to), p->first.info, p->second, p->first.epoch_sent, - from, false, + pg_shard_t(from, p->first.from), false, PG::CephPeeringEvtRef( new PG::CephPeeringEvt( p->first.epoch_sent, p->first.query_epoch, - PG::MInfoRec(from, p->first.info, p->first.epoch_sent))) + PG::MInfoRec( + pg_shard_t( + from, p->first.from), p->first.info, p->first.epoch_sent))) ); } } @@ -6454,7 +6538,8 @@ void OSD::handle_pg_trim(OpRequestRef op) if (pg->is_primary()) { // peer is informing us of their last_complete_ondisk dout(10) << *pg << " replica osd." 
<< from << " lcod " << m->trim_to << dendl; - pg->peer_last_complete_ondisk[from] = m->trim_to; + pg->peer_last_complete_ondisk[pg_shard_t(from, m->pgid.shard)] = + m->trim_to; if (pg->calc_min_last_complete_ondisk()) { dout(10) << *pg << " min lcod now " << pg->min_last_complete_ondisk << dendl; pg->trim_peers(); @@ -6654,10 +6739,10 @@ void OSD::handle_pg_query(OpRequestRef op) map< int, vector > > notify_list; - for (map::iterator it = m->pg_list.begin(); + for (map::iterator it = m->pg_list.begin(); it != m->pg_list.end(); ++it) { - pg_t pgid = it->first; + spg_t pgid = it->first; if (pgid.preferred() >= 0) { dout(10) << "ignoring localized pg " << pgid << dendl; @@ -6669,15 +6754,17 @@ void OSD::handle_pg_query(OpRequestRef op) PG::CephPeeringEvtRef( new PG::CephPeeringEvt( it->second.epoch_sent, it->second.epoch_sent, - PG::MQuery(from, it->second, it->second.epoch_sent)))); + PG::MQuery(pg_shard_t(from, it->second.from), + it->second, it->second.epoch_sent)))); continue; } if (pg_map.count(pgid)) { PG *pg = 0; pg = _lookup_lock_pg(pgid); - pg->queue_query(it->second.epoch_sent, it->second.epoch_sent, - from, it->second); + pg->queue_query( + it->second.epoch_sent, it->second.epoch_sent, + pg_shard_t(from, it->second.from), it->second); pg->unlock(); continue; } @@ -6687,7 +6774,7 @@ void OSD::handle_pg_query(OpRequestRef op) // get active crush mapping vector up, acting; - osdmap->pg_to_up_acting_osds(pgid, up, acting); + osdmap->pg_to_up_acting_osds(pgid.pgid, up, acting); // same primary? 
pg_history_t history = it->second.history; @@ -6703,21 +6790,27 @@ void OSD::handle_pg_query(OpRequestRef op) } dout(10) << " pg " << pgid << " dne" << dendl; - pg_info_t empty(pgid); + pg_info_t empty(spg_t(pgid.pgid, it->second.to)); if (it->second.type == pg_query_t::LOG || it->second.type == pg_query_t::FULLLOG) { ConnectionRef con = service.get_con_osd_cluster(from, osdmap->get_epoch()); if (con) { - MOSDPGLog *mlog = new MOSDPGLog(osdmap->get_epoch(), empty, - it->second.epoch_sent); + MOSDPGLog *mlog = new MOSDPGLog( + it->second.from, it->second.to, + osdmap->get_epoch(), empty, + it->second.epoch_sent); _share_map_outgoing(from, con.get(), osdmap); cluster_messenger->send_message(mlog, con.get()); } } else { - notify_list[from].push_back(make_pair(pg_notify_t(it->second.epoch_sent, - osdmap->get_epoch(), - empty), - pg_interval_map_t())); + notify_list[from].push_back( + make_pair( + pg_notify_t( + it->second.from, it->second.to, + it->second.epoch_sent, + osdmap->get_epoch(), + empty), + pg_interval_map_t())); } } do_notifies(notify_list, osdmap); @@ -6740,10 +6833,10 @@ void OSD::handle_pg_remove(OpRequestRef op) op->mark_started(); - for (vector::iterator it = m->pg_list.begin(); + for (vector::iterator it = m->pg_list.begin(); it != m->pg_list.end(); ++it) { - pg_t pgid = *it; + spg_t pgid = *it; if (pgid.preferred() >= 0) { dout(10) << "ignoring localized pg " << pgid << dendl; continue; @@ -6757,13 +6850,13 @@ void OSD::handle_pg_remove(OpRequestRef op) PG *pg = _lookup_lock_pg(pgid); pg_history_t history = pg->info.history; vector up, acting; - osdmap->pg_to_up_acting_osds(pgid, up, acting); + osdmap->pg_to_up_acting_osds(pgid.pgid, up, acting); bool valid_history = project_pg_history(pg->info.pgid, history, pg->get_osdmap()->get_epoch(), up, acting); if (valid_history && history.same_interval_since <= m->get_epoch()) { - assert(pg->get_primary() == m->get_source().num()); + assert(pg->get_primary().osd == m->get_source().num()); PGRef _pg(pg); 
_remove_pg(pg); pg->unlock(); @@ -6820,7 +6913,7 @@ void OSD::check_replay_queue() assert(osd_lock.is_locked()); utime_t now = ceph_clock_now(cct); - list< pair > pgids; + list< pair > pgids; replay_queue_lock.Lock(); while (!replay_queue.empty() && replay_queue.front().second <= now) { @@ -6829,8 +6922,8 @@ void OSD::check_replay_queue() } replay_queue_lock.Unlock(); - for (list< pair >::iterator p = pgids.begin(); p != pgids.end(); ++p) { - pg_t pgid = p->first; + for (list< pair >::iterator p = pgids.begin(); p != pgids.end(); ++p) { + spg_t pgid = p->first; if (pg_map.count(pgid)) { PG *pg = _lookup_lock_pg_with_map_lock_held(pgid); dout(10) << "check_replay_queue " << *pg << dendl; @@ -7111,18 +7204,24 @@ void OSD::handle_op(OpRequestRef op) } } // calc actual pgid - pg_t pgid = m->get_pg(); - int64_t pool = pgid.pool(); + pg_t _pgid = m->get_pg(); + int64_t pool = _pgid.pool(); if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0 && osdmap->have_pg_pool(pool)) - pgid = osdmap->raw_pg_to_pg(pgid); + _pgid = osdmap->raw_pg_to_pg(_pgid); + + spg_t pgid; + if (!osdmap->get_primary_shard(_pgid, &pgid)) { + // missing pool or acting set empty -- drop + return; + } // get and lock *pg. PG *pg = _have_pg(pgid) ? 
_lookup_pg(pgid) : NULL; if (!pg) { dout(7) << "hit non-existent pg " << pgid << dendl; - if (osdmap->get_pg_acting_role(pgid, whoami) >= 0) { + if (osdmap->get_pg_acting_role(pgid.pgid, whoami) >= 0) { dout(7) << "we are valid target for op, waiting" << dendl; waiting_for_pg[pgid].push_back(op); op->mark_delayed("waiting for pg to exist locally"); @@ -7136,7 +7235,7 @@ void OSD::handle_op(OpRequestRef op) } OSDMapRef send_map = get_map(m->get_map_epoch()); - if (send_map->get_pg_acting_role(pgid, whoami) >= 0) { + if (send_map->get_pg_acting_role(pgid.pgid, whoami) >= 0) { dout(7) << "dropping request; client will resend when they get new map" << dendl; } else if (!send_map->have_pg_pool(pgid.pool())) { dout(7) << "dropping request; pool did not exist" << dendl; @@ -7149,9 +7248,6 @@ void OSD::handle_op(OpRequestRef op) << "\n"; } else { dout(7) << "we are invalid target" << dendl; - pgid = m->get_pg(); - if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0) - pgid = send_map->raw_pg_to_pg(pgid); clog.warn() << m->get_source_inst() << " misdirected " << m->get_reqid() << " pg " << m->get_pg() << " to osd." 
<< whoami @@ -7195,7 +7291,7 @@ void OSD::handle_replica_op(OpRequestRef op) static_cast(m->get_connection()->get_priv())); // make sure we have the pg - const pg_t pgid = m->pgid; + const spg_t pgid = m->pgid; if (service.splitting(pgid)) { waiting_for_pg[pgid].push_back(op); return; @@ -7351,7 +7447,7 @@ struct C_CompleteSplits : public Context { if (osd->is_stopping()) return; PG::RecoveryCtx rctx = osd->create_context(); - set to_complete; + set to_complete; for (set >::iterator i = pgs.begin(); i != pgs.end(); ++i) { diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 3bf76d11dc6..1a592d667b2 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -197,9 +197,9 @@ class DeletingState { } status; bool stop_deleting; public: - const pg_t pgid; + const spg_t pgid; const PGRef old_pg_state; - DeletingState(const pair &in) : + DeletingState(const pair &in) : lock("DeletingState::lock"), status(QUEUED), stop_deleting(false), pgid(in.first), old_pg_state(in.second) {} @@ -289,8 +289,8 @@ class OSDService { public: OSD *osd; CephContext *cct; - SharedPtrRegistry osr_registry; - SharedPtrRegistry deleting_pgs; + SharedPtrRegistry osr_registry; + SharedPtrRegistry deleting_pgs; const int whoami; ObjectStore *&store; LogClient &clog; @@ -385,33 +385,33 @@ public: Mutex sched_scrub_lock; int scrubs_pending; int scrubs_active; - set< pair > last_scrub_pg; + set< pair > last_scrub_pg; - void reg_last_pg_scrub(pg_t pgid, utime_t t) { + void reg_last_pg_scrub(spg_t pgid, utime_t t) { Mutex::Locker l(sched_scrub_lock); - last_scrub_pg.insert(pair(t, pgid)); + last_scrub_pg.insert(pair(t, pgid)); } - void unreg_last_pg_scrub(pg_t pgid, utime_t t) { + void unreg_last_pg_scrub(spg_t pgid, utime_t t) { Mutex::Locker l(sched_scrub_lock); - pair p(t, pgid); - set >::iterator it = last_scrub_pg.find(p); + pair p(t, pgid); + set >::iterator it = last_scrub_pg.find(p); assert(it != last_scrub_pg.end()); last_scrub_pg.erase(it); } - bool first_scrub_stamp(pair *out) { + bool 
first_scrub_stamp(pair *out) { Mutex::Locker l(sched_scrub_lock); if (last_scrub_pg.empty()) return false; - set< pair >::iterator iter = last_scrub_pg.begin(); + set< pair >::iterator iter = last_scrub_pg.begin(); *out = *iter; return true; } - bool next_scrub_stamp(pair next, - pair *out) { + bool next_scrub_stamp(pair next, + pair *out) { Mutex::Locker l(sched_scrub_lock); if (last_scrub_pg.empty()) return false; - set< pair >::iterator iter = last_scrub_pg.lower_bound(next); + set< pair >::iterator iter = last_scrub_pg.lower_bound(next); if (iter == last_scrub_pg.end()) return false; ++iter; @@ -577,11 +577,11 @@ public: enum { BACKFILL_LOW = 0, // backfill non-degraded PGs BACKFILL_HIGH = 1, // backfill degraded PGs - RECOVERY = AsyncReserver::MAX_PRIORITY // log based recovery + RECOVERY = AsyncReserver::MAX_PRIORITY // log based recovery }; Finisher reserver_finisher; - AsyncReserver local_reserver; - AsyncReserver remote_reserver; + AsyncReserver local_reserver; + AsyncReserver remote_reserver; // -- pg_temp -- Mutex pg_temp_lock; @@ -652,26 +652,26 @@ public: // split Mutex in_progress_split_lock; - map pending_splits; // child -> parent - map > rev_pending_splits; // parent -> [children] - set in_progress_splits; // child + map pending_splits; // child -> parent + map > rev_pending_splits; // parent -> [children] + set in_progress_splits; // child - void _start_split(pg_t parent, const set &children); - void start_split(pg_t parent, const set &children) { + void _start_split(spg_t parent, const set &children); + void start_split(spg_t parent, const set &children) { Mutex::Locker l(in_progress_split_lock); return _start_split(parent, children); } - void mark_split_in_progress(pg_t parent, const set &pgs); - void complete_split(const set &pgs); - void cancel_pending_splits_for_parent(pg_t parent); - void _cancel_pending_splits_for_parent(pg_t parent); - bool splitting(pg_t pgid); + void mark_split_in_progress(spg_t parent, const set &pgs); + void 
complete_split(const set &pgs); + void cancel_pending_splits_for_parent(spg_t parent); + void _cancel_pending_splits_for_parent(spg_t parent); + bool splitting(spg_t pgid); void expand_pg_num(OSDMapRef old_map, OSDMapRef new_map); void _maybe_split_pgid(OSDMapRef old_map, OSDMapRef new_map, - pg_t pgid); - void init_splits_between(pg_t pgid, OSDMapRef frommap, OSDMapRef tomap); + spg_t pgid); + void init_splits_between(spg_t pgid, OSDMapRef frommap, OSDMapRef tomap); // -- OSD Full Status -- Mutex full_status_lock; @@ -706,9 +706,9 @@ public: #ifdef PG_DEBUG_REFS Mutex pgid_lock; - map pgid_tracker; - map live_pgs; - void add_pgid(pg_t pgid, PG *pg) { + map pgid_tracker; + map live_pgs; + void add_pgid(spg_t pgid, PG *pg) { Mutex::Locker l(pgid_lock); if (!pgid_tracker.count(pgid)) { pgid_tracker[pgid] = 0; @@ -716,7 +716,7 @@ public: } pgid_tracker[pgid]++; } - void remove_pgid(pg_t pgid, PG *pg) { + void remove_pgid(spg_t pgid, PG *pg) { Mutex::Locker l(pgid_lock); assert(pgid_tracker.count(pgid)); assert(pgid_tracker[pgid] > 0); @@ -729,7 +729,7 @@ public: void dump_live_pgids() { Mutex::Locker l(pgid_lock); derr << "live pgids:" << dendl; - for (map::iterator i = pgid_tracker.begin(); + for (map::iterator i = pgid_tracker.begin(); i != pgid_tracker.end(); ++i) { derr << "\t" << *i << dendl; @@ -831,7 +831,7 @@ public: 0)); } - static hobject_t make_pg_log_oid(pg_t pg) { + static hobject_t make_pg_log_oid(spg_t pg) { stringstream ss; ss << "pglog_" << pg; string s; @@ -839,7 +839,7 @@ public: return hobject_t(sobject_t(object_t(s.c_str()), 0)); } - static hobject_t make_pg_biginfo_oid(pg_t pg) { + static hobject_t make_pg_biginfo_oid(spg_t pg) { stringstream ss; ss << "pginfo_" << pg; string s; @@ -1264,19 +1264,19 @@ private: protected: // -- placement groups -- - ceph::unordered_map pg_map; - map > waiting_for_pg; - map > peering_wait_for_split; + ceph::unordered_map pg_map; + map > waiting_for_pg; + map > peering_wait_for_split; PGRecoveryStats 
pg_recovery_stats; PGPool _get_pool(int id, OSDMapRef createmap); - bool _have_pg(pg_t pgid); - PG *_lookup_lock_pg_with_map_lock_held(pg_t pgid); - PG *_lookup_lock_pg(pg_t pgid); - PG *_lookup_pg(pg_t pgid); + bool _have_pg(spg_t pgid); + PG *_lookup_lock_pg_with_map_lock_held(spg_t pgid); + PG *_lookup_lock_pg(spg_t pgid); + PG *_lookup_pg(spg_t pgid); PG *_open_lock_pg(OSDMapRef createmap, - pg_t pg, bool no_lockdep_check=false, + spg_t pg, bool no_lockdep_check=false, bool hold_map_lock=false); enum res_result { RES_PARENT, // resurrected a parent @@ -1284,50 +1284,54 @@ protected: RES_NONE // nothing relevant deleting }; res_result _try_resurrect_pg( - OSDMapRef curmap, pg_t pgid, pg_t *resurrected, PGRef *old_pg_state); - PG *_create_lock_pg(OSDMapRef createmap, - pg_t pgid, - bool newly_created, - bool hold_map_lock, - bool backfill, - int role, - vector& up, - vector& acting, - pg_history_t history, - pg_interval_map_t& pi, - ObjectStore::Transaction& t); - PG *_lookup_qlock_pg(pg_t pgid); - - PG* _make_pg(OSDMapRef createmap, pg_t pgid); + OSDMapRef curmap, spg_t pgid, spg_t *resurrected, PGRef *old_pg_state); + PG *_create_lock_pg( + OSDMapRef createmap, + spg_t pgid, + bool newly_created, + bool hold_map_lock, + bool backfill, + int role, + vector& up, int up_primary, + vector& acting, int acting_primary, + pg_history_t history, + pg_interval_map_t& pi, + ObjectStore::Transaction& t); + PG *_lookup_qlock_pg(spg_t pgid); + + PG* _make_pg(OSDMapRef createmap, spg_t pgid); void add_newly_split_pg(PG *pg, PG::RecoveryCtx *rctx); void handle_pg_peering_evt( + spg_t pgid, const pg_info_t& info, pg_interval_map_t& pi, - epoch_t epoch, int from, + epoch_t epoch, + pg_shard_t from, bool primary, PG::CephPeeringEvtRef evt); void load_pgs(); void build_past_intervals_parallel(); - void calc_priors_during(pg_t pgid, epoch_t start, epoch_t end, set& pset); + void calc_priors_during( + spg_t pgid, epoch_t start, epoch_t end, set& pset); /// project pg history from 
from to now bool project_pg_history( - pg_t pgid, pg_history_t& h, epoch_t from, + spg_t pgid, pg_history_t& h, epoch_t from, const vector& lastup, const vector& lastacting ); ///< @return false if there was a map gap between from and now - void wake_pg_waiters(pg_t pgid) { + void wake_pg_waiters(spg_t pgid) { if (waiting_for_pg.count(pgid)) { take_waiters_front(waiting_for_pg[pgid]); waiting_for_pg.erase(pgid); } } void wake_all_pg_waiters() { - for (map >::iterator p = waiting_for_pg.begin(); + for (map >::iterator p = waiting_for_pg.begin(); p != waiting_for_pg.end(); ++p) take_waiters_front(p->second); @@ -1339,20 +1343,20 @@ protected: struct create_pg_info { pg_history_t history; vector acting; - set prior; + set prior; pg_t parent; }; - ceph::unordered_map creating_pgs; + ceph::unordered_map creating_pgs; double debug_drop_pg_create_probability; int debug_drop_pg_create_duration; int debug_drop_pg_create_left; // 0 if we just dropped the last one, -1 if we can drop more - bool can_create_pg(pg_t pgid); + bool can_create_pg(spg_t pgid); void handle_pg_create(OpRequestRef op); void split_pgs( PG *parent, - const set &childpgids, set > *out_pgs, + const set &childpgids, set > *out_pgs, OSDMapRef curmap, OSDMapRef nextmap, PG::RecoveryCtx *rctx); @@ -1445,13 +1449,16 @@ protected: ThreadPool::TPHandle *handle = NULL); void dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg, ThreadPool::TPHandle *handle = NULL); - void do_notifies(map< int,vector > >& notify_list, + void do_notifies(map > >& + notify_list, OSDMapRef map); - void do_queries(map< int, map >& query_map, + void do_queries(map >& query_map, OSDMapRef map); - void do_infos(map > >& info_map, + void do_infos(map > >& info_map, OSDMapRef map); - void repeer(PG *pg, map< int, map >& query_map); + void repeer(PG *pg, map< int, map >& query_map); bool require_mon_peer(Message *m); bool require_osd_peer(OpRequestRef op); @@ -1535,7 +1542,7 @@ protected: utime_t defer_recovery_until; int 
recovery_ops_active; #ifdef DEBUG_RECOVERY_OIDS - map > recovery_oids; + map > recovery_oids; #endif struct RecoveryWQ : public ThreadPool::WorkQueue { @@ -1588,7 +1595,7 @@ protected: // replay / delayed pg activation Mutex replay_queue_lock; - list< pair > replay_queue; + list< pair > replay_queue; void check_replay_queue(); @@ -1815,7 +1822,7 @@ protected: } } remove_wq; uint64_t next_removal_seq; - coll_t get_next_removal_coll(pg_t pgid) { + coll_t get_next_removal_coll(spg_t pgid) { return coll_t::make_removal_coll(next_removal_seq++, pgid); } diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index a9eed12c19f..47ac2f796cf 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -622,6 +622,12 @@ public: assert(up.empty() || up_primary == up.front()); assert(acting.empty() || acting_primary == acting.front()); } + bool pg_is_ec(pg_t pg) const { + map::const_iterator i = pools.find(pg.pool()); + assert(i != pools.end()); + return i->second.ec_pool(); + } + spg_t get_primary_shard(pg_t pgid) const { return spg_t(); /* TODOSAM: fix */} int64_t lookup_pg_pool_name(const string& name) { if (name_pool.count(name)) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 7046a6fcdcf..793a3947b58 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -139,14 +139,14 @@ void PGPool::update(OSDMapRef map) } PG::PG(OSDService *o, OSDMapRef curmap, - const PGPool &_pool, pg_t p, const hobject_t& loid, + const PGPool &_pool, spg_t p, const hobject_t& loid, const hobject_t& ioid) : osd(o), cct(o->cct), osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()), snap_mapper( &osdriver, - p.m_seed, + p.ps(), p.get_split_bits(curmap->get_pg_num(_pool.id)), _pool.id, p.shard), @@ -166,6 +166,7 @@ PG::PG(OSDService *o, OSDMapRef curmap, role(0), state(0), send_notify(false), + pg_whoami(osd->whoami, p.shard), need_up_thru(false), last_peering_reset(0), heartbeat_peer_lock("PG::heartbeat_peer_lock"), @@ -227,9 +228,12 @@ std::string PG::gen_prefix() const /********* PG **********/ -void 
PG::proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_missing_t& omissing, int from) +void PG::proc_master_log( + ObjectStore::Transaction& t, pg_info_t &oinfo, + pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from) { - dout(10) << "proc_master_log for osd." << from << ": " << olog << " " << omissing << dendl; + dout(10) << "proc_master_log for osd." << from << ": " + << olog << " " << omissing << dendl; assert(!is_active() && is_primary()); // merge log into our own log to build master log. no need to @@ -245,8 +249,10 @@ void PG::proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t peer_missing[from].swap(omissing); } -void PG::proc_replica_log(ObjectStore::Transaction& t, - pg_info_t &oinfo, pg_log_t &olog, pg_missing_t& omissing, int from) +void PG::proc_replica_log( + ObjectStore::Transaction& t, + pg_info_t &oinfo, pg_log_t &olog, pg_missing_t& omissing, + pg_shard_t from) { dout(10) << "proc_replica_log for osd." << from << ": " << oinfo << " " << olog << " " << omissing << dendl; @@ -267,9 +273,9 @@ void PG::proc_replica_log(ObjectStore::Transaction& t, peer_missing[from].swap(omissing); } -bool PG::proc_replica_info(int from, const pg_info_t &oinfo) +bool PG::proc_replica_info(pg_shard_t from, const pg_info_t &oinfo) { - map::iterator p = peer_info.find(from); + map::iterator p = peer_info.find(from); if (p != peer_info.end() && p->second.last_update == oinfo.last_update) { dout(10) << " got dup osd." 
<< from << " info " << oinfo << ", identical to ours" << dendl; return false; @@ -344,7 +350,7 @@ void PG::update_object_snap_mapping( } void PG::merge_log( - ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, int from) + ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_shard_t from) { PGLogEntryHandler rollbacker; pg_log.merge_log( @@ -367,8 +373,9 @@ void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead) * TODO: if the missing set becomes very large, this could get expensive. * Instead, we probably want to just iterate over our unfound set. */ -bool PG::search_for_missing(const pg_info_t &oinfo, const pg_missing_t *omissing, - int fromosd) +bool PG::search_for_missing( + const pg_info_t &oinfo, const pg_missing_t *omissing, + pg_shard_t fromosd) { bool stats_updated = false; bool found_missing = false; @@ -418,7 +425,7 @@ bool PG::search_for_missing(const pg_info_t &oinfo, const pg_missing_t *omissing dout(10) << "search_for_missing " << soid << " " << need << " is on osd." 
<< fromosd << dendl; - map >::iterator ml = missing_loc.find(soid); + map >::iterator ml = missing_loc.find(soid); if (ml == missing_loc.end()) { map >::iterator wmo = waiting_for_missing_object.find(soid); @@ -443,7 +450,7 @@ bool PG::search_for_missing(const pg_info_t &oinfo, const pg_missing_t *omissing return found_missing; } -void PG::discover_all_missing(map< int, map > &query_map) +void PG::discover_all_missing(map > &query_map) { const pg_missing_t &missing = pg_log.get_missing(); assert(missing.have_missing()); @@ -453,17 +460,17 @@ void PG::discover_all_missing(map< int, map > &query_map) << get_num_unfound() << " unfound" << dendl; - std::set::const_iterator m = might_have_unfound.begin(); - std::set::const_iterator mend = might_have_unfound.end(); + std::set::const_iterator m = might_have_unfound.begin(); + std::set::const_iterator mend = might_have_unfound.end(); for (; m != mend; ++m) { - int peer(*m); + pg_shard_t peer(*m); - if (!get_osdmap()->is_up(peer)) { + if (!get_osdmap()->is_up(peer.osd)) { dout(20) << __func__ << " skipping down osd." << peer << dendl; continue; } - map::const_iterator iter = peer_info.find(peer); + map::const_iterator iter = peer_info.find(peer); if (iter != peer_info.end() && (iter->second.is_empty() || iter->second.dne())) { // ignore empty peers @@ -493,8 +500,11 @@ void PG::discover_all_missing(map< int, map > &query_map) dout(10) << __func__ << ": osd." 
<< peer << ": requesting pg_missing_t" << dendl; peer_missing_requested.insert(peer); - query_map[peer][info.pgid] = - pg_query_t(pg_query_t::MISSING, info.history, get_osdmap()->get_epoch()); + query_map[peer.osd][spg_t(info.pgid.pgid, peer.shard)] = + pg_query_t( + pg_query_t::MISSING, + peer.shard, pg_whoami.shard, + info.history, get_osdmap()->get_epoch()); } } @@ -513,13 +523,13 @@ bool PG::needs_recovery() const } assert(actingbackfill.size() > 0); - vector::const_iterator end = actingbackfill.end(); - vector::const_iterator a = actingbackfill.begin(); + set::const_iterator end = actingbackfill.end(); + set::const_iterator a = actingbackfill.begin(); assert(a != end); - ++a; for (; a != end; ++a) { - int peer = *a; - map::const_iterator pm = peer_missing.find(peer); + if (*a == get_primary()) continue; + pg_shard_t peer = *a; + map::const_iterator pm = peer_missing.find(peer); if (pm == peer_missing.end()) { dout(10) << __func__ << " osd." << peer << " don't have missing set" << dendl; ret = true; @@ -544,11 +554,11 @@ bool PG::needs_backfill() const // We can assume that only possible osds that need backfill // are on the backfill_targets vector nodes. - vector::const_iterator end = backfill_targets.end(); - vector::const_iterator a = backfill_targets.begin(); + set::const_iterator end = backfill_targets.end(); + set::const_iterator a = backfill_targets.begin(); for (; a != end; ++a) { - int peer = *a; - map::const_iterator pi = peer_info.find(peer); + pg_shard_t peer = *a; + map::const_iterator pi = peer_info.find(peer); if (!pi->second.last_backfill.is_max()) { dout(10) << __func__ << " osd." 
<< peer << " has last_backfill " << pi->second.last_backfill << dendl; ret = true; @@ -599,7 +609,7 @@ void PG::generate_past_intervals() vector acting, up, old_acting, old_up; cur_map = osd->get_map(cur_epoch); - cur_map->pg_to_up_acting_osds(get_pgid(), up, acting); + cur_map->pg_to_up_acting_osds(get_pgid().pgid, up, acting); epoch_t same_interval_since = cur_epoch; dout(10) << __func__ << " over epochs " << cur_epoch << "-" << end_epoch << dendl; @@ -610,7 +620,7 @@ void PG::generate_past_intervals() old_acting.swap(acting); cur_map = osd->get_map(cur_epoch); - cur_map->pg_to_up_acting_osds(get_pgid(), up, acting); + cur_map->pg_to_up_acting_osds(get_pgid().pgid, up, acting); std::stringstream debug; bool new_interval = pg_interval_t::check_new_interval( @@ -623,7 +633,7 @@ void PG::generate_past_intervals() cur_map, last_map, info.pgid.pool(), - info.pgid, + info.pgid.pgid, &past_intervals, &debug); if (new_interval) { @@ -672,9 +682,9 @@ void PG::remove_down_peer_info(const OSDMapRef osdmap) { // Remove any downed osds from peer_info bool removed = false; - map::iterator p = peer_info.begin(); + map::iterator p = peer_info.begin(); while (p != peer_info.end()) { - if (!osdmap->is_up(p->first)) { + if (!osdmap->is_up(p->first.osd)) { dout(10) << " dropping down osd." 
<< p->first << " info " << p->second << dendl; peer_missing.erase(p->first); peer_log_requested.erase(p->first); @@ -698,16 +708,16 @@ bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const { assert(is_primary()); - set::const_iterator peer = might_have_unfound.begin(); - set::const_iterator mend = might_have_unfound.end(); + set::const_iterator peer = might_have_unfound.begin(); + set::const_iterator mend = might_have_unfound.end(); for (; peer != mend; ++peer) { if (peer_missing.count(*peer)) continue; - map::const_iterator iter = peer_info.find(*peer); + map::const_iterator iter = peer_info.find(*peer); if (iter != peer_info.end() && (iter->second.is_empty() || iter->second.dne())) continue; - const osd_info_t &osd_info(osdmap->get_info(*peer)); + const osd_info_t &osd_info(osdmap->get_info(peer->osd)); if (osd_info.lost_at <= osd_info.up_from) { // If there is even one OSD in might_have_unfound that isn't lost, we // still might retrieve our unfound. @@ -723,18 +733,21 @@ void PG::build_prior(std::auto_ptr &prior_set) { if (1) { // sanity check - for (map::iterator it = peer_info.begin(); + for (map::iterator it = peer_info.begin(); it != peer_info.end(); ++it) { assert(info.history.last_epoch_started >= it->second.history.last_epoch_started); } } - prior_set.reset(new PriorSet(*get_osdmap(), - past_intervals, - up, - acting, - info, - this)); + prior_set.reset( + new PriorSet( + pool.info.ec_pool(), + *get_osdmap(), + past_intervals, + up, + acting, + info, + this)); PriorSet &prior(*prior_set.get()); if (prior.pg_down) { @@ -799,11 +812,12 @@ void PG::clear_primary_state() * 2) Prefer longer tail if it brings another info into contiguity * 3) Prefer current primary */ -map::const_iterator PG::find_best_info(const map &infos) const +map::const_iterator PG::find_best_info( + const map &infos) const { eversion_t min_last_update_acceptable = eversion_t::max(); epoch_t max_last_epoch_started_found = 0; - for (map::const_iterator i = infos.begin(); 
+ for (map::const_iterator i = infos.begin(); i != infos.end(); ++i) { if (max_last_epoch_started_found < i->second.last_epoch_started) { @@ -817,12 +831,12 @@ map::const_iterator PG::find_best_info(const map } assert(min_last_update_acceptable != eversion_t::max()); - map::const_iterator best = infos.end(); + map::const_iterator best = infos.end(); // find osd with newest last_update (oldest for ec_pool). // if there are multiples, prefer // - a longer tail, if it brings another peer into log contiguity // - the current primary - for (map::const_iterator p = infos.begin(); + for (map::const_iterator p = infos.begin(); p != infos.end(); ++p) { // Only consider peers with last_update >= min_last_update_acceptable @@ -852,7 +866,7 @@ map::const_iterator PG::find_best_info(const map } } // Prefer longer tail if it brings another peer into contiguity - for (map::const_iterator q = infos.begin(); + for (map::const_iterator q = infos.begin(); q != infos.end(); ++q) { if (q->second.is_incomplete()) @@ -870,7 +884,7 @@ map::const_iterator PG::find_best_info(const map } } // prefer current primary (usually the caller), all things being equal - if (p->first == acting[0]) { + if (p->first == pg_whoami) { dout(10) << "calc_acting prefer osd." << p->first << " because it is current primary" << dendl; best = p; @@ -887,20 +901,29 @@ map::const_iterator PG::find_best_info(const map * incomplete, or another osd has a longer tail that allows us to * bring other up nodes up to date. 
*/ -bool PG::calc_acting(int& newest_update_osd_id, vector& want, vector& backfill) const -{ - map all_info(peer_info.begin(), peer_info.end()); - all_info[osd->whoami] = info; - - for (map::iterator p = all_info.begin(); p != all_info.end(); ++p) { +bool PG::calc_acting( + pg_shard_t &auth_log_shard_id, + vector &want, + set &backfill) const +{ +// TODOSAM: fix +#if 0 + map all_info(peer_info.begin(), peer_info.end()); + all_info[pg_whoami] = info; + + for (map::iterator p = all_info.begin(); + p != all_info.end(); + ++p) { dout(10) << "calc_acting osd." << p->first << " " << p->second << dendl; } - map::const_iterator newest_update_osd = find_best_info(all_info); + map::const_iterator newest_update_osd = + find_best_info(all_info); if (newest_update_osd == all_info.end()) { if (up != acting) { - dout(10) << "calc_acting no suitable info found (incomplete backfills?), reverting to up" << dendl; + dout(10) << "calc_acting no suitable info found (incomplete backfills?)," + << " reverting to up" << dendl; want = up; return true; } else { @@ -915,19 +938,19 @@ bool PG::calc_acting(int& newest_update_osd_id, vector& want, vector& newest_update_osd_id = newest_update_osd->first; // select primary - map::const_iterator primary; + map::const_iterator primary; if (up.size() && - !all_info[up[0]].is_incomplete() && - all_info[up[0]].last_update >= newest_update_osd->second.log_tail) { - dout(10) << "up[0](osd." << up[0] << ") selected as primary" << dendl; - primary = all_info.find(up[0]); // prefer up[0], all thing being equal + !all_info[up_primary].is_incomplete() && + all_info[up_primary].last_update >= newest_update_osd->second.log_tail) { + dout(10) << "up_primary: " << up_primary << ") selected as primary" << dendl; + primary = all_info.find(up_primary); // prefer up[0], all thing being equal } else if (!newest_update_osd->second.is_incomplete()) { dout(10) << "up[0] needs backfill, osd." 
<< newest_update_osd_id << " selected as primary instead" << dendl; primary = newest_update_osd; } else { - map complete_infos; - for (map::iterator i = all_info.begin(); + map complete_infos; + for (map::iterator i = all_info.begin(); i != all_info.end(); ++i) { if (!i->second.is_incomplete()) @@ -1002,7 +1025,7 @@ bool PG::calc_acting(int& newest_update_osd_id, vector& want, vector& } } - for (map::const_iterator i = all_info.begin(); + for (map::const_iterator i = all_info.begin(); i != all_info.end(); ++i) { if (usable >= get_osdmap()->get_pg_size(info.pgid)) @@ -1027,6 +1050,7 @@ bool PG::calc_acting(int& newest_update_osd_id, vector& want, vector& } } +#endif return true; } @@ -1036,11 +1060,12 @@ bool PG::calc_acting(int& newest_update_osd_id, vector& want, vector& * calculate the desired acting, and request a change with the monitor * if it differs from the current acting. */ -bool PG::choose_acting(int& newest_update_osd) +bool PG::choose_acting(pg_shard_t &auth_log_shard) { - vector want, backfill; + vector want; + set backfill; - if (!calc_acting(newest_update_osd, want, backfill)) { + if (!calc_acting(auth_log_shard, want, backfill)) { dout(10) << "choose_acting failed" << dendl; assert(want_acting.empty()); return false; @@ -1049,6 +1074,8 @@ bool PG::choose_acting(int& newest_update_osd) // Determine if compatibility needed bool compat_mode = !cct->_conf->osd_debug_override_acting_compat; + //TODOSAM: fix +#if 0 if (compat_mode) { bool all_support = true; OSDMapRef osdmap = get_osdmap(); @@ -1069,9 +1096,12 @@ bool PG::choose_acting(int& newest_update_osd) if (all_support) compat_mode = false; } +#endif if (compat_mode && !backfill.empty()) { - backfill.resize(1); + pg_shard_t only_backfill(*backfill.begin()); + backfill.clear(); + backfill.insert(only_backfill); } // This might cause a problem if min_size is large @@ -1083,8 +1113,8 @@ bool PG::choose_acting(int& newest_update_osd) return false; } - if (compat_mode) { - want.insert(want.end(), 
backfill.begin(), backfill.end()); + if (compat_mode && backfill.size()) { + want.insert(want.end(), backfill.begin()->osd); } if (want != acting) { @@ -1097,30 +1127,42 @@ bool PG::choose_acting(int& newest_update_osd) // want is the same as crush map up OSDs. assert(compat_mode || backfill.empty()); vector empty; - osd->queue_want_pg_temp(info.pgid, empty); + osd->queue_want_pg_temp(info.pgid.pgid, empty); } else - osd->queue_want_pg_temp(info.pgid, want); + osd->queue_want_pg_temp(info.pgid.pgid, want); return false; } want_acting.clear(); // We can only get here when new interval has arrived and // we've accepted the acting set. Now we can create // actingbackfill and backfill_targets vectors. - actingbackfill = acting; + actingbackfill.clear(); + for (unsigned i = 0; i < acting.size(); ++i) { + if (acting[i] != -1) { + actingbackfill.insert( + pg_shard_t( + acting[i], + pool.info.ec_pool() ? i : ghobject_t::NO_SHARD)); + } + } if (!compat_mode) - actingbackfill.insert(actingbackfill.end(), backfill.begin(), backfill.end()); + actingbackfill.insert(backfill.begin(), backfill.end()); assert(backfill_targets.empty() || backfill_targets == backfill); if (backfill_targets.empty()) { backfill_targets = backfill; - for (unsigned i = 0; i < backfill.size() ; ++i) { - stray_set.erase(backfill[i]); + for (set::iterator i = backfill.begin(); + i != backfill.end(); + ++i) { + stray_set.erase(*i); } } else { // Will not change if already set because up would have had to change assert(backfill_targets == backfill); // Verify that nothing in backfill is in stray_set - for (unsigned i = 0; i < backfill.size() ; ++i) { - assert(stray_set.find(backfill[i]) == stray_set.end()); + for (set::iterator i = backfill.begin(); + i != backfill.end(); + ++i) { + assert(stray_set.find(*i) == stray_set.end()); } } dout(10) << "choose_acting want " << want << " (== acting) backfill_targets " @@ -1160,16 +1202,20 @@ void PG::build_might_have_unfound() if (!interval.maybe_went_rw) continue; + 
int i = 0; std::vector::const_iterator a = interval.acting.begin(); std::vector::const_iterator a_end = interval.acting.end(); - for (; a != a_end; ++a) { + for (; a != a_end; ++a, ++i) { if (*a != osd->whoami) - might_have_unfound.insert(*a); + might_have_unfound.insert( + pg_shard_t( + *a, + pool.info.ec_pool() ? i : ghobject_t::NO_SHARD)); } } // include any (stray) peers - for (map::iterator p = peer_info.begin(); + for (map::iterator p = peer_info.begin(); p != peer_info.end(); ++p) might_have_unfound.insert(p->first); @@ -1190,8 +1236,11 @@ struct C_PG_ActivateCommitted : public Context { void PG::activate(ObjectStore::Transaction& t, epoch_t query_epoch, list& tfin, - map< int, map >& query_map, - map > > *activator_map) + map >& query_map, + map > > *activator_map) { assert(!is_active()); assert(scrubber.callbacks.empty()); @@ -1209,7 +1258,8 @@ void PG::activate(ObjectStore::Transaction& t, // TODOSAM: osd->osd-> is no good osd->osd->replay_queue_lock.Lock(); - osd->osd->replay_queue.push_back(pair(info.pgid, replay_until)); + osd->osd->replay_queue.push_back(pair( + info.pgid, replay_until)); osd->osd->replay_queue_lock.Unlock(); } @@ -1286,8 +1336,11 @@ void PG::activate(ObjectStore::Transaction& t, // start up replicas assert(actingbackfill.size() > 0); - for (unsigned i=1; i::iterator i = actingbackfill.begin(); + i != actingbackfill.end(); + ++i) { + if (*i == pg_whoami) continue; + pg_shard_t peer = *i; assert(peer_info.count(peer)); pg_info_t& pi = peer_info[peer]; @@ -1302,16 +1355,19 @@ void PG::activate(ObjectStore::Transaction& t, // empty log if (!pi.is_empty() && activator_map) { dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl; - (*activator_map)[peer].push_back( + (*activator_map)[peer.osd].push_back( make_pair( pg_notify_t( + peer.shard, pg_whoami.shard, get_osdmap()->get_epoch(), get_osdmap()->get_epoch(), info), past_intervals)); } else { dout(10) << "activate peer osd." 
<< peer << " is up to date, but sending pg_log anyway" << dendl; - m = new MOSDPGLog(get_osdmap()->get_epoch(), info); + m = new MOSDPGLog( + i->shard, pg_whoami.shard, + get_osdmap()->get_epoch(), info); } } else if (pg_log.get_tail() > pi.last_update || pi.last_backfill == hobject_t()) { // backfill @@ -1325,7 +1381,9 @@ void PG::activate(ObjectStore::Transaction& t, pi.history = info.history; pi.stats.stats.clear(); - m = new MOSDPGLog(get_osdmap()->get_epoch(), pi); + m = new MOSDPGLog( + i->shard, pg_whoami.shard, + get_osdmap()->get_epoch(), pi); // send some recent log, so that op dup detection works well. m->log.copy_up_to(pg_log.get_log(), cct->_conf->osd_min_pg_log_entries); @@ -1336,7 +1394,9 @@ void PG::activate(ObjectStore::Transaction& t, } else { // catch up assert(pg_log.get_tail() <= pi.last_update); - m = new MOSDPGLog(get_osdmap()->get_epoch(), info); + m = new MOSDPGLog( + i->shard, pg_whoami.shard, + get_osdmap()->get_epoch(), info); // send new stuff to append to replicas log m->log.copy_after(pg_log.get_log(), pi.last_update); } @@ -1359,7 +1419,7 @@ void PG::activate(ObjectStore::Transaction& t, if (m) { dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl; //m->log.print(cout); - osd->send_message_osd_cluster(peer, m, get_osdmap()->get_epoch()); + osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch()); } // peer now has @@ -1375,7 +1435,7 @@ void PG::activate(ObjectStore::Transaction& t, } // degraded? - if (get_osdmap()->get_pg_size(info.pgid) > acting.size()) + if (get_osdmap()->get_pg_size(info.pgid.pgid) > acting.size()) state_set(PG_STATE_DEGRADED); // all clean? 
@@ -1520,7 +1580,7 @@ void PG::_activate_committed(epoch_t e) if (pg_has_reset_since(e)) { dout(10) << "_activate_committed " << e << ", that was an old interval" << dendl; } else if (is_primary()) { - peer_activated.insert(osd->whoami); + peer_activated.insert(pg_whoami); dout(10) << "_activate_committed " << e << " peer_activated now " << peer_activated << " last_epoch_started " << info.history.last_epoch_started << " same_interval_since " << info.history.same_interval_since << dendl; @@ -1530,12 +1590,14 @@ void PG::_activate_committed(epoch_t e) } else { dout(10) << "_activate_committed " << e << " telling primary" << dendl; MOSDPGInfo *m = new MOSDPGInfo(e); - pg_notify_t i = pg_notify_t(get_osdmap()->get_epoch(), - get_osdmap()->get_epoch(), - info); + pg_notify_t i = pg_notify_t( + get_primary().shard, pg_whoami.shard, + get_osdmap()->get_epoch(), + get_osdmap()->get_epoch(), + info); i.info.history.last_epoch_started = e; m->pg_list.push_back(make_pair(i, pg_interval_map_t())); - osd->send_message_osd_cluster(acting[0], m, get_osdmap()->get_epoch()); + osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap()->get_epoch()); } if (dirty_info) { @@ -1614,7 +1676,7 @@ void PG::mark_clean() { // only mark CLEAN if we have the desired number of replicas AND we // are not remapped. 
- if (acting.size() == get_osdmap()->get_pg_size(info.pgid) && + if (acting.size() == get_osdmap()->get_pg_size(info.pgid.pgid) && up == acting) state_set(PG_STATE_CLEAN); @@ -1741,7 +1803,7 @@ static void split_replay_queue( } void PG::split_ops(PG *child, unsigned split_bits) { - unsigned match = child->info.pgid.m_seed; + unsigned match = child->info.pgid.ps(); assert(waiting_for_all_missing.empty()); assert(waiting_for_cache_not_full.empty()); assert(waiting_for_missing_object.empty()); @@ -1795,7 +1857,15 @@ void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits) child->snap_trimq = snap_trimq; // There can't be recovery/backfill going on now - get_osdmap()->pg_to_up_acting_osds(child->info.pgid, child->up, child->acting); + int primary, up_primary; + vector newup, newacting; + get_osdmap()->pg_to_up_acting_osds( + child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary); + child->init_primary_up_acting( + newup, + newacting, + up_primary, + primary); child->role = OSDMap::calc_pg_role(osd->whoami, child->acting); if (get_primary() != child->get_primary()) child->info.history.same_primary_since = get_osdmap()->get_epoch(); @@ -1846,17 +1916,18 @@ void PG::purge_strays() dout(10) << "purge_strays " << stray_set << dendl; bool removed = false; - for (set::iterator p = stray_set.begin(); + for (set::iterator p = stray_set.begin(); p != stray_set.end(); ++p) { - if (get_osdmap()->is_up(*p)) { + assert(!is_actingbackfill(*p)); + if (get_osdmap()->is_up(p->osd)) { dout(10) << "sending PGRemove to osd." << *p << dendl; - vector to_remove; - to_remove.push_back(info.pgid); + vector to_remove; + to_remove.push_back(spg_t(info.pgid.pgid, p->shard)); MOSDPGRemove *m = new MOSDPGRemove( get_osdmap()->get_epoch(), to_remove); - osd->send_message_osd_cluster(*p, m, get_osdmap()->get_epoch()); + osd->send_message_osd_cluster(p->osd, m, get_osdmap()->get_epoch()); stray_purged.insert(*p); } else { dout(10) << "not sending PGRemove to down osd." 
<< *p << dendl; @@ -1878,10 +1949,15 @@ void PG::purge_strays() peer_missing_requested.clear(); } -void PG::set_probe_targets(const set &probe_set) +void PG::set_probe_targets(const set &probe_set) { Mutex::Locker l(heartbeat_peer_lock); - probe_targets = probe_set; + probe_targets.clear(); + for (set::iterator i = probe_set.begin(); + i != probe_set.end(); + ++i) { + probe_targets.insert(i->osd); + } } void PG::clear_probe_targets() @@ -1900,8 +1976,10 @@ void PG::update_heartbeat_peers() new_peers.insert(acting[i]); for (unsigned i=0; i::iterator p = peer_info.begin(); p != peer_info.end(); ++p) - new_peers.insert(p->first); + for (map::iterator p = peer_info.begin(); + p != peer_info.end(); + ++p) + new_peers.insert(p->first.osd); } bool need_update = false; @@ -1937,7 +2015,8 @@ void PG::_update_calc_stats() info.stats.ondisk_log_start = pg_log.get_tail(); // calc copies, degraded - unsigned target = MAX(get_osdmap()->get_pg_size(info.pgid), actingbackfill.size()); + unsigned target = MAX( + get_osdmap()->get_pg_size(info.pgid.pgid), actingbackfill.size()); info.stats.stats.calc_copies(target); info.stats.stats.sum.num_objects_degraded = 0; if ((is_degraded() || !is_clean()) && is_active()) { @@ -1957,14 +2036,17 @@ void PG::_update_calc_stats() degraded += pg_log.get_missing().num_missing(); assert(actingbackfill.size() > 0); - for (unsigned i=1; i::iterator i = actingbackfill.begin(); + i != actingbackfill.end(); + ++i) { + if (*i == pg_whoami) continue; + assert(peer_missing.count(*i)); // in missing set - degraded += peer_missing[actingbackfill[i]].num_missing(); + degraded += peer_missing[*i].num_missing(); // not yet backfilled - degraded += num_objects - peer_info[actingbackfill[i]].stats.stats.sum.num_objects; + degraded += num_objects - peer_info[*i].stats.stats.sum.num_objects; } info.stats.stats.sum.num_objects_degraded = degraded; info.stats.stats.sum.num_objects_unfound = get_num_unfound(); @@ -2041,11 +2123,14 @@ void PG::clear_publish_stats() * 
@param backfill true if info should be marked as backfill * @param t transaction to write out our new state in */ -void PG::init(int role, vector& newup, vector& newacting, - pg_history_t& history, - pg_interval_map_t& pi, - bool backfill, - ObjectStore::Transaction *t) +void PG::init( + int role, + vector& newup, int new_up_primary, + vector& newacting, int new_acting_primary, + pg_history_t& history, + pg_interval_map_t& pi, + bool backfill, + ObjectStore::Transaction *t) { dout(10) << "init role " << role << " up " << newup << " acting " << newacting << " history " << history @@ -2055,6 +2140,11 @@ void PG::init(int role, vector& newup, vector& newacting, set_role(role); acting = newacting; up = newup; + init_primary_up_acting( + newup, + newacting, + new_up_primary, + new_acting_primary); info.history = history; past_intervals.swap(pi); @@ -2272,7 +2362,7 @@ void PG::write_info(ObjectStore::Transaction& t) epoch_t PG::peek_map_epoch(ObjectStore *store, coll_t coll, hobject_t &infos_oid, bufferlist *bl) { assert(bl); - pg_t pgid; + spg_t pgid; snapid_t snap; bool ok = coll.is_pg(pgid, snap); assert(ok); @@ -2316,11 +2406,18 @@ void PG::trim_peers() dout(10) << "trim_peers " << pg_trim_to << dendl; if (pg_trim_to != eversion_t()) { assert(actingbackfill.size() > 0); - for (unsigned i=1; isend_message_osd_cluster(actingbackfill[i], - new MOSDPGTrim(get_osdmap()->get_epoch(), info.pgid, - pg_trim_to), - get_osdmap()->get_epoch()); + for (set::iterator i = actingbackfill.begin(); + i != actingbackfill.end(); + ++i) { + if (*i == pg_whoami) continue; + osd->send_message_osd_cluster( + i->osd, + new MOSDPGTrim( + get_osdmap()->get_epoch(), + spg_t(info.pgid.pgid, i->shard), + pg_trim_to), + get_osdmap()->get_epoch()); + } } } @@ -2395,8 +2492,9 @@ std::string PG::get_corrupt_pg_log_name() const dout(0) << "strftime failed" << dendl; return "corrupt_log_unknown_time"; } - info.pgid.print(buf + ret, MAX_BUF - ret); - return buf; + string out(buf); + out += 
stringify(info.pgid); + return out; } int PG::read_info( @@ -2679,7 +2777,7 @@ bool PG::sched_scrub() if (osd->inc_scrubs_pending()) { dout(20) << "sched_scrub: reserved locally, reserving replicas" << dendl; scrubber.reserved = true; - scrubber.reserved_peers.insert(osd->whoami); + scrubber.reserved_peers.insert(pg_whoami); scrub_reserve_replicas(); } else { dout(20) << "sched_scrub: failed to reserve locally" << dendl; @@ -2737,28 +2835,28 @@ void PG::sub_op_scrub_map(OpRequestRef op) op->mark_started(); - int from = m->get_source().num(); - - dout(10) << " got osd." << from << " scrub map" << dendl; + dout(10) << " got " << m->from << " scrub map" << dendl; bufferlist::iterator p = m->get_data().begin(); if (scrubber.is_chunky) { // chunky scrub - scrubber.received_maps[from].decode(p, info.pgid.pool()); - dout(10) << "map version is " << scrubber.received_maps[from].valid_through << dendl; + scrubber.received_maps[m->from].decode(p, info.pgid.pool()); + dout(10) << "map version is " + << scrubber.received_maps[m->from].valid_through + << dendl; } else { // classic scrub - if (scrubber.received_maps.count(from)) { + if (scrubber.received_maps.count(m->from)) { ScrubMap incoming; incoming.decode(p, info.pgid.pool()); - dout(10) << "from replica " << from << dendl; + dout(10) << "from replica " << m->from << dendl; dout(10) << "map version is " << incoming.valid_through << dendl; - scrubber.received_maps[from].merge_incr(incoming); + scrubber.received_maps[m->from].merge_incr(incoming); } else { - scrubber.received_maps[from].decode(p, info.pgid.pool()); + scrubber.received_maps[m->from].decode(p, info.pgid.pool()); } } --scrubber.waiting_on; - scrubber.waiting_on_whom.erase(from); + scrubber.waiting_on_whom.erase(m->from); if (scrubber.waiting_on == 0) { if (scrubber.is_chunky) { // chunky scrub @@ -2772,7 +2870,7 @@ void PG::sub_op_scrub_map(OpRequestRef op) scrubber.finalizing = true; scrub_gather_replica_maps(); ++scrubber.waiting_on; - 
scrubber.waiting_on_whom.insert(osd->whoami); + scrubber.waiting_on_whom.insert(pg_whoami); osd->scrub_wq.queue(this); } } @@ -2781,27 +2879,33 @@ void PG::sub_op_scrub_map(OpRequestRef op) } // send scrub v2-compatible messages (classic scrub) -void PG::_request_scrub_map_classic(int replica, eversion_t version) +void PG::_request_scrub_map_classic(pg_shard_t replica, eversion_t version) { - assert(replica != osd->whoami); + assert(replica != pg_whoami); dout(10) << "scrub requesting scrubmap from osd." << replica << dendl; - MOSDRepScrub *repscrubop = new MOSDRepScrub(info.pgid, version, - last_update_applied, - get_osdmap()->get_epoch()); - osd->send_message_osd_cluster(replica, repscrubop, get_osdmap()->get_epoch()); + MOSDRepScrub *repscrubop = + new MOSDRepScrub( + spg_t(info.pgid.pgid, replica.shard), version, + last_update_applied, + get_osdmap()->get_epoch()); + osd->send_message_osd_cluster( + replica.osd, repscrubop, get_osdmap()->get_epoch()); } // send scrub v3 messages (chunky scrub) -void PG::_request_scrub_map(int replica, eversion_t version, - hobject_t start, hobject_t end, - bool deep) +void PG::_request_scrub_map( + pg_shard_t replica, eversion_t version, + hobject_t start, hobject_t end, + bool deep) { - assert(replica != osd->whoami); + assert(replica != pg_whoami); dout(10) << "scrub requesting scrubmap from osd." 
<< replica << dendl; - MOSDRepScrub *repscrubop = new MOSDRepScrub(info.pgid, version, - get_osdmap()->get_epoch(), - start, end, deep); - osd->send_message_osd_cluster(replica, repscrubop, get_osdmap()->get_epoch()); + MOSDRepScrub *repscrubop = new MOSDRepScrub( + spg_t(info.pgid.pgid, replica.shard), version, + get_osdmap()->get_epoch(), + start, end, deep); + osd->send_message_osd_cluster( + replica.osd, repscrubop, get_osdmap()->get_epoch()); } void PG::sub_op_scrub_reserve(OpRequestRef op) @@ -2819,7 +2923,8 @@ void PG::sub_op_scrub_reserve(OpRequestRef op) scrubber.reserved = osd->inc_scrubs_pending(); - MOSDSubOpReply *reply = new MOSDSubOpReply(m, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK); + MOSDSubOpReply *reply = new MOSDSubOpReply( + m, pg_whoami, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK); ::encode(scrubber.reserved, reply->get_data()); osd->send_message_osd_cluster(reply, m->get_connection()); } @@ -2837,7 +2942,7 @@ void PG::sub_op_scrub_reserve_reply(OpRequestRef op) op->mark_started(); - int from = reply->get_source().num(); + pg_shard_t from = reply->from; bufferlist::iterator p = reply->get_data().begin(); bool reserved; ::decode(reserved, p); @@ -2878,17 +2983,18 @@ void PG::sub_op_scrub_stop(OpRequestRef op) // see comment in sub_op_scrub_reserve scrubber.reserved = false; - MOSDSubOpReply *reply = new MOSDSubOpReply(m, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK); + MOSDSubOpReply *reply = new MOSDSubOpReply( + m, pg_whoami, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK); osd->send_message_osd_cluster(reply, m->get_connection()); } void PG::reject_reservation() { osd->send_message_osd_cluster( - acting[0], + primary.osd, new MBackfillReserve( MBackfillReserve::REJECT, - info.pgid, + spg_t(info.pgid.pgid, primary.shard), get_osdmap()->get_epoch()), get_osdmap()->get_epoch()); } @@ -2918,34 +3024,43 @@ void PG::clear_scrub_reserved() void PG::scrub_reserve_replicas() { assert(backfill_targets.empty()); - for (unsigned i=1; 
i::iterator i = actingbackfill.begin(); + i != actingbackfill.end(); + ++i) { + if (*i == pg_whoami) continue; + dout(10) << "scrub requesting reserve from osd." << *i << dendl; vector scrub(1); scrub[0].op.op = CEPH_OSD_OP_SCRUB_RESERVE; hobject_t poid; eversion_t v; osd_reqid_t reqid; - MOSDSubOp *subop = new MOSDSubOp(reqid, info.pgid, poid, false, 0, - get_osdmap()->get_epoch(), osd->get_tid(), v); + MOSDSubOp *subop = new MOSDSubOp( + reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, false, 0, + get_osdmap()->get_epoch(), osd->get_tid(), v); subop->ops = scrub; - osd->send_message_osd_cluster(acting[i], subop, get_osdmap()->get_epoch()); + osd->send_message_osd_cluster( + i->osd, subop, get_osdmap()->get_epoch()); } } void PG::scrub_unreserve_replicas() { assert(backfill_targets.empty()); - for (unsigned i=1; i::iterator i = actingbackfill.begin(); + i != actingbackfill.end(); + ++i) { + if (*i == pg_whoami) continue; + dout(10) << "scrub requesting unreserve from osd." << *i << dendl; vector scrub(1); scrub[0].op.op = CEPH_OSD_OP_SCRUB_UNRESERVE; hobject_t poid; eversion_t v; osd_reqid_t reqid; - MOSDSubOp *subop = new MOSDSubOp(reqid, info.pgid, poid, false, 0, - get_osdmap()->get_epoch(), osd->get_tid(), v); + MOSDSubOp *subop = new MOSDSubOp( + reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, false, 0, + get_osdmap()->get_epoch(), osd->get_tid(), v); subop->ops = scrub; - osd->send_message_osd_cluster(acting[i], subop, get_osdmap()->get_epoch()); + osd->send_message_osd_cluster(i->osd, subop, get_osdmap()->get_epoch()); } } @@ -3127,14 +3242,17 @@ void PG::build_inc_scrub_map( osd->store->collection_getattrs(coll, map.attrs); } -void PG::repair_object(const hobject_t& soid, ScrubMap::object *po, int bad_peer, int ok_peer) +void PG::repair_object( + const hobject_t& soid, ScrubMap::object *po, + pg_shard_t bad_peer, pg_shard_t ok_peer) { - dout(10) << "repair_object " << soid << " bad_peer osd." << bad_peer << " ok_peer osd." 
<< ok_peer << dendl; + dout(10) << "repair_object " << soid << " bad_peer osd." + << bad_peer << " ok_peer osd." << ok_peer << dendl; eversion_t v; bufferlist bv; bv.push_back(po->attrs[OI_ATTR]); object_info_t oi(bv); - if (bad_peer != acting[0]) { + if (bad_peer != primary) { peer_missing[bad_peer].add(soid, oi.version, eversion_t()); } else { // We should only be scrubbing if the PG is clean. @@ -3237,8 +3355,16 @@ void PG::replica_scrub( hobject_t poid; eversion_t v; osd_reqid_t reqid; - MOSDSubOp *subop = new MOSDSubOp(reqid, info.pgid, poid, false, 0, - msg->map_epoch, osd->get_tid(), v); + MOSDSubOp *subop = new MOSDSubOp( + reqid, + pg_whoami, + spg_t(info.pgid.pgid, get_primary().shard), + poid, + false, + 0, + msg->map_epoch, + osd->get_tid(), + v); ::encode(map, subop->get_data()); subop->ops = scrub; @@ -3367,11 +3493,16 @@ void PG::classic_scrub(ThreadPool::TPHandle &handle) * last_update_applied == info.last_update) */ scrubber.waiting_on = acting.size(); - scrubber.waiting_on_whom.insert(acting.begin(), acting.end()); + scrubber.waiting_on_whom.insert( + actingbackfill.begin(), actingbackfill.end()); + scrubber.waiting_on_whom.erase(pg_whoami); // request maps from replicas - for (unsigned i=1; i::iterator i = actingbackfill.begin(); + i != actingbackfill.end(); + ++i) { + if (*i == pg_whoami) continue; + _request_scrub_map_classic(*i, eversion_t()); } // Unlocks and relocks... 
@@ -3386,7 +3517,7 @@ void PG::classic_scrub(ThreadPool::TPHandle &handle) } --scrubber.waiting_on; - scrubber.waiting_on_whom.erase(osd->whoami); + scrubber.waiting_on_whom.erase(pg_whoami); if (scrubber.waiting_on == 0) { // the replicas have completed their scrub map, so lock out writes @@ -3406,7 +3537,7 @@ void PG::classic_scrub(ThreadPool::TPHandle &handle) // request incrementals from replicas scrub_gather_replica_maps(); ++scrubber.waiting_on; - scrubber.waiting_on_whom.insert(osd->whoami); + scrubber.waiting_on_whom.insert(pg_whoami); } dout(10) << "clean up scrub" << dendl; @@ -3428,7 +3559,7 @@ void PG::classic_scrub(ThreadPool::TPHandle &handle) } --scrubber.waiting_on; - scrubber.waiting_on_whom.erase(osd->whoami); + scrubber.waiting_on_whom.erase(pg_whoami); if (scrubber.waiting_on == 0) { assert(last_update_applied == info.last_update); osd->scrub_finalize_wq.queue(this); @@ -3603,14 +3734,17 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle) } // ask replicas to wait until last_update_applied >= scrubber.subset_last_update and then scan - scrubber.waiting_on_whom.insert(osd->whoami); + scrubber.waiting_on_whom.insert(pg_whoami); ++scrubber.waiting_on; // request maps from replicas - for (unsigned i=1; i::iterator i = actingbackfill.begin(); + i != actingbackfill.end(); + ++i) { + if (*i == pg_whoami) continue; + _request_scrub_map(*i, scrubber.subset_last_update, scrubber.start, scrubber.end, scrubber.deep); - scrubber.waiting_on_whom.insert(acting[i]); + scrubber.waiting_on_whom.insert(*i); ++scrubber.waiting_on; } @@ -3653,7 +3787,7 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle) } --scrubber.waiting_on; - scrubber.waiting_on_whom.erase(osd->whoami); + scrubber.waiting_on_whom.erase(pg_whoami); scrubber.state = PG::Scrubber::WAIT_REPLICAS; break; @@ -3735,7 +3869,7 @@ bool PG::scrub_gather_replica_maps() assert(scrubber.waiting_on == 0); assert(_lock.is_locked()); - for (map::iterator p = scrubber.received_maps.begin(); + for 
(map::iterator p = scrubber.received_maps.begin(); p != scrubber.received_maps.end(); ++p) { @@ -3754,8 +3888,6 @@ bool PG::scrub_gather_replica_maps() } } - - void PG::scrub_compare_maps() { dout(10) << "scrub_compare_maps has maps, analyzing" << dendl; @@ -3769,16 +3901,21 @@ void PG::scrub_compare_maps() stringstream ss; // Map from object with errors to good peer - map authoritative; - map maps; + map authoritative; + map maps; dout(2) << "scrub osd." << acting[0] << " has " << scrubber.primary_scrubmap.objects.size() << " items" << dendl; - maps[0] = &scrubber.primary_scrubmap; - for (unsigned i=1; i::iterator i = actingbackfill.begin(); + i != actingbackfill.end(); + ++i) { + if (*i == pg_whoami) continue; + dout(2) << "scrub replica " << *i << " has " + << scrubber.received_maps[*i].objects.size() + << " items" << dendl; + maps[*i] = &scrubber.received_maps[*i]; } get_pgbackend()->be_compare_scrubmaps( @@ -3797,7 +3934,7 @@ void PG::scrub_compare_maps() osd->clog.error(ss); } - for (map::iterator i = authoritative.begin(); + for (map::iterator i = authoritative.begin(); i != authoritative.end(); ++i) { scrubber.authoritative.insert( @@ -3806,7 +3943,7 @@ void PG::scrub_compare_maps() make_pair(maps[i->second]->objects[i->first], i->second))); } - for (map::iterator i = authoritative.begin(); + for (map::iterator i = authoritative.begin(); i != authoritative.end(); ++i) { authmap.objects.erase(i->first); @@ -3827,11 +3964,11 @@ void PG::scrub_process_inconsistent() if (!scrubber.authoritative.empty() || !scrubber.inconsistent.empty()) { stringstream ss; - for (map >::iterator obj = + for (map >::iterator obj = scrubber.inconsistent_snapcolls.begin(); obj != scrubber.inconsistent_snapcolls.end(); ++obj) { - for (set::iterator j = obj->second.begin(); + for (set::iterator j = obj->second.begin(); j != obj->second.end(); ++j) { ++scrubber.shallow_errors; @@ -3840,26 +3977,28 @@ void PG::scrub_process_inconsistent() } } - ss << info.pgid << " " << mode << " " << 
scrubber.missing.size() << " missing, " + ss << info.pgid << " " << mode << " " + << scrubber.missing.size() << " missing, " << scrubber.inconsistent.size() << " inconsistent objects\n"; dout(2) << ss.str() << dendl; osd->clog.error(ss); if (repair) { state_clear(PG_STATE_CLEAN); - for (map >::iterator i = + for (map >::iterator i = scrubber.authoritative.begin(); i != scrubber.authoritative.end(); ++i) { - set::iterator j; + set::iterator j; if (scrubber.missing.count(i->first)) { for (j = scrubber.missing[i->first].begin(); j != scrubber.missing[i->first].end(); ++j) { - repair_object(i->first, + repair_object( + i->first, &(i->second.first), - acting[*j], - acting[i->second.second]); + *j, + i->second.second); ++scrubber.fixed; } } @@ -3869,8 +4008,8 @@ void PG::scrub_process_inconsistent() ++j) { repair_object(i->first, &(i->second.first), - acting[*j], - acting[i->second.second]); + *j, + i->second.second); ++scrubber.fixed; } } @@ -4012,21 +4151,25 @@ void PG::share_pg_info() // share new pg_info_t with replicas assert(actingbackfill.size() > 0); - for (unsigned i=1; i::iterator i = actingbackfill.begin(); + i != actingbackfill.end(); + ++i) { + if (*i == pg_whoami) continue; + pg_shard_t peer = *i; + if (peer_info.count(peer)) { + peer_info[peer].last_epoch_started = info.last_epoch_started; + peer_info[peer].history.merge(info.history); } MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch()); m->pg_list.push_back( make_pair( pg_notify_t( + peer.shard, pg_whoami.shard, get_osdmap()->get_epoch(), get_osdmap()->get_epoch(), info), pg_interval_map_t())); - osd->send_message_osd_cluster(peer, m, get_osdmap()->get_epoch()); + osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch()); } } @@ -4040,15 +4183,19 @@ void PG::share_pg_log() dout(10) << __func__ << dendl; assert(is_primary()); - vector::const_iterator a = actingbackfill.begin(); + set::const_iterator a = actingbackfill.begin(); assert(a != actingbackfill.end()); - vector::const_iterator 
end = actingbackfill.end(); - while (++a != end) { - int peer(*a); + set::const_iterator end = actingbackfill.end(); + while (a != end) { + pg_shard_t peer(*a); + ++a; + if (peer == pg_whoami) continue; pg_missing_t& pmissing(peer_missing[peer]); pg_info_t& pinfo(peer_info[peer]); - MOSDPGLog *m = new MOSDPGLog(info.last_update.epoch, info); + MOSDPGLog *m = new MOSDPGLog( + peer.shard, pg_whoami.shard, + info.last_update.epoch, info); m->log.copy_after(pg_log.get_log(), pinfo.last_update); for (list::const_iterator i = m->log.log.begin(); @@ -4058,7 +4205,7 @@ void PG::share_pg_log() } pinfo.last_update = m->log.head; - osd->send_message_osd_cluster(peer, m, get_osdmap()->get_epoch()); + osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch()); } } @@ -4069,11 +4216,11 @@ void PG::update_history_from_master(pg_history_t new_history) reg_next_scrub(); } -void PG::fulfill_info(int from, const pg_query_t &query, - pair &notify_info) +void PG::fulfill_info( + pg_shard_t from, const pg_query_t &query, + pair &notify_info) { - assert(!acting.empty()); - assert(from == acting[0]); + assert(from == primary); assert(query.type == pg_query_t::INFO); // info @@ -4081,14 +4228,17 @@ void PG::fulfill_info(int from, const pg_query_t &query, notify_info = make_pair(from, info); } -void PG::fulfill_log(int from, const pg_query_t &query, epoch_t query_epoch) +void PG::fulfill_log( + pg_shard_t from, const pg_query_t &query, epoch_t query_epoch) { - assert(!acting.empty()); - assert(from == acting[0]); + dout(10) << "log request from " << from << dendl; + assert(from == primary); assert(query.type != pg_query_t::INFO); - MOSDPGLog *mlog = new MOSDPGLog(get_osdmap()->get_epoch(), - info, query_epoch); + MOSDPGLog *mlog = new MOSDPGLog( + from.shard, pg_whoami.shard, + get_osdmap()->get_epoch(), + info, query_epoch); mlog->missing = pg_log.get_missing(); // primary -> other, when building master log @@ -4110,9 +4260,10 @@ void PG::fulfill_log(int from, const pg_query_t &query, 
epoch_t query_epoch) dout(10) << " sending " << mlog->log << " " << mlog->missing << dendl; - ConnectionRef con = osd->get_con_osd_cluster(from, get_osdmap()->get_epoch()); + ConnectionRef con = osd->get_con_osd_cluster( + from.osd, get_osdmap()->get_epoch()); if (con) { - osd->osd->_share_map_outgoing(from, con.get(), get_osdmap()); + osd->osd->_share_map_outgoing(from.osd, con.get(), get_osdmap()); osd->send_message_osd_cluster(mlog, con.get()); } else { mlog->put(); @@ -4260,10 +4411,11 @@ void PG::start_flush(ObjectStore::Transaction *t, } /* Called before initializing peering during advance_map */ -void PG::start_peering_interval(const OSDMapRef lastmap, - const vector& newup, - const vector& newacting, - ObjectStore::Transaction *t) +void PG::start_peering_interval( + const OSDMapRef lastmap, + const vector& newup, int new_up_primary, + const vector& newacting, int new_acting_primary, + ObjectStore::Transaction *t) { const OSDMapRef osdmap = get_osdmap(); @@ -4271,13 +4423,17 @@ void PG::start_peering_interval(const OSDMapRef lastmap, vector oldacting, oldup; int oldrole = get_role(); - int oldprimary = get_primary(); + + pg_shard_t oldprimary = get_primary(); bool was_old_primary = is_primary(); + acting.swap(oldacting); up.swap(oldup); - - up = newup; - acting = newacting; + init_primary_up_acting( + newup, + newacting, + new_up_primary, + new_acting_primary); if (info.stats.up != up || info.stats.acting != acting) { @@ -4294,7 +4450,10 @@ void PG::start_peering_interval(const OSDMapRef lastmap, state_clear(PG_STATE_REMAPPED); int role = osdmap->calc_pg_role(osd->whoami, acting, acting.size()); - set_role(role); + if (role == pg_whoami.shard) + set_role(role); + else + set_role(-1); // did acting, up, primary|acker change? 
if (!lastmap) { @@ -4311,7 +4470,7 @@ void PG::start_peering_interval(const OSDMapRef lastmap, osdmap, lastmap, info.pgid.pool(), - info.pgid, + info.pgid.pgid, &past_intervals, &debug); dout(10) << __func__ << ": check_new_interval output: " @@ -4405,12 +4564,12 @@ void PG::start_peering_interval(const OSDMapRef lastmap, } } // make sure we clear out any pg_temp change requests - osd->remove_want_pg_temp(info.pgid); + osd->remove_want_pg_temp(info.pgid.pgid); cancel_recovery(); - if (acting.empty() && !up.empty() && up[0] == osd->whoami) { + if (acting.empty() && !up.empty() && up_primary == pg_whoami) { dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl; - osd->queue_want_pg_temp(info.pgid, acting); + osd->queue_want_pg_temp(info.pgid.pgid, acting); } } @@ -4727,9 +4886,9 @@ void PG::queue_peering_event(CephPeeringEvtRef evt) void PG::queue_notify(epoch_t msg_epoch, epoch_t query_epoch, - int from, pg_notify_t& i) + pg_shard_t from, pg_notify_t& i) { - dout(10) << "notify " << i << " from osd." << from << dendl; + dout(10) << "notify " << i << " from replica " << from << dendl; queue_peering_event( CephPeeringEvtRef(new CephPeeringEvt(msg_epoch, query_epoch, MNotifyRec(from, i)))); @@ -4737,9 +4896,9 @@ void PG::queue_notify(epoch_t msg_epoch, void PG::queue_info(epoch_t msg_epoch, epoch_t query_epoch, - int from, pg_info_t& i) + pg_shard_t from, pg_info_t& i) { - dout(10) << "info " << i << " from osd." << from << dendl; + dout(10) << "info " << i << " from replica " << from << dendl; queue_peering_event( CephPeeringEvtRef(new CephPeeringEvt(msg_epoch, query_epoch, MInfoRec(from, i, msg_epoch)))); @@ -4747,10 +4906,10 @@ void PG::queue_info(epoch_t msg_epoch, void PG::queue_log(epoch_t msg_epoch, epoch_t query_epoch, - int from, + pg_shard_t from, MOSDPGLog *msg) { - dout(10) << "log " << *msg << " from osd." 
<< from << dendl; + dout(10) << "log " << *msg << " from replica " << from << dendl; queue_peering_event( CephPeeringEvtRef(new CephPeeringEvt(msg_epoch, query_epoch, MLogRec(from, msg)))); @@ -4775,26 +4934,33 @@ void PG::queue_flushed(epoch_t e) void PG::queue_query(epoch_t msg_epoch, epoch_t query_epoch, - int from, const pg_query_t& q) + pg_shard_t from, const pg_query_t& q) { - dout(10) << "handle_query " << q << " from osd." << from << dendl; + dout(10) << "handle_query " << q << " from replica " << from << dendl; queue_peering_event( CephPeeringEvtRef(new CephPeeringEvt(msg_epoch, query_epoch, MQuery(from, q, query_epoch)))); } -void PG::handle_advance_map(OSDMapRef osdmap, OSDMapRef lastmap, - vector& newup, vector& newacting, - RecoveryCtx *rctx) +void PG::handle_advance_map( + OSDMapRef osdmap, OSDMapRef lastmap, + vector& newup, int up_primary, + vector& newacting, int acting_primary, + RecoveryCtx *rctx) { assert(lastmap->get_epoch() == osdmap_ref->get_epoch()); assert(lastmap == osdmap_ref); - dout(10) << "handle_advance_map " << newup << "/" << newacting << dendl; + dout(10) << "handle_advance_map " + << newup << "/" << newacting + << " -- " << up_primary << "/" << acting_primary + << dendl; update_osdmap_ref(osdmap); pool.update(osdmap); if (pool.info.last_change == osdmap_ref->get_epoch()) on_pool_change(); - AdvMap evt(osdmap, lastmap, newup, newacting); + AdvMap evt( + osdmap, lastmap, newup, up_primary, + newacting, acting_primary); recovery_state.handle_event(evt, rctx); } @@ -4993,26 +5159,32 @@ boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap) // _before_ we are active. 
pg->generate_past_intervals(); - pg->remove_down_peer_info(advmap.osdmap); if (pg->acting_up_affected(advmap.newup, advmap.newacting) || pg->is_split(advmap.lastmap, advmap.osdmap)) { dout(10) << "up or acting affected, calling start_peering_interval again" << dendl; - pg->start_peering_interval(advmap.lastmap, advmap.newup, advmap.newacting, - context< RecoveryMachine >().get_cur_transaction()); + pg->start_peering_interval( + advmap.lastmap, + advmap.newup, advmap.up_primary, + advmap.newacting, advmap.acting_primary, + context< RecoveryMachine >().get_cur_transaction()); } + pg->remove_down_peer_info(advmap.osdmap); return discard_event(); } boost::statechart::result PG::RecoveryState::Reset::react(const ActMap&) { PG *pg = context< RecoveryMachine >().pg; - if (pg->should_send_notify() && pg->get_primary() >= 0) { - context< RecoveryMachine >().send_notify(pg->get_primary(), - pg_notify_t(pg->get_osdmap()->get_epoch(), - pg->get_osdmap()->get_epoch(), - pg->info), - pg->past_intervals); + if (pg->should_send_notify() && pg->get_primary().osd >= 0) { + context< RecoveryMachine >().send_notify( + pg->get_primary(), + pg_notify_t( + pg->get_primary().shard, pg->pg_whoami.shard, + pg->get_osdmap()->get_epoch(), + pg->get_osdmap()->get_epoch(), + pg->info), + pg->past_intervals); } pg->update_heartbeat_peers(); @@ -5073,13 +5245,6 @@ PG::RecoveryState::Primary::Primary(my_context ctx) assert(pg->want_acting.empty()); } -boost::statechart::result PG::RecoveryState::Primary::react(const AdvMap &advmap) -{ - PG *pg = context< RecoveryMachine >().pg; - pg->remove_down_peer_info(advmap.osdmap); - return forward_event(); -} - boost::statechart::result PG::RecoveryState::Primary::react(const MNotifyRec& notevt) { dout(7) << "handle_pg_notify from osd." 
<< notevt.from << dendl; @@ -5160,15 +5325,19 @@ boost::statechart::result PG::RecoveryState::Peering::react(const QueryState& q) q.f->close_section(); q.f->open_array_section("probing_osds"); - for (set::iterator p = prior_set->probe.begin(); p != prior_set->probe.end(); ++p) - q.f->dump_int("osd", *p); + for (set::iterator p = prior_set->probe.begin(); + p != prior_set->probe.end(); + ++p) + q.f->dump_stream("osd") << *p; q.f->close_section(); if (prior_set->pg_down) q.f->dump_string("blocked", "peering is blocked due to down osds"); q.f->open_array_section("down_osds_we_would_probe"); - for (set::iterator p = prior_set->down.begin(); p != prior_set->down.end(); ++p) + for (set::iterator p = prior_set->down.begin(); + p != prior_set->down.end(); + ++p) q.f->dump_int("osd", *p); q.f->close_section(); @@ -5259,8 +5428,9 @@ PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserve if (backfill_osd_it != context< Active >().sorted_backfill_set.end()) { //The primary never backfills itself - assert(*backfill_osd_it != pg->osd->whoami); - ConnectionRef con = pg->osd->get_con_osd_cluster(*backfill_osd_it, pg->get_osdmap()->get_epoch()); + assert(*backfill_osd_it != pg->pg_whoami); + ConnectionRef con = pg->osd->get_con_osd_cluster( + backfill_osd_it->osd, pg->get_osdmap()->get_epoch()); if (con) { if (con->has_feature(CEPH_FEATURE_BACKFILL_RESERVATION)) { unsigned priority = pg->is_degraded() ? 
OSDService::BACKFILL_HIGH @@ -5268,7 +5438,7 @@ PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserve pg->osd->send_message_osd_cluster( new MBackfillReserve( MBackfillReserve::REQUEST, - pg->info.pgid, + spg_t(pg->info.pgid.pgid, backfill_osd_it->shard), pg->get_osdmap()->get_epoch(), priority), con.get()); } else { @@ -5379,10 +5549,10 @@ PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved & { PG *pg = context< RecoveryMachine >().pg; pg->osd->send_message_osd_cluster( - pg->acting[0], + pg->primary.osd, new MRecoveryReserve( MRecoveryReserve::GRANT, - pg->info.pgid, + spg_t(pg->info.pgid.pgid, pg->primary.shard), pg->get_osdmap()->get_epoch()), pg->get_osdmap()->get_epoch()); return transit(); @@ -5439,10 +5609,10 @@ PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved & { PG *pg = context< RecoveryMachine >().pg; pg->osd->send_message_osd_cluster( - pg->acting[0], + pg->primary.osd, new MBackfillReserve( MBackfillReserve::GRANT, - pg->info.pgid, + spg_t(pg->info.pgid.pgid, pg->primary.shard), pg->get_osdmap()->get_epoch()), pg->get_osdmap()->get_epoch()); return transit(); @@ -5522,7 +5692,7 @@ void PG::RecoveryState::WaitLocalRecoveryReserved::exit() PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx) : my_base(ctx), NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/WaitRemoteRecoveryReserved"), - acting_osd_it(context< Active >().sorted_acting_set.begin()) + acting_osd_it(context< Active >().sorted_actingbackfill_set.begin()) { context< RecoveryMachine >().log_enter(state_name); post_event(RemoteRecoveryReserved()); @@ -5532,20 +5702,22 @@ boost::statechart::result PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) { PG *pg = context< RecoveryMachine >().pg; - if (acting_osd_it != context< Active >().sorted_acting_set.end()) { + if (acting_osd_it != context< Active 
>().sorted_actingbackfill_set.end()) { // skip myself - if (*acting_osd_it == pg->osd->whoami) + if (*acting_osd_it == pg->pg_whoami) ++acting_osd_it; } - if (acting_osd_it != context< Active >().sorted_acting_set.end()) { - ConnectionRef con = pg->osd->get_con_osd_cluster(*acting_osd_it, pg->get_osdmap()->get_epoch()); + if (acting_osd_it != context< Active >().sorted_actingbackfill_set.end()) { + ConnectionRef con = pg->osd->get_con_osd_cluster( + acting_osd_it->osd, pg->get_osdmap()->get_epoch()); if (con) { if (con->has_feature(CEPH_FEATURE_RECOVERY_RESERVATION)) { pg->osd->send_message_osd_cluster( - new MRecoveryReserve(MRecoveryReserve::REQUEST, - pg->info.pgid, - pg->get_osdmap()->get_epoch()), + new MRecoveryReserve( + MRecoveryReserve::REQUEST, + spg_t(pg->info.pgid.pgid, acting_osd_it->shard), + pg->get_osdmap()->get_epoch()), con.get()); } else { post_event(RemoteRecoveryReserved()); @@ -5584,18 +5756,21 @@ void PG::RecoveryState::Recovering::release_reservations() assert(!pg->pg_log.get_missing().have_missing()); // release remote reservations - for (set::const_iterator i = context< Active >().sorted_acting_set.begin(); - i != context< Active >().sorted_acting_set.end(); + for (set::const_iterator i = + context< Active >().sorted_actingbackfill_set.begin(); + i != context< Active >().sorted_actingbackfill_set.end(); ++i) { - if (*i == pg->osd->whoami) // skip myself + if (*i == pg->pg_whoami) // skip myself continue; - ConnectionRef con = pg->osd->get_con_osd_cluster(*i, pg->get_osdmap()->get_epoch()); + ConnectionRef con = pg->osd->get_con_osd_cluster( + i->osd, pg->get_osdmap()->get_epoch()); if (con) { if (con->has_feature(CEPH_FEATURE_RECOVERY_RESERVATION)) { pg->osd->send_message_osd_cluster( - new MRecoveryReserve(MRecoveryReserve::RELEASE, - pg->info.pgid, - pg->get_osdmap()->get_epoch()), + new MRecoveryReserve( + MRecoveryReserve::RELEASE, + spg_t(pg->info.pgid.pgid, i->shard), + pg->get_osdmap()->get_epoch()), con.get()); } } @@ -5632,7 
+5807,7 @@ PG::RecoveryState::Recovered::Recovered(my_context ctx) : my_base(ctx), NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/Recovered") { - int newest_update_osd; + pg_shard_t auth_log_shard; context< RecoveryMachine >().log_enter(state_name); @@ -5642,11 +5817,12 @@ PG::RecoveryState::Recovered::Recovered(my_context ctx) // if we finished backfill, all acting are active; recheck if // DEGRADED is appropriate. assert(pg->actingbackfill.size() > 0); - if (pg->get_osdmap()->get_pg_size(pg->info.pgid) <= pg->actingbackfill.size()) + if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= + pg->actingbackfill.size()) pg->state_clear(PG_STATE_DEGRADED); // adjust acting set? (e.g. because backfill completed...) - if (pg->acting != pg->up && !pg->choose_acting(newest_update_osd)) + if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard)) assert(pg->want_acting.size()); assert(!pg->needs_recovery()); @@ -5695,10 +5871,12 @@ void PG::RecoveryState::Clean::exit() PG::RecoveryState::Active::Active(my_context ctx) : my_base(ctx), NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active"), - sorted_acting_set(context< RecoveryMachine >().pg->actingbackfill.begin(), - context< RecoveryMachine >().pg->actingbackfill.end()), - sorted_backfill_set(context< RecoveryMachine >().pg->backfill_targets.begin(), - context< RecoveryMachine >().pg->backfill_targets.end()), + sorted_actingbackfill_set( + context< RecoveryMachine >().pg->actingbackfill.begin(), + context< RecoveryMachine >().pg->actingbackfill.end()), + sorted_backfill_set( + context< RecoveryMachine >().pg->backfill_targets.begin(), + context< RecoveryMachine >().pg->backfill_targets.end()), all_replicas_activated(false) { context< RecoveryMachine >().log_enter(state_name); @@ -5745,9 +5923,9 @@ boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap) /* Check for changes in pool size (if the acting set changed as a result, * this does not 
matter) */ - if (advmap.lastmap->get_pg_size(pg->info.pgid) != - pg->get_osdmap()->get_pg_size(pg->info.pgid)) { - if (pg->get_osdmap()->get_pg_size(pg->info.pgid) <= pg->acting.size()) + if (advmap.lastmap->get_pg_size(pg->info.pgid.pgid) != + pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) { + if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->acting.size()) pg->state_clear(PG_STATE_DEGRADED); else pg->state_set(PG_STATE_DEGRADED); @@ -5876,16 +6054,16 @@ boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q) { q.f->open_array_section("might_have_unfound"); - for (set::iterator p = pg->might_have_unfound.begin(); + for (set::iterator p = pg->might_have_unfound.begin(); p != pg->might_have_unfound.end(); ++p) { q.f->open_object_section("osd"); - q.f->dump_int("osd", *p); + q.f->dump_stream("osd") << *p; if (pg->peer_missing.count(*p)) { q.f->dump_string("status", "already probed"); } else if (pg->peer_missing_requested.count(*p)) { q.f->dump_string("status", "querying"); - } else if (!pg->get_osdmap()->is_up(*p)) { + } else if (!pg->get_osdmap()->is_up(p->osd)) { q.f->dump_string("status", "osd is down"); } else { q.f->dump_string("status", "not queried"); @@ -5909,10 +6087,10 @@ boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q) q.f->dump_int("scrubber.waiting_on", pg->scrubber.waiting_on); { q.f->open_array_section("scrubber.waiting_on_whom"); - for (set::iterator p = pg->scrubber.waiting_on_whom.begin(); + for (set::iterator p = pg->scrubber.waiting_on_whom.begin(); p != pg->scrubber.waiting_on_whom.end(); ++p) { - q.f->dump_int("osd", *p); + q.f->dump_stream("shard") << *p; } q.f->close_section(); } @@ -5968,7 +6146,7 @@ boost::statechart::result PG::RecoveryState::ReplicaActive::react( const Activate& actevt) { dout(10) << "In ReplicaActive, about to call activate" << dendl; PG *pg = context< RecoveryMachine >().pg; - map< int, map< pg_t, pg_query_t> > query_map; + map > query_map; 
pg->activate(*context< RecoveryMachine >().get_cur_transaction(), actevt.query_epoch, *context< RecoveryMachine >().get_on_safe_context_list(), @@ -5999,12 +6177,15 @@ boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MLogRec& boost::statechart::result PG::RecoveryState::ReplicaActive::react(const ActMap&) { PG *pg = context< RecoveryMachine >().pg; - if (pg->should_send_notify() && pg->get_primary() >= 0) { - context< RecoveryMachine >().send_notify(pg->get_primary(), - pg_notify_t(pg->get_osdmap()->get_epoch(), - pg->get_osdmap()->get_epoch(), - pg->info), - pg->past_intervals); + if (pg->should_send_notify() && pg->get_primary().osd >= 0) { + context< RecoveryMachine >().send_notify( + pg->get_primary(), + pg_notify_t( + pg->get_primary().shard, pg->pg_whoami.shard, + pg->get_osdmap()->get_epoch(), + pg->get_osdmap()->get_epoch(), + pg->info), + pg->past_intervals); } pg->take_waiters(); return discard_event(); @@ -6107,14 +6288,17 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query) { PG *pg = context< RecoveryMachine >().pg; if (query.query.type == pg_query_t::INFO) { - pair notify_info; + pair notify_info; pg->update_history_from_master(query.query.history); pg->fulfill_info(query.from, query.query, notify_info); - context< RecoveryMachine >().send_notify(notify_info.first, - pg_notify_t(query.query_epoch, - pg->get_osdmap()->get_epoch(), - notify_info.second), - pg->past_intervals); + context< RecoveryMachine >().send_notify( + notify_info.first, + pg_notify_t( + notify_info.first.shard, pg->pg_whoami.shard, + query.query_epoch, + pg->get_osdmap()->get_epoch(), + notify_info.second), + pg->past_intervals); } else { pg->fulfill_log(query.from, query.query, query.query_epoch); } @@ -6124,12 +6308,15 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query) boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&) { PG *pg = context< RecoveryMachine >().pg; - if 
(pg->should_send_notify() && pg->get_primary() >= 0) { - context< RecoveryMachine >().send_notify(pg->get_primary(), - pg_notify_t(pg->get_osdmap()->get_epoch(), - pg->get_osdmap()->get_epoch(), - pg->info), - pg->past_intervals); + if (pg->should_send_notify() && pg->get_primary().osd >= 0) { + context< RecoveryMachine >().send_notify( + pg->get_primary(), + pg_notify_t( + pg->get_primary().shard, pg->pg_whoami.shard, + pg->get_osdmap()->get_epoch(), + pg->get_osdmap()->get_epoch(), + pg->info), + pg->past_intervals); } pg->take_waiters(); return discard_event(); @@ -6170,11 +6357,11 @@ void PG::RecoveryState::GetInfo::get_infos() PG *pg = context< RecoveryMachine >().pg; auto_ptr &prior_set = context< Peering >().prior_set; - for (set::const_iterator it = prior_set->probe.begin(); + for (set::const_iterator it = prior_set->probe.begin(); it != prior_set->probe.end(); ++it) { - int peer = *it; - if (peer == pg->osd->whoami) { + pg_shard_t peer = *it; + if (peer == pg->pg_whoami) { continue; } if (pg->peer_info.count(peer)) { @@ -6183,12 +6370,13 @@ void PG::RecoveryState::GetInfo::get_infos() } if (peer_info_requested.count(peer)) { dout(10) << " already requested info from osd." << peer << dendl; - } else if (!pg->get_osdmap()->is_up(peer)) { + } else if (!pg->get_osdmap()->is_up(peer.osd)) { dout(10) << " not querying info from down osd." << peer << dendl; } else { dout(10) << " querying info from osd." 
<< peer << dendl; context< RecoveryMachine >().send_query( peer, pg_query_t(pg_query_t::INFO, + it->shard, pg->pg_whoami.shard, pg->info.history, pg->get_osdmap()->get_epoch())); peer_info_requested.insert(peer); @@ -6198,7 +6386,7 @@ void PG::RecoveryState::GetInfo::get_infos() boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& infoevt) { - set::iterator p = peer_info_requested.find(infoevt.from); + set::iterator p = peer_info_requested.find(infoevt.from); if (p != peer_info_requested.end()) peer_info_requested.erase(p); @@ -6214,7 +6402,7 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& in // filter out any osds that got dropped from the probe set from // peer_info_requested. this is less expensive than restarting // peering (which would re-probe everyone). - set::iterator p = peer_info_requested.begin(); + set::iterator p = peer_info_requested.begin(); while (p != peer_info_requested.end()) { if (prior_set->probe.count(*p) == 0) { dout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl; @@ -6256,15 +6444,16 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& in bool any_down_now = false; for (unsigned i=0; ipool.info.ec_pool() ? 
i : ghobject_t::NO_SHARD); if (!osdmap->exists(o) || osdmap->get_info(o).lost_at > interval.first) continue; // dne or lost if (osdmap->is_up(o)) { pg_info_t *pinfo; - if (o == pg->osd->whoami) { + if (so == pg->pg_whoami) { pinfo = &pg->info; } else { - assert(pg->peer_info.count(o)); - pinfo = &pg->peer_info[o]; + assert(pg->peer_info.count(so)); + pinfo = &pg->peer_info[so]; } if (!pinfo->is_incomplete()) any_up_complete_now = true; @@ -6294,9 +6483,11 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const QueryState& q) q.f->dump_stream("enter_time") << enter_time; q.f->open_array_section("requested_info_from"); - for (set::iterator p = peer_info_requested.begin(); p != peer_info_requested.end(); ++p) { + for (set::iterator p = peer_info_requested.begin(); + p != peer_info_requested.end(); + ++p) { q.f->open_object_section("osd"); - q.f->dump_int("osd", *p); + q.f->dump_stream("osd") << *p; if (pg->peer_info.count(*p)) { q.f->open_object_section("got_info"); pg->peer_info[*p].dump(q.f); @@ -6321,15 +6512,16 @@ void PG::RecoveryState::GetInfo::exit() /*------GetLog------------*/ PG::RecoveryState::GetLog::GetLog(my_context ctx) : my_base(ctx), - NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Peering/GetLog"), - newest_update_osd(-1), msg(0) + NamedState( + context< RecoveryMachine >().pg->cct, "Started/Primary/Peering/GetLog"), + msg(0) { context< RecoveryMachine >().log_enter(state_name); PG *pg = context< RecoveryMachine >().pg; // adjust acting? - if (!pg->choose_acting(newest_update_osd)) { + if (!pg->choose_acting(auth_log_shard)) { if (!pg->want_acting.empty()) { post_event(NeedActingChange()); } else { @@ -6339,16 +6531,16 @@ PG::RecoveryState::GetLog::GetLog(my_context ctx) } // am i the best? 
- if (newest_update_osd == pg->osd->whoami) { + if (auth_log_shard == pg->pg_whoami) { post_event(GotLog()); return; } - const pg_info_t& best = pg->peer_info[newest_update_osd]; + const pg_info_t& best = pg->peer_info[auth_log_shard]; // am i broken? if (pg->info.last_update < best.log_tail) { - dout(10) << " not contiguous with osd." << newest_update_osd << ", down" << dendl; + dout(10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl; post_event(IsIncomplete()); return; } @@ -6356,19 +6548,24 @@ PG::RecoveryState::GetLog::GetLog(my_context ctx) // how much log to request? eversion_t request_log_from = pg->info.last_update; assert(pg->actingbackfill.size() > 0); - for (vector::iterator p = pg->actingbackfill.begin() + 1; - p != pg->actingbackfill.end(); ++p) { + for (set::iterator p = pg->actingbackfill.begin(); + p != pg->actingbackfill.end(); + ++p) { + if (*p == pg->pg_whoami) continue; pg_info_t& ri = pg->peer_info[*p]; if (ri.last_update >= best.log_tail && ri.last_update < request_log_from) request_log_from = ri.last_update; } // how much? - dout(10) << " requesting log from osd." << newest_update_osd << dendl; + dout(10) << " requesting log from osd." << auth_log_shard << dendl; context().send_query( - newest_update_osd, - pg_query_t(pg_query_t::LOG, request_log_from, pg->info.history, - pg->get_osdmap()->get_epoch())); + auth_log_shard, + pg_query_t( + pg_query_t::LOG, + auth_log_shard.shard, pg->pg_whoami.shard, + request_log_from, pg->info.history, + pg->get_osdmap()->get_epoch())); } boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap) @@ -6376,8 +6573,9 @@ boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap) // make sure our log source didn't go down. we need to check // explicitly because it may not be part of the prior set, which // means the Peering state check won't catch it going down. 
- if (!advmap.osdmap->is_up(newest_update_osd)) { - dout(10) << "GetLog: newest_update_osd osd." << newest_update_osd << " went down" << dendl; + if (!advmap.osdmap->is_up(auth_log_shard.osd)) { + dout(10) << "GetLog: auth_log_shard osd." + << auth_log_shard.osd << " went down" << dendl; post_event(advmap); return transit< Reset >(); } @@ -6389,9 +6587,9 @@ boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap) boost::statechart::result PG::RecoveryState::GetLog::react(const MLogRec& logevt) { assert(!msg); - if (logevt.from != newest_update_osd) { + if (logevt.from != auth_log_shard) { dout(10) << "GetLog: discarding log from " - << "non-newest_update_osd osd." << logevt.from << dendl; + << "non-auth_log_shard osd." << logevt.from << dendl; return discard_event(); } dout(10) << "GetLog: recieved master log from osd" @@ -6409,7 +6607,7 @@ boost::statechart::result PG::RecoveryState::GetLog::react(const GotLog&) dout(10) << "processing master log" << dendl; pg->proc_master_log(*context().get_cur_transaction(), msg->info, msg->log, msg->missing, - newest_update_osd); + auth_log_shard); } pg->start_flush( context< RecoveryMachine >().get_cur_transaction(), @@ -6423,7 +6621,7 @@ boost::statechart::result PG::RecoveryState::GetLog::react(const QueryState& q) q.f->open_object_section("state"); q.f->dump_string("name", state_name); q.f->dump_stream("enter_time") << enter_time; - q.f->dump_int("newest_update_osd", newest_update_osd); + q.f->dump_stream("auth_log_shard") << auth_log_shard; q.f->close_section(); return forward_event(); } @@ -6557,9 +6755,10 @@ PG::RecoveryState::GetMissing::GetMissing(my_context ctx) PG *pg = context< RecoveryMachine >().pg; assert(pg->actingbackfill.size() > 0); - for (vector::iterator i = pg->actingbackfill.begin() + 1; + for (set::iterator i = pg->actingbackfill.begin(); i != pg->actingbackfill.end(); ++i) { + if (*i == pg->get_primary()) continue; const pg_info_t& pi = pg->peer_info[*i]; if (pi.is_empty()) @@ 
-6596,15 +6795,20 @@ PG::RecoveryState::GetMissing::GetMissing(my_context ctx) dout(10) << " requesting log+missing since " << since << " from osd." << *i << dendl; context< RecoveryMachine >().send_query( *i, - pg_query_t(pg_query_t::LOG, since, pg->info.history, - pg->get_osdmap()->get_epoch())); + pg_query_t( + pg_query_t::LOG, + i->shard, pg->pg_whoami.shard, + since, pg->info.history, + pg->get_osdmap()->get_epoch())); } else { dout(10) << " requesting fulllog+missing from osd." << *i << " (want since " << since << " < log.tail " << pi.log_tail << ")" << dendl; context< RecoveryMachine >().send_query( - *i, pg_query_t(pg_query_t::FULLLOG, - pg->info.history, pg->get_osdmap()->get_epoch())); + *i, pg_query_t( + pg_query_t::FULLLOG, + i->shard, pg->pg_whoami.shard, + pg->info.history, pg->get_osdmap()->get_epoch())); } peer_missing_requested.insert(*i); } @@ -6650,9 +6854,11 @@ boost::statechart::result PG::RecoveryState::GetMissing::react(const QueryState& q.f->dump_stream("enter_time") << enter_time; q.f->open_array_section("peer_missing_requested"); - for (set::iterator p = peer_missing_requested.begin(); p != peer_missing_requested.end(); ++p) { + for (set::iterator p = peer_missing_requested.begin(); + p != peer_missing_requested.end(); + ++p) { q.f->open_object_section("osd"); - q.f->dump_int("osd", *p); + q.f->dump_stream("osd") << *p; if (pg->peer_missing.count(*p)) { q.f->open_object_section("got_missing"); pg->peer_missing[*p].dump(q.f); @@ -6779,13 +6985,14 @@ void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_ #undef dout_prefix #define dout_prefix (*_dout << (debug_pg ? 
debug_pg->gen_prefix() : string()) << " PriorSet: ") -PG::PriorSet::PriorSet(const OSDMap &osdmap, +PG::PriorSet::PriorSet(bool ec_pool, + const OSDMap &osdmap, const map &past_intervals, const vector &up, const vector &acting, const pg_info_t &info, const PG *debug_pg) - : pg_down(false) + : ec_pool(ec_pool), pg_down(false) { /* * We have to be careful to gracefully deal with situations like @@ -6835,11 +7042,11 @@ PG::PriorSet::PriorSet(const OSDMap &osdmap, // so that we know what they do/do not have explicitly before // sending them any new info/logs/whatever. for (unsigned i=0; i::const_reverse_iterator p = past_intervals.rbegin(); p != past_intervals.rend(); @@ -6865,6 +7072,7 @@ PG::PriorSet::PriorSet(const OSDMap &osdmap, // consider ACTING osds for (unsigned i=0; i::iterator p = probe.begin(); + for (set::iterator p = probe.begin(); p != probe.end(); ++p) { - int o = *p; + int o = p->osd; // did someone in the prior set go down? if (osdmap->is_down(o) && down.count(o) == 0) { @@ -6931,7 +7138,7 @@ bool PG::PriorSet::affected_by_map(const OSDMapRef osdmap, const PG *debug_pg) c } // did a down osd in cur get (re)marked as lost? - map::const_iterator r = blocked_by.find(o); + map::const_iterator r = blocked_by.find(o); if (r != blocked_by.end()) { if (!osdmap->exists(o)) { dout(10) << "affected_by_map osd." 
<< o << " no longer exists" << dendl; diff --git a/src/osd/PG.h b/src/osd/PG.h index 2ff9e164d0c..032e731240f 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -289,19 +289,19 @@ public: const coll_t coll; PGLog pg_log; - static string get_info_key(pg_t pgid) { + static string get_info_key(spg_t pgid) { return stringify(pgid) + "_info"; } - static string get_biginfo_key(pg_t pgid) { + static string get_biginfo_key(spg_t pgid) { return stringify(pgid) + "_biginfo"; } - static string get_epoch_key(pg_t pgid) { + static string get_epoch_key(spg_t pgid) { return stringify(pgid) + "_epoch"; } hobject_t log_oid; hobject_t biginfo_oid; - map > missing_loc; - set missing_loc_sources; // superset of missing_loc locations + map > missing_loc; + set missing_loc_sources; // superset of missing_loc locations interval_set snap_collections; // obsolete map past_intervals; @@ -312,7 +312,7 @@ public: * (if they have one) */ xlist::item recovery_item, scrub_item, scrub_finalize_item, snap_trim_item, stat_queue_item; int recovery_ops_active; - set waiting_on_backfill; + set waiting_on_backfill; #ifdef DEBUG_RECOVERY_OIDS set recovering_oids; #endif @@ -332,20 +332,26 @@ public: // primary state public: - vector up, acting, want_acting, actingbackfill; - map peer_last_complete_ondisk; + pg_shard_t primary; + pg_shard_t pg_whoami; + pg_shard_t up_primary; + vector up, acting, want_acting; + set actingbackfill; + map peer_last_complete_ondisk; eversion_t min_last_complete_ondisk; // up: min over last_complete_ondisk, peer_last_complete_ondisk eversion_t pg_trim_to; // [primary only] content recovery state protected: struct PriorSet { - set probe; /// current+prior OSDs we need to probe. + const bool ec_pool; + set probe; /// current+prior OSDs we need to probe. set down; /// down osds that would normally be in @a probe and might be interesting. 
- map blocked_by; /// current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set + map blocked_by; /// current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set bool pg_down; /// some down osds are included in @a cur; the DOWN pg state bit should be set. - PriorSet(const OSDMap &osdmap, + PriorSet(bool ec_pool, + const OSDMap &osdmap, const map &past_intervals, const vector &up, const vector &acting, @@ -364,15 +370,17 @@ public: public: struct RecoveryCtx { utime_t start_time; - map< int, map > *query_map; - map< int, vector > > *info_map; - map< int, vector > > *notify_list; + map > *query_map; + map > > *info_map; + map > > *notify_list; C_Contexts *on_applied; C_Contexts *on_safe; ObjectStore::Transaction *transaction; - RecoveryCtx(map< int, map > *query_map, - map< int, vector > > *info_map, - map< int, vector > > *notify_list, + RecoveryCtx(map > *query_map, + map > > *info_map, + map > > *notify_list, C_Contexts *on_applied, C_Contexts *on_safe, ObjectStore::Transaction *transaction) @@ -403,24 +411,26 @@ protected: */ bool need_up_thru; - set stray_set; // non-acting osds that have PG data. + set stray_set; // non-acting osds that have PG data. 
eversion_t oldest_update; // acting: lowest (valid) last_update in active set - map peer_info; // info from peers (stray or prior) - set peer_purged; // peers purged - map peer_missing; - set peer_log_requested; // logs i've requested (and start stamps) - set peer_missing_requested; - set stray_purged; // i deleted these strays; ignore racing PGInfo from them - set peer_activated; + map peer_info; // info from peers (stray or prior) + set peer_purged; // peers purged + map peer_missing; + set peer_log_requested; // logs i've requested (and start stamps) + set peer_missing_requested; + + // i deleted these strays; ignore racing PGInfo from them + set stray_purged; + set peer_activated; // primary-only, recovery-only state - set might_have_unfound; // These osds might have objects on them - // which are unfound on the primary + set might_have_unfound; // These osds might have objects on them + // which are unfound on the primary epoch_t last_peering_reset; /* heartbeat peers */ - void set_probe_targets(const set &probe_set); + void set_probe_targets(const set &probe_set); void clear_probe_targets(); public: Mutex heartbeat_peer_lock; @@ -505,21 +515,17 @@ protected: }; BackfillInterval backfill_info; - map peer_backfill_info; + map peer_backfill_info; bool backfill_reserved; bool backfill_reserving; friend class OSD; public: - vector backfill_targets; + set backfill_targets; - bool is_backfill_targets(int osd) { - if (std::find(backfill_targets.begin(), backfill_targets.end(), osd) - != backfill_targets.end()) - return true; - else - return false; + bool is_backfill_targets(pg_shard_t osd) { + return backfill_targets.count(osd); } protected: @@ -564,20 +570,15 @@ public: void clear_primary_state(); public: - bool is_acting(int osd) const { - for (unsigned i=0; i osd.shard && acting[osd.shard] == osd.osd; + } else { + return std::find(acting.begin(), acting.end(), osd.osd) != acting.end(); + } } bool needs_recovery() const; @@ -601,10 +602,13 @@ public: bool 
calc_min_last_complete_ondisk() { eversion_t min = last_complete_ondisk; assert(actingbackfill.size() > 0); - for (unsigned i=1; i::iterator i = actingbackfill.begin(); + i != actingbackfill.end(); + ++i) { + if (*i == get_primary()) continue; + if (peer_last_complete_ondisk.count(*i) == 0) return false; // we don't have complete info - eversion_t a = peer_last_complete_ondisk[actingbackfill[i]]; + eversion_t a = peer_last_complete_ondisk[*i]; if (a < min) min = a; } @@ -617,10 +621,10 @@ public: virtual void calc_trim_to() = 0; void proc_replica_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, - pg_missing_t& omissing, int from); + pg_missing_t& omissing, pg_shard_t from); void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, - pg_missing_t& omissing, int from); - bool proc_replica_info(int from, const pg_info_t &info); + pg_missing_t& omissing, pg_shard_t from); + bool proc_replica_info(pg_shard_t from, const pg_info_t &info); struct LogEntryTrimmer : public ObjectModDesc::Visitor { @@ -719,28 +723,37 @@ public: ObjectStore::Transaction *t, const hobject_t &soid); void remove_snap_mapped_object( ObjectStore::Transaction& t, const hobject_t& soid); - void merge_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, int from); + void merge_log( + ObjectStore::Transaction& t, pg_info_t &oinfo, + pg_log_t &olog, pg_shard_t from); void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead); - bool search_for_missing(const pg_info_t &oinfo, const pg_missing_t *omissing, - int fromosd); + bool search_for_missing( + const pg_info_t &oinfo, const pg_missing_t *omissing, + pg_shard_t fromosd); void check_for_lost_objects(); void forget_lost_objects(); - void discover_all_missing(std::map< int, map > &query_map); + void discover_all_missing(std::map > &query_map); void trim_write_ahead(); - map::const_iterator find_best_info(const map &infos) const; - bool calc_acting(int& newest_update_osd, vector& 
want, vector& backfill) const; - bool choose_acting(int& newest_update_osd); + map::const_iterator find_best_info( + const map &infos) const; + bool calc_acting( + pg_shard_t &auth_log_shard, + vector &want, + set &backfill) const; + bool choose_acting(pg_shard_t &auth_log_shard); void build_might_have_unfound(); void replay_queued_ops(); - void activate(ObjectStore::Transaction& t, - epoch_t query_epoch, - list& tfin, - map< int, map >& query_map, - map > > *activator_map=0); + void activate( + ObjectStore::Transaction& t, + epoch_t query_epoch, + list& tfin, + map >& query_map, + map > > *activator_map=0); void _activate_committed(epoch_t e); void all_activated_and_committed(); @@ -805,7 +818,7 @@ public: } // metadata - set reserved_peers; + set reserved_peers; bool reserved, reserve_failed; epoch_t epoch_start; @@ -814,12 +827,12 @@ public: bool active; bool queue_snap_trim; int waiting_on; - set waiting_on_whom; + set waiting_on_whom; int shallow_errors; int deep_errors; int fixed; ScrubMap primary_scrubmap; - map received_maps; + map received_maps; MOSDRepScrub *active_rep_scrub; utime_t scrub_reg_stamp; // stamp we registered for @@ -827,12 +840,12 @@ public: bool must_scrub, must_deep_scrub, must_repair; // Maps from objects with errors to missing/inconsistent peers - map > missing; - map > inconsistent; - map > inconsistent_snapcolls; + map > missing; + map > inconsistent; + map > inconsistent_snapcolls; // Map from object with errors to good peer - map > authoritative; + map > authoritative; // classic scrub bool classic; @@ -944,7 +957,11 @@ public: int active_pushes; - void repair_object(const hobject_t& soid, ScrubMap::object *po, int bad_peer, int ok_peer); + void repair_object( + const hobject_t& soid, ScrubMap::object *po, + pg_shard_t bad_peer, + pg_shard_t ok_peer); + void scrub(ThreadPool::TPHandle &handle); void classic_scrub(ThreadPool::TPHandle &handle); void chunky_scrub(ThreadPool::TPHandle &handle); @@ -955,8 +972,8 @@ public: void 
scrub_clear_state(); bool scrub_gather_replica_maps(); void _scan_snaps(ScrubMap &map); - void _request_scrub_map_classic(int replica, eversion_t version); - void _request_scrub_map(int replica, eversion_t version, + void _request_scrub_map_classic(pg_shard_t replica, eversion_t version); + void _request_scrub_map(pg_shard_t replica, eversion_t version, hobject_t start, hobject_t end, bool deep); int build_scrub_map_chunk( ScrubMap &map, @@ -970,14 +987,14 @@ public: virtual void _scrub_finish() { } virtual void get_colls(list *out) = 0; virtual void split_colls( - pg_t child, + spg_t child, int split_bits, int seed, ObjectStore::Transaction *t) = 0; virtual bool _report_snap_collection_errors( const hobject_t &hoid, const map &attrs, - int osd, + pg_shard_t osd, ostream &out) { return false; }; void clear_scrub_reserved(); void scrub_reserve_replicas(); @@ -1055,10 +1072,10 @@ public: }; struct MInfoRec : boost::statechart::event< MInfoRec > { - int from; + pg_shard_t from; pg_info_t info; epoch_t msg_epoch; - MInfoRec(int from, pg_info_t &info, epoch_t msg_epoch) : + MInfoRec(pg_shard_t from, pg_info_t &info, epoch_t msg_epoch) : from(from), info(info), msg_epoch(msg_epoch) {} void print(std::ostream *out) const { *out << "MInfoRec from " << from << " info: " << info; @@ -1066,9 +1083,9 @@ public: }; struct MLogRec : boost::statechart::event< MLogRec > { - int from; + pg_shard_t from; boost::intrusive_ptr msg; - MLogRec(int from, MOSDPGLog *msg) : + MLogRec(pg_shard_t from, MOSDPGLog *msg) : from(from), msg(msg) {} void print(std::ostream *out) const { *out << "MLogRec from " << from; @@ -1076,9 +1093,9 @@ public: }; struct MNotifyRec : boost::statechart::event< MNotifyRec > { - int from; + pg_shard_t from; pg_notify_t notify; - MNotifyRec(int from, pg_notify_t &notify) : + MNotifyRec(pg_shard_t from, pg_notify_t &notify) : from(from), notify(notify) {} void print(std::ostream *out) const { *out << "MNotifyRec from " << from << " notify: " << notify; @@ -1086,10 
+1103,10 @@ public: }; struct MQuery : boost::statechart::event< MQuery > { - int from; + pg_shard_t from; pg_query_t query; epoch_t query_epoch; - MQuery(int from, const pg_query_t &query, epoch_t query_epoch): + MQuery(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch): from(from), query(query), query_epoch(query_epoch) {} void print(std::ostream *out) const { *out << "MQuery from " << from @@ -1102,8 +1119,16 @@ public: OSDMapRef osdmap; OSDMapRef lastmap; vector newup, newacting; - AdvMap(OSDMapRef osdmap, OSDMapRef lastmap, vector& newup, vector& newacting): - osdmap(osdmap), lastmap(lastmap), newup(newup), newacting(newacting) {} + int up_primary, acting_primary; + AdvMap( + OSDMapRef osdmap, OSDMapRef lastmap, + vector& newup, int up_primary, + vector& newacting, int acting_primary): + osdmap(osdmap), lastmap(lastmap), + newup(newup), + newacting(newacting), + up_primary(up_primary), + acting_primary(acting_primary) {} void print(std::ostream *out) const { *out << "AdvMap"; } @@ -1196,12 +1221,13 @@ public: return state->rctx->transaction; } - void send_query(int to, const pg_query_t &query) { + void send_query(pg_shard_t to, const pg_query_t &query) { assert(state->rctx->query_map); - (*state->rctx->query_map)[to][pg->info.pgid] = query; + (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] = + query; } - map > *get_query_map() { + map > *get_query_map() { assert(state->rctx->query_map); return state->rctx->query_map; } @@ -1221,9 +1247,10 @@ public: return &(state->rctx->on_applied->contexts); } - void send_notify(int to, const pg_notify_t &info, const pg_interval_map_t &pi) { + void send_notify(pg_shard_t to, + const pg_notify_t &info, const pg_interval_map_t &pi) { assert(state->rctx->notify_list); - (*state->rctx->notify_list)[to].push_back(make_pair(info, pi)); + (*state->rctx->notify_list)[to.osd].push_back(make_pair(info, pi)); } }; friend class RecoveryMachine; @@ -1335,10 +1362,8 @@ public: typedef boost::mpl::list < 
boost::statechart::custom_reaction< ActMap >, boost::statechart::custom_reaction< MNotifyRec >, - boost::statechart::transition< NeedActingChange, WaitActingChange >, - boost::statechart::custom_reaction< AdvMap> + boost::statechart::transition< NeedActingChange, WaitActingChange > > reactions; - boost::statechart::result react(const AdvMap&); boost::statechart::result react(const ActMap&); boost::statechart::result react(const MNotifyRec&); }; @@ -1385,8 +1410,8 @@ public: Active(my_context ctx); void exit(); - const set sorted_acting_set; - const set sorted_backfill_set; + const set sorted_actingbackfill_set; + const set sorted_backfill_set; bool all_replicas_activated; typedef boost::mpl::list < @@ -1448,7 +1473,7 @@ public: boost::statechart::custom_reaction< RemoteReservationRejected >, boost::statechart::transition< AllBackfillsReserved, Backfilling > > reactions; - set::const_iterator backfill_osd_it; + set::const_iterator backfill_osd_it; WaitRemoteBackfillReserved(my_context ctx); void exit(); boost::statechart::result react(const RemoteBackfillReserved& evt); @@ -1550,7 +1575,7 @@ public: boost::statechart::custom_reaction< RemoteRecoveryReserved >, boost::statechart::transition< AllRemotesReserved, Recovering > > reactions; - set::const_iterator acting_osd_it; + set::const_iterator acting_osd_it; WaitRemoteRecoveryReserved(my_context ctx); boost::statechart::result react(const RemoteRecoveryReserved &evt); void exit(); @@ -1599,7 +1624,7 @@ public: struct GetLog; struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState { - set peer_info_requested; + set peer_info_requested; GetInfo(my_context ctx); void exit(); @@ -1620,7 +1645,7 @@ public: }; struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState { - int newest_update_osd; + pg_shard_t auth_log_shard; boost::intrusive_ptr msg; GetLog(my_context ctx); @@ -1643,7 +1668,7 @@ public: struct WaitFlushedPeering; struct GetMissing : boost::statechart::state< GetMissing, 
Peering >, NamedState { - set peer_missing_requested; + set peer_missing_requested; GetMissing(my_context ctx); void exit(); @@ -1725,7 +1750,7 @@ public: public: PG(OSDService *o, OSDMapRef curmap, - const PGPool &pool, pg_t p, const hobject_t& loid, const hobject_t& ioid); + const PGPool &pool, spg_t p, const hobject_t& loid, const hobject_t& ioid); virtual ~PG(); private: @@ -1734,15 +1759,52 @@ public: PG& operator=(const PG& rhs); public: - pg_t get_pgid() const { return info.pgid; } + spg_t get_pgid() const { return info.pgid; } int get_nrep() const { return acting.size(); } - int get_primary() { return acting.empty() ? -1:acting[0]; } + void init_primary_up_acting( + const vector &newup, + const vector &newacting, + int new_up_primary, + int new_acting_primary) { + actingset.clear(); + acting = newacting; + for (shard_id_t i = 0; i < acting.size(); ++i) { + if (acting[i] != CRUSH_ITEM_NONE) + actingset.insert( + pg_shard_t( + acting[i], + pool.info.ec_pool() ? i : ghobject_t::NO_SHARD)); + } + up = newup; + if (!pool.info.ec_pool()) { + up_primary = pg_shard_t(new_up_primary, ghobject_t::no_shard()); + primary = pg_shard_t(new_acting_primary, ghobject_t::no_shard()); + return; + } + up_primary = pg_shard_t(); + primary = pg_shard_t(); + for (shard_id_t i = 0; i < up.size(); ++i) { + if (up[i] == new_up_primary) { + up_primary = pg_shard_t(up[i], i); + break; + } + } + for (shard_id_t i = 0; i < acting.size(); ++i) { + if (acting[i] == new_acting_primary) { + primary = pg_shard_t(acting[i], i); + break; + } + } + assert(up_primary.osd == new_up_primary); + assert(primary.osd == new_acting_primary); + } + pg_shard_t get_primary() const { return primary; } int get_role() const { return role; } void set_role(int r) { role = r; } - bool is_primary() const { return role == 0; } + bool is_primary() const { return pg_whoami == primary; } bool is_replica() const { return role > 0; } epoch_t get_last_peering_reset() const { return last_peering_reset; } @@ -1770,7 
+1832,9 @@ public: void init( int role, vector& up, + int up_primary, vector& acting, + int acting_primary, pg_history_t& history, pg_interval_map_t& pim, bool backfill, @@ -1831,10 +1895,11 @@ public: /// share new pg log entries after a pg is active void share_pg_log(); - void start_peering_interval(const OSDMapRef lastmap, - const vector& newup, - const vector& newacting, - ObjectStore::Transaction *t); + void start_peering_interval( + const OSDMapRef lastmap, + const vector& newup, int up_primary, + const vector& newacting, int acting_primary, + ObjectStore::Transaction *t); void start_flush(ObjectStore::Transaction *t, list *on_applied, list *on_safe); @@ -1845,9 +1910,9 @@ public: } void update_history_from_master(pg_history_t new_history); - void fulfill_info(int from, const pg_query_t &query, - pair &notify_info); - void fulfill_log(int from, const pg_query_t &query, epoch_t query_epoch); + void fulfill_info(pg_shard_t from, const pg_query_t &query, + pair &notify_info); + void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch); bool is_split(OSDMapRef lastmap, OSDMapRef nextmap); bool acting_up_affected(const vector& newup, const vector& newacting); @@ -1883,18 +1948,20 @@ public: void queue_peering_event(CephPeeringEvtRef evt); void handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx); void queue_notify(epoch_t msg_epoch, epoch_t query_epoch, - int from, pg_notify_t& i); + pg_shard_t from, pg_notify_t& i); void queue_info(epoch_t msg_epoch, epoch_t query_epoch, - int from, pg_info_t& i); - void queue_log(epoch_t msg_epoch, epoch_t query_epoch, int from, + pg_shard_t from, pg_info_t& i); + void queue_log(epoch_t msg_epoch, epoch_t query_epoch, pg_shard_t from, MOSDPGLog *msg); void queue_query(epoch_t msg_epoch, epoch_t query_epoch, - int from, const pg_query_t& q); + pg_shard_t from, const pg_query_t& q); void queue_null(epoch_t msg_epoch, epoch_t query_epoch); void queue_flushed(epoch_t started_at); - void 
handle_advance_map(OSDMapRef osdmap, OSDMapRef lastmap, - vector& newup, vector& newacting, - RecoveryCtx *rctx); + void handle_advance_map( + OSDMapRef osdmap, OSDMapRef lastmap, + vector& newup, int up_primary, + vector& newacting, int acting_primary, + RecoveryCtx *rctx); void handle_activate_map(RecoveryCtx *rctx); void handle_create(RecoveryCtx *rctx); void handle_loaded(RecoveryCtx *rctx); diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h index e13369c2ee0..0968323da79 100644 --- a/src/osd/PGBackend.h +++ b/src/osd/PGBackend.h @@ -78,18 +78,17 @@ * Called when peer is recovered */ virtual void on_peer_recover( - int peer, + pg_shard_t peer, const hobject_t &oid, const ObjectRecoveryInfo &recovery_info, const object_stat_sum_t &stat ) = 0; virtual void begin_peer_recover( - int peer, + pg_shard_t peer, const hobject_t oid) = 0; - virtual void failed_push(int from, const hobject_t &soid) = 0; - + virtual void failed_push(pg_shard_t from, const hobject_t &soid) = 0; virtual void cancel_pull(const hobject_t &soid) = 0; @@ -108,15 +107,22 @@ ObjectStore::Transaction *t, OpRequestRef op = OpRequestRef() ) = 0; - virtual epoch_t get_epoch() = 0; - virtual const vector &get_actingbackfill() = 0; + virtual epoch_t get_epoch() const = 0; + + virtual const set &get_actingbackfill_shards() const = 0; + virtual std::string gen_dbg_prefix() const = 0; - virtual const map > &get_missing_loc() = 0; - virtual const map &get_peer_missing() = 0; - virtual const map &get_peer_info() = 0; - virtual const pg_missing_t &get_local_missing() = 0; - virtual const PGLog &get_log() = 0; + virtual const map > &get_missing_loc_shards() + const = 0; + + virtual const map &get_shard_missing() + const = 0; + + virtual const map &get_shard_info() const = 0; + + virtual const pg_missing_t &get_local_missing() const = 0; + virtual const PGLog &get_log() const = 0; virtual bool pgb_is_primary() const = 0; virtual OSDMapRef pgb_get_osdmap() const = 0; virtual const pg_info_t &get_info() 
const = 0; @@ -129,7 +135,7 @@ const eversion_t &applied_version) = 0; virtual bool should_send_op( - int peer, + pg_shard_t peer, const hobject_t &hoid) = 0; virtual void log_operation( @@ -139,7 +145,7 @@ ObjectStore::Transaction *t) = 0; virtual void update_peer_last_complete_ondisk( - int fromosd, + pg_shard_t fromosd, eversion_t lcod) = 0; virtual void update_last_complete_ondisk( @@ -151,7 +157,16 @@ virtual void schedule_work( GenContext *c) = 0; - virtual int whoami() const = 0; + virtual pg_shard_t whoami_shard() const = 0; + int whoami() const { + return whoami_shard().osd; + } + spg_t whoami_spg_t() const { + return get_info().pgid; + } + + virtual spg_t primary_spg_t() const = 0; + virtual pg_shard_t primary_shard() const = 0; virtual void send_message_osd_cluster( int peer, Message *m, epoch_t from_epoch) = 0; @@ -266,7 +281,7 @@ out->push_back(temp_coll); } void split_colls( - pg_t child, + spg_t child, int split_bits, int seed, ObjectStore::Transaction *t) { @@ -499,24 +514,26 @@ Context *on_complete) = 0; virtual bool scrub_supported() { return false; } - virtual void be_scan_list(ScrubMap &map, const vector &ls, bool deep, + virtual void be_scan_list( + ScrubMap &map, const vector &ls, bool deep, ThreadPool::TPHandle &handle) { assert(0); } virtual enum scrub_error_type be_compare_scrub_objects( - const ScrubMap::object &auth, - const ScrubMap::object &candidate, - ostream &errorstream) { assert(0); } - virtual map::const_iterator be_select_auth_object( + const ScrubMap::object &auth, + const ScrubMap::object &candidate, + ostream &errorstream) { assert(0); } + virtual map::const_iterator be_select_auth_object( const hobject_t &obj, - const map &maps) { assert(0); } - virtual void be_compare_scrubmaps(const map &maps, - map > &missing, - map > &inconsistent, - map &authoritative, - map > &invalid_snapcolls, - int &shallow_errors, int &deep_errors, - const pg_t pgid, - const vector &acting, - ostream &errorstream) { assert(0); } + const map &maps) { 
assert(0); } + virtual void be_compare_scrubmaps( + const map &maps, + map > &missing, + map > &inconsistent, + map &authoritative, + map > &invalid_snapcolls, + int &shallow_errors, int &deep_errors, + const spg_t pgid, + const vector &acting, + ostream &errorstream) { assert(0); } }; struct PG_SendMessageOnConn: public Context { diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc index a643a5a7fa7..c6b391521a0 100644 --- a/src/osd/PGLog.cc +++ b/src/osd/PGLog.cc @@ -110,7 +110,7 @@ void PGLog::clear() { } void PGLog::clear_info_log( - pg_t pgid, + spg_t pgid, const hobject_t &infos_oid, const hobject_t &log_oid, ObjectStore::Transaction *t) { @@ -144,8 +144,10 @@ void PGLog::trim( } } -void PGLog::proc_replica_log(ObjectStore::Transaction& t, - pg_info_t &oinfo, const pg_log_t &olog, pg_missing_t& omissing, int from) const +void PGLog::proc_replica_log( + ObjectStore::Transaction& t, + pg_info_t &oinfo, const pg_log_t &olog, pg_missing_t& omissing, + pg_shard_t from) const { dout(10) << "proc_replica_log for osd." 
<< from << ": " << oinfo << " " << olog << " " << omissing << dendl; @@ -451,7 +453,7 @@ void PGLog::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead } void PGLog::merge_log(ObjectStore::Transaction& t, - pg_info_t &oinfo, pg_log_t &olog, int fromosd, + pg_info_t &oinfo, pg_log_t &olog, pg_shard_t fromosd, pg_info_t &info, LogEntryHandler *rollbacker, bool &dirty_info, bool &dirty_big_info) { diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h index beafdbfc13f..1cf9df505ec 100644 --- a/src/osd/PGLog.h +++ b/src/osd/PGLog.h @@ -313,7 +313,7 @@ public: void reset_recovery_pointers() { log.reset_recovery_pointers(); } static void clear_info_log( - pg_t pgid, + spg_t pgid, const hobject_t &infos_oid, const hobject_t &log_oid, ObjectStore::Transaction *t); @@ -385,7 +385,7 @@ public: } void proc_replica_log(ObjectStore::Transaction& t, pg_info_t &oinfo, const pg_log_t &olog, - pg_missing_t& omissing, int from) const; + pg_missing_t& omissing, pg_shard_t from) const; protected: bool _merge_old_entry( @@ -418,7 +418,8 @@ public: pg_info_t &info, LogEntryHandler *rollbacker, bool &dirty_info, bool &dirty_big_info); - void merge_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, int from, + void merge_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, + pg_shard_t from, pg_info_t &info, LogEntryHandler *rollbacker, bool &dirty_info, bool &dirty_big_info); diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc index bf0c8542c08..8dc86b4965c 100644 --- a/src/osd/ReplicatedBackend.cc +++ b/src/osd/ReplicatedBackend.cc @@ -76,10 +76,10 @@ void ReplicatedBackend::recover_object( void ReplicatedBackend::check_recovery_sources(const OSDMapRef osdmap) { - for(map >::iterator i = pull_from_peer.begin(); + for(map >::iterator i = pull_from_peer.begin(); i != pull_from_peer.end(); ) { - if (osdmap->is_down(i->first)) { + if (osdmap->is_down(i->first.osd)) { dout(10) << "check_recovery_sources resetting pulls from osd." 
<< i->first << ", osdmap has it marked down" << dendl; for (set::iterator j = i->second.begin(); @@ -504,6 +504,14 @@ void ReplicatedBackend::submit_transaction( ) ).first->second; + op.waiting_for_applied.insert( + parent->get_actingbackfill_shards().begin(), + parent->get_actingbackfill_shards().end()); + op.waiting_for_commit.insert( + parent->get_actingbackfill_shards().begin(), + parent->get_actingbackfill_shards().end()); + + issue_op( soid, at_version, @@ -517,10 +525,6 @@ void ReplicatedBackend::submit_transaction( &op, op_t); - // add myself to gather set - op.waiting_for_applied.insert(osd->whoami); - op.waiting_for_commit.insert(osd->whoami); - ObjectStore::Transaction local_t; if (t->get_temp_added().size()) { get_temp_coll(&local_t); @@ -553,7 +557,7 @@ void ReplicatedBackend::op_applied( if (op->op) op->op->mark_event("op_applied"); - op->waiting_for_applied.erase(get_parent()->whoami()); + op->waiting_for_applied.erase(get_parent()->whoami_shard()); parent->op_applied(op->v); if (op->waiting_for_applied.empty()) { @@ -573,7 +577,7 @@ void ReplicatedBackend::op_commit( if (op->op) op->op->mark_event("op_commit"); - op->waiting_for_commit.erase(get_parent()->whoami()); + op->waiting_for_commit.erase(get_parent()->whoami_shard()); if (op->waiting_for_commit.empty()) { op->on_commit->complete(0); @@ -594,7 +598,7 @@ void ReplicatedBackend::sub_op_modify_reply(OpRequestRef op) // must be replication. tid_t rep_tid = r->get_tid(); - int fromosd = r->get_source().num(); + pg_shard_t from = r->from; if (in_progress_ops.count(rep_tid)) { map::iterator iter = @@ -607,30 +611,30 @@ void ReplicatedBackend::sub_op_modify_reply(OpRequestRef op) if (m) dout(7) << __func__ << ": tid " << ip_op.tid << " op " //<< *m << " ack_type " << (int)r->ack_type - << " from osd." << fromosd + << " from " << from << dendl; else dout(7) << __func__ << ": tid " << ip_op.tid << " (no op) " << " ack_type " << (int)r->ack_type - << " from osd." 
<< fromosd + << " from " << from << dendl; // oh, good. if (r->ack_type & CEPH_OSD_FLAG_ONDISK) { - assert(ip_op.waiting_for_commit.count(fromosd)); - ip_op.waiting_for_commit.erase(fromosd); + assert(ip_op.waiting_for_commit.count(from)); + ip_op.waiting_for_commit.erase(from); if (ip_op.op) ip_op.op->mark_event("sub_op_commit_rec"); } else { - assert(ip_op.waiting_for_applied.count(fromosd)); + assert(ip_op.waiting_for_applied.count(from)); if (ip_op.op) ip_op.op->mark_event("sub_op_applied_rec"); } - ip_op.waiting_for_applied.erase(fromosd); + ip_op.waiting_for_applied.erase(from); parent->update_peer_last_complete_ondisk( - fromosd, + from, r->get_last_complete_ondisk()); if (ip_op.waiting_for_applied.empty() && @@ -667,12 +671,21 @@ void ReplicatedBackend::be_scan_list( hobject_t poid = *p; struct stat st; - int r = osd->store->stat(coll, poid, &st, true); + int r = store->stat( + coll, + ghobject_t( + poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + &st, + true); if (r == 0) { ScrubMap::object &o = map.objects[poid]; o.size = st.st_size; assert(!o.negative); - osd->store->getattrs(coll, poid, o.attrs); + store->getattrs( + coll, + ghobject_t( + poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + o.attrs); // calculate the CRC32 on deep scrubs if (deep) { @@ -680,9 +693,14 @@ void ReplicatedBackend::be_scan_list( bufferlist bl, hdrbl; int r; __u64 pos = 0; - while ( (r = osd->store->read(coll, poid, pos, - cct->_conf->osd_deep_scrub_stride, bl, - true)) > 0) { + while ( ( + r = store->read( + coll, + ghobject_t( + poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + pos, + cct->_conf->osd_deep_scrub_stride, bl, + true)) > 0) { handle.reset_tp_timeout(); h << bl; pos += bl.length(); @@ -697,7 +715,11 @@ void ReplicatedBackend::be_scan_list( o.digest_present = true; bl.clear(); - r = osd->store->omap_get_header(coll, poid, &hdrbl, true); + r = store->omap_get_header( + coll, + ghobject_t( + poid, ghobject_t::NO_GEN, 
get_parent()->whoami_shard().shard), + &hdrbl, true); if (r == 0) { dout(25) << "CRC header " << string(hdrbl.c_str(), hdrbl.length()) << dendl; @@ -710,8 +732,10 @@ void ReplicatedBackend::be_scan_list( o.read_error = true; } - ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator( - coll, poid); + ObjectMap::ObjectMapIterator iter = store->get_omap_iterator( + coll, + ghobject_t( + poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); assert(iter); uint64_t keys_scanned = 0; for (iter->seek_to_first(); iter->valid() ; iter->next()) { @@ -756,9 +780,9 @@ void ReplicatedBackend::be_scan_list( } enum scrub_error_type ReplicatedBackend::be_compare_scrub_objects( - const ScrubMap::object &auth, - const ScrubMap::object &candidate, - ostream &errorstream) + const ScrubMap::object &auth, + const ScrubMap::object &candidate, + ostream &errorstream) { enum scrub_error_type error = CLEAN; if (candidate.read_error) { @@ -824,12 +848,13 @@ enum scrub_error_type ReplicatedBackend::be_compare_scrub_objects( return error; } -map::const_iterator ReplicatedBackend::be_select_auth_object( +map::const_iterator + ReplicatedBackend::be_select_auth_object( const hobject_t &obj, - const map &maps) + const map &maps) { - map::const_iterator auth = maps.end(); - for (map::const_iterator j = maps.begin(); + map::const_iterator auth = maps.end(); + for (map::const_iterator j = maps.begin(); j != maps.end(); ++j) { map::iterator i = @@ -896,19 +921,19 @@ map::const_iterator ReplicatedBackend::be_select_auth_object( return auth; } -void ReplicatedBackend::be_compare_scrubmaps(const map &maps, - map > &missing, - map > &inconsistent, - map &authoritative, - map > &invalid_snapcolls, - int &shallow_errors, - int &deep_errors, - const pg_t pgid, - const vector &acting, - ostream &errorstream) +void ReplicatedBackend::be_compare_scrubmaps( + const map &maps, + map > &missing, + map > &inconsistent, + map &authoritative, + map > &invalid_snapcolls, + int &shallow_errors, int 
&deep_errors, + const spg_t pgid, + const vector &acting, + ostream &errorstream) { map::const_iterator i; - map::const_iterator j; + map::const_iterator j; set master_set; // Construct master set @@ -922,10 +947,11 @@ void ReplicatedBackend::be_compare_scrubmaps(const map &maps, for (set::const_iterator k = master_set.begin(); k != master_set.end(); ++k) { - map::const_iterator auth = be_select_auth_object(*k, maps); + map::const_iterator auth = + be_select_auth_object(*k, maps); assert(auth != maps.end()); - set cur_missing; - set cur_inconsistent; + set cur_missing; + set cur_inconsistent; for (j = maps.begin(); j != maps.end(); ++j) { if (j == auth) continue; @@ -941,14 +967,13 @@ void ReplicatedBackend::be_compare_scrubmaps(const map &maps, ++shallow_errors; else ++deep_errors; - errorstream << pgid << " osd." << acting[j->first] + errorstream << pgid << " shard " << j->first << ": soid " << *k << " " << ss.str() << std::endl; } } else { cur_missing.insert(j->first); ++shallow_errors; - errorstream << pgid - << " osd." 
<< acting[j->first] + errorstream << pgid << " shard " << j->first << " missing " << *k << std::endl; } } diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h index 05dd9c761ce..7f37fd285dd 100644 --- a/src/osd/ReplicatedBackend.h +++ b/src/osd/ReplicatedBackend.h @@ -23,8 +23,8 @@ struct C_ReplicatedBackend_OnPullComplete; class ReplicatedBackend : public PGBackend { struct RPGHandle : public PGBackend::RecoveryHandle { - map > pushes; - map > pulls; + map > pushes; + map > pulls; }; friend struct C_ReplicatedBackend_OnPullComplete; public: @@ -73,11 +73,11 @@ public: virtual void dump_recovery_info(Formatter *f) const { { f->open_array_section("pull_from_peer"); - for (map >::const_iterator i = pull_from_peer.begin(); + for (map >::const_iterator i = pull_from_peer.begin(); i != pull_from_peer.end(); ++i) { f->open_object_section("pulling_from"); - f->dump_int("pull_from", i->first); + f->dump_stream("pull_from") << i->first; { f->open_array_section("pulls"); for (set::const_iterator j = i->second.begin(); @@ -96,7 +96,7 @@ public: } { f->open_array_section("pushing"); - for (map >::const_iterator i = + for (map >::const_iterator i = pushing.begin(); i != pushing.end(); ++i) { @@ -104,11 +104,11 @@ public: f->dump_stream("pushing") << i->first; { f->open_array_section("pushing_to"); - for (map::const_iterator j = i->second.begin(); + for (map::const_iterator j = i->second.begin(); j != i->second.end(); ++j) { f->open_object_section("push_progress"); - f->dump_stream("object_pushing") << j->first; + f->dump_stream("pushing_to") << j->first; { f->open_object_section("push_info"); j->second.dump(f); @@ -157,7 +157,7 @@ private: } } }; - map > pushing; + map > pushing; // pull struct PullInfo { @@ -188,7 +188,7 @@ private: map pulling; // Reverse mapping from osd peer to objects beging pulled from that peer - map > pull_from_peer; + map > pull_from_peer; void sub_op_push(OpRequestRef op); void sub_op_push_reply(OpRequestRef op); @@ -206,13 +206,13 
@@ private: void do_pull(OpRequestRef op); void do_push_reply(OpRequestRef op); - bool handle_push_reply(int peer, PushReplyOp &op, PushOp *reply); - void handle_pull(int peer, PullOp &op, PushOp *reply); + bool handle_push_reply(pg_shard_t peer, PushReplyOp &op, PushOp *reply); + void handle_pull(pg_shard_t peer, PullOp &op, PushOp *reply); bool handle_pull_response( - int from, PushOp &op, PullOp *response, + pg_shard_t from, PushOp &op, PullOp *response, list *to_continue, ObjectStore::Transaction *t); - void handle_push(int from, PushOp &op, PushReplyOp *response, + void handle_push(pg_shard_t from, PushOp &op, PushReplyOp *response, ObjectStore::Transaction *t); static void trim_pushed_data(const interval_set &copy_subset, @@ -220,18 +220,18 @@ private: bufferlist data_received, interval_set *intervals_usable, bufferlist *data_usable); - void _failed_push(int from, const hobject_t &soid); + void _failed_push(pg_shard_t from, const hobject_t &soid); - void send_pushes(int prio, map > &pushes); + void send_pushes(int prio, map > &pushes); void prep_push_op_blank(const hobject_t& soid, PushOp *op); - int send_push_op_legacy(int priority, int peer, + int send_push_op_legacy(int priority, pg_shard_t peer, PushOp &pop); - int send_pull_legacy(int priority, int peer, + int send_pull_legacy(int priority, pg_shard_t peer, const ObjectRecoveryInfo& recovery_info, ObjectRecoveryProgress progress); void send_pulls( int priority, - map > &pulls); + map > &pulls); int build_push_op(const ObjectRecoveryInfo &recovery_info, const ObjectRecoveryProgress &progress, @@ -265,13 +265,13 @@ private: ObjectContextRef obj, RPGHandle *h); void prep_push_to_replica( - ObjectContextRef obc, const hobject_t& soid, int peer, + ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer, PushOp *pop); void prep_push(ObjectContextRef obc, - const hobject_t& oid, int dest, + const hobject_t& oid, pg_shard_t dest, PushOp *op); void prep_push(ObjectContextRef obc, - const hobject_t& soid, int 
peer, + const hobject_t& soid, pg_shard_t peer, eversion_t version, interval_set &data_subset, map >& clone_subsets, @@ -291,8 +291,8 @@ private: */ struct InProgressOp { tid_t tid; - set waiting_for_commit; - set waiting_for_applied; + set waiting_for_commit; + set waiting_for_applied; Context *on_commit; Context *on_applied; OpRequestRef op; @@ -380,24 +380,27 @@ private: void sub_op_modify_applied(RepModifyRef rm); void sub_op_modify_commit(RepModifyRef rm); bool scrub_supported() { return true; } - void be_scan_list(ScrubMap &map, const vector &ls, bool deep, + + void be_scan_list( + ScrubMap &map, const vector &ls, bool deep, ThreadPool::TPHandle &handle); enum scrub_error_type be_compare_scrub_objects( - const ScrubMap::object &auth, - const ScrubMap::object &candidate, - ostream &errorstream); - map::const_iterator be_select_auth_object( + const ScrubMap::object &auth, + const ScrubMap::object &candidate, + ostream &errorstream); + map::const_iterator be_select_auth_object( const hobject_t &obj, - const map &maps); - void be_compare_scrubmaps(const map &maps, - map > &missing, - map > &inconsistent, - map &authoritative, - map > &invalid_snapcolls, - int &shallow_errors, int &deep_errors, - const pg_t pgid, - const vector &acting, - ostream &errorstream); + const map &maps); + void be_compare_scrubmaps( + const map &maps, + map > &missing, + map > &inconsistent, + map &authoritative, + map > &invalid_snapcolls, + int &shallow_errors, int &deep_errors, + const spg_t pgid, + const vector &acting, + ostream &errorstream); }; #endif diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 879a1928373..a6e958d6452 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -305,7 +305,7 @@ void ReplicatedPG::on_global_recover( } void ReplicatedPG::on_peer_recover( - int peer, + pg_shard_t peer, const hobject_t &soid, const ObjectRecoveryInfo &recovery_info, const object_stat_sum_t &stat) @@ -317,7 +317,7 @@ void 
ReplicatedPG::on_peer_recover( } void ReplicatedPG::begin_peer_recover( - int peer, + pg_shard_t peer, const hobject_t soid) { peer_missing[peer].revise_have(soid, eversion_t()); @@ -422,8 +422,12 @@ bool ReplicatedPG::is_degraded_object(const hobject_t& soid) { if (pg_log.get_missing().missing.count(soid)) return true; - for (unsigned i = 1; i < actingbackfill.size(); i++) { - int peer = actingbackfill[i]; + assert(actingbackfill.size() > 0); + for (set::iterator i = actingbackfill.begin(); + i != actingbackfill.end(); + ++i) { + if (*i == get_primary()) continue; + pg_shard_t peer = *i; if (peer_missing.count(peer) && peer_missing[peer].missing.count(soid)) return true; @@ -456,8 +460,11 @@ void ReplicatedPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef << dendl; eversion_t v; assert(actingbackfill.size() > 0); - for (unsigned i = 1; i < actingbackfill.size(); i++) { - int peer = actingbackfill[i]; + for (set::iterator i = actingbackfill.begin(); + i != actingbackfill.end(); + ++i) { + if (*i == get_primary()) continue; + pg_shard_t peer = *i; if (peer_missing.count(peer) && peer_missing[peer].missing.count(soid)) { v = peer_missing[peer].missing[soid].need; @@ -582,14 +589,18 @@ int ReplicatedPG::do_command(cmdmap_t cmdmap, ostream& ss, f->close_section(); if (backfill_targets.size() > 0) { f->open_array_section("backfill_targets"); - for (vector::iterator p = backfill_targets.begin(); p != backfill_targets.end(); ++p) - f->dump_unsigned("osd", *p); + for (set::iterator p = backfill_targets.begin(); + p != backfill_targets.end(); + ++p) + f->dump_stream("shard") << *p; f->close_section(); } if (actingbackfill.size() > 0) { f->open_array_section("actingbackfill"); - for (vector::iterator p = actingbackfill.begin(); p != actingbackfill.end(); ++p) - f->dump_unsigned("osd", *p); + for (set::iterator p = actingbackfill.begin(); + p != actingbackfill.end(); + ++p) + f->dump_stream("shard") << *p; f->close_section(); } f->open_object_section("info"); 
@@ -598,11 +609,11 @@ int ReplicatedPG::do_command(cmdmap_t cmdmap, ostream& ss, f->close_section(); f->open_array_section("peer_info"); - for (map::iterator p = peer_info.begin(); + for (map::iterator p = peer_info.begin(); p != peer_info.end(); ++p) { f->open_object_section("info"); - f->dump_unsigned("peer", p->first); + f->dump_stream("peer") << p->first; p->second.dump(f.get()); f->close_section(); } @@ -683,10 +694,13 @@ int ReplicatedPG::do_command(cmdmap_t cmdmap, ostream& ss, p->second.dump(f.get()); // have, need keys { f->open_array_section("locations"); - map >::iterator q = missing_loc.find(p->first); + map >::iterator q = + missing_loc.find(p->first); if (q != missing_loc.end()) - for (set::iterator r = q->second.begin(); r != q->second.end(); ++r) - f->dump_int("osd", *r); + for (set::iterator r = q->second.begin(); + r != q->second.end(); + ++r) + f->dump_stream("shard") << *r; f->close_section(); } f->close_section(); @@ -761,7 +775,7 @@ void ReplicatedPG::do_pg_op(OpRequestRef op) // fall through case CEPH_OSD_OP_PGLS: - if (m->get_pg() != info.pgid) { + if (m->get_pg() != info.pgid.pgid) { dout(10) << " pgls pg=" << m->get_pg() << " != " << info.pgid << dendl; result = 0; // hmm? 
} else { @@ -997,7 +1011,7 @@ void ReplicatedPG::calc_trim_to() } ReplicatedPG::ReplicatedPG(OSDService *o, OSDMapRef curmap, - const PGPool &_pool, pg_t p, const hobject_t& oid, + const PGPool &_pool, spg_t p, const hobject_t& oid, const hobject_t& ioid) : PG(o, curmap, _pool, p, oid, ioid), pgbackend(new ReplicatedBackend(this, coll_t(p), o->store, cct)), @@ -1085,9 +1099,11 @@ void ReplicatedPG::do_request( hobject_t ReplicatedPG::earliest_backfill() const { hobject_t e = hobject_t::get_max(); - for (unsigned i = 0; i < backfill_targets.size(); ++i) { - int bt = backfill_targets[i]; - map::const_iterator iter = peer_info.find(bt); + for (set::iterator i = backfill_targets.begin(); + i != backfill_targets.end(); + ++i) { + pg_shard_t bt = *i; + map::const_iterator iter = peer_info.find(bt); assert(iter != peer_info.end()); if (iter->second.last_backfill < e) e = iter->second.last_backfill; @@ -1105,9 +1121,12 @@ hobject_t ReplicatedPG::earliest_backfill() const // take the larger of last_backfill_started and the replicas last_backfill. 
bool ReplicatedPG::check_src_targ(const hobject_t& soid, const hobject_t& toid) const { - for (unsigned i = 0; i < backfill_targets.size(); ++i) { - int bt = backfill_targets[i]; - map::const_iterator iter = peer_info.find(bt); + for (set::iterator i = actingbackfill.begin(); + i != actingbackfill.end(); + ++i) { + if (*i == get_primary()) continue; + pg_shard_t bt = *i; + map::const_iterator iter = peer_info.find(bt); assert(iter != peer_info.end()); if (toid <= MAX(last_backfill_started, iter->second.last_backfill) && @@ -1917,9 +1936,11 @@ void ReplicatedPG::do_scan( cct->_conf->osd_backfill_scan_max, &bi, handle); - MOSDPGScan *reply = new MOSDPGScan(MOSDPGScan::OP_SCAN_DIGEST, - get_osdmap()->get_epoch(), m->query_epoch, - info.pgid, bi.begin, bi.end); + MOSDPGScan *reply = new MOSDPGScan( + MOSDPGScan::OP_SCAN_DIGEST, + pg_whoami, + get_osdmap()->get_epoch(), m->query_epoch, + spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end); ::encode(bi.objects, reply->get_data()); osd->send_message_osd_cluster(reply, m->get_connection()); } @@ -1927,7 +1948,7 @@ void ReplicatedPG::do_scan( case MOSDPGScan::OP_SCAN_DIGEST: { - int from = m->get_source().num(); + pg_shard_t from = m->from; // Check that from is in backfill_targets vector assert(is_backfill_targets(from)); @@ -1969,7 +1990,7 @@ void ReplicatedBackend::_do_push(OpRequestRef op) { MOSDPGPush *m = static_cast(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_PUSH); - int from = m->get_source().num(); + pg_shard_t from = m->from; vector replies; ObjectStore::Transaction *t = new ObjectStore::Transaction; @@ -1981,6 +2002,7 @@ void ReplicatedBackend::_do_push(OpRequestRef op) } MOSDPGPushReply *reply = new MOSDPGPushReply; + reply->from = get_parent()->whoami_shard(); reply->set_priority(m->get_priority()); reply->pgid = get_info().pgid; reply->map_epoch = m->map_epoch; @@ -2027,7 +2049,7 @@ void ReplicatedBackend::_do_pull_response(OpRequestRef op) { MOSDPGPush *m = static_cast(op->get_req()); 
assert(m->get_header().type == MSG_OSD_PG_PUSH); - int from = m->get_source().num(); + pg_shard_t from = m->from; vector replies(1); ObjectStore::Transaction *t = new ObjectStore::Transaction; @@ -2054,6 +2076,7 @@ void ReplicatedBackend::_do_pull_response(OpRequestRef op) if (replies.size()) { MOSDPGPull *reply = new MOSDPGPull; + reply->from = parent->whoami_shard(); reply->set_priority(m->get_priority()); reply->pgid = get_info().pgid; reply->map_epoch = m->map_epoch; @@ -2074,9 +2097,9 @@ void ReplicatedBackend::do_pull(OpRequestRef op) { MOSDPGPull *m = static_cast(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_PULL); - int from = m->get_source().num(); + pg_shard_t from = m->from; - map > replies; + map > replies; for (vector::iterator i = m->pulls.begin(); i != m->pulls.end(); ++i) { @@ -2090,7 +2113,7 @@ void ReplicatedBackend::do_push_reply(OpRequestRef op) { MOSDPGPushReply *m = static_cast(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_PUSH_REPLY); - int from = m->get_source().num(); + pg_shard_t from = m->from; vector replies(1); for (vector::iterator i = m->replies.begin(); @@ -2102,7 +2125,7 @@ void ReplicatedBackend::do_push_reply(OpRequestRef op) } replies.erase(replies.end() - 1); - map > _replies; + map > _replies; _replies[from].swap(replies); send_pushes(m->get_priority(), _replies); } @@ -2120,9 +2143,11 @@ void ReplicatedPG::do_backfill(OpRequestRef op) { assert(cct->_conf->osd_kill_backfill_at != 1); - MOSDPGBackfill *reply = new MOSDPGBackfill(MOSDPGBackfill::OP_BACKFILL_FINISH_ACK, - get_osdmap()->get_epoch(), m->query_epoch, - info.pgid); + MOSDPGBackfill *reply = new MOSDPGBackfill( + MOSDPGBackfill::OP_BACKFILL_FINISH_ACK, + get_osdmap()->get_epoch(), + m->query_epoch, + spg_t(info.pgid.pgid, primary.shard)); reply->set_priority(cct->_conf->osd_recovery_op_priority); osd->send_message_osd_cluster(reply, m->get_connection()); queue_peering_event( @@ -5068,8 +5093,10 @@ void ReplicatedPG::finish_ctx(OpContext *ctx, 
int log_op_type) ctx->obc->ssc->snapset = ctx->new_snapset; info.stats.stats.add(ctx->delta_stats, ctx->obs->oi.category); - for (unsigned i = 0; i < backfill_targets.size() ; ++i) { - int bt = backfill_targets[i]; + for (set::iterator i = backfill_targets.begin(); + i != backfill_targets.end(); + ++i) { + pg_shard_t bt = *i; pg_info_t& pinfo = peer_info[bt]; if (soid <= pinfo.last_backfill) pinfo.stats.stats.add(ctx->delta_stats, ctx->obs->oi.category); @@ -6102,7 +6129,7 @@ void ReplicatedPG::op_applied(const eversion_t &applied_version) scrubber.finalizing = true; scrub_gather_replica_maps(); ++scrubber.waiting_on; - scrubber.waiting_on_whom.insert(osd->whoami); + scrubber.waiting_on_whom.insert(pg_whoami); osd->scrub_wq.queue(this); } } else { @@ -6269,9 +6296,10 @@ void ReplicatedPG::issue_repop(RepGather *repop, utime_t now) repop->v = ctx->at_version; - for (vector::iterator i = actingbackfill.begin() + 1; + for (set::iterator i = actingbackfill.begin(); i != actingbackfill.end(); ++i) { + if (*i == get_primary()) continue; pg_info_t &pinfo = peer_info[*i]; // keep peer_info up to date if (pinfo.last_complete == pinfo.last_update) @@ -6336,25 +6364,27 @@ void ReplicatedBackend::issue_op( { int acks_wanted = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; - if (parent->get_actingbackfill().size() > 1) { + if (parent->get_actingbackfill_shards().size() > 1) { ostringstream ss; - ss << "waiting for subops from " << - vector( - parent->get_actingbackfill().begin() + 1, - parent->get_actingbackfill().end()); + set replicas = parent->get_actingbackfill_shards(); + replicas.erase(parent->whoami_shard()); + ss << "waiting for subops from " << replicas; if (op->op) op->op->mark_sub_op_sent(ss.str()); } - for (unsigned i=1; iget_actingbackfill().size(); i++) { - int peer = parent->get_actingbackfill()[i]; - const pg_info_t &pinfo = parent->get_peer_info().find(peer)->second; - - op->waiting_for_applied.insert(peer); - op->waiting_for_commit.insert(peer); + for 
(set::const_iterator i = + parent->get_actingbackfill_shards().begin(); + i != parent->get_actingbackfill_shards().end(); + ++i) { + if (*i == parent->whoami_shard()) continue; + pg_shard_t peer = *i; + const pg_info_t &pinfo = parent->get_shard_info().find(peer)->second; // forward the write/update/whatever MOSDSubOp *wr = new MOSDSubOp( - reqid, get_info().pgid, soid, + reqid, parent->whoami_shard(), + spg_t(get_info().pgid.pgid, i->shard), + soid, false, acks_wanted, get_osdmap()->get_epoch(), tid, at_version); @@ -6385,7 +6415,7 @@ void ReplicatedBackend::issue_op( wr->discard_temp_oid = discard_temp_oid; get_parent()->send_message_osd_cluster( - peer, wr, get_osdmap()->get_epoch()); + peer.osd, wr, get_osdmap()->get_epoch()); } } @@ -7145,7 +7175,9 @@ void ReplicatedBackend::sub_op_modify_applied(RepModifyRef rm) if (!rm->committed) { // send ack to acker only if we haven't sent a commit already - MOSDSubOpReply *ack = new MOSDSubOpReply(m, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK); + MOSDSubOpReply *ack = new MOSDSubOpReply( + m, parent->whoami_shard(), + 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK); ack->set_priority(CEPH_MSG_PRIO_HIGH); // this better match commit priority! get_parent()->send_message_osd_cluster( rm->ackerosd, ack, get_osdmap()->get_epoch()); @@ -7166,7 +7198,10 @@ void ReplicatedBackend::sub_op_modify_commit(RepModifyRef rm) assert(get_osdmap()->is_up(rm->ackerosd)); get_parent()->update_last_complete_ondisk(rm->last_complete); - MOSDSubOpReply *commit = new MOSDSubOpReply(static_cast(rm->op->get_req()), 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ONDISK); + MOSDSubOpReply *commit = new MOSDSubOpReply( + static_cast(rm->op->get_req()), + get_parent()->whoami_shard(), + 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ONDISK); commit->set_last_complete_ondisk(rm->last_complete); commit->set_priority(CEPH_MSG_PRIO_HIGH); // this better match ack priority! 
get_parent()->send_message_osd_cluster( @@ -7334,34 +7369,32 @@ void ReplicatedBackend::prepare_pull( eversion_t _v = get_parent()->get_local_missing().missing.find( soid)->second.need; assert(_v == v); - const map > &missing_loc( - get_parent()->get_missing_loc()); - const map &peer_missing( - get_parent()->get_peer_missing()); - int fromosd = -1; - map >::const_iterator q = missing_loc.find(soid); + const map > &missing_loc( + get_parent()->get_missing_loc_shards()); + const map &peer_missing( + get_parent()->get_shard_missing()); + map >::const_iterator q = missing_loc.find(soid); assert(q != missing_loc.end()); assert(!q->second.empty()); // pick a pullee - vector shuffle(q->second.begin(), q->second.end()); + vector shuffle(q->second.begin(), q->second.end()); random_shuffle(shuffle.begin(), shuffle.end()); - vector::iterator p = shuffle.begin(); - assert(get_osdmap()->is_up(*p)); - fromosd = *p; - assert(fromosd >= 0); + vector::iterator p = shuffle.begin(); + assert(get_osdmap()->is_up(p->osd)); + pg_shard_t fromshard = *p; dout(7) << "pull " << soid << "v " << v << " on osds " << *p - << " from osd." << fromosd + << " from osd." 
<< fromshard << dendl; - assert(peer_missing.count(fromosd)); - const pg_missing_t &pmissing = peer_missing.find(fromosd)->second; + assert(peer_missing.count(fromshard)); + const pg_missing_t &pmissing = peer_missing.find(fromshard)->second; if (pmissing.is_missing(soid, v)) { assert(pmissing.missing.find(soid)->second.have != v); - dout(10) << "pulling soid " << soid << " from osd " << fromosd + dout(10) << "pulling soid " << soid << " from osd " << fromshard << " at version " << pmissing.missing.find(soid)->second.have << " rather than at version " << v << dendl; v = pmissing.missing.find(soid)->second.have; @@ -7398,8 +7431,8 @@ void ReplicatedBackend::prepare_pull( recovery_info.size = ((uint64_t)-1); } - h->pulls[fromosd].push_back(PullOp()); - PullOp &op = h->pulls[fromosd].back(); + h->pulls[fromshard].push_back(PullOp()); + PullOp &op = h->pulls[fromshard].back(); op.soid = soid; op.recovery_info = recovery_info; @@ -7411,7 +7444,7 @@ void ReplicatedBackend::prepare_pull( op.recovery_progress.first = true; assert(!pulling.count(soid)); - pull_from_peer[fromosd].insert(soid); + pull_from_peer[fromshard].insert(soid); PullInfo &pi = pulling[soid]; pi.head_ctx = headctx; pi.recovery_info = op.recovery_info; @@ -7423,7 +7456,7 @@ int ReplicatedPG::recover_missing( int priority, PGBackend::RecoveryHandle *h) { - map >::iterator q = missing_loc.find(soid); + map >::iterator q = missing_loc.find(soid); if (q == missing_loc.end()) { dout(7) << "pull " << soid << " v " << v @@ -7489,7 +7522,8 @@ int ReplicatedPG::recover_missing( return PULL_YES; } -void ReplicatedPG::send_remove_op(const hobject_t& oid, eversion_t v, int peer) +void ReplicatedPG::send_remove_op( + const hobject_t& oid, eversion_t v, pg_shard_t peer) { tid_t tid = osd->get_tid(); osd_reqid_t rid(osd->get_cluster_msgr_name(), 0, tid); @@ -7497,12 +7531,14 @@ void ReplicatedPG::send_remove_op(const hobject_t& oid, eversion_t v, int peer) dout(10) << "send_remove_op " << oid << " from osd." 
<< peer << " tid " << tid << dendl; - MOSDSubOp *subop = new MOSDSubOp(rid, info.pgid, oid, false, CEPH_OSD_FLAG_ACK, - get_osdmap()->get_epoch(), tid, v); + MOSDSubOp *subop = new MOSDSubOp( + rid, pg_whoami, spg_t(info.pgid.pgid, peer.shard), + oid, false, CEPH_OSD_FLAG_ACK, + get_osdmap()->get_epoch(), tid, v); subop->ops = vector(1); subop->ops[0].op.op = CEPH_OSD_OP_DELETE; - osd->send_message_osd_cluster(peer, subop, get_osdmap()->get_epoch()); + osd->send_message_osd_cluster(peer.osd, subop, get_osdmap()->get_epoch()); } /* @@ -7510,7 +7546,7 @@ void ReplicatedPG::send_remove_op(const hobject_t& oid, eversion_t v, int peer) * clones/heads and dup data ranges where possible. */ void ReplicatedBackend::prep_push_to_replica( - ObjectContextRef obc, const hobject_t& soid, int peer, + ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer, PushOp *pop) { const object_info_t& oi = obc->obs.oi; @@ -7544,12 +7580,12 @@ void ReplicatedBackend::prep_push_to_replica( SnapSetContext *ssc = obc->ssc; assert(ssc); dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl; - map::const_iterator pm = - get_parent()->get_peer_missing().find(peer); - assert(pm != get_parent()->get_peer_missing().end()); - map::const_iterator pi = - get_parent()->get_peer_info().find(peer); - assert(pi != get_parent()->get_peer_info().end()); + map::const_iterator pm = + get_parent()->get_shard_missing().find(peer); + assert(pm != get_parent()->get_shard_missing().end()); + map::const_iterator pi = + get_parent()->get_shard_info().find(peer); + assert(pi != get_parent()->get_shard_info().end()); calc_clone_subsets(ssc->snapset, soid, pm->second, pi->second.last_backfill, @@ -7562,8 +7598,8 @@ void ReplicatedBackend::prep_push_to_replica( dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl; calc_head_subsets( obc, - ssc->snapset, soid, get_parent()->get_peer_missing().find(peer)->second, - get_parent()->get_peer_info().find(peer)->second.last_backfill, + 
ssc->snapset, soid, get_parent()->get_shard_missing().find(peer)->second, + get_parent()->get_shard_info().find(peer)->second.last_backfill, data_subset, clone_subsets); } @@ -7571,7 +7607,7 @@ void ReplicatedBackend::prep_push_to_replica( } void ReplicatedBackend::prep_push(ObjectContextRef obc, - const hobject_t& soid, int peer, + const hobject_t& soid, pg_shard_t peer, PushOp *pop) { interval_set data_subset; @@ -7586,7 +7622,7 @@ void ReplicatedBackend::prep_push(ObjectContextRef obc, void ReplicatedBackend::prep_push( ObjectContextRef obc, - const hobject_t& soid, int peer, + const hobject_t& soid, pg_shard_t peer, eversion_t version, interval_set &data_subset, map >& clone_subsets, @@ -7617,7 +7653,7 @@ void ReplicatedBackend::prep_push( pi.recovery_progress = new_progress; } -int ReplicatedBackend::send_pull_legacy(int prio, int peer, +int ReplicatedBackend::send_pull_legacy(int prio, pg_shard_t peer, const ObjectRecoveryInfo &recovery_info, ObjectRecoveryProgress progress) { @@ -7632,10 +7668,12 @@ int ReplicatedBackend::send_pull_legacy(int prio, int peer, << " from osd." 
<< peer << " tid " << tid << dendl; - MOSDSubOp *subop = new MOSDSubOp(rid, get_info().pgid, recovery_info.soid, - false, CEPH_OSD_FLAG_ACK, - get_osdmap()->get_epoch(), tid, - recovery_info.version); + MOSDSubOp *subop = new MOSDSubOp( + rid, parent->whoami_shard(), + get_info().pgid, recovery_info.soid, + false, CEPH_OSD_FLAG_ACK, + get_osdmap()->get_epoch(), tid, + recovery_info.version); subop->set_priority(prio); subop->ops = vector(1); subop->ops[0].op.op = CEPH_OSD_OP_PULL; @@ -7644,7 +7682,7 @@ int ReplicatedBackend::send_pull_legacy(int prio, int peer, subop->recovery_progress = progress; get_parent()->send_message_osd_cluster( - peer, subop, get_osdmap()->get_epoch()); + peer.osd, subop, get_osdmap()->get_epoch()); get_parent()->get_logger()->inc(l_osd_pull); return 0; @@ -7740,7 +7778,7 @@ ObjectRecoveryInfo ReplicatedBackend::recalc_subsets( } bool ReplicatedBackend::handle_pull_response( - int from, PushOp &pop, PullOp *response, + pg_shard_t from, PushOp &pop, PullOp *response, list *to_continue, ObjectStore::Transaction *t ) @@ -7842,7 +7880,7 @@ struct C_OnPushCommit : public Context { }; void ReplicatedBackend::handle_push( - int from, PushOp &pop, PushReplyOp *response, + pg_shard_t from, PushOp &pop, PushReplyOp *response, ObjectStore::Transaction *t) { dout(10) << "handle_push " @@ -7875,13 +7913,13 @@ void ReplicatedBackend::handle_push( t); } -void ReplicatedBackend::send_pushes(int prio, map > &pushes) +void ReplicatedBackend::send_pushes(int prio, map > &pushes) { - for (map >::iterator i = pushes.begin(); + for (map >::iterator i = pushes.begin(); i != pushes.end(); ++i) { ConnectionRef con = get_parent()->get_con_osd_cluster( - i->first, + i->first.osd, get_osdmap()->get_epoch()); if (!con) continue; @@ -7899,7 +7937,8 @@ void ReplicatedBackend::send_pushes(int prio, map > &pushes) uint64_t cost = 0; uint64_t pushes = 0; MOSDPGPush *msg = new MOSDPGPush(); - msg->pgid = get_info().pgid; + msg->from = get_parent()->whoami_shard(); + 
msg->pgid = get_parent()->primary_spg_t(); msg->map_epoch = get_osdmap()->get_epoch(); msg->set_priority(prio); for (; @@ -7920,13 +7959,13 @@ void ReplicatedBackend::send_pushes(int prio, map > &pushes) } } -void ReplicatedBackend::send_pulls(int prio, map > &pulls) +void ReplicatedBackend::send_pulls(int prio, map > &pulls) { - for (map >::iterator i = pulls.begin(); + for (map >::iterator i = pulls.begin(); i != pulls.end(); ++i) { ConnectionRef con = get_parent()->get_con_osd_cluster( - i->first, + i->first.osd, get_osdmap()->get_epoch()); if (!con) continue; @@ -7946,8 +7985,9 @@ void ReplicatedBackend::send_pulls(int prio, map > &pulls) dout(20) << __func__ << ": sending pulls " << i->second << " to osd." << i->first << dendl; MOSDPGPull *msg = new MOSDPGPull(); + msg->from = parent->whoami_shard(); msg->set_priority(prio); - msg->pgid = get_info().pgid; + msg->pgid = get_parent()->primary_spg_t(); msg->map_epoch = get_osdmap()->get_epoch(); msg->pulls.swap(i->second); msg->compute_cost(cct); @@ -8069,13 +8109,15 @@ int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info, return 0; } -int ReplicatedBackend::send_push_op_legacy(int prio, int peer, PushOp &pop) +int ReplicatedBackend::send_push_op_legacy(int prio, pg_shard_t peer, PushOp &pop) { tid_t tid = get_parent()->get_tid(); osd_reqid_t rid(get_parent()->get_cluster_msgr_name(), 0, tid); - MOSDSubOp *subop = new MOSDSubOp(rid, get_info().pgid, pop.soid, - false, 0, get_osdmap()->get_epoch(), - tid, pop.recovery_info.version); + MOSDSubOp *subop = new MOSDSubOp( + rid, parent->whoami_shard(), + spg_t(get_info().pgid.pgid, peer.shard), pop.soid, + false, 0, get_osdmap()->get_epoch(), + tid, pop.recovery_info.version); subop->ops = vector(1); subop->ops[0].op.op = CEPH_OSD_OP_PUSH; @@ -8090,7 +8132,7 @@ int ReplicatedBackend::send_push_op_legacy(int prio, int peer, PushOp &pop) subop->current_progress = pop.before_progress; subop->recovery_progress = pop.after_progress; - 
get_parent()->send_message_osd_cluster(peer, subop, get_osdmap()->get_epoch()); + get_parent()->send_message_osd_cluster(peer.osd, subop, get_osdmap()->get_epoch()); return 0; } @@ -8107,7 +8149,7 @@ void ReplicatedBackend::sub_op_push_reply(OpRequestRef op) const hobject_t& soid = reply->get_poid(); assert(reply->get_header().type == MSG_OSD_SUBOPREPLY); dout(10) << "sub_op_push_reply from " << reply->get_source() << " " << *reply << dendl; - int peer = reply->get_source().num(); + pg_shard_t peer = reply->from; op->mark_started(); @@ -8119,7 +8161,7 @@ void ReplicatedBackend::sub_op_push_reply(OpRequestRef op) send_push_op_legacy(op->get_req()->get_priority(), peer, pop); } -bool ReplicatedBackend::handle_push_reply(int peer, PushReplyOp &op, PushOp *reply) +bool ReplicatedBackend::handle_push_reply(pg_shard_t peer, PushReplyOp &op, PushOp *reply) { const hobject_t &soid = op.soid; if (pushing.count(soid) == 0) { @@ -8217,16 +8259,16 @@ void ReplicatedBackend::sub_op_pull(OpRequestRef op) pop.recovery_progress = m->recovery_progress; PushOp reply; - handle_pull(m->get_source().num(), pop, &reply); + handle_pull(m->from, pop, &reply); send_push_op_legacy( m->get_priority(), - m->get_source().num(), + m->from, reply); log_subop_stats(get_parent()->get_logger(), op, 0, l_osd_sop_pull_lat); } -void ReplicatedBackend::handle_pull(int peer, PullOp &op, PushOp *reply) +void ReplicatedBackend::handle_pull(pg_shard_t peer, PullOp &op, PushOp *reply) { const hobject_t &soid = op.soid; struct stat st; @@ -8267,10 +8309,13 @@ void ReplicatedPG::_committed_pushed_object( if (!is_primary()) { // Either we are a replica or backfill target. // we are fully up to date. tell the primary! 
- osd->send_message_osd_cluster(get_primary(), - new MOSDPGTrim(get_osdmap()->get_epoch(), info.pgid, - last_complete_ondisk), - get_osdmap()->get_epoch()); + osd->send_message_osd_cluster( + get_primary().osd, + new MOSDPGTrim( + get_osdmap()->get_epoch(), + spg_t(info.pgid.pgid, primary.shard), + last_complete_ondisk), + get_osdmap()->get_epoch()); } else { // we are the primary. tell replicas to trim? if (calc_min_last_complete_ondisk()) @@ -8412,12 +8457,12 @@ void ReplicatedBackend::sub_op_push(OpRequestRef op) RPGHandle *h = _open_recovery_op(); list to_continue; bool more = handle_pull_response( - m->get_source().num(), pop, &resp, + m->from, pop, &resp, &to_continue, t); if (more) { send_pull_legacy( m->get_priority(), - m->get_source().num(), + m->from, resp.recovery_info, resp.recovery_progress); } else { @@ -8435,10 +8480,11 @@ void ReplicatedBackend::sub_op_push(OpRequestRef op) } else { PushReplyOp resp; MOSDSubOpReply *reply = new MOSDSubOpReply( - m, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK); + m, parent->whoami_shard(), 0, + get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK); reply->set_priority(m->get_priority()); assert(entity_name_t::TYPE_OSD == m->get_connection()->peer_type); - handle_push(m->get_source().num(), pop, &resp, t); + handle_push(m->from, pop, &resp, t); t->register_on_complete(new PG_SendMessageOnConn( get_parent(), reply, m->get_connection())); } @@ -8448,26 +8494,26 @@ void ReplicatedBackend::sub_op_push(OpRequestRef op) return; } -void ReplicatedPG::failed_push(int from, const hobject_t &soid) +void ReplicatedPG::failed_push(pg_shard_t from, const hobject_t &soid) { assert(recovering.count(soid)); recovering.erase(soid); - map >::iterator p = missing_loc.find(soid); + map >::iterator p = missing_loc.find(soid); if (p != missing_loc.end()) { - dout(0) << "_failed_push " << soid << " from osd." 
<< from + dout(0) << "_failed_push " << soid << " from shard " << from << ", reps on " << p->second << dendl; p->second.erase(from); // forget about this (bad) peer replica if (p->second.empty()) missing_loc.erase(p); } else { - dout(0) << "_failed_push " << soid << " from osd." << from + dout(0) << "_failed_push " << soid << " from shard " << from << " but not in missing_loc ???" << dendl; } finish_recovery_op(soid); // close out this attempt, } -void ReplicatedBackend::_failed_push(int from, const hobject_t &soid) +void ReplicatedBackend::_failed_push(pg_shard_t from, const hobject_t &soid) { get_parent()->failed_push(from, soid); pull_from_peer[from].erase(soid); @@ -8500,8 +8546,11 @@ eversion_t ReplicatedPG::pick_newest_available(const hobject_t& oid) dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl; assert(actingbackfill.size() > 0); - for (unsigned i=1; i::iterator i = actingbackfill.begin(); + i != actingbackfill.end(); + ++i) { + if (*i == get_primary()) continue; + pg_shard_t peer = *i; if (!peer_missing[peer].is_missing(oid)) { assert(is_backfill_targets(peer)); continue; @@ -8781,7 +8830,7 @@ void ReplicatedPG::on_shutdown() osd->local_reserver.cancel_reservation(info.pgid); clear_primary_state(); - osd->remove_want_pg_temp(info.pgid); + osd->remove_want_pg_temp(info.pgid.pgid); cancel_recovery(); } @@ -8793,9 +8842,11 @@ void ReplicatedPG::on_activate() assert(!last_backfill_started.is_max()); dout(5) << "on activate: bft=" << backfill_targets << " from " << last_backfill_started << dendl; - for (unsigned i = 0; i < backfill_targets.size(); ++i) { - dout(5) << "target osd." 
<< backfill_targets[i] - << " from " << peer_info[backfill_targets[i]].last_backfill + for (set::iterator i = backfill_targets.begin(); + i != backfill_targets.end(); + ++i) { + dout(5) << "target shard " << *i + << " from " << peer_info[*i].last_backfill << dendl; } } @@ -8930,11 +8981,11 @@ void ReplicatedPG::check_recovery_sources(const OSDMapRef osdmap) * check that any peers we are planning to (or currently) pulling * objects from are dealt with. */ - set now_down; - for (set::iterator p = missing_loc_sources.begin(); + set now_down; + for (set::iterator p = missing_loc_sources.begin(); p != missing_loc_sources.end(); ) { - if (osdmap->is_up(*p)) { + if (osdmap->is_up(p->osd)) { ++p; continue; } @@ -8951,9 +9002,9 @@ void ReplicatedPG::check_recovery_sources(const OSDMapRef osdmap) << missing_loc_sources << dendl; // filter missing_loc - map >::iterator p = missing_loc.begin(); + map >::iterator p = missing_loc.begin(); while (p != missing_loc.end()) { - set::iterator q = p->second.begin(); + set::iterator q = p->second.begin(); while (q != p->second.end()) if (now_down.count(*q)) { p->second.erase(q++); @@ -8968,10 +9019,10 @@ void ReplicatedPG::check_recovery_sources(const OSDMapRef osdmap) } } - for (set::iterator i = peer_log_requested.begin(); + for (set::iterator i = peer_log_requested.begin(); i != peer_log_requested.end(); ) { - if (!osdmap->is_up(*i)) { + if (!osdmap->is_up(i->osd)) { dout(10) << "peer_log_requested removing " << *i << dendl; peer_log_requested.erase(i++); } else { @@ -8979,10 +9030,10 @@ void ReplicatedPG::check_recovery_sources(const OSDMapRef osdmap) } } - for (set::iterator i = peer_missing_requested.begin(); + for (set::iterator i = peer_missing_requested.begin(); i != peer_missing_requested.end(); ) { - if (!osdmap->is_up(*i)) { + if (!osdmap->is_up(i->osd)) { dout(10) << "peer_missing_requested removing " << *i << dendl; peer_missing_requested.erase(i++); } else { @@ -9238,8 +9289,10 @@ int ReplicatedPG::recover_primary(int 
max, ThreadPool::TPHandle &handle) eversion_t alternate_need = latest->reverting_to; dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl; - set& loc = missing_loc[soid]; - for (map::iterator p = peer_missing.begin(); p != peer_missing.end(); ++p) + set& loc = missing_loc[soid]; + for (map::iterator p = peer_missing.begin(); + p != peer_missing.end(); + ++p) if (p->second.is_missing(soid, need) && p->second.missing[soid].have == alternate_need) { missing_loc_sources.insert(p->first); @@ -9300,13 +9353,16 @@ int ReplicatedPG::prep_object_replica_pushes( pg_log.missing_add(soid, v, eversion_t()); bool uhoh = true; assert(actingbackfill.size() > 0); - for (unsigned i=1; i::iterator i = actingbackfill.begin(); + i != actingbackfill.end(); + ++i) { + if (*i == get_primary()) continue; + pg_shard_t peer = *i; if (!peer_missing[peer].is_missing(soid, v)) { missing_loc[soid].insert(peer); missing_loc_sources.insert(peer); dout(10) << info.pgid << " unexpectedly missing " << soid << " v" << v - << ", there should be a copy on osd." << peer << dendl; + << ", there should be a copy on shard " << peer << dendl; uhoh = false; } } @@ -9345,12 +9401,16 @@ int ReplicatedBackend::start_pushes( { int pushes = 0; // who needs it? 
- assert(get_parent()->get_actingbackfill().size() > 0); - for (unsigned i=1; iget_actingbackfill().size(); i++) { - int peer = get_parent()->get_actingbackfill()[i]; - map::const_iterator j = - get_parent()->get_peer_missing().find(peer); - assert(j != get_parent()->get_peer_missing().end()); + assert(get_parent()->get_actingbackfill_shards().size() > 0); + for (set::iterator i = + get_parent()->get_actingbackfill_shards().begin(); + i != get_parent()->get_actingbackfill_shards().end(); + ++i) { + if (*i == get_parent()->whoami_shard()) continue; + pg_shard_t peer = *i; + map::const_iterator j = + get_parent()->get_shard_missing().find(peer); + assert(j != get_parent()->get_shard_missing().end()); if (j->second.is_missing(soid)) { ++pushes; h->pushes[peer].push_back(PushOp()); @@ -9371,11 +9431,14 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle) // this is FAR from an optimal recovery order. pretty lame, really. assert(actingbackfill.size() > 0); - for (unsigned i=1; i::const_iterator pm = peer_missing.find(peer); + for (set::iterator i = actingbackfill.begin(); + i != actingbackfill.end(); + ++i) { + if (*i == get_primary()) continue; + pg_shard_t peer = *i; + map::const_iterator pm = peer_missing.find(peer); assert(pm != peer_missing.end()); - map::const_iterator pi = peer_info.find(peer); + map::const_iterator pi = peer_info.find(peer); assert(pi != peer_info.end()); size_t m_sz = pm->second.num_missing(); @@ -9426,9 +9489,11 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle) hobject_t ReplicatedPG::earliest_peer_backfill() const { hobject_t e = hobject_t::get_max(); - for (unsigned i = 0; i < backfill_targets.size(); ++i) { - int peer = backfill_targets[i]; - map::const_iterator iter = + for (set::const_iterator i = backfill_targets.begin(); + i != backfill_targets.end(); + ++i) { + pg_shard_t peer = *i; + map::const_iterator iter = peer_backfill_info.find(peer); assert(iter != peer_backfill_info.end()); if 
(iter->second.begin < e) @@ -9442,9 +9507,11 @@ bool ReplicatedPG::all_peer_done() const // Primary hasn't got any more objects assert(backfill_info.empty()); - for (unsigned i = 0; i < backfill_targets.size(); ++i) { - int bt = backfill_targets[i]; - map::const_iterator piter = + for (set::const_iterator i = backfill_targets.begin(); + i != backfill_targets.end(); + ++i) { + pg_shard_t bt = *i; + map::const_iterator piter = peer_backfill_info.find(bt); assert(piter != peer_backfill_info.end()); const BackfillInterval& pbi = piter->second; @@ -9497,20 +9564,22 @@ int ReplicatedPG::recover_backfill( // on_activate() was called prior to getting here assert(last_backfill_started == earliest_backfill()); new_backfill = false; - for (unsigned i = 0; i < backfill_targets.size(); ++i) { - int bt = backfill_targets[i]; - peer_backfill_info[bt].reset(peer_info[bt].last_backfill); + for (set::iterator i = backfill_targets.begin(); + i != backfill_targets.end(); + ++i) { + peer_backfill_info[*i].reset(peer_info[*i].last_backfill); } backfill_info.reset(last_backfill_started); } - for (unsigned i = 0; i < backfill_targets.size(); ++i) { - int bt = backfill_targets[i]; - dout(10) << "peer osd." << bt - << " info " << peer_info[bt] - << " interval " << peer_backfill_info[bt].begin - << "-" << peer_backfill_info[bt].end - << " " << peer_backfill_info[bt].objects.size() << " objects" + for (set::iterator i = backfill_targets.begin(); + i != backfill_targets.end(); + ++i) { + dout(10) << "peer osd." 
<< *i + << " info " << peer_info[*i] + << " interval " << peer_backfill_info[*i].begin + << "-" << peer_backfill_info[*i].end + << " " << peer_backfill_info[*i].objects.size() << " objects" << dendl; } @@ -9520,13 +9589,14 @@ int ReplicatedPG::recover_backfill( int ops = 0; vector > > to_push; - vector > to_remove; + ObjectContextRef, vector > > to_push; + vector > to_remove; set add_to_stat; - for (unsigned i = 0; i < backfill_targets.size(); ++i) { - int bt = backfill_targets[i]; - peer_backfill_info[bt].trim_to(last_backfill_started); + for (set::iterator i = backfill_targets.begin(); + i != backfill_targets.end(); + ++i) { + peer_backfill_info[*i].trim_to(last_backfill_started); } backfill_info.trim_to(last_backfill_started); @@ -9549,19 +9619,23 @@ int ReplicatedPG::recover_backfill( << dendl; bool sent_scan = false; - for (unsigned i = 0; i < backfill_targets.size(); ++i) { - int bt = backfill_targets[i]; + for (set::iterator i = backfill_targets.begin(); + i != backfill_targets.end(); + ++i) { + pg_shard_t bt = *i; BackfillInterval& pbi = peer_backfill_info[bt]; - dout(20) << " peer osd." << bt << " backfill " << pbi.begin << "-" + dout(20) << " peer shard " << bt << " backfill " << pbi.begin << "-" << pbi.end << " " << pbi.objects << dendl; if (pbi.begin <= backfill_info.begin && !pbi.extends_to_end() && pbi.empty()) { dout(10) << " scanning peer osd." 
<< bt << " from " << pbi.end << dendl; epoch_t e = get_osdmap()->get_epoch(); - MOSDPGScan *m = new MOSDPGScan(MOSDPGScan::OP_SCAN_GET_DIGEST, e, e, info.pgid, - pbi.end, hobject_t()); - osd->send_message_osd_cluster(bt, m, get_osdmap()->get_epoch()); + MOSDPGScan *m = new MOSDPGScan( + MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, e, + spg_t(info.pgid.pgid, bt.shard), + pbi.end, hobject_t()); + osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch()); assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end()); waiting_on_backfill.insert(bt); sent_scan = true; @@ -9586,19 +9660,23 @@ int ReplicatedPG::recover_backfill( if (check < backfill_info.begin) { - vector check_targets; - for (unsigned i = 0; i < backfill_targets.size(); ++i) { - int bt = backfill_targets[i]; + set check_targets; + for (set::iterator i = backfill_targets.begin(); + i != backfill_targets.end(); + ++i) { + pg_shard_t bt = *i; BackfillInterval& pbi = peer_backfill_info[bt]; if (pbi.begin == check) - check_targets.push_back(bt); + check_targets.insert(bt); } assert(!check_targets.empty()); dout(20) << " BACKFILL removing " << check << " from peers " << check_targets << dendl; - for (unsigned i = 0; i < check_targets.size(); ++i) { - int bt = check_targets[i]; + for (set::iterator i = check_targets.begin(); + i != check_targets.end(); + ++i) { + pg_shard_t bt = *i; BackfillInterval& pbi = peer_backfill_info[bt]; assert(pbi.begin == check); @@ -9613,9 +9691,11 @@ int ReplicatedPG::recover_backfill( } else { eversion_t& obj_v = backfill_info.objects.begin()->second; - vector need_ver_targs, missing_targs, keep_ver_targs, skip_targs; - for (unsigned i = 0; i < backfill_targets.size(); ++i) { - int bt = backfill_targets[i]; + vector need_ver_targs, missing_targs, keep_ver_targs, skip_targs; + for (set::iterator i = backfill_targets.begin(); + i != backfill_targets.end(); + ++i) { + pg_shard_t bt = *i; BackfillInterval& pbi = peer_backfill_info[bt]; // Find all check peers that have 
the wrong version if (check == backfill_info.begin && check == pbi.begin) { @@ -9658,11 +9738,11 @@ int ReplicatedPG::recover_backfill( << " with ver " << obj_v << " to peers " << missing_targs << dendl; } - vector all_push = need_ver_targs; + vector all_push = need_ver_targs; all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end()); to_push.push_back( - boost::tuple > + boost::tuple > (backfill_info.begin, obj_v, obc, all_push)); // Count all simultaneous pushes of the same object as a single op ops++; @@ -9682,10 +9762,12 @@ int ReplicatedPG::recover_backfill( last_backfill_started = backfill_info.begin; add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes? backfill_info.pop_front(); - vector check_targets = need_ver_targs; + vector check_targets = need_ver_targs; check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end()); - for (unsigned i = 0; i < check_targets.size(); ++i) { - int bt = check_targets[i]; + for (vector::iterator i = check_targets.begin(); + i != check_targets.end(); + ++i) { + pg_shard_t bt = *i; BackfillInterval& pbi = peer_backfill_info[bt]; pbi.pop_front(); } @@ -9734,8 +9816,10 @@ int ReplicatedPG::recover_backfill( i->first < next_backfill_to_complete; pending_backfill_updates.erase(i++)) { assert(i->first > new_last_backfill); - for (unsigned j = 0; j < backfill_targets.size(); ++j) { - int bt = backfill_targets[j]; + for (set::iterator j = backfill_targets.begin(); + j != backfill_targets.end(); + ++j) { + pg_shard_t bt = *j; pg_info_t& pinfo = peer_info[bt]; //Add stats to all peers that were missing object if (i->first > pinfo.last_backfill) @@ -9768,8 +9852,10 @@ int ReplicatedPG::recover_backfill( // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to // all the backfill targets. Otherwise, we will move last_backfill up on // those targets need it and send OP_BACKFILL_PROGRESS to them. 
- for (unsigned i = 0; i < backfill_targets.size(); ++i) { - int bt = backfill_targets[i]; + for (set::iterator i = backfill_targets.begin(); + i != backfill_targets.end(); + ++i) { + pg_shard_t bt = *i; pg_info_t& pinfo = peer_info[bt]; if (new_last_backfill > pinfo.last_backfill) { @@ -9777,7 +9863,11 @@ int ReplicatedPG::recover_backfill( epoch_t e = get_osdmap()->get_epoch(); MOSDPGBackfill *m = NULL; if (pinfo.last_backfill.is_max()) { - m = new MOSDPGBackfill(MOSDPGBackfill::OP_BACKFILL_FINISH, e, e, info.pgid); + m = new MOSDPGBackfill( + MOSDPGBackfill::OP_BACKFILL_FINISH, + e, + e, + spg_t(info.pgid.pgid, bt.shard)); // Use default priority here, must match sub_op priority /* pinfo.stats might be wrong if we did log-based recovery on the * backfilled portion in addition to continuing backfill. @@ -9785,13 +9875,17 @@ int ReplicatedPG::recover_backfill( pinfo.stats = info.stats; start_recovery_op(hobject_t::get_max()); } else { - m = new MOSDPGBackfill(MOSDPGBackfill::OP_BACKFILL_PROGRESS, e, e, info.pgid); + m = new MOSDPGBackfill( + MOSDPGBackfill::OP_BACKFILL_PROGRESS, + e, + e, + spg_t(info.pgid.pgid, bt.shard)); // Use default priority here, must match sub_op priority } m->last_backfill = pinfo.last_backfill; m->stats = pinfo.stats; - osd->send_message_osd_cluster(bt, m, get_osdmap()->get_epoch()); - dout(10) << " peer osd." 
<< bt + osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch()); + dout(10) << " peer " << bt << " num_objects now " << pinfo.stats.stats.sum.num_objects << " / " << info.stats.stats.sum.num_objects << dendl; } @@ -9805,7 +9899,7 @@ int ReplicatedPG::recover_backfill( void ReplicatedPG::prep_backfill_object_push( hobject_t oid, eversion_t v, ObjectContextRef obc, - vector peers, + vector peers, PGBackend::RecoveryHandle *h) { dout(10) << "push_backfill_object " << oid << " v " << v << " to peers " << peers << dendl; @@ -9813,7 +9907,7 @@ void ReplicatedPG::prep_backfill_object_push( backfills_in_flight.insert(oid); for (unsigned int i = 0 ; i < peers.size(); ++i) { - map::iterator bpm = peer_missing.find(peers[i]); + map::iterator bpm = peer_missing.find(peers[i]); assert(bpm != peer_missing.end()); bpm->second.add(oid, eversion_t(), eversion_t()); } @@ -10284,8 +10378,9 @@ void ReplicatedPG::agent_setup() // choose random starting position agent_state->position = hobject_t(); agent_state->position.pool = info.pgid.pool(); - agent_state->position.hash = pool.info.get_random_pg_position(info.pgid, - rand()); + agent_state->position.hash = pool.info.get_random_pg_position( + info.pgid.pgid, + rand()); dout(10) << __func__ << " allocated new state, position " << agent_state->position << dendl; @@ -10546,7 +10641,7 @@ void ReplicatedPG::agent_stop() void ReplicatedPG::agent_choose_mode() { - uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid); + uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid); // get dirty, full ratios uint64_t dirty_micro = 0; diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 2a0e1080b96..1acbf9ccbad 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -225,17 +225,17 @@ public: ObjectStore::Transaction *t ); void on_peer_recover( - int peer, + pg_shard_t peer, const hobject_t &oid, const ObjectRecoveryInfo &recovery_info, const object_stat_sum_t &stat ); void begin_peer_recover( - int 
peer, + pg_shard_t peer, const hobject_t oid); void on_global_recover( const hobject_t &oid); - void failed_push(int from, const hobject_t &soid); + void failed_push(pg_shard_t from, const hobject_t &soid); void cancel_pull(const hobject_t &soid); template @@ -288,27 +288,27 @@ public: tls.push_back(t); osd->store->queue_transaction(osr.get(), t, 0, 0, 0, op); } - epoch_t get_epoch() { + epoch_t get_epoch() const { return get_osdmap()->get_epoch(); } - const vector &get_actingbackfill() { + const set &get_actingbackfill_shards() const { return actingbackfill; } std::string gen_dbg_prefix() const { return gen_prefix(); } - const map > &get_missing_loc() { + const map > &get_missing_loc_shards() const { return missing_loc; } - const map &get_peer_missing() { + const map &get_shard_missing() const { return peer_missing; } - const map &get_peer_info() { + const map &get_shard_info() const { return peer_info; } - const pg_missing_t &get_local_missing() { + const pg_missing_t &get_local_missing() const { return pg_log.get_missing(); } - const PGLog &get_log() { + const PGLog &get_log() const { return pg_log; } bool pgb_is_primary() const { @@ -337,8 +337,10 @@ public: const eversion_t &applied_version); bool should_send_op( - int peer, + pg_shard_t peer, const hobject_t &hoid) { + if (peer == get_primary()) + return true; assert(peer_info.count(peer)); bool should_send = hoid.pool != (int64_t)info.pgid.pool() || hoid <= MAX(last_backfill_started, peer_info[peer].last_backfill); @@ -348,7 +350,7 @@ public: } void update_peer_last_complete_ondisk( - int fromosd, + pg_shard_t fromosd, eversion_t lcod) { peer_last_complete_ondisk[fromosd] = lcod; } @@ -366,8 +368,14 @@ public: void schedule_work( GenContext *c); - int whoami() const { - return osd->whoami; + pg_shard_t whoami_shard() const { + return pg_whoami; + } + spg_t primary_spg_t() const { + return spg_t(info.pgid.pgid, primary.shard); + } + pg_shard_t primary_shard() const { + return primary; } void 
send_message_osd_cluster( @@ -852,14 +860,14 @@ protected: void dump_recovery_info(Formatter *f) const { f->open_array_section("backfill_targets"); - for (vector::const_iterator p = backfill_targets.begin(); + for (set::const_iterator p = backfill_targets.begin(); p != backfill_targets.end(); ++p) - f->dump_int("osd", *p); + f->dump_stream("replica") << *p; f->close_section(); f->open_array_section("waiting_on_backfill"); - for (set::const_iterator p = waiting_on_backfill.begin(); + for (set::const_iterator p = waiting_on_backfill.begin(); p != waiting_on_backfill.end(); ++p) - f->dump_int("osd", *p); + f->dump_stream("osd") << *p; f->close_section(); f->dump_stream("last_backfill_started") << last_backfill_started; { @@ -869,9 +877,10 @@ protected: } { f->open_array_section("peer_backfill_info"); - for (map::const_iterator pbi = peer_backfill_info.begin(); + for (map::const_iterator pbi = + peer_backfill_info.begin(); pbi != peer_backfill_info.end(); ++pbi) { - f->dump_int("osd", pbi->first); + f->dump_stream("osd") << pbi->first; f->open_object_section("BackfillInterval"); pbi->second.dump(f); f->close_section(); @@ -1011,9 +1020,9 @@ protected: void prep_backfill_object_push( hobject_t oid, eversion_t v, ObjectContextRef obc, - vector peer, + vector peers, PGBackend::RecoveryHandle *h); - void send_remove_op(const hobject_t& oid, eversion_t v, int peer); + void send_remove_op(const hobject_t& oid, eversion_t v, pg_shard_t peer); struct C_OSD_OndiskWriteUnlock : public Context { @@ -1140,7 +1149,7 @@ protected: public: ReplicatedPG(OSDService *o, OSDMapRef curmap, - const PGPool &_pool, pg_t p, const hobject_t& oid, + const PGPool &_pool, spg_t p, const hobject_t& oid, const hobject_t& ioid); ~ReplicatedPG() {} @@ -1182,7 +1191,7 @@ public: return pgbackend->temp_colls(out); } void split_colls( - pg_t child, + spg_t child, int split_bits, int seed, ObjectStore::Transaction *t) { diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 
404fb492598..caae5782b64 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -481,7 +481,7 @@ ostream& operator<<(ostream& out, const pg_t &pg) const coll_t coll_t::META_COLL("meta"); -bool coll_t::is_temp(pg_t& pgid) const +bool coll_t::is_temp(spg_t& pgid) const { const char *cstr(str.c_str()); if (!pgid.parse(cstr)) @@ -494,7 +494,7 @@ bool coll_t::is_temp(pg_t& pgid) const return false; } -bool coll_t::is_pg(pg_t& pgid, snapid_t& snap) const +bool coll_t::is_pg(spg_t& pgid, snapid_t& snap) const { const char *cstr(str.c_str()); @@ -514,7 +514,7 @@ bool coll_t::is_pg(pg_t& pgid, snapid_t& snap) const return true; } -bool coll_t::is_pg_prefix(pg_t& pgid) const +bool coll_t::is_pg_prefix(spg_t& pgid) const { const char *cstr(str.c_str()); @@ -526,7 +526,7 @@ bool coll_t::is_pg_prefix(pg_t& pgid) const return true; } -bool coll_t::is_removal(uint64_t *seq, pg_t *pgid) const +bool coll_t::is_removal(uint64_t *seq, spg_t *pgid) const { if (str.substr(0, 11) != string("FORREMOVAL_")) return false; @@ -558,13 +558,13 @@ void coll_t::decode(bufferlist::iterator& bl) ::decode(struct_v, bl); switch (struct_v) { case 1: { - pg_t pgid; + spg_t pgid; snapid_t snap; ::decode(pgid, bl); ::decode(snap, bl); // infer the type - if (pgid == pg_t() && snap == 0) + if (pgid == spg_t() && snap == 0) str = "meta"; else str = pg_and_snap_to_str(pgid, snap); @@ -573,7 +573,7 @@ void coll_t::decode(bufferlist::iterator& bl) case 2: { __u8 type; - pg_t pgid; + spg_t pgid; snapid_t snap; ::decode(type, bl); @@ -1879,8 +1879,8 @@ void pg_history_t::generate_test_instances(list& o) void pg_info_t::encode(bufferlist &bl) const { - ENCODE_START(29, 26, bl); - ::encode(pgid, bl); + ENCODE_START(30, 26, bl); + ::encode(pgid.pgid, bl); ::encode(last_update, bl); ::encode(last_complete, bl); ::encode(log_tail, bl); @@ -1891,6 +1891,7 @@ void pg_info_t::encode(bufferlist &bl) const ::encode(last_epoch_started, bl); ::encode(last_user_version, bl); ::encode(hit_set, bl); + 
::encode(pgid.shard, bl); ENCODE_FINISH(bl); } @@ -1900,9 +1901,9 @@ void pg_info_t::decode(bufferlist::iterator &bl) if (struct_v < 23) { old_pg_t opgid; ::decode(opgid, bl); - pgid = opgid; + pgid.pgid = opgid; } else { - ::decode(pgid, bl); + ::decode(pgid.pgid, bl); } ::decode(last_update, bl); ::decode(last_complete, bl); @@ -1932,6 +1933,10 @@ void pg_info_t::decode(bufferlist::iterator &bl) last_user_version = last_update.version; if (struct_v >= 29) ::decode(hit_set, bl); + if (struct_v >= 30) + ::decode(pgid.shard, bl); + else + pgid.shard = ghobject_t::no_shard(); DECODE_FINISH(bl); } @@ -1970,7 +1975,7 @@ void pg_info_t::generate_test_instances(list& o) list h; pg_history_t::generate_test_instances(h); o.back()->history = *h.back(); - o.back()->pgid = pg_t(1, 2, -1); + o.back()->pgid = spg_t(pg_t(1, 2, -1), ghobject_t::no_shard()); o.back()->last_update = eversion_t(3, 4); o.back()->last_complete = eversion_t(5, 6); o.back()->last_user_version = 2; @@ -1991,24 +1996,35 @@ void pg_info_t::generate_test_instances(list& o) // -- pg_notify_t -- void pg_notify_t::encode(bufferlist &bl) const { - ENCODE_START(1, 1, bl); + ENCODE_START(2, 1, bl); ::encode(query_epoch, bl); ::encode(epoch_sent, bl); ::encode(info, bl); + ::encode(to, bl); + ::encode(from, bl); ENCODE_FINISH(bl); } void pg_notify_t::decode(bufferlist::iterator &bl) { - DECODE_START(1, bl); + DECODE_START(2, bl); ::decode(query_epoch, bl); ::decode(epoch_sent, bl); ::decode(info, bl); + if (struct_v >= 2) { + ::decode(to, bl); + ::decode(from, bl); + } else { + to = ghobject_t::NO_SHARD; + from = ghobject_t::NO_SHARD; + } DECODE_FINISH(bl); } void pg_notify_t::dump(Formatter *f) const { + f->dump_int("from", from); + f->dump_int("to", to); f->dump_stream("query_epoch") << query_epoch; f->dump_stream("epoch_sent") << epoch_sent; { @@ -2020,15 +2036,20 @@ void pg_notify_t::dump(Formatter *f) const void pg_notify_t::generate_test_instances(list& o) { - o.push_back(new pg_notify_t(1,1,pg_info_t())); - 
o.push_back(new pg_notify_t(3,10,pg_info_t())); + o.push_back(new pg_notify_t(3, ghobject_t::NO_SHARD, 1 ,1 , pg_info_t())); + o.push_back(new pg_notify_t(0, 0, 3, 10, pg_info_t())); } ostream &operator<<(ostream &lhs, const pg_notify_t &notify) { - return lhs << "(query_epoch:" << notify.query_epoch - << ", epoch_sent:" << notify.epoch_sent - << ", info:" << notify.info << ")"; + lhs << "(query_epoch:" << notify.query_epoch + << ", epoch_sent:" << notify.epoch_sent + << ", info:" << notify.info; + if (notify.from != ghobject_t::NO_SHARD || + notify.to != ghobject_t::NO_SHARD) + lhs << " " << (unsigned)notify.from + << "->" << (unsigned)notify.to; + return lhs << ")"; } // -- pg_interval_t -- @@ -2178,11 +2199,13 @@ ostream& operator<<(ostream& out, const pg_interval_t& i) void pg_query_t::encode(bufferlist &bl, uint64_t features) const { if (features & CEPH_FEATURE_QUERY_T) { - ENCODE_START(2, 2, bl); + ENCODE_START(3, 2, bl); ::encode(type, bl); ::encode(since, bl); history.encode(bl); ::encode(epoch_sent, bl); + ::encode(to, bl); + ::encode(from, bl); ENCODE_FINISH(bl); } else { ::encode(type, bl); @@ -2194,11 +2217,18 @@ void pg_query_t::encode(bufferlist &bl, uint64_t features) const { void pg_query_t::decode(bufferlist::iterator &bl) { bufferlist::iterator bl2 = bl; try { - DECODE_START(2, bl); + DECODE_START(3, bl); ::decode(type, bl); ::decode(since, bl); history.decode(bl); ::decode(epoch_sent, bl); + if (struct_v >= 3) { + ::decode(to, bl); + ::decode(from, bl); + } else { + to = ghobject_t::NO_SHARD; + from = ghobject_t::NO_SHARD; + } DECODE_FINISH(bl); } catch (...) 
{ bl = bl2; @@ -2210,6 +2240,8 @@ void pg_query_t::decode(bufferlist::iterator &bl) { void pg_query_t::dump(Formatter *f) const { + f->dump_int("from", from); + f->dump_int("to", to); f->dump_string("type", get_type_name()); f->dump_stream("since") << since; f->dump_stream("epoch_sent") << epoch_sent; @@ -2222,10 +2254,13 @@ void pg_query_t::generate_test_instances(list& o) o.push_back(new pg_query_t()); list h; pg_history_t::generate_test_instances(h); - o.push_back(new pg_query_t(pg_query_t::INFO, *h.back(), 4)); - o.push_back(new pg_query_t(pg_query_t::MISSING, *h.back(), 4)); - o.push_back(new pg_query_t(pg_query_t::LOG, eversion_t(4, 5), *h.back(), 4)); - o.push_back(new pg_query_t(pg_query_t::FULLLOG, *h.back(), 5)); + o.push_back(new pg_query_t(pg_query_t::INFO, 1, 2, *h.back(), 4)); + o.push_back(new pg_query_t(pg_query_t::MISSING, 2, 3, *h.back(), 4)); + o.push_back(new pg_query_t(pg_query_t::LOG, 0, 0, + eversion_t(4, 5), *h.back(), 4)); + o.push_back(new pg_query_t(pg_query_t::FULLLOG, + ghobject_t::NO_SHARD, ghobject_t::NO_SHARD, + *h.back(), 5)); } // -- ObjectModDesc -- diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index ed3f1870b40..026d7c51c0c 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -470,15 +470,15 @@ public: : str(str_) { } - explicit coll_t(pg_t pgid, snapid_t snap = CEPH_NOSNAP) + explicit coll_t(spg_t pgid, snapid_t snap = CEPH_NOSNAP) : str(pg_and_snap_to_str(pgid, snap)) { } - static coll_t make_temp_coll(pg_t pgid) { + static coll_t make_temp_coll(spg_t pgid) { return coll_t(pg_to_tmp_str(pgid)); } - static coll_t make_removal_coll(uint64_t seq, pg_t pgid) { + static coll_t make_removal_coll(uint64_t seq, spg_t pgid) { return coll_t(seq_to_removal_str(seq, pgid)); } @@ -494,10 +494,10 @@ public: return str < rhs.str; } - bool is_pg_prefix(pg_t& pgid) const; - bool is_pg(pg_t& pgid, snapid_t& snap) const; - bool is_temp(pg_t& pgid) const; - bool is_removal(uint64_t *seq, pg_t *pgid) const; + bool 
is_pg_prefix(spg_t& pgid) const; + bool is_pg(spg_t& pgid, snapid_t& snap) const; + bool is_temp(spg_t& pgid) const; + bool is_removal(uint64_t *seq, spg_t *pgid) const; void encode(bufferlist& bl) const; void decode(bufferlist::iterator& bl); inline bool operator==(const coll_t& rhs) const { @@ -511,17 +511,17 @@ public: static void generate_test_instances(list& o); private: - static std::string pg_and_snap_to_str(pg_t p, snapid_t s) { + static std::string pg_and_snap_to_str(spg_t p, snapid_t s) { std::ostringstream oss; oss << p << "_" << s; return oss.str(); } - static std::string pg_to_tmp_str(pg_t p) { + static std::string pg_to_tmp_str(spg_t p) { std::ostringstream oss; oss << p << "_TEMP"; return oss.str(); } - static std::string seq_to_removal_str(uint64_t seq, pg_t pgid) { + static std::string seq_to_removal_str(uint64_t seq, spg_t pgid) { std::ostringstream oss; oss << "FORREMOVAL_" << seq << "_" << pgid; return oss.str(); @@ -1528,7 +1528,7 @@ inline ostream& operator<<(ostream& out, const pg_history_t& h) { * otherwise, we have no idea what the pg is supposed to contain. */ struct pg_info_t { - pg_t pgid; + spg_t pgid; eversion_t last_update; // last object version applied to store. eversion_t last_complete; // last version pg was complete through. epoch_t last_epoch_started;// last epoch at which this pg started on this osd @@ -1550,7 +1550,7 @@ struct pg_info_t { : last_epoch_started(0), last_user_version(0), last_backfill(hobject_t::get_max()) { } - pg_info_t(pg_t p) + pg_info_t(spg_t p) : pgid(p), last_epoch_started(0), last_user_version(0), last_backfill(hobject_t::get_max()) @@ -1564,6 +1564,11 @@ struct pg_info_t { void encode(bufferlist& bl) const; void decode(bufferlist::iterator& p); void dump(Formatter *f) const; + bool overlaps_with(const pg_info_t &oinfo) const { + return last_update > oinfo.log_tail ? 
+ oinfo.last_update >= log_tail : + last_update >= oinfo.log_tail; + } static void generate_test_instances(list& o); }; WRITE_CLASS_ENCODER(pg_info_t) @@ -1595,13 +1600,22 @@ struct pg_notify_t { epoch_t query_epoch; epoch_t epoch_sent; pg_info_t info; - pg_notify_t() : query_epoch(0), epoch_sent(0) {} - pg_notify_t(epoch_t query_epoch, - epoch_t epoch_sent, - const pg_info_t &info) + shard_id_t to; + shard_id_t from; + pg_notify_t() : + query_epoch(0), epoch_sent(0), to(ghobject_t::no_shard()), + from(ghobject_t::no_shard()) {} + pg_notify_t( + shard_id_t to, + shard_id_t from, + epoch_t query_epoch, + epoch_t epoch_sent, + const pg_info_t &info) : query_epoch(query_epoch), epoch_sent(epoch_sent), - info(info) {} + info(info), to(to), from(from) { + assert(from == info.pgid.shard); + } void encode(bufferlist &bl) const; void decode(bufferlist::iterator &p); void dump(Formatter *f) const; @@ -1679,18 +1693,32 @@ struct pg_query_t { eversion_t since; pg_history_t history; epoch_t epoch_sent; - - pg_query_t() : type(-1), epoch_sent(0) {} - pg_query_t(int t, const pg_history_t& h, - epoch_t epoch_sent) - : type(t), history(h), - epoch_sent(epoch_sent) { + shard_id_t to; + shard_id_t from; + + pg_query_t() : type(-1), epoch_sent(0), to(ghobject_t::NO_SHARD), + from(ghobject_t::NO_SHARD) {} + pg_query_t( + int t, + shard_id_t to, + shard_id_t from, + const pg_history_t& h, + epoch_t epoch_sent) + : type(t), + history(h), + epoch_sent(epoch_sent), + to(to), from(from) { assert(t != LOG); } - pg_query_t(int t, eversion_t s, const pg_history_t& h, - epoch_t epoch_sent) + pg_query_t( + int t, + shard_id_t to, + shard_id_t from, + eversion_t s, + const pg_history_t& h, + epoch_t epoch_sent) : type(t), since(s), history(h), - epoch_sent(epoch_sent) { + epoch_sent(epoch_sent), to(to), from(from) { assert(t == LOG); } diff --git a/src/test/osd/TestPGLog.cc b/src/test/osd/TestPGLog.cc index c62954b86a4..bc5b4e40fd1 100644 --- a/src/test/osd/TestPGLog.cc +++ 
b/src/test/osd/TestPGLog.cc @@ -670,7 +670,7 @@ TEST_F(PGLogTest, merge_log) { ObjectStore::Transaction t; pg_log_t olog; pg_info_t oinfo; - int fromosd = -1; + pg_shard_t fromosd; pg_info_t info; list remove_snap; bool dirty_info = false; @@ -718,7 +718,7 @@ TEST_F(PGLogTest, merge_log) { ObjectStore::Transaction t; pg_log_t olog; pg_info_t oinfo; - int fromosd = -1; + pg_shard_t fromosd; pg_info_t info; list remove_snap; bool dirty_info = false; @@ -805,7 +805,7 @@ TEST_F(PGLogTest, merge_log) { ObjectStore::Transaction t; pg_log_t olog; pg_info_t oinfo; - int fromosd = -1; + pg_shard_t fromosd; pg_info_t info; list remove_snap; bool dirty_info = false; @@ -900,7 +900,7 @@ TEST_F(PGLogTest, merge_log) { ObjectStore::Transaction t; pg_log_t olog; pg_info_t oinfo; - int fromosd = -1; + pg_shard_t fromosd; pg_info_t info; list remove_snap; bool dirty_info = false; @@ -1015,7 +1015,7 @@ TEST_F(PGLogTest, merge_log) { ObjectStore::Transaction t; pg_log_t olog; pg_info_t oinfo; - int fromosd = -1; + pg_shard_t fromosd; pg_info_t info; list remove_snap; bool dirty_info = false; @@ -1088,7 +1088,7 @@ TEST_F(PGLogTest, merge_log) { ObjectStore::Transaction t; pg_log_t olog; pg_info_t oinfo; - int fromosd = -1; + pg_shard_t fromosd; pg_info_t info; list remove_snap; bool dirty_info = false; @@ -1129,7 +1129,7 @@ TEST_F(PGLogTest, merge_log) { ObjectStore::Transaction t; pg_log_t olog; pg_info_t oinfo; - int fromosd = -1; + pg_shard_t fromosd; pg_info_t info; list remove_snap; bool dirty_info = false; @@ -1176,7 +1176,7 @@ TEST_F(PGLogTest, proc_replica_log) { pg_log_t olog; pg_info_t oinfo; pg_missing_t omissing; - int from = -1; + pg_shard_t from; eversion_t last_update(1, 1); oinfo.last_update = last_update; @@ -1228,7 +1228,7 @@ TEST_F(PGLogTest, proc_replica_log) { pg_log_t olog; pg_info_t oinfo; pg_missing_t omissing; - int from = -1; + pg_shard_t from; { pg_log_entry_t e; @@ -1279,7 +1279,7 @@ TEST_F(PGLogTest, proc_replica_log) { pg_log_t olog; pg_info_t oinfo; 
pg_missing_t omissing; - int from = -1; + pg_shard_t from; hobject_t divergent_object; @@ -1408,7 +1408,7 @@ TEST_F(PGLogTest, proc_replica_log) { pg_log_t olog; pg_info_t oinfo; pg_missing_t omissing; - int from = -1; + pg_shard_t from; eversion_t last_update(1, 2); @@ -1491,7 +1491,7 @@ TEST_F(PGLogTest, proc_replica_log) { pg_log_t olog; pg_info_t oinfo; pg_missing_t omissing; - int from = -1; + pg_shard_t from; eversion_t last_update(1, 2); hobject_t divergent_object; @@ -1579,7 +1579,7 @@ TEST_F(PGLogTest, proc_replica_log) { pg_log_t olog; pg_info_t oinfo; pg_missing_t omissing; - int from = -1; + pg_shard_t from; eversion_t last_update(1, 2); hobject_t divergent_object; diff --git a/src/tools/ceph-filestore-dump.cc b/src/tools/ceph-filestore-dump.cc index 60f48095008..6c8b309a012 100644 --- a/src/tools/ceph-filestore-dump.cc +++ b/src/tools/ceph-filestore-dump.cc @@ -434,7 +434,7 @@ int finish_remove_pgs(ObjectStore *store, uint64_t *next_removal_seq) for (vector::iterator it = ls.begin(); it != ls.end(); ++it) { - pg_t pgid; + spg_t pgid; snapid_t snap; if (it->is_temp(pgid)) { @@ -467,12 +467,13 @@ int initiate_new_remove_pg(ObjectStore *store, pg_t r_pgid, { ObjectStore::Transaction *rmt = new ObjectStore::Transaction; - if (store->collection_exists(coll_t(r_pgid))) { + if (store->collection_exists(coll_t(spg_t(r_pgid, ghobject_t::no_shard())))) { coll_t to_remove = coll_t::make_removal_coll((*next_removal_seq)++, - r_pgid); - cout << "collection rename " << coll_t(r_pgid) << " to " << to_remove + spg_t(r_pgid, ghobject_t::no_shard())); + cout << "collection rename " << coll_t(spg_t(r_pgid, ghobject_t::no_shard())) + << " to " << to_remove << std::endl; - rmt->collection_rename(coll_t(r_pgid), to_remove); + rmt->collection_rename(coll_t(spg_t(r_pgid, ghobject_t::no_shard())), to_remove); } else { delete rmt; return ENOENT; @@ -992,11 +993,11 @@ int do_import(ObjectStore *store, OSDSuperblock sb) return 1; } - log_oid = OSD::make_pg_log_oid(pgid); - 
biginfo_oid = OSD::make_pg_biginfo_oid(pgid); + log_oid = OSD::make_pg_log_oid(spg_t(pgid, ghobject_t::no_shard())); + biginfo_oid = OSD::make_pg_biginfo_oid(spg_t(pgid, ghobject_t::no_shard())); //Check for PG already present. - coll_t coll(pgid); + coll_t coll(spg_t(pgid, ghobject_t::no_shard())); if (store->collection_exists(coll)) { cout << "pgid " << pgid << " already exists" << std::endl; return 1; @@ -1004,7 +1005,8 @@ int do_import(ObjectStore *store, OSDSuperblock sb) //Switch to collection which will be removed automatically if //this program is interupted. - coll_t rmcoll = coll_t::make_removal_coll(next_removal_seq, pgid); + coll_t rmcoll = coll_t::make_removal_coll( + next_removal_seq, spg_t(pgid, ghobject_t::no_shard())); ObjectStore::Transaction *t = new ObjectStore::Transaction; t->create_collection(rmcoll); store->apply_transaction(*t); @@ -1290,8 +1292,8 @@ int main(int argc, char **argv) goto out; } - log_oid = OSD::make_pg_log_oid(pgid); - biginfo_oid = OSD::make_pg_biginfo_oid(pgid); + log_oid = OSD::make_pg_log_oid(spg_t(pgid, ghobject_t::no_shard())); + biginfo_oid = OSD::make_pg_biginfo_oid(spg_t(pgid, ghobject_t::no_shard())); if (type == "remove") { uint64_t next_removal_seq = 0; //My local seq @@ -1315,13 +1317,13 @@ int main(int argc, char **argv) for (it = ls.begin(); it != ls.end(); ++it) { snapid_t snap; - pg_t tmppgid; + spg_t tmppgid; if (!it->is_pg(tmppgid, snap)) { continue; } - if (tmppgid != pgid) { + if (tmppgid.pgid != pgid) { continue; } if (snap != CEPH_NOSNAP && debug) { @@ -1344,9 +1346,10 @@ int main(int argc, char **argv) if (debug) cerr << "map_epoch " << map_epoch << std::endl; - pg_info_t info(pgid); + pg_info_t info(spg_t(pgid, ghobject_t::no_shard())); map past_intervals; - hobject_t biginfo_oid = OSD::make_pg_biginfo_oid(pgid); + hobject_t biginfo_oid = OSD::make_pg_biginfo_oid( + spg_t(pgid, ghobject_t::no_shard())); interval_set snap_collections; __u8 struct_ver; diff --git a/src/tools/ceph-filestore-tool.cc 
b/src/tools/ceph-filestore-tool.cc index e9b1351d87c..eb9f8dac36d 100644 --- a/src/tools/ceph-filestore-tool.cc +++ b/src/tools/ceph-filestore-tool.cc @@ -162,7 +162,7 @@ int main(int argc, char **argv) vector colls_to_check; if (pgidstr.length()) { - pg_t pgid; + spg_t pgid; if (!pgid.parse(pgidstr.c_str())) { cout << "Invalid pgid '" << pgidstr << "' specified" << std::endl; exit(1); @@ -178,7 +178,7 @@ int main(int argc, char **argv) for (vector::iterator i = candidates.begin(); i != candidates.end(); ++i) { - pg_t pgid; + spg_t pgid; snapid_t snap; if (i->is_pg(pgid, snap)) { colls_to_check.push_back(*i);