ceph.git commitdiff — git-server-git.apps.pok.os.sepia.ceph.com
osd/: refer to pg by spg_t and replica by pg_shard_t
author: Samuel Just <sam.just@inktank.com>
Tue, 28 Jan 2014 20:50:05 +0000 (12:50 -0800)
committer: Samuel Just <sam.just@inktank.com>
Tue, 18 Feb 2014 04:12:10 +0000 (20:12 -0800)
We may have multiple pg shards on the same osd for an
ec pool.  Thus, replicas must be referred to by <osd, shard>
and pgs by <pgid, shard>.

Signed-off-by: Samuel Just <sam.just@inktank.com>
19 files changed:
src/os/DBObjectMap.cc
src/os/LFNIndex.cc
src/osd/OSD.cc
src/osd/OSD.h
src/osd/OSDMap.h
src/osd/PG.cc
src/osd/PG.h
src/osd/PGBackend.h
src/osd/PGLog.cc
src/osd/PGLog.h
src/osd/ReplicatedBackend.cc
src/osd/ReplicatedBackend.h
src/osd/ReplicatedPG.cc
src/osd/ReplicatedPG.h
src/osd/osd_types.cc
src/osd/osd_types.h
src/test/osd/TestPGLog.cc
src/tools/ceph-filestore-dump.cc
src/tools/ceph-filestore-tool.cc

index 886658f9aa760a9305fcdfc625596ad265dadd33..f23815a42987c68710b053e37038dc680aadc94b 100644 (file)
@@ -248,9 +248,9 @@ bool DBObjectMap::parse_ghobject_key_v0(const string &in, coll_t *c,
 
   *c = coll_t(coll);
   int64_t pool = -1;
-  pg_t pg;
+  spg_t pg;
   if (c->is_pg_prefix(pg))
-    pool = (int64_t)pg.pool();
+    pool = (int64_t)pg.pgid.pool();
   (*oid) = ghobject_t(hobject_t(name, key, snap, hash, pool, ""));
   return true;
 }
index a250016e8e9dfde120e52c439e5d251ac3274a74..92ccebf024989fd1758784b2d893c1da26fdff1a 100644 (file)
@@ -950,9 +950,9 @@ bool LFNIndex::lfn_parse_object_name_keyless(const string &long_name, ghobject_t
 {
   bool r = parse_object(long_name.c_str(), *out);
   int64_t pool = -1;
-  pg_t pg;
+  spg_t pg;
   if (coll().is_pg_prefix(pg))
-    pool = (int64_t)pg.pool();
+    pool = (int64_t)pg.pgid.pool();
   out->hobj.pool = pool;
   if (!r) return r;
   string temp = lfn_generate_object_name(*out);
@@ -1043,9 +1043,9 @@ bool LFNIndex::lfn_parse_object_name_poolless(const string &long_name,
 
 
   int64_t pool = -1;
-  pg_t pg;
+  spg_t pg;
   if (coll().is_pg_prefix(pg))
-    pool = (int64_t)pg.pool();
+    pool = (int64_t)pg.pgid.pool();
   (*out) = ghobject_t(hobject_t(name, key, snap, hash, pool, ""));
   return true;
 }
index 22b1ad4215b3c56eaf47f26615c00a3981a5a079..15c38833ade5c0b0d18c0a70bd9b60881f7fe937 100644 (file)
@@ -235,9 +235,9 @@ OSDService::~OSDService()
   delete objecter;
 }
 
-void OSDService::_start_split(pg_t parent, const set<pg_t> &children)
+void OSDService::_start_split(spg_t parent, const set<spg_t> &children)
 {
-  for (set<pg_t>::const_iterator i = children.begin();
+  for (set<spg_t>::const_iterator i = children.begin();
        i != children.end();
        ++i) {
     dout(10) << __func__ << ": Starting split on pg " << *i
@@ -251,12 +251,12 @@ void OSDService::_start_split(pg_t parent, const set<pg_t> &children)
   }
 }
 
-void OSDService::mark_split_in_progress(pg_t parent, const set<pg_t> &children)
+void OSDService::mark_split_in_progress(spg_t parent, const set<spg_t> &children)
 {
   Mutex::Locker l(in_progress_split_lock);
-  map<pg_t, set<pg_t> >::iterator piter = rev_pending_splits.find(parent);
+  map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
   assert(piter != rev_pending_splits.end());
-  for (set<pg_t>::const_iterator i = children.begin();
+  for (set<spg_t>::const_iterator i = children.begin();
        i != children.end();
        ++i) {
     assert(piter->second.count(*i));
@@ -272,19 +272,19 @@ void OSDService::mark_split_in_progress(pg_t parent, const set<pg_t> &children)
     rev_pending_splits.erase(piter);
 }
 
-void OSDService::cancel_pending_splits_for_parent(pg_t parent)
+void OSDService::cancel_pending_splits_for_parent(spg_t parent)
 {
   Mutex::Locker l(in_progress_split_lock);
   return _cancel_pending_splits_for_parent(parent);
 }
 
-void OSDService::_cancel_pending_splits_for_parent(pg_t parent)
+void OSDService::_cancel_pending_splits_for_parent(spg_t parent)
 {
-  map<pg_t, set<pg_t> >::iterator piter = rev_pending_splits.find(parent);
+  map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
   if (piter == rev_pending_splits.end())
     return;
 
-  for (set<pg_t>::iterator i = piter->second.begin();
+  for (set<spg_t>::iterator i = piter->second.begin();
        i != piter->second.end();
        ++i) {
     assert(pending_splits.count(*i));
@@ -299,11 +299,11 @@ void OSDService::_cancel_pending_splits_for_parent(pg_t parent)
 
 void OSDService::_maybe_split_pgid(OSDMapRef old_map,
                                  OSDMapRef new_map,
-                                 pg_t pgid)
+                                 spg_t pgid)
 {
   assert(old_map->have_pg_pool(pgid.pool()));
   if (pgid.ps() < static_cast<unsigned>(old_map->get_pg_num(pgid.pool()))) {
-    set<pg_t> children;
+    set<spg_t> children;
     pgid.is_split(old_map->get_pg_num(pgid.pool()),
                  new_map->get_pg_num(pgid.pool()), &children);
     _start_split(pgid, children);
@@ -312,7 +312,7 @@ void OSDService::_maybe_split_pgid(OSDMapRef old_map,
   }
 }
 
-void OSDService::init_splits_between(pg_t pgid,
+void OSDService::init_splits_between(spg_t pgid,
                                     OSDMapRef frommap,
                                     OSDMapRef tomap)
 {
@@ -323,7 +323,7 @@ void OSDService::init_splits_between(pg_t pgid,
        tomap->get_pg_num(pgid.pool()),
        NULL)) {
     // Ok, a split happened, so we need to walk the osdmaps
-    set<pg_t> new_pgs; // pgs to scan on each map
+    set<spg_t> new_pgs; // pgs to scan on each map
     new_pgs.insert(pgid);
     OSDMapRef curmap(get_map(frommap->get_epoch()));
     for (epoch_t e = frommap->get_epoch() + 1;
@@ -332,9 +332,9 @@ void OSDService::init_splits_between(pg_t pgid,
       OSDMapRef nextmap(try_get_map(e));
       if (!nextmap)
        continue;
-      set<pg_t> even_newer_pgs; // pgs added in this loop
-      for (set<pg_t>::iterator i = new_pgs.begin(); i != new_pgs.end(); ++i) {
-       set<pg_t> split_pgs;
+      set<spg_t> even_newer_pgs; // pgs added in this loop
+      for (set<spg_t>::iterator i = new_pgs.begin(); i != new_pgs.end(); ++i) {
+       set<spg_t> split_pgs;
        if (i->is_split(curmap->get_pg_num(i->pool()),
                        nextmap->get_pg_num(i->pool()),
                        &split_pgs)) {
@@ -353,7 +353,7 @@ void OSDService::expand_pg_num(OSDMapRef old_map,
                               OSDMapRef new_map)
 {
   Mutex::Locker l(in_progress_split_lock);
-  for (set<pg_t>::iterator i = in_progress_splits.begin();
+  for (set<spg_t>::iterator i = in_progress_splits.begin();
        i != in_progress_splits.end();
     ) {
     if (!new_map->have_pg_pool(i->pool())) {
@@ -363,7 +363,7 @@ void OSDService::expand_pg_num(OSDMapRef old_map,
       ++i;
     }
   }
-  for (map<pg_t, pg_t>::iterator i = pending_splits.begin();
+  for (map<spg_t, spg_t>::iterator i = pending_splits.begin();
        i != pending_splits.end();
     ) {
     if (!new_map->have_pg_pool(i->first.pool())) {
@@ -376,17 +376,17 @@ void OSDService::expand_pg_num(OSDMapRef old_map,
   }
 }
 
-bool OSDService::splitting(pg_t pgid)
+bool OSDService::splitting(spg_t pgid)
 {
   Mutex::Locker l(in_progress_split_lock);
   return in_progress_splits.count(pgid) ||
     pending_splits.count(pgid);
 }
 
-void OSDService::complete_split(const set<pg_t> &pgs)
+void OSDService::complete_split(const set<spg_t> &pgs)
 {
   Mutex::Locker l(in_progress_split_lock);
-  for (set<pg_t>::const_iterator i = pgs.begin();
+  for (set<spg_t>::const_iterator i = pgs.begin();
        i != pgs.end();
        ++i) {
     dout(10) << __func__ << ": Completing split on pg " << *i << dendl;
@@ -598,7 +598,7 @@ int OSD::do_convertfs(ObjectStore *store)
   for (vector<coll_t>::iterator i = collections.begin();
        i != collections.end();
        ++i) {
-    pg_t pgid;
+    spg_t pgid;
     if (i->is_temp(pgid))
       recursive_remove_collection(store, *i);
     else if (i->to_str() == "convertfs_temp" ||
@@ -1011,7 +1011,7 @@ bool OSD::asok_command(string command, cmdmap_t& cmdmap, string format,
     list<obj_watch_item_t> watchers;
     osd_lock.Lock();
     // scan pg's
-    for (ceph::unordered_map<pg_t,PG*>::iterator it = pg_map.begin();
+    for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
         it != pg_map.end();
         ++it) {
 
@@ -1525,7 +1525,7 @@ int OSD::shutdown()
   cct->_conf->apply_changes(NULL);
   
   // Shutdown PGs
-  for (ceph::unordered_map<pg_t, PG*>::iterator p = pg_map.begin();
+  for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
        p != pg_map.end();
        ++p) {
     dout(20) << " kicking pg " << p->first << dendl;
@@ -1624,7 +1624,7 @@ int OSD::shutdown()
 #ifdef PG_DEBUG_REFS
   service.dump_live_pgids();
 #endif
-  for (ceph::unordered_map<pg_t, PG*>::iterator p = pg_map.begin();
+  for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
        p != pg_map.end();
        ++p) {
     dout(20) << " kicking pg " << p->first << dendl;
@@ -1757,7 +1757,7 @@ PGPool OSD::_get_pool(int id, OSDMapRef createmap)
 
 PG *OSD::_open_lock_pg(
   OSDMapRef createmap,
-  pg_t pgid, bool no_lockdep_check, bool hold_map_lock)
+  spg_t pgid, bool no_lockdep_check, bool hold_map_lock)
 {
   assert(osd_lock.is_locked());
 
@@ -1772,7 +1772,7 @@ PG *OSD::_open_lock_pg(
 
 PG* OSD::_make_pg(
   OSDMapRef createmap,
-  pg_t pgid)
+  spg_t pgid)
 {
   dout(10) << "_open_lock_pg " << pgid << dendl;
   PGPool pool = _get_pool(pgid.pool(), createmap);
@@ -1781,7 +1781,7 @@ PG* OSD::_make_pg(
   PG *pg;
   hobject_t logoid = make_pg_log_oid(pgid);
   hobject_t infooid = make_pg_biginfo_oid(pgid);
-  if (createmap->get_pg_type(pgid) == pg_pool_t::TYPE_REPLICATED)
+  if (createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_REPLICATED)
     pg = new ReplicatedPG(&service, createmap, pool, pgid, logoid, infooid);
   else 
     assert(0);
@@ -1797,14 +1797,14 @@ void OSD::add_newly_split_pg(PG *pg, PG::RecoveryCtx *rctx)
   pg_map[pg->info.pgid] = pg;
   dout(10) << "Adding newly split pg " << *pg << dendl;
   vector<int> up, acting;
-  pg->get_osdmap()->pg_to_up_acting_osds(pg->info.pgid, up, acting);
+  pg->get_osdmap()->pg_to_up_acting_osds(pg->info.pgid.pgid, up, acting);
   int role = OSDMap::calc_pg_role(service.whoami, acting);
   pg->set_role(role);
   pg->reg_next_scrub();
   pg->handle_loaded(rctx);
   pg->write_if_dirty(*(rctx->transaction));
   pg->queue_null(e, e);
-  map<pg_t, list<PG::CephPeeringEvtRef> >::iterator to_wake =
+  map<spg_t, list<PG::CephPeeringEvtRef> >::iterator to_wake =
     peering_wait_for_split.find(pg->info.pgid);
   if (to_wake != peering_wait_for_split.end()) {
     for (list<PG::CephPeeringEvtRef>::iterator i =
@@ -1821,13 +1821,13 @@ void OSD::add_newly_split_pg(PG *pg, PG::RecoveryCtx *rctx)
 }
 
 OSD::res_result OSD::_try_resurrect_pg(
-  OSDMapRef curmap, pg_t pgid, pg_t *resurrected, PGRef *old_pg_state)
+  OSDMapRef curmap, spg_t pgid, spg_t *resurrected, PGRef *old_pg_state)
 {
   assert(resurrected);
   assert(old_pg_state);
   // find nearest ancestor
   DeletingStateRef df;
-  pg_t cur(pgid);
+  spg_t cur(pgid);
   while (true) {
     df = service.deleting_pgs.lookup(cur);
     if (df)
@@ -1843,7 +1843,7 @@ OSD::res_result OSD::_try_resurrect_pg(
   OSDMapRef create_map = df->old_pg_state->get_osdmap();
   df->old_pg_state->unlock();
 
-  set<pg_t> children;
+  set<spg_t> children;
   if (cur == pgid) {
     if (df->try_stop_deletion()) {
       dout(10) << __func__ << ": halted deletion on pg " << pgid << dendl;
@@ -1880,11 +1880,14 @@ OSD::res_result OSD::_try_resurrect_pg(
 
 PG *OSD::_create_lock_pg(
   OSDMapRef createmap,
-  pg_t pgid,
+  spg_t pgid,
   bool newly_created,
   bool hold_map_lock,
   bool backfill,
-  int role, vector<int>& up, vector<int>& acting, pg_history_t history,
+  int role,
+  vector<int>& up, int up_primary,
+  vector<int>& acting, int acting_primary,
+  pg_history_t history,
   pg_interval_map_t& pi,
   ObjectStore::Transaction& t)
 {
@@ -1895,20 +1898,29 @@ PG *OSD::_create_lock_pg(
 
   service.init_splits_between(pgid, pg->get_osdmap(), service.get_osdmap());
 
-  pg->init(role, up, acting, history, pi, backfill, &t);
+  pg->init(
+    role,
+    up,
+    up_primary,
+    acting,
+    acting_primary,
+    history,
+    pi,
+    backfill,
+    &t);
 
   dout(7) << "_create_lock_pg " << *pg << dendl;
   return pg;
 }
 
 
-bool OSD::_have_pg(pg_t pgid)
+bool OSD::_have_pg(spg_t pgid)
 {
   assert(osd_lock.is_locked());
   return pg_map.count(pgid);
 }
 
-PG *OSD::_lookup_lock_pg(pg_t pgid)
+PG *OSD::_lookup_lock_pg(spg_t pgid)
 {
   assert(osd_lock.is_locked());
   if (!pg_map.count(pgid))
@@ -1919,7 +1931,7 @@ PG *OSD::_lookup_lock_pg(pg_t pgid)
 }
 
 
-PG *OSD::_lookup_pg(pg_t pgid)
+PG *OSD::_lookup_pg(spg_t pgid)
 {
   assert(osd_lock.is_locked());
   if (!pg_map.count(pgid))
@@ -1928,7 +1940,7 @@ PG *OSD::_lookup_pg(pg_t pgid)
   return pg;
 }
 
-PG *OSD::_lookup_lock_pg_with_map_lock_held(pg_t pgid)
+PG *OSD::_lookup_lock_pg_with_map_lock_held(spg_t pgid)
 {
   assert(osd_lock.is_locked());
   assert(pg_map.count(pgid));
@@ -1949,12 +1961,12 @@ void OSD::load_pgs()
     derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
   }
 
-  set<pg_t> head_pgs;
-  map<pg_t, interval_set<snapid_t> > pgs;
+  set<spg_t> head_pgs;
+  map<spg_t, interval_set<snapid_t> > pgs;
   for (vector<coll_t>::iterator it = ls.begin();
        it != ls.end();
        ++it) {
-    pg_t pgid;
+    spg_t pgid;
     snapid_t snap;
     uint64_t seq;
 
@@ -1981,10 +1993,10 @@ void OSD::load_pgs()
   }
 
   bool has_upgraded = false;
-  for (map<pg_t, interval_set<snapid_t> >::iterator i = pgs.begin();
+  for (map<spg_t, interval_set<snapid_t> >::iterator i = pgs.begin();
        i != pgs.end();
        ++i) {
-    pg_t pgid(i->first);
+    spg_t pgid(i->first);
 
     if (!head_pgs.count(pgid)) {
       dout(10) << __func__ << ": " << pgid << " has orphan snap collections " << i->second
@@ -2051,7 +2063,15 @@ void OSD::load_pgs()
     pg->reg_next_scrub();
 
     // generate state for PG's current mapping
-    pg->get_osdmap()->pg_to_up_acting_osds(pgid, pg->up, pg->acting);
+    int primary, up_primary;
+    vector<int> acting, up;
+    pg->get_osdmap()->pg_to_up_acting_osds(
+      pgid.pgid, &up, &up_primary, &acting, &primary);
+    pg->init_primary_up_acting(
+      up,
+      acting,
+      up_primary,
+      primary);
     int role = OSDMap::calc_pg_role(whoami, pg->acting);
     pg->set_role(role);
 
@@ -2089,7 +2109,7 @@ void OSD::build_past_intervals_parallel()
   // calculate untion of map range
   epoch_t end_epoch = superblock.oldest_map;
   epoch_t cur_epoch = superblock.newest_map;
-  for (ceph::unordered_map<pg_t, PG*>::iterator i = pg_map.begin();
+  for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
        i != pg_map.end();
        ++i) {
     PG *pg = i->second;
@@ -2131,7 +2151,7 @@ void OSD::build_past_intervals_parallel()
        continue;
 
       vector<int> acting, up;
-      cur_map->pg_to_up_acting_osds(pg->info.pgid, up, acting);
+      cur_map->pg_to_up_acting_osds(pg->info.pgid.pgid, up, acting);
 
       if (p.same_interval_since == 0) {
        dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
@@ -2145,15 +2165,16 @@ void OSD::build_past_intervals_parallel()
       assert(last_map);
 
       std::stringstream debug;
-      bool new_interval = pg_interval_t::check_new_interval(p.old_acting, acting,
-                                                           p.old_up, up,
-                                                           p.same_interval_since,
-                                                           pg->info.history.last_epoch_clean,
-                                                           cur_map, last_map,
-                                                           pg->info.pgid.pool(),
-                                                           pg->info.pgid,
-                                                           &pg->past_intervals,
-                                                           &debug);
+      bool new_interval = pg_interval_t::check_new_interval(
+       p.old_acting, acting,
+       p.old_up, up,
+       p.same_interval_since,
+       pg->info.history.last_epoch_clean,
+       cur_map, last_map,
+       pg->info.pgid.pool(),
+       pg->info.pgid.pgid,
+       &pg->past_intervals,
+       &debug);
       if (new_interval) {
        dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
                 << " " << debug.str() << dendl;
@@ -2195,37 +2216,40 @@ void OSD::build_past_intervals_parallel()
  * hasn't changed since the given epoch and we are the primary.
  */
 void OSD::handle_pg_peering_evt(
+  spg_t pgid,
   const pg_info_t& info,
   pg_interval_map_t& pi,
   epoch_t epoch,
-  int from,
+  pg_shard_t from,
   bool primary,
   PG::CephPeeringEvtRef evt)
 {
-  if (service.splitting(info.pgid)) {
-    peering_wait_for_split[info.pgid].push_back(evt);
+  if (service.splitting(pgid)) {
+    peering_wait_for_split[pgid].push_back(evt);
     return;
   }
 
-  if (!_have_pg(info.pgid)) {
+  if (!_have_pg(pgid)) {
     // same primary?
-    if (!osdmap->have_pg_pool(info.pgid.pool()))
+    if (!osdmap->have_pg_pool(pgid.pool()))
       return;
+    int up_primary, acting_primary;
     vector<int> up, acting;
-    osdmap->pg_to_up_acting_osds(info.pgid, up, acting);
+    osdmap->pg_to_up_acting_osds(
+      pgid.pgid, &up, &up_primary, &acting, &acting_primary);
     int role = osdmap->calc_pg_role(whoami, acting, acting.size());
 
     pg_history_t history = info.history;
     bool valid_history = project_pg_history(
-      info.pgid, history, epoch, up, acting);
+      pgid, history, epoch, up, acting);
 
     if (!valid_history || epoch < history.same_interval_since) {
-      dout(10) << "get_or_create_pg " << info.pgid << " acting changed in "
+      dout(10) << "get_or_create_pg " << pgid << " acting changed in "
               << history.same_interval_since << " (msg from " << epoch << ")" << dendl;
       return;
     }
 
-    if (service.splitting(info.pgid)) {
+    if (service.splitting(pgid)) {
       assert(0);
     }
 
@@ -2234,29 +2258,29 @@ void OSD::handle_pg_peering_evt(
       // DNE on source?
       if (info.dne()) {
        // is there a creation pending on this pg?
-       if (creating_pgs.count(info.pgid)) {
-         creating_pgs[info.pgid].prior.erase(from);
-         if (!can_create_pg(info.pgid))
+       if (creating_pgs.count(pgid)) {
+         creating_pgs[pgid].prior.erase(from);
+         if (!can_create_pg(pgid))
            return;
-         history = creating_pgs[info.pgid].history;
+         history = creating_pgs[pgid].history;
          create = true;
        } else {
-         dout(10) << "get_or_create_pg " << info.pgid
+         dout(10) << "get_or_create_pg " << pgid
                   << " DNE on source, but creation probe, ignoring" << dendl;
          return;
        }
       }
-      creating_pgs.erase(info.pgid);
+      creating_pgs.erase(pgid);
     } else {
       assert(!info.dne());  // pg exists if we are hearing about it
     }
 
     // do we need to resurrect a deleting pg?
-    pg_t resurrected;
+    spg_t resurrected;
     PGRef old_pg_state;
     res_result result = _try_resurrect_pg(
       service.get_osdmap(),
-      info.pgid,
+      pgid,
       &resurrected,
       &old_pg_state);
 
@@ -2264,11 +2288,14 @@ void OSD::handle_pg_peering_evt(
     switch (result) {
     case RES_NONE: {
       // ok, create the pg locally using provided Info and History
-      rctx.transaction->create_collection(coll_t(info.pgid));
+      rctx.transaction->create_collection(coll_t(pgid));
       PG *pg = _create_lock_pg(
        get_map(epoch),
-       info.pgid, create, false, result == RES_SELF,
-       role, up, acting, history, pi,
+       pgid, create, false, result == RES_SELF,
+       role,
+       up, up_primary,
+       acting, acting_primary,
+       history, pi,
        *rctx.transaction);
       pg->handle_create(&rctx);
       pg->write_if_dirty(*rctx.transaction);
@@ -2293,7 +2320,9 @@ void OSD::handle_pg_peering_evt(
        true,
        old_pg_state->role,
        old_pg_state->up,
+       old_pg_state->up_primary.osd,
        old_pg_state->acting,
+       old_pg_state->primary.osd,
        old_pg_state->info.history,
        old_pg_state->past_intervals,
        *rctx.transaction);
@@ -2322,7 +2351,9 @@ void OSD::handle_pg_peering_evt(
        true,
        old_pg_state->role,
        old_pg_state->up,
+       old_pg_state->up_primary.osd,
        old_pg_state->acting,
+       old_pg_state->primary.osd,
        old_pg_state->info.history,
        old_pg_state->past_intervals,
        *rctx.transaction
@@ -2337,8 +2368,8 @@ void OSD::handle_pg_peering_evt(
       // kick any waiters
       wake_pg_waiters(parent->info.pgid);
 
-      assert(service.splitting(info.pgid));
-      peering_wait_for_split[info.pgid].push_back(evt);
+      assert(service.splitting(pgid));
+      peering_wait_for_split[pgid].push_back(evt);
 
       //parent->queue_peering_event(evt);
       parent->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
@@ -2348,7 +2379,7 @@ void OSD::handle_pg_peering_evt(
     }
   } else {
     // already had it.  did the mapping change?
-    PG *pg = _lookup_lock_pg(info.pgid);
+    PG *pg = _lookup_lock_pg(pgid);
     if (epoch < pg->info.history.same_interval_since) {
       dout(10) << *pg << " get_or_create_pg acting changed in "
               << pg->info.history.same_interval_since
@@ -2368,27 +2399,36 @@ void OSD::handle_pg_peering_evt(
  *  - from each epoch, include all osds up then AND now
  *  - if no osds from then are up now, include them all, even tho they're not reachable now
  */
-void OSD::calc_priors_during(pg_t pgid, epoch_t start, epoch_t end, set<int>& pset)
+void OSD::calc_priors_during(
+  spg_t pgid, epoch_t start, epoch_t end, set<pg_shard_t>& pset)
 {
-  dout(15) << "calc_priors_during " << pgid << " [" << start << "," << end << ")" << dendl;
+  dout(15) << "calc_priors_during " << pgid << " [" << start
+          << "," << end << ")" << dendl;
   
   for (epoch_t e = start; e < end; e++) {
     OSDMapRef oldmap = get_map(e);
     vector<int> acting;
-    oldmap->pg_to_acting_osds(pgid, acting);
+    oldmap->pg_to_acting_osds(pgid.pgid, acting);
     dout(20) << "  " << pgid << " in epoch " << e << " was " << acting << dendl;
     int up = 0;
     for (unsigned i=0; i<acting.size(); i++)
       if (osdmap->is_up(acting[i])) {
-       if (acting[i] != whoami)
-         pset.insert(acting[i]);
+       if (acting[i] != whoami) {
+         pset.insert(
+           pg_shard_t(
+             acting[i],
+             osdmap->pg_is_ec(pgid.pgid) ? i : ghobject_t::NO_SHARD));
+       }
        up++;
       }
     if (!up && !acting.empty()) {
       // sucky.  add down osds, even tho we can't reach them right now.
       for (unsigned i=0; i<acting.size(); i++)
        if (acting[i] != whoami)
-         pset.insert(acting[i]);
+         pset.insert(
+           pg_shard_t(
+             acting[i],
+             osdmap->pg_is_ec(pgid.pgid) ? i : ghobject_t::NO_SHARD));
     }
   }
   dout(10) << "calc_priors_during " << pgid
@@ -2401,7 +2441,7 @@ void OSD::calc_priors_during(pg_t pgid, epoch_t start, epoch_t end, set<int>& ps
  * Fill in the passed history so you know same_interval_since, same_up_since,
  * and same_primary_since.
  */
-bool OSD::project_pg_history(pg_t pgid, pg_history_t& h, epoch_t from,
+bool OSD::project_pg_history(spg_t pgid, pg_history_t& h, epoch_t from,
                             const vector<int>& currentup,
                             const vector<int>& currentacting)
 {
@@ -2423,7 +2463,7 @@ bool OSD::project_pg_history(pg_t pgid, pg_history_t& h, epoch_t from,
     assert(oldmap->have_pg_pool(pgid.pool()));
 
     vector<int> up, acting;
-    oldmap->pg_to_up_acting_osds(pgid, up, acting);
+    oldmap->pg_to_up_acting_osds(pgid.pgid, up, acting);
 
     // acting set change?
     if ((acting != currentacting || up != currentup) && e > h.same_interval_since) {
@@ -2667,7 +2707,7 @@ void OSD::maybe_update_heartbeat_peers()
 
   // build heartbeat from set
   if (is_active()) {
-    for (ceph::unordered_map<pg_t, PG*>::iterator i = pg_map.begin();
+    for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
         i != pg_map.end();
         ++i) {
       PG *pg = i->second;
@@ -3177,7 +3217,7 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
       command == "truncobj" || command == "injectmdataerr" ||
       command == "injectdataerr"
     ) {
-    pg_t rawpg, pgid;
+    pg_t rawpg;
     int64_t pool;
     OSDMapRef curmap = service->get_osdmap();
     int r;
@@ -3208,7 +3248,11 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
       ss << "Invalid namespace/objname";
       return;
     }
-    pgid = curmap->raw_pg_to_pg(rawpg);
+    if (curmap->pg_is_ec(rawpg)) {
+      ss << "Must not call on ec pool";
+      return;
+    }
+    spg_t pgid = spg_t(curmap->raw_pg_to_pg(rawpg), ghobject_t::no_shard());
 
     hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
     ObjectStore::Transaction t;
@@ -3844,7 +3888,7 @@ void OSD::send_pg_stats(const utime_t &now)
       }
       pg->pg_stats_publish_lock.Lock();
       if (pg->pg_stats_publish_valid) {
-       m->pg_stat[pg->info.pgid] = pg->pg_stats_publish;
+       m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
        dout(25) << " sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
                 << pg->pg_stats_publish.reported_seq << dendl;
       } else {
@@ -3888,8 +3932,8 @@ void OSD::handle_pg_stats_ack(MPGStatsAck *ack)
     PGRef _pg(pg);
     ++p;
 
-    if (ack->pg_stat.count(pg->info.pgid)) {
-      pair<version_t,epoch_t> acked = ack->pg_stat[pg->info.pgid];
+    if (ack->pg_stat.count(pg->info.pgid.pgid)) {
+      pair<version_t,epoch_t> acked = ack->pg_stat[pg->info.pgid.pgid];
       pg->pg_stats_publish_lock.Lock();
       if (acked.first == pg->pg_stats_publish.reported_seq &&
          acked.second == pg->pg_stats_publish.reported_epoch) {
@@ -4137,7 +4181,7 @@ void OSD::do_command(Connection *con, tid_t tid, vector<string>& cmd, bufferlist
              prefix == "mark_unfound_lost" ||
              prefix == "list_missing")
           )) {
-    pg_t pgid;
+    spg_t pgid;
 
     if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
       ss << "no pgid specified";
@@ -4235,16 +4279,16 @@ void OSD::do_command(Connection *con, tid_t tid, vector<string>& cmd, bufferlist
        goto out;
     }
 
-    std::set <pg_t> keys;
-    for (ceph::unordered_map<pg_t, PG*>::const_iterator pg_map_e = pg_map.begin();
+    std::set <spg_t> keys;
+    for (ceph::unordered_map<spg_t, PG*>::const_iterator pg_map_e = pg_map.begin();
         pg_map_e != pg_map.end(); ++pg_map_e) {
       keys.insert(pg_map_e->first);
     }
 
     fout << "*** osd " << whoami << ": dump_missing ***" << std::endl;
-    for (std::set <pg_t>::iterator p = keys.begin();
+    for (std::set <spg_t>::iterator p = keys.begin();
         p != keys.end(); ++p) {
-      ceph::unordered_map<pg_t, PG*>::iterator q = pg_map.find(*p);
+      ceph::unordered_map<spg_t, PG*>::iterator q = pg_map.find(*p);
       assert(q != pg_map.end());
       PG *pg = q->second;
       pg->lock();
@@ -4256,11 +4300,11 @@ void OSD::do_command(Connection *con, tid_t tid, vector<string>& cmd, bufferlist
        pg->pg_log.get_missing().missing.begin();
       for (; mi != mend; ++mi) {
        fout << mi->first << " -> " << mi->second << std::endl;
-       map<hobject_t, set<int> >::const_iterator mli =
+       map<hobject_t, set<pg_shard_t> >::const_iterator mli =
          pg->missing_loc.find(mi->first);
        if (mli == pg->missing_loc.end())
          continue;
-       const set<int> &mls(mli->second);
+       const set<pg_shard_t> &mls(mli->second);
        if (mls.empty())
          continue;
        fout << "missing_loc: " << mls << std::endl;
@@ -4816,7 +4860,7 @@ void OSD::handle_scrub(MOSDScrub *m)
   }
 
   if (m->scrub_pgs.empty()) {
-    for (ceph::unordered_map<pg_t, PG*>::iterator p = pg_map.begin();
+    for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
         p != pg_map.end();
         ++p) {
       PG *pg = p->second;
@@ -4834,9 +4878,11 @@ void OSD::handle_scrub(MOSDScrub *m)
   } else {
     for (vector<pg_t>::iterator p = m->scrub_pgs.begin();
         p != m->scrub_pgs.end();
-        ++p)
-      if (pg_map.count(*p)) {
-       PG *pg = pg_map[*p];
+        ++p) {
+      spg_t pcand;
+      if (osdmap->get_primary_shard(*p, &pcand) &&
+         pg_map.count(pcand)) {
+       PG *pg = pg_map[pcand];
        pg->lock();
        if (pg->is_primary()) {
          pg->unreg_next_scrub();
@@ -4848,6 +4894,7 @@ void OSD::handle_scrub(MOSDScrub *m)
        }
        pg->unlock();
       }
+    }
   }
   
   m->put();
@@ -4896,11 +4943,11 @@ void OSD::sched_scrub()
   
   //dout(20) << " " << last_scrub_pg << dendl;
 
-  pair<utime_t, pg_t> pos;
+  pair<utime_t, spg_t> pos;
   if (service.first_scrub_stamp(&pos)) {
     do {
       utime_t t = pos.first;
-      pg_t pgid = pos.second;
+      spg_t pgid = pos.second;
       dout(30) << "sched_scrub examine " << pgid << " at " << t << dendl;
 
       utime_t diff = now - t;
@@ -5496,12 +5543,19 @@ void OSD::advance_pg(
       continue;
 
     vector<int> newup, newacting;
-    nextmap->pg_to_up_acting_osds(pg->info.pgid, newup, newacting);
-    pg->handle_advance_map(nextmap, lastmap, newup, newacting, rctx);
+    int up_primary, acting_primary;
+    nextmap->pg_to_up_acting_osds(
+      pg->info.pgid.pgid,
+      &newup, &up_primary,
+      &newacting, &acting_primary);
+    pg->handle_advance_map(
+      nextmap, lastmap, newup, up_primary,
+      newacting, acting_primary, rctx);
 
     // Check for split!
-    set<pg_t> children;
-    if (pg->info.pgid.is_split(
+    set<spg_t> children;
+    spg_t parent(pg->info.pgid);
+    if (parent.is_split(
        lastmap->get_pg_num(pg->pool.id),
        nextmap->get_pg_num(pg->pool.id),
        &children)) {
@@ -5541,15 +5595,15 @@ void OSD::advance_map(ObjectStore::Transaction& t, C_Contexts *tfin)
   }
 
   // scan pg creations
-  ceph::unordered_map<pg_t, create_pg_info>::iterator n = creating_pgs.begin();
+  ceph::unordered_map<spg_t, create_pg_info>::iterator n = creating_pgs.begin();
   while (n != creating_pgs.end()) {
-    ceph::unordered_map<pg_t, create_pg_info>::iterator p = n++;
-    pg_t pgid = p->first;
+    ceph::unordered_map<spg_t, create_pg_info>::iterator p = n++;
+    spg_t pgid = p->first;
 
     // am i still primary?
     vector<int> acting;
     int primary;
-    osdmap->pg_to_acting_osds(pgid, &acting, &primary);
+    osdmap->pg_to_acting_osds(pgid.pgid, &acting, &primary);
     if (primary != whoami) {
       dout(10) << " no longer primary for " << pgid << ", stopping creation" << dendl;
       creating_pgs.erase(p);
@@ -5563,12 +5617,12 @@ void OSD::advance_map(ObjectStore::Transaction& t, C_Contexts *tfin)
   }
 
   // scan pgs with waiters
-  map<pg_t, list<OpRequestRef> >::iterator p = waiting_for_pg.begin();
+  map<spg_t, list<OpRequestRef> >::iterator p = waiting_for_pg.begin();
   while (p != waiting_for_pg.end()) {
-    pg_t pgid = p->first;
+    spg_t pgid = p->first;
 
     vector<int> acting;
-    int nrep = osdmap->pg_to_acting_osds(pgid, acting);
+    int nrep = osdmap->pg_to_acting_osds(pgid.pgid, acting);
     int role = osdmap->calc_pg_role(whoami, acting, nrep);
     if (role >= 0) {
       ++p;  // still me
@@ -5591,7 +5645,7 @@ void OSD::consume_map()
   list<PGRef> to_remove;
 
   // scan pg's
-  for (ceph::unordered_map<pg_t,PG*>::iterator it = pg_map.begin();
+  for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
        it != pg_map.end();
        ++it) {
     PG *pg = it->second;
@@ -5628,7 +5682,7 @@ void OSD::consume_map()
   service.publish_map(osdmap);
 
   // scan pg's
-  for (ceph::unordered_map<pg_t,PG*>::iterator it = pg_map.begin();
+  for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
        it != pg_map.end();
        ++it) {
     PG *pg = it->second;
@@ -5912,7 +5966,7 @@ bool OSD::require_same_or_newer_map(OpRequestRef op, epoch_t epoch)
 // pg creation
 
 
-bool OSD::can_create_pg(pg_t pgid)
+bool OSD::can_create_pg(spg_t pgid)
 {
   assert(creating_pgs.count(pgid));
 
@@ -5929,7 +5983,7 @@ bool OSD::can_create_pg(pg_t pgid)
 
 void OSD::split_pgs(
   PG *parent,
-  const set<pg_t> &childpgids, set<boost::intrusive_ptr<PG> > *out_pgs,
+  const set<spg_t> &childpgids, set<boost::intrusive_ptr<PG> > *out_pgs,
   OSDMapRef curmap,
   OSDMapRef nextmap,
   PG::RecoveryCtx *rctx)
@@ -5939,7 +5993,7 @@ void OSD::split_pgs(
   parent->update_snap_mapper_bits(
     parent->info.pgid.get_split_bits(pg_num)
     );
-  for (set<pg_t>::const_iterator i = childpgids.begin();
+  for (set<spg_t>::const_iterator i = childpgids.begin();
        i != childpgids.end();
        ++i) {
     dout(10) << "Splitting " << *parent << " into " << *i << dendl;
@@ -5956,10 +6010,10 @@ void OSD::split_pgs(
     parent->split_colls(
       *i,
       split_bits,
-      i->m_seed,
+      i->ps(),
       rctx->transaction);
     parent->split_into(
-      *i,
+      i->pgid,
       child,
       split_bits);
 
@@ -6014,41 +6068,48 @@ void OSD::handle_pg_create(OpRequestRef op)
   for (map<pg_t,pg_create_t>::iterator p = m->mkpg.begin();
        p != m->mkpg.end();
        ++p) {
-    pg_t pgid = p->first;
     epoch_t created = p->second.created;
     pg_t parent = p->second.parent;
     if (p->second.split_bits) // Skip split pgs
       continue;
-    pg_t on = pgid;
+    pg_t on = p->first;
 
-    if (pgid.preferred() >= 0) {
-      dout(20) << "ignoring localized pg " << pgid << dendl;
+    if (on.preferred() >= 0) {
+      dout(20) << "ignoring localized pg " << on << dendl;
       continue;
     }
-    if (!osdmap->have_pg_pool(pgid.pool())) {
-      dout(20) << "ignoring pg on deleted pool " << pgid << dendl;
+
+    if (!osdmap->have_pg_pool(on.pool())) {
+      dout(20) << "ignoring pg on deleted pool " << on << dendl;
       continue;
     }
 
-    dout(20) << "mkpg " << pgid << " e" << created << dendl;
+    dout(20) << "mkpg " << on << " e" << created << dendl;
    
     // is it still ours?
     vector<int> up, acting;
-    int up_primary, acting_primary;
+    int up_primary = -1;
+    int acting_primary = -1;
     osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
     int role = osdmap->calc_pg_role(whoami, acting, acting.size());
 
     if (up_primary != whoami) {
-      dout(10) << "mkpg " << pgid << "  not primary (role=" << role << "), skipping" << dendl;
+      dout(10) << "mkpg " << on << "  not primary (role="
+              << role << "), skipping" << dendl;
       continue;
     }
     if (up != acting) {
-      dout(10) << "mkpg " << pgid << "  up " << up << " != acting " << acting << ", ignoring" << dendl;
+      dout(10) << "mkpg " << on << "  up " << up
+              << " != acting " << acting << ", ignoring" << dendl;
       // we'll get a query soon anyway, since we know the pg
       // must exist. we can ignore this.
       continue;
     }
 
+    spg_t pgid;
+    bool mapped = osdmap->get_primary_shard(on, &pgid);
+    assert(mapped);
+
     // does it already exist?
     if (_have_pg(pgid)) {
       dout(10) << "mkpg " << pgid << "  already exists, skipping" << dendl;
@@ -6080,14 +6141,18 @@ void OSD::handle_pg_create(OpRequestRef op)
 
     PG::RecoveryCtx rctx = create_context();
     // poll priors
-    set<int>& pset = creating_pgs[pgid].prior;
+    set<pg_shard_t>& pset = creating_pgs[pgid].prior;
     dout(10) << "mkpg " << pgid << " e" << created
             << " h " << history
             << " : querying priors " << pset << dendl;
-    for (set<int>::iterator p = pset.begin(); p != pset.end(); ++p) 
-      if (osdmap->is_up(*p))
-       (*rctx.query_map)[*p][pgid] = pg_query_t(pg_query_t::INFO, history,
-                                                osdmap->get_epoch());
+    for (set<pg_shard_t>::iterator p = pset.begin(); p != pset.end(); ++p)
+      if (osdmap->is_up(p->osd))
+       (*rctx.query_map)[p->osd][spg_t(pgid.pgid, p->shard)] =
+         pg_query_t(
+           pg_query_t::INFO,
+           p->shard, pgid.shard,
+           history,
+           osdmap->get_epoch());
 
     PG *pg = NULL;
     if (can_create_pg(pgid)) {
@@ -6095,7 +6160,8 @@ void OSD::handle_pg_create(OpRequestRef op)
       rctx.transaction->create_collection(coll_t(pgid));
       pg = _create_lock_pg(
        osdmap, pgid, true, false, false,
-       0, creating_pgs[pgid].acting, creating_pgs[pgid].acting,
+       0, creating_pgs[pgid].acting, whoami,
+       creating_pgs[pgid].acting, whoami,
        history, pi,
        *rctx.transaction);
       pg->info.last_epoch_started = pg->info.history.last_epoch_started;
@@ -6122,10 +6188,10 @@ PG::RecoveryCtx OSD::create_context()
   ObjectStore::Transaction *t = new ObjectStore::Transaction;
   C_Contexts *on_applied = new C_Contexts(cct);
   C_Contexts *on_safe = new C_Contexts(cct);
-  map< int, map<pg_t,pg_query_t> > *query_map =
-    new map<int, map<pg_t, pg_query_t> >;
+  map<int, map<spg_t,pg_query_t> > *query_map =
+    new map<int, map<spg_t, pg_query_t> >;
   map<int,vector<pair<pg_notify_t, pg_interval_map_t> > > *notify_list =
-    new map<int,vector<pair<pg_notify_t, pg_interval_map_t> > >;
+    new map<int, vector<pair<pg_notify_t, pg_interval_map_t> > >;
   map<int,vector<pair<pg_notify_t, pg_interval_map_t> > > *info_map =
     new map<int,vector<pair<pg_notify_t, pg_interval_map_t> > >;
   PG::RecoveryCtx rctx(query_map, info_map, notify_list,
@@ -6152,16 +6218,25 @@ void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
 bool OSD::compat_must_dispatch_immediately(PG *pg)
 {
   assert(pg->is_locked());
-  vector<int> *tmpacting = &pg->acting;
-  if (pg->actingbackfill.size() > 0)
-    tmpacting = &pg->actingbackfill;
-  for (vector<int>::iterator i = tmpacting->begin();
-       i != tmpacting->end();
+  set<pg_shard_t> tmpacting;
+  if (pg->actingbackfill.size() > 0) {
+    tmpacting = pg->actingbackfill;
+  } else {
+    for (unsigned i = 0; i < pg->acting.size(); ++i) {
+      tmpacting.insert(
+       pg_shard_t(
+         pg->acting[i],
+         pg->pool.info.ec_pool() ? i : ghobject_t::NO_SHARD));
+    }
+  }
+
+  for (set<pg_shard_t>::iterator i = tmpacting.begin();
+       i != tmpacting.end();
        ++i) {
-    if (*i == whoami)
+    if (i->osd == whoami)
       continue;
     ConnectionRef conn =
-      service.get_con_osd_cluster(*i, pg->get_osdmap()->get_epoch());
+      service.get_con_osd_cluster(i->osd, pg->get_osdmap()->get_epoch());
     if (conn && !conn->has_feature(CEPH_FEATURE_INDEP_PG_MAP)) {
       return true;
     }
@@ -6203,30 +6278,29 @@ void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap,
  */
 
 void OSD::do_notifies(
-  map< int,vector<pair<pg_notify_t,pg_interval_map_t> > >& notify_list,
+  map<int,vector<pair<pg_notify_t,pg_interval_map_t> > >& notify_list,
   OSDMapRef curmap)
 {
-  for (map< int, vector<pair<pg_notify_t,pg_interval_map_t> > >::iterator it = notify_list.begin();
+  for (map<int,
+          vector<pair<pg_notify_t,pg_interval_map_t> > >::iterator it =
+        notify_list.begin();
        it != notify_list.end();
        ++it) {
-    if (it->first == whoami) {
-      dout(7) << "do_notify osd." << it->first << " is self, skipping" << dendl;
-      continue;
-    }
     if (!curmap->is_up(it->first))
       continue;
-    ConnectionRef con = service.get_con_osd_cluster(it->first, curmap->get_epoch());
+    ConnectionRef con = service.get_con_osd_cluster(
+      it->first, curmap->get_epoch());
     if (!con)
       continue;
     _share_map_outgoing(it->first, con.get(), curmap);
     if (con->has_feature(CEPH_FEATURE_INDEP_PG_MAP)) {
-      dout(7) << "do_notify osd." << it->first
+      dout(7) << "do_notify osd " << it->first
              << " on " << it->second.size() << " PGs" << dendl;
       MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
                                         it->second);
       cluster_messenger->send_message(m, con.get());
     } else {
-      dout(7) << "do_notify osd." << it->first
+      dout(7) << "do_notify osd " << it->first
              << " sending seperate messages" << dendl;
       for (vector<pair<pg_notify_t, pg_interval_map_t> >::iterator i =
             it->second.begin();
@@ -6246,10 +6320,10 @@ void OSD::do_notifies(
 /** do_queries
  * send out pending queries for info | summaries
  */
-void OSD::do_queries(map< int, map<pg_t,pg_query_t> >& query_map,
+void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
                     OSDMapRef curmap)
 {
-  for (map< int, map<pg_t,pg_query_t> >::iterator pit = query_map.begin();
+  for (map<int, map<spg_t,pg_query_t> >::iterator pit = query_map.begin();
        pit != query_map.end();
        ++pit) {
     if (!curmap->is_up(pit->first))
@@ -6268,10 +6342,10 @@ void OSD::do_queries(map< int, map<pg_t,pg_query_t> >& query_map,
       dout(7) << "do_queries querying osd." << who
              << " sending seperate messages "
              << " on " << pit->second.size() << " PGs" << dendl;
-      for (map<pg_t, pg_query_t>::iterator i = pit->second.begin();
+      for (map<spg_t, pg_query_t>::iterator i = pit->second.begin();
           i != pit->second.end();
           ++i) {
-       map<pg_t, pg_query_t> to_send;
+       map<spg_t, pg_query_t> to_send;
        to_send.insert(*i);
        MOSDPGQuery *m = new MOSDPGQuery(i->second.epoch_sent, to_send);
        cluster_messenger->send_message(m, con.get());
@@ -6281,10 +6355,13 @@ void OSD::do_queries(map< int, map<pg_t,pg_query_t> >& query_map,
 }
 
 
-void OSD::do_infos(map<int,vector<pair<pg_notify_t, pg_interval_map_t> > >& info_map,
+void OSD::do_infos(map<int,
+                      vector<pair<pg_notify_t, pg_interval_map_t> > >& info_map,
                   OSDMapRef curmap)
 {
-  for (map<int,vector<pair<pg_notify_t, pg_interval_map_t> > >::iterator p = info_map.begin();
+  for (map<int,
+          vector<pair<pg_notify_t, pg_interval_map_t> > >::iterator p =
+        info_map.begin();
        p != info_map.end();
        ++p) { 
     if (!curmap->is_up(p->first))
@@ -6292,9 +6369,11 @@ void OSD::do_infos(map<int,vector<pair<pg_notify_t, pg_interval_map_t> > >& info
     for (vector<pair<pg_notify_t,pg_interval_map_t> >::iterator i = p->second.begin();
         i != p->second.end();
         ++i) {
-      dout(20) << "Sending info " << i->first.info << " to osd." << p->first << dendl;
+      dout(20) << "Sending info " << i->first.info
+              << " to shard " << p->first << dendl;
     }
-    ConnectionRef con = service.get_con_osd_cluster(p->first, curmap->get_epoch());
+    ConnectionRef con = service.get_con_osd_cluster(
+      p->first, curmap->get_epoch());
     if (!con)
       continue;
     _share_map_outgoing(p->first, con.get(), curmap);
@@ -6349,12 +6428,13 @@ void OSD::handle_pg_notify(OpRequestRef op)
     }
 
     handle_pg_peering_evt(
+      spg_t(it->first.info.pgid.pgid, it->first.to),
       it->first.info, it->second,
-      it->first.query_epoch, from, true,
+      it->first.query_epoch, pg_shard_t(from, it->first.from), true,
       PG::CephPeeringEvtRef(
        new PG::CephPeeringEvt(
          it->first.epoch_sent, it->first.query_epoch,
-         PG::MNotifyRec(from, it->first)))
+         PG::MNotifyRec(pg_shard_t(from, it->first.from), it->first)))
       );
   }
 }
@@ -6378,12 +6458,13 @@ void OSD::handle_pg_log(OpRequestRef op)
 
   op->mark_started();
   handle_pg_peering_evt(
+    spg_t(m->info.pgid.pgid, m->to),
     m->info, m->past_intervals, m->get_epoch(),
-    from, false,
+    pg_shard_t(from, m->from), false,
     PG::CephPeeringEvtRef(
       new PG::CephPeeringEvt(
        m->get_epoch(), m->get_query_epoch(),
-       PG::MLogRec(from, m)))
+       PG::MLogRec(pg_shard_t(from, m->from), m)))
     );
 }
 
@@ -6410,12 +6491,15 @@ void OSD::handle_pg_info(OpRequestRef op)
     }
 
     handle_pg_peering_evt(
+      spg_t(p->first.info.pgid.pgid, p->first.to),
       p->first.info, p->second, p->first.epoch_sent,
-      from, false,
+      pg_shard_t(from, p->first.from), false,
       PG::CephPeeringEvtRef(
        new PG::CephPeeringEvt(
          p->first.epoch_sent, p->first.query_epoch,
-         PG::MInfoRec(from, p->first.info, p->first.epoch_sent)))
+         PG::MInfoRec(
+           pg_shard_t(
+             from, p->first.from), p->first.info, p->first.epoch_sent)))
       );
   }
 }
@@ -6454,7 +6538,8 @@ void OSD::handle_pg_trim(OpRequestRef op)
     if (pg->is_primary()) {
       // peer is informing us of their last_complete_ondisk
       dout(10) << *pg << " replica osd." << from << " lcod " << m->trim_to << dendl;
-      pg->peer_last_complete_ondisk[from] = m->trim_to;
+      pg->peer_last_complete_ondisk[pg_shard_t(from, m->pgid.shard)] =
+       m->trim_to;
       if (pg->calc_min_last_complete_ondisk()) {
        dout(10) << *pg << " min lcod now " << pg->min_last_complete_ondisk << dendl;
        pg->trim_peers();
@@ -6654,10 +6739,10 @@ void OSD::handle_pg_query(OpRequestRef op)
 
   map< int, vector<pair<pg_notify_t, pg_interval_map_t> > > notify_list;
   
-  for (map<pg_t,pg_query_t>::iterator it = m->pg_list.begin();
+  for (map<spg_t,pg_query_t>::iterator it = m->pg_list.begin();
        it != m->pg_list.end();
        ++it) {
-    pg_t pgid = it->first;
+    spg_t pgid = it->first;
 
     if (pgid.preferred() >= 0) {
       dout(10) << "ignoring localized pg " << pgid << dendl;
@@ -6669,15 +6754,17 @@ void OSD::handle_pg_query(OpRequestRef op)
        PG::CephPeeringEvtRef(
          new PG::CephPeeringEvt(
            it->second.epoch_sent, it->second.epoch_sent,
-           PG::MQuery(from, it->second, it->second.epoch_sent))));
+           PG::MQuery(pg_shard_t(from, it->second.from),
+                      it->second, it->second.epoch_sent))));
       continue;
     }
 
     if (pg_map.count(pgid)) {
       PG *pg = 0;
       pg = _lookup_lock_pg(pgid);
-      pg->queue_query(it->second.epoch_sent, it->second.epoch_sent,
-                     from, it->second);
+      pg->queue_query(
+       it->second.epoch_sent, it->second.epoch_sent,
+       pg_shard_t(from, it->second.from), it->second);
       pg->unlock();
       continue;
     }
@@ -6687,7 +6774,7 @@ void OSD::handle_pg_query(OpRequestRef op)
 
     // get active crush mapping
     vector<int> up, acting;
-    osdmap->pg_to_up_acting_osds(pgid, up, acting);
+    osdmap->pg_to_up_acting_osds(pgid.pgid, up, acting);
 
     // same primary?
     pg_history_t history = it->second.history;
@@ -6703,21 +6790,27 @@ void OSD::handle_pg_query(OpRequestRef op)
     }
 
     dout(10) << " pg " << pgid << " dne" << dendl;
-    pg_info_t empty(pgid);
+    pg_info_t empty(spg_t(pgid.pgid, it->second.to));
     if (it->second.type == pg_query_t::LOG ||
        it->second.type == pg_query_t::FULLLOG) {
       ConnectionRef con = service.get_con_osd_cluster(from, osdmap->get_epoch());
       if (con) {
-       MOSDPGLog *mlog = new MOSDPGLog(osdmap->get_epoch(), empty,
-                                       it->second.epoch_sent);
+       MOSDPGLog *mlog = new MOSDPGLog(
+         it->second.from, it->second.to,
+         osdmap->get_epoch(), empty,
+         it->second.epoch_sent);
        _share_map_outgoing(from, con.get(), osdmap);
        cluster_messenger->send_message(mlog, con.get());
       }
     } else {
-      notify_list[from].push_back(make_pair(pg_notify_t(it->second.epoch_sent,
-                                                       osdmap->get_epoch(),
-                                                       empty),
-                                           pg_interval_map_t()));
+      notify_list[from].push_back(
+       make_pair(
+         pg_notify_t(
+           it->second.from, it->second.to,
+           it->second.epoch_sent,
+           osdmap->get_epoch(),
+           empty),
+         pg_interval_map_t()));
     }
   }
   do_notifies(notify_list, osdmap);
@@ -6740,10 +6833,10 @@ void OSD::handle_pg_remove(OpRequestRef op)
   
   op->mark_started();
 
-  for (vector<pg_t>::iterator it = m->pg_list.begin();
+  for (vector<spg_t>::iterator it = m->pg_list.begin();
        it != m->pg_list.end();
        ++it) {
-    pg_t pgid = *it;
+    spg_t pgid = *it;
     if (pgid.preferred() >= 0) {
       dout(10) << "ignoring localized pg " << pgid << dendl;
       continue;
@@ -6757,13 +6850,13 @@ void OSD::handle_pg_remove(OpRequestRef op)
     PG *pg = _lookup_lock_pg(pgid);
     pg_history_t history = pg->info.history;
     vector<int> up, acting;
-    osdmap->pg_to_up_acting_osds(pgid, up, acting);
+    osdmap->pg_to_up_acting_osds(pgid.pgid, up, acting);
     bool valid_history =
       project_pg_history(pg->info.pgid, history, pg->get_osdmap()->get_epoch(),
        up, acting);
     if (valid_history &&
         history.same_interval_since <= m->get_epoch()) {
-      assert(pg->get_primary() == m->get_source().num());
+      assert(pg->get_primary().osd == m->get_source().num());
       PGRef _pg(pg);
       _remove_pg(pg);
       pg->unlock();
@@ -6820,7 +6913,7 @@ void OSD::check_replay_queue()
   assert(osd_lock.is_locked());
 
   utime_t now = ceph_clock_now(cct);
-  list< pair<pg_t,utime_t> > pgids;
+  list< pair<spg_t,utime_t> > pgids;
   replay_queue_lock.Lock();
   while (!replay_queue.empty() &&
         replay_queue.front().second <= now) {
@@ -6829,8 +6922,8 @@ void OSD::check_replay_queue()
   }
   replay_queue_lock.Unlock();
 
-  for (list< pair<pg_t,utime_t> >::iterator p = pgids.begin(); p != pgids.end(); ++p) {
-    pg_t pgid = p->first;
+  for (list< pair<spg_t,utime_t> >::iterator p = pgids.begin(); p != pgids.end(); ++p) {
+    spg_t pgid = p->first;
     if (pg_map.count(pgid)) {
       PG *pg = _lookup_lock_pg_with_map_lock_held(pgid);
       dout(10) << "check_replay_queue " << *pg << dendl;
@@ -7111,18 +7204,24 @@ void OSD::handle_op(OpRequestRef op)
     }
   }
   // calc actual pgid
-  pg_t pgid = m->get_pg();
-  int64_t pool = pgid.pool();
+  pg_t _pgid = m->get_pg();
+  int64_t pool = _pgid.pool();
   if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0 &&
       osdmap->have_pg_pool(pool))
-    pgid = osdmap->raw_pg_to_pg(pgid);
+    _pgid = osdmap->raw_pg_to_pg(_pgid);
+
+  spg_t pgid;
+  if (!osdmap->get_primary_shard(_pgid, &pgid)) {
+    // missing pool or acting set empty -- drop
+    return;
+  }
 
   // get and lock *pg.
   PG *pg = _have_pg(pgid) ? _lookup_pg(pgid) : NULL;
   if (!pg) {
     dout(7) << "hit non-existent pg " << pgid << dendl;
 
-    if (osdmap->get_pg_acting_role(pgid, whoami) >= 0) {
+    if (osdmap->get_pg_acting_role(pgid.pgid, whoami) >= 0) {
       dout(7) << "we are valid target for op, waiting" << dendl;
       waiting_for_pg[pgid].push_back(op);
       op->mark_delayed("waiting for pg to exist locally");
@@ -7136,7 +7235,7 @@ void OSD::handle_op(OpRequestRef op)
     }
     OSDMapRef send_map = get_map(m->get_map_epoch());
 
-    if (send_map->get_pg_acting_role(pgid, whoami) >= 0) {
+    if (send_map->get_pg_acting_role(pgid.pgid, whoami) >= 0) {
       dout(7) << "dropping request; client will resend when they get new map" << dendl;
     } else if (!send_map->have_pg_pool(pgid.pool())) {
       dout(7) << "dropping request; pool did not exist" << dendl;
@@ -7149,9 +7248,6 @@ void OSD::handle_op(OpRequestRef op)
                  << "\n";
     } else {
       dout(7) << "we are invalid target" << dendl;
-      pgid = m->get_pg();
-      if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
-       pgid = send_map->raw_pg_to_pg(pgid);
       clog.warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
                  << " pg " << m->get_pg()
                  << " to osd." << whoami
@@ -7195,7 +7291,7 @@ void OSD::handle_replica_op(OpRequestRef op)
                      static_cast<Session*>(m->get_connection()->get_priv()));
 
   // make sure we have the pg
-  const pg_t pgid = m->pgid;
+  const spg_t pgid = m->pgid;
   if (service.splitting(pgid)) {
     waiting_for_pg[pgid].push_back(op);
     return;
@@ -7351,7 +7447,7 @@ struct C_CompleteSplits : public Context {
     if (osd->is_stopping())
       return;
     PG::RecoveryCtx rctx = osd->create_context();
-    set<pg_t> to_complete;
+    set<spg_t> to_complete;
     for (set<boost::intrusive_ptr<PG> >::iterator i = pgs.begin();
         i != pgs.end();
         ++i) {
index 3bf76d11dc6c93251b3dd51dee8f419832c13f79..1a592d667b25d96dd6d60b6f277808b3b0fafc56 100644 (file)
@@ -197,9 +197,9 @@ class DeletingState {
   } status;
   bool stop_deleting;
 public:
-  const pg_t pgid;
+  const spg_t pgid;
   const PGRef old_pg_state;
-  DeletingState(const pair<pg_t, PGRef> &in) :
+  DeletingState(const pair<spg_t, PGRef> &in) :
     lock("DeletingState::lock"), status(QUEUED), stop_deleting(false),
     pgid(in.first), old_pg_state(in.second) {}
 
@@ -289,8 +289,8 @@ class OSDService {
 public:
   OSD *osd;
   CephContext *cct;
-  SharedPtrRegistry<pg_t, ObjectStore::Sequencer> osr_registry;
-  SharedPtrRegistry<pg_t, DeletingState> deleting_pgs;
+  SharedPtrRegistry<spg_t, ObjectStore::Sequencer> osr_registry;
+  SharedPtrRegistry<spg_t, DeletingState> deleting_pgs;
   const int whoami;
   ObjectStore *&store;
   LogClient &clog;
@@ -385,33 +385,33 @@ public:
   Mutex sched_scrub_lock;
   int scrubs_pending;
   int scrubs_active;
-  set< pair<utime_t,pg_t> > last_scrub_pg;
+  set< pair<utime_t,spg_t> > last_scrub_pg;
 
-  void reg_last_pg_scrub(pg_t pgid, utime_t t) {
+  void reg_last_pg_scrub(spg_t pgid, utime_t t) {
     Mutex::Locker l(sched_scrub_lock);
-    last_scrub_pg.insert(pair<utime_t,pg_t>(t, pgid));
+    last_scrub_pg.insert(pair<utime_t,spg_t>(t, pgid));
   }
-  void unreg_last_pg_scrub(pg_t pgid, utime_t t) {
+  void unreg_last_pg_scrub(spg_t pgid, utime_t t) {
     Mutex::Locker l(sched_scrub_lock);
-    pair<utime_t,pg_t> p(t, pgid);
-    set<pair<utime_t,pg_t> >::iterator it = last_scrub_pg.find(p);
+    pair<utime_t,spg_t> p(t, pgid);
+    set<pair<utime_t,spg_t> >::iterator it = last_scrub_pg.find(p);
     assert(it != last_scrub_pg.end());
     last_scrub_pg.erase(it);
   }
-  bool first_scrub_stamp(pair<utime_t, pg_t> *out) {
+  bool first_scrub_stamp(pair<utime_t, spg_t> *out) {
     Mutex::Locker l(sched_scrub_lock);
     if (last_scrub_pg.empty())
       return false;
-    set< pair<utime_t, pg_t> >::iterator iter = last_scrub_pg.begin();
+    set< pair<utime_t, spg_t> >::iterator iter = last_scrub_pg.begin();
     *out = *iter;
     return true;
   }
-  bool next_scrub_stamp(pair<utime_t, pg_t> next,
-                       pair<utime_t, pg_t> *out) {
+  bool next_scrub_stamp(pair<utime_t, spg_t> next,
+                       pair<utime_t, spg_t> *out) {
     Mutex::Locker l(sched_scrub_lock);
     if (last_scrub_pg.empty())
       return false;
-    set< pair<utime_t, pg_t> >::iterator iter = last_scrub_pg.lower_bound(next);
+    set< pair<utime_t, spg_t> >::iterator iter = last_scrub_pg.lower_bound(next);
     if (iter == last_scrub_pg.end())
       return false;
     ++iter;
@@ -577,11 +577,11 @@ public:
   enum {
     BACKFILL_LOW = 0,   // backfill non-degraded PGs
     BACKFILL_HIGH = 1, // backfill degraded PGs
-    RECOVERY = AsyncReserver<pg_t>::MAX_PRIORITY  // log based recovery
+    RECOVERY = AsyncReserver<spg_t>::MAX_PRIORITY  // log based recovery
   };
   Finisher reserver_finisher;
-  AsyncReserver<pg_t> local_reserver;
-  AsyncReserver<pg_t> remote_reserver;
+  AsyncReserver<spg_t> local_reserver;
+  AsyncReserver<spg_t> remote_reserver;
 
   // -- pg_temp --
   Mutex pg_temp_lock;
@@ -652,26 +652,26 @@ public:
 
   // split
   Mutex in_progress_split_lock;
-  map<pg_t, pg_t> pending_splits; // child -> parent
-  map<pg_t, set<pg_t> > rev_pending_splits; // parent -> [children]
-  set<pg_t> in_progress_splits;       // child
+  map<spg_t, spg_t> pending_splits; // child -> parent
+  map<spg_t, set<spg_t> > rev_pending_splits; // parent -> [children]
+  set<spg_t> in_progress_splits;       // child
 
-  void _start_split(pg_t parent, const set<pg_t> &children);
-  void start_split(pg_t parent, const set<pg_t> &children) {
+  void _start_split(spg_t parent, const set<spg_t> &children);
+  void start_split(spg_t parent, const set<spg_t> &children) {
     Mutex::Locker l(in_progress_split_lock);
     return _start_split(parent, children);
   }
-  void mark_split_in_progress(pg_t parent, const set<pg_t> &pgs);
-  void complete_split(const set<pg_t> &pgs);
-  void cancel_pending_splits_for_parent(pg_t parent);
-  void _cancel_pending_splits_for_parent(pg_t parent);
-  bool splitting(pg_t pgid);
+  void mark_split_in_progress(spg_t parent, const set<spg_t> &pgs);
+  void complete_split(const set<spg_t> &pgs);
+  void cancel_pending_splits_for_parent(spg_t parent);
+  void _cancel_pending_splits_for_parent(spg_t parent);
+  bool splitting(spg_t pgid);
   void expand_pg_num(OSDMapRef old_map,
                     OSDMapRef new_map);
   void _maybe_split_pgid(OSDMapRef old_map,
                         OSDMapRef new_map,
-                        pg_t pgid);
-  void init_splits_between(pg_t pgid, OSDMapRef frommap, OSDMapRef tomap);
+                        spg_t pgid);
+  void init_splits_between(spg_t pgid, OSDMapRef frommap, OSDMapRef tomap);
 
   // -- OSD Full Status --
   Mutex full_status_lock;
@@ -706,9 +706,9 @@ public:
 
 #ifdef PG_DEBUG_REFS
   Mutex pgid_lock;
-  map<pg_t, int> pgid_tracker;
-  map<pg_t, PG*> live_pgs;
-  void add_pgid(pg_t pgid, PG *pg) {
+  map<spg_t, int> pgid_tracker;
+  map<spg_t, PG*> live_pgs;
+  void add_pgid(spg_t pgid, PG *pg) {
     Mutex::Locker l(pgid_lock);
     if (!pgid_tracker.count(pgid)) {
       pgid_tracker[pgid] = 0;
@@ -716,7 +716,7 @@ public:
     }
     pgid_tracker[pgid]++;
   }
-  void remove_pgid(pg_t pgid, PG *pg) {
+  void remove_pgid(spg_t pgid, PG *pg) {
     Mutex::Locker l(pgid_lock);
     assert(pgid_tracker.count(pgid));
     assert(pgid_tracker[pgid] > 0);
@@ -729,7 +729,7 @@ public:
   void dump_live_pgids() {
     Mutex::Locker l(pgid_lock);
     derr << "live pgids:" << dendl;
-    for (map<pg_t, int>::iterator i = pgid_tracker.begin();
+    for (map<spg_t, int>::iterator i = pgid_tracker.begin();
         i != pgid_tracker.end();
         ++i) {
       derr << "\t" << *i << dendl;
@@ -831,7 +831,7 @@ public:
        0));
   }
 
-  static hobject_t make_pg_log_oid(pg_t pg) {
+  static hobject_t make_pg_log_oid(spg_t pg) {
     stringstream ss;
     ss << "pglog_" << pg;
     string s;
@@ -839,7 +839,7 @@ public:
     return hobject_t(sobject_t(object_t(s.c_str()), 0));
   }
   
-  static hobject_t make_pg_biginfo_oid(pg_t pg) {
+  static hobject_t make_pg_biginfo_oid(spg_t pg) {
     stringstream ss;
     ss << "pginfo_" << pg;
     string s;
@@ -1264,19 +1264,19 @@ private:
 
 protected:
   // -- placement groups --
-  ceph::unordered_map<pg_t, PG*> pg_map;
-  map<pg_t, list<OpRequestRef> > waiting_for_pg;
-  map<pg_t, list<PG::CephPeeringEvtRef> > peering_wait_for_split;
+  ceph::unordered_map<spg_t, PG*> pg_map;
+  map<spg_t, list<OpRequestRef> > waiting_for_pg;
+  map<spg_t, list<PG::CephPeeringEvtRef> > peering_wait_for_split;
   PGRecoveryStats pg_recovery_stats;
 
   PGPool _get_pool(int id, OSDMapRef createmap);
 
-  bool  _have_pg(pg_t pgid);
-  PG   *_lookup_lock_pg_with_map_lock_held(pg_t pgid);
-  PG   *_lookup_lock_pg(pg_t pgid);
-  PG   *_lookup_pg(pg_t pgid);
+  bool  _have_pg(spg_t pgid);
+  PG   *_lookup_lock_pg_with_map_lock_held(spg_t pgid);
+  PG   *_lookup_lock_pg(spg_t pgid);
+  PG   *_lookup_pg(spg_t pgid);
   PG   *_open_lock_pg(OSDMapRef createmap,
-                     pg_t pg, bool no_lockdep_check=false,
+                     spg_t pg, bool no_lockdep_check=false,
                      bool hold_map_lock=false);
   enum res_result {
     RES_PARENT,    // resurrected a parent
@@ -1284,50 +1284,54 @@ protected:
     RES_NONE       // nothing relevant deleting
   };
   res_result _try_resurrect_pg(
-    OSDMapRef curmap, pg_t pgid, pg_t *resurrected, PGRef *old_pg_state);
-  PG   *_create_lock_pg(OSDMapRef createmap,
-                       pg_t pgid,
-                       bool newly_created,
-                       bool hold_map_lock,
-                       bool backfill,
-                       int role,
-                       vector<int>& up,
-                       vector<int>& acting,
-                       pg_history_t history,
-                       pg_interval_map_t& pi,
-                       ObjectStore::Transaction& t);
-  PG   *_lookup_qlock_pg(pg_t pgid);
-
-  PG* _make_pg(OSDMapRef createmap, pg_t pgid);
+    OSDMapRef curmap, spg_t pgid, spg_t *resurrected, PGRef *old_pg_state);
+  PG   *_create_lock_pg(
+    OSDMapRef createmap,
+    spg_t pgid,
+    bool newly_created,
+    bool hold_map_lock,
+    bool backfill,
+    int role,
+    vector<int>& up, int up_primary,
+    vector<int>& acting, int acting_primary,
+    pg_history_t history,
+    pg_interval_map_t& pi,
+    ObjectStore::Transaction& t);
+  PG   *_lookup_qlock_pg(spg_t pgid);
+
+  PG* _make_pg(OSDMapRef createmap, spg_t pgid);
   void add_newly_split_pg(PG *pg,
                          PG::RecoveryCtx *rctx);
 
   void handle_pg_peering_evt(
+    spg_t pgid,
     const pg_info_t& info,
     pg_interval_map_t& pi,
-    epoch_t epoch, int from,
+    epoch_t epoch,
+    pg_shard_t from,
     bool primary,
     PG::CephPeeringEvtRef evt);
   
   void load_pgs();
   void build_past_intervals_parallel();
 
-  void calc_priors_during(pg_t pgid, epoch_t start, epoch_t end, set<int>& pset);
+  void calc_priors_during(
+    spg_t pgid, epoch_t start, epoch_t end, set<pg_shard_t>& pset);
 
   /// project pg history from from to now
   bool project_pg_history(
-    pg_t pgid, pg_history_t& h, epoch_t from,
+    spg_t pgid, pg_history_t& h, epoch_t from,
     const vector<int>& lastup, const vector<int>& lastacting
     ); ///< @return false if there was a map gap between from and now
 
-  void wake_pg_waiters(pg_t pgid) {
+  void wake_pg_waiters(spg_t pgid) {
     if (waiting_for_pg.count(pgid)) {
       take_waiters_front(waiting_for_pg[pgid]);
       waiting_for_pg.erase(pgid);
     }
   }
   void wake_all_pg_waiters() {
-    for (map<pg_t, list<OpRequestRef> >::iterator p = waiting_for_pg.begin();
+    for (map<spg_t, list<OpRequestRef> >::iterator p = waiting_for_pg.begin();
         p != waiting_for_pg.end();
         ++p)
       take_waiters_front(p->second);
@@ -1339,20 +1343,20 @@ protected:
   struct create_pg_info {
     pg_history_t history;
     vector<int> acting;
-    set<int> prior;
+    set<pg_shard_t> prior;
     pg_t parent;
   };
-  ceph::unordered_map<pg_t, create_pg_info> creating_pgs;
+  ceph::unordered_map<spg_t, create_pg_info> creating_pgs;
   double debug_drop_pg_create_probability;
   int debug_drop_pg_create_duration;
   int debug_drop_pg_create_left;  // 0 if we just dropped the last one, -1 if we can drop more
 
-  bool can_create_pg(pg_t pgid);
+  bool can_create_pg(spg_t pgid);
   void handle_pg_create(OpRequestRef op);
 
   void split_pgs(
     PG *parent,
-    const set<pg_t> &childpgids, set<boost::intrusive_ptr<PG> > *out_pgs,
+    const set<spg_t> &childpgids, set<boost::intrusive_ptr<PG> > *out_pgs,
     OSDMapRef curmap,
     OSDMapRef nextmap,
     PG::RecoveryCtx *rctx);
@@ -1445,13 +1449,16 @@ protected:
                         ThreadPool::TPHandle *handle = NULL);
   void dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
                                     ThreadPool::TPHandle *handle = NULL);
-  void do_notifies(map< int,vector<pair<pg_notify_t, pg_interval_map_t> > >& notify_list,
+  void do_notifies(map<int,
+                      vector<pair<pg_notify_t, pg_interval_map_t> > >&
+                      notify_list,
                   OSDMapRef map);
-  void do_queries(map< int, map<pg_t,pg_query_t> >& query_map,
+  void do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
                  OSDMapRef map);
-  void do_infos(map<int, vector<pair<pg_notify_t, pg_interval_map_t> > >& info_map,
+  void do_infos(map<int,
+                   vector<pair<pg_notify_t, pg_interval_map_t> > >& info_map,
                OSDMapRef map);
-  void repeer(PG *pg, map< int, map<pg_t,pg_query_t> >& query_map);
+  void repeer(PG *pg, map< int, map<spg_t,pg_query_t> >& query_map);
 
   bool require_mon_peer(Message *m);
   bool require_osd_peer(OpRequestRef op);
@@ -1535,7 +1542,7 @@ protected:
   utime_t defer_recovery_until;
   int recovery_ops_active;
 #ifdef DEBUG_RECOVERY_OIDS
-  map<pg_t, set<hobject_t> > recovery_oids;
+  map<spg_t, set<hobject_t> > recovery_oids;
 #endif
 
   struct RecoveryWQ : public ThreadPool::WorkQueue<PG> {
@@ -1588,7 +1595,7 @@ protected:
 
   // replay / delayed pg activation
   Mutex replay_queue_lock;
-  list< pair<pg_t, utime_t > > replay_queue;
+  list< pair<spg_t, utime_t > > replay_queue;
   
   void check_replay_queue();
 
@@ -1815,7 +1822,7 @@ protected:
     }
   } remove_wq;
   uint64_t next_removal_seq;
-  coll_t get_next_removal_coll(pg_t pgid) {
+  coll_t get_next_removal_coll(spg_t pgid) {
     return coll_t::make_removal_coll(next_removal_seq++, pgid);
   }
 
index a9eed12c19fa3bd4e1e9e5d9cd5bef014823d9c6..47ac2f796cfcb6f9022b9bf79ed5533620e5bc44 100644 (file)
@@ -622,6 +622,12 @@ public:
     assert(up.empty() || up_primary == up.front());
     assert(acting.empty() || acting_primary == acting.front());
   }
+  bool pg_is_ec(pg_t pg) const {
+    map<int64_t, pg_pool_t>::const_iterator i = pools.find(pg.pool());
+    assert(i != pools.end());
+    return i->second.ec_pool();
+  }
+  spg_t get_primary_shard(pg_t pgid) const { return spg_t(); /* TODOSAM: fix */}
 
   int64_t lookup_pg_pool_name(const string& name) {
     if (name_pool.count(name))
index 7046a6fcdcf7d91864bfdb204ee0f9c01fbb6053..793a3947b58aec17dbb0e7053425e788b1dbce0a 100644 (file)
@@ -139,14 +139,14 @@ void PGPool::update(OSDMapRef map)
 }
 
 PG::PG(OSDService *o, OSDMapRef curmap,
-       const PGPool &_pool, pg_t p, const hobject_t& loid,
+       const PGPool &_pool, spg_t p, const hobject_t& loid,
        const hobject_t& ioid) :
   osd(o),
   cct(o->cct),
   osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()),
   snap_mapper(
     &osdriver,
-    p.m_seed,
+    p.ps(),
     p.get_split_bits(curmap->get_pg_num(_pool.id)),
     _pool.id,
     p.shard),
@@ -166,6 +166,7 @@ PG::PG(OSDService *o, OSDMapRef curmap,
   role(0),
   state(0),
   send_notify(false),
+  pg_whoami(osd->whoami, p.shard),
   need_up_thru(false),
   last_peering_reset(0),
   heartbeat_peer_lock("PG::heartbeat_peer_lock"),
@@ -227,9 +228,12 @@ std::string PG::gen_prefix() const
   
 /********* PG **********/
 
-void PG::proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_missing_t& omissing, int from)
+void PG::proc_master_log(
+  ObjectStore::Transaction& t, pg_info_t &oinfo,
+  pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
 {
-  dout(10) << "proc_master_log for osd." << from << ": " << olog << " " << omissing << dendl;
+  dout(10) << "proc_master_log for osd." << from << ": "
+          << olog << " " << omissing << dendl;
   assert(!is_active() && is_primary());
 
   // merge log into our own log to build master log.  no need to
@@ -245,8 +249,10 @@ void PG::proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t
   peer_missing[from].swap(omissing);
 }
     
-void PG::proc_replica_log(ObjectStore::Transaction& t,
-                         pg_info_t &oinfo, pg_log_t &olog, pg_missing_t& omissing, int from)
+void PG::proc_replica_log(
+  ObjectStore::Transaction& t,
+  pg_info_t &oinfo, pg_log_t &olog, pg_missing_t& omissing,
+  pg_shard_t from)
 {
   dout(10) << "proc_replica_log for osd." << from << ": "
           << oinfo << " " << olog << " " << omissing << dendl;
@@ -267,9 +273,9 @@ void PG::proc_replica_log(ObjectStore::Transaction& t,
   peer_missing[from].swap(omissing);
 }
 
-bool PG::proc_replica_info(int from, const pg_info_t &oinfo)
+bool PG::proc_replica_info(pg_shard_t from, const pg_info_t &oinfo)
 {
-  map<int,pg_info_t>::iterator p = peer_info.find(from);
+  map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
   if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
     dout(10) << " got dup osd." << from << " info " << oinfo << ", identical to ours" << dendl;
     return false;
@@ -344,7 +350,7 @@ void PG::update_object_snap_mapping(
 }
 
 void PG::merge_log(
-  ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, int from)
+  ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_shard_t from)
 {
   PGLogEntryHandler rollbacker;
   pg_log.merge_log(
@@ -367,8 +373,9 @@ void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead)
  * TODO: if the missing set becomes very large, this could get expensive.
  * Instead, we probably want to just iterate over our unfound set.
  */
-bool PG::search_for_missing(const pg_info_t &oinfo, const pg_missing_t *omissing,
-                           int fromosd)
+bool PG::search_for_missing(
+  const pg_info_t &oinfo, const pg_missing_t *omissing,
+  pg_shard_t fromosd)
 {
   bool stats_updated = false;
   bool found_missing = false;
@@ -418,7 +425,7 @@ bool PG::search_for_missing(const pg_info_t &oinfo, const pg_missing_t *omissing
     dout(10) << "search_for_missing " << soid << " " << need
             << " is on osd." << fromosd << dendl;
 
-    map<hobject_t, set<int> >::iterator ml = missing_loc.find(soid);
+    map<hobject_t, set<pg_shard_t> >::iterator ml = missing_loc.find(soid);
     if (ml == missing_loc.end()) {
       map<hobject_t, list<OpRequestRef> >::iterator wmo =
        waiting_for_missing_object.find(soid);
@@ -443,7 +450,7 @@ bool PG::search_for_missing(const pg_info_t &oinfo, const pg_missing_t *omissing
   return found_missing;
 }
 
-void PG::discover_all_missing(map< int, map<pg_t,pg_query_t> > &query_map)
+void PG::discover_all_missing(map<int, map<spg_t,pg_query_t> > &query_map)
 {
   const pg_missing_t &missing = pg_log.get_missing();
   assert(missing.have_missing());
@@ -453,17 +460,17 @@ void PG::discover_all_missing(map< int, map<pg_t,pg_query_t> > &query_map)
           << get_num_unfound() << " unfound"
           << dendl;
 
-  std::set<int>::const_iterator m = might_have_unfound.begin();
-  std::set<int>::const_iterator mend = might_have_unfound.end();
+  std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
+  std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
   for (; m != mend; ++m) {
-    int peer(*m);
+    pg_shard_t peer(*m);
     
-    if (!get_osdmap()->is_up(peer)) {
+    if (!get_osdmap()->is_up(peer.osd)) {
       dout(20) << __func__ << " skipping down osd." << peer << dendl;
       continue;
     }
 
-    map<int, pg_info_t>::const_iterator iter = peer_info.find(peer);
+    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
     if (iter != peer_info.end() &&
         (iter->second.is_empty() || iter->second.dne())) {
       // ignore empty peers
@@ -493,8 +500,11 @@ void PG::discover_all_missing(map< int, map<pg_t,pg_query_t> > &query_map)
     dout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
             << dendl;
     peer_missing_requested.insert(peer);
-    query_map[peer][info.pgid] =
-      pg_query_t(pg_query_t::MISSING, info.history, get_osdmap()->get_epoch());
+    query_map[peer.osd][spg_t(info.pgid.pgid, peer.shard)] =
+      pg_query_t(
+       pg_query_t::MISSING,
+       peer.shard, pg_whoami.shard,
+       info.history, get_osdmap()->get_epoch());
   }
 }
 
@@ -513,13 +523,13 @@ bool PG::needs_recovery() const
   }
 
   assert(actingbackfill.size() > 0);
-  vector<int>::const_iterator end = actingbackfill.end();
-  vector<int>::const_iterator a = actingbackfill.begin();
+  set<pg_shard_t>::const_iterator end = actingbackfill.end();
+  set<pg_shard_t>::const_iterator a = actingbackfill.begin();
   assert(a != end);
-  ++a;
   for (; a != end; ++a) {
-    int peer = *a;
-    map<int, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
+    if (*a == get_primary()) continue;
+    pg_shard_t peer = *a;
+    map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
     if (pm == peer_missing.end()) {
       dout(10) << __func__ << " osd." << peer << " don't have missing set" << dendl;
       ret = true;
@@ -544,11 +554,11 @@ bool PG::needs_backfill() const
 
   // We can assume that only possible osds that need backfill
   // are on the backfill_targets vector nodes.
-  vector<int>::const_iterator end = backfill_targets.end();
-  vector<int>::const_iterator a = backfill_targets.begin();
+  set<pg_shard_t>::const_iterator end = backfill_targets.end();
+  set<pg_shard_t>::const_iterator a = backfill_targets.begin();
   for (; a != end; ++a) {
-    int peer = *a;
-    map<int,pg_info_t>::const_iterator pi = peer_info.find(peer);
+    pg_shard_t peer = *a;
+    map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
     if (!pi->second.last_backfill.is_max()) {
       dout(10) << __func__ << " osd." << peer << " has last_backfill " << pi->second.last_backfill << dendl;
       ret = true;
@@ -599,7 +609,7 @@ void PG::generate_past_intervals()
   vector<int> acting, up, old_acting, old_up;
 
   cur_map = osd->get_map(cur_epoch);
-  cur_map->pg_to_up_acting_osds(get_pgid(), up, acting);
+  cur_map->pg_to_up_acting_osds(get_pgid().pgid, up, acting);
   epoch_t same_interval_since = cur_epoch;
   dout(10) << __func__ << " over epochs " << cur_epoch << "-"
           << end_epoch << dendl;
@@ -610,7 +620,7 @@ void PG::generate_past_intervals()
     old_acting.swap(acting);
 
     cur_map = osd->get_map(cur_epoch);
-    cur_map->pg_to_up_acting_osds(get_pgid(), up, acting);
+    cur_map->pg_to_up_acting_osds(get_pgid().pgid, up, acting);
 
     std::stringstream debug;
     bool new_interval = pg_interval_t::check_new_interval(
@@ -623,7 +633,7 @@ void PG::generate_past_intervals()
       cur_map,
       last_map,
       info.pgid.pool(),
-      info.pgid,
+      info.pgid.pgid,
       &past_intervals,
       &debug);
     if (new_interval) {
@@ -672,9 +682,9 @@ void PG::remove_down_peer_info(const OSDMapRef osdmap)
 {
   // Remove any downed osds from peer_info
   bool removed = false;
-  map<int,pg_info_t>::iterator p = peer_info.begin();
+  map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
   while (p != peer_info.end()) {
-    if (!osdmap->is_up(p->first)) {
+    if (!osdmap->is_up(p->first.osd)) {
       dout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
       peer_missing.erase(p->first);
       peer_log_requested.erase(p->first);
@@ -698,16 +708,16 @@ bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const
 {
   assert(is_primary());
 
-  set<int>::const_iterator peer = might_have_unfound.begin();
-  set<int>::const_iterator mend = might_have_unfound.end();
+  set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
+  set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
   for (; peer != mend; ++peer) {
     if (peer_missing.count(*peer))
       continue;
-    map<int, pg_info_t>::const_iterator iter = peer_info.find(*peer);
+    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
     if (iter != peer_info.end() &&
         (iter->second.is_empty() || iter->second.dne()))
       continue;
-    const osd_info_t &osd_info(osdmap->get_info(*peer));
+    const osd_info_t &osd_info(osdmap->get_info(peer->osd));
     if (osd_info.lost_at <= osd_info.up_from) {
       // If there is even one OSD in might_have_unfound that isn't lost, we
       // still might retrieve our unfound.
@@ -723,18 +733,21 @@ void PG::build_prior(std::auto_ptr<PriorSet> &prior_set)
 {
   if (1) {
     // sanity check
-    for (map<int,pg_info_t>::iterator it = peer_info.begin();
+    for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
         it != peer_info.end();
         ++it) {
       assert(info.history.last_epoch_started >= it->second.history.last_epoch_started);
     }
   }
-  prior_set.reset(new PriorSet(*get_osdmap(),
-                                past_intervals,
-                                up,
-                                acting,
-                                info,
-                                this));
+  prior_set.reset(
+    new PriorSet(
+      pool.info.ec_pool(),
+      *get_osdmap(),
+      past_intervals,
+      up,
+      acting,
+      info,
+      this));
   PriorSet &prior(*prior_set.get());
                                 
   if (prior.pg_down) {
@@ -799,11 +812,12 @@ void PG::clear_primary_state()
  *  2) Prefer longer tail if it brings another info into contiguity
  *  3) Prefer current primary
  */
-map<int, pg_info_t>::const_iterator PG::find_best_info(const map<int, pg_info_t> &infos) const
+map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
+  const map<pg_shard_t, pg_info_t> &infos) const
 {
   eversion_t min_last_update_acceptable = eversion_t::max();
   epoch_t max_last_epoch_started_found = 0;
-  for (map<int, pg_info_t>::const_iterator i = infos.begin();
+  for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
        i != infos.end();
        ++i) {
     if (max_last_epoch_started_found < i->second.last_epoch_started) {
@@ -817,12 +831,12 @@ map<int, pg_info_t>::const_iterator PG::find_best_info(const map<int, pg_info_t>
   }
   assert(min_last_update_acceptable != eversion_t::max());
 
-  map<int, pg_info_t>::const_iterator best = infos.end();
+  map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
   // find osd with newest last_update (oldest for ec_pool).
   // if there are multiples, prefer
   //  - a longer tail, if it brings another peer into log contiguity
   //  - the current primary
-  for (map<int, pg_info_t>::const_iterator p = infos.begin();
+  for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
        p != infos.end();
        ++p) {
     // Only consider peers with last_update >= min_last_update_acceptable
@@ -852,7 +866,7 @@ map<int, pg_info_t>::const_iterator PG::find_best_info(const map<int, pg_info_t>
       }
     }
     // Prefer longer tail if it brings another peer into contiguity
-    for (map<int, pg_info_t>::const_iterator q = infos.begin();
+    for (map<pg_shard_t, pg_info_t>::const_iterator q = infos.begin();
         q != infos.end();
         ++q) {
       if (q->second.is_incomplete())
@@ -870,7 +884,7 @@ map<int, pg_info_t>::const_iterator PG::find_best_info(const map<int, pg_info_t>
       }
     }
     // prefer current primary (usually the caller), all things being equal
-    if (p->first == acting[0]) {
+    if (p->first == pg_whoami) {
       dout(10) << "calc_acting prefer osd." << p->first
               << " because it is current primary" << dendl;
       best = p;
@@ -887,20 +901,29 @@ map<int, pg_info_t>::const_iterator PG::find_best_info(const map<int, pg_info_t>
  * incomplete, or another osd has a longer tail that allows us to
  * bring other up nodes up to date.
  */
-bool PG::calc_acting(int& newest_update_osd_id, vector<int>& want, vector<int>& backfill) const
-{
-  map<int, pg_info_t> all_info(peer_info.begin(), peer_info.end());
-  all_info[osd->whoami] = info;
-
-  for (map<int,pg_info_t>::iterator p = all_info.begin(); p != all_info.end(); ++p) {
+bool PG::calc_acting(
+  pg_shard_t &auth_log_shard_id,
+  vector<int> &want,
+  set<pg_shard_t> &backfill) const
+{
+// TODOSAM: fix
+#if 0
+  map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
+  all_info[pg_whoami] = info;
+
+  for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
+       p != all_info.end();
+       ++p) {
     dout(10) << "calc_acting osd." << p->first << " " << p->second << dendl;
   }
 
-  map<int, pg_info_t>::const_iterator newest_update_osd = find_best_info(all_info);
+  map<pg_shard_t, pg_info_t>::const_iterator newest_update_osd =
+    find_best_info(all_info);
 
   if (newest_update_osd == all_info.end()) {
     if (up != acting) {
-      dout(10) << "calc_acting no suitable info found (incomplete backfills?), reverting to up" << dendl;
+      dout(10) << "calc_acting no suitable info found (incomplete backfills?),"
+              << " reverting to up" << dendl;
       want = up;
       return true;
     } else {
@@ -915,19 +938,19 @@ bool PG::calc_acting(int& newest_update_osd_id, vector<int>& want, vector<int>&
   newest_update_osd_id = newest_update_osd->first;
   
   // select primary
-  map<int,pg_info_t>::const_iterator primary;
+  map<pg_shard_t,pg_info_t>::const_iterator primary;
   if (up.size() &&
-      !all_info[up[0]].is_incomplete() &&
-      all_info[up[0]].last_update >= newest_update_osd->second.log_tail) {
-    dout(10) << "up[0](osd." << up[0] << ") selected as primary" << dendl;
-    primary = all_info.find(up[0]);         // prefer up[0], all thing being equal
+      !all_info[up_primary].is_incomplete() &&
+      all_info[up_primary].last_update >= newest_update_osd->second.log_tail) {
+    dout(10) << "up_primary: " << up_primary << ") selected as primary" << dendl;
+    primary = all_info.find(up_primary); // prefer up[0], all thing being equal
   } else if (!newest_update_osd->second.is_incomplete()) {
     dout(10) << "up[0] needs backfill, osd." << newest_update_osd_id
             << " selected as primary instead" << dendl;
     primary = newest_update_osd;
   } else {
-    map<int, pg_info_t> complete_infos;
-    for (map<int, pg_info_t>::iterator i = all_info.begin();
+    map<pg_shard_t, pg_info_t> complete_infos;
+    for (map<pg_shard_t, pg_info_t>::iterator i = all_info.begin();
         i != all_info.end();
         ++i) {
       if (!i->second.is_incomplete())
@@ -1002,7 +1025,7 @@ bool PG::calc_acting(int& newest_update_osd_id, vector<int>& want, vector<int>&
     }
   }
 
-  for (map<int,pg_info_t>::const_iterator i = all_info.begin();
+  for (map<pg_shard_t,pg_info_t>::const_iterator i = all_info.begin();
        i != all_info.end();
        ++i) {
     if (usable >= get_osdmap()->get_pg_size(info.pgid))
@@ -1027,6 +1050,7 @@ bool PG::calc_acting(int& newest_update_osd_id, vector<int>& want, vector<int>&
     }
   }
 
+#endif
   return true;
 }
 
@@ -1036,11 +1060,12 @@ bool PG::calc_acting(int& newest_update_osd_id, vector<int>& want, vector<int>&
  * calculate the desired acting, and request a change with the monitor
  * if it differs from the current acting.
  */
-bool PG::choose_acting(int& newest_update_osd)
+bool PG::choose_acting(pg_shard_t &auth_log_shard)
 {
-  vector<int> want, backfill;
+  vector<int> want;
+  set<pg_shard_t> backfill;
 
-  if (!calc_acting(newest_update_osd, want, backfill)) {
+  if (!calc_acting(auth_log_shard, want, backfill)) {
     dout(10) << "choose_acting failed" << dendl;
     assert(want_acting.empty());
     return false;
@@ -1049,6 +1074,8 @@ bool PG::choose_acting(int& newest_update_osd)
   // Determine if compatibility needed
   bool compat_mode = !cct->_conf->osd_debug_override_acting_compat;
 
+  //TODOSAM: fix
+#if 0
   if (compat_mode) {
     bool all_support = true;
     OSDMapRef osdmap = get_osdmap();
@@ -1069,9 +1096,12 @@ bool PG::choose_acting(int& newest_update_osd)
     if (all_support)
       compat_mode = false;
   }
+#endif
 
   if (compat_mode && !backfill.empty()) {
-      backfill.resize(1);
+    pg_shard_t only_backfill(*backfill.begin());
+    backfill.clear();
+    backfill.insert(only_backfill);
   }
 
   // This might cause a problem if min_size is large
@@ -1083,8 +1113,8 @@ bool PG::choose_acting(int& newest_update_osd)
     return false;
   }
 
-  if (compat_mode) {
-    want.insert(want.end(), backfill.begin(), backfill.end());
+  if (compat_mode && backfill.size()) {
+    want.insert(want.end(), backfill.begin()->osd);
   }
 
   if (want != acting) {
@@ -1097,30 +1127,42 @@ bool PG::choose_acting(int& newest_update_osd)
       // want is the same as crush map up OSDs.
       assert(compat_mode || backfill.empty());
       vector<int> empty;
-      osd->queue_want_pg_temp(info.pgid, empty);
+      osd->queue_want_pg_temp(info.pgid.pgid, empty);
     } else
-      osd->queue_want_pg_temp(info.pgid, want);
+      osd->queue_want_pg_temp(info.pgid.pgid, want);
     return false;
   }
   want_acting.clear();
   // We can only get here when new interval has arrived and
   // we've accepted the acting set.  Now we can create
   // actingbackfill and backfill_targets vectors.
-  actingbackfill = acting;
+  actingbackfill.clear();
+  for (unsigned i = 0; i < acting.size(); ++i) {
+    if (acting[i] != -1) {
+      actingbackfill.insert(
+       pg_shard_t(
+         acting[i],
+         pool.info.ec_pool() ? i : ghobject_t::NO_SHARD));
+    }
+  }
   if (!compat_mode)
-    actingbackfill.insert(actingbackfill.end(), backfill.begin(), backfill.end());
+    actingbackfill.insert(backfill.begin(), backfill.end());
   assert(backfill_targets.empty() || backfill_targets == backfill);
   if (backfill_targets.empty()) {
     backfill_targets = backfill;
-    for (unsigned i = 0; i < backfill.size() ; ++i) {
-      stray_set.erase(backfill[i]);
+    for (set<pg_shard_t>::iterator i = backfill.begin();
+        i != backfill.end();
+        ++i) {
+      stray_set.erase(*i);
     }
   } else {
     // Will not change if already set because up would have had to change
     assert(backfill_targets == backfill);
     // Verify that nothing in backfill is in stray_set
-    for (unsigned i = 0; i < backfill.size() ; ++i) {
-      assert(stray_set.find(backfill[i]) == stray_set.end());
+    for (set<pg_shard_t>::iterator i = backfill.begin();
+        i != backfill.end();
+        ++i) {
+      assert(stray_set.find(*i) == stray_set.end());
     }
   }
   dout(10) << "choose_acting want " << want << " (== acting) backfill_targets " 
@@ -1160,16 +1202,20 @@ void PG::build_might_have_unfound()
     if (!interval.maybe_went_rw)
       continue;
 
+    int i = 0;
     std::vector<int>::const_iterator a = interval.acting.begin();
     std::vector<int>::const_iterator a_end = interval.acting.end();
-    for (; a != a_end; ++a) {
+    for (; a != a_end; ++a, ++i) {
       if (*a != osd->whoami)
-       might_have_unfound.insert(*a);
+       might_have_unfound.insert(
+         pg_shard_t(
+           *a,
+           pool.info.ec_pool() ? i : ghobject_t::NO_SHARD));
     }
   }
 
   // include any (stray) peers
-  for (map<int,pg_info_t>::iterator p = peer_info.begin();
+  for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
        p != peer_info.end();
        ++p)
     might_have_unfound.insert(p->first);
@@ -1190,8 +1236,11 @@ struct C_PG_ActivateCommitted : public Context {
 void PG::activate(ObjectStore::Transaction& t,
                  epoch_t query_epoch,
                  list<Context*>& tfin,
-                 map< int, map<pg_t,pg_query_t> >& query_map,
-                 map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > *activator_map)
+                 map<int, map<spg_t,pg_query_t> >& query_map,
+                 map<int,
+                     vector<
+                       pair<pg_notify_t,
+                            pg_interval_map_t> > > *activator_map)
 {
   assert(!is_active());
   assert(scrubber.callbacks.empty());
@@ -1209,7 +1258,8 @@ void PG::activate(ObjectStore::Transaction& t,
 
     // TODOSAM: osd->osd-> is no good
     osd->osd->replay_queue_lock.Lock();
-    osd->osd->replay_queue.push_back(pair<pg_t,utime_t>(info.pgid, replay_until));
+    osd->osd->replay_queue.push_back(pair<spg_t,utime_t>(
+       info.pgid, replay_until));
     osd->osd->replay_queue_lock.Unlock();
   }
 
@@ -1286,8 +1336,11 @@ void PG::activate(ObjectStore::Transaction& t,
     // start up replicas
 
     assert(actingbackfill.size() > 0);
-    for (unsigned i=1; i<actingbackfill.size(); i++) {
-      int peer = actingbackfill[i];
+    for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+        i != actingbackfill.end();
+        ++i) {
+      if (*i == pg_whoami) continue;
+      pg_shard_t peer = *i;
       assert(peer_info.count(peer));
       pg_info_t& pi = peer_info[peer];
 
@@ -1302,16 +1355,19 @@ void PG::activate(ObjectStore::Transaction& t,
         // empty log
        if (!pi.is_empty() && activator_map) {
          dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl;
-         (*activator_map)[peer].push_back(
+         (*activator_map)[peer.osd].push_back(
            make_pair(
              pg_notify_t(
+               peer.shard, pg_whoami.shard,
                get_osdmap()->get_epoch(),
                get_osdmap()->get_epoch(),
                info),
              past_intervals));
        } else {
          dout(10) << "activate peer osd." << peer << " is up to date, but sending pg_log anyway" << dendl;
-         m = new MOSDPGLog(get_osdmap()->get_epoch(), info);
+         m = new MOSDPGLog(
+           i->shard, pg_whoami.shard,
+           get_osdmap()->get_epoch(), info);
        }
       } else if (pg_log.get_tail() > pi.last_update || pi.last_backfill == hobject_t()) {
        // backfill
@@ -1325,7 +1381,9 @@ void PG::activate(ObjectStore::Transaction& t,
        pi.history = info.history;
        pi.stats.stats.clear();
 
-       m = new MOSDPGLog(get_osdmap()->get_epoch(), pi);
+       m = new MOSDPGLog(
+         i->shard, pg_whoami.shard,
+         get_osdmap()->get_epoch(), pi);
 
        // send some recent log, so that op dup detection works well.
        m->log.copy_up_to(pg_log.get_log(), cct->_conf->osd_min_pg_log_entries);
@@ -1336,7 +1394,9 @@ void PG::activate(ObjectStore::Transaction& t,
       } else {
        // catch up
        assert(pg_log.get_tail() <= pi.last_update);
-       m = new MOSDPGLog(get_osdmap()->get_epoch(), info);
+       m = new MOSDPGLog(
+         i->shard, pg_whoami.shard,
+         get_osdmap()->get_epoch(), info);
        // send new stuff to append to replicas log
        m->log.copy_after(pg_log.get_log(), pi.last_update);
       }
@@ -1359,7 +1419,7 @@ void PG::activate(ObjectStore::Transaction& t,
       if (m) {
        dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl;
        //m->log.print(cout);
-       osd->send_message_osd_cluster(peer, m, get_osdmap()->get_epoch());
+       osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
       }
 
       // peer now has 
@@ -1375,7 +1435,7 @@ void PG::activate(ObjectStore::Transaction& t,
     }
 
     // degraded?
-    if (get_osdmap()->get_pg_size(info.pgid) > acting.size())
+    if (get_osdmap()->get_pg_size(info.pgid.pgid) > acting.size())
       state_set(PG_STATE_DEGRADED);
 
     // all clean?
@@ -1520,7 +1580,7 @@ void PG::_activate_committed(epoch_t e)
   if (pg_has_reset_since(e)) {
     dout(10) << "_activate_committed " << e << ", that was an old interval" << dendl;
   } else if (is_primary()) {
-    peer_activated.insert(osd->whoami);
+    peer_activated.insert(pg_whoami);
     dout(10) << "_activate_committed " << e << " peer_activated now " << peer_activated 
             << " last_epoch_started " << info.history.last_epoch_started
             << " same_interval_since " << info.history.same_interval_since << dendl;
@@ -1530,12 +1590,14 @@ void PG::_activate_committed(epoch_t e)
   } else {
     dout(10) << "_activate_committed " << e << " telling primary" << dendl;
     MOSDPGInfo *m = new MOSDPGInfo(e);
-    pg_notify_t i = pg_notify_t(get_osdmap()->get_epoch(),
-                               get_osdmap()->get_epoch(),
-                               info);
+    pg_notify_t i = pg_notify_t(
+      get_primary().shard, pg_whoami.shard,
+      get_osdmap()->get_epoch(),
+      get_osdmap()->get_epoch(),
+      info);
     i.info.history.last_epoch_started = e;
     m->pg_list.push_back(make_pair(i, pg_interval_map_t()));
-    osd->send_message_osd_cluster(acting[0], m, get_osdmap()->get_epoch());
+    osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap()->get_epoch());
   }
 
   if (dirty_info) {
@@ -1614,7 +1676,7 @@ void PG::mark_clean()
 {
   // only mark CLEAN if we have the desired number of replicas AND we
   // are not remapped.
-  if (acting.size() == get_osdmap()->get_pg_size(info.pgid) &&
+  if (acting.size() == get_osdmap()->get_pg_size(info.pgid.pgid) &&
       up == acting)
     state_set(PG_STATE_CLEAN);
 
@@ -1741,7 +1803,7 @@ static void split_replay_queue(
 }
 
 void PG::split_ops(PG *child, unsigned split_bits) {
-  unsigned match = child->info.pgid.m_seed;
+  unsigned match = child->info.pgid.ps();
   assert(waiting_for_all_missing.empty());
   assert(waiting_for_cache_not_full.empty());
   assert(waiting_for_missing_object.empty());
@@ -1795,7 +1857,15 @@ void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
   child->snap_trimq = snap_trimq;
 
   // There can't be recovery/backfill going on now
-  get_osdmap()->pg_to_up_acting_osds(child->info.pgid, child->up, child->acting);
+  int primary, up_primary;
+  vector<int> newup, newacting;
+  get_osdmap()->pg_to_up_acting_osds(
+    child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
+  child->init_primary_up_acting(
+    newup,
+    newacting,
+    up_primary,
+    primary);
   child->role = OSDMap::calc_pg_role(osd->whoami, child->acting);
   if (get_primary() != child->get_primary())
     child->info.history.same_primary_since = get_osdmap()->get_epoch();
@@ -1846,17 +1916,18 @@ void PG::purge_strays()
   dout(10) << "purge_strays " << stray_set << dendl;
   
   bool removed = false;
-  for (set<int>::iterator p = stray_set.begin();
+  for (set<pg_shard_t>::iterator p = stray_set.begin();
        p != stray_set.end();
        ++p) {
-    if (get_osdmap()->is_up(*p)) {
+    assert(!is_actingbackfill(*p));
+    if (get_osdmap()->is_up(p->osd)) {
       dout(10) << "sending PGRemove to osd." << *p << dendl;
-      vector<pg_t> to_remove;
-      to_remove.push_back(info.pgid);
+      vector<spg_t> to_remove;
+      to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
       MOSDPGRemove *m = new MOSDPGRemove(
        get_osdmap()->get_epoch(),
        to_remove);
-      osd->send_message_osd_cluster(*p, m, get_osdmap()->get_epoch());
+      osd->send_message_osd_cluster(p->osd, m, get_osdmap()->get_epoch());
       stray_purged.insert(*p);
     } else {
       dout(10) << "not sending PGRemove to down osd." << *p << dendl;
@@ -1878,10 +1949,15 @@ void PG::purge_strays()
   peer_missing_requested.clear();
 }
 
-void PG::set_probe_targets(const set<int> &probe_set)
+void PG::set_probe_targets(const set<pg_shard_t> &probe_set)
 {
   Mutex::Locker l(heartbeat_peer_lock);
-  probe_targets = probe_set;
+  probe_targets.clear();
+  for (set<pg_shard_t>::iterator i = probe_set.begin();
+       i != probe_set.end();
+       ++i) {
+    probe_targets.insert(i->osd);
+  }
 }
 
 void PG::clear_probe_targets()
@@ -1900,8 +1976,10 @@ void PG::update_heartbeat_peers()
       new_peers.insert(acting[i]);
     for (unsigned i=0; i<up.size(); i++)
       new_peers.insert(up[i]);
-    for (map<int,pg_info_t>::iterator p = peer_info.begin(); p != peer_info.end(); ++p)
-      new_peers.insert(p->first);
+    for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
+        p != peer_info.end();
+        ++p)
+      new_peers.insert(p->first.osd);
   }
 
   bool need_update = false;
@@ -1937,7 +2015,8 @@ void PG::_update_calc_stats()
   info.stats.ondisk_log_start = pg_log.get_tail();
 
   // calc copies, degraded
-  unsigned target = MAX(get_osdmap()->get_pg_size(info.pgid), actingbackfill.size());
+  unsigned target = MAX(
+    get_osdmap()->get_pg_size(info.pgid.pgid), actingbackfill.size());
   info.stats.stats.calc_copies(target);
   info.stats.stats.sum.num_objects_degraded = 0;
   if ((is_degraded() || !is_clean()) && is_active()) {
@@ -1957,14 +2036,17 @@ void PG::_update_calc_stats()
     degraded += pg_log.get_missing().num_missing();
 
     assert(actingbackfill.size() > 0);
-    for (unsigned i=1; i<actingbackfill.size(); i++) {
-      assert(peer_missing.count(actingbackfill[i]));
+    for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+        i != actingbackfill.end();
+        ++i) {
+      if (*i == pg_whoami) continue;
+      assert(peer_missing.count(*i));
 
       // in missing set
-      degraded += peer_missing[actingbackfill[i]].num_missing();
+      degraded += peer_missing[*i].num_missing();
 
       // not yet backfilled
-      degraded += num_objects - peer_info[actingbackfill[i]].stats.stats.sum.num_objects;
+      degraded += num_objects - peer_info[*i].stats.stats.sum.num_objects;
     }
     info.stats.stats.sum.num_objects_degraded = degraded;
     info.stats.stats.sum.num_objects_unfound = get_num_unfound();
@@ -2041,11 +2123,14 @@ void PG::clear_publish_stats()
  * @param backfill true if info should be marked as backfill
  * @param t transaction to write out our new state in
  */
-void PG::init(int role, vector<int>& newup, vector<int>& newacting,
-             pg_history_t& history,
-             pg_interval_map_t& pi,
-             bool backfill,
-             ObjectStore::Transaction *t)
+void PG::init(
+  int role,
+  vector<int>& newup, int new_up_primary,
+  vector<int>& newacting, int new_acting_primary,
+  pg_history_t& history,
+  pg_interval_map_t& pi,
+  bool backfill,
+  ObjectStore::Transaction *t)
 {
   dout(10) << "init role " << role << " up " << newup << " acting " << newacting
           << " history " << history
@@ -2055,6 +2140,11 @@ void PG::init(int role, vector<int>& newup, vector<int>& newacting,
   set_role(role);
   acting = newacting;
   up = newup;
+  init_primary_up_acting(
+    newup,
+    newacting,
+    new_up_primary,
+    new_acting_primary);
 
   info.history = history;
   past_intervals.swap(pi);
@@ -2272,7 +2362,7 @@ void PG::write_info(ObjectStore::Transaction& t)
 epoch_t PG::peek_map_epoch(ObjectStore *store, coll_t coll, hobject_t &infos_oid, bufferlist *bl)
 {
   assert(bl);
-  pg_t pgid;
+  spg_t pgid;
   snapid_t snap;
   bool ok = coll.is_pg(pgid, snap);
   assert(ok);
@@ -2316,11 +2406,18 @@ void PG::trim_peers()
   dout(10) << "trim_peers " << pg_trim_to << dendl;
   if (pg_trim_to != eversion_t()) {
     assert(actingbackfill.size() > 0);
-    for (unsigned i=1; i<actingbackfill.size(); i++)
-      osd->send_message_osd_cluster(actingbackfill[i],
-                                   new MOSDPGTrim(get_osdmap()->get_epoch(), info.pgid,
-                                                  pg_trim_to),
-                                   get_osdmap()->get_epoch());
+    for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+        i != actingbackfill.end();
+        ++i) {
+      if (*i == pg_whoami) continue;
+      osd->send_message_osd_cluster(
+       i->osd,
+       new MOSDPGTrim(
+         get_osdmap()->get_epoch(),
+         spg_t(info.pgid.pgid, i->shard),
+         pg_trim_to),
+       get_osdmap()->get_epoch());
+    }
   }
 }
 
@@ -2395,8 +2492,9 @@ std::string PG::get_corrupt_pg_log_name() const
     dout(0) << "strftime failed" << dendl;
     return "corrupt_log_unknown_time";
   }
-  info.pgid.print(buf + ret, MAX_BUF - ret);
-  return buf;
+  string out(buf);
+  out += stringify(info.pgid);
+  return out;
 }
 
 int PG::read_info(
@@ -2679,7 +2777,7 @@ bool PG::sched_scrub()
     if (osd->inc_scrubs_pending()) {
       dout(20) << "sched_scrub: reserved locally, reserving replicas" << dendl;
       scrubber.reserved = true;
-      scrubber.reserved_peers.insert(osd->whoami);
+      scrubber.reserved_peers.insert(pg_whoami);
       scrub_reserve_replicas();
     } else {
       dout(20) << "sched_scrub: failed to reserve locally" << dendl;
@@ -2737,28 +2835,28 @@ void PG::sub_op_scrub_map(OpRequestRef op)
 
   op->mark_started();
 
-  int from = m->get_source().num();
-
-  dout(10) << " got osd." << from << " scrub map" << dendl;
+  dout(10) << " got " << m->from << " scrub map" << dendl;
   bufferlist::iterator p = m->get_data().begin();
 
   if (scrubber.is_chunky) { // chunky scrub
-    scrubber.received_maps[from].decode(p, info.pgid.pool());
-    dout(10) << "map version is " << scrubber.received_maps[from].valid_through << dendl;
+    scrubber.received_maps[m->from].decode(p, info.pgid.pool());
+    dout(10) << "map version is "
+            << scrubber.received_maps[m->from].valid_through
+            << dendl;
   } else {               // classic scrub
-    if (scrubber.received_maps.count(from)) {
+    if (scrubber.received_maps.count(m->from)) {
       ScrubMap incoming;
       incoming.decode(p, info.pgid.pool());
-      dout(10) << "from replica " << from << dendl;
+      dout(10) << "from replica " << m->from << dendl;
       dout(10) << "map version is " << incoming.valid_through << dendl;
-      scrubber.received_maps[from].merge_incr(incoming);
+      scrubber.received_maps[m->from].merge_incr(incoming);
     } else {
-      scrubber.received_maps[from].decode(p, info.pgid.pool());
+      scrubber.received_maps[m->from].decode(p, info.pgid.pool());
     }
   }
 
   --scrubber.waiting_on;
-  scrubber.waiting_on_whom.erase(from);
+  scrubber.waiting_on_whom.erase(m->from);
 
   if (scrubber.waiting_on == 0) {
     if (scrubber.is_chunky) { // chunky scrub
@@ -2772,7 +2870,7 @@ void PG::sub_op_scrub_map(OpRequestRef op)
           scrubber.finalizing = true;
           scrub_gather_replica_maps();
           ++scrubber.waiting_on;
-          scrubber.waiting_on_whom.insert(osd->whoami);
+          scrubber.waiting_on_whom.insert(pg_whoami);
           osd->scrub_wq.queue(this);
         }
       }
@@ -2781,27 +2879,33 @@ void PG::sub_op_scrub_map(OpRequestRef op)
 }
 
 // send scrub v2-compatible messages (classic scrub)
-void PG::_request_scrub_map_classic(int replica, eversion_t version)
+void PG::_request_scrub_map_classic(pg_shard_t replica, eversion_t version)
 {
-  assert(replica != osd->whoami);
+  assert(replica != pg_whoami);
   dout(10) << "scrub  requesting scrubmap from osd." << replica << dendl;
-  MOSDRepScrub *repscrubop = new MOSDRepScrub(info.pgid, version,
-                                             last_update_applied,
-                                              get_osdmap()->get_epoch());
-  osd->send_message_osd_cluster(replica, repscrubop, get_osdmap()->get_epoch());
+  MOSDRepScrub *repscrubop =
+    new MOSDRepScrub(
+      spg_t(info.pgid.pgid, replica.shard), version,
+      last_update_applied,
+      get_osdmap()->get_epoch());
+  osd->send_message_osd_cluster(
+    replica.osd, repscrubop, get_osdmap()->get_epoch());
 }
 
 // send scrub v3 messages (chunky scrub)
-void PG::_request_scrub_map(int replica, eversion_t version,
-                            hobject_t start, hobject_t end,
-                            bool deep)
+void PG::_request_scrub_map(
+  pg_shard_t replica, eversion_t version,
+  hobject_t start, hobject_t end,
+  bool deep)
 {
-  assert(replica != osd->whoami);
+  assert(replica != pg_whoami);
   dout(10) << "scrub  requesting scrubmap from osd." << replica << dendl;
-  MOSDRepScrub *repscrubop = new MOSDRepScrub(info.pgid, version,
-                                              get_osdmap()->get_epoch(),
-                                              start, end, deep);
-  osd->send_message_osd_cluster(replica, repscrubop, get_osdmap()->get_epoch());
+  MOSDRepScrub *repscrubop = new MOSDRepScrub(
+    spg_t(info.pgid.pgid, replica.shard), version,
+    get_osdmap()->get_epoch(),
+    start, end, deep);
+  osd->send_message_osd_cluster(
+    replica.osd, repscrubop, get_osdmap()->get_epoch());
 }
 
 void PG::sub_op_scrub_reserve(OpRequestRef op)
@@ -2819,7 +2923,8 @@ void PG::sub_op_scrub_reserve(OpRequestRef op)
 
   scrubber.reserved = osd->inc_scrubs_pending();
 
-  MOSDSubOpReply *reply = new MOSDSubOpReply(m, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
+  MOSDSubOpReply *reply = new MOSDSubOpReply(
+    m, pg_whoami, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
   ::encode(scrubber.reserved, reply->get_data());
   osd->send_message_osd_cluster(reply, m->get_connection());
 }
@@ -2837,7 +2942,7 @@ void PG::sub_op_scrub_reserve_reply(OpRequestRef op)
 
   op->mark_started();
 
-  int from = reply->get_source().num();
+  pg_shard_t from = reply->from;
   bufferlist::iterator p = reply->get_data().begin();
   bool reserved;
   ::decode(reserved, p);
@@ -2878,17 +2983,18 @@ void PG::sub_op_scrub_stop(OpRequestRef op)
   // see comment in sub_op_scrub_reserve
   scrubber.reserved = false;
 
-  MOSDSubOpReply *reply = new MOSDSubOpReply(m, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
+  MOSDSubOpReply *reply = new MOSDSubOpReply(
+    m, pg_whoami, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
   osd->send_message_osd_cluster(reply, m->get_connection());
 }
 
 void PG::reject_reservation()
 {
   osd->send_message_osd_cluster(
-    acting[0],
+    primary.osd,
     new MBackfillReserve(
       MBackfillReserve::REJECT,
-      info.pgid,
+      spg_t(info.pgid.pgid, primary.shard),
       get_osdmap()->get_epoch()),
     get_osdmap()->get_epoch());
 }
@@ -2918,34 +3024,43 @@ void PG::clear_scrub_reserved()
 void PG::scrub_reserve_replicas()
 {
   assert(backfill_targets.empty());
-  for (unsigned i=1; i<acting.size(); i++) {
-    dout(10) << "scrub requesting reserve from osd." << acting[i] << dendl;
+  for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+       i != actingbackfill.end();
+       ++i) {
+    if (*i == pg_whoami) continue;
+    dout(10) << "scrub requesting reserve from osd." << *i << dendl;
     vector<OSDOp> scrub(1);
     scrub[0].op.op = CEPH_OSD_OP_SCRUB_RESERVE;
     hobject_t poid;
     eversion_t v;
     osd_reqid_t reqid;
-    MOSDSubOp *subop = new MOSDSubOp(reqid, info.pgid, poid, false, 0,
-                                     get_osdmap()->get_epoch(), osd->get_tid(), v);
+    MOSDSubOp *subop = new MOSDSubOp(
+      reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, false, 0,
+      get_osdmap()->get_epoch(), osd->get_tid(), v);
     subop->ops = scrub;
-    osd->send_message_osd_cluster(acting[i], subop, get_osdmap()->get_epoch());
+    osd->send_message_osd_cluster(
+      i->osd, subop, get_osdmap()->get_epoch());
   }
 }
 
 void PG::scrub_unreserve_replicas()
 {
   assert(backfill_targets.empty());
-  for (unsigned i=1; i<acting.size(); i++) {
-    dout(10) << "scrub requesting unreserve from osd." << acting[i] << dendl;
+  for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+       i != actingbackfill.end();
+       ++i) {
+    if (*i == pg_whoami) continue;
+    dout(10) << "scrub requesting unreserve from osd." << *i << dendl;
     vector<OSDOp> scrub(1);
     scrub[0].op.op = CEPH_OSD_OP_SCRUB_UNRESERVE;
     hobject_t poid;
     eversion_t v;
     osd_reqid_t reqid;
-    MOSDSubOp *subop = new MOSDSubOp(reqid, info.pgid, poid, false, 0,
-                                     get_osdmap()->get_epoch(), osd->get_tid(), v);
+    MOSDSubOp *subop = new MOSDSubOp(
+      reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, false, 0,
+      get_osdmap()->get_epoch(), osd->get_tid(), v);
     subop->ops = scrub;
-    osd->send_message_osd_cluster(acting[i], subop, get_osdmap()->get_epoch());
+    osd->send_message_osd_cluster(i->osd, subop, get_osdmap()->get_epoch());
   }
 }
 
@@ -3127,14 +3242,17 @@ void PG::build_inc_scrub_map(
   osd->store->collection_getattrs(coll, map.attrs);
 }
 
-void PG::repair_object(const hobject_t& soid, ScrubMap::object *po, int bad_peer, int ok_peer)
+void PG::repair_object(
+  const hobject_t& soid, ScrubMap::object *po,
+  pg_shard_t bad_peer, pg_shard_t ok_peer)
 {
-  dout(10) << "repair_object " << soid << " bad_peer osd." << bad_peer << " ok_peer osd." << ok_peer << dendl;
+  dout(10) << "repair_object " << soid << " bad_peer osd."
+          << bad_peer << " ok_peer osd." << ok_peer << dendl;
   eversion_t v;
   bufferlist bv;
   bv.push_back(po->attrs[OI_ATTR]);
   object_info_t oi(bv);
-  if (bad_peer != acting[0]) {
+  if (bad_peer != primary) {
     peer_missing[bad_peer].add(soid, oi.version, eversion_t());
   } else {
     // We should only be scrubbing if the PG is clean.
@@ -3237,8 +3355,16 @@ void PG::replica_scrub(
   hobject_t poid;
   eversion_t v;
   osd_reqid_t reqid;
-  MOSDSubOp *subop = new MOSDSubOp(reqid, info.pgid, poid, false, 0,
-                                  msg->map_epoch, osd->get_tid(), v);
+  MOSDSubOp *subop = new MOSDSubOp(
+    reqid,
+    pg_whoami,
+    spg_t(info.pgid.pgid, get_primary().shard),
+    poid,
+    false,
+    0,
+    msg->map_epoch,
+    osd->get_tid(),
+    v);
   ::encode(map, subop->get_data());
   subop->ops = scrub;
 
@@ -3367,11 +3493,16 @@ void PG::classic_scrub(ThreadPool::TPHandle &handle)
      * last_update_applied == info.last_update)
      */
     scrubber.waiting_on = acting.size();
-    scrubber.waiting_on_whom.insert(acting.begin(), acting.end());
+    scrubber.waiting_on_whom.insert(
+      actingbackfill.begin(), actingbackfill.end());
+    scrubber.waiting_on_whom.erase(pg_whoami);
 
     // request maps from replicas
-    for (unsigned i=1; i<acting.size(); i++) {
-      _request_scrub_map_classic(acting[i], eversion_t());
+    for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+        i != actingbackfill.end();
+        ++i) {
+      if (*i == pg_whoami) continue;
+      _request_scrub_map_classic(*i, eversion_t());
     }
 
     // Unlocks and relocks...
@@ -3386,7 +3517,7 @@ void PG::classic_scrub(ThreadPool::TPHandle &handle)
     }
 
     --scrubber.waiting_on;
-    scrubber.waiting_on_whom.erase(osd->whoami);
+    scrubber.waiting_on_whom.erase(pg_whoami);
 
     if (scrubber.waiting_on == 0) {
       // the replicas have completed their scrub map, so lock out writes
@@ -3406,7 +3537,7 @@ void PG::classic_scrub(ThreadPool::TPHandle &handle)
     // request incrementals from replicas
     scrub_gather_replica_maps();
     ++scrubber.waiting_on;
-    scrubber.waiting_on_whom.insert(osd->whoami);
+    scrubber.waiting_on_whom.insert(pg_whoami);
   }
     
   dout(10) << "clean up scrub" << dendl;
@@ -3428,7 +3559,7 @@ void PG::classic_scrub(ThreadPool::TPHandle &handle)
   }
   
   --scrubber.waiting_on;
-  scrubber.waiting_on_whom.erase(osd->whoami);
+  scrubber.waiting_on_whom.erase(pg_whoami);
   if (scrubber.waiting_on == 0) {
     assert(last_update_applied == info.last_update);
     osd->scrub_finalize_wq.queue(this);
@@ -3603,14 +3734,17 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
         }
 
         // ask replicas to wait until last_update_applied >= scrubber.subset_last_update and then scan
-        scrubber.waiting_on_whom.insert(osd->whoami);
+        scrubber.waiting_on_whom.insert(pg_whoami);
         ++scrubber.waiting_on;
 
         // request maps from replicas
-        for (unsigned i=1; i<acting.size(); i++) {
-          _request_scrub_map(acting[i], scrubber.subset_last_update,
+       for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+            i != actingbackfill.end();
+            ++i) {
+         if (*i == pg_whoami) continue;
+          _request_scrub_map(*i, scrubber.subset_last_update,
                              scrubber.start, scrubber.end, scrubber.deep);
-          scrubber.waiting_on_whom.insert(acting[i]);
+          scrubber.waiting_on_whom.insert(*i);
           ++scrubber.waiting_on;
         }
 
@@ -3653,7 +3787,7 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
         }
 
         --scrubber.waiting_on;
-        scrubber.waiting_on_whom.erase(osd->whoami);
+        scrubber.waiting_on_whom.erase(pg_whoami);
 
         scrubber.state = PG::Scrubber::WAIT_REPLICAS;
         break;
@@ -3735,7 +3869,7 @@ bool PG::scrub_gather_replica_maps()
   assert(scrubber.waiting_on == 0);
   assert(_lock.is_locked());
 
-  for (map<int,ScrubMap>::iterator p = scrubber.received_maps.begin();
+  for (map<pg_shard_t, ScrubMap>::iterator p = scrubber.received_maps.begin();
        p != scrubber.received_maps.end();
        ++p) {
     
@@ -3754,8 +3888,6 @@ bool PG::scrub_gather_replica_maps()
   }
 }
 
-
-
 void PG::scrub_compare_maps() 
 {
   dout(10) << "scrub_compare_maps has maps, analyzing" << dendl;
@@ -3769,16 +3901,21 @@ void PG::scrub_compare_maps()
     stringstream ss;
 
     // Map from object with errors to good peer
-    map<hobject_t, int> authoritative;
-    map<int,ScrubMap *> maps;
+    map<hobject_t, pg_shard_t> authoritative;
+    map<pg_shard_t, ScrubMap *> maps;
 
     dout(2) << "scrub   osd." << acting[0] << " has " 
            << scrubber.primary_scrubmap.objects.size() << " items" << dendl;
-    maps[0] = &scrubber.primary_scrubmap;
-    for (unsigned i=1; i<acting.size(); i++) {
-      dout(2) << "scrub   osd." << acting[i] << " has " 
-             << scrubber.received_maps[acting[i]].objects.size() << " items" << dendl;
-      maps[i] = &scrubber.received_maps[acting[i]];
+    maps[pg_whoami] = &scrubber.primary_scrubmap;
+
+    for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+        i != actingbackfill.end();
+        ++i) {
+      if (*i == pg_whoami) continue;
+      dout(2) << "scrub replica " << *i << " has "
+             << scrubber.received_maps[*i].objects.size()
+             << " items" << dendl;
+      maps[*i] = &scrubber.received_maps[*i];
     }
 
     get_pgbackend()->be_compare_scrubmaps(
@@ -3797,7 +3934,7 @@ void PG::scrub_compare_maps()
       osd->clog.error(ss);
     }
 
-    for (map<hobject_t, int>::iterator i = authoritative.begin();
+    for (map<hobject_t, pg_shard_t>::iterator i = authoritative.begin();
         i != authoritative.end();
         ++i) {
       scrubber.authoritative.insert(
@@ -3806,7 +3943,7 @@ void PG::scrub_compare_maps()
          make_pair(maps[i->second]->objects[i->first], i->second)));
     }
 
-    for (map<hobject_t, int>::iterator i = authoritative.begin();
+    for (map<hobject_t, pg_shard_t>::iterator i = authoritative.begin();
         i != authoritative.end();
         ++i) {
       authmap.objects.erase(i->first);
@@ -3827,11 +3964,11 @@ void PG::scrub_process_inconsistent()
 
   if (!scrubber.authoritative.empty() || !scrubber.inconsistent.empty()) {
     stringstream ss;
-    for (map<hobject_t, set<int> >::iterator obj =
+    for (map<hobject_t, set<pg_shard_t> >::iterator obj =
           scrubber.inconsistent_snapcolls.begin();
         obj != scrubber.inconsistent_snapcolls.end();
         ++obj) {
-      for (set<int>::iterator j = obj->second.begin();
+      for (set<pg_shard_t>::iterator j = obj->second.begin();
           j != obj->second.end();
           ++j) {
        ++scrubber.shallow_errors;
@@ -3840,26 +3977,28 @@ void PG::scrub_process_inconsistent()
       }
     }
 
-    ss << info.pgid << " " << mode << " " << scrubber.missing.size() << " missing, "
+    ss << info.pgid << " " << mode << " "
+       << scrubber.missing.size() << " missing, "
        << scrubber.inconsistent.size() << " inconsistent objects\n";
     dout(2) << ss.str() << dendl;
     osd->clog.error(ss);
     if (repair) {
       state_clear(PG_STATE_CLEAN);
-      for (map<hobject_t, pair<ScrubMap::object, int> >::iterator i =
+      for (map<hobject_t, pair<ScrubMap::object, pg_shard_t> >::iterator i =
             scrubber.authoritative.begin();
           i != scrubber.authoritative.end();
           ++i) {
-       set<int>::iterator j;
+       set<pg_shard_t>::iterator j;
        
        if (scrubber.missing.count(i->first)) {
          for (j = scrubber.missing[i->first].begin();
               j != scrubber.missing[i->first].end(); 
               ++j) {
-           repair_object(i->first, 
+           repair_object(
+             i->first,
              &(i->second.first),
-             acting[*j],
-             acting[i->second.second]);
+             *j,
+             i->second.second);
            ++scrubber.fixed;
          }
        }
@@ -3869,8 +4008,8 @@ void PG::scrub_process_inconsistent()
               ++j) {
            repair_object(i->first, 
              &(i->second.first),
-             acting[*j],
-             acting[i->second.second]);
+             *j,
+             i->second.second);
            ++scrubber.fixed;
          }
        }
@@ -4012,21 +4151,25 @@ void PG::share_pg_info()
 
   // share new pg_info_t with replicas
   assert(actingbackfill.size() > 0);
-  for (unsigned i=1; i<actingbackfill.size(); i++) {
-    int peer = actingbackfill[i];
-    if (peer_info.count(i)) {
-      peer_info[i].last_epoch_started = info.last_epoch_started;
-      peer_info[i].history.merge(info.history);
+  for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+       i != actingbackfill.end();
+       ++i) {
+    if (*i == pg_whoami) continue;
+    pg_shard_t peer = *i;
+    if (peer_info.count(peer)) {
+      peer_info[peer].last_epoch_started = info.last_epoch_started;
+      peer_info[peer].history.merge(info.history);
     }
     MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch());
     m->pg_list.push_back(
       make_pair(
        pg_notify_t(
+         peer.shard, pg_whoami.shard,
          get_osdmap()->get_epoch(),
          get_osdmap()->get_epoch(),
          info),
        pg_interval_map_t()));
-    osd->send_message_osd_cluster(peer, m, get_osdmap()->get_epoch());
+    osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
   }
 }
 
@@ -4040,15 +4183,19 @@ void PG::share_pg_log()
   dout(10) << __func__ << dendl;
   assert(is_primary());
 
-  vector<int>::const_iterator a = actingbackfill.begin();
+  set<pg_shard_t>::const_iterator a = actingbackfill.begin();
   assert(a != actingbackfill.end());
-  vector<int>::const_iterator end = actingbackfill.end();
-  while (++a != end) {
-    int peer(*a);
+  set<pg_shard_t>::const_iterator end = actingbackfill.end();
+  while (a != end) {
+    pg_shard_t peer(*a);
+    ++a;
+    if (peer == pg_whoami) continue;
     pg_missing_t& pmissing(peer_missing[peer]);
     pg_info_t& pinfo(peer_info[peer]);
 
-    MOSDPGLog *m = new MOSDPGLog(info.last_update.epoch, info);
+    MOSDPGLog *m = new MOSDPGLog(
+      peer.shard, pg_whoami.shard,
+      info.last_update.epoch, info);
     m->log.copy_after(pg_log.get_log(), pinfo.last_update);
 
     for (list<pg_log_entry_t>::const_iterator i = m->log.log.begin();
@@ -4058,7 +4205,7 @@ void PG::share_pg_log()
     }
     pinfo.last_update = m->log.head;
 
-    osd->send_message_osd_cluster(peer, m, get_osdmap()->get_epoch());
+    osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
   }
 }
 
@@ -4069,11 +4216,11 @@ void PG::update_history_from_master(pg_history_t new_history)
   reg_next_scrub();
 }
 
-void PG::fulfill_info(int from, const pg_query_t &query, 
-                     pair<int, pg_info_t> &notify_info)
+void PG::fulfill_info(
+  pg_shard_t from, const pg_query_t &query,
+  pair<pg_shard_t, pg_info_t> &notify_info)
 {
-  assert(!acting.empty());
-  assert(from == acting[0]);
+  assert(from == primary);
   assert(query.type == pg_query_t::INFO);
 
   // info
@@ -4081,14 +4228,17 @@ void PG::fulfill_info(int from, const pg_query_t &query,
   notify_info = make_pair(from, info);
 }
 
-void PG::fulfill_log(int from, const pg_query_t &query, epoch_t query_epoch)
+void PG::fulfill_log(
+  pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
 {
-  assert(!acting.empty());
-  assert(from == acting[0]);
+  dout(10) << "log request from " << from << dendl;
+  assert(from == primary);
   assert(query.type != pg_query_t::INFO);
 
-  MOSDPGLog *mlog = new MOSDPGLog(get_osdmap()->get_epoch(),
-                                 info, query_epoch);
+  MOSDPGLog *mlog = new MOSDPGLog(
+    from.shard, pg_whoami.shard,
+    get_osdmap()->get_epoch(),
+    info, query_epoch);
   mlog->missing = pg_log.get_missing();
 
   // primary -> other, when building master log
@@ -4110,9 +4260,10 @@ void PG::fulfill_log(int from, const pg_query_t &query, epoch_t query_epoch)
 
   dout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
 
-  ConnectionRef con = osd->get_con_osd_cluster(from, get_osdmap()->get_epoch());
+  ConnectionRef con = osd->get_con_osd_cluster(
+    from.osd, get_osdmap()->get_epoch());
   if (con) {
-    osd->osd->_share_map_outgoing(from, con.get(), get_osdmap());
+    osd->osd->_share_map_outgoing(from.osd, con.get(), get_osdmap());
     osd->send_message_osd_cluster(mlog, con.get());
   } else {
     mlog->put();
@@ -4260,10 +4411,11 @@ void PG::start_flush(ObjectStore::Transaction *t,
 }
 
 /* Called before initializing peering during advance_map */
-void PG::start_peering_interval(const OSDMapRef lastmap,
-                               const vector<int>& newup,
-                               const vector<int>& newacting,
-                               ObjectStore::Transaction *t)
+void PG::start_peering_interval(
+  const OSDMapRef lastmap,
+  const vector<int>& newup, int new_up_primary,
+  const vector<int>& newacting, int new_acting_primary,
+  ObjectStore::Transaction *t)
 {
   const OSDMapRef osdmap = get_osdmap();
 
@@ -4271,13 +4423,17 @@ void PG::start_peering_interval(const OSDMapRef lastmap,
 
   vector<int> oldacting, oldup;
   int oldrole = get_role();
-  int oldprimary = get_primary();
+
+  pg_shard_t oldprimary = get_primary();
   bool was_old_primary = is_primary();
+
   acting.swap(oldacting);
   up.swap(oldup);
-
-  up = newup;
-  acting = newacting;
+  init_primary_up_acting(
+    newup,
+    newacting,
+    new_up_primary,
+    new_acting_primary);
 
   if (info.stats.up != up ||
       info.stats.acting != acting) {
@@ -4294,7 +4450,10 @@ void PG::start_peering_interval(const OSDMapRef lastmap,
     state_clear(PG_STATE_REMAPPED);
 
   int role = osdmap->calc_pg_role(osd->whoami, acting, acting.size());
-  set_role(role);
+  if (role == pg_whoami.shard)
+    set_role(role);
+  else
+    set_role(-1);
 
   // did acting, up, primary|acker change?
   if (!lastmap) {
@@ -4311,7 +4470,7 @@ void PG::start_peering_interval(const OSDMapRef lastmap,
       osdmap,
       lastmap,
       info.pgid.pool(),
-      info.pgid,
+      info.pgid.pgid,
       &past_intervals,
       &debug);
     dout(10) << __func__ << ": check_new_interval output: "
@@ -4405,12 +4564,12 @@ void PG::start_peering_interval(const OSDMapRef lastmap,
     }
   }
   // make sure we clear out any pg_temp change requests
-  osd->remove_want_pg_temp(info.pgid);
+  osd->remove_want_pg_temp(info.pgid.pgid);
   cancel_recovery();
 
-  if (acting.empty() && !up.empty() && up[0] == osd->whoami) {
+  if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
     dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
-    osd->queue_want_pg_temp(info.pgid, acting);
+    osd->queue_want_pg_temp(info.pgid.pgid, acting);
   }
 }
 
@@ -4727,9 +4886,9 @@ void PG::queue_peering_event(CephPeeringEvtRef evt)
 
 void PG::queue_notify(epoch_t msg_epoch,
                      epoch_t query_epoch,
-                     int from, pg_notify_t& i)
+                     pg_shard_t from, pg_notify_t& i)
 {
-  dout(10) << "notify " << i << " from osd." << from << dendl;
+  dout(10) << "notify " << i << " from replica " << from << dendl;
   queue_peering_event(
     CephPeeringEvtRef(new CephPeeringEvt(msg_epoch, query_epoch,
                                         MNotifyRec(from, i))));
@@ -4737,9 +4896,9 @@ void PG::queue_notify(epoch_t msg_epoch,
 
 void PG::queue_info(epoch_t msg_epoch,
                     epoch_t query_epoch,
-                    int from, pg_info_t& i)
+                    pg_shard_t from, pg_info_t& i)
 {
-  dout(10) << "info " << i << " from osd." << from << dendl;
+  dout(10) << "info " << i << " from replica " << from << dendl;
   queue_peering_event(
     CephPeeringEvtRef(new CephPeeringEvt(msg_epoch, query_epoch,
                                         MInfoRec(from, i, msg_epoch))));
@@ -4747,10 +4906,10 @@ void PG::queue_info(epoch_t msg_epoch,
 
 void PG::queue_log(epoch_t msg_epoch,
                   epoch_t query_epoch,
-                  int from,
+                  pg_shard_t from,
                   MOSDPGLog *msg)
 {
-  dout(10) << "log " << *msg << " from osd." << from << dendl;
+  dout(10) << "log " << *msg << " from replica " << from << dendl;
   queue_peering_event(
     CephPeeringEvtRef(new CephPeeringEvt(msg_epoch, query_epoch,
                                         MLogRec(from, msg))));
@@ -4775,26 +4934,33 @@ void PG::queue_flushed(epoch_t e)
 
 void PG::queue_query(epoch_t msg_epoch,
                     epoch_t query_epoch,
-                    int from, const pg_query_t& q)
+                    pg_shard_t from, const pg_query_t& q)
 {
-  dout(10) << "handle_query " << q << " from osd." << from << dendl;
+  dout(10) << "handle_query " << q << " from replica " << from << dendl;
   queue_peering_event(
     CephPeeringEvtRef(new CephPeeringEvt(msg_epoch, query_epoch,
                                         MQuery(from, q, query_epoch))));
 }
 
-void PG::handle_advance_map(OSDMapRef osdmap, OSDMapRef lastmap,
-                           vector<int>& newup, vector<int>& newacting,
-                           RecoveryCtx *rctx)
+void PG::handle_advance_map(
+  OSDMapRef osdmap, OSDMapRef lastmap,
+  vector<int>& newup, int up_primary,
+  vector<int>& newacting, int acting_primary,
+  RecoveryCtx *rctx)
 {
   assert(lastmap->get_epoch() == osdmap_ref->get_epoch());
   assert(lastmap == osdmap_ref);
-  dout(10) << "handle_advance_map " << newup << "/" << newacting << dendl;
+  dout(10) << "handle_advance_map "
+          << newup << "/" << newacting
+          << " -- " << up_primary << "/" << acting_primary
+          << dendl;
   update_osdmap_ref(osdmap);
   pool.update(osdmap);
   if (pool.info.last_change == osdmap_ref->get_epoch())
     on_pool_change();
-  AdvMap evt(osdmap, lastmap, newup, newacting);
+  AdvMap evt(
+    osdmap, lastmap, newup, up_primary,
+    newacting, acting_primary);
   recovery_state.handle_event(evt, rctx);
 }
 
@@ -4993,26 +5159,32 @@ boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
   // _before_ we are active.
   pg->generate_past_intervals();
 
-  pg->remove_down_peer_info(advmap.osdmap);
   if (pg->acting_up_affected(advmap.newup, advmap.newacting) ||
     pg->is_split(advmap.lastmap, advmap.osdmap)) {
     dout(10) << "up or acting affected, calling start_peering_interval again"
             << dendl;
-    pg->start_peering_interval(advmap.lastmap, advmap.newup, advmap.newacting,
-                              context< RecoveryMachine >().get_cur_transaction());
+    pg->start_peering_interval(
+      advmap.lastmap,
+      advmap.newup, advmap.up_primary,
+      advmap.newacting, advmap.acting_primary,
+      context< RecoveryMachine >().get_cur_transaction());
   }
+  pg->remove_down_peer_info(advmap.osdmap);
   return discard_event();
 }
 
 boost::statechart::result PG::RecoveryState::Reset::react(const ActMap&)
 {
   PG *pg = context< RecoveryMachine >().pg;
-  if (pg->should_send_notify() && pg->get_primary() >= 0) {
-    context< RecoveryMachine >().send_notify(pg->get_primary(),
-                                            pg_notify_t(pg->get_osdmap()->get_epoch(),
-                                                        pg->get_osdmap()->get_epoch(),
-                                                        pg->info),
-                                            pg->past_intervals);
+  if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
+    context< RecoveryMachine >().send_notify(
+      pg->get_primary(),
+      pg_notify_t(
+       pg->get_primary().shard, pg->pg_whoami.shard,
+       pg->get_osdmap()->get_epoch(),
+       pg->get_osdmap()->get_epoch(),
+       pg->info),
+      pg->past_intervals);
   }
 
   pg->update_heartbeat_peers();
@@ -5073,13 +5245,6 @@ PG::RecoveryState::Primary::Primary(my_context ctx)
   assert(pg->want_acting.empty());
 }
 
-boost::statechart::result PG::RecoveryState::Primary::react(const AdvMap &advmap)
-{
-  PG *pg = context< RecoveryMachine >().pg;
-  pg->remove_down_peer_info(advmap.osdmap);
-  return forward_event();
-}
-
 boost::statechart::result PG::RecoveryState::Primary::react(const MNotifyRec& notevt)
 {
   dout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
@@ -5160,15 +5325,19 @@ boost::statechart::result PG::RecoveryState::Peering::react(const QueryState& q)
   q.f->close_section();
 
   q.f->open_array_section("probing_osds");
-  for (set<int>::iterator p = prior_set->probe.begin(); p != prior_set->probe.end(); ++p)
-    q.f->dump_int("osd", *p);
+  for (set<pg_shard_t>::iterator p = prior_set->probe.begin();
+       p != prior_set->probe.end();
+       ++p)
+    q.f->dump_stream("osd") << *p;
   q.f->close_section();
 
   if (prior_set->pg_down)
     q.f->dump_string("blocked", "peering is blocked due to down osds");
 
   q.f->open_array_section("down_osds_we_would_probe");
-  for (set<int>::iterator p = prior_set->down.begin(); p != prior_set->down.end(); ++p)
+  for (set<int>::iterator p = prior_set->down.begin();
+       p != prior_set->down.end();
+       ++p)
     q.f->dump_int("osd", *p);
   q.f->close_section();
 
@@ -5259,8 +5428,9 @@ PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserve
 
   if (backfill_osd_it != context< Active >().sorted_backfill_set.end()) {
     //The primary never backfills itself
-    assert(*backfill_osd_it != pg->osd->whoami);
-    ConnectionRef con = pg->osd->get_con_osd_cluster(*backfill_osd_it, pg->get_osdmap()->get_epoch());
+    assert(*backfill_osd_it != pg->pg_whoami);
+    ConnectionRef con = pg->osd->get_con_osd_cluster(
+      backfill_osd_it->osd, pg->get_osdmap()->get_epoch());
     if (con) {
       if (con->has_feature(CEPH_FEATURE_BACKFILL_RESERVATION)) {
         unsigned priority = pg->is_degraded() ? OSDService::BACKFILL_HIGH
@@ -5268,7 +5438,7 @@ PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserve
         pg->osd->send_message_osd_cluster(
           new MBackfillReserve(
          MBackfillReserve::REQUEST,
-         pg->info.pgid,
+         spg_t(pg->info.pgid.pgid, backfill_osd_it->shard),
          pg->get_osdmap()->get_epoch(), priority),
        con.get());
       } else {
@@ -5379,10 +5549,10 @@ PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &
 {
   PG *pg = context< RecoveryMachine >().pg;
   pg->osd->send_message_osd_cluster(
-    pg->acting[0],
+    pg->primary.osd,
     new MRecoveryReserve(
       MRecoveryReserve::GRANT,
-      pg->info.pgid,
+      spg_t(pg->info.pgid.pgid, pg->primary.shard),
       pg->get_osdmap()->get_epoch()),
     pg->get_osdmap()->get_epoch());
   return transit<RepRecovering>();
@@ -5439,10 +5609,10 @@ PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &
 {
   PG *pg = context< RecoveryMachine >().pg;
   pg->osd->send_message_osd_cluster(
-    pg->acting[0],
+    pg->primary.osd,
     new MBackfillReserve(
       MBackfillReserve::GRANT,
-      pg->info.pgid,
+      spg_t(pg->info.pgid.pgid, pg->primary.shard),
       pg->get_osdmap()->get_epoch()),
     pg->get_osdmap()->get_epoch());
   return transit<RepRecovering>();
@@ -5522,7 +5692,7 @@ void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
 PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
   : my_base(ctx),
     NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
-    acting_osd_it(context< Active >().sorted_acting_set.begin())
+    acting_osd_it(context< Active >().sorted_actingbackfill_set.begin())
 {
   context< RecoveryMachine >().log_enter(state_name);
   post_event(RemoteRecoveryReserved());
@@ -5532,20 +5702,22 @@ boost::statechart::result
 PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
   PG *pg = context< RecoveryMachine >().pg;
 
-  if (acting_osd_it != context< Active >().sorted_acting_set.end()) {
+  if (acting_osd_it != context< Active >().sorted_actingbackfill_set.end()) {
     // skip myself
-    if (*acting_osd_it == pg->osd->whoami)
+    if (*acting_osd_it == pg->pg_whoami)
       ++acting_osd_it;
   }
 
-  if (acting_osd_it != context< Active >().sorted_acting_set.end()) {
-    ConnectionRef con = pg->osd->get_con_osd_cluster(*acting_osd_it, pg->get_osdmap()->get_epoch());
+  if (acting_osd_it != context< Active >().sorted_actingbackfill_set.end()) {
+    ConnectionRef con = pg->osd->get_con_osd_cluster(
+      acting_osd_it->osd, pg->get_osdmap()->get_epoch());
     if (con) {
       if (con->has_feature(CEPH_FEATURE_RECOVERY_RESERVATION)) {
        pg->osd->send_message_osd_cluster(
-          new MRecoveryReserve(MRecoveryReserve::REQUEST,
-                              pg->info.pgid,
-                              pg->get_osdmap()->get_epoch()),
+          new MRecoveryReserve(
+           MRecoveryReserve::REQUEST,
+           spg_t(pg->info.pgid.pgid, acting_osd_it->shard),
+           pg->get_osdmap()->get_epoch()),
          con.get());
       } else {
        post_event(RemoteRecoveryReserved());
@@ -5584,18 +5756,21 @@ void PG::RecoveryState::Recovering::release_reservations()
   assert(!pg->pg_log.get_missing().have_missing());
 
   // release remote reservations
-  for (set<int>::const_iterator i = context< Active >().sorted_acting_set.begin();
-        i != context< Active >().sorted_acting_set.end();
+  for (set<pg_shard_t>::const_iterator i =
+        context< Active >().sorted_actingbackfill_set.begin();
+        i != context< Active >().sorted_actingbackfill_set.end();
         ++i) {
-    if (*i == pg->osd->whoami) // skip myself
+    if (*i == pg->pg_whoami) // skip myself
       continue;
-    ConnectionRef con = pg->osd->get_con_osd_cluster(*i, pg->get_osdmap()->get_epoch());
+    ConnectionRef con = pg->osd->get_con_osd_cluster(
+      i->osd, pg->get_osdmap()->get_epoch());
     if (con) {
       if (con->has_feature(CEPH_FEATURE_RECOVERY_RESERVATION)) {
        pg->osd->send_message_osd_cluster(
-          new MRecoveryReserve(MRecoveryReserve::RELEASE,
-                              pg->info.pgid,
-                              pg->get_osdmap()->get_epoch()),
+          new MRecoveryReserve(
+           MRecoveryReserve::RELEASE,
+           spg_t(pg->info.pgid.pgid, i->shard),
+           pg->get_osdmap()->get_epoch()),
          con.get());
       }
     }
@@ -5632,7 +5807,7 @@ PG::RecoveryState::Recovered::Recovered(my_context ctx)
   : my_base(ctx),
     NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/Recovered")
 {
-  int newest_update_osd;
+  pg_shard_t auth_log_shard;
 
   context< RecoveryMachine >().log_enter(state_name);
 
@@ -5642,11 +5817,12 @@ PG::RecoveryState::Recovered::Recovered(my_context ctx)
   // if we finished backfill, all acting are active; recheck if
   // DEGRADED is appropriate.
   assert(pg->actingbackfill.size() > 0);
-  if (pg->get_osdmap()->get_pg_size(pg->info.pgid) <= pg->actingbackfill.size())
+  if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
+      pg->actingbackfill.size())
     pg->state_clear(PG_STATE_DEGRADED);
 
   // adjust acting set?  (e.g. because backfill completed...)
-  if (pg->acting != pg->up && !pg->choose_acting(newest_update_osd))
+  if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard))
     assert(pg->want_acting.size());
 
   assert(!pg->needs_recovery());
@@ -5695,10 +5871,12 @@ void PG::RecoveryState::Clean::exit()
 PG::RecoveryState::Active::Active(my_context ctx)
   : my_base(ctx),
     NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active"),
-    sorted_acting_set(context< RecoveryMachine >().pg->actingbackfill.begin(),
-                      context< RecoveryMachine >().pg->actingbackfill.end()),
-    sorted_backfill_set(context< RecoveryMachine >().pg->backfill_targets.begin(),
-                      context< RecoveryMachine >().pg->backfill_targets.end()),
+    sorted_actingbackfill_set(
+      context< RecoveryMachine >().pg->actingbackfill.begin(),
+      context< RecoveryMachine >().pg->actingbackfill.end()),
+    sorted_backfill_set(
+      context< RecoveryMachine >().pg->backfill_targets.begin(),
+      context< RecoveryMachine >().pg->backfill_targets.end()),
     all_replicas_activated(false)
 {
   context< RecoveryMachine >().log_enter(state_name);
@@ -5745,9 +5923,9 @@ boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap)
 
   /* Check for changes in pool size (if the acting set changed as a result,
    * this does not matter) */
-  if (advmap.lastmap->get_pg_size(pg->info.pgid) !=
-      pg->get_osdmap()->get_pg_size(pg->info.pgid)) {
-    if (pg->get_osdmap()->get_pg_size(pg->info.pgid) <= pg->acting.size())
+  if (advmap.lastmap->get_pg_size(pg->info.pgid.pgid) !=
+      pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) {
+    if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->acting.size())
       pg->state_clear(PG_STATE_DEGRADED);
     else
       pg->state_set(PG_STATE_DEGRADED);
@@ -5876,16 +6054,16 @@ boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q)
 
   {
     q.f->open_array_section("might_have_unfound");
-    for (set<int>::iterator p = pg->might_have_unfound.begin();
+    for (set<pg_shard_t>::iterator p = pg->might_have_unfound.begin();
         p != pg->might_have_unfound.end();
         ++p) {
       q.f->open_object_section("osd");
-      q.f->dump_int("osd", *p);
+      q.f->dump_stream("osd") << *p;
       if (pg->peer_missing.count(*p)) {
        q.f->dump_string("status", "already probed");
       } else if (pg->peer_missing_requested.count(*p)) {
        q.f->dump_string("status", "querying");
-      } else if (!pg->get_osdmap()->is_up(*p)) {
+      } else if (!pg->get_osdmap()->is_up(p->osd)) {
        q.f->dump_string("status", "osd is down");
       } else {
        q.f->dump_string("status", "not queried");
@@ -5909,10 +6087,10 @@ boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q)
     q.f->dump_int("scrubber.waiting_on", pg->scrubber.waiting_on);
     {
       q.f->open_array_section("scrubber.waiting_on_whom");
-      for (set<int>::iterator p = pg->scrubber.waiting_on_whom.begin();
+      for (set<pg_shard_t>::iterator p = pg->scrubber.waiting_on_whom.begin();
           p != pg->scrubber.waiting_on_whom.end();
           ++p) {
-       q.f->dump_int("osd", *p);
+       q.f->dump_stream("shard") << *p;
       }
       q.f->close_section();
     }
@@ -5968,7 +6146,7 @@ boost::statechart::result PG::RecoveryState::ReplicaActive::react(
   const Activate& actevt) {
   dout(10) << "In ReplicaActive, about to call activate" << dendl;
   PG *pg = context< RecoveryMachine >().pg;
-  map< int, map< pg_t, pg_query_t> > query_map;
+  map<int, map<spg_t, pg_query_t> > query_map;
   pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
               actevt.query_epoch,
               *context< RecoveryMachine >().get_on_safe_context_list(),
@@ -5999,12 +6177,15 @@ boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MLogRec&
 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const ActMap&)
 {
   PG *pg = context< RecoveryMachine >().pg;
-  if (pg->should_send_notify() && pg->get_primary() >= 0) {
-    context< RecoveryMachine >().send_notify(pg->get_primary(),
-                                            pg_notify_t(pg->get_osdmap()->get_epoch(),
-                                                        pg->get_osdmap()->get_epoch(),
-                                                        pg->info),
-                                            pg->past_intervals);
+  if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
+    context< RecoveryMachine >().send_notify(
+      pg->get_primary(),
+      pg_notify_t(
+       pg->get_primary().shard, pg->pg_whoami.shard,
+       pg->get_osdmap()->get_epoch(),
+       pg->get_osdmap()->get_epoch(),
+       pg->info),
+      pg->past_intervals);
   }
   pg->take_waiters();
   return discard_event();
@@ -6107,14 +6288,17 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query)
 {
   PG *pg = context< RecoveryMachine >().pg;
   if (query.query.type == pg_query_t::INFO) {
-    pair<int, pg_info_t> notify_info;
+    pair<pg_shard_t, pg_info_t> notify_info;
     pg->update_history_from_master(query.query.history);
     pg->fulfill_info(query.from, query.query, notify_info);
-    context< RecoveryMachine >().send_notify(notify_info.first,
-                                            pg_notify_t(query.query_epoch,
-                                                        pg->get_osdmap()->get_epoch(),
-                                                        notify_info.second),
-                                            pg->past_intervals);
+    context< RecoveryMachine >().send_notify(
+      notify_info.first,
+      pg_notify_t(
+       notify_info.first.shard, pg->pg_whoami.shard,
+       query.query_epoch,
+       pg->get_osdmap()->get_epoch(),
+       notify_info.second),
+      pg->past_intervals);
   } else {
     pg->fulfill_log(query.from, query.query, query.query_epoch);
   }
@@ -6124,12 +6308,15 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query)
 boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&)
 {
   PG *pg = context< RecoveryMachine >().pg;
-  if (pg->should_send_notify() && pg->get_primary() >= 0) {
-    context< RecoveryMachine >().send_notify(pg->get_primary(),
-                                            pg_notify_t(pg->get_osdmap()->get_epoch(),
-                                                        pg->get_osdmap()->get_epoch(),
-                                                        pg->info),
-                                            pg->past_intervals);
+  if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
+    context< RecoveryMachine >().send_notify(
+      pg->get_primary(),
+      pg_notify_t(
+       pg->get_primary().shard, pg->pg_whoami.shard,
+       pg->get_osdmap()->get_epoch(),
+       pg->get_osdmap()->get_epoch(),
+       pg->info),
+      pg->past_intervals);
   }
   pg->take_waiters();
   return discard_event();
@@ -6170,11 +6357,11 @@ void PG::RecoveryState::GetInfo::get_infos()
   PG *pg = context< RecoveryMachine >().pg;
   auto_ptr<PriorSet> &prior_set = context< Peering >().prior_set;
 
-  for (set<int>::const_iterator it = prior_set->probe.begin();
+  for (set<pg_shard_t>::const_iterator it = prior_set->probe.begin();
        it != prior_set->probe.end();
        ++it) {
-    int peer = *it;
-    if (peer == pg->osd->whoami) {
+    pg_shard_t peer = *it;
+    if (peer == pg->pg_whoami) {
       continue;
     }
     if (pg->peer_info.count(peer)) {
@@ -6183,12 +6370,13 @@ void PG::RecoveryState::GetInfo::get_infos()
     }
     if (peer_info_requested.count(peer)) {
       dout(10) << " already requested info from osd." << peer << dendl;
-    } else if (!pg->get_osdmap()->is_up(peer)) {
+    } else if (!pg->get_osdmap()->is_up(peer.osd)) {
       dout(10) << " not querying info from down osd." << peer << dendl;
     } else {
       dout(10) << " querying info from osd." << peer << dendl;
       context< RecoveryMachine >().send_query(
        peer, pg_query_t(pg_query_t::INFO,
+                        it->shard, pg->pg_whoami.shard,
                         pg->info.history,
                         pg->get_osdmap()->get_epoch()));
       peer_info_requested.insert(peer);
@@ -6198,7 +6386,7 @@ void PG::RecoveryState::GetInfo::get_infos()
 
 boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& infoevt) 
 {
-  set<int>::iterator p = peer_info_requested.find(infoevt.from);
+  set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
   if (p != peer_info_requested.end())
     peer_info_requested.erase(p);
 
@@ -6214,7 +6402,7 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& in
       // filter out any osds that got dropped from the probe set from
       // peer_info_requested.  this is less expensive than restarting
       // peering (which would re-probe everyone).
-      set<int>::iterator p = peer_info_requested.begin();
+      set<pg_shard_t>::iterator p = peer_info_requested.begin();
       while (p != peer_info_requested.end()) {
        if (prior_set->probe.count(*p) == 0) {
          dout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
@@ -6256,15 +6444,16 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& in
          bool any_down_now = false;
          for (unsigned i=0; i<interval.acting.size(); i++) {
            int o = interval.acting[i];
+           pg_shard_t so(o, pg->pool.info.ec_pool() ? i : ghobject_t::NO_SHARD);
            if (!osdmap->exists(o) || osdmap->get_info(o).lost_at > interval.first)
              continue;  // dne or lost
            if (osdmap->is_up(o)) {
              pg_info_t *pinfo;
-             if (o == pg->osd->whoami) {
+             if (so == pg->pg_whoami) {
                pinfo = &pg->info;
              } else {
-               assert(pg->peer_info.count(o));
-               pinfo = &pg->peer_info[o];
+               assert(pg->peer_info.count(so));
+               pinfo = &pg->peer_info[so];
              }
              if (!pinfo->is_incomplete())
                any_up_complete_now = true;
@@ -6294,9 +6483,11 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const QueryState& q)
   q.f->dump_stream("enter_time") << enter_time;
 
   q.f->open_array_section("requested_info_from");
-  for (set<int>::iterator p = peer_info_requested.begin(); p != peer_info_requested.end(); ++p) {
+  for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
+       p != peer_info_requested.end();
+       ++p) {
     q.f->open_object_section("osd");
-    q.f->dump_int("osd", *p);
+    q.f->dump_stream("osd") << *p;
     if (pg->peer_info.count(*p)) {
       q.f->open_object_section("got_info");
       pg->peer_info[*p].dump(q.f);
@@ -6321,15 +6512,16 @@ void PG::RecoveryState::GetInfo::exit()
 /*------GetLog------------*/
 PG::RecoveryState::GetLog::GetLog(my_context ctx)
   : my_base(ctx),
-    NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Peering/GetLog"),
-    newest_update_osd(-1), msg(0)
+    NamedState(
+      context< RecoveryMachine >().pg->cct, "Started/Primary/Peering/GetLog"),
+    msg(0)
 {
   context< RecoveryMachine >().log_enter(state_name);
 
   PG *pg = context< RecoveryMachine >().pg;
 
   // adjust acting?
-  if (!pg->choose_acting(newest_update_osd)) {
+  if (!pg->choose_acting(auth_log_shard)) {
     if (!pg->want_acting.empty()) {
       post_event(NeedActingChange());
     } else {
@@ -6339,16 +6531,16 @@ PG::RecoveryState::GetLog::GetLog(my_context ctx)
   }
 
   // am i the best?
-  if (newest_update_osd == pg->osd->whoami) {
+  if (auth_log_shard == pg->pg_whoami) {
     post_event(GotLog());
     return;
   }
 
-  const pg_info_t& best = pg->peer_info[newest_update_osd];
+  const pg_info_t& best = pg->peer_info[auth_log_shard];
 
   // am i broken?
   if (pg->info.last_update < best.log_tail) {
-    dout(10) << " not contiguous with osd." << newest_update_osd << ", down" << dendl;
+    dout(10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
     post_event(IsIncomplete());
     return;
   }
@@ -6356,19 +6548,24 @@ PG::RecoveryState::GetLog::GetLog(my_context ctx)
   // how much log to request?
   eversion_t request_log_from = pg->info.last_update;
   assert(pg->actingbackfill.size() > 0);
-  for (vector<int>::iterator p = pg->actingbackfill.begin() + 1;
-          p != pg->actingbackfill.end(); ++p) {
+  for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
+       p != pg->actingbackfill.end();
+       ++p) {
+    if (*p == pg->pg_whoami) continue;
     pg_info_t& ri = pg->peer_info[*p];
     if (ri.last_update >= best.log_tail && ri.last_update < request_log_from)
       request_log_from = ri.last_update;
   }
 
   // how much?
-  dout(10) << " requesting log from osd." << newest_update_osd << dendl;
+  dout(10) << " requesting log from osd." << auth_log_shard << dendl;
   context<RecoveryMachine>().send_query(
-    newest_update_osd,
-    pg_query_t(pg_query_t::LOG, request_log_from, pg->info.history,
-              pg->get_osdmap()->get_epoch()));
+    auth_log_shard,
+    pg_query_t(
+      pg_query_t::LOG,
+      auth_log_shard.shard, pg->pg_whoami.shard,
+      request_log_from, pg->info.history,
+      pg->get_osdmap()->get_epoch()));
 }
 
 boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap)
@@ -6376,8 +6573,9 @@ boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap)
   // make sure our log source didn't go down.  we need to check
   // explicitly because it may not be part of the prior set, which
   // means the Peering state check won't catch it going down.
-  if (!advmap.osdmap->is_up(newest_update_osd)) {
-    dout(10) << "GetLog: newest_update_osd osd." << newest_update_osd << " went down" << dendl;
+  if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
+    dout(10) << "GetLog: auth_log_shard osd."
+            << auth_log_shard.osd << " went down" << dendl;
     post_event(advmap);
     return transit< Reset >();
   }
@@ -6389,9 +6587,9 @@ boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap)
 boost::statechart::result PG::RecoveryState::GetLog::react(const MLogRec& logevt)
 {
   assert(!msg);
-  if (logevt.from != newest_update_osd) {
+  if (logevt.from != auth_log_shard) {
     dout(10) << "GetLog: discarding log from "
-            << "non-newest_update_osd osd." << logevt.from << dendl;
+            << "non-auth_log_shard osd." << logevt.from << dendl;
     return discard_event();
   }
   dout(10) << "GetLog: recieved master log from osd" 
@@ -6409,7 +6607,7 @@ boost::statechart::result PG::RecoveryState::GetLog::react(const GotLog&)
     dout(10) << "processing master log" << dendl;
     pg->proc_master_log(*context<RecoveryMachine>().get_cur_transaction(),
                        msg->info, msg->log, msg->missing, 
-                       newest_update_osd);
+                       auth_log_shard);
   }
   pg->start_flush(
     context< RecoveryMachine >().get_cur_transaction(),
@@ -6423,7 +6621,7 @@ boost::statechart::result PG::RecoveryState::GetLog::react(const QueryState& q)
   q.f->open_object_section("state");
   q.f->dump_string("name", state_name);
   q.f->dump_stream("enter_time") << enter_time;
-  q.f->dump_int("newest_update_osd", newest_update_osd);
+  q.f->dump_stream("auth_log_shard") << auth_log_shard;
   q.f->close_section();
   return forward_event();
 }
@@ -6557,9 +6755,10 @@ PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
 
   PG *pg = context< RecoveryMachine >().pg;
   assert(pg->actingbackfill.size() > 0);
-  for (vector<int>::iterator i = pg->actingbackfill.begin() + 1;
+  for (set<pg_shard_t>::iterator i = pg->actingbackfill.begin();
        i != pg->actingbackfill.end();
        ++i) {
+    if (*i == pg->get_primary()) continue;
     const pg_info_t& pi = pg->peer_info[*i];
 
     if (pi.is_empty())
@@ -6596,15 +6795,20 @@ PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
       dout(10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
       context< RecoveryMachine >().send_query(
        *i,
-       pg_query_t(pg_query_t::LOG, since, pg->info.history,
-                  pg->get_osdmap()->get_epoch()));
+       pg_query_t(
+         pg_query_t::LOG,
+         i->shard, pg->pg_whoami.shard,
+         since, pg->info.history,
+         pg->get_osdmap()->get_epoch()));
     } else {
       dout(10) << " requesting fulllog+missing from osd." << *i
               << " (want since " << since << " < log.tail " << pi.log_tail << ")"
               << dendl;
       context< RecoveryMachine >().send_query(
-       *i, pg_query_t(pg_query_t::FULLLOG,
-                      pg->info.history, pg->get_osdmap()->get_epoch()));
+       *i, pg_query_t(
+         pg_query_t::FULLLOG,
+         i->shard, pg->pg_whoami.shard,
+         pg->info.history, pg->get_osdmap()->get_epoch()));
     }
     peer_missing_requested.insert(*i);
   }
@@ -6650,9 +6854,11 @@ boost::statechart::result PG::RecoveryState::GetMissing::react(const QueryState&
   q.f->dump_stream("enter_time") << enter_time;
 
   q.f->open_array_section("peer_missing_requested");
-  for (set<int>::iterator p = peer_missing_requested.begin(); p != peer_missing_requested.end(); ++p) {
+  for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
+       p != peer_missing_requested.end();
+       ++p) {
     q.f->open_object_section("osd");
-    q.f->dump_int("osd", *p);
+    q.f->dump_stream("osd") << *p;
     if (pg->peer_missing.count(*p)) {
       q.f->open_object_section("got_missing");
       pg->peer_missing[*p].dump(q.f);
@@ -6779,13 +6985,14 @@ void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_
 #undef dout_prefix
 #define dout_prefix (*_dout << (debug_pg ? debug_pg->gen_prefix() : string()) << " PriorSet: ")
 
-PG::PriorSet::PriorSet(const OSDMap &osdmap,
+PG::PriorSet::PriorSet(bool ec_pool,
+                      const OSDMap &osdmap,
                       const map<epoch_t, pg_interval_t> &past_intervals,
                       const vector<int> &up,
                       const vector<int> &acting,
                       const pg_info_t &info,
                       const PG *debug_pg)
-  : pg_down(false)
+  : ec_pool(ec_pool), pg_down(false)
 {
   /*
    * We have to be careful to gracefully deal with situations like
@@ -6835,11 +7042,11 @@ PG::PriorSet::PriorSet(const OSDMap &osdmap,
   // so that we know what they do/do not have explicitly before
   // sending them any new info/logs/whatever.
   for (unsigned i=0; i<acting.size(); i++)
-    probe.insert(acting[i]);
+    probe.insert(pg_shard_t(acting[i], ec_pool ? i : ghobject_t::NO_SHARD));
   // It may be possible to exlude the up nodes, but let's keep them in
   // there for now.
   for (unsigned i=0; i<up.size(); i++)
-    probe.insert(up[i]);
+    probe.insert(pg_shard_t(up[i], ec_pool ? i : ghobject_t::NO_SHARD));
 
   for (map<epoch_t,pg_interval_t>::const_reverse_iterator p = past_intervals.rbegin();
        p != past_intervals.rend();
@@ -6865,6 +7072,7 @@ PG::PriorSet::PriorSet(const OSDMap &osdmap,
     // consider ACTING osds
     for (unsigned i=0; i<interval.acting.size(); i++) {
       int o = interval.acting[i];
+      pg_shard_t so(o, ec_pool ? i : ghobject_t::NO_SHARD);
 
       const osd_info_t *pinfo = 0;
       if (osdmap.exists(o))
@@ -6872,7 +7080,7 @@ PG::PriorSet::PriorSet(const OSDMap &osdmap,
 
       if (osdmap.is_up(o)) {
        // include past acting osds if they are up.
-       probe.insert(o);
+       probe.insert(so);
        any_up_now = true;
       } else if (!pinfo) {
        dout(10) << "build_prior  prior osd." << o << " no longer exists" << dendl;
@@ -6898,7 +7106,6 @@ PG::PriorSet::PriorSet(const OSDMap &osdmap,
           ++i) {
        if (osdmap.exists(*i) &&   // if it doesn't exist, we already consider it lost.
            osdmap.is_down(*i)) {
-         probe.insert(*i);
          pg_down = true;
 
          // make note of when any down osd in the cur set was lost, so that
@@ -6919,10 +7126,10 @@ PG::PriorSet::PriorSet(const OSDMap &osdmap,
 // true if the given map affects the prior set
 bool PG::PriorSet::affected_by_map(const OSDMapRef osdmap, const PG *debug_pg) const
 {
-  for (set<int>::iterator p = probe.begin();
+  for (set<pg_shard_t>::iterator p = probe.begin();
        p != probe.end();
        ++p) {
-    int o = *p;
+    int o = p->osd;
 
     // did someone in the prior set go down?
     if (osdmap->is_down(o) && down.count(o) == 0) {
@@ -6931,7 +7138,7 @@ bool PG::PriorSet::affected_by_map(const OSDMapRef osdmap, const PG *debug_pg) c
     }
 
     // did a down osd in cur get (re)marked as lost?
-    map<int,epoch_t>::const_iterator r = blocked_by.find(o);
+    map<int, epoch_t>::const_iterator r = blocked_by.find(o);
     if (r != blocked_by.end()) {
       if (!osdmap->exists(o)) {
        dout(10) << "affected_by_map osd." << o << " no longer exists" << dendl;
index 2ff9e164d0cc3b4753f30adb3c8a87ecb2734412..032e731240fba134427016c3340dd533e2b7cf5d 100644 (file)
@@ -289,19 +289,19 @@ public:
 
   const coll_t coll;
   PGLog  pg_log;
-  static string get_info_key(pg_t pgid) {
+  static string get_info_key(spg_t pgid) {
     return stringify(pgid) + "_info";
   }
-  static string get_biginfo_key(pg_t pgid) {
+  static string get_biginfo_key(spg_t pgid) {
     return stringify(pgid) + "_biginfo";
   }
-  static string get_epoch_key(pg_t pgid) {
+  static string get_epoch_key(spg_t pgid) {
     return stringify(pgid) + "_epoch";
   }
   hobject_t    log_oid;
   hobject_t    biginfo_oid;
-  map<hobject_t, set<int> > missing_loc;
-  set<int> missing_loc_sources;           // superset of missing_loc locations
+  map<hobject_t, set<pg_shard_t> > missing_loc;
+  set<pg_shard_t> missing_loc_sources;           // superset of missing_loc locations
   
   interval_set<snapid_t> snap_collections; // obsolete
   map<epoch_t,pg_interval_t> past_intervals;
@@ -312,7 +312,7 @@ public:
    * (if they have one) */
   xlist<PG*>::item recovery_item, scrub_item, scrub_finalize_item, snap_trim_item, stat_queue_item;
   int recovery_ops_active;
-  set<int> waiting_on_backfill;
+  set<pg_shard_t> waiting_on_backfill;
 #ifdef DEBUG_RECOVERY_OIDS
   set<hobject_t> recovering_oids;
 #endif
@@ -332,20 +332,26 @@ public:
 
   // primary state
  public:
-  vector<int> up, acting, want_acting, actingbackfill;
-  map<int,eversion_t> peer_last_complete_ondisk;
+  pg_shard_t primary;
+  pg_shard_t pg_whoami;
+  pg_shard_t up_primary;
+  vector<int> up, acting, want_acting;
+  set<pg_shard_t> actingbackfill;
+  map<pg_shard_t,eversion_t> peer_last_complete_ondisk;
   eversion_t  min_last_complete_ondisk;  // up: min over last_complete_ondisk, peer_last_complete_ondisk
   eversion_t  pg_trim_to;
 
   // [primary only] content recovery state
  protected:
   struct PriorSet {
-    set<int> probe; /// current+prior OSDs we need to probe.
+    const bool ec_pool;
+    set<pg_shard_t> probe; /// current+prior OSDs we need to probe.
     set<int> down;  /// down osds that would normally be in @a probe and might be interesting.
-    map<int,epoch_t> blocked_by;  /// current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
+    map<int, epoch_t> blocked_by;  /// current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
 
     bool pg_down;   /// some down osds are included in @a cur; the DOWN pg state bit should be set.
-    PriorSet(const OSDMap &osdmap,
+    PriorSet(bool ec_pool,
+            const OSDMap &osdmap,
             const map<epoch_t, pg_interval_t> &past_intervals,
             const vector<int> &up,
             const vector<int> &acting,
@@ -364,15 +370,17 @@ public:
 public:    
   struct RecoveryCtx {
     utime_t start_time;
-    map< int, map<pg_t, pg_query_t> > *query_map;
-    map< int, vector<pair<pg_notify_t, pg_interval_map_t> > > *info_map;
-    map< int, vector<pair<pg_notify_t, pg_interval_map_t> > > *notify_list;
+    map<int, map<spg_t, pg_query_t> > *query_map;
+    map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > *info_map;
+    map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > *notify_list;
     C_Contexts *on_applied;
     C_Contexts *on_safe;
     ObjectStore::Transaction *transaction;
-    RecoveryCtx(map< int, map<pg_t, pg_query_t> > *query_map,
-               map< int, vector<pair<pg_notify_t, pg_interval_map_t> > > *info_map,
-               map< int, vector<pair<pg_notify_t, pg_interval_map_t> > > *notify_list,
+    RecoveryCtx(map<int, map<spg_t, pg_query_t> > *query_map,
+               map<int,
+                   vector<pair<pg_notify_t, pg_interval_map_t> > > *info_map,
+               map<int,
+                   vector<pair<pg_notify_t, pg_interval_map_t> > > *notify_list,
                C_Contexts *on_applied,
                C_Contexts *on_safe,
                ObjectStore::Transaction *transaction)
@@ -403,24 +411,26 @@ protected:
    */
   
   bool        need_up_thru;
-  set<int>    stray_set;   // non-acting osds that have PG data.
+  set<pg_shard_t>    stray_set;   // non-acting osds that have PG data.
   eversion_t  oldest_update; // acting: lowest (valid) last_update in active set
-  map<int,pg_info_t>    peer_info;   // info from peers (stray or prior)
-  set<int> peer_purged; // peers purged
-  map<int,pg_missing_t> peer_missing;
-  set<int>             peer_log_requested;  // logs i've requested (and start stamps)
-  set<int>             peer_missing_requested;
-  set<int>             stray_purged;  // i deleted these strays; ignore racing PGInfo from them
-  set<int>             peer_activated;
+  map<pg_shard_t, pg_info_t>    peer_info;   // info from peers (stray or prior)
+  set<pg_shard_t> peer_purged; // peers purged
+  map<pg_shard_t, pg_missing_t> peer_missing;
+  set<pg_shard_t> peer_log_requested;  // logs i've requested (and start stamps)
+  set<pg_shard_t> peer_missing_requested;
+
+  // i deleted these strays; ignore racing PGInfo from them
+  set<pg_shard_t> stray_purged;
+  set<pg_shard_t> peer_activated;
 
   // primary-only, recovery-only state
-  set<int>             might_have_unfound;  // These osds might have objects on them
-                                           // which are unfound on the primary
+  set<pg_shard_t> might_have_unfound;  // These osds might have objects on them
+                                       // which are unfound on the primary
   epoch_t last_peering_reset;
 
 
   /* heartbeat peers */
-  void set_probe_targets(const set<int> &probe_set);
+  void set_probe_targets(const set<pg_shard_t> &probe_set);
   void clear_probe_targets();
 public:
   Mutex heartbeat_peer_lock;
@@ -505,21 +515,17 @@ protected:
   };
   
   BackfillInterval backfill_info;
-  map<int, BackfillInterval> peer_backfill_info;
+  map<pg_shard_t, BackfillInterval> peer_backfill_info;
   bool backfill_reserved;
   bool backfill_reserving;
 
   friend class OSD;
 
 public:
-  vector<int> backfill_targets;
+  set<pg_shard_t> backfill_targets;
 
-  bool is_backfill_targets(int osd) {
-    if (std::find(backfill_targets.begin(), backfill_targets.end(), osd)
-        != backfill_targets.end())
-      return true;
-    else
-      return false;
+  bool is_backfill_targets(pg_shard_t osd) {
+    return backfill_targets.count(osd);
   }
 
 protected:
@@ -564,20 +570,15 @@ public:
   void clear_primary_state();
 
  public:
-  bool is_acting(int osd) const { 
-    for (unsigned i=0; i<acting.size(); i++)
-      if (acting[i] == osd) return true;
-    return false;
+  bool is_actingbackfill(pg_shard_t osd) const {
+    return actingbackfill.count(osd);
   }
-  bool is_up(int osd) const { 
-    for (unsigned i=0; i<up.size(); i++)
-      if (up[i] == osd) return true;
-    return false;
-  }
-  bool is_actingbackfill(int osd) const {
-    for (unsigned i=0; i<actingbackfill.size(); i++)
-      if (actingbackfill[i] == osd) return true;
-    return false;
+  bool is_acting(pg_shard_t osd) const {
+    if (pool.info.ec_pool()) {
+      return acting.size() > osd.shard && acting[osd.shard] == osd.osd;
+    } else {
+      return std::find(acting.begin(), acting.end(), osd.osd) != acting.end();
+    }
   }
   
   bool needs_recovery() const;
@@ -601,10 +602,13 @@ public:
   bool calc_min_last_complete_ondisk() {
     eversion_t min = last_complete_ondisk;
     assert(actingbackfill.size() > 0);
-    for (unsigned i=1; i<actingbackfill.size(); i++) {
-      if (peer_last_complete_ondisk.count(actingbackfill[i]) == 0)
+    for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+        i != actingbackfill.end();
+        ++i) {
+      if (*i == get_primary()) continue;
+      if (peer_last_complete_ondisk.count(*i) == 0)
        return false;   // we don't have complete info
-      eversion_t a = peer_last_complete_ondisk[actingbackfill[i]];
+      eversion_t a = peer_last_complete_ondisk[*i];
       if (a < min)
        min = a;
     }
@@ -617,10 +621,10 @@ public:
   virtual void calc_trim_to() = 0;
 
   void proc_replica_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
-                       pg_missing_t& omissing, int from);
+                       pg_missing_t& omissing, pg_shard_t from);
   void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
-                      pg_missing_t& omissing, int from);
-  bool proc_replica_info(int from, const pg_info_t &info);
+                      pg_missing_t& omissing, pg_shard_t from);
+  bool proc_replica_info(pg_shard_t from, const pg_info_t &info);
 
 
   struct LogEntryTrimmer : public ObjectModDesc::Visitor {
@@ -719,28 +723,37 @@ public:
     ObjectStore::Transaction *t, const hobject_t &soid);
   void remove_snap_mapped_object(
     ObjectStore::Transaction& t, const hobject_t& soid);
-  void merge_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, int from);
+  void merge_log(
+    ObjectStore::Transaction& t, pg_info_t &oinfo,
+    pg_log_t &olog, pg_shard_t from);
   void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead);
-  bool search_for_missing(const pg_info_t &oinfo, const pg_missing_t *omissing,
-                         int fromosd);
+  bool search_for_missing(
+    const pg_info_t &oinfo, const pg_missing_t *omissing,
+    pg_shard_t fromosd);
 
   void check_for_lost_objects();
   void forget_lost_objects();
 
-  void discover_all_missing(std::map< int, map<pg_t,pg_query_t> > &query_map);
+  void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map);
   
   void trim_write_ahead();
 
-  map<int, pg_info_t>::const_iterator find_best_info(const map<int, pg_info_t> &infos) const;
-  bool calc_acting(int& newest_update_osd, vector<int>& want, vector<int>& backfill) const;
-  bool choose_acting(int& newest_update_osd);
+  map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
+    const map<pg_shard_t, pg_info_t> &infos) const;
+  bool calc_acting(
+    pg_shard_t &auth_log_shard,
+    vector<int> &want,
+    set<pg_shard_t> &backfill) const;
+  bool choose_acting(pg_shard_t &auth_log_shard);
   void build_might_have_unfound();
   void replay_queued_ops();
-  void activate(ObjectStore::Transaction& t,
-               epoch_t query_epoch,
-               list<Context*>& tfin,
-               map< int, map<pg_t,pg_query_t> >& query_map,
-               map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > *activator_map=0);
+  void activate(
+    ObjectStore::Transaction& t,
+    epoch_t query_epoch,
+    list<Context*>& tfin,
+    map<int, map<spg_t,pg_query_t> >& query_map,
+    map<int,
+        vector<pair<pg_notify_t, pg_interval_map_t> > > *activator_map=0);
   void _activate_committed(epoch_t e);
   void all_activated_and_committed();
 
@@ -805,7 +818,7 @@ public:
     }
 
     // metadata
-    set<int> reserved_peers;
+    set<pg_shard_t> reserved_peers;
     bool reserved, reserve_failed;
     epoch_t epoch_start;
 
@@ -814,12 +827,12 @@ public:
     bool active;
     bool queue_snap_trim;
     int waiting_on;
-    set<int> waiting_on_whom;
+    set<pg_shard_t> waiting_on_whom;
     int shallow_errors;
     int deep_errors;
     int fixed;
     ScrubMap primary_scrubmap;
-    map<int,ScrubMap> received_maps;
+    map<pg_shard_t, ScrubMap> received_maps;
     MOSDRepScrub *active_rep_scrub;
     utime_t scrub_reg_stamp;  // stamp we registered for
 
@@ -827,12 +840,12 @@ public:
     bool must_scrub, must_deep_scrub, must_repair;
 
     // Maps from objects with errors to missing/inconsistent peers
-    map<hobject_t, set<int> > missing;
-    map<hobject_t, set<int> > inconsistent;
-    map<hobject_t, set<int> > inconsistent_snapcolls;
+    map<hobject_t, set<pg_shard_t> > missing;
+    map<hobject_t, set<pg_shard_t> > inconsistent;
+    map<hobject_t, set<pg_shard_t> > inconsistent_snapcolls;
 
     // Map from object with errors to good peer
-    map<hobject_t, pair<ScrubMap::object, int> > authoritative;
+    map<hobject_t, pair<ScrubMap::object, pg_shard_t> > authoritative;
 
     // classic scrub
     bool classic;
@@ -944,7 +957,11 @@ public:
 
   int active_pushes;
 
-  void repair_object(const hobject_t& soid, ScrubMap::object *po, int bad_peer, int ok_peer);
+  void repair_object(
+    const hobject_t& soid, ScrubMap::object *po,
+    pg_shard_t bad_peer,
+    pg_shard_t ok_peer);
+
   void scrub(ThreadPool::TPHandle &handle);
   void classic_scrub(ThreadPool::TPHandle &handle);
   void chunky_scrub(ThreadPool::TPHandle &handle);
@@ -955,8 +972,8 @@ public:
   void scrub_clear_state();
   bool scrub_gather_replica_maps();
   void _scan_snaps(ScrubMap &map);
-  void _request_scrub_map_classic(int replica, eversion_t version);
-  void _request_scrub_map(int replica, eversion_t version,
+  void _request_scrub_map_classic(pg_shard_t replica, eversion_t version);
+  void _request_scrub_map(pg_shard_t replica, eversion_t version,
                           hobject_t start, hobject_t end, bool deep);
   int build_scrub_map_chunk(
     ScrubMap &map,
@@ -970,14 +987,14 @@ public:
   virtual void _scrub_finish() { }
   virtual void get_colls(list<coll_t> *out) = 0;
   virtual void split_colls(
-    pg_t child,
+    spg_t child,
     int split_bits,
     int seed,
     ObjectStore::Transaction *t) = 0;
   virtual bool _report_snap_collection_errors(
     const hobject_t &hoid,
     const map<string, bufferptr> &attrs,
-    int osd,
+    pg_shard_t osd,
     ostream &out) { return false; };
   void clear_scrub_reserved();
   void scrub_reserve_replicas();
@@ -1055,10 +1072,10 @@ public:
   };
 
   struct MInfoRec : boost::statechart::event< MInfoRec > {
-    int from;
+    pg_shard_t from;
     pg_info_t info;
     epoch_t msg_epoch;
-    MInfoRec(int from, pg_info_t &info, epoch_t msg_epoch) :
+    MInfoRec(pg_shard_t from, pg_info_t &info, epoch_t msg_epoch) :
       from(from), info(info), msg_epoch(msg_epoch) {}
     void print(std::ostream *out) const {
       *out << "MInfoRec from " << from << " info: " << info;
@@ -1066,9 +1083,9 @@ public:
   };
 
   struct MLogRec : boost::statechart::event< MLogRec > {
-    int from;
+    pg_shard_t from;
     boost::intrusive_ptr<MOSDPGLog> msg;
-    MLogRec(int from, MOSDPGLog *msg) :
+    MLogRec(pg_shard_t from, MOSDPGLog *msg) :
       from(from), msg(msg) {}
     void print(std::ostream *out) const {
       *out << "MLogRec from " << from;
@@ -1076,9 +1093,9 @@ public:
   };
 
   struct MNotifyRec : boost::statechart::event< MNotifyRec > {
-    int from;
+    pg_shard_t from;
     pg_notify_t notify;
-    MNotifyRec(int from, pg_notify_t &notify) :
+    MNotifyRec(pg_shard_t from, pg_notify_t &notify) :
       from(from), notify(notify) {}
     void print(std::ostream *out) const {
       *out << "MNotifyRec from " << from << " notify: " << notify;
@@ -1086,10 +1103,10 @@ public:
   };
 
   struct MQuery : boost::statechart::event< MQuery > {
-    int from;
+    pg_shard_t from;
     pg_query_t query;
     epoch_t query_epoch;
-    MQuery(int from, const pg_query_t &query, epoch_t query_epoch):
+    MQuery(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch):
       from(from), query(query), query_epoch(query_epoch) {}
     void print(std::ostream *out) const {
       *out << "MQuery from " << from
@@ -1102,8 +1119,16 @@ public:
     OSDMapRef osdmap;
     OSDMapRef lastmap;
     vector<int> newup, newacting;
-    AdvMap(OSDMapRef osdmap, OSDMapRef lastmap, vector<int>& newup, vector<int>& newacting):
-      osdmap(osdmap), lastmap(lastmap), newup(newup), newacting(newacting) {}
+    int up_primary, acting_primary;
+    AdvMap(
+      OSDMapRef osdmap, OSDMapRef lastmap,
+      vector<int>& newup, int up_primary,
+      vector<int>& newacting, int acting_primary):
+      osdmap(osdmap), lastmap(lastmap),
+      newup(newup),
+      newacting(newacting),
+      up_primary(up_primary),
+      acting_primary(acting_primary) {}
     void print(std::ostream *out) const {
       *out << "AdvMap";
     }
@@ -1196,12 +1221,13 @@ public:
        return state->rctx->transaction;
       }
 
-      void send_query(int to, const pg_query_t &query) {
+      void send_query(pg_shard_t to, const pg_query_t &query) {
        assert(state->rctx->query_map);
-       (*state->rctx->query_map)[to][pg->info.pgid] = query;
+       (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
+         query;
       }
 
-      map<int, map<pg_t, pg_query_t> > *get_query_map() {
+      map<int, map<spg_t, pg_query_t> > *get_query_map() {
        assert(state->rctx->query_map);
        return state->rctx->query_map;
       }
@@ -1221,9 +1247,10 @@ public:
        return &(state->rctx->on_applied->contexts);
       }
 
-      void send_notify(int to, const pg_notify_t &info, const pg_interval_map_t &pi) {
+      void send_notify(pg_shard_t to,
+                      const pg_notify_t &info, const pg_interval_map_t &pi) {
        assert(state->rctx->notify_list);
-       (*state->rctx->notify_list)[to].push_back(make_pair(info, pi));
+       (*state->rctx->notify_list)[to.osd].push_back(make_pair(info, pi));
       }
     };
     friend class RecoveryMachine;
@@ -1335,10 +1362,8 @@ public:
       typedef boost::mpl::list <
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< MNotifyRec >,
-       boost::statechart::transition< NeedActingChange, WaitActingChange >,
-       boost::statechart::custom_reaction< AdvMap>
+       boost::statechart::transition< NeedActingChange, WaitActingChange >
        > reactions;
-      boost::statechart::result react(const AdvMap&);
       boost::statechart::result react(const ActMap&);
       boost::statechart::result react(const MNotifyRec&);
     };
@@ -1385,8 +1410,8 @@ public:
       Active(my_context ctx);
       void exit();
 
-      const set<int> sorted_acting_set;
-      const set<int> sorted_backfill_set;
+      const set<pg_shard_t> sorted_actingbackfill_set;
+      const set<pg_shard_t> sorted_backfill_set;
       bool all_replicas_activated;
 
       typedef boost::mpl::list <
@@ -1448,7 +1473,7 @@ public:
        boost::statechart::custom_reaction< RemoteReservationRejected >,
        boost::statechart::transition< AllBackfillsReserved, Backfilling >
        > reactions;
-      set<int>::const_iterator backfill_osd_it;
+      set<pg_shard_t>::const_iterator backfill_osd_it;
       WaitRemoteBackfillReserved(my_context ctx);
       void exit();
       boost::statechart::result react(const RemoteBackfillReserved& evt);
@@ -1550,7 +1575,7 @@ public:
        boost::statechart::custom_reaction< RemoteRecoveryReserved >,
        boost::statechart::transition< AllRemotesReserved, Recovering >
        > reactions;
-      set<int>::const_iterator acting_osd_it;
+      set<pg_shard_t>::const_iterator acting_osd_it;
       WaitRemoteRecoveryReserved(my_context ctx);
       boost::statechart::result react(const RemoteRecoveryReserved &evt);
       void exit();
@@ -1599,7 +1624,7 @@ public:
     struct GetLog;
 
     struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
-      set<int> peer_info_requested;
+      set<pg_shard_t> peer_info_requested;
 
       GetInfo(my_context ctx);
       void exit();
@@ -1620,7 +1645,7 @@ public:
     };
 
     struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
-      int newest_update_osd;
+      pg_shard_t auth_log_shard;
       boost::intrusive_ptr<MOSDPGLog> msg;
 
       GetLog(my_context ctx);
@@ -1643,7 +1668,7 @@ public:
     struct WaitFlushedPeering;
 
     struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
-      set<int> peer_missing_requested;
+      set<pg_shard_t> peer_missing_requested;
 
       GetMissing(my_context ctx);
       void exit();
@@ -1725,7 +1750,7 @@ public:
 
  public:
   PG(OSDService *o, OSDMapRef curmap,
-     const PGPool &pool, pg_t p, const hobject_t& loid, const hobject_t& ioid);
+     const PGPool &pool, spg_t p, const hobject_t& loid, const hobject_t& ioid);
   virtual ~PG();
 
  private:
@@ -1734,15 +1759,52 @@ public:
   PG& operator=(const PG& rhs);
 
  public:
-  pg_t       get_pgid() const { return info.pgid; }
+  spg_t      get_pgid() const { return info.pgid; }
   int        get_nrep() const { return acting.size(); }
 
-  int        get_primary() { return acting.empty() ? -1:acting[0]; }
+  void init_primary_up_acting(
+    const vector<int> &newup,
+    const vector<int> &newacting,
+    int new_up_primary,
+    int new_acting_primary) {
+    actingset.clear();
+    acting = newacting;
+    for (shard_id_t i = 0; i < acting.size(); ++i) {
+      if (acting[i] != CRUSH_ITEM_NONE)
+       actingset.insert(
+         pg_shard_t(
+           acting[i],
+           pool.info.ec_pool() ? i : ghobject_t::NO_SHARD));
+    }
+    up = newup;
+    if (!pool.info.ec_pool()) {
+      up_primary = pg_shard_t(new_up_primary, ghobject_t::no_shard());
+      primary = pg_shard_t(new_acting_primary, ghobject_t::no_shard());
+      return;
+    }
+    up_primary = pg_shard_t();
+    primary = pg_shard_t();
+    for (shard_id_t i = 0; i < up.size(); ++i) {
+      if (up[i] == new_up_primary) {
+       up_primary = pg_shard_t(up[i], i);
+       break;
+      }
+    }
+    for (shard_id_t i = 0; i < acting.size(); ++i) {
+      if (acting[i] == new_acting_primary) {
+       primary = pg_shard_t(acting[i], i);
+       break;
+      }
+    }
+    assert(up_primary.osd == new_up_primary);
+    assert(primary.osd == new_acting_primary);
+  }
+  pg_shard_t get_primary() const { return primary; }
   
   int        get_role() const { return role; }
   void       set_role(int r) { role = r; }
 
-  bool       is_primary() const { return role == 0; }
+  bool       is_primary() const { return pg_whoami == primary; }
   bool       is_replica() const { return role > 0; }
 
   epoch_t get_last_peering_reset() const { return last_peering_reset; }
@@ -1770,7 +1832,9 @@ public:
   void init(
     int role,
     vector<int>& up,
+    int up_primary,
     vector<int>& acting,
+    int acting_primary,
     pg_history_t& history,
     pg_interval_map_t& pim,
     bool backfill,
@@ -1831,10 +1895,11 @@ public:
   /// share new pg log entries after a pg is active
   void share_pg_log();
 
-  void start_peering_interval(const OSDMapRef lastmap,
-                             const vector<int>& newup,
-                             const vector<int>& newacting,
-                             ObjectStore::Transaction *t);
+  void start_peering_interval(
+    const OSDMapRef lastmap,
+    const vector<int>& newup, int up_primary,
+    const vector<int>& newacting, int acting_primary,
+    ObjectStore::Transaction *t);
   void start_flush(ObjectStore::Transaction *t,
                   list<Context *> *on_applied,
                   list<Context *> *on_safe);
@@ -1845,9 +1910,9 @@ public:
   }
 
   void update_history_from_master(pg_history_t new_history);
-  void fulfill_info(int from, const pg_query_t &query, 
-                   pair<int, pg_info_t> &notify_info);
-  void fulfill_log(int from, const pg_query_t &query, epoch_t query_epoch);
+  void fulfill_info(pg_shard_t from, const pg_query_t &query,
+                   pair<pg_shard_t, pg_info_t> &notify_info);
+  void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
   bool is_split(OSDMapRef lastmap, OSDMapRef nextmap);
   bool acting_up_affected(const vector<int>& newup, const vector<int>& newacting);
 
@@ -1883,18 +1948,20 @@ public:
   void queue_peering_event(CephPeeringEvtRef evt);
   void handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx);
   void queue_notify(epoch_t msg_epoch, epoch_t query_epoch,
-                   int from, pg_notify_t& i);
+                   pg_shard_t from, pg_notify_t& i);
   void queue_info(epoch_t msg_epoch, epoch_t query_epoch,
-                 int from, pg_info_t& i);
-  void queue_log(epoch_t msg_epoch, epoch_t query_epoch, int from,
+                 pg_shard_t from, pg_info_t& i);
+  void queue_log(epoch_t msg_epoch, epoch_t query_epoch, pg_shard_t from,
                 MOSDPGLog *msg);
   void queue_query(epoch_t msg_epoch, epoch_t query_epoch,
-                  int from, const pg_query_t& q);
+                  pg_shard_t from, const pg_query_t& q);
   void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
   void queue_flushed(epoch_t started_at);
-  void handle_advance_map(OSDMapRef osdmap, OSDMapRef lastmap,
-                         vector<int>& newup, vector<int>& newacting,
-                         RecoveryCtx *rctx);
+  void handle_advance_map(
+    OSDMapRef osdmap, OSDMapRef lastmap,
+    vector<int>& newup, int up_primary,
+    vector<int>& newacting, int acting_primary,
+    RecoveryCtx *rctx);
   void handle_activate_map(RecoveryCtx *rctx);
   void handle_create(RecoveryCtx *rctx);
   void handle_loaded(RecoveryCtx *rctx);
index e13369c2ee070e4641bd3bb1ce5c7b01bb17a743..0968323da79e9c3a182415616fc117af832690d7 100644 (file)
       * Called when peer is recovered
       */
      virtual void on_peer_recover(
-       int peer,
+       pg_shard_t peer,
        const hobject_t &oid,
        const ObjectRecoveryInfo &recovery_info,
        const object_stat_sum_t &stat
        ) = 0;
 
      virtual void begin_peer_recover(
-       int peer,
+       pg_shard_t peer,
        const hobject_t oid) = 0;
 
-     virtual void failed_push(int from, const hobject_t &soid) = 0;
-
+     virtual void failed_push(pg_shard_t from, const hobject_t &soid) = 0;
      
      virtual void cancel_pull(const hobject_t &soid) = 0;
 
        ObjectStore::Transaction *t,
        OpRequestRef op = OpRequestRef()
        ) = 0;
-     virtual epoch_t get_epoch() = 0;
-     virtual const vector<int> &get_actingbackfill() = 0;
+     virtual epoch_t get_epoch() const = 0;
+
+     virtual const set<pg_shard_t> &get_actingbackfill_shards() const = 0;
+
      virtual std::string gen_dbg_prefix() const = 0;
 
-     virtual const map<hobject_t, set<int> > &get_missing_loc() = 0;
-     virtual const map<int, pg_missing_t> &get_peer_missing() = 0;
-     virtual const map<int, pg_info_t> &get_peer_info() = 0;
-     virtual const pg_missing_t &get_local_missing() = 0;
-     virtual const PGLog &get_log() = 0;
+     virtual const map<hobject_t, set<pg_shard_t> > &get_missing_loc_shards()
+       const = 0;
+
+     virtual const map<pg_shard_t, pg_missing_t> &get_shard_missing()
+       const = 0;
+
+     virtual const map<pg_shard_t, pg_info_t> &get_shard_info() const = 0;
+
+     virtual const pg_missing_t &get_local_missing() const = 0;
+     virtual const PGLog &get_log() const = 0;
      virtual bool pgb_is_primary() const = 0;
      virtual OSDMapRef pgb_get_osdmap() const = 0;
      virtual const pg_info_t &get_info() const = 0;
        const eversion_t &applied_version) = 0;
 
      virtual bool should_send_op(
-       int peer,
+       pg_shard_t peer,
        const hobject_t &hoid) = 0;
 
      virtual void log_operation(
        ObjectStore::Transaction *t) = 0;
 
      virtual void update_peer_last_complete_ondisk(
-       int fromosd,
+       pg_shard_t fromosd,
        eversion_t lcod) = 0;
 
      virtual void update_last_complete_ondisk(
      virtual void schedule_work(
        GenContext<ThreadPool::TPHandle&> *c) = 0;
 
-     virtual int whoami() const = 0;
+     virtual pg_shard_t whoami_shard() const = 0;
+     int whoami() const {
+       return whoami_shard().osd;
+     }
+     spg_t whoami_spg_t() const {
+       return get_info().pgid;
+     }
+
+     virtual spg_t primary_spg_t() const = 0;
+     virtual pg_shard_t primary_shard() const = 0;
 
      virtual void send_message_osd_cluster(
        int peer, Message *m, epoch_t from_epoch) = 0;
        out->push_back(temp_coll);
    }
    void split_colls(
-     pg_t child,
+     spg_t child,
      int split_bits,
      int seed,
      ObjectStore::Transaction *t) {
      Context *on_complete) = 0;
 
    virtual bool scrub_supported() { return false; }
-   virtual void be_scan_list(ScrubMap &map, const vector<hobject_t> &ls, bool deep,
+   virtual void be_scan_list(
+     ScrubMap &map, const vector<hobject_t> &ls, bool deep,
      ThreadPool::TPHandle &handle) { assert(0); }
    virtual enum scrub_error_type be_compare_scrub_objects(
-                               const ScrubMap::object &auth,
-                               const ScrubMap::object &candidate,
-                               ostream &errorstream) { assert(0); }
-   virtual map<int, ScrubMap *>::const_iterator be_select_auth_object(
+     const ScrubMap::object &auth,
+     const ScrubMap::object &candidate,
+     ostream &errorstream) { assert(0); }
+   virtual map<pg_shard_t, ScrubMap *>::const_iterator be_select_auth_object(
      const hobject_t &obj,
-     const map<int,ScrubMap*> &maps) { assert(0); }
-   virtual void be_compare_scrubmaps(const map<int,ScrubMap*> &maps,
-                           map<hobject_t, set<int> > &missing,
-                           map<hobject_t, set<int> > &inconsistent,
-                           map<hobject_t, int> &authoritative,
-                           map<hobject_t, set<int> > &invalid_snapcolls,
-                           int &shallow_errors, int &deep_errors,
-                           const pg_t pgid,
-                           const vector<int> &acting,
-                           ostream &errorstream) { assert(0); }
+     const map<pg_shard_t,ScrubMap*> &maps) { assert(0); }
+   virtual void be_compare_scrubmaps(
+     const map<pg_shard_t,ScrubMap*> &maps,
+     map<hobject_t, set<pg_shard_t> > &missing,
+     map<hobject_t, set<pg_shard_t> > &inconsistent,
+     map<hobject_t, pg_shard_t> &authoritative,
+     map<hobject_t, set<pg_shard_t> > &invalid_snapcolls,
+     int &shallow_errors, int &deep_errors,
+     const spg_t pgid,
+     const vector<int> &acting,
+     ostream &errorstream) { assert(0); }
  };
 
 struct PG_SendMessageOnConn: public Context {
index a643a5a7fa7f99cb5e39571c65429543cd422709..c6b391521a05307959364082cb63e073c5be6c70 100644 (file)
@@ -110,7 +110,7 @@ void PGLog::clear() {
 }
 
 void PGLog::clear_info_log(
-  pg_t pgid,
+  spg_t pgid,
   const hobject_t &infos_oid,
   const hobject_t &log_oid,
   ObjectStore::Transaction *t) {
@@ -144,8 +144,10 @@ void PGLog::trim(
   }
 }
 
-void PGLog::proc_replica_log(ObjectStore::Transaction& t,
-                         pg_info_t &oinfo, const pg_log_t &olog, pg_missing_t& omissing, int from) const
+void PGLog::proc_replica_log(
+  ObjectStore::Transaction& t,
+  pg_info_t &oinfo, const pg_log_t &olog, pg_missing_t& omissing,
+  pg_shard_t from) const
 {
   dout(10) << "proc_replica_log for osd." << from << ": "
           << oinfo << " " << olog << " " << omissing << dendl;
@@ -451,7 +453,7 @@ void PGLog::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead
 }
 
 void PGLog::merge_log(ObjectStore::Transaction& t,
-                      pg_info_t &oinfo, pg_log_t &olog, int fromosd,
+                      pg_info_t &oinfo, pg_log_t &olog, pg_shard_t fromosd,
                       pg_info_t &info, LogEntryHandler *rollbacker,
                       bool &dirty_info, bool &dirty_big_info)
 {
index beafdbfc13f4945ae11d18499beff8184edd6d86..1cf9df505ecb0b0b95745548b60e7f200d6be694 100644 (file)
@@ -313,7 +313,7 @@ public:
   void reset_recovery_pointers() { log.reset_recovery_pointers(); }
 
   static void clear_info_log(
-    pg_t pgid,
+    spg_t pgid,
     const hobject_t &infos_oid,
     const hobject_t &log_oid,
     ObjectStore::Transaction *t);
@@ -385,7 +385,7 @@ public:
   }
 
   void proc_replica_log(ObjectStore::Transaction& t, pg_info_t &oinfo, const pg_log_t &olog,
-                       pg_missing_t& omissing, int from) const;
+                       pg_missing_t& omissing, pg_shard_t from) const;
 
 protected:
   bool _merge_old_entry(
@@ -418,7 +418,8 @@ public:
                             pg_info_t &info, LogEntryHandler *rollbacker,
                             bool &dirty_info, bool &dirty_big_info);
 
-  void merge_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, int from,
+  void merge_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
+                pg_shard_t from,
                 pg_info_t &info, LogEntryHandler *rollbacker,
                 bool &dirty_info, bool &dirty_big_info);
 
index bf0c8542c08e8618f103b148a2086758f8cd57dd..8dc86b4965cb1ece003db6d65639c66839763687 100644 (file)
@@ -76,10 +76,10 @@ void ReplicatedBackend::recover_object(
 
 void ReplicatedBackend::check_recovery_sources(const OSDMapRef osdmap)
 {
-  for(map<int, set<hobject_t> >::iterator i = pull_from_peer.begin();
+  for(map<pg_shard_t, set<hobject_t> >::iterator i = pull_from_peer.begin();
       i != pull_from_peer.end();
       ) {
-    if (osdmap->is_down(i->first)) {
+    if (osdmap->is_down(i->first.osd)) {
       dout(10) << "check_recovery_sources resetting pulls from osd." << i->first
               << ", osdmap has it marked down" << dendl;
       for (set<hobject_t>::iterator j = i->second.begin();
@@ -504,6 +504,14 @@ void ReplicatedBackend::submit_transaction(
       )
     ).first->second;
 
+  op.waiting_for_applied.insert(
+    parent->get_actingbackfill_shards().begin(),
+    parent->get_actingbackfill_shards().end());
+  op.waiting_for_commit.insert(
+    parent->get_actingbackfill_shards().begin(),
+    parent->get_actingbackfill_shards().end());
+
+
   issue_op(
     soid,
     at_version,
@@ -517,10 +525,6 @@ void ReplicatedBackend::submit_transaction(
     &op,
     op_t);
 
-  // add myself to gather set
-  op.waiting_for_applied.insert(osd->whoami);
-  op.waiting_for_commit.insert(osd->whoami);
-
   ObjectStore::Transaction local_t;
   if (t->get_temp_added().size()) {
     get_temp_coll(&local_t);
@@ -553,7 +557,7 @@ void ReplicatedBackend::op_applied(
   if (op->op)
     op->op->mark_event("op_applied");
 
-  op->waiting_for_applied.erase(get_parent()->whoami());
+  op->waiting_for_applied.erase(get_parent()->whoami_shard());
   parent->op_applied(op->v);
 
   if (op->waiting_for_applied.empty()) {
@@ -573,7 +577,7 @@ void ReplicatedBackend::op_commit(
   if (op->op)
     op->op->mark_event("op_commit");
 
-  op->waiting_for_commit.erase(get_parent()->whoami());
+  op->waiting_for_commit.erase(get_parent()->whoami_shard());
 
   if (op->waiting_for_commit.empty()) {
     op->on_commit->complete(0);
@@ -594,7 +598,7 @@ void ReplicatedBackend::sub_op_modify_reply(OpRequestRef op)
 
   // must be replication.
   tid_t rep_tid = r->get_tid();
-  int fromosd = r->get_source().num();
+  pg_shard_t from = r->from;
 
   if (in_progress_ops.count(rep_tid)) {
     map<tid_t, InProgressOp>::iterator iter =
@@ -607,30 +611,30 @@ void ReplicatedBackend::sub_op_modify_reply(OpRequestRef op)
     if (m)
       dout(7) << __func__ << ": tid " << ip_op.tid << " op " //<< *m
              << " ack_type " << (int)r->ack_type
-             << " from osd." << fromosd
+             << " from " << from
              << dendl;
     else
       dout(7) << __func__ << ": tid " << ip_op.tid << " (no op) "
              << " ack_type " << (int)r->ack_type
-             << " from osd." << fromosd
+             << " from " << from
              << dendl;
 
     // oh, good.
 
     if (r->ack_type & CEPH_OSD_FLAG_ONDISK) {
-      assert(ip_op.waiting_for_commit.count(fromosd));
-      ip_op.waiting_for_commit.erase(fromosd);
+      assert(ip_op.waiting_for_commit.count(from));
+      ip_op.waiting_for_commit.erase(from);
       if (ip_op.op)
        ip_op.op->mark_event("sub_op_commit_rec");
     } else {
-      assert(ip_op.waiting_for_applied.count(fromosd));
+      assert(ip_op.waiting_for_applied.count(from));
       if (ip_op.op)
        ip_op.op->mark_event("sub_op_applied_rec");
     }
-    ip_op.waiting_for_applied.erase(fromosd);
+    ip_op.waiting_for_applied.erase(from);
 
     parent->update_peer_last_complete_ondisk(
-      fromosd,
+      from,
       r->get_last_complete_ondisk());
 
     if (ip_op.waiting_for_applied.empty() &&
@@ -667,12 +671,21 @@ void ReplicatedBackend::be_scan_list(
     hobject_t poid = *p;
 
     struct stat st;
-    int r = osd->store->stat(coll, poid, &st, true);
+    int r = store->stat(
+      coll,
+      ghobject_t(
+       poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+      &st,
+      true);
     if (r == 0) {
       ScrubMap::object &o = map.objects[poid];
       o.size = st.st_size;
       assert(!o.negative);
-      osd->store->getattrs(coll, poid, o.attrs);
+      store->getattrs(
+       coll,
+       ghobject_t(
+         poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+       o.attrs);
 
       // calculate the CRC32 on deep scrubs
       if (deep) {
@@ -680,9 +693,14 @@ void ReplicatedBackend::be_scan_list(
         bufferlist bl, hdrbl;
         int r;
         __u64 pos = 0;
-        while ( (r = osd->store->read(coll, poid, pos,
-                                       cct->_conf->osd_deep_scrub_stride, bl,
-                                     true)) > 0) {
+        while ( (
+           r = store->read(
+             coll,
+             ghobject_t(
+               poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+             pos,
+             cct->_conf->osd_deep_scrub_stride, bl,
+             true)) > 0) {
          handle.reset_tp_timeout();
           h << bl;
           pos += bl.length();
@@ -697,7 +715,11 @@ void ReplicatedBackend::be_scan_list(
         o.digest_present = true;
 
         bl.clear();
-        r = osd->store->omap_get_header(coll, poid, &hdrbl, true);
+        r = store->omap_get_header(
+         coll,
+         ghobject_t(
+           poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+         &hdrbl, true);
         if (r == 0) {
           dout(25) << "CRC header " << string(hdrbl.c_str(), hdrbl.length())
              << dendl;
@@ -710,8 +732,10 @@ void ReplicatedBackend::be_scan_list(
          o.read_error = true;
        }
 
-        ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
-          coll, poid);
+        ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(
+          coll,
+         ghobject_t(
+           poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
         assert(iter);
        uint64_t keys_scanned = 0;
         for (iter->seek_to_first(); iter->valid() ; iter->next()) {
@@ -756,9 +780,9 @@ void ReplicatedBackend::be_scan_list(
 }
 
 enum scrub_error_type ReplicatedBackend::be_compare_scrub_objects(
-                               const ScrubMap::object &auth,
-                               const ScrubMap::object &candidate,
-                               ostream &errorstream)
+  const ScrubMap::object &auth,
+  const ScrubMap::object &candidate,
+  ostream &errorstream)
 {
   enum scrub_error_type error = CLEAN;
   if (candidate.read_error) {
@@ -824,12 +848,13 @@ enum scrub_error_type ReplicatedBackend::be_compare_scrub_objects(
   return error;
 }
 
-map<int, ScrubMap *>::const_iterator ReplicatedBackend::be_select_auth_object(
+map<pg_shard_t, ScrubMap *>::const_iterator
+  ReplicatedBackend::be_select_auth_object(
   const hobject_t &obj,
-  const map<int,ScrubMap*> &maps)
+  const map<pg_shard_t,ScrubMap*> &maps)
 {
-  map<int, ScrubMap *>::const_iterator auth = maps.end();
-  for (map<int, ScrubMap *>::const_iterator j = maps.begin();
+  map<pg_shard_t, ScrubMap *>::const_iterator auth = maps.end();
+  for (map<pg_shard_t, ScrubMap *>::const_iterator j = maps.begin();
        j != maps.end();
        ++j) {
     map<hobject_t, ScrubMap::object>::iterator i =
@@ -896,19 +921,19 @@ map<int, ScrubMap *>::const_iterator ReplicatedBackend::be_select_auth_object(
   return auth;
 }
 
-void ReplicatedBackend::be_compare_scrubmaps(const map<int,ScrubMap*> &maps,
-                           map<hobject_t, set<int> > &missing,
-                           map<hobject_t, set<int> > &inconsistent,
-                           map<hobject_t, int> &authoritative,
-                           map<hobject_t, set<int> > &invalid_snapcolls,
-                           int &shallow_errors,
-                           int &deep_errors,
-                           const pg_t pgid,
-                           const vector<int> &acting,
-                           ostream &errorstream)
+void ReplicatedBackend::be_compare_scrubmaps(
+  const map<pg_shard_t,ScrubMap*> &maps,
+  map<hobject_t, set<pg_shard_t> > &missing,
+  map<hobject_t, set<pg_shard_t> > &inconsistent,
+  map<hobject_t, pg_shard_t> &authoritative,
+  map<hobject_t, set<pg_shard_t> > &invalid_snapcolls,
+  int &shallow_errors, int &deep_errors,
+  const spg_t pgid,
+  const vector<int> &acting,
+  ostream &errorstream)
 {
   map<hobject_t,ScrubMap::object>::const_iterator i;
-  map<int, ScrubMap *>::const_iterator j;
+  map<pg_shard_t, ScrubMap *>::const_iterator j;
   set<hobject_t> master_set;
 
   // Construct master set
@@ -922,10 +947,11 @@ void ReplicatedBackend::be_compare_scrubmaps(const map<int,ScrubMap*> &maps,
   for (set<hobject_t>::const_iterator k = master_set.begin();
        k != master_set.end();
        ++k) {
-    map<int, ScrubMap *>::const_iterator auth = be_select_auth_object(*k, maps);
+    map<pg_shard_t, ScrubMap *>::const_iterator auth =
+      be_select_auth_object(*k, maps);
     assert(auth != maps.end());
-    set<int> cur_missing;
-    set<int> cur_inconsistent;
+    set<pg_shard_t> cur_missing;
+    set<pg_shard_t> cur_inconsistent;
     for (j = maps.begin(); j != maps.end(); ++j) {
       if (j == auth)
        continue;
@@ -941,14 +967,13 @@ void ReplicatedBackend::be_compare_scrubmaps(const map<int,ScrubMap*> &maps,
            ++shallow_errors;
           else
            ++deep_errors;
-         errorstream << pgid << " osd." << acting[j->first]
+         errorstream << pgid << " shard " << j->first
                      << ": soid " << *k << " " << ss.str() << std::endl;
        }
       } else {
        cur_missing.insert(j->first);
        ++shallow_errors;
-       errorstream << pgid
-                   << " osd." << acting[j->first]
+       errorstream << pgid << " shard " << j->first
                    << " missing " << *k << std::endl;
       }
     }
index 05dd9c761cef01aa5d5143e12e5ae8df37aebc04..7f37fd285ddf2815c1cc8effb6be9af326cb1898 100644 (file)
@@ -23,8 +23,8 @@
 struct C_ReplicatedBackend_OnPullComplete;
 class ReplicatedBackend : public PGBackend {
   struct RPGHandle : public PGBackend::RecoveryHandle {
-    map<int, vector<PushOp> > pushes;
-    map<int, vector<PullOp> > pulls;
+    map<pg_shard_t, vector<PushOp> > pushes;
+    map<pg_shard_t, vector<PullOp> > pulls;
   };
   friend struct C_ReplicatedBackend_OnPullComplete;
 public:
@@ -73,11 +73,11 @@ public:
   virtual void dump_recovery_info(Formatter *f) const {
     {
       f->open_array_section("pull_from_peer");
-      for (map<int, set<hobject_t> >::const_iterator i = pull_from_peer.begin();
+      for (map<pg_shard_t, set<hobject_t> >::const_iterator i = pull_from_peer.begin();
           i != pull_from_peer.end();
           ++i) {
        f->open_object_section("pulling_from");
-       f->dump_int("pull_from", i->first);
+       f->dump_stream("pull_from") << i->first;
        {
          f->open_array_section("pulls");
          for (set<hobject_t>::const_iterator j = i->second.begin();
@@ -96,7 +96,7 @@ public:
     }
     {
       f->open_array_section("pushing");
-      for (map<hobject_t, map<int, PushInfo> >::const_iterator i =
+      for (map<hobject_t, map<pg_shard_t, PushInfo> >::const_iterator i =
             pushing.begin();
           i != pushing.end();
           ++i) {
@@ -104,11 +104,11 @@ public:
        f->dump_stream("pushing") << i->first;
        {
          f->open_array_section("pushing_to");
-         for (map<int, PushInfo>::const_iterator j = i->second.begin();
+         for (map<pg_shard_t, PushInfo>::const_iterator j = i->second.begin();
               j != i->second.end();
               ++j) {
            f->open_object_section("push_progress");
-           f->dump_stream("object_pushing") << j->first;
+           f->dump_stream("pushing_to") << j->first;
            {
              f->open_object_section("push_info");
              j->second.dump(f);
@@ -157,7 +157,7 @@ private:
       }
     }
   };
-  map<hobject_t, map<int, PushInfo> > pushing;
+  map<hobject_t, map<pg_shard_t, PushInfo> > pushing;
 
   // pull
   struct PullInfo {
@@ -188,7 +188,7 @@ private:
   map<hobject_t, PullInfo> pulling;
 
  // Reverse mapping from osd peer to objects being pulled from that peer
-  map<int, set<hobject_t> > pull_from_peer;
+  map<pg_shard_t, set<hobject_t> > pull_from_peer;
 
   void sub_op_push(OpRequestRef op);
   void sub_op_push_reply(OpRequestRef op);
@@ -206,13 +206,13 @@ private:
   void do_pull(OpRequestRef op);
   void do_push_reply(OpRequestRef op);
 
-  bool handle_push_reply(int peer, PushReplyOp &op, PushOp *reply);
-  void handle_pull(int peer, PullOp &op, PushOp *reply);
+  bool handle_push_reply(pg_shard_t peer, PushReplyOp &op, PushOp *reply);
+  void handle_pull(pg_shard_t peer, PullOp &op, PushOp *reply);
   bool handle_pull_response(
-    int from, PushOp &op, PullOp *response,
+    pg_shard_t from, PushOp &op, PullOp *response,
     list<hobject_t> *to_continue,
     ObjectStore::Transaction *t);
-  void handle_push(int from, PushOp &op, PushReplyOp *response,
+  void handle_push(pg_shard_t from, PushOp &op, PushReplyOp *response,
                   ObjectStore::Transaction *t);
 
   static void trim_pushed_data(const interval_set<uint64_t> &copy_subset,
@@ -220,18 +220,18 @@ private:
                               bufferlist data_received,
                               interval_set<uint64_t> *intervals_usable,
                               bufferlist *data_usable);
-  void _failed_push(int from, const hobject_t &soid);
+  void _failed_push(pg_shard_t from, const hobject_t &soid);
 
-  void send_pushes(int prio, map<int, vector<PushOp> > &pushes);
+  void send_pushes(int prio, map<pg_shard_t, vector<PushOp> > &pushes);
   void prep_push_op_blank(const hobject_t& soid, PushOp *op);
-  int send_push_op_legacy(int priority, int peer,
+  int send_push_op_legacy(int priority, pg_shard_t peer,
                          PushOp &pop);
-  int send_pull_legacy(int priority, int peer,
+  int send_pull_legacy(int priority, pg_shard_t peer,
                       const ObjectRecoveryInfo& recovery_info,
                       ObjectRecoveryProgress progress);
   void send_pulls(
     int priority,
-    map<int, vector<PullOp> > &pulls);
+    map<pg_shard_t, vector<PullOp> > &pulls);
 
   int build_push_op(const ObjectRecoveryInfo &recovery_info,
                    const ObjectRecoveryProgress &progress,
@@ -265,13 +265,13 @@ private:
     ObjectContextRef obj,
     RPGHandle *h);
   void prep_push_to_replica(
-    ObjectContextRef obc, const hobject_t& soid, int peer,
+    ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer,
     PushOp *pop);
   void prep_push(ObjectContextRef obc,
-                const hobject_t& oid, int dest,
+                const hobject_t& oid, pg_shard_t dest,
                 PushOp *op);
   void prep_push(ObjectContextRef obc,
-                const hobject_t& soid, int peer,
+                const hobject_t& soid, pg_shard_t peer,
                 eversion_t version,
                 interval_set<uint64_t> &data_subset,
                 map<hobject_t, interval_set<uint64_t> >& clone_subsets,
@@ -291,8 +291,8 @@ private:
    */
   struct InProgressOp {
     tid_t tid;
-    set<int> waiting_for_commit;
-    set<int> waiting_for_applied;
+    set<pg_shard_t> waiting_for_commit;
+    set<pg_shard_t> waiting_for_applied;
     Context *on_commit;
     Context *on_applied;
     OpRequestRef op;
@@ -380,24 +380,27 @@ private:
   void sub_op_modify_applied(RepModifyRef rm);
   void sub_op_modify_commit(RepModifyRef rm);
   bool scrub_supported() { return true; }
-  void be_scan_list(ScrubMap &map, const vector<hobject_t> &ls, bool deep,
+
+  void be_scan_list(
+    ScrubMap &map, const vector<hobject_t> &ls, bool deep,
     ThreadPool::TPHandle &handle);
   enum scrub_error_type be_compare_scrub_objects(
-                               const ScrubMap::object &auth,
-                               const ScrubMap::object &candidate,
-                               ostream &errorstream);
-  map<int, ScrubMap *>::const_iterator be_select_auth_object(
+    const ScrubMap::object &auth,
+    const ScrubMap::object &candidate,
+    ostream &errorstream);
+  map<pg_shard_t, ScrubMap *>::const_iterator be_select_auth_object(
     const hobject_t &obj,
-    const map<int,ScrubMap*> &maps);
-  void be_compare_scrubmaps(const map<int,ScrubMap*> &maps,
-                           map<hobject_t, set<int> > &missing,
-                           map<hobject_t, set<int> > &inconsistent,
-                           map<hobject_t, int> &authoritative,
-                           map<hobject_t, set<int> > &invalid_snapcolls,
-                           int &shallow_errors, int &deep_errors,
-                           const pg_t pgid,
-                           const vector<int> &acting,
-                           ostream &errorstream);
+    const map<pg_shard_t,ScrubMap*> &maps);
+  void be_compare_scrubmaps(
+    const map<pg_shard_t,ScrubMap*> &maps,
+    map<hobject_t, set<pg_shard_t> > &missing,
+    map<hobject_t, set<pg_shard_t> > &inconsistent,
+    map<hobject_t, pg_shard_t> &authoritative,
+    map<hobject_t, set<pg_shard_t> > &invalid_snapcolls,
+    int &shallow_errors, int &deep_errors,
+    const spg_t pgid,
+    const vector<int> &acting,
+    ostream &errorstream);
 };
 
 #endif
index 879a192837344c5dd0778f515f46164c5f5d015c..a6e958d6452be31bd8e88f1ecc9b8f5fe71e62dd 100644 (file)
@@ -305,7 +305,7 @@ void ReplicatedPG::on_global_recover(
 }
 
 void ReplicatedPG::on_peer_recover(
-  int peer,
+  pg_shard_t peer,
   const hobject_t &soid,
   const ObjectRecoveryInfo &recovery_info,
   const object_stat_sum_t &stat)
@@ -317,7 +317,7 @@ void ReplicatedPG::on_peer_recover(
 }
 
 void ReplicatedPG::begin_peer_recover(
-  int peer,
+  pg_shard_t peer,
   const hobject_t soid)
 {
   peer_missing[peer].revise_have(soid, eversion_t());
@@ -422,8 +422,12 @@ bool ReplicatedPG::is_degraded_object(const hobject_t& soid)
 {
   if (pg_log.get_missing().missing.count(soid))
     return true;
-  for (unsigned i = 1; i < actingbackfill.size(); i++) {
-    int peer = actingbackfill[i];
+  assert(actingbackfill.size() > 0);
+  for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+       i != actingbackfill.end();
+       ++i) {
+    if (*i == get_primary()) continue;
+    pg_shard_t peer = *i;
     if (peer_missing.count(peer) &&
        peer_missing[peer].missing.count(soid))
       return true;
@@ -456,8 +460,11 @@ void ReplicatedPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef
            << dendl;
     eversion_t v;
     assert(actingbackfill.size() > 0);
-    for (unsigned i = 1; i < actingbackfill.size(); i++) {
-      int peer = actingbackfill[i];
+    for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+        i != actingbackfill.end();
+        ++i) {
+      if (*i == get_primary()) continue;
+      pg_shard_t peer = *i;
       if (peer_missing.count(peer) &&
          peer_missing[peer].missing.count(soid)) {
        v = peer_missing[peer].missing[soid].need;
@@ -582,14 +589,18 @@ int ReplicatedPG::do_command(cmdmap_t cmdmap, ostream& ss,
     f->close_section();
     if (backfill_targets.size() > 0) {
       f->open_array_section("backfill_targets");
-      for (vector<int>::iterator p = backfill_targets.begin(); p != backfill_targets.end(); ++p)
-        f->dump_unsigned("osd", *p);
+      for (set<pg_shard_t>::iterator p = backfill_targets.begin();
+          p != backfill_targets.end();
+          ++p)
+        f->dump_stream("shard") << *p;
       f->close_section();
     }
     if (actingbackfill.size() > 0) {
       f->open_array_section("actingbackfill");
-      for (vector<int>::iterator p = actingbackfill.begin(); p != actingbackfill.end(); ++p)
-        f->dump_unsigned("osd", *p);
+      for (set<pg_shard_t>::iterator p = actingbackfill.begin();
+          p != actingbackfill.end();
+          ++p)
+        f->dump_stream("shard") << *p;
       f->close_section();
     }
     f->open_object_section("info");
@@ -598,11 +609,11 @@ int ReplicatedPG::do_command(cmdmap_t cmdmap, ostream& ss,
     f->close_section();
 
     f->open_array_section("peer_info");
-    for (map<int,pg_info_t>::iterator p = peer_info.begin();
+    for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
         p != peer_info.end();
         ++p) {
       f->open_object_section("info");
-      f->dump_unsigned("peer", p->first);
+      f->dump_stream("peer") << p->first;
       p->second.dump(f.get());
       f->close_section();
     }
@@ -683,10 +694,13 @@ int ReplicatedPG::do_command(cmdmap_t cmdmap, ostream& ss,
        p->second.dump(f.get());  // have, need keys
        {
          f->open_array_section("locations");
-         map<hobject_t,set<int> >::iterator q = missing_loc.find(p->first);
+         map<hobject_t,set<pg_shard_t> >::iterator q =
+           missing_loc.find(p->first);
          if (q != missing_loc.end())
-           for (set<int>::iterator r = q->second.begin(); r != q->second.end(); ++r)
-             f->dump_int("osd", *r);
+           for (set<pg_shard_t>::iterator r = q->second.begin();
+                r != q->second.end();
+                ++r)
+             f->dump_stream("shard") << *r;
          f->close_section();
        }
        f->close_section();
@@ -761,7 +775,7 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
       // fall through
 
     case CEPH_OSD_OP_PGLS:
-      if (m->get_pg() != info.pgid) {
+      if (m->get_pg() != info.pgid.pgid) {
         dout(10) << " pgls pg=" << m->get_pg() << " != " << info.pgid << dendl;
        result = 0; // hmm?
       } else {
@@ -997,7 +1011,7 @@ void ReplicatedPG::calc_trim_to()
 }
 
 ReplicatedPG::ReplicatedPG(OSDService *o, OSDMapRef curmap,
-                          const PGPool &_pool, pg_t p, const hobject_t& oid,
+                          const PGPool &_pool, spg_t p, const hobject_t& oid,
                           const hobject_t& ioid) :
   PG(o, curmap, _pool, p, oid, ioid),
   pgbackend(new ReplicatedBackend(this, coll_t(p), o->store, cct)),
@@ -1085,9 +1099,11 @@ void ReplicatedPG::do_request(
 hobject_t ReplicatedPG::earliest_backfill() const
 {
   hobject_t e = hobject_t::get_max();
-  for (unsigned i = 0; i < backfill_targets.size(); ++i) {
-    int bt = backfill_targets[i];
-    map<int, pg_info_t>::const_iterator iter = peer_info.find(bt);
+  for (set<pg_shard_t>::iterator i = backfill_targets.begin();
+       i != backfill_targets.end();
+       ++i) {
+    pg_shard_t bt = *i;
+    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(bt);
     assert(iter != peer_info.end());
     if (iter->second.last_backfill < e)
       e = iter->second.last_backfill;
@@ -1105,9 +1121,12 @@ hobject_t ReplicatedPG::earliest_backfill() const
 // take the larger of last_backfill_started and the replicas last_backfill.
 bool ReplicatedPG::check_src_targ(const hobject_t& soid, const hobject_t& toid) const
 {
-  for (unsigned i = 0; i < backfill_targets.size(); ++i) {
-    int bt = backfill_targets[i];
-    map<int, pg_info_t>::const_iterator iter = peer_info.find(bt);
+  for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+       i != actingbackfill.end();
+       ++i) {
+    if (*i == get_primary()) continue;
+    pg_shard_t bt = *i;
+    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(bt);
     assert(iter != peer_info.end());
 
     if (toid <= MAX(last_backfill_started, iter->second.last_backfill) &&
@@ -1917,9 +1936,11 @@ void ReplicatedPG::do_scan(
        cct->_conf->osd_backfill_scan_max,
        &bi,
        handle);
-      MOSDPGScan *reply = new MOSDPGScan(MOSDPGScan::OP_SCAN_DIGEST,
-                                        get_osdmap()->get_epoch(), m->query_epoch,
-                                        info.pgid, bi.begin, bi.end);
+      MOSDPGScan *reply = new MOSDPGScan(
+       MOSDPGScan::OP_SCAN_DIGEST,
+       pg_whoami,
+       get_osdmap()->get_epoch(), m->query_epoch,
+       spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
       ::encode(bi.objects, reply->get_data());
       osd->send_message_osd_cluster(reply, m->get_connection());
     }
@@ -1927,7 +1948,7 @@ void ReplicatedPG::do_scan(
 
   case MOSDPGScan::OP_SCAN_DIGEST:
     {
-      int from = m->get_source().num();
+      pg_shard_t from = m->from;
 
       // Check that from is in backfill_targets vector
       assert(is_backfill_targets(from));
@@ -1969,7 +1990,7 @@ void ReplicatedBackend::_do_push(OpRequestRef op)
 {
   MOSDPGPush *m = static_cast<MOSDPGPush *>(op->get_req());
   assert(m->get_header().type == MSG_OSD_PG_PUSH);
-  int from = m->get_source().num();
+  pg_shard_t from = m->from;
 
   vector<PushReplyOp> replies;
   ObjectStore::Transaction *t = new ObjectStore::Transaction;
@@ -1981,6 +2002,7 @@ void ReplicatedBackend::_do_push(OpRequestRef op)
   }
 
   MOSDPGPushReply *reply = new MOSDPGPushReply;
+  reply->from = get_parent()->whoami_shard();
   reply->set_priority(m->get_priority());
   reply->pgid = get_info().pgid;
   reply->map_epoch = m->map_epoch;
@@ -2027,7 +2049,7 @@ void ReplicatedBackend::_do_pull_response(OpRequestRef op)
 {
   MOSDPGPush *m = static_cast<MOSDPGPush *>(op->get_req());
   assert(m->get_header().type == MSG_OSD_PG_PUSH);
-  int from = m->get_source().num();
+  pg_shard_t from = m->from;
 
   vector<PullOp> replies(1);
   ObjectStore::Transaction *t = new ObjectStore::Transaction;
@@ -2054,6 +2076,7 @@ void ReplicatedBackend::_do_pull_response(OpRequestRef op)
 
   if (replies.size()) {
     MOSDPGPull *reply = new MOSDPGPull;
+    reply->from = parent->whoami_shard();
     reply->set_priority(m->get_priority());
     reply->pgid = get_info().pgid;
     reply->map_epoch = m->map_epoch;
@@ -2074,9 +2097,9 @@ void ReplicatedBackend::do_pull(OpRequestRef op)
 {
   MOSDPGPull *m = static_cast<MOSDPGPull *>(op->get_req());
   assert(m->get_header().type == MSG_OSD_PG_PULL);
-  int from = m->get_source().num();
+  pg_shard_t from = m->from;
 
-  map<int, vector<PushOp> > replies;
+  map<pg_shard_t, vector<PushOp> > replies;
   for (vector<PullOp>::iterator i = m->pulls.begin();
        i != m->pulls.end();
        ++i) {
@@ -2090,7 +2113,7 @@ void ReplicatedBackend::do_push_reply(OpRequestRef op)
 {
   MOSDPGPushReply *m = static_cast<MOSDPGPushReply *>(op->get_req());
   assert(m->get_header().type == MSG_OSD_PG_PUSH_REPLY);
-  int from = m->get_source().num();
+  pg_shard_t from = m->from;
 
   vector<PushOp> replies(1);
   for (vector<PushReplyOp>::iterator i = m->replies.begin();
@@ -2102,7 +2125,7 @@ void ReplicatedBackend::do_push_reply(OpRequestRef op)
   }
   replies.erase(replies.end() - 1);
 
-  map<int, vector<PushOp> > _replies;
+  map<pg_shard_t, vector<PushOp> > _replies;
   _replies[from].swap(replies);
   send_pushes(m->get_priority(), _replies);
 }
@@ -2120,9 +2143,11 @@ void ReplicatedPG::do_backfill(OpRequestRef op)
     {
       assert(cct->_conf->osd_kill_backfill_at != 1);
 
-      MOSDPGBackfill *reply = new MOSDPGBackfill(MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
-                                                get_osdmap()->get_epoch(), m->query_epoch,
-                                                info.pgid);
+      MOSDPGBackfill *reply = new MOSDPGBackfill(
+       MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
+       get_osdmap()->get_epoch(),
+       m->query_epoch,
+       spg_t(info.pgid.pgid, primary.shard));
       reply->set_priority(cct->_conf->osd_recovery_op_priority);
       osd->send_message_osd_cluster(reply, m->get_connection());
       queue_peering_event(
@@ -5068,8 +5093,10 @@ void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type)
   ctx->obc->ssc->snapset = ctx->new_snapset;
   info.stats.stats.add(ctx->delta_stats, ctx->obs->oi.category);
 
-  for (unsigned i = 0; i < backfill_targets.size() ; ++i) {
-    int bt = backfill_targets[i];
+  for (set<pg_shard_t>::iterator i = backfill_targets.begin();
+       i != backfill_targets.end();
+       ++i) {
+    pg_shard_t bt = *i;
     pg_info_t& pinfo = peer_info[bt];
     if (soid <= pinfo.last_backfill)
       pinfo.stats.stats.add(ctx->delta_stats, ctx->obs->oi.category);
@@ -6102,7 +6129,7 @@ void ReplicatedPG::op_applied(const eversion_t &applied_version)
       scrubber.finalizing = true;
       scrub_gather_replica_maps();
       ++scrubber.waiting_on;
-      scrubber.waiting_on_whom.insert(osd->whoami);
+      scrubber.waiting_on_whom.insert(pg_whoami);
       osd->scrub_wq.queue(this);
     }
   } else {
@@ -6269,9 +6296,10 @@ void ReplicatedPG::issue_repop(RepGather *repop, utime_t now)
 
   repop->v = ctx->at_version;
 
-  for (vector<int>::iterator i = actingbackfill.begin() + 1;
+  for (set<pg_shard_t>::iterator i = actingbackfill.begin();
        i != actingbackfill.end();
        ++i) {
+    if (*i == get_primary()) continue;
     pg_info_t &pinfo = peer_info[*i];
     // keep peer_info up to date
     if (pinfo.last_complete == pinfo.last_update)
@@ -6336,25 +6364,27 @@ void ReplicatedBackend::issue_op(
 {
   int acks_wanted = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
 
-  if (parent->get_actingbackfill().size() > 1) {
+  if (parent->get_actingbackfill_shards().size() > 1) {
     ostringstream ss;
-    ss << "waiting for subops from " << 
-      vector<int>(
-       parent->get_actingbackfill().begin() + 1,
-       parent->get_actingbackfill().end());
+    set<pg_shard_t> replicas = parent->get_actingbackfill_shards();
+    replicas.erase(parent->whoami_shard());
+    ss << "waiting for subops from " << replicas;
     if (op->op)
       op->op->mark_sub_op_sent(ss.str());
   }
-  for (unsigned i=1; i<parent->get_actingbackfill().size(); i++) {
-    int peer = parent->get_actingbackfill()[i];
-    const pg_info_t &pinfo = parent->get_peer_info().find(peer)->second;
-
-    op->waiting_for_applied.insert(peer);
-    op->waiting_for_commit.insert(peer);
+  for (set<pg_shard_t>::const_iterator i =
+        parent->get_actingbackfill_shards().begin();
+       i != parent->get_actingbackfill_shards().end();
+       ++i) {
+    if (*i == parent->whoami_shard()) continue;
+    pg_shard_t peer = *i;
+    const pg_info_t &pinfo = parent->get_shard_info().find(peer)->second;
 
     // forward the write/update/whatever
     MOSDSubOp *wr = new MOSDSubOp(
-      reqid, get_info().pgid, soid,
+      reqid, parent->whoami_shard(),
+      spg_t(get_info().pgid.pgid, i->shard),
+      soid,
       false, acks_wanted,
       get_osdmap()->get_epoch(),
       tid, at_version);
@@ -6385,7 +6415,7 @@ void ReplicatedBackend::issue_op(
     wr->discard_temp_oid = discard_temp_oid;
 
     get_parent()->send_message_osd_cluster(
-      peer, wr, get_osdmap()->get_epoch());
+      peer.osd, wr, get_osdmap()->get_epoch());
   }
 }
 
@@ -7145,7 +7175,9 @@ void ReplicatedBackend::sub_op_modify_applied(RepModifyRef rm)
   
   if (!rm->committed) {
     // send ack to acker only if we haven't sent a commit already
-    MOSDSubOpReply *ack = new MOSDSubOpReply(m, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
+    MOSDSubOpReply *ack = new MOSDSubOpReply(
+      m, parent->whoami_shard(),
+      0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
     ack->set_priority(CEPH_MSG_PRIO_HIGH); // this better match commit priority!
     get_parent()->send_message_osd_cluster(
       rm->ackerosd, ack, get_osdmap()->get_epoch());
@@ -7166,7 +7198,10 @@ void ReplicatedBackend::sub_op_modify_commit(RepModifyRef rm)
   
   assert(get_osdmap()->is_up(rm->ackerosd));
   get_parent()->update_last_complete_ondisk(rm->last_complete);
-  MOSDSubOpReply *commit = new MOSDSubOpReply(static_cast<MOSDSubOp*>(rm->op->get_req()), 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ONDISK);
+  MOSDSubOpReply *commit = new MOSDSubOpReply(
+    static_cast<MOSDSubOp*>(rm->op->get_req()),
+    get_parent()->whoami_shard(),
+    0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ONDISK);
   commit->set_last_complete_ondisk(rm->last_complete);
   commit->set_priority(CEPH_MSG_PRIO_HIGH); // this better match ack priority!
   get_parent()->send_message_osd_cluster(
@@ -7334,34 +7369,32 @@ void ReplicatedBackend::prepare_pull(
   eversion_t _v = get_parent()->get_local_missing().missing.find(
     soid)->second.need;
   assert(_v == v);
-  const map<hobject_t, set<int> > &missing_loc(
-    get_parent()->get_missing_loc());
-  const map<int, pg_missing_t > &peer_missing(
-    get_parent()->get_peer_missing());
-  int fromosd = -1;
-  map<hobject_t,set<int> >::const_iterator q = missing_loc.find(soid);
+  const map<hobject_t, set<pg_shard_t> > &missing_loc(
+    get_parent()->get_missing_loc_shards());
+  const map<pg_shard_t, pg_missing_t > &peer_missing(
+    get_parent()->get_shard_missing());
+  map<hobject_t, set<pg_shard_t> >::const_iterator q = missing_loc.find(soid);
   assert(q != missing_loc.end());
   assert(!q->second.empty());
 
   // pick a pullee
-  vector<int> shuffle(q->second.begin(), q->second.end());
+  vector<pg_shard_t> shuffle(q->second.begin(), q->second.end());
   random_shuffle(shuffle.begin(), shuffle.end());
-  vector<int>::iterator p = shuffle.begin();
-  assert(get_osdmap()->is_up(*p));
-  fromosd = *p;
-  assert(fromosd >= 0);
+  vector<pg_shard_t>::iterator p = shuffle.begin();
+  assert(get_osdmap()->is_up(p->osd));
+  pg_shard_t fromshard = *p;
 
   dout(7) << "pull " << soid
          << "v " << v
          << " on osds " << *p
-         << " from osd." << fromosd
+         << " from osd." << fromshard
          << dendl;
 
-  assert(peer_missing.count(fromosd));
-  const pg_missing_t &pmissing = peer_missing.find(fromosd)->second;
+  assert(peer_missing.count(fromshard));
+  const pg_missing_t &pmissing = peer_missing.find(fromshard)->second;
   if (pmissing.is_missing(soid, v)) {
     assert(pmissing.missing.find(soid)->second.have != v);
-    dout(10) << "pulling soid " << soid << " from osd " << fromosd
+    dout(10) << "pulling soid " << soid << " from osd " << fromshard
             << " at version " << pmissing.missing.find(soid)->second.have
             << " rather than at version " << v << dendl;
     v = pmissing.missing.find(soid)->second.have;
@@ -7398,8 +7431,8 @@ void ReplicatedBackend::prepare_pull(
     recovery_info.size = ((uint64_t)-1);
   }
 
-  h->pulls[fromosd].push_back(PullOp());
-  PullOp &op = h->pulls[fromosd].back();
+  h->pulls[fromshard].push_back(PullOp());
+  PullOp &op = h->pulls[fromshard].back();
   op.soid = soid;
 
   op.recovery_info = recovery_info;
@@ -7411,7 +7444,7 @@ void ReplicatedBackend::prepare_pull(
   op.recovery_progress.first = true;
 
   assert(!pulling.count(soid));
-  pull_from_peer[fromosd].insert(soid);
+  pull_from_peer[fromshard].insert(soid);
   PullInfo &pi = pulling[soid];
   pi.head_ctx = headctx;
   pi.recovery_info = op.recovery_info;
@@ -7423,7 +7456,7 @@ int ReplicatedPG::recover_missing(
   int priority,
   PGBackend::RecoveryHandle *h)
 {
-  map<hobject_t,set<int> >::iterator q = missing_loc.find(soid);
+  map<hobject_t, set<pg_shard_t> >::iterator q = missing_loc.find(soid);
   if (q == missing_loc.end()) {
     dout(7) << "pull " << soid
            << " v " << v 
@@ -7489,7 +7522,8 @@ int ReplicatedPG::recover_missing(
   return PULL_YES;
 }
 
-void ReplicatedPG::send_remove_op(const hobject_t& oid, eversion_t v, int peer)
+void ReplicatedPG::send_remove_op(
+  const hobject_t& oid, eversion_t v, pg_shard_t peer)
 {
   tid_t tid = osd->get_tid();
   osd_reqid_t rid(osd->get_cluster_msgr_name(), 0, tid);
@@ -7497,12 +7531,14 @@ void ReplicatedPG::send_remove_op(const hobject_t& oid, eversion_t v, int peer)
   dout(10) << "send_remove_op " << oid << " from osd." << peer
           << " tid " << tid << dendl;
 
-  MOSDSubOp *subop = new MOSDSubOp(rid, info.pgid, oid, false, CEPH_OSD_FLAG_ACK,
-                                  get_osdmap()->get_epoch(), tid, v);
+  MOSDSubOp *subop = new MOSDSubOp(
+    rid, pg_whoami, spg_t(info.pgid.pgid, peer.shard),
+    oid, false, CEPH_OSD_FLAG_ACK,
+    get_osdmap()->get_epoch(), tid, v);
   subop->ops = vector<OSDOp>(1);
   subop->ops[0].op.op = CEPH_OSD_OP_DELETE;
 
-  osd->send_message_osd_cluster(peer, subop, get_osdmap()->get_epoch());
+  osd->send_message_osd_cluster(peer.osd, subop, get_osdmap()->get_epoch());
 }
 
 /*
@@ -7510,7 +7546,7 @@ void ReplicatedPG::send_remove_op(const hobject_t& oid, eversion_t v, int peer)
  * clones/heads and dup data ranges where possible.
  */
 void ReplicatedBackend::prep_push_to_replica(
-  ObjectContextRef obc, const hobject_t& soid, int peer,
+  ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer,
   PushOp *pop)
 {
   const object_info_t& oi = obc->obs.oi;
@@ -7544,12 +7580,12 @@ void ReplicatedBackend::prep_push_to_replica(
     SnapSetContext *ssc = obc->ssc;
     assert(ssc);
     dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl;
-    map<int, pg_missing_t>::const_iterator pm =
-      get_parent()->get_peer_missing().find(peer);
-    assert(pm != get_parent()->get_peer_missing().end());
-    map<int, pg_info_t>::const_iterator pi =
-      get_parent()->get_peer_info().find(peer);
-    assert(pi != get_parent()->get_peer_info().end());
+    map<pg_shard_t, pg_missing_t>::const_iterator pm =
+      get_parent()->get_shard_missing().find(peer);
+    assert(pm != get_parent()->get_shard_missing().end());
+    map<pg_shard_t, pg_info_t>::const_iterator pi =
+      get_parent()->get_shard_info().find(peer);
+    assert(pi != get_parent()->get_shard_info().end());
     calc_clone_subsets(ssc->snapset, soid,
                       pm->second,
                       pi->second.last_backfill,
@@ -7562,8 +7598,8 @@ void ReplicatedBackend::prep_push_to_replica(
     dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl;
     calc_head_subsets(
       obc,
-      ssc->snapset, soid, get_parent()->get_peer_missing().find(peer)->second,
-      get_parent()->get_peer_info().find(peer)->second.last_backfill,
+      ssc->snapset, soid, get_parent()->get_shard_missing().find(peer)->second,
+      get_parent()->get_shard_info().find(peer)->second.last_backfill,
       data_subset, clone_subsets);
   }
 
@@ -7571,7 +7607,7 @@ void ReplicatedBackend::prep_push_to_replica(
 }
 
 void ReplicatedBackend::prep_push(ObjectContextRef obc,
-                            const hobject_t& soid, int peer,
+                            const hobject_t& soid, pg_shard_t peer,
                             PushOp *pop)
 {
   interval_set<uint64_t> data_subset;
@@ -7586,7 +7622,7 @@ void ReplicatedBackend::prep_push(ObjectContextRef obc,
 
 void ReplicatedBackend::prep_push(
   ObjectContextRef obc,
-  const hobject_t& soid, int peer,
+  const hobject_t& soid, pg_shard_t peer,
   eversion_t version,
   interval_set<uint64_t> &data_subset,
   map<hobject_t, interval_set<uint64_t> >& clone_subsets,
@@ -7617,7 +7653,7 @@ void ReplicatedBackend::prep_push(
   pi.recovery_progress = new_progress;
 }
 
-int ReplicatedBackend::send_pull_legacy(int prio, int peer,
+int ReplicatedBackend::send_pull_legacy(int prio, pg_shard_t peer,
                                        const ObjectRecoveryInfo &recovery_info,
                                        ObjectRecoveryProgress progress)
 {
@@ -7632,10 +7668,12 @@ int ReplicatedBackend::send_pull_legacy(int prio, int peer,
           << " from osd." << peer
           << " tid " << tid << dendl;
 
-  MOSDSubOp *subop = new MOSDSubOp(rid, get_info().pgid, recovery_info.soid,
-                                  false, CEPH_OSD_FLAG_ACK,
-                                  get_osdmap()->get_epoch(), tid,
-                                  recovery_info.version);
+  MOSDSubOp *subop = new MOSDSubOp(
+    rid, parent->whoami_shard(),
+    get_info().pgid, recovery_info.soid,
+    false, CEPH_OSD_FLAG_ACK,
+    get_osdmap()->get_epoch(), tid,
+    recovery_info.version);
   subop->set_priority(prio);
   subop->ops = vector<OSDOp>(1);
   subop->ops[0].op.op = CEPH_OSD_OP_PULL;
@@ -7644,7 +7682,7 @@ int ReplicatedBackend::send_pull_legacy(int prio, int peer,
   subop->recovery_progress = progress;
 
   get_parent()->send_message_osd_cluster(
-    peer, subop, get_osdmap()->get_epoch());
+    peer.osd, subop, get_osdmap()->get_epoch());
 
   get_parent()->get_logger()->inc(l_osd_pull);
   return 0;
@@ -7740,7 +7778,7 @@ ObjectRecoveryInfo ReplicatedBackend::recalc_subsets(
 }
 
 bool ReplicatedBackend::handle_pull_response(
-  int from, PushOp &pop, PullOp *response,
+  pg_shard_t from, PushOp &pop, PullOp *response,
   list<hobject_t> *to_continue,
   ObjectStore::Transaction *t
   )
@@ -7842,7 +7880,7 @@ struct C_OnPushCommit : public Context {
 };
 
 void ReplicatedBackend::handle_push(
-  int from, PushOp &pop, PushReplyOp *response,
+  pg_shard_t from, PushOp &pop, PushReplyOp *response,
   ObjectStore::Transaction *t)
 {
   dout(10) << "handle_push "
@@ -7875,13 +7913,13 @@ void ReplicatedBackend::handle_push(
       t);
 }
 
-void ReplicatedBackend::send_pushes(int prio, map<int, vector<PushOp> > &pushes)
+void ReplicatedBackend::send_pushes(int prio, map<pg_shard_t, vector<PushOp> > &pushes)
 {
-  for (map<int, vector<PushOp> >::iterator i = pushes.begin();
+  for (map<pg_shard_t, vector<PushOp> >::iterator i = pushes.begin();
        i != pushes.end();
        ++i) {
     ConnectionRef con = get_parent()->get_con_osd_cluster(
-      i->first,
+      i->first.osd,
       get_osdmap()->get_epoch());
     if (!con)
       continue;
@@ -7899,7 +7937,8 @@ void ReplicatedBackend::send_pushes(int prio, map<int, vector<PushOp> > &pushes)
        uint64_t cost = 0;
        uint64_t pushes = 0;
        MOSDPGPush *msg = new MOSDPGPush();
-       msg->pgid = get_info().pgid;
+       msg->from = get_parent()->whoami_shard();
+       msg->pgid = get_parent()->primary_spg_t();
        msg->map_epoch = get_osdmap()->get_epoch();
        msg->set_priority(prio);
        for (;
@@ -7920,13 +7959,13 @@ void ReplicatedBackend::send_pushes(int prio, map<int, vector<PushOp> > &pushes)
   }
 }
 
-void ReplicatedBackend::send_pulls(int prio, map<int, vector<PullOp> > &pulls)
+void ReplicatedBackend::send_pulls(int prio, map<pg_shard_t, vector<PullOp> > &pulls)
 {
-  for (map<int, vector<PullOp> >::iterator i = pulls.begin();
+  for (map<pg_shard_t, vector<PullOp> >::iterator i = pulls.begin();
        i != pulls.end();
        ++i) {
     ConnectionRef con = get_parent()->get_con_osd_cluster(
-      i->first,
+      i->first.osd,
       get_osdmap()->get_epoch());
     if (!con)
       continue;
@@ -7946,8 +7985,9 @@ void ReplicatedBackend::send_pulls(int prio, map<int, vector<PullOp> > &pulls)
       dout(20) << __func__ << ": sending pulls " << i->second
               << " to osd." << i->first << dendl;
       MOSDPGPull *msg = new MOSDPGPull();
+      msg->from = parent->whoami_shard();
       msg->set_priority(prio);
-      msg->pgid = get_info().pgid;
+      msg->pgid = get_parent()->primary_spg_t();
       msg->map_epoch = get_osdmap()->get_epoch();
       msg->pulls.swap(i->second);
       msg->compute_cost(cct);
@@ -8069,13 +8109,15 @@ int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info,
   return 0;
 }
 
-int ReplicatedBackend::send_push_op_legacy(int prio, int peer, PushOp &pop)
+int ReplicatedBackend::send_push_op_legacy(int prio, pg_shard_t peer, PushOp &pop)
 {
   tid_t tid = get_parent()->get_tid();
   osd_reqid_t rid(get_parent()->get_cluster_msgr_name(), 0, tid);
-  MOSDSubOp *subop = new MOSDSubOp(rid, get_info().pgid, pop.soid,
-                                  false, 0, get_osdmap()->get_epoch(),
-                                  tid, pop.recovery_info.version);
+  MOSDSubOp *subop = new MOSDSubOp(
+    rid, parent->whoami_shard(),
+    spg_t(get_info().pgid.pgid, peer.shard), pop.soid,
+    false, 0, get_osdmap()->get_epoch(),
+    tid, pop.recovery_info.version);
   subop->ops = vector<OSDOp>(1);
   subop->ops[0].op.op = CEPH_OSD_OP_PUSH;
 
@@ -8090,7 +8132,7 @@ int ReplicatedBackend::send_push_op_legacy(int prio, int peer, PushOp &pop)
   subop->current_progress = pop.before_progress;
   subop->recovery_progress = pop.after_progress;
 
-  get_parent()->send_message_osd_cluster(peer, subop, get_osdmap()->get_epoch());
+  get_parent()->send_message_osd_cluster(peer.osd, subop, get_osdmap()->get_epoch());
   return 0;
 }
 
@@ -8107,7 +8149,7 @@ void ReplicatedBackend::sub_op_push_reply(OpRequestRef op)
   const hobject_t& soid = reply->get_poid();
   assert(reply->get_header().type == MSG_OSD_SUBOPREPLY);
   dout(10) << "sub_op_push_reply from " << reply->get_source() << " " << *reply << dendl;
-  int peer = reply->get_source().num();
+  pg_shard_t peer = reply->from;
 
   op->mark_started();
   
@@ -8119,7 +8161,7 @@ void ReplicatedBackend::sub_op_push_reply(OpRequestRef op)
     send_push_op_legacy(op->get_req()->get_priority(), peer, pop);
 }
 
-bool ReplicatedBackend::handle_push_reply(int peer, PushReplyOp &op, PushOp *reply)
+bool ReplicatedBackend::handle_push_reply(pg_shard_t peer, PushReplyOp &op, PushOp *reply)
 {
   const hobject_t &soid = op.soid;
   if (pushing.count(soid) == 0) {
@@ -8217,16 +8259,16 @@ void ReplicatedBackend::sub_op_pull(OpRequestRef op)
   pop.recovery_progress = m->recovery_progress;
 
   PushOp reply;
-  handle_pull(m->get_source().num(), pop, &reply);
+  handle_pull(m->from, pop, &reply);
   send_push_op_legacy(
     m->get_priority(),
-    m->get_source().num(),
+    m->from,
     reply);
 
   log_subop_stats(get_parent()->get_logger(), op, 0, l_osd_sop_pull_lat);
 }
 
-void ReplicatedBackend::handle_pull(int peer, PullOp &op, PushOp *reply)
+void ReplicatedBackend::handle_pull(pg_shard_t peer, PullOp &op, PushOp *reply)
 {
   const hobject_t &soid = op.soid;
   struct stat st;
@@ -8267,10 +8309,13 @@ void ReplicatedPG::_committed_pushed_object(
       if (!is_primary()) {
         // Either we are a replica or backfill target.
        // we are fully up to date.  tell the primary!
-       osd->send_message_osd_cluster(get_primary(),
-                                     new MOSDPGTrim(get_osdmap()->get_epoch(), info.pgid,
-                                                    last_complete_ondisk),
-                                     get_osdmap()->get_epoch());
+       osd->send_message_osd_cluster(
+         get_primary().osd,
+         new MOSDPGTrim(
+           get_osdmap()->get_epoch(),
+           spg_t(info.pgid.pgid, primary.shard),
+           last_complete_ondisk),
+         get_osdmap()->get_epoch());
       } else {
        // we are the primary.  tell replicas to trim?
        if (calc_min_last_complete_ondisk())
@@ -8412,12 +8457,12 @@ void ReplicatedBackend::sub_op_push(OpRequestRef op)
     RPGHandle *h = _open_recovery_op();
     list<hobject_t> to_continue;
     bool more = handle_pull_response(
-      m->get_source().num(), pop, &resp,
+      m->from, pop, &resp,
       &to_continue, t);
     if (more) {
       send_pull_legacy(
        m->get_priority(),
-       m->get_source().num(),
+       m->from,
        resp.recovery_info,
        resp.recovery_progress);
     } else {
@@ -8435,10 +8480,11 @@ void ReplicatedBackend::sub_op_push(OpRequestRef op)
   } else {
     PushReplyOp resp;
     MOSDSubOpReply *reply = new MOSDSubOpReply(
-      m, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
+      m, parent->whoami_shard(), 0,
+      get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
     reply->set_priority(m->get_priority());
     assert(entity_name_t::TYPE_OSD == m->get_connection()->peer_type);
-    handle_push(m->get_source().num(), pop, &resp, t);
+    handle_push(m->from, pop, &resp, t);
     t->register_on_complete(new PG_SendMessageOnConn(
                              get_parent(), reply, m->get_connection()));
   }
@@ -8448,26 +8494,26 @@ void ReplicatedBackend::sub_op_push(OpRequestRef op)
   return;
 }
 
-void ReplicatedPG::failed_push(int from, const hobject_t &soid)
+void ReplicatedPG::failed_push(pg_shard_t from, const hobject_t &soid)
 {
   assert(recovering.count(soid));
   recovering.erase(soid);
-  map<hobject_t,set<int> >::iterator p = missing_loc.find(soid);
+  map<hobject_t,set<pg_shard_t> >::iterator p = missing_loc.find(soid);
   if (p != missing_loc.end()) {
-    dout(0) << "_failed_push " << soid << " from osd." << from
+    dout(0) << "_failed_push " << soid << " from shard " << from
            << ", reps on " << p->second << dendl;
 
     p->second.erase(from);          // forget about this (bad) peer replica
     if (p->second.empty())
       missing_loc.erase(p);
   } else {
-    dout(0) << "_failed_push " << soid << " from osd." << from
+    dout(0) << "_failed_push " << soid << " from shard " << from
            << " but not in missing_loc ???" << dendl;
   }
   finish_recovery_op(soid);  // close out this attempt,
 }
 
-void ReplicatedBackend::_failed_push(int from, const hobject_t &soid)
+void ReplicatedBackend::_failed_push(pg_shard_t from, const hobject_t &soid)
 {
   get_parent()->failed_push(from, soid);
   pull_from_peer[from].erase(soid);
@@ -8500,8 +8546,11 @@ eversion_t ReplicatedPG::pick_newest_available(const hobject_t& oid)
   dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
 
   assert(actingbackfill.size() > 0);
-  for (unsigned i=1; i<actingbackfill.size(); ++i) {
-    int peer = actingbackfill[i];
+  for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+       i != actingbackfill.end();
+       ++i) {
+    if (*i == get_primary()) continue;
+    pg_shard_t peer = *i;
     if (!peer_missing[peer].is_missing(oid)) {
       assert(is_backfill_targets(peer));
       continue;
@@ -8781,7 +8830,7 @@ void ReplicatedPG::on_shutdown()
   osd->local_reserver.cancel_reservation(info.pgid);
 
   clear_primary_state();
-  osd->remove_want_pg_temp(info.pgid);
+  osd->remove_want_pg_temp(info.pgid.pgid);
   cancel_recovery();
 }
 
@@ -8793,9 +8842,11 @@ void ReplicatedPG::on_activate()
     assert(!last_backfill_started.is_max());
     dout(5) << "on activate: bft=" << backfill_targets
           << " from " << last_backfill_started << dendl;
-    for (unsigned i = 0; i < backfill_targets.size(); ++i) {
-      dout(5) << "target osd." << backfill_targets[i]
-            << " from " << peer_info[backfill_targets[i]].last_backfill
+    for (set<pg_shard_t>::iterator i = backfill_targets.begin();
+        i != backfill_targets.end();
+        ++i) {
+      dout(5) << "target shard " << *i
+            << " from " << peer_info[*i].last_backfill
             << dendl;
     }
   }
@@ -8930,11 +8981,11 @@ void ReplicatedPG::check_recovery_sources(const OSDMapRef osdmap)
    * check that any peers we are planning to (or currently) pulling
    * objects from are dealt with.
    */
-  set<int> now_down;
-  for (set<int>::iterator p = missing_loc_sources.begin();
+  set<pg_shard_t> now_down;
+  for (set<pg_shard_t>::iterator p = missing_loc_sources.begin();
        p != missing_loc_sources.end();
        ) {
-    if (osdmap->is_up(*p)) {
+    if (osdmap->is_up(p->osd)) {
       ++p;
       continue;
     }
@@ -8951,9 +9002,9 @@ void ReplicatedPG::check_recovery_sources(const OSDMapRef osdmap)
             << missing_loc_sources << dendl;
     
     // filter missing_loc
-    map<hobject_t, set<int> >::iterator p = missing_loc.begin();
+    map<hobject_t, set<pg_shard_t> >::iterator p = missing_loc.begin();
     while (p != missing_loc.end()) {
-      set<int>::iterator q = p->second.begin();
+      set<pg_shard_t>::iterator q = p->second.begin();
       while (q != p->second.end())
        if (now_down.count(*q)) {
          p->second.erase(q++);
@@ -8968,10 +9019,10 @@ void ReplicatedPG::check_recovery_sources(const OSDMapRef osdmap)
     }
   }
 
-  for (set<int>::iterator i = peer_log_requested.begin();
+  for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
        i != peer_log_requested.end();
        ) {
-    if (!osdmap->is_up(*i)) {
+    if (!osdmap->is_up(i->osd)) {
       dout(10) << "peer_log_requested removing " << *i << dendl;
       peer_log_requested.erase(i++);
     } else {
@@ -8979,10 +9030,10 @@ void ReplicatedPG::check_recovery_sources(const OSDMapRef osdmap)
     }
   }
 
-  for (set<int>::iterator i = peer_missing_requested.begin();
+  for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
        i != peer_missing_requested.end();
        ) {
-    if (!osdmap->is_up(*i)) {
+    if (!osdmap->is_up(i->osd)) {
       dout(10) << "peer_missing_requested removing " << *i << dendl;
       peer_missing_requested.erase(i++);
     } else {
@@ -9238,8 +9289,10 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle)
            eversion_t alternate_need = latest->reverting_to;
            dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
 
-           set<int>& loc = missing_loc[soid];
-           for (map<int,pg_missing_t>::iterator p = peer_missing.begin(); p != peer_missing.end(); ++p)
+           set<pg_shard_t>& loc = missing_loc[soid];
+           for (map<pg_shard_t, pg_missing_t>::iterator p = peer_missing.begin();
+                p != peer_missing.end();
+                ++p)
              if (p->second.is_missing(soid, need) &&
                  p->second.missing[soid].have == alternate_need) {
                missing_loc_sources.insert(p->first);
@@ -9300,13 +9353,16 @@ int ReplicatedPG::prep_object_replica_pushes(
     pg_log.missing_add(soid, v, eversion_t());
     bool uhoh = true;
     assert(actingbackfill.size() > 0);
-    for (unsigned i=1; i<actingbackfill.size(); i++) {
-      int peer = actingbackfill[i];
+    for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+        i != actingbackfill.end();
+        ++i) {
+      if (*i == get_primary()) continue;
+      pg_shard_t peer = *i;
       if (!peer_missing[peer].is_missing(soid, v)) {
        missing_loc[soid].insert(peer);
        missing_loc_sources.insert(peer);
        dout(10) << info.pgid << " unexpectedly missing " << soid << " v" << v
-                << ", there should be a copy on osd." << peer << dendl;
+                << ", there should be a copy on shard " << peer << dendl;
        uhoh = false;
       }
     }
@@ -9345,12 +9401,16 @@ int ReplicatedBackend::start_pushes(
 {
   int pushes = 0;
   // who needs it?  
-  assert(get_parent()->get_actingbackfill().size() > 0);
-  for (unsigned i=1; i<get_parent()->get_actingbackfill().size(); i++) {
-    int peer = get_parent()->get_actingbackfill()[i];
-    map<int, pg_missing_t>::const_iterator j =
-      get_parent()->get_peer_missing().find(peer);
-    assert(j != get_parent()->get_peer_missing().end());
+  assert(get_parent()->get_actingbackfill_shards().size() > 0);
+  for (set<pg_shard_t>::iterator i =
+        get_parent()->get_actingbackfill_shards().begin();
+       i != get_parent()->get_actingbackfill_shards().end();
+       ++i) {
+    if (*i == get_parent()->whoami_shard()) continue;
+    pg_shard_t peer = *i;
+    map<pg_shard_t, pg_missing_t>::const_iterator j =
+      get_parent()->get_shard_missing().find(peer);
+    assert(j != get_parent()->get_shard_missing().end());
     if (j->second.is_missing(soid)) {
       ++pushes;
       h->pushes[peer].push_back(PushOp());
@@ -9371,11 +9431,14 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
 
   // this is FAR from an optimal recovery order.  pretty lame, really.
   assert(actingbackfill.size() > 0);
-  for (unsigned i=1; i<actingbackfill.size(); i++) {
-    int peer = actingbackfill[i];
-    map<int, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
+  for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+       i != actingbackfill.end();
+       ++i) {
+    if (*i == get_primary()) continue;
+    pg_shard_t peer = *i;
+    map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
     assert(pm != peer_missing.end());
-    map<int, pg_info_t>::const_iterator pi = peer_info.find(peer);
+    map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
     assert(pi != peer_info.end());
     size_t m_sz = pm->second.num_missing();
 
@@ -9426,9 +9489,11 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
 hobject_t ReplicatedPG::earliest_peer_backfill() const
 {
   hobject_t e = hobject_t::get_max();
-  for (unsigned i = 0; i < backfill_targets.size(); ++i) {
-    int peer = backfill_targets[i];
-    map<int, BackfillInterval>::const_iterator iter =
+  for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
+       i != backfill_targets.end();
+       ++i) {
+    pg_shard_t peer = *i;
+    map<pg_shard_t, BackfillInterval>::const_iterator iter =
       peer_backfill_info.find(peer);
     assert(iter != peer_backfill_info.end());
     if (iter->second.begin < e)
@@ -9442,9 +9507,11 @@ bool ReplicatedPG::all_peer_done() const
   // Primary hasn't got any more objects
   assert(backfill_info.empty());
 
-  for (unsigned i = 0; i < backfill_targets.size(); ++i) {
-    int bt = backfill_targets[i];
-    map<int, BackfillInterval>::const_iterator piter =
+  for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
+       i != backfill_targets.end();
+       ++i) {
+    pg_shard_t bt = *i;
+    map<pg_shard_t, BackfillInterval>::const_iterator piter =
       peer_backfill_info.find(bt);
     assert(piter != peer_backfill_info.end());
     const BackfillInterval& pbi = piter->second;
@@ -9497,20 +9564,22 @@ int ReplicatedPG::recover_backfill(
     // on_activate() was called prior to getting here
     assert(last_backfill_started == earliest_backfill());
     new_backfill = false;
-    for (unsigned i = 0; i < backfill_targets.size(); ++i) {
-      int bt = backfill_targets[i];
-      peer_backfill_info[bt].reset(peer_info[bt].last_backfill);
+    for (set<pg_shard_t>::iterator i = backfill_targets.begin();
+        i != backfill_targets.end();
+        ++i) {
+      peer_backfill_info[*i].reset(peer_info[*i].last_backfill);
     }
     backfill_info.reset(last_backfill_started);
   }
 
-  for (unsigned i = 0; i < backfill_targets.size(); ++i) {
-    int bt = backfill_targets[i];
-    dout(10) << "peer osd." << bt
-          << " info " << peer_info[bt]
-          << " interval " << peer_backfill_info[bt].begin
-          << "-" << peer_backfill_info[bt].end
-          << " " << peer_backfill_info[bt].objects.size() << " objects"
+  for (set<pg_shard_t>::iterator i = backfill_targets.begin();
+       i != backfill_targets.end();
+       ++i) {
+    dout(10) << "peer osd." << *i
+          << " info " << peer_info[*i]
+          << " interval " << peer_backfill_info[*i].begin
+          << "-" << peer_backfill_info[*i].end
+          << " " << peer_backfill_info[*i].objects.size() << " objects"
           << dendl;
   }
 
@@ -9520,13 +9589,14 @@ int ReplicatedPG::recover_backfill(
 
   int ops = 0;
   vector<boost::tuple<hobject_t, eversion_t,
-                      ObjectContextRef, vector<int> > > to_push;
-  vector<boost::tuple<hobject_t, eversion_t, int> > to_remove;
+                      ObjectContextRef, vector<pg_shard_t> > > to_push;
+  vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
   set<hobject_t> add_to_stat;
 
-  for (unsigned i = 0; i < backfill_targets.size(); ++i) {
-    int bt = backfill_targets[i];
-    peer_backfill_info[bt].trim_to(last_backfill_started);
+  for (set<pg_shard_t>::iterator i = backfill_targets.begin();
+       i != backfill_targets.end();
+       ++i) {
+    peer_backfill_info[*i].trim_to(last_backfill_started);
   }
   backfill_info.trim_to(last_backfill_started);
 
@@ -9549,19 +9619,23 @@ int ReplicatedPG::recover_backfill(
             << dendl;
 
     bool sent_scan = false;
-    for (unsigned i = 0; i < backfill_targets.size(); ++i) {
-      int bt = backfill_targets[i];
+    for (set<pg_shard_t>::iterator i = backfill_targets.begin();
+        i != backfill_targets.end();
+        ++i) {
+      pg_shard_t bt = *i;
       BackfillInterval& pbi = peer_backfill_info[bt];
 
-      dout(20) << " peer osd." << bt << " backfill " << pbi.begin << "-"
+      dout(20) << " peer shard " << bt << " backfill " << pbi.begin << "-"
               << pbi.end << " " << pbi.objects << dendl;
       if (pbi.begin <= backfill_info.begin &&
          !pbi.extends_to_end() && pbi.empty()) {
        dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
        epoch_t e = get_osdmap()->get_epoch();
-       MOSDPGScan *m = new MOSDPGScan(MOSDPGScan::OP_SCAN_GET_DIGEST, e, e, info.pgid,
-                                    pbi.end, hobject_t());
-       osd->send_message_osd_cluster(bt, m, get_osdmap()->get_epoch());
+       MOSDPGScan *m = new MOSDPGScan(
+         MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, e,
+         spg_t(info.pgid.pgid, bt.shard),
+         pbi.end, hobject_t());
+       osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
        assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
        waiting_on_backfill.insert(bt);
         sent_scan = true;
@@ -9586,19 +9660,23 @@ int ReplicatedPG::recover_backfill(
 
     if (check < backfill_info.begin) {
 
-      vector<int> check_targets;
-      for (unsigned i = 0; i < backfill_targets.size(); ++i) {
-        int bt = backfill_targets[i];
+      set<pg_shard_t> check_targets;
+      for (set<pg_shard_t>::iterator i = backfill_targets.begin();
+          i != backfill_targets.end();
+          ++i) {
+        pg_shard_t bt = *i;
         BackfillInterval& pbi = peer_backfill_info[bt];
         if (pbi.begin == check)
-          check_targets.push_back(bt);
+          check_targets.insert(bt);
       }
       assert(!check_targets.empty());
 
       dout(20) << " BACKFILL removing " << check
               << " from peers " << check_targets << dendl;
-      for (unsigned i = 0; i < check_targets.size(); ++i) {
-        int bt = check_targets[i];
+      for (set<pg_shard_t>::iterator i = check_targets.begin();
+          i != check_targets.end();
+          ++i) {
+        pg_shard_t bt = *i;
         BackfillInterval& pbi = peer_backfill_info[bt];
         assert(pbi.begin == check);
 
@@ -9613,9 +9691,11 @@ int ReplicatedPG::recover_backfill(
     } else {
       eversion_t& obj_v = backfill_info.objects.begin()->second;
 
-      vector<int> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
-      for (unsigned i = 0; i < backfill_targets.size(); ++i) {
-       int bt = backfill_targets[i];
+      vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
+      for (set<pg_shard_t>::iterator i = backfill_targets.begin();
+          i != backfill_targets.end();
+          ++i) {
+       pg_shard_t bt = *i;
        BackfillInterval& pbi = peer_backfill_info[bt];
         // Find all check peers that have the wrong version
        if (check == backfill_info.begin && check == pbi.begin) {
@@ -9658,11 +9738,11 @@ int ReplicatedPG::recover_backfill(
                 << " with ver " << obj_v
                 << " to peers " << missing_targs << dendl;
          }
-         vector<int> all_push = need_ver_targs;
+         vector<pg_shard_t> all_push = need_ver_targs;
          all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
 
          to_push.push_back(
-           boost::tuple<hobject_t, eversion_t, ObjectContextRef, vector<int> >
+           boost::tuple<hobject_t, eversion_t, ObjectContextRef, vector<pg_shard_t> >
            (backfill_info.begin, obj_v, obc, all_push));
          // Count all simultaneous pushes of the same object as a single op
          ops++;
@@ -9682,10 +9762,12 @@ int ReplicatedPG::recover_backfill(
       last_backfill_started = backfill_info.begin;
       add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
       backfill_info.pop_front();
-      vector<int> check_targets = need_ver_targs;
+      vector<pg_shard_t> check_targets = need_ver_targs;
       check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
-      for (unsigned i = 0; i < check_targets.size(); ++i) {
-        int bt = check_targets[i];
+      for (vector<pg_shard_t>::iterator i = check_targets.begin();
+          i != check_targets.end();
+          ++i) {
+        pg_shard_t bt = *i;
         BackfillInterval& pbi = peer_backfill_info[bt];
         pbi.pop_front();
       }
@@ -9734,8 +9816,10 @@ int ReplicatedPG::recover_backfill(
         i->first < next_backfill_to_complete;
        pending_backfill_updates.erase(i++)) {
     assert(i->first > new_last_backfill);
-    for (unsigned j = 0; j < backfill_targets.size(); ++j) {
-      int bt = backfill_targets[j];
+    for (set<pg_shard_t>::iterator j = backfill_targets.begin();
+        j != backfill_targets.end();
+        ++j) {
+      pg_shard_t bt = *j;
       pg_info_t& pinfo = peer_info[bt];
       //Add stats to all peers that were missing object
       if (i->first > pinfo.last_backfill)
@@ -9768,8 +9852,10 @@ int ReplicatedPG::recover_backfill(
   // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
   // all the backfill targets.  Otherwise, we will move last_backfill up on
   // those targets need it and send OP_BACKFILL_PROGRESS to them.
-  for (unsigned i = 0; i < backfill_targets.size(); ++i) {
-    int bt = backfill_targets[i];
+  for (set<pg_shard_t>::iterator i = backfill_targets.begin();
+       i != backfill_targets.end();
+       ++i) {
+    pg_shard_t bt = *i;
     pg_info_t& pinfo = peer_info[bt];
 
     if (new_last_backfill > pinfo.last_backfill) {
@@ -9777,7 +9863,11 @@ int ReplicatedPG::recover_backfill(
       epoch_t e = get_osdmap()->get_epoch();
       MOSDPGBackfill *m = NULL;
       if (pinfo.last_backfill.is_max()) {
-        m = new MOSDPGBackfill(MOSDPGBackfill::OP_BACKFILL_FINISH, e, e, info.pgid);
+        m = new MOSDPGBackfill(
+         MOSDPGBackfill::OP_BACKFILL_FINISH,
+         e,
+         e,
+         spg_t(info.pgid.pgid, bt.shard));
         // Use default priority here, must match sub_op priority
         /* pinfo.stats might be wrong if we did log-based recovery on the
          * backfilled portion in addition to continuing backfill.
@@ -9785,13 +9875,17 @@ int ReplicatedPG::recover_backfill(
         pinfo.stats = info.stats;
         start_recovery_op(hobject_t::get_max());
       } else {
-        m = new MOSDPGBackfill(MOSDPGBackfill::OP_BACKFILL_PROGRESS, e, e, info.pgid);
+        m = new MOSDPGBackfill(
+         MOSDPGBackfill::OP_BACKFILL_PROGRESS,
+         e,
+         e,
+         spg_t(info.pgid.pgid, bt.shard));
         // Use default priority here, must match sub_op priority
       }
       m->last_backfill = pinfo.last_backfill;
       m->stats = pinfo.stats;
-      osd->send_message_osd_cluster(bt, m, get_osdmap()->get_epoch());
-      dout(10) << " peer osd." << bt
+      osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
+      dout(10) << " peer " << bt
               << " num_objects now " << pinfo.stats.stats.sum.num_objects
               << " / " << info.stats.stats.sum.num_objects << dendl;
     }
@@ -9805,7 +9899,7 @@ int ReplicatedPG::recover_backfill(
 void ReplicatedPG::prep_backfill_object_push(
   hobject_t oid, eversion_t v,
   ObjectContextRef obc,
-  vector<int> peers,
+  vector<pg_shard_t> peers,
   PGBackend::RecoveryHandle *h)
 {
   dout(10) << "push_backfill_object " << oid << " v " << v << " to peers " << peers << dendl;
@@ -9813,7 +9907,7 @@ void ReplicatedPG::prep_backfill_object_push(
 
   backfills_in_flight.insert(oid);
   for (unsigned int i = 0 ; i < peers.size(); ++i) {
-    map<int, pg_missing_t>::iterator bpm = peer_missing.find(peers[i]);
+    map<pg_shard_t, pg_missing_t>::iterator bpm = peer_missing.find(peers[i]);
     assert(bpm != peer_missing.end());
     bpm->second.add(oid, eversion_t(), eversion_t());
   }
@@ -10284,8 +10378,9 @@ void ReplicatedPG::agent_setup()
     // choose random starting position
     agent_state->position = hobject_t();
     agent_state->position.pool = info.pgid.pool();
-    agent_state->position.hash = pool.info.get_random_pg_position(info.pgid,
-                                                                 rand());
+    agent_state->position.hash = pool.info.get_random_pg_position(
+      info.pgid.pgid,
+      rand());
 
     dout(10) << __func__ << " allocated new state, position "
             << agent_state->position << dendl;
@@ -10546,7 +10641,7 @@ void ReplicatedPG::agent_stop()
 
 void ReplicatedPG::agent_choose_mode()
 {
-  uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid);
+  uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
 
   // get dirty, full ratios
   uint64_t dirty_micro = 0;
index 2a0e1080b96bb4c79a654f8a0b4408bcb29a91b6..1acbf9ccbad296bbcdba122b33e2c438604211fa 100644 (file)
@@ -225,17 +225,17 @@ public:
     ObjectStore::Transaction *t
     );
   void on_peer_recover(
-    int peer,
+    pg_shard_t peer,
     const hobject_t &oid,
     const ObjectRecoveryInfo &recovery_info,
     const object_stat_sum_t &stat
     );
   void begin_peer_recover(
-    int peer,
+    pg_shard_t peer,
     const hobject_t oid);
   void on_global_recover(
     const hobject_t &oid);
-  void failed_push(int from, const hobject_t &soid);
+  void failed_push(pg_shard_t from, const hobject_t &soid);
   void cancel_pull(const hobject_t &soid);
 
   template <typename T>
@@ -288,27 +288,27 @@ public:
     tls.push_back(t);
     osd->store->queue_transaction(osr.get(), t, 0, 0, 0, op);
   }
-  epoch_t get_epoch() {
+  epoch_t get_epoch() const {
     return get_osdmap()->get_epoch();
   }
-  const vector<int> &get_actingbackfill() {
+  const set<pg_shard_t> &get_actingbackfill_shards() const {
     return actingbackfill;
   }
   std::string gen_dbg_prefix() const { return gen_prefix(); }
   
-  const map<hobject_t, set<int> > &get_missing_loc() {
+  const map<hobject_t, set<pg_shard_t> > &get_missing_loc_shards() const {
     return missing_loc;
   }
-  const map<int, pg_missing_t> &get_peer_missing() {
+  const map<pg_shard_t, pg_missing_t> &get_shard_missing() const {
     return peer_missing;
   }
-  const map<int, pg_info_t> &get_peer_info() {
+  const map<pg_shard_t, pg_info_t> &get_shard_info() const {
     return peer_info;
   }
-  const pg_missing_t &get_local_missing() {
+  const pg_missing_t &get_local_missing() const {
     return pg_log.get_missing();
   }
-  const PGLog &get_log() {
+  const PGLog &get_log() const {
     return pg_log;
   }
   bool pgb_is_primary() const {
@@ -337,8 +337,10 @@ public:
     const eversion_t &applied_version);
 
   bool should_send_op(
-    int peer,
+    pg_shard_t peer,
     const hobject_t &hoid) {
+    if (peer == get_primary())
+      return true;
     assert(peer_info.count(peer));
     bool should_send = hoid.pool != (int64_t)info.pgid.pool() ||
       hoid <= MAX(last_backfill_started, peer_info[peer].last_backfill);
@@ -348,7 +350,7 @@ public:
   }
   
   void update_peer_last_complete_ondisk(
-    int fromosd,
+    pg_shard_t fromosd,
     eversion_t lcod) {
     peer_last_complete_ondisk[fromosd] = lcod;
   }
@@ -366,8 +368,14 @@ public:
   void schedule_work(
     GenContext<ThreadPool::TPHandle&> *c);
 
-  int whoami() const {
-    return osd->whoami;
+  pg_shard_t whoami_shard() const {
+    return pg_whoami;
+  }
+  spg_t primary_spg_t() const {
+    return spg_t(info.pgid.pgid, primary.shard);
+  }
+  pg_shard_t primary_shard() const {
+    return primary;
   }
 
   void send_message_osd_cluster(
@@ -852,14 +860,14 @@ protected:
 
   void dump_recovery_info(Formatter *f) const {
     f->open_array_section("backfill_targets");
-    for (vector<int>::const_iterator p = backfill_targets.begin();
+    for (set<pg_shard_t>::const_iterator p = backfill_targets.begin();
         p != backfill_targets.end(); ++p)
-      f->dump_int("osd", *p);
+      f->dump_stream("replica") << *p;
     f->close_section();
     f->open_array_section("waiting_on_backfill");
-    for (set<int>::const_iterator p = waiting_on_backfill.begin();
+    for (set<pg_shard_t>::const_iterator p = waiting_on_backfill.begin();
         p != waiting_on_backfill.end(); ++p)
-      f->dump_int("osd", *p);
+      f->dump_stream("osd") << *p;
     f->close_section();
     f->dump_stream("last_backfill_started") << last_backfill_started;
     {
@@ -869,9 +877,10 @@ protected:
     }
     {
       f->open_array_section("peer_backfill_info");
-      for (map<int, BackfillInterval>::const_iterator pbi = peer_backfill_info.begin();
+      for (map<pg_shard_t, BackfillInterval>::const_iterator pbi =
+            peer_backfill_info.begin();
           pbi != peer_backfill_info.end(); ++pbi) {
-        f->dump_int("osd", pbi->first);
+        f->dump_stream("osd") << pbi->first;
         f->open_object_section("BackfillInterval");
           pbi->second.dump(f);
         f->close_section();
@@ -1011,9 +1020,9 @@ protected:
 
   void prep_backfill_object_push(
     hobject_t oid, eversion_t v, ObjectContextRef obc,
-    vector<int> peer,
+    vector<pg_shard_t> peers,
     PGBackend::RecoveryHandle *h);
-  void send_remove_op(const hobject_t& oid, eversion_t v, int peer);
+  void send_remove_op(const hobject_t& oid, eversion_t v, pg_shard_t peer);
 
 
   struct C_OSD_OndiskWriteUnlock : public Context {
@@ -1140,7 +1149,7 @@ protected:
 
 public:
   ReplicatedPG(OSDService *o, OSDMapRef curmap,
-              const PGPool &_pool, pg_t p, const hobject_t& oid,
+              const PGPool &_pool, spg_t p, const hobject_t& oid,
               const hobject_t& ioid);
   ~ReplicatedPG() {}
 
@@ -1182,7 +1191,7 @@ public:
     return pgbackend->temp_colls(out);
   }
   void split_colls(
-    pg_t child,
+    spg_t child,
     int split_bits,
     int seed,
     ObjectStore::Transaction *t) {
index 404fb4925981b2cbb899225c8134e1671a6fe0e9..caae5782b646be84d299b4e2248dd8099577c6a6 100644 (file)
@@ -481,7 +481,7 @@ ostream& operator<<(ostream& out, const pg_t &pg)
 
 const coll_t coll_t::META_COLL("meta");
 
-bool coll_t::is_temp(pg_t& pgid) const
+bool coll_t::is_temp(spg_t& pgid) const
 {
   const char *cstr(str.c_str());
   if (!pgid.parse(cstr))
@@ -494,7 +494,7 @@ bool coll_t::is_temp(pg_t& pgid) const
   return false;
 }
 
-bool coll_t::is_pg(pg_t& pgid, snapid_t& snap) const
+bool coll_t::is_pg(spg_t& pgid, snapid_t& snap) const
 {
   const char *cstr(str.c_str());
 
@@ -514,7 +514,7 @@ bool coll_t::is_pg(pg_t& pgid, snapid_t& snap) const
   return true;
 }
 
-bool coll_t::is_pg_prefix(pg_t& pgid) const
+bool coll_t::is_pg_prefix(spg_t& pgid) const
 {
   const char *cstr(str.c_str());
 
@@ -526,7 +526,7 @@ bool coll_t::is_pg_prefix(pg_t& pgid) const
   return true;
 }
 
-bool coll_t::is_removal(uint64_t *seq, pg_t *pgid) const
+bool coll_t::is_removal(uint64_t *seq, spg_t *pgid) const
 {
   if (str.substr(0, 11) != string("FORREMOVAL_"))
     return false;
@@ -558,13 +558,13 @@ void coll_t::decode(bufferlist::iterator& bl)
   ::decode(struct_v, bl);
   switch (struct_v) {
   case 1: {
-    pg_t pgid;
+    spg_t pgid;
     snapid_t snap;
 
     ::decode(pgid, bl);
     ::decode(snap, bl);
     // infer the type
-    if (pgid == pg_t() && snap == 0)
+    if (pgid == spg_t() && snap == 0)
       str = "meta";
     else
       str = pg_and_snap_to_str(pgid, snap);
@@ -573,7 +573,7 @@ void coll_t::decode(bufferlist::iterator& bl)
 
   case 2: {
     __u8 type;
-    pg_t pgid;
+    spg_t pgid;
     snapid_t snap;
     
     ::decode(type, bl);
@@ -1879,8 +1879,8 @@ void pg_history_t::generate_test_instances(list<pg_history_t*>& o)
 
 void pg_info_t::encode(bufferlist &bl) const
 {
-  ENCODE_START(29, 26, bl);
-  ::encode(pgid, bl);
+  ENCODE_START(30, 26, bl);
+  ::encode(pgid.pgid, bl);
   ::encode(last_update, bl);
   ::encode(last_complete, bl);
   ::encode(log_tail, bl);
@@ -1891,6 +1891,7 @@ void pg_info_t::encode(bufferlist &bl) const
   ::encode(last_epoch_started, bl);
   ::encode(last_user_version, bl);
   ::encode(hit_set, bl);
+  ::encode(pgid.shard, bl);
   ENCODE_FINISH(bl);
 }
 
@@ -1900,9 +1901,9 @@ void pg_info_t::decode(bufferlist::iterator &bl)
   if (struct_v < 23) {
     old_pg_t opgid;
     ::decode(opgid, bl);
-    pgid = opgid;
+    pgid.pgid = opgid;
   } else {
-    ::decode(pgid, bl);
+    ::decode(pgid.pgid, bl);
   }
   ::decode(last_update, bl);
   ::decode(last_complete, bl);
@@ -1932,6 +1933,10 @@ void pg_info_t::decode(bufferlist::iterator &bl)
     last_user_version = last_update.version;
   if (struct_v >= 29)
     ::decode(hit_set, bl);
+  if (struct_v >= 30)
+    ::decode(pgid.shard, bl);
+  else
+    pgid.shard = ghobject_t::no_shard();
   DECODE_FINISH(bl);
 }
 
@@ -1970,7 +1975,7 @@ void pg_info_t::generate_test_instances(list<pg_info_t*>& o)
   list<pg_history_t*> h;
   pg_history_t::generate_test_instances(h);
   o.back()->history = *h.back();
-  o.back()->pgid = pg_t(1, 2, -1);
+  o.back()->pgid = spg_t(pg_t(1, 2, -1), ghobject_t::no_shard());
   o.back()->last_update = eversion_t(3, 4);
   o.back()->last_complete = eversion_t(5, 6);
   o.back()->last_user_version = 2;
@@ -1991,24 +1996,35 @@ void pg_info_t::generate_test_instances(list<pg_info_t*>& o)
 // -- pg_notify_t --
 void pg_notify_t::encode(bufferlist &bl) const
 {
-  ENCODE_START(1, 1, bl);
+  ENCODE_START(2, 1, bl);
   ::encode(query_epoch, bl);
   ::encode(epoch_sent, bl);
   ::encode(info, bl);
+  ::encode(to, bl);
+  ::encode(from, bl);
   ENCODE_FINISH(bl);
 }
 
 void pg_notify_t::decode(bufferlist::iterator &bl)
 {
-  DECODE_START(1, bl);
+  DECODE_START(2, bl);
   ::decode(query_epoch, bl);
   ::decode(epoch_sent, bl);
   ::decode(info, bl);
+  if (struct_v >= 2) {
+    ::decode(to, bl);
+    ::decode(from, bl);
+  } else {
+    to = ghobject_t::NO_SHARD;
+    from = ghobject_t::NO_SHARD;
+  }
   DECODE_FINISH(bl);
 }
 
 void pg_notify_t::dump(Formatter *f) const
 {
+  f->dump_int("from", from);
+  f->dump_int("to", to);
   f->dump_stream("query_epoch") << query_epoch;
   f->dump_stream("epoch_sent") << epoch_sent;
   {
@@ -2020,15 +2036,20 @@ void pg_notify_t::dump(Formatter *f) const
 
 void pg_notify_t::generate_test_instances(list<pg_notify_t*>& o)
 {
-  o.push_back(new pg_notify_t(1,1,pg_info_t()));
-  o.push_back(new pg_notify_t(3,10,pg_info_t()));
+  o.push_back(new pg_notify_t(3, ghobject_t::NO_SHARD, 1 ,1 , pg_info_t()));
+  o.push_back(new pg_notify_t(0, 0, 3, 10, pg_info_t()));
 }
 
 ostream &operator<<(ostream &lhs, const pg_notify_t &notify)
 {
-  return lhs << "(query_epoch:" << notify.query_epoch
-            << ", epoch_sent:" << notify.epoch_sent
-            << ", info:" << notify.info << ")";
+  lhs << "(query_epoch:" << notify.query_epoch
+      << ", epoch_sent:" << notify.epoch_sent
+      << ", info:" << notify.info;
+  if (notify.from != ghobject_t::NO_SHARD ||
+      notify.to != ghobject_t::NO_SHARD)
+    lhs << " " << (unsigned)notify.from
+       << "->" << (unsigned)notify.to;
+  return lhs << ")";
 }
 
 // -- pg_interval_t --
@@ -2178,11 +2199,13 @@ ostream& operator<<(ostream& out, const pg_interval_t& i)
 
 void pg_query_t::encode(bufferlist &bl, uint64_t features) const {
   if (features & CEPH_FEATURE_QUERY_T) {
-    ENCODE_START(2, 2, bl);
+    ENCODE_START(3, 2, bl);
     ::encode(type, bl);
     ::encode(since, bl);
     history.encode(bl);
     ::encode(epoch_sent, bl);
+    ::encode(to, bl);
+    ::encode(from, bl);
     ENCODE_FINISH(bl);
   } else {
     ::encode(type, bl);
@@ -2194,11 +2217,18 @@ void pg_query_t::encode(bufferlist &bl, uint64_t features) const {
 void pg_query_t::decode(bufferlist::iterator &bl) {
   bufferlist::iterator bl2 = bl;
   try {
-    DECODE_START(2, bl);
+    DECODE_START(3, bl);
     ::decode(type, bl);
     ::decode(since, bl);
     history.decode(bl);
     ::decode(epoch_sent, bl);
+    if (struct_v >= 3) {
+      ::decode(to, bl);
+      ::decode(from, bl);
+    } else {
+      to = ghobject_t::NO_SHARD;
+      from = ghobject_t::NO_SHARD;
+    }
     DECODE_FINISH(bl);
   } catch (...) {
     bl = bl2;
@@ -2210,6 +2240,8 @@ void pg_query_t::decode(bufferlist::iterator &bl) {
 
 void pg_query_t::dump(Formatter *f) const
 {
+  f->dump_int("from", from);
+  f->dump_int("to", to);
   f->dump_string("type", get_type_name());
   f->dump_stream("since") << since;
   f->dump_stream("epoch_sent") << epoch_sent;
@@ -2222,10 +2254,13 @@ void pg_query_t::generate_test_instances(list<pg_query_t*>& o)
   o.push_back(new pg_query_t());
   list<pg_history_t*> h;
   pg_history_t::generate_test_instances(h);
-  o.push_back(new pg_query_t(pg_query_t::INFO, *h.back(), 4));
-  o.push_back(new pg_query_t(pg_query_t::MISSING, *h.back(), 4));
-  o.push_back(new pg_query_t(pg_query_t::LOG, eversion_t(4, 5), *h.back(), 4));
-  o.push_back(new pg_query_t(pg_query_t::FULLLOG, *h.back(), 5));
+  o.push_back(new pg_query_t(pg_query_t::INFO, 1, 2, *h.back(), 4));
+  o.push_back(new pg_query_t(pg_query_t::MISSING, 2, 3, *h.back(), 4));
+  o.push_back(new pg_query_t(pg_query_t::LOG, 0, 0,
+                            eversion_t(4, 5), *h.back(), 4));
+  o.push_back(new pg_query_t(pg_query_t::FULLLOG,
+                            ghobject_t::NO_SHARD, ghobject_t::NO_SHARD,
+                            *h.back(), 5));
 }
 
 // -- ObjectModDesc --
index ed3f1870b40d392cd709da4447695c546d6dd4f4..026d7c51c0c128b272e1352d7f4400ea6de30a31 100644 (file)
@@ -470,15 +470,15 @@ public:
     : str(str_)
   { }
 
-  explicit coll_t(pg_t pgid, snapid_t snap = CEPH_NOSNAP)
+  explicit coll_t(spg_t pgid, snapid_t snap = CEPH_NOSNAP)
     : str(pg_and_snap_to_str(pgid, snap))
   { }
 
-  static coll_t make_temp_coll(pg_t pgid) {
+  static coll_t make_temp_coll(spg_t pgid) {
     return coll_t(pg_to_tmp_str(pgid));
   }
 
-  static coll_t make_removal_coll(uint64_t seq, pg_t pgid) {
+  static coll_t make_removal_coll(uint64_t seq, spg_t pgid) {
     return coll_t(seq_to_removal_str(seq, pgid));
   }
 
@@ -494,10 +494,10 @@ public:
     return str < rhs.str;
   }
 
-  bool is_pg_prefix(pg_t& pgid) const;
-  bool is_pg(pg_t& pgid, snapid_t& snap) const;
-  bool is_temp(pg_t& pgid) const;
-  bool is_removal(uint64_t *seq, pg_t *pgid) const;
+  bool is_pg_prefix(spg_t& pgid) const;
+  bool is_pg(spg_t& pgid, snapid_t& snap) const;
+  bool is_temp(spg_t& pgid) const;
+  bool is_removal(uint64_t *seq, spg_t *pgid) const;
   void encode(bufferlist& bl) const;
   void decode(bufferlist::iterator& bl);
   inline bool operator==(const coll_t& rhs) const {
@@ -511,17 +511,17 @@ public:
   static void generate_test_instances(list<coll_t*>& o);
 
 private:
-  static std::string pg_and_snap_to_str(pg_t p, snapid_t s) {
+  static std::string pg_and_snap_to_str(spg_t p, snapid_t s) {
     std::ostringstream oss;
     oss << p << "_" << s;
     return oss.str();
   }
-  static std::string pg_to_tmp_str(pg_t p) {
+  static std::string pg_to_tmp_str(spg_t p) {
     std::ostringstream oss;
     oss << p << "_TEMP";
     return oss.str();
   }
-  static std::string seq_to_removal_str(uint64_t seq, pg_t pgid) {
+  static std::string seq_to_removal_str(uint64_t seq, spg_t pgid) {
     std::ostringstream oss;
     oss << "FORREMOVAL_" << seq << "_" << pgid;
     return oss.str();
@@ -1528,7 +1528,7 @@ inline ostream& operator<<(ostream& out, const pg_history_t& h) {
  *    otherwise, we have no idea what the pg is supposed to contain.
  */
 struct pg_info_t {
-  pg_t pgid;
+  spg_t pgid;
   eversion_t last_update;    // last object version applied to store.
   eversion_t last_complete;  // last version pg was complete through.
   epoch_t last_epoch_started;// last epoch at which this pg started on this osd
@@ -1550,7 +1550,7 @@ struct pg_info_t {
     : last_epoch_started(0), last_user_version(0),
       last_backfill(hobject_t::get_max())
   { }
-  pg_info_t(pg_t p)
+  pg_info_t(spg_t p)
     : pgid(p),
       last_epoch_started(0), last_user_version(0),
       last_backfill(hobject_t::get_max())
@@ -1564,6 +1564,11 @@ struct pg_info_t {
   void encode(bufferlist& bl) const;
   void decode(bufferlist::iterator& p);
   void dump(Formatter *f) const;
+  bool overlaps_with(const pg_info_t &oinfo) const {
+    return last_update > oinfo.log_tail ?
+      oinfo.last_update >= log_tail :
+      last_update >= oinfo.log_tail;
+  }
   static void generate_test_instances(list<pg_info_t*>& o);
 };
 WRITE_CLASS_ENCODER(pg_info_t)
@@ -1595,13 +1600,22 @@ struct pg_notify_t {
   epoch_t query_epoch;
   epoch_t epoch_sent;
   pg_info_t info;
-  pg_notify_t() : query_epoch(0), epoch_sent(0) {}
-  pg_notify_t(epoch_t query_epoch,
-             epoch_t epoch_sent,
-             const pg_info_t &info)
+  shard_id_t to;
+  shard_id_t from;
+  pg_notify_t() :
+    query_epoch(0), epoch_sent(0), to(ghobject_t::no_shard()),
+    from(ghobject_t::no_shard()) {}
+  pg_notify_t(
+    shard_id_t to,
+    shard_id_t from,
+    epoch_t query_epoch,
+    epoch_t epoch_sent,
+    const pg_info_t &info)
     : query_epoch(query_epoch),
       epoch_sent(epoch_sent),
-      info(info) {}
+      info(info), to(to), from(from) {
+    assert(from == info.pgid.shard);
+  }
   void encode(bufferlist &bl) const;
   void decode(bufferlist::iterator &p);
   void dump(Formatter *f) const;
@@ -1679,18 +1693,32 @@ struct pg_query_t {
   eversion_t since;
   pg_history_t history;
   epoch_t epoch_sent;
-
-  pg_query_t() : type(-1), epoch_sent(0) {}
-  pg_query_t(int t, const pg_history_t& h,
-            epoch_t epoch_sent)
-    : type(t), history(h),
-      epoch_sent(epoch_sent) {
+  shard_id_t to;
+  shard_id_t from;
+
+  pg_query_t() : type(-1), epoch_sent(0), to(ghobject_t::NO_SHARD),
+                from(ghobject_t::NO_SHARD) {}
+  pg_query_t(
+    int t,
+    shard_id_t to,
+    shard_id_t from,
+    const pg_history_t& h,
+    epoch_t epoch_sent)
+    : type(t),
+      history(h),
+      epoch_sent(epoch_sent),
+      to(to), from(from) {
     assert(t != LOG);
   }
-  pg_query_t(int t, eversion_t s, const pg_history_t& h,
-            epoch_t epoch_sent)
+  pg_query_t(
+    int t,
+    shard_id_t to,
+    shard_id_t from,
+    eversion_t s,
+    const pg_history_t& h,
+    epoch_t epoch_sent)
     : type(t), since(s), history(h),
-      epoch_sent(epoch_sent) {
+      epoch_sent(epoch_sent), to(to), from(from) {
     assert(t == LOG);
   }
   
index c62954b86a45e79e053e613a49018aa00123bf68..bc5b4e40fd18c05e7f530078ddaa9beea9d4932c 100644 (file)
@@ -670,7 +670,7 @@ TEST_F(PGLogTest, merge_log) {
     ObjectStore::Transaction t;
     pg_log_t olog;
     pg_info_t oinfo;
-    int fromosd = -1;
+    pg_shard_t fromosd;
     pg_info_t info;
     list<hobject_t> remove_snap;
     bool dirty_info = false;
@@ -718,7 +718,7 @@ TEST_F(PGLogTest, merge_log) {
     ObjectStore::Transaction t;
     pg_log_t olog;
     pg_info_t oinfo;
-    int fromosd = -1;
+    pg_shard_t fromosd;
     pg_info_t info;
     list<hobject_t> remove_snap;
     bool dirty_info = false;
@@ -805,7 +805,7 @@ TEST_F(PGLogTest, merge_log) {
     ObjectStore::Transaction t;
     pg_log_t olog;
     pg_info_t oinfo;
-    int fromosd = -1;
+    pg_shard_t fromosd;
     pg_info_t info;
     list<hobject_t> remove_snap;
     bool dirty_info = false;
@@ -900,7 +900,7 @@ TEST_F(PGLogTest, merge_log) {
     ObjectStore::Transaction t;
     pg_log_t olog;
     pg_info_t oinfo;
-    int fromosd = -1;
+    pg_shard_t fromosd;
     pg_info_t info;
     list<hobject_t> remove_snap;
     bool dirty_info = false;
@@ -1015,7 +1015,7 @@ TEST_F(PGLogTest, merge_log) {
     ObjectStore::Transaction t;
     pg_log_t olog;
     pg_info_t oinfo;
-    int fromosd = -1;
+    pg_shard_t fromosd;
     pg_info_t info;
     list<hobject_t> remove_snap;
     bool dirty_info = false;
@@ -1088,7 +1088,7 @@ TEST_F(PGLogTest, merge_log) {
     ObjectStore::Transaction t;
     pg_log_t olog;
     pg_info_t oinfo;
-    int fromosd = -1;
+    pg_shard_t fromosd;
     pg_info_t info;
     list<hobject_t> remove_snap;
     bool dirty_info = false;
@@ -1129,7 +1129,7 @@ TEST_F(PGLogTest, merge_log) {
     ObjectStore::Transaction t;
     pg_log_t olog;
     pg_info_t oinfo;
-    int fromosd = -1;
+    pg_shard_t fromosd;
     pg_info_t info;
     list<hobject_t> remove_snap;
     bool dirty_info = false;
@@ -1176,7 +1176,7 @@ TEST_F(PGLogTest, proc_replica_log) {
     pg_log_t olog;
     pg_info_t oinfo;
     pg_missing_t omissing;
-    int from = -1;
+    pg_shard_t from;
 
     eversion_t last_update(1, 1);
     oinfo.last_update = last_update;
@@ -1228,7 +1228,7 @@ TEST_F(PGLogTest, proc_replica_log) {
     pg_log_t olog;
     pg_info_t oinfo;
     pg_missing_t omissing;
-    int from = -1;
+    pg_shard_t from;
 
     {
       pg_log_entry_t e;
@@ -1279,7 +1279,7 @@ TEST_F(PGLogTest, proc_replica_log) {
     pg_log_t olog;
     pg_info_t oinfo;
     pg_missing_t omissing;
-    int from = -1;
+    pg_shard_t from;
 
     hobject_t divergent_object;
 
@@ -1408,7 +1408,7 @@ TEST_F(PGLogTest, proc_replica_log) {
     pg_log_t olog;
     pg_info_t oinfo;
     pg_missing_t omissing;
-    int from = -1;
+    pg_shard_t from;
 
     eversion_t last_update(1, 2);
 
@@ -1491,7 +1491,7 @@ TEST_F(PGLogTest, proc_replica_log) {
     pg_log_t olog;
     pg_info_t oinfo;
     pg_missing_t omissing;
-    int from = -1;
+    pg_shard_t from;
 
     eversion_t last_update(1, 2);
     hobject_t divergent_object;
@@ -1579,7 +1579,7 @@ TEST_F(PGLogTest, proc_replica_log) {
     pg_log_t olog;
     pg_info_t oinfo;
     pg_missing_t omissing;
-    int from = -1;
+    pg_shard_t from;
 
     eversion_t last_update(1, 2);
     hobject_t divergent_object;
index 60f4809500869bc4969f65149fe76c972af2282c..6c8b309a0123afd7cc96bca9b0a41ad5a8e0b9b7 100644 (file)
@@ -434,7 +434,7 @@ int finish_remove_pgs(ObjectStore *store, uint64_t *next_removal_seq)
   for (vector<coll_t>::iterator it = ls.begin();
        it != ls.end();
        ++it) {
-    pg_t pgid;
+    spg_t pgid;
     snapid_t snap;
 
     if (it->is_temp(pgid)) {
@@ -467,12 +467,13 @@ int initiate_new_remove_pg(ObjectStore *store, pg_t r_pgid,
 {
   ObjectStore::Transaction *rmt = new ObjectStore::Transaction;
 
-  if (store->collection_exists(coll_t(r_pgid))) {
+  if (store->collection_exists(coll_t(spg_t(r_pgid, ghobject_t::no_shard())))) {
       coll_t to_remove = coll_t::make_removal_coll((*next_removal_seq)++,
-        r_pgid);
-      cout << "collection rename " << coll_t(r_pgid) << " to " << to_remove
+        spg_t(r_pgid, ghobject_t::no_shard()));
+      cout << "collection rename " << coll_t(spg_t(r_pgid, ghobject_t::no_shard()))
+          << " to " << to_remove
         << std::endl;
-      rmt->collection_rename(coll_t(r_pgid), to_remove);
+      rmt->collection_rename(coll_t(spg_t(r_pgid, ghobject_t::no_shard())), to_remove);
   } else {
     delete rmt;
     return ENOENT;
@@ -992,11 +993,11 @@ int do_import(ObjectStore *store, OSDSuperblock sb)
     return 1;
   }
 
-  log_oid = OSD::make_pg_log_oid(pgid);
-  biginfo_oid = OSD::make_pg_biginfo_oid(pgid);
+  log_oid = OSD::make_pg_log_oid(spg_t(pgid, ghobject_t::no_shard()));
+  biginfo_oid = OSD::make_pg_biginfo_oid(spg_t(pgid, ghobject_t::no_shard()));
 
   //Check for PG already present.
-  coll_t coll(pgid);
+  coll_t coll(spg_t(pgid, ghobject_t::no_shard()));
   if (store->collection_exists(coll)) {
     cout << "pgid " << pgid << " already exists" << std::endl;
     return 1;
@@ -1004,7 +1005,8 @@ int do_import(ObjectStore *store, OSDSuperblock sb)
 
   //Switch to collection which will be removed automatically if
   //this program is interupted.
-  coll_t rmcoll = coll_t::make_removal_coll(next_removal_seq, pgid);
+  coll_t rmcoll = coll_t::make_removal_coll(
+    next_removal_seq, spg_t(pgid, ghobject_t::no_shard()));
   ObjectStore::Transaction *t = new ObjectStore::Transaction;
   t->create_collection(rmcoll);
   store->apply_transaction(*t);
@@ -1290,8 +1292,8 @@ int main(int argc, char **argv)
     goto out;
   }
 
-  log_oid = OSD::make_pg_log_oid(pgid);
-  biginfo_oid = OSD::make_pg_biginfo_oid(pgid);
+  log_oid = OSD::make_pg_log_oid(spg_t(pgid, ghobject_t::no_shard()));
+  biginfo_oid = OSD::make_pg_biginfo_oid(spg_t(pgid, ghobject_t::no_shard()));
 
   if (type == "remove") {
     uint64_t next_removal_seq = 0;     //My local seq
@@ -1315,13 +1317,13 @@ int main(int argc, char **argv)
 
   for (it = ls.begin(); it != ls.end(); ++it) {
     snapid_t snap;
-    pg_t tmppgid;
+    spg_t tmppgid;
 
     if (!it->is_pg(tmppgid, snap)) {
       continue;
     }
 
-    if (tmppgid != pgid) {
+    if (tmppgid.pgid != pgid) {
       continue;
     }
     if (snap != CEPH_NOSNAP && debug) {
@@ -1344,9 +1346,10 @@ int main(int argc, char **argv)
     if (debug)
       cerr << "map_epoch " << map_epoch << std::endl;
 
-    pg_info_t info(pgid);
+    pg_info_t info(spg_t(pgid, ghobject_t::no_shard()));
     map<epoch_t,pg_interval_t> past_intervals;
-    hobject_t biginfo_oid = OSD::make_pg_biginfo_oid(pgid);
+    hobject_t biginfo_oid = OSD::make_pg_biginfo_oid(
+      spg_t(pgid, ghobject_t::no_shard()));
     interval_set<snapid_t> snap_collections;
   
     __u8 struct_ver;
index e9b1351d87cf6438ff07d04e4bd1a6f7c06df2c7..eb9f8dac36d05c6586ef0230049ee9ec69953bf6 100644 (file)
@@ -162,7 +162,7 @@ int main(int argc, char **argv)
 
   vector<coll_t> colls_to_check;
   if (pgidstr.length()) {
-    pg_t pgid;
+    spg_t pgid;
     if (!pgid.parse(pgidstr.c_str())) {
       cout << "Invalid pgid '" << pgidstr << "' specified" << std::endl;
       exit(1);
@@ -178,7 +178,7 @@ int main(int argc, char **argv)
     for (vector<coll_t>::iterator i = candidates.begin();
         i != candidates.end();
         ++i) {
-      pg_t pgid;
+      spg_t pgid;
       snapid_t snap;
       if (i->is_pg(pgid, snap)) {
        colls_to_check.push_back(*i);