OSD: resurrect a parent if it splits into the pg we want to create
author    Samuel Just <sam.just@inktank.com>
          Thu, 30 May 2013 22:11:58 +0000 (15:11 -0700)
committer Samuel Just <sam.just@inktank.com>
          Wed, 5 Jun 2013 20:07:42 +0000 (13:07 -0700)
When attempting to create a new pg object in response to a
peering message, there are 3 cases:
1) That pg is currently being deleted.  In this case, we
cancel the deletion and resurrect the pg at the epoch at
which it had been deleted.
2) A pg is being deleted which would have split into the
pg we want to create had it not been deleted.  In that case,
we resurrect that pg at the epoch at which it had been deleted
and let the request wait on the impending split.
3) Neither that pg nor a parent can be resurrected.  In this
case, we create a new pg at the map epoch of the peering
request.

Fixes: #5154
Signed-off-by: Samuel Just <sam.just@inktank.com>
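
The interesting new machinery is case 2 above: before falling back to creating
a brand-new pg, the OSD walks up the target pg's ancestor chain looking for a
pg that is still being deleted and that would split into the pg being created.
The standalone sketch below (not Ceph code) illustrates the shape of that walk:
a pg is reduced to its placement seed, pg_t::get_parent() is approximated by
clearing the seed's top bit, the deleting_pgs registry becomes a plain std::set,
and the is_split()/children.count() test is approximated by a simple seed-range
check.

// Standalone sketch, not Ceph code: the types and the split model here are
// simplified stand-ins for pg_t / DeletingState / OSDService::deleting_pgs.
#include <cassert>
#include <iostream>
#include <set>

enum res_result { RES_PARENT, RES_SELF, RES_NONE };

// Toy parent relation: clear the seed's most significant set bit
// (roughly what pg_t::get_parent() does to the placement seed).
static unsigned parent_seed(unsigned s) {
  assert(s != 0);
  unsigned msb = 1;
  while ((msb << 1) <= s)
    msb <<= 1;
  return s & ~msb;
}

// Walk from the pg we want to create up through its ancestors, looking for
// one that is currently being deleted.  The is_split()/children check from
// the patch is approximated by requiring the wanted seed to exist under the
// new pg_num.
static res_result try_resurrect(unsigned want, unsigned new_pg_num,
                                const std::set<unsigned>& deleting,
                                unsigned* resurrected) {
  unsigned cur = want;
  while (true) {
    if (deleting.count(cur)) {
      *resurrected = cur;
      if (cur == want)
        return RES_SELF;      // case 1: the pg itself is mid-deletion
      if (want < new_pg_num)
        return RES_PARENT;    // case 2: a deleting ancestor splits into it
      return RES_NONE;
    }
    if (cur == 0)
      return RES_NONE;        // case 3: nothing relevant is being deleted
    cur = parent_seed(cur);
  }
}

int main() {
  std::set<unsigned> deleting = {1};  // the pg with seed 1 is being deleted
  unsigned resurrected = 0;
  // pg_num grew to 8, so seed 5 (binary 101) now exists; its chain is 5 -> 1 -> 0
  res_result r = try_resurrect(5, 8, deleting, &resurrected);
  std::cout << (r == RES_PARENT) << " ancestor=" << resurrected << std::endl;
}

In the patch itself the equivalent test is cur.is_split(create_map->get_pg_num(...),
curmap->get_pg_num(...), &children) && children.count(pgid), evaluated against the
map at which the ancestor had been deleted versus the OSD's current map.
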
src/osd/OSD.cc
src/osd/OSD.h

index 8a4da41da8a3aa87a6680408c96afa9e9c5ba68f..cdfa4d417bacb7a6bedbfbd4b6fa7b024acef772 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1639,9 +1639,68 @@ void OSD::add_newly_split_pg(PG *pg, PG::RecoveryCtx *rctx)
     _remove_pg(pg);
 }
 
+OSD::res_result OSD::_try_resurrect_pg(
+  OSDMapRef curmap, pg_t pgid, pg_t *resurrected, PGRef *old_pg_state)
+{
+  assert(resurrected);
+  assert(old_pg_state);
+  // find nearest ancestor
+  DeletingStateRef df;
+  pg_t cur(pgid);
+  while (cur.ps()) {
+    df = service.deleting_pgs.lookup(cur);
+    if (df)
+      break;
+    cur = cur.get_parent();
+  }
+  if (!df)
+    return RES_NONE; // good to go
+
+  df->old_pg_state->lock();
+  OSDMapRef create_map = df->old_pg_state->get_osdmap();
+  df->old_pg_state->unlock();
+
+  set<pg_t> children;
+  if (cur == pgid) {
+    if (df->try_stop_deletion()) {
+      dout(10) << __func__ << ": halted deletion on pg " << pgid << dendl;
+      *resurrected = cur;
+      *old_pg_state = df->old_pg_state;
+      service.deleting_pgs.remove(pgid); // PG is no longer being removed!
+      return RES_SELF;
+    } else {
+      // raced, ensure we don't see DeletingStateRef when we try to
+      // delete this pg
+      service.deleting_pgs.remove(pgid);
+      return RES_NONE;
+    }
+  } else if (cur.is_split(create_map->get_pg_num(cur.pool()),
+                         curmap->get_pg_num(cur.pool()),
+                         &children) &&
+            children.count(pgid)) {
+    if (df->try_stop_deletion()) {
+      dout(10) << __func__ << ": halted deletion on ancestor pg " << pgid
+              << dendl;
+      *resurrected = cur;
+      *old_pg_state = df->old_pg_state;
+      service.deleting_pgs.remove(cur); // PG is no longer being removed!
+      return RES_PARENT;
+    } else {
+      /* this is not a problem, failing to cancel proves that all objects
+       * have been removed, so no hobject_t overlap is possible
+       */
+      return RES_NONE;
+    }
+  }
+  return RES_NONE;
+}
+
 PG *OSD::_create_lock_pg(
   OSDMapRef createmap,
-  pg_t pgid, bool newly_created, bool hold_map_lock,
+  pg_t pgid,
+  bool newly_created,
+  bool hold_map_lock,
+  bool backfill,
   int role, vector<int>& up, vector<int>& acting, pg_history_t history,
   pg_interval_map_t& pi,
   ObjectStore::Transaction& t)
@@ -1651,22 +1710,7 @@ PG *OSD::_create_lock_pg(
 
   PG *pg = _open_lock_pg(createmap, pgid, true, hold_map_lock);
 
-  DeletingStateRef df = service.deleting_pgs.lookup(pgid);
-  bool backfill = false;
-
-  if (df && df->try_stop_deletion()) {
-    dout(10) << __func__ << ": halted deletion on pg " << pgid << dendl;
-    backfill = true;
-    service.deleting_pgs.remove(pgid); // PG is no longer being removed!
-  } else {
-    if (df) {
-      // raced, ensure we don't see DeletingStateRef when we try to
-      // delete this pg
-      service.deleting_pgs.remove(pgid);
-    }
-    // either it's not deleting, or we failed to get to it in time
-    t.create_collection(coll_t(pgid));
-  }
+  service.init_splits_between(pgid, pg->get_osdmap(), service.get_osdmap());
 
   pg->init(role, up, acting, history, pi, backfill, &t);
 
@@ -1980,8 +2024,6 @@ void OSD::handle_pg_peering_evt(
     return;
   }
 
-  PG *pg;
-
   if (!_have_pg(info.pgid)) {
     // same primary?
     if (!osdmap->have_pg_pool(info.pgid.pool()))
@@ -2028,24 +2070,104 @@ void OSD::handle_pg_peering_evt(
       assert(!info.dne());  // and pg exists if we are hearing about it
     }
 
-    // ok, create PG locally using provided Info and History
+    // do we need to resurrect a deleting pg?
+    pg_t resurrected;
+    PGRef old_pg_state;
+    res_result result = _try_resurrect_pg(
+      service.get_osdmap(),
+      info.pgid,
+      &resurrected,
+      &old_pg_state);
+
     PG::RecoveryCtx rctx = create_context();
-    pg = _create_lock_pg(
-      get_map(epoch),
-      info.pgid, create, false, role, up, acting, history, pi,
-      *rctx.transaction);
-    pg->handle_create(&rctx);
-    pg->write_if_dirty(*rctx.transaction);
-    dispatch_context(rctx, pg, osdmap);
+    switch (result) {
+    case RES_NONE: {
+      // ok, create the pg locally using provided Info and History
+      rctx.transaction->create_collection(coll_t(info.pgid));
+      PG *pg = _create_lock_pg(
+       get_map(epoch),
+       info.pgid, create, false, result == RES_SELF,
+       role, up, acting, history, pi,
+       *rctx.transaction);
+      pg->handle_create(&rctx);
+      pg->write_if_dirty(*rctx.transaction);
+      dispatch_context(rctx, pg, osdmap);
+
+      dout(10) << *pg << " is new" << dendl;
+
+      // kick any waiters
+      wake_pg_waiters(pg->info.pgid);
       
-    dout(10) << *pg << " is new" << dendl;
+      pg->queue_peering_event(evt);
+      pg->unlock();
+      return;
+    }
+    case RES_SELF: {
+      old_pg_state->lock();
+      PG *pg = _create_lock_pg(
+       old_pg_state->get_osdmap(),
+       resurrected,
+       false,
+       false,
+       true,
+       old_pg_state->role,
+       old_pg_state->up,
+       old_pg_state->acting,
+       old_pg_state->info.history,
+       old_pg_state->past_intervals,
+       *rctx.transaction);
+      old_pg_state->unlock();
+      pg->handle_create(&rctx);
+      pg->write_if_dirty(*rctx.transaction);
+      dispatch_context(rctx, pg, osdmap);
 
-    // kick any waiters
-    wake_pg_waiters(pg->info.pgid);
+      dout(10) << *pg << " is new (resurrected)" << dendl;
 
+      // kick any waiters
+      wake_pg_waiters(pg->info.pgid);
+
+      pg->queue_peering_event(evt);
+      pg->unlock();
+      return;
+    }
+    case RES_PARENT: {
+      assert(old_pg_state);
+      old_pg_state->lock();
+      PG *parent = _create_lock_pg(
+       old_pg_state->get_osdmap(),
+       resurrected,
+       false,
+       false,
+       true,
+       old_pg_state->role,
+       old_pg_state->up,
+       old_pg_state->acting,
+       old_pg_state->info.history,
+       old_pg_state->past_intervals,
+       *rctx.transaction
+       );
+      old_pg_state->unlock();
+      parent->handle_create(&rctx);
+      parent->write_if_dirty(*rctx.transaction);
+      dispatch_context(rctx, parent, osdmap);
+
+      dout(10) << *parent << " is new" << dendl;
+
+      // kick any waiters
+      wake_pg_waiters(parent->info.pgid);
+
+      assert(service.splitting(info.pgid));
+      peering_wait_for_split[info.pgid].push_back(evt);
+
+      //parent->queue_peering_event(evt);
+      parent->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
+      parent->unlock();
+      return;
+    }
+    }
   } else {
     // already had it.  did the mapping change?
-    pg = _lookup_lock_pg(info.pgid);
+    PG *pg = _lookup_lock_pg(info.pgid);
     if (epoch < pg->info.history.same_interval_since) {
       dout(10) << *pg << " get_or_create_pg acting changed in "
               << pg->info.history.same_interval_since
@@ -2053,10 +2175,10 @@ void OSD::handle_pg_peering_evt(
       pg->unlock();
       return;
     }
+    pg->queue_peering_event(evt);
+    pg->unlock();
+    return;
   }
-
-  pg->queue_peering_event(evt);
-  pg->unlock();
 }
 
 
@@ -5391,10 +5513,11 @@ void OSD::handle_pg_create(OpRequestRef op)
     if (can_create_pg(pgid)) {
      pg_interval_map_t pi;
+      rctx.transaction->create_collection(coll_t(pgid));
      pg = _create_lock_pg(
-       osdmap, pgid, true, false,
+       osdmap, pgid, true, false, false,
        0, creating_pgs[pgid].acting, creating_pgs[pgid].acting,
        history, pi,
        *rctx.transaction);
       pg->info.last_epoch_started = pg->info.history.last_epoch_started;
       creating_pgs.erase(pgid);
       wake_pg_waiters(pg->info.pgid);
index a91b657934b770e0b7ad633a5a231d3a62748ef2..0e35250b79aa55eff819ee908711080c2cd16d9a 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -1049,10 +1049,21 @@ protected:
   PG   *_open_lock_pg(OSDMapRef createmap,
                      pg_t pg, bool no_lockdep_check=false,
                      bool hold_map_lock=false);
+  enum res_result {
+    RES_PARENT,    // resurrected a parent
+    RES_SELF,      // resurrected self
+    RES_NONE       // nothing relevant deleting
+  };
+  res_result _try_resurrect_pg(
+    OSDMapRef curmap, pg_t pgid, pg_t *resurrected, PGRef *old_pg_state);
   PG   *_create_lock_pg(OSDMapRef createmap,
-                       pg_t pgid, bool newly_created,
-                       bool hold_map_lock, int role,
-                       vector<int>& up, vector<int>& acting,
+                       pg_t pgid,
+                       bool newly_created,
+                       bool hold_map_lock,
+                       bool backfill,
+                       int role,
+                       vector<int>& up,
+                       vector<int>& acting,
                        pg_history_t history,
                        pg_interval_map_t& pi,
                        ObjectStore::Transaction& t);
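
A note on the cancellation race that _try_resurrect_pg tolerates: the
"this is not a problem" comment in the RES_PARENT branch relies on the fact
that when try_stop_deletion() fails, the deletion worker has already removed
the pg's objects, so creating the pg from scratch cannot collide with leftover
hobjects.  The toy model below sketches that handshake; ToyDeletingState and
DelStatus are invented for illustration and are much simpler than Ceph's
DeletingState, which has more states and can also be stopped mid-clearing.

// Toy model, not Ceph's DeletingState: a minimal status machine showing why
// a failed cancellation is harmless to the creator.
#include <atomic>
#include <iostream>

enum class DelStatus { QUEUED, DELETING, DONE, CANCELED };

struct ToyDeletingState {
  std::atomic<DelStatus> status{DelStatus::QUEUED};

  // deletion worker: may only start if nobody has canceled yet
  bool start_deleting() {
    DelStatus expected = DelStatus::QUEUED;
    return status.compare_exchange_strong(expected, DelStatus::DELETING);
  }
  void finish_deleting() { status = DelStatus::DONE; }

  // resurrect path: succeeds only while the work is still queued.  In this
  // toy a failure just means the worker already claimed the pg; the real
  // DeletingState guarantees that a failure implies the objects are gone.
  bool try_stop_deletion() {
    DelStatus expected = DelStatus::QUEUED;
    return status.compare_exchange_strong(expected, DelStatus::CANCELED);
  }
};

int main() {
  ToyDeletingState a;
  std::cout << a.try_stop_deletion() << std::endl;  // 1: canceled in time

  ToyDeletingState b;
  b.start_deleting();
  b.finish_deleting();
  std::cout << b.try_stop_deletion() << std::endl;  // 0: too late, pg data already removed
}

That same reasoning is what lets the RES_NONE fallback simply queue a
create_collection and build a fresh pg at the peering request's epoch.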