]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
osd: handle pg delete vs merge race
authorSage Weil <sage@redhat.com>
Tue, 14 Aug 2018 17:15:52 +0000 (12:15 -0500)
committerSage Weil <sage@redhat.com>
Fri, 7 Sep 2018 17:09:42 +0000 (12:09 -0500)
Deletion involves an awkward dance between the pg lock and shard locks,
while the merge prep and tracking is "shard down".  If the delete has
finished its work we may find that a merge has since been prepped.

Unwinding the merge tracking is nontrivial, especially because it might
involved a second PG, possibly even a fabricated placeholder one. Instead,
if we delete and find that a merge is coming, undo our deletion and let
things play out in the future map epoch.

Signed-off-by: Sage Weil <sage@redhat.com>
src/osd/OSD.cc
src/osd/OSD.h
src/osd/PG.cc

index 180b84b14213b77c47d7cb1e04c51ae1eba6ee22..416714b7ddeb52bb15d6b925edeb074a6072f8e8 100644 (file)
@@ -1689,20 +1689,9 @@ void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
       e));
 }
 
-void OSDService::finish_pg_delete(PG *pg, unsigned old_pg_num)
+bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
 {
-  // update pg count now since we might not get an osdmap any time soon.
-  if (pg->is_primary())
-    logger->dec(l_osd_pg_primary);
-  else if (pg->is_replica())
-    logger->dec(l_osd_pg_replica);
-  else
-    logger->dec(l_osd_pg_stray);
-
-  osd->unregister_pg(pg);
-  for (auto shard : osd->shards) {
-    shard->unprime_split_children(pg->pg_id, old_pg_num);
-  }
+  return osd->try_finish_pg_delete(pg, old_pg_num);
 }
 
 // ---
@@ -3934,19 +3923,39 @@ void OSD::register_pg(PGRef pg)
   sdata->_attach_pg(slot, pg.get());
 }
 
-void OSD::unregister_pg(PG *pg)
+bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
 {
   auto sdata = pg->osd_shard;
   ceph_assert(sdata);
-  Mutex::Locker l(sdata->shard_lock);
-  auto p = sdata->pg_slots.find(pg->pg_id);
-  if (p != sdata->pg_slots.end() &&
-      p->second->pg) {
+  {
+    Mutex::Locker l(sdata->shard_lock);
+    auto p = sdata->pg_slots.find(pg->pg_id);
+    if (p == sdata->pg_slots.end() ||
+       !p->second->pg) {
+      dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
+      return false;
+    }
+    if (p->second->waiting_for_merge_epoch) {
+      dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
+      return false;
+    }
     dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
     sdata->_detach_pg(p->second.get());
-  } else {
-    dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
   }
+
+  for (auto shard : shards) {
+    shard->unprime_split_children(pg->pg_id, old_pg_num);
+  }
+
+  // update pg count now since we might not get an osdmap any time soon.
+  if (pg->is_primary())
+    service.logger->dec(l_osd_pg_primary);
+  else if (pg->is_replica())
+    service.logger->dec(l_osd_pg_replica);
+  else
+    service.logger->dec(l_osd_pg_stray);
+
+  return true;
 }
 
 PGRef OSD::_lookup_pg(spg_t pgid)
@@ -8198,6 +8207,7 @@ bool OSD::advance_pg(
            unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
            dout(1) << __func__ << " merging " << pg->pg_id << dendl;
            pg->merge_from(sources, rctx, split_bits);
+           pg->pg_slot->waiting_for_merge_epoch = 0;
          } else {
            dout(20) << __func__ << " not ready to merge yet" << dendl;
            pg->write_if_dirty(rctx);
index a0e9b373b6e26ad2ade5dda558f154802a223746..9d9ac42c333deb487406bed3ab6dfbdaeb868662 100644 (file)
@@ -753,7 +753,7 @@ public:
   void queue_for_snap_trim(PG *pg);
   void queue_for_scrub(PG *pg, bool with_high_priority);
   void queue_for_pg_delete(spg_t pgid, epoch_t e);
-  void finish_pg_delete(PG *pg, unsigned old_pg_num);
+  bool try_finish_pg_delete(PG *pg, unsigned old_pg_num);
 
 private:
   // -- pg recovery and associated throttling --
@@ -1896,7 +1896,7 @@ protected:
   PGRef _lookup_pg(spg_t pgid);
   PGRef _lookup_lock_pg(spg_t pgid);
   void register_pg(PGRef pg);
-  void unregister_pg(PG *pg);
+  bool try_finish_pg_delete(PG *pg, unsigned old_pg_num);
 
   void _get_pgs(vector<PGRef> *v, bool clear_too=false);
   void _get_pgids(vector<spg_t> *v);
index 4fe7efb36f5bb27e2ccf01639813f367ed5358c6..4ba31413ba3d71efa9e4a927329712c7491d4452 100644 (file)
@@ -6734,14 +6734,28 @@ void PG::_delete_some(ObjectStore::Transaction *t)
     }
     ch->flush();
 
-    osd->finish_pg_delete(this, pool.info.get_pg_num());
-    deleted = true;
+    if (!osd->try_finish_pg_delete(this, pool.info.get_pg_num())) {
+      dout(1) << __func__ << " raced with merge, reinstantiating" << dendl;
+      ch = osd->store->create_new_collection(coll);
+      _create(*t,
+             info.pgid,
+             info.pgid.get_split_bits(pool.info.get_pg_num()));
+      _init(*t, info.pgid, &pool.info);
+      dirty_info = true;
+      dirty_big_info = true;
 
-    // cancel reserver here, since the PG is about to get deleted and the
-    // exit() methods don't run when that happens.
-    osd->local_reserver.cancel_reservation(info.pgid);
+#warning remove me before final merge
+      // REMOVE ME: trigger log error to ensure we exercise this in testing
+      osd->clog->error() << info.pgid << " delete merge race";
+    } else {
+      deleted = true;
 
-    osd->logger->dec(l_osd_pg_removing);
+      // cancel reserver here, since the PG is about to get deleted and the
+      // exit() methods don't run when that happens.
+      osd->local_reserver.cancel_reservation(info.pgid);
+
+      osd->logger->dec(l_osd_pg_removing);
+    }
   }
 }