osd: close split vs query race in consume_map

author Sage Weil <sage@redhat.com>

Fri, 23 Feb 2018 19:39:40 +0000 (13:39 -0600)

committer Sage Weil <sage@redhat.com>

Wed, 4 Apr 2018 13:26:57 +0000 (08:26 -0500)
author Sage Weil <sage@redhat.com>
Fri, 23 Feb 2018 19:39:40 +0000 (13:39 -0600)
committer Sage Weil <sage@redhat.com>
Wed, 4 Apr 2018 13:26:57 +0000 (08:26 -0500)
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc

index d2c87aab175a1586dbb0d0d7b50b00e3bdf3dabb..59776bf2c14334989758c5fe6d10bb61ef765112 100644 (file)
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -7766,12 +7766,10 @@ void OSD::consume_map()
    service.await_reserved_maps();
    service.publish_map(osdmap);
  
-  int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
-
-  unsigned pushes_to_free = 0;
+  // prime splits
    set<spg_t> newly_split;
    for (auto& shard : shards) {
-    shard->consume_map(osdmap, &pushes_to_free, &newly_split);
+    shard->identify_splits(osdmap, &newly_split);
    }
    if (!newly_split.empty()) {
      for (auto& shard : shards) {
@@ -7780,10 +7778,16 @@ void OSD::consume_map()
      assert(newly_split.empty());
    }
  
+  unsigned pushes_to_free = 0;
+  for (auto& shard : shards) {
+    shard->consume_map(osdmap, &pushes_to_free);
+  }
+
    vector<spg_t> pgids;
    _get_pgids(&pgids);
  
    // count (FIXME)
+  int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
@@ -9298,8 +9302,7 @@ void OSDShard::wait_min_pg_epoch(epoch_t need)
  
  void OSDShard::consume_map(
    OSDMapRef& new_osdmap,
-  unsigned *pushes_to_free,
-  set<spg_t> *new_children)
+  unsigned *pushes_to_free)
  {
    Mutex::Locker l(sdata_op_ordering_lock);
    OSDMapRef old_osdmap;
@@ -9319,14 +9322,6 @@ void OSDShard::consume_map(
      OSDShardPGSlot *slot = p->second.get();
      const spg_t& pgid = p->first;
      dout(20) << __func__ << " " << pgid << dendl;
-    if (old_osdmap &&
-       (slot->pg || slot->waiting_for_split)) {
-      // only prime children for parent slots that are attached to a
-      // pg or are waiting_for_split (because their ancestor is
-      // attached to a pg).
-      osd->service.identify_split_children(old_osdmap, new_osdmap, pgid,
-                                          new_children);
-    }
      if (slot->waiting_for_split) {
        dout(20) << __func__ << "  " << pgid
                << " waiting for split" << dendl;
@@ -9373,7 +9368,6 @@ void OSDShard::consume_map(
      }
      ++p;
    }
-  _prime_splits(new_children);
    if (queued) {
      sdata_lock.Lock();
      sdata_cond.SignalOne();
@@ -9415,6 +9409,20 @@ void OSDShard::_wake_pg_slot(
    ++slot->requeue_seq;
  }
  
+void OSDShard::identify_splits(OSDMapRef as_of_osdmap, set<spg_t> *pgids)
+{
+  Mutex::Locker l(sdata_op_ordering_lock);
+  if (osdmap) {
+    for (auto& i : pg_slots) {
+      const spg_t& pgid = i.first;
+      auto *slot = i.second.get();
+      if (slot->pg || slot->waiting_for_split) {
+       osd->service.identify_split_children(osdmap, as_of_osdmap, pgid, pgids);
+      }
+    }
+  }
+}
+
  void OSDShard::prime_splits(OSDMapRef as_of_osdmap, set<spg_t> *pgids)
  {
    Mutex::Locker l(sdata_op_ordering_lock);
diff --git a/src/osd/OSD.h b/src/osd/OSD.h

index dcb5c87780b2d9a2af967c35b8634453c5d38d5d..27422bb29c8582be57eae3e5fcdf0f3f9f98d0fb 100644 (file)
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -1078,22 +1078,6 @@ enum class io_queue {
      discard any slots with no pg (and not waiting_for_split) that no
      longer map to the current host.
  
-  Some notes:
-
-  - There is theoretical race between query (which can proceed if the pg doesn't
-    exist) and split (which may materialize a PG in a different shard):
-      - osd has epoch E
-      - shard 0 processes notify on P from epoch E-1
-      - shard 0 identifies P splits to P+C in epoch E
-      - shard 1 receives query for P (epoch E), returns DNE
-      - shard 1 installs C in shard 0 with waiting_for_split
-
-    This can't really be fixed without ordering queries over all shards.  In
-    practice, it is very unlikely to occur, since only the primary sends a
-    notify (or other creating event) and only the primary who sends a query.
-    Even if it does happen, the instantiated P is empty, so reporting DNE vs
-    empty C is minimal harm.
-
    */
  
  struct OSDShardPGSlot {
@@ -1190,11 +1174,11 @@ struct OSDShard {
    /// push osdmap into shard
    void consume_map(
      OSDMapRef& osdmap,
-    unsigned *pushes_to_free,
-    set<spg_t> *new_children);
+    unsigned *pushes_to_free);
  
    void _wake_pg_slot(spg_t pgid, OSDShardPGSlot *slot);
  
+  void identify_splits(OSDMapRef as_of_osdmap, set<spg_t> *pgids);
    void _prime_splits(set<spg_t> *pgids);
    void prime_splits(OSDMapRef as_of_osdmap, set<spg_t> *pgids);
    void register_and_wake_split_child(PG *pg);
author	Sage Weil <sage@redhat.com>
	Fri, 23 Feb 2018 19:39:40 +0000 (13:39 -0600)
committer	Sage Weil <sage@redhat.com>
	Wed, 4 Apr 2018 13:26:57 +0000 (08:26 -0500)
src/osd/OSD.cc		patch \| blob \| history
src/osd/OSD.h		patch \| blob \| history