From ce699ff8701a46974f09fd1512e1fe61abb1a5a7 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 23 Feb 2018 13:39:40 -0600 Subject: [PATCH] osd: close split vs query race in consume_map Consider the race: - shard 0 consumes epoch E - shard 1 consumes epoch E - shard 1 pg P will split to C - shard 0 processes query on C, returns DNE - shard 0 primes slot C Close race by priming split children before consuming map into each OSDShard. That way the query will either (1) arrive before E and before slot C is primed and wait for E, or find the slot present with waiting_for_split true. Signed-off-by: Sage Weil --- src/osd/OSD.cc | 38 +++++++++++++++++++++++--------------- src/osd/OSD.h | 20 ++------------------ 2 files changed, 25 insertions(+), 33 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index d2c87aab175a1..59776bf2c1433 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -7766,12 +7766,10 @@ void OSD::consume_map() service.await_reserved_maps(); service.publish_map(osdmap); - int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0; - - unsigned pushes_to_free = 0; + // prime splits set newly_split; for (auto& shard : shards) { - shard->consume_map(osdmap, &pushes_to_free, &newly_split); + shard->identify_splits(osdmap, &newly_split); } if (!newly_split.empty()) { for (auto& shard : shards) { @@ -7780,10 +7778,16 @@ void OSD::consume_map() assert(newly_split.empty()); } + unsigned pushes_to_free = 0; + for (auto& shard : shards) { + shard->consume_map(osdmap, &pushes_to_free); + } + vector pgids; _get_pgids(&pgids); // count (FIXME) + int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0; vector pgs; _get_pgs(&pgs); for (auto& pg : pgs) { @@ -9298,8 +9302,7 @@ void OSDShard::wait_min_pg_epoch(epoch_t need) void OSDShard::consume_map( OSDMapRef& new_osdmap, - unsigned *pushes_to_free, - set *new_children) + unsigned *pushes_to_free) { Mutex::Locker l(sdata_op_ordering_lock); OSDMapRef old_osdmap; @@ -9319,14 +9322,6 @@ void OSDShard::consume_map( OSDShardPGSlot *slot = p->second.get(); const spg_t& pgid = p->first; dout(20) << __func__ << " " << pgid << dendl; - if (old_osdmap && - (slot->pg || slot->waiting_for_split)) { - // only prime children for parent slots that are attached to a - // pg or are waiting_for_split (because their ancestor is - // attached to a pg). - osd->service.identify_split_children(old_osdmap, new_osdmap, pgid, - new_children); - } if (slot->waiting_for_split) { dout(20) << __func__ << " " << pgid << " waiting for split" << dendl; @@ -9373,7 +9368,6 @@ void OSDShard::consume_map( } ++p; } - _prime_splits(new_children); if (queued) { sdata_lock.Lock(); sdata_cond.SignalOne(); @@ -9415,6 +9409,20 @@ void OSDShard::_wake_pg_slot( ++slot->requeue_seq; } +void OSDShard::identify_splits(OSDMapRef as_of_osdmap, set *pgids) +{ + Mutex::Locker l(sdata_op_ordering_lock); + if (osdmap) { + for (auto& i : pg_slots) { + const spg_t& pgid = i.first; + auto *slot = i.second.get(); + if (slot->pg || slot->waiting_for_split) { + osd->service.identify_split_children(osdmap, as_of_osdmap, pgid, pgids); + } + } + } +} + void OSDShard::prime_splits(OSDMapRef as_of_osdmap, set *pgids) { Mutex::Locker l(sdata_op_ordering_lock); diff --git a/src/osd/OSD.h b/src/osd/OSD.h index dcb5c87780b2d..27422bb29c858 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1078,22 +1078,6 @@ enum class io_queue { discard any slots with no pg (and not waiting_for_split) that no longer map to the current host. - Some notes: - - - There is theoretical race between query (which can proceed if the pg doesn't - exist) and split (which may materialize a PG in a different shard): - - osd has epoch E - - shard 0 processes notify on P from epoch E-1 - - shard 0 identifies P splits to P+C in epoch E - - shard 1 receives query for P (epoch E), returns DNE - - shard 1 installs C in shard 0 with waiting_for_split - - This can't really be fixed without ordering queries over all shards. In - practice, it is very unlikely to occur, since only the primary sends a - notify (or other creating event) and only the primary who sends a query. - Even if it does happen, the instantiated P is empty, so reporting DNE vs - empty C is minimal harm. - */ struct OSDShardPGSlot { @@ -1190,11 +1174,11 @@ struct OSDShard { /// push osdmap into shard void consume_map( OSDMapRef& osdmap, - unsigned *pushes_to_free, - set *new_children); + unsigned *pushes_to_free); void _wake_pg_slot(spg_t pgid, OSDShardPGSlot *slot); + void identify_splits(OSDMapRef as_of_osdmap, set *pgids); void _prime_splits(set *pgids); void prime_splits(OSDMapRef as_of_osdmap, set *pgids); void register_and_wake_split_child(PG *pg); -- 2.39.5