From: Sage Weil Date: Tue, 14 Aug 2018 17:15:52 +0000 (-0500) Subject: osd: handle pg delete vs merge race X-Git-Tag: v14.0.1~371^2~3 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=93b62838298d9156d3b9229abb4e7f1567659651;p=ceph.git osd: handle pg delete vs merge race Deletion involves an awkward dance between the pg lock and shard locks, while the merge prep and tracking is "shard down". If the delete has finished its work we may find that a merge has since been prepped. Unwinding the merge tracking is nontrivial, especially because it might involved a second PG, possibly even a fabricated placeholder one. Instead, if we delete and find that a merge is coming, undo our deletion and let things play out in the future map epoch. Signed-off-by: Sage Weil --- diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 180b84b14213..416714b7ddeb 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1689,20 +1689,9 @@ void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e) e)); } -void OSDService::finish_pg_delete(PG *pg, unsigned old_pg_num) +bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num) { - // update pg count now since we might not get an osdmap any time soon. - if (pg->is_primary()) - logger->dec(l_osd_pg_primary); - else if (pg->is_replica()) - logger->dec(l_osd_pg_replica); - else - logger->dec(l_osd_pg_stray); - - osd->unregister_pg(pg); - for (auto shard : osd->shards) { - shard->unprime_split_children(pg->pg_id, old_pg_num); - } + return osd->try_finish_pg_delete(pg, old_pg_num); } // --- @@ -3934,19 +3923,39 @@ void OSD::register_pg(PGRef pg) sdata->_attach_pg(slot, pg.get()); } -void OSD::unregister_pg(PG *pg) +bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num) { auto sdata = pg->osd_shard; ceph_assert(sdata); - Mutex::Locker l(sdata->shard_lock); - auto p = sdata->pg_slots.find(pg->pg_id); - if (p != sdata->pg_slots.end() && - p->second->pg) { + { + Mutex::Locker l(sdata->shard_lock); + auto p = sdata->pg_slots.find(pg->pg_id); + if (p == sdata->pg_slots.end() || + !p->second->pg) { + dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl; + return false; + } + if (p->second->waiting_for_merge_epoch) { + dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl; + return false; + } dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl; sdata->_detach_pg(p->second.get()); - } else { - dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl; } + + for (auto shard : shards) { + shard->unprime_split_children(pg->pg_id, old_pg_num); + } + + // update pg count now since we might not get an osdmap any time soon. + if (pg->is_primary()) + service.logger->dec(l_osd_pg_primary); + else if (pg->is_replica()) + service.logger->dec(l_osd_pg_replica); + else + service.logger->dec(l_osd_pg_stray); + + return true; } PGRef OSD::_lookup_pg(spg_t pgid) @@ -8198,6 +8207,7 @@ bool OSD::advance_pg( unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num); dout(1) << __func__ << " merging " << pg->pg_id << dendl; pg->merge_from(sources, rctx, split_bits); + pg->pg_slot->waiting_for_merge_epoch = 0; } else { dout(20) << __func__ << " not ready to merge yet" << dendl; pg->write_if_dirty(rctx); diff --git a/src/osd/OSD.h b/src/osd/OSD.h index a0e9b373b6e2..9d9ac42c333d 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -753,7 +753,7 @@ public: void queue_for_snap_trim(PG *pg); void queue_for_scrub(PG *pg, bool with_high_priority); void queue_for_pg_delete(spg_t pgid, epoch_t e); - void finish_pg_delete(PG *pg, unsigned old_pg_num); + bool try_finish_pg_delete(PG *pg, unsigned old_pg_num); private: // -- pg recovery and associated throttling -- @@ -1896,7 +1896,7 @@ protected: PGRef _lookup_pg(spg_t pgid); PGRef _lookup_lock_pg(spg_t pgid); void register_pg(PGRef pg); - void unregister_pg(PG *pg); + bool try_finish_pg_delete(PG *pg, unsigned old_pg_num); void _get_pgs(vector *v, bool clear_too=false); void _get_pgids(vector *v); diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 4fe7efb36f5b..4ba31413ba3d 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -6734,14 +6734,28 @@ void PG::_delete_some(ObjectStore::Transaction *t) } ch->flush(); - osd->finish_pg_delete(this, pool.info.get_pg_num()); - deleted = true; + if (!osd->try_finish_pg_delete(this, pool.info.get_pg_num())) { + dout(1) << __func__ << " raced with merge, reinstantiating" << dendl; + ch = osd->store->create_new_collection(coll); + _create(*t, + info.pgid, + info.pgid.get_split_bits(pool.info.get_pg_num())); + _init(*t, info.pgid, &pool.info); + dirty_info = true; + dirty_big_info = true; - // cancel reserver here, since the PG is about to get deleted and the - // exit() methods don't run when that happens. - osd->local_reserver.cancel_reservation(info.pgid); +#warning remove me before final merge + // REMOVE ME: trigger log error to ensure we exercise this in testing + osd->clog->error() << info.pgid << " delete merge race"; + } else { + deleted = true; - osd->logger->dec(l_osd_pg_removing); + // cancel reserver here, since the PG is about to get deleted and the + // exit() methods don't run when that happens. + osd->local_reserver.cancel_reservation(info.pgid); + + osd->logger->dec(l_osd_pg_removing); + } } }