From: Samuel Just Date: Wed, 8 May 2013 23:19:34 +0000 (-0700) Subject: PG,OSD: delay ops for map prior to queueing in the OpWQ X-Git-Tag: v0.62~5^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=7a8d6fd4a02cdae84b2e34652b4b97f6bd133e8f;p=ceph.git PG,OSD: delay ops for map prior to queueing in the OpWQ Previously, we simply queued ops in the OpWQ without checking. The PG would then check in do_request whether the message should wait for a new map. Unfortunately, this has the side effect that any op requeued for any reason must also requeue the waiting_for_map queue. Now, we will check before queueing the op whether it must wait on a map. To avoid contention, there is now a map_lock which must be held along with the PG lock in order to update the osdmap_ref. The map_lock also protects the waiting_for_map list and queueing PG ops at the back of the OpWQ. A few details: 1) It is no longer necessary to requeue waiting_for_map in on_change() since the other ops are queued at the front. 2) Once waiting_for_map is non-empty, all ops are delayed to simplify ordering. 3) waiting_for_map may now be non-empty during split, so we must split waiting_for_map along with waiting_for_active. This must be done under the map_lock. The bug which uncovered this involved an out of order op as follows: client.4208.0:2378 (e252) arrives, object is degraded client.4208.0:2379 (e253) arrives, waits for map client.4208.0:2378 (e252) is requeued after recovery client.4208.0:2379 (e253) is requeued on map arrival client.4208.0:2379 is processed client.4208.0:2378 is processed Fixes: #4955 Signed-off-by: Samuel Just --- diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index f9f2b46a2218..26dacf577af8 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -6416,7 +6416,7 @@ void OSD::enqueue_op(PG *pg, OpRequestRef op) << " cost " << op->request->get_cost() << " latency " << latency << " " << *(op->request) << dendl; - op_wq.queue(make_pair(PGRef(pg), op)); + pg->queue_op(op); } void OSD::OpWQ::_enqueue(pair item) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 208d6cc21069..9a074a1ae053 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -139,6 +139,7 @@ PG::PG(OSDService *o, OSDMapRef curmap, p.m_seed, p.get_split_bits(curmap->get_pg_num(_pool.id)), _pool.id), + map_lock("PG::map_lock"), osdmap_ref(curmap), pool(_pool), _lock("PG::_lock"), ref(0), @@ -1748,6 +1749,36 @@ bool PG::op_has_sufficient_caps(OpRequestRef op) return cap; } +void PG::take_op_map_waiters() +{ + Mutex::Locker l(map_lock); + for (list::iterator i = waiting_for_map.begin(); + i != waiting_for_map.end(); + ) { + if (op_must_wait_for_map(get_osdmap_with_maplock(), *i)) { + break; + } else { + osd->op_wq.queue(make_pair(PGRef(this), *i)); + waiting_for_map.erase(i++); + } + } +} + +void PG::queue_op(OpRequestRef op) +{ + Mutex::Locker l(map_lock); + if (!waiting_for_map.empty()) { + // preserve ordering + waiting_for_map.push_back(op); + return; + } + if (op_must_wait_for_map(get_osdmap_with_maplock(), op)) { + waiting_for_map.push_back(op); + return; + } + osd->op_wq.queue(make_pair(PGRef(this), op)); +} + void PG::do_request(OpRequestRef op) { // do any pending flush @@ -1757,11 +1788,7 @@ void PG::do_request(OpRequestRef op) osd->reply_op_error(op, -EPERM); return; } - if (op_must_wait_for_map(get_osdmap(), op)) { - dout(20) << " waiting for map on " << op << dendl; - waiting_for_map.push_back(op); - return; - } + assert(!op_must_wait_for_map(get_osdmap(), op)); if (can_discard_request(op)) { return; } @@ -2099,7 +2126,6 @@ static void split_replay_queue( void PG::split_ops(PG *child, unsigned split_bits) { unsigned match = child->info.pgid.m_seed; - assert(waiting_for_map.empty()); assert(waiting_for_all_missing.empty()); assert(waiting_for_missing_object.empty()); assert(waiting_for_degraded_object.empty()); @@ -2109,12 +2135,16 @@ void PG::split_ops(PG *child, unsigned split_bits) { osd->dequeue_pg(this, &waiting_for_active); split_list(&waiting_for_active, &(child->waiting_for_active), match, split_bits); + { + Mutex::Locker l(map_lock); // to avoid a race with the osd dispatch + split_list(&waiting_for_map, &(child->waiting_for_map), match, split_bits); + } } void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits) { child->update_snap_mapper_bits(split_bits); - child->osdmap_ref = osdmap_ref; + child->update_osdmap_ref(get_osdmap()); child->pool = pool; @@ -5396,7 +5426,7 @@ bool PG::op_must_wait_for_map(OSDMapRef curmap, OpRequestRef op) void PG::take_waiters() { dout(10) << "take_waiters" << dendl; - requeue_ops(waiting_for_map); + take_op_map_waiters(); for (list::iterator i = peering_waiters.begin(); i != peering_waiters.end(); ++i) osd->queue_for_peering(this); @@ -5491,7 +5521,7 @@ void PG::handle_advance_map(OSDMapRef osdmap, OSDMapRef lastmap, assert(lastmap->get_epoch() == osdmap_ref->get_epoch()); assert(lastmap == osdmap_ref); dout(10) << "handle_advance_map " << newup << "/" << newacting << dendl; - osdmap_ref = osdmap; + update_osdmap_ref(osdmap); pool.update(osdmap); AdvMap evt(osdmap, lastmap, newup, newacting); recovery_state.handle_event(evt, rctx); diff --git a/src/osd/PG.h b/src/osd/PG.h index 1a4767b29f4d..b65de0fcffc9 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -381,9 +381,27 @@ public: snap_mapper.update_bits(bits); } protected: + // Ops waiting for map, should be queued at back + Mutex map_lock; + list waiting_for_map; OSDMapRef osdmap_ref; PGPool pool; + void queue_op(OpRequestRef op); + void take_op_map_waiters(); + + void update_osdmap_ref(OSDMapRef newmap) { + assert(_lock.is_locked_by_me()); + Mutex::Locker l(map_lock); + osdmap_ref = newmap; + } + + OSDMapRef get_osdmap_with_maplock() const { + assert(map_lock.is_locked()); + assert(osdmap_ref); + return osdmap_ref; + } + OSDMapRef get_osdmap() const { assert(is_locked()); assert(osdmap_ref); @@ -692,8 +710,6 @@ protected: // Ops waiting on backfill_pos to change list waiting_for_backfill_pos; - - list waiting_for_map; list waiting_for_active; list waiting_for_all_missing; map > waiting_for_missing_object, diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 708e4153ca89..8811b9ab0ff3 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -6328,7 +6328,6 @@ void ReplicatedPG::on_change() // requeue everything in the reverse order they should be // reexamined. - requeue_ops(waiting_for_map); clear_scrub_reserved(); scrub_clear_state();