]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
PG,OSD: delay ops for map prior to queueing in the OpWQ
authorSamuel Just <sam.just@inktank.com>
Wed, 8 May 2013 23:19:34 +0000 (16:19 -0700)
committerSamuel Just <sam.just@inktank.com>
Fri, 10 May 2013 00:30:42 +0000 (17:30 -0700)
Previously, we simply queued ops in the OpWQ without checking.  The PG
would then check in do_request whether the message should wait for a new
map.  Unfortunately, this has the side effect that any op requeued for
any reason must also requeue the waiting_for_map queue.

Now, we will check before queueing the op whether it must wait on a map.
To avoid contention, there is now a map_lock which must be held along
with the PG lock in order to update the osdmap_ref.  The map_lock also
protects the waiting_for_map list and queueing PG ops at the back of
the OpWQ.  A few details:
1) It is no longer necessary to requeue waiting_for_map in on_change()
since the other ops are queued at the front.
2) Once waiting_for_map is non-empty, all ops are delayed to simplify
ordering.
3) waiting_for_map may now be non-empty during split, so we must split
waiting_for_map along with waiting_for_active.  This must be done
under the map_lock.

The bug which uncovered this involved an out of order op as follows:

client.4208.0:2378 (e252) arrives, object is degraded
client.4208.0:2379 (e253) arrives, waits for map
client.4208.0:2378 (e252) is requeued after recovery
client.4208.0:2379 (e253) is requeued on map arrival
client.4208.0:2379 is processed
client.4208.0:2378 is processed

Fixes: #4955
Signed-off-by: Samuel Just <sam.just@inktank.com>
src/osd/OSD.cc
src/osd/PG.cc
src/osd/PG.h
src/osd/ReplicatedPG.cc

index f9f2b46a2218d1923e8e58818f7aab661398ef01..26dacf577af8c2ccae000abc1ebde7d0ff458f48 100644 (file)
@@ -6416,7 +6416,7 @@ void OSD::enqueue_op(PG *pg, OpRequestRef op)
           << " cost " << op->request->get_cost()
           << " latency " << latency
           << " " << *(op->request) << dendl;
-  op_wq.queue(make_pair(PGRef(pg), op));
+  pg->queue_op(op);
 }
 
 void OSD::OpWQ::_enqueue(pair<PGRef, OpRequestRef> item)
index 208d6cc21069292ddb425a814a49155607f5d2f5..9a074a1ae0531a55c6b9acbacb6582998b47c88e 100644 (file)
@@ -139,6 +139,7 @@ PG::PG(OSDService *o, OSDMapRef curmap,
     p.m_seed,
     p.get_split_bits(curmap->get_pg_num(_pool.id)),
     _pool.id),
+  map_lock("PG::map_lock"),
   osdmap_ref(curmap), pool(_pool),
   _lock("PG::_lock"),
   ref(0),
@@ -1748,6 +1749,36 @@ bool PG::op_has_sufficient_caps(OpRequestRef op)
   return cap;
 }
 
+void PG::take_op_map_waiters()
+{
+  Mutex::Locker l(map_lock);
+  for (list<OpRequestRef>::iterator i = waiting_for_map.begin();
+       i != waiting_for_map.end();
+       ) {
+    if (op_must_wait_for_map(get_osdmap_with_maplock(), *i)) {
+      break;
+    } else {
+      osd->op_wq.queue(make_pair(PGRef(this), *i));
+      waiting_for_map.erase(i++);
+    }
+  }
+}
+
+void PG::queue_op(OpRequestRef op)
+{
+  Mutex::Locker l(map_lock);
+  if (!waiting_for_map.empty()) {
+    // preserve ordering
+    waiting_for_map.push_back(op);
+    return;
+  }
+  if (op_must_wait_for_map(get_osdmap_with_maplock(), op)) {
+    waiting_for_map.push_back(op);
+    return;
+  }
+  osd->op_wq.queue(make_pair(PGRef(this), op));
+}
+
 void PG::do_request(OpRequestRef op)
 {
   // do any pending flush
@@ -1757,11 +1788,7 @@ void PG::do_request(OpRequestRef op)
     osd->reply_op_error(op, -EPERM);
     return;
   }
-  if (op_must_wait_for_map(get_osdmap(), op)) {
-    dout(20) << " waiting for map on " << op << dendl;
-    waiting_for_map.push_back(op);
-    return;
-  }
+  assert(!op_must_wait_for_map(get_osdmap(), op));
   if (can_discard_request(op)) {
     return;
   }
@@ -2099,7 +2126,6 @@ static void split_replay_queue(
 
 void PG::split_ops(PG *child, unsigned split_bits) {
   unsigned match = child->info.pgid.m_seed;
-  assert(waiting_for_map.empty());
   assert(waiting_for_all_missing.empty());
   assert(waiting_for_missing_object.empty());
   assert(waiting_for_degraded_object.empty());
@@ -2109,12 +2135,16 @@ void PG::split_ops(PG *child, unsigned split_bits) {
 
   osd->dequeue_pg(this, &waiting_for_active);
   split_list(&waiting_for_active, &(child->waiting_for_active), match, split_bits);
+  {
+    Mutex::Locker l(map_lock); // to avoid a race with the osd dispatch
+    split_list(&waiting_for_map, &(child->waiting_for_map), match, split_bits);
+  }
 }
 
 void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
 {
   child->update_snap_mapper_bits(split_bits);
-  child->osdmap_ref = osdmap_ref;
+  child->update_osdmap_ref(get_osdmap());
 
   child->pool = pool;
 
@@ -5396,7 +5426,7 @@ bool PG::op_must_wait_for_map(OSDMapRef curmap, OpRequestRef op)
 void PG::take_waiters()
 {
   dout(10) << "take_waiters" << dendl;
-  requeue_ops(waiting_for_map);
+  take_op_map_waiters();
   for (list<CephPeeringEvtRef>::iterator i = peering_waiters.begin();
        i != peering_waiters.end();
        ++i) osd->queue_for_peering(this);
@@ -5491,7 +5521,7 @@ void PG::handle_advance_map(OSDMapRef osdmap, OSDMapRef lastmap,
   assert(lastmap->get_epoch() == osdmap_ref->get_epoch());
   assert(lastmap == osdmap_ref);
   dout(10) << "handle_advance_map " << newup << "/" << newacting << dendl;
-  osdmap_ref = osdmap;
+  update_osdmap_ref(osdmap);
   pool.update(osdmap);
   AdvMap evt(osdmap, lastmap, newup, newacting);
   recovery_state.handle_event(evt, rctx);
index 1a4767b29f4d7bfc5645775e13bc1663bf9cefac..b65de0fcffc9e8bc205621ab8814c978b642d25e 100644 (file)
@@ -381,9 +381,27 @@ public:
     snap_mapper.update_bits(bits);
   }
 protected:
+  // Ops waiting for map, should be queued at back
+  Mutex map_lock;
+  list<OpRequestRef> waiting_for_map;
   OSDMapRef osdmap_ref;
   PGPool pool;
 
+  void queue_op(OpRequestRef op);
+  void take_op_map_waiters();
+
+  void update_osdmap_ref(OSDMapRef newmap) {
+    assert(_lock.is_locked_by_me());
+    Mutex::Locker l(map_lock);
+    osdmap_ref = newmap;
+  }
+
+  OSDMapRef get_osdmap_with_maplock() const {
+    assert(map_lock.is_locked());
+    assert(osdmap_ref);
+    return osdmap_ref;
+  }
+
   OSDMapRef get_osdmap() const {
     assert(is_locked());
     assert(osdmap_ref);
@@ -692,8 +710,6 @@ protected:
 
   // Ops waiting on backfill_pos to change
   list<OpRequestRef> waiting_for_backfill_pos;
-
-  list<OpRequestRef> waiting_for_map;
   list<OpRequestRef>            waiting_for_active;
   list<OpRequestRef>            waiting_for_all_missing;
   map<hobject_t, list<OpRequestRef> > waiting_for_missing_object,
index 708e4153ca89301d7e1f49ba7058b5601783ee5a..8811b9ab0ff360d272836dcd9045dd7b2c2c8e11 100644 (file)
@@ -6328,7 +6328,6 @@ void ReplicatedPG::on_change()
 
   // requeue everything in the reverse order they should be
   // reexamined.
-  requeue_ops(waiting_for_map);
 
   clear_scrub_reserved();
   scrub_clear_state();