]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd/ECBackend: fix on_write ordering w/ sync onreadable callbacks 18196/head
authorSage Weil <sage@redhat.com>
Wed, 18 Oct 2017 02:45:45 +0000 (21:45 -0500)
committerSage Weil <sage@redhat.com>
Tue, 12 Dec 2017 17:21:49 +0000 (11:21 -0600)
When we call handle_sub_write after a write completion, we may
do a sync read completion and then call back into check_ops().  Attaching
the on_write events to the op we're applying means that we don't ensure
that the on_write event(s) happen before the next write in the queue
is submitted (when we call back into check_ops()).

For example, if we have op A, on_write event W, then op B, a sync
applied completion would mean that we would queue the write for A, call
back into SubWriteApplied -> handle_sub_write_reply -> check_ops and then
process B... before getting to W.

Resolve this by attaching the on_write callback to a separate Op that is
placed into the queue, just like any other Op.  This keeps the ordering
logic clean, although it is a bit ugly with the polymorphism around Op
being either an Op or an on_write callback.

Signed-off-by: Sage Weil <sage@redhat.com>
src/osd/ECBackend.cc
src/osd/ECBackend.h

index 19de8ad0642a51e1f359794613f15de225595df2..89b928c3a94156e498376433969bd87ae9cc0b9e 100644 (file)
@@ -1496,14 +1496,26 @@ void ECBackend::submit_transaction(
   dout(10) << "onreadable_sync: " << op->on_local_applied_sync << dendl;
 }
 
-void ECBackend::call_write_ordered(std::function<void(void)> &&cb) {
-  if (!waiting_state.empty()) {
-    waiting_state.back().on_write.emplace_back(std::move(cb));
-  } else if (!waiting_reads.empty()) {
-    waiting_reads.back().on_write.emplace_back(std::move(cb));
-  } else {
-    // Nothing earlier in the pipeline, just call it
+void ECBackend::call_write_ordered(std::function<void(void)> &&cb)
+{
+  if (waiting_state.empty() &&
+      waiting_reads.empty()) {
+    dout(10) << __func__ << " sync" << dendl;
     cb();
+  } else {
+    ceph_tid_t tid = parent->get_tid();
+    Op& op = tid_to_op_map[tid];
+    op.tid = tid;
+    op.on_write = std::move(cb);
+    if (!waiting_state.empty()) {
+      dout(10) << __func__ << " tid " << tid << " waiting_state" << dendl;
+      waiting_state.push_back(op);
+    } else if (!waiting_reads.empty()) {
+      dout(10) << __func__ << " tid " << tid << " waiting_reads" << dendl;
+      waiting_reads.push_back(op);
+    } else {
+      ceph_abort();
+    }
   }
 }
 
@@ -1884,6 +1896,12 @@ bool ECBackend::try_reads_to_commit()
   if (waiting_reads.empty())
     return false;
   Op *op = &(waiting_reads.front());
+  if (op->on_write) {
+    waiting_reads.pop_front();
+    op->on_write();
+    tid_to_op_map.erase(op->tid);
+    return true;
+  }
   if (op->read_in_progress())
     return false;
   waiting_reads.pop_front();
@@ -2021,21 +2039,14 @@ bool ECBackend::try_reads_to_commit()
     }
   }
   if (should_write_local) {
-      handle_sub_write(
-       get_parent()->whoami_shard(),
-       op->client_op,
-       local_write_op,
-       op->trace,
-       op->on_local_applied_sync);
-      op->on_local_applied_sync = 0;
-  }
-
-  for (auto i = op->on_write.begin();
-       i != op->on_write.end();
-       op->on_write.erase(i++)) {
-    (*i)();
+    handle_sub_write(
+      get_parent()->whoami_shard(),
+      op->client_op,
+      local_write_op,
+      op->trace,
+      op->on_local_applied_sync);
+    op->on_local_applied_sync = 0;
   }
-
   return true;
 }
 
index 53852583ae4e2d82469ff2928489551fb538315a..a854ef99ffae4251c651470aed50177208b81b22 100644 (file)
@@ -469,7 +469,7 @@ public:
     map<hobject_t, ObjectContextRef> obc_map;
 
     /// see call_write_ordered
-    std::list<std::function<void(void)> > on_write;
+    std::function<void(void)> on_write;
 
     /// Generated internally
     set<hobject_t> temp_added;
@@ -507,6 +507,10 @@ public:
     Context *on_local_applied_sync = nullptr;
     Context *on_all_applied = nullptr;
     Context *on_all_commit = nullptr;
+
+    Op() {}
+    Op(ceph_tid_t t, std::function<void(void)>&& cb)
+      : tid(t), on_write(cb) { }
     ~Op() {
       delete on_local_applied_sync;
       delete on_all_applied;