]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
common: fix races when visiting a TrackedOp. 23026/head
authorRadoslaw Zarzynski <rzarzyns@redhat.com>
Thu, 5 Jul 2018 12:39:31 +0000 (14:39 +0200)
committerNathan Cutler <ncutler@suse.com>
Fri, 13 Jul 2018 11:23:24 +0000 (13:23 +0200)
Before the commit following race can happen:
  ```
  A           : OpTracker::visit_ops_in_flight(..., callable leaking TrackedOpRef outside)
  A           : Mutex::Locker::Locker(sdata->ops_in_flight_lock_sharded)
  A with lock : (nref > 0) == true

  B           : TrackedOp::put(), nref := 0 // updating the counter is done without the lock
  B           : OpTracker::unregister_inflight_op()
  B           : Mutex::Locker::Locker(sdata->ops_in_flight_lock_sharded)

  A with lock : visit() -> TrackedOp::get(), nref := 1
  A with lock : Mutex::Locker::~Locker()

  B with lock : boost::intrusive::list::iterator_to(op)
  B with lock : boost::intrusive::list::erase(iter)
  B with lock : Mutex::Locker::~Locker()

  A           : TrackedOp::put(), nref := 0
  A           : OpTracker::unregister_inflight_op()
  A           : Mutex::Locker::Locker(sdata->ops_in_flight_lock_sharded)
  A with lock : boost::intrusive::list::iterator_to(op) // oops as op doesn't belong to the list anymore
  ```

Fixes: https://tracker.ceph.com/issues/24664
Related-To: https://tracker.ceph.com/issues/24037
Signed-off-by: Radoslaw Zarzynski <rzarzyns@redhat.com>
(cherry picked from commit 871cbf0f6114c301acc749ae184dbbac32baacf2)

src/common/TrackedOp.cc
src/common/TrackedOp.h

index 8d4e986190aa22ee42afcbb8bb9994908109b650..93df29736e182e12fd48f9f5009d9ccc735b0a1a 100644 (file)
@@ -265,7 +265,7 @@ bool OpTracker::register_inflight_op(TrackedOp *i)
   return true;
 }
 
-void OpTracker::unregister_inflight_op(TrackedOp *i)
+void OpTracker::unregister_inflight_op(TrackedOp* const i)
 {
   // caller checks;
   assert(i->state);
@@ -278,16 +278,12 @@ void OpTracker::unregister_inflight_op(TrackedOp *i)
     auto p = sdata->ops_in_flight_sharded.iterator_to(*i);
     sdata->ops_in_flight_sharded.erase(p);
   }
-  i->_unregistered();
+}
 
-  if (!tracking_enabled)
-    delete i;
-  else {
-    RWLock::RLocker l(lock);
-    i->state = TrackedOp::STATE_HISTORY;
-    utime_t now = ceph_clock_now();
-    history.insert(now, TrackedOpRef(i));
-  }
+void OpTracker::record_history_op(TrackedOpRef&& i)
+{
+  RWLock::RLocker l(lock);
+  history.insert(ceph_clock_now(), std::move(i));
 }
 
 bool OpTracker::visit_ops_in_flight(utime_t* oldest_secs,
@@ -328,7 +324,7 @@ bool OpTracker::visit_ops_in_flight(utime_t* oldest_secs,
     assert(NULL != sdata);
     Mutex::Locker locker(sdata->ops_in_flight_lock_sharded);
     for (auto& op : sdata->ops_in_flight_sharded) {
-      if (op.nref > 0 && !visit(op))
+      if (!visit(op))
        break;
     }
   }
index 2e3cc09a22c2fdbeedf7ab1fc535ace6b301eff3..a0790e48d9853653da99fa53df9a0c5d04972a19 100644 (file)
@@ -134,6 +134,7 @@ public:
   bool dump_historic_slow_ops(Formatter *f, set<string> filters = {""});
   bool register_inflight_op(TrackedOp *i);
   void unregister_inflight_op(TrackedOp *i);
+  void record_history_op(TrackedOpRef&& i);
 
   void get_age_ms_histogram(pow2_hist_t *h);
 
@@ -303,7 +304,9 @@ public:
     ++nref;
   }
   void put() {
-    if (--nref == 0) {
+  again:
+    auto nref_snap = nref.load();
+    if (nref_snap == 1) {
       switch (state.load()) {
       case STATE_UNTRACKED:
        _unregistered();
@@ -313,6 +316,14 @@ public:
       case STATE_LIVE:
        mark_event("done");
        tracker->unregister_inflight_op(this);
+       _unregistered();
+       if (!tracker->is_tracking()) {
+         delete this;
+       } else {
+         state = TrackedOp::STATE_HISTORY;
+         tracker->record_history_op(
+           TrackedOpRef(this, /* add_ref = */ false));
+       }
        break;
 
       case STATE_HISTORY:
@@ -322,6 +333,8 @@ public:
       default:
        ceph_abort();
       }
+    } else if (!nref.compare_exchange_weak(nref_snap, nref_snap - 1)) {
+      goto again;
     }
   }