]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: avoid two copy with same src cancel each other
authorYuanXin <yuanxin@didiglobal.com>
Sun, 21 Feb 2021 07:08:07 +0000 (15:08 +0800)
committermyoungwon oh <ohmyoungwon@gmail.com>
Thu, 8 Apr 2021 04:58:08 +0000 (13:58 +0900)
For cache tier, if some head object has two snaps, the two snaps share the same clone object,
and the clone object was flush/evicted from cache pool, when a rollback requests and a read
snap request to these two snaps at the same time will generate two promote requests to the
same clone object, these two promote requests will generate two copy ops with same src, than
the second copy op will cancel the first copy op by calling cancel_copy and kick_object_context_blocked,
but after calling kick_object_context_blocked, a new promote request corresponding to first
copy op will be restarted and generate a new copy op, the new copy op will cancel the second
copy op again, so two promote requests will cancel their copy op each other and run into dead
loop.

Fixes: https://tracker.ceph.com/issues/49409
Signed-off-by: YuanXin <yuanxin@didiglobal.com>
src/osd/PrimaryLogPG.cc
src/osd/PrimaryLogPG.h
src/test/librados/tier_cxx.cc

index 8d01780a8053f98c2e1daa6696611c74b3dcb198..94d74a15db47ffcc9abfe504480cfc9d92152851 100644 (file)
@@ -10142,7 +10142,6 @@ void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue,
   copy_ops.erase(cop->obc->obs.oi.soid);
   cop->obc->stop_block();
 
-  kick_object_context_blocked(cop->obc);
   cop->results.should_requeue = requeue;
   CopyCallbackResults result(-ECANCELED, &cop->results);
   cop->cb->complete(result);
@@ -10153,13 +10152,15 @@ void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue,
   cop->obc = ObjectContextRef();
 }
 
-void PrimaryLogPG::cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids)
+void PrimaryLogPG::cancel_and_kick_copy_ops(bool requeue, vector<ceph_tid_t> *tids)
 {
   dout(10) << __func__ << dendl;
   map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
   while (p != copy_ops.end()) {
+    ObjectContextRef obc = p->second->obc;
     // requeue this op? can I queue up all of them?
     cancel_copy((p++)->second, requeue, tids);
+    kick_object_context_blocked(obc);
   }
 }
 
@@ -12563,7 +12564,7 @@ void PrimaryLogPG::on_shutdown()
   m_scrubber->unreg_next_scrub();
 
   vector<ceph_tid_t> tids;
-  cancel_copy_ops(false, &tids);
+  cancel_and_kick_copy_ops(false, &tids);
   cancel_flush_ops(false, &tids);
   cancel_proxy_ops(false, &tids);
   cancel_manifest_ops(false, &tids);
@@ -12680,7 +12681,7 @@ void PrimaryLogPG::on_change(ObjectStore::Transaction &t)
   requeue_ops(waiting_for_readable);
 
   vector<ceph_tid_t> tids;
-  cancel_copy_ops(is_primary(), &tids);
+  cancel_and_kick_copy_ops(is_primary(), &tids);
   cancel_flush_ops(is_primary(), &tids);
   cancel_proxy_ops(is_primary(), &tids);
   cancel_manifest_ops(is_primary(), &tids);
index e9f9cb9c828cb2b53b837bc617a7282f9b5a4080..f4fb347c87e6017c7053ab41e2c8342d5b799eff 100644 (file)
@@ -1329,7 +1329,7 @@ protected:
   void finish_copyfrom(CopyFromCallback *cb);
   void finish_promote(int r, CopyResults *results, ObjectContextRef obc);
   void cancel_copy(CopyOpRef cop, bool requeue, std::vector<ceph_tid_t> *tids);
-  void cancel_copy_ops(bool requeue, std::vector<ceph_tid_t> *tids);
+  void cancel_and_kick_copy_ops(bool requeue, std::vector<ceph_tid_t> *tids);
 
   friend struct C_Copyfrom;
 
index d3cd46a908980631896806dd372d9287f592b258..10f567ddd395648c55384f01e35d87093d3fbb89 100644 (file)
@@ -1506,6 +1506,117 @@ TEST_F(LibRadosTwoPoolsPP, ListSnap){
   ioctx.selfmanaged_snap_remove(my_snaps[0]);
 }
 
+// This test case reproduces https://tracker.ceph.com/issues/49409
+TEST_F(LibRadosTwoPoolsPP, EvictSnapRollbackReadRace) {
+  // create object
+  {
+    bufferlist bl;
+    int len = string("hi there").length() * 2;
+    // append more chrunk data make sure the second promote
+    // op coming before the first promote op finished
+    for (int i=0; i<4*1024*1024/len; ++i)
+      bl.append("hi therehi there");
+    ObjectWriteOperation op;
+    op.write_full(bl);
+    ASSERT_EQ(0, ioctx.operate("foo", &op));
+  }
+
+  // create two snapshot, a clone
+  vector<uint64_t> my_snaps(2);
+  ASSERT_EQ(0, ioctx.selfmanaged_snap_create(&my_snaps[1]));
+  ASSERT_EQ(0, ioctx.selfmanaged_snap_create(&my_snaps[0]));
+  ASSERT_EQ(0, ioctx.selfmanaged_snap_set_write_ctx(my_snaps[0],
+                                                         my_snaps));
+  {
+    bufferlist bl;
+    bl.append("ciao!");
+    ObjectWriteOperation op;
+    op.write_full(bl);
+    ASSERT_EQ(0, ioctx.operate("foo", &op));
+  }
+
+  // configure cache
+  bufferlist inbl;
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
+    "\", \"tierpool\": \"" + cache_pool_name +
+    "\", \"force_nonempty\": \"--force-nonempty\" }",
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + pool_name +
+    "\", \"overlaypool\": \"" + cache_pool_name + "\"}",
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier cache-mode\", \"pool\": \"" + cache_pool_name +
+    "\", \"mode\": \"writeback\"}",
+    inbl, NULL, NULL));
+
+  // wait for maps to settle
+  cluster.wait_for_latest_osdmap();
+
+  // read, trigger a promote on the head
+  {
+    bufferlist bl;
+    ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
+    ASSERT_EQ('c', bl[0]);
+  }
+
+  // try more times
+  int retries = 50;
+  for (int i=0; i<retries; ++i)
+  {
+    {
+      librados::AioCompletion * completion = cluster.aio_create_completion();
+      librados::AioCompletion * completion1 = cluster.aio_create_completion();
+
+      // send a snap rollback op and a snap read op parallel
+      // trigger two promote(copy) to the same snap clone obj
+      // the second snap read op is read-ordered make sure
+      // op not wait for objects_blocked_on_snap_promotion
+      ObjectWriteOperation op;
+      op.selfmanaged_snap_rollback(my_snaps[0]);
+      ASSERT_EQ(0, ioctx.aio_operate(
+        "foo", completion, &op));
+
+      ioctx.snap_set_read(my_snaps[1]);
+      std::map<uint64_t, uint64_t> extents;
+      bufferlist read_bl;
+      int rval = -1;
+      ObjectReadOperation op1;
+      op1.sparse_read(0, 8, &extents, &read_bl, &rval);
+      ASSERT_EQ(0, ioctx.aio_operate("foo", completion1, &op1, &read_bl));
+      ioctx.snap_set_read(librados::SNAP_HEAD);
+
+      completion->wait_for_safe();
+      ASSERT_EQ(0, completion->get_return_value());
+      completion->release();
+
+      completion1->wait_for_safe();
+      ASSERT_EQ(0, completion1->get_return_value());
+      completion1->release();
+    }
+
+    // evict foo snap
+    ioctx.snap_set_read(my_snaps[0]);
+    {
+      ObjectReadOperation op;
+      op.cache_evict();
+      librados::AioCompletion *completion = cluster.aio_create_completion();
+      ASSERT_EQ(0, ioctx.aio_operate(
+        "foo", completion, &op,
+        librados::OPERATION_IGNORE_CACHE, NULL));
+      completion->wait_for_safe();
+      ASSERT_EQ(0, completion->get_return_value());
+      completion->release();
+    }
+    ioctx.snap_set_read(librados::SNAP_HEAD);
+  }
+
+  // cleanup
+  ioctx.selfmanaged_snap_remove(my_snaps[0]);
+  ioctx.selfmanaged_snap_remove(my_snaps[1]);
+}
+
 TEST_F(LibRadosTwoPoolsPP, TryFlush) {
   // configure cache
   bufferlist inbl;