]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore: fix improper blob usage while handling deferred big write. 34754/head
authorIgor Fedotov <ifedotov@suse.com>
Fri, 24 Apr 2020 22:03:25 +0000 (01:03 +0300)
committerIgor Fedotov <ifedotov@suse.com>
Fri, 24 Apr 2020 22:03:25 +0000 (01:03 +0300)
The root cause is tail extent iterator invalidation when rewriting head
one.

Fixes: https://tracker.ceph.com/issues/45195
Signed-off-by: Igor Fedotov <ifedotov@suse.com>
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h
src/test/objectstore/store_test.cc

index de18468d474526ca3855ecabb398ba3aa57b42fc..639f115d7a55e2257fc2dc430fd77f421e0a822d 100644 (file)
@@ -13336,14 +13336,17 @@ bool BlueStore::BigDeferredWriteContext::can_defer(
     res = blob_aligned_len() <= prefer_deferred_size &&
       blob_aligned_len() <= ondisk &&
       blob.is_allocated(b_off, blob_aligned_len());
+    if (res) {
+      blob_ref = ep->blob;
+      blob_start = ep->blob_start();
+    }
   }
   return res;
 }
 
-bool BlueStore::BigDeferredWriteContext::apply_defer(
-    BlueStore::extent_map_t::iterator ep)
+bool BlueStore::BigDeferredWriteContext::apply_defer()
 {
-  int r = ep->blob->get_blob().map(
+  int r = blob_ref->get_blob().map(
     b_off, blob_aligned_len(),
     [&](const bluestore_pextent_t& pext,
       uint64_t offset,
@@ -13364,7 +13367,6 @@ void BlueStore::_do_write_big_apply_deferred(
     TransContext* txc,
     CollectionRef& c,
     OnodeRef o,
-    BlueStore::extent_map_t::iterator ep,
     BlueStore::BigDeferredWriteContext& dctx,
     bufferlist::iterator& blp,
     WriteContext* wctx)
@@ -13402,14 +13404,14 @@ void BlueStore::_do_write_big_apply_deferred(
     bl.claim_append(tail_bl);
     logger->inc(l_bluestore_write_penalty_read_ops);
   }
-  auto b0 = ep->blob;
+  auto& b0 = dctx.blob_ref;
   _buffer_cache_write(txc, b0, dctx.b_off, bl,
     wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
 
   b0->dirty_blob().calc_csum(dctx.b_off, bl);
 
   Extent* le = o->extent_map.set_lextent(c, dctx.off,
-    dctx.off - ep->blob_start(), dctx.used, b0, &wctx->old_extents);
+    dctx.off - dctx.blob_start, dctx.used, b0, &wctx->old_extents);
 
   // in fact this is a no-op for big writes but left here to maintain
   // uniformity and avoid missing after some refactor.
@@ -13475,7 +13477,6 @@ void BlueStore::_do_write_big(
           false;
         auto offset_next = offset + head_info.used;
         auto remaining = l - head_info.used;
-
         if (will_defer && remaining) {
           will_defer = false;
           if (remaining <= prefer_deferred_size_snapshot) {
@@ -13488,31 +13489,30 @@ void BlueStore::_do_write_big(
                 block_size,
                 offset_next,
                 remaining);
-
             will_defer = will_defer && remaining == tail_info.used;
           }
         }
         if (will_defer) {
-          dout(20) << __func__ << " " << *(ep->blob)
+          dout(20) << __func__ << " " << *(head_info.blob_ref)
             << " deferring big " << std::hex
             << " (0x" << head_info.b_off << "~" << head_info.blob_aligned_len() << ")"
             << std::dec << " write via deferred"
             << dendl;
           if (remaining) {
-            dout(20) << __func__ << " " << *(ep_next->blob)
+            dout(20) << __func__ << " " << *(tail_info.blob_ref)
               << " deferring big " << std::hex
               << " (0x" << tail_info.b_off << "~" << tail_info.blob_aligned_len() << ")"
               << std::dec << " write via deferred"
               << dendl;
           }
 
-          will_defer = head_info.apply_defer(ep);
+          will_defer = head_info.apply_defer();
           if (!will_defer) {
             dout(20) << __func__
               << " deferring big fell back, head isn't continuous"
               << dendl;
           } else if (remaining) {
-            will_defer = tail_info.apply_defer(ep_next);
+            will_defer = tail_info.apply_defer();
             if (!will_defer) {
               dout(20) << __func__
                 << " deferring big fell back, tail isn't continuous"
@@ -13521,9 +13521,9 @@ void BlueStore::_do_write_big(
           }
         }
         if (will_defer) {
-          _do_write_big_apply_deferred(txc, c, o, ep, head_info, blp, wctx);
+          _do_write_big_apply_deferred(txc, c, o, head_info, blp, wctx);
           if (remaining) {
-            _do_write_big_apply_deferred(txc, c, o, ep_next, tail_info,
+            _do_write_big_apply_deferred(txc, c, o, tail_info,
               blp, wctx);
           }
           offset += l;
index c9e831c7a4b173540a0985089cafa117610e5842..cedbe53688d61cbe6ccb639c723be4c77ed73dca 100644 (file)
@@ -1945,6 +1945,8 @@ public:
     uint32_t used = 0;
     uint64_t head_read = 0;
     uint64_t tail_read = 0;
+    BlobRef blob_ref;
+    uint64_t blob_start = 0;
     PExtentVector res_extents;
 
     inline uint64_t blob_aligned_len() const {
@@ -1956,7 +1958,7 @@ public:
       uint64_t block_size,
       uint64_t offset,
       uint64_t l);
-    bool apply_defer(BlueStore::extent_map_t::iterator ep);
+    bool apply_defer();
   };
 
   // --------------------------------------------------------
@@ -3090,7 +3092,6 @@ private:
     TransContext* txc,
     CollectionRef& c,
     OnodeRef o,
-    BlueStore::extent_map_t::iterator ep,
     BigDeferredWriteContext& dctx,
     bufferlist::iterator& blp,
     WriteContext* wctx);
index 9226ce302e5765fb798e6b97098fc9305e95e1bc..3e643d58680770b6ee0ab1ce28acf7641fff070c 100644 (file)
@@ -8815,6 +8815,90 @@ TEST_P(StoreTestSpecificAUSize, SpilloverFixed2Test) {
   );
 }
 
+TEST_P(StoreTestSpecificAUSize, Ticket45195Repro) {
+  if (string(GetParam()) != "bluestore")
+    return;
+
+  SetVal(g_conf(), "bluestore_default_buffered_write", "true");
+  SetVal(g_conf(), "bluestore_max_blob_size", "65536");
+  SetVal(g_conf(), "bluestore_debug_enforce_settings", "hdd");
+  SetVal(g_conf(), "bluestore_fsck_on_mount", "false");
+  g_conf().apply_changes(nullptr);
+
+  StartDeferred(0x1000);
+
+  int r;
+  coll_t cid;
+  ghobject_t hoid(hobject_t(sobject_t("Object", CEPH_NOSNAP)));
+  auto ch = store->create_new_collection(cid);
+  {
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 0);
+    cerr << "Creating collection " << cid << std::endl;
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  {
+    size_t large_object_size = 1 * 1024 * 1024;
+    size_t expected_write_size = 0x8000;
+    ObjectStore::Transaction t;
+    t.touch(cid, hoid);
+    t.set_alloc_hint(cid, hoid, large_object_size, expected_write_size,
+      CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ |
+      CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  {
+    ObjectStore::Transaction t;
+    bufferlist bl, orig;
+    string s(0xc000, '0');
+    bl.append(s);
+    t.write(cid, hoid, 0xb000, bl.length(), bl);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  {
+    ObjectStore::Transaction t;
+    bufferlist bl, orig;
+    string s(0x10000, '1');
+    bl.append(s);
+    t.write(cid, hoid, 0x16000, bl.length(), bl);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  {
+    ObjectStore::Transaction t;
+    bufferlist bl, orig;
+    string s(0x4000, '1');
+    bl.append(s);
+    t.write(cid, hoid, 0x1b000, bl.length(), bl);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  bufferlist bl;
+  r = store->read(ch, hoid, 0xb000, 0xb000, bl);
+  ASSERT_EQ(r, 0xb000);
+
+  store->umount();
+  store->mount();
+
+  ch = store->open_collection(cid);
+  {
+    ObjectStore::Transaction t;
+    bufferlist bl, orig;
+    string s(0xf000, '3');
+    bl.append(s);
+    t.write(cid, hoid, 0xf000, bl.length(), bl);
+    cerr << "write4" << std::endl;
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+
+  r = store->read(ch, hoid, 0xb000, 0x10000, bl);
+  ASSERT_EQ(r, 0x10000);
+}
+
 #endif  // WITH_BLUESTORE
 
 int main(int argc, char **argv) {