]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore: implement deferred writes for big writes
authorIgor Fedotov <ifedotov@suse.com>
Thu, 20 Feb 2020 10:43:04 +0000 (13:43 +0300)
committerIgor Fedotov <ifedotov@suse.com>
Wed, 1 Apr 2020 09:55:33 +0000 (12:55 +0300)
Intended to ensure a spinner's read performance for 4K min alloc size is
on par with the 64K one. The degradation is caused by additional fragmentation
produced by partial overwrites when 4K min alloc size is set.

E.g. consider 4K partial overwrite in the middle of the previously written contiguous
64K blob.
For a 64K min alloc size the deferred write procedure comes in and preserves
data continuity.
When the min alloc size is set to 4K, the overwritten blob is broken into 3 parts
without the patch. This patch applies deferred write processing to
that case and hence avoids breaking blob continuity, albeit at the cost of
some write performance drop.

Signed-off-by: Igor Fedotov <ifedotov@suse.com>
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h
src/os/bluestore/bluestore_types.h
src/test/objectstore/store_test.cc

index 0937c6a4f7bea063948f6c93812fe550adb6d32f..03a7a963ac2b794b07a1f12824dc1cc7717b43a7 100644 (file)
@@ -4662,6 +4662,9 @@ void BlueStore::_init_logger()
                    "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
   b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
                    "Large aligned writes into fresh blobs (blobs)");
+  b.add_u64_counter(l_bluestore_write_big_deferred,
+                   "bluestore_write_big_deferred",
+                   "Big overwrites using deferred");
   b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
                    "Small writes into existing or sparse small blobs");
   b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
@@ -12950,11 +12953,10 @@ void BlueStore::_do_write_small(
       ++ep;
     }
   }
-  auto prev_ep = ep;
-  if (prev_ep != begin) {
+  auto prev_ep = end;
+  if (ep != begin) {
+    prev_ep = ep;
     --prev_ep;
-  } else {
-    prev_ep = end; // to avoid this extent check as it's a duplicate
   }
 
   boost::container::flat_set<const bluestore_blob_t*> inspected_blobs;
@@ -13107,7 +13109,7 @@ void BlueStore::_do_write_small(
            bl.claim_append(tail_bl);
            logger->inc(l_bluestore_write_penalty_read_ops);
          }
-         logger->inc(l_bluestore_write_small_pre_read);
+          logger->inc(l_bluestore_write_small_pre_read);
 
          _buffer_cache_write(txc, b, b_off, bl,
                              wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
@@ -13282,8 +13284,8 @@ void BlueStore::_do_write_big(
           << dendl;
   logger->inc(l_bluestore_write_big);
   logger->inc(l_bluestore_write_big_bytes, length);
-  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
   auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
+  uint64_t prefer_deferred_size_snapshot = prefer_deferred_size.load();
   while (length > 0) {
     bool new_blob = false;
     uint32_t l = std::min(max_bsize, length);
@@ -13292,16 +13294,109 @@ void BlueStore::_do_write_big(
 
     //attempting to reuse existing blob
     if (!wctx->compress) {
-      // look for an existing mutable blob we can reuse
-      auto begin = o->extent_map.extent_map.begin();
-      auto end = o->extent_map.extent_map.end();
+      // look for an existing mutable blob we can write into
       auto ep = o->extent_map.seek_lextent(offset);
-      auto prev_ep = ep;
-      if (prev_ep != begin) {
+      auto end = o->extent_map.extent_map.end();
+      // First try if we can apply deferred write
+      if (prefer_deferred_size_snapshot && ep != end &&
+            offset >= ep->blob_start() &&
+            ep->blob->get_blob().is_mutable()) {
+        auto b0 = ep->blob;
+        auto b_off = offset - ep->blob_start();
+        uint64_t chunk_size = b0->get_blob().get_chunk_size(block_size);
+        auto l_aligned = l;
+
+        // read some data to fill out the chunk?
+        uint64_t head_read = p2phase<uint64_t>(b_off, chunk_size);
+        uint64_t tail_read = p2nphase<uint64_t>(b_off + l, chunk_size);
+        if ((head_read || tail_read) &&
+          (b0->get_blob().get_ondisk_length() >=
+            b_off + l + tail_read)) {
+          b_off = head_read;
+          l_aligned += head_read + tail_read;
+        } else {
+          head_read = tail_read = 0;
+        }
+
+        if (l_aligned <= prefer_deferred_size_snapshot &&
+            b_off % chunk_size == 0 &&
+            l_aligned % chunk_size == 0 &&
+            b0->get_blob().is_allocated(b_off, l_aligned)) {
+          dout(20) << __func__ << " " << *b0
+            << " deferring big " << std::hex
+            << " (0x" << b_off << "~" << l_aligned << ")"
+            << std::dec << " write via deferred"
+            << dendl;
+
+          bluestore_deferred_op_t *op = _get_deferred_op(txc);
+          op->op = bluestore_deferred_op_t::OP_WRITE;
+          int r = b0->get_blob().map(
+            b_off, l_aligned,
+            [&](uint64_t offset, uint64_t length) {
+              op->extents.emplace_back(bluestore_pextent_t(offset, length));
+              return 0;
+            });
+          ceph_assert(r == 0);
+
+          dout(20) << __func__ << "  reading head 0x" << std::hex << head_read
+            << " and tail 0x" << tail_read << std::dec << dendl;
+          if (head_read) {
+            int r = _do_read(c.get(), o, offset - head_read, head_read,
+              op->data, 0);
+            ceph_assert(r >= 0 && r <= (int)head_read);
+            size_t zlen = head_read - r;
+            if (zlen) {
+              op->data.append_zero(zlen);
+              logger->inc(l_bluestore_write_pad_bytes, zlen);
+            }
+            logger->inc(l_bluestore_write_penalty_read_ops);
+          }
+          blp.copy(l, op->data);
+
+          if (tail_read) {
+            bufferlist tail_bl;
+            int r = _do_read(c.get(), o, offset + l, tail_read,
+              tail_bl, 0);
+            ceph_assert(r >= 0 && r <= (int)tail_read);
+            size_t zlen = tail_read - r;
+            if (zlen) {
+              tail_bl.append_zero(zlen);
+              logger->inc(l_bluestore_write_pad_bytes, zlen);
+            }
+            op->data.claim_append(tail_bl);
+            logger->inc(l_bluestore_write_penalty_read_ops);
+          }
+
+          _buffer_cache_write(txc, b0, b_off, op->data,
+            wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+
+          if (b0->get_blob().csum_type) {
+            b0->dirty_blob().calc_csum(b_off, op->data);
+          }
+          Extent *le = o->extent_map.set_lextent(c, offset,
+            offset - ep->blob_start(), l, b0, &wctx->old_extents);
+          txc->statfs_delta.stored() += le->length;
+
+          offset += l;
+          length -= l;
+          logger->inc(l_bluestore_write_big_blobs);
+          logger->inc(l_bluestore_write_big_deferred);
+
+          continue;
+        }
+      }
+      o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
+
+      // seek again as punch_hole could invalidate ep
+      ep = o->extent_map.seek_lextent(offset);
+      auto begin = o->extent_map.extent_map.begin();
+      auto prev_ep = end;
+      if (ep != begin) {
+        prev_ep = ep;
         --prev_ep;
-      } else {
-        prev_ep = end; // to avoid this extent check as it's a duplicate
       }
+      dout(20) << __func__ << " no deferred" << dendl;
+
       auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
       // search suitable extent in both forward and reverse direction in
       // [offset - target_max_blob_size, offset + target_max_blob_size] range
@@ -13310,12 +13405,16 @@ void BlueStore::_do_write_big(
       do {
        any_change = false;
        if (ep != end && ep->logical_offset < offset + max_bsize) {
-         if (offset >= ep->blob_start() &&
+          dout(20) << __func__ << " considering " << *ep << dendl;
+          dout(20) << __func__ << " considering " << *(ep->blob)
+            << " bstart 0x" << std::hex << ep->blob_start() << std::dec << dendl;
+
+          if (offset >= ep->blob_start() &&
               ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
                                       offset - ep->blob_start(),
                                       &l)) {
            b = ep->blob;
-           b_off = offset - ep->blob_start();
+            b_off = offset - ep->blob_start();
             prev_ep = end; // to avoid check below
            dout(20) << __func__ << " reuse blob " << *b << std::hex
                     << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
@@ -13326,7 +13425,10 @@ void BlueStore::_do_write_big(
        }
 
        if (prev_ep != end && prev_ep->logical_offset >= min_off) {
-         if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
+          dout(20) << __func__ << " considering rev " << *prev_ep << dendl;
+          dout(20) << __func__ << " considering reverse " << *(prev_ep->blob)
+            << " bstart 0x" << std::hex << prev_ep->blob_start() << std::dec << dendl;
+          if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
                                            offset - prev_ep->blob_start(),
                                            &l)) {
            b = prev_ep->blob;
@@ -13341,13 +13443,15 @@ void BlueStore::_do_write_big(
          }
        }
       } while (b == nullptr && any_change);
-    }
+    } else {
+      o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
+    } // if (!wctx->compress)
+
     if (b == nullptr) {
       b = c->new_blob();
       b_off = 0;
       new_blob = true;
     }
-
     bufferlist t;
     blp.copy(l, t);
     wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
@@ -13947,8 +14051,12 @@ int BlueStore::_do_write(
           << " 0x" << std::hex << offset << "~" << length
           << " - have 0x" << o->onode.size
           << " (" << std::dec << o->onode.size << ")"
-          << " bytes"
-          << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
+          << " bytes" << std::hex
+          << " fadvise_flags 0x" << fadvise_flags
+          << " alloc_hint 0x" << o->onode.alloc_hint_flags
+           << " expected_object_size " << o->onode.expected_object_size
+           << " expected_write_size " << o->onode.expected_write_size
+           << std::dec
           << dendl;
   _dump_onode<30>(cct, *o);
 
index b680774ba35c35328e02c554555e0945a1963e61..f869f527bb9beaa5ded19aafbd18b9367ea0bb9a 100644 (file)
@@ -118,6 +118,7 @@ enum {
   l_bluestore_write_big,
   l_bluestore_write_big_bytes,
   l_bluestore_write_big_blobs,
+  l_bluestore_write_big_deferred,
   l_bluestore_write_small,
   l_bluestore_write_small_bytes,
   l_bluestore_write_small_unused,
index d83b8bc94f1d564340f0f7243f08a12d42ae6b9e..32750ffb2d1a126171fed999416dda0d2249c15b 100644 (file)
@@ -597,21 +597,20 @@ public:
     ceph_assert(p != extents.end());
     while (b_off >= p->length) {
       b_off -= p->length;
-      ++p;
-      ceph_assert(p != extents.end());
+      if (++p == extents.end())
+        return false;
     }
     b_len += b_off;
     while (b_len) {
-      ceph_assert(p != extents.end());
       if (require_allocated != p->is_valid()) {
         return false;
       }
-
       if (p->length >= b_len) {
         return true;
       }
       b_len -= p->length;
-      ++p;
+      if (++p == extents.end())
+        return false;
     }
     ceph_abort_msg("we should not get here");
     return false;
index c3e5c021c241b180d0b7731b542cb7ccc09ea4e9..eceecb00d37cef9c41c0e12615fa6a402915326d 100644 (file)
@@ -6674,6 +6674,249 @@ TEST_P(StoreTestSpecificAUSize, BlobReuseOnOverwrite) {
   }
 }
 
+TEST_P(StoreTestSpecificAUSize, DeferredOnBigOverwrite) {
+
+  if (string(GetParam()) != "bluestore")
+    return;
+
+  size_t block_size = 4096;
+  StartDeferred(block_size);
+  SetVal(g_conf(), "bluestore_max_blob_size", "65536");
+  SetVal(g_conf(), "bluestore_prefer_deferred_size", "32768");
+
+  g_conf().apply_changes(nullptr);
+
+  int r;
+  coll_t cid;
+  ghobject_t hoid(hobject_t("test", "", CEPH_NOSNAP, 0, -1, ""));
+  ghobject_t hoid2(hobject_t("test2", "", CEPH_NOSNAP, 0, -1, ""));
+
+  const PerfCounters* logger = store->get_perf_counters();
+
+  auto ch = store->create_new_collection(cid);
+  {
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 0);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  {
+    ObjectStore::Transaction t;
+    bufferlist bl, bl2;
+
+    bl.append(std::string(block_size * 2, 'c'));
+    bl2.append(std::string(block_size * 3, 'd'));
+
+    t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+    t.set_alloc_hint(cid, hoid2, block_size * 4, block_size * 4,
+      CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ);
+    t.write(cid, hoid2, 0, bl2.length(), bl2, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  ASSERT_EQ(logger->get(l_bluestore_write_big), 2u);
+  ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 0u);
+
+  {
+    struct store_statfs_t statfs;
+    int r = store->statfs(&statfs);
+    ASSERT_EQ(r, 0);
+    ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 5);
+    ASSERT_LE(statfs.allocated, (unsigned)block_size * 5);
+  }
+
+  // overwrite at the beginning, 4K alignment
+  {
+    ObjectStore::Transaction t;
+    bufferlist bl;
+
+    bl.append(std::string(block_size, 'b'));
+    t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  ASSERT_EQ(logger->get(l_bluestore_write_big), 3u);
+  ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 1u);
+
+  {
+    bufferlist bl, expected;
+    r = store->read(ch, hoid, 0, block_size, bl);
+    ASSERT_EQ(r, (int)block_size);
+    expected.append(string(block_size, 'b'));
+    ASSERT_TRUE(bl_eq(expected, bl));
+  }
+  {
+    bufferlist bl, expected;
+    r = store->read(ch, hoid, block_size, block_size, bl);
+    ASSERT_EQ(r, (int)block_size);
+    expected.append(string(block_size, 'c'));
+    ASSERT_TRUE(bl_eq(expected, bl));
+  }
+
+  // overwrite at 4K, 12K alignment
+  {
+    ObjectStore::Transaction t;
+    bufferlist bl;
+
+    bl.append(std::string(block_size, 'e'));
+    t.write(cid, hoid2, block_size , bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  ASSERT_EQ(logger->get(l_bluestore_write_big), 4u);
+  ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 2u);
+
+  // makes sure deferred has been submitted
+  // and do all the checks again
+  sleep(g_conf().get_val<double>("bluestore_max_defer_interval") + 2);
+
+  ASSERT_EQ(logger->get(l_bluestore_write_big), 4u);
+  ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 2u);
+
+  {
+    bufferlist bl, expected;
+    r = store->read(ch, hoid, 0, block_size, bl);
+    ASSERT_EQ(r, (int)block_size);
+    expected.append(string(block_size, 'b'));
+    ASSERT_TRUE(bl_eq(expected, bl));
+  }
+  {
+    bufferlist bl, expected;
+    r = store->read(ch, hoid, block_size, block_size, bl);
+    ASSERT_EQ(r, (int)block_size);
+    expected.append(string(block_size, 'c'));
+    ASSERT_TRUE(bl_eq(expected, bl));
+  }
+  {
+    bufferlist bl, expected;
+    r = store->read(ch, hoid2, 0, block_size, bl);
+    ASSERT_EQ(r, (int)block_size);
+    expected.append(string(block_size, 'd'));
+    ASSERT_TRUE(bl_eq(expected, bl));
+  }
+  {
+    bufferlist bl, expected;
+    r = store->read(ch, hoid2, block_size, block_size, bl);
+    ASSERT_EQ(r, (int)block_size);
+    expected.append(string(block_size, 'e'));
+    ASSERT_TRUE(bl_eq(expected, bl));
+  }
+  {
+    bufferlist bl, expected;
+    r = store->read(ch, hoid2, block_size * 2, block_size, bl);
+    ASSERT_EQ(r, (int)block_size);
+    expected.append(string(block_size, 'd'));
+    ASSERT_TRUE(bl_eq(expected, bl));
+  }
+
+  {
+    struct store_statfs_t statfs;
+    int r = store->statfs(&statfs);
+    ASSERT_EQ(r, 0);
+    ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 5);
+    ASSERT_LE(statfs.allocated, (unsigned)block_size * 5);
+  }
+  ASSERT_EQ(logger->get(l_bluestore_blobs), 2u);
+  ASSERT_EQ(logger->get(l_bluestore_extents), 2u);
+
+  {
+    ObjectStore::Transaction t;
+    t.remove(cid, hoid);
+    t.remove(cid, hoid2);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+
+  {
+    ObjectStore::Transaction t;
+    bufferlist bl;
+    bl.append(std::string(block_size * 2, 'f'));
+
+    t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  ASSERT_EQ(logger->get(l_bluestore_write_big), 5u);
+  ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 2u);
+
+  {
+    ObjectStore::Transaction t;
+    t.zero(cid, hoid, 0, 100);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  {
+    bufferlist bl, expected;
+    r = store->read(ch, hoid, 0, 100, bl);
+    ASSERT_EQ(r, (int)100);
+    expected.append(string(100, 0));
+    ASSERT_TRUE(bl_eq(expected, bl));
+  }
+  {
+    bufferlist bl, expected;
+    r = store->read(ch, hoid, 100, block_size * 2 - 100, bl);
+    ASSERT_EQ(r, (int)block_size * 2 - 100);
+    expected.append(string(block_size * 2 - 100, 'f'));
+    ASSERT_TRUE(bl_eq(expected, bl));
+  }
+  sleep(2);
+  {
+    struct store_statfs_t statfs;
+    int r = store->statfs(&statfs);
+    ASSERT_EQ(r, 0);
+    ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 2 - 100);
+    ASSERT_LE(statfs.allocated, (unsigned)block_size * 2);
+  }
+  ASSERT_EQ(logger->get(l_bluestore_blobs), 1u);
+  ASSERT_EQ(logger->get(l_bluestore_extents), 1u);
+
+  {
+    ObjectStore::Transaction t;
+    bufferlist bl;
+    bl.append(std::string(block_size, 'g'));
+
+    t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  ASSERT_EQ(logger->get(l_bluestore_write_big), 6u);
+  ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 3u);
+  {
+    bufferlist bl, expected;
+    r = store->read(ch, hoid, 0, block_size, bl);
+    ASSERT_EQ(r, (int)block_size);
+    expected.append(string(block_size, 'g'));
+    ASSERT_TRUE(bl_eq(expected, bl));
+  }
+  {
+    bufferlist bl, expected;
+    r = store->read(ch, hoid, block_size, block_size, bl);
+    ASSERT_EQ(r, (int)block_size);
+    expected.append(string(block_size, 'f'));
+    ASSERT_TRUE(bl_eq(expected, bl));
+  }
+
+  {
+    struct store_statfs_t statfs;
+    int r = store->statfs(&statfs);
+    ASSERT_EQ(r, 0);
+    ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 2);
+    ASSERT_LE(statfs.allocated, (unsigned)block_size * 2);
+  }
+  ASSERT_EQ(logger->get(l_bluestore_blobs), 1u);
+  ASSERT_EQ(logger->get(l_bluestore_extents), 1u);
+
+  {
+    ObjectStore::Transaction t;
+    t.remove(cid, hoid);
+    t.remove(cid, hoid2);
+    t.remove_collection(cid);
+    cerr << "Cleaning" << std::endl;
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+}
+
 TEST_P(StoreTestSpecificAUSize, BlobReuseOnOverwriteReverse) {
 
   if (string(GetParam()) != "bluestore")