git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore/BlueFS: clear newly allocated space for WAL logs 30549/head
author Adam Kupczyk <akupczyk@redhat.com>
Mon, 23 Sep 2019 11:52:45 +0000 (13:52 +0200)
committer Adam Kupczyk <akupczyk@redhat.com>
Wed, 9 Oct 2019 14:34:39 +0000 (16:34 +0200)
Signed-off-by: Adam Kupczyk <akupczyk@redhat.com>
src/common/options.cc
src/os/bluestore/BlueFS.cc

index b0158b0c09ebd55023d207e19d67bef94faab09e..fe5dac0a692efc66916d9f7e893df7a79e3ec1bf 100644 (file)
@@ -4343,7 +4343,7 @@ std::vector<Option> get_global_options() {
 
     Option("bluefs_preextend_wal_files", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
-    .set_description("Preextent rocksdb wal files on mkfs to avoid performance penalty for young stores"),
+    .set_description("Preextent rocksdb wal files on mkfs to avoid performance penalty"),
 
     Option("bluestore_bluefs", Option::TYPE_BOOL, Option::LEVEL_DEV)
     .set_default(true)
index ddff4e2c92f637f0ef8ad856128cd179b712f11a..ee27028a418c2e48b9deb532b6830376914cc3fd 100644 (file)
@@ -2300,6 +2300,7 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
   // do not bother to dirty the file if we are overwriting
   // previously allocated extents.
   bool must_dirty = false;
+  uint64_t clear_upto = 0;
   if (allocated < offset + length) {
     // we should never run out of log space here; see the min runway check
     // in _flush_and_sync_log.
@@ -2321,6 +2322,7 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
       // records.  otherwise, we will fail to reply the rocksdb log
       // properly due to garbage on the device.
       h->file->fnode.size = h->file->fnode.get_allocated();
+      clear_upto = h->file->fnode.size;
       dout(10) << __func__ << " extending WAL size to 0x" << std::hex
               << h->file->fnode.size << std::dec << " to include allocated"
               << dendl;
@@ -2385,7 +2387,8 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
       }
     }
   }
-  if (length == partial + h->buffer.length()) {
+  if (length == partial + h->buffer.length() || clear_upto != 0) {
+    /* in case of inital allocation and need to zero, limited flush is unacceptable */
     bl.claim_append_piecewise(h->buffer);
   } else {
     bufferlist t;
@@ -2398,6 +2401,31 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
   }
   ceph_assert(bl.length() == length);
 
+  h->pos = offset + length;
+
+  unsigned tail = bl.length() & ~super.block_mask();
+  if (tail) {
+    dout(20) << __func__ << " caching tail of 0x"
+             << std::hex << tail
+             << " and padding block with 0x" << (super.block_size - tail)
+             << std::dec << dendl;
+    h->tail_block.substr_of(bl, bl.length() - tail, tail);
+    bl.append_zero(super.block_size - tail);
+    length += super.block_size - tail;
+  } else {
+    h->tail_block.clear();
+  }
+  if (clear_upto != 0) {
+    if (offset + length < clear_upto) {
+      dout(20) << __func__ << " zeroing WAL log up to 0x"
+               << std::hex << clear_upto
+               << std::dec << dendl;
+      bl.append_zero(clear_upto - (offset + length));
+      length += clear_upto - (offset + length);
+    } 
+  } 
+  ceph_assert(bl.length() == length);
+
   switch (h->writer_type) {
   case WRITER_WAL:
     logger->inc(l_bluefs_bytes_written_wal, length);
@@ -2411,40 +2439,12 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
   bl.hexdump(*_dout);
   *_dout << dendl;
 
-  h->pos = offset + length;
-  h->tail_block.clear();
-
   uint64_t bloff = 0;
   uint64_t bytes_written_slow = 0;
   while (length > 0) {
     uint64_t x_len = std::min(p->length - x_off, length);
     bufferlist t;
     t.substr_of(bl, bloff, x_len);
-    unsigned tail = x_len & ~super.block_mask();
-    if (tail) {
-      size_t zlen = super.block_size - tail;
-      dout(20) << __func__ << " caching tail of 0x"
-               << std::hex << tail
-              << " and padding block with 0x" << zlen
-              << std::dec << dendl;
-      h->tail_block.substr_of(bl, bl.length() - tail, tail);
-      if (h->file->fnode.ino > 1) {
-       // we are using the page_aligned_appender, and can safely use
-       // the tail of the raw buffer.
-       const bufferptr &last = t.back();
-       if (last.unused_tail_length() < zlen) {
-         derr << " wtf, last is " << last << " from " << t << dendl;
-         ceph_assert(last.unused_tail_length() >= zlen);
-       }
-       bufferptr z = last;
-       z.set_offset(last.offset() + last.length());
-       z.set_length(zlen);
-       z.zero();
-       t.append(z, 0, zlen);
-      } else {
-       t.append_zero(zlen);
-      }
-    }
     if (cct->_conf->bluefs_sync_write) {
       bdev[p->bdev]->write(p->offset + x_off, t, buffered, h->write_hint);
     } else {