git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore: refactor small write handling to reuse blob more effectively. 14399/head
author	Igor Fedotov <ifedotov@mirantis.com>
	Fri, 7 Apr 2017 18:32:22 +0000 (18:32 +0000)
committer	Igor Fedotov <ifedotov@mirantis.com>
	Fri, 7 Apr 2017 18:32:22 +0000 (18:32 +0000)
Signed-off-by: Igor Fedotov <ifedotov@mirantis.com>
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h

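At its core, the refactor replaces the old forward-only walk over the extent map with a bidirectional scan around the write offset: the previous code (see the removed comment in the second hunk) could never reuse a blob whose extents all lay before the write, while the new do/while loop alternates a forward candidate (ep) and a backward candidate (prev_ep) inside a [offset - max_bsize, offset + max_bsize) window. Below is a minimal sketch of that scan pattern, with a plain std::map standing in for BlueStore's ExtentMap and a predicate standing in for the real mutability/alignment/reuse checks (illustrative names only, not the actual BlueStore API):

#include <cstdint>
#include <iterator>
#include <map>
#include <string>

using ExtentMap = std::map<uint64_t, std::string>;  // logical offset -> blob id

// Alternate forward and backward candidates from `offset` within +/- `window`,
// returning the first entry that satisfies `usable`, or end() if none does.
template <typename Pred>
ExtentMap::iterator find_reusable(ExtentMap& m, uint64_t offset,
                                  uint64_t window, Pred usable)
{
  auto begin = m.begin(), end = m.end();
  auto ep = m.lower_bound(offset);                      // ~ seek_lextent(offset)
  auto prev_ep = (ep != begin) ? std::prev(ep) : end;   // end == nothing behind
  uint64_t min_off = offset >= window ? offset - window : 0;

  bool any_change;
  do {
    any_change = false;
    if (ep != end && ep->first < offset + window) {     // forward candidate
      if (usable(*ep))
        return ep;
      ++ep;
      any_change = true;
    }
    if (prev_ep != end && prev_ep->first >= min_off) {  // backward candidate
      if (usable(*prev_ep))
        return prev_ep;
      if (prev_ep != begin) {
        --prev_ep;
        any_change = true;
      } else {
        prev_ep = end;                                  // backward scan done
      }
    }
  } while (any_change);
  return end;  // caller falls through to allocating a new blob
}

int main()
{
  ExtentMap m{{0x0000, "b1"}, {0x2000, "b2"}, {0x5000, "b3"}};
  auto it = find_reusable(m, 0x3000, 0x10000,
                          [](const ExtentMap::value_type& e) {
                            return e.second == "b1";    // pretend only b1 fits
                          });
  // b1 lies entirely before the write offset, so only the backward pass
  // (the new behavior in this commit) can find it.
  return it != m.end() ? 0 : 1;
}
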
index 2dad3552069aa5a258b1f2429f8867079e584fcd..c59964c21596e05ccda6e49cff441fce7277923a 100644 (file)
@@ -8703,7 +8703,7 @@ void BlueStore::_do_write_small(
   dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
   assert(length < min_alloc_size);
-  uint64_t end = offset + length;
+  uint64_t end_offs = offset + length;
 
   logger->inc(l_bluestore_write_small);
   logger->inc(l_bluestore_write_small_bytes, length);
@@ -8712,232 +8712,291 @@ void BlueStore::_do_write_small(
   blp.copy(length, bl);
 
   // Look for an existing mutable blob we can use.
-  // NB: Current approach prevents us from reusing blobs that might be extended
-  // but have all the extents prior to the offset. Don't care for now...
+  auto begin = o->extent_map.extent_map.begin();
+  auto end = o->extent_map.extent_map.end();
   auto ep = o->extent_map.seek_lextent(offset);
-  if (ep != o->extent_map.extent_map.begin()) {
+  if (ep != begin) {
     --ep;
     if (ep->blob_end() <= offset) {
       ++ep;
     }
   }
-  BlobRef b;
+  auto prev_ep = ep;
+  if (prev_ep != begin) {
+    --prev_ep;
+  } else {
+    prev_ep = end; // prev_ep would duplicate ep; skip the backward check
+  }
+
   auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
-  while (ep != o->extent_map.extent_map.end()) {
-    if (ep->blob_start() >= end) {
-      break;
-    }
-    b = ep->blob;
-    if (!b->get_blob().is_mutable()) {
-      dout(20) << __func__ << " ignoring immutable " << *b << dendl;
-      ++ep;
-      continue;
-    }
-    if (ep->logical_offset % min_alloc_size !=
-       ep->blob_offset % min_alloc_size) {
-      dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
-      ++ep;
-      continue;
-    }
-    uint64_t bstart = ep->blob_start();
-    dout(20) << __func__ << " considering " << *b
-            << " bstart 0x" << std::hex << bstart << std::dec << dendl;
-
-    // can we pad our head/tail out with zeros?
-    uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
-    uint64_t head_pad = P2PHASE(offset, chunk_size);
-    uint64_t tail_pad = P2NPHASE(end, chunk_size);
-    if (head_pad || tail_pad) {
-      o->extent_map.fault_range(db, offset - head_pad,
-                                length + head_pad + tail_pad);
-    }
-    if (head_pad &&
-       o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
-      head_pad = 0;
-    }
-
-    if (tail_pad && o->extent_map.has_any_lextents(end, tail_pad)) {
-      tail_pad = 0;
-    }
-
-    bufferlist padded = bl;
-    if (head_pad) {
-      bufferlist z;
-      z.append_zero(head_pad);
-      z.claim_append(padded);
-      padded.claim(z);
-    }
-    if (tail_pad) {
-      padded.append_zero(tail_pad);
-    }
-    if (head_pad || tail_pad) {
-      dout(20) << __func__ << "  can pad head 0x" << std::hex << head_pad
-              << " tail 0x" << tail_pad << std::dec << dendl;
-      logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
-    }
-
-    // direct write into unused blocks of an existing mutable blob?
-    uint64_t b_off = offset - head_pad - bstart;
-    uint64_t b_len = length + head_pad + tail_pad;
-    if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
-       b->get_blob().get_ondisk_length() >= b_off + b_len &&
-       b->get_blob().is_unused(b_off, b_len) &&
-       b->get_blob().is_allocated(b_off, b_len)) {
-      dout(20) << __func__ << "  write to unused 0x" << std::hex
-              << b_off << "~" << b_len
-              << " pad 0x" << head_pad << " + 0x" << tail_pad
-              << std::dec << " of mutable " << *b << dendl;
-      _buffer_cache_write(txc, b, b_off, padded,
-                         wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
-
-      if (!g_conf->bluestore_debug_omit_block_device_write) {
-        if (b_len <= prefer_deferred_size) {
-         dout(20) << __func__ << " deferring small 0x" << std::hex
-                  << b_len << std::dec << " unused write via deferred" << dendl;
+  auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
+  uint32_t alloc_len = min_alloc_size;
+  auto offset0 = P2ALIGN(offset, alloc_len);
+
+  bool any_change;
+
+  // Search for a suitable extent in both the forward and reverse directions
+  // over the [offset - target_max_blob_size, offset + target_max_blob_size]
+  // range, then check whether the blob can be reused via try_reuse_blob()
+  // or written directly/deferred (the latter is attempted only for extents
+  // that include or lie above 'offset').
+  do {
+    any_change = false;
+
+    if (ep != end && ep->logical_offset < offset + max_bsize) {
+      BlobRef b = ep->blob;
+      auto bstart = ep->blob_start();
+      dout(20) << __func__ << " considering " << *b
+              << " bstart 0x" << std::hex << bstart << std::dec << dendl;
+      if (bstart >= end_offs) {
+       dout(20) << __func__ << " ignoring distant " << *b << dendl;
+      } else if (!b->get_blob().is_mutable()) {
+       dout(20) << __func__ << " ignoring immutable " << *b << dendl;
+      } else if (ep->logical_offset % min_alloc_size !=
+                 ep->blob_offset % min_alloc_size) {
+       dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
+      } else {
+       uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
+       // can we pad our head/tail out with zeros?
+       uint64_t head_pad, tail_pad;
+       head_pad = P2PHASE(offset, chunk_size);
+       tail_pad = P2NPHASE(end_offs, chunk_size);
+       if (head_pad || tail_pad) {
+         o->extent_map.fault_range(db, offset - head_pad,
+                                   end_offs - offset + head_pad + tail_pad);
+       }
+       if (head_pad &&
+           o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
+         head_pad = 0;
+       }
+       if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
+         tail_pad = 0;
+       }
+
+       uint64_t b_off = offset - head_pad - bstart;
+       uint64_t b_len = length + head_pad + tail_pad;
+
+       // direct write into unused blocks of an existing mutable blob?
+       if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
+           b->get_blob().get_ondisk_length() >= b_off + b_len &&
+           b->get_blob().is_unused(b_off, b_len) &&
+           b->get_blob().is_allocated(b_off, b_len)) {
+         bufferlist padded;
+         _apply_padding(head_pad, tail_pad, bl, padded);
+
+         dout(20) << __func__ << "  write to unused 0x" << std::hex
+                  << b_off << "~" << b_len
+                  << " pad 0x" << head_pad << " + 0x" << tail_pad
+                  << std::dec << " of mutable " << *b << dendl;
+         _buffer_cache_write(txc, b, b_off, padded,
+                             wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+
+         if (!g_conf->bluestore_debug_omit_block_device_write) {
+           if (b_len <= prefer_deferred_size) {
+             dout(20) << __func__ << " deferring small 0x" << std::hex
+                      << b_len << std::dec << " unused write via deferred" << dendl;
+             bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
+             op->op = bluestore_deferred_op_t::OP_WRITE;
+             b->get_blob().map(
+               b_off, b_len,
+               [&](uint64_t offset, uint64_t length) {
+                 op->extents.emplace_back(bluestore_pextent_t(offset, length));
+                 return 0;
+               });
+             op->data = padded;
+           } else {
+             b->get_blob().map_bl(
+               b_off, padded,
+               [&](uint64_t offset, bufferlist& t) {
+                 bdev->aio_write(offset, t,
+                                 &txc->ioc, wctx->buffered);
+               });
+           }
+         }
+         b->dirty_blob().calc_csum(b_off, padded);
+         dout(20) << __func__ << "  lex old " << *ep << dendl;
+         Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
+                                                b,
+                                                &wctx->old_extents);
+         b->dirty_blob().mark_used(le->blob_offset, le->length);
+         txc->statfs_delta.stored() += le->length;
+         dout(20) << __func__ << "  lex " << *le << dendl;
+         logger->inc(l_bluestore_write_small_unused);
+         return;
+       }
+       // read some data to fill out the chunk?
+       uint64_t head_read = P2PHASE(b_off, chunk_size);
+       uint64_t tail_read = P2NPHASE(b_off + b_len, chunk_size);
+       if ((head_read || tail_read) &&
+           (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
+           head_read + tail_read < min_alloc_size) {
+         b_off -= head_read;
+         b_len += head_read + tail_read;
+
+       } else {
+         head_read = tail_read = 0;
+       }
+
+       // chunk-aligned deferred overwrite?
+       if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
+           b_off % chunk_size == 0 &&
+           b_len % chunk_size == 0 &&
+           b->get_blob().is_allocated(b_off, b_len)) {
+
+         bufferlist padded;
+         _apply_padding(head_pad, tail_pad, bl, padded);
+
+         dout(20) << __func__ << "  reading head 0x" << std::hex << head_read
+                  << " and tail 0x" << tail_read << std::dec << dendl;
+         if (head_read) {
+           bufferlist head_bl;
+           int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
+                            head_bl, 0);
+           assert(r >= 0 && r <= (int)head_read);
+           size_t zlen = head_read - r;
+           if (zlen) {
+             head_bl.append_zero(zlen);
+             logger->inc(l_bluestore_write_pad_bytes, zlen);
+           }
+           head_bl.claim_append(padded);
+           padded.swap(head_bl);
+           logger->inc(l_bluestore_write_penalty_read_ops);
+         }
+         if (tail_read) {
+           bufferlist tail_bl;
+           int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
+                            tail_bl, 0);
+           assert(r >= 0 && r <= (int)tail_read);
+           size_t zlen = tail_read - r;
+           if (zlen) {
+             tail_bl.append_zero(zlen);
+             logger->inc(l_bluestore_write_pad_bytes, zlen);
+           }
+           padded.claim_append(tail_bl);
+           logger->inc(l_bluestore_write_penalty_read_ops);
+         }
+         logger->inc(l_bluestore_write_small_pre_read);
+
          bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
          op->op = bluestore_deferred_op_t::OP_WRITE;
-         b->get_blob().map(
+         _buffer_cache_write(txc, b, b_off, padded,
+                             wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+
+         int r = b->get_blob().map(
            b_off, b_len,
            [&](uint64_t offset, uint64_t length) {
              op->extents.emplace_back(bluestore_pextent_t(offset, length));
              return 0;
            });
-         op->data = padded;
-       } else {
-         b->get_blob().map_bl(
-           b_off, padded,
-           [&](uint64_t offset, bufferlist& t) {
-             bdev->aio_write(offset, t,
-                             &txc->ioc, wctx->buffered);
-           });
+         assert(r == 0);
+         if (b->get_blob().csum_type) {
+           b->dirty_blob().calc_csum(b_off, padded);
+         }
+         op->data.claim(padded);
+         dout(20) << __func__ << "  deferred write 0x" << std::hex << b_off << "~"
+                  << b_len << std::dec << " of mutable " << *b
+                  << " at " << op->extents << dendl;
+         Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
+                                                b, &wctx->old_extents);
+         b->dirty_blob().mark_used(le->blob_offset, le->length);
+         txc->statfs_delta.stored() += le->length;
+         dout(20) << __func__ << "  lex " << *le << dendl;
+         logger->inc(l_bluestore_write_small_deferred);
+         return;
        }
-      }
-      b->dirty_blob().calc_csum(b_off, padded);
-      dout(20) << __func__ << "  lex old " << *ep << dendl;
-      Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
-                                            b,
-                                            &wctx->old_extents);
-      b->dirty_blob().mark_used(le->blob_offset, le->length);
-      txc->statfs_delta.stored() += le->length;
-      dout(20) << __func__ << "  lex " << *le << dendl;
-      logger->inc(l_bluestore_write_small_unused);
-      return;
-    }
+       // try to reuse the blob
+       if (b->try_reuse_blob(min_alloc_size,
+                             max_bsize,
+                             offset0 - bstart,
+                             &alloc_len)) {
+         assert(alloc_len == min_alloc_size); // expecting data to always
+                                              // fit into the reused blob
+         // Need to check for pending writes that want to reuse the same
+         // pextent. The rationale is that during GC two chunks from garbage
+         // blobs (compressed?) can share logical space within the same AU.
+         // That in turn might be caused by an unaligned len in clone_range2.
+         // Hence the second write would fail when attempting to reuse the
+         // blob at do_alloc_write().
+         if (!wctx->has_conflict(b,
+                                 offset0,
+                                 offset0 + alloc_len, 
+                                 min_alloc_size)) {
+
+           // we can't reuse head_pad/tail_pad since they might be truncated
+           // due to existing extents
+           uint64_t b_off = offset - bstart;
+           uint64_t b_off0 = b_off;
+           _pad_zeros(&bl, &b_off0, chunk_size);
 
-    // read some data to fill out the chunk?
-    uint64_t head_read = P2PHASE(b_off, chunk_size);
-    uint64_t tail_read = P2NPHASE(b_off + b_len, chunk_size);
-    if ((head_read || tail_read) &&
-       (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
-       head_read + tail_read < min_alloc_size) {
-      dout(20) << __func__ << "  reading head 0x" << std::hex << head_read
-              << " and tail 0x" << tail_read << std::dec << dendl;
-      if (head_read) {
-       bufferlist head_bl;
-       int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
-                        head_bl, 0);
-       assert(r >= 0 && r <= (int)head_read);
-       size_t zlen = head_read - r;
-       if (zlen) {
-         head_bl.append_zero(zlen);
-         logger->inc(l_bluestore_write_pad_bytes, zlen);
-       }
-       b_off -= head_read;
-       b_len += head_read;
-       head_bl.claim_append(padded);
-       padded.swap(head_bl);
-       logger->inc(l_bluestore_write_penalty_read_ops);
-      }
-      if (tail_read) {
-       bufferlist tail_bl;
-       int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
-                        tail_bl, 0);
-       assert(r >= 0 && r <= (int)tail_read);
-       b_len += tail_read;
-       padded.claim_append(tail_bl);
-       size_t zlen = tail_read - r;
-       if (zlen) {
-         padded.append_zero(zlen);
-         logger->inc(l_bluestore_write_pad_bytes, zlen);
+           dout(20) << __func__ << " reuse blob " << *b << std::hex
+                    << " (" << b_off0 << "~" << bl.length() << ")"
+                    << " (" << b_off << "~" << length << ")"
+                    << std::dec << dendl;
+
+           o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
+           wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
+                       false, false);
+           logger->inc(l_bluestore_write_small_unused);
+           return;
+         }
        }
-       logger->inc(l_bluestore_write_penalty_read_ops);
       }
-      logger->inc(l_bluestore_write_small_pre_read);
-    }
-
-    // chunk-aligned deferred overwrite?
-    if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
-       b_off % chunk_size == 0 &&
-       b_len % chunk_size == 0 &&
-       b->get_blob().is_allocated(b_off, b_len)) {
-      bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
-      op->op = bluestore_deferred_op_t::OP_WRITE;
-      _buffer_cache_write(txc, b, b_off, padded,
-                         wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
-
-      int r = b->get_blob().map(
-       b_off, b_len,
-       [&](uint64_t offset, uint64_t length) {
-         op->extents.emplace_back(bluestore_pextent_t(offset, length));
-          return 0;
-       });
-      assert(r == 0);
-      if (b->get_blob().csum_type) {
-       b->dirty_blob().calc_csum(b_off, padded);
-      }
-      op->data.claim(padded);
-      dout(20) << __func__ << "  deferred write 0x" << std::hex << b_off << "~"
-              << b_len << std::dec << " of mutable " << *b
-              << " at " << op->extents << dendl;
-      Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
-                                            b, &wctx->old_extents);
-      b->dirty_blob().mark_used(le->blob_offset, le->length);
-      txc->statfs_delta.stored() += le->length;
-      dout(20) << __func__ << "  lex " << *le << dendl;
-      logger->inc(l_bluestore_write_small_deferred);
-      return;
-    }
-    uint32_t alloc_len = min_alloc_size;
-    auto offset0 = P2ALIGN(offset, alloc_len);
-    if (!head_read && !tail_read &&
-      b->try_reuse_blob(min_alloc_size,
-                        max_bsize,
-                        offset0 - bstart,
-                        &alloc_len)) {
-      assert(alloc_len == min_alloc_size); // expecting data always
-                                           // fit into reused blob
-      // Need to check for pending writes desiring to
-      // reuse the same pextent. The rationale is that during GC two chunks
-      // from garbage blobs(compressed?) can share logical space within the same
-      // AU. That's in turn might be caused by unaligned len in clone_range2.
-      // Hence the second write will fail in an attempt to reuse blob at
-      // do_alloc_write().
-      if (!wctx->has_conflict(b,
-                             offset0,
-                             offset0 + alloc_len, 
-                             min_alloc_size)) {
-        uint64_t b_off = offset - bstart;
-        uint64_t b_off0 = b_off - head_pad;
-        dout(20) << __func__ << " reuse blob " << *b << std::hex
-                 << " (" << b_off0 << "~" << padded.length() << ")"
-                 << " (" << b_off << "~" << length << ")"
-                 << std::dec << dendl;
-
-        o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
-       wctx->write(offset, b, alloc_len, b_off0, padded, b_off, length,
-                   false, false);
-        logger->inc(l_bluestore_write_small_unused);
-        return;
+      ++ep;
+      any_change = true;
+    } // if (ep != end && ep->logical_offset < offset + max_bsize)
+
+    // check extent for reuse in reverse order
+    if (prev_ep != end && prev_ep->logical_offset >= min_off) {
+      BlobRef b = prev_ep->blob;
+      auto bstart = prev_ep->blob_start();
+      dout(20) << __func__ << " considering " << *b
+              << " bstart 0x" << std::hex << bstart << std::dec << dendl;
+      if (b->try_reuse_blob(min_alloc_size,
+                           max_bsize,
+                            offset0 - bstart,
+                            &alloc_len)) {
+       assert(alloc_len == min_alloc_size); // expecting data to always
+                                            // fit into the reused blob
+       // Need to check for pending writes that want to reuse the same
+       // pextent. The rationale is that during GC two chunks from garbage
+       // blobs (compressed?) can share logical space within the same AU.
+       // That in turn might be caused by an unaligned len in clone_range2.
+       // Hence the second write would fail when attempting to reuse the
+       // blob at do_alloc_write().
+       if (!wctx->has_conflict(b,
+                               offset0,
+                               offset0 + alloc_len, 
+                               min_alloc_size)) {
+
+         uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
+         uint64_t b_off = offset - bstart;
+         uint64_t b_off0 = b_off;
+         _pad_zeros(&bl, &b_off0, chunk_size);
+
+         dout(20) << __func__ << " reuse blob " << *b << std::hex
+                   << " (" << b_off0 << "~" << bl.length() << ")"
+                   << " (" << b_off << "~" << length << ")"
+                   << std::dec << dendl;
+
+         o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
+         wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
+                     false, false);
+         logger->inc(l_bluestore_write_small_unused);
+         return;
+       }
+      } 
+      if (prev_ep != begin) {
+       --prev_ep;
+       any_change = true;
+      } else {
+       prev_ep = end; // to avoid useless first extent re-check
       }
-    }
-
-
-    ++ep;
-  }
+    } // if (prev_ep != end && prev_ep->logical_offset >= min_off) 
+  } while (any_change);
 
   // new blob.
-  b = c->new_blob();
-  unsigned alloc_len = min_alloc_size;
+  
+  BlobRef b = c->new_blob();
   uint64_t b_off = P2PHASE(offset, alloc_len);
   uint64_t b_off0 = b_off;
   _pad_zeros(&bl, &b_off0, block_size);
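The head_pad/tail_pad/offset0 arithmetic in the hunk above is built on Ceph's power-of-two helpers P2PHASE, P2NPHASE, and P2ALIGN (carried in include/intarith.h upstream). Here is a minimal re-derivation with worked values, assuming as BlueStore does that chunk_size and min_alloc_size are powers of two; the constexpr functions are illustrative re-typings, not the upstream macros themselves:

#include <cassert>
#include <cstdint>

// 'a' must be a power of two in all three helpers.
constexpr uint64_t p2align(uint64_t x, uint64_t a)  { return x & ~(a - 1); }    // round x down to a
constexpr uint64_t p2phase(uint64_t x, uint64_t a)  { return x & (a - 1); }     // distance past the previous boundary
constexpr uint64_t p2nphase(uint64_t x, uint64_t a) { return (-x) & (a - 1); }  // distance to the next boundary

int main()
{
  // A hypothetical 0x700-byte small write at offset 0x1234 into a blob with
  // 0x1000-byte chunks, mirroring the head_pad/tail_pad/offset0 math above:
  uint64_t offset = 0x1234, length = 0x700, chunk_size = 0x1000;
  uint64_t end_offs = offset + length;                  // 0x1934
  assert(p2phase(offset, chunk_size) == 0x234);         // head_pad
  assert(p2nphase(end_offs, chunk_size) == 0x6cc);      // tail_pad
  assert(p2align(offset, chunk_size) == 0x1000);        // offset0 (alloc_len == chunk here)
  return 0;
}
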
@@ -10602,4 +10661,27 @@ void BlueStore::flush_cache()
   }
   coll_map.clear();
 }
+
+void BlueStore::_apply_padding(uint64_t head_pad,
+                              uint64_t tail_pad,
+                              bufferlist& bl,
+                              bufferlist& padded)
+{
+  padded = bl;
+  if (head_pad) {
+    bufferlist z;
+    z.append_zero(head_pad);
+    z.claim_append(padded);
+    padded.claim(z);
+  }
+  if (tail_pad) {
+    padded.append_zero(tail_pad);
+  }
+  if (head_pad || tail_pad) {
+    dout(20) << __func__ << "  can pad head 0x" << std::hex << head_pad
+             << " tail 0x" << tail_pad << std::dec << dendl;
+    logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
+  }
+}
+
 // ===========================================
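The new _apply_padding() helper above factors out the head/tail zero-fill that the pre-refactor code repeated inline at each call site. Its effect is simply to surround the payload with head_pad leading and tail_pad trailing zero bytes; the real helper assembles bufferlist fragments (claim_append/claim) rather than copying. A rough, copying equivalent over a plain byte vector (illustrative only):

#include <cstdint>
#include <vector>

using byte_vec = std::vector<unsigned char>;

// Copying stand-in for BlueStore::_apply_padding(): zero-fill around the
// payload so the result spans whole chunks.
byte_vec apply_padding(uint64_t head_pad, uint64_t tail_pad, const byte_vec& bl)
{
  byte_vec padded;
  padded.reserve(head_pad + bl.size() + tail_pad);
  padded.insert(padded.end(), head_pad, 0);            // zeros before the data
  padded.insert(padded.end(), bl.begin(), bl.end());   // the payload itself
  padded.insert(padded.end(), tail_pad, 0);            // zeros after the data
  return padded;
}

When head_pad and tail_pad come from P2PHASE(offset, chunk_size) and P2NPHASE(end_offs, chunk_size) respectively, the padded buffer starts and ends on chunk boundaries, which is exactly what the direct-write and deferred-overwrite branches above require.
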
index 8a8c8a8493bae8e7901223ba4e5f31b16f2bfafd..42cca7710b1c40c68c718e57b0d50a2f5ed758f1 100644 (file)
@@ -1975,6 +1975,11 @@ private:
     return val1;
   }
 
+  void _apply_padding(uint64_t head_pad,
+                     uint64_t tail_pad,
+                     bufferlist& bl,
+                     bufferlist& padded);
+
   // -- ondisk version ---
 public:
   const int32_t latest_ondisk_format = 2;        ///< our version
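
Taken together, the refactored _do_write_small() resolves each small write to one of four outcomes, tried in roughly this order per candidate blob. A compact summary (an illustrative enum, not actual BlueStore code):

// Illustrative summary of the four exits from the refactored
// _do_write_small(); not actual BlueStore code.
enum class SmallWriteOutcome {
  WriteToUnused,      // chunk-aligned write into unused, allocated space of a
                      // mutable blob (direct or deferred); counts
                      // l_bluestore_write_small_unused
  DeferredOverwrite,  // chunk-aligned overwrite, padded out with head/tail
                      // reads if needed; counts l_bluestore_write_small_deferred
  ReuseBlob,          // try_reuse_blob() succeeds and wctx->has_conflict()
                      // finds no pending write on the same AU; queued via
                      // wctx->write()
  NewBlob             // nothing suitable within +/- max_bsize: allocate a
                      // fresh blob and fall through
};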