From 47c60699b8d818ca72dbc59c70aa41282304adda Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 22 Sep 2016 15:42:06 -0400 Subject: [PATCH] os/bluestore/BlueFS: write into page-aligned buffers from rocksdb The BlueRocksEnv uses the append(const char *, size_t) appender. Ensure that this data is copied into a page-aligned buffer, and that we "pad" the write out with the remainder of the page. Signed-off-by: Sage Weil --- src/os/bluestore/BlueFS.cc | 28 +++++++++++++++++++++++++--- src/os/bluestore/BlueFS.h | 16 +++++++++++----- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 4dc02d662b861..c1d43f8d52eca 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -1398,6 +1398,8 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length) assert(!h->file->deleted); assert(h->file->num_readers.load() == 0); + h->buffer_appender.flush(); + bool buffered; if (h->file->fnode.ino == 1) buffered = false; @@ -1529,11 +1531,28 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length) t.substr_of(bl, bloff, x_len); unsigned tail = x_len & ~super.block_mask(); if (tail) { + size_t zlen = super.block_size - tail; dout(20) << __func__ << " caching tail of 0x" - << std::hex << tail << std::dec - << " and padding block with zeros" << dendl; + << std::hex << tail + << " and padding block with 0x" << zlen + << std::dec << dendl; h->tail_block.substr_of(bl, bl.length() - tail, tail); - t.append_zero(super.block_size - tail); + if (h->file->fnode.ino > 1) { + // we are using the page_aligned_appender, and can safely use + // the tail of the raw buffer. 
+ const bufferptr &last = t.back(); + if (last.unused_tail_length() != zlen) { + derr << " wtf, last is " << last << " from " << t << dendl; + } + assert(last.unused_tail_length() == zlen); + bufferptr z = last; + z.set_offset(last.offset() + last.length()); + z.set_length(zlen); + z.zero(); + t.append(z, 0, zlen); + } else { + t.append_zero(zlen); + } } bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered); bloff += x_len; @@ -1584,6 +1603,7 @@ void BlueFS::wait_for_aio(FileWriter *h) int BlueFS::_flush(FileWriter *h, bool force) { + h->buffer_appender.flush(); uint64_t length = h->buffer.length(); uint64_t offset = h->pos; if (!force && @@ -1617,6 +1637,8 @@ int BlueFS::_truncate(FileWriter *h, uint64_t offset) // we never truncate internal log files assert(h->file->fnode.ino > 1); + h->buffer_appender.flush(); + // truncate off unflushed data? if (h->pos < offset && h->pos + h->buffer.length() > offset) { diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index 92260c5059617..f786f54722ceb 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -114,6 +114,7 @@ public: uint64_t pos; ///< start offset for buffer bufferlist buffer; ///< new data to write (at end of file) bufferlist tail_block; ///< existing partial block at end of file, if any + bufferlist::page_aligned_appender buffer_appender; ///< for const char* only int writer_type = 0; ///< WRITER_* std::mutex lock; @@ -121,24 +122,29 @@ public: FileWriter(FileRef f) : file(f), - pos(0) { + pos(0), + buffer_appender(buffer.get_page_aligned_appender()) { ++file->num_writers; } // NOTE: caller must call BlueFS::close_writer() ~FileWriter() { --file->num_writers; } + + // note: BlueRocksEnv uses this append exclusively, so it's safe + // to use buffer_appender exclusively here (e.g., its notion of + // offset will remain accurate). 
void append(const char *buf, size_t len) { - buffer.append(buf, len); + buffer_appender.append(buf, len); } + + // note: used internally only, for ino 1 or 0. void append(bufferlist& bl) { buffer.claim_append(bl); } - void append(bufferptr& bp) { - buffer.append(bp); - } uint64_t get_effective_write_pos() { + buffer_appender.flush(); return pos + buffer.length(); } }; -- 2.39.5