git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore/BlueFS: write into page-aligned buffers from rocksdb
author: Sage Weil <sage@redhat.com>
Thu, 22 Sep 2016 19:42:06 +0000 (15:42 -0400)
committer: Sage Weil <sage@redhat.com>
Sun, 16 Oct 2016 14:32:51 +0000 (10:32 -0400)
The BlueRocksEnv uses the append(const char *, size_t) appender.  Ensure
that this data is copied into a page-aligned buffer, and that we "pad" the
write out with the remainder of the page.

Signed-off-by: Sage Weil <sage@redhat.com>
src/os/bluestore/BlueFS.cc
src/os/bluestore/BlueFS.h

index 4dc02d662b86182cd8b4aa5cf3aef8f65f3852ba..c1d43f8d52eca27161549f488a0dc229b68ba3ad 100644 (file)
@@ -1398,6 +1398,8 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
   assert(!h->file->deleted);
   assert(h->file->num_readers.load() == 0);
 
+  h->buffer_appender.flush();
+
   bool buffered;
   if (h->file->fnode.ino == 1)
     buffered = false;
@@ -1529,11 +1531,28 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
     t.substr_of(bl, bloff, x_len);
     unsigned tail = x_len & ~super.block_mask();
     if (tail) {
+      size_t zlen = super.block_size - tail;
       dout(20) << __func__ << " caching tail of 0x"
-               << std::hex << tail << std::dec
-              << " and padding block with zeros" << dendl;
+               << std::hex << tail
+              << " and padding block with 0x" << zlen
+              << std::dec << dendl;
       h->tail_block.substr_of(bl, bl.length() - tail, tail);
-      t.append_zero(super.block_size - tail);
+      if (h->file->fnode.ino > 1) {
+       // we are using the page_aligned_appender, and can safely use
+       // the tail of the raw buffer.
+       const bufferptr &last = t.back();
+       if (last.unused_tail_length() != zlen) {
+         derr << " wtf, last is " << last << " from " << t << dendl;
+       }
+       assert(last.unused_tail_length() == zlen);
+       bufferptr z = last;
+       z.set_offset(last.offset() + last.length());
+       z.set_length(zlen);
+       z.zero();
+       t.append(z, 0, zlen);
+      } else {
+       t.append_zero(zlen);
+      }
     }
     bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered);
     bloff += x_len;
@@ -1584,6 +1603,7 @@ void BlueFS::wait_for_aio(FileWriter *h)
 
 int BlueFS::_flush(FileWriter *h, bool force)
 {
+  h->buffer_appender.flush();
   uint64_t length = h->buffer.length();
   uint64_t offset = h->pos;
   if (!force &&
@@ -1617,6 +1637,8 @@ int BlueFS::_truncate(FileWriter *h, uint64_t offset)
   // we never truncate internal log files
   assert(h->file->fnode.ino > 1);
 
+  h->buffer_appender.flush();
+
   // truncate off unflushed data?
   if (h->pos < offset &&
       h->pos + h->buffer.length() > offset) {
index 92260c50596172ed88028b16fbf84261102bd8ef..f786f54722cebfbfecb6f7cbf0d19905c3f91963 100644 (file)
@@ -114,6 +114,7 @@ public:
     uint64_t pos;           ///< start offset for buffer
     bufferlist buffer;      ///< new data to write (at end of file)
     bufferlist tail_block;  ///< existing partial block at end of file, if any
+    bufferlist::page_aligned_appender buffer_appender;  //< for const char* only
     int writer_type = 0;    ///< WRITER_*
 
     std::mutex lock;
@@ -121,24 +122,29 @@ public:
 
     FileWriter(FileRef f)
       : file(f),
-       pos(0) {
+       pos(0),
+       buffer_appender(buffer.get_page_aligned_appender()) {
       ++file->num_writers;
     }
     // NOTE: caller must call BlueFS::close_writer()
     ~FileWriter() {
       --file->num_writers;
     }
+
+    // note: BlueRocksEnv uses this append exclusively, so it's safe
+    // to use buffer_appender exclusively here (e.g., its notion of
+    // offset will remain accurate).
     void append(const char *buf, size_t len) {
-      buffer.append(buf, len);
+      buffer_appender.append(buf, len);
     }
+
+    // note: used internally only, for ino 1 or 0.
     void append(bufferlist& bl) {
       buffer.claim_append(bl);
     }
-    void append(bufferptr& bp) {
-      buffer.append(bp);
-    }
 
     uint64_t get_effective_write_pos() {
+      buffer_appender.flush();
       return pos + buffer.length();
     }
   };