git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore/BlueFS: add bluefs_sync_write option 14510/head
author Sage Weil <sage@redhat.com>
Thu, 4 May 2017 13:09:25 +0000 (08:09 -0500)
committer Sage Weil <sage@redhat.com>
Thu, 4 May 2017 13:09:25 +0000 (08:09 -0500)
If we have a fast device we can do our writes using synchronous
IO instead of aio.  Most of the time rocksdb is doing sync writes
anyway (write and then fsync from the same thread).

Note that this might not be the case when using the
bluestore_sync_submit_transaction mode... that probably should
not be combined with bluefs_sync_write.

Signed-off-by: Sage Weil <sage@redhat.com>
src/common/config_opts.h
src/os/bluestore/BlueFS.cc
src/os/bluestore/BlueFS.h

index 18dd5e5d2957e5a55073807bf1b2389a7c2d7894..a69f36390b207c9d2667528d377795f2192e9caa 100644 (file)
@@ -1031,6 +1031,7 @@ OPTION(bluefs_log_compact_min_size, OPT_U64, 16*1048576)  // before we consider
 OPTION(bluefs_min_flush_size, OPT_U64, 524288)  // ignore flush until its this big
 OPTION(bluefs_compact_log_sync, OPT_BOOL, false)  // sync or async log compaction?
 OPTION(bluefs_buffered_io, OPT_BOOL, false)
+OPTION(bluefs_sync_write, OPT_BOOL, false)
 OPTION(bluefs_allocator, OPT_STR, "bitmap")     // stupid | bitmap
 OPTION(bluefs_preextend_wal_files, OPT_BOOL, false)  // this *requires* that rocksdb has recycling enabled
 
index d084b8bcd1e327adf48571d56836b07448386640..a31e1de446225b495389b27d9c0604e7f0c6be90 100644 (file)
@@ -1397,14 +1397,7 @@ int BlueFS::_flush_and_sync_log(std::unique_lock<std::mutex>& l,
     log_writer->file->fnode.size = jump_to;
   }
 
-  // drop lock while we wait for io
-  list<aio_t> completed_ios;
-  _claim_completed_aios(log_writer, &completed_ios);
-  l.unlock();
-  wait_for_aio(log_writer);
-  completed_ios.clear();
-  flush_bdev();
-  l.lock();
+  _flush_bdev_safely(log_writer);
 
   log_flushing = false;
   log_cond.notify_all();
@@ -1620,7 +1613,11 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
        t.append_zero(zlen);
       }
     }
-    bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered);
+    if (cct->_conf->bluefs_sync_write) {
+      bdev[p->bdev]->write(p->offset + x_off, t, buffered);
+    } else {
+      bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered);
+    }
     bloff += x_len;
     length -= x_len;
     ++p;
@@ -1740,13 +1737,7 @@ int BlueFS::_fsync(FileWriter *h, std::unique_lock<std::mutex>& l)
      return r;
   uint64_t old_dirty_seq = h->file->dirty_seq;
 
-  list<aio_t> completed_ios;
-  _claim_completed_aios(h, &completed_ios);
-  lock.unlock();
-  wait_for_aio(h);
-  completed_ios.clear();
-  flush_bdev();
-  lock.lock();
+  _flush_bdev_safely(h);
 
   if (old_dirty_seq) {
     uint64_t s = log_seq;
@@ -1759,6 +1750,23 @@ int BlueFS::_fsync(FileWriter *h, std::unique_lock<std::mutex>& l)
   return 0;
 }
 
+void BlueFS::_flush_bdev_safely(FileWriter *h)
+{
+  // Entered with `lock` held: drop it while waiting on h's aio (skipped when
+  // bluefs_sync_write is set) and calling flush_bdev(), then re-take it.
+  const bool use_aio = !cct->_conf->bluefs_sync_write;
+  list<aio_t> completed_ios;
+  if (use_aio)
+    _claim_completed_aios(h, &completed_ios);  // claimed before unlocking
+  lock.unlock();
+  if (use_aio) {
+    wait_for_aio(h);
+    completed_ios.clear();
+  }
+  flush_bdev();
+  lock.lock();
+}
+
 void BlueFS::flush_bdev()
 {
   // NOTE: this is safe to call without a lock.
index be845d8500ebd81afaae5524e6fb9c4222ba6b86..7229355a10dfe561d06d5985996974700a4ab186 100644 (file)
@@ -287,6 +287,7 @@ private:
 
   //void _aio_finish(void *priv);
 
+  void _flush_bdev_safely(FileWriter *h);
   void flush_bdev();  // this is safe to call without a lock
 
   int _preallocate(FileRef f, uint64_t off, uint64_t len);