From 43d60b5d4bc4d9840b7ddd0ea68482e300a7e30f Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Thu, 4 May 2017 08:09:25 -0500
Subject: [PATCH] os/bluestore/BlueFS: add bluefs_sync_write option

If we have a fast device we can do our writes using synchronous IO instead
of aio. Most of the time rocksdb is doing sync writes anyway (write and
then fsync from the same thread). Note that this might not be the case
when using the bluestore_sync_submit_transaction mode... that probably
should not be combined with bluefs_sync_write.

Signed-off-by: Sage Weil
---
 src/common/config_opts.h   |  1 +
 src/os/bluestore/BlueFS.cc | 40 +++++++++++++++++++++++---------------
 src/os/bluestore/BlueFS.h  |  1 +
 3 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 18dd5e5d2957..a69f36390b20 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -1031,6 +1031,7 @@ OPTION(bluefs_log_compact_min_size, OPT_U64, 16*1048576) // before we consider
 OPTION(bluefs_min_flush_size, OPT_U64, 524288) // ignore flush until its this big
 OPTION(bluefs_compact_log_sync, OPT_BOOL, false) // sync or async log compaction?
 OPTION(bluefs_buffered_io, OPT_BOOL, false)
+OPTION(bluefs_sync_write, OPT_BOOL, false)
 OPTION(bluefs_allocator, OPT_STR, "bitmap") // stupid | bitmap
 OPTION(bluefs_preextend_wal_files, OPT_BOOL, false) // this *requires* that rocksdb has recycling enabled
 
diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
index d084b8bcd1e3..a31e1de44622 100644
--- a/src/os/bluestore/BlueFS.cc
+++ b/src/os/bluestore/BlueFS.cc
@@ -1397,14 +1397,7 @@ int BlueFS::_flush_and_sync_log(std::unique_lock& l,
     log_writer->file->fnode.size = jump_to;
   }
 
-  // drop lock while we wait for io
-  list completed_ios;
-  _claim_completed_aios(log_writer, &completed_ios);
-  l.unlock();
-  wait_for_aio(log_writer);
-  completed_ios.clear();
-  flush_bdev();
-  l.lock();
+  _flush_bdev_safely(log_writer);
 
   log_flushing = false;
   log_cond.notify_all();
@@ -1620,7 +1613,11 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
         t.append_zero(zlen);
       }
     }
-    bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered);
+    if (cct->_conf->bluefs_sync_write) {
+      bdev[p->bdev]->write(p->offset + x_off, t, buffered);
+    } else {
+      bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered);
+    }
     bloff += x_len;
     length -= x_len;
     ++p;
@@ -1740,13 +1737,7 @@ int BlueFS::_fsync(FileWriter *h, std::unique_lock& l)
     return r;
   uint64_t old_dirty_seq = h->file->dirty_seq;
 
-  list completed_ios;
-  _claim_completed_aios(h, &completed_ios);
-  lock.unlock();
-  wait_for_aio(h);
-  completed_ios.clear();
-  flush_bdev();
-  lock.lock();
+  _flush_bdev_safely(h);
 
   if (old_dirty_seq) {
     uint64_t s = log_seq;
@@ -1759,6 +1750,23 @@ int BlueFS::_fsync(FileWriter *h, std::unique_lock& l)
   return 0;
 }
 
+void BlueFS::_flush_bdev_safely(FileWriter *h)
+{
+  if (!cct->_conf->bluefs_sync_write) {
+    list completed_ios;
+    _claim_completed_aios(h, &completed_ios);
+    lock.unlock();
+    wait_for_aio(h);
+    completed_ios.clear();
+    flush_bdev();
+    lock.lock();
+  } else {
+    lock.unlock();
+    flush_bdev();
+    lock.lock();
+  }
+}
+
 void BlueFS::flush_bdev()
 {
   // NOTE: this is safe to call without a lock.
diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h
index be845d8500eb..7229355a10df 100644
--- a/src/os/bluestore/BlueFS.h
+++ b/src/os/bluestore/BlueFS.h
@@ -287,6 +287,7 @@ private:
 
   //void _aio_finish(void *priv);
 
+  void _flush_bdev_safely(FileWriter *h);
   void flush_bdev(); // this is safe to call without a lock
 
   int _preallocate(FileRef f, uint64_t off, uint64_t len);
-- 
2.47.3