From: Sage Weil Date: Mon, 21 Dec 2015 20:07:44 +0000 (-0500) Subject: os/bluestore/BlueFS: ignore flush when buffer is small X-Git-Tag: v10.0.3~154^2~107 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=1605e04d6504f2d30be83f7cfdd9141b819fb35d;p=ceph.git os/bluestore/BlueFS: ignore flush when buffer is small Rocksdb does a flush after every append, each of which is often less than a full block. This is very inefficient when our _flush() will send that to disk (and block). Avoid this most of the time by ignoring small flush requests entirely, unless the force flag is set (e.g., by fsync). Signed-off-by: Sage Weil --- diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 1596d3406751..652c31bdd818 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -842,6 +842,7 @@ OPTION(bluefs_min_log_runway, OPT_U64, 1048576) // alloc when we get this low OPTION(bluefs_max_log_runway, OPT_U64, 4194304) // alloc this much at a time OPTION(bluefs_log_compact_min_ratio, OPT_FLOAT, 5.0) // before we consider OPTION(bluefs_log_compact_min_size, OPT_U64, 16*1048576) // before we consider +OPTION(bluefs_min_flush_size, OPT_U64, 65536) // ignore flush until its this big OPTION(bluestore_bluefs, OPT_BOOL, true) OPTION(bluestore_bluefs_env_mirror, OPT_BOOL, false) // mirror to normal Env for debug diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 97db917e713b..23f7cee7c3a9 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -754,7 +754,8 @@ void BlueFS::_compact_log() log_file->fnode.size = bl.length(); log_writer = new FileWriter(log_file, bdev.size()); log_writer->append(bl); - _flush(log_writer); + int r = _flush(log_writer, true); + assert(r == 0); dout(10) << __func__ << " writing super" << dendl; super.log_fnode = log_file->fnode; @@ -811,7 +812,7 @@ int BlueFS::_flush_log() log_t.seq = 0; // just so debug output is less confusing _flush_bdev(); - int r = _flush(log_writer); + int r = _flush(log_writer, true); assert(r == 0); _flush_bdev(); @@ -930,10 +931,17 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length) return 0; } -int BlueFS::_flush(FileWriter *h) +int BlueFS::_flush(FileWriter *h, bool force) { uint64_t length = h->buffer.length(); uint64_t offset = h->pos; + if (!force && + length < g_conf->bluefs_min_flush_size) { + dout(10) << __func__ << " " << h << " ignoring, length " << length + << " < min_flush_size " << g_conf->bluefs_min_flush_size + << dendl; + return 0; + } if (length == 0) { if (h->file->dirty) { dout(10) << __func__ << " " << h << " no data, flushing metadata on " @@ -965,7 +973,7 @@ int BlueFS::_truncate(FileWriter *h, uint64_t offset) assert(0 == "actually this shouldn't happen"); } if (h->buffer.length()) { - int r = _flush(h); + int r = _flush(h, true); if (r < 0) return r; } @@ -984,7 +992,7 @@ int BlueFS::_truncate(FileWriter *h, uint64_t offset) void BlueFS::_fsync(FileWriter *h) { dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl; - _flush(h); + _flush(h, true); if (h->file->dirty) { _flush_log(); assert(!h->file->dirty); diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index 7e62d7434854..0c9bb6dfc04a 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -191,7 +191,7 @@ private: int _allocate(unsigned bdev, uint64_t len, vector *ev); int _flush_range(FileWriter *h, uint64_t offset, uint64_t length); - int _flush(FileWriter *h); + int _flush(FileWriter *h, bool force); void _fsync(FileWriter *h); int _flush_log(); @@ -290,7 +290,7 @@ public: void flush(FileWriter *h) { Mutex::Locker l(lock); - _flush(h); + _flush(h, false); } void flush_range(FileWriter *h, uint64_t offset, uint64_t length) { Mutex::Locker l(lock);