From 47cd1afa7ba1a7f303b1e949d569c52d6b1a6995 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Thu, 7 Mar 2019 02:01:18 +0300 Subject: [PATCH] osd/bluestore: Actually wait until completion in write_sync This function is only used by RocksDB WAL writing, so it must sync data. This fixes #18338 and thus makes it possible to actually set `bluefs_preextend_wal_files` to true, gaining +100% single-thread write iops in disk-bound (HDD or bad SSD) setups. To my knowledge it doesn't hurt performance in other cases. Test it yourself on any HDD with `fio -ioengine=rbd -direct=1 -bs=4k -iodepth=1`. Issue #18338 is easily reproduced without this patch by issuing a `kill -9` to the OSD while doing `fio -ioengine=rbd -direct=1 -bs=4M -iodepth=16`. Fixes: https://tracker.ceph.com/issues/18338 https://tracker.ceph.com/issues/38559 Signed-off-by: Vitaliy Filippov (cherry picked from commit c703cf9a7632cbd9f17e148ef203509549a28571) Conflicts: src/os/bluestore/KernelDevice.cc - mimic has a single variable "fd_buffered" where master has an array "fd_buffereds" --- src/os/bluestore/KernelDevice.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/os/bluestore/KernelDevice.cc b/src/os/bluestore/KernelDevice.cc index caae073731d6f..b8d1ebf195145 100644 --- a/src/os/bluestore/KernelDevice.cc +++ b/src/os/bluestore/KernelDevice.cc @@ -692,8 +692,8 @@ int KernelDevice::_sync_write(uint64_t off, bufferlist &bl, bool buffered) return r; } if (buffered) { - // initiate IO (but do not wait) - r = ::sync_file_range(fd_buffered, off, len, SYNC_FILE_RANGE_WRITE); + // initiate IO and wait till it completes + r = ::sync_file_range(fd_buffered, off, len, SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER|SYNC_FILE_RANGE_WAIT_BEFORE); if (r < 0) { r = -errno; derr << __func__ << " sync_file_range error: " << cpp_strerror(r) << dendl; -- 2.39.5