From c08720553f6ae787fe3b0edbdd1497859cdfe0d4 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 9 Mar 2017 16:51:21 -0500 Subject: [PATCH] os/bluestore/BlueFS: fix flush_bdev placement We need to flush any new writes on any fsync(). Notably, this includes the rocksdb log. However, previously _fsync was only doing a bdev flush if we also had a dirty bluefs journal and called into _sync_and_flush_journal. If we didn't, we weren't doing a flush() at all, which could lead to corrupted data. Fix this by moving the first flush_bdev *out* of _sync_and_flush_log. (The second one is there to flush the bluefs journal; the first one was to ensure prior writes are stable.) Instead, flush prior writes in all of the callers prior to calling _sync_and_flush_log. This includes _fsync (and fixes the bug by covering the non-journal-flush path) as well as several other callers. Fixes: http://tracker.ceph.com/issues/19250 Signed-off-by: Sage Weil (cherry picked from commit 2924a96493d8570317e55854a25fc64911ecf151) --- src/os/bluestore/BlueFS.cc | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index a49c9a7e8e22f..07e87c4637439 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -195,6 +195,7 @@ int BlueFS::reclaim_blocks(unsigned id, uint64_t want, log_t.op_alloc_rm(id, p.offset, p.length); } + flush_bdev(); r = _flush_and_sync_log(l); assert(r == 0); @@ -1159,6 +1160,9 @@ void BlueFS::_compact_log_async(std::unique_lock& l) // write the new entries log_t.op_file_update(log_file->fnode); log_t.op_jump(log_seq, old_log_jump_to); + + flush_bdev(); // FIXME? + _flush_and_sync_log(l, 0, old_log_jump_to); // 2. prepare compacted log @@ -1280,6 +1284,7 @@ void BlueFS::_pad_bl(bufferlist& bl) void BlueFS::flush_log() { std::unique_lock l(lock); + flush_bdev(); _flush_and_sync_log(l); } @@ -1350,7 +1355,6 @@ int BlueFS::_flush_and_sync_log(std::unique_lock& l, log_t.seq = 0; // just so debug output is less confusing log_flushing = true; - flush_bdev(); int r = _flush(log_writer, true); assert(r == 0); @@ -1702,12 +1706,15 @@ int BlueFS::_fsync(FileWriter *h, std::unique_lock& l) if (r < 0) return r; uint64_t old_dirty_seq = h->file->dirty_seq; + list completed_ios; _claim_completed_aios(h, &completed_ios); lock.unlock(); wait_for_aio(h); completed_ios.clear(); + flush_bdev(); lock.lock(); + if (old_dirty_seq) { uint64_t s = log_seq; dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq @@ -1825,6 +1832,7 @@ void BlueFS::sync_metadata() utime_t start = ceph_clock_now(); vector> to_release(pending_release.size()); to_release.swap(pending_release); + flush_bdev(); // FIXME? _flush_and_sync_log(l); for (unsigned i = 0; i < to_release.size(); ++i) { for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) { -- 2.39.5