]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore/BlueFS: fix flush_bdev placement
authorSage Weil <sage@redhat.com>
Thu, 9 Mar 2017 21:51:21 +0000 (16:51 -0500)
committerNathan Cutler <ncutler@suse.com>
Tue, 4 Jul 2017 14:08:57 +0000 (16:08 +0200)
We need to flush any new writes on any fsync().  Notably, this includes
the rocksdb log. However, previously _fsync was only doing a bdev flush if
we also had a dirty bluefs journal and called into _sync_and_flush_journal.
If we didn't, we weren't doing a flush() at all, which could lead to
corrupted data.

Fix this by moving the first flush_bdev *out* of _sync_and_flush_log.  (The
second one is there to flush the bluefs journal; the first one was to
ensure prior writes are stable.)  Instead, flush prior writes in all of the
callers prior to calling _sync_and_flush_log.  This includes _fsync (and
fixes the bug by covering the non-journal-flush path) as well as several
other callers.

Fixes: http://tracker.ceph.com/issues/19250
Signed-off-by: Sage Weil <sage@redhat.com>
(cherry picked from commit 2924a96493d8570317e55854a25fc64911ecf151)

src/os/bluestore/BlueFS.cc

index a49c9a7e8e22f6b3317c1d4b0efaf4bd642b4478..07e87c46374399d974c9ff996cc7d5d7c524bbe9 100644 (file)
@@ -195,6 +195,7 @@ int BlueFS::reclaim_blocks(unsigned id, uint64_t want,
     log_t.op_alloc_rm(id, p.offset, p.length);
   }
 
+  flush_bdev();
   r = _flush_and_sync_log(l);
   assert(r == 0);
 
@@ -1159,6 +1160,9 @@ void BlueFS::_compact_log_async(std::unique_lock<std::mutex>& l)
   // write the new entries
   log_t.op_file_update(log_file->fnode);
   log_t.op_jump(log_seq, old_log_jump_to);
+
+  flush_bdev();  // FIXME?
+
   _flush_and_sync_log(l, 0, old_log_jump_to);
 
   // 2. prepare compacted log
@@ -1280,6 +1284,7 @@ void BlueFS::_pad_bl(bufferlist& bl)
 void BlueFS::flush_log()
 {
   std::unique_lock<std::mutex> l(lock);
+  flush_bdev();
   _flush_and_sync_log(l);
 }
 
@@ -1350,7 +1355,6 @@ int BlueFS::_flush_and_sync_log(std::unique_lock<std::mutex>& l,
   log_t.seq = 0;  // just so debug output is less confusing
   log_flushing = true;
 
-  flush_bdev();
   int r = _flush(log_writer, true);
   assert(r == 0);
 
@@ -1702,12 +1706,15 @@ int BlueFS::_fsync(FileWriter *h, std::unique_lock<std::mutex>& l)
   if (r < 0)
      return r;
   uint64_t old_dirty_seq = h->file->dirty_seq;
+
   list<FS::aio_t> completed_ios;
   _claim_completed_aios(h, &completed_ios);
   lock.unlock();
   wait_for_aio(h);
   completed_ios.clear();
+  flush_bdev();
   lock.lock();
+
   if (old_dirty_seq) {
     uint64_t s = log_seq;
     dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq
@@ -1825,6 +1832,7 @@ void BlueFS::sync_metadata()
   utime_t start = ceph_clock_now();
   vector<interval_set<uint64_t>> to_release(pending_release.size());
   to_release.swap(pending_release);
+  flush_bdev(); // FIXME?
   _flush_and_sync_log(l);
   for (unsigned i = 0; i < to_release.size(); ++i) {
     for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) {