]> git-server-git.apps.pok.os.sepia.ceph.com Git - rocksdb.git/commitdiff
Fix WAL log data corruption #8723 (#8746)
authoreharry <eharry@foxmail.com>
Tue, 14 Sep 2021 03:15:00 +0000 (20:15 -0700)
committerFacebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
Tue, 14 Sep 2021 03:15:59 +0000 (20:15 -0700)
Summary:
Fix WAL log data corruption when using DBOptions.manual_wal_flush(true) and WriteOptions.sync(true) together (https://github.com/facebook/rocksdb/issues/8723)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8746

Reviewed By: ajkr

Differential Revision: D30758468

Pulled By: riversand963

fbshipit-source-id: 07c20899d5f2447dc77861b4845efc68a59aa4e8

HISTORY.md
db/db_impl/db_impl_write.cc
db/db_test.cc

index 8a44d216a704a408e854d1fa9230c64676a1a448..680c050884cb91cdbaa534ca3731c3a25943be70 100644 (file)
@@ -9,6 +9,7 @@
 * Fix a race in BackupEngine if RateLimiter is reconfigured during concurrent Restore operations.
 * Fix a bug on POSIX in which failure to create a lock file (e.g. out of space) can prevent future LockFile attempts in the same process on the same file from succeeding.
 * Fix a bug that backup_rate_limiter and restore_rate_limiter in BackupEngine could not limit read rates.
+* Fix WAL log data corruption when using DBOptions.manual_wal_flush(true) and WriteOptions.sync(true) together. The sync WAL should work with locked log_write_mutex_.
 
 ### New Features
 * RemoteCompaction's interface now includes `db_name`, `db_id`, `session_id`, which could help the user uniquely identify compaction job between db instances and sessions.
index 6fcb56b7994ef7c141d5cf7612fb2a0e3e5a09cf..b7d856ca3fcfda46690c90485da705cff8625192 100644 (file)
@@ -1124,6 +1124,18 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
     //    writer thread, so no one will push to logs_,
     //  - as long as other threads don't modify it, it's safe to read
     //    from std::deque from multiple threads concurrently.
+    //
+    // Sync operation should work with locked log_write_mutex_, because:
+    //   when DBOptions.manual_wal_flush_ is set,
+    //   FlushWAL function will be invoked by another thread.
+    //   if without locked log_write_mutex_, the log file may get data
+    //   corruption
+
+    const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
+    if (UNLIKELY(needs_locking)) {
+      log_write_mutex_.Lock();
+    }
+
     for (auto& log : logs_) {
       io_s = log.writer->file()->Sync(immutable_db_options_.use_fsync);
       if (!io_s.ok()) {
@@ -1131,6 +1143,10 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
       }
     }
 
+    if (UNLIKELY(needs_locking)) {
+      log_write_mutex_.Unlock();
+    }
+
     if (io_s.ok() && need_log_dir_sync) {
       // We only sync WAL directory the first time WAL syncing is
       // requested, so that in case users never turn on WAL sync,
index fca8b2880d95bbc109ca43e681306ffa13ec287d..b99615caa8db5ea77c7a69438df7bfd244d1daf8 100644 (file)
@@ -4098,6 +4098,39 @@ TEST_F(DBTest, ConcurrentFlushWAL) {
   }
 }
 
+// This test failure will be caught with a probability
+TEST_F(DBTest, ManualFlushWalAndWriteRace) {
+  Options options;
+  options.env = env_;
+  options.manual_wal_flush = true;
+  options.create_if_missing = true;
+
+  DestroyAndReopen(options);
+
+  WriteOptions wopts;
+  wopts.sync = true;
+
+  port::Thread writeThread([&]() {
+    for (int i = 0; i < 100; i++) {
+      auto istr = ToString(i);
+      dbfull()->Put(wopts, "key_" + istr, "value_" + istr);
+    }
+  });
+  port::Thread flushThread([&]() {
+    for (int i = 0; i < 100; i++) {
+      ASSERT_OK(dbfull()->FlushWAL(false));
+    }
+  });
+
+  writeThread.join();
+  flushThread.join();
+  ASSERT_OK(dbfull()->Put(wopts, "foo1", "value1"));
+  ASSERT_OK(dbfull()->Put(wopts, "foo2", "value2"));
+  Reopen(options);
+  ASSERT_EQ("value1", Get("foo1"));
+  ASSERT_EQ("value2", Get("foo2"));
+}
+
 #ifndef ROCKSDB_LITE
 TEST_F(DBTest, DynamicMemtableOptions) {
   const uint64_t k64KB = 1 << 16;