Fix possible crash in failure to sync some WALs (#12789)
Author:    Peter Dillinger <peterd@meta.com>
Date:      Fri, 21 Jun 2024 19:56:21 +0000 (12:56 -0700)
Committer: Facebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
Date:      Fri, 21 Jun 2024 19:56:21 +0000 (12:56 -0700)
Summary:
I believe this was possible with recyclable logs before recent work like https://github.com/facebook/rocksdb/issues/12734, but this cleans up a couple of possible crashes revealed by the crash test. A WAL with a nullptr file writer (already closed) can persist in `logs_` if a later WAL fails to sync. In case of any WAL sync failure, we don't record WAL syncs to the manifest. Thus, even if a WAL is fully synced and closed, we might need to keep it on the `logs_` list so that we know to record its sync to the manifest if a later sync attempt succeeds. (However, I believe that's future-looking, because currently any failure in WAL sync is considered non-recoverable.)
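To make the failure mode concrete, here is a minimal, self-contained sketch (hypothetical types, not RocksDB's actual `logs_` machinery) of the invariant the fix enforces: a fully synced, closed WAL, whose writer's file() is nullptr, may remain queued behind an open WAL, so sync paths must skip the closed writer rather than dereference it, while still tracking that WAL for manifest recording.

// Sketch only: FakeFile and LogWriter are hypothetical stand-ins for the
// real WritableFileWriter / logs_ entry types.
#include <cstdint>
#include <iostream>
#include <list>
#include <memory>

struct FakeFile {};  // stands in for the writer's underlying file

struct LogWriter {
  uint64_t number;
  std::unique_ptr<FakeFile> file;  // nullptr once the WAL is closed
};

int main() {
  std::list<LogWriter> logs;
  // Closed WAL kept on the list only until its sync is recorded in the
  // manifest (recording is deferred after a sync failure).
  logs.push_back({1, nullptr});
  // Open WAL that still needs a real sync.
  logs.push_back({2, std::make_unique<FakeFile>()});

  for (auto& log : logs) {
    if (log.file) {
      std::cout << "sync WAL " << log.number << "\n";
    } else {
      std::cout << "skip closed WAL " << log.number
                << " (manifest record only)\n";
    }
  }
  return 0;
}

The actual fix applies the same null-check guard in the three places shown in the diff below: collecting WALs to sync, clearing a writer, and preparing a WAL for sync.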

I don't believe this was likely enough before recent changes to warrant a release note (if it was possible at all).

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12789

Test Plan: A unit test that would reveal the crashes, now fixed

Reviewed By: cbi42

Differential Revision: D58874154

Pulled By: pdillinger

fbshipit-source-id: bc69407cd9cbcd080af9585d502d4e33dafc3d29

db/db_impl/db_impl.cc
db/db_impl/db_impl.h
db/db_wal_test.cc

diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 12549dc31c7e2a0676dd583f35168fbe53da50df..b67539352ae46f5b6b63b02f8a54395e3cd7f498 100644
@@ -1615,8 +1615,14 @@ IOStatus DBImpl::SyncWalImpl(bool include_current_wal,
     for (auto it = logs_.begin();
          it != logs_.end() && it->number <= up_to_number; ++it) {
       auto& log = *it;
+      // Ensure the head of logs_ is marked as getting_synced if any is.
       log.PrepareForSync();
-      wals_to_sync.push_back(log.writer);
+      // If last sync failed on a later WAL, this could be a fully synced
+      // and closed WAL that just needs to be recorded as synced in the
+      // manifest.
+      if (log.writer->file()) {
+        wals_to_sync.push_back(log.writer);
+      }
     }
 
     need_wal_dir_sync = !log_dir_synced_;
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 6e6671f8a821e8f0bd223e812cdc73ec6c9fc7e1..7ecf53b40a92f3790f08955fbafab326a52351b2 100644
@@ -1722,8 +1722,11 @@ class DBImpl : public DB {
       return w;
     }
     Status ClearWriter() {
-      // TODO: plumb Env::IOActivity, Env::IOPriority
-      Status s = writer->WriteBuffer(WriteOptions());
+      Status s;
+      if (writer->file()) {
+        // TODO: plumb Env::IOActivity, Env::IOPriority
+        s = writer->WriteBuffer(WriteOptions());
+      }
       delete writer;
       writer = nullptr;
       return s;
@@ -1738,10 +1741,16 @@ class DBImpl : public DB {
 
     void PrepareForSync() {
       assert(!getting_synced);
-      // Size is expected to be monotonically increasing.
-      assert(writer->file()->GetFlushedSize() >= pre_sync_size);
+      // Ensure the head of logs_ is marked as getting_synced if any is.
       getting_synced = true;
-      pre_sync_size = writer->file()->GetFlushedSize();
+      // If last sync failed on a later WAL, this could be a fully synced
+      // and closed WAL that just needs to be recorded as synced in the
+      // manifest.
+      if (writer->file()) {
+        // Size is expected to be monotonically increasing.
+        assert(writer->file()->GetFlushedSize() >= pre_sync_size);
+        pre_sync_size = writer->file()->GetFlushedSize();
+      }
     }
 
     void FinishSync() {
diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc
index a4976c3ed356d2550dab497ea68f281cad7db759..494298be7e68a2faac355229aa2fc71238e382ec 100644
@@ -14,6 +14,7 @@
 #include "port/stack_trace.h"
 #include "rocksdb/file_system.h"
 #include "test_util/sync_point.h"
+#include "util/defer.h"
 #include "util/udt_util.h"
 #include "utilities/fault_injection_env.h"
 #include "utilities/fault_injection_fs.h"
@@ -1471,6 +1472,93 @@ TEST_F(DBWALTest, SyncMultipleLogs) {
   ASSERT_OK(dbfull()->SyncWAL());
 }
 
+TEST_F(DBWALTest, SyncWalPartialFailure) {
+  class MyTestFileSystem : public FileSystemWrapper {
+   public:
+    explicit MyTestFileSystem(std::shared_ptr<FileSystem> base)
+        : FileSystemWrapper(std::move(base)) {}
+
+    static const char* kClassName() { return "MyTestFileSystem"; }
+    const char* Name() const override { return kClassName(); }
+    IOStatus NewWritableFile(const std::string& fname,
+                             const FileOptions& file_opts,
+                             std::unique_ptr<FSWritableFile>* result,
+                             IODebugContext* dbg) override {
+      IOStatus s = target()->NewWritableFile(fname, file_opts, result, dbg);
+      if (s.ok()) {
+        *result =
+            std::make_unique<MyTestWritableFile>(std::move(*result), *this);
+      }
+      return s;
+    }
+
+    AcqRelAtomic<uint32_t> syncs_before_failure_{UINT32_MAX};
+
+   protected:
+    class MyTestWritableFile : public FSWritableFileOwnerWrapper {
+     public:
+      MyTestWritableFile(std::unique_ptr<FSWritableFile>&& file,
+                         MyTestFileSystem& fs)
+          : FSWritableFileOwnerWrapper(std::move(file)), fs_(fs) {}
+
+      IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
+        int prev_val = fs_.syncs_before_failure_.FetchSub(1);
+        if (prev_val == 0) {
+          return IOStatus::IOError("fault");
+        } else {
+          return target()->Sync(options, dbg);
+        }
+      }
+
+     protected:
+      MyTestFileSystem& fs_;
+    };
+  };
+
+  Options options = CurrentOptions();
+  options.max_write_buffer_number = 4;
+  options.track_and_verify_wals_in_manifest = true;
+  options.max_bgerror_resume_count = 0;  // manual resume
+
+  auto custom_fs =
+      std::make_shared<MyTestFileSystem>(options.env->GetFileSystem());
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(custom_fs));
+  options.env = fault_fs_env.get();
+  Reopen(options);
+  Defer closer([this]() { Close(); });
+
+  // This is the simplest way to get
+  // * one inactive WAL, synced
+  // * one inactive WAL, not synced, and
+  // * one active WAL, not synced
+  // with a single thread, to exercise as much logic as we reasonably can.
+  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->PauseBackgroundWork());
+  ASSERT_OK(Put("key1", "val1"));
+  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
+  ASSERT_OK(db_->SyncWAL());
+  ASSERT_OK(Put("key2", "val2"));
+  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
+  ASSERT_OK(Put("key3", "val3"));
+
+  // Allow 1 of the WALs to sync, but another won't
+  custom_fs->syncs_before_failure_.Store(1);
+  ASSERT_NOK(db_->SyncWAL());
+
+  // Stuck in this state. (This could previously cause a segfault.)
+  ASSERT_NOK(db_->SyncWAL());
+
+  // Can't Resume because WAL write failure is considered non-recoverable,
+  // regardless of the IOStatus itself. (Can/should be fixed?)
+  ASSERT_NOK(db_->Resume());
+
+  // Verify no data loss after reopen.
+  // Also Close() could previously crash in this state.
+  Reopen(options);
+  ASSERT_EQ("val1", Get("key1"));
+  ASSERT_EQ("val2", Get("key2"));
+  ASSERT_EQ("val3", Get("key3"));
+}
+
 // Github issue 1339. Prior the fix we read sequence id from the first log to
 // a local variable, then keep increase the variable as we replay logs,
 // ignoring actual sequence id of the records. This is incorrect if some writes