Fix the false positive alert of CF consistency check in WAL recovery (#8207)

author Zhichao Cao <zhichao@fb.com>

Thu, 22 Apr 2021 17:27:56 +0000 (10:27 -0700)

committer Zhichao Cao <zhichao@fb.com>

Fri, 23 Apr 2021 23:42:20 +0000 (16:42 -0700)
author Zhichao Cao <zhichao@fb.com>
Thu, 22 Apr 2021 17:27:56 +0000 (10:27 -0700)
committer Zhichao Cao <zhichao@fb.com>
Fri, 23 Apr 2021 23:42:20 +0000 (16:42 -0700)
diff --git a/HISTORY.md b/HISTORY.md

index 76b52730eab5e57c3990749fcf9f8dd930277d4e..012c1cb39d586730041774aedba18a492737fecf 100644 (file)
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -4,6 +4,10 @@
  * Fixed a bug in handling file rename error in distributed/network file systems when the server succeeds but client returns error. The bug can cause CURRENT file to point to non-existing MANIFEST file, thus DB cannot be opened.
  * Fixed a bug where ingested files were written with incorrect boundary key metadata. In rare cases this could have led to a level's files being wrongly ordered and queries for the boundary keys returning wrong results.
  * Fixed a data race between insertion into memtables and the retrieval of the DB properties `rocksdb.cur-size-active-mem-table`, `rocksdb.cur-size-all-mem-tables`, and `rocksdb.size-all-mem-tables`.
+* Fixed the false-positive alert when recovering from the WAL file. Avoid reporting "SST file is ahead of WAL" on a newly created empty column family, if the previous WAL file is corrupted.
+
+### Behavior Changes
+* Due to the fix of false-postive alert of "SST file is ahead of WAL", all the CFs with no SST file (CF empty) will bypass the consistency check. We fixed a false-positive, but introduced a very rare true-negative which will be triggered in the following conditions: A CF with some delete operations in the last a few queries which will result in an empty CF (those are flushed to SST file and a compaction triggered which combines this file and all other SST files and generates an empty CF, or there is another reason to write a manifest entry for this CF after a flush that generates no SST file from an empty CF). The deletion entries are logged in a WAL and this WAL was corrupted, while the CF's log number points to the next WAL (due to the flush). Therefore, the DB can only recover to the point without these trailing deletions and cause the inconsistent DB status.
  
  ## 6.20.0 (04/16/2021)
  ### Behavior Changes
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc

index a4c9657666ad6e9f8a11a55c49c9991bfe98e150..28ad6fd01e6d3d46b413ccd24d7da7f4d8b848fa 100644 (file)
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -2582,6 +2582,8 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) {
  
    LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
                         immutable_db_options_.info_log.get());
+  TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:1");
+  TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:2");
    {
      InstrumentedMutexLock l(&mutex_);
      assert(bg_flush_scheduled_);
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc

index d9683a802febd8cc5538f5b5d3723ae115e430c4..684d70d28ab8ec16fa08f3ada3c3e07c0ba37a29 100644 (file)
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -1137,11 +1137,29 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
         immutable_db_options_.wal_recovery_mode ==
             WALRecoveryMode::kTolerateCorruptedTailRecords)) {
      for (auto cfd : *versions_->GetColumnFamilySet()) {
-      if (cfd->GetLogNumber() > corrupted_wal_number) {
+      // One special case cause cfd->GetLogNumber() > corrupted_wal_number but
+      // the CF is still consistent: If a new column family is created during
+      // the flush and the WAL sync fails at the same time, the new CF points to
+      // the new WAL but the old WAL is curropted. Since the new CF is empty, it
+      // is still consistent. We add the check of CF sst file size to avoid the
+      // false positive alert.
+
+      // Note that, the check of (cfd->GetLiveSstFilesSize() > 0) may leads to
+      // the ignorance of a very rare inconsistency case caused in data
+      // canclation. One CF is empty due to KV deletion. But those operations
+      // are in the WAL. If the WAL is corrupted, the status of this CF might
+      // not be consistent with others. However, the consistency check will be
+      // bypassed due to empty CF.
+      // TODO: a better and complete implementation is needed to ensure strict
+      // consistency check in WAL recovery including hanlding the tailing
+      // issues.
+      if (cfd->GetLogNumber() > corrupted_wal_number &&
+          cfd->GetLiveSstFilesSize() > 0) {
          ROCKS_LOG_ERROR(immutable_db_options_.info_log,
                          "Column family inconsistency: SST file contains data"
                          " beyond the point of corruption.");
-        return Status::Corruption("SST file is ahead of WALs");
+        return Status::Corruption("SST file is ahead of WALs in CF " +
+                                  cfd->GetName());
        }
      }
    }
diff --git a/db/db_test2.cc b/db/db_test2.cc

index a7952cce11fab0eaf040092ee0995feb9c342fb2..42ec2d103787f55f02f1490b12579a42ac67d489 100644 (file)
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -5558,6 +5558,35 @@ TEST_F(DBTest2, PointInTimeRecoveryWithIOErrorWhileReadingWal) {
    Status s = TryReopen(options);
    ASSERT_TRUE(s.IsIOError());
  }
+
+TEST_F(DBTest2, PointInTimeRecoveryWithSyncFailureInCFCreation) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BackgroundCallFlush:Start:1",
+        "PointInTimeRecoveryWithSyncFailureInCFCreation:1"},
+       {"PointInTimeRecoveryWithSyncFailureInCFCreation:2",
+        "DBImpl::BackgroundCallFlush:Start:2"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  CreateColumnFamilies({"test1"}, Options());
+  ASSERT_OK(Put("foo", "bar"));
+
+  // Creating a CF when a flush is going on, log is synced but the
+  // closed log file is not synced and corrupted.
+  port::Thread flush_thread([&]() { ASSERT_NOK(Flush()); });
+  TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:1");
+  CreateColumnFamilies({"test2"}, Options());
+  env_->corrupt_in_sync_ = true;
+  TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:2");
+  flush_thread.join();
+  env_->corrupt_in_sync_ = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  // Reopening the DB should not corrupt anything
+  Options options = CurrentOptions();
+  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+  ReopenWithColumnFamilies({"default", "test1", "test2"}, options);
+}
+
  }  // namespace ROCKSDB_NAMESPACE
  
  #ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
diff --git a/db/db_test_util.h b/db/db_test_util.h

index 8dc0e3a333eaa116301d9b5266a53509cf6f1f2e..eb5853b00f6551cd27d882ed0d7a747677a2aadb 100644 (file)
--- a/db/db_test_util.h
+++ b/db/db_test_util.h
@@ -393,6 +393,10 @@ class SpecialEnv : public EnvWrapper {
        Status Flush() override { return base_->Flush(); }
        Status Sync() override {
          ++env_->sync_counter_;
+        if (env_->corrupt_in_sync_) {
+          Append(std::string(33000, ' '));
+          return Status::IOError("Ingested Sync Failure");
+        }
          if (env_->skip_fsync_) {
            return Status::OK();
          } else {
@@ -717,6 +721,9 @@ class SpecialEnv : public EnvWrapper {
    // If true, all fsync to files and directories are skipped.
    bool skip_fsync_ = false;
  
+  // If true, ingest the corruption to file during sync.
+  bool corrupt_in_sync_ = false;
+
    std::atomic<uint32_t> non_writeable_rate_;
  
    std::atomic<uint32_t> new_writable_count_;
author	Zhichao Cao <zhichao@fb.com>
	Thu, 22 Apr 2021 17:27:56 +0000 (10:27 -0700)
committer	Zhichao Cao <zhichao@fb.com>
	Fri, 23 Apr 2021 23:42:20 +0000 (16:42 -0700)
HISTORY.md		patch \| blob \| history
db/db_impl/db_impl_compaction_flush.cc		patch \| blob \| history
db/db_impl/db_impl_open.cc		patch \| blob \| history
db/db_test2.cc		patch \| blob \| history
db/db_test_util.h		patch \| blob \| history