]> git-server-git.apps.pok.os.sepia.ceph.com Git - rocksdb.git/commitdiff
Deflake DBWALTest.FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL (#11016)
authorHui Xiao <huixiao@fb.com>
Wed, 7 Dec 2022 02:31:43 +0000 (18:31 -0800)
committerFacebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
Wed, 7 Dec 2022 02:31:43 +0000 (18:31 -0800)
Summary:
**Context/Summary:**
Credit to ajkr's https://github.com/facebook/rocksdb/pull/11016#pullrequestreview-1205020134,
flaky test https://app.circleci.com/pipelines/github/facebook/rocksdb/21985/workflows/5f6cc355-78c1-46d8-89ee-0fd679725a8a/jobs/540878 is due to `Flush()` called in the test returned earlier than obsoleted WAL being found in background flush and SyncWAL() was called (i.e, "sync_point_called" sets to true).  Fix this by making checking `sync_point_called == true` after obsoleted WAL is found and `SyncWAL()` is called. Also rename the "sync_point_called" to be something more specific.

Also, fix a potential flakiness due to manually setting a log threshold to force new manifest creation. This is unreliable so I decided to use sync point to force new manifest creation.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11016

Test Plan: make check

Reviewed By: pdillinger

Differential Revision: D41717786

Pulled By: hx235

fbshipit-source-id: ad1e4701a987285bbe6c8e7d9b05c4db06b4edf4

db/db_wal_test.cc
db/version_set.cc

index 6246cc70f37521097656d75f60308530dbdfb1e0..2bdfaada5e5dec93363c7f3ae51abebd31cec7b7 100644 (file)
@@ -1602,9 +1602,6 @@ TEST_F(DBWALTest, RaceInstallFlushResultsWithWalObsoletion) {
 TEST_F(DBWALTest, FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL) {
   Options options = CurrentOptions();
   options.track_and_verify_wals_in_manifest = true;
-  // Set a small max_manifest_file_size to force manifest creation
-  // in SyncWAL() for tet purpose
-  options.max_manifest_file_size = 170;
   DestroyAndReopen(options);
 
   // Accumulate memtable m1 and create the 1st wal (i.e, 4.log)
@@ -1619,25 +1616,26 @@ TEST_F(DBWALTest, FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL) {
   // active) log and release the lock
   // (2) SyncWAL() proceeds with the lock. It
   // creates a new manifest and syncs all the inactive wals before the latest
-  // (i.e, active log), which is 4.log. SyncWAL() is not aware of the fact
-  // that 4.log has marked as to be obseleted.
-  // Prior to the fix, such wal sync will then add a WAL addition record of
-  // 4.log to the new manifest without any special treatment.
+  // (i.e, active log), which is 4.log. Note that SyncWAL() is not aware of the
+  // fact that 4.log has marked as to be obseleted. Prior to the fix, such wal
+  // sync will then add a WAL addition record of 4.log to the new manifest
+  // without any special treatment.
   // (3) BackgroundFlush() will eventually purge 4.log.
-  bool sync_point_called = false;
-  bool new_manifest_created = false;
+  bool wal_synced = false;
   SyncPoint::GetInstance()->SetCallBack(
       "FindObsoleteFiles::PostMutexUnlock", [&](void*) {
         ASSERT_OK(env_->FileExists(wal_file_path));
 
         SyncPoint::GetInstance()->SetCallBack(
-            "VersionSet::ProcessManifestWrites:BeforeNewManifest",
-            [&](void*) { new_manifest_created = true; });
+            "VersionSet::ProcessManifestWrites:"
+            "PostDecidingCreateNewManifestOrNot",
+            [&](void* arg) {
+              bool* new_descriptor_log = (bool*)arg;
+              *new_descriptor_log = true;
+            });
 
         ASSERT_OK(db_->SyncWAL());
-        ASSERT_TRUE(new_manifest_created);
-
-        sync_point_called = true;
+        wal_synced = true;
       });
 
   SyncPoint::GetInstance()->SetCallBack(
@@ -1650,21 +1648,22 @@ TEST_F(DBWALTest, FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL) {
               "PostDeleteWAL");
         }
       });
+
   SyncPoint::GetInstance()->LoadDependency(
-      {{"DBWALTest::FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL::"
+      {{"DBImpl::BackgroundCallFlush:FilesFound",
+        "PreConfrimObsoletedWALSynced"},
+       {"DBWALTest::FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL::"
         "PostDeleteWAL",
-        "DBWALTest::FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL::"
         "PreConfrimWALDeleted"}});
 
   SyncPoint::GetInstance()->EnableProcessing();
 
   ASSERT_OK(Flush());
-  ASSERT_TRUE(sync_point_called);
 
-  TEST_SYNC_POINT(
-      "DBWALTest::FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL::"
-      "PreConfrimWALDeleted");
+  TEST_SYNC_POINT("PreConfrimObsoletedWALSynced");
+  ASSERT_TRUE(wal_synced);
 
+  TEST_SYNC_POINT("PreConfrimWALDeleted");
   // BackgroundFlush() purged 4.log
   // because the memtable associated with the WAL was flushed and new WAL was
   // created (i.e, 8.log)
index 33cd7002271e2e643fa1c2c2c29a45173cfdce4b..becdd679067b89fbd590d03681ec44e311ca1110 100644 (file)
@@ -4983,6 +4983,9 @@ Status VersionSet::ProcessManifestWrites(
   } else {
     pending_manifest_file_number_ = manifest_file_number_;
   }
+  TEST_SYNC_POINT_CALLBACK(
+      "VersionSet::ProcessManifestWrites:PostDecidingCreateNewManifestOrNot",
+      &new_descriptor_log);
 
   // Local cached copy of state variable(s). WriteCurrentStateToManifest()
   // reads its content after releasing db mutex to avoid race with