From: Yanqin Jin Date: Tue, 14 Jun 2022 01:54:38 +0000 (-0700) Subject: Fix a race condition in transaction stress test (#10157) X-Git-Tag: v7.4.3~40 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=bfaf8291c5e1f342cb242d78597e8cdf69965450;p=rocksdb.git Fix a race condition in transaction stress test (#10157) Summary: Before this PR, there can be a race condition between the thread calling `StressTest::Open()` and a background compaction thread calling `MultiOpsTxnsStressTest::VerifyPkSkFast()`. ``` Time thread1 bg_compact_thr | TransactionDB::Open(..., &txn_db_) | db_ is still nullptr | db_->GetSnapshot() // segfault | db_ = txn_db_ V ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/10157 Test Plan: CI Reviewed By: akankshamahajan15 Differential Revision: D37121653 Pulled By: riversand963 fbshipit-source-id: 6a53117f958e9ee86f77297fdeb843e5160a9331 --- diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 9e7fa4e2b..dc7c71c34 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -66,6 +66,7 @@ StressTest::StressTest() #ifndef ROCKSDB_LITE txn_db_(nullptr), #endif + db_aptr_(nullptr), clock_(db_stress_env->GetSystemClock().get()), new_column_family_name_(1), num_times_reopened_(0), @@ -2541,7 +2542,13 @@ void StressTest::Open(SharedState* shared) { fflush(stderr); } assert(s.ok()); - db_ = txn_db_; + + // Do not swap the order of the following. + { + db_ = txn_db_; + db_aptr_.store(txn_db_, std::memory_order_release); + } + // after a crash, rollback to commit recovered transactions std::vector trans; txn_db_->GetAllPreparedTransactions(&trans); diff --git a/db_stress_tool/db_stress_test_base.h b/db_stress_tool/db_stress_test_base.h index fcb89c4c2..d4b2c5e21 100644 --- a/db_stress_tool/db_stress_test_base.h +++ b/db_stress_tool/db_stress_test_base.h @@ -236,6 +236,10 @@ class StressTest { #ifndef ROCKSDB_LITE TransactionDB* txn_db_; #endif + + // Currently only used in MultiOpsTxnsStressTest + std::atomic db_aptr_; + Options options_; SystemClock* clock_; std::vector column_families_; diff --git a/db_stress_tool/multi_ops_txns_stress.cc b/db_stress_tool/multi_ops_txns_stress.cc index 07a1783f2..31c88e6d5 100644 --- a/db_stress_tool/multi_ops_txns_stress.cc +++ b/db_stress_tool/multi_ops_txns_stress.cc @@ -1248,7 +1248,19 @@ void MultiOpsTxnsStressTest::VerifyDb(ThreadState* thread) const { } } +// VerifyPkSkFast() can be called by MultiOpsTxnsStressListener's callbacks +// which can be called before TransactionDB::Open() returns to caller. +// Therefore, at that time, db_ and txn_db_ may still be nullptr. +// Caller has to make sure that the race condition does not happen. void MultiOpsTxnsStressTest::VerifyPkSkFast(int job_id) { + DB* const db = db_aptr_.load(std::memory_order_acquire); + if (db == nullptr) { + return; + } + + assert(db_ == db); + assert(db_ != nullptr); + const Snapshot* const snapshot = db_->GetSnapshot(); assert(snapshot); ManagedSnapshot snapshot_guard(db_, snapshot);