From 8c069883131b4750cd70164a828db8611599e7dc Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Fri, 4 Nov 2022 15:55:54 -0700 Subject: [PATCH] Fix flush picking non-consecutive memtables (#10921) Summary: Prevents `MemTableList::PickMemtablesToFlush()` from picking non-consecutive memtables. It leads to wrong ordering in L0 if the files are committed, or an error like below if force_consistency_checks=true catches it: ``` Corruption: force_consistency_checks: VersionBuilder: L0 file #25 with seqno 320416 368066 vs. file #24 with seqno 336037 352068 ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/10921 Test Plan: fix the expectation in the existing test of this behavior Reviewed By: riversand963 Differential Revision: D41046935 Pulled By: ajkr fbshipit-source-id: 783696bff56115063d5dc5856dfaed6a9881d1ab --- HISTORY.md | 4 ++++ db/memtable_list.cc | 7 ++++++ db/memtable_list_test.cc | 48 +++++++++++++++++++++++++++++----------- 3 files changed, 46 insertions(+), 13 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 5da65748..3b4f8190 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,4 +1,8 @@ # Rocksdb Change Log +## Unreleased +### Bug Fixes +* Fix failed memtable flush retry bug that could cause wrongly ordered updates, which would surface to writers as `Status::Corruption` in case of `force_consistency_checks=true` (default). It affects use cases that enable both parallel flush (`max_background_flushes > 1` or `max_background_jobs >= 8`) and non-default memtable count (`max_write_buffer_number > 2`). 
+ ## 7.8.2 (11/23/2022) ### Behavior changes * Make best-efforts recovery verify SST unique ID before Version construction (#10962) diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 7cc669cd..1545003a 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -419,6 +419,13 @@ void MemTableList::PickMemtablesToFlush(uint64_t max_memtable_id, std::max(m->GetNextLogNumber(), *max_next_log_number); } ret->push_back(m); + } else if (!ret->empty()) { + // This `break` is necessary to prevent picking non-consecutive memtables + // in case `memlist` has one or more entries with + // `flush_in_progress_ == true` sandwiched between entries with + // `flush_in_progress_ == false`. This could happen after parallel flushes + // are picked and the one flushing older memtables is rolled back. + break; + } } if (!atomic_flush || num_flush_not_started_ == 0) { diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc index 458dc18a..0b90cd43 100644 --- a/db/memtable_list_test.cc +++ b/db/memtable_list_test.cc @@ -742,28 +742,40 @@ TEST_F(MemTableListTest, FlushPendingTest) { // Pick tables to flush list.PickMemtablesToFlush( std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush); - // Should pick 4 of 5 since 1 table has been picked in to_flush2 - ASSERT_EQ(4, to_flush.size()); + // Picks three oldest memtables. The fourth oldest is picked in `to_flush2` so + // must be excluded. The newest (fifth oldest) is non-consecutive with the + // three oldest due to omitting the fourth oldest so must not be picked. 
+ ASSERT_EQ(3, to_flush.size()); ASSERT_EQ(5, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); - ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); + ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); // Pick tables to flush again autovector<MemTable*> to_flush3; list.PickMemtablesToFlush( std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush3); - ASSERT_EQ(0, to_flush3.size()); // nothing not in progress of being flushed + // Picks newest (fifth oldest) + ASSERT_EQ(1, to_flush3.size()); + ASSERT_EQ(5, list.NumNotFlushed()); + ASSERT_FALSE(list.IsFlushPending()); + ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); + + // Nothing left to flush + autovector<MemTable*> to_flush4; + list.PickMemtablesToFlush( + std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush4); + ASSERT_EQ(0, to_flush4.size()); ASSERT_EQ(5, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); - // Flush the 4 memtables that were picked in to_flush + // Flush the 3 memtables that were picked in to_flush s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush, &to_delete); ASSERT_OK(s); - // Note: now to_flush contains tables[0,1,2,4]. to_flush2 contains - // tables[3]. + // Note: now to_flush contains tables[0,1,2]. to_flush2 contains + // tables[3]. to_flush3 contains tables[4]. // Current implementation will only commit memtables in the order they were // created. So TryInstallMemtableFlushResults will install the first 3 tables // in to_flush and stop when it encounters a table not yet flushed. 
@@ -779,7 +791,17 @@ TEST_F(MemTableListTest, FlushPendingTest) { ASSERT_FALSE(list.IsFlushPending()); ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); - // Flush the 1 memtable that was picked in to_flush2 + // Flush the 1 memtable (tables[4]) that was picked in to_flush3 + s = MemTableListTest::Mock_InstallMemtableFlushResults( + &list, mutable_cf_options, to_flush3, &to_delete); + ASSERT_OK(s); + + // This will install 0 tables since tables[4] flushed while tables[3] has not + // yet flushed. + ASSERT_EQ(2, list.NumNotFlushed()); + ASSERT_EQ(0, to_delete.size()); + + // Flush the 1 memtable (tables[3]) that was picked in to_flush2 s = MemTableListTest::Mock_InstallMemtableFlushResults( &list, mutable_cf_options, to_flush2, &to_delete); ASSERT_OK(s); @@ -809,11 +831,11 @@ TEST_F(MemTableListTest, FlushPendingTest) { memtable_id = 4; // Pick tables to flush. The tables to pick must have ID smaller than or // equal to 4. Therefore, no table will be selected in this case. - autovector<MemTable*> to_flush4; + autovector<MemTable*> to_flush5; list.FlushRequested(); ASSERT_TRUE(list.HasFlushRequested()); - list.PickMemtablesToFlush(memtable_id, &to_flush4); - ASSERT_TRUE(to_flush4.empty()); + list.PickMemtablesToFlush(memtable_id, &to_flush5); + ASSERT_TRUE(to_flush5.empty()); ASSERT_EQ(1, list.NumNotFlushed()); ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); ASSERT_FALSE(list.IsFlushPending()); @@ -823,8 +845,8 @@ TEST_F(MemTableListTest, FlushPendingTest) { // equal to 5. Therefore, only tables[5] will be selected. memtable_id = 5; list.FlushRequested(); - list.PickMemtablesToFlush(memtable_id, &to_flush4); - ASSERT_EQ(1, static_cast<int>(to_flush4.size())); + list.PickMemtablesToFlush(memtable_id, &to_flush5); + ASSERT_EQ(1, static_cast<int>(to_flush5.size())); ASSERT_EQ(1, list.NumNotFlushed()); ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); ASSERT_FALSE(list.IsFlushPending()); -- 2.47.3