]> git-server-git.apps.pok.os.sepia.ceph.com Git - rocksdb.git/commitdiff
Fix flush picking non-consecutive memtables (#10921)
authorAndrew Kryczka <andrewkr@fb.com>
Fri, 4 Nov 2022 22:55:54 +0000 (15:55 -0700)
committerAndrew Kryczka <andrewkr@fb.com>
Thu, 24 Nov 2022 21:43:36 +0000 (13:43 -0800)
Summary:
Prevents `MemTableList::PickMemtablesToFlush()` from picking non-consecutive memtables. It leads to wrong ordering in L0 if the files are committed, or an error like below if force_consistency_checks=true catches it:

```
Corruption: force_consistency_checks: VersionBuilder: L0 file https://github.com/facebook/rocksdb/issues/25 with seqno 320416 368066 vs. file https://github.com/facebook/rocksdb/issues/24 with seqno 336037 352068
```

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10921

Test Plan: fix the expectation in the existing test of this behavior

Reviewed By: riversand963

Differential Revision: D41046935

Pulled By: ajkr

fbshipit-source-id: 783696bff56115063d5dc5856dfaed6a9881d1ab

HISTORY.md
db/memtable_list.cc
db/memtable_list_test.cc

index 5da6574893c767ec49b5c7accef8d360ced4823f..3b4f81909e2c9cfe49e74c08742922c34a434b6e 100644 (file)
@@ -1,4 +1,8 @@
 # Rocksdb Change Log
+## Unreleased
+### Bug Fixes
+* Fix failed memtable flush retry bug that could cause wrongly ordered updates, which would surface to writers as `Status::Corruption` in case of `force_consistency_checks=true` (default). It affects use cases that enable both parallel flush (`max_background_flushes > 1` or `max_background_jobs >= 8`) and non-default memtable count (`max_write_buffer_number > 2`).
+
 ## 7.8.2 (11/23/2022)
 ### Behavior changes
 * Make best-efforts recovery verify SST unique ID before Version construction (#10962)
index 7cc669cdd5fb1e45daa3d59f767ca38b0dd63d64..1545003aded1219c334f1bae6a91f0bdc6844572 100644 (file)
@@ -419,6 +419,13 @@ void MemTableList::PickMemtablesToFlush(uint64_t max_memtable_id,
             std::max(m->GetNextLogNumber(), *max_next_log_number);
       }
       ret->push_back(m);
+    } else if (!ret->empty()) {
+      // This `break` is necessary to prevent picking non-consecutive memtables
+      // in case `memlist` has one or more entries with
+      // `flush_in_progress_ == true` sandwiched between entries with
+      // `flush_in_progress_ == false`. This could happen after parallel flushes
+      // are picked and the one flushing older memtables is rolled back.
+      break;
     }
   }
   if (!atomic_flush || num_flush_not_started_ == 0) {
index 458dc18a3758c0b178a5279fbba3cdb9a87f22f6..0b90cd437d3faf835752c9922895b692cf96529e 100644 (file)
@@ -742,28 +742,40 @@ TEST_F(MemTableListTest, FlushPendingTest) {
   // Pick tables to flush
   list.PickMemtablesToFlush(
       std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
-  // Should pick 4 of 5 since 1 table has been picked in to_flush2
-  ASSERT_EQ(4, to_flush.size());
+  // Picks three oldest memtables. The fourth oldest is picked in `to_flush2` so
+  // must be excluded. The newest (fifth oldest) is non-consecutive with the
+  // three oldest due to omitting the fourth oldest so must not be picked.
+  ASSERT_EQ(3, to_flush.size());
   ASSERT_EQ(5, list.NumNotFlushed());
   ASSERT_FALSE(list.IsFlushPending());
-  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
 
   // Pick tables to flush again
   autovector<MemTable*> to_flush3;
   list.PickMemtablesToFlush(
       std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush3);
-  ASSERT_EQ(0, to_flush3.size());  // nothing not in progress of being flushed
+  // Picks newest (fifth oldest)
+  ASSERT_EQ(1, to_flush3.size());
+  ASSERT_EQ(5, list.NumNotFlushed());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Nothing left to flush
+  autovector<MemTable*> to_flush4;
+  list.PickMemtablesToFlush(
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush4);
+  ASSERT_EQ(0, to_flush4.size());
   ASSERT_EQ(5, list.NumNotFlushed());
   ASSERT_FALSE(list.IsFlushPending());
   ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
 
-  // Flush the 4 memtables that were picked in to_flush
+  // Flush the 3 memtables that were picked in to_flush
   s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush,
                                        &to_delete);
   ASSERT_OK(s);
 
-  // Note:  now to_flush contains tables[0,1,2,4].  to_flush2 contains
-  // tables[3].
+  // Note:  now to_flush contains tables[0,1,2].  to_flush2 contains
+  // tables[3]. to_flush3 contains tables[4].
   // Current implementation will only commit memtables in the order they were
   // created. So TryInstallMemtableFlushResults will install the first 3 tables
   // in to_flush and stop when it encounters a table not yet flushed.
@@ -779,7 +791,17 @@ TEST_F(MemTableListTest, FlushPendingTest) {
   ASSERT_FALSE(list.IsFlushPending());
   ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
 
-  // Flush the 1 memtable that was picked in to_flush2
+  // Flush the 1 memtable (tables[4]) that was picked in to_flush3
+  s = MemTableListTest::Mock_InstallMemtableFlushResults(
+      &list, mutable_cf_options, to_flush3, &to_delete);
+  ASSERT_OK(s);
+
+  // This will install 0 tables since tables[4] flushed while tables[3] has not
+  // yet flushed.
+  ASSERT_EQ(2, list.NumNotFlushed());
+  ASSERT_EQ(0, to_delete.size());
+
+  // Flush the 1 memtable (tables[3]) that was picked in to_flush2
   s = MemTableListTest::Mock_InstallMemtableFlushResults(
       &list, mutable_cf_options, to_flush2, &to_delete);
   ASSERT_OK(s);
@@ -809,11 +831,11 @@ TEST_F(MemTableListTest, FlushPendingTest) {
   memtable_id = 4;
   // Pick tables to flush. The tables to pick must have ID smaller than or
   // equal to 4. Therefore, no table will be selected in this case.
-  autovector<MemTable*> to_flush4;
+  autovector<MemTable*> to_flush5;
   list.FlushRequested();
   ASSERT_TRUE(list.HasFlushRequested());
-  list.PickMemtablesToFlush(memtable_id, &to_flush4);
-  ASSERT_TRUE(to_flush4.empty());
+  list.PickMemtablesToFlush(memtable_id, &to_flush5);
+  ASSERT_TRUE(to_flush5.empty());
   ASSERT_EQ(1, list.NumNotFlushed());
   ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
   ASSERT_FALSE(list.IsFlushPending());
@@ -823,8 +845,8 @@ TEST_F(MemTableListTest, FlushPendingTest) {
   // equal to 5. Therefore, only tables[5] will be selected.
   memtable_id = 5;
   list.FlushRequested();
-  list.PickMemtablesToFlush(memtable_id, &to_flush4);
-  ASSERT_EQ(1, static_cast<int>(to_flush4.size()));
+  list.PickMemtablesToFlush(memtable_id, &to_flush5);
+  ASSERT_EQ(1, static_cast<int>(to_flush5.size()));
   ASSERT_EQ(1, list.NumNotFlushed());
   ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
   ASSERT_FALSE(list.IsFlushPending());