]> git-server-git.apps.pok.os.sepia.ceph.com Git - rocksdb.git/commitdiff
Fix flush picking non-consecutive memtables (#10921)
authorAndrew Kryczka <andrewkr@fb.com>
Fri, 4 Nov 2022 22:55:54 +0000 (15:55 -0700)
committerFacebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
Fri, 4 Nov 2022 22:55:54 +0000 (15:55 -0700)
Summary:
Prevents `MemTableList::PickMemtablesToFlush()` from picking non-consecutive memtables. It leads to wrong ordering in L0 if the files are committed, or an error like below if force_consistency_checks=true catches it:

```
Corruption: force_consistency_checks: VersionBuilder: L0 file https://github.com/facebook/rocksdb/issues/25 with seqno 320416 368066 vs. file https://github.com/facebook/rocksdb/issues/24 with seqno 336037 352068
```

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10921

Test Plan: fix the expectation in the existing test of this behavior

Reviewed By: riversand963

Differential Revision: D41046935

Pulled By: ajkr

fbshipit-source-id: 783696bff56115063d5dc5856dfaed6a9881d1ab

HISTORY.md
db/memtable_list.cc
db/memtable_list_test.cc

index b57b84374c2c07942920d49716face73470b98a3..33f87d36de82e762b2184748132625c70cd62083 100644 (file)
@@ -6,6 +6,7 @@
 ### Bug Fixes
 * Fix FIFO compaction causing corruption of overlapping seqnos in L0 files due to ingesting files of overlapping seqnos with memtable's under `CompactionOptionsFIFO::allow_compaction=true` or `CompactionOptionsFIFO::age_for_warm>0` or `CompactRange()/CompactFiles()` is used. Before the fix, `force_consistency_checks=true` may catch the corruption before it's exposed to readers, in which case writes returning `Status::Corruption` would be expected.
 * Fix memory corruption error in scans if async_io is enabled. Memory corruption happened if there is IOError while reading the data leading to empty buffer and other buffer already in progress of async read goes again for reading.
+* Fix failed memtable flush retry bug that could cause wrongly ordered updates, which would surface to writers as `Status::Corruption` in case of `force_consistency_checks=true` (default). It affects use cases that enable both parallel flush (`max_background_flushes > 1` or `max_background_jobs >= 8`) and non-default memtable count (`max_write_buffer_number > 2`).
 
 ### New Features
 * Add basic support for user-defined timestamp to Merge (#10819).
index 7cc669cdd5fb1e45daa3d59f767ca38b0dd63d64..1545003aded1219c334f1bae6a91f0bdc6844572 100644 (file)
@@ -419,6 +419,13 @@ void MemTableList::PickMemtablesToFlush(uint64_t max_memtable_id,
             std::max(m->GetNextLogNumber(), *max_next_log_number);
       }
       ret->push_back(m);
+    } else if (!ret->empty()) {
+      // This `break` is necessary to prevent picking non-consecutive memtables
+      // in case `memlist` has one or more entries with
+      // `flush_in_progress_ == true` sandwiched between entries with
+      // `flush_in_progress_ == false`. This could happen after parallel flushes
+      // are picked and the one flushing older memtables is rolled back.
+      break;
     }
   }
   if (!atomic_flush || num_flush_not_started_ == 0) {
index 258f02f96d77c4da950f48544a621b33ceb67d38..8242061afb0d874ae7a644a93a954d77daa4c815 100644 (file)
@@ -744,28 +744,40 @@ TEST_F(MemTableListTest, FlushPendingTest) {
   // Pick tables to flush
   list.PickMemtablesToFlush(
       std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
-  // Should pick 4 of 5 since 1 table has been picked in to_flush2
-  ASSERT_EQ(4, to_flush.size());
+  // Picks three oldest memtables. The fourth oldest is picked in `to_flush2` so
+  // must be excluded. The newest (fifth oldest) is non-consecutive with the
+  // three oldest due to omitting the fourth oldest so must not be picked.
+  ASSERT_EQ(3, to_flush.size());
   ASSERT_EQ(5, list.NumNotFlushed());
   ASSERT_FALSE(list.IsFlushPending());
-  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
 
   // Pick tables to flush again
   autovector<MemTable*> to_flush3;
   list.PickMemtablesToFlush(
       std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush3);
-  ASSERT_EQ(0, to_flush3.size());  // nothing not in progress of being flushed
+  // Picks newest (fifth oldest)
+  ASSERT_EQ(1, to_flush3.size());
+  ASSERT_EQ(5, list.NumNotFlushed());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Nothing left to flush
+  autovector<MemTable*> to_flush4;
+  list.PickMemtablesToFlush(
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush4);
+  ASSERT_EQ(0, to_flush4.size());
   ASSERT_EQ(5, list.NumNotFlushed());
   ASSERT_FALSE(list.IsFlushPending());
   ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
 
-  // Flush the 4 memtables that were picked in to_flush
+  // Flush the 3 memtables that were picked in to_flush
   s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush,
                                        &to_delete);
   ASSERT_OK(s);
 
-  // Note:  now to_flush contains tables[0,1,2,4].  to_flush2 contains
-  // tables[3].
+  // Note:  now to_flush contains tables[0,1,2].  to_flush2 contains
+  // tables[3]. to_flush3 contains tables[4].
   // Current implementation will only commit memtables in the order they were
   // created. So TryInstallMemtableFlushResults will install the first 3 tables
   // in to_flush and stop when it encounters a table not yet flushed.
@@ -781,7 +793,17 @@ TEST_F(MemTableListTest, FlushPendingTest) {
   ASSERT_FALSE(list.IsFlushPending());
   ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
 
-  // Flush the 1 memtable that was picked in to_flush2
+  // Flush the 1 memtable (tables[4]) that was picked in to_flush3
+  s = MemTableListTest::Mock_InstallMemtableFlushResults(
+      &list, mutable_cf_options, to_flush3, &to_delete);
+  ASSERT_OK(s);
+
+  // This will install 0 tables since tables[4] flushed while tables[3] has not
+  // yet flushed.
+  ASSERT_EQ(2, list.NumNotFlushed());
+  ASSERT_EQ(0, to_delete.size());
+
+  // Flush the 1 memtable (tables[3]) that was picked in to_flush2
   s = MemTableListTest::Mock_InstallMemtableFlushResults(
       &list, mutable_cf_options, to_flush2, &to_delete);
   ASSERT_OK(s);
@@ -811,11 +833,11 @@ TEST_F(MemTableListTest, FlushPendingTest) {
   memtable_id = 4;
   // Pick tables to flush. The tables to pick must have ID smaller than or
   // equal to 4. Therefore, no table will be selected in this case.
-  autovector<MemTable*> to_flush4;
+  autovector<MemTable*> to_flush5;
   list.FlushRequested();
   ASSERT_TRUE(list.HasFlushRequested());
-  list.PickMemtablesToFlush(memtable_id, &to_flush4);
-  ASSERT_TRUE(to_flush4.empty());
+  list.PickMemtablesToFlush(memtable_id, &to_flush5);
+  ASSERT_TRUE(to_flush5.empty());
   ASSERT_EQ(1, list.NumNotFlushed());
   ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
   ASSERT_FALSE(list.IsFlushPending());
@@ -825,8 +847,8 @@ TEST_F(MemTableListTest, FlushPendingTest) {
   // equal to 5. Therefore, only tables[5] will be selected.
   memtable_id = 5;
   list.FlushRequested();
-  list.PickMemtablesToFlush(memtable_id, &to_flush4);
-  ASSERT_EQ(1, static_cast<int>(to_flush4.size()));
+  list.PickMemtablesToFlush(memtable_id, &to_flush5);
+  ASSERT_EQ(1, static_cast<int>(to_flush5.size()));
   ASSERT_EQ(1, list.NumNotFlushed());
   ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
   ASSERT_FALSE(list.IsFlushPending());