]> git-server-git.apps.pok.os.sepia.ceph.com Git - rocksdb.git/commitdiff
Steps toward deprecating implicit prefix seek, related fixes (#13026)
authorPeter Dillinger <peterd@meta.com>
Fri, 20 Sep 2024 22:54:19 +0000 (15:54 -0700)
committerFacebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
Fri, 20 Sep 2024 22:54:19 +0000 (15:54 -0700)
Summary:
With some new use cases onboarding to prefix extractors/seek/filters, one of the risks is existing iterator code, e.g. for maintenance tasks, being unintentionally subject to prefix seek semantics. This is a longstanding known design flaw with prefix seek, and `prefix_same_as_start` and `auto_prefix_mode` were steps in the direction of making that obsolete. However, we can't just immediately set `total_order_seek` to true by default, because that would impact so much code instantly.

Here we add a new DB option, `prefix_seek_opt_in_only` that basically allows users to transition to the future behavior when they are ready. When set to true, all iterators will be treated as if `total_order_seek=true` and then the only ways to get prefix seek semantics are with `prefix_same_as_start` or `auto_prefix_mode`.

Related fixes / changes:
* Make sure that `prefix_same_as_start` and `auto_prefix_mode` are compatible with (or override) `total_order_seek` (depending on your interpretation).
* Fix a bug in which a new iterator after dynamically changing the prefix extractor might mix different prefix semantics between memtable and SSTs. Both should use the latest extractor semantics, which means iterators ignoring memtable prefix filters with an old extractor. And that means passing the latest prefix extractor to new memtable iterators that might use prefix seek. (Without the fix, the test added for this fails in many ways.)

Suggested follow-up:
* Investigate a FIXME where a MergeIteratorBuilder is created in db_impl.cc. No unit test detects a change in value that should impact correctness.
* Make memtable prefix bloom compatible with `auto_prefix_mode`, which might require involving the memtablereps because we don't know at iterator creation time (only seek time) whether an auto_prefix_mode seek will be a prefix seek.
* Add `prefix_same_as_start` testing to db_stress

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13026

Test Plan:
tests updated, added. Add combination of `total_order_seek=true` and `auto_prefix_mode=true` to stress test. Ran `make blackbox_crash_test` for a long while.

Manually ran tests with `prefix_seek_opt_in_only=true` as default, looking for unexpected issues. I inspected most of the results and migrated many tests to be ready for such a change (but not all).

Reviewed By: ltamasi

Differential Revision: D63147378

Pulled By: pdillinger

fbshipit-source-id: 1f4477b730683d43b4be7e933338583702d3c25e

26 files changed:
db/arena_wrapped_db_iter.cc
db/column_family.cc
db/db_bloom_filter_test.cc
db/db_impl/db_impl.cc
db/db_impl/db_impl_open.cc
db/db_iter.cc
db/db_test2.cc
db/db_with_timestamp_basic_test.cc
db/flush_job.cc
db/forward_iterator.cc
db/memtable.cc
db/memtable.h
db/memtable_list.cc
db/memtable_list.h
db/repair.cc
db/write_batch_test.cc
db_stress_tool/db_stress_test_base.cc
include/rocksdb/options.h
java/rocksjni/write_batch_test.cc
options/db_options.cc
options/db_options.h
table/block_based/block_based_table_reader.cc
table/plain/plain_table_reader.cc
table/table_test.cc
unreleased_history/bug_fixes/memtable_prefix.md [new file with mode: 0644]
unreleased_history/new_features/prefix_seek_opt_in_only.md [new file with mode: 0644]

index 83cc3cfa5032fcbf58f7aab0991267cb8a69f3ab..22be567fdb0394203b2d3f0062faaaf3a25af911 100644 (file)
@@ -45,20 +45,23 @@ void ArenaWrappedDBIter::Init(
     const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration,
     uint64_t version_number, ReadCallback* read_callback,
     ColumnFamilyHandleImpl* cfh, bool expose_blob_index, bool allow_refresh) {
-  auto mem = arena_.AllocateAligned(sizeof(DBIter));
-  db_iter_ = new (mem) DBIter(
-      env, read_options, ioptions, mutable_cf_options, ioptions.user_comparator,
-      /* iter */ nullptr, version, sequence, true,
-      max_sequential_skip_in_iteration, read_callback, cfh, expose_blob_index);
-  sv_number_ = version_number;
   read_options_ = read_options;
-  allow_refresh_ = allow_refresh;
-  memtable_range_tombstone_iter_ = nullptr;
-
   if (!CheckFSFeatureSupport(env->GetFileSystem().get(),
                              FSSupportedOps::kAsyncIO)) {
     read_options_.async_io = false;
   }
+  read_options_.total_order_seek |= ioptions.prefix_seek_opt_in_only;
+
+  auto mem = arena_.AllocateAligned(sizeof(DBIter));
+  db_iter_ = new (mem) DBIter(env, read_options_, ioptions, mutable_cf_options,
+                              ioptions.user_comparator,
+                              /* iter */ nullptr, version, sequence, true,
+                              max_sequential_skip_in_iteration, read_callback,
+                              cfh, expose_blob_index);
+
+  sv_number_ = version_number;
+  allow_refresh_ = allow_refresh;
+  memtable_range_tombstone_iter_ = nullptr;
 }
 
 Status ArenaWrappedDBIter::Refresh() { return Refresh(nullptr); }
index 2b611fda7ca70e4b2c0c89cfb99cf48633e119ee..8abad2941fa81d5e3b4d87081c31feee965183d1 100644 (file)
@@ -1201,8 +1201,10 @@ Status ColumnFamilyData::RangesOverlapWithMemtables(
   read_opts.total_order_seek = true;
   MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena);
   merge_iter_builder.AddIterator(super_version->mem->NewIterator(
-      read_opts, /*seqno_to_time_mapping=*/nullptr, &arena));
+      read_opts, /*seqno_to_time_mapping=*/nullptr, &arena,
+      /*prefix_extractor=*/nullptr));
   super_version->imm->AddIterators(read_opts, /*seqno_to_time_mapping=*/nullptr,
+                                   /*prefix_extractor=*/nullptr,
                                    &merge_iter_builder,
                                    false /* add_range_tombstone_iter */);
   ScopedArenaPtr<InternalIterator> memtable_iter(merge_iter_builder.Finish());
index 7b42f4ee5e473238ab95f04c545884232b0a1d3b..f0b3c406679b48a8299aef7230e3337eef13849c 100644 (file)
@@ -171,7 +171,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) {
     options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
     CreateAndReopenWithCF({"pikachu"}, options);
 
-    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+    ASSERT_FALSE(db_->KeyMayExist(ropts, handles_[1], "a", &value));
 
     ASSERT_OK(Put(1, "a", "b"));
     bool value_found = false;
@@ -187,7 +187,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) {
     uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
     ASSERT_TRUE(
         db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found));
-    ASSERT_TRUE(!value_found);
+    ASSERT_FALSE(value_found);
     // assert that no new files were opened and no new blocks were
     // read into block cache.
     ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
@@ -197,7 +197,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) {
 
     numopen = TestGetTickerCount(options, NO_FILE_OPENS);
     cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
-    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+    ASSERT_FALSE(db_->KeyMayExist(ropts, handles_[1], "a", &value));
     ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
     ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
 
@@ -207,7 +207,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) {
 
     numopen = TestGetTickerCount(options, NO_FILE_OPENS);
     cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
-    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+    ASSERT_FALSE(db_->KeyMayExist(ropts, handles_[1], "a", &value));
     ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
     ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
 
@@ -215,7 +215,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) {
 
     numopen = TestGetTickerCount(options, NO_FILE_OPENS);
     cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
-    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value));
+    ASSERT_FALSE(db_->KeyMayExist(ropts, handles_[1], "c", &value));
     ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
     ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
 
@@ -2177,24 +2177,146 @@ TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilterMultiGet) {
 
   db_->ReleaseSnapshot(snapshot);
 }
+namespace {
+std::pair<uint64_t, uint64_t> GetBloomStat(const Options& options, bool sst) {
+  if (sst) {
+    return {options.statistics->getAndResetTickerCount(
+                NON_LAST_LEVEL_SEEK_FILTER_MATCH),
+            options.statistics->getAndResetTickerCount(
+                NON_LAST_LEVEL_SEEK_FILTERED)};
+  } else {
+    auto hit = std::exchange(get_perf_context()->bloom_memtable_hit_count, 0);
+    auto miss = std::exchange(get_perf_context()->bloom_memtable_miss_count, 0);
+    return {hit, miss};
+  }
+}
+
+std::pair<uint64_t, uint64_t> HitAndMiss(uint64_t hits, uint64_t misses) {
+  return {hits, misses};
+}
+}  // namespace
 
-TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) {
-  constexpr size_t kPrefixSize = 8;
-  const std::string kKey = "key";
-  assert(kKey.size() < kPrefixSize);
+TEST_F(DBBloomFilterTest, MemtablePrefixBloom) {
   Options options = CurrentOptions();
-  options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixSize));
+  options.prefix_extractor.reset(NewFixedPrefixTransform(4));
   options.memtable_prefix_bloom_size_ratio = 0.25;
   Reopen(options);
-  ASSERT_OK(Put(kKey, "v"));
-  ASSERT_EQ("v", Get(kKey));
-  std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
-  iter->Seek(kKey);
+  ASSERT_FALSE(options.prefix_extractor->InDomain("key"));
+  ASSERT_OK(Put("key", "v"));
+  ASSERT_OK(Put("goat1", "g1"));
+  ASSERT_OK(Put("goat2", "g2"));
+
+  // Reset from other tests
+  GetBloomStat(options, false);
+
+  // Out of domain (Get)
+  ASSERT_EQ("v", Get("key"));
+  ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+
+  // In domain (Get)
+  ASSERT_EQ("g1", Get("goat1"));
+  ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false));
+  ASSERT_EQ("NOT_FOUND", Get("goat9"));
+  ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false));
+  ASSERT_EQ("NOT_FOUND", Get("goan1"));
+  ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false));
+
+  ReadOptions ropts;
+  if (options.prefix_seek_opt_in_only) {
+    ropts.prefix_same_as_start = true;
+  }
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ropts));
+  // Out of domain (scan)
+  iter->Seek("ke");
+  ASSERT_OK(iter->status());
   ASSERT_TRUE(iter->Valid());
-  ASSERT_EQ(kKey, iter->key());
-  iter->SeekForPrev(kKey);
+  ASSERT_EQ("key", iter->key());
+  iter->SeekForPrev("kez");
+  ASSERT_OK(iter->status());
   ASSERT_TRUE(iter->Valid());
-  ASSERT_EQ(kKey, iter->key());
+  ASSERT_EQ("key", iter->key());
+  ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+
+  // In domain (scan)
+  iter->Seek("goan");
+  ASSERT_OK(iter->status());
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false));
+  iter->Seek("goat");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("goat1", iter->key());
+  ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false));
+
+  // Changing prefix extractor should affect prefix query semantics
+  // and bypass the existing memtable Bloom filter
+  ASSERT_OK(db_->SetOptions({{"prefix_extractor", "fixed:5"}}));
+  iter.reset(db_->NewIterator(ropts));
+  // Now out of domain (scan)
+  iter->Seek("goan");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("goat1", iter->key());
+  ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+  // In domain (scan)
+  iter->Seek("goat2");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("goat2", iter->key());
+  ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+  // In domain (scan)
+  if (ropts.prefix_same_as_start) {
+    iter->Seek("goat0");
+    ASSERT_OK(iter->status());
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+  } else {
+    // NOTE: legacy prefix Seek may return keys outside of prefix
+  }
+
+  // Start a fresh new memtable, using new prefix extractor
+  ASSERT_OK(SingleDelete("key"));
+  ASSERT_OK(SingleDelete("goat1"));
+  ASSERT_OK(SingleDelete("goat2"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("key", "_v"));
+  ASSERT_OK(Put("goat1", "_g1"));
+  ASSERT_OK(Put("goat2", "_g2"));
+
+  iter.reset(db_->NewIterator(ropts));
+
+  // Still out of domain (Get)
+  ASSERT_EQ("_v", Get("key"));
+  ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+
+  // Still in domain (Get)
+  ASSERT_EQ("_g1", Get("goat1"));
+  ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false));
+  ASSERT_EQ("NOT_FOUND", Get("goat11"));
+  ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false));
+  ASSERT_EQ("NOT_FOUND", Get("goat9"));
+  ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false));
+  ASSERT_EQ("NOT_FOUND", Get("goan1"));
+  ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false));
+
+  // Now out of domain (scan)
+  iter->Seek("goan");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("goat1", iter->key());
+  ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+  // In domain (scan)
+  iter->Seek("goat2");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("goat2", iter->key());
+  ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false));
+  // In domain (scan)
+  iter->Seek("goat0");
+  ASSERT_OK(iter->status());
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false));
 }
 
 class DBBloomFilterTestVaryPrefixAndFormatVer
@@ -2507,7 +2629,11 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) {
   ASSERT_OK(Put(key1, value1, WriteOptions()));
   ASSERT_OK(Put(key3, value3, WriteOptions()));
 
-  std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
+  ReadOptions ropts;
+  if (options_.prefix_seek_opt_in_only) {
+    ropts.prefix_same_as_start = true;
+  }
+  std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ropts));
 
   // check memtable bloom stats
   iter->Seek(key1);
@@ -2526,13 +2652,13 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) {
 
   iter->Seek(key2);
   ASSERT_OK(iter->status());
-  ASSERT_TRUE(!iter->Valid());
+  ASSERT_FALSE(iter->Valid());
   ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
   ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
 
   ASSERT_OK(Flush());
 
-  iter.reset(dbfull()->NewIterator(ReadOptions()));
+  iter.reset(dbfull()->NewIterator(ropts));
 
   // Check SST bloom stats
   iter->Seek(key1);
@@ -2550,7 +2676,7 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) {
 
   iter->Seek(key2);
   ASSERT_OK(iter->status());
-  ASSERT_TRUE(!iter->Valid());
+  ASSERT_FALSE(iter->Valid());
   ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count);
   ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count);
 }
@@ -2659,9 +2785,14 @@ TEST_F(DBBloomFilterTest, PrefixScan) {
     PrefixScanInit(this);
     count = 0;
     env_->random_read_counter_.Reset();
-    iter = db_->NewIterator(ReadOptions());
+    ReadOptions ropts;
+    if (options.prefix_seek_opt_in_only) {
+      ropts.prefix_same_as_start = true;
+    }
+    iter = db_->NewIterator(ropts);
     for (iter->Seek(prefix); iter->Valid(); iter->Next()) {
       if (!iter->key().starts_with(prefix)) {
+        ASSERT_FALSE(ropts.prefix_same_as_start);
         break;
       }
       count++;
@@ -3397,23 +3528,6 @@ class FixedSuffix4Transform : public SliceTransform {
 
   bool InDomain(const Slice& src) const override { return src.size() >= 4; }
 };
-
-std::pair<uint64_t, uint64_t> GetBloomStat(const Options& options, bool sst) {
-  if (sst) {
-    return {options.statistics->getAndResetTickerCount(
-                NON_LAST_LEVEL_SEEK_FILTER_MATCH),
-            options.statistics->getAndResetTickerCount(
-                NON_LAST_LEVEL_SEEK_FILTERED)};
-  } else {
-    auto hit = std::exchange(get_perf_context()->bloom_memtable_hit_count, 0);
-    auto miss = std::exchange(get_perf_context()->bloom_memtable_miss_count, 0);
-    return {hit, miss};
-  }
-}
-
-std::pair<uint64_t, uint64_t> HitAndMiss(uint64_t hits, uint64_t misses) {
-  return {hits, misses};
-}
 }  // anonymous namespace
 
 // This uses a prefix_extractor + comparator combination that violates
@@ -3520,9 +3634,8 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter2) {
     if (flushed) {  // TODO: support auto_prefix_mode in memtable?
       read_options.auto_prefix_mode = true;
     } else {
-      // TODO: why needed?
-      get_perf_context()->bloom_memtable_hit_count = 0;
-      get_perf_context()->bloom_memtable_miss_count = 0;
+      // Reset from other tests
+      GetBloomStat(options, flushed);
     }
     EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 0));
     {
@@ -3664,9 +3777,8 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter3) {
       if (flushed) {  // TODO: support auto_prefix_mode in memtable?
         read_options.auto_prefix_mode = true;
       } else {
-        // TODO: why needed?
-        get_perf_context()->bloom_memtable_hit_count = 0;
-        get_perf_context()->bloom_memtable_miss_count = 0;
+        // Reset from other tests
+        GetBloomStat(options, flushed);
       }
       EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 0));
       {
index e95561efab9ab2a6da23ae6a1315630e80047c9a..b218bd0ba56dd48c99cf99e8f7ff355fec20c4f7 100644 (file)
@@ -2066,15 +2066,19 @@ InternalIterator* DBImpl::NewInternalIterator(
     bool allow_unprepared_value, ArenaWrappedDBIter* db_iter) {
   InternalIterator* internal_iter;
   assert(arena != nullptr);
+  auto prefix_extractor =
+      super_version->mutable_cf_options.prefix_extractor.get();
   // Need to create internal iterator from the arena.
   MergeIteratorBuilder merge_iter_builder(
       &cfd->internal_comparator(), arena,
-      !read_options.total_order_seek &&
-          super_version->mutable_cf_options.prefix_extractor != nullptr,
+      // FIXME? It's not clear what interpretation of prefix seek is needed
+      // here, and no unit test cares about the value provided here.
+      !read_options.total_order_seek && prefix_extractor != nullptr,
       read_options.iterate_upper_bound);
   // Collect iterator for mutable memtable
   auto mem_iter = super_version->mem->NewIterator(
-      read_options, super_version->GetSeqnoToTimeMapping(), arena);
+      read_options, super_version->GetSeqnoToTimeMapping(), arena,
+      super_version->mutable_cf_options.prefix_extractor.get());
   Status s;
   if (!read_options.ignore_range_deletions) {
     std::unique_ptr<TruncatedRangeDelIterator> mem_tombstone_iter;
@@ -2098,6 +2102,7 @@ InternalIterator* DBImpl::NewInternalIterator(
   if (s.ok()) {
     super_version->imm->AddIterators(
         read_options, super_version->GetSeqnoToTimeMapping(),
+        super_version->mutable_cf_options.prefix_extractor.get(),
         &merge_iter_builder, !read_options.ignore_range_deletions);
   }
   TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s);
@@ -3843,6 +3848,9 @@ Iterator* DBImpl::NewIterator(const ReadOptions& _read_options,
     }
   }
   if (read_options.tailing) {
+    read_options.total_order_seek |=
+        immutable_db_options_.prefix_seek_opt_in_only;
+
     auto iter = new ForwardIterator(this, read_options, cfd, sv,
                                     /* allow_unprepared_value */ true);
     result = NewDBIterator(
@@ -4044,6 +4052,9 @@ Status DBImpl::NewIterators(
 
   assert(cf_sv_pairs.size() == column_families.size());
   if (read_options.tailing) {
+    read_options.total_order_seek |=
+        immutable_db_options_.prefix_seek_opt_in_only;
+
     for (const auto& cf_sv_pair : cf_sv_pairs) {
       auto iter = new ForwardIterator(this, read_options, cf_sv_pair.cfd,
                                       cf_sv_pair.super_version,
index 4fb1008761a8bafc8e0d345cc52ef7bad5232065..dec61c0504fe8b9ad7354b41f7990bdb4f014426 100644 (file)
@@ -1669,7 +1669,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
   TableProperties table_properties;
   {
     ScopedArenaPtr<InternalIterator> iter(
-        mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
+        mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena,
+                         /*prefix_extractor=*/nullptr));
     ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
                     "[%s] [WriteLevel0TableForRecovery]"
                     " Level-0 table #%" PRIu64 ": started",
index b42acc4bc6a79b9ad11b4f6e52ae4775a5dfce02..e02586377fd29b89d4a76e15a6cf1dfdfa23a4b6 100644 (file)
@@ -65,9 +65,8 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
       valid_(false),
       current_entry_is_merged_(false),
       is_key_seqnum_zero_(false),
-      prefix_same_as_start_(mutable_cf_options.prefix_extractor
-                                ? read_options.prefix_same_as_start
-                                : false),
+      prefix_same_as_start_(
+          prefix_extractor_ ? read_options.prefix_same_as_start : false),
       pin_thru_lifetime_(read_options.pin_data),
       expect_total_order_inner_iter_(prefix_extractor_ == nullptr ||
                                      read_options.total_order_seek ||
@@ -93,6 +92,9 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
   status_.PermitUncheckedError();
   assert(timestamp_size_ ==
          user_comparator_.user_comparator()->timestamp_size());
+  // prefix_seek_opt_in_only should force total_order_seek whereever the caller
+  // is duplicating the original ReadOptions
+  assert(!ioptions.prefix_seek_opt_in_only || read_options.total_order_seek);
 }
 
 Status DBIter::GetProperty(std::string prop_name, std::string* prop) {
index 92211ad42ddde755ab1faaea3b0f8c930e98f3d6..fbe66a986b04fa7dbc5ad22e4705d4cfe378455d 100644 (file)
@@ -5597,32 +5597,45 @@ TEST_F(DBTest2, PrefixBloomFilteredOut) {
   bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
   bbto.whole_key_filtering = false;
   options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-  DestroyAndReopen(options);
 
-  // Construct two L1 files with keys:
-  // f1:[aaa1 ccc1] f2:[ddd0]
-  ASSERT_OK(Put("aaa1", ""));
-  ASSERT_OK(Put("ccc1", ""));
-  ASSERT_OK(Flush());
-  ASSERT_OK(Put("ddd0", ""));
-  ASSERT_OK(Flush());
-  CompactRangeOptions cro;
-  cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
-  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  // This test is also the primary test for prefix_seek_opt_in_only
+  for (bool opt_in : {false, true}) {
+    options.prefix_seek_opt_in_only = opt_in;
+    DestroyAndReopen(options);
 
-  Iterator* iter = db_->NewIterator(ReadOptions());
-  ASSERT_OK(iter->status());
+    // Construct two L1 files with keys:
+    // f1:[aaa1 ccc1] f2:[ddd0]
+    ASSERT_OK(Put("aaa1", ""));
+    ASSERT_OK(Put("ccc1", ""));
+    ASSERT_OK(Flush());
+    ASSERT_OK(Put("ddd0", ""));
+    ASSERT_OK(Flush());
+    CompactRangeOptions cro;
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
 
-  // Bloom filter is filterd out by f1.
-  // This is just one of several valid position following the contract.
-  // Postioning to ccc1 or ddd0 is also valid. This is just to validate
-  // the behavior of the current implementation. If underlying implementation
-  // changes, the test might fail here.
-  iter->Seek("bbb1");
-  ASSERT_OK(iter->status());
-  ASSERT_FALSE(iter->Valid());
+    ReadOptions ropts;
+    for (bool same : {false, true}) {
+      ropts.prefix_same_as_start = same;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(ropts));
+      ASSERT_OK(iter->status());
 
-  delete iter;
+      iter->Seek("bbb1");
+      ASSERT_OK(iter->status());
+      if (opt_in && !same) {
+        // Unbounded total order seek
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ(iter->key(), "ccc1");
+      } else {
+        // Bloom filter is filterd out by f1. When same == false, this is just
+        // one valid position following the contract. Postioning to ccc1 or ddd0
+        // is also valid. This is just to validate the behavior of the current
+        // implementation. If underlying implementation changes, the test might
+        // fail here.
+        ASSERT_FALSE(iter->Valid());
+      }
+    }
+  }
 }
 
 TEST_F(DBTest2, RowCacheSnapshot) {
@@ -5987,6 +6000,7 @@ TEST_F(DBTest2, ChangePrefixExtractor) {
     // create a DB with block prefix index
     BlockBasedTableOptions table_options;
     Options options = CurrentOptions();
+    options.prefix_seek_opt_in_only = false;  // Use legacy prefix seek
 
     // Sometimes filter is checked based on upper bound. Assert counters
     // for that case. Otherwise, only check data correctness.
index acb1c05c1d2673d25a295ca7b77e2abc8c4a81b9..60441da0b52460ca0d300778f426494c1292f2c4 100644 (file)
@@ -832,6 +832,7 @@ TEST_P(DBBasicTestWithTimestampTableOptions, GetAndMultiGet) {
 
 TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithPrefixLessThanKey) {
   Options options = CurrentOptions();
+  options.prefix_seek_opt_in_only = false;  // Use legacy prefix seek
   options.env = env_;
   options.create_if_missing = true;
   options.prefix_extractor.reset(NewFixedPrefixTransform(3));
@@ -1009,6 +1010,7 @@ TEST_F(DBBasicTestWithTimestamp, ChangeIterationDirection) {
   TestComparator test_cmp(kTimestampSize);
   options.comparator = &test_cmp;
   options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+  options.prefix_seek_opt_in_only = false;  // Use legacy prefix seek
   options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
   DestroyAndReopen(options);
   const std::vector<std::string> timestamps = {Timestamp(1, 1), Timestamp(0, 2),
index 6bd71dd56247ab29e8e2d1c1b10c6431ac77209a..8206bd298a8bd53e5a5510464081d14042c4085c 100644 (file)
@@ -421,8 +421,8 @@ Status FlushJob::MemPurge() {
   std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
       range_del_iters;
   for (MemTable* m : mems_) {
-    memtables.push_back(
-        m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
+    memtables.push_back(m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr,
+                                       &arena, /*prefix_extractor=*/nullptr));
     auto* range_del_iter = m->NewRangeTombstoneIterator(
         ro, kMaxSequenceNumber, true /* immutable_memtable */);
     if (range_del_iter != nullptr) {
@@ -893,8 +893,8 @@ Status FlushJob::WriteLevel0Table() {
           db_options_.info_log,
           "[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n",
           cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber());
-      memtables.push_back(
-          m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
+      memtables.push_back(m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr,
+                                         &arena, /*prefix_extractor=*/nullptr));
       auto* range_del_iter = m->NewRangeTombstoneIterator(
           ro, kMaxSequenceNumber, true /* immutable_memtable */);
       if (range_del_iter != nullptr) {
index a4cbdb46679d0c9b1812f598d0fcb1cdec504843..0bf7c15ab8e41ab757ef5d8b7241d9a582732497 100644 (file)
@@ -712,9 +712,11 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) {
   UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping =
       sv_->GetSeqnoToTimeMapping();
   mutable_iter_ =
-      sv_->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_);
-  sv_->imm->AddIterators(read_options_, seqno_to_time_mapping, &imm_iters_,
-                         &arena_);
+      sv_->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_,
+                            sv_->mutable_cf_options.prefix_extractor.get());
+  sv_->imm->AddIterators(read_options_, seqno_to_time_mapping,
+                         sv_->mutable_cf_options.prefix_extractor.get(),
+                         &imm_iters_, &arena_);
   if (!read_options_.ignore_range_deletions) {
     std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
         sv_->mem->NewRangeTombstoneIterator(
@@ -781,9 +783,11 @@ void ForwardIterator::RenewIterators() {
   UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping =
       svnew->GetSeqnoToTimeMapping();
   mutable_iter_ =
-      svnew->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_);
-  svnew->imm->AddIterators(read_options_, seqno_to_time_mapping, &imm_iters_,
-                           &arena_);
+      svnew->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_,
+                              svnew->mutable_cf_options.prefix_extractor.get());
+  svnew->imm->AddIterators(read_options_, seqno_to_time_mapping,
+                           svnew->mutable_cf_options.prefix_extractor.get(),
+                           &imm_iters_, &arena_);
   ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
                                        kMaxSequenceNumber /* upper_bound */);
   if (!read_options_.ignore_range_deletions) {
index ef1184ded4807224b2b54a6e8ca9d44618e9f81e..5ba0a0dacc9488f78de60bcebf5d327465e3803b 100644 (file)
@@ -365,9 +365,12 @@ const char* EncodeKey(std::string* scratch, const Slice& target) {
 
 class MemTableIterator : public InternalIterator {
  public:
-  MemTableIterator(const MemTable& mem, const ReadOptions& read_options,
-                   UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
-                   Arena* arena, bool use_range_del_table = false)
+  enum Kind { kPointEntries, kRangeDelEntries };
+  MemTableIterator(
+      Kind kind, const MemTable& mem, const ReadOptions& read_options,
+      UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping = nullptr,
+      Arena* arena = nullptr,
+      const SliceTransform* cf_prefix_extractor = nullptr)
       : bloom_(nullptr),
         prefix_extractor_(mem.prefix_extractor_),
         comparator_(mem.comparator_),
@@ -382,14 +385,21 @@ class MemTableIterator : public InternalIterator {
         arena_mode_(arena != nullptr),
         paranoid_memory_checks_(mem.moptions_.paranoid_memory_checks),
         allow_data_in_error(mem.moptions_.allow_data_in_errors) {
-    if (use_range_del_table) {
+    if (kind == kRangeDelEntries) {
       iter_ = mem.range_del_table_->GetIterator(arena);
-    } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek &&
-               !read_options.auto_prefix_mode) {
+    } else if (prefix_extractor_ != nullptr &&
+               // NOTE: checking extractor equivalence when not pointer
+               // equivalent is arguably too expensive for memtable
+               prefix_extractor_ == cf_prefix_extractor &&
+               (read_options.prefix_same_as_start ||
+                (!read_options.total_order_seek &&
+                 !read_options.auto_prefix_mode))) {
       // Auto prefix mode is not implemented in memtable yet.
+      assert(kind == kPointEntries);
       bloom_ = mem.bloom_filter_.get();
       iter_ = mem.table_->GetDynamicPrefixIterator(arena);
     } else {
+      assert(kind == kPointEntries);
       iter_ = mem.table_->GetIterator(arena);
     }
     status_.PermitUncheckedError();
@@ -433,8 +443,8 @@ class MemTableIterator : public InternalIterator {
       // iterator should only use prefix bloom filter
       Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_));
       if (prefix_extractor_->InDomain(user_k_without_ts)) {
-        if (!bloom_->MayContain(
-                prefix_extractor_->Transform(user_k_without_ts))) {
+        Slice prefix = prefix_extractor_->Transform(user_k_without_ts);
+        if (!bloom_->MayContain(prefix)) {
           PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
           valid_ = false;
           return;
@@ -594,11 +604,13 @@ class MemTableIterator : public InternalIterator {
 
 InternalIterator* MemTable::NewIterator(
     const ReadOptions& read_options,
-    UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena) {
+    UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena,
+    const SliceTransform* prefix_extractor) {
   assert(arena != nullptr);
   auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
   return new (mem)
-      MemTableIterator(*this, read_options, seqno_to_time_mapping, arena);
+      MemTableIterator(MemTableIterator::kPointEntries, *this, read_options,
+                       seqno_to_time_mapping, arena, prefix_extractor);
 }
 
 FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator(
@@ -633,8 +645,7 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal(
     cache->reader_mutex.lock();
     if (!cache->tombstones) {
       auto* unfragmented_iter = new MemTableIterator(
-          *this, read_options, nullptr /* seqno_to_time_mapping= */,
-          nullptr /* arena */, true /* use_range_del_table */);
+          MemTableIterator::kRangeDelEntries, *this, read_options);
       cache->tombstones.reset(new FragmentedRangeTombstoneList(
           std::unique_ptr<InternalIterator>(unfragmented_iter),
           comparator_.comparator));
@@ -655,8 +666,7 @@ void MemTable::ConstructFragmentedRangeTombstones() {
   if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) {
     // TODO: plumb Env::IOActivity, Env::IOPriority
     auto* unfragmented_iter = new MemTableIterator(
-        *this, ReadOptions(), nullptr /*seqno_to_time_mapping=*/,
-        nullptr /* arena */, true /* use_range_del_table */);
+        MemTableIterator::kRangeDelEntries, *this, ReadOptions());
 
     fragmented_range_tombstone_list_ =
         std::make_unique<FragmentedRangeTombstoneList>(
index ca0652bc0442df444d1de168d587d21f53a151b4..194b4543c218fed665bfd5cded7651f51b0f268f 100644 (file)
@@ -210,7 +210,8 @@ class MemTable {
   // data, currently only needed for iterators serving user reads.
   InternalIterator* NewIterator(
       const ReadOptions& read_options,
-      UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena);
+      UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena,
+      const SliceTransform* prefix_extractor);
 
   // Returns an iterator that yields the range tombstones of the memtable.
   // The caller must ensure that the underlying MemTable remains live
index c3612656e24f49a853cacd51644d227e0de30e80..c81c096b5115f2400a13d1fe3f4fdf48f9795eb4 100644 (file)
@@ -214,20 +214,23 @@ Status MemTableListVersion::AddRangeTombstoneIterators(
 void MemTableListVersion::AddIterators(
     const ReadOptions& options,
     UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
+    const SliceTransform* prefix_extractor,
     std::vector<InternalIterator*>* iterator_list, Arena* arena) {
   for (auto& m : memlist_) {
-    iterator_list->push_back(
-        m->NewIterator(options, seqno_to_time_mapping, arena));
+    iterator_list->push_back(m->NewIterator(options, seqno_to_time_mapping,
+                                            arena, prefix_extractor));
   }
 }
 
 void MemTableListVersion::AddIterators(
     const ReadOptions& options,
     UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
+    const SliceTransform* prefix_extractor,
     MergeIteratorBuilder* merge_iter_builder, bool add_range_tombstone_iter) {
   for (auto& m : memlist_) {
-    auto mem_iter = m->NewIterator(options, seqno_to_time_mapping,
-                                   merge_iter_builder->GetArena());
+    auto mem_iter =
+        m->NewIterator(options, seqno_to_time_mapping,
+                       merge_iter_builder->GetArena(), prefix_extractor);
     if (!add_range_tombstone_iter || options.ignore_range_deletions) {
       merge_iter_builder->AddIterator(mem_iter);
     } else {
index dd439de55906aac488104f7013a913d6b12b9371..390b4137dd27bd7909661e9d965f521a8fd5b218 100644 (file)
@@ -113,11 +113,13 @@ class MemTableListVersion {
 
   void AddIterators(const ReadOptions& options,
                     UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
+                    const SliceTransform* prefix_extractor,
                     std::vector<InternalIterator*>* iterator_list,
                     Arena* arena);
 
   void AddIterators(const ReadOptions& options,
                     UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
+                    const SliceTransform* prefix_extractor,
                     MergeIteratorBuilder* merge_iter_builder,
                     bool add_range_tombstone_iter);
 
index 114d36a6a8257f877aabb9621952484b4676d912..f7f4fbafb7f6908c042f0b831a8bc6902efb7e0a 100644 (file)
@@ -443,7 +443,8 @@ class Repairer {
       ro.total_order_seek = true;
       Arena arena;
       ScopedArenaPtr<InternalIterator> iter(
-          mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
+          mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena,
+                           /*prefix_extractor=*/nullptr));
       int64_t _current_time = 0;
       immutable_db_options_.clock->GetCurrentTime(&_current_time)
           .PermitUncheckedError();  // ignore error
index 8db8c32a0a83aa98fdce2b14a150ffa3912af98a..d09dd0167d966b9c41650975bbf20a6a316c0d9e 100644 (file)
@@ -59,7 +59,7 @@ static std::string PrintContents(WriteBatch* b,
     InternalIterator* iter;
     if (i == 0) {
       iter = mem->NewIterator(ReadOptions(), /*seqno_to_time_mapping=*/nullptr,
-                              &arena);
+                              &arena, /*prefix_extractor=*/nullptr);
       arena_iter_guard.reset(iter);
     } else {
       iter = mem->NewRangeTombstoneIterator(ReadOptions(),
index 5d4839df8787083656a813879504b5e957e585d3..5a4c310c415099e856abdbea3535e630d8a9fcc0 100644 (file)
@@ -1592,7 +1592,7 @@ Status StressTest::TestIterateImpl(ThreadState* thread,
     ro.total_order_seek = true;
     expect_total_order = true;
   } else if (thread->rand.OneIn(4)) {
-    ro.total_order_seek = false;
+    ro.total_order_seek = thread->rand.OneIn(2);
     ro.auto_prefix_mode = true;
     expect_total_order = true;
   } else if (options_.prefix_extractor.get() == nullptr) {
index 27feadb804eaef46e249815fba7220e7cab04a69..e272e3a69ad597e5b70cc5166f997f530e7e1bd2 100644 (file)
@@ -1399,6 +1399,17 @@ struct DBOptions {
   // eventually be obsolete and removed as Identity files are phased out.
   bool write_identity_file = true;
 
+  // Historically, when prefix_extractor != nullptr, iterators have an
+  // unfortunate default semantics of *possibly* only returning data
+  // within the same prefix. To avoid "spooky action at a distance," iterator
+  // bounds should come from the instantiation or seeking of the iterator,
+  // not from a mutable column family option.
+  //
+  // When set to true, it is as if every iterator is created with
+  // total_order_seek=true and only auto_prefix_mode=true and
+  // prefix_same_as_start=true can take advantage of prefix seek optimizations.
+  bool prefix_seek_opt_in_only = false;
+
   // The number of bytes to prefetch when reading the log. This is mostly useful
   // for reading a remotely located log, as it can save the number of
   // round-trips. If 0, then the prefetching is disabled.
@@ -1848,10 +1859,10 @@ struct ReadOptions {
   bool auto_prefix_mode = false;
 
   // Enforce that the iterator only iterates over the same prefix as the seek.
-  // This option is effective only for prefix seeks, i.e. prefix_extractor is
-  // non-null for the column family and total_order_seek is false.  Unlike
-  // iterate_upper_bound, prefix_same_as_start only works within a prefix
-  // but in both directions.
+  // This makes the iterator bounds dependent on the column family's current
+  // prefix_extractor, which is mutable. When SST files have been built with
+  // the same prefix extractor, prefix filtering optimizations will be used
+  // for both Seek and SeekForPrev.
   bool prefix_same_as_start = false;
 
   // Keep the blocks loaded by the iterator pinned in memory as long as the
index 53f10998ca86e771ee82781c825f5dbf131a807d..ed456acc3ed3413a99888c0749829bf09ec64b4d 100644 (file)
@@ -60,7 +60,8 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(JNIEnv* env,
   ROCKSDB_NAMESPACE::Arena arena;
   ROCKSDB_NAMESPACE::ScopedArenaPtr<ROCKSDB_NAMESPACE::InternalIterator> iter(
       mem->NewIterator(ROCKSDB_NAMESPACE::ReadOptions(),
-                       /*seqno_to_time_mapping=*/nullptr, &arena));
+                       /*seqno_to_time_mapping=*/nullptr, &arena,
+                       /*prefix_extractor=*/nullptr));
   for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
     ROCKSDB_NAMESPACE::ParsedInternalKey ikey;
     ikey.clear();
index 0b880f4b9399b8dd7dc7e20514033826dda6eb40..e9e8fc5b7e032e6ccc72ae09149064a7b2a6e643 100644 (file)
@@ -403,6 +403,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct ImmutableDBOptions, avoid_unnecessary_blocking_io),
           OptionType::kBoolean, OptionVerificationType::kNormal,
           OptionTypeFlags::kNone}},
+        {"prefix_seek_opt_in_only",
+         {offsetof(struct ImmutableDBOptions, prefix_seek_opt_in_only),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
         {"write_dbid_to_manifest",
          {offsetof(struct ImmutableDBOptions, write_dbid_to_manifest),
           OptionType::kBoolean, OptionVerificationType::kNormal,
@@ -774,6 +778,7 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
       background_close_inactive_wals(options.background_close_inactive_wals),
       atomic_flush(options.atomic_flush),
       avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io),
+      prefix_seek_opt_in_only(options.prefix_seek_opt_in_only),
       persist_stats_to_disk(options.persist_stats_to_disk),
       write_dbid_to_manifest(options.write_dbid_to_manifest),
       write_identity_file(options.write_identity_file),
@@ -948,6 +953,8 @@ void ImmutableDBOptions::Dump(Logger* log) const {
   ROCKS_LOG_HEADER(log,
                    "            Options.avoid_unnecessary_blocking_io: %d",
                    avoid_unnecessary_blocking_io);
+  ROCKS_LOG_HEADER(log, "            Options.prefix_seek_opt_in_only: %d",
+                   prefix_seek_opt_in_only);
   ROCKS_LOG_HEADER(log, "                Options.persist_stats_to_disk: %u",
                    persist_stats_to_disk);
   ROCKS_LOG_HEADER(log, "                Options.write_dbid_to_manifest: %d",
index 842fefca8604cdb5dbd5950634b954484b41f549..ac76ea40d8eb3fdde39c394e184c31c21d560c45 100644 (file)
@@ -87,6 +87,7 @@ struct ImmutableDBOptions {
   bool background_close_inactive_wals;
   bool atomic_flush;
   bool avoid_unnecessary_blocking_io;
+  bool prefix_seek_opt_in_only;
   bool persist_stats_to_disk;
   bool write_dbid_to_manifest;
   bool write_identity_file;
index fe45224d08eba494287e502d49f91a7c558d3c9e..0dfe3e38ab66285751b511bd58e4cf60efc8695b 100644 (file)
@@ -2063,7 +2063,9 @@ InternalIterator* BlockBasedTable::NewIterator(
   if (arena == nullptr) {
     return new BlockBasedTableIterator(
         this, read_options, rep_->internal_comparator, std::move(index_iter),
-        !skip_filters && !read_options.total_order_seek &&
+        !skip_filters &&
+            (!read_options.total_order_seek || read_options.auto_prefix_mode ||
+             read_options.prefix_same_as_start) &&
             prefix_extractor != nullptr,
         need_upper_bound_check, prefix_extractor, caller,
         compaction_readahead_size, allow_unprepared_value);
@@ -2071,7 +2073,9 @@ InternalIterator* BlockBasedTable::NewIterator(
     auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator));
     return new (mem) BlockBasedTableIterator(
         this, read_options, rep_->internal_comparator, std::move(index_iter),
-        !skip_filters && !read_options.total_order_seek &&
+        !skip_filters &&
+            (!read_options.total_order_seek || read_options.auto_prefix_mode ||
+             read_options.prefix_same_as_start) &&
             prefix_extractor != nullptr,
         need_upper_bound_check, prefix_extractor, caller,
         compaction_readahead_size, allow_unprepared_value);
index d3c968f73a9e115c88fed1825a27dfdb3151ce1a..9d4e8eccf3ec009e172a8a90ccefe5af5e23bc40 100644 (file)
@@ -201,8 +201,10 @@ InternalIterator* PlainTableReader::NewIterator(
   assert(table_properties_);
 
   // Auto prefix mode is not implemented in PlainTable.
-  bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek &&
-                         !options.auto_prefix_mode;
+  bool use_prefix_seek =
+      !IsTotalOrderMode() &&
+      (options.prefix_same_as_start ||
+       (!options.total_order_seek && !options.auto_prefix_mode));
   if (arena == nullptr) {
     return new PlainTableIterator(this, use_prefix_seek);
   } else {
index 9eee267614b00886a083f610001a1a062a1f0741..bb0b70222c0b154e9c0ea7000ba8ccfcea9fab24 100644 (file)
@@ -534,7 +534,7 @@ class MemTableConstructor : public Constructor {
       const SliceTransform* /*prefix_extractor*/) const override {
     return new KeyConvertingIterator(
         memtable_->NewIterator(ReadOptions(), /*seqno_to_time_mapping=*/nullptr,
-                               &arena_),
+                               &arena_, /*prefix_extractor=*/nullptr),
         true);
   }
 
@@ -4904,8 +4904,9 @@ TEST_F(MemTableTest, Simple) {
     std::unique_ptr<InternalIterator> iter_guard;
     InternalIterator* iter;
     if (i == 0) {
-      iter = GetMemTable()->NewIterator(
-          ReadOptions(), /*seqno_to_time_mapping=*/nullptr, &arena);
+      iter = GetMemTable()->NewIterator(ReadOptions(),
+                                        /*seqno_to_time_mapping=*/nullptr,
+                                        &arena, /*prefix_extractor=*/nullptr);
       arena_iter_guard.reset(iter);
     } else {
       iter = GetMemTable()->NewRangeTombstoneIterator(
diff --git a/unreleased_history/bug_fixes/memtable_prefix.md b/unreleased_history/bug_fixes/memtable_prefix.md
new file mode 100644 (file)
index 0000000..d7b45c6
--- /dev/null
@@ -0,0 +1 @@
+* Fix handling of dynamic change of `prefix_extractor` with memtable prefix filter. Previously, prefix seek could mix different prefix interpretations between memtable and SST files. Now the latest `prefix_extractor` at the time of iterator creation or refresh is respected.
diff --git a/unreleased_history/new_features/prefix_seek_opt_in_only.md b/unreleased_history/new_features/prefix_seek_opt_in_only.md
new file mode 100644 (file)
index 0000000..71c3cd6
--- /dev/null
@@ -0,0 +1 @@
+* Add new option `prefix_seek_opt_in_only` that makes iterators generally safer when you might set a `prefix_extractor`. When `prefix_seek_opt_in_only=true`, which is expected to be the future default, prefix seek is only used when `prefix_same_as_start` or `auto_prefix_mode` are set. Also, `prefix_same_as_start` and `auto_prefix_mode` now allow prefix filtering even with `total_order_seek=true`.