const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration,
uint64_t version_number, ReadCallback* read_callback,
ColumnFamilyHandleImpl* cfh, bool expose_blob_index, bool allow_refresh) {
- auto mem = arena_.AllocateAligned(sizeof(DBIter));
- db_iter_ = new (mem) DBIter(
- env, read_options, ioptions, mutable_cf_options, ioptions.user_comparator,
- /* iter */ nullptr, version, sequence, true,
- max_sequential_skip_in_iteration, read_callback, cfh, expose_blob_index);
- sv_number_ = version_number;
read_options_ = read_options;
- allow_refresh_ = allow_refresh;
- memtable_range_tombstone_iter_ = nullptr;
-
if (!CheckFSFeatureSupport(env->GetFileSystem().get(),
FSSupportedOps::kAsyncIO)) {
read_options_.async_io = false;
}
+ read_options_.total_order_seek |= ioptions.prefix_seek_opt_in_only;
+
+ auto mem = arena_.AllocateAligned(sizeof(DBIter));
+ db_iter_ = new (mem) DBIter(env, read_options_, ioptions, mutable_cf_options,
+ ioptions.user_comparator,
+ /* iter */ nullptr, version, sequence, true,
+ max_sequential_skip_in_iteration, read_callback,
+ cfh, expose_blob_index);
+
+ sv_number_ = version_number;
+ allow_refresh_ = allow_refresh;
+ memtable_range_tombstone_iter_ = nullptr;
}
Status ArenaWrappedDBIter::Refresh() { return Refresh(nullptr); }
read_opts.total_order_seek = true;
MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena);
merge_iter_builder.AddIterator(super_version->mem->NewIterator(
- read_opts, /*seqno_to_time_mapping=*/nullptr, &arena));
+ read_opts, /*seqno_to_time_mapping=*/nullptr, &arena,
+ /*prefix_extractor=*/nullptr));
super_version->imm->AddIterators(read_opts, /*seqno_to_time_mapping=*/nullptr,
+ /*prefix_extractor=*/nullptr,
&merge_iter_builder,
false /* add_range_tombstone_iter */);
ScopedArenaPtr<InternalIterator> memtable_iter(merge_iter_builder.Finish());
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
CreateAndReopenWithCF({"pikachu"}, options);
- ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+ ASSERT_FALSE(db_->KeyMayExist(ropts, handles_[1], "a", &value));
ASSERT_OK(Put(1, "a", "b"));
bool value_found = false;
uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
ASSERT_TRUE(
db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found));
- ASSERT_TRUE(!value_found);
+ ASSERT_FALSE(value_found);
// assert that no new files were opened and no new blocks were
// read into block cache.
ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
numopen = TestGetTickerCount(options, NO_FILE_OPENS);
cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
- ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+ ASSERT_FALSE(db_->KeyMayExist(ropts, handles_[1], "a", &value));
ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
numopen = TestGetTickerCount(options, NO_FILE_OPENS);
cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
- ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+ ASSERT_FALSE(db_->KeyMayExist(ropts, handles_[1], "a", &value));
ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
numopen = TestGetTickerCount(options, NO_FILE_OPENS);
cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
- ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value));
+ ASSERT_FALSE(db_->KeyMayExist(ropts, handles_[1], "c", &value));
ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
db_->ReleaseSnapshot(snapshot);
}
+namespace {
+std::pair<uint64_t, uint64_t> GetBloomStat(const Options& options, bool sst) {
+ if (sst) {
+ return {options.statistics->getAndResetTickerCount(
+ NON_LAST_LEVEL_SEEK_FILTER_MATCH),
+ options.statistics->getAndResetTickerCount(
+ NON_LAST_LEVEL_SEEK_FILTERED)};
+ } else {
+ auto hit = std::exchange(get_perf_context()->bloom_memtable_hit_count, 0);
+ auto miss = std::exchange(get_perf_context()->bloom_memtable_miss_count, 0);
+ return {hit, miss};
+ }
+}
+
+std::pair<uint64_t, uint64_t> HitAndMiss(uint64_t hits, uint64_t misses) {
+ return {hits, misses};
+}
+} // namespace
-TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) {
- constexpr size_t kPrefixSize = 8;
- const std::string kKey = "key";
- assert(kKey.size() < kPrefixSize);
+TEST_F(DBBloomFilterTest, MemtablePrefixBloom) {
Options options = CurrentOptions();
- options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixSize));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(4));
options.memtable_prefix_bloom_size_ratio = 0.25;
Reopen(options);
- ASSERT_OK(Put(kKey, "v"));
- ASSERT_EQ("v", Get(kKey));
- std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
- iter->Seek(kKey);
+ ASSERT_FALSE(options.prefix_extractor->InDomain("key"));
+ ASSERT_OK(Put("key", "v"));
+ ASSERT_OK(Put("goat1", "g1"));
+ ASSERT_OK(Put("goat2", "g2"));
+
+ // Reset from other tests
+ GetBloomStat(options, false);
+
+ // Out of domain (Get)
+ ASSERT_EQ("v", Get("key"));
+ ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+
+ // In domain (Get)
+ ASSERT_EQ("g1", Get("goat1"));
+ ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false));
+ ASSERT_EQ("NOT_FOUND", Get("goat9"));
+ ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false));
+ ASSERT_EQ("NOT_FOUND", Get("goan1"));
+ ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false));
+
+ ReadOptions ropts;
+ if (options.prefix_seek_opt_in_only) {
+ ropts.prefix_same_as_start = true;
+ }
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ropts));
+ // Out of domain (scan)
+ iter->Seek("ke");
+ ASSERT_OK(iter->status());
ASSERT_TRUE(iter->Valid());
- ASSERT_EQ(kKey, iter->key());
- iter->SeekForPrev(kKey);
+ ASSERT_EQ("key", iter->key());
+ iter->SeekForPrev("kez");
+ ASSERT_OK(iter->status());
ASSERT_TRUE(iter->Valid());
- ASSERT_EQ(kKey, iter->key());
+ ASSERT_EQ("key", iter->key());
+ ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+
+ // In domain (scan)
+ iter->Seek("goan");
+ ASSERT_OK(iter->status());
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false));
+ iter->Seek("goat");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("goat1", iter->key());
+ ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false));
+
+ // Changing prefix extractor should affect prefix query semantics
+ // and bypass the existing memtable Bloom filter
+ ASSERT_OK(db_->SetOptions({{"prefix_extractor", "fixed:5"}}));
+ iter.reset(db_->NewIterator(ropts));
+ // Now out of domain (scan)
+ iter->Seek("goan");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("goat1", iter->key());
+ ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+ // In domain (scan)
+ iter->Seek("goat2");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("goat2", iter->key());
+ ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+ // In domain (scan)
+ if (ropts.prefix_same_as_start) {
+ iter->Seek("goat0");
+ ASSERT_OK(iter->status());
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+ } else {
+ // NOTE: legacy prefix Seek may return keys outside of prefix
+ }
+
+ // Start a fresh new memtable, using new prefix extractor
+ ASSERT_OK(SingleDelete("key"));
+ ASSERT_OK(SingleDelete("goat1"));
+ ASSERT_OK(SingleDelete("goat2"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("key", "_v"));
+ ASSERT_OK(Put("goat1", "_g1"));
+ ASSERT_OK(Put("goat2", "_g2"));
+
+ iter.reset(db_->NewIterator(ropts));
+
+ // Still out of domain (Get)
+ ASSERT_EQ("_v", Get("key"));
+ ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+
+ // Still in domain (Get)
+ ASSERT_EQ("_g1", Get("goat1"));
+ ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false));
+ ASSERT_EQ("NOT_FOUND", Get("goat11"));
+ ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false));
+ ASSERT_EQ("NOT_FOUND", Get("goat9"));
+ ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false));
+ ASSERT_EQ("NOT_FOUND", Get("goan1"));
+ ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false));
+
+ // Now out of domain (scan)
+ iter->Seek("goan");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("goat1", iter->key());
+ ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+ // In domain (scan)
+ iter->Seek("goat2");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("goat2", iter->key());
+ ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false));
+ // In domain (scan)
+ iter->Seek("goat0");
+ ASSERT_OK(iter->status());
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false));
}
class DBBloomFilterTestVaryPrefixAndFormatVer
ASSERT_OK(Put(key1, value1, WriteOptions()));
ASSERT_OK(Put(key3, value3, WriteOptions()));
- std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
+ ReadOptions ropts;
+ if (options_.prefix_seek_opt_in_only) {
+ ropts.prefix_same_as_start = true;
+ }
+ std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ropts));
// check memtable bloom stats
iter->Seek(key1);
iter->Seek(key2);
ASSERT_OK(iter->status());
- ASSERT_TRUE(!iter->Valid());
+ ASSERT_FALSE(iter->Valid());
ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
ASSERT_OK(Flush());
- iter.reset(dbfull()->NewIterator(ReadOptions()));
+ iter.reset(dbfull()->NewIterator(ropts));
// Check SST bloom stats
iter->Seek(key1);
iter->Seek(key2);
ASSERT_OK(iter->status());
- ASSERT_TRUE(!iter->Valid());
+ ASSERT_FALSE(iter->Valid());
ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count);
ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count);
}
PrefixScanInit(this);
count = 0;
env_->random_read_counter_.Reset();
- iter = db_->NewIterator(ReadOptions());
+ ReadOptions ropts;
+ if (options.prefix_seek_opt_in_only) {
+ ropts.prefix_same_as_start = true;
+ }
+ iter = db_->NewIterator(ropts);
for (iter->Seek(prefix); iter->Valid(); iter->Next()) {
if (!iter->key().starts_with(prefix)) {
+ ASSERT_FALSE(ropts.prefix_same_as_start);
break;
}
count++;
bool InDomain(const Slice& src) const override { return src.size() >= 4; }
};
-
-std::pair<uint64_t, uint64_t> GetBloomStat(const Options& options, bool sst) {
- if (sst) {
- return {options.statistics->getAndResetTickerCount(
- NON_LAST_LEVEL_SEEK_FILTER_MATCH),
- options.statistics->getAndResetTickerCount(
- NON_LAST_LEVEL_SEEK_FILTERED)};
- } else {
- auto hit = std::exchange(get_perf_context()->bloom_memtable_hit_count, 0);
- auto miss = std::exchange(get_perf_context()->bloom_memtable_miss_count, 0);
- return {hit, miss};
- }
-}
-
-std::pair<uint64_t, uint64_t> HitAndMiss(uint64_t hits, uint64_t misses) {
- return {hits, misses};
-}
} // anonymous namespace
// This uses a prefix_extractor + comparator combination that violates
if (flushed) { // TODO: support auto_prefix_mode in memtable?
read_options.auto_prefix_mode = true;
} else {
- // TODO: why needed?
- get_perf_context()->bloom_memtable_hit_count = 0;
- get_perf_context()->bloom_memtable_miss_count = 0;
+ // Reset from other tests
+ GetBloomStat(options, flushed);
}
EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 0));
{
if (flushed) { // TODO: support auto_prefix_mode in memtable?
read_options.auto_prefix_mode = true;
} else {
- // TODO: why needed?
- get_perf_context()->bloom_memtable_hit_count = 0;
- get_perf_context()->bloom_memtable_miss_count = 0;
+ // Reset from other tests
+ GetBloomStat(options, flushed);
}
EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 0));
{
bool allow_unprepared_value, ArenaWrappedDBIter* db_iter) {
InternalIterator* internal_iter;
assert(arena != nullptr);
+ auto prefix_extractor =
+ super_version->mutable_cf_options.prefix_extractor.get();
// Need to create internal iterator from the arena.
MergeIteratorBuilder merge_iter_builder(
&cfd->internal_comparator(), arena,
- !read_options.total_order_seek &&
- super_version->mutable_cf_options.prefix_extractor != nullptr,
+ // FIXME? It's not clear what interpretation of prefix seek is needed
+ // here, and no unit test cares about the value provided here.
+ !read_options.total_order_seek && prefix_extractor != nullptr,
read_options.iterate_upper_bound);
// Collect iterator for mutable memtable
auto mem_iter = super_version->mem->NewIterator(
- read_options, super_version->GetSeqnoToTimeMapping(), arena);
+ read_options, super_version->GetSeqnoToTimeMapping(), arena,
+ super_version->mutable_cf_options.prefix_extractor.get());
Status s;
if (!read_options.ignore_range_deletions) {
std::unique_ptr<TruncatedRangeDelIterator> mem_tombstone_iter;
if (s.ok()) {
super_version->imm->AddIterators(
read_options, super_version->GetSeqnoToTimeMapping(),
+ super_version->mutable_cf_options.prefix_extractor.get(),
&merge_iter_builder, !read_options.ignore_range_deletions);
}
TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s);
}
}
if (read_options.tailing) {
+ read_options.total_order_seek |=
+ immutable_db_options_.prefix_seek_opt_in_only;
+
auto iter = new ForwardIterator(this, read_options, cfd, sv,
/* allow_unprepared_value */ true);
result = NewDBIterator(
assert(cf_sv_pairs.size() == column_families.size());
if (read_options.tailing) {
+ read_options.total_order_seek |=
+ immutable_db_options_.prefix_seek_opt_in_only;
+
for (const auto& cf_sv_pair : cf_sv_pairs) {
auto iter = new ForwardIterator(this, read_options, cf_sv_pair.cfd,
cf_sv_pair.super_version,
TableProperties table_properties;
{
ScopedArenaPtr<InternalIterator> iter(
- mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
+ mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena,
+ /*prefix_extractor=*/nullptr));
ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
"[%s] [WriteLevel0TableForRecovery]"
" Level-0 table #%" PRIu64 ": started",
valid_(false),
current_entry_is_merged_(false),
is_key_seqnum_zero_(false),
- prefix_same_as_start_(mutable_cf_options.prefix_extractor
- ? read_options.prefix_same_as_start
- : false),
+ prefix_same_as_start_(
+ prefix_extractor_ ? read_options.prefix_same_as_start : false),
pin_thru_lifetime_(read_options.pin_data),
expect_total_order_inner_iter_(prefix_extractor_ == nullptr ||
read_options.total_order_seek ||
status_.PermitUncheckedError();
assert(timestamp_size_ ==
user_comparator_.user_comparator()->timestamp_size());
+ // prefix_seek_opt_in_only should force total_order_seek whereever the caller
+ // is duplicating the original ReadOptions
+ assert(!ioptions.prefix_seek_opt_in_only || read_options.total_order_seek);
}
Status DBIter::GetProperty(std::string prop_name, std::string* prop) {
bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
bbto.whole_key_filtering = false;
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
- DestroyAndReopen(options);
- // Construct two L1 files with keys:
- // f1:[aaa1 ccc1] f2:[ddd0]
- ASSERT_OK(Put("aaa1", ""));
- ASSERT_OK(Put("ccc1", ""));
- ASSERT_OK(Flush());
- ASSERT_OK(Put("ddd0", ""));
- ASSERT_OK(Flush());
- CompactRangeOptions cro;
- cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
- ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ // This test is also the primary test for prefix_seek_opt_in_only
+ for (bool opt_in : {false, true}) {
+ options.prefix_seek_opt_in_only = opt_in;
+ DestroyAndReopen(options);
- Iterator* iter = db_->NewIterator(ReadOptions());
- ASSERT_OK(iter->status());
+ // Construct two L1 files with keys:
+ // f1:[aaa1 ccc1] f2:[ddd0]
+ ASSERT_OK(Put("aaa1", ""));
+ ASSERT_OK(Put("ccc1", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("ddd0", ""));
+ ASSERT_OK(Flush());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
- // Bloom filter is filterd out by f1.
- // This is just one of several valid position following the contract.
- // Postioning to ccc1 or ddd0 is also valid. This is just to validate
- // the behavior of the current implementation. If underlying implementation
- // changes, the test might fail here.
- iter->Seek("bbb1");
- ASSERT_OK(iter->status());
- ASSERT_FALSE(iter->Valid());
+ ReadOptions ropts;
+ for (bool same : {false, true}) {
+ ropts.prefix_same_as_start = same;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ropts));
+ ASSERT_OK(iter->status());
- delete iter;
+ iter->Seek("bbb1");
+ ASSERT_OK(iter->status());
+ if (opt_in && !same) {
+ // Unbounded total order seek
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), "ccc1");
+ } else {
+ // Bloom filter is filterd out by f1. When same == false, this is just
+ // one valid position following the contract. Postioning to ccc1 or ddd0
+ // is also valid. This is just to validate the behavior of the current
+ // implementation. If underlying implementation changes, the test might
+ // fail here.
+ ASSERT_FALSE(iter->Valid());
+ }
+ }
+ }
}
TEST_F(DBTest2, RowCacheSnapshot) {
// create a DB with block prefix index
BlockBasedTableOptions table_options;
Options options = CurrentOptions();
+ options.prefix_seek_opt_in_only = false; // Use legacy prefix seek
// Sometimes filter is checked based on upper bound. Assert counters
// for that case. Otherwise, only check data correctness.
TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithPrefixLessThanKey) {
Options options = CurrentOptions();
+ options.prefix_seek_opt_in_only = false; // Use legacy prefix seek
options.env = env_;
options.create_if_missing = true;
options.prefix_extractor.reset(NewFixedPrefixTransform(3));
TestComparator test_cmp(kTimestampSize);
options.comparator = &test_cmp;
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.prefix_seek_opt_in_only = false; // Use legacy prefix seek
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
DestroyAndReopen(options);
const std::vector<std::string> timestamps = {Timestamp(1, 1), Timestamp(0, 2),
std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
range_del_iters;
for (MemTable* m : mems_) {
- memtables.push_back(
- m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
+ memtables.push_back(m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr,
+ &arena, /*prefix_extractor=*/nullptr));
auto* range_del_iter = m->NewRangeTombstoneIterator(
ro, kMaxSequenceNumber, true /* immutable_memtable */);
if (range_del_iter != nullptr) {
db_options_.info_log,
"[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n",
cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber());
- memtables.push_back(
- m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
+ memtables.push_back(m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr,
+ &arena, /*prefix_extractor=*/nullptr));
auto* range_del_iter = m->NewRangeTombstoneIterator(
ro, kMaxSequenceNumber, true /* immutable_memtable */);
if (range_del_iter != nullptr) {
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping =
sv_->GetSeqnoToTimeMapping();
mutable_iter_ =
- sv_->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_);
- sv_->imm->AddIterators(read_options_, seqno_to_time_mapping, &imm_iters_,
- &arena_);
+ sv_->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_,
+ sv_->mutable_cf_options.prefix_extractor.get());
+ sv_->imm->AddIterators(read_options_, seqno_to_time_mapping,
+ sv_->mutable_cf_options.prefix_extractor.get(),
+ &imm_iters_, &arena_);
if (!read_options_.ignore_range_deletions) {
std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
sv_->mem->NewRangeTombstoneIterator(
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping =
svnew->GetSeqnoToTimeMapping();
mutable_iter_ =
- svnew->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_);
- svnew->imm->AddIterators(read_options_, seqno_to_time_mapping, &imm_iters_,
- &arena_);
+ svnew->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_,
+ svnew->mutable_cf_options.prefix_extractor.get());
+ svnew->imm->AddIterators(read_options_, seqno_to_time_mapping,
+ svnew->mutable_cf_options.prefix_extractor.get(),
+ &imm_iters_, &arena_);
ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
kMaxSequenceNumber /* upper_bound */);
if (!read_options_.ignore_range_deletions) {
class MemTableIterator : public InternalIterator {
public:
- MemTableIterator(const MemTable& mem, const ReadOptions& read_options,
- UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
- Arena* arena, bool use_range_del_table = false)
+ enum Kind { kPointEntries, kRangeDelEntries };
+ MemTableIterator(
+ Kind kind, const MemTable& mem, const ReadOptions& read_options,
+ UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping = nullptr,
+ Arena* arena = nullptr,
+ const SliceTransform* cf_prefix_extractor = nullptr)
: bloom_(nullptr),
prefix_extractor_(mem.prefix_extractor_),
comparator_(mem.comparator_),
arena_mode_(arena != nullptr),
paranoid_memory_checks_(mem.moptions_.paranoid_memory_checks),
allow_data_in_error(mem.moptions_.allow_data_in_errors) {
- if (use_range_del_table) {
+ if (kind == kRangeDelEntries) {
iter_ = mem.range_del_table_->GetIterator(arena);
- } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek &&
- !read_options.auto_prefix_mode) {
+ } else if (prefix_extractor_ != nullptr &&
+ // NOTE: checking extractor equivalence when not pointer
+ // equivalent is arguably too expensive for memtable
+ prefix_extractor_ == cf_prefix_extractor &&
+ (read_options.prefix_same_as_start ||
+ (!read_options.total_order_seek &&
+ !read_options.auto_prefix_mode))) {
// Auto prefix mode is not implemented in memtable yet.
+ assert(kind == kPointEntries);
bloom_ = mem.bloom_filter_.get();
iter_ = mem.table_->GetDynamicPrefixIterator(arena);
} else {
+ assert(kind == kPointEntries);
iter_ = mem.table_->GetIterator(arena);
}
status_.PermitUncheckedError();
// iterator should only use prefix bloom filter
Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_));
if (prefix_extractor_->InDomain(user_k_without_ts)) {
- if (!bloom_->MayContain(
- prefix_extractor_->Transform(user_k_without_ts))) {
+ Slice prefix = prefix_extractor_->Transform(user_k_without_ts);
+ if (!bloom_->MayContain(prefix)) {
PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
valid_ = false;
return;
InternalIterator* MemTable::NewIterator(
const ReadOptions& read_options,
- UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena) {
+ UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena,
+ const SliceTransform* prefix_extractor) {
assert(arena != nullptr);
auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
return new (mem)
- MemTableIterator(*this, read_options, seqno_to_time_mapping, arena);
+ MemTableIterator(MemTableIterator::kPointEntries, *this, read_options,
+ seqno_to_time_mapping, arena, prefix_extractor);
}
FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator(
cache->reader_mutex.lock();
if (!cache->tombstones) {
auto* unfragmented_iter = new MemTableIterator(
- *this, read_options, nullptr /* seqno_to_time_mapping= */,
- nullptr /* arena */, true /* use_range_del_table */);
+ MemTableIterator::kRangeDelEntries, *this, read_options);
cache->tombstones.reset(new FragmentedRangeTombstoneList(
std::unique_ptr<InternalIterator>(unfragmented_iter),
comparator_.comparator));
if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) {
// TODO: plumb Env::IOActivity, Env::IOPriority
auto* unfragmented_iter = new MemTableIterator(
- *this, ReadOptions(), nullptr /*seqno_to_time_mapping=*/,
- nullptr /* arena */, true /* use_range_del_table */);
+ MemTableIterator::kRangeDelEntries, *this, ReadOptions());
fragmented_range_tombstone_list_ =
std::make_unique<FragmentedRangeTombstoneList>(
// data, currently only needed for iterators serving user reads.
InternalIterator* NewIterator(
const ReadOptions& read_options,
- UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena);
+ UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena,
+ const SliceTransform* prefix_extractor);
// Returns an iterator that yields the range tombstones of the memtable.
// The caller must ensure that the underlying MemTable remains live
void MemTableListVersion::AddIterators(
const ReadOptions& options,
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
+ const SliceTransform* prefix_extractor,
std::vector<InternalIterator*>* iterator_list, Arena* arena) {
for (auto& m : memlist_) {
- iterator_list->push_back(
- m->NewIterator(options, seqno_to_time_mapping, arena));
+ iterator_list->push_back(m->NewIterator(options, seqno_to_time_mapping,
+ arena, prefix_extractor));
}
}
void MemTableListVersion::AddIterators(
const ReadOptions& options,
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
+ const SliceTransform* prefix_extractor,
MergeIteratorBuilder* merge_iter_builder, bool add_range_tombstone_iter) {
for (auto& m : memlist_) {
- auto mem_iter = m->NewIterator(options, seqno_to_time_mapping,
- merge_iter_builder->GetArena());
+ auto mem_iter =
+ m->NewIterator(options, seqno_to_time_mapping,
+ merge_iter_builder->GetArena(), prefix_extractor);
if (!add_range_tombstone_iter || options.ignore_range_deletions) {
merge_iter_builder->AddIterator(mem_iter);
} else {
void AddIterators(const ReadOptions& options,
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
+ const SliceTransform* prefix_extractor,
std::vector<InternalIterator*>* iterator_list,
Arena* arena);
void AddIterators(const ReadOptions& options,
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
+ const SliceTransform* prefix_extractor,
MergeIteratorBuilder* merge_iter_builder,
bool add_range_tombstone_iter);
ro.total_order_seek = true;
Arena arena;
ScopedArenaPtr<InternalIterator> iter(
- mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
+ mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena,
+ /*prefix_extractor=*/nullptr));
int64_t _current_time = 0;
immutable_db_options_.clock->GetCurrentTime(&_current_time)
.PermitUncheckedError(); // ignore error
InternalIterator* iter;
if (i == 0) {
iter = mem->NewIterator(ReadOptions(), /*seqno_to_time_mapping=*/nullptr,
- &arena);
+ &arena, /*prefix_extractor=*/nullptr);
arena_iter_guard.reset(iter);
} else {
iter = mem->NewRangeTombstoneIterator(ReadOptions(),
ro.total_order_seek = true;
expect_total_order = true;
} else if (thread->rand.OneIn(4)) {
- ro.total_order_seek = false;
+ ro.total_order_seek = thread->rand.OneIn(2);
ro.auto_prefix_mode = true;
expect_total_order = true;
} else if (options_.prefix_extractor.get() == nullptr) {
// eventually be obsolete and removed as Identity files are phased out.
bool write_identity_file = true;
+ // Historically, when prefix_extractor != nullptr, iterators have an
+ // unfortunate default semantics of *possibly* only returning data
+ // within the same prefix. To avoid "spooky action at a distance," iterator
+ // bounds should come from the instantiation or seeking of the iterator,
+ // not from a mutable column family option.
+ //
+ // When set to true, it is as if every iterator is created with
+ // total_order_seek=true and only auto_prefix_mode=true and
+ // prefix_same_as_start=true can take advantage of prefix seek optimizations.
+ bool prefix_seek_opt_in_only = false;
+
// The number of bytes to prefetch when reading the log. This is mostly useful
// for reading a remotely located log, as it can save the number of
// round-trips. If 0, then the prefetching is disabled.
bool auto_prefix_mode = false;
// Enforce that the iterator only iterates over the same prefix as the seek.
- // This option is effective only for prefix seeks, i.e. prefix_extractor is
- // non-null for the column family and total_order_seek is false. Unlike
- // iterate_upper_bound, prefix_same_as_start only works within a prefix
- // but in both directions.
+ // This makes the iterator bounds dependent on the column family's current
+ // prefix_extractor, which is mutable. When SST files have been built with
+ // the same prefix extractor, prefix filtering optimizations will be used
+ // for both Seek and SeekForPrev.
bool prefix_same_as_start = false;
// Keep the blocks loaded by the iterator pinned in memory as long as the
ROCKSDB_NAMESPACE::Arena arena;
ROCKSDB_NAMESPACE::ScopedArenaPtr<ROCKSDB_NAMESPACE::InternalIterator> iter(
mem->NewIterator(ROCKSDB_NAMESPACE::ReadOptions(),
- /*seqno_to_time_mapping=*/nullptr, &arena));
+ /*seqno_to_time_mapping=*/nullptr, &arena,
+ /*prefix_extractor=*/nullptr));
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ROCKSDB_NAMESPACE::ParsedInternalKey ikey;
ikey.clear();
{offsetof(struct ImmutableDBOptions, avoid_unnecessary_blocking_io),
OptionType::kBoolean, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
+ {"prefix_seek_opt_in_only",
+ {offsetof(struct ImmutableDBOptions, prefix_seek_opt_in_only),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
{"write_dbid_to_manifest",
{offsetof(struct ImmutableDBOptions, write_dbid_to_manifest),
OptionType::kBoolean, OptionVerificationType::kNormal,
background_close_inactive_wals(options.background_close_inactive_wals),
atomic_flush(options.atomic_flush),
avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io),
+ prefix_seek_opt_in_only(options.prefix_seek_opt_in_only),
persist_stats_to_disk(options.persist_stats_to_disk),
write_dbid_to_manifest(options.write_dbid_to_manifest),
write_identity_file(options.write_identity_file),
ROCKS_LOG_HEADER(log,
" Options.avoid_unnecessary_blocking_io: %d",
avoid_unnecessary_blocking_io);
+ ROCKS_LOG_HEADER(log, " Options.prefix_seek_opt_in_only: %d",
+ prefix_seek_opt_in_only);
ROCKS_LOG_HEADER(log, " Options.persist_stats_to_disk: %u",
persist_stats_to_disk);
ROCKS_LOG_HEADER(log, " Options.write_dbid_to_manifest: %d",
bool background_close_inactive_wals;
bool atomic_flush;
bool avoid_unnecessary_blocking_io;
+ bool prefix_seek_opt_in_only;
bool persist_stats_to_disk;
bool write_dbid_to_manifest;
bool write_identity_file;
if (arena == nullptr) {
return new BlockBasedTableIterator(
this, read_options, rep_->internal_comparator, std::move(index_iter),
- !skip_filters && !read_options.total_order_seek &&
+ !skip_filters &&
+ (!read_options.total_order_seek || read_options.auto_prefix_mode ||
+ read_options.prefix_same_as_start) &&
prefix_extractor != nullptr,
need_upper_bound_check, prefix_extractor, caller,
compaction_readahead_size, allow_unprepared_value);
auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator));
return new (mem) BlockBasedTableIterator(
this, read_options, rep_->internal_comparator, std::move(index_iter),
- !skip_filters && !read_options.total_order_seek &&
+ !skip_filters &&
+ (!read_options.total_order_seek || read_options.auto_prefix_mode ||
+ read_options.prefix_same_as_start) &&
prefix_extractor != nullptr,
need_upper_bound_check, prefix_extractor, caller,
compaction_readahead_size, allow_unprepared_value);
assert(table_properties_);
// Auto prefix mode is not implemented in PlainTable.
- bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek &&
- !options.auto_prefix_mode;
+ bool use_prefix_seek =
+ !IsTotalOrderMode() &&
+ (options.prefix_same_as_start ||
+ (!options.total_order_seek && !options.auto_prefix_mode));
if (arena == nullptr) {
return new PlainTableIterator(this, use_prefix_seek);
} else {
const SliceTransform* /*prefix_extractor*/) const override {
return new KeyConvertingIterator(
memtable_->NewIterator(ReadOptions(), /*seqno_to_time_mapping=*/nullptr,
- &arena_),
+ &arena_, /*prefix_extractor=*/nullptr),
true);
}
std::unique_ptr<InternalIterator> iter_guard;
InternalIterator* iter;
if (i == 0) {
- iter = GetMemTable()->NewIterator(
- ReadOptions(), /*seqno_to_time_mapping=*/nullptr, &arena);
+ iter = GetMemTable()->NewIterator(ReadOptions(),
+ /*seqno_to_time_mapping=*/nullptr,
+ &arena, /*prefix_extractor=*/nullptr);
arena_iter_guard.reset(iter);
} else {
iter = GetMemTable()->NewRangeTombstoneIterator(
--- /dev/null
+* Fix handling of dynamic change of `prefix_extractor` with memtable prefix filter. Previously, prefix seek could mix different prefix interpretations between memtable and SST files. Now the latest `prefix_extractor` at the time of iterator creation or refresh is respected.
--- /dev/null
+* Add new option `prefix_seek_opt_in_only` that makes iterators generally safer when you might set a `prefix_extractor`. When `prefix_seek_opt_in_only=true`, which is expected to be the future default, prefix seek is only used when `prefix_same_as_start` or `auto_prefix_mode` are set. Also, `prefix_same_as_start` and `auto_prefix_mode` now allow prefix filtering even with `total_order_seek=true`.