current_entry_is_merged_(false),
statistics_(options.statistics) {
RecordTick(statistics_, NO_ITERATORS, 1);
+ max_skip_ = options.max_sequential_skip_in_iterations;
}
virtual ~DBIter() {
RecordTick(statistics_, NO_ITERATORS, -1);
bool valid_;
bool current_entry_is_merged_;
std::shared_ptr<Statistics> statistics_;
+ uint64_t max_skip_;
// No copying allowed
DBIter(const DBIter&);
assert(iter_->Valid());
assert(direction_ == kForward);
current_entry_is_merged_ = false;
+ uint64_t num_skipped = 0;
do {
ParsedInternalKey ikey;
if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
if (skipping &&
user_comparator_->Compare(ikey.user_key, saved_key_) <= 0) {
- // skip this entry
+ num_skipped++; // skip this entry
} else {
skipping = false;
switch (ikey.type) {
// they are hidden by this deletion.
SaveKey(ikey.user_key, &saved_key_);
skipping = true;
+ num_skipped = 0;
break;
case kTypeValue:
valid_ = true;
}
}
}
- iter_->Next();
+ // If we have sequentially iterated over numerous keys and still not
+ // found the next user-key, then it is better to seek so that we can
+ // avoid too many key comparisons. We seek to the last occurrence of
+ // our current key by looking for sequence number 0.
+ if (skipping && num_skipped > max_skip_) {
+ num_skipped = 0;
+ std::string last_key;
+ AppendInternalKey(&last_key,
+ ParsedInternalKey(Slice(saved_key_), 0, kValueTypeForSeek));
+ iter_->Seek(last_key);
+ RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+ } else {
+ iter_->Next();
+ }
} while (iter_->Valid());
valid_ = false;
}
void DBIter::FindPrevUserEntry() {
assert(direction_ == kReverse);
+ uint64_t num_skipped = 0;
ValueType value_type = kTypeDeletion;
if (iter_->Valid()) {
saved_value_.assign(raw_value.data(), raw_value.size());
}
}
- iter_->Prev();
+ num_skipped++;
+ // If we have sequentially iterated over numerous keys and still not
+ // found the prev user-key, then it is better to seek so that we can
+ // avoid too many key comparisons. We seek to the first occurrence of
+ // our current key by looking for max sequence number.
+ if (num_skipped > max_skip_) {
+ num_skipped = 0;
+ std::string last_key;
+ AppendInternalKey(&last_key,
+ ParsedInternalKey(Slice(saved_key_), kMaxSequenceNumber,
+ kValueTypeForSeek));
+ iter_->Seek(last_key);
+ RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+ } else {
+ iter_->Prev();
+ }
} while (iter_->Valid());
}
count_ = 0;
}
};
+
}
// Special Env used to delay background operations
} while (ChangeCompactOptions());
}
+// Check that we can skip over a run of user keys
+// by using reseek rather than sequential scan.
+//
+// The test lowers max_sequential_skip_in_iterations to 3, writes several
+// versions of the same user key, and then reads the
+// NUMBER_OF_RESEEKS_IN_ITERATION ticker after each iterator step to
+// confirm when a reseek was (or was not) issued, both for forward
+// (Next) and reverse (Prev) iteration.
+TEST(DBTest, IterReseek) {
+ Options options = CurrentOptions();
+ // Use a small skip limit so that a handful of Puts is enough to
+ // cross it.
+ options.max_sequential_skip_in_iterations = 3;
+ options.create_if_missing = true;
+ // Fresh statistics object so the reseek ticker starts from zero.
+ options.statistics = leveldb::CreateDBStatistics();
+ DestroyAndReopen(&options);
+
+ // insert two keys with same userkey and verify that
+ // reseek is not invoked. For each of these test cases,
+ // verify that we can find the next key "b".
+ ASSERT_OK(Put("a", "one"));
+ ASSERT_OK(Put("a", "two"));
+ ASSERT_OK(Put("b", "bone"));
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ iter->SeekToFirst();
+ ASSERT_EQ(options.statistics.get()->getTickerCount(
+ NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+ ASSERT_EQ(IterStatus(iter), "a->two");
+ iter->Next();
+ ASSERT_EQ(options.statistics.get()->getTickerCount(
+ NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+ ASSERT_EQ(IterStatus(iter), "b->bone");
+ delete iter;
+
+ // insert a total of three keys with same userkey and verify
+ // that reseek is still not invoked.
+ ASSERT_OK(Put("a", "three"));
+ iter = db_->NewIterator(ReadOptions());
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->three");
+ iter->Next();
+ ASSERT_EQ(options.statistics.get()->getTickerCount(
+ NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+ ASSERT_EQ(IterStatus(iter), "b->bone");
+ delete iter;
+
+ // insert a total of four keys with same userkey and verify
+ // that reseek is invoked.
+ ASSERT_OK(Put("a", "four"));
+ iter = db_->NewIterator(ReadOptions());
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->four");
+ ASSERT_EQ(options.statistics.get()->getTickerCount(
+ NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+ iter->Next();
+ ASSERT_EQ(options.statistics.get()->getTickerCount(
+ NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+ ASSERT_EQ(IterStatus(iter), "b->bone");
+ delete iter;
+
+ // Testing reverse iterator
+ // At this point, we have four versions of "a" and one version of "b".
+ // The reseek statistics is already at 1.
+ // Capture the current ticker value so the reverse-iteration checks
+ // below can be written relative to it.
+ int num_reseeks = (int)options.statistics.get()->getTickerCount(
+ NUMBER_OF_RESEEKS_IN_ITERATION);
+
+ // Insert another version of b and assert that reseek is not invoked
+ // while positioning on the last key; the following Prev() onto "a"
+ // is expected to bump the ticker by one.
+ ASSERT_OK(Put("b", "btwo"));
+ iter = db_->NewIterator(ReadOptions());
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "b->btwo");
+ ASSERT_EQ(options.statistics.get()->getTickerCount(
+ NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks);
+ iter->Prev();
+ ASSERT_EQ(options.statistics.get()->getTickerCount(
+ NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks+1);
+ ASSERT_EQ(IterStatus(iter), "a->four");
+ delete iter;
+
+ // insert two more versions of b. This makes a total of 4 versions
+ // of b and 4 versions of a.
+ ASSERT_OK(Put("b", "bthree"));
+ ASSERT_OK(Put("b", "bfour"));
+ iter = db_->NewIterator(ReadOptions());
+ iter->SeekToLast();
+ // With four versions of b, SeekToLast itself is expected to trigger
+ // one reseek.
+ ASSERT_EQ(IterStatus(iter), "b->bfour");
+ ASSERT_EQ(options.statistics.get()->getTickerCount(
+ NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks + 2);
+ iter->Prev();
+
+ // the previous Prev call should have invoked reseek
+ ASSERT_EQ(options.statistics.get()->getTickerCount(
+ NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks + 3);
+ ASSERT_EQ(IterStatus(iter), "a->four");
+ delete iter;
+}
+
TEST(DBTest, IterSmallAndLargeMix) {
do {
ASSERT_OK(Put("a", "va"));
// Default: false
bool filter_deletes;
+ // An iteration->Next() sequentially skips over keys with the same
+ // user-key unless this option is set. This number specifies the number
+ // of keys (with the same userkey) that will be sequentially
+ // skipped before a reseek is issued.
+ // Default: 8
+ uint64_t max_sequential_skip_in_iterations;
+
// This is a factory that provides MemTableRep objects.
// Default: a factory that provides a skip-list-based implementation of
// MemTableRep.
NUMBER_MULTIGET_KEYS_READ = 19,
NUMBER_MULTIGET_BYTES_READ = 20,
+ // Number of delete records that were not required to be
+ // written to storage because the key does not exist
NUMBER_FILTERED_DELETES = 21,
NUMBER_MERGE_FAILURES = 22,
SEQUENCE_NUMBER = 23,
BLOOM_FILTER_PREFIX_CHECKED = 24,
BLOOM_FILTER_PREFIX_USEFUL = 25,
- TICKER_ENUM_MAX = 26
+ // Number of times we had to reseek inside an iteration to skip
+ // over a large number of keys with the same userkey.
+ NUMBER_OF_RESEEKS_IN_ITERATION = 26,
+
+ TICKER_ENUM_MAX = 27
};
+// The order of items listed in Tickers should be the same as
+// the order listed in TickersNameMap
const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
{ BLOCK_CACHE_MISS, "rocksdb.block.cache.miss" },
{ BLOCK_CACHE_HIT, "rocksdb.block.cache.hit" },
{ NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures" },
{ SEQUENCE_NUMBER, "rocksdb.sequence.number" },
{ BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked" },
- { BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" }
+ { BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" },
+ { NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration" }
};
/**
bytes_per_sync(0),
compaction_style(kCompactionStyleLevel),
filter_deletes(false),
+ max_sequential_skip_in_iterations(8),
memtable_factory(std::shared_ptr<SkipListFactory>(new SkipListFactory)),
compaction_filter_factory(
std::shared_ptr<CompactionFilterFactory>(
new DefaultCompactionFilterFactory())) {
-
assert(memtable_factory.get() != nullptr);
}
Log(log,"Options.max_bytes_for_level_multiplier_addtl[%d]: %d",
i, max_bytes_for_level_multiplier_additional[i]);
}
+ Log(log," Options.max_sequential_skip_in_iterations: %ld",
+ max_sequential_skip_in_iterations);
Log(log," Options.expanded_compaction_factor: %d",
expanded_compaction_factor);
Log(log," Options.source_compaction_factor: %d",