]> git-server-git.apps.pok.os.sepia.ceph.com Git - rocksdb.git/commitdiff
An iterator may automatically invoke reseeks.
authorDhruba Borthakur <dhruba@fb.com>
Sun, 28 Jul 2013 18:53:08 +0000 (11:53 -0700)
committerDhruba Borthakur <dhruba@fb.com>
Fri, 6 Sep 2013 18:50:53 +0000 (11:50 -0700)
Summary:
An iterator invokes reseek if the number of sequential skips over the
same userkey exceeds a configured number. This makes iter->Next()
faster (because of fewer key compares) if a large number of
adjacent internal keys in a table (sst or memtable) have the
same userkey.

Test Plan: Unit test DBTest.IterReseek.

Reviewers: emayanke, haobo, xjin

Reviewed By: xjin

CC: leveldb, xjin
Differential Revision: https://reviews.facebook.net/D11865

db/db_iter.cc
db/db_test.cc
include/rocksdb/options.h
include/rocksdb/statistics.h
util/options.cc

index e8643d7271abc084d8009573a059e74644cafd26..38e9953049dbe5a332b3d46a666a361555828164 100644 (file)
@@ -65,6 +65,7 @@ class DBIter: public Iterator {
         current_entry_is_merged_(false),
         statistics_(options.statistics) {
     RecordTick(statistics_, NO_ITERATORS, 1);
+    max_skip_ = options.max_sequential_skip_in_iterations;
   }
   virtual ~DBIter() {
     RecordTick(statistics_, NO_ITERATORS, -1);
@@ -129,6 +130,7 @@ class DBIter: public Iterator {
   bool valid_;
   bool current_entry_is_merged_;
   std::shared_ptr<Statistics> statistics_;
+  uint64_t max_skip_;
 
   // No copying allowed
   DBIter(const DBIter&);
@@ -188,12 +190,13 @@ void DBIter::FindNextUserEntry(bool skipping) {
   assert(iter_->Valid());
   assert(direction_ == kForward);
   current_entry_is_merged_ = false;
+  uint64_t num_skipped = 0;
   do {
     ParsedInternalKey ikey;
     if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
       if (skipping &&
           user_comparator_->Compare(ikey.user_key, saved_key_) <= 0) {
-        // skip this entry
+        num_skipped++; // skip this entry
       } else {
         skipping = false;
         switch (ikey.type) {
@@ -202,6 +205,7 @@ void DBIter::FindNextUserEntry(bool skipping) {
             // they are hidden by this deletion.
             SaveKey(ikey.user_key, &saved_key_);
             skipping = true;
+            num_skipped = 0;
             break;
           case kTypeValue:
             valid_ = true;
@@ -220,7 +224,20 @@ void DBIter::FindNextUserEntry(bool skipping) {
         }
       }
     }
-    iter_->Next();
+    // If we have sequentially iterated via numerous keys and still not
+    // found the next user-key, then it is better to seek so that we can
+    // avoid too many key comparisons. We seek to the last occurrence of
+    // our current key by looking for sequence number 0.
+    if (skipping && num_skipped > max_skip_) {
+      num_skipped = 0;
+      std::string last_key;
+      AppendInternalKey(&last_key,
+        ParsedInternalKey(Slice(saved_key_), 0, kValueTypeForSeek));
+      iter_->Seek(last_key);
+      RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+    } else {
+      iter_->Next();
+    }
   } while (iter_->Valid());
   valid_ = false;
 }
@@ -342,6 +359,7 @@ void DBIter::Prev() {
 
 void DBIter::FindPrevUserEntry() {
   assert(direction_ == kReverse);
+  uint64_t num_skipped = 0;
 
   ValueType value_type = kTypeDeletion;
   if (iter_->Valid()) {
@@ -367,7 +385,22 @@ void DBIter::FindPrevUserEntry() {
           saved_value_.assign(raw_value.data(), raw_value.size());
         }
       }
-      iter_->Prev();
+      num_skipped++;
+      // If we have sequentially iterated via numerous keys and still not
+      // found the prev user-key, then it is better to seek so that we can
+      // avoid too many key comparisons. We seek to the first occurrence of
+      // our current key by looking for max sequence number.
+      if (num_skipped > max_skip_) {
+        num_skipped = 0;
+        std::string last_key;
+        AppendInternalKey(&last_key,
+          ParsedInternalKey(Slice(saved_key_), kMaxSequenceNumber,
+                            kValueTypeForSeek));
+        iter_->Seek(last_key);
+        RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+      } else {
+        iter_->Prev();
+      }
     } while (iter_->Valid());
   }
 
index fce858d6c7f88c34d9741fba2581dc2a9d87fd59..3f2879027b4192f9343df593f4e0038e5aa360d8 100644 (file)
@@ -69,6 +69,7 @@ class AtomicCounter {
     count_ = 0;
   }
 };
+
 }
 
 // Special Env used to delay background operations
@@ -1133,6 +1134,95 @@ TEST(DBTest, IterMulti) {
   } while (ChangeCompactOptions());
 }
 
+// Check that we can skip over a run of user keys
+// by using reseek rather than sequential scan
+TEST(DBTest, IterReseek) {
+  Options options = CurrentOptions();
+  options.max_sequential_skip_in_iterations = 3;
+  options.create_if_missing = true;
+  options.statistics = leveldb::CreateDBStatistics();
+  DestroyAndReopen(&options);
+
+  // insert two keys with same userkey and verify that
+  // reseek is not invoked. For each of these test cases,
+  // verify that we can find the next key "b".
+  ASSERT_OK(Put("a",  "one"));
+  ASSERT_OK(Put("a",  "two"));
+  ASSERT_OK(Put("b",  "bone"));
+  Iterator* iter = db_->NewIterator(ReadOptions());
+  iter->SeekToFirst();
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  ASSERT_EQ(IterStatus(iter), "a->two");
+  iter->Next();
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  ASSERT_EQ(IterStatus(iter), "b->bone");
+  delete iter;
+
+  // insert a total of three keys with same userkey and verify
+  // that reseek is still not invoked.
+  ASSERT_OK(Put("a",  "three"));
+  iter = db_->NewIterator(ReadOptions());
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "a->three");
+  iter->Next();
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  ASSERT_EQ(IterStatus(iter), "b->bone");
+  delete iter;
+
+  // insert a total of four keys with same userkey and verify
+  // that reseek is invoked.
+  ASSERT_OK(Put("a",  "four"));
+  iter = db_->NewIterator(ReadOptions());
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "a->four");
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  iter->Next();
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+  ASSERT_EQ(IterStatus(iter), "b->bone");
+  delete iter;
+
+  // Testing reverse iterator
+  // At this point, we have four versions of "a" and one version of "b".
+  // The reseek statistics is already at 1.
+  int num_reseeks = (int)options.statistics.get()->getTickerCount(
+                 NUMBER_OF_RESEEKS_IN_ITERATION);
+
+  // Insert another version of b and assert that SeekToLast does not
+  // invoke a reseek, while the Prev that follows it invokes exactly one.
+  ASSERT_OK(Put("b",  "btwo"));
+  iter = db_->NewIterator(ReadOptions());
+  iter->SeekToLast();
+  ASSERT_EQ(IterStatus(iter), "b->btwo");
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks);
+  iter->Prev();
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks+1);
+  ASSERT_EQ(IterStatus(iter), "a->four");
+  delete iter;
+
+  // insert two more versions of b. This makes a total of 4 versions
+  // of b and 4 versions of a.
+  ASSERT_OK(Put("b",  "bthree"));
+  ASSERT_OK(Put("b",  "bfour"));
+  iter = db_->NewIterator(ReadOptions());
+  iter->SeekToLast();
+  ASSERT_EQ(IterStatus(iter), "b->bfour");
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks + 2);
+  iter->Prev();
+
+  // the previous Prev call should have invoked reseek
+  ASSERT_EQ(options.statistics.get()->getTickerCount(
+            NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks + 3);
+  ASSERT_EQ(IterStatus(iter), "a->four");
+  delete iter;
+}
+
 TEST(DBTest, IterSmallAndLargeMix) {
   do {
     ASSERT_OK(Put("a", "va"));
index 6d028e2561037e16862693a6e159883ef0fb7bf9..0c8bdde3a30f93e6b0f1ff9b9b72e9e145c0955c 100644 (file)
@@ -532,6 +532,13 @@ struct Options {
   // Default: false
   bool filter_deletes;
 
+  // An iteration->Next() sequentially skips over keys with the same
+  // user-key unless this option is set. This number specifies the number
+  // of keys (with the same userkey) that will be sequentially
+  // skipped before a reseek is issued.
+  // Default: 8
+  uint64_t max_sequential_skip_in_iterations;
+
   // This is a factory that provides MemTableRep objects.
   // Default: a factory that provides a skip-list-based implementation of
   // MemTableRep.
index e665278b071b59ed760e8f71f73408b5ac03ad21..5525a092b8e8d59af95982265856a62b0bfe4fbd 100644 (file)
@@ -58,6 +58,8 @@ enum Tickers {
   NUMBER_MULTIGET_KEYS_READ = 19,
   NUMBER_MULTIGET_BYTES_READ = 20,
 
+  // Number of delete records that were not required to be
+  // written to storage because key does not exist
   NUMBER_FILTERED_DELETES = 21,
   NUMBER_MERGE_FAILURES = 22,
   SEQUENCE_NUMBER = 23,
@@ -68,9 +70,15 @@ enum Tickers {
   BLOOM_FILTER_PREFIX_CHECKED = 24,
   BLOOM_FILTER_PREFIX_USEFUL = 25,
 
-  TICKER_ENUM_MAX = 26
+  // Number of times we had to reseek inside an iteration to skip
+  // over large number of keys with same userkey.
+  NUMBER_OF_RESEEKS_IN_ITERATION = 26,
+
+  TICKER_ENUM_MAX = 27
 };
 
+// The order of items listed in  Tickers should be the same as
+// the order listed in TickersNameMap
 const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
   { BLOCK_CACHE_MISS, "rocksdb.block.cache.miss" },
   { BLOCK_CACHE_HIT, "rocksdb.block.cache.hit" },
@@ -97,7 +105,8 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
   { NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures" },
   { SEQUENCE_NUMBER, "rocksdb.sequence.number" },
   { BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked" },
-  { BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" }
+  { BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" },
+  { NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration" }
 };
 
 /**
index 7907f7171f440410ca304a27766dffb93d3be437..3ca71e5ff2c9bdbda2ca5360d2913532f7f693da 100644 (file)
@@ -81,11 +81,11 @@ Options::Options()
       bytes_per_sync(0),
       compaction_style(kCompactionStyleLevel),
       filter_deletes(false),
+      max_sequential_skip_in_iterations(8),
       memtable_factory(std::shared_ptr<SkipListFactory>(new SkipListFactory)),
       compaction_filter_factory(
           std::shared_ptr<CompactionFilterFactory>(
             new DefaultCompactionFilterFactory())) {
-
   assert(memtable_factory.get() != nullptr);
 }
 
@@ -174,6 +174,8 @@ Options::Dump(Logger* log) const
       Log(log,"Options.max_bytes_for_level_multiplier_addtl[%d]: %d",
           i, max_bytes_for_level_multiplier_additional[i]);
     }
+    Log(log,"      Options.max_sequential_skip_in_iterations: %ld",
+        max_sequential_skip_in_iterations);
     Log(log,"             Options.expanded_compaction_factor: %d",
         expanded_compaction_factor);
     Log(log,"               Options.source_compaction_factor: %d",