Fix WriteBatchWithIndex's SeekForPrev() (#4559)

author Siying Dong <siying.d@fb.com>

Fri, 19 Oct 2018 21:38:32 +0000 (14:38 -0700)

committer sdong <siying.d@fb.com>

Fri, 19 Oct 2018 22:44:58 +0000 (15:44 -0700)
author Siying Dong <siying.d@fb.com>
Fri, 19 Oct 2018 21:38:32 +0000 (14:38 -0700)
committer sdong <siying.d@fb.com>
Fri, 19 Oct 2018 22:44:58 +0000 (15:44 -0700)
diff --git a/HISTORY.md b/HISTORY.md

index f26e53c3b5b71ee79cf50b3aff926470086999aa..363239525b6fe486cae20fcd3c2cfc60b32db6c5 100644 (file)
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,4 +1,9 @@
  # Rocksdb Change Log
+
+# Unreleased
+### Bug Fixes
+* Fix the bug that WriteBatchWithIndex's SeekForPrev() doesn't see the entries with the same key.
+
  ## 5.16.5 (10/16/2018)
  ### Bug Fixes
  * Fix slow flush/compaction when DB contains many snapshots. The problem became noticeable to us in DBs with 100,000+ snapshots, though it will affect others at different thresholds.
diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc

index 52eb700246ffe03df68d99f9bfade319f24c8df6..2202d6baf75ab1e93b969816393f0522a4fdc5fc 100644 (file)
--- a/utilities/write_batch_with_index/write_batch_with_index.cc
+++ b/utilities/write_batch_with_index/write_batch_with_index.cc
@@ -352,14 +352,16 @@ class WBWIIteratorImpl : public WBWIIterator {
    }
  
    virtual void SeekToFirst() override {
-    WriteBatchIndexEntry search_entry(WriteBatchIndexEntry::kFlagMin,
-                                      column_family_id_, 0, 0);
+    WriteBatchIndexEntry search_entry(
+        nullptr /* search_key */, column_family_id_,
+        true /* is_forward_direction */, true /* is_seek_to_first */);
      skip_list_iter_.Seek(&search_entry);
    }
  
    virtual void SeekToLast() override {
-    WriteBatchIndexEntry search_entry(WriteBatchIndexEntry::kFlagMin,
-                                      column_family_id_ + 1, 0, 0);
+    WriteBatchIndexEntry search_entry(
+        nullptr /* search_key */, column_family_id_ + 1,
+        true /* is_forward_direction */, true /* is_seek_to_first */);
      skip_list_iter_.Seek(&search_entry);
      if (!skip_list_iter_.Valid()) {
        skip_list_iter_.SeekToLast();
@@ -369,12 +371,16 @@ class WBWIIteratorImpl : public WBWIIterator {
    }
  
    virtual void Seek(const Slice& key) override {
-    WriteBatchIndexEntry search_entry(&key, column_family_id_);
+    WriteBatchIndexEntry search_entry(&key, column_family_id_,
+                                      true /* is_forward_direction */,
+                                      false /* is_seek_to_first */);
      skip_list_iter_.Seek(&search_entry);
    }
  
    virtual void SeekForPrev(const Slice& key) override {
-    WriteBatchIndexEntry search_entry(&key, column_family_id_);
+    WriteBatchIndexEntry search_entry(&key, column_family_id_,
+                                      false /* is_forward_direction */,
+                                      false /* is_seek_to_first */);
      skip_list_iter_.SeekForPrev(&search_entry);
    }
  
diff --git a/utilities/write_batch_with_index/write_batch_with_index_internal.cc b/utilities/write_batch_with_index/write_batch_with_index_internal.cc

index 14e5f2147db4d305fff00fe5484000a1668d03d4..243672ce4df12906ab7115d6e6888b168e985ab0 100644 (file)
--- a/utilities/write_batch_with_index/write_batch_with_index_internal.cc
+++ b/utilities/write_batch_with_index/write_batch_with_index_internal.cc
@@ -85,6 +85,20 @@ Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset,
    return Status::OK();
  }
  
+// If both of `entry1` and `entry2` point to real entry in write batch, we
+// compare the entries as following:
+// 1. first compare the column family, the one with larger CF will be larger;
+// 2. Inside the same CF, we first decode the entry to find the key of the entry
+//    and the entry with larger key will be larger;
+// 3. If two entries are of the same CF and offset, the one with larger offset
+//    will be larger.
+// Some times either `entry1` or `entry2` is dummy entry, which is actually
+// a search key. In this case, in step 2, we don't go ahead and decode the
+// entry but use the value in WriteBatchIndexEntry::search_key.
+// One special case is WriteBatchIndexEntry::key_size is kFlagMinInCf.
+// This indicate that we are going to seek to the first of the column family.
+// Once we see this, this entry will be smaller than all the real entries of
+// the column family.
  int WriteBatchEntryComparator::operator()(
      const WriteBatchIndexEntry* entry1,
      const WriteBatchIndexEntry* entry2) const {
@@ -94,9 +108,10 @@ int WriteBatchEntryComparator::operator()(
      return -1;
    }
  
-  if (entry1->offset == WriteBatchIndexEntry::kFlagMin) {
+  // Deal with special case of seeking to the beginning of a column family
+  if (entry1->is_min_in_cf()) {
      return -1;
-  } else if (entry2->offset == WriteBatchIndexEntry::kFlagMin) {
+  } else if (entry2->is_min_in_cf()) {
      return 1;
    }
  
diff --git a/utilities/write_batch_with_index/write_batch_with_index_internal.h b/utilities/write_batch_with_index/write_batch_with_index_internal.h

index ac20f1b862eb63da858c77fab325b1e15078f0ed..3eed7c724c2ffb98416c82abcb2a04d1d43cfa2d 100644 (file)
--- a/utilities/write_batch_with_index/write_batch_with_index_internal.h
+++ b/utilities/write_batch_with_index/write_batch_with_index_internal.h
@@ -31,21 +31,52 @@ struct WriteBatchIndexEntry {
          key_offset(ko),
          key_size(ksz),
          search_key(nullptr) {}
-  WriteBatchIndexEntry(const Slice* sk, uint32_t c)
-      : offset(0),
-        column_family(c),
+  // Create a dummy entry as the search key. This index entry won't be backed
+  // by an entry from the write batch, but a pointer to the search key. Or a
+  // special flag of offset can indicate we are seek to first.
+  // @_search_key: the search key
+  // @_column_family: column family
+  // @is_forward_direction: true for Seek(). False for SeekForPrev()
+  // @is_seek_to_first: true if we seek to the beginning of the column family
+  //                    _search_key should be null in this case.
+  WriteBatchIndexEntry(const Slice* _search_key, uint32_t _column_family,
+                       bool is_forward_direction, bool is_seek_to_first)
+      // For SeekForPrev(), we need to make the dummy entry larger than any
+      // entry who has the same search key. Otherwise, we'll miss those entries.
+      : offset(is_forward_direction ? 0 : port::kMaxSizet),
+        column_family(_column_family),
          key_offset(0),
-        key_size(0),
-        search_key(sk) {}
+        key_size(is_seek_to_first ? kFlagMinInCf : 0),
+        search_key(_search_key) {
+    assert(_search_key != nullptr || is_seek_to_first);
+  }
  
-  // If this flag appears in the offset, it indicates a key that is smaller
-  // than any other entry for the same column family
-  static const size_t kFlagMin = port::kMaxSizet;
+  // If this flag appears in the key_size, it indicates a
+  // key that is smaller than any other entry for the same column family.
+  static const size_t kFlagMinInCf = port::kMaxSizet;
+
+  bool is_min_in_cf() const {
+    assert(key_size != kFlagMinInCf ||
+           (key_offset == 0 && search_key == nullptr));
+    return key_size == kFlagMinInCf;
+  }
  
-  size_t offset;           // offset of an entry in write batch's string buffer.
-  uint32_t column_family;  // column family of the entry.
+  // offset of an entry in write batch's string buffer. If this is a dummy
+  // lookup key, in which case search_key != nullptr, offset is set to either
+  // 0 or max, only for comparison purpose. Because when entries have the same
+  // key, the entry with larger offset is larger, offset = 0 will make a seek
+  // key small or equal than all the entries with the seek key, so that Seek()
+  // will find all the entries of the same key. Similarly, offset = MAX will
+  // make the entry just larger than all entries with the search key so
+  // SeekForPrev() will see all the keys with the same key.
+  size_t offset;
+  uint32_t column_family;  // c1olumn family of the entry.
    size_t key_offset;       // offset of the key in write batch's string buffer.
-  size_t key_size;         // size of the key.
+  size_t key_size;         // size of the key. kFlagMinInCf indicates
+                           // that this is a dummy look up entry for
+                           // SeekToFirst() to the beginning of the column
+                           // family. We use the flag here to save a boolean
+                           // in the struct.
  
    const Slice* search_key;  // if not null, instead of reading keys from
                              // write batch, use it to compare. This is used
diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc

index aa484cde99d92d294d84a5a0c53d1aa63e809154..d477968ca9e5b8ed6186146fc75f6c92b09508ae 100644 (file)
--- a/utilities/write_batch_with_index/write_batch_with_index_test.cc
+++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc
@@ -621,7 +621,7 @@ TEST_F(WriteBatchWithIndexTest, TestRandomIteraratorWithBase) {
      for (int i = 0; i < 128; i++) {
        // Random walk and make sure iter and result_iter returns the
        // same key and value
-      int type = rnd.Uniform(5);
+      int type = rnd.Uniform(6);
        ASSERT_OK(iter->status());
        switch (type) {
          case 0:
@@ -642,7 +642,15 @@ TEST_F(WriteBatchWithIndexTest, TestRandomIteraratorWithBase) {
            result_iter->Seek(key);
            break;
          }
-        case 3:
+        case 3: {
+          // SeekForPrev to random key
+          auto key_idx = rnd.Uniform(static_cast<int>(source_strings.size()));
+          auto key = source_strings[key_idx];
+          iter->SeekForPrev(key);
+          result_iter->SeekForPrev(key);
+          break;
+        }
+        case 4:
            // Next
            if (is_valid) {
              iter->Next();
@@ -652,7 +660,7 @@ TEST_F(WriteBatchWithIndexTest, TestRandomIteraratorWithBase) {
            }
            break;
          default:
-          assert(type == 4);
+          assert(type == 5);
            // Prev
            if (is_valid) {
              iter->Prev();
author	Siying Dong <siying.d@fb.com>
	Fri, 19 Oct 2018 21:38:32 +0000 (14:38 -0700)
committer	sdong <siying.d@fb.com>
	Fri, 19 Oct 2018 22:44:58 +0000 (15:44 -0700)
HISTORY.md		patch \| blob \| history
utilities/write_batch_with_index/write_batch_with_index.cc		patch \| blob \| history
utilities/write_batch_with_index/write_batch_with_index_internal.cc		patch \| blob \| history
utilities/write_batch_with_index/write_batch_with_index_internal.h		patch \| blob \| history
utilities/write_batch_with_index/write_batch_with_index_test.cc		patch \| blob \| history