# Rocksdb Change Log
+
+# Unreleased
+### Bug Fixes
+* Fix the bug that WriteBatchWithIndex's SeekForPrev() doesn't see the entries with the same key.
+
## 5.16.5 (10/16/2018)
### Bug Fixes
* Fix slow flush/compaction when DB contains many snapshots. The problem became noticeable to us in DBs with 100,000+ snapshots, though it will affect others at different thresholds.
}
virtual void SeekToFirst() override {
- WriteBatchIndexEntry search_entry(WriteBatchIndexEntry::kFlagMin,
- column_family_id_, 0, 0);
+ WriteBatchIndexEntry search_entry(
+ nullptr /* search_key */, column_family_id_,
+ true /* is_forward_direction */, true /* is_seek_to_first */);
skip_list_iter_.Seek(&search_entry);
}
virtual void SeekToLast() override {
- WriteBatchIndexEntry search_entry(WriteBatchIndexEntry::kFlagMin,
- column_family_id_ + 1, 0, 0);
+ WriteBatchIndexEntry search_entry(
+ nullptr /* search_key */, column_family_id_ + 1,
+ true /* is_forward_direction */, true /* is_seek_to_first */);
skip_list_iter_.Seek(&search_entry);
if (!skip_list_iter_.Valid()) {
skip_list_iter_.SeekToLast();
}
virtual void Seek(const Slice& key) override {
- WriteBatchIndexEntry search_entry(&key, column_family_id_);
+ WriteBatchIndexEntry search_entry(&key, column_family_id_,
+ true /* is_forward_direction */,
+ false /* is_seek_to_first */);
skip_list_iter_.Seek(&search_entry);
}
virtual void SeekForPrev(const Slice& key) override {
- WriteBatchIndexEntry search_entry(&key, column_family_id_);
+ WriteBatchIndexEntry search_entry(&key, column_family_id_,
+ false /* is_forward_direction */,
+ false /* is_seek_to_first */);
skip_list_iter_.SeekForPrev(&search_entry);
}
return Status::OK();
}
+// If both of `entry1` and `entry2` point to real entry in write batch, we
+// compare the entries as following:
+// 1. first compare the column family; the one with the larger CF ID will be
+// larger;
+// 2. within the same CF, we first decode the entry to find the key of the
+// entry, and the entry with the larger key will be larger;
+// 3. if two entries are of the same CF and key, the one with the larger
+// offset will be larger.
+// Sometimes either `entry1` or `entry2` is a dummy entry, which is actually
+// a search key. In this case, in step 2, we don't decode the entry but use
+// the value in WriteBatchIndexEntry::search_key instead.
+// One special case is when WriteBatchIndexEntry::key_size is kFlagMinInCf.
+// This indicates that we are going to seek to the first entry of the column
+// family. Once we see this, this entry will be smaller than all the real
+// entries of the column family.
int WriteBatchEntryComparator::operator()(
const WriteBatchIndexEntry* entry1,
const WriteBatchIndexEntry* entry2) const {
return -1;
}
- if (entry1->offset == WriteBatchIndexEntry::kFlagMin) {
+ // Deal with special case of seeking to the beginning of a column family
+ if (entry1->is_min_in_cf()) {
return -1;
- } else if (entry2->offset == WriteBatchIndexEntry::kFlagMin) {
+ } else if (entry2->is_min_in_cf()) {
return 1;
}
key_offset(ko),
key_size(ksz),
search_key(nullptr) {}
- WriteBatchIndexEntry(const Slice* sk, uint32_t c)
- : offset(0),
- column_family(c),
+  // Create a dummy entry as the search key. This index entry won't be backed
+  // by an entry from the write batch, but by a pointer to the search key, or
+  // by a special flag in key_size indicating that we are seeking to the first
+  // entry of a column family.
+  // @_search_key: the search key
+  // @_column_family: column family
+  // @is_forward_direction: true for Seek(), false for SeekForPrev()
+  // @is_seek_to_first: true if we seek to the beginning of the column family;
+  //                    _search_key should be null in this case.
+ WriteBatchIndexEntry(const Slice* _search_key, uint32_t _column_family,
+ bool is_forward_direction, bool is_seek_to_first)
+  // For SeekForPrev(), we need to make the dummy entry larger than any
+  // entry that has the same search key. Otherwise, we'll miss those entries.
+ : offset(is_forward_direction ? 0 : port::kMaxSizet),
+ column_family(_column_family),
key_offset(0),
- key_size(0),
- search_key(sk) {}
+ key_size(is_seek_to_first ? kFlagMinInCf : 0),
+ search_key(_search_key) {
+ assert(_search_key != nullptr || is_seek_to_first);
+ }
- // If this flag appears in the offset, it indicates a key that is smaller
- // than any other entry for the same column family
- static const size_t kFlagMin = port::kMaxSizet;
+ // If this flag appears in the key_size, it indicates a
+ // key that is smaller than any other entry for the same column family.
+ static const size_t kFlagMinInCf = port::kMaxSizet;
+
+ bool is_min_in_cf() const {
+ assert(key_size != kFlagMinInCf ||
+ (key_offset == 0 && search_key == nullptr));
+ return key_size == kFlagMinInCf;
+ }
- size_t offset; // offset of an entry in write batch's string buffer.
- uint32_t column_family; // column family of the entry.
+  // Offset of an entry in the write batch's string buffer. If this is a
+  // dummy lookup key (i.e. search_key != nullptr), offset is set to either
+  // 0 or max, used only for comparison. Because entries with the same key
+  // are ordered by offset, offset = 0 makes the dummy key smaller than or
+  // equal to all entries with the same key, so that Seek() will find all of
+  // them. Similarly, offset = max makes the dummy key larger than all
+  // entries with the same key, so that SeekForPrev() will also see all of
+  // them.
+ size_t offset;
+  uint32_t column_family;  // column family of the entry.
size_t key_offset; // offset of the key in write batch's string buffer.
- size_t key_size; // size of the key.
+  size_t key_size;  // size of the key. kFlagMinInCf indicates
+                    // that this is a dummy lookup entry for
+                    // SeekToFirst() to the beginning of the column
+                    // family. We use the flag here to save a boolean
+                    // in the struct.
const Slice* search_key; // if not null, instead of reading keys from
// write batch, use it to compare. This is used
for (int i = 0; i < 128; i++) {
// Random walk and make sure iter and result_iter returns the
// same key and value
- int type = rnd.Uniform(5);
+ int type = rnd.Uniform(6);
ASSERT_OK(iter->status());
switch (type) {
case 0:
result_iter->Seek(key);
break;
}
- case 3:
+ case 3: {
+ // SeekForPrev to random key
+ auto key_idx = rnd.Uniform(static_cast<int>(source_strings.size()));
+ auto key = source_strings[key_idx];
+ iter->SeekForPrev(key);
+ result_iter->SeekForPrev(key);
+ break;
+ }
+ case 4:
// Next
if (is_valid) {
iter->Next();
}
break;
default:
- assert(type == 4);
+ assert(type == 5);
// Prev
if (is_valid) {
iter->Prev();