From: Peter Dillinger Date: Fri, 10 Jun 2022 15:51:45 +0000 (-0700) Subject: Fix bug with kHashSearch and changing prefix_extractor with SetOptions (#10128) X-Git-Tag: v7.4.3~49 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=d3a3b02134e9a71012fbdeeca2f4e0fb9b745312;p=rocksdb.git Fix bug with kHashSearch and changing prefix_extractor with SetOptions (#10128) Summary: When opening an SST file created using index_type=kHashSearch, the *current* prefix_extractor would be saved, and used with hash index if the *new current* prefix_extractor at query time is compatible with the SST file. This is a problem if the prefix_extractor at SST open time is not compatible but SetOptions later changes (back) to one that is compatible. This change fixes that by using the known compatible (or missing) prefix extractor we save for use with prefix filtering. Detail: I have moved the InternalKeySliceTransform wrapper to avoid some indirection and remove unnecessary fields. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10128 Test Plan: expanded unit test (using some logic from https://github.com/facebook/rocksdb/issues/10122) that fails before fix and probably covers some other previously uncovered cases. Reviewed By: siying Differential Revision: D36955738 Pulled By: pdillinger fbshipit-source-id: 0c78a6b0d24054ef2f3cb237bf010c1c5589fb10 --- diff --git a/HISTORY.md b/HISTORY.md index e65f204af..c2f891890 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -7,6 +7,7 @@ * Fix a race condition in WAL size tracking which is caused by an unsafe iterator access after container is changed. * Fix unprotected concurrent accesses to `WritableFileWriter::filesize_` by `DB::SyncWAL()` and `DB::Put()` in two write queue mode. * Fix a bug in WAL tracking. Before this PR (#10087), calling `SyncWAL()` on the only WAL file of the db will not log the event in MANIFEST, thus allowing a subsequent `DB::Open` even if the WAL file is missing or corrupted. +* Fix a bug that could return wrong results with `index_type=kHashSearch` and using `SetOptions` to change the `prefix_extractor`. * Fixed a bug in WAL tracking with wal_compression. WAL compression writes a kSetCompressionType record which is not associated with any sequence number. As result, WalManager::GetSortedWalsOfType() will skip these WALs and not return them to caller, e.g. Checkpoint, Backup, causing the operations to fail. ### Public API changes diff --git a/db/db_test.cc b/db/db_test.cc index 14abe8056..bc1808be8 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -3467,7 +3467,40 @@ TEST_F(DBTest, BlockBasedTablePrefixIndexTest) { ASSERT_OK(Flush()); ASSERT_OK(Put("k2", "v2")); - // Reopen it without prefix extractor, make sure everything still works. + // Reopen with different prefix extractor, make sure everything still works. + // RocksDB should just fall back to the binary index. + options.prefix_extractor.reset(NewFixedPrefixTransform(2)); + + Reopen(options); + ASSERT_EQ("v1", Get("k1")); + ASSERT_EQ("v2", Get("k2")); + +#ifndef ROCKSDB_LITE + // Back to original + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:1"}})); + ASSERT_EQ("v1", Get("k1")); + ASSERT_EQ("v2", Get("k2")); +#endif // !ROCKSDB_LITE + + // Same if there's a problem initally loading prefix transform + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTable::Open::ForceNullTablePrefixExtractor", + [&](void* arg) { *static_cast(arg) = true; }); + SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); + ASSERT_EQ("v1", Get("k1")); + ASSERT_EQ("v2", Get("k2")); + +#ifndef ROCKSDB_LITE + // Change again + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}})); + ASSERT_EQ("v1", Get("k1")); + ASSERT_EQ("v2", Get("k2")); +#endif // !ROCKSDB_LITE + SyncPoint::GetInstance()->DisableProcessing(); + + // Reopen with no prefix extractor, make sure everything still works. // RocksDB should just fall back to the binary index. table_options.index_type = BlockBasedTableOptions::kBinarySearch; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -3477,6 +3510,7 @@ TEST_F(DBTest, BlockBasedTablePrefixIndexTest) { ASSERT_EQ("v1", Get("k1")); ASSERT_EQ("v2", Get("k2")); } + TEST_F(DBTest, BlockBasedTablePrefixHashIndexTest) { // create a DB with block prefix index BlockBasedTableOptions table_options; diff --git a/db/dbformat.h b/db/dbformat.h index 670c188c7..3ff2277aa 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -625,7 +625,7 @@ class IterKey { }; // Convert from a SliceTransform of user keys, to a SliceTransform of -// user keys. +// internal keys. class InternalKeySliceTransform : public SliceTransform { public: explicit InternalKeySliceTransform(const SliceTransform* transform) diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 19eab5f16..e119d1bc7 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -654,17 +654,6 @@ Status BlockBasedTable::Open( file_size, level, immortal_table); rep->file = std::move(file); rep->footer = footer; - // We've successfully read the footer. We are ready to serve requests. - // Better not mutate rep_ after the creation. eg. internal_prefix_transform - // raw pointer will be used to create HashIndexReader, whose reset may - // access a dangling pointer. - // We need to wrap data with internal_prefix_transform to make sure it can - // handle prefix correctly. - // FIXME: is changed prefix_extractor handled anywhere for hash index? - if (prefix_extractor != nullptr) { - rep->internal_prefix_transform.reset( - new InternalKeySliceTransform(prefix_extractor.get())); - } // For fully portable/stable cache keys, we need to read the properties // block before setting up cache keys. TODO: consider setting up a bootstrap @@ -694,8 +683,14 @@ Status BlockBasedTable::Open( if (!s.ok()) { return s; } - if (!PrefixExtractorChangedHelper(rep->table_properties.get(), - prefix_extractor.get())) { + bool force_null_table_prefix_extractor = false; + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTable::Open::ForceNullTablePrefixExtractor", + &force_null_table_prefix_extractor); + if (force_null_table_prefix_extractor) { + assert(!rep->table_prefix_extractor); + } else if (!PrefixExtractorChangedHelper(rep->table_properties.get(), + prefix_extractor.get())) { // Establish fast path for unchanged prefix_extractor rep->table_prefix_extractor = prefix_extractor; } else { @@ -2003,7 +1998,7 @@ InternalIterator* BlockBasedTable::NewIterator( read_options.auto_prefix_mode || PrefixExtractorChanged(prefix_extractor); std::unique_ptr> index_iter(NewIndexIterator( read_options, - need_upper_bound_check && + /*disable_prefix_seek=*/need_upper_bound_check && rep_->index_type == BlockBasedTableOptions::kHashSearch, /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context)); if (arena == nullptr) { @@ -2565,18 +2560,10 @@ Status BlockBasedTable::CreateIndexReader( lookup_context, index_reader); } case BlockBasedTableOptions::kHashSearch: { - std::unique_ptr metaindex_guard; - std::unique_ptr metaindex_iter_guard; - bool should_fallback = false; - // FIXME: is changed prefix_extractor handled anywhere for hash index? - if (rep_->internal_prefix_transform.get() == nullptr) { + if (!rep_->table_prefix_extractor) { ROCKS_LOG_WARN(rep_->ioptions.logger, - "No prefix extractor passed in. Fall back to binary" - " search index."); - should_fallback = true; - } - - if (should_fallback) { + "Missing prefix extractor for hash index. Fall back to" + " binary search index."); return BinarySearchIndexReader::Create(this, ro, prefetch_buffer, use_cache, prefetch, pin, lookup_context, index_reader); diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 39ef7aa41..2d624cd00 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -596,12 +596,6 @@ struct BlockBasedTable::Rep { BlockBasedTableOptions::IndexType index_type; bool whole_key_filtering; bool prefix_filtering; - // TODO(kailiu) It is very ugly to use internal key in table, since table - // module should not be relying on db module. However to make things easier - // and compatible with existing code, we introduce a wrapper that allows - // block to extract prefix without knowing if a key is internal or not. - // null if no prefix_extractor is passed in when opening the table reader. - std::unique_ptr internal_prefix_transform; std::shared_ptr table_prefix_extractor; std::shared_ptr fragmented_range_dels; diff --git a/table/block_based/block_prefix_index.cc b/table/block_based/block_prefix_index.cc index f9d92c74c..c83701d69 100644 --- a/table/block_based/block_prefix_index.cc +++ b/table/block_based/block_prefix_index.cc @@ -69,9 +69,6 @@ struct PrefixRecord { class BlockPrefixIndex::Builder { public: - explicit Builder(const SliceTransform* internal_prefix_extractor) - : internal_prefix_extractor_(internal_prefix_extractor) {} - void Add(const Slice& key_prefix, uint32_t start_block, uint32_t num_blocks) { PrefixRecord* record = reinterpret_cast( arena_.AllocateAligned(sizeof(PrefixRecord))); @@ -82,7 +79,7 @@ class BlockPrefixIndex::Builder { prefixes_.push_back(record); } - BlockPrefixIndex* Finish() { + BlockPrefixIndex* Finish(const SliceTransform* prefix_extractor) { // For now, use roughly 1:1 prefix to bucket ratio. uint32_t num_buckets = static_cast(prefixes_.size()) + 1; @@ -154,25 +151,22 @@ class BlockPrefixIndex::Builder { assert(offset == total_block_array_entries); - return new BlockPrefixIndex(internal_prefix_extractor_, num_buckets, - buckets, total_block_array_entries, - block_array_buffer); + return new BlockPrefixIndex(prefix_extractor, num_buckets, buckets, + total_block_array_entries, block_array_buffer); } private: - const SliceTransform* internal_prefix_extractor_; - std::vector prefixes_; Arena arena_; }; -Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor, +Status BlockPrefixIndex::Create(const SliceTransform* prefix_extractor, const Slice& prefixes, const Slice& prefix_meta, BlockPrefixIndex** prefix_index) { uint64_t pos = 0; auto meta_pos = prefix_meta; Status s; - Builder builder(internal_prefix_extractor); + Builder builder; while (!meta_pos.empty()) { uint32_t prefix_size = 0; @@ -201,14 +195,14 @@ Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor, } if (s.ok()) { - *prefix_index = builder.Finish(); + *prefix_index = builder.Finish(prefix_extractor); } return s; } uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, uint32_t** blocks) { - Slice prefix = internal_prefix_extractor_->Transform(key); + Slice prefix = internal_prefix_extractor_.Transform(key); uint32_t bucket = PrefixToBucket(prefix, num_buckets_); uint32_t block_id = buckets_[bucket]; diff --git a/table/block_based/block_prefix_index.h b/table/block_based/block_prefix_index.h index 04121320e..4db8e2c65 100644 --- a/table/block_based/block_prefix_index.h +++ b/table/block_based/block_prefix_index.h @@ -5,6 +5,8 @@ #pragma once #include + +#include "db/dbformat.h" #include "rocksdb/status.h" namespace ROCKSDB_NAMESPACE { @@ -31,6 +33,8 @@ class BlockPrefixIndex { } // Create hash index by reading from the metadata blocks. + // Note: table reader (caller) is responsible for keeping shared_ptr to + // underlying prefix extractor // @params prefixes: a sequence of prefixes. // @params prefix_meta: contains the "metadata" to of the prefixes. static Status Create(const SliceTransform* hash_key_extractor, @@ -46,17 +50,17 @@ class BlockPrefixIndex { class Builder; friend Builder; - BlockPrefixIndex(const SliceTransform* internal_prefix_extractor, - uint32_t num_buckets, uint32_t* buckets, - uint32_t num_block_array_buffer_entries, + BlockPrefixIndex(const SliceTransform* prefix_extractor, uint32_t num_buckets, + uint32_t* buckets, uint32_t num_block_array_buffer_entries, uint32_t* block_array_buffer) - : internal_prefix_extractor_(internal_prefix_extractor), + : internal_prefix_extractor_(prefix_extractor), num_buckets_(num_buckets), num_block_array_buffer_entries_(num_block_array_buffer_entries), buckets_(buckets), block_array_buffer_(block_array_buffer) {} - const SliceTransform* internal_prefix_extractor_; + InternalKeySliceTransform internal_prefix_extractor_; + uint32_t num_buckets_; uint32_t num_block_array_buffer_entries_; uint32_t* buckets_; diff --git a/table/block_based/hash_index_reader.cc b/table/block_based/hash_index_reader.cc index 839c79dd9..bcaba17a2 100644 --- a/table/block_based/hash_index_reader.cc +++ b/table/block_based/hash_index_reader.cc @@ -95,8 +95,8 @@ Status HashIndexReader::Create(const BlockBasedTable* table, } BlockPrefixIndex* prefix_index = nullptr; - assert(rep->internal_prefix_transform.get() != nullptr); - s = BlockPrefixIndex::Create(rep->internal_prefix_transform.get(), + assert(rep->table_prefix_extractor); + s = BlockPrefixIndex::Create(rep->table_prefix_extractor.get(), prefixes_contents.data, prefixes_meta_contents.data, &prefix_index); // TODO: log error