From 850c16c2468c3200a340493c12930543f326b0e1 Mon Sep 17 00:00:00 2001 From: Cory Snyder Date: Thu, 14 Apr 2022 20:54:15 -0400 Subject: [PATCH] bluestore: set upper and lower bounds on rocksdb omap iterators Limits RocksDB omap Seek operations to the relevant key range of the object's omap. This prevents RocksDB from unnecessarily iterating over delete range tombstones in irrelevant omap CF shards. Avoids extreme performance degradation commonly caused by tombstones generated from RGW bucket resharding cleanup. Also prefer CFIteratorImpl over ShardMergeIteratorImpl when we can determine that all keys within specified IteratorBounds must be in a single CF. Fixes: https://tracker.ceph.com/issues/55324 Signed-off-by: Cory Snyder --- src/kv/KeyValueDB.h | 13 ++- src/kv/MemDB.h | 2 +- src/kv/RocksDBStore.cc | 122 +++++++++++++++++++------ src/kv/RocksDBStore.h | 40 +++++++- src/os/bluestore/BlueStore.cc | 25 ++++- src/test/ObjectMap/KeyValueDBMemory.cc | 2 +- src/test/ObjectMap/KeyValueDBMemory.h | 2 +- 7 files changed, 166 insertions(+), 40 deletions(-) diff --git a/src/kv/KeyValueDB.h b/src/kv/KeyValueDB.h index 3dd0894bb1813..fee513c5a6823 100644 --- a/src/kv/KeyValueDB.h +++ b/src/kv/KeyValueDB.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include "include/encoding.h" @@ -324,11 +325,17 @@ private: public: typedef uint32_t IteratorOpts; static const uint32_t ITERATOR_NOCACHE = 1; - virtual WholeSpaceIterator get_wholespace_iterator(IteratorOpts opts = 0) = 0; - virtual Iterator get_iterator(const std::string &prefix, IteratorOpts opts = 0) { + + struct IteratorBounds { + std::optional lower_bound; + std::optional upper_bound; + }; + + virtual WholeSpaceIterator get_wholespace_iterator(IteratorOpts opts = 0, IteratorBounds bounds = IteratorBounds()) = 0; + virtual Iterator get_iterator(const std::string &prefix, IteratorOpts opts = 0, IteratorBounds bounds = IteratorBounds()) { return std::make_shared( prefix, - get_wholespace_iterator(opts)); + get_wholespace_iterator(opts, std::move(bounds))); } virtual uint64_t get_estimated_size(std::map &extra) = 0; diff --git a/src/kv/MemDB.h b/src/kv/MemDB.h index 32d81db225ed2..80fed647efbdd 100644 --- a/src/kv/MemDB.h +++ b/src/kv/MemDB.h @@ -212,7 +212,7 @@ public: return 0; } - WholeSpaceIterator get_wholespace_iterator(IteratorOpts opts = 0) override { + WholeSpaceIterator get_wholespace_iterator(IteratorOpts opts = 0, IteratorBounds bounds = IteratorBounds()) override { return std::shared_ptr( new MDBWholeSpaceIteratorImpl(&m_map, &m_lock, &iterator_seq_no, m_using_btree)); } diff --git a/src/kv/RocksDBStore.cc b/src/kv/RocksDBStore.cc index 0379966c12c2f..487a2173207df 100644 --- a/src/kv/RocksDBStore.cc +++ b/src/kv/RocksDBStore.cc @@ -622,6 +622,18 @@ bool RocksDBStore::is_column_family(const std::string& prefix) { return cf_handles.count(prefix); } +std::string_view RocksDBStore::get_key_hash_view(const prefix_shards& shards, const char* key, const size_t keylen) { + uint32_t hash_l = std::min(shards.hash_l, keylen); + uint32_t hash_h = std::min(shards.hash_h, keylen); + return { key + hash_l, hash_h - hash_l }; +} + +rocksdb::ColumnFamilyHandle *RocksDBStore::get_key_cf(const prefix_shards& shards, const char* key, const size_t keylen) { + auto sv = get_key_hash_view(shards, key, keylen); + uint32_t hash = ceph_str_hash_rjenkins(sv.data(), sv.size()); + return shards.handles[hash % shards.handles.size()]; +} + rocksdb::ColumnFamilyHandle *RocksDBStore::get_cf_handle(const std::string& prefix, const std::string& key) { auto iter = cf_handles.find(prefix); if (iter == cf_handles.end()) { @@ -630,10 +642,7 @@ rocksdb::ColumnFamilyHandle *RocksDBStore::get_cf_handle(const std::string& pref if (iter->second.handles.size() == 1) { return iter->second.handles[0]; } else { - uint32_t hash_l = std::min(iter->second.hash_l, key.size()); - uint32_t hash_h = std::min(iter->second.hash_h, key.size()); - uint32_t hash = ceph_str_hash_rjenkins(&key[hash_l], hash_h - hash_l); - return iter->second.handles[hash % iter->second.handles.size()]; + return get_key_cf(iter->second, key.data(), key.size()); } } } @@ -646,10 +655,36 @@ rocksdb::ColumnFamilyHandle *RocksDBStore::get_cf_handle(const std::string& pref if (iter->second.handles.size() == 1) { return iter->second.handles[0]; } else { - uint32_t hash_l = std::min(iter->second.hash_l, keylen); - uint32_t hash_h = std::min(iter->second.hash_h, keylen); - uint32_t hash = ceph_str_hash_rjenkins(&key[hash_l], hash_h - hash_l); - return iter->second.handles[hash % iter->second.handles.size()]; + return get_key_cf(iter->second, key, keylen); + } + } +} + +/** + * If the specified IteratorBounds arg has both an upper and a lower bound defined, and they have equal placement hash + * strings, we can be sure that the entire iteration range exists in a single CF. In that case, we return the relevant + * CF handle. In all other cases, we return a nullptr to indicate that the specified bounds cannot necessarily be mapped + * to a single CF. + */ +rocksdb::ColumnFamilyHandle *RocksDBStore::get_cf_handle(const std::string& prefix, const IteratorBounds& bounds) { + if (!bounds.lower_bound || !bounds.upper_bound) { + return nullptr; + } + auto iter = cf_handles.find(prefix); + if (iter == cf_handles.end() || iter->second.hash_l != 0) { + return nullptr; + } else { + if (iter->second.handles.size() == 1) { + return iter->second.handles[0]; + } else { + auto lower_bound_hash_str = get_key_hash_view(iter->second, bounds.lower_bound->data(), bounds.lower_bound->size()); + auto upper_bound_hash_str = get_key_hash_view(iter->second, bounds.upper_bound->data(), bounds.upper_bound->size()); + if (lower_bound_hash_str == upper_bound_hash_str) { + auto key = *bounds.lower_bound; + return get_key_cf(iter->second, key.data(), key.size()); + } else { + return nullptr; + } } } } @@ -2194,10 +2229,27 @@ class CFIteratorImpl : public KeyValueDB::IteratorImpl { protected: string prefix; rocksdb::Iterator *dbiter; + const KeyValueDB::IteratorBounds bounds; + const rocksdb::Slice iterate_lower_bound; + const rocksdb::Slice iterate_upper_bound; public: - explicit CFIteratorImpl(const std::string& p, - rocksdb::Iterator *iter) - : prefix(p), dbiter(iter) { } + explicit CFIteratorImpl(const RocksDBStore* db, + const std::string& p, + rocksdb::ColumnFamilyHandle* cf, + KeyValueDB::IteratorBounds bounds_) + : prefix(p), bounds(std::move(bounds_)), + iterate_lower_bound(make_slice(bounds.lower_bound)), + iterate_upper_bound(make_slice(bounds.upper_bound)) + { + auto options = rocksdb::ReadOptions(); + if (bounds.lower_bound) { + options.iterate_lower_bound = &iterate_lower_bound; + } + if (bounds.upper_bound) { + options.iterate_upper_bound = &iterate_upper_bound; + } + dbiter = db->db->NewIterator(options, cf); + } ~CFIteratorImpl() { delete dbiter; } @@ -2729,16 +2781,29 @@ private: const RocksDBStore* db; KeyLess keyless; string prefix; + const KeyValueDB::IteratorBounds bounds; + const rocksdb::Slice iterate_lower_bound; + const rocksdb::Slice iterate_upper_bound; std::vector iters; public: explicit ShardMergeIteratorImpl(const RocksDBStore* db, const std::string& prefix, - const std::vector& shards) - : db(db), keyless(db->comparator), prefix(prefix) + const std::vector& shards, + KeyValueDB::IteratorBounds bounds_) + : db(db), keyless(db->comparator), prefix(prefix), bounds(std::move(bounds_)), + iterate_lower_bound(make_slice(bounds.lower_bound)), + iterate_upper_bound(make_slice(bounds.upper_bound)) { iters.reserve(shards.size()); + auto options = rocksdb::ReadOptions(); + if (bounds.lower_bound) { + options.iterate_lower_bound = &iterate_lower_bound; + } + if (bounds.upper_bound) { + options.iterate_upper_bound = &iterate_upper_bound; + } for (auto& s : shards) { - iters.push_back(db->db->NewIterator(rocksdb::ReadOptions(), s)); + iters.push_back(db->db->NewIterator(options, s)); } } ~ShardMergeIteratorImpl() { @@ -2909,22 +2974,31 @@ public: } }; -KeyValueDB::Iterator RocksDBStore::get_iterator(const std::string& prefix, IteratorOpts opts) +KeyValueDB::Iterator RocksDBStore::get_iterator(const std::string& prefix, IteratorOpts opts, IteratorBounds bounds) { auto cf_it = cf_handles.find(prefix); if (cf_it != cf_handles.end()) { + rocksdb::ColumnFamilyHandle* cf = nullptr; if (cf_it->second.handles.size() == 1) { + cf = cf_it->second.handles[0]; + } else { + cf = get_cf_handle(prefix, bounds); + } + if (cf) { return std::make_shared( - prefix, - db->NewIterator(rocksdb::ReadOptions(), cf_it->second.handles[0])); + this, + prefix, + cf, + std::move(bounds)); } else { return std::make_shared( this, prefix, - cf_it->second.handles); + cf_it->second.handles, + std::move(bounds)); } } else { - return KeyValueDB::get_iterator(prefix, opts); + return KeyValueDB::get_iterator(prefix, opts, std::move(bounds)); } } @@ -2933,14 +3007,11 @@ rocksdb::Iterator* RocksDBStore::new_shard_iterator(rocksdb::ColumnFamilyHandle* return db->NewIterator(rocksdb::ReadOptions(), cf); } -RocksDBStore::WholeSpaceIterator RocksDBStore::get_wholespace_iterator(IteratorOpts opts) +RocksDBStore::WholeSpaceIterator RocksDBStore::get_wholespace_iterator(IteratorOpts opts, IteratorBounds bounds) { if (cf_handles.size() == 0) { - rocksdb::ReadOptions opt = rocksdb::ReadOptions(); - if (opts & ITERATOR_NOCACHE) - opt.fill_cache=false; return std::make_shared( - db->NewIterator(opt, default_cf)); + this, default_cf, opts, std::move(bounds)); } else { return std::make_shared(this); } @@ -2948,8 +3019,7 @@ RocksDBStore::WholeSpaceIterator RocksDBStore::get_wholespace_iterator(IteratorO RocksDBStore::WholeSpaceIterator RocksDBStore::get_default_cf_iterator() { - return std::make_shared( - db->NewIterator(rocksdb::ReadOptions(), default_cf)); + return std::make_shared(this, default_cf, 0, IteratorBounds()); } int RocksDBStore::prepare_for_reshard(const std::string& new_sharding, diff --git a/src/kv/RocksDBStore.h b/src/kv/RocksDBStore.h index c15ff7bf0c0f2..932ad249e7287 100644 --- a/src/kv/RocksDBStore.h +++ b/src/kv/RocksDBStore.h @@ -64,6 +64,14 @@ namespace rocksdb{ extern rocksdb::Logger *create_rocksdb_ceph_logger(); +inline rocksdb::Slice make_slice(const std::optional& bound) { + if (bound) { + return {*bound}; + } else { + return {}; + } +} + /** * Uses RocksDB to implement the KeyValueDB interface */ @@ -83,6 +91,7 @@ class RocksDBStore : public KeyValueDB { uint64_t cache_size = 0; bool set_cache_flag = false; friend class ShardMergeIteratorImpl; + friend class CFIteratorImpl; friend class WholeMergeIteratorImpl; /* * See RocksDB's definition of a column family(CF) and how to use it. @@ -119,8 +128,11 @@ private: void add_column_family(const std::string& cf_name, uint32_t hash_l, uint32_t hash_h, size_t shard_idx, rocksdb::ColumnFamilyHandle *handle); bool is_column_family(const std::string& prefix); + std::string_view get_key_hash_view(const prefix_shards& shards, const char* key, const size_t keylen); + rocksdb::ColumnFamilyHandle *get_key_cf(const prefix_shards& shards, const char* key, const size_t keylen); rocksdb::ColumnFamilyHandle *get_cf_handle(const std::string& prefix, const std::string& key); rocksdb::ColumnFamilyHandle *get_cf_handle(const std::string& prefix, const char* key, size_t keylen); + rocksdb::ColumnFamilyHandle *get_cf_handle(const std::string& prefix, const IteratorBounds& bounds); int submit_common(rocksdb::WriteOptions& woptions, KeyValueDB::Transaction t); int install_cf_mergeop(const std::string &cf_name, rocksdb::ColumnFamilyOptions *cf_opt); @@ -341,9 +353,29 @@ public: public KeyValueDB::WholeSpaceIteratorImpl { protected: rocksdb::Iterator *dbiter; + const KeyValueDB::IteratorBounds bounds; + const rocksdb::Slice iterate_lower_bound; + const rocksdb::Slice iterate_upper_bound; public: - explicit RocksDBWholeSpaceIteratorImpl(rocksdb::Iterator *iter) : - dbiter(iter) { } + explicit RocksDBWholeSpaceIteratorImpl(const RocksDBStore* db, + rocksdb::ColumnFamilyHandle* cf, + const KeyValueDB::IteratorOpts opts, + KeyValueDB::IteratorBounds bounds_) : + bounds(std::move(bounds_)), + iterate_lower_bound(make_slice(bounds.lower_bound)), + iterate_upper_bound(make_slice(bounds.upper_bound)) + { + rocksdb::ReadOptions options = rocksdb::ReadOptions(); + if (opts & ITERATOR_NOCACHE) + options.fill_cache=false; + if (bounds.lower_bound) { + options.iterate_lower_bound = &iterate_lower_bound; + } + if (bounds.upper_bound) { + options.iterate_upper_bound = &iterate_upper_bound; + } + dbiter = db->db->NewIterator(options, cf); + } //virtual ~RocksDBWholeSpaceIteratorImpl() { } ~RocksDBWholeSpaceIteratorImpl() override; @@ -366,7 +398,7 @@ public: size_t value_size() override; }; - Iterator get_iterator(const std::string& prefix, IteratorOpts opts = 0) override; + Iterator get_iterator(const std::string& prefix, IteratorOpts opts = 0, IteratorBounds = IteratorBounds()) override; private: /// this iterator spans single cf rocksdb::Iterator* new_shard_iterator(rocksdb::ColumnFamilyHandle* cf); @@ -499,7 +531,7 @@ err: return nullptr; } - WholeSpaceIterator get_wholespace_iterator(IteratorOpts opts = 0) override; + WholeSpaceIterator get_wholespace_iterator(IteratorOpts opts = 0, IteratorBounds bounds = IteratorBounds()) override; private: WholeSpaceIterator get_default_cf_iterator(); diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 55d08cfbddf7c..e007201cada86 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -11873,10 +11873,13 @@ int BlueStore::_onode_omap_get( o->flush(); { const string& prefix = o->get_omap_prefix(); - KeyValueDB::Iterator it = db->get_iterator(prefix); string head, tail; o->get_omap_header(&head); o->get_omap_tail(&tail); + auto bounds = KeyValueDB::IteratorBounds(); + bounds.lower_bound = head; + bounds.upper_bound = tail; + KeyValueDB::Iterator it = db->get_iterator(prefix, 0, std::move(bounds)); it->lower_bound(head); while (it->valid()) { if (it->key() == head) { @@ -11958,10 +11961,13 @@ int BlueStore::omap_get_keys( o->flush(); { const string& prefix = o->get_omap_prefix(); - KeyValueDB::Iterator it = db->get_iterator(prefix); string head, tail; o->get_omap_key(string(), &head); o->get_omap_tail(&tail); + auto bounds = KeyValueDB::IteratorBounds(); + bounds.lower_bound = head; + bounds.upper_bound = tail; + KeyValueDB::Iterator it = db->get_iterator(prefix, 0, std::move(bounds)); it->lower_bound(head); while (it->valid()) { if (it->key() >= tail) { @@ -12146,7 +12152,15 @@ ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator( } o->flush(); dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() <get_iterator(o->get_omap_prefix()); + auto bounds = KeyValueDB::IteratorBounds(); + if (o->onode.has_omap()) { + std::string lower_bound, upper_bound; + o->get_omap_key(string(), &lower_bound); + o->get_omap_tail(&upper_bound); + bounds.lower_bound = std::move(lower_bound); + bounds.upper_bound = std::move(upper_bound); + } + KeyValueDB::Iterator it = db->get_iterator(o->get_omap_prefix(), 0, std::move(bounds)); return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it)); } @@ -16659,10 +16673,13 @@ int BlueStore::_clone(TransContext *txc, // otherwise rewrite_omap_key will corrupt data ceph_assert(oldo->onode.flags == newo->onode.flags); const string& prefix = newo->get_omap_prefix(); - KeyValueDB::Iterator it = db->get_iterator(prefix); string head, tail; oldo->get_omap_header(&head); oldo->get_omap_tail(&tail); + auto bounds = KeyValueDB::IteratorBounds(); + bounds.lower_bound = head; + bounds.upper_bound = tail; + KeyValueDB::Iterator it = db->get_iterator(prefix, 0, std::move(bounds)); it->lower_bound(head); while (it->valid()) { if (it->key() >= tail) { diff --git a/src/test/ObjectMap/KeyValueDBMemory.cc b/src/test/ObjectMap/KeyValueDBMemory.cc index 234e963397e31..b2b351baef44b 100644 --- a/src/test/ObjectMap/KeyValueDBMemory.cc +++ b/src/test/ObjectMap/KeyValueDBMemory.cc @@ -234,7 +234,7 @@ int KeyValueDBMemory::rm_range_keys(const string &prefix, const string &start, c return 0; } -KeyValueDB::WholeSpaceIterator KeyValueDBMemory::get_wholespace_iterator(IteratorOpts opts) { +KeyValueDB::WholeSpaceIterator KeyValueDBMemory::get_wholespace_iterator(IteratorOpts opts, IteratorBounds bounds) { return std::shared_ptr( new WholeSpaceMemIterator(this) ); diff --git a/src/test/ObjectMap/KeyValueDBMemory.h b/src/test/ObjectMap/KeyValueDBMemory.h index de84ede9049f8..8a154ea296df5 100644 --- a/src/test/ObjectMap/KeyValueDBMemory.h +++ b/src/test/ObjectMap/KeyValueDBMemory.h @@ -184,5 +184,5 @@ private: friend class WholeSpaceMemIterator; public: - WholeSpaceIterator get_wholespace_iterator(IteratorOpts opts = 0) override; + WholeSpaceIterator get_wholespace_iterator(IteratorOpts opts = 0, IteratorBounds bounds = IteratorBounds()) override; }; -- 2.39.5