From 832be4f44c4902ac9213f69b53163e6c9483bad0 Mon Sep 17 00:00:00 2001 From: Mark Nelson Date: Fri, 11 Jan 2019 11:36:44 -0600 Subject: [PATCH] kv/KeyValueDB: Move PriCache implementation to ShardedCache. Signed-off-by: Mark Nelson --- src/kv/KeyValueDB.h | 59 ++------------------- src/kv/RocksDBStore.cc | 68 +----------------------- src/kv/RocksDBStore.h | 23 ++++---- src/kv/rocksdb_cache/BinnedLRUCache.cc | 73 ++++++++++++++++++++++---- src/kv/rocksdb_cache/BinnedLRUCache.h | 20 +++++-- src/kv/rocksdb_cache/ShardedCache.h | 32 ++++++++++- src/os/bluestore/BlueStore.cc | 51 ++++++++++-------- src/os/bluestore/BlueStore.h | 23 ++++---- 8 files changed, 171 insertions(+), 178 deletions(-) diff --git a/src/kv/KeyValueDB.h b/src/kv/KeyValueDB.h index ce63abf1eae92..41d7c5402d263 100644 --- a/src/kv/KeyValueDB.h +++ b/src/kv/KeyValueDB.h @@ -21,7 +21,7 @@ using std::vector; * * Kyoto Cabinet or LevelDB should implement this */ -class KeyValueDB : public PriorityCache::PriCache { +class KeyValueDB { public: /* * See RocksDB's definition of a column family(CF) and how to use it. @@ -270,9 +270,6 @@ public: typedef std::shared_ptr< WholeSpaceIteratorImpl > WholeSpaceIterator; private: - int64_t cache_bytes[PriorityCache::Priority::LAST+1] = { 0 }; - double cache_ratio = 0; - // This class filters a WholeSpaceIterator by a prefix. class PrefixIteratorImpl : public IteratorImpl { const std::string prefix; @@ -347,56 +344,6 @@ public: return -EOPNOTSUPP; } - // PriCache - - virtual int64_t request_cache_bytes(PriorityCache::Priority pri, uint64_t chunk_bytes) const { - return -EOPNOTSUPP; - } - - virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const { - return cache_bytes[pri]; - } - - virtual int64_t get_cache_bytes() const { - int64_t total = 0; - - for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) { - PriorityCache::Priority pri = static_cast(i); - total += get_cache_bytes(pri); - } - return total; - } - - virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) { - cache_bytes[pri] = bytes; - } - - virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) { - cache_bytes[pri] += bytes; - } - - virtual int64_t commit_cache_size(uint64_t total_cache) { - return -EOPNOTSUPP; - } - - virtual int64_t get_committed_size() const { - return -EOPNOTSUPP; - } - - virtual double get_cache_ratio() const { - return cache_ratio; - } - - virtual void set_cache_ratio(double ratio) { - cache_ratio = ratio; - } - - virtual string get_cache_name() const { - return "Unknown KeyValueDB Cache"; - } - - // End PriCache - virtual int set_cache_high_pri_pool_ratio(double ratio) { return -EOPNOTSUPP; } @@ -405,6 +352,10 @@ public: return -EOPNOTSUPP; } + virtual std::shared_ptr get_priority_cache() const { + return nullptr; + } + virtual ~KeyValueDB() {} /// estimate space utilization for a prefix (in bytes) diff --git a/src/kv/RocksDBStore.cc b/src/kv/RocksDBStore.cc index 790eb4be2a08e..0192c4bc665b4 100644 --- a/src/kv/RocksDBStore.cc +++ b/src/kv/RocksDBStore.cc @@ -18,7 +18,6 @@ #include "rocksdb/filter_policy.h" #include "rocksdb/utilities/convenience.h" #include "rocksdb/merge_operator.h" -#include "kv/rocksdb_cache/BinnedLRUCache.h" using std::string; #include "common/perf_counters.h" @@ -392,6 +391,7 @@ int RocksDBStore::load_rocksdb_options(bool create_if_missing, rocksdb::Options& if (g_conf()->rocksdb_cache_type == "binned_lru") { bbt_opts.block_cache = rocksdb_cache::NewBinnedLRUCache( + cct, block_cache_size, g_conf()->rocksdb_cache_shard_bits); } else if (g_conf()->rocksdb_cache_type == "lru") { @@ -1268,72 +1268,6 @@ void RocksDBStore::compact_range(const string& start, const string& end) db->CompactRange(options, &cstart, &cend); } -int64_t RocksDBStore::request_cache_bytes(PriorityCache::Priority pri, uint64_t chunk_bytes) const -{ - auto cache = bbt_opts.block_cache; - - int64_t assigned = get_cache_bytes(pri); - int64_t usage = 0; - int64_t request = 0; - switch (pri) { - // PRI0 is for rocksdb's high priority items (indexes/filters) - case PriorityCache::Priority::PRI0: - { - usage += cache->GetPinnedUsage(); - if (g_conf()->rocksdb_cache_type == "binned_lru") { - auto binned_cache = - std::static_pointer_cast(cache); - usage += binned_cache->GetHighPriPoolUsage(); - } - break; - } - // All other cache items are currently shoved into the LAST priority. - case PriorityCache::Priority::LAST: - { - usage = get_cache_usage() - cache->GetPinnedUsage(); - if (g_conf()->rocksdb_cache_type == "binned_lru") { - auto binned_cache = - std::static_pointer_cast(cache); - usage -= binned_cache->GetHighPriPoolUsage(); - } - break; - } - default: - break; - } - request = (request > assigned) ? request - assigned : 0; - dout(10) << __func__ << " Priority: " << static_cast(pri) - << " Usage: " << usage << " Request: " << request << dendl; - return request; -} - -int64_t RocksDBStore::get_cache_usage() const -{ - return static_cast(bbt_opts.block_cache->GetUsage()); -} - -int64_t RocksDBStore::commit_cache_size(uint64_t total_bytes) -{ - size_t old_bytes = bbt_opts.block_cache->GetCapacity(); - int64_t new_bytes = PriorityCache::get_chunk( - get_cache_bytes(), total_bytes); - dout(10) << __func__ << " old: " << old_bytes - << " new: " << new_bytes << dendl; - bbt_opts.block_cache->SetCapacity((size_t) new_bytes); - - // Set the high priority pool ratio is this is the binned LRU cache. - if (g_conf()->rocksdb_cache_type == "binned_lru") { - auto binned_cache = - std::static_pointer_cast(bbt_opts.block_cache); - int64_t high_pri_bytes = PriorityCache::get_chunk( - binned_cache->GetHighPriPoolUsage()+1, total_bytes); - double ratio = (double) high_pri_bytes / new_bytes; - dout(10) << __func__ << " High Pri Pool Ratio set to " << ratio << dendl; - binned_cache->SetHighPriPoolRatio(ratio); - } - return new_bytes; -} - RocksDBStore::RocksDBWholeSpaceIteratorImpl::~RocksDBWholeSpaceIteratorImpl() { delete dbiter; diff --git a/src/kv/RocksDBStore.h b/src/kv/RocksDBStore.h index 5354285be1dfd..09622d5b17a02 100644 --- a/src/kv/RocksDBStore.h +++ b/src/kv/RocksDBStore.h @@ -16,6 +16,7 @@ #include "rocksdb/iostats_context.h" #include "rocksdb/statistics.h" #include "rocksdb/table.h" +#include "kv/rocksdb_cache/BinnedLRUCache.h" #include #include "common/errno.h" #include "common/dout.h" @@ -120,7 +121,6 @@ public: bool disableWAL; bool enable_rmrange; void compact() override; - int64_t high_pri_watermark; void compact_async() override { compact_range_async(string(), string()); @@ -159,8 +159,7 @@ public: compact_thread(this), compact_on_mount(false), disableWAL(false), - enable_rmrange(cct->_conf->rocksdb_enable_rmrange), - high_pri_watermark(0) + enable_rmrange(cct->_conf->rocksdb_enable_rmrange) {} ~RocksDBStore() override; @@ -478,17 +477,9 @@ err: return total_size; } - virtual int64_t request_cache_bytes( - PriorityCache::Priority pri, uint64_t cache_bytes) const override; - virtual int64_t commit_cache_size(uint64_t total_cache) override; - virtual int64_t get_committed_size() const override { - return bbt_opts.block_cache->GetCapacity(); + virtual int64_t get_cache_usage() const override { + return static_cast(bbt_opts.block_cache->GetUsage()); } - virtual std::string get_cache_name() const override { - return "RocksDB Block Cache"; - } - virtual int64_t get_cache_usage() const override; - int set_cache_size(uint64_t s) override { cache_size = s; @@ -499,6 +490,12 @@ err: int set_cache_capacity(int64_t capacity); int64_t get_cache_capacity(); + virtual std::shared_ptr get_priority_cache() + const override { + return dynamic_pointer_cast( + bbt_opts.block_cache); + } + WholeSpaceIterator get_wholespace_iterator() override; }; diff --git a/src/kv/rocksdb_cache/BinnedLRUCache.cc b/src/kv/rocksdb_cache/BinnedLRUCache.cc index 91ed1185ec92a..38f367e1bc787 100644 --- a/src/kv/rocksdb_cache/BinnedLRUCache.cc +++ b/src/kv/rocksdb_cache/BinnedLRUCache.cc @@ -13,11 +13,15 @@ #include "BinnedLRUCache.h" -#include #include #include #include +#define dout_context cct +#define dout_subsys ceph_subsys_rocksdb +#undef dout_prefix +#define dout_prefix *_dout << "rocksdb: " + namespace rocksdb_cache { BinnedLRUHandleTable::BinnedLRUHandleTable() : list_(nullptr), length_(0), elems_(0) { @@ -459,9 +463,12 @@ std::string BinnedLRUCacheShard::GetPrintableOptions() const { return std::string(buffer); } -BinnedLRUCache::BinnedLRUCache(size_t capacity, int num_shard_bits, - bool strict_capacity_limit, double high_pri_pool_ratio) - : ShardedCache(capacity, num_shard_bits, strict_capacity_limit) { +BinnedLRUCache::BinnedLRUCache(CephContext *c, + size_t capacity, + int num_shard_bits, + bool strict_capacity_limit, + double high_pri_pool_ratio) + : ShardedCache(capacity, num_shard_bits, strict_capacity_limit), cct(c) { num_shards_ = 1 << num_shard_bits; // TODO: Switch over to use mempool int rc = posix_memalign((void**) &shards_, @@ -542,9 +549,57 @@ size_t BinnedLRUCache::GetHighPriPoolUsage() const { return usage; } -std::shared_ptr NewBinnedLRUCache(size_t capacity, int num_shard_bits, - bool strict_capacity_limit, - double high_pri_pool_ratio) { +// PriCache + +int64_t BinnedLRUCache::request_cache_bytes(PriorityCache::Priority pri, uint64_t total_cache) const +{ + int64_t assigned = get_cache_bytes(pri); + int64_t request = 0; + + switch (pri) { + // PRI0 is for rocksdb's high priority items (indexes/filters) + case PriorityCache::Priority::PRI0: + { + request = GetHighPriPoolUsage(); + break; + } + // All other cache items are currently shoved into the LAST priority. + case PriorityCache::Priority::LAST: + { + request = GetUsage(); + request -= GetHighPriPoolUsage(); + break; + } + default: + break; + } + request = (request > assigned) ? request - assigned : 0; + ldout(cct, 10) << __func__ << " Priority: " << static_cast(pri) + << " Request: " << request << dendl; + return request; +} + +int64_t BinnedLRUCache::commit_cache_size(uint64_t total_bytes) +{ + size_t old_bytes = GetCapacity(); + int64_t new_bytes = PriorityCache::get_chunk( + get_cache_bytes(), total_bytes); + ldout(cct, 10) << __func__ << " old: " << old_bytes + << " new: " << new_bytes << dendl; + SetCapacity((size_t) new_bytes); + double ratio = + (double) get_cache_bytes(PriorityCache::Priority::PRI0) / new_bytes; + ldout(cct, 10) << __func__ << " High Pri Pool Ratio set to " << ratio << dendl; + SetHighPriPoolRatio(ratio); + return new_bytes; +} + +std::shared_ptr NewBinnedLRUCache( + CephContext *c, + size_t capacity, + int num_shard_bits, + bool strict_capacity_limit, + double high_pri_pool_ratio) { if (num_shard_bits >= 20) { return nullptr; // the cache cannot be sharded into too many fine pieces } @@ -555,8 +610,8 @@ std::shared_ptr NewBinnedLRUCache(size_t capacity, int num_shard if (num_shard_bits < 0) { num_shard_bits = GetDefaultCacheShardBits(capacity); } - return std::make_shared(capacity, num_shard_bits, - strict_capacity_limit, high_pri_pool_ratio); + return std::make_shared( + c, capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio); } } // namespace rocksdb_cache diff --git a/src/kv/rocksdb_cache/BinnedLRUCache.h b/src/kv/rocksdb_cache/BinnedLRUCache.h index d48286a1dab26..96023ce22f7ed 100644 --- a/src/kv/rocksdb_cache/BinnedLRUCache.h +++ b/src/kv/rocksdb_cache/BinnedLRUCache.h @@ -14,9 +14,10 @@ #include #include "ShardedCache.h" - #include "common/autovector.h" +#include "common/dout.h" #include "include/ceph_assert.h" +#include "common/ceph_context.h" namespace rocksdb_cache { @@ -47,6 +48,7 @@ namespace rocksdb_cache { // RUCache::Release (to move into state 2) or BinnedLRUCacheShard::Erase (for state 3) std::shared_ptr NewBinnedLRUCache( + CephContext *c, size_t capacity, int num_shard_bits = -1, bool strict_capacity_limit = false, @@ -291,8 +293,8 @@ class alignas(CACHE_LINE_SIZE) BinnedLRUCacheShard : public CacheShard { class BinnedLRUCache : public ShardedCache { public: - BinnedLRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, - double high_pri_pool_ratio); + BinnedLRUCache(CephContext *c, size_t capacity, int num_shard_bits, + bool strict_capacity_limit, double high_pri_pool_ratio); virtual ~BinnedLRUCache(); virtual const char* Name() const override { return "BinnedLRUCache"; } virtual CacheShard* GetShard(int shard) override; @@ -311,7 +313,19 @@ class BinnedLRUCache : public ShardedCache { // Retrieves high pri pool usage size_t GetHighPriPoolUsage() const; + // PriorityCache + virtual int64_t request_cache_bytes( + PriorityCache::Priority pri, uint64_t total_cache) const; + virtual int64_t commit_cache_size(uint64_t total_cache); + virtual int64_t get_committed_size() const { + return GetCapacity(); + } + virtual std::string get_cache_name() const { + return "RocksDB Binned LRU Cache"; + } + private: + CephContext *cct; BinnedLRUCacheShard* shards_; int num_shards_ = 0; }; diff --git a/src/kv/rocksdb_cache/ShardedCache.h b/src/kv/rocksdb_cache/ShardedCache.h index e8e53b25e7ef9..4d64893ab1c7b 100644 --- a/src/kv/rocksdb_cache/ShardedCache.h +++ b/src/kv/rocksdb_cache/ShardedCache.h @@ -16,6 +16,7 @@ #include "rocksdb/cache.h" #include "include/ceph_hash.h" +#include "common/PriorityCache.h" //#include "hash.h" #ifndef CACHE_LINE_SIZE @@ -52,7 +53,7 @@ class CacheShard { // Generic cache interface which shards cache by hash of keys. 2^num_shard_bits // shards will be created, with capacity split evenly to each of the shards. // Keys are sharded by the highest num_shard_bits bits of hash value. -class ShardedCache : public rocksdb::Cache { +class ShardedCache : public rocksdb::Cache, public PriorityCache::PriCache { public: ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit); virtual ~ShardedCache() = default; @@ -87,6 +88,32 @@ class ShardedCache : public rocksdb::Cache { int GetNumShardBits() const { return num_shard_bits_; } + // PriCache + virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const { + return cache_bytes[pri]; + } + virtual int64_t get_cache_bytes() const { + int64_t total = 0; + for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) { + PriorityCache::Priority pri = static_cast(i); + total += get_cache_bytes(pri); + } + return total; + } + virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) { + cache_bytes[pri] = bytes; + } + virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) { + cache_bytes[pri] += bytes; + } + virtual double get_cache_ratio() const { + return cache_ratio; + } + virtual void set_cache_ratio(double ratio) { + cache_ratio = ratio; + } + virtual std::string get_cache_name() const = 0; + private: static inline uint32_t HashSlice(const rocksdb::Slice& s) { return ceph_str_hash(CEPH_STR_HASH_RJENKINS, s.data(), s.size()); @@ -98,6 +125,9 @@ class ShardedCache : public rocksdb::Cache { return (num_shard_bits_ > 0) ? (hash >> (32 - num_shard_bits_)) : 0; } + int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0}; + double cache_ratio = 0; + int num_shard_bits_; mutable std::mutex capacity_mutex_; size_t capacity_; diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index d985167e46df4..6dcb24e5520c6 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -3462,10 +3462,14 @@ void *BlueStore::MempoolThread::entry() { std::unique_lock l(lock); - std::list caches; - caches.push_back(store->db); - caches.push_back(&meta_cache); - caches.push_back(&data_cache); + std::list> caches; + binned_kv_cache = store->db->get_priority_cache(); + if (binned_kv_cache != nullptr) { + caches.push_back(binned_kv_cache); + } + caches.push_back(meta_cache); + caches.push_back(data_cache); + autotune_cache_size = store->osd_memory_cache_min; utime_t next_balance = ceph_clock_now(); @@ -3474,13 +3478,13 @@ void *BlueStore::MempoolThread::entry() bool interval_stats_trim = false; bool interval_stats_resize = false; while (!stop) { - _adjust_cache_settings(); - // Before we trim, check and see if it's time to rebalance/resize. double autotune_interval = store->cache_autotune_interval; double resize_interval = store->osd_memory_cache_resize_interval; if (autotune_interval > 0 && next_balance < ceph_clock_now()) { + _adjust_cache_settings(); + // Log events at 5 instead of 20 when balance happens. interval_stats_resize = true; interval_stats_trim = true; @@ -3515,9 +3519,11 @@ void *BlueStore::MempoolThread::entry() void BlueStore::MempoolThread::_adjust_cache_settings() { - store->db->set_cache_ratio(store->cache_kv_ratio); - meta_cache.set_cache_ratio(store->cache_meta_ratio); - data_cache.set_cache_ratio(store->cache_data_ratio); + if (binned_kv_cache != nullptr) { + binned_kv_cache->set_cache_ratio(store->cache_kv_ratio); + } + meta_cache->set_cache_ratio(store->cache_meta_ratio); + data_cache->set_cache_ratio(store->cache_data_ratio); } void BlueStore::MempoolThread::_trim_shards(bool interval_stats) @@ -3526,23 +3532,23 @@ void BlueStore::MempoolThread::_trim_shards(bool interval_stats) size_t num_shards = store->cache_shards.size(); int64_t kv_used = store->db->get_cache_usage(); - int64_t meta_used = meta_cache._get_used_bytes(); - int64_t data_used = data_cache._get_used_bytes(); + int64_t meta_used = meta_cache->_get_used_bytes(); + int64_t data_used = data_cache->_get_used_bytes(); uint64_t cache_size = store->cache_size; int64_t kv_alloc = - static_cast(store->db->get_cache_ratio() * cache_size); + static_cast(store->cache_kv_ratio * cache_size); int64_t meta_alloc = - static_cast(meta_cache.get_cache_ratio() * cache_size); + static_cast(store->cache_meta_ratio * cache_size); int64_t data_alloc = - static_cast(data_cache.get_cache_ratio() * cache_size); + static_cast(store->cache_data_ratio * cache_size); - if (store->cache_autotune) { + if (binned_kv_cache != nullptr && store->cache_autotune) { cache_size = autotune_cache_size; - kv_alloc = store->db->get_committed_size(); - meta_alloc = meta_cache.get_committed_size(); - data_alloc = data_cache.get_committed_size(); + kv_alloc = binned_kv_cache->get_committed_size(); + meta_alloc = meta_cache->get_committed_size(); + data_alloc = data_cache->get_committed_size(); } if (interval_stats) { @@ -3564,7 +3570,7 @@ void BlueStore::MempoolThread::_trim_shards(bool interval_stats) } uint64_t max_shard_onodes = static_cast( - (meta_alloc / (double) num_shards) / meta_cache.get_bytes_per_onode()); + (meta_alloc / (double) num_shards) / meta_cache->get_bytes_per_onode()); uint64_t max_shard_buffer = static_cast(data_alloc / num_shards); ldout(cct, 30) << __func__ << " max_shard_onodes: " << max_shard_onodes @@ -3631,7 +3637,7 @@ void BlueStore::MempoolThread::_tune_cache_size(bool interval_stats) } void BlueStore::MempoolThread::_balance_cache( - const std::list& caches) + const std::list>& caches) { int64_t mem_avail = autotune_cache_size; /* Each cache is going to get at least 1 chunk's worth of memory from get_chunk @@ -3669,9 +3675,10 @@ void BlueStore::MempoolThread::_balance_cache( } void BlueStore::MempoolThread::_balance_cache_pri(int64_t *mem_avail, - const std::list& caches, PriorityCache::Priority pri) + const std::list>& caches, + PriorityCache::Priority pri) { - std::list tmp_caches = caches; + std::list> tmp_caches = caches; double cur_ratios = 0; double new_ratios = 0; diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index e1b75d2e1ca8a..57b3afca03221 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -2013,10 +2013,11 @@ private: ceph::mutex lock = ceph::make_mutex("BlueStore::MempoolThread::lock"); bool stop = false; uint64_t autotune_cache_size = 0; + std::shared_ptr binned_kv_cache = nullptr; struct MempoolCache : public PriorityCache::PriCache { BlueStore *store; - int64_t cache_bytes[PriorityCache::Priority::LAST+1]; + int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0}; int64_t committed_bytes = 0; double cache_ratio = 0; @@ -2097,7 +2098,8 @@ private: double get_bytes_per_onode() const { return (double)_get_used_bytes() / (double)_get_num_onodes(); } - } meta_cache; + }; + std::shared_ptr meta_cache; struct DataCache : public MempoolCache { DataCache(BlueStore *s) : MempoolCache(s) {}; @@ -2112,13 +2114,14 @@ private: virtual string get_cache_name() const { return "BlueStore Data Cache"; } - } data_cache; + }; + std::shared_ptr data_cache; public: explicit MempoolThread(BlueStore *s) : store(s), - meta_cache(MetaCache(s)), - data_cache(DataCache(s)) {} + meta_cache(new MetaCache(s)), + data_cache(new DataCache(s)) {} void *entry() override; void init() { @@ -2137,10 +2140,12 @@ private: void _adjust_cache_settings(); void _trim_shards(bool interval_stats); void _tune_cache_size(bool interval_stats); - void _balance_cache(const std::list& caches); - void _balance_cache_pri(int64_t *mem_avail, - const std::list& caches, - PriorityCache::Priority pri); + void _balance_cache( + const std::list>& caches); + void _balance_cache_pri( + int64_t *mem_avail, + const std::list>& caches, + PriorityCache::Priority pri); } mempool_thread; // -------------------------------------------------------- -- 2.39.5