From cadd4a4f6cc376882555a31f0304766430ba9e6a Mon Sep 17 00:00:00 2001
From: Adam Kupczyk
Date: Thu, 17 Jul 2025 16:09:02 +0000
Subject: [PATCH] kv/RocksDB: Add instrumentation to BinnedLRUCache

1) perf counters
A perf counters section is created for each cache:
    "rocksdb-cache-O": {
        "capacity": 134217728,
        "usage": 134182832,
        "pinned": 0,
        "elems": 24502,
        "inserts": 25806978,
        "lookups": 150436987,
        "hits": 124629911,
        "misses": 25807076
    }
In the default configuration there are 2 sections:
"rocksdb-cache-O", "rocksdb-cache-default".

2) admin command
For each cache x, an admin command "rocksdb show cache x" is added.
> ./bin/ceph tell osd.0 rocksdb show cache O
shard  capacity     usage    pinned     elems   inserts   lookups      hits    misses
    0  13631488  11076400         0      2099    136987    822679    685923    136756
    1  13631488  11549712         0      2043    133359    571500    438383    133117
    2  13631488  11060608         0      2232    135076    908468    773313    135155
    3  13631488  11166896         0      2269    134006    427070    293147    133923
    4  13631488  11117984         0      2297    133367    700242    567318    132924
    5  13631488  11306672         0      2155    137501   1130135    991810    138325
    6  13631488  11506512         0      2353    134515    662792    528514    134278
    7  13631488  11093856         0      2316    135348    718971    583421    135550
    8  13631488  11660624         0      2424    137363   1092043    954248    137795
    9  13631488  10962000         0      2561    131982    431702    300467    131235
   10  13631488  11379392         0      1916    134543    477118    342854    134264
   11  13631488  11294272         0      2555    134508    512393    378337    134056
   12  13631488  11277136         0      2079    137312   1131571    993692    137879
   13  13631488  10887776         0      2543    134001    567073    432903    134170
   14  13631488  10986528         0      2394    133288    584452    451018    133434
   15  13631488  11954464         0      2456    134615    708285    573374    134911

3) admin command
"./bin/ceph tell osd.0 rocksdb reset cache O"
clears the "inserts", "lookups", "hits" and "misses" counters.

Signed-off-by: Adam Kupczyk
---
 src/kv/CMakeLists.txt                  |   3 +-
 src/kv/RocksDBStore.cc                 |   7 +-
 src/kv/RocksDBStore.h                  |   4 +-
 src/kv/rocksdb_cache/BinnedLRUCache.cc | 193 ++++++++++++++++++++++++-
 src/kv/rocksdb_cache/BinnedLRUCache.h  |  78 +++++++++-
 5 files changed, 264 insertions(+), 
21 deletions(-) diff --git a/src/kv/CMakeLists.txt b/src/kv/CMakeLists.txt index 9121a6257cc0..7e406c2f10a1 100644 --- a/src/kv/CMakeLists.txt +++ b/src/kv/CMakeLists.txt @@ -10,4 +10,5 @@ add_library(kv STATIC ${kv_srcs} target_link_libraries(kv RocksDB::RocksDB - heap_profiler) + heap_profiler + ${FMT_LIB}) diff --git a/src/kv/RocksDBStore.cc b/src/kv/RocksDBStore.cc index f0351c4ad8e5..978302012696 100644 --- a/src/kv/RocksDBStore.cc +++ b/src/kv/RocksDBStore.cc @@ -466,11 +466,12 @@ int RocksDBStore::create_and_open(ostream &out, } std::shared_ptr RocksDBStore::create_block_cache( + const std::string& name, const std::string& cache_type, size_t cache_size, double cache_prio_high) { std::shared_ptr cache; auto shard_bits = cct->_conf->rocksdb_cache_shard_bits; if (cache_type == "binned_lru") { - cache = rocksdb_cache::NewBinnedLRUCache(cct, cache_size, shard_bits, false, cache_prio_high); + cache = rocksdb_cache::NewBinnedLRUCache(cct, name, cache_size, shard_bits, false, cache_prio_high); } else if (cache_type == "lru") { cache = rocksdb::NewLRUCache(cache_size, shard_bits); } else if (cache_type == "clock") { @@ -556,7 +557,7 @@ int RocksDBStore::load_rocksdb_options(bool create_if_missing, rocksdb::Options& uint64_t row_cache_size = cache_size * cct->_conf->rocksdb_cache_row_ratio; uint64_t block_cache_size = cache_size - row_cache_size; - bbt_opts.block_cache = create_block_cache(cct->_conf->rocksdb_cache_type, block_cache_size); + bbt_opts.block_cache = create_block_cache(rocksdb::kDefaultColumnFamilyName, cct->_conf->rocksdb_cache_type, block_cache_size); if (!bbt_opts.block_cache) { return -EINVAL; } @@ -1015,7 +1016,7 @@ int RocksDBStore::apply_block_cache_options(const std::string& column_name, column_bbt_opts.no_block_cache = true; } else { if (require_new_block_cache) { - block_cache = create_block_cache(cache_type, cache_size, high_pri_pool_ratio); + block_cache = create_block_cache(column_name, cache_type, cache_size, high_pri_pool_ratio); if 
(!block_cache) {
         dout(5) << __func__ << " failed to create block cache for params: " << block_cache_opt << dendl;
         return -EINVAL;
diff --git a/src/kv/RocksDBStore.h b/src/kv/RocksDBStore.h
index e52b969586ce..c0cab4cfbcf6 100644
--- a/src/kv/RocksDBStore.h
+++ b/src/kv/RocksDBStore.h
@@ -168,7 +168,9 @@ private:
       std::vector >& existing_cfs_shard,
       std::vector& missing_cfs,
       std::vector >& missing_cfs_shard);
-  std::shared_ptr create_block_cache(const std::string& cache_type, size_t cache_size, double cache_prio_high = 0.0);
+  std::shared_ptr create_block_cache(
+    const std::string& name,
+    const std::string& cache_type, size_t cache_size, double cache_prio_high = 0.0);
   int split_column_family_options(const std::string& opts_str,
       std::unordered_map* column_opts_map,
       std::string* block_cache_opt);
diff --git a/src/kv/rocksdb_cache/BinnedLRUCache.cc b/src/kv/rocksdb_cache/BinnedLRUCache.cc
index d7de6cba3870..e93d44288b0d 100644
--- a/src/kv/rocksdb_cache/BinnedLRUCache.cc
+++ b/src/kv/rocksdb_cache/BinnedLRUCache.cc
@@ -9,6 +9,9 @@
 
 #ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
 #endif
+#include "common/admin_socket.h"  // placed after #endif: must not depend on __STDC_FORMAT_MACROS being undefined
+#include "common/pretty_binary.h"
+#include <fmt/format.h>  // fmt::format used by the admin socket hook
 
 #include "BinnedLRUCache.h"
@@ -16,6 +19,8 @@
 #include
 #include
 #include
+#include "common/debug.h"
+#include "common/perf_counters_collection.h"
 
 #define dout_context cct
 #define dout_subsys ceph_subsys_rocksdb
@@ -271,10 +276,12 @@ void BinnedLRUCacheShard::MaintainPoolSize() {
 
 void BinnedLRUCacheShard::EvictFromLRU(size_t charge,
                                        BinnedLRUHandle*& deleted) {
+
   while (usage_ + charge > capacity_ && lru_.next != &lru_) {
     BinnedLRUHandle* old = lru_.next;
     ceph_assert(old->InCache());
     ceph_assert(old->refs == 1); // LRU list contains elements which may be evicted
+    stats[l_elems]--;
     LRU_Remove(old);
     table_.Remove(old->key(), old->hash);
     old->SetInCache(false);
@@ -286,6 +293,17 @@ void BinnedLRUCacheShard::EvictFromLRU(size_t charge,
   }
 }
 
+int BinnedLRUCacheShard::FreeDeleted(BinnedLRUHandle* deleted) {
+ int del = 0; + while (deleted) { + auto* entry = deleted; + deleted = deleted->next; + entry->Free(); + del++; + } + return del; +} + void BinnedLRUCacheShard::SetCapacity(size_t capacity) { BinnedLRUHandle* deleted = nullptr; { @@ -299,6 +317,30 @@ void BinnedLRUCacheShard::SetCapacity(size_t capacity) { FreeDeleted(deleted); } +ShardStats BinnedLRUCacheShard::GetStats() { + std::lock_guard l(mutex_); + stats[l_capacity] = capacity_; + stats[l_usage] = usage_; + stats[l_pinned] = usage_ - lru_usage_; + stats[l_misses] = stats[l_lookups] - stats[l_hits]; + return stats; +} + +void BinnedLRUCacheShard::ClearStats() { + std::lock_guard l(mutex_); + for (int i = l_inserts; i <= l_misses; i++) { + stats[i] = 0; + } +} + +void BinnedLRUCacheShard::print_bins(std::stringstream& out) const +{ + for (const auto& i : age_bins) { + out << *i << " "; + } + out << std::endl; +} + void BinnedLRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) { std::lock_guard l(mutex_); strict_capacity_limit_ = strict_capacity_limit; @@ -306,6 +348,7 @@ void BinnedLRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) { rocksdb::Cache::Handle* BinnedLRUCacheShard::Lookup(const rocksdb::Slice& key, uint32_t hash) { std::lock_guard l(mutex_); + stats[l_lookups]++; BinnedLRUHandle* e = table_.Lookup(key, hash); if (e != nullptr) { ceph_assert(e->InCache()); @@ -314,6 +357,7 @@ rocksdb::Cache::Handle* BinnedLRUCacheShard::Lookup(const rocksdb::Slice& key, u } e->refs++; e->SetHit(); + stats[l_hits]++; } return reinterpret_cast(e); } @@ -346,6 +390,7 @@ bool BinnedLRUCacheShard::Release(rocksdb::Cache::Handle* handle, bool force_era last_reference = Unref(e); if (last_reference) { usage_ -= e->charge; + stats[l_elems]--; } if (e->refs == 1 && e->InCache()) { // The item is still in cache, and nobody else holds a reference to it @@ -359,6 +404,7 @@ bool BinnedLRUCacheShard::Release(rocksdb::Cache::Handle* handle, bool force_era Unref(e); usage_ -= e->charge; last_reference 
= true;
+        stats[l_elems]--;
       } else {
         // put the item on the list to be potentially freed
         LRU_Insert(e);
@@ -398,6 +444,8 @@ rocksdb::Status BinnedLRUCacheShard::Insert(const rocksdb::Slice& key, uint32_t
 
   {
     std::lock_guard l(mutex_);
+    stats[l_elems]++;
+    stats[l_inserts]++;
     // Free the space following strict LRU policy until enough space
     // is freed or the lru list is empty
     EvictFromLRU(charge, deleted);
@@ -454,6 +502,7 @@ void BinnedLRUCacheShard::Erase(const rocksdb::Slice& key, uint32_t hash) {
   bool last_reference = false;
   {
     std::lock_guard l(mutex_);
     e = table_.Remove(key, hash);
     if (e != nullptr) {
+      if (e->refs == 1) stats[l_elems]--;  // count down only if the entry dies here; a miss must not underflow, and a still-referenced entry is counted down by its final Release()
       last_reference = Unref(e);
@@ -517,12 +566,92 @@ DeleterFn BinnedLRUCacheShard::GetDeleter(rocksdb::Cache::Handle* h) const
   return handle->deleter;
 }
 
-BinnedLRUCache::BinnedLRUCache(CephContext *c,
-                               size_t capacity,
-                               int num_shard_bits,
-                               bool strict_capacity_limit,
-                               double high_pri_pool_ratio)
-    : ShardedCache(capacity, num_shard_bits, strict_capacity_limit), cct(c) {
+#undef dout_context
+#define dout_context cache.cct
+
+class BinnedLRUCache::SocketHook : public AdminSocketHook {
+  BinnedLRUCache& cache;
+
+public:
+  SocketHook( BinnedLRUCache& _cache)
+    : cache(_cache)
+  {
+    AdminSocket *admin_socket = cache.cct->get_admin_socket();
+    if (admin_socket) {
+      int r = admin_socket->register_command(
+        std::string("rocksdb show cache ") + cache.name + std::string(" name=shard_no,type=CephInt,req=false"),
+        this, "show details of cache " + cache.name);
+      if (r != 0) {
+        dout(1) << __func__ << " cannot register SocketHook" << dendl;
+        return;
+      }
+      r = admin_socket->register_command(
+        std::string("rocksdb reset cache ") + cache.name,
+        this, "clear stats of cache " + cache.name);
+      ceph_assert(r == 0);
+    }
+  };
+  ~SocketHook() {
+    AdminSocket *admin_socket = cache.cct->get_admin_socket();
+    if (admin_socket) {
+      admin_socket->unregister_commands(this);
+    }
+  };
+  int call(std::string_view command,
+           const cmdmap_t& cmdmap,
+           const bufferlist& 
inbl, + Formatter *f, + std::ostream& ss, + bufferlist& out) + { + int r = 0; + if (command == std::string("rocksdb show cache ") + cache.name) { + int64_t shard_no; + std::stringstream outstr; + if (!ceph::common::cmd_getval(cmdmap, "shard_no", shard_no)) { + outstr << fmt::format("{:>5}", "shard"); + for (int j = 0; j < stat_cnt; j++) { + outstr << fmt::format("{:>10}", ShardStats::stat_name[j]); + } + outstr << std::endl; + for (int i = 0; i < cache.num_shards_; i++) { + outstr << fmt::format("{:>5}", i); + ShardStats s = cache.shards_[i].GetStats(); + for (int j = 0; j < stat_cnt; j++) { + outstr << fmt::format("{:>10}", s[j]); + } + outstr << std::endl; + } + } else { + cache.printshard(shard_no, outstr); + } + out.append(outstr.str()); + } else if(command == std::string("rocksdb reset cache ") + cache.name) { + for (int i = 0; i < cache.num_shards_; i++) { + cache.shards_[i].ClearStats(); + } + } else { + ss << "Invalid command" << std::endl; + r = -ENOSYS; + } + return r; + }; +}; + +#undef dout_context +#define dout_context cct + +BinnedLRUCache::BinnedLRUCache( + CephContext *c, + const std::string& name, + size_t capacity, + int num_shard_bits, + bool strict_capacity_limit, + double high_pri_pool_ratio) + : ShardedCache(capacity, num_shard_bits, strict_capacity_limit) + , cct(c) + , name(name) +{ num_shards_ = 1 << num_shard_bits; // TODO: Switch over to use mempool int rc = posix_memalign((void**) &shards_, @@ -536,6 +665,20 @@ BinnedLRUCache::BinnedLRUCache(CephContext *c, new (&shards_[i]) BinnedLRUCacheShard(c, per_shard, strict_capacity_limit, high_pri_pool_ratio); } + SetupPerfCounters(); + asok_hook = new SocketHook(*this); +} + +void BinnedLRUCache::SetupPerfCounters() +{ + int l_first = 0; + int l_last = l_first + 1 + stat_cnt; + PerfCountersBuilder b(cct, std::string("rocksdb-cache-") + name, l_first, l_last); + for (uint32_t j = l_capacity; j <= l_misses; j++) { + b.add_u64(1 + j, ShardStats::stat_name[j], ShardStats::stat_descr[j]); + } + 
perfstats = b.create_perf_counters();
+  cct->get_perfcounters_collection()->add(perfstats);
 }
 
 BinnedLRUCache::~BinnedLRUCache() {
@@ -543,6 +686,11 @@ BinnedLRUCache::~BinnedLRUCache() {
     shards_[i].~BinnedLRUCacheShard();
   }
   aligned_free(shards_);
+  cct->get_perfcounters_collection()->remove(perfstats);
+  delete perfstats;
+  perfstats = nullptr;
+  delete asok_hook;  // NOTE(review): the hook is unregistered only after shards_ was destroyed and freed above; confirm no admin command can still reach the shards at this point
+  asok_hook = nullptr;
 }
 
 CacheShard* BinnedLRUCache::GetShard(int shard) {
@@ -667,9 +815,37 @@ int64_t BinnedLRUCache::commit_cache_size(uint64_t total_bytes)
   }
   ldout(cct, 5) << __func__ << " High Pri Pool Ratio set to " << ratio << dendl;
   SetHighPriPoolRatio(ratio);
+
+  // not related to cache size, but called periodically
+  UpdatePerfCounters();
   return new_bytes;
 }
 
+// Sums the per-shard stats and publishes them: gauges are set, event counters advance by the delta since the last call.
+void BinnedLRUCache::UpdatePerfCounters() {
+  ShardStats stats;
+  for (int i = 0; i < num_shards_; i++) {
+    stats.add(shards_[i].GetStats());
+  }
+  // capacity/usage/pinned/elems are point-in-time values: overwrite them
+  for (int j = l_capacity ; j <= l_elems; j++) {
+    perfstats->set(1 + j, stats[j]);
+  }
+  // inserts/lookups/hits/misses are incremented, so one can reset perf counters
+  for (int j = l_inserts; j <= l_misses; j++) {
+    // ClearStats() zeroes the shard counters, so the sum can drop below prev_stats; never let the unsigned delta wrap
+    const uint64_t prev = prev_stats[j];
+    perfstats->inc(1 + j, stats[j] >= prev ? stats[j] - prev : stats[j]);
+  }
+  prev_stats = stats;
+}
+
+void BinnedLRUCache::printshard(int shard_no, std::stringstream& out) {
+  if (shard_no >= 0 && shard_no < num_shards_) {  // out-of-range (including negative) shard numbers print nothing
+    shards_[shard_no].print_bins(out);
+  }
+}
+
 void BinnedLRUCache::shift_bins() {
   for (int s = 0; s < num_shards_; s++) {
     shards_[s].shift_bins();
@@ -699,7 +875,8 @@ void BinnedLRUCache::set_bin_count(uint32_t count) {
 }
 
 std::shared_ptr NewBinnedLRUCache(
-    CephContext *c,
+    CephContext *c,
+    const std::string& name,
     size_t capacity,
     int num_shard_bits,
     bool strict_capacity_limit,
@@ -715,7 +892,7 @@ std::shared_ptr NewBinnedLRUCache(
     num_shard_bits = GetDefaultCacheShardBits(capacity);
   }
   return std::make_shared(
-      c, capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio);
+      c, name, capacity, num_shard_bits, strict_capacity_limit, 
high_pri_pool_ratio); } } // namespace rocksdb_cache diff --git a/src/kv/rocksdb_cache/BinnedLRUCache.h b/src/kv/rocksdb_cache/BinnedLRUCache.h index cb3487d30ddb..a419154c2773 100644 --- a/src/kv/rocksdb_cache/BinnedLRUCache.h +++ b/src/kv/rocksdb_cache/BinnedLRUCache.h @@ -18,6 +18,7 @@ #include "common/dout.h" #include "include/ceph_assert.h" #include "common/ceph_context.h" +#include "common/admin_socket.h" namespace rocksdb_cache { @@ -49,6 +50,7 @@ namespace rocksdb_cache { std::shared_ptr NewBinnedLRUCache( CephContext *c, + const std::string& name, size_t capacity, int num_shard_bits = -1, bool strict_capacity_limit = false, @@ -169,6 +171,56 @@ class BinnedLRUHandleTable { uint32_t elems_; }; +enum stat_e : int { + l_capacity = 0, // capacity assigned to the shard + l_usage, // current usage of the shard + l_pinned, // size in elements currently referenced + l_elems, // count of separate items in shard + l_inserts, // increased when element inserted into the cache + l_lookups, // increased when trying to find element in shard + l_hits, // increased when lookup successful + l_misses, // calculated from lookups - hits + stat_cnt +}; + +struct ShardStats { + uint64_t val[stat_cnt] = {0}; + uint64_t& operator[](int idx) { + return val[idx]; + } + + static constexpr char const* stat_name[stat_cnt] = { + "capacity", + "usage", + "pinned", + "elems", + "inserts", + "lookups", + "hits", + "misses", + }; + static constexpr char const* stat_descr[stat_cnt] = { + "capacity assigned", + "current usage", + "currently pinned size (in use)", + "number of elems in shard", + "inserts into shard", + "lookups for an element", + "lookup successful", + "lookup failure", + }; + void add(const ShardStats& other) { + for (int j = 0; j < stat_cnt; j++) { + val[j] += other.val[j]; + } + } + void sub(const ShardStats& other) { + for (int j = 0; j < stat_cnt; j++) { + val[j] -= other.val[j]; + } + } +}; + // A single shard of sharded cache. 
class alignas(CACHE_LINE_SIZE) BinnedLRUCacheShard : public CacheShard { public: @@ -243,6 +295,10 @@ class alignas(CACHE_LINE_SIZE) BinnedLRUCacheShard : public CacheShard { // Get the byte counts for a range of age bins uint64_t sum_bins(uint32_t start, uint32_t end) const; + ShardStats GetStats(); + void ClearStats(); + void print_bins(std::stringstream& out) const; + private: CephContext *cct; void LRU_Remove(BinnedLRUHandle* e); @@ -262,13 +318,7 @@ class alignas(CACHE_LINE_SIZE) BinnedLRUCacheShard : public CacheShard { // holding the mutex_ void EvictFromLRU(size_t charge, BinnedLRUHandle*& deleted); - void FreeDeleted(BinnedLRUHandle* deleted) { - while (deleted) { - auto* entry = deleted; - deleted = deleted->next; - entry->Free(); - } - } + int FreeDeleted(BinnedLRUHandle* deleted); // Initialized before use. size_t capacity_; @@ -294,6 +344,8 @@ class alignas(CACHE_LINE_SIZE) BinnedLRUCacheShard : public CacheShard { // Pointer to head of low-pri pool in LRU list. BinnedLRUHandle* lru_low_pri_; + // Info about the shard + ShardStats stats; // ------------^^^^^^^^^^^^^----------- // Not frequently modified data members // ------------------------------------ @@ -324,7 +376,7 @@ class alignas(CACHE_LINE_SIZE) BinnedLRUCacheShard : public CacheShard { class BinnedLRUCache : public ShardedCache { public: - BinnedLRUCache(CephContext *c, size_t capacity, int num_shard_bits, + BinnedLRUCache(CephContext *c, const std::string& name, size_t capacity, int num_shard_bits, bool strict_capacity_limit, double high_pri_pool_ratio); virtual ~BinnedLRUCache(); virtual const char* Name() const override { return "BinnedLRUCache"; } @@ -362,10 +414,20 @@ class BinnedLRUCache : public ShardedCache { return "RocksDB Binned LRU Cache"; } + private: + void SetupPerfCounters(); + void UpdatePerfCounters(); + void printshard(int shard_no, std::stringstream& out); private: CephContext *cct; + std::string name; BinnedLRUCacheShard* shards_; int num_shards_ = 0; + PerfCounters* 
perfstats = nullptr; + ShardStats prev_stats; + class SocketHook; + friend class SocketHook; + AdminSocketHook* asok_hook = nullptr; }; } // namespace rocksdb_cache -- 2.47.3