From 5635183833727b09a645399dd098bc3118ddd4ba Mon Sep 17 00:00:00 2001 From: Mark Nelson Date: Tue, 13 Oct 2020 23:24:18 +0000 Subject: [PATCH] common/PriorityCache: Implement Cache Age Binning Signed-off-by: Mark Nelson --- src/common/PriorityCache.cc | 9 +- src/common/PriorityCache.h | 14 ++- src/common/options/global.yaml.in | 99 ++++++++++++++++ src/kv/rocksdb_cache/BinnedLRUCache.cc | 93 +++++++++++++-- src/kv/rocksdb_cache/BinnedLRUCache.h | 22 ++++ src/kv/rocksdb_cache/ShardedCache.h | 40 +++++++ src/mon/OSDMonitor.cc | 10 ++ src/os/bluestore/BlueStore.cc | 103 ++++++++++++++--- src/os/bluestore/BlueStore.h | 149 ++++++++++++++++++++++--- src/tools/rbd_mirror/Mirror.cc | 13 +++ 10 files changed, 510 insertions(+), 42 deletions(-) diff --git a/src/common/PriorityCache.cc b/src/common/PriorityCache.cc index ff96ad7a134..0fe781b3e1e 100644 --- a/src/common/PriorityCache.cc +++ b/src/common/PriorityCache.cc @@ -305,7 +305,6 @@ namespace PriorityCache // Commit the new cache size int64_t committed = it->second->commit_cache_size(tuned_mem); - // Update the perf counters int64_t alloc = it->second->get_cache_bytes(); @@ -314,6 +313,14 @@ namespace PriorityCache } } + void Manager::shift_bins() + { + for (auto &l : loggers) { + auto it = caches.find(l.first); + it->second->shift_bins(); + } + } + void Manager::balance_priority(int64_t *mem_avail, Priority pri) { std::unordered_map> tmp_caches = caches; diff --git a/src/common/PriorityCache.h b/src/common/PriorityCache.h index 362b5477da2..8233d0ecf27 100644 --- a/src/common/PriorityCache.h +++ b/src/common/PriorityCache.h @@ -100,6 +100,18 @@ namespace PriorityCache { // Get the name of this cache. virtual std::string get_cache_name() const = 0; + + // Rotate the bins + virtual void shift_bins() = 0; + + // Import user bins (from PRI1 to LAST-1) + virtual void import_bins(const std::vector &bins) = 0; + + // Set bins (PRI0 and LAST should be ignored) + virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) = 0; + + // Get bins + virtual uint64_t get_bins(PriorityCache::Priority pri) const = 0; }; class Manager { @@ -140,7 +152,7 @@ namespace PriorityCache { void clear(); void tune_memory(); void balance(); - + void shift_bins(); private: void balance_priority(int64_t *mem_avail, Priority pri); }; diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in index 557c67646be..084b591538f 100644 --- a/src/common/options/global.yaml.in +++ b/src/common/options/global.yaml.in @@ -4677,6 +4677,105 @@ options: default: 5 see_also: - bluestore_cache_autotune +- name: bluestore_cache_age_bin_interval + type: float + level: dev + desc: The duration (in seconds) represented by a single cache age bin. + fmt_desc: | + The caches used by bluestore will assign cache entries to an 'age bin' + that represents a period of time during which that cache entry was most + recently updated. By binning the caches in this way, Ceph's priority + cache balancing code can make better decisions about which caches should + receive priority based on the relative ages of items in the caches. By + default, a single cache age bin represents 1 second of time. Note: + Setting this interval too small can result in high CPU usage and lower + performance. 
+ default: 1 + see_also: + - bluestore_cache_age_bins_kv + - bluestore_cache_age_bins_kv_onode + - bluestore_cache_age_bins_meta + - bluestore_cache_age_bins_data +- name: bluestore_cache_age_bins_kv + type: str + level: dev + desc: A 10 element, space separated list of age bins for kv cache + fmt_desc: | + A 10 element, space separated list of cache age bins grouped by + priority such that PRI1=[0,n), PRI2=[n,n+1), PRI3=[n+1,n+2) ... + PRI10=[n+8,n+9). Values represent the starting and ending bin for each + priority level. A 0 in the 2nd term will prevent any items from being + associated with that priority. bin duration is based on the + bluestore_cache_age_bin_interval value. For example, + "1 5 0 0 0 0 0 0 0 0" defines bin ranges for two priority levels. PRI1 + contains 1 age bin. Assuming the default age bin interval of 1 second, + PRI1 represents cache items that are less than 1 second old. PRI2 has 4 + bins representing cache items that are 1 to less than 5 seconds old. All + other cache items in this example are associated with the lowest priority + level as PRI3-PRI10 all have 0s in their second term. + default: "1 2 6 24 120 720 0 0 0 0" + see_also: + - bluestore_cache_age_bin_interval +- name: bluestore_cache_age_bins_kv_onode + type: str + level: dev + desc: A 10 element, space separated list of age bins for kv onode cache + fmt_desc: | + A 10 element, space separated list of cache age bins grouped by + priority such that PRI1=[0,n), PRI2=[n,n+1), PRI3=[n+1,n+2) ... + PRI10=[n+8,n+9). Values represent the starting and ending bin for each + priority level. A 0 in the 2nd term will prevent any items from being + associated with that priority. bin duration is based on the + bluestore_cache_age_bin_interval value. For example, + "1 5 0 0 0 0 0 0 0 0" defines bin ranges for two priority levels. PRI1 + contains 1 age bin. Assuming the default age bin interval of 1 second, + PRI1 represents cache items that are less than 1 second old. PRI2 has 4 + bins representing cache items that are 1 to less than 5 seconds old. All + other cache items in this example are associated with the lowest priority + level as PRI3-PRI10 all have 0s in their second term. + default: "0 0 0 0 0 0 0 0 0 720" + see_also: + - bluestore_cache_age_bin_interval +- name: bluestore_cache_age_bins_meta + type: str + level: dev + desc: A 10 element, space separated list of age bins for onode cache + fmt_desc: | + A 10 element, space separated list of cache age bins grouped by + priority such that PRI1=[0,n), PRI2=[n,n+1), PRI3=[n+1,n+2) ... + PRI10=[n+8,n+9). Values represent the starting and ending bin for each + priority level. A 0 in the 2nd term will prevent any items from being + associated with that priority. bin duration is based on the + bluestore_cache_age_bin_interval value. For example, + "1 5 0 0 0 0 0 0 0 0" defines bin ranges for two priority levels. PRI1 + contains 1 age bin. Assuming the default age bin interval of 1 second, + PRI1 represents cache items that are less than 1 second old. PRI2 has 4 + bins representing cache items that are 1 to less than 5 seconds old. All + other cache items in this example are associated with the lowest priority + level as PRI3-PRI10 all have 0s in their second term. 
+ default: "1 2 6 24 120 720 0 0 0 0" + see_also: + - bluestore_cache_age_bin_interval +- name: bluestore_cache_age_bins_data + type: str + level: dev + desc: A 10 element, space separated list of age bins for data cache + fmt_desc: | + A 10 element, space separated list of cache age bins grouped by + priority such that PRI1=[0,n), PRI2=[n,n+1), PRI3=[n+1,n+2) ... + PRI10=[n+8,n+9). Values represent the starting and ending bin for each + priority level. A 0 in the 2nd term will prevent any items from being + associated with that priority. bin duration is based on the + bluestore_cache_age_bin_interval value. For example, + "1 5 0 0 0 0 0 0 0 0" defines bin ranges for two priority levels. PRI1 + contains 1 age bin. Assuming the default age bin interval of 1 second, + PRI1 represents cache items that are less than 1 second old. PRI2 has 4 + bins representing cache items that are 1 to less than 5 seconds old. All + other cache items in this example are associated with the lowest priority + level as PRI3-PRI10 all have 0s in their second term. + default: "1 2 6 24 120 720 0 0 0 0" + see_also: + - bluestore_cache_age_bin_interval - name: bluestore_alloc_stats_dump_interval type: float level: dev diff --git a/src/kv/rocksdb_cache/BinnedLRUCache.cc b/src/kv/rocksdb_cache/BinnedLRUCache.cc index 47c56e2ddd7..fce26c7b07d 100644 --- a/src/kv/rocksdb_cache/BinnedLRUCache.cc +++ b/src/kv/rocksdb_cache/BinnedLRUCache.cc @@ -110,7 +110,9 @@ BinnedLRUCacheShard::BinnedLRUCacheShard(CephContext *c, size_t capacity, bool s high_pri_pool_ratio_(high_pri_pool_ratio), high_pri_pool_capacity_(0), usage_(0), - lru_usage_(0) { + lru_usage_(0), + age_bins(1) { + shift_bins(); // Make empty circular linked list lru_.next = &lru_; lru_.prev = &lru_; @@ -208,12 +210,17 @@ void BinnedLRUCacheShard::LRU_Remove(BinnedLRUHandle* e) { if (e->InHighPriPool()) { ceph_assert(high_pri_pool_usage_ >= e->charge); high_pri_pool_usage_ -= e->charge; + } else { + ceph_assert(*(e->age_bin) >= e->charge); + *(e->age_bin) -= e->charge; } } void BinnedLRUCacheShard::LRU_Insert(BinnedLRUHandle* e) { ceph_assert(e->next == nullptr); ceph_assert(e->prev == nullptr); + e->age_bin = age_bins.front(); + if (high_pri_pool_ratio_ > 0 && e->IsHighPri()) { // Inset "e" to head of LRU list. e->next = &lru_; @@ -232,10 +239,25 @@ void BinnedLRUCacheShard::LRU_Insert(BinnedLRUHandle* e) { e->next->prev = e; e->SetInHighPriPool(false); lru_low_pri_ = e; + *(e->age_bin) += e->charge; } lru_usage_ += e->charge; } +uint64_t BinnedLRUCacheShard::sum_bins(uint32_t start, uint32_t end) const { + std::lock_guard l(mutex_); + auto size = age_bins.size(); + if (size < start) { + return 0; + } + uint64_t bytes = 0; + end = (size < end) ? size : end; + for (auto i = start; i < end; i++) { + bytes += *(age_bins[i]); + } + return bytes; +} + void BinnedLRUCacheShard::MaintainPoolSize() { while (high_pri_pool_usage_ > high_pri_pool_capacity_) { // Overflow last entry in high-pri pool to low-pri pool. 
@@ -243,6 +265,7 @@ void BinnedLRUCacheShard::MaintainPoolSize() { ceph_assert(lru_low_pri_ != &lru_); lru_low_pri_->SetInHighPriPool(false); high_pri_pool_usage_ -= lru_low_pri_->charge; + *(lru_low_pri_->age_bin) += lru_low_pri_->charge; } } @@ -460,6 +483,21 @@ size_t BinnedLRUCacheShard::GetPinnedUsage() const { return usage_ - lru_usage_; } +void BinnedLRUCacheShard::shift_bins() { + std::lock_guard l(mutex_); + age_bins.push_front(std::make_shared(0)); +} + +uint32_t BinnedLRUCacheShard::get_bin_count() const { + std::lock_guard l(mutex_); + return age_bins.capacity(); +} + +void BinnedLRUCacheShard::set_bin_count(uint32_t count) { + std::lock_guard l(mutex_); + age_bins.set_capacity(count); +} + std::string BinnedLRUCacheShard::GetPrintableOptions() const { const int kBufferSize = 200; char buffer[kBufferSize]; @@ -577,22 +615,33 @@ int64_t BinnedLRUCache::request_cache_bytes(PriorityCache::Priority pri, uint64_ int64_t assigned = get_cache_bytes(pri); int64_t request = 0; - switch (pri) { + switch(pri) { // PRI0 is for rocksdb's high priority items (indexes/filters) case PriorityCache::Priority::PRI0: { - request = GetHighPriPoolUsage(); + // Because we want the high pri cache to grow independently of the low + // pri cache, request a chunky allocation independent of the other + // priorities. + request = PriorityCache::get_chunk(GetHighPriPoolUsage(), total_cache); break; } - // All other cache items are currently shoved into the PRI1 priority. - case PriorityCache::Priority::PRI1: + case PriorityCache::Priority::LAST: { + auto max = get_bin_count(); request = GetUsage(); request -= GetHighPriPoolUsage(); + request -= sum_bins(0, max); break; } default: - break; + { + ceph_assert(pri > 0 && pri < PriorityCache::Priority::LAST); + auto prev_pri = static_cast(pri - 1); + uint64_t start = get_bins(prev_pri); + uint64_t end = get_bins(pri); + request = sum_bins(start, end); + break; + } } request = (request > assigned) ? 
request - assigned : 0; ldout(cct, 10) << __func__ << " Priority: " << static_cast(pri) @@ -612,15 +661,41 @@ int64_t BinnedLRUCache::commit_cache_size(uint64_t total_bytes) double ratio = 0; if (new_bytes > 0) { int64_t pri0_bytes = get_cache_bytes(PriorityCache::Priority::PRI0); - // Add 10% of the "reserved" bytes so the ratio can't get stuck at 0 - pri0_bytes += (new_bytes - get_cache_bytes()) / 10; ratio = (double) pri0_bytes / new_bytes; } - ldout(cct, 10) << __func__ << " High Pri Pool Ratio set to " << ratio << dendl; + ldout(cct, 5) << __func__ << " High Pri Pool Ratio set to " << ratio << dendl; SetHighPriPoolRatio(ratio); return new_bytes; } +void BinnedLRUCache::shift_bins() { + for (int s = 0; s < num_shards_; s++) { + shards_[s].shift_bins(); + } +} + +uint64_t BinnedLRUCache::sum_bins(uint32_t start, uint32_t end) const { + uint64_t bytes = 0; + for (int s = 0; s < num_shards_; s++) { + bytes += shards_[s].sum_bins(start, end); + } + return bytes; +} + +uint32_t BinnedLRUCache::get_bin_count() const { + uint32_t result = 0; + if (num_shards_ > 0) { + result = shards_[0].get_bin_count(); + } + return result; +} + +void BinnedLRUCache::set_bin_count(uint32_t count) { + for (int s = 0; s < num_shards_; s++) { + shards_[s].set_bin_count(count); + } +} + std::shared_ptr NewBinnedLRUCache( CephContext *c, size_t capacity, diff --git a/src/kv/rocksdb_cache/BinnedLRUCache.h b/src/kv/rocksdb_cache/BinnedLRUCache.h index 88bf4502e89..fcf49b7e870 100644 --- a/src/kv/rocksdb_cache/BinnedLRUCache.h +++ b/src/kv/rocksdb_cache/BinnedLRUCache.h @@ -12,6 +12,7 @@ #include #include +#include #include "ShardedCache.h" #include "common/autovector.h" @@ -55,6 +56,7 @@ std::shared_ptr NewBinnedLRUCache( double high_pri_pool_ratio = 0.0); struct BinnedLRUHandle { + std::shared_ptr age_bin; void* value; DeleterFn deleter; BinnedLRUHandle* next_hash; @@ -230,6 +232,18 @@ class alignas(CACHE_LINE_SIZE) BinnedLRUCacheShard : public CacheShard { // Retrieves high pri pool usage size_t GetHighPriPoolUsage() const; + // Rotate the bins + void shift_bins(); + + // Get the bin count + uint32_t get_bin_count() const; + + // Set the bin count + void set_bin_count(uint32_t count); + + // Get the byte counts for a range of age bins + uint64_t sum_bins(uint32_t start, uint32_t end) const; + private: CephContext *cct; void LRU_Remove(BinnedLRUHandle* e); @@ -296,6 +310,9 @@ class alignas(CACHE_LINE_SIZE) BinnedLRUCacheShard : public CacheShard { // We don't count mutex_ as the cache's internal state so semantically we // don't mind mutex_ invoking the non-const actions. 
mutable std::mutex mutex_; + + // Circular buffer of byte counters for age binning + boost::circular_buffer> age_bins; }; class BinnedLRUCache : public ShardedCache { @@ -329,6 +346,11 @@ class BinnedLRUCache : public ShardedCache { virtual int64_t get_committed_size() const { return GetCapacity(); } + virtual void shift_bins(); + uint64_t sum_bins(uint32_t start, uint32_t end) const; + uint32_t get_bin_count() const; + void set_bin_count(uint32_t count); + virtual std::string get_cache_name() const { return "RocksDB Binned LRU Cache"; } diff --git a/src/kv/rocksdb_cache/ShardedCache.h b/src/kv/rocksdb_cache/ShardedCache.h index f98421a09a3..0ed692233f1 100644 --- a/src/kv/rocksdb_cache/ShardedCache.h +++ b/src/kv/rocksdb_cache/ShardedCache.h @@ -104,6 +104,9 @@ class ShardedCache : public rocksdb::Cache, public PriorityCache::PriCache { int GetNumShardBits() const { return num_shard_bits_; } + virtual uint32_t get_bin_count() const = 0; + virtual void set_bin_count(uint32_t count) = 0; + // PriCache virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const { return cache_bytes[pri]; @@ -127,6 +130,42 @@ class ShardedCache : public rocksdb::Cache, public PriorityCache::PriCache { } virtual void set_cache_ratio(double ratio) { cache_ratio = ratio; + } + virtual uint64_t get_bins(PriorityCache::Priority pri) const { + if (pri > PriorityCache::Priority::PRI0 && + pri < PriorityCache::Priority::LAST) { + return bins[pri]; + } + return 0; + } + virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) { + if (pri <= PriorityCache::Priority::PRI0 || + pri >= PriorityCache::Priority::LAST) { + return; + } + bins[pri] = end_bin; + uint64_t max = 0; + for (int pri = 1; pri < PriorityCache::Priority::LAST; pri++) { + if (bins[pri] > max) { + max = bins[pri]; + } + } + set_bin_count(max); + } + virtual void import_bins(const std::vector &bins_v) { + uint64_t max = 0; + for (int pri = 1; pri < PriorityCache::Priority::LAST; pri++) { + unsigned i = (unsigned) pri - 1; + if (i < bins_v.size()) { + bins[pri] = bins_v[i]; + if (bins[pri] > max) { + max = bins[pri]; + } + } else { + bins[pri] = 0; + } + } + set_bin_count(max); } virtual std::string get_cache_name() const = 0; @@ -141,6 +180,7 @@ class ShardedCache : public rocksdb::Cache, public PriorityCache::PriCache { return (num_shard_bits_ > 0) ? (hash >> (32 - num_shard_bits_)) : 0; } + uint64_t bins[PriorityCache::Priority::LAST+1] = {0}; int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0}; double cache_ratio = 0; diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index ab9c53c060a..acf14bdbe54 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -214,6 +214,16 @@ struct OSDMemCache : public PriorityCache::PriCache { virtual void set_cache_ratio(double ratio) { cache_ratio = ratio; } + virtual void shift_bins() { + } + virtual void import_bins(const std::vector &bins) { + } + virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) { + } + virtual uint64_t get_bins(PriorityCache::Priority pri) const { + return 0; + } + virtual string get_cache_name() const = 0; }; diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 89cc0713a44..e49d797fd23 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -1108,6 +1108,8 @@ struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard { { if (o->put_cache()) { (level > 0) ? 
lru.push_front(*o) : lru.push_back(*o); + o->cache_age_bin = age_bins.front(); + *(o->cache_age_bin) += 1; } else { ++num_pinned; } @@ -1118,6 +1120,7 @@ struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard { void _rm(BlueStore::Onode* o) override { if (o->pop_cache()) { + *(o->cache_age_bin) -= 1; lru.erase(lru.iterator_to(*o)); } else { ceph_assert(num_pinned); @@ -1129,6 +1132,7 @@ struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard { } void _pin(BlueStore::Onode* o) override { + *(o->cache_age_bin) -= 1; lru.erase(lru.iterator_to(*o)); ++num_pinned; dout(20) << __func__ << " " << this << " " << " " << " " << o->oid << " pinned" << dendl; @@ -1136,6 +1140,8 @@ struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard { void _unpin(BlueStore::Onode* o) override { lru.push_front(*o); + o->cache_age_bin = age_bins.front(); + *(o->cache_age_bin) += 1; ceph_assert(num_pinned); --num_pinned; dout(20) << __func__ << " " << this << " " << " " << " " << o->oid << " unpinned" << dendl; @@ -1169,6 +1175,7 @@ struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard { ceph_assert(n == 0); lru.erase(p); } + *(o->cache_age_bin) -= 1; auto pinned = !o->pop_cache(); ceph_assert(!pinned); o->c->onode_map._remove(o->oid); @@ -1230,11 +1237,15 @@ struct LruBufferCacheShard : public BlueStore::BufferCacheShard { lru.push_back(*b); } buffer_bytes += b->length; + b->cache_age_bin = age_bins.front(); + *(b->cache_age_bin) += b->length; num = lru.size(); } void _rm(BlueStore::Buffer *b) override { ceph_assert(buffer_bytes >= b->length); buffer_bytes -= b->length; + assert(*(b->cache_age_bin) >= b->length); + *(b->cache_age_bin) -= b->length; auto q = lru.iterator_to(*b); lru.erase(q); num = lru.size(); @@ -1246,11 +1257,16 @@ struct LruBufferCacheShard : public BlueStore::BufferCacheShard { void _adjust_size(BlueStore::Buffer *b, int64_t delta) override { ceph_assert((int64_t)buffer_bytes + delta >= 0); buffer_bytes += delta; + assert(*(b->cache_age_bin) + delta >= 0); + *(b->cache_age_bin) += delta; } void _touch(BlueStore::Buffer *b) override { auto p = lru.iterator_to(*b); lru.erase(p); lru.push_front(*b); + *(b->cache_age_bin) -= b->length; + b->cache_age_bin = age_bins.front(); + *(b->cache_age_bin) += b->length; num = lru.size(); _audit("_touch_buffer end"); } @@ -1267,6 +1283,8 @@ struct LruBufferCacheShard : public BlueStore::BufferCacheShard { BlueStore::Buffer *b = &*i; ceph_assert(b->is_clean()); dout(20) << __func__ << " rm " << *b << dendl; + assert(*(b->cache_age_bin) >= b->length); + *(b->cache_age_bin) -= b->length; b->space->_rm_buffer(this, b); } num = lru.size(); @@ -1378,9 +1396,11 @@ public: ceph_abort_msg("bad cache_private"); } } + b->cache_age_bin = age_bins.front(); if (!b->is_empty()) { buffer_bytes += b->length; list_bytes[b->cache_private] += b->length; + *(b->cache_age_bin) += b->length; } num = hot.size() + warm_in.size(); } @@ -1393,6 +1413,8 @@ public: buffer_bytes -= b->length; ceph_assert(list_bytes[b->cache_private] >= b->length); list_bytes[b->cache_private] -= b->length; + assert(*(b->cache_age_bin) >= b->length); + *(b->cache_age_bin) -= b->length; } switch (b->cache_private) { case BUFFER_WARM_IN: @@ -1435,6 +1457,7 @@ public: if (!b->is_empty()) { buffer_bytes += b->length; list_bytes[b->cache_private] += b->length; + *(b->cache_age_bin) += b->length; } num = hot.size() + warm_in.size(); } @@ -1447,6 +1470,8 @@ public: buffer_bytes += delta; ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0); list_bytes[b->cache_private] += delta; 
+ assert(*(b->cache_age_bin) + delta >= 0); + *(b->cache_age_bin) += delta; } } @@ -1465,6 +1490,9 @@ public: hot.push_front(*b); break; } + *(b->cache_age_bin) -= b->length; + b->cache_age_bin = age_bins.front(); + *(b->cache_age_bin) += b->length; num = hot.size() + warm_in.size(); _audit("_touch_buffer end"); } @@ -1512,7 +1540,9 @@ public: buffer_bytes -= b->length; ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length); list_bytes[BUFFER_WARM_IN] -= b->length; - to_evict_bytes -= b->length; + assert(*(b->cache_age_bin) >= b->length); + *(b->cache_age_bin) -= b->length; + to_evict_bytes -= b->length; evicted += b->length; b->state = BlueStore::Buffer::STATE_EMPTY; b->data.clear(); @@ -4177,6 +4207,7 @@ void *BlueStore::MempoolThread::entry() utime_t next_balance = ceph_clock_now(); utime_t next_resize = ceph_clock_now(); + utime_t next_bin_rotation = ceph_clock_now(); utime_t next_deferred_force_submit = ceph_clock_now(); utime_t alloc_stats_dump_clock = ceph_clock_now(); @@ -4189,21 +4220,47 @@ void *BlueStore::MempoolThread::entry() prev_config_change = cur_config_change; } - // Before we trim, check and see if it's time to rebalance/resize. + // define various intervals for background work + double age_bin_interval = store->cache_age_bin_interval; double autotune_interval = store->cache_autotune_interval; double resize_interval = store->osd_memory_cache_resize_interval; double max_defer_interval = store->max_defer_interval; - double alloc_stats_dump_interval = store->cct->_conf->bluestore_alloc_stats_dump_interval; + // alloc stats dump if (alloc_stats_dump_interval > 0 && alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) { store->_record_allocation_stats(); alloc_stats_dump_clock = ceph_clock_now(); } + // cache age binning + if (age_bin_interval > 0 && next_bin_rotation < ceph_clock_now()) { + if (binned_kv_cache != nullptr) { + binned_kv_cache->import_bins(store->kv_bins); + } + if (binned_kv_onode_cache != nullptr) { + binned_kv_onode_cache->import_bins(store->kv_onode_bins); + } + meta_cache->import_bins(store->meta_bins); + data_cache->import_bins(store->data_bins); + + if (pcm != nullptr) { + pcm->shift_bins(); + } + next_bin_rotation = ceph_clock_now(); + next_bin_rotation += age_bin_interval; + } + // cache balancing if (autotune_interval > 0 && next_balance < ceph_clock_now()) { - _adjust_cache_settings(); + if (binned_kv_cache != nullptr) { + binned_kv_cache->set_cache_ratio(store->cache_kv_ratio); + } + if (binned_kv_onode_cache != nullptr) { + binned_kv_onode_cache->set_cache_ratio(store->cache_kv_onode_ratio); + } + meta_cache->set_cache_ratio(store->cache_meta_ratio); + data_cache->set_cache_ratio(store->cache_data_ratio); // Log events at 5 instead of 20 when balance happens. 
interval_stats_trim = true; @@ -4215,6 +4272,7 @@ void *BlueStore::MempoolThread::entry() next_balance = ceph_clock_now(); next_balance += autotune_interval; } + // memory resizing (ie autotuning) if (resize_interval > 0 && next_resize < ceph_clock_now()) { if (ceph_using_tcmalloc() && pcm != nullptr) { pcm->tune_memory(); @@ -4222,7 +4280,7 @@ void *BlueStore::MempoolThread::entry() next_resize = ceph_clock_now(); next_resize += resize_interval; } - + // deferred force submit if (max_defer_interval > 0 && next_deferred_force_submit < ceph_clock_now()) { if (store->get_deferred_last_submitted() + max_defer_interval < @@ -4249,18 +4307,6 @@ void *BlueStore::MempoolThread::entry() return NULL; } -void BlueStore::MempoolThread::_adjust_cache_settings() -{ - if (binned_kv_cache != nullptr) { - binned_kv_cache->set_cache_ratio(store->cache_kv_ratio); - } - if (binned_kv_onode_cache != nullptr) { - binned_kv_onode_cache->set_cache_ratio(store->cache_kv_onode_ratio); - } - meta_cache->set_cache_ratio(store->cache_meta_ratio); - data_cache->set_cache_ratio(store->cache_data_ratio); -} - void BlueStore::MempoolThread::_resize_shards(bool interval_stats) { size_t onode_shards = store->onode_cache_shards.size(); @@ -4272,7 +4318,7 @@ void BlueStore::MempoolThread::_resize_shards(bool interval_stats) uint64_t cache_size = store->cache_size; int64_t kv_alloc = - static_cast(store->cache_kv_ratio * cache_size); + static_cast(store->cache_kv_ratio * cache_size); int64_t kv_onode_alloc = static_cast(store->cache_kv_onode_ratio * cache_size); int64_t meta_alloc = @@ -4609,6 +4655,11 @@ const char **BlueStore::get_tracked_conf_keys() const "osd_memory_expected_fragmentation", "bluestore_cache_autotune", "bluestore_cache_autotune_interval", + "bluestore_cache_age_bin_interval", + "bluestore_cache_kv_age_bins", + "bluestore_cache_kv_onode_age_bins", + "bluestore_cache_meta_age_bins", + "bluestore_cache_data_age_bins", "bluestore_warn_on_legacy_statfs", "bluestore_warn_on_no_per_pool_omap", "bluestore_warn_on_no_per_pg_omap", @@ -4808,6 +4859,22 @@ int BlueStore::_set_cache_sizes() cache_autotune = cct->_conf.get_val("bluestore_cache_autotune"); cache_autotune_interval = cct->_conf.get_val("bluestore_cache_autotune_interval"); + cache_age_bin_interval = + cct->_conf.get_val("bluestore_cache_age_bin_interval"); + auto _set_bin = [&](std::string conf_name, std::vector* intervals) + { + std::string intervals_str = cct->_conf.get_val(conf_name); + std::istringstream interval_stream(intervals_str); + std::copy( + std::istream_iterator(interval_stream), + std::istream_iterator(), + std::back_inserter(*intervals)); + }; + _set_bin("bluestore_cache_age_bins_kv", &kv_bins); + _set_bin("bluestore_cache_age_bins_kv_onode", &kv_onode_bins); + _set_bin("bluestore_cache_age_bins_meta", &meta_bins); + _set_bin("bluestore_cache_age_bins_data", &data_bins); + osd_memory_target = cct->_conf.get_val("osd_memory_target"); osd_memory_base = cct->_conf.get_val("osd_memory_base"); osd_memory_expected_fragmentation = diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 00fee1ffa43..2afac549b09 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -281,6 +281,7 @@ public: uint64_t seq; uint32_t offset, length; ceph::buffer::list data; + std::shared_ptr cache_age_bin; ///< cache age bin boost::intrusive::list_member_hook<> lru_item; boost::intrusive::list_member_hook<> state_item; @@ -1152,6 +1153,7 @@ public: /// protect flush_txns ceph::mutex flush_lock = 
ceph::make_mutex("BlueStore::Onode::flush_lock"); ceph::condition_variable flush_cond; ///< wait here for uncommitted txns + std::shared_ptr cache_age_bin; ///< cache age bin Onode(Collection *c, const ghobject_t& o, const mempool::bluestore_cache_meta::string& k) @@ -1260,8 +1262,11 @@ public: std::atomic max = {0}; std::atomic num = {0}; + boost::circular_buffer> age_bins; - CacheShard(CephContext* cct) : cct(cct), logger(nullptr) {} + CacheShard(CephContext* cct) : cct(cct), logger(nullptr), age_bins(1) { + shift_bins(); + } virtual ~CacheShard() {} void set_max(uint64_t max_) { @@ -1288,10 +1293,36 @@ public: void flush() { std::lock_guard l(lock); // we should not be shutting down after the blackhole is enabled - assert(!cct->_conf->objectstore_blackhole); + ceph_assert(!cct->_conf->objectstore_blackhole); _trim_to(0); } + virtual void shift_bins() { + std::lock_guard l(lock); + age_bins.push_front(std::make_shared(0)); + } + virtual uint32_t get_bin_count() { + std::lock_guard l(lock); + return age_bins.capacity(); + } + virtual void set_bin_count(uint32_t count) { + std::lock_guard l(lock); + age_bins.set_capacity(count); + } + virtual uint64_t sum_bins(uint32_t start, uint32_t end) { + std::lock_guard l(lock); + auto size = age_bins.size(); + if (size < start) { + return 0; + } + uint64_t count = 0; + end = (size < end) ? size : end; + for (auto i = start; i < end; i++) { + count += *(age_bins[i]); + } + return count; + } + #ifdef DEBUG_CACHE virtual void _audit(const char *s) = 0; #else @@ -1302,7 +1333,6 @@ public: /// A Generic onode Cache Shard struct OnodeCacheShard : public CacheShard { std::atomic num_pinned = {0}; - std::array, 64> dumped_onodes; virtual void _pin(Onode* o) = 0; @@ -1990,7 +2020,7 @@ public: void flush_all_but_last() { std::unique_lock l(qlock); - assert (q.size() >= 1); + ceph_assert (q.size() >= 1); while (true) { // std::set flag before the check because the condition // may become true outside qlock, and we need to make @@ -2240,7 +2270,12 @@ private: double cache_kv_onode_ratio = 0; ///< cache ratio dedicated to kv onodes (e.g., rocksdb onode CF) double cache_data_ratio = 0; ///< cache ratio dedicated to object data bool cache_autotune = false; ///< cache autotune setting + double cache_age_bin_interval = 0; ///< time to wait between cache age bin rotations double cache_autotune_interval = 0; ///< time to wait between cache rebalancing + std::vector kv_bins; ///< kv autotune bins + std::vector kv_onode_bins; ///< kv onode autotune bins + std::vector meta_bins; ///< meta autotune bins + std::vector data_bins; ///< data autotune bins uint64_t osd_memory_target = 0; ///< OSD memory target when autotuning cache uint64_t osd_memory_base = 0; ///< OSD base memory when autotuning cache double osd_memory_expected_fragmentation = 0; ///< expected memory fragmentation @@ -2270,6 +2305,7 @@ private: struct MempoolCache : public PriorityCache::PriCache { BlueStore *store; + uint64_t bins[PriorityCache::Priority::LAST+1] = {0}; int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0}; int64_t committed_bytes = 0; double cache_ratio = 0; @@ -2277,21 +2313,34 @@ private: MempoolCache(BlueStore *s) : store(s) {}; virtual uint64_t _get_used_bytes() const = 0; + virtual uint64_t _sum_bins(uint32_t start, uint32_t end) const = 0; virtual int64_t request_cache_bytes( PriorityCache::Priority pri, uint64_t total_cache) const { int64_t assigned = get_cache_bytes(pri); switch (pri) { - // All cache items are currently shoved into the PRI1 priority - case 
PriorityCache::Priority::PRI1: + case PriorityCache::Priority::PRI0: + { + // BlueStore caches currently don't put anything in PRI0 + break; + } + case PriorityCache::Priority::LAST: { - int64_t request = _get_used_bytes(); + uint32_t max = get_bin_count(); + int64_t request = _get_used_bytes() - _sum_bins(0, max); return(request > assigned) ? request - assigned : 0; } default: - break; - } + { + ceph_assert(pri > 0 && pri < PriorityCache::Priority::LAST); + auto prev_pri = static_cast(pri - 1); + uint64_t start = get_bins(prev_pri); + uint64_t end = get_bins(pri); + int64_t request = _sum_bins(start, end); + return(request > assigned) ? request - assigned : 0; + } + } return -EOPNOTSUPP; } @@ -2321,6 +2370,42 @@ private: virtual int64_t get_committed_size() const { return committed_bytes; } + virtual uint64_t get_bins(PriorityCache::Priority pri) const { + if (pri > PriorityCache::Priority::PRI0 && + pri < PriorityCache::Priority::LAST) { + return bins[pri]; + } + return 0; + } + virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) { + if (pri <= PriorityCache::Priority::PRI0 || + pri >= PriorityCache::Priority::LAST) { + return; + } + bins[pri] = end_bin; + uint64_t max = 0; + for (int pri = 1; pri < PriorityCache::Priority::LAST; pri++) { + if (bins[pri] > max) { + max = bins[pri]; + } + } + set_bin_count(max); + } + virtual void import_bins(const std::vector &bins_v) { + uint64_t max = 0; + for (int pri = 1; pri < PriorityCache::Priority::LAST; pri++) { + unsigned i = (unsigned) pri - 1; + if (i < bins_v.size()) { + bins[pri] = bins_v[i]; + if (bins[pri] > max) { + max = bins[pri]; + } + } else { + bins[pri] = 0; + } + } + set_bin_count(max); + } virtual double get_cache_ratio() const { return cache_ratio; } @@ -2328,11 +2413,21 @@ private: cache_ratio = ratio; } virtual std::string get_cache_name() const = 0; + virtual uint32_t get_bin_count() const = 0; + virtual void set_bin_count(uint32_t count) = 0; }; struct MetaCache : public MempoolCache { MetaCache(BlueStore *s) : MempoolCache(s) {}; + virtual uint32_t get_bin_count() const { + return store->onode_cache_shards[0]->get_bin_count(); + } + virtual void set_bin_count(uint32_t count) { + for (auto i : store->onode_cache_shards) { + i->set_bin_count(count); + } + } virtual uint64_t _get_used_bytes() const { return mempool::bluestore_Buffer::allocated_bytes() + mempool::bluestore_Blob::allocated_bytes() + @@ -2343,17 +2438,26 @@ private: mempool::bluestore_SharedBlob::allocated_bytes() + mempool::bluestore_inline_bl::allocated_bytes(); } - + virtual void shift_bins() { + for (auto i : store->onode_cache_shards) { + i->shift_bins(); + } + } + virtual uint64_t _sum_bins(uint32_t start, uint32_t end) const { + uint64_t onodes = 0; + for (auto i : store->onode_cache_shards) { + onodes += i->sum_bins(start, end); + } + return onodes*get_bytes_per_onode(); + } virtual std::string get_cache_name() const { return "BlueStore Meta Cache"; } - uint64_t _get_num_onodes() const { uint64_t onode_num = mempool::bluestore_cache_onode::allocated_items(); return (2 > onode_num) ? 
2 : onode_num; } - double get_bytes_per_onode() const { return (double)_get_used_bytes() / (double)_get_num_onodes(); } @@ -2363,6 +2467,14 @@ private: struct DataCache : public MempoolCache { DataCache(BlueStore *s) : MempoolCache(s) {}; + virtual uint32_t get_bin_count() const { + return store->buffer_cache_shards[0]->get_bin_count(); + } + virtual void set_bin_count(uint32_t count) { + for (auto i : store->buffer_cache_shards) { + i->set_bin_count(count); + } + } virtual uint64_t _get_used_bytes() const { uint64_t bytes = 0; for (auto i : store->buffer_cache_shards) { @@ -2370,6 +2482,18 @@ private: } return bytes; } + virtual void shift_bins() { + for (auto i : store->buffer_cache_shards) { + i->shift_bins(); + } + } + virtual uint64_t _sum_bins(uint32_t start, uint32_t end) const { + uint64_t bytes = 0; + for (auto i : store->buffer_cache_shards) { + bytes += i->sum_bins(start, end); + } + return bytes; + } virtual std::string get_cache_name() const { return "BlueStore Data Cache"; } @@ -2396,7 +2520,6 @@ private: } private: - void _adjust_cache_settings(); void _update_cache_settings(); void _resize_shards(bool interval_stats); } mempool_thread; diff --git a/src/tools/rbd_mirror/Mirror.cc b/src/tools/rbd_mirror/Mirror.cc index 38f35edcd44..a67afcf42de 100644 --- a/src/tools/rbd_mirror/Mirror.cc +++ b/src/tools/rbd_mirror/Mirror.cc @@ -255,6 +255,19 @@ struct PriCache : public PriorityCache::PriCache { m_cache_ratio = ratio; } + void shift_bins() override { + } + + void import_bins(const std::vector &intervals) override { + } + + void set_bins(PriorityCache::Priority pri, uint64_t end_interval) override { + } + + uint64_t get_bins(PriorityCache::Priority pri) const override { + return 0; + } + std::string get_cache_name() const override { return m_name; } -- 2.39.5
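Note (not part of the patch): end to end, the pieces fit together as follows. Every bluestore_cache_age_bin_interval seconds BlueStore's MempoolThread calls import_bins() on each cache with the configured end-bin lists and then PriorityCache::Manager::shift_bins() rotates every shard's circular buffer of byte counters; inserts and touches add an entry's charge to the newest bin, and during balancing request_cache_bytes() charges sum_bins(start, end) for each priority's bin range, so recently used data competes for memory at a higher priority than cold data. The sketch below is illustrative only, with hypothetical names rather than code from the patch, and models that per-shard bookkeeping with the same boost::circular_buffer of shared counters the shards use.

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <memory>
    #include <boost/circular_buffer.hpp>

    // Hypothetical stand-alone model of one shard's age-bin bookkeeping; the
    // real code lives in BinnedLRUCacheShard and BlueStore::CacheShard.
    struct ShardBins {
      boost::circular_buffer<std::shared_ptr<uint64_t>> bins;

      explicit ShardBins(uint32_t count) : bins(count) { shift(); }

      // Called once per bluestore_cache_age_bin_interval: a fresh zero counter
      // becomes the newest (front) bin; the oldest falls off the back once the
      // buffer is full.
      void shift() { bins.push_front(std::make_shared<uint64_t>(0)); }

      // On insert or touch, an entry remembers the current front bin and adds
      // its charge there; on removal it subtracts from whichever bin it holds.
      std::shared_ptr<uint64_t> charge(uint64_t bytes) {
        auto bin = bins.front();
        *bin += bytes;
        return bin;
      }

      // Equivalent of sum_bins(start, end): bytes last touched between 'start'
      // and 'end' intervals ago.
      uint64_t sum(uint32_t start, uint32_t end) const {
        uint64_t total = 0;
        uint32_t last = std::min<uint32_t>(end, static_cast<uint32_t>(bins.size()));
        for (uint32_t i = start; i < last; ++i) {
          total += *bins[i];
        }
        return total;
      }
    };

    int main() {
      ShardBins shard(6);     // keep six bins (six intervals of history)
      shard.charge(4096);     // bytes touched during the current interval
      shard.shift();          // one bin interval passes
      shard.charge(8192);     // newer activity lands in the new front bin
      std::cout << "bins [0,1): " << shard.sum(0, 1) << " bytes\n";  // 8192
      std::cout << "bins [1,2): " << shard.sum(1, 2) << " bytes\n";  // 4096
      return 0;
    }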