// Commit the new cache size
int64_t committed = it->second->commit_cache_size(tuned_mem);
-
// Update the perf counters
int64_t alloc = it->second->get_cache_bytes();
}
}
+ // Rotate the age bins of every registered cache: each cache pushes a new
+ // empty "current" bin to the front so that subsequent insertions count as
+ // the youngest age class.
+ void Manager::shift_bins()
+ {
+ for (auto &l : loggers) {
+ // NOTE(review): assumes every logger key also exists in `caches`; the
+ // find() result is dereferenced unchecked -- confirm the maps stay in sync.
+ auto it = caches.find(l.first);
+ it->second->shift_bins();
+ }
+ }
+
void Manager::balance_priority(int64_t *mem_avail, Priority pri)
{
std::unordered_map<std::string, std::shared_ptr<PriCache>> tmp_caches = caches;
// Get the name of this cache.
virtual std::string get_cache_name() const = 0;
+
+ // Rotate the age bins: a fresh "current" bin is added at the front and all
+ // existing bins age by one slot.
+ virtual void shift_bins() = 0;
+
+ // Import user-supplied end-bin values for priorities PRI1..LAST-1; entries
+ // missing from the vector are treated as 0 (no bins for that priority).
+ virtual void import_bins(const std::vector<uint64_t> &bins) = 0;
+
+ // Set the end bin for a single priority (PRI0 and LAST should be ignored)
+ virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) = 0;
+
+ // Get the end bin for the given priority (PRI0 and LAST report 0)
+ virtual uint64_t get_bins(PriorityCache::Priority pri) const = 0;
};
class Manager {
void clear();
void tune_memory();
void balance();
-
+ void shift_bins();
private:
void balance_priority(int64_t *mem_avail, Priority pri);
};
default: 5
see_also:
- bluestore_cache_autotune
+- name: bluestore_cache_age_bin_interval
+ type: float
+ level: dev
+ desc: The duration (in seconds) represented by a single cache age bin.
+ fmt_desc: |
+ The caches used by bluestore will assign cache entries to an 'age bin'
+ that represents a period of time during which that cache entry was most
+ recently updated. By binning the caches in this way, Ceph's priority
+ cache balancing code can make better decisions about which caches should
+ receive priority based on the relative ages of items in the caches. By
+ default, a single cache age bin represents 1 second of time. Note:
+ Setting this interval too small can result in high CPU usage and lower
+ performance.
+ default: 1
+ see_also:
+ - bluestore_cache_age_bins_kv
+ - bluestore_cache_age_bins_kv_onode
+ - bluestore_cache_age_bins_meta
+ - bluestore_cache_age_bins_data
+- name: bluestore_cache_age_bins_kv
+ type: str
+ level: dev
+ desc: A 10 element, space separated list of age bins for kv cache
+ fmt_desc: |
+ A 10 element, space separated list of cache age bins grouped by
+ priority such that PRI1=[0,n), PRI2=[n,n+1), PRI3=[n+1,n+2) ...
+ PRI10=[n+8,n+9). Values represent the starting and ending bin for each
+ priority level. A 0 in the 2nd term will prevent any items from being
+ associated with that priority. bin duration is based on the
+ bluestore_cache_age_bin_interval value. For example,
+ "1 5 0 0 0 0 0 0 0 0" defines bin ranges for two priority levels. PRI1
+ contains 1 age bin. Assuming the default age bin interval of 1 second,
+ PRI1 represents cache items that are less than 1 second old. PRI2 has 4
+ bins representing cache items that are 1 to less than 5 seconds old. All
+ other cache items in this example are associated with the lowest priority
+ level as PRI3-PRI10 all have 0s in their second term.
+ default: "1 2 6 24 120 720 0 0 0 0"
+ see_also:
+ - bluestore_cache_age_bin_interval
+- name: bluestore_cache_age_bins_kv_onode
+ type: str
+ level: dev
+ desc: A 10 element, space separated list of age bins for kv onode cache
+ fmt_desc: |
+ A 10 element, space separated list of cache age bins grouped by
+ priority such that PRI1=[0,n), PRI2=[n,n+1), PRI3=[n+1,n+2) ...
+ PRI10=[n+8,n+9). Values represent the starting and ending bin for each
+ priority level. A 0 in the 2nd term will prevent any items from being
+ associated with that priority. bin duration is based on the
+ bluestore_cache_age_bin_interval value. For example,
+ "1 5 0 0 0 0 0 0 0 0" defines bin ranges for two priority levels. PRI1
+ contains 1 age bin. Assuming the default age bin interval of 1 second,
+ PRI1 represents cache items that are less than 1 second old. PRI2 has 4
+ bins representing cache items that are 1 to less than 5 seconds old. All
+ other cache items in this example are associated with the lowest priority
+ level as PRI3-PRI10 all have 0s in their second term.
+ default: "0 0 0 0 0 0 0 0 0 720"
+ see_also:
+ - bluestore_cache_age_bin_interval
+- name: bluestore_cache_age_bins_meta
+ type: str
+ level: dev
+ desc: A 10 element, space separated list of age bins for onode cache
+ fmt_desc: |
+ A 10 element, space separated list of cache age bins grouped by
+ priority such that PRI1=[0,n), PRI2=[n,n+1), PRI3=[n+1,n+2) ...
+ PRI10=[n+8,n+9). Values represent the starting and ending bin for each
+ priority level. A 0 in the 2nd term will prevent any items from being
+ associated with that priority. bin duration is based on the
+ bluestore_cache_age_bin_interval value. For example,
+ "1 5 0 0 0 0 0 0 0 0" defines bin ranges for two priority levels. PRI1
+ contains 1 age bin. Assuming the default age bin interval of 1 second,
+ PRI1 represents cache items that are less than 1 second old. PRI2 has 4
+ bins representing cache items that are 1 to less than 5 seconds old. All
+ other cache items in this example are associated with the lowest priority
+ level as PRI3-PRI10 all have 0s in their second term.
+ default: "1 2 6 24 120 720 0 0 0 0"
+ see_also:
+ - bluestore_cache_age_bin_interval
+- name: bluestore_cache_age_bins_data
+ type: str
+ level: dev
+ desc: A 10 element, space separated list of age bins for data cache
+ fmt_desc: |
+ A 10 element, space separated list of cache age bins grouped by
+ priority such that PRI1=[0,n), PRI2=[n,n+1), PRI3=[n+1,n+2) ...
+ PRI10=[n+8,n+9). Values represent the starting and ending bin for each
+ priority level. A 0 in the 2nd term will prevent any items from being
+ associated with that priority. bin duration is based on the
+ bluestore_cache_age_bin_interval value. For example,
+ "1 5 0 0 0 0 0 0 0 0" defines bin ranges for two priority levels. PRI1
+ contains 1 age bin. Assuming the default age bin interval of 1 second,
+ PRI1 represents cache items that are less than 1 second old. PRI2 has 4
+ bins representing cache items that are 1 to less than 5 seconds old. All
+ other cache items in this example are associated with the lowest priority
+ level as PRI3-PRI10 all have 0s in their second term.
+ default: "1 2 6 24 120 720 0 0 0 0"
+ see_also:
+ - bluestore_cache_age_bin_interval
- name: bluestore_alloc_stats_dump_interval
type: float
level: dev
high_pri_pool_ratio_(high_pri_pool_ratio),
high_pri_pool_capacity_(0),
usage_(0),
- lru_usage_(0) {
+ lru_usage_(0),
+ age_bins(1) {
+ shift_bins();
// Make empty circular linked list
lru_.next = &lru_;
lru_.prev = &lru_;
if (e->InHighPriPool()) {
ceph_assert(high_pri_pool_usage_ >= e->charge);
high_pri_pool_usage_ -= e->charge;
+ } else {
+ ceph_assert(*(e->age_bin) >= e->charge);
+ *(e->age_bin) -= e->charge;
}
}
void BinnedLRUCacheShard::LRU_Insert(BinnedLRUHandle* e) {
ceph_assert(e->next == nullptr);
ceph_assert(e->prev == nullptr);
+ e->age_bin = age_bins.front();
+
if (high_pri_pool_ratio_ > 0 && e->IsHighPri()) {
// Inset "e" to head of LRU list.
e->next = &lru_;
e->next->prev = e;
e->SetInHighPriPool(false);
lru_low_pri_ = e;
+ *(e->age_bin) += e->charge;
}
lru_usage_ += e->charge;
}
+// Return the total bytes charged to age bins [start, end).
+// The range is clamped to the bins currently allocated, so callers may pass
+// an end index beyond the populated bins without harm.
+uint64_t BinnedLRUCacheShard::sum_bins(uint32_t start, uint32_t end) const {
+ std::lock_guard<std::mutex> l(mutex_);
+ auto size = age_bins.size();
+ if (size < start) {
+ return 0;
+ }
+ uint64_t bytes = 0;
+ // Clamp the end of the range to the bins that actually exist.
+ end = (size < end) ? size : end;
+ for (auto i = start; i < end; i++) {
+ bytes += *(age_bins[i]);
+ }
+ return bytes;
+}
+
void BinnedLRUCacheShard::MaintainPoolSize() {
while (high_pri_pool_usage_ > high_pri_pool_capacity_) {
// Overflow last entry in high-pri pool to low-pri pool.
ceph_assert(lru_low_pri_ != &lru_);
lru_low_pri_->SetInHighPriPool(false);
high_pri_pool_usage_ -= lru_low_pri_->charge;
+ *(lru_low_pri_->age_bin) += lru_low_pri_->charge;
}
}
return usage_ - lru_usage_;
}
+// Age the shard's contents: push a fresh zeroed byte counter as the newest
+// bin.  When the circular buffer is at capacity the oldest bin drops off the
+// back; handles still pointing at it keep the counter alive via shared_ptr.
+void BinnedLRUCacheShard::shift_bins() {
+ std::lock_guard<std::mutex> l(mutex_);
+ age_bins.push_front(std::make_shared<uint64_t>(0));
+}
+
+// Number of age bins the shard is configured to keep (ring capacity, not the
+// number of bins currently populated).
+uint32_t BinnedLRUCacheShard::get_bin_count() const {
+ std::lock_guard<std::mutex> l(mutex_);
+ return age_bins.capacity();
+}
+
+// Resize the age-bin ring; shrinking discards the oldest bins from the back.
+void BinnedLRUCacheShard::set_bin_count(uint32_t count) {
+ std::lock_guard<std::mutex> l(mutex_);
+ age_bins.set_capacity(count);
+}
+
std::string BinnedLRUCacheShard::GetPrintableOptions() const {
const int kBufferSize = 200;
char buffer[kBufferSize];
int64_t assigned = get_cache_bytes(pri);
int64_t request = 0;
- switch (pri) {
+ switch(pri) {
// PRI0 is for rocksdb's high priority items (indexes/filters)
case PriorityCache::Priority::PRI0:
{
- request = GetHighPriPoolUsage();
+ // Because we want the high pri cache to grow independently of the low
+ // pri cache, request a chunky allocation independent of the other
+ // priorities.
+ request = PriorityCache::get_chunk(GetHighPriPoolUsage(), total_cache);
break;
}
- // All other cache items are currently shoved into the PRI1 priority.
- case PriorityCache::Priority::PRI1:
+ case PriorityCache::Priority::LAST:
{
+ auto max = get_bin_count();
request = GetUsage();
request -= GetHighPriPoolUsage();
+ request -= sum_bins(0, max);
break;
}
default:
- break;
+ {
+ ceph_assert(pri > 0 && pri < PriorityCache::Priority::LAST);
+ auto prev_pri = static_cast<PriorityCache::Priority>(pri - 1);
+ uint64_t start = get_bins(prev_pri);
+ uint64_t end = get_bins(pri);
+ request = sum_bins(start, end);
+ break;
+ }
}
request = (request > assigned) ? request - assigned : 0;
ldout(cct, 10) << __func__ << " Priority: " << static_cast<uint32_t>(pri)
double ratio = 0;
if (new_bytes > 0) {
int64_t pri0_bytes = get_cache_bytes(PriorityCache::Priority::PRI0);
- // Add 10% of the "reserved" bytes so the ratio can't get stuck at 0
- pri0_bytes += (new_bytes - get_cache_bytes()) / 10;
ratio = (double) pri0_bytes / new_bytes;
}
- ldout(cct, 10) << __func__ << " High Pri Pool Ratio set to " << ratio << dendl;
+ ldout(cct, 5) << __func__ << " High Pri Pool Ratio set to " << ratio << dendl;
SetHighPriPoolRatio(ratio);
return new_bytes;
}
+// Rotate the age bins on every shard.
+void BinnedLRUCache::shift_bins() {
+ for (int s = 0; s < num_shards_; s++) {
+ shards_[s].shift_bins();
+ }
+}
+
+// Aggregate the byte counts of age bins [start, end) across all shards.
+uint64_t BinnedLRUCache::sum_bins(uint32_t start, uint32_t end) const {
+ uint64_t bytes = 0;
+ for (int s = 0; s < num_shards_; s++) {
+ bytes += shards_[s].sum_bins(start, end);
+ }
+ return bytes;
+}
+
+// All shards are kept at the same bin count (see set_bin_count), so reading
+// shard 0 is sufficient.  Returns 0 when there are no shards.
+uint32_t BinnedLRUCache::get_bin_count() const {
+ uint32_t result = 0;
+ if (num_shards_ > 0) {
+ result = shards_[0].get_bin_count();
+ }
+ return result;
+}
+
+// Apply the same bin count to every shard.
+void BinnedLRUCache::set_bin_count(uint32_t count) {
+ for (int s = 0; s < num_shards_; s++) {
+ shards_[s].set_bin_count(count);
+ }
+}
+
std::shared_ptr<rocksdb::Cache> NewBinnedLRUCache(
CephContext *c,
size_t capacity,
#include <string>
#include <mutex>
+#include <boost/circular_buffer.hpp>
#include "ShardedCache.h"
#include "common/autovector.h"
double high_pri_pool_ratio = 0.0);
struct BinnedLRUHandle {
+ std::shared_ptr<uint64_t> age_bin;
void* value;
DeleterFn deleter;
BinnedLRUHandle* next_hash;
// Retrieves high pri pool usage
size_t GetHighPriPoolUsage() const;
+ // Rotate the bins
+ void shift_bins();
+
+ // Get the bin count
+ uint32_t get_bin_count() const;
+
+ // Set the bin count
+ void set_bin_count(uint32_t count);
+
+ // Get the byte counts for a range of age bins
+ uint64_t sum_bins(uint32_t start, uint32_t end) const;
+
private:
CephContext *cct;
void LRU_Remove(BinnedLRUHandle* e);
// We don't count mutex_ as the cache's internal state so semantically we
// don't mind mutex_ invoking the non-const actions.
mutable std::mutex mutex_;
+
+ // Circular buffer of byte counters for age binning
+ boost::circular_buffer<std::shared_ptr<uint64_t>> age_bins;
};
class BinnedLRUCache : public ShardedCache {
virtual int64_t get_committed_size() const {
return GetCapacity();
}
+ virtual void shift_bins();
+ uint64_t sum_bins(uint32_t start, uint32_t end) const;
+ uint32_t get_bin_count() const;
+ void set_bin_count(uint32_t count);
+
virtual std::string get_cache_name() const {
return "RocksDB Binned LRU Cache";
}
int GetNumShardBits() const { return num_shard_bits_; }
+ virtual uint32_t get_bin_count() const = 0;
+ virtual void set_bin_count(uint32_t count) = 0;
+
// PriCache
virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
return cache_bytes[pri];
}
virtual void set_cache_ratio(double ratio) {
cache_ratio = ratio;
+ }
+ // Return the end bin for the given priority; PRI0 and LAST have no
+ // user-configurable bins and always report 0.
+ virtual uint64_t get_bins(PriorityCache::Priority pri) const {
+ if (pri > PriorityCache::Priority::PRI0 &&
+ pri < PriorityCache::Priority::LAST) {
+ return bins[pri];
+ }
+ return 0;
+ }
+ // Record the end bin for one priority (PRI0 and LAST are ignored), then
+ // resize the bin storage to cover the largest end bin of any priority.
+ virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) {
+ if (pri <= PriorityCache::Priority::PRI0 ||
+ pri >= PriorityCache::Priority::LAST) {
+ return;
+ }
+ bins[pri] = end_bin;
+ uint64_t max = 0;
+ // Distinct loop variable: the original `int pri` shadowed the parameter.
+ for (int p = 1; p < PriorityCache::Priority::LAST; p++) {
+ if (bins[p] > max) {
+ max = bins[p];
+ }
+ }
+ set_bin_count(max);
+ }
+ // Bulk-load end bins for PRI1..LAST-1 from a user supplied vector; missing
+ // trailing entries are zeroed.  Bin storage is resized to the largest value.
+ virtual void import_bins(const std::vector<uint64_t> &bins_v) {
+ uint64_t max = 0;
+ for (int pri = 1; pri < PriorityCache::Priority::LAST; pri++) {
+ unsigned i = (unsigned) pri - 1;
+ if (i < bins_v.size()) {
+ bins[pri] = bins_v[i];
+ if (bins[pri] > max) {
+ max = bins[pri];
+ }
+ } else {
+ bins[pri] = 0;
+ }
+ }
+ set_bin_count(max);
}
virtual std::string get_cache_name() const = 0;
return (num_shard_bits_ > 0) ? (hash >> (32 - num_shard_bits_)) : 0;
}
+ uint64_t bins[PriorityCache::Priority::LAST+1] = {0};
int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
double cache_ratio = 0;
virtual void set_cache_ratio(double ratio) {
cache_ratio = ratio;
}
+ // Default no-op age-binning hooks: caches that do not support age binning
+ // ignore bin updates and always report zero bins.
+ virtual void shift_bins() {
+ }
+ virtual void import_bins(const std::vector<uint64_t> &bins) {
+ }
+ virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) {
+ }
+ virtual uint64_t get_bins(PriorityCache::Priority pri) const {
+ return 0;
+ }
+
virtual string get_cache_name() const = 0;
};
{
if (o->put_cache()) {
(level > 0) ? lru.push_front(*o) : lru.push_back(*o);
+ o->cache_age_bin = age_bins.front();
+ *(o->cache_age_bin) += 1;
} else {
++num_pinned;
}
void _rm(BlueStore::Onode* o) override
{
if (o->pop_cache()) {
+ *(o->cache_age_bin) -= 1;
lru.erase(lru.iterator_to(*o));
} else {
ceph_assert(num_pinned);
}
void _pin(BlueStore::Onode* o) override
{
+ *(o->cache_age_bin) -= 1;
lru.erase(lru.iterator_to(*o));
++num_pinned;
dout(20) << __func__ << " " << this << " " << " " << " " << o->oid << " pinned" << dendl;
void _unpin(BlueStore::Onode* o) override
{
lru.push_front(*o);
+ o->cache_age_bin = age_bins.front();
+ *(o->cache_age_bin) += 1;
ceph_assert(num_pinned);
--num_pinned;
dout(20) << __func__ << " " << this << " " << " " << " " << o->oid << " unpinned" << dendl;
ceph_assert(n == 0);
lru.erase(p);
}
+ *(o->cache_age_bin) -= 1;
auto pinned = !o->pop_cache();
ceph_assert(!pinned);
o->c->onode_map._remove(o->oid);
lru.push_back(*b);
}
buffer_bytes += b->length;
+ b->cache_age_bin = age_bins.front();
+ *(b->cache_age_bin) += b->length;
num = lru.size();
}
void _rm(BlueStore::Buffer *b) override {
ceph_assert(buffer_bytes >= b->length);
buffer_bytes -= b->length;
+ assert(*(b->cache_age_bin) >= b->length);
+ *(b->cache_age_bin) -= b->length;
auto q = lru.iterator_to(*b);
lru.erase(q);
num = lru.size();
void _adjust_size(BlueStore::Buffer *b, int64_t delta) override {
ceph_assert((int64_t)buffer_bytes + delta >= 0);
buffer_bytes += delta;
+ // Keep the buffer's age-bin accounting in sync with its size change.
+ // Use ceph_assert for consistency with the surrounding code: bare
+ // assert() compiles out under NDEBUG.
+ ceph_assert(*(b->cache_age_bin) + delta >= 0);
+ *(b->cache_age_bin) += delta;
}
void _touch(BlueStore::Buffer *b) override {
auto p = lru.iterator_to(*b);
lru.erase(p);
lru.push_front(*b);
+ *(b->cache_age_bin) -= b->length;
+ b->cache_age_bin = age_bins.front();
+ *(b->cache_age_bin) += b->length;
num = lru.size();
_audit("_touch_buffer end");
}
BlueStore::Buffer *b = &*i;
ceph_assert(b->is_clean());
dout(20) << __func__ << " rm " << *b << dendl;
+ assert(*(b->cache_age_bin) >= b->length);
+ *(b->cache_age_bin) -= b->length;
b->space->_rm_buffer(this, b);
}
num = lru.size();
ceph_abort_msg("bad cache_private");
}
}
+ b->cache_age_bin = age_bins.front();
if (!b->is_empty()) {
buffer_bytes += b->length;
list_bytes[b->cache_private] += b->length;
+ *(b->cache_age_bin) += b->length;
}
num = hot.size() + warm_in.size();
}
buffer_bytes -= b->length;
ceph_assert(list_bytes[b->cache_private] >= b->length);
list_bytes[b->cache_private] -= b->length;
+ assert(*(b->cache_age_bin) >= b->length);
+ *(b->cache_age_bin) -= b->length;
}
switch (b->cache_private) {
case BUFFER_WARM_IN:
if (!b->is_empty()) {
buffer_bytes += b->length;
list_bytes[b->cache_private] += b->length;
+ *(b->cache_age_bin) += b->length;
}
num = hot.size() + warm_in.size();
}
buffer_bytes += delta;
ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0);
list_bytes[b->cache_private] += delta;
+ assert(*(b->cache_age_bin) + delta >= 0);
+ *(b->cache_age_bin) += delta;
}
}
hot.push_front(*b);
break;
}
+ *(b->cache_age_bin) -= b->length;
+ b->cache_age_bin = age_bins.front();
+ *(b->cache_age_bin) += b->length;
num = hot.size() + warm_in.size();
_audit("_touch_buffer end");
}
buffer_bytes -= b->length;
ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length);
list_bytes[BUFFER_WARM_IN] -= b->length;
- to_evict_bytes -= b->length;
+ assert(*(b->cache_age_bin) >= b->length);
+ *(b->cache_age_bin) -= b->length;
+ to_evict_bytes -= b->length;
evicted += b->length;
b->state = BlueStore::Buffer::STATE_EMPTY;
b->data.clear();
utime_t next_balance = ceph_clock_now();
utime_t next_resize = ceph_clock_now();
+ utime_t next_bin_rotation = ceph_clock_now();
utime_t next_deferred_force_submit = ceph_clock_now();
utime_t alloc_stats_dump_clock = ceph_clock_now();
prev_config_change = cur_config_change;
}
- // Before we trim, check and see if it's time to rebalance/resize.
+ // define various intervals for background work
+ double age_bin_interval = store->cache_age_bin_interval;
double autotune_interval = store->cache_autotune_interval;
double resize_interval = store->osd_memory_cache_resize_interval;
double max_defer_interval = store->max_defer_interval;
-
double alloc_stats_dump_interval =
store->cct->_conf->bluestore_alloc_stats_dump_interval;
+ // alloc stats dump
if (alloc_stats_dump_interval > 0 &&
alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) {
store->_record_allocation_stats();
alloc_stats_dump_clock = ceph_clock_now();
}
+ // cache age binning
+ if (age_bin_interval > 0 && next_bin_rotation < ceph_clock_now()) {
+ if (binned_kv_cache != nullptr) {
+ binned_kv_cache->import_bins(store->kv_bins);
+ }
+ if (binned_kv_onode_cache != nullptr) {
+ binned_kv_onode_cache->import_bins(store->kv_onode_bins);
+ }
+ meta_cache->import_bins(store->meta_bins);
+ data_cache->import_bins(store->data_bins);
+
+ if (pcm != nullptr) {
+ pcm->shift_bins();
+ }
+ next_bin_rotation = ceph_clock_now();
+ next_bin_rotation += age_bin_interval;
+ }
+ // cache balancing
if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
- _adjust_cache_settings();
+ if (binned_kv_cache != nullptr) {
+ binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
+ }
+ if (binned_kv_onode_cache != nullptr) {
+ binned_kv_onode_cache->set_cache_ratio(store->cache_kv_onode_ratio);
+ }
+ meta_cache->set_cache_ratio(store->cache_meta_ratio);
+ data_cache->set_cache_ratio(store->cache_data_ratio);
// Log events at 5 instead of 20 when balance happens.
interval_stats_trim = true;
next_balance = ceph_clock_now();
next_balance += autotune_interval;
}
+ // memory resizing (ie autotuning)
if (resize_interval > 0 && next_resize < ceph_clock_now()) {
if (ceph_using_tcmalloc() && pcm != nullptr) {
pcm->tune_memory();
next_resize = ceph_clock_now();
next_resize += resize_interval;
}
-
+ // deferred force submit
if (max_defer_interval > 0 &&
next_deferred_force_submit < ceph_clock_now()) {
if (store->get_deferred_last_submitted() + max_defer_interval <
return NULL;
}
-void BlueStore::MempoolThread::_adjust_cache_settings()
-{
- if (binned_kv_cache != nullptr) {
- binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
- }
- if (binned_kv_onode_cache != nullptr) {
- binned_kv_onode_cache->set_cache_ratio(store->cache_kv_onode_ratio);
- }
- meta_cache->set_cache_ratio(store->cache_meta_ratio);
- data_cache->set_cache_ratio(store->cache_data_ratio);
-}
-
void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
{
size_t onode_shards = store->onode_cache_shards.size();
uint64_t cache_size = store->cache_size;
int64_t kv_alloc =
- static_cast<int64_t>(store->cache_kv_ratio * cache_size);
+ static_cast<int64_t>(store->cache_kv_ratio * cache_size);
int64_t kv_onode_alloc =
static_cast<int64_t>(store->cache_kv_onode_ratio * cache_size);
int64_t meta_alloc =
"osd_memory_expected_fragmentation",
"bluestore_cache_autotune",
"bluestore_cache_autotune_interval",
+ "bluestore_cache_age_bin_interval",
+ // These tracked keys must exactly match the declared option names (the
+ // ones read via get_val: bluestore_cache_age_bins_*); otherwise runtime
+ // changes to those options would never be observed.
+ "bluestore_cache_age_bins_kv",
+ "bluestore_cache_age_bins_kv_onode",
+ "bluestore_cache_age_bins_meta",
+ "bluestore_cache_age_bins_data",
"bluestore_warn_on_legacy_statfs",
"bluestore_warn_on_no_per_pool_omap",
"bluestore_warn_on_no_per_pg_omap",
cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
cache_autotune_interval =
cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
+ cache_age_bin_interval =
+ cct->_conf.get_val<double>("bluestore_cache_age_bin_interval");
+ // Parse a space separated list of bin values from the named option into
+ // *intervals (e.g. "1 2 6 24 120 720 0 0 0 0").
+ auto _set_bin = [&](std::string conf_name, std::vector<uint64_t>* intervals)
+ {
+ // Clear first so that re-running this config-apply path is idempotent;
+ // appending into the member vector each time would accumulate stale
+ // leading entries.
+ intervals->clear();
+ std::string intervals_str = cct->_conf.get_val<std::string>(conf_name);
+ std::istringstream interval_stream(intervals_str);
+ std::copy(
+ std::istream_iterator<uint64_t>(interval_stream),
+ std::istream_iterator<uint64_t>(),
+ std::back_inserter(*intervals));
+ };
+ _set_bin("bluestore_cache_age_bins_kv", &kv_bins);
+ _set_bin("bluestore_cache_age_bins_kv_onode", &kv_onode_bins);
+ _set_bin("bluestore_cache_age_bins_meta", &meta_bins);
+ _set_bin("bluestore_cache_age_bins_data", &data_bins);
+
osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
osd_memory_expected_fragmentation =
uint64_t seq;
uint32_t offset, length;
ceph::buffer::list data;
+ std::shared_ptr<int64_t> cache_age_bin; ///< cache age bin
boost::intrusive::list_member_hook<> lru_item;
boost::intrusive::list_member_hook<> state_item;
/// protect flush_txns
ceph::mutex flush_lock = ceph::make_mutex("BlueStore::Onode::flush_lock");
ceph::condition_variable flush_cond; ///< wait here for uncommitted txns
+ std::shared_ptr<int64_t> cache_age_bin; ///< cache age bin
Onode(Collection *c, const ghobject_t& o,
const mempool::bluestore_cache_meta::string& k)
std::atomic<uint64_t> max = {0};
std::atomic<uint64_t> num = {0};
+ boost::circular_buffer<std::shared_ptr<int64_t>> age_bins;
- CacheShard(CephContext* cct) : cct(cct), logger(nullptr) {}
+ CacheShard(CephContext* cct) : cct(cct), logger(nullptr), age_bins(1) {
+ shift_bins();
+ }
virtual ~CacheShard() {}
void set_max(uint64_t max_) {
void flush() {
std::lock_guard l(lock);
// we should not be shutting down after the blackhole is enabled
- assert(!cct->_conf->objectstore_blackhole);
+ ceph_assert(!cct->_conf->objectstore_blackhole);
_trim_to(0);
}
+ // Age the shard's contents: push a fresh zeroed counter as the newest bin
+ // (front of the ring); the oldest bin falls off the back when full.
+ virtual void shift_bins() {
+ std::lock_guard l(lock);
+ age_bins.push_front(std::make_shared<int64_t>(0));
+ }
+ // Configured number of age bins (ring capacity, not populated count).
+ virtual uint32_t get_bin_count() {
+ std::lock_guard l(lock);
+ return age_bins.capacity();
+ }
+ // Resize the age-bin ring; shrinking discards the oldest bins.
+ virtual void set_bin_count(uint32_t count) {
+ std::lock_guard l(lock);
+ age_bins.set_capacity(count);
+ }
+ // Sum the per-bin tallies over [start, end), clamped to the bins that
+ // currently exist.  Units depend on the shard type: onode shards count
+ // entries (+= 1 on insert) while buffer shards count bytes (+= length).
+ virtual uint64_t sum_bins(uint32_t start, uint32_t end) {
+ std::lock_guard l(lock);
+ auto size = age_bins.size();
+ if (size < start) {
+ return 0;
+ }
+ uint64_t count = 0;
+ end = (size < end) ? size : end;
+ for (auto i = start; i < end; i++) {
+ count += *(age_bins[i]);
+ }
+ return count;
+ }
+
#ifdef DEBUG_CACHE
virtual void _audit(const char *s) = 0;
#else
/// A Generic onode Cache Shard
struct OnodeCacheShard : public CacheShard {
std::atomic<uint64_t> num_pinned = {0};
-
std::array<std::pair<ghobject_t, ceph::mono_clock::time_point>, 64> dumped_onodes;
virtual void _pin(Onode* o) = 0;
void flush_all_but_last() {
std::unique_lock l(qlock);
- assert (q.size() >= 1);
+ ceph_assert (q.size() >= 1);
while (true) {
// std::set flag before the check because the condition
// may become true outside qlock, and we need to make
double cache_kv_onode_ratio = 0; ///< cache ratio dedicated to kv onodes (e.g., rocksdb onode CF)
double cache_data_ratio = 0; ///< cache ratio dedicated to object data
bool cache_autotune = false; ///< cache autotune setting
+ double cache_age_bin_interval = 0; ///< time to wait between cache age bin rotations
double cache_autotune_interval = 0; ///< time to wait between cache rebalancing
+ std::vector<uint64_t> kv_bins; ///< kv autotune bins
+ std::vector<uint64_t> kv_onode_bins; ///< kv onode autotune bins
+ std::vector<uint64_t> meta_bins; ///< meta autotune bins
+ std::vector<uint64_t> data_bins; ///< data autotune bins
uint64_t osd_memory_target = 0; ///< OSD memory target when autotuning cache
uint64_t osd_memory_base = 0; ///< OSD base memory when autotuning cache
double osd_memory_expected_fragmentation = 0; ///< expected memory fragmentation
struct MempoolCache : public PriorityCache::PriCache {
BlueStore *store;
+ uint64_t bins[PriorityCache::Priority::LAST+1] = {0};
int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
int64_t committed_bytes = 0;
double cache_ratio = 0;
MempoolCache(BlueStore *s) : store(s) {};
virtual uint64_t _get_used_bytes() const = 0;
+ virtual uint64_t _sum_bins(uint32_t start, uint32_t end) const = 0;
virtual int64_t request_cache_bytes(
PriorityCache::Priority pri, uint64_t total_cache) const {
int64_t assigned = get_cache_bytes(pri);
switch (pri) {
- // All cache items are currently shoved into the PRI1 priority
- case PriorityCache::Priority::PRI1:
+ case PriorityCache::Priority::PRI0:
+ {
+ // BlueStore caches currently don't put anything in PRI0
+ break;
+ }
+ case PriorityCache::Priority::LAST:
{
- int64_t request = _get_used_bytes();
+ uint32_t max = get_bin_count();
+ int64_t request = _get_used_bytes() - _sum_bins(0, max);
return(request > assigned) ? request - assigned : 0;
}
default:
- break;
- }
+ {
+ ceph_assert(pri > 0 && pri < PriorityCache::Priority::LAST);
+ auto prev_pri = static_cast<PriorityCache::Priority>(pri - 1);
+ uint64_t start = get_bins(prev_pri);
+ uint64_t end = get_bins(pri);
+ int64_t request = _sum_bins(start, end);
+ return(request > assigned) ? request - assigned : 0;
+ }
+ }
return -EOPNOTSUPP;
}
virtual int64_t get_committed_size() const {
return committed_bytes;
}
+ // Return the end bin for the given priority; PRI0 and LAST have no
+ // user-configurable bins and always report 0.
+ virtual uint64_t get_bins(PriorityCache::Priority pri) const {
+ if (pri > PriorityCache::Priority::PRI0 &&
+ pri < PriorityCache::Priority::LAST) {
+ return bins[pri];
+ }
+ return 0;
+ }
+ // Record the end bin for one priority (PRI0 and LAST are ignored), then
+ // resize the bin storage to cover the largest end bin of any priority.
+ virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) {
+ if (pri <= PriorityCache::Priority::PRI0 ||
+ pri >= PriorityCache::Priority::LAST) {
+ return;
+ }
+ bins[pri] = end_bin;
+ uint64_t max = 0;
+ // Distinct loop variable: the original `int pri` shadowed the parameter.
+ for (int p = 1; p < PriorityCache::Priority::LAST; p++) {
+ if (bins[p] > max) {
+ max = bins[p];
+ }
+ }
+ set_bin_count(max);
+ }
+ // Bulk-load end bins for PRI1..LAST-1 from a user supplied vector; missing
+ // trailing entries are zeroed.  Bin storage is resized to the largest value.
+ virtual void import_bins(const std::vector<uint64_t> &bins_v) {
+ uint64_t max = 0;
+ for (int pri = 1; pri < PriorityCache::Priority::LAST; pri++) {
+ unsigned i = (unsigned) pri - 1;
+ if (i < bins_v.size()) {
+ bins[pri] = bins_v[i];
+ if (bins[pri] > max) {
+ max = bins[pri];
+ }
+ } else {
+ bins[pri] = 0;
+ }
+ }
+ set_bin_count(max);
+ }
virtual double get_cache_ratio() const {
return cache_ratio;
}
cache_ratio = ratio;
}
virtual std::string get_cache_name() const = 0;
+ virtual uint32_t get_bin_count() const = 0;
+ virtual void set_bin_count(uint32_t count) = 0;
};
struct MetaCache : public MempoolCache {
MetaCache(BlueStore *s) : MempoolCache(s) {};
+ // Bin state lives in the onode cache shards; every shard is kept at the
+ // same count, so shard 0 is authoritative for reads.
+ virtual uint32_t get_bin_count() const {
+ return store->onode_cache_shards[0]->get_bin_count();
+ }
+ virtual void set_bin_count(uint32_t count) {
+ for (auto i : store->onode_cache_shards) {
+ i->set_bin_count(count);
+ }
+ }
virtual uint64_t _get_used_bytes() const {
return mempool::bluestore_Buffer::allocated_bytes() +
mempool::bluestore_Blob::allocated_bytes() +
mempool::bluestore_SharedBlob::allocated_bytes() +
mempool::bluestore_inline_bl::allocated_bytes();
}
-
+ // Rotate the age bins on every onode cache shard.
+ virtual void shift_bins() {
+ for (auto i : store->onode_cache_shards) {
+ i->shift_bins();
+ }
+ }
+ // Onode shards tally entry counts per bin; convert to an approximate byte
+ // figure using the current average bytes-per-onode.
+ virtual uint64_t _sum_bins(uint32_t start, uint32_t end) const {
+ uint64_t onodes = 0;
+ for (auto i : store->onode_cache_shards) {
+ onodes += i->sum_bins(start, end);
+ }
+ return onodes*get_bytes_per_onode();
+ }
virtual std::string get_cache_name() const {
return "BlueStore Meta Cache";
}
-
uint64_t _get_num_onodes() const {
uint64_t onode_num =
mempool::bluestore_cache_onode::allocated_items();
return (2 > onode_num) ? 2 : onode_num;
}
-
double get_bytes_per_onode() const {
return (double)_get_used_bytes() / (double)_get_num_onodes();
}
struct DataCache : public MempoolCache {
DataCache(BlueStore *s) : MempoolCache(s) {};
+ // Bin state lives in the buffer cache shards; every shard is kept at the
+ // same count, so shard 0 is authoritative for reads.
+ virtual uint32_t get_bin_count() const {
+ return store->buffer_cache_shards[0]->get_bin_count();
+ }
+ virtual void set_bin_count(uint32_t count) {
+ for (auto i : store->buffer_cache_shards) {
+ i->set_bin_count(count);
+ }
+ }
virtual uint64_t _get_used_bytes() const {
uint64_t bytes = 0;
for (auto i : store->buffer_cache_shards) {
}
return bytes;
}
+ // Rotate the age bins on every buffer cache shard.
+ virtual void shift_bins() {
+ for (auto i : store->buffer_cache_shards) {
+ i->shift_bins();
+ }
+ }
+ // Buffer shards tally bytes per bin directly, so no conversion is needed.
+ virtual uint64_t _sum_bins(uint32_t start, uint32_t end) const {
+ uint64_t bytes = 0;
+ for (auto i : store->buffer_cache_shards) {
+ bytes += i->sum_bins(start, end);
+ }
+ return bytes;
+ }
virtual std::string get_cache_name() const {
return "BlueStore Data Cache";
}
}
private:
- void _adjust_cache_settings();
void _update_cache_settings();
void _resize_shards(bool interval_stats);
} mempool_thread;
m_cache_ratio = ratio;
}
+ // This cache does not support age binning: all bin hooks are no-ops and
+ // get_bins() always reports 0, so the balancer treats it as un-binned.
+ void shift_bins() override {
+ }
+
+ void import_bins(const std::vector<uint64_t> &intervals) override {
+ }
+
+ void set_bins(PriorityCache::Priority pri, uint64_t end_interval) override {
+ }
+
+ uint64_t get_bins(PriorityCache::Priority pri) const override {
+ return 0;
+ }
+
std::string get_cache_name() const override {
return m_name;
}