common/dns_resolve.cc
common/hostname.cc
common/util.cc
+ common/PriorityCache.cc
librbd/Features.cc
arch/probe.cc
${auth_files}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "PriorityCache.h"
+
+namespace PriorityCache {
+ int64_t get_chunk(uint64_t usage, uint64_t chunk_bytes) {
+    // Add a chunk of headroom and round up to the nearest chunk.
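+    // Illustrative example (hypothetical numbers): usage = 40MiB with 32MiB
+    // chunks gives val = 72MiB, which rounds up to 96MiB (three full chunks).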
+ uint64_t val = usage + chunk_bytes;
+ uint64_t r = (val) % chunk_bytes;
+ if (r > 0)
+ val = val + chunk_bytes - r;
+ return val;
+ }
+
+ PriCache::~PriCache() {
+ }
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_PRIORITY_CACHE_H
+#define CEPH_PRIORITY_CACHE_H
+
+#include <stdint.h>
+#include <string>
+
+namespace PriorityCache {
+ enum Priority {
+ PRI0, // Reserved for special items
+ PRI1, // High priority cache items
+ PRI2, // Medium priority cache items
+ PRI3, // Low priority cache items
+ LAST = PRI3,
+ };
+
+ int64_t get_chunk(uint64_t usage, uint64_t chunk_bytes);
+
+ struct PriCache {
+ virtual ~PriCache();
+
+ /* Ask the cache to request memory for the given priority rounded up to
+     * the nearest chunk_bytes. This, for example, may return the size of all
+ * items associated with this priority plus some additional space for
+ * future growth. Note that the cache may ultimately be allocated less
+ * memory than it requests here.
+ */
+ virtual int64_t request_cache_bytes(PriorityCache::Priority pri, uint64_t chunk_bytes) const = 0;
+
+ // Get the number of bytes currently allocated to the given priority.
+ virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const = 0;
+
+ // Get the number of bytes currently allocated to all priorities.
+ virtual int64_t get_cache_bytes() const = 0;
+
+    // Set the number of bytes allocated to a given priority.
+ virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) = 0;
+
+    // Add additional bytes to the allocation for a given priority.
+ virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) = 0;
+
+ // Commit the current number of bytes allocated to the cache.
+ virtual int64_t commit_cache_size() = 0;
+
+ // Get the ratio of available memory this cache should target.
+ virtual double get_cache_ratio() const = 0;
+
+ // Set the ratio of available memory this cache should target.
+ virtual void set_cache_ratio(double ratio) = 0;
+
+ // Get the name of this cache.
+ virtual std::string get_cache_name() const = 0;
+ };
+}
+
+#endif
OPTION(bluestore_cache_size_ssd, OPT_U64)
OPTION(bluestore_cache_meta_ratio, OPT_DOUBLE)
OPTION(bluestore_cache_kv_ratio, OPT_DOUBLE)
-OPTION(bluestore_cache_kv_min, OPT_INT)
OPTION(bluestore_kvbackend, OPT_STR)
OPTION(bluestore_allocator, OPT_STR) // stupid | bitmap
OPTION(bluestore_freelist_blocks_per_key, OPT_INT)
.add_see_also("bluestore_cache_size")
.set_description("Ratio of bluestore cache to devote to kv database (rocksdb)"),
- Option("bluestore_cache_kv_min", Option::TYPE_INT, Option::LEVEL_ADVANCED)
- .set_default(512_M)
- .set_description("Minimum memory (bytes) of bluestore_cache_size to devote to kv database (rocksdb)")
+ Option("bluestore_cache_autotune", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ .set_default(true)
.add_see_also("bluestore_cache_size")
- .set_long_description("A negative value means using bluestore_cache_meta_ratio "
- "and bluestore_cache_kv_ratio instead of calculating these ratios using "
- "bluestore_cache_size_* and bluestore_cache_kv_min. If "
- "bluestore_cache_size is below bluestore_cache_kv_min "
- "then this option has no effect."),
+ .add_see_also("bluestore_cache_meta_ratio")
+ .set_description("Automatically tune the ratio of caches while respecting min values."),
+
+ Option("bluestore_cache_autotune_chunk_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(33554432)
+ .add_see_also("bluestore_cache_autotune")
+ .set_description("The chunk size in bytes to allocate to caches when cache autotune is enabled."),
+
+ Option("bluestore_cache_autotune_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(5)
+ .add_see_also("bluestore_cache_autotune")
+ .set_description("The number of seconds to wait between rebalances when cache autotune is enabled."),
Option("bluestore_kvbackend", Option::TYPE_STR, Option::LEVEL_DEV)
.set_default("rocksdb")
#include "include/encoding.h"
#include "common/Formatter.h"
#include "common/perf_counters.h"
+#include "common/PriorityCache.h"
using std::string;
using std::vector;
*
* Kyoto Cabinet or LevelDB should implement this
*/
-class KeyValueDB {
+class KeyValueDB : public PriorityCache::PriCache {
public:
/*
* See RocksDB's definition of a column family(CF) and how to use it.
typedef ceph::shared_ptr< WholeSpaceIteratorImpl > WholeSpaceIterator;
private:
+ int64_t cache_bytes[PriorityCache::Priority::LAST+1] = { 0 };
+ double cache_ratio = 0;
+
// This class filters a WholeSpaceIterator by a prefix.
class PrefixIteratorImpl : public IteratorImpl {
const std::string prefix;
return -EOPNOTSUPP;
}
+ // PriCache
+
+ virtual int64_t request_cache_bytes(PriorityCache::Priority pri, uint64_t chunk_bytes) const {
+ return -EOPNOTSUPP;
+ }
+
+ virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
+ return cache_bytes[pri];
+ }
+
+ virtual int64_t get_cache_bytes() const {
+ int64_t total = 0;
+
+ for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
+ PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
+ total += get_cache_bytes(pri);
+ }
+ return total;
+ }
+
+ virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
+ cache_bytes[pri] = bytes;
+ }
+
+ virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
+ cache_bytes[pri] += bytes;
+ }
+
+ virtual int64_t commit_cache_size() {
+ return -EOPNOTSUPP;
+ }
+
+ virtual double get_cache_ratio() const {
+ return cache_ratio;
+ }
+
+ virtual void set_cache_ratio(double ratio) {
+ cache_ratio = ratio;
+ }
+
+ virtual string get_cache_name() const {
+ return "Unknown KeyValueDB Cache";
+ }
+
+ // End PriCache
+
+ virtual int set_cache_high_pri_pool_ratio(double ratio) {
+ return -EOPNOTSUPP;
+ }
+
+ virtual int64_t get_cache_usage() const {
+ return -EOPNOTSUPP;
+ }
+
virtual ~KeyValueDB() {}
/// compact the underlying store
using std::string;
#include "common/perf_counters.h"
#include "common/debug.h"
+#include "common/PriorityCache.h"
#include "include/str_list.h"
#include "include/stringify.h"
#include "include/str_map.h"
uint64_t row_cache_size = cache_size * g_conf->rocksdb_cache_row_ratio;
uint64_t block_cache_size = cache_size - row_cache_size;
- if (block_cache_size == 0) {
- // disable block cache
- dout(10) << __func__ << " block_cache_size " << block_cache_size
- << ", setting no_block_cache " << dendl;
- bbt_opts.no_block_cache = true;
+ if (g_conf->rocksdb_cache_type == "lru") {
+ bbt_opts.block_cache = rocksdb::NewLRUCache(
+ block_cache_size,
+ g_conf->rocksdb_cache_shard_bits);
+ } else if (g_conf->rocksdb_cache_type == "clock") {
+ bbt_opts.block_cache = rocksdb::NewClockCache(
+ block_cache_size,
+ g_conf->rocksdb_cache_shard_bits);
} else {
- if (g_conf->rocksdb_cache_type == "lru") {
- bbt_opts.block_cache = rocksdb::NewLRUCache(
- block_cache_size,
- g_conf->rocksdb_cache_shard_bits);
- } else if (g_conf->rocksdb_cache_type == "clock") {
- bbt_opts.block_cache = rocksdb::NewClockCache(
- block_cache_size,
- g_conf->rocksdb_cache_shard_bits);
- } else {
- derr << "unrecognized rocksdb_cache_type '" << g_conf->rocksdb_cache_type
- << "'" << dendl;
- return -EINVAL;
- }
+ derr << "unrecognized rocksdb_cache_type '" << g_conf->rocksdb_cache_type
+ << "'" << dendl;
+ return -EINVAL;
}
bbt_opts.block_size = g_conf->rocksdb_block_size;
db->CompactRange(options, &cstart, &cend);
}
+int64_t RocksDBStore::request_cache_bytes(PriorityCache::Priority pri, uint64_t chunk_bytes) const
+{
+ auto cache = bbt_opts.block_cache;
+ int64_t assigned = get_cache_bytes(pri);
+
+ switch (pri) {
+ // PRI0 is for rocksdb's high priority items (indexes/filters)
+ case PriorityCache::Priority::PRI0:
+ {
+ int64_t usage = cache->GetHighPriPoolUsage();
+
+ // RocksDB sometimes flushes the high pri cache when the low priority
+ // cache exceeds the soft cap, so in that case use a "watermark" for
+ // the usage instead.
+ if (high_pri_watermark > usage) {
+ usage = high_pri_watermark;
+ }
+ dout(10) << __func__ << " high pri pool usage: " << usage << dendl;
+ int64_t request = PriorityCache::get_chunk(usage, chunk_bytes);
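+    // Illustrative example (hypothetical numbers): usage = 50MiB and
+    // chunk_bytes = 32MiB yield a 96MiB request; with 64MiB already
+    // assigned, only the additional 32MiB is requested.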
+ return (request > assigned) ? request - assigned : 0;
+ }
+ // All other cache items are currently shoved into the LAST priority.
+ case PriorityCache::Priority::LAST:
+ {
+ uint64_t usage = cache->GetUsage() - cache->GetHighPriPoolUsage();
+ dout(10) << __func__ << " low pri pool usage: " << usage << dendl;
+ int64_t request = PriorityCache::get_chunk(usage, chunk_bytes);
+ return (request > assigned) ? request - assigned : 0;
+ }
+ default:
+ break;
+ }
+ return -EOPNOTSUPP;
+}
+
+int64_t RocksDBStore::get_cache_usage() const
+{
+ return static_cast<int64_t>(bbt_opts.block_cache->GetUsage());
+}
+
+int64_t RocksDBStore::commit_cache_size()
+{
+ int64_t high_pri_bytes = get_cache_bytes(PriorityCache::Priority::PRI0);
+ int64_t total_bytes = get_cache_bytes();
+
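+  // Illustrative example (hypothetical numbers): 96MiB assigned to PRI0 out
+  // of 512MiB total yields a high priority pool ratio of 0.1875.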
+ double ratio = (double) high_pri_bytes / total_bytes;
+ size_t old_bytes = bbt_opts.block_cache->GetCapacity();
+ dout(10) << __func__ << " old: " << old_bytes
+ << ", new: " << total_bytes << dendl;
+ bbt_opts.block_cache->SetCapacity((size_t) total_bytes);
+ set_cache_high_pri_pool_ratio(ratio);
+
+  // After setting the cache sizes, update the high pri watermark: raise it
+  // to the current high pri pool usage, or decay it by 10% otherwise.
+ int64_t high_pri_pool_usage = bbt_opts.block_cache->GetHighPriPoolUsage();
+ if (high_pri_watermark < high_pri_pool_usage) {
+ high_pri_watermark = high_pri_pool_usage;
+ } else {
+ high_pri_watermark = static_cast<int64_t>(0.90 * high_pri_watermark);
+ }
+
+ return total_bytes;
+}
+
+int RocksDBStore::set_cache_high_pri_pool_ratio(double ratio)
+{
+ if (g_conf->rocksdb_cache_type != "lru") {
+ return -EOPNOTSUPP;
+ }
+ dout(10) << __func__ << " old ratio: "
+ << bbt_opts.block_cache->GetHighPriPoolRatio() << " new ratio: "
+ << ratio << dendl;
+ bbt_opts.block_cache->SetHighPriPoolRatio(ratio);
+ return 0;
+}
+
+int64_t RocksDBStore::get_cache_capacity() {
+ return bbt_opts.block_cache->GetCapacity();
+}
+
RocksDBStore::RocksDBWholeSpaceIteratorImpl::~RocksDBWholeSpaceIteratorImpl()
{
delete dbiter;
#include "include/assert.h"
#include "common/Formatter.h"
#include "common/Cond.h"
-
#include "common/ceph_context.h"
+#include "common/PriorityCache.h"
+
class PerfCounters;
enum {
bool disableWAL;
bool enable_rmrange;
void compact() override;
+ int64_t high_pri_watermark;
int tryInterpret(const string& key, const string& val, rocksdb::Options &opt);
int ParseOptionsFromString(const string& opt_str, rocksdb::Options &opt);
compact_thread(this),
compact_on_mount(false),
disableWAL(false),
- enable_rmrange(cct->_conf->rocksdb_enable_rmrange)
+ enable_rmrange(cct->_conf->rocksdb_enable_rmrange),
+ high_pri_watermark(0)
{}
~RocksDBStore() override;
return total_size;
}
+ virtual int64_t request_cache_bytes(
+    PriorityCache::Priority pri, uint64_t chunk_bytes) const override;
+ virtual int64_t commit_cache_size() override;
+ virtual std::string get_cache_name() const override {
+ return "RocksDB Block Cache";
+ }
+ virtual int64_t get_cache_usage() const override;
+
int set_cache_size(uint64_t s) override {
cache_size = s;
set_cache_flag = true;
return 0;
}
+ int set_cache_capacity(int64_t capacity);
+ int set_cache_high_pri_pool_ratio(double ratio);
+ int64_t get_cache_capacity();
+
WholeSpaceIterator get_wholespace_iterator() override;
};
#include "include/str_map.h"
#include "common/errno.h"
#include "common/safe_io.h"
+#include "common/PriorityCache.h"
#include "Allocator.h"
#include "FreelistManager.h"
#include "BlueFS.h"
return c;
}
-void BlueStore::Cache::trim_all()
+void BlueStore::Cache::trim(uint64_t onode_max, uint64_t buffer_max)
{
std::lock_guard<std::recursive_mutex> l(lock);
- _trim(0, 0);
+ _trim(onode_max, buffer_max);
}
-void BlueStore::Cache::trim(
- uint64_t target_bytes,
- float target_meta_ratio,
- float target_data_ratio,
- float bytes_per_onode)
+void BlueStore::Cache::trim_all()
{
std::lock_guard<std::recursive_mutex> l(lock);
- uint64_t current_meta = _get_num_onodes() * bytes_per_onode;
- uint64_t current_buffer = _get_buffer_bytes();
- uint64_t current = current_meta + current_buffer;
-
- uint64_t target_meta = target_bytes * target_meta_ratio;
- uint64_t target_buffer = target_bytes * target_data_ratio;
-
- // correct for overflow or float imprecision
- target_meta = min(target_bytes, target_meta);
- target_buffer = min(target_bytes - target_meta, target_buffer);
-
- if (current <= target_bytes) {
- dout(30) << __func__
- << " shard target " << byte_u_t(target_bytes)
- << " meta/data ratios " << target_meta_ratio
- << " + " << target_data_ratio << " ("
- << byte_u_t(target_meta) << " + "
- << byte_u_t(target_buffer) << "), "
- << " current " << byte_u_t(current) << " ("
- << byte_u_t(current_meta) << " + "
- << byte_u_t(current_buffer) << ")"
- << dendl;
- return;
- }
-
- uint64_t need_to_free = current - target_bytes;
- uint64_t free_buffer = 0;
- uint64_t free_meta = 0;
- if (current_buffer > target_buffer) {
- free_buffer = current_buffer - target_buffer;
- if (free_buffer > need_to_free) {
- free_buffer = need_to_free;
- }
- }
- free_meta = need_to_free - free_buffer;
-
- // start bounds at what we have now
- uint64_t max_buffer = current_buffer - free_buffer;
- uint64_t max_meta = current_meta - free_meta;
- uint64_t max_onodes = max_meta / bytes_per_onode;
-
- dout(20) << __func__
- << " shard target " << byte_u_t(target_bytes)
- << " ratio " << target_meta_ratio << " ("
- << byte_u_t(target_meta) << " + "
- << byte_u_t(target_buffer) << "), "
- << " current " << byte_u_t(current) << " ("
- << byte_u_t(current_meta) << " + "
- << byte_u_t(current_buffer) << "),"
- << " need_to_free " << byte_u_t(need_to_free) << " ("
- << byte_u_t(free_meta) << " + "
- << byte_u_t(free_buffer) << ")"
- << " -> max " << max_onodes << " onodes + "
- << max_buffer << " buffer"
- << dendl;
- _trim(max_onodes, max_buffer);
+ _trim(0, 0);
}
-
// LRUCache
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.LRUCache(" << this << ") "
// =======================================================
+// MempoolThread
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
+
void *BlueStore::MempoolThread::entry()
{
Mutex::Locker l(lock);
- while (!stop) {
- uint64_t meta_bytes =
- mempool::bluestore_cache_other::allocated_bytes() +
- mempool::bluestore_cache_onode::allocated_bytes();
- uint64_t onode_num =
- mempool::bluestore_cache_onode::allocated_items();
- if (onode_num < 2) {
- onode_num = 2;
- }
+ std::list<PriorityCache::PriCache *> caches;
+ caches.push_back(store->db);
+ caches.push_back(&meta_cache);
+ caches.push_back(&data_cache);
- float bytes_per_onode = (float)meta_bytes / (float)onode_num;
- size_t num_shards = store->cache_shards.size();
- float target_ratio = store->cache_meta_ratio + store->cache_data_ratio;
- // A little sloppy but should be close enough
- uint64_t shard_target = target_ratio * (store->cache_size / num_shards);
+ utime_t next_balance = ceph_clock_now();
+ while (!stop) {
+ _adjust_cache_settings();
- for (auto i : store->cache_shards) {
- i->trim(shard_target,
- store->cache_meta_ratio,
- store->cache_data_ratio,
- bytes_per_onode);
+ // Before we trim, check and see if it's time to rebalance
+ bool log_stats = false;
+ double autotune_interval = store->cache_autotune_interval;
+ if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
+ if (store->cache_autotune) {
+ _balance_cache(caches);
+ }
+ next_balance = ceph_clock_now();
+ next_balance += autotune_interval;
+ log_stats = true;
}
+ _trim_shards(log_stats);
store->_update_cache_logger();
utime_t wait;
return NULL;
}
+void BlueStore::MempoolThread::_adjust_cache_settings()
+{
+ store->db->set_cache_ratio(store->cache_kv_ratio);
+ meta_cache.set_cache_ratio(store->cache_meta_ratio);
+ data_cache.set_cache_ratio(store->cache_data_ratio);
+}
+
+void BlueStore::MempoolThread::_trim_shards(bool log_stats)
+{
+ uint64_t cache_size = store->cache_size;
+ size_t num_shards = store->cache_shards.size();
+ int64_t kv_alloc_bytes = 0;
+ int64_t meta_alloc_bytes = 0;
+ int64_t data_alloc_bytes = 0;
+
+ if (store->cache_autotune) {
+ kv_alloc_bytes = store->db->get_cache_bytes();
+ meta_alloc_bytes = meta_cache.get_cache_bytes();
+ data_alloc_bytes = data_cache.get_cache_bytes();
+ } else {
+ kv_alloc_bytes = static_cast<int64_t>(
+ store->db->get_cache_ratio() * cache_size);
+ meta_alloc_bytes = static_cast<int64_t>(
+ meta_cache.get_cache_ratio() * cache_size);
+ data_alloc_bytes = static_cast<int64_t>(
+ data_cache.get_cache_ratio() * cache_size);
+ }
+ if (log_stats) {
+ double kv_alloc_ratio = (double) kv_alloc_bytes / cache_size;
+ double meta_alloc_ratio = (double) meta_alloc_bytes / cache_size;
+ double data_alloc_ratio = (double) data_alloc_bytes / cache_size;
+ double kv_used_ratio = (double) store->db->get_cache_usage() / cache_size;
+ double meta_used_ratio = (double) meta_cache._get_used_bytes() / cache_size;
+ double data_used_ratio = (double) data_cache._get_used_bytes() / cache_size;
+
+ ldout(store->cct, 5) << __func__ << " ratios -" << std::fixed << std::setprecision(1)
+ << " kv_alloc: " << 100*kv_alloc_ratio << "%"
+ << " kv_used: " << 100*kv_used_ratio << "%"
+ << " meta_alloc: " << 100*meta_alloc_ratio << "%"
+ << " meta_used: " << 100*meta_used_ratio << "%"
+ << " data_alloc: " << 100*data_alloc_ratio << "%"
+ << " data_used: " << 100*data_used_ratio << "%" << dendl;
+ }
+
+ uint64_t max_shard_onodes = static_cast<uint64_t>(
+ (meta_alloc_bytes / (double) num_shards) / meta_cache.get_bytes_per_onode());
+ uint64_t max_shard_buffer = static_cast<uint64_t>(
+ data_alloc_bytes / num_shards);
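+  // Illustrative example (hypothetical numbers): 400MiB assigned to metadata
+  // across 4 shards at ~500 bytes per onode allows roughly 200K onodes per
+  // shard, while the data bytes are simply split evenly across the shards.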
+ ldout(store->cct, 30) << __func__ << " max_shard_onodes: " << max_shard_onodes
+ << " max_shard_buffer: " << max_shard_buffer << dendl;
+
+ for (auto i : store->cache_shards) {
+ i->trim(max_shard_onodes, max_shard_buffer);
+ }
+}
+
+void BlueStore::MempoolThread::_balance_cache(
+ const std::list<PriorityCache::PriCache *>& caches)
+{
+ int64_t mem_avail = store->cache_size;
+
+ // Assign memory for each priority level
+ for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
+ ldout(store->cct, 10) << __func__ << " assigning cache bytes for PRI: " << i << dendl;
+ PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
+ _balance_cache_pri(&mem_avail, caches, pri);
+ }
+ // Assign any leftover memory based on the default ratios.
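+  // Illustrative example (hypothetical ratios): with 300MiB left over and
+  // cache ratios of 0.5, 0.3 and 0.2, the caches receive an additional
+  // 150MiB, 90MiB and 60MiB respectively at the LAST priority.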
+ if (mem_avail > 0) {
+ for (auto it = caches.begin(); it != caches.end(); it++) {
+ int64_t fair_share =
+ static_cast<int64_t>((*it)->get_cache_ratio() * mem_avail);
+ if (fair_share > 0) {
+ (*it)->add_cache_bytes(PriorityCache::Priority::LAST, fair_share);
+ }
+ }
+ }
+  // Sanity check: we should never have assigned more memory than was available.
+ assert(mem_avail >= 0);
+
+ // Finally commit the new cache sizes
+ for (auto it = caches.begin(); it != caches.end(); it++) {
+ (*it)->commit_cache_size();
+ }
+}
+
+void BlueStore::MempoolThread::_balance_cache_pri(int64_t *mem_avail,
+ const std::list<PriorityCache::PriCache *>& caches, PriorityCache::Priority pri)
+{
+ std::list<PriorityCache::PriCache *> tmp_caches = caches;
+ double cur_ratios = 0;
+ double new_ratios = 0;
+
+ // Zero this priority's bytes, sum the initial ratios.
+ for (auto it = tmp_caches.begin(); it != tmp_caches.end(); it++) {
+ (*it)->set_cache_bytes(pri, 0);
+ cur_ratios += (*it)->get_cache_ratio();
+ }
+
+  // For this priority, loop until caches are satisfied or we run out of memory.
+ // Since we can't allocate fractional bytes, stop if we have fewer bytes left
+ // than the number of participating caches.
+ while (!tmp_caches.empty() && *mem_avail > static_cast<int64_t>(tmp_caches.size())) {
+ uint64_t total_assigned = 0;
+
+ for (auto it = tmp_caches.begin(); it != tmp_caches.end(); ) {
+ int64_t cache_wants = (*it)->request_cache_bytes(pri, store->cache_autotune_chunk_size);
+
+      // Usually the ratio is this cache's assigned ratio divided by the total
+      // ratio of all caches that still want memory. There is a special case
+      // where the remaining caches are all assigned a 0% ratio but still want
+      // memory. In that case, give them an equal shot at the remaining memory
+      // for this priority.
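+      // Illustrative example (hypothetical ratios): if two caches with ratios
+      // 0.45 and 0.15 remain, the first receives 0.45 / 0.60 = 75% of the
+      // memory still available for this priority in the current round.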
+ double ratio = 1.0 / tmp_caches.size();
+ if (cur_ratios > 0) {
+ ratio = (*it)->get_cache_ratio() / cur_ratios;
+ }
+ int64_t fair_share = static_cast<int64_t>(*mem_avail * ratio);
+
+ if (cache_wants > fair_share) {
+ // If we want too much, take what we can get but stick around for more
+ (*it)->add_cache_bytes(pri, fair_share);
+ total_assigned += fair_share;
+
+ new_ratios += (*it)->get_cache_ratio();
+ ldout(store->cct, 20) << __func__ << " " << (*it)->get_cache_name()
+ << " wanted: " << cache_wants << " fair_share: " << fair_share
+ << " mem_avail: " << *mem_avail
+ << " staying in list. Size: " << tmp_caches.size()
+ << dendl;
+ ++it;
+ } else {
+ // Otherwise assign only what we want
+ if (cache_wants > 0) {
+ (*it)->add_cache_bytes(pri, cache_wants);
+ total_assigned += cache_wants;
+
+ ldout(store->cct, 20) << __func__ << " " << (*it)->get_cache_name()
+ << " wanted: " << cache_wants << " fair_share: " << fair_share
+ << " mem_avail: " << *mem_avail
+ << " removing from list. New size: " << tmp_caches.size() - 1
+ << dendl;
+
+ }
+ // Either the cache didn't want anything or got what it wanted, so remove it from the tmp list.
+ it = tmp_caches.erase(it);
+ }
+ }
+ // Reset the ratios
+ *mem_avail -= total_assigned;
+ cur_ratios = new_ratios;
+ new_ratios = 0;
+ }
+}
+
// =======================================================
// OmapIteratorImpl
int BlueStore::_set_cache_sizes()
{
ceph_assert(bdev);
+ cache_autotune = cct->_conf->get_val<bool>("bluestore_cache_autotune");
+ cache_autotune_chunk_size =
+ cct->_conf->get_val<uint64_t>("bluestore_cache_autotune_chunk_size");
+ cache_autotune_interval =
+ cct->_conf->get_val<double>("bluestore_cache_autotune_interval");
+
if (cct->_conf->bluestore_cache_size) {
cache_size = cct->_conf->bluestore_cache_size;
} else {
cache_size = cct->_conf->bluestore_cache_size_ssd;
}
}
- cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
- cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
+ cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
- << ") must be in range [0,1.0]" << dendl;
+ << ") must be in range [0,1.0]" << dendl;
return -EINVAL;
}
+
+ cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
- << ") must be in range [0,1.0]" << dendl;
+ << ") must be in range [0,1.0]" << dendl;
return -EINVAL;
}
+
if (cache_meta_ratio + cache_kv_ratio > 1.0) {
derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
- << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
- << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
- << dendl;
+ << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
+ << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
+ << dendl;
return -EINVAL;
}
- double cache_kv_min = cct->_conf->bluestore_cache_kv_min;
- double cache_kv_min_ratio = 0;
-
- // if cache_kv_min is negative, disable it
- if (cache_size > 0 && cache_kv_min >= 0) {
- cache_kv_min_ratio = std::min((double)cache_kv_min / (double)cache_size,
- (double)1.0);
- if (cache_kv_min_ratio > cache_kv_ratio) {
- dout(1) << __func__ << " kv_min_ratio " << cache_kv_min_ratio
- << " > kv_ratio " << cache_kv_ratio << dendl;
- cache_kv_ratio = cache_kv_min_ratio;
- cache_meta_ratio = std::min((double)cache_meta_ratio,
- (double)1.0 - cache_kv_ratio);
- }
- }
-
cache_data_ratio =
(double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;
-
if (cache_data_ratio < 0) {
// deal with floating point imprecision
cache_data_ratio = 0;
}
+
dout(1) << __func__ << " cache_size " << cache_size
<< " meta " << cache_meta_ratio
<< " kv " << cache_kv_ratio
}
}
+
db = KeyValueDB::create(cct,
kv_backend,
fn,
FreelistManager::setup_merge_operators(db);
db->set_merge_operator(PREFIX_STAT, merge_op);
-
- db->set_cache_size(cache_size * cache_kv_ratio);
+ db->set_cache_size(cache_kv_ratio * cache_size);
if (kv_backend == "rocksdb") {
options = cct->_conf->bluestore_rocksdb_options;
#include "common/bloom_filter.hpp"
#include "common/Finisher.h"
#include "common/perf_counters.h"
+#include "common/PriorityCache.h"
#include "compressor/Compressor.h"
#include "os/ObjectStore.h"
--num_blobs;
}
- void trim(uint64_t target_bytes,
- float target_meta_ratio,
- float target_data_ratio,
- float bytes_per_onode);
+ void trim(uint64_t onode_max, uint64_t buffer_max);
void trim_all();
// cache trim control
uint64_t cache_size = 0; ///< total cache size
- float cache_meta_ratio = 0; ///< cache ratio dedicated to metadata
- float cache_kv_ratio = 0; ///< cache ratio dedicated to kv (e.g., rocksdb)
- float cache_data_ratio = 0; ///< cache ratio dedicated to object data
+ double cache_meta_ratio = 0; ///< cache ratio dedicated to metadata
+ double cache_kv_ratio = 0; ///< cache ratio dedicated to kv (e.g., rocksdb)
+ double cache_data_ratio = 0; ///< cache ratio dedicated to object data
+ uint64_t cache_meta_min = 0; ///< cache min dedicated to metadata
+ uint64_t cache_kv_min = 0; ///< cache min dedicated to kv (e.g., rocksdb)
+ uint64_t cache_data_min = 0; ///< cache min dedicated to object data
+ bool cache_autotune = false; ///< cache autotune setting
+ uint64_t cache_autotune_chunk_size = 0; ///< cache autotune chunk size
+ double cache_autotune_interval = 0; ///< time to wait between cache rebalancing
std::mutex vstatfs_lock;
volatile_statfs vstatfs;
struct MempoolThread : public Thread {
+ public:
BlueStore *store;
+
Cond cond;
Mutex lock;
bool stop = false;
+
+ struct MempoolCache : public PriorityCache::PriCache {
+ BlueStore *store;
+    int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
+ double cache_ratio = 0;
+
+ MempoolCache(BlueStore *s) : store(s) {};
+
+ virtual uint64_t _get_used_bytes() const = 0;
+
+ virtual int64_t request_cache_bytes(
+ PriorityCache::Priority pri, uint64_t chunk_bytes) const {
+ int64_t assigned = get_cache_bytes(pri);
+
+ switch (pri) {
+ // All cache items are currently shoved into the LAST priority
+ case PriorityCache::Priority::LAST:
+ {
+ uint64_t usage = _get_used_bytes();
+ int64_t request = PriorityCache::get_chunk(usage, chunk_bytes);
+          return (request > assigned) ? request - assigned : 0;
+ }
+ default:
+ break;
+ }
+ return -EOPNOTSUPP;
+ }
+
+ virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
+ return cache_bytes[pri];
+ }
+ virtual int64_t get_cache_bytes() const {
+ int64_t total = 0;
+
+ for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
+ PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
+ total += get_cache_bytes(pri);
+ }
+ return total;
+ }
+ virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
+ cache_bytes[pri] = bytes;
+ }
+ virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
+ cache_bytes[pri] += bytes;
+ }
+ virtual int64_t commit_cache_size() {
+ return get_cache_bytes();
+ }
+ virtual double get_cache_ratio() const {
+ return cache_ratio;
+ }
+ virtual void set_cache_ratio(double ratio) {
+ cache_ratio = ratio;
+ }
+ virtual string get_cache_name() const = 0;
+ };
+
+ struct MetaCache : public MempoolCache {
+ MetaCache(BlueStore *s) : MempoolCache(s) {};
+
+ virtual uint64_t _get_used_bytes() const {
+ return mempool::bluestore_cache_other::allocated_bytes() +
+ mempool::bluestore_cache_onode::allocated_bytes();
+ }
+
+ virtual string get_cache_name() const {
+ return "BlueStore Meta Cache";
+ }
+
+ uint64_t _get_num_onodes() const {
+ uint64_t onode_num =
+ mempool::bluestore_cache_onode::allocated_items();
+ return (2 > onode_num) ? 2 : onode_num;
+ }
+
+ double get_bytes_per_onode() const {
+ return (double)_get_used_bytes() / (double)_get_num_onodes();
+ }
+ } meta_cache;
+
+ struct DataCache : public MempoolCache {
+ DataCache(BlueStore *s) : MempoolCache(s) {};
+
+ virtual uint64_t _get_used_bytes() const {
+ uint64_t bytes = 0;
+ for (auto i : store->cache_shards) {
+ bytes += i->_get_buffer_bytes();
+ }
+ return bytes;
+ }
+ virtual string get_cache_name() const {
+ return "BlueStore Data Cache";
+ }
+ } data_cache;
+
public:
explicit MempoolThread(BlueStore *s)
: store(s),
- lock("BlueStore::MempoolThread::lock") {}
+ lock("BlueStore::MempoolThread::lock"),
+ meta_cache(MetaCache(s)),
+ data_cache(DataCache(s)) {}
+
void *entry() override;
void init() {
ceph_assert(stop == false);
lock.Unlock();
join();
}
+
+ private:
+ void _adjust_cache_settings();
+ void _trim_shards(bool log_stats);
+ void _balance_cache(const std::list<PriorityCache::PriCache *>& caches);
+ void _balance_cache_pri(int64_t *mem_avail,
+ const std::list<PriorityCache::PriCache *>& caches,
+ PriorityCache::Priority pri);
} mempool_thread;
// --------------------------------------------------------