*
* Kyoto Cabinet or LevelDB should implement this
*/
-class KeyValueDB : public PriorityCache::PriCache {
+class KeyValueDB {
public:
/*
* See RocksDB's definition of a column family(CF) and how to use it.
typedef std::shared_ptr< WholeSpaceIteratorImpl > WholeSpaceIterator;
private:
- int64_t cache_bytes[PriorityCache::Priority::LAST+1] = { 0 };
- double cache_ratio = 0;
-
// This class filters a WholeSpaceIterator by a prefix.
class PrefixIteratorImpl : public IteratorImpl {
const std::string prefix;
return -EOPNOTSUPP;
}
- // PriCache
-
- virtual int64_t request_cache_bytes(PriorityCache::Priority pri, uint64_t chunk_bytes) const {
- return -EOPNOTSUPP;
- }
-
- virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
- return cache_bytes[pri];
- }
-
- virtual int64_t get_cache_bytes() const {
- int64_t total = 0;
-
- for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
- PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
- total += get_cache_bytes(pri);
- }
- return total;
- }
-
- virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
- cache_bytes[pri] = bytes;
- }
-
- virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
- cache_bytes[pri] += bytes;
- }
-
- virtual int64_t commit_cache_size(uint64_t total_cache) {
- return -EOPNOTSUPP;
- }
-
- virtual int64_t get_committed_size() const {
- return -EOPNOTSUPP;
- }
-
- virtual double get_cache_ratio() const {
- return cache_ratio;
- }
-
- virtual void set_cache_ratio(double ratio) {
- cache_ratio = ratio;
- }
-
- virtual string get_cache_name() const {
- return "Unknown KeyValueDB Cache";
- }
-
- // End PriCache
-
virtual int set_cache_high_pri_pool_ratio(double ratio) {
return -EOPNOTSUPP;
}
return -EOPNOTSUPP;
}
+ // Default for backends that have no PriorityCache-managed cache: hand back
+ // an empty pointer.
+ virtual std::shared_ptr<PriorityCache::PriCache> get_priority_cache() const {
+   return std::shared_ptr<PriorityCache::PriCache>();
+ }
+
virtual ~KeyValueDB() {}
/// estimate space utilization for a prefix (in bytes)
#include "rocksdb/filter_policy.h"
#include "rocksdb/utilities/convenience.h"
#include "rocksdb/merge_operator.h"
-#include "kv/rocksdb_cache/BinnedLRUCache.h"
using std::string;
#include "common/perf_counters.h"
if (g_conf()->rocksdb_cache_type == "binned_lru") {
bbt_opts.block_cache = rocksdb_cache::NewBinnedLRUCache(
+ cct,
block_cache_size,
g_conf()->rocksdb_cache_shard_bits);
} else if (g_conf()->rocksdb_cache_type == "lru") {
db->CompactRange(options, &cstart, &cend);
}
-int64_t RocksDBStore::request_cache_bytes(PriorityCache::Priority pri, uint64_t chunk_bytes) const
-{
- auto cache = bbt_opts.block_cache;
-
- int64_t assigned = get_cache_bytes(pri);
- int64_t usage = 0;
- int64_t request = 0;
- switch (pri) {
- // PRI0 is for rocksdb's high priority items (indexes/filters)
- case PriorityCache::Priority::PRI0:
- {
- usage += cache->GetPinnedUsage();
- if (g_conf()->rocksdb_cache_type == "binned_lru") {
- auto binned_cache =
- std::static_pointer_cast<rocksdb_cache::BinnedLRUCache>(cache);
- usage += binned_cache->GetHighPriPoolUsage();
- }
- break;
- }
- // All other cache items are currently shoved into the LAST priority.
- case PriorityCache::Priority::LAST:
- {
- usage = get_cache_usage() - cache->GetPinnedUsage();
- if (g_conf()->rocksdb_cache_type == "binned_lru") {
- auto binned_cache =
- std::static_pointer_cast<rocksdb_cache::BinnedLRUCache>(cache);
- usage -= binned_cache->GetHighPriPoolUsage();
- }
- break;
- }
- default:
- break;
- }
- request = (request > assigned) ? request - assigned : 0;
- dout(10) << __func__ << " Priority: " << static_cast<uint32_t>(pri)
- << " Usage: " << usage << " Request: " << request << dendl;
- return request;
-}
-
-int64_t RocksDBStore::get_cache_usage() const
-{
- return static_cast<int64_t>(bbt_opts.block_cache->GetUsage());
-}
-
-int64_t RocksDBStore::commit_cache_size(uint64_t total_bytes)
-{
- size_t old_bytes = bbt_opts.block_cache->GetCapacity();
- int64_t new_bytes = PriorityCache::get_chunk(
- get_cache_bytes(), total_bytes);
- dout(10) << __func__ << " old: " << old_bytes
- << " new: " << new_bytes << dendl;
- bbt_opts.block_cache->SetCapacity((size_t) new_bytes);
-
- // Set the high priority pool ratio is this is the binned LRU cache.
- if (g_conf()->rocksdb_cache_type == "binned_lru") {
- auto binned_cache =
- std::static_pointer_cast<rocksdb_cache::BinnedLRUCache>(bbt_opts.block_cache);
- int64_t high_pri_bytes = PriorityCache::get_chunk(
- binned_cache->GetHighPriPoolUsage()+1, total_bytes);
- double ratio = (double) high_pri_bytes / new_bytes;
- dout(10) << __func__ << " High Pri Pool Ratio set to " << ratio << dendl;
- binned_cache->SetHighPriPoolRatio(ratio);
- }
- return new_bytes;
-}
-
RocksDBStore::RocksDBWholeSpaceIteratorImpl::~RocksDBWholeSpaceIteratorImpl()
{
delete dbiter;
#include "rocksdb/iostats_context.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"
+#include "kv/rocksdb_cache/BinnedLRUCache.h"
#include <errno.h>
#include "common/errno.h"
#include "common/dout.h"
bool disableWAL;
bool enable_rmrange;
void compact() override;
- int64_t high_pri_watermark;
void compact_async() override {
compact_range_async(string(), string());
compact_thread(this),
compact_on_mount(false),
disableWAL(false),
- enable_rmrange(cct->_conf->rocksdb_enable_rmrange),
- high_pri_watermark(0)
+ enable_rmrange(cct->_conf->rocksdb_enable_rmrange)
{}
~RocksDBStore() override;
return total_size;
}
- virtual int64_t request_cache_bytes(
- PriorityCache::Priority pri, uint64_t cache_bytes) const override;
- virtual int64_t commit_cache_size(uint64_t total_cache) override;
- virtual int64_t get_committed_size() const override {
- return bbt_opts.block_cache->GetCapacity();
+ // Reports the block cache's GetUsage() figure as a signed 64-bit value.
+ virtual int64_t get_cache_usage() const override {
+   const auto used = bbt_opts.block_cache->GetUsage();
+   return static_cast<int64_t>(used);
}
- virtual std::string get_cache_name() const override {
- return "RocksDB Block Cache";
- }
- virtual int64_t get_cache_usage() const override;
-
int set_cache_size(uint64_t s) override {
cache_size = s;
int set_cache_capacity(int64_t capacity);
int64_t get_cache_capacity();
+ // Exposes the block cache through the PriorityCache interface.  The result
+ // is an empty pointer when the block cache object does not derive from
+ // PriorityCache::PriCache (or when no block cache is set).
+ // The cast is spelled std::dynamic_pointer_cast so that it does not depend
+ // on a using-directive being in effect wherever this header is included.
+ virtual std::shared_ptr<PriorityCache::PriCache> get_priority_cache()
+     const override {
+   return std::dynamic_pointer_cast<PriorityCache::PriCache>(
+     bbt_opts.block_cache);
+ }
+
WholeSpaceIterator get_wholespace_iterator() override;
};
#include "BinnedLRUCache.h"
-#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string>
+#define dout_context cct
+#define dout_subsys ceph_subsys_rocksdb
+#undef dout_prefix
+#define dout_prefix *_dout << "rocksdb: "
+
namespace rocksdb_cache {
BinnedLRUHandleTable::BinnedLRUHandleTable() : list_(nullptr), length_(0), elems_(0) {
return std::string(buffer);
}
-BinnedLRUCache::BinnedLRUCache(size_t capacity, int num_shard_bits,
- bool strict_capacity_limit, double high_pri_pool_ratio)
- : ShardedCache(capacity, num_shard_bits, strict_capacity_limit) {
+BinnedLRUCache::BinnedLRUCache(CephContext *c,
+ size_t capacity,
+ int num_shard_bits,
+ bool strict_capacity_limit,
+ double high_pri_pool_ratio)
+ : ShardedCache(capacity, num_shard_bits, strict_capacity_limit), cct(c) {
num_shards_ = 1 << num_shard_bits;
// TODO: Switch over to use mempool
int rc = posix_memalign((void**) &shards_,
return usage;
}
-std::shared_ptr<rocksdb::Cache> NewBinnedLRUCache(size_t capacity, int num_shard_bits,
- bool strict_capacity_limit,
- double high_pri_pool_ratio) {
+// PriCache
+
+// PriorityCache hook: returns how many bytes beyond the amount already
+// assigned to `pri` this cache would like at that priority (0 when the
+// current assignment already covers the measured usage).
+int64_t BinnedLRUCache::request_cache_bytes(PriorityCache::Priority pri, uint64_t total_cache) const
+{
+  const int64_t already_assigned = get_cache_bytes(pri);
+  int64_t wanted = 0;
+
+  if (pri == PriorityCache::Priority::PRI0) {
+    // rocksdb's high priority items (indexes/filters) are charged to PRI0.
+    wanted = GetHighPriPoolUsage();
+  } else if (pri == PriorityCache::Priority::LAST) {
+    // Everything outside the high priority pool is lumped into LAST.
+    wanted = GetUsage();
+    wanted -= GetHighPriPoolUsage();
+  }
+  // Any other priority leaves wanted at 0.
+
+  const int64_t request =
+    (wanted > already_assigned) ? wanted - already_assigned : 0;
+  ldout(cct, 10) << __func__ << " Priority: " << static_cast<uint32_t>(pri)
+                 << " Request: " << request << dendl;
+  return request;
+}
+
+// PriorityCache hook: applies the balanced assignment to the cache.
+// The new capacity is whatever PriorityCache::get_chunk() derives from the
+// sum of all per-priority assignments and `total_bytes`; the high priority
+// pool ratio is then set to PRI0's share of that capacity.
+// Returns the new capacity in bytes.
+int64_t BinnedLRUCache::commit_cache_size(uint64_t total_bytes)
+{
+  const size_t old_bytes = GetCapacity();
+  const int64_t new_bytes = PriorityCache::get_chunk(
+    get_cache_bytes(), total_bytes);
+  ldout(cct, 10) << __func__ << " old: " << old_bytes
+                 << " new: " << new_bytes << dendl;
+  SetCapacity(static_cast<size_t>(new_bytes));
+
+  // Only divide when there is capacity to divide by; otherwise the ratio
+  // would be NaN/inf and that value would be pushed into the cache shards.
+  double ratio = 0;
+  if (new_bytes > 0) {
+    const int64_t pri0_bytes = get_cache_bytes(PriorityCache::Priority::PRI0);
+    ratio = static_cast<double>(pri0_bytes) / new_bytes;
+  }
+  ldout(cct, 10) << __func__ << " High Pri Pool Ratio set to " << ratio << dendl;
+  SetHighPriPoolRatio(ratio);
+  return new_bytes;
+}
+
+std::shared_ptr<rocksdb::Cache> NewBinnedLRUCache(  // factory; returns nullptr when num_shard_bits >= 20
+ CephContext *c,  // passed through to the BinnedLRUCache constructor
+ size_t capacity,
+ int num_shard_bits,  // negative: replaced by GetDefaultCacheShardBits(capacity)
+ bool strict_capacity_limit,
+ double high_pri_pool_ratio) {
if (num_shard_bits >= 20) {
return nullptr; // the cache cannot be sharded into too many fine pieces
}
if (num_shard_bits < 0) {
num_shard_bits = GetDefaultCacheShardBits(capacity);
}
- return std::make_shared<BinnedLRUCache>(capacity, num_shard_bits,
- strict_capacity_limit, high_pri_pool_ratio);
+ return std::make_shared<BinnedLRUCache>(  // shared ownership of the new cache goes to the caller
+ c, capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio);
}
} // namespace rocksdb_cache
#include <mutex>
#include "ShardedCache.h"
-
#include "common/autovector.h"
+#include "common/dout.h"
#include "include/ceph_assert.h"
+#include "common/ceph_context.h"
namespace rocksdb_cache {
// RUCache::Release (to move into state 2) or BinnedLRUCacheShard::Erase (for state 3)
std::shared_ptr<rocksdb::Cache> NewBinnedLRUCache(
+ CephContext *c,
size_t capacity,
int num_shard_bits = -1,
bool strict_capacity_limit = false,
class BinnedLRUCache : public ShardedCache {
public:
- BinnedLRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
- double high_pri_pool_ratio);
+ BinnedLRUCache(CephContext *c, size_t capacity, int num_shard_bits,
+ bool strict_capacity_limit, double high_pri_pool_ratio);
virtual ~BinnedLRUCache();
virtual const char* Name() const override { return "BinnedLRUCache"; }
virtual CacheShard* GetShard(int shard) override;
// Retrieves high pri pool usage
size_t GetHighPriPoolUsage() const;
+ // PriorityCache hooks.  Marked override, like Name() and GetShard() in this
+ // class, so the compiler rejects any drift from the base declarations.
+ virtual int64_t request_cache_bytes(
+   PriorityCache::Priority pri, uint64_t total_cache) const override;
+ virtual int64_t commit_cache_size(uint64_t total_cache) override;
+ // The committed size is the cache's current capacity.
+ virtual int64_t get_committed_size() const override {
+   return GetCapacity();
+ }
+ // Fixed label identifying this cache.
+ virtual std::string get_cache_name() const override {
+   return "RocksDB Binned LRU Cache";
+ }
+
private:
+ CephContext *cct;
BinnedLRUCacheShard* shards_;
int num_shards_ = 0;
};
#include "rocksdb/cache.h"
#include "include/ceph_hash.h"
+#include "common/PriorityCache.h"
//#include "hash.h"
#ifndef CACHE_LINE_SIZE
// Generic cache interface which shards cache by hash of keys. 2^num_shard_bits
// shards will be created, with capacity split evenly to each of the shards.
// Keys are sharded by the highest num_shard_bits bits of hash value.
-class ShardedCache : public rocksdb::Cache {
+class ShardedCache : public rocksdb::Cache, public PriorityCache::PriCache {
public:
ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit);
virtual ~ShardedCache() = default;
int GetNumShardBits() const { return num_shard_bits_; }
+ // PriCache
+ // Bytes recorded for the single priority level `pri`.
+ virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
+   const int64_t assigned = this->cache_bytes[pri];
+   return assigned;
+ }
+ // Sum of the bytes recorded for every priority level, first through LAST.
+ // Each level is read through the per-priority virtual getter.
+ virtual int64_t get_cache_bytes() const {
+   int64_t sum = 0;
+   for (int level = 0; level <= PriorityCache::Priority::LAST; ++level) {
+     sum += get_cache_bytes(static_cast<PriorityCache::Priority>(level));
+   }
+   return sum;
+ }
+ // Overwrites the byte count recorded for priority level `pri`.
+ virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
+   int64_t& slot = this->cache_bytes[pri];
+   slot = bytes;
+ }
+ // Adds `bytes` (which may be negative) to the count recorded for `pri`.
+ virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
+   int64_t& slot = this->cache_bytes[pri];
+   slot += bytes;
+ }
+ // Ratio most recently stored via set_cache_ratio() (0 until then).
+ virtual double get_cache_ratio() const {
+   return this->cache_ratio;
+ }
+ // Stores `ratio` as-is; no range checking is applied here.
+ virtual void set_cache_ratio(double ratio) {
+   this->cache_ratio = ratio;
+ }
+ virtual std::string get_cache_name() const = 0;
+
private:
static inline uint32_t HashSlice(const rocksdb::Slice& s) {
return ceph_str_hash(CEPH_STR_HASH_RJENKINS, s.data(), s.size());
return (num_shard_bits_ > 0) ? (hash >> (32 - num_shard_bits_)) : 0;
}
+ int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
+ double cache_ratio = 0;
+
int num_shard_bits_;
mutable std::mutex capacity_mutex_;
size_t capacity_;
{
std::unique_lock l(lock);
- std::list<PriorityCache::PriCache *> caches;
- caches.push_back(store->db);
- caches.push_back(&meta_cache);
- caches.push_back(&data_cache);
+ std::list<std::shared_ptr<PriorityCache::PriCache>> caches;
+ binned_kv_cache = store->db->get_priority_cache();
+ if (binned_kv_cache != nullptr) {
+ caches.push_back(binned_kv_cache);
+ }
+ caches.push_back(meta_cache);
+ caches.push_back(data_cache);
+
autotune_cache_size = store->osd_memory_cache_min;
utime_t next_balance = ceph_clock_now();
bool interval_stats_trim = false;
bool interval_stats_resize = false;
while (!stop) {
- _adjust_cache_settings();
-
// Before we trim, check and see if it's time to rebalance/resize.
double autotune_interval = store->cache_autotune_interval;
double resize_interval = store->osd_memory_cache_resize_interval;
if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
+ _adjust_cache_settings();
+
// Log events at 5 instead of 20 when balance happens.
interval_stats_resize = true;
interval_stats_trim = true;
void BlueStore::MempoolThread::_adjust_cache_settings()
{
- store->db->set_cache_ratio(store->cache_kv_ratio);
- meta_cache.set_cache_ratio(store->cache_meta_ratio);
- data_cache.set_cache_ratio(store->cache_data_ratio);
+ if (binned_kv_cache != nullptr) {  // stays null unless get_priority_cache() handed back a cache
+ binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);  // kv cache takes the store's kv ratio
+ }
+ meta_cache->set_cache_ratio(store->cache_meta_ratio);  // meta and data caches always exist
+ data_cache->set_cache_ratio(store->cache_data_ratio);
}
void BlueStore::MempoolThread::_trim_shards(bool interval_stats)
size_t num_shards = store->cache_shards.size();
int64_t kv_used = store->db->get_cache_usage();
- int64_t meta_used = meta_cache._get_used_bytes();
- int64_t data_used = data_cache._get_used_bytes();
+ int64_t meta_used = meta_cache->_get_used_bytes();
+ int64_t data_used = data_cache->_get_used_bytes();
uint64_t cache_size = store->cache_size;
int64_t kv_alloc =
- static_cast<int64_t>(store->db->get_cache_ratio() * cache_size);
+ static_cast<int64_t>(store->cache_kv_ratio * cache_size);
int64_t meta_alloc =
- static_cast<int64_t>(meta_cache.get_cache_ratio() * cache_size);
+ static_cast<int64_t>(store->cache_meta_ratio * cache_size);
int64_t data_alloc =
- static_cast<int64_t>(data_cache.get_cache_ratio() * cache_size);
+ static_cast<int64_t>(store->cache_data_ratio * cache_size);
- if (store->cache_autotune) {
+ if (binned_kv_cache != nullptr && store->cache_autotune) {
cache_size = autotune_cache_size;
- kv_alloc = store->db->get_committed_size();
- meta_alloc = meta_cache.get_committed_size();
- data_alloc = data_cache.get_committed_size();
+ kv_alloc = binned_kv_cache->get_committed_size();
+ meta_alloc = meta_cache->get_committed_size();
+ data_alloc = data_cache->get_committed_size();
}
if (interval_stats) {
}
uint64_t max_shard_onodes = static_cast<uint64_t>(
- (meta_alloc / (double) num_shards) / meta_cache.get_bytes_per_onode());
+ (meta_alloc / (double) num_shards) / meta_cache->get_bytes_per_onode());
uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / num_shards);
ldout(cct, 30) << __func__ << " max_shard_onodes: " << max_shard_onodes
}
void BlueStore::MempoolThread::_balance_cache(
- const std::list<PriorityCache::PriCache *>& caches)
+ const std::list<std::shared_ptr<PriorityCache::PriCache>>& caches)
{
int64_t mem_avail = autotune_cache_size;
/* Each cache is going to get at least 1 chunk's worth of memory from get_chunk
}
void BlueStore::MempoolThread::_balance_cache_pri(int64_t *mem_avail,
- const std::list<PriorityCache::PriCache *>& caches, PriorityCache::Priority pri)
+ const std::list<std::shared_ptr<PriorityCache::PriCache>>& caches,
+ PriorityCache::Priority pri)
{
- std::list<PriorityCache::PriCache *> tmp_caches = caches;
+ std::list<std::shared_ptr<PriorityCache::PriCache>> tmp_caches = caches;
double cur_ratios = 0;
double new_ratios = 0;
ceph::mutex lock = ceph::make_mutex("BlueStore::MempoolThread::lock");
bool stop = false;
uint64_t autotune_cache_size = 0;
+ std::shared_ptr<PriorityCache::PriCache> binned_kv_cache = nullptr;
struct MempoolCache : public PriorityCache::PriCache {
BlueStore *store;
- int64_t cache_bytes[PriorityCache::Priority::LAST+1];
+ int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
int64_t committed_bytes = 0;
double cache_ratio = 0;
double get_bytes_per_onode() const {
return (double)_get_used_bytes() / (double)_get_num_onodes();
}
- } meta_cache;
+ };
+ std::shared_ptr<MetaCache> meta_cache;
struct DataCache : public MempoolCache {
DataCache(BlueStore *s) : MempoolCache(s) {};
virtual string get_cache_name() const {
return "BlueStore Data Cache";
}
- } data_cache;
+ };
+ std::shared_ptr<DataCache> data_cache;
public:
explicit MempoolThread(BlueStore *s)
: store(s),
- meta_cache(MetaCache(s)),
- data_cache(DataCache(s)) {}
+ // The meta and data caches are shared_ptr members; build them with
+ // make_shared (single allocation, no naked new).  Both take the owning
+ // BlueStore pointer.
+ meta_cache(std::make_shared<MetaCache>(s)),
+ data_cache(std::make_shared<DataCache>(s)) {}
void *entry() override;
void init() {
void _adjust_cache_settings();
void _trim_shards(bool interval_stats);
void _tune_cache_size(bool interval_stats);
- void _balance_cache(const std::list<PriorityCache::PriCache *>& caches);
- void _balance_cache_pri(int64_t *mem_avail,
- const std::list<PriorityCache::PriCache *>& caches,
- PriorityCache::Priority pri);
+ void _balance_cache(
+ const std::list<std::shared_ptr<PriorityCache::PriCache>>& caches);
+ void _balance_cache_pri(
+ int64_t *mem_avail,
+ const std::list<std::shared_ptr<PriorityCache::PriCache>>& caches,
+ PriorityCache::Priority pri);
} mempool_thread;
// --------------------------------------------------------