// shrink it to 1/256 of the rounded up cache size
chunk /= 256;
- // bound the chunk size to be between 4MB and 32MB
+ // bound the chunk size to be between 4MB and 64MB
chunk = (chunk > 4ul*1024*1024) ? chunk : 4ul*1024*1024;
- chunk = (chunk < 16ul*1024*1024) ? chunk : 16ul*1024*1024;
+ chunk = (chunk < 64ul*1024*1024) ? chunk : 64ul*1024*1024;
- /* Add 16 chunks of headroom and round up to the near chunk. Note that
+ /* FIXME: Hardcoded to force get_chunk to never drop below 64MB.
* if RocksDB is used, it's a good idea to have N MB of headroom where
* N is the target_file_size_base value. RocksDB will read SST files
* into the block cache during compaction, which can potentially force out
* existing cache entries. Reserving headroom for those compaction reads
* allows the kv cache to grow even during extremely heavy compaction workloads.
*/
- uint64_t val = usage + (16 * chunk);
+ uint64_t val = usage + 64*1024*1024;
uint64_t r = (val) % chunk;
if (r > 0)
val = val + chunk - r;
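+ // Illustrative example (numbers are hypothetical): a 4 GiB rounded cache
+ // size gives a 16 MiB chunk (4 GiB / 256, within the 4 MiB - 64 MiB bounds);
+ // a usage of 100 MiB plus the 64 MiB headroom is 164 MiB, which rounds up
+ // to the next 16 MiB boundary, so val becomes 176 MiB.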
.add_see_also("bluestore_cache_size"),
Option("bluestore_cache_meta_ratio", Option::TYPE_FLOAT, Option::LEVEL_DEV)
- .set_default(.4)
+ .set_default(.45)
.add_see_also("bluestore_cache_size")
.set_description("Ratio of bluestore cache to devote to metadata"),
Option("bluestore_cache_kv_ratio", Option::TYPE_FLOAT, Option::LEVEL_DEV)
- .set_default(.4)
+ .set_default(.45)
.add_see_also("bluestore_cache_size")
.set_description("Ratio of bluestore cache to devote to kv database (rocksdb)"),
+ Option("bluestore_cache_kv_onode_ratio", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(.04)
+ .add_see_also("bluestore_cache_size")
+ .set_description("Ratio of bluestore cache to devote to kv onode column family (rocksdb)"),
+
Option("bluestore_cache_autotune", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(true)
.add_see_also("bluestore_cache_size")
return -EOPNOTSUPP;
}
+ virtual int64_t get_cache_usage(std::string prefix) const {
+ return -EOPNOTSUPP;
+ }
+
virtual std::shared_ptr<PriorityCache::PriCache> get_priority_cache() const {
return nullptr;
}
+ virtual std::shared_ptr<PriorityCache::PriCache> get_priority_cache(std::string prefix) const {
+ return nullptr;
+ }
+
virtual ~KeyValueDB() {}
/// estimate space utilization for a prefix (in bytes)
return do_open(out, true, false, cfs);
}
+std::shared_ptr<rocksdb::Cache> RocksDBStore::create_block_cache(
+ const std::string& cache_type, size_t cache_size) {
+ std::shared_ptr<rocksdb::Cache> cache;
+ auto shard_bits = cct->_conf->rocksdb_cache_shard_bits;
+ if (cache_type == "binned_lru") {
+ cache = rocksdb_cache::NewBinnedLRUCache(cct, cache_size, shard_bits);
+ } else if (cache_type == "lru") {
+ cache = rocksdb::NewLRUCache(cache_size, shard_bits);
+ } else if (cache_type == "clock") {
+ cache = rocksdb::NewClockCache(cache_size, shard_bits);
+ if (!cache) {
+ derr << "rocksdb_cache_type '" << cache_type
+ << "' chosen, but RocksDB not compiled with LibTBB. "
+ << dendl;
+ }
+ } else {
+ derr << "unrecognized rocksdb_cache_type '" << cache_type
+ << "'" << dendl;
+ }
+ return cache;
+}
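+// Illustrative usage (values are examples only): the shared block cache for
+// the default column family can be created as
+//   bbt_opts.block_cache = create_block_cache("binned_lru", 512*1024*1024);
+// and a null result signals an unsupported or misconfigured cache type.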
+
int RocksDBStore::load_rocksdb_options(bool create_if_missing, rocksdb::Options& opt)
{
rocksdb::Status status;
}
uint64_t row_cache_size = cache_size * cct->_conf->rocksdb_cache_row_ratio;
uint64_t block_cache_size = cache_size - row_cache_size;
+ bbt_opts.block_cache = create_block_cache(cct->_conf->rocksdb_cache_type, block_cache_size);
+ if (!bbt_opts.block_cache) {
+ return -EINVAL;
+ }
- if (cct->_conf->rocksdb_cache_type == "binned_lru") {
- bbt_opts.block_cache = rocksdb_cache::NewBinnedLRUCache(
- cct,
- block_cache_size,
- cct->_conf->rocksdb_cache_shard_bits);
- } else if (cct->_conf->rocksdb_cache_type == "lru") {
- bbt_opts.block_cache = rocksdb::NewLRUCache(
- block_cache_size,
- cct->_conf->rocksdb_cache_shard_bits);
- } else if (cct->_conf->rocksdb_cache_type == "clock") {
- bbt_opts.block_cache = rocksdb::NewClockCache(
- block_cache_size,
- cct->_conf->rocksdb_cache_shard_bits);
- if (!bbt_opts.block_cache) {
- derr << "rocksdb_cache_type '" << cct->_conf->rocksdb_cache_type
- << "' chosen, but RocksDB not compiled with LibTBB. "
- << dendl;
- return -EINVAL;
- }
- } else {
- derr << "unrecognized rocksdb_cache_type '" << cct->_conf->rocksdb_cache_type
- << "'" << dendl;
- return -EINVAL;
- }
bbt_opts.block_size = cct->_conf->rocksdb_block_size;
if (row_cache_size > 0)
}
return 0;
}
+// linking to rocksdb function defined in options_helper.cc
+// it can parse nested params like "nested_opt={opt1=1;opt2=2}"
+extern rocksdb::Status StringToMap(const std::string& opts_str,
+ std::unordered_map<std::string, std::string>* opts_map);
int RocksDBStore::verify_sharding(const rocksdb::Options& opt,
std::vector<rocksdb::ColumnFamilyDescriptor>& existing_cfs,
for (auto& column : stored_sharding_def) {
rocksdb::ColumnFamilyOptions cf_opt(opt);
- status = rocksdb::GetColumnFamilyOptionsFromString(
- cf_opt, column.options, &cf_opt);
+ //sift column.options into 2 categories:
+ // - column family options
+ // - block cache options
+ std::unordered_map<std::string, std::string> options_map;
+ status = StringToMap(column.options, &options_map);
+ if (!status.ok()) {
+ dout(5) << __func__ << " error '" << status.getState()
+ << "' while parsing options '" << column.options << "'" << dendl;
+ return -EIO;
+ }
+ //extract "block_cache" options
+ std::unordered_map<std::string, std::string> cache_options_map;
+ for (auto it = options_map.begin(); it != options_map.end(); /*nop*/ ) {
+ if (it->first.find("block_cache.") == 0) {
+ cache_options_map.emplace(it->first.substr(strlen("block_cache.")), it->second);
+ it = options_map.erase(it);
+ } else {
+ ++it;
+ }
+ }
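+ // Illustrative example: an options string such as
+ // "write_buffer_size=33554432;block_cache.cache_size=128M;block_cache.cache_type=binned_lru"
+ // leaves write_buffer_size in options_map, while cache_size and cache_type
+ // (with the "block_cache." prefix stripped) end up in cache_options_map.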
+ status = rocksdb::GetColumnFamilyOptionsFromMap(
+ cf_opt, options_map, &cf_opt);
if (!status.ok()) {
derr << __func__ << " invalid db column family options for CF '"
<< column.name << "': " << column.options << dendl;
+ derr << __func__ << "error = '" << status.getState() << "'" << dendl;
return -EINVAL;
}
install_cf_mergeop(column.name, &cf_opt);
-
+ if (!cache_options_map.empty()) {
+ bool require_new_block_cache = false;
+ std::string cache_type = cct->_conf->rocksdb_cache_type;
+ if (auto it = cache_options_map.find("cache_type"); it != cache_options_map.end()) {
+ cache_type = it->second;
+ cache_options_map.erase(it);
+ require_new_block_cache = true;
+ }
+ size_t cache_size = cct->_conf->rocksdb_cache_size;
+ if (auto it = cache_options_map.find("cache_size"); it != cache_options_map.end()) {
+ std::string error;
+ cache_size = strict_iecstrtoll(it->second.c_str(), &error);
+ if (!error.empty()) {
+ derr << __func__ << " invalid size: '" << it->second << "'" << dendl;
+ return -EINVAL;
+ }
+ cache_options_map.erase(it);
+ require_new_block_cache = true;
+ }
+ std::shared_ptr<rocksdb::Cache> block_cache;
+ if (require_new_block_cache) {
+ block_cache = create_block_cache(cache_type, cache_size);
+ if (!block_cache) {
+ return -EINVAL;
+ }
+ } else {
+ block_cache = bbt_opts.block_cache;
+ }
+ rocksdb::BlockBasedTableOptions column_bbt_opts;
+ status = rocksdb::GetBlockBasedTableOptionsFromMap(bbt_opts, cache_options_map, &column_bbt_opts);
+ if (!status.ok()) {
+ derr << __func__ << " invalid cache options for CF '"
+ << column.name << "': " << column.options << dendl;
+ derr << __func__ << " error = '" << status.getState() << "'" << dendl;
+ return -EINVAL;
+ }
+ // make sure this column family actually uses the cache selected above
+ column_bbt_opts.block_cache = block_cache;
+
+ cf_bbt_opts[column.name] = column_bbt_opts;
+ cf_opt.table_factory.reset(rocksdb::NewBlockBasedTableFactory(cf_bbt_opts[column.name]));
+ }
if (column.shard_cnt == 1) {
emplace_cf(column, 0, column.name, cf_opt);
} else {
};
std::unordered_map<std::string, prefix_shards> cf_handles;
std::unordered_map<uint32_t, std::string> cf_ids_to_prefix;
-
+ std::unordered_map<std::string, rocksdb::BlockBasedTableOptions> cf_bbt_opts;
+
void add_column_family(const std::string& cf_name, uint32_t hash_l, uint32_t hash_h,
size_t shard_idx, rocksdb::ColumnFamilyHandle *handle);
bool is_column_family(const std::string& prefix);
std::vector<std::pair<size_t, RocksDBStore::ColumnFamily> >& existing_cfs_shard,
std::vector<rocksdb::ColumnFamilyDescriptor>& missing_cfs,
std::vector<std::pair<size_t, RocksDBStore::ColumnFamily> >& missing_cfs_shard);
+ std::shared_ptr<rocksdb::Cache> create_block_cache(const std::string& cache_type, size_t cache_size);
// manage async compactions
ceph::mutex compact_queue_lock =
return static_cast<int64_t>(bbt_opts.block_cache->GetUsage());
}
+ virtual int64_t get_cache_usage(std::string prefix) const override {
+ auto it = cf_bbt_opts.find(prefix);
+ if (it != cf_bbt_opts.end()) {
+ return static_cast<int64_t>(it->second.block_cache->GetUsage());
+ }
+ return -EINVAL;
+ }
+
int set_cache_size(uint64_t s) override {
cache_size = s;
set_cache_flag = true;
bool unittest_fail_after_successful_processing = false;
};
int reshard(const std::string& new_sharding, const resharding_ctrl* ctrl = nullptr);
+
+ int set_cache_capacity(int64_t capacity);
+ int64_t get_cache_capacity();
+
+ virtual std::shared_ptr<PriorityCache::PriCache>
+ get_priority_cache() const override {
+ return dynamic_pointer_cast<PriorityCache::PriCache>(
+ bbt_opts.block_cache);
+ }
+
+ virtual std::shared_ptr<PriorityCache::PriCache>
+ get_priority_cache(std::string prefix) const override {
+ auto it = cf_bbt_opts.find(prefix);
+ if (it != cf_bbt_opts.end()) {
+ return dynamic_pointer_cast<PriorityCache::PriCache>(
+ it->second.block_cache);
+ }
+ return nullptr;
+ }
+
};
#endif
length_ = new_length;
}
-BinnedLRUCacheShard::BinnedLRUCacheShard(size_t capacity, bool strict_capacity_limit,
+BinnedLRUCacheShard::BinnedLRUCacheShard(CephContext *c, size_t capacity, bool strict_capacity_limit,
double high_pri_pool_ratio)
- : capacity_(0),
+ : cct(c),
+ capacity_(0),
high_pri_pool_usage_(0),
strict_capacity_limit_(strict_capacity_limit),
high_pri_pool_ratio_(high_pri_pool_ratio),
size_t per_shard = (capacity + (num_shards_ - 1)) / num_shards_;
for (int i = 0; i < num_shards_; i++) {
new (&shards_[i])
- BinnedLRUCacheShard(per_shard, strict_capacity_limit, high_pri_pool_ratio);
+ BinnedLRUCacheShard(c, per_shard, strict_capacity_limit, high_pri_pool_ratio);
}
}
// A single shard of sharded cache.
class alignas(CACHE_LINE_SIZE) BinnedLRUCacheShard : public CacheShard {
public:
- BinnedLRUCacheShard(size_t capacity, bool strict_capacity_limit,
+ BinnedLRUCacheShard(CephContext *c, size_t capacity, bool strict_capacity_limit,
double high_pri_pool_ratio);
virtual ~BinnedLRUCacheShard();
size_t GetHighPriPoolUsage() const;
private:
+ CephContext *cct;
void LRU_Remove(BinnedLRUHandle* e);
void LRU_Insert(BinnedLRUHandle* e);
}
binned_kv_cache = store->db->get_priority_cache();
+ binned_kv_onode_cache = store->db->get_priority_cache(PREFIX_OBJ);
if (store->cache_autotune && binned_kv_cache != nullptr) {
pcm = std::make_shared<PriorityCache::Manager>(
store->cct, min, max, target, true, "bluestore-pricache");
pcm->insert("kv", binned_kv_cache, true);
pcm->insert("meta", meta_cache, true);
pcm->insert("data", data_cache, true);
+ if (binned_kv_onode_cache != nullptr) {
+ pcm->insert("kv_onode", binned_kv_onode_cache, true);
+ }
}
utime_t next_balance = ceph_clock_now();
if (binned_kv_cache != nullptr) {
binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
}
+ if (binned_kv_onode_cache != nullptr) {
+ binned_kv_onode_cache->set_cache_ratio(store->cache_kv_onode_ratio);
+ }
meta_cache->set_cache_ratio(store->cache_meta_ratio);
data_cache->set_cache_ratio(store->cache_data_ratio);
}
size_t onode_shards = store->onode_cache_shards.size();
size_t buffer_shards = store->buffer_cache_shards.size();
int64_t kv_used = store->db->get_cache_usage();
+ int64_t kv_onode_used = store->db->get_cache_usage(PREFIX_OBJ);
int64_t meta_used = meta_cache->_get_used_bytes();
int64_t data_used = data_cache->_get_used_bytes();
uint64_t cache_size = store->cache_size;
int64_t kv_alloc =
static_cast<int64_t>(store->cache_kv_ratio * cache_size);
+ int64_t kv_onode_alloc =
+ static_cast<int64_t>(store->cache_kv_onode_ratio * cache_size);
int64_t meta_alloc =
static_cast<int64_t>(store->cache_meta_ratio * cache_size);
int64_t data_alloc =
kv_alloc = binned_kv_cache->get_committed_size();
meta_alloc = meta_cache->get_committed_size();
data_alloc = data_cache->get_committed_size();
+ if (binned_kv_onode_cache != nullptr) {
+ kv_onode_alloc = binned_kv_onode_cache->get_committed_size();
+ }
}
if (interval_stats) {
dout(5) << __func__ << " cache_size: " << cache_size
<< " kv_alloc: " << kv_alloc
<< " kv_used: " << kv_used
+ << " kv_onode_alloc: " << kv_onode_alloc
+ << " kv_onode_used: " << kv_onode_used
<< " meta_alloc: " << meta_alloc
<< " meta_used: " << meta_used
<< " data_alloc: " << data_alloc
dout(20) << __func__ << " cache_size: " << cache_size
<< " kv_alloc: " << kv_alloc
<< " kv_used: " << kv_used
+ << " kv_onode_alloc: " << kv_onode_alloc
+ << " kv_onode_used: " << kv_onode_used
<< " meta_alloc: " << meta_alloc
<< " meta_used: " << meta_used
<< " data_alloc: " << data_alloc
}
}
- cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
+ cache_meta_ratio = cct->_conf.get_val<double>("bluestore_cache_meta_ratio");
if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
<< ") must be in range [0,1.0]" << dendl;
return -EINVAL;
}
- cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
+ cache_kv_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_ratio");
if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
<< ") must be in range [0,1.0]" << dendl;
return -EINVAL;
}
+ cache_kv_onode_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_onode_ratio");
+ if (cache_kv_onode_ratio < 0 || cache_kv_onode_ratio > 1.0) {
+ derr << __func__ << " bluestore_cache_kv_onode_ratio (" << cache_kv_onode_ratio
+ << ") must be in range [0,1.0]" << dendl;
+ return -EINVAL;
+ }
+
if (cache_meta_ratio + cache_kv_ratio > 1.0) {
derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
<< ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
return -EINVAL;
}
- cache_data_ratio =
- (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;
+ cache_data_ratio = (double)1.0 -
+ (double)cache_meta_ratio -
+ (double)cache_kv_ratio -
+ (double)cache_kv_onode_ratio;
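+ // Illustrative example using this patch's defaults: with meta = 0.45,
+ // kv = 0.45 and kv_onode = 0.04, the remaining 0.06 of the cache is
+ // left for object data.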
if (cache_data_ratio < 0) {
// deal with floating point imprecision
cache_data_ratio = 0;
uint64_t cache_size = 0; ///< total cache size
double cache_meta_ratio = 0; ///< cache ratio dedicated to metadata
double cache_kv_ratio = 0; ///< cache ratio dedicated to kv (e.g., rocksdb)
+ double cache_kv_onode_ratio = 0; ///< cache ratio dedicated to kv onodes (e.g., rocksdb onode CF)
double cache_data_ratio = 0; ///< cache ratio dedicated to object data
bool cache_autotune = false; ///< cache autotune setting
double cache_autotune_interval = 0; ///< time to wait between cache rebalancing
ceph::mutex lock = ceph::make_mutex("BlueStore::MempoolThread::lock");
bool stop = false;
std::shared_ptr<PriorityCache::PriCache> binned_kv_cache = nullptr;
+ std::shared_ptr<PriorityCache::PriCache> binned_kv_onode_cache = nullptr;
std::shared_ptr<PriorityCache::Manager> pcm = nullptr;
struct MempoolCache : public PriorityCache::PriCache {