From a42327067462ec4ab6152f1489768f181353b9e5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 8 Jun 2017 12:40:11 -0400 Subject: [PATCH] os/bluestore: set bluestore_cache_kv_ratio to configure rocksdb cache size Devote 20% to kv (rocksdb), 70% to metadata (onodes etc), 10% to data. Note that if we don't consume the data portion (e.g., no cache hints), the onode metadata will "borrow" that space. Signed-off-by: Sage Weil --- src/common/config_opts.h | 3 +- src/os/bluestore/BlueStore.cc | 64 +++++++++++++++++++++++++++-------- src/os/bluestore/BlueStore.h | 9 ++++- 3 files changed, 60 insertions(+), 16 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index ec3fa66ece380..705e7b437a193 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -1137,7 +1137,8 @@ OPTION(bluestore_cache_type, OPT_STR, "2q") // lru, 2q OPTION(bluestore_2q_cache_kin_ratio, OPT_DOUBLE, .5) // kin page slot size / max page slot size OPTION(bluestore_2q_cache_kout_ratio, OPT_DOUBLE, .5) // number of kout page slot / total number of page slot OPTION(bluestore_cache_size, OPT_U64, 1024*1024*1024) -OPTION(bluestore_cache_meta_ratio, OPT_DOUBLE, .9) +OPTION(bluestore_cache_meta_ratio, OPT_DOUBLE, .7) +OPTION(bluestore_cache_kv_ratio, OPT_DOUBLE, .2) OPTION(bluestore_kvbackend, OPT_STR, "rocksdb") OPTION(bluestore_allocator, OPT_STR, "bitmap") // stupid | bitmap OPTION(bluestore_freelist_blocks_per_key, OPT_INT, 128) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index a974b5ac836fc..e899868339208 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -752,6 +752,7 @@ void BlueStore::Cache::trim_all() void BlueStore::Cache::trim( uint64_t target_bytes, float target_meta_ratio, + float target_data_ratio, float bytes_per_onode) { std::lock_guard l(lock); @@ -759,23 +760,18 @@ void BlueStore::Cache::trim( uint64_t current_buffer = _get_buffer_bytes(); uint64_t current = current_meta + 
current_buffer; - uint64_t target_meta = target_bytes * (double)target_meta_ratio; //need to cast to double - //since float(1) might produce inaccurate value - // for target_meta (a bit greater than target_bytes) - // that causes overflow in target_buffer below. - //Consider the following code: - //uint64_t i =(uint64_t)227*1024*1024*1024 + 1; - //float f = 1; - //uint64_t i2 = i*f; - //assert(i == i2); + uint64_t target_meta = target_bytes * target_meta_ratio; + uint64_t target_buffer = target_bytes * target_data_ratio; - target_meta = min(target_bytes, target_meta); //and just in case that ratio is > 1 - uint64_t target_buffer = target_bytes - target_meta; + // correct for overflow or float imprecision + target_meta = min(target_bytes, target_meta); + target_buffer = min(target_bytes - target_meta, target_buffer); if (current <= target_bytes) { dout(10) << __func__ << " shard target " << pretty_si_t(target_bytes) - << " ratio " << target_meta_ratio << " (" + << " meta/data ratios " << target_meta_ratio + << " + " << target_data_ratio << " (" << pretty_si_t(target_meta) << " + " << pretty_si_t(target_buffer) << "), " << " current " << pretty_si_t(current) << " (" @@ -3263,8 +3259,10 @@ void *BlueStore::MempoolThread::entry() uint64_t shard_target = store->cct->_conf->bluestore_cache_size / num_shards; for (auto i : store->cache_shards) { - i->trim(shard_target, store->cct->_conf->bluestore_cache_meta_ratio, - bytes_per_onode); + i->trim(shard_target, + store->cache_meta_ratio, + store->cache_data_ratio, + bytes_per_onode); } store->_update_cache_logger(); @@ -3543,6 +3541,36 @@ void BlueStore::_set_blob_size() << std::dec << dendl; } +int BlueStore::_set_cache_sizes() +{ + cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio; + cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio; + cache_data_ratio = 1.0 - cache_meta_ratio - cache_kv_ratio; + + if (cache_meta_ratio <= 0 || cache_meta_ratio > 1.0) { + derr << __func__ << "bluestore_cache_meta_ratio (" << 
cache_meta_ratio + << ") must be in range (0,1.0]" << dendl; + return -EINVAL; + } + if (cache_kv_ratio <= 0 || cache_kv_ratio > 1.0) { + derr << __func__ << "bluestore_cache_kv_ratio (" << cache_kv_ratio + << ") must be in range (0,1.0]" << dendl; + return -EINVAL; + } + if (cache_meta_ratio + cache_kv_ratio > 1.0) { + derr << __func__ << "bluestore_cache_meta_ratio (" << cache_meta_ratio + << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio + << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0" + << dendl; + return -EINVAL; + } + dout(1) << __func__ << " meta " << cache_meta_ratio + << " kv " << cache_kv_ratio + << " data " << cache_data_ratio + << dendl; + return 0; +} + void BlueStore::_init_logger() { PerfCountersBuilder b(cct, "bluestore", @@ -3718,6 +3746,12 @@ int BlueStore::get_block_device_fsid(CephContext* cct, const string& path, int BlueStore::_open_path() { + // initial sanity check + int r = _set_cache_sizes(); + if (r < 0) { + return r; + } + assert(path_fd < 0); path_fd = ::open(path.c_str(), O_DIRECTORY); if (path_fd < 0) { @@ -4431,6 +4465,8 @@ int BlueStore::_open_db(bool create) FreelistManager::setup_merge_operators(db); db->set_merge_operator(PREFIX_STAT, merge_op); + db->set_cache_size(cct->_conf->bluestore_cache_size * cache_kv_ratio); + if (kv_backend == "rocksdb") options = cct->_conf->bluestore_rocksdb_options; db->init(options); diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 83fe45d275217..de347ec985591 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -132,6 +132,7 @@ public: void _set_csum(); void _set_compression(); void _set_throttle_params(); + int _set_cache_sizes(); class TransContext; @@ -1067,7 +1068,9 @@ public: --num_blobs; } - void trim(uint64_t target_bytes, float target_meta_ratio, + void trim(uint64_t target_bytes, + float target_meta_ratio, + float target_data_ratio, float bytes_per_onode); void trim_all(); @@ -1900,6 +1903,10 @@ private: 
uint64_t kv_ios = 0; uint64_t kv_throttle_costs = 0; + float cache_meta_ratio = 0; ///< cache ratio dedicated to metadata + float cache_kv_ratio = 0; ///< cache ratio dedicated to kv (e.g., rocksdb) + float cache_data_ratio = 0; ///< cache ratio dedicated to object data + // cache trim control std::mutex vstatfs_lock; -- 2.39.5