.set_default(true)
.set_description(""),
+ Option("osd_memory_target", Option::TYPE_UINT, Option::LEVEL_BASIC)
+ .set_default(4_G)
+ .add_see_also("bluestore_cache_autotune")
+ .set_description("When tcmalloc and cache autotuning is enabled, try to keep this many bytes mapped in memory."),
+
+ Option("osd_memory_base", Option::TYPE_UINT, Option::LEVEL_DEV)
+ .set_default(768_M)
+ .add_see_also("bluestore_cache_autotune")
+ .set_description("When tcmalloc and cache autotuning is enabled, estimate the minimum amount of memory in bytes the OSD will need."),
+
+ Option("osd_memory_expected_fragmentation", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(0.15)
+ .add_see_also("bluestore_cache_autotune")
+ .set_description("When tcmalloc and cache autotuning is enabled, estimate the percent of memory fragmentation."),
+
+ Option("osd_memory_cache_min", Option::TYPE_UINT, Option::LEVEL_DEV)
+ .set_default(128_M)
+ .add_see_also("bluestore_cache_autotune")
+ .set_description("When tcmalloc and cache autotuning is enabled, set the minimum amount of memory used for caches."),
+
+ Option("osd_memory_cache_resize_interval", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(1)
+ .add_see_also("bluestore_cache_autotune")
+ .set_description("When tcmalloc and cache autotuning is enabled, wait this many seconds between resizing caches."),
+
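+ // Taken together (a rough sketch of the math in MempoolThread::_tune_cache_size):
+ // osd_memory_target bounds how much memory the OSD tries to keep mapped;
+ // osd_memory_base and osd_memory_expected_fragmentation are subtracted from
+ // it to derive the maximum cache budget, and osd_memory_cache_min floors
+ // that budget.
+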
Option("memstore_device_bytes", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
.set_default(1_G)
.set_description(""),
.set_default(.5)
.set_description("2Q paper suggests .5"),
- Option("bluestore_cache_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+ Option("bluestore_cache_size", Option::TYPE_SIZE, Option::LEVEL_DEV)
.set_default(0)
.set_description("Cache size (in bytes) for BlueStore")
.set_long_description("This includes data and metadata cached by BlueStore as well as memory devoted to rocksdb's cache(s)."),
- Option("bluestore_cache_size_hdd", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+ Option("bluestore_cache_size_hdd", Option::TYPE_SIZE, Option::LEVEL_DEV)
.set_default(1_G)
.set_description("Default bluestore_cache_size for rotational media"),
- Option("bluestore_cache_size_ssd", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+ Option("bluestore_cache_size_ssd", Option::TYPE_SIZE, Option::LEVEL_DEV)
.set_default(3_G)
.set_description("Default bluestore_cache_size for non-rotational (solid state) media"),
- Option("bluestore_cache_meta_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
- .set_default(.5)
+ Option("bluestore_cache_meta_ratio", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(.4)
.add_see_also("bluestore_cache_size")
.set_description("Ratio of bluestore cache to devote to metadata"),
- Option("bluestore_cache_kv_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
- .set_default(.5)
+ Option("bluestore_cache_kv_ratio", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(.4)
.add_see_also("bluestore_cache_size")
.set_description("Ratio of bluestore cache to devote to kv database (rocksdb)"),
- Option("bluestore_cache_autotune", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+ Option("bluestore_cache_autotune", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(true)
.add_see_also("bluestore_cache_size")
.add_see_also("bluestore_cache_meta_ratio")
.set_description("Automatically tune the ratio of caches while respecting min values."),
- Option("bluestore_cache_autotune_chunk_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+ Option("bluestore_cache_autotune_chunk_size", Option::TYPE_SIZE, Option::LEVEL_DEV)
.set_default(33554432)
.add_see_also("bluestore_cache_autotune")
.set_description("The chunk size in bytes to allocate to caches when cache autotune is enabled."),
- Option("bluestore_cache_autotune_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ Option("bluestore_cache_autotune_interval", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(5)
.add_see_also("bluestore_cache_autotune")
.set_description("The number of seconds to wait between rebalances when cache autotune is enabled."),
#include "BlueRocksEnv.h"
#include "auth/Crypto.h"
#include "common/EventTrace.h"
+#include "perfglue/heap_profiler.h"
#define dout_context cct
#define dout_subsys ceph_subsys_bluestore
caches.push_back(store->db);
caches.push_back(&meta_cache);
caches.push_back(&data_cache);
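+ // Start the autotuned budget at the configured floor; _tune_cache_size()
+ // grows it toward osd_memory_target from there.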
+ autotune_cache_size = store->osd_memory_cache_min;
utime_t next_balance = ceph_clock_now();
+ utime_t next_resize = ceph_clock_now();
+
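+ // When a rebalance fires, these flags ask the next trim and resize passes
+ // to log their stats at level 5 instead of 20.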
+ bool interval_stats_trim = false;
+ bool interval_stats_resize = false;
while (!stop) {
_adjust_cache_settings();
- // Before we trim, check and see if it's time to rebalance
- bool log_stats = false;
+ // Before we trim, check and see if it's time to rebalance/resize.
double autotune_interval = store->cache_autotune_interval;
+ double resize_interval = store->osd_memory_cache_resize_interval;
+
if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
+ // Log stats at level 5 instead of 20 when a rebalance happens.
+ interval_stats_resize = true;
+ interval_stats_trim = true;
if (store->cache_autotune) {
_balance_cache(caches);
}
+
next_balance = ceph_clock_now();
next_balance += autotune_interval;
- log_stats = true;
+ }
+ if (resize_interval > 0 && next_resize < ceph_clock_now()) {
+ if (ceph_using_tcmalloc() && store->cache_autotune) {
+ _tune_cache_size(interval_stats_resize);
+ interval_stats_resize = false;
+ }
+ next_resize = ceph_clock_now();
+ next_resize += resize_interval;
}
- _trim_shards(log_stats);
- store->_update_cache_logger();
+ // Now trim the cache shards.
+ _trim_shards(interval_stats_trim);
+ interval_stats_trim = false;
+ store->_update_cache_logger();
utime_t wait;
wait += store->cct->_conf->bluestore_cache_trim_interval;
cond.WaitInterval(lock, wait);
data_cache.set_cache_ratio(store->cache_data_ratio);
}
-void BlueStore::MempoolThread::_trim_shards(bool log_stats)
+void BlueStore::MempoolThread::_trim_shards(bool interval_stats)
{
- uint64_t cache_size = store->cache_size;
+ auto cct = store->cct;
size_t num_shards = store->cache_shards.size();
- int64_t kv_alloc_bytes = 0;
- int64_t meta_alloc_bytes = 0;
- int64_t data_alloc_bytes = 0;
+
+ int64_t kv_used = store->db->get_cache_usage();
+ int64_t meta_used = meta_cache._get_used_bytes();
+ int64_t data_used = data_cache._get_used_bytes();
+
+ uint64_t cache_size = store->cache_size;
+ int64_t kv_alloc =
+ static_cast<int64_t>(store->db->get_cache_ratio() * cache_size);
+ int64_t meta_alloc =
+ static_cast<int64_t>(meta_cache.get_cache_ratio() * cache_size);
+ int64_t data_alloc =
+ static_cast<int64_t>(data_cache.get_cache_ratio() * cache_size);
if (store->cache_autotune) {
- kv_alloc_bytes = store->db->get_cache_bytes();
- meta_alloc_bytes = meta_cache.get_cache_bytes();
- data_alloc_bytes = data_cache.get_cache_bytes();
+ cache_size = autotune_cache_size;
+
+ kv_alloc = store->db->get_cache_bytes();
+ meta_alloc = meta_cache.get_cache_bytes();
+ data_alloc = data_cache.get_cache_bytes();
+ }
+
+ if (interval_stats) {
+ ldout(cct, 5) << __func__ << " cache_size: " << cache_size
+ << " kv_alloc: " << kv_alloc
+ << " kv_used: " << kv_used
+ << " meta_alloc: " << meta_alloc
+ << " meta_used: " << meta_used
+ << " data_alloc: " << data_alloc
+ << " data_used: " << data_used << dendl;
} else {
- kv_alloc_bytes = static_cast<int64_t>(
- store->db->get_cache_ratio() * cache_size);
- meta_alloc_bytes = static_cast<int64_t>(
- meta_cache.get_cache_ratio() * cache_size);
- data_alloc_bytes = static_cast<int64_t>(
- data_cache.get_cache_ratio() * cache_size);
- }
- if (log_stats) {
- double kv_alloc_ratio = (double) kv_alloc_bytes / cache_size;
- double meta_alloc_ratio = (double) meta_alloc_bytes / cache_size;
- double data_alloc_ratio = (double) data_alloc_bytes / cache_size;
- double kv_used_ratio = (double) store->db->get_cache_usage() / cache_size;
- double meta_used_ratio = (double) meta_cache._get_used_bytes() / cache_size;
- double data_used_ratio = (double) data_cache._get_used_bytes() / cache_size;
-
- ldout(store->cct, 5) << __func__ << " ratios -" << std::fixed << std::setprecision(1)
- << " kv_alloc: " << 100*kv_alloc_ratio << "%"
- << " kv_used: " << 100*kv_used_ratio << "%"
- << " meta_alloc: " << 100*meta_alloc_ratio << "%"
- << " meta_used: " << 100*meta_used_ratio << "%"
- << " data_alloc: " << 100*data_alloc_ratio << "%"
- << " data_used: " << 100*data_used_ratio << "%" << dendl;
+ ldout(cct, 20) << __func__ << " cache_size: " << cache_size
+ << " kv_alloc: " << kv_alloc
+ << " kv_used: " << kv_used
+ << " meta_alloc: " << meta_alloc
+ << " meta_used: " << meta_used
+ << " data_alloc: " << data_alloc
+ << " data_used: " << data_used << dendl;
}
uint64_t max_shard_onodes = static_cast<uint64_t>(
- (meta_alloc_bytes / (double) num_shards) / meta_cache.get_bytes_per_onode());
- uint64_t max_shard_buffer = static_cast<uint64_t>(
- data_alloc_bytes / num_shards);
- ldout(store->cct, 30) << __func__ << " max_shard_onodes: " << max_shard_onodes
- << " max_shard_buffer: " << max_shard_buffer << dendl;
+ (meta_alloc / (double) num_shards) / meta_cache.get_bytes_per_onode());
+ uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / num_shards);
+
+ ldout(cct, 30) << __func__ << " max_shard_onodes: " << max_shard_onodes
+ << " max_shard_buffer: " << max_shard_buffer << dendl;
for (auto i : store->cache_shards) {
i->trim(max_shard_onodes, max_shard_buffer);
}
}
+void BlueStore::MempoolThread::_tune_cache_size(bool interval_stats)
+{
+ auto cct = store->cct;
+ uint64_t target = store->osd_memory_target;
+ uint64_t base = store->osd_memory_base;
+ double fragmentation = store->osd_memory_expected_fragmentation;
+ uint64_t cache_max = ((1.0 - fragmentation) * target) - base;
+ uint64_t cache_min = store->osd_memory_cache_min;
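+
+ // With the defaults (4 GiB target, 768 MiB base, 0.15 fragmentation),
+ // cache_max is roughly 0.85 * 4 GiB - 768 MiB ~= 2.65 GiB. This assumes
+ // (1.0 - fragmentation) * target exceeds base; otherwise the conversion
+ // of a negative result to uint64_t above would underflow.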
+
+ size_t heap_size = 0;
+ size_t unmapped = 0;
+ uint64_t mapped = 0;
+
+ ceph_heap_release_free_memory();
+ ceph_heap_get_numeric_property("generic.heap_size", &heap_size);
+ ceph_heap_get_numeric_property("tcmalloc.pageheap_unmapped_bytes", &unmapped);
+ mapped = heap_size - unmapped;
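+
+ // ceph_heap_release_free_memory() above returns tcmalloc's free pages to
+ // the OS first, so "mapped" reflects only memory the process still holds.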
+
+ uint64_t new_size = autotune_cache_size;
+ new_size = (new_size < cache_max) ? new_size : cache_max;
+ new_size = (new_size > cache_min) ? new_size : cache_min;
+
+ // Approach the min/max slowly, but bounce away quickly.
+ if (mapped < target) {
+ double ratio = 1 - ((double) mapped / target);
+ new_size += ratio * (cache_max - new_size);
+ } else {
+ double ratio = 1 - ((double) target / mapped);
+ new_size -= ratio * (new_size - cache_min);
+ }
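+
+ // E.g., if mapped is half the target, ratio is 0.5 and the budget jumps
+ // halfway to cache_max in one pass; the steps shrink as mapped approaches
+ // the target.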
+
+ if (interval_stats) {
+ ldout(cct, 5) << __func__
+ << " target: " << target
+ << " heap: " << heap_size
+ << " unmapped: " << unmapped
+ << " mapped: " << mapped
+ << " old cache_size: " << autotune_cache_size
+ << " new cache size: " << new_size << dendl;
+ } else {
+ ldout(cct, 20) << __func__
+ << " target: " << target
+ << " heap: " << heap_size
+ << " unmapped: " << unmapped
+ << " mapped: " << mapped
+ << " old cache_size: " << autotune_cache_size
+ << " new cache size: " << new_size << dendl;
+ }
+ autotune_cache_size = new_size;
+}
+
void BlueStore::MempoolThread::_balance_cache(
const std::list<PriorityCache::PriCache *>& caches)
{
- int64_t mem_avail = store->cache_size;
+ int64_t mem_avail = autotune_cache_size;
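+ // mem_avail is the autotuned budget computed by _tune_cache_size().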
// Assign memory for each priority level
for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
cct->_conf.get_val<Option::size_t>("bluestore_cache_autotune_chunk_size");
cache_autotune_interval =
cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
+ osd_memory_target = cct->_conf.get_val<uint64_t>("osd_memory_target");
+ osd_memory_base = cct->_conf.get_val<uint64_t>("osd_memory_base");
+ osd_memory_expected_fragmentation =
+ cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
+ osd_memory_cache_min = cct->_conf.get_val<uint64_t>("osd_memory_cache_min");
+ osd_memory_cache_resize_interval =
+ cct->_conf.get_val<double>("osd_memory_cache_resize_interval");
if (cct->_conf->bluestore_cache_size) {
cache_size = cct->_conf->bluestore_cache_size;