From: Sage Weil
Date: Tue, 11 Oct 2016 18:25:01 +0000 (-0400)
Subject: os/bluestore: restructure cache trimming in terms of mempool
X-Git-Tag: v11.1.0~442^2~16
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=bcf20a1ca12ac0a7d4bd51e0beeda2877b4e0125;p=ceph.git

os/bluestore: restructure cache trimming in terms of mempool

Trim cache based on overall memory utilization by cache objects, as
tracked by the bluestore_meta_* mempools.  This lets you configure the
bluestore cache size in terms of bytes of memory.

Note that we do not account for other memory utilization by the OSD.

Signed-off-by: Sage Weil
---

diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 82157561687b..53af4a13d109 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -998,11 +998,12 @@ OPTION(bluestore_extent_map_shard_target_size, OPT_U32, 500)
 OPTION(bluestore_extent_map_shard_min_size, OPT_U32, 150)
 OPTION(bluestore_extent_map_shard_target_size_slop, OPT_DOUBLE, .2)
 OPTION(bluestore_extent_map_inline_shard_prealloc_size, OPT_U32, 256)
+OPTION(bluestore_cache_trim_interval, OPT_DOUBLE, .1)
 OPTION(bluestore_cache_type, OPT_STR, "2q") // lru, 2q
 OPTION(bluestore_2q_cache_kin_ratio, OPT_DOUBLE, .5) // kin page slot size / max page slot size
 OPTION(bluestore_2q_cache_kout_ratio, OPT_DOUBLE, .5) // number of kout page slot / total number of page slot
-OPTION(bluestore_onode_cache_size, OPT_U32, 4*1024)
-OPTION(bluestore_buffer_cache_size, OPT_U32, 512*1024*1024)
+OPTION(bluestore_cache_size, OPT_U64, 1024*1024*1024)
+OPTION(bluestore_cache_meta_ratio, OPT_DOUBLE, .1)
 OPTION(bluestore_kvbackend, OPT_STR, "rocksdb")
 OPTION(bluestore_allocator, OPT_STR, "bitmap") // stupid | bitmap
 OPTION(bluestore_freelist_type, OPT_STR, "bitmap") // extent | bitmap
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index df854399fe44..c96ed46db3ad 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -540,6 +540,69 @@ BlueStore::Cache *BlueStore::Cache::create(string type, PerfCounters *logger)
   return c;
 }
 
+void BlueStore::Cache::trim(
+  uint64_t target_bytes,
+  float target_meta_ratio,
+  float bytes_per_onode)
+{
+  std::lock_guard l(lock);
+  uint64_t current_meta = _get_num_onodes() * bytes_per_onode;
+  uint64_t current_buffer = _get_buffer_bytes();
+  uint64_t current = current_meta + current_buffer;
+
+  uint64_t target_meta = target_bytes * target_meta_ratio;
+  uint64_t target_buffer = target_bytes - target_meta;
+
+  if (current <= target_bytes) {
+    dout(10) << __func__
+             << " shard target " << pretty_si_t(target_bytes)
+             << " ratio " << target_meta_ratio << " ("
+             << pretty_si_t(target_meta) << " + "
+             << pretty_si_t(target_buffer) << "), "
+             << " current " << pretty_si_t(current) << " ("
+             << pretty_si_t(current_meta) << " + "
+             << pretty_si_t(current_buffer) << ")"
+             << dendl;
+    return;
+  }
+
+  uint64_t need_to_free = 0;
+  if (current > target_bytes) {
+    need_to_free = current - target_bytes;
+  }
+  uint64_t free_buffer = 0;
+  uint64_t free_meta = 0;
+  if (current_buffer > target_buffer) {
+    free_buffer = current_buffer - target_buffer;
+    if (free_buffer > need_to_free) {
+      free_buffer = need_to_free;
+    }
+  }
+  free_meta = need_to_free - free_buffer;
+
+  // start bounds at what we have now
+  uint64_t max_buffer = current_buffer - free_buffer;
+  uint64_t max_meta = current_meta - free_meta;
+  uint64_t max_onodes = max_meta / bytes_per_onode;
+
+  dout(10) << __func__
+           << " shard target " << pretty_si_t(target_bytes)
+           << " ratio " << target_meta_ratio << " ("
+           << pretty_si_t(target_meta) << " + "
+           << pretty_si_t(target_buffer) << "), "
+           << " current " << pretty_si_t(current) << " ("
+           << pretty_si_t(current_meta) << " + "
+           << pretty_si_t(current_buffer) << "),"
+           << " need_to_free " << pretty_si_t(need_to_free) << " ("
+           << pretty_si_t(free_meta) << " + "
+           << pretty_si_t(free_buffer) << ")"
+           << " -> max " << max_onodes << " onodes + "
+           << max_buffer << " buffer"
+           << dendl;
+  _trim(max_onodes, max_buffer);
+}
+
+
 // LRUCache
 #undef dout_prefix
 #define dout_prefix *_dout << "bluestore.LRUCache(" << this << ") "
@@ -551,10 +614,8 @@ void BlueStore::LRUCache::_touch_onode(OnodeRef& o)
   onode_lru.push_front(*o);
 }
 
-void BlueStore::LRUCache::trim(uint64_t onode_max, uint64_t buffer_max)
+void BlueStore::LRUCache::_trim(uint64_t onode_max, uint64_t buffer_max)
 {
-  std::lock_guard l(lock);
-
   dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
            << " buffers " << buffer_size << " / " << buffer_max
            << dendl;
@@ -730,10 +791,8 @@ void BlueStore::TwoQCache::_adjust_buffer_size(Buffer *b, int64_t delta)
   }
 }
 
-void BlueStore::TwoQCache::trim(uint64_t onode_max, uint64_t buffer_max)
+void BlueStore::TwoQCache::_trim(uint64_t onode_max, uint64_t buffer_max)
 {
-  std::lock_guard l(lock);
-
   dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
            << " buffers " << buffer_bytes << " / " << buffer_max
            << dendl;
@@ -2457,15 +2516,54 @@ BlueStore::OnodeRef BlueStore::Collection::get_onode(
 
 void BlueStore::Collection::trim_cache()
 {
-  cache->trim(
-    g_conf->bluestore_onode_cache_size / store->cache_shards.size(),
-    g_conf->bluestore_buffer_cache_size / store->cache_shards.size());
+  // see if mempool stats have updated
+  uint64_t total_bytes;
+  uint64_t total_onodes;
+  size_t seq;
+  store->get_mempool_stats(&seq, &total_bytes, &total_onodes);
+  if (seq == cache->last_trim_seq) {
+    dout(30) << __func__ << " no new mempool stats; nothing to do" << dendl;
+    return;
+  }
+  cache->last_trim_seq = seq;
+
+  // trim
+  if (total_onodes < 2) {
+    total_onodes = 2;
+  }
+  float bytes_per_onode = (float)total_bytes / (float)total_onodes;
+  size_t num_shards = store->cache_shards.size();
+  uint64_t shard_target = g_conf->bluestore_cache_size / num_shards;
+  dout(30) << __func__
+           << " total meta bytes " << total_bytes
+           << ", total onodes " << total_onodes
+           << ", bytes_per_onode " << bytes_per_onode
+           << dendl;
+  cache->trim(shard_target, g_conf->bluestore_cache_meta_ratio, bytes_per_onode);
 
   store->_update_cache_logger();
 }
 
 // =======================================================
 
+void *BlueStore::MempoolThread::entry()
+{
+  Mutex::Locker l(lock);
+  while (!stop) {
+    store->mempool_bytes = bluestore_meta_other::allocated_bytes() +
+      bluestore_meta_onode::allocated_bytes();
+    store->mempool_onodes = bluestore_meta_onode::allocated_items();
+    ++store->mempool_seq;
+    utime_t wait;
+    wait += g_conf->bluestore_cache_trim_interval;
+    cond.WaitInterval(g_ceph_context, lock, wait);
+  }
+  stop = false;
+  return NULL;
+}
+
+// =======================================================
+
 #undef dout_prefix
 #define dout_prefix *_dout << "bluestore(" << path << ") "
 
@@ -2512,7 +2610,8 @@ BlueStore::BlueStore(CephContext *cct, const string& path)
     logger(NULL),
     debug_read_error_lock("BlueStore::debug_read_error_lock"),
     csum_type(Checksummer::CSUM_CRC32C),
-    sync_wal_apply(cct->_conf->bluestore_sync_wal_apply)
+    sync_wal_apply(cct->_conf->bluestore_sync_wal_apply),
+    mempool_thread(this)
 {
   _init_logger();
   g_ceph_context->_conf->add_observer(this);
@@ -4052,6 +4151,8 @@ int BlueStore::mount()
   if (r < 0)
     goto out_stop;
 
+  mempool_thread.init();
+
   _set_csum();
   _set_compression();
 
@@ -4059,6 +4160,7 @@
   return 0;
 
  out_stop:
+  mempool_thread.shutdown();
   _kv_stop();
   wal_wq.drain();
   wal_tp.stop();
@@ -4092,6 +4194,8 @@ int BlueStore::umount()
   _reap_collections();
   coll_map.clear();
 
+  mempool_thread.shutdown();
+
   dout(20) << __func__ << " stopping kv thread" << dendl;
   _kv_stop();
   dout(20) << __func__ << " draining wal_wq" << dendl;
diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h
index 73d57e30e60d..848ab7efbf4b 100644
--- a/src/os/bluestore/BlueStore.h
+++ b/src/os/bluestore/BlueStore.h
@@ -791,6 +791,8 @@ public:
   std::atomic<uint64_t> num_extents = {0};
   std::atomic<uint64_t> num_blobs = {0};
 
+  size_t last_trim_seq = 0;
+
   static Cache *create(string type, PerfCounters *logger);
   virtual ~Cache() {}
@@ -804,6 +806,9 @@ public:
   virtual void _adjust_buffer_size(Buffer *b, int64_t delta) = 0;
   virtual void _touch_buffer(Buffer *b) = 0;
 
+  virtual uint64_t _get_num_onodes() = 0;
+  virtual uint64_t _get_buffer_bytes() = 0;
+
   void add_extent() {
     ++num_extents;
   }
@@ -818,7 +823,10 @@
     --num_blobs;
   }
 
-  virtual void trim(uint64_t onode_max, uint64_t buffer_max) = 0;
+  void trim(uint64_t target_bytes, float target_meta_ratio,
+            float bytes_per_onode);
+
+  virtual void _trim(uint64_t onode_max, uint64_t buffer_max) = 0;
 
   virtual void add_stats(uint64_t *onodes, uint64_t *extents,
                          uint64_t *blobs,
@@ -854,6 +862,9 @@ public:
   uint64_t buffer_size = 0;
 
 public:
+  uint64_t _get_num_onodes() override {
+    return onode_lru.size();
+  }
   void _add_onode(OnodeRef& o, int level) override {
     if (level > 0)
       onode_lru.push_front(*o);
@@ -866,6 +877,9 @@
   }
   void _touch_onode(OnodeRef& o) override;
 
+  uint64_t _get_buffer_bytes() override {
+    return buffer_size;
+  }
   void _add_buffer(Buffer *b, int level, Buffer *near) override {
     if (near) {
       auto q = buffer_lru.iterator_to(*near);
@@ -894,7 +908,7 @@
     _audit("_touch_buffer end");
   }
 
-  void trim(uint64_t onode_max, uint64_t buffer_max) override;
+  void _trim(uint64_t onode_max, uint64_t buffer_max) override;
 
   void add_stats(uint64_t *onodes, uint64_t *extents,
                  uint64_t *blobs,
@@ -948,6 +962,9 @@ public:
   uint64_t buffer_list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type
 
 public:
+  uint64_t _get_num_onodes() override {
+    return onode_lru.size();
+  }
   void _add_onode(OnodeRef& o, int level) override {
     if (level > 0)
       onode_lru.push_front(*o);
@@ -960,6 +977,9 @@
   }
   void _touch_onode(OnodeRef& o) override;
 
+  uint64_t _get_buffer_bytes() override {
+    return buffer_bytes;
+  }
   void _add_buffer(Buffer *b, int level, Buffer *near) override;
   void _rm_buffer(Buffer *b) override;
   void _adjust_buffer_size(Buffer *b, int64_t delta) override;
@@ -981,7 +1001,7 @@
     _audit("_touch_buffer end");
   }
 
-  void trim(uint64_t onode_max, uint64_t buffer_max) override;
+  void _trim(uint64_t onode_max, uint64_t buffer_max) override;
 
   void add_stats(uint64_t *onodes, uint64_t *extents,
                  uint64_t *blobs,
@@ -1501,6 +1521,44 @@ private:
   std::atomic<uint64_t> comp_min_blob_size = {0};
   std::atomic<uint64_t> comp_max_blob_size = {0};
 
+  // cache trim control
+
+  // note that these update in a racy way, but we don't *really* care if
+  // they're perfectly accurate. they are all word sized so they will
+  // individually update atomically, but may not be coherent with each other.
+  size_t mempool_seq = 0;
+  size_t mempool_bytes = 0;
+  size_t mempool_onodes = 0;
+
+  void get_mempool_stats(size_t *seq, uint64_t *bytes, uint64_t *onodes) {
+    *seq = mempool_seq;
+    *bytes = mempool_bytes;
+    *onodes = mempool_onodes;
+  }
+
+  struct MempoolThread : public Thread {
+    BlueStore *store;
+    Cond cond;
+    Mutex lock;
+    bool stop = false;
+  public:
+    explicit MempoolThread(BlueStore *s)
+      : store(s),
+        lock("BlueStore::MempoolThread::lock") {}
+    void *entry();
+    void init() {
+      assert(stop == false);
+      create("bstore_mempool");
+    }
+    void shutdown() {
+      lock.Lock();
+      stop = true;
+      cond.Signal();
+      lock.Unlock();
+      join();
+    }
+  } mempool_thread;
+
   // --------------------------------------------------------
   // private methods
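
For reference, the budget split that the new Cache::trim() performs, fed by the
per-shard target computed in Collection::trim_cache(), can be sketched in
isolation as below.  This is an illustrative standalone program, not BlueStore
code: the 1 GiB cache size and 0.1 meta ratio are the defaults added to
config_opts.h above, while the shard count, onode count, buffer bytes, and
bytes-per-onode figure are made-up inputs standing in for the values the OSD
derives at runtime from the bluestore_meta_* mempools.

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  // Defaults from the options added above.
  const uint64_t cache_size = 1024ull * 1024 * 1024;  // bluestore_cache_size
  const double meta_ratio = 0.1;                      // bluestore_cache_meta_ratio

  // Hypothetical runtime inputs (not from the patch).
  const uint64_t num_shards = 5;                        // number of cache shards
  const double bytes_per_onode = 3072;                  // mempool bytes / mempool onodes
  const uint64_t current_onodes = 60000;                // onodes held by this shard
  const uint64_t current_buffer = 250ull * 1024 * 1024; // buffer bytes held by this shard

  // Per-shard target, as in Collection::trim_cache().
  uint64_t target_bytes = cache_size / num_shards;

  // Split the target into a metadata budget and a buffer budget, as in Cache::trim().
  uint64_t target_meta = uint64_t(target_bytes * meta_ratio);
  uint64_t target_buffer = target_bytes - target_meta;
  uint64_t current_meta = uint64_t(current_onodes * bytes_per_onode);
  uint64_t current = current_meta + current_buffer;

  // Free buffer space first (down to its own budget), then take the rest
  // of the deficit out of onode metadata.
  uint64_t need_to_free = current > target_bytes ? current - target_bytes : 0;
  uint64_t free_buffer = 0;
  if (current_buffer > target_buffer) {
    free_buffer = std::min(current_buffer - target_buffer, need_to_free);
  }
  uint64_t free_meta = need_to_free - free_buffer;

  uint64_t max_buffer = current_buffer - free_buffer;
  uint64_t max_onodes = uint64_t((current_meta - free_meta) / bytes_per_onode);

  std::cout << "shard target " << target_bytes
            << " = meta " << target_meta << " + buffer " << target_buffer << "\n"
            << "current " << current << ", need_to_free " << need_to_free << "\n"
            << "would trim to " << max_onodes << " onodes + "
            << max_buffer << " buffer bytes\n";
  return 0;
}

With these made-up inputs the buffer cache only gives back its overshoot past
its own budget and the rest of the deficit is taken from onode metadata; the
_trim(max_onodes, max_buffer) call at the end of Cache::trim() is what then
enforces those bounds in each cache implementation.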