OPTION(bluestore_extent_map_shard_min_size, OPT_U32, 150)
OPTION(bluestore_extent_map_shard_target_size_slop, OPT_DOUBLE, .2)
OPTION(bluestore_extent_map_inline_shard_prealloc_size, OPT_U32, 256)
+OPTION(bluestore_cache_trim_interval, OPT_DOUBLE, .1)
OPTION(bluestore_cache_type, OPT_STR, "2q") // lru, 2q
OPTION(bluestore_2q_cache_kin_ratio, OPT_DOUBLE, .5) // kin page slot size / max page slot size
OPTION(bluestore_2q_cache_kout_ratio, OPT_DOUBLE, .5) // number of kout page slot / total number of page slot
-OPTION(bluestore_onode_cache_size, OPT_U32, 4*1024)
-OPTION(bluestore_buffer_cache_size, OPT_U32, 512*1024*1024)
+OPTION(bluestore_cache_size, OPT_U64, 1024*1024*1024)
+OPTION(bluestore_cache_meta_ratio, OPT_DOUBLE, .1)
OPTION(bluestore_kvbackend, OPT_STR, "rocksdb")
OPTION(bluestore_allocator, OPT_STR, "bitmap") // stupid | bitmap
OPTION(bluestore_freelist_type, OPT_STR, "bitmap") // extent | bitmap
return c;
}
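+// Shared trim entry point: split the per-shard byte budget between onode
+// metadata and data buffers, then ask the concrete cache (LRU or 2Q) to
+// evict down to those limits via _trim().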
+void BlueStore::Cache::trim(
+ uint64_t target_bytes,
+ float target_meta_ratio,
+ float bytes_per_onode)
+{
+ std::lock_guard<std::recursive_mutex> l(lock);
+ uint64_t current_meta = _get_num_onodes() * bytes_per_onode;
+ uint64_t current_buffer = _get_buffer_bytes();
+ uint64_t current = current_meta + current_buffer;
+
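+  // split the shard's byte budget between onode metadata and data buffers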
+ uint64_t target_meta = target_bytes * target_meta_ratio;
+ uint64_t target_buffer = target_bytes - target_meta;
+
+ if (current <= target_bytes) {
+ dout(10) << __func__
+ << " shard target " << pretty_si_t(target_bytes)
+ << " ratio " << target_meta_ratio << " ("
+ << pretty_si_t(target_meta) << " + "
+ << pretty_si_t(target_buffer) << "), "
+ << " current " << pretty_si_t(current) << " ("
+ << pretty_si_t(current_meta) << " + "
+ << pretty_si_t(current_buffer) << ")"
+ << dendl;
+ return;
+ }
+
+  // we only get here if current > target_bytes (see early return above)
+  uint64_t need_to_free = current - target_bytes;
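+  // free buffer space first, up to its excess over target_buffer; whatever
+  // remains must come out of onode metadata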
+ uint64_t free_buffer = 0;
+ uint64_t free_meta = 0;
+ if (current_buffer > target_buffer) {
+ free_buffer = current_buffer - target_buffer;
+ if (free_buffer > need_to_free) {
+ free_buffer = need_to_free;
+ }
+ }
+ free_meta = need_to_free - free_buffer;
+
+ // start bounds at what we have now
+ uint64_t max_buffer = current_buffer - free_buffer;
+ uint64_t max_meta = current_meta - free_meta;
+ uint64_t max_onodes = max_meta / bytes_per_onode;
+
+ dout(10) << __func__
+ << " shard target " << pretty_si_t(target_bytes)
+ << " ratio " << target_meta_ratio << " ("
+ << pretty_si_t(target_meta) << " + "
+ << pretty_si_t(target_buffer) << "), "
+ << " current " << pretty_si_t(current) << " ("
+ << pretty_si_t(current_meta) << " + "
+ << pretty_si_t(current_buffer) << "),"
+ << " need_to_free " << pretty_si_t(need_to_free) << " ("
+ << pretty_si_t(free_meta) << " + "
+ << pretty_si_t(free_buffer) << ")"
+ << " -> max " << max_onodes << " onodes + "
+ << max_buffer << " buffer"
+ << dendl;
+ _trim(max_onodes, max_buffer);
+}
+
+
// LRUCache
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.LRUCache(" << this << ") "
onode_lru.push_front(*o);
}
-void BlueStore::LRUCache::trim(uint64_t onode_max, uint64_t buffer_max)
+void BlueStore::LRUCache::_trim(uint64_t onode_max, uint64_t buffer_max)
{
- std::lock_guard<std::recursive_mutex> l(lock);
-
dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
<< " buffers " << buffer_size << " / " << buffer_max
<< dendl;
}
}
-void BlueStore::TwoQCache::trim(uint64_t onode_max, uint64_t buffer_max)
+void BlueStore::TwoQCache::_trim(uint64_t onode_max, uint64_t buffer_max)
{
- std::lock_guard<std::recursive_mutex> l(lock);
-
dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
<< " buffers " << buffer_bytes << " / " << buffer_max
<< dendl;
void BlueStore::Collection::trim_cache()
{
- cache->trim(
- g_conf->bluestore_onode_cache_size / store->cache_shards.size(),
- g_conf->bluestore_buffer_cache_size / store->cache_shards.size());
+ // see if mempool stats have updated
+ uint64_t total_bytes;
+ uint64_t total_onodes;
+ size_t seq;
+ store->get_mempool_stats(&seq, &total_bytes, &total_onodes);
+ if (seq == cache->last_trim_seq) {
+ dout(30) << __func__ << " no new mempool stats; nothing to do" << dendl;
+ return;
+ }
+ cache->last_trim_seq = seq;
+
+ // trim
+ if (total_onodes < 2) {
+ total_onodes = 2;
+ }
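+  // estimate the average in-memory footprint of an onode so each shard can
+  // translate its onode count into a metadata byte figure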
+ float bytes_per_onode = (float)total_bytes / (float)total_onodes;
+ size_t num_shards = store->cache_shards.size();
+ uint64_t shard_target = g_conf->bluestore_cache_size / num_shards;
+ dout(30) << __func__
+ << " total meta bytes " << total_bytes
+ << ", total onodes " << total_onodes
+ << ", bytes_per_onode " << bytes_per_onode
+ << dendl;
+ cache->trim(shard_target, g_conf->bluestore_cache_meta_ratio, bytes_per_onode);
store->_update_cache_logger();
}
// =======================================================
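+// Background thread that periodically samples the bluestore mempool
+// allocators and publishes the totals (plus a sequence number) that
+// Collection::trim_cache() uses to size the caches.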
+void *BlueStore::MempoolThread::entry()
+{
+ Mutex::Locker l(lock);
+ while (!stop) {
+ store->mempool_bytes = bluestore_meta_other::allocated_bytes() +
+ bluestore_meta_onode::allocated_bytes();
+ store->mempool_onodes = bluestore_meta_onode::allocated_items();
+ ++store->mempool_seq;
+ utime_t wait;
+ wait += g_conf->bluestore_cache_trim_interval;
+ cond.WaitInterval(g_ceph_context, lock, wait);
+ }
+  stop = false;   // reset so a later init() can restart the thread
+ return NULL;
+}
+
+// =======================================================
+
#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << path << ") "
logger(NULL),
debug_read_error_lock("BlueStore::debug_read_error_lock"),
csum_type(Checksummer::CSUM_CRC32C),
- sync_wal_apply(cct->_conf->bluestore_sync_wal_apply)
+ sync_wal_apply(cct->_conf->bluestore_sync_wal_apply),
+ mempool_thread(this)
{
_init_logger();
g_ceph_context->_conf->add_observer(this);
if (r < 0)
goto out_stop;
+ mempool_thread.init();
+
_set_csum();
_set_compression();
return 0;
out_stop:
+ mempool_thread.shutdown();
_kv_stop();
wal_wq.drain();
wal_tp.stop();
_reap_collections();
coll_map.clear();
+ mempool_thread.shutdown();
+
dout(20) << __func__ << " stopping kv thread" << dendl;
_kv_stop();
dout(20) << __func__ << " draining wal_wq" << dendl;
std::atomic<uint64_t> num_extents = {0};
std::atomic<uint64_t> num_blobs = {0};
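+  /// mempool stats sequence observed at the last trim (see Collection::trim_cache)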
+ size_t last_trim_seq = 0;
+
static Cache *create(string type, PerfCounters *logger);
virtual ~Cache() {}
virtual void _adjust_buffer_size(Buffer *b, int64_t delta) = 0;
virtual void _touch_buffer(Buffer *b) = 0;
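+  // current usage as reported by each cache implementation; used by trim()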
+ virtual uint64_t _get_num_onodes() = 0;
+ virtual uint64_t _get_buffer_bytes() = 0;
+
void add_extent() {
++num_extents;
}
--num_blobs;
}
- virtual void trim(uint64_t onode_max, uint64_t buffer_max) = 0;
+ void trim(uint64_t target_bytes, float target_meta_ratio,
+ float bytes_per_onode);
+
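+  /// evict down to the given limits; called by trim() with 'lock' held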
+ virtual void _trim(uint64_t onode_max, uint64_t buffer_max) = 0;
virtual void add_stats(uint64_t *onodes, uint64_t *extents,
uint64_t *blobs,
uint64_t buffer_size = 0;
public:
+ uint64_t _get_num_onodes() override {
+ return onode_lru.size();
+ }
void _add_onode(OnodeRef& o, int level) override {
if (level > 0)
onode_lru.push_front(*o);
}
void _touch_onode(OnodeRef& o) override;
+ uint64_t _get_buffer_bytes() override {
+ return buffer_size;
+ }
void _add_buffer(Buffer *b, int level, Buffer *near) override {
if (near) {
auto q = buffer_lru.iterator_to(*near);
_audit("_touch_buffer end");
}
- void trim(uint64_t onode_max, uint64_t buffer_max) override;
+ void _trim(uint64_t onode_max, uint64_t buffer_max) override;
void add_stats(uint64_t *onodes, uint64_t *extents,
uint64_t *blobs,
uint64_t buffer_list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type
public:
+ uint64_t _get_num_onodes() override {
+ return onode_lru.size();
+ }
void _add_onode(OnodeRef& o, int level) override {
if (level > 0)
onode_lru.push_front(*o);
}
void _touch_onode(OnodeRef& o) override;
+ uint64_t _get_buffer_bytes() override {
+ return buffer_bytes;
+ }
void _add_buffer(Buffer *b, int level, Buffer *near) override;
void _rm_buffer(Buffer *b) override;
void _adjust_buffer_size(Buffer *b, int64_t delta) override;
_audit("_touch_buffer end");
}
- void trim(uint64_t onode_max, uint64_t buffer_max) override;
+ void _trim(uint64_t onode_max, uint64_t buffer_max) override;
void add_stats(uint64_t *onodes, uint64_t *extents,
uint64_t *blobs,
std::atomic<uint64_t> comp_min_blob_size = {0};
std::atomic<uint64_t> comp_max_blob_size = {0};
+ // cache trim control
+
+ // note that these update in a racy way, but we don't *really* care if
+ // they're perfectly accurate. they are all word sized so they will
+ // individually update atomically, but may not be coherent with each other.
+ size_t mempool_seq = 0;
+ size_t mempool_bytes = 0;
+ size_t mempool_onodes = 0;
+
+ void get_mempool_stats(size_t *seq, uint64_t *bytes, uint64_t *onodes) {
+ *seq = mempool_seq;
+ *bytes = mempool_bytes;
+ *onodes = mempool_onodes;
+ }
+
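+  /// background thread that periodically refreshes mempool_{seq,bytes,onodes}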
+ struct MempoolThread : public Thread {
+ BlueStore *store;
+ Cond cond;
+ Mutex lock;
+ bool stop = false;
+ public:
+ explicit MempoolThread(BlueStore *s)
+ : store(s),
+ lock("BlueStore::MempoolThread::lock") {}
+ void *entry();
+ void init() {
+ assert(stop == false);
+ create("bstore_mempool");
+ }
+    void shutdown() {
+      lock.Lock();
+      stop = true;
+      cond.Signal();
+      lock.Unlock();
+      // the mount error path (out_stop) can be reached before init() has
+      // run; only join the thread if it was actually started
+      if (is_started()) {
+        join();
+      }
+    }
+ } mempool_thread;
+
// --------------------------------------------------------
// private methods