From: Sage Weil
Date: Tue, 11 Oct 2016 18:25:01 +0000 (-0400)
Subject: os/bluestore: restructure cache trimming in terms of mempool
X-Git-Tag: v11.1.0~442^2~16
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=bcf20a1ca12ac0a7d4bd51e0beeda2877b4e0125;p=ceph.git

os/bluestore: restructure cache trimming in terms of mempool

Trim cache based on overall memory utilization by cache objects, as
tracked by the bluestore_meta_* mempools.  This lets you configure the
bluestore cache size in terms of bytes of memory.

Note that we do not account for other memory utilization by the OSD.

Signed-off-by: Sage Weil
---

diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 82157561687b..53af4a13d109 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -998,11 +998,12 @@ OPTION(bluestore_extent_map_shard_target_size, OPT_U32, 500)
 OPTION(bluestore_extent_map_shard_min_size, OPT_U32, 150)
 OPTION(bluestore_extent_map_shard_target_size_slop, OPT_DOUBLE, .2)
 OPTION(bluestore_extent_map_inline_shard_prealloc_size, OPT_U32, 256)
+OPTION(bluestore_cache_trim_interval, OPT_DOUBLE, .1)
 OPTION(bluestore_cache_type, OPT_STR, "2q") // lru, 2q
 OPTION(bluestore_2q_cache_kin_ratio, OPT_DOUBLE, .5) // kin page slot size / max page slot size
 OPTION(bluestore_2q_cache_kout_ratio, OPT_DOUBLE, .5) // number of kout page slot / total number of page slot
-OPTION(bluestore_onode_cache_size, OPT_U32, 4*1024)
-OPTION(bluestore_buffer_cache_size, OPT_U32, 512*1024*1024)
+OPTION(bluestore_cache_size, OPT_U64, 1024*1024*1024)
+OPTION(bluestore_cache_meta_ratio, OPT_DOUBLE, .1)
 OPTION(bluestore_kvbackend, OPT_STR, "rocksdb")
 OPTION(bluestore_allocator, OPT_STR, "bitmap") // stupid | bitmap
 OPTION(bluestore_freelist_type, OPT_STR, "bitmap") // extent | bitmap
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index df854399fe44..c96ed46db3ad 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -540,6 +540,69 @@ BlueStore::Cache *BlueStore::Cache::create(string type, PerfCounters *logger)
   return c;
 }
 
+void BlueStore::Cache::trim(
+  uint64_t target_bytes,
+  float target_meta_ratio,
+  float bytes_per_onode)
+{
+  std::lock_guard l(lock);
+  uint64_t current_meta = _get_num_onodes() * bytes_per_onode;
+  uint64_t current_buffer = _get_buffer_bytes();
+  uint64_t current = current_meta + current_buffer;
+
+  uint64_t target_meta = target_bytes * target_meta_ratio;
+  uint64_t target_buffer = target_bytes - target_meta;
+
+  if (current <= target_bytes) {
+    dout(10) << __func__
+             << " shard target " << pretty_si_t(target_bytes)
+             << " ratio " << target_meta_ratio << " ("
+             << pretty_si_t(target_meta) << " + "
+             << pretty_si_t(target_buffer) << "), "
+             << " current " << pretty_si_t(current) << " ("
+             << pretty_si_t(current_meta) << " + "
+             << pretty_si_t(current_buffer) << ")"
+             << dendl;
+    return;
+  }
+
+  uint64_t need_to_free = 0;
+  if (current > target_bytes) {
+    need_to_free = current - target_bytes;
+  }
+  uint64_t free_buffer = 0;
+  uint64_t free_meta = 0;
+  if (current_buffer > target_buffer) {
+    free_buffer = current_buffer - target_buffer;
+    if (free_buffer > need_to_free) {
+      free_buffer = need_to_free;
+    }
+  }
+  free_meta = need_to_free - free_buffer;
+
+  // start bounds at what we have now
+  uint64_t max_buffer = current_buffer - free_buffer;
+  uint64_t max_meta = current_meta - free_meta;
+  uint64_t max_onodes = max_meta / bytes_per_onode;
+
+  dout(10) << __func__
+           << " shard target " << pretty_si_t(target_bytes)
+           << " ratio " << target_meta_ratio << " ("
+           << pretty_si_t(target_meta) << " + "
+           << pretty_si_t(target_buffer) << "), "
+           << " current " << pretty_si_t(current) << " ("
+           << pretty_si_t(current_meta) << " + "
+           << pretty_si_t(current_buffer) << "),"
+           << " need_to_free " << pretty_si_t(need_to_free) << " ("
+           << pretty_si_t(free_meta) << " + "
+           << pretty_si_t(free_buffer) << ")"
+           << " -> max " << max_onodes << " onodes + "
+           << max_buffer << " buffer"
+           << dendl;
+  _trim(max_onodes, max_buffer);
+}
+
+
 // LRUCache
 #undef dout_prefix
 #define dout_prefix *_dout << "bluestore.LRUCache(" << this << ") "
@@ -551,10 +614,8 @@ void BlueStore::LRUCache::_touch_onode(OnodeRef& o)
   onode_lru.push_front(*o);
 }
 
-void BlueStore::LRUCache::trim(uint64_t onode_max, uint64_t buffer_max)
+void BlueStore::LRUCache::_trim(uint64_t onode_max, uint64_t buffer_max)
 {
-  std::lock_guard l(lock);
-
   dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
            << " buffers " << buffer_size << " / " << buffer_max
            << dendl;
@@ -730,10 +791,8 @@ void BlueStore::TwoQCache::_adjust_buffer_size(Buffer *b, int64_t delta)
   }
 }
 
-void BlueStore::TwoQCache::trim(uint64_t onode_max, uint64_t buffer_max)
+void BlueStore::TwoQCache::_trim(uint64_t onode_max, uint64_t buffer_max)
 {
-  std::lock_guard l(lock);
-
   dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
            << " buffers " << buffer_bytes << " / " << buffer_max
            << dendl;
@@ -2457,15 +2516,54 @@ BlueStore::OnodeRef BlueStore::Collection::get_onode(
 
 void BlueStore::Collection::trim_cache()
 {
-  cache->trim(
-    g_conf->bluestore_onode_cache_size / store->cache_shards.size(),
-    g_conf->bluestore_buffer_cache_size / store->cache_shards.size());
+  // see if mempool stats have updated
+  uint64_t total_bytes;
+  uint64_t total_onodes;
+  size_t seq;
+  store->get_mempool_stats(&seq, &total_bytes, &total_onodes);
+  if (seq == cache->last_trim_seq) {
+    dout(30) << __func__ << " no new mempool stats; nothing to do" << dendl;
+    return;
+  }
+  cache->last_trim_seq = seq;
+
+  // trim
+  if (total_onodes < 2) {
+    total_onodes = 2;
+  }
+  float bytes_per_onode = (float)total_bytes / (float)total_onodes;
+  size_t num_shards = store->cache_shards.size();
+  uint64_t shard_target = g_conf->bluestore_cache_size / num_shards;
+  dout(30) << __func__
+           << " total meta bytes " << total_bytes
+           << ", total onodes " << total_onodes
+           << ", bytes_per_onode " << bytes_per_onode
+           << dendl;
+  cache->trim(shard_target, g_conf->bluestore_cache_meta_ratio, bytes_per_onode);
 
   store->_update_cache_logger();
 }
 
 // =======================================================
 
+void *BlueStore::MempoolThread::entry()
+{
+  Mutex::Locker l(lock);
+  while (!stop) {
+    store->mempool_bytes = bluestore_meta_other::allocated_bytes() +
+      bluestore_meta_onode::allocated_bytes();
+    store->mempool_onodes = bluestore_meta_onode::allocated_items();
+    ++store->mempool_seq;
+    utime_t wait;
+    wait += g_conf->bluestore_cache_trim_interval;
+    cond.WaitInterval(g_ceph_context, lock, wait);
+  }
+  stop = false;
+  return NULL;
+}
+
+// =======================================================
+
 #undef dout_prefix
 #define dout_prefix *_dout << "bluestore(" << path << ") "
 
@@ -2512,7 +2610,8 @@ BlueStore::BlueStore(CephContext *cct, const string& path)
     logger(NULL),
     debug_read_error_lock("BlueStore::debug_read_error_lock"),
     csum_type(Checksummer::CSUM_CRC32C),
-    sync_wal_apply(cct->_conf->bluestore_sync_wal_apply)
+    sync_wal_apply(cct->_conf->bluestore_sync_wal_apply),
+    mempool_thread(this)
 {
   _init_logger();
   g_ceph_context->_conf->add_observer(this);
@@ -4052,6 +4151,8 @@ int BlueStore::mount()
   if (r < 0)
     goto out_stop;
 
+  mempool_thread.init();
+
   _set_csum();
   _set_compression();
 
@@ -4059,6 +4160,7 @@
   return 0;
 
  out_stop:
+  mempool_thread.shutdown();
   _kv_stop();
   wal_wq.drain();
   wal_tp.stop();
@@ -4092,6 +4194,8 @@ int BlueStore::umount()
   _reap_collections();
   coll_map.clear();
 
+  mempool_thread.shutdown();
+
   dout(20) << __func__ << " stopping kv thread" << dendl;
   _kv_stop();
   dout(20) << __func__ << " draining wal_wq" << dendl;
diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h
index 73d57e30e60d..848ab7efbf4b 100644
--- a/src/os/bluestore/BlueStore.h
+++ b/src/os/bluestore/BlueStore.h
@@ -791,6 +791,8 @@ public:
   std::atomic<uint64_t> num_extents = {0};
   std::atomic<uint64_t> num_blobs = {0};
 
+  size_t last_trim_seq = 0;
+
   static Cache *create(string type, PerfCounters *logger);
   virtual ~Cache() {}
@@ -804,6 +806,9 @@ public:
   virtual void _adjust_buffer_size(Buffer *b, int64_t delta) = 0;
   virtual void _touch_buffer(Buffer *b) = 0;
 
+  virtual uint64_t _get_num_onodes() = 0;
+  virtual uint64_t _get_buffer_bytes() = 0;
+
   void add_extent() {
     ++num_extents;
   }
@@ -818,7 +823,10 @@
     --num_blobs;
   }
 
-  virtual void trim(uint64_t onode_max, uint64_t buffer_max) = 0;
+  void trim(uint64_t target_bytes, float target_meta_ratio,
+            float bytes_per_onode);
+
+  virtual void _trim(uint64_t onode_max, uint64_t buffer_max) = 0;
 
   virtual void add_stats(uint64_t *onodes, uint64_t *extents,
                          uint64_t *blobs,
@@ -854,6 +862,9 @@ public:
   uint64_t buffer_size = 0;
 
 public:
+  uint64_t _get_num_onodes() override {
+    return onode_lru.size();
+  }
   void _add_onode(OnodeRef& o, int level) override {
     if (level > 0)
       onode_lru.push_front(*o);
@@ -866,6 +877,9 @@
   }
   void _touch_onode(OnodeRef& o) override;
 
+  uint64_t _get_buffer_bytes() override {
+    return buffer_size;
+  }
   void _add_buffer(Buffer *b, int level, Buffer *near) override {
     if (near) {
       auto q = buffer_lru.iterator_to(*near);
@@ -894,7 +908,7 @@
     _audit("_touch_buffer end");
   }
 
-  void trim(uint64_t onode_max, uint64_t buffer_max) override;
+  void _trim(uint64_t onode_max, uint64_t buffer_max) override;
 
   void add_stats(uint64_t *onodes, uint64_t *extents,
                  uint64_t *blobs,
@@ -948,6 +962,9 @@ public:
   uint64_t buffer_list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type
 
 public:
+  uint64_t _get_num_onodes() override {
+    return onode_lru.size();
+  }
   void _add_onode(OnodeRef& o, int level) override {
     if (level > 0)
       onode_lru.push_front(*o);
@@ -960,6 +977,9 @@
   }
   void _touch_onode(OnodeRef& o) override;
 
+  uint64_t _get_buffer_bytes() override {
+    return buffer_bytes;
+  }
   void _add_buffer(Buffer *b, int level, Buffer *near) override;
   void _rm_buffer(Buffer *b) override;
   void _adjust_buffer_size(Buffer *b, int64_t delta) override;
@@ -981,7 +1001,7 @@
     _audit("_touch_buffer end");
   }
 
-  void trim(uint64_t onode_max, uint64_t buffer_max) override;
+  void _trim(uint64_t onode_max, uint64_t buffer_max) override;
 
   void add_stats(uint64_t *onodes, uint64_t *extents,
                  uint64_t *blobs,
@@ -1501,6 +1521,44 @@ private:
   std::atomic<uint64_t> comp_min_blob_size = {0};
   std::atomic<uint64_t> comp_max_blob_size = {0};
 
+  // cache trim control
+
+  // note that these update in a racy way, but we don't *really* care if
+  // they're perfectly accurate. they are all word sized so they will
+  // individually update atomically, but may not be coherent with each other.
+  size_t mempool_seq = 0;
+  size_t mempool_bytes = 0;
+  size_t mempool_onodes = 0;
+
+  void get_mempool_stats(size_t *seq, uint64_t *bytes, uint64_t *onodes) {
+    *seq = mempool_seq;
+    *bytes = mempool_bytes;
+    *onodes = mempool_onodes;
+  }
+
+  struct MempoolThread : public Thread {
+    BlueStore *store;
+    Cond cond;
+    Mutex lock;
+    bool stop = false;
+  public:
+    explicit MempoolThread(BlueStore *s)
+      : store(s),
+        lock("BlueStore::MempoolThread::lock") {}
+    void *entry();
+    void init() {
+      assert(stop == false);
+      create("bstore_mempool");
+    }
+    void shutdown() {
+      lock.Lock();
+      stop = true;
+      cond.Signal();
+      lock.Unlock();
+      join();
+    }
+  } mempool_thread;
+
   // --------------------------------------------------------
   // private methods
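
For reference, the budget split that the new Cache::trim() performs, fed by the
per-shard target computed in Collection::trim_cache(), can be sketched in
isolation as below.  This is an illustrative standalone program, not BlueStore
code: the 1 GiB cache size and 0.1 meta ratio are the defaults added to
config_opts.h above, while the shard count, onode count, buffer bytes, and
bytes-per-onode figure are made-up inputs standing in for the values the OSD
derives at runtime from the bluestore_meta_* mempools.

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  // Defaults from the options added above.
  const uint64_t cache_size = 1024ull * 1024 * 1024;  // bluestore_cache_size
  const double meta_ratio = 0.1;                      // bluestore_cache_meta_ratio

  // Hypothetical runtime inputs (not from the patch).
  const uint64_t num_shards = 5;                        // number of cache shards
  const double bytes_per_onode = 3072;                  // mempool bytes / mempool onodes
  const uint64_t current_onodes = 60000;                // onodes held by this shard
  const uint64_t current_buffer = 250ull * 1024 * 1024; // buffer bytes held by this shard

  // Per-shard target, as in Collection::trim_cache().
  uint64_t target_bytes = cache_size / num_shards;

  // Split the target into a metadata budget and a buffer budget, as in Cache::trim().
  uint64_t target_meta = uint64_t(target_bytes * meta_ratio);
  uint64_t target_buffer = target_bytes - target_meta;
  uint64_t current_meta = uint64_t(current_onodes * bytes_per_onode);
  uint64_t current = current_meta + current_buffer;

  // Free buffer space first (down to its own budget), then take the rest
  // of the deficit out of onode metadata.
  uint64_t need_to_free = current > target_bytes ? current - target_bytes : 0;
  uint64_t free_buffer = 0;
  if (current_buffer > target_buffer) {
    free_buffer = std::min(current_buffer - target_buffer, need_to_free);
  }
  uint64_t free_meta = need_to_free - free_buffer;

  uint64_t max_buffer = current_buffer - free_buffer;
  uint64_t max_onodes = uint64_t((current_meta - free_meta) / bytes_per_onode);

  std::cout << "shard target " << target_bytes
            << " = meta " << target_meta << " + buffer " << target_buffer << "\n"
            << "current " << current << ", need_to_free " << need_to_free << "\n"
            << "would trim to " << max_onodes << " onodes + "
            << max_buffer << " buffer bytes\n";
  return 0;
}

With these made-up inputs the buffer cache only gives back its overshoot past
its own budget and the rest of the deficit is taken from onode metadata; the
_trim(max_onodes, max_buffer) call at the end of Cache::trim() is what then
enforces those bounds in each cache implementation.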