]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore: implement 2Q buffer cache policy
authorSage Weil <sage@redhat.com>
Fri, 10 Jun 2016 15:37:52 +0000 (11:37 -0400)
committerSage Weil <sage@redhat.com>
Wed, 22 Jun 2016 15:28:42 +0000 (11:28 -0400)
Signed-off-by: Sage Weil <sage@redhat.com>
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h

index bc83680dcf9aba99705c0d6974982056d9bca1a5..c255e0b852bb908990368fc16b925dfdfe3178ff 100644 (file)
@@ -479,6 +479,8 @@ BlueStore::Cache *BlueStore::Cache::create(string type)
 {
   if (type == "lru")
     return new LRUCache;
+  if (type == "2q")
+    return new TwoQCache;
   assert(0 == "unrecognized cache type");
 }
 
@@ -589,6 +591,195 @@ void BlueStore::LRUCache::_audit(const char *when)
 }
 #endif
 
+// TwoQCache
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.2QCache(" << this << ") "
+
+
+void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
+{
+  auto p = onode_lru.iterator_to(*o);
+  onode_lru.erase(p);
+  onode_lru.push_front(*o);
+}
+
+void BlueStore::TwoQCache::_add_buffer(Buffer *b, int level, Buffer *near)
+{
+  dout(20) << __func__ << " level " << level << " near " << near
+          << " on " << *b
+          << " which has level " << b->cache_private << dendl;
+  if (near) {
+    b->cache_private = near->cache_private;
+    switch (b->cache_private) {
+    case BUFFER_WARM_IN:
+      buffer_warm_in.insert(buffer_warm_in.iterator_to(*near), *b);
+      break;
+    case BUFFER_WARM_OUT:
+      buffer_warm_out.insert(buffer_warm_out.iterator_to(*near), *b);
+      break;
+    case BUFFER_HOT:
+      buffer_hot.insert(buffer_hot.iterator_to(*near), *b);
+      break;
+    default:
+      assert(0 == "bad cache_private");
+    }
+  } else if (b->cache_private == BUFFER_NEW) {
+    b->cache_private = BUFFER_WARM_IN;
+    if (level > 0) {
+      buffer_warm_in.push_front(*b);
+    } else {
+      // take caller hint to start at the back of the warm queue
+      buffer_warm_in.push_back(*b);
+    }
+  } else {
+    // ew got a hint from discard
+    switch (b->cache_private) {
+    case BUFFER_WARM_IN:
+      // stay in warm_in.  move to front, even though 2Q doesn't actually
+      // do this.
+      dout(20) << __func__ << " move to front of warm " << *b << dendl;
+      buffer_warm_in.push_front(*b);
+      break;
+    case BUFFER_WARM_OUT:
+      // move to hot.  fall-thru
+    case BUFFER_HOT:
+      dout(20) << __func__ << " move to hot " << *b << dendl;
+      b->cache_private = BUFFER_HOT;
+      buffer_hot.push_front(*b);
+      break;
+    default:
+      assert(0 == "bad cache_private");
+    }
+  }
+  if (!b->is_empty()) {
+    buffer_bytes += b->length;
+  }
+}
+
+void BlueStore::TwoQCache::trim(uint64_t onode_max, uint64_t buffer_max)
+{
+  std::lock_guard<std::mutex> l(lock);
+
+  dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
+          << " buffers " << buffer_bytes << " / " << buffer_max
+          << dendl;
+
+  _audit("trim start");
+
+  // buffers
+  uint64_t num_buffers = buffer_hot.size() + buffer_warm_in.size();
+  if (num_buffers) {
+    // for simplicity, we assume buffer sizes are uniform across hot
+    // and warm_in.  FIXME?
+    uint64_t avg_buffer_size = buffer_bytes / num_buffers;
+    uint64_t kin = buffer_max / avg_buffer_size / 2;
+    uint64_t khot = kin;
+    uint64_t kout = buffer_max / avg_buffer_size;
+    // if hot is small, give slack to warm_in.
+    if (buffer_hot.size() < kin) {
+      kin += kin - buffer_hot.size();
+    }
+    dout(20) << __func__ << " num_buffers " << num_buffers
+            << " (hot " << buffer_hot.size()
+            << " + warm_in " << buffer_warm_in.size() << ")"
+            << " avg size " << avg_buffer_size
+            << " warm_out " << buffer_warm_out.size()
+            << " khot " << khot << " kin " << kin << " kout " << kout
+            << dendl;
+    while (buffer_hot.size() > khot) {
+      Buffer *b = &*buffer_hot.rbegin();
+      assert(b->is_clean());
+      dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
+      b->space->_rm_buffer(b);
+    }
+    while (buffer_warm_in.size() > kin) {
+      Buffer *b = &*buffer_warm_in.rbegin();
+      assert(b->is_clean());
+      dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
+      buffer_bytes -= b->length;
+      b->state = Buffer::STATE_EMPTY;
+      b->data.clear();
+      buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
+      buffer_warm_out.push_front(*b);
+      b->cache_private = BUFFER_WARM_OUT;
+    }
+    while (buffer_warm_out.size() > kout) {
+      Buffer *b = &*buffer_warm_out.rbegin();
+      assert(b->is_empty());
+      dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
+      b->space->_rm_buffer(b);
+    }
+  }
+
+  // onodes
+  int num = onode_lru.size() - onode_max;
+  if (num <= 0)
+    return; // don't even try
+
+  auto p = onode_lru.end();
+  if (num)
+    --p;
+  while (num > 0) {
+    Onode *o = &*p;
+    int refs = o->nref.load();
+    if (refs > 1) {
+      dout(20) << __func__ << "  " << o->oid << " has " << refs
+              << " refs; stopping with " << num << " left to trim" << dendl;
+      break;
+    }
+    dout(30) << __func__ << "  trim " << o->oid << dendl;
+    if (p != onode_lru.begin()) {
+      onode_lru.erase(p--);
+    } else {
+      onode_lru.erase(p);
+      assert(num == 1);
+    }
+    o->get();  // paranoia
+    o->space->onode_map.erase(o->oid);
+    o->blob_map._clear();    // clear blobs and their buffers, too
+    o->put();
+    --num;
+  }
+}
+
+#ifdef DEBUG_CACHE
+void BlueStore::TwoQCache::_audit(const char *when)
+{
+  if (true) {
+    dout(10) << __func__ << " " << when << " start" << dendl;
+    uint64_t s = 0;
+    for (auto i = buffer_hot.begin(); i != buffer_hot.end(); ++i) {
+      s += i->length;
+    }
+    for (auto i = buffer_warm_in.begin(); i != buffer_warm_in.end(); ++i) {
+      s += i->length;
+    }
+    if (s != buffer_bytes) {
+      derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
+          << dendl;
+      assert(s == buffer_bytes);
+    }
+    dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
+            << " ok" << dendl;
+  }
+  if (false) {
+    uint64_t lc = 0, oc = 0;
+    set<OnodeSpace*> spaces;
+    for (auto i = onode_lru.begin(); i != onode_lru.end(); ++i) {
+      assert(i->space->onode_map.count(i->oid));
+      if (spaces.count(i->space) == 0) {
+       spaces.insert(i->space);
+       oc += i->space->onode_map.size();
+      }
+      ++lc;
+    }
+    if (lc != oc) {
+      derr << " lc " << lc << " oc " << oc << dendl;
+    }
+  }
+}
+#endif
+
 
 // BufferSpace
 
index d504c2aefcaea4e1f2a6e6f452639215ac0d82f1..3db3703530beffed8152de7985dde4d917e1550d 100644 (file)
@@ -524,6 +524,7 @@ public:
 #endif
   };
 
+  /// simple LRU cache for onodes and buffers
   struct LRUCache : public Cache {
   private:
     typedef boost::intrusive::list<
@@ -590,7 +591,100 @@ public:
 #ifdef DEBUG_CACHE
     void _audit(const char *s) override;
 #endif
+  };
+
+  // 2Q cache for buffers, LRU for onodes
+  struct TwoQCache : public Cache {
+  private:
+    // stick with LRU for onodes for now (fixme?)
+    typedef boost::intrusive::list<
+      Onode,
+      boost::intrusive::member_hook<
+        Onode,
+       boost::intrusive::list_member_hook<>,
+       &Onode::lru_item> > onode_lru_list_t;
+    typedef boost::intrusive::list<
+      Buffer,
+      boost::intrusive::member_hook<
+       Buffer,
+       boost::intrusive::list_member_hook<>,
+       &Buffer::lru_item> > buffer_list_t;
+
+    onode_lru_list_t onode_lru;
+
+    buffer_list_t buffer_hot;      //< "Am" hot buffers
+    buffer_list_t buffer_warm_in;  //< "A1in" newly warm buffers
+    buffer_list_t buffer_warm_out; //< "A1out" empty buffers we've evicted
+    uint64_t buffer_bytes = 0;        //< bytes
+
+    enum {
+      BUFFER_NEW = 0,
+      BUFFER_WARM_IN,   ///< in buffer_warm_in
+      BUFFER_WARM_OUT,  ///< in buffer_warm_out
+      BUFFER_HOT,       ///< in buffer_hot
+    };
 
+  public:
+    void _add_onode(OnodeRef& o, int level) override {
+      if (level > 0)
+       onode_lru.push_front(*o);
+      else
+       onode_lru.push_back(*o);
+    }
+    void _rm_onode(OnodeRef& o) override {
+      auto q = onode_lru.iterator_to(*o);
+      onode_lru.erase(q);
+    }
+    void _touch_onode(OnodeRef& o) override;
+
+    void _add_buffer(Buffer *b, int level, Buffer *near) override;
+
+    void _rm_buffer(Buffer *b) override {
+      if (!b->is_empty()) {
+       buffer_bytes -= b->length;
+      }
+      switch (b->cache_private) {
+      case BUFFER_WARM_IN:
+       buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
+       break;
+      case BUFFER_WARM_OUT:
+       buffer_warm_out.erase(buffer_warm_out.iterator_to(*b));
+       break;
+      case BUFFER_HOT:
+       buffer_hot.erase(buffer_hot.iterator_to(*b));
+       break;
+      default:
+       assert(0 == "bad cache_private");
+      }
+    }
+    void _adjust_buffer_size(Buffer *b, int64_t delta) override {
+      if (!b->is_empty()) {
+       buffer_bytes += delta;
+      }
+    }
+    void _touch_buffer(Buffer *b) override {
+      switch (b->cache_private) {
+      case BUFFER_WARM_IN:
+       // do nothing (somewhat counter-intuitively!)
+       break;
+      case BUFFER_WARM_OUT:
+       // move from warm_out to hot LRU
+       assert(0 == "this happens via discard hint");
+       break;
+      case BUFFER_HOT:
+       // move to front of hot LRU
+       buffer_hot.erase(buffer_hot.iterator_to(*b));
+       buffer_hot.push_front(*b);
+       break;
+      }
+      _audit("_touch_buffer end");
+    }
+
+    void trim(uint64_t onode_max, uint64_t buffer_max) override;
+
+#ifdef DEBUG_CACHE
+    void _audit(const char *s) override;
+#endif
   };
 
   struct OnodeSpace {