src/kv: Initial import of a custom RocksDB cache.
author     Mark Nelson <mnelson@redhat.com>
           Tue, 3 Jul 2018 15:42:14 +0000 (10:42 -0500)
committer  Josh Durgin <jdurgin@redhat.com>
           Tue, 27 Nov 2018 02:08:35 +0000 (21:08 -0500)
Signed-off-by: Mark Nelson <mnelson@redhat.com>
(cherry picked from commit 9dd6b2885e6d83b5ad468a7343aa2f1d476e818b)

Conflicts:
src/kv/RocksDBStore.cc
trivial - g_conf has no operator()

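For reference, the conflict resolution amounts to keeping this branch's config accessor style; a hedged illustration (the master-branch form is an assumption inferred from the note above, not shown in this diff):

    // master (assumed): the config proxy is invoked as a function
    //   if (g_conf()->rocksdb_cache_type == "binned_lru") { ... }
    // this branch: the global is dereferenced directly, as in the hunks below
    if (g_conf->rocksdb_cache_type == "binned_lru") {
      bbt_opts.block_cache = rocksdb_cache::NewBinnedLRUCache(
        block_cache_size, g_conf->rocksdb_cache_shard_bits);
    }
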
src/common/autovector.h [new file with mode: 0644]
src/common/options.cc
src/kv/CMakeLists.txt
src/kv/RocksDBStore.cc
src/kv/RocksDBStore.h
src/kv/rocksdb_cache/BinnedLRUCache.cc [new file with mode: 0644]
src/kv/rocksdb_cache/BinnedLRUCache.h [new file with mode: 0644]
src/kv/rocksdb_cache/ShardedCache.cc [new file with mode: 0644]
src/kv/rocksdb_cache/ShardedCache.h [new file with mode: 0644]

diff --git a/src/common/autovector.h b/src/common/autovector.h
new file mode 100644 (file)
index 0000000..5fbaa83
--- /dev/null
@@ -0,0 +1,334 @@
+// Copyright (c) 2018-Present Red Hat Inc.  All rights reserved.
+//
+// Copyright (c) 2011-2018, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 and Apache 2.0 License
+
+#ifndef CEPH_AUTOVECTOR_H
+#define CEPH_AUTOVECTOR_H
+
+#include <algorithm>
+#include <cassert>
+#include <initializer_list>
+#include <iterator>
+#include <stdexcept>
+#include <vector>
+
+// A vector that leverages a pre-allocated stack-based array to achieve better
+// performance for arrays with a small number of items.
+//
+// The interface resembles that of vector, but with fewer features, since we
+// aim to solve the problem at hand rather than implement a full-fledged
+// generic container.
+//
+// Currently we don't support:
+//  * reserve()/shrink_to_fit()
+//     If used correctly, in most cases, people should not need to touch the
+//     underlying vector at all.
+//  * random insert()/erase(); please only use push_back()/pop_back().
+//  * No move/swap operations. Each autovector instance has a
+//     stack-allocated array, and to support move/swap we would need to copy
+//     the arrays rather than just swap the pointers. We explicitly forbid
+//     these operations since they may lead users to make the false
+//     assumption that they are inexpensive.
+//
+// The naming style of the public methods mostly follows that of the STL.
+namespace ceph {
+
+template <class T, size_t kSize = 8>
+class autovector {
+ public:
+  // General STL-style container member types.
+  typedef T value_type;
+  typedef typename std::vector<T>::difference_type difference_type;
+  typedef typename std::vector<T>::size_type size_type;
+  typedef value_type& reference;
+  typedef const value_type& const_reference;
+  typedef value_type* pointer;
+  typedef const value_type* const_pointer;
+
+  // This class is the base for regular/const iterator
+  template <class TAutoVector, class TValueType>
+  class iterator_impl {
+   public:
+    // -- iterator traits
+    typedef iterator_impl<TAutoVector, TValueType> self_type;
+    typedef TValueType value_type;
+    typedef TValueType& reference;
+    typedef TValueType* pointer;
+    typedef typename TAutoVector::difference_type difference_type;
+    typedef std::random_access_iterator_tag iterator_category;
+
+    iterator_impl(TAutoVector* vect, size_t index)
+        : vect_(vect), index_(index) {};
+    iterator_impl(const iterator_impl&) = default;
+    ~iterator_impl() {}
+    iterator_impl& operator=(const iterator_impl&) = default;
+
+    // -- Advancement
+    // ++iterator
+    self_type& operator++() {
+      ++index_;
+      return *this;
+    }
+
+    // iterator++
+    self_type operator++(int) {
+      auto old = *this;
+      ++index_;
+      return old;
+    }
+
+    // --iterator
+    self_type& operator--() {
+      --index_;
+      return *this;
+    }
+
+    // iterator--
+    self_type operator--(int) {
+      auto old = *this;
+      --index_;
+      return old;
+    }
+
+    self_type operator-(difference_type len) const {
+      return self_type(vect_, index_ - len);
+    }
+
+    difference_type operator-(const self_type& other) const {
+      assert(vect_ == other.vect_);
+      return index_ - other.index_;
+    }
+
+    self_type operator+(difference_type len) const {
+      return self_type(vect_, index_ + len);
+    }
+
+    self_type& operator+=(difference_type len) {
+      index_ += len;
+      return *this;
+    }
+
+    self_type& operator-=(difference_type len) {
+      index_ -= len;
+      return *this;
+    }
+
+    // -- Reference
+    reference operator*() {
+      assert(vect_->size() >= index_);
+      return (*vect_)[index_];
+    }
+
+    const_reference operator*() const {
+      assert(vect_->size() >= index_);
+      return (*vect_)[index_];
+    }
+
+    pointer operator->() {
+      assert(vect_->size() >= index_);
+      return &(*vect_)[index_];
+    }
+
+    const_pointer operator->() const {
+      assert(vect_->size() >= index_);
+      return &(*vect_)[index_];
+    }
+
+
+    // -- Logical Operators
+    bool operator==(const self_type& other) const {
+      assert(vect_ == other.vect_);
+      return index_ == other.index_;
+    }
+
+    bool operator!=(const self_type& other) const { return !(*this == other); }
+
+    bool operator>(const self_type& other) const {
+      assert(vect_ == other.vect_);
+      return index_ > other.index_;
+    }
+
+    bool operator<(const self_type& other) const {
+      assert(vect_ == other.vect_);
+      return index_ < other.index_;
+    }
+
+    bool operator>=(const self_type& other) const {
+      assert(vect_ == other.vect_);
+      return index_ >= other.index_;
+    }
+
+    bool operator<=(const self_type& other) const {
+      assert(vect_ == other.vect_);
+      return index_ <= other.index_;
+    }
+
+   private:
+    TAutoVector* vect_ = nullptr;
+    size_t index_ = 0;
+  };
+
+  typedef iterator_impl<autovector, value_type> iterator;
+  typedef iterator_impl<const autovector, const value_type> const_iterator;
+  typedef std::reverse_iterator<iterator> reverse_iterator;
+  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+
+  autovector() = default;
+
+  autovector(std::initializer_list<T> init_list) {
+    for (const T& item : init_list) {
+      push_back(item);
+    }
+  }
+
+  ~autovector() = default;
+
+  // -- Immutable operations
+  // Indicates whether all data resides in the in-stack array.
+  bool only_in_stack() const {
+    // If no element was inserted at all, the vector's capacity will be `0`.
+    return vect_.capacity() == 0;
+  }
+
+  size_type size() const { return num_stack_items_ + vect_.size(); }
+
+  // resize does not guarantee anything about the contents of the newly
+  // available elements
+  void resize(size_type n) {
+    if (n > kSize) {
+      vect_.resize(n - kSize);
+      num_stack_items_ = kSize;
+    } else {
+      vect_.clear();
+      num_stack_items_ = n;
+    }
+  }
+
+  bool empty() const { return size() == 0; }
+
+  const_reference operator[](size_type n) const {
+    assert(n < size());
+    return n < kSize ? values_[n] : vect_[n - kSize];
+  }
+
+  reference operator[](size_type n) {
+    assert(n < size());
+    return n < kSize ? values_[n] : vect_[n - kSize];
+  }
+
+  const_reference at(size_type n) const {
+    assert(n < size());
+    return (*this)[n];
+  }
+
+  reference at(size_type n) {
+    assert(n < size());
+    return (*this)[n];
+  }
+
+  reference front() {
+    assert(!empty());
+    return *begin();
+  }
+
+  const_reference front() const {
+    assert(!empty());
+    return *begin();
+  }
+
+  reference back() {
+    assert(!empty());
+    return *(end() - 1);
+  }
+
+  const_reference back() const {
+    assert(!empty());
+    return *(end() - 1);
+  }
+
+  // -- Mutable Operations
+  void push_back(T&& item) {
+    if (num_stack_items_ < kSize) {
+      values_[num_stack_items_++] = std::move(item);
+    } else {
+      vect_.push_back(item);
+    }
+  }
+
+  void push_back(const T& item) {
+    if (num_stack_items_ < kSize) {
+      values_[num_stack_items_++] = item;
+    } else {
+      vect_.push_back(item);
+    }
+  }
+
+  template <class... Args>
+  void emplace_back(Args&&... args) {
+    push_back(value_type(args...));
+  }
+
+  void pop_back() {
+    assert(!empty());
+    if (!vect_.empty()) {
+      vect_.pop_back();
+    } else {
+      --num_stack_items_;
+    }
+  }
+
+  void clear() {
+    num_stack_items_ = 0;
+    vect_.clear();
+  }
+
+  // -- Copy and Assignment
+  autovector& assign(const autovector& other);
+
+  autovector(const autovector& other) { assign(other); }
+
+  autovector& operator=(const autovector& other) { return assign(other); }
+
+  // -- Iterator Operations
+  iterator begin() { return iterator(this, 0); }
+
+  const_iterator begin() const { return const_iterator(this, 0); }
+
+  iterator end() { return iterator(this, this->size()); }
+
+  const_iterator end() const { return const_iterator(this, this->size()); }
+
+  reverse_iterator rbegin() { return reverse_iterator(end()); }
+
+  const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(end());
+  }
+
+  reverse_iterator rend() { return reverse_iterator(begin()); }
+
+  const_reverse_iterator rend() const {
+    return const_reverse_iterator(begin());
+  }
+
+ private:
+  size_type num_stack_items_ = 0;  // current number of items
+  value_type values_[kSize];       // the first `kSize` items
+  // used only if there are more than `kSize` items.
+  std::vector<T> vect_;
+};
+
+template <class T, size_t kSize>
+autovector<T, kSize>& autovector<T, kSize>::assign(const autovector& other) {
+  // copy the internal vector
+  vect_.assign(other.vect_.begin(), other.vect_.end());
+
+  // copy array
+  num_stack_items_ = other.num_stack_items_;
+  std::copy(other.values_, other.values_ + num_stack_items_, values_);
+
+  return *this;
+}
+}  // namespace ceph 
+#endif // CEPH_AUTOVECTOR_H
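
A quick usage sketch for the container above (illustration only, not part of the commit): elements stay in the stack array until the kSize+1-th push_back, at which point they spill into the internal std::vector and only_in_stack() flips to false.

    // Illustration only (not part of the commit): exercising ceph::autovector.
    #include <iostream>
    #include "common/autovector.h"

    int main() {
      ceph::autovector<int, 4> v;                    // first 4 items live on the stack
      for (int i = 0; i < 4; ++i) {
        v.push_back(i);
      }
      std::cout << v.size() << " " << v.only_in_stack() << "\n";  // prints "4 1"
      v.push_back(4);                                // 5th item spills to the heap vector
      std::cout << v.size() << " " << v.only_in_stack() << "\n";  // prints "5 0"
      for (auto it = v.begin(); it != v.end(); ++it) {
        std::cout << *it << " ";                     // 0 1 2 3 4
      }
      std::cout << "\n";
      return 0;
    }
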
diff --git a/src/common/options.cc b/src/common/options.cc
index b21e271311e12f6412b0a08d33cc243cb0302fbc..59e0dfd05f075fcb0e35496399933bdfe785a1e8 100644 (file)
@@ -3440,7 +3440,7 @@ std::vector<Option> get_global_options() {
     .set_description(""),
 
     Option("rocksdb_cache_type", Option::TYPE_STR, Option::LEVEL_ADVANCED)
-    .set_default("lru")
+    .set_default("binned_lru")
     .set_description(""),
 
     Option("rocksdb_block_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
diff --git a/src/kv/CMakeLists.txt b/src/kv/CMakeLists.txt
index c20bb5e4099952658e02afae0c545cb195f2e84e..08d7f79ea9eb6604a29a75282f1cf5668ef68aee 100644 (file)
@@ -1,7 +1,9 @@
 set(kv_srcs
   KeyValueDB.cc
   MemDB.cc
-  RocksDBStore.cc)
+  RocksDBStore.cc
+  rocksdb_cache/ShardedCache.cc
+  rocksdb_cache/BinnedLRUCache.cc)
 
 if (WITH_LEVELDB)
   list(APPEND kv_srcs LevelDBStore.cc)
diff --git a/src/kv/RocksDBStore.cc b/src/kv/RocksDBStore.cc
index 222945dd606ff9594b0dc761b971ad4fe2e51710..c13453f0ab7228d419edb9136cb948e78c88e068 100644 (file)
 #include "rocksdb/filter_policy.h"
 #include "rocksdb/utilities/convenience.h"
 #include "rocksdb/merge_operator.h"
+#include "kv/rocksdb_cache/BinnedLRUCache.h"
+
 using std::string;
 #include "common/perf_counters.h"
-#include "common/debug.h"
 #include "common/PriorityCache.h"
 #include "include/str_list.h"
 #include "include/stringify.h"
@@ -389,7 +390,11 @@ int RocksDBStore::load_rocksdb_options(bool create_if_missing, rocksdb::Options&
   uint64_t row_cache_size = cache_size * g_conf->rocksdb_cache_row_ratio;
   uint64_t block_cache_size = cache_size - row_cache_size;
 
-  if (g_conf->rocksdb_cache_type == "lru") {
+  if (g_conf->rocksdb_cache_type == "binned_lru") {
+    bbt_opts.block_cache = rocksdb_cache::NewBinnedLRUCache(
+      block_cache_size,
+      g_conf->rocksdb_cache_shard_bits);
+  } else if (g_conf->rocksdb_cache_type == "lru") {
     bbt_opts.block_cache = rocksdb::NewLRUCache(
       block_cache_size,
       g_conf->rocksdb_cache_shard_bits);
@@ -1235,36 +1240,41 @@ void RocksDBStore::compact_range(const string& start, const string& end)
 int64_t RocksDBStore::request_cache_bytes(PriorityCache::Priority pri, uint64_t chunk_bytes) const
 {
   auto cache = bbt_opts.block_cache;
-  int64_t assigned = get_cache_bytes(pri);
 
+  int64_t assigned = get_cache_bytes(pri);
+  int64_t usage = 0;
+  int64_t request = 0;
   switch (pri) {
   // PRI0 is for rocksdb's high priority items (indexes/filters)
   case PriorityCache::Priority::PRI0:
     {
-      int64_t usage = cache->GetHighPriPoolUsage();
-
-      // RocksDB sometimes flushes the high pri cache when the low priority
-      // cache exceeds the soft cap, so in that case use a "watermark" for 
-      // the usage instead.
-      if (high_pri_watermark > usage) {
-        usage = high_pri_watermark;
+      usage += cache->GetPinnedUsage();
+      if (g_conf->rocksdb_cache_type == "binned_lru") {
+        auto binned_cache =
+            std::static_pointer_cast<rocksdb_cache::BinnedLRUCache>(cache);
+        usage += binned_cache->GetHighPriPoolUsage();
       }
-      dout(10) << __func__ << " high pri pool usage: " << usage << dendl;
-      int64_t request = PriorityCache::get_chunk(usage, chunk_bytes);
-      return (request > assigned) ? request - assigned : 0;
+      break;
     }
   // All other cache items are currently shoved into the LAST priority. 
   case PriorityCache::Priority::LAST:
     { 
-      uint64_t usage = cache->GetUsage() - cache->GetHighPriPoolUsage();
-      dout(10) << __func__ << " low pri pool usage: " << usage << dendl;
-      int64_t request = PriorityCache::get_chunk(usage, chunk_bytes);
-      return (request > assigned) ? request - assigned : 0;
+      usage = get_cache_usage() - cache->GetPinnedUsage(); 
+      if (g_conf->rocksdb_cache_type == "binned_lru") {
+        auto binned_cache =
+            std::static_pointer_cast<rocksdb_cache::BinnedLRUCache>(cache);
+        usage -= binned_cache->GetHighPriPoolUsage();
+      }
+      break;
     }
   default:
     break;
   }
-  return -EOPNOTSUPP;
+  request = PriorityCache::get_chunk(usage, chunk_bytes);
+  request = (request > assigned) ? request - assigned : 0;
+  dout(10) << __func__ << " Priority: " << static_cast<uint32_t>(pri) 
+           << " Usage: " << usage << " Request: " << request << dendl;
+  return request;
 }
 
 int64_t RocksDBStore::get_cache_usage() const
@@ -1274,39 +1284,24 @@ int64_t RocksDBStore::get_cache_usage() const
 
 int64_t RocksDBStore::commit_cache_size()
 {
-  int64_t high_pri_bytes = get_cache_bytes(PriorityCache::Priority::PRI0);
-  int64_t total_bytes = get_cache_bytes();
-
-  double ratio = (double) high_pri_bytes / total_bytes;
   size_t old_bytes = bbt_opts.block_cache->GetCapacity();
+  int64_t total_bytes = get_cache_bytes();
   dout(10) << __func__ << " old: " << old_bytes
-           << ", new: " << total_bytes << dendl;
+           << " new: " << total_bytes << dendl;
   bbt_opts.block_cache->SetCapacity((size_t) total_bytes);
-  set_cache_high_pri_pool_ratio(ratio);
 
-  // After setting the cache sizes, updated the high pri watermark. 
-  int64_t high_pri_pool_usage = bbt_opts.block_cache->GetHighPriPoolUsage();
-  if (high_pri_watermark < high_pri_pool_usage) {
-    high_pri_watermark = high_pri_pool_usage;
-  } else {
-    high_pri_watermark = static_cast<int64_t>(0.90 * high_pri_watermark);
+  // Set the high priority pool ratio if this is the binned LRU cache.
+  if (g_conf->rocksdb_cache_type == "binned_lru") {
+    auto binned_cache =
+        std::static_pointer_cast<rocksdb_cache::BinnedLRUCache>(bbt_opts.block_cache);
+    int64_t high_pri_bytes = get_cache_bytes(PriorityCache::Priority::PRI0);
+    double ratio = (double) high_pri_bytes / total_bytes;
+    dout(10) << __func__ << " High Pri Pool Ratio set to " << ratio << dendl;
+    binned_cache->SetHighPriPoolRatio(ratio);
   }
-
   return total_bytes;
 }
 
-int RocksDBStore::set_cache_high_pri_pool_ratio(double ratio)
-{
-  if (g_conf->rocksdb_cache_type != "lru") {
-    return -EOPNOTSUPP;
-  }
-  dout(10) << __func__ << " old ratio: " 
-          << bbt_opts.block_cache->GetHighPriPoolRatio() << " new ratio: "
-          << ratio << dendl;
-  bbt_opts.block_cache->SetHighPriPoolRatio(ratio);
-  return 0;
-}
-
 int64_t RocksDBStore::get_cache_capacity() {
   return bbt_opts.block_cache->GetCapacity();
 }
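
To make the new accounting above concrete: request_cache_bytes() now derives PRI0 usage from the pinned usage plus, for the binned_lru cache, the high-pri pool usage, and commit_cache_size() turns the PRI0 assignment into a pool ratio. A small standalone sketch of that ratio arithmetic (the byte counts are invented for illustration):

    // Standalone sketch of the commit_cache_size() ratio arithmetic above.
    #include <cstdint>
    #include <cstdio>

    int main() {
      int64_t high_pri_bytes = 256 << 20;   // bytes assigned to PRI0 (indexes/filters)
      int64_t total_bytes    = 1024 << 20;  // total bytes assigned to the block cache
      double ratio = (double) high_pri_bytes / total_bytes;
      // With the binned LRU cache, this fraction of the capacity is then
      // reserved for the high-priority pool via SetHighPriPoolRatio(ratio).
      printf("high pri pool ratio: %.2f\n", ratio);   // 0.25
      return 0;
    }
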
diff --git a/src/kv/RocksDBStore.h b/src/kv/RocksDBStore.h
index 756e74222afbcaea8f7c7f17fe76d76e3bf5c59d..34470d5c963172b97398480509e883ad173231ee 100644 (file)
@@ -488,7 +488,6 @@ err:
   }
 
   int set_cache_capacity(int64_t capacity);
-  int set_cache_high_pri_pool_ratio(double ratio);
   int64_t get_cache_capacity();
 
   WholeSpaceIterator get_wholespace_iterator() override;
diff --git a/src/kv/rocksdb_cache/BinnedLRUCache.cc b/src/kv/rocksdb_cache/BinnedLRUCache.cc
new file mode 100644 (file)
index 0000000..6677d88
--- /dev/null
@@ -0,0 +1,565 @@
+// Copyright (c) 2018-Present Red Hat Inc.  All rights reserved.
+//
+// Copyright (c) 2011-2018, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 and Apache 2.0 License
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include "BinnedLRUCache.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+
+namespace rocksdb_cache {
+
+BinnedLRUHandleTable::BinnedLRUHandleTable() : list_(nullptr), length_(0), elems_(0) {
+  Resize();
+}
+
+BinnedLRUHandleTable::~BinnedLRUHandleTable() {
+  ApplyToAllCacheEntries([](BinnedLRUHandle* h) {
+    if (h->refs == 1) {
+      h->Free();
+    }
+  });
+  delete[] list_;
+}
+
+BinnedLRUHandle* BinnedLRUHandleTable::Lookup(const rocksdb::Slice& key, uint32_t hash) {
+  return *FindPointer(key, hash);
+}
+
+BinnedLRUHandle* BinnedLRUHandleTable::Insert(BinnedLRUHandle* h) {
+  BinnedLRUHandle** ptr = FindPointer(h->key(), h->hash);
+  BinnedLRUHandle* old = *ptr;
+  h->next_hash = (old == nullptr ? nullptr : old->next_hash);
+  *ptr = h;
+  if (old == nullptr) {
+    ++elems_;
+    if (elems_ > length_) {
+      // Since each cache entry is fairly large, we aim for a small
+      // average linked list length (<= 1).
+      Resize();
+    }
+  }
+  return old;
+}
+
+BinnedLRUHandle* BinnedLRUHandleTable::Remove(const rocksdb::Slice& key, uint32_t hash) {
+  BinnedLRUHandle** ptr = FindPointer(key, hash);
+  BinnedLRUHandle* result = *ptr;
+  if (result != nullptr) {
+    *ptr = result->next_hash;
+    --elems_;
+  }
+  return result;
+}
+
+BinnedLRUHandle** BinnedLRUHandleTable::FindPointer(const rocksdb::Slice& key, uint32_t hash) {
+  BinnedLRUHandle** ptr = &list_[hash & (length_ - 1)];
+  while (*ptr != nullptr && ((*ptr)->hash != hash || key != (*ptr)->key())) {
+    ptr = &(*ptr)->next_hash;
+  }
+  return ptr;
+}
+
+void BinnedLRUHandleTable::Resize() {
+  uint32_t new_length = 16;
+  while (new_length < elems_ * 1.5) {
+    new_length *= 2;
+  }
+  BinnedLRUHandle** new_list = new BinnedLRUHandle*[new_length];
+  memset(new_list, 0, sizeof(new_list[0]) * new_length);
+  uint32_t count = 0;
+  for (uint32_t i = 0; i < length_; i++) {
+    BinnedLRUHandle* h = list_[i];
+    while (h != nullptr) {
+      BinnedLRUHandle* next = h->next_hash;
+      uint32_t hash = h->hash;
+      BinnedLRUHandle** ptr = &new_list[hash & (new_length - 1)];
+      h->next_hash = *ptr;
+      *ptr = h;
+      h = next;
+      count++;
+    }
+  }
+  assert(elems_ == count);
+  delete[] list_;
+  list_ = new_list;
+  length_ = new_length;
+}
+
+BinnedLRUCacheShard::BinnedLRUCacheShard(size_t capacity, bool strict_capacity_limit,
+                             double high_pri_pool_ratio)
+    : capacity_(0),
+      high_pri_pool_usage_(0),
+      strict_capacity_limit_(strict_capacity_limit),
+      high_pri_pool_ratio_(high_pri_pool_ratio),
+      high_pri_pool_capacity_(0),
+      usage_(0),
+      lru_usage_(0) {
+  // Make empty circular linked list
+  lru_.next = &lru_;
+  lru_.prev = &lru_;
+  lru_low_pri_ = &lru_;
+  SetCapacity(capacity);
+}
+
+BinnedLRUCacheShard::~BinnedLRUCacheShard() {}
+
+bool BinnedLRUCacheShard::Unref(BinnedLRUHandle* e) {
+  assert(e->refs > 0);
+  e->refs--;
+  return e->refs == 0;
+}
+
+// Call the deleter and free entries that are only referenced by the cache.
+
+void BinnedLRUCacheShard::EraseUnRefEntries() {
+  ceph::autovector<BinnedLRUHandle*> last_reference_list;
+  {
+    std::lock_guard<std::mutex> l(mutex_);
+    while (lru_.next != &lru_) {
+      BinnedLRUHandle* old = lru_.next;
+      assert(old->InCache());
+      assert(old->refs ==
+             1);  // LRU list contains elements which may be evicted
+      LRU_Remove(old);
+      table_.Remove(old->key(), old->hash);
+      old->SetInCache(false);
+      Unref(old);
+      usage_ -= old->charge;
+      last_reference_list.push_back(old);
+    }
+  }
+
+  for (auto entry : last_reference_list) {
+    entry->Free();
+  }
+}
+
+void BinnedLRUCacheShard::ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                                           bool thread_safe) {
+  if (thread_safe) {
+    mutex_.lock();
+  }
+  table_.ApplyToAllCacheEntries(
+      [callback](BinnedLRUHandle* h) { callback(h->value, h->charge); });
+  if (thread_safe) {
+    mutex_.unlock();
+  }
+}
+
+void BinnedLRUCacheShard::TEST_GetLRUList(BinnedLRUHandle** lru, BinnedLRUHandle** lru_low_pri) {
+  *lru = &lru_;
+  *lru_low_pri = lru_low_pri_;
+}
+
+size_t BinnedLRUCacheShard::TEST_GetLRUSize() {
+  BinnedLRUHandle* lru_handle = lru_.next;
+  size_t lru_size = 0;
+  while (lru_handle != &lru_) {
+    lru_size++;
+    lru_handle = lru_handle->next;
+  }
+  return lru_size;
+}
+
+double BinnedLRUCacheShard::GetHighPriPoolRatio() const {
+  std::lock_guard<std::mutex> l(mutex_);
+  return high_pri_pool_ratio_;
+}
+
+size_t BinnedLRUCacheShard::GetHighPriPoolUsage() const {
+  std::lock_guard<std::mutex> l(mutex_);
+  return high_pri_pool_usage_;
+}
+
+void BinnedLRUCacheShard::LRU_Remove(BinnedLRUHandle* e) {
+  assert(e->next != nullptr);
+  assert(e->prev != nullptr);
+  if (lru_low_pri_ == e) {
+    lru_low_pri_ = e->prev;
+  }
+  e->next->prev = e->prev;
+  e->prev->next = e->next;
+  e->prev = e->next = nullptr;
+  lru_usage_ -= e->charge;
+  if (e->InHighPriPool()) {
+    assert(high_pri_pool_usage_ >= e->charge);
+    high_pri_pool_usage_ -= e->charge;
+  }
+}
+
+void BinnedLRUCacheShard::LRU_Insert(BinnedLRUHandle* e) {
+  assert(e->next == nullptr);
+  assert(e->prev == nullptr);
+  if (high_pri_pool_ratio_ > 0 && (e->IsHighPri() || e->HasHit())) {
+    // Insert "e" at the head of the LRU list.
+    e->next = &lru_;
+    e->prev = lru_.prev;
+    e->prev->next = e;
+    e->next->prev = e;
+    e->SetInHighPriPool(true);
+    high_pri_pool_usage_ += e->charge;
+    MaintainPoolSize();
+  } else {
+    // Insert "e" to the head of low-pri pool. Note that when
+    // high_pri_pool_ratio is 0, head of low-pri pool is also head of LRU list.
+    e->next = lru_low_pri_->next;
+    e->prev = lru_low_pri_;
+    e->prev->next = e;
+    e->next->prev = e;
+    e->SetInHighPriPool(false);
+    lru_low_pri_ = e;
+  }
+  lru_usage_ += e->charge;
+}
+
+void BinnedLRUCacheShard::MaintainPoolSize() {
+  while (high_pri_pool_usage_ > high_pri_pool_capacity_) {
+    // Overflow last entry in high-pri pool to low-pri pool.
+    lru_low_pri_ = lru_low_pri_->next;
+    assert(lru_low_pri_ != &lru_);
+    lru_low_pri_->SetInHighPriPool(false);
+    high_pri_pool_usage_ -= lru_low_pri_->charge;
+  }
+}
+
+void BinnedLRUCacheShard::EvictFromLRU(size_t charge,
+                                 ceph::autovector<BinnedLRUHandle*>* deleted) {
+  while (usage_ + charge > capacity_ && lru_.next != &lru_) {
+    BinnedLRUHandle* old = lru_.next;
+    assert(old->InCache());
+    assert(old->refs == 1);  // LRU list contains elements which may be evicted
+    LRU_Remove(old);
+    table_.Remove(old->key(), old->hash);
+    old->SetInCache(false);
+    Unref(old);
+    usage_ -= old->charge;
+    deleted->push_back(old);
+  }
+}
+
+void BinnedLRUCacheShard::SetCapacity(size_t capacity) {
+  ceph::autovector<BinnedLRUHandle*> last_reference_list;
+  {
+    std::lock_guard<std::mutex> l(mutex_);
+    capacity_ = capacity;
+    high_pri_pool_capacity_ = capacity_ * high_pri_pool_ratio_;
+    EvictFromLRU(0, &last_reference_list);
+  }
+  // we free the entries here outside of mutex for
+  // performance reasons
+  for (auto entry : last_reference_list) {
+    entry->Free();
+  }
+}
+
+void BinnedLRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
+  std::lock_guard<std::mutex> l(mutex_);
+  strict_capacity_limit_ = strict_capacity_limit;
+}
+
+rocksdb::Cache::Handle* BinnedLRUCacheShard::Lookup(const rocksdb::Slice& key, uint32_t hash) {
+  std::lock_guard<std::mutex> l(mutex_);
+  BinnedLRUHandle* e = table_.Lookup(key, hash);
+  if (e != nullptr) {
+    assert(e->InCache());
+    if (e->refs == 1) {
+      LRU_Remove(e);
+    }
+    e->refs++;
+    e->SetHit();
+  }
+  return reinterpret_cast<rocksdb::Cache::Handle*>(e);
+}
+
+bool BinnedLRUCacheShard::Ref(rocksdb::Cache::Handle* h) {
+  BinnedLRUHandle* handle = reinterpret_cast<BinnedLRUHandle*>(h);
+  std::lock_guard<std::mutex> l(mutex_);
+  if (handle->InCache() && handle->refs == 1) {
+    LRU_Remove(handle);
+  }
+  handle->refs++;
+  return true;
+}
+
+void BinnedLRUCacheShard::SetHighPriPoolRatio(double high_pri_pool_ratio) {
+  std::lock_guard<std::mutex> l(mutex_);
+  high_pri_pool_ratio_ = high_pri_pool_ratio;
+  high_pri_pool_capacity_ = capacity_ * high_pri_pool_ratio_;
+  MaintainPoolSize();
+}
+
+bool BinnedLRUCacheShard::Release(rocksdb::Cache::Handle* handle, bool force_erase) {
+  if (handle == nullptr) {
+    return false;
+  }
+  BinnedLRUHandle* e = reinterpret_cast<BinnedLRUHandle*>(handle);
+  bool last_reference = false;
+  {
+    std::lock_guard<std::mutex> l(mutex_);
+    last_reference = Unref(e);
+    if (last_reference) {
+      usage_ -= e->charge;
+    }
+    if (e->refs == 1 && e->InCache()) {
+      // The item is still in cache, and nobody else holds a reference to it
+      if (usage_ > capacity_ || force_erase) {
+        // the cache is full
+        // The LRU list must be empty since the cache is full
+        assert(!(usage_ > capacity_) || lru_.next == &lru_);
+        // take this opportunity and remove the item
+        table_.Remove(e->key(), e->hash);
+        e->SetInCache(false);
+        Unref(e);
+        usage_ -= e->charge;
+        last_reference = true;
+      } else {
+        // put the item on the list to be potentially freed
+        LRU_Insert(e);
+      }
+    }
+  }
+
+  // free outside of mutex
+  if (last_reference) {
+    e->Free();
+  }
+  return last_reference;
+}
+
+rocksdb::Status BinnedLRUCacheShard::Insert(const rocksdb::Slice& key, uint32_t hash, void* value,
+                             size_t charge,
+                             void (*deleter)(const rocksdb::Slice& key, void* value),
+                             rocksdb::Cache::Handle** handle, rocksdb::Cache::Priority priority) {
+  // Allocate the memory here outside of the mutex
+  // If the cache is full, we'll have to release it
+  // It shouldn't happen very often though.
+  BinnedLRUHandle* e = reinterpret_cast<BinnedLRUHandle*>(
+      new char[sizeof(BinnedLRUHandle) - 1 + key.size()]);
+  rocksdb::Status s;
+  ceph::autovector<BinnedLRUHandle*> last_reference_list;
+
+  e->value = value;
+  e->deleter = deleter;
+  e->charge = charge;
+  e->key_length = key.size();
+  e->flags = 0;
+  e->hash = hash;
+  e->refs = (handle == nullptr
+                 ? 1
+                 : 2);  // One from BinnedLRUCache, one for the returned handle
+  e->next = e->prev = nullptr;
+  e->SetInCache(true);
+  e->SetPriority(priority);
+  memcpy(e->key_data, key.data(), key.size());
+
+  {
+    std::lock_guard<std::mutex> l(mutex_);
+    // Free the space following strict LRU policy until enough space
+    // is freed or the lru list is empty
+    EvictFromLRU(charge, &last_reference_list);
+
+    if (usage_ - lru_usage_ + charge > capacity_ &&
+        (strict_capacity_limit_ || handle == nullptr)) {
+      if (handle == nullptr) {
+        // Don't insert the entry but still return ok, as if the entry were
+        // inserted into the cache and evicted immediately.
+        last_reference_list.push_back(e);
+      } else {
+        delete[] reinterpret_cast<char*>(e);
+        *handle = nullptr;
+        s = rocksdb::Status::Incomplete("Insert failed due to LRU cache being full.");
+      }
+    } else {
+      // insert into the cache
+      // note that the cache might get larger than its capacity if not enough
+      // space was freed
+      BinnedLRUHandle* old = table_.Insert(e);
+      usage_ += e->charge;
+      if (old != nullptr) {
+        old->SetInCache(false);
+        if (Unref(old)) {
+          usage_ -= old->charge;
+          // old is on the LRU list because it was in the cache and its
+          // reference count was just 1 (Unref returned true)
+          LRU_Remove(old);
+          last_reference_list.push_back(old);
+        }
+      }
+      if (handle == nullptr) {
+        LRU_Insert(e);
+      } else {
+        *handle = reinterpret_cast<rocksdb::Cache::Handle*>(e);
+      }
+      s = rocksdb::Status::OK();
+    }
+  }
+
+  // we free the entries here outside of mutex for
+  // performance reasons
+  for (auto entry : last_reference_list) {
+    entry->Free();
+  }
+
+  return s;
+}
+
+void BinnedLRUCacheShard::Erase(const rocksdb::Slice& key, uint32_t hash) {
+  BinnedLRUHandle* e;
+  bool last_reference = false;
+  {
+    std::lock_guard<std::mutex> l(mutex_);
+    e = table_.Remove(key, hash);
+    if (e != nullptr) {
+      last_reference = Unref(e);
+      if (last_reference) {
+        usage_ -= e->charge;
+      }
+      if (last_reference && e->InCache()) {
+        LRU_Remove(e);
+      }
+      e->SetInCache(false);
+    }
+  }
+
+  // mutex not held here
+  // last_reference will only be true if e != nullptr
+  if (last_reference) {
+    e->Free();
+  }
+}
+
+size_t BinnedLRUCacheShard::GetUsage() const {
+  std::lock_guard<std::mutex> l(mutex_);
+  return usage_;
+}
+
+size_t BinnedLRUCacheShard::GetPinnedUsage() const {
+  std::lock_guard<std::mutex> l(mutex_);
+  assert(usage_ >= lru_usage_);
+  return usage_ - lru_usage_;
+}
+
+std::string BinnedLRUCacheShard::GetPrintableOptions() const {
+  const int kBufferSize = 200;
+  char buffer[kBufferSize];
+  {
+    std::lock_guard<std::mutex> l(mutex_);
+    snprintf(buffer, kBufferSize, "    high_pri_pool_ratio: %.3lf\n",
+             high_pri_pool_ratio_);
+  }
+  return std::string(buffer);
+}
+
+BinnedLRUCache::BinnedLRUCache(size_t capacity, int num_shard_bits,
+                   bool strict_capacity_limit, double high_pri_pool_ratio)
+    : ShardedCache(capacity, num_shard_bits, strict_capacity_limit) {
+  num_shards_ = 1 << num_shard_bits;
+  // TODO: Switch over to use mempool
+  int rc = posix_memalign((void**) &shards_, 
+                          CACHE_LINE_SIZE, 
+                          sizeof(BinnedLRUCacheShard) * num_shards_);
+  if (rc != 0) {
+    throw std::bad_alloc();
+  } 
+  size_t per_shard = (capacity + (num_shards_ - 1)) / num_shards_;
+  for (int i = 0; i < num_shards_; i++) {
+    new (&shards_[i])
+        BinnedLRUCacheShard(per_shard, strict_capacity_limit, high_pri_pool_ratio);
+  }
+}
+
+BinnedLRUCache::~BinnedLRUCache() {
+  for (int i = 0; i < num_shards_; i++) {
+    shards_[i].~BinnedLRUCacheShard();
+  }
+  free(shards_);
+}
+
+CacheShard* BinnedLRUCache::GetShard(int shard) {
+  return reinterpret_cast<CacheShard*>(&shards_[shard]);
+}
+
+const CacheShard* BinnedLRUCache::GetShard(int shard) const {
+  return reinterpret_cast<CacheShard*>(&shards_[shard]);
+}
+
+void* BinnedLRUCache::Value(Handle* handle) {
+  return reinterpret_cast<const BinnedLRUHandle*>(handle)->value;
+}
+
+size_t BinnedLRUCache::GetCharge(Handle* handle) const {
+  return reinterpret_cast<const BinnedLRUHandle*>(handle)->charge;
+}
+
+uint32_t BinnedLRUCache::GetHash(Handle* handle) const {
+  return reinterpret_cast<const BinnedLRUHandle*>(handle)->hash;
+}
+
+void BinnedLRUCache::DisownData() {
+// Do not drop data if compiled with ASAN, to suppress the leak warning.
+#ifndef __SANITIZE_ADDRESS__
+  shards_ = nullptr;
+#endif  // !__SANITIZE_ADDRESS__
+}
+
+size_t BinnedLRUCache::TEST_GetLRUSize() {
+  size_t lru_size_of_all_shards = 0;
+  for (int i = 0; i < num_shards_; i++) {
+    lru_size_of_all_shards += shards_[i].TEST_GetLRUSize();
+  }
+  return lru_size_of_all_shards;
+}
+
+void BinnedLRUCache::SetHighPriPoolRatio(double high_pri_pool_ratio) {
+  for (int i = 0; i < num_shards_; i++) {
+    shards_[i].SetHighPriPoolRatio(high_pri_pool_ratio);
+  }
+}
+
+double BinnedLRUCache::GetHighPriPoolRatio() const {
+  double result = 0.0;
+  if (num_shards_ > 0) {
+    result = shards_[0].GetHighPriPoolRatio();
+  }
+  return result;
+}
+
+size_t BinnedLRUCache::GetHighPriPoolUsage() const {
+  // We will not lock the cache when getting the usage from shards.
+  size_t usage = 0;
+  for (int s = 0; s < num_shards_; s++) {
+    usage += shards_[s].GetHighPriPoolUsage();
+  }
+  return usage;
+}
+
+std::shared_ptr<rocksdb::Cache> NewBinnedLRUCache(size_t capacity, int num_shard_bits,
+                                   bool strict_capacity_limit,
+                                   double high_pri_pool_ratio) {
+  if (num_shard_bits >= 20) {
+    return nullptr;  // the cache cannot be sharded into too many fine pieces
+  }
+  if (high_pri_pool_ratio < 0.0 || high_pri_pool_ratio > 1.0) {
+    // invalid high_pri_pool_ratio
+    return nullptr;
+  }
+  if (num_shard_bits < 0) {
+    num_shard_bits = GetDefaultCacheShardBits(capacity);
+  }
+  return std::make_shared<BinnedLRUCache>(capacity, num_shard_bits,
+                                    strict_capacity_limit, high_pri_pool_ratio);
+}
+
+}  // namespace rocksdb_cache
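
A hedged usage sketch of the cache defined above, through the generic rocksdb::Cache interface (assumes it is compiled inside the Ceph tree so the headers resolve; it is not part of the commit):

    // Illustration only: exercise BinnedLRUCache through rocksdb::Cache.
    #include <cassert>
    #include <iostream>
    #include <memory>
    #include "kv/rocksdb_cache/BinnedLRUCache.h"

    static void noop_deleter(const rocksdb::Slice&, void*) {}

    int main() {
      // 1 MiB cache, default shard bits, 20% of capacity reserved for high-pri.
      auto cache = rocksdb_cache::NewBinnedLRUCache(1 << 20, -1, false, 0.2);
      assert(cache != nullptr);

      static int payload = 42;
      rocksdb::Cache::Handle* h = nullptr;
      rocksdb::Status s = cache->Insert("key", &payload, sizeof(payload),
                                        noop_deleter, &h,
                                        rocksdb::Cache::Priority::HIGH);
      assert(s.ok());
      cache->Release(h);              // entry stays cached and lands in the high-pri pool

      h = cache->Lookup("key");
      if (h != nullptr) {
        std::cout << *static_cast<int*>(cache->Value(h)) << "\n";   // 42
        cache->Release(h);
      }

      auto binned = std::static_pointer_cast<rocksdb_cache::BinnedLRUCache>(cache);
      std::cout << "high-pri pool usage: " << binned->GetHighPriPoolUsage()
                << " of " << cache->GetUsage() << " bytes\n";
      return 0;
    }
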
diff --git a/src/kv/rocksdb_cache/BinnedLRUCache.h b/src/kv/rocksdb_cache/BinnedLRUCache.h
new file mode 100644 (file)
index 0000000..81cf06b
--- /dev/null
@@ -0,0 +1,319 @@
+// Copyright (c) 2018-Present Red Hat Inc.  All rights reserved.
+//
+// Copyright (c) 2011-2018, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 and Apache 2.0 License
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_BINNED_LRU_CACHE
+#define ROCKSDB_BINNED_LRU_CACHE
+
+#include <string>
+#include <mutex>
+
+#include "ShardedCache.h"
+
+#include "common/autovector.h"
+
+namespace rocksdb_cache {
+
+// LRU cache implementation
+
+// An entry is a variable length heap-allocated structure.
+// Entries are referenced by cache and/or by any external entity.
+// The cache keeps all its entries in table. Some elements
+// are also stored on LRU list.
+//
+// BinnedLRUHandle can be in these states:
+// 1. Referenced externally AND in hash table.
+//  In that case the entry is *not* in the LRU. (refs > 1 && in_cache == true)
+// 2. Not referenced externally and in hash table. In that case the entry is
+// in the LRU and can be freed. (refs == 1 && in_cache == true)
+// 3. Referenced externally and not in hash table. In that case the entry is
+// not on the LRU list and not in the table. (refs >= 1 && in_cache == false)
+//
+// All newly created BinnedLRUHandles are in state 1. If you call
+// BinnedLRUCacheShard::Release
+// on entry in state 1, it will go into state 2. To move from state 1 to
+// state 3, either call BinnedLRUCacheShard::Erase or BinnedLRUCacheShard::Insert with the
+// same key.
+// To move from state 2 to state 1, use BinnedLRUCacheShard::Lookup.
+// Before destruction, make sure that no handles are in state 1. This means
+// that any successful BinnedLRUCacheShard::Lookup/BinnedLRUCacheShard::Insert
+// has a matching BinnedLRUCacheShard::Release (to move into state 2) or
+// BinnedLRUCacheShard::Erase (for state 3).
+
+std::shared_ptr<rocksdb::Cache> NewBinnedLRUCache(
+    size_t capacity,
+    int num_shard_bits = -1,
+    bool strict_capacity_limit = false,
+    double high_pri_pool_ratio = 0.0);
+
+struct BinnedLRUHandle {
+  void* value;
+  void (*deleter)(const rocksdb::Slice&, void* value);
+  BinnedLRUHandle* next_hash;
+  BinnedLRUHandle* next;
+  BinnedLRUHandle* prev;
+  size_t charge;  // TODO(opt): Only allow uint32_t?
+  size_t key_length;
+  uint32_t refs;     // the number of refs to this entry;
+                     // the cache itself is counted as 1
+
+  // Include the following flags:
+  //   in_cache:    whether this entry is referenced by the hash table.
+  //   is_high_pri: whether this entry is a high priority entry.
+  //   in_high_pri_pool: whether this entry is in high-pri pool.
+  char flags;
+
+  uint32_t hash;     // Hash of key(); used for fast sharding and comparisons
+
+  char key_data[1];  // Beginning of key
+
+  rocksdb::Slice key() const {
+    // For cheaper lookups, we allow a temporary Handle object
+    // to store a pointer to a key in "value".
+    if (next == this) {
+      return *(reinterpret_cast<rocksdb::Slice*>(value));
+    } else {
+      return rocksdb::Slice(key_data, key_length);
+    }
+  }
+
+  bool InCache() { return flags & 1; }
+  bool IsHighPri() { return flags & 2; }
+  bool InHighPriPool() { return flags & 4; }
+  bool HasHit() { return flags & 8; }
+
+  void SetInCache(bool in_cache) {
+    if (in_cache) {
+      flags |= 1;
+    } else {
+      flags &= ~1;
+    }
+  }
+
+  void SetPriority(rocksdb::Cache::Priority priority) {
+    if (priority == rocksdb::Cache::Priority::HIGH) {
+      flags |= 2;
+    } else {
+      flags &= ~2;
+    }
+  }
+
+  void SetInHighPriPool(bool in_high_pri_pool) {
+    if (in_high_pri_pool) {
+      flags |= 4;
+    } else {
+      flags &= ~4;
+    }
+  }
+
+  void SetHit() { flags |= 8; }
+
+  void Free() {
+    assert((refs == 1 && InCache()) || (refs == 0 && !InCache()));
+    if (deleter) {
+      (*deleter)(key(), value);
+    }
+    delete[] reinterpret_cast<char*>(this);
+  }
+};
+
+// We provide our own simple hash table since it removes a whole bunch
+// of porting hacks and is also faster than some of the built-in hash
+// table implementations in some of the compiler/runtime combinations
+// we have tested.  E.g., readrandom speeds up by ~5% over the g++
+// 4.4.3's builtin hashtable.
+class BinnedLRUHandleTable {
+ public:
+  BinnedLRUHandleTable();
+  ~BinnedLRUHandleTable();
+
+  BinnedLRUHandle* Lookup(const rocksdb::Slice& key, uint32_t hash);
+  BinnedLRUHandle* Insert(BinnedLRUHandle* h);
+  BinnedLRUHandle* Remove(const rocksdb::Slice& key, uint32_t hash);
+
+  template <typename T>
+  void ApplyToAllCacheEntries(T func) {
+    for (uint32_t i = 0; i < length_; i++) {
+      BinnedLRUHandle* h = list_[i];
+      while (h != nullptr) {
+        auto n = h->next_hash;
+        assert(h->InCache());
+        func(h);
+        h = n;
+      }
+    }
+  }
+
+ private:
+  // Return a pointer to slot that points to a cache entry that
+  // matches key/hash.  If there is no such cache entry, return a
+  // pointer to the trailing slot in the corresponding linked list.
+  BinnedLRUHandle** FindPointer(const rocksdb::Slice& key, uint32_t hash);
+
+  void Resize();
+
+  // The table consists of an array of buckets where each bucket is
+  // a linked list of cache entries that hash into the bucket.
+  BinnedLRUHandle** list_;
+  uint32_t length_;
+  uint32_t elems_;
+};
+
+// A single shard of sharded cache.
+class alignas(CACHE_LINE_SIZE) BinnedLRUCacheShard : public CacheShard {
+ public:
+  BinnedLRUCacheShard(size_t capacity, bool strict_capacity_limit,
+                double high_pri_pool_ratio);
+  virtual ~BinnedLRUCacheShard();
+
+  // Separate from constructor so the caller can easily make an array of
+  // BinnedLRUCacheShard. If current usage is more than the new capacity, the
+  // function will attempt to free the needed space.
+  virtual void SetCapacity(size_t capacity) override;
+
+  // Set the flag to reject insertion if the cache is full.
+  virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override;
+
+  // Set percentage of capacity reserved for high-pri cache entries.
+  void SetHighPriPoolRatio(double high_pri_pool_ratio);
+
+  // Like Cache methods, but with an extra "hash" parameter.
+  virtual rocksdb::Status Insert(const rocksdb::Slice& key, uint32_t hash, void* value,
+                        size_t charge,
+                        void (*deleter)(const rocksdb::Slice& key, void* value),
+                        rocksdb::Cache::Handle** handle,
+                        rocksdb::Cache::Priority priority) override;
+  virtual rocksdb::Cache::Handle* Lookup(const rocksdb::Slice& key, uint32_t hash) override;
+  virtual bool Ref(rocksdb::Cache::Handle* handle) override;
+  virtual bool Release(rocksdb::Cache::Handle* handle,
+                       bool force_erase = false) override;
+  virtual void Erase(const rocksdb::Slice& key, uint32_t hash) override;
+
+  // Although in some platforms the update of size_t is atomic, to make sure
+  // GetUsage() and GetPinnedUsage() work correctly under any platform, we'll
+  // protect them with mutex_.
+
+  virtual size_t GetUsage() const override;
+  virtual size_t GetPinnedUsage() const override;
+
+  virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                                      bool thread_safe) override;
+
+  virtual void EraseUnRefEntries() override;
+
+  virtual std::string GetPrintableOptions() const override;
+
+  void TEST_GetLRUList(BinnedLRUHandle** lru, BinnedLRUHandle** lru_low_pri);
+
+  //  Retrieves the number of elements in the LRU, for unit test purposes only;
+  //  not thread safe
+  size_t TEST_GetLRUSize();
+
+  //  Retrieves the high pri pool ratio
+  double GetHighPriPoolRatio() const;
+
+  // Retrieves the high pri pool usage
+  size_t GetHighPriPoolUsage() const;
+
+ private:
+  void LRU_Remove(BinnedLRUHandle* e);
+  void LRU_Insert(BinnedLRUHandle* e);
+
+  // Overflow the last entry in the high-pri pool to the low-pri pool until
+  // the size of the high-pri pool is no larger than the size specified by
+  // high_pri_pool_ratio_.
+  void MaintainPoolSize();
+
+  // Just reduce the reference count by 1.
+  // Returns true if this was the last reference.
+  bool Unref(BinnedLRUHandle* e);
+
+  // Free some space following strict LRU policy until enough space
+  // to hold (usage_ + charge) is freed or the LRU list is empty.
+  // This function is not thread safe; it needs to be executed while
+  // holding mutex_.
+  void EvictFromLRU(size_t charge, ceph::autovector<BinnedLRUHandle*>* deleted);
+
+  // Initialized before use.
+  size_t capacity_;
+
+  // Memory size for entries in high-pri pool.
+  size_t high_pri_pool_usage_;
+
+  // Whether to reject insertion if cache reaches its full capacity.
+  bool strict_capacity_limit_;
+
+  // Ratio of capacity reserved for high priority cache entries.
+  double high_pri_pool_ratio_;
+
+  // High-pri pool size, equals to capacity * high_pri_pool_ratio.
+  // Remember the value to avoid recomputing each time.
+  double high_pri_pool_capacity_;
+
+  // Dummy head of LRU list.
+  // lru.prev is newest entry, lru.next is oldest entry.
+  // LRU contains items which can be evicted, i.e. referenced only by the cache.
+  BinnedLRUHandle lru_;
+
+  // Pointer to head of low-pri pool in LRU list.
+  BinnedLRUHandle* lru_low_pri_;
+
+  // ------------^^^^^^^^^^^^^-----------
+  // Not frequently modified data members
+  // ------------------------------------
+  //
+  // We separate data members that are updated frequently from the ones that
+  // are not frequently updated so that they don't share the same cache line,
+  // which would lead to false sharing.
+  //
+  // ------------------------------------
+  // Frequently modified data members
+  // ------------vvvvvvvvvvvvv-----------
+  BinnedLRUHandleTable table_;
+
+  // Memory size for entries residing in the cache
+  size_t usage_;
+
+  // Memory size for entries residing only in the LRU list
+  size_t lru_usage_;
+
+  // mutex_ protects the following state.
+  // We don't count mutex_ as the cache's internal state so semantically we
+  // don't mind mutex_ invoking the non-const actions.
+  mutable std::mutex mutex_;
+};
+
+class BinnedLRUCache : public ShardedCache {
+ public:
+  BinnedLRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+           double high_pri_pool_ratio);
+  virtual ~BinnedLRUCache();
+  virtual const char* Name() const override { return "BinnedLRUCache"; }
+  virtual CacheShard* GetShard(int shard) override;
+  virtual const CacheShard* GetShard(int shard) const override;
+  virtual void* Value(Handle* handle) override;
+  virtual size_t GetCharge(Handle* handle) const override;
+  virtual uint32_t GetHash(Handle* handle) const override;
+  virtual void DisownData() override;
+
+  //  Retrieves the number of elements in the LRU, for unit test purposes only
+  size_t TEST_GetLRUSize();
+  // Sets the high pri pool ratio
+  void SetHighPriPoolRatio(double high_pri_pool_ratio);
+  //  Retrieves the high pri pool ratio
+  double GetHighPriPoolRatio() const;
+  // Retrieves high pri pool usage
+  size_t GetHighPriPoolUsage() const;
+
+ private:
+  BinnedLRUCacheShard* shards_;
+  int num_shards_ = 0;
+};
+
+}  // namespace rocksdb_cache
+
+#endif // ROCKSDB_BINNED_LRU_CACHE
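
The handle state machine documented in the header above can be traced end to end with the public interface; a minimal sketch (same in-tree include assumption as before, not part of the commit):

    // Illustration of the BinnedLRUHandle states described in the header above.
    #include <memory>
    #include "kv/rocksdb_cache/BinnedLRUCache.h"

    static void noop_deleter(const rocksdb::Slice&, void*) {}

    int main() {
      auto cache = rocksdb_cache::NewBinnedLRUCache(1 << 20);
      static int value = 0;
      rocksdb::Cache::Handle* h = nullptr;

      cache->Insert("k", &value, sizeof(value), noop_deleter, &h);
                               // state 1: referenced externally AND in the hash table
      cache->Release(h);       // state 2: only the cache references it; it sits on the LRU
      h = cache->Lookup("k");  // back to state 1
      cache->Erase("k");       // state 3: referenced externally, no longer in the table
      cache->Release(h);       // last reference dropped; the entry is freed
      return 0;
    }
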
diff --git a/src/kv/rocksdb_cache/ShardedCache.cc b/src/kv/rocksdb_cache/ShardedCache.cc
new file mode 100644 (file)
index 0000000..367140a
--- /dev/null
@@ -0,0 +1,159 @@
+// Copyright (c) 2018-Present Red Hat Inc.  All rights reserved.
+//
+// Copyright (c) 2011-2018, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 and Apache 2.0 License
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include "ShardedCache.h"
+
+#include <string>
+
+namespace rocksdb_cache {
+
+ShardedCache::ShardedCache(size_t capacity, int num_shard_bits,
+                           bool strict_capacity_limit)
+    : num_shard_bits_(num_shard_bits),
+      capacity_(capacity),
+      strict_capacity_limit_(strict_capacity_limit),
+      last_id_(1) {}
+
+void ShardedCache::SetCapacity(size_t capacity) {
+  int num_shards = 1 << num_shard_bits_;
+  const size_t per_shard = (capacity + (num_shards - 1)) / num_shards;
+  std::lock_guard<std::mutex> l(capacity_mutex_);
+  for (int s = 0; s < num_shards; s++) {
+    GetShard(s)->SetCapacity(per_shard);
+  }
+  capacity_ = capacity;
+}
+
+void ShardedCache::SetStrictCapacityLimit(bool strict_capacity_limit) {
+  int num_shards = 1 << num_shard_bits_;
+  std::lock_guard<std::mutex> l(capacity_mutex_);
+  for (int s = 0; s < num_shards; s++) {
+    GetShard(s)->SetStrictCapacityLimit(strict_capacity_limit);
+  }
+  strict_capacity_limit_ = strict_capacity_limit;
+}
+
+rocksdb::Status ShardedCache::Insert(const rocksdb::Slice& key, void* value, size_t charge,
+                            void (*deleter)(const rocksdb::Slice& key, void* value),
+                            rocksdb::Cache::Handle** handle, Priority priority) {
+  uint32_t hash = HashSlice(key);
+  return GetShard(Shard(hash))
+      ->Insert(key, hash, value, charge, deleter, handle, priority);
+}
+
+rocksdb::Cache::Handle* ShardedCache::Lookup(const rocksdb::Slice& key, rocksdb::Statistics* /*stats*/) {
+  uint32_t hash = HashSlice(key);
+  return GetShard(Shard(hash))->Lookup(key, hash);
+}
+
+bool ShardedCache::Ref(rocksdb::Cache::Handle* handle) {
+  uint32_t hash = GetHash(handle);
+  return GetShard(Shard(hash))->Ref(handle);
+}
+
+bool ShardedCache::Release(rocksdb::Cache::Handle* handle, bool force_erase) {
+  uint32_t hash = GetHash(handle);
+  return GetShard(Shard(hash))->Release(handle, force_erase);
+}
+
+void ShardedCache::Erase(const rocksdb::Slice& key) {
+  uint32_t hash = HashSlice(key);
+  GetShard(Shard(hash))->Erase(key, hash);
+}
+
+uint64_t ShardedCache::NewId() {
+  return last_id_.fetch_add(1, std::memory_order_relaxed);
+}
+
+size_t ShardedCache::GetCapacity() const {
+  std::lock_guard<std::mutex> l(capacity_mutex_);
+  return capacity_;
+}
+
+bool ShardedCache::HasStrictCapacityLimit() const {
+  std::lock_guard<std::mutex> l(capacity_mutex_);
+  return strict_capacity_limit_;
+}
+
+size_t ShardedCache::GetUsage() const {
+  // We will not lock the cache when getting the usage from shards.
+  int num_shards = 1 << num_shard_bits_;
+  size_t usage = 0;
+  for (int s = 0; s < num_shards; s++) {
+    usage += GetShard(s)->GetUsage();
+  }
+  return usage;
+}
+
+size_t ShardedCache::GetUsage(rocksdb::Cache::Handle* handle) const {
+  return GetCharge(handle);
+}
+
+size_t ShardedCache::GetPinnedUsage() const {
+  // We will not lock the cache when getting the usage from shards.
+  int num_shards = 1 << num_shard_bits_;
+  size_t usage = 0;
+  for (int s = 0; s < num_shards; s++) {
+    usage += GetShard(s)->GetPinnedUsage();
+  }
+  return usage;
+}
+
+void ShardedCache::ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                                          bool thread_safe) {
+  int num_shards = 1 << num_shard_bits_;
+  for (int s = 0; s < num_shards; s++) {
+    GetShard(s)->ApplyToAllCacheEntries(callback, thread_safe);
+  }
+}
+
+void ShardedCache::EraseUnRefEntries() {
+  int num_shards = 1 << num_shard_bits_;
+  for (int s = 0; s < num_shards; s++) {
+    GetShard(s)->EraseUnRefEntries();
+  }
+}
+
+std::string ShardedCache::GetPrintableOptions() const {
+  std::string ret;
+  ret.reserve(20000);
+  const int kBufferSize = 200;
+  char buffer[kBufferSize];
+  {
+    std::lock_guard<std::mutex> l(capacity_mutex_);
+    snprintf(buffer, kBufferSize, "    capacity : %" ROCKSDB_PRIszt "\n",
+             capacity_);
+    ret.append(buffer);
+    snprintf(buffer, kBufferSize, "    num_shard_bits : %d\n", num_shard_bits_);
+    ret.append(buffer);
+    snprintf(buffer, kBufferSize, "    strict_capacity_limit : %d\n",
+             strict_capacity_limit_);
+    ret.append(buffer);
+  }
+  ret.append(GetShard(0)->GetPrintableOptions());
+  return ret;
+}
+
+int GetDefaultCacheShardBits(size_t capacity) {
+  int num_shard_bits = 0;
+  size_t min_shard_size = 512L * 1024L;  // Every shard is at least 512KB.
+  size_t num_shards = capacity / min_shard_size;
+  while (num_shards >>= 1) {
+    if (++num_shard_bits >= 6) {
+      // No more than 6.
+      return num_shard_bits;
+    }
+  }
+  return num_shard_bits;
+}
+
+}  // namespace rocksdb_cache
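
GetDefaultCacheShardBits() above sizes every shard at no less than 512 KiB and caps the shard count at 2^6 = 64. A standalone copy of that arithmetic with a couple of worked values (illustration only):

    // Standalone copy of the shard-bit heuristic above, for illustration only.
    #include <cstddef>
    #include <cstdio>

    static int default_cache_shard_bits(size_t capacity) {
      int num_shard_bits = 0;
      size_t min_shard_size = 512L * 1024L;      // every shard is at least 512 KiB
      size_t num_shards = capacity / min_shard_size;
      while (num_shards >>= 1) {
        if (++num_shard_bits >= 6) {
          return num_shard_bits;                 // capped at 64 shards
        }
      }
      return num_shard_bits;
    }

    int main() {
      printf("%d\n", default_cache_shard_bits(8UL << 20));   // 8 MiB -> 4 (16 shards)
      printf("%d\n", default_cache_shard_bits(1UL << 30));   // 1 GiB -> 6 (64 shards)
      return 0;
    }
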
diff --git a/src/kv/rocksdb_cache/ShardedCache.h b/src/kv/rocksdb_cache/ShardedCache.h
new file mode 100644 (file)
index 0000000..e8e53b2
--- /dev/null
@@ -0,0 +1,111 @@
+// Copyright (c) 2018-Present Red Hat Inc.  All rights reserved.
+//
+// Copyright (c) 2011-2018, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 and Apache 2.0 License
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_SHARDED_CACHE
+#define ROCKSDB_SHARDED_CACHE
+
+#include <atomic>
+#include <string>
+#include <mutex>
+
+#include "rocksdb/cache.h"
+#include "include/ceph_hash.h"
+//#include "hash.h"
+
+#ifndef CACHE_LINE_SIZE
+#define CACHE_LINE_SIZE 64 // XXX arch-specific define 
+#endif
+#define ROCKSDB_PRIszt "zu"
+
+namespace rocksdb_cache {
+
+// Single cache shard interface.
+class CacheShard {
+ public:
+  CacheShard() = default;
+  virtual ~CacheShard() = default;
+
+  virtual rocksdb::Status Insert(const rocksdb::Slice& key, uint32_t hash, void* value,
+                                 size_t charge,
+                                 void (*deleter)(const rocksdb::Slice& key, void* value),
+                                 rocksdb::Cache::Handle** handle, rocksdb::Cache::Priority priority) = 0;
+  virtual rocksdb::Cache::Handle* Lookup(const rocksdb::Slice& key, uint32_t hash) = 0;
+  virtual bool Ref(rocksdb::Cache::Handle* handle) = 0;
+  virtual bool Release(rocksdb::Cache::Handle* handle, bool force_erase = false) = 0;
+  virtual void Erase(const rocksdb::Slice& key, uint32_t hash) = 0;
+  virtual void SetCapacity(size_t capacity) = 0;
+  virtual void SetStrictCapacityLimit(bool strict_capacity_limit) = 0;
+  virtual size_t GetUsage() const = 0;
+  virtual size_t GetPinnedUsage() const = 0;
+  virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                                      bool thread_safe) = 0;
+  virtual void EraseUnRefEntries() = 0;
+  virtual std::string GetPrintableOptions() const { return ""; }
+};
+
+// Generic cache interface which shards cache by hash of keys. 2^num_shard_bits
+// shards will be created, with capacity split evenly to each of the shards.
+// Keys are sharded by the highest num_shard_bits bits of hash value.
+class ShardedCache : public rocksdb::Cache {
+ public:
+  ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit);
+  virtual ~ShardedCache() = default;
+  virtual const char* Name() const override = 0;
+  virtual CacheShard* GetShard(int shard) = 0;
+  virtual const CacheShard* GetShard(int shard) const = 0;
+  virtual void* Value(Handle* handle) override = 0;
+  virtual size_t GetCharge(Handle* handle) const = 0;
+  virtual uint32_t GetHash(Handle* handle) const = 0;
+  virtual void DisownData() override = 0;
+
+  virtual void SetCapacity(size_t capacity) override;
+  virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override;
+
+  virtual rocksdb::Status Insert(const rocksdb::Slice& key, void* value, size_t charge,
+                                 void (*deleter)(const rocksdb::Slice& key, void* value),
+                                 rocksdb::Cache::Handle** handle, Priority priority) override;
+  virtual rocksdb::Cache::Handle* Lookup(const rocksdb::Slice& key, rocksdb::Statistics* stats) override;
+  virtual bool Ref(rocksdb::Cache::Handle* handle) override;
+  virtual bool Release(rocksdb::Cache::Handle* handle, bool force_erase = false) override;
+  virtual void Erase(const rocksdb::Slice& key) override;
+  virtual uint64_t NewId() override;
+  virtual size_t GetCapacity() const override;
+  virtual bool HasStrictCapacityLimit() const override;
+  virtual size_t GetUsage() const override;
+  virtual size_t GetUsage(rocksdb::Cache::Handle* handle) const override;
+  virtual size_t GetPinnedUsage() const override;
+  virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                                      bool thread_safe) override;
+  virtual void EraseUnRefEntries() override;
+  virtual std::string GetPrintableOptions() const override;
+
+  int GetNumShardBits() const { return num_shard_bits_; }
+
+ private:
+  static inline uint32_t HashSlice(const rocksdb::Slice& s) {
+     return ceph_str_hash(CEPH_STR_HASH_RJENKINS, s.data(), s.size());
+//    return Hash(s.data(), s.size(), 0);
+  }
+
+  uint32_t Shard(uint32_t hash) {
+    // Note, hash >> 32 yields hash in gcc, not the zero we expect!
+    return (num_shard_bits_ > 0) ? (hash >> (32 - num_shard_bits_)) : 0;
+  }
+
+  int num_shard_bits_;
+  mutable std::mutex capacity_mutex_;
+  size_t capacity_;
+  bool strict_capacity_limit_;
+  std::atomic<uint64_t> last_id_;
+};
+
+extern int GetDefaultCacheShardBits(size_t capacity);
+
+}  // namespace rocksdb_cache
+#endif // ROCKSDB_SHARDED_CACHE
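
Finally, keys are routed to shards by the top num_shard_bits bits of the 32-bit rjenkins hash, per HashSlice() and Shard() above. A standalone sketch of that mapping (the hash value below is made up; in the real code it comes from ceph_str_hash):

    // Illustration of the Shard() mapping above (standalone sketch).
    #include <cstdint>
    #include <cstdio>

    static uint32_t shard_of(uint32_t hash, int num_shard_bits) {
      // The top num_shard_bits bits of the 32-bit hash select the shard.
      return (num_shard_bits > 0) ? (hash >> (32 - num_shard_bits)) : 0;
    }

    int main() {
      uint32_t hash = 0xDEADBEEF;                 // stand-in for the rjenkins hash
      printf("%u\n", shard_of(hash, 6));          // top 6 bits: 55
      printf("%u\n", shard_of(hash, 0));          // single shard: always 0
      return 0;
    }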