common/PriorityCache: Implement Cache Age Binning

author Mark Nelson <mnelson@redhat.com>

Tue, 13 Oct 2020 23:24:18 +0000 (23:24 +0000)

committer Mark Nelson <mnelson@redhat.com>

Tue, 11 Jan 2022 21:01:19 +0000 (21:01 +0000)
author Mark Nelson <mnelson@redhat.com>
Tue, 13 Oct 2020 23:24:18 +0000 (23:24 +0000)
committer Mark Nelson <mnelson@redhat.com>
Tue, 11 Jan 2022 21:01:19 +0000 (21:01 +0000)
diff --git a/src/common/PriorityCache.cc b/src/common/PriorityCache.cc

index ff96ad7a13405368bc9c624fdcc0a9a4a1774276..0fe781b3e1e509d5a32b1c5bdc2f914756f61fac 100644 (file)
--- a/src/common/PriorityCache.cc
+++ b/src/common/PriorityCache.cc
@@ -305,7 +305,6 @@ namespace PriorityCache
  
        // Commit the new cache size
        int64_t committed = it->second->commit_cache_size(tuned_mem);
-
        // Update the perf counters
        int64_t alloc = it->second->get_cache_bytes();
  
@@ -314,6 +313,14 @@ namespace PriorityCache
      }
    }
  
+  // Rotate the age bins of every cache that has a registered logger.
+  //
+  // Cache names are taken from the keys of `loggers` and looked up in
+  // `caches`.  A name with no matching cache entry is skipped instead of
+  // dereferencing the end iterator.
+  void Manager::shift_bins()
+  {
+    for (auto &l : loggers) {
+      auto it = caches.find(l.first);
+      if (it == caches.end()) {
+        continue;
+      }
+      it->second->shift_bins();
+    }
+  }
+
    void Manager::balance_priority(int64_t *mem_avail, Priority pri)
    {
      std::unordered_map<std::string, std::shared_ptr<PriCache>> tmp_caches = caches;
diff --git a/src/common/PriorityCache.h b/src/common/PriorityCache.h

index 362b5477da2f70142bb887517b4e5f18577741ef..8233d0ecf27966bd62dc6ea783674bddba7e9790 100644 (file)
--- a/src/common/PriorityCache.h
+++ b/src/common/PriorityCache.h
@@ -100,6 +100,18 @@ namespace PriorityCache {
  
      // Get the name of this cache.
      virtual std::string get_cache_name() const = 0;
+
+    // Rotate the age bins (e.g. by starting a new youngest bin)
+    virtual void shift_bins() = 0;
+
+    // Import user bins for PRI1 to LAST-1 (bins[0] is PRI1's end bin)
+    virtual void import_bins(const std::vector<uint64_t> &bins) = 0;
+
+    // Set the end bin for pri (PRI0 and LAST should be ignored)
+    virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) = 0;
+
+    // Get the end bin for pri
+    virtual uint64_t get_bins(PriorityCache::Priority pri) const = 0;
    };
  
    class Manager {
@@ -140,7 +152,7 @@ namespace PriorityCache {
      void clear();
      void tune_memory();
      void balance();
-
+    void shift_bins();
    private:
      void balance_priority(int64_t *mem_avail, Priority pri);
    };
diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in

index 557c67646bec813079ad7bba71b0e89cdfca604d..084b591538fae1d4aa3a44df53901d037cb0dc01 100644 (file)
--- a/src/common/options/global.yaml.in
+++ b/src/common/options/global.yaml.in
@@ -4677,6 +4677,105 @@ options:
    default: 5
    see_also:
    - bluestore_cache_autotune
+- name: bluestore_cache_age_bin_interval
+  type: float
+  level: dev
+  desc: The duration (in seconds) represented by a single cache age bin.
+  fmt_desc: |
+    The caches used by bluestore will assign cache entries to an 'age bin'
+    that represents a period of time during which that cache entry was most
+    recently updated.  By binning the caches in this way, Ceph's priority
+    cache balancing code can make better decisions about which caches should
+    receive priority based on the relative ages of items in the caches.  By
+    default, a single cache age bin represents 1 second of time.  Note:
+    Setting this interval too small can result in high CPU usage and lower
+    performance.
+  default: 1
+  see_also:
+  - bluestore_cache_age_bins_kv
+  - bluestore_cache_age_bins_kv_onode
+  - bluestore_cache_age_bins_meta
+  - bluestore_cache_age_bins_data
+- name: bluestore_cache_age_bins_kv
+  type: str
+  level: dev
+  desc: A 10 element, space separated list of age bins for kv cache
+  fmt_desc: |
+    A 10 element, space separated list of cache age bins grouped by
+    priority such that PRI1=[0,n1), PRI2=[n1,n2), PRI3=[n2,n3) ...
+    PRI10=[n9,n10), where nK is the Kth value in the list.  Each value is
+    the (exclusive) ending bin of its priority level, and the previous value
+    is its starting bin.  A 0 as the ending bin will prevent any items from
+    being associated with that priority.  Bin duration is based on the
+    bluestore_cache_age_bin_interval value.  For example,
+    "1 5 0 0 0 0 0 0 0 0" defines bin ranges for two priority levels. PRI1
+    contains 1 age bin.  Assuming the default age bin interval of 1 second,
+    PRI1 represents cache items that are less than 1 second old. PRI2 has 4
+    bins representing cache items that are 1 to less than 5 seconds old. All
+    other cache items in this example are associated with the lowest priority
+    level as PRI3-PRI10 all have 0s in their second term.
+  default: "1 2 6 24 120 720 0 0 0 0"
+  see_also:
+  - bluestore_cache_age_bin_interval
+- name: bluestore_cache_age_bins_kv_onode
+  type: str
+  level: dev
+  desc: A 10 element, space separated list of age bins for kv onode cache
+  fmt_desc: |
+    A 10 element, space separated list of cache age bins grouped by
+    priority such that PRI1=[0,n1), PRI2=[n1,n2), PRI3=[n2,n3) ...
+    PRI10=[n9,n10), where nK is the Kth value in the list.  Each value is
+    the (exclusive) ending bin of its priority level, and the previous value
+    is its starting bin.  A 0 as the ending bin will prevent any items from
+    being associated with that priority.  Bin duration is based on the
+    bluestore_cache_age_bin_interval value.  For example,
+    "1 5 0 0 0 0 0 0 0 0" defines bin ranges for two priority levels. PRI1
+    contains 1 age bin.  Assuming the default age bin interval of 1 second,
+    PRI1 represents cache items that are less than 1 second old. PRI2 has 4
+    bins representing cache items that are 1 to less than 5 seconds old. All
+    other cache items in this example are associated with the lowest priority
+    level as PRI3-PRI10 all have 0s in their second term.
+  default: "0 0 0 0 0 0 0 0 0 720"
+  see_also:
+  - bluestore_cache_age_bin_interval
+- name: bluestore_cache_age_bins_meta
+  type: str
+  level: dev
+  desc: A 10 element, space separated list of age bins for onode cache
+  fmt_desc: |
+    A 10 element, space separated list of cache age bins grouped by
+    priority such that PRI1=[0,n1), PRI2=[n1,n2), PRI3=[n2,n3) ...
+    PRI10=[n9,n10), where nK is the Kth value in the list.  Each value is
+    the (exclusive) ending bin of its priority level, and the previous value
+    is its starting bin.  A 0 as the ending bin will prevent any items from
+    being associated with that priority.  Bin duration is based on the
+    bluestore_cache_age_bin_interval value.  For example,
+    "1 5 0 0 0 0 0 0 0 0" defines bin ranges for two priority levels. PRI1
+    contains 1 age bin.  Assuming the default age bin interval of 1 second,
+    PRI1 represents cache items that are less than 1 second old. PRI2 has 4
+    bins representing cache items that are 1 to less than 5 seconds old. All
+    other cache items in this example are associated with the lowest priority
+    level as PRI3-PRI10 all have 0s in their second term.
+  default: "1 2 6 24 120 720 0 0 0 0"
+  see_also:
+  - bluestore_cache_age_bin_interval
+- name: bluestore_cache_age_bins_data
+  type: str
+  level: dev
+  desc: A 10 element, space separated list of age bins for data cache
+  fmt_desc: |
+    A 10 element, space separated list of cache age bins grouped by
+    priority such that PRI1=[0,n1), PRI2=[n1,n2), PRI3=[n2,n3) ...
+    PRI10=[n9,n10), where nK is the Kth value in the list.  Each value is
+    the (exclusive) ending bin of its priority level, and the previous value
+    is its starting bin.  A 0 as the ending bin will prevent any items from
+    being associated with that priority.  Bin duration is based on the
+    bluestore_cache_age_bin_interval value.  For example,
+    "1 5 0 0 0 0 0 0 0 0" defines bin ranges for two priority levels. PRI1
+    contains 1 age bin.  Assuming the default age bin interval of 1 second,
+    PRI1 represents cache items that are less than 1 second old. PRI2 has 4
+    bins representing cache items that are 1 to less than 5 seconds old. All
+    other cache items in this example are associated with the lowest priority
+    level as PRI3-PRI10 all have 0s in their second term.
+  default: "1 2 6 24 120 720 0 0 0 0"
+  see_also:
+  - bluestore_cache_age_bin_interval
  - name: bluestore_alloc_stats_dump_interval
    type: float
    level: dev
diff --git a/src/kv/rocksdb_cache/BinnedLRUCache.cc b/src/kv/rocksdb_cache/BinnedLRUCache.cc

index 47c56e2ddd76987f8b673ff5e71353facb8dc429..fce26c7b07dec0597ca6ba9fabca9b5d39e74b78 100644 (file)
--- a/src/kv/rocksdb_cache/BinnedLRUCache.cc
+++ b/src/kv/rocksdb_cache/BinnedLRUCache.cc
@@ -110,7 +110,9 @@ BinnedLRUCacheShard::BinnedLRUCacheShard(CephContext *c, size_t capacity, bool s
        high_pri_pool_ratio_(high_pri_pool_ratio),
        high_pri_pool_capacity_(0),
        usage_(0),
-      lru_usage_(0) {
+      lru_usage_(0),
+      age_bins(1) {
+  shift_bins();
    // Make empty circular linked list
    lru_.next = &lru_;
    lru_.prev = &lru_;
@@ -208,12 +210,17 @@ void BinnedLRUCacheShard::LRU_Remove(BinnedLRUHandle* e) {
    if (e->InHighPriPool()) {
      ceph_assert(high_pri_pool_usage_ >= e->charge);
      high_pri_pool_usage_ -= e->charge;
+  } else {
+    ceph_assert(*(e->age_bin) >= e->charge);
+    *(e->age_bin) -= e->charge;
    }
  }
  
  void BinnedLRUCacheShard::LRU_Insert(BinnedLRUHandle* e) {
    ceph_assert(e->next == nullptr);
    ceph_assert(e->prev == nullptr);
+  e->age_bin = age_bins.front();
+
    if (high_pri_pool_ratio_ > 0 && e->IsHighPri()) {
      // Inset "e" to head of LRU list.
      e->next = &lru_;
@@ -232,10 +239,25 @@ void BinnedLRUCacheShard::LRU_Insert(BinnedLRUHandle* e) {
      e->next->prev = e;
      e->SetInHighPriPool(false);
      lru_low_pri_ = e;
+    *(e->age_bin) += e->charge;
    }
    lru_usage_ += e->charge;
  }
  
+// Return the total of the byte counters held in age bins [start, end).
+// The range is clamped to the bins currently present; a start beyond the
+// last bin yields 0.  mutex_ is held for the duration of the walk.
+uint64_t BinnedLRUCacheShard::sum_bins(uint32_t start, uint32_t end) const {
+  std::lock_guard<std::mutex> guard(mutex_);
+  const auto bin_total = age_bins.size();
+  if (start > bin_total) {
+    return 0;
+  }
+  if (end > bin_total) {
+    end = bin_total;
+  }
+  uint64_t total = 0;
+  for (auto idx = start; idx < end; ++idx) {
+    total += *(age_bins[idx]);
+  }
+  return total;
+}
+
  void BinnedLRUCacheShard::MaintainPoolSize() {
    while (high_pri_pool_usage_ > high_pri_pool_capacity_) {
      // Overflow last entry in high-pri pool to low-pri pool.
@@ -243,6 +265,7 @@ void BinnedLRUCacheShard::MaintainPoolSize() {
      ceph_assert(lru_low_pri_ != &lru_);
      lru_low_pri_->SetInHighPriPool(false);
      high_pri_pool_usage_ -= lru_low_pri_->charge;
+    *(lru_low_pri_->age_bin) += lru_low_pri_->charge;
    }
  }
  
@@ -460,6 +483,21 @@ size_t BinnedLRUCacheShard::GetPinnedUsage() const {
    return usage_ - lru_usage_;
  }
  
+// Open a new youngest age bin (a zeroed byte counter) at the front of
+// age_bins, under mutex_.
+void BinnedLRUCacheShard::shift_bins() {
+  std::lock_guard<std::mutex> guard(mutex_);
+  auto fresh_bin = std::make_shared<uint64_t>(0);
+  age_bins.push_front(fresh_bin);
+}
+
+// Report the capacity of age_bins, i.e. the maximum number of bins kept.
+uint32_t BinnedLRUCacheShard::get_bin_count() const {
+  std::lock_guard<std::mutex> guard(mutex_);
+  const auto bin_capacity = age_bins.capacity();
+  return bin_capacity;
+}
+
+// Resize age_bins so that it keeps at most `count` bins.
+// NOTE(review): a count of 0 would leave age_bins empty, while LRU_Insert
+// reads age_bins.front() -- confirm callers never pass 0.
+void BinnedLRUCacheShard::set_bin_count(uint32_t count) {
+  std::lock_guard<std::mutex> guard(mutex_);
+  age_bins.set_capacity(count);
+}
+
  std::string BinnedLRUCacheShard::GetPrintableOptions() const {
    const int kBufferSize = 200;
    char buffer[kBufferSize];
@@ -577,22 +615,33 @@ int64_t BinnedLRUCache::request_cache_bytes(PriorityCache::Priority pri, uint64_
    int64_t assigned = get_cache_bytes(pri);
    int64_t request = 0;
  
-  switch (pri) {
+  switch(pri) {
    // PRI0 is for rocksdb's high priority items (indexes/filters)
    case PriorityCache::Priority::PRI0:
      {
-      request = GetHighPriPoolUsage();
+      // Because we want the high pri cache to grow independently of the low
+      // pri cache, request a chunky allocation independent of the other
+      // priorities.
+      request = PriorityCache::get_chunk(GetHighPriPoolUsage(), total_cache);
        break;
      }
-  // All other cache items are currently shoved into the PRI1 priority. 
-  case PriorityCache::Priority::PRI1:
+  case PriorityCache::Priority::LAST:
      {
+      auto max = get_bin_count();
        request = GetUsage();
        request -= GetHighPriPoolUsage();
+      request -= sum_bins(0, max);
        break;
      }
    default:
-    break;
+    {
+      ceph_assert(pri > 0 && pri < PriorityCache::Priority::LAST);
+      auto prev_pri = static_cast<PriorityCache::Priority>(pri - 1);
+      uint64_t start = get_bins(prev_pri);
+      uint64_t end = get_bins(pri);
+      request = sum_bins(start, end);
+      break;
+    }
    }
    request = (request > assigned) ? request - assigned : 0;
    ldout(cct, 10) << __func__ << " Priority: " << static_cast<uint32_t>(pri)
@@ -612,15 +661,41 @@ int64_t BinnedLRUCache::commit_cache_size(uint64_t total_bytes)
    double ratio = 0;
    if (new_bytes > 0) {
      int64_t pri0_bytes = get_cache_bytes(PriorityCache::Priority::PRI0);
-    // Add 10% of the "reserved" bytes so the ratio can't get stuck at 0 
-    pri0_bytes += (new_bytes - get_cache_bytes()) / 10;
      ratio = (double) pri0_bytes / new_bytes;
    }
-  ldout(cct, 10) << __func__ << " High Pri Pool Ratio set to " << ratio << dendl;
+  ldout(cct, 5) << __func__ << " High Pri Pool Ratio set to " << ratio << dendl;
    SetHighPriPoolRatio(ratio);
    return new_bytes;
  }
  
+// Rotate the age bins of every shard.
+void BinnedLRUCache::shift_bins() {
+  int shard = 0;
+  while (shard < num_shards_) {
+    shards_[shard].shift_bins();
+    ++shard;
+  }
+}
+
+// Add up the per-shard sum_bins(start, end) results across all shards.
+uint64_t BinnedLRUCache::sum_bins(uint32_t start, uint32_t end) const {
+  uint64_t total = 0;
+  int shard = 0;
+  while (shard < num_shards_) {
+    total += shards_[shard].sum_bins(start, end);
+    ++shard;
+  }
+  return total;
+}
+
+// Return the bin count of the first shard, or 0 when there are no shards.
+uint32_t BinnedLRUCache::get_bin_count() const {
+  if (num_shards_ <= 0) {
+    return 0;
+  }
+  return shards_[0].get_bin_count();
+}
+
+// Apply the same bin count to every shard.
+void BinnedLRUCache::set_bin_count(uint32_t count) {
+  int shard = 0;
+  while (shard < num_shards_) {
+    shards_[shard].set_bin_count(count);
+    ++shard;
+  }
+}
+
  std::shared_ptr<rocksdb::Cache> NewBinnedLRUCache(
      CephContext *c, 
      size_t capacity,
diff --git a/src/kv/rocksdb_cache/BinnedLRUCache.h b/src/kv/rocksdb_cache/BinnedLRUCache.h

index 88bf4502e89271da19c1bbf160dd5bcca7d0006a..fcf49b7e8703e1cb1154198501ceec1d9aabf8f6 100644 (file)
--- a/src/kv/rocksdb_cache/BinnedLRUCache.h
+++ b/src/kv/rocksdb_cache/BinnedLRUCache.h
@@ -12,6 +12,7 @@
  
  #include <string>
  #include <mutex>
+#include <boost/circular_buffer.hpp>
  
  #include "ShardedCache.h"
  #include "common/autovector.h"
@@ -55,6 +56,7 @@ std::shared_ptr<rocksdb::Cache> NewBinnedLRUCache(
      double high_pri_pool_ratio = 0.0);
  
  struct BinnedLRUHandle {
+  std::shared_ptr<uint64_t> age_bin;
    void* value;
    DeleterFn deleter;
    BinnedLRUHandle* next_hash;
@@ -230,6 +232,18 @@ class alignas(CACHE_LINE_SIZE) BinnedLRUCacheShard : public CacheShard {
    // Retrieves high pri pool usage
    size_t GetHighPriPoolUsage() const;
  
+  // Rotate the bins
+  void shift_bins();
+
+  // Get the bin count
+  uint32_t get_bin_count() const;
+
+  // Set the bin count
+  void set_bin_count(uint32_t count);
+
+  // Get the byte counts for a range of age bins
+  uint64_t sum_bins(uint32_t start, uint32_t end) const;
+
   private:
    CephContext *cct;
    void LRU_Remove(BinnedLRUHandle* e);
@@ -296,6 +310,9 @@ class alignas(CACHE_LINE_SIZE) BinnedLRUCacheShard : public CacheShard {
    // We don't count mutex_ as the cache's internal state so semantically we
    // don't mind mutex_ invoking the non-const actions.
    mutable std::mutex mutex_;
+
+  // Circular buffer of byte counters for age binning
+  boost::circular_buffer<std::shared_ptr<uint64_t>> age_bins;
  };
  
  class BinnedLRUCache : public ShardedCache {
@@ -329,6 +346,11 @@ class BinnedLRUCache : public ShardedCache {
    virtual int64_t get_committed_size() const {
      return GetCapacity();
    }
+  virtual void shift_bins();
+  uint64_t sum_bins(uint32_t start, uint32_t end) const;
+  uint32_t get_bin_count() const;
+  void set_bin_count(uint32_t count);
+
    virtual std::string get_cache_name() const {
      return "RocksDB Binned LRU Cache";
    }
diff --git a/src/kv/rocksdb_cache/ShardedCache.h b/src/kv/rocksdb_cache/ShardedCache.h

index f98421a09a33a29a8312bf7a0b2d4baf80d227f1..0ed692233f1a73e1bd9fb1d2ffa51d5ed35740f0 100644 (file)
--- a/src/kv/rocksdb_cache/ShardedCache.h
+++ b/src/kv/rocksdb_cache/ShardedCache.h
@@ -104,6 +104,9 @@ class ShardedCache : public rocksdb::Cache, public PriorityCache::PriCache {
  
    int GetNumShardBits() const { return num_shard_bits_; }
  
+  virtual uint32_t get_bin_count() const = 0;
+  virtual void set_bin_count(uint32_t count) = 0;
+
    // PriCache
    virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
      return cache_bytes[pri];
@@ -127,6 +130,42 @@ class ShardedCache : public rocksdb::Cache, public PriorityCache::PriCache {
    }
    virtual void set_cache_ratio(double ratio) {
      cache_ratio = ratio;
+  }
+  // Return the end bin configured for `pri`.  PRI0, LAST and anything
+  // outside that range report 0.
+  virtual uint64_t get_bins(PriorityCache::Priority pri) const {
+    if (pri <= PriorityCache::Priority::PRI0 ||
+        pri >= PriorityCache::Priority::LAST) {
+      return 0;
+    }
+    return bins[pri];
+  }
+  virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) {
+    if (pri <= PriorityCache::Priority::PRI0 ||
+        pri >= PriorityCache::Priority::LAST) {
+      return;
+    }
+    bins[pri] = end_bin;
+    uint64_t max = 0;
+    for (int pri = 1; pri < PriorityCache::Priority::LAST; pri++) {
+      if (bins[pri] > max) {
+        max = bins[pri];
+      }
+    }
+    set_bin_count(max);
+  }
+  virtual void import_bins(const std::vector<uint64_t> &bins_v) {
+    uint64_t max = 0;
+    for (int pri = 1; pri < PriorityCache::Priority::LAST; pri++) {
+      unsigned i = (unsigned) pri - 1;
+      if (i < bins_v.size()) {
+        bins[pri] = bins_v[i];
+        if (bins[pri] > max) {
+          max = bins[pri];
+        }
+      } else {
+        bins[pri] = 0;
+      }
+    }
+    set_bin_count(max);
    }
    virtual std::string get_cache_name() const = 0;
  
@@ -141,6 +180,7 @@ class ShardedCache : public rocksdb::Cache, public PriorityCache::PriCache {
      return (num_shard_bits_ > 0) ? (hash >> (32 - num_shard_bits_)) : 0;
    }
  
+  uint64_t bins[PriorityCache::Priority::LAST+1] = {0};
    int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
    double cache_ratio = 0;
  
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc

index ab9c53c060a352b6ba55064aa5b8eadb71aafcba..acf14bdbe54a4ab24387564cedbfbb3a2505c437 100644 (file)
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -214,6 +214,16 @@ struct OSDMemCache : public PriorityCache::PriCache {
    virtual void set_cache_ratio(double ratio) {
      cache_ratio = ratio;
    }
+  virtual void shift_bins() {  // no-op: this cache keeps no age bins
+  }
+  virtual void import_bins(const std::vector<uint64_t> &bins) {  // ignored: nothing is stored
+  }
+  virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) {  // ignored: nothing is stored
+  }
+  virtual uint64_t get_bins(PriorityCache::Priority pri) const {
+    return 0;  // every priority reports 0
+  }
+
    virtual string get_cache_name() const = 0;
  };
  
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc

index 89cc0713a44eb57f98c46742845efc0634a579d2..e49d797fd23c5af18477973eeedef913c0f3f874 100644 (file)
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -1108,6 +1108,8 @@ struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
    {
      if (o->put_cache()) {
        (level > 0) ? lru.push_front(*o) : lru.push_back(*o);
+      o->cache_age_bin = age_bins.front();
+      *(o->cache_age_bin) += 1;
      } else {
        ++num_pinned;
      }
@@ -1118,6 +1120,7 @@ struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
    void _rm(BlueStore::Onode* o) override
    {
      if (o->pop_cache()) {
+      *(o->cache_age_bin) -= 1;
        lru.erase(lru.iterator_to(*o));
      } else {
        ceph_assert(num_pinned);
@@ -1129,6 +1132,7 @@ struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
    }
    void _pin(BlueStore::Onode* o) override
    {
+    *(o->cache_age_bin) -= 1;
      lru.erase(lru.iterator_to(*o));
      ++num_pinned;
      dout(20) << __func__ << " " << this << " " << " " << " " << o->oid << " pinned" << dendl;
@@ -1136,6 +1140,8 @@ struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
    void _unpin(BlueStore::Onode* o) override
    {
      lru.push_front(*o);
+    o->cache_age_bin = age_bins.front();
+    *(o->cache_age_bin) += 1;
      ceph_assert(num_pinned);
      --num_pinned;
      dout(20) << __func__ << " " << this << " " << " " << " " << o->oid << " unpinned" << dendl;
@@ -1169,6 +1175,7 @@ struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
          ceph_assert(n == 0);
          lru.erase(p);
        }
+      *(o->cache_age_bin) -= 1;
        auto pinned = !o->pop_cache();
        ceph_assert(!pinned);
        o->c->onode_map._remove(o->oid);
@@ -1230,11 +1237,15 @@ struct LruBufferCacheShard : public BlueStore::BufferCacheShard {
        lru.push_back(*b);
      }
      buffer_bytes += b->length;
+    b->cache_age_bin = age_bins.front();
+    *(b->cache_age_bin) += b->length;
      num = lru.size();
    }
    void _rm(BlueStore::Buffer *b) override {
      ceph_assert(buffer_bytes >= b->length);
      buffer_bytes -= b->length;
+    assert(*(b->cache_age_bin) >= b->length);
+    *(b->cache_age_bin) -= b->length;
      auto q = lru.iterator_to(*b);
      lru.erase(q);
      num = lru.size();
@@ -1246,11 +1257,16 @@ struct LruBufferCacheShard : public BlueStore::BufferCacheShard {
    void _adjust_size(BlueStore::Buffer *b, int64_t delta) override {
      ceph_assert((int64_t)buffer_bytes + delta >= 0);
      buffer_bytes += delta;
+    assert(*(b->cache_age_bin) + delta >= 0);
+    *(b->cache_age_bin) += delta;
    }
    void _touch(BlueStore::Buffer *b) override {
      auto p = lru.iterator_to(*b);
      lru.erase(p);
      lru.push_front(*b);
+    *(b->cache_age_bin) -= b->length;  // uncharge the bin b was counted in
+    b->cache_age_bin = age_bins.front();  // re-home b in the front bin, where shift_bins() pushes new bins
+    *(b->cache_age_bin) += b->length;  // charge b's length to that bin
      num = lru.size();
      _audit("_touch_buffer end");
    }
@@ -1267,6 +1283,8 @@ struct LruBufferCacheShard : public BlueStore::BufferCacheShard {
        BlueStore::Buffer *b = &*i;
        ceph_assert(b->is_clean());
        dout(20) << __func__ << " rm " << *b << dendl;
+      assert(*(b->cache_age_bin) >= b->length);
+      *(b->cache_age_bin) -= b->length;
        b->space->_rm_buffer(this, b);
      }
      num = lru.size();
@@ -1378,9 +1396,11 @@ public:
          ceph_abort_msg("bad cache_private");
        }
      }
+    b->cache_age_bin = age_bins.front();
      if (!b->is_empty()) {
        buffer_bytes += b->length;
        list_bytes[b->cache_private] += b->length;
+      *(b->cache_age_bin) += b->length;
      }
      num = hot.size() + warm_in.size();
    }
@@ -1393,6 +1413,8 @@ public:
        buffer_bytes -= b->length;
        ceph_assert(list_bytes[b->cache_private] >= b->length);
        list_bytes[b->cache_private] -= b->length;
+      assert(*(b->cache_age_bin) >= b->length);
+      *(b->cache_age_bin) -= b->length;
      }
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
@@ -1435,6 +1457,7 @@ public:
      if (!b->is_empty()) {
        buffer_bytes += b->length;
        list_bytes[b->cache_private] += b->length;
+      *(b->cache_age_bin) += b->length;
      }
      num = hot.size() + warm_in.size();
    }
@@ -1447,6 +1470,8 @@ public:
        buffer_bytes += delta;
        ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0);
        list_bytes[b->cache_private] += delta;
+      assert(*(b->cache_age_bin) + delta >= 0);
+      *(b->cache_age_bin) += delta;
      }
    }
  
@@ -1465,6 +1490,9 @@ public:
        hot.push_front(*b);
        break;
      }
+    *(b->cache_age_bin) -= b->length;
+    b->cache_age_bin = age_bins.front();
+    *(b->cache_age_bin) += b->length;
      num = hot.size() + warm_in.size();
      _audit("_touch_buffer end");
    }
@@ -1512,7 +1540,9 @@ public:
          buffer_bytes -= b->length;
          ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length);
          list_bytes[BUFFER_WARM_IN] -= b->length;
-        to_evict_bytes -= b->length;
+        assert(*(b->cache_age_bin) >= b->length);
+        *(b->cache_age_bin) -= b->length;
+       to_evict_bytes -= b->length;
          evicted += b->length;
          b->state = BlueStore::Buffer::STATE_EMPTY;
          b->data.clear();
@@ -4177,6 +4207,7 @@ void *BlueStore::MempoolThread::entry()
  
    utime_t next_balance = ceph_clock_now();
    utime_t next_resize = ceph_clock_now();
+  utime_t next_bin_rotation = ceph_clock_now();
    utime_t next_deferred_force_submit = ceph_clock_now();
    utime_t alloc_stats_dump_clock = ceph_clock_now();
  
@@ -4189,21 +4220,47 @@ void *BlueStore::MempoolThread::entry()
        prev_config_change = cur_config_change;
      }
  
-    // Before we trim, check and see if it's time to rebalance/resize.
+    // define various intervals for background work
+    double age_bin_interval = store->cache_age_bin_interval;
      double autotune_interval = store->cache_autotune_interval;
      double resize_interval = store->osd_memory_cache_resize_interval;
      double max_defer_interval = store->max_defer_interval;
-
      double alloc_stats_dump_interval =
        store->cct->_conf->bluestore_alloc_stats_dump_interval;
  
+    // alloc stats dump
      if (alloc_stats_dump_interval > 0 &&
          alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) {
        store->_record_allocation_stats();
        alloc_stats_dump_clock = ceph_clock_now();
      }
+    // cache age binning
+    if (age_bin_interval > 0 && next_bin_rotation < ceph_clock_now()) {
+      if (binned_kv_cache != nullptr) {
+        binned_kv_cache->import_bins(store->kv_bins);
+      }
+      if (binned_kv_onode_cache != nullptr) {
+        binned_kv_onode_cache->import_bins(store->kv_onode_bins);
+      }
+      meta_cache->import_bins(store->meta_bins);
+      data_cache->import_bins(store->data_bins);
+
+      if (pcm != nullptr) {
+        pcm->shift_bins();
+      }
+      next_bin_rotation = ceph_clock_now();
+      next_bin_rotation += age_bin_interval;
+    }
+    // cache balancing
      if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
-      _adjust_cache_settings();
+      if (binned_kv_cache != nullptr) {
+        binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
+      }
+      if (binned_kv_onode_cache != nullptr) {
+        binned_kv_onode_cache->set_cache_ratio(store->cache_kv_onode_ratio);
+      }
+      meta_cache->set_cache_ratio(store->cache_meta_ratio);
+      data_cache->set_cache_ratio(store->cache_data_ratio);
  
        // Log events at 5 instead of 20 when balance happens.
        interval_stats_trim = true;
@@ -4215,6 +4272,7 @@ void *BlueStore::MempoolThread::entry()
        next_balance = ceph_clock_now();
        next_balance += autotune_interval;
      }
+    // memory resizing (ie autotuning)
      if (resize_interval > 0 && next_resize < ceph_clock_now()) {
        if (ceph_using_tcmalloc() && pcm != nullptr) {
          pcm->tune_memory();
@@ -4222,7 +4280,7 @@ void *BlueStore::MempoolThread::entry()
        next_resize = ceph_clock_now();
        next_resize += resize_interval;
      }
-
+    // deferred force submit
      if (max_defer_interval > 0 &&
         next_deferred_force_submit < ceph_clock_now()) {
        if (store->get_deferred_last_submitted() + max_defer_interval <
@@ -4249,18 +4307,6 @@ void *BlueStore::MempoolThread::entry()
    return NULL;
  }
  
-void BlueStore::MempoolThread::_adjust_cache_settings()
-{
-  if (binned_kv_cache != nullptr) {
-    binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
-  }
-  if (binned_kv_onode_cache != nullptr) {
-    binned_kv_onode_cache->set_cache_ratio(store->cache_kv_onode_ratio);
-  }
-  meta_cache->set_cache_ratio(store->cache_meta_ratio);
-  data_cache->set_cache_ratio(store->cache_data_ratio);
-}
-
  void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
  {
    size_t onode_shards = store->onode_cache_shards.size();
@@ -4272,7 +4318,7 @@ void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
  
    uint64_t cache_size = store->cache_size;
    int64_t kv_alloc =
-     static_cast<int64_t>(store->cache_kv_ratio * cache_size); 
+     static_cast<int64_t>(store->cache_kv_ratio * cache_size);
    int64_t kv_onode_alloc =
       static_cast<int64_t>(store->cache_kv_onode_ratio * cache_size);
    int64_t meta_alloc =
@@ -4609,6 +4655,11 @@ const char **BlueStore::get_tracked_conf_keys() const
      "osd_memory_expected_fragmentation",
      "bluestore_cache_autotune",
      "bluestore_cache_autotune_interval",
+    "bluestore_cache_age_bin_interval",
+    // Age-bin keys must match the option names defined in global.yaml.in
+    // and read by _set_cache_sizes().
+    "bluestore_cache_age_bins_kv",
+    "bluestore_cache_age_bins_kv_onode",
+    "bluestore_cache_age_bins_meta",
+    "bluestore_cache_age_bins_data",
      "bluestore_warn_on_legacy_statfs",
      "bluestore_warn_on_no_per_pool_omap",
      "bluestore_warn_on_no_per_pg_omap",
@@ -4808,6 +4859,22 @@ int BlueStore::_set_cache_sizes()
    cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
    cache_autotune_interval =
        cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
+  cache_age_bin_interval =
+      cct->_conf.get_val<double>("bluestore_cache_age_bin_interval");
+  // Parse the whitespace-separated unsigned integers stored in option
+  // `conf_name` and make them the contents of `*intervals`.  The vector is
+  // replaced rather than appended to, so running this more than once does
+  // not leave stale values at the front, which is where import_bins() reads
+  // from.  Parsing stops at the first token that is not an unsigned integer.
+  auto _set_bin = [&](std::string conf_name, std::vector<uint64_t>* intervals)
+  {
+    std::string intervals_str = cct->_conf.get_val<std::string>(conf_name);
+    std::istringstream interval_stream(intervals_str);
+    intervals->assign(
+      std::istream_iterator<uint64_t>(interval_stream),
+      std::istream_iterator<uint64_t>());
+  };
+  _set_bin("bluestore_cache_age_bins_kv", &kv_bins);
+  _set_bin("bluestore_cache_age_bins_kv_onode", &kv_onode_bins);
+  _set_bin("bluestore_cache_age_bins_meta", &meta_bins);
+  _set_bin("bluestore_cache_age_bins_data", &data_bins);
+
    osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
    osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
    osd_memory_expected_fragmentation =
diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h

index 00fee1ffa43e068017faed493beafbe4e51a1fc1..2afac549b09d7a8953092609aae06aa85d8f3afc 100644 (file)
--- a/src/os/bluestore/BlueStore.h
+++ b/src/os/bluestore/BlueStore.h
@@ -281,6 +281,7 @@ public:
      uint64_t seq;
      uint32_t offset, length;
      ceph::buffer::list data;
+    std::shared_ptr<int64_t> cache_age_bin;  ///< cache age bin
  
      boost::intrusive::list_member_hook<> lru_item;
      boost::intrusive::list_member_hook<> state_item;
@@ -1152,6 +1153,7 @@ public:
      /// protect flush_txns
      ceph::mutex flush_lock = ceph::make_mutex("BlueStore::Onode::flush_lock");
      ceph::condition_variable flush_cond;   ///< wait here for uncommitted txns
+    std::shared_ptr<int64_t> cache_age_bin;  ///< cache age bin
  
      Onode(Collection *c, const ghobject_t& o,
           const mempool::bluestore_cache_meta::string& k)
@@ -1260,8 +1262,11 @@ public:
  
      std::atomic<uint64_t> max = {0};
      std::atomic<uint64_t> num = {0};
+    boost::circular_buffer<std::shared_ptr<int64_t>> age_bins;
  
-    CacheShard(CephContext* cct) : cct(cct), logger(nullptr) {}
+    // Starts with room for a single age bin and opens it immediately.
+    CacheShard(CephContext* cct) : cct(cct), logger(nullptr), age_bins(1) {
+      // Qualified call: inside a constructor this resolves to CacheShard's
+      // own implementation in any case.
+      CacheShard::shift_bins();
+    }
      virtual ~CacheShard() {}
  
      void set_max(uint64_t max_) {
@@ -1288,10 +1293,36 @@ public:
      void flush() {
        std::lock_guard l(lock);
        // we should not be shutting down after the blackhole is enabled
-      assert(!cct->_conf->objectstore_blackhole);
+      ceph_assert(!cct->_conf->objectstore_blackhole);
        _trim_to(0);
      }
  
+    // Open a new youngest age bin (a zeroed counter) at the front of
+    // age_bins, under the shard lock.
+    virtual void shift_bins() {
+      std::lock_guard guard(lock);
+      auto fresh_bin = std::make_shared<int64_t>(0);
+      age_bins.push_front(fresh_bin);
+    }
+    // Report the capacity of age_bins, i.e. the maximum number of bins kept.
+    virtual uint32_t get_bin_count() {
+      std::lock_guard guard(lock);
+      const auto bin_capacity = age_bins.capacity();
+      return bin_capacity;
+    }
+    // Resize age_bins so that it keeps at most `count` bins.
+    // NOTE(review): a count of 0 would leave age_bins empty while the cache
+    // shards read age_bins.front() -- confirm callers never pass 0.
+    virtual void set_bin_count(uint32_t count) {
+      std::lock_guard guard(lock);
+      age_bins.set_capacity(count);
+    }
+    virtual uint64_t sum_bins(uint32_t start, uint32_t end) {
+      std::lock_guard l(lock);
+      auto size = age_bins.size();
+      if (size < start) {
+        return 0;
+      }
+      uint64_t count = 0;
+      end = (size < end) ? size : end;
+      for (auto i = start; i < end; i++) {
+        count += *(age_bins[i]);
+      }
+      return count;
+    }
+
  #ifdef DEBUG_CACHE
      virtual void _audit(const char *s) = 0;
  #else
@@ -1302,7 +1333,6 @@ public:
    /// A Generic onode Cache Shard
    struct OnodeCacheShard : public CacheShard {
      std::atomic<uint64_t> num_pinned = {0};
-
      std::array<std::pair<ghobject_t, ceph::mono_clock::time_point>, 64> dumped_onodes;
  
      virtual void _pin(Onode* o) = 0;
@@ -1990,7 +2020,7 @@ public:
  
      void flush_all_but_last() {
        std::unique_lock l(qlock);
-      assert (q.size() >= 1);
+      ceph_assert (q.size() >= 1);
        while (true) {
         // std::set flag before the check because the condition
         // may become true outside qlock, and we need to make
@@ -2240,7 +2270,12 @@ private:
    double cache_kv_onode_ratio = 0; ///< cache ratio dedicated to kv onodes (e.g., rocksdb onode CF)
    double cache_data_ratio = 0;   ///< cache ratio dedicated to object data
    bool cache_autotune = false;   ///< cache autotune setting
+  double cache_age_bin_interval = 0; ///< time to wait between cache age bin rotations
    double cache_autotune_interval = 0; ///< time to wait between cache rebalancing
+  std::vector<uint64_t> kv_bins; ///< kv autotune bins
+  std::vector<uint64_t> kv_onode_bins; ///< kv onode autotune bins
+  std::vector<uint64_t> meta_bins; ///< meta autotune bins
+  std::vector<uint64_t> data_bins; ///< data autotune bins
    uint64_t osd_memory_target = 0;   ///< OSD memory target when autotuning cache
    uint64_t osd_memory_base = 0;     ///< OSD base memory when autotuning cache
    double osd_memory_expected_fragmentation = 0; ///< expected memory fragmentation
@@ -2270,6 +2305,7 @@ private:
  
      struct MempoolCache : public PriorityCache::PriCache {
        BlueStore *store;
+      uint64_t bins[PriorityCache::Priority::LAST+1] = {0};
        int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
        int64_t committed_bytes = 0;
        double cache_ratio = 0;
@@ -2277,21 +2313,34 @@ private:
        MempoolCache(BlueStore *s) : store(s) {};
  
        virtual uint64_t _get_used_bytes() const = 0;
+      virtual uint64_t _sum_bins(uint32_t start, uint32_t end) const = 0;
  
        virtual int64_t request_cache_bytes(
            PriorityCache::Priority pri, uint64_t total_cache) const {
          int64_t assigned = get_cache_bytes(pri);
  
          switch (pri) {
-        // All cache items are currently shoved into the PRI1 priority 
-        case PriorityCache::Priority::PRI1:
+        case PriorityCache::Priority::PRI0:
+         {
+            // BlueStore caches currently don't put anything in PRI0
+           break;  // leaves the switch; the function then returns -EOPNOTSUPP
+         }
+        case PriorityCache::Priority::LAST:  // LAST asks for the bytes in use that no age bin accounts for
            {
-            int64_t request = _get_used_bytes();
+            uint32_t max = get_bin_count();  // bin count reported by the concrete cache
+           int64_t request = _get_used_bytes() - _sum_bins(0, max);  // used bytes minus the sum over bins [0, max)
              return(request > assigned) ? request - assigned : 0;
            }
          default:
-          break;
-        }
+         {
+           ceph_assert(pri > 0 && pri < PriorityCache::Priority::LAST);  // only PRI1 .. LAST-1 reach this branch
+            auto prev_pri = static_cast<PriorityCache::Priority>(pri - 1);
+            uint64_t start = get_bins(prev_pri);  // the previous priority's boundary is where this range begins
+            uint64_t end = get_bins(pri);  // this priority's own boundary ends the range
+            int64_t request = _sum_bins(start, end);  // sum over bins [start, end); the uint64_t bounds narrow to _sum_bins' uint32_t parameters
+            return(request > assigned) ? request - assigned : 0;  // only the shortfall beyond what is already assigned
+         }
+       }
          return -EOPNOTSUPP;
        }
   
@@ -2321,6 +2370,42 @@ private:
        virtual int64_t get_committed_size() const {
          return committed_bytes;
        }
+      virtual uint64_t get_bins(PriorityCache::Priority pri) const {
+        // Only PRI1 .. LAST-1 carry a stored bin boundary; PRI0 and LAST report 0.
+        const bool unbinned = pri <= PriorityCache::Priority::PRI0 ||
+                              pri >= PriorityCache::Priority::LAST;
+        if (unbinned) { return 0; }
+        return bins[pri];
+      }
+      virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) {
+        // Stores end_bin as the bin boundary of pri (PRI0 and LAST are ignored),
+        // then hands the largest boundary of any priority to set_bin_count().
+        if (pri <= PriorityCache::Priority::PRI0 ||
+            pri >= PriorityCache::Priority::LAST) {
+          return;
+        }
+        bins[pri] = end_bin;
+        uint64_t max = 0;
+        for (int p = 1; p < PriorityCache::Priority::LAST; p++) {  // named `p`, not `pri`, so the parameter is not shadowed
+          if (bins[p] > max) { max = bins[p]; }
+        }
+        set_bin_count(max);  // uint64_t narrows to set_bin_count's uint32_t parameter
+      }
+      virtual void import_bins(const std::vector<uint64_t> &bins_v) {
+        // bins_v[0] becomes the boundary of PRI1, bins_v[1] that of the next
+        // priority, and so on up to LAST-1; priorities past the end of bins_v get 0.
+        uint64_t largest = 0;
+        for (int level = 1; level < PriorityCache::Priority::LAST; level++) {
+          const unsigned src = static_cast<unsigned>(level) - 1;
+          const bool present = src < bins_v.size();
+          bins[level] = present ? bins_v[src] : 0;
+          if (bins[level] > largest) {
+            largest = bins[level];
+          }
+        }
+        // The largest imported boundary is handed to set_bin_count().
+        set_bin_count(largest);
+      }
        virtual double get_cache_ratio() const {
          return cache_ratio;
        }
@@ -2328,11 +2413,21 @@ private:
          cache_ratio = ratio;
        }
        virtual std::string get_cache_name() const = 0;
+      virtual uint32_t get_bin_count() const = 0;
+      virtual void set_bin_count(uint32_t count) = 0;
      };
  
      struct MetaCache : public MempoolCache {
        MetaCache(BlueStore *s) : MempoolCache(s) {};
  
+      virtual uint32_t get_bin_count() const {  ///< bin count as reported by onode cache shard 0
+        return store->onode_cache_shards[0]->get_bin_count();  // only shard 0 is queried; assumes at least one shard exists - TODO confirm
+      }
+      virtual void set_bin_count(uint32_t count) {  ///< give every onode cache shard the same bin count
+        for (const auto &shard : store->onode_cache_shards) {
+          shard->set_bin_count(count);
+        }
+      }
        virtual uint64_t _get_used_bytes() const {
          return mempool::bluestore_Buffer::allocated_bytes() +
            mempool::bluestore_Blob::allocated_bytes() +
@@ -2343,17 +2438,26 @@ private:
            mempool::bluestore_SharedBlob::allocated_bytes() +
            mempool::bluestore_inline_bl::allocated_bytes();
        }
-
+      virtual void shift_bins() {  ///< rotate the age bins of every onode cache shard
+        for (const auto &shard : store->onode_cache_shards) {
+          shard->shift_bins();
+        }
+      }
+      virtual uint64_t _sum_bins(uint32_t start, uint32_t end) const {  ///< byte estimate for bins [start, end) across all onode shards
+        uint64_t binned_onodes = 0;  // each shard's bin sum is treated as a number of onodes
+        for (const auto &shard : store->onode_cache_shards) {
+          binned_onodes += shard->sum_bins(start, end);
+        }
+        return binned_onodes * get_bytes_per_onode();  // count x average bytes per onode; the double product converts to uint64_t
+      }
        virtual std::string get_cache_name() const {
          return "BlueStore Meta Cache";
        }
-
        uint64_t _get_num_onodes() const {
          uint64_t onode_num =
              mempool::bluestore_cache_onode::allocated_items();
          return (2 > onode_num) ? 2 : onode_num;
        }
-
        double get_bytes_per_onode() const {
          return (double)_get_used_bytes() / (double)_get_num_onodes();
        }
@@ -2363,6 +2467,14 @@ private:
      struct DataCache : public MempoolCache {
        DataCache(BlueStore *s) : MempoolCache(s) {};
  
+      virtual uint32_t get_bin_count() const {  ///< bin count as reported by buffer cache shard 0
+        return store->buffer_cache_shards[0]->get_bin_count();  // only shard 0 is queried; assumes at least one shard exists - TODO confirm
+      }
+      virtual void set_bin_count(uint32_t count) {  ///< give every buffer cache shard the same bin count
+        for (const auto &shard : store->buffer_cache_shards) {
+          shard->set_bin_count(count);
+        }
+      }
        virtual uint64_t _get_used_bytes() const {
          uint64_t bytes = 0;
          for (auto i : store->buffer_cache_shards) {
@@ -2370,6 +2482,18 @@ private:
          }
          return bytes; 
        }
+      virtual void shift_bins() {  ///< rotate the age bins of every buffer cache shard
+        for (const auto &shard : store->buffer_cache_shards) {
+          shard->shift_bins();
+        }
+      }
+      virtual uint64_t _sum_bins(uint32_t start, uint32_t end) const {  ///< total over bins [start, end) across all buffer shards
+        uint64_t binned_total = 0;  // returned as-is, with no per-item scaling
+        for (const auto &shard : store->buffer_cache_shards) {
+          binned_total += shard->sum_bins(start, end);
+        }
+        return binned_total;
+      }
        virtual std::string get_cache_name() const {
          return "BlueStore Data Cache";
        }
@@ -2396,7 +2520,6 @@ private:
      }
  
    private:
-    void _adjust_cache_settings();
      void _update_cache_settings();
      void _resize_shards(bool interval_stats);
    } mempool_thread;
diff --git a/src/tools/rbd_mirror/Mirror.cc b/src/tools/rbd_mirror/Mirror.cc

index 38f35edcd440356db3a1f1731791fcadfc73acb8..a67afcf42dee685159475db95ca1e94541fc39eb 100644 (file)
--- a/src/tools/rbd_mirror/Mirror.cc
+++ b/src/tools/rbd_mirror/Mirror.cc
@@ -255,6 +255,19 @@ struct PriCache : public PriorityCache::PriCache {
      m_cache_ratio = ratio;
    }
  
+  void shift_bins() override {  // no-op: this PriCache keeps no bin state to rotate
+  }
+
+  void import_bins(const std::vector<uint64_t> &intervals) override {  // no-op: `intervals` is ignored
+  }
+
+  void set_bins(PriorityCache::Priority pri, uint64_t end_interval) override {  // no-op: both arguments are ignored
+  }
+
+  uint64_t get_bins(PriorityCache::Priority pri) const override {  // reports 0 for every priority
+    return 0;
+  }
+
    std::string get_cache_name() const override {
      return m_name;
    }
author	Mark Nelson <mnelson@redhat.com>
	Tue, 13 Oct 2020 23:24:18 +0000 (23:24 +0000)
committer	Mark Nelson <mnelson@redhat.com>
	Tue, 11 Jan 2022 21:01:19 +0000 (21:01 +0000)
src/common/PriorityCache.cc		patch \| blob \| history
src/common/PriorityCache.h		patch \| blob \| history
src/common/options/global.yaml.in		patch \| blob \| history
src/kv/rocksdb_cache/BinnedLRUCache.cc		patch \| blob \| history
src/kv/rocksdb_cache/BinnedLRUCache.h		patch \| blob \| history
src/kv/rocksdb_cache/ShardedCache.h		patch \| blob \| history
src/mon/OSDMonitor.cc		patch \| blob \| history
src/os/bluestore/BlueStore.cc		patch \| blob \| history
src/os/bluestore/BlueStore.h		patch \| blob \| history
src/tools/rbd_mirror/Mirror.cc		patch \| blob \| history