#include <ostream>
#include "include/ceph_assert.h"
#include "bluestore_types.h"
+#include "common/ceph_mutex.h"
typedef interval_set<uint64_t> release_set_t;
typedef release_set_t::value_type release_set_entry_t;
class Allocator {
+protected:
+
+ struct ExtentCollectionTraits {
+ size_t num_buckets;
+ size_t base_bits; // log2 of the minimal extent size
+ size_t base = 1ull << base_bits; // minimal extent size, bytes
+ size_t factor; // bucket sizing step: a single bucket covers
+ // the size range [len, len << factor)
+ // for log2(len) indexing and
+ // [len, len + factor * base)
+ // for linear indexing.
+
+ ExtentCollectionTraits(size_t _num_buckets,
+ size_t _base_bits = 12, //= 4096 bytes
+ size_t _factor = 1) :
+ num_buckets(_num_buckets),
+ base_bits(_base_bits),
+ base(1ull << base_bits),
+ factor(_factor)
+ {
+ ceph_assert(factor);
+ }
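+ // E.g. ExtentCollectionTraits(14), as used by Btree2Allocator below,
+ // yields 14 buckets over a 4 KiB base with factor = 1.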
+
+ /*
+ * Determines bucket index for a given extent's length in a bucket collection
+ * with log2(len) indexing.
+ * The last bucket index is returned for lengths above the maximum.
+ */
+ inline size_t _get_p2_size_bucket(uint64_t len) const {
+ size_t idx;
+ const size_t len_p2_max =
+ base << ((factor * (num_buckets - 2)));
+ if (len <= base) {
+ idx = 0;
+ } else if (len > len_p2_max) {
+ idx = num_buckets - 1;
+ } else {
+ size_t most_bit = cbits(uint64_t(len - 1)) - 1;
+ idx = 1 + ((most_bit - base_bits) / factor);
+ }
+ ceph_assert(idx < num_buckets);
+ return idx;
+ }
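+ // Illustration only (with the default base_bits = 12, i.e. 4 KiB,
+ // and factor = 1): len <= 4K -> 0, (4K, 8K] -> 1, (8K, 16K] -> 2,
+ // (16K, 32K] -> 3, ...; lengths above the covered maximum land in
+ // the last bucket.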
+ /*
+ * Determines bucket index for a given extent's length in a bucket collection
+ * with linear (len / min_extent_size) indexing.
+ * The last bucket index is returned for lengths above the maximum.
+ */
+ inline size_t _get_linear_size_bucket(uint64_t len) const {
+ size_t idx = (len / factor) >> base_bits;
+ idx = idx < num_buckets ? idx : num_buckets - 1;
+ return idx;
+ }
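+ // Illustration only (with base_bits = 12 and factor = 1):
+ // 4K -> 1, 8K -> 2, ... 60K -> 15; anything larger is clamped
+ // to the last bucket (num_buckets - 1).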
+ };
+
+ /*
+ * Lock-free, opportunistic stack.
+ * A put/get is performed only when it can complete without waiting;
+ * conflicting concurrent operations are simply rejected rather than
+ * serialized.
+ */
+ class LocklessOpportunisticStack {
+ std::atomic<size_t> ref = 0;
+ std::atomic<size_t> count = 0;
+ std::vector<uint64_t> data;
+ public:
+ void init(size_t size) {
+ data.resize(size);
+ }
+ bool try_put(uint64_t& v) {
+ bool done = ++ref == 1 && count < data.size();
+ if (done) {
+ data[count++] = v;
+ }
+ --ref;
+ return done;
+ }
+ bool try_get(uint64_t& v) {
+ bool done = ++ref == 1 && count > 0;
+ if (done) {
+ v = data[--count];
+ }
+ --ref;
+ return done;
+ }
+ void foreach(std::function<void(uint64_t)> notify) {
+ for (size_t i = 0; i < count; i++) {
+ notify(data[i]);
+ }
+ }
+ };
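+ // Hypothetical usage sketch (illustration only, not part of this change):
+ // LocklessOpportunisticStack s;
+ // s.init(16);
+ // uint64_t off = 0x1000;
+ // if (!s.try_put(off)) { /* busy or full: caller keeps the extent */ }
+ // uint64_t got = 0;
+ // if (s.try_get(got)) { /* reuse the cached offset */ }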
+ /*
+ * Concurrently accessed extent (offset, length) cache
+ * which permits a put/get operation only if no waiting is needed.
+ * Implemented as a set of independent buckets (aka LocklessOpportunisticStack).
+ * Each bucket keeps extents of a single size only (4K, 8K, 12K ... 64K),
+ * which avoids tracking the size of each cached extent.
+ * Each bucket permits a single operation at a time; additional
+ * concurrent operations against that bucket are rejected, meaning
+ * the relevant extents simply aren't cached.
+ */
+ class OpportunisticExtentCache {
+ const Allocator::ExtentCollectionTraits myTraits;
+ enum {
+ BUCKET_COUNT = 16,
+ EXTENTS_PER_BUCKET = 16, // amount of entries per single bucket,
+ // total amount of entries will be
+ // BUCKET_COUNT * EXTENTS_PER_BUCKET.
+ };
+
+ std::vector<LocklessOpportunisticStack> buckets;
+ std::atomic<size_t> hits = 0;
+ ceph::shared_mutex lock{
+ ceph::make_shared_mutex(std::string(), false, false, false)
+ };
+ public:
+ OpportunisticExtentCache() :
+ myTraits(BUCKET_COUNT + 1), // 16 regular buckets + 1 "catch-all"
+ // pseudo bucket used for out-of-bounds
+ // detection, since _get_*_size_bucket()
+ // returns the last bucket's index for
+ // lengths above the maximum.
+ buckets(BUCKET_COUNT)
+ {
+ for(auto& b : buckets) {
+ b.init(EXTENTS_PER_BUCKET);
+ }
+ }
+ bool try_put(uint64_t offset, uint64_t len) {
+ if (!lock.try_lock_shared()) {
+ return false;
+ }
+ bool ret = false;
+ ceph_assert(p2aligned(offset, myTraits.base));
+ ceph_assert(p2aligned(len, myTraits.base));
+ auto idx = myTraits._get_linear_size_bucket(len);
+ if (idx < buckets.size())
+ ret = buckets[idx].try_put(offset);
+ lock.unlock_shared();
+ return ret;
+ }
+ bool try_get(uint64_t* offset, uint64_t len) {
+ if (!lock.try_lock_shared()) {
+ return false;
+ }
+ bool ret = false;
+ ceph_assert(offset);
+ ceph_assert(p2aligned(len, myTraits.base));
+ auto idx = myTraits._get_linear_size_bucket(len); // same bucketing as try_put()
+ if (idx < buckets.size()) {
+ ret = buckets[idx].try_get(*offset);
+ if (ret) {
+ ++hits;
+ }
+ }
+ lock.unlock_shared();
+ return ret;
+ }
+ size_t get_hit_count() const {
+ return hits.load();
+ }
+ void foreach(std::function<void(uint64_t offset, uint64_t length)> notify) {
+ std::unique_lock _lock(lock);
+ for (uint64_t i = 0; i < buckets.size(); i++) {
+ auto cb = [&](uint64_t o) {
+ notify(o, i << myTraits.base_bits);
+ };
+ buckets[i].foreach(cb);
+ }
+ }
+ };
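+ // Hypothetical usage sketch (illustration only): the owning allocator is
+ // expected to offer small aligned extents to the cache on release and to
+ // probe it first on allocation; a 'false' return simply means falling
+ // back to the regular, lock-protected path.
+ // OpportunisticExtentCache c;
+ // if (!c.try_put(0x10000, 0x2000)) { /* cache busy/full: release normally */ }
+ // uint64_t off = 0;
+ // if (c.try_get(&off, 0x2000)) { /* reuse a cached 8 KiB extent at 'off' */ }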
public:
Btree2Allocator(CephContext* _cct,
  int64_t device_size,
  int64_t block_size,
  uint64_t max_mem,
  double _rweight_factor,
  bool with_cache,
  std::string_view name) :
Allocator(name, device_size, block_size),
+ myTraits(RANGE_SIZE_BUCKET_COUNT),
cct(_cct),
range_count_cap(max_mem / sizeof(range_seg_t))
{
set_weight_factor(_rweight_factor);
if (with_cache) {
- cache = new ChunkCache();
+ cache = new OpportunisticExtentCache();
}
+ range_size_set.resize(myTraits.num_buckets);
}
void Btree2Allocator::init_add_free(uint64_t offset, uint64_t length)
continue;
}
}
- size_t bucket0 = MyTraits::_get_size_bucket(want_now);
+ size_t bucket0 = myTraits._get_p2_size_bucket(want_now);
int64_t r = __allocate(bucket0, want_now,
unit, extents);
if (r < 0) {
auto rs_p = _pick_block(0, rs_tree, size);
if (rs_p == rs_tree->end()) {
- auto bucket_center = MyTraits::_get_size_bucket(weight_center);
+ auto bucket_center = myTraits._get_p2_size_bucket(weight_center);
// requested size is to the left of weight center
// hence we try to search up toward it first
bucket = dir < 0 ? bucket0 : bucket_center + 1;
do {
// try spilled over or different direction if bucket index is out of bounds
- if (bucket >= MyTraits::num_size_buckets) {
+ if (bucket >= myTraits.num_buckets) {
if (dir < 0) {
// reached the bottom while going downhill,
// time to try spilled over extents
dir = -dir;
bucket = dir < 0 ? bucket0 : bucket_center + 1; // See above on new bucket
// selection rationales
- ceph_assert(bucket < MyTraits::num_size_buckets); // this should never happen
+ ceph_assert(bucket < myTraits.num_buckets); // this should never happen
if (dir == dir0 ) {
// stop if both directions already attempted
return -ENOSPC;
uint64_t end)
{
range_seg_t rs(rt_p->first, rt_p->second);
- size_t bucket = MyTraits::_get_size_bucket(rs.length());
+ size_t bucket = myTraits._get_p2_size_bucket(rs.length());
range_size_tree_t* rs_tree = &range_size_set[bucket];
auto rs_p = rs_tree->find(rs);
ceph_assert(rs_p != rs_tree->end());
void Btree2Allocator::_range_size_tree_add(const range_seg_t& rs) {
auto l = rs.length();
ceph_assert(rs.end > rs.start);
- size_t bucket = MyTraits::_get_size_bucket(l);
+ size_t bucket = myTraits._get_p2_size_bucket(l);
range_size_set[bucket].insert(rs);
num_free += l;
}
void Btree2Allocator::_range_size_tree_rm(const range_seg_t& rs)
{
- size_t bucket = MyTraits::_get_size_bucket(rs.length());
+ size_t bucket = myTraits._get_p2_size_bucket(rs.length());
range_size_tree_t* rs_tree = &range_size_set[bucket];
ceph_assert(rs_tree->size() > 0);
auto rs_p = rs_tree->find(rs);
#include "include/mempool.h"
#include "common/ceph_mutex.h"
-template <size_t BASE_BITS, size_t NUM_BUCKETS, size_t FACTOR = 1>
-struct SizeBucketCollectionTraits {
- static const size_t base_bits = BASE_BITS;
- static const size_t base = 1ull << base_bits;
- static const size_t size_factor = FACTOR; // single bucket size range to be
- // determined as [n, n * 2 * FACTOR)
- static const size_t num_size_buckets = NUM_BUCKETS;
- static const size_t bucket_max =
- base << ((size_factor * (num_size_buckets - 2)));
-
- static inline size_t _get_size_bucket(uint64_t len) {
- size_t idx;
- ceph_assert(size_factor);
- if (len <= base) {
- idx = 0;
- } else if (len > bucket_max) {
- idx = num_size_buckets - 1;
- } else {
- size_t most_bit = cbits(uint64_t(len - 1)) - 1;
- idx = 1 + ((most_bit - base_bits) / size_factor);
- }
- ceph_assert(idx < num_size_buckets);
- return idx;
- }
-};
-
-class ChunkCache {
-public:
- class LocklessBuf {
- std::atomic<size_t> ref = 0;
- size_t count = 0;
- std::vector<uint32_t> data;
- public:
- void init(size_t size) {
- ceph_assert(data.size() == 0);
- data.resize(size);
- }
- bool try_put(uint32_t& v) {
- bool done = ++ref == 1 && count < data.size();
- if (done) {
- data[count++] = v;
- }
- --ref;
- return done;
- }
- bool try_get(uint32_t& v) {
- bool done = ++ref == 1 && count > 0;
- if (done) {
- v = data[--count];
- }
- --ref;
- return done;
- }
- void foreach(std::function<void(uint32_t)> notify) {
- for (size_t i = 0; i < count; i++) {
- notify(data[i]);
- }
- }
- };
-
- static const size_t base_bits = 12; // 4K
- static const size_t base = 1ull << base_bits;
- static const size_t num_buckets = 16; // [4K, 8K, 12K, ... 64K] - 16 buckets total
- static const size_t num_chunks = 16;
- ChunkCache() {
- for (size_t i = 0; i < buckets.size(); i++) {
- buckets[i].init(num_chunks);
- }
- }
- bool try_put(uint64_t offset, uint64_t len) {
- if (!lock.try_lock_shared()) {
- return false;
- }
- bool ret = false;
- ceph_assert(p2aligned(offset, base));
- ceph_assert(p2aligned(len, base));
- // permit caching chunks with offsets fitting (without LSBs)
- // into 32 - bit value
- offset = offset >> base_bits;
- size_t idx = len >> base_bits;
- if (offset <= std::numeric_limits<uint32_t>::max() &&
- idx < buckets.size()) {
- uint32_t o = offset;
- ret = buckets[idx].try_put(o);
- }
- lock.unlock_shared();
- return ret;
- }
- bool try_get(uint64_t* offset, uint64_t len) {
- if (!lock.try_lock_shared()) {
- return false;
- }
- bool ret = false;
- ceph_assert(offset);
- ceph_assert(p2aligned(len, base));
- size_t idx = len >> base_bits;
- if (idx < buckets.size()) {
- uint32_t o = 0;
- ret = buckets[idx].try_get(o);
- if (ret) {
- *offset = uint64_t(o) << base_bits;
- ++hits;
- }
- }
- lock.unlock_shared();
- return ret;
- }
- size_t get_hit_count() const {
- return hits.load();
- }
- void foreach(std::function<void(uint64_t offset, uint64_t length)> notify) {
- std::unique_lock _lock(lock);
- for (uint64_t i = 0; i < buckets.size(); i++) {
- auto cb = [&](uint32_t o) {
- notify(uint64_t(o) << base_bits, i << base_bits);
- };
- buckets[i].foreach(cb);
- }
- }
-private:
- std::array<LocklessBuf, num_buckets> buckets;
- std::atomic<size_t> hits = 0;
- ceph::shared_mutex lock{
- ceph::make_shared_mutex(std::string(), false, false, false)
- };
-};
-
/*
* class Btree2Allocator
*
*
*/
class Btree2Allocator : public Allocator {
- typedef SizeBucketCollectionTraits<12, 14> MyTraits;
+ enum {
+ RANGE_SIZE_BUCKET_COUNT = 14,
+ };
+ const ExtentCollectionTraits myTraits;
+
public:
// Making public to share with mempools
struct range_seg_t {
MEMPOOL_CLASS_HELPERS(); ///< memory monitoring
- uint64_t start; ///< starting offset of this segment
- uint64_t end; ///< ending offset (non-inclusive)
+ uint64_t start; ///< starting offset of this segment
+ uint64_t end; ///< ending offset (non-inclusive)
// Tree is sorted by offset, greater offsets at the end of the tree.
struct before_t {
private:
CephContext* cct = nullptr;
- ChunkCache* cache = nullptr;
+ Allocator::OpportunisticExtentCache* cache = nullptr;
std::mutex lock;
template<class T>
range_seg_t,
range_seg_t::shorter_t,
pool_allocator<range_seg_t>>;
- std::array<
- range_size_tree_t, MyTraits::num_size_buckets> range_size_set;
+ std::vector<range_size_tree_t> range_size_set;
std::atomic<uint64_t> num_free = 0; ///< total bytes in freelist