os/bluestore: introduce spillover mechanics to AvlAllocator.
author Igor Fedotov <ifedotov@suse.com>
Wed, 12 Feb 2020 12:55:03 +0000 (15:55 +0300)
committer Igor Fedotov <ifedotov@suse.com>
Mon, 13 Apr 2020 13:12:43 +0000 (16:12 +0300)
This is to be utilized by AvlAllocator's descendant class(es) to limit
the number of entries the allocator tracks, hence capping the RAM used
by the allocator.

Signed-off-by: Igor Fedotov <ifedotov@suse.com>
src/os/bluestore/AvlAllocator.cc
src/os/bluestore/AvlAllocator.h
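
For context, a minimal sketch of how a descendant class might consume this mechanism. The class name CappedAvlAllocator and its backing container are hypothetical (not part of this commit), and deriving assumes the 'final' specifier on AvlAllocator is dropped, as the protected ctor implies: the descendant forwards a cap through the new protected constructor and overrides _spillover_range() to park the extents the AVL trees stop tracking.

// Hypothetical descendant; sketch only, not this commit's code.
#include "AvlAllocator.h"
#include <utility>
#include <vector>

class CappedAvlAllocator : public AvlAllocator {
public:
  CappedAvlAllocator(CephContext* cct, int64_t device_size, int64_t block_size,
                     uint64_t max_entries, const std::string& name)
    : AvlAllocator(cct, device_size, block_size, max_entries, name) {}

private:
  // Called from _try_insert_range() once the entry cap is hit and the range
  // is not worth keeping in the RAM-resident trees; record it elsewhere.
  void _spillover_range(uint64_t start, uint64_t end) override {
    spilled.emplace_back(start, end - start);  // offset, length
  }

  std::vector<std::pair<uint64_t, uint64_t>> spilled;  // hypothetical store
};

Passing max_entries == 0 leaves the cap disabled, matching the existing behaviour.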

index ccc3c756379cb4ffca3cc9933feff9a9459cf3ad..abc130a41a25de39dbe7d4885e98e4bba8c98714 100644 (file)
@@ -55,15 +55,6 @@ uint64_t AvlAllocator::_block_picker(const Tree& t,
    return _block_picker(t, cursor, size, align);
 }
 
-namespace {
-  struct dispose_rs {
-    void operator()(range_seg_t* p)
-    {
-      delete p;
-    }
-  };
-}
-
 void AvlAllocator::_add_to_tree(uint64_t start, uint64_t size)
 {
   assert(size != 0);
@@ -83,25 +74,22 @@ void AvlAllocator::_add_to_tree(uint64_t start, uint64_t size)
   bool merge_after = (rs_after != range_tree.end() && rs_after->start == end);
 
   if (merge_before && merge_after) {
-    range_size_tree.erase(*rs_before);
-    range_size_tree.erase(*rs_after);
+    _range_size_tree_rm(*rs_before);
+    _range_size_tree_rm(*rs_after);
     rs_after->start = rs_before->start;
     range_tree.erase_and_dispose(rs_before, dispose_rs{});
-    range_size_tree.insert(*rs_after);
+    _range_size_tree_try_insert(*rs_after);
   } else if (merge_before) {
-    range_size_tree.erase(*rs_before);
+    _range_size_tree_rm(*rs_before);
     rs_before->end = end;
-    range_size_tree.insert(*rs_before);
+    _range_size_tree_try_insert(*rs_before);
   } else if (merge_after) {
-    range_size_tree.erase(*rs_after);
+    _range_size_tree_rm(*rs_after);
     rs_after->start = start;
-    range_size_tree.insert(*rs_after);
+    _range_size_tree_try_insert(*rs_after);
   } else {
-    auto new_rs = new range_seg_t{start, end};
-    range_tree.insert_before(rs_after, *new_rs);
-    range_size_tree.insert(*new_rs);
+    _try_insert_range(start, end, &rs_after);
   }
-  num_free += size;
 }
 
 void AvlAllocator::_remove_from_tree(uint64_t start, uint64_t size)
@@ -120,25 +108,31 @@ void AvlAllocator::_remove_from_tree(uint64_t start, uint64_t size)
   bool left_over = (rs->start != start);
   bool right_over = (rs->end != end);
 
-  range_size_tree.erase(*rs);
+  _range_size_tree_rm(*rs);
 
   if (left_over && right_over) {
-    auto new_seg = new range_seg_t{end, rs->end};
+    auto old_right_end = rs->end;
+    auto insert_pos = rs;
+    ceph_assert(insert_pos != range_tree.end());
+    ++insert_pos;
     rs->end = start;
-    range_tree.insert(rs, *new_seg);
-    range_size_tree.insert(*new_seg);
-    range_size_tree.insert(*rs);
+
+    // Insert the tail first to be sure insert_pos hasn't been disposed.
+    // This wouldn't dispose rs though, since it's out of range_size_tree.
+    // Don't care about the small chance of a 'not-the-best-choice-for-removal'
+    // case, which might happen if rs has the lowest size.
+    _try_insert_range(end, old_right_end, &insert_pos);
+    _range_size_tree_try_insert(*rs);
+
   } else if (left_over) {
     rs->end = start;
-    range_size_tree.insert(*rs);
+    _range_size_tree_try_insert(*rs);
   } else if (right_over) {
     rs->start = end;
-    range_size_tree.insert(*rs);
+    _range_size_tree_try_insert(*rs);
   } else {
     range_tree.erase_and_dispose(rs, dispose_rs{});
   }
-  assert(num_free >= size);
-  num_free -= size;
 }
 
 int AvlAllocator::_allocate(
@@ -198,6 +192,22 @@ int AvlAllocator::_allocate(
   return 0;
 }
 
+AvlAllocator::AvlAllocator(CephContext* cct,
+                           int64_t device_size,
+                           int64_t block_size,
+                           uint64_t max_entries,
+                           const std::string& name) :
+  Allocator(name),
+  num_total(device_size),
+  block_size(block_size),
+  range_size_alloc_threshold(
+    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_threshold")),
+  range_size_alloc_free_pct(
+    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_free_pct")),
+  range_count_cap(max_entries),
+  cct(cct)
+{}
+
 AvlAllocator::AvlAllocator(CephContext* cct,
                           int64_t device_size,
                           int64_t block_size,
index a855d975da611ed1011b352826aaa9475b69224b..0ad3c99c6a2e6c455a6ee5c253a5f2788acb1be7 100644 (file)
@@ -43,10 +43,30 @@ struct range_seg_t {
       }
     }
   };
+  inline uint64_t length() const {
+    return end - start;
+  }
   boost::intrusive::avl_set_member_hook<> size_hook;
 };
 
 class AvlAllocator final : public Allocator {
+  struct dispose_rs {
+    void operator()(range_seg_t* p)
+    {
+      delete p;
+    }
+  };
+
+protected:
+  /*
+  * ctor intended for use by descendant class(es) which
+  * provide handling for spilled-over entries
+  * (when entry count >= max_entries)
+  */
+  AvlAllocator(CephContext* cct, int64_t device_size, int64_t block_size,
+    uint64_t max_entries,
+    const std::string& name);
+
 public:
   AvlAllocator(CephContext* cct, int64_t device_size, int64_t block_size,
               const std::string& name);
@@ -100,7 +120,8 @@ private:
       boost::intrusive::member_hook<
        range_seg_t,
        boost::intrusive::avl_set_member_hook<>,
-       &range_seg_t::size_hook>>;
+       &range_seg_t::size_hook>,
+      boost::intrusive::constant_time_size<true>>;
   range_size_tree_t range_size_tree;
 
   const int64_t num_total;   ///< device size
@@ -131,6 +152,66 @@ private:
    */
   int range_size_alloc_free_pct = 0;
 
+  /*
+  * Max number of range entries allowed. 0 - unlimited
+  */
+  uint64_t range_count_cap = 0;
+
   CephContext* cct;
   std::mutex lock;
+
+  uint64_t _lowest_size_available() {
+    auto rs = range_size_tree.begin();
+    return rs != range_size_tree.end() ? rs->length() : 0;
+  }
+  void _range_size_tree_rm(range_seg_t& r) {
+    ceph_assert(num_free >= r.length());
+    num_free -= r.length();
+    range_size_tree.erase(r);
+
+  }
+  void _range_size_tree_try_insert(range_seg_t& r) {
+    if (_try_insert_range(r.start, r.end)) {
+      range_size_tree.insert(r);
+      num_free += r.length();
+    } else {
+      range_tree.erase_and_dispose(r, dispose_rs{});
+    }
+  }
+  bool _try_insert_range(uint64_t start,
+                         uint64_t end,
+                        range_tree_t::iterator* insert_pos = nullptr) {
+    bool res = !range_count_cap || range_size_tree.size() < range_count_cap;
+    bool remove_lowest = false;
+    if (!res) {
+      if (end - start > _lowest_size_available()) {
+        remove_lowest = true;
+        res = true;
+      }
+    }
+    if (!res) {
+      _spillover_range(start, end);
+    } else {
+      // NB: we should do the insertion before the following removal
+      // to avoid potentially disposing of the iterator the insertion depends on.
+      if (insert_pos) {
+        auto new_rs = new range_seg_t{ start, end };
+        range_tree.insert_before(*insert_pos, *new_rs);
+        range_size_tree.insert(*new_rs);
+        num_free += new_rs->length();
+      }
+      if (remove_lowest) {
+        auto r = range_size_tree.begin();
+        _range_size_tree_rm(*r);
+        _spillover_range(r->start, r->end);
+        range_tree.erase_and_dispose(*r, dispose_rs{});
+      }
+    }
+    return res;
+  }
+  virtual void _spillover_range(uint64_t start, uint64_t end) {
+    // this should be overridden when a range count cap is present,
+    // i.e. (range_count_cap > 0)
+    ceph_assert(false);
+  }
 };
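
To make the cap/eviction decision in _try_insert_range() concrete: below the cap every free range is tracked; once the cap is reached, an incoming range is spilled unless it is strictly larger than the smallest tracked one, in which case the smallest range is spilled instead, so the largest extents stay resident. A standalone model of just that policy (illustration only, not allocator code; a std::multiset of lengths stands in for range_size_tree):

#include <cstdint>
#include <iostream>
#include <set>

int main() {
  const std::size_t cap = 3;         // plays the role of range_count_cap
  std::multiset<uint64_t> tracked;   // lengths of tracked free ranges

  auto try_insert = [&](uint64_t len) {
    if (tracked.size() < cap) {
      tracked.insert(len);                     // below the cap: always track
    } else if (len > *tracked.begin()) {
      std::cout << "spill " << *tracked.begin() << "\n";
      tracked.erase(tracked.begin());          // evict the smallest range
      tracked.insert(len);                     // keep the larger newcomer
    } else {
      std::cout << "spill " << len << "\n";    // the newcomer itself is spilled
    }
  };

  for (uint64_t len : {4, 1, 8, 2, 16})
    try_insert(len);
  // tracked now holds {4, 8, 16}: the largest extents remain in RAM
}

Note that switching range_size_tree to constant_time_size<true> keeps the size() check in _try_insert_range() O(1).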