]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
crimson/os/seastore/btree: search fixed-kv-btree by parent<->child pointers
authorXuehan Xu <xxhdx1985126@gmail.com>
Thu, 13 Oct 2022 03:50:17 +0000 (11:50 +0800)
committerMatan Breizman <mbreizma@redhat.com>
Tue, 23 May 2023 13:24:44 +0000 (13:24 +0000)
Signed-off-by: Xuehan Xu <xxhdx1985126@gmail.com>
(cherry picked from commit 7c3305f0149808f800dc6c852d4f1f755a490ec4)

src/crimson/os/seastore/btree/fixed_kv_btree.h
src/crimson/os/seastore/btree/fixed_kv_node.h
src/crimson/os/seastore/cache.h
src/crimson/os/seastore/cached_extent.h
src/crimson/os/seastore/transaction.h

index 71e77a5ae970cc11c32173a27ea83764223a6046..ca6e9359e7a963111d67ab6f24fdb86ccdb60cdf 100644 (file)
@@ -219,6 +219,12 @@ public:
       typename NodeType::Ref node;
       uint16_t pos = INVALID;
 
+      node_position_t() = default;
+      node_position_t(
+        typename NodeType::Ref node,
+        uint16_t pos)
+        : node(node), pos(pos) {}
+
       void reset() {
        *this = node_position_t{};
       }
@@ -1009,6 +1015,9 @@ private:
   phy_tree_root_t root;
   bool root_dirty = false;
 
+  template <typename T>
+  using node_position_t = typename iterator::template node_position_t<T>;
+
   using get_internal_node_iertr = base_iertr;
   using get_internal_node_ret = get_internal_node_iertr::future<InternalNodeRef>;
   static get_internal_node_ret get_internal_node(
@@ -1016,7 +1025,8 @@ private:
     depth_t depth,
     paddr_t offset,
     node_key_t begin,
-    node_key_t end)
+    node_key_t end,
+    typename std::optional<node_position_t<internal_node_t>> parent_pos)
   {
     LOG_PREFIX(FixedKVBtree::get_internal_node);
     SUBTRACET(
@@ -1028,10 +1038,16 @@ private:
       begin,
       end);
     assert(depth > 1);
-    auto init_internal = [c, depth, begin, end](internal_node_t &node) {
+    auto init_internal = [c, depth, begin, end,
+                          parent_pos=std::move(parent_pos)]
+                          (internal_node_t &node) {
       assert(!node.is_pending());
       assert(!node.pin.is_linked());
       node.pin.set_range(fixed_kv_node_meta_t<node_key_t>{begin, end, depth});
+      if (parent_pos) {
+        auto &parent = parent_pos->node;
+        parent->link_child(&node, parent_pos->pos);
+      }
       if (c.pins) {
         c.pins->add_pin(node.pin);
       }
@@ -1078,7 +1094,8 @@ private:
     op_context_t<node_key_t> c,
     paddr_t offset,
     node_key_t begin,
-    node_key_t end)
+    node_key_t end,
+    typename std::optional<node_position_t<leaf_node_t>> parent_pos)
   {
     LOG_PREFIX(FixedKVBtree::get_leaf_node);
     SUBTRACET(
@@ -1088,10 +1105,16 @@ private:
       offset,
       begin,
       end);
-    auto init_leaf = [c, begin, end](leaf_node_t &node) {
+    auto init_leaf = [c, begin, end,
+                      parent_pos=std::move(parent_pos)]
+                      (leaf_node_t &node) {
       assert(!node.is_pending());
       assert(!node.pin.is_linked());
       node.pin.set_range(fixed_kv_node_meta_t<node_key_t>{begin, end, 1});
+      if (parent_pos) {
+        auto &parent = parent_pos->node;
+        parent->link_child(&node, parent_pos->pos);
+      }
       if (c.pins) {
         c.pins->add_pin(node.pin);
       }
@@ -1143,7 +1166,8 @@ private:
        root.get_depth(),
        root.get_location(),
        min_max_t<node_key_t>::min,
-       min_max_t<node_key_t>::max
+       min_max_t<node_key_t>::max,
+        std::nullopt
       ).si_then([this, visitor, &iter](InternalNodeRef root_node) {
        iter.get_internal(root.get_depth()).node = root_node;
        if (visitor) (*visitor)(
@@ -1158,7 +1182,8 @@ private:
        c,
        root.get_location(),
        min_max_t<node_key_t>::min,
-       min_max_t<node_key_t>::max
+       min_max_t<node_key_t>::max,
+        std::nullopt
       ).si_then([visitor, &iter, this](LeafNodeRef root_node) {
        iter.leaf.node = root_node;
        if (visitor) (*visitor)(
@@ -1181,22 +1206,13 @@ private:
     F &f,
     mapped_space_visitor_t *visitor
   ) {
+    LOG_PREFIX(FixedKVBtree::lookup_internal_level);
     assert(depth > 1);
     auto &parent_entry = iter.get_internal(depth + 1);
     auto parent = parent_entry.node;
     auto node_iter = parent->iter_idx(parent_entry.pos);
-    auto next_iter = node_iter + 1;
-    auto begin = node_iter->get_key();
-    auto end = next_iter == parent->end()
-      ? parent->get_node_meta().end
-      : next_iter->get_key();
-    return get_internal_node(
-      c,
-      depth,
-      node_iter->get_val().maybe_relative_to(parent->get_paddr()),
-      begin,
-      end
-    ).si_then([depth, visitor, &iter, &f](InternalNodeRef node) {
+
+    auto on_found = [depth, visitor, &iter, &f](InternalNodeRef node) {
       auto &entry = iter.get_internal(depth);
       entry.node = node;
       auto node_iter = f(*node);
@@ -1209,6 +1225,50 @@ private:
           depth,
           node->get_type());
       return seastar::now();
+    };
+
+    auto child_pos = parent->get_child(c.trans, node_iter);
+    auto &child = child_pos.child;
+    if (child) {
+      SUBTRACET(seastore_fixedkv_tree,
+        "got child on {}, pos: {}, res: {}",
+        c.trans,
+        *parent_entry.node,
+        parent_entry.pos,
+        *child);
+
+      ceph_assert(child->is_valid());
+      if (!child->is_pending_in_trans(c.trans.get_trans_id())) {
+        c.trans.add_to_read_set(child);
+        if (!child->is_mutation_pending()) {
+          c.cache.touch_extent(*child);
+        }
+      }
+      return child->wait_io().then(
+        [child, on_found=std::move(on_found), node_iter]() mutable {
+        auto &cnode = (typename internal_node_t::base_t &)*child;
+        assert(cnode.get_node_meta().begin == node_iter.get_key());
+        assert(cnode.get_node_meta().end > node_iter.get_key());
+        return on_found(child->template cast<internal_node_t>());
+      });
+    }
+
+    auto next_iter = node_iter + 1;
+    auto begin = node_iter->get_key();
+    auto end = next_iter == parent->end()
+      ? parent->get_node_meta().end
+      : next_iter->get_key();
+    return get_internal_node(
+      c,
+      depth,
+      node_iter->get_val().maybe_relative_to(parent->get_paddr()),
+      begin,
+      end,
+      std::make_optional<node_position_t<internal_node_t>>(
+        child_pos.stable_parent->template cast<internal_node_t>(),
+        child_pos.pos)
+    ).si_then([on_found=std::move(on_found)](InternalNodeRef node) {
+      return on_found(node);
     });
   }
 
@@ -1221,22 +1281,13 @@ private:
     F &f,
     mapped_space_visitor_t *visitor
   ) {
+    LOG_PREFIX(FixedKVBtree::lookup_leaf);
     auto &parent_entry = iter.get_internal(2);
     auto parent = parent_entry.node;
     assert(parent);
     auto node_iter = parent->iter_idx(parent_entry.pos);
-    auto next_iter = node_iter + 1;
-    auto begin = node_iter->get_key();
-    auto end = next_iter == parent->end()
-      ? parent->get_node_meta().end
-      : next_iter->get_key();
 
-    return get_leaf_node(
-      c,
-      node_iter->get_val().maybe_relative_to(parent->get_paddr()),
-      begin,
-      end
-    ).si_then([visitor, &iter, &f](LeafNodeRef node) {
+    auto on_found = [visitor, &iter, &f](LeafNodeRef node) {
       iter.leaf.node = node;
       auto node_iter = f(*node);
       iter.leaf.pos = node_iter->get_offset();
@@ -1247,6 +1298,50 @@ private:
           1,
           node->get_type());
       return seastar::now();
+    };
+
+    auto child_pos = parent->get_child(c.trans, node_iter);
+    auto &child = child_pos.child;
+    if (child) {
+      SUBTRACET(seastore_fixedkv_tree,
+        "got child on {}, pos: {}, res: {}",
+        c.trans,
+        *parent_entry.node,
+        parent_entry.pos,
+        *child);
+
+      ceph_assert(child->is_valid());
+      if (!child->is_pending_in_trans(c.trans.get_trans_id())) {
+        c.trans.add_to_read_set(child);
+        if (!child->is_mutation_pending()) {
+          c.cache.touch_extent(*child);
+        }
+      }
+      return child->wait_io().then(
+        [child, on_found=std::move(on_found), node_iter]() mutable {
+        auto &cnode = (typename internal_node_t::base_t &)*child;
+        assert(cnode.get_node_meta().begin == node_iter.get_key());
+        assert(cnode.get_node_meta().end > node_iter.get_key());
+        return on_found(child->template cast<leaf_node_t>());
+      });
+    }
+
+    auto next_iter = node_iter + 1;
+    auto begin = node_iter->get_key();
+    auto end = next_iter == parent->end()
+      ? parent->get_node_meta().end
+      : next_iter->get_key();
+
+    return get_leaf_node(
+      c,
+      node_iter->get_val().maybe_relative_to(parent->get_paddr()),
+      begin,
+      end,
+      std::make_optional<node_position_t<leaf_node_t>>(
+        child_pos.stable_parent->template cast<leaf_node_t>(),
+        child_pos.pos)
+    ).si_then([on_found=std::move(on_found)](LeafNodeRef node) {
+      return on_found(node);
     });
   }
 
@@ -1627,9 +1722,6 @@ private:
       });
   }
 
-  template <typename T>
-  using node_position_t = typename iterator::template node_position_t<T>;
-
   template <typename NodeType,
             std::enable_if_t<std::is_same_v<NodeType, leaf_node_t>, int> = 0>
   base_iertr::future<typename NodeType::Ref> get_node(
@@ -1637,9 +1729,10 @@ private:
     depth_t depth,
     paddr_t addr,
     node_key_t begin,
-    node_key_t end) {
+    node_key_t end,
+    typename std::optional<node_position_t<leaf_node_t>> parent_pos) {
     assert(depth == 1);
-    return get_leaf_node(c, addr, begin, end);
+    return get_leaf_node(c, addr, begin, end, std::move(parent_pos));
   }
 
   template <typename NodeType,
@@ -1649,8 +1742,9 @@ private:
     depth_t depth,
     paddr_t addr,
     node_key_t begin,
-    node_key_t end) {
-    return get_internal_node(c, depth, addr, begin, end);
+    node_key_t end,
+    typename std::optional<node_position_t<internal_node_t>> parent_pos) {
+    return get_internal_node(c, depth, addr, begin, end, std::move(parent_pos));
   }
 
   template <typename NodeType>
@@ -1678,13 +1772,7 @@ private:
       : next_iter->get_key();
     
     SUBTRACET(seastore_fixedkv_tree, "parent: {}, node: {}", c.trans, *parent_pos.node, *pos.node);
-    return get_node<NodeType>(
-      c,
-      depth,
-      donor_iter.get_val().maybe_relative_to(parent_pos.node->get_paddr()),
-      begin,
-      end
-    ).si_then([c, iter, donor_iter, donor_is_left, &parent_pos, &pos](
+    auto do_merge = [c, iter, donor_iter, donor_is_left, &parent_pos, &pos](
                 typename NodeType::Ref donor) {
       LOG_PREFIX(FixedKVBtree::merge_level);
       auto [l, r] = donor_is_left ?
@@ -1756,6 +1844,49 @@ private:
       }
 
       return seastar::now();
+    };
+
+    auto child_pos = parent_pos.node->get_child(c.trans, donor_iter);
+    auto &child = child_pos.child;
+    if (child) {
+      SUBTRACET(seastore_fixedkv_tree,
+        "got child on {}, pos: {}, res: {}",
+        c.trans,
+        *parent_pos.node,
+        donor_iter.get_offset(),
+        *child);
+
+      ceph_assert(child->is_valid());
+      if (!child->is_pending_in_trans(c.trans.get_trans_id())) {
+        c.trans.add_to_read_set(child);
+        if (!child->is_mutation_pending()) {
+          c.cache.touch_extent(*child);
+        }
+      }
+      return child->wait_io().then(
+        [child, do_merge=std::move(do_merge), &pos,
+        donor_iter, donor_is_left]() mutable {
+        auto &node = (typename internal_node_t::base_t&)*child;
+        assert(donor_is_left ?
+          node.get_node_meta().end == pos.node->get_node_meta().begin :
+          node.get_node_meta().begin == pos.node->get_node_meta().end);
+        assert(node.get_node_meta().begin == donor_iter.get_key());
+        assert(node.get_node_meta().end > donor_iter.get_key());
+        return do_merge(child->template cast<NodeType>());
+      });
+    }
+
+    return get_node<NodeType>(
+      c,
+      depth,
+      donor_iter.get_val().maybe_relative_to(parent_pos.node->get_paddr()),
+      begin,
+      end,
+      std::make_optional<node_position_t<NodeType>>(
+        child_pos.stable_parent->template cast<NodeType>(),
+        child_pos.pos)
+    ).si_then([do_merge=std::move(do_merge)](typename NodeType::Ref donor) {
+      return do_merge(donor);
     });
   }
 };
index 101ad4945d3f10165aed8222ce5f645d2cb041ac..b171db6965b5a20e452a7d084f48ab7e7b58796e 100644 (file)
@@ -216,6 +216,47 @@ struct FixedKVNode : CachedExtent {
     }
   }
 
+  struct child_pos_t {
+    FixedKVNodeRef stable_parent;
+    uint16_t pos = std::numeric_limits<uint16_t>::max();
+    CachedExtentRef child;
+    child_pos_t(CachedExtentRef child) : child(child) {}
+    child_pos_t(FixedKVNodeRef stable_parent, uint16_t pos)
+      : stable_parent(stable_parent), pos(pos) {}
+  };
+
+  void link_child(FixedKVNode* child, uint16_t pos) {
+    assert(pos < get_node_size());
+    assert(child);
+    ceph_assert(!is_pending());
+    ceph_assert(child->is_valid() && !child->is_pending());
+    assert(!children[pos]);
+    children[pos] = child;
+    set_child_ptracker(child);
+  }
+
+  template <typename iter_t>
+  child_pos_t get_child(Transaction &t, iter_t iter) {
+    auto pos = iter.get_offset();
+    assert(children.capacity());
+    auto child = children[pos];
+    if (child) {
+      return child_pos_t(child->get_transactional_view(t));
+    } else if (is_pending()) {
+      auto key = iter.get_key();
+      auto &sparent = get_stable_for_key(key);
+      auto spos = sparent.child_pos_for_key(key);
+      auto child = sparent.children[spos];
+      if (child) {
+       return child_pos_t(child->get_transactional_view(t));
+      } else {
+       return child_pos_t(&sparent, spos);
+      }
+    } else {
+      return child_pos_t(this, pos);
+    }
+  }
+
   void split_child_ptrs(
     FixedKVNode &left,
     FixedKVNode &right)
@@ -585,6 +626,13 @@ struct FixedKVInternalNode
     return this->upper_bound(key).get_offset();
   }
 
+  uint16_t child_pos_for_key(NODE_KEY key) const final {
+    auto it = this->upper_bound(key);
+    assert(it != this->begin());
+    --it;
+    return it.get_offset();
+  }
+
   NODE_KEY get_key_from_idx(uint16_t idx) const final {
     return this->iter_idx(idx).get_key();
   }
@@ -919,6 +967,10 @@ struct FixedKVLeafNode
     return this->upper_bound(key).get_offset();
   }
 
+  uint16_t child_pos_for_key(NODE_KEY key) const final {
+    return lower_bound_offset(key);
+  }
+
   NODE_KEY get_key_from_idx(uint16_t idx) const final {
     return this->iter_idx(idx).get_key();
   }
index 3b90516e614c68c4b44347c4b2362ad942afd2b3..32ff392b62c1bad461dfddd57a5a6207b72ec69b 100644 (file)
@@ -985,6 +985,19 @@ public:
   uint64_t get_omap_tree_depth() {
     return stats.omap_tree_depth;
   }
+
+  /// Update lru for access to ref
+  void touch_extent(
+      CachedExtent &ext,
+      const Transaction::src_t* p_src=nullptr)
+  {
+    if (p_src && is_background_transaction(*p_src))
+      return;
+    if (ext.is_clean() && !ext.is_placeholder()) {
+      lru.move_to_top(ext);
+    }
+  }
+
 private:
   ExtentPlacementManager& epm;
   RootBlockRef root;               ///< ref to current root
@@ -1268,18 +1281,6 @@ private:
     return bp;
   }
 
-  /// Update lru for access to ref
-  void touch_extent(
-      CachedExtent &ext,
-      const Transaction::src_t* p_src=nullptr)
-  {
-    if (p_src && is_background_transaction(*p_src))
-      return;
-    if (ext.is_clean() && !ext.is_placeholder()) {
-      lru.move_to_top(ext);
-    }
-  }
-
   void backref_batch_update(
     std::vector<backref_entry_ref> &&,
     const journal_seq_t &);
index 70ebc394bd5849965c57cdb2aa5f39e9cc95a296..170a81377467bf9c64225749918eecf3857c24c1 100644 (file)
@@ -24,6 +24,15 @@ class SegmentedAllocator;
 class TransactionManager;
 class ExtentPlacementManager;
 
+template <
+  typename node_key_t,
+  typename node_val_t,
+  typename internal_node_t,
+  typename leaf_node_t,
+  typename pin_t,
+  size_t node_size>
+class FixedKVBtree;
+
 // #define DEBUG_CACHED_EXTENT_REF
 #ifdef DEBUG_CACHED_EXTENT_REF
 
@@ -45,11 +54,14 @@ namespace onode {
 
 template <typename T>
 class read_set_item_t {
-  boost::intrusive::list_member_hook<> list_hook;
-  using list_hook_options = boost::intrusive::member_hook<
+  using set_hook_t = boost::intrusive::set_member_hook<
+    boost::intrusive::link_mode<
+      boost::intrusive::auto_unlink>>;
+  set_hook_t trans_hook;
+  using set_hook_options = boost::intrusive::member_hook<
     read_set_item_t,
-    boost::intrusive::list_member_hook<>,
-    &read_set_item_t::list_hook>;
+    set_hook_t,
+    &read_set_item_t::trans_hook>;
 
 public:
   struct cmp_t {
@@ -59,9 +71,29 @@ public:
     bool operator()(const read_set_item_t<T> &lhs, const paddr_t &rhs) const;
   };
 
-  using list =  boost::intrusive::list<
+  struct trans_cmp_t {
+    bool operator()(
+      const read_set_item_t<Transaction> &lhs,
+      const read_set_item_t<Transaction> &rhs) const {
+      return lhs.t < rhs.t;
+    }
+    bool operator()(
+      const Transaction *lhs,
+      const read_set_item_t<Transaction> &rhs) const {
+      return lhs < rhs.t;
+    }
+    bool operator()(
+      const read_set_item_t<Transaction> &lhs,
+      const Transaction *rhs) const {
+      return lhs.t < rhs;
+    }
+  };
+
+  using trans_set_t =  boost::intrusive::set<
     read_set_item_t,
-    list_hook_options>;
+    set_hook_options,
+    boost::intrusive::constant_time_size<false>,
+    boost::intrusive::compare<trans_cmp_t>>;
 
   T *t = nullptr;
   CachedExtentRef ref;
@@ -69,7 +101,7 @@ public:
   read_set_item_t(T *t, CachedExtentRef ref);
   read_set_item_t(const read_set_item_t &) = delete;
   read_set_item_t(read_set_item_t &&) = default;
-  ~read_set_item_t();
+  ~read_set_item_t() = default;
 };
 template <typename T>
 using read_set_t = std::set<
@@ -151,6 +183,14 @@ class CachedExtent
   friend class onode::DummyNodeExtent;
   friend class onode::TestReplayExtent;
 
+  template <
+    typename node_key_t,
+    typename node_val_t,
+    typename internal_node_t,
+    typename leaf_node_t,
+    typename pin_t,
+    size_t node_size>
+  friend class FixedKVBtree;
   uint32_t last_committed_crc = 0;
 
   // Points at current version while in state MUTATION_PENDING
@@ -530,6 +570,11 @@ private:
     return extent_index_hook.is_linked();
   }
 
+  /// Returns true if the extent part of the open transaction
+  bool is_pending_in_trans(transaction_id_t id) const {
+    return is_pending() && pending_for_transaction == id;
+  }
+
   /// hook for intrusive ref list (mainly dirty or lru list)
   boost::intrusive::list_member_hook<> primary_ref_list_hook;
   using primary_ref_list_member_options = boost::intrusive::member_hook<
@@ -580,7 +625,9 @@ private:
     }
   }
 
-  read_set_item_t<Transaction>::list transactions;
+  CachedExtent* get_transactional_view(Transaction &t);
+
+  read_set_item_t<Transaction>::trans_set_t transactions;
 
   placement_hint_t user_hint = PLACEMENT_HINT_NULL;
 
@@ -611,8 +658,6 @@ protected:
   struct retired_placeholder_t{};
   CachedExtent(retired_placeholder_t) : state(extent_state_t::INVALID) {}
 
-  CachedExtent& get_transactional_view(Transaction &t);
-
   friend class Cache;
   template <typename T, typename... Args>
   static TCachedExtentRef<T> make_cached_extent_ref(
@@ -1000,15 +1045,7 @@ struct ref_laddr_cmp {
 template <typename T>
 read_set_item_t<T>::read_set_item_t(T *t, CachedExtentRef ref)
   : t(t), ref(ref)
-{
-  ref->transactions.push_back(*this);
-}
-
-template <typename T>
-read_set_item_t<T>::~read_set_item_t()
-{
-  ref->transactions.erase(ref->transactions.s_iterator_to(*this));
-}
+{}
 
 template <typename T>
 inline bool read_set_item_t<T>::cmp_t::operator()(
index 14ba61cee07dfb500b57fee7dbb6312e803adc25..ed9d1d1a0c4ae5207c2d855b34ccc867a61c0f28 100644 (file)
@@ -140,8 +140,16 @@ public:
   void add_to_read_set(CachedExtentRef ref) {
     if (is_weak()) return;
 
+    assert(ref->is_valid());
+
+    auto it = ref->transactions.lower_bound(
+      this, read_set_item_t<Transaction>::trans_cmp_t());
+    if (it != ref->transactions.end() && it->t == this) return;
+
     auto [iter, inserted] = read_set.emplace(this, ref);
     ceph_assert(inserted);
+    ref->transactions.insert_before(
+      it, const_cast<read_set_item_t<Transaction>&>(*iter));
   }
 
   void add_fresh_extent(
@@ -231,7 +239,8 @@ public:
       assert(where != read_set.end());
       assert(where->ref.get() == &placeholder);
       where = read_set.erase(where);
-      read_set.emplace_hint(where, this, &extent);
+      auto it = read_set.emplace_hint(where, this, &extent);
+      extent.transactions.insert(const_cast<read_set_item_t<Transaction>&>(*it));
     }
     {
       auto where = retired_set.find(&placeholder);