]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/os/seastore/omap_manager: apply linked_tree_node to omap trees
authorXuehan Xu <xuxuehan@qianxin.com>
Fri, 14 Feb 2025 07:48:01 +0000 (15:48 +0800)
committerXuehan Xu <xuxuehan@qianxin.com>
Wed, 16 Apr 2025 03:00:15 +0000 (11:00 +0800)
Signed-off-by: Xuehan Xu <xuxuehan@qianxin.com>
src/crimson/os/seastore/cached_extent.h
src/crimson/os/seastore/linked_tree_node.h
src/crimson/os/seastore/logical_child_node.h
src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.cc
src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.h
src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h
src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc
src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h
src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h
src/crimson/os/seastore/transaction_manager.h

index d6324f2ff15a36104eff392022fa44e4aec4ab42..d7b47d3cae1260081b7e751b83ee1926224dbd95 100644 (file)
@@ -1388,12 +1388,15 @@ public:
   template <typename... T>
   LogicalCachedExtent(T&&... t) : CachedExtent(std::forward<T>(t)...) {}
 
-  void on_rewrite(Transaction&, CachedExtent &extent, extent_len_t off) final {
+  void on_rewrite(Transaction &t, CachedExtent &extent, extent_len_t off) final {
     assert(get_type() == extent.get_type());
     auto &lextent = (LogicalCachedExtent&)extent;
     set_laddr((lextent.get_laddr() + off).checked_to_laddr());
+    do_on_rewrite(t, lextent);
   }
 
+  virtual void do_on_rewrite(Transaction &t, LogicalCachedExtent &extent) {}
+
   bool has_laddr() const {
     return laddr != L_ADDR_NULL;
   }
index b19c053cdbb38139825218d02523f344414f6169..572b7e1ccfcb51bbae7eaa7eea15e40720b97892 100644 (file)
@@ -149,7 +149,11 @@ private:
   friend class TreeRootLinker;
 };
 
-// The link from the ChildNodes to the ParentNodes
+// The sharable linker from the ChildNodes to the same ParentNode
+//
+// The indirection of child.parent_tracker.parent is necessary
+// because otherwise we'll need to update every child's parent
+// upon commiting a mutated extent.
 template <typename ParentT>
 class parent_tracker_t
   : public boost::intrusive_ref_counter<
@@ -400,6 +404,10 @@ protected:
     : children(capacity, nullptr) {}
   ParentNode(const ParentNode &rhs)
     : children(rhs.children.capacity(), nullptr) {}
+  void sync_children_capacity() {
+    auto &me = down_cast();
+    maybe_expand_children(me.get_size());
+  }
   void add_copy_dest(Transaction &t, TCachedExtentRef<T> dest) {
     ceph_assert(down_cast().is_stable());
     ceph_assert(dest->is_pending());
@@ -1000,6 +1008,9 @@ protected:
   void on_replace_prior() {
     take_parent_from_prior();
   }
+  // destroy() is allowed to be skipped for the pending extents,
+  // because they will be destroyed altogether with their parents
+  // upon transaction invalidation.
   void destroy() {
     assert(!down_cast().is_btree_root());
     assert(this->has_parent_tracker());
index 03f2b73f975c49cfbdd89b6fe86400c0ae579dfb..e1895ada8468c3f7d5eea00a054de5710af00503 100644 (file)
@@ -42,7 +42,9 @@ public:
 protected:
   void on_replace_prior() final {
     child_node_t::on_replace_prior();
+    do_on_replace_prior();
   }
+  virtual void do_on_replace_prior() {}
 };
 using LogicalChildNodeRef = TCachedExtentRef<LogicalChildNode>;
 } // namespace crimson::os::seastore
index ae197097c6506fe62ea9052c0d2a6ef1190f2522..451e17bd8fdeef85b1795db28d536ff541281120 100644 (file)
@@ -26,6 +26,7 @@ BtreeOMapManager::initialize_omap(Transaction &t, laddr_t hint,
   return tm.alloc_non_data_extent<OMapLeafNode>(t, hint, get_leaf_size(type))
     .si_then([hint, &t, type](auto&& root_extent) {
       root_extent->set_size(0);
+      root_extent->init_range(BEGIN_KEY, END_KEY);
       omap_node_meta_t meta{1};
       root_extent->set_meta(meta);
       omap_root_t omap_root;
@@ -44,7 +45,19 @@ BtreeOMapManager::get_omap_root(omap_context_t oc, const omap_root_t &omap_root)
 {
   assert(omap_root.get_location() != L_ADDR_NULL);
   laddr_t laddr = omap_root.get_location();
-  return omap_load_extent(oc, laddr, omap_root.get_depth());
+  if (omap_root.get_depth() > 1) {
+    return omap_load_extent<OMapInnerNode>(
+      oc, laddr, omap_root.get_depth(), BEGIN_KEY, END_KEY
+    ).si_then([](auto extent) {
+      return extent->template cast<OMapNode>();
+    });
+  } else {
+    return omap_load_extent<OMapLeafNode>(
+      oc, laddr, omap_root.get_depth(), BEGIN_KEY, END_KEY
+    ).si_then([](auto extent) {
+      return extent->template cast<OMapNode>();
+    });
+  }
 }
 
 BtreeOMapManager::handle_root_split_ret
@@ -57,18 +70,29 @@ BtreeOMapManager::handle_root_split(
   DEBUGT("{}", oc.t, omap_root);
   return oc.tm.alloc_non_data_extent<OMapInnerNode>(oc.t, omap_root.hint,
                                            OMAP_INNER_BLOCK_SIZE)
-    .si_then([&omap_root, mresult, oc](auto&& nroot) -> handle_root_split_ret {
+    .si_then([&omap_root, mresult, oc, FNAME]
+             (auto&& nroot) -> handle_root_split_ret {
     auto [left, right, pivot] = *(mresult.split_tuple);
     omap_node_meta_t meta{omap_root.depth + 1};
+    nroot->init_range(BEGIN_KEY, END_KEY);
     nroot->set_meta(meta);
+    left->init_range(BEGIN_KEY, pivot);
+    nroot->insert_child_ptr(
+      nroot->iter_begin().get_offset(),
+      dynamic_cast<BaseChildNode<OMapInnerNode, std::string>*>(left.get()));
     nroot->journal_inner_insert(nroot->iter_begin(), left->get_laddr(),
-                                "", nroot->maybe_get_delta_buffer());
+                                BEGIN_KEY, nroot->maybe_get_delta_buffer());
+    nroot->insert_child_ptr(
+      (nroot->iter_begin() + 1).get_offset(),
+      dynamic_cast<BaseChildNode<OMapInnerNode, std::string>*>(right.get()));
+    right->init_range(pivot, END_KEY);
     nroot->journal_inner_insert(nroot->iter_begin() + 1, right->get_laddr(),
                                 pivot, nroot->maybe_get_delta_buffer());
     omap_root.update(nroot->get_laddr(), omap_root.get_depth() + 1, omap_root.hint,
       omap_root.get_type());
     oc.t.get_omap_tree_stats().depth = omap_root.depth;
     ++(oc.t.get_omap_tree_stats().extents_num_delta);
+    DEBUGT("l {}, r {}", oc.t, *left, *right);
     return seastar::now();
   }).handle_error_interruptible(
     crimson::ct_error::enospc::assert_failure{"unexpected enospc"},
@@ -84,8 +108,8 @@ BtreeOMapManager::handle_root_merge(
 {
   LOG_PREFIX(BtreeOMapManager::handle_root_merge);
   DEBUGT("{}", oc.t, omap_root);
-  auto root = *(mresult.need_merge);
-  auto iter = root->cast<OMapInnerNode>()->iter_begin();
+  auto old_root = *(mresult.need_merge);
+  auto iter = old_root->cast<OMapInnerNode>()->iter_begin();
   omap_root.update(
     iter->get_val(),
     omap_root.depth -= 1,
@@ -93,7 +117,7 @@ BtreeOMapManager::handle_root_merge(
     omap_root.get_type());
   oc.t.get_omap_tree_stats().depth = omap_root.depth;
   oc.t.get_omap_tree_stats().extents_num_delta--;
-  return oc.tm.remove(oc.t, root->get_laddr()
+  return oc.tm.remove(oc.t, old_root->get_laddr()
   ).si_then([](auto &&ret) -> handle_root_merge_ret {
     return seastar::now();
   }).handle_error_interruptible(
index 90dbccf63eedf5fbc6f85bf31d607b91aa852fe4..6de3ade3f7b765538e71a4ca93eaa083301d091b 100644 (file)
@@ -106,15 +106,6 @@ public:
   omap_clear_ret omap_clear(
     omap_root_t &omap_root,
     Transaction &t) final;
-
-  static extent_len_t get_leaf_size(omap_type_t type) {
-    if (type == omap_type_t::LOG) {
-      return LOG_LEAF_BLOCK_SIZE;
-    }
-    ceph_assert(type == omap_type_t::OMAP ||
-               type == omap_type_t::XATTR);
-    return OMAP_LEAF_BLOCK_SIZE;
-  }
 };
 using BtreeOMapManagerRef = std::unique_ptr<BtreeOMapManager>;
 
index 3de6017096c548bd71def0fa7decff0dd424a71f..cbdacd28601e9393ad146aac831aed74d86b22b6 100644 (file)
@@ -15,6 +15,9 @@
 
 namespace crimson::os::seastore::omap_manager{
 
+const std::string BEGIN_KEY = "";
+const std::string END_KEY(64, (char)(-1));
+
 struct omap_context_t {
   TransactionManager &tm;
   Transaction &t;
@@ -52,7 +55,10 @@ struct OMapNode : LogicalChildNode {
   explicit OMapNode(ceph::bufferptr &&ptr) : LogicalChildNode(std::move(ptr)) {}
   explicit OMapNode(extent_len_t length) : LogicalChildNode(length) {}
   OMapNode(const OMapNode &other)
-  : LogicalChildNode(other) {}
+  : LogicalChildNode(other),
+    root(other.root),
+    begin(other.begin),
+    end(other.end) {}
 
   using get_value_iertr = base_iertr;
   using get_value_ret = OMapManager::omap_get_value_ret;
@@ -109,14 +115,34 @@ struct OMapNode : LogicalChildNode {
   virtual uint32_t get_node_size() = 0;
 
   virtual ~OMapNode() = default;
+
+  void init_range(std::string begin, std::string end) {
+    if (!this->end.empty()) {
+      // this is a duplicated init.
+      assert(end == this->end);
+      return;
+    }
+    if (begin == BEGIN_KEY && end == END_KEY) {
+      root = true;
+    }
+    this->begin = std::move(begin);
+    this->end = std::move(end);
+  }
+  const std::string &get_begin() const {
+    return begin;
+  }
+  const std::string &get_end() const {
+    return end;
+  }
+  bool is_btree_root() const { return root; }
+private:
+  bool root = false;
+  std::string begin;
+  std::string end;
 };
 
 using OMapNodeRef = OMapNode::OMapNodeRef;
 
-using omap_load_extent_iertr = OMapNode::base_iertr;
-omap_load_extent_iertr::future<OMapNodeRef>
-omap_load_extent(omap_context_t oc, laddr_t laddr, depth_t depth);
-
 }
 
 #if FMT_VERSION >= 90000
index 042b22d18930075c897246ea3439aed2eee44c09..746a6cdcdf572e7f89e27b6144317d5353df6380 100644 (file)
@@ -29,8 +29,14 @@ std::ostream &operator<<(std::ostream &out, const omap_leaf_key_t &rhs)
 
 std::ostream &OMapInnerNode::print_detail_l(std::ostream &out) const
 {
-  return out << ", size=" << get_size()
-            << ", depth=" << get_meta().depth;
+  out << ", size=" << get_size()
+      << ", depth=" << get_meta().depth
+      << ", is_root=" << is_btree_root();
+  if (get_size() > 0) {
+    out << ", begin=" << get_begin()
+       << ", end=" << get_end();
+  }
+  return out;
 }
 
 using dec_ref_iertr = OMapInnerNode::base_iertr;
@@ -54,23 +60,35 @@ dec_ref_ret dec_ref(omap_context_t oc, T&& addr) {
 OMapInnerNode::make_split_insert_ret
 OMapInnerNode::make_split_insert(
   omap_context_t oc,
-  internal_iterator_t iter,
+  internal_const_iterator_t iter,
   std::string key,
-  laddr_t laddr)
+  OMapNodeRef &node)
 {
   LOG_PREFIX(OMapInnerNode::make_split_insert);
   DEBUGT("this: {}, key: {}", oc.t, *this, key);
-  return make_split_children(oc).si_then([=] (auto tuple) {
+  return make_split_children(oc).si_then(
+    [FNAME, oc, iter=std::move(iter),
+    key=std::move(key), node, this] (auto tuple) {
     auto [left, right, pivot] = tuple;
+    DEBUGT("this: {}, key: {}, pivot {}", oc.t, *this, key, pivot);
+    left->init_range(get_begin(), pivot);
+    right->init_range(pivot, get_end());
     if (pivot > key) {
       auto liter = left->iter_idx(iter.get_offset());
-      left->journal_inner_insert(liter, laddr, key,
+      left->insert_child_ptr(
+       liter.get_offset(),
+       dynamic_cast<base_child_t*>(node.get()));
+      left->journal_inner_insert(liter, node->get_laddr(), key,
                                  left->maybe_get_delta_buffer());
     } else {  //right
       auto riter = right->iter_idx(iter.get_offset() - left->get_node_size());
-      right->journal_inner_insert(riter, laddr, key,
+      right->insert_child_ptr(
+       riter.get_offset(),
+       dynamic_cast<base_child_t*>(node.get()));
+      right->journal_inner_insert(riter, node->get_laddr(), key,
                                   right->maybe_get_delta_buffer());
     }
+    this->adjust_copy_src_dest_on_split(oc.t, *left, *right);
     ++(oc.t.get_omap_tree_stats().extents_num_delta);
     return make_split_insert_ret(
            interruptible::ready_future_marker{},
@@ -83,7 +101,7 @@ OMapInnerNode::make_split_insert(
 OMapInnerNode::handle_split_ret
 OMapInnerNode::handle_split(
   omap_context_t oc,
-  internal_iterator_t iter,
+  internal_const_iterator_t iter,
   mutation_result_t mresult)
 {
   LOG_PREFIX(OMapInnerNode::handle_split);
@@ -94,25 +112,34 @@ OMapInnerNode::handle_split(
     return mut->handle_split(oc, mut_iter, mresult);
   }
   auto [left, right, pivot] = *(mresult.split_tuple);
+  DEBUGT("this: {} {} {}",  oc.t, *left, *right, pivot);
   //update operation will not cause node overflow, so we can do it first.
+  this->update_child_ptr(
+    iter.get_offset(),
+    dynamic_cast<base_child_t*>(left.get()));
   journal_inner_update(iter, left->get_laddr(), maybe_get_delta_buffer());
   bool overflow = extent_will_overflow(pivot.size(), std::nullopt);
+  auto right_iter = iter + 1;
   if (!overflow) {
-    journal_inner_insert(iter + 1, right->get_laddr(), pivot,
+    this->insert_child_ptr(
+      right_iter.get_offset(),
+      dynamic_cast<base_child_t*>(right.get()));
+    journal_inner_insert(right_iter, right->get_laddr(), pivot,
                          maybe_get_delta_buffer());
     return insert_ret(
            interruptible::ready_future_marker{},
            mutation_result_t(mutation_status_t::SUCCESS, std::nullopt, std::nullopt));
   } else {
-    return make_split_insert(oc, iter + 1, pivot, right->get_laddr())
-      .si_then([this, oc] (auto m_result) {
-       return dec_ref(oc, get_laddr())
-         .si_then([m_result = std::move(m_result)] {
-          return insert_ret(
-                 interruptible::ready_future_marker{},
-                 m_result);
-       });
-   });
+    return make_split_insert(
+      oc, right_iter, pivot, right
+    ).si_then([this, oc] (auto m_result) {
+      return dec_ref(oc, get_laddr()
+      ).si_then([m_result = std::move(m_result)] {
+       return insert_ret(
+              interruptible::ready_future_marker{},
+              m_result);
+      });
+    });
   }
 }
 
@@ -123,11 +150,9 @@ OMapInnerNode::get_value(
 {
   LOG_PREFIX(OMapInnerNode::get_value);
   DEBUGT("key = {}, this: {}", oc.t, key, *this);
-  auto child_pt = get_containing_child(key);
-  assert(child_pt != iter_cend());
-  auto laddr = child_pt->get_val();
-  return omap_load_extent(oc, laddr, get_meta().depth - 1).si_then(
+  return get_child_node(oc, key).si_then(
     [oc, &key] (auto extent) {
+    ceph_assert(!extent->is_btree_root());
     return extent->get_value(oc, key);
   }).finally([ref = OMapNodeRef(this)] {});
 }
@@ -141,10 +166,9 @@ OMapInnerNode::insert(
   LOG_PREFIX(OMapInnerNode::insert);
   DEBUGT("{}->{}, this: {}",  oc.t, key, value, *this);
   auto child_pt = get_containing_child(key);
-  assert(child_pt != iter_cend());
-  auto laddr = child_pt->get_val();
-  return omap_load_extent(oc, laddr, get_meta().depth - 1).si_then(
+  return get_child_node(oc, child_pt).si_then(
     [oc, &key, &value] (auto extent) {
+    ceph_assert(!extent->is_btree_root());
     return extent->insert(oc, key, value);
   }).si_then([this, oc, child_pt] (auto mresult) {
     if (mresult.status == mutation_status_t::SUCCESS) {
@@ -165,10 +189,9 @@ OMapInnerNode::rm_key(omap_context_t oc, const std::string &key)
   LOG_PREFIX(OMapInnerNode::rm_key);
   DEBUGT("key={}, this: {}", oc.t, key, *this);
   auto child_pt = get_containing_child(key);
-  assert(child_pt != iter_cend());
-  auto laddr = child_pt->get_val();
-  return omap_load_extent(oc, laddr, get_meta().depth - 1).si_then(
+  return get_child_node(oc, child_pt).si_then(
     [this, oc, &key, child_pt] (auto extent) {
+    ceph_assert(!extent->is_btree_root());
     return extent->rm_key(oc, key)
       .si_then([this, oc, child_pt, extent = std::move(extent)] (auto mresult) {
       switch (mresult.status) {
@@ -242,11 +265,9 @@ OMapInnerNode::list(
             seastar::stop_iteration::yes);
         }
        assert(result.size() < config.max_result_size);
-        auto laddr = iter->get_val();
-        return omap_load_extent(
-          oc, laddr,
-          get_meta().depth - 1
+        return get_child_node(oc, iter
         ).si_then([&, config, oc](auto &&extent) {
+         ceph_assert(!extent->is_btree_root());
          return seastar::do_with(
            iter == fiter ? first : std::optional<std::string>(std::nullopt),
            iter == liter - 1 ? last : std::optional<std::string>(std::nullopt),
@@ -306,8 +327,9 @@ OMapInnerNode::clear(omap_context_t oc)
     auto laddr = iter->get_val();
     auto ndepth = get_meta().depth - 1;
     if (ndepth > 1) {
-      return omap_load_extent(oc, laddr, ndepth
+      return get_child_node(oc, iter
       ).si_then([oc](auto &&extent) {
+       ceph_assert(!extent->is_btree_root());
        return extent->clear(oc);
       }).si_then([oc, laddr] {
        return dec_ref(oc, laddr);
@@ -325,7 +347,7 @@ OMapInnerNode::clear(omap_context_t oc)
 }
 
 OMapInnerNode::split_children_ret
-OMapInnerNode:: make_split_children(omap_context_t oc)
+OMapInnerNode::make_split_children(omap_context_t oc)
 {
   LOG_PREFIX(OMapInnerNode::make_split_children);
   DEBUGT("this: {}", oc.t, *this);
@@ -336,6 +358,7 @@ OMapInnerNode:: make_split_children(omap_context_t oc)
       auto left = ext_pair.front();
       auto right = ext_pair.back();
       DEBUGT("this: {}, split into: l {} r {}", oc.t, *this, *left, *right);
+      this->split_child_ptrs(oc.t, *left, *right);
       return split_children_ret(
              interruptible::ready_future_marker{},
              std::make_tuple(left, right, split_into(*left, *right)));
@@ -352,7 +375,9 @@ OMapInnerNode::make_full_merge(omap_context_t oc, OMapNodeRef right)
   DEBUGT("", oc.t);
   return oc.tm.alloc_non_data_extent<OMapInnerNode>(oc.t, oc.hint,
     OMAP_INNER_BLOCK_SIZE)
-    .si_then([this, right] (auto &&replacement) {
+    .si_then([this, right, oc] (auto &&replacement) {
+      replacement->merge_child_ptrs(
+       oc.t, *this, *right->cast<OMapInnerNode>());
       replacement->merge_from(*this, *right->cast<OMapInnerNode>());
       return full_merge_ret(
         interruptible::ready_future_marker{},
@@ -371,10 +396,12 @@ OMapInnerNode::make_balanced(omap_context_t oc, OMapNodeRef _right)
   ceph_assert(_right->get_type() == TYPE);
   return oc.tm.alloc_extents<OMapInnerNode>(oc.t, oc.hint,
     OMAP_INNER_BLOCK_SIZE, 2)
-    .si_then([this, _right] (auto &&replacement_pair){
+    .si_then([this, _right, oc] (auto &&replacement_pair){
       auto replacement_left = replacement_pair.front();
       auto replacement_right = replacement_pair.back();
       auto &right = *_right->cast<OMapInnerNode>();
+      this->balance_child_ptrs(oc.t, *this, right, true,
+                              *replacement_left, *replacement_right);
       return make_balanced_ret(
              interruptible::ready_future_marker{},
              std::make_tuple(replacement_left, replacement_right,
@@ -389,7 +416,7 @@ OMapInnerNode::make_balanced(omap_context_t oc, OMapNodeRef _right)
 OMapInnerNode::merge_entry_ret
 OMapInnerNode::merge_entry(
   omap_context_t oc,
-  internal_iterator_t iter,
+  internal_const_iterator_t iter,
   OMapNodeRef entry)
 {
   LOG_PREFIX(OMapInnerNode::merge_entry);
@@ -401,36 +428,53 @@ OMapInnerNode::merge_entry(
   }
   auto is_left = (iter + 1) == iter_cend();
   auto donor_iter = is_left ? iter - 1 : iter + 1;
-  return omap_load_extent(oc, donor_iter->get_val(), get_meta().depth - 1
+  return get_child_node(oc, donor_iter
   ).si_then([=, this](auto &&donor) mutable {
+    ceph_assert(!donor->is_btree_root());
     LOG_PREFIX(OMapInnerNode::merge_entry);
     auto [l, r] = is_left ?
       std::make_pair(donor, entry) : std::make_pair(entry, donor);
     auto [liter, riter] = is_left ?
       std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter);
     if (l->can_merge(r)) {
-      DEBUGT("make_full_merge l {} r {}", oc.t, *l, *r);
+      DEBUGT("make_full_merge l {} r {} liter {} riter {}",
+       oc.t, *l, *r, liter->get_key(), riter->get_key());
       assert(entry->extent_is_below_min());
       return l->make_full_merge(oc, r
       ).si_then([liter=liter, riter=riter, l=l, r=r, oc, this]
                (auto &&replacement) {
        LOG_PREFIX(OMapInnerNode::merge_entry);
        DEBUGT("to update parent: {}", oc.t, *this);
+       this->update_child_ptr(
+         liter.get_offset(),
+         dynamic_cast<base_child_t*>(replacement.get()));
         journal_inner_update(
          liter,
          replacement->get_laddr(),
          maybe_get_delta_buffer());
+       this->remove_child_ptr(riter.get_offset());
         journal_inner_remove(riter, maybe_get_delta_buffer());
         //retire extent
         std::vector<laddr_t> dec_laddrs {l->get_laddr(), r->get_laddr()};
+       auto next = liter + 1;
+       auto end = next == iter_cend() ? get_end() : next.get_key();
+       assert(end == r->get_end());
+       replacement->init_range(liter.get_key(), std::move(end));
+       if (get_meta().depth > 2) { // replacement is an inner node
+         auto &rep = *replacement->template cast<OMapInnerNode>();
+         rep.adjust_copy_src_dest_on_merge(
+           oc.t,
+           *l->template cast<OMapInnerNode>(),
+           *r->template cast<OMapInnerNode>());
+       }
         return dec_ref(oc, dec_laddrs
-       ).si_then([this, oc] {
+       ).si_then([this, oc, r=std::move(replacement)] {
          --(oc.t.get_omap_tree_stats().extents_num_delta);
           if (extent_is_below_min()) {
             return merge_entry_ret(
                    interruptible::ready_future_marker{},
-                   mutation_result_t(mutation_status_t::NEED_MERGE,
-                    std::nullopt, this));
+                  mutation_result_t(mutation_status_t::NEED_MERGE,
+                                    std::nullopt, this));
           } else {
             return merge_entry_ret(
                    interruptible::ready_future_marker{},
@@ -439,14 +483,30 @@ OMapInnerNode::merge_entry(
           }
         });
       });
-    } else {
-      DEBUGT("balanced l {} r {}", oc.t, *l, *r);
+    } else { // !l->can_merge(r)
+      DEBUGT("balanced l {} r {} liter {} riter {}",
+       oc.t, *l, *r, liter->get_key(), riter->get_key());
       return l->make_balanced(oc, r
       ).si_then([liter=liter, riter=riter, l=l, r=r, oc, this](auto tuple) {
        LOG_PREFIX(OMapInnerNode::merge_entry);
-       DEBUGT("to update parent: {}", oc.t, *this);
         auto [replacement_l, replacement_r, replacement_pivot] = tuple;
+       DEBUGT("to update parent: {} {} {}",
+         oc.t, *this, *replacement_l, *replacement_r);
+       replacement_l->init_range(l->get_begin(), replacement_pivot);
+       replacement_r->init_range(replacement_pivot, r->get_end());
+       if (get_meta().depth > 2) { // l and r are inner nodes
+         auto &left = *l->template cast<OMapInnerNode>();
+         auto &right = *r->template cast<OMapInnerNode>();
+         auto &rep_left = *replacement_l->template cast<OMapInnerNode>();
+         auto &rep_right = *replacement_r->template cast<OMapInnerNode>();
+         this->adjust_copy_src_dest_on_balance(
+           oc.t, left, right, true, rep_left, rep_right);
+       }
+
         //update operation will not cuase node overflow, so we can do it first
+       this->update_child_ptr(
+         liter.get_offset(),
+         dynamic_cast<base_child_t*>(replacement_l.get()));
         journal_inner_update(
          liter,
          replacement_l->get_laddr(),
@@ -454,6 +514,9 @@ OMapInnerNode::merge_entry(
         bool overflow = extent_will_overflow(replacement_pivot.size(),
          std::nullopt);
         if (!overflow) {
+         this->update_child_ptr(
+           riter.get_offset(),
+           dynamic_cast<base_child_t*>(replacement_r.get()));
           journal_inner_remove(riter, maybe_get_delta_buffer());
           journal_inner_insert(
            riter,
@@ -469,12 +532,14 @@ OMapInnerNode::merge_entry(
                     std::nullopt, std::nullopt));
           });
         } else {
-          DEBUGT("balanced and split {} r {}", oc.t, *l, *r);
+          DEBUGT("balanced and split {} r {} riter {}",
+           oc.t, *l, *r, riter.get_key());
           //use remove and insert to instead of replace,
           //remove operation will not cause node split, so we can do it first
+         this->remove_child_ptr(riter.get_offset());
           journal_inner_remove(riter, maybe_get_delta_buffer());
-          return make_split_insert(oc, riter, replacement_pivot,
-           replacement_r->get_laddr()
+          return make_split_insert(
+           oc, riter, replacement_pivot, replacement_r
          ).si_then([this, oc, l = l, r = r](auto mresult) {
            std::vector<laddr_t> dec_laddrs{
              l->get_laddr(),
@@ -493,21 +558,28 @@ OMapInnerNode::merge_entry(
 
 }
 
-OMapInnerNode::internal_iterator_t
+OMapInnerNode::internal_const_iterator_t
 OMapInnerNode::get_containing_child(const std::string &key)
 {
-  auto iter = string_lower_bound(key);
-  if (iter == iter_end() || (iter != iter_begin() && iter.get_key() > key)) {
-    iter--;
-  }
+  auto iter = string_upper_bound(key);
+  iter--;
   assert(iter.contains(key));
   return iter;
 }
 
 std::ostream &OMapLeafNode::print_detail_l(std::ostream &out) const
 {
-  return out << ", size=" << get_size()
-         << ", depth=" << get_meta().depth;
+  out << ", size=" << get_size()
+         << ", depth=" << get_meta().depth
+        << ", is_root=" << is_btree_root();
+  if (get_size() > 0) {
+    out << ", begin=" << get_begin()
+       << ", end=" << get_end();
+  }
+  if (this->child_node_t::is_parent_valid())
+    return out << ", parent=" << (void*)this->child_node_t::get_parent_node().get();
+  else
+    return out;
 }
 
 OMapLeafNode::get_value_ret
@@ -559,6 +631,8 @@ OMapLeafNode::insert(
   } else {
     return make_split_children(oc).si_then([this, oc, &key, &value] (auto tuple) {
       auto [left, right, pivot] = tuple;
+      left->init_range(get_begin(), pivot);
+      right->init_range(pivot, get_end());
       auto replace_pt = find_string_key(key);
       if (replace_pt != iter_end()) {
         ++(oc.t.get_omap_tree_stats().num_updates);
@@ -732,36 +806,71 @@ OMapLeafNode::make_balanced(omap_context_t oc, OMapNodeRef _right)
   );
 }
 
-
-omap_load_extent_iertr::future<OMapNodeRef>
-omap_load_extent(omap_context_t oc, laddr_t laddr, depth_t depth)
+OMapInnerNode::get_child_node_ret
+OMapInnerNode::get_child_node(
+  omap_context_t oc,
+  internal_const_iterator_t child_pt)
 {
-  ceph_assert(depth > 0);
-  if (depth > 1) {
-    return oc.tm.read_extent<OMapInnerNode>(
-        oc.t, laddr, OMAP_INNER_BLOCK_SIZE
-    ).handle_error_interruptible(
-      omap_load_extent_iertr::pass_further{},
-      crimson::ct_error::assert_all{ "Invalid error in omap_load_extent" }
-    ).si_then([](auto maybe_indirect_extent) {
-      assert(!maybe_indirect_extent.is_indirect());
-      assert(!maybe_indirect_extent.is_clone);
-      return seastar::make_ready_future<OMapNodeRef>(
-          std::move(maybe_indirect_extent.extent));
+  assert(get_meta().depth > 1);
+  child_pos_t<OMapInnerNode> child_pos(nullptr, 0);
+  auto laddr = child_pt->get_val();
+  auto next = child_pt + 1;
+  if (get_meta().depth == 2) {
+    auto ret = this->get_child<OMapLeafNode>(
+      oc.t, oc.tm.get_etvr(), child_pt.get_offset(), child_pt.get_key());
+    if (ret.has_child()) {
+      return ret.get_child_fut(
+      ).si_then([](auto extent) {
+       return extent->template cast<OMapNode>();
+      });
+    } else {
+      child_pos = ret.get_child_pos();
+    }
+    return omap_load_extent<OMapLeafNode>(
+      oc,
+      laddr,
+      get_meta().depth - 1,
+      child_pt->get_key(),
+      next == iter_cend()
+       ? this->get_end()
+       : next->get_key(),
+      std::move(child_pos)
+    ).si_then([](auto extent) {
+      return extent->template cast<OMapNode>();
     });
   } else {
-    return oc.tm.read_extent<OMapLeafNode>(
-        oc.t, laddr,
-       BtreeOMapManager::get_leaf_size(oc.type)
-    ).handle_error_interruptible(
-      omap_load_extent_iertr::pass_further{},
-      crimson::ct_error::assert_all{ "Invalid error in omap_load_extent" }
-    ).si_then([](auto maybe_indirect_extent) {
-      assert(!maybe_indirect_extent.is_indirect());
-      assert(!maybe_indirect_extent.is_clone);
-      return seastar::make_ready_future<OMapNodeRef>(
-          std::move(maybe_indirect_extent.extent));
+    auto ret = this->get_child<OMapInnerNode>(
+      oc.t, oc.tm.get_etvr(), child_pt.get_offset(), child_pt.get_key());
+    if (ret.has_child()) {
+      return ret.get_child_fut(
+      ).si_then([](auto extent) {
+       return extent->template cast<OMapNode>();
+      });
+    } else {
+      child_pos = ret.get_child_pos();
+    }
+    return omap_load_extent<OMapInnerNode>(
+      oc,
+      laddr,
+      get_meta().depth - 1,
+      child_pt->get_key(),
+      next == iter_cend()
+       ? this->get_end()
+       : next->get_key(),
+      std::move(child_pos)
+    ).si_then([](auto extent) {
+      return extent->template cast<OMapNode>();
     });
   }
 }
+
+extent_len_t get_leaf_size(omap_type_t type) {
+  if (type == omap_type_t::LOG) {
+    return LOG_LEAF_BLOCK_SIZE;
+  }
+  ceph_assert(type == omap_type_t::OMAP ||
+             type == omap_type_t::XATTR);
+  return OMAP_LEAF_BLOCK_SIZE;
+}
+
 }
index d1dd00f182f474065fb2ff7216b65308f11dc2e5..32fb3f871464a95602b7355f426c7fa70ebd2449 100644 (file)
@@ -16,6 +16,8 @@
 
 namespace crimson::os::seastore::omap_manager {
 
+extent_len_t get_leaf_size(omap_type_t type);
+
 /**
  * OMapInnerNode
  *
@@ -28,20 +30,86 @@ namespace crimson::os::seastore::omap_manager {
 
 struct OMapInnerNode
   : OMapNode,
-    StringKVInnerNodeLayout {
+    StringKVInnerNodeLayout,
+    ParentNode<OMapInnerNode, std::string>,
+    ChildNode<OMapInnerNode, OMapInnerNode, std::string> {
   using OMapInnerNodeRef = TCachedExtentRef<OMapInnerNode>;
-  using internal_iterator_t = const_iterator;
+  using internal_const_iterator_t = const_iterator;
+  using internal_iterator_t = iterator;
+  using parent_node_t = ParentNode<OMapInnerNode, std::string>;
+  using base_child_t = BaseChildNode<OMapInnerNode, std::string>;
+  using child_node_t = ChildNode<OMapInnerNode, OMapInnerNode, std::string>;
+  static constexpr uint32_t CHILD_VEC_UNIT = 128;
 
   explicit OMapInnerNode(ceph::bufferptr &&ptr)
-    : OMapNode(std::move(ptr)) {
-    this->set_layout_buf(this->get_bptr().c_str());
+    : OMapNode(std::move(ptr)),
+      StringKVInnerNodeLayout(get_bptr().c_str()),
+      parent_node_t(0) {
+    this->parent_node_t::sync_children_capacity();
   }
   // Must be identical with OMapInnerNode(ptr) after on_fully_loaded()
   explicit OMapInnerNode(extent_len_t length)
-    : OMapNode(length) {}
+    : OMapNode(length),
+      StringKVInnerNodeLayout(nullptr),
+      parent_node_t(0) {}
   OMapInnerNode(const OMapInnerNode &rhs)
-    : OMapNode(rhs) {
-    this->set_layout_buf(this->get_bptr().c_str());
+    : OMapNode(rhs),
+      StringKVInnerNodeLayout(get_bptr().c_str()),
+      parent_node_t(0) {
+    this->parent_node_t::sync_children_capacity();
+  }
+
+  iterator begin() {
+    return iter_begin();
+  }
+
+  iterator end() {
+    return iter_end();
+  }
+
+  void do_on_rewrite(Transaction &t, LogicalCachedExtent &extent) final {
+    auto &ext = static_cast<OMapInnerNode&>(extent);
+    this->parent_node_t::on_rewrite(t, ext);
+    auto &other = static_cast<OMapInnerNode&>(extent);
+    this->init_range(other.get_begin(), other.get_end());
+    this->sync_children_capacity();
+  }
+
+  void prepare_commit() final {
+    this->parent_node_t::prepare_commit();
+    if (is_rewrite() && !is_btree_root()) {
+      auto &prior = *get_prior_instance()->template cast<OMapInnerNode>();
+      if (prior.base_child_t::has_parent_tracker()) {
+       // unlike fixed-kv nodes, rewriting child nodes of the omap tree
+       // won't affect parent nodes, so we have to manually take prior
+       // instances' parent trackers here.
+       this->child_node_t::take_parent_from_prior();
+      }
+    }
+  }
+
+  void do_on_replace_prior() final {
+    this->parent_node_t::on_replace_prior();
+    if (!this->is_btree_root()) {
+      auto &prior = *get_prior_instance()->template cast<OMapInnerNode>();
+      assert(prior.base_child_t::has_parent_tracker());
+      this->child_node_t::on_replace_prior();
+    }
+  }
+
+  void on_invalidated(Transaction &t) final {
+    this->child_node_t::on_invalidated();
+  }
+
+  void on_initial_write() final {
+    if (this->is_btree_root()) {
+      //TODO: should involve RootChildNode
+      this->child_node_t::reset_parent_tracker();
+    }
+  }
+
+  btreenode_pos_t get_node_split_pivot() const {
+    return this->get_split_pivot().get_offset();
   }
 
   omap_node_meta_t get_node_meta() const final { return get_meta(); }
@@ -58,6 +126,10 @@ struct OMapInnerNode
     this->set_layout_buf(this->get_bptr().c_str());
   }
 
+  void on_clean_read() final {
+    this->sync_children_capacity();
+  }
+
   CachedExtentRef duplicate_for_write(Transaction&) final {
     assert(delta_buffer.empty());
     return CachedExtentRef(new OMapInnerNode(*this));
@@ -101,19 +173,19 @@ struct OMapInnerNode
   using make_split_insert_iertr = base_iertr; 
   using make_split_insert_ret = make_split_insert_iertr::future<mutation_result_t>;
   make_split_insert_ret make_split_insert(
-    omap_context_t oc, internal_iterator_t iter,
-    std::string key, laddr_t laddr);
+    omap_context_t oc, internal_const_iterator_t iter,
+    std::string key, OMapNodeRef &node);
 
   using merge_entry_iertr = base_iertr;
   using merge_entry_ret = merge_entry_iertr::future<mutation_result_t>;
   merge_entry_ret merge_entry(
     omap_context_t oc,
-    internal_iterator_t iter, OMapNodeRef entry);
+    internal_const_iterator_t iter, OMapNodeRef entry);
 
   using handle_split_iertr = base_iertr;
   using handle_split_ret = handle_split_iertr::future<mutation_result_t>;
   handle_split_ret handle_split(
-    omap_context_t oc, internal_iterator_t iter,
+    omap_context_t oc, internal_const_iterator_t iter,
     mutation_result_t mresult);
 
   std::ostream &print_detail_l(std::ostream &out) const final;
@@ -138,9 +210,43 @@ struct OMapInnerNode
     auto bptr = bl.cbegin();
     decode(buffer, bptr);
     buffer.replay(*this);
+    this->parent_node_t::sync_children_capacity();
+  }
+
+  internal_const_iterator_t get_containing_child(const std::string &key);
+
+  internal_iterator_t lower_bound(const std::string &key) {
+    return string_lower_bound(key);
   }
 
-  internal_iterator_t get_containing_child(const std::string &key);
+  internal_iterator_t upper_bound(const std::string &key) {
+    return string_upper_bound(key);
+  }
+
+  bool is_in_range(const std::string &key) const {
+    return get_begin() <= key && get_end() > key;
+  }
+
+  ~OMapInnerNode() {
+    if (this->is_valid()
+       && !this->is_pending()
+       && !this->is_btree_root()
+       && this->base_child_t::has_parent_tracker()) {
+      this->child_node_t::destroy();
+    }
+  }
+private:
+  using get_child_node_iertr = OMapNode::base_iertr;
+  using get_child_node_ret = get_child_node_iertr::future<OMapNodeRef>;
+  get_child_node_ret get_child_node(
+    omap_context_t oc,
+    internal_const_iterator_t child_pt);
+
+  get_child_node_ret get_child_node(omap_context_t oc, const std::string &key) {
+    auto child_pt = get_containing_child(key);
+    assert(child_pt != iter_cend());
+    return get_child_node(oc, child_pt);
+  }
 };
 using OMapInnerNodeRef = OMapInnerNode::OMapInnerNodeRef;
 
@@ -156,10 +262,51 @@ using OMapInnerNodeRef = OMapInnerNode::OMapInnerNodeRef;
 
 struct OMapLeafNode
   : OMapNode,
-    StringKVLeafNodeLayout {
-
+    StringKVLeafNodeLayout,
+    ChildNode<OMapInnerNode, OMapLeafNode, std::string> {
   using OMapLeafNodeRef = TCachedExtentRef<OMapLeafNode>;
-  using internal_iterator_t = const_iterator;
+  using internal_const_iterator_t = const_iterator;
+  using base_child_t = BaseChildNode<OMapInnerNode, std::string>;
+  using child_node_t = ChildNode<OMapInnerNode, OMapLeafNode, std::string>;
+
+  void do_on_rewrite(Transaction &t, LogicalCachedExtent &extent) final {
+    auto &other = static_cast<OMapInnerNode&>(extent);
+    this->init_range(other.get_begin(), other.get_end());
+  }
+
+  void on_invalidated(Transaction &t) final {
+    this->child_node_t::on_invalidated();
+  }
+
+  void prepare_commit() final {
+    if (is_rewrite() && !is_btree_root()) {
+      auto &prior = *get_prior_instance()->template cast<OMapLeafNode>();
+      if (prior.base_child_t::has_parent_tracker()) {
+       // unlike fixed-kv nodes, rewriting child nodes of the omap tree
+       // won't affect parent nodes, so we have to manually take prior
+       // instances' parent trackers here.
+       this->child_node_t::take_parent_from_prior();
+      }
+    }
+  }
+
+  void do_on_replace_prior() final {
+    ceph_assert(!this->is_rewrite());
+    if (!this->is_btree_root()) {
+      auto &prior = *get_prior_instance()->template cast<OMapLeafNode>();
+      assert(prior.base_child_t::has_parent_tracker());
+      this->child_node_t::on_replace_prior();
+    }
+  }
+
+  ~OMapLeafNode() {
+    if (this->is_valid()
+       && !this->is_pending()
+       && !this->is_btree_root()
+       && this->base_child_t::has_parent_tracker()) {
+      this->child_node_t::destroy();
+    }
+  }
 
   explicit OMapLeafNode(ceph::bufferptr &&ptr)
     : OMapNode(std::move(ptr)) {
@@ -256,14 +403,58 @@ struct OMapLeafNode
     buffer.replay(*this);
   }
 
+  btreenode_pos_t get_node_split_pivot() const {
+    return this->get_split_pivot().get_offset();
+  }
+
   std::ostream &print_detail_l(std::ostream &out) const final;
 
-  std::pair<internal_iterator_t, internal_iterator_t>
+  std::pair<internal_const_iterator_t, internal_const_iterator_t>
   get_leaf_entries(std::string &key);
-
 };
 using OMapLeafNodeRef = OMapLeafNode::OMapLeafNodeRef;
 
+using omap_load_extent_iertr = OMapNode::base_iertr;
+template <typename T>
+requires std::is_same_v<OMapInnerNode, T> || std::is_same_v<OMapLeafNode, T>
+omap_load_extent_iertr::future<TCachedExtentRef<T>>
+omap_load_extent(
+  omap_context_t oc,
+  laddr_t laddr,
+  depth_t depth,
+  std::string begin,
+  std::string end,
+  std::optional<child_pos_t<OMapInnerNode>> chp = std::nullopt)
+{
+  LOG_PREFIX(omap_load_extent);
+  assert(end <= END_KEY);
+  auto size = std::is_same_v<OMapInnerNode, T>
+    ? OMAP_INNER_BLOCK_SIZE : get_leaf_size(oc.type);
+  return oc.tm.read_extent<T>(
+    oc.t, laddr, size,
+    [begin=std::move(begin), end=std::move(end), FNAME,
+    oc, chp=std::move(chp)](T &extent) mutable {
+      extent.init_range(std::move(begin), std::move(end));
+      if (extent.T::base_child_t::is_parent_valid()
+         || extent.is_btree_root()) {
+       return;
+      }
+      assert(chp);
+      SUBDEBUGT(seastore_omap, "linking {} to {}",
+       oc.t, extent, *chp->get_parent());
+      chp->link_child(&extent);
+    }
+  ).handle_error_interruptible(
+    omap_load_extent_iertr::pass_further{},
+    crimson::ct_error::assert_all{ "Invalid error in omap_load_extent" }
+  ).si_then([](auto maybe_indirect_extent) {
+    assert(!maybe_indirect_extent.is_indirect());
+    assert(!maybe_indirect_extent.is_clone);
+    return seastar::make_ready_future<TCachedExtentRef<T>>(
+       std::move(maybe_indirect_extent.extent));
+  });
+}
+
 std::ostream &operator<<(std::ostream &out, const omap_inner_key_t &rhs);
 std::ostream &operator<<(std::ostream &out, const omap_leaf_key_t &rhs);
 }
index 69f5601132181c3dd410b165d17fe7c5b128cf29..d0991de5023e214777f8b2a7528dfe613d78588e 100644 (file)
@@ -504,7 +504,7 @@ public:
     inner_remove(iter);
   }
 
-  StringKVInnerNodeLayout() : buf(nullptr) {}
+  StringKVInnerNodeLayout(char *buf) : buf(buf) {}
 
   void set_layout_buf(char *_buf) {
     assert(buf == nullptr);
index bc553e955b2753ea58a8dc9eaf925b0fea7e54d6..e7d449cdd5e6deb0268ac3d8043fc194461c1335 100644 (file)
@@ -931,6 +931,10 @@ public:
     return epm->get_stat();
   }
 
+  ExtentTransViewRetriever& get_etvr() {
+    return *cache;
+  }
+
   ~TransactionManager();
 
 private: