]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
crimson/seastore: add omap tree implementation
author chunmei-liu <chunmei.liu@intel.com>
Thu, 17 Dec 2020 01:20:44 +0000 (17:20 -0800)
committer chunmei-liu <chunmei.liu@intel.com>
Thu, 14 Jan 2021 03:14:26 +0000 (19:14 -0800)
Signed-off-by: chunmei-liu <chunmei.liu@intel.com>
17 files changed:
src/crimson/os/seastore/CMakeLists.txt
src/crimson/os/seastore/cache.cc
src/crimson/os/seastore/omap_manager.cc [new file with mode: 0644]
src/crimson/os/seastore/omap_manager.h [new file with mode: 0644]
src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.cc [new file with mode: 0644]
src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.h [new file with mode: 0644]
src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h [new file with mode: 0644]
src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc [new file with mode: 0644]
src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h [new file with mode: 0644]
src/crimson/os/seastore/omap_manager/btree/omap_types.h [new file with mode: 0644]
src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h [new file with mode: 0644]
src/crimson/os/seastore/seastore_types.cc
src/crimson/os/seastore/seastore_types.h
src/crimson/os/seastore/transaction_manager.cc
src/crimson/os/seastore/transaction_manager.h
src/test/crimson/seastore/CMakeLists.txt
src/test/crimson/seastore/test_omap_manager.cc [new file with mode: 0644]

index 77f8465cf9a665bcf64dd815f8b957f6fbbf87d6..fd8ff393fef50ebbb5ab890889e46bd2706f691e 100644 (file)
@@ -11,6 +11,9 @@ add_library(crimson-seastore STATIC
   lba_manager/btree/btree_lba_manager.cc
   lba_manager/btree/lba_btree_node_impl.cc
   lba_manager/btree/btree_range_pin.cc
+  omap_manager.cc
+  omap_manager/btree/btree_omap_manager.cc
+  omap_manager/btree/omap_btree_node_impl.cc
   onode.cc
   onode_manager/simple-fltree/onode_block.cc
   onode_manager/simple-fltree/onode_delta.cc
index 6a406c1b85a0718c3c4dd4a8bf82fe57bfe816e9..76f5008485efbf15ce87104bc3a7a9552aa12783 100644 (file)
@@ -7,6 +7,7 @@
 // included for get_extent_by_type
 #include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h"
 #include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h"
 #include "crimson/os/seastore/onode_manager/simple-fltree/onode_block.h"
 #include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h"
 #include "test/crimson/seastore/test_block.h"
@@ -136,6 +137,10 @@ CachedExtentRef Cache::alloc_new_extent_by_type(
     return alloc_new_extent<extentmap_manager::ExtMapInnerNode>(t, length);
   case extent_types_t::EXTMAP_LEAF:
     return alloc_new_extent<extentmap_manager::ExtMapLeafNode>(t, length);
+  case extent_types_t::OMAP_INNER:
+    return alloc_new_extent<omap_manager::OMapInnerNode>(t, length);
+  case extent_types_t::OMAP_LEAF:
+    return alloc_new_extent<omap_manager::OMapLeafNode>(t, length);
   case extent_types_t::TEST_BLOCK:
     return alloc_new_extent<TestBlock>(t, length);
   case extent_types_t::TEST_BLOCK_PHYSICAL:
@@ -501,6 +506,16 @@ Cache::get_extent_ertr::future<CachedExtentRef> Cache::get_extent_by_type(
       ).safe_then([](auto extent) {
         return CachedExtentRef(extent.detach(), false /* add_ref */);
       });
+    case extent_types_t::OMAP_INNER:
+      return get_extent<omap_manager::OMapInnerNode>(offset, length
+      ).safe_then([](auto extent) {
+        return CachedExtentRef(extent.detach(), false /* add_ref */);
+      });
+    case extent_types_t::OMAP_LEAF:
+      return get_extent<omap_manager::OMapLeafNode>(offset, length
+      ).safe_then([](auto extent) {
+        return CachedExtentRef(extent.detach(), false /* add_ref */);
+      });
     case extent_types_t::ONODE_BLOCK:
       return get_extent<OnodeBlock>(offset, length
       ).safe_then([](auto extent) {
diff --git a/src/crimson/os/seastore/omap_manager.cc b/src/crimson/os/seastore/omap_manager.cc
new file mode 100644 (file)
index 0000000..f4c3ff0
--- /dev/null
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include <experimental/iterator>
+#include <iostream>
+
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/omap_manager.h"
+#include "crimson/os/seastore/omap_manager/btree/btree_omap_manager.h"
+
+namespace crimson::os::seastore::omap_manager {
+
+/// Build the default OMapManager implementation (the btree-backed
+/// BtreeOMapManager) on top of the given TransactionManager.
+///
+/// @param trans_manager transaction manager handed to the
+///        BtreeOMapManager constructor
+/// @return owning pointer to the newly created manager
+OMapManagerRef create_omap_manager(
+  TransactionManager &trans_manager) {
+  // make_unique avoids a naked new; unique_ptr<BtreeOMapManager> converts
+  // implicitly to OMapManagerRef (unique_ptr<OMapManager>).
+  return std::make_unique<BtreeOMapManager>(trans_manager);
+}
+
+}
+
+namespace std {
+// Stream one key/value pair as "key_value_map (<key>-><value>)".
+// NOTE(review): this overload sits inside namespace std; confirm that is
+// intended, since the standard restricts what may be declared there.
+std::ostream &operator<<(std::ostream &out, const std::pair<std::string, std::string> &rhs)
+{
+  out << "key_value_map (";
+  out << rhs.first;
+  out << "->";
+  out << rhs.second;
+  out << ")";
+  return out;
+}
+}
+
+namespace crimson::os::seastore {
+
+// Print a list of strings as "[a, b, c]": elements are separated by ", "
+// with no trailing separator, and the whole list is wrapped in brackets.
+std::ostream &operator<<(std::ostream &out, const std::list<std::string> &rhs)
+{
+  out << '[';
+  bool need_separator = false;
+  for (const auto &item : rhs) {
+    if (need_separator) {
+      out << ", ";
+    }
+    out << item;
+    need_separator = true;
+  }
+  out << ']';
+  return out;
+}
+
+// Print a vector of key/value pairs as
+// "[key_value_map (k1->v1), key_value_map (k2->v2)]".
+//
+// Each pair is written inline rather than through std::ostream_iterator:
+// ostream_iterator can only find an operator<< for std::pair through ADL in
+// namespace std, and it also emits its delimiter after the last element,
+// which left a dangling ", " before the closing bracket (unlike the
+// std::list printer in this file, which joins without a trailing separator).
+std::ostream &operator<<(std::ostream &out, const std::vector<std::pair<std::string, std::string>> &rhs)
+{
+  out << '[';
+  bool need_separator = false;
+  for (const auto &[key, value] : rhs) {
+    if (need_separator) {
+      out << ", ";
+    }
+    out << "key_value_map (" << key << "->" << value << ")";
+    need_separator = true;
+  }
+  return out << ']';
+}
+
+}
diff --git a/src/crimson/os/seastore/omap_manager.h b/src/crimson/os/seastore/omap_manager.h
new file mode 100644 (file)
index 0000000..6725bc0
--- /dev/null
@@ -0,0 +1,152 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include <seastar/core/future.hh>
+
+#include "crimson/osd/exceptions.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction_manager.h"
+
+#define OMAP_BLOCK_SIZE 4096
+
+namespace crimson::os::seastore {
+
+/// Records whether an omap_root_t has been rewritten since construction.
+enum class omap_root_state_t : uint8_t {
+  INITIAL = 0,   // value assigned by the omap_root_t constructor
+  MUTATED = 1,   // NOTE(review): presumably set once the root laddr/depth has been rewritten — confirm against the writers
+  NONE = 0xFF    // NOTE(review): not referenced in this header; presumably "no state" — confirm
+};
+
+/// Root descriptor of one omap tree: tree depth, mutation state and the
+/// logical address of the root node.
+struct omap_root_t {
+  depth_t depth = 0;
+  omap_root_state_t state;
+  laddr_t omap_root_laddr;
+
+  /// A freshly constructed root always starts in the INITIAL state.
+  omap_root_t(depth_t dep, laddr_t laddr)
+    : depth(dep),
+      state(omap_root_state_t::INITIAL),
+      omap_root_laddr(laddr) {}
+};
+
+/// Result of OMapManager::omap_list_keys: the listed keys plus the next key.
+struct list_keys_result_t {
+  std::vector<std::string> keys;  // keys returned by this listing call
+  std::string next;               // "next key" per the omap_list_keys doc; NOTE(review): value when the listing is exhausted is not visible here
+};
+
+/// Result of OMapManager::omap_list: the listed key->value pairs plus the
+/// next key.
+struct list_kvs_result_t {
+  std::vector<std::pair<std::string, std::string>> kvs;  // (key, value) pairs returned by this listing call
+  std::string next;                                      // "next key" per the omap_list doc; NOTE(review): value when the listing is exhausted is not visible here
+};
+/// Default max_result_size of the listing calls: the largest size_t value,
+/// i.e. effectively "no limit".
+constexpr size_t MAX_SIZE = std::numeric_limits<size_t>::max();
+
+/// Debug printers for omap containers.
+std::ostream &operator<<(std::ostream &out, const std::list<std::string> &rhs);
+// NOTE(review): no definition of this std::map overload is visible in
+// omap_manager.cc; kept so existing includers still compile.
+std::ostream &operator<<(std::ostream &out, const std::map<std::string, std::string> &rhs);
+// omap_manager.cc defines a printer for a vector of key/value pairs, so
+// declare it here to make that definition reachable by includers.
+std::ostream &operator<<(std::ostream &out, const std::vector<std::pair<std::string, std::string>> &rhs);
+
+/* OMapManager
+ *
+ * Abstract interface for an omap store: a string key -> string value
+ * mapping rooted at an omap_root_t, with every operation running inside a
+ * caller-supplied Transaction.
+ */
+class OMapManager {
+ /* All OMapManager APIs take their string parameters by reference;
+  * the caller must keep the referenced strings alive (not freed)
+  * until the future returned by the call has resolved.
+  */
+public:
+  /* allocate omap tree root node
+   *
+   * input: Transaction &t, current transaction
+   * return: the omap_root_t structure describing the new root.
+   */
+  using initialize_omap_ertr = TransactionManager::alloc_extent_ertr;
+  using initialize_omap_ret = initialize_omap_ertr::future<omap_root_t>;
+  virtual initialize_omap_ret initialize_omap(Transaction &t) = 0;
+
+  /* get value(string) by key(string)
+   *
+   * input: omap_root_t &omap_root,  omap btree root information
+   * input: Transaction &t,  current transaction
+   * input: string &key, omap string key
+   * return: string key->string value mapping pair.
+   */
+  using omap_get_value_ertr = TransactionManager::read_extent_ertr;
+  using omap_get_value_ret = omap_get_value_ertr::future<std::pair<std::string, std::string>>;
+  virtual omap_get_value_ret omap_get_value(omap_root_t &omap_root, Transaction &t,
+                                           const std::string &key) = 0;
+
+  /* set key value mapping in omap
+   *
+   * input: omap_root_t &omap_root,  omap btree root information
+   * input: Transaction &t,  current transaction
+   * input: string &key, omap string key
+   * input: string &value, mapped value corresponding key
+   * return: bool; NOTE(review): presumably true when the mapping was
+   *         stored and false otherwise — confirm against the implementation.
+   */
+  using omap_set_key_ertr = TransactionManager::read_extent_ertr;
+  using omap_set_key_ret = omap_set_key_ertr::future<bool>;
+  virtual omap_set_key_ret omap_set_key(omap_root_t &omap_root, Transaction &t,
+                                       const std::string &key, const std::string &value) = 0;
+
+  /* remove key value mapping in omap tree
+   *
+   * input: omap_root_t &omap_root,  omap btree root information
+   * input: Transaction &t,  current transaction
+   * input: string &key, omap string key
+   * return: true if the removal succeeded, false otherwise.
+   */
+  using omap_rm_key_ertr = TransactionManager::read_extent_ertr;
+  using omap_rm_key_ret = omap_rm_key_ertr::future<bool>;
+  virtual omap_rm_key_ret omap_rm_key(omap_root_t &omap_root, Transaction &t,
+                                                   const std::string &key) = 0;
+
+  /* get all keys or partial keys in omap tree
+   *
+   * input: omap_root_t &omap_root,  omap btree root information
+   * input: Transaction &t,  current transaction
+   * input: string &start, the list keys range begin from start,
+   *        if start is "", list from the first omap key
+   * input: max_result_size, the number of keys to list,
+   *        if it is not set, list all keys after string start
+   * return: list_keys_result_t, listed keys and next key
+   */
+  using omap_list_keys_ertr = TransactionManager::read_extent_ertr;
+  using omap_list_keys_ret = omap_list_keys_ertr::future<list_keys_result_t>;
+  virtual omap_list_keys_ret omap_list_keys(omap_root_t &omap_root, Transaction &t,
+                             std::string &start,
+                             size_t max_result_size = MAX_SIZE) = 0;
+
+  /* Get all or partial key-> value mapping in omap tree
+   *
+   * input: omap_root_t &omap_root,  omap btree root information
+   * input: Transaction &t,  current transaction
+   * input: string &start, the list keys range begin from start,
+   *        if start is "" , list from the first omap key
+   * input: max_result_size, the number of keys to list,
+   *        if it is not set, list all keys after string start.
+   * return: list_kvs_result_t, listed key->value mapping and next key.
+   */
+  using omap_list_ertr = TransactionManager::read_extent_ertr;
+  using omap_list_ret = omap_list_ertr::future<list_kvs_result_t>;
+  virtual omap_list_ret omap_list(omap_root_t &omap_root, Transaction &t,
+                        std::string &start,
+                        size_t max_result_size = MAX_SIZE) = 0;
+
+  /* clear all omap tree key->value mapping
+   *
+   * input: omap_root_t &omap_root,  omap btree root information
+   * input: Transaction &t,  current transaction
+   */
+  using omap_clear_ertr = TransactionManager::read_extent_ertr;
+  using omap_clear_ret = omap_clear_ertr::future<>;
+  virtual omap_clear_ret omap_clear(omap_root_t &omap_root, Transaction &t) = 0;
+
+  virtual ~OMapManager() {}
+};
+using OMapManagerRef = std::unique_ptr<OMapManager>;
+
+namespace omap_manager {
+
+/// Factory returning an owning OMapManagerRef built on the given
+/// TransactionManager.
+OMapManagerRef create_omap_manager (
+  TransactionManager &trans_manager);
+}
+
+}
diff --git a/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.cc b/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.cc
new file mode 100644 (file)
index 0000000..877d192
--- /dev/null
@@ -0,0 +1,188 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <string.h>
+
+#include "crimson/common/log.h"
+
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/omap_manager/btree/btree_omap_manager.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h"
+
+namespace {
+  // File-local accessor for the seastar logger used in this translation unit.
+  // NOTE(review): logs under ceph_subsys_filestore — confirm that is the
+  // intended subsystem for seastore code.
+  seastar::logger& logger() {
+    return crimson::get_logger(ceph_subsys_filestore);
+  }
+}
+
+namespace crimson::os::seastore::omap_manager {
+
+/// Bind this manager to the TransactionManager it allocates and releases
+/// extents through; only a reference is kept.
+BtreeOMapManager::BtreeOMapManager(
+  TransactionManager &tm)
+  : tm(tm) {}
+
+/**
+ * initialize_omap
+ *
+ * Allocate an OMAP_BLOCK_SIZE leaf extent to act as the root of a new,
+ * empty omap tree, stamp it with size 0 and depth 1, and resolve to the
+ * matching omap_root_t (depth 1, laddr of the new leaf).
+ */
+BtreeOMapManager::initialize_omap_ret
+BtreeOMapManager::initialize_omap(Transaction &t)
+{
+  logger().debug("{}", __func__);
+  return tm.alloc_extent<OMapLeafNode>(
+    t, L_ADDR_MIN, OMAP_BLOCK_SIZE
+  ).safe_then([](auto &&leaf) {
+    leaf->set_size(0);
+    omap_node_meta_t leaf_meta{1};
+    leaf->set_meta(leaf_meta);
+    omap_root_t new_root(1, leaf->get_laddr());
+    return initialize_omap_ertr::make_ready_future<omap_root_t>(new_root);
+  });
+}
+
+/**
+ * get_omap_root
+ *
+ * Load the root node extent described by omap_root (its laddr and depth).
+ * The root address must not be L_ADDR_NULL.
+ */
+BtreeOMapManager::get_root_ret
+BtreeOMapManager::get_omap_root(omap_root_t &omap_root, Transaction &t)
+{
+  assert(omap_root.omap_root_laddr != L_ADDR_NULL);
+  auto oc = get_omap_context(omap_root, t);
+  return omap_load_extent(oc, omap_root.omap_root_laddr, omap_root.depth);
+}
+
+BtreeOMapManager::handle_root_split_ret
+BtreeOMapManager::handle_root_split(omap_context_t oc, OMapNode::mutation_result_t mresult)
+{
+  return oc.tm.alloc_extent<OMapInnerNode>(oc.t, L_ADDR_MIN, OMAP_BLOCK_SIZE)
+    .safe_then([oc, mresult](auto&& nroot) {
+    auto [left, right, pivot] = *(mresult.split_tuple);
+    omap_node_meta_t meta{oc.omap_root.depth + 1};
+    nroot->set_meta(meta);
+    nroot->journal_inner_insert(nroot->iter_begin(), left->get_laddr(),
+                                "", nroot->maybe_get_delta_buffer());
+    nroot->journal_inner_insert(nroot->iter_begin() + 1, right->get_laddr(),
+                                pivot, nroot->maybe_get_delta_buffer());
+    oc.omap_root.omap_root_laddr = nroot->get_laddr();
+    oc.omap_root.depth += 1;
+    oc.omap_root.state = omap_root_state_t::MUTATED;
+    return handle_root_split_ertr::make_ready_future<bool>(true);
+  });
+}
+
+/**
+ * handle_root_merge
+ *
+ * Remove one layer from the tree: the node held in mresult.need_merge is
+ * the current (inner) root; its first entry becomes the new root. Updates
+ * oc.omap_root (new laddr, depth - 1, state MUTATED), then drops a
+ * reference on the old root extent. Resolves to true.
+ */
+BtreeOMapManager::handle_root_merge_ret
+BtreeOMapManager::handle_root_merge(omap_context_t oc, OMapNode::mutation_result_t mresult)
+{
+  auto old_root = *(mresult.need_merge);
+  auto first_entry = old_root->cast<OMapInnerNode>()->iter_begin();
+
+  auto &root_info = oc.omap_root;
+  root_info.omap_root_laddr = first_entry->get_node_key().laddr;
+  root_info.depth -= 1;
+  root_info.state = omap_root_state_t::MUTATED;
+
+  return oc.tm.dec_ref(oc.t, old_root->get_laddr()).safe_then([](auto &&) {
+    return handle_root_merge_ertr::make_ready_future<bool>(true);
+  });
+}
+
+
+BtreeOMapManager::omap_get_value_ret
+BtreeOMapManager::omap_get_value(omap_root_t &omap_root, Transaction &t,
+                                 const std::string &key)
+{
+  logger().debug("{}: {}", __func__, key);
+  return get_omap_root(omap_root, t).safe_then([this, &omap_root, &t, &key](auto&& extent) {
+    return extent->get_value(get_omap_context(omap_root, t), key);
+  }).safe_then([](auto &&e) {
+    logger().debug("{}: {} -> {}", __func__, e.first, e.second);
+    return omap_get_value_ret(
+        omap_get_value_ertr::ready_future_marker{},
+        std::move(e));
+  });
+
+}
+
+BtreeOMapManager::omap_set_key_ret
+BtreeOMapManager::omap_set_key(omap_root_t &omap_root, Transaction &t,
+                             const std::string &key, const std::string &value)
+{
+  logger().debug("{}: {} -> {}", __func__, key, value);
+  return get_omap_root(omap_root, t).safe_then([this, &omap_root, &t, &key, &value](auto root) {
+    return root->insert(get_omap_context(omap_root, t), key, value);
+  }).safe_then([this, &omap_root, &t](auto mresult) {
+    if (mresult.status == mutation_status_t::SUCCESS)
+      return omap_set_key_ertr::make_ready_future<bool>(true);
+    else if (mresult.status == mutation_status_t::SPLITTED)
+      return handle_root_split(get_omap_context(omap_root,  t), mresult);
+    else
+      return omap_set_key_ertr::make_ready_future<bool>(false);
+
+  });
+
+}
+
+BtreeOMapManager::omap_rm_key_ret
+BtreeOMapManager::omap_rm_key(omap_root_t &omap_root, Transaction &t, const std::string &key)
+{
+  logger().debug("{}: {}", __func__, key);
+  return get_omap_root(omap_root, t).safe_then([this, &omap_root, &t, &key](auto root) {
+    return root->rm_key(get_omap_context(omap_root, t), key);
+  }).safe_then([this, &omap_root, &t](auto mresult) {
+    if (mresult.status == mutation_status_t::SUCCESS)
+      return omap_rm_key_ertr::make_ready_future<bool>(true);
+    else if (mresult.status == mutation_status_t::SPLITTED)
+      return handle_root_split(get_omap_context(omap_root,  t), mresult);
+    else if (mresult.status == mutation_status_t::NEED_MERGE) {
+      auto root = *(mresult.need_merge);
+      if (root->get_node_size() == 1 && omap_root.depth != 1)
+        return handle_root_merge(get_omap_context(omap_root,  t), mresult);
+      else
+        return omap_rm_key_ertr::make_ready_future<bool>(true);
+    }
+    else
+      return omap_rm_key_ertr::make_ready_future<bool>(false);
+  });
+
+}
+
+BtreeOMapManager::omap_list_keys_ret
+BtreeOMapManager::omap_list_keys(omap_root_t &omap_root, Transaction &t,
+                                 std::string &start, size_t max_result_size)
+{
+  logger().debug("{}", __func__);
+  return get_omap_root(omap_root, t).safe_then([this, &omap_root, &t, &start,
+    max_result_size] (auto extent) {
+    return extent->list_keys(get_omap_context(omap_root, t), start, max_result_size)
+      .safe_then([](auto &&result) {
+      return omap_list_keys_ret(
+             omap_list_keys_ertr::ready_future_marker{},
+             std::move(result));
+    });
+  });
+
+}
+
+BtreeOMapManager::omap_list_ret
+BtreeOMapManager::omap_list(omap_root_t &omap_root, Transaction &t,
+                            std::string &start, size_t max_result_size)
+{
+  logger().debug("{}", __func__);
+  return get_omap_root(omap_root, t).safe_then([this, &omap_root, &t, &start, max_result_size]
+    (auto extent) {
+    return extent->list(get_omap_context(omap_root, t), start, max_result_size)
+      .safe_then([](auto &&result) {
+      return omap_list_ret(
+             omap_list_ertr::ready_future_marker{},
+             std::move(result));
+    });
+  });
+}
+
+/**
+ * omap_clear
+ *
+ * Clear the whole tree: load the root, call its clear(), then drop a
+ * reference on the root extent itself. Once that completes, omap_root is
+ * reset to "no tree": state MUTATED, depth 0, laddr L_ADDR_NULL.
+ *
+ * omap_root and t are captured by reference and must outlive the
+ * returned future.
+ */
+BtreeOMapManager::omap_clear_ret
+BtreeOMapManager::omap_clear(omap_root_t &omap_root, Transaction &t)
+{
+  logger().debug("{}", __func__);
+  return get_omap_root(omap_root, t).safe_then([this, &omap_root, &t](auto extent) {
+    return extent->clear(get_omap_context(omap_root, t));
+  }).safe_then([this, &omap_root, &t] {
+    // The root node's contents are cleared; now release the root extent.
+    return tm.dec_ref(t, omap_root.omap_root_laddr).safe_then([&omap_root] (auto ret) {
+      // Only invalidate the caller's root descriptor after dec_ref succeeded.
+      omap_root.state = omap_root_state_t::MUTATED;
+      omap_root.depth = 0;
+      omap_root.omap_root_laddr = L_ADDR_NULL;
+      return omap_clear_ertr::now();
+    });
+  });
+}
+
+}
diff --git a/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.h b/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.h
new file mode 100644 (file)
index 0000000..d1601ba
--- /dev/null
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <seastar/core/future.hh>
+
+#include "include/ceph_assert.h"
+#include "crimson/osd/exceptions.h"
+
+#include "crimson/os/seastore/omap_manager.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_btree_node.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction_manager.h"
+
+namespace crimson::os::seastore::omap_manager {
+/**
+ * BtreeOMapManager
+ *
+ * Uses a btree to track :
+ * string -> string mapping for each onode omap
+ */
+
+class BtreeOMapManager : public OMapManager {
+  // Transaction manager used for all extent operations; held by reference.
+  TransactionManager &tm;
+
+  /* get_omap_context
+   *
+   * bundle the root descriptor, this manager's TransactionManager and the
+   * current transaction into the omap_context_t passed to node operations
+   */
+  omap_context_t get_omap_context(omap_root_t &omap_root, Transaction &t) {
+    return omap_context_t{omap_root, tm, t};
+  }
+
+  /* get_omap_root
+   *
+   * load omap tree root node
+   */
+  using get_root_ertr = TransactionManager::read_extent_ertr;
+  using get_root_ret = get_root_ertr::future<OMapNodeRef>;
+  get_root_ret get_omap_root(omap_root_t &omap_root, Transaction &t);
+
+  /* handle_root_split
+   *
+   * the root has been split and omap_root_t needs to be updated
+   */
+  using handle_root_split_ertr = TransactionManager::read_extent_ertr;
+  using handle_root_split_ret = handle_root_split_ertr::future<bool>;
+  handle_root_split_ret handle_root_split(omap_context_t oc,
+                                          OMapNode:: mutation_result_t mresult);
+
+  /* handle_root_merge
+   *
+   * the root node has only one item and is not a leaf node, so one layer
+   * of the tree needs to be removed
+   */
+  using handle_root_merge_ertr = TransactionManager::read_extent_ertr;
+  using handle_root_merge_ret = handle_root_merge_ertr::future<bool>;
+  handle_root_merge_ret handle_root_merge(omap_context_t oc,
+                                          OMapNode:: mutation_result_t mresult);
+
+public:
+  explicit BtreeOMapManager(TransactionManager &tm);
+
+  // OMapManager interface; see omap_manager.h for the contract of each call.
+  initialize_omap_ret initialize_omap(Transaction &t) final;
+
+  omap_get_value_ret omap_get_value(omap_root_t &omap_root, Transaction &t,
+                                    const std::string &key) final;
+
+  omap_set_key_ret omap_set_key(omap_root_t &omap_root, Transaction &t,
+                                const std::string &key, const std::string &value) final;
+
+  omap_rm_key_ret omap_rm_key(omap_root_t &omap_root, Transaction &t,
+                              const std::string &key) final;
+
+  omap_list_keys_ret omap_list_keys(omap_root_t &omap_root, Transaction &t,
+                                    std::string &start,
+                                    size_t max_result_size = MAX_SIZE) final;
+
+  omap_list_ret omap_list(omap_root_t &omap_root, Transaction &t,
+                          std::string &start,
+                          size_t max_result_size = MAX_SIZE) final;
+
+  omap_clear_ret omap_clear(omap_root_t &omap_root, Transaction &t) final;
+
+};
+using BtreeOMapManagerRef = std::unique_ptr<BtreeOMapManager>;
+
+}
diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h
new file mode 100644 (file)
index 0000000..7a447bb
--- /dev/null
@@ -0,0 +1,100 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include <string>
+#include <vector>
+
+//#include <boost/iterator/counting_iterator.hpp>
+
+#include "crimson/common/log.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/omap_manager.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_types.h"
+
+namespace crimson::os::seastore::omap_manager{
+
+/// Everything a node operation needs, passed by value down the tree.
+/// All three members are references, so copies of the context alias the
+/// same root descriptor, transaction manager and transaction.
+struct omap_context_t {
+  omap_root_t &omap_root;   // root descriptor of the tree being operated on
+  TransactionManager &tm;   // transaction manager used for extent operations
+  Transaction &t;           // transaction the operation runs in
+};
+
+/// Outcome of a node mutation (insert / rm_key), reported to the parent.
+enum class mutation_status_t : uint8_t {
+  SUCCESS = 0,     // mutation applied, nothing further for the parent to do
+  SPLITTED = 1,    // the node was split; mutation_result_t::split_tuple is populated
+  NEED_MERGE = 2,  // the node needs merging; mutation_result_t::need_merge is populated
+  FAIL = 3         // NOTE(review): failure conditions are not visible in this header
+};
+
+/// Abstract base of the omap btree nodes: declares the operations every
+/// node must provide plus the result type used to report mutations upward.
+struct OMapNode : LogicalCachedExtent {
+  using OMapNodeRef = TCachedExtentRef<OMapNode>;
+
+  /// Result of insert()/rm_key(): a status plus the data the parent needs
+  /// in order to react to a split or a pending merge.
+  struct mutation_result_t {
+    mutation_status_t status;
+    /// Only populated if SPLITTED, indicates the newly created left and right nodes
+    /// from splitting the target entry during insertion.
+    std::optional<std::tuple<OMapNodeRef, OMapNodeRef, std::string>> split_tuple;
+    /// Only populated if NEED_MERGE, indicates which entry needs to be merged in the upper layer.
+    std::optional<OMapNodeRef> need_merge;
+
+    mutation_result_t(mutation_status_t s, std::optional<std::tuple<OMapNodeRef,
+                      OMapNodeRef, std::string>> tuple, std::optional<OMapNodeRef> n_merge)
+    : status(s),
+      split_tuple(tuple),
+      need_merge(n_merge) {}
+  };
+
+  OMapNode(ceph::bufferptr &&ptr) : LogicalCachedExtent(std::move(ptr)) {}
+  OMapNode(const OMapNode &other)
+  : LogicalCachedExtent(other) {}
+
+  /// Look up the value stored under key.
+  using get_value_ertr = OMapManager::omap_get_value_ertr;
+  using get_value_ret = OMapManager::omap_get_value_ret;
+  virtual get_value_ret get_value(omap_context_t oc, const std::string &key) = 0;
+
+  /// Insert key -> value; the result reports whether this node was split.
+  using insert_ertr = TransactionManager::alloc_extent_ertr;
+  using insert_ret = insert_ertr::future<mutation_result_t>;
+  virtual insert_ret insert(omap_context_t oc, const std::string &key, const std::string &value) = 0;
+
+  /// Remove key; the result reports whether this node now needs merging.
+  using rm_key_ertr = TransactionManager::alloc_extent_ertr;
+  using rm_key_ret = rm_key_ertr::future<mutation_result_t>;
+  virtual rm_key_ret rm_key(omap_context_t oc, const std::string &key) = 0;
+
+  /// List up to max_result_size keys beginning from start.
+  using list_keys_ertr = OMapManager::omap_list_keys_ertr;
+  using list_keys_ret = OMapManager::omap_list_keys_ret;
+  virtual list_keys_ret list_keys(omap_context_t oc, std::string &start,
+                                  size_t max_result_size) = 0;
+
+  /// List up to max_result_size key -> value mappings beginning from start.
+  using list_ertr = OMapManager::omap_list_ertr;
+  using list_ret = OMapManager::omap_list_ret;
+  virtual list_ret list(omap_context_t oc, std::string &start, size_t max_result_size) = 0;
+
+  /// Clear this node. NOTE(review): whether child extents are released here
+  /// is decided by the implementations, which are outside this header.
+  using clear_ertr = OMapManager::omap_clear_ertr;
+  using clear_ret = clear_ertr::future<>;
+  virtual clear_ret clear(omap_context_t oc) = 0;
+
+  /// Merge this node with right into a single node, which is returned.
+  using full_merge_ertr = TransactionManager::alloc_extent_ertr;
+  using full_merge_ret = full_merge_ertr::future<OMapNodeRef>;
+  virtual full_merge_ret make_full_merge(omap_context_t oc, OMapNodeRef right) = 0;
+
+  /// Rebalance this node with _right; yields a (node, node, string) tuple,
+  /// presumably (left, right, pivot key) as in split_tuple — confirm.
+  using make_balanced_ertr = TransactionManager::alloc_extent_ertr;
+  using make_balanced_ret = make_balanced_ertr::future
+          <std::tuple<OMapNodeRef, OMapNodeRef, std::string>>;
+  virtual make_balanced_ret make_balanced(omap_context_t oc, OMapNodeRef _right) = 0;
+
+  virtual omap_node_meta_t get_node_meta() const = 0;
+  // Capacity checks; vsize is optional, presumably because inner nodes
+  // store no value — confirm against the implementations.
+  virtual bool extent_will_overflow(size_t ksize, std::optional<size_t> vsize) const = 0;
+  virtual bool extent_is_below_min() const = 0;
+  virtual uint32_t get_node_size() = 0;
+
+  virtual ~OMapNode() = default;
+};
+
+using OMapNodeRef = OMapNode::OMapNodeRef;
+
+/// Load the omap node extent at laddr using the transaction in oc.
+/// NOTE(review): depth is presumably used to choose between the inner and
+/// leaf node types — confirm in the definition.
+TransactionManager::read_extent_ertr::future<OMapNodeRef>
+omap_load_extent(omap_context_t oc, laddr_t laddr, depth_t depth);
+
+}
diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc
new file mode 100644 (file)
index 0000000..b57f66a
--- /dev/null
@@ -0,0 +1,615 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <string.h>
+
+#include "include/buffer.h"
+#include "include/byteorder.h"
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_btree_node.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h"
+#include "seastar/core/thread.hh"
+
+namespace {
+  // File-local accessor for the logger shared by all omap btree node code
+  // in this translation unit.
+  seastar::logger& logger() {
+    auto &omap_logger = crimson::get_logger(ceph_subsys_filestore);
+    return omap_logger;
+  }
+}
+
+namespace crimson::os::seastore::omap_manager {
+
+// Prints an inner-node key as "omap_inner_key (<key_off> - <key_len> - <laddr>)".
+std::ostream &operator<<(std::ostream &out, const omap_inner_key_t &rhs)
+{
+  out << "omap_inner_key (";
+  out << rhs.key_off << " - " << rhs.key_len;
+  out << " - " << rhs.laddr << ")";
+  return out;
+}
+
+// Prints a leaf-node key as
+// "omap_leaf_key_t (<key_off> - <key_len> <val_off> - <val_len>)".
+std::ostream &operator<<(std::ostream &out, const omap_leaf_key_t &rhs)
+{
+  out << "omap_leaf_key_t (";
+  out << rhs.key_off << " - " << rhs.key_len;
+  out << " " << rhs.val_off << " - " << rhs.val_len;
+  out << ")";
+  return out;
+}
+
+// Appends this inner node's entry count and tree depth to out.
+std::ostream &OMapInnerNode::print_detail_l(std::ostream &out) const
+{
+  out << ", size=" << get_size();
+  out << ", depth=" << get_meta().depth;
+  return out;
+}
+
+/**
+ * make_split_insert
+ *
+ * insert an  entry at iter, with the address of key.
+ * will result in a split outcome encoded in the returned mutation_result_t
+ */
+OMapInnerNode::make_split_insert_ret
+OMapInnerNode::make_split_insert(omap_context_t oc, internal_iterator_t iter,
+                                 std::string key, laddr_t laddr)
+{
+  return make_split_children(oc).safe_then([=] (auto tuple) {
+    auto [left, right, pivot] = tuple;
+    if (pivot > key) {
+      auto liter = left->iter_idx(iter.get_index());
+      left->journal_inner_insert(liter, laddr, key,
+                                 left->maybe_get_delta_buffer());
+    } else {  //right
+      auto riter = right->iter_idx(iter.get_index() - left->get_node_size());
+      right->journal_inner_insert(riter, laddr, key,
+                                  right->maybe_get_delta_buffer());
+    }
+    return make_split_insert_ret(
+           make_split_insert_ertr::ready_future_marker{},
+           mutation_result_t(mutation_status_t::SPLITTED, tuple, std::nullopt));
+  });
+
+}
+
+
+/**
+ * handle_split
+ *
+ * Records in this node that the child referenced by iter has been split.
+ * mresult.split_tuple supplies (left, right, pivot): the entry at iter is
+ * pointed at left, and a new entry (pivot -> right) is placed at iter + 1.
+ *
+ * If this node is not pending, the call is redirected to its mutable copy.
+ * If the new entry does not fit, this node is itself split through
+ * make_split_insert(), its own laddr is dec_ref'ed, and the resulting
+ * SPLITTED result is returned to the caller; otherwise SUCCESS is returned.
+ */
+OMapInnerNode::handle_split_ret
+OMapInnerNode::handle_split(omap_context_t oc, internal_iterator_t iter,
+                               mutation_result_t mresult)
+{
+  logger().debug("{}: {}","OMapInnerNode",  __func__);
+  if (!is_pending()) {
+    // re-express iter by index against the mutable copy and retry there
+    auto mut = oc.tm.get_mutable_extent(oc.t, this)->cast<OMapInnerNode>();
+    auto mut_iter = mut->iter_idx(iter.get_index());
+    return mut->handle_split(oc, mut_iter, mresult);
+  }
+  auto [left, right, pivot] = *(mresult.split_tuple);
+  // The update only rewrites the laddr of an existing entry, so it cannot
+  // overflow the node; do it before deciding whether the insert fits.
+  journal_inner_update(iter, left->get_laddr(), maybe_get_delta_buffer());
+  if (!extent_will_overflow(pivot.size() + 1, std::nullopt)) {
+    journal_inner_insert(iter + 1, right->get_laddr(), pivot,
+                         maybe_get_delta_buffer());
+    return insert_ret(
+           insert_ertr::ready_future_marker{},
+           mutation_result_t(mutation_status_t::SUCCESS, std::nullopt, std::nullopt));
+  } else {
+    // no room for the pivot: split this node while inserting, then drop the
+    // reference on this node's laddr
+    return make_split_insert(oc, iter + 1, pivot, right->get_laddr())
+      .safe_then([this, oc] (auto m_result) {
+       return oc.tm.dec_ref(oc.t, get_laddr())
+         .safe_then([m_result = std::move(m_result)] (auto ret) {
+          return insert_ret(
+                 insert_ertr::ready_future_marker{},
+                 m_result);
+       });
+   });
+  }
+}
+
+/**
+ * get_value
+ *
+ * Looks key up by loading the child whose range contains it (one level
+ * below this node) and delegating to that child's get_value().  A reference
+ * to this node is held by the trailing finally() continuation.
+ */
+OMapInnerNode::get_value_ret
+OMapInnerNode::get_value(omap_context_t oc, const std::string &key)
+{
+  logger().debug("{}: {} key = {}", "OMapInnerNode", __func__, key);
+  auto child_iter = get_containing_child(key);
+  const laddr_t child_laddr = child_iter->get_node_key().laddr;
+  return omap_load_extent(oc, child_laddr, get_meta().depth - 1).safe_then(
+    [oc, &key] (auto child) {
+    return child->get_value(oc, key);
+  }).finally([ref = OMapNodeRef(this)] {});
+}
+
+OMapInnerNode::insert_ret
+OMapInnerNode::insert(omap_context_t oc, const std::string &key, const std::string &value)
+{
+  logger().debug("{}: {}  {}->{}", "OMapInnerNode",  __func__, key, value);
+  auto child_pt = get_containing_child(key);
+  assert(child_pt != iter_end());
+  auto laddr = child_pt->get_node_key().laddr;
+  return omap_load_extent(oc, laddr, get_meta().depth - 1).safe_then(
+    [this, oc, child_pt, &key, &value] (auto extent) {
+    return extent->insert(oc, key, value);
+  }).safe_then([this, oc, child_pt] (auto mresult) {
+    if (mresult.status == mutation_status_t::SUCCESS) {
+      return insert_ertr::make_ready_future<mutation_result_t>(mresult);
+    } else if (mresult.status == mutation_status_t::SPLITTED) {
+      return handle_split(oc, child_pt, mresult);
+    } else {
+     return insert_ret(
+            insert_ertr::ready_future_marker{},
+            mutation_result_t(mutation_status_t::SUCCESS, std::nullopt, std::nullopt));
+    }
+  });
+}
+
+/**
+ * rm_key
+ *
+ * Forwards the removal of key to the child whose range contains it, then
+ * reacts to the status the child reports:
+ *  - SUCCESS / FAIL: passed through unchanged
+ *  - NEED_MERGE: merge_entry() when this node holds more than one entry,
+ *    otherwise reported as SUCCESS
+ *  - SPLITTED: handle_split()
+ *  - anything else: passed through unchanged
+ */
+OMapInnerNode::rm_key_ret
+OMapInnerNode::rm_key(omap_context_t oc, const std::string &key)
+{
+  logger().debug("{}: {}","OMapInnerNode",  __func__);
+  auto child_pt = get_containing_child(key);
+  auto laddr = child_pt->get_node_key().laddr;
+  return omap_load_extent(oc, laddr, get_meta().depth - 1).safe_then(
+    [this, oc, &key, child_pt] (auto extent) {
+    // extent is moved into the continuation below; presumably to keep the
+    // child alive until its rm_key() has resolved
+    return extent->rm_key(oc, key)
+      .safe_then([this, oc, child_pt, extent = std::move(extent)] (auto mresult) {
+      if (mresult.status == mutation_status_t::SUCCESS ||
+          mresult.status == mutation_status_t::FAIL) {
+        return rm_key_ertr::make_ready_future<mutation_result_t>(mresult);
+      } else if (mresult.status == mutation_status_t::NEED_MERGE) {
+        // with a single entry there is no sibling to merge or balance with
+        if (get_node_size() >1)
+          return merge_entry(oc, child_pt, *(mresult.need_merge));
+        else
+          return rm_key_ret(
+                 rm_key_ertr::ready_future_marker{},
+                 mutation_result_t(mutation_status_t::SUCCESS,
+                                   std::nullopt, std::nullopt));
+      } else if (mresult.status == mutation_status_t::SPLITTED) {
+        return handle_split(oc, child_pt, mresult);
+      } else {
+        return rm_key_ertr::make_ready_future<mutation_result_t>(mresult);
+      }
+    });
+  });
+}
+
+OMapInnerNode::list_keys_ret
+OMapInnerNode::list_keys(omap_context_t oc, std::string &start, size_t max_result_size)
+{
+  logger().debug("{}: {}","OMapInnerNode",  __func__);
+  auto  child_iter = get_containing_child(start);
+
+  return seastar::do_with(child_iter, iter_end(), list_keys_result_t(), [=, &start]
+    (auto &biter, auto &eiter, auto &result) {
+    result.next = start;
+    return crimson::do_until([=, &biter, &eiter, &result] ()
+       -> list_keys_ertr::future<bool> {
+      if (biter == eiter  || result.keys.size() == max_result_size)
+        return list_keys_ertr::make_ready_future<bool>(true);
+
+      auto laddr = biter->get_node_key().laddr;
+      return omap_load_extent(oc, laddr, get_meta().depth - 1).safe_then(
+        [=, &biter, &eiter, &result] (auto &&extent) {
+        return extent->list_keys(oc, result.next, max_result_size - result.keys.size())
+          .safe_then([&biter, &eiter, &result] (auto &&list) mutable {
+          if (!list.keys.empty())
+            result.keys.insert(result.keys.end(), list.keys.begin(),list.keys.end());
+
+          biter++;
+          if (list.next != "")
+            result.next = list.next;
+          else if (biter != eiter)
+            result.next = biter->get_node_val();
+          else
+            result.next = "";
+
+          return list_keys_ertr::make_ready_future<bool>(false);
+        });
+      });
+    }).safe_then([&result, ref = OMapNodeRef(this)] {
+      return list_keys_ertr::make_ready_future<list_keys_result_t>(std::move(result));
+    });
+  });
+}
+
+OMapInnerNode::list_ret
+OMapInnerNode::list(omap_context_t oc, std::string &start, size_t max_result_size)
+{
+  logger().debug("{}: {}","OMapInnerNode",  __func__);
+  auto child_iter = get_containing_child(start);
+
+  return seastar::do_with(child_iter, iter_end(), list_kvs_result_t(), [=, &start]
+    (auto &biter, auto &eiter, auto &result) {
+    result.next = start;
+    return crimson::do_until([=, &biter, &eiter, &result] ()
+      -> list_ertr::future<bool> {
+      if (biter == eiter  || result.kvs.size() == max_result_size)
+        return list_ertr::make_ready_future<bool>(true);
+
+      auto laddr = biter->get_node_key().laddr;
+      return omap_load_extent(oc, laddr, get_meta().depth - 1).safe_then(
+        [=, &biter, &eiter, &result] (auto &&extent) {
+        return extent->list(oc, result.next, max_result_size - result.kvs.size())
+          .safe_then([&biter, &eiter, &result] (auto &&list) mutable {
+          if (!list.kvs.empty())
+            result.kvs.insert(result.kvs.end(), list.kvs.begin(),list.kvs.end());
+
+          biter++;
+          if (list.next != "")
+            result.next = list.next;
+          else if (biter != eiter)
+            result.next = biter->get_node_val();
+          else
+            result.next = "";
+
+          return list_ertr::make_ready_future<bool>(false);
+        });
+      });
+    }).safe_then([&result, ref = OMapNodeRef(this)] {
+      return list_ertr::make_ready_future<list_kvs_result_t>(std::move(result));
+    });
+  });
+}
+
+/**
+ * clear
+ *
+ * For every entry of this node: loads the referenced child, clears it
+ * recursively, then drops a reference on the child's laddr.  This node's
+ * own laddr is not dec_ref'ed here.
+ */
+OMapInnerNode::clear_ret
+OMapInnerNode::clear(omap_context_t oc)
+{
+  logger().debug("{}: {}","OMapInnerNode",  __func__);
+  return crimson::do_for_each(iter_begin(), iter_end(), [this, oc] (auto iter) {
+    auto laddr = iter->get_node_key().laddr;
+    return omap_load_extent(oc, laddr, get_meta().depth - 1).safe_then(
+      [oc] (auto &&extent) {
+      return extent->clear(oc);
+    }).safe_then([oc, laddr] {
+      return oc.tm.dec_ref(oc.t, laddr);
+    // the last continuation holds a reference to this node
+    }).safe_then([ref = OMapNodeRef(this)] (auto ret){
+      return clear_ertr::now();
+    });
+  });
+}
+
+/**
+ * make_split_children
+ *
+ * Allocates two fresh inner-node extents and distributes this node's
+ * entries between them with split_into().  Resolves to
+ * (left, right, pivot), where pivot is the value split_into() returns.
+ */
+OMapInnerNode::split_children_ret
+OMapInnerNode::make_split_children(omap_context_t oc)
+{
+  logger().debug("{}: {}","OMapInnerNode",  __func__);
+  return oc.tm.alloc_extents<OMapInnerNode>(oc.t, L_ADDR_MIN, OMAP_BLOCK_SIZE, 2)
+    .safe_then([this] (auto &&fresh) {
+      auto lhs = fresh.front();
+      auto rhs = fresh.back();
+      auto pivot = split_into(*lhs, *rhs);
+      return split_children_ret(
+             split_children_ertr::ready_future_marker{},
+             std::make_tuple(lhs, rhs, std::move(pivot)));
+  });
+}
+
+/**
+ * make_full_merge
+ *
+ * Allocates one fresh inner-node extent and fills it with the entries of
+ * this node followed by those of right (via merge_from).  Resolves to the
+ * replacement node.
+ *
+ * right must be an inner node as well; this is asserted before the
+ * down-cast, matching OMapLeafNode::make_full_merge and both
+ * make_balanced implementations.
+ */
+OMapInnerNode::full_merge_ret
+OMapInnerNode::make_full_merge(omap_context_t oc, OMapNodeRef right)
+{
+  ceph_assert(right->get_type() == type);
+  logger().debug("{}: {}","OMapInnerNode",  __func__);
+  return oc.tm.alloc_extent<OMapInnerNode>(oc.t, L_ADDR_MIN, OMAP_BLOCK_SIZE)
+    .safe_then([this, right] (auto &&replacement) {
+      replacement->merge_from(*this, *right->cast<OMapInnerNode>());
+      return full_merge_ret(
+        full_merge_ertr::ready_future_marker{},
+        std::move(replacement));
+  });
+}
+
+OMapInnerNode::make_balanced_ret
+OMapInnerNode::make_balanced(omap_context_t oc, OMapNodeRef _right)
+{
+  logger().debug("{}: {}","OMapInnerNode",  __func__);
+  ceph_assert(_right->get_type() == type);
+  return oc.tm.alloc_extents<OMapInnerNode>(oc.t, L_ADDR_MIN, OMAP_BLOCK_SIZE, 2)
+    .safe_then([this, _right] (auto &&replacement_pair){
+      auto replacement_left = replacement_pair.front();
+      auto replacement_right = replacement_pair.back();
+      auto &right = *_right->cast<OMapInnerNode>();
+      return make_balanced_ret(
+             make_balanced_ertr::ready_future_marker{},
+             std::make_tuple(replacement_left, replacement_right,
+             balance_into_new_nodes(*this, right,
+                                    *replacement_left, *replacement_right)));
+  });
+}
+
+/**
+ * merge_entry
+ *
+ * Repairs the child `entry` (referenced by iter) after it reported
+ * NEED_MERGE.  A sibling "donor" is loaded: the entry to the right of iter,
+ * or the one to the left when iter is the last entry.
+ *
+ *  - donor also below minimum: both are merged into one replacement node;
+ *    the left entry is pointed at it, the right entry is removed and the two
+ *    old nodes are dec_ref'ed.  Resolves to NEED_MERGE for this node if it
+ *    is now itself below minimum, SUCCESS otherwise.
+ *  - otherwise: the two nodes are rebalanced into two replacements.  If the
+ *    new pivot fits, the right entry is replaced in place (SUCCESS).  If it
+ *    does not, the right entry is removed and re-added through
+ *    make_split_insert(), which splits this node; this node's laddr is then
+ *    dec_ref'ed together with the two old children and the split result is
+ *    returned.
+ *
+ * If this node is not pending, the call is redirected to its mutable copy.
+ */
+OMapInnerNode::merge_entry_ret
+OMapInnerNode::merge_entry(omap_context_t oc, internal_iterator_t iter, OMapNodeRef entry)
+{
+  logger().debug("{}: {}","OMapInnerNode",  __func__);
+  if (!is_pending()) {
+    auto mut = oc.tm.get_mutable_extent(oc.t, this)->cast<OMapInnerNode>();
+    auto mut_iter = mut->iter_idx(iter->get_index());
+    return mut->merge_entry(oc, mut_iter, entry);
+  }
+  // is_left: the donor sits to the left of entry (entry is the last one)
+  auto is_left = (iter + 1) == iter_end();
+  auto donor_iter = is_left ? iter - 1 : iter + 1;
+  return omap_load_extent(oc, donor_iter->get_node_key().laddr,  get_meta().depth - 1)
+    .safe_then([=] (auto &&donor) mutable {
+    // normalise to (left, right) order for both the nodes and their entries
+    auto [l, r] = is_left ?
+      std::make_pair(donor, entry) : std::make_pair(entry, donor);
+    auto [liter, riter] = is_left ?
+      std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter);
+    if (donor->extent_is_below_min()) {
+      logger().debug("{}::merge_entry make_full_merge l {} r {}", __func__, *l, *r);
+      assert(entry->extent_is_below_min());
+      return l->make_full_merge(oc, r).safe_then([=] (auto &&replacement){
+        journal_inner_update(liter, replacement->get_laddr(), maybe_get_delta_buffer());
+        journal_inner_remove(riter, maybe_get_delta_buffer());
+        //retire extent
+        std::list<laddr_t> dec_laddrs {l->get_laddr(), r->get_laddr()};
+        return oc.tm.dec_ref(oc.t, dec_laddrs).safe_then([this, oc] (auto &&ret) {
+          // removing an entry may have left this node below minimum as well
+          if (extent_is_below_min()) {
+            return merge_entry_ret(
+                   merge_entry_ertr::ready_future_marker{},
+                   mutation_result_t(mutation_status_t::NEED_MERGE, std::nullopt,
+                                    this->cast<OMapNode>()));
+          } else {
+            return merge_entry_ret(
+                   merge_entry_ertr::ready_future_marker{},
+                   mutation_result_t(mutation_status_t::SUCCESS, std::nullopt, std::nullopt));
+          }
+        });
+      });
+    } else {
+      logger().debug("{}::merge_entry balanced l {} r {}", __func__, *l, *r);
+      return l->make_balanced(oc, r).safe_then([=] (auto tuple) {
+        auto [replacement_l, replacement_r, replacement_pivot] = tuple;
+        // the update will not cause overflow, so do it first
+        journal_inner_update(liter, replacement_l->get_laddr(), maybe_get_delta_buffer());
+        if (!extent_will_overflow(replacement_pivot.size() + 1, std::nullopt)) {
+          journal_inner_replace(riter, replacement_r->get_laddr(),
+                                replacement_pivot, maybe_get_delta_buffer());
+          std::list<laddr_t> dec_laddrs{l->get_laddr(), r->get_laddr()};
+          return oc.tm.dec_ref(oc.t, dec_laddrs).safe_then([] (auto &&ret) {
+            return merge_entry_ret(
+                   merge_entry_ertr::ready_future_marker{},
+                   mutation_result_t(mutation_status_t::SUCCESS, std::nullopt, std::nullopt));
+          });
+        } else {
+          logger().debug("{}::merge_entry balanced and split {} r {}", __func__, *l, *r);
+          // use remove + insert instead of replace; the remove cannot cause
+          // a split, so do it first
+          journal_inner_remove(riter, maybe_get_delta_buffer());
+          return make_split_insert(oc, riter, replacement_pivot, replacement_r->get_laddr())
+            .safe_then([this, oc, l = l, r = r] (auto mresult) {
+            // this node was split too, so its own laddr is retired as well
+            std::list<laddr_t> dec_laddrs{l->get_laddr(), r->get_laddr(), get_laddr()};
+            return oc.tm.dec_ref(oc.t, dec_laddrs)
+              .safe_then([mresult = std::move(mresult)] (auto &&ret){
+              return merge_entry_ret(
+                     merge_entry_ertr::ready_future_marker{},
+                     mresult);
+            });
+          });
+        }
+      });
+    }
+  });
+
+}
+
+/**
+ * get_containing_child
+ *
+ * Scans the entries in order and returns the first one whose contains(key)
+ * is true.  Not finding any is treated as a fatal error.
+ */
+OMapInnerNode::internal_iterator_t
+OMapInnerNode::get_containing_child(const std::string &key)
+{
+  auto cursor = iter_begin();
+  while (cursor != iter_end()) {
+    if (cursor.contains(key)) {
+      return cursor;
+    }
+    ++cursor;
+  }
+  ceph_assert( 0 == "invalid");
+  return iter_end();
+}
+
+// Appends this leaf node's entry count and tree depth to out.
+std::ostream &OMapLeafNode::print_detail_l(std::ostream &out) const
+{
+  out << ", size=" << get_size();
+  out << ", depth=" << get_meta().depth;
+  return out;
+}
+
+/**
+ * get_value
+ *
+ * Looks key up in this leaf.  Resolves to (key, value) when the key is
+ * present and to (key, "") when it is not.
+ */
+OMapLeafNode::get_value_ret
+OMapLeafNode::get_value(omap_context_t oc, const std::string &key)
+{
+  logger().debug("{}: {} key = {}","OMapLeafNode", __func__, key);
+  auto found = find_string_key(key);
+  if (found == iter_end()) {
+    // absent keys are reported with an empty value rather than an error
+    return get_value_ret(
+      get_value_ertr::ready_future_marker{},
+      std::make_pair(key, ""));
+  }
+  auto value = found->get_string_val();
+  return get_value_ret(
+    get_value_ertr::ready_future_marker{},
+    std::make_pair(key, value));
+}
+
+/**
+ * insert
+ *
+ * Stores key -> value in this leaf, overwriting the value if key is already
+ * present.
+ *
+ *  - If the entry fits: the mutation is journaled on this node (or on its
+ *    mutable copy when this node is not pending) and SUCCESS is returned.
+ *  - If it does not fit: the leaf is split into two new nodes, the
+ *    update/insert is applied to whichever half owns key, this node's laddr
+ *    is dec_ref'ed, and SPLITTED is returned with (left, right, pivot).
+ */
+OMapLeafNode::insert_ret
+OMapLeafNode::insert(omap_context_t oc, const std::string &key, const std::string &value)
+{
+  logger().debug("{}: {}, {} -> {}","OMapLeafNode", __func__, key, value);
+  // NOTE(review): the "+ 1" on both sizes presumably accounts for a
+  // terminator stored with each string -- confirm against the node layout.
+  if (!extent_will_overflow(key.size() + 1, value.size() + 1)) {
+    if (!is_pending()) {
+      auto mut = oc.tm.get_mutable_extent(oc.t, this)->cast<OMapLeafNode>();
+      return mut->insert(oc, key, value);
+    }
+    auto replace_pt = find_string_key(key);
+    if (replace_pt != iter_end()) {
+      journal_leaf_update(replace_pt, key, value, maybe_get_delta_buffer());
+    } else {
+      auto insert_pt = string_lower_bound(key);
+      journal_leaf_insert(insert_pt, key, value, maybe_get_delta_buffer());
+
+      logger().debug(
+        "{}: {} inserted {}->{} {}"," OMapLeafNode",  __func__,
+        insert_pt.get_node_key(),
+        insert_pt.get_node_val(),
+        insert_pt.get_string_val());
+    }
+    return insert_ret(
+           insert_ertr::ready_future_marker{},
+           mutation_result_t(mutation_status_t::SUCCESS, std::nullopt, std::nullopt));
+  } else {
+    return make_split_children(oc).safe_then([this, oc, &key, &value] (auto tuple) {
+      auto [left, right, pivot] = tuple;
+      // positions are looked up on this (pre-split) node and then rebased:
+      // unchanged for the left half, minus the left half's size for the right
+      auto replace_pt = find_string_key(key);
+      if (replace_pt != iter_end()) {
+        if (key < pivot) {  //left
+          auto mut_iter = left->iter_idx(replace_pt->get_index());
+          left->journal_leaf_update(mut_iter, key, value, left->maybe_get_delta_buffer());
+        } else if (key >= pivot) { //right
+          auto mut_iter = right->iter_idx(replace_pt->get_index() - left->get_node_size());
+          right->journal_leaf_update(mut_iter, key, value, right->maybe_get_delta_buffer());
+        }
+      } else {
+        auto insert_pt = string_lower_bound(key);
+        if (key < pivot) {  //left
+          auto mut_iter = left->iter_idx(insert_pt->get_index());
+          left->journal_leaf_insert(mut_iter, key, value, left->maybe_get_delta_buffer());
+        } else {
+          auto mut_iter = right->iter_idx(insert_pt->get_index() - left->get_node_size());
+          right->journal_leaf_insert(mut_iter, key, value, right->maybe_get_delta_buffer());
+        }
+      }
+      // this node has been replaced by the two halves
+      return oc.tm.dec_ref(oc.t, get_laddr())
+        .safe_then([tuple = std::move(tuple)] (auto ret) {
+        return insert_ret(
+               insert_ertr::ready_future_marker{},
+               mutation_result_t(mutation_status_t::SPLITTED, tuple, std::nullopt));
+      });
+    });
+  }
+}
+
+OMapLeafNode::rm_key_ret
+OMapLeafNode::rm_key(omap_context_t oc, const std::string &key)
+{
+  logger().debug("{}: {} : {}","OMapLeafNode",  __func__, key);
+  if(!is_pending()) {
+    auto mut =  oc.tm.get_mutable_extent(oc.t, this)->cast<OMapLeafNode>();
+    return mut->rm_key(oc, key);
+  }
+
+  auto rm_pt = find_string_key(key);
+  if (rm_pt != iter_end()) {
+    journal_leaf_remove(rm_pt, maybe_get_delta_buffer());
+    logger().debug(
+      "{}: removed {}->{} {}", __func__,
+      rm_pt->get_node_key(),
+      rm_pt->get_node_val(),
+      rm_pt->get_string_val());
+      if (extent_is_below_min()) {
+        return rm_key_ret(
+               rm_key_ertr::ready_future_marker{},
+               mutation_result_t(mutation_status_t::NEED_MERGE, std::nullopt,
+                                 this->cast<OMapNode>()));
+      } else {
+        return rm_key_ret(
+               rm_key_ertr::ready_future_marker{},
+               mutation_result_t(mutation_status_t::SUCCESS, std::nullopt, std::nullopt));
+      }
+  } else {
+    return rm_key_ret(
+           rm_key_ertr::ready_future_marker{},
+           mutation_result_t(mutation_status_t::FAIL, std::nullopt, std::nullopt));
+  }
+
+}
+
+OMapLeafNode::list_keys_ret
+OMapLeafNode::list_keys(omap_context_t oc, std::string &start, size_t max_result_size)
+{
+  logger().debug("{}: {}","OMapLeafNode",  __func__);
+  auto result = list_keys_result_t();
+  iterator  iter = start == "" ?  iter_begin() : string_lower_bound(start);
+  for (; iter != iter_end() && result.keys.size() <= max_result_size; iter++) {
+    result.keys.push_back(iter->get_node_val());
+  }
+  if (iter == iter_end())
+   result.next = "";
+  else
+   result.next = iter->get_node_val();
+
+  return list_keys_ertr::make_ready_future<list_keys_result_t>(std::move(result));
+
+}
+
+OMapLeafNode::list_ret
+OMapLeafNode::list(omap_context_t oc, std::string &start, size_t max_result_size)
+{
+  logger().debug("{}: {}", "OMapLeafNode", __func__);
+  auto result = list_kvs_result_t();
+  iterator  iter = start == "" ? iter_begin() : string_lower_bound(start);
+  for (; iter != iter_end() && result.kvs.size() <= max_result_size; iter++) {
+    result.kvs.push_back({iter->get_node_val(), iter->get_string_val()});
+  }
+  if (iter == iter_end())
+   result.next = "";
+  else
+   result.next = iter->get_node_val();
+
+  return list_ertr::make_ready_future<list_kvs_result_t>(std::move(result));
+}
+
+/**
+ * clear
+ *
+ * A leaf references no child extents, so there is nothing to release here;
+ * resolves immediately.  (OMapInnerNode::clear dec_refs each child's laddr
+ * after clearing it.)
+ */
+OMapLeafNode::clear_ret
+OMapLeafNode::clear(omap_context_t oc)
+{
+  return clear_ertr::now();
+}
+
+/**
+ * make_split_children
+ *
+ * Allocates two fresh leaf extents and distributes this leaf's entries
+ * between them with split_into().  Resolves to (left, right, pivot), where
+ * pivot is the value split_into() returns.
+ */
+OMapLeafNode::split_children_ret
+OMapLeafNode::make_split_children(omap_context_t oc)
+{
+  logger().debug("{}: {}","OMapLeafNode",  __func__);
+  return oc.tm.alloc_extents<OMapLeafNode>(oc.t, L_ADDR_MIN, OMAP_BLOCK_SIZE, 2)
+    .safe_then([this] (auto &&fresh) {
+      auto lhs = fresh.front();
+      auto rhs = fresh.back();
+      auto pivot = split_into(*lhs, *rhs);
+      return split_children_ret(
+             split_children_ertr::ready_future_marker{},
+             std::make_tuple(lhs, rhs, std::move(pivot)));
+  });
+}
+
+/**
+ * make_full_merge
+ *
+ * Allocates one fresh leaf extent and fills it with the entries of this
+ * leaf followed by those of right (via merge_from).  Resolves to the
+ * replacement node.  right must be a leaf as well.
+ */
+OMapLeafNode::full_merge_ret
+OMapLeafNode::make_full_merge(omap_context_t oc, OMapNodeRef right)
+{
+  ceph_assert(right->get_type() == type);
+  logger().debug("{}: {}","OMapLeafNode",  __func__);
+  return oc.tm.alloc_extent<OMapLeafNode>(oc.t, L_ADDR_MIN, OMAP_BLOCK_SIZE)
+    .safe_then([this, right] (auto &&merged) {
+      auto &rhs = *right->cast<OMapLeafNode>();
+      merged->merge_from(*this, rhs);
+      return full_merge_ret(
+        full_merge_ertr::ready_future_marker{},
+        std::move(merged));
+  });
+}
+
+OMapLeafNode::make_balanced_ret
+OMapLeafNode::make_balanced(omap_context_t oc, OMapNodeRef _right)
+{
+  ceph_assert(_right->get_type() == type);
+  logger().debug("{}: {}", "OMapLeafNode",  __func__);
+  return oc.tm.alloc_extents<OMapLeafNode>(oc.t, L_ADDR_MIN, OMAP_BLOCK_SIZE, 2)
+    .safe_then([this, _right] (auto &&replacement_pair) {
+      auto replacement_left = replacement_pair.front();
+      auto replacement_right = replacement_pair.back();
+      auto &right = *_right->cast<OMapLeafNode>();
+      return make_balanced_ret(
+             make_balanced_ertr::ready_future_marker{},
+             std::make_tuple(
+               replacement_left, replacement_right,
+               balance_into_new_nodes(
+                 *this, right,
+                 *replacement_left, *replacement_right)));
+  });
+}
+
+
+TransactionManager::read_extent_ertr::future<OMapNodeRef>
+omap_load_extent(omap_context_t oc, laddr_t laddr, depth_t depth)
+{
+  ceph_assert(depth > 0);
+  if (depth > 1) {
+    return oc.tm.read_extents<OMapInnerNode>(oc.t, laddr, OMAP_BLOCK_SIZE).safe_then(
+      [](auto&& extents) {
+      assert(extents.size() == 1);
+      [[maybe_unused]] auto [laddr, e] = extents.front();
+      return TransactionManager::read_extent_ertr::make_ready_future<OMapNodeRef>(std::move(e));
+    });
+  } else {
+    return oc.tm.read_extents<OMapLeafNode>(oc.t, laddr, OMAP_BLOCK_SIZE).safe_then(
+      [](auto&& extents) {
+      assert(extents.size() == 1);
+      [[maybe_unused]] auto [laddr, e] = extents.front();
+      return TransactionManager::read_extent_ertr::make_ready_future<OMapNodeRef>(std::move(e));
+    });
+  }
+}
+}
diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h
new file mode 100644 (file)
index 0000000..1e0f201
--- /dev/null
@@ -0,0 +1,210 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string.h>
+
+#include "include/buffer.h"
+
+#include "crimson/common/errorator.h"
+#include "crimson/os/seastore/omap_manager.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_types.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_btree_node.h"
+
+namespace crimson::os::seastore::omap_manager {
+
+/**
+ * OMapInnerNode
+ *
+ * Abstracts operations on and layout of internal nodes for the
+ * omap Tree.
+ *
+ * Layout (4k):
+ *   num_entries:   meta    :    keys    :  values  :
+ */
+
+struct OMapInnerNode
+  : OMapNode,
+    StringKVInnerNodeLayout<
+    omap_node_meta_t, omap_node_meta_le_t> {
+  using OMapInnerNodeRef = TCachedExtentRef<OMapInnerNode>;
+  using internal_iterator_t = const_iterator;
+  // Forwards all arguments to OMapNode and points the layout at this
+  // extent's buffer.
+  template <typename... T>
+  OMapInnerNode(T&&... t) :
+    OMapNode(std::forward<T>(t)...),
+    StringKVInnerNodeLayout(get_bptr().c_str()) {}
+
+  static constexpr extent_types_t type = extent_types_t::OMAP_INNER;
+
+  omap_node_meta_t get_node_meta() const final { return get_meta(); }
+  // Inner entries carry no value payload, so vsize is ignored here.
+  bool extent_will_overflow(size_t ksize, std::optional<size_t> vsize) const {
+    return is_overflow(ksize);
+  }
+  bool extent_is_below_min() const { return below_min(); }
+  uint32_t get_node_size() { return get_size(); }
+
+  // Copy used when the extent is about to be mutated; the source must not
+  // have any delta recorded yet.
+  CachedExtentRef duplicate_for_write() final {
+    assert(delta_buffer.empty());
+    return CachedExtentRef(new OMapInnerNode(*this));
+  }
+
+  // Mutations recorded against this node; serialized by get_delta().
+  delta_inner_buffer_t delta_buffer;
+  // Returns the delta buffer only while the node is mutation-pending;
+  // nullptr otherwise.
+  delta_inner_buffer_t *maybe_get_delta_buffer() {
+    return is_mutation_pending() ? &delta_buffer : nullptr;
+  }
+
+  get_value_ret get_value(omap_context_t oc, const std::string &key) final;
+
+  insert_ret insert(omap_context_t oc, const std::string &key, const std::string &value) final;
+
+  rm_key_ret rm_key(omap_context_t oc, const std::string &key) final;
+
+  list_keys_ret list_keys(omap_context_t oc, std::string &start, size_t max_result_size) final;
+
+  list_ret list(omap_context_t oc, std::string &start, size_t max_result_size) final;
+
+  clear_ret clear(omap_context_t oc) final;
+
+  // Splits this node into two newly allocated nodes; resolves to
+  // (left, right, pivot).
+  using split_children_ertr = TransactionManager::alloc_extent_ertr;
+  using split_children_ret = split_children_ertr::future
+          <std::tuple<OMapInnerNodeRef, OMapInnerNodeRef, std::string>>;
+  split_children_ret make_split_children(omap_context_t oc);
+
+  full_merge_ret make_full_merge(omap_context_t oc, OMapNodeRef right) final;
+
+  make_balanced_ret
+    make_balanced(omap_context_t oc, OMapNodeRef right) final;
+
+  // Splits this node and inserts (key, laddr) into the half that owns key.
+  using make_split_insert_ertr = TransactionManager::alloc_extent_ertr;
+  using make_split_insert_ret = make_split_insert_ertr::future<mutation_result_t>;
+  make_split_insert_ret make_split_insert(omap_context_t oc, internal_iterator_t iter,
+                                          std::string key, laddr_t laddr);
+
+  // Merges or rebalances the child at iter (entry) with a sibling after it
+  // reported NEED_MERGE.
+  using merge_entry_ertr = TransactionManager::read_extent_ertr;
+  using merge_entry_ret = merge_entry_ertr::future<mutation_result_t>;
+  merge_entry_ret merge_entry(omap_context_t oc,
+                              internal_iterator_t iter, OMapNodeRef entry);
+
+  // Records in this node that the child at iter was split (mresult carries
+  // the split tuple).
+  using handle_split_ertr = TransactionManager::read_extent_ertr;
+  using handle_split_ret = handle_split_ertr::future<mutation_result_t>;
+  handle_split_ret handle_split(omap_context_t oc, internal_iterator_t iter,
+                                      mutation_result_t mresult);
+
+  std::ostream &print_detail_l(std::ostream &out) const final;
+
+  extent_types_t get_type() const final {
+    return type;
+  }
+
+  // Serializes the recorded mutations.
+  ceph::bufferlist get_delta() final {
+    ceph::bufferlist bl;
+    delta_buffer.encode(bl);
+    return bl;
+  }
+
+  // Decodes a delta produced by get_delta() and replays it onto this node.
+  // NOTE(review): unlike OMapLeafNode::apply_delta this does not rebuild()
+  // the bufferlist before decoding -- confirm that is intentional.
+  void apply_delta(const ceph::bufferlist &bl) final {
+    assert(bl.length());
+    delta_inner_buffer_t buffer;
+    buffer.decode(bl);
+    buffer.replay(*this);
+  }
+
+  // Returns the entry whose range contains key; asserts if none does.
+  internal_iterator_t get_containing_child(const std::string &key);
+
+};
+using OMapInnerNodeRef = OMapInnerNode::OMapInnerNodeRef;
+/**
+ * OMapLeafNode
+ *
+ * Abstracts operations on and layout of leaf nodes for the
+ * OMap Tree.
+ *
+ * Layout (4k):
+ *   num_entries:   meta   :   keys   :  values  :
+ */
+
+struct OMapLeafNode
+  : OMapNode,
+    StringKVLeafNodeLayout<
+      omap_node_meta_t, omap_node_meta_le_t> {
+
+  using OMapLeafNodeRef = TCachedExtentRef<OMapLeafNode>;
+  using internal_iterator_t = const_iterator;
+  template <typename... T>
+  OMapLeafNode(T&&... t) :
+    OMapNode(std::forward<T>(t)...),
+    StringKVLeafNodeLayout(get_bptr().c_str()) {}
+
+  static constexpr extent_types_t type = extent_types_t::OMAP_LEAF;
+
+  omap_node_meta_t get_node_meta() const final { return get_meta(); }
+  bool extent_will_overflow(size_t ksize, std::optional<size_t> vsize) const {
+    return is_overflow(ksize, *vsize);
+  }
+  bool extent_is_below_min() const { return below_min(); }
+  uint32_t get_node_size() { return get_size(); }
+
+  CachedExtentRef duplicate_for_write() final {
+    assert(delta_buffer.empty());
+    return CachedExtentRef(new OMapLeafNode(*this));
+  }
+
+  delta_leaf_buffer_t delta_buffer;
+  delta_leaf_buffer_t *maybe_get_delta_buffer() {
+    return is_mutation_pending() ? &delta_buffer : nullptr;
+  }
+
+  get_value_ret get_value(omap_context_t oc, const std::string &key) final;
+
+  insert_ret insert(omap_context_t oc, const std::string &key, const std::string &value) final;
+
+  rm_key_ret rm_key(omap_context_t oc, const std::string &key) final;
+
+  list_keys_ret list_keys(omap_context_t oc, std::string &start, size_t max_result_size) final;
+
+  list_ret list(omap_context_t oc, std::string &start, size_t max_result_size) final;
+
+  clear_ret clear(omap_context_t oc) final;
+
+  using split_children_ertr = TransactionManager::alloc_extent_ertr;
+  using split_children_ret = split_children_ertr::future
+          <std::tuple<OMapLeafNodeRef, OMapLeafNodeRef, std::string>>;
+  split_children_ret make_split_children(omap_context_t oc);
+
+  full_merge_ret make_full_merge(omap_context_t oc, OMapNodeRef right) final;
+
+  make_balanced_ret make_balanced(omap_context_t oc, OMapNodeRef _right) final;
+
+  extent_types_t get_type() const final {
+    return type;
+  }
+
+  ceph::bufferlist get_delta() final {
+    ceph::bufferlist bl;
+    delta_buffer.encode(bl);
+    return bl;
+  }
+
+  void apply_delta(const ceph::bufferlist &_bl) final {
+    assert(_bl.length());
+    ceph::bufferlist bl = _bl;
+    bl.rebuild();
+    delta_leaf_buffer_t buffer;
+    buffer.decode(bl);
+    buffer.replay(*this);
+  }
+
+  std::ostream &print_detail_l(std::ostream &out) const final;
+
+  std::pair<internal_iterator_t, internal_iterator_t>
+  get_leaf_entries(std::string &key);
+
+};
+using OMapLeafNodeRef = OMapLeafNode::OMapLeafNodeRef;
+
+std::ostream &operator<<(std::ostream &out, const omap_inner_key_t &rhs);
+std::ostream &operator<<(std::ostream &out, const omap_leaf_key_t &rhs);
+}
diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_types.h b/src/crimson/os/seastore/omap_manager/btree/omap_types.h
new file mode 100644 (file)
index 0000000..d1bbb4c
--- /dev/null
@@ -0,0 +1,128 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+#include "crimson/os/seastore/seastore_types.h"
+
+namespace crimson::os::seastore::omap_manager {
+
+struct omap_node_meta_t {
+  depth_t depth = 0;
+
+  std::pair<omap_node_meta_t, omap_node_meta_t> split_into() const {
+    return std::make_pair(
+      omap_node_meta_t{depth},
+      omap_node_meta_t{depth});
+  }
+
+  static omap_node_meta_t merge_from(
+    const omap_node_meta_t &lhs, const omap_node_meta_t &rhs) {
+    assert(lhs.depth == rhs.depth);
+    return omap_node_meta_t{lhs.depth};
+  }
+
+  static std::pair<omap_node_meta_t, omap_node_meta_t>
+  rebalance(const omap_node_meta_t &lhs, const omap_node_meta_t &rhs) {
+    assert(lhs.depth == rhs.depth);
+    return std::make_pair(
+      omap_node_meta_t{lhs.depth},
+      omap_node_meta_t{lhs.depth});
+  }
+};
+
+/**
+ * omap_node_meta_le_t
+ *
+ * Little-endian counterpart of omap_node_meta_t.  Construction from the
+ * host-order form is explicit; conversion back to it is implicit.
+ */
+struct omap_node_meta_le_t {
+  depth_le_t depth = init_les32(0);
+
+  omap_node_meta_le_t() = default;
+  omap_node_meta_le_t(const omap_node_meta_le_t &) = default;
+  explicit omap_node_meta_le_t(const omap_node_meta_t &meta)
+    : depth(init_les32(meta.depth)) {}
+
+  operator omap_node_meta_t() const {
+    omap_node_meta_t host;
+    host.depth = depth;
+    return host;
+  }
+};
+
+/**
+ * omap_inner_key_t
+ *
+ * Host-order form of one inner-node entry: the offset and length of the
+ * entry's string within the node (key_off, key_len) plus a laddr.
+ *
+ * Equality operators are provided so that layout-level comparisons of
+ * decoded entries (which compare omap_inner_key_t values with !=) are
+ * well-formed.
+ */
+struct omap_inner_key_t {
+  uint16_t key_off = 0;
+  uint16_t key_len = 0;
+  laddr_t laddr = 0;
+
+  omap_inner_key_t() = default;
+  omap_inner_key_t(uint16_t off, uint16_t len, laddr_t addr)
+  : key_off(off), key_len(len), laddr(addr) {}
+
+  // Field-wise equality, mirroring omap_inner_key_le_t::operator==.
+  bool operator==(const omap_inner_key_t &b) const {
+    return key_off == b.key_off && key_len == b.key_len && laddr == b.laddr;
+  }
+  bool operator!=(const omap_inner_key_t &b) const {
+    return !(*this == b);
+  }
+};
+
+struct omap_inner_key_le_t {
+  ceph_le16 key_off = init_le16(0);
+  ceph_le16 key_len = init_le16(0);
+  laddr_le_t laddr = laddr_le_t(0);
+
+  omap_inner_key_le_t() = default;
+  omap_inner_key_le_t(const omap_inner_key_le_t &) = default;
+  explicit omap_inner_key_le_t(const omap_inner_key_t &key)
+    : key_off(init_le16(key.key_off)),
+      key_len(init_le16(key.key_len)),
+      laddr(laddr_le_t(key.laddr)) {}
+
+  operator omap_inner_key_t() const {
+    return omap_inner_key_t{uint16_t(key_off), uint16_t(key_len), laddr_t(laddr)};
+  }
+
+  omap_inner_key_le_t& operator=(omap_inner_key_t key) {
+    key_off = init_le16(key.key_off);
+    key_len = init_le16(key.key_len);
+    laddr = laddr_le_t(key.laddr);
+    return *this;
+  }
+
+  inline bool operator==(const omap_inner_key_le_t b) const {
+    return key_off == b.key_off && key_len == b.key_len && laddr == b.laddr;
+  }
+};
+
+/**
+ * omap_leaf_key_t
+ *
+ * Host-order form of one leaf-node entry: offset/length of the key
+ * string (key_off, key_len) and offset/length of the value string
+ * (val_off, val_len).
+ *
+ * Equality operators are provided for parity with omap_inner_key_t and
+ * omap_leaf_key_le_t, so decoded entries can be compared directly.
+ */
+struct omap_leaf_key_t {
+  uint16_t key_off = 0;
+  uint16_t key_len = 0;
+  uint16_t val_off = 0;
+  uint16_t val_len = 0;
+
+  omap_leaf_key_t() = default;
+  omap_leaf_key_t(uint16_t k_off, uint16_t k_len, uint16_t v_off, uint16_t v_len)
+  : key_off(k_off), key_len(k_len), val_off(v_off), val_len(v_len) {}
+
+  // Field-wise equality, mirroring omap_leaf_key_le_t::operator==.
+  bool operator==(const omap_leaf_key_t &b) const {
+    return key_off == b.key_off && key_len == b.key_len &&
+           val_off == b.val_off && val_len == b.val_len;
+  }
+  bool operator!=(const omap_leaf_key_t &b) const {
+    return !(*this == b);
+  }
+};
+
+struct omap_leaf_key_le_t {
+  ceph_le16 key_off = init_le16(0);
+  ceph_le16 key_len = init_le16(0);
+  ceph_le16 val_off = init_le16(0);
+  ceph_le16 val_len = init_le16(0);
+
+  omap_leaf_key_le_t() = default;
+  omap_leaf_key_le_t(const omap_leaf_key_le_t &) = default;
+  explicit omap_leaf_key_le_t(const omap_leaf_key_t &key)
+    : key_off(init_le16(key.key_off)),
+      key_len(init_le16(key.key_len)),
+      val_off(init_le16(key.val_off)),
+      val_len(init_le16(key.val_len)) {}
+
+  operator omap_leaf_key_t() const {
+    return omap_leaf_key_t{uint16_t(key_off), uint16_t(key_len),
+                           uint16_t(val_off), uint16_t(val_len)};
+  }
+
+  omap_leaf_key_le_t& operator=(omap_leaf_key_t key) {
+    key_off = init_le16(key.key_off);
+    key_len = init_le16(key.key_len);
+    val_off = init_le16(key.val_off);
+    val_len = init_le16(key.val_len);
+    return *this;
+  }
+
+  inline bool operator==(const omap_leaf_key_le_t b) const {
+    return key_off == b.key_off && key_len == b.key_len &&
+           val_off == b.val_off && val_len == b.val_len;
+  }
+};
+
+}
diff --git a/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h b/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h
new file mode 100644 (file)
index 0000000..bf5bac5
--- /dev/null
@@ -0,0 +1,1777 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+#include <string>
+
+#include "include/byteorder.h"
+
+#include "crimson/common/layout.h"
+#include "crimson/common/fixed_kv_node_layout.h"
+#include "crimson/os/seastore/omap_manager/btree/omap_types.h"
+
+// Byte size assumed for a node's buffer: the layouts below address string
+// data backwards from buf + BlockSize.
+#define BlockSize 4096
+namespace crimson::os::seastore::omap_manager {
+
+template <
+  typename Meta,
+  typename MetaInt,
+  bool VALIDATE_INVARIANTS=true> class StringKVInnerNodeLayout;
+
+template <
+  typename Meta,
+  typename MetaInt,
+  bool VALIDATE_INVARIANTS=true> class StringKVLeafNodeLayout;
+
+
+/**
+ * StringKVInnerNodeLayout
+ *
+ * Reusable implementation of a node layout that maps fixed-size entries,
+ * omap_inner_key_t (internal representation omap_inner_key_le_t), to
+ * variable-length strings.  Each entry records the offset and length of
+ * its string, with the offset measured back from the end of the block,
+ * together with a laddr.
+ *
+ * Uses absl::container_internal::Layout for the actual key memory layout.
+ *
+ * The primary interface exposed is centered on the iterator
+ * and related methods.
+ *
+ * Also included are helpers for doing splits and merges as for a btree.
+ */
+template <
+  typename Meta,
+  typename MetaInt,
+  bool VALIDATE_INVARIANTS>
+class StringKVInnerNodeLayout {
+  char *buf = nullptr;
+
+  using L = absl::container_internal::Layout<ceph_le32, MetaInt, omap_inner_key_le_t>;
+  static constexpr L layout{1, 1, 1}; // = L::Partial(1, 1, 1);
+
+public:
+  template <bool is_const>
+  struct iter_t {
+    friend class StringKVInnerNodeLayout;
+    using parent_t = typename crimson::common::maybe_const_t<StringKVInnerNodeLayout, is_const>::type;
+
+    parent_t node;
+    uint16_t index;
+
+    iter_t(
+      parent_t parent,
+      uint16_t index) : node(parent), index(index) {}
+
+    iter_t(const iter_t &) = default;
+    iter_t(iter_t &&) = default;
+    iter_t &operator=(const iter_t &) = default;
+    iter_t &operator=(iter_t &&) = default;
+
+    operator iter_t<!is_const>() const {
+      static_assert(!is_const);
+      return iter_t<!is_const>(node, index);
+    }
+
+    // Work nicely with for loops without requiring a nested type.
+    iter_t &operator*() { return *this; }
+    iter_t *operator->() { return this; }
+
+    iter_t operator++(int) {
+      auto ret = *this;
+      ++index;
+      return ret;
+    }
+
+    iter_t &operator++() {
+      ++index;
+      return *this;
+    }
+
+    uint16_t operator-(const iter_t &rhs) const {
+      assert(rhs.node == node);
+      return index - rhs.index;
+    }
+
+    iter_t operator+(uint16_t off) const {
+      return iter_t(
+                  node,
+                  index + off);
+    }
+    iter_t operator-(uint16_t off) const {
+      return iter_t(
+                  node,
+                  index - off);
+    }
+
+    uint16_t operator<(const iter_t &rhs) const {
+      assert(rhs.node == node);
+      return index < rhs.index;
+    }
+
+    bool operator==(const iter_t &rhs) const {
+      assert(node == rhs.node);
+      return rhs.index == index;
+    }
+
+    bool operator!=(const iter_t &rhs) const {
+      return !(*this == rhs);
+    }
+
+    omap_inner_key_t get_node_key() const {
+      omap_inner_key_le_t kint = node->get_node_key_ptr()[index];
+      return omap_inner_key_t(kint);
+    }
+
+    char *get_node_val_ptr() {
+      auto tail = node->buf + BlockSize;
+      if (*this == node->iter_end())
+        return tail;
+      else {
+        return tail - static_cast<uint32_t>(get_node_key().key_off);
+      }
+    }
+
+    const char *get_node_val_ptr() const {
+      auto tail = node->buf + BlockSize;
+      if ( *this == node->iter_end())
+        return tail;
+      else {
+        return tail - static_cast<uint32_t>(get_node_key().key_off);
+      }
+    }
+
+    void set_node_val(const std::string &val) {
+      static_assert(!is_const);
+      std::strcpy((char*)get_node_val_ptr(), val.c_str()); //copy char* to char* include "\0"
+    }
+
+    std::string get_node_val(){
+     std::string s(get_node_val_ptr());
+     return s;
+    }
+    std::string get_node_val() const{
+      std::string s(get_node_val_ptr());
+      return s;
+    }
+
+    bool contains(const std::string &key) const {
+      auto next = *this + 1;
+      if (next == node->iter_end())
+        return get_node_val() <= key;
+
+      return (get_node_val() <= key) && (next->get_node_val() > key);
+    }
+
+    uint16_t get_index() const {
+      return index;
+    }
+
+  private:
+    void set_node_key(omap_inner_key_t _lb) const {
+      static_assert(!is_const);
+      omap_inner_key_le_t lb;
+      lb = _lb;
+      node->get_node_key_ptr()[index] = lb;
+    }
+
+    typename crimson::common::maybe_const_t<char, is_const>::type get_node_key_ptr() const {
+      return reinterpret_cast<
+        typename crimson::common::maybe_const_t<char, is_const>::type>(
+              node->get_node_key_ptr() + index);
+    }
+
+  };
+  using const_iterator = iter_t<true>;
+  using iterator = iter_t<false>;
+
+  struct delta_inner_t {
+    enum class op_t : uint8_t {
+      INSERT,
+      UPDATE,
+      REMOVE,
+    } op;
+    omap_inner_key_le_t key;
+    std::string val;
+
+    void replay(StringKVInnerNodeLayout &l) {
+      switch (op) {
+      case op_t::INSERT: {
+       l.inner_insert(l.string_lower_bound(val), key, val);
+       break;
+      }
+      case op_t::UPDATE: {
+       auto iter = l.find_string_key(val);
+       assert(iter != l.iter_end());
+       l.inner_update(iter, key);
+       break;
+      }
+      case op_t::REMOVE: {
+       auto iter = l.find_string_key(val);
+       assert(iter != l.iter_end());
+       l.inner_remove(iter);
+       break;
+      }
+      default:
+       assert(0 == "Impossible");
+      }
+    }
+
+    bool operator==(const delta_inner_t &rhs) const {
+      return op == rhs.op &&
+             key == rhs.key &&
+             val == rhs.val;
+    }
+  };
+
+public:
+  class delta_inner_buffer_t {
+    std::vector<delta_inner_t> buffer;
+  public:
+    bool empty() const {
+      return buffer.empty();
+    }
+    void insert(
+      const omap_inner_key_t &key,
+      const std::string val) {
+      omap_inner_key_le_t k;
+      k = key;
+      buffer.push_back(
+       delta_inner_t{
+         delta_inner_t::op_t::INSERT,
+         k,
+         val
+       });
+    }
+    void update(
+      const omap_inner_key_t &key,
+      const std::string &val) {
+      omap_inner_key_le_t k;
+      k = key;
+      buffer.push_back(
+       delta_inner_t{
+         delta_inner_t::op_t::UPDATE,
+         k,
+         val
+       });
+    }
+    void remove(std::string val) {
+      buffer.push_back(
+       delta_inner_t{
+         delta_inner_t::op_t::REMOVE,
+         omap_inner_key_le_t(),
+         val
+       });
+    }
+
+    void replay(StringKVInnerNodeLayout &node) {
+      for (auto &i: buffer) {
+        i.replay(node);
+      }
+    }
+    size_t get_bytes() const {
+      size_t size = 0;
+      for (auto &i: buffer) {
+        size += sizeof(i.op_t) + sizeof(i.key) + i.val.size();
+      }
+      return size;
+    }
+    //copy out
+    void encode(ceph::bufferlist &bl) {
+      using ceph::encode;
+      uint32_t num = buffer.size();
+      encode(num, bl);
+      for (auto &&i: buffer) {
+        encode(i.op, bl);
+        bl.append((char*)&(i.key), sizeof(i.key));
+        encode(i.val, bl);
+      }
+      buffer.clear();
+    }
+    //copy in
+    void decode(const ceph::bufferlist &bl) {
+      using ceph::decode;
+      auto p = bl.cbegin();
+      uint32_t num;
+      decode (num, p);
+      while (num--) {
+        delta_inner_t delta;
+        decode(delta.op, p);
+        omap_inner_key_le_t key;
+        p.copy(sizeof(key), (char*)&(key));
+        delta.key = key;
+        decode(delta.val, p);
+        buffer.push_back(delta);
+      }
+    }
+
+    bool operator==(const delta_inner_buffer_t &rhs) const {
+      return buffer == rhs.buffer;
+    }
+  };
+
+  void journal_inner_insert(
+    const_iterator _iter,
+    const laddr_t laddr,
+    const std::string val,
+    delta_inner_buffer_t *recorder) {
+    auto iter = iterator(this, _iter.index);
+    omap_inner_key_t node_key;
+    node_key.laddr = laddr;
+    node_key.key_len = val.size() + 1;
+    node_key.key_off = iter.get_index() == 0 ?
+                       node_key.key_len :
+                       (iter - 1).get_node_key().key_off + node_key.key_len;
+    if (recorder) {
+      recorder->insert(
+        node_key,
+        val);
+    }
+    inner_insert(iter, node_key, val);
+  }
+
+  // Replaces the laddr of the entry at _iter, keeping its string, and
+  // logs the update when a recorder is supplied.
+  void journal_inner_update(
+    const_iterator _iter,
+    const laddr_t laddr,
+    delta_inner_buffer_t *recorder) {
+    iterator pos(this, _iter.index);
+    omap_inner_key_t entry = pos.get_node_key();
+    entry.laddr = laddr;
+    if (recorder != nullptr) {
+      recorder->update(entry, pos.get_node_val());
+    }
+    inner_update(pos, entry);
+  }
+
+  void journal_inner_replace(
+    const_iterator _iter,
+    const laddr_t laddr,
+    const std::string val,
+    delta_inner_buffer_t *recorder) {
+    auto iter = iterator(this, _iter.index);
+    omap_inner_key_t node_key;
+    node_key.laddr = laddr;
+    node_key.key_len = val.size() + 1;
+    node_key.key_off = iter.get_index() == 0?
+                       node_key.key_len :
+                       (iter - 1).get_node_key().key_off + node_key.key_len;
+    if (recorder) {
+      recorder->remove(iter->get_node_val());
+      recorder->insert(node_key, val);
+    }
+    inner_replace(iter, node_key, val);
+  }
+
+  // Removes the entry at _iter, logging the removal (by string) first
+  // when a recorder is supplied.
+  void journal_inner_remove(
+    const_iterator _iter,
+    delta_inner_buffer_t *recorder) {
+    iterator pos(this, _iter.index);
+    if (recorder != nullptr) {
+      recorder->remove(pos.get_node_val());
+    }
+    inner_remove(pos);
+  }
+
+  // Stores the pointer to the block buffer this layout operates on;
+  // nothing is copied.
+  StringKVInnerNodeLayout(char *buf) :
+    buf(buf) {}
+
+  // Number of entries, read from the ceph_le32 at layout position 0.
+  uint32_t get_size() const {
+    return uint32_t(*layout.template Pointer<0>(buf));
+  }
+
+  /**
+   * set_size
+   *
+   * Store `size` as the entry count in the ceph_le32 at layout
+   * position 0.
+   */
+  void set_size(uint32_t size) {
+    ceph_le32 encoded;
+    encoded = size;
+    auto *slot = layout.template Pointer<0>(buf);
+    *slot = encoded;
+  }
+
+  // Const iterator at index 0.
+  const_iterator iter_begin() const {
+    const_iterator first(this, 0);
+    return first;
+  }
+
+  // Const iterator one past the last entry (index == get_size()).
+  const_iterator iter_end() const {
+    const_iterator past_last(this, get_size());
+    return past_last;
+  }
+
+  // Mutable iterator at index 0.
+  iterator iter_begin() {
+    iterator first(this, 0);
+    return first;
+  }
+
+  // Mutable iterator one past the last entry (index == get_size()).
+  iterator iter_end() {
+    iterator past_last(this, get_size());
+    return past_last;
+  }
+
+  // Const iterator at index `off`; no bounds check is performed.
+  const_iterator iter_idx(uint16_t off) const {
+    const_iterator at(this, off);
+    return at;
+  }
+
+  const_iterator string_lower_bound(std::string str) const {
+    uint16_t start = 0, end = get_size();
+    while (start != end) {
+      unsigned mid = (start + end) / 2;
+      const_iterator iter(this, mid);
+      std::string s = iter->get_node_val();
+      if (s < str)
+        start = ++mid;
+      if ( s > str)
+        end = mid;
+      if (s == str)
+        return iter;
+    }
+    return const_iterator(this, start);
+  }
+
+  // Mutable variant: runs the const search and rewraps its index.
+  iterator string_lower_bound(std::string str) {
+    const StringKVInnerNodeLayout &self = *this;
+    const_iterator found = self.string_lower_bound(str);
+    return iterator(this, found.index);
+  }
+
+  // Linear scan for the first entry whose string is greater than str;
+  // iter_end() when there is none.
+  const_iterator string_upper_bound(std::string str) const {
+    auto it = iter_begin();
+    while (it != iter_end() && !(it->get_node_val() > str)) {
+      ++it;
+    }
+    return it;
+  }
+
+  // Mutable variant: runs the const scan and rewraps its index.
+  iterator string_upper_bound(std::string str) {
+    const StringKVInnerNodeLayout &self = *this;
+    const_iterator found = self.string_upper_bound(str);
+    return iterator(this, found.index);
+  }
+
+  // Linear scan for the entry whose string equals str; iter_end() when
+  // there is none.
+  const_iterator find_string_key(const std::string &str) const {
+    auto it = iter_begin();
+    while (it != iter_end() && it->get_node_val() != str) {
+      ++it;
+    }
+    return it;
+  }
+  // Mutable variant: runs the const scan and rewraps its index.
+  iterator find_string_key(const std::string &str) {
+    const StringKVInnerNodeLayout &self = *this;
+    const_iterator found = self.find_string_key(str);
+    return iterator(this, found.index);
+  }
+
+  const_iterator get_split_pivot() const {
+    uint32_t total_size = omap_inner_key_t(get_node_key_ptr()[get_size()-1]).key_off;
+    uint32_t pivot_size = total_size / 2;
+    uint32_t size = 0;
+    for (auto ite = iter_begin(); ite < iter_end(); ite++) {
+      auto node_key = ite->get_node_key();
+      size += node_key.key_len;
+      if (size >= pivot_size){
+        return ite;
+      }
+    }
+    return iter_end();
+  }
+
+
+  /**
+   * get_meta/set_meta
+   *
+   * Enables stashing a templated type within the layout.
+   * Cannot be modified after initial write as it is not represented
+   * in delta_t
+   */
+  // Converts the MetaInt stored at layout position 1 into a Meta.
+  Meta get_meta() const {
+    return Meta(*layout.template Pointer<1>(buf));
+  }
+  // Converts meta to MetaInt and stores it at layout position 1.
+  void set_meta(const Meta &meta) {
+    auto *slot = layout.template Pointer<1>(buf);
+    *slot = MetaInt(meta);
+  }
+
+  // Bytes in use: the string bytes (the last entry's key_off) plus one
+  // key slot per entry.  An empty node uses 0.
+  uint32_t used_space() const {
+    const uint32_t count = get_size();
+    if (count == 0) {
+      return 0;
+    }
+    const omap_inner_key_t last_key(get_node_key_ptr()[count-1]);
+    return last_key.key_off + count * sizeof(omap_inner_key_le_t);
+  }
+
+  // Bytes still available: capacity() minus used_space().
+  uint32_t free_space() const {
+    const uint32_t in_use = used_space();
+    return capacity() - in_use;
+  }
+
+  // Bytes available for key slots and strings: BlockSize minus the
+  // distance from the size field to the start of the key array.
+  uint16_t capacity() const {
+    char *size_field = reinterpret_cast<char*>(layout.template Pointer<0>(buf));
+    char *key_array = reinterpret_cast<char*>(layout.template Pointer<2>(buf));
+    return BlockSize - (key_array - size_field);
+  }
+
+  // Address `off` bytes before the end of the block.
+  char* from_end(int off) {
+    const int from_start = BlockSize - off;
+    return buf + from_start;
+  }
+
+  // True when one more key slot plus ksize string bytes would not fit
+  // in the remaining free space.
+  bool is_overflow(size_t ksize) const {
+    const size_t needed = sizeof(omap_inner_key_le_t) + ksize;
+    return free_space() < needed;
+  }
+  // True when more than half of the capacity is free.
+  bool below_min() const {
+    const uint32_t half = capacity() / 2;
+    return free_space() > half;
+  }
+
+  bool operator==(const StringKVInnerNodeLayout &rhs) const {
+    if (get_size() != rhs.get_size()) {
+      return false;
+    }
+
+    auto iter = iter_begin();
+    auto iter2 = rhs.iter_begin();
+    while (iter != iter_end()) {
+      if (iter->get_node_key() != iter2->get_node_key() ||
+          iter->get_node_val() != iter2->get_node_val()) {
+          return false;
+      }
+      iter++;
+      iter2++;
+    }
+    return true;
+  }
+
+  /**
+   * split_into
+   *
+   * Takes *this and splits its contents into left and right.
+   *
+   * The split point is get_split_pivot(): entries before it go to left,
+   * the pivot entry and everything after it go to right.  Both outputs
+   * receive metadata from get_meta().split_into().
+   *
+   * Returns the string of the pivot entry, i.e. the first string stored
+   * in right.
+   */
+  std::string split_into(
+    StringKVInnerNodeLayout &left,
+    StringKVInnerNodeLayout &right) const {
+    auto piviter = get_split_pivot();
+    assert(piviter != iter_end());
+
+    // [begin, pivot) -> left
+    left.copy_from_foreign_head(left.iter_begin(), iter_begin(), piviter);
+    left.set_size(piviter - iter_begin());
+
+    // [pivot, end) -> right
+    right.copy_from_foreign_back(right.iter_begin(), piviter, iter_end());
+    right.set_size(iter_end() - piviter);
+
+    auto [lmeta, rmeta] = get_meta().split_into();
+    left.set_meta(lmeta);
+    right.set_meta(rmeta);
+
+    return piviter->get_node_val();
+  }
+
+  /**
+   * merge_from
+   *
+   * Takes two nodes and copies their contents into *this.
+   * left's entries are copied first, right's entries are appended after
+   * them, and the metadata is Meta::merge_from of both.
+   *
+   * precondition: left.size() + right.size() < CAPACITY
+   */
+  void merge_from(
+    const StringKVInnerNodeLayout &left,
+    const StringKVInnerNodeLayout &right) {
+    // NOTE(review): the target is iter_end(); presumably *this is empty
+    // here so that this is index 0 -- confirm against callers.
+    copy_from_foreign_head(
+      iter_end(),
+      left.iter_begin(),
+      left.iter_end());
+    // the size must be updated before appending, since the append below
+    // targets iter_end() and reads the entry preceding it
+    set_size(left.get_size());
+
+    append_copy_from_foreign_head(
+      iter_end(),
+      right.iter_begin(),
+      right.iter_end());
+    set_size(left.get_size() + right.get_size());
+    set_meta(Meta::merge_from(left.get_meta(), right.get_meta()));
+  }
+
+  /**
+   * balance_into_new_nodes
+   *
+   * Takes the contents of left and right and copies them into
+   * replacement_left and replacement_right such that
+   * the size of replacement_left just >= 1/2 of (left + right)
+   *
+   * "Size" here is measured in string bytes (the key_off of each node's
+   * last entry), not in number of entries.  Both inputs are read at
+   * index get_size()-1, so neither may be empty.
+   *
+   * Returns the string of the pivot entry, which becomes the first
+   * entry of replacement_right.
+   */
+  static std::string balance_into_new_nodes(
+    const StringKVInnerNodeLayout &left,
+    const StringKVInnerNodeLayout &right,
+    StringKVInnerNodeLayout &replacement_left,
+    StringKVInnerNodeLayout &replacement_right)
+  {
+    // string bytes held by each input node
+    uint32_t left_size = omap_inner_key_t(left.get_node_key_ptr()[left.get_size()-1]).key_off;
+    uint32_t right_size = omap_inner_key_t(right.get_node_key_ptr()[right.get_size()-1]).key_off;
+    uint32_t total = left_size + right_size;
+    uint32_t pivot_size = total / 2;
+    // pivot_idx indexes the concatenation left ++ right
+    uint32_t pivot_idx = 0;
+    if (pivot_size < left_size) {
+      // the pivot lies inside left
+      uint32_t size = 0;
+      for (auto ite = left.iter_begin(); ite < left.iter_end(); ite++) {
+        auto node_key = ite->get_node_key();
+        size += node_key.key_len;
+        if (size >= pivot_size){
+          pivot_idx = ite.get_index();
+          break;
+        }
+      }
+    } else {
+      // the pivot lies inside right: look for the remaining bytes there
+      uint32_t more_size = pivot_size - left_size;
+      uint32_t size = 0;
+      for (auto ite = right.iter_begin(); ite < right.iter_end(); ite++) {
+        auto node_key = ite->get_node_key();
+        size += node_key.key_len;
+        if (size >= more_size){
+          pivot_idx = ite.get_index() + left.get_size();
+          break;
+        }
+      }
+    }
+
+    auto replacement_pivot = pivot_idx >= left.get_size() ?
+      right.iter_idx(pivot_idx - left.get_size())->get_node_val() :
+      left.iter_idx(pivot_idx)->get_node_val();
+
+    if (pivot_size < left_size) {
+      // replacement_left  = left[0, pivot)
+      // replacement_right = left[pivot, end) ++ right
+      replacement_left.copy_from_foreign_head(
+        replacement_left.iter_end(),
+        left.iter_begin(),
+        left.iter_idx(pivot_idx));
+      replacement_left.set_size(pivot_idx);
+
+      replacement_right.copy_from_foreign_back(
+        replacement_right.iter_end(),
+        left.iter_idx(pivot_idx),
+        left.iter_end());
+      replacement_right.set_size(left.get_size() - pivot_idx);
+
+      replacement_right.append_copy_from_foreign_head(
+        replacement_right.iter_end(),
+        right.iter_begin(),
+        right.iter_end());
+      replacement_right.set_size(right.get_size() + left.get_size()- pivot_idx);
+    } else {
+      // replacement_left  = left ++ right[0, pivot)
+      // replacement_right = right[pivot, end)
+      replacement_left.copy_from_foreign_head(
+        replacement_left.iter_end(),
+        left.iter_begin(),
+        left.iter_end());
+      replacement_left.set_size(left.get_size());
+
+      replacement_left.append_copy_from_foreign_head(
+        replacement_left.iter_end(),
+        right.iter_begin(),
+        right.iter_idx(pivot_idx - left.get_size()));
+      replacement_left.set_size(pivot_idx);
+
+      replacement_right.copy_from_foreign_back(
+        replacement_right.iter_end(),
+        right.iter_idx(pivot_idx - left.get_size()),
+        right.iter_end());
+      replacement_right.set_size(right.get_size() + left.get_size() - pivot_idx);
+    }
+
+    auto [lmeta, rmeta] = Meta::rebalance(
+      left.get_meta(), right.get_meta());
+    replacement_left.set_meta(lmeta);
+    replacement_right.set_meta(rmeta);
+    return replacement_pivot;
+  }
+
+private:
+  // Inserts entry `key` with string `val` at position `iter`.
+  // With VALIDATE_INVARIANTS the neighbours are asserted to keep the
+  // strings strictly ordered and the node is asserted to have room.
+  void inner_insert(
+    iterator iter,
+    const omap_inner_key_t key,
+    const std::string &val) {
+    if (VALIDATE_INVARIANTS) {
+      if (iter != iter_begin()) {
+        assert((iter - 1)->get_node_val() < val);
+      }
+      if (iter != iter_end()) {
+        assert(iter->get_node_val() > val);
+      }
+      assert(is_overflow(val.size() + 1) == false);
+    }
+    // make room: shift the entries from iter onwards back by one slot
+    // (nothing to shift when appending)
+    if (get_size() != 0 && iter != iter_end())
+      local_move_back(key, iter + 1, iter, iter_end());
+
+    iter->set_node_key(key);
+    // the size is bumped before writing the string: get_node_val_ptr()
+    // treats an iterator equal to iter_end() as the end of the block
+    set_size(get_size() + 1);
+    iter->set_node_val(val);
+  }
+
+  // Overwrites the key slot at iter with key; the stored string is not
+  // touched.  iter must refer to an existing entry.
+  void inner_update(iterator iter, omap_inner_key_t key) {
+    assert(iter != iter_end());
+    iter.set_node_key(key);
+  }
+
+  // Replaces the entry at `iter` by (key, val), implemented as a removal
+  // followed by an insertion at the same position.
+  // With VALIDATE_INVARIANTS the new string is asserted to sort strictly
+  // between the neighbouring entries and the node is asserted to have
+  // room for it (checked before the old entry is removed).
+  void inner_replace(
+    iterator iter,
+    const omap_inner_key_t &key,
+    const std::string &val) {
+    assert(iter != iter_end());
+    if (VALIDATE_INVARIANTS) {
+      if (iter != iter_begin()) {
+        assert((iter - 1)->get_node_val() < val);
+      }
+      if ((iter + 1) != iter_end()) {
+        assert((iter + 1)->get_node_val() > val);
+      }
+      assert(is_overflow(val.size() + 1) == false);
+    }
+    inner_remove(iter);
+    inner_insert(iter, key, val);
+  }
+
+  // Removes the entry at iter.  Unless it is the last entry, the
+  // entries after it are moved ahead to close the gap; the size is then
+  // decremented.
+  void inner_remove(iterator iter) {
+    assert(iter != iter_end());
+    const bool is_last = ((iter + 1) == iter_end());
+    if (!is_last) {
+      local_move_ahead(iter, iter + 1, iter_end());
+    }
+    set_size(get_size() - 1);
+  }
+
+  /**
+   * get_node_key_ptr
+   *
+   * Get pointer to start of key array
+   */
+  omap_inner_key_le_t *get_node_key_ptr() {
+    // layout sized for the current number of entries
+    auto sized = L::Partial(1, 1, get_size());
+    return sized.template Pointer<2>(buf);
+  }
+  const omap_inner_key_le_t *get_node_key_ptr() const {
+    // layout sized for the current number of entries
+    auto sized = L::Partial(1, 1, get_size());
+    return sized.template Pointer<2>(buf);
+  }
+
+  /**
+   * copy_from_foreign_head
+   *
+   * Copy from another node begin entries to this node.
+   * [from_src, to_src) is another node entry range.
+   * tgt is this node entry to copy to.
+   * tgt and from_src must be from different nodes.
+   * from_src and to_src must be in the same node.
+   */
+  static void copy_from_foreign_head(
+    iterator tgt,
+    const_iterator from_src,
+    const_iterator to_src) {
+    assert(tgt->node != from_src->node);
+    assert(to_src->node == from_src->node);
+    // Strings: copy everything from the start of the last source entry's
+    // string up to the end of the source block (key_off bytes) to the
+    // same distance from the end of the target block.  Because the span
+    // always reaches the block end, from_src is expected to be the
+    // source's first entry.  Reads entry to_src - 1, so the range must
+    // not be empty.
+    void* des = tgt.node->from_end((to_src -1)->get_node_key().key_off);
+    void* src = (to_src - 1)->get_node_val_ptr();
+    size_t len = (to_src -1)->get_node_key().key_off;
+    memcpy(des, src, len);
+    // Key slots: byte-wise copy of [from_src, to_src) to tgt; offsets are
+    // kept as they are.
+    memcpy(
+      tgt->get_node_key_ptr(), from_src->get_node_key_ptr(),
+      to_src->get_node_key_ptr() - from_src->get_node_key_ptr());
+  }
+
+  /**
+   * copy_from_foreign_back
+   *
+   * Copy from another node back entries to this node.
+   * [from_src, to_src) is another node entry range.
+   * tgt is this node entry to copy to.
+   * tgt and from_src must be from different nodes.
+   * from_src and to_src must be in the same node.
+   */
+  void copy_from_foreign_back(
+    iterator tgt,
+    const_iterator from_src,
+    const_iterator to_src) {
+    assert(tgt->node != from_src->node);
+    assert(to_src->node == from_src->node);
+    // string bytes belonging to the source entries that precede from_src
+    auto offset = from_src.get_index() == 0? 0: (from_src-1)->get_node_key().key_off;
+    // Strings of [from_src, to_src) are placed flush against the end of
+    // the target block.
+    void* des = tgt.node->from_end((to_src -1)->get_node_key().key_off - offset);
+    void* src = (to_src - 1)->get_node_val_ptr();
+    size_t len = from_src.get_index() == 0? (to_src -1)->get_node_key().key_off:
+                 (from_src-1)->get_node_val_ptr() - (to_src -1)->get_node_val_ptr();
+    memcpy(des, src, len);
+    // Key slots: byte-wise copy of [from_src, to_src) to tgt.
+    memcpy(
+      tgt->get_node_key_ptr(), from_src->get_node_key_ptr(),
+      to_src->get_node_key_ptr() - from_src->get_node_key_ptr());
+    // copied from the very start of the source: offsets are already right
+    if ( from_src.get_index() == 0)
+      return;
+
+    // Otherwise rebase every copied key_off by the string bytes that were
+    // left behind in the source.
+    omap_inner_key_t key = (from_src - 1)->get_node_key();
+    auto end_idx = tgt.get_index() + to_src.get_index() - from_src.get_index();
+    for (auto ite = tgt; ite.get_index() != end_idx; ite++) {
+       omap_inner_key_t node_key = ite->get_node_key();
+       node_key.key_off -= key.key_off;
+       ite->set_node_key(node_key);
+    }
+  }
+
+  /**
+   * append_copy_from_foreign_head
+   *
+   * Append the head entries of another node to the back of this node.
+   * [from_src, to_src) is another node entry range.
+   * tgt is this node entry to copy to.
+   * tgt and from_src must be from different nodes.
+   * from_src and to_src must be in the same node.
+   */
+  void append_copy_from_foreign_head(
+    iterator tgt,
+    const_iterator from_src,
+    const_iterator to_src) {
+    assert(tgt->node != from_src->node);
+    assert(to_src->node == from_src->node);
+    // nothing to append
+    if (from_src == to_src)
+      return;
+
+    // Strings: the source span from the last entry's string up to the
+    // source block end is placed directly below the strings already held
+    // by the target.  Reads entry tgt - 1, so the target must already
+    // contain at least one entry before tgt.
+    void* des = tgt.node->from_end((to_src -1)->get_node_key().key_off + (tgt - 1)->get_node_key().key_off);
+    void* src = (to_src - 1)->get_node_val_ptr();
+    size_t len = (to_src -1)->get_node_key().key_off;
+    memcpy(des, src, len);
+    // Key slots: byte-wise copy of [from_src, to_src) to tgt.
+    memcpy(
+      tgt->get_node_key_ptr(), from_src->get_node_key_ptr(),
+      to_src->get_node_key_ptr() - from_src->get_node_key_ptr());
+    // Rebase every copied key_off by the string bytes the target already
+    // held before the append.
+    omap_inner_key_t key = (tgt - 1)->get_node_key();
+    auto end_idx = tgt.get_index() + to_src.get_index() - from_src.get_index();
+    for (auto ite = tgt; ite.get_index() != end_idx; ite++) {
+       omap_inner_key_t node_key = ite->get_node_key();
+       node_key.key_off += key.key_off;
+       ite->set_node_key(node_key);
+    }
+  }
+
+  /**
+   * local_move_back
+   *
+   * move this node entries range [from_src, to_src) back to tgt position.
+   *
+   * tgt, from_src, and to_src must be from the same node.
+   */
+  static void local_move_back(
+    omap_inner_key_t key,
+    iterator tgt,
+    iterator from_src,
+    iterator to_src) {
+    assert(tgt->node == from_src->node);
+    assert(to_src->node == from_src->node);
+    // Strings of [from_src, to_src) move key.key_len bytes towards the
+    // start of the block, opening a gap of that size for the new string.
+    void* des = (to_src-1)->get_node_val_ptr() - key.key_len;
+    void* src = (to_src-1)->get_node_val_ptr();
+    // length of the strings being moved; when from_src is the first entry
+    // they extend up to the end of the block
+    size_t len = from_src.get_index() == 0?
+                 from_src->node->buf + BlockSize - (to_src-1)->get_node_val_ptr():
+                 (from_src-1)->get_node_val_ptr() - (to_src-1)->get_node_val_ptr();
+
+    // source and destination may overlap, hence memmove
+    memmove(des, src, len);
+    // the moved strings are now key.key_len further from the block end
+    for ( auto ite = from_src; ite < to_src; ite++) {
+      omap_inner_key_t node_key = ite->get_node_key();
+      node_key.key_off += key.key_len;
+      ite->set_node_key(node_key);
+    }
+    // finally shift the (already adjusted) key slots to tgt
+    memmove(
+      tgt->get_node_key_ptr(), from_src->get_node_key_ptr(),
+      to_src->get_node_key_ptr() - from_src->get_node_key_ptr());
+  }
+
+  /**
+   * local_move_ahead
+   *
+   * move this node entries range [from_src, to_src) ahead to tgt position.
+   *
+   * tgt, from_src, and to_src must be from the same node.
+   */
+  static void local_move_ahead(
+    iterator tgt,
+    iterator from_src,
+    iterator to_src) {
+    assert(tgt->node == from_src->node);
+    assert(to_src->node == from_src->node);
+    assert(from_src.get_index() != 0);
+    // the entry at tgt is the one being overwritten; its string length
+    // is the size of the gap to close
+    omap_inner_key_t key = tgt->get_node_key();
+    // Strings of [from_src, to_src) move key.key_len bytes towards the
+    // end of the block.
+    void* des = (to_src-1)->get_node_val_ptr() + key.key_len;
+    void* src = (to_src-1)->get_node_val_ptr();
+    size_t len = (from_src-1)->get_node_val_ptr() - (to_src-1)->get_node_val_ptr();
+    // source and destination may overlap, hence memmove
+    memmove(des, src, len);
+    // the moved strings are now key.key_len closer to the block end
+    for ( auto ite = from_src; ite < to_src; ite++) {
+      omap_inner_key_t node_key = ite->get_node_key();
+      node_key.key_off -= key.key_len;
+      ite->set_node_key(node_key);
+    }
+    // finally shift the (already adjusted) key slots to tgt
+    memmove(
+      tgt->get_node_key_ptr(), from_src->get_node_key_ptr(),
+      to_src->get_node_key_ptr() - from_src->get_node_key_ptr());
+  }
+
+};
+
+template <
+  typename Meta,
+  typename MetaInt,
+  bool VALIDATE_INVARIANTS>
+class StringKVLeafNodeLayout {
+  char *buf = nullptr;
+
+  using L = absl::container_internal::Layout<ceph_le32, MetaInt, omap_leaf_key_le_t>;
+  static constexpr L layout{1, 1, 1}; // = L::Partial(1, 1, 1);
+
+public:
+  template <bool is_const>
+  struct iter_t {
+    friend class StringKVLeafNodeLayout;
+    using parent_t = typename crimson::common::maybe_const_t<StringKVLeafNodeLayout, is_const>::type;
+
+    parent_t node;
+    uint16_t index;
+
+    iter_t(
+      parent_t parent,
+      uint16_t index) : node(parent), index(index) {}
+
+    iter_t(const iter_t &) = default;
+    iter_t(iter_t &&) = default;
+    iter_t &operator=(const iter_t &) = default;
+    iter_t &operator=(iter_t &&) = default;
+
+    operator iter_t<!is_const>() const {
+      static_assert(!is_const);
+      return iter_t<!is_const>(node, index);
+    }
+
+    // Work nicely with for loops without requiring a nested type.
+    iter_t &operator*() { return *this; }
+    iter_t *operator->() { return this; }
+
+    iter_t operator++(int) {
+      auto ret = *this;
+      ++index;
+      return ret;
+    }
+
+    iter_t &operator++() {
+      ++index;
+      return *this;
+    }
+
+    uint16_t operator-(const iter_t &rhs) const {
+      assert(rhs.node == node);
+      return index - rhs.index;
+    }
+
+    iter_t operator+(uint16_t off) const {
+      return iter_t(
+             node,
+             index + off);
+    }
+    iter_t operator-(uint16_t off) const {
+      return iter_t(
+             node,
+             index - off);
+    }
+
+    uint16_t operator<(const iter_t &rhs) const {
+      assert(rhs.node == node);
+      return index < rhs.index;
+    }
+
+    bool operator==(const iter_t &rhs) const {
+      assert(node == rhs.node);
+      return rhs.index == index;
+    }
+
+    bool operator!=(const iter_t &rhs) const {
+      assert(node == rhs.node);
+      return index != rhs.index;
+    }
+
+    omap_leaf_key_t get_node_key() const {
+      omap_leaf_key_le_t kint = node->get_node_key_ptr()[index];
+      return omap_leaf_key_t(kint);
+    }
+
+    char *get_node_val_ptr() {
+      auto tail = node->buf + BlockSize;
+      if ( *this == node->iter_end())
+        return tail;
+      else
+        return tail - static_cast<int>(get_node_key().key_off);
+    }
+
+    const char *get_node_val_ptr() const {
+      auto tail = node->buf + BlockSize;
+      if ( *this == node->iter_end())
+        return tail;
+      else
+        return tail - static_cast<int>(get_node_key().key_off);
+    }
+
+    char *get_string_val_ptr() {
+      auto tail = node->buf + BlockSize;
+      return tail - static_cast<int>(get_node_key().val_off);
+    }
+
+    const char *get_string_val_ptr() const {
+      auto tail = node->buf + BlockSize;
+      return tail - static_cast<int>(get_node_key().val_off);
+    }
+
+    void set_node_val(std::string val) const {
+      static_assert(!is_const);
+      std::strcpy((char*)get_node_val_ptr(), val.c_str()); //copy char* to char* include "\0"
+    }
+
+    std::string get_node_val() {
+     std::string s(get_node_val_ptr());
+      return s;
+    }
+    std::string get_node_val() const{
+     std::string s(get_node_val_ptr());
+      return s;
+    }
+
+    void set_string_val(std::string val) {
+      static_assert(!is_const);
+      std::strcpy((char*)get_string_val_ptr(), val.c_str()); //copy char* to char* include "\0"
+    }
+
+    std::string get_string_val() const {
+      std::string s(get_string_val_ptr());
+      return s;
+    }
+
+    bool contains(const std::string &key) const {
+      auto next = *this + 1;
+      if (*this == node->iter_begin()){
+        if (next->get_node_val() > key)
+          return true;
+        else
+          return false;
+      }
+      if (next == node->iter_end())
+        return get_node_val() <= key;
+
+      return (get_node_val() <= key) && (next->get_node_val() > key);
+    }
+
+    uint16_t get_index() const {
+      return index;
+    }
+
+  private:
+    void set_node_key(omap_leaf_key_t _lb) const {
+      static_assert(!is_const);
+      omap_leaf_key_le_t lb;
+      lb = _lb;
+      node->get_node_key_ptr()[index] = lb;
+    }
+
+    typename crimson::common::maybe_const_t<char, is_const>::type get_node_key_ptr() const {
+      return reinterpret_cast<
+        typename crimson::common::maybe_const_t<char, is_const>::type>(
+        node->get_node_key_ptr() + index);
+    }
+  };
+  using const_iterator = iter_t<true>;
+  using iterator = iter_t<false>;
+
+  struct delta_leaf_t {
+    enum class op_t : uint8_t {
+      INSERT,
+      UPDATE,
+      REMOVE,
+    } op;
+    std::string key;
+    std::string val;
+
+    void replay(StringKVLeafNodeLayout &l) {
+      switch (op) {
+      case op_t::INSERT: {
+       l.leaf_insert(l.string_lower_bound(key), key, val);
+       break;
+      }
+      case op_t::UPDATE: {
+       auto iter = l.find_string_key(key);
+       assert(iter != l.iter_end());
+       l.leaf_update(iter, key, val);
+       break;
+      }
+      case op_t::REMOVE: {
+       auto iter = l.find_string_key(key);
+       assert(iter != l.iter_end());
+       l.leaf_remove(iter);
+       break;
+      }
+      default:
+       assert(0 == "Impossible");
+      }
+    }
+
+    bool operator==(const delta_leaf_t &rhs) const {
+      return op == rhs.op &&
+        key == rhs.key &&
+        val == rhs.val;
+    }
+  };
+
+public:
+  class delta_leaf_buffer_t {
+    std::vector<delta_leaf_t> buffer;
+  public:
+    bool empty() const {
+      return buffer.empty();
+    }
+    void insert(
+      const std::string &key,
+      const std::string &val) {
+      buffer.push_back(
+       delta_leaf_t{
+         delta_leaf_t::op_t::INSERT,
+         key,
+         val
+       });
+    }
+    void update(
+      const std::string &key,
+      const std::string &val) {
+      buffer.push_back(
+       delta_leaf_t{
+         delta_leaf_t::op_t::UPDATE,
+         key,
+         val
+       });
+    }
+    void remove(std::string key) {
+      buffer.push_back(
+       delta_leaf_t{
+         delta_leaf_t::op_t::REMOVE,
+         key,
+         ""
+       });
+    }
+
+    void replay(StringKVLeafNodeLayout &node) {
+      for (auto &i: buffer) {
+        i.replay(node);
+      }
+    }
+    size_t get_bytes() const {
+      size_t size = 0;
+      for (auto &i: buffer) {
+        size += sizeof(i.op_t) + i.key.size() + i.val.size();
+      }
+      return size;
+    }
+    //copy out
+    void encode(ceph::bufferlist &bl) {
+      using ceph::encode;
+      uint32_t num = buffer.size();
+      encode(num, bl);
+      for (auto &&i: buffer) {
+        encode(i.op, bl);
+        encode(i.key, bl);
+        //bl.append((char*)&(i.key), sizeof(i.key));
+        encode(i.val, bl);
+      }
+      buffer.clear();
+    }
+    //copy in
+    void decode(const ceph::bufferlist &bl) {
+      using ceph::decode;
+      auto p = bl.cbegin();
+      uint32_t num;
+      decode (num, p);
+      while (num--) {
+        delta_leaf_t delta;
+        decode(delta.op, p);
+        decode(delta.key, p);
+        decode(delta.val, p);
+        buffer.push_back(delta);
+      }
+    }
+
+    bool operator==(const delta_leaf_buffer_t &rhs) const {
+      return buffer == rhs.buffer;
+    }
+  };
+
+  /// Insert key -> val at the position of _iter, first recording the
+  /// insertion in recorder when one is supplied.
+  void journal_leaf_insert(
+    const_iterator _iter,
+    const std::string &key,
+    const std::string &val,
+    delta_leaf_buffer_t *recorder) {
+    iterator pos(this, _iter.index);
+    if (recorder != nullptr) {
+      recorder->insert(key, val);
+    }
+    leaf_insert(pos, key, val);
+  }
+
+  /// Replace the entry at _iter with key -> val. When a recorder is
+  /// supplied the change is journaled as a REMOVE of the entry's current
+  /// key followed by an INSERT of (key, val), not as an UPDATE delta.
+  void journal_leaf_update(
+    const_iterator _iter,
+    const std::string &key,
+    const std::string &val,
+    delta_leaf_buffer_t *recorder) {
+    auto iter = iterator(this, _iter.index);
+    if (recorder) {
+      // the old key must be read before leaf_update() rewrites the slot
+      recorder->remove(iter->get_node_val());
+      recorder->insert(key, val);
+    }
+    leaf_update(iter, key, val);
+  }
+
+
+  /// Remove the entry at _iter, first recording the removal of its key in
+  /// recorder when one is supplied.
+  void journal_leaf_remove(
+    const_iterator _iter,
+    delta_leaf_buffer_t *recorder) {
+    iterator victim(this, _iter.index);
+    if (recorder != nullptr) {
+      recorder->remove(victim->get_node_val());
+    }
+    leaf_remove(victim);
+  }
+
+  /// Wrap an existing block; the pointer is stored, not copied or owned.
+  /// Accessors address the block as [buf, buf + BlockSize).
+  StringKVLeafNodeLayout(char *buf) :
+    buf(buf) {}
+
+  /// Const iterator at the first slot (index 0).
+  const_iterator iter_begin() const {
+    return const_iterator(this, 0);
+  }
+
+  /// Const iterator one past the last slot (index == get_size()).
+  const_iterator iter_end() const {
+    return const_iterator(this, get_size());
+  }
+
+  /// Mutable iterator at the first slot (index 0).
+  iterator iter_begin() {
+    return iterator(this, 0);
+  }
+
+  /// Mutable iterator one past the last slot (index == get_size()).
+  iterator iter_end() {
+    return iterator(this, get_size());
+  }
+
+  /// Const iterator at slot off; no bounds check is performed.
+  const_iterator iter_idx(uint16_t off) const {
+    return const_iterator(this, off);
+  }
+
+  const_iterator string_lower_bound(std::string str) const {
+    uint16_t start = 0, end = get_size();
+    while (start != end) {
+      unsigned mid = (start + end) / 2;
+      const_iterator iter(this, mid);
+      std::string s = iter->get_node_val();
+      if (s < str)
+        start = ++mid;
+      if (s > str)
+        end = mid;
+      if (s == str)
+        return iter;
+    }
+    return const_iterator(this, start);
+  }
+
+  /// Mutable flavour: delegates to the const overload and rebuilds a
+  /// mutable iterator at the same index.
+  iterator string_lower_bound(std::string str) {
+    const StringKVLeafNodeLayout &self = *this;
+    const auto found = self.string_lower_bound(str);
+    return iterator(this, found.index);
+  }
+
+  /// Linear scan for the first entry whose key is strictly greater than
+  /// str; returns iter_end() when there is none.
+  const_iterator string_upper_bound(std::string str) const {
+    auto cursor = iter_begin();
+    while (cursor != iter_end()) {
+      std::string cur = cursor->get_node_val();
+      if (cur > str) {
+        return cursor;
+      }
+      ++cursor;
+    }
+    return cursor;
+  }
+
+  /// Mutable flavour: delegates to the const overload and rebuilds a
+  /// mutable iterator at the same index.
+  iterator string_upper_bound(std::string str) {
+    const StringKVLeafNodeLayout &self = *this;
+    const auto found = self.string_upper_bound(str);
+    return iterator(this, found.index);
+  }
+
+  /// Linear scan for the entry whose key equals str; returns iter_end()
+  /// when no entry matches.
+  const_iterator find_string_key(const std::string &str) const {
+    auto cursor = iter_begin();
+    while (cursor != iter_end()) {
+      std::string cur = cursor->get_node_val();
+      if (cur == str) {
+        return cursor;
+      }
+      ++cursor;
+    }
+    return cursor;
+  }
+  /// Mutable flavour: delegates to the const overload and rebuilds a
+  /// mutable iterator at the same index.
+  iterator find_string_key(const std::string &str) {
+    const StringKVLeafNodeLayout &self = *this;
+    const auto found = self.find_string_key(str);
+    return iterator(this, found.index);
+  }
+
+  const_iterator get_split_pivot() const {
+    uint32_t total_size = omap_leaf_key_t(get_node_key_ptr()[get_size()-1]).key_off;
+    uint32_t pivot_size = total_size / 2;
+    uint32_t size = 0;
+    for (auto ite = iter_begin(); ite < iter_end(); ite++) {
+      auto node_key = ite->get_node_key();
+      size += node_key.key_len + node_key.val_len;
+      if (size >= pivot_size){
+        return ite;
+      }
+    }
+    return iter_end();
+  }
+
+  /// Number of entries, decoded from the little-endian counter stored as
+  /// the first field of the block.
+  uint32_t get_size() const {
+    return uint32_t(*layout.template Pointer<0>(buf));
+  }
+
+  /**
+   * set_size
+   *
+   * Store size as the entry count in the first field of the block
+   * (encoded as ceph_le32).
+   */
+  void set_size(uint32_t size) {
+    ceph_le32 encoded;
+    encoded = size;
+    *layout.template Pointer<0>(buf) = encoded;
+  }
+
+  /**
+   * get_meta/set_meta
+   *
+   * Allow a templated Meta value to be stashed inside the layout, stored
+   * in its MetaInt representation as the second field of the block.
+   * Must not be modified after the initial write because it is not
+   * represented in delta_t.
+   */
+  Meta get_meta() const {
+    return Meta(*layout.template Pointer<1>(buf));
+  }
+  /// Convert meta to MetaInt and store it in the second field of the block.
+  void set_meta(const Meta &meta) {
+    MetaInt *slot = layout.template Pointer<1>(buf);
+    *slot = MetaInt(meta);
+  }
+
+  /// Bytes consumed by entries: payload bytes (the last entry's key_off)
+  /// plus one key descriptor per entry. Zero for an empty node.
+  uint32_t used_space() const {
+    const uint32_t count = get_size();
+    if (count == 0) {
+      return 0;
+    }
+    const omap_leaf_key_t last_key(get_node_key_ptr()[count-1]);
+    return last_key.key_off + count * sizeof(omap_leaf_key_le_t);
+  }
+
+  /// Bytes still available for entries: capacity() minus used_space().
+  uint32_t free_space() const {
+    const uint32_t total = capacity();
+    return total - used_space();
+  }
+
+  /// Bytes of the block available for entries: BlockSize minus the bytes
+  /// that precede the key descriptor array.
+  uint32_t capacity() const {
+    char *block_start = reinterpret_cast<char*>(layout.template Pointer<0>(buf));
+    char *keys_start = reinterpret_cast<char*>(layout.template Pointer<2>(buf));
+    return BlockSize - (keys_start - block_start);
+  }
+  /// Address located off bytes before the end of the block.
+  char* from_end(int off) {
+    const auto distance = BlockSize - off;
+    return buf + distance;
+  }
+
+  /// True when an entry with ksize key bytes and vsize value bytes, plus
+  /// its key descriptor, does not fit in the remaining free space.
+  bool is_overflow(size_t ksize, size_t vsize) const {
+    const size_t needed = sizeof(omap_leaf_key_le_t) + ksize + vsize;
+    return free_space() < needed;
+  }
+  /// True when more than half of the capacity is free.
+  bool below_min() const {
+    const uint32_t half = capacity() / 2;
+    return free_space() > half;
+  }
+
+  bool operator==(const StringKVLeafNodeLayout &rhs) const {
+    if (get_size() != rhs.get_size()) {
+      return false;
+    }
+
+    auto iter = iter_begin();
+    auto iter2 = rhs.iter_begin();
+    while (iter != iter_end()) {
+      if (iter->get_node_key() != iter2->get_node_key() ||
+              iter->get_node_val() != iter2->get_node_val() ||
+        iter->get_string_val() != iter2->get_string_val()){
+             return false;
+      }
+      iter++;
+      iter2++;
+    }
+    return true;
+  }
+
+  /**
+   * split_into
+   *
+   * Takes *this and splits its contents into left and right.
+   * Entries before the pivot chosen by get_split_pivot() go to left, the
+   * pivot and everything after it go to right; the metadata is split with
+   * Meta::split_into().
+   *
+   * Returns the key string of the pivot entry, i.e. the first key of right.
+   */
+  std::string split_into(
+    StringKVLeafNodeLayout &left,
+    StringKVLeafNodeLayout &right) const {
+    auto piviter = get_split_pivot();
+    assert (piviter != iter_end());
+
+    // [begin, pivot) -> left
+    left.copy_from_foreign_head(left.iter_begin(), iter_begin(), piviter);
+    left.set_size(piviter - iter_begin());
+
+    // [pivot, end) -> right
+    right.copy_from_foreign_back(right.iter_begin(), piviter, iter_end());
+    right.set_size(iter_end() - piviter);
+
+    auto [lmeta, rmeta] = get_meta().split_into();
+    left.set_meta(lmeta);
+    right.set_meta(rmeta);
+
+    return piviter->get_node_val();
+  }
+
+  /**
+   * merge_from
+   *
+   * Takes two nodes and copies their contents into *this: first all of
+   * left, then all of right appended behind it. The metadata is produced
+   * by Meta::merge_from().
+   *
+   * precondition: left.size() + right.size() < CAPACITY
+   * NOTE(review): the first copy targets iter_end(), so this presumably
+   * expects *this to be empty on entry -- confirm against callers.
+   */
+  void merge_from(
+    const StringKVLeafNodeLayout &left,
+    const StringKVLeafNodeLayout &right)
+  {
+    copy_from_foreign_head(
+      iter_end(),
+      left.iter_begin(),
+      left.iter_end());
+    // size must be updated before appending: the append step reads
+    // iter_end() and the entry preceding it
+    set_size(left.get_size());
+    append_copy_from_foreign_head(
+      iter_end(),
+      right.iter_begin(),
+      right.iter_end());
+    set_size(left.get_size() + right.get_size());
+    set_meta(Meta::merge_from(left.get_meta(), right.get_meta()));
+  }
+
+  /**
+   * balance_into_new_nodes
+   *
+   * Takes the contents of left and right and copies them into
+   * replacement_left and replacement_right such that
+   * the size of replacement_left side just >= 1/2 of the total size (left + right).
+   *
+   * Sizes are payload bytes (key + value strings), taken from the key_off
+   * of each node's last entry, so both left and right must be non-empty.
+   *
+   * Returns the key string of the pivot entry, i.e. the first key placed
+   * in replacement_right.
+   */
+  static std::string balance_into_new_nodes(
+    const StringKVLeafNodeLayout &left,
+    const StringKVLeafNodeLayout &right,
+    StringKVLeafNodeLayout &replacement_left,
+    StringKVLeafNodeLayout &replacement_right)
+  {
+    uint32_t left_size = omap_leaf_key_t(left.get_node_key_ptr()[left.get_size()-1]).key_off;
+    uint32_t right_size = omap_leaf_key_t(right.get_node_key_ptr()[right.get_size()-1]).key_off;
+    uint32_t total = left_size + right_size;
+    uint32_t pivot_size = total / 2;
+    // pivot_idx indexes the logical concatenation left ++ right
+    uint32_t pivot_idx = 0;
+    if (pivot_size < left_size) {
+      // the pivot lies inside left
+      uint32_t size = 0;
+      for (auto ite = left.iter_begin(); ite < left.iter_end(); ite++) {
+        auto node_key = ite->get_node_key();
+        size += node_key.key_len + node_key.val_len;
+        if (size >= pivot_size){
+          pivot_idx = ite.get_index();
+          break;
+        }
+      }
+    } else {
+      // the pivot lies inside right; look for the bytes still missing
+      // after all of left has been counted
+      uint32_t more_size = pivot_size - left_size;
+      uint32_t size = 0;
+      for (auto ite = right.iter_begin(); ite < right.iter_end(); ite++) {
+        auto node_key = ite->get_node_key();
+        size += node_key.key_len + node_key.val_len;
+        if (size >= more_size){
+          pivot_idx = ite.get_index() + left.get_size();
+          break;
+        }
+      }
+    }
+
+    auto replacement_pivot = pivot_idx >= left.get_size() ?
+      right.iter_idx(pivot_idx - left.get_size())->get_node_val() :
+      left.iter_idx(pivot_idx)->get_node_val();
+
+    if (pivot_size < left_size) {
+      // replacement_left  = left[0, pivot)
+      // replacement_right = left[pivot, end) ++ right
+      replacement_left.copy_from_foreign_head(
+        replacement_left.iter_end(),
+        left.iter_begin(),
+        left.iter_idx(pivot_idx));
+      replacement_left.set_size(pivot_idx);
+
+      replacement_right.copy_from_foreign_back(
+        replacement_right.iter_end(),
+        left.iter_idx(pivot_idx),
+        left.iter_end());
+      replacement_right.set_size(left.get_size() - pivot_idx);
+
+      replacement_right.append_copy_from_foreign_head(
+        replacement_right.iter_end(),
+        right.iter_begin(),
+        right.iter_end());
+      replacement_right.set_size(right.get_size() + left.get_size() - pivot_idx);
+    } else {
+      // replacement_left  = left ++ right[0, pivot - left.size)
+      // replacement_right = right[pivot - left.size, end)
+      replacement_left.copy_from_foreign_head(
+        replacement_left.iter_end(),
+        left.iter_begin(),
+        left.iter_end());
+      replacement_left.set_size(left.get_size());
+
+      replacement_left.append_copy_from_foreign_head(
+        replacement_left.iter_end(),
+        right.iter_begin(),
+        right.iter_idx(pivot_idx - left.get_size()));
+      replacement_left.set_size(pivot_idx);
+
+      replacement_right.copy_from_foreign_back(
+        replacement_right.iter_end(),
+        right.iter_idx(pivot_idx - left.get_size()),
+        right.iter_end());
+      replacement_right.set_size(right.get_size() + left.get_size() - pivot_idx);
+    }
+
+    auto [lmeta, rmeta] = Meta::rebalance(
+      left.get_meta(), right.get_meta());
+    replacement_left.set_meta(lmeta);
+    replacement_right.set_meta(rmeta);
+    return replacement_pivot;
+  }
+
+private:
+  /**
+   * leaf_insert
+   *
+   * Insert key -> val at slot iter, shifting the entries from iter onwards
+   * one slot back. Both strings are stored with their terminating '\0',
+   * hence the "+ 1" on every length.
+   *
+   * With VALIDATE_INVARIANTS the key must sort strictly between its
+   * neighbours and the entry must fit into the free space.
+   */
+  void leaf_insert(
+    iterator iter,
+    const std::string &key,
+    const std::string &val) {
+    if (VALIDATE_INVARIANTS) {
+      if (iter != iter_begin()) {
+        assert((iter - 1)->get_node_val() < key);
+      }
+      if (iter != iter_end()) {
+        assert(iter->get_node_val() > key);
+      }
+      assert(is_overflow(key.size() + 1, val.size() + 1) == false);
+    }
+    // Offsets are measured back from the end of the block: key_off points
+    // at the key string, val_off at the value string that follows it.
+    omap_leaf_key_t node_key;
+    if (iter == iter_begin()) {
+      node_key.key_off = key.size() + 1 + val.size() + 1;
+      node_key.key_len = key.size() + 1;
+      node_key.val_off = val.size() + 1;
+      node_key.val_len = val.size() + 1;
+    } else {
+      // stack the new entry right behind the previous entry's payload
+      node_key.key_off = (iter - 1)->get_node_key().key_off + (key.size() + 1 + val.size() + 1);
+      node_key.key_len = key.size() + 1;
+      node_key.val_off = (iter - 1)->get_node_key().key_off + (val.size() + 1);
+      node_key.val_len = val.size() + 1;
+    }
+    // make room: move the payload and descriptors of [iter, end) back
+    if (get_size() != 0 && iter != iter_end())
+      local_move_back(node_key, iter + 1, iter, iter_end());
+
+    // the descriptor must be written before the strings, because
+    // set_node_val/set_string_val locate their target through it
+    iter->set_node_key(node_key);
+    set_size(get_size() + 1);
+    iter->set_node_val(key);
+    iter->set_string_val(val);
+  }
+
+  /**
+   * leaf_update
+   *
+   * Replace the entry at iter by removing it and inserting key -> val at
+   * the same slot index.
+   * NOTE(review): presumably key is the key already stored at iter;
+   * otherwise the slot may not be the correct sorted position -- confirm.
+   */
+  void leaf_update(
+    iterator iter,
+    const std::string &key,
+    const std::string &val) {
+    assert(iter != iter_end());
+    if (VALIDATE_INVARIANTS) {
+      // checked before the old entry is removed, so the space freed by
+      // the removal is not taken into account
+      assert(is_overflow(0, val.size() + 1) == false);
+    }
+    leaf_remove(iter);
+    leaf_insert(iter, key, val);
+  }
+
+  /// Remove the entry at iter: entries after it (if any) are moved one
+  /// slot ahead over it, then the entry count is decremented.
+  void leaf_remove(iterator iter) {
+    assert(iter != iter_end());
+    auto successor = iter + 1;
+    if (successor != iter_end()) {
+      local_move_ahead(iter, successor, iter_end());
+    }
+    set_size(get_size() - 1);
+  }
+
+  /**
+   * get_node_key_ptr
+   *
+   * Pointer to the first element of the key descriptor array (the third
+   * field of the layout).
+   */
+  omap_leaf_key_le_t *get_node_key_ptr() {
+    const auto partial = L::Partial(1, 1, get_size());
+    return partial.template Pointer<2>(buf);
+  }
+  /// Const flavour: read-only pointer to the key descriptor array.
+  const omap_leaf_key_le_t *get_node_key_ptr() const {
+    const auto partial = L::Partial(1, 1, get_size());
+    return partial.template Pointer<2>(buf);
+  }
+
+  /**
+   * copy_from_foreign_head
+   *
+   * Copy from another node begin entries to this node.
+   * [from_src, to_src) is another node entry range.
+   * tgt is this node entry to copy to.
+   * tgt and from_src must be from different nodes.
+   * from_src and to_src must be in the same node.
+   */
+  static void copy_from_foreign_head(
+    iterator tgt,
+    const_iterator from_src,
+    const_iterator to_src) {
+    assert(tgt->node != from_src->node);
+    assert(to_src->node == from_src->node);
+    void* des = tgt.node->from_end((to_src -1)->get_node_key().key_off);
+    void* src = (to_src - 1)->get_node_val_ptr();
+    size_t len = (to_src -1)->get_node_key().key_off;
+    memcpy(des, src, len);
+    memcpy(
+      tgt->get_node_key_ptr(), from_src->get_node_key_ptr(),
+      to_src->get_node_key_ptr() - from_src->get_node_key_ptr());
+  }
+
+  /**
+   * copy_from_foreign_back
+   *
+   * Copy from another node back entries to this node.
+   * [from_src, to_src) is another node entry range.
+   * tgt is this node entry to copy to.
+   * tgt and from_src must be from different nodes.
+   * from_src and to_src must be in the same node.
+   */
+  void copy_from_foreign_back(
+    iterator tgt,
+    const_iterator from_src,
+    const_iterator to_src) {
+    assert(tgt->node != from_src->node);
+    assert(to_src->node == from_src->node);
+    auto offset = from_src.get_index() == 0? 0: (from_src-1)->get_node_key().key_off;
+
+    void* des = tgt.node->from_end((to_src -1)->get_node_key().key_off - offset);
+    void* src = (to_src - 1)->get_node_val_ptr();
+    size_t len = from_src.get_index() == 0? (to_src -1)->get_node_key().key_off:
+                 (from_src-1)->get_node_val_ptr() - (to_src -1)->get_node_val_ptr();
+    memcpy(des, src, len);
+    memcpy(
+      tgt->get_node_key_ptr(), from_src->get_node_key_ptr(),
+      to_src->get_node_key_ptr() - from_src->get_node_key_ptr());
+    if ( from_src.get_index() == 0)
+      return;
+
+    omap_leaf_key_t key = (from_src - 1)->get_node_key();
+    for (auto ite = tgt; ite.get_index() < (tgt.get_index() + to_src.get_index() - from_src.get_index()); ite++) {
+       omap_leaf_key_t node_key = ite->get_node_key();
+       node_key.key_off -= key.key_off;
+       node_key.val_off -= key.key_off;
+       ite->set_node_key(node_key);
+    }
+  }
+
+  /**
+   * append_copy_from_foreign_head
+   *
+   * Append the leading entries of another node behind this node's entries.
+   * [from_src, to_src) is the entry range in the other node.
+   * tgt is the entry of this node to copy to.
+   * tgt and from_src must be from different nodes.
+   * from_src and to_src must be in the same node.
+   *
+   * An empty range is a no-op. The copied key descriptors are rebased by
+   * the key_off of (tgt - 1), i.e. by the payload bytes already present.
+   * NOTE(review): (tgt - 1) is read unconditionally, so this presumably
+   * requires tgt not to be the first slot -- confirm against callers.
+   */
+  void append_copy_from_foreign_head(
+    iterator tgt,
+    const_iterator from_src,
+    const_iterator to_src) {
+    assert(tgt->node != from_src->node);
+    assert(to_src->node == from_src->node);
+    if (from_src == to_src)
+      return;
+
+    // place the source payload directly behind the payload of (tgt - 1)
+    void* des = tgt.node->from_end((to_src -1)->get_node_key().key_off + (tgt - 1)->get_node_key().key_off);
+    void* src = (to_src - 1)->get_node_val_ptr();
+    size_t len = (to_src -1)->get_node_key().key_off;
+    memcpy(des, src, len);
+    memcpy(
+      tgt->get_node_key_ptr(), from_src->get_node_key_ptr(),
+      to_src->get_node_key_ptr() - from_src->get_node_key_ptr());
+    omap_leaf_key_t key = (tgt - 1)->get_node_key();
+    auto end_idx = tgt.get_index() + to_src.get_index() - from_src.get_index();
+    for (auto ite = tgt; ite.get_index() != end_idx; ite++) {
+       omap_leaf_key_t node_key = ite->get_node_key();
+       node_key.key_off += key.key_off;
+       node_key.val_off += key.key_off;
+       ite->set_node_key(node_key);
+    }
+  }
+
+  /**
+   * local_move_back
+   *
+   * move this node entries range [from_src, to_src) back to tgt position.
+   *
+   * tgt, from_src, and to_src must be from the same node.
+   *
+   * key describes the entry being inserted: the payload of the range is
+   * shifted towards the start of the block by key.key_len + key.val_len
+   * bytes, the offsets in the range's descriptors grow by the same amount,
+   * and finally the descriptors themselves are moved to tgt.
+   */
+  static void local_move_back(
+    omap_leaf_key_t key,
+    iterator tgt,
+    iterator from_src,
+    iterator to_src) {
+    assert(tgt->node == from_src->node);
+    assert(to_src->node == from_src->node);
+    void* des = (to_src-1)->get_node_val_ptr() - (key.key_len + key.val_len);
+    void* src = (to_src-1)->get_node_val_ptr();
+    // payload length of the range; when the range starts at slot 0 it
+    // extends up to the end of the block
+    size_t len = from_src.get_index() == 0?
+                 from_src->node->buf + BlockSize - (to_src-1)->get_node_val_ptr():
+                 (from_src-1)->get_node_val_ptr() - (to_src-1)->get_node_val_ptr();
+    memmove(des, src, len);
+    // descriptors are rewritten in place before being moved to tgt
+    for ( auto ite = from_src; ite < to_src; ite++) {
+      omap_leaf_key_t node_key = ite->get_node_key();
+      node_key.key_off += (key.key_len + key.val_len);
+      node_key.val_off += (key.key_len + key.val_len);
+      ite->set_node_key(node_key);
+    }
+    memmove(
+      tgt->get_node_key_ptr(), from_src->get_node_key_ptr(),
+      to_src->get_node_key_ptr() - from_src->get_node_key_ptr());
+  }
+
+  /**
+   * local_move_ahead
+   *
+   * move this node entries range [from_src, to_src) ahead to tgt position.
+   *
+   * tgt, from_src, and to_src must be from the same node.
+   *
+   * The entry currently at tgt is overwritten: the payload of the range is
+   * shifted towards the end of the block by that entry's
+   * key_len + val_len bytes, the offsets in the range's descriptors shrink
+   * by the same amount, and finally the descriptors are moved to tgt.
+   * from_src must not be the first slot.
+   */
+  static void local_move_ahead(
+    iterator tgt,
+    iterator from_src,
+    iterator to_src) {
+    assert(tgt->node == from_src->node);
+    assert(to_src->node == from_src->node);
+    assert(from_src.get_index() != 0);
+    // descriptor of the entry being overwritten; read before it is clobbered
+    omap_leaf_key_t key = tgt->get_node_key();
+    void* des = (to_src - 1)->get_node_val_ptr() + key.key_len + key.val_len;
+    void* src = (to_src - 1)->get_node_val_ptr();
+    size_t len = (from_src - 1)->get_node_val_ptr() - (to_src - 1)->get_node_val_ptr();
+    memmove(des, src, len);
+    // descriptors are rewritten in place before being moved to tgt
+    for ( auto ite = from_src; ite < to_src; ite++) {
+      omap_leaf_key_t node_key = ite->get_node_key();
+      node_key.key_off -= (key.key_len + key.val_len);
+      node_key.val_off -= (key.key_len + key.val_len);
+      ite->set_node_key(node_key);
+    }
+    memmove(
+      tgt->get_node_key_ptr(), from_src->get_node_key_ptr(),
+      to_src->get_node_key_ptr() - from_src->get_node_key_ptr());
+  }
+
+};
+
+}
index ff43b1e515bf9e49d9d87014410b252dcfd83567..e4e52dfd0d2680d6e100d2fd8ad5216884b3936f 100644 (file)
@@ -59,6 +59,10 @@ std::ostream &operator<<(std::ostream &out, extent_types_t t)
     return out << "EXTMAP_LEAF";
   case extent_types_t::ONODE_BLOCK_STAGED:
     return out << "ONODE_BLOCK_STAGED";
+  case extent_types_t::OMAP_INNER:
+    return out << "OMAP_INNER";
+  case extent_types_t::OMAP_LEAF:
+    return out << "OMAP_LEAF";
   case extent_types_t::TEST_BLOCK:
     return out << "TEST_BLOCK";
   case extent_types_t::TEST_BLOCK_PHYSICAL:
index cb8480268e164f4f5d69f87d3cfc9f71fe72de5d..28ffdad18fa627e3f07ca54694e04249ad365933 100644 (file)
@@ -281,7 +281,9 @@ enum class extent_types_t : uint8_t {
   ONODE_BLOCK = 3,
   EXTMAP_INNER = 4,
   EXTMAP_LEAF = 5,
-  ONODE_BLOCK_STAGED = 6,
+  OMAP_INNER = 6,
+  OMAP_LEAF = 7,
+  ONODE_BLOCK_STAGED = 8,
 
   // Test Block Types
   TEST_BLOCK = 0xF0,
index 7b86631e2ca7c0657437e4ee22910c2f461fa099..63bf46e8e4503264ec6bdb8d05e3ac36f8fe49c5 100644 (file)
@@ -168,6 +168,23 @@ TransactionManager::ref_ret TransactionManager::dec_ref(
   });
 }
 
+TransactionManager::refs_ret TransactionManager::dec_ref(
+  Transaction &t,
+  std::list<laddr_t> offsets)
+{
+  return seastar::do_with(std::move(offsets), std::list<unsigned>(),
+      [this, &t] (auto &&offsets, auto &refcnt) {
+      return crimson::do_for_each(offsets.begin(), offsets.end(),
+        [this, &t, &refcnt] (auto &laddr) {
+        return dec_ref(t, laddr).safe_then([&refcnt] (auto ref) {
+          refcnt.push_back(ref);
+        });
+      }).safe_then([&refcnt] {
+        return ref_ertr::make_ready_future<std::list<unsigned>>(std::move(refcnt));
+      });
+    });
+}
+
 TransactionManager::submit_transaction_ertr::future<>
 TransactionManager::submit_transaction(
   TransactionRef t)
index d28fd0b87923cdbf1f76e19764ca62bbe767b978..8258a81a9f7d3e8b5dd735bde4c7bdee363eef0d 100644 (file)
@@ -10,6 +10,7 @@
 #include <functional>
 
 #include <boost/intrusive_ptr.hpp>
+#include <boost/iterator/counting_iterator.hpp>
 #include <boost/smart_ptr/intrusive_ref_counter.hpp>
 
 #include <seastar/core/future.hh>
@@ -176,6 +177,12 @@ public:
     Transaction &t,
     laddr_t offset);
 
+  /// decrement the refcount of each laddr in offsets; yields one resulting refcount per laddr, in order
+  using refs_ret = ref_ertr::future<std::list<unsigned>>;
+  refs_ret dec_ref(
+    Transaction &t,
+    std::list<laddr_t> offsets);
+
   /**
    * alloc_extent
    *
@@ -205,6 +212,35 @@ public:
     });
   }
 
+  /* alloc_extents
+   *
+   * allocates more than one new blocks of type T.
+   */
+   using alloc_extents_ertr = alloc_extent_ertr;
+   template<class T>
+   alloc_extents_ertr::future<std::vector<TCachedExtentRef<T>>>
+   alloc_extents(
+     Transaction &t,
+     laddr_t hint,
+     extent_len_t len,
+     int num) {
+     return seastar::do_with(std::vector<TCachedExtentRef<T>>(),
+       [this, &t, hint, len, num] (auto &extents) {
+       return crimson::do_for_each(
+                       boost::make_counting_iterator(0),
+                       boost::make_counting_iterator(num),
+         [this, &t, len, hint, &extents] (auto i) {
+         return alloc_extent<T>(t, hint, len).safe_then(
+           [&extents](auto &&node) {
+           extents.push_back(node);
+         });
+       }).safe_then([&extents] {
+         return alloc_extents_ertr::make_ready_future
+                <std::vector<TCachedExtentRef<T>>>(std::move(extents));
+       });
+     });
+  }
+
   /**
    * submit_transaction
    *
index 73feebe23c2b737c78836189a7e7800af008b78f..840a59070c7d4aa640d5c5244bf4dfb4f79c0a4e 100644 (file)
@@ -45,4 +45,13 @@ target_link_libraries(
   ${CMAKE_DL_LIBS}
   crimson-seastore)
 
+add_executable(unittest_omap_manager
+  test_omap_manager.cc
+  ../gtest_seastar.cc)
+add_ceph_unittest(unittest_omap_manager)
+target_link_libraries(
+  unittest_omap_manager
+  ${CMAKE_DL_LIBS}
+  crimson-seastore)
+
 add_subdirectory(onode_tree)
diff --git a/src/test/crimson/seastore/test_omap_manager.cc b/src/test/crimson/seastore/test_omap_manager.cc
new file mode 100644 (file)
index 0000000..173d4e6
--- /dev/null
@@ -0,0 +1,604 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "test/crimson/gtest_seastar.h"
+
+#include "test/crimson/seastore/transaction_manager_test_state.h"
+
+#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/segment_manager.h"
+#include "crimson/os/seastore/omap_manager.h"
+
+#include "test/crimson/seastore/test_block.h"
+
+using namespace crimson;
+using namespace crimson::os;
+using namespace crimson::os::seastore;
+using namespace std;
+
+namespace {
+  /// File-local accessor for the logger obtained with ceph_subsys_test.
+  [[maybe_unused]] seastar::logger& logger() {
+    auto &test_logger = crimson::get_logger(ceph_subsys_test);
+    return test_logger;
+  }
+}
+
+struct omap_manager_test_t :
+  public seastar_test_suite_t,
+  TMTestState {
+
+  OMapManagerRef omap_manager;
+
+  omap_manager_test_t() {}
+
+  seastar::future<> set_up_fut() final {
+    return tm_setup().then([this] {
+      omap_manager = omap_manager::create_omap_manager(*tm);
+      return seastar::now();
+    });
+  }
+
+  seastar::future<> tear_down_fut() final {
+    return tm_teardown().then([this] {
+      omap_manager.reset();
+      return seastar::now();
+    });
+  }
+
+  using test_omap_t = std::map<std::string, std::string>;
+  test_omap_t test_omap_mappings;
+
+  bool set_key(
+    omap_root_t &omap_root,
+    Transaction &t,
+    string &key,
+    string &val) {
+    auto ret = omap_manager->omap_set_key(omap_root, t, key, val).unsafe_get0();
+    EXPECT_EQ(ret, true);
+    test_omap_mappings[key] = val;
+    return ret;
+  }
+
+  std::pair<string, string> get_value(
+    omap_root_t &omap_root,
+    Transaction &t,
+    const string &key) {
+    auto ret = omap_manager->omap_get_value(omap_root, t, key).unsafe_get0();
+    EXPECT_EQ(key, ret.first);
+    return ret;
+  }
+
+  bool rm_key(
+    omap_root_t &omap_root,
+    Transaction &t,
+    const string &key) {
+    auto ret = omap_manager->omap_rm_key(omap_root, t, key).unsafe_get0();
+    EXPECT_EQ(ret, true);
+    test_omap_mappings.erase(test_omap_mappings.find(key));
+    return ret;
+  }
+
+  list_keys_result_t list_keys(
+    omap_root_t &omap_root,
+    Transaction &t,
+    std::string &start,
+    size_t max = MAX_SIZE) {
+    auto ret = omap_manager->omap_list_keys(omap_root, t, start, max).unsafe_get0();
+    if (start == "" && max == MAX_SIZE) {
+      EXPECT_EQ(test_omap_mappings.size(), ret.keys.size());
+      for ( auto &i : ret.keys) {
+        auto it = test_omap_mappings.find(i);
+        EXPECT_NE(it, test_omap_mappings.end());
+        EXPECT_EQ(i, it->first);
+      }
+    } else {
+      size_t i =0;
+      auto it = test_omap_mappings.find(start);
+      for (; it != test_omap_mappings.end() && i < max; it++) {
+        EXPECT_EQ(ret.keys[i], it->first);
+        i++;
+      }
+      if (it == test_omap_mappings.end()) {
+        EXPECT_EQ(ret.next, "");
+      } else {
+        EXPECT_EQ(ret.keys.size(), max);
+        EXPECT_EQ(ret.next, it->first);
+      }
+    }
+    return ret;
+  }
+
+  list_kvs_result_t list(
+    omap_root_t &omap_root,
+    Transaction &t,
+    std::string &start,
+    size_t max = MAX_SIZE) {
+    auto ret = omap_manager->omap_list(omap_root, t, start, max).unsafe_get0();
+    if (start == "" && max == MAX_SIZE) {
+      EXPECT_EQ(test_omap_mappings.size(), ret.kvs.size());
+      for ( auto &i : ret.kvs) {
+        auto it = test_omap_mappings.find(i.first);
+        EXPECT_NE(it, test_omap_mappings.end());
+        EXPECT_EQ(i.second, it->second);
+      }
+    } else {
+      size_t i = 0;
+      auto it = test_omap_mappings.find(start);
+      for (; it != test_omap_mappings.end() && i < max; it++) {
+        EXPECT_EQ(ret.kvs[i].first, it->first);
+        i++;
+      }
+      if (it == test_omap_mappings.end()) {
+        EXPECT_EQ(ret.next, "");
+      } else {
+        EXPECT_EQ(ret.kvs.size(), max);
+        EXPECT_EQ(ret.next, it->first);
+      }
+    }
+
+    return ret;
+  }
+
+  void clear(
+    omap_root_t &omap_root,
+    Transaction &t) {
+    omap_manager->omap_clear(omap_root, t).unsafe_get0();
+    EXPECT_EQ(omap_root.omap_root_laddr, L_ADDR_NULL);
+  }
+
+  void check_mappings(omap_root_t &omap_root, Transaction &t) {
+    for (const auto &i: test_omap_mappings){
+      auto ret = get_value(omap_root, t, i.first);
+      EXPECT_EQ(i.first, ret.first);
+      EXPECT_EQ(i.second, ret.second);
+    }
+  }
+
+  void check_mappings(omap_root_t &omap_root) {
+    auto t = tm->create_transaction();
+    check_mappings(omap_root, *t);
+  }
+
+  void replay() {
+    logger().debug("{}: begin", __func__);
+    tm->close().unsafe_get();
+    destroy();
+    static_cast<segment_manager::EphemeralSegmentManager*>(&*segment_manager)->remount();
+    init();
+    tm->mount().unsafe_get();
+    omap_manager = omap_manager::create_omap_manager(*tm);
+    logger().debug("{}: end", __func__);
+  }
+};
+
+char* rand_string(char* str, const int len)
+{
+  int i;
+  for (i = 0; i < len; ++i) {
+    switch (rand() % 3) {
+      case 1:
+        str[i] = 'A' + rand() % 26;
+        break;
+      case 2:
+        str[i] = 'a' +rand() % 26;
+        break;
+      case 0:
+        str[i] = '0' + rand() % 10;
+        break;
+    }
+  }
+  str[len] = '\0';
+  return str;
+}
+
+TEST_F(omap_manager_test_t, basic)
+{
+  run_async([this] {
+    omap_root_t omap_root(0, L_ADDR_NULL);
+    {
+      auto t = tm->create_transaction();
+      omap_root = omap_manager->initialize_omap(*t).unsafe_get0();
+      tm->submit_transaction(std::move(t)).unsafe_get();
+    }
+
+    string key = "owner";
+    string val = "test";
+    {
+      auto t = tm->create_transaction();
+      logger().debug("first transaction");
+      [[maybe_unused]] auto setret = set_key(omap_root, *t, key, val);
+      [[maybe_unused]] auto getret = get_value(omap_root, *t, key);
+      tm->submit_transaction(std::move(t)).unsafe_get();
+    }
+    {
+      auto t = tm->create_transaction();
+      logger().debug("second transaction");
+      [[maybe_unused]] auto getret = get_value(omap_root, *t, key);
+      [[maybe_unused]] auto rmret = rm_key(omap_root, *t, key);
+      [[maybe_unused]] auto getret2 = get_value(omap_root, *t, key);
+      EXPECT_EQ(getret2.second, "");
+      tm->submit_transaction(std::move(t)).unsafe_get();
+    }
+    {
+      auto t = tm->create_transaction();
+      logger().debug("third transaction");
+      [[maybe_unused]] auto getret = get_value(omap_root, *t, key);
+      EXPECT_EQ(getret.second, "");
+      tm->submit_transaction(std::move(t)).unsafe_get();
+    }
+  });
+}
+
+TEST_F(omap_manager_test_t, force_leafnode_split)
+{
+  run_async([this] {
+    omap_root_t omap_root(0, L_ADDR_NULL);
+    {
+      auto t = tm->create_transaction();
+      omap_root = omap_manager->initialize_omap(*t).unsafe_get0();
+      tm->submit_transaction(std::move(t)).unsafe_get();
+    }
+    const int STR_LEN = 50;
+    char str[STR_LEN + 1];
+    for (unsigned i = 0; i < 40; i++) {
+      auto t = tm->create_transaction();
+      logger().debug("opened transaction");
+      for (unsigned j = 0; j < 10; ++j) {
+        string key(rand_string(str, rand() % STR_LEN));
+        string val(rand_string(str, rand() % STR_LEN));
+        [[maybe_unused]] auto addref = set_key(omap_root, *t, key, val);
+        if ((i % 20 == 0) && (j == 5)) {
+          check_mappings(omap_root, *t);
+        }
+      }
+      logger().debug("force split submit transaction i = {}", i);
+      tm->submit_transaction(std::move(t)).unsafe_get();
+      check_mappings(omap_root);
+    }
+  });
+}
+
+TEST_F(omap_manager_test_t, force_leafnode_split_merge)
+{
+  run_async([this] {
+    omap_root_t omap_root(0, L_ADDR_NULL);
+    {
+      auto t = tm->create_transaction();
+      omap_root = omap_manager->initialize_omap(*t).unsafe_get0();
+      tm->submit_transaction(std::move(t)).unsafe_get();
+    }
+    const int STR_LEN = 50;
+    char str[STR_LEN + 1];
+
+    for (unsigned i = 0; i < 80; i++) {
+      auto t = tm->create_transaction();
+      logger().debug("opened split_merge transaction");
+      for (unsigned j = 0; j < 5; ++j) {
+        string key(rand_string(str, rand() % STR_LEN));
+        string val(rand_string(str, rand() % STR_LEN));
+        [[maybe_unused]] auto addref = set_key(omap_root, *t, key, val);
+        if ((i % 10 == 0) && (j == 3)) {
+          check_mappings(omap_root, *t);
+        }
+      }
+      logger().debug("submitting transaction");
+      tm->submit_transaction(std::move(t)).unsafe_get();
+      if (i % 50 == 0) {
+        check_mappings(omap_root);
+      }
+    }
+    auto t = tm->create_transaction();
+    int i = 0;
+    for (auto &e: test_omap_mappings) {
+      if (i % 3 != 0) {
+        [[maybe_unused]] auto rmref= rm_key(omap_root, *t, e.first);
+      }
+
+      if (i % 10 == 0) {
+        logger().debug("submitting transaction i= {}", i);
+        tm->submit_transaction(std::move(t)).unsafe_get();
+        t = tm->create_transaction();
+      }
+      if (i % 100 == 0) {
+        logger().debug("check_mappings  i= {}", i);
+        check_mappings(omap_root, *t);
+        check_mappings(omap_root);
+      }
+      i++;
+    }
+    logger().debug("finally submitting transaction ");
+    tm->submit_transaction(std::move(t)).unsafe_get();
+  });
+}
+
+TEST_F(omap_manager_test_t, force_leafnode_split_merge_fullandbalanced)
+{
+  run_async([this] {
+    omap_root_t omap_root(0, L_ADDR_NULL);
+    {
+      auto t = tm->create_transaction();
+      omap_root = omap_manager->initialize_omap(*t).unsafe_get0();
+      tm->submit_transaction(std::move(t)).unsafe_get();
+    }
+    const int STR_LEN = 50;
+    char str[STR_LEN + 1];
+
+    for (unsigned i = 0; i < 50; i++) {
+      auto t = tm->create_transaction();
+      logger().debug("opened split_merge transaction");
+      for (unsigned j = 0; j < 5; ++j) {
+        string key(rand_string(str, rand() % STR_LEN));
+        string val(rand_string(str, rand() % STR_LEN));
+        [[maybe_unused]] auto addref = set_key(omap_root, *t, key, val);
+        if ((i % 10 == 0) && (j == 3)) {
+          check_mappings(omap_root, *t);
+        }
+      }
+      logger().debug("submitting transaction");
+      tm->submit_transaction(std::move(t)).unsafe_get();
+      if (i % 50 == 0) {
+        check_mappings(omap_root);
+      }
+    }
+    auto t = tm->create_transaction();
+    int i = 0;
+    for (auto &e: test_omap_mappings) {
+      if (30 < i && i < 100) {
+        auto val = e;
+        [[maybe_unused]] auto rmref= rm_key(omap_root, *t, e.first);
+      }
+
+      if (i % 10 == 0) {
+      logger().debug("submitting transaction i= {}", i);
+        tm->submit_transaction(std::move(t)).unsafe_get();
+        t = tm->create_transaction();
+      }
+      if (i % 50 == 0) {
+      logger().debug("check_mappings  i= {}", i);
+        check_mappings(omap_root, *t);
+        check_mappings(omap_root);
+      }
+      i++;
+      if (i == 100)
+ break;
+    }
+    logger().debug("finally submitting transaction ");
+    tm->submit_transaction(std::move(t)).unsafe_get();
+    check_mappings(omap_root);
+  });
+}
+
+
+TEST_F(omap_manager_test_t, force_split_listkeys_list_clear)
+{
+  run_async([this] {
+    omap_root_t omap_root(0, L_ADDR_NULL);
+    {
+      auto t = tm->create_transaction();
+      omap_root = omap_manager->initialize_omap(*t).unsafe_get0();
+      tm->submit_transaction(std::move(t)).unsafe_get();
+    }
+    const int STR_LEN = 300;
+    char str[STR_LEN + 1];
+    string temp;
+    for (unsigned i = 0; i < 40; i++) {
+      auto t = tm->create_transaction();
+      logger().debug("opened transaction");
+      for (unsigned j = 0; j < 10; ++j) {
+        string key(rand_string(str, rand() % STR_LEN));
+        string val(rand_string(str, rand() % STR_LEN));
+        [[maybe_unused]] auto addref = set_key(omap_root, *t, key, val);
+        if (i == 10)
+          temp = key;
+        if ((i % 20 == 0) && (j == 5)) {
+          check_mappings(omap_root, *t);
+        }
+      }
+      logger().debug("force split submit transaction i = {}", i);
+      tm->submit_transaction(std::move(t)).unsafe_get();
+      check_mappings(omap_root);
+    }
+    std::string empty = "";
+    auto t = tm->create_transaction();
+    [[maybe_unused]] auto keys = list_keys(omap_root, *t, empty);
+    tm->submit_transaction(std::move(t)).unsafe_get();
+
+    t = tm->create_transaction();
+    keys = list_keys(omap_root, *t, temp, 100);
+    tm->submit_transaction(std::move(t)).unsafe_get();
+
+    t = tm->create_transaction();
+    [[maybe_unused]] auto ls = list(omap_root, *t, empty);
+    tm->submit_transaction(std::move(t)).unsafe_get();
+
+    t = tm->create_transaction();
+    ls = list(omap_root, *t, temp, 100);
+    tm->submit_transaction(std::move(t)).unsafe_get();
+
+    t = tm->create_transaction();
+    clear(omap_root, *t);
+    tm->submit_transaction(std::move(t)).unsafe_get();
+
+  });
+}
+
+TEST_F(omap_manager_test_t, internal_force_split)
+{
+  run_async([this] {
+    omap_root_t omap_root(0, L_ADDR_NULL);
+    {
+      auto t = tm->create_transaction();
+      omap_root = omap_manager->initialize_omap(*t).unsafe_get0();
+      tm->submit_transaction(std::move(t)).unsafe_get();
+    }
+    const int STR_LEN = 300;
+    char str[STR_LEN + 1];
+    for (unsigned i = 0; i < 10; i++) {
+      logger().debug("opened split transaction");
+      auto t = tm->create_transaction();
+
+      for (unsigned j = 0; j < 80; ++j) {
+        string key(rand_string(str, rand() % STR_LEN));
+        string val(rand_string(str, rand() % STR_LEN));
+        [[maybe_unused]] auto addref = set_key(omap_root, *t, key, val);
+        if ((i % 2 == 0) && (j % 50 == 0)) {
+          check_mappings(omap_root, *t);
+        }
+      }
+      logger().debug("submitting transaction i = {}", i);
+      tm->submit_transaction(std::move(t)).unsafe_get();
+    }
+    check_mappings(omap_root);
+  });
+}
+
+TEST_F(omap_manager_test_t, internal_force_merge_fullandbalanced)
+{
+  run_async([this] {
+    omap_root_t omap_root(0, L_ADDR_NULL);
+    {
+      auto t = tm->create_transaction();
+      omap_root = omap_manager->initialize_omap(*t).unsafe_get0();
+      tm->submit_transaction(std::move(t)).unsafe_get();
+    }
+    const int STR_LEN = 300;
+    char str[STR_LEN + 1];
+
+    for (unsigned i = 0; i < 8; i++) {
+      logger().debug("opened split transaction");
+      auto t = tm->create_transaction();
+
+      for (unsigned j = 0; j < 80; ++j) {
+        string key(rand_string(str, rand() % STR_LEN));
+        string val(rand_string(str, rand() % STR_LEN));
+        [[maybe_unused]] auto addref = set_key(omap_root, *t, key, val);
+        if ((i % 2 == 0) && (j % 50 == 0)) {
+          check_mappings(omap_root, *t);
+        }
+      }
+      logger().debug("submitting transaction");
+      tm->submit_transaction(std::move(t)).unsafe_get();
+    }
+    auto t = tm->create_transaction();
+    int i = 0;
+    for (auto &e: test_omap_mappings) {
+        auto val = e;
+        [[maybe_unused]] auto rmref= rm_key(omap_root, *t, e.first);
+
+      if (i % 10 == 0) {
+      logger().debug("submitting transaction i= {}", i);
+        tm->submit_transaction(std::move(t)).unsafe_get();
+        t = tm->create_transaction();
+      }
+      if (i % 50 == 0) {
+      logger().debug("check_mappings  i= {}", i);
+        check_mappings(omap_root, *t);
+        check_mappings(omap_root);
+      }
+      i++;
+    }
+    logger().debug("finally submitting transaction ");
+    tm->submit_transaction(std::move(t)).unsafe_get();
+    check_mappings(omap_root);
+  });
+}
+
+TEST_F(omap_manager_test_t, replay)
+{
+  run_async([this] {
+    omap_root_t omap_root(0, L_ADDR_NULL);
+    {
+      auto t = tm->create_transaction();
+      omap_root = omap_manager->initialize_omap(*t).unsafe_get0();
+      tm->submit_transaction(std::move(t)).unsafe_get();
+      replay();
+    }
+    const int STR_LEN = 300;
+    char str[STR_LEN + 1];
+
+    for (unsigned i = 0; i < 8; i++) {
+      logger().debug("opened split transaction");
+      auto t = tm->create_transaction();
+
+      for (unsigned j = 0; j < 80; ++j) {
+        string key(rand_string(str, rand() % STR_LEN));
+        string val(rand_string(str, rand() % STR_LEN));
+        [[maybe_unused]] auto addref = set_key(omap_root, *t, key, val);
+        if ((i % 2 == 0) && (j % 50 == 0)) {
+          check_mappings(omap_root, *t);
+        }
+      }
+      logger().debug("submitting transaction i = {}", i);
+      tm->submit_transaction(std::move(t)).unsafe_get();
+    }
+    replay();
+    check_mappings(omap_root);
+
+    auto t = tm->create_transaction();
+    int i = 0;
+    for (auto &e: test_omap_mappings) {
+        auto val = e;
+        [[maybe_unused]] auto rmref= rm_key(omap_root, *t, e.first);
+
+      if (i % 10 == 0) {
+      logger().debug("submitting transaction i= {}", i);
+        tm->submit_transaction(std::move(t)).unsafe_get();
+        replay();
+        t = tm->create_transaction();
+      }
+      if (i % 50 == 0) {
+      logger().debug("check_mappings  i= {}", i);
+        check_mappings(omap_root, *t);
+        check_mappings(omap_root);
+      }
+      i++;
+    }
+    logger().debug("finally submitting transaction ");
+    tm->submit_transaction(std::move(t)).unsafe_get();
+    replay();
+    check_mappings(omap_root);
+  });
+}
+
+
+TEST_F(omap_manager_test_t, internal_force_split_to_root)
+{
+  run_async([this] {
+    omap_root_t omap_root(0, L_ADDR_NULL);
+    {
+      auto t = tm->create_transaction();
+      omap_root = omap_manager->initialize_omap(*t).unsafe_get0();
+      tm->submit_transaction(std::move(t)).unsafe_get();
+    }
+    const int STR_LEN = 300;
+    char str[STR_LEN + 1];
+
+    logger().debug("set big keys");
+    for (unsigned i = 0; i < 53; i++) {
+      auto t = tm->create_transaction();
+
+      for (unsigned j = 0; j < 8; ++j) {
+        string key(rand_string(str, STR_LEN));
+        string val(rand_string(str, STR_LEN));
+        [[maybe_unused]] auto addref = set_key(omap_root, *t, key, val);
+      }
+      logger().debug("submitting transaction i = {}", i);
+      tm->submit_transaction(std::move(t)).unsafe_get();
+    }
+     logger().debug("set small keys");
+     const int STR_LEN_2 = 100;
+     char str_2[STR_LEN_2 + 1];
+     for (unsigned i = 0; i < 100; i++) {
+       auto t = tm->create_transaction();
+
+       for (unsigned j = 0; j < 8; ++j) {
+         string key(rand_string(str_2, STR_LEN_2));
+         string val(rand_string(str_2, STR_LEN_2));
+         [[maybe_unused]] auto addref = set_key(omap_root, *t, key, val);
+       }
+      logger().debug("submitting transaction last");
+      tm->submit_transaction(std::move(t)).unsafe_get();
+     }
+    check_mappings(omap_root);
+  });
+}