]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
crimson/os/seastore/object_data_handler: add logic for handling object data
authorSamuel Just <sjust@redhat.com>
Sat, 10 Apr 2021 01:04:59 +0000 (18:04 -0700)
committerSamuel Just <sjust@redhat.com>
Sun, 18 Apr 2021 07:36:03 +0000 (00:36 -0700)
Signed-off-by: Samuel Just <sjust@redhat.com>
src/crimson/os/seastore/CMakeLists.txt
src/crimson/os/seastore/cache.cc
src/crimson/os/seastore/object_data_handler.cc [new file with mode: 0644]
src/crimson/os/seastore/object_data_handler.h [new file with mode: 0644]
src/crimson/os/seastore/onode.h
src/crimson/os/seastore/seastore_types.cc
src/crimson/os/seastore/seastore_types.h
src/test/crimson/seastore/CMakeLists.txt
src/test/crimson/seastore/test_object_data_handler.cc [new file with mode: 0644]

index a28fd55131be16f0ce1c7c2dd31c0d40416a2ef1..9b50b345adf4e0f85e3e63c354b0927cbfa13f17 100644 (file)
@@ -30,6 +30,7 @@ add_library(crimson-seastore STATIC
   collection_manager.cc
   collection_manager/flat_collection_manager.cc
   collection_manager/collection_flat_node.cc
+  object_data_handler.cc
   seastore.cc
   ../../../test/crimson/seastore/test_block.cc
   ${PROJECT_SOURCE_DIR}/src/os/Transaction.cc
index bcf2e5165e0222a189058e3cf9ad5ba3e66b73c4..4745b7d14f1592e56ce0056524d1d4d5a1e66ee9 100644 (file)
@@ -8,6 +8,7 @@
 #include "crimson/os/seastore/collection_manager/collection_flat_node.h"
 #include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h"
 #include "crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h"
+#include "crimson/os/seastore/object_data_handler.h"
 #include "crimson/os/seastore/collection_manager/collection_flat_node.h"
 #include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h"
 #include "test/crimson/seastore/test_block.h"
@@ -149,6 +150,8 @@ CachedExtentRef Cache::alloc_new_extent_by_type(
     return alloc_new_extent<omap_manager::OMapLeafNode>(t, length);
   case extent_types_t::COLL_BLOCK:
     return alloc_new_extent<collection_manager::CollectionNode>(t, length);
+  case extent_types_t::OBJECT_DATA_BLOCK:
+    return alloc_new_extent<ObjectDataBlock>(t, length);
   case extent_types_t::TEST_BLOCK:
     return alloc_new_extent<TestBlock>(t, length);
   case extent_types_t::TEST_BLOCK_PHYSICAL:
@@ -544,6 +547,11 @@ Cache::get_extent_ertr::future<CachedExtentRef> Cache::get_extent_by_type(
       ).safe_then([](auto extent) {
        return CachedExtentRef(extent.detach(), false /* add_ref */);
       });
+    case extent_types_t::OBJECT_DATA_BLOCK:
+      return get_extent<ObjectDataBlock>(offset, length
+      ).safe_then([](auto extent) {
+       return CachedExtentRef(extent.detach(), false /* add_ref */);
+      });
     case extent_types_t::TEST_BLOCK:
       return get_extent<TestBlock>(offset, length
       ).safe_then([](auto extent) {
diff --git a/src/crimson/os/seastore/object_data_handler.cc b/src/crimson/os/seastore/object_data_handler.cc
new file mode 100644 (file)
index 0000000..59f2f54
--- /dev/null
@@ -0,0 +1,536 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <utility>
+#include <functional>
+
+#include "crimson/common/log.h"
+
+#include "crimson/os/seastore/object_data_handler.h"
+
+namespace {
+  /// File-local accessor for the logger obtained from
+  /// crimson::get_logger(ceph_subsys_filestore).
+  seastar::logger& logger() {
+    auto &subsys_logger = crimson::get_logger(ceph_subsys_filestore);
+    return subsys_logger;
+  }
+}
+
+namespace crimson::os::seastore {
+
+/**
+ * MAX_OBJECT_SIZE
+ *
+ * For now, we allocate a fixed region of laddr space of size MAX_OBJECT_SIZE
+ * for any object.  In the future, once we have the ability to remap logical
+ * mappings (necessary for clone), we'll add the ability to grow and shrink
+ * these regions and remove this assumption.
+ */
+static constexpr extent_len_t MAX_OBJECT_SIZE = 16<<20;
+/// Asserts that x is a multiple of ctx.tm.get_block_size().  The expansion
+/// names `ctx` directly, so a context_t called ctx must be in scope (or
+/// captured) wherever this macro is used.
+#define assert_aligned(x) ceph_assert(((x)%ctx.tm.get_block_size()) == 0)
+
+using context_t = ObjectDataHandler::context_t;
+/// Error set used by the file-local helpers below; alias of write_ertr.
+using get_ertr = ObjectDataHandler::write_ertr;
+
+/**
+ * read_pin
+ *
+ * Resolves pin to its ObjectDataBlock extent through
+ * ctx.tm.pin_to_extent within transaction ctx.t.  Errors belonging to
+ * get_ertr are passed on to the caller; anything else hits assert_all.
+ */
+auto read_pin(
+  context_t ctx,
+  LBAPinRef pin) {
+  return ctx.tm.pin_to_extent<ObjectDataBlock>(
+    ctx.t,
+    std::move(pin)
+  ).handle_error(
+    get_ertr::pass_further{},
+    crimson::ct_error::assert_all{ "read_pin: invalid error" }
+  );
+}
+
+/**
+ * extent_to_write_t
+ *
+ * Encapsulates extents to be written out using do_insertions.
+ * Indicates a zero extent or a data extent based on whether
+ * to_write is populated: with to_write set, do_insertions allocates
+ * an extent at addr and copies to_write into it; with to_write unset,
+ * do_insertions reserves [addr, addr + len) instead.
+ */
+struct extent_to_write_t {
+  laddr_t addr = L_ADDR_NULL;
+  /// Length in bytes.  Zero-initialized so that a default constructed
+  /// instance never carries an indeterminate value.
+  extent_len_t len = 0;
+  std::optional<bufferlist> to_write;
+
+  extent_to_write_t() = default;
+  extent_to_write_t(const extent_to_write_t &) = default;
+  extent_to_write_t(extent_to_write_t &&) = default;
+
+  /// Data extent at addr holding to_write; len is to_write.length().
+  /// len is declared (and therefore initialized) before the to_write
+  /// member, so reading the parameter's length precedes the move.
+  extent_to_write_t(laddr_t addr, bufferlist to_write)
+    : addr(addr), len(to_write.length()), to_write(std::move(to_write)) {}
+
+  /// Zero extent covering [addr, addr + len)
+  extent_to_write_t(laddr_t addr, extent_len_t len)
+    : addr(addr), len(len) {}
+};
+using extent_to_write_list_t = std::list<extent_to_write_t>;
+
+/**
+ * do_removals
+ *
+ * Removes extents/mappings in pins by calling ctx.tm.dec_ref on the
+ * laddr of each pin in turn.  pins is only borrowed; the callers in this
+ * file keep it alive via seastar::do_with until the chain completes.
+ */
+ObjectDataHandler::write_ret do_removals(
+  context_t ctx,
+  lba_pin_list_t &pins)
+{
+  return crimson::do_for_each(
+    pins.begin(),
+    pins.end(),
+    [ctx](auto &pin) {
+      return ctx.tm.dec_ref(
+       ctx.t,
+       pin->get_laddr()
+      ).safe_then(
+       // value produced by dec_ref is not needed here
+       [](auto){},
+       ObjectDataHandler::write_ertr::pass_further{},
+       crimson::ct_error::assert_all{
+         "object_data_handler::do_removals invalid error"
+       }
+      );
+    });
+}
+
+/**
+ * do_insertions
+ *
+ * Creates zero/data extents in to_write, one entry at a time.
+ *
+ * Entries with to_write set get an ObjectDataBlock allocated at
+ * region.addr and the buffer contents copied into it; entries without
+ * it are handed to ctx.tm.reserve_region.  In both cases the resulting
+ * address and length are asserted to match the request.
+ */
+ObjectDataHandler::write_ret do_insertions(
+  context_t ctx,
+  extent_to_write_list_t &to_write)
+{
+  return crimson::do_for_each(
+    to_write.begin(),
+    to_write.end(),
+    [ctx](auto &region) {
+      if (region.to_write) {
+       // data extent: address and length must both be block aligned
+       assert_aligned(region.addr);
+       assert_aligned(region.len);
+       ceph_assert(region.len == region.to_write->length());
+       return ctx.tm.alloc_extent<ObjectDataBlock>(
+         ctx.t,
+         region.addr,
+         region.len
+       ).safe_then([ctx, &region](auto extent) {
+         // log the mismatch before the assert below fires
+         if (extent->get_laddr() != region.addr) {
+           logger().debug(
+             "object_data_handler::do_insertions alloc got addr {},"
+             " should have been {}",
+             extent->get_laddr(),
+             region.addr);
+         }
+         ceph_assert(extent->get_laddr() == region.addr);
+         ceph_assert(extent->get_length() == region.len);
+         // copy region.len bytes from the bufferlist into the new extent
+         auto iter = region.to_write->cbegin();
+         iter.copy(region.len, extent->get_bptr().c_str());
+         return ObjectDataHandler::write_ertr::now();
+       });
+      } else {
+       // zero extent: only reserve the range
+       return ctx.tm.reserve_region(
+         ctx.t,
+         region.addr,
+         region.len
+       ).safe_then([&region](auto pin) {
+         ceph_assert(pin->get_length() == region.len);
+         ceph_assert(pin->get_laddr() == region.addr);
+         return ObjectDataHandler::write_ertr::now();
+       });
+      }
+    });
+}
+
+/**
+ * split_pin_left
+ *
+ * Splits the passed pin returning aligned extent to be rewritten
+ * to the left (if a zero extent), tail to be prepended to write
+ * beginning at offset.  See below for details.
+ *
+ * The result is a pair:
+ *  - first: zero extent covering [pin start, aligned offset), set only
+ *    for a zero pin when that range is non-empty
+ *  - second: bytes to prepend to the write, unset when offset is
+ *    exactly the start of the pin
+ */
+using split_ret_bare = std::pair<
+  std::optional<extent_to_write_t>,
+  std::optional<bufferptr>>;
+using split_ret = get_ertr::future<split_ret_bare>;
+split_ret split_pin_left(context_t ctx, LBAPinRef &pin, laddr_t offset)
+{
+  const auto pin_offset = pin->get_laddr();
+  assert_aligned(pin_offset);
+  ceph_assert(offset >= pin_offset);
+  if (offset == pin_offset) {
+    // Aligned, no tail and no extra extent
+    return get_ertr::make_ready_future<split_ret_bare>(
+      std::nullopt,
+      std::nullopt);
+  } else if (pin->get_paddr().is_zero()) {
+    /* Zero extent unaligned, return largest aligned zero extent to
+     * the left and the gap between aligned_offset and offset to prepend. */
+    auto aligned_offset = p2align(offset, (uint64_t)ctx.tm.get_block_size());
+    assert_aligned(aligned_offset);
+    ceph_assert(aligned_offset <= offset);
+    auto zero_extent_len = aligned_offset - pin_offset;
+    assert_aligned(zero_extent_len);
+    auto zero_prepend_len = offset - aligned_offset;
+    return get_ertr::make_ready_future<split_ret_bare>(
+      (zero_extent_len == 0
+       ? std::nullopt
+       : std::make_optional(extent_to_write_t(pin_offset, zero_extent_len))),
+      // buffer of zero_prepend_len bytes created with fill value 0
+      bufferptr(ceph::buffer::create(zero_prepend_len, 0))
+    );
+  } else {
+    // Data, return up to offset to prepend
+    auto to_prepend = offset - pin->get_laddr();
+    return read_pin(ctx, pin->duplicate()
+    ).safe_then([to_prepend](auto extent) {
+      return get_ertr::make_ready_future<split_ret_bare>(
+       std::nullopt,
+       // first to_prepend bytes of the existing extent
+       bufferptr(extent->get_bptr(), 0, to_prepend));
+    });
+  }
+};
+
+/**
+ * split_pin_right
+ *
+ * Reverse of split_pin_left: splits pin at end, the first address past
+ * the write.
+ *
+ * The result is a pair:
+ *  - first: zero extent covering [aligned end, pin end), set only for
+ *    a zero pin when that range is non-empty
+ *  - second: bytes to append to the write, unset when end is exactly
+ *    the end of the pin
+ */
+split_ret split_pin_right(context_t ctx, LBAPinRef &pin, laddr_t end)
+{
+  const auto pin_begin = pin->get_laddr();
+  const auto pin_end = pin->get_laddr() + pin->get_length();
+  assert_aligned(pin_end);
+  ceph_assert(pin_end >= end);
+  if (end == pin_end) {
+    // Aligned, nothing to append and no extra extent
+    return get_ertr::make_ready_future<split_ret_bare>(
+      std::nullopt,
+      std::nullopt);
+  } else if (pin->get_paddr().is_zero()) {
+    // Zero extent: pad the write with zeroes up to the next block
+    // boundary and keep the remainder as a zero extent
+    auto aligned_end = p2roundup(end, (uint64_t)ctx.tm.get_block_size());
+    assert_aligned(aligned_end);
+    ceph_assert(aligned_end >= end);
+    auto zero_suffix_len = aligned_end - end;
+    auto zero_extent_len = pin_end - aligned_end;
+    assert_aligned(zero_extent_len);
+    return get_ertr::make_ready_future<split_ret_bare>(
+      (zero_extent_len == 0
+       ? std::nullopt
+       : std::make_optional(extent_to_write_t(aligned_end, zero_extent_len))),
+      bufferptr(ceph::buffer::create(zero_suffix_len, 0))
+    );
+  } else {
+    // Data: return the bytes of the existing extent in [end, pin_end)
+    return read_pin(ctx, pin->duplicate()
+    ).safe_then([end, pin_begin, pin_end](auto extent) {
+      return get_ertr::make_ready_future<split_ret_bare>(
+       std::nullopt,
+       bufferptr(
+         extent->get_bptr(),
+         end - pin_begin,
+         pin_end - end));
+    });
+  }
+};
+
+/**
+ * with_object_data
+ *
+ * Invokes f on a working copy of the onode's object_data (obtained via
+ * get_layout().object_data.get()).  Once f's future resolves, the copy
+ * is written back through get_mutable_layout(ctx.t) only if
+ * must_update() reports that it was modified.
+ */
+template <typename F>
+auto with_object_data(
+  ObjectDataHandler::context_t ctx,
+  F &&f)
+{
+  return seastar::do_with(
+    ctx.onode.get_layout().object_data.get(),
+    std::forward<F>(f),
+    [ctx](auto &object_data, auto &f) {
+      return std::invoke(f, object_data
+      ).safe_then([ctx, &object_data] {
+       // avoid touching the mutable layout unless something changed
+       if (object_data.must_update()) {
+         ctx.onode.get_mutable_layout(ctx.t).object_data.update(object_data);
+       }
+       return seastar::now();
+      });
+    });
+}
+
+/**
+ * prepare_data_reservation
+ *
+ * Ensures object_data has a reserved laddr region.  size is asserted to
+ * be at most MAX_OBJECT_SIZE.  If a region already exists it is asserted
+ * to be MAX_OBJECT_SIZE long and nothing else happens; otherwise a
+ * region of MAX_OBJECT_SIZE is reserved (hint 0) and recorded in
+ * object_data via update_reserved.
+ */
+ObjectDataHandler::write_ret ObjectDataHandler::prepare_data_reservation(
+  context_t ctx,
+  object_data_t &object_data,
+  extent_len_t size)
+{
+  ceph_assert(size <= MAX_OBJECT_SIZE);
+  if (!object_data.is_null()) {
+    ceph_assert(object_data.get_reserved_data_len() == MAX_OBJECT_SIZE);
+    return write_ertr::now();
+  } else {
+    return ctx.tm.reserve_region(
+      ctx.t,
+      0 /* TODO -- pass hint based on object hash */,
+      MAX_OBJECT_SIZE
+    ).safe_then([size, &object_data](auto pin) {
+      ceph_assert(pin->get_length() == MAX_OBJECT_SIZE);
+      // remember where the region landed; this marks object_data dirty
+      object_data.update_reserved(
+       pin->get_laddr(),
+       pin->get_length());
+      return write_ertr::now();
+    });
+  }
+}
+
+/**
+ * trim_data_reservation
+ *
+ * Discards object data past size (an offset relative to the reserved
+ * base).  size is asserted to be block aligned and no larger than the
+ * reserved length, and object_data must not be null.
+ *
+ * Steps:
+ *  1. fetch the pins overlapping [base + size, base + reserved_len)
+ *  2. from the first pin, work out what has to be re-created: a zero
+ *     extent running to the end of the reservation, preceded (for a
+ *     data pin) by the bytes of that pin which lie before size
+ *  3. remove all fetched pins, then insert the replacements
+ *  4. if size == 0, clear object_data
+ *
+ * NOTE(review): with size == 0 step 2 still queues a region covering the
+ * reservation, so do_insertions re-creates it before object_data.clear()
+ * forgets the base address -- confirm whether that mapping is meant to
+ * survive.
+ * NOTE(review): for a data pin starting exactly at base + size the
+ * retained bufferlist has length 0 -- confirm do_insertions/alloc_extent
+ * is fine with that or that the case cannot occur.
+ */
+ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation(
+  context_t ctx, object_data_t &object_data, extent_len_t size)
+{
+  ceph_assert(!object_data.is_null());
+  assert_aligned(size);
+  ceph_assert(size <= object_data.get_reserved_data_len());
+  return seastar::do_with(
+    lba_pin_list_t(),
+    extent_to_write_list_t(),
+    [this, ctx, size, &object_data](auto &pins, auto &to_write) {
+      return ctx.tm.get_pins(
+       ctx.t,
+       object_data.get_reserved_data_base() + size,
+       object_data.get_reserved_data_len() - size
+      ).safe_then([this, ctx, size, &pins, &object_data, &to_write](auto _pins) {
+       _pins.swap(pins);
+       ceph_assert(pins.size());
+       auto &pin = *pins.front();
+       // the first pin must start inside the reservation, at or before
+       // the trim point
+       ceph_assert(pin.get_laddr() >= object_data.get_reserved_data_base());
+       ceph_assert(
+         pin.get_laddr() <= object_data.get_reserved_data_base() + size);
+       auto pin_offset = pin.get_laddr() -
+         object_data.get_reserved_data_base();
+       if (pin.get_paddr().is_zero()) {
+         // zero pin: one zero extent from its start to the end of the
+         // reservation
+         to_write.emplace_back(
+           pin.get_laddr(),
+           object_data.get_reserved_data_len() - pin_offset);
+         return clear_ertr::now();
+       } else {
+         return read_pin(
+           ctx,
+           pin.duplicate()
+         ).safe_then([ctx, size, pin_offset, &pin, &object_data, &to_write](
+                       auto extent) {
+           // data pin: keep its first size - pin_offset bytes ...
+           bufferlist bl;
+           bl.append(
+             bufferptr(
+               extent->get_bptr(),
+               0,
+               size - pin_offset
+             ));
+           to_write.emplace_back(
+             pin.get_laddr(),
+             bl);
+           // ... and replace everything from size onwards with zeroes
+           to_write.emplace_back(
+             object_data.get_reserved_data_base() + size,
+             object_data.get_reserved_data_len() - size);
+           return clear_ertr::now();
+         });
+       }
+      }).safe_then([ctx, size, &pins] {
+       return do_removals(ctx, pins);
+      }).safe_then([ctx, size, &to_write] {
+       return do_insertions(ctx, to_write);
+      }).safe_then([size, &object_data] {
+       if (size == 0) {
+         object_data.clear();
+       }
+       return ObjectDataHandler::clear_ertr::now();
+      });
+    });
+}
+
+/**
+ * get_buffers
+ *
+ * Wraps bl into a list holding a single extent_to_write_t: a data
+ * extent positioned at offset.
+ *
+ * TODO: probably add some kind of upper limit on extent size.
+ */
+extent_to_write_list_t get_buffers(laddr_t offset, bufferlist &bl)
+{
+  extent_to_write_list_t buffers;
+  buffers.emplace_back(offset, bl);
+  return buffers;
+}
+
+/**
+ * overwrite
+ *
+ * Replaces the mappings in _pins so that [_offset, _offset + bl.length())
+ * holds bl.
+ *
+ * _pins must be non-empty and cover the whole written range (asserted).
+ * The first and last pins are split with split_pin_left/split_pin_right:
+ * any bytes they return are glued onto bl so that the extent written is
+ * block aligned, and any zero extents they return are re-created around
+ * it.  All of _pins are then removed and the new extents inserted.
+ */
+ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
+  context_t ctx,
+  laddr_t _offset,
+  bufferlist &&bl,
+  lba_pin_list_t &&_pins)
+{
+  return seastar::do_with(
+    _offset,
+    std::move(bl),
+    std::move(_pins),
+    extent_to_write_list_t(),
+    [this, ctx](laddr_t &offset, auto &bl, auto &pins, auto &to_write) {
+      ceph_assert(pins.size() >= 1);
+      auto pin_begin = pins.front()->get_laddr();
+      ceph_assert(pin_begin <= offset);
+      auto pin_end = pins.back()->get_laddr() + pins.back()->get_length();
+      // the pins must reach at least to the end of the written range
+      ceph_assert(pin_end >= (offset + bl.length()));
+
+      return split_pin_left(
+       ctx,
+       pins.front(),
+       offset
+      ).safe_then([this, ctx, pin_begin, &offset, &bl, &pins, &to_write](
+                   auto p) {
+       auto &[left_extent, headptr] = p;
+       if (left_extent) {
+         ceph_assert(left_extent->addr == pin_begin);
+         to_write.push_front(std::move(*left_extent));
+       }
+       if (headptr) {
+         // prepend the head bytes and move offset back accordingly
+         bufferlist newbl;
+         newbl.append(*headptr);
+         newbl.append(bl);
+         bl.swap(newbl);
+         offset -= headptr->length();
+         assert_aligned(offset);
+       }
+       // offset + bl.length() is still the end of the original write
+       return split_pin_right(
+         ctx,
+         pins.back(),
+         offset + bl.length());
+      }).safe_then([this, ctx, pin_end, &offset, &bl, &pins, &to_write](
+                    auto p) {
+       auto &[right_extent, tailptr] = p;
+       if (tailptr) {
+         bl.append(*tailptr);
+         assert_aligned(bl.length());
+       }
+       // order in to_write: left zero extent, data, right zero extent
+       to_write.splice(to_write.end(), get_buffers(offset, bl));
+       if (right_extent) {
+         ceph_assert((right_extent->addr  + right_extent->len) == pin_end);
+         to_write.push_back(std::move(*right_extent));
+       }
+       return write_ertr::now();
+      }).safe_then([this, ctx, &pins] {
+       return do_removals(ctx, pins);
+      }).safe_then([this, ctx, &to_write] {
+       return do_insertions(ctx, to_write);
+      });
+    });
+}
+
+/**
+ * write
+ *
+ * Writes bl at object offset offset: makes sure the reservation exists,
+ * fetches the pins overlapping the target range and hands them, with a
+ * copy of bl, to overwrite.
+ *
+ * bl is captured by reference in the continuations below.
+ * NOTE(review): presumably callers must keep bl alive until the
+ * returned future resolves -- confirm against callers.
+ */
+ObjectDataHandler::write_ret ObjectDataHandler::write(
+  context_t ctx,
+  objaddr_t offset,
+  const bufferlist &bl)
+{
+  return with_object_data(
+    ctx,
+    [this, ctx, offset, &bl](auto &object_data) {
+      return prepare_data_reservation(
+       ctx,
+       object_data,
+       // end of the write rounded up to a block boundary
+       p2roundup(offset + bl.length(), ctx.tm.get_block_size())
+      ).safe_then([this, ctx, offset, &object_data, &bl] {
+       // translate the object offset into an laddr
+       auto logical_offset = object_data.get_reserved_data_base() + offset;
+       return ctx.tm.get_pins(
+         ctx.t,
+         logical_offset,
+         bl.length()
+       ).safe_then([this, ctx, offset, logical_offset, &object_data, &bl](
+                     auto pins) {
+         return overwrite(ctx, logical_offset, bufferlist(bl), std::move(pins));
+       });
+      });
+    });
+}
+
+/**
+ * read
+ *
+ * Returns the len bytes starting at object offset obj_offset.
+ *
+ * The pins overlapping the range are visited in order; a zero pin
+ * contributes zeroes and a data pin contributes the matching slice of
+ * its extent.  Asserts that the object has a reservation, that the
+ * range lies within it and that len > 0.
+ */
+ObjectDataHandler::read_ret ObjectDataHandler::read(
+  context_t ctx,
+  objaddr_t obj_offset,
+  extent_len_t len)
+{
+  return seastar::do_with(
+    bufferlist(),
+    [this, ctx, obj_offset, len](auto &ret) {
+      return with_object_data(
+       ctx,
+       [this, ctx, obj_offset, len, &ret](const auto &object_data) {
+         /* Assumption: callers ensure that onode size is <= reserved
+          * size and that len is adjusted here prior to call */
+         ceph_assert(!object_data.is_null());
+         ceph_assert((obj_offset + len) <= object_data.get_reserved_data_len());
+         ceph_assert(len > 0);
+         laddr_t loffset =
+           object_data.get_reserved_data_base() + obj_offset;
+         return ctx.tm.get_pins(
+           ctx.t,
+           loffset,
+           len
+         ).safe_then([this, ctx, loffset, len, &ret](auto _pins) {
+           // offset~len falls within reserved region and len > 0
+           ceph_assert(_pins.size() >= 1);
+           ceph_assert((*_pins.begin())->get_laddr() <= loffset);
+           // current tracks the next laddr still to be appended to ret
+           return seastar::do_with(
+             std::move(_pins),
+             loffset,
+             [this, ctx, loffset, len, &ret](auto &pins, auto &current) {
+               return crimson::do_for_each(
+                 std::begin(pins),
+                 std::end(pins),
+                 [this, ctx, loffset, len, &current, &ret](auto &pin)
+                 -> read_ertr::future<> {
+                   ceph_assert(current <= (loffset + len));
+                   ceph_assert(
+                     (loffset + len) > pin->get_laddr());
+                   // read up to the end of this pin or of the request,
+                   // whichever comes first
+                   laddr_t end = std::min(
+                     pin->get_laddr() + pin->get_length(),
+                     loffset + len);
+                   if (pin->get_paddr().is_zero()) {
+                     ceph_assert(end > current); // See LBAManager::get_mappings
+                     ret.append_zero(end - current);
+                     current = end;
+                     return seastar::now();
+                   } else {
+                     return ctx.tm.pin_to_extent<ObjectDataBlock>(
+                       ctx.t,
+                       std::move(pin)
+                     ).safe_then([&ret, &current, end](auto extent) {
+                       ceph_assert(
+                         (extent->get_laddr() + extent->get_length()) >= end);
+                       ceph_assert(end > current);
+                       // slice [current, end) out of the extent buffer
+                       ret.append(
+                         bufferptr(
+                           extent->get_bptr(),
+                           current - extent->get_laddr(),
+                           end - current));
+                       current = end;
+                       return seastar::now();
+                     }).handle_error(
+                       read_ertr::pass_further{},
+                       crimson::ct_error::assert_all{
+                         "ObjectDataHandler::read hit invalid error"
+                       }
+                     );
+                   }
+                 });
+             });
+         });
+       }).safe_then([&ret] {
+         return std::move(ret);
+       });
+    });
+}
+
+/**
+ * truncate
+ *
+ * Compares offset with the reserved length of the object's data region:
+ * below it, data past offset is trimmed via trim_data_reservation;
+ * above it, prepare_data_reservation is called with offset; when equal,
+ * nothing is done.
+ *
+ * NOTE(review): trim_data_reservation asserts that its size argument is
+ * block aligned, so this presumably expects an aligned offset --
+ * confirm against callers.
+ */
+ObjectDataHandler::truncate_ret ObjectDataHandler::truncate(
+  context_t ctx,
+  objaddr_t offset)
+{
+  return with_object_data(
+    ctx,
+    [this, ctx, offset](auto &object_data) {
+      if (offset < object_data.get_reserved_data_len()) {
+       return trim_data_reservation(ctx, object_data, offset);
+      } else if (offset > object_data.get_reserved_data_len()) {
+       return prepare_data_reservation(
+         ctx,
+         object_data,
+         offset);
+      } else {
+       return truncate_ertr::now();
+      }
+    });
+}
+
+/**
+ * clear
+ *
+ * Trims the object's data region down to size 0 via
+ * trim_data_reservation, which also clears object_data in that case.
+ *
+ * NOTE(review): trim_data_reservation asserts !object_data.is_null(), so
+ * this looks like it must only be called on objects that already have a
+ * reservation -- confirm against callers.
+ */
+ObjectDataHandler::clear_ret ObjectDataHandler::clear(
+  context_t ctx)
+{
+  return with_object_data(
+    ctx,
+    [this, ctx](auto &object_data) {
+      return trim_data_reservation(ctx, object_data, 0);
+    });
+}
+
+}
diff --git a/src/crimson/os/seastore/object_data_handler.h b/src/crimson/os/seastore/object_data_handler.h
new file mode 100644 (file)
index 0000000..c7dc078
--- /dev/null
@@ -0,0 +1,109 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+#include <limits>
+
+#include "include/buffer.h"
+
+#include "test/crimson/seastore/test_block.h" // TODO
+
+#include "crimson/os/seastore/onode.h"
+#include "crimson/os/seastore/transaction_manager.h"
+#include "crimson/os/seastore/transaction.h"
+
+namespace crimson::os::seastore {
+
+/**
+ * ObjectDataBlock
+ *
+ * LogicalCachedExtent subtype reported as
+ * extent_types_t::OBJECT_DATA_BLOCK.  It adds no state of its own and
+ * does not support deltas: get_delta() and apply_delta() both assert.
+ */
+struct ObjectDataBlock : crimson::os::seastore::LogicalCachedExtent {
+  using Ref = TCachedExtentRef<ObjectDataBlock>;
+
+  ObjectDataBlock(ceph::bufferptr &&ptr)
+    : LogicalCachedExtent(std::move(ptr)) {}
+  ObjectDataBlock(const ObjectDataBlock &other)
+    : LogicalCachedExtent(other) {}
+
+  /// Returns a new ObjectDataBlock copy-constructed from this one
+  CachedExtentRef duplicate_for_write() final {
+    return CachedExtentRef(new ObjectDataBlock(*this));
+  };
+
+  static constexpr extent_types_t TYPE = extent_types_t::OBJECT_DATA_BLOCK;
+  extent_types_t get_type() const final {
+    return TYPE;
+  }
+
+  ceph::bufferlist get_delta() final {
+    /* Currently, we always allocate fresh ObjectDataBlock's rather than
+     * mutating existing ones. */
+    ceph_assert(0 == "Should be impossible");
+  }
+
+  void apply_delta(const ceph::bufferlist &bl) final {
+    // See get_delta()
+    ceph_assert(0 == "Should be impossible");
+  }
+};
+using ObjectDataBlockRef = TCachedExtentRef<ObjectDataBlock>;
+
+/**
+ * ObjectDataHandler
+ *
+ * Implements write/read/truncate/clear of an object's data on top of a
+ * TransactionManager.  The class holds no members; everything an
+ * operation needs is passed in through context_t.
+ */
+class ObjectDataHandler {
+public:
+  using base_ertr = TransactionManager::base_ertr;
+
+  /// References to the transaction manager, transaction and onode an
+  /// operation works on
+  struct context_t {
+    TransactionManager &tm;
+    Transaction &t;
+    Onode &onode;
+  };
+
+  /// Writes bl to [offset, offset + bl.length())
+  using write_ertr = base_ertr;
+  using write_ret = write_ertr::future<>;
+  write_ret write(
+    context_t ctx,
+    objaddr_t offset,
+    const bufferlist &bl);
+
+  /// Reads data in [offset, offset + len)
+  using read_ertr = base_ertr;
+  using read_ret = read_ertr::future<bufferlist>;
+  read_ret read(
+    context_t ctx,
+    objaddr_t offset,
+    extent_len_t len);
+
+  /// Clears data past offset
+  using truncate_ertr = base_ertr;
+  using truncate_ret = truncate_ertr::future<>;
+  truncate_ret truncate(
+    context_t ctx,
+    objaddr_t offset);
+
+  /// Clears data and reservation
+  using clear_ertr = base_ertr;
+  using clear_ret = clear_ertr::future<>;
+  clear_ret clear(context_t ctx);
+
+private:
+  /// Updates region [offset, offset + bl.length()) to bl
+  write_ret overwrite(
+    context_t ctx,        ///< [in] ctx
+    laddr_t offset,       ///< [in] write offset
+    bufferlist &&bl,      ///< [in] buffer to write
+    lba_pin_list_t &&pins ///< [in] set of pins overlapping above region
+  );
+
+  /// Ensures object_data reserved region is prepared
+  write_ret prepare_data_reservation(
+    context_t ctx,
+    object_data_t &object_data,
+    extent_len_t size);
+
+  /// Trims data past size
+  clear_ret trim_data_reservation(
+    context_t ctx,
+    object_data_t &object_data,
+    extent_len_t size);
+};
+
+}
index 3b9dd383a90b91375f0c927b2296220cab83e474..21c0fba8aa559f9bd62961cd6e88d074a8da837e 100644 (file)
@@ -16,6 +16,8 @@ namespace crimson::os::seastore {
 struct onode_layout_t {
   ceph_le32 size{0};
   omap_root_le_t omap_root;
+
+  object_data_le_t object_data;
 } __attribute__((packed));
 
 class Transaction;
index f5a5debbb6c0ffa58bb274e950561aca45e32afe..a651d23cd909bc9b2331c058cbcaa3c6473baff3 100644 (file)
@@ -63,6 +63,8 @@ std::ostream &operator<<(std::ostream &out, extent_types_t t)
     return out << "OMAP_LEAF";
   case extent_types_t::COLL_BLOCK:
     return out << "COLL_BLOCK";
+  case extent_types_t::OBJECT_DATA_BLOCK:
+    return out << "OBJECT_DATA_BLOCK";
   case extent_types_t::TEST_BLOCK:
     return out << "TEST_BLOCK";
   case extent_types_t::TEST_BLOCK_PHYSICAL:
index d6c78a400a3fec5c882ca4e26108cc4eb7455ba4..de6485ba4b01ac92e0cf5f0e83c8e63da91f599a 100644 (file)
@@ -316,6 +316,7 @@ enum class extent_types_t : uint8_t {
   OMAP_LEAF = 5,
   ONODE_BLOCK_STAGED = 6,
   COLL_BLOCK = 7,
+  OBJECT_DATA_BLOCK = 8,
 
   // Test Block Types
   TEST_BLOCK = 0xF0,
@@ -395,6 +396,71 @@ struct record_t {
   std::vector<delta_info_t> deltas;
 };
 
+/**
+ * object_data_t
+ *
+ * In-memory description of the laddr region reserved for an object's
+ * data (base address and length), plus a dirty flag that records
+ * whether any mutator has been called since construction.
+ */
+class object_data_t {
+  laddr_t reserved_data_base = L_ADDR_NULL;
+  extent_len_t reserved_data_len = 0;
+
+  bool dirty = false;
+public:
+  object_data_t(
+    laddr_t reserved_data_base,
+    extent_len_t reserved_data_len)
+    : reserved_data_base(reserved_data_base),
+      reserved_data_len(reserved_data_len) {}
+
+  /// Base laddr of the reserved region, L_ADDR_NULL if there is none
+  laddr_t get_reserved_data_base() const { return reserved_data_base; }
+
+  /// Length of the reserved region
+  extent_len_t get_reserved_data_len() const { return reserved_data_len; }
+
+  /// True when no region has been reserved
+  bool is_null() const { return L_ADDR_NULL == reserved_data_base; }
+
+  /// True once any of the mutators below has been called
+  bool must_update() const { return dirty; }
+
+  /// Records a new region and marks the object dirty
+  void update_reserved(
+    laddr_t base,
+    extent_len_t len) {
+    reserved_data_base = base;
+    reserved_data_len = len;
+    dirty = true;
+  }
+
+  /// Changes only the region length and marks the object dirty
+  void update_len(
+    extent_len_t len) {
+    reserved_data_len = len;
+    dirty = true;
+  }
+
+  /// Resets to the null region and marks the object dirty
+  void clear() {
+    reserved_data_base = L_ADDR_NULL;
+    reserved_data_len = 0;
+    dirty = true;
+  }
+};
+
+/**
+ * object_data_le_t
+ *
+ * Packed counterpart of object_data_t built from the laddr_le_t and
+ * extent_len_le_t field types; converts to and from object_data_t.
+ */
+struct __attribute__((packed)) object_data_le_t {
+  laddr_le_t reserved_data_base = laddr_le_t(L_ADDR_NULL);
+  extent_len_le_t reserved_data_len = init_extent_len_le(0);
+
+  /// Overwrites both fields with the values held by nroot
+  void update(const object_data_t &nroot) {
+    const laddr_t new_base = nroot.get_reserved_data_base();
+    const extent_len_t new_len = nroot.get_reserved_data_len();
+    reserved_data_base = new_base;
+    reserved_data_len = init_extent_len_le(new_len);
+  }
+
+  /// Builds an object_data_t from the stored base and length
+  object_data_t get() const {
+    const laddr_t base = reserved_data_base;
+    const extent_len_t len = reserved_data_len;
+    return object_data_t(base, len);
+  }
+};
+
 struct omap_root_t {
   laddr_t addr = L_ADDR_NULL;
   depth_t depth = 0;
index 153a3fbcd12f54a8a15fc3b04b771f5928f1fca1..f1585bf0ec08e44023e5dd7f1df5615256121ad8 100644 (file)
@@ -38,6 +38,18 @@ target_link_libraries(
   crimson::gtest
   crimson-seastore)
 
+add_executable(unittest-object-data-handler
+  test_object_data_handler.cc
+  ../gtest_seastar.cc)
+add_ceph_unittest(unittest-object-data-handler
+  --memory 256M --smp 1)
+target_link_libraries(
+  unittest-object-data-handler
+  crimson::gtest
+  crimson-seastore
+  crimson-os
+  crimson-common)
+
 add_executable(unittest-collection-manager
   test_collection_manager.cc
   ../gtest_seastar.cc)
diff --git a/src/test/crimson/seastore/test_object_data_handler.cc b/src/test/crimson/seastore/test_object_data_handler.cc
new file mode 100644 (file)
index 0000000..45e941e
--- /dev/null
@@ -0,0 +1,300 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/crimson/gtest_seastar.h"
+#include "test/crimson/seastore/transaction_manager_test_state.h"
+
+#include "crimson/os/seastore/onode.h"
+#include "crimson/os/seastore/object_data_handler.h"
+
+using namespace crimson;
+using namespace crimson::os;
+using namespace crimson::os::seastore;
+
+namespace {
+  /// File-local accessor for the logger obtained from
+  /// crimson::get_logger(ceph_subsys_test); may be unused.
+  [[maybe_unused]] seastar::logger& logger() {
+    auto &test_logger = crimson::get_logger(ceph_subsys_test);
+    return test_logger;
+  }
+}
+
+/// Onode implementation for tests: keeps the layout as a plain member
+/// and remembers whether a mutable reference to it was ever handed out.
+class TestOnode : public Onode {
+  onode_layout_t layout;
+  bool dirty = false;
+
+public:
+  const onode_layout_t &get_layout() const final {
+    return layout;
+  }
+  /// Marks the onode dirty; the transaction argument is not used
+  onode_layout_t &get_mutable_layout(Transaction &t) final {
+    dirty = true;
+    return layout;
+  }
+  bool is_dirty() const { return dirty; }
+  ~TestOnode() final = default;
+};
+
+/**
+ * object_data_handler_test_t
+ *
+ * Fixture driving ObjectDataHandler against a TestOnode.  known_contents
+ * mirrors what the object is expected to contain and size tracks the
+ * expected object size; read() compares handler output against
+ * known_contents.
+ */
+struct object_data_handler_test_t:
+  public seastar_test_suite_t,
+  TMTestState {
+  OnodeRef onode;
+
+  bufferptr known_contents;
+  extent_len_t size = 0;
+
+  object_data_handler_test_t() {}
+
+  /// Submits t and then runs the segment cleaner until it halts
+  auto submit_transaction(TransactionRef &&t) {
+    return tm->submit_transaction(std::move(t)
+    ).safe_then([this] {
+      return segment_cleaner->run_until_halt();
+    });
+  }
+
+  /// Fills known_contents[offset, offset + len) with fill and writes the
+  /// same bytes through ObjectDataHandler within transaction t
+  void write(Transaction &t, objaddr_t offset, extent_len_t len, char fill) {
+    ceph_assert(offset + len <= known_contents.length());
+    size = std::max<extent_len_t>(size, offset + len);
+    memset(
+      known_contents.c_str() + offset,
+      fill,
+      len);
+    bufferlist bl;
+    bl.append(
+      bufferptr(
+       known_contents,
+       offset,
+       len));
+    return ObjectDataHandler().write(
+      ObjectDataHandler::context_t{
+       *tm,
+       t,
+       *onode,
+      },
+      offset,
+      bl).unsafe_get0();
+  }
+  /// Same as above in a fresh transaction which is then submitted
+  void write(objaddr_t offset, extent_len_t len, char fill) {
+    auto t = tm->create_transaction();
+    write(*t, offset, len, fill);
+    return submit_transaction(std::move(t)).unsafe_get0();
+  }
+
+  /// Sets the expected size to offset.  Only when that shrinks the
+  /// object are known_contents zeroed past offset and the handler's
+  /// truncate invoked.
+  void truncate(Transaction &t, objaddr_t offset) {
+    if (size > offset) {
+      memset(
+       known_contents.c_str() + offset,
+       0,
+       size - offset);
+      ObjectDataHandler().truncate(
+       ObjectDataHandler::context_t{
+         *tm,
+         t,
+         *onode
+       },
+       offset).unsafe_get0();
+    }
+    size = offset;
+  }
+  /// Same as above in a fresh transaction which is then submitted
+  void truncate(objaddr_t offset) {
+    auto t = tm->create_transaction();
+    truncate(*t, offset);
+    return submit_transaction(std::move(t)).unsafe_get0();
+  }
+
+  /// Reads [offset, offset + len) through the handler and expects it to
+  /// equal the same range of known_contents
+  void read(Transaction &t, objaddr_t offset, extent_len_t len) {
+    bufferlist bl = ObjectDataHandler().read(
+      ObjectDataHandler::context_t{
+       *tm,
+       t,
+       *onode
+      },
+      offset,
+      len).unsafe_get0();
+    bufferlist known;
+    known.append(
+      bufferptr(
+       known_contents,
+       offset,
+       len));
+    EXPECT_EQ(bl.length(), known.length());
+    EXPECT_EQ(bl, known);
+  }
+  /// Same as above in a fresh transaction (which is not submitted)
+  void read(objaddr_t offset, extent_len_t len) {
+    auto t = tm->create_transaction();
+    read(*t, offset, len);
+  }
+  /// Reads the range with each end independently moved by -fuzz, 0 and
+  /// +fuzz (nine combinations)
+  void read_near(objaddr_t offset, extent_len_t len, extent_len_t fuzz) {
+    auto fuzzes = std::vector<int32_t>{-1 * (int32_t)fuzz, 0, (int32_t)fuzz};
+    for (auto left_fuzz : fuzzes) {
+      for (auto right_fuzz : fuzzes) {
+       read(offset + left_fuzz, len - left_fuzz + right_fuzz);
+      }
+    }
+  }
+
+  /// Creates a fresh onode and expected-contents buffer, then tm_setup()
+  seastar::future<> set_up_fut() final {
+    onode = new TestOnode{};
+    known_contents = buffer::create(4<<20 /* 4MB */);
+    size = 0;
+    return tm_setup();
+  }
+
+  /// Drops the onode and resets size, then tm_teardown()
+  seastar::future<> tear_down_fut() final {
+    onode.reset();
+    size = 0;
+    return tm_teardown();
+  }
+};
+
+/// One 8KiB write at offset 1MiB, read back with the range ends fuzzed
+/// by 1 and by 512 bytes.
+TEST_F(object_data_handler_test_t, single_write)
+{
+  run_async([this] {
+    write(1<<20, 8<<10, 'c');
+
+    read_near(1<<20, 8<<10, 1);
+    read_near(1<<20, 8<<10, 512);
+  });
+}
+
+/// Three adjacent 4KiB writes around offset 1MiB, then reads of the
+/// middle write and of the whole 12KiB span.
+TEST_F(object_data_handler_test_t, multi_write)
+{
+  run_async([this] {
+    write((1<<20) - (4<<10), 4<<10, 'a');
+    write(1<<20, 4<<10, 'b');
+    write((1<<20) + (4<<10), 4<<10, 'c');
+
+    read_near(1<<20, 4<<10, 1);
+    read_near(1<<20, 4<<10, 512);
+
+    read_near((1<<20)-(4<<10), 12<<10, 1);
+    read_near((1<<20)-(4<<10), 12<<10, 512);
+  });
+}
+
+/// Two 4KiB writes that leave [1MiB, 1MiB + 4KiB) unwritten, then reads
+/// of the hole and of the whole 12KiB span.
+TEST_F(object_data_handler_test_t, write_hole)
+{
+  run_async([this] {
+    write((1<<20) - (4<<10), 4<<10, 'a');
+    // hole at 1<<20
+    write((1<<20) + (4<<10), 4<<10, 'c');
+
+    read_near(1<<20, 4<<10, 1);
+    read_near(1<<20, 4<<10, 512);
+
+    read_near((1<<20)-(4<<10), 12<<10, 1);
+    read_near((1<<20)-(4<<10), 12<<10, 512);
+  });
+}
+
+/// Writes the same 4KiB range twice with different fill bytes and reads
+/// it back.
+TEST_F(object_data_handler_test_t, overwrite_single)
+{
+  run_async([this] {
+    write((1<<20), 4<<10, 'a');
+    write((1<<20), 4<<10, 'c');
+
+    read_near(1<<20, 4<<10, 1);
+    read_near(1<<20, 4<<10, 512);
+  });
+}
+
+/// Two adjacent 4KiB writes followed by one 8KiB write covering both,
+/// then reads of the whole range and of each half.
+TEST_F(object_data_handler_test_t, overwrite_double)
+{
+  run_async([this] {
+    write((1<<20), 4<<10, 'a');
+    write((1<<20)+(4<<10), 4<<10, 'c');
+    write((1<<20), 8<<10, 'b');
+
+    read_near(1<<20, 8<<10, 1);
+    read_near(1<<20, 8<<10, 512);
+
+    read_near(1<<20, 4<<10, 1);
+    read_near(1<<20, 4<<10, 512);
+
+    read_near((1<<20) + (4<<10), 4<<10, 1);
+    read_near((1<<20) + (4<<10), 4<<10, 512);
+  });
+}
+
+/// One 12KiB write whose last, middle and first 4KiB are then
+/// overwritten in turn, with reads after each step.
+TEST_F(object_data_handler_test_t, overwrite_partial)
+{
+  run_async([this] {
+    write((1<<20), 12<<10, 'a');
+    read_near(1<<20, 12<<10, 1);
+
+    write((1<<20)+(8<<10), 4<<10, 'b');
+    read_near(1<<20, 12<<10, 1);
+
+    write((1<<20)+(4<<10), 4<<10, 'c');
+    read_near(1<<20, 12<<10, 1);
+
+    write((1<<20), 4<<10, 'd');
+
+    read_near(1<<20, 12<<10, 1);
+    read_near(1<<20, 12<<10, 512);
+
+    read_near(1<<20, 4<<10, 1);
+    read_near(1<<20, 4<<10, 512);
+
+    read_near((1<<20) + (4<<10), 4<<10, 1);
+    read_near((1<<20) + (4<<10), 4<<10, 512);
+  });
+}
+
+/// Writes into previously unwritten ranges where the end, the start, or
+/// both are 1KiB away from a 4KiB multiple.
+TEST_F(object_data_handler_test_t, unaligned_write)
+{
+  run_async([this] {
+    objaddr_t base = 1<<20;
+    write(base, (4<<10)+(1<<10), 'a');
+    read_near(base-(4<<10), 12<<10, 512);
+
+    base = (1<<20) + (64<<10);
+    write(base+(1<<10), (4<<10)+(1<<10), 'b');
+    read_near(base-(4<<10), 12<<10, 512);
+
+    base = (1<<20) + (128<<10);
+    write(base-(1<<10), (4<<10)+(2<<20), 'c');
+    read_near(base-(4<<10), 12<<10, 512);
+  });
+}
+
+/// Same write pattern as unaligned_write, but issued on top of an
+/// initial 144KiB write starting at 1MiB.
+TEST_F(object_data_handler_test_t, unaligned_overwrite)
+{
+  run_async([this] {
+    objaddr_t base = 1<<20;
+    write(base, (128<<10) + (16<<10), 'x');
+
+    write(base, (4<<10)+(1<<10), 'a');
+    read_near(base-(4<<10), 12<<10, 2<<10);
+
+    base = (1<<20) + (64<<10);
+    write(base+(1<<10), (4<<10)+(1<<10), 'b');
+    read_near(base-(4<<10), 12<<10, 2<<10);
+
+    base = (1<<20) + (128<<10);
+    write(base-(1<<10), (4<<10)+(2<<20), 'c');
+    read_near(base-(4<<10), 12<<10, 2<<10);
+
+    read(base, (128<<10) + (16<<10));
+  });
+}
+
+/// Three 8KiB writes at 1MiB followed by truncates to successively
+/// smaller offsets (the first one is past the written data), reading
+/// 64KiB from base after each.
+TEST_F(object_data_handler_test_t, truncate)
+{
+  run_async([this] {
+    objaddr_t base = 1<<20;
+    write(base, 8<<10, 'a');
+    write(base+(8<<10), 8<<10, 'b');
+    write(base+(16<<10), 8<<10, 'c');
+
+    truncate(base + (32<<10));
+    read(base, 64<<10);
+
+    truncate(base + (24<<10));
+    read(base, 64<<10);
+
+    truncate(base + (12<<10));
+    read(base, 64<<10);
+
+    truncate(base - (12<<10));
+    read(base, 64<<10);
+  });
+}