]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/os/seastore: store data as a delta using mutable extent when overwriting
authorMyoungwon Oh <myoungwon.oh@samsung.com>
Wed, 20 Sep 2023 01:05:56 +0000 (10:05 +0900)
committermyoungwon oh <ohmyoungwon@gmail.com>
Thu, 9 Nov 2023 11:51:24 +0000 (11:51 +0000)
Signed-off-by: Myoungwon Oh <myoungwon.oh@samsung.com>
Signed-off-by: Yingxin Cheng <yingxin.cheng@intel.com>
src/common/options/crimson.yaml.in
src/crimson/os/seastore/cached_extent.h
src/crimson/os/seastore/object_data_handler.cc
src/crimson/os/seastore/object_data_handler.h
src/crimson/os/seastore/transaction_manager.h

index 1007998fade9745f5f558104e9049476e12633cc..8f0af93f3ea8399c0017690d7ff72ffb9279f469 100644 (file)
@@ -117,3 +117,8 @@ options:
   level: advanced
   desc: Begin fast eviction when the used ratio of the main tier reaches this value.
   default: 0.7
+- name: seastore_data_delta_based_overwrite
+  type: size
+  level: dev
+  desc: Overwrite the existing data block using a delta if the original size is smaller than this value; otherwise overwrite by remapping. Set to 0 to enforce remap-based overwrite.
+  default: 0
index 6ab19282637f26b906fe753726dd4d6e31fe9af0..c3010efe6cb4dd1ad4ea52278acb7a7cc938d9d2 100644 (file)
@@ -1054,6 +1054,9 @@ public:
   }
 
   virtual bool is_stable() const = 0;
+  bool is_zero_reserved() const {  // true when get_val() does not report a real address (is_real() is false)
+    return !get_val().is_real();
+  }
 
   virtual ~PhysicalNodeMapping() {}
 protected:
index 1b0ae5c814aef37d14ac004bae70f5601fa7c1b4..29e89d3ddf0d027d991f2dc417624fe9fa35bc5e 100644 (file)
@@ -98,7 +98,8 @@ using extent_to_write_list_t = std::list<extent_to_write_t>;
 // Encapsulates extents to be written out using do_remappings.
 struct extent_to_remap_t {
   enum class type_t {
-    REMAP,
+    REMAP1,
+    REMAP2,
     OVERWRITE
   };
   type_t type;
@@ -114,54 +115,75 @@ struct extent_to_remap_t {
   extent_to_remap_t(const extent_to_remap_t &) = delete;
   extent_to_remap_t(extent_to_remap_t &&) = default;
 
-  bool is_remap() const {
-    return type == type_t::REMAP;
+  bool is_remap1() const {
+    return type == type_t::REMAP1;
   }
 
-  bool is_overwrite() const {
+  bool is_remap2() const {
     assert((new_offset != 0) && (pin->get_length() != new_offset + new_len));
+    return type == type_t::REMAP2;
+  }
+
+  bool is_overwrite() const {
     return type == type_t::OVERWRITE;
   }
 
   using remap_entry = TransactionManager::remap_entry;
   remap_entry create_remap_entry() {
-    assert(is_remap());
+    assert(is_remap1());
     return remap_entry(
       new_offset,
       new_len);
   }
 
   remap_entry create_left_remap_entry() {
-    assert(is_overwrite());
+    assert(is_remap2());
     return remap_entry(
       0,
       new_offset);
   }
 
   remap_entry create_right_remap_entry() {
-    assert(is_overwrite());
+    assert(is_remap2());
     return remap_entry(
       new_offset + new_len,
       pin->get_length() - new_offset - new_len);
   }
 
-  static extent_to_remap_t create_remap(
+  static extent_to_remap_t create_remap1(
     LBAMappingRef &&pin, extent_len_t new_offset, extent_len_t new_len) {
-    return extent_to_remap_t(type_t::REMAP,
+    return extent_to_remap_t(type_t::REMAP1,
       std::move(pin), new_offset, new_len);
   }
 
-  static extent_to_remap_t create_overwrite(
+  static extent_to_remap_t create_remap2(
     LBAMappingRef &&pin, extent_len_t new_offset, extent_len_t new_len) {
-    return extent_to_remap_t(type_t::OVERWRITE,
+    return extent_to_remap_t(type_t::REMAP2,
       std::move(pin), new_offset, new_len);
   }
 
+  static extent_to_remap_t create_overwrite(  // builds an OVERWRITE entry that carries the new data in `bl` rather than a pin
+    extent_len_t new_offset, extent_len_t new_len, LBAMappingRef p,
+    bufferlist b) {
+    return extent_to_remap_t(type_t::OVERWRITE,
+      nullptr, new_offset, new_len, p->get_key(), p->get_length(), b);  // pin is left null; p's key and length are stored as laddr_start and length
+  }
+
+  uint64_t laddr_start;
+  extent_len_t length;
+  std::optional<bufferlist> bl;
+
 private:
   extent_to_remap_t(type_t type,
     LBAMappingRef &&pin, extent_len_t new_offset, extent_len_t new_len)
     : type(type),
       pin(std::move(pin)), new_offset(new_offset), new_len(new_len) {}
+  extent_to_remap_t(type_t type,
+    LBAMappingRef &&pin, extent_len_t new_offset, extent_len_t new_len,
+    uint64_t ori_laddr, extent_len_t ori_len, std::optional<bufferlist> b)
+    : type(type),
+      pin(std::move(pin)), new_offset(new_offset), new_len(new_len),
+      laddr_start(ori_laddr), length(ori_len), bl(b) {}
 };
 using extent_to_remap_list_t = std::list<extent_to_remap_t>;
 
@@ -222,7 +244,8 @@ struct overwrite_ops_t {
 // prepare to_remap, to_retire, to_insert list
 overwrite_ops_t prepare_ops_list(
   lba_pin_list_t &pins_to_remove,
-  extent_to_write_list_t &to_write) {
+  extent_to_write_list_t &to_write,
+  size_t delta_based_overwrite_max_extent_size) {
   assert(pins_to_remove.size() != 0);
   overwrite_ops_t ops;
   ops.to_remove.swap(pins_to_remove);
@@ -241,7 +264,7 @@ overwrite_ops_t prepare_ops_list(
       assert(to_write.size() > 2);
       assert(front.addr == front.pin->get_key());
       assert(back.addr > back.pin->get_key());
-      ops.to_remap.push_back(extent_to_remap_t::create_overwrite(
+      ops.to_remap.push_back(extent_to_remap_t::create_remap2(
        std::move(front.pin),
        front.len,
        back.addr - front.addr - front.len));
@@ -252,7 +275,7 @@ overwrite_ops_t prepare_ops_list(
       visitted++;
       assert(to_write.size() > 1);
       assert(front.addr == front.pin->get_key());
-      ops.to_remap.push_back(extent_to_remap_t::create_remap(
+      ops.to_remap.push_back(extent_to_remap_t::create_remap1(
        std::move(front.pin),
        0,
        front.len));
@@ -263,7 +286,7 @@ overwrite_ops_t prepare_ops_list(
       assert(to_write.size() > 1);
       assert(back.addr + back.len ==
        back.pin->get_key() + back.pin->get_length());
-      ops.to_remap.push_back(extent_to_remap_t::create_remap(
+      ops.to_remap.push_back(extent_to_remap_t::create_remap1(
        std::move(back.pin),
        back.addr - back.pin->get_key(),
        back.len));
@@ -271,13 +294,65 @@ overwrite_ops_t prepare_ops_list(
     }
   }
 
-  // prepare to_insert
+  interval_set<uint64_t> pre_alloc_addr_removed, pre_alloc_addr_remapped;
+  if (delta_based_overwrite_max_extent_size) {
+    for (auto &r : ops.to_remove) {
+      if (r->is_stable() && !r->is_zero_reserved()) {
+       pre_alloc_addr_removed.insert(r->get_key(), r->get_length());
+
+      }
+    }
+    for (auto &r : ops.to_remap) {
+      if (r.pin && r.pin->is_stable() && !r.pin->is_zero_reserved()) {
+       pre_alloc_addr_remapped.insert(r.pin->get_key(), r.pin->get_length());
+      }
+    }
+  }
+
+  // prepare to insert
+  extent_to_remap_list_t to_remap;
   for (auto &region : to_write) {
     if (region.is_data()) {
       visitted++;
       assert(region.to_write.has_value());
-      ops.to_insert.push_back(extent_to_insert_t::create_data(
-       region.addr, region.len, region.to_write));
+      int erased_num = 0;
+      if (pre_alloc_addr_removed.contains(region.addr, region.len) &&
+         region.len <= delta_based_overwrite_max_extent_size) {
+       erased_num = std::erase_if(
+         ops.to_remove,
+         [&region, &to_remap](auto &r) {
+           interval_set<uint64_t> range;
+           range.insert(r->get_key(), r->get_length());
+           if (range.contains(region.addr, region.len)) {
+             to_remap.push_back(extent_to_remap_t::create_overwrite(
+               0, region.len, std::move(r), *region.to_write));
+             return true;
+           }
+           return false;
+         });
+       // if the size of the region is wider than the range from the entry in to_remove,
+       // we create a separate extent in the original way.
+      } else if (pre_alloc_addr_remapped.contains(region.addr, region.len) &&
+                region.len <= delta_based_overwrite_max_extent_size) {
+       erased_num = std::erase_if(
+         ops.to_remap,
+         [&region, &to_remap](auto &r) {
+           interval_set<uint64_t> range;
+           range.insert(r.pin->get_key(), r.pin->get_length());
+           if (range.contains(region.addr, region.len)) {
+             to_remap.push_back(extent_to_remap_t::create_overwrite(
+               region.addr - range.begin().get_start(), region.len,
+               std::move(r.pin), *region.to_write));
+             return true;
+           }
+           return false;
+         });
+       assert(erased_num > 0);
+      }
+      if (erased_num == 0)  {
+       ops.to_insert.push_back(extent_to_insert_t::create_data(
+         region.addr, region.len, region.to_write));
+      }
     } else if (region.is_zero()) {
       visitted++;
       assert(!(region.to_write.has_value()));
@@ -285,6 +360,7 @@ overwrite_ops_t prepare_ops_list(
        region.addr, region.len));
     }
   }
+  ops.to_remap.splice(ops.to_remap.end(), to_remap);
 
   logger().debug(
     "to_remap list size: {}"
@@ -334,6 +410,22 @@ void splice_extent_to_write(
   }
 }
 
+ceph::bufferlist ObjectDataBlock::get_delta() {  // encodes the `delta` member into a new bufferlist and returns it
+  ceph::bufferlist bl;
+  encode(delta, bl);
+  return bl;
+}
+
+void ObjectDataBlock::apply_delta(const ceph::bufferlist &bl) {  // decodes a list of deltas from bl and replays each onto this block's buffer
+  auto biter = bl.begin();
+  decltype(delta) deltas;  // same container type as the `delta` member
+  decode(deltas, biter);
+  for (auto &&d : deltas) {  // entries are applied in decoded order
+    auto iter = d.bl.cbegin();
+    iter.copy(d.len, get_bptr().c_str() + d.offset);  // copies d.len bytes from d.bl to buffer offset d.offset; NOTE(review): no bounds check here -- confirm d.offset + d.len always fits the buffer
+  }
+}
+
 /// Creates remap extents in to_remap
 ObjectDataHandler::write_ret do_remappings(
   context_t ctx,
@@ -342,7 +434,7 @@ ObjectDataHandler::write_ret do_remappings(
   return trans_intr::do_for_each(
     to_remap,
     [ctx](auto &region) {
-      if (region.is_remap()) {
+      if (region.is_remap1()) {
         return ctx.tm.remap_pin<ObjectDataBlock, 1>(
           ctx.t,
           std::move(region.pin),
@@ -355,6 +447,22 @@ ObjectDataHandler::write_ret do_remappings(
           return ObjectDataHandler::write_iertr::now();
         });
       } else if (region.is_overwrite()) {
+       return ctx.tm.get_mutable_extent_by_laddr<ObjectDataBlock>(
+         ctx.t,
+         region.laddr_start,
+         region.length
+       ).handle_error_interruptible(
+         TransactionManager::base_iertr::pass_further{},
+         crimson::ct_error::assert_all{
+           "ObjectDataHandler::do_remapping hit invalid error"
+         }
+       ).si_then([&region](auto extent) {
+         extent_len_t off = region.new_offset;
+         assert(region.bl->length() == region.new_len);
+         extent->overwrite(off, *region.bl);
+         return ObjectDataHandler::write_iertr::now();
+       });
+      } else if (region.is_remap2()) {
         return ctx.tm.remap_pin<ObjectDataBlock, 2>(
           ctx.t,
           std::move(region.pin),
@@ -960,7 +1068,7 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation(
   return seastar::do_with(
     lba_pin_list_t(),
     extent_to_write_list_t(),
-    [ctx, size, &object_data](auto &pins, auto &to_write) {
+    [ctx, size, &object_data, this](auto &pins, auto &to_write) {
       LOG_PREFIX(ObjectDataHandler::trim_data_reservation);
       DEBUGT("object_data: {}~{}",
             ctx.t,
@@ -1038,9 +1146,10 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation(
             });
           }
        }
-      }).si_then([ctx, size, &to_write, &object_data, &pins] {
+      }).si_then([ctx, size, &to_write, &object_data, &pins, this] {
         return seastar::do_with(
-          prepare_ops_list(pins, to_write),
+          prepare_ops_list(pins, to_write,
+           delta_based_overwrite_max_extent_size),
           [ctx, size, &object_data](auto &ops) {
             return do_remappings(ctx, ops.to_remap
             ).si_then([ctx, &ops] {
@@ -1162,7 +1271,7 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
   return seastar::do_with(
     std::move(_pins),
     extent_to_write_list_t(),
-    [ctx, len, offset, overwrite_plan, bl=std::move(bl)]
+    [ctx, len, offset, overwrite_plan, bl=std::move(bl), this]
     (auto &pins, auto &to_write) mutable
   {
     LOG_PREFIX(ObjectDataHandler::overwrite);
@@ -1178,7 +1287,7 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
       pins.front(),
       overwrite_plan
     ).si_then([ctx, len, offset, overwrite_plan, bl=std::move(bl),
-               &to_write, &pins](auto p) mutable {
+               &to_write, &pins, this](auto p) mutable {
       auto &[left_extent, headptr] = p;
       if (left_extent) {
         ceph_assert(left_extent->addr == overwrite_plan.pin_begin);
@@ -1195,7 +1304,7 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
                  pin_begin=overwrite_plan.pin_begin,
                  pin_end=overwrite_plan.pin_end,
                  bl=std::move(bl), headptr=std::move(headptr),
-                 &to_write, &pins](auto p) mutable {
+                 &to_write, &pins, this](auto p) mutable {
         auto &[right_extent, tailptr] = p;
         if (bl.has_value()) {
           auto write_offset = offset;
@@ -1232,7 +1341,8 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
         assert(pin_end == to_write.back().get_end_addr());
 
         return seastar::do_with(
-          prepare_ops_list(pins, to_write),
+          prepare_ops_list(pins, to_write,
+           delta_based_overwrite_max_extent_size),
           [ctx](auto &ops) {
             return do_remappings(ctx, ops.to_remap
             ).si_then([ctx, &ops] {
index b5f432d5ac7760140895059a157740e0e68ae4e7..eaa05da8d545675b7e8a552ee3034e1a9fbcfb43 100644 (file)
 
 namespace crimson::os::seastore {
 
+struct block_delta_t {  // one recorded overwrite of a data block: `len` bytes taken from `bl`, placed at `offset`
+  uint64_t offset = 0;  // byte offset into the block buffer (apply_delta adds it to get_bptr().c_str())
+  extent_len_t len = 0;  // number of bytes apply_delta copies out of bl
+  bufferlist bl;  // the data written by the overwrite
+
+  DENC(block_delta_t, v, p) {  // encodes/decodes the three fields in declaration order
+    DENC_START(1, 1, p);
+    denc(v.offset, p);
+    denc(v.len, p);
+    denc(v.bl, p);
+    DENC_FINISH(p);
+  }
+};
+
 struct ObjectDataBlock : crimson::os::seastore::LogicalCachedExtent {
   using Ref = TCachedExtentRef<ObjectDataBlock>;
 
+  std::vector<block_delta_t> delta = {};
+
   explicit ObjectDataBlock(ceph::bufferptr &&ptr)
     : LogicalCachedExtent(std::move(ptr)) {}
   explicit ObjectDataBlock(const ObjectDataBlock &other)
@@ -35,16 +51,15 @@ struct ObjectDataBlock : crimson::os::seastore::LogicalCachedExtent {
     return TYPE;
   }
 
-  ceph::bufferlist get_delta() final {
-    /* Currently, we always allocate fresh ObjectDataBlock's rather than
-     * mutating existing ones. */
-    ceph_assert(0 == "Should be impossible");
+  void overwrite(extent_len_t offset, bufferlist bl) {  // copies all of bl into this block's buffer at `offset` and records the change in `delta`
+    auto iter = bl.cbegin();
+    iter.copy(bl.length(), get_bptr().c_str() + offset);  // NOTE(review): no check that offset + bl.length() fits the buffer -- confirm callers guarantee it
+    delta.push_back({offset, bl.length(), bl});  // recorded as {offset, len, bl} so get_delta() can encode it
+  }
 
-  void apply_delta(const ceph::bufferlist &bl) final {
-    // See get_delta()
-    ceph_assert(0 == "Should be impossible");
-  }
+  ceph::bufferlist get_delta() final;
+
+  void apply_delta(const ceph::bufferlist &bl) final;
 };
 using ObjectDataBlockRef = TCachedExtentRef<ObjectDataBlock>;
 
@@ -52,7 +67,9 @@ class ObjectDataHandler {
 public:
   using base_iertr = TransactionManager::base_iertr;
 
-  ObjectDataHandler(uint32_t mos) : max_object_size(mos) {}
+  ObjectDataHandler(uint32_t mos) : max_object_size(mos),
+    delta_based_overwrite_max_extent_size(
+      crimson::common::get_conf<Option::size_t>("seastore_data_delta_based_overwrite")) {}
 
   struct context_t {
     TransactionManager &tm;
@@ -147,10 +164,13 @@ private:
    * these regions and remove this assumption.
    */
   const uint32_t max_object_size = 0;
+  extent_len_t delta_based_overwrite_max_extent_size = 0; // enable only if rbm is used
 };
 
 }
 
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::block_delta_t)
+
 #if FMT_VERSION >= 90000
 template <> struct fmt::formatter<crimson::os::seastore::ObjectDataBlock> : fmt::ostream_formatter {};
 #endif
index b708e0a9f203043c8a372bcff75a905dc618139a..34c87a404c4d87d9ce99f1e54dbee8884a4798b9 100644 (file)
@@ -316,6 +316,21 @@ public:
     });
   }
 
+  template <typename T>
+  read_extent_ret<T> get_mutable_extent_by_laddr(Transaction &t, laddr_t laddr, extent_len_t len) {  // looks up the pin at laddr, reads its extent, and returns the mutable version cast to T
+    return get_pin(t, laddr
+    ).si_then([this, &t, len](auto pin) {
+      ceph_assert(pin->is_stable());  // the mapping must be stable,
+      ceph_assert(!pin->is_clone());  // must not be a clone,
+      ceph_assert(pin->get_length() == len);  // and must have exactly the caller-supplied length
+      return this->read_pin<T>(t, std::move(pin));
+    }).si_then([this, &t](auto extent) {
+      auto ext = get_mutable_extent(t, extent)->template cast<T>();  // mutable extent obtained via get_mutable_extent(t, ...), then cast to T
+      return alloc_extent_iertr::make_ready_future<TCachedExtentRef<T>>(
+       std::move(ext));
+    });
+  }
+
   /**
    * remap_pin
    *