git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
crimson/os/seastore/cache: ensure retired extents remain until transactions complete
author Samuel Just <sjust@redhat.com>
Thu, 22 Apr 2021 06:15:38 +0000 (23:15 -0700)
committer Samuel Just <sjust@redhat.com>
Thu, 22 Apr 2021 06:16:43 +0000 (23:16 -0700)
Adds a structure to ensure that retired extents remain in the index until
any transactions which might reference them complete.

Signed-off-by: Samuel Just <sjust@redhat.com>
src/crimson/os/seastore/cache.cc
src/crimson/os/seastore/cache.h
src/crimson/os/seastore/cached_extent.cc
src/crimson/os/seastore/cached_extent.h
src/crimson/os/seastore/seastore_types.h
src/crimson/os/seastore/transaction.h

index 1d707f54345396d94fcdca2635c2a8c36e08a8c3..3ebfd70c41d661c6a4b3de5512a3856b6d3cc5d2 100644 (file)
@@ -105,6 +105,17 @@ void Cache::remove_extent(CachedExtentRef ref)
   extents.erase(*ref);
 }
 
+void Cache::retire_extent(CachedExtentRef ref)
+{
+  logger().debug("retire_extent: {}", *ref);
+  assert(ref->is_valid());
+
+  remove_from_dirty(ref);
+  ref->dirty_from_or_retired_at = JOURNAL_SEQ_MAX;
+  retired_extent_gate.add_extent(*ref);
+  ref->state = CachedExtent::extent_state_t::RETIRED;
+}
+
 void Cache::replace_extent(CachedExtentRef next, CachedExtentRef prev)
 {
   assert(next->get_paddr() == prev->get_paddr());
@@ -323,7 +334,7 @@ void Cache::complete_commit(
     }
     i->state = CachedExtent::extent_state_t::DIRTY;
     if (i->version == 1 || i->get_type() == extent_types_t::ROOT) {
-      i->dirty_from = seq;
+      i->dirty_from_or_retired_at = seq;
     }
   }
 
@@ -343,6 +354,13 @@ void Cache::complete_commit(
   for (auto &i: t.mutated_block_list) {
     i->complete_io();
   }
+
+  last_commit = seq;
+  for (auto &i: t.retired_set) {
+    logger().debug("try_construct_record: retiring {}", *i);
+    i->dirty_from_or_retired_at = last_commit;
+  }
+  retired_extent_gate.prune();
 }
 
 void Cache::init() {
@@ -390,7 +408,7 @@ Cache::replay_delta(
     logger().debug("replay_delta: found root delta");
     remove_extent(root);
     root->apply_delta_and_adjust_crc(record_base, delta.bl);
-    root->dirty_from = journal_seq;
+    root->dirty_from_or_retired_at = journal_seq;
     add_extent(root);
     return replay_delta_ertr::now();
   } else {
@@ -438,7 +456,7 @@ Cache::replay_delta(
       assert(extent->last_committed_crc == delta.final_crc);
 
       if (extent->version == 0) {
-       extent->dirty_from = journal_seq;
+       extent->dirty_from_or_retired_at = journal_seq;
       }
       extent->version++;
       mark_dirty(extent);
index d45cf1dade2965af7340b007a1481aaead26de95..1876e0960f22e0062b56a642e1f15e31a853d939 100644 (file)
@@ -90,20 +90,28 @@ public:
   Cache(SegmentManager &segment_manager);
   ~Cache();
 
+  retired_extent_gate_t retired_extent_gate;
+
   /// Creates empty transaction
   TransactionRef create_transaction() {
-    return std::make_unique<Transaction>(
+    auto ret = std::make_unique<Transaction>(
       get_dummy_ordering_handle(),
-      false
+      false,
+      last_commit
     );
+    retired_extent_gate.add_token(ret->retired_gate_token);
+    return ret;
   }
 
   /// Creates empty weak transaction
   TransactionRef create_weak_transaction() {
-    return std::make_unique<Transaction>(
+    auto ret = std::make_unique<Transaction>(
       get_dummy_ordering_handle(),
-      true
+      true,
+      last_commit
     );
+    retired_extent_gate.add_token(ret->retired_gate_token);
+    return ret;
   }
 
   /**
@@ -523,6 +531,8 @@ private:
   RootBlockRef root;               ///< ref to current root
   ExtentIndex extents;             ///< set of live extents
 
+  journal_seq_t last_commit = JOURNAL_SEQ_MIN;
+
   /**
    * dirty
    *
index d65d268db569119e006fae292566495fac089c35..9612a73960892a97f9971dad3f1376d6574ee6b3 100644 (file)
@@ -46,6 +46,8 @@ std::ostream &operator<<(std::ostream &out, CachedExtent::extent_state_t state)
     return out << "CLEAN";
   case CachedExtent::extent_state_t::DIRTY:
     return out << "DIRTY";
+  case CachedExtent::extent_state_t::RETIRED:
+    return out << "RETIRED";
   case CachedExtent::extent_state_t::INVALID:
     return out << "INVALID";
   default:
index c3df9554283e68fd4dd2c209eebfdc2e52491635..8ccd2451c157b244cd1ba35012478ca81c893509 100644 (file)
@@ -48,6 +48,7 @@ class CachedExtent : public boost::intrusive_ref_counter<
                            //  during write, contents match disk, version == 0
     DIRTY,                 // Same as CLEAN, but contents do not match disk,
                            //  version > 0
+    RETIRED,               // In ExtentIndex while in retired_extent_gate
     INVALID                // Part of no ExtentIndex set
   } state = extent_state_t::INVALID;
   friend std::ostream &operator<<(std::ostream &, extent_state_t);
@@ -60,14 +61,6 @@ class CachedExtent : public boost::intrusive_ref_counter<
   // Points at current version while in state MUTATION_PENDING
   CachedExtentRef prior_instance;
 
-  /**
-   * dirty_from
-   *
-   * When dirty, indiciates the oldest journal entry which mutates
-   * this extent.
-   */
-  journal_seq_t dirty_from;
-
 public:
   /**
    *  duplicate_for_write
@@ -137,7 +130,7 @@ public:
     out << "CachedExtent(addr=" << this
        << ", type=" << get_type()
        << ", version=" << version
-       << ", dirty_from=" << dirty_from
+       << ", dirty_from_or_retired_at=" << dirty_from_or_retired_at
        << ", paddr=" << get_paddr()
        << ", state=" << state
        << ", last_committed_crc=" << last_committed_crc
@@ -232,7 +225,12 @@ public:
 
   /// Returns true if extent has not been superseded or retired
   bool is_valid() const {
-    return state != extent_state_t::INVALID;
+    return state != extent_state_t::INVALID && state != extent_state_t::RETIRED;
+  }
+
+  /// True iff extent is in state RETIRED
+  bool is_retired() const {
+    return state == extent_state_t::RETIRED;
   }
 
   /// Returns true if extent or prior_instance has been invalidated
@@ -240,13 +238,17 @@ public:
     return !is_valid() || (prior_instance && !prior_instance->is_valid());
   }
 
-  /**
-   * get_dirty_from
-   *
-   * Return journal location of oldest relevant delta.
-   */
-  auto get_dirty_from() const { return dirty_from; }
+  /// Return journal location of oldest relevant delta, only valid while DIRTY
+  auto get_dirty_from() const {
+    ceph_assert(is_dirty());
+    return dirty_from_or_retired_at;
+  }
 
+  /// Return journal seq of the commit which retired this extent, only valid while RETIRED
+  auto get_retired_at() const {
+    ceph_assert(is_retired());
+    return dirty_from_or_retired_at;
+  }
 
   /**
    * get_paddr
@@ -316,6 +318,15 @@ private:
   using list = boost::intrusive::list<
     CachedExtent,
     primary_ref_list_member_options>;
+  friend class retired_extent_gate_t;
+
+  /**
+   * dirty_from_or_retired_at
+   *
+   * Encodes ordering token for primary_ref_list -- dirty_from when
+   * dirty or retired_at if retired.
+   */
+  journal_seq_t dirty_from_or_retired_at;
 
   /// Actual data contents
   ceph::bufferptr ptr;
@@ -351,7 +362,7 @@ protected:
   CachedExtent(ceph::bufferptr &&ptr) : ptr(std::move(ptr)) {}
   CachedExtent(const CachedExtent &other)
     : state(other.state),
-      dirty_from(other.dirty_from),
+      dirty_from_or_retired_at(other.dirty_from_or_retired_at),
       ptr(other.ptr.c_str(), other.ptr.length()),
       version(other.version),
       poffset(other.poffset) {}
@@ -359,7 +370,7 @@ protected:
   struct share_buffer_t {};
   CachedExtent(const CachedExtent &other, share_buffer_t) :
     state(other.state),
-    dirty_from(other.dirty_from),
+    dirty_from_or_retired_at(other.dirty_from_or_retired_at),
     ptr(other.ptr),
     version(other.version),
     poffset(other.poffset) {}
@@ -571,6 +582,70 @@ using lba_pin_list_t = std::list<LBAPinRef>;
 
 std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs);
 
+/**
+ * retired_extent_gate_t
+ *
+ * We need to keep each retired extent in memory until all transactions
+ * that could still reference it have completed.  live_tokens tracks the
+ * set of tokens (which will be embedded in Transactions) still live,
+ * in order of the commit after which each was created.  retired_extents
+ * lists retired extents ordered by the commit at which they were
+ * retired.
+ */
+class retired_extent_gate_t {
+public:
+  class token_t {
+    friend class retired_extent_gate_t;
+    retired_extent_gate_t *parent = nullptr;
+    journal_seq_t created_after;
+
+    boost::intrusive::list_member_hook<> list_hook;
+    using list_hook_options = boost::intrusive::member_hook<
+      token_t,
+      boost::intrusive::list_member_hook<>,
+      &token_t::list_hook>;
+    using registry =  boost::intrusive::list<
+      token_t,
+      list_hook_options>;
+  public:
+    token_t(journal_seq_t created_after) : created_after(created_after) {}
+    ~token_t();
+  };
+
+  void prune() {
+    journal_seq_t prune_to = live_tokens.empty() ?
+      JOURNAL_SEQ_MAX : live_tokens.front().created_after;
+    while (!retired_extents.empty() &&
+          prune_to > retired_extents.front().get_retired_at()) {
+      auto ext = &retired_extents.front();
+      retired_extents.pop_front();
+      intrusive_ptr_release(ext);
+    }
+  }
+
+  void add_token(token_t &t) {
+    t.parent = this;
+    live_tokens.push_back(t);
+  }
+
+  void add_extent(CachedExtent &extent) {
+    intrusive_ptr_add_ref(&extent);
+    retired_extents.push_back(extent);
+  }
+
+private:
+  token_t::registry live_tokens;
+  CachedExtent::list retired_extents;
+};
+
+inline retired_extent_gate_t::token_t::~token_t() {
+  if (parent) {
+    parent->live_tokens.erase(
+      parent->live_tokens.s_iterator_to(*this));
+    parent->prune();
+    parent = nullptr;
+  }
+}
 
 /**
  * LogicalCachedExtent
index 655a9bea6840cc0a1679adcd4543f7b7e02d184e..39966865db5366e7f3326476e657aa0ee21997a0 100644 (file)
@@ -36,6 +36,8 @@ struct seastore_meta_t {
 
 // Identifies segment location on disk, see SegmentManager,
 using segment_id_t = uint32_t;
+constexpr segment_id_t MAX_SEG_ID =
+  std::numeric_limits<segment_id_t>::max();
 constexpr segment_id_t NULL_SEG_ID =
   std::numeric_limits<segment_id_t>::max() - 1;
 /* Used to denote relative paddr_t */
@@ -60,6 +62,8 @@ std::ostream &segment_to_stream(std::ostream &, const segment_id_t &t);
 using segment_off_t = int32_t;
 constexpr segment_off_t NULL_SEG_OFF =
   std::numeric_limits<segment_off_t>::max();
+constexpr segment_off_t MAX_SEG_OFF =
+  std::numeric_limits<segment_off_t>::max();
 
 std::ostream &offset_to_stream(std::ostream &, const segment_off_t &t);
 
@@ -68,6 +72,8 @@ std::ostream &offset_to_stream(std::ostream &, const segment_off_t &t);
 using segment_seq_t = uint32_t;
 static constexpr segment_seq_t NULL_SEG_SEQ =
   std::numeric_limits<segment_seq_t>::max();
+static constexpr segment_seq_t MAX_SEG_SEQ =
+  std::numeric_limits<segment_seq_t>::max();
 
 // Offset of delta within a record
 using record_delta_idx_t = uint32_t;
@@ -192,6 +198,10 @@ WRITE_CMP_OPERATORS_2(paddr_t, segment, offset)
 WRITE_EQ_OPERATORS_2(paddr_t, segment, offset)
 constexpr paddr_t P_ADDR_NULL = paddr_t{};
 constexpr paddr_t P_ADDR_MIN = paddr_t{0, 0};
+constexpr paddr_t P_ADDR_MAX = paddr_t{
+  MAX_SEG_ID,
+  MAX_SEG_OFF
+};
 constexpr paddr_t make_record_relative_paddr(segment_off_t off) {
   return paddr_t{RECORD_REL_SEG_ID, off};
 }
@@ -247,6 +257,10 @@ constexpr journal_seq_t JOURNAL_SEQ_MIN{
   0,
   paddr_t{0, 0}
 };
+constexpr journal_seq_t JOURNAL_SEQ_MAX{
+  MAX_SEG_SEQ,
+  P_ADDR_MAX
+};
 
 std::ostream &operator<<(std::ostream &out, const journal_seq_t &seq);
 
index 4510c525746eb60d264207433a9ff84592948456..d17907a3fa98cb1646823a68f1880d3dee959299 100644 (file)
@@ -5,6 +5,8 @@
 
 #include <iostream>
 
+#include <boost/intrusive/list.hpp>
+
 #include "crimson/os/seastore/ordering_handle.h"
 #include "crimson/os/seastore/seastore_types.h"
 #include "crimson/os/seastore/cached_extent.h"
@@ -12,6 +14,8 @@
 
 namespace crimson::os::seastore {
 
+struct retired_extent_gate_t;
+
 /**
  * Transaction
  *
@@ -138,11 +142,17 @@ private:
 
   std::vector<std::pair<paddr_t, extent_len_t>> retired_uncached;
 
+  journal_seq_t initiated_after;
+
+  retired_extent_gate_t::token_t retired_gate_token;
+
 public:
   Transaction(
     OrderingHandle &&handle,
-    bool weak
-  ) : handle(std::move(handle)), weak(weak) {}
+    bool weak,
+    journal_seq_t initiated_after
+  ) : handle(std::move(handle)), weak(weak),
+      retired_gate_token(initiated_after) {}
 
   ~Transaction() {
     for (auto i = write_set.begin();
@@ -158,7 +168,8 @@ using TransactionRef = Transaction::Ref;
 inline TransactionRef make_test_transaction() {
   return std::make_unique<Transaction>(
     get_dummy_ordering_handle(),
-    false
+    false,
+    journal_seq_t{}
   );
 }