From e6567f6484f4ae4c1727eb7d03000dc7d131d1cf Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Wed, 21 Apr 2021 23:15:38 -0700 Subject: [PATCH] crimson/os/seastore/cache: ensure retired extents remain until transactions complete Adds a structure to ensure that retired extents remain in the index until any transactions which might reference them complete. Signed-off-by: Samuel Just --- src/crimson/os/seastore/cache.cc | 24 ++++- src/crimson/os/seastore/cache.h | 18 +++- src/crimson/os/seastore/cached_extent.cc | 2 + src/crimson/os/seastore/cached_extent.h | 111 +++++++++++++++++++---- src/crimson/os/seastore/seastore_types.h | 14 +++ src/crimson/os/seastore/transaction.h | 17 +++- 6 files changed, 158 insertions(+), 28 deletions(-) diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index 1d707f54345..3ebfd70c41d 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -105,6 +105,17 @@ void Cache::remove_extent(CachedExtentRef ref) extents.erase(*ref); } +void Cache::retire_extent(CachedExtentRef ref) +{ + logger().debug("retire_extent: {}", *ref); + assert(ref->is_valid()); + + remove_from_dirty(ref); + ref->dirty_from_or_retired_at = JOURNAL_SEQ_MAX; + retired_extent_gate.add_extent(*ref); + ref->state = CachedExtent::extent_state_t::RETIRED; +} + void Cache::replace_extent(CachedExtentRef next, CachedExtentRef prev) { assert(next->get_paddr() == prev->get_paddr()); @@ -323,7 +334,7 @@ void Cache::complete_commit( } i->state = CachedExtent::extent_state_t::DIRTY; if (i->version == 1 || i->get_type() == extent_types_t::ROOT) { - i->dirty_from = seq; + i->dirty_from_or_retired_at = seq; } } @@ -343,6 +354,13 @@ void Cache::complete_commit( for (auto &i: t.mutated_block_list) { i->complete_io(); } + + last_commit = seq; + for (auto &i: t.retired_set) { + logger().debug("try_construct_record: retiring {}", *i); + i->dirty_from_or_retired_at = last_commit; + } + retired_extent_gate.prune(); } void 
Cache::init() { @@ -390,7 +408,7 @@ Cache::replay_delta( logger().debug("replay_delta: found root delta"); remove_extent(root); root->apply_delta_and_adjust_crc(record_base, delta.bl); - root->dirty_from = journal_seq; + root->dirty_from_or_retired_at = journal_seq; add_extent(root); return replay_delta_ertr::now(); } else { @@ -438,7 +456,7 @@ Cache::replay_delta( assert(extent->last_committed_crc == delta.final_crc); if (extent->version == 0) { - extent->dirty_from = journal_seq; + extent->dirty_from_or_retired_at = journal_seq; } extent->version++; mark_dirty(extent); diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h index d45cf1dade2..1876e0960f2 100644 --- a/src/crimson/os/seastore/cache.h +++ b/src/crimson/os/seastore/cache.h @@ -90,20 +90,28 @@ public: Cache(SegmentManager &segment_manager); ~Cache(); + retired_extent_gate_t retired_extent_gate; + /// Creates empty transaction TransactionRef create_transaction() { - return std::make_unique( + auto ret = std::make_unique( get_dummy_ordering_handle(), - false + false, + last_commit ); + retired_extent_gate.add_token(ret->retired_gate_token); + return ret; } /// Creates empty weak transaction TransactionRef create_weak_transaction() { - return std::make_unique( + auto ret = std::make_unique( get_dummy_ordering_handle(), - true + true, + last_commit ); + retired_extent_gate.add_token(ret->retired_gate_token); + return ret; } /** @@ -523,6 +531,8 @@ private: RootBlockRef root; ///< ref to current root ExtentIndex extents; ///< set of live extents + journal_seq_t last_commit = JOURNAL_SEQ_MIN; + /** * dirty * diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc index d65d268db56..9612a739608 100644 --- a/src/crimson/os/seastore/cached_extent.cc +++ b/src/crimson/os/seastore/cached_extent.cc @@ -46,6 +46,8 @@ std::ostream &operator<<(std::ostream &out, CachedExtent::extent_state_t state) return out << "CLEAN"; case 
CachedExtent::extent_state_t::DIRTY: return out << "DIRTY"; + case CachedExtent::extent_state_t::RETIRED: + return out << "RETIRED"; case CachedExtent::extent_state_t::INVALID: return out << "INVALID"; default: diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index c3df9554283..8ccd2451c15 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -48,6 +48,7 @@ class CachedExtent : public boost::intrusive_ref_counter< // during write, contents match disk, version == 0 DIRTY, // Same as CLEAN, but contents do not match disk, // version > 0 + RETIRED, // In ExtentIndex while in retired_extent_gate INVALID // Part of no ExtentIndex set } state = extent_state_t::INVALID; friend std::ostream &operator<<(std::ostream &, extent_state_t); @@ -60,14 +61,6 @@ class CachedExtent : public boost::intrusive_ref_counter< // Points at current version while in state MUTATION_PENDING CachedExtentRef prior_instance; - /** - * dirty_from - * - * When dirty, indiciates the oldest journal entry which mutates - * this extent. 
- */ - journal_seq_t dirty_from; - public: /** * duplicate_for_write @@ -137,7 +130,7 @@ public: out << "CachedExtent(addr=" << this << ", type=" << get_type() << ", version=" << version - << ", dirty_from=" << dirty_from + << ", dirty_from_or_retired_at=" << dirty_from_or_retired_at << ", paddr=" << get_paddr() << ", state=" << state << ", last_committed_crc=" << last_committed_crc @@ -232,7 +225,12 @@ public: /// Returns true if extent has not been superceded or retired bool is_valid() const { - return state != extent_state_t::INVALID; + return state != extent_state_t::INVALID && state != extent_state_t::RETIRED; + } + + /// True iff extent is in state RETIRED + bool is_retired() const { + return state == extent_state_t::RETIRED; } /// Returns true if extent or prior_instance has been invalidated @@ -240,13 +238,17 @@ public: return !is_valid() || (prior_instance && !prior_instance->is_valid()); } - /** - * get_dirty_from - * - * Return journal location of oldest relevant delta. - */ - auto get_dirty_from() const { return dirty_from; } + /// Return journal location of oldest relevant delta, only valid while DIRTY + auto get_dirty_from() const { + ceph_assert(is_dirty()); + return dirty_from_or_retired_at; + } + /// Return journal seq at which extent was retired, only valid while RETIRED + auto get_retired_at() const { + ceph_assert(is_retired()); + return dirty_from_or_retired_at; + } /** * get_paddr @@ -316,6 +318,15 @@ private: using list = boost::intrusive::list< CachedExtent, primary_ref_list_member_options>; + friend class retired_extent_gate_t; + + /** + * dirty_from_or_retired_at + * + * Encodes ordering token for primary_ref_list -- dirty_from when + * dirty or retired_at if retired.
+ */ + journal_seq_t dirty_from_or_retired_at; /// Actual data contents ceph::bufferptr ptr; @@ -351,7 +362,7 @@ protected: CachedExtent(ceph::bufferptr &&ptr) : ptr(std::move(ptr)) {} CachedExtent(const CachedExtent &other) : state(other.state), - dirty_from(other.dirty_from), + dirty_from_or_retired_at(other.dirty_from_or_retired_at), ptr(other.ptr.c_str(), other.ptr.length()), version(other.version), poffset(other.poffset) {} @@ -359,7 +370,7 @@ protected: struct share_buffer_t {}; CachedExtent(const CachedExtent &other, share_buffer_t) : state(other.state), - dirty_from(other.dirty_from), + dirty_from_or_retired_at(other.dirty_from_or_retired_at), ptr(other.ptr), version(other.version), poffset(other.poffset) {} @@ -571,6 +582,70 @@ using lba_pin_list_t = std::list; std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs); +/** + * retired_extent_gate_t + * + * We need to keep each retired extent in memory until all transactions + * that could still reference it have completed. live_tokens tracks the + * set of tokens (which will be embedded in Transactions) still live + * in order of the commit after which each was created. retired_extents + * lists retired extents ordered by the commit at which they were + * retired. + */ +class retired_extent_gate_t { +public: + class token_t { + friend class retired_extent_gate_t; + retired_extent_gate_t *parent = nullptr; + journal_seq_t created_after; + + boost::intrusive::list_member_hook<> list_hook; + using list_hook_options = boost::intrusive::member_hook< + token_t, + boost::intrusive::list_member_hook<>, + &token_t::list_hook>; + using registry = boost::intrusive::list< + token_t, + list_hook_options>; + public: + token_t(journal_seq_t created_after) : created_after(created_after) {} + ~token_t(); + }; + + void prune() { + journal_seq_t prune_to = live_tokens.empty() ?
+ JOURNAL_SEQ_MAX : live_tokens.front().created_after; + while (!retired_extents.empty() && + prune_to > retired_extents.front().get_retired_at()) { + auto ext = &retired_extents.front(); + retired_extents.pop_front(); + intrusive_ptr_release(ext); + } + } + + void add_token(token_t &t) { + t.parent = this; + live_tokens.push_back(t); + } + + void add_extent(CachedExtent &extent) { + intrusive_ptr_add_ref(&extent); + retired_extents.push_back(extent); + } + +private: + token_t::registry live_tokens; + CachedExtent::list retired_extents; +}; + +inline retired_extent_gate_t::token_t::~token_t() { + if (parent) { + parent->live_tokens.erase( + parent->live_tokens.s_iterator_to(*this)); + parent->prune(); + parent = nullptr; + } +} /** * LogicalCachedExtent diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h index 655a9bea684..39966865db5 100644 --- a/src/crimson/os/seastore/seastore_types.h +++ b/src/crimson/os/seastore/seastore_types.h @@ -36,6 +36,8 @@ struct seastore_meta_t { // Identifies segment location on disk, see SegmentManager, using segment_id_t = uint32_t; +constexpr segment_id_t MAX_SEG_ID = + std::numeric_limits::max(); constexpr segment_id_t NULL_SEG_ID = std::numeric_limits::max() - 1; /* Used to denote relative paddr_t */ @@ -60,6 +62,8 @@ std::ostream &segment_to_stream(std::ostream &, const segment_id_t &t); using segment_off_t = int32_t; constexpr segment_off_t NULL_SEG_OFF = std::numeric_limits::max(); +constexpr segment_off_t MAX_SEG_OFF = + std::numeric_limits::max(); std::ostream &offset_to_stream(std::ostream &, const segment_off_t &t); @@ -68,6 +72,8 @@ std::ostream &offset_to_stream(std::ostream &, const segment_off_t &t); using segment_seq_t = uint32_t; static constexpr segment_seq_t NULL_SEG_SEQ = std::numeric_limits::max(); +static constexpr segment_seq_t MAX_SEG_SEQ = + std::numeric_limits::max(); // Offset of delta within a record using record_delta_idx_t = uint32_t; @@ -192,6 +198,10 @@ 
WRITE_CMP_OPERATORS_2(paddr_t, segment, offset) WRITE_EQ_OPERATORS_2(paddr_t, segment, offset) constexpr paddr_t P_ADDR_NULL = paddr_t{}; constexpr paddr_t P_ADDR_MIN = paddr_t{0, 0}; +constexpr paddr_t P_ADDR_MAX = paddr_t{ + MAX_SEG_ID, + MAX_SEG_OFF +}; constexpr paddr_t make_record_relative_paddr(segment_off_t off) { return paddr_t{RECORD_REL_SEG_ID, off}; } @@ -247,6 +257,10 @@ constexpr journal_seq_t JOURNAL_SEQ_MIN{ 0, paddr_t{0, 0} }; +constexpr journal_seq_t JOURNAL_SEQ_MAX{ + MAX_SEG_SEQ, + P_ADDR_MAX +}; std::ostream &operator<<(std::ostream &out, const journal_seq_t &seq); diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h index 4510c525746..d17907a3fa9 100644 --- a/src/crimson/os/seastore/transaction.h +++ b/src/crimson/os/seastore/transaction.h @@ -5,6 +5,8 @@ #include +#include + #include "crimson/os/seastore/ordering_handle.h" #include "crimson/os/seastore/seastore_types.h" #include "crimson/os/seastore/cached_extent.h" @@ -12,6 +14,8 @@ namespace crimson::os::seastore { +struct retired_extent_gate_t; + /** * Transaction * @@ -138,11 +142,17 @@ private: std::vector> retired_uncached; + journal_seq_t initiated_after; + + retired_extent_gate_t::token_t retired_gate_token; + public: Transaction( OrderingHandle &&handle, - bool weak - ) : handle(std::move(handle)), weak(weak) {} + bool weak, + journal_seq_t initiated_after + ) : handle(std::move(handle)), weak(weak), + retired_gate_token(initiated_after) {} ~Transaction() { for (auto i = write_set.begin(); @@ -158,7 +168,8 @@ using TransactionRef = Transaction::Ref; inline TransactionRef make_test_transaction() { return std::make_unique( get_dummy_ordering_handle(), - false + false, + journal_seq_t{} ); } -- 2.39.5