]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/os/seastore: simplify journal tails 47206/head
authorYingxin Cheng <yingxin.cheng@intel.com>
Thu, 21 Jul 2022 09:16:50 +0000 (17:16 +0800)
committerYingxin Cheng <yingxin.cheng@intel.com>
Fri, 22 Jul 2022 03:16:48 +0000 (11:16 +0800)
* append latest dirty tail as journal delta, in order to simplify and
  drop journal_tail_committed in the cleaner.
* simplify misc journal tails into alloc_tail and dirty_tail, with
  proper renaming.
* move journal-tail recovery logic from cleaner to journal.

Signed-off-by: Yingxin Cheng <yingxin.cheng@intel.com>
12 files changed:
src/crimson/os/seastore/async_cleaner.cc
src/crimson/os/seastore/async_cleaner.h
src/crimson/os/seastore/cache.cc
src/crimson/os/seastore/extent_placement_manager.cc
src/crimson/os/seastore/journal/segment_allocator.cc
src/crimson/os/seastore/journal/segmented_journal.cc
src/crimson/os/seastore/journal/segmented_journal.h
src/crimson/os/seastore/seastore_types.cc
src/crimson/os/seastore/seastore_types.h
src/crimson/os/seastore/transaction_manager.cc
src/test/crimson/seastore/test_btree_lba_manager.cc
src/test/crimson/seastore/test_seastore_journal.cc

index df146d390754469bda74b12d038802eb144e64ac..c64adcd88c376b2b21bb4d156333a03c2becb34e 100644 (file)
@@ -625,75 +625,42 @@ segment_id_t AsyncCleaner::allocate_segment(
   return NULL_SEG_ID;
 }
 
-void AsyncCleaner::update_journal_tail_target(
-  journal_seq_t dirty_replay_from,
-  journal_seq_t alloc_replay_from)
+void AsyncCleaner::update_journal_tails(
+  journal_seq_t dirty_tail,
+  journal_seq_t alloc_tail)
 {
-  LOG_PREFIX(AsyncCleaner::update_journal_tail_target);
+  LOG_PREFIX(AsyncCleaner::update_journal_tails);
   if (disable_trim) return;
-  assert(dirty_replay_from.offset.get_addr_type() != addr_types_t::RANDOM_BLOCK);
-  assert(alloc_replay_from.offset.get_addr_type() != addr_types_t::RANDOM_BLOCK);
-  if (dirty_extents_replay_from == JOURNAL_SEQ_NULL
-      || dirty_replay_from > dirty_extents_replay_from) {
-    DEBUG("dirty_extents_replay_from={} => {}",
-          dirty_extents_replay_from, dirty_replay_from);
-    dirty_extents_replay_from = dirty_replay_from;
-  }
-
-  update_alloc_info_replay_from(alloc_replay_from);
+  assert(dirty_tail.offset.get_addr_type() != addr_types_t::RANDOM_BLOCK);
+  assert(alloc_tail.offset.get_addr_type() != addr_types_t::RANDOM_BLOCK);
 
-  journal_seq_t target = std::min(dirty_replay_from, alloc_replay_from);
-  ceph_assert(target != JOURNAL_SEQ_NULL);
+  ceph_assert(dirty_tail != JOURNAL_SEQ_NULL);
+  ceph_assert(alloc_tail != JOURNAL_SEQ_NULL);
   ceph_assert(journal_head == JOURNAL_SEQ_NULL ||
-              journal_head >= target);
-  if (journal_tail_target == JOURNAL_SEQ_NULL ||
-      target > journal_tail_target) {
-    if (!init_complete ||
-        journal_tail_target.segment_seq == target.segment_seq) {
-      DEBUG("journal_tail_target={} => {}", journal_tail_target, target);
+              (journal_head >= dirty_tail && journal_head >= alloc_tail));
+
+  if (journal_dirty_tail == JOURNAL_SEQ_NULL ||
+      dirty_tail > journal_dirty_tail) {
+    if (journal_dirty_tail.segment_seq == dirty_tail.segment_seq) {
+      DEBUG("journal_dirty_tail {} => {}", journal_dirty_tail, dirty_tail);
     } else {
-      INFO("journal_tail_target={} => {}", journal_tail_target, target);
+      INFO("journal_dirty_tail {} => {}", journal_dirty_tail, dirty_tail);
     }
-    journal_tail_target = target;
+    journal_dirty_tail = dirty_tail;
   }
-  gc_process.maybe_wake_on_space_used();
-  maybe_wake_gc_blocked_io();
-}
 
-void AsyncCleaner::update_alloc_info_replay_from(
-  journal_seq_t alloc_replay_from)
-{
-  LOG_PREFIX(AsyncCleaner::update_alloc_info_replay_from);
-  if (alloc_info_replay_from == JOURNAL_SEQ_NULL
-      || alloc_replay_from > alloc_info_replay_from) {
-    DEBUG("alloc_info_replay_from={} => {}",
-          alloc_info_replay_from, alloc_replay_from);
-    alloc_info_replay_from = alloc_replay_from;
-  }
-}
-
-void AsyncCleaner::update_journal_tail_committed(journal_seq_t committed)
-{
-  LOG_PREFIX(AsyncCleaner::update_journal_tail_committed);
-  assert(committed.offset.get_addr_type() != addr_types_t::RANDOM_BLOCK);
-  if (committed == JOURNAL_SEQ_NULL) {
-    return;
+  if (journal_alloc_tail == JOURNAL_SEQ_NULL ||
+      alloc_tail > journal_alloc_tail) {
+    if (journal_alloc_tail.segment_seq == alloc_tail.segment_seq) {
+      DEBUG("journal_alloc_tail {} => {}", journal_alloc_tail, alloc_tail);
+    } else {
+      INFO("journal_alloc_tail {} => {}", journal_alloc_tail, alloc_tail);
+    }
+    journal_alloc_tail = alloc_tail;
   }
-  ceph_assert(journal_head == JOURNAL_SEQ_NULL ||
-              journal_head >= committed);
 
-  if (journal_tail_committed == JOURNAL_SEQ_NULL ||
-      committed > journal_tail_committed) {
-    DEBUG("update journal_tail_committed={} => {}",
-          journal_tail_committed, committed);
-    journal_tail_committed = committed;
-  }
-  if (journal_tail_target == JOURNAL_SEQ_NULL ||
-      committed > journal_tail_target) {
-    DEBUG("update journal_tail_target={} => {}",
-          journal_tail_target, committed);
-    journal_tail_target = committed;
-  }
+  gc_process.maybe_wake_on_space_used();
+  maybe_wake_gc_blocked_io();
 }
 
 void AsyncCleaner::close_segment(segment_id_t segment)
@@ -819,7 +786,7 @@ AsyncCleaner::gc_cycle_ret AsyncCleaner::GCProcess::run()
 AsyncCleaner::gc_cycle_ret AsyncCleaner::do_gc_cycle()
 {
   if (gc_should_trim_backref()) {
-    return gc_trim_backref(get_backref_tail()
+    return gc_trim_backref(get_alloc_tail_target()
     ).safe_then([](auto) {
       return seastar::now();
     }).handle_error(
@@ -884,7 +851,7 @@ AsyncCleaner::gc_trim_backref(journal_seq_t limit) {
 
 AsyncCleaner::gc_trim_journal_ret AsyncCleaner::gc_trim_journal()
 {
-  return gc_trim_backref(get_dirty_tail()
+  return gc_trim_backref(get_dirty_tail_target()
   ).safe_then([this](auto seq) {
     return repeat_eagain([this, seq=std::move(seq)]() mutable {
       return ecb->with_transaction_intr(
@@ -1086,10 +1053,8 @@ AsyncCleaner::mount_ret AsyncCleaner::mount()
   INFO("{} segment managers", sms.size());
   init_complete = false;
   stats = {};
-  journal_tail_target = JOURNAL_SEQ_NULL;
-  journal_tail_committed = JOURNAL_SEQ_NULL;
-  dirty_extents_replay_from = JOURNAL_SEQ_NULL;
-  alloc_info_replay_from = JOURNAL_SEQ_NULL;
+  journal_alloc_tail = JOURNAL_SEQ_NULL;
+  journal_dirty_tail = JOURNAL_SEQ_NULL;
   
   space_tracker.reset(
     detailed ?
@@ -1184,31 +1149,8 @@ AsyncCleaner::scan_extents_ret AsyncCleaner::scan_no_tail_segment(
       ) mutable -> SegmentManagerGroup::scan_valid_records_ertr::future<>
     {
       LOG_PREFIX(AsyncCleaner::scan_no_tail_segment);
-      if (segment_header.get_type() == segment_type_t::OOL) {
-        DEBUG("out-of-line segment {}, decodeing {} records",
-          segment_id,
-          record_group_header.records);
-      } else {
-        DEBUG("inline segment {}, decodeing {} records",
-          segment_id,
-          record_group_header.records);
-        auto maybe_record_deltas_list = try_decode_deltas(
-          record_group_header, mdbuf, locator.record_block_base);
-        if (!maybe_record_deltas_list) {
-          ERROR("unable to decode deltas for record {} at {}",
-                record_group_header, locator);
-          return crimson::ct_error::input_output_error::make();
-        }
-        for (auto &record_deltas : *maybe_record_deltas_list) {
-          for (auto &[ctime, delta] : record_deltas.deltas) {
-            if (delta.type == extent_types_t::ALLOC_TAIL) {
-              journal_seq_t seq;
-              decode(seq, delta.bl);
-              update_alloc_info_replay_from(seq);
-            }
-          }
-        }
-      }
+      DEBUG("{} {}, decoding {} records",
+            segment_id, segment_header.get_type(), record_group_header.records);
 
       auto maybe_headers = try_decode_record_headers(
           record_group_header, mdbuf);
@@ -1389,7 +1331,7 @@ segment_id_t AsyncCleaner::get_next_reclaim_segment() const
   }
   for (auto& [_id, segment_info] : segments) {
     if (segment_info.is_closed() &&
-        !segment_info.is_in_journal(journal_tail_committed)) {
+        !segment_info.is_in_journal(get_journal_tail())) {
       double benefit_cost = calc_gc_benefit_cost(_id, now_time, bound_time);
       if (benefit_cost > max_benefit_cost) {
         id = _id;
@@ -1431,10 +1373,11 @@ void AsyncCleaner::log_gc_state(const char *caller) const
       "should_block_on_reclaim {}, "
       "gc_should_reclaim_space {}, "
       "journal_head {}, "
-      "journal_tail_target {}, "
-      "journal_tail_commit {}, "
-      "dirty_tail {}, "
-      "dirty_tail_limit {}, "
+      "journal_alloc_tail {}, "
+      "journal_dirty_tail {}, "
+      "alloc_tail_target {}, "
+      "dirty_tail_target {}, "
+      "tail_limit {}, "
       "gc_should_trim_journal {}, ",
       caller,
       segments.get_num_empty(),
@@ -1452,10 +1395,11 @@ void AsyncCleaner::log_gc_state(const char *caller) const
       should_block_on_reclaim(),
       gc_should_reclaim_space(),
       journal_head,
-      journal_tail_target,
-      journal_tail_committed,
-      get_dirty_tail(),
-      get_dirty_tail_limit(),
+      journal_alloc_tail,
+      journal_dirty_tail,
+      get_alloc_tail_target(),
+      get_dirty_tail_target(),
+      get_tail_limit(),
       gc_should_trim_journal()
     );
   }
index 23e939a2dc09764029a5af960b3d8ed60a1ce0e3..ba44faef840ae87ffb10790c99570c52622032d2 100644 (file)
@@ -271,21 +271,24 @@ class SegmentProvider {
 public:
   virtual void set_journal_head(journal_seq_t) = 0;
 
-  virtual journal_seq_t get_journal_tail_target() const = 0;
+  journal_seq_t get_journal_tail() const {
+    return std::min(get_alloc_tail(), get_dirty_tail());
+  }
+
+  virtual journal_seq_t get_dirty_tail() const = 0;
+
+  virtual journal_seq_t get_alloc_tail() const = 0;
+
+  virtual void update_journal_tails(
+      journal_seq_t dirty_tail, journal_seq_t alloc_tail) = 0;
 
   virtual const segment_info_t& get_seg_info(segment_id_t id) const = 0;
 
   virtual segment_id_t allocate_segment(
       segment_seq_t, segment_type_t, data_category_t, reclaim_gen_t) = 0;
 
-  virtual journal_seq_t get_dirty_extents_replay_from() const = 0;
-
-  virtual journal_seq_t get_alloc_info_replay_from() const = 0;
-
   virtual void close_segment(segment_id_t) = 0;
 
-  virtual void update_journal_tail_committed(journal_seq_t tail_committed) = 0;
-
   virtual void update_segment_avail_bytes(segment_type_t, paddr_t) = 0;
 
   virtual void update_modify_time(
@@ -721,17 +724,9 @@ private:
   seastar::metrics::metric_group metrics;
   void register_metrics();
 
-  /// target journal_tail for next fresh segment
-  journal_seq_t journal_tail_target;
+  journal_seq_t journal_alloc_tail;
 
-  /// target replay_from for dirty extents
-  journal_seq_t dirty_extents_replay_from;
-
-  /// target replay_from for alloc infos
-  journal_seq_t alloc_info_replay_from;
-
-  /// most recently committed journal_tail
-  journal_seq_t journal_tail_committed;
+  journal_seq_t journal_dirty_tail;
 
   /// head of journal
   journal_seq_t journal_head;
@@ -770,10 +765,6 @@ public:
   /*
    * SegmentProvider interfaces
    */
-  journal_seq_t get_journal_tail_target() const final {
-    return journal_tail_target;
-  }
-
   const segment_info_t& get_seg_info(segment_id_t id) const final {
     return segments[id];
   }
@@ -791,8 +782,6 @@ public:
 
   void close_segment(segment_id_t segment) final;
 
-  void update_journal_tail_committed(journal_seq_t committed) final;
-
   void update_segment_avail_bytes(segment_type_t type, paddr_t offset) final {
     segments.update_written_to(type, offset);
     if (type == segment_type_t::JOURNAL) {
@@ -812,25 +801,21 @@ public:
     return sm_group.get();
   }
 
-  journal_seq_t get_dirty_extents_replay_from() const final {
-    return dirty_extents_replay_from;
+  journal_seq_t get_dirty_tail() const final {
+    return journal_dirty_tail;
   }
 
-  journal_seq_t get_alloc_info_replay_from() const final {
-    return alloc_info_replay_from;
+  journal_seq_t get_alloc_tail() const final {
+    return journal_alloc_tail;
   }
 
-  void update_journal_tail_target(
-    journal_seq_t dirty_replay_from,
-    journal_seq_t alloc_replay_from);
-
-  void update_alloc_info_replay_from(
-    journal_seq_t alloc_replay_from);
+  void update_journal_tails(
+      journal_seq_t dirty_tail, journal_seq_t alloc_tail) final;
 
   void init_mkfs() {
     ceph_assert(disable_trim || journal_head != JOURNAL_SEQ_NULL);
-    journal_tail_target = journal_head;
-    journal_tail_committed = journal_head;
+    journal_alloc_tail = journal_head;
+    journal_dirty_tail = journal_head;
   }
 
   using release_ertr = SegmentManagerGroup::release_ertr;
@@ -947,7 +932,7 @@ private:
     Transaction &t,
     journal_seq_t limit);
 
-  journal_seq_t get_dirty_tail() const {
+  journal_seq_t get_dirty_tail_target() const {
     auto ret = journal_head;
     ceph_assert(ret != JOURNAL_SEQ_NULL);
     if (ret.segment_seq >= config.target_journal_segments) {
@@ -959,7 +944,7 @@ private:
     return ret;
   }
 
-  journal_seq_t get_dirty_tail_limit() const {
+  journal_seq_t get_tail_limit() const {
     auto ret = journal_head;
     ceph_assert(ret != JOURNAL_SEQ_NULL);
     if (ret.segment_seq >= config.max_journal_segments) {
@@ -971,7 +956,7 @@ private:
     return ret;
   }
 
-  journal_seq_t get_backref_tail() const {
+  journal_seq_t get_alloc_tail_target() const {
     auto ret = journal_head;
     ceph_assert(ret != JOURNAL_SEQ_NULL);
     if (ret.segment_seq >= config.target_backref_inflight_segments) {
@@ -1149,12 +1134,13 @@ private:
     if (!init_complete) {
       return 0;
     }
-    if (journal_tail_committed == JOURNAL_SEQ_NULL) {
+    auto journal_tail = get_journal_tail();
+    if (journal_tail == JOURNAL_SEQ_NULL) {
       return segments.get_num_type_journal();
     }
     assert(journal_head != JOURNAL_SEQ_NULL);
-    assert(journal_head.segment_seq >= journal_tail_committed.segment_seq);
-    return journal_head.segment_seq + 1 - journal_tail_committed.segment_seq;
+    assert(journal_head.segment_seq >= journal_tail.segment_seq);
+    return journal_head.segment_seq + 1 - journal_tail.segment_seq;
   }
   std::size_t get_segments_in_journal_closed() const {
     auto in_journal = get_segments_in_journal();
@@ -1217,26 +1203,26 @@ private:
    */
   std::size_t get_dirty_journal_size() const {
     if (journal_head == JOURNAL_SEQ_NULL ||
-        dirty_extents_replay_from == JOURNAL_SEQ_NULL) {
+        journal_dirty_tail == JOURNAL_SEQ_NULL) {
       return 0;
     }
-    return (journal_head.segment_seq - dirty_extents_replay_from.segment_seq) *
+    return (journal_head.segment_seq - journal_dirty_tail.segment_seq) *
            segments.get_segment_size() +
            journal_head.offset.as_seg_paddr().get_segment_off() -
            segments.get_segment_size() -
-           dirty_extents_replay_from.offset.as_seg_paddr().get_segment_off();
+           journal_dirty_tail.offset.as_seg_paddr().get_segment_off();
   }
 
   std::size_t get_alloc_journal_size() const {
     if (journal_head == JOURNAL_SEQ_NULL ||
-        alloc_info_replay_from == JOURNAL_SEQ_NULL) {
+        journal_alloc_tail == JOURNAL_SEQ_NULL) {
       return 0;
     }
-    return (journal_head.segment_seq - alloc_info_replay_from.segment_seq) *
+    return (journal_head.segment_seq - journal_alloc_tail.segment_seq) *
            segments.get_segment_size() +
            journal_head.offset.as_seg_paddr().get_segment_off() -
            segments.get_segment_size() -
-           alloc_info_replay_from.offset.as_seg_paddr().get_segment_off();
+           journal_alloc_tail.offset.as_seg_paddr().get_segment_off();
   }
 
   /**
@@ -1246,7 +1232,7 @@ private:
    */
   bool should_block_on_trim() const {
     if (disable_trim) return false;
-    return get_dirty_tail_limit() > journal_tail_target;
+    return get_tail_limit() > get_journal_tail();
   }
 
   bool should_block_on_reclaim() const {
@@ -1311,11 +1297,11 @@ private:
    * Encapsulates logic for whether gc should be reclaiming segment space.
    */
   bool gc_should_trim_journal() const {
-    return get_dirty_tail() > journal_tail_target;
+    return get_dirty_tail_target() > journal_dirty_tail;
   }
 
   bool gc_should_trim_backref() const {
-    return get_backref_tail() > alloc_info_replay_from;
+    return get_alloc_tail_target() > journal_alloc_tail;
   }
   /**
    * gc_should_run
index 12282a3039619f473b3665090ad49b5d21c0fa17..be7bd75cd0c76348c1bc6c68c7e18a8e0691de1a 100644 (file)
@@ -150,7 +150,7 @@ void Cache::register_metrics()
     {extent_types_t::OBJECT_DATA_BLOCK,   sm::label_instance("ext", "OBJECT_DATA_BLOCK")},
     {extent_types_t::RETIRED_PLACEHOLDER, sm::label_instance("ext", "RETIRED_PLACEHOLDER")},
     {extent_types_t::ALLOC_INFO,         sm::label_instance("ext", "ALLOC_INFO")},
-    {extent_types_t::ALLOC_TAIL,          sm::label_instance("ext", "ALLOC_TAIL")},
+    {extent_types_t::JOURNAL_TAIL,        sm::label_instance("ext", "JOURNAL_TAIL")},
     {extent_types_t::TEST_BLOCK,          sm::label_instance("ext", "TEST_BLOCK")},
     {extent_types_t::TEST_BLOCK_PHYSICAL, sm::label_instance("ext", "TEST_BLOCK_PHYSICAL")},
     {extent_types_t::BACKREF_INTERNAL,    sm::label_instance("ext", "BACKREF_INTERNAL")},
@@ -1045,7 +1045,7 @@ record_t Cache::prepare_record(
   t.read_set.clear();
   t.write_set.clear();
 
-  record_t record;
+  record_t record(trans_src);
   auto commit_time = seastar::lowres_system_clock::now();
 
   // Add new copy of mutated blocks, set_io_wait to block until written
@@ -1248,10 +1248,15 @@ record_t Cache::prepare_record(
   }
 
   if (is_cleaner_transaction(trans_src)) {
+    assert(cleaner != nullptr);
+    auto tails = journal_tail_delta_t{
+      get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL),
+      get_oldest_dirty_from().value_or(JOURNAL_SEQ_NULL)
+    };
     bufferlist bl;
-    encode(get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL), bl);
+    encode(tails, bl);
     delta_info_t delta;
-    delta.type = extent_types_t::ALLOC_TAIL;
+    delta.type = extent_types_t::JOURNAL_TAIL;
     delta.bl = bl;
     record.push_back(std::move(delta));
   }
@@ -1593,11 +1598,11 @@ Cache::replay_delta(
   journal_seq_t journal_seq,
   paddr_t record_base,
   const delta_info_t &delta,
-  const journal_seq_t &alloc_replay_from,
+  const journal_seq_t &alloc_tail,
   sea_time_point &modify_time)
 {
   LOG_PREFIX(Cache::replay_delta);
-  assert(alloc_replay_from != JOURNAL_SEQ_NULL);
+  assert(alloc_tail != JOURNAL_SEQ_NULL);
   ceph_assert(modify_time != NULL_TIME);
   if (delta.type == extent_types_t::ROOT) {
     TRACE("replay root delta at {} {}, remove extent ... -- {}, prv_root={}",
@@ -1612,9 +1617,9 @@ Cache::replay_delta(
     add_extent(root);
     return replay_delta_ertr::now();
   } else if (delta.type == extent_types_t::ALLOC_INFO) {
-    if (journal_seq < alloc_replay_from) {
-      DEBUG("journal_seq {} < alloc_replay_from {}, don't replay {}",
-       journal_seq, alloc_replay_from, delta);
+    if (journal_seq < alloc_tail) {
+      DEBUG("journal_seq {} < alloc_tail {}, don't replay {}",
+       journal_seq, alloc_tail, delta);
       return replay_delta_ertr::now();
     }
     alloc_delta_t alloc_delta;
@@ -1638,7 +1643,7 @@ Cache::replay_delta(
     if (!backref_list.empty())
       backref_batch_update(std::move(backref_list), journal_seq);
     return replay_delta_ertr::now();
-  } else if (delta.type == extent_types_t::ALLOC_TAIL) {
+  } else if (delta.type == extent_types_t::JOURNAL_TAIL) {
     // this delta should have been dealt with during segment cleaner mounting
     return replay_delta_ertr::now();
   } else {
index 48017f8e39c7f571a4e3954b76566ee1dc7be754..78c707bb713e323796438f81535bc0109e6e0fb1 100644 (file)
@@ -81,7 +81,7 @@ SegmentedOolWriter::do_write(
       return do_write(t, extents);
     });
   }
-  record_t record;
+  record_t record(TRANSACTION_TYPE_NULL);
   std::list<LogicalCachedExtentRef> pending_extents;
   auto commit_time = seastar::lowres_system_clock::now();
 
index 6ad2ab1a53eef80e830d219d5fb90b7ecb48c82a..1d11155f94858b17bb2117e1ca5733bd42e5da85 100644 (file)
@@ -54,34 +54,34 @@ SegmentAllocator::do_open(bool is_mkfs)
   ).safe_then([this, is_mkfs, FNAME, new_segment_seq](auto sref) {
     // initialize new segment
     segment_id_t segment_id = sref->get_segment_id();
-    journal_seq_t new_journal_tail;
-    journal_seq_t new_alloc_replay_from;
+    journal_seq_t dirty_tail;
+    journal_seq_t alloc_tail;
     if (type == segment_type_t::JOURNAL) {
-      new_journal_tail = segment_provider.get_journal_tail_target();
-      new_alloc_replay_from = segment_provider.get_alloc_info_replay_from();
+      dirty_tail = segment_provider.get_dirty_tail();
+      alloc_tail = segment_provider.get_alloc_tail();
       if (is_mkfs) {
-        ceph_assert(new_journal_tail == JOURNAL_SEQ_NULL);
-        ceph_assert(new_alloc_replay_from == JOURNAL_SEQ_NULL);
+        ceph_assert(dirty_tail == JOURNAL_SEQ_NULL);
+        ceph_assert(alloc_tail == JOURNAL_SEQ_NULL);
         auto mkfs_seq = journal_seq_t{
           new_segment_seq,
           paddr_t::make_seg_paddr(segment_id, 0)
         };
-        new_journal_tail = mkfs_seq;
-        new_alloc_replay_from = mkfs_seq;
+        dirty_tail = mkfs_seq;
+        alloc_tail = mkfs_seq;
       } else {
-        ceph_assert(new_journal_tail != JOURNAL_SEQ_NULL);
-        ceph_assert(new_alloc_replay_from != JOURNAL_SEQ_NULL);
+        ceph_assert(dirty_tail != JOURNAL_SEQ_NULL);
+        ceph_assert(alloc_tail != JOURNAL_SEQ_NULL);
       }
     } else { // OOL
       ceph_assert(!is_mkfs);
-      new_journal_tail = NO_DELTAS;
-      new_alloc_replay_from = NO_DELTAS;
+      dirty_tail = NO_DELTAS;
+      alloc_tail = NO_DELTAS;
     }
     auto header = segment_header_t{
       new_segment_seq,
       segment_id,
-      new_journal_tail,
-      new_alloc_replay_from,
+      dirty_tail,
+      alloc_tail,
       current_segment_nonce,
       type,
       category,
@@ -116,13 +116,9 @@ SegmentAllocator::do_open(bool is_mkfs)
     ).safe_then([this,
                  FNAME,
                  new_journal_seq,
-                 new_journal_tail,
                  sref=std::move(sref)]() mutable {
       ceph_assert(!current_segment);
       current_segment = std::move(sref);
-      if (type == segment_type_t::JOURNAL) {
-        segment_provider.update_journal_tail_committed(new_journal_tail);
-      }
       DEBUG("{} rolled new segment id={}",
             print_name, current_segment->get_segment_id());
       ceph_assert(new_journal_seq.segment_seq ==
index 9fc3ec7bdcbb106c1e6f0947afd47d334efddfdd..24ada40f73664ecd3b519e4b009060fa47679de6 100644 (file)
@@ -100,43 +100,126 @@ SegmentedJournal::prep_replay_segments(
     }
   });
 
-  auto journal_tail = segments.rbegin()->second.journal_tail;
-  segment_provider.update_journal_tail_committed(journal_tail);
-  auto journal_tail_paddr = journal_tail.offset;
-  ceph_assert(journal_tail != JOURNAL_SEQ_NULL);
-  ceph_assert(journal_tail_paddr != P_ADDR_NULL);
-  auto from = std::find_if(
-    segments.begin(),
-    segments.end(),
-    [&journal_tail_paddr](const auto &seg) -> bool {
-      auto& seg_addr = journal_tail_paddr.as_seg_paddr();
-      return seg.first == seg_addr.get_segment_id();
-    });
-  if (from->second.segment_seq != journal_tail.segment_seq) {
-    ERROR("journal_tail {} does not match {}",
-          journal_tail, from->second);
-    ceph_abort();
-  }
+  auto last_segment_id = segments.rbegin()->first;
+  auto last_header = segments.rbegin()->second;
+  return scan_last_segment(last_segment_id, last_header
+  ).safe_then([this, FNAME, segments=std::move(segments)] {
+    auto journal_tail = segment_provider.get_journal_tail();
+    auto journal_tail_paddr = journal_tail.offset;
+    ceph_assert(journal_tail != JOURNAL_SEQ_NULL);
+    ceph_assert(journal_tail_paddr != P_ADDR_NULL);
+    auto from = std::find_if(
+      segments.begin(),
+      segments.end(),
+      [&journal_tail_paddr](const auto &seg) -> bool {
+        auto& seg_addr = journal_tail_paddr.as_seg_paddr();
+        return seg.first == seg_addr.get_segment_id();
+      });
+    if (from->second.segment_seq != journal_tail.segment_seq) {
+      ERROR("journal_tail {} does not match {}",
+            journal_tail, from->second);
+      ceph_abort();
+    }
 
-  auto num_segments = segments.end() - from;
-  INFO("{} segments to replay from {}",
-       num_segments, journal_tail);
-  auto ret = replay_segments_t(num_segments);
-  std::transform(
-    from, segments.end(), ret.begin(),
-    [this](const auto &p) {
-      auto ret = journal_seq_t{
-       p.second.segment_seq,
-       paddr_t::make_seg_paddr(
-         p.first,
-         sm_group.get_block_size())
-      };
-      return std::make_pair(ret, p.second);
-    });
-  ret[0].first.offset = journal_tail_paddr;
-  return prep_replay_segments_fut(
-    prep_replay_segments_ertr::ready_future_marker{},
-    std::move(ret));
+    auto num_segments = segments.end() - from;
+    INFO("{} segments to replay from {}",
+         num_segments, journal_tail);
+    auto ret = replay_segments_t(num_segments);
+    std::transform(
+      from, segments.end(), ret.begin(),
+      [this](const auto &p) {
+        auto ret = journal_seq_t{
+          p.second.segment_seq,
+          paddr_t::make_seg_paddr(
+            p.first,
+            sm_group.get_block_size())
+        };
+        return std::make_pair(ret, p.second);
+      });
+    ret[0].first.offset = journal_tail_paddr;
+    return prep_replay_segments_fut(
+      replay_ertr::ready_future_marker{},
+      std::move(ret));
+  });
+}
+
+SegmentedJournal::scan_last_segment_ertr::future<>
+SegmentedJournal::scan_last_segment(
+  const segment_id_t &segment_id,
+  const segment_header_t &segment_header)
+{
+  LOG_PREFIX(SegmentedJournal::scan_last_segment);
+  assert(segment_id == segment_header.physical_segment_id);
+  segment_provider.update_journal_tails(
+      segment_header.dirty_tail, segment_header.alloc_tail);
+  auto seq = journal_seq_t{
+    segment_header.segment_seq,
+    paddr_t::make_seg_paddr(segment_id, 0)
+  };
+  INFO("scanning {} for journal tail deltas", seq);
+  return seastar::do_with(
+    scan_valid_records_cursor(seq),
+    SegmentManagerGroup::found_record_handler_t(
+      [FNAME, this](
+        record_locator_t locator,
+        const record_group_header_t& record_group_header,
+        const bufferlist& mdbuf
+      ) -> SegmentManagerGroup::scan_valid_records_ertr::future<>
+    {
+      DEBUG("decoding {} at {}", record_group_header, locator);
+      bool has_tail_delta = false;
+      auto maybe_headers = try_decode_record_headers(
+          record_group_header, mdbuf);
+      if (!maybe_headers) {
+        // This should be impossible, we did check the crc on the mdbuf
+        ERROR("unable to decode headers from {} at {}",
+              record_group_header, locator);
+        ceph_abort();
+      }
+      for (auto &record_header : *maybe_headers) {
+        ceph_assert(is_valid_transaction(record_header.type));
+        if (is_cleaner_transaction(record_header.type)) {
+          has_tail_delta = true;
+        }
+      }
+      if (has_tail_delta) {
+        bool found_delta = false;
+        auto maybe_record_deltas_list = try_decode_deltas(
+          record_group_header, mdbuf, locator.record_block_base);
+        if (!maybe_record_deltas_list) {
+          ERROR("unable to decode deltas from {} at {}",
+                record_group_header, locator);
+          ceph_abort();
+        }
+        for (auto &record_deltas : *maybe_record_deltas_list) {
+          for (auto &[ctime, delta] : record_deltas.deltas) {
+            if (delta.type == extent_types_t::JOURNAL_TAIL) {
+              found_delta = true;
+              journal_tail_delta_t tail_delta;
+              decode(tail_delta, delta.bl);
+              if (tail_delta.alloc_tail == JOURNAL_SEQ_NULL) {
+                tail_delta.alloc_tail = locator.write_result.start_seq;
+              }
+              if (tail_delta.dirty_tail == JOURNAL_SEQ_NULL) {
+                tail_delta.dirty_tail = locator.write_result.start_seq;
+              }
+              segment_provider.update_journal_tails(
+                  tail_delta.dirty_tail, tail_delta.alloc_tail);
+            }
+          }
+        }
+        ceph_assert(found_delta);
+      }
+      return seastar::now();
+    }),
+    [this, nonce=segment_header.segment_nonce](auto &cursor, auto &handler)
+  {
+    return sm_group.scan_valid_records(
+      cursor,
+      nonce,
+      std::numeric_limits<std::size_t>::max(),
+      handler).discard_result();
+  });
 }
 
 SegmentedJournal::replay_ertr::future<>
@@ -226,7 +309,7 @@ SegmentedJournal::replay_segment(
            return handler(
              locator,
              delta,
-             segment_provider.get_alloc_info_replay_from(),
+             segment_provider.get_alloc_tail(),
               modify_time);
           });
         });
index 8c6485a1302f755fa51ad94dcf3ebcf0813668f4..3cf935382b612387b64faf02d72493a897a09063 100644 (file)
@@ -65,14 +65,16 @@ private:
   /// return ordered vector of segments to replay
   using replay_segments_t = std::vector<
     std::pair<journal_seq_t, segment_header_t>>;
-  using prep_replay_segments_ertr = crimson::errorator<
-    crimson::ct_error::input_output_error
-    >;
-  using prep_replay_segments_fut = prep_replay_segments_ertr::future<
+  using prep_replay_segments_fut = replay_ertr::future<
     replay_segments_t>;
   prep_replay_segments_fut prep_replay_segments(
     std::vector<std::pair<segment_id_t, segment_header_t>> segments);
 
+  /// scan the last segment for tail deltas
+  using scan_last_segment_ertr = replay_ertr;
+  scan_last_segment_ertr::future<> scan_last_segment(
+      const segment_id_t&, const segment_header_t&);
+
   /// replays records starting at start through end of segment
   replay_ertr::future<>
   replay_segment(
index 6ef15f6fba6a4adf5f4bcf6e6e11e3f64efc42e1..a6191f7dfbc561d4c94140b593907924d890d260 100644 (file)
@@ -269,7 +269,8 @@ std::ostream &operator<<(std::ostream &out, const segment_header_t &header)
   return out << "segment_header_t("
             << "segment_seq=" << segment_seq_printer_t{header.segment_seq}
             << ", segment_id=" << header.physical_segment_id
-            << ", journal_tail=" << header.journal_tail
+            << ", dirty_tail=" << header.dirty_tail
+            << ", alloc_tail=" << header.alloc_tail
             << ", segment_nonce=" << header.segment_nonce
             << ", type=" << header.type
             << ", category=" << header.category
@@ -341,7 +342,8 @@ std::ostream &operator<<(std::ostream& out, const record_size_t& rsize)
 std::ostream &operator<<(std::ostream& out, const record_t& r)
 {
   return out << "record_t("
-             << "num_extents=" << r.extents.size()
+             << "type=" << r.type
+             << ", num_extents=" << r.extents.size()
              << ", num_deltas=" << r.deltas.size()
              << ", modify_time=" << sea_time_point_printer_t{r.modify_time}
              << ")";
@@ -350,7 +352,8 @@ std::ostream &operator<<(std::ostream& out, const record_t& r)
 std::ostream &operator<<(std::ostream& out, const record_header_t& r)
 {
   return out << "record_header_t("
-             << "num_extents=" << r.extents
+             << "type=" << r.type
+             << ", num_extents=" << r.extents
              << ", num_deltas=" << r.deltas
              << ", modify_time=" << mod_time_point_printer_t{r.modify_time}
              << ")";
@@ -450,6 +453,7 @@ ceph::bufferlist encode_records(
 
   for (auto& r: record_group.records) {
     record_header_t rheader{
+      r.type,
       (extent_len_t)r.deltas.size(),
       (extent_len_t)r.extents.size(),
       timepoint_to_mod(r.modify_time)
index 9e5dbc0543e844e7169c7ed26bd08d2e633db80f..c76c85172c61de5c2d07d0fb0f5c944d2e13a4ab 100644 (file)
@@ -931,7 +931,7 @@ enum class extent_types_t : uint8_t {
   // the following two types are not extent types,
   // they are just used to indicates paddr allocation deltas
   ALLOC_INFO = 9,
-  ALLOC_TAIL = 10,
+  JOURNAL_TAIL = 10,
   // Test Block Types
   TEST_BLOCK = 11,
   TEST_BLOCK_PHYSICAL = 12,
@@ -1107,6 +1107,19 @@ struct delta_info_t {
 
 std::ostream &operator<<(std::ostream &out, const delta_info_t &delta);
 
+/* contains the latest journal tail information */
+struct journal_tail_delta_t {
+  journal_seq_t alloc_tail;
+  journal_seq_t dirty_tail;
+
+  DENC(journal_tail_delta_t, v, p) {
+    DENC_START(1, 1, p);
+    denc(v.alloc_tail, p);
+    denc(v.dirty_tail, p);
+    DENC_FINISH(p);
+  }
+};
+
 class object_data_t {
   laddr_t reserved_data_base = L_ADDR_NULL;
   extent_len_t reserved_data_len = 0;
@@ -1475,14 +1488,16 @@ using segment_nonce_t = uint32_t;
  * Every segment contains and encode segment_header_t in the first block.
  * Our strategy for finding the journal replay point is:
  * 1) Find the segment with the highest journal_segment_seq
- * 2) Replay starting at record located at that segment's journal_tail
+ * 2) Get dirty_tail and alloc_tail from the segment header
+ * 3) Scan forward to update tails from journal_tail_delta_t
+ * 4) Replay from the latest tails
  */
 struct segment_header_t {
   segment_seq_t segment_seq;
   segment_id_t physical_segment_id; // debugging
 
-  journal_seq_t journal_tail;
-  journal_seq_t alloc_replay_from;
+  journal_seq_t dirty_tail;
+  journal_seq_t alloc_tail;
   segment_nonce_t segment_nonce;
 
   segment_type_t type;
@@ -1498,8 +1513,8 @@ struct segment_header_t {
     DENC_START(1, 1, p);
     denc(v.segment_seq, p);
     denc(v.physical_segment_id, p);
-    denc(v.journal_tail, p);
-    denc(v.alloc_replay_from, p);
+    denc(v.dirty_tail, p);
+    denc(v.alloc_tail, p);
     denc(v.segment_nonce, p);
     denc(v.type, p);
     denc(v.category, p);
@@ -1585,12 +1600,18 @@ WRITE_EQ_OPERATORS_2(record_size_t, plain_mdlength, dlength);
 std::ostream &operator<<(std::ostream&, const record_size_t&);
 
 struct record_t {
+  transaction_type_t type = TRANSACTION_TYPE_NULL;
   std::vector<extent_t> extents;
   std::vector<delta_info_t> deltas;
   record_size_t size;
   sea_time_point modify_time = NULL_TIME;
 
-  record_t() = default;
+  record_t(transaction_type_t type) : type{type} { }
+
+  // unit test only
+  record_t() {
+    type = transaction_type_t::MUTATE;
+  }
 
   // unit test only
   record_t(std::vector<extent_t>&& _extents,
@@ -1602,6 +1623,7 @@ struct record_t {
     for (auto& d: _deltas) {
       push_back(std::move(d));
     }
+    type = transaction_type_t::MUTATE;
   }
 
   bool is_empty() const {
@@ -1639,12 +1661,14 @@ struct record_t {
 std::ostream &operator<<(std::ostream&, const record_t&);
 
 struct record_header_t {
+  transaction_type_t type;
   uint32_t deltas;              // number of deltas
   uint32_t extents;             // number of extents
   mod_time_point_t modify_time;
 
   DENC(record_header_t, v, p) {
     DENC_START(1, 1, p);
+    denc(v.type, p);
     denc(v.deltas, p);
     denc(v.extents, p);
     denc(v.modify_time, p);
@@ -1945,6 +1969,7 @@ WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_id_t)
 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::paddr_t)
 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::journal_seq_t)
 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::delta_info_t)
+WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::journal_tail_delta_t)
 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::record_header_t)
 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::record_group_header_t)
 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::extent_info_t)
index f9a082634e1fb1e87f1e20912e25adbc04c69818..4a70b501570c5f058c40fe7f88271d4c75f1eb46 100644 (file)
@@ -92,18 +92,18 @@ TransactionManager::mount_ertr::future<> TransactionManager::mount()
       [this](
        const auto &offsets,
        const auto &e,
-       const journal_seq_t alloc_replay_from,
+       const journal_seq_t alloc_tail,
        auto modify_time)
       {
        auto start_seq = offsets.write_result.start_seq;
-       async_cleaner->update_journal_tail_target(
+       async_cleaner->update_journal_tails(
          cache->get_oldest_dirty_from().value_or(start_seq),
          cache->get_oldest_backref_dirty_from().value_or(start_seq));
        return cache->replay_delta(
          start_seq,
          offsets.record_block_base,
          e,
-         alloc_replay_from,
+         alloc_tail,
          modify_time);
       });
   }).safe_then([this] {
@@ -416,7 +416,7 @@ TransactionManager::submit_transaction_direct(
       lba_manager->complete_transaction(tref, lba_to_clear, lba_to_link);
       backref_manager->complete_transaction(tref, backref_to_clear, backref_to_link);
 
-      async_cleaner->update_journal_tail_target(
+      async_cleaner->update_journal_tails(
        cache->get_oldest_dirty_from().value_or(start_seq),
        cache->get_oldest_backref_dirty_from().value_or(start_seq));
       return async_cleaner->maybe_release_segment(tref);
index c782531680a0ad0ccce4b1bbe1ee1b27fef9bb52..1bf7c5050f4b16b802bc7c712247287fc6194e35 100644 (file)
@@ -53,7 +53,11 @@ struct btree_test_base :
    */
   void set_journal_head(journal_seq_t) final {}
 
-  journal_seq_t get_journal_tail_target() const final { return dummy_tail; }
+  journal_seq_t get_dirty_tail() const final { return dummy_tail; }
+
+  journal_seq_t get_alloc_tail() const final { return dummy_tail; }
+
+  void update_journal_tails(journal_seq_t, journal_seq_t) final {}
 
   const segment_info_t& get_seg_info(segment_id_t id) const final {
     tmp_info = {};
@@ -79,22 +83,12 @@ struct btree_test_base :
 
   void close_segment(segment_id_t) final {}
 
-  void update_journal_tail_committed(journal_seq_t committed) final {}
-
   void update_segment_avail_bytes(segment_type_t, paddr_t) final {}
 
   void update_modify_time(segment_id_t, sea_time_point, std::size_t) final {}
 
   SegmentManagerGroup* get_segment_manager_group() final { return sms.get(); }
 
-  journal_seq_t get_dirty_extents_replay_from() const final {
-    return dummy_tail;
-  }
-
-  journal_seq_t get_alloc_info_replay_from() const final {
-    return dummy_tail;
-  }
-
   virtual void complete_commit(Transaction &t) {}
   seastar::future<> submit_transaction(TransactionRef t)
   {
index 63a61f8a6f96c219816ca66747fe76e743af4849..3b116482ca188f77db2a5310c0096f7a4c87ae5c 100644 (file)
@@ -94,7 +94,11 @@ struct journal_test_t : seastar_test_suite_t, SegmentProvider {
    */
   void set_journal_head(journal_seq_t) final {}
 
-  journal_seq_t get_journal_tail_target() const final { return dummy_tail; }
+  journal_seq_t get_dirty_tail() const final { return dummy_tail; }
+
+  journal_seq_t get_alloc_tail() const final { return dummy_tail; }
+
+  void update_journal_tails(journal_seq_t, journal_seq_t) final {}
 
   const segment_info_t& get_seg_info(segment_id_t id) const final {
     tmp_info = {};
@@ -103,14 +107,6 @@ struct journal_test_t : seastar_test_suite_t, SegmentProvider {
     return tmp_info;
   }
 
-  journal_seq_t get_dirty_extents_replay_from() const final {
-    return dummy_tail;
-  }
-
-  journal_seq_t get_alloc_info_replay_from() const final {
-    return dummy_tail;
-  }
-
   segment_id_t allocate_segment(
     segment_seq_t seq,
     segment_type_t type,
@@ -128,8 +124,6 @@ struct journal_test_t : seastar_test_suite_t, SegmentProvider {
 
   void close_segment(segment_id_t) final {}
 
-  void update_journal_tail_committed(journal_seq_t paddr) final {}
-
   void update_segment_avail_bytes(segment_type_t, paddr_t) final {}
 
   void update_modify_time(segment_id_t, sea_time_point, std::size_t) final {}