git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
crimson/os/seastore: define the usage of generation
author Yingxin Cheng <yingxin.cheng@intel.com>
Mon, 7 Nov 2022 08:42:58 +0000 (16:42 +0800)
committer Yingxin Cheng <yingxin.cheng@intel.com>
Mon, 7 Nov 2022 11:06:02 +0000 (19:06 +0800)
Unify the definition of the reclaim generation, clean up and explain its
intentions and usages. Please refer to the comments in seastore_types.h.

Signed-off-by: Yingxin Cheng <yingxin.cheng@intel.com>
13 files changed:
src/crimson/os/seastore/async_cleaner.cc
src/crimson/os/seastore/async_cleaner.h
src/crimson/os/seastore/btree/fixed_kv_btree.h
src/crimson/os/seastore/btree/fixed_kv_node.h
src/crimson/os/seastore/cache.cc
src/crimson/os/seastore/cached_extent.h
src/crimson/os/seastore/extent_placement_manager.cc
src/crimson/os/seastore/extent_placement_manager.h
src/crimson/os/seastore/journal/segmented_journal.cc
src/crimson/os/seastore/seastore_types.cc
src/crimson/os/seastore/seastore_types.h
src/crimson/os/seastore/transaction_manager.cc
src/crimson/os/seastore/transaction_manager.h

index 322f54ee3d9c8ae7d24c06e68b1f6abfb6535538..819f69e5e2be330c13958f347a5a82ef67b74745 100644 (file)
@@ -31,7 +31,7 @@ void segment_info_t::set_open(
   ceph_assert(_seq != NULL_SEG_SEQ);
   ceph_assert(_type != segment_type_t::NULL_SEG);
   ceph_assert(_category != data_category_t::NUM);
-  ceph_assert(_generation < RECLAIM_GENERATIONS);
+  ceph_assert(is_reclaim_generation(_generation));
   state = Segment::segment_state_t::OPEN;
   seq = _seq;
   type = _type;
@@ -66,7 +66,7 @@ void segment_info_t::init_closed(
   ceph_assert(_seq != NULL_SEG_SEQ);
   ceph_assert(_type != segment_type_t::NULL_SEG);
   ceph_assert(_category != data_category_t::NUM);
-  ceph_assert(_generation < RECLAIM_GENERATIONS);
+  ceph_assert(is_reclaim_generation(_generation));
   state = Segment::segment_state_t::CLOSED;
   seq = _seq;
   type = _type;
@@ -612,7 +612,7 @@ JournalTrimmerImpl::trim_dirty()
             dirty_list,
             [this, &t](auto &e) {
             return extent_callback->rewrite_extent(
-                t, e, DIRTY_GENERATION, NULL_TIME);
+                t, e, INIT_GENERATION, NULL_TIME);
           });
         });
       }).si_then([this, &t] {
index ded2a3da8ac1e7ff1c26f1d2bc6223c5f723f182..eb09464e6e41ab067e2779cc39ccc84870d8d08d 100644 (file)
@@ -1085,10 +1085,20 @@ private:
         segment_id_t segment_id,
         reclaim_gen_t generation,
         segment_off_t segment_size) {
-      ceph_assert(generation < RECLAIM_GENERATIONS);
+      ceph_assert(is_reclaim_generation(generation));
+
+      reclaim_gen_t target_gen;
+      if (generation < MIN_REWRITE_GENERATION) {
+        target_gen = MIN_REWRITE_GENERATION;
+      } else {
+        // tolerate the target_gen to exceed MAX_REWRITE_GENERATION to make EPM
+        // aware of its original generation for the decisions.
+        target_gen = generation + 1;
+      }
+
+      assert(is_target_reclaim_generation(target_gen));
       return {generation,
-              (reclaim_gen_t)(generation == RECLAIM_GENERATIONS - 1 ?
-                              generation : generation + 1),
+              target_gen,
               segment_size,
               P_ADDR_NULL,
               paddr_t::make_seg_paddr(segment_id, 0)};
index 1ea09d0e5987356bec2e976ad6368e14ede89811..ccf90caf06be77caa40acb3607288ae3ebf16a31 100644 (file)
@@ -315,7 +315,7 @@ public:
       c.trans,
       node_size,
       placement_hint_t::HOT,
-      0);
+      INIT_GENERATION);
     root_leaf->set_size(0);
     fixed_kv_node_meta_t<node_key_t> meta{min_max_t<node_key_t>::min, min_max_t<node_key_t>::max, 1};
     root_leaf->set_meta(meta);
@@ -818,6 +818,7 @@ public:
         c.trans,
         fixed_kv_extent.get_length(),
         fixed_kv_extent.get_user_hint(),
+        // get target reclaim generation
         fixed_kv_extent.get_reclaim_generation());
       fixed_kv_extent.get_bptr().copy_out(
         0,
@@ -1406,7 +1407,7 @@ private:
 
     if (split_from == iter.get_depth()) {
       auto nroot = c.cache.template alloc_new_extent<internal_node_t>(
-        c.trans, node_size, placement_hint_t::HOT, 0);
+        c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
       fixed_kv_node_meta_t<node_key_t> meta{
         min_max_t<node_key_t>::min, min_max_t<node_key_t>::max, iter.get_depth() + 1};
       nroot->set_meta(meta);
index f193509f50e5419608675c15e8bace142c4a10a3..1aed9fb200c9751d0754729f10759c673cff194d 100644 (file)
@@ -154,9 +154,9 @@ struct FixedKVInternalNode
   std::tuple<Ref, Ref, NODE_KEY>
   make_split_children(op_context_t<NODE_KEY> c) {
     auto left = c.cache.template alloc_new_extent<node_type_t>(
-      c.trans, node_size, placement_hint_t::HOT, 0);
+      c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
     auto right = c.cache.template alloc_new_extent<node_type_t>(
-      c.trans, node_size, placement_hint_t::HOT, 0);
+      c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
     auto pivot = this->split_into(*left, *right);
     left->pin.set_range(left->get_meta());
     right->pin.set_range(right->get_meta());
@@ -170,7 +170,7 @@ struct FixedKVInternalNode
     op_context_t<NODE_KEY> c,
     Ref &right) {
     auto replacement = c.cache.template alloc_new_extent<node_type_t>(
-      c.trans, node_size, placement_hint_t::HOT, 0);
+      c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
     replacement->merge_from(*this, *right->template cast<node_type_t>());
     replacement->pin.set_range(replacement->get_meta());
     return replacement;
@@ -184,9 +184,9 @@ struct FixedKVInternalNode
     ceph_assert(_right->get_type() == this->get_type());
     auto &right = *_right->template cast<node_type_t>();
     auto replacement_left = c.cache.template alloc_new_extent<node_type_t>(
-      c.trans, node_size, placement_hint_t::HOT, 0);
+      c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
     auto replacement_right = c.cache.template alloc_new_extent<node_type_t>(
-      c.trans, node_size, placement_hint_t::HOT, 0);
+      c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
 
     auto pivot = this->balance_into_new_nodes(
       *this,
@@ -355,9 +355,9 @@ struct FixedKVLeafNode
   std::tuple<Ref, Ref, NODE_KEY>
   make_split_children(op_context_t<NODE_KEY> c) {
     auto left = c.cache.template alloc_new_extent<node_type_t>(
-      c.trans, node_size, placement_hint_t::HOT, 0);
+      c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
     auto right = c.cache.template alloc_new_extent<node_type_t>(
-      c.trans, node_size, placement_hint_t::HOT, 0);
+      c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
     auto pivot = this->split_into(*left, *right);
     left->pin.set_range(left->get_meta());
     right->pin.set_range(right->get_meta());
@@ -371,7 +371,7 @@ struct FixedKVLeafNode
     op_context_t<NODE_KEY> c,
     Ref &right) {
     auto replacement = c.cache.template alloc_new_extent<node_type_t>(
-      c.trans, node_size, placement_hint_t::HOT, 0);
+      c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
     replacement->merge_from(*this, *right->template cast<node_type_t>());
     replacement->pin.set_range(replacement->get_meta());
     return replacement;
@@ -385,9 +385,9 @@ struct FixedKVLeafNode
     ceph_assert(_right->get_type() == this->get_type());
     auto &right = *_right->template cast<node_type_t>();
     auto replacement_left = c.cache.template alloc_new_extent<node_type_t>(
-      c.trans, node_size, placement_hint_t::HOT, 0);
+      c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
     auto replacement_right = c.cache.template alloc_new_extent<node_type_t>(
-      c.trans, node_size, placement_hint_t::HOT, 0);
+      c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
 
     auto pivot = this->balance_into_new_nodes(
       *this,
index 80d09889b1b2e8ba98b0e351826162720b5095e7..1fe91306ba52d271402482d247769c5a33cad60a 100644 (file)
@@ -700,6 +700,8 @@ void Cache::add_extent(
     const Transaction::src_t* p_src=nullptr)
 {
   assert(ref->is_valid());
+  assert(ref->user_hint == PLACEMENT_HINT_NULL);
+  assert(ref->reclaim_generation == NULL_GENERATION);
   extents.insert(*ref);
   if (ref->is_dirty()) {
     add_to_dirty(ref);
@@ -1012,7 +1014,6 @@ CachedExtentRef Cache::duplicate_for_write(
 
   ret->version++;
   ret->state = CachedExtent::extent_state_t::MUTATION_PENDING;
-  ret->set_reclaim_generation(DIRTY_GENERATION);
   DEBUGT("{} -> {}", t, *i, *ret);
   return ret;
 }
@@ -1440,6 +1441,7 @@ void Cache::complete_commit(
     DEBUGT("add extent as fresh, inline={} -- {}",
           t, is_inline, *i);
     const auto t_src = t.get_src();
+    i->invalidate_hints();
     add_extent(i, &t_src);
     epm.mark_space_used(i->get_paddr(), i->get_length());
     if (is_backref_mapped_extent_node(i)) {
index 05cf563866409e48ffd34d03e242dada2cf55239..2024e8115f81031730d91bc6d95479fd4968a388 100644 (file)
@@ -118,6 +118,7 @@ public:
             paddr_t paddr,
             placement_hint_t hint,
             reclaim_gen_t gen) {
+    assert(gen == NULL_GENERATION || is_reclaim_generation(gen));
     state = _state;
     set_paddr(paddr);
     user_hint = hint;
@@ -402,8 +403,10 @@ public:
     reclaim_generation = NULL_GENERATION;
   }
 
-  void set_reclaim_generation(reclaim_gen_t gen) {
-    assert(gen < RECLAIM_GENERATIONS);
+  /// assign the target reclaim generation for the followup rewrite
+  void set_target_reclaim_generation(reclaim_gen_t gen) {
+    assert(is_target_reclaim_generation(gen));
+
     user_hint = placement_hint_t::REWRITE;
     reclaim_generation = gen;
   }
@@ -485,10 +488,11 @@ private:
 
   read_set_item_t<Transaction>::list transactions;
 
-  placement_hint_t user_hint;
+  placement_hint_t user_hint = PLACEMENT_HINT_NULL;
 
-  /// > 0 and not null means the extent is under reclaimming
-  reclaim_gen_t reclaim_generation;
+  // the target reclaim generation for the followup rewrite
+  // or the reclaim generation for the fresh write
+  reclaim_gen_t reclaim_generation = NULL_GENERATION;
 
 protected:
   CachedExtent(CachedExtent &&other) = delete;
index 8e77a6804268030ad9af7038705e9345657730a6..a39b69c32003dfb216ebd99c0dff40317d4fe6da 100644 (file)
@@ -57,7 +57,6 @@ SegmentedOolWriter::write_record(
       TRACET("{} ool extent written at {} -- {}",
              t, segment_allocator.get_name(),
              extent_addr, *extent);
-      extent->invalidate_hints();
       t.mark_delayed_extent_ool(extent, extent_addr);
       extent_addr = extent_addr.as_seg_paddr().add_offset(
           extent->get_length());
@@ -179,23 +178,23 @@ void ExtentPlacementManager::init(
 {
   writer_refs.clear();
 
-  ceph_assert(RECLAIM_GENERATIONS > 0);
   auto segment_cleaner = dynamic_cast<SegmentCleaner*>(cleaner.get());
   ceph_assert(segment_cleaner != nullptr);
-  data_writers_by_gen.resize(RECLAIM_GENERATIONS, {});
-  for (reclaim_gen_t gen = 0; gen < RECLAIM_GENERATIONS; ++gen) {
+  auto num_writers = generation_to_writer(REWRITE_GENERATIONS);
+  data_writers_by_gen.resize(num_writers, {});
+  for (reclaim_gen_t gen = OOL_GENERATION; gen < REWRITE_GENERATIONS; ++gen) {
     writer_refs.emplace_back(std::make_unique<SegmentedOolWriter>(
           data_category_t::DATA, gen, *segment_cleaner,
           segment_cleaner->get_ool_segment_seq_allocator()));
-    data_writers_by_gen[gen] = writer_refs.back().get();
+    data_writers_by_gen[generation_to_writer(gen)] = writer_refs.back().get();
   }
 
-  md_writers_by_gen.resize(RECLAIM_GENERATIONS - 1, {});
-  for (reclaim_gen_t gen = 1; gen < RECLAIM_GENERATIONS; ++gen) {
+  md_writers_by_gen.resize(num_writers, {});
+  for (reclaim_gen_t gen = OOL_GENERATION; gen < REWRITE_GENERATIONS; ++gen) {
     writer_refs.emplace_back(std::make_unique<SegmentedOolWriter>(
           data_category_t::METADATA, gen, *segment_cleaner,
           segment_cleaner->get_ool_segment_seq_allocator()));
-    md_writers_by_gen[gen - 1] = writer_refs.back().get();
+    md_writers_by_gen[generation_to_writer(gen)] = writer_refs.back().get();
   }
 
   for (auto *device : segment_cleaner->get_segment_manager_group()
index 0804e7d1c252b9688ddcd538390364cdc9b70297..d17732e8340ad326210b36d49efa7acf714a6c28 100644 (file)
@@ -146,8 +146,8 @@ public:
     reclaim_gen_t gen
   ) {
     assert(hint < placement_hint_t::NUM_HINTS);
-    assert(gen < RECLAIM_GENERATIONS);
-    assert(gen == 0 || hint == placement_hint_t::REWRITE);
+    assert(is_target_reclaim_generation(gen));
+    assert(gen == INIT_GENERATION || hint == placement_hint_t::REWRITE);
 
     // XXX: bp might be extended to point to differnt memory (e.g. PMem)
     // according to the allocator.
@@ -159,31 +159,37 @@ public:
       // TODO: implement out-of-line strategy for physical extent.
       return {make_record_relative_paddr(0),
               std::move(bp),
-              0};
+              INLINE_GENERATION};
     }
 
     if (hint == placement_hint_t::COLD) {
-      assert(gen == 0);
+      assert(gen == INIT_GENERATION);
       return {make_delayed_temp_paddr(0),
               std::move(bp),
-              COLD_GENERATION};
+              MIN_REWRITE_GENERATION};
     }
 
     if (get_extent_category(type) == data_category_t::METADATA &&
-        gen == 0) {
-      // gen 0 METADATA writer is the journal writer
+        gen == INIT_GENERATION) {
       if (prefer_ool) {
         return {make_delayed_temp_paddr(0),
                 std::move(bp),
-                1};
+                OOL_GENERATION};
       } else {
+        // default not to ool metadata extents to reduce padding overhead.
+        // TODO: improve padding so we can default to the prefer_ool path.
         return {make_record_relative_paddr(0),
                 std::move(bp),
-                0};
+                INLINE_GENERATION};
       }
     } else {
       assert(get_extent_category(type) == data_category_t::DATA ||
-             gen > 0);
+             gen >= MIN_REWRITE_GENERATION);
+      if (gen > MAX_REWRITE_GENERATION) {
+        gen = MAX_REWRITE_GENERATION;
+      } else if (gen == INIT_GENERATION) {
+        gen = OOL_GENERATION;
+      }
       return {make_delayed_temp_paddr(0),
               std::move(bp),
               gen};
@@ -261,14 +267,13 @@ private:
                               data_category_t category,
                               reclaim_gen_t gen) {
     assert(hint < placement_hint_t::NUM_HINTS);
-    assert(gen < RECLAIM_GENERATIONS);
+    assert(is_reclaim_generation(gen));
+    assert(gen != INLINE_GENERATION);
     if (category == data_category_t::DATA) {
-      return data_writers_by_gen[gen];
+      return data_writers_by_gen[generation_to_writer(gen)];
     } else {
       assert(category == data_category_t::METADATA);
-      // gen 0 METADATA writer is the journal writer
-      assert(gen > 0);
-      return md_writers_by_gen[gen - 1];
+      return md_writers_by_gen[generation_to_writer(gen)];
     }
   }
 
index f8b8539738d6184841092ea6d83b39405454090a..58df913749321abe97332fc840e5be816d466123 100644 (file)
@@ -33,7 +33,7 @@ SegmentedJournal::SegmentedJournal(
       new SegmentSeqAllocator(segment_type_t::JOURNAL)),
     journal_segment_allocator(&trimmer,
                               data_category_t::METADATA,
-                              0, // generation
+                              INLINE_GENERATION,
                               segment_provider,
                               *segment_seq_allocator),
     record_submitter(crimson::common::get_conf<uint64_t>(
index bb41a9f78d3780a07b79a52d510e74f2f61c9842..1b8bab23625e739f28ac09b4eb9d685f21c58524 100644 (file)
@@ -249,9 +249,15 @@ std::ostream &operator<<(std::ostream &out, extent_types_t t)
 std::ostream &operator<<(std::ostream &out, reclaim_gen_printer_t gen)
 {
   if (gen.gen == NULL_GENERATION) {
-    return out << "NULL_GEN";
-  } else if (gen.gen >= RECLAIM_GENERATIONS) {
-    return out << "INVALID_GEN(" << (unsigned)gen.gen << ")";
+    return out << "GEN_NULL";
+  } else if (gen.gen == INIT_GENERATION) {
+    return out << "GEN_INIT";
+  } else if (gen.gen == INLINE_GENERATION) {
+    return out << "GEN_INL";
+  } else if (gen.gen == OOL_GENERATION) {
+    return out << "GEN_OOL";
+  } else if (gen.gen > REWRITE_GENERATIONS) {
+    return out << "GEN_INVALID(" << (unsigned)gen.gen << ")!";
   } else {
     return out << "GEN(" << (unsigned)gen.gen << ")";
   }
index 28d12ebb9573b1ff983736b3a4e5d0da6044050c..490622a67ff16b03d9b9cf8449b2d0762a271f8a 100644 (file)
@@ -1117,11 +1117,44 @@ constexpr bool is_backref_node(extent_types_t type)
 
 std::ostream &operator<<(std::ostream &out, extent_types_t t);
 
+/**
+ * reclaim_gen_t
+ *
+ * The goal is to group the similar aged extents in the same segment for better
+ * bimodal utilization distribution, and also to the same device tier. For EPM,
+ * it has the flexibility to make placement decisions by re-assigning the
+ * generation. And each non-inline generation will be statically mapped to a
+ * writer in EPM.
+ *
+ * All the fresh and dirty extents start with INIT_GENERATION upon allocation,
+ * and they will be assigned to INLINE/OOL generation by EPM before the initial
+ * writes. After that, the generation can only be increased upon rewrite.
+ *
+ * Note, although EPM can re-assign the generations according to the tiering
+ * status, it cannot decrease the generation for the correctness of space
+ * reservation. It may choose to assign a larger generation if the extent is
+ * hinted cold, or if it wants to evict extents to the cold tier. And it may
+ * choose not to increase the generation if it wants to keep the hot tier as
+ * filled as possible.
+ */
 using reclaim_gen_t = uint8_t;
 
-constexpr reclaim_gen_t DIRTY_GENERATION = 1;
-constexpr reclaim_gen_t COLD_GENERATION = 1;
-constexpr reclaim_gen_t RECLAIM_GENERATIONS = 3;
+// INIT_GENERATION requires EPM decision to INLINE/OOL_GENERATION
+constexpr reclaim_gen_t INIT_GENERATION = 0;
+constexpr reclaim_gen_t INLINE_GENERATION = 1; // to the journal
+constexpr reclaim_gen_t OOL_GENERATION = 2;
+
+// All the rewritten extents start with MIN_REWRITE_GENERATION
+constexpr reclaim_gen_t MIN_REWRITE_GENERATION = 3;
+constexpr reclaim_gen_t MAX_REWRITE_GENERATION = 4;
+
+/**
+ * TODO:
+ * For tiering, might introduce 5 and 6 for the cold tier, and 1 ~ 4 for the
+ * hot tier.
+ */
+
+constexpr reclaim_gen_t REWRITE_GENERATIONS = MAX_REWRITE_GENERATION + 1;
 constexpr reclaim_gen_t NULL_GENERATION =
   std::numeric_limits<reclaim_gen_t>::max();
 
@@ -1131,6 +1164,24 @@ struct reclaim_gen_printer_t {
 
 std::ostream &operator<<(std::ostream &out, reclaim_gen_printer_t gen);
 
+constexpr std::size_t generation_to_writer(reclaim_gen_t gen) {
+  // caller to assert the gen is in the reasonable range
+  return gen - OOL_GENERATION;
+}
+
+// before EPM decision
+constexpr bool is_target_reclaim_generation(reclaim_gen_t gen) {
+  return gen == INIT_GENERATION ||
+         (gen >= MIN_REWRITE_GENERATION &&
+          gen <= REWRITE_GENERATIONS);
+}
+
+// after EPM decision
+constexpr bool is_reclaim_generation(reclaim_gen_t gen) {
+  return gen >= INLINE_GENERATION &&
+         gen < REWRITE_GENERATIONS;
+}
+
 enum class data_category_t : uint8_t {
   METADATA = 0,
   DATA,
index 29d2c4873a0e67af4b3e744c388a65d579221f3c..f4ad79803e94cf0faaea83828bcda44e0e90f2e3 100644 (file)
@@ -451,6 +451,7 @@ TransactionManager::rewrite_logical_extent(
     lextent->get_type(),
     lextent->get_length(),
     lextent->get_user_hint(),
+    // get target reclaim generation
     lextent->get_reclaim_generation())->cast<LogicalCachedExtent>();
   lextent->get_bptr().copy_out(
     0,
@@ -493,9 +494,9 @@ TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent(
 
   assert(extent->is_valid() && !extent->is_initial_pending());
   if (extent->is_dirty()) {
-    extent->set_reclaim_generation(DIRTY_GENERATION);
+    extent->set_target_reclaim_generation(INIT_GENERATION);
   } else {
-    extent->set_reclaim_generation(target_generation);
+    extent->set_target_reclaim_generation(target_generation);
     ceph_assert(modify_time != NULL_TIME);
     extent->set_modify_time(modify_time);
   }
index 3945537708cd1fb3f3d6286c9f37f34c1bac552a..6e6eb45f73ca79549030913fe503038fa97a1b5c 100644 (file)
@@ -320,7 +320,7 @@ public:
       t,
       len,
       placement_hint,
-      0);
+      INIT_GENERATION);
     return lba_manager->alloc_extent(
       t,
       laddr_hint,