ceph_assert(_seq != NULL_SEG_SEQ);
ceph_assert(_type != segment_type_t::NULL_SEG);
ceph_assert(_category != data_category_t::NUM);
- ceph_assert(_generation < RECLAIM_GENERATIONS);
+ ceph_assert(is_reclaim_generation(_generation));
state = Segment::segment_state_t::OPEN;
seq = _seq;
type = _type;
ceph_assert(_seq != NULL_SEG_SEQ);
ceph_assert(_type != segment_type_t::NULL_SEG);
ceph_assert(_category != data_category_t::NUM);
- ceph_assert(_generation < RECLAIM_GENERATIONS);
+ ceph_assert(is_reclaim_generation(_generation));
state = Segment::segment_state_t::CLOSED;
seq = _seq;
type = _type;
dirty_list,
[this, &t](auto &e) {
return extent_callback->rewrite_extent(
- t, e, DIRTY_GENERATION, NULL_TIME);
+ t, e, INIT_GENERATION, NULL_TIME);
});
});
}).si_then([this, &t] {
segment_id_t segment_id,
reclaim_gen_t generation,
segment_off_t segment_size) {
- ceph_assert(generation < RECLAIM_GENERATIONS);
+ ceph_assert(is_reclaim_generation(generation));
+
+ reclaim_gen_t target_gen;
+ if (generation < MIN_REWRITE_GENERATION) {
+ target_gen = MIN_REWRITE_GENERATION;
+ } else {
+ // tolerate target_gen exceeding MAX_REWRITE_GENERATION (by one) to make
+ // EPM aware of the extent's original generation for its decisions.
+ target_gen = generation + 1;
+ }
+
+ assert(is_target_reclaim_generation(target_gen));
return {generation,
- (reclaim_gen_t)(generation == RECLAIM_GENERATIONS - 1 ?
- generation : generation + 1),
+ target_gen,
segment_size,
P_ADDR_NULL,
paddr_t::make_seg_paddr(segment_id, 0)};
c.trans,
node_size,
placement_hint_t::HOT,
- 0);
+ INIT_GENERATION);
root_leaf->set_size(0);
fixed_kv_node_meta_t<node_key_t> meta{min_max_t<node_key_t>::min, min_max_t<node_key_t>::max, 1};
root_leaf->set_meta(meta);
c.trans,
fixed_kv_extent.get_length(),
fixed_kv_extent.get_user_hint(),
+ // get target reclaim generation
fixed_kv_extent.get_reclaim_generation());
fixed_kv_extent.get_bptr().copy_out(
0,
if (split_from == iter.get_depth()) {
auto nroot = c.cache.template alloc_new_extent<internal_node_t>(
- c.trans, node_size, placement_hint_t::HOT, 0);
+ c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
fixed_kv_node_meta_t<node_key_t> meta{
min_max_t<node_key_t>::min, min_max_t<node_key_t>::max, iter.get_depth() + 1};
nroot->set_meta(meta);
std::tuple<Ref, Ref, NODE_KEY>
make_split_children(op_context_t<NODE_KEY> c) {
auto left = c.cache.template alloc_new_extent<node_type_t>(
- c.trans, node_size, placement_hint_t::HOT, 0);
+ c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
auto right = c.cache.template alloc_new_extent<node_type_t>(
- c.trans, node_size, placement_hint_t::HOT, 0);
+ c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
auto pivot = this->split_into(*left, *right);
left->pin.set_range(left->get_meta());
right->pin.set_range(right->get_meta());
op_context_t<NODE_KEY> c,
Ref &right) {
auto replacement = c.cache.template alloc_new_extent<node_type_t>(
- c.trans, node_size, placement_hint_t::HOT, 0);
+ c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
replacement->merge_from(*this, *right->template cast<node_type_t>());
replacement->pin.set_range(replacement->get_meta());
return replacement;
ceph_assert(_right->get_type() == this->get_type());
auto &right = *_right->template cast<node_type_t>();
auto replacement_left = c.cache.template alloc_new_extent<node_type_t>(
- c.trans, node_size, placement_hint_t::HOT, 0);
+ c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
auto replacement_right = c.cache.template alloc_new_extent<node_type_t>(
- c.trans, node_size, placement_hint_t::HOT, 0);
+ c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
auto pivot = this->balance_into_new_nodes(
*this,
std::tuple<Ref, Ref, NODE_KEY>
make_split_children(op_context_t<NODE_KEY> c) {
auto left = c.cache.template alloc_new_extent<node_type_t>(
- c.trans, node_size, placement_hint_t::HOT, 0);
+ c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
auto right = c.cache.template alloc_new_extent<node_type_t>(
- c.trans, node_size, placement_hint_t::HOT, 0);
+ c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
auto pivot = this->split_into(*left, *right);
left->pin.set_range(left->get_meta());
right->pin.set_range(right->get_meta());
op_context_t<NODE_KEY> c,
Ref &right) {
auto replacement = c.cache.template alloc_new_extent<node_type_t>(
- c.trans, node_size, placement_hint_t::HOT, 0);
+ c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
replacement->merge_from(*this, *right->template cast<node_type_t>());
replacement->pin.set_range(replacement->get_meta());
return replacement;
ceph_assert(_right->get_type() == this->get_type());
auto &right = *_right->template cast<node_type_t>();
auto replacement_left = c.cache.template alloc_new_extent<node_type_t>(
- c.trans, node_size, placement_hint_t::HOT, 0);
+ c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
auto replacement_right = c.cache.template alloc_new_extent<node_type_t>(
- c.trans, node_size, placement_hint_t::HOT, 0);
+ c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
auto pivot = this->balance_into_new_nodes(
*this,
const Transaction::src_t* p_src=nullptr)
{
assert(ref->is_valid());
+ assert(ref->user_hint == PLACEMENT_HINT_NULL);
+ assert(ref->reclaim_generation == NULL_GENERATION);
extents.insert(*ref);
if (ref->is_dirty()) {
add_to_dirty(ref);
ret->version++;
ret->state = CachedExtent::extent_state_t::MUTATION_PENDING;
- ret->set_reclaim_generation(DIRTY_GENERATION);
DEBUGT("{} -> {}", t, *i, *ret);
return ret;
}
DEBUGT("add extent as fresh, inline={} -- {}",
t, is_inline, *i);
const auto t_src = t.get_src();
+ i->invalidate_hints();
add_extent(i, &t_src);
epm.mark_space_used(i->get_paddr(), i->get_length());
if (is_backref_mapped_extent_node(i)) {
paddr_t paddr,
placement_hint_t hint,
reclaim_gen_t gen) {
+ assert(gen == NULL_GENERATION || is_reclaim_generation(gen));
state = _state;
set_paddr(paddr);
user_hint = hint;
reclaim_generation = NULL_GENERATION;
}
- void set_reclaim_generation(reclaim_gen_t gen) {
- assert(gen < RECLAIM_GENERATIONS);
+ /// assign the target reclaim generation for the follow-up rewrite
+ void set_target_reclaim_generation(reclaim_gen_t gen) {
+ assert(is_target_reclaim_generation(gen));
+
user_hint = placement_hint_t::REWRITE;
reclaim_generation = gen;
}
read_set_item_t<Transaction>::list transactions;
- placement_hint_t user_hint;
+ placement_hint_t user_hint = PLACEMENT_HINT_NULL;
- /// > 0 and not null means the extent is under reclaimming
- reclaim_gen_t reclaim_generation;
+ // the target reclaim generation for the follow-up rewrite,
+ // or the reclaim generation for the fresh write
+ reclaim_gen_t reclaim_generation = NULL_GENERATION;
protected:
CachedExtent(CachedExtent &&other) = delete;
TRACET("{} ool extent written at {} -- {}",
t, segment_allocator.get_name(),
extent_addr, *extent);
- extent->invalidate_hints();
t.mark_delayed_extent_ool(extent, extent_addr);
extent_addr = extent_addr.as_seg_paddr().add_offset(
extent->get_length());
{
writer_refs.clear();
- ceph_assert(RECLAIM_GENERATIONS > 0);
auto segment_cleaner = dynamic_cast<SegmentCleaner*>(cleaner.get());
ceph_assert(segment_cleaner != nullptr);
- data_writers_by_gen.resize(RECLAIM_GENERATIONS, {});
- for (reclaim_gen_t gen = 0; gen < RECLAIM_GENERATIONS; ++gen) {
+ auto num_writers = generation_to_writer(REWRITE_GENERATIONS);
+ data_writers_by_gen.resize(num_writers, {});
+ for (reclaim_gen_t gen = OOL_GENERATION; gen < REWRITE_GENERATIONS; ++gen) {
writer_refs.emplace_back(std::make_unique<SegmentedOolWriter>(
data_category_t::DATA, gen, *segment_cleaner,
segment_cleaner->get_ool_segment_seq_allocator()));
- data_writers_by_gen[gen] = writer_refs.back().get();
+ data_writers_by_gen[generation_to_writer(gen)] = writer_refs.back().get();
}
- md_writers_by_gen.resize(RECLAIM_GENERATIONS - 1, {});
- for (reclaim_gen_t gen = 1; gen < RECLAIM_GENERATIONS; ++gen) {
+ md_writers_by_gen.resize(num_writers, {});
+ for (reclaim_gen_t gen = OOL_GENERATION; gen < REWRITE_GENERATIONS; ++gen) {
writer_refs.emplace_back(std::make_unique<SegmentedOolWriter>(
data_category_t::METADATA, gen, *segment_cleaner,
segment_cleaner->get_ool_segment_seq_allocator()));
- md_writers_by_gen[gen - 1] = writer_refs.back().get();
+ md_writers_by_gen[generation_to_writer(gen)] = writer_refs.back().get();
}
for (auto *device : segment_cleaner->get_segment_manager_group()
reclaim_gen_t gen
) {
assert(hint < placement_hint_t::NUM_HINTS);
- assert(gen < RECLAIM_GENERATIONS);
- assert(gen == 0 || hint == placement_hint_t::REWRITE);
+ assert(is_target_reclaim_generation(gen));
+ assert(gen == INIT_GENERATION || hint == placement_hint_t::REWRITE);
// XXX: bp might be extended to point to differnt memory (e.g. PMem)
// according to the allocator.
// TODO: implement out-of-line strategy for physical extent.
return {make_record_relative_paddr(0),
std::move(bp),
- 0};
+ INLINE_GENERATION};
}
if (hint == placement_hint_t::COLD) {
- assert(gen == 0);
+ assert(gen == INIT_GENERATION);
return {make_delayed_temp_paddr(0),
std::move(bp),
- COLD_GENERATION};
+ MIN_REWRITE_GENERATION};
}
if (get_extent_category(type) == data_category_t::METADATA &&
- gen == 0) {
- // gen 0 METADATA writer is the journal writer
+ gen == INIT_GENERATION) {
if (prefer_ool) {
return {make_delayed_temp_paddr(0),
std::move(bp),
- 1};
+ OOL_GENERATION};
} else {
+ // by default, do not write metadata extents out-of-line, to reduce
+ // padding overhead.
+ // TODO: improve padding so we can default to the prefer_ool path.
return {make_record_relative_paddr(0),
std::move(bp),
- 0};
+ INLINE_GENERATION};
}
} else {
assert(get_extent_category(type) == data_category_t::DATA ||
- gen > 0);
+ gen >= MIN_REWRITE_GENERATION);
+ if (gen > MAX_REWRITE_GENERATION) {
+ gen = MAX_REWRITE_GENERATION;
+ } else if (gen == INIT_GENERATION) {
+ gen = OOL_GENERATION;
+ }
return {make_delayed_temp_paddr(0),
std::move(bp),
gen};
data_category_t category,
reclaim_gen_t gen) {
assert(hint < placement_hint_t::NUM_HINTS);
- assert(gen < RECLAIM_GENERATIONS);
+ assert(is_reclaim_generation(gen));
+ assert(gen != INLINE_GENERATION);
if (category == data_category_t::DATA) {
- return data_writers_by_gen[gen];
+ return data_writers_by_gen[generation_to_writer(gen)];
} else {
assert(category == data_category_t::METADATA);
- // gen 0 METADATA writer is the journal writer
- assert(gen > 0);
- return md_writers_by_gen[gen - 1];
+ return md_writers_by_gen[generation_to_writer(gen)];
}
}
new SegmentSeqAllocator(segment_type_t::JOURNAL)),
journal_segment_allocator(&trimmer,
data_category_t::METADATA,
- 0, // generation
+ INLINE_GENERATION,
segment_provider,
*segment_seq_allocator),
record_submitter(crimson::common::get_conf<uint64_t>(
std::ostream &operator<<(std::ostream &out, reclaim_gen_printer_t gen)
{
if (gen.gen == NULL_GENERATION) {
- return out << "NULL_GEN";
- } else if (gen.gen >= RECLAIM_GENERATIONS) {
- return out << "INVALID_GEN(" << (unsigned)gen.gen << ")";
+ return out << "GEN_NULL";
+ } else if (gen.gen == INIT_GENERATION) {
+ return out << "GEN_INIT";
+ } else if (gen.gen == INLINE_GENERATION) {
+ return out << "GEN_INL";
+ } else if (gen.gen == OOL_GENERATION) {
+ return out << "GEN_OOL";
+ } else if (gen.gen > REWRITE_GENERATIONS) {
+ return out << "GEN_INVALID(" << (unsigned)gen.gen << ")!";
} else {
return out << "GEN(" << (unsigned)gen.gen << ")";
}
std::ostream &operator<<(std::ostream &out, extent_types_t t);
+/**
+ * reclaim_gen_t
+ *
+ * The goal is to group extents of similar age into the same segment for a
+ * better bimodal utilization distribution, and also into the same device
+ * tier. For EPM, it has the flexibility to make placement decisions by
+ * re-assigning the generation. And each non-inline generation will be
+ * statically mapped to a writer in EPM.
+ *
+ * All the fresh and dirty extents start with INIT_GENERATION upon allocation,
+ * and they will be assigned to INLINE/OOL generation by EPM before the initial
+ * writes. After that, the generation can only be increased upon rewrite.
+ *
+ * Note, although EPM can re-assign the generations according to the tiering
+ * status, it cannot decrease the generation for the correctness of space
+ * reservation. It may choose to assign a larger generation if the extent is
+ * hinted cold, or if it wants to evict extents to the cold tier. And it may
+ * choose to not increase the generation if it wants to keep the hot tier as
+ * filled as possible.
+ */
using reclaim_gen_t = uint8_t;
+// INIT_GENERATION is transient: EPM re-assigns it to INLINE_GENERATION or
+// OOL_GENERATION before the initial write
+constexpr reclaim_gen_t INIT_GENERATION = 0;
+constexpr reclaim_gen_t INLINE_GENERATION = 1; // to the journal
+constexpr reclaim_gen_t OOL_GENERATION = 2;
+
+// All the rewritten extents start with MIN_REWRITE_GENERATION
+constexpr reclaim_gen_t MIN_REWRITE_GENERATION = 3;
+constexpr reclaim_gen_t MAX_REWRITE_GENERATION = 4;
+
+/**
+ * TODO:
+ * For tiering, might introduce 5 and 6 for the cold tier, and 1 ~ 4 for the
+ * hot tier.
+ */
+
+constexpr reclaim_gen_t REWRITE_GENERATIONS = MAX_REWRITE_GENERATION + 1;
constexpr reclaim_gen_t NULL_GENERATION =
std::numeric_limits<reclaim_gen_t>::max();
std::ostream &operator<<(std::ostream &out, reclaim_gen_printer_t gen);
+constexpr std::size_t generation_to_writer(reclaim_gen_t gen) {
+ // caller to assert the gen is in the reasonable range, i.e.
+ // OOL_GENERATION <= gen < REWRITE_GENERATIONS, so the writer index
+ // starts at 0
+ return gen - OOL_GENERATION;
+}
+
+// valid as a rewrite target, before the EPM decision; the target is
+// allowed to exceed MAX_REWRITE_GENERATION by one (== REWRITE_GENERATIONS)
+// so EPM can see the extent's original generation
+constexpr bool is_target_reclaim_generation(reclaim_gen_t gen) {
+ return gen == INIT_GENERATION ||
+ (gen >= MIN_REWRITE_GENERATION &&
+ gen <= REWRITE_GENERATIONS);
+}
+
+// valid for an actual write, after the EPM decision
+constexpr bool is_reclaim_generation(reclaim_gen_t gen) {
+ return gen >= INLINE_GENERATION &&
+ gen < REWRITE_GENERATIONS;
+}
+
enum class data_category_t : uint8_t {
METADATA = 0,
DATA,
lextent->get_type(),
lextent->get_length(),
lextent->get_user_hint(),
+ // get target reclaim generation
lextent->get_reclaim_generation())->cast<LogicalCachedExtent>();
lextent->get_bptr().copy_out(
0,
assert(extent->is_valid() && !extent->is_initial_pending());
if (extent->is_dirty()) {
- extent->set_reclaim_generation(DIRTY_GENERATION);
+ extent->set_target_reclaim_generation(INIT_GENERATION);
} else {
- extent->set_reclaim_generation(target_generation);
+ extent->set_target_reclaim_generation(target_generation);
ceph_assert(modify_time != NULL_TIME);
extent->set_modify_time(modify_time);
}
t,
len,
placement_hint,
- 0);
+ INIT_GENERATION);
return lba_manager->alloc_extent(
t,
laddr_hint,