namespace crimson::os::seastore {
+// Transition this segment to OPEN and record its identity: sequence, type,
+// data category and reclaim generation. All four must be non-null/valid.
void segment_info_t::set_open(
- segment_seq_t _seq, segment_type_t _type)
+ segment_seq_t _seq, segment_type_t _type,
+ data_category_t _category, reclaim_gen_t _generation)
{
ceph_assert(_seq != NULL_SEG_SEQ);
ceph_assert(_type != segment_type_t::NULL_SEG);
+ // NUM is the "unset" category value, so it is rejected here
+ ceph_assert(_category != data_category_t::NUM);
+ ceph_assert(_generation < RECLAIM_GENERATIONS);
state = Segment::segment_state_t::OPEN;
seq = _seq;
type = _type;
+ category = _category;
+ generation = _generation;
+ // nothing has been written to a freshly opened segment
written_to = 0;
}
state = Segment::segment_state_t::EMPTY;
seq = NULL_SEG_SEQ;
type = segment_type_t::NULL_SEG;
+ category = data_category_t::NUM;
+ generation = NULL_GENERATION;
last_modified = {};
last_rewritten = {};
written_to = 0;
}
+// Initialize this segment directly in the CLOSED state, recording its
+// sequence, type, data category and reclaim generation.
void segment_info_t::init_closed(
- segment_seq_t _seq, segment_type_t _type, std::size_t seg_size)
+ segment_seq_t _seq, segment_type_t _type,
+ data_category_t _category, reclaim_gen_t _generation,
+ std::size_t seg_size)
{
ceph_assert(_seq != NULL_SEG_SEQ);
ceph_assert(_type != segment_type_t::NULL_SEG);
+ ceph_assert(_category != data_category_t::NUM);
+ ceph_assert(_generation < RECLAIM_GENERATIONS);
state = Segment::segment_state_t::CLOSED;
seq = _seq;
type = _type;
+ category = _category;
+ generation = _generation;
+ // a closed segment is accounted as written up to seg_size
written_to = seg_size;
}
} else { // open or closed
out << ", seq=" << segment_seq_printer_t{info.seq}
<< ", type=" << info.type
+ << ", category=" << info.category
+ << ", generation=" << reclaim_gen_printer_t{info.generation}
<< ", last_modified=" << info.last_modified.time_since_epoch()
<< ", last_rewritten=" << info.last_rewritten.time_since_epoch()
<< ", written_to=" << info.written_to;
}
+// Initialize an empty segment as CLOSED with the given identity and move it
+// from the empty counter to the closed counter.
void segments_info_t::init_closed(
- segment_id_t segment, segment_seq_t seq, segment_type_t type)
+ segment_id_t segment, segment_seq_t seq, segment_type_t type,
+ data_category_t category, reclaim_gen_t generation)
{
LOG_PREFIX(segments_info_t::init_closed);
auto& segment_info = segments[segment];
- INFO("initiating {} {} {}, {}, num_segments(empty={}, opened={}, closed={})",
+ INFO("initiating {} {} {} {} {}, {}, "
+ "num_segments(empty={}, opened={}, closed={})",
segment, segment_seq_printer_t{seq}, type,
+ category, reclaim_gen_printer_t{generation},
segment_info, num_empty, num_open, num_closed);
ceph_assert(segment_info.is_empty());
- segment_info.init_closed(seq, type, get_segment_size());
+ segment_info.init_closed(
+ seq, type, category, generation, get_segment_size());
ceph_assert(num_empty > 0);
--num_empty;
++num_closed;
}
void segments_info_t::mark_open(
- segment_id_t segment, segment_seq_t seq, segment_type_t type)
+ segment_id_t segment, segment_seq_t seq, segment_type_t type,
+ data_category_t category, reclaim_gen_t generation)
{
LOG_PREFIX(segments_info_t::mark_open);
auto& segment_info = segments[segment];
- INFO("opening {} {} {}, {}, num_segments(empty={}, opened={}, closed={})",
+ INFO("opening {} {} {} {} {}, {}, "
+ "num_segments(empty={}, opened={}, closed={})",
segment, segment_seq_printer_t{seq}, type,
+ category, reclaim_gen_printer_t{generation},
segment_info, num_empty, num_open, num_closed);
ceph_assert(segment_info.is_empty());
- segment_info.set_open(seq, type);
+ segment_info.set_open(seq, type, category, generation);
ceph_assert(num_empty > 0);
--num_empty;
++num_open;
segment_id_t AsyncCleaner::allocate_segment(
segment_seq_t seq,
- segment_type_t type)
+ segment_type_t type,
+ data_category_t category,
+ reclaim_gen_t generation)
{
LOG_PREFIX(AsyncCleaner::allocate_segment);
assert(seq != NULL_SEG_SEQ);
auto& segment_info = it->second;
if (segment_info.is_empty()) {
auto old_usage = calc_utilization(seg_id);
- segments.mark_open(seg_id, seq, type);
+ segments.mark_open(seg_id, seq, type, category, generation);
auto new_usage = calc_utilization(seg_id);
adjust_segment_util(old_usage, new_usage);
INFO("opened, should_block_on_gc {}, projected_avail_ratio {}, "
dirty_list,
[this, FNAME, &t](auto &e) {
DEBUGT("cleaning {}", t, *e);
- return ecb->rewrite_extent(t, e);
+ return ecb->rewrite_extent(t, e, DIRTY_GENERATION);
});
});
});
INFO("reclaim {} {} start", seg_id, segment_info);
ceph_assert(segment_info.is_closed());
reclaim_state = reclaim_state_t::create(
- seg_id, segments.get_segment_size());
+ seg_id, segment_info.generation, segments.get_segment_size());
}
reclaim_state->advance(config.reclaim_bytes_per_cycle);
- DEBUG("reclaiming {}~{}",
+ DEBUG("reclaiming {} {}~{}",
+ reclaim_gen_printer_t{reclaim_state->generation},
reclaim_state->start_pos,
reclaim_state->end_pos);
double pavail_ratio = get_projected_available_ratio();
extents,
[this, &t, &reclaimed](auto &ext) {
reclaimed += ext->get_length();
- return ecb->rewrite_extent(t, ext);
+ return ecb->rewrite_extent(t, ext, reclaim_state->target_generation);
});
});
}).si_then([this, &t, &seq] {
init_mark_segment_closed(
segment_id,
header.segment_seq,
- header.type);
+ header.type,
+ header.category,
+ header.generation);
return seastar::now();
}).handle_error(
crimson::ct_error::enodata::handle(
init_mark_segment_closed(
segment_id,
header.segment_seq,
- header.type);
+ header.type,
+ header.category,
+ header.generation);
return seastar::now();
});
}
segment_type_t type = segment_type_t::NULL_SEG;
+ data_category_t category = data_category_t::NUM;
+
+ reclaim_gen_t generation = NULL_GENERATION;
+
time_point last_modified;
time_point last_rewritten;
return state == Segment::segment_state_t::OPEN;
}
- void init_closed(segment_seq_t, segment_type_t, std::size_t);
+ void init_closed(segment_seq_t, segment_type_t,
+ data_category_t, reclaim_gen_t,
+ std::size_t);
- void set_open(segment_seq_t, segment_type_t);
+ void set_open(segment_seq_t, segment_type_t,
+ data_category_t, reclaim_gen_t);
void set_empty();
void add_segment_manager(SegmentManager &segment_manager);
// initiate non-empty segments, the others are by default empty
- void init_closed(segment_id_t, segment_seq_t, segment_type_t);
+ void init_closed(segment_id_t, segment_seq_t, segment_type_t,
+ data_category_t, reclaim_gen_t);
- void mark_open(segment_id_t, segment_seq_t, segment_type_t);
+ void mark_open(segment_id_t, segment_seq_t, segment_type_t,
+ data_category_t, reclaim_gen_t);
void mark_empty(segment_id_t);
virtual const segment_info_t& get_seg_info(segment_id_t id) const = 0;
virtual segment_id_t allocate_segment(
- segment_seq_t seq, segment_type_t type) = 0;
+ segment_seq_t, segment_type_t, data_category_t, reclaim_gen_t) = 0;
virtual journal_seq_t get_dirty_extents_replay_from() const = 0;
using rewrite_extent_ret = rewrite_extent_iertr::future<>;
virtual rewrite_extent_ret rewrite_extent(
Transaction &t,
- CachedExtentRef extent) = 0;
+ CachedExtentRef extent,
+ reclaim_gen_t target_generation) = 0;
/**
* get_extent_if_live
}
segment_id_t allocate_segment(
- segment_seq_t seq, segment_type_t type) final;
+ segment_seq_t, segment_type_t, data_category_t, reclaim_gen_t) final;
void close_segment(segment_id_t segment) final;
}
struct reclaim_state_t {
+ reclaim_gen_t generation;
+ reclaim_gen_t target_generation;
std::size_t segment_size;
paddr_t start_pos;
paddr_t end_pos;
+ /// Build the initial reclaim state for a segment of the given generation.
+ /// The target generation is generation + 1, capped at the last valid
+ /// generation (RECLAIM_GENERATIONS - 1).
static reclaim_state_t create(
segment_id_t segment_id,
+ reclaim_gen_t generation,
std::size_t segment_size) {
- return {segment_size,
+ ceph_assert(generation < RECLAIM_GENERATIONS);
+ // saturate at the last generation instead of stepping past it
+ const auto target_generation = static_cast<reclaim_gen_t>(
+ generation == RECLAIM_GENERATIONS - 1 ? generation : generation + 1);
+ return {generation,
+ target_generation,
+ segment_size,
P_ADDR_NULL,
paddr_t::make_seg_paddr(segment_id, 0)};
}
void init_mark_segment_closed(
segment_id_t segment,
segment_seq_t seq,
- segment_type_t s_type) {
+ segment_type_t s_type,
+ data_category_t category,
+ reclaim_gen_t generation) {
ceph_assert(!init_complete);
auto old_usage = calc_utilization(segment);
- segments.init_closed(segment, seq, s_type);
+ segments.init_closed(segment, seq, s_type, category, generation);
auto new_usage = calc_utilization(segment);
adjust_segment_util(old_usage, new_usage);
if (s_type == segment_type_t::OOL) {
static mkfs_ret mkfs(op_context_t<node_key_t> c) {
auto root_leaf = c.cache.template alloc_new_extent<leaf_node_t>(
c.trans,
- node_size);
+ node_size,
+ placement_hint_t::HOT,
+ 0);
root_leaf->set_size(0);
fixed_kv_node_meta_t<node_key_t> meta{min_max_t<node_key_t>::min, min_max_t<node_key_t>::max, 1};
root_leaf->set_meta(meta);
std::remove_reference_t<decltype(fixed_kv_extent)>
>(
c.trans,
- fixed_kv_extent.get_length());
+ fixed_kv_extent.get_length(),
+ fixed_kv_extent.get_user_hint(),
+ fixed_kv_extent.get_reclaim_generation());
fixed_kv_extent.get_bptr().copy_out(
0,
fixed_kv_extent.get_length(),
if (split_from == iter.get_depth()) {
auto nroot = c.cache.template alloc_new_extent<internal_node_t>(
- c.trans, node_size);
+ c.trans, node_size, placement_hint_t::HOT, 0);
fixed_kv_node_meta_t<node_key_t> meta{
min_max_t<node_key_t>::min, min_max_t<node_key_t>::max, iter.get_depth() + 1};
nroot->set_meta(meta);
std::tuple<Ref, Ref, NODE_KEY>
make_split_children(op_context_t<NODE_KEY> c) {
auto left = c.cache.template alloc_new_extent<node_type_t>(
- c.trans, node_size);
+ c.trans, node_size, placement_hint_t::HOT, 0);
auto right = c.cache.template alloc_new_extent<node_type_t>(
- c.trans, node_size);
+ c.trans, node_size, placement_hint_t::HOT, 0);
auto pivot = this->split_into(*left, *right);
left->pin.set_range(left->get_meta());
right->pin.set_range(right->get_meta());
op_context_t<NODE_KEY> c,
Ref &right) {
auto replacement = c.cache.template alloc_new_extent<node_type_t>(
- c.trans, node_size);
+ c.trans, node_size, placement_hint_t::HOT, 0);
replacement->merge_from(*this, *right->template cast<node_type_t>());
replacement->pin.set_range(replacement->get_meta());
return replacement;
ceph_assert(_right->get_type() == this->get_type());
auto &right = *_right->template cast<node_type_t>();
auto replacement_left = c.cache.template alloc_new_extent<node_type_t>(
- c.trans, node_size);
+ c.trans, node_size, placement_hint_t::HOT, 0);
auto replacement_right = c.cache.template alloc_new_extent<node_type_t>(
- c.trans, node_size);
+ c.trans, node_size, placement_hint_t::HOT, 0);
auto pivot = this->balance_into_new_nodes(
*this,
std::tuple<Ref, Ref, NODE_KEY>
make_split_children(op_context_t<NODE_KEY> c) {
auto left = c.cache.template alloc_new_extent<node_type_t>(
- c.trans, node_size);
+ c.trans, node_size, placement_hint_t::HOT, 0);
auto right = c.cache.template alloc_new_extent<node_type_t>(
- c.trans, node_size);
+ c.trans, node_size, placement_hint_t::HOT, 0);
auto pivot = this->split_into(*left, *right);
left->pin.set_range(left->get_meta());
right->pin.set_range(right->get_meta());
op_context_t<NODE_KEY> c,
Ref &right) {
auto replacement = c.cache.template alloc_new_extent<node_type_t>(
- c.trans, node_size);
+ c.trans, node_size, placement_hint_t::HOT, 0);
replacement->merge_from(*this, *right->template cast<node_type_t>());
replacement->pin.set_range(replacement->get_meta());
return replacement;
ceph_assert(_right->get_type() == this->get_type());
auto &right = *_right->template cast<node_type_t>();
auto replacement_left = c.cache.template alloc_new_extent<node_type_t>(
- c.trans, node_size);
+ c.trans, node_size, placement_hint_t::HOT, 0);
auto replacement_right = c.cache.template alloc_new_extent<node_type_t>(
- c.trans, node_size);
+ c.trans, node_size, placement_hint_t::HOT, 0);
auto pivot = this->balance_into_new_nodes(
*this,
// add a new placeholder to Cache
ext = CachedExtent::make_cached_extent_ref<
RetiredExtentPlaceholder>(length);
- ext->set_paddr(addr);
- ext->state = CachedExtent::extent_state_t::CLEAN;
+ ext->init(CachedExtent::extent_state_t::CLEAN,
+ addr,
+ placement_hint_t::NUM_HINTS,
+ NULL_GENERATION);
DEBUGT("retire {}~{} as placeholder, add extent -- {}",
t, addr, length, *ext);
add_extent(ext);
}
CachedExtentRef Cache::alloc_new_extent_by_type(
- Transaction &t, ///< [in, out] current transaction
- extent_types_t type, ///< [in] type tag
+ Transaction &t, ///< [in, out] current transaction
+ extent_types_t type, ///< [in] type tag
seastore_off_t length, ///< [in] length
- placement_hint_t hint
+ placement_hint_t hint, ///< [in] user hint
+ reclaim_gen_t gen ///< [in] reclaim generation
)
{
LOG_PREFIX(Cache::alloc_new_extent_by_type);
- SUBDEBUGT(seastore_cache, "allocate {} {}B, hint={}",
- t, type, length, hint);
+ SUBDEBUGT(seastore_cache, "allocate {} {}B, hint={}, gen={}",
+ t, type, length, hint, reclaim_gen_printer_t{gen});
switch (type) {
case extent_types_t::ROOT:
ceph_assert(0 == "ROOT is never directly alloc'd");
return CachedExtentRef();
case extent_types_t::LADDR_INTERNAL:
- return alloc_new_extent<lba_manager::btree::LBAInternalNode>(t, length, hint);
+ return alloc_new_extent<lba_manager::btree::LBAInternalNode>(t, length, hint, gen);
case extent_types_t::LADDR_LEAF:
- return alloc_new_extent<lba_manager::btree::LBALeafNode>(t, length, hint);
+ return alloc_new_extent<lba_manager::btree::LBALeafNode>(t, length, hint, gen);
case extent_types_t::ONODE_BLOCK_STAGED:
- return alloc_new_extent<onode::SeastoreNodeExtent>(t, length, hint);
+ return alloc_new_extent<onode::SeastoreNodeExtent>(t, length, hint, gen);
case extent_types_t::OMAP_INNER:
- return alloc_new_extent<omap_manager::OMapInnerNode>(t, length, hint);
+ return alloc_new_extent<omap_manager::OMapInnerNode>(t, length, hint, gen);
case extent_types_t::OMAP_LEAF:
- return alloc_new_extent<omap_manager::OMapLeafNode>(t, length, hint);
+ return alloc_new_extent<omap_manager::OMapLeafNode>(t, length, hint, gen);
case extent_types_t::COLL_BLOCK:
- return alloc_new_extent<collection_manager::CollectionNode>(t, length, hint);
+ return alloc_new_extent<collection_manager::CollectionNode>(t, length, hint, gen);
case extent_types_t::OBJECT_DATA_BLOCK:
- return alloc_new_extent<ObjectDataBlock>(t, length, hint);
+ return alloc_new_extent<ObjectDataBlock>(t, length, hint, gen);
case extent_types_t::RETIRED_PLACEHOLDER:
ceph_assert(0 == "impossible");
return CachedExtentRef();
case extent_types_t::TEST_BLOCK:
- return alloc_new_extent<TestBlock>(t, length, hint);
+ return alloc_new_extent<TestBlock>(t, length, hint, gen);
case extent_types_t::TEST_BLOCK_PHYSICAL:
- return alloc_new_extent<TestBlockPhysical>(t, length, hint);
+ return alloc_new_extent<TestBlockPhysical>(t, length, hint, gen);
case extent_types_t::NONE: {
ceph_assert(0 == "NONE is an invalid extent type");
return CachedExtentRef();
ret->version++;
ret->state = CachedExtent::extent_state_t::MUTATION_PENDING;
+ ret->set_reclaim_generation(DIRTY_GENERATION);
DEBUGT("{} -> {}", t, *i, *ret);
return ret;
}
if (!cached) {
auto ret = CachedExtent::make_cached_extent_ref<T>(
alloc_cache_buf(length));
- ret->set_paddr(offset);
- ret->state = CachedExtent::extent_state_t::CLEAN_PENDING;
+ ret->init(CachedExtent::extent_state_t::CLEAN_PENDING,
+ offset,
+ placement_hint_t::NUM_HINTS,
+ NULL_GENERATION);
SUBDEBUG(seastore_cache,
"{} {}~{} is absent, add extent and reading ... -- {}",
T::TYPE, offset, length, *ret);
if (cached->get_type() == extent_types_t::RETIRED_PLACEHOLDER) {
auto ret = CachedExtent::make_cached_extent_ref<T>(
alloc_cache_buf(length));
- ret->set_paddr(offset);
- ret->state = CachedExtent::extent_state_t::CLEAN_PENDING;
+ ret->init(CachedExtent::extent_state_t::CLEAN_PENDING,
+ offset,
+ placement_hint_t::NUM_HINTS,
+ NULL_GENERATION);
SUBDEBUG(seastore_cache,
"{} {}~{} is absent(placeholder), reading ... -- {}",
T::TYPE, offset, length, *ret);
TCachedExtentRef<T> alloc_new_extent(
Transaction &t, ///< [in, out] current transaction
seastore_off_t length, ///< [in] length
- placement_hint_t hint = placement_hint_t::HOT
+ placement_hint_t hint, ///< [in] user hint
+ reclaim_gen_t gen ///< [in] reclaim generation
) {
LOG_PREFIX(Cache::alloc_new_extent);
- SUBTRACET(seastore_cache, "allocate {} {}B, hint={}",
- t, T::TYPE, length, hint);
- auto result = epm.alloc_new_extent(t, T::TYPE, length, hint);
+ SUBTRACET(seastore_cache, "allocate {} {}B, hint={}, gen={}",
+ t, T::TYPE, length, hint, reclaim_gen_printer_t{gen});
+ auto result = epm.alloc_new_extent(t, T::TYPE, length, hint, gen);
auto ret = CachedExtent::make_cached_extent_ref<T>(std::move(result.bp));
- ret->set_paddr(result.paddr);
- ret->hint = hint;
- ret->state = CachedExtent::extent_state_t::INITIAL_WRITE_PENDING;
+ ret->init(CachedExtent::extent_state_t::INITIAL_WRITE_PENDING,
+ result.paddr,
+ hint,
+ result.gen);
t.add_fresh_extent(ret);
- SUBDEBUGT(seastore_cache, "allocated {} {}B extent at {}, hint={} -- {}",
- t, T::TYPE, length, result.paddr, hint, *ret);
+ SUBDEBUGT(seastore_cache,
+ "allocated {} {}B extent at {}, hint={}, gen={} -- {}",
+ t, T::TYPE, length, result.paddr,
+ hint, reclaim_gen_printer_t{result.gen}, *ret);
return ret;
}
* Allocates a fresh extent. addr will be relative until commit.
*/
CachedExtentRef alloc_new_extent_by_type(
- Transaction &t, ///< [in, out] current transaction
- extent_types_t type, ///< [in] type tag
+ Transaction &t, ///< [in, out] current transaction
+ extent_types_t type, ///< [in] type tag
seastore_off_t length, ///< [in] length
- placement_hint_t hint = placement_hint_t::HOT
+ placement_hint_t hint, ///< [in] user hint
+ reclaim_gen_t gen ///< [in] reclaim generation
);
/**
// time of the last rewrite
seastar::lowres_system_clock::time_point last_rewritten;
+
public:
+ /// Set the extent state, physical address, user placement hint and
+ /// reclaim generation in one call.
+ void init(extent_state_t _state,
+ paddr_t paddr,
+ placement_hint_t hint,
+ reclaim_gen_t gen) {
+ state = _state;
+ set_paddr(paddr);
+ user_hint = hint;
+ reclaim_generation = gen;
+ }
void set_last_modified(seastar::lowres_system_clock::duration d) {
last_modified = seastar::lowres_system_clock::time_point(d);
<< ", length=" << get_length()
<< ", state=" << state
<< ", last_committed_crc=" << last_committed_crc
- << ", refcount=" << use_count();
+ << ", refcount=" << use_count()
+ << ", user_hint=" << user_hint
+ // reclaim_gen_t is uint8_t: use the printer so the value is not
+ // streamed as a raw character
+ << ", reclaim_gen=" << reclaim_gen_printer_t{reclaim_generation};
if (state != extent_state_t::INVALID &&
state != extent_state_t::CLEAN_PENDING) {
print_detail(out);
virtual ~CachedExtent();
- /// hint for allocators
- placement_hint_t hint = placement_hint_t::NUM_HINTS;
+ placement_hint_t get_user_hint() const {
+ return user_hint;
+ }
+
+ reclaim_gen_t get_reclaim_generation() const {
+ return reclaim_generation;
+ }
+
+ void invalidate_hints() {
+ user_hint = placement_hint_t::NUM_HINTS;
+ reclaim_generation = NULL_GENERATION;
+ }
+
+ /// Assign a valid reclaim generation (< RECLAIM_GENERATIONS).
+ /// Side effect: user_hint is switched to placement_hint_t::REWRITE.
+ void set_reclaim_generation(reclaim_gen_t gen) {
+ assert(gen < RECLAIM_GENERATIONS);
+ user_hint = placement_hint_t::REWRITE;
+ reclaim_generation = gen;
+ }
bool is_inline() const {
return poffset.is_relative();
read_set_item_t<Transaction>::list transactions;
+ /// user placement hint; NUM_HINTS means "no hint", matching the default
+ /// of the member this replaces
+ placement_hint_t user_hint = placement_hint_t::NUM_HINTS;
+
+ /// > 0 and not null means the extent is under reclaiming
+ reclaim_gen_t reclaim_generation = NULL_GENERATION;
+
protected:
CachedExtent(CachedExtent &&other) = delete;
CachedExtent(ceph::bufferptr &&ptr) : ptr(std::move(ptr)) {}
namespace crimson::os::seastore {
SegmentedOolWriter::SegmentedOolWriter(
- std::string name,
+ data_category_t category,
+ reclaim_gen_t gen,
SegmentProvider& sp,
SegmentSeqAllocator &ssa)
- : segment_allocator(name, segment_type_t::OOL, sp, ssa),
+ : segment_allocator(segment_type_t::OOL, category, gen, sp, ssa),
record_submitter(crimson::common::get_conf<uint64_t>(
"seastore_journal_iodepth_limit"),
crimson::common::get_conf<uint64_t>(
TRACET("{} ool extent written at {} -- {}",
t, segment_allocator.get_name(),
extent_addr, *extent);
- extent->hint = placement_hint_t::NUM_HINTS; // invalidate hint
+ extent->invalidate_hints();
t.mark_delayed_extent_ool(extent, extent_addr);
extent_addr = extent_addr.as_seg_paddr().add_offset(
extent->get_length());
*/
class SegmentedOolWriter : public ExtentOolWriter {
public:
- SegmentedOolWriter(std::string name,
+ SegmentedOolWriter(data_category_t category,
+ reclaim_gen_t gen,
SegmentProvider &sp,
SegmentSeqAllocator &ssa);
class ExtentPlacementManager {
public:
- ExtentPlacementManager() {
+ ExtentPlacementManager(bool prefer_ool)
+ : prefer_ool{prefer_ool} {
devices_by_id.resize(DEVICE_ID_GLOBAL_MAX, nullptr);
}
+ /// (Re)create the out-of-line writers: one DATA writer per generation in
+ /// [0, RECLAIM_GENERATIONS) and one METADATA writer per generation in
+ /// [1, RECLAIM_GENERATIONS).
void init_ool_writers(SegmentProvider &sp, SegmentSeqAllocator &ssa) {
- // Currently only one SegmentProvider is supported, so hardcode the
- // writers_by_hint for now.
- writer_seed = 0;
+ // Currently only one SegmentProvider is supported
writer_refs.clear();
- writers_by_hint.resize((std::size_t)placement_hint_t::NUM_HINTS, {});
-
- // ool writer is not supported for placement_hint_t::HOT
- writer_refs.emplace_back(
- std::make_unique<SegmentedOolWriter>("COLD", sp, ssa));
- writers_by_hint[(std::size_t)placement_hint_t::COLD
- ].emplace_back(writer_refs.back().get());
- writer_refs.emplace_back(
- std::make_unique<SegmentedOolWriter>("REWRITE", sp, ssa));
- writers_by_hint[(std::size_t)placement_hint_t::REWRITE
- ].emplace_back(writer_refs.back().get());
+
+ // RECLAIM_GENERATIONS is a compile-time constant, so check it at
+ // compile time; the "- 1" below relies on it being non-zero
+ static_assert(RECLAIM_GENERATIONS > 0,
+ "at least one reclaim generation is required");
+ data_writers_by_gen.resize(RECLAIM_GENERATIONS, {});
+ for (reclaim_gen_t gen = 0; gen < RECLAIM_GENERATIONS; ++gen) {
+ writer_refs.emplace_back(std::make_unique<SegmentedOolWriter>(
+ data_category_t::DATA, gen, sp, ssa));
+ data_writers_by_gen[gen] = writer_refs.back().get();
+ }
+
+ // METADATA has no writer for gen 0, hence the index shift by one
+ md_writers_by_gen.resize(RECLAIM_GENERATIONS - 1, {});
+ for (reclaim_gen_t gen = 1; gen < RECLAIM_GENERATIONS; ++gen) {
+ writer_refs.emplace_back(std::make_unique<SegmentedOolWriter>(
+ data_category_t::METADATA, gen, sp, ssa));
+ md_writers_by_gen[gen - 1] = writer_refs.back().get();
+ }
}
void add_device(Device* device, bool is_primary) {
open_ertr::future<> open() {
LOG_PREFIX(ExtentPlacementManager::open);
SUBINFO(seastore_journal, "started");
- return crimson::do_for_each(writers_by_hint, [](auto& writers) {
- return crimson::do_for_each(writers, [](auto& writer) {
+ return crimson::do_for_each(data_writers_by_gen, [](auto &writer) {
+ return writer->open();
+ }).safe_then([this] {
+ return crimson::do_for_each(md_writers_by_gen, [](auto &writer) {
return writer->open();
});
});
struct alloc_result_t {
paddr_t paddr;
bufferptr bp;
+ reclaim_gen_t gen;
};
alloc_result_t alloc_new_extent(
Transaction& t,
extent_types_t type,
seastore_off_t length,
- placement_hint_t hint
+ placement_hint_t hint,
+ reclaim_gen_t gen
) {
assert(hint < placement_hint_t::NUM_HINTS);
+ assert(gen < RECLAIM_GENERATIONS);
+ assert(gen == 0 || hint == placement_hint_t::REWRITE);
// XXX: bp might be extended to point to differnt memory (e.g. PMem)
// according to the allocator.
if (!is_logical_type(type)) {
// TODO: implement out-of-line strategy for physical extent.
return {make_record_relative_paddr(0),
- std::move(bp)};
+ std::move(bp),
+ 0};
}
- // FIXME: set delay for COLD extent and improve GC
- // NOTE: delay means to delay the decision about whether to write the
- // extent as inline or out-of-line extents.
- bool delay = (hint > placement_hint_t::COLD);
- if (delay) {
+ if (hint == placement_hint_t::COLD) {
+ assert(gen == 0);
return {make_delayed_temp_paddr(0),
- std::move(bp)};
+ std::move(bp),
+ COLD_GENERATION};
+ }
+
+ if (get_extent_category(type) == data_category_t::METADATA &&
+ gen == 0) {
+ // gen 0 METADATA writer is the journal writer
+ if (prefer_ool) {
+ return {make_delayed_temp_paddr(0),
+ std::move(bp),
+ 1};
+ } else {
+ return {make_record_relative_paddr(0),
+ std::move(bp),
+ 0};
+ }
} else {
- return {make_record_relative_paddr(0),
- std::move(bp)};
+ assert(get_extent_category(type) == data_category_t::DATA ||
+ gen > 0);
+ return {make_delayed_temp_paddr(0),
+ std::move(bp),
+ gen};
}
}
[this, &t, &delayed_extents](auto& alloc_map) {
for (auto& extent : delayed_extents) {
// For now, just do ool allocation for any delayed extent
- auto writer_ptr = get_writer(extent->hint);
+ auto writer_ptr = get_writer(
+ extent->get_user_hint(),
+ get_extent_category(extent->get_type()),
+ extent->get_reclaim_generation());
alloc_map[writer_ptr].emplace_back(extent);
}
return trans_intr::do_for_each(alloc_map, [&t](auto& p) {
close_ertr::future<> close() {
LOG_PREFIX(ExtentPlacementManager::close);
SUBINFO(seastore_journal, "started");
- return crimson::do_for_each(writers_by_hint, [](auto& writers) {
- return crimson::do_for_each(writers, [](auto& writer) {
+ return crimson::do_for_each(data_writers_by_gen, [](auto &writer) {
+ return writer->close();
+ }).safe_then([this] {
+ return crimson::do_for_each(md_writers_by_gen, [](auto &writer) {
return writer->close();
});
}).safe_then([this] {
}
private:
- ExtentOolWriter* get_writer(placement_hint_t hint) {
+ /// Select the out-of-line writer by data category and reclaim generation.
+ /// hint is only sanity-checked; it does not take part in the selection.
+ ExtentOolWriter* get_writer(placement_hint_t hint,
+ data_category_t category,
+ reclaim_gen_t gen) {
assert(hint < placement_hint_t::NUM_HINTS);
- auto hint_index = static_cast<std::size_t>(hint);
- assert(hint_index < writers_by_hint.size());
- auto& writers = writers_by_hint[hint_index];
- assert(writers.size() > 0);
- return writers[writer_seed++ % writers.size()];
+ assert(gen < RECLAIM_GENERATIONS);
+ if (category == data_category_t::DATA) {
+ return data_writers_by_gen[gen];
+ } else {
+ assert(category == data_category_t::METADATA);
+ // gen 0 METADATA writer is the journal writer
+ assert(gen > 0);
+ // md_writers_by_gen starts at generation 1, so shift the index
+ return md_writers_by_gen[gen - 1];
+ }
}
- std::size_t writer_seed = 0;
+ bool prefer_ool;
std::vector<ExtentOolWriterRef> writer_refs;
- std::vector<std::vector<ExtentOolWriter*>> writers_by_hint;
+ std::vector<ExtentOolWriter*> data_writers_by_gen;
+ // gen 0 METADATA writer is the journal writer
+ std::vector<ExtentOolWriter*> md_writers_by_gen;
+
std::vector<Device*> devices_by_id;
Device* primary_device = nullptr;
};
namespace crimson::os::seastore::journal {
SegmentAllocator::SegmentAllocator(
- std::string name,
segment_type_t type,
+ data_category_t category,
+ reclaim_gen_t gen,
SegmentProvider &sp,
SegmentSeqAllocator &ssa)
- : name{name},
- print_name{fmt::format("D?_{}", name)},
+ : print_name{fmt::format("{}_G{}", category, gen)},
type{type},
+ category{category},
+ gen{gen},
segment_provider{sp},
sm_group{*sp.get_segment_manager_group()},
segment_seq_allocator(ssa)
new_segment_seq,
reinterpret_cast<const unsigned char *>(meta.seastore_id.bytes()),
sizeof(meta.seastore_id.uuid));
- auto new_segment_id = segment_provider.allocate_segment(new_segment_seq, type);
+ auto new_segment_id = segment_provider.allocate_segment(
+ new_segment_seq, type, category, gen);
ceph_assert(new_segment_id != NULL_SEG_ID);
return sm_group.open(new_segment_id
).handle_error(
new_journal_tail,
new_alloc_replay_from,
current_segment_nonce,
- type};
+ type,
+ category,
+ gen};
INFO("{} writing header to new segment ... -- {}",
print_name, header);
for (auto& device_id : device_ids) {
oss << "_" << device_id_printer_t{device_id};
}
- oss << "_" << name;
+ oss << "_"
+ << fmt::format("{}_G{}", category, gen);
print_name = oss.str();
INFO("{}", print_name);
crimson::ct_error::input_output_error>;
public:
- SegmentAllocator(std::string name,
- segment_type_t type,
+ SegmentAllocator(segment_type_t type,
+ data_category_t category,
+ reclaim_gen_t gen,
SegmentProvider &sp,
SegmentSeqAllocator &ssa);
using close_segment_ertr = base_ertr;
close_segment_ertr::future<> close_segment();
- const std::string name;
// device id is not available during construction,
// so generate the print_name later.
std::string print_name;
const segment_type_t type; // JOURNAL or OOL
+ const data_category_t category;
+ const reclaim_gen_t gen;
SegmentProvider &segment_provider;
SegmentManagerGroup &sm_group;
SegmentRef current_segment;
: segment_provider(segment_provider),
segment_seq_allocator(
new SegmentSeqAllocator(segment_type_t::JOURNAL)),
- journal_segment_allocator("JOURNAL",
- segment_type_t::JOURNAL,
+ journal_segment_allocator(segment_type_t::JOURNAL,
+ data_category_t::METADATA,
+ 0, // generation
segment_provider,
*segment_seq_allocator),
record_submitter(crimson::common::get_conf<uint64_t>(
}
}
+// Print a reclaim generation as NULL_GEN, GEN(n) or INVALID_GEN(n).
+std::ostream &operator<<(std::ostream &out, reclaim_gen_printer_t gen)
+{
+ // the null sentinel must be tested first: it is also >= RECLAIM_GENERATIONS
+ if (gen.gen == NULL_GENERATION) {
+ return out << "NULL_GEN";
+ }
+ // widen so the value is printed as a number rather than a character
+ const auto value = static_cast<unsigned>(gen.gen);
+ if (gen.gen < RECLAIM_GENERATIONS) {
+ return out << "GEN(" << value << ")";
+ }
+ return out << "INVALID_GEN(" << value << ")";
+}
+
+// Print a data category as MD or DATA; anything else is reported as invalid.
+std::ostream &operator<<(std::ostream &out, data_category_t c)
+{
+ if (c == data_category_t::METADATA) {
+ return out << "MD";
+ }
+ if (c == data_category_t::DATA) {
+ return out << "DATA";
+ }
+ return out << "INVALID_CATEGORY!";
+}
+
std::ostream &operator<<(std::ostream &out, const laddr_list_t &rhs)
{
bool first = false;
<< ", journal_tail=" << header.journal_tail
<< ", segment_nonce=" << header.segment_nonce
<< ", type=" << header.type
+ << ", category=" << header.category
+ // cast so the uint8_t generation is printed as a number
+ << ", generation=" << (unsigned)header.generation
<< ")";
}
constexpr objaddr_t OBJ_ADDR_NULL = OBJ_ADDR_MAX;
enum class placement_hint_t {
- HOT = 0, // Most of the metadata
- COLD, // Object data
- REWRITE, // Cold metadata and data (probably need further splits)
+ HOT = 0, // The default user hint that expects mutations or retirement
+ COLD, // Expect no mutations and no retirement in the near future
+ REWRITE, // Hint for the internal rewrites
NUM_HINTS // Constant for number of hints
};
std::ostream &operator<<(std::ostream &out, extent_types_t t);
+/// Reclaim generation of a segment or extent.
+using reclaim_gen_t = uint8_t;
+
+/// generation used when dirty extents are rewritten
+constexpr reclaim_gen_t DIRTY_GENERATION = 1;
+/// generation reported for allocations made with placement_hint_t::COLD
+constexpr reclaim_gen_t COLD_GENERATION = 1;
+/// number of generations; valid values are [0, RECLAIM_GENERATIONS)
+constexpr reclaim_gen_t RECLAIM_GENERATIONS = 3;
+/// sentinel meaning "no generation"
+constexpr reclaim_gen_t NULL_GENERATION =
+ std::numeric_limits<reclaim_gen_t>::max();
+
+struct reclaim_gen_printer_t {
+ reclaim_gen_t gen;
+};
+
+std::ostream &operator<<(std::ostream &out, reclaim_gen_printer_t gen);
+
+enum class data_category_t : uint8_t {
+ METADATA = 0,
+ DATA,
+ NUM
+};
+
+std::ostream &operator<<(std::ostream &out, data_category_t c);
+
+/// Map an extent type to its data category: object data and collection
+/// blocks are DATA, every other type is METADATA.
+constexpr data_category_t get_extent_category(extent_types_t type) {
+ switch (type) {
+ case extent_types_t::OBJECT_DATA_BLOCK:
+ case extent_types_t::COLL_BLOCK:
+ return data_category_t::DATA;
+ default:
+ return data_category_t::METADATA;
+ }
+}
+
enum class record_commit_type_t : uint8_t {
NONE,
MODIFY,
segment_type_t type;
+ data_category_t category;
+ reclaim_gen_t generation;
+
segment_type_t get_type() const {
return type;
}
denc(v.alloc_replay_from, p);
denc(v.segment_nonce, p);
denc(v.type, p);
+ denc(v.category, p);
+ denc(v.generation, p);
DENC_FINISH(p);
}
};
CacheRef _cache,
LBAManagerRef _lba_manager,
ExtentPlacementManagerRef &&epm,
- BackrefManagerRef&& backref_manager,
- tm_make_config_t config)
+ BackrefManagerRef&& backref_manager)
: async_cleaner(std::move(_async_cleaner)),
cache(std::move(_cache)),
lba_manager(std::move(_lba_manager)),
journal(std::move(_journal)),
epm(std::move(epm)),
backref_manager(std::move(backref_manager)),
- sm_group(*async_cleaner->get_segment_manager_group()),
- config(config)
+ sm_group(*async_cleaner->get_segment_manager_group())
{
async_cleaner->set_extent_callback(this);
journal->set_write_pipeline(&write_pipeline);
t,
lextent->get_type(),
lextent->get_length(),
- placement_hint_t::REWRITE)->cast<LogicalCachedExtent>();
+ lextent->get_user_hint(),
+ lextent->get_reclaim_generation())->cast<LogicalCachedExtent>();
lextent->get_bptr().copy_out(
0,
lextent->get_length(),
TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent(
Transaction &t,
- CachedExtentRef extent)
+ CachedExtentRef extent,
+ reclaim_gen_t target_generation)
{
LOG_PREFIX(TransactionManager::rewrite_extent);
ceph_assert(!extent->is_pending_io());
}
+ assert(extent->is_valid() && !extent->is_initial_pending());
+ if (extent->is_dirty()) {
+ extent->set_reclaim_generation(DIRTY_GENERATION);
+ } else {
+ extent->set_reclaim_generation(target_generation);
+ }
+
t.get_rewrite_version_stats().increment(extent->get_version());
if (is_backref_node(extent->get_type())) {
TransactionManagerRef make_transaction_manager(tm_make_config_t config)
{
LOG_PREFIX(make_transaction_manager);
- auto epm = std::make_unique<ExtentPlacementManager>();
+ auto epm = std::make_unique<ExtentPlacementManager>(config.epm_prefer_ool);
auto cache = std::make_unique<Cache>(*epm);
auto lba_manager = lba_manager::create_lba_manager(*cache);
auto sms = std::make_unique<SegmentManagerGroup>();
std::move(cache),
std::move(lba_manager),
std::move(epm),
- std::move(backref_manager),
- config);
+ std::move(backref_manager));
}
}
class Journal;
struct tm_make_config_t {
+ // Options taken by make_transaction_manager(); instances are produced by
+ // the static factories below.
- bool is_test = true;
- journal_type_t j_type = journal_type_t::SEGMENT_JOURNAL;
- placement_hint_t default_placement_hint = placement_hint_t::HOT;
+ bool is_test;
+ journal_type_t j_type;
+ // Passed to the ExtentPlacementManager constructor.
+ bool epm_prefer_ool;
+ // The constructor below does not set this member and every factory passes
+ // only three arguments, so it needs an in-class initializer to avoid holding
+ // an indeterminate value. 0 is the generation alloc_extent() hands to the
+ // cache for newly allocated extents.
+ reclaim_gen_t default_generation = 0;
static tm_make_config_t get_default() {
return tm_make_config_t {
false,
journal_type_t::SEGMENT_JOURNAL,
- placement_hint_t::HOT
+ false
};
}
static tm_make_config_t get_test_segmented_journal() {
return tm_make_config_t {
true,
journal_type_t::SEGMENT_JOURNAL,
- placement_hint_t::HOT
+ false
};
}
static tm_make_config_t get_test_cb_journal() {
return tm_make_config_t {
true,
journal_type_t::CIRCULARBOUNDED_JOURNAL,
- placement_hint_t::REWRITE
+ true
};
}
tm_make_config_t(
bool is_test,
journal_type_t j_type,
- placement_hint_t default_placement_hint)
+ bool epm_prefer_ool)
: is_test(is_test), j_type(j_type),
- default_placement_hint(default_placement_hint)
+ epm_prefer_ool(epm_prefer_ool)
{}
};
CacheRef cache,
LBAManagerRef lba_manager,
ExtentPlacementManagerRef &&epm,
- BackrefManagerRef&& backref_manager,
- tm_make_config_t config = tm_make_config_t::get_default());
+ BackrefManagerRef&& backref_manager);
/// Writes initial metadata to disk
using mkfs_ertr = base_ertr;
alloc_extent_ret<T> alloc_extent(
Transaction &t,
laddr_t laddr_hint,
- extent_len_t len) {
- placement_hint_t placement_hint;
- if constexpr (T::TYPE == extent_types_t::OBJECT_DATA_BLOCK ||
- T::TYPE == extent_types_t::COLL_BLOCK) {
- placement_hint = placement_hint_t::COLD;
- } else {
- placement_hint = config.default_placement_hint;
- }
+ extent_len_t len,
+ placement_hint_t placement_hint = placement_hint_t::HOT) {
LOG_PREFIX(TransactionManager::alloc_extent);
SUBTRACET(seastore_tm, "{} len={}, placement_hint={}, laddr_hint={}",
t, T::TYPE, len, placement_hint, laddr_hint);
auto ext = cache->alloc_new_extent<T>(
t,
len,
- placement_hint);
+ placement_hint,
+ 0);
return lba_manager->alloc_extent(
t,
laddr_hint,
using AsyncCleaner::ExtentCallbackInterface::rewrite_extent_ret;
rewrite_extent_ret rewrite_extent(
Transaction &t,
- CachedExtentRef extent) final;
+ CachedExtentRef extent,
+ reclaim_gen_t target_generation) final;
using AsyncCleaner::ExtentCallbackInterface::get_extent_if_live_ret;
get_extent_if_live_ret get_extent_if_live(
WritePipeline write_pipeline;
- tm_make_config_t config;
rewrite_extent_ret rewrite_logical_extent(
Transaction& t,
LogicalCachedExtentRef extent);
+
public:
// Testing interfaces
auto get_async_cleaner() {
segment_id_t allocate_segment(
segment_seq_t seq,
- segment_type_t type
+ segment_type_t type,
+ data_category_t,
+ reclaim_gen_t
) final {
auto ret = next;
next = segment_id_t{
}).safe_then([this] {
sms.reset(new SegmentManagerGroup());
journal = journal::make_segmented(*this);
- epm.reset(new ExtentPlacementManager());
+ epm.reset(new ExtentPlacementManager(false));
cache.reset(new Cache(*epm));
block_size = segment_manager->get_block_size();
test_lba_mappings
};
if (create_fake_extent) {
- cache->alloc_new_extent<TestBlockPhysical>(*t.t, TestBlockPhysical::SIZE);
+ cache->alloc_new_extent<TestBlockPhysical>(
+ *t.t,
+ TestBlockPhysical::SIZE,
+ placement_hint_t::HOT,
+ 0);
};
return t;
}
cbjournal_test_t() :
segment_manager(segment_manager::create_test_ephemeral()),
- epm(new ExtentPlacementManager()),
+ epm(new ExtentPlacementManager(true)),
cache(*epm)
{
device = new nvme_device::TestMemory(CBTEST_DEFAULT_TEST_SIZE);
return segment_manager->mkfs(
segment_manager::get_ephemeral_device_config(0, 1));
}).safe_then([this] {
- epm.reset(new ExtentPlacementManager());
+ epm.reset(new ExtentPlacementManager(false));
cache.reset(new Cache(*epm));
current = paddr_t::make_seg_paddr(segment_id_t(segment_manager->get_device_id(), 0), 0);
epm->add_device(segment_manager.get(), true);
auto t = get_transaction();
auto extent = cache->alloc_new_extent<TestBlockPhysical>(
*t,
- TestBlockPhysical::SIZE);
+ TestBlockPhysical::SIZE,
+ placement_hint_t::HOT,
+ 0);
extent->set_contents('c');
csum = extent->get_crc32c();
submit_transaction(std::move(t)).get0();
auto t = get_transaction();
auto extent = cache->alloc_new_extent<TestBlockPhysical>(
*t,
- TestBlockPhysical::SIZE);
+ TestBlockPhysical::SIZE,
+ placement_hint_t::HOT,
+ 0);
extent->set_contents('c');
csum = extent->get_crc32c();
auto reladdr = extent->get_paddr();
segment_id_t allocate_segment(
segment_seq_t seq,
- segment_type_t type
+ segment_type_t type,
+ data_category_t,
+ reclaim_gen_t
) final {
auto ret = next;
next = segment_id_t{