From: myoungwon oh Date: Wed, 1 Jun 2022 12:55:27 +0000 (+0900) Subject: seastore: rename segment cleaner to async cleaner X-Git-Tag: v18.0.0~612^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F46885%2Fhead;p=ceph.git seastore: rename segment cleaner to async cleaner Signed-off-by: Myoungwon Oh --- diff --git a/src/crimson/os/seastore/CMakeLists.txt b/src/crimson/os/seastore/CMakeLists.txt index 55d2168cebb1..ee3aa47cc533 100644 --- a/src/crimson/os/seastore/CMakeLists.txt +++ b/src/crimson/os/seastore/CMakeLists.txt @@ -8,7 +8,7 @@ set(crimson_seastore_srcs transaction.cc cache.cc lba_manager.cc - segment_cleaner.cc + async_cleaner.cc backref_manager.cc backref/backref_tree_node.cc backref/btree_backref_manager.cc diff --git a/src/crimson/os/seastore/async_cleaner.cc b/src/crimson/os/seastore/async_cleaner.cc new file mode 100644 index 000000000000..2e069a81a307 --- /dev/null +++ b/src/crimson/os/seastore/async_cleaner.cc @@ -0,0 +1,1439 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include "crimson/os/seastore/logging.h" + +#include "crimson/os/seastore/async_cleaner.h" +#include "crimson/os/seastore/transaction_manager.h" + +SET_SUBSYS(seastore_cleaner); + +namespace crimson::os::seastore { + +void segment_info_t::set_open( + segment_seq_t _seq, segment_type_t _type) +{ + ceph_assert(_seq != NULL_SEG_SEQ); + ceph_assert(_type != segment_type_t::NULL_SEG); + state = Segment::segment_state_t::OPEN; + seq = _seq; + type = _type; + written_to = 0; +} + +void segment_info_t::set_empty() +{ + state = Segment::segment_state_t::EMPTY; + seq = NULL_SEG_SEQ; + type = segment_type_t::NULL_SEG; + last_modified = {}; + last_rewritten = {}; + written_to = 0; +} + +void segment_info_t::set_closed() +{ + state = Segment::segment_state_t::CLOSED; + // the rest of information is unchanged +} + +void segment_info_t::init_closed( + segment_seq_t _seq, segment_type_t _type, std::size_t seg_size) +{ + ceph_assert(_seq != NULL_SEG_SEQ); + ceph_assert(_type != segment_type_t::NULL_SEG); + state = Segment::segment_state_t::CLOSED; + seq = _seq; + type = _type; + written_to = seg_size; +} + +std::ostream& operator<<(std::ostream &out, const segment_info_t &info) +{ + out << "seg_info_t(" + << "state=" << info.state; + if (info.is_empty()) { + // pass + } else { // open or closed + out << ", seq=" << segment_seq_printer_t{info.seq} + << ", type=" << info.type + << ", last_modified=" << info.last_modified.time_since_epoch() + << ", last_rewritten=" << info.last_rewritten.time_since_epoch() + << ", written_to=" << info.written_to; + } + return out << ")"; +} + +void segments_info_t::reset() +{ + segments.clear(); + + segment_size = 0; + + journal_segment_id = NULL_SEG_ID; + num_in_journal_open = 0; + num_type_journal = 0; + num_type_ool = 0; + + num_open = 0; + num_empty = 0; + num_closed = 0; + + count_open_journal = 0; + count_open_ool = 0; + count_release_journal = 0; + count_release_ool = 0; + count_close_journal = 0; + count_close_ool = 0; + + total_bytes = 0; + avail_bytes_in_open = 0; +} + +void segments_info_t::add_segment_manager( + SegmentManager &segment_manager) +{ + LOG_PREFIX(segments_info_t::add_segment_manager); + device_id_t d_id = segment_manager.get_device_id(); + auto ssize = segment_manager.get_segment_size(); + auto nsegments = segment_manager.get_num_segments(); + auto sm_size = segment_manager.get_size(); + INFO("adding segment manager {}, size={}, ssize={}, segments={}", + device_id_printer_t{d_id}, sm_size, ssize, nsegments); + ceph_assert(ssize > 0); + ceph_assert(nsegments > 0); + ceph_assert(sm_size > 0); + + // also validate if the device is duplicated + segments.add_device(d_id, nsegments, segment_info_t{}); + + // assume all the segment managers share the same settings as follows. + if (segment_size == 0) { + ceph_assert(ssize > 0); + segment_size = ssize; + } else { + ceph_assert(segment_size == (std::size_t)ssize); + } + + // NOTE: by default the segments are empty + num_empty += nsegments; + + total_bytes += sm_size; +} + +void segments_info_t::init_closed( + segment_id_t segment, segment_seq_t seq, segment_type_t type) +{ + LOG_PREFIX(segments_info_t::init_closed); + auto& segment_info = segments[segment]; + INFO("initiating {} {} {}, {}, num_segments(empty={}, opened={}, closed={})", + segment, segment_seq_printer_t{seq}, type, + segment_info, num_empty, num_open, num_closed); + ceph_assert(segment_info.is_empty()); + segment_info.init_closed(seq, type, get_segment_size()); + ceph_assert(num_empty > 0); + --num_empty; + ++num_closed; + if (type == segment_type_t::JOURNAL) { + // init_closed won't initialize journal_segment_id + ceph_assert(get_journal_head() == JOURNAL_SEQ_NULL); + ++num_type_journal; + } else { + ++num_type_ool; + } + // do not increment count_close_*; +} + +void segments_info_t::mark_open( + segment_id_t segment, segment_seq_t seq, segment_type_t type) +{ + LOG_PREFIX(segments_info_t::mark_open); + auto& segment_info = segments[segment]; + INFO("opening {} {} {}, {}, num_segments(empty={}, opened={}, closed={})", + segment, segment_seq_printer_t{seq}, type, + segment_info, num_empty, num_open, num_closed); + ceph_assert(segment_info.is_empty()); + segment_info.set_open(seq, type); + ceph_assert(num_empty > 0); + --num_empty; + ++num_open; + if (type == segment_type_t::JOURNAL) { + if (journal_segment_id != NULL_SEG_ID) { + auto& last_journal_segment = segments[journal_segment_id]; + ceph_assert(last_journal_segment.is_closed()); + ceph_assert(last_journal_segment.type == segment_type_t::JOURNAL); + ceph_assert(last_journal_segment.seq + 1 == seq); + } + journal_segment_id = segment; + + ++num_in_journal_open; + ++num_type_journal; + ++count_open_journal; + } else { + ++num_type_ool; + ++count_open_ool; + } + ceph_assert(segment_info.written_to == 0); + avail_bytes_in_open += get_segment_size(); +} + +void segments_info_t::mark_empty( + segment_id_t segment) +{ + LOG_PREFIX(segments_info_t::mark_empty); + auto& segment_info = segments[segment]; + INFO("releasing {}, {}, num_segments(empty={}, opened={}, closed={})", + segment, segment_info, + num_empty, num_open, num_closed); + ceph_assert(segment_info.is_closed()); + auto type = segment_info.type; + assert(type != segment_type_t::NULL_SEG); + segment_info.set_empty(); + ceph_assert(num_closed > 0); + --num_closed; + ++num_empty; + if (type == segment_type_t::JOURNAL) { + ceph_assert(num_type_journal > 0); + --num_type_journal; + ++count_release_journal; + } else { + ceph_assert(num_type_ool > 0); + --num_type_ool; + ++count_release_ool; + } +} + +void segments_info_t::mark_closed( + segment_id_t segment) +{ + LOG_PREFIX(segments_info_t::mark_closed); + auto& segment_info = segments[segment]; + INFO("closing {}, {}, num_segments(empty={}, opened={}, closed={})", + segment, segment_info, + num_empty, num_open, num_closed); + ceph_assert(segment_info.is_open()); + segment_info.set_closed(); + ceph_assert(num_open > 0); + --num_open; + ++num_closed; + if (segment_info.type == segment_type_t::JOURNAL) { + ceph_assert(num_in_journal_open > 0); + --num_in_journal_open; + ++count_close_journal; + } else { + ++count_close_ool; + } + ceph_assert(get_segment_size() >= segment_info.written_to); + auto seg_avail_bytes = get_segment_size() - segment_info.written_to; + ceph_assert(avail_bytes_in_open >= seg_avail_bytes); + avail_bytes_in_open -= seg_avail_bytes; +} + +void segments_info_t::update_written_to( + segment_type_t type, + paddr_t offset) +{ + LOG_PREFIX(segments_info_t::update_written_to); + auto& saddr = offset.as_seg_paddr(); + auto& segment_info = segments[saddr.get_segment_id()]; + if (!segment_info.is_open()) { + ERROR("segment is not open, not updating, type={}, offset={}, {}", + type, offset, segment_info); + ceph_abort(); + } + + auto new_written_to = static_cast(saddr.get_segment_off()); + ceph_assert(new_written_to <= get_segment_size()); + if (segment_info.written_to > new_written_to) { + ERROR("written_to should not decrease! type={}, offset={}, {}", + type, offset, segment_info); + ceph_abort(); + } + + DEBUG("type={}, offset={}, {}", type, offset, segment_info); + ceph_assert(type == segment_info.type); + auto avail_deduction = new_written_to - segment_info.written_to; + ceph_assert(avail_bytes_in_open >= avail_deduction); + avail_bytes_in_open -= avail_deduction; + segment_info.written_to = new_written_to; +} + +bool SpaceTrackerSimple::equals(const SpaceTrackerI &_other) const +{ + LOG_PREFIX(SpaceTrackerSimple::equals); + const auto &other = static_cast(_other); + + if (other.live_bytes_by_segment.size() != live_bytes_by_segment.size()) { + ERROR("different segment counts, bug in test"); + assert(0 == "segment counts should match"); + return false; + } + + bool all_match = true; + for (auto i = live_bytes_by_segment.begin(), j = other.live_bytes_by_segment.begin(); + i != live_bytes_by_segment.end(); ++i, ++j) { + if (i->second.live_bytes != j->second.live_bytes) { + all_match = false; + DEBUG("segment_id {} live bytes mismatch *this: {}, other: {}", + i->first, i->second.live_bytes, j->second.live_bytes); + } + } + return all_match; +} + +int64_t SpaceTrackerDetailed::SegmentMap::allocate( + device_segment_id_t segment, + seastore_off_t offset, + extent_len_t len, + const extent_len_t block_size) +{ + LOG_PREFIX(SegmentMap::allocate); + assert(offset % block_size == 0); + assert(len % block_size == 0); + + const auto b = (offset / block_size); + const auto e = (offset + len) / block_size; + + bool error = false; + for (auto i = b; i < e; ++i) { + if (bitmap[i]) { + if (!error) { + ERROR("found allocated in {}, {} ~ {}", segment, offset, len); + error = true; + } + DEBUG("block {} allocated", i * block_size); + } + bitmap[i] = true; + } + return update_usage(len); +} + +int64_t SpaceTrackerDetailed::SegmentMap::release( + device_segment_id_t segment, + seastore_off_t offset, + extent_len_t len, + const extent_len_t block_size) +{ + LOG_PREFIX(SegmentMap::release); + assert(offset % block_size == 0); + assert(len % block_size == 0); + + const auto b = (offset / block_size); + const auto e = (offset + len) / block_size; + + bool error = false; + for (auto i = b; i < e; ++i) { + if (!bitmap[i]) { + if (!error) { + ERROR("found unallocated in {}, {} ~ {}", segment, offset, len); + error = true; + } + DEBUG("block {} unallocated", i * block_size); + } + bitmap[i] = false; + } + return update_usage(-(int64_t)len); +} + +bool SpaceTrackerDetailed::equals(const SpaceTrackerI &_other) const +{ + LOG_PREFIX(SpaceTrackerDetailed::equals); + const auto &other = static_cast(_other); + + if (other.segment_usage.size() != segment_usage.size()) { + ERROR("different segment counts, bug in test"); + assert(0 == "segment counts should match"); + return false; + } + + bool all_match = true; + for (auto i = segment_usage.begin(), j = other.segment_usage.begin(); + i != segment_usage.end(); ++i, ++j) { + if (i->second.get_usage() != j->second.get_usage()) { + all_match = false; + ERROR("segment_id {} live bytes mismatch *this: {}, other: {}", + i->first, i->second.get_usage(), j->second.get_usage()); + } + } + return all_match; +} + +void SpaceTrackerDetailed::SegmentMap::dump_usage(extent_len_t block_size) const +{ + LOG_PREFIX(SegmentMap::dump_usage); + INFO("dump start"); + for (unsigned i = 0; i < bitmap.size(); ++i) { + if (bitmap[i]) { + LOCAL_LOGGER.info(" {} still live", i * block_size); + } + } +} + +void SpaceTrackerDetailed::dump_usage(segment_id_t id) const +{ + LOG_PREFIX(SpaceTrackerDetailed::dump_usage); + INFO("{}", id); + segment_usage[id].dump_usage( + block_size_by_segment_manager[id.device_id()]); +} + +void SpaceTrackerSimple::dump_usage(segment_id_t id) const +{ + LOG_PREFIX(SpaceTrackerSimple::dump_usage); + INFO("id: {}, live_bytes: {}", + id, live_bytes_by_segment[id].live_bytes); +} + +AsyncCleaner::AsyncCleaner( + config_t config, + SegmentManagerGroupRef&& sm_group, + BackrefManager &backref_manager, + bool detailed) + : detailed(detailed), + config(config), + sm_group(std::move(sm_group)), + backref_manager(backref_manager), + ool_segment_seq_allocator( + new SegmentSeqAllocator(segment_type_t::OOL)), + gc_process(*this) +{ + config.validate(); +} + +void AsyncCleaner::register_metrics() +{ + namespace sm = seastar::metrics; + stats.segment_util.buckets.resize(UTIL_BUCKETS); + std::size_t i; + for (i = 0; i < UTIL_BUCKETS; ++i) { + stats.segment_util.buckets[i].upper_bound = ((double)(i + 1)) / 10; + stats.segment_util.buckets[i].count = 0; + } + // NOTE: by default the segments are empty + i = get_bucket_index(UTIL_STATE_EMPTY); + stats.segment_util.buckets[i].count = segments.get_num_segments(); + + metrics.add_group("async_cleaner", { + sm::make_counter("segments_number", + [this] { return segments.get_num_segments(); }, + sm::description("the number of segments")), + sm::make_counter("segment_size", + [this] { return segments.get_segment_size(); }, + sm::description("the bytes of a segment")), + sm::make_counter("segments_in_journal", + [this] { return get_segments_in_journal(); }, + sm::description("the number of segments in journal")), + sm::make_counter("segments_type_journal", + [this] { return segments.get_num_type_journal(); }, + sm::description("the number of segments typed journal")), + sm::make_counter("segments_type_ool", + [this] { return segments.get_num_type_ool(); }, + sm::description("the number of segments typed out-of-line")), + sm::make_counter("segments_open", + [this] { return segments.get_num_open(); }, + sm::description("the number of open segments")), + sm::make_counter("segments_empty", + [this] { return segments.get_num_empty(); }, + sm::description("the number of empty segments")), + sm::make_counter("segments_closed", + [this] { return segments.get_num_closed(); }, + sm::description("the number of closed segments")), + + sm::make_counter("segments_count_open_journal", + [this] { return segments.get_count_open_journal(); }, + sm::description("the count of open journal segment operations")), + sm::make_counter("segments_count_open_ool", + [this] { return segments.get_count_open_ool(); }, + sm::description("the count of open ool segment operations")), + sm::make_counter("segments_count_release_journal", + [this] { return segments.get_count_release_journal(); }, + sm::description("the count of release journal segment operations")), + sm::make_counter("segments_count_release_ool", + [this] { return segments.get_count_release_ool(); }, + sm::description("the count of release ool segment operations")), + sm::make_counter("segments_count_close_journal", + [this] { return segments.get_count_close_journal(); }, + sm::description("the count of close journal segment operations")), + sm::make_counter("segments_count_close_ool", + [this] { return segments.get_count_close_ool(); }, + sm::description("the count of close ool segment operations")), + + sm::make_counter("total_bytes", + [this] { return segments.get_total_bytes(); }, + sm::description("the size of the space")), + sm::make_counter("available_bytes", + [this] { return segments.get_available_bytes(); }, + sm::description("the size of the space is available")), + sm::make_counter("unavailable_unreclaimable_bytes", + [this] { return get_unavailable_unreclaimable_bytes(); }, + sm::description("the size of the space is unavailable and unreclaimable")), + sm::make_counter("unavailable_reclaimable_bytes", + [this] { return get_unavailable_reclaimable_bytes(); }, + sm::description("the size of the space is unavailable and reclaimable")), + sm::make_counter("used_bytes", stats.used_bytes, + sm::description("the size of the space occupied by live extents")), + sm::make_counter("unavailable_unused_bytes", + [this] { return get_unavailable_unused_bytes(); }, + sm::description("the size of the space is unavailable and not alive")), + + sm::make_counter("dirty_journal_bytes", + [this] { return get_dirty_journal_size(); }, + sm::description("the size of the journal for dirty extents")), + sm::make_counter("alloc_journal_bytes", + [this] { return get_alloc_journal_size(); }, + sm::description("the size of the journal for alloc info")), + + sm::make_counter("projected_count", stats.projected_count, + sm::description("the number of projected usage reservations")), + sm::make_counter("projected_used_bytes_sum", stats.projected_used_bytes_sum, + sm::description("the sum of the projected usage in bytes")), + + sm::make_counter("io_count", stats.io_count, + sm::description("the sum of IOs")), + sm::make_counter("io_blocked_count", stats.io_blocked_count, + sm::description("IOs that are blocked by gc")), + sm::make_counter("io_blocked_count_trim", stats.io_blocked_count_trim, + sm::description("IOs that are blocked by trimming")), + sm::make_counter("io_blocked_count_reclaim", stats.io_blocked_count_reclaim, + sm::description("IOs that are blocked by reclaimming")), + sm::make_counter("io_blocked_sum", stats.io_blocked_sum, + sm::description("the sum of blocking IOs")), + + sm::make_counter("reclaimed_bytes", stats.reclaimed_bytes, + sm::description("rewritten bytes due to reclaim")), + sm::make_counter("reclaimed_segment_bytes", stats.reclaimed_segment_bytes, + sm::description("rewritten bytes due to reclaim")), + sm::make_counter("closed_journal_used_bytes", stats.closed_journal_used_bytes, + sm::description("used bytes when close a journal segment")), + sm::make_counter("closed_journal_total_bytes", stats.closed_journal_total_bytes, + sm::description("total bytes of closed journal segments")), + sm::make_counter("closed_ool_used_bytes", stats.closed_ool_used_bytes, + sm::description("used bytes when close a ool segment")), + sm::make_counter("closed_ool_total_bytes", stats.closed_ool_total_bytes, + sm::description("total bytes of closed ool segments")), + + sm::make_gauge("available_ratio", + [this] { return segments.get_available_ratio(); }, + sm::description("ratio of available space to total space")), + sm::make_gauge("reclaim_ratio", + [this] { return get_reclaim_ratio(); }, + sm::description("ratio of reclaimable space to unavailable space")), + + sm::make_histogram("segment_utilization_distribution", + [this]() -> seastar::metrics::histogram& { + return stats.segment_util; + }, + sm::description("utilization distribution of all segments")) + }); +} + +segment_id_t AsyncCleaner::allocate_segment( + segment_seq_t seq, + segment_type_t type) +{ + LOG_PREFIX(AsyncCleaner::allocate_segment); + assert(seq != NULL_SEG_SEQ); + for (auto it = segments.begin(); + it != segments.end(); + ++it) { + auto seg_id = it->first; + auto& segment_info = it->second; + if (segment_info.is_empty()) { + auto old_usage = calc_utilization(seg_id); + segments.mark_open(seg_id, seq, type); + auto new_usage = calc_utilization(seg_id); + adjust_segment_util(old_usage, new_usage); + INFO("opened, should_block_on_gc {}, projected_avail_ratio {}, " + "reclaim_ratio {}", + should_block_on_gc(), + get_projected_available_ratio(), + get_reclaim_ratio()); + return seg_id; + } + } + ERROR("out of space with segment_seq={}", segment_seq_printer_t{seq}); + ceph_abort(); + return NULL_SEG_ID; +} + +void AsyncCleaner::update_journal_tail_target( + journal_seq_t dirty_replay_from, + journal_seq_t alloc_replay_from) +{ + LOG_PREFIX(AsyncCleaner::update_journal_tail_target); + if (disable_trim) return; + assert(dirty_replay_from.offset.get_addr_type() != addr_types_t::RANDOM_BLOCK); + assert(alloc_replay_from.offset.get_addr_type() != addr_types_t::RANDOM_BLOCK); + if (dirty_extents_replay_from == JOURNAL_SEQ_NULL + || dirty_replay_from > dirty_extents_replay_from) { + DEBUG("dirty_extents_replay_from={} => {}", + dirty_extents_replay_from, dirty_replay_from); + dirty_extents_replay_from = dirty_replay_from; + } + + update_alloc_info_replay_from(alloc_replay_from); + + journal_seq_t target = std::min(dirty_replay_from, alloc_replay_from); + ceph_assert(target != JOURNAL_SEQ_NULL); + auto journal_head = segments.get_journal_head(); + ceph_assert(journal_head == JOURNAL_SEQ_NULL || + journal_head >= target); + if (journal_tail_target == JOURNAL_SEQ_NULL || + target > journal_tail_target) { + if (!init_complete || + journal_tail_target.segment_seq == target.segment_seq) { + DEBUG("journal_tail_target={} => {}", journal_tail_target, target); + } else { + INFO("journal_tail_target={} => {}", journal_tail_target, target); + } + journal_tail_target = target; + } + gc_process.maybe_wake_on_space_used(); + maybe_wake_gc_blocked_io(); +} + +void AsyncCleaner::update_alloc_info_replay_from( + journal_seq_t alloc_replay_from) +{ + LOG_PREFIX(AsyncCleaner::update_alloc_info_replay_from); + if (alloc_info_replay_from == JOURNAL_SEQ_NULL + || alloc_replay_from > alloc_info_replay_from) { + DEBUG("alloc_info_replay_from={} => {}", + alloc_info_replay_from, alloc_replay_from); + alloc_info_replay_from = alloc_replay_from; + } +} + +void AsyncCleaner::update_journal_tail_committed(journal_seq_t committed) +{ + LOG_PREFIX(AsyncCleaner::update_journal_tail_committed); + assert(committed.offset.get_addr_type() != addr_types_t::RANDOM_BLOCK); + if (committed == JOURNAL_SEQ_NULL) { + return; + } + auto journal_head = segments.get_journal_head(); + ceph_assert(journal_head == JOURNAL_SEQ_NULL || + journal_head >= committed); + + if (journal_tail_committed == JOURNAL_SEQ_NULL || + committed > journal_tail_committed) { + DEBUG("update journal_tail_committed={} => {}", + journal_tail_committed, committed); + journal_tail_committed = committed; + } + if (journal_tail_target == JOURNAL_SEQ_NULL || + committed > journal_tail_target) { + DEBUG("update journal_tail_target={} => {}", + journal_tail_target, committed); + journal_tail_target = committed; + } +} + +void AsyncCleaner::close_segment(segment_id_t segment) +{ + LOG_PREFIX(AsyncCleaner::close_segment); + auto old_usage = calc_utilization(segment); + segments.mark_closed(segment); + auto &seg_info = segments[segment]; + if (seg_info.type == segment_type_t::JOURNAL) { + stats.closed_journal_used_bytes += space_tracker->get_usage(segment); + stats.closed_journal_total_bytes += segments.get_segment_size(); + } else { + stats.closed_ool_used_bytes += space_tracker->get_usage(segment); + stats.closed_ool_total_bytes += segments.get_segment_size(); + } + auto new_usage = calc_utilization(segment); + adjust_segment_util(old_usage, new_usage); + INFO("closed, should_block_on_gc {}, projected_avail_ratio {}, " + "reclaim_ratio {}", + should_block_on_gc(), + get_projected_available_ratio(), + get_reclaim_ratio()); +} + +AsyncCleaner::trim_backrefs_ret AsyncCleaner::trim_backrefs( + Transaction &t, + journal_seq_t limit) +{ + return backref_manager.merge_cached_backrefs( + t, + limit, + config.rewrite_backref_bytes_per_cycle + ); +} + +AsyncCleaner::rewrite_dirty_ret AsyncCleaner::rewrite_dirty( + Transaction &t, + journal_seq_t limit) +{ + return ecb->get_next_dirty_extents( + t, + limit, + config.rewrite_dirty_bytes_per_cycle + ).si_then([=, &t](auto dirty_list) { + LOG_PREFIX(AsyncCleaner::rewrite_dirty); + DEBUGT("rewrite {} dirty extents", t, dirty_list.size()); + return seastar::do_with( + std::move(dirty_list), + [this, FNAME, &t](auto &dirty_list) { + return trans_intr::do_for_each( + dirty_list, + [this, FNAME, &t](auto &e) { + DEBUGT("cleaning {}", t, *e); + return ecb->rewrite_extent(t, e); + }); + }); + }); +} + +AsyncCleaner::gc_cycle_ret AsyncCleaner::GCProcess::run() +{ + return seastar::do_until( + [this] { return is_stopping(); }, + [this] { + return maybe_wait_should_run( + ).then([this] { + cleaner.log_gc_state("GCProcess::run"); + + if (is_stopping()) { + return seastar::now(); + } else { + return cleaner.do_gc_cycle(); + } + }); + }); +} + +AsyncCleaner::gc_cycle_ret AsyncCleaner::do_gc_cycle() +{ + if (gc_should_trim_journal()) { + return gc_trim_journal( + ).handle_error( + crimson::ct_error::assert_all{ + "GCProcess::run encountered invalid error in gc_trim_journal" + } + ); + } else if (gc_should_trim_backref()) { + return gc_trim_backref(get_backref_tail() + ).safe_then([](auto) { + return seastar::now(); + }).handle_error( + crimson::ct_error::assert_all{ + "GCProcess::run encountered invalid error in gc_trim_backref" + } + ); + } else if (gc_should_reclaim_space()) { + return gc_reclaim_space( + ).handle_error( + crimson::ct_error::assert_all{ + "GCProcess::run encountered invalid error in gc_reclaim_space" + } + ); + } else { + return seastar::now(); + } +} + +AsyncCleaner::gc_trim_backref_ret +AsyncCleaner::gc_trim_backref(journal_seq_t limit) { + return seastar::do_with( + journal_seq_t(), + [this, limit=std::move(limit)](auto &seq) mutable { + return repeat_eagain([this, limit=std::move(limit), &seq] { + return ecb->with_transaction_intr( + Transaction::src_t::TRIM_BACKREF, + "trim_backref", + [this, limit](auto &t) { + return trim_backrefs( + t, + limit + ).si_then([this, &t, limit](auto trim_backrefs_to) + -> ExtentCallbackInterface::submit_transaction_direct_iertr::future< + journal_seq_t> { + if (trim_backrefs_to != JOURNAL_SEQ_NULL) { + return ecb->submit_transaction_direct( + t, std::make_optional(trim_backrefs_to) + ).si_then([trim_backrefs_to=std::move(trim_backrefs_to)]() mutable { + return seastar::make_ready_future< + journal_seq_t>(std::move(trim_backrefs_to)); + }); + } + return seastar::make_ready_future(std::move(limit)); + }); + }).safe_then([&seq](auto trim_backrefs_to) { + seq = std::move(trim_backrefs_to); + }); + }).safe_then([&seq] { + return gc_trim_backref_ertr::make_ready_future< + journal_seq_t>(std::move(seq)); + }); + }); +} + +AsyncCleaner::gc_trim_journal_ret AsyncCleaner::gc_trim_journal() +{ + return gc_trim_backref(get_dirty_tail() + ).safe_then([this](auto seq) { + return repeat_eagain([this, seq=std::move(seq)]() mutable { + return ecb->with_transaction_intr( + Transaction::src_t::CLEANER_TRIM, + "trim_journal", + [this, seq=std::move(seq)](auto& t) + { + return rewrite_dirty(t, seq + ).si_then([this, &t] { + return ecb->submit_transaction_direct(t); + }); + }); + }); + }); +} + +AsyncCleaner::retrieve_live_extents_ret +AsyncCleaner::_retrieve_live_extents( + Transaction &t, + std::set< + backref_buf_entry_t, + backref_buf_entry_t::cmp_t> &&backrefs, + std::vector &extents) +{ + return seastar::do_with( + JOURNAL_SEQ_NULL, + std::move(backrefs), + [this, &t, &extents](auto &seq, auto &backrefs) { + return trans_intr::parallel_for_each( + backrefs, + [this, &extents, &t, &seq](auto &ent) { + LOG_PREFIX(AsyncCleaner::_retrieve_live_extents); + DEBUGT("getting extent of type {} at {}~{}", + t, + ent.type, + ent.paddr, + ent.len); + return ecb->get_extent_if_live( + t, ent.type, ent.paddr, ent.laddr, ent.len + ).si_then([this, FNAME, &extents, &ent, &seq, &t](auto ext) { + if (!ext) { + DEBUGT("addr {} dead, skipping", t, ent.paddr); + auto backref = backref_manager.get_cached_backref_removal(ent.paddr); + if (seq == JOURNAL_SEQ_NULL || seq < backref.seq) { + seq = backref.seq; + } + } else { + extents.emplace_back(std::move(ext)); + } + return ExtentCallbackInterface::rewrite_extent_iertr::now(); + }); + }).si_then([&seq] { + return retrieve_live_extents_iertr::make_ready_future< + journal_seq_t>(std::move(seq)); + }); + }); +} + +AsyncCleaner::retrieve_backref_mappings_ret +AsyncCleaner::retrieve_backref_mappings( + paddr_t start_paddr, + paddr_t end_paddr) +{ + return seastar::do_with( + backref_pin_list_t(), + [this, start_paddr, end_paddr](auto &pin_list) { + return repeat_eagain([this, start_paddr, end_paddr, &pin_list] { + return ecb->with_transaction_intr( + Transaction::src_t::READ, + "get_backref_mappings", + [this, start_paddr, end_paddr](auto &t) { + return backref_manager.get_mappings( + t, start_paddr, end_paddr + ); + }).safe_then([&pin_list](auto&& list) { + pin_list = std::move(list); + }); + }).safe_then([&pin_list] { + return seastar::make_ready_future(std::move(pin_list)); + }); + }); +} + +AsyncCleaner::gc_reclaim_space_ret AsyncCleaner::gc_reclaim_space() +{ + LOG_PREFIX(AsyncCleaner::gc_reclaim_space); + if (!reclaim_state) { + segment_id_t seg_id = get_next_reclaim_segment(); + auto &segment_info = segments[seg_id]; + INFO("reclaim {} {} start", seg_id, segment_info); + ceph_assert(segment_info.is_closed()); + reclaim_state = reclaim_state_t::create( + seg_id, segments.get_segment_size()); + } + reclaim_state->advance(config.reclaim_bytes_per_cycle); + + DEBUG("reclaiming {}~{}", + reclaim_state->start_pos, + reclaim_state->end_pos); + double pavail_ratio = get_projected_available_ratio(); + seastar::lowres_system_clock::time_point start = seastar::lowres_system_clock::now(); + + return seastar::do_with( + (size_t)0, + (size_t)0, + [this, pavail_ratio, start]( + auto &reclaimed, + auto &runs) { + return retrieve_backref_mappings( + reclaim_state->start_pos, + reclaim_state->end_pos + ).safe_then([this, &reclaimed, &runs](auto pin_list) { + return seastar::do_with( + std::move(pin_list), + [this, &reclaimed, &runs](auto &pin_list) { + return repeat_eagain( + [this, &reclaimed, &runs, &pin_list]() mutable { + reclaimed = 0; + runs++; + return seastar::do_with( + backref_manager.get_cached_backref_extents_in_range( + reclaim_state->start_pos, reclaim_state->end_pos), + backref_manager.get_cached_backrefs_in_range( + reclaim_state->start_pos, reclaim_state->end_pos), + backref_manager.get_cached_backref_removals_in_range( + reclaim_state->start_pos, reclaim_state->end_pos), + JOURNAL_SEQ_NULL, + [this, &reclaimed, &pin_list]( + auto &backref_extents, + auto &backrefs, + auto &del_backrefs, + auto &seq) { + return ecb->with_transaction_intr( + Transaction::src_t::CLEANER_RECLAIM, + "reclaim_space", + [this, &backref_extents, &backrefs, &seq, + &del_backrefs, &reclaimed, &pin_list](auto &t) { + LOG_PREFIX(AsyncCleaner::gc_reclaim_space); + DEBUGT("{} backrefs, {} del_backrefs, {} pins", t, + backrefs.size(), del_backrefs.size(), pin_list.size()); + for (auto &br : backrefs) { + if (seq == JOURNAL_SEQ_NULL + || (br.seq != JOURNAL_SEQ_NULL && br.seq > seq)) + seq = br.seq; + } + for (auto &pin : pin_list) { + backrefs.emplace( + pin->get_key(), + pin->get_val(), + pin->get_length(), + pin->get_type(), + journal_seq_t()); + } + for (auto &del_backref : del_backrefs) { + DEBUGT("del_backref {}~{} {} {}", t, + del_backref.paddr, del_backref.len, del_backref.type, del_backref.seq); + auto it = backrefs.find(del_backref.paddr); + if (it != backrefs.end()) + backrefs.erase(it); + if (seq == JOURNAL_SEQ_NULL + || (del_backref.seq != JOURNAL_SEQ_NULL && del_backref.seq > seq)) + seq = del_backref.seq; + } + return seastar::do_with( + std::vector(), + [this, &backref_extents, &backrefs, &reclaimed, &t, &seq] + (auto &extents) { + return backref_manager.retrieve_backref_extents( + t, std::move(backref_extents), extents + ).si_then([this, &extents, &t, &backrefs] { + return _retrieve_live_extents( + t, std::move(backrefs), extents); + }).si_then([this, &seq, &t](auto nseq) { + if (nseq != JOURNAL_SEQ_NULL && + (nseq > seq || seq == JOURNAL_SEQ_NULL)) + seq = nseq; + auto fut = BackrefManager::merge_cached_backrefs_iertr::now(); + if (seq != JOURNAL_SEQ_NULL) { + fut = backref_manager.merge_cached_backrefs( + t, seq, std::numeric_limits::max() + ).si_then([](auto) { + return BackrefManager::merge_cached_backrefs_iertr::now(); + }); + } + return fut; + }).si_then([&extents, this, &t, &reclaimed] { + return trans_intr::do_for_each( + extents, + [this, &t, &reclaimed](auto &ext) { + reclaimed += ext->get_length(); + return ecb->rewrite_extent(t, ext); + }); + }); + }).si_then([this, &t, &seq] { + if (reclaim_state->is_complete()) { + t.mark_segment_to_release(reclaim_state->get_segment_id()); + } + return ecb->submit_transaction_direct( + t, std::make_optional(std::move(seq))); + }); + }); + }); + }); + }); + }).safe_then( + [&reclaimed, this, pavail_ratio, start, &runs] { + LOG_PREFIX(AsyncCleaner::gc_reclaim_space); +#ifndef NDEBUG + auto ndel_backrefs = + backref_manager.get_cached_backref_removals_in_range( + reclaim_state->start_pos, reclaim_state->end_pos); + if (!ndel_backrefs.empty()) { + for (auto &del_br : ndel_backrefs) { + ERROR("unexpected del_backref {}~{} {} {}", + del_br.paddr, del_br.len, del_br.type, del_br.seq); + } + ceph_abort("impossible"); + } +#endif + stats.reclaiming_bytes += reclaimed; + auto d = seastar::lowres_system_clock::now() - start; + DEBUG("duration: {}, pavail_ratio before: {}, repeats: {}", d, pavail_ratio, runs); + if (reclaim_state->is_complete()) { + INFO("reclaim {} finish, alive/total={}", + reclaim_state->get_segment_id(), + stats.reclaiming_bytes/(double)segments.get_segment_size()); + stats.reclaimed_bytes += stats.reclaiming_bytes; + stats.reclaimed_segment_bytes += segments.get_segment_size(); + stats.reclaiming_bytes = 0; + reclaim_state.reset(); + } + }); + }); +} + +AsyncCleaner::mount_ret AsyncCleaner::mount() +{ + LOG_PREFIX(AsyncCleaner::mount); + const auto& sms = sm_group->get_segment_managers(); + INFO("{} segment managers", sms.size()); + init_complete = false; + stats = {}; + journal_tail_target = JOURNAL_SEQ_NULL; + journal_tail_committed = JOURNAL_SEQ_NULL; + dirty_extents_replay_from = JOURNAL_SEQ_NULL; + alloc_info_replay_from = JOURNAL_SEQ_NULL; + + space_tracker.reset( + detailed ? + (SpaceTrackerI*)new SpaceTrackerDetailed( + sms) : + (SpaceTrackerI*)new SpaceTrackerSimple( + sms)); + + segments.reset(); + for (auto sm : sms) { + segments.add_segment_manager(*sm); + } + metrics.clear(); + register_metrics(); + + INFO("{} segments", segments.get_num_segments()); + return seastar::do_with( + std::vector>(), + [this, FNAME](auto& segment_set) { + return crimson::do_for_each( + segments.begin(), + segments.end(), + [this, FNAME, &segment_set](auto& it) { + auto segment_id = it.first; + return sm_group->read_segment_header( + segment_id + ).safe_then([segment_id, this, FNAME, &segment_set](auto header) { + INFO("segment_id={} -- {}", segment_id, header); + auto s_type = header.get_type(); + if (s_type == segment_type_t::NULL_SEG) { + ERROR("got null segment, segment_id={} -- {}", segment_id, header); + ceph_abort(); + } + return sm_group->read_segment_tail( + segment_id + ).safe_then([this, segment_id, &segment_set, header](auto tail) + -> scan_extents_ertr::future<> { + if (tail.segment_nonce != header.segment_nonce) { + return scan_nonfull_segment(header, segment_set, segment_id); + } + time_point last_modified(duration(tail.last_modified)); + time_point last_rewritten(duration(tail.last_rewritten)); + segments.update_last_modified_rewritten( + segment_id, last_modified, last_rewritten); + if (tail.get_type() == segment_type_t::JOURNAL) { + update_journal_tail_committed(tail.journal_tail); + update_journal_tail_target( + tail.journal_tail, + tail.alloc_replay_from); + } + init_mark_segment_closed( + segment_id, + header.segment_seq, + header.type); + return seastar::now(); + }).handle_error( + crimson::ct_error::enodata::handle( + [this, header, segment_id, &segment_set](auto) { + return scan_nonfull_segment(header, segment_set, segment_id); + }), + crimson::ct_error::pass_further_all{} + ); + }).handle_error( + crimson::ct_error::enoent::handle([](auto) { + return mount_ertr::now(); + }), + crimson::ct_error::enodata::handle([](auto) { + return mount_ertr::now(); + }), + crimson::ct_error::input_output_error::pass_further{}, + crimson::ct_error::assert_all{"unexpected error"} + ); + }); + }); +} + +AsyncCleaner::scan_extents_ret AsyncCleaner::scan_nonfull_segment( + const segment_header_t& header, + scan_extents_ret_bare& segment_set, + segment_id_t segment_id) +{ + return seastar::do_with( + scan_valid_records_cursor({ + segments[segment_id].seq, + paddr_t::make_seg_paddr(segment_id, 0)}), + [this, segment_id, segment_header=header](auto& cursor) { + return seastar::do_with( + SegmentManagerGroup::found_record_handler_t( + [this, segment_id, segment_header]( + record_locator_t locator, + const record_group_header_t& header, + const bufferlist& mdbuf + ) mutable -> SegmentManagerGroup::scan_valid_records_ertr::future<> { + LOG_PREFIX(AsyncCleaner::scan_nonfull_segment); + if (segment_header.get_type() == segment_type_t::OOL) { + DEBUG("out-of-line segment {}, decodeing {} records", + segment_id, + header.records); + auto maybe_headers = try_decode_record_headers(header, mdbuf); + if (!maybe_headers) { + ERROR("unable to decode record headers for record group {}", + locator.record_block_base); + return crimson::ct_error::input_output_error::make(); + } + + for (auto& header : *maybe_headers) { + mod_time_point_t ctime = header.commit_time; + auto commit_type = header.commit_type; + if (!ctime) { + ERROR("AsyncCleaner::scan_nonfull_segment: extent {} 0 commit_time", + ctime); + ceph_abort("0 commit_time"); + } + time_point commit_time{duration(ctime)}; + assert(commit_type == record_commit_type_t::MODIFY + || commit_type == record_commit_type_t::REWRITE); + if (commit_type == record_commit_type_t::MODIFY) { + segments.update_last_modified_rewritten(segment_id, commit_time, {}); + } + if (commit_type == record_commit_type_t::REWRITE) { + segments.update_last_modified_rewritten(segment_id, {}, commit_time); + } + } + } else { + DEBUG("inline segment {}, decodeing {} records", + segment_id, + header.records); + auto maybe_record_deltas_list = try_decode_deltas( + header, mdbuf, locator.record_block_base); + if (!maybe_record_deltas_list) { + ERROR("unable to decode deltas for record {} at {}", + header, locator); + return crimson::ct_error::input_output_error::make(); + } + for (auto &record_deltas : *maybe_record_deltas_list) { + for (auto &[ctime, delta] : record_deltas.deltas) { + if (delta.type == extent_types_t::ALLOC_TAIL) { + journal_seq_t seq; + decode(seq, delta.bl); + update_alloc_info_replay_from(seq); + } + } + } + } + return seastar::now(); + }), + [&cursor, segment_header, this](auto& handler) { + return sm_group->scan_valid_records( + cursor, + segment_header.segment_nonce, + segments.get_segment_size(), + handler); + } + ); + }).safe_then([this, segment_id, header](auto) { + init_mark_segment_closed( + segment_id, + header.segment_seq, + header.type); + return seastar::now(); + }); +} + +AsyncCleaner::release_ertr::future<> +AsyncCleaner::maybe_release_segment(Transaction &t) +{ + auto to_release = t.get_segment_to_release(); + if (to_release != NULL_SEG_ID) { + LOG_PREFIX(AsyncCleaner::maybe_release_segment); + INFOT("releasing segment {}", t, to_release); + return sm_group->release_segment(to_release + ).safe_then([this, FNAME, &t, to_release] { + auto old_usage = calc_utilization(to_release); + ceph_assert(old_usage == 0); + segments.mark_empty(to_release); + auto new_usage = calc_utilization(to_release); + adjust_segment_util(old_usage, new_usage); + INFOT("released, should_block_on_gc {}, projected_avail_ratio {}, " + "reclaim_ratio {}", + t, + should_block_on_gc(), + get_projected_available_ratio(), + get_reclaim_ratio()); + if (space_tracker->get_usage(to_release) != 0) { + space_tracker->dump_usage(to_release); + ceph_abort(); + } + maybe_wake_gc_blocked_io(); + }); + } else { + return SegmentManager::release_ertr::now(); + } +} + +void AsyncCleaner::complete_init() +{ + LOG_PREFIX(AsyncCleaner::complete_init); + if (disable_trim) { + init_complete = true; + return; + } + INFO("done, start GC"); + ceph_assert(segments.get_journal_head() != JOURNAL_SEQ_NULL); + init_complete = true; + gc_process.start(); +} + +void AsyncCleaner::mark_space_used( + paddr_t addr, + extent_len_t len, + time_point last_modified, + time_point last_rewritten, + bool init_scan) +{ + LOG_PREFIX(AsyncCleaner::mark_space_used); + if (addr.get_addr_type() != addr_types_t::SEGMENT) { + return; + } + auto& seg_addr = addr.as_seg_paddr(); + + if (!init_scan && !init_complete) { + return; + } + + stats.used_bytes += len; + auto old_usage = calc_utilization(seg_addr.get_segment_id()); + [[maybe_unused]] auto ret = space_tracker->allocate( + seg_addr.get_segment_id(), + seg_addr.get_segment_off(), + len); + auto new_usage = calc_utilization(seg_addr.get_segment_id()); + adjust_segment_util(old_usage, new_usage); + + // use the last extent's last modified time for the calculation of the projected + // time the segments' live extents are to stay unmodified; this is an approximation + // of the sprite lfs' segment "age". + + segments.update_last_modified_rewritten( + seg_addr.get_segment_id(), last_modified, last_rewritten); + + gc_process.maybe_wake_on_space_used(); + assert(ret > 0); + DEBUG("segment {} new len: {}~{}, live_bytes: {}", + seg_addr.get_segment_id(), + addr, + len, + space_tracker->get_usage(seg_addr.get_segment_id())); +} + +void AsyncCleaner::mark_space_free( + paddr_t addr, + extent_len_t len) +{ + LOG_PREFIX(AsyncCleaner::mark_space_free); + if (!init_complete) { + return; + } + if (addr.get_addr_type() != addr_types_t::SEGMENT) { + return; + } + + ceph_assert(stats.used_bytes >= len); + stats.used_bytes -= len; + auto& seg_addr = addr.as_seg_paddr(); + + DEBUG("segment {} free len: {}~{}", + seg_addr.get_segment_id(), addr, len); + auto old_usage = calc_utilization(seg_addr.get_segment_id()); + [[maybe_unused]] auto ret = space_tracker->release( + seg_addr.get_segment_id(), + seg_addr.get_segment_off(), + len); + auto new_usage = calc_utilization(seg_addr.get_segment_id()); + adjust_segment_util(old_usage, new_usage); + maybe_wake_gc_blocked_io(); + assert(ret >= 0); + DEBUG("segment {} free len: {}~{}, live_bytes: {}", + seg_addr.get_segment_id(), + addr, + len, + space_tracker->get_usage(seg_addr.get_segment_id())); +} + +segment_id_t AsyncCleaner::get_next_reclaim_segment() const +{ + LOG_PREFIX(AsyncCleaner::get_next_reclaim_segment); + segment_id_t id = NULL_SEG_ID; + double max_benefit_cost = 0; + for (auto& [_id, segment_info] : segments) { + if (segment_info.is_closed() && + !segment_info.is_in_journal(journal_tail_committed)) { + double benefit_cost = calc_gc_benefit_cost(_id); + if (benefit_cost > max_benefit_cost) { + id = _id; + max_benefit_cost = benefit_cost; + } + } + } + if (id != NULL_SEG_ID) { + DEBUG("segment {}, benefit_cost {}", + id, max_benefit_cost); + return id; + } else { + ceph_assert(get_segments_reclaimable() == 0); + // see gc_should_reclaim_space() + ceph_abort("impossible!"); + return NULL_SEG_ID; + } +} + +void AsyncCleaner::log_gc_state(const char *caller) const +{ + LOG_PREFIX(AsyncCleaner::log_gc_state); + if (LOCAL_LOGGER.is_enabled(seastar::log_level::debug) && + !disable_trim) { + DEBUG( + "caller {}, " + "empty {}, " + "open {}, " + "closed {}, " + "in_journal {}, " + "total {}B, " + "available {}B, " + "unavailable {}B, " + "unavailable_used {}B, " + "unavailable_unused {}B; " + "reclaim_ratio {}, " + "available_ratio {}, " + "should_block_on_gc {}, " + "gc_should_reclaim_space {}, " + "journal_head {}, " + "journal_tail_target {}, " + "journal_tail_commit {}, " + "dirty_tail {}, " + "dirty_tail_limit {}, " + "gc_should_trim_journal {}, ", + caller, + segments.get_num_empty(), + segments.get_num_open(), + segments.get_num_closed(), + get_segments_in_journal(), + segments.get_total_bytes(), + segments.get_available_bytes(), + segments.get_unavailable_bytes(), + stats.used_bytes, + get_unavailable_unused_bytes(), + get_reclaim_ratio(), + segments.get_available_ratio(), + should_block_on_gc(), + gc_should_reclaim_space(), + segments.get_journal_head(), + journal_tail_target, + journal_tail_committed, + get_dirty_tail(), + get_dirty_tail_limit(), + gc_should_trim_journal() + ); + } +} + +seastar::future<> +AsyncCleaner::reserve_projected_usage(std::size_t projected_usage) +{ + if (disable_trim) { + return seastar::now(); + } + ceph_assert(init_complete); + // The pipeline configuration prevents another IO from entering + // prepare until the prior one exits and clears this. + ceph_assert(!blocked_io_wake); + ++stats.io_count; + bool is_blocked = false; + if (should_block_on_trim()) { + is_blocked = true; + ++stats.io_blocked_count_trim; + } + if (should_block_on_reclaim()) { + is_blocked = true; + ++stats.io_blocked_count_reclaim; + } + if (is_blocked) { + ++stats.io_blocking_num; + ++stats.io_blocked_count; + stats.io_blocked_sum += stats.io_blocking_num; + } + return seastar::do_until( + [this] { + log_gc_state("await_hard_limits"); + return !should_block_on_gc(); + }, + [this] { + blocked_io_wake = seastar::promise<>(); + return blocked_io_wake->get_future(); + } + ).then([this, projected_usage, is_blocked] { + ceph_assert(!blocked_io_wake); + stats.projected_used_bytes += projected_usage; + ++stats.projected_count; + stats.projected_used_bytes_sum += stats.projected_used_bytes; + if (is_blocked) { + assert(stats.io_blocking_num > 0); + --stats.io_blocking_num; + } + }); +} + +void AsyncCleaner::release_projected_usage(std::size_t projected_usage) +{ + if (disable_trim) return; + ceph_assert(init_complete); + ceph_assert(stats.projected_used_bytes >= projected_usage); + stats.projected_used_bytes -= projected_usage; + return maybe_wake_gc_blocked_io(); +} + +} diff --git a/src/crimson/os/seastore/async_cleaner.h b/src/crimson/os/seastore/async_cleaner.h new file mode 100644 index 000000000000..0f2ded6bc435 --- /dev/null +++ b/src/crimson/os/seastore/async_cleaner.h @@ -0,0 +1,1296 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include + +#include "common/ceph_time.h" + +#include "osd/osd_types.h" + +#include "crimson/os/seastore/backref_manager.h" +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/segment_manager.h" +#include "crimson/os/seastore/segment_manager_group.h" +#include "crimson/os/seastore/transaction.h" +#include "crimson/os/seastore/segment_seq_allocator.h" + +namespace crimson::os::seastore { + +/* + * segment_info_t + * + * Maintains the tracked information for a segment. + * It is read-only outside segments_info_t. + */ +struct segment_info_t { + using time_point = seastar::lowres_system_clock::time_point; + + // segment_info_t is initiated as set_empty() + Segment::segment_state_t state = Segment::segment_state_t::EMPTY; + + // Will be non-null for any segments in the current journal + segment_seq_t seq = NULL_SEG_SEQ; + + segment_type_t type = segment_type_t::NULL_SEG; + + time_point last_modified; + time_point last_rewritten; + + std::size_t written_to = 0; + + bool is_in_journal(journal_seq_t tail_committed) const { + return type == segment_type_t::JOURNAL && + tail_committed.segment_seq <= seq; + } + + bool is_empty() const { + return state == Segment::segment_state_t::EMPTY; + } + + bool is_closed() const { + return state == Segment::segment_state_t::CLOSED; + } + + bool is_open() const { + return state == Segment::segment_state_t::OPEN; + } + + void init_closed(segment_seq_t, segment_type_t, std::size_t); + + void set_open(segment_seq_t, segment_type_t); + + void set_empty(); + + void set_closed(); + + void update_last_modified_rewritten( + time_point _last_modified, time_point _last_rewritten) { + if (_last_modified != time_point() && last_modified < _last_modified) { + last_modified = _last_modified; + } + if (_last_rewritten != time_point() && last_rewritten < _last_rewritten) { + last_rewritten = _last_rewritten; + } + } +}; + +std::ostream& operator<<(std::ostream&, const segment_info_t&); + +/* + * segments_info_t + * + * Keep track of all segments and related information. + */ +class segments_info_t { +public: + using time_point = seastar::lowres_system_clock::time_point; + + segments_info_t() { + reset(); + } + + const segment_info_t& operator[](segment_id_t id) const { + return segments[id]; + } + + auto begin() const { + return segments.begin(); + } + + auto end() const { + return segments.end(); + } + + std::size_t get_num_segments() const { + assert(segments.size() > 0); + return segments.size(); + } + std::size_t get_segment_size() const { + assert(segment_size > 0); + return segment_size; + } + std::size_t get_num_in_journal_open() const { + return num_in_journal_open; + } + std::size_t get_num_type_journal() const { + return num_type_journal; + } + std::size_t get_num_type_ool() const { + return num_type_ool; + } + std::size_t get_num_open() const { + return num_open; + } + std::size_t get_num_empty() const { + return num_empty; + } + std::size_t get_num_closed() const { + return num_closed; + } + std::size_t get_count_open_journal() const { + return count_open_journal; + } + std::size_t get_count_open_ool() const { + return count_open_ool; + } + std::size_t get_count_release_journal() const { + return count_release_journal; + } + std::size_t get_count_release_ool() const { + return count_release_ool; + } + std::size_t get_count_close_journal() const { + return count_close_journal; + } + std::size_t get_count_close_ool() const { + return count_close_ool; + } + + std::size_t get_total_bytes() const { + return total_bytes; + } + /// the available space that is writable, including in open segments + std::size_t get_available_bytes() const { + return num_empty * get_segment_size() + avail_bytes_in_open; + } + /// the unavailable space that is not writable + std::size_t get_unavailable_bytes() const { + assert(total_bytes >= get_available_bytes()); + return total_bytes - get_available_bytes(); + } + std::size_t get_available_bytes_in_open() const { + return avail_bytes_in_open; + } + double get_available_ratio() const { + return (double)get_available_bytes() / (double)total_bytes; + } + + journal_seq_t get_journal_head() const { + if (unlikely(journal_segment_id == NULL_SEG_ID)) { + return JOURNAL_SEQ_NULL; + } + auto &segment_info = segments[journal_segment_id]; + assert(!segment_info.is_empty()); + assert(segment_info.type == segment_type_t::JOURNAL); + assert(segment_info.seq != NULL_SEG_SEQ); + return journal_seq_t{ + segment_info.seq, + paddr_t::make_seg_paddr( + journal_segment_id, + segment_info.written_to) + }; + } + + void reset(); + + void add_segment_manager(SegmentManager &segment_manager); + + // initiate non-empty segments, the others are by default empty + void init_closed(segment_id_t, segment_seq_t, segment_type_t); + + void mark_open(segment_id_t, segment_seq_t, segment_type_t); + + void mark_empty(segment_id_t); + + void mark_closed(segment_id_t); + + void update_written_to(segment_type_t, paddr_t); + + void update_last_modified_rewritten( + segment_id_t id, time_point last_modified, time_point last_rewritten) { + segments[id].update_last_modified_rewritten(last_modified, last_rewritten); + } + +private: + // See reset() for member initialization + segment_map_t segments; + + std::size_t segment_size; + + segment_id_t journal_segment_id; + std::size_t num_in_journal_open; + std::size_t num_type_journal; + std::size_t num_type_ool; + + std::size_t num_open; + std::size_t num_empty; + std::size_t num_closed; + + std::size_t count_open_journal; + std::size_t count_open_ool; + std::size_t count_release_journal; + std::size_t count_release_ool; + std::size_t count_close_journal; + std::size_t count_close_ool; + + std::size_t total_bytes; + std::size_t avail_bytes_in_open; +}; + +/** + * Callback interface for managing available segments + */ +class SegmentProvider { +public: + virtual journal_seq_t get_journal_tail_target() const = 0; + + virtual const segment_info_t& get_seg_info(segment_id_t id) const = 0; + + virtual segment_id_t allocate_segment( + segment_seq_t seq, segment_type_t type) = 0; + + virtual journal_seq_t get_dirty_extents_replay_from() const = 0; + + virtual journal_seq_t get_alloc_info_replay_from() const = 0; + + virtual void close_segment(segment_id_t) = 0; + + virtual void update_journal_tail_committed(journal_seq_t tail_committed) = 0; + + virtual void update_segment_avail_bytes(segment_type_t, paddr_t) = 0; + + virtual SegmentManagerGroup* get_segment_manager_group() = 0; + + virtual ~SegmentProvider() {} +}; + +class SpaceTrackerI { +public: + virtual int64_t allocate( + segment_id_t segment, + seastore_off_t offset, + extent_len_t len) = 0; + + virtual int64_t release( + segment_id_t segment, + seastore_off_t offset, + extent_len_t len) = 0; + + virtual int64_t get_usage( + segment_id_t segment) const = 0; + + virtual bool equals(const SpaceTrackerI &other) const = 0; + + virtual std::unique_ptr make_empty() const = 0; + + virtual void dump_usage(segment_id_t) const = 0; + + virtual double calc_utilization(segment_id_t segment) const = 0; + + virtual void reset() = 0; + + virtual ~SpaceTrackerI() = default; +}; +using SpaceTrackerIRef = std::unique_ptr; + +class SpaceTrackerSimple : public SpaceTrackerI { + struct segment_bytes_t { + int64_t live_bytes = 0; + seastore_off_t total_bytes = 0; + }; + // Tracks live space for each segment + segment_map_t live_bytes_by_segment; + + int64_t update_usage(segment_id_t segment, int64_t delta) { + live_bytes_by_segment[segment].live_bytes += delta; + assert(live_bytes_by_segment[segment].live_bytes >= 0); + return live_bytes_by_segment[segment].live_bytes; + } +public: + SpaceTrackerSimple(const SpaceTrackerSimple &) = default; + SpaceTrackerSimple(const std::vector &sms) { + for (auto sm : sms) { + live_bytes_by_segment.add_device( + sm->get_device_id(), + sm->get_num_segments(), + {0, sm->get_segment_size()}); + } + } + + int64_t allocate( + segment_id_t segment, + seastore_off_t offset, + extent_len_t len) final { + return update_usage(segment, len); + } + + int64_t release( + segment_id_t segment, + seastore_off_t offset, + extent_len_t len) final { + return update_usage(segment, -(int64_t)len); + } + + int64_t get_usage(segment_id_t segment) const final { + return live_bytes_by_segment[segment].live_bytes; + } + + double calc_utilization(segment_id_t segment) const final { + auto& seg_bytes = live_bytes_by_segment[segment]; + return (double)seg_bytes.live_bytes / (double)seg_bytes.total_bytes; + } + + void dump_usage(segment_id_t) const final; + + void reset() final { + for (auto &i : live_bytes_by_segment) { + i.second = {0, 0}; + } + } + + SpaceTrackerIRef make_empty() const final { + auto ret = SpaceTrackerIRef(new SpaceTrackerSimple(*this)); + ret->reset(); + return ret; + } + + bool equals(const SpaceTrackerI &other) const; +}; + +class SpaceTrackerDetailed : public SpaceTrackerI { + class SegmentMap { + int64_t used = 0; + seastore_off_t total_bytes = 0; + std::vector bitmap; + + public: + SegmentMap( + size_t blocks, + seastore_off_t total_bytes) + : total_bytes(total_bytes), + bitmap(blocks, false) {} + + int64_t update_usage(int64_t delta) { + used += delta; + return used; + } + + int64_t allocate( + device_segment_id_t segment, + seastore_off_t offset, + extent_len_t len, + const extent_len_t block_size); + + int64_t release( + device_segment_id_t segment, + seastore_off_t offset, + extent_len_t len, + const extent_len_t block_size); + + int64_t get_usage() const { + return used; + } + + void dump_usage(extent_len_t block_size) const; + + double calc_utilization() const { + return (double)used / (double)total_bytes; + } + + void reset() { + used = 0; + for (auto &&i: bitmap) { + i = false; + } + } + }; + + // Tracks live space for each segment + segment_map_t segment_usage; + std::vector block_size_by_segment_manager; + +public: + SpaceTrackerDetailed(const SpaceTrackerDetailed &) = default; + SpaceTrackerDetailed(const std::vector &sms) + { + block_size_by_segment_manager.resize(DEVICE_ID_MAX, 0); + for (auto sm : sms) { + segment_usage.add_device( + sm->get_device_id(), + sm->get_num_segments(), + SegmentMap( + sm->get_segment_size() / sm->get_block_size(), + sm->get_segment_size())); + block_size_by_segment_manager[sm->get_device_id()] = sm->get_block_size(); + } + } + + int64_t allocate( + segment_id_t segment, + seastore_off_t offset, + extent_len_t len) final { + return segment_usage[segment].allocate( + segment.device_segment_id(), + offset, + len, + block_size_by_segment_manager[segment.device_id()]); + } + + int64_t release( + segment_id_t segment, + seastore_off_t offset, + extent_len_t len) final { + return segment_usage[segment].release( + segment.device_segment_id(), + offset, + len, + block_size_by_segment_manager[segment.device_id()]); + } + + int64_t get_usage(segment_id_t segment) const final { + return segment_usage[segment].get_usage(); + } + + double calc_utilization(segment_id_t segment) const final { + return segment_usage[segment].calc_utilization(); + } + + void dump_usage(segment_id_t seg) const final; + + void reset() final { + for (auto &i: segment_usage) { + i.second.reset(); + } + } + + SpaceTrackerIRef make_empty() const final { + auto ret = SpaceTrackerIRef(new SpaceTrackerDetailed(*this)); + ret->reset(); + return ret; + } + + bool equals(const SpaceTrackerI &other) const; +}; + + +class AsyncCleaner : public SegmentProvider { +public: + using time_point = seastar::lowres_system_clock::time_point; + using duration = seastar::lowres_system_clock::duration; + + /// Config + struct config_t { + /// Number of minimum journal segments to stop trimming. + size_t target_journal_segments = 0; + /// Number of maximum journal segments to block user transactions. + size_t max_journal_segments = 0; + + /// Number of journal segments the transactions in which can + /// have their corresponding backrefs unmerged + size_t target_backref_inflight_segments = 0; + + /// Ratio of maximum available space to disable reclaiming. + double available_ratio_gc_max = 0; + /// Ratio of minimum available space to force reclaiming. + double available_ratio_hard_limit = 0; + + /// Ratio of minimum reclaimable space to stop reclaiming. + double reclaim_ratio_gc_threshold = 0; + + /// Number of bytes to reclaim per cycle + size_t reclaim_bytes_per_cycle = 0; + + /// Number of bytes to rewrite dirty per cycle + size_t rewrite_dirty_bytes_per_cycle = 0; + + /// Number of bytes to rewrite backref per cycle + size_t rewrite_backref_bytes_per_cycle = 0; + + void validate() const { + ceph_assert(max_journal_segments > target_journal_segments); + ceph_assert(available_ratio_gc_max > available_ratio_hard_limit); + ceph_assert(reclaim_bytes_per_cycle > 0); + ceph_assert(rewrite_dirty_bytes_per_cycle > 0); + ceph_assert(rewrite_backref_bytes_per_cycle > 0); + } + + static config_t get_default() { + return config_t{ + 12, // target_journal_segments + 16, // max_journal_segments + 2, // target_backref_inflight_segments + .1, // available_ratio_gc_max + .05, // available_ratio_hard_limit + .1, // reclaim_ratio_gc_threshold + 1<<20,// reclaim_bytes_per_cycle + 1<<17,// rewrite_dirty_bytes_per_cycle + 1<<24 // rewrite_backref_bytes_per_cycle + }; + } + + static config_t get_test() { + return config_t{ + 2, // target_journal_segments + 4, // max_journal_segments + 2, // target_backref_inflight_segments + .99, // available_ratio_gc_max + .2, // available_ratio_hard_limit + .6, // reclaim_ratio_gc_threshold + 1<<20,// reclaim_bytes_per_cycle + 1<<17,// rewrite_dirty_bytes_per_cycle + 1<<24 // rewrite_backref_bytes_per_cycle + }; + } + }; + + /// Callback interface for querying and operating on segments + class ExtentCallbackInterface { + public: + virtual ~ExtentCallbackInterface() = default; + + virtual TransactionRef create_transaction( + Transaction::src_t, const char*) = 0; + + /// Creates empty transaction with interruptible context + template + auto with_transaction_intr( + Transaction::src_t src, + const char* name, + Func &&f) { + return seastar::do_with( + create_transaction(src, name), + [f=std::forward(f)](auto &ref_t) mutable { + return with_trans_intr( + *ref_t, + [f=std::forward(f)](auto& t) mutable { + return f(t); + } + ); + } + ); + } + + /// See Cache::get_next_dirty_extents + using get_next_dirty_extents_iertr = trans_iertr< + crimson::errorator< + crimson::ct_error::input_output_error> + >; + using get_next_dirty_extents_ret = get_next_dirty_extents_iertr::future< + std::vector>; + virtual get_next_dirty_extents_ret get_next_dirty_extents( + Transaction &t, ///< [in] current transaction + journal_seq_t bound,///< [in] return extents with dirty_from < bound + size_t max_bytes ///< [in] return up to max_bytes of extents + ) = 0; + + using extent_mapping_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::eagain>; + using extent_mapping_iertr = trans_iertr< + crimson::errorator< + crimson::ct_error::input_output_error> + >; + + /** + * rewrite_extent + * + * Updates t with operations moving the passed extents to a new + * segment. extent may be invalid, implementation must correctly + * handle finding the current instance if it is still alive and + * otherwise ignore it. + */ + using rewrite_extent_iertr = extent_mapping_iertr; + using rewrite_extent_ret = rewrite_extent_iertr::future<>; + virtual rewrite_extent_ret rewrite_extent( + Transaction &t, + CachedExtentRef extent) = 0; + + /** + * get_extent_if_live + * + * Returns extent at specified location if still referenced by + * lba_manager and not removed by t. + * + * See TransactionManager::get_extent_if_live and + * LBAManager::get_physical_extent_if_live. + */ + using get_extent_if_live_iertr = extent_mapping_iertr; + using get_extent_if_live_ret = get_extent_if_live_iertr::future< + CachedExtentRef>; + virtual get_extent_if_live_ret get_extent_if_live( + Transaction &t, + extent_types_t type, + paddr_t addr, + laddr_t laddr, + seastore_off_t len) = 0; + + /** + * submit_transaction_direct + * + * Submits transaction without any space throttling. + */ + using submit_transaction_direct_iertr = trans_iertr< + crimson::errorator< + crimson::ct_error::input_output_error> + >; + using submit_transaction_direct_ret = + submit_transaction_direct_iertr::future<>; + virtual submit_transaction_direct_ret submit_transaction_direct( + Transaction &t, + std::optional seq_to_trim = std::nullopt) = 0; + }; + +private: + const bool detailed; + const config_t config; + + SegmentManagerGroupRef sm_group; + BackrefManager &backref_manager; + + SpaceTrackerIRef space_tracker; + segments_info_t segments; + bool init_complete = false; + + struct { + /** + * used_bytes + * + * Bytes occupied by live extents + */ + uint64_t used_bytes = 0; + + /** + * projected_used_bytes + * + * Sum of projected bytes used by each transaction between throttle + * acquisition and commit completion. See reserve_projected_usage() + */ + uint64_t projected_used_bytes = 0; + uint64_t projected_count = 0; + uint64_t projected_used_bytes_sum = 0; + + uint64_t closed_journal_used_bytes = 0; + uint64_t closed_journal_total_bytes = 0; + uint64_t closed_ool_used_bytes = 0; + uint64_t closed_ool_total_bytes = 0; + + uint64_t io_blocking_num = 0; + uint64_t io_count = 0; + uint64_t io_blocked_count = 0; + uint64_t io_blocked_count_trim = 0; + uint64_t io_blocked_count_reclaim = 0; + uint64_t io_blocked_sum = 0; + + uint64_t reclaiming_bytes = 0; + uint64_t reclaimed_bytes = 0; + uint64_t reclaimed_segment_bytes = 0; + + seastar::metrics::histogram segment_util; + } stats; + seastar::metrics::metric_group metrics; + void register_metrics(); + + /// target journal_tail for next fresh segment + journal_seq_t journal_tail_target; + + /// target replay_from for dirty extents + journal_seq_t dirty_extents_replay_from; + + /// target replay_from for alloc infos + journal_seq_t alloc_info_replay_from; + + /// most recently committed journal_tail + journal_seq_t journal_tail_committed; + + ExtentCallbackInterface *ecb = nullptr; + + /// populated if there is an IO blocked on hard limits + std::optional> blocked_io_wake; + + SegmentSeqAllocatorRef ool_segment_seq_allocator; + + /** + * disable_trim + * + * added to enable unit testing of CircularBoundedJournal before + * proper support is added to AsyncCleaner. + * Should be removed once proper support is added. TODO + */ + bool disable_trim = false; +public: + AsyncCleaner( + config_t config, + SegmentManagerGroupRef&& sm_group, + BackrefManager &backref_manager, + bool detailed = false); + + SegmentSeqAllocator& get_ool_segment_seq_allocator() { + return *ool_segment_seq_allocator; + } + + using mount_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using mount_ret = mount_ertr::future<>; + mount_ret mount(); + + /* + * SegmentProvider interfaces + */ + journal_seq_t get_journal_tail_target() const final { + return journal_tail_target; + } + + const segment_info_t& get_seg_info(segment_id_t id) const final { + return segments[id]; + } + + segment_id_t allocate_segment( + segment_seq_t seq, segment_type_t type) final; + + void close_segment(segment_id_t segment) final; + + void update_journal_tail_committed(journal_seq_t committed) final; + + void update_segment_avail_bytes(segment_type_t type, paddr_t offset) final { + segments.update_written_to(type, offset); + gc_process.maybe_wake_on_space_used(); + } + + SegmentManagerGroup* get_segment_manager_group() final { + return sm_group.get(); + } + + journal_seq_t get_dirty_extents_replay_from() const final { + return dirty_extents_replay_from; + } + + journal_seq_t get_alloc_info_replay_from() const final { + return alloc_info_replay_from; + } + + void update_journal_tail_target( + journal_seq_t dirty_replay_from, + journal_seq_t alloc_replay_from); + + void update_alloc_info_replay_from( + journal_seq_t alloc_replay_from); + + void init_mkfs() { + auto journal_head = segments.get_journal_head(); + ceph_assert(disable_trim || journal_head != JOURNAL_SEQ_NULL); + journal_tail_target = journal_head; + journal_tail_committed = journal_head; + } + + using release_ertr = SegmentManagerGroup::release_ertr; + release_ertr::future<> maybe_release_segment(Transaction &t); + + void adjust_segment_util(double old_usage, double new_usage) { + auto old_index = get_bucket_index(old_usage); + auto new_index = get_bucket_index(new_usage); + assert(stats.segment_util.buckets[old_index].count > 0); + stats.segment_util.buckets[old_index].count--; + stats.segment_util.buckets[new_index].count++; + } + + void mark_space_used( + paddr_t addr, + extent_len_t len, + time_point last_modified = time_point(), + time_point last_rewritten = time_point(), + bool init_scan = false); + + void mark_space_free( + paddr_t addr, + extent_len_t len); + + SpaceTrackerIRef get_empty_space_tracker() const { + return space_tracker->make_empty(); + } + + void complete_init(); + + store_statfs_t stat() const { + store_statfs_t st; + st.total = segments.get_total_bytes(); + st.available = segments.get_total_bytes() - stats.used_bytes; + st.allocated = stats.used_bytes; + st.data_stored = stats.used_bytes; + + // TODO add per extent type counters for omap_allocated and + // internal metadata + return st; + } + + seastar::future<> stop() { + return gc_process.stop(); + } + + seastar::future<> run_until_halt() { + return gc_process.run_until_halt(); + } + + void set_extent_callback(ExtentCallbackInterface *cb) { + ecb = cb; + } + + bool debug_check_space(const SpaceTrackerI &tracker) { + return space_tracker->equals(tracker); + } + + void set_disable_trim(bool val) { + disable_trim = val; + } + + using work_ertr = ExtentCallbackInterface::extent_mapping_ertr; + using work_iertr = ExtentCallbackInterface::extent_mapping_iertr; + +private: + /* + * 10 buckets for the number of closed segments by usage + * 2 extra buckets for the number of open and empty segments + */ + static constexpr double UTIL_STATE_OPEN = 1.05; + static constexpr double UTIL_STATE_EMPTY = 1.15; + static constexpr std::size_t UTIL_BUCKETS = 12; + static std::size_t get_bucket_index(double util) { + auto index = std::floor(util * 10); + assert(index < UTIL_BUCKETS); + return index; + } + double calc_utilization(segment_id_t id) const { + auto& info = segments[id]; + if (info.is_open()) { + return UTIL_STATE_OPEN; + } else if (info.is_empty()) { + return UTIL_STATE_EMPTY; + } else { + auto ret = space_tracker->calc_utilization(id); + assert(ret >= 0 && ret < 1); + return ret; + } + } + + // journal status helpers + + double calc_gc_benefit_cost(segment_id_t id) const { + double util = calc_utilization(id); + ceph_assert(util >= 0 && util < 1); + auto cur_time = seastar::lowres_system_clock::now(); + auto segment = segments[id]; + assert(cur_time >= segment.last_modified); + auto segment_age = + cur_time - std::max(segment.last_modified, segment.last_rewritten); + uint64_t age = segment_age.count(); + return (1 - util) * age / (1 + util); + } + + segment_id_t get_next_reclaim_segment() const; + + /** + * rewrite_dirty + * + * Writes out dirty blocks dirtied earlier than limit. + */ + using rewrite_dirty_iertr = work_iertr; + using rewrite_dirty_ret = rewrite_dirty_iertr::future<>; + rewrite_dirty_ret rewrite_dirty( + Transaction &t, + journal_seq_t limit); + + using trim_backrefs_iertr = work_iertr; + using trim_backrefs_ret = trim_backrefs_iertr::future; + trim_backrefs_ret trim_backrefs( + Transaction &t, + journal_seq_t limit); + + journal_seq_t get_dirty_tail() const { + auto ret = segments.get_journal_head(); + ceph_assert(ret != JOURNAL_SEQ_NULL); + if (ret.segment_seq >= config.target_journal_segments) { + ret.segment_seq -= config.target_journal_segments; + } else { + ret.segment_seq = 0; + ret.offset = P_ADDR_MIN; + } + return ret; + } + + journal_seq_t get_dirty_tail_limit() const { + auto ret = segments.get_journal_head(); + ceph_assert(ret != JOURNAL_SEQ_NULL); + if (ret.segment_seq >= config.max_journal_segments) { + ret.segment_seq -= config.max_journal_segments; + } else { + ret.segment_seq = 0; + ret.offset = P_ADDR_MIN; + } + return ret; + } + + journal_seq_t get_backref_tail() const { + auto ret = segments.get_journal_head(); + ceph_assert(ret != JOURNAL_SEQ_NULL); + if (ret.segment_seq >= config.target_backref_inflight_segments) { + ret.segment_seq -= config.target_backref_inflight_segments; + } else { + ret.segment_seq = 0; + ret.offset = P_ADDR_MIN; + } + return ret; + } + + struct reclaim_state_t { + std::size_t segment_size; + paddr_t start_pos; + paddr_t end_pos; + + static reclaim_state_t create( + segment_id_t segment_id, + std::size_t segment_size) { + return {segment_size, + P_ADDR_NULL, + paddr_t::make_seg_paddr(segment_id, 0)}; + } + + segment_id_t get_segment_id() const { + return end_pos.as_seg_paddr().get_segment_id(); + } + + bool is_complete() const { + return (std::size_t)end_pos.as_seg_paddr().get_segment_off() >= segment_size; + } + + void advance(std::size_t bytes) { + assert(!is_complete()); + start_pos = end_pos; + auto &end_seg_paddr = end_pos.as_seg_paddr(); + auto next_off = end_seg_paddr.get_segment_off() + bytes; + if (next_off > segment_size) { + end_seg_paddr.set_segment_off(segment_size); + } else { + end_seg_paddr.set_segment_off(next_off); + } + } + }; + std::optional reclaim_state; + + /** + * GCProcess + * + * Background gc process. + */ + using gc_cycle_ret = seastar::future<>; + class GCProcess { + std::optional process_join; + + AsyncCleaner &cleaner; + + std::optional> blocking; + + bool is_stopping() const { + return !process_join; + } + + gc_cycle_ret run(); + + void wake() { + if (blocking) { + blocking->set_value(); + blocking = std::nullopt; + } + } + + seastar::future<> maybe_wait_should_run() { + return seastar::do_until( + [this] { + cleaner.log_gc_state("GCProcess::maybe_wait_should_run"); + return is_stopping() || cleaner.gc_should_run(); + }, + [this] { + ceph_assert(!blocking); + blocking = seastar::promise<>(); + return blocking->get_future(); + }); + } + public: + GCProcess(AsyncCleaner &cleaner) : cleaner(cleaner) {} + + void start() { + ceph_assert(is_stopping()); + process_join = seastar::now(); // allow run() + process_join = run(); + assert(!is_stopping()); + } + + gc_cycle_ret stop() { + if (is_stopping()) { + return seastar::now(); + } + auto ret = std::move(*process_join); + process_join.reset(); + assert(is_stopping()); + wake(); + return ret; + } + + gc_cycle_ret run_until_halt() { + ceph_assert(is_stopping()); + return seastar::do_until( + [this] { + cleaner.log_gc_state("GCProcess::run_until_halt"); + return !cleaner.gc_should_run(); + }, + [this] { + return cleaner.do_gc_cycle(); + }); + } + + void maybe_wake_on_space_used() { + if (is_stopping()) { + return; + } + if (cleaner.gc_should_run()) { + wake(); + } + } + } gc_process; + + using gc_ertr = work_ertr::extend_ertr< + SegmentManagerGroup::scan_extents_ertr + >; + + gc_cycle_ret do_gc_cycle(); + + using gc_trim_journal_ertr = gc_ertr; + using gc_trim_journal_ret = gc_trim_journal_ertr::future<>; + gc_trim_journal_ret gc_trim_journal(); + + using gc_trim_backref_ertr = gc_ertr; + using gc_trim_backref_ret = gc_trim_backref_ertr::future; + gc_trim_backref_ret gc_trim_backref(journal_seq_t limit); + + using gc_reclaim_space_ertr = gc_ertr; + using gc_reclaim_space_ret = gc_reclaim_space_ertr::future<>; + gc_reclaim_space_ret gc_reclaim_space(); + + + using retrieve_live_extents_iertr = work_iertr; + using retrieve_live_extents_ret = + retrieve_live_extents_iertr::future; + retrieve_live_extents_ret _retrieve_live_extents( + Transaction &t, + std::set< + backref_buf_entry_t, + backref_buf_entry_t::cmp_t> &&backrefs, + std::vector &extents); + + using retrieve_backref_mappings_ertr = work_ertr; + using retrieve_backref_mappings_ret = + retrieve_backref_mappings_ertr::future; + retrieve_backref_mappings_ret retrieve_backref_mappings( + paddr_t start_paddr, + paddr_t end_paddr); + + /* + * Segments calculations + */ + std::size_t get_segments_in_journal() const { + if (!init_complete) { + return 0; + } + if (journal_tail_committed == JOURNAL_SEQ_NULL) { + return segments.get_num_type_journal(); + } + auto journal_head = segments.get_journal_head(); + assert(journal_head != JOURNAL_SEQ_NULL); + assert(journal_head.segment_seq >= journal_tail_committed.segment_seq); + return journal_head.segment_seq + 1 - journal_tail_committed.segment_seq; + } + std::size_t get_segments_in_journal_closed() const { + auto in_journal = get_segments_in_journal(); + auto in_journal_open = segments.get_num_in_journal_open(); + if (in_journal >= in_journal_open) { + return in_journal - in_journal_open; + } else { + return 0; + } + } + std::size_t get_segments_reclaimable() const { + assert(segments.get_num_closed() >= get_segments_in_journal_closed()); + return segments.get_num_closed() - get_segments_in_journal_closed(); + } + + /* + * Space calculations + */ + /// the unavailable space that is not reclaimable yet + std::size_t get_unavailable_unreclaimable_bytes() const { + auto ret = (segments.get_num_open() + get_segments_in_journal_closed()) * + segments.get_segment_size(); + assert(ret >= segments.get_available_bytes_in_open()); + return ret - segments.get_available_bytes_in_open(); + } + /// the unavailable space that can be reclaimed + std::size_t get_unavailable_reclaimable_bytes() const { + auto ret = get_segments_reclaimable() * segments.get_segment_size(); + ceph_assert(ret + get_unavailable_unreclaimable_bytes() == segments.get_unavailable_bytes()); + return ret; + } + /// the unavailable space that is not alive + std::size_t get_unavailable_unused_bytes() const { + assert(segments.get_unavailable_bytes() > stats.used_bytes); + return segments.get_unavailable_bytes() - stats.used_bytes; + } + double get_reclaim_ratio() const { + if (segments.get_unavailable_bytes() == 0) return 0; + return (double)get_unavailable_unused_bytes() / (double)segments.get_unavailable_bytes(); + } + + /* + * Space calculations (projected) + */ + std::size_t get_projected_available_bytes() const { + return (segments.get_available_bytes() > stats.projected_used_bytes) ? + segments.get_available_bytes() - stats.projected_used_bytes: + 0; + } + double get_projected_available_ratio() const { + return (double)get_projected_available_bytes() / + (double)segments.get_total_bytes(); + } + + /* + * Journal sizes + */ + std::size_t get_dirty_journal_size() const { + auto journal_head = segments.get_journal_head(); + if (journal_head == JOURNAL_SEQ_NULL || + dirty_extents_replay_from == JOURNAL_SEQ_NULL) { + return 0; + } + return (journal_head.segment_seq - dirty_extents_replay_from.segment_seq) * + segments.get_segment_size() + + journal_head.offset.as_seg_paddr().get_segment_off() - + segments.get_segment_size() - + dirty_extents_replay_from.offset.as_seg_paddr().get_segment_off(); + } + + std::size_t get_alloc_journal_size() const { + auto journal_head = segments.get_journal_head(); + if (journal_head == JOURNAL_SEQ_NULL || + alloc_info_replay_from == JOURNAL_SEQ_NULL) { + return 0; + } + return (journal_head.segment_seq - alloc_info_replay_from.segment_seq) * + segments.get_segment_size() + + journal_head.offset.as_seg_paddr().get_segment_off() - + segments.get_segment_size() - + alloc_info_replay_from.offset.as_seg_paddr().get_segment_off(); + } + + /** + * should_block_on_gc + * + * Encapsulates whether block pending gc. + */ + bool should_block_on_trim() const { + if (disable_trim) return false; + return get_dirty_tail_limit() > journal_tail_target; + } + + bool should_block_on_reclaim() const { + if (disable_trim) return false; + if (get_segments_reclaimable() == 0) { + return false; + } + auto aratio = get_projected_available_ratio(); + return aratio < config.available_ratio_hard_limit; + } + + bool should_block_on_gc() const { + return should_block_on_trim() || should_block_on_reclaim(); + } + + void log_gc_state(const char *caller) const; + +public: + seastar::future<> reserve_projected_usage(std::size_t projected_usage); + + void release_projected_usage(size_t projected_usage); + +private: + void maybe_wake_gc_blocked_io() { + if (!init_complete) { + return; + } + if (!should_block_on_gc() && blocked_io_wake) { + blocked_io_wake->set_value(); + blocked_io_wake = std::nullopt; + } + } + + using scan_extents_ret_bare = + std::vector>; + using scan_extents_ertr = SegmentManagerGroup::scan_extents_ertr; + using scan_extents_ret = scan_extents_ertr::future<>; + scan_extents_ret scan_nonfull_segment( + const segment_header_t& header, + scan_extents_ret_bare& segment_set, + segment_id_t segment_id); + + /** + * gc_should_reclaim_space + * + * Encapsulates logic for whether gc should be reclaiming segment space. + */ + bool gc_should_reclaim_space() const { + if (disable_trim) return false; + if (get_segments_reclaimable() == 0) { + return false; + } + auto aratio = segments.get_available_ratio(); + auto rratio = get_reclaim_ratio(); + return ( + (aratio < config.available_ratio_hard_limit) || + ((aratio < config.available_ratio_gc_max) && + (rratio > config.reclaim_ratio_gc_threshold)) + ); + } + + /** + * gc_should_trim_journal + * + * Encapsulates logic for whether gc should be reclaiming segment space. + */ + bool gc_should_trim_journal() const { + return get_dirty_tail() > journal_tail_target; + } + + bool gc_should_trim_backref() const { + return get_backref_tail() > alloc_info_replay_from; + } + /** + * gc_should_run + * + * True if gc should be running. + */ + bool gc_should_run() const { + if (disable_trim) return false; + ceph_assert(init_complete); + return gc_should_reclaim_space() + || gc_should_trim_journal() + || gc_should_trim_backref(); + } + + void init_mark_segment_closed( + segment_id_t segment, + segment_seq_t seq, + segment_type_t s_type) { + ceph_assert(!init_complete); + auto old_usage = calc_utilization(segment); + segments.init_closed(segment, seq, s_type); + auto new_usage = calc_utilization(segment); + adjust_segment_util(old_usage, new_usage); + if (s_type == segment_type_t::OOL) { + ool_segment_seq_allocator->set_next_segment_seq(seq); + } + } +}; +using AsyncCleanerRef = std::unique_ptr; + +} diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index ee128db1d856..dc7fdffa43ef 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -10,7 +10,7 @@ #include "crimson/os/seastore/logging.h" #include "crimson/common/config_proxy.h" -#include "crimson/os/seastore/segment_cleaner.h" +#include "crimson/os/seastore/async_cleaner.h" // included for get_extent_by_type #include "crimson/os/seastore/collection_manager/collection_flat_node.h" @@ -1371,7 +1371,7 @@ void Cache::complete_commit( Transaction &t, paddr_t final_block_start, journal_seq_t seq, - SegmentCleaner *cleaner) + AsyncCleaner *cleaner) { LOG_PREFIX(Cache::complete_commit); SUBTRACET(seastore_t, "final_block_start={}, seq={}", diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h index 081dd49743de..3560406c4ebb 100644 --- a/src/crimson/os/seastore/cache.h +++ b/src/crimson/os/seastore/cache.h @@ -26,7 +26,7 @@ class BtreeBackrefManager; namespace crimson::os::seastore { class BackrefManager; -class SegmentCleaner; +class AsyncCleaner; struct backref_buf_entry_t { backref_buf_entry_t( @@ -747,7 +747,7 @@ public: Transaction &t, ///< [in, out] current transaction paddr_t final_block_start, ///< [in] offset of initial block journal_seq_t seq, ///< [in] journal commit seq - SegmentCleaner *cleaner=nullptr ///< [out] optional segment stat listener + AsyncCleaner *cleaner=nullptr ///< [out] optional segment stat listener ); /** diff --git a/src/crimson/os/seastore/journal/segment_allocator.cc b/src/crimson/os/seastore/journal/segment_allocator.cc index 0ea508b31517..2716228531d4 100644 --- a/src/crimson/os/seastore/journal/segment_allocator.cc +++ b/src/crimson/os/seastore/journal/segment_allocator.cc @@ -6,7 +6,7 @@ #include #include "crimson/os/seastore/logging.h" -#include "crimson/os/seastore/segment_cleaner.h" +#include "crimson/os/seastore/async_cleaner.h" SET_SUBSYS(seastore_journal); diff --git a/src/crimson/os/seastore/journal/segmented_journal.h b/src/crimson/os/seastore/journal/segmented_journal.h index 46c3675bdf1f..a97db1b74ce6 100644 --- a/src/crimson/os/seastore/journal/segmented_journal.h +++ b/src/crimson/os/seastore/journal/segmented_journal.h @@ -9,7 +9,7 @@ #include "include/buffer.h" #include "include/denc.h" -#include "crimson/os/seastore/segment_cleaner.h" +#include "crimson/os/seastore/async_cleaner.h" #include "crimson/os/seastore/journal.h" #include "crimson/os/seastore/segment_manager_group.h" #include "crimson/os/seastore/ordering_handle.h" diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc index 2d0428e8f0bf..a07bd50ba560 100644 --- a/src/crimson/os/seastore/seastore.cc +++ b/src/crimson/os/seastore/seastore.cc @@ -23,7 +23,7 @@ #include "crimson/os/futurized_collection.h" #include "crimson/os/seastore/backref_manager.h" -#include "crimson/os/seastore/segment_cleaner.h" +#include "crimson/os/seastore/async_cleaner.h" #include "crimson/os/seastore/collection_manager/flat_collection_manager.h" #include "crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h" #include "crimson/os/seastore/omap_manager/btree/btree_omap_manager.h" diff --git a/src/crimson/os/seastore/segment_cleaner.cc b/src/crimson/os/seastore/segment_cleaner.cc deleted file mode 100644 index a3c3d6bb04bd..000000000000 --- a/src/crimson/os/seastore/segment_cleaner.cc +++ /dev/null @@ -1,1439 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include - -#include "crimson/os/seastore/logging.h" - -#include "crimson/os/seastore/segment_cleaner.h" -#include "crimson/os/seastore/transaction_manager.h" - -SET_SUBSYS(seastore_cleaner); - -namespace crimson::os::seastore { - -void segment_info_t::set_open( - segment_seq_t _seq, segment_type_t _type) -{ - ceph_assert(_seq != NULL_SEG_SEQ); - ceph_assert(_type != segment_type_t::NULL_SEG); - state = Segment::segment_state_t::OPEN; - seq = _seq; - type = _type; - written_to = 0; -} - -void segment_info_t::set_empty() -{ - state = Segment::segment_state_t::EMPTY; - seq = NULL_SEG_SEQ; - type = segment_type_t::NULL_SEG; - last_modified = {}; - last_rewritten = {}; - written_to = 0; -} - -void segment_info_t::set_closed() -{ - state = Segment::segment_state_t::CLOSED; - // the rest of information is unchanged -} - -void segment_info_t::init_closed( - segment_seq_t _seq, segment_type_t _type, std::size_t seg_size) -{ - ceph_assert(_seq != NULL_SEG_SEQ); - ceph_assert(_type != segment_type_t::NULL_SEG); - state = Segment::segment_state_t::CLOSED; - seq = _seq; - type = _type; - written_to = seg_size; -} - -std::ostream& operator<<(std::ostream &out, const segment_info_t &info) -{ - out << "seg_info_t(" - << "state=" << info.state; - if (info.is_empty()) { - // pass - } else { // open or closed - out << ", seq=" << segment_seq_printer_t{info.seq} - << ", type=" << info.type - << ", last_modified=" << info.last_modified.time_since_epoch() - << ", last_rewritten=" << info.last_rewritten.time_since_epoch() - << ", written_to=" << info.written_to; - } - return out << ")"; -} - -void segments_info_t::reset() -{ - segments.clear(); - - segment_size = 0; - - journal_segment_id = NULL_SEG_ID; - num_in_journal_open = 0; - num_type_journal = 0; - num_type_ool = 0; - - num_open = 0; - num_empty = 0; - num_closed = 0; - - count_open_journal = 0; - count_open_ool = 0; - count_release_journal = 0; - count_release_ool = 0; - count_close_journal = 0; - count_close_ool = 0; - - total_bytes = 0; - avail_bytes_in_open = 0; -} - -void segments_info_t::add_segment_manager( - SegmentManager &segment_manager) -{ - LOG_PREFIX(segments_info_t::add_segment_manager); - device_id_t d_id = segment_manager.get_device_id(); - auto ssize = segment_manager.get_segment_size(); - auto nsegments = segment_manager.get_num_segments(); - auto sm_size = segment_manager.get_size(); - INFO("adding segment manager {}, size={}, ssize={}, segments={}", - device_id_printer_t{d_id}, sm_size, ssize, nsegments); - ceph_assert(ssize > 0); - ceph_assert(nsegments > 0); - ceph_assert(sm_size > 0); - - // also validate if the device is duplicated - segments.add_device(d_id, nsegments, segment_info_t{}); - - // assume all the segment managers share the same settings as follows. - if (segment_size == 0) { - ceph_assert(ssize > 0); - segment_size = ssize; - } else { - ceph_assert(segment_size == (std::size_t)ssize); - } - - // NOTE: by default the segments are empty - num_empty += nsegments; - - total_bytes += sm_size; -} - -void segments_info_t::init_closed( - segment_id_t segment, segment_seq_t seq, segment_type_t type) -{ - LOG_PREFIX(segments_info_t::init_closed); - auto& segment_info = segments[segment]; - INFO("initiating {} {} {}, {}, num_segments(empty={}, opened={}, closed={})", - segment, segment_seq_printer_t{seq}, type, - segment_info, num_empty, num_open, num_closed); - ceph_assert(segment_info.is_empty()); - segment_info.init_closed(seq, type, get_segment_size()); - ceph_assert(num_empty > 0); - --num_empty; - ++num_closed; - if (type == segment_type_t::JOURNAL) { - // init_closed won't initialize journal_segment_id - ceph_assert(get_journal_head() == JOURNAL_SEQ_NULL); - ++num_type_journal; - } else { - ++num_type_ool; - } - // do not increment count_close_*; -} - -void segments_info_t::mark_open( - segment_id_t segment, segment_seq_t seq, segment_type_t type) -{ - LOG_PREFIX(segments_info_t::mark_open); - auto& segment_info = segments[segment]; - INFO("opening {} {} {}, {}, num_segments(empty={}, opened={}, closed={})", - segment, segment_seq_printer_t{seq}, type, - segment_info, num_empty, num_open, num_closed); - ceph_assert(segment_info.is_empty()); - segment_info.set_open(seq, type); - ceph_assert(num_empty > 0); - --num_empty; - ++num_open; - if (type == segment_type_t::JOURNAL) { - if (journal_segment_id != NULL_SEG_ID) { - auto& last_journal_segment = segments[journal_segment_id]; - ceph_assert(last_journal_segment.is_closed()); - ceph_assert(last_journal_segment.type == segment_type_t::JOURNAL); - ceph_assert(last_journal_segment.seq + 1 == seq); - } - journal_segment_id = segment; - - ++num_in_journal_open; - ++num_type_journal; - ++count_open_journal; - } else { - ++num_type_ool; - ++count_open_ool; - } - ceph_assert(segment_info.written_to == 0); - avail_bytes_in_open += get_segment_size(); -} - -void segments_info_t::mark_empty( - segment_id_t segment) -{ - LOG_PREFIX(segments_info_t::mark_empty); - auto& segment_info = segments[segment]; - INFO("releasing {}, {}, num_segments(empty={}, opened={}, closed={})", - segment, segment_info, - num_empty, num_open, num_closed); - ceph_assert(segment_info.is_closed()); - auto type = segment_info.type; - assert(type != segment_type_t::NULL_SEG); - segment_info.set_empty(); - ceph_assert(num_closed > 0); - --num_closed; - ++num_empty; - if (type == segment_type_t::JOURNAL) { - ceph_assert(num_type_journal > 0); - --num_type_journal; - ++count_release_journal; - } else { - ceph_assert(num_type_ool > 0); - --num_type_ool; - ++count_release_ool; - } -} - -void segments_info_t::mark_closed( - segment_id_t segment) -{ - LOG_PREFIX(segments_info_t::mark_closed); - auto& segment_info = segments[segment]; - INFO("closing {}, {}, num_segments(empty={}, opened={}, closed={})", - segment, segment_info, - num_empty, num_open, num_closed); - ceph_assert(segment_info.is_open()); - segment_info.set_closed(); - ceph_assert(num_open > 0); - --num_open; - ++num_closed; - if (segment_info.type == segment_type_t::JOURNAL) { - ceph_assert(num_in_journal_open > 0); - --num_in_journal_open; - ++count_close_journal; - } else { - ++count_close_ool; - } - ceph_assert(get_segment_size() >= segment_info.written_to); - auto seg_avail_bytes = get_segment_size() - segment_info.written_to; - ceph_assert(avail_bytes_in_open >= seg_avail_bytes); - avail_bytes_in_open -= seg_avail_bytes; -} - -void segments_info_t::update_written_to( - segment_type_t type, - paddr_t offset) -{ - LOG_PREFIX(segments_info_t::update_written_to); - auto& saddr = offset.as_seg_paddr(); - auto& segment_info = segments[saddr.get_segment_id()]; - if (!segment_info.is_open()) { - ERROR("segment is not open, not updating, type={}, offset={}, {}", - type, offset, segment_info); - ceph_abort(); - } - - auto new_written_to = static_cast(saddr.get_segment_off()); - ceph_assert(new_written_to <= get_segment_size()); - if (segment_info.written_to > new_written_to) { - ERROR("written_to should not decrease! type={}, offset={}, {}", - type, offset, segment_info); - ceph_abort(); - } - - DEBUG("type={}, offset={}, {}", type, offset, segment_info); - ceph_assert(type == segment_info.type); - auto avail_deduction = new_written_to - segment_info.written_to; - ceph_assert(avail_bytes_in_open >= avail_deduction); - avail_bytes_in_open -= avail_deduction; - segment_info.written_to = new_written_to; -} - -bool SpaceTrackerSimple::equals(const SpaceTrackerI &_other) const -{ - LOG_PREFIX(SpaceTrackerSimple::equals); - const auto &other = static_cast(_other); - - if (other.live_bytes_by_segment.size() != live_bytes_by_segment.size()) { - ERROR("different segment counts, bug in test"); - assert(0 == "segment counts should match"); - return false; - } - - bool all_match = true; - for (auto i = live_bytes_by_segment.begin(), j = other.live_bytes_by_segment.begin(); - i != live_bytes_by_segment.end(); ++i, ++j) { - if (i->second.live_bytes != j->second.live_bytes) { - all_match = false; - DEBUG("segment_id {} live bytes mismatch *this: {}, other: {}", - i->first, i->second.live_bytes, j->second.live_bytes); - } - } - return all_match; -} - -int64_t SpaceTrackerDetailed::SegmentMap::allocate( - device_segment_id_t segment, - seastore_off_t offset, - extent_len_t len, - const extent_len_t block_size) -{ - LOG_PREFIX(SegmentMap::allocate); - assert(offset % block_size == 0); - assert(len % block_size == 0); - - const auto b = (offset / block_size); - const auto e = (offset + len) / block_size; - - bool error = false; - for (auto i = b; i < e; ++i) { - if (bitmap[i]) { - if (!error) { - ERROR("found allocated in {}, {} ~ {}", segment, offset, len); - error = true; - } - DEBUG("block {} allocated", i * block_size); - } - bitmap[i] = true; - } - return update_usage(len); -} - -int64_t SpaceTrackerDetailed::SegmentMap::release( - device_segment_id_t segment, - seastore_off_t offset, - extent_len_t len, - const extent_len_t block_size) -{ - LOG_PREFIX(SegmentMap::release); - assert(offset % block_size == 0); - assert(len % block_size == 0); - - const auto b = (offset / block_size); - const auto e = (offset + len) / block_size; - - bool error = false; - for (auto i = b; i < e; ++i) { - if (!bitmap[i]) { - if (!error) { - ERROR("found unallocated in {}, {} ~ {}", segment, offset, len); - error = true; - } - DEBUG("block {} unallocated", i * block_size); - } - bitmap[i] = false; - } - return update_usage(-(int64_t)len); -} - -bool SpaceTrackerDetailed::equals(const SpaceTrackerI &_other) const -{ - LOG_PREFIX(SpaceTrackerDetailed::equals); - const auto &other = static_cast(_other); - - if (other.segment_usage.size() != segment_usage.size()) { - ERROR("different segment counts, bug in test"); - assert(0 == "segment counts should match"); - return false; - } - - bool all_match = true; - for (auto i = segment_usage.begin(), j = other.segment_usage.begin(); - i != segment_usage.end(); ++i, ++j) { - if (i->second.get_usage() != j->second.get_usage()) { - all_match = false; - ERROR("segment_id {} live bytes mismatch *this: {}, other: {}", - i->first, i->second.get_usage(), j->second.get_usage()); - } - } - return all_match; -} - -void SpaceTrackerDetailed::SegmentMap::dump_usage(extent_len_t block_size) const -{ - LOG_PREFIX(SegmentMap::dump_usage); - INFO("dump start"); - for (unsigned i = 0; i < bitmap.size(); ++i) { - if (bitmap[i]) { - LOCAL_LOGGER.info(" {} still live", i * block_size); - } - } -} - -void SpaceTrackerDetailed::dump_usage(segment_id_t id) const -{ - LOG_PREFIX(SpaceTrackerDetailed::dump_usage); - INFO("{}", id); - segment_usage[id].dump_usage( - block_size_by_segment_manager[id.device_id()]); -} - -void SpaceTrackerSimple::dump_usage(segment_id_t id) const -{ - LOG_PREFIX(SpaceTrackerSimple::dump_usage); - INFO("id: {}, live_bytes: {}", - id, live_bytes_by_segment[id].live_bytes); -} - -SegmentCleaner::SegmentCleaner( - config_t config, - SegmentManagerGroupRef&& sm_group, - BackrefManager &backref_manager, - bool detailed) - : detailed(detailed), - config(config), - sm_group(std::move(sm_group)), - backref_manager(backref_manager), - ool_segment_seq_allocator( - new SegmentSeqAllocator(segment_type_t::OOL)), - gc_process(*this) -{ - config.validate(); -} - -void SegmentCleaner::register_metrics() -{ - namespace sm = seastar::metrics; - stats.segment_util.buckets.resize(UTIL_BUCKETS); - std::size_t i; - for (i = 0; i < UTIL_BUCKETS; ++i) { - stats.segment_util.buckets[i].upper_bound = ((double)(i + 1)) / 10; - stats.segment_util.buckets[i].count = 0; - } - // NOTE: by default the segments are empty - i = get_bucket_index(UTIL_STATE_EMPTY); - stats.segment_util.buckets[i].count = segments.get_num_segments(); - - metrics.add_group("segment_cleaner", { - sm::make_counter("segments_number", - [this] { return segments.get_num_segments(); }, - sm::description("the number of segments")), - sm::make_counter("segment_size", - [this] { return segments.get_segment_size(); }, - sm::description("the bytes of a segment")), - sm::make_counter("segments_in_journal", - [this] { return get_segments_in_journal(); }, - sm::description("the number of segments in journal")), - sm::make_counter("segments_type_journal", - [this] { return segments.get_num_type_journal(); }, - sm::description("the number of segments typed journal")), - sm::make_counter("segments_type_ool", - [this] { return segments.get_num_type_ool(); }, - sm::description("the number of segments typed out-of-line")), - sm::make_counter("segments_open", - [this] { return segments.get_num_open(); }, - sm::description("the number of open segments")), - sm::make_counter("segments_empty", - [this] { return segments.get_num_empty(); }, - sm::description("the number of empty segments")), - sm::make_counter("segments_closed", - [this] { return segments.get_num_closed(); }, - sm::description("the number of closed segments")), - - sm::make_counter("segments_count_open_journal", - [this] { return segments.get_count_open_journal(); }, - sm::description("the count of open journal segment operations")), - sm::make_counter("segments_count_open_ool", - [this] { return segments.get_count_open_ool(); }, - sm::description("the count of open ool segment operations")), - sm::make_counter("segments_count_release_journal", - [this] { return segments.get_count_release_journal(); }, - sm::description("the count of release journal segment operations")), - sm::make_counter("segments_count_release_ool", - [this] { return segments.get_count_release_ool(); }, - sm::description("the count of release ool segment operations")), - sm::make_counter("segments_count_close_journal", - [this] { return segments.get_count_close_journal(); }, - sm::description("the count of close journal segment operations")), - sm::make_counter("segments_count_close_ool", - [this] { return segments.get_count_close_ool(); }, - sm::description("the count of close ool segment operations")), - - sm::make_counter("total_bytes", - [this] { return segments.get_total_bytes(); }, - sm::description("the size of the space")), - sm::make_counter("available_bytes", - [this] { return segments.get_available_bytes(); }, - sm::description("the size of the space is available")), - sm::make_counter("unavailable_unreclaimable_bytes", - [this] { return get_unavailable_unreclaimable_bytes(); }, - sm::description("the size of the space is unavailable and unreclaimable")), - sm::make_counter("unavailable_reclaimable_bytes", - [this] { return get_unavailable_reclaimable_bytes(); }, - sm::description("the size of the space is unavailable and reclaimable")), - sm::make_counter("used_bytes", stats.used_bytes, - sm::description("the size of the space occupied by live extents")), - sm::make_counter("unavailable_unused_bytes", - [this] { return get_unavailable_unused_bytes(); }, - sm::description("the size of the space is unavailable and not alive")), - - sm::make_counter("dirty_journal_bytes", - [this] { return get_dirty_journal_size(); }, - sm::description("the size of the journal for dirty extents")), - sm::make_counter("alloc_journal_bytes", - [this] { return get_alloc_journal_size(); }, - sm::description("the size of the journal for alloc info")), - - sm::make_counter("projected_count", stats.projected_count, - sm::description("the number of projected usage reservations")), - sm::make_counter("projected_used_bytes_sum", stats.projected_used_bytes_sum, - sm::description("the sum of the projected usage in bytes")), - - sm::make_counter("io_count", stats.io_count, - sm::description("the sum of IOs")), - sm::make_counter("io_blocked_count", stats.io_blocked_count, - sm::description("IOs that are blocked by gc")), - sm::make_counter("io_blocked_count_trim", stats.io_blocked_count_trim, - sm::description("IOs that are blocked by trimming")), - sm::make_counter("io_blocked_count_reclaim", stats.io_blocked_count_reclaim, - sm::description("IOs that are blocked by reclaimming")), - sm::make_counter("io_blocked_sum", stats.io_blocked_sum, - sm::description("the sum of blocking IOs")), - - sm::make_counter("reclaimed_bytes", stats.reclaimed_bytes, - sm::description("rewritten bytes due to reclaim")), - sm::make_counter("reclaimed_segment_bytes", stats.reclaimed_segment_bytes, - sm::description("rewritten bytes due to reclaim")), - sm::make_counter("closed_journal_used_bytes", stats.closed_journal_used_bytes, - sm::description("used bytes when close a journal segment")), - sm::make_counter("closed_journal_total_bytes", stats.closed_journal_total_bytes, - sm::description("total bytes of closed journal segments")), - sm::make_counter("closed_ool_used_bytes", stats.closed_ool_used_bytes, - sm::description("used bytes when close a ool segment")), - sm::make_counter("closed_ool_total_bytes", stats.closed_ool_total_bytes, - sm::description("total bytes of closed ool segments")), - - sm::make_gauge("available_ratio", - [this] { return segments.get_available_ratio(); }, - sm::description("ratio of available space to total space")), - sm::make_gauge("reclaim_ratio", - [this] { return get_reclaim_ratio(); }, - sm::description("ratio of reclaimable space to unavailable space")), - - sm::make_histogram("segment_utilization_distribution", - [this]() -> seastar::metrics::histogram& { - return stats.segment_util; - }, - sm::description("utilization distribution of all segments")) - }); -} - -segment_id_t SegmentCleaner::allocate_segment( - segment_seq_t seq, - segment_type_t type) -{ - LOG_PREFIX(SegmentCleaner::allocate_segment); - assert(seq != NULL_SEG_SEQ); - for (auto it = segments.begin(); - it != segments.end(); - ++it) { - auto seg_id = it->first; - auto& segment_info = it->second; - if (segment_info.is_empty()) { - auto old_usage = calc_utilization(seg_id); - segments.mark_open(seg_id, seq, type); - auto new_usage = calc_utilization(seg_id); - adjust_segment_util(old_usage, new_usage); - INFO("opened, should_block_on_gc {}, projected_avail_ratio {}, " - "reclaim_ratio {}", - should_block_on_gc(), - get_projected_available_ratio(), - get_reclaim_ratio()); - return seg_id; - } - } - ERROR("out of space with segment_seq={}", segment_seq_printer_t{seq}); - ceph_abort(); - return NULL_SEG_ID; -} - -void SegmentCleaner::update_journal_tail_target( - journal_seq_t dirty_replay_from, - journal_seq_t alloc_replay_from) -{ - LOG_PREFIX(SegmentCleaner::update_journal_tail_target); - if (disable_trim) return; - assert(dirty_replay_from.offset.get_addr_type() != addr_types_t::RANDOM_BLOCK); - assert(alloc_replay_from.offset.get_addr_type() != addr_types_t::RANDOM_BLOCK); - if (dirty_extents_replay_from == JOURNAL_SEQ_NULL - || dirty_replay_from > dirty_extents_replay_from) { - DEBUG("dirty_extents_replay_from={} => {}", - dirty_extents_replay_from, dirty_replay_from); - dirty_extents_replay_from = dirty_replay_from; - } - - update_alloc_info_replay_from(alloc_replay_from); - - journal_seq_t target = std::min(dirty_replay_from, alloc_replay_from); - ceph_assert(target != JOURNAL_SEQ_NULL); - auto journal_head = segments.get_journal_head(); - ceph_assert(journal_head == JOURNAL_SEQ_NULL || - journal_head >= target); - if (journal_tail_target == JOURNAL_SEQ_NULL || - target > journal_tail_target) { - if (!init_complete || - journal_tail_target.segment_seq == target.segment_seq) { - DEBUG("journal_tail_target={} => {}", journal_tail_target, target); - } else { - INFO("journal_tail_target={} => {}", journal_tail_target, target); - } - journal_tail_target = target; - } - gc_process.maybe_wake_on_space_used(); - maybe_wake_gc_blocked_io(); -} - -void SegmentCleaner::update_alloc_info_replay_from( - journal_seq_t alloc_replay_from) -{ - LOG_PREFIX(SegmentCleaner::update_alloc_info_replay_from); - if (alloc_info_replay_from == JOURNAL_SEQ_NULL - || alloc_replay_from > alloc_info_replay_from) { - DEBUG("alloc_info_replay_from={} => {}", - alloc_info_replay_from, alloc_replay_from); - alloc_info_replay_from = alloc_replay_from; - } -} - -void SegmentCleaner::update_journal_tail_committed(journal_seq_t committed) -{ - LOG_PREFIX(SegmentCleaner::update_journal_tail_committed); - assert(committed.offset.get_addr_type() != addr_types_t::RANDOM_BLOCK); - if (committed == JOURNAL_SEQ_NULL) { - return; - } - auto journal_head = segments.get_journal_head(); - ceph_assert(journal_head == JOURNAL_SEQ_NULL || - journal_head >= committed); - - if (journal_tail_committed == JOURNAL_SEQ_NULL || - committed > journal_tail_committed) { - DEBUG("update journal_tail_committed={} => {}", - journal_tail_committed, committed); - journal_tail_committed = committed; - } - if (journal_tail_target == JOURNAL_SEQ_NULL || - committed > journal_tail_target) { - DEBUG("update journal_tail_target={} => {}", - journal_tail_target, committed); - journal_tail_target = committed; - } -} - -void SegmentCleaner::close_segment(segment_id_t segment) -{ - LOG_PREFIX(SegmentCleaner::close_segment); - auto old_usage = calc_utilization(segment); - segments.mark_closed(segment); - auto &seg_info = segments[segment]; - if (seg_info.type == segment_type_t::JOURNAL) { - stats.closed_journal_used_bytes += space_tracker->get_usage(segment); - stats.closed_journal_total_bytes += segments.get_segment_size(); - } else { - stats.closed_ool_used_bytes += space_tracker->get_usage(segment); - stats.closed_ool_total_bytes += segments.get_segment_size(); - } - auto new_usage = calc_utilization(segment); - adjust_segment_util(old_usage, new_usage); - INFO("closed, should_block_on_gc {}, projected_avail_ratio {}, " - "reclaim_ratio {}", - should_block_on_gc(), - get_projected_available_ratio(), - get_reclaim_ratio()); -} - -SegmentCleaner::trim_backrefs_ret SegmentCleaner::trim_backrefs( - Transaction &t, - journal_seq_t limit) -{ - return backref_manager.merge_cached_backrefs( - t, - limit, - config.rewrite_backref_bytes_per_cycle - ); -} - -SegmentCleaner::rewrite_dirty_ret SegmentCleaner::rewrite_dirty( - Transaction &t, - journal_seq_t limit) -{ - return ecb->get_next_dirty_extents( - t, - limit, - config.rewrite_dirty_bytes_per_cycle - ).si_then([=, &t](auto dirty_list) { - LOG_PREFIX(SegmentCleaner::rewrite_dirty); - DEBUGT("rewrite {} dirty extents", t, dirty_list.size()); - return seastar::do_with( - std::move(dirty_list), - [this, FNAME, &t](auto &dirty_list) { - return trans_intr::do_for_each( - dirty_list, - [this, FNAME, &t](auto &e) { - DEBUGT("cleaning {}", t, *e); - return ecb->rewrite_extent(t, e); - }); - }); - }); -} - -SegmentCleaner::gc_cycle_ret SegmentCleaner::GCProcess::run() -{ - return seastar::do_until( - [this] { return is_stopping(); }, - [this] { - return maybe_wait_should_run( - ).then([this] { - cleaner.log_gc_state("GCProcess::run"); - - if (is_stopping()) { - return seastar::now(); - } else { - return cleaner.do_gc_cycle(); - } - }); - }); -} - -SegmentCleaner::gc_cycle_ret SegmentCleaner::do_gc_cycle() -{ - if (gc_should_trim_journal()) { - return gc_trim_journal( - ).handle_error( - crimson::ct_error::assert_all{ - "GCProcess::run encountered invalid error in gc_trim_journal" - } - ); - } else if (gc_should_trim_backref()) { - return gc_trim_backref(get_backref_tail() - ).safe_then([](auto) { - return seastar::now(); - }).handle_error( - crimson::ct_error::assert_all{ - "GCProcess::run encountered invalid error in gc_trim_backref" - } - ); - } else if (gc_should_reclaim_space()) { - return gc_reclaim_space( - ).handle_error( - crimson::ct_error::assert_all{ - "GCProcess::run encountered invalid error in gc_reclaim_space" - } - ); - } else { - return seastar::now(); - } -} - -SegmentCleaner::gc_trim_backref_ret -SegmentCleaner::gc_trim_backref(journal_seq_t limit) { - return seastar::do_with( - journal_seq_t(), - [this, limit=std::move(limit)](auto &seq) mutable { - return repeat_eagain([this, limit=std::move(limit), &seq] { - return ecb->with_transaction_intr( - Transaction::src_t::TRIM_BACKREF, - "trim_backref", - [this, limit](auto &t) { - return trim_backrefs( - t, - limit - ).si_then([this, &t, limit](auto trim_backrefs_to) - -> ExtentCallbackInterface::submit_transaction_direct_iertr::future< - journal_seq_t> { - if (trim_backrefs_to != JOURNAL_SEQ_NULL) { - return ecb->submit_transaction_direct( - t, std::make_optional(trim_backrefs_to) - ).si_then([trim_backrefs_to=std::move(trim_backrefs_to)]() mutable { - return seastar::make_ready_future< - journal_seq_t>(std::move(trim_backrefs_to)); - }); - } - return seastar::make_ready_future(std::move(limit)); - }); - }).safe_then([&seq](auto trim_backrefs_to) { - seq = std::move(trim_backrefs_to); - }); - }).safe_then([&seq] { - return gc_trim_backref_ertr::make_ready_future< - journal_seq_t>(std::move(seq)); - }); - }); -} - -SegmentCleaner::gc_trim_journal_ret SegmentCleaner::gc_trim_journal() -{ - return gc_trim_backref(get_dirty_tail() - ).safe_then([this](auto seq) { - return repeat_eagain([this, seq=std::move(seq)]() mutable { - return ecb->with_transaction_intr( - Transaction::src_t::CLEANER_TRIM, - "trim_journal", - [this, seq=std::move(seq)](auto& t) - { - return rewrite_dirty(t, seq - ).si_then([this, &t] { - return ecb->submit_transaction_direct(t); - }); - }); - }); - }); -} - -SegmentCleaner::retrieve_live_extents_ret -SegmentCleaner::_retrieve_live_extents( - Transaction &t, - std::set< - backref_buf_entry_t, - backref_buf_entry_t::cmp_t> &&backrefs, - std::vector &extents) -{ - return seastar::do_with( - JOURNAL_SEQ_NULL, - std::move(backrefs), - [this, &t, &extents](auto &seq, auto &backrefs) { - return trans_intr::parallel_for_each( - backrefs, - [this, &extents, &t, &seq](auto &ent) { - LOG_PREFIX(SegmentCleaner::_retrieve_live_extents); - DEBUGT("getting extent of type {} at {}~{}", - t, - ent.type, - ent.paddr, - ent.len); - return ecb->get_extent_if_live( - t, ent.type, ent.paddr, ent.laddr, ent.len - ).si_then([this, FNAME, &extents, &ent, &seq, &t](auto ext) { - if (!ext) { - DEBUGT("addr {} dead, skipping", t, ent.paddr); - auto backref = backref_manager.get_cached_backref_removal(ent.paddr); - if (seq == JOURNAL_SEQ_NULL || seq < backref.seq) { - seq = backref.seq; - } - } else { - extents.emplace_back(std::move(ext)); - } - return ExtentCallbackInterface::rewrite_extent_iertr::now(); - }); - }).si_then([&seq] { - return retrieve_live_extents_iertr::make_ready_future< - journal_seq_t>(std::move(seq)); - }); - }); -} - -SegmentCleaner::retrieve_backref_mappings_ret -SegmentCleaner::retrieve_backref_mappings( - paddr_t start_paddr, - paddr_t end_paddr) -{ - return seastar::do_with( - backref_pin_list_t(), - [this, start_paddr, end_paddr](auto &pin_list) { - return repeat_eagain([this, start_paddr, end_paddr, &pin_list] { - return ecb->with_transaction_intr( - Transaction::src_t::READ, - "get_backref_mappings", - [this, start_paddr, end_paddr](auto &t) { - return backref_manager.get_mappings( - t, start_paddr, end_paddr - ); - }).safe_then([&pin_list](auto&& list) { - pin_list = std::move(list); - }); - }).safe_then([&pin_list] { - return seastar::make_ready_future(std::move(pin_list)); - }); - }); -} - -SegmentCleaner::gc_reclaim_space_ret SegmentCleaner::gc_reclaim_space() -{ - LOG_PREFIX(SegmentCleaner::gc_reclaim_space); - if (!reclaim_state) { - segment_id_t seg_id = get_next_reclaim_segment(); - auto &segment_info = segments[seg_id]; - INFO("reclaim {} {} start", seg_id, segment_info); - ceph_assert(segment_info.is_closed()); - reclaim_state = reclaim_state_t::create( - seg_id, segments.get_segment_size()); - } - reclaim_state->advance(config.reclaim_bytes_per_cycle); - - DEBUG("reclaiming {}~{}", - reclaim_state->start_pos, - reclaim_state->end_pos); - double pavail_ratio = get_projected_available_ratio(); - seastar::lowres_system_clock::time_point start = seastar::lowres_system_clock::now(); - - return seastar::do_with( - (size_t)0, - (size_t)0, - [this, pavail_ratio, start]( - auto &reclaimed, - auto &runs) { - return retrieve_backref_mappings( - reclaim_state->start_pos, - reclaim_state->end_pos - ).safe_then([this, &reclaimed, &runs](auto pin_list) { - return seastar::do_with( - std::move(pin_list), - [this, &reclaimed, &runs](auto &pin_list) { - return repeat_eagain( - [this, &reclaimed, &runs, &pin_list]() mutable { - reclaimed = 0; - runs++; - return seastar::do_with( - backref_manager.get_cached_backref_extents_in_range( - reclaim_state->start_pos, reclaim_state->end_pos), - backref_manager.get_cached_backrefs_in_range( - reclaim_state->start_pos, reclaim_state->end_pos), - backref_manager.get_cached_backref_removals_in_range( - reclaim_state->start_pos, reclaim_state->end_pos), - JOURNAL_SEQ_NULL, - [this, &reclaimed, &pin_list]( - auto &backref_extents, - auto &backrefs, - auto &del_backrefs, - auto &seq) { - return ecb->with_transaction_intr( - Transaction::src_t::CLEANER_RECLAIM, - "reclaim_space", - [this, &backref_extents, &backrefs, &seq, - &del_backrefs, &reclaimed, &pin_list](auto &t) { - LOG_PREFIX(SegmentCleaner::gc_reclaim_space); - DEBUGT("{} backrefs, {} del_backrefs, {} pins", t, - backrefs.size(), del_backrefs.size(), pin_list.size()); - for (auto &br : backrefs) { - if (seq == JOURNAL_SEQ_NULL - || (br.seq != JOURNAL_SEQ_NULL && br.seq > seq)) - seq = br.seq; - } - for (auto &pin : pin_list) { - backrefs.emplace( - pin->get_key(), - pin->get_val(), - pin->get_length(), - pin->get_type(), - journal_seq_t()); - } - for (auto &del_backref : del_backrefs) { - DEBUGT("del_backref {}~{} {} {}", t, - del_backref.paddr, del_backref.len, del_backref.type, del_backref.seq); - auto it = backrefs.find(del_backref.paddr); - if (it != backrefs.end()) - backrefs.erase(it); - if (seq == JOURNAL_SEQ_NULL - || (del_backref.seq != JOURNAL_SEQ_NULL && del_backref.seq > seq)) - seq = del_backref.seq; - } - return seastar::do_with( - std::vector(), - [this, &backref_extents, &backrefs, &reclaimed, &t, &seq] - (auto &extents) { - return backref_manager.retrieve_backref_extents( - t, std::move(backref_extents), extents - ).si_then([this, &extents, &t, &backrefs] { - return _retrieve_live_extents( - t, std::move(backrefs), extents); - }).si_then([this, &seq, &t](auto nseq) { - if (nseq != JOURNAL_SEQ_NULL && - (nseq > seq || seq == JOURNAL_SEQ_NULL)) - seq = nseq; - auto fut = BackrefManager::merge_cached_backrefs_iertr::now(); - if (seq != JOURNAL_SEQ_NULL) { - fut = backref_manager.merge_cached_backrefs( - t, seq, std::numeric_limits::max() - ).si_then([](auto) { - return BackrefManager::merge_cached_backrefs_iertr::now(); - }); - } - return fut; - }).si_then([&extents, this, &t, &reclaimed] { - return trans_intr::do_for_each( - extents, - [this, &t, &reclaimed](auto &ext) { - reclaimed += ext->get_length(); - return ecb->rewrite_extent(t, ext); - }); - }); - }).si_then([this, &t, &seq] { - if (reclaim_state->is_complete()) { - t.mark_segment_to_release(reclaim_state->get_segment_id()); - } - return ecb->submit_transaction_direct( - t, std::make_optional(std::move(seq))); - }); - }); - }); - }); - }); - }).safe_then( - [&reclaimed, this, pavail_ratio, start, &runs] { - LOG_PREFIX(SegmentCleaner::gc_reclaim_space); -#ifndef NDEBUG - auto ndel_backrefs = - backref_manager.get_cached_backref_removals_in_range( - reclaim_state->start_pos, reclaim_state->end_pos); - if (!ndel_backrefs.empty()) { - for (auto &del_br : ndel_backrefs) { - ERROR("unexpected del_backref {}~{} {} {}", - del_br.paddr, del_br.len, del_br.type, del_br.seq); - } - ceph_abort("impossible"); - } -#endif - stats.reclaiming_bytes += reclaimed; - auto d = seastar::lowres_system_clock::now() - start; - DEBUG("duration: {}, pavail_ratio before: {}, repeats: {}", d, pavail_ratio, runs); - if (reclaim_state->is_complete()) { - INFO("reclaim {} finish, alive/total={}", - reclaim_state->get_segment_id(), - stats.reclaiming_bytes/(double)segments.get_segment_size()); - stats.reclaimed_bytes += stats.reclaiming_bytes; - stats.reclaimed_segment_bytes += segments.get_segment_size(); - stats.reclaiming_bytes = 0; - reclaim_state.reset(); - } - }); - }); -} - -SegmentCleaner::mount_ret SegmentCleaner::mount() -{ - LOG_PREFIX(SegmentCleaner::mount); - const auto& sms = sm_group->get_segment_managers(); - INFO("{} segment managers", sms.size()); - init_complete = false; - stats = {}; - journal_tail_target = JOURNAL_SEQ_NULL; - journal_tail_committed = JOURNAL_SEQ_NULL; - dirty_extents_replay_from = JOURNAL_SEQ_NULL; - alloc_info_replay_from = JOURNAL_SEQ_NULL; - - space_tracker.reset( - detailed ? - (SpaceTrackerI*)new SpaceTrackerDetailed( - sms) : - (SpaceTrackerI*)new SpaceTrackerSimple( - sms)); - - segments.reset(); - for (auto sm : sms) { - segments.add_segment_manager(*sm); - } - metrics.clear(); - register_metrics(); - - INFO("{} segments", segments.get_num_segments()); - return seastar::do_with( - std::vector>(), - [this, FNAME](auto& segment_set) { - return crimson::do_for_each( - segments.begin(), - segments.end(), - [this, FNAME, &segment_set](auto& it) { - auto segment_id = it.first; - return sm_group->read_segment_header( - segment_id - ).safe_then([segment_id, this, FNAME, &segment_set](auto header) { - INFO("segment_id={} -- {}", segment_id, header); - auto s_type = header.get_type(); - if (s_type == segment_type_t::NULL_SEG) { - ERROR("got null segment, segment_id={} -- {}", segment_id, header); - ceph_abort(); - } - return sm_group->read_segment_tail( - segment_id - ).safe_then([this, segment_id, &segment_set, header](auto tail) - -> scan_extents_ertr::future<> { - if (tail.segment_nonce != header.segment_nonce) { - return scan_nonfull_segment(header, segment_set, segment_id); - } - time_point last_modified(duration(tail.last_modified)); - time_point last_rewritten(duration(tail.last_rewritten)); - segments.update_last_modified_rewritten( - segment_id, last_modified, last_rewritten); - if (tail.get_type() == segment_type_t::JOURNAL) { - update_journal_tail_committed(tail.journal_tail); - update_journal_tail_target( - tail.journal_tail, - tail.alloc_replay_from); - } - init_mark_segment_closed( - segment_id, - header.segment_seq, - header.type); - return seastar::now(); - }).handle_error( - crimson::ct_error::enodata::handle( - [this, header, segment_id, &segment_set](auto) { - return scan_nonfull_segment(header, segment_set, segment_id); - }), - crimson::ct_error::pass_further_all{} - ); - }).handle_error( - crimson::ct_error::enoent::handle([](auto) { - return mount_ertr::now(); - }), - crimson::ct_error::enodata::handle([](auto) { - return mount_ertr::now(); - }), - crimson::ct_error::input_output_error::pass_further{}, - crimson::ct_error::assert_all{"unexpected error"} - ); - }); - }); -} - -SegmentCleaner::scan_extents_ret SegmentCleaner::scan_nonfull_segment( - const segment_header_t& header, - scan_extents_ret_bare& segment_set, - segment_id_t segment_id) -{ - return seastar::do_with( - scan_valid_records_cursor({ - segments[segment_id].seq, - paddr_t::make_seg_paddr(segment_id, 0)}), - [this, segment_id, segment_header=header](auto& cursor) { - return seastar::do_with( - SegmentManagerGroup::found_record_handler_t( - [this, segment_id, segment_header]( - record_locator_t locator, - const record_group_header_t& header, - const bufferlist& mdbuf - ) mutable -> SegmentManagerGroup::scan_valid_records_ertr::future<> { - LOG_PREFIX(SegmentCleaner::scan_nonfull_segment); - if (segment_header.get_type() == segment_type_t::OOL) { - DEBUG("out-of-line segment {}, decodeing {} records", - segment_id, - header.records); - auto maybe_headers = try_decode_record_headers(header, mdbuf); - if (!maybe_headers) { - ERROR("unable to decode record headers for record group {}", - locator.record_block_base); - return crimson::ct_error::input_output_error::make(); - } - - for (auto& header : *maybe_headers) { - mod_time_point_t ctime = header.commit_time; - auto commit_type = header.commit_type; - if (!ctime) { - ERROR("SegmentCleaner::scan_nonfull_segment: extent {} 0 commit_time", - ctime); - ceph_abort("0 commit_time"); - } - time_point commit_time{duration(ctime)}; - assert(commit_type == record_commit_type_t::MODIFY - || commit_type == record_commit_type_t::REWRITE); - if (commit_type == record_commit_type_t::MODIFY) { - segments.update_last_modified_rewritten(segment_id, commit_time, {}); - } - if (commit_type == record_commit_type_t::REWRITE) { - segments.update_last_modified_rewritten(segment_id, {}, commit_time); - } - } - } else { - DEBUG("inline segment {}, decodeing {} records", - segment_id, - header.records); - auto maybe_record_deltas_list = try_decode_deltas( - header, mdbuf, locator.record_block_base); - if (!maybe_record_deltas_list) { - ERROR("unable to decode deltas for record {} at {}", - header, locator); - return crimson::ct_error::input_output_error::make(); - } - for (auto &record_deltas : *maybe_record_deltas_list) { - for (auto &[ctime, delta] : record_deltas.deltas) { - if (delta.type == extent_types_t::ALLOC_TAIL) { - journal_seq_t seq; - decode(seq, delta.bl); - update_alloc_info_replay_from(seq); - } - } - } - } - return seastar::now(); - }), - [&cursor, segment_header, this](auto& handler) { - return sm_group->scan_valid_records( - cursor, - segment_header.segment_nonce, - segments.get_segment_size(), - handler); - } - ); - }).safe_then([this, segment_id, header](auto) { - init_mark_segment_closed( - segment_id, - header.segment_seq, - header.type); - return seastar::now(); - }); -} - -SegmentCleaner::release_ertr::future<> -SegmentCleaner::maybe_release_segment(Transaction &t) -{ - auto to_release = t.get_segment_to_release(); - if (to_release != NULL_SEG_ID) { - LOG_PREFIX(SegmentCleaner::maybe_release_segment); - INFOT("releasing segment {}", t, to_release); - return sm_group->release_segment(to_release - ).safe_then([this, FNAME, &t, to_release] { - auto old_usage = calc_utilization(to_release); - ceph_assert(old_usage == 0); - segments.mark_empty(to_release); - auto new_usage = calc_utilization(to_release); - adjust_segment_util(old_usage, new_usage); - INFOT("released, should_block_on_gc {}, projected_avail_ratio {}, " - "reclaim_ratio {}", - t, - should_block_on_gc(), - get_projected_available_ratio(), - get_reclaim_ratio()); - if (space_tracker->get_usage(to_release) != 0) { - space_tracker->dump_usage(to_release); - ceph_abort(); - } - maybe_wake_gc_blocked_io(); - }); - } else { - return SegmentManager::release_ertr::now(); - } -} - -void SegmentCleaner::complete_init() -{ - LOG_PREFIX(SegmentCleaner::complete_init); - if (disable_trim) { - init_complete = true; - return; - } - INFO("done, start GC"); - ceph_assert(segments.get_journal_head() != JOURNAL_SEQ_NULL); - init_complete = true; - gc_process.start(); -} - -void SegmentCleaner::mark_space_used( - paddr_t addr, - extent_len_t len, - time_point last_modified, - time_point last_rewritten, - bool init_scan) -{ - LOG_PREFIX(SegmentCleaner::mark_space_used); - if (addr.get_addr_type() != addr_types_t::SEGMENT) { - return; - } - auto& seg_addr = addr.as_seg_paddr(); - - if (!init_scan && !init_complete) { - return; - } - - stats.used_bytes += len; - auto old_usage = calc_utilization(seg_addr.get_segment_id()); - [[maybe_unused]] auto ret = space_tracker->allocate( - seg_addr.get_segment_id(), - seg_addr.get_segment_off(), - len); - auto new_usage = calc_utilization(seg_addr.get_segment_id()); - adjust_segment_util(old_usage, new_usage); - - // use the last extent's last modified time for the calculation of the projected - // time the segments' live extents are to stay unmodified; this is an approximation - // of the sprite lfs' segment "age". - - segments.update_last_modified_rewritten( - seg_addr.get_segment_id(), last_modified, last_rewritten); - - gc_process.maybe_wake_on_space_used(); - assert(ret > 0); - DEBUG("segment {} new len: {}~{}, live_bytes: {}", - seg_addr.get_segment_id(), - addr, - len, - space_tracker->get_usage(seg_addr.get_segment_id())); -} - -void SegmentCleaner::mark_space_free( - paddr_t addr, - extent_len_t len) -{ - LOG_PREFIX(SegmentCleaner::mark_space_free); - if (!init_complete) { - return; - } - if (addr.get_addr_type() != addr_types_t::SEGMENT) { - return; - } - - ceph_assert(stats.used_bytes >= len); - stats.used_bytes -= len; - auto& seg_addr = addr.as_seg_paddr(); - - DEBUG("segment {} free len: {}~{}", - seg_addr.get_segment_id(), addr, len); - auto old_usage = calc_utilization(seg_addr.get_segment_id()); - [[maybe_unused]] auto ret = space_tracker->release( - seg_addr.get_segment_id(), - seg_addr.get_segment_off(), - len); - auto new_usage = calc_utilization(seg_addr.get_segment_id()); - adjust_segment_util(old_usage, new_usage); - maybe_wake_gc_blocked_io(); - assert(ret >= 0); - DEBUG("segment {} free len: {}~{}, live_bytes: {}", - seg_addr.get_segment_id(), - addr, - len, - space_tracker->get_usage(seg_addr.get_segment_id())); -} - -segment_id_t SegmentCleaner::get_next_reclaim_segment() const -{ - LOG_PREFIX(SegmentCleaner::get_next_reclaim_segment); - segment_id_t id = NULL_SEG_ID; - double max_benefit_cost = 0; - for (auto& [_id, segment_info] : segments) { - if (segment_info.is_closed() && - !segment_info.is_in_journal(journal_tail_committed)) { - double benefit_cost = calc_gc_benefit_cost(_id); - if (benefit_cost > max_benefit_cost) { - id = _id; - max_benefit_cost = benefit_cost; - } - } - } - if (id != NULL_SEG_ID) { - DEBUG("segment {}, benefit_cost {}", - id, max_benefit_cost); - return id; - } else { - ceph_assert(get_segments_reclaimable() == 0); - // see gc_should_reclaim_space() - ceph_abort("impossible!"); - return NULL_SEG_ID; - } -} - -void SegmentCleaner::log_gc_state(const char *caller) const -{ - LOG_PREFIX(SegmentCleaner::log_gc_state); - if (LOCAL_LOGGER.is_enabled(seastar::log_level::debug) && - !disable_trim) { - DEBUG( - "caller {}, " - "empty {}, " - "open {}, " - "closed {}, " - "in_journal {}, " - "total {}B, " - "available {}B, " - "unavailable {}B, " - "unavailable_used {}B, " - "unavailable_unused {}B; " - "reclaim_ratio {}, " - "available_ratio {}, " - "should_block_on_gc {}, " - "gc_should_reclaim_space {}, " - "journal_head {}, " - "journal_tail_target {}, " - "journal_tail_commit {}, " - "dirty_tail {}, " - "dirty_tail_limit {}, " - "gc_should_trim_journal {}, ", - caller, - segments.get_num_empty(), - segments.get_num_open(), - segments.get_num_closed(), - get_segments_in_journal(), - segments.get_total_bytes(), - segments.get_available_bytes(), - segments.get_unavailable_bytes(), - stats.used_bytes, - get_unavailable_unused_bytes(), - get_reclaim_ratio(), - segments.get_available_ratio(), - should_block_on_gc(), - gc_should_reclaim_space(), - segments.get_journal_head(), - journal_tail_target, - journal_tail_committed, - get_dirty_tail(), - get_dirty_tail_limit(), - gc_should_trim_journal() - ); - } -} - -seastar::future<> -SegmentCleaner::reserve_projected_usage(std::size_t projected_usage) -{ - if (disable_trim) { - return seastar::now(); - } - ceph_assert(init_complete); - // The pipeline configuration prevents another IO from entering - // prepare until the prior one exits and clears this. - ceph_assert(!blocked_io_wake); - ++stats.io_count; - bool is_blocked = false; - if (should_block_on_trim()) { - is_blocked = true; - ++stats.io_blocked_count_trim; - } - if (should_block_on_reclaim()) { - is_blocked = true; - ++stats.io_blocked_count_reclaim; - } - if (is_blocked) { - ++stats.io_blocking_num; - ++stats.io_blocked_count; - stats.io_blocked_sum += stats.io_blocking_num; - } - return seastar::do_until( - [this] { - log_gc_state("await_hard_limits"); - return !should_block_on_gc(); - }, - [this] { - blocked_io_wake = seastar::promise<>(); - return blocked_io_wake->get_future(); - } - ).then([this, projected_usage, is_blocked] { - ceph_assert(!blocked_io_wake); - stats.projected_used_bytes += projected_usage; - ++stats.projected_count; - stats.projected_used_bytes_sum += stats.projected_used_bytes; - if (is_blocked) { - assert(stats.io_blocking_num > 0); - --stats.io_blocking_num; - } - }); -} - -void SegmentCleaner::release_projected_usage(std::size_t projected_usage) -{ - if (disable_trim) return; - ceph_assert(init_complete); - ceph_assert(stats.projected_used_bytes >= projected_usage); - stats.projected_used_bytes -= projected_usage; - return maybe_wake_gc_blocked_io(); -} - -} diff --git a/src/crimson/os/seastore/segment_cleaner.h b/src/crimson/os/seastore/segment_cleaner.h deleted file mode 100644 index c28e7b3686ea..000000000000 --- a/src/crimson/os/seastore/segment_cleaner.h +++ /dev/null @@ -1,1296 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#pragma once - -#include -#include - -#include "common/ceph_time.h" - -#include "osd/osd_types.h" - -#include "crimson/os/seastore/backref_manager.h" -#include "crimson/os/seastore/cached_extent.h" -#include "crimson/os/seastore/seastore_types.h" -#include "crimson/os/seastore/segment_manager.h" -#include "crimson/os/seastore/segment_manager_group.h" -#include "crimson/os/seastore/transaction.h" -#include "crimson/os/seastore/segment_seq_allocator.h" - -namespace crimson::os::seastore { - -/* - * segment_info_t - * - * Maintains the tracked information for a segment. - * It is read-only outside segments_info_t. - */ -struct segment_info_t { - using time_point = seastar::lowres_system_clock::time_point; - - // segment_info_t is initiated as set_empty() - Segment::segment_state_t state = Segment::segment_state_t::EMPTY; - - // Will be non-null for any segments in the current journal - segment_seq_t seq = NULL_SEG_SEQ; - - segment_type_t type = segment_type_t::NULL_SEG; - - time_point last_modified; - time_point last_rewritten; - - std::size_t written_to = 0; - - bool is_in_journal(journal_seq_t tail_committed) const { - return type == segment_type_t::JOURNAL && - tail_committed.segment_seq <= seq; - } - - bool is_empty() const { - return state == Segment::segment_state_t::EMPTY; - } - - bool is_closed() const { - return state == Segment::segment_state_t::CLOSED; - } - - bool is_open() const { - return state == Segment::segment_state_t::OPEN; - } - - void init_closed(segment_seq_t, segment_type_t, std::size_t); - - void set_open(segment_seq_t, segment_type_t); - - void set_empty(); - - void set_closed(); - - void update_last_modified_rewritten( - time_point _last_modified, time_point _last_rewritten) { - if (_last_modified != time_point() && last_modified < _last_modified) { - last_modified = _last_modified; - } - if (_last_rewritten != time_point() && last_rewritten < _last_rewritten) { - last_rewritten = _last_rewritten; - } - } -}; - -std::ostream& operator<<(std::ostream&, const segment_info_t&); - -/* - * segments_info_t - * - * Keep track of all segments and related information. - */ -class segments_info_t { -public: - using time_point = seastar::lowres_system_clock::time_point; - - segments_info_t() { - reset(); - } - - const segment_info_t& operator[](segment_id_t id) const { - return segments[id]; - } - - auto begin() const { - return segments.begin(); - } - - auto end() const { - return segments.end(); - } - - std::size_t get_num_segments() const { - assert(segments.size() > 0); - return segments.size(); - } - std::size_t get_segment_size() const { - assert(segment_size > 0); - return segment_size; - } - std::size_t get_num_in_journal_open() const { - return num_in_journal_open; - } - std::size_t get_num_type_journal() const { - return num_type_journal; - } - std::size_t get_num_type_ool() const { - return num_type_ool; - } - std::size_t get_num_open() const { - return num_open; - } - std::size_t get_num_empty() const { - return num_empty; - } - std::size_t get_num_closed() const { - return num_closed; - } - std::size_t get_count_open_journal() const { - return count_open_journal; - } - std::size_t get_count_open_ool() const { - return count_open_ool; - } - std::size_t get_count_release_journal() const { - return count_release_journal; - } - std::size_t get_count_release_ool() const { - return count_release_ool; - } - std::size_t get_count_close_journal() const { - return count_close_journal; - } - std::size_t get_count_close_ool() const { - return count_close_ool; - } - - std::size_t get_total_bytes() const { - return total_bytes; - } - /// the available space that is writable, including in open segments - std::size_t get_available_bytes() const { - return num_empty * get_segment_size() + avail_bytes_in_open; - } - /// the unavailable space that is not writable - std::size_t get_unavailable_bytes() const { - assert(total_bytes >= get_available_bytes()); - return total_bytes - get_available_bytes(); - } - std::size_t get_available_bytes_in_open() const { - return avail_bytes_in_open; - } - double get_available_ratio() const { - return (double)get_available_bytes() / (double)total_bytes; - } - - journal_seq_t get_journal_head() const { - if (unlikely(journal_segment_id == NULL_SEG_ID)) { - return JOURNAL_SEQ_NULL; - } - auto &segment_info = segments[journal_segment_id]; - assert(!segment_info.is_empty()); - assert(segment_info.type == segment_type_t::JOURNAL); - assert(segment_info.seq != NULL_SEG_SEQ); - return journal_seq_t{ - segment_info.seq, - paddr_t::make_seg_paddr( - journal_segment_id, - segment_info.written_to) - }; - } - - void reset(); - - void add_segment_manager(SegmentManager &segment_manager); - - // initiate non-empty segments, the others are by default empty - void init_closed(segment_id_t, segment_seq_t, segment_type_t); - - void mark_open(segment_id_t, segment_seq_t, segment_type_t); - - void mark_empty(segment_id_t); - - void mark_closed(segment_id_t); - - void update_written_to(segment_type_t, paddr_t); - - void update_last_modified_rewritten( - segment_id_t id, time_point last_modified, time_point last_rewritten) { - segments[id].update_last_modified_rewritten(last_modified, last_rewritten); - } - -private: - // See reset() for member initialization - segment_map_t segments; - - std::size_t segment_size; - - segment_id_t journal_segment_id; - std::size_t num_in_journal_open; - std::size_t num_type_journal; - std::size_t num_type_ool; - - std::size_t num_open; - std::size_t num_empty; - std::size_t num_closed; - - std::size_t count_open_journal; - std::size_t count_open_ool; - std::size_t count_release_journal; - std::size_t count_release_ool; - std::size_t count_close_journal; - std::size_t count_close_ool; - - std::size_t total_bytes; - std::size_t avail_bytes_in_open; -}; - -/** - * Callback interface for managing available segments - */ -class SegmentProvider { -public: - virtual journal_seq_t get_journal_tail_target() const = 0; - - virtual const segment_info_t& get_seg_info(segment_id_t id) const = 0; - - virtual segment_id_t allocate_segment( - segment_seq_t seq, segment_type_t type) = 0; - - virtual journal_seq_t get_dirty_extents_replay_from() const = 0; - - virtual journal_seq_t get_alloc_info_replay_from() const = 0; - - virtual void close_segment(segment_id_t) = 0; - - virtual void update_journal_tail_committed(journal_seq_t tail_committed) = 0; - - virtual void update_segment_avail_bytes(segment_type_t, paddr_t) = 0; - - virtual SegmentManagerGroup* get_segment_manager_group() = 0; - - virtual ~SegmentProvider() {} -}; - -class SpaceTrackerI { -public: - virtual int64_t allocate( - segment_id_t segment, - seastore_off_t offset, - extent_len_t len) = 0; - - virtual int64_t release( - segment_id_t segment, - seastore_off_t offset, - extent_len_t len) = 0; - - virtual int64_t get_usage( - segment_id_t segment) const = 0; - - virtual bool equals(const SpaceTrackerI &other) const = 0; - - virtual std::unique_ptr make_empty() const = 0; - - virtual void dump_usage(segment_id_t) const = 0; - - virtual double calc_utilization(segment_id_t segment) const = 0; - - virtual void reset() = 0; - - virtual ~SpaceTrackerI() = default; -}; -using SpaceTrackerIRef = std::unique_ptr; - -class SpaceTrackerSimple : public SpaceTrackerI { - struct segment_bytes_t { - int64_t live_bytes = 0; - seastore_off_t total_bytes = 0; - }; - // Tracks live space for each segment - segment_map_t live_bytes_by_segment; - - int64_t update_usage(segment_id_t segment, int64_t delta) { - live_bytes_by_segment[segment].live_bytes += delta; - assert(live_bytes_by_segment[segment].live_bytes >= 0); - return live_bytes_by_segment[segment].live_bytes; - } -public: - SpaceTrackerSimple(const SpaceTrackerSimple &) = default; - SpaceTrackerSimple(const std::vector &sms) { - for (auto sm : sms) { - live_bytes_by_segment.add_device( - sm->get_device_id(), - sm->get_num_segments(), - {0, sm->get_segment_size()}); - } - } - - int64_t allocate( - segment_id_t segment, - seastore_off_t offset, - extent_len_t len) final { - return update_usage(segment, len); - } - - int64_t release( - segment_id_t segment, - seastore_off_t offset, - extent_len_t len) final { - return update_usage(segment, -(int64_t)len); - } - - int64_t get_usage(segment_id_t segment) const final { - return live_bytes_by_segment[segment].live_bytes; - } - - double calc_utilization(segment_id_t segment) const final { - auto& seg_bytes = live_bytes_by_segment[segment]; - return (double)seg_bytes.live_bytes / (double)seg_bytes.total_bytes; - } - - void dump_usage(segment_id_t) const final; - - void reset() final { - for (auto &i : live_bytes_by_segment) { - i.second = {0, 0}; - } - } - - SpaceTrackerIRef make_empty() const final { - auto ret = SpaceTrackerIRef(new SpaceTrackerSimple(*this)); - ret->reset(); - return ret; - } - - bool equals(const SpaceTrackerI &other) const; -}; - -class SpaceTrackerDetailed : public SpaceTrackerI { - class SegmentMap { - int64_t used = 0; - seastore_off_t total_bytes = 0; - std::vector bitmap; - - public: - SegmentMap( - size_t blocks, - seastore_off_t total_bytes) - : total_bytes(total_bytes), - bitmap(blocks, false) {} - - int64_t update_usage(int64_t delta) { - used += delta; - return used; - } - - int64_t allocate( - device_segment_id_t segment, - seastore_off_t offset, - extent_len_t len, - const extent_len_t block_size); - - int64_t release( - device_segment_id_t segment, - seastore_off_t offset, - extent_len_t len, - const extent_len_t block_size); - - int64_t get_usage() const { - return used; - } - - void dump_usage(extent_len_t block_size) const; - - double calc_utilization() const { - return (double)used / (double)total_bytes; - } - - void reset() { - used = 0; - for (auto &&i: bitmap) { - i = false; - } - } - }; - - // Tracks live space for each segment - segment_map_t segment_usage; - std::vector block_size_by_segment_manager; - -public: - SpaceTrackerDetailed(const SpaceTrackerDetailed &) = default; - SpaceTrackerDetailed(const std::vector &sms) - { - block_size_by_segment_manager.resize(DEVICE_ID_MAX, 0); - for (auto sm : sms) { - segment_usage.add_device( - sm->get_device_id(), - sm->get_num_segments(), - SegmentMap( - sm->get_segment_size() / sm->get_block_size(), - sm->get_segment_size())); - block_size_by_segment_manager[sm->get_device_id()] = sm->get_block_size(); - } - } - - int64_t allocate( - segment_id_t segment, - seastore_off_t offset, - extent_len_t len) final { - return segment_usage[segment].allocate( - segment.device_segment_id(), - offset, - len, - block_size_by_segment_manager[segment.device_id()]); - } - - int64_t release( - segment_id_t segment, - seastore_off_t offset, - extent_len_t len) final { - return segment_usage[segment].release( - segment.device_segment_id(), - offset, - len, - block_size_by_segment_manager[segment.device_id()]); - } - - int64_t get_usage(segment_id_t segment) const final { - return segment_usage[segment].get_usage(); - } - - double calc_utilization(segment_id_t segment) const final { - return segment_usage[segment].calc_utilization(); - } - - void dump_usage(segment_id_t seg) const final; - - void reset() final { - for (auto &i: segment_usage) { - i.second.reset(); - } - } - - SpaceTrackerIRef make_empty() const final { - auto ret = SpaceTrackerIRef(new SpaceTrackerDetailed(*this)); - ret->reset(); - return ret; - } - - bool equals(const SpaceTrackerI &other) const; -}; - - -class SegmentCleaner : public SegmentProvider { -public: - using time_point = seastar::lowres_system_clock::time_point; - using duration = seastar::lowres_system_clock::duration; - - /// Config - struct config_t { - /// Number of minimum journal segments to stop trimming. - size_t target_journal_segments = 0; - /// Number of maximum journal segments to block user transactions. - size_t max_journal_segments = 0; - - /// Number of journal segments the transactions in which can - /// have their corresponding backrefs unmerged - size_t target_backref_inflight_segments = 0; - - /// Ratio of maximum available space to disable reclaiming. - double available_ratio_gc_max = 0; - /// Ratio of minimum available space to force reclaiming. - double available_ratio_hard_limit = 0; - - /// Ratio of minimum reclaimable space to stop reclaiming. - double reclaim_ratio_gc_threshold = 0; - - /// Number of bytes to reclaim per cycle - size_t reclaim_bytes_per_cycle = 0; - - /// Number of bytes to rewrite dirty per cycle - size_t rewrite_dirty_bytes_per_cycle = 0; - - /// Number of bytes to rewrite backref per cycle - size_t rewrite_backref_bytes_per_cycle = 0; - - void validate() const { - ceph_assert(max_journal_segments > target_journal_segments); - ceph_assert(available_ratio_gc_max > available_ratio_hard_limit); - ceph_assert(reclaim_bytes_per_cycle > 0); - ceph_assert(rewrite_dirty_bytes_per_cycle > 0); - ceph_assert(rewrite_backref_bytes_per_cycle > 0); - } - - static config_t get_default() { - return config_t{ - 12, // target_journal_segments - 16, // max_journal_segments - 2, // target_backref_inflight_segments - .1, // available_ratio_gc_max - .05, // available_ratio_hard_limit - .1, // reclaim_ratio_gc_threshold - 1<<20,// reclaim_bytes_per_cycle - 1<<17,// rewrite_dirty_bytes_per_cycle - 1<<24 // rewrite_backref_bytes_per_cycle - }; - } - - static config_t get_test() { - return config_t{ - 2, // target_journal_segments - 4, // max_journal_segments - 2, // target_backref_inflight_segments - .99, // available_ratio_gc_max - .2, // available_ratio_hard_limit - .6, // reclaim_ratio_gc_threshold - 1<<20,// reclaim_bytes_per_cycle - 1<<17,// rewrite_dirty_bytes_per_cycle - 1<<24 // rewrite_backref_bytes_per_cycle - }; - } - }; - - /// Callback interface for querying and operating on segments - class ExtentCallbackInterface { - public: - virtual ~ExtentCallbackInterface() = default; - - virtual TransactionRef create_transaction( - Transaction::src_t, const char*) = 0; - - /// Creates empty transaction with interruptible context - template - auto with_transaction_intr( - Transaction::src_t src, - const char* name, - Func &&f) { - return seastar::do_with( - create_transaction(src, name), - [f=std::forward(f)](auto &ref_t) mutable { - return with_trans_intr( - *ref_t, - [f=std::forward(f)](auto& t) mutable { - return f(t); - } - ); - } - ); - } - - /// See Cache::get_next_dirty_extents - using get_next_dirty_extents_iertr = trans_iertr< - crimson::errorator< - crimson::ct_error::input_output_error> - >; - using get_next_dirty_extents_ret = get_next_dirty_extents_iertr::future< - std::vector>; - virtual get_next_dirty_extents_ret get_next_dirty_extents( - Transaction &t, ///< [in] current transaction - journal_seq_t bound,///< [in] return extents with dirty_from < bound - size_t max_bytes ///< [in] return up to max_bytes of extents - ) = 0; - - using extent_mapping_ertr = crimson::errorator< - crimson::ct_error::input_output_error, - crimson::ct_error::eagain>; - using extent_mapping_iertr = trans_iertr< - crimson::errorator< - crimson::ct_error::input_output_error> - >; - - /** - * rewrite_extent - * - * Updates t with operations moving the passed extents to a new - * segment. extent may be invalid, implementation must correctly - * handle finding the current instance if it is still alive and - * otherwise ignore it. - */ - using rewrite_extent_iertr = extent_mapping_iertr; - using rewrite_extent_ret = rewrite_extent_iertr::future<>; - virtual rewrite_extent_ret rewrite_extent( - Transaction &t, - CachedExtentRef extent) = 0; - - /** - * get_extent_if_live - * - * Returns extent at specified location if still referenced by - * lba_manager and not removed by t. - * - * See TransactionManager::get_extent_if_live and - * LBAManager::get_physical_extent_if_live. - */ - using get_extent_if_live_iertr = extent_mapping_iertr; - using get_extent_if_live_ret = get_extent_if_live_iertr::future< - CachedExtentRef>; - virtual get_extent_if_live_ret get_extent_if_live( - Transaction &t, - extent_types_t type, - paddr_t addr, - laddr_t laddr, - seastore_off_t len) = 0; - - /** - * submit_transaction_direct - * - * Submits transaction without any space throttling. - */ - using submit_transaction_direct_iertr = trans_iertr< - crimson::errorator< - crimson::ct_error::input_output_error> - >; - using submit_transaction_direct_ret = - submit_transaction_direct_iertr::future<>; - virtual submit_transaction_direct_ret submit_transaction_direct( - Transaction &t, - std::optional seq_to_trim = std::nullopt) = 0; - }; - -private: - const bool detailed; - const config_t config; - - SegmentManagerGroupRef sm_group; - BackrefManager &backref_manager; - - SpaceTrackerIRef space_tracker; - segments_info_t segments; - bool init_complete = false; - - struct { - /** - * used_bytes - * - * Bytes occupied by live extents - */ - uint64_t used_bytes = 0; - - /** - * projected_used_bytes - * - * Sum of projected bytes used by each transaction between throttle - * acquisition and commit completion. See reserve_projected_usage() - */ - uint64_t projected_used_bytes = 0; - uint64_t projected_count = 0; - uint64_t projected_used_bytes_sum = 0; - - uint64_t closed_journal_used_bytes = 0; - uint64_t closed_journal_total_bytes = 0; - uint64_t closed_ool_used_bytes = 0; - uint64_t closed_ool_total_bytes = 0; - - uint64_t io_blocking_num = 0; - uint64_t io_count = 0; - uint64_t io_blocked_count = 0; - uint64_t io_blocked_count_trim = 0; - uint64_t io_blocked_count_reclaim = 0; - uint64_t io_blocked_sum = 0; - - uint64_t reclaiming_bytes = 0; - uint64_t reclaimed_bytes = 0; - uint64_t reclaimed_segment_bytes = 0; - - seastar::metrics::histogram segment_util; - } stats; - seastar::metrics::metric_group metrics; - void register_metrics(); - - /// target journal_tail for next fresh segment - journal_seq_t journal_tail_target; - - /// target replay_from for dirty extents - journal_seq_t dirty_extents_replay_from; - - /// target replay_from for alloc infos - journal_seq_t alloc_info_replay_from; - - /// most recently committed journal_tail - journal_seq_t journal_tail_committed; - - ExtentCallbackInterface *ecb = nullptr; - - /// populated if there is an IO blocked on hard limits - std::optional> blocked_io_wake; - - SegmentSeqAllocatorRef ool_segment_seq_allocator; - - /** - * disable_trim - * - * added to enable unit testing of CircularBoundedJournal before - * proper support is added to SegmentCleaner. - * Should be removed once proper support is added. TODO - */ - bool disable_trim = false; -public: - SegmentCleaner( - config_t config, - SegmentManagerGroupRef&& sm_group, - BackrefManager &backref_manager, - bool detailed = false); - - SegmentSeqAllocator& get_ool_segment_seq_allocator() { - return *ool_segment_seq_allocator; - } - - using mount_ertr = crimson::errorator< - crimson::ct_error::input_output_error>; - using mount_ret = mount_ertr::future<>; - mount_ret mount(); - - /* - * SegmentProvider interfaces - */ - journal_seq_t get_journal_tail_target() const final { - return journal_tail_target; - } - - const segment_info_t& get_seg_info(segment_id_t id) const final { - return segments[id]; - } - - segment_id_t allocate_segment( - segment_seq_t seq, segment_type_t type) final; - - void close_segment(segment_id_t segment) final; - - void update_journal_tail_committed(journal_seq_t committed) final; - - void update_segment_avail_bytes(segment_type_t type, paddr_t offset) final { - segments.update_written_to(type, offset); - gc_process.maybe_wake_on_space_used(); - } - - SegmentManagerGroup* get_segment_manager_group() final { - return sm_group.get(); - } - - journal_seq_t get_dirty_extents_replay_from() const final { - return dirty_extents_replay_from; - } - - journal_seq_t get_alloc_info_replay_from() const final { - return alloc_info_replay_from; - } - - void update_journal_tail_target( - journal_seq_t dirty_replay_from, - journal_seq_t alloc_replay_from); - - void update_alloc_info_replay_from( - journal_seq_t alloc_replay_from); - - void init_mkfs() { - auto journal_head = segments.get_journal_head(); - ceph_assert(disable_trim || journal_head != JOURNAL_SEQ_NULL); - journal_tail_target = journal_head; - journal_tail_committed = journal_head; - } - - using release_ertr = SegmentManagerGroup::release_ertr; - release_ertr::future<> maybe_release_segment(Transaction &t); - - void adjust_segment_util(double old_usage, double new_usage) { - auto old_index = get_bucket_index(old_usage); - auto new_index = get_bucket_index(new_usage); - assert(stats.segment_util.buckets[old_index].count > 0); - stats.segment_util.buckets[old_index].count--; - stats.segment_util.buckets[new_index].count++; - } - - void mark_space_used( - paddr_t addr, - extent_len_t len, - time_point last_modified = time_point(), - time_point last_rewritten = time_point(), - bool init_scan = false); - - void mark_space_free( - paddr_t addr, - extent_len_t len); - - SpaceTrackerIRef get_empty_space_tracker() const { - return space_tracker->make_empty(); - } - - void complete_init(); - - store_statfs_t stat() const { - store_statfs_t st; - st.total = segments.get_total_bytes(); - st.available = segments.get_total_bytes() - stats.used_bytes; - st.allocated = stats.used_bytes; - st.data_stored = stats.used_bytes; - - // TODO add per extent type counters for omap_allocated and - // internal metadata - return st; - } - - seastar::future<> stop() { - return gc_process.stop(); - } - - seastar::future<> run_until_halt() { - return gc_process.run_until_halt(); - } - - void set_extent_callback(ExtentCallbackInterface *cb) { - ecb = cb; - } - - bool debug_check_space(const SpaceTrackerI &tracker) { - return space_tracker->equals(tracker); - } - - void set_disable_trim(bool val) { - disable_trim = val; - } - - using work_ertr = ExtentCallbackInterface::extent_mapping_ertr; - using work_iertr = ExtentCallbackInterface::extent_mapping_iertr; - -private: - /* - * 10 buckets for the number of closed segments by usage - * 2 extra buckets for the number of open and empty segments - */ - static constexpr double UTIL_STATE_OPEN = 1.05; - static constexpr double UTIL_STATE_EMPTY = 1.15; - static constexpr std::size_t UTIL_BUCKETS = 12; - static std::size_t get_bucket_index(double util) { - auto index = std::floor(util * 10); - assert(index < UTIL_BUCKETS); - return index; - } - double calc_utilization(segment_id_t id) const { - auto& info = segments[id]; - if (info.is_open()) { - return UTIL_STATE_OPEN; - } else if (info.is_empty()) { - return UTIL_STATE_EMPTY; - } else { - auto ret = space_tracker->calc_utilization(id); - assert(ret >= 0 && ret < 1); - return ret; - } - } - - // journal status helpers - - double calc_gc_benefit_cost(segment_id_t id) const { - double util = calc_utilization(id); - ceph_assert(util >= 0 && util < 1); - auto cur_time = seastar::lowres_system_clock::now(); - auto segment = segments[id]; - assert(cur_time >= segment.last_modified); - auto segment_age = - cur_time - std::max(segment.last_modified, segment.last_rewritten); - uint64_t age = segment_age.count(); - return (1 - util) * age / (1 + util); - } - - segment_id_t get_next_reclaim_segment() const; - - /** - * rewrite_dirty - * - * Writes out dirty blocks dirtied earlier than limit. - */ - using rewrite_dirty_iertr = work_iertr; - using rewrite_dirty_ret = rewrite_dirty_iertr::future<>; - rewrite_dirty_ret rewrite_dirty( - Transaction &t, - journal_seq_t limit); - - using trim_backrefs_iertr = work_iertr; - using trim_backrefs_ret = trim_backrefs_iertr::future; - trim_backrefs_ret trim_backrefs( - Transaction &t, - journal_seq_t limit); - - journal_seq_t get_dirty_tail() const { - auto ret = segments.get_journal_head(); - ceph_assert(ret != JOURNAL_SEQ_NULL); - if (ret.segment_seq >= config.target_journal_segments) { - ret.segment_seq -= config.target_journal_segments; - } else { - ret.segment_seq = 0; - ret.offset = P_ADDR_MIN; - } - return ret; - } - - journal_seq_t get_dirty_tail_limit() const { - auto ret = segments.get_journal_head(); - ceph_assert(ret != JOURNAL_SEQ_NULL); - if (ret.segment_seq >= config.max_journal_segments) { - ret.segment_seq -= config.max_journal_segments; - } else { - ret.segment_seq = 0; - ret.offset = P_ADDR_MIN; - } - return ret; - } - - journal_seq_t get_backref_tail() const { - auto ret = segments.get_journal_head(); - ceph_assert(ret != JOURNAL_SEQ_NULL); - if (ret.segment_seq >= config.target_backref_inflight_segments) { - ret.segment_seq -= config.target_backref_inflight_segments; - } else { - ret.segment_seq = 0; - ret.offset = P_ADDR_MIN; - } - return ret; - } - - struct reclaim_state_t { - std::size_t segment_size; - paddr_t start_pos; - paddr_t end_pos; - - static reclaim_state_t create( - segment_id_t segment_id, - std::size_t segment_size) { - return {segment_size, - P_ADDR_NULL, - paddr_t::make_seg_paddr(segment_id, 0)}; - } - - segment_id_t get_segment_id() const { - return end_pos.as_seg_paddr().get_segment_id(); - } - - bool is_complete() const { - return (std::size_t)end_pos.as_seg_paddr().get_segment_off() >= segment_size; - } - - void advance(std::size_t bytes) { - assert(!is_complete()); - start_pos = end_pos; - auto &end_seg_paddr = end_pos.as_seg_paddr(); - auto next_off = end_seg_paddr.get_segment_off() + bytes; - if (next_off > segment_size) { - end_seg_paddr.set_segment_off(segment_size); - } else { - end_seg_paddr.set_segment_off(next_off); - } - } - }; - std::optional reclaim_state; - - /** - * GCProcess - * - * Background gc process. - */ - using gc_cycle_ret = seastar::future<>; - class GCProcess { - std::optional process_join; - - SegmentCleaner &cleaner; - - std::optional> blocking; - - bool is_stopping() const { - return !process_join; - } - - gc_cycle_ret run(); - - void wake() { - if (blocking) { - blocking->set_value(); - blocking = std::nullopt; - } - } - - seastar::future<> maybe_wait_should_run() { - return seastar::do_until( - [this] { - cleaner.log_gc_state("GCProcess::maybe_wait_should_run"); - return is_stopping() || cleaner.gc_should_run(); - }, - [this] { - ceph_assert(!blocking); - blocking = seastar::promise<>(); - return blocking->get_future(); - }); - } - public: - GCProcess(SegmentCleaner &cleaner) : cleaner(cleaner) {} - - void start() { - ceph_assert(is_stopping()); - process_join = seastar::now(); // allow run() - process_join = run(); - assert(!is_stopping()); - } - - gc_cycle_ret stop() { - if (is_stopping()) { - return seastar::now(); - } - auto ret = std::move(*process_join); - process_join.reset(); - assert(is_stopping()); - wake(); - return ret; - } - - gc_cycle_ret run_until_halt() { - ceph_assert(is_stopping()); - return seastar::do_until( - [this] { - cleaner.log_gc_state("GCProcess::run_until_halt"); - return !cleaner.gc_should_run(); - }, - [this] { - return cleaner.do_gc_cycle(); - }); - } - - void maybe_wake_on_space_used() { - if (is_stopping()) { - return; - } - if (cleaner.gc_should_run()) { - wake(); - } - } - } gc_process; - - using gc_ertr = work_ertr::extend_ertr< - SegmentManagerGroup::scan_extents_ertr - >; - - gc_cycle_ret do_gc_cycle(); - - using gc_trim_journal_ertr = gc_ertr; - using gc_trim_journal_ret = gc_trim_journal_ertr::future<>; - gc_trim_journal_ret gc_trim_journal(); - - using gc_trim_backref_ertr = gc_ertr; - using gc_trim_backref_ret = gc_trim_backref_ertr::future; - gc_trim_backref_ret gc_trim_backref(journal_seq_t limit); - - using gc_reclaim_space_ertr = gc_ertr; - using gc_reclaim_space_ret = gc_reclaim_space_ertr::future<>; - gc_reclaim_space_ret gc_reclaim_space(); - - - using retrieve_live_extents_iertr = work_iertr; - using retrieve_live_extents_ret = - retrieve_live_extents_iertr::future; - retrieve_live_extents_ret _retrieve_live_extents( - Transaction &t, - std::set< - backref_buf_entry_t, - backref_buf_entry_t::cmp_t> &&backrefs, - std::vector &extents); - - using retrieve_backref_mappings_ertr = work_ertr; - using retrieve_backref_mappings_ret = - retrieve_backref_mappings_ertr::future; - retrieve_backref_mappings_ret retrieve_backref_mappings( - paddr_t start_paddr, - paddr_t end_paddr); - - /* - * Segments calculations - */ - std::size_t get_segments_in_journal() const { - if (!init_complete) { - return 0; - } - if (journal_tail_committed == JOURNAL_SEQ_NULL) { - return segments.get_num_type_journal(); - } - auto journal_head = segments.get_journal_head(); - assert(journal_head != JOURNAL_SEQ_NULL); - assert(journal_head.segment_seq >= journal_tail_committed.segment_seq); - return journal_head.segment_seq + 1 - journal_tail_committed.segment_seq; - } - std::size_t get_segments_in_journal_closed() const { - auto in_journal = get_segments_in_journal(); - auto in_journal_open = segments.get_num_in_journal_open(); - if (in_journal >= in_journal_open) { - return in_journal - in_journal_open; - } else { - return 0; - } - } - std::size_t get_segments_reclaimable() const { - assert(segments.get_num_closed() >= get_segments_in_journal_closed()); - return segments.get_num_closed() - get_segments_in_journal_closed(); - } - - /* - * Space calculations - */ - /// the unavailable space that is not reclaimable yet - std::size_t get_unavailable_unreclaimable_bytes() const { - auto ret = (segments.get_num_open() + get_segments_in_journal_closed()) * - segments.get_segment_size(); - assert(ret >= segments.get_available_bytes_in_open()); - return ret - segments.get_available_bytes_in_open(); - } - /// the unavailable space that can be reclaimed - std::size_t get_unavailable_reclaimable_bytes() const { - auto ret = get_segments_reclaimable() * segments.get_segment_size(); - ceph_assert(ret + get_unavailable_unreclaimable_bytes() == segments.get_unavailable_bytes()); - return ret; - } - /// the unavailable space that is not alive - std::size_t get_unavailable_unused_bytes() const { - assert(segments.get_unavailable_bytes() > stats.used_bytes); - return segments.get_unavailable_bytes() - stats.used_bytes; - } - double get_reclaim_ratio() const { - if (segments.get_unavailable_bytes() == 0) return 0; - return (double)get_unavailable_unused_bytes() / (double)segments.get_unavailable_bytes(); - } - - /* - * Space calculations (projected) - */ - std::size_t get_projected_available_bytes() const { - return (segments.get_available_bytes() > stats.projected_used_bytes) ? - segments.get_available_bytes() - stats.projected_used_bytes: - 0; - } - double get_projected_available_ratio() const { - return (double)get_projected_available_bytes() / - (double)segments.get_total_bytes(); - } - - /* - * Journal sizes - */ - std::size_t get_dirty_journal_size() const { - auto journal_head = segments.get_journal_head(); - if (journal_head == JOURNAL_SEQ_NULL || - dirty_extents_replay_from == JOURNAL_SEQ_NULL) { - return 0; - } - return (journal_head.segment_seq - dirty_extents_replay_from.segment_seq) * - segments.get_segment_size() + - journal_head.offset.as_seg_paddr().get_segment_off() - - segments.get_segment_size() - - dirty_extents_replay_from.offset.as_seg_paddr().get_segment_off(); - } - - std::size_t get_alloc_journal_size() const { - auto journal_head = segments.get_journal_head(); - if (journal_head == JOURNAL_SEQ_NULL || - alloc_info_replay_from == JOURNAL_SEQ_NULL) { - return 0; - } - return (journal_head.segment_seq - alloc_info_replay_from.segment_seq) * - segments.get_segment_size() + - journal_head.offset.as_seg_paddr().get_segment_off() - - segments.get_segment_size() - - alloc_info_replay_from.offset.as_seg_paddr().get_segment_off(); - } - - /** - * should_block_on_gc - * - * Encapsulates whether block pending gc. - */ - bool should_block_on_trim() const { - if (disable_trim) return false; - return get_dirty_tail_limit() > journal_tail_target; - } - - bool should_block_on_reclaim() const { - if (disable_trim) return false; - if (get_segments_reclaimable() == 0) { - return false; - } - auto aratio = get_projected_available_ratio(); - return aratio < config.available_ratio_hard_limit; - } - - bool should_block_on_gc() const { - return should_block_on_trim() || should_block_on_reclaim(); - } - - void log_gc_state(const char *caller) const; - -public: - seastar::future<> reserve_projected_usage(std::size_t projected_usage); - - void release_projected_usage(size_t projected_usage); - -private: - void maybe_wake_gc_blocked_io() { - if (!init_complete) { - return; - } - if (!should_block_on_gc() && blocked_io_wake) { - blocked_io_wake->set_value(); - blocked_io_wake = std::nullopt; - } - } - - using scan_extents_ret_bare = - std::vector>; - using scan_extents_ertr = SegmentManagerGroup::scan_extents_ertr; - using scan_extents_ret = scan_extents_ertr::future<>; - scan_extents_ret scan_nonfull_segment( - const segment_header_t& header, - scan_extents_ret_bare& segment_set, - segment_id_t segment_id); - - /** - * gc_should_reclaim_space - * - * Encapsulates logic for whether gc should be reclaiming segment space. - */ - bool gc_should_reclaim_space() const { - if (disable_trim) return false; - if (get_segments_reclaimable() == 0) { - return false; - } - auto aratio = segments.get_available_ratio(); - auto rratio = get_reclaim_ratio(); - return ( - (aratio < config.available_ratio_hard_limit) || - ((aratio < config.available_ratio_gc_max) && - (rratio > config.reclaim_ratio_gc_threshold)) - ); - } - - /** - * gc_should_trim_journal - * - * Encapsulates logic for whether gc should be reclaiming segment space. - */ - bool gc_should_trim_journal() const { - return get_dirty_tail() > journal_tail_target; - } - - bool gc_should_trim_backref() const { - return get_backref_tail() > alloc_info_replay_from; - } - /** - * gc_should_run - * - * True if gc should be running. - */ - bool gc_should_run() const { - if (disable_trim) return false; - ceph_assert(init_complete); - return gc_should_reclaim_space() - || gc_should_trim_journal() - || gc_should_trim_backref(); - } - - void init_mark_segment_closed( - segment_id_t segment, - segment_seq_t seq, - segment_type_t s_type) { - ceph_assert(!init_complete); - auto old_usage = calc_utilization(segment); - segments.init_closed(segment, seq, s_type); - auto new_usage = calc_utilization(segment); - adjust_segment_util(old_usage, new_usage); - if (s_type == segment_type_t::OOL) { - ool_segment_seq_allocator->set_next_segment_seq(seq); - } - } -}; -using SegmentCleanerRef = std::unique_ptr; - -} diff --git a/src/crimson/os/seastore/segment_seq_allocator.h b/src/crimson/os/seastore/segment_seq_allocator.h index 7bbca15572a7..e4a864a7b3b6 100644 --- a/src/crimson/os/seastore/segment_seq_allocator.h +++ b/src/crimson/os/seastore/segment_seq_allocator.h @@ -7,7 +7,7 @@ #include "crimson/os/seastore/seastore_types.h" namespace crimson::os::seastore { -class SegmentCleaner; +class AsyncCleaner; } namespace crimson::os::seastore::journal { @@ -41,7 +41,7 @@ private: segment_seq_t next_segment_seq = 0; segment_type_t type = segment_type_t::NULL_SEG; friend class journal::SegmentedJournal; - friend class SegmentCleaner; + friend class AsyncCleaner; }; using SegmentSeqAllocatorRef = diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index 2b89fe45e198..0a7d316235a9 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -23,23 +23,23 @@ SET_SUBSYS(seastore_tm); namespace crimson::os::seastore { TransactionManager::TransactionManager( - SegmentCleanerRef _segment_cleaner, + AsyncCleanerRef _async_cleaner, JournalRef _journal, CacheRef _cache, LBAManagerRef _lba_manager, ExtentPlacementManagerRef &&epm, BackrefManagerRef&& backref_manager, tm_make_config_t config) - : segment_cleaner(std::move(_segment_cleaner)), + : async_cleaner(std::move(_async_cleaner)), cache(std::move(_cache)), lba_manager(std::move(_lba_manager)), journal(std::move(_journal)), epm(std::move(epm)), backref_manager(std::move(backref_manager)), - sm_group(*segment_cleaner->get_segment_manager_group()), + sm_group(*async_cleaner->get_segment_manager_group()), config(config) { - segment_cleaner->set_extent_callback(this); + async_cleaner->set_extent_callback(this); journal->set_write_pipeline(&write_pipeline); } @@ -47,11 +47,11 @@ TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs() { LOG_PREFIX(TransactionManager::mkfs); INFO("enter"); - return segment_cleaner->mount( + return async_cleaner->mount( ).safe_then([this] { return journal->open_for_write(); }).safe_then([this](auto) { - segment_cleaner->init_mkfs(); + async_cleaner->init_mkfs(); return epm->open(); }).safe_then([this, FNAME]() { return with_transaction_intr( @@ -88,7 +88,7 @@ TransactionManager::mount_ertr::future<> TransactionManager::mount() LOG_PREFIX(TransactionManager::mount); INFO("enter"); cache->init(); - return segment_cleaner->mount( + return async_cleaner->mount( ).safe_then([this] { return journal->replay( [this]( @@ -98,7 +98,7 @@ TransactionManager::mount_ertr::future<> TransactionManager::mount() auto last_modified) { auto start_seq = offsets.write_result.start_seq; - segment_cleaner->update_journal_tail_target( + async_cleaner->update_journal_tail_target( cache->get_oldest_dirty_from().value_or(start_seq), cache->get_oldest_backref_dirty_from().value_or(start_seq)); return cache->replay_delta( @@ -124,8 +124,8 @@ TransactionManager::mount_ertr::future<> TransactionManager::mount() else return lba_manager->init_cached_extent(t, e); }).si_then([this, FNAME, &t] { - assert(segment_cleaner->debug_check_space( - *segment_cleaner->get_empty_space_tracker())); + assert(async_cleaner->debug_check_space( + *async_cleaner->get_empty_space_tracker())); return backref_manager->scan_mapped_space( t, [this, FNAME, &t]( @@ -141,7 +141,7 @@ TransactionManager::mount_ertr::future<> TransactionManager::mount() len); if (addr.is_real() && !backref_manager->backref_should_be_removed(addr)) { - segment_cleaner->mark_space_used( + async_cleaner->mark_space_used( addr, len , seastar::lowres_system_clock::time_point(), @@ -163,7 +163,7 @@ TransactionManager::mount_ertr::future<> TransactionManager::mount() auto &backrefs = backref_manager->get_cached_backrefs(); DEBUG("marking {} backrefs used", backrefs.size()); for (auto &backref : backrefs) { - segment_cleaner->mark_space_used( + async_cleaner->mark_space_used( backref.paddr, backref.len, seastar::lowres_system_clock::time_point(), @@ -179,7 +179,7 @@ TransactionManager::mount_ertr::future<> TransactionManager::mount() }).safe_then([this] { return epm->open(); }).safe_then([FNAME, this] { - segment_cleaner->complete_init(); + async_cleaner->complete_init(); INFO("completed"); }).handle_error( mount_ertr::pass_further{}, @@ -192,7 +192,7 @@ TransactionManager::mount_ertr::future<> TransactionManager::mount() TransactionManager::close_ertr::future<> TransactionManager::close() { LOG_PREFIX(TransactionManager::close); INFO("enter"); - return segment_cleaner->stop( + return async_cleaner->stop( ).then([this] { return cache->close(); }).safe_then([this] { @@ -314,12 +314,12 @@ TransactionManager::submit_transaction( size_t projected_usage = t.get_allocation_size(); SUBTRACET(seastore_t, "waiting for projected_usage: {}", t, projected_usage); return trans_intr::make_interruptible( - segment_cleaner->reserve_projected_usage(projected_usage) + async_cleaner->reserve_projected_usage(projected_usage) ).then_interruptible([this, &t] { return submit_transaction_direct(t); }).finally([this, FNAME, projected_usage, &t] { SUBTRACET(seastore_t, "releasing projected_usage: {}", t, projected_usage); - segment_cleaner->release_projected_usage(projected_usage); + async_cleaner->release_projected_usage(projected_usage); }); }); } @@ -365,7 +365,7 @@ TransactionManager::submit_transaction_direct( if (seq_to_trim && *seq_to_trim != JOURNAL_SEQ_NULL) { cache->trim_backref_bufs(*seq_to_trim); } - auto record = cache->prepare_record(tref, segment_cleaner.get()); + auto record = cache->prepare_record(tref, async_cleaner.get()); tref.get_handle().maybe_release_collection_lock(); @@ -379,7 +379,7 @@ TransactionManager::submit_transaction_direct( tref, submit_result.record_block_base, start_seq, - segment_cleaner.get()); + async_cleaner.get()); std::vector lba_to_clear; std::vector backref_to_clear; @@ -409,10 +409,10 @@ TransactionManager::submit_transaction_direct( lba_manager->complete_transaction(tref, lba_to_clear, lba_to_link); backref_manager->complete_transaction(tref, backref_to_clear, backref_to_link); - segment_cleaner->update_journal_tail_target( + async_cleaner->update_journal_tail_target( cache->get_oldest_dirty_from().value_or(start_seq), cache->get_oldest_backref_dirty_from().value_or(start_seq)); - return segment_cleaner->maybe_release_segment(tref); + return async_cleaner->maybe_release_segment(tref); }).safe_then([FNAME, &tref] { SUBTRACET(seastore_t, "completed", tref); return tref.get_handle().complete(); @@ -486,7 +486,7 @@ TransactionManager::rewrite_logical_extent( /* This update_mapping is, strictly speaking, unnecessary for delayed_alloc * extents since we're going to do it again once we either do the ool write - * or allocate a relative inline addr. TODO: refactor SegmentCleaner to + * or allocate a relative inline addr. TODO: refactor AsyncCleaner to * avoid this complication. */ return lba_manager->update_mapping( t, @@ -647,15 +647,15 @@ TransactionManagerRef make_transaction_manager(tm_make_config_t config) auto backref_manager = create_backref_manager(*sms, *cache); bool cleaner_is_detailed; - SegmentCleaner::config_t cleaner_config; + AsyncCleaner::config_t cleaner_config; if (config.is_test) { cleaner_is_detailed = true; - cleaner_config = SegmentCleaner::config_t::get_test(); + cleaner_config = AsyncCleaner::config_t::get_test(); } else { cleaner_is_detailed = false; - cleaner_config = SegmentCleaner::config_t::get_default(); + cleaner_config = AsyncCleaner::config_t::get_default(); } - auto segment_cleaner = std::make_unique( + auto async_cleaner = std::make_unique( cleaner_config, std::move(sms), *backref_manager, @@ -663,20 +663,20 @@ TransactionManagerRef make_transaction_manager(tm_make_config_t config) JournalRef journal; if (config.j_type == journal_type_t::SEGMENT_JOURNAL) { - journal = journal::make_segmented(*segment_cleaner); + journal = journal::make_segmented(*async_cleaner); } else { journal = journal::make_circularbounded( nullptr, ""); - segment_cleaner->set_disable_trim(true); + async_cleaner->set_disable_trim(true); ERROR("disabling journal trimming since support for CircularBoundedJournal\ hasn't been added yet"); } epm->init_ool_writers( - *segment_cleaner, - segment_cleaner->get_ool_segment_seq_allocator()); + *async_cleaner, + async_cleaner->get_ool_segment_seq_allocator()); return std::make_unique( - std::move(segment_cleaner), + std::move(async_cleaner), std::move(journal), std::move(cache), std::move(lba_manager), diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index a02b8bb21dda..8661297cf4bb 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -21,7 +21,7 @@ #include "crimson/osd/exceptions.h" #include "crimson/os/seastore/logging.h" -#include "crimson/os/seastore/segment_cleaner.h" +#include "crimson/os/seastore/async_cleaner.h" #include "crimson/os/seastore/seastore_types.h" #include "crimson/os/seastore/cache.h" #include "crimson/os/seastore/lba_manager.h" @@ -103,13 +103,13 @@ auto repeat_eagain(F &&f) { * Abstraction hiding reading and writing to persistence. * Exposes transaction based interface with read isolation. */ -class TransactionManager : public SegmentCleaner::ExtentCallbackInterface { +class TransactionManager : public AsyncCleaner::ExtentCallbackInterface { public: using base_ertr = Cache::base_ertr; using base_iertr = Cache::base_iertr; TransactionManager( - SegmentCleanerRef segment_cleaner, + AsyncCleanerRef async_cleaner, JournalRef journal, CacheRef cache, LBAManagerRef lba_manager, @@ -423,8 +423,8 @@ public: using submit_transaction_iertr = base_iertr; submit_transaction_iertr::future<> submit_transaction(Transaction &); - /// SegmentCleaner::ExtentCallbackInterface - using SegmentCleaner::ExtentCallbackInterface::submit_transaction_direct_ret; + /// AsyncCleaner::ExtentCallbackInterface + using AsyncCleaner::ExtentCallbackInterface::submit_transaction_direct_ret; submit_transaction_direct_ret submit_transaction_direct( Transaction &t, std::optional seq_to_trim = std::nullopt) final; @@ -438,18 +438,18 @@ public: */ seastar::future<> flush(OrderingHandle &handle); - using SegmentCleaner::ExtentCallbackInterface::get_next_dirty_extents_ret; + using AsyncCleaner::ExtentCallbackInterface::get_next_dirty_extents_ret; get_next_dirty_extents_ret get_next_dirty_extents( Transaction &t, journal_seq_t seq, size_t max_bytes) final; - using SegmentCleaner::ExtentCallbackInterface::rewrite_extent_ret; + using AsyncCleaner::ExtentCallbackInterface::rewrite_extent_ret; rewrite_extent_ret rewrite_extent( Transaction &t, CachedExtentRef extent) final; - using SegmentCleaner::ExtentCallbackInterface::get_extent_if_live_ret; + using AsyncCleaner::ExtentCallbackInterface::get_extent_if_live_ret; get_extent_if_live_ret get_extent_if_live( Transaction &t, extent_types_t type, @@ -577,7 +577,7 @@ public: } store_statfs_t store_stat() const { - return segment_cleaner->stat(); + return async_cleaner->stat(); } void add_device(Device* dev, bool is_primary) { @@ -598,7 +598,7 @@ public: private: friend class Transaction; - SegmentCleanerRef segment_cleaner; + AsyncCleanerRef async_cleaner; CacheRef cache; LBAManagerRef lba_manager; JournalRef journal; @@ -614,8 +614,8 @@ private: LogicalCachedExtentRef extent); public: // Testing interfaces - auto get_segment_cleaner() { - return segment_cleaner.get(); + auto get_async_cleaner() { + return async_cleaner.get(); } auto get_lba_manager() { diff --git a/src/test/crimson/seastore/onode_tree/test_fltree_onode_manager.cc b/src/test/crimson/seastore/onode_tree/test_fltree_onode_manager.cc index 892e5f780e95..0da0cce18441 100644 --- a/src/test/crimson/seastore/onode_tree/test_fltree_onode_manager.cc +++ b/src/test/crimson/seastore/onode_tree/test_fltree_onode_manager.cc @@ -112,7 +112,7 @@ struct fltree_onode_manager_test_t auto t = create_mutate_transaction(); std::invoke(f, *t); submit_transaction(std::move(t)); - segment_cleaner->run_until_halt().get0(); + async_cleaner->run_until_halt().get0(); } template diff --git a/src/test/crimson/seastore/onode_tree/test_staged_fltree.cc b/src/test/crimson/seastore/onode_tree/test_staged_fltree.cc index 3d890c27683a..b00db5e8a6df 100644 --- a/src/test/crimson/seastore/onode_tree/test_staged_fltree.cc +++ b/src/test/crimson/seastore/onode_tree/test_staged_fltree.cc @@ -1591,7 +1591,7 @@ TEST_F(d_seastore_tm_test_t, 6_random_tree_insert_erase) auto t = create_mutate_transaction(); INTR(tree->bootstrap, *t).unsafe_get(); submit_transaction(std::move(t)); - segment_cleaner->run_until_halt().get0(); + async_cleaner->run_until_halt().get0(); } // test insert @@ -1599,7 +1599,7 @@ TEST_F(d_seastore_tm_test_t, 6_random_tree_insert_erase) auto t = create_mutate_transaction(); INTR(tree->insert, *t).unsafe_get(); submit_transaction(std::move(t)); - segment_cleaner->run_until_halt().get0(); + async_cleaner->run_until_halt().get0(); } { auto t = create_read_transaction(); @@ -1623,7 +1623,7 @@ TEST_F(d_seastore_tm_test_t, 6_random_tree_insert_erase) auto size = kvs.size() / 4 * 3; INTR_R(tree->erase, *t, size).unsafe_get(); submit_transaction(std::move(t)); - segment_cleaner->run_until_halt().get0(); + async_cleaner->run_until_halt().get0(); } { auto t = create_read_transaction(); @@ -1646,7 +1646,7 @@ TEST_F(d_seastore_tm_test_t, 6_random_tree_insert_erase) auto size = kvs.size(); INTR_R(tree->erase, *t, size).unsafe_get(); submit_transaction(std::move(t)); - segment_cleaner->run_until_halt().get0(); + async_cleaner->run_until_halt().get0(); } { auto t = create_read_transaction(); @@ -1703,7 +1703,7 @@ TEST_F(d_seastore_tm_test_t, 7_tree_insert_erase_eagain) }); }); }).unsafe_get0(); - segment_cleaner->run_until_halt().get0(); + async_cleaner->run_until_halt().get0(); // insert logger().warn("start inserting {} kvs ...", kvs.size()); @@ -1723,7 +1723,7 @@ TEST_F(d_seastore_tm_test_t, 7_tree_insert_erase_eagain) }); }); }).unsafe_get0(); - segment_cleaner->run_until_halt().get0(); + async_cleaner->run_until_halt().get0(); ++iter; } } @@ -1769,7 +1769,7 @@ TEST_F(d_seastore_tm_test_t, 7_tree_insert_erase_eagain) }); }); }).unsafe_get0(); - segment_cleaner->run_until_halt().get0(); + async_cleaner->run_until_halt().get0(); ++iter; } kvs.erase_from_random(kvs.random_begin(), kvs.random_end()); diff --git a/src/test/crimson/seastore/test_seastore_journal.cc b/src/test/crimson/seastore/test_seastore_journal.cc index 91600ca10b8c..a67b0aa70d43 100644 --- a/src/test/crimson/seastore/test_seastore_journal.cc +++ b/src/test/crimson/seastore/test_seastore_journal.cc @@ -6,7 +6,7 @@ #include #include "crimson/common/log.h" -#include "crimson/os/seastore/segment_cleaner.h" +#include "crimson/os/seastore/async_cleaner.h" #include "crimson/os/seastore/journal.h" #include "crimson/os/seastore/segment_manager/ephemeral.h" diff --git a/src/test/crimson/seastore/test_transaction_manager.cc b/src/test/crimson/seastore/test_transaction_manager.cc index f5c564fa4b0d..4b6c86e142f5 100644 --- a/src/test/crimson/seastore/test_transaction_manager.cc +++ b/src/test/crimson/seastore/test_transaction_manager.cc @@ -8,7 +8,7 @@ #include "test/crimson/gtest_seastar.h" #include "test/crimson/seastore/transaction_manager_test_state.h" -#include "crimson/os/seastore/segment_cleaner.h" +#include "crimson/os/seastore/async_cleaner.h" #include "crimson/os/seastore/cache.h" #include "crimson/os/seastore/transaction_manager.h" #include "crimson/os/seastore/segment_manager/ephemeral.h" @@ -396,7 +396,7 @@ struct transaction_manager_test_t : bool check_usage() { auto t = create_weak_test_transaction(); - SpaceTrackerIRef tracker(segment_cleaner->get_empty_space_tracker()); + SpaceTrackerIRef tracker(async_cleaner->get_empty_space_tracker()); with_trans_intr( *t.t, [this, &tracker](auto &t) { @@ -427,7 +427,7 @@ struct transaction_manager_test_t : return seastar::now(); }); }).unsafe_get0(); - return segment_cleaner->debug_check_space(*tracker); + return async_cleaner->debug_check_space(*tracker); } void replay() { @@ -578,7 +578,7 @@ struct transaction_manager_test_t : "try_submit_transaction hit invalid error" } ).then([this](auto ret) { - return segment_cleaner->run_until_halt().then([ret] { return ret; }); + return async_cleaner->run_until_halt().then([ret] { return ret; }); }).get0(); if (success) { @@ -628,7 +628,7 @@ struct transaction_manager_test_t : }); }); }).safe_then([this]() { - return segment_cleaner->run_until_halt(); + return async_cleaner->run_until_halt(); }).handle_error( crimson::ct_error::assert_all{ "Invalid error in SeaStore::list_collections" diff --git a/src/test/crimson/seastore/transaction_manager_test_state.h b/src/test/crimson/seastore/transaction_manager_test_state.h index b712effaf4be..19b5b08cdf3a 100644 --- a/src/test/crimson/seastore/transaction_manager_test_state.h +++ b/src/test/crimson/seastore/transaction_manager_test_state.h @@ -6,7 +6,7 @@ #include #include -#include "crimson/os/seastore/segment_cleaner.h" +#include "crimson/os/seastore/async_cleaner.h" #include "crimson/os/seastore/cache.h" #include "crimson/os/seastore/transaction_manager.h" #include "crimson/os/seastore/segment_manager/ephemeral.h" @@ -146,7 +146,7 @@ protected: LBAManager *lba_manager; BackrefManager *backref_manager; Cache* cache; - SegmentCleaner *segment_cleaner; + AsyncCleaner *async_cleaner; TMTestState() : EphemeralTestState(1) {} @@ -165,14 +165,14 @@ protected: tm->add_device(sec_sm.get(), false); } } - segment_cleaner = tm->get_segment_cleaner(); + async_cleaner = tm->get_async_cleaner(); lba_manager = tm->get_lba_manager(); backref_manager = tm->get_backref_manager(); cache = tm->get_cache(); } virtual void _destroy() override { - segment_cleaner = nullptr; + async_cleaner = nullptr; lba_manager = nullptr; tm.reset(); } @@ -191,9 +191,9 @@ protected: ).handle_error( crimson::ct_error::assert_all{"Error in mount"} ).then([this] { - return segment_cleaner->stop(); + return async_cleaner->stop(); }).then([this] { - return segment_cleaner->run_until_halt(); + return async_cleaner->run_until_halt(); }); } @@ -251,7 +251,7 @@ protected: void submit_transaction(TransactionRef t) { submit_transaction_fut(*t).unsafe_get0(); - segment_cleaner->run_until_halt().get0(); + async_cleaner->run_until_halt().get0(); } };