From debd83d175c3f709b6a301693ae6458015a93d5f Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 17 Sep 2020 16:40:50 -0700 Subject: [PATCH] crimson/os/seastore/segment_cleaner: add online gc Signed-off-by: Samuel Just --- src/crimson/os/seastore/segment_cleaner.cc | 111 ++++++++- src/crimson/os/seastore/segment_cleaner.h | 212 +++++++++++++++++- src/crimson/os/seastore/transaction.h | 12 + .../os/seastore/transaction_manager.cc | 21 +- src/crimson/os/seastore/transaction_manager.h | 15 ++ 5 files changed, 355 insertions(+), 16 deletions(-) diff --git a/src/crimson/os/seastore/segment_cleaner.cc b/src/crimson/os/seastore/segment_cleaner.cc index fd39c22913875..21e669f561433 100644 --- a/src/crimson/os/seastore/segment_cleaner.cc +++ b/src/crimson/os/seastore/segment_cleaner.cc @@ -3,6 +3,7 @@ #include "crimson/common/log.h" +#include "crimson/os/seastore/transaction.h" #include "crimson/os/seastore/segment_cleaner.h" namespace { @@ -205,12 +206,44 @@ SegmentCleaner::do_immediate_work_ret SegmentCleaner::do_immediate_work( __func__, journal_tail_target, next_target); - if (journal_tail_target > next_target) { - return do_immediate_work_ertr::now(); + + logger().debug( + "SegmentCleaner::do_immediate_work gc total {}, available {}, unavailable {}, used {} available_ratio {}, reclaim_ratio {}, bytes_to_gc_for_available {}, bytes_to_gc_for_reclaim {}", + get_total_bytes(), + get_available_bytes(), + get_unavailable_bytes(), + get_used_bytes(), + get_available_ratio(), + get_reclaim_ratio(), + get_immediate_bytes_to_gc_for_available(), + get_immediate_bytes_to_gc_for_reclaim()); + + auto dirty_fut = do_immediate_work_ertr::now(); + if (journal_tail_target < next_target) { + dirty_fut = rewrite_dirty(t, next_target); } + return dirty_fut.safe_then([=, &t] { + return do_gc(t, get_immediate_bytes_to_gc()); + }).handle_error( + do_immediate_work_ertr::pass_further{}, + crimson::ct_error::assert_all{} + ); +} + +SegmentCleaner::do_deferred_work_ret 
SegmentCleaner::do_deferred_work( + Transaction &t) +{ + return do_deferred_work_ret( + do_deferred_work_ertr::ready_future_marker{}, + ceph::timespan()); +} +SegmentCleaner::rewrite_dirty_ret SegmentCleaner::rewrite_dirty( + Transaction &t, + journal_seq_t limit) +{ return ecb->get_next_dirty_extents( - get_dirty_tail_limit() + limit ).then([=, &t](auto dirty_list) { if (dirty_list.empty()) { return do_immediate_work_ertr::now(); @@ -232,12 +265,74 @@ SegmentCleaner::do_immediate_work_ret SegmentCleaner::do_immediate_work( }); } -SegmentCleaner::do_deferred_work_ret SegmentCleaner::do_deferred_work( - Transaction &t) +SegmentCleaner::do_gc_ret SegmentCleaner::do_gc( + Transaction &t, + size_t bytes) { - return do_deferred_work_ret( - do_deferred_work_ertr::ready_future_marker{}, - ceph::timespan()); + if (bytes == 0) { + return do_gc_ertr::now(); + } + + if (gc_current_pos == P_ADDR_NULL) { + gc_current_pos.segment = get_next_gc_target(); + if (gc_current_pos == P_ADDR_NULL) { + // apparently there are no segments to gc + logger().debug( + "SegmentCleaner::do_gc: no segments to gc"); + return do_gc_ertr::now(); + } + logger().debug( + "SegmentCleaner::do_gc: starting gc on segment {}", + gc_current_pos.segment); + gc_current_pos.offset = 0; + } + + return ecb->scan_extents( + gc_current_pos, + bytes + ).safe_then([=, &t](auto addrs) { + return seastar::do_with( + std::move(addrs), + [=, &t](auto &addrs) { + auto &[next, addr_list] = addrs; + return crimson::do_for_each( + addr_list, + [=, &t](auto &addr_pair) { + auto &[addr, info] = addr_pair; + logger().debug( + "SegmentCleaner::do_gc: checking addr {}", + addr); + return ecb->get_extent_if_live( + t, + info.type, + addr, + info.addr, + info.len + ).safe_then([=, &t](CachedExtentRef ext) { + if (!ext) { + logger().debug( + "SegmentCleaner::do_gc: addr {} dead, skipping", + addr); + return ExtentCallbackInterface::rewrite_extent_ertr::now(); + } else { + logger().debug( + "SegmentCleaner::do_gc: addr {} alive, 
gc'ing {}", + addr, + *ext); + } + return ecb->rewrite_extent( + t, + ext); + }); + }).safe_then([=, &t] { + auto old_pos = std::exchange(gc_current_pos, next); + if (gc_current_pos == P_ADDR_NULL) { + t.mark_segment_to_release(old_pos.segment); + } + return ExtentCallbackInterface::release_segment_ertr::now(); + }); + }); + }); } } diff --git a/src/crimson/os/seastore/segment_cleaner.h b/src/crimson/os/seastore/segment_cleaner.h index caa903e97529d..14e2a04f56272 100644 --- a/src/crimson/os/seastore/segment_cleaner.h +++ b/src/crimson/os/seastore/segment_cleaner.h @@ -21,6 +21,12 @@ struct segment_info_t { // Will be non-null for any segments in the current journal segment_seq_t journal_segment_seq = NULL_SEG_SEQ; + + bool is_in_journal(journal_seq_t tail_committed) const { + return journal_segment_seq != NULL_SEG_SEQ && + tail_committed.segment_seq <= journal_segment_seq; + } + bool is_empty() const { return state == Segment::segment_state_t::EMPTY; } @@ -209,6 +215,12 @@ public: size_t target_journal_segments = 0; size_t max_journal_segments = 0; + double reclaim_ratio_hard_limit = 0; + // don't apply reclaim ratio with available space below this + double reclaim_ratio_usage_min = 0; + + double available_ratio_hard_limit = 0; + static config_t default_from_segment_manager( SegmentManager &manager) { return config_t{ @@ -216,7 +228,11 @@ public: static_cast(manager.get_segment_size()), (size_t)manager.get_block_size(), 2, - 4}; + 4, + .5, + .95, + .2 + }; } }; @@ -270,6 +286,26 @@ public: paddr_t addr, laddr_t laddr, segment_off_t len) = 0; + + /** + * scan_extents + * + * Interface shim for Journal::scan_extents + */ + using scan_extents_ret = Journal::scan_extents_ret; + virtual scan_extents_ret scan_extents( + paddr_t addr, + extent_len_t bytes_to_read) = 0; + + /** + * release_segment + * + * Release segment. 
+ */ + using release_segment_ertr = SegmentManager::release_ertr; + using release_segment_ret = release_segment_ertr::future<>; + virtual release_segment_ret release_segment( + segment_id_t id) = 0; }; private: @@ -338,6 +374,14 @@ public: segments[segment].journal_segment_seq = seq; } + segment_seq_t get_seq(segment_id_t id) final { + return segments[id].journal_segment_seq; + } + + void mark_segment_released(segment_id_t segment) { + return mark_empty(segment); + } + void mark_space_used( paddr_t addr, extent_len_t len, @@ -375,6 +419,26 @@ public: assert(ret >= 0); } + segment_id_t get_next_gc_target() const { + segment_id_t ret = NULL_SEG_ID; + int64_t least_live_bytes = std::numeric_limits::max(); + for (segment_id_t i = 0; i < segments.size(); ++i) { + if (segments[i].is_closed() && + !segments[i].is_in_journal(journal_tail_committed) && + space_tracker->get_usage(i) < least_live_bytes) { + ret = i; + least_live_bytes = space_tracker->get_usage(i); + } + } + if (ret != NULL_SEG_ID) { + crimson::get_logger(ceph_subsys_filestore).debug( + "SegmentCleaner::get_next_gc_target: segment {} seq {}", + ret, + segments[ret].journal_segment_seq); + } + return ret; + } + SpaceTrackerIRef get_empty_space_tracker() const { return space_tracker->make_empty(); } @@ -425,6 +489,18 @@ private: // journal status helpers + /** + * rewrite_dirty + * + * Writes out dirty blocks dirtied earlier than limit. + */ + using rewrite_dirty_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using rewrite_dirty_ret = rewrite_dirty_ertr::future<>; + rewrite_dirty_ret rewrite_dirty( + Transaction &t, + journal_seq_t limit); + journal_seq_t get_dirty_tail() const { auto ret = journal_head; ret.segment_seq -= std::min( @@ -440,6 +516,140 @@ private: config.max_journal_segments); return ret; } + + // GC status helpers + paddr_t gc_current_pos = P_ADDR_NULL; + + /** + * do_gc + * + * Performs bytes worth of gc work on t. 
+ */ + using do_gc_ertr = SegmentManager::read_ertr; + using do_gc_ret = do_gc_ertr::future<>; + do_gc_ret do_gc( + Transaction &t, + size_t bytes); + + size_t get_bytes_used_current_segment() const { + assert(journal_head != journal_seq_t()); + return journal_head.offset.offset; + } + + size_t get_bytes_available_current_segment() const { + return config.segment_size - get_bytes_used_current_segment(); + } + + /** + * get_bytes_scanned_current_segment + * + * Returns the number of bytes from the current gc segment that + * have been scanned. + */ + size_t get_bytes_scanned_current_segment() const { + if (gc_current_pos == P_ADDR_NULL) + return 0; + + return gc_current_pos.offset; + } + + size_t get_available_bytes() const { + return (empty_segments * config.segment_size) + + get_bytes_available_current_segment() + + get_bytes_scanned_current_segment(); + } + + size_t get_total_bytes() const { + return config.segment_size * config.num_segments; + } + + size_t get_unavailable_bytes() const { + return get_total_bytes() - get_available_bytes(); + } + + /// Returns bytes currently occupied by live extents (not journal) + size_t get_used_bytes() const { + return used_bytes; + } + + /// Returns the number of bytes in unavailable segments that are not live + size_t get_reclaimable_bytes() const { + return get_unavailable_bytes() - get_used_bytes(); + } + + /** + * get_reclaim_ratio + * + * Returns the ratio of unavailable space that is not currently used. 
+ */ + double get_reclaim_ratio() const { + if (get_unavailable_bytes() == 0) return 0; + return (double)get_reclaimable_bytes() / (double)get_unavailable_bytes(); + } + + /** + * get_available_ratio + * + * Returns ratio of available space to write to total space + */ + double get_available_ratio() const { + return (double)get_available_bytes() / (double)get_total_bytes(); + } + + /** + * get_immediate_bytes_to_gc_for_reclaim + * + * Returns the number of bytes to gc in order to bring the + * reclaim ratio below reclaim_ratio_hard_limit. + */ + size_t get_immediate_bytes_to_gc_for_reclaim() const { + if (get_reclaim_ratio() < config.reclaim_ratio_hard_limit) + return 0; + + const size_t unavailable_target = std::max( + get_used_bytes() / (1.0 - config.reclaim_ratio_hard_limit), + (1 - config.reclaim_ratio_usage_min) * get_total_bytes()); + + if (unavailable_target > get_unavailable_bytes()) + return 0; + + return (get_unavailable_bytes() - unavailable_target) / get_reclaim_ratio(); + } + + /** + * get_immediate_bytes_to_gc_for_available + * + * Returns the number of bytes to gc in order to bring the + * ratio of available disk space to total disk space above + * available_ratio_hard_limit. + */ + size_t get_immediate_bytes_to_gc_for_available() const { + if (get_available_ratio() > config.available_ratio_hard_limit) { + return 0; + } + + const double ratio_to_make_available = config.available_ratio_hard_limit - + get_available_ratio(); + return ratio_to_make_available * (double)get_total_bytes() + / get_reclaim_ratio(); + } + + /** + * get_immediate_bytes_to_gc + * + * Returns number of bytes to gc in order to restore any strict + * limits. 
+ */ + size_t get_immediate_bytes_to_gc() const { + // number of bytes to gc in order to correct reclaim ratio + size_t for_reclaim = get_immediate_bytes_to_gc_for_reclaim(); + + // number of bytes to gc in order to correct available_ratio + size_t for_available = get_immediate_bytes_to_gc_for_available(); + + return std::max(for_reclaim, for_available); + } + void mark_closed(segment_id_t segment) { assert(segments.size() > segment); if (init_complete) { diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h index 3c6d77520f14d..e189d1d32da03 100644 --- a/src/crimson/os/seastore/transaction.h +++ b/src/crimson/os/seastore/transaction.h @@ -79,6 +79,15 @@ public: write_set.insert(*ref); } + void mark_segment_to_release(segment_id_t segment) { + assert(to_release == NULL_SEG_ID); + to_release = segment; + } + + segment_id_t get_segment_to_release() const { + return to_release; + } + const auto &get_fresh_block_list() { return fresh_block_list; } @@ -118,6 +127,9 @@ private: pextent_set_t retired_set; ///< list of extents mutated by this transaction + ///< if != NULL_SEG_ID, release this segment after completion + segment_id_t to_release = NULL_SEG_ID; + Transaction(bool weak) : weak(weak) {} }; using TransactionRef = Transaction::Ref; diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index 298cc00f54110..76517055fd3e1 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -186,13 +186,20 @@ TransactionManager::submit_transaction( return crimson::ct_error::eagain::make(); } - return journal.submit_record(std::move(*record)).safe_then( - [this, t=std::move(t)](auto p) mutable { - auto [addr, journal_seq] = p; - segment_cleaner.set_journal_head(journal_seq); - cache.complete_commit(*t, addr, journal_seq, &segment_cleaner); - lba_manager.complete_transaction(*t); - }, + return journal.submit_record(std::move(*record) 
+ ).safe_then([this, t=std::move(t)](auto p) mutable { + auto [addr, journal_seq] = p; + segment_cleaner.set_journal_head(journal_seq); + cache.complete_commit(*t, addr, journal_seq, &segment_cleaner); + lba_manager.complete_transaction(*t); + auto to_release = t->get_segment_to_release(); + if (to_release != NULL_SEG_ID) { + segment_cleaner.mark_segment_released(to_release); + return segment_manager.release(to_release); + } else { + return SegmentManager::release_ertr::now(); + } + }).handle_error( submit_transaction_ertr::pass_further{}, crimson::ct_error::all_same_way([](auto e) { ceph_assert(0 == "Hit error submitting to journal"); diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index 4a93a6b156d2c..4ddb191aae3de 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -235,6 +235,21 @@ public: laddr_t laddr, segment_off_t len) final; + using scan_extents_ret = + SegmentCleaner::ExtentCallbackInterface::scan_extents_ret; + scan_extents_ret scan_extents( + paddr_t addr, + extent_len_t bytes_to_read) final { + return journal.scan_extents(addr, bytes_to_read); + } + + using release_segment_ret = + SegmentCleaner::ExtentCallbackInterface::release_segment_ret; + release_segment_ret release_segment( + segment_id_t id) { + return segment_manager.release(id); + } + ~TransactionManager(); private: -- 2.39.5