From 972af821bbf6dbea54db7abaa37a8dcb733a0eaf Mon Sep 17 00:00:00 2001 From: =?utf8?q?Rados=C5=82aw=20Zarzy=C5=84ski?= Date: Tue, 6 Dec 2022 22:20:15 +0100 Subject: [PATCH] crimson/osd: bring SnapTrimObjSubEvent Signed-off-by: Radoslaw Zarzynski --- src/crimson/osd/osd_operation.h | 2 + .../osd/osd_operation_external_tracking.h | 7 + .../osd/osd_operations/common/pg_pipeline.h | 1 + .../osd/osd_operations/snaptrim_event.cc | 345 ++++++++++++++++++ .../osd/osd_operations/snaptrim_event.h | 61 ++++ src/crimson/osd/pg.h | 1 + 6 files changed, 417 insertions(+) diff --git a/src/crimson/osd/osd_operation.h b/src/crimson/osd/osd_operation.h index 181276b07acc4..8ef44ee9e7894 100644 --- a/src/crimson/osd/osd_operation.h +++ b/src/crimson/osd/osd_operation.h @@ -46,6 +46,7 @@ enum class OperationTypeCode { logmissing_request, logmissing_request_reply, snaptrim_event, + snaptrimobj_subevent, last_op }; @@ -62,6 +63,7 @@ static constexpr const char* const OP_NAMES[] = { "logmissing_request", "logmissing_request_reply", "snaptrim_event", + "snaptrimobj_subevent", }; // prevent the addition of OperationTypeCode-s with no matching OP_NAMES entry: diff --git a/src/crimson/osd/osd_operation_external_tracking.h b/src/crimson/osd/osd_operation_external_tracking.h index 2e0cbd01c92a2..4b6dbf4b71007 100644 --- a/src/crimson/osd/osd_operation_external_tracking.h +++ b/src/crimson/osd/osd_operation_external_tracking.h @@ -297,4 +297,11 @@ struct EventBackendRegistry { } }; +template <> +struct EventBackendRegistry { + static std::tuple<> get_backends() { + return {}; + } +}; + } // namespace crimson diff --git a/src/crimson/osd/osd_operations/common/pg_pipeline.h b/src/crimson/osd/osd_operations/common/pg_pipeline.h index a5c0c9fbcb3d3..58fa07b8b4d25 100644 --- a/src/crimson/osd/osd_operations/common/pg_pipeline.h +++ b/src/crimson/osd/osd_operations/common/pg_pipeline.h @@ -12,6 +12,7 @@ class CommonPGPipeline { protected: friend class InternalClientRequest; friend class SnapTrimEvent; + friend class SnapTrimObjSubEvent; struct WaitForActive : OrderedExclusivePhaseT { static constexpr auto type_name = "CommonPGPipeline:::wait_for_active"; diff --git a/src/crimson/osd/osd_operations/snaptrim_event.cc b/src/crimson/osd/osd_operations/snaptrim_event.cc index c3ad65257f6f5..dd089b8747cdb 100644 --- a/src/crimson/osd/osd_operations/snaptrim_event.cc +++ b/src/crimson/osd/osd_operations/snaptrim_event.cc @@ -5,6 +5,7 @@ #include "crimson/osd/osd_operations/snaptrim_event.h" #include "crimson/osd/pg.h" +#include "include/expected.hpp" namespace { seastar::logger& logger() { @@ -19,6 +20,13 @@ namespace crimson { return {}; } }; + + template <> + struct EventBackendRegistry { + static std::tuple<> get_backends() { + return {}; + } + }; } namespace crimson::osd { @@ -136,6 +144,11 @@ seastar::future SnapTrimEvent::with_pg( } for (const auto& object : to_trim) { logger().debug("{}: trimming {}", *this, object); + auto [op, fut] = shard_services.start_operation( + pg, + object, + snapid); + subop_blocker.emplace_back(op->get_id(), std::move(fut)); } return subop_blocker.wait_completion().then([] { return seastar::make_ready_future( @@ -151,4 +164,336 @@ seastar::future SnapTrimEvent::with_pg( }, pg); } + +CommonPGPipeline& SnapTrimObjSubEvent::pp() +{ + return pg->request_pg_pipeline; +} + +seastar::future<> SnapTrimObjSubEvent::start() +{ + logger().debug("{}: start", *this); + + IRef ref = this; + auto maybe_delay = seastar::now(); + if (auto delay = 0; delay) { + maybe_delay = seastar::sleep( + std::chrono::milliseconds(std::lround(delay * 1000))); + } + return maybe_delay.then([this] { + return with_pg(pg->get_shard_services(), pg); + }).finally([ref=std::move(ref), this] { + logger().debug("{}: complete", *ref); + return handle.complete(); + }); +} + +tl::expected +SnapTrimObjSubEvent::remove_or_update( + ObjectContextRef obc, + ObjectContextRef head_obc) +{ + ceph::os::Transaction txn{}; + std::vector log_entries{}; + + SnapSet& snapset = obc->ssc->snapset; + auto citer = snapset.clone_snaps.find(coid.snap); + if (citer == snapset.clone_snaps.end()) { + logger().error("{}: No clone_snaps in snapset {} for object {}", + *this, snapset, coid); + return tl::unexpected{-ENOENT}; + } + const auto& old_snaps = citer->second; + if (old_snaps.empty()) { + logger().error("{}: no object info snaps for object {}", + *this, coid); + return tl::unexpected{-ENOENT}; + } + if (snapset.seq == 0) { + logger().error("{}: no snapset.seq for object {}", + *this, coid); + return tl::unexpected{-ENOENT}; + } + const OSDMapRef& osdmap = pg->get_osdmap(); + std::set new_snaps; + for (const auto& old_snap : old_snaps) { + if (!osdmap->in_removed_snaps_queue(pg->get_info().pgid.pgid.pool(), + old_snap) + && old_snap != snap_to_trim) { + new_snaps.insert(old_snap); + } + } + + std::vector::iterator p = snapset.clones.end(); + if (new_snaps.empty()) { + p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap); + if (p == snapset.clones.end()) { + logger().error("{}: Snap {} not in clones", + *this, coid.snap); + return tl::unexpected{-ENOENT}; + } + } + int64_t num_objects_before_trim = delta_stats.num_objects; + osd_op_p.at_version = pg->next_version(); + object_info_t &coi = obc->obs.oi; + if (new_snaps.empty()) { + // remove clone + logger().info("{}: {} snaps {} -> {} ... deleting", + *this, coid, old_snaps, new_snaps); + + // ...from snapset + assert(p != snapset.clones.end()); + + snapid_t last = coid.snap; + delta_stats.num_bytes -= snapset.get_clone_bytes(last); + + if (p != snapset.clones.begin()) { + // not the oldest... merge overlap into next older clone + std::vector::iterator n = p - 1; + hobject_t prev_coid = coid; + prev_coid.snap = *n; + + // does the classical OSD really need is_present_clone(prev_coid)? + delta_stats.num_bytes -= snapset.get_clone_bytes(*n); + snapset.clone_overlap[*n].intersection_of( + snapset.clone_overlap[*p]); + delta_stats.num_bytes += snapset.get_clone_bytes(*n); + } + delta_stats.num_objects--; + if (coi.is_dirty()) { + delta_stats.num_objects_dirty--; + } + if (coi.is_omap()) { + delta_stats.num_objects_omap--; + } + if (coi.is_whiteout()) { + logger().debug("{}: trimming whiteout on {}", + *this, coid); + delta_stats.num_whiteouts--; + } + delta_stats.num_object_clones--; + + obc->obs.exists = false; + + snapset.clones.erase(p); + snapset.clone_overlap.erase(last); + snapset.clone_size.erase(last); + snapset.clone_snaps.erase(last); + + log_entries.emplace_back( + pg_log_entry_t{ + pg_log_entry_t::DELETE, + coid, + osd_op_p.at_version, + coi.version, + 0, + osd_reqid_t(), + coi.mtime, // will be replaced in `apply_to()` + 0} + ); + txn.remove( + pg->get_collection_ref()->get_cid(), + ghobject_t{coid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD}); + + coi = object_info_t(coid); + + } else { + // save adjusted snaps for this object + logger().info("{}: {} snaps {} -> {}", + *this, coid, old_snaps, new_snaps); + snapset.clone_snaps[coid.snap] = + std::vector(new_snaps.rbegin(), new_snaps.rend()); + // we still do a 'modify' event on this object just to trigger a + // snapmapper.update ... :( + + coi.prior_version = coi.version; + coi.version = osd_op_p.at_version; + ceph::bufferlist bl; + encode(coi, bl, pg->get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + txn.setattr( + pg->get_collection_ref()->get_cid(), + ghobject_t{coid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD}, + OI_ATTR, + bl); + log_entries.emplace_back( + pg_log_entry_t{ + pg_log_entry_t::MODIFY, + coid, + coi.version, + coi.prior_version, + 0, + osd_reqid_t(), + coi.mtime, + 0} + ); + } + + osd_op_p.at_version = pg->next_version(); + + // save head snapset + logger().debug("{}: {} new snapset {} on {}", + *this, coid, snapset, head_obc->obs.oi); + const auto head_oid = coid.get_head(); + if (snapset.clones.empty() && head_obc->obs.oi.is_whiteout()) { + // NOTE: this arguably constitutes minor interference with the + // tiering agent if this is a cache tier since a snap trim event + // is effectively evicting a whiteout we might otherwise want to + // keep around. + logger().info("{}: {} removing {}", + *this, coid, head_oid); + log_entries.emplace_back( + pg_log_entry_t{ + pg_log_entry_t::DELETE, + head_oid, + osd_op_p.at_version, + head_obc->obs.oi.version, + 0, + osd_reqid_t(), + coi.mtime, // will be replaced in `apply_to()` + 0} + ); + logger().info("{}: remove snap head", *this); + object_info_t& oi = head_obc->obs.oi; + delta_stats.num_objects--; + if (oi.is_dirty()) { + delta_stats.num_objects_dirty--; + } + if (oi.is_omap()) { + delta_stats.num_objects_omap--; + } + if (oi.is_whiteout()) { + logger().debug("{}: trimming whiteout on {}", + *this, oi.soid); + delta_stats.num_whiteouts--; + } + head_obc->obs.exists = false; + head_obc->obs.oi = object_info_t(head_oid); + txn.remove(pg->get_collection_ref()->get_cid(), + ghobject_t{head_oid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD}); + } else { + snapset.snaps.clear(); + logger().info("{}: writing updated snapset on {}, snapset is {}", + *this, head_oid, snapset); + log_entries.emplace_back( + pg_log_entry_t{ + pg_log_entry_t::MODIFY, + head_oid, + osd_op_p.at_version, + head_obc->obs.oi.version, + 0, + osd_reqid_t(), + coi.mtime, + 0} + ); + + head_obc->obs.oi.prior_version = head_obc->obs.oi.version; + head_obc->obs.oi.version = osd_op_p.at_version; + + std::map> attrs; + ceph::bufferlist bl; + encode(snapset, bl); + attrs[SS_ATTR] = std::move(bl); + + bl.clear(); + encode(head_obc->obs.oi, bl, + pg->get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + attrs[OI_ATTR] = std::move(bl); + txn.setattrs( + pg->get_collection_ref()->get_cid(), + ghobject_t{head_oid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD}, + attrs); + } + + // Stats reporting - Set number of objects trimmed + if (num_objects_before_trim > delta_stats.num_objects) { + int64_t num_objects_trimmed = + num_objects_before_trim - delta_stats.num_objects; + //add_objects_trimmed_count(num_objects_trimmed); + } + return std::make_pair(std::move(txn), std::move(log_entries)); +} + +seastar::future<> SnapTrimObjSubEvent::with_pg( + ShardServices &shard_services, Ref _pg) +{ + return interruptor::with_interruption([this] { + return enter_stage( + pp().wait_for_active + ).then_interruptible([this] { + return with_blocking_event([this] (auto&& trigger) { + return pg->wait_for_active_blocker.wait(std::move(trigger)); + }); + }).then_interruptible([this] { + return enter_stage( + pp().recover_missing); + }).then_interruptible([this] { + //return do_recover_missing(pg, get_target_oid()); + return seastar::now(); + }).then_interruptible([this] { + return enter_stage( + pp().get_obc); + }).then_interruptible([this] { + logger().debug("{}: getting obc for {}", *this, coid); + // end of commonality + // with_cone_obc lock both clone's and head's obcs + return pg->obc_loader.with_clone_obc(coid, [this](auto clone_obc) { + logger().debug("{}: got clone_obc={}", *this, clone_obc); + return enter_stage( + pp().process + ).then_interruptible([this, clone_obc=std::move(clone_obc)]() mutable { + logger().debug("{}: processing clone_obc={}", *this, clone_obc); + auto head_obc = clone_obc->head; + return interruptor::async([=, this]() mutable { + if (auto ret = remove_or_update(clone_obc, head_obc); + !ret.has_value()) { + logger().error("{}: trimmig error {}", + *this, ret.error()); + //pg->state_set(PG_STATE_SNAPTRIM_ERROR); + } else { + auto [txn, log_entries] = std::move(ret).value(); + auto [submitted, all_completed] = pg->submit_transaction( + std::move(clone_obc), + std::move(txn), + std::move(osd_op_p), + std::move(log_entries)); + submitted.get(); + all_completed.get(); + } + }).then_interruptible([this] { + return enter_stage( + wait_repop + ); + }).then_interruptible([this, clone_obc=std::move(clone_obc)] { + return PG::load_obc_iertr::now(); + }); + }); + }).handle_error_interruptible(PG::load_obc_ertr::all_same_way([] { + return seastar::now(); + })); + }).then_interruptible([] { + // end of commonality + return seastar::now(); + }); + }, [this](std::exception_ptr eptr) { + // TODO: better debug output + logger().debug("{}: interrupted {}", *this, eptr); + }, pg); +} + +void SnapTrimObjSubEvent::print(std::ostream &lhs) const +{ + lhs << "SnapTrimObjSubEvent(" + << "coid=" << coid + << " snapid=" << snap_to_trim + << ")"; +} + +void SnapTrimObjSubEvent::dump_detail(Formatter *f) const +{ + f->open_object_section("SnapTrimObjSubEvent"); + f->dump_stream("coid") << coid; + f->close_section(); +} + } // namespace crimson::osd diff --git a/src/crimson/osd/osd_operations/snaptrim_event.h b/src/crimson/osd/osd_operations/snaptrim_event.h index bfda550f99962..4ec54af66a390 100644 --- a/src/crimson/osd/osd_operations/snaptrim_event.h +++ b/src/crimson/osd/osd_operations/snaptrim_event.h @@ -80,8 +80,69 @@ public: > tracking_events; }; +// remove single object. a SnapTrimEvent can create multiple subrequests. +// the division of labour is needed because of the restriction that an Op +// cannot revisite a pipeline's stage it already saw. +class SnapTrimObjSubEvent : public PhasedOperationT { +public: + static constexpr OperationTypeCode type = + OperationTypeCode::snaptrimobj_subevent; + + SnapTrimObjSubEvent( + Ref pg, + const hobject_t& coid, + snapid_t snap_to_trim) + : pg(std::move(pg)), + coid(coid), + snap_to_trim(snap_to_trim) { + } + + void print(std::ostream &) const final; + void dump_detail(ceph::Formatter* f) const final; + seastar::future<> start(); + seastar::future<> with_pg( + ShardServices &shard_services, Ref pg); + + CommonPGPipeline& pp(); + +private: + object_stat_sum_t delta_stats; + + using remove_or_update_ret_t = + std::pair>; + tl::expected + remove_or_update(ObjectContextRef obc, ObjectContextRef head_obc); + + // we don't need to synchronize with other instances started by + // SnapTrimEvent; it's here for the sake of op tracking. + struct WaitRepop : OrderedConcurrentPhaseT { + static constexpr auto type_name = "SnapTrimObjSubEvent::wait_repop"; + } wait_repop; + + Ref pg; + PipelineHandle handle; + osd_op_params_t osd_op_p; + const hobject_t coid; + const snapid_t snap_to_trim; + +public: + PipelineHandle& get_handle() { return handle; } + + std::tuple< + StartEvent, + CommonPGPipeline::WaitForActive::BlockingEvent, + PGActivationBlocker::BlockingEvent, + CommonPGPipeline::RecoverMissing::BlockingEvent, + CommonPGPipeline::GetOBC::BlockingEvent, + CommonPGPipeline::Process::BlockingEvent, + WaitRepop::BlockingEvent, + CompletionEvent + > tracking_events; +}; + } // namespace crimson::osd #if FMT_VERSION >= 90000 template <> struct fmt::formatter : fmt::ostream_formatter {}; +template <> struct fmt::formatter : fmt::ostream_formatter {}; #endif diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index f8be8e1cb5ed2..48e4d1fd210c7 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -721,6 +721,7 @@ private: friend class InternalClientRequest; friend class WatchTimeoutRequest; friend class SnapTrimEvent; + friend class SnapTrimObjSubEvent; private: seastar::future find_unfound() { return seastar::make_ready_future(true); -- 2.39.5