From ab8609c88eca28622ed3c282a3ccc72f6e3641cc Mon Sep 17 00:00:00 2001 From: Matan Breizman Date: Sun, 4 May 2025 14:22:38 +0000 Subject: [PATCH] crimson/osd: Logging fixes * Fix "failed to log message" * PGRecovery move to new logging macro * PGRecovery to print pg prefix as it's impossible to debug specific pg recovery ops without it. Signed-off-by: Matan Breizman (cherry picked from commit 314d2e7dddcd01b3e239561ecc3281f274d7012f) --- src/crimson/osd/backfill_state.cc | 6 +- src/crimson/osd/pg.cc | 2 +- src/crimson/osd/pg.h | 2 +- src/crimson/osd/pg_recovery.cc | 144 ++++++++++++++++-------------- 4 files changed, 81 insertions(+), 73 deletions(-) diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc index 38373929b96b3..5c4fb7e7ee77f 100644 --- a/src/crimson/osd/backfill_state.cc +++ b/src/crimson/osd/backfill_state.cc @@ -122,7 +122,7 @@ void BackfillState::Enqueuing::maybe_update_range() ceph_assert(primary_bi.version == eversion_t()); return; } - DEBUGDPP("{}: bi is old, ({}) can be updated with log to {}", + DEBUGDPP("bi is old, ({}) can be updated with log to {}", pg(), primary_bi.version, pg().get_projected_last_update()); @@ -183,9 +183,9 @@ void BackfillState::Enqueuing::maybe_update_range() } } }; - DEBUGDPP("{}: scanning pg log first", pg()); + DEBUGDPP("scanning pg log first", pg()); peering_state().scan_log_after(primary_bi.version, func); - DEBUGDPP("{}: scanning projected log", pg()); + DEBUGDPP("scanning projected log", pg()); pg().get_projected_log().scan_log_after(primary_bi.version, func); primary_bi.version = pg().get_projected_last_update(); } else { diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index d84eb061052fc..961f92a133050 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -630,7 +630,7 @@ void PG::on_active_actmap() publish_stats_to_osd(); }); } else { - logger().debug("{}: pg not clean, skipping snap trim"); + logger().debug("pg not clean, skipping snap trim"); 
ceph_assert(!peering_state.state_test(PG_STATE_SNAPTRIM)); } } diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index 2129575fcfde3..6384f93f72d5b 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -309,7 +309,7 @@ public: LOG_PREFIX(PG::request_remote_recovery_reservation); SUBDEBUGDPP( osd, "priority {} on_grant {} on_preempt {}", - *this, on_grant->get_desc(), on_preempt->get_desc()); + *this, priority, on_grant->get_desc(), on_preempt->get_desc()); shard_services.remote_request_reservation( orderer, pgid, diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc index fd6c1097dffbc..2adbd33443f24 100644 --- a/src/crimson/osd/pg_recovery.cc +++ b/src/crimson/osd/pg_recovery.cc @@ -20,12 +20,6 @@ SET_SUBSYS(osd); -namespace { - seastar::logger& logger() { - return crimson::get_logger(ceph_subsys_osd); - } -} - using std::map; using std::set; using PglogBasedRecovery = crimson::osd::PglogBasedRecovery; @@ -47,13 +41,14 @@ PGRecovery::start_recovery_ops( size_t max_to_start) { LOG_PREFIX(PGRecovery::start_recovery_ops); + DEBUGDPP("{} max {}", pg->get_dpp(), recover_op, max_to_start); assert(pg->is_primary()); assert(pg->is_peered()); if (pg->has_reset_since(recover_op.get_epoch_started()) || recover_op.is_cancelled()) { - DEBUGDPP("recovery {} cancelled.", pg->get_pgid(), recover_op); + DEBUGDPP("recovery {} cancelled.", pg->get_dpp(), recover_op); co_return seastar::stop_iteration::yes; } ceph_assert(pg->is_recovering()); @@ -79,7 +74,7 @@ PGRecovery::start_recovery_ops( //TODO: maybe we should implement a recovery race interruptor in the future if (pg->has_reset_since(recover_op.get_epoch_started()) || recover_op.is_cancelled()) { - DEBUGDPP("recovery {} cancelled.", pg->get_pgid(), recover_op); + DEBUGDPP("recovery {} cancelled.", pg->get_dpp(), recover_op); co_return seastar::stop_iteration::yes; } ceph_assert(pg->is_recovering()); @@ -106,6 +101,7 @@ size_t PGRecovery::start_primary_recovery_ops( size_t max_to_start, 
std::vector> *out) { + LOG_PREFIX(PGRecovery::start_primary_recovery_ops); if (!pg->is_recovering()) { return 0; } @@ -117,10 +113,11 @@ size_t PGRecovery::start_primary_recovery_ops( const auto &missing = pg->get_peering_state().get_pg_log().get_missing(); - logger().info("{} recovering {} in pg {}, missing {}", __func__, - pg->get_recovery_backend()->total_recovering(), - *static_cast(pg), - missing); + DEBUGDPP("recovering {} in pg {}, missing {}", + pg->get_dpp(), + pg->get_recovery_backend()->total_recovering(), + *static_cast(pg), + missing); unsigned started = 0; int skipped = 0; @@ -145,13 +142,13 @@ size_t PGRecovery::start_primary_recovery_ops( hobject_t head = soid.get_head(); if (pg->get_peering_state().get_missing_loc().is_unfound(soid)) { - logger().debug("{}: object {} unfound", __func__, soid); + DEBUGDPP("object {} unfound", pg->get_dpp(), soid); ++skipped; ++p; continue; } if (pg->get_peering_state().get_missing_loc().is_unfound(head)) { - logger().debug("{}: head object {} unfound", __func__, soid); + DEBUGDPP("head object {} unfound", pg->get_dpp(), soid); ++skipped; ++p; continue; @@ -161,9 +158,9 @@ size_t PGRecovery::start_primary_recovery_ops( ++p; bool head_missing = missing.is_missing(head); - logger().info( - "{} {} item.need {} {} {} {} {}", - __func__, + DEBUGDPP( + "{} item.need {} {} {} {} {}", + pg->get_dpp(), soid, item.need, missing.is_missing(soid) ? 
" (missing)":"", @@ -195,7 +192,7 @@ size_t PGRecovery::start_primary_recovery_ops( pg->get_peering_state().set_last_requested(v); } - logger().info("{} started {} skipped {}", __func__, started, skipped); + DEBUGDPP("started {} skipped {}", pg->get_dpp(), started, skipped); return started; } @@ -205,6 +202,7 @@ size_t PGRecovery::start_replica_recovery_ops( size_t max_to_start, std::vector> *out) { + LOG_PREFIX(PGRecovery::start_replica_recovery_ops); if (!pg->is_recovering()) { return 0; } @@ -217,10 +215,8 @@ size_t PGRecovery::start_replica_recovery_ops( assert(peer != pg->get_peering_state().get_primary()); const auto& pm = pg->get_peering_state().get_peer_missing(peer); - logger().debug("{}: peer osd.{} missing {} objects", __func__, - peer, pm.num_missing()); - logger().trace("{}: peer osd.{} missing {}", __func__, - peer, pm.get_items()); + DEBUGDPP("peer osd.{} missing {} objects", pg->get_dpp(), peer, pm.num_missing()); + TRACEDPP("peer osd.{} missing {}", pg->get_dpp(), peer, pm.get_items()); // recover oldest first for (auto p = pm.get_rmissing().begin(); @@ -229,26 +225,26 @@ size_t PGRecovery::start_replica_recovery_ops( const auto &soid = p->second; if (pg->get_peering_state().get_missing_loc().is_unfound(soid)) { - logger().debug("{}: object {} still unfound", __func__, soid); + DEBUGDPP("object {} still unfound", pg->get_dpp(), soid); continue; } const pg_info_t &pi = pg->get_peering_state().get_peer_info(peer); if (soid > pi.last_backfill) { if (!pg->get_recovery_backend()->is_recovering(soid)) { - logger().error( - "{}: object {} in missing set for backfill (last_backfill {})" + ERRORDPP( + "object {} in missing set for backfill (last_backfill {})" " but not in recovering", - __func__, + pg->get_dpp(), soid, pi.last_backfill); - ceph_abort(); + ceph_abort_msg("start_replica_recovery_ops error"); } continue; } if (pg->get_recovery_backend()->is_recovering(soid)) { - logger().debug("{}: already recovering object {}", __func__, soid); + 
DEBUGDPP("already recovering object {}", pg->get_dpp(), soid); auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid); out->emplace_back(recovery_waiter.wait_for_recovered(trigger)); started++; @@ -256,7 +252,7 @@ size_t PGRecovery::start_replica_recovery_ops( } if (pg->get_peering_state().get_missing_loc().is_deleted(soid)) { - logger().debug("{}: soid {} is a delete, removing", __func__, soid); + DEBUGDPP("soid {} is a delete, removing", pg->get_dpp(), soid); map::const_iterator r = pm.get_items().find(soid); started++; @@ -268,17 +264,16 @@ size_t PGRecovery::start_replica_recovery_ops( if (soid.is_snap() && pg->get_peering_state().get_pg_log().get_missing().is_missing( soid.get_head())) { - logger().debug("{}: head {} still missing on primary", __func__, - soid.get_head()); + DEBUGDPP("head {} still missing on primary", pg->get_dpp(), soid.get_head()); continue; } if (pg->get_peering_state().get_pg_log().get_missing().is_missing(soid)) { - logger().debug("{}: soid {} still missing on primary", __func__, soid); + DEBUGDPP("soid {} still missing on primary", pg->get_dpp(), soid); continue; } - logger().debug("{}: recover_object_replicas({})", __func__,soid); + DEBUGDPP("recover_object_replicas({})", pg->get_dpp(), soid); map::const_iterator r = pm.get_items().find( soid); started++; @@ -297,10 +292,11 @@ PGRecovery::recover_missing( eversion_t need, bool with_throttle) { - logger().info("{} {} v {}", __func__, soid, need); + LOG_PREFIX(PGRecovery::recover_missing); + DEBUGDPP("{} v {}", pg->get_dpp(), soid, need); auto [recovering, added] = pg->get_recovery_backend()->add_recovering(soid); if (added) { - logger().info("{} {} v {}, new recovery", __func__, soid, need); + DEBUGDPP("{} v {}, new recovery", pg->get_dpp(), soid, need); if (pg->get_peering_state().get_missing_loc().is_deleted(soid)) { return recovering.wait_track_blocking( trigger, @@ -328,10 +324,11 @@ RecoveryBackend::interruptible_future<> PGRecovery::prep_object_replica_deletes( 
const hobject_t& soid, eversion_t need) { - logger().info("{} {} v {}", __func__, soid, need); + LOG_PREFIX(PGRecovery::prep_object_replica_deletes); + DEBUGDPP("soid {} need v {}", pg->get_dpp(), soid, need); auto [recovering, added] = pg->get_recovery_backend()->add_recovering(soid); if (added) { - logger().info("{} {} v {}, new recovery", __func__, soid, need); + DEBUGDPP("soid {} v {}, new recovery", pg->get_dpp(), soid, need); return recovering.wait_track_blocking( trigger, pg->get_recovery_backend()->push_delete(soid, need).then_interruptible( @@ -352,10 +349,11 @@ RecoveryBackend::interruptible_future<> PGRecovery::prep_object_replica_pushes( const hobject_t& soid, eversion_t need) { - logger().info("{} {} v {}", __func__, soid, need); + LOG_PREFIX(PGRecovery::prep_object_replica_pushes); + DEBUGDPP("soid {} v {}", pg->get_dpp(), soid, need); auto [recovering, added] = pg->get_recovery_backend()->add_recovering(soid); if (added) { - logger().info("{} {} v {}, new recovery", __func__, soid, need); + DEBUGDPP("soid {} v {}, new recovery", pg->get_dpp(), soid, need); return recovering.wait_track_blocking( trigger, recover_object_with_throttle(soid, need) @@ -377,6 +375,8 @@ PGRecovery::on_local_recover( const bool is_delete, ceph::os::Transaction& t) { + LOG_PREFIX(PGRecovery::on_local_recover); + DEBUGDPP("{}", pg->get_dpp(), soid); if (const auto &log = pg->get_peering_state().get_pg_log(); !is_delete && log.get_missing().is_missing(recovery_info.soid) && @@ -384,7 +384,7 @@ PGRecovery::on_local_recover( assert(pg->is_primary()); if (const auto* latest = log.get_log().objects.find(recovery_info.soid)->second; latest->op == pg_log_entry_t::LOST_REVERT) { - ceph_abort("mark_unfound_lost (LOST_REVERT) is not implemented yet"); + ceph_abort_msg("mark_unfound_lost (LOST_REVERT) is not implemented yet"); } } @@ -426,7 +426,8 @@ void PGRecovery::on_global_recover ( const object_stat_sum_t& stat_diff, const bool is_delete) { - logger().info("{} {}", __func__, soid); 
+ LOG_PREFIX(PGRecovery::on_global_recover); + DEBUGDPP("{}", pg->get_dpp(), soid); pg->get_peering_state().object_recovered(soid, stat_diff); pg->publish_stats_to_osd(); auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid); @@ -452,21 +453,19 @@ void PGRecovery::on_peer_recover( const hobject_t &oid, const ObjectRecoveryInfo &recovery_info) { - crimson::get_logger(ceph_subsys_osd).debug( - "{}: {}, {} on {}", __func__, oid, - recovery_info.version, peer); + LOG_PREFIX(PGRecovery::on_peer_recover); + DEBUGDPP("{}, {} on {}", pg->get_dpp(), oid, recovery_info.version, peer); pg->get_peering_state().on_peer_recover(peer, oid, recovery_info.version); } void PGRecovery::_committed_pushed_object(epoch_t epoch, eversion_t last_complete) { + LOG_PREFIX(PGRecovery::_committed_pushed_object); if (!pg->has_reset_since(epoch)) { pg->get_peering_state().recovery_committed_to(last_complete); } else { - crimson::get_logger(ceph_subsys_osd).debug( - "{} pg has changed, not touching last_complete_ondisk", - __func__); + DEBUGDPP("pg has changed, not touching last_complete_ondisk", pg->get_dpp()); } } @@ -475,7 +474,8 @@ void PGRecovery::request_replica_scan( const hobject_t& begin, const hobject_t& end) { - logger().debug("{}: target.osd={}", __func__, target.osd); + LOG_PREFIX(PGRecovery::request_replica_scan); + DEBUGDPP("target.osd={}", pg->get_dpp(), target.osd); auto msg = crimson::make_message( MOSDPGScan::OP_SCAN_GET_DIGEST, pg->get_pg_whoami(), @@ -493,7 +493,8 @@ void PGRecovery::request_replica_scan( void PGRecovery::request_primary_scan( const hobject_t& begin) { - logger().debug("{}", __func__); + LOG_PREFIX(PGRecovery::request_primary_scan); + DEBUGDPP("begin {}", pg->get_dpp(), begin); using crimson::common::local_conf; std::ignore = pg->get_recovery_backend()->scan_for_backfill_primary( begin, @@ -501,8 +502,7 @@ void PGRecovery::request_primary_scan( local_conf()->osd_backfill_scan_max, pg->get_peering_state().get_backfill_targets() 
).then_interruptible([this] (PrimaryBackfillInterval bi) { - logger().debug("request_primary_scan:{}", __func__); - using BackfillState = crimson::osd::BackfillState; + using BackfillState = crimson::osd::BackfillState; backfill_state->process_event( BackfillState::PrimaryScanned{ std::move(bi) }.intrusive_from_this()); }); @@ -530,7 +530,8 @@ void PGRecovery::enqueue_push( const eversion_t& v, const std::vector &peers) { - logger().info("{}: obj={} v={} peers={}", __func__, obj, v, peers); + LOG_PREFIX(PGRecovery::enqueue_push); + DEBUGDPP("obj={} v={} peers={}", pg->get_dpp(), obj, v, peers); auto &peering_state = pg->get_peering_state(); auto [recovering, added] = pg->get_recovery_backend()->add_recovering(obj); if (!added) @@ -540,8 +541,9 @@ void PGRecovery::enqueue_push( handle_exception_interruptible([] (auto) { ceph_abort_msg("got exception on backfill's push"); return seastar::make_ready_future<>(); - }).then_interruptible([this, obj] { - logger().debug("enqueue_push:{}", __func__); + }).then_interruptible([this, FNAME, obj] { + + DEBUGDPP("enqueue_push", pg->get_dpp()); using BackfillState = crimson::osd::BackfillState; if (backfill_state->is_triggered()) { backfill_state->post_event( @@ -558,6 +560,8 @@ void PGRecovery::enqueue_drop( const hobject_t& obj, const eversion_t& v) { + LOG_PREFIX(PGRecovery::enqueue_drop); + DEBUGDPP("obj={} v={} target={}", pg->get_dpp(), obj, v, target); // allocate a pair if target is seen for the first time auto& req = backfill_drop_requests[target]; if (!req) { @@ -581,8 +585,8 @@ void PGRecovery::maybe_flush() void PGRecovery::update_peers_last_backfill( const hobject_t& new_last_backfill) { - logger().debug("{}: new_last_backfill={}", - __func__, new_last_backfill); + LOG_PREFIX(PGRecovery::update_peers_last_backfill); + DEBUGDPP("new_last_backfill={}", pg->get_dpp(), new_last_backfill); // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to // all the backfill targets. 
Otherwise, we will move last_backfill up on // those targets need it and send OP_BACKFILL_PROGRESS to them. @@ -603,11 +607,11 @@ void PGRecovery::update_peers_last_backfill( m->stats = pinfo.stats; std::ignore = pg->get_shard_services().send_to_osd( bt.osd, std::move(m), pg->get_osdmap_epoch()); - logger().info("{}: peer {} num_objects now {} / {}", - __func__, - bt, - pinfo.stats.stats.sum.num_objects, - pg->get_info().stats.stats.sum.num_objects); + DEBUGDPP("peer {} num_objects now {} / {}", + pg->get_dpp(), + bt, + pinfo.stats.stats.sum.num_objects, + pg->get_info().stats.stats.sum.num_objects); } } } @@ -636,14 +640,14 @@ void PGRecovery::start_peering_event_operation_listener(T &&evt, float delay) { void PGRecovery::backfilled() { LOG_PREFIX(PGRecovery::backfilled); - DEBUGDPP("", pg->get_pgid()); + DEBUGDPP("", pg->get_dpp()); start_peering_event_operation_listener(PeeringState::Backfilled()); } void PGRecovery::request_backfill() { LOG_PREFIX(PGRecovery::request_backfill); - DEBUGDPP("", pg->get_pgid()); + DEBUGDPP("", pg->get_dpp()); start_peering_event_operation_listener(PeeringState::RequestBackfill()); } @@ -651,12 +655,14 @@ void PGRecovery::request_backfill() void PGRecovery::all_replicas_recovered() { LOG_PREFIX(PGRecovery::all_replicas_recovered); - DEBUGDPP("", pg->get_pgid()); + DEBUGDPP("", pg->get_dpp()); start_peering_event_operation_listener(PeeringState::AllReplicasRecovered()); } void PGRecovery::backfill_suspended() { + LOG_PREFIX(PGRecovery::backfill_suspended); + DEBUGDPP("", pg->get_dpp()); using BackfillState = crimson::osd::BackfillState; backfill_state->process_event( BackfillState::SuspendBackfill{}.intrusive_from_this()); @@ -665,7 +671,8 @@ void PGRecovery::backfill_suspended() void PGRecovery::dispatch_backfill_event( boost::intrusive_ptr evt) { - logger().debug("{}", __func__); + LOG_PREFIX(PGRecovery::dispatch_backfill_event); + DEBUGDPP("", pg->get_dpp()); assert(backfill_state); backfill_state->process_event(evt); // TODO: Do 
we need to worry about cases in which the pg has @@ -677,14 +684,15 @@ void PGRecovery::dispatch_backfill_event( void PGRecovery::on_activate_complete() { - logger().debug("{} backfill_state={}", - __func__, fmt::ptr(backfill_state.get())); + LOG_PREFIX(PGRecovery::on_activate_complete); + DEBUGDPP("backfill_state={}", pg->get_dpp(), fmt::ptr(backfill_state.get())); backfill_state.reset(); } void PGRecovery::on_backfill_reserved() { - logger().debug("{}", __func__); + LOG_PREFIX(PGRecovery::on_backfill_reserved); + DEBUGDPP("", pg->get_dpp()); ceph_assert(pg->get_peering_state().is_backfilling()); // let's be lazy with creating the backfill stuff using BackfillState = crimson::osd::BackfillState; -- 2.39.5