From f827e4dafc593fc6ef6cee2ccc4d311019ad14e9 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Thu, 30 Apr 2026 08:50:08 +0000 Subject: [PATCH] osd/scrub: auto-correct accounting-only stat mismatches MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit When scrub detects a PG stats mismatch but no object-level inconsistencies (all replicas agree on actual data), fix the stats in place rather than reporting a scrub error. Previously, a pure stat mismatch would log [ERR], increment shallow_errors, and trigger OSD_SCRUB_ERRORS / PG_STATE_INCONSISTENT health alerts — yet leave the stats unfixed unless a repair scrub was manually initiated. The scrubber's own object count is authoritative in this case. Persistence of the corrected stats is deferred until the next transaction that sets dirty_info, consistent with the existing stats_invalid repair path. Signed-off-by: Ronen Friedman --- src/osd/PeeringState.cc | 9 ++++++ src/osd/scrubber/PrimaryLogScrub.cc | 49 +++++++++++++++++++++++++++-- 2 files changed, 55 insertions(+), 3 deletions(-) diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc index bcd56504943..a7956970971 100644 --- a/src/osd/PeeringState.cc +++ b/src/osd/PeeringState.cc @@ -5077,6 +5077,15 @@ void PeeringState::apply_op_stats( const hobject_t &soid, const object_stat_sum_t &delta_stats) { + psdout(20) << fmt::format( + "apply_op_stats {} d.objs={} d.clns={} d.bytes={}" + " -> objs={} clns={} bytes={}", + soid, delta_stats.num_objects, delta_stats.num_object_clones, + delta_stats.num_bytes, + info.stats.stats.sum.num_objects + delta_stats.num_objects, + info.stats.stats.sum.num_object_clones + delta_stats.num_object_clones, + info.stats.stats.sum.num_bytes + delta_stats.num_bytes) + << dendl; info.stats.stats.add(delta_stats); info.stats.stats.floor(0); diff --git a/src/osd/scrubber/PrimaryLogScrub.cc b/src/osd/scrubber/PrimaryLogScrub.cc index 493bca8e25e..833d5a2fa61 100644 --- a/src/osd/scrubber/PrimaryLogScrub.cc +++ b/src/osd/scrubber/PrimaryLogScrub.cc @@ -109,6 +109,12 @@ void PrimaryLogScrub::submit_digest_fixes(const digests_fixes_t& fixes) void PrimaryLogScrub::add_to_stats(const object_stat_sum_t& stat) { + dout(20) << fmt::format( + "{} objs+{} clns+{} -> scrub_objs={} scrub_clns={}", + __func__, stat.num_objects, stat.num_object_clones, + m_scrub_cstat.sum.num_objects + stat.num_objects, + m_scrub_cstat.sum.num_object_clones + stat.num_object_clones) + << dendl; m_scrub_cstat.add(stat); } @@ -177,7 +183,11 @@ void PrimaryLogScrub::_scrub_finish() m_scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts || m_scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) { - m_osds->clog->error() << info.pgid << " " << m_mode_desc + if (m_shallow_errors || m_deep_errors) { + // Object-level errors were already found: this stat mismatch may + // reflect real data damage. Report it as an error. + m_osds->clog->error() + << info.pgid << " " << m_mode_desc << " : stat mismatch, got " << m_scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, " @@ -208,7 +218,36 @@ void PrimaryLogScrub::_scrub_finish() if (m_is_repair) { ++m_fixed_count; - m_pl_pg->recovery_state.update_stats([this](auto& history, auto& stats) { + m_pl_pg->recovery_state.update_stats( + [this](auto& history, auto& stats) { + stats.stats = m_scrub_cstat; + stats.dirty_stats_invalid = false; + stats.omap_stats_invalid = false; + stats.hitset_stats_invalid = false; + stats.hitset_bytes_stats_invalid = false; + stats.pin_stats_invalid = false; + stats.manifest_stats_invalid = false; + return false; + }); + m_pl_pg->publish_stats_to_osd(); + m_pl_pg->recovery_state.share_pg_info(); + } + } else { + // No object-level inconsistencies: this is a bookkeeping error in + // the PG stats. Fix it silently. + dout(1) << fmt::format( + "{} {} : fixing stat mismatch (no object errors)," + " got {}/{} objects, {}/{} clones, {}/{} bytes", + info.pgid, m_mode_desc, + m_scrub_cstat.sum.num_objects, + info.stats.stats.sum.num_objects, + m_scrub_cstat.sum.num_object_clones, + info.stats.stats.sum.num_object_clones, + m_scrub_cstat.sum.num_bytes, + info.stats.stats.sum.num_bytes) + << dendl; + m_pl_pg->recovery_state.update_stats( + [this](auto& history, auto& stats) { stats.stats = m_scrub_cstat; stats.dirty_stats_invalid = false; stats.omap_stats_invalid = false; @@ -247,7 +286,11 @@ void PrimaryLogScrub::stats_of_handled_objects( if (is_primary() && is_scrub_active()) { if (soid < m_start) { - dout(20) << fmt::format("{} {} < [{},{})", __func__, soid, m_start, m_end) + dout(20) << fmt::format( + "{} {} < [{},{}) d.objs={} d.clns={} -> scrub_objs={}", + __func__, soid, m_start, m_end, + delta_stats.num_objects, delta_stats.num_object_clones, + m_scrub_cstat.sum.num_objects + delta_stats.num_objects) << dendl; m_scrub_cstat.add(delta_stats); -- 2.47.3