From b8b70217760594e99f6a2c66b1c903e77af4a3fd Mon Sep 17 00:00:00 2001 From: Nitzan Mordechai Date: Wed, 21 May 2025 11:41:01 +0000 Subject: [PATCH] src/mon/MgrStatMonitor: fix invalid iterator increment in calc_pool_availability() MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Erasing entries from `pool_availability` inside a range-for loop invalidated the hidden iterator, triggering an “Invalid read” under Valgrind. - Use `std::erase_if(pool_availability, predicate)` for atomic removal. - Refactor the stats-update loop to use structured bindings and a clear `++it` for readability. Fixes: https://tracker.ceph.com/issues/71271 Signed-off-by: Nitzan Mordechai (cherry picked from commit 7369a4dded210d9410fb00e259d95df013532cc1) --- src/mon/MgrStatMonitor.cc | 140 +++++++++++++++++++------------------- 1 file changed, 69 insertions(+), 71 deletions(-) diff --git a/src/mon/MgrStatMonitor.cc b/src/mon/MgrStatMonitor.cc index 224651a2eb6ee..b2d4c20564475 100644 --- a/src/mon/MgrStatMonitor.cc +++ b/src/mon/MgrStatMonitor.cc @@ -113,88 +113,86 @@ void MgrStatMonitor::calc_pool_availability() return; } - // if reset_availability_last_uptime_downtime_val is not utime_t(1, 2), - // update last_uptime and last_downtime for all pools to the - // recorded values - if (reset_availability_last_uptime_downtime_val.has_value()) { - for (const auto& i : pool_availability) { - const auto& poolid = i.first; - pool_availability[poolid].last_downtime = reset_availability_last_uptime_downtime_val.value(); - pool_availability[poolid].last_uptime = reset_availability_last_uptime_downtime_val.value(); + // Add new pools from digest + for ([[maybe_unused]] const auto& [poolid, _] : digest.pool_pg_unavailable_map) { + if (!pool_availability.contains(poolid)) { + pool_availability.emplace(poolid, PoolAvailability{}); + dout(20) << fmt::format("{}: Adding pool: {}", + __func__, poolid) << dendl; } - dout(20) << __func__ << " reset last_uptime and last_downtime to " - << reset_availability_last_uptime_downtime_val << dendl; - reset_availability_last_uptime_downtime_val.reset(); } - auto pool_avail_end = pool_availability.end(); - for (const auto& i : digest.pool_pg_unavailable_map) { - const auto& poolid = i.first; - if (pool_availability.find(poolid) == pool_avail_end){ - // New Pool so we add. - pool_availability.insert({poolid, PoolAvailability()}); - dout(20) << __func__ << "Adding pool: " << poolid << dendl; + // Remove unavailable pools + // A pool is removed if it meets either of the following criteria: + // 1. It is not present in the digest's pool_pg_unavailable_map. + // 2. It is not present in the osdmap (checked via mon.osdmon()->osdmap). + std::erase_if(pool_availability, [&](const auto& kv) { + const auto& poolid = kv.first; + + if (!digest.pool_pg_unavailable_map.contains(poolid)) { + dout(20) << fmt::format("{}: Deleting pool (not in digest): {}", + __func__, poolid) << dendl; + return true; } - } - utime_t now(ceph_clock_now()); - auto pool_unavail_end = digest.pool_pg_unavailable_map.end(); - for (const auto& i : pool_availability) { - const auto& poolid = i.first; - if (digest.pool_pg_unavailable_map.find(poolid) == - pool_unavail_end) { - // delete none exist pool - pool_availability.erase(poolid); - dout(20) << __func__ << "Deleting pool: " << poolid << dendl; - continue; - } - if (mon.osdmon()->osdmap.have_pg_pool(poolid)){ - // Currently, couldn't find an elegant way to get pool name - pool_availability[poolid].pool_name = mon.osdmon()->osdmap.get_pool_name(poolid); - } else { - pool_availability.erase(poolid); - dout(20) << __func__ << "pool: " - << poolid << " no longer exists in osdmap! Deleting pool: " - << poolid << dendl; - continue; + if (!mon.osdmon()->osdmap.have_pg_pool(poolid)) { + dout(20) << fmt::format("{}: Deleting pool (not in osdmap): {}", + __func__, poolid) << dendl; + return true; } - if (pool_availability[poolid].is_avail) { - if (!digest.pool_pg_unavailable_map[poolid].empty()) { - // avail to unavail - dout(20) << __func__ - << ": Pool " << poolid << " status: Available to Unavailable" << dendl; - pool_availability[poolid].is_avail = false; - pool_availability[poolid].num_failures += 1; - pool_availability[poolid].last_downtime = now; - pool_availability[poolid].uptime += - now - pool_availability[poolid].last_uptime; + return false; + }); + + // Update pool availability + utime_t now(ceph_clock_now()); + for (auto& [poolid, avail] : pool_availability) { + avail.pool_name = mon.osdmon()->osdmap.get_pool_name(poolid); + + auto it = digest.pool_pg_unavailable_map.find(poolid); + const bool currently_avail = (it != digest.pool_pg_unavailable_map.end()) && + it->second.empty(); + + if (avail.is_avail) { + if (!currently_avail) { // Available → Unavailable + dout(20) << fmt::format("{}: Pool {} status: Available to Unavailable", + __func__, poolid) << dendl; + avail.is_avail = false; + ++avail.num_failures; + avail.last_downtime = now; + avail.uptime += now - avail.last_uptime; } else { - // avail to avail - dout(20) << __func__ - << ": Pool " << poolid << " status: Available to Available" << dendl; - pool_availability[poolid].uptime += - now - pool_availability[poolid].last_uptime; - pool_availability[poolid].last_uptime = now; + // Available to Available + dout(20) << fmt::format("{}: Pool {} status: Available to Available", + __func__, poolid) << dendl; + avail.uptime += now - avail.last_uptime; + avail.last_uptime = now; } - } else { - if (!digest.pool_pg_unavailable_map[poolid].empty()) { - // unavail to unavail - dout(20) << __func__ - << ": Pool " << poolid << " status: Unavailable to Unavailable" << dendl; - pool_availability[poolid].downtime += - now - pool_availability[poolid].last_downtime; - pool_availability[poolid].last_downtime = now; - } else { - // unavail to avail - dout(20) << __func__ - << ": Pool " << poolid << " status: Unavailable to Available" << dendl; - pool_availability[poolid].is_avail = true; - pool_availability[poolid].last_uptime = now; - pool_availability[poolid].uptime += - now - pool_availability[poolid].last_downtime; + } else { // Unavailable + if (currently_avail) { // Unavailable to Available + dout(20) << fmt::format("{}: Pool {} status: Unavailable to Available", + __func__, poolid) << dendl; + avail.is_avail = true; + avail.last_uptime = now; + avail.uptime += now - avail.last_downtime; + } else { // Unavailable to Unavailable + dout(20) << fmt::format("{}: Pool {} status: Unavailable to Unavailable", + __func__, poolid) << dendl; + avail.downtime += now - avail.last_downtime; + avail.last_downtime = now; } } + + // if reset_availability_last_uptime_downtime_val is not utime_t(1, 2), + // update last_uptime and last_downtime for all pools to the + // recorded values + if (reset_availability_last_uptime_downtime_val.has_value()) { + dout(20) << fmt::format("{}: Pool {} reset last_uptime and last_downtime to {}", + __func__, poolid, reset_availability_last_uptime_downtime_val.value()) << dendl; + avail.last_downtime = reset_availability_last_uptime_downtime_val.value(); + avail.last_uptime = reset_availability_last_uptime_downtime_val.value(); + } } pending_pool_availability.swap(pool_availability); + reset_availability_last_uptime_downtime_val.reset(); } void MgrStatMonitor::update_from_paxos(bool *need_bootstrap) -- 2.39.5