From 06e22e7f39584ea7b50b7ca2cab4f502815fa9cf Mon Sep 17 00:00:00 2001
From: Kamoltat
Date: Thu, 26 Oct 2023 19:08:37 +0000
Subject: [PATCH] src/mon/PGMap.cc: init pool_availability

Added PoolAvailability Struct

Modified MgrStatMonitor to include a k,v map: `pool_availability`.
The key being the `poolid` and value is `PoolAvailability`

Init the function: `PGMap::get_unavailable_pg_in_pool_map()`
to identify and aggregate all the PGs we mark as `unavailable`
as well as the pool that associates with the unavailable PG.
The result is kept in `pool_pg_unavailable_map`, which is part of
the PGMapDigest encoding.

Also, included `pool_pg_unavailable_map` to `PGMapDigest::dump()`
and added the `osd pool availability-status` command.

Fixes: https://tracker.ceph.com/issues/67777

Signed-off-by: Kamoltat
---
Reviewer notes (not part of the commit message):

 * calc_pool_availability(): pools were erased from pool_availability
   inside a range-for over that same map, and the pool id was logged
   through a reference into the erased node.  The loop now advances an
   iterator explicitly and logs before erasing.
 * calc_pool_availability(): on the unavailable -> available transition
   the elapsed outage was added to uptime; it is now added to downtime.
 * calc_pool_availability(): the final swap() left pool_availability
   holding the previous pending map; the pending map is now a copy.
 * osd pool availability-status: the map is no longer copied, MTBF/MTTR
   are divided in floating point, and the score is 0 (not 1) for a pool
   that has downtime but no uptime.
 * NOTE(review): uptime/downtime are uint64_t, so the fractional part of
   every sampled interval is dropped.  Not changed here because it would
   alter the PoolAvailability encoding.
 * NOTE(review): the copy of this patch that reached review had its line
   breaks collapsed and every angle-bracket span removed.  Indentation,
   blank context lines and template arguments below are reconstructed;
   the two bare "#include" context lines in the first PGMap.h hunk could
   not be recovered and must be restored from the tree before applying.
   The "From:" and "Signed-off-by:" addresses were lost the same way.

 src/mon/MgrStatMonitor.cc           | 98 +++++++++++++++++++++++++++++
 src/mon/MgrStatMonitor.h            | 10 +++
 src/mon/OSDMonitor.cc               | 35 ++++++++++
 src/mon/PGMap.cc                    | 66 ++++++++++++++++++-
 src/mon/PGMap.h                     |  6 ++
 src/mon/mon_types.h                 | 79 +++++++++++++++++++++++
 src/tools/ceph-dencoder/osd_types.h |  1 +
 7 files changed, 293 insertions(+), 2 deletions(-)

diff --git a/src/mon/MgrStatMonitor.cc b/src/mon/MgrStatMonitor.cc
index 9da4c50da084c..e568a491c4383 100644
--- a/src/mon/MgrStatMonitor.cc
+++ b/src/mon/MgrStatMonitor.cc
@@ -66,6 +66,89 @@ void MgrStatMonitor::create_initial()
   encode(service_map, pending_service_map_bl, CEPH_FEATURES_ALL);
 }
 
+/**
+ * Refresh the per-pool availability bookkeeping from the current digest.
+ *
+ * Pools present in digest.pool_pg_unavailable_map start being tracked, and
+ * tracked pools that are gone from the digest or from the osdmap are
+ * dropped.  For each remaining pool the time since the previous sample is
+ * added to uptime or downtime, depending on the state the pool was in
+ * during that interval, and num_failures is bumped on every
+ * available -> unavailable transition.  The updated map is then staged in
+ * pending_pool_availability.
+ */
+void MgrStatMonitor::calc_pool_availability()
+{
+  dout(20) << __func__ << dendl;
+  for (const auto& i : digest.pool_pg_unavailable_map) {
+    // try_emplace() leaves pools that are already tracked untouched
+    if (pool_availability.try_emplace(i.first).second) {
+      dout(20) << __func__ << ": Adding pool: " << i.first << dendl;
+    }
+  }
+  const utime_t now(ceph_clock_now());
+  const auto& osdmap = mon.osdmon()->osdmap;
+  // Step the iterator by hand: erasing inside a range-for would invalidate
+  // the iterator the loop is standing on.
+  for (auto it = pool_availability.begin(); it != pool_availability.end(); ) {
+    const auto poolid = it->first;
+    const auto unavail = digest.pool_pg_unavailable_map.find(poolid);
+    if (unavail == digest.pool_pg_unavailable_map.end()) {
+      // delete pool that is no longer reported in the digest
+      dout(20) << __func__ << ": Deleting pool: " << poolid << dendl;
+      it = pool_availability.erase(it);
+      continue;
+    }
+    if (!osdmap.have_pg_pool(poolid)) {
+      dout(20) << __func__ << ": pool: "
+               << poolid << " no longer exists in osdmap! Deleting pool: "
+               << poolid << dendl;
+      it = pool_availability.erase(it);
+      continue;
+    }
+    PoolAvailability& avail = it->second;
+    avail.pool_name = osdmap.get_pool_name(poolid);
+    const bool has_unavailable_pgs = !unavail->second.empty();
+    if (avail.is_avail) {
+      // the interval that just ended was spent available
+      avail.uptime += now - avail.last_uptime;
+      if (has_unavailable_pgs) {
+        // avail to unavail
+        dout(20) << __func__
+                 << ": Pool " << poolid << " status: Available to Unavailable" << dendl;
+        avail.is_avail = false;
+        avail.num_failures += 1;
+        avail.last_downtime = now;
+      } else {
+        // avail to avail
+        dout(20) << __func__
+                 << ": Pool " << poolid << " status: Available to Available" << dendl;
+        avail.last_uptime = now;
+      }
+    } else {
+      // the interval that just ended was spent unavailable, so it counts
+      // as downtime even when the pool has just recovered
+      avail.downtime += now - avail.last_downtime;
+      if (has_unavailable_pgs) {
+        // unavail to unavail
+        dout(20) << __func__
+                 << ": Pool " << poolid << " status: Unavailable to Unavailable" << dendl;
+        avail.last_downtime = now;
+      } else {
+        // unavail to avail
+        dout(20) << __func__
+                 << ": Pool " << poolid << " status: Unavailable to Available" << dendl;
+        avail.is_avail = true;
+        avail.last_uptime = now;
+      }
+    }
+    ++it;
+  }
+  // Stage a copy; swapping would leave pool_availability holding the
+  // previous pending map, which get_pool_availability() would then serve.
+  pending_pool_availability = pool_availability;
+}
+
 void MgrStatMonitor::update_from_paxos(bool *need_bootstrap)
 {
   version = get_last_committed();
@@ -82,9 +165,13 @@ void MgrStatMonitor::update_from_paxos(bool *need_bootstrap)
     if (!p.end()) {
       decode(progress_events, p);
     }
+    if (!p.end()) {
+      decode(pool_availability, p);
+    }
     dout(10) << __func__ << " v" << version
              << " service_map e" << service_map.epoch
              << " " << progress_events.size() << " progress events"
+             << " " << pool_availability.size() << " pools availability tracked"
              << dendl;
   }
   catch (ceph::buffer::error& e) {
@@ -95,6 +182,7 @@ void MgrStatMonitor::update_from_paxos(bool *need_bootstrap)
   check_subs();
   update_logger();
   mon.osdmon()->notify_new_pg_digest();
+  calc_pool_availability();
 }
 
 void MgrStatMonitor::update_logger()
@@ -156,6 +244,7 @@ void MgrStatMonitor::encode_pending(MonitorDBStore::TransactionRef t)
   ceph_assert(pending_service_map_bl.length());
   bl.append(pending_service_map_bl);
   encode(pending_progress_events, bl);
+  encode(pending_pool_availability, bl);
   put_version(t, version, bl);
   put_last_committed(t, version);
 
@@ -260,6 +349,15 @@ bool MgrStatMonitor::prepare_report(MonOpRequestRef op)
   jf.close_section();
   jf.flush(*_dout);
   *_dout << dendl;
+  dout(20) << "pool_availability:\n";
+  JSONFormatter jf(true);
+  jf.open_object_section("pool_availability");
+  for (auto& i : pending_pool_availability) {
+    jf.dump_object(std::to_string(i.first), i.second);
+  }
+  jf.close_section();
+  jf.flush(*_dout);
+  *_dout << dendl;
   return true;
 }
 
diff --git a/src/mon/MgrStatMonitor.h b/src/mon/MgrStatMonitor.h
index 8f27a98781cf3..a50a4da083a2c 100644
--- a/src/mon/MgrStatMonitor.h
+++ b/src/mon/MgrStatMonitor.h
@@ -14,12 +14,15 @@ class MgrStatMonitor : public PaxosService {
   PGMapDigest digest;
   ServiceMap service_map;
   std::map<std::string,ProgressEvent> progress_events;
+  // per-pool availability bookkeeping, keyed by pool id
+  std::map<uint64_t, PoolAvailability> pool_availability;
 
   // pending commit
   PGMapDigest pending_digest;
   health_check_map_t pending_health_checks;
   std::map<std::string,ProgressEvent> pending_progress_events;
   ceph::buffer::list pending_service_map_bl;
+  std::map<uint64_t, PoolAvailability> pending_pool_availability;
 
 public:
   MgrStatMonitor(Monitor &mn, Paxos &p, const std::string& service_name);
@@ -49,6 +52,9 @@ public:
   bool preprocess_getpoolstats(MonOpRequestRef op);
   bool preprocess_statfs(MonOpRequestRef op);
 
+  // refresh pool_availability from the current digest
+  void calc_pool_availability();
+
   void check_sub(Subscription *sub);
   void check_subs();
   void send_digests();
@@ -83,6 +89,10 @@ public:
     return digest;
   }
 
+  const std::map<uint64_t, PoolAvailability>& get_pool_availability() const {
+    return pool_availability;
+  }
+
   ceph_statfs get_statfs(OSDMap& osdmap,
                          std::optional<int64_t> data_pool) const {
     return digest.get_statfs(osdmap, data_pool);
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index d9f190f41a9db..0a9107c88392e 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -111,6 +111,8 @@ using ceph::ErasureCodeProfile;
 using ceph::Formatter;
 using ceph::JSONFormatter;
 using ceph::make_message;
+using ceph::make_timespan;
+using ceph::timespan_str;
 using namespace std::literals;
 
 #define dout_subsys ceph_subsys_mon
@@ -14407,6 +14409,39 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
     return true;
+  } else if (prefix == "osd pool availability-status") {
+    // Report built from the state tracked by MgrStatMonitor.
+    TextTable tbl;
+    tbl.define_column("POOL", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("UPTIME", TextTable::LEFT, TextTable::RIGHT);
+    tbl.define_column("DOWNTIME", TextTable::LEFT, TextTable::RIGHT);
+    tbl.define_column("NUMFAILURES", TextTable::LEFT, TextTable::RIGHT);
+    tbl.define_column("MTBF", TextTable::LEFT, TextTable::RIGHT);
+    tbl.define_column("MTTR", TextTable::LEFT, TextTable::RIGHT);
+    tbl.define_column("SCORE", TextTable::LEFT, TextTable::RIGHT);
+    tbl.define_column("AVAILABLE", TextTable::LEFT, TextTable::RIGHT);
+    // bind by reference instead of copying the whole map
+    const auto& pool_availability = mon.mgrstatmon()->get_pool_availability();
+    for (const auto& i : pool_availability) {
+      const auto& p = i.second;
+      // uptime, downtime and num_failures are integers: divide as double
+      double mtbf = p.num_failures > 0 ?
+        static_cast<double>(p.uptime) / p.num_failures : 0;
+      double mttr = p.num_failures > 0 ?
+        static_cast<double>(p.downtime) / p.num_failures : 0;
+      // mtbf + mttr is 0 when no failure was recorded: report full score
+      double score = (mtbf + mttr) > 0 ? mtbf / (mtbf + mttr) : 1.0;
+      tbl << p.pool_name;
+      tbl << timespan_str(make_timespan(p.uptime));
+      tbl << timespan_str(make_timespan(p.downtime));
+      tbl << p.num_failures;
+      tbl << timespan_str(make_timespan(mtbf));
+      tbl << timespan_str(make_timespan(mttr));
+      tbl << score;
+      tbl << p.is_avail;
+      tbl << TextTable::endrow;
+    }
+    rdata.append(stringify(tbl));
   } else if (prefix == "osd force-create-pg") {
     pg_t pgid;
     string pgidstr;
diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc
index 32dfcd0e3ef77..0c81b2a9ff484 100644
--- a/src/mon/PGMap.cc
+++ b/src/mon/PGMap.cc
@@ -56,7 +56,7 @@ MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap::Incremental, pgmap_inc, pgmap);
 void PGMapDigest::encode(bufferlist& bl, uint64_t features) const
 {
   // NOTE: see PGMap::encode_digest
-  uint8_t v = 4;
+  uint8_t v = 5;
   assert(HAVE_FEATURE(features, SERVER_NAUTILUS));
   ENCODE_START(v, 1, bl);
   encode(num_pg, bl);
@@ -77,12 +77,13 @@ void PGMapDigest::encode(bufferlist& bl, uint64_t features) const
   encode(avail_space_by_rule, bl);
   encode(purged_snaps, bl);
   encode(osd_sum_by_class, bl, features);
+  encode(pool_pg_unavailable_map, bl);
   ENCODE_FINISH(bl);
 }
 
 void PGMapDigest::decode(bufferlist::const_iterator& p)
 {
-  DECODE_START(4, p);
+  DECODE_START(5, p);
   assert(struct_v >= 4);
   decode(num_pg, p);
   decode(num_pg_active, p);
@@ -102,6 +103,9 @@ void PGMapDigest::decode(bufferlist::const_iterator& p)
   decode(avail_space_by_rule, p);
   decode(purged_snaps, p);
   decode(osd_sum_by_class, p);
+  if (struct_v >= 5) {
+    decode(pool_pg_unavailable_map, p);
+  }
   DECODE_FINISH(p);
 }
 
@@ -151,6 +155,18 @@ void PGMapDigest::dump(ceph::Formatter *f) const
     f->close_section();
   }
   f->close_section();
+  f->open_array_section("pool_pg_unavailable_map");
+  for (auto& p : pool_pg_unavailable_map) {
+    f->open_object_section("pool_pg_unavailable_map");
+    f->dump_string("poolid", std::to_string(p.first));
+    f->open_array_section("pgs");
+    for (const auto& pg : p.second) {
+      f->dump_stream("pg") << pg;
+    }
+    f->close_section();
+    f->close_section();
+  }
+  f->close_section();
   f->open_array_section("num_pg_by_osd");
   for (auto& p : num_pg_by_osd) {
     f->open_object_section("count");
@@ -1261,6 +1277,51 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
     last_pg_scan = inc.pg_scan;
 }
 
+/*
+  Rebuilds pool_pg_unavailable_map with one entry per pool that has PGs in
+  pg_stat. Each value lists the PGs of that pool that have been in any of
+  the following states for longer than mon_pg_stuck_threshold:
+   - non-active
+   - stale
+
+  Eg: {1=[1.0],2=[],3=[]}
+  Here the cluster has 3 pools with id 1,2,3 and pool 1 has an inactive PG 1.0
+
+  The osdmap argument is currently unused.
+*/
+void PGMap::get_unavailable_pg_in_pool_map(const OSDMap& osdmap)
+{
+  dout(20) << __func__ << dendl;
+  pool_pg_unavailable_map.clear();
+  const utime_t now(ceph_clock_now());
+  const utime_t cutoff =
+    now - utime_t(g_conf().get_val<int64_t>("mon_pg_stuck_threshold"), 0);
+  for (const auto& i : pg_stat) {
+    const pg_t& pgid = i.first;
+    const pg_stat_t& st = i.second;
+    // operator[] creates the (possibly empty) entry for this PG's pool
+    auto& unavailable_pgs = pool_pg_unavailable_map[pgid.pool()];
+    utime_t val = cutoff;
+
+    // this case also covers the unknown state, whose state bits are 0
+    if (!(st.state & PG_STATE_ACTIVE)) {
+      if (st.last_active < val)
+        val = st.last_active;
+    }
+
+    if (st.state & PG_STATE_STALE) {
+      if (st.last_unstale < val)
+        val = st.last_unstale;
+    }
+
+    if (val < cutoff) {
+      unavailable_pgs.push_back(pgid);
+      dout(20) << "pool: " << pgid.pool() << " pg: " << pgid
+               << " is stuck unavailable" << " state: " << st.state << dendl;
+    }
+  }
+}
+
 void PGMap::calc_stats()
 {
   num_pg = 0;
@@ -1488,6 +1549,7 @@ void PGMap::encode_digest(const OSDMap& osdmap,
   get_rules_avail(osdmap, &avail_space_by_rule);
   calc_osd_sum_by_class(osdmap);
   calc_purged_snaps();
+  get_unavailable_pg_in_pool_map(osdmap);
   PGMapDigest::encode(bl, features);
 }
 
diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h
index 4b65ca0457660..7ed6c63c12b8a 100644
--- a/src/mon/PGMap.h
+++ b/src/mon/PGMap.h
@@ -27,6 +27,9 @@
 #include "common/Formatter.h"
 #include "osd/osd_types.h"
 #include "include/mempool.h"
+#include "mon/health_check.h"
+#include <sstream>
+#include "mon/mon_types.h"
 #include
 #include
 
@@ -57,6 +60,8 @@ public:
   osd_stat_t osd_sum;
   mempool::pgmap::map<std::string,osd_stat_t> osd_sum_by_class;
   mempool::pgmap::unordered_map<uint64_t,int32_t> num_pg_by_state;
+  // per pool id: the PGs currently considered unavailable
+  mempool::pgmap::map<uint64_t, std::vector<pg_t>> pool_pg_unavailable_map;
   struct pg_count {
     int32_t acting = 0;
     int32_t up_not_acting = 0;
@@ -440,6 +445,7 @@ public:
 
   void apply_incremental(CephContext *cct, const Incremental& inc);
   void calc_stats();
+  void get_unavailable_pg_in_pool_map(const OSDMap& osdmap);
   void stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
                    bool sameosds=false);
   bool stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
diff --git a/src/mon/mon_types.h b/src/mon/mon_types.h
index 12cd3b450c41b..87ad69be92cd1 100644
--- a/src/mon/mon_types.h
+++ b/src/mon/mon_types.h
@@ -28,6 +28,7 @@
 #include "common/bit_str.h"
 #include "common/ceph_releases.h"
 #include "msg/msg_types.h"	// for entity_addrvec_t
+#include "common/Clock.h"
 
 // use as paxos_service index
 enum {
@@ -736,4 +737,82 @@ struct ProgressEvent {
 };
 WRITE_CLASS_ENCODER(ProgressEvent)
 
+/**
+ * Availability bookkeeping for one pool, as maintained by
+ * MgrStatMonitor::calc_pool_availability().
+ */
+struct PoolAvailability {
+  std::string pool_name = "";
+  utime_t started_at = ceph_clock_now();
+  // accumulated time the pool was sampled as available
+  uint64_t uptime = 0;
+  // time of the most recent sample that found the pool available
+  utime_t last_uptime = ceph_clock_now();
+  // accumulated time the pool was sampled as unavailable
+  uint64_t downtime = 0;
+  // time of the most recent sample that found the pool unavailable
+  utime_t last_downtime = ceph_clock_now();
+  // number of available -> unavailable transitions
+  uint64_t num_failures = 0;
+  bool is_avail = true;
+
+  PoolAvailability() {}
+
+  void dump(ceph::Formatter *f) const {
+    ceph_assert(f != nullptr);
+    f->dump_string("pool_name", pool_name);
+    f->dump_stream("started_at") << started_at;
+    f->dump_unsigned("uptime", uptime);
+    f->dump_stream("last_uptime") << last_uptime;
+    f->dump_unsigned("downtime", downtime);
+    f->dump_stream("last_downtime") << last_downtime;
+    f->dump_unsigned("num_failures", num_failures);
+    f->dump_bool("is_avail", is_avail);
+  }
+
+  void encode(ceph::buffer::list &bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(pool_name, bl);
+    encode(started_at, bl);
+    encode(uptime, bl);
+    encode(last_uptime, bl);
+    encode(downtime, bl);
+    encode(last_downtime, bl);
+    encode(num_failures, bl);
+    encode(is_avail, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(ceph::buffer::list::const_iterator &p) {
+    DECODE_START(1, p);
+    decode(pool_name, p);
+    decode(started_at, p);
+    decode(uptime, p);
+    decode(last_uptime, p);
+    decode(downtime, p);
+    decode(last_downtime, p);
+    decode(num_failures, p);
+    decode(is_avail, p);
+    DECODE_FINISH(p);
+  }
+
+  // every timestamp is pinned so the instances do not depend on the clock
+  static void generate_test_instances(std::list<PoolAvailability*>& o) {
+    o.push_back(new PoolAvailability);
+    o.back()->started_at = utime_t(123, 456);
+    o.back()->last_uptime = utime_t(123, 456);
+    o.back()->last_downtime = utime_t(123, 456);
+    o.push_back(new PoolAvailability);
+    o.back()->pool_name = "foo";
+    o.back()->started_at = utime_t(123, 456);
+    o.back()->uptime = 100;
+    o.back()->last_uptime = utime_t(123, 456);
+    o.back()->downtime = 15;
+    o.back()->last_downtime = utime_t(123, 456);
+    o.back()->num_failures = 2;
+    o.back()->is_avail = true;
+  }
+};
+WRITE_CLASS_ENCODER(PoolAvailability)
+
 #endif
diff --git a/src/tools/ceph-dencoder/osd_types.h b/src/tools/ceph-dencoder/osd_types.h
index 01141f115296a..c49173a90b6d6 100644
--- a/src/tools/ceph-dencoder/osd_types.h
+++ b/src/tools/ceph-dencoder/osd_types.h
@@ -170,6 +170,7 @@ TYPE(mon_feature_t)
 TYPE_FEATUREFUL(DataStats)
 TYPE_FEATUREFUL(ProgressEvent)
 TYPE(FeatureMap)
+TYPE(PoolAvailability)
 
 #include "mon/CreatingPGs.h"
 TYPE_FEATUREFUL(creating_pgs_t)
-- 
2.39.5