]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
src/mon/PGMap.cc: init pool_availability
author: Kamoltat <ksirivad@redhat.com>
Thu, 26 Oct 2023 19:08:37 +0000 (19:08 +0000)
committer: Shraddha Agrawal <shraddha.agrawal000@gmail.com>
Thu, 24 Apr 2025 14:10:24 +0000 (19:40 +0530)
Added PoolAvailability Struct

Modified PGMap.cc to include a k,v map:
`pool_availability`.

The key being the `poolid` and value
is `PoolAvailability`

Introduce the function
`PGMap::get_unavailable_pg_in_pool_map()`
to identify and aggregate all the PGs we
mark as `unavailable`, together with the
pool each unavailable PG belongs to.

Also, included `pool_availability`
to `PGMapDigest::dump()`.

Fixes: https://tracker.ceph.com/issues/67777
Signed-off-by: Kamoltat <ksirivad@redhat.com>
src/mon/MgrStatMonitor.cc
src/mon/MgrStatMonitor.h
src/mon/OSDMonitor.cc
src/mon/PGMap.cc
src/mon/PGMap.h
src/mon/mon_types.h
src/tools/ceph-dencoder/osd_types.h

index 9da4c50da084cb15edaaec042ddba7ae09d9aa6c..e568a491c438367db006bda554245121de26a16b 100644 (file)
@@ -66,6 +66,79 @@ void MgrStatMonitor::create_initial()
   encode(service_map, pending_service_map_bl, CEPH_FEATURES_ALL);
 }
 
+void MgrStatMonitor::calc_pool_availability()
+{
+  dout(20) << __func__ << dendl;
+  auto pool_avail_end = pool_availability.end();
+  for (const auto& i : digest.pool_pg_unavailable_map) {
+    const auto& poolid = i.first;
+    if (pool_availability.find(poolid) == pool_avail_end){
+      // New Pool so we add.
+      pool_availability.insert({poolid, PoolAvailability()});
+      dout(20) << __func__ << "Adding pool: " << poolid << dendl;
+    }
+  }
+  utime_t now(ceph_clock_now());
+  auto pool_unavail_end = digest.pool_pg_unavailable_map.end();
+  for (const auto& i : pool_availability) {
+    const auto& poolid = i.first;
+    if (digest.pool_pg_unavailable_map.find(poolid) ==
+      pool_unavail_end) {
+      // delete none exist pool
+      pool_availability.erase(poolid);
+      dout(20) << __func__ << "Deleting pool: " << poolid << dendl;
+      continue;
+    }
+    if (mon.osdmon()->osdmap.have_pg_pool(poolid)){
+      // Currently, couldn't find an elegant way to get pool name
+      pool_availability[poolid].pool_name = mon.osdmon()->osdmap.get_pool_name(poolid);
+    } else {
+      pool_availability.erase(poolid);
+      dout(20) << __func__ << "pool: " 
+              << poolid << " no longer exists in osdmap! Deleting pool: " 
+         << poolid << dendl;
+      continue;
+    }
+    if (pool_availability[poolid].is_avail) {
+      if (!digest.pool_pg_unavailable_map[poolid].empty()) {
+        // avail to unavail
+        dout(20) << __func__ 
+                << ": Pool " << poolid << " status: Available to Unavailable" << dendl;
+        pool_availability[poolid].is_avail = false;
+        pool_availability[poolid].num_failures += 1;
+        pool_availability[poolid].last_downtime = now;
+        pool_availability[poolid].uptime +=
+          now - pool_availability[poolid].last_uptime;
+      } else {
+        // avail to avail
+        dout(20) << __func__ 
+                << ": Pool " << poolid << " status: Available to Available" << dendl;
+        pool_availability[poolid].uptime +=
+          now - pool_availability[poolid].last_uptime;
+        pool_availability[poolid].last_uptime = now;
+      }
+    } else {
+      if (!digest.pool_pg_unavailable_map[poolid].empty()) {
+        // unavail to unavail
+        dout(20) << __func__ 
+                << ": Pool " << poolid << " status: Unavailable to Unavailable" << dendl;
+        pool_availability[poolid].downtime +=
+          now - pool_availability[poolid].last_downtime;
+        pool_availability[poolid].last_downtime = now;
+      } else {
+        // unavail to avail
+        dout(20) << __func__ 
+                << ": Pool " << poolid << " status: Unavailable to Available" << dendl;
+        pool_availability[poolid].is_avail = true;
+        pool_availability[poolid].last_uptime = now;
+        pool_availability[poolid].uptime +=
+          now - pool_availability[poolid].last_downtime;
+      }
+    }
+  }
+  pending_pool_availability.swap(pool_availability);
+}
+
 void MgrStatMonitor::update_from_paxos(bool *need_bootstrap)
 {
   version = get_last_committed();
@@ -82,9 +155,13 @@ void MgrStatMonitor::update_from_paxos(bool *need_bootstrap)
       if (!p.end()) {
        decode(progress_events, p);
       }
+      if (!p.end()) {
+        decode(pool_availability, p);
+      }
       dout(10) << __func__ << " v" << version
               << " service_map e" << service_map.epoch
               << " " << progress_events.size() << " progress events"
+         << " " << pool_availability.size() << " pools availability tracked"
               << dendl;
     }
     catch (ceph::buffer::error& e) {
@@ -95,6 +172,7 @@ void MgrStatMonitor::update_from_paxos(bool *need_bootstrap)
   check_subs();
   update_logger();
   mon.osdmon()->notify_new_pg_digest();
+  calc_pool_availability();
 }
 
 void MgrStatMonitor::update_logger()
@@ -156,6 +234,7 @@ void MgrStatMonitor::encode_pending(MonitorDBStore::TransactionRef t)
   ceph_assert(pending_service_map_bl.length());
   bl.append(pending_service_map_bl);
   encode(pending_progress_events, bl);
+  encode(pending_pool_availability, bl);
   put_version(t, version, bl);
   put_last_committed(t, version);
 
@@ -260,6 +339,15 @@ bool MgrStatMonitor::prepare_report(MonOpRequestRef op)
   jf.close_section();
   jf.flush(*_dout);
   *_dout << dendl;
+  // Dump the staged per-pool availability state at debug level 20.
+  // NOTE: the formatter is named pa_jf because the `jf` declared above
+  // is still in scope here; reusing the name would shadow it (the dout
+  // macro opens its own block) and confuse readers.
+  dout(20) << "pool_availability:\n";
+  JSONFormatter pa_jf(true);
+  pa_jf.open_object_section("pool_availability");
+  for (const auto& i : pending_pool_availability) {
+    pa_jf.dump_object(std::to_string(i.first), i.second);
+  }
+  pa_jf.close_section();
+  pa_jf.flush(*_dout);
+  *_dout << dendl;
   return true;
 }
 
index 8f27a98781cf3e3df8fc0e44d71f5a033a96e087..a50a4da083a2ca7dbe0fb7974a3b0e3e1621ced2 100644 (file)
@@ -14,12 +14,14 @@ class MgrStatMonitor : public PaxosService {
   PGMapDigest digest;
   ServiceMap service_map;
   std::map<std::string,ProgressEvent> progress_events;
+  std::map<uint64_t, PoolAvailability> pool_availability;
 
   // pending commit
   PGMapDigest pending_digest;
   health_check_map_t pending_health_checks;
   std::map<std::string,ProgressEvent> pending_progress_events;
   ceph::buffer::list pending_service_map_bl;
+  std::map<uint64_t, PoolAvailability> pending_pool_availability;
 
 public:
   MgrStatMonitor(Monitor &mn, Paxos &p, const std::string& service_name);
@@ -49,6 +51,8 @@ public:
   bool preprocess_getpoolstats(MonOpRequestRef op);
   bool preprocess_statfs(MonOpRequestRef op);
 
+  void calc_pool_availability();
+
   void check_sub(Subscription *sub);
   void check_subs();
   void send_digests();
@@ -83,6 +87,10 @@ public:
     return digest;
   }
 
+  const std::map<uint64_t, PoolAvailability>& get_pool_availability() {
+    return pool_availability;
+  }
+
   ceph_statfs get_statfs(OSDMap& osdmap,
                         std::optional<int64_t> data_pool) const {
     return digest.get_statfs(osdmap, data_pool);
index d9f190f41a9dbf7baacfca796f809d291cfe2c5a..0a9107c88392e0bf3654ad1a9c2351677cf571cb 100644 (file)
@@ -111,6 +111,8 @@ using ceph::ErasureCodeProfile;
 using ceph::Formatter;
 using ceph::JSONFormatter;
 using ceph::make_message;
+using ceph::make_timespan;
+using ceph::timespan_str;
 using namespace std::literals;
 
 #define dout_subsys ceph_subsys_mon
@@ -14407,6 +14409,33 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
                                                   get_last_committed() + 1));
     return true;
+  } else if (prefix == "osd pool availability-status") {
+    TextTable tbl;
+    tbl.define_column("POOL", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("UPTIME", TextTable::LEFT, TextTable::RIGHT);
+    tbl.define_column("DOWNTIME", TextTable::LEFT, TextTable::RIGHT);
+    tbl.define_column("NUMFAILURES", TextTable::LEFT, TextTable::RIGHT);
+    tbl.define_column("MTBF", TextTable::LEFT, TextTable::RIGHT);
+    tbl.define_column("MTTR", TextTable::LEFT, TextTable::RIGHT);
+    tbl.define_column("SCORE", TextTable::LEFT, TextTable::RIGHT);
+    tbl.define_column("AVAILABLE", TextTable::LEFT, TextTable::RIGHT);
+    std::map<uint64_t, PoolAvailability> pool_availability = mon.mgrstatmon()->get_pool_availability();
+    for (const auto& i : pool_availability) {
+      const auto& p = i.second;
+      double mtbf = p.num_failures > 0 ? (p.uptime / p.num_failures) : 0;
+      double mttr = p.num_failures > 0 ? (p.downtime / p.num_failures) : 0;
+      double score = mtbf > 0 ? mtbf / (mtbf +  mttr): 1.0;
+      tbl << p.pool_name;
+      tbl << timespan_str(make_timespan(p.uptime));
+      tbl << timespan_str(make_timespan(p.downtime));
+      tbl << p.num_failures;
+      tbl << timespan_str(make_timespan(mtbf));
+      tbl << timespan_str(make_timespan(mttr));
+      tbl << score;
+      tbl << p.is_avail;
+      tbl << TextTable::endrow;
+    }
+    rdata.append(stringify(tbl));
   } else if (prefix == "osd force-create-pg") {
     pg_t pgid;
     string pgidstr;
index 32dfcd0e3ef7790adad8638d5fab1ae56a33c3c3..0c81b2a9ff484b83d140c5d1e3f8316dc586a5f8 100644 (file)
@@ -56,7 +56,7 @@ MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap::Incremental, pgmap_inc, pgmap);
 void PGMapDigest::encode(bufferlist& bl, uint64_t features) const
 {
   // NOTE: see PGMap::encode_digest
-  uint8_t v = 4;
+  uint8_t v = 5;
   assert(HAVE_FEATURE(features, SERVER_NAUTILUS));
   ENCODE_START(v, 1, bl);
   encode(num_pg, bl);
@@ -77,12 +77,13 @@ void PGMapDigest::encode(bufferlist& bl, uint64_t features) const
   encode(avail_space_by_rule, bl);
   encode(purged_snaps, bl);
   encode(osd_sum_by_class, bl, features);
+  encode(pool_pg_unavailable_map, bl);
   ENCODE_FINISH(bl);
 }
 
 void PGMapDigest::decode(bufferlist::const_iterator& p)
 {
-  DECODE_START(4, p);
+  DECODE_START(5, p);
   assert(struct_v >= 4);
   decode(num_pg, p);
   decode(num_pg_active, p);
@@ -102,6 +103,9 @@ void PGMapDigest::decode(bufferlist::const_iterator& p)
   decode(avail_space_by_rule, p);
   decode(purged_snaps, p);
   decode(osd_sum_by_class, p);
+  if (struct_v >= 5) {
+    decode(pool_pg_unavailable_map, p);
+  }
   DECODE_FINISH(p);
 }
 
@@ -151,6 +155,18 @@ void PGMapDigest::dump(ceph::Formatter *f) const
     f->close_section();
   }
   f->close_section();
+  f->open_array_section("pool_pg_unavailable_map");
+  for (auto& p : pool_pg_unavailable_map) {
+    f->open_object_section("pool_pg_unavailable_map");
+    f->dump_string("poolid", std::to_string(p.first));
+    f->open_array_section("pgs");
+    for (const auto& pg : p.second) {
+      f->dump_stream("pg") << pg;
+    }
+    f->close_section();
+    f->close_section();
+  }
+  f->close_section();
   f->open_array_section("num_pg_by_osd");
   for (auto& p : num_pg_by_osd) {
     f->open_object_section("count");
@@ -1261,6 +1277,46 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
     last_pg_scan = inc.pg_scan;
 }
 
+/*
+  Returns a map of all pools in a cluster. Each value lists any PGs that 
+  are in any of the following states: 
+  - non-active 
+  - stale 
+
+  Eg: {1=[1.0],2=[],3=[]}
+  Here the cluster has 3 pools with id 1,2,3 and pool 1 has an inactive PG 1.0
+*/
+void PGMap::get_unavailable_pg_in_pool_map(const OSDMap& osdmap)
+{
+  dout(20) << __func__ << dendl;
+  pool_pg_unavailable_map.clear();
+  utime_t now(ceph_clock_now());
+  utime_t cutoff = now - utime_t(g_conf().get_val<int64_t>("mon_pg_stuck_threshold"), 0);
+  for (auto i = pg_stat.begin();
+       i != pg_stat.end();
+       ++i) {
+    const auto poolid = i->first.pool();
+    pool_pg_unavailable_map[poolid];
+    utime_t val = cutoff;
+
+    if (!(i->second.state & PG_STATE_ACTIVE)) { // This case covers unknown state since unknow state bit == 0;
+      if (i->second.last_active < val)
+       val = i->second.last_active;
+    }
+
+    if (i->second.state & PG_STATE_STALE) {
+      if (i->second.last_unstale < val)
+       val = i->second.last_unstale;
+    }
+
+    if (val < cutoff) {
+      pool_pg_unavailable_map[poolid].push_back(i->first);
+      dout(20) << "pool: " << poolid << " pg: " << i->first
+         << " is stuck unavailable" << " state: " << i->second.state << dendl;
+    }
+  }
+}
+
 void PGMap::calc_stats()
 {
   num_pg = 0;
@@ -1488,6 +1544,7 @@ void PGMap::encode_digest(const OSDMap& osdmap,
   get_rules_avail(osdmap, &avail_space_by_rule);
   calc_osd_sum_by_class(osdmap);
   calc_purged_snaps();
+  get_unavailable_pg_in_pool_map(osdmap);
   PGMapDigest::encode(bl, features);
 }
 
index 4b65ca04576604eee71a2e39feb2c5f5c4825ada..7ed6c63c12b8a4852202e016e18b85bfef832e70 100644 (file)
@@ -27,6 +27,9 @@
 #include "common/Formatter.h"
 #include "osd/osd_types.h"
 #include "include/mempool.h"
+#include "mon/health_check.h"
+#include <sstream>
+#include "mon/mon_types.h"
 
 #include <cstdint>
 #include <iosfwd>
@@ -57,6 +60,7 @@ public:
   osd_stat_t osd_sum;
   mempool::pgmap::map<std::string,osd_stat_t> osd_sum_by_class;
   mempool::pgmap::unordered_map<uint64_t,int32_t> num_pg_by_state;
+  mempool::pgmap::map<uint64_t,std::vector<pg_t>> pool_pg_unavailable_map;
   struct pg_count {
     int32_t acting = 0;
     int32_t up_not_acting = 0;
@@ -440,6 +444,7 @@ public:
 
   void apply_incremental(CephContext *cct, const Incremental& inc);
   void calc_stats();
+  void get_unavailable_pg_in_pool_map(const OSDMap& osdmap);
   void stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
                   bool sameosds=false);
   bool stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
index 12cd3b450c41b32ed085b1d87c0d5ef1661b86ec..87ad69be92cd183f8f08f840a8b49f13eef4a9ce 100644 (file)
@@ -28,6 +28,7 @@
 #include "common/bit_str.h"
 #include "common/ceph_releases.h"
 #include "msg/msg_types.h" // for entity_addrvec_t
+#include "common/Clock.h"
 
 // use as paxos_service index
 enum {
@@ -736,4 +737,72 @@ struct ProgressEvent {
 };
 WRITE_CLASS_ENCODER(ProgressEvent)
 
+struct PoolAvailability {
+  std::string pool_name  = "";
+  utime_t started_at = ceph_clock_now();
+  uint64_t uptime = 0;
+  utime_t last_uptime = ceph_clock_now();
+  uint64_t downtime = 0;
+  utime_t last_downtime = ceph_clock_now();
+  uint64_t num_failures = 0;
+  bool is_avail = true;
+
+  PoolAvailability() {}
+
+  void dump(ceph::Formatter *f) const {
+    ceph_assert(f != NULL);
+    f->dump_stream("pool_name") << pool_name;
+    f->dump_stream("started_at") << started_at;
+    f->dump_int("uptime", uptime);
+    f->dump_stream("last_uptime") << last_uptime;
+    f->dump_int("downtime", downtime);
+    f->dump_stream("last_downtime") << last_downtime;
+    f->dump_int("num_failures", num_failures);
+    f->dump_bool("is_avail", is_avail);
+  }
+
+  void encode(ceph::buffer::list &bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(pool_name, bl);
+    encode(started_at, bl);
+    encode(uptime, bl);
+    encode(last_uptime, bl);
+    encode(downtime, bl);
+    encode(last_downtime, bl);
+    encode(num_failures, bl);
+    encode(is_avail, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(ceph::buffer::list::const_iterator &p) {
+    DECODE_START(1, p);
+    decode(pool_name, p);
+    decode(started_at, p);
+    decode(uptime, p);
+    decode(last_uptime, p);
+    decode(downtime, p);
+    decode(last_downtime, p);
+    decode(num_failures, p);
+    decode(is_avail, p);
+    DECODE_FINISH(p);
+  }
+
+  static void generate_test_instances(std::list<PoolAvailability*>& o) {
+    o.push_back(new PoolAvailability);
+    o.back()->started_at = utime_t(123, 456);      
+    o.back()->last_uptime = utime_t(123, 456);      
+    o.back()->last_downtime = utime_t(123, 456);   
+    o.push_back(new PoolAvailability);
+    o.back()->pool_name = "foo";    
+    o.back()->started_at = utime_t(123, 456);    
+    o.back()->uptime = 100;    
+    o.back()->last_uptime = utime_t(123, 456);    
+    o.back()->downtime = 15;    
+    o.back()->last_downtime = utime_t(123, 456);    
+    o.back()->num_failures = 2;    
+    o.back()->is_avail = true;    
+  }  
+};
+WRITE_CLASS_ENCODER(PoolAvailability)
+
 #endif
index 01141f115296abe6ca931633098d5391dcaad24e..c49173a90b6d6e72cd9cc5e020f2b73c3f484cd5 100644 (file)
@@ -170,6 +170,7 @@ TYPE(mon_feature_t)
 TYPE_FEATUREFUL(DataStats)
 TYPE_FEATUREFUL(ProgressEvent)
 TYPE(FeatureMap)
+TYPE(PoolAvailability)
 
 #include "mon/CreatingPGs.h"
 TYPE_FEATUREFUL(creating_pgs_t)