From 5dcb6d81bbc2a4c4d0da4a33d9f6bbba5065a1ad Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 10 Jul 2019 17:12:16 -0500 Subject: [PATCH] mon: use per-pool stats only when all OSDs are reporting Previously, we would start using the per-pool stat sums as soon as *any* OSDs were reporting. For a legacy cluster, that meant that as soon as one bluestore instance is updated or one new bluestore OSD is created, the usage stats per pool would become useless. Instead, only use the per-pool stats once *all* OSDs are reporting the new values. This mostly aligns with the health warning when one or more bluestore OSDs are not reporting; once they are updated the warning goes away. (That does not factor in filestore OSDs, though; all OSDs need to be new *and* bluestore.) Signed-off-by: Sage Weil --- src/librados/RadosClient.cc | 8 +++++--- src/librados/RadosClient.h | 3 ++- src/librados/librados_c.cc | 7 ++++--- src/librados/librados_cxx.cc | 7 ++++--- src/messages/MGetPoolStatsReply.h | 21 ++++++++++++++++++--- src/mon/MgrStatMonitor.cc | 1 + src/mon/PGMap.cc | 16 ++++++++++------ src/mon/PGMap.h | 8 +++++++- src/osd/osd_types.h | 8 ++++---- src/osdc/Objecter.cc | 3 +++ src/osdc/Objecter.h | 5 ++++- src/test/mon/PGMap.cc | 6 +++--- 12 files changed, 65 insertions(+), 28 deletions(-) diff --git a/src/librados/RadosClient.cc b/src/librados/RadosClient.cc index 498ca69be58..2fc2012c906 100644 --- a/src/librados/RadosClient.cc +++ b/src/librados/RadosClient.cc @@ -637,15 +637,17 @@ int librados::RadosClient::pool_list(std::list >& v) } int librados::RadosClient::get_pool_stats(std::list& pools, - map& result) + map *result, + bool *per_pool) { Mutex mylock("RadosClient::get_pool_stats::mylock"); Cond cond; bool done; int ret = 0; - objecter->get_pool_stats(pools, &result, new C_SafeCond(&mylock, &cond, &done, - &ret)); + objecter->get_pool_stats(pools, result, per_pool, + new C_SafeCond(&mylock, &cond, &done, + &ret)); mylock.Lock(); while (!done) diff --git 
a/src/librados/RadosClient.h b/src/librados/RadosClient.h index ecba2e6af20..93f7f66e971 100644 --- a/src/librados/RadosClient.h +++ b/src/librados/RadosClient.h @@ -118,7 +118,8 @@ public: bool wait_latest_map = false); int pool_list(std::list >& ls); - int get_pool_stats(std::list& ls, map& result); + int get_pool_stats(std::list& ls, map *result, + bool *per_pool); int get_fs_stats(ceph_statfs& result); bool get_pool_is_selfmanaged_snaps_mode(const std::string& pool); diff --git a/src/librados/librados_c.cc b/src/librados/librados_c.cc index 40183c28f9c..ac913d1ebdf 100644 --- a/src/librados/librados_c.cc +++ b/src/librados/librados_c.cc @@ -1010,18 +1010,19 @@ extern "C" int _rados_ioctx_pool_stat(rados_ioctx_t io, ls.push_back(pool_name); map rawresult; - err = io_ctx_impl->client->get_pool_stats(ls, rawresult); + bool per_pool = false; + err = io_ctx_impl->client->get_pool_stats(ls, &rawresult, &per_pool); if (err) { tracepoint(librados, rados_ioctx_pool_stat_exit, err, stats); return err; } ::pool_stat_t& r = rawresult[pool_name]; - uint64_t allocated_bytes = r.get_allocated_bytes(); + uint64_t allocated_bytes = r.get_allocated_bytes(per_pool); // FIXME: raw_used_rate is unknown hence use 1.0 here // meaning we keep net amount aggregated over all replicas // Not a big deal so far since this field isn't exposed - uint64_t user_bytes = r.get_user_bytes(1.0); + uint64_t user_bytes = r.get_user_bytes(1.0, per_pool); stats->num_kb = shift_round_up(allocated_bytes, 10); stats->num_bytes = allocated_bytes; diff --git a/src/librados/librados_cxx.cc b/src/librados/librados_cxx.cc index 4dc75a8900b..7a5094f1ad7 100644 --- a/src/librados/librados_cxx.cc +++ b/src/librados/librados_cxx.cc @@ -2563,18 +2563,19 @@ int librados::Rados::get_pool_stats(std::list& v, stats_map& result) { map rawresult; - int r = client->get_pool_stats(v, rawresult); + bool per_pool = false; + int r = client->get_pool_stats(v, &rawresult, &per_pool); for (map::iterator p = rawresult.begin(); p 
!= rawresult.end(); ++p) { pool_stat_t& pv = result[p->first]; auto& pstat = p->second; store_statfs_t &statfs = pstat.store_stats; - uint64_t allocated_bytes = pstat.get_allocated_bytes(); + uint64_t allocated_bytes = pstat.get_allocated_bytes(per_pool); // FIXME: raw_used_rate is unknown hence use 1.0 here // meaning we keep net amount aggregated over all replicas // Not a big deal so far since this field isn't exposed - uint64_t user_bytes = pstat.get_user_bytes(1.0); + uint64_t user_bytes = pstat.get_user_bytes(1.0, per_pool); object_stat_sum_t *sum = &p->second.stats.sum; pv.num_kb = shift_round_up(allocated_bytes, 10); diff --git a/src/messages/MGetPoolStatsReply.h b/src/messages/MGetPoolStatsReply.h index 9a7ccb4cc46..ff474d3d5db 100644 --- a/src/messages/MGetPoolStatsReply.h +++ b/src/messages/MGetPoolStatsReply.h @@ -17,13 +17,19 @@ #define CEPH_MGETPOOLSTATSREPLY_H class MGetPoolStatsReply : public PaxosServiceMessage { + static constexpr int HEAD_VERSION = 2; + static constexpr int COMPAT_VERSION = 1; + public: uuid_d fsid; std::map pool_stats; + bool per_pool = false; - MGetPoolStatsReply() : PaxosServiceMessage{MSG_GETPOOLSTATSREPLY, 0} {} + MGetPoolStatsReply() : PaxosServiceMessage{MSG_GETPOOLSTATSREPLY, 0, + HEAD_VERSION, COMPAT_VERSION} {} MGetPoolStatsReply(uuid_d& f, ceph_tid_t t, version_t v) : - PaxosServiceMessage{MSG_GETPOOLSTATSREPLY, v}, + PaxosServiceMessage{MSG_GETPOOLSTATSREPLY, v, + HEAD_VERSION, COMPAT_VERSION}, fsid(f) { set_tid(t); } @@ -34,7 +40,10 @@ private: public: std::string_view get_type_name() const override { return "getpoolstats"; } void print(std::ostream& out) const override { - out << "getpoolstatsreply(" << get_tid() << " v" << version << ")"; + out << "getpoolstatsreply(" << get_tid(); + if (per_pool) + out << " per_pool"; + out << " v" << version << ")"; } void encode_payload(uint64_t features) override { @@ -42,6 +51,7 @@ public: paxos_encode(); encode(fsid, payload); encode(pool_stats, payload, features); + 
encode(per_pool, payload); } void decode_payload() override { using ceph::decode; @@ -49,6 +59,11 @@ public: paxos_decode(p); decode(fsid, p); decode(pool_stats, p); + if (header.version >= 2) { + decode(per_pool, p); + } else { + per_pool = false; + } } private: template diff --git a/src/mon/MgrStatMonitor.cc b/src/mon/MgrStatMonitor.cc index f88cdf823fb..6cef81b47ba 100644 --- a/src/mon/MgrStatMonitor.cc +++ b/src/mon/MgrStatMonitor.cc @@ -246,6 +246,7 @@ bool MgrStatMonitor::preprocess_getpoolstats(MonOpRequestRef op) } epoch_t ver = get_last_committed(); auto reply = new MGetPoolStatsReply(m->fsid, m->get_tid(), ver); + reply->per_pool = digest.use_per_pool_stats(); for (const auto& pool_name : m->pools) { const auto pool_id = mon->osdmon()->osdmap.lookup_pg_pool_name(pool_name); if (pool_id == -ENOENT) diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index 11bf4fda886..31b26fd3434 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -807,7 +807,9 @@ void PGMapDigest::dump_pool_stats_full( << pool_id; } float raw_used_rate = osd_map.pool_raw_used_rate(pool_id); - dump_object_stat_sum(tbl, f, stat, avail, raw_used_rate, verbose, pool); + bool per_pool = use_per_pool_stats(); + dump_object_stat_sum(tbl, f, stat, avail, raw_used_rate, verbose, per_pool, + pool); if (f) { f->close_section(); // stats f->close_section(); // pool @@ -836,6 +838,8 @@ void PGMapDigest::dump_cluster_stats(stringstream *ss, f->dump_int("total_used_bytes", osd_sum.statfs.get_used()); f->dump_int("total_used_raw_bytes", osd_sum.statfs.get_used_raw()); f->dump_float("total_used_raw_ratio", osd_sum.statfs.get_used_raw_ratio()); + f->dump_unsigned("num_osds", osd_sum.num_osds); + f->dump_unsigned("num_per_pool_osds", osd_sum.num_per_pool_osds); f->close_section(); f->open_object_section("stats_by_class"); for (auto& i : osd_sum_by_class) { @@ -886,7 +890,7 @@ void PGMapDigest::dump_cluster_stats(stringstream *ss, void PGMapDigest::dump_object_stat_sum( TextTable &tbl, ceph::Formatter *f, 
const pool_stat_t &pool_stat, uint64_t avail, - float raw_used_rate, bool verbose, + float raw_used_rate, bool verbose, bool per_pool, const pg_pool_t *pool) { const object_stat_sum_t &sum = pool_stat.stats.sum; @@ -895,8 +899,8 @@ void PGMapDigest::dump_object_stat_sum( if (sum.num_object_copies > 0) { raw_used_rate *= (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies; } - - uint64_t used_bytes = pool_stat.get_allocated_bytes(); + + uint64_t used_bytes = pool_stat.get_allocated_bytes(per_pool); float used = 0.0; // note avail passed in is raw_avail, calc raw_used here. @@ -908,7 +912,7 @@ void PGMapDigest::dump_object_stat_sum( } auto avail_res = raw_used_rate ? avail / raw_used_rate : 0; // an approximation for actually stored user data - auto stored_normalized = pool_stat.get_user_bytes(raw_used_rate); + auto stored_normalized = pool_stat.get_user_bytes(raw_used_rate, per_pool); if (f) { f->dump_int("stored", stored_normalized); f->dump_int("objects", sum.num_objects); @@ -927,7 +931,7 @@ void PGMapDigest::dump_object_stat_sum( f->dump_int("compress_bytes_used", statfs.data_compressed_allocated); f->dump_int("compress_under_bytes", statfs.data_compressed_original); // Stored by user amplified by replication - f->dump_int("stored_raw", pool_stat.get_user_bytes(1.0)); + f->dump_int("stored_raw", pool_stat.get_user_bytes(1.0, per_pool)); } } else { tbl << stringify(byte_u_t(stored_normalized)); diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h index 6d1af633869..1fc92ebc7d1 100644 --- a/src/mon/PGMap.h +++ b/src/mon/PGMap.h @@ -71,6 +71,10 @@ public: mempool::pgmap::map> purged_snaps; + bool use_per_pool_stats() const { + return osd_sum.num_osds == osd_sum.num_per_pool_osds; + } + // recent deltas, and summation /** * keep track of last deltas for each pool, calculated using @@ -169,7 +173,9 @@ public: const pool_stat_t &pool_stat, uint64_t avail, float raw_used_rate, - bool verbose, const pg_pool_t *pool); + bool verbose, + bool 
per_pool, + const pg_pool_t *pool); size_t get_num_pg_by_osd(int osd) const { auto p = num_pg_by_osd.find(osd); diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index e8205b30ca0..1455f10adcd 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -2509,9 +2509,9 @@ struct pool_stat_t { // In legacy mode used and netto values are the same. But for new per-pool // collection 'used' provides amount of space ALLOCATED at all related OSDs // and 'netto' is amount of stored user data. - uint64_t get_allocated_bytes() const { + uint64_t get_allocated_bytes(bool per_pool) const { uint64_t allocated_bytes; - if (num_store_stats) { + if (per_pool) { allocated_bytes = store_stats.allocated; } else { // legacy mode, use numbers from 'stats' @@ -2522,9 +2522,9 @@ struct pool_stat_t { allocated_bytes += stats.sum.num_omap_bytes; return allocated_bytes; } - uint64_t get_user_bytes(float raw_used_rate) const { + uint64_t get_user_bytes(float raw_used_rate, bool per_pool) const { uint64_t user_bytes; - if (num_store_stats) { + if (per_pool) { user_bytes = raw_used_rate ? 
store_stats.data_stored / raw_used_rate : 0; } else { // legacy mode, use numbers from 'stats' diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index edca8255bbd..e0ba1253a5e 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -4175,6 +4175,7 @@ void Objecter::_finish_pool_op(PoolOp *op, int r) void Objecter::get_pool_stats(list& pools, map *result, + bool *per_pool, Context *onfinish) { ldout(cct, 10) << "get_pool_stats " << pools << dendl; @@ -4183,6 +4184,7 @@ void Objecter::get_pool_stats(list& pools, op->tid = ++last_tid; op->pools = pools; op->pool_stats = result; + op->per_pool = per_pool; op->onfinish = onfinish; if (mon_timeout > timespan(0)) { op->ontimeout = timer.add_event(mon_timeout, @@ -4229,6 +4231,7 @@ void Objecter::handle_get_pool_stats_reply(MGetPoolStatsReply *m) PoolStatOp *op = poolstat_ops[tid]; ldout(cct, 10) << "have request " << tid << " at " << op << dendl; *op->pool_stats = m->pool_stats; + *op->per_pool = m->per_pool; if (m->version > last_seen_pgmap_version) { last_seen_pgmap_version = m->version; } diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index b428ecc5217..a0b42ae2c15 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -1644,6 +1644,7 @@ public: std::list pools; std::map *pool_stats; + bool *per_pool; Context *onfinish; uint64_t ontimeout; @@ -2998,7 +2999,9 @@ private: void _poolstat_submit(PoolStatOp *op); public: void handle_get_pool_stats_reply(MGetPoolStatsReply *m); - void get_pool_stats(std::list& pools, std::map *result, + void get_pool_stats(std::list& pools, + std::map *result, + bool *per_pool, Context *onfinish); int pool_stat_op_cancel(ceph_tid_t tid, int r); void _finish_pool_stat_op(PoolStatOp *op, int r); diff --git a/src/test/mon/PGMap.cc b/src/test/mon/PGMap.cc index efdb186e7ca..73007e0d509 100644 --- a/src/test/mon/PGMap.cc +++ b/src/test/mon/PGMap.cc @@ -80,7 +80,7 @@ TEST(pgmap, dump_object_stat_sum_0) pool.size = 2; pool.type = pg_pool_t::TYPE_REPLICATED; 
PGMap::dump_object_stat_sum(tbl, nullptr, pool_stat, avail, - pool.get_size(), verbose, &pool); + pool.get_size(), verbose, true, &pool); float copies_rate = (static_cast(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies) * pool.get_size(); @@ -117,7 +117,7 @@ TEST(pgmap, dump_object_stat_sum_1) pool.size = 2; pool.type = pg_pool_t::TYPE_REPLICATED; PGMap::dump_object_stat_sum(tbl, nullptr, pool_stat, avail, - pool.get_size(), verbose, &pool); + pool.get_size(), verbose, true, &pool); unsigned col = 0; ASSERT_EQ(stringify(byte_u_t(0)), tbl.get(0, col++)); ASSERT_EQ(stringify(si_u_t(0)), tbl.get(0, col++)); @@ -148,7 +148,7 @@ TEST(pgmap, dump_object_stat_sum_2) pool.type = pg_pool_t::TYPE_REPLICATED; PGMap::dump_object_stat_sum(tbl, nullptr, pool_stat, avail, - pool.get_size(), verbose, &pool); + pool.get_size(), verbose, true, &pool); unsigned col = 0; ASSERT_EQ(stringify(byte_u_t(0)), tbl.get(0, col++)); ASSERT_EQ(stringify(si_u_t(0)), tbl.get(0, col++)); -- 2.39.5