From: Sage Weil Date: Fri, 12 Jul 2019 22:25:49 +0000 (-0500) Subject: mon: use per-pool stats only when all OSDs are reporting X-Git-Tag: v14.2.2~3^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=aacfa8f08cb7c916ffa821545615d4a5c2fa5b05;p=ceph.git mon: use per-pool stats only when all OSDs are reporting Previously, we would start using the per-pool stat sums as soon as *any* OSDs were reporting. For a legacy cluster, that meant that as soon as one bluestore instance is updated or one new bluestore OSD is created, the usage stats per pool would become useless. Instead, only use the per-pool stats once *all* OSDs are reporting the new values. This mostly aligns with the health warning when one or more bluestore OSDs are not reporting; once they are updated the warning goes away. (That does not factor in filestore OSDs, though; all OSDs need to be new *and* bluestore.) Signed-off-by: Sage Weil (cherry picked from commit 5dcb6d81bbc2a4c4d0da4a33d9f6bbba5065a1ad) # Conflicts: # src/messages/MGetPoolStatsReply.h - ctor arguments are all weird # src/mon/PGMap.h - lots of std:: everywhere # src/osdc/Objecter.h - std:: everywhere --- diff --git a/src/librados/RadosClient.cc b/src/librados/RadosClient.cc index e2321759987..902c2c66fbf 100644 --- a/src/librados/RadosClient.cc +++ b/src/librados/RadosClient.cc @@ -645,15 +645,17 @@ int librados::RadosClient::pool_list(std::list >& v) } int librados::RadosClient::get_pool_stats(std::list& pools, - map& result) + map *result, + bool *per_pool) { Mutex mylock("RadosClient::get_pool_stats::mylock"); Cond cond; bool done; int ret = 0; - objecter->get_pool_stats(pools, &result, new C_SafeCond(&mylock, &cond, &done, - &ret)); + objecter->get_pool_stats(pools, result, per_pool, + new C_SafeCond(&mylock, &cond, &done, + &ret)); mylock.Lock(); while (!done) diff --git a/src/librados/RadosClient.h b/src/librados/RadosClient.h index 8c12aa066a1..54eb3f9faf6 100644 --- a/src/librados/RadosClient.h +++ 
b/src/librados/RadosClient.h @@ -119,7 +119,8 @@ public: bool wait_latest_map = false); int pool_list(std::list >& ls); - int get_pool_stats(std::list& ls, map& result); + int get_pool_stats(std::list& ls, map *result, + bool *per_pool); int get_fs_stats(ceph_statfs& result); bool get_pool_is_selfmanaged_snaps_mode(const std::string& pool); diff --git a/src/librados/librados_c.cc b/src/librados/librados_c.cc index ea20879823f..118ca2fd5b7 100644 --- a/src/librados/librados_c.cc +++ b/src/librados/librados_c.cc @@ -1010,18 +1010,19 @@ extern "C" int _rados_ioctx_pool_stat(rados_ioctx_t io, ls.push_back(pool_name); map rawresult; - err = io_ctx_impl->client->get_pool_stats(ls, rawresult); + bool per_pool = false; + err = io_ctx_impl->client->get_pool_stats(ls, &rawresult, &per_pool); if (err) { tracepoint(librados, rados_ioctx_pool_stat_exit, err, stats); return err; } ::pool_stat_t& r = rawresult[pool_name]; - uint64_t allocated_bytes = r.get_allocated_bytes(); + uint64_t allocated_bytes = r.get_allocated_bytes(per_pool); // FIXME: raw_used_rate is unknown hence use 1.0 here // meaning we keep net amount aggregated over all replicas // Not a big deal so far since this field isn't exposed - uint64_t user_bytes = r.get_user_bytes(1.0); + uint64_t user_bytes = r.get_user_bytes(1.0, per_pool); stats->num_kb = shift_round_up(allocated_bytes, 10); stats->num_bytes = allocated_bytes; diff --git a/src/librados/librados_cxx.cc b/src/librados/librados_cxx.cc index c174c7d58a7..77190788a67 100644 --- a/src/librados/librados_cxx.cc +++ b/src/librados/librados_cxx.cc @@ -2476,18 +2476,19 @@ int librados::Rados::get_pool_stats(std::list& v, stats_map& result) { map rawresult; - int r = client->get_pool_stats(v, rawresult); + bool per_pool = false; + int r = client->get_pool_stats(v, &rawresult, &per_pool); for (map::iterator p = rawresult.begin(); p != rawresult.end(); ++p) { pool_stat_t& pv = result[p->first]; auto& pstat = p->second; store_statfs_t &statfs = pstat.store_stats; 
- uint64_t allocated_bytes = pstat.get_allocated_bytes(); + uint64_t allocated_bytes = pstat.get_allocated_bytes(per_pool); // FIXME: raw_used_rate is unknown hence use 1.0 here // meaning we keep net amount aggregated over all replicas // Not a big deal so far since this field isn't exposed - uint64_t user_bytes = pstat.get_user_bytes(1.0); + uint64_t user_bytes = pstat.get_user_bytes(1.0, per_pool); object_stat_sum_t *sum = &p->second.stats.sum; pv.num_kb = shift_round_up(allocated_bytes, 10); diff --git a/src/messages/MGetPoolStatsReply.h b/src/messages/MGetPoolStatsReply.h index e9bb829dc17..8af585843aa 100644 --- a/src/messages/MGetPoolStatsReply.h +++ b/src/messages/MGetPoolStatsReply.h @@ -17,15 +17,21 @@ #define CEPH_MGETPOOLSTATSREPLY_H class MGetPoolStatsReply : public MessageInstance { + static constexpr int HEAD_VERSION = 2; + static constexpr int COMPAT_VERSION = 1; + public: friend factory; uuid_d fsid; map pool_stats; + bool per_pool = false; - MGetPoolStatsReply() : MessageInstance(MSG_GETPOOLSTATSREPLY, 0) {} + MGetPoolStatsReply() : MessageInstance(MSG_GETPOOLSTATSREPLY, 0, + HEAD_VERSION, COMPAT_VERSION) {} MGetPoolStatsReply(uuid_d& f, ceph_tid_t t, version_t v) : - MessageInstance(MSG_GETPOOLSTATSREPLY, v), + MessageInstance(MSG_GETPOOLSTATSREPLY, v, + HEAD_VERSION, COMPAT_VERSION), fsid(f) { set_tid(t); } @@ -36,7 +42,10 @@ private: public: std::string_view get_type_name() const override { return "getpoolstats"; } void print(ostream& out) const override { - out << "getpoolstatsreply(" << get_tid() << " v" << version << ")"; + out << "getpoolstatsreply(" << get_tid(); + if (per_pool) + out << " per_pool"; + out << " v" << version << ")"; } void encode_payload(uint64_t features) override { @@ -44,6 +53,7 @@ public: paxos_encode(); encode(fsid, payload); encode(pool_stats, payload, features); + encode(per_pool, payload); } void decode_payload() override { using ceph::decode; @@ -51,6 +61,11 @@ public: paxos_decode(p); decode(fsid, p); 
decode(pool_stats, p); + if (header.version >= 2) { + decode(per_pool, p); + } else { + per_pool = false; + } } }; diff --git a/src/mon/MgrStatMonitor.cc b/src/mon/MgrStatMonitor.cc index f88cdf823fb..6cef81b47ba 100644 --- a/src/mon/MgrStatMonitor.cc +++ b/src/mon/MgrStatMonitor.cc @@ -246,6 +246,7 @@ bool MgrStatMonitor::preprocess_getpoolstats(MonOpRequestRef op) } epoch_t ver = get_last_committed(); auto reply = new MGetPoolStatsReply(m->fsid, m->get_tid(), ver); + reply->per_pool = digest.use_per_pool_stats(); for (const auto& pool_name : m->pools) { const auto pool_id = mon->osdmon()->osdmap.lookup_pg_pool_name(pool_name); if (pool_id == -ENOENT) diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index be8a20f8596..a067f4af6f5 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -798,7 +798,9 @@ void PGMapDigest::dump_pool_stats_full( << pool_id; } float raw_used_rate = osd_map.pool_raw_used_rate(pool_id); - dump_object_stat_sum(tbl, f, stat, avail, raw_used_rate, verbose, pool); + bool per_pool = use_per_pool_stats(); + dump_object_stat_sum(tbl, f, stat, avail, raw_used_rate, verbose, per_pool, + pool); if (f) { f->close_section(); // stats f->close_section(); // pool @@ -827,6 +829,8 @@ void PGMapDigest::dump_cluster_stats(stringstream *ss, f->dump_int("total_used_bytes", osd_sum.statfs.get_used()); f->dump_int("total_used_raw_bytes", osd_sum.statfs.get_used_raw()); f->dump_float("total_used_raw_ratio", osd_sum.statfs.get_used_raw_ratio()); + f->dump_unsigned("num_osds", osd_sum.num_osds); + f->dump_unsigned("num_per_pool_osds", osd_sum.num_per_pool_osds); f->close_section(); f->open_object_section("stats_by_class"); for (auto& i : osd_sum_by_class) { @@ -877,7 +881,7 @@ void PGMapDigest::dump_cluster_stats(stringstream *ss, void PGMapDigest::dump_object_stat_sum( TextTable &tbl, Formatter *f, const pool_stat_t &pool_stat, uint64_t avail, - float raw_used_rate, bool verbose, + float raw_used_rate, bool verbose, bool per_pool, const pg_pool_t *pool) { const 
object_stat_sum_t &sum = pool_stat.stats.sum; @@ -886,8 +890,8 @@ void PGMapDigest::dump_object_stat_sum( if (sum.num_object_copies > 0) { raw_used_rate *= (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies; } - - uint64_t used_bytes = pool_stat.get_allocated_bytes(); + + uint64_t used_bytes = pool_stat.get_allocated_bytes(per_pool); float used = 0.0; // note avail passed in is raw_avail, calc raw_used here. @@ -899,7 +903,7 @@ void PGMapDigest::dump_object_stat_sum( } auto avail_res = raw_used_rate ? avail / raw_used_rate : 0; // an approximation for actually stored user data - auto stored_normalized = pool_stat.get_user_bytes(raw_used_rate); + auto stored_normalized = pool_stat.get_user_bytes(raw_used_rate, per_pool); if (f) { f->dump_int("stored", stored_normalized); f->dump_int("objects", sum.num_objects); @@ -918,7 +922,7 @@ void PGMapDigest::dump_object_stat_sum( f->dump_int("compress_bytes_used", statfs.data_compressed_allocated); f->dump_int("compress_under_bytes", statfs.data_compressed_original); // Stored by user amplified by replication - f->dump_int("stored_raw", pool_stat.get_user_bytes(1.0)); + f->dump_int("stored_raw", pool_stat.get_user_bytes(1.0, per_pool)); } } else { tbl << stringify(byte_u_t(stored_normalized)); diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h index 550560064ea..91a7c1dd22b 100644 --- a/src/mon/PGMap.h +++ b/src/mon/PGMap.h @@ -71,6 +71,10 @@ public: mempool::pgmap::map> purged_snaps; + bool use_per_pool_stats() const { + return osd_sum.num_osds == osd_sum.num_per_pool_osds; + } + // recent deltas, and summation /** * keep track of last deltas for each pool, calculated using @@ -169,7 +173,8 @@ public: const pool_stat_t &pool_stat, uint64_t avail, float raw_used_rate, - bool verbose, const pg_pool_t *pool); + bool verbose, bool per_pool, + const pg_pool_t *pool); size_t get_num_pg_by_osd(int osd) const { auto p = num_pg_by_osd.find(osd); diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 
52da66e9d6e..7e591d956b5 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -2497,9 +2497,9 @@ struct pool_stat_t { // In legacy mode used and netto values are the same. But for new per-pool // collection 'used' provides amount of space ALLOCATED at all related OSDs // and 'netto' is amount of stored user data. - uint64_t get_allocated_bytes() const { + uint64_t get_allocated_bytes(bool per_pool) const { uint64_t allocated_bytes; - if (num_store_stats) { + if (per_pool) { allocated_bytes = store_stats.allocated; } else { // legacy mode, use numbers from 'stats' @@ -2510,9 +2510,9 @@ struct pool_stat_t { allocated_bytes += stats.sum.num_omap_bytes; return allocated_bytes; } - uint64_t get_user_bytes(float raw_used_rate) const { + uint64_t get_user_bytes(float raw_used_rate, bool per_pool) const { uint64_t user_bytes; - if (num_store_stats) { + if (per_pool) { user_bytes = raw_used_rate ? store_stats.data_stored / raw_used_rate : 0; } else { // legacy mode, use numbers from 'stats' diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index 27e00c2309f..0c172c95518 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -4140,6 +4140,7 @@ void Objecter::_finish_pool_op(PoolOp *op, int r) void Objecter::get_pool_stats(list& pools, map *result, + bool *per_pool, Context *onfinish) { ldout(cct, 10) << "get_pool_stats " << pools << dendl; @@ -4148,6 +4149,7 @@ void Objecter::get_pool_stats(list& pools, op->tid = ++last_tid; op->pools = pools; op->pool_stats = result; + op->per_pool = per_pool; op->onfinish = onfinish; if (mon_timeout > timespan(0)) { op->ontimeout = timer.add_event(mon_timeout, @@ -4194,6 +4196,7 @@ void Objecter::handle_get_pool_stats_reply(MGetPoolStatsReply *m) PoolStatOp *op = poolstat_ops[tid]; ldout(cct, 10) << "have request " << tid << " at " << op << dendl; *op->pool_stats = m->pool_stats; + *op->per_pool = m->per_pool; if (m->version > last_seen_pgmap_version) { last_seen_pgmap_version = m->version; } diff --git 
a/src/osdc/Objecter.h b/src/osdc/Objecter.h index 3dfa1a2d7f7..ca8d85f7ac1 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -1564,6 +1564,7 @@ public: list pools; map *pool_stats; + bool *per_pool; Context *onfinish; uint64_t ontimeout; @@ -2937,6 +2938,7 @@ private: public: void handle_get_pool_stats_reply(MGetPoolStatsReply *m); void get_pool_stats(list& pools, map *result, + bool *per_pool, Context *onfinish); int pool_stat_op_cancel(ceph_tid_t tid, int r); void _finish_pool_stat_op(PoolStatOp *op, int r); diff --git a/src/test/mon/PGMap.cc b/src/test/mon/PGMap.cc index efdb186e7ca..73007e0d509 100644 --- a/src/test/mon/PGMap.cc +++ b/src/test/mon/PGMap.cc @@ -80,7 +80,7 @@ TEST(pgmap, dump_object_stat_sum_0) pool.size = 2; pool.type = pg_pool_t::TYPE_REPLICATED; PGMap::dump_object_stat_sum(tbl, nullptr, pool_stat, avail, - pool.get_size(), verbose, &pool); + pool.get_size(), verbose, true, &pool); float copies_rate = (static_cast(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies) * pool.get_size(); @@ -117,7 +117,7 @@ TEST(pgmap, dump_object_stat_sum_1) pool.size = 2; pool.type = pg_pool_t::TYPE_REPLICATED; PGMap::dump_object_stat_sum(tbl, nullptr, pool_stat, avail, - pool.get_size(), verbose, &pool); + pool.get_size(), verbose, true, &pool); unsigned col = 0; ASSERT_EQ(stringify(byte_u_t(0)), tbl.get(0, col++)); ASSERT_EQ(stringify(si_u_t(0)), tbl.get(0, col++)); @@ -148,7 +148,7 @@ TEST(pgmap, dump_object_stat_sum_2) pool.type = pg_pool_t::TYPE_REPLICATED; PGMap::dump_object_stat_sum(tbl, nullptr, pool_stat, avail, - pool.get_size(), verbose, &pool); + pool.get_size(), verbose, true, &pool); unsigned col = 0; ASSERT_EQ(stringify(byte_u_t(0)), tbl.get(0, col++)); ASSERT_EQ(stringify(si_u_t(0)), tbl.get(0, col++));