From: Venky Shankar Date: Wed, 19 May 2021 05:16:22 +0000 (-0400) Subject: mds, mgr: plumb in new client metrics X-Git-Tag: v17.2.4~7^2~6 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=4144dbee7f938324ce1acebb15bc4797f57cb9b1;p=ceph.git mds, mgr: plumb in new client metrics Signed-off-by: Venky Shankar (cherry picked from commit f1083c0b3d41e6691945e8b40df8aa707a261217) --- diff --git a/src/include/cephfs/metrics/Types.h b/src/include/cephfs/metrics/Types.h index 7f5a40e245d8..d7cf56138611 100644 --- a/src/include/cephfs/metrics/Types.h +++ b/src/include/cephfs/metrics/Types.h @@ -27,6 +27,12 @@ enum ClientMetricType { CLIENT_METRIC_TYPE_OPENED_INODES, CLIENT_METRIC_TYPE_READ_IO_SIZES, CLIENT_METRIC_TYPE_WRITE_IO_SIZES, + CLIENT_METRIC_TYPE_AVG_READ_LATENCY, + CLIENT_METRIC_TYPE_STDEV_READ_LATENCY, + CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY, + CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, + CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, + CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, }; inline std::ostream &operator<<(std::ostream &os, const ClientMetricType &type) { switch(type) { @@ -60,6 +66,24 @@ inline std::ostream &operator<<(std::ostream &os, const ClientMetricType &type) case ClientMetricType::CLIENT_METRIC_TYPE_WRITE_IO_SIZES: os << "WRITE_IO_SIZES"; break; + case ClientMetricType::CLIENT_METRIC_TYPE_AVG_READ_LATENCY: + os << "AVG_READ_LATENCY"; + break; + case ClientMetricType::CLIENT_METRIC_TYPE_STDEV_READ_LATENCY: + os << "STDEV_READ_LATENCY"; + break; + case ClientMetricType::CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY: + os << "AVG_WRITE_LATENCY"; + break; + case ClientMetricType::CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY: + os << "STDEV_WRITE_LATENCY"; + break; + case ClientMetricType::CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY: + os << "AVG_METADATA_LATENCY"; + break; + case ClientMetricType::CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY: + os << "STDEV_METADATA_LATENCY"; + break; default: os << "(UNKNOWN:" << static_cast<std::underlying_type<ClientMetricType>::type>(type) << ")"; break; 
@@ -128,97 +152,154 @@ struct CapInfoPayload : public ClientMetricPayloadBase { struct ReadLatencyPayload : public ClientMetricPayloadBase { utime_t lat; + utime_t mean; + uint64_t sq_sum; // sum of squares + uint64_t count; // IO count ReadLatencyPayload() : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_READ_LATENCY) { } - ReadLatencyPayload(utime_t lat) - : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_READ_LATENCY), lat(lat) { + ReadLatencyPayload(utime_t lat, utime_t mean, uint64_t sq_sum, uint64_t count) + : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_READ_LATENCY), + lat(lat), + mean(mean), + sq_sum(sq_sum), + count(count) { } void encode(bufferlist &bl) const { using ceph::encode; - ENCODE_START(1, 1, bl); + ENCODE_START(2, 1, bl); encode(lat, bl); + encode(mean, bl); + encode(sq_sum, bl); + encode(count, bl); ENCODE_FINISH(bl); } void decode(bufferlist::const_iterator &iter) { using ceph::decode; - DECODE_START(1, iter); + DECODE_START(2, iter); decode(lat, iter); + if (struct_v >= 2) { + decode(mean, iter); + decode(sq_sum, iter); + decode(count, iter); + } DECODE_FINISH(iter); } void dump(Formatter *f) const { f->dump_int("latency", lat); + f->dump_int("avg_latency", mean); + f->dump_unsigned("sq_sum", sq_sum); + f->dump_unsigned("count", count); } void print(std::ostream *out) const { - *out << "latency: " << lat; + *out << "latency: " << lat << ", avg_latency: " << mean + << ", sq_sum: " << sq_sum << ", count=" << count; } }; struct WriteLatencyPayload : public ClientMetricPayloadBase { utime_t lat; + utime_t mean; + uint64_t sq_sum; // sum of squares + uint64_t count; // IO count WriteLatencyPayload() : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_WRITE_LATENCY) { } - WriteLatencyPayload(utime_t lat) - : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_WRITE_LATENCY), lat(lat) { + WriteLatencyPayload(utime_t lat, utime_t mean, uint64_t sq_sum, uint64_t count) + : 
ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_WRITE_LATENCY), + lat(lat), + mean(mean), + sq_sum(sq_sum), + count(count){ } void encode(bufferlist &bl) const { using ceph::encode; - ENCODE_START(1, 1, bl); + ENCODE_START(2, 1, bl); encode(lat, bl); + encode(mean, bl); + encode(sq_sum, bl); + encode(count, bl); ENCODE_FINISH(bl); } void decode(bufferlist::const_iterator &iter) { using ceph::decode; - DECODE_START(1, iter); + DECODE_START(2, iter); decode(lat, iter); + if (struct_v >= 2) { + decode(mean, iter); + decode(sq_sum, iter); + decode(count, iter); + } DECODE_FINISH(iter); } void dump(Formatter *f) const { f->dump_int("latency", lat); + f->dump_int("avg_latency", mean); + f->dump_unsigned("sq_sum", sq_sum); + f->dump_unsigned("count", count); } void print(std::ostream *out) const { - *out << "latency: " << lat; + *out << "latency: " << lat << ", avg_latency: " << mean + << ", sq_sum: " << sq_sum << ", count=" << count; } }; struct MetadataLatencyPayload : public ClientMetricPayloadBase { utime_t lat; + utime_t mean; + uint64_t sq_sum; // sum of squares + uint64_t count; // IO count MetadataLatencyPayload() - : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_METADATA_LATENCY) { } - MetadataLatencyPayload(utime_t lat) - : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_METADATA_LATENCY), lat(lat) { + : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_METADATA_LATENCY) { } + MetadataLatencyPayload(utime_t lat, utime_t mean, uint64_t sq_sum, uint64_t count) + : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_METADATA_LATENCY), + lat(lat), + mean(mean), + sq_sum(sq_sum), + count(count) { } void encode(bufferlist &bl) const { using ceph::encode; - ENCODE_START(1, 1, bl); + ENCODE_START(2, 1, bl); encode(lat, bl); + encode(mean, bl); + encode(sq_sum, bl); + encode(count, bl); ENCODE_FINISH(bl); } void decode(bufferlist::const_iterator &iter) { using ceph::decode; - DECODE_START(1, iter); + 
DECODE_START(2, iter); decode(lat, iter); + if (struct_v >= 2) { + decode(mean, iter); + decode(sq_sum, iter); + decode(count, iter); + } DECODE_FINISH(iter); } void dump(Formatter *f) const { f->dump_int("latency", lat); + f->dump_int("avg_latency", mean); + f->dump_unsigned("sq_sum", sq_sum); + f->dump_unsigned("count", count); } void print(std::ostream *out) const { - *out << "latency: " << lat; + *out << "latency: " << lat << ", avg_latency: " << mean + << ", sq_sum: " << sq_sum << ", count=" << count; } }; diff --git a/src/mds/MDSPerfMetricTypes.h b/src/mds/MDSPerfMetricTypes.h index 6bf64e91809f..78b838c892d9 100644 --- a/src/mds/MDSPerfMetricTypes.h +++ b/src/mds/MDSPerfMetricTypes.h @@ -39,66 +39,102 @@ struct CapHitMetric { struct ReadLatencyMetric { utime_t lat; + utime_t mean; + uint64_t sq_sum; + uint64_t count; bool updated = false; DENC(ReadLatencyMetric, v, p) { - DENC_START(2, 1, p); + DENC_START(3, 1, p); denc(v.lat, p); if (struct_v >= 2) denc(v.updated, p); + if (struct_v >= 3) { + denc(v.mean, p); + denc(v.sq_sum, p); + denc(v.count, p); + } DENC_FINISH(p); } void dump(Formatter *f) const { f->dump_object("read_latency", lat); + f->dump_object("avg_read_alatency", mean); + f->dump_unsigned("sq_sum", sq_sum); + f->dump_unsigned("count", count); } friend std::ostream& operator<<(std::ostream& os, const ReadLatencyMetric &metric) { - os << "{latency=" << metric.lat << "}"; + os << "{latency=" << metric.lat << ", avg_latency=" << metric.mean + << ", sq_sum=" << metric.sq_sum << ", count=" << metric.count << "}"; return os; } }; struct WriteLatencyMetric { utime_t lat; + utime_t mean; + uint64_t sq_sum; + uint64_t count; bool updated = false; DENC(WriteLatencyMetric, v, p) { - DENC_START(2, 1, p); + DENC_START(3, 1, p); denc(v.lat, p); if (struct_v >= 2) denc(v.updated, p); + if (struct_v >= 3) { + denc(v.mean, p); + denc(v.sq_sum, p); + denc(v.count, p); + } DENC_FINISH(p); } void dump(Formatter *f) const { f->dump_object("write_latency", lat); + 
f->dump_object("avg_write_alatency", mean); + f->dump_unsigned("sq_sum", sq_sum); + f->dump_unsigned("count", count); } friend std::ostream& operator<<(std::ostream& os, const WriteLatencyMetric &metric) { - os << "{latency=" << metric.lat << "}"; + os << "{latency=" << metric.lat << ", avg_latency=" << metric.mean + << ", sq_sum=" << metric.sq_sum << ", count=" << metric.count << "}"; return os; } }; struct MetadataLatencyMetric { utime_t lat; + utime_t mean; + uint64_t sq_sum; + uint64_t count; bool updated = false; DENC(MetadataLatencyMetric, v, p) { - DENC_START(2, 1, p); + DENC_START(3, 1, p); denc(v.lat, p); if (struct_v >= 2) denc(v.updated, p); + if (struct_v >= 3) { + denc(v.mean, p); + denc(v.sq_sum, p); + denc(v.count, p); + } DENC_FINISH(p); } void dump(Formatter *f) const { f->dump_object("metadata_latency", lat); + f->dump_object("avg_metadata_alatency", mean); + f->dump_unsigned("sq_sum", sq_sum); + f->dump_unsigned("count", count); } friend std::ostream& operator<<(std::ostream& os, const MetadataLatencyMetric &metric) { - os << "{latency=" << metric.lat << "}"; + os << "{latency=" << metric.lat << ", avg_latency=" << metric.mean + << ", sq_sum=" << metric.sq_sum << ", count=" << metric.count << "}"; return os; } }; diff --git a/src/mds/MetricAggregator.cc b/src/mds/MetricAggregator.cc index 046e79269868..6487084fb639 100644 --- a/src/mds/MetricAggregator.cc +++ b/src/mds/MetricAggregator.cc @@ -168,6 +168,42 @@ void MetricAggregator::refresh_metrics_for_rank(const entity_inst_t &client, c->second = metrics.write_io_sizes_metric.total_size; } break; + case MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC: + if (metrics.read_latency_metric.updated) { + c->first = metrics.read_latency_metric.mean.tv.tv_sec; + c->second = metrics.read_latency_metric.mean.tv.tv_nsec; + } + break; + case MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC: + if (metrics.read_latency_metric.updated) { + c->first = metrics.read_latency_metric.sq_sum; + c->second = 
metrics.read_latency_metric.count; + } + break; + case MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC: + if (metrics.write_latency_metric.updated) { + c->first = metrics.write_latency_metric.mean.tv.tv_sec; + c->second = metrics.write_latency_metric.mean.tv.tv_nsec; + } + break; + case MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC: + if (metrics.write_latency_metric.updated) { + c->first = metrics.write_latency_metric.sq_sum; + c->second = metrics.write_latency_metric.count; + } + break; + case MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC: + if (metrics.metadata_latency_metric.updated) { + c->first = metrics.metadata_latency_metric.mean.tv.tv_sec; + c->second = metrics.metadata_latency_metric.mean.tv.tv_nsec; + } + break; + case MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC: + if (metrics.metadata_latency_metric.updated) { + c->first = metrics.metadata_latency_metric.sq_sum; + c->second = metrics.metadata_latency_metric.count; + } + break; default: ceph_abort_msg("unknown counter type"); } diff --git a/src/mds/MetricsHandler.cc b/src/mds/MetricsHandler.cc index 3fcaaaec1fab..b28b06b7ad29 100644 --- a/src/mds/MetricsHandler.cc +++ b/src/mds/MetricsHandler.cc @@ -166,7 +166,9 @@ void MetricsHandler::handle_payload(Session *session, const CapInfoPayload &payl void MetricsHandler::handle_payload(Session *session, const ReadLatencyPayload &payload) { dout(20) << ": type=" << payload.get_type() - << ", session=" << session << ", latency=" << payload.lat << dendl; + << ", session=" << session << ", latency=" << payload.lat + << ", avg=" << payload.mean << ", sq_sum=" << payload.sq_sum + << ", count=" << payload.count << dendl; auto it = client_metrics_map.find(session->info.inst); if (it == client_metrics_map.end()) { @@ -176,12 +178,17 @@ void MetricsHandler::handle_payload(Session *session, const ReadLatencyPayload & auto &metrics = it->second.second; metrics.update_type = UPDATE_TYPE_REFRESH; metrics.read_latency_metric.lat = 
payload.lat; + metrics.read_latency_metric.mean = payload.mean; + metrics.read_latency_metric.sq_sum = payload.sq_sum; + metrics.read_latency_metric.count = payload.count; metrics.read_latency_metric.updated = true; } void MetricsHandler::handle_payload(Session *session, const WriteLatencyPayload &payload) { dout(20) << ": type=" << payload.get_type() - << ", session=" << session << ", latency=" << payload.lat << dendl; + << ", session=" << session << ", latency=" << payload.lat + << ", avg=" << payload.mean << ", sq_sum=" << payload.sq_sum + << ", count=" << payload.count << dendl; auto it = client_metrics_map.find(session->info.inst); if (it == client_metrics_map.end()) { @@ -191,12 +198,17 @@ void MetricsHandler::handle_payload(Session *session, const WriteLatencyPayload auto &metrics = it->second.second; metrics.update_type = UPDATE_TYPE_REFRESH; metrics.write_latency_metric.lat = payload.lat; + metrics.write_latency_metric.mean = payload.mean; + metrics.write_latency_metric.sq_sum = payload.sq_sum; + metrics.write_latency_metric.count = payload.count; metrics.write_latency_metric.updated = true; } void MetricsHandler::handle_payload(Session *session, const MetadataLatencyPayload &payload) { dout(20) << ": type=" << payload.get_type() - << ", session=" << session << ", latency=" << payload.lat << dendl; + << ", session=" << session << ", latency=" << payload.lat + << ", avg=" << payload.mean << ", sq_sum=" << payload.sq_sum + << ", count=" << payload.count << dendl; auto it = client_metrics_map.find(session->info.inst); if (it == client_metrics_map.end()) { @@ -206,6 +218,9 @@ void MetricsHandler::handle_payload(Session *session, const MetadataLatencyPaylo auto &metrics = it->second.second; metrics.update_type = UPDATE_TYPE_REFRESH; metrics.metadata_latency_metric.lat = payload.lat; + metrics.metadata_latency_metric.mean = payload.mean; + metrics.metadata_latency_metric.sq_sum = payload.sq_sum; + metrics.metadata_latency_metric.count = payload.count; 
metrics.metadata_latency_metric.updated = true; } diff --git a/src/mgr/BaseMgrModule.cc b/src/mgr/BaseMgrModule.cc index 2e894b031d48..ca441d5e539d 100644 --- a/src/mgr/BaseMgrModule.cc +++ b/src/mgr/BaseMgrModule.cc @@ -1104,6 +1104,12 @@ ceph_add_mds_perf_query(BaseMgrModule *self, PyObject *args) {"opened_inodes", MDSPerformanceCounterType::OPENED_INODES_METRIC}, {"read_io_sizes", MDSPerformanceCounterType::READ_IO_SIZES_METRIC}, {"write_io_sizes", MDSPerformanceCounterType::WRITE_IO_SIZES_METRIC}, + {"avg_read_latency", MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC}, + {"stdev_read_latency", MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC}, + {"avg_write_latency", MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC}, + {"stdev_write_latency", MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC}, + {"avg_metadata_latency", MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC}, + {"stdev_metadata_latency", MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC}, }; PyObject *py_query = nullptr; diff --git a/src/mgr/MDSPerfMetricTypes.cc b/src/mgr/MDSPerfMetricTypes.cc index 5568cbe5d7a3..a16003774a49 100644 --- a/src/mgr/MDSPerfMetricTypes.cc +++ b/src/mgr/MDSPerfMetricTypes.cc @@ -35,6 +35,12 @@ void MDSPerformanceCounterDescriptor::pack_counter( case MDSPerformanceCounterType::OPENED_INODES_METRIC: case MDSPerformanceCounterType::READ_IO_SIZES_METRIC: case MDSPerformanceCounterType::WRITE_IO_SIZES_METRIC: + case MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC: + case MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC: + case MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC: break; default: ceph_abort_msg("unknown counter type"); @@ -57,6 +63,12 @@ void MDSPerformanceCounterDescriptor::unpack_counter( case 
MDSPerformanceCounterType::OPENED_INODES_METRIC: case MDSPerformanceCounterType::READ_IO_SIZES_METRIC: case MDSPerformanceCounterType::WRITE_IO_SIZES_METRIC: + case MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC: + case MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC: + case MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC: break; default: ceph_abort_msg("unknown counter type"); @@ -95,6 +107,24 @@ std::ostream& operator<<(std::ostream &os, const MDSPerformanceCounterDescriptor case MDSPerformanceCounterType::WRITE_IO_SIZES_METRIC: os << "write_io_sizes_metric"; break; + case MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC: + os << "avg_read_latency"; + break; + case MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC: + os << "stdev_read_latency"; + break; + case MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC: + os << "avg_write_latency"; + break; + case MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC: + os << "stdev_write_latency"; + break; + case MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC: + os << "avg_metadata_latency"; + break; + case MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC: + os << "stdev_metadata_latency"; + break; } return os; diff --git a/src/mgr/MDSPerfMetricTypes.h b/src/mgr/MDSPerfMetricTypes.h index a965e5fa7122..aa35b8cab0fc 100644 --- a/src/mgr/MDSPerfMetricTypes.h +++ b/src/mgr/MDSPerfMetricTypes.h @@ -126,6 +126,12 @@ enum class MDSPerformanceCounterType : uint8_t { OPENED_INODES_METRIC = 7, READ_IO_SIZES_METRIC = 8, WRITE_IO_SIZES_METRIC = 9, + AVG_READ_LATENCY_METRIC = 10, + STDEV_READ_LATENCY_METRIC = 11, + AVG_WRITE_LATENCY_METRIC = 12, + STDEV_WRITE_LATENCY_METRIC = 13, + AVG_METADATA_LATENCY_METRIC = 14, + STDEV_METADATA_LATENCY_METRIC = 15, }; struct MDSPerformanceCounterDescriptor { @@ -143,6 
+149,12 @@ struct MDSPerformanceCounterDescriptor { case MDSPerformanceCounterType::OPENED_INODES_METRIC: case MDSPerformanceCounterType::READ_IO_SIZES_METRIC: case MDSPerformanceCounterType::WRITE_IO_SIZES_METRIC: + case MDSPerformanceCounterType::AVG_READ_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_READ_LATENCY_METRIC: + case MDSPerformanceCounterType::AVG_WRITE_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_WRITE_LATENCY_METRIC: + case MDSPerformanceCounterType::AVG_METADATA_LATENCY_METRIC: + case MDSPerformanceCounterType::STDEV_METADATA_LATENCY_METRIC: return true; default: return false;