From 020fb2f7007bb5ee3b7f7306063aac054be7dc84 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 16 Mar 2022 17:15:57 +0800 Subject: [PATCH] mds, client: only send the metrics supported by MDSes By default, clients will not send any metrics to old ceph clusters unless those clusters have backported this commit, but the option 'client_collect_and_send_global_metrics' can still be used to enable sending manually. This fixes the crash seen when upgrading from old ceph clusters, where the MDSes crash once they receive unknown metrics. Fixes: https://tracker.ceph.com/issues/54411 Signed-off-by: Xiubo Li (cherry picked from commit e9a26c551c763f75a403ff26f6304d5c10f2ca38) Conflicts: src/client/Client.cc --- src/client/Client.cc | 71 ++++++++++++++++++++++++++-------------- src/client/MetaSession.h | 1 + src/mds/Server.cc | 13 ++++++-- src/mds/Server.h | 1 + 4 files changed, 59 insertions(+), 27 deletions(-) diff --git a/src/client/Client.cc b/src/client/Client.cc index 32f2fd1cf5c0a..34d85a4beccf4 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -2221,6 +2221,7 @@ void Client::handle_client_session(const MConstRef& m) break; } session->mds_features = std::move(m->supported_features); + session->mds_metric_flags = std::move(m->metric_spec.metric_flags); renew_caps(session); session->state = MetaSession::STATE_OPEN; @@ -6711,57 +6712,79 @@ void Client::collect_and_send_global_metrics() { std::vector message; // read latency - metric = ClientMetricMessage(ReadLatencyPayload(logger->tget(l_c_read))); - message.push_back(metric); + if (_collect_and_send_global_metrics || + session->mds_metric_flags.test(CLIENT_METRIC_TYPE_READ_LATENCY)) { + metric = ClientMetricMessage(ReadLatencyPayload(logger->tget(l_c_read))); + message.push_back(metric); + } // write latency - metric = ClientMetricMessage(WriteLatencyPayload(logger->tget(l_c_wrlat))); - message.push_back(metric); + if (_collect_and_send_global_metrics || + 
session->mds_metric_flags.test(CLIENT_METRIC_TYPE_WRITE_LATENCY)) { + metric = ClientMetricMessage(WriteLatencyPayload(logger->tget(l_c_wrlat))); + message.push_back(metric); + } // metadata latency - metric = ClientMetricMessage(MetadataLatencyPayload(logger->tget(l_c_lat))); - message.push_back(metric); + if (session->mds_metric_flags.test(CLIENT_METRIC_TYPE_METADATA_LATENCY)) { + metric = ClientMetricMessage(MetadataLatencyPayload(logger->tget(l_c_lat))); + message.push_back(metric); + } // cap hit ratio -- nr_caps is unused right now - auto [cap_hits, cap_misses] = get_cap_hit_rates(); - metric = ClientMetricMessage(CapInfoPayload(cap_hits, cap_misses, 0)); - message.push_back(metric); + if (_collect_and_send_global_metrics || + session->mds_metric_flags.test(CLIENT_METRIC_TYPE_CAP_INFO)) { + auto [cap_hits, cap_misses] = get_cap_hit_rates(); + metric = ClientMetricMessage(CapInfoPayload(cap_hits, cap_misses, 0)); + message.push_back(metric); + } // dentry lease hit ratio - auto [dlease_hits, dlease_misses, nr] = get_dlease_hit_rates(); - metric = ClientMetricMessage(DentryLeasePayload(dlease_hits, dlease_misses, nr)); - message.push_back(metric); + if (_collect_and_send_global_metrics || + session->mds_metric_flags.test(CLIENT_METRIC_TYPE_DENTRY_LEASE)) { + auto [dlease_hits, dlease_misses, nr] = get_dlease_hit_rates(); + metric = ClientMetricMessage(DentryLeasePayload(dlease_hits, dlease_misses, nr)); + message.push_back(metric); + } // opened files - { + if (session->mds_metric_flags.test(CLIENT_METRIC_TYPE_OPENED_FILES)) { auto [opened_files, total_inodes] = get_opened_files_rates(); metric = ClientMetricMessage(OpenedFilesPayload(opened_files, total_inodes)); + message.push_back(metric); } - message.push_back(metric); // pinned i_caps - { + if (_collect_and_send_global_metrics || + session->mds_metric_flags.test(CLIENT_METRIC_TYPE_PINNED_ICAPS)) { auto [pinned_icaps, total_inodes] = get_pinned_icaps_rates(); metric = 
ClientMetricMessage(PinnedIcapsPayload(pinned_icaps, total_inodes)); + message.push_back(metric); } - message.push_back(metric); // opened inodes - { + if (_collect_and_send_global_metrics || + session->mds_metric_flags.test(CLIENT_METRIC_TYPE_OPENED_INODES)) { auto [opened_inodes, total_inodes] = get_opened_inodes_rates(); metric = ClientMetricMessage(OpenedInodesPayload(opened_inodes, total_inodes)); + message.push_back(metric); } - message.push_back(metric); // read io sizes - metric = ClientMetricMessage(ReadIoSizesPayload(total_read_ops, - total_read_size)); - message.push_back(metric); + if (_collect_and_send_global_metrics || + session->mds_metric_flags.test(CLIENT_METRIC_TYPE_READ_IO_SIZES)) { + metric = ClientMetricMessage(ReadIoSizesPayload(total_read_ops, + total_read_size)); + message.push_back(metric); + } // write io sizes - metric = ClientMetricMessage(WriteIoSizesPayload(total_write_ops, - total_write_size)); - message.push_back(metric); + if (_collect_and_send_global_metrics || + session->mds_metric_flags.test(CLIENT_METRIC_TYPE_WRITE_IO_SIZES)) { + metric = ClientMetricMessage(WriteIoSizesPayload(total_write_ops, + total_write_size)); + message.push_back(metric); + } session->con->send_message2(make_message(std::move(message))); } diff --git a/src/client/MetaSession.h b/src/client/MetaSession.h index c28cd0b83b6e7..d9a1e481aca2d 100644 --- a/src/client/MetaSession.h +++ b/src/client/MetaSession.h @@ -25,6 +25,7 @@ struct MetaSession { uint64_t cap_renew_seq = 0; entity_addrvec_t addrs; feature_bitset_t mds_features; + feature_bitset_t mds_metric_flags; enum { STATE_NEW, // Unused diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 09c024099870e..4fbe058e30633 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -264,6 +264,7 @@ Server::Server(MDSRank *m, MetricsHandler *metrics_handler) : caps_throttle_retry_request_timeout = g_conf().get_val("mds_cap_acquisition_throttle_retry_request_timeout"); dir_max_entries = 
g_conf().get_val("mds_dir_max_entries"); supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED); + supported_metric_spec = feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL); } void Server::dispatch(const cref_t &m) @@ -855,8 +856,10 @@ void Server::_session_logged(Session *session, uint64_t state_seq, bool open, ve metrics_handler->add_session(session); ceph_assert(session->get_connection()); auto reply = make_message(CEPH_SESSION_OPEN); - if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) + if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) { reply->supported_features = supported_features; + reply->metric_spec = supported_metric_spec; + } mds->send_message_client(reply, session); if (mdcache->is_readonly()) { auto m = make_message(CEPH_SESSION_FORCE_RO); @@ -1009,8 +1012,10 @@ void Server::finish_force_open_sessions(const mapadd_session(session); auto reply = make_message(CEPH_SESSION_OPEN); - if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) + if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) { reply->supported_features = supported_features; + reply->metric_spec = supported_metric_spec; + } mds->send_message_client(reply, session); if (mdcache->is_readonly()) @@ -1472,8 +1477,10 @@ void Server::handle_client_reconnect(const cref_t &m) metrics_handler->add_session(session); // notify client of success with an OPEN auto reply = make_message(CEPH_SESSION_OPEN); - if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) + if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) { reply->supported_features = supported_features; + reply->metric_spec = supported_metric_spec; + } mds->send_message_client(reply, session); mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay; } diff --git a/src/mds/Server.h b/src/mds/Server.h index 15b5934a72781..66e9d2136e97e 100644 --- a/src/mds/Server.h +++ b/src/mds/Server.h @@ -487,6 +487,7 @@ private: set client_reconnect_denied; // clients whose reconnect msg have been denied . 
feature_bitset_t supported_features; + feature_bitset_t supported_metric_spec; feature_bitset_t required_client_features; bool forward_all_requests_to_auth = false; -- 2.39.5