There have been cases where the MDS does an undesirable failover because it
misses heartbeat resets after a long recovery in up:replay. It was observed
that the MDS was processing a flood of metrics messages from all reconnecting
clients. This likely caused undesirable MetricAggregator::lock contention in
the messenger threads while fast dispatching client metrics.
Instead, use the normal dispatch where acquiring locks is okay to do.
See-also: linux.git/f7c2f4f6ce16fb58f7d024f3e1b40023c4b43ff9
Fixes: https://tracker.ceph.com/issues/65658
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
}
}
-bool MetricAggregator::ms_can_fast_dispatch2(const cref_t<Message> &m) const {
- return m->get_type() == MSG_MDS_METRICS;
-}
-
-void MetricAggregator::ms_fast_dispatch2(const ref_t<Message> &m) {
- bool handled = ms_dispatch2(m);
- ceph_assert(handled);
-}
-
bool MetricAggregator::ms_dispatch2(const ref_t<Message> &m) {
if (m->get_type() == MSG_MDS_METRICS &&
m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_MDS) {
void notify_mdsmap(const MDSMap &mdsmap);
- bool ms_can_fast_dispatch_any() const override {
- return true;
- }
- bool ms_can_fast_dispatch2(const cref_t<Message> &m) const override;
- void ms_fast_dispatch2(const ref_t<Message> &m) override;
bool ms_dispatch2(const ref_t<Message> &m) override;
void ms_handle_connect(Connection *c) override {