From 2096113d9e1589c571d96e34dd9cd841308a2567 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Mon, 5 Jun 2017 13:33:14 -0700 Subject: [PATCH] osd: heartbeat with packets large enough to require working jumbo frames. We get periodic reports that users somehow misconfigure one of their switches so that it drops jumbo frames, yet the servers are still passing them along. In that case, MOSDOp messages generally don't get through because they are much larger than the 1500-byte non-jumbo limit, but the MOSDPing messages have kept going (as they are very small and dispatched independently, even when the server is willing to make jumbo frames). This means peer OSDs won't mark down the ones behind the broken switch, despite all IO hanging. Push the MOSDPing message size over the 1500-byte limit so that anybody in this scenario will see the OSDs stuck behind a bad switch get marked down. Fixes: http://tracker.ceph.com/issues/20087 Signed-off-by: Greg Farnum --- src/common/config_opts.h | 1 + src/messages/MOSDPing.h | 31 +++++++++++++++++++++++++++---- src/osd/OSD.cc | 15 ++++++++------- 3 files changed, 36 insertions(+), 11 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index b1d1dfc8f7be7..b8cb69104e568 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -797,6 +797,7 @@ OPTION(osd_heartbeat_interval, OPT_INT, 6) // (seconds) how often we ping OPTION(osd_heartbeat_grace, OPT_INT, 20) OPTION(osd_heartbeat_min_peers, OPT_INT, 10) // minimum number of peers OPTION(osd_heartbeat_use_min_delay_socket, OPT_BOOL, false) // prio the heartbeat tcp socket and set dscp as CS6 on it if true +OPTION(osd_heartbeat_min_size, OPT_INT, 2000) // the minimum size of OSD heartbeat messages to send // max number of parallel snap trims/pg OPTION(osd_pg_max_concurrent_snap_trims, OPT_U64, 2) diff --git a/src/messages/MOSDPing.h b/src/messages/MOSDPing.h index 3bb54c5ae0ffd..7a8bf2f986f01 100644 --- a/src/messages/MOSDPing.h +++ 
b/src/messages/MOSDPing.h @@ -12,6 +12,17 @@ * */ + +/** + * This is used to send pings between daemons (so far, the OSDs) for + * heartbeat purposes. We include a timestamp and distinguish between + * outgoing pings and responses to those. If you set the + * min_message in the constructor, the message will inflate itself + * to the specified size -- this is good for dealing with network + * issues with jumbo frames. See http://tracker.ceph.com/issues/20087 + * + */ + #ifndef CEPH_MOSDPING_H #define CEPH_MOSDPING_H @@ -23,7 +34,7 @@ class MOSDPing : public Message { - static const int HEAD_VERSION = 2; + static const int HEAD_VERSION = 3; static const int COMPAT_VERSION = 2; public: @@ -52,13 +63,15 @@ class MOSDPing : public Message { __u8 op; osd_peer_stat_t peer_stat; utime_t stamp; + uint32_t min_message_size; - MOSDPing(const uuid_d& f, epoch_t e, __u8 o, utime_t s) + MOSDPing(const uuid_d& f, epoch_t e, __u8 o, utime_t s, uint32_t min_message) : Message(MSG_OSD_PING, HEAD_VERSION, COMPAT_VERSION), - fsid(f), map_epoch(e), peer_as_of_epoch(0), op(o), stamp(s) + fsid(f), map_epoch(e), peer_as_of_epoch(0), op(o), stamp(s), + min_message_size(min_message) { } MOSDPing() - : Message(MSG_OSD_PING, HEAD_VERSION, COMPAT_VERSION) + : Message(MSG_OSD_PING, HEAD_VERSION, COMPAT_VERSION), min_message_size(0) {} private: ~MOSDPing() override {} @@ -72,6 +85,12 @@ public: ::decode(op, p); ::decode(peer_stat, p); ::decode(stamp, p); + if (header.version >= 3) { + bufferlist size_bl; + int payload_mid_length = p.get_off(); + ::decode(size_bl, p); + min_message_size = size_bl.length() + payload_mid_length; + } } void encode_payload(uint64_t features) override { ::encode(fsid, payload); @@ -80,6 +99,10 @@ public: ::encode(op, payload); ::encode(peer_stat, payload); ::encode(stamp, payload); + /* both operands are unsigned: compare before subtracting so the padding length cannot wrap around when the payload already exceeds min_message_size */ bufferptr size_bp(min_message_size > payload.length() ? min_message_size - payload.length() : 0); + bufferlist size_bl; + size_bl.push_back(size_bp); + ::encode(size_bl, payload); } const char *get_type_name() const
override { return "osd_ping"; } diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 83b26198c902c..74dbfcd1050e8 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -4669,8 +4669,8 @@ void OSD::handle_osd_ping(MOSDPing *m) Message *r = new MOSDPing(monc->get_fsid(), curmap->get_epoch(), - MOSDPing::PING_REPLY, - m->stamp); + MOSDPing::PING_REPLY, m->stamp, + cct->_conf->osd_heartbeat_min_size); m->get_connection()->send_message(r); if (curmap->is_up(from)) { @@ -4687,7 +4687,8 @@ void OSD::handle_osd_ping(MOSDPing *m) Message *r = new MOSDPing(monc->get_fsid(), curmap->get_epoch(), MOSDPing::YOU_DIED, - m->stamp); + m->stamp, + cct->_conf->osd_heartbeat_min_size); m->get_connection()->send_message(r); } } @@ -4865,14 +4866,14 @@ void OSD::heartbeat() dout(30) << "heartbeat sending ping to osd." << peer << dendl; i->second.con_back->send_message(new MOSDPing(monc->get_fsid(), service.get_osdmap()->get_epoch(), - MOSDPing::PING, - now)); + MOSDPing::PING, now, + cct->_conf->osd_heartbeat_min_size)); if (i->second.con_front) i->second.con_front->send_message(new MOSDPing(monc->get_fsid(), service.get_osdmap()->get_epoch(), - MOSDPing::PING, - now)); + MOSDPing::PING, now, + cct->_conf->osd_heartbeat_min_size)); } logger->set(l_osd_hb_to, heartbeat_peers.size()); -- 2.39.5