git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: heartbeat with packets large enough to require working jumbo frames. 15535/head
author: Greg Farnum <gfarnum@redhat.com>
Mon, 5 Jun 2017 20:33:14 +0000 (13:33 -0700)
committer: Greg Farnum <gfarnum@redhat.com>
Tue, 13 Jun 2017 20:43:29 +0000 (13:43 -0700)
We get periodic reports that users somehow misconfigure one of their switches
so that it drops jumbo frames, yet the servers are still passing them along. In
that case, MOSDOp messages generally don't get through because they are much
larger than the 1500-byte non-jumbo limit, but the MOSDPing messages have kept
going (as they are very small and dispatched independently, even when the
server is willing to make jumbo frames). This means peer OSDs won't mark down
the ones behind the broken switch, despite all IO hanging.
Push the MOSDPing message size over the 1500-byte limit so that anybody in
this scenario will see the OSDs stuck behind a bad switch get marked down.

Fixes: http://tracker.ceph.com/issues/20087
Signed-off-by: Greg Farnum <gfarnum@redhat.com>
src/common/config_opts.h
src/messages/MOSDPing.h
src/osd/OSD.cc

index b1d1dfc8f7be74e4d3c989cf3ed3bb7243595ce9..b8cb69104e568d976473edb9d66c56fa9b7ef245 100644 (file)
@@ -797,6 +797,7 @@ OPTION(osd_heartbeat_interval, OPT_INT, 6)       // (seconds) how often we ping
 OPTION(osd_heartbeat_grace, OPT_INT, 20)
 OPTION(osd_heartbeat_min_peers, OPT_INT, 10)     // minimum number of peers
 OPTION(osd_heartbeat_use_min_delay_socket, OPT_BOOL, false) // prio the heartbeat tcp socket and set dscp as CS6 on it if true
+OPTION(osd_heartbeat_min_size, OPT_INT, 2000) // the minimum size of OSD heartbeat messages to send
 
 // max number of parallel snap trims/pg
 OPTION(osd_pg_max_concurrent_snap_trims, OPT_U64, 2)
index 3bb54c5ae0ffd95143f066e26465969c497ad250..7a8bf2f986f01c0f7b1d8949e63cf4a0b903d13d 100644 (file)
  * 
  */
 
+
+/**
+ * This is used to send pings between daemons (so far, the OSDs) for
+ * heartbeat purposes. We include a timestamp and distinguish between
+ * outgoing pings and responses to those. If you set the
+ * min_message in the constructor, the message will inflate itself
+ * to the specified size -- this is good for dealing with network
+ * issues with jumbo frames. See http://tracker.ceph.com/issues/20087
+ *
+ */
+
 #ifndef CEPH_MOSDPING_H
 #define CEPH_MOSDPING_H
 
@@ -23,7 +34,7 @@
 
 class MOSDPing : public Message {
 
-  static const int HEAD_VERSION = 2;
+  static const int HEAD_VERSION = 3;
   static const int COMPAT_VERSION = 2;
 
  public:
@@ -52,13 +63,15 @@ class MOSDPing : public Message {
   __u8 op;
   osd_peer_stat_t peer_stat;
   utime_t stamp;
+  uint32_t min_message_size;
 
-  MOSDPing(const uuid_d& f, epoch_t e, __u8 o, utime_t s)
+  MOSDPing(const uuid_d& f, epoch_t e, __u8 o, utime_t s, uint32_t min_message)
     : Message(MSG_OSD_PING, HEAD_VERSION, COMPAT_VERSION),
-      fsid(f), map_epoch(e), peer_as_of_epoch(0), op(o), stamp(s)
+      fsid(f), map_epoch(e), peer_as_of_epoch(0), op(o), stamp(s),
+      min_message_size(min_message)
   { }
   MOSDPing()
-    : Message(MSG_OSD_PING, HEAD_VERSION, COMPAT_VERSION)
+    : Message(MSG_OSD_PING, HEAD_VERSION, COMPAT_VERSION), min_message_size(0)
   {}
 private:
   ~MOSDPing() override {}
@@ -72,6 +85,12 @@ public:
     ::decode(op, p);
     ::decode(peer_stat, p);
     ::decode(stamp, p);
+    if (header.version >= 3) {
+      bufferlist size_bl;
+      int payload_mid_length = p.get_off();
+      ::decode(size_bl, p);
+      min_message_size = size_bl.length() + payload_mid_length;
+    }
   }
   void encode_payload(uint64_t features) override {
     ::encode(fsid, payload);
@@ -80,6 +99,16 @@ public:
     ::encode(op, payload);
     ::encode(peer_stat, payload);
     ::encode(stamp, payload);
+    // Pad the payload up to min_message_size. Both operands are unsigned, so
+    // compare before subtracting: MAX(a - b, 0) cannot clamp a difference
+    // that has already wrapped around to a huge value.
+    uint32_t pad_len = 0;
+    if (min_message_size > payload.length())
+      pad_len = min_message_size - payload.length();
+    bufferptr size_bp(pad_len);
+    bufferlist size_bl;
+    size_bl.push_back(size_bp);
+    ::encode(size_bl, payload);
   }
 
   const char *get_type_name() const override { return "osd_ping"; }
index 83b26198c902c81a548b58bcab487f9b384f6007..74dbfcd1050e83e6980520bcf294693e8654999a 100644 (file)
@@ -4669,8 +4669,8 @@ void OSD::handle_osd_ping(MOSDPing *m)
 
       Message *r = new MOSDPing(monc->get_fsid(),
                                curmap->get_epoch(),
-                               MOSDPing::PING_REPLY,
-                               m->stamp);
+                               MOSDPing::PING_REPLY, m->stamp,
+                               cct->_conf->osd_heartbeat_min_size);
       m->get_connection()->send_message(r);
 
       if (curmap->is_up(from)) {
@@ -4687,7 +4687,8 @@ void OSD::handle_osd_ping(MOSDPing *m)
        Message *r = new MOSDPing(monc->get_fsid(),
                                  curmap->get_epoch(),
                                  MOSDPing::YOU_DIED,
-                                 m->stamp);
+                                 m->stamp,
+                                 cct->_conf->osd_heartbeat_min_size);
        m->get_connection()->send_message(r);
       }
     }
@@ -4865,14 +4866,14 @@ void OSD::heartbeat()
     dout(30) << "heartbeat sending ping to osd." << peer << dendl;
     i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
                                          service.get_osdmap()->get_epoch(),
-                                         MOSDPing::PING,
-                                         now));
+                                         MOSDPing::PING, now,
+                                         cct->_conf->osd_heartbeat_min_size));
 
     if (i->second.con_front)
       i->second.con_front->send_message(new MOSDPing(monc->get_fsid(),
                                             service.get_osdmap()->get_epoch(),
-                                                    MOSDPing::PING,
-                                                    now));
+                                            MOSDPing::PING, now,
+                                         cct->_conf->osd_heartbeat_min_size));
   }
 
   logger->set(l_osd_hb_to, heartbeat_peers.size());