From 4aa02f8472ae068d826fdce30df68d9d30f3a0cf Mon Sep 17 00:00:00 2001 From: Jian Wen Date: Thu, 8 Jan 2015 16:54:23 +0800 Subject: [PATCH] osd: add an option to prioritize heartbeat traffic By default every hardware queue of a network interface is assigned a pfifo_fast QDisc. When network congestion occurs, the data traffic may starve out the heartbeat traffic. To make sure that heartbeat packets are always transmitted (dequeued) first, set SO_PRIORITY to 6 on the sockets that are used to transmit heartbeat messages. Heartbeat messages are small, and an OSD daemon doesn't ping its peers very often, so the heartbeat traffic is not likely to starve out the data traffic. Using fq_codel instead of pfifo_fast is another good way to avoid bufferbloat, but it is not available until Linux 3.5. Signed-off-by: Jian Wen --- src/ceph_osd.cc | 6 ++++++ src/common/config_opts.h | 1 + src/msg/Messenger.h | 25 +++++++++++++++++++++++++ src/msg/simple/Accepter.cc | 9 +++++++++ src/msg/simple/Pipe.cc | 9 +++++++++ 5 files changed, 50 insertions(+) diff --git a/src/ceph_osd.cc b/src/ceph_osd.cc index 827e06e857a98..0199ce7e4b28a 100644 --- a/src/ceph_osd.cc +++ b/src/ceph_osd.cc @@ -456,6 +456,12 @@ int main(int argc, const char **argv) if (r < 0) exit(1); + if (g_conf->osd_heartbeat_use_min_delay_socket) { + ms_hbclient->set_socket_priority(SOCKET_PRIORITY_MIN_DELAY); + ms_hb_back_server->set_socket_priority(SOCKET_PRIORITY_MIN_DELAY); + ms_hb_front_server->set_socket_priority(SOCKET_PRIORITY_MIN_DELAY); + } + // hb back should bind to same ip as cluster_addr (if specified) entity_addr_t hb_back_addr = g_conf->osd_heartbeat_addr; if (hb_back_addr.is_blank_ip()) { diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 5b0ab0fc4b1cf..e98f4dfb26fee 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -538,6 +538,7 @@ OPTION(osd_heartbeat_addr, OPT_ADDR, entity_addr_t()) OPTION(osd_heartbeat_interval, OPT_INT, 6) // 
(seconds) how often we ping peers OPTION(osd_heartbeat_grace, OPT_INT, 20) // (seconds) how long before we decide a peer has failed OPTION(osd_heartbeat_min_peers, OPT_INT, 10) // minimum number of peers +OPTION(osd_heartbeat_use_min_delay_socket, OPT_BOOL, false) // set SO_PRIORITY of the heartbeat sockets to 6 (high) if true // max number of parallel snap trims/pg OPTION(osd_pg_max_concurrent_snap_trims, OPT_U64, 2) diff --git a/src/msg/Messenger.h b/src/msg/Messenger.h index 4462d6c5dfd1f..1bdcd10986d0b 100644 --- a/src/msg/Messenger.h +++ b/src/msg/Messenger.h @@ -32,6 +32,8 @@ using namespace std; #include #include +#define SOCKET_PRIORITY_MIN_DELAY 6 + class MDS; class Timer; @@ -47,6 +49,7 @@ protected: int default_send_priority; /// set to true once the Messenger has started, and set to false on shutdown bool started; + int socket_priority; public: /** @@ -126,6 +129,7 @@ public: Messenger(CephContext *cct_, entity_name_t w) : my_inst(), default_send_priority(CEPH_MSG_PRIO_DEFAULT), started(false), + socket_priority(-1), cct(cct_) { my_inst.name = w; @@ -299,6 +303,27 @@ public: assert(!started); default_send_priority = p; } + /** + * Set the priority (SO_PRIORITY) for all packets sent on this Messenger's sockets. + * + * Linux uses this value to order the networking queues: packets with a higher + * priority may be processed first depending on the selected device queueing + * discipline. + * + * @param prio The priority. Setting a priority outside the range 0 to 6 + * requires the CAP_NET_ADMIN capability. + */ + void set_socket_priority(int prio) { + socket_priority = prio; + } + /** + * Get the socket priority (SO_PRIORITY) configured for this Messenger + * + * @return the socket priority, or -1 if it has never been set + */ + int get_socket_priority() const { + return socket_priority; + } /** * Add a new Dispatcher to the front of the list. 
If you add * a Dispatcher which is already included, it will get a duplicate diff --git a/src/msg/simple/Accepter.cc b/src/msg/simple/Accepter.cc index 7d989a93691e1..fe7d87b67f7e5 100644 --- a/src/msg/simple/Accepter.cc +++ b/src/msg/simple/Accepter.cc @@ -229,6 +229,15 @@ void *Accepter::entry() socklen_t slen = sizeof(addr.ss_addr()); int sd = ::accept(listen_sd, (sockaddr*)&addr.ss_addr(), &slen); if (sd >= 0) { + int prio = msgr->get_socket_priority(); + if (prio >= 0) { + int rc = ::setsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio)); + if (rc < 0) { + ldout(msgr->cct,0) << "WARNING: failed to set SO_PRIORITY for sd " + << sd << ": " << cpp_strerror(errno) << dendl; + } + } + errors = 0; ldout(msgr->cct,10) << "accepted incoming on sd " << sd << dendl; diff --git a/src/msg/simple/Pipe.cc b/src/msg/simple/Pipe.cc index 6f4f989668b1d..5dabbb0d8969b 100644 --- a/src/msg/simple/Pipe.cc +++ b/src/msg/simple/Pipe.cc @@ -846,6 +846,15 @@ void Pipe::set_socket_options() ldout(msgr->cct,0) << "couldn't set SO_NOSIGPIPE: " << cpp_strerror(r) << dendl; } #endif + + int prio = msgr->get_socket_priority(); + if (prio >= 0) { + int r = ::setsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio)); + if (r < 0) { + ldout(msgr->cct,0) << "couldn't set SO_PRIORITY to " << prio + << ": " << cpp_strerror(errno) << dendl; + } + } } int Pipe::connect() -- 2.39.5