From 13d675b4244552ef4e229ff7e2db828fa8adb4a5 Mon Sep 17 00:00:00 2001 From: Song Shun Date: Thu, 22 Oct 2020 19:49:57 +0800 Subject: [PATCH] msg: add min delay packets support for mons When suffering from a network bottleneck, such as switch QoS control limiting traffic to only 50Mbps, the mon cluster may get stuck electing for a long time and elect again and again, which causes many problems; for example, it is hard to find where the problem is. Even worse, it may lead to an IO hang when an OSD can't serve IO. So it's better to always keep the mon cluster stable. Signed-off-by: Song Shun --- src/common/options/mon.yaml.in | 8 ++++++++ src/msg/async/AsyncConnection.cc | 14 +++++++++++++- src/msg/async/PosixStack.cc | 3 +++ src/msg/async/Stack.h | 5 +++++ src/msg/async/dpdk/DPDKStack.h | 1 + src/msg/async/rdma/RDMAConnectedSocketImpl.cc | 5 +++++ src/msg/async/rdma/RDMAStack.h | 1 + 7 files changed, 36 insertions(+), 1 deletion(-) diff --git a/src/common/options/mon.yaml.in b/src/common/options/mon.yaml.in index a92fa7e1847b..3870ea517e94 100644 --- a/src/common/options/mon.yaml.in +++ b/src/common/options/mon.yaml.in @@ -1321,3 +1321,11 @@ options: services: - mon with_legacy: true +- name: mon_use_min_delay_socket + type: bool + level: advanced + default: false + desc: priority packets between mons + with_legacy: true + see_also: + - osd_heartbeat_use_min_delay_socket diff --git a/src/msg/async/AsyncConnection.cc b/src/msg/async/AsyncConnection.cc index 49e8e85bf0e4..8051f5907ef1 100644 --- a/src/msg/async/AsyncConnection.cc +++ b/src/msg/async/AsyncConnection.cc @@ -407,6 +407,12 @@ void AsyncConnection::process() { SocketOptions opts; opts.priority = async_msgr->get_socket_priority(); + if (async_msgr->cct->_conf->mon_use_min_delay_socket) { + if (async_msgr->get_mytype() == CEPH_ENTITY_TYPE_MON && + peer_is_mon()) { + opts.priority = SOCKET_PRIORITY_MIN_DELAY; + } + } opts.connect_bind_addr = msgr->get_myaddrs().front(); ssize_t r = worker->connect(target_addr, opts, &cs); if (r < 0) { @@ -451,7 
+457,13 @@ void AsyncConnection::process() { case STATE_ACCEPTING: { center->create_file_event(cs.fd(), EVENT_READABLE, read_handler); state = STATE_CONNECTION_ESTABLISHED; - + if (async_msgr->cct->_conf->mon_use_min_delay_socket) { + if (async_msgr->get_mytype() == CEPH_ENTITY_TYPE_MON && + peer_is_mon()) { + cs.set_priority(cs.fd(), SOCKET_PRIORITY_MIN_DELAY, + target_addr.get_family()); + } + } break; } diff --git a/src/msg/async/PosixStack.cc b/src/msg/async/PosixStack.cc index a38e82cf39cc..373bed7dec3e 100644 --- a/src/msg/async/PosixStack.cc +++ b/src/msg/async/PosixStack.cc @@ -206,6 +206,9 @@ class PosixConnectedSocketImpl final : public ConnectedSocketImpl { void close() override { compat_closesocket(_fd); } + void set_priority(int sd, int prio, int domain) override { + handler.set_priority(sd, prio, domain); + } int fd() const override { return _fd; } diff --git a/src/msg/async/Stack.h b/src/msg/async/Stack.h index 97201787c73c..e21795b3d94e 100644 --- a/src/msg/async/Stack.h +++ b/src/msg/async/Stack.h @@ -32,6 +32,7 @@ class ConnectedSocketImpl { virtual void shutdown() = 0; virtual void close() = 0; virtual int fd() const = 0; + virtual void set_priority(int sd, int prio, int domain) = 0; }; class ConnectedSocket; @@ -123,6 +124,10 @@ class ConnectedSocket { return _csi->fd(); } + void set_priority(int sd, int prio, int domain) { + _csi->set_priority(sd, prio, domain); + } + explicit operator bool() const { return _csi.get(); } diff --git a/src/msg/async/dpdk/DPDKStack.h b/src/msg/async/dpdk/DPDKStack.h index f05873572a33..3f64f566990b 100644 --- a/src/msg/async/dpdk/DPDKStack.h +++ b/src/msg/async/dpdk/DPDKStack.h @@ -47,6 +47,7 @@ class DPDKServerSocketImpl : public ServerSocketImpl { virtual int fd() const override { return _listener.fd(); } + virtual void set_priority(int sd, int prio, int domain) override {} }; // NativeConnectedSocketImpl diff --git a/src/msg/async/rdma/RDMAConnectedSocketImpl.cc b/src/msg/async/rdma/RDMAConnectedSocketImpl.cc 
index 5ab6c9b2e803..6c79dc54f31e 100644 --- a/src/msg/async/rdma/RDMAConnectedSocketImpl.cc +++ b/src/msg/async/rdma/RDMAConnectedSocketImpl.cc @@ -573,6 +573,11 @@ void RDMAConnectedSocketImpl::close() active = false; } +void RDMAConnectedSocketImpl::set_priority(int sd, int prio, int domain) { + ceph::NetHandler net(cct); + net.set_priority(sd, prio, domain); +} + void RDMAConnectedSocketImpl::fault() { ldout(cct, 1) << __func__ << " tcp fd " << tcp_fd << dendl; diff --git a/src/msg/async/rdma/RDMAStack.h b/src/msg/async/rdma/RDMAStack.h index aa64ca57e6fa..a36fd5fb7f91 100644 --- a/src/msg/async/rdma/RDMAStack.h +++ b/src/msg/async/rdma/RDMAStack.h @@ -218,6 +218,7 @@ class RDMAConnectedSocketImpl : public ConnectedSocketImpl { virtual void shutdown() override; virtual void close() override; virtual int fd() const override { return notify_fd; } + virtual void set_priority(int sd, int prio, int domain) override; void fault(); const char* get_qp_state() { return Infiniband::qp_state_string(qp->get_state()); } uint32_t get_peer_qpn () const { return peer_qpn; } -- 2.47.3