From: Aliaksei Makarau Date: Tue, 31 Mar 2026 06:40:04 +0000 (+0200) Subject: This change introduces the shared memory communication (SMC-D) for the cluster network. X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=e65af75a67b445bf7014842e9d9b3cfbae1e464b;p=ceph.git This change introduces the shared memory communication (SMC-D) for the cluster network. SMC-D is faster than ethernet in IBM Z LPARs and/or VMs (zVM or KVM). Fixes: https://tracker.ceph.com/issues/66702 Signed-off-by: Aliaksei Makarau --- diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in index 83c0d869e7a9..9048b094bb5e 100644 --- a/src/common/options/global.yaml.in +++ b/src/common/options/global.yaml.in @@ -930,7 +930,7 @@ options: level: advanced desc: Messenger implementation to use for network communication fmt_desc: Transport type used by Async Messenger. Can be ``async+posix``, - ``async+dpdk`` or ``async+rdma``. Posix uses standard TCP/IP networking and is + ``async+dpdk``, ``async+rdma``, or ``async+smc``. Posix uses standard TCP/IP networking and is default. Other transports may be experimental and support may be limited. default: async+posix flags: diff --git a/src/msg/async/AsyncMessenger.cc b/src/msg/async/AsyncMessenger.cc index 04040468d0c8..6451386aa31a 100644 --- a/src/msg/async/AsyncMessenger.cc +++ b/src/msg/async/AsyncMessenger.cc @@ -375,6 +375,8 @@ AsyncMessenger::AsyncMessenger(CephContext *cct, entity_name_t name, transport_type = "rdma"; else if (type.find("dpdk") != std::string::npos) transport_type = "dpdk"; + else if (type.find("smc") != std::string::npos) + transport_type = "smc"; auto single = &cct->lookup_or_create_singleton_object( "AsyncMessenger::NetworkStack::" + transport_type, true, cct); diff --git a/src/msg/async/PosixStack.cc b/src/msg/async/PosixStack.cc index 5bf088708fc0..58baa7a82f2c 100644 --- a/src/msg/async/PosixStack.cc +++ b/src/msg/async/PosixStack.cc @@ -336,7 +336,7 @@ int PosixWorker::connect(const entity_addr_t &addr, const SocketOptions &opts, C return 0; } -PosixNetworkStack::PosixNetworkStack(CephContext *c) - : NetworkStack(c) +PosixNetworkStack::PosixNetworkStack(CephContext *c, bool try_smc) + : NetworkStack(c), try_smc(try_smc) { } diff --git a/src/msg/async/PosixStack.h b/src/msg/async/PosixStack.h index 99d1c340732d..bab3dde34023 100644 --- a/src/msg/async/PosixStack.h +++ b/src/msg/async/PosixStack.h @@ -29,8 +29,8 @@ class PosixWorker : public Worker { ceph::NetHandler net; void initialize() override; public: - PosixWorker(CephContext *c, unsigned i) - : Worker(c, i), net(c) {} + PosixWorker(CephContext *c, unsigned i, bool try_smc) + : Worker(c, i), net(c, try_smc) {} int listen(entity_addr_t &sa, unsigned addr_slot, const SocketOptions &opt, @@ -40,13 +40,14 @@ class PosixWorker : public Worker { class PosixNetworkStack : public NetworkStack { std::vector threads; + bool try_smc; virtual Worker* create_worker(CephContext *c, unsigned worker_id) override { - return new PosixWorker(c, worker_id); + return new PosixWorker(c, worker_id, try_smc); } public: - explicit PosixNetworkStack(CephContext *c); + explicit PosixNetworkStack(CephContext *c, bool try_smc); void spawn_worker(std::function &&func) override { threads.emplace_back(std::move(func)); diff --git a/src/msg/async/Stack.cc b/src/msg/async/Stack.cc index d6225b871248..efd290bb4e89 100644 --- a/src/msg/async/Stack.cc +++ b/src/msg/async/Stack.cc @@ -66,7 +66,9 @@ std::shared_ptr NetworkStack::create(CephContext *c, std::shared_ptr stack = nullptr; if (t == "posix") - stack.reset(new PosixNetworkStack(c)); + stack.reset(new PosixNetworkStack(c, false)); + else if (t == "smc") + stack.reset(new PosixNetworkStack(c, true)); #ifdef HAVE_RDMA else if (t == "rdma") stack.reset(new RDMAStack(c)); diff --git a/src/msg/async/net_handler.cc b/src/msg/async/net_handler.cc index f068e2deb724..105259413a32 100644 --- a/src/msg/async/net_handler.cc +++ b/src/msg/async/net_handler.cc @@ -32,14 +32,33 @@ #undef dout_prefix #define dout_prefix *_dout << "NetHandler " +#ifndef SMCPROTO_SMC + #define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ + #define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ +#endif + namespace ceph{ int NetHandler::create_socket(int domain, bool reuse_addr) { int s; int r = 0; + int protocol = IPPROTO_TCP; + +#if defined(AF_SMC) + if (this->try_smc) { + /* check if socket is eligible for AF_SMC */ + if (domain == AF_INET || domain == AF_INET6) { + if (domain == AF_INET) + protocol = SMCPROTO_SMC; + else /* AF_INET6 */ + protocol = SMCPROTO_SMC6; + domain = AF_SMC; + } + } +#endif - if ((s = socket_cloexec(domain, SOCK_STREAM, 0)) == -1) { + if ((s = socket_cloexec(domain, SOCK_STREAM, protocol)) == -1) { r = ceph_sock_errno(); lderr(cct) << __func__ << " couldn't create socket " << cpp_strerror(r) << dendl; return -r; diff --git a/src/msg/async/net_handler.h b/src/msg/async/net_handler.h index 755bbf489c56..3cc5debb196b 100644 --- a/src/msg/async/net_handler.h +++ b/src/msg/async/net_handler.h @@ -24,9 +24,11 @@ namespace ceph { int generic_connect(const entity_addr_t& addr, const entity_addr_t& bind_addr, bool nonblock); CephContext *cct; + bool try_smc; public: int create_socket(int domain, bool reuse_addr=false); - explicit NetHandler(CephContext *c): cct(c) {} + explicit NetHandler(CephContext *c, bool try_smc=false): cct(c), try_smc(try_smc) { + } int set_nonblock(int sd); int set_socket_options(int sd, bool nodelay, int size); int connect(const entity_addr_t &addr, const entity_addr_t& bind_addr);