From 1b24588dc8e03adaff472a94f594f2851b4acec4 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Rados=C5=82aw=20Zarzy=C5=84ski?= Date: Mon, 30 Jan 2023 18:28:00 +0100 Subject: [PATCH] mon, msgr: rework the public addrs support MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit A monitor does support two kinds of addresses when binding msgr: 1) `bind_addrs` which are basically passed down the stack to e.g. the `bind()` syscall -- they are physical, low-level ones; 2) `public_addrs` which are exposed to peers over Ceph's on-wire protocols. This differentation is crucial for Rook and K8S use scenarios where, due to ephemeral nature of node's physical IPs, virtual _cluster IPs_ must be used instead for mon accesses. Although monitors still bind to physical ones, virtuals are exposed. Then, thankfully to NAT, the latter are converted to the former ones. Unfortunately, the current implmentation of this feature is based on the idea to 1) bind-as-usually and then 2) change-the-addrs via powerful `set_addrs()` setter. This setter is too powerful but the main issue is the `rebind()` sceario. It doesn't happen in monitors but OSDs may rebind when they get marked down by a monitor. Fixes: https://tracker.ceph.com/issues/58779 Signed-off-by: Radosław Zarzyński --- src/ceph_mon.cc | 8 +----- src/msg/Messenger.cc | 5 ++-- src/msg/Messenger.h | 19 ++++++------- src/msg/async/AsyncMessenger.cc | 47 +++++++++++++++++++++------------ src/msg/async/AsyncMessenger.h | 20 ++++++++++---- 5 files changed, 57 insertions(+), 42 deletions(-) diff --git a/src/ceph_mon.cc b/src/ceph_mon.cc index 4410e0dcb56..ebd110b4254 100644 --- a/src/ceph_mon.cc +++ b/src/ceph_mon.cc @@ -886,18 +886,12 @@ int main(int argc, const char **argv) } // bind - err = msgr->bindv(bind_addrs); + err = msgr->bindv(bind_addrs, public_addrs); if (err < 0) { derr << "unable to bind monitor to " << bind_addrs << dendl; prefork.exit(1); } - // if the public and bind addr are different set the msgr addr - // to the public one, now that the bind is complete. - if (public_addrs != bind_addrs) { - msgr->set_addrs(public_addrs); - } - if (g_conf()->daemonize) { global_init_postfork_finish(g_ceph_context); prefork.daemonize(); diff --git a/src/msg/Messenger.cc b/src/msg/Messenger.cc index edc74a9a490..5bcfb56e104 100644 --- a/src/msg/Messenger.cc +++ b/src/msg/Messenger.cc @@ -104,8 +104,9 @@ int get_default_crc_flags(const ConfigProxy& conf) return r; } -int Messenger::bindv(const entity_addrvec_t& addrs) +int Messenger::bindv(const entity_addrvec_t& bind_addrs, + std::optional public_addrs) { - return bind(addrs.legacy_addr()); + return bind(bind_addrs.legacy_addr(), std::move(public_addrs)); } diff --git a/src/msg/Messenger.h b/src/msg/Messenger.h index c832589e88e..71b7d2549c5 100644 --- a/src/msg/Messenger.h +++ b/src/msg/Messenger.h @@ -17,8 +17,9 @@ #ifndef CEPH_MESSENGER_H #define CEPH_MESSENGER_H -#include #include +#include +#include #include #include @@ -269,14 +270,7 @@ public: * @param addr The address to use as a template. */ virtual bool set_addr_unknowns(const entity_addrvec_t &addrs) = 0; - /** - * set the address for this Messenger. This is useful if the Messenger - * binds to a specific address but advertises a different address on the - * the network. - * - * @param addr The address to use. - */ - virtual void set_addrs(const entity_addrvec_t &addr) = 0; + /// Get the default send priority. int get_default_send_priority() { return default_send_priority; } /** @@ -426,12 +420,15 @@ public: * in an unspecified order. * * @param bind_addr The address to bind to. + * @patam public_addrs The addresses to announce over the network * @return 0 on success, or -1 on error, or -errno if * we can be more specific about the failure. */ - virtual int bind(const entity_addr_t& bind_addr) = 0; + virtual int bind(const entity_addr_t& bind_addr, + std::optional public_addrs=std::nullopt) = 0; - virtual int bindv(const entity_addrvec_t& addrs); + virtual int bindv(const entity_addrvec_t& bind_addrs, + std::optional public_addrs=std::nullopt); /** * This function performs a full restart of the Messenger component, diff --git a/src/msg/async/AsyncMessenger.cc b/src/msg/async/AsyncMessenger.cc index f5dd03295e9..7572827b37b 100644 --- a/src/msg/async/AsyncMessenger.cc +++ b/src/msg/async/AsyncMessenger.cc @@ -324,7 +324,7 @@ void AsyncMessenger::ready() stack->ready(); if (pending_bind) { - int err = bindv(pending_bind_addrs); + int err = bindv(pending_bind_addrs, saved_public_addrs); if (err) { lderr(cct) << __func__ << " postponed bind failed" << dendl; ceph_abort(); @@ -357,9 +357,11 @@ int AsyncMessenger::shutdown() return 0; } -int AsyncMessenger::bind(const entity_addr_t &bind_addr) +int AsyncMessenger::bind(const entity_addr_t &bind_addr, + std::optional public_addrs) { - ldout(cct,10) << __func__ << " " << bind_addr << dendl; + ldout(cct, 10) << __func__ << " " << bind_addr + << " public " << public_addrs << dendl; // old bind() can take entity_addr_t(). new bindv() can take a // 0.0.0.0-like address but needs type and family to be set. auto a = bind_addr; @@ -371,10 +373,11 @@ int AsyncMessenger::bind(const entity_addr_t &bind_addr) a.set_family(AF_INET); } } - return bindv(entity_addrvec_t(a)); + return bindv(entity_addrvec_t(a), public_addrs); } -int AsyncMessenger::bindv(const entity_addrvec_t &bind_addrs) +int AsyncMessenger::bindv(const entity_addrvec_t &bind_addrs, + std::optional public_addrs) { lock.lock(); @@ -384,7 +387,14 @@ int AsyncMessenger::bindv(const entity_addrvec_t &bind_addrs) return -1; } - ldout(cct,10) << __func__ << " " << bind_addrs << dendl; + ldout(cct, 10) << __func__ << " " << bind_addrs + << " public " << public_addrs << dendl; + if (public_addrs && bind_addrs != public_addrs) { + // for the sake of rebind() and the is-not-ready case let's + // store public_addrs. there is no point in that if public + // addrs are indifferent from bind_addrs. + saved_public_addrs = std::move(public_addrs); + } if (!stack->is_ready()) { ldout(cct, 10) << __func__ << " Network Stack is not ready for bind yet - postponed" << dendl; @@ -494,6 +504,20 @@ void AsyncMessenger::_finish_bind(const entity_addrvec_t& bind_addrs, entity_addrvec_t newaddrs = *my_addrs; for (auto& a : newaddrs.v) { a.set_nonce(nonce); + if (saved_public_addrs) { + // transplantate network layer addresses while keeping ports + // (as they can be figured out by msgr from the allowed range [1]) + // unless they are explicitly specified (NATing both IP/port?) + // + // [1]: the low-level `Processor::bind` scans for free ports in + // a range controlled by ms_bind_port_min and ms_bind_port_max + const auto& public_addr = + saved_public_addrs->addr_of_type(a.get_type()); + const auto public_port = public_addr.get_port(); + const auto bound_port = a.get_port(); + a.set_sockaddr(public_addr.get_sockaddr()); + a.set_port(public_port == 0 ? bound_port : public_port); + } } set_myaddrs(newaddrs); @@ -765,17 +789,6 @@ bool AsyncMessenger::set_addr_unknowns(const entity_addrvec_t &addrs) return ret; } -void AsyncMessenger::set_addrs(const entity_addrvec_t &addrs) -{ - std::lock_guard l{lock}; - auto t = addrs; - for (auto& a : t.v) { - a.set_nonce(nonce); - } - set_myaddrs(t); - _init_local_connection(); -} - void AsyncMessenger::shutdown_connections(bool queue_reset) { ldout(cct,1) << __func__ << " " << dendl; diff --git a/src/msg/async/AsyncMessenger.h b/src/msg/async/AsyncMessenger.h index 17b1f57f527..0a3227dec5d 100644 --- a/src/msg/async/AsyncMessenger.h +++ b/src/msg/async/AsyncMessenger.h @@ -18,6 +18,7 @@ #define CEPH_ASYNCMESSENGER_H #include +#include #include "include/types.h" #include "include/xlist.h" @@ -94,7 +95,6 @@ public: * @{ */ bool set_addr_unknowns(const entity_addrvec_t &addr) override; - void set_addrs(const entity_addrvec_t &addrs) override; int get_dispatch_queue_len() override { return dispatch_queue.get_queue_len(); @@ -114,9 +114,11 @@ public: cluster_protocol = p; } - int bind(const entity_addr_t& bind_addr) override; + int bind(const entity_addr_t& bind_addr, + std::optional public_addrs=std::nullopt) override; int rebind(const std::set& avoid_ports) override; - int bindv(const entity_addrvec_t& bind_addrs) override; + int bindv(const entity_addrvec_t& bind_addrs, + std::optional public_addrs=std::nullopt) override; int client_bind(const entity_addr_t& bind_addr) override; @@ -230,11 +232,19 @@ private: bool need_addr = true; /** - * set to bind addresses if bind was called before NetworkStack was ready to - * bind + * set to bind addresses if bind or bindv were called before NetworkStack + * was ready to bind */ entity_addrvec_t pending_bind_addrs; + /** + * set to public addresses (those announced by the msgr's protocols). + * they are stored to handle the cases when either: + * a) bind or bindv were called before NetworkStack was ready to bind, + * b) rebind is called down the road. + */ + std::optional saved_public_addrs; + /** * false; set to true if a pending bind exists */ -- 2.39.5