From 58850c8098dbe2d189c9cc9696604e1778efc9dc Mon Sep 17 00:00:00 2001 From: Radoslaw Zarzynski Date: Tue, 2 Aug 2022 13:42:01 +0000 Subject: [PATCH] crimson/net: don't throw on failure when renewing tickets and keyrings Generally speaking, the classical `MonClient` doesn't verify the `MAuthReply` when renewing tickets and rotating secrets. The only difference is breaking the renewal loop: ```cpp void MonClient::_finish_auth(int auth_err) { // ... if (!auth_err && active_con) { ceph_assert(auth); _check_auth_tickets(); } // ... } ``` However, `crimson::mon::Client` throws `error::negotiation_failure`, which is eventually handled in `Gated::dispatch()`. As the handler there doesn't expect this particular error, it aborts the entire process. This is why one of the jobs at Sepia failed: ``` rzarzynski@teuthology:/a/rzarzynski-2022-08-01_11:01:29-crimson-rados-main-distro-default-smithi/6954468$ less ./remote/smithi174/log/ceph-osd.1.log.gz ... DEBUG 2022-08-01 11:26:13,772 [shard 0] monc - renew_rotating_keyring secrets are up-to-date (they expire after 1659353143.772506) INFO 2022-08-01 11:26:13,772 [shard 0] monc - renew_tickets: retrieving new tickets INFO 2022-08-01 11:26:13,772 [shard 0] monc - sending auth(proto 2 132 bytes epoch 0) v1 INFO 2022-08-01 11:26:13,772 [shard 0] monc - waiting INFO 2022-08-01 11:26:13,773 [shard 0] monc - handle_auth_reply mon v2:172.21.15.174:6802/101543 => v2:172.21.15.174:3300/0 returns auth_reply(proto 2 0 (0) Success) v1: 0 INFO 2022-08-01 11:26:13,773 [shard 0] monc - handle_auth_reply INFO 2022-08-01 11:26:13,773 [shard 0] monc - renew_tickets: retrieving new tickets INFO 2022-08-01 11:26:13,773 [shard 0] monc - sending auth(proto 2 132 bytes epoch 0) v1 DEBUG 2022-08-01 11:26:13,773 [shard 0] monc - renew_rotating_keyring secrets are up-to-date (they expire after 1659353143.7732666) INFO 2022-08-01 11:26:13,773 [shard 0] monc - waiting INFO 2022-08-01 11:26:13,773 [shard 0] monc - do_auth_single: mon 
v2:172.21.15.174:6802/101543 => v2:172.21.15.174:3300/0 returns auth_reply(proto 2 0 (0) Success) v1: 0 INFO 2022-08-01 11:26:13,773 [shard 0] monc - handle_auth_reply mon v2:172.21.15.174:6802/101543 => v2:172.21.15.174:3300/0 returns auth_reply(proto 2 0 (0) Success) v1: 0 INFO 2022-08-01 11:26:13,773 [shard 0] monc - handle_auth_reply INFO 2022-08-01 11:26:13,773 [shard 0] monc - do_auth_single: mon v2:172.21.15.174:6802/101543 => v2:172.21.15.174:3300/0 returns auth_reply(proto 2 0 (0) Success) v1: 0 WARN 2022-08-01 11:26:13,773 [shard 0] auth - cephx client: could not verify service_ticket reply ERROR 2022-08-01 11:26:13,773 [shard 0] monc - do_auth_single: got error -13 on mon v2:172.21.15.174:3300/0 DEBUG 2022-08-01 11:26:13,773 [shard 0] monc - renew_tickets: don't need new tickets DEBUG 2022-08-01 11:26:13,774 [shard 0] monc - renew_rotating_keyring secrets are up-to-date (they expire after 1659353143.773988) ERROR 2022-08-01 11:26:13,774 [shard 0] osd - mon.osd.1 dispatch() ms_dispatch caught exception: std::system_error (error crimson::net:3, negotiation failure) ``` ``` 2022-08-01T11:26:13.798 INFO:tasks.ceph.osd.1.smithi174.stderr:ceph-osd: /home/jenkins-build/build/workspace/ceph-dev-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.0.0-13947-g4f4eedf5/rpm/el8/BUILD/ceph-17.0.0-13947-g4f4eedf5/src/crimson/common/gated.h:36: crimson::common::Gated::dispatch&, crimson::mon::Client>(const char*, crimson::mon::Client&, crimson::mon::Client::ms_dispatch(crimson::net::ConnectionRef, MessageRef)::&)::: Assertion `*eptr.__cxa_exception_type() == typeid(seastar::gate_closed_exception)' failed. 2022-08-01T11:26:13.811 INFO:tasks.ceph.osd.1.smithi174.stderr:Aborting on shard 0. ``` ```cpp class Gated { // ... 
template <typename Func, typename T> inline seastar::future<> dispatch(const char* what, T& who, Func&& func) { return seastar::with_gate(pending_dispatch, std::forward<Func>(func) ).handle_exception([what, &who] (std::exception_ptr eptr) { if (*eptr.__cxa_exception_type() == typeid(system_shutdown_exception)) { gated_logger().debug( "{}, {} skipped, system shutdown", who, what); return; } gated_logger().error( "{} dispatch() {} caught exception: {}", who, what, eptr); assert(*eptr.__cxa_exception_type() == typeid(seastar::gate_closed_exception)); }); } ``` See: http://pulpito.front.sepia.ceph.com/rzarzynski-2022-08-01_11:01:29-crimson-rados-main-distro-default-smithi/6954468/ Signed-off-by: Radoslaw Zarzynski --- src/crimson/mon/MonClient.cc | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/crimson/mon/MonClient.cc b/src/crimson/mon/MonClient.cc index f2e7f83eb3d64..5eca1e0736cab 100644 --- a/src/crimson/mon/MonClient.cc +++ b/src/crimson/mon/MonClient.cc @@ -140,9 +140,7 @@ seastar::future<> Connection::renew_tickets() logger().info("{}: retrieving new tickets", __func__); return do_auth(request_t::general).then([](const auth_result_t r) { if (r == auth_result_t::failure) { - throw std::system_error( - make_error_code( - crimson::net::error::negotiation_failure)); + logger().info("renew_tickets: ignoring failed auth reply"); } }); } else { @@ -174,8 +172,7 @@ seastar::future<> Connection::renew_rotating_keyring() last_rotating_renew_sent = now; return do_auth(request_t::rotating).then([](const auth_result_t r) { if (r == auth_result_t::failure) { - throw std::system_error(make_error_code( - crimson::net::error::negotiation_failure)); + logger().info("renew_rotating_keyring: ignoring failed auth reply"); } }); } -- 2.39.5