From: xie xingguo Date: Tue, 8 Jan 2019 10:38:45 +0000 (+0800) Subject: msg/async: do not force updating rotating keys inline X-Git-Tag: v14.1.0~411^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=794a8f9cf51cf176636d114ccfbbf68fbc304083;p=ceph.git msg/async: do not force updating rotating keys inline We found that quite a few OSDs were unable to re-join the cluster after the core switch had been updated. The symptoms are similar - all these OSDs complain about not being able to renew rotating keys, which are necessary for authorized entities to talk with each other. The root cause is that a specific OSD would keep hunting for a reachable Mon, and if none was available, the hunting process would restart every __timeout__ seconds, causing the async-connection in progress to be torn down and re-created. However, the underlying thread in charge of the hunting process could be blocked if there were hundreds of async-connections which were also waiting for new rotating keys, e.g.: ``` 2018-12-29 16:35:19.210884 7f416d6ee700 0 -- 172.18.35.6:6808/1036230 >> 172.18.35.4:6810/1037600 conn(0x7f41d9e3c000 :6808 s=STATE_CONNECTING_WAIT_CONNECT_REPLY_AUTH pgs=293 cs=25 l=0).handle_connect_reply connect got BADAUTHORIZER 2018-12-29 16:35:19.210891 7f416d6ee700 10 monclient(hunting): wait_auth_rotating waiting (until 2018-12-29 16:35:29.210889) 2018-12-29 16:35:29.210947 7f416d6ee700 0 monclient(hunting): wait_auth_rotating timed out after 10 2018-12-29 16:35:29.211101 7f416d6ee700 0 -- 172.18.35.6:6808/1036230 >> 172.18.35.4:6824/1028882 conn(0x7f418195d000 :-1 s=STATE_CONNECTING_WAIT_CONNECT_REPLY_AUTH pgs=1433 cs=8 l=0).handle_connect_reply connect got BADAUTHORIZER 2018-12-29 16:35:29.211108 7f416d6ee700 10 monclient(hunting): wait_auth_rotating waiting (until 2018-12-29 16:35:39.211108) 2018-12-29 16:35:39.211167 7f416d6ee700 0 monclient(hunting): wait_auth_rotating timed out after 10 ``` which, as a result, causes the corresponding OSD to be stuck
at hunting forever. Fix by avoiding updating rotating keys at the messenger level and making monclient do it instead. On detecting a bad or an outdated rotating key, we can simply back off and restart the connecting procedure. Signed-off-by: yanjun Signed-off-by: xie xingguo --- diff --git a/src/msg/async/ProtocolV1.cc b/src/msg/async/ProtocolV1.cc index ab4b526105f..d72d4111d07 100644 --- a/src/msg/async/ProtocolV1.cc +++ b/src/msg/async/ProtocolV1.cc @@ -79,7 +79,6 @@ ProtocolV1::ProtocolV1(AsyncConnection *connection) once_ready(false), state(NONE), global_seq(0), - got_bad_auth(false), authorizer(nullptr), wait_for_seq(false) { temp_buffer = new char[4096]; @@ -100,7 +99,6 @@ void ProtocolV1::connect() { this->state = START_CONNECT; // reset connect state variables - got_bad_auth = false; if (authorizer) { delete authorizer; authorizer = nullptr; @@ -1238,7 +1236,6 @@ void ProtocolV1::reset_recv_state() { delete authorizer; } authorizer = nullptr; - got_bad_auth = false; } // clean read and write callbacks @@ -1565,14 +1562,7 @@ CtPtr ProtocolV1::handle_connect_reply_2() { if (connect_reply.tag == CEPH_MSGR_TAG_BADAUTHORIZER) { ldout(cct, 0) << __func__ << " connect got BADAUTHORIZER" << dendl; - if (got_bad_auth) { - return _fault(); - } - got_bad_auth = true; - delete authorizer; - authorizer = messenger->ms_deliver_get_authorizer(connection->peer_type, - true); // try harder - return CONTINUE(send_connect_message); + return _fault(); } if (connect_reply.tag == CEPH_MSGR_TAG_RESETSESSION) { diff --git a/src/msg/async/ProtocolV1.h b/src/msg/async/ProtocolV1.h index 7973b07eecd..cf2370f1a94 100644 --- a/src/msg/async/ProtocolV1.h +++ b/src/msg/async/ProtocolV1.h @@ -226,7 +226,6 @@ public: // Client Protocol private: int global_seq; - bool got_bad_auth; AuthAuthorizer *authorizer; CONTINUATION_DECL(ProtocolV1, send_client_banner); @@ -301,4 +300,4 @@ public: } }; -#endif /* _MSG_ASYNC_PROTOCOL_V1_ */ \ No newline at end of file +#endif /* 
_MSG_ASYNC_PROTOCOL_V1_ */ diff --git a/src/msg/async/ProtocolV2.cc b/src/msg/async/ProtocolV2.cc index 99b424e6dbc..4521acce48f 100644 --- a/src/msg/async/ProtocolV2.cc +++ b/src/msg/async/ProtocolV2.cc @@ -74,7 +74,6 @@ ProtocolV2::ProtocolV2(AsyncConnection *connection) once_ready(false), state(NONE), global_seq(0), - got_bad_auth(false), authorizer(nullptr), wait_for_seq(false) { temp_buffer = new char[4096]; @@ -95,7 +94,6 @@ void ProtocolV2::connect() { this->state = START_CONNECT; // reset connect state variables - got_bad_auth = false; if (authorizer) { delete authorizer; authorizer = nullptr; @@ -1235,7 +1233,6 @@ void ProtocolV2::reset_recv_state() { delete authorizer; } authorizer = nullptr; - got_bad_auth = false; } // clean read and write callbacks @@ -1595,14 +1592,7 @@ CtPtr ProtocolV2::handle_connect_reply_2() { if (connect_reply.tag == CEPH_MSGR_TAG_BADAUTHORIZER) { ldout(cct, 0) << __func__ << " connect got BADAUTHORIZER" << dendl; - if (got_bad_auth) { - return _fault(); - } - got_bad_auth = true; - delete authorizer; - authorizer = messenger->ms_deliver_get_authorizer(connection->peer_type, - true); // try harder - return CONTINUE(send_connect_message); + return _fault(); } if (connect_reply.tag == CEPH_MSGR_TAG_RESETSESSION) { diff --git a/src/msg/async/ProtocolV2.h b/src/msg/async/ProtocolV2.h index 2439e22b558..bac6d52cdac 100644 --- a/src/msg/async/ProtocolV2.h +++ b/src/msg/async/ProtocolV2.h @@ -181,7 +181,6 @@ public: // Client Protocol private: int global_seq; - bool got_bad_auth; AuthAuthorizer *authorizer; CONTINUATION_DECL(ProtocolV2, send_client_banner);