From ac3569cd8bbb4b2b9a0edf298ce0b83bc18f1fea Mon Sep 17 00:00:00 2001 From: Alexey Sheplyakov Date: Mon, 21 Mar 2016 18:20:18 +0300 Subject: [PATCH] hammer: monclient: avoid key renew storm on clock skew Refreshing rotating keys too often is a symptom of a clock skew, try to detect it and don't cause extra problems: * MonClient::_check_auth_rotating: - detect and report premature keys expiration due to a time skew - rate limit refreshing the keys to avoid excessive RAM and CPU usage (both by OSD in question and monitors which have to process a lot of auth messages) * MonClient::wait_auth_rotating: wait for valid (not expired) keys * OSD::init(): bail out after 10 attempts to obtain the rotating keys Fixes: #15336 Signed-off-by: Alexey Sheplyakov (cherry picked from commit 918c12c2ab5d014d0623b1accf959b041aac5128) Conflicts: src/osd/OSD.cc no loadavg checks in Hammer, hence the conflict. Move the counter and max_attempts initialization to a suitable place. --- src/mon/MonClient.cc | 26 ++++++++++++++++++++++---- src/mon/MonClient.h | 1 + src/osd/OSD.cc | 11 +++++++++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/src/mon/MonClient.cc b/src/mon/MonClient.cc index 9c9a3e7a2b4a..9c0f01f7daef 100644 --- a/src/mon/MonClient.cc +++ b/src/mon/MonClient.cc @@ -518,6 +518,7 @@ void MonClient::handle_auth(MAuthReply *m) if (ret == 0) { if (state != MC_STATE_HAVE_SESSION) { state = MC_STATE_HAVE_SESSION; + last_rotating_renew_sent = utime_t(); while (!waiting_for_session.empty()) { _send_mon_message(waiting_for_session.front()); waiting_for_session.pop_front(); @@ -802,8 +803,11 @@ int MonClient::_check_auth_rotating() return 0; } - utime_t cutoff = ceph_clock_now(cct); + utime_t now = ceph_clock_now(cct); + utime_t cutoff = now; cutoff -= MIN(30.0, cct->_conf->auth_service_ticket_ttl / 4.0); + utime_t issued_at_lower_bound = now; + issued_at_lower_bound -= cct->_conf->auth_service_ticket_ttl; if (!rotating_secrets->need_new_secrets(cutoff)) { ldout(cct, 10) << "_check_auth_rotating have uptodate secrets (they expire after " << cutoff << ")" << dendl; rotating_secrets->dump_rotating(); @@ -811,9 +815,22 @@ int MonClient::_check_auth_rotating() } ldout(cct, 10) << "_check_auth_rotating renewing rotating keys (they expired before " << cutoff << ")" << dendl; + if (!rotating_secrets->need_new_secrets() && + rotating_secrets->need_new_secrets(issued_at_lower_bound)) { + // the key has expired before it has been issued? + lderr(cct) << __func__ << " possible clock skew, rotating keys expired way too early" + << " (before " << issued_at_lower_bound << ")" << dendl; + } + if ((now > last_rotating_renew_sent) && + double(now - last_rotating_renew_sent) < 1) { + ldout(cct, 10) << __func__ << " called too often (last: " + << last_rotating_renew_sent << "), skipping refresh" << dendl; + return 0; + } MAuth *m = new MAuth; m->protocol = auth->get_protocol(); if (auth->build_rotating_request(m->auth_payload)) { + last_rotating_renew_sent = now; _send_mon_message(m); } else { m->put(); @@ -824,7 +841,8 @@ int MonClient::_check_auth_rotating() int MonClient::wait_auth_rotating(double timeout) { Mutex::Locker l(monc_lock); - utime_t until = ceph_clock_now(cct); + utime_t now = ceph_clock_now(cct); + utime_t until = now; until += timeout; if (auth->get_protocol() == CEPH_AUTH_NONE) @@ -834,14 +852,14 @@ int MonClient::wait_auth_rotating(double timeout) return 0; while (auth_principal_needs_rotating_keys(entity_name) && - rotating_secrets->need_new_secrets()) { - utime_t now = ceph_clock_now(cct); + rotating_secrets->need_new_secrets(now)) { if (now >= until) { ldout(cct, 0) << "wait_auth_rotating timed out after " << timeout << dendl; return -ETIMEDOUT; } ldout(cct, 10) << "wait_auth_rotating waiting (until " << until << ")" << dendl; auth_cond.WaitUntil(monc_lock, until); + now = ceph_clock_now(cct); } ldout(cct, 10) << "wait_auth_rotating done" << dendl; return 0; diff --git a/src/mon/MonClient.h b/src/mon/MonClient.h index 239d91b45872..ced77e02cf7a 100644 --- a/src/mon/MonClient.h +++ b/src/mon/MonClient.h @@ -179,6 +179,7 @@ private: int authenticate_err; list waiting_for_session; + utime_t last_rotating_renew_sent; Context *session_established_context; bool had_a_connection; double reopen_interval_multiplier; diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index f210821a4cee..0d25d8f14a03 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1809,6 +1809,9 @@ int OSD::init() dout(2) << "boot" << dendl; + int rotating_auth_attempts = 0; + const int max_rotating_auth_attempts = 10; + // read superblock r = read_superblock(); if (r < 0) { @@ -1949,6 +1952,14 @@ int OSD::init() while (monc->wait_auth_rotating(30.0) < 0) { derr << "unable to obtain rotating service keys; retrying" << dendl; + ++rotating_auth_attempts; + if (rotating_auth_attempts > max_rotating_auth_attempts) { + osd_lock.Lock(); // make locker happy + if (!is_stopping()) { + r = - ETIMEDOUT; + } + goto monout; + } } osd_lock.Lock(); -- 2.47.3