From d6fb8755ca839ef5c1f94c3bc92a0e799c8f2d85 Mon Sep 17 00:00:00 2001
From: Leonid Usov
Date: Mon, 20 May 2024 00:54:59 +0300
Subject: [PATCH] mds/quiesce-db: collect acks while bootstrapping

Keeping the acks that come in while bootstrapping allows us to process
them immediately after the bootstrap is over, avoiding unnecessary set
state transitions.

Fixes: https://tracker.ceph.com/issues/66119
Signed-off-by: Leonid Usov
---
 src/mds/QuiesceDb.h           | 11 +++++++++++
 src/mds/QuiesceDbManager.cc   | 32 ++++++++++++++++++++++----------
 src/mds/QuiesceDbManager.h    |  6 +++---
 src/test/mds/TestQuiesceDb.cc |  2 +-
 4 files changed, 37 insertions(+), 14 deletions(-)

diff --git a/src/mds/QuiesceDb.h b/src/mds/QuiesceDb.h
index 98d98451f6d81..244f342cc73eb 100644
--- a/src/mds/QuiesceDb.h
+++ b/src/mds/QuiesceDb.h
@@ -681,6 +681,17 @@ operator<<(std::basic_ostream<CharT, Traits>& os, const QuiesceMap& map)
 struct QuiesceDbPeerAck {
   QuiesceInterface::PeerId origin;
   QuiesceMap diff_map;
+
+  QuiesceDbPeerAck() = default;
+  QuiesceDbPeerAck(QuiesceDbPeerAck const&) = default;
+  QuiesceDbPeerAck(QuiesceDbPeerAck &&) = default;
+  QuiesceDbPeerAck(QuiesceInterface::PeerId origin, std::convertible_to<QuiesceMap> auto&& diff_map)
+      : origin(origin)
+      , diff_map(std::forward<decltype(diff_map)>(diff_map))
+  {}
+
+  QuiesceDbPeerAck& operator=(QuiesceDbPeerAck const&) = default;
+  QuiesceDbPeerAck& operator=(QuiesceDbPeerAck&&) = default;
 };
 
 template
diff --git a/src/mds/QuiesceDbManager.cc b/src/mds/QuiesceDbManager.cc
index 4ac49e332609d..88844f09c817e 100644
--- a/src/mds/QuiesceDbManager.cc
+++ b/src/mds/QuiesceDbManager.cc
@@ -110,14 +110,20 @@ void* QuiesceDbManager::quiesce_db_thread_main()
       // we're good to process things
       next_event_at_age = leader_upkeep(std::move(acks), std::move(requests));
     } else {
-      // not yet there. Put the requests back onto the queue and wait for updates
+      // not yet there. Put the acks and requests back onto the queue and wait for updates
       ls.lock();
       while (!requests.empty()) {
         pending_requests.emplace_front(std::move(requests.back()));
         requests.pop_back();
       }
+      while (!acks.empty()) {
+        pending_acks.emplace_front(std::move(acks.back()));
+        acks.pop_back();
+      }
       if (pending_db_updates.empty()) {
-        dout(5) << "bootstrap: waiting for peer updates with timeout " << bootstrap_delay << dendl;
+        dout(5) << "bootstrap: waiting for new peers with pending acks: " << pending_acks.size()
+                << " requests: " << pending_requests.size()
+                << ". Wait timeout: " << bootstrap_delay << dendl;
         submit_condition.wait_for(ls, bootstrap_delay);
       }
       continue;
@@ -401,7 +407,7 @@ QuiesceTimeInterval QuiesceDbManager::leader_upkeep(decltype(pending_acks)&& ack
   while (!acks.empty()) {
     auto& [from, diff_map] = acks.front();
     leader_record_ack(from, std::move(diff_map));
-    acks.pop();
+    acks.pop_front();
   }
 
   // process requests
@@ -439,7 +445,9 @@ void QuiesceDbManager::complete_requests() {
        }
      }
    }
+    // non-zero result codes are all errors
+    dout(10) << "completing request '" << req->request << "' with rc: " << -res << dendl;
    req->complete(-res);
  }
  done_requests.clear();
 }
@@ -940,7 +948,6 @@ size_t QuiesceDbManager::check_peer_reports(const QuiesceSetId& set_id, const Qu
        continue;
      }
      reported_state = pr_state.state;
-     reporting_peers.insert({pr_state.state, {peer, info.diff_map.db_version}});
    }
 
    // but we only consider the peer up to date given the version
@@ -948,8 +955,13 @@ size_t QuiesceDbManager::check_peer_reports(const QuiesceSetId& set_id, const Qu
      up_to_date_peers++;
    }
 
-    min_reported_state = std::min(min_reported_state, reported_state);
-    max_reported_state = std::max(max_reported_state, reported_state);
+    // we keep track of reported states only if the peer actually said something,
+    // even if for an older version
+    if (info.diff_map.db_version.set_version > 0) {
+      reporting_peers.insert({ reported_state, { peer, info.diff_map.db_version } });
+      min_reported_state = std::min(min_reported_state, reported_state);
+      max_reported_state = std::max(max_reported_state, reported_state);
+    }
   }
 
   if (min_reported_state == QS__MAX) {
@@ -1099,15 +1111,16 @@ QuiesceTimeInterval QuiesceDbManager::leader_upkeep_awaits()
   for (auto it = awaits.begin(); it != awaits.end();) {
     auto & [set_id, actx] = *it;
     Db::Sets::const_iterator set_it = db.sets.find(set_id);
+    QuiesceState set_state = QS__INVALID;
 
     int rc = db.get_age() >= actx.expire_at_age ? EINPROGRESS : EBUSY;
 
     if (set_it == db.sets.cend()) {
       rc = ENOENT;
     } else {
-      auto const & set = set_it->second;
-
-      switch(set.rstate.state) {
+      auto const& set = set_it->second;
+      set_state = set.rstate.state;
+      switch(set_state) {
       case QS_CANCELED:
         rc = ECANCELED;
         break;
@@ -1140,7 +1153,6 @@ QuiesceTimeInterval QuiesceDbManager::leader_upkeep_awaits()
     }
 
     if (rc != EBUSY) {
-      dout(10) << "completing an await for the set '" << set_id << "' with rc: " << rc << dendl;
       done_requests[actx.req_ctx] = rc;
       it = awaits.erase(it);
     } else {
diff --git a/src/mds/QuiesceDbManager.h b/src/mds/QuiesceDbManager.h
index 3c9bde1b526fc..9654ce802eb13 100644
--- a/src/mds/QuiesceDbManager.h
+++ b/src/mds/QuiesceDbManager.h
@@ -100,7 +100,7 @@ class QuiesceDbManager {
         return -ESTALE;
       }
 
-      pending_acks.push(std::move(ack));
+      pending_acks.emplace_back(std::move(ack));
       submit_condition.notify_all();
       return 0;
     }
@@ -138,7 +138,7 @@ class QuiesceDbManager {
 
       if (cluster_membership->leader == cluster_membership->me) {
         // local delivery
-        pending_acks.push({ cluster_membership->me, std::move(diff_map) });
+        pending_acks.emplace_back(cluster_membership->me, std::move(diff_map));
         submit_condition.notify_all();
       } else {
         // send to the leader outside of the lock
@@ -201,7 +201,7 @@ class QuiesceDbManager {
     std::optional agent_callback;
     std::optional cluster_membership;
     std::queue pending_db_updates;
-    std::queue<QuiesceDbPeerAck> pending_acks;
+    std::deque<QuiesceDbPeerAck> pending_acks;
     std::deque pending_requests;
     bool db_thread_should_exit = false;
     bool db_thread_should_clear_db = true;
diff --git a/src/test/mds/TestQuiesceDb.cc b/src/test/mds/TestQuiesceDb.cc
index ce0ad767edb11..19c8d9b6163d2 100644
--- a/src/test/mds/TestQuiesceDb.cc
+++ b/src/test/mds/TestQuiesceDb.cc
@@ -984,7 +984,7 @@ TEST_F(QuiesceDbTest, InterruptedQuiesceAwait)
   EXPECT_EQ(ERR(ETIMEDOUT), await2.wait_result());
 
   // shouldn't have taken much longer than the timeout configured on the set
-  auto epsilon = sec(0.01);
+  auto epsilon = sec(0.05);
   ASSERT_LE(QuiesceClock::now() - then - epsilon, last_request->response.sets.at("set2").timeout);
 
   // let's cancel set 1 while awaiting it a few times
-- 
2.39.5
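
Illustrative sketch (not part of the patch): the change above buffers acks that arrive while the manager is still bootstrapping and puts them back at the front of the pending deque, so they can be processed in order as soon as the bootstrap is over. The standalone example below shows the same pattern in isolation under assumed names: Ack, BootstrapBuffer, submit(), finish_bootstrap() and drain() are hypothetical stand-ins for QuiesceDbPeerAck and the manager's thread loop, and the real QuiesceDbManager additionally coordinates requests, peer listings, a mutex and a condition variable.

// Minimal, self-contained sketch (not Ceph code) of the "collect acks while
// bootstrapping" pattern: acks are drained into a local batch, and if the
// component is still bootstrapping they are pushed back to the FRONT of the
// pending deque so they are replayed, in order, once bootstrap completes.
#include <deque>
#include <iostream>
#include <string>
#include <utility>

struct Ack {              // hypothetical stand-in for QuiesceDbPeerAck
  int origin;
  std::string payload;
};

class BootstrapBuffer {   // hypothetical stand-in for the manager's ack handling
  std::deque<Ack> pending_acks;
  bool bootstrapped = false;

public:
  void submit(Ack ack) { pending_acks.emplace_back(std::move(ack)); }
  void finish_bootstrap() { bootstrapped = true; }

  // One iteration of the processing loop.
  void drain()
  {
    // Take the current batch so new submissions can keep arriving behind it.
    std::deque<Ack> batch;
    batch.swap(pending_acks);

    if (!bootstrapped) {
      // Not ready yet: put the batch back onto the queue, last element first,
      // so the original arrival order is preserved for the next pass.
      while (!batch.empty()) {
        pending_acks.emplace_front(std::move(batch.back()));
        batch.pop_back();
      }
      return;
    }

    // Ready: process the whole batch immediately.
    while (!batch.empty()) {
      const Ack& ack = batch.front();
      std::cout << "ack from peer " << ack.origin << ": " << ack.payload << "\n";
      batch.pop_front();
    }
  }
};

int main()
{
  BootstrapBuffer buf;
  buf.submit({1, "first"});
  buf.submit({2, "second"});

  buf.drain();            // still bootstrapping: acks are kept, not processed
  buf.finish_bootstrap();
  buf.drain();            // now both acks are processed in arrival order
  return 0;
}

The switch of pending_acks from std::queue to std::deque in the patch follows from the same requirement: putting a drained batch back needs emplace_front() and draining needs pop_front(), neither of which std::queue provides.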