From: Leonid Usov Date: Mon, 13 May 2024 21:10:04 +0000 (+0300) Subject: squid: mds/quiesce-db: track db epoch separately from the membership epoch X-Git-Tag: v19.1.1~299^2~22 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=fd766f27f39819ea6ce2e780f2b5b68dd84cf40b;p=ceph.git squid: mds/quiesce-db: track db epoch separately from the membership epoch Tracking the db epoch separately will make sure that replicas only follow leader's epoch choice, even if they are already on the new membership epoch. This eliminates races due to the random order of mdsmap updates. Fixes: https://tracker.ceph.com/issues/65977 Signed-off-by: Leonid Usov (cherry picked from commit 379ef7196b61142dc7753992f897ad91b37f048f) Fixes: https://tracker.ceph.com/issues/66070 --- diff --git a/src/mds/QuiesceDbManager.cc b/src/mds/QuiesceDbManager.cc index 64833934ba1b7..b068bacaf6e11 100644 --- a/src/mds/QuiesceDbManager.cc +++ b/src/mds/QuiesceDbManager.cc @@ -58,7 +58,7 @@ bool QuiesceDbManager::db_thread_has_work() const || pending_acks.size() > 0 || pending_requests.size() > 0 || pending_db_updates.size() > 0 - || (agent_callback.has_value() && agent_callback->if_newer < db_version()) + || (agent_callback.has_value() && agent_callback->if_newer < db.version()) || (cluster_membership.has_value() && cluster_membership->epoch != membership.epoch); } @@ -105,16 +105,21 @@ void* QuiesceDbManager::quiesce_db_thread_main() ls.unlock(); if (membership.is_leader()) { - if (leader_bootstrap(std::move(db_updates), next_event_at_age)) { + const QuiesceTimeInterval bootstrap_delay = leader_bootstrap(std::move(db_updates)); + if (bootstrap_delay == QuiesceTimeInterval::zero()){ // we're good to process things next_event_at_age = leader_upkeep(std::move(acks), std::move(requests)); } else { - // not yet there. Put the requests back onto the queue + // not yet there. Put the requests back onto the queue and wait for updates ls.lock(); while (!requests.empty()) { pending_requests.emplace_front(std::move(requests.back())); requests.pop_back(); } + if (pending_db_updates.empty()) { + dout(5) << "bootstrap: waiting for peer updates with timeout " << bootstrap_delay << dendl; + submit_condition.wait_for(ls, bootstrap_delay); + } continue; } } else { @@ -129,16 +134,16 @@ void* QuiesceDbManager::quiesce_db_thread_main() complete_requests(); // by default, only send ack if the version has changed - bool send_ack = last_acked != db_version(); - QuiesceMap quiesce_map(db_version()); + bool send_ack = last_acked != db.version(); + QuiesceMap quiesce_map(db.version()); { std::lock_guard lc(agent_mutex); if (agent_callback) { - if (agent_callback->if_newer < db_version()) { - dout(20) << "notifying agent with db version " << db_version() << dendl; + if (agent_callback->if_newer < db.version()) { + dout(20) << "notifying agent with db version " << db.version() << dendl; calculate_quiesce_map(quiesce_map); send_ack = agent_callback->notify(quiesce_map); - agent_callback->if_newer = db_version(); + agent_callback->if_newer = db.version(); } else { send_ack = false; } @@ -251,6 +256,8 @@ QuiesceDbManager::membership_upkeep() for (auto peer : cluster_membership->members) { peers.try_emplace(peer); } + // update the db epoch + db.epoch = cluster_membership->epoch; } else { peers.clear(); // abort awaits with EINPROGRESS @@ -291,14 +298,8 @@ QuiesceTimeInterval QuiesceDbManager::replica_upkeep(decltype(pending_db_updates QuiesceDbListing &update = db_updates.back().db; - if (update.db_version.epoch != membership.epoch) { - dout(10) << "ignoring db update from another epoch: " << update.db_version << " != " << db_version() << dendl; - return QuiesceTimeInterval::max(); - } - if (update.db_version.set_version == 0) { - // this is a call from a leader - // to upload our local db version + // this is a call from the leader to upload our local db version update.sets = db.sets; update.db_version.set_version = db.set_version; update.db_age = db.get_age(); @@ -311,6 +312,7 @@ QuiesceTimeInterval QuiesceDbManager::replica_upkeep(decltype(pending_db_updates dout(10) << "significant db_time_zero change to " << time_zero << " from " << db.time_zero << dendl; } db.time_zero = time_zero; + db.epoch = update.db_version.epoch; if (db.set_version > update.db_version.set_version) { dout(3) << "got an older version of DB from the leader: " << update.db_version.set_version << " < " << db.set_version << dendl; @@ -327,8 +329,11 @@ QuiesceTimeInterval QuiesceDbManager::replica_upkeep(decltype(pending_db_updates return QuiesceTimeInterval::max(); } -bool QuiesceDbManager::leader_bootstrap(decltype(pending_db_updates)&& db_updates, QuiesceTimeInterval &next_event_at_age) +QuiesceTimeInterval QuiesceDbManager::leader_bootstrap(decltype(pending_db_updates)&& db_updates) { + const QuiesceTimeInterval PEER_DISCOVERY_INTERVAL = std::chrono::seconds(1); + QuiesceTimeInterval bootstrap_delay = PEER_DISCOVERY_INTERVAL; + // check that we've heard from all peers in this epoch std::unordered_set unknown_peers; for (auto&& [peer, info] : peers) { @@ -348,7 +353,7 @@ bool QuiesceDbManager::leader_bootstrap(decltype(pending_db_updates)&& db_update if (db.set_version < update.db_version.set_version) { dout(3) << "preferring version from peer " << from << " (" << update.db_version - << ") over mine (" << db_version() << ")" + << ") over mine (" << db.version() << ")" << " and incrementing it to collect acks" << dendl; db.time_zero = QuiesceClock::now() - update.db_age; db.set_version = update.db_version.set_version + 1; @@ -361,24 +366,33 @@ bool QuiesceDbManager::leader_bootstrap(decltype(pending_db_updates)&& db_update db_updates.pop(); } + QuiesceTimePoint const now = QuiesceClock::now(); for (auto & peer: unknown_peers) { PeerInfo & info = peers[peer]; - QuiesceTimePoint next_discovery = info.last_seen + std::chrono::seconds(1); - if (info.last_seen == QuiesceClock::zero() || next_discovery < QuiesceClock::now()) { + QuiesceTimePoint next_discovery = info.last_activity + PEER_DISCOVERY_INTERVAL; + if (next_discovery < now) { // send a discovery request to unknown peers dout(5) << " sending a discovery request to " << peer << dendl; membership.send_listing_to(peer, QuiesceDbListing(membership.epoch)); - info.last_seen = QuiesceClock::now(); - next_discovery = info.last_seen + std::chrono::seconds(1); + info.last_activity = now; + next_discovery = info.last_activity + PEER_DISCOVERY_INTERVAL; } - QuiesceTimeInterval next_discovery_at_age = next_discovery - db.time_zero; + // next_discovery is >= now + if (bootstrap_delay > next_discovery - now) { + bootstrap_delay = (next_discovery - now); + } + } - next_event_at_age = std::min(next_event_at_age, next_discovery_at_age); + bool all_peers_known = unknown_peers.empty(); + + if (!all_peers_known) { + dout(10) << "unknown peers: " << unknown_peers << dendl; } - // true if all peers are known - return unknown_peers.empty(); + // add some margin to hit the discovery interval for the earliest discovery. + const QuiesceTimeInterval a_little_more = std::chrono::milliseconds(100); + return all_peers_known ? QuiesceTimeInterval::zero() : (bootstrap_delay + a_little_more); } QuiesceTimeInterval QuiesceDbManager::leader_upkeep(decltype(pending_acks)&& acks, decltype(pending_requests)&& requests) @@ -412,7 +426,7 @@ void QuiesceDbManager::complete_requests() { r.clear(); if (membership.leader == membership.me) { r.db_age = db.get_age(); - r.db_version = db_version(); + r.db_version = db.version(); if (req->request.set_id) { Db::Sets::const_iterator it = db.sets.find(*req->request.set_id); @@ -443,9 +457,9 @@ void QuiesceDbManager::leader_record_ack(QuiesceInterface::PeerId from, QuiesceM auto & info = it->second; - if (diff_map.db_version > db_version()) { - dout(15) << "future version ack by peer " << from << " (" << diff_map.db_version << " > " << db_version() << ")" << dendl; - if (diff_map.db_version.epoch > db_version().epoch && diff_map.db_version.set_version <= db_version().set_version) { + if (diff_map.db_version > db.version()) { + dout(15) << "future version ack by peer " << from << " (" << diff_map.db_version << " > " << db.version() << ")" << dendl; + if (diff_map.db_version.epoch > db.version().epoch && diff_map.db_version.set_version <= db.version().set_version) { dout(15) << "my epoch is behind, ignoring this until my membership is updated" << dendl; } else { dout(5) << "will send the peer a full DB" << dendl; @@ -454,7 +468,7 @@ void QuiesceDbManager::leader_record_ack(QuiesceInterface::PeerId from, QuiesceM } else { dout(20) << "ack " << diff_map << " from peer " << from << dendl; info.diff_map = std::move(diff_map); - info.last_seen = QuiesceClock::now(); + info.last_activity = QuiesceClock::now(); } } @@ -844,7 +858,7 @@ QuiesceTimeInterval QuiesceDbManager::leader_upkeep_db() for (auto &[peer, sets]: peer_updates) { QuiesceDbListing update; update.db_age = db.get_age(); - update.db_version = db_version(); + update.db_version = db.version(); std::ranges::copy(sets, std::inserter(update.sets, update.sets.end())); dout(20) << "updating peer " << peer << " with " << sets.size() @@ -883,11 +897,7 @@ size_t QuiesceDbManager::check_peer_reports(const QuiesceSetId& set_id, const Qu std::multimap> reporting_peers; for (auto& [peer, info] : peers) { - // we consider the last bit of information we had from a given peer - // however, we want to skip peers which haven't been bootstrapped yet - if (info.diff_map.db_version.set_version == 0) { - continue; - } + // we consider the last bit of information we had from the peer auto dit = info.diff_map.roots.find(root); QuiesceState reported_state = set.get_requested_member_state(); @@ -1135,7 +1145,7 @@ static QuiesceTimeInterval get_root_ttl(const QuiesceSet & set, const QuiesceSet void QuiesceDbManager::calculate_quiesce_map(QuiesceMap &map) { map.roots.clear(); - map.db_version = db_version(); + map.db_version = db.version(); auto db_age = db.get_age(); for(auto & [set_id, set]: db.sets) { diff --git a/src/mds/QuiesceDbManager.h b/src/mds/QuiesceDbManager.h index 2fdc605ac8462..b84b18fc52916 100644 --- a/src/mds/QuiesceDbManager.h +++ b/src/mds/QuiesceDbManager.h @@ -227,6 +227,7 @@ class QuiesceDbManager { // the database. struct Db { QuiesceTimePoint time_zero; + epoch_t epoch; QuiesceSetVersion set_version = 0; using Sets = std::unordered_map; Sets sets; @@ -235,22 +236,23 @@ class QuiesceDbManager { return QuiesceClock::now() - time_zero; } void clear() { - set_version = 0; + set_version = 0; + epoch = 0; sets.clear(); time_zero = QuiesceClock::now(); } - } db; - QuiesceDbVersion db_version() const { return {membership.epoch, db.set_version}; } + QuiesceDbVersion version() const { return {epoch, set_version}; } + } db; QuiesceClusterMembership membership; struct PeerInfo { QuiesceMap diff_map; - QuiesceTimePoint last_seen; - PeerInfo(QuiesceMap&& diff_map, QuiesceTimePoint last_seen) + QuiesceTimePoint last_activity; + PeerInfo(QuiesceMap&& diff_map, QuiesceTimePoint last_activity) : diff_map(diff_map) - , last_seen(last_seen) + , last_activity(last_activity) { } PeerInfo() { } @@ -278,7 +280,8 @@ class QuiesceDbManager { std::pair membership_upkeep(); QuiesceTimeInterval replica_upkeep(decltype(pending_db_updates)&& db_updates); - bool leader_bootstrap(decltype(pending_db_updates)&& db_updates, QuiesceTimeInterval &next_event_at_age); + // returns zero interval if bootstrapped, otherwise the time to sleep while we wait for peer responses + QuiesceTimeInterval leader_bootstrap(decltype(pending_db_updates)&& db_updates); QuiesceTimeInterval leader_upkeep(decltype(pending_acks)&& acks, decltype(pending_requests)&& requests);