squid: mds/quiesce-db: collect acks while bootstrapping
author    Leonid Usov <leonid.usov@ibm.com>
          Sun, 19 May 2024 21:54:59 +0000 (00:54 +0300)
committer Leonid Usov <leonid.usov@ibm.com>
          Tue, 28 May 2024 16:06:19 +0000 (19:06 +0300)
Keeping the acks that arrive during bootstrap allows them to be
processed immediately after the bootstrap is over, avoiding
unnecessary set state transitions.

Fixes: https://tracker.ceph.com/issues/66119
Signed-off-by: Leonid Usov <leonid.usov@ibm.com>
(cherry picked from commit d6fb8755ca839ef5c1f94c3bc92a0e799c8f2d85)
Fixes: https://tracker.ceph.com/issues/66155
src/mds/QuiesceDb.h
src/mds/QuiesceDbManager.cc
src/mds/QuiesceDbManager.h
src/test/mds/TestQuiesceDb.cc
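
For context, the gist of the change is deferring peer acks while the manager is
still bootstrapping and replaying them once bootstrap completes. Below is a
minimal standalone sketch of that pattern, using hypothetical Manager/PeerAck
stand-ins rather than the actual ceph classes; the real code additionally
guards the queues with a lock and a condition variable.

// Sketch (assumed names): defer acks during bootstrap, replay them afterwards.
#include <deque>
#include <iostream>
#include <string>
#include <utility>

struct PeerAck {                     // stand-in for QuiesceDbPeerAck
  int origin;
  std::string diff_map;
  PeerAck(int origin, std::string diff_map)
      : origin(origin), diff_map(std::move(diff_map)) {}
};

struct Manager {                     // stand-in for QuiesceDbManager
  std::deque<PeerAck> pending_acks;  // deque, not queue: items can be returned to the front
  bool bootstrapped = false;

  void tick(std::deque<PeerAck>&& acks) {
    if (!bootstrapped) {
      // Not ready yet: put the acks back onto the pending queue in their
      // original order so they are processed right after bootstrap.
      while (!acks.empty()) {
        pending_acks.emplace_front(std::move(acks.back()));
        acks.pop_back();
      }
      return;
    }
    while (!acks.empty()) {
      auto& [from, diff] = acks.front();
      std::cout << "ack from peer " << from << ": " << diff << "\n";
      acks.pop_front();
    }
  }
};

int main() {
  Manager m;
  std::deque<PeerAck> batch;
  batch.emplace_back(1, "q1");
  batch.emplace_back(2, "q2");
  m.tick(std::move(batch));            // deferred: bootstrap not finished yet
  m.bootstrapped = true;
  auto deferred = std::move(m.pending_acks);
  m.pending_acks.clear();
  m.tick(std::move(deferred));         // replayed after bootstrap, in order
}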

diff --git a/src/mds/QuiesceDb.h b/src/mds/QuiesceDb.h
index 98d98451f6d81430f62320e649427b10b29850e3..244f342cc73eb79358f5d3503904dc0afa3ace59 100644
@@ -681,6 +681,17 @@ operator<<(std::basic_ostream<CharT, Traits>& os, const QuiesceMap& map)
 struct QuiesceDbPeerAck {
   QuiesceInterface::PeerId origin;
   QuiesceMap diff_map;
+
+  QuiesceDbPeerAck() = default;
+  QuiesceDbPeerAck(QuiesceDbPeerAck const&) = default;
+  QuiesceDbPeerAck(QuiesceDbPeerAck &&) = default;
+  QuiesceDbPeerAck(QuiesceInterface::PeerId origin, std::convertible_to<QuiesceMap> auto&& diff_map)
+      : origin(origin)
+      , diff_map(std::forward<QuiesceMap>(diff_map))
+  {}
+
+  QuiesceDbPeerAck& operator=(QuiesceDbPeerAck const&) = default;
+  QuiesceDbPeerAck& operator=(QuiesceDbPeerAck&&) = default;
 };
 
 template <class CharT, class Traits>
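
The constructor added above follows the constrained-forwarding pattern: a
std::convertible_to<QuiesceMap> auto&& parameter binds to both lvalues and
rvalues, so callers can emplace_back(peer, std::move(diff_map)) and move the
map straight into the stored element, while lvalue arguments still fall back
to a copy. A minimal standalone sketch of the same idea, with hypothetical
Ack/Map stand-ins rather than the ceph types:

// Sketch (assumed names): constrained forwarding constructor for emplace.
#include <concepts>
#include <deque>
#include <string>
#include <utility>

using Map = std::string;  // stand-in for QuiesceMap

struct Ack {              // stand-in for QuiesceDbPeerAck
  int origin;
  Map diff_map;

  Ack() = default;
  // Binds to Map lvalues and rvalues alike; decltype keeps the value category.
  Ack(int origin, std::convertible_to<Map> auto&& diff_map)
      : origin(origin)
      , diff_map(std::forward<decltype(diff_map)>(diff_map))
  {}
};

int main() {
  std::deque<Ack> acks;
  Map m = "large diff map";
  acks.emplace_back(1, m);             // lvalue: copied into the element
  acks.emplace_back(2, std::move(m));  // rvalue: moved, no extra copy
}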
diff --git a/src/mds/QuiesceDbManager.cc b/src/mds/QuiesceDbManager.cc
index 4ac49e332609dc73c8b4a97fd58cb790c4d2ae4b..88844f09c817e8400da1e71fac024ff7b50cd714 100644
@@ -110,14 +110,20 @@ void* QuiesceDbManager::quiesce_db_thread_main()
           // we're good to process things
           next_event_at_age = leader_upkeep(std::move(acks), std::move(requests));
         } else {
-          // not yet there. Put the requests back onto the queue and wait for updates
+          // not yet there. Put the acks and requests back onto the queue and wait for updates
           ls.lock();
           while (!requests.empty()) {
             pending_requests.emplace_front(std::move(requests.back()));
             requests.pop_back();
           }
+          while (!acks.empty()) {
+            pending_acks.emplace_front(std::move(acks.back()));
+            acks.pop_back();
+          }
           if (pending_db_updates.empty()) {
-            dout(5) << "bootstrap: waiting for peer updates with timeout " << bootstrap_delay << dendl;
+            dout(5) << "bootstrap: waiting for new peers with pending acks: " << pending_acks.size()
+              << " requests: " << pending_requests.size()
+              << ". Wait timeout: " << bootstrap_delay << dendl;
             submit_condition.wait_for(ls, bootstrap_delay);
           }
           continue;
@@ -401,7 +407,7 @@ QuiesceTimeInterval QuiesceDbManager::leader_upkeep(decltype(pending_acks)&& ack
   while (!acks.empty()) {
     auto& [from, diff_map] = acks.front();
     leader_record_ack(from, std::move(diff_map));
-    acks.pop();
+    acks.pop_front();
   }
 
   // process requests
@@ -439,7 +445,9 @@ void QuiesceDbManager::complete_requests() {
         }
       }
     }
+
     // non-zero result codes are all errors
+    dout(10) << "completing request '" << req->request << " with rc: " << -res << dendl;
     req->complete(-res);
   }
   done_requests.clear();
@@ -940,7 +948,6 @@ size_t QuiesceDbManager::check_peer_reports(const QuiesceSetId& set_id, const Qu
         continue;
       }
       reported_state = pr_state.state;
-      reporting_peers.insert({pr_state.state, {peer, info.diff_map.db_version}});
     }
 
     // but we only consider the peer up to date given the version
@@ -948,8 +955,13 @@ size_t QuiesceDbManager::check_peer_reports(const QuiesceSetId& set_id, const Qu
       up_to_date_peers++;
     }
 
-    min_reported_state = std::min(min_reported_state, reported_state);
-    max_reported_state = std::max(max_reported_state, reported_state);
+    // we keep track of reported states only if the peer actually said something
+    // even if for an older version
+    if (info.diff_map.db_version.set_version > 0) {
+      reporting_peers.insert({ reported_state, { peer, info.diff_map.db_version } });
+      min_reported_state = std::min(min_reported_state, reported_state);
+      max_reported_state = std::max(max_reported_state, reported_state);
+    }
   }
 
   if (min_reported_state == QS__MAX) {
@@ -1099,15 +1111,16 @@ QuiesceTimeInterval QuiesceDbManager::leader_upkeep_awaits()
   for (auto it = awaits.begin(); it != awaits.end();) {
     auto & [set_id, actx] = *it;
     Db::Sets::const_iterator set_it = db.sets.find(set_id);
+    QuiesceState set_state = QS__INVALID;
 
     int rc = db.get_age() >= actx.expire_at_age ? EINPROGRESS : EBUSY;
 
     if (set_it == db.sets.cend()) {
       rc = ENOENT;
     } else {
-      auto const & set = set_it->second;
-
-      switch(set.rstate.state) {
+      auto const& set = set_it->second;
+      set_state = set.rstate.state;
+      switch(set_state) {
         case QS_CANCELED:
           rc = ECANCELED;
           break;
@@ -1140,7 +1153,6 @@ QuiesceTimeInterval QuiesceDbManager::leader_upkeep_awaits()
     }
 
     if (rc != EBUSY) {
-      dout(10) << "completing an await for the set '" << set_id << "' with rc: " << rc << dendl;
       done_requests[actx.req_ctx] = rc;
       it = awaits.erase(it);
     } else {
diff --git a/src/mds/QuiesceDbManager.h b/src/mds/QuiesceDbManager.h
index 3c9bde1b526fc67f37e1799f2354bcc04a838577..9654ce802eb13d5f16c8520d59a9dab13c7c128c 100644
@@ -100,7 +100,7 @@ class QuiesceDbManager {
         return -ESTALE;
       }
 
-      pending_acks.push(std::move(ack));
+      pending_acks.emplace_back(std::move(ack));
       submit_condition.notify_all();
       return 0;
     }
@@ -138,7 +138,7 @@ class QuiesceDbManager {
 
       if (cluster_membership->leader == cluster_membership->me) {
         // local delivery
-        pending_acks.push({ cluster_membership->me, std::move(diff_map) });
+        pending_acks.emplace_back(cluster_membership->me, std::move(diff_map));
         submit_condition.notify_all();
       } else {
         // send to the leader outside of the lock
@@ -201,7 +201,7 @@ class QuiesceDbManager {
     std::optional<AgentCallback> agent_callback;
     std::optional<QuiesceClusterMembership> cluster_membership;
     std::queue<QuiesceDbPeerListing> pending_db_updates;
-    std::queue<QuiesceDbPeerAck> pending_acks;
+    std::deque<QuiesceDbPeerAck> pending_acks;
     std::deque<RequestContext*> pending_requests;
     bool db_thread_should_exit = false;
     bool db_thread_should_clear_db = true;
diff --git a/src/test/mds/TestQuiesceDb.cc b/src/test/mds/TestQuiesceDb.cc
index ce0ad767edb11c028365fd19a97910f9f00d5ade..19c8d9b6163d268d8f60989327b9a8045dddb95d 100644
@@ -984,7 +984,7 @@ TEST_F(QuiesceDbTest, InterruptedQuiesceAwait)
   EXPECT_EQ(ERR(ETIMEDOUT), await2.wait_result());
 
   // shouldn't have taken much longer than the timeout configured on the set
-  auto epsilon = sec(0.01);
+  auto epsilon = sec(0.05);
   ASSERT_LE(QuiesceClock::now() - then - epsilon, last_request->response.sets.at("set2").timeout);
 
   // let's cancel set 1 while awaiting it a few times