git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mds/quiesce-db: always clear the db if a membership is lost 56695/head
author Leonid Usov <leonid.usov@ibm.com>
Sun, 17 Mar 2024 16:22:34 +0000 (12:22 -0400)
committer Leonid Usov <leonid.usov@ibm.com>
Thu, 4 Apr 2024 07:57:13 +0000 (10:57 +0300)
Fixes: https://tracker.ceph.com/issues/64912
Signed-off-by: Leonid Usov <leonid.usov@ibm.com>
(cherry picked from commit e1c8e08961e8199f1da74dc9d1eb2a940e2908d5)

src/mds/QuiesceDbManager.cc
src/mds/QuiesceDbManager.h
src/test/mds/TestQuiesceDb.cc

index 6fccaacf10c475c97a1b57b8f286796de3a8fe43..bcbcbeed0b142b48db1fd8cccb858e1876aae3ec 100644 (file)
@@ -168,18 +168,23 @@ void QuiesceDbManager::update_membership(const QuiesceClusterMembership& new_mem
     dout(5) << "starting the db mgr thread at epoch: " << new_membership.epoch << dendl;
     db_thread_should_exit = false;
     quiesce_db_thread.create("quiesce_db_mgr");
-  } else {
+  } else if (quiesce_db_thread.is_started()) {
     submit_condition.notify_all();
   }
 
   if (inject_request) {
-    pending_requests.push_front(inject_request);
+    if (will_participate || quiesce_db_thread.is_started()) {
+      pending_requests.push_front(inject_request);
+    } else {
+      inject_request->complete(ENOTTY);
+    }
   }
 
   if (will_participate) {
     cluster_membership = new_membership;
   } else {
     cluster_membership.reset();
+    db_thread_should_clear_db = true;
   }
 
   std::lock_guard lc(agent_mutex);
@@ -188,8 +193,20 @@ void QuiesceDbManager::update_membership(const QuiesceClusterMembership& new_mem
   }
 }
 
-std::pair<QuiesceDbManager::IsMemberBool, QuiesceDbManager::ShouldExitBool> QuiesceDbManager::membership_upkeep()
+std::pair<QuiesceDbManager::IsMemberBool, QuiesceDbManager::ShouldExitBool>
+QuiesceDbManager::membership_upkeep()
 {
+  if (db_thread_should_clear_db) {
+    dout(5) << "a reset of the db has been requested" << dendl;
+    db_thread_should_clear_db = false;
+    membership.epoch = 0;
+    // clear the peers to bootstrap from scratch if we are the leader
+    peers.clear();
+    // reset the db
+    db.clear();
+    // not clearing awaits and requests, they will be handled below
+  }
+
   if (cluster_membership && cluster_membership->epoch == membership.epoch) {
     // no changes
     return {true, db_thread_should_exit};
@@ -218,12 +235,6 @@ std::pair<QuiesceDbManager::IsMemberBool, QuiesceDbManager::ShouldExitBool> Quie
     for (auto peer : cluster_membership->members) {
       peers.try_emplace(peer);
     }
-
-    if (db.set_version == 0) {
-      db.time_zero = QuiesceClock::now();
-      db.sets.clear();
-    }
-
   } else {
     peers.clear();
     // abort awaits with EINPROGRESS
@@ -244,11 +255,6 @@ std::pair<QuiesceDbManager::IsMemberBool, QuiesceDbManager::ShouldExitBool> Quie
   if (cluster_membership) {
     membership = *cluster_membership;
     dout(15) << "Updated membership" << dendl;
-  } else {
-    membership.epoch = 0;
-    peers.clear();
-    awaits.clear();
-    db.clear();
   }
 
   return { cluster_membership.has_value(), db_thread_should_exit };
@@ -291,7 +297,7 @@ QuiesceTimeInterval QuiesceDbManager::replica_upkeep(decltype(pending_db_updates
   db.time_zero = time_zero;
 
   if (db.set_version > update.db_version.set_version) {
-    dout(3) << "got an older version of DB from the leader: " << db.set_version << " > " << update.db_version.set_version << dendl;
+    dout(3) << "got an older version of DB from the leader: " << update.db_version.set_version << " < " << db.set_version << dendl;
     dout(3) << "discarding the DB" << dendl;
     db.clear();
   } else {
index 98d0b84fc24b6d8894ad97e24b1ffd7d781e066a..2fdc605ac8462c41054fc35b2c47908a57e06270 100644 (file)
@@ -204,6 +204,7 @@ class QuiesceDbManager {
     std::queue<QuiesceDbPeerAck> pending_acks;
     std::deque<RequestContext*> pending_requests;
     bool db_thread_should_exit = false;
+    bool db_thread_should_clear_db = true;
 
     class QuiesceDbThread : public Thread {
       public:
index 2ffba9778831e5bce0cd778ee176028d74358462..cf05aaa038d0ac7eff0520bb9c5b6455cfdd1e12 100644 (file)
@@ -1571,7 +1571,7 @@ TEST_F(QuiesceDbTest, MultiRankRecovery)
   ASSERT_NO_FATAL_FAILURE(configure_cluster({ mds_gid_t(1), mds_gid_t(2), mds_gid_t(3) }));
 
   // we expect the db to be populated since the new leader must have discovered newer versions
-  // we expect the sets to become quiescing since there's at least one member that's not acking (the new one)
+  // we expect the sets to become quiesced since all members are now acking
   EXPECT_EQ(OK(), run_request([](auto& r) {
     r.set_id = "set1";
     r.await = sec(1);
@@ -1598,15 +1598,9 @@ TEST_F(QuiesceDbTest, MultiRankRecovery)
   });
 
   // add back a quiescing peer
-  ASSERT_NO_FATAL_FAILURE(configure_cluster({ mds_gid_t(1), mds_gid_t(2), mds_gid_t(3)}));
-
-  EXPECT_EQ(OK(), run_request([](auto& r) {}));
-  ASSERT_EQ(2, last_request->response.sets.size());
-  EXPECT_EQ(QS_QUIESCED, last_request->response.sets.at("set1").rstate.state);
-  EXPECT_EQ(QS_QUIESCED, last_request->response.sets.at("set2").rstate.state);
+  ASSERT_NO_FATAL_FAILURE(configure_cluster({ mds_gid_t(1), mds_gid_t(2), mds_gid_t(3) }));
 
   EXPECT_EQ(std::future_status::ready, did_ack3.wait_for(std::chrono::milliseconds(2000)));
-
   EXPECT_EQ(OK(), run_request([](auto& r) {}));
   ASSERT_EQ(2, last_request->response.sets.size());
   EXPECT_EQ(QS_QUIESCED, last_request->response.sets.at("set1").rstate.state);