]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mds/quiesce: QuiesceDb.h and QuiesceDbManager with tests
authorLeonid Usov <leonid.usov@ibm.com>
Tue, 31 Oct 2023 12:46:55 +0000 (14:46 +0200)
committerLeonid Usov <leonid.usov@ibm.com>
Thu, 14 Mar 2024 19:07:52 +0000 (15:07 -0400)
Quiesce DB is one of the components of the "Consistent Snapshots" epic.
The solution is discussed in a slide deck available for viewing to @redhat users:
https://docs.google.com/presentation/d/1wE3-e9AAme7Q3qmeshUSthJoQGw7-fKTrtS9PsdAIVo/edit?usp=sharing

This commit is focusing on the replicated quiesce database maintained by the MDS rank cluster.
One of the major goals was to design the component in a way that can be easily tested
outside of the MDS infrastructure, which is why the communication layer
has been abstracted out by introducing just two communication callbacks
that will need to be implemented by the infrastructure.

Most of the component code is delivered in a single coherent commit, along with the unit tests.
Other commits will be dedicated to integration with the MDS infrastructure and other changes
that can't be attributed to the core quiesce db code or its tests.

The quiesce db component is composed of the following major parts/actors:

* QuiesceDbManager is the main actor, implementing both the leader and the replica roles.
  Normally, there will be an instance of the manager per MDS rank, although given the
  decoupling of the infrastructure and the manager, one can run any number of instances
  on a single node, which is how the tests work.
* The manager interfaces to the infrastructure via two main APIs with the infrastructure
  that provides communication and cluster configuration (actor 2) and the quiesce db
  client that is responsible for the quiescing of the roots (actor 3)
** ClusterMembership is how manager is configured to be part of a (virtual) cluster.
   This structure will deliver information about other peers, the leader and provide
   two communication APIs: send_listing_to for db replication from the leader to replicas
   and send_ack for reporting quiesce success from the agents.
** Client Interface consists of a QuiesceMap notify callback and a dedicated manager
   method to submit asynchronous acks following the agent (rank) quiesce progress.

The API of the quiesce db is described in the slide deck mentioned above. The full scope
of capabilities is encapsulated in a single QuiesceDbRequest structure. This should
simplify the implementation of other components that will have to propagate the functionality
to the administrator user of the volumes plugin.

Signed-off-by: Leonid Usov <leonid.usov@ibm.com>
(cherry picked from commit 0e61c44238ef5ec1d2b0406600a4ce981b07f87a)

src/common/subsys.h
src/mds/BoostUrlImpl.cc [new file with mode: 0644]
src/mds/QuiesceAgent.h [new file with mode: 0644]
src/mds/QuiesceDb.h [new file with mode: 0644]
src/mds/QuiesceDbManager.cc [new file with mode: 0644]
src/mds/QuiesceDbManager.h [new file with mode: 0644]
src/test/mds/CMakeLists.txt
src/test/mds/TestQuiesceDb.cc [new file with mode: 0644]

index d52018c880dd36dd5804c2a427bd4f3b6a1cc723..0bc92346530faf297e071f575f2cf7886357684a 100644 (file)
@@ -28,6 +28,7 @@ SUBSYS(mds_locker, 1, 5)
 SUBSYS(mds_log, 1, 5)
 SUBSYS(mds_log_expire, 1, 5)
 SUBSYS(mds_migrator, 1, 5)
+SUBSYS(mds_quiesce, 3, 5)
 SUBSYS(buffer, 0, 1)
 SUBSYS(timer, 0, 1)
 SUBSYS(filer, 0, 1)
diff --git a/src/mds/BoostUrlImpl.cc b/src/mds/BoostUrlImpl.cc
new file mode 100644 (file)
index 0000000..c2e992a
--- /dev/null
@@ -0,0 +1 @@
+#include <boost/url/src.hpp>
diff --git a/src/mds/QuiesceAgent.h b/src/mds/QuiesceAgent.h
new file mode 100644 (file)
index 0000000..6a059de
--- /dev/null
@@ -0,0 +1,19 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#pragma once
+#include "QuiesceDb.h"
+
+class QuiesceAgent {
+  // subscribe to the QM map
+  // keeps runtime version of the map
+  // notifies the QM when the runtime version is different from the last known requested version
+};
\ No newline at end of file
diff --git a/src/mds/QuiesceDb.h b/src/mds/QuiesceDb.h
new file mode 100644 (file)
index 0000000..5fedcfe
--- /dev/null
@@ -0,0 +1,711 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#pragma once
+#include <string>
+#include <map>
+#include <unordered_map>
+#include <optional>
+#include <vector>
+#include <ranges>
+
+#include "mds/mdstypes.h"
+#include "common/ceph_time.h"
+
+// NB! The order of the states in the enum is important!
+// There are places in the code that aggregate multiple states
+// via min or max, depending on the task.
+// The order of states represents the natural lifecycle
+// of a set and its members, this is specifically important
+// for the active states.
+enum QuiesceState: uint8_t {
+  QS__INVALID,
+
+  // these states are considered "active"
+  QS_QUIESCING, QS__ACTIVE = QS_QUIESCING,
+  QS_QUIESCED,
+  QS_RELEASING,
+
+  // the below states are all terminal, or "inactive"
+  QS_RELEASED, QS__TERMINAL = QS_RELEASED,
+  // the below states are all about types of failure
+  QS_EXPIRED, QS__FAILURE = QS_EXPIRED,
+
+  QS_FAILED,
+
+  // the below states aren't allowed for roots, only for sets
+  QS_CANCELED, QS__SET_ONLY = QS_CANCELED,
+  QS_TIMEDOUT,
+
+  QS__MAX,
+};
+
+template <class CharT, class Traits>
+static std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const QuiesceState& qs)
+{
+  switch (qs) {
+  case QS__INVALID:
+    return os << "QS__INVALID (" << (int)qs << ")";
+  case QS_QUIESCING:
+    return os << "QS_QUIESCING (" << (int)qs << ")";
+  case QS_QUIESCED:
+    return os << "QS_QUIESCED (" << (int)qs << ")";
+  case QS_RELEASING:
+    return os << "QS_RELEASING (" << (int)qs << ")";
+  case QS_RELEASED:
+    return os << "QS_RELEASED (" << (int)qs << ")";
+  case QS_FAILED:
+    return os << "QS_FAILED (" << (int)qs << ")";
+  case QS_CANCELED:
+    return os << "QS_CANCELED (" << (int)qs << ")";
+  case QS_TIMEDOUT:
+    return os << "QS_TIMEDOUT (" << (int)qs << ")";
+  case QS_EXPIRED:
+    return os << "QS_EXPIRED (" << (int)qs << ")";
+  default:
+    return os << "!Unknown quiesce state! (" << (int)qs << ")";
+  }
+};
+
+inline const char * quiesce_state_name(QuiesceState state) {
+  switch (state) {
+  case QS__INVALID:
+    return "<invalid>";
+  case QS_QUIESCING:
+    return "QUIESCING";
+  case QS_QUIESCED:
+    return "QUIESCED";
+  case QS_RELEASING:
+    return "RELEASING";
+  case QS_RELEASED:
+    return "RELEASED";
+  case QS_FAILED:
+    return "FAILED";
+  case QS_CANCELED:
+    return "CANCELED";
+  case QS_TIMEDOUT:
+    return "TIMEDOUT";
+  case QS_EXPIRED:
+    return "EXPIRED";
+  default:
+    return "<unknown>";
+  }
+}
+
+// Since the MDS clock is not synchronized, and the quiesce db has to be replicated,
+// we measure all events in the quiesce database relative to the database age.
+// The age of the database is just large enough to have earliest events carry
+// a non-negative age stamp.
+// This is sufficient because we only care to honor the timeouts that are relative
+// to the other recorded database events.
+// This approach also relieves us from storing or transferring absolute time stamps:
+// every client can deduce the lower boundary of event's absolute time given the 
+// message round-trip timing - if they bother enough. Otherwise, they can just subtract
+// the received database age from now() and get their own absolute time reference.
+
+using QuiesceClock = ceph::coarse_real_clock;
+using QuiesceTimePoint = QuiesceClock::time_point;
+using QuiesceTimeInterval = QuiesceClock::duration;
+using QuiesceSetId = std::string;
+using QuiesceRoot = std::string;
+using QuiesceSetVersion = uint64_t;
+
+struct QuiesceDbVersion {
+  epoch_t epoch;
+  QuiesceSetVersion set_version;
+  auto operator<=>(QuiesceDbVersion const& other) const = default;
+  QuiesceDbVersion& operator+(unsigned int delta) {
+    set_version += delta;
+    return *this;
+  }
+};
+
+inline auto operator==(int const& set_version, QuiesceDbVersion const& db_version)
+{
+  return db_version.set_version == (QuiesceSetVersion)set_version;
+}
+
+template <class CharT, class Traits>
+static std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const QuiesceDbVersion& dbv)
+{
+  return os << "(" << dbv.epoch << ":" << dbv.set_version << ")";
+}
+
+struct QuiesceTimeIntervalSec {
+  const QuiesceTimeInterval interval;
+  QuiesceTimeIntervalSec(const QuiesceTimeInterval &interval) : interval(interval) {}
+};
+
+template <class CharT, class Traits>
+static std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const QuiesceTimeIntervalSec& sec)
+{
+  using std::chrono::duration_cast;
+  using dd = std::chrono::duration<double>;
+  const auto precision = os.precision();
+  const auto flags = os.flags();
+
+  os
+    << std::fixed
+    << std::setprecision(1)
+    << duration_cast<dd>(sec.interval).count()
+    << std::setprecision(precision);
+
+  os.flags(flags);
+  return os;
+}
+
+struct RecordedQuiesceState {
+  QuiesceState state;
+  QuiesceTimeInterval at_age;
+
+  operator QuiesceState() {
+    return state;
+  }
+
+  bool update(const QuiesceState &state, const QuiesceTimeInterval &at_age) {
+    if (state != this->state) {
+      this->state = state;
+      this->at_age = at_age;
+      return true;
+    }
+    return false;
+  }
+
+  RecordedQuiesceState(QuiesceState state, QuiesceTimeInterval at_age) : state(state), at_age(at_age) {}
+  RecordedQuiesceState() : RecordedQuiesceState (QS__INVALID, QuiesceTimeInterval::zero()) {}
+  RecordedQuiesceState(RecordedQuiesceState const&) = default;
+  RecordedQuiesceState(RecordedQuiesceState &&) = default;
+  RecordedQuiesceState& operator=(RecordedQuiesceState const&) = default;
+  RecordedQuiesceState& operator=(RecordedQuiesceState &&) = default;
+};
+
+template <class CharT, class Traits>
+static std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const RecordedQuiesceState& rstate)
+{
+  return os << rstate.state;
+}
+
+/// @brief  `QuiesceSet` is the only record type in the quiesce database
+///         It encodes sufficient information to have the db taken over by
+///         a new manager and correctly decide on the next state transition.
+struct QuiesceSet {
+  /// @brief  A member of a set represents a single root this set wants quiesced
+  ///         It carries the information about the current known state of this root
+  ///         and whether it got excluded from this set.
+  ///         It's possible that holding on to excluded members is an overkill.
+  struct MemberInfo {
+    RecordedQuiesceState rstate;
+    bool excluded = false;
+    MemberInfo(QuiesceState state, QuiesceTimeInterval at_age)
+        : rstate(state, at_age)
+        , excluded(false)
+    {
+    }
+    MemberInfo(QuiesceTimeInterval at_age)
+        : MemberInfo(QS_QUIESCING, at_age)
+    {
+    }
+    MemberInfo() = default;
+    MemberInfo(MemberInfo const& o) = default;
+    MemberInfo(MemberInfo &&) = default;
+    MemberInfo& operator=(MemberInfo const& o) = default;
+    MemberInfo& operator=(MemberInfo &&) = default;
+
+    bool is_quiescing() const { return rstate.state < QS_QUIESCED; }
+    bool is_failed() const { return rstate.state >= QS__FAILURE; }
+  };
+  
+  /// @brief  The db version when this set got modified last
+  QuiesceSetVersion version = 0;
+  /// @brief  The last recorded state change of this set
+  RecordedQuiesceState rstate;
+  /// @brief  How much time to give every new member to reach the quiesced state
+  ///         By default the value is zero. It means that new sets which don't
+  ///         have this field explicitly updated will immediately reach QS_TIMEDOUT.
+  QuiesceTimeInterval timeout = QuiesceTimeInterval::zero();
+  /// @brief  How much time to allow the set to spend in QS_QUIESCED or QS_RELEASING states
+  ///         The expiration timer is reset every time a successful await is executed
+  ///         on a QS_QUIESCED set.
+  QuiesceTimeInterval expiration = QuiesceTimeInterval::zero();
+  using Members = std::unordered_map<QuiesceRoot, MemberInfo>;
+  Members members;
+
+  /// @brief The effective state of a member is just a max of its parent set state and its own state
+  ///        The exception is when the set is releasing: we want to consider any ack from peers
+  ///        that confirms quiesced state of the member to be treated as RELEASED.
+  /// @param reported_state the reported state of the member
+  /// @return the effective member state
+  QuiesceState get_effective_member_state(QuiesceState reported_state) const
+  {
+    if (is_releasing()) {
+      if (reported_state >= QS_QUIESCED && reported_state <= QS_RELEASED) {
+        return QS_RELEASED;
+      }
+    }
+    if (is_quiesced() && reported_state < QS_QUIESCED) {
+      // we need to change back to quiescing
+      return QS_QUIESCING;
+    }
+    return std::max(reported_state, rstate.state);
+  }
+
+  /// @brief The requested state of a member is what we send to the agents for 
+  ///        executing the quiesce protocol. This state is deliberately reduced
+  ///        to provoke clients to ack back and thus confirm their current state
+  /// @return the reduced member state that should be requested from the agents,
+  ///         derived from this set's current recorded state
+  QuiesceState get_requested_member_state() const
+  {
+    if (rstate.state >= QS__TERMINAL) {
+      return rstate.state;
+    }
+    if (rstate.state <= QS_QUIESCED) {
+      // request quiescing even if we are already quiesced
+      return QS_QUIESCING;
+    }
+    // we can't have anything else unless the state enum was changed
+    // which will have to be addressed here.
+    ceph_assert(rstate.state == QS_RELEASING);
+    return QS_RELEASING;
+  }
+
+  bool is_active() const {
+    return
+      rstate.state > QS__INVALID
+      && rstate.state < QS__TERMINAL;
+  }
+
+  QuiesceState next_state(QuiesceState min_member_state) const;
+
+  bool is_quiescing() const { return rstate.state < QS_QUIESCED; }
+  bool is_quiesced() const { return rstate.state == QS_QUIESCED; }
+  bool is_releasing() const { return rstate.state == QS_RELEASING; }
+  bool is_released() const { return rstate.state == QS_RELEASED; }
+
+  QuiesceSet() = default;
+  QuiesceSet(QuiesceSet const &) = default;
+  QuiesceSet(QuiesceSet &&) = default;
+  QuiesceSet& operator=(QuiesceSet const &) = default;
+  QuiesceSet& operator=(QuiesceSet &&) = default;
+};
+
+template <class CharT, class Traits>
+static std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const QuiesceSet::MemberInfo& member)
+{
+  return os << (member.excluded ? "(excluded)" : "") << member.rstate;
+}
+
+template <class CharT, class Traits>
+static std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const QuiesceSet& set)
+{
+  size_t active = 0, inactive = 0;
+
+  for (auto && [_, m]: set.members) {
+    if (m.excluded) {
+      ++inactive;
+    } else {
+      ++active;
+    }
+  }
+
+  return os << "q-set[" << set.rstate << " v:" << set.version << ", m:" << active << "/" << inactive
+    << ", t:" << QuiesceTimeIntervalSec(set.timeout) << ", e:" << QuiesceTimeIntervalSec(set.expiration) << "]";
+}
+
+/// @brief QuiesceDbRequest is the only client interface to the database.
+///        This structure alone should be capable of encoding the full variety
+///        of different requests that can be submitted by the client.
+struct QuiesceDbRequest {
+  /// @brief `RootsOp` should be considered together with the `roots` set below
+  ///        to know the operation. Each name in the enum has two verbs: first
+  ///        verb is for the case when `roots` is not empty, and the second is
+  ///        for when `roots` is empty
+  enum RootsOp: uint8_t {
+    INCLUDE_OR_QUERY,
+    EXCLUDE_OR_RELEASE,
+    RESET_OR_CANCEL,
+    __INVALID
+  };
+
+  enum Flags: uint8_t {
+    NONE = 0,
+    VERBOSE = 1,
+    EXCLUSIVE = 2,
+  };
+
+  struct Control {
+    union {
+      struct {
+        RootsOp roots_op;
+        Flags flags;
+      };
+      uint64_t raw;
+    };
+    Control() : raw(0) {}
+    Control(RootsOp op) : raw(0) {
+      roots_op = op;
+    }
+    bool operator==(const Control& other) const {
+      return other.raw == raw;
+    }
+  };
+
+  Control control;
+
+  /// @brief `set_id` is optional to allow for the following operations:
+  ///         * including roots without providing a set id will generate a new set with a unique id
+  ///           ** NB! the new set id will be stored in this field by the db manager
+  ///         * querying without a set id will return the full db
+  ///         * cancelling without a set id will cancel all active sets
+  std::optional<std::string> set_id;
+  /// @brief When `if_version` provided, the request will only be executed
+  ///        if the named set has exactly the version, otherwise ESTALE is returned
+  ///        and no set modification is performed.
+  ///        Requires a set_id.
+  std::optional<QuiesceSetVersion> if_version;
+  /// @brief Updates the quiesce timeout of an active set.
+  ///        Requires a set_id. Attempt to update an inactive set will result in EPERM
+  std::optional<QuiesceTimeInterval> timeout;
+  /// @brief Updates the quiesce expiration of an active set.
+  ///        Requires a set_id. Attempt to update an inactive set will result in EPERM
+  std::optional<QuiesceTimeInterval> expiration;
+  /// @brief When `await` is non-null, then after performing other encoded operations
+  ///        this request is put on the await queue of the given set.
+  ///        The value of this member defines the await timeout.
+  ///        Requires a set id. The result code is one of the following:
+  ///         EPERM     - the set is not in one of the awaitable states:
+  ///                     [QS_QUIESCING, QS_QUIESCED, QS_RELEASING, QS_RELEASED]
+  ///         SUCCESS   - the set is currently QS_QUIESCED or QS_RELEASED.
+  ///                     When an await is completed successfully for a QS_QUIESCED set,
+  ///                     this set's quiesce expiration timer is reset.
+  ///         EINTR     - the set had a change in members or in state
+  ///         ECANCELED - the set was canceled
+  ///         ETIMEDOUT - at least one of the set members failed to quiesce 
+  ///                     within the configured quiesce timeout.
+  ///                     OR the set is RELEASING and it couldn't reach RELEASED before it expired
+  ///                     NB: the quiesce timeout is measured for every member separately
+  ///                     from the moment that member is included.
+  ///         EINPROGRESS - the time limit configured for this await call has elapsed 
+  ///                     before the set changed state.
+  std::optional<QuiesceTimeInterval> await;
+  using Roots = std::unordered_set<QuiesceRoot>;
+  /// @brief `roots` help identify the wanted operation as well as providing
+  ///        the actual roots to mutate the members of the set.
+  Roots roots;
+
+  bool operator==(const QuiesceDbRequest&) const = default;
+
+  bool is_valid() const {
+    return control.roots_op < __INVALID && (
+        // Everything goes if a set id is provided
+        set_id
+        // or it's a new set creation, in which case the request should be including roots
+        || includes_roots()
+        // Otherwise, the allowed wildcard operations are: query and cancel all.
+        // Also, one can't await a wildcard
+        || ((is_cancel_all() || is_query()) && !await && !timeout && !expiration && !if_version)
+    );
+  }
+
+  bool is_mutating() const { return (control.roots_op != INCLUDE_OR_QUERY) || !roots.empty() || timeout || expiration; }
+  bool is_cancel_all() const { return !set_id && is_cancel(); }
+  bool excludes_roots() const { return control.roots_op == RESET_OR_CANCEL || (control.roots_op == EXCLUDE_OR_RELEASE && !roots.empty()); }
+  bool includes_roots() const { return (control.roots_op == RESET_OR_CANCEL || control.roots_op == INCLUDE_OR_QUERY) && !roots.empty(); }
+
+  bool is_include() const { return control.roots_op == INCLUDE_OR_QUERY && !roots.empty(); }
+  bool is_query() const { return control.roots_op == INCLUDE_OR_QUERY && roots.empty(); }
+  bool is_exclude() const { return control.roots_op == EXCLUDE_OR_RELEASE && !roots.empty(); }
+  bool is_release() const { return control.roots_op == EXCLUDE_OR_RELEASE && roots.empty(); }
+  bool is_reset() const { return control.roots_op == RESET_OR_CANCEL && !roots.empty(); }
+  bool is_cancel() const { return control.roots_op == RESET_OR_CANCEL && roots.empty(); }
+
+  bool is_verbose() const { return control.flags & Flags::VERBOSE; }
+  bool is_exclusive() const { return control.flags & Flags::EXCLUSIVE; }
+
+  bool should_exclude(QuiesceRoot root) const {
+    switch (control.roots_op) {
+    case INCLUDE_OR_QUERY:
+      return false;
+    case EXCLUDE_OR_RELEASE:
+      return roots.contains(root);
+    case RESET_OR_CANCEL:
+      return !roots.contains(root);
+      default: ceph_abort("unknown roots_op"); return false;
+    }
+  }
+
+  void reset(std::invocable<QuiesceDbRequest&> auto const &config)
+  {
+    set_id.reset();
+    if_version.reset();
+    timeout.reset();
+    expiration.reset();
+    await.reset();
+    roots.clear();
+    control.raw = 0; // implies roots_op == INCLUDE_OR_QUERY;
+
+    config(*this);
+  }
+  void reset() {
+    reset([](auto&r){});
+  }
+
+  template<typename R = Roots>
+  requires requires ( R&& roots) {
+    Roots(std::forward<R>(roots));
+  }
+  void set_roots(RootsOp op, R&& roots) {
+    control.roots_op = op;
+    this->roots = Roots(std::forward<R>(roots));
+  }
+
+  template <std::ranges::range R>
+  void set_roots(RootsOp op, const R& roots_range)
+  {
+    control.roots_op = op;
+    this->roots = Roots(roots_range.begin(), roots_range.end());
+  }
+
+  template <typename R = Roots>
+  void include_roots(R&& roots)
+  {
+    set_roots(INCLUDE_OR_QUERY, std::forward<R>(roots));
+  }
+
+  template <typename R = Roots>
+  void exclude_roots(R&& roots)
+  {
+    set_roots(EXCLUDE_OR_RELEASE, std::forward<R>(roots));
+  }
+
+  void release_roots() {
+    set_roots(EXCLUDE_OR_RELEASE, {});
+  }
+
+  template <typename R = Roots>
+  void reset_roots(R&& roots)
+  {
+    set_roots(RESET_OR_CANCEL, std::forward<R>(roots));
+  }
+
+  void cancel_roots()
+  {
+    set_roots(RESET_OR_CANCEL, {});
+  }
+
+  template <typename S = std::string>
+  void query(S&& set_id) {
+    reset([set_id](auto &r){
+      r.set_id = std::forward<S>(set_id);
+    });
+  }
+
+  const char * op_string() const {
+    switch (control.roots_op) {
+    case INCLUDE_OR_QUERY:
+      return roots.empty() ? "query" : "include";
+    case EXCLUDE_OR_RELEASE:
+      return roots.empty() ? "release" : "exclude";
+    case RESET_OR_CANCEL:
+      return roots.empty() ? "cancel" : "reset";
+    default:
+      return "<unknown>";
+    }
+  }
+
+
+  QuiesceDbRequest() {}
+  QuiesceDbRequest(const QuiesceDbRequest &) = default;
+  QuiesceDbRequest(QuiesceDbRequest &&) = default;
+  QuiesceDbRequest(std::invocable<QuiesceDbRequest&> auto const &config) {
+    reset(config);
+  }
+};
+
+template <class CharT, class Traits>
+static std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const QuiesceDbRequest& req)
+{
+  os << "q-req[" << req.op_string();
+
+  if (req.set_id) {
+    os << " \"" << req.set_id << "\"";
+  }
+
+  if (req.if_version) {
+    os << " ?v:" << req.if_version;
+  }
+
+  if (req.await) {
+    os << " a:" << QuiesceTimeIntervalSec(*req.await);
+  }
+
+  return os << " roots:" << req.roots.size() << "]";
+}
+
+/// @brief  A `QuiesceDbListing` represents a subset of the database, up to
+///         a full database. The contents of the listing is decided by the leader
+///         based on the acks it got from every given replica: the update will
+///         contain all sets that have their version > than the last acked by the peer.
+struct QuiesceDbListing {
+  QuiesceDbVersion db_version = {0, 0};
+  /// @brief  Crucially, the precise `db_age` must be included in every db listing
+  ///         This data is used by all replicas to update their calculated DB TIME ZERO.
+  ///         All events in the database are measured relative to the DB TIME ZERO
+  QuiesceTimeInterval db_age = QuiesceTimeInterval::zero();
+  std::unordered_map<QuiesceSetId, QuiesceSet> sets;
+
+  void clear() {
+    db_version = {0, 0};
+    db_age = QuiesceTimeInterval::zero();
+    sets.clear();
+  }
+
+  QuiesceDbListing(epoch_t epoch) : db_version {epoch, 0} {}
+  QuiesceDbListing() = default;
+  QuiesceDbListing(QuiesceDbListing const&) = default;
+  QuiesceDbListing(QuiesceDbListing &&) = default;
+  QuiesceDbListing& operator=(QuiesceDbListing const&) = default;
+  QuiesceDbListing& operator=(QuiesceDbListing &&) = default;
+};
+
+template <class CharT, class Traits>
+static std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const QuiesceDbListing& dbl)
+{
+  size_t active = 0, inactive = 0;
+
+  for (auto&& [_, s] : dbl.sets) {
+    if (s.is_active()) {
+      ++active;
+    } else {
+      ++inactive;
+    }
+  }
+
+  return os << "q-db[v:" << dbl.db_version << " sets:" << active << "/" << inactive << "]";
+}
+
+/// @brief  `QuiesceMap` is a root-centric representation of the quiesce database
+///         It lists roots with their effective states as of particular version.
+///         Additionally, the same structure is used by the peers when reporting
+///         actual roots states which are different from what the DB version encodes
+struct QuiesceMap {
+  QuiesceDbVersion db_version;
+  struct RootInfo {
+    QuiesceState state;
+    QuiesceTimeInterval ttl = QuiesceTimeInterval::zero();
+    bool is_valid() const { return state > QS__INVALID && state < QS__SET_ONLY; }
+    RootInfo() : RootInfo(QS__INVALID) {}
+    RootInfo(QuiesceState state) : RootInfo(state,QuiesceTimeInterval::zero()) {}
+    RootInfo(QuiesceState state, QuiesceTimeInterval ttl)
+        : state(state)
+        , ttl(ttl)
+    {
+    }
+    inline bool operator==(const RootInfo& other) const {
+      return state == other.state && ttl == other.ttl;
+    }
+
+    RootInfo(RootInfo const&) = default;
+    RootInfo(RootInfo &&) = default;
+    RootInfo& operator=(RootInfo const&) = default;
+    RootInfo& operator=(RootInfo &&) = default;
+  };
+  using Roots = std::unordered_map<QuiesceRoot, RootInfo>;
+  Roots roots;
+  void reset() {
+    db_version = {0, 0};
+    roots.clear();
+  }
+
+  QuiesceMap() : db_version({0, 0}), roots() { }
+  QuiesceMap(QuiesceDbVersion db_version) : db_version(db_version), roots() { }
+  QuiesceMap(QuiesceDbVersion db_version, Roots &&roots) : db_version(db_version), roots(roots) { }
+  QuiesceMap(QuiesceDbVersion db_version, Roots const& roots) : db_version(db_version), roots(roots) { }
+
+  QuiesceMap(QuiesceMap const&) = default;
+  QuiesceMap(QuiesceMap &&) = default;
+  QuiesceMap& operator=(QuiesceMap const&) = default;
+  QuiesceMap& operator=(QuiesceMap &&) = default;
+};
+
+template <class CharT, class Traits>
+static std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const QuiesceMap& map)
+{
+
+  size_t active = 0, inactive = 0;
+
+  for (auto&& [_, r] : map.roots) {
+    if (r.state < QS__TERMINAL) {
+      ++active;
+    } else {
+      ++inactive;
+    }
+  }
+
+  return os << "q-map[v:" << map.db_version << " roots:" << active << "/" << inactive << "]";
+}
+
+inline QuiesceTimeInterval interval_saturate_add(QuiesceTimeInterval lhs, QuiesceTimeInterval rhs)
+{
+  // assuming an unsigned time interval.
+  // TODO: make this function generic and also saturate add signed values
+  assert(std::is_unsigned_v<QuiesceTimeInterval::rep>);
+
+  QuiesceTimeInterval result = lhs + rhs;
+
+  // the sum can't be smaller than either part
+  // since we're working with an unsigned value
+  if (result < lhs || result < rhs) {
+    // this must have been an overflow
+    return QuiesceTimeInterval::max();
+  }
+
+  return result;
+};
+
+inline QuiesceTimePoint interval_saturate_add_now(QuiesceTimeInterval interval) {
+  return QuiesceTimePoint(interval_saturate_add(QuiesceClock::now().time_since_epoch(), interval));
+};
+
+namespace QuiesceInterface {
+  using PeerId = mds_gid_t;
+  /// @brief  A callback from the manager to the agent with an up-to-date root list
+  ///         The map is mutable and will be used as synchronous agent ack if the return value is true
+  using AgentNotify = std::function<bool(QuiesceMap&)>;
+  /// @brief  Used to send asynchronous acks from agents about changes to the root states
+  ///         The transport layer should include sufficient information to know the sender of the ack
+  using AgentAck = std::function<int(QuiesceMap&&)>;
+  /// @brief  Used by the leader to replicate the DB changes to its peers
+  using DbPeerUpdate = std::function<int(PeerId, QuiesceDbListing&&)>;
+
+  using RequestHandle = metareqid_t;
+  /// @brief  Used by the agent to initiate an ongoing quiesce request for the given quiesce root
+  ///         The context will be completed when the quiescing is achieved by this rank. The IO pause
+  ///         should continue until the request is canceled.
+  ///         Repeated requests for the same root should succeed, returning a _new_ request id;
+  ///         the old context should be completed with an error EINTR, and the old request id should be invalidated.
+  ///         If the root has already reached quiescence by the time the repeated request is submitted
+  ///         then the new context should be immediately (synchronously) completed with success and then discarded.
+  ///         Synchronous errors should be reported by completing the supplied context, and the return value
+  ///         should be std::nullopt in such cases
+  using RequestSubmit = std::function<std::optional<RequestHandle>(QuiesceRoot, Context*)>;
+  /// @brief  Cancels the quiesce request. May be called at any time after the request got submitted
+  using RequestCancel = std::function<int(const RequestHandle&)>;
+};
diff --git a/src/mds/QuiesceDbManager.cc b/src/mds/QuiesceDbManager.cc
new file mode 100644 (file)
index 0000000..3eb9009
--- /dev/null
@@ -0,0 +1,1119 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include "mds/QuiesceDbManager.h"
+#include "common/debug.h"
+#include "fmt/format.h"
+#include "include/ceph_assert.h"
+#include <algorithm>
+#include <random>
+#include <ranges>
+#include <type_traits>
+#include "boost/url.hpp"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds_quiesce
+#undef dout_prefix
+#define dout_prefix *_dout << "quiesce.mgr <" << __func__ << "> "
+
+#undef dout
+#define dout(lvl)                                                        \
+  do {                                                                   \
+    auto subsys = ceph_subsys_mds;                                       \
+    if ((dout_context)->_conf->subsys.should_gather(dout_subsys, lvl)) { \
+      subsys = dout_subsys;                                              \
+    }                                                                    \
+  dout_impl(dout_context, ceph::dout::need_dynamic(subsys), lvl) dout_prefix
+
+#undef dendl
+#define dendl \
+  dendl_impl; \
+  }           \
+  while (0)
+
+#define dset(suffix) "[" << set_id << "@" << set.version << "] " << suffix
+#define dsetroot(suffix) "[" << set_id << "@" << set.version << "," << root << "] " << suffix
+
+// Sentinel peer id used where no valid member/leader gid is known.
+const QuiesceInterface::PeerId QuiesceClusterMembership::INVALID_MEMBER = MDS_GID_NONE;
+
+// Absolute difference between two time points, always non-negative.
+static QuiesceTimeInterval time_distance(QuiesceTimePoint lhs, QuiesceTimePoint rhs) {
+  return (lhs > rhs) ? (lhs - rhs) : (rhs - lhs);
+}
+
+// Whether the db thread has anything to process right now.
+// Expected to be called under `submit_mutex` (reads the pending queues).
+bool QuiesceDbManager::db_thread_has_work() const
+{
+  // queued acks, client requests, or peer db updates
+  if (!pending_acks.empty() || !pending_requests.empty() || !pending_db_updates.empty()) {
+    return true;
+  }
+  // the registered agent hasn't seen the latest db version yet
+  if (agent_callback.has_value() && agent_callback->if_newer < db_version()) {
+    return true;
+  }
+  // a membership change (including shutdown) also needs attention
+  return !cluster_membership.has_value() || cluster_membership->epoch != membership.epoch;
+}
+
+// Main loop of the quiesce db thread, started/stopped by update_membership().
+// Each iteration: drain the pending queues under `submit_mutex`, run leader or
+// replica upkeep outside the lock, complete finished requests, and, when
+// warranted, notify the agent and send a synchronous ack to the leader.
+void* QuiesceDbManager::quiesce_db_thread_main()
+{
+  db_thread_enter();
+
+  std::unique_lock ls(submit_mutex);
+  QuiesceTimeInterval next_event_at_age = QuiesceTimeInterval::max();
+  QuiesceDbVersion last_acked = {0, 0};
+
+  while (true) {
+
+    auto db_age = db.get_age();
+
+    // sleep until the next scheduled event unless work is already queued
+    if (!db_thread_has_work() && next_event_at_age > db_age) {
+      submit_condition.wait_for(ls, next_event_at_age - db_age);
+    }
+
+    // false means we are no longer a cluster member: exit the thread
+    if (!membership_upkeep()) {
+      break;
+    }
+
+    {
+      // take ownership of the queued work so it can be processed unlocked
+      decltype(pending_acks) acks(std::move(pending_acks));
+      decltype(pending_requests) requests(std::move(pending_requests));
+      decltype(pending_db_updates) db_updates(std::move(pending_db_updates));
+
+      ls.unlock();
+
+      if (membership.is_leader()) {
+        if (leader_bootstrap(std::move(db_updates), next_event_at_age)) {
+          // we're good to process things
+          next_event_at_age = leader_upkeep(std::move(acks), std::move(requests));
+        } else {
+          // not yet there. Put the requests back onto the queue
+          ls.lock();
+          // push back-to-front to preserve the original request order
+          while (!requests.empty()) {
+            pending_requests.emplace_front(std::move(requests.back()));
+            requests.pop_back();
+          }
+          continue;
+        }
+      } else {
+        next_event_at_age = replica_upkeep(std::move(db_updates));
+      }
+    }
+  
+    complete_requests();
+
+    // by default, only send ack if the version has changed
+    bool send_ack = last_acked != db_version();
+    QuiesceMap quiesce_map(db_version());
+    {
+      std::lock_guard lc(agent_mutex);
+      if (agent_callback) {
+        if (agent_callback->if_newer < db_version()) {
+          dout(20) << "notifying agent with db version " << db_version() << dendl;
+          calculate_quiesce_map(quiesce_map);
+          // a true return means the agent acks synchronously via `quiesce_map`
+          send_ack = agent_callback->notify(quiesce_map);
+          agent_callback->if_newer = db_version();
+        } else {
+          send_ack = false;
+        }
+      } else {
+        // by default, ack the db version and agree to whatever was sent
+        // This means that a quiesce cluster member with an empty agent callback 
+        // will cause roots to stay quiescing indefinitely
+        dout(5) << "no agent callback registered, responding with an empty ack" << dendl;
+      }
+    }
+
+    if (send_ack) {
+      // keep a copy of the version before the map is moved out below
+      auto db_version = quiesce_map.db_version;
+      dout(20) << "synchronous agent ack: " << quiesce_map << dendl;
+      auto rc = membership.send_ack(std::move(quiesce_map));
+      if (rc != 0) {
+        // NOTE(review): quiesce_map was moved-from above; logging it here
+        // prints a valid-but-unspecified value — confirm this is intended
+        dout(1) << "ERROR ("<< rc <<") when sending synchronous agent ack " 
+        << quiesce_map << dendl;
+      } else {
+        last_acked = db_version;
+      }
+    }
+
+    ls.lock();
+  }
+
+  ls.unlock();
+
+  db_thread_exit();
+
+  return 0;
+}
+
+// Infrastructure entry point: applies a new cluster membership.
+// Starts the db thread when this node first becomes a member, wakes it on
+// membership changes, and joins it when the node leaves the cluster.
+// `inject_request` (optional) is queued for processing under the new
+// membership, or completed with -EPERM if we won't be participating.
+void QuiesceDbManager::update_membership(const QuiesceClusterMembership& new_membership, RequestContext* inject_request)
+{
+  std::unique_lock lock(submit_mutex);
+
+  bool will_participate = new_membership.members.contains(new_membership.me);
+  dout(20) << "will participate: " << std::boolalpha << will_participate << std::noboolalpha << dendl;
+
+  if (cluster_membership && !will_participate) {
+    // stop the thread
+    // resetting cluster_membership makes membership_upkeep() return false,
+    // which breaks the thread out of its main loop
+    cluster_membership.reset();
+    submit_condition.notify_all();
+    lock.unlock();
+    ceph_assert(quiesce_db_thread.is_started());
+    dout(5) << "stopping the db mgr thread at epoch: " << new_membership.epoch << dendl;
+    quiesce_db_thread.join();
+  } else if (will_participate) {
+    if (!cluster_membership) {
+      // start the thread
+      dout(5) << "starting the db mgr thread at epoch: " << new_membership.epoch << dendl;
+      quiesce_db_thread.create("quiesce_db_mgr");
+    } else {
+      submit_condition.notify_all();
+    }
+    if (inject_request) {
+      pending_requests.push_front(inject_request);
+    }
+    cluster_membership = new_membership;
+    
+    // reset the agent's version gate so it gets re-notified
+    // with the state of the db under the new membership
+    std::lock_guard lc(agent_mutex);
+    if (agent_callback) {
+        agent_callback->if_newer = {0, 0};
+    }
+  }
+
+  if (!will_participate && inject_request) {
+    inject_request->complete(-EPERM);
+  }
+}
+
+// Reconciles the manager's internal state with the latest `cluster_membership`.
+// Returns true if we are (still) a member of the cluster; false tells the db
+// thread to exit. Called from the db thread under `submit_mutex`.
+bool QuiesceDbManager::membership_upkeep()
+{
+  if (cluster_membership && cluster_membership->epoch == membership.epoch) {
+    // no changes
+    return true;
+  }
+
+  bool was_leader = membership.epoch > 0 && membership.leader == membership.me;
+  bool is_leader = cluster_membership && cluster_membership->leader == cluster_membership->me;
+  if (cluster_membership) {
+    dout(10) << "epoch: " << cluster_membership->epoch << " is_leader: " << is_leader << " was_leader: " << was_leader << dendl;
+  } else {
+    dout(10) << "shutdown! was_leader: " << was_leader << dendl;
+  }
+
+  if (is_leader) {
+    // remove peers that aren't present anymore
+    for (auto peer_it = peers.begin(); peer_it != peers.end();) {
+      if (cluster_membership->members.contains(peer_it->first)) {
+        peer_it++;
+      } else {
+        peer_it = peers.erase(peer_it);
+      }
+    }
+    // create empty info for new peers
+    for (auto peer : cluster_membership->members) {
+      peers.try_emplace(peer);
+    }
+
+    // a leader with no db yet: anchor the db clock to now
+    if (db.set_version == 0) {
+      db.time_zero = QuiesceClock::now();
+      db.sets.clear();
+    }
+
+  } else {
+    // replica (or shutting down): no peer tracking, no awaits, no requests
+    peers.clear();
+    // abort awaits with EINPROGRESS
+    // the reason is that we don't really have a new version
+    // of any of the sets, we just aren't authoritative anymore
+    // hence, EINPROGRESS is a more appropriate response than, say, EINTR
+    for (auto & [_, await_ctx]: awaits) {
+      done_requests[await_ctx.req_ctx] = EINPROGRESS;
+    }
+    awaits.clear();
+    // reject pending requests
+    while (!pending_requests.empty()) {
+      done_requests[pending_requests.front()] = EPERM;
+      pending_requests.pop_front();
+    }
+  }
+
+  if (cluster_membership) {
+    membership = *cluster_membership;
+  }
+
+  return cluster_membership.has_value();
+}
+
+// Processes queued db updates while acting as a replica. Only the newest
+// update matters; older ones are skipped. A listing with set_version == 0
+// is a discovery request from the leader and is answered with our full
+// local db. Returns the age of the next local event; a replica is purely
+// reactive, so that is always "never".
+QuiesceTimeInterval QuiesceDbManager::replica_upkeep(decltype(pending_db_updates)&& db_updates)
+{
+  // as a replica, we only care about the latest update
+  while (db_updates.size() > 1) {
+    dout(10) << "skipping an older update from " << db_updates.front().first << " version " << db_updates.front().second.db_version << dendl;
+    db_updates.pop();
+  }
+
+  if (db_updates.empty()) {
+    // no db updates, wait forever
+    return QuiesceTimeInterval::max();
+  }
+
+  QuiesceDbListing &update = db_updates.back().second;
+
+  if (update.db_version.epoch != membership.epoch) {
+    dout(10) << "ignoring db update from another epoch: " << update.db_version << " != " << db_version() << dendl;
+    return QuiesceTimeInterval::max();
+  }
+
+  if (update.db_version.set_version == 0) {
+    // this is a call from a leader
+    // to upload our local db version
+    update.sets = db.sets;
+    update.db_version.set_version = db.set_version;
+    update.db_age = db.get_age();
+    membership.send_listing_to(membership.leader, std::move(update));
+    return QuiesceTimeInterval::max();
+  }
+
+  // adopt the leader's clock by anchoring our time_zero to its db_age
+  auto time_zero = QuiesceClock::now() - update.db_age;
+  if (time_distance(time_zero, db.time_zero) > std::chrono::seconds(1)) {
+    dout(10) << "significant db_time_zero change to " << time_zero << " from " << db.time_zero << dendl;
+  }
+  db.time_zero = time_zero;
+
+  if (db.set_version > update.db_version.set_version) {
+    dout(3) << "got an older version of DB from the leader: " << db.set_version << " > " << update.db_version.set_version << dendl;
+    dout(3) << "discarding the DB" << dendl;
+    db.reset();
+  } else {
+    // the listing may be partial: merge the received sets over ours
+    for (auto& [qs_id, qs] : update.sets) {
+      db.sets.insert_or_assign(qs_id, std::move(qs));
+    }
+    db.set_version = update.db_version.set_version;
+  }
+
+  // wait forever
+  return QuiesceTimeInterval::max();
+}
+
+// Before acting as a leader we must learn the db version of every peer in
+// this epoch. Consumes peer listings from `db_updates`, preferring a newer
+// db over ours, and sends (rate-limited) discovery requests to peers we
+// haven't heard from. Returns true once all peers are known; otherwise
+// lowers `next_event_at_age` so the thread retries the discovery.
+bool QuiesceDbManager::leader_bootstrap(decltype(pending_db_updates)&& db_updates, QuiesceTimeInterval &next_event_at_age)
+{
+  // check that we've heard from all peers in this epoch
+  std::unordered_set<QuiesceInterface::PeerId> unknown_peers;
+  for (auto&& [peer, info] : peers) {
+    // a peer that never acked in this epoch is unknown (set_version 0
+    // with a stale epoch); we always know ourselves
+    if (info.diff_map.db_version.epoch < membership.epoch && info.diff_map.db_version.set_version == 0) {
+      if (peer != membership.me) {
+        unknown_peers.insert(peer);
+      }
+    }
+  }
+
+  // only consider db submissions from unknown peers
+  while (!unknown_peers.empty() && !db_updates.empty()) {
+    auto &[from, update] = db_updates.front();
+    if (update.db_version.epoch == membership.epoch && unknown_peers.erase(from) > 0) {
+      // see if this peer's version is newer than mine
+      if (db.set_version < update.db_version.set_version) {
+        dout(3) << "preferring version from peer " 
+          << from << " (" << update.db_version 
+          << ") over mine (" << db_version() << ")" 
+          << " and incrementing it to collect acks" << dendl;
+        db.time_zero = QuiesceClock::now() - update.db_age;
+        db.set_version = update.db_version.set_version + 1;
+        db.sets = update.sets;
+      }
+      // record that we've seen this peer;
+      // set the epoch correctly but use set version 0 because it's not an ack yet.
+      peers[from] =  PeerInfo {QuiesceMap({membership.epoch, 0}), QuiesceClock::now()};
+    }
+    db_updates.pop();
+  }
+
+  for (auto & peer: unknown_peers) {
+    PeerInfo & info = peers[peer];
+
+    // at most one discovery request per second per peer
+    QuiesceTimePoint next_discovery = info.last_seen + std::chrono::seconds(1);
+    if (info.last_seen == QuiesceClock::zero() || next_discovery < QuiesceClock::now()) {
+      // send a discovery request to unknown peers
+      dout(5) << " sending a discovery request to " << peer << dendl;
+      membership.send_listing_to(peer, QuiesceDbListing(membership.epoch));
+      info.last_seen = QuiesceClock::now();
+      next_discovery = info.last_seen + std::chrono::seconds(1);
+    }
+    // schedule a wakeup for the next discovery retry
+    QuiesceTimeInterval next_discovery_at_age = next_discovery - db.time_zero;
+
+    next_event_at_age = std::min(next_event_at_age, next_discovery_at_age);
+  }
+
+  // true if all peers are known
+  return unknown_peers.empty();
+}
+
+QuiesceTimeInterval QuiesceDbManager::leader_upkeep(decltype(pending_acks)&& acks, decltype(pending_requests)&& requests)
+{
+  // record peer acks
+  while (!acks.empty()) {
+    auto& [from, diff_map] = acks.front();
+    leader_record_ack(from, std::move(diff_map));
+    acks.pop();
+  }
+
+  // process requests
+  while (!requests.empty()) {
+    auto req_ctx = requests.front();
+    int result = leader_process_request(req_ctx);
+    if (result != EBUSY) {
+      done_requests[req_ctx] = result;
+    }
+    requests.pop_front();
+  }
+
+  QuiesceTimeInterval next_db_event_at_age = leader_upkeep_db();
+  QuiesceTimeInterval next_await_event_at_age = leader_upkeep_awaits();
+
+  return std::min(next_db_event_at_age, next_await_event_at_age);
+}
+
+// Completes every request collected in `done_requests`, attaching the
+// current db state to the response when we are the leader: the requested
+// set if one was named, or all sets for a query. Result codes are stored
+// positive and negated on complete(), per the errno convention.
+void QuiesceDbManager::complete_requests() {
+  for (auto [req, res]: done_requests) {
+    auto & r = req->response;
+    r.clear();
+    if (membership.leader == membership.me) {
+      r.db_age = db.get_age();
+      r.db_version = db_version();
+
+      if (req->request.set_id) {
+        Db::Sets::const_iterator it = db.sets.find(*req->request.set_id);
+        if (it != db.sets.end()) {
+          r.sets.emplace(*it);
+        }
+      } else if (req->request.is_query()) {
+        for (auto && it : std::as_const(db.sets)) {
+          r.sets.emplace(it);
+        }
+      }
+    }
+    // non-zero result codes are all errors
+    req->complete(-res);
+  }
+  done_requests.clear();
+}
+
+void QuiesceDbManager::leader_record_ack(QuiesceInterface::PeerId from, QuiesceMap&& diff_map)
+{
+  auto it = peers.find(from);
+
+  if (it == peers.end()) {
+    // ignore updates from unknown peers
+    return;
+  }
+
+  auto & info = it->second;
+
+  if (diff_map.db_version > db_version()) {
+    dout(3) << "ignoring unknown version ack by rank " << from << " (" << diff_map.db_version << " > " << db_version() << ")" << dendl;
+    dout(5) << "will send the peer a full DB" << dendl;
+    info.diff_map.reset();
+  } else {
+    info.diff_map = std::move(diff_map);
+    info.last_seen = QuiesceClock::now();
+  }
+}
+
+// Produces a short random lowercase-hex token (used for unique set ids).
+static std::string random_hex_string() {
+  std::random_device seed;
+  std::mt19937 gen(seed());
+  const auto value = gen();
+  return fmt::format("{:x}", value);
+}
+
+bool QuiesceDbManager::sanitize_roots(QuiesceDbRequest::Roots& roots)
+{
+  static const std::string file_scheme = "file";
+  static const std::string inode_scheme = "inode";
+  static const std::unordered_set<std::string> supported_schemes { file_scheme, inode_scheme };
+  QuiesceDbRequest::Roots result;
+  for (auto &root : roots) {
+    auto parsed_uri = boost::urls::parse_uri_reference(root);
+    if (!parsed_uri) {
+      dout(2) << "Couldn't parse root '" << root << "' as URI (error: " << parsed_uri.error() << ")" << dendl;
+      return false;
+    }
+
+    boost::url root_url = parsed_uri.value();
+    root_url.normalize();
+
+    if (!root_url.has_scheme()) {
+      root_url.set_scheme(file_scheme);
+    } else if (!supported_schemes.contains(root_url.scheme())) {
+      dout(2) << "Unsupported root URL scheme '" << root_url.scheme() << "'" << dendl;
+      return false;
+    }
+
+    if (root_url.has_authority()) {
+      auto auth_str = root_url.authority().buffer();
+      bool ok_remove = false;
+      if (auth_str == membership.fs_name) {
+        ok_remove = true;
+      } else {
+        try {
+          ok_remove = std::stoll(auth_str) == membership.fs_id;
+        } catch (...) { }
+      }
+      if (ok_remove) {
+        // OK, but remove the authority for now
+        // we may want to enforce it if we decide to keep a single database for all file systems
+        dout(10) << "Removing the fs name or id '" << auth_str << "' from the root url authority section" << dendl;
+        root_url.remove_authority();
+      } else {
+        dout(2) << "The root url '" << root_url.buffer() 
+          << "' includes an authority section '" << auth_str 
+          << "' which doesn't match the fs id (" << membership.fs_id 
+          << ") or name ('" << membership.fs_name << "')" << dendl;
+        return false;
+      }
+    }
+
+    std::string sanitized_path;
+    sanitized_path.reserve(root_url.path().size());
+    // deal with the file path
+    //  * make it absolute (start with a slash)
+    //  * remove repeated slashes
+    //  * remove the trailing slash
+    bool skip_slash = true;
+    for (auto&& c : root_url.path()) {
+      if (c != '/' || !skip_slash) {
+        sanitized_path.push_back(c);
+      }
+      skip_slash = c == '/';
+    }
+
+    if (sanitized_path.size() > 0 && sanitized_path.back() == '/') {
+      sanitized_path.pop_back();
+    }
+
+    if (root_url.scheme() == file_scheme) {
+      sanitized_path.insert(sanitized_path.begin(), '/');
+    } else if (root_url.scheme() == inode_scheme) {
+      uint64_t inodeno = 0;
+      try {
+        inodeno = std::stoull(sanitized_path);
+      } catch (...) { }
+
+      if (!inodeno || fmt::format("{}", inodeno) != sanitized_path) {
+        dout(2) << "Root '" << root << "' does not encode a vaild inode number" << dendl;
+        return false;
+      }
+    }
+
+    root_url.set_path(sanitized_path);
+
+    if (root_url.buffer() != root) {
+      dout(10) << "Normalized root '" << root << "' to '" << root_url.buffer() << "'" << dendl;
+    }
+    result.insert(root_url.buffer());
+  }
+  roots.swap(result);
+  return true;
+}
+
+int QuiesceDbManager::leader_process_request(RequestContext* req_ctx)
+{
+  QuiesceDbRequest &request = req_ctx->request;
+
+  if (!request.is_valid()) {
+    dout(2) << "rejecting an invalid request" << dendl;
+    return EINVAL;
+  }
+
+  if (!sanitize_roots(request.roots)) {
+    dout(2) << "failed to sanitize roots for a request" << dendl;
+    return EINVAL;
+  }
+
+  const auto db_age = db.get_age();
+
+  if (request.is_cancel_all()) {
+    dout(3) << "WARNING: got a cancel all request" << dendl;
+    // special case - reset all
+    // this will only work on active sets
+    for (auto &[set_id, set]: db.sets) {
+      if (set.is_active()) {
+        bool did_update = false;
+        for (auto&& [_, member]: set.members) {
+          did_update |= !member.excluded;
+          member.excluded = true;
+        }
+
+        ceph_assert(did_update);
+        ceph_assert(set.rstate.update(QS_CANCELED, db_age));
+        set.version = db.set_version+1;
+      }
+    }
+    return 0;
+  }
+
+  // figure out the set to update
+  auto set_it = db.sets.end();
+
+  if (request.set_id) {
+    set_it = db.sets.find(*request.set_id);
+  } else if (request.if_version > 0) {
+    dout(2) << "can't expect a non-zero version (" << *request.if_version << ") for a new set" << dendl;
+    return EINVAL;
+  }
+
+  if (set_it == db.sets.end()) {
+    if (request.includes_roots() && request.if_version <= 0) {
+      // such requests may introduce a new set
+      if (!request.set_id) {
+        // we should generate a unique set id
+        QuiesceSetId new_set_id;
+        do {
+          new_set_id = random_hex_string();
+        } while (db.sets.contains(new_set_id));
+        // update the set_id in the request so that we can
+        // later know which set got created
+        request.set_id.emplace(std::move(new_set_id));
+      }
+      set_it = db.sets.emplace(*request.set_id, QuiesceSet()).first;
+    } else if (request.is_mutating() || request.await) {
+      ceph_assert(request.set_id.has_value());
+      dout(2) << "coudn't find set with id '" << *request.set_id <<  "'" << dendl;
+      return ENOENT;
+    }
+  }
+
+  if (set_it != db.sets.end()) {
+    auto& [set_id, set] = *set_it;
+
+    int result = leader_update_set(*set_it, request);
+    if (result != 0) {
+      return result;
+    }
+
+    if (request.await) {
+      // this check may have a false negative for a quiesced set
+      // that will be released in another request in the same batch
+      // in that case, this await will be enqueued but then found and completed
+      // with the same error in `leader_upkeep_awaits`
+      if ((set.is_releasing() || set.is_released()) && !request.is_release()) {
+        dout(2) << dset("can't quiesce-await a set that was released (") << set.rstate.state << ")" << dendl;
+        return EPERM;
+      }
+
+      auto expire_at_age = interval_saturate_add(db_age, *request.await);
+      awaits.emplace(std::piecewise_construct,
+          std::forward_as_tuple(set_id),
+          std::forward_as_tuple(expire_at_age, req_ctx));
+      // let the caller know that the request isn't done yet
+      return EBUSY;
+    }
+  }
+
+  // if we got here it must be a success
+  return 0;
+}
+
+// Applies a single (pre-validated) request to one set: release, reset,
+// timeout/expiration updates, and root inclusion/exclusion. Bumps the set
+// version on any change and interrupts pending awaits when the member
+// list changes. Returns 0 on success or a positive errno.
+int QuiesceDbManager::leader_update_set(Db::Sets::value_type& set_it, const QuiesceDbRequest& request)
+{
+  auto & [set_id, set] = set_it;
+  // optimistic concurrency: the caller may pin the expected set version
+  if (request.if_version && set.version != *request.if_version) {
+    dout(10) << dset("is newer than requested (") << *request.if_version << ") " << dendl;
+    return ESTALE;
+  }
+
+  if (!request.is_mutating()) {
+    return 0;
+  }
+
+  bool did_update = false;
+  bool did_update_roots = false;
+
+  if (request.is_release()) {
+    // the release command is allowed in states
+    // quiesced, releasing, released
+    switch (set.rstate.state) {
+      case QS_QUIESCED:
+        // we only update the state to RELEASING,
+        // and not the age. This is to keep counting
+        // towards the quiesce expiration.
+        // TODO: this could be reconsidered, but would
+        // then probably require an additional timestamp
+        set.rstate.state = QS_RELEASING;
+        did_update = true;
+        dout(15) << dset("") << "updating state to: " << set.rstate.state << dendl;
+        // deliberate fall-through: releasing/released are no-ops here
+      case QS_RELEASING:
+      case QS_RELEASED:
+        break;
+      default:
+        dout(2) << dset("can't release in the state: ") << set.rstate.state << dendl;
+        return EPERM;
+    }
+  } else {
+    const auto db_age = db.get_age();
+    bool reset = false;
+
+    if (!request.is_reset()) {
+      // only active or new sets can be modified
+      if (!set.is_active() && set.version > 0) {
+        dout(2) << dset("rejecting modification in the terminal state: ") << set.rstate.state << dendl;
+        return EPERM;
+      } else if (request.includes_roots() && set.is_releasing()) {
+        dout(2) << dset("rejecting new roots in the QS_RELEASING state") << dendl;
+        return EPERM;
+      }
+    } else {
+      // a reset request can be used to resurrect a set from whichever state it's in now
+      if (set.rstate.state > QS_QUIESCED) {
+        dout(5) << dset("reset back to a QUIESCING state") << dendl;
+        did_update = set.rstate.update(QS_QUIESCING, db_age);
+        ceph_assert(did_update);
+        reset = true;
+      }
+    }
+
+    if (request.timeout) {
+      set.timeout = *request.timeout;
+      did_update = true;
+    }
+
+    if (request.expiration) {
+      set.expiration = *request.expiration;
+      did_update = true;
+    }
+
+    size_t included_count = 0;
+    QuiesceState min_member_state = QS__MAX;
+
+    // pass 1: apply exclusions/reset to the existing members and compute
+    // the minimum effective state of those that remain included
+    for (auto& [root, info] : set.members) {
+      if (request.should_exclude(root)) {
+        did_update_roots |= !info.excluded;
+        info.excluded = true;
+      } else if (!info.excluded) {
+        included_count ++;
+
+        QuiesceState effective_member_state;
+
+        if (reset) {
+          dout(5) << dsetroot("reset back to a QUIESCING state") << dendl;
+          info.rstate.state = QS_QUIESCING;
+          info.rstate.at_age = db_age;
+          did_update_roots = true;
+          effective_member_state = info.rstate.state;
+        } else {
+          QuiesceState min_reported_state;
+          QuiesceState max_reported_state;
+          size_t reporting_peers = check_peer_reports(set_id, set, root, info, min_reported_state, max_reported_state);
+
+          // only trust the aggregate when every peer has reported
+          // and none of them reported a failure
+          if (reporting_peers == peers.size() && max_reported_state < QS__FAILURE) {
+            effective_member_state = set.get_effective_member_state(min_reported_state);
+          } else {
+            effective_member_state = info.rstate.state;
+          }
+        }
+
+        min_member_state = std::min(min_member_state, effective_member_state);
+      }
+    }
+
+    // pass 2: add (or re-include) the roots named by the request
+    if (request.includes_roots()) {
+      for (auto const& root : request.roots) {
+        auto const& [member_it, emplaced] = set.members.try_emplace(root, db_age);
+        auto& [_, info] = *member_it;
+        if (emplaced || info.excluded) {
+          info.excluded = false;
+          did_update_roots = true;
+          included_count++;
+          info.rstate = { QS_QUIESCING, db_age };
+          min_member_state = std::min(min_member_state, QS_QUIESCING);
+        }
+      }
+    }
+
+    did_update |= did_update_roots;
+
+    if (included_count == 0) {
+      dout(20) << dset("cancelled due to 0 included members") << dendl;
+      did_update = set.rstate.update(QS_CANCELED, db_age);
+      ceph_assert(did_update);
+    } else if (min_member_state < QS__MAX) {
+      auto next_state = set.next_state(min_member_state);
+      if (did_update |= set.rstate.update(next_state, db_age)) {
+        dout(15) << dset("updated to match the min state of the remaining (") << included_count << ") members: " << set.rstate.state << dendl;
+      }
+    }
+  }
+
+  if (did_update) {
+    dout(20) << dset("updating version from ") << set.version << " to " << db.set_version + 1 << dendl;
+    set.version = db.set_version + 1;
+    if (did_update_roots) {
+      // any awaits pending on this set must be interrupted
+      // NB! even though the set may be QUIESCED now, it could only
+      // get there due to exclusion of quiescing roots, which is
+      // not a valid way to successfully await a set, hence EINTR
+      // However, if the set had all roots removed then we
+      // should respond in ECANCELED to notify that no more await
+      // attempts will be permitted
+      auto range = awaits.equal_range(set_id);
+      int rc = EINTR;
+      if (!set.is_active()) {
+        ceph_assert(set.rstate.state == QS_CANCELED);
+        rc = ECANCELED;
+      }
+      for (auto it = range.first; it != range.second; it++) {
+        done_requests[it->second.req_ctx] = rc;
+      }
+      if (range.first != range.second) {
+        dout(10) << dset("interrupting awaits with rc = ") << rc << " due to a change in members" << dendl;
+      }
+      awaits.erase(range.first, range.second);
+    }
+  }
+
+  return 0;
+}
+
+// Runs per-set upkeep over the whole db, advances db.set_version to the
+// maximum set version, and replicates every modified set to each peer
+// whose acked version is behind. Returns the age of the nearest future
+// set event (timeout/expiration).
+QuiesceTimeInterval QuiesceDbManager::leader_upkeep_db()
+{
+  std::map<QuiesceInterface::PeerId, std::deque<std::reference_wrapper<Db::Sets::value_type>>> peer_updates;
+
+  QuiesceTimeInterval next_event_at_age = QuiesceTimeInterval::max();
+  QuiesceSetVersion max_set_version = db.set_version;
+
+  for(auto & set_it: db.sets) {
+    auto & [set_id, set] = set_it;
+    auto next_set_event_at_age = leader_upkeep_set(set_it);
+
+    max_set_version = std::max(max_set_version, set.version);
+    next_event_at_age = std::min(next_event_at_age, next_set_event_at_age);
+
+    for(auto const & [peer, info]: peers) {
+      // update remote peers if their version is lower than this set's
+      // don't update myself
+      if (peer == membership.me) {
+        continue;
+      }
+      if (info.diff_map.db_version.set_version < set.version) {
+        peer_updates[peer].emplace_back(set_it);
+      }
+    }
+  }
+
+  db.set_version = max_set_version;
+
+  // update the peers
+  for (auto &[peer, sets]: peer_updates) {
+    QuiesceDbListing update;
+    update.db_age = db.get_age();
+    update.db_version = db_version();
+    std::ranges::copy(sets, std::inserter(update.sets, update.sets.end()));
+
+    dout(20) << "updating peer " << peer << " with " << sets.size() 
+      << " sets modified in db version range (" 
+      << peers[peer].diff_map.db_version << ".." << db.set_version << "]" << dendl;
+
+    auto rc = membership.send_listing_to(peer, std::move(update));
+    if (rc != 0) {
+      // the peer will be retried on the next upkeep while its
+      // acked version stays behind
+      dout(1) << "ERROR (" << rc << ") trying to replicate db version " 
+        << db.set_version << " with " << sets.size() 
+        << " sets to the peer " << peer << dendl;
+    }
+  }
+
+  return next_event_at_age;
+}
+
+// Maps the minimum member state onto the set's next state. The only
+// special case: a releasing set with fully quiesced members keeps
+// releasing rather than reverting to QS_QUIESCED.
+QuiesceState QuiesceSet::next_state(QuiesceState min_member_state) const {
+  ceph_assert(min_member_state > QS__INVALID);
+  ceph_assert(rstate.state < QS__TERMINAL);
+
+  if (min_member_state == QS_QUIESCED && is_releasing()) {
+    // keep releasing
+    return QS_RELEASING;
+  }
+
+  // otherwise, follow the member state
+  return min_member_state;
+}
+
+// Aggregates the latest peer reports for `root` into [min,max] reported
+// states and returns how many peers have acked a db version at least as
+// new as this set's version. Peers that were never bootstrapped
+// (set_version 0) are skipped; a peer with no entry for this root is
+// assumed to be at the set's requested member state. If no peer reported
+// at all, both min and max default to the requested state.
+// NOTE(review): the `member` parameter is not referenced by this body.
+size_t QuiesceDbManager::check_peer_reports(const QuiesceSetId& set_id, const QuiesceSet& set, const QuiesceRoot& root, const QuiesceSet::MemberInfo& member, QuiesceState& min_reported_state, QuiesceState& max_reported_state) {
+  min_reported_state = QS__MAX;
+  max_reported_state = QS__INVALID;
+
+  size_t up_to_date_peers = 0;
+
+  for (auto& [peer, info] : peers) {
+    // we consider the last bit of information we had from a given peer
+    // however, we want to skip peers which haven't been bootstrapped yet
+    if (info.diff_map.db_version.set_version == 0) {
+      continue;
+    }
+    auto dit = info.diff_map.roots.find(root);
+    QuiesceState reported_state = set.get_requested_member_state();
+
+    if (dit != info.diff_map.roots.end()) {
+      // the peer has something to say about this root
+      auto const& pr_state = dit->second;
+      if (!pr_state.is_valid()) {
+        dout(5) << dsetroot("ignoring an invalid peer state ") << pr_state.state << dendl;
+        continue;
+      }
+      reported_state = pr_state.state;
+    }
+
+    // but we only consider the peer up to date given the version
+    if (info.diff_map.db_version >= QuiesceDbVersion { membership.epoch, set.version }) {
+      up_to_date_peers++;
+    }
+
+    min_reported_state = std::min(min_reported_state, reported_state);
+    max_reported_state = std::max(max_reported_state, reported_state);
+  }
+
+  // no reports at all: fall back to the requested state
+  if (min_reported_state == QS__MAX) {
+    min_reported_state = set.get_requested_member_state();
+    max_reported_state = set.get_requested_member_state();
+  }
+
+  return up_to_date_peers;
+}
+
+// Leader-side upkeep of a single quiesce set: folds the peer-reported member
+// states into the set, propagates member failures/timeouts to the set level,
+// completes awaits that became resolvable, and enforces the quiesce timeout
+// and expiration timers.
+// Returns the db age at which the next timed event for this set is due,
+// or QuiesceTimeInterval::max() if the set needs no timed attention.
+QuiesceTimeInterval QuiesceDbManager::leader_upkeep_set(Db::Sets::value_type& set_it)
+{
+  auto& [set_id, set] = set_it;
+
+  if (!set.is_active()) {
+    return QuiesceTimeInterval::max();
+  }
+
+  QuiesceTimeInterval end_of_life = QuiesceTimeInterval::max();
+
+  const auto db_age = db.get_age();
+  // no quiescing could have started before the current db_age
+
+  QuiesceState min_member_state = QS__MAX;
+  size_t included_members = 0;
+  // for each included member, apply recorded acks and check quiesce timeouts
+  for (auto& [root, member] : set.members) {
+    if (member.excluded) {
+      continue;
+    }
+    included_members++;
+
+    QuiesceState min_reported_state;
+    QuiesceState max_reported_state;
+
+    size_t reporting_peers = check_peer_reports(set_id, set, root, member, min_reported_state, max_reported_state);
+    auto effective_state = set.get_effective_member_state(min_reported_state);
+
+    if (max_reported_state >= QS__FAILURE) {
+      // if at least one peer is reporting a failure state then move to it
+      dout(5) << dsetroot("reported by at least one peer as: ") << max_reported_state << dendl;
+      if (member.rstate.update(max_reported_state, db_age)) {
+        dout(15) << dsetroot("updating member state to ") << member.rstate.state << dendl;
+        set.version = db.set_version + 1;
+      }
+    } else if (effective_state < member.rstate.state) {
+      // someone has reported a rollback state for the root
+      dout(15) << dsetroot("reported by at least one peer as ") << min_reported_state << " vs. the expected " << member.rstate.state << dendl;
+      if (member.rstate.update(effective_state, db_age)) {
+        dout(15) << dsetroot("updating member state to ") << member.rstate.state << dendl;
+        set.version = db.set_version + 1;
+      }
+    } else if (reporting_peers == peers.size()) {
+      // only advance the member state when ALL peers have reported this root
+      dout(20) << dsetroot("min reported state for all (") << reporting_peers << ") peers: " << min_reported_state 
+          << ". Effective state: " << effective_state << dendl;
+      if (member.rstate.update(effective_state, db_age)) {
+        dout(15) << dsetroot("updating member state to ") << member.rstate.state << dendl;
+        set.version = db.set_version + 1;
+      }
+    }
+
+    if (member.is_quiescing()) {
+      // the quiesce timeout applies in this case
+      auto timeout_at_age = interval_saturate_add(member.rstate.at_age, set.timeout);
+      if (timeout_at_age <= db_age) {
+        // NB: deliberately not changing the member state
+        dout(10) << dsetroot("detected a member quiesce timeout") << dendl;
+        ceph_assert(set.rstate.update(QS_TIMEDOUT, db_age));
+        set.version = db.set_version + 1;
+        // the set has reached a terminal state; no need to look at the rest
+        break;
+      }
+      end_of_life = std::min(end_of_life, timeout_at_age);
+    } else if (member.is_failed()) {
+      // if at least one member is in a failure state
+      // then the set must receive it as well
+      dout(5) << dsetroot("propagating the terminal member state to the set level: ") << member.rstate.state << dendl;
+      ceph_assert(set.rstate.update(member.rstate.state, db_age));
+      set.version = db.set_version + 1;
+      break;
+    }
+
+    min_member_state = std::min(min_member_state, member.rstate.state);
+  }
+
+  // the loop above may have moved the set into a terminal state
+  // (QS_TIMEDOUT or a member failure), in which case we're done
+  if (!set.is_active()) {
+    return QuiesceTimeInterval::max();
+  }
+
+  // we should have at least one included member to be active
+  ceph_assert(included_members > 0);
+  auto next_state = set.next_state(min_member_state);
+
+  if (set.rstate.update(next_state, db_age)) {
+    set.version = db.set_version + 1;
+    dout(15) << dset("updated set state to match member reports: ") << set.rstate.state << dendl;
+  }
+
+  if (set.is_quiesced() || set.is_released()) {
+    // any awaits pending on this set should be completed now,
+    // before the set may enter a QS_EXPIRED state
+    // due to a zero expiration timeout.
+    // this could be used for barriers.
+    auto range = awaits.equal_range(set_id);
+    for (auto it = range.first; it != range.second; it++) {
+      done_requests[it->second.req_ctx] = 0;
+      if (set.is_quiesced()) {
+        // since we've just completed a _quiesce_ await
+        // we should also reset the recorded age of the QUIESCED state
+        // to postpone the expiration time checked below
+        set.rstate.at_age = db_age;
+        set.version = db.set_version + 1;
+        dout(20) << dset("reset quiesced state age upon successful await") << dendl;
+      }
+    }
+    awaits.erase(range.first, range.second);
+  }
+
+  // check timeouts:
+  if (set.is_quiescing()) {
+    // sanity check that we haven't missed this before
+    ceph_assert(end_of_life > db_age);
+  } else if (set.is_active()) {
+    auto expire_at_age = interval_saturate_add(set.rstate.at_age, set.expiration);
+    if (expire_at_age <= db_age) {
+      // we have expired
+      ceph_assert(set.rstate.update(QS_EXPIRED, db_age));
+      set.version = db.set_version + 1;
+    } else {
+      end_of_life = std::min(end_of_life, expire_at_age);
+    }
+  }
+
+  return end_of_life;
+}
+
+// Leader-side upkeep of pending awaits: completes every await whose set has
+// reached a decidable state (or whose own await timeout has elapsed) and
+// returns the db age of the earliest remaining await expiration.
+QuiesceTimeInterval QuiesceDbManager::leader_upkeep_awaits()
+{
+  QuiesceTimeInterval next_event_at_age = QuiesceTimeInterval::max();
+  for (auto it = awaits.begin(); it != awaits.end();) {
+    auto & [set_id, actx] = *it;
+    Db::Sets::const_iterator set_it = db.sets.find(set_id);
+
+    // EBUSY means "keep waiting"; an await that outlived its own expiration
+    // completes with EINPROGRESS
+    int rc = db.get_age() >= actx.expire_at_age ? EINPROGRESS : EBUSY;
+
+    if (set_it == db.sets.cend()) {
+      rc = ENOENT;
+    } else {
+      auto const & set = set_it->second;
+
+      // NB: the QUIESCED and RELEASED cases below deliberately fall through
+      // into the corresponding "-ING" case to run its validation
+      switch(set.rstate.state) {
+        case QS_CANCELED:
+          rc = ECANCELED;
+          break;
+        case QS_EXPIRED:
+        case QS_TIMEDOUT:
+          rc = ETIMEDOUT; 
+          break;
+        case QS_QUIESCED:
+          rc = 0; // fallthrough
+        case QS_QUIESCING:
+          // only a quiesce await may be pending in these states
+          ceph_assert(!actx.req_ctx->request.is_release());
+          break;
+        case QS_RELEASED:
+          rc = 0; // fallthrough
+        case QS_RELEASING:
+          if (!actx.req_ctx->request.is_release()) {
+            // technically possible for a quiesce await
+            // to get here if a concurrent release request
+            // was submitted in the same batch;
+            // see the corresponding check in
+            // `leader_process_request`
+            rc = EPERM;
+          }
+          break;
+        case QS_FAILED:
+          rc = EBADF;
+          break;
+        default: ceph_abort("unexpected quiesce set state");
+      }
+    }
+
+    if (rc != EBUSY) {
+      dout(10) << "completing an await for the set '" << set_id << "' with rc: " << rc << dendl;
+      done_requests[actx.req_ctx] = rc;
+      it = awaits.erase(it);
+    } else {
+      next_event_at_age = std::min(next_event_at_age, actx.expire_at_age);
+      ++it;
+    }
+  }
+  return next_event_at_age;
+}
+
+// Computes the remaining lifetime (TTL) of a root within the given set,
+// relative to the current db age: the set's expiration timer when the set is
+// quiesced/releasing, otherwise the quiesce timeout (counted from the
+// member's own timer when it is quiescing). Returns zero when the deadline
+// has already passed.
+static QuiesceTimeInterval get_root_ttl(const QuiesceSet & set, const QuiesceSet::MemberInfo &member, QuiesceTimeInterval db_age) {
+
+  QuiesceTimeInterval end_of_life = db_age;
+
+  if (set.is_quiesced() || set.is_releasing()) {
+    end_of_life = set.rstate.at_age + set.expiration;
+  } else if (set.is_active()) {
+    auto age = db_age; // taking the upper bound here
+    if (member.is_quiescing()) {
+      // we know that this member is on a timer
+      age = member.rstate.at_age;
+    }
+    end_of_life = age + set.timeout; 
+  }
+
+  if (end_of_life > db_age) {
+    return end_of_life - db_age;
+  } else {
+    return QuiesceTimeInterval::zero();
+  }
+}
+
+// Builds the quiesce map handed to the agents: one entry per included root of
+// every active set, carrying the requested state (QUIESCING or RELEASING) and
+// the minimum of the state/ttl across all sets that reference the root.
+// Overwrites `map` completely and stamps it with the current db version.
+void QuiesceDbManager::calculate_quiesce_map(QuiesceMap &map)
+{
+  map.roots.clear();
+  map.db_version = db_version();
+  auto db_age = db.get_age();
+
+  for(auto & [set_id, set]: db.sets) {
+    if (set.is_active()) {
+      // we only report active sets;
+      for(auto & [root, member]: set.members) {
+        if (member.excluded) {
+          continue;
+        }
+
+        // for a quiesce map, we want to report active roots as either QUIESCING or RELEASING
+        // this is to make sure that clients always have a reason to report back and confirm
+        // the quiesced state.
+        auto requested = set.get_requested_member_state();
+        auto ttl = get_root_ttl(set, member, db_age);
+        auto root_it = map.roots.try_emplace(root, QuiesceMap::RootInfo { requested, ttl }).first;
+
+        // the min below resolves conditions when members representing the same root have different state/ttl
+        // e.g. if at least one member is QUIESCING then the root should be QUIESCING
+        root_it->second.state = std::min(root_it->second.state, requested);
+        root_it->second.ttl = std::min(root_it->second.ttl, ttl);
+      }
+    }
+  }
+}
diff --git a/src/mds/QuiesceDbManager.h b/src/mds/QuiesceDbManager.h
new file mode 100644 (file)
index 0000000..a20b0cf
--- /dev/null
@@ -0,0 +1,303 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#pragma once
+#include "mds/QuiesceDb.h"
+#include "include/Context.h"
+#include <memory>
+#include <functional>
+#include <mutex>
+#include <condition_variable>
+#include <set>
+#include <queue>
+
+// Allow mds_gid_t to be used as a key of std::unordered_* containers by
+// hashing its underlying uint64_t value.
+template <>
+struct std::hash<mds_gid_t> {
+  size_t operator()(const mds_gid_t& gid) const
+  {
+    return hash<uint64_t> {}(gid);
+  }
+};
+
+// A snapshot of the quiesce cluster configuration as seen by one manager
+// instance: the epoch, the identity of this peer and of the leader, the full
+// member set, and the two messaging callbacks used to reach the peers.
+// Delivered to the manager via QuiesceDbManager::update_membership.
+struct QuiesceClusterMembership {
+  static const QuiesceInterface::PeerId INVALID_MEMBER;
+
+  epoch_t epoch = 0;
+  fs_cluster_id_t fs_id = FS_CLUSTER_ID_NONE;
+  std::string fs_name;
+
+  // this instance's id and the current leader; either may be INVALID_MEMBER
+  QuiesceInterface::PeerId me = INVALID_MEMBER;
+  QuiesceInterface::PeerId leader = INVALID_MEMBER;
+  std::set<QuiesceInterface::PeerId> members;
+
+  // A courier interface to decouple from the messaging layer
+  // Failures can be ignored, manager will call this repeatedly if needed
+  QuiesceInterface::DbPeerUpdate send_listing_to;
+  QuiesceInterface::AgentAck send_ack;
+
+  bool is_leader() const { return leader == me && me != INVALID_MEMBER; }
+};
+
+// The central quiesce-db actor. A single instance implements both the leader
+// and the replica roles, selected by the membership it is given through
+// update_membership(). The submit_* methods are the thread-safe entry points
+// used by clients, peers and the local agent: they enqueue work under
+// submit_mutex and wake the internal db thread, which exclusively owns all
+// state below the "managed by the quiesce db thread" divider.
+class QuiesceDbManager {
+  public:
+
+    // A client request paired with the listing that will be filled in as the
+    // response; completed (Context::complete) with an errno-style rc.
+    struct RequestContext : public Context {
+      QuiesceDbRequest request;
+      QuiesceDbListing response;
+    };
+
+    QuiesceDbManager() : quiesce_db_thread(this) {};
+    virtual ~QuiesceDbManager()
+    {
+      // an empty membership resets the manager state;
+      // NOTE(review): presumably this also winds down the db thread --
+      // the thread shutdown isn't visible in this header, confirm in the .cc
+      update_membership({});
+    }
+
+    // This will reset the manager state
+    // according to the new cluster config
+    void update_membership(const QuiesceClusterMembership& new_membership) {
+      update_membership(new_membership, nullptr);
+    }
+    void update_membership(const QuiesceClusterMembership& new_membership, RequestContext* inject_request);
+
+    // ============================
+    // quiesce db leader interface: 
+    //    -> EPERM unless this is the leader
+    
+    // client interface to the DB
+    // NOTE(review): the section comment above promises EPERM, but this
+    // returns -ENOTTY when we aren't the leader; the unit test's
+    // TestRequestContext::start also compares against (positive) EPERM.
+    // These three should be made to agree.
+    int submit_request(RequestContext* ctx) {
+      std::lock_guard l(submit_mutex);
+
+      if (!cluster_membership || !cluster_membership->is_leader()) {
+        return -ENOTTY;
+      }
+
+      pending_requests.push_back(ctx);
+      submit_condition.notify_all();
+      return 0;
+    }
+    // acks the messaging system
+    int submit_ack_from(QuiesceInterface::PeerId sender, const QuiesceMap& diff_map) {
+      std::lock_guard l(submit_mutex);
+
+      if (!cluster_membership || !cluster_membership->is_leader()) {
+        return -EPERM;
+      }
+
+      // reject acks from peers outside of the current membership
+      if (!cluster_membership->members.contains(sender)) {
+        return -ESTALE;
+      }
+
+      pending_acks.push({ sender, diff_map });
+      submit_condition.notify_all();
+      return 0;
+    }
+    
+    // =============================
+    // quiesce db replica interface:
+    //    -> EPERM if this is the leader
+
+    // process an incoming listing from a leader
+    int submit_listing_from(QuiesceInterface::PeerId sender, QuiesceDbListing&& listing) {
+      std::lock_guard l(submit_mutex);
+
+      if (!cluster_membership) {
+        return -EPERM;
+      }
+
+      // only accept listings stamped with the current epoch
+      if (cluster_membership->epoch != listing.db_version.epoch) {
+        return -ESTALE;
+      }
+
+      pending_db_updates.push({sender, std::move(listing)});
+      submit_condition.notify_all();
+      return 0;
+    }
+
+    // =============================
+    // Quiesce Agent interface:
+
+    // Accepts a diff map from the local agent. Delivered locally when this
+    // instance is the leader, otherwise forwarded via send_ack outside of
+    // the lock to avoid holding submit_mutex across the messaging layer.
+    int submit_agent_ack(QuiesceMap&& diff_map)
+    {
+      std::unique_lock l(submit_mutex);
+      if (!cluster_membership) {
+        return -EPERM;
+      }
+
+      if (cluster_membership->leader == cluster_membership->me) {
+        // local delivery
+        pending_acks.push({ cluster_membership->me, std::move(diff_map) });
+        submit_condition.notify_all();
+      } else {
+        // send to the leader outside of the lock
+        auto send_ack = cluster_membership->send_ack;
+        l.unlock();
+        send_ack(std::move(diff_map));
+      }
+      return 0;
+    }
+
+    // The agent's notification hook: `notify` is called with a quiesce map
+    // once the db version exceeds `if_newer`; it returns whether the map
+    // was accepted by the agent.
+    struct AgentCallback {
+      using Notify = std::function<bool(QuiesceMap&)>;
+      Notify notify;
+      QuiesceDbVersion if_newer = {0, 0};
+
+      AgentCallback(const Notify &notify, QuiesceDbVersion if_newer = {0, 0})
+          : notify(notify)
+          , if_newer(if_newer)
+      {
+      }
+    };
+
+    std::optional<AgentCallback> reset_agent_callback(AgentCallback::Notify notify, QuiesceDbVersion if_newer = {0, 0}) {
+      return reset_agent_callback(AgentCallback(notify, if_newer));
+    }
+
+    // Installs (or clears) the agent callback; returns the previous one.
+    std::optional<AgentCallback> reset_agent_callback(std::optional<AgentCallback> callback_if_newer = std::nullopt)
+    {
+      std::lock_guard ls(submit_mutex);
+      std::lock_guard lc(agent_mutex);
+      agent_callback.swap(callback_if_newer);
+      if (agent_callback) {
+        submit_condition.notify_all();
+      }
+      return callback_if_newer;
+    }
+
+    // Rearms the currently installed callback with a new version threshold.
+    std::optional<AgentCallback> reset_agent_callback(QuiesceDbVersion if_newer)
+    {
+      std::lock_guard ls(submit_mutex);
+      std::lock_guard lc(agent_mutex);
+      if (agent_callback) {
+        agent_callback->if_newer = if_newer;
+        submit_condition.notify_all();
+      }
+      return agent_callback;
+    }
+
+    std::optional<AgentCallback> get_agent_callback() const
+    {
+      std::lock_guard lc(agent_mutex);
+      return agent_callback;
+    }
+
+  protected:
+    // guards cluster_membership and the pending_* queues below;
+    // taken before agent_mutex whenever both are held (see reset_agent_callback)
+    mutable std::mutex submit_mutex;
+    // guards agent_callback
+    mutable std::mutex agent_mutex;
+    std::condition_variable submit_condition;
+
+    std::optional<AgentCallback> agent_callback;
+    std::optional<QuiesceClusterMembership> cluster_membership;
+    std::queue<std::pair<QuiesceInterface::PeerId, QuiesceDbListing>> pending_db_updates;
+    std::queue<std::pair<QuiesceInterface::PeerId, QuiesceMap>> pending_acks;
+    std::deque<RequestContext*> pending_requests;
+
+    // The worker thread that runs quiesce_db_thread_main().
+    class QuiesceDbThread : public Thread {
+      public:
+        explicit QuiesceDbThread(QuiesceDbManager* qm)
+            : qm(qm)
+        {
+        }
+        void* entry() override
+        {
+          return qm->quiesce_db_thread_main();
+        }
+
+      private:
+        QuiesceDbThread* = nullptr; // NOTE(review): (doc aid only, see below)
+        QuiesceDbManager* qm;
+    } quiesce_db_thread;
+
+    // =============================================
+    // The below is managed by the quiesce db thread
+
+    // the database.
+    struct Db {
+      QuiesceTimePoint time_zero;
+      QuiesceSetVersion set_version = 0;
+      using Sets = std::unordered_map<QuiesceSetId, QuiesceSet>;
+      Sets sets;
+
+      // db "ages" are durations measured from time_zero
+      QuiesceTimeInterval get_age() const {
+        return QuiesceClock::now() - time_zero;
+      }
+      void reset() { 
+        set_version = 0; 
+        sets.clear();
+        time_zero = QuiesceClock::now();
+      }
+    } db;
+
+    QuiesceDbVersion db_version() const { return {membership.epoch, db.set_version}; }
+
+    QuiesceClusterMembership membership;
+
+    // per-peer replication/ack bookkeeping maintained by the leader
+    struct PeerInfo {
+        QuiesceMap diff_map;
+        QuiesceTimePoint last_seen;
+        PeerInfo(QuiesceMap&& diff_map, QuiesceTimePoint last_seen)
+            : diff_map(diff_map)
+            , last_seen(last_seen)
+        {
+        }
+        PeerInfo() { }
+    };
+    std::unordered_map<QuiesceInterface::PeerId, PeerInfo> peers;
+
+    // a pending await: the request and the db age at which it expires
+    struct AwaitContext {
+      QuiesceTimeInterval expire_at_age = QuiesceTimeInterval::zero();
+      RequestContext* req_ctx = nullptr;
+      AwaitContext(QuiesceTimeInterval exp, RequestContext* r)
+          : expire_at_age(exp)
+          , req_ctx(r)
+      {
+      }
+    };
+    // multiple awaits may be active per set
+    std::unordered_multimap<QuiesceSetId, AwaitContext> awaits;
+    std::unordered_map<RequestContext*, int> done_requests;
+
+    void* quiesce_db_thread_main();
+
+    void db_thread_enter() {
+      // this will invalidate the membership, see membership_upkeep()
+      membership.epoch = 0;
+      peers.clear();
+      awaits.clear();
+      done_requests.clear();
+      db.reset();
+    }
+
+    void db_thread_exit() {
+      complete_requests();
+    }
+
+    bool db_thread_has_work() const;
+
+    bool membership_upkeep();
+
+    QuiesceTimeInterval replica_upkeep(decltype(pending_db_updates)&& db_updates);
+    bool leader_bootstrap(decltype(pending_db_updates)&& db_updates, QuiesceTimeInterval &next_event_at_age);
+    QuiesceTimeInterval leader_upkeep(decltype(pending_acks)&& acks, decltype(pending_requests)&& requests);
+    
+
+    void leader_record_ack(QuiesceInterface::PeerId from, QuiesceMap&& diff_map);
+    int leader_process_request(RequestContext* req_ctx);
+    bool sanitize_roots(QuiesceDbRequest::Roots &roots);
+    int leader_update_set(Db::Sets::value_type& set_it, const QuiesceDbRequest& req);
+    QuiesceTimeInterval leader_upkeep_set(Db::Sets::value_type& set_it);
+    QuiesceTimeInterval leader_upkeep_db();
+    QuiesceTimeInterval leader_upkeep_awaits();
+
+    size_t check_peer_reports(const QuiesceSetId& set_id, const QuiesceSet& set, const QuiesceRoot& root, const QuiesceSet::MemberInfo& member, QuiesceState& min_reported_state, QuiesceState& max_reported_state);
+
+    void calculate_quiesce_map(QuiesceMap &map);
+
+    void complete_requests();
+};
\ No newline at end of file
index 857b205e19662fc58473896f56725fd3e3e98ce5..39b47bb2da4d056c5007d081525cb36662ce2b14 100644 (file)
@@ -14,3 +14,13 @@ add_executable(unittest_mds_sessionfilter
 add_ceph_unittest(unittest_mds_sessionfilter)
 target_link_libraries(unittest_mds_sessionfilter mds osdc ceph-common global ${BLKID_LIBRARIES})
 
+# unittest_mds_quiesce_db
+add_executable(unittest_mds_quiesce_db
+  TestQuiesceDb.cc
+  ../../../src/mds/QuiesceDbManager.cc
+  ../../../src/mds/BoostUrlImpl.cc
+  $<TARGET_OBJECTS:unit-main>
+)
+add_ceph_unittest(unittest_mds_quiesce_db)
+target_link_libraries(unittest_mds_quiesce_db ceph-common global)
+
diff --git a/src/test/mds/TestQuiesceDb.cc b/src/test/mds/TestQuiesceDb.cc
new file mode 100644 (file)
index 0000000..e820db4
--- /dev/null
@@ -0,0 +1,1613 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2023 IBM, RedHat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include "mds/QuiesceDbManager.h"
+#include "gtest/gtest.h"
+#include "common/Cond.h"
+#include <ranges>
+#include <system_error>
+#include <thread>
+#include <queue>
+#include <functional>
+#include <algorithm>
+#include <iostream>
+#include <future>
+#include <list>
+#include <array>
+#include <utility>
+#include <cstdlib>
+#include "fmt/format.h"
+#include "common/debug.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds_quiesce
+#undef dout_prefix
+#define dout_prefix *_dout << "== test == "
+
+// A small wrapper around an errno-style code that normalizes the sign
+// (the absolute value is stored) and supports full comparison, so tests can
+// compare positive and negative error conventions interchangeably.
+struct GenericVerboseErrorCode {
+  int error_code;
+  GenericVerboseErrorCode(int error_code) : error_code(std::abs(error_code)) {}
+  auto operator<=>(const GenericVerboseErrorCode&) const = default;
+};
+
+// Streams the error as "<message>(<code>)", or "Success(0)" for a zero code,
+// making gtest failure output human-readable.
+template <class CharT, class Traits>
+static std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const GenericVerboseErrorCode& ec)
+{
+  if (0 == ec.error_code) {
+    return os << "Success(0)";
+  } else {
+    return os << std::generic_category().message(ec.error_code) << "(" << ec.error_code << ")";
+  }
+}; // NOTE(review): stray semicolon after the function body
+
+// Test fixture that runs a virtual cluster of QuiesceDbManager instances
+// (one per fake MDS gid), wiring their membership callbacks to each other
+// in-process so that no real messaging layer or MDS is required.
+class QuiesceDbTest: public testing::Test {
+  protected:
+    // Runs f(args...) on a detached thread; returns whether it finished
+    // within `timeout` (guards tests against hangs).
+    template <class _Rep = std::chrono::seconds::rep, class _Period = std::chrono::seconds::period, typename D = std::chrono::duration<_Rep, _Period>, class Function, class... Args>
+    static bool timed_run(D timeout, Function&& f, Args&&... args)
+    {
+      std::promise<void> done;
+      auto future = done.get_future();
+
+      auto job = std::bind(f, args...);
+
+      auto tt = std::thread([job = std::move(job)](std::promise<void> done) {
+        job();
+        done.set_value();
+      },
+          std::move(done));
+
+      tt.detach();
+
+      return future.wait_for(timeout) != std::future_status::timeout;
+    }
+    // Exposes the manager's protected internals for white-box assertions.
+    struct TestQuiesceDbManager: public QuiesceDbManager
+    {
+      using QuiesceDbManager::QuiesceDbManager;
+      using QuiesceDbManager::Db;
+      Db& internal_db() {
+        return db;
+      }
+      QuiesceClusterMembership& internal_membership() {
+        return membership;
+      }
+      decltype(pending_requests)& internal_pending_requests() {
+        return pending_requests;
+      }
+      decltype(awaits)& internal_awaits() {
+        return awaits;
+      }
+      decltype(peers)& internal_peers() {
+        return peers;
+      }
+    };
+
+    epoch_t epoch = 0;
+    std::map<QuiesceInterface::PeerId, std::unique_ptr<TestQuiesceDbManager>> managers;
+
+    // serialize all in-process "messaging" between the managers
+    std::mutex comms_mutex;
+    std::condition_variable comms_cond;
+
+    fs_cluster_id_t fs_id = 1;
+    std::string fs_name = "a";
+
+    std::unordered_map<QuiesceInterface::PeerId, QuiesceMap> latest_acks;
+    using AckHook = std::function<bool(QuiesceInterface::PeerId, QuiesceMap&)>;
+    std::list<std::pair<AckHook, std::promise<void>>> ack_hooks;
+
+    // Registers a predicate that is evaluated against every observed ack;
+    // the returned future becomes ready once the predicate matches
+    // (the hook is then removed).
+    std::future<void> add_ack_hook(AckHook&& predicate)
+    {
+      std::lock_guard l(comms_mutex);
+      auto &&[_, promise] = ack_hooks.emplace_back(predicate, std::promise<void> {});
+      return promise.get_future();
+    }
+
+    void SetUp() override {
+      // ten managers with gids 1..10; tests pick subsets via configure_cluster
+      for (QuiesceInterface::PeerId r = mds_gid_t(1); r < mds_gid_t(11); r++) {
+        managers[r].reset(new TestQuiesceDbManager());
+      }
+    }
+
+    void TearDown() override
+    {
+      dout(6) << "\n tearing down the cluster" << dendl;
+      // We want to cause the managers to destruct
+      // before we have the last_request destructed.
+      // We should remove entries from `managers` under the comms lock
+      // to avoid race with attempts of messaging between the managers.
+      // Then we actually clear the map, destructing the managers,
+      // outside the lock: the destruction will join the db threads
+      // which in turn might attempt to send a message
+      std::unique_lock l(comms_mutex);
+      auto mgrs = std::move(managers);
+      l.unlock();
+      mgrs.clear();
+    }
+
+    // (Re)configures the virtual cluster under a bumped epoch. The first
+    // entry of `leader_and_replicas` is the leader; every known manager
+    // (member or not) receives the new membership with in-process callbacks.
+    void configure_cluster(std::vector<QuiesceInterface::PeerId> leader_and_replicas = { mds_gid_t(1), mds_gid_t(2), mds_gid_t(3) })
+    {
+      ++epoch;
+      ASSERT_GE(leader_and_replicas.size(), 1);
+      std::set<QuiesceInterface::PeerId> members(leader_and_replicas.begin(), leader_and_replicas.end());
+      auto leader = leader_and_replicas[0];
+      for (const auto &[this_peer, mgr] : managers) {
+        QuiesceClusterMembership mem = {
+          epoch,
+          fs_id,
+          fs_name,
+          this_peer,
+          leader,
+          members,
+          // send_listing_to: leader -> replica db replication;
+          // dropped (-1) when the epoch changed or the recipient is gone
+          [epoch = this->epoch, this, leader, me = this_peer](auto recipient, auto listing) {
+            std::unique_lock l(comms_mutex);
+            if (epoch == this->epoch) {
+              if (this->managers.contains(recipient)) {
+                dout(10) << "listing from " << me << " (leader=" << leader << ") to " << recipient << " for version " << listing.db_version << " with " << listing.sets.size() << " sets" << dendl;
+                this->managers[recipient]->submit_listing_from(me, std::move(listing));
+                comms_cond.notify_all();
+                return 0;
+              }
+            }
+            return -1;
+          },
+          // send_ack: peer -> leader diff-map acknowledgement; also records
+          // the latest ack per peer and fires any matching ack hooks
+          [epoch = this->epoch, this, leader, me = this_peer](auto diff_map) {
+            std::unique_lock l(comms_mutex);
+            if (epoch == this->epoch) {
+              if (this->managers.contains(leader)) {
+                std::queue<std::promise<void>> done_hooks;
+                dout(10) << "ack from " << me << " to the leader (" << leader << ") for version " << diff_map.db_version << " with " << diff_map.roots.size() << " roots" << dendl;
+                auto [it, inserted] = latest_acks.insert({me, diff_map});
+                if (!inserted) {
+                  if (it->second.db_version == diff_map.db_version) {
+                    if (it->second.roots == diff_map.roots) {
+                      // NOTE(review): typo in the log message ("potentialy")
+                      dout(1) << "WARNING: detected a potentialy redundant ack" << dendl;
+                    }
+                  }
+                  it->second = diff_map;
+                }
+                for (auto it = ack_hooks.begin(); it != ack_hooks.end();) {
+                  if (it->first(me, diff_map)) {
+                    done_hooks.emplace(std::move(it->second));
+                    it = ack_hooks.erase(it);
+                  } else {
+                    it++;
+                  }
+                }
+                this->managers[leader]->submit_ack_from(me, std::move(diff_map));
+                comms_cond.notify_all();
+                l.unlock();
+                // complete the matched hook promises outside of the comms lock
+                while(!done_hooks.empty()) {
+                  done_hooks.front().set_value();
+                  done_hooks.pop();
+                }
+                return 0;
+              }
+            }
+            return -1;
+          }
+        };
+        mgr->update_membership(mem);
+      }
+      dout(6) << "\n === configured cluster with the following members, starting with the leader: " << leader_and_replicas << dendl;
+    }
+
+    // A request context that is also a C_SaferCond so tests can block on the
+    // completion and inspect the resulting rc and response listing.
+    struct TestRequestContext: public QuiesceDbManager::RequestContext, public C_SaferCond {
+      void finish(int r) override { C_SaferCond::finish(r); }
+      void complete(int r) override { C_SaferCond::complete(r); }
+
+      const QuiesceDbTest& parent;
+      TestRequestContext(const QuiesceDbTest& parent) : parent(parent) {}
+      ~TestRequestContext() {
+        wait();
+      }
+
+      // Builds the request via `c` and submits it to the first manager that
+      // accepts it (i.e. the current leader).
+      bool start(std::invocable<QuiesceDbRequest&> auto const & c)
+      {
+        done = false;
+        response.clear();
+        request.reset(c);
+
+        int rr = -1;
+
+        for (auto& [rank, mgr] : parent.managers) {
+          if (!(rr = mgr->submit_request(this))) {
+            break;
+          }
+        }
+
+        // NOTE(review): submit_request returns a negative errno (-ENOTTY)
+        // for non-leaders, so this positive-EPERM comparison can never
+        // match -- confirm the intended error convention
+        if (rr == EPERM) {
+          // change the error to something never returned for a request
+          // EPIPE seems reasonable as we couldn't find the leader to send the command to
+          complete(EPIPE);
+          return false;
+        }
+
+        return true;
+      }
+
+      // Non-blocking: the rc if done, otherwise the NA() sentinel.
+      GenericVerboseErrorCode check_result() {
+        std::unique_lock l{lock};
+        if (done) {
+          return ERR(rval);
+        }
+        // this error is never returned by the manager
+        return NA();
+      }
+
+      GenericVerboseErrorCode wait_result() {
+        return ERR(wait());
+      }
+
+      GenericVerboseErrorCode wait_result_for(double seconds)
+      {
+        return ERR(wait_for(seconds));
+      }
+    };
+
+    std::deque<std::unique_ptr<TestRequestContext>> requests;
+    std::unique_ptr<TestRequestContext> last_request;
+
+    // Agent callback that instantly acks every QUIESCING root as QUIESCED.
+    const QuiesceDbManager::AgentCallback::Notify QUIESCING_AGENT_CB = [](QuiesceMap& quiesce_map) {
+      dout(15) << "QUIESCING_AGENT_CB: notified with " << quiesce_map.roots.size() << " roots for version " << quiesce_map.db_version << dendl;
+      for (auto it = quiesce_map.roots.begin(); it != quiesce_map.roots.end();) {
+        switch (it->second.state) {
+        case QS_QUIESCING:
+          it->second.state = QS_QUIESCED;
+          dout(10) << "QUIESCING_AGENT_CB: reporting '" << it->first << "' as " << it->second.state << dendl;
+          it++;
+          break;
+        default:
+          it = quiesce_map.roots.erase(it);
+          break;
+        }
+      }
+      return true;
+    };
+
+    // Agent callback that instantly reports every QUIESCING root as FAILED.
+    const QuiesceDbManager::AgentCallback::Notify FAILING_AGENT_CB = [](QuiesceMap& quiesce_map) {
+      dout(15) << "FAILING_AGENT_CB: notified with " << quiesce_map.roots.size() << " roots for version " << quiesce_map.db_version << dendl;
+      for (auto it = quiesce_map.roots.begin(); it != quiesce_map.roots.end();) {
+        switch (it->second.state) {
+        case QS_QUIESCING:
+          it->second.state = QS_FAILED;
+          dout(10) << "FAILING_AGENT_CB: reporting '" << it->first << "' as " << it->second.state << dendl;
+          it++;
+          break;
+        default:
+          it = quiesce_map.roots.erase(it);
+          break;
+        }
+      }
+      return true;
+    };
+
+    // Agent callback that never acks anything (returns false).
+    const QuiesceDbManager::AgentCallback::Notify SILENT_AGENT_CB = [](QuiesceMap& quiesce_map) {
+      dout(15) << "SILENT_AGENT_CB: nacking quiesce map version " << quiesce_map.db_version << " with " << quiesce_map.roots.size() << " roots" << dendl;
+      return false;
+    };
+
+    // Submits a request built by `c` and blocks until it completes.
+    GenericVerboseErrorCode
+    run_request(std::invocable<QuiesceDbRequest&> auto const& c)
+    {
+      last_request.reset(new TestRequestContext(*this));
+      last_request->start(c);
+      return ERR(last_request->wait());
+    }
+
+    // Same as run_request but gives up after `seconds`.
+    GenericVerboseErrorCode
+    run_request_for(double seconds, std::invocable<QuiesceDbRequest&> auto const& c)
+    {
+      last_request.reset(new TestRequestContext(*this));
+      last_request->start(c);
+      return ERR(last_request->wait_for(seconds));
+    }
+
+    // Submits without waiting; the context is retained in `requests`.
+    TestRequestContext& start_request(std::invocable<QuiesceDbRequest&> auto const& c)
+    {
+      auto &ptr = requests.emplace_back(new TestRequestContext(*this));
+      ptr->start(c);
+      return *ptr;
+    }
+
+    TestQuiesceDbManager::Db& db(QuiesceInterface::PeerId peer) {
+      return managers[peer]->internal_db();
+    }
+
+    static GenericVerboseErrorCode ERR(int val) {
+      return GenericVerboseErrorCode(val);
+    }
+    static GenericVerboseErrorCode OK()
+    {
+      return ERR(0);
+    }
+    // "not available" sentinel; see the comment in check_result()
+    static GenericVerboseErrorCode NA() {
+      return ERR(EBUSY);
+    }
+
+    static QuiesceTimeInterval sec(double val) {
+      return std::chrono::duration_cast<QuiesceTimeInterval>(std::chrono::duration<double>(val));
+    }
+};
+
+/* ================================================================ */
+// Smoke test: a single-member cluster completes a no-op request, and so does
+// the cluster after switching the leader gid and after a two-member config.
+TEST_F(QuiesceDbTest, ManagerStartup) {
+  ASSERT_NO_FATAL_FAILURE(configure_cluster({ mds_gid_t(1) }));
+  ASSERT_EQ(OK(), run_request_for(100, [](auto& r) {}));
+  ASSERT_NO_FATAL_FAILURE(configure_cluster({ mds_gid_t(2) }));
+  ASSERT_EQ(OK(), run_request_for(100, [](auto& r) {}));
+  ASSERT_NO_FATAL_FAILURE(configure_cluster({ mds_gid_t(1), mds_gid_t(2) }));
+  ASSERT_EQ(OK(), run_request_for(100, [](auto& r) {}));
+}
+
+/* ================================================================ */
+// Covers the set-creation paths: named vs. anonymous sets, reset_roots vs.
+// include_roots, and the `if_version` preconditions for both new and
+// existing sets.
+TEST_F(QuiesceDbTest, SetCreation) {
+  ASSERT_NO_FATAL_FAILURE(configure_cluster({ mds_gid_t(1) }));
+
+  // create a named set by resetting roots
+  ASSERT_EQ(OK(), run_request([](auto& r) {
+    r.set_id = "set0";
+    r.reset_roots({"root1"});
+  }));
+
+  // the set must have timed out immediately since we haven't configured
+  // the expiration timeout.
+  ASSERT_TRUE(last_request->response.sets.contains("set0"));
+  EXPECT_EQ(QS_TIMEDOUT, last_request->response.sets.at("set0").rstate.state);
+  EXPECT_TRUE(db(mds_gid_t(1)).sets.contains(*last_request->request.set_id));
+
+  // create a named set by including roots
+  ASSERT_EQ(OK(), run_request([](auto& r) {
+    r.set_id = "set1";
+    r.include_roots({"root1"});
+  }));
+
+  // the set must have timed out immediately since we haven't configured
+  // the expiration timeout. 
+  ASSERT_TRUE(last_request->response.sets.contains("set1"));
+  EXPECT_EQ(QS_TIMEDOUT, last_request->response.sets.at("set1").rstate.state);
+  EXPECT_TRUE(db(mds_gid_t(1)).sets.contains(*last_request->request.set_id));
+
+  // create a new unique set by including roots
+  EXPECT_EQ(OK(), run_request([](auto& r) {
+    r.include_roots({"root2"});
+  }));
+
+  // the manager must have filled the set id with a unique value
+  ASSERT_TRUE(last_request->request.set_id.has_value());
+  EXPECT_TRUE(db(mds_gid_t(1)).sets.contains(*last_request->request.set_id));
+
+  // create a new unique set by resetting roots
+  EXPECT_EQ(OK(), run_request([](auto& r) {
+    r.reset_roots({"root2"});
+  }));
+
+  // the manager must have filled the set id with a unique value
+  ASSERT_TRUE(last_request->request.set_id.has_value());
+  EXPECT_TRUE(db(mds_gid_t(1)).sets.contains(*last_request->request.set_id));
+
+  // prevent modification of a named set when a new set is desired
+  // (if_version == 0 demands that the set doesn't exist yet)
+  EXPECT_EQ(ERR(ESTALE), run_request([](auto& r) {
+    r.set_id = "set1";
+    r.if_version = 0;
+    r.roots.emplace("root3");
+  }));
+  EXPECT_EQ(1, last_request->response.sets.size());
+  EXPECT_TRUE(last_request->response.sets.contains("set1"));
+
+  EXPECT_EQ(OK(), run_request([](auto& r) {
+    r.set_id = "set2";
+    r.if_version = 0;
+    r.roots.emplace("root4");
+  }));
+
+  EXPECT_EQ(1, last_request->response.sets.size());
+  EXPECT_TRUE(last_request->response.sets.contains("set2"));
+  EXPECT_EQ(QS_TIMEDOUT, last_request->response.sets.at("set2").rstate.state);
+
+  // let's try to create a new named but expect it to have non-zero version
+  EXPECT_EQ(ERR(ENOENT), run_request([](auto& r) {
+    r.set_id = "set3";
+    r.if_version = 1;
+    r.roots.emplace("root4");
+  }));
+
+  EXPECT_EQ(0, last_request->response.sets.size());
+
+  // let's try to create a new anonymous but expect it to have non-zero version
+  EXPECT_EQ(ERR(EINVAL), run_request([](auto& r) {
+    r.if_version = 2;
+    r.roots.emplace("root4");
+  }));
+
+  EXPECT_EQ(0, last_request->response.sets.size());
+
+  // an empty string is a valid set id.
+  EXPECT_EQ(OK(), run_request([](auto& r) {
+    r.set_id = "";
+    r.roots.emplace("root1");
+  }));
+}
+
/// Builds the two "trivial" values of an std::optional<T>: the disengaged
/// nullopt and an engaged, default-constructed T. Used to enumerate the
/// interesting states of optional request fields in cartesian tests.
template<class T>
constexpr
std::array<std::optional<T>, 2> nullopt_and_default() {
  return std::array<std::optional<T>, 2> {
    std::optional<T> {},        // disengaged
    std::optional<T> { T {} }   // engaged with the default value
  };
}
+
/// Invokes `func` once for every element of the cartesian product of the
/// given arrays, i.e. for every combination picking one element from each
/// array. The first array varies fastest. `func` must return a value
/// convertible to bool; returning false stops the iteration early.
template<class F, class... V, size_t... S>
void cartesian_apply(F func, std::array<V, S> const & ... array_args) {
  // inspired by https://stackoverflow.com/a/31169617/5171225

  // checked via static_assert (rather than a `requires` clause) so the
  // diagnostic is clear and the template stays valid pre-C++20; note that
  // the arguments are actually passed by const reference below
  static_assert(std::is_invocable_v<F, V const&...>,
      "func must be invocable with one element from every array");

  // the iteration count is a product of all array sizes
  const long long N = (static_cast<long long>(S) * ...);

  for (long long n = 0; n < N; ++n) {
    std::lldiv_t q { n, 0 };

    // we use parameter pack expansion as part of the brace initializer
    // to perform sequential calculation of the per-array indices:
    // braced-init-list elements are evaluated left to right, so every step
    // consumes the quotient left over by the previous array, effectively
    // treating `n` as a mixed-radix number with one digit per array
    auto apply_tuple = std::tuple<V const &...> {
      (q = std::lldiv(q.quot, static_cast<long long>(array_args.size())), array_args.at(q.rem))
      ...
    };

    if (!std::apply(func, apply_tuple)) {
      return;
    }
  }
}
+
/// Prints every argument to stdout, each followed by a single space, then
/// ends the line. Implemented with a C++17 fold expression, which (unlike
/// the original zero-length dummy-array trick) is well-formed for an empty
/// pack and produces no unused-variable warnings: calling coutall() with
/// no arguments just prints the newline.
template<class... Args>
void coutall(Args&&... args) {
  ((std::cout << args << " "), ...);
  std::cout << std::endl;
}
+
+TEST_F(QuiesceDbTest, QuiesceRequestValidation)
+{
+
+  auto checkRequest = [](
+    decltype(std::declval<QuiesceDbRequest>().control.roots_op) const& op,
+    decltype(std::declval<QuiesceDbRequest>().set_id)           const& set_id,
+    decltype(std::declval<QuiesceDbRequest>().if_version)       const& if_version,
+    decltype(std::declval<QuiesceDbRequest>().timeout)          const& timeout,
+    decltype(std::declval<QuiesceDbRequest>().expiration)       const& expiration,
+    decltype(std::declval<QuiesceDbRequest>().await)            const& await,
+    decltype(std::declval<QuiesceDbRequest>().roots)            const& roots) {
+      QuiesceDbRequest r;
+      r.control.roots_op = op;
+      r.set_id = set_id;
+      r.if_version = if_version;
+      r.timeout = timeout;
+      r.expiration = expiration;
+      r.await = await;
+      r.roots = roots;
+
+      if (op >= QuiesceDbRequest::RootsOp::__INVALID) {
+        EXPECT_FALSE(r.is_valid())
+          << "op: " << r.op_string() << ", set_id: " << bool(set_id) 
+          << ", if_version: " << bool(if_version) 
+          << ", timeout: " << bool(timeout) << ", expiration: " 
+          << bool(expiration) << ", await: " 
+          << bool(await) << ", roots.size(): " << roots.size();
+      } else {
+        // if set id is provided, all goes
+        if (set_id) {
+          EXPECT_TRUE(r.is_valid())
+            << "op: " << r.op_string() << ", set_id: " << bool(set_id) 
+            << ", if_version: " << bool(if_version) 
+            << ", timeout: " << bool(timeout) << ", expiration: " 
+            << bool(expiration) << ", await: " 
+            << bool(await) << ", roots.size(): " << roots.size();
+        } else {
+          // without the set id we can create a new set
+          // or perform operations on all sets
+          if (roots.size() > 0) {
+            // if roots are provided, we assume creation
+            // all combinations are valid unless it's an exclude,
+            // which doesn't make sense without a set id
+            EXPECT_NE(r.is_exclude(), r.is_valid())
+              << "op: " << r.op_string() << ", set_id: " << bool(set_id) 
+              << ", if_version: " << bool(if_version) 
+              << ", timeout: " << bool(timeout) << ", expiration: " 
+              << bool(expiration) << ", await: " 
+              << bool(await) << ", roots.size(): " << roots.size();
+          } else {
+            // means it's a query or a "cancel all"
+            // no other parameters should be set
+            if (if_version || timeout || expiration || await) {
+              EXPECT_FALSE(r.is_valid())
+                << "op: " << r.op_string() << ", set_id: " << bool(set_id) 
+                << ", if_version: " << bool(if_version) 
+                << ", timeout: " << bool(timeout) << ", expiration: " 
+                << bool(expiration) << ", await: " 
+                << bool(await) << ", roots.size(): " << roots.size();
+            } else {
+              EXPECT_NE(r.is_release(), r.is_valid())
+                << "op: " << r.op_string() << ", set_id: " << bool(set_id) 
+                << ", if_version: " << bool(if_version) 
+                << ", timeout: " << bool(timeout) << ", expiration: " 
+                << bool(expiration) << ", await: " 
+                << bool(await) << ", roots.size(): " << roots.size();
+            }
+          }
+        }
+      }
+
+      return !testing::Test::HasFailure();
+  };
+
+  const auto ops = std::array { QuiesceDbRequest::RootsOp::INCLUDE_OR_QUERY, QuiesceDbRequest::RootsOp::EXCLUDE_OR_RELEASE, QuiesceDbRequest::RootsOp::RESET_OR_CANCEL, QuiesceDbRequest::RootsOp::__INVALID };
+  const auto strings = nullopt_and_default<std::string>();
+  const auto versions = nullopt_and_default<QuiesceSetVersion>();
+  const auto intervals = nullopt_and_default<QuiesceTimeInterval>();
+  const auto roots = std::array { QuiesceDbRequest::Roots {}, QuiesceDbRequest::Roots { "root1" } };
+
+  cartesian_apply(checkRequest,
+      ops, strings, versions, intervals, intervals, intervals, roots);
+}
+
+/* ================================================================ */
+TEST_F(QuiesceDbTest, RootSanitization)
+{
+  ASSERT_NO_FATAL_FAILURE(configure_cluster({ mds_gid_t(1) }));
+  // a positive test with all kinds of expected fixes
+  ASSERT_EQ(OK(), run_request([this](auto& r) {
+    r.set_id = "set1";
+    r.include_roots({
+      "file:root1",
+      fmt::format("file://{}/root2", fs_id),
+      fmt::format("//{}/root3", fs_name),
+      fmt::format("inode://{}/4", fs_id),
+      fmt::format("inode://{}/5", fs_name),
+      "inode:18446744073709551615",
+      "inode:/18446744073709551614",
+      "inode:/18446744073709551613/",
+      "root6/.///./..////root6//"
+    });
+  }));
+
+  EXPECT_TRUE(db(mds_gid_t(1)).sets.at("set1").members.contains("file:/root1"));
+  EXPECT_TRUE(db(mds_gid_t(1)).sets.at("set1").members.contains("file:/root2"));
+  EXPECT_TRUE(db(mds_gid_t(1)).sets.at("set1").members.contains("file:/root3"));
+  EXPECT_TRUE(db(mds_gid_t(1)).sets.at("set1").members.contains("inode:4"));
+  EXPECT_TRUE(db(mds_gid_t(1)).sets.at("set1").members.contains("inode:5"));
+  EXPECT_TRUE(db(mds_gid_t(1)).sets.at("set1").members.contains("inode:18446744073709551615"));
+  EXPECT_TRUE(db(mds_gid_t(1)).sets.at("set1").members.contains("inode:18446744073709551614"));
+  EXPECT_TRUE(db(mds_gid_t(1)).sets.at("set1").members.contains("inode:18446744073709551613"));
+  EXPECT_TRUE(db(mds_gid_t(1)).sets.at("set1").members.contains("file:/root6/root6"));
+
+  EXPECT_EQ(ERR(EINVAL), run_request([](auto& r) {
+    r.include_roots({
+      "//10/root1"
+    });
+  }));
+  EXPECT_EQ(0, last_request->response.sets.size());
+
+  EXPECT_EQ(ERR(EINVAL), run_request([](auto& r) {
+    r.set_id = "badset";
+    r.include_roots({
+      "//badfsname/root1"
+    });
+  }));
+  EXPECT_EQ(0, last_request->response.sets.size());
+
+  EXPECT_EQ(ERR(EINVAL), run_request([](auto& r) {
+    r.set_id = "badset";
+    r.include_roots({
+      "inode://badfsname/1"
+    });
+  }));
+  EXPECT_EQ(0, last_request->response.sets.size());
+
+  EXPECT_EQ(ERR(EINVAL), run_request([](auto& r) {
+    r.set_id = "badset";
+    r.include_roots({
+      "inode:-4"
+    });
+  }));
+  EXPECT_EQ(0, last_request->response.sets.size());
+
+  EXPECT_EQ(ERR(EINVAL), run_request([](auto& r) {
+    r.set_id = "badset";
+    r.include_roots({
+      "inode:18446744073709551616" // too big to fit a uint64_t
+    });
+  }));
+  EXPECT_EQ(0, last_request->response.sets.size());
+
+  EXPECT_EQ(ERR(EINVAL), run_request([](auto& r) {
+    r.set_id = "badset";
+    r.include_roots({
+      "inode:1/2/3/4"
+    });
+  }));
+  EXPECT_EQ(0, last_request->response.sets.size());
+
+  EXPECT_EQ(ERR(EINVAL), run_request([](auto& r) {
+    r.set_id = "badset";
+    r.include_roots({
+      "inode:abcd"
+    });
+  }));
+  EXPECT_EQ(0, last_request->response.sets.size());
+
+  EXPECT_EQ(ERR(EINVAL), run_request([](auto& r) {
+    r.set_id = "badset";
+    r.include_roots({
+      "inode:123-456"
+    });
+  }));
+  EXPECT_EQ(0, last_request->response.sets.size());
+
+  EXPECT_EQ(ERR(EINVAL), run_request([](auto& r) {
+    r.set_id = "badset";
+    r.include_roots({
+      "inode:"
+    });
+  }));
+  EXPECT_EQ(0, last_request->response.sets.size());
+
+  EXPECT_EQ(ERR(EINVAL), run_request([](auto& r) {
+    r.set_id = "badset";
+    r.include_roots({
+      "inode:0" // zero is an invalid inodeno
+    });
+  }));
+  EXPECT_EQ(0, last_request->response.sets.size());
+}
+
+/* ================================================================ */
// Exercises membership modification of quiesce sets: include, exclude and
// reset of roots, version-bump semantics for no-op updates, and
// cancellation / resurrection of a set via reset.
TEST_F(QuiesceDbTest, SetModification)
{
  ASSERT_NO_FATAL_FAILURE(configure_cluster({ mds_gid_t(1) }));

  // create a named set by including roots
  ASSERT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set1";
    r.timeout = sec(60);
    r.expiration = sec(60);
    r.include_roots({"root1"});
  }));

  ASSERT_TRUE(db(mds_gid_t(1)).sets.contains("set1"));
  // the bare path was sanitized into the file:/ scheme
  ASSERT_TRUE(db(mds_gid_t(1)).sets.at("set1").members.contains("file:/root1"));

  // include more roots
  ASSERT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set1";
    r.include_roots({"root2", "root3"});
  }));

  ASSERT_TRUE(db(mds_gid_t(1)).sets.at("set1").members.contains("file:/root2"));
  ASSERT_TRUE(db(mds_gid_t(1)).sets.at("set1").members.contains("file:/root3"));
  ASSERT_EQ(db(mds_gid_t(1)).sets.at("set1").members.size(), 3);

  auto latest_v = last_request->response.sets.at("set1").version;

  // including present roots shouldn't bump the version
  ASSERT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set1";
    r.include_roots({ "root2", "root3" });
  }));

  ASSERT_EQ(latest_v, last_request->response.sets.at("set1").version);
  ASSERT_EQ(latest_v, db(mds_gid_t(1)).sets.at("set1").version);

  // resetting to the same roots shouldn't bump the version
  ASSERT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set1";
    r.reset_roots({ "root1","root2", "root3" });
  }));

  ASSERT_EQ(latest_v, last_request->response.sets.at("set1").version);
  ASSERT_EQ(latest_v, db(mds_gid_t(1)).sets.at("set1").version);

  // exclude roots
  ASSERT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set1";
    r.exclude_roots({ "root1", "root4" }); // root4 wasn't included, noop
  }));

  // the db doesn't delete set members, only marks them as excluded
  ASSERT_EQ(db(mds_gid_t(1)).sets.at("set1").members.size(), 3);
  ASSERT_TRUE(db(mds_gid_t(1)).sets.at("set1").members.at("file:/root1").excluded);
  ASSERT_FALSE(db(mds_gid_t(1)).sets.at("set1").members.at("file:/root2").excluded);
  ASSERT_FALSE(db(mds_gid_t(1)).sets.at("set1").members.at("file:/root3").excluded);
  // the noop exclude of root4 didn't create a member record for it
  ASSERT_FALSE(db(mds_gid_t(1)).sets.at("set1").members.contains("file:/root4"));

  // reset roots: members not listed in the reset become excluded
  ASSERT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set1";
    r.reset_roots({"root4"});
  }));

  ASSERT_EQ(db(mds_gid_t(1)).sets.at("set1").members.size(), 4);
  ASSERT_TRUE(db(mds_gid_t(1)).sets.at("set1").members.at("file:/root1").excluded);
  ASSERT_TRUE(db(mds_gid_t(1)).sets.at("set1").members.at("file:/root2").excluded);
  ASSERT_TRUE(db(mds_gid_t(1)).sets.at("set1").members.at("file:/root3").excluded);
  ASSERT_TRUE(db(mds_gid_t(1)).sets.at("set1").members.contains("file:/root4"));
  ASSERT_FALSE(db(mds_gid_t(1)).sets.at("set1").members.at("file:/root4").excluded);

  // reset is an including op, should allow creating a set with it
  ASSERT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set2";
    r.timeout = sec(60);
    r.expiration = sec(60);
    r.reset_roots({"root5"});
  }));

  ASSERT_FALSE(db(mds_gid_t(1)).sets.at("set2").members.at("file:/root5").excluded);

  // cancel with no set_id should cancel all active sets
  ASSERT_EQ(OK(), run_request([](auto& r) {
    r.control.roots_op = QuiesceDbRequest::RootsOp::RESET_OR_CANCEL;
  }));

  // cancellation excludes all members of the affected sets
  ASSERT_TRUE(db(mds_gid_t(1)).sets.at("set1").members.at("file:/root4").excluded);
  ASSERT_TRUE(db(mds_gid_t(1)).sets.at("set2").members.at("file:/root5").excluded);

  ASSERT_EQ(QuiesceState::QS_CANCELED, db(mds_gid_t(1)).sets.at("set1").rstate.state);
  ASSERT_EQ(QuiesceState::QS_CANCELED, db(mds_gid_t(1)).sets.at("set2").rstate.state);

  // reset can be used to resurrect a set from a terminal state
  ASSERT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set1";
    r.timeout = sec(60);
    r.expiration = sec(60);
    r.reset_roots({ "root5" });
  }));

  ASSERT_EQ(QuiesceState::QS_QUIESCING, db(mds_gid_t(1)).sets.at("set1").rstate.state);
}
+
+/* ================================================================ */
// Exercises the two set timers: `expiration` (after quiescing succeeded) and
// `timeout` (quiescing took too long), and resurrection from both terminal
// states via reset.
TEST_F(QuiesceDbTest, Timeouts) {
  ASSERT_NO_FATAL_FAILURE(configure_cluster({ mds_gid_t(1) }));

  // install the agent callback to reach the QUIESCED state
  managers.at(mds_gid_t(1))->reset_agent_callback(QUIESCING_AGENT_CB);

  ASSERT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set1";
    r.timeout = sec(0.1);
    r.expiration = sec(0.1);
    r.include_roots({"root1"});
    r.await = sec(1);
  }));

  ASSERT_EQ(QuiesceState::QS_QUIESCED, last_request->response.sets.at("set1").rstate.state);

  // outlive the expiration interval without touching the set
  std::this_thread::sleep_for(sec(0.15));

  ASSERT_EQ(QuiesceState::QS_EXPIRED, db(mds_gid_t(1)).sets.at("set1").rstate.state);

  // reset can be used to resurrect a set from a terminal state
  ASSERT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set1";
    r.reset_roots({ "root5" });
  }));
  ASSERT_EQ(QuiesceState::QS_QUIESCING, last_request->response.sets.at("set1").rstate.state);

  ASSERT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set2";
    r.timeout = sec(0.1);
    r.expiration = sec(0.1);
    r.include_roots({ "root1" });
    r.await = sec(1);
  }));

  ASSERT_EQ(QuiesceState::QS_QUIESCED, last_request->response.sets.at("set2").rstate.state);

  // prevent the db agent from acking the roots
  managers.at(mds_gid_t(1))->reset_agent_callback(SILENT_AGENT_CB);

  ASSERT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set2";
    r.release_roots();
  }));

  ASSERT_EQ(QuiesceState::QS_RELEASING, last_request->response.sets.at("set2").rstate.state);

  // a set stuck in RELEASING also expires
  std::this_thread::sleep_for(sec(0.15));

  ASSERT_EQ(QuiesceState::QS_EXPIRED, db(mds_gid_t(1)).sets.at("set2").rstate.state);

  // reset can be used to resurrect a set from a terminal state
  ASSERT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set2";
    r.reset_roots({ "root1" });
  }));
  ASSERT_EQ(QuiesceState::QS_QUIESCING, last_request->response.sets.at("set2").rstate.state);

  // with the agent still silent, a new set stays QUIESCING
  // until its quiesce timeout elapses
  ASSERT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set3";
    r.timeout = sec(0.1);
    r.include_roots({ "root1" });
  }));

  ASSERT_EQ(QuiesceState::QS_QUIESCING, db(mds_gid_t(1)).sets.at("set3").rstate.state);

  std::this_thread::sleep_for(sec(0.15));

  ASSERT_EQ(QuiesceState::QS_TIMEDOUT, db(mds_gid_t(1)).sets.at("set3").rstate.state);
  // reset can be used to resurrect a set from a terminal state
  ASSERT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set3";
    r.reset_roots({ "root1" });
  }));
  ASSERT_EQ(QuiesceState::QS_QUIESCING, last_request->response.sets.at("set3").rstate.state);
}
+
+/* ================================================================ */
// Verifies that an agent-reported failure of a root drives its set into
// QS_FAILED, and that quiesce-awaiting a new set over the failed root
// completes with EBADF.
TEST_F(QuiesceDbTest, Failures) {
  ASSERT_NO_FATAL_FAILURE(configure_cluster({ mds_gid_t(1) }));

  ASSERT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set1";
    r.timeout = sec(0.1);
    r.expiration = sec(0.1);
    r.include_roots({"root1"});
  }));

  EXPECT_EQ(QuiesceState::QS_QUIESCING, last_request->response.sets.at("set1").rstate.state);

  {
    // wait for the agent to ack root1 as failed
    auto did_ack = add_ack_hook([](auto rank, auto const& ack) {
      return ack.roots.contains("file:/root1") && ack.roots.at("file:/root1").state == QS_FAILED;
    });

    // allow acks
    managers.at(mds_gid_t(1))->reset_agent_callback(FAILING_AGENT_CB);

    EXPECT_EQ(std::future_status::ready, did_ack.wait_for(std::chrono::milliseconds(100)));
  }

  // kick the db with a simple query to observe the resulting state
  ASSERT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set1";
  }));

  EXPECT_EQ(QuiesceState::QS_FAILED, db(mds_gid_t(1)).sets.at("set1").rstate.state);
  EXPECT_EQ(QuiesceState::QS_FAILED, last_request->response.sets.at("set1").rstate.state);

  // awaiting a new set that includes the already-failed root
  // completes with EBADF
  ASSERT_EQ(ERR(EBADF), run_request([](auto& r) {
    r.set_id = "set2";
    r.timeout = sec(0.1);
    r.expiration = sec(0.1);
    r.include_roots({ "root1" });
    r.await = sec(1);
  }));
}
+
+/* ================================================================ */
// Verifies the lifecycle of pending quiesce-awaits: EINPROGRESS on await
// timeout, EINTR on membership change, ECANCELED when the set is emptied,
// and ETIMEDOUT delivered to every await of a timed-out set.
TEST_F(QuiesceDbTest, InterruptedQuiesceAwait)
{
  ASSERT_NO_FATAL_FAILURE(configure_cluster({ mds_gid_t(1) }));

  auto then = QuiesceClock::now();

  // await timeout should result in a EINPROGRESS given that the set
  // isn't modified in the meantime
  ASSERT_EQ(ERR(EINPROGRESS), run_request([](auto& r) {
    r.set_id = "set1";
    r.timeout = sec(100);
    r.roots.emplace("root1");
    r.await = sec(0.1);
  }));

  ASSERT_EQ(QuiesceState::QS_QUIESCING, db(mds_gid_t(1)).sets.at("set1").rstate.state);
  // the request must have waited at least the requested await interval
  ASSERT_GE(QuiesceClock::now() - then, *last_request->request.await);

  // start an asynchronous await request
  auto & await = start_request([](auto& r) {
    r.set_id = "set1";
    r.await = sec(100);
  });

  // flush the pending requests by running a simple query
  EXPECT_EQ(OK(), run_request([](auto& r) { r.query("set1"); }));

  // still running
  EXPECT_EQ(NA(), await.check_result());

  // modify the set but don't change roots
  ASSERT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set1";
    r.expiration = sec(100);
    r.timeout = sec(10);
    r.roots.emplace("root1");
  }));

  // should still be running
  EXPECT_EQ(NA(), await.check_result());

  // add another set
  then = QuiesceClock::now();
  ASSERT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set2";
    r.timeout = sec(0.1);
    r.roots.emplace("root1");
  }));

  // should still be running
  EXPECT_EQ(NA(), await.check_result());

  // modify roots
  ASSERT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set1";
    r.roots.emplace("root2");
  }));

  // a membership change interrupts the outstanding await
  EXPECT_EQ(ERR(EINTR), await.wait_result());

  // start async await on set2
  auto & await2 = start_request([](auto& r) {
    r.set_id = "set2";
    r.await = sec(100);
  });

  // should be running
  EXPECT_EQ(NA(), await2.check_result());

  // and another one, this time wait for it to finish
  ASSERT_EQ(ERR(ETIMEDOUT), run_request([](auto& r) {
    r.set_id = "set2";
    r.await = sec(100);
  }));

  // the other await on the same set must have finished with the same result
  EXPECT_EQ(ERR(ETIMEDOUT), await2.wait_result());

  // shouldn't have taken much longer than the timeout configured on the set
  auto epsilon = sec(0.01);
  ASSERT_LE(QuiesceClock::now() - then - epsilon, last_request->response.sets.at("set2").timeout);

  // let's cancel set 1 while awaiting it a few times

  // start async await on set1
  auto& await3 = start_request([](auto& r) {
    r.set_id = "set1";
    r.await = sec(100);
  });

  auto& await4 = start_request([](auto& r) {
    r.set_id = "set1";
    r.await = sec(100);
  });

  // should be running
  EXPECT_EQ(NA(), await3.check_result());
  EXPECT_EQ(NA(), await4.check_result());

  // an empty reset cancels the set
  ASSERT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set1";
    r.reset_roots({});
  }));

  EXPECT_EQ(ERR(ECANCELED), await3.wait_result());
  EXPECT_EQ(ERR(ECANCELED), await4.wait_result());

  // awaiting a set in a terminal state should immediately
  // complete with the corresponding error
  ASSERT_EQ(ERR(ECANCELED), run_request([](auto& r) {
    r.set_id = "set1";
    r.await = sec(100);
  }));
  ASSERT_EQ(ERR(ETIMEDOUT), run_request([](auto& r) {
    r.set_id = "set2";
    r.await = sec(100);
  }));
}
+
+/* ================================================================ */
// Verifies that every successful quiesce-await resets the expiration timer
// of a QUIESCED set, while release-awaits do not, so a stuck release
// eventually expires the set.
TEST_F(QuiesceDbTest, RepeatedQuiesceAwait) {
  ASSERT_NO_FATAL_FAILURE(configure_cluster({ mds_gid_t(1) }));

  // let us reach quiescing
  managers.at(mds_gid_t(1))->reset_agent_callback(QUIESCING_AGENT_CB);

  // pick an expiration timeout
  auto expiration = sec(0.1);

  // create a set and let it quiesce
  ASSERT_EQ(OK(), run_request([=](auto& r) {
    r.set_id = "set1";
    r.timeout = sec(0.1);
    r.expiration = expiration;
    r.roots.emplace("root1");
    r.await = QuiesceTimeInterval::max();
  }));

  EXPECT_EQ(QS_QUIESCED, last_request->response.sets.at("set1").rstate.state);

  // sleep for half the expiration interval multiple times
  // each time sending another await request
  // the expectation is that every time we call await
  // the expiration timer is reset, hence we should be able to
  // sustain the loop for arbitrarily long
  for (int i = 0; i < 10; i++) {
    std::this_thread::sleep_for(expiration/2);
    ASSERT_EQ(OK(), run_request([i](auto& r) {
      r.set_id = "set1";
      if (i % 2) {
        // this shouldn't affect anything
        r.reset_roots({"root1"});
      }
      r.await = sec(0);
    }));
  }

  // Prevent the set from reaching the RELEASED state
  managers.at(mds_gid_t(1))->reset_agent_callback(SILENT_AGENT_CB);

  // start releasing and observe that the timer isn't reset in this case,
  // so after a few EINPROGRESS we eventually reach timeout due to expiration
  for (int i = 0; i < 2; i++) {
    ASSERT_EQ(ERR(EINPROGRESS), run_request([=](auto& r) {
      r.set_id = "set1";
      r.release_roots();
      r.await = (expiration*2)/5;
    }));
  }

  // NB: the ETIMEDOUT is the await result, while the set itself should be EXPIRED
  EXPECT_EQ(ERR(ETIMEDOUT), run_request([=](auto& r) {
    r.set_id = "set1";
    r.release_roots();
    r.await = expiration;
  }));

  EXPECT_EQ(QS_EXPIRED, last_request->response.sets.at("set1").rstate.state);

  // awaiting an expired set reports the timeout immediately
  EXPECT_EQ(ERR(ETIMEDOUT), run_request([](auto& r) {
    r.set_id = "set1";
    r.await = sec(0.1);
  }));

  // an expired set can't be released
  EXPECT_EQ(ERR(EPERM), run_request([](auto& r) {
    r.set_id = "set1";
    r.release_roots();
  }));

}
+
+/* ================================================================ */
// Walks sets through the release workflow with pending release-awaits:
// EPERM before QUIESCED, EINTR on member exclusion, ECANCELED when the last
// root is excluded, and the agent view of a root shared between a releasing
// and a non-releasing set.
TEST_F(QuiesceDbTest, ReleaseAwait)
{
  ASSERT_NO_FATAL_FAILURE(configure_cluster({ mds_gid_t(1) }));

  // create some sets
  for (auto&& set_id : { "set1", "set2", "set3" }) {
    ASSERT_EQ(OK(), run_request([set_id](auto& r) {
      r.set_id = set_id;
      r.timeout = sec(100);
      r.expiration = sec(100);
      r.include_roots({ "root1", "root2"});
    })) << "creating " << set_id;
    EXPECT_EQ(QS_QUIESCING, last_request->response.sets.at(set_id).rstate.state);
  }

  // we shouldn't be able to release-await a quiescing set
  for (auto&& set_id : { "set1", "set2" }) {
    ASSERT_EQ(ERR(EPERM), run_request([set_id](auto& r) {
      r.set_id = set_id;
      r.release_roots();
      r.await = sec(1);
    })) << "bad release-await " << set_id;
  }

  managers.at(mds_gid_t(1))->reset_agent_callback(QUIESCING_AGENT_CB);

  // quiesce-await all sets so they become eligible for release
  for (auto&& set_id : { "set1", "set2", "set3" }) {
    ASSERT_EQ(OK(), run_request([set_id](auto& r) {
      r.set_id = set_id;
      r.await = sec(0.1);
    })) << "quiesce-await " << set_id;
    EXPECT_EQ(QS_QUIESCED, last_request->response.sets.at(set_id).rstate.state);
  }

  // keep the sets in RELEASING by suppressing agent acks
  managers.at(mds_gid_t(1))->reset_agent_callback(SILENT_AGENT_CB);

  auto & release_await1 = start_request([](auto &r) {
    r.set_id = "set1";
    r.release_roots();
    r.await = sec(100);
  });

  auto& release_await2 = start_request([](auto& r) {
    r.set_id = "set2";
    r.release_roots();
    r.await = sec(100);
  });

  // flush the pending requests with a no-op
  EXPECT_EQ(OK(), run_request([](auto &r){}));
  // releasing should be in progress
  EXPECT_EQ(NA(), release_await1.check_result());
  EXPECT_EQ(NA(), release_await2.check_result());
  EXPECT_EQ(QS_RELEASING, last_request->response.sets.at("set1").rstate.state);
  EXPECT_EQ(QS_RELEASING, last_request->response.sets.at("set2").rstate.state);
  auto releasing_v1 = last_request->response.sets.at("set1").version;

  // we can request release again without any version bump
  EXPECT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set1";
    r.release_roots();
  }));

  EXPECT_EQ(releasing_v1, last_request->response.sets.at("set1").version );

  // we can release-await with a short await timeout
  EXPECT_EQ(ERR(EINPROGRESS), run_request([](auto& r) {
    r.set_id = "set1";
    r.release_roots();
    r.await = sec(0.1);
  }));

  // we can't quiesce-await a set that's releasing
  EXPECT_EQ(ERR(EPERM), run_request([](auto& r) {
    r.set_id = "set1";
    r.await = sec(0.1);
  }));

  // shouldn't be able to add roots to a releasing set
  EXPECT_EQ(ERR(EPERM), run_request([](auto &r) {
    r.set_id = "set1";
    r.include_roots({"root3"});
  }));

  // still on the same set version
  EXPECT_EQ(releasing_v1, last_request->response.sets.at("set1").version );

  // it should be allowed to exclude roots from a releasing set
  EXPECT_EQ(OK(), run_request([](auto &r) {
    r.set_id = "set2";
    r.exclude_roots({"root2"});
  }));

  // the corresponding await must have been interrupted due to the change to the members
  EXPECT_EQ(ERR(EINTR), release_await2.wait_result_for(0.1));

  // still releasing
  EXPECT_EQ(QS_RELEASING, last_request->response.sets.at("set2").rstate.state);

  // await again
  auto& release_await22 = start_request([](auto& r) {
    r.set_id = "set2";
    r.release_roots();
    r.await = sec(100);
  });

  EXPECT_EQ(NA(), release_await22.check_result());

  // excluding the last root should cancel the set
  EXPECT_EQ(OK(), run_request([](auto &r) {
    r.set_id = "set2";
    r.exclude_roots({"root1"});
  }));

  EXPECT_EQ(ERR(ECANCELED), release_await22.wait_result_for(0.1));

  // observe the agent-side view of root1 via a custom callback
  std::atomic<QuiesceState> root1_state(QS__INVALID);
  managers.at(mds_gid_t(1))->reset_agent_callback([&](auto &map){
    if (map.roots.contains("file:/root1")) {
      root1_state = map.roots.at("file:/root1").state;
      root1_state.notify_all();
    }
    return false;
  });

  // validate that root1 is still reported to the agents as QUIESCING
  // even though we are already releasing set1
  // this is because there is another set with this root which is not releasing
  EXPECT_TRUE(timed_run(sec(0.1), [&](){root1_state.wait(QS__INVALID);}));
  EXPECT_EQ(QS_QUIESCING, root1_state.load());

  // allow acks
  managers.at(mds_gid_t(1))->reset_agent_callback(QUIESCING_AGENT_CB);
  EXPECT_EQ(OK(), release_await1.wait_result_for(0.1));

  EXPECT_EQ(QS_RELEASED, release_await1.response.sets.at("set1").rstate.state);

  // it should be OK to request release or release-await on a RELEASED set
  EXPECT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set1";
    r.release_roots();
  }));

  EXPECT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set1";
    r.release_roots();
    r.await = sec(0.1);
  }));

  // it's invalid to send a release without a set id
  EXPECT_EQ(ERR(EINVAL), run_request([](auto& r) {
    r.release_roots();
  }));
}
+
+/* ================================================================ */
// Verifies shutdown semantics: resetting the cluster membership completes
// already-registered awaits with EINPROGRESS and rejects requests that were
// still queued (never processed) with EPERM.
TEST_F(QuiesceDbTest, LeaderShutdown)
{
  ASSERT_NO_FATAL_FAILURE(configure_cluster({ mds_gid_t(1) }));

  ASSERT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set1";
    r.timeout = sec(60);
    r.expiration = sec(60);
    r.include_roots({ "root1" });
  }));

  ASSERT_EQ(OK(), run_request([](auto& r) {
    r.set_id = "set2";
    r.timeout = sec(60);
    r.expiration = sec(60);
    r.include_roots({ "root2", "root3"});
  }));

  std::queue<TestRequestContext*> outstanding_awaits;
  std::queue<TestRequestContext*> pending_requests;

  // let's have several awaits pending
  for(auto&& set_id: {"set1", "set2"}) {
    for (int i=0; i<2; i++) {
      outstanding_awaits.emplace(&start_request([set_id](auto&r) {
        r.set_id = set_id;
        r.await = sec(100);
      }));
      EXPECT_EQ(NA(), outstanding_awaits.front()->check_result());
    }
  }

  // flush the pending requests by running a simple query
  EXPECT_EQ(OK(), run_request([](auto& r) { r.query("set1"); }));

  // all of the awaits should now be registered with the manager
  ASSERT_EQ(outstanding_awaits.size(), managers.at(mds_gid_t(1))->internal_awaits().size());

  std::mutex agent_mutex;
  std::condition_variable agent_cond;
  bool callback_reached = false;

  // block the db thread with a malicious agent callback
  managers.at(mds_gid_t(1))->reset_agent_callback([&](auto& map) {
    std::unique_lock l(agent_mutex);
    callback_reached = true;
    agent_cond.notify_all();
    l.unlock();
    // keep the db thread busy so that new requests pile up in its queue
    std::this_thread::sleep_for(sec(0.1));
    return false;
  });

  {
    // wait until the db thread is inside the callback above
    std::unique_lock l(agent_mutex);
    agent_cond.wait(l, [&]{return callback_reached;});
  }

  // now that the db thread is sleeping we can pile up some pending requests
  pending_requests.emplace(&start_request([](auto& r) {
    r.set_id = "set3";
    r.include_roots({"root4"});
  }));
  EXPECT_EQ(NA(), pending_requests.front()->check_result());

  pending_requests.emplace(&start_request([](auto& r) {
    r.set_id = "set4";
    r.include_roots({"root5"});
  }));
  EXPECT_EQ(NA(), pending_requests.front()->check_result());

  pending_requests.emplace(&start_request([](auto& r) {
    r.set_id = "set1";
    r.await = sec(100);
  }));
  EXPECT_EQ(NA(), pending_requests.front()->check_result());

  ASSERT_EQ(managers.at(mds_gid_t(1))->internal_pending_requests().size(), pending_requests.size());

  // reset the membership of the manager
  // this will block until the db thread exits
  managers.at(mds_gid_t(1))->update_membership({});

  // as of now all requests must have finished:
  // the registered awaits complete with EINPROGRESS...
  while(!outstanding_awaits.empty()) {
    auto& r = *outstanding_awaits.front();
    EXPECT_EQ(ERR(EINPROGRESS), r.check_result());
    outstanding_awaits.pop();
  }

  // ...while the requests that never got processed are denied with EPERM
  while (!pending_requests.empty()) {
    auto& r = *pending_requests.front();
    EXPECT_EQ(ERR(EPERM), r.check_result());
    pending_requests.pop();
  }
}
+
+/* ================================================================ */
+// Quiesce-await on a 3-rank cluster: an await may only complete after
+// every peer has acked the root as QUIESCED. The expiration is
+// deliberately left unset (0), so the sets are reported as QS_EXPIRED
+// right after reaching QUIESCED, yet the awaits still succeed.
+TEST_F(QuiesceDbTest, MultiRankQuiesce)
+{
+  ASSERT_NO_FATAL_FAILURE(configure_cluster({  mds_gid_t(1), mds_gid_t(2), mds_gid_t(3) }));
+
+  std::vector<TestRequestContext*> awaits;
+
+  // create and await several sets
+  // we deliberately avoid setting the expiration timeout in this test,
+  // leaving it at 0 so the sets expire as soon as they quiesce
+  // (see the QS_EXPIRED checks at the bottom)
+  for (auto&& set_id: {"set1", "set2", "set3"}) {
+    awaits.emplace_back(&start_request([set_id](auto& r) {
+      r.set_id = set_id;
+      r.timeout = sec(100);
+      r.include_roots({"root1"});
+      r.await = sec(100);
+    }));
+  }
+
+  // flush the pending requests by running a simple query
+  ASSERT_EQ(OK(), run_request([](auto&r){r.query("set1");}));
+
+  // all three awaits must be parked inside the manager by now
+  ASSERT_EQ(awaits.size(), managers.at(mds_gid_t(1))->internal_awaits().size());
+
+  // none of the awaits may have completed yet - no peer has acked
+  for (auto&& await: awaits) {
+    EXPECT_EQ(NA(), await->check_result()) << await->request.set_id.value();
+  }
+
+  {
+    std::unordered_set<QuiesceInterface::PeerId> peers_quiesced;
+    // resolves once two distinct peers have acked root1 as QUIESCED;
+    // installed before enabling the agents so no ack can be missed
+    auto did_ack = add_ack_hook([&](auto p, auto const &m) {
+      if (m.roots.contains("file:/root1") && (m.roots.at("file:/root1").state == QS_QUIESCED)) {
+        peers_quiesced.insert(p);
+      }
+      return peers_quiesced.size() >= 2;
+    });
+
+    // let two of the three peers ack quiescing of the root
+    managers.at(mds_gid_t(1))->reset_agent_callback(QUIESCING_AGENT_CB);
+    managers.at(mds_gid_t(2))->reset_agent_callback(QUIESCING_AGENT_CB);
+
+    ASSERT_EQ(std::future_status::ready, did_ack.wait_for(std::chrono::milliseconds(100)));
+  }
+
+  // kick the db queue with a simple query
+  ASSERT_EQ(OK(), run_request([](auto& r) { r.query("set1"); }));
+
+  // should still be waiting for the last agent
+  EXPECT_EQ(QS_QUIESCING, last_request->response.sets.at("set1").rstate.state);
+  for (auto&& await: awaits) {
+    EXPECT_EQ(NA(), await->check_result()) << await->request.set_id.value();
+  }
+
+  {
+    // wait for the late peer (gid 3) to ack root1 as QUIESCED
+    auto did_ack = add_ack_hook([](auto gid, auto const& ack) {
+      return gid == mds_gid_t(3) && ack.roots.contains("file:/root1") && ack.roots.at("file:/root1").state == QS_QUIESCED;
+    });
+
+    // allow acks
+    managers.at(mds_gid_t(3))->reset_agent_callback(QUIESCING_AGENT_CB);
+
+    EXPECT_EQ(std::future_status::ready, did_ack.wait_for(std::chrono::milliseconds(100)));
+  }
+
+  // kick the db queue with a simple query
+  ASSERT_EQ(OK(), run_request([](auto& r) {}));
+
+  // first three sets must be expired because they had 0 expiration
+  EXPECT_EQ(QS_EXPIRED, last_request->response.sets.at("set1").rstate.state);
+  EXPECT_EQ(QS_EXPIRED, last_request->response.sets.at("set2").rstate.state);
+  EXPECT_EQ(QS_EXPIRED, last_request->response.sets.at("set3").rstate.state);
+
+  // pending quiesce requests must have all completed successfully
+  // even though some of the sets got expired immediately
+  for (auto&& await : awaits) {
+    EXPECT_EQ(OK(), await->check_result()) << await->request.set_id.value();
+  }
+}
+
+/* ================================================================ */
+// Release semantics on a 3-rank cluster: releasing advances the set
+// version and moves it to RELEASING; a repeated release is idempotent;
+// quiesce-await of a releasing set is rejected with EPERM; RELEASED is
+// reached only after every peer has acked. Finally, a RELEASED set can
+// be release-awaited but not quiesce-awaited.
+TEST_F(QuiesceDbTest, MultiRankRelease)
+{
+  ASSERT_NO_FATAL_FAILURE(configure_cluster({ mds_gid_t(1), mds_gid_t(2), mds_gid_t(3) }));
+  managers.at(mds_gid_t(1))->reset_agent_callback(QUIESCING_AGENT_CB);
+  managers.at(mds_gid_t(2))->reset_agent_callback(QUIESCING_AGENT_CB);
+  managers.at(mds_gid_t(3))->reset_agent_callback(QUIESCING_AGENT_CB);
+
+  // quiesce two sets
+  for (auto&& set_id : { "set1", "set2" }) {
+    ASSERT_EQ(OK(), run_request([set_id](auto& r) {
+      r.set_id = set_id;
+      r.timeout = sec(60);
+      r.expiration = sec(60);
+      r.await = sec(100);
+      r.include_roots({ "root1" });
+    }));
+    EXPECT_EQ(QS_QUIESCED, last_request->response.sets.at(set_id).rstate.state);
+  }
+
+  auto quiesced_v = db(mds_gid_t(1)).sets.at("set1").version;
+
+  // prevent one of the acks
+  managers.at(mds_gid_t(2))->reset_agent_callback(SILENT_AGENT_CB);
+
+  // release roots
+  ASSERT_EQ(OK(), run_request([](auto& r) {
+    r.set_id = "set1";
+    r.release_roots();
+  }));
+
+  // the release must have advanced the set version
+  EXPECT_EQ(QS_RELEASING, last_request->response.sets.at("set1").rstate.state);
+  auto releasing_v = last_request->response.sets.at("set1").version;
+  ASSERT_NE(quiesced_v, releasing_v);
+
+  // release-await set2 asynchronously; it can't complete while gid 2
+  // stays silent
+  auto &async_release = start_request([](auto& r) {
+    r.set_id = "set2";
+    r.await = sec(100);
+    r.release_roots();
+  });
+
+  EXPECT_EQ(NA(), async_release.check_result());
+
+  // shouldn't hurt to run release twice for set 1
+  ASSERT_EQ(OK(), run_request([](auto& r) {
+    r.set_id = "set1";
+    r.release_roots();
+  }));
+
+  // a repeated release must not bump the version
+  EXPECT_EQ(releasing_v, last_request->response.sets.at("set1").version);
+
+  // we shouldn't be able to quiesce-await a releasing set
+  ASSERT_EQ(ERR(EPERM), run_request_for(1, [](auto& r) {
+    r.set_id = "set1";
+    r.await = sec(100);
+  }));
+
+  auto latest_v = db(mds_gid_t(1)).set_version;
+
+  // wait for all peers to sync up to the latest db set version
+  {
+    std::unique_lock l(comms_mutex);
+    auto result = comms_cond.wait_for(l, std::chrono::milliseconds(100), [&] {
+      auto min_v = std::min({ db(mds_gid_t(1)).set_version, db(mds_gid_t(2)).set_version, db(mds_gid_t(3)).set_version });
+      return min_v >= latest_v;
+    });
+    ASSERT_TRUE(result);
+  }
+
+  // all replicas must agree
+  for (auto&& gid : {mds_gid_t(1), mds_gid_t(2), mds_gid_t(3)}) {
+    EXPECT_EQ(QS_RELEASING, db(gid).sets.at("set1").rstate.state) << "db of gid " << gid;
+    EXPECT_EQ(QS_RELEASING, db(gid).sets.at("set2").rstate.state) << "db of gid " << gid;
+  }
+
+  // wait for the late peer to ack back
+  auto did_ack = add_ack_hook([](auto gid, auto const &ack){
+    return gid == mds_gid_t(2);
+  });
+
+  // allow acks
+  managers.at(mds_gid_t(2))->reset_agent_callback(QUIESCING_AGENT_CB);
+
+  EXPECT_EQ(std::future_status::ready, did_ack.wait_for(std::chrono::milliseconds(100)));
+
+  // kick the db queue with a simple query
+  ASSERT_EQ(OK(), run_request([](auto& r) { }));
+
+  EXPECT_EQ(QS_RELEASED, last_request->response.sets.at("set1").rstate.state);
+  EXPECT_EQ(QS_RELEASED, last_request->response.sets.at("set2").rstate.state);
+  EXPECT_EQ(OK(), async_release.check_result());
+
+  // validate that we can release-await RELEASED sets
+  // but can't quiesce-await the same
+  for (auto&& set_id : { "set1", "set2" }) {
+    ASSERT_EQ(OK(), run_request([set_id](auto& r) {
+      r.set_id = set_id;
+      r.await = sec(100);
+      r.release_roots();
+    }));
+    ASSERT_EQ(ERR(EPERM), run_request([set_id](auto& r) {
+      r.set_id = set_id;
+      r.await = sec(100);
+    }));
+  }
+}
+
+/* ================================================================ */
+// Membership churn and leader re-election: quiesced sets must survive
+// a new leader taking over (it has to discover the newer db versions
+// from the replicas), losing a non-leader, and re-adding peers. Any
+// member that doesn't run the quiescing agent drags the sets back to
+// QUIESCING.
+TEST_F(QuiesceDbTest, MultiRankRecovery)
+{
+  ASSERT_NO_FATAL_FAILURE(configure_cluster({ mds_gid_t(1), mds_gid_t(2), mds_gid_t(3) }));
+  managers.at(mds_gid_t(1))->reset_agent_callback(QUIESCING_AGENT_CB);
+  managers.at(mds_gid_t(2))->reset_agent_callback(QUIESCING_AGENT_CB);
+  managers.at(mds_gid_t(3))->reset_agent_callback(QUIESCING_AGENT_CB);
+
+  // quiesce two sets
+  for (auto&& set_id : { "set1", "set2" }) {
+    ASSERT_EQ(OK(), run_request([set_id](auto& r) {
+      r.set_id = set_id;
+      r.timeout = sec(60);
+      r.expiration = sec(60);
+      r.await = sec(100);
+      r.include_roots({ "root1" });
+    }));
+    EXPECT_EQ(QS_QUIESCED, last_request->response.sets.at(set_id).rstate.state);
+  }
+
+
+  // resolves once the new leader (gid 4) acks a non-empty db version,
+  // i.e. after it has discovered the sets from the surviving replicas
+  auto did_ack41 = add_ack_hook([](auto gid, auto const &ack){
+    return gid == mds_gid_t(4) && ack.db_version.set_version > 0;
+  });
+
+  // reconfigure the cluster so that a new member is assigned leader
+  ASSERT_NO_FATAL_FAILURE(configure_cluster({ mds_gid_t(4), mds_gid_t(2), mds_gid_t(3) }));
+
+  EXPECT_EQ(std::future_status::ready, did_ack41.wait_for(std::chrono::milliseconds(2000)));
+
+  // we expect the db to be populated since the new leader must have discovered newer versions
+  // we expect the sets to become quiescing since there's at least one member that's not acking (the new one)
+  EXPECT_EQ(OK(), run_request([](auto& r) {}));
+  ASSERT_EQ(2, last_request->response.sets.size());
+  EXPECT_EQ(QS_QUIESCING, last_request->response.sets.at("set1").rstate.state);
+  EXPECT_EQ(QS_QUIESCING, last_request->response.sets.at("set2").rstate.state);
+
+  // reconfigure the cluster back to quiescing members
+  ASSERT_NO_FATAL_FAILURE(configure_cluster({ mds_gid_t(1), mds_gid_t(2), mds_gid_t(3) }));
+
+  // with every member running the quiescing agent again, the sets
+  // should return to QUIESCED; a short await gives the acks time to land
+  EXPECT_EQ(OK(), run_request([](auto& r) {
+    r.set_id = "set1";
+    r.await = sec(1);
+  }));
+  ASSERT_EQ(1, last_request->response.sets.size());
+  EXPECT_EQ(QS_QUIESCED, last_request->response.sets.at("set1").rstate.state);
+  EXPECT_EQ(OK(), run_request([](auto& r) {
+    r.set_id = "set2";
+    r.await = sec(1);
+  }));
+  ASSERT_EQ(1, last_request->response.sets.size());
+  EXPECT_EQ(QS_QUIESCED, last_request->response.sets.at("set2").rstate.state);
+
+  // lose a non-leader node; the sets must stay QUIESCED
+  ASSERT_NO_FATAL_FAILURE(configure_cluster({ mds_gid_t(1), mds_gid_t(2) }));
+
+  EXPECT_EQ(OK(), run_request([](auto& r) {}));
+  ASSERT_EQ(2, last_request->response.sets.size());
+  EXPECT_EQ(QS_QUIESCED, last_request->response.sets.at("set1").rstate.state);
+  EXPECT_EQ(QS_QUIESCED, last_request->response.sets.at("set2").rstate.state);
+
+  // resolves once the returning peer (gid 3) reports a non-empty db
+  auto did_ack3 = add_ack_hook([](auto gid, auto const &ack){
+    return gid == mds_gid_t(3) && ack.db_version.set_version > 0;
+  });
+
+  // add back a quiescing peer
+  ASSERT_NO_FATAL_FAILURE(configure_cluster({ mds_gid_t(1), mds_gid_t(2), mds_gid_t(3)}));
+
+  EXPECT_EQ(OK(), run_request([](auto& r) {}));
+  ASSERT_EQ(2, last_request->response.sets.size());
+  EXPECT_EQ(QS_QUIESCED, last_request->response.sets.at("set1").rstate.state);
+  EXPECT_EQ(QS_QUIESCED, last_request->response.sets.at("set2").rstate.state);
+
+  EXPECT_EQ(std::future_status::ready, did_ack3.wait_for(std::chrono::milliseconds(2000)));
+
+  // the sets must remain QUIESCED after the returning peer caught up
+  EXPECT_EQ(OK(), run_request([](auto& r) {}));
+  ASSERT_EQ(2, last_request->response.sets.size());
+  EXPECT_EQ(QS_QUIESCED, last_request->response.sets.at("set1").rstate.state);
+  EXPECT_EQ(QS_QUIESCED, last_request->response.sets.at("set2").rstate.state);
+
+  // resolves once the non-quiescing peer (gid 4) reports a non-empty db
+  auto did_ack42 = add_ack_hook([](auto gid, auto const &ack){
+    return gid == mds_gid_t(4) && ack.db_version.set_version > 0;
+  });
+
+  // add a non-quiescing peer; since it never acks the roots,
+  // the sets must be dragged back to QUIESCING
+  ASSERT_NO_FATAL_FAILURE(configure_cluster({ mds_gid_t(1), mds_gid_t(2), mds_gid_t(3), mds_gid_t(4) }));
+
+  EXPECT_EQ(std::future_status::ready, did_ack42.wait_for(std::chrono::milliseconds(2000)));
+  EXPECT_EQ(OK(), run_request([](auto& r) {}));
+  ASSERT_EQ(2, last_request->response.sets.size());
+  EXPECT_EQ(QS_QUIESCING, last_request->response.sets.at("set1").rstate.state);
+  EXPECT_EQ(QS_QUIESCING, last_request->response.sets.at("set2").rstate.state);
+}
\ No newline at end of file