ceph: add client reset state machine and session teardown

author Alex Markuze <amarkuze@redhat.com>

Thu, 7 May 2026 09:11:59 +0000 (09:11 +0000)

committer Ilya Dryomov <idryomov@gmail.com>

Wed, 20 May 2026 00:20:19 +0000 (02:20 +0200)
author Alex Markuze <amarkuze@redhat.com>
Thu, 7 May 2026 09:11:59 +0000 (09:11 +0000)
committer Ilya Dryomov <idryomov@gmail.com>
Wed, 20 May 2026 00:20:19 +0000 (02:20 +0200)
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c

index c4ff2266bb944bc540ef3e06cc915f7b03e27b75..677221bd64e018cf446a6971e913b20e4a3e680e 100644 (file)
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -249,6 +249,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
  {
         struct inode *inode = file_inode(file);
         struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
         struct ceph_client *cl = ceph_inode_to_client(inode);
         int err = 0;
         u16 op = CEPH_MDS_OP_SETFILELOCK;
@@ -275,6 +276,13 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
                 return -EIO;
         }
  
+       /* Wait for reset to complete before acquiring new locks */
+       if (op == CEPH_MDS_OP_SETFILELOCK && !lock_is_unlock(fl)) {
+               err = ceph_mdsc_wait_for_reset(mdsc);
+               if (err)
+                       return err;
+       }
+
         if (lock_is_read(fl))
                 lock_cmd = CEPH_LOCK_SHARED;
         else if (lock_is_write(fl))
@@ -311,6 +319,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
  {
         struct inode *inode = file_inode(file);
         struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
         struct ceph_client *cl = ceph_inode_to_client(inode);
         int err = 0;
         u8 wait = 0;
@@ -330,6 +339,13 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
                 return -EIO;
         }
  
+       /* Wait for reset to complete before acquiring new locks */
+       if (!lock_is_unlock(fl)) {
+               err = ceph_mdsc_wait_for_reset(mdsc);
+               if (err)
+                       return err;
+       }
+
         if (IS_SETLKW(cmd))
                 wait = 1;
  
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c

index 5743103c17caedd7aa68dcd4afd5f2ee420cf127..9eef4c8ac1b39388ebfe1f6f3f7c8728c242378b 100644 (file)
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -6,6 +6,7 @@
  #include <linux/slab.h>
  #include <linux/gfp.h>
  #include <linux/sched.h>
+#include <linux/delay.h>
  #include <linux/debugfs.h>
  #include <linux/seq_file.h>
  #include <linux/ratelimit.h>
@@ -65,6 +66,7 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
                             struct list_head *head);
  static void ceph_cap_release_work(struct work_struct *work);
  static void ceph_cap_reclaim_work(struct work_struct *work);
+static void ceph_mdsc_reset_workfn(struct work_struct *work);
  
  static const struct ceph_connection_operations mds_con_ops;
  
@@ -3844,6 +3846,22 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
         struct ceph_client *cl = mdsc->fsc->client;
         int err = 0;
  
+       /*
+        * If a reset is in progress, wait for it to complete.
+        *
+        * This is best-effort: a request can pass this check just
+        * before the phase leaves IDLE and proceed concurrently with
+        * reset.  That is acceptable because (a) such requests will
+        * either complete normally or fail and be retried by the
+        * caller, and (b) adding lock serialization here would
+        * penalize every request for a rare manual operation.
+        */
+       err = ceph_mdsc_wait_for_reset(mdsc);
+       if (err) {
+               doutc(cl, "wait_for_reset failed: %d\n", err);
+               return err;
+       }
+
         /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
         if (req->r_inode)
                 ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
@@ -5266,6 +5284,474 @@ fail_return:
         return err;
  }
  
+const char *ceph_reset_phase_name(enum ceph_client_reset_phase phase)
+{
+       switch (phase) {
+       case CEPH_CLIENT_RESET_IDLE:      return "idle";
+       case CEPH_CLIENT_RESET_QUIESCING: return "quiescing";
+       case CEPH_CLIENT_RESET_DRAINING:  return "draining";
+       case CEPH_CLIENT_RESET_TEARDOWN:  return "teardown";
+       default:                          return "unknown";
+       }
+}
+
+/**
+ * ceph_mdsc_wait_for_reset - wait for an active reset to complete
+ * @mdsc: MDS client
+ *
+ * Returns 0 if reset completed successfully or no reset was active.
+ * Returns -EAGAIN if reset completed with an error, signalling the
+ * caller to retry.  The internal error (e.g. -ENOMEM) is not propagated
+ * because callers like open() or flock() have no way to act on
+ * work-function internals.  The detailed error is available via debugfs
+ * reset/status and tracepoints.
+ * Returns -ETIMEDOUT if we timed out waiting.
+ * Returns -ERESTARTSYS if interrupted by signal.
+ */
+int ceph_mdsc_wait_for_reset(struct ceph_mds_client *mdsc)
+{
+       struct ceph_client_reset_state *st = &mdsc->reset_state;
+       struct ceph_client *cl = mdsc->fsc->client;
+       unsigned long deadline = jiffies + CEPH_CLIENT_RESET_WAIT_TIMEOUT_SEC * HZ;
+       int blocked_count;
+       long remaining;
+       long wait_ret;
+       int ret;
+
+       if (ceph_reset_is_idle(st))
+               return 0;
+
+       blocked_count = atomic_inc_return(&st->blocked_requests);
+       doutc(cl, "request blocked during reset, %d total blocked\n",
+             blocked_count);
+
+retry:
+       remaining = max_t(long, deadline - jiffies, 1);
+       wait_ret = wait_event_interruptible_timeout(st->blocked_wq,
+                                                   ceph_reset_is_idle(st),
+                                                   remaining);
+
+       if (wait_ret == 0) {
+               atomic_dec(&st->blocked_requests);
+               pr_warn_client(cl, "timed out waiting for reset to complete\n");
+               return -ETIMEDOUT;
+       }
+       if (wait_ret < 0) {
+               atomic_dec(&st->blocked_requests);
+               return (int)wait_ret;  /* -ERESTARTSYS */
+       }
+
+       /*
+        * Verify phase is still IDLE under the lock.  If another reset
+        * was scheduled between the wake-up and this check, loop back
+        * and wait for it to finish rather than returning a stale result.
+        */
+       spin_lock(&st->lock);
+       if (st->phase != CEPH_CLIENT_RESET_IDLE) {
+               spin_unlock(&st->lock);
+               if (time_before(jiffies, deadline))
+                       goto retry;
+               atomic_dec(&st->blocked_requests);
+               return -ETIMEDOUT;
+       }
+       ret = st->last_errno;
+       spin_unlock(&st->lock);
+
+       atomic_dec(&st->blocked_requests);
+       return ret ? -EAGAIN : 0;
+}
+
+static void ceph_mdsc_reset_complete(struct ceph_mds_client *mdsc, int ret)
+{
+       struct ceph_client_reset_state *st = &mdsc->reset_state;
+
+       spin_lock(&st->lock);
+       /*
+        * If destroy already marked us as shut down, it owns the
+        * final bookkeeping and waiter wakeup.  Just bail so we
+        * don't overwrite its state.
+        */
+       if (st->shutdown) {
+               spin_unlock(&st->lock);
+               return;
+       }
+       st->last_finish = jiffies;
+       st->last_errno = ret;
+       st->phase = CEPH_CLIENT_RESET_IDLE;
+       if (ret)
+               st->failure_count++;
+       else
+               st->success_count++;
+       spin_unlock(&st->lock);
+
+       /* Wake up all requests that were blocked waiting for reset */
+       wake_up_all(&st->blocked_wq);
+
+}
+
+static void ceph_mdsc_reset_workfn(struct work_struct *work)
+{
+       struct ceph_mds_client *mdsc =
+               container_of(work, struct ceph_mds_client, reset_work);
+       struct ceph_client_reset_state *st = &mdsc->reset_state;
+       struct ceph_client *cl = mdsc->fsc->client;
+       struct ceph_mds_session **sessions = NULL;
+       char reason[CEPH_CLIENT_RESET_REASON_LEN];
+       unsigned long drain_deadline;
+       int max_sessions, i, n = 0, torn_down = 0;
+       int ret = 0;
+
+       spin_lock(&st->lock);
+       strscpy(reason, st->last_reason, sizeof(reason));
+       spin_unlock(&st->lock);
+
+       mutex_lock(&mdsc->mutex);
+       max_sessions = mdsc->max_sessions;
+       if (max_sessions <= 0) {
+               mutex_unlock(&mdsc->mutex);
+               goto out_complete;
+       }
+
+       sessions = kcalloc(max_sessions, sizeof(*sessions), GFP_KERNEL);
+       if (!sessions) {
+               mutex_unlock(&mdsc->mutex);
+               ret = -ENOMEM;
+               pr_err_client(cl,
+                             "manual session reset failed to allocate session array\n");
+               ceph_mdsc_reset_complete(mdsc, ret);
+               return;
+       }
+
+       for (i = 0; i < max_sessions; i++) {
+               struct ceph_mds_session *session = mdsc->sessions[i];
+
+               if (!session)
+                       continue;
+
+               /*
+                * Read session state without s_mutex to avoid nesting
+                * mdsc->mutex -> s_mutex, which would invert the
+                * s_mutex -> mdsc->mutex order used by
+                * cleanup_session_requests().  s_state is an int
+                * so loads are atomic; the teardown loop below
+                * handles races with concurrent state transitions.
+                */
+               switch (READ_ONCE(session->s_state)) {
+               case CEPH_MDS_SESSION_OPEN:
+               case CEPH_MDS_SESSION_HUNG:
+               case CEPH_MDS_SESSION_OPENING:
+               case CEPH_MDS_SESSION_RESTARTING:
+               case CEPH_MDS_SESSION_RECONNECTING:
+               case CEPH_MDS_SESSION_CLOSING:
+                       sessions[n++] = ceph_get_mds_session(session);
+                       break;
+               default:
+                       pr_info_client(cl,
+                                      "mds%d in state %s, skipping reset\n",
+                                      session->s_mds,
+                                      ceph_session_state_name(session->s_state));
+                       break;
+               }
+       }
+       mutex_unlock(&mdsc->mutex);
+
+       pr_info_client(cl,
+                      "manual session reset executing (sessions=%d, reason=\"%s\")\n",
+                      n, reason);
+
+       if (n == 0) {
+               kfree(sessions);
+               goto out_complete;
+       }
+
+       spin_lock(&st->lock);
+       if (st->shutdown) {
+               spin_unlock(&st->lock);
+               goto out_sessions;
+       }
+       st->phase = CEPH_CLIENT_RESET_DRAINING;
+       spin_unlock(&st->lock);
+
+       /*
+        * Best-effort drain: flush dirty state while sessions are still
+        * alive.  New requests are blocked while phase != IDLE.
+        * The sessions are functional, so non-stuck state drains normally.
+        * Stuck state (the cause of the stalemate the operator is trying
+        * to break) will not drain -- that is expected, and we proceed to
+        * forced teardown after the timeout.
+        *
+        * Four things are drained:
+        *  1. MDS journal -- send_flush_mdlog asks each MDS to journal
+        *     pending unsafe operations (creates, renames, setattrs).
+        *  2. Unsafe requests -- bounded wait for each unsafe write
+        *     request to reach safe status via r_safe_completion.
+        *  3. Dirty caps -- ceph_flush_dirty_caps triggers cap flush on
+        *     all sessions.  Non-stuck caps flush in milliseconds.
+        *  4. Cap releases -- push pending cap release messages.
+        *
+        * The unsafe-request wait and cap-flush wait below provide
+        * the bounded drain window during which all categories can
+        * make progress.
+        */
+       for (i = 0; i < n; i++)
+               send_flush_mdlog(sessions[i]);
+
+       /*
+        * Both drain legs (unsafe requests and cap flushes) share a
+        * single deadline so the total drain time is bounded at
+        * CEPH_CLIENT_RESET_DRAIN_SEC.
+        */
+       drain_deadline = jiffies + CEPH_CLIENT_RESET_DRAIN_SEC * HZ;
+
+       /*
+        * Wait for unsafe write requests (creates, renames, setattrs)
+        * to reach safe status.  Uses the same pattern as
+        * flush_mdlog_and_wait_mdsc_unsafe_requests() but bounded by
+        * the shared drain deadline.  Requests that do not complete within
+        * the window are force-dropped during teardown.
+        */
+       {
+               struct ceph_mds_request *req;
+               struct rb_node *rn;
+               u64 last_tid;
+
+               mutex_lock(&mdsc->mutex);
+               last_tid = mdsc->last_tid;
+               mutex_unlock(&mdsc->mutex);
+
+               mutex_lock(&mdsc->mutex);
+               rn = rb_first(&mdsc->request_tree);
+               while (rn) {
+                       req = rb_entry(rn, struct ceph_mds_request, r_node);
+                       if (req->r_tid > last_tid)
+                               break;
+                       if (req->r_op == CEPH_MDS_OP_SETFILELOCK ||
+                           !(req->r_op & CEPH_MDS_OP_WRITE)) {
+                               rn = rb_next(rn);
+                               continue;
+                       }
+                       ceph_mdsc_get_request(req);
+                       mutex_unlock(&mdsc->mutex);
+
+                       wait_for_completion_timeout(&req->r_safe_completion,
+                               max_t(long, drain_deadline - jiffies, 1));
+
+                       mutex_lock(&mdsc->mutex);
+                       ceph_mdsc_put_request(req);
+                       if (time_after(jiffies, drain_deadline))
+                               break;
+                       rn = rb_first(&mdsc->request_tree);
+               }
+               mutex_unlock(&mdsc->mutex);
+
+               if (time_after_eq(jiffies, drain_deadline))
+                       WRITE_ONCE(st->drain_timed_out, true);
+       }
+
+       ceph_flush_dirty_caps(mdsc);
+       ceph_flush_cap_releases(mdsc);
+
+       spin_lock(&mdsc->cap_dirty_lock);
+       if (!list_empty(&mdsc->cap_flush_list)) {
+               struct ceph_cap_flush *cf =
+                       list_last_entry(&mdsc->cap_flush_list,
+                                       struct ceph_cap_flush, g_list);
+               u64 want_flush = mdsc->last_cap_flush_tid;
+               long drain_ret;
+
+               /*
+                * Setting wake on the last entry is sufficient: flush
+                * entries complete in order, so when this entry finishes
+                * all earlier ones are already done.
+                */
+               cf->wake = true;
+               spin_unlock(&mdsc->cap_dirty_lock);
+               pr_info_client(cl,
+                              "draining (want_flush=%llu, %d sessions)\n",
+                              want_flush, n);
+               drain_ret = wait_event_timeout(mdsc->cap_flushing_wq,
+                                              check_caps_flush(mdsc,
+                                                               want_flush),
+                                              max_t(long,
+                                                    drain_deadline - jiffies,
+                                                    1));
+               if (drain_ret == 0) {
+                       pr_info_client(cl,
+                                      "drain timed out, proceeding with forced teardown\n");
+                       WRITE_ONCE(st->drain_timed_out, true);
+               } else {
+                       pr_info_client(cl, "drain completed successfully\n");
+               }
+       } else {
+               spin_unlock(&mdsc->cap_dirty_lock);
+       }
+
+       spin_lock(&st->lock);
+       if (st->shutdown) {
+               spin_unlock(&st->lock);
+               goto out_sessions;
+       }
+       st->phase = CEPH_CLIENT_RESET_TEARDOWN;
+       spin_unlock(&st->lock);
+
+       /*
+        * Ask each MDS to close the session before we tear it down
+        * locally.  Without this the MDS sees only a connection drop and
+        * waits for the client to reconnect (up to session_autoclose
+        * seconds) before evicting the session and releasing locks.
+        *
+        * Reuse the normal close machinery so the session state/sequence
+        * snapshot is serialized under s_mutex and a racing s_seq bump
+        * retransmits REQUEST_CLOSE while the session remains CLOSING.
+        * We send all close requests first, then yield briefly to let the
+        * network stack transmit them before __unregister_session()
+        * closes the connections.
+        */
+       for (i = 0; i < n; i++) {
+               int err;
+
+               mutex_lock(&sessions[i]->s_mutex);
+               err = __close_session(mdsc, sessions[i]);
+               mutex_unlock(&sessions[i]->s_mutex);
+               if (err < 0)
+                       pr_warn_client(cl,
+                                      "mds%d failed to queue close request before reset: %d\n",
+                                      sessions[i]->s_mds, err);
+       }
+       /*
+        * Best-effort grace period: yield briefly so the network stack
+        * can transmit the queued REQUEST_CLOSE messages before we tear
+        * down connections.  Not a correctness requirement -- the MDS
+        * will still evict via session_autoclose if it never receives
+        * the close request.
+        *
+        * Event-based waiting is not viable here: there is no completion
+        * event for "message left the NIC," and waiting for the MDS
+        * SESSION_CLOSE response would re-create the stalemate that the
+        * reset is meant to break.
+        */
+       if (n > 0)
+               msleep(CEPH_CLIENT_RESET_CLOSE_GRACE_MS);
+
+       /*
+        * Tear down each session: close the connection, remove all
+        * caps, clean up requests, then kick pending requests so they
+        * re-open a fresh session on the next attempt.
+        *
+        * This is modeled on the check_new_map() forced-close path
+        * for stopped MDS ranks - a proven pattern for hard session
+        * teardown.  We do NOT attempt send_mds_reconnect() because
+        * the MDS only accepts reconnects during its own RECONNECT
+        * phase (after MDS restart), not from an active client.
+        *
+        * Any state that did not drain (caps that didn't flush, unsafe
+        * requests that the MDS didn't journal) is force-dropped here.
+        * This is intentional: that state is stuck and is the reason
+        * the operator triggered the reset.
+        */
+       for (i = 0; i < n; i++) {
+               int mds = sessions[i]->s_mds;
+
+               pr_info_client(cl, "mds%d resetting session\n", mds);
+
+               mutex_lock(&mdsc->mutex);
+               if (mds >= mdsc->max_sessions ||
+                   mdsc->sessions[mds] != sessions[i]) {
+                       pr_info_client(cl,
+                                      "mds%d session already torn down, skipping\n",
+                                      mds);
+                       mutex_unlock(&mdsc->mutex);
+                       ceph_put_mds_session(sessions[i]);
+                       sessions[i] = NULL;
+                       continue;
+               }
+               sessions[i]->s_state = CEPH_MDS_SESSION_CLOSED;
+               __unregister_session(mdsc, sessions[i]);
+               __wake_requests(mdsc, &sessions[i]->s_waiting);
+               mutex_unlock(&mdsc->mutex);
+
+               mutex_lock(&sessions[i]->s_mutex);
+               cleanup_session_requests(mdsc, sessions[i]);
+               remove_session_caps(sessions[i]);
+               mutex_unlock(&sessions[i]->s_mutex);
+
+               wake_up_all(&mdsc->session_close_wq);
+
+               ceph_put_mds_session(sessions[i]);
+
+               mutex_lock(&mdsc->mutex);
+               kick_requests(mdsc, mds);
+               mutex_unlock(&mdsc->mutex);
+
+               torn_down++;
+               pr_info_client(cl, "mds%d session reset complete\n", mds);
+       }
+
+       kfree(sessions);
+
+       spin_lock(&st->lock);
+       st->sessions_reset = torn_down;
+       spin_unlock(&st->lock);
+
+out_complete:
+       ceph_mdsc_reset_complete(mdsc, ret);
+       return;
+
+out_sessions:
+       /* shutdown == true: ceph_mdsc_destroy() owns the final transition. */
+       for (i = 0; i < n; i++)
+               ceph_put_mds_session(sessions[i]);
+       kfree(sessions);
+}
+
+int ceph_mdsc_schedule_reset(struct ceph_mds_client *mdsc,
+                            const char *reason)
+{
+       struct ceph_client_reset_state *st = &mdsc->reset_state;
+       struct ceph_fs_client *fsc = mdsc->fsc;
+       const char *msg = (reason && reason[0]) ? reason : "manual";
+       int mount_state;
+
+       mount_state = READ_ONCE(fsc->mount_state);
+       if (mount_state != CEPH_MOUNT_MOUNTED) {
+               pr_warn_client(fsc->client,
+                              "reset rejected: mount_state=%d (not mounted)\n",
+                              mount_state);
+               return -EINVAL;
+       }
+
+       spin_lock(&st->lock);
+       if (st->phase != CEPH_CLIENT_RESET_IDLE) {
+               spin_unlock(&st->lock);
+               return -EBUSY;
+       }
+
+       st->phase = CEPH_CLIENT_RESET_QUIESCING;
+       st->last_start = jiffies;
+       st->last_errno = 0;
+       st->drain_timed_out = false;
+       st->sessions_reset = 0;
+       st->trigger_count++;
+       strscpy(st->last_reason, msg, sizeof(st->last_reason));
+       spin_unlock(&st->lock);
+
+       if (WARN_ON_ONCE(!queue_work(system_unbound_wq, &mdsc->reset_work))) {
+               spin_lock(&st->lock);
+               st->phase = CEPH_CLIENT_RESET_IDLE;
+               st->last_errno = -EALREADY;
+               st->last_finish = jiffies;
+               st->failure_count++;
+               spin_unlock(&st->lock);
+               wake_up_all(&st->blocked_wq);
+               return -EALREADY;
+       }
+
+       pr_info_client(mdsc->fsc->client,
+                      "manual session reset scheduled (reason=\"%s\")\n",
+                      msg);
+       return 0;
+}
+
  
  /*
   * compare old and new mdsmaps, kicking requests
@@ -5811,6 +6297,11 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
         INIT_LIST_HEAD(&mdsc->dentry_leases);
         INIT_LIST_HEAD(&mdsc->dentry_dir_leases);
  
+       spin_lock_init(&mdsc->reset_state.lock);
+       init_waitqueue_head(&mdsc->reset_state.blocked_wq);
+       atomic_set(&mdsc->reset_state.blocked_requests, 0);
+       INIT_WORK(&mdsc->reset_work, ceph_mdsc_reset_workfn);
+
         ceph_caps_init(mdsc);
         ceph_adjust_caps_max_min(mdsc, fsc->mount_options);
  
@@ -6336,6 +6827,23 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
         /* flush out any connection work with references to us */
         ceph_msgr_flush();
  
+       /*
+        * Mark reset as failed and wake any blocked waiters before
+        * cancelling, so unmount doesn't stall on blocked_wq timeout
+        * if cancel_work_sync() prevents the work from running.
+        */
+       spin_lock(&mdsc->reset_state.lock);
+       mdsc->reset_state.shutdown = true;
+       if (mdsc->reset_state.phase != CEPH_CLIENT_RESET_IDLE) {
+               mdsc->reset_state.phase = CEPH_CLIENT_RESET_IDLE;
+               mdsc->reset_state.last_errno = -ESHUTDOWN;
+               mdsc->reset_state.last_finish = jiffies;
+               mdsc->reset_state.failure_count++;
+       }
+       spin_unlock(&mdsc->reset_state.lock);
+       wake_up_all(&mdsc->reset_state.blocked_wq);
+
+       cancel_work_sync(&mdsc->reset_work);
         ceph_mdsc_stop(mdsc);
  
         ceph_metric_destroy(&mdsc->metric);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h

index 8208fdf02efe8728fba6761d2410618659b6e3d0..b1a0621cd37e277c9d99117fded6cc5caede6a82 100644 (file)
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -80,7 +80,47 @@ struct ceph_cap;
  #define CEPH_CAP_FLUSH_WAIT_TIMEOUT_SEC 60
  #define CEPH_CAP_FLUSH_MAX_DUMP_ENTRIES 5
  #define CEPH_CAP_FLUSH_MAX_DUMP_ITERS 5
+#define CEPH_CLIENT_RESET_REASON_LEN   64
+#define CEPH_CLIENT_RESET_DRAIN_SEC    30
+#define CEPH_CLIENT_RESET_CLOSE_GRACE_MS 100
+#define CEPH_CLIENT_RESET_WAIT_TIMEOUT_SEC 120
  
+enum ceph_client_reset_phase {
+       CEPH_CLIENT_RESET_IDLE = 0,
+       /*
+        * QUIESCING is set synchronously by schedule_reset() before the
+        * workqueue item is dispatched.  It gates new requests (any
+        * phase != IDLE blocks callers) during the window between
+        * scheduling and the work function's transition to DRAINING.
+        */
+       CEPH_CLIENT_RESET_QUIESCING,
+       CEPH_CLIENT_RESET_DRAINING,
+       CEPH_CLIENT_RESET_TEARDOWN,
+};
+
+struct ceph_client_reset_state {
+       spinlock_t lock;                /* protects all fields below */
+       u64 trigger_count;              /* number of resets triggered */
+       u64 success_count;              /* number of successful resets */
+       u64 failure_count;              /* number of failed resets */
+       unsigned long last_start;       /* jiffies when last reset started */
+       unsigned long last_finish;      /* jiffies when last reset finished */
+       int last_errno;                 /* result of most recent reset */
+       enum ceph_client_reset_phase phase; /* current reset phase */
+       bool drain_timed_out;           /* drain exceeded timeout */
+       bool shutdown;                  /* destroy in progress */
+       int sessions_reset;             /* sessions torn down in last reset */
+       char last_reason[CEPH_CLIENT_RESET_REASON_LEN]; /* operator-supplied reason */
+
+       /* Request blocking during reset */
+       wait_queue_head_t blocked_wq;   /* waitqueue for blocked callers */
+       atomic_t blocked_requests;      /* count of blocked callers */
+};
+
+static inline bool ceph_reset_is_idle(struct ceph_client_reset_state *st)
+{
+       return READ_ONCE(st->phase) == CEPH_CLIENT_RESET_IDLE;
+}
  struct ceph_mds_cap_match {
         s64 uid;  /* default to MDS_AUTH_UID_ANY */
         u32 num_gids;
@@ -543,6 +583,8 @@ struct ceph_mds_client {
         struct list_head  dentry_dir_leases; /* lru list */
  
         struct ceph_client_metric metric;
+       struct work_struct      reset_work;
+       struct ceph_client_reset_state reset_state;
         struct ceph_subvolume_metrics_tracker subvol_metrics;
  
         /* Subvolume metrics send tracking */
@@ -574,10 +616,14 @@ extern struct ceph_mds_session *
  __ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
  
  extern const char *ceph_session_state_name(int s);
+extern const char *ceph_reset_phase_name(enum ceph_client_reset_phase phase);
  
  extern struct ceph_mds_session *
  ceph_get_mds_session(struct ceph_mds_session *s);
  extern void ceph_put_mds_session(struct ceph_mds_session *s);
+int ceph_mdsc_schedule_reset(struct ceph_mds_client *mdsc,
+                            const char *reason);
+int ceph_mdsc_wait_for_reset(struct ceph_mds_client *mdsc);
  
  extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
  extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
author	Alex Markuze <amarkuze@redhat.com>
	Thu, 7 May 2026 09:11:59 +0000 (09:11 +0000)
committer	Ilya Dryomov <idryomov@gmail.com>
	Wed, 20 May 2026 00:20:19 +0000 (02:20 +0200)
fs/ceph/locks.c		patch \| blob \| history
fs/ceph/mds_client.c		patch \| blob \| history
fs/ceph/mds_client.h		patch \| blob \| history