#include <linux/slab.h>
#include <linux/gfp.h>
#include <linux/sched.h>
+#include <linux/delay.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
struct list_head *head);
static void ceph_cap_release_work(struct work_struct *work);
static void ceph_cap_reclaim_work(struct work_struct *work);
+static void ceph_mdsc_reset_workfn(struct work_struct *work);
static const struct ceph_connection_operations mds_con_ops;
struct ceph_client *cl = mdsc->fsc->client;
int err = 0;
+ /*
+ * If a reset is in progress, wait for it to complete.
+ *
+ * This is best-effort: a request can pass this check just
+ * before the phase leaves IDLE and proceed concurrently with
+ * reset. That is acceptable because (a) such requests will
+ * either complete normally or fail and be retried by the
+ * caller, and (b) adding lock serialization here would
+ * penalize every request for a rare manual operation.
+ */
+ err = ceph_mdsc_wait_for_reset(mdsc);
+ if (err) {
+ doutc(cl, "wait_for_reset failed: %d\n", err);
+ return err;
+ }
+
/* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
if (req->r_inode)
ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
return err;
}
+const char *ceph_reset_phase_name(enum ceph_client_reset_phase phase)
+{
+ switch (phase) {
+ case CEPH_CLIENT_RESET_IDLE: return "idle";
+ case CEPH_CLIENT_RESET_QUIESCING: return "quiescing";
+ case CEPH_CLIENT_RESET_DRAINING: return "draining";
+ case CEPH_CLIENT_RESET_TEARDOWN: return "teardown";
+ default: return "unknown";
+ }
+}
+
+/**
+ * ceph_mdsc_wait_for_reset - wait for an active reset to complete
+ * @mdsc: MDS client
+ *
+ * Returns 0 if reset completed successfully or no reset was active.
+ * Returns -EAGAIN if reset completed with an error, signalling the
+ * caller to retry. The internal error (e.g. -ENOMEM) is not propagated
+ * because callers like open() or flock() have no way to act on
+ * work-function internals. The detailed error is available via debugfs
+ * reset/status and tracepoints.
+ * Returns -ETIMEDOUT if we timed out waiting.
+ * Returns -ERESTARTSYS if interrupted by signal.
+ */
+int ceph_mdsc_wait_for_reset(struct ceph_mds_client *mdsc)
+{
+ struct ceph_client_reset_state *st = &mdsc->reset_state;
+ struct ceph_client *cl = mdsc->fsc->client;
+ unsigned long deadline = jiffies + CEPH_CLIENT_RESET_WAIT_TIMEOUT_SEC * HZ;
+ int blocked_count;
+ long remaining;
+ long wait_ret;
+ int ret;
+
+ if (ceph_reset_is_idle(st))
+ return 0;
+
+ blocked_count = atomic_inc_return(&st->blocked_requests);
+ doutc(cl, "request blocked during reset, %d total blocked\n",
+ blocked_count);
+
+retry:
+ remaining = max_t(long, deadline - jiffies, 1);
+ wait_ret = wait_event_interruptible_timeout(st->blocked_wq,
+ ceph_reset_is_idle(st),
+ remaining);
+
+ if (wait_ret == 0) {
+ atomic_dec(&st->blocked_requests);
+ pr_warn_client(cl, "timed out waiting for reset to complete\n");
+ return -ETIMEDOUT;
+ }
+ if (wait_ret < 0) {
+ atomic_dec(&st->blocked_requests);
+ return (int)wait_ret; /* -ERESTARTSYS */
+ }
+
+ /*
+ * Verify phase is still IDLE under the lock. If another reset
+ * was scheduled between the wake-up and this check, loop back
+ * and wait for it to finish rather than returning a stale result.
+ */
+ spin_lock(&st->lock);
+ if (st->phase != CEPH_CLIENT_RESET_IDLE) {
+ spin_unlock(&st->lock);
+ if (time_before(jiffies, deadline))
+ goto retry;
+ atomic_dec(&st->blocked_requests);
+ return -ETIMEDOUT;
+ }
+ ret = st->last_errno;
+ spin_unlock(&st->lock);
+
+ atomic_dec(&st->blocked_requests);
+ return ret ? -EAGAIN : 0;
+}
+
+static void ceph_mdsc_reset_complete(struct ceph_mds_client *mdsc, int ret)
+{
+ struct ceph_client_reset_state *st = &mdsc->reset_state;
+
+ spin_lock(&st->lock);
+ /*
+ * If destroy already marked us as shut down, it owns the
+ * final bookkeeping and waiter wakeup. Just bail so we
+ * don't overwrite its state.
+ */
+ if (st->shutdown) {
+ spin_unlock(&st->lock);
+ return;
+ }
+ st->last_finish = jiffies;
+ st->last_errno = ret;
+ st->phase = CEPH_CLIENT_RESET_IDLE;
+ if (ret)
+ st->failure_count++;
+ else
+ st->success_count++;
+ spin_unlock(&st->lock);
+
+ /* Wake up all requests that were blocked waiting for reset */
+ wake_up_all(&st->blocked_wq);
+
+}
+
+static void ceph_mdsc_reset_workfn(struct work_struct *work)
+{
+ struct ceph_mds_client *mdsc =
+ container_of(work, struct ceph_mds_client, reset_work);
+ struct ceph_client_reset_state *st = &mdsc->reset_state;
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_mds_session **sessions = NULL;
+ char reason[CEPH_CLIENT_RESET_REASON_LEN];
+ unsigned long drain_deadline;
+ int max_sessions, i, n = 0, torn_down = 0;
+ int ret = 0;
+
+ spin_lock(&st->lock);
+ strscpy(reason, st->last_reason, sizeof(reason));
+ spin_unlock(&st->lock);
+
+ mutex_lock(&mdsc->mutex);
+ max_sessions = mdsc->max_sessions;
+ if (max_sessions <= 0) {
+ mutex_unlock(&mdsc->mutex);
+ goto out_complete;
+ }
+
+ sessions = kcalloc(max_sessions, sizeof(*sessions), GFP_KERNEL);
+ if (!sessions) {
+ mutex_unlock(&mdsc->mutex);
+ ret = -ENOMEM;
+ pr_err_client(cl,
+ "manual session reset failed to allocate session array\n");
+ ceph_mdsc_reset_complete(mdsc, ret);
+ return;
+ }
+
+ for (i = 0; i < max_sessions; i++) {
+ struct ceph_mds_session *session = mdsc->sessions[i];
+
+ if (!session)
+ continue;
+
+ /*
+ * Read session state without s_mutex to avoid nesting
+ * mdsc->mutex -> s_mutex, which would invert the
+ * s_mutex -> mdsc->mutex order used by
+ * cleanup_session_requests(). s_state is an int
+ * so loads are atomic; the teardown loop below
+ * handles races with concurrent state transitions.
+ */
+ switch (READ_ONCE(session->s_state)) {
+ case CEPH_MDS_SESSION_OPEN:
+ case CEPH_MDS_SESSION_HUNG:
+ case CEPH_MDS_SESSION_OPENING:
+ case CEPH_MDS_SESSION_RESTARTING:
+ case CEPH_MDS_SESSION_RECONNECTING:
+ case CEPH_MDS_SESSION_CLOSING:
+ sessions[n++] = ceph_get_mds_session(session);
+ break;
+ default:
+ pr_info_client(cl,
+ "mds%d in state %s, skipping reset\n",
+ session->s_mds,
+ ceph_session_state_name(session->s_state));
+ break;
+ }
+ }
+ mutex_unlock(&mdsc->mutex);
+
+ pr_info_client(cl,
+ "manual session reset executing (sessions=%d, reason=\"%s\")\n",
+ n, reason);
+
+ if (n == 0) {
+ kfree(sessions);
+ goto out_complete;
+ }
+
+ spin_lock(&st->lock);
+ if (st->shutdown) {
+ spin_unlock(&st->lock);
+ goto out_sessions;
+ }
+ st->phase = CEPH_CLIENT_RESET_DRAINING;
+ spin_unlock(&st->lock);
+
+ /*
+ * Best-effort drain: flush dirty state while sessions are still
+ * alive. New requests are blocked while phase != IDLE.
+ * The sessions are functional, so non-stuck state drains normally.
+ * Stuck state (the cause of the stalemate the operator is trying
+ * to break) will not drain -- that is expected, and we proceed to
+ * forced teardown after the timeout.
+ *
+ * Four things are drained:
+ * 1. MDS journal -- send_flush_mdlog asks each MDS to journal
+ * pending unsafe operations (creates, renames, setattrs).
+ * 2. Unsafe requests -- bounded wait for each unsafe write
+ * request to reach safe status via r_safe_completion.
+ * 3. Dirty caps -- ceph_flush_dirty_caps triggers cap flush on
+ * all sessions. Non-stuck caps flush in milliseconds.
+ * 4. Cap releases -- push pending cap release messages.
+ *
+ * The unsafe-request wait and cap-flush wait below provide
+ * the bounded drain window during which all categories can
+ * make progress.
+ */
+ for (i = 0; i < n; i++)
+ send_flush_mdlog(sessions[i]);
+
+ /*
+ * Both drain legs (unsafe requests and cap flushes) share a
+ * single deadline so the total drain time is bounded at
+ * CEPH_CLIENT_RESET_DRAIN_SEC.
+ */
+ drain_deadline = jiffies + CEPH_CLIENT_RESET_DRAIN_SEC * HZ;
+
+ /*
+ * Wait for unsafe write requests (creates, renames, setattrs)
+ * to reach safe status. Uses the same pattern as
+ * flush_mdlog_and_wait_mdsc_unsafe_requests() but bounded by
+ * the shared drain deadline. Requests that do not complete within
+ * the window are force-dropped during teardown.
+ */
+ {
+ struct ceph_mds_request *req;
+ struct rb_node *rn;
+ u64 last_tid;
+
+ mutex_lock(&mdsc->mutex);
+ last_tid = mdsc->last_tid;
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&mdsc->mutex);
+ rn = rb_first(&mdsc->request_tree);
+ while (rn) {
+ req = rb_entry(rn, struct ceph_mds_request, r_node);
+ if (req->r_tid > last_tid)
+ break;
+ if (req->r_op == CEPH_MDS_OP_SETFILELOCK ||
+ !(req->r_op & CEPH_MDS_OP_WRITE)) {
+ rn = rb_next(rn);
+ continue;
+ }
+ ceph_mdsc_get_request(req);
+ mutex_unlock(&mdsc->mutex);
+
+ wait_for_completion_timeout(&req->r_safe_completion,
+ max_t(long, drain_deadline - jiffies, 1));
+
+ mutex_lock(&mdsc->mutex);
+ ceph_mdsc_put_request(req);
+ if (time_after(jiffies, drain_deadline))
+ break;
+ rn = rb_first(&mdsc->request_tree);
+ }
+ mutex_unlock(&mdsc->mutex);
+
+ if (time_after_eq(jiffies, drain_deadline))
+ WRITE_ONCE(st->drain_timed_out, true);
+ }
+
+ ceph_flush_dirty_caps(mdsc);
+ ceph_flush_cap_releases(mdsc);
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ if (!list_empty(&mdsc->cap_flush_list)) {
+ struct ceph_cap_flush *cf =
+ list_last_entry(&mdsc->cap_flush_list,
+ struct ceph_cap_flush, g_list);
+ u64 want_flush = mdsc->last_cap_flush_tid;
+ long drain_ret;
+
+ /*
+ * Setting wake on the last entry is sufficient: flush
+ * entries complete in order, so when this entry finishes
+ * all earlier ones are already done.
+ */
+ cf->wake = true;
+ spin_unlock(&mdsc->cap_dirty_lock);
+ pr_info_client(cl,
+ "draining (want_flush=%llu, %d sessions)\n",
+ want_flush, n);
+ drain_ret = wait_event_timeout(mdsc->cap_flushing_wq,
+ check_caps_flush(mdsc,
+ want_flush),
+ max_t(long,
+ drain_deadline - jiffies,
+ 1));
+ if (drain_ret == 0) {
+ pr_info_client(cl,
+ "drain timed out, proceeding with forced teardown\n");
+ WRITE_ONCE(st->drain_timed_out, true);
+ } else {
+ pr_info_client(cl, "drain completed successfully\n");
+ }
+ } else {
+ spin_unlock(&mdsc->cap_dirty_lock);
+ }
+
+ spin_lock(&st->lock);
+ if (st->shutdown) {
+ spin_unlock(&st->lock);
+ goto out_sessions;
+ }
+ st->phase = CEPH_CLIENT_RESET_TEARDOWN;
+ spin_unlock(&st->lock);
+
+ /*
+ * Ask each MDS to close the session before we tear it down
+ * locally. Without this the MDS sees only a connection drop and
+ * waits for the client to reconnect (up to session_autoclose
+ * seconds) before evicting the session and releasing locks.
+ *
+ * Reuse the normal close machinery so the session state/sequence
+ * snapshot is serialized under s_mutex and a racing s_seq bump
+ * retransmits REQUEST_CLOSE while the session remains CLOSING.
+ * We send all close requests first, then yield briefly to let the
+ * network stack transmit them before __unregister_session()
+ * closes the connections.
+ */
+ for (i = 0; i < n; i++) {
+ int err;
+
+ mutex_lock(&sessions[i]->s_mutex);
+ err = __close_session(mdsc, sessions[i]);
+ mutex_unlock(&sessions[i]->s_mutex);
+ if (err < 0)
+ pr_warn_client(cl,
+ "mds%d failed to queue close request before reset: %d\n",
+ sessions[i]->s_mds, err);
+ }
+ /*
+ * Best-effort grace period: yield briefly so the network stack
+ * can transmit the queued REQUEST_CLOSE messages before we tear
+ * down connections. Not a correctness requirement -- the MDS
+ * will still evict via session_autoclose if it never receives
+ * the close request.
+ *
+ * Event-based waiting is not viable here: there is no completion
+ * event for "message left the NIC," and waiting for the MDS
+ * SESSION_CLOSE response would re-create the stalemate that the
+ * reset is meant to break.
+ */
+ if (n > 0)
+ msleep(CEPH_CLIENT_RESET_CLOSE_GRACE_MS);
+
+ /*
+ * Tear down each session: close the connection, remove all
+ * caps, clean up requests, then kick pending requests so they
+ * re-open a fresh session on the next attempt.
+ *
+ * This is modeled on the check_new_map() forced-close path
+ * for stopped MDS ranks - a proven pattern for hard session
+ * teardown. We do NOT attempt send_mds_reconnect() because
+ * the MDS only accepts reconnects during its own RECONNECT
+ * phase (after MDS restart), not from an active client.
+ *
+ * Any state that did not drain (caps that didn't flush, unsafe
+ * requests that the MDS didn't journal) is force-dropped here.
+ * This is intentional: that state is stuck and is the reason
+ * the operator triggered the reset.
+ */
+ for (i = 0; i < n; i++) {
+ int mds = sessions[i]->s_mds;
+
+ pr_info_client(cl, "mds%d resetting session\n", mds);
+
+ mutex_lock(&mdsc->mutex);
+ if (mds >= mdsc->max_sessions ||
+ mdsc->sessions[mds] != sessions[i]) {
+ pr_info_client(cl,
+ "mds%d session already torn down, skipping\n",
+ mds);
+ mutex_unlock(&mdsc->mutex);
+ ceph_put_mds_session(sessions[i]);
+ sessions[i] = NULL;
+ continue;
+ }
+ sessions[i]->s_state = CEPH_MDS_SESSION_CLOSED;
+ __unregister_session(mdsc, sessions[i]);
+ __wake_requests(mdsc, &sessions[i]->s_waiting);
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&sessions[i]->s_mutex);
+ cleanup_session_requests(mdsc, sessions[i]);
+ remove_session_caps(sessions[i]);
+ mutex_unlock(&sessions[i]->s_mutex);
+
+ wake_up_all(&mdsc->session_close_wq);
+
+ ceph_put_mds_session(sessions[i]);
+
+ mutex_lock(&mdsc->mutex);
+ kick_requests(mdsc, mds);
+ mutex_unlock(&mdsc->mutex);
+
+ torn_down++;
+ pr_info_client(cl, "mds%d session reset complete\n", mds);
+ }
+
+ kfree(sessions);
+
+ spin_lock(&st->lock);
+ st->sessions_reset = torn_down;
+ spin_unlock(&st->lock);
+
+out_complete:
+ ceph_mdsc_reset_complete(mdsc, ret);
+ return;
+
+out_sessions:
+ /* shutdown == true: ceph_mdsc_destroy() owns the final transition. */
+ for (i = 0; i < n; i++)
+ ceph_put_mds_session(sessions[i]);
+ kfree(sessions);
+}
+
+int ceph_mdsc_schedule_reset(struct ceph_mds_client *mdsc,
+ const char *reason)
+{
+ struct ceph_client_reset_state *st = &mdsc->reset_state;
+ struct ceph_fs_client *fsc = mdsc->fsc;
+ const char *msg = (reason && reason[0]) ? reason : "manual";
+ int mount_state;
+
+ mount_state = READ_ONCE(fsc->mount_state);
+ if (mount_state != CEPH_MOUNT_MOUNTED) {
+ pr_warn_client(fsc->client,
+ "reset rejected: mount_state=%d (not mounted)\n",
+ mount_state);
+ return -EINVAL;
+ }
+
+ spin_lock(&st->lock);
+ if (st->phase != CEPH_CLIENT_RESET_IDLE) {
+ spin_unlock(&st->lock);
+ return -EBUSY;
+ }
+
+ st->phase = CEPH_CLIENT_RESET_QUIESCING;
+ st->last_start = jiffies;
+ st->last_errno = 0;
+ st->drain_timed_out = false;
+ st->sessions_reset = 0;
+ st->trigger_count++;
+ strscpy(st->last_reason, msg, sizeof(st->last_reason));
+ spin_unlock(&st->lock);
+
+ if (WARN_ON_ONCE(!queue_work(system_unbound_wq, &mdsc->reset_work))) {
+ spin_lock(&st->lock);
+ st->phase = CEPH_CLIENT_RESET_IDLE;
+ st->last_errno = -EALREADY;
+ st->last_finish = jiffies;
+ st->failure_count++;
+ spin_unlock(&st->lock);
+ wake_up_all(&st->blocked_wq);
+ return -EALREADY;
+ }
+
+ pr_info_client(mdsc->fsc->client,
+ "manual session reset scheduled (reason=\"%s\")\n",
+ msg);
+ return 0;
+}
+
/*
* compare old and new mdsmaps, kicking requests
INIT_LIST_HEAD(&mdsc->dentry_leases);
INIT_LIST_HEAD(&mdsc->dentry_dir_leases);
+ spin_lock_init(&mdsc->reset_state.lock);
+ init_waitqueue_head(&mdsc->reset_state.blocked_wq);
+ atomic_set(&mdsc->reset_state.blocked_requests, 0);
+ INIT_WORK(&mdsc->reset_work, ceph_mdsc_reset_workfn);
+
ceph_caps_init(mdsc);
ceph_adjust_caps_max_min(mdsc, fsc->mount_options);
/* flush out any connection work with references to us */
ceph_msgr_flush();
+ /*
+ * Mark reset as failed and wake any blocked waiters before
+ * cancelling, so unmount doesn't stall on blocked_wq timeout
+ * if cancel_work_sync() prevents the work from running.
+ */
+ spin_lock(&mdsc->reset_state.lock);
+ mdsc->reset_state.shutdown = true;
+ if (mdsc->reset_state.phase != CEPH_CLIENT_RESET_IDLE) {
+ mdsc->reset_state.phase = CEPH_CLIENT_RESET_IDLE;
+ mdsc->reset_state.last_errno = -ESHUTDOWN;
+ mdsc->reset_state.last_finish = jiffies;
+ mdsc->reset_state.failure_count++;
+ }
+ spin_unlock(&mdsc->reset_state.lock);
+ wake_up_all(&mdsc->reset_state.blocked_wq);
+
+ cancel_work_sync(&mdsc->reset_work);
ceph_mdsc_stop(mdsc);
ceph_metric_destroy(&mdsc->metric);