From 70d92c85bf8d03c269d083dacee7e88448707b1e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 7 Jul 2021 09:20:34 -0400 Subject: [PATCH] [DO NOT MERGE] ceph: dump info about cap flushes when we're waiting too long for them We've had some cases of hung umounts in teuthology testing. It looks like client is waiting for cap flushes to complete, but they aren't. Add a field to the inode to track the highest cap flush tid seen for that inode. Also, add a backpointer to the inode to the ceph_cap_flush struct. Change wait_caps_flush to wait 60s, and then dump info about the condition of the list. Also, print pr_info messages if we end up dropping a FLUSH_ACK for an inode onto the floor, or if we get a message on an unregistered session. Reported-by: Patrick Donnelly URL: https://tracker.ceph.com/issues/51279 Signed-off-by: Jeff Layton --- fs/ceph/caps.c | 17 +++++++++++++++-- fs/ceph/inode.c | 1 + fs/ceph/mds_client.c | 34 ++++++++++++++++++++++++++++++++-- fs/ceph/snap.c | 1 + fs/ceph/super.h | 2 ++ 5 files changed, 51 insertions(+), 4 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 7fb4aae97412..d26edff6dd75 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1945,6 +1945,7 @@ static u64 __mark_caps_flushing(struct inode *inode, swap(cf, ci->i_prealloc_cap_flush); cf->caps = flushing; cf->wake = wake; + cf->ci = ci; spin_lock(&mdsc->cap_dirty_lock); list_del_init(&ci->i_dirty_item); @@ -3819,6 +3820,10 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, bool wake_ci = false; bool wake_mdsc = false; + /* track latest cap flush ack seen for this inode */ + if (flush_tid > ci->i_last_cap_flush_ack) + ci->i_last_cap_flush_ack = flush_tid; + list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { /* Is this the one that was flushed? */ if (cf->tid == flush_tid) @@ -4450,7 +4455,11 @@ void ceph_handle_caps(struct ceph_mds_session *session, session->s_seq, (unsigned)seq); if (!inode) { - doutc(cl, " i don't have ino %llx\n", vino.ino); + if (op == CEPH_CAP_OP_FLUSH_ACK) + pr_info_client(cl, "can't find ino %llx:%llx for flush_ack!\n", + vino.snap, vino.ino); + else + doutc(cl, " i don't have ino %llx\n", vino.ino); switch (op) { case CEPH_CAP_OP_IMPORT: @@ -4505,10 +4514,14 @@ void ceph_handle_caps(struct ceph_mds_session *session, spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds); if (!cap) { - doutc(cl, " no cap on %p ino %llx.%llx from mds%d\n", + doutc(cl, " no cap on %p ino %llx:%llx from mds%d\n", inode, ceph_ino(inode), ceph_snap(inode), session->s_mds); spin_unlock(&ci->i_ceph_lock); + if (op == CEPH_CAP_OP_FLUSH_ACK) + pr_info_client(cl, "no cap on %p ino %llx:%llx from mds%d for flush_ack!\n", + inode, ceph_ino(inode), + ceph_snap(inode), session->s_mds); switch (op) { case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 7b2e77517f23..23057c3fd31e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -624,6 +624,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&ci->i_cap_snaps); ci->i_head_snapc = NULL; ci->i_snap_caps = 0; + ci->i_last_cap_flush_ack = 0; ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ; for (i = 0; i < CEPH_FILE_MODE_BITS; i++) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 3ab9c268a8bb..968918bcefdd 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2291,6 +2291,26 @@ static int check_caps_flush(struct ceph_mds_client *mdsc, return ret; } +static void dump_cap_flushes(struct ceph_mds_client *mdsc, u64 want_tid) +{ + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_cap_flush *cf; + + pr_info_client(cl, "still waiting for cap flushes through %llu:\n", + want_tid); + spin_lock(&mdsc->cap_dirty_lock); + list_for_each_entry(cf, &mdsc->cap_flush_list, g_list) { + if (cf->tid > want_tid) + break; + pr_info_client(cl, "%llx:%llx %s %llu %llu %d%s\n", + ceph_vinop(&cf->ci->netfs.inode), + ceph_cap_string(cf->caps), cf->tid, + cf->ci->i_last_cap_flush_ack, cf->wake, + cf->is_capsnap ? " is_capsnap" : ""); + } + spin_unlock(&mdsc->cap_dirty_lock); +} + /* * flush all dirty inode data to disk. * @@ -2300,11 +2320,19 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_tid) { struct ceph_client *cl = mdsc->fsc->client; + int i = 0; + long ret; doutc(cl, "want %llu\n", want_flush_tid); - wait_event(mdsc->cap_flushing_wq, - check_caps_flush(mdsc, want_flush_tid)); + do { + ret = wait_event_timeout(mdsc->cap_flushing_wq, + check_caps_flush(mdsc, want_flush_tid), 60 * HZ); + if (ret == 0 && ++i < 5) + dump_cap_flushes(mdsc, want_flush_tid); + else if (ret == 1) + pr_info_client(cl, "condition evaluated to true after timeout!\n"); + } while (ret == 0); doutc(cl, "ok, flushed thru %llu\n", want_flush_tid); } @@ -5945,6 +5973,8 @@ static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg) mutex_lock(&mdsc->mutex); if (__verify_registered_session(mdsc, s) < 0) { + pr_info_client(cl, "dropping tid %llu from unregistered session %d\n", + msg->hdr.tid, s->s_mds); mutex_unlock(&mdsc->mutex); goto out; } diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index c65f2b202b2b..d17904221e44 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -595,6 +595,7 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci, ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush"); ihold(inode); + capsnap->cap_flush.ci = ci; capsnap->follows = old_snapc->seq; capsnap->issued = __ceph_caps_issued(ci, NULL); capsnap->dirty = dirty; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index b63b4cd9b5b6..7eedf4d8a6db 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -211,6 +211,7 @@ struct ceph_cap_flush { bool is_capsnap; /* true means capsnap */ struct list_head g_list; // global struct list_head i_list; // per inode + struct ceph_inode_info *ci; }; /* @@ -416,6 +417,7 @@ struct ceph_inode_info { struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or dirty|flushing caps */ unsigned i_snap_caps; /* cap bits for snapped files */ + u64 i_last_cap_flush_ack; /* latest cap flush_ack tid for this inode */ unsigned long i_last_rd; unsigned long i_last_wr; -- 2.47.3