From 30e0b3fafd374b0f471db85364a468cab83935a2 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 7 Jul 2021 09:20:34 -0400 Subject: [PATCH] [DO NOT MERGE] ceph: dump info about cap flushes when we're waiting too long for them We've had some cases of hung umounts in teuthology testing. It looks like client is waiting for cap flushes to complete, but they aren't. Add a field to the inode to track the highest cap flush tid seen for that inode. Also, add a backpointer to the inode to the ceph_cap_flush struct. Change wait_caps_flush to wait 60s, and then dump info about the condition of the list. Also, print pr_info messages if we end up dropping a FLUSH_ACK for an inode onto the floor, or if we get a message on an unregistered session. Reported-by: Patrick Donnelly URL: https://tracker.ceph.com/issues/51279 Signed-off-by: Jeff Layton --- fs/ceph/caps.c | 27 +++++++++++++++++++++------ fs/ceph/inode.c | 1 + fs/ceph/mds_client.c | 34 ++++++++++++++++++++++++++++++++-- fs/ceph/snap.c | 1 + fs/ceph/super.h | 2 ++ 5 files changed, 57 insertions(+), 8 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index a8d8b56cf9d2..5e7c9bd4dfa2 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1932,6 +1932,7 @@ static u64 __mark_caps_flushing(struct inode *inode, swap(cf, ci->i_prealloc_cap_flush); cf->caps = flushing; cf->wake = wake; + cf->ci = ci; spin_lock(&mdsc->cap_dirty_lock); list_del_init(&ci->i_dirty_item); @@ -3825,6 +3826,10 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, bool wake_ci = false; bool wake_mdsc = false; + /* track latest cap flush ack seen for this inode */ + if (flush_tid > ci->i_last_cap_flush_ack) + ci->i_last_cap_flush_ack = flush_tid; + list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { /* Is this the one that was flushed? */ if (cf->tid == flush_tid) @@ -3984,8 +3989,8 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { if (iter->follows == follows) { if (iter->cap_flush.tid != flush_tid) { - doutc(cl, " cap_snap %p follows %lld " - "tid %lld != %lld\n", iter, + pr_info_client(cl, " %llx.%llx cap_snap %p follows %lld " + "tid %lld != %lld\n", ceph_vinop(inode), iter, follows, flush_tid, iter->cap_flush.tid); break; @@ -3993,8 +3998,10 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, capsnap = iter; break; } else { - doutc(cl, " skipping cap_snap %p follows %lld\n", - iter, iter->follows); + pr_info_client(cl, " %llx.%llx skipping cap_snap %p follows %lld, " + "iter->follows %lld tid %lld iter->cap_flush.tid %lld\n", + ceph_vinop(inode), iter, follows, iter->follows, flush_tid, + iter->cap_flush.tid); } } if (capsnap) @@ -4455,7 +4462,11 @@ void ceph_handle_caps(struct ceph_mds_session *session, mutex_lock(&session->s_mutex); if (!inode) { - doutc(cl, " i don't have ino %llx\n", vino.ino); + if (op == CEPH_CAP_OP_FLUSH_ACK) + pr_info_client(cl, "can't find ino %llx:%llx for flush_ack!\n", + vino.snap, vino.ino); + else + doutc(cl, " i don't have ino %llx\n", vino.ino); switch (op) { case CEPH_CAP_OP_IMPORT: @@ -4510,10 +4521,14 @@ void ceph_handle_caps(struct ceph_mds_session *session, spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds); if (!cap) { - doutc(cl, " no cap on %p ino %llx.%llx from mds%d\n", + doutc(cl, " no cap on %p ino %llx:%llx from mds%d\n", inode, ceph_ino(inode), ceph_snap(inode), session->s_mds); spin_unlock(&ci->i_ceph_lock); + if (op == CEPH_CAP_OP_FLUSH_ACK) + pr_info_client(cl, "no cap on %p ino %llx:%llx from mds%d for flush_ack!\n", + inode, ceph_ino(inode), + ceph_snap(inode), session->s_mds); switch (op) { case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 06cd2963e41e..c46fa6ed11c1 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -624,6 +624,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&ci->i_cap_snaps); ci->i_head_snapc = NULL; ci->i_snap_caps = 0; + ci->i_last_cap_flush_ack = 0; ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ; for (i = 0; i < CEPH_FILE_MODE_BITS; i++) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 230e0c3f341f..542fa0c15c51 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2283,6 +2283,26 @@ static int check_caps_flush(struct ceph_mds_client *mdsc, return ret; } +static void dump_cap_flushes(struct ceph_mds_client *mdsc, u64 want_tid) +{ + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_cap_flush *cf; + + pr_info_client(cl, "still waiting for cap flushes through %llu:\n", + want_tid); + spin_lock(&mdsc->cap_dirty_lock); + list_for_each_entry(cf, &mdsc->cap_flush_list, g_list) { + if (cf->tid > want_tid) + break; + pr_info_client(cl, "%llx:%llx %s %llu %llu %d%s\n", + ceph_vinop(&cf->ci->netfs.inode), + ceph_cap_string(cf->caps), cf->tid, + cf->ci->i_last_cap_flush_ack, cf->wake, + cf->is_capsnap ? " is_capsnap" : ""); + } + spin_unlock(&mdsc->cap_dirty_lock); +} + /* * flush all dirty inode data to disk. * @@ -2292,11 +2312,19 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_tid) { struct ceph_client *cl = mdsc->fsc->client; + int i = 0; + long ret; doutc(cl, "want %llu\n", want_flush_tid); - wait_event(mdsc->cap_flushing_wq, - check_caps_flush(mdsc, want_flush_tid)); + do { + ret = wait_event_timeout(mdsc->cap_flushing_wq, + check_caps_flush(mdsc, want_flush_tid), 60 * HZ); + if (ret == 0 && ++i < 5) + dump_cap_flushes(mdsc, want_flush_tid); + else if (ret == 1) + pr_info_client(cl, "condition evaluated to true after timeout!\n"); + } while (ret == 0); doutc(cl, "ok, flushed thru %llu\n", want_flush_tid); } @@ -6228,6 +6256,8 @@ static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg) mutex_lock(&mdsc->mutex); if (__verify_registered_session(mdsc, s) < 0) { + pr_info_client(cl, "dropping tid %llu from unregistered session %d\n", + msg->hdr.tid, s->s_mds); mutex_unlock(&mdsc->mutex); goto out; } diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index c65f2b202b2b..d17904221e44 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -595,6 +595,7 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci, ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush"); ihold(inode); + capsnap->cap_flush.ci = ci; capsnap->follows = old_snapc->seq; capsnap->issued = __ceph_caps_issued(ci, NULL); capsnap->dirty = dirty; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index bb0db0cc8003..4a54fc273ec1 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -212,6 +212,7 @@ struct ceph_cap_flush { bool is_capsnap; /* true means capsnap */ struct list_head g_list; // global struct list_head i_list; // per inode + struct ceph_inode_info *ci; }; /* @@ -417,6 +418,7 @@ struct ceph_inode_info { struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or dirty|flushing caps */ unsigned i_snap_caps; /* cap bits for snapped files */ + u64 i_last_cap_flush_ack; /* latest cap flush_ack tid for this inode */ unsigned long i_last_rd; unsigned long i_last_wr; -- 2.47.3