From 45e409451a2cba3f5806085d168b6b3ba13f208f Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Thu, 9 Jul 2009 15:11:52 -0700
Subject: [PATCH] kclient: assign seq to cap flush; avoid starvation on sync_fs

Assign a seq# to each cap flush.  When we sync_fs, snapshot the current
seq and wait only for cap flushes initiated before the sync, so that
later flushes cannot starve us.  Also, fix wait_unsafe_requests: advance
next_tid past non-write requests instead of looping on them, and wait
only for requests submitted before the sync started.
---
 src/TODO                |  4 +-
 src/kernel/caps.c       | 12 +++---
 src/kernel/inode.c      |  1 +
 src/kernel/mds_client.c | 84 ++++++++++++++++++++++++++++++++---------
 src/kernel/mds_client.h |  8 ++--
 src/kernel/super.h      |  1 +
 6 files changed, 80 insertions(+), 30 deletions(-)

diff --git a/src/TODO b/src/TODO
index f5953997341d4..1cd1538f0f1a6 100644
--- a/src/TODO
+++ b/src/TODO
@@ -79,8 +79,8 @@ repair
 - mds scrubbing
 
 kclient
-- make wait on cap flush smarter
-  - assign a tid to cap flush ops?
+- need a seq# in cap flush to reliably mark things clean
+  - currently we might have: dirty 1, flush 1, dirty 2, flush 2, flush_ack 1 -> mark clean (wrong!)
 - return EBADF on files without caps
 - fix up mds selection, and ESTALE handling
 - make cap import/export efficient
diff --git a/src/kernel/caps.c b/src/kernel/caps.c
index 9e3ef7049a206..5a4f2128acb91 100644
--- a/src/kernel/caps.c
+++ b/src/kernel/caps.c
@@ -1147,9 +1147,11 @@ static void __mark_caps_flushing(struct inode *inode,
 	BUG_ON(list_empty(&ci->i_dirty_item));
 	spin_lock(&mdsc->cap_dirty_lock);
 	if (list_empty(&ci->i_flushing_item)) {
-		dout(20, " inode %p now flushing\n", &ci->vfs_inode);
-		list_add(&ci->i_flushing_item, &session->s_cap_flushing);
+		list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
 		mdsc->num_cap_flushing++;
+		ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
+		dout(20, " inode %p now flushing seq %lld\n", &ci->vfs_inode,
+		     ci->i_cap_flush_seq);
 	}
 	spin_unlock(&mdsc->cap_dirty_lock);
 }
@@ -2089,11 +2091,7 @@ static void handle_cap_flush_ack(struct inode *inode,
 				     struct ceph_inode_info,
 				     i_flushing_item)->vfs_inode);
 		mdsc->num_cap_flushing--;
-		if (!mdsc->num_cap_flushing)
-			wake_up(&mdsc->cap_flushing_wq);
-		else
-			dout(20, " still %d caps flushing\n",
-			     mdsc->num_cap_flushing);
+		wake_up(&mdsc->cap_flushing_wq);
 		dout(20, " inode %p now !flushing\n", inode);
 		if (!new_dirty) {
 			dout(20, " inode %p now clean\n", inode);
diff --git a/src/kernel/inode.c b/src/kernel/inode.c
index 458f69083d4d0..56fe7710dd6e6 100644
--- a/src/kernel/inode.c
+++ b/src/kernel/inode.c
@@ -275,6 +275,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 	ci->i_flushing_caps = 0;
 	INIT_LIST_HEAD(&ci->i_dirty_item);
 	INIT_LIST_HEAD(&ci->i_flushing_item);
+	ci->i_cap_flush_seq = 0;
 	init_waitqueue_head(&ci->i_cap_wq);
 	ci->i_hold_caps_min = 0;
 	ci->i_hold_caps_max = 0;
diff --git a/src/kernel/mds_client.c b/src/kernel/mds_client.c
index bc9f0b62c587d..648fbcaab000c 100644
--- a/src/kernel/mds_client.c
+++ b/src/kernel/mds_client.c
@@ -2449,6 +2449,7 @@ void ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
 	spin_lock_init(&mdsc->cap_delay_lock);
 	INIT_LIST_HEAD(&mdsc->snap_flush_list);
 	spin_lock_init(&mdsc->snap_flush_lock);
+	mdsc->cap_flush_seq = 0;
 	INIT_LIST_HEAD(&mdsc->cap_dirty);
 	mdsc->num_cap_flushing = 0;
 	spin_lock_init(&mdsc->cap_dirty_lock);
@@ -2524,58 +2525,105 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
 }
 
 /*
- * sync - flush all dirty inode data to disk
+ * sync - flush all dirty inode data to disk.
+ *
+ * returns true if we've flushed through want_flush_seq
  */
-static int are_no_sync_caps(struct ceph_mds_client *mdsc)
+static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
 {
-	int num;
-	spin_lock(&mdsc->cap_dirty_lock);
-	num = mdsc->num_cap_flushing;
-	spin_unlock(&mdsc->cap_dirty_lock);
-	dout(20, "are_no_sync_caps = %d\n", num);
-	return num == 0;
+	int mds, ret = 1;
+
+	dout(10, "check_cap_flush want %lld\n", want_flush_seq);
+	mutex_lock(&mdsc->mutex);
+	for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
+		struct ceph_mds_session *session = mdsc->sessions[mds];
+
+		if (!session)
+			continue;
+		get_session(session);
+		mutex_unlock(&mdsc->mutex);
+
+		mutex_lock(&session->s_mutex);
+		if (!list_empty(&session->s_cap_flushing)) {
+			struct ceph_inode_info *ci =
+				list_entry(session->s_cap_flushing.next,
+					   struct ceph_inode_info,
+					   i_flushing_item);
+			struct inode *inode = &ci->vfs_inode;
+
+			spin_lock(&inode->i_lock);
+			if (ci->i_cap_flush_seq <= want_flush_seq) {
+				dout(10, "check_cap_flush still flushing %p "
+				     "seq %lld <= %lld to mds%d\n", inode,
+				     ci->i_cap_flush_seq, want_flush_seq,
+				     session->s_mds);
+				ret = 0;
+			}
+			spin_unlock(&inode->i_lock);
+		}
+		mutex_unlock(&session->s_mutex);
+		ceph_put_mds_session(session);
+
+		if (!ret)
+			return ret;
+		mutex_lock(&mdsc->mutex);
+	}
+
+	mutex_unlock(&mdsc->mutex);
+	dout(10, "check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
+	return ret;
 }
 
 /*
  * wait for all write mds requests to flush.
  */
-static void wait_unsafe_requests(struct ceph_mds_client *mdsc)
+static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
 {
 	struct ceph_mds_request *req;
-	u64 last_tid, next_tid;
+	u64 next_tid = 0;
 	int got;
 
 	mutex_lock(&mdsc->mutex);
-	last_tid = mdsc->last_tid;
-	dout(10, "wait_unsafe_requests last is %lld\n", last_tid);
+	dout(10, "wait_unsafe_requests want %lld\n", want_tid);
 	while (1) {
 		got = radix_tree_gang_lookup(&mdsc->request_tree, (void **)&req,
					     next_tid, 1);
 		if (!got)
 			break;
-		if (req->r_tid > last_tid)
+		if (req->r_tid > want_tid)
 			break;
+
+		next_tid = req->r_tid + 1;
 		if ((req->r_op & CEPH_MDS_OP_WRITE) == 0)
 			continue;  /* not a write op */
-		next_tid = req->r_tid + 1;
 		ceph_mdsc_get_request(req);
 		mutex_unlock(&mdsc->mutex);
-		dout(10, "wait_unsafe_requests wait on %llu (last is %llu)\n",
-		     req->r_tid, last_tid);
+		dout(10, "wait_unsafe_requests wait on %llu (want %llu)\n",
+		     req->r_tid, want_tid);
		wait_for_completion(&req->r_safe_completion);
 		mutex_lock(&mdsc->mutex);
 		ceph_mdsc_put_request(req);
 	}
 	mutex_unlock(&mdsc->mutex);
+	dout(10, "wait_unsafe_requests done\n");
 }
 
 void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
+	u64 want_tid, want_flush;
+
 	dout(10, "sync\n");
+	mutex_lock(&mdsc->mutex);
+	want_tid = mdsc->last_tid;
+	want_flush = mdsc->cap_flush_seq;
+	mutex_unlock(&mdsc->mutex);
+	dout(10, "sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
+
 	ceph_check_delayed_caps(mdsc, 1);
-	wait_unsafe_requests(mdsc);
-	wait_event(mdsc->cap_flushing_wq, are_no_sync_caps(mdsc));
+
+	wait_unsafe_requests(mdsc, want_tid);
+	wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
 }
diff --git a/src/kernel/mds_client.h b/src/kernel/mds_client.h
index c3dd837d10e29..934ec412fa60c 100644
--- a/src/kernel/mds_client.h
+++ b/src/kernel/mds_client.h
@@ -264,9 +264,11 @@ struct ceph_mds_client {
 	spinlock_t        cap_delay_lock;   /* protects cap_delay_list */
 	struct list_head  snap_flush_list;  /* cap_snaps ready to flush */
 	spinlock_t        snap_flush_lock;
-	struct list_head  cap_dirty;        /* inodes with dirty caps */
-	int               num_cap_flushing; /* # caps we are flushing */
-	spinlock_t        cap_dirty_lock;
+
+	u64               cap_flush_seq;
+	struct list_head  cap_dirty;        /* inodes with dirty caps */
+	int               num_cap_flushing; /* # caps we are flushing */
+	spinlock_t        cap_dirty_lock;   /* protects above items */
 	wait_queue_head_t cap_flushing_wq;
 
 	struct dentry *debugfs_file;
diff --git a/src/kernel/super.h b/src/kernel/super.h
index bdf8dcb3760f9..e5e8b2d3f7cfc 100644
--- a/src/kernel/super.h
+++ b/src/kernel/super.h
@@ -308,6 +308,7 @@ struct ceph_inode_info {
 	struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
 	unsigned i_dirty_caps, i_flushing_caps;  /* mask of dirtied fields */
 	struct list_head i_dirty_item, i_flushing_item;
+	u64 i_cap_flush_seq;
 	wait_queue_head_t i_cap_wq;      /* threads waiting on a capability */
 	unsigned long i_hold_caps_min;   /* jiffies */
 	unsigned long i_hold_caps_max;   /* jiffies */
-- 
2.39.5
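
A minimal user-space sketch of the bookkeeping above, for readers outside the
kernel tree: each inode is stamped with a monotonically increasing
cap_flush_seq when it starts flushing, sync snapshots the counter, and the
sync is done once every seq at or below the snapshot has been acked.  All
names here (model_mdsc, mark_flushing, and so on) are hypothetical stand-ins
and the model is single-threaded; it is not the kernel code, just an
illustration of the idea under those assumptions.

/*
 * Single-threaded model of seq-based cap flush tracking (hypothetical names).
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_FLUSHING 16

struct model_inode {
	int ino;
	unsigned long long cap_flush_seq;  /* seq assigned when flush started */
};

struct model_mdsc {
	unsigned long long cap_flush_seq;            /* last seq handed out */
	struct model_inode *flushing[MAX_FLUSHING];  /* FIFO of flushing inodes */
	int nr_flushing;
};

/* like __mark_caps_flushing(): stamp the inode, append to the flushing list */
static void mark_flushing(struct model_mdsc *mdsc, struct model_inode *ci)
{
	ci->cap_flush_seq = ++mdsc->cap_flush_seq;
	mdsc->flushing[mdsc->nr_flushing++] = ci;
	printf("inode %d now flushing, seq %llu\n", ci->ino, ci->cap_flush_seq);
}

/* like handle_cap_flush_ack(): the oldest flush completed, drop it */
static void flush_ack(struct model_mdsc *mdsc)
{
	int i;

	if (!mdsc->nr_flushing)
		return;
	printf("flush_ack for inode %d (seq %llu)\n",
	       mdsc->flushing[0]->ino, mdsc->flushing[0]->cap_flush_seq);
	for (i = 1; i < mdsc->nr_flushing; i++)
		mdsc->flushing[i - 1] = mdsc->flushing[i];
	mdsc->nr_flushing--;
}

/* like check_cap_flush(): true once every seq <= want_flush_seq is acked */
static bool flushed_through(struct model_mdsc *mdsc,
			    unsigned long long want_flush_seq)
{
	return mdsc->nr_flushing == 0 ||
	       mdsc->flushing[0]->cap_flush_seq > want_flush_seq;
}

int main(void)
{
	struct model_mdsc mdsc = { 0 };
	struct model_inode a = { .ino = 1 }, b = { .ino = 2 }, c = { .ino = 3 };
	unsigned long long want;

	mark_flushing(&mdsc, &a);
	mark_flushing(&mdsc, &b);

	/* sync starts: snapshot the seq; only seqs 1..want must complete */
	want = mdsc.cap_flush_seq;
	printf("sync wants flush_seq %llu\n", want);

	/* a flush started after the snapshot must not hold up the sync */
	mark_flushing(&mdsc, &c);

	flush_ack(&mdsc);                    /* inode 1 done */
	printf("flushed through %llu? %s\n", want,
	       flushed_through(&mdsc, want) ? "yes" : "no");   /* no */
	flush_ack(&mdsc);                    /* inode 2 done */
	printf("flushed through %llu? %s\n", want,
	       flushed_through(&mdsc, want) ? "yes" : "no");   /* yes */
	return 0;
}

Because __mark_caps_flushing() uses list_add_tail(), each session's
s_cap_flushing list stays in seq order, so check_cap_flush() only needs to
inspect the head entry per session; in the model above, inode 3, which
started flushing after the sync began, never blocks the sync.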