kclient: assign seq to cap flush; avoid starvation on sync_fs
author    Sage Weil <sage@newdream.net>
          Thu, 9 Jul 2009 22:11:52 +0000 (15:11 -0700)
committer Sage Weil <sage@newdream.net>
          Thu, 9 Jul 2009 22:11:52 +0000 (15:11 -0700)
Assign a seq# to each cap flush.  When we sync_fs, wait only for
prior cap flushes to complete.

Also, fix wait_unsafe_requests.
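
In outline (a minimal sketch of the scheme, with the locking and per-session
bookkeeping from the patch below stripped out; wait_for_flushes_through() is
a hypothetical placeholder for the real wait_event() on check_cap_flush()):

/*
 * Minimal sketch, simplified from the patch below.
 */
struct cap_flush {
	u64 seq;
};

static u64 cap_flush_seq;               /* monotonically increasing */

static void start_flush(struct cap_flush *cf)
{
	cf->seq = ++cap_flush_seq;      /* stamp each flush when queued */
}

static void sync_fs_caps(void)
{
	u64 want = cap_flush_seq;       /* snapshot first; flushes started
					 * later get larger seqs and are
					 * deliberately not waited for */
	wait_for_flushes_through(want); /* hypothetical wait helper */
}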

src/TODO
src/kernel/caps.c
src/kernel/inode.c
src/kernel/mds_client.c
src/kernel/mds_client.h
src/kernel/super.h

index f5953997341d4d7faa170038e6c3f0b4484ea7dc..1cd1538f0f1a63ec1c0f514b62c95f55efdb6dad 100644
--- a/src/TODO
+++ b/src/TODO
@@ -79,8 +79,8 @@ repair
 - mds scrubbing
 
 kclient
-- make wait on cap flush smarter
-  - assign a tid to cap flush ops?
+- need a seq# in cap flush to reliably mark things clean
+  - currently we might have: dirty 1, flush 1, dirty 2, flush 2, flush_ack 1 -> mark clean (wrong!)
 - return EBADF on files without caps
 - fix up mds selection, and ESTALE handling
 - make cap import/export efficient
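
Spelling out the TODO's interleaving as a timeline (illustrative only, not
part of the commit):

/*
 * The race from the TODO entry above, step by step:
 *
 *   dirty 1      inode dirtied, marked flushing
 *   flush 1      flush sent to the MDS
 *   dirty 2      inode dirtied again while flush 1 is in flight
 *   flush 2      second flush sent
 *   flush_ack 1  without a seq# this ack is indistinguishable from the
 *                completion of *all* flushing state, so the inode is
 *                marked clean while flush 2 is still outstanding (wrong!)
 *
 * With a per-flush seq#, the ack for seq 1 retires only seq 1; the
 * inode stays flushing until the ack for seq 2 arrives.
 */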
index 9e3ef7049a2069ceea9fe0ea31c64f53e9b50aee..5a4f2128acb91a9afe85fd8aba17ceb2de91253d 100644
--- a/src/kernel/caps.c
+++ b/src/kernel/caps.c
@@ -1147,9 +1147,11 @@ static void __mark_caps_flushing(struct inode *inode,
        BUG_ON(list_empty(&ci->i_dirty_item));
        spin_lock(&mdsc->cap_dirty_lock);
        if (list_empty(&ci->i_flushing_item)) {
-               dout(20, " inode %p now flushing\n", &ci->vfs_inode);
-               list_add(&ci->i_flushing_item, &session->s_cap_flushing);
+               list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
                mdsc->num_cap_flushing++;
+               ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
+               dout(20, " inode %p now flushing seq %lld\n", &ci->vfs_inode,
+                    ci->i_cap_flush_seq);
        }
        spin_unlock(&mdsc->cap_dirty_lock);
 }
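
Note the switch from list_add() to list_add_tail(): the seq is assigned in
the same cap_dirty_lock critical section that appends the inode, so
s_cap_flushing stays sorted by i_cap_flush_seq and the head of the list
always holds the oldest outstanding seq. A hypothetical debug helper (not
in this commit) makes that invariant explicit:

/*
 * Hypothetical debug helper (not part of the patch): check that
 * s_cap_flushing is ordered by i_cap_flush_seq.  check_cap_flush()
 * in mds_client.c depends on this when it looks only at the head of
 * each session's list.  Caller holds session->s_mutex.
 */
static void assert_flushing_ordered(struct ceph_mds_session *session)
{
	struct ceph_inode_info *ci;
	u64 prev = 0;

	list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
		BUG_ON(ci->i_cap_flush_seq < prev);
		prev = ci->i_cap_flush_seq;
	}
}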
@@ -2089,11 +2091,7 @@ static void handle_cap_flush_ack(struct inode *inode,
                                         struct ceph_inode_info,
                                         i_flushing_item)->vfs_inode);
                mdsc->num_cap_flushing--;
-               if (!mdsc->num_cap_flushing)
-                       wake_up(&mdsc->cap_flushing_wq);
-               else
-                       dout(20, " still %d caps flushing\n",
-                            mdsc->num_cap_flushing);
+               wake_up(&mdsc->cap_flushing_wq);
                dout(20, " inode %p now !flushing\n", inode);
                if (!new_dirty) {
                        dout(20, " inode %p now clean\n", inode);
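
The ack path now wakes cap_flushing_wq unconditionally: waiters no longer
test the global count, so any completed flush may be the one a seq-based
waiter is sleeping on. The pairing, roughly (simplified, not verbatim from
the patch):

/* ack side: every completed flush may satisfy some waiter's seq */
wake_up(&mdsc->cap_flushing_wq);

/* sync side: each waiter re-checks its own seq-based predicate */
wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));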
index 458f69083d4d0cfc2bf90be54e5bbf099f811329..56fe7710dd6e63179dd3cc027b8f1a4ebba20e04 100644
--- a/src/kernel/inode.c
+++ b/src/kernel/inode.c
@@ -275,6 +275,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
        ci->i_flushing_caps = 0;
        INIT_LIST_HEAD(&ci->i_dirty_item);
        INIT_LIST_HEAD(&ci->i_flushing_item);
+       ci->i_cap_flush_seq = 0;
        init_waitqueue_head(&ci->i_cap_wq);
        ci->i_hold_caps_min = 0;
        ci->i_hold_caps_max = 0;
index bc9f0b62c587df9077ee698f3f77bce1ac27c47c..648fbcaab000c22d4578227c6f4222b6e6df677b 100644
--- a/src/kernel/mds_client.c
+++ b/src/kernel/mds_client.c
@@ -2449,6 +2449,7 @@ void ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
        spin_lock_init(&mdsc->cap_delay_lock);
        INIT_LIST_HEAD(&mdsc->snap_flush_list);
        spin_lock_init(&mdsc->snap_flush_lock);
+       mdsc->cap_flush_seq = 0;
        INIT_LIST_HEAD(&mdsc->cap_dirty);
        mdsc->num_cap_flushing = 0;
        spin_lock_init(&mdsc->cap_dirty_lock);
@@ -2524,58 +2525,105 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
 }
 
 /*
- * sync - flush all dirty inode data to disk
+ * sync - flush all dirty inode data to disk.
+ *
+ * returns true if we've flushed through want_flush_seq
  */
-static int are_no_sync_caps(struct ceph_mds_client *mdsc)
+static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
 {
-       int num;
-       spin_lock(&mdsc->cap_dirty_lock);
-       num = mdsc->num_cap_flushing;
-       spin_unlock(&mdsc->cap_dirty_lock);
-       dout(20, "are_no_sync_caps = %d\n", num);
-       return num == 0;
+       int mds, ret = 1;
+
+       dout(10, "check_cap_flush want %lld\n", want_flush_seq);
+       mutex_lock(&mdsc->mutex);
+       for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
+               struct ceph_mds_session *session = mdsc->sessions[mds];
+
+               if (!session)
+                       continue;
+               get_session(session);
+               mutex_unlock(&mdsc->mutex);
+
+               mutex_lock(&session->s_mutex);
+               if (!list_empty(&session->s_cap_flushing)) {
+                       struct ceph_inode_info *ci =
+                               list_entry(session->s_cap_flushing.next,
+                                          struct ceph_inode_info,
+                                          i_flushing_item);
+                       struct inode *inode = &ci->vfs_inode;
+
+                       spin_lock(&inode->i_lock);
+                       if (ci->i_cap_flush_seq <= want_flush_seq) {
+                               dout(10, "check_cap_flush still flushing %p "
+                                    "seq %lld <= %lld to mds%d\n", inode,
+                                    ci->i_cap_flush_seq, want_flush_seq,
+                                    session->s_mds);
+                               ret = 0;
+                       }
+                       spin_unlock(&inode->i_lock);
+               }
+               mutex_unlock(&session->s_mutex);
+               ceph_put_mds_session(session);
+
+               if (!ret)
+                       return ret;
+               mutex_lock(&mdsc->mutex);
+       }
+
+       mutex_unlock(&mdsc->mutex);
+       dout(10, "check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
+       return ret;
 }
 
 /*
  * wait for all write mds requests to flush.
  */
-static void wait_unsafe_requests(struct ceph_mds_client *mdsc)
+static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
 {
        struct ceph_mds_request *req;
-       u64 last_tid, next_tid;
+       u64 next_tid = 0;
        int got;
 
        mutex_lock(&mdsc->mutex);
-       last_tid = mdsc->last_tid;
-       dout(10, "wait_unsafe_requests last is %lld\n", last_tid);
+       dout(10, "wait_unsafe_requests want %lld\n", want_tid);
        while (1) {
                got = radix_tree_gang_lookup(&mdsc->request_tree, (void **)&req,
                                             next_tid, 1);
                if (!got)
                        break;
-               if (req->r_tid > last_tid)
+               if (req->r_tid > want_tid)
                        break;
+
+               next_tid = req->r_tid + 1;
                if ((req->r_op & CEPH_MDS_OP_WRITE) == 0)
                        continue;  /* not a write op */
 
-               next_tid = req->r_tid + 1;
                ceph_mdsc_get_request(req);
                mutex_unlock(&mdsc->mutex);
-               dout(10, "wait_unsafe_requests  wait on %llu (last is %llu)\n",
-                    req->r_tid, last_tid);
+               dout(10, "wait_unsafe_requests  wait on %llu (want %llu)\n",
+                    req->r_tid, want_tid);
                wait_for_completion(&req->r_safe_completion);
                mutex_lock(&mdsc->mutex);
                ceph_mdsc_put_request(req);
        }
        mutex_unlock(&mdsc->mutex);
+       dout(10, "wait_unsafe_requests done\n");
 }
 
 void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
+       u64 want_tid, want_flush;
+
        dout(10, "sync\n");
+       mutex_lock(&mdsc->mutex);
+       want_tid = mdsc->last_tid;
+       want_flush = mdsc->cap_flush_seq;
+       mutex_unlock(&mdsc->mutex);
+       dout(10, "sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
+
        ceph_check_delayed_caps(mdsc, 1);
-       wait_unsafe_requests(mdsc);
-       wait_event(mdsc->cap_flushing_wq, are_no_sync_caps(mdsc));
+
+       wait_unsafe_requests(mdsc, want_tid);
+       wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
 }
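
Why this avoids starvation, old predicate vs. new (a sketch;
oldest_flushing_seq() is a hypothetical stand-in for check_cap_flush()'s
walk of each session's list head, not a real function in the patch):

static int old_done(struct ceph_mds_client *mdsc)
{
	/* a steady writer keeps this nonzero forever -> sync starves */
	return mdsc->num_cap_flushing == 0;
}

static int new_done(struct ceph_mds_client *mdsc, u64 want_flush_seq)
{
	/* flushes started after the snapshot have seq > want_flush_seq
	 * and are ignored, so this becomes true after finitely many acks */
	return oldest_flushing_seq(mdsc) > want_flush_seq;
}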
 
 
index c3dd837d10e2964900a9d58675098eeb37042870..934ec412fa60c36e78cf86ceb47d152be0e7be23 100644
--- a/src/kernel/mds_client.h
+++ b/src/kernel/mds_client.h
@@ -264,9 +264,11 @@ struct ceph_mds_client {
        spinlock_t       cap_delay_lock;   /* protects cap_delay_list */
        struct list_head snap_flush_list;  /* cap_snaps ready to flush */
        spinlock_t       snap_flush_lock;
-       struct list_head cap_dirty;        /* inodes with dirty caps */
-       int num_cap_flushing;              /* # caps we are flushing */
-       spinlock_t       cap_dirty_lock;
+
+       u64               cap_flush_seq;
+       struct list_head  cap_dirty;        /* inodes with dirty caps */
+       int               num_cap_flushing; /* # caps we are flushing */
+       spinlock_t        cap_dirty_lock;   /* protects above items */
        wait_queue_head_t cap_flushing_wq;
 
        struct dentry           *debugfs_file;
index bdf8dcb3760f978d07ea521fe78f764b8ce409df..e5e8b2d3f7cfcf161f625f1f8013c2f4c5c556ef 100644
--- a/src/kernel/super.h
+++ b/src/kernel/super.h
@@ -308,6 +308,7 @@ struct ceph_inode_info {
        struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
        unsigned i_dirty_caps, i_flushing_caps;     /* mask of dirtied fields */
        struct list_head i_dirty_item, i_flushing_item;
+       u64 i_cap_flush_seq;
        wait_queue_head_t i_cap_wq;      /* threads waiting on a capability */
        unsigned long i_hold_caps_min; /* jiffies */
        unsigned long i_hold_caps_max; /* jiffies */