git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
kclient: maintain per-cap-bit tid map to fix pipelined cap updates
author Sage Weil <sage@newdream.net>
Thu, 30 Jul 2009 22:08:44 +0000 (15:08 -0700)
committer Sage Weil <sage@newdream.net>
Thu, 30 Jul 2009 22:25:03 +0000 (15:25 -0700)
We want to allow pipelined cap updates, like

 client->mds  writeback Fw 1
dirty FwAx
 client->mds  writeback FwAx 2
dirty Ax
 client->mds  writeback Ax 3
 mds->client  ack 1
 mds->client  ack 2
 mds->client  ack 3

We need to make sure that the Fw bit is only cleaned after ack 2,
and Ax only after ack 3.  A single tid for the inode isn't
sufficient: the inode would remember only the most recent tid (3),
so ack 2 would not match and would be ignored.  We need a tid per
cap bit so we can pipeline writeback of different caps.

Note that we can't simply write back dirty | flushing caps every
time, since the write may also be releasing the cap.  And it would
gum up the MDS locking.

Move the last_tid from the session to the inode, and keep only a
16-bit tid per cap bit.  That's 17*2 bytes (one last_tid plus one
tid for each of the 16 cap bits), vs the old 16.  Could be worse.
An 8-bit tid is probably also sufficient (that's 256 pipelined
writes) if we're concerned about inode size down the road.

src/include/ceph_fs.h
src/kernel/caps.c
src/kernel/inode.c
src/kernel/mds_client.c
src/kernel/mds_client.h
src/kernel/super.h

index e617c2fd9511339633e3fc6aad9843d90d6fe14f..c53999d40e99fc0604365320fe173459b01af890 100644 (file)
@@ -660,6 +660,8 @@ static inline int ceph_flags_to_mode(int flags)
 #define CEPH_CAP_SXATTR     6
 #define CEPH_CAP_SFILE      8   /* goes at the end (uses >2 cap bits) */
 
+#define CEPH_CAP_BITS       16
+
 /* composed values */
 #define CEPH_CAP_AUTH_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SAUTH)
 #define CEPH_CAP_AUTH_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SAUTH)
index 933e34c2c3c2152debdff2862423feabe8bc8b91..d49f54a3c997647c494786f7284f97581b4703db 100644 (file)
@@ -1001,6 +1001,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
        u64 xattr_version = 0;
        int delayed = 0;
        u64 flush_tid = 0;
+       int i;
 
        dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
             inode, cap, cap->session,
@@ -1038,12 +1039,16 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
        if (flushing) {
                /*
                 * assign a tid for flush operations so we can avoid
-                *  flush1 -> dirty1 -> flush2 -> flushack1 -> mark clean
-                * type races.
+                * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
+                * clean type races.  track latest tid for every bit
+                * so we can handle flush AxFw, flush Fw, and have the
+                * first ack clean Ax.
                 */
-               flush_tid = ++cap->session->s_cap_flush_tid;
-               ci->i_cap_flush_tid = flush_tid;
-               dout(" cap_flush_tid %lld\n", flush_tid);
+               flush_tid = ++ci->i_cap_flush_last_tid;
+               dout(" cap_flush_tid %d\n", (int)flush_tid);
+               for (i = 0; i < CEPH_CAP_BITS; i++)
+                       if (flushing & (1 << i))
+                               ci->i_cap_flush_tid[i] = flush_tid;
        }
 
        keep = cap->implemented;
@@ -1159,7 +1164,7 @@ retry:
                        goto retry;
                }
 
-               capsnap->flush_tid = ++session->s_cap_flush_tid;
+               capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
                atomic_inc(&capsnap->nref);
                if (!list_empty(&capsnap->flushing_item))
                        list_del_init(&capsnap->flushing_item);
@@ -2178,43 +2183,51 @@ static void handle_cap_flush_ack(struct inode *inode,
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
        unsigned seq = le32_to_cpu(m->seq);
-       int cleaned = le32_to_cpu(m->dirty);
+       int dirty = le32_to_cpu(m->dirty);
+       int cleaned = 0;
        u64 flush_tid = le64_to_cpu(m->client_tid);
        int old_dirty = 0, new_dirty = 0;
+       int i;
 
-       dout("handle_cap_flush_ack inode %p mds%d seq %d cleaned %s,"
+       for (i = 0; i < CEPH_CAP_BITS; i++)
+               if ((dirty & (1 << i)) &&
+                   flush_tid == ci->i_cap_flush_tid[i])
+                       cleaned |= 1 << i;
+
+       dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
             " flushing %s -> %s\n",
-            inode, session->s_mds, seq, ceph_cap_string(cleaned),
-            ceph_cap_string(ci->i_flushing_caps),
+            inode, session->s_mds, seq, ceph_cap_string(dirty),
+            ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
             ceph_cap_string(ci->i_flushing_caps & ~cleaned));
-       if (flush_tid != ci->i_cap_flush_tid) {
-               dout(" flush_tid %lld != my flush_tid %lld, ignoring\n",
-                    flush_tid, ci->i_cap_flush_tid);
-       } else {
-               old_dirty = ci->i_dirty_caps | ci->i_flushing_caps;
-               ci->i_flushing_caps &= ~cleaned;
-               new_dirty = ci->i_dirty_caps | ci->i_flushing_caps;
-               if (old_dirty) {
-                       spin_lock(&mdsc->cap_dirty_lock);
-                       list_del_init(&ci->i_flushing_item);
-                       if (!list_empty(&session->s_cap_flushing))
-                               dout(" mds%d still flushing cap on %p\n",
-                                    session->s_mds,
-                                    &list_entry(session->s_cap_flushing.next,
-                                                struct ceph_inode_info,
-                                                i_flushing_item)->vfs_inode);
-                       mdsc->num_cap_flushing--;
-                       wake_up(&mdsc->cap_flushing_wq);
-                       dout(" inode %p now !flushing\n", inode);
-                       if (!new_dirty) {
-                               dout(" inode %p now clean\n", inode);
-                               list_del_init(&ci->i_dirty_item);
-                       }
-                       spin_unlock(&mdsc->cap_dirty_lock);
-                       wake_up(&ci->i_cap_wq);
-               }
+
+       if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
+               goto out;
+
+       old_dirty = ci->i_dirty_caps | ci->i_flushing_caps;
+       ci->i_flushing_caps &= ~cleaned;
+       new_dirty = ci->i_dirty_caps | ci->i_flushing_caps;
+
+       spin_lock(&mdsc->cap_dirty_lock);
+       if (ci->i_flushing_caps == 0) {
+               list_del_init(&ci->i_flushing_item);
+               if (!list_empty(&session->s_cap_flushing))
+                       dout(" mds%d still flushing cap on %p\n",
+                            session->s_mds,
+                            &list_entry(session->s_cap_flushing.next,
+                                        struct ceph_inode_info,
+                                        i_flushing_item)->vfs_inode);
+               mdsc->num_cap_flushing--;
+               wake_up(&mdsc->cap_flushing_wq);
+               dout(" inode %p now !flushing\n", inode);
+       }
+       if (old_dirty && !new_dirty) {
+               dout(" inode %p now clean\n", inode);
+               list_del_init(&ci->i_dirty_item);
        }
+       spin_unlock(&mdsc->cap_dirty_lock);
+       wake_up(&ci->i_cap_wq);
 
+out:
        spin_unlock(&inode->i_lock);
        if (old_dirty && !new_dirty)
                iput(inode);
index 7a0c578f15aa9b3049b52e43d4f47696d5dc1e92..62e238a3bb90de8940c3fcc430b96fc5ecfa86cc 100644 (file)
@@ -292,6 +292,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
        INIT_LIST_HEAD(&ci->i_dirty_item);
        INIT_LIST_HEAD(&ci->i_flushing_item);
        ci->i_cap_flush_seq = 0;
+       ci->i_cap_flush_last_tid = 0;
+       memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
        init_waitqueue_head(&ci->i_cap_wq);
        ci->i_hold_caps_min = 0;
        ci->i_hold_caps_max = 0;
index 337a9bdc40948c03992f0b38bef8137606e257f7..7417d3e80825a99061ec956b9c2ffeb7d82aa87e 100644 (file)
@@ -336,7 +336,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
        INIT_LIST_HEAD(&s->s_cap_releases_done);
        INIT_LIST_HEAD(&s->s_cap_flushing);
        INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
-       s->s_cap_flush_tid = 0;
 
        dout("register_session mds%d\n", mds);
        if (mds >= mdsc->max_sessions) {
index a8f8deee92084fc3f151149fc0dd0c59ff00d811..2361f461083d302ae8a267ee36d119a43c545bf1 100644 (file)
@@ -108,7 +108,6 @@ struct ceph_mds_session {
        /* protected by mutex */
        struct list_head  s_cap_flushing;     /* inodes w/ flushing caps */
        struct list_head  s_cap_snaps_flushing;
-       u64               s_cap_flush_tid;
        unsigned long     s_renew_requested; /* last time we sent a renew req */
 
        atomic_t          s_ref;
index 6de97adc8a66dd47730f15d5f897975af359086d..388a64d6066b4182cbba93587dc3782da146d67e 100644 (file)
@@ -311,7 +311,11 @@ struct ceph_inode_info {
        struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
        unsigned i_dirty_caps, i_flushing_caps;     /* mask of dirtied fields */
        struct list_head i_dirty_item, i_flushing_item;
-       u64 i_cap_flush_seq, i_cap_flush_tid;
+       u64 i_cap_flush_seq;
+       /* we need to track cap writeback on a per-cap-bit basis, to allow
+        * overlapping, pipelined cap flushes to the mds.  we can probably
+        * reduce the tid to 8 bits if we're concerned about inode size. */
+       u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
        wait_queue_head_t i_cap_wq;      /* threads waiting on a capability */
        unsigned long i_hold_caps_min; /* jiffies */
        unsigned long i_hold_caps_max; /* jiffies */