kclient: revamp fsync

author Sage Weil <sage@newdream.net>

Wed, 5 Aug 2009 18:38:32 +0000 (11:38 -0700)

committer Sage Weil <sage@newdream.net>

Wed, 5 Aug 2009 18:38:32 +0000 (11:38 -0700)
author Sage Weil <sage@newdream.net>
Wed, 5 Aug 2009 18:38:32 +0000 (11:38 -0700)
committer Sage Weil <sage@newdream.net>
Wed, 5 Aug 2009 18:38:32 +0000 (11:38 -0700)
diff --git a/src/TODO b/src/TODO

index fa75a27f6790553d99d41d1698f21e2e222abccb..51cf5ac28b8bd6c6ef5229aee43fa7f41d410bdc 100644 (file)
--- a/src/TODO
+++ b/src/TODO
@@ -48,11 +48,11 @@ v0.11
  v0.12
  - mapping_set_error on failed writepage
  - document correct debugfs mount point
-- clean up layout ioctls
-- fix bad kmalloc
-- use mempool for write path allocations where appropriate
-- fixed bug with cap, snap writeback
-
+- simplify layout/striping ioctls
+- removed bad kmalloc in writepages
+- use mempools for writeback allocations where appropriate (*)
+- fixed a problem with capability, snap metadata writeback
+- cleaned up f(data)sync wrt metadata writeback
  
  - osdmap: allow explicit pg 'override' mappings
  - http gw
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h

index c53999d40e99fc0604365320fe173459b01af890..45e2f14a7ba8f70a1bb79e170538ac7643afc050 100644 (file)
--- a/src/include/ceph_fs.h
+++ b/src/include/ceph_fs.h
@@ -709,7 +709,8 @@ static inline int ceph_flags_to_mode(int flags)
                            CEPH_CAP_LINK_EXCL |         \
                            CEPH_CAP_XATTR_EXCL |        \
                            CEPH_CAP_FILE_EXCL)
-#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER)
+#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |        \
+                             CEPH_CAP_FILE_EXCL)
  #define CEPH_CAP_ANY_WR   (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
  #define CEPH_CAP_ANY      (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
                            CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN)
diff --git a/src/kernel/caps.c b/src/kernel/caps.c

index d49f54a3c997647c494786f7284f97581b4703db..cf964a990e1c1909397eaadd3344c7d1d9827326 100644 (file)
--- a/src/kernel/caps.c
+++ b/src/kernel/caps.c
@@ -978,7 +978,8 @@ void ceph_queue_caps_release(struct inode *inode)
   * caller should hold snap_rwsem (read), s_mutex.
   */
  static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
-                     int op, int used, int want, int retain, int flushing)
+                     int op, int used, int want, int retain, int flushing,
+                     unsigned *pflush_tid)
         __releases(cap->ci->vfs_inode->i_lock)
  {
         struct ceph_inode_info *ci = cap->ci;
@@ -1045,6 +1046,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
                  * first ack clean Ax.
                  */
                 flush_tid = ++ci->i_cap_flush_last_tid;
+               if (pflush_tid)
+                       *pflush_tid = flush_tid;
                 dout(" cap_flush_tid %d\n", (int)flush_tid);
                 for (i = 0; i < CEPH_CAP_BITS; i++)
                         if (flushing & (1 << i))
@@ -1464,7 +1467,7 @@ ack:
  
                 /* __send_cap drops i_lock */
                 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
-                                     retain, flushing);
+                                     retain, flushing, NULL);
                 goto retry; /* retake i_lock and restart our cap scan. */
         }
  
@@ -1522,7 +1525,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
  /*
   * Try to flush dirty caps back to the auth mds.
   */
-static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session)
+static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
+                         unsigned *flush_tid)
  {
         struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
         struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1558,7 +1562,8 @@ retry:
  
                 /* __send_cap drops i_lock */
                 __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
-                          cap->issued | cap->implemented, flushing);
+                          cap->issued | cap->implemented, flushing,
+                          flush_tid);
                 goto out_unlocked;
         }
  out:
@@ -1569,33 +1574,123 @@ out_unlocked:
         return flushing;
  }
  
-static int caps_are_clean(struct inode *inode)
+/*
+ * Return true if we've flushed caps through the given flush_tid, i.e. no
+ * cap bit is still marked flushing with a flush tid <= tid (under i_lock).
+ */
+static int caps_are_flushed(struct inode *inode, unsigned tid)
  {
-       int dirty;
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       int i, ret = 1;
+
         spin_lock(&inode->i_lock);
-       dirty = __ceph_caps_dirty(ceph_inode(inode));
+       for (i = 0; i < CEPH_CAP_BITS; i++)
+               if ((ci->i_flushing_caps & (1 << i)) &&
+                   ci->i_cap_flush_tid[i] <= tid) {
+                       /* this bit's flush (at or before tid) is unacked */
+                       ret = 0;
+                       break;
+               }
         spin_unlock(&inode->i_lock);
-       return !dirty;
+       return ret;
+}
+
+/*
+ * Wait on any unsafe writes queued on ci->i_unsafe_writes.  First wait on
+ * the newest request and make its tid the upper bound.  Then keep waiting
+ * on the oldest remaining request while its tid is below that bound.  The
+ * request is held (get/put) while i_unsafe_lock is dropped for the wait.
+ */
+static void sync_write_wait(struct inode *inode)
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       struct list_head *head = &ci->i_unsafe_writes;
+       struct ceph_osd_request *req;
+       u64 last_tid;
+
+       spin_lock(&ci->i_unsafe_lock);
+       if (list_empty(head))
+               goto out;
+
+       /* upper bound: tid of the _last_ (newest) entry in the chain */
+       req = list_entry(head->prev, struct ceph_osd_request,
+                        r_unsafe_item);
+       last_tid = req->r_tid;
+
+       do {
+               ceph_osdc_get_request(req);
+               spin_unlock(&ci->i_unsafe_lock);
+               dout("sync_write_wait on tid %llu (until %llu)\n",
+                    req->r_tid, last_tid);
+               wait_for_completion(&req->r_safe_completion);
+               spin_lock(&ci->i_unsafe_lock);
+               ceph_osdc_put_request(req);
+
+               /*
+                * from here on look at the first (oldest) entry in the
+                * chain; we only wait for requests older than last_tid
+                */
+               if (list_empty(head))
+                       break;
+               req = list_entry(head->next, struct ceph_osd_request,
+                                r_unsafe_item);
+       } while (req->r_tid < last_tid);
+out:
+       spin_unlock(&ci->i_unsafe_lock);
+}
+
+/* fsync: flush data and dirty caps; wait on non-file caps unless datasync */
+int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+       struct inode *inode = dentry->d_inode;
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       unsigned flush_tid = 0;  /* try_flush_caps() may not set it */
+       int ret, dirty;
+
+       dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
+       sync_write_wait(inode);
+
+       ret = filemap_write_and_wait(inode->i_mapping);
+       if (ret < 0)
+               return ret;
+
+       dirty = try_flush_caps(inode, NULL, &flush_tid);
+       dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
+
+       /*
+        * only wait on non-file metadata writeback (the mds
+        * can recover size and mtime, so we don't need to
+        * wait for that)
+        */
+       if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
+               dout("fsync waiting for flush_tid %u\n", flush_tid);
+               ret = wait_event_interruptible(ci->i_cap_wq,
+                                      caps_are_flushed(inode, flush_tid));
+       }
+
+       dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
+       return ret;
  }
  
  /*
   * Flush any dirty caps back to the mds.  If we aren't asked to wait,
   * queue inode for flush but don't do so immediately, because we can
- * get by with fewer MDS messages if we wait for e.g. data writeback
- * to complete first.
+ * get by with fewer MDS messages if we wait for data writeback to
+ * complete first.
   */
  int ceph_write_inode(struct inode *inode, int wait)
  {
         struct ceph_inode_info *ci = ceph_inode(inode);
+       unsigned flush_tid;
         int err = 0;
         int dirty;
  
         dout("write_inode %p wait=%d\n", inode, wait);
         if (wait) {
-               dirty = try_flush_caps(inode, NULL);
+               dirty = try_flush_caps(inode, NULL, &flush_tid);
                 if (dirty)
                         err = wait_event_interruptible(ci->i_cap_wq,
-                                                      caps_are_clean(inode));
+                                      caps_are_flushed(inode, flush_tid));
         } else {
                 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
  
@@ -1607,7 +1702,6 @@ int ceph_write_inode(struct inode *inode, int wait)
         return err;
  }
  
-
  /*
   * After a recovering MDS goes active, we need to resend any caps
   * we were flushing.
@@ -1653,7 +1747,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
                                    __ceph_caps_used(ci),
                                    __ceph_caps_wanted(ci),
                                    cap->issued | cap->implemented,
-                                  ci->i_flushing_caps);
+                                  ci->i_flushing_caps, NULL);
                 } else {
                         pr_err("ceph %p auth cap %p not mds%d ???\n", inode,
                                cap, session->s_mds);
@@ -2408,7 +2502,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
                      issued, wanted, seq, mseq, realmino,
                      ttl_ms, jiffies - ttl_ms/2, CEPH_CAP_FLAG_AUTH,
                      NULL /* no caps context */);
-       try_flush_caps(inode, session);
+       try_flush_caps(inode, session, NULL);
         up_read(&mdsc->snap_rwsem);
  }
  
diff --git a/src/kernel/file.c b/src/kernel/file.c

index 416fc82d3fb9864fab580b1dc0227081ef182ffa..8ebe6c68b1d65f77ba278cc07b33eee3c94f4363 100644 (file)
--- a/src/kernel/file.c
+++ b/src/kernel/file.c
@@ -477,50 +477,6 @@ static void sync_write_commit(struct ceph_osd_request *req)
         ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
  }
  
-/*
- * Wait on any unsafe replies for the given inode.  First wait on the
- * newest request, and make that the upper bound.  Then, if there are
- * more requests, keep waiting on the oldest as long as it is still older
- * than the original request.
- */
-static void sync_write_wait(struct inode *inode)
-{
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       struct list_head *head = &ci->i_unsafe_writes;
-       struct ceph_osd_request *req;
-       u64 last_tid;
-
-       spin_lock(&ci->i_unsafe_lock);
-       if (list_empty(head))
-               goto out;
-
-       /* set upper bound as _last_ entry in chain */
-       req = list_entry(head->prev, struct ceph_osd_request,
-                        r_unsafe_item);
-       last_tid = req->r_tid;
-
-       do {
-               ceph_osdc_get_request(req);
-               spin_unlock(&ci->i_unsafe_lock);
-               dout("sync_write_wait on tid %llu (until %llu)\n",
-                    req->r_tid, last_tid);
-               wait_for_completion(&req->r_safe_completion);
-               spin_lock(&ci->i_unsafe_lock);
-               ceph_osdc_put_request(req);
-
-               /*
-                * from here on look at first entry in chain, since we
-                * only want to wait for anything older than last_tid
-                */
-               if (list_empty(head))
-                       break;
-               req = list_entry(head->next, struct ceph_osd_request,
-                                r_unsafe_item);
-       } while (req->r_tid < last_tid);
-out:
-       spin_unlock(&ci->i_unsafe_lock);
-}
-
  /*
   * Synchronous write, straight from __user pointer or user pages (if
   * O_DIRECT).
@@ -776,27 +732,6 @@ out:
         return ret;
  }
  
-static int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
-{
-       struct inode *inode = dentry->d_inode;
-       int ret;
-
-       dout("fsync %p\n", inode);
-       sync_write_wait(inode);
-
-       ret = filemap_write_and_wait(inode->i_mapping);
-       if (ret < 0)
-               return ret;
-
-       /*
-        * Queue up the cap flush, but don't wait on it: the MDS can
-        * recover from the object size/mtimes.
-        */
-       ceph_write_inode(inode, 0);
-
-       return ret;
-}
-
  const struct file_operations ceph_file_fops = {
         .open = ceph_open,
         .release = ceph_release,
diff --git a/src/kernel/super.h b/src/kernel/super.h

index 63c049dea59309e8ebdfc08a7b21ac5b76e8a33b..c4591244cac70a77a65e7116d7b6aef4ac6d604d 100644 (file)
--- a/src/kernel/super.h
+++ b/src/kernel/super.h
@@ -860,6 +860,7 @@ static inline void ceph_remove_cap(struct ceph_cap *cap)
  
  extern void ceph_queue_caps_release(struct inode *inode);
  extern int ceph_write_inode(struct inode *inode, int unused);
+extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync);
  extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
                                     struct ceph_mds_session *session);
  extern int ceph_get_cap_mds(struct inode *inode);
author	Sage Weil <sage@newdream.net>
	Wed, 5 Aug 2009 18:38:32 +0000 (11:38 -0700)
committer	Sage Weil <sage@newdream.net>
	Wed, 5 Aug 2009 18:38:32 +0000 (11:38 -0700)
src/TODO		patch \| blob \| history
src/include/ceph_fs.h		patch \| blob \| history
src/kernel/caps.c		patch \| blob \| history
src/kernel/file.c		patch \| blob \| history
src/kernel/super.h		patch \| blob \| history