From c46435045659157c2516562099237b467d3d45b8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 21 Jul 2009 14:43:20 -0700 Subject: [PATCH] kclient: file.c cleanup --- src/kernel/file.c | 56 ++++++++++--------- src/kernel/import_patch_set_into_linux_git.sh | 18 ++++++ 2 files changed, 47 insertions(+), 27 deletions(-) diff --git a/src/kernel/file.c b/src/kernel/file.c index f9851690df3b5..fbf02c3a2c180 100644 --- a/src/kernel/file.c +++ b/src/kernel/file.c @@ -109,7 +109,8 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) * * If we already have the requisite capabilities, we can satisfy * the open request locally (no need to request new caps from the - * MDS). + * MDS). We do, however, need to inform the MDS (asynchronously) + * if our wanted caps set expands. */ int ceph_open(struct inode *inode, struct file *file) { @@ -150,9 +151,8 @@ int ceph_open(struct inode *inode, struct file *file) } /* - * We re-use existing caps only if already have an open file - * that also wants them. That is, our want for the caps is - * registered with the MDS. + * No need to block if we have any caps. Update wanted set + * asynchronously. */ spin_lock(&inode->i_lock); if (__ceph_is_any_real_caps(ci)) { @@ -388,12 +388,9 @@ static int copy_page_vector_to_user(struct page **pages, char __user *data, /* * Completely synchronous read and write methods. Direct from __user - * buffer to osd. + * buffer to osd, or directly to user pages (if O_DIRECT). * - * If read spans object boundary, just do multiple reads. - * - * FIXME: for a correct atomic read, we should take read locks on all - * objects. + * If the read spans object boundary, just do multiple reads. */ static ssize_t ceph_sync_read(struct file *file, char __user *data, unsigned left, loff_t *offset) @@ -525,11 +522,12 @@ out: } /* - * synchronous write. from userspace. + * Synchronous write, straight from __user pointer or user pages (if + * O_DIRECT). 
* - * FIXME: if write spans object boundary, just do two separate write. - * for a correct atomic write, we should take write locks on all - * objects, rollback on failure, etc. + * If write spans object boundary, just do multiple writes. (For a + * correct atomic write, we should e.g. take write locks on all + * objects, rollback on failure, etc.) */ static ssize_t ceph_sync_write(struct file *file, const char __user *data, size_t left, loff_t *offset) @@ -678,15 +676,16 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, ssize_t ret; int got = 0; - dout("aio_read %llx.%llx %llu~%u trying to get caps on %p\n", - ceph_vinop(inode), pos, (unsigned)len, inode); + dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", + inode, ceph_vinop(inode), pos, (unsigned)len, inode); __ceph_do_pending_vmtruncate(inode); ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, &got, -1); if (ret < 0) goto out; - dout("aio_read %llx.%llx %llu~%u got cap refs on %s\n", - ceph_vinop(inode), pos, (unsigned)len, ceph_cap_string(got)); + dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), pos, (unsigned)len, + ceph_cap_string(got)); if ((got & CEPH_CAP_FILE_CACHE) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || @@ -697,8 +696,8 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, ret = generic_file_aio_read(iocb, iov, nr_segs, pos); out: - dout("aio_read %llx.%llx dropping cap refs on %s\n", - ceph_vinop(inode), ceph_cap_string(got)); + dout("aio_read %p %llx.%llx dropping cap refs on %s\n", + inode, ceph_vinop(inode), ceph_cap_string(got)); ceph_put_cap_refs(ci, got); return ret; } @@ -732,15 +731,17 @@ retry_snap: if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) return -ENOSPC; __ceph_do_pending_vmtruncate(inode); - dout("aio_write %p %llu~%u getting caps. i_size %llu\n", - inode, pos, (unsigned)iov->iov_len, inode->i_size); + dout("aio_write %p %llx.%llx %llu~%u getting caps. 
i_size %llu\n", + inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, + inode->i_size); ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &got, endoff); if (ret < 0) goto out; - dout("aio_write %p %llu~%u got cap refs on %s\n", - inode, pos, (unsigned)iov->iov_len, ceph_cap_string(got)); + dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, + ceph_cap_string(got)); if ((got & CEPH_CAP_FILE_BUFFER) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || @@ -761,13 +762,14 @@ retry_snap: } out: - dout("aio_write %p %llu~%u dropping cap refs on %s\n", - inode, pos, (unsigned)iov->iov_len, ceph_cap_string(got)); + dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", + inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, + ceph_cap_string(got)); ceph_put_cap_refs(ci, got); if (ret == -EOLDSNAPC) { - dout("aio_write %p %llu~%u got EOLDSNAPC, retrying\n", - inode, pos, (unsigned)iov->iov_len); + dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", + inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); goto retry_snap; } diff --git a/src/kernel/import_patch_set_into_linux_git.sh b/src/kernel/import_patch_set_into_linux_git.sh index 99440028de3ce..666c113ea423b 100755 --- a/src/kernel/import_patch_set_into_linux_git.sh +++ b/src/kernel/import_patch_set_into_linux_git.sh @@ -101,6 +101,19 @@ However, if the MDS replies without a trace (e.g., when retrying an update after an MDS failure recovery), some operation-specific cleanup may be needed. +We can validate cached dentries in two ways. A per-dentry lease may +be issued by the MDS, or a per-directory cap may be issued that acts +as a lease on the entire directory. In the latter case, a 'gen' value +is used to determine which dentries belong to the currently leased +directory contents. + +We normally prepopulate the dcache and icache with readdir results. 
+This makes subsequent lookups and getattrs avoid any server +interaction. It also lets us satisfy readdir operations by peeking at +the dcache IFF we hold the per-directory cap/lease, previously +performed a readdir, and haven't dropped any of the resulting +dentries. + EOF git add $target/ceph/file.c @@ -113,6 +126,11 @@ performing IO on a file. We take references on held capabilities for the duration of the read/write to avoid prematurely releasing them back to the MDS. +We implement two main paths for read and write: one that is buffered +(and uses generic_file_aio_{read,write}), and one that is fully synchronous +and blocking (operating either on a __user pointer or, if O_DIRECT, +directly on user pages). + EOF git add $target/ceph/addr.c -- 2.39.5