From 4225176e973decad76075ebfdcd0602951177696 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 4 Jun 2009 16:55:37 -0700 Subject: [PATCH] kclient: independently track size of address space We do vmtruncate asynchronously, but must update i_size immediately. This is partly to avoid dropping dirty snapped pages. But we can't use i_size for writepages because that will reflect the truncated size, not the size of the pre-trunc snapped state. --- src/kernel/addr.c | 20 +++++++++++++------- src/kernel/inode.c | 32 +++++++++++++++++++++++++++++--- src/kernel/super.h | 1 + 3 files changed, 43 insertions(+), 10 deletions(-) diff --git a/src/kernel/addr.c b/src/kernel/addr.c index 4ee5d75fceffc..07265f36cd7b7 100644 --- a/src/kernel/addr.c +++ b/src/kernel/addr.c @@ -403,9 +403,11 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) osdc = &ceph_inode_to_client(inode)->osdc; /* is this a partial page at end of file? */ - i_size = i_size_read(inode); + spin_lock(&inode->i_lock); + i_size = ci->i_as_size; if (i_size < page_off + len) len = i_size - page_off; + spin_unlock(&inode->i_lock); dout(10, "writepage %p page %p index %lu on %llu~%u\n", inode, page, page->index, page_off, len); @@ -641,9 +643,9 @@ retry: int pvec_pages, locked_pages; struct page *page; int want; - u64 offset, len; + u64 offset, len, as_size; struct ceph_osd_request_head *reqhead; - struct ceph_osd_op *op; + struct ceph_osd_op *op; next = 0; locked_pages = 0; @@ -695,9 +697,11 @@ get_more_pages: dout(20, "waiting on writeback %p\n", page); wait_on_page_writeback(page); } - if (page_offset(page) >= i_size_read(inode)) { - dout(20, "%p past eof %llu\n", page, - i_size_read(inode)); + spin_lock(&inode->i_lock); + as_size = ci->i_as_size; + spin_unlock(&inode->i_lock); + if (page_offset(page) >= as_size) { + dout(20, "%p > as_size %llu\n", page, as_size); done = 1; unlock_page(page); break; @@ -789,8 +793,10 @@ get_more_pages: /* submit the write */ offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT; - len = min(i_size_read(inode) - offset, + spin_lock(&inode->i_lock); + len = min(ci->i_as_size - offset, (u64)locked_pages << PAGE_CACHE_SHIFT); + spin_unlock(&inode->i_lock); dout(10, "writepages got %d pages at %llu~%llu\n", locked_pages, offset, len); diff --git a/src/kernel/inode.c b/src/kernel/inode.c index 3ed15068218c0..8c2c43d6f4208 100644 --- a/src/kernel/inode.c +++ b/src/kernel/inode.c @@ -288,6 +288,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) for (i = 0; i < CEPH_FILE_MODE_NUM; i++) ci->i_nr_by_mode[i] = 0; + ci->i_as_size = 0; ci->i_truncate_seq = 0; ci->i_truncate_size = 0; ci->i_truncate_pending = 0; @@ -364,6 +365,8 @@ int ceph_fill_file_size(struct inode *inode, int issued, inode->i_size = size; inode->i_blocks = (size + (1<<9) - 1) >> 9; ci->i_reported_size = size; + if (ci->i_as_size < size) + ci->i_as_size = size; if (truncate_seq != ci->i_truncate_seq) { dout(10, "truncate_seq %u -> %u\n", ci->i_truncate_seq, truncate_seq); @@ -1153,6 +1156,8 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) dout(30, "set_size %p %llu -> %llu\n", inode, inode->i_size, size); inode->i_size = size; inode->i_blocks = (size + (1 << 9) - 1) >> 9; + if (ci->i_as_size < size) + ci->i_as_size = size; /* tell the MDS if we are approaching max_size */ if ((size << 1) >= ci->i_max_size && @@ -1256,12 +1261,27 @@ void __ceph_do_pending_vmtruncate(struct inode *inode) u64 to; int wrbuffer_refs, wake = 0; +retry: spin_lock(&inode->i_lock); if (ci->i_truncate_pending == 0) { dout(10, "__do_pending_vmtruncate %p none pending\n", inode); spin_unlock(&inode->i_lock); return; } + + /* + * make sure any dirty snapped pages are flushed before we + * possibly truncate them.. so write AND block! + */ + if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { + dout(10, "__do_pending_vmtruncate %p flushing snaps first\n", + inode); + spin_unlock(&inode->i_lock); + filemap_write_and_wait_range(&inode->i_data, 0, + CEPH_FILE_MAX_SIZE); + goto retry; + } + to = ci->i_truncate_size; wrbuffer_refs = ci->i_wrbuffer_ref; dout(10, "__do_pending_vmtruncate %p (%d) to %lld\n", inode, @@ -1271,6 +1291,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode) truncate_inode_pages(inode->i_mapping, to); spin_lock(&inode->i_lock); + ci->i_as_size = to; ci->i_truncate_pending--; if (ci->i_truncate_pending == 0) wake = 1; @@ -1282,6 +1303,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode) wake_up(&ci->i_cap_wq); } + /* * symlinks */ @@ -1425,10 +1447,14 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) } if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size > inode->i_size) { - ci->i_truncate_size = attr->ia_size; - ci->i_truncate_pending++; - queue_trunc = 1; inode->i_size = attr->ia_size; + if (ci->i_as_size < attr->ia_size) { + ci->i_as_size = attr->ia_size; + } else { + ci->i_truncate_size = attr->ia_size; + ci->i_truncate_pending++; + queue_trunc = 1; + } inode->i_blocks = (attr->ia_size + (1 << 9) - 1) >> 9; inode->i_ctime = attr->ia_ctime; diff --git a/src/kernel/super.h b/src/kernel/super.h index 9bd3a7970c2b5..3921a6e6931a8 100644 --- a/src/kernel/super.h +++ b/src/kernel/super.h @@ -320,6 +320,7 @@ struct ceph_inode_info { int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ + loff_t i_as_size; /* address space size (pre-truncation) */ u32 i_truncate_seq; /* last truncate to smaller size */ u64 i_truncate_size; /* and the size we last truncated down to */ int i_truncate_pending; /* still need to call vmtruncate */ -- 2.39.5