From 07bdbfadb099c630a61422045e492e41f21816fb Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 4 Jun 2009 12:28:51 -0700 Subject: [PATCH] kclient: fix snap_rwsem vs s_mutex deadlock in ceph_inode_set_size Call ceph_check_caps outside of the snap_rwsem. Fixes this lockdep warning: [84919.839574] cp/2724 is trying to acquire lock: [84919.839574] (&s->s_mutex){--..}, at: [] ceph_check_caps+0x 968/0xc04 [ceph] [84919.839574] [84919.839574] but task is already holding lock: [84919.839574] (&mdsc->snap_rwsem){----}, at: [] ceph_write_b egin+0x1d9/0x696 [ceph] [84919.839574] [84919.839574] which lock already depends on the new lock. [84919.839574] [84919.839574] [84919.839574] the existing dependency chain (in reverse order) is: [84919.839574] [84919.839574] -> #1 (&mdsc->snap_rwsem){----}: [84919.839574] [] validate_chain+0x9ef/0xce8 [84919.839574] [] __lock_acquire+0x730/0x7b9 [84919.839574] [] lock_acquire+0x85/0xa9 [84919.839574] [] down_write+0x43/0x77 [84919.839574] [] ceph_mdsc_handle_reply+0x960/0xa72 [c eph] [84919.839574] [] ceph_dispatch+0x4eb/0x5f6 [ceph] [84919.839574] [] try_read+0x15cf/0x17da [ceph] [84919.839574] [] con_work+0x28f/0x18a9 [ceph] [84919.839574] [] run_workqueue+0xf5/0x209 [84919.839574] [] worker_thread+0xdb/0xe8 [84919.839574] [] kthread+0x49/0x79 [84919.839574] [] child_rip+0xa/0x20 [84919.839574] [] 0xffffffffffffffff [84919.839574] [84919.839574] -> #0 (&s->s_mutex){--..}: [84919.839574] [] validate_chain+0x681/0xce8 [84919.839574] [] __lock_acquire+0x730/0x7b9 [84919.839574] [] lock_acquire+0x85/0xa9 [84919.839574] [] mutex_lock_nested+0x116/0x2f4 [84919.839574] [] ceph_check_caps+0x968/0xc04 [ceph] [84919.839574] [] ceph_inode_set_size+0xe1/0xf4 [ceph] [84919.839574] [] ceph_write_end+0x16f/0x1b4 [ceph] [84919.839574] [] generic_file_buffered_write+0x1a2/0x2 d4 [84919.839574] [] __generic_file_aio_write_nolock+0x354 /0x3be [84919.839574] [] generic_file_aio_write+0x66/0xc2 [84919.839574] [] ceph_aio_write+0x969/0xb52 [ceph] [84919.839574] [] do_sync_write+0xe2/0x126 [84919.839574] [] vfs_write+0xae/0x137 [84919.839574] [] sys_write+0x47/0x6f [84919.839574] [] system_call_fastpath+0x16/0x1b [84919.839574] [] 0xffffffffffffffff [84919.839574] [84919.839574] other info that might help us debug this: [84919.839574] [84919.839574] 2 locks held by cp/2724: [84919.839574] #0: (&sb->s_type->i_mutex_key#12){--..}, at: [] generic_file_aio_write+0x4f/0xc2 [84919.839574] #1: (&mdsc->snap_rwsem){----}, at: [] ceph_wr ite_begin+0x1d9/0x696 [ceph] [84919.839574] [84919.839574] stack backtrace: [84919.839574] Pid: 2724, comm: cp Not tainted 2.6.29 #16 [84919.839574] Call Trace: [84919.839574] [] print_circular_bug_tail+0xc7/0xd2 [84919.839574] [] validate_chain+0x681/0xce8 [84919.839574] [] ? get_page_from_freelist+0x3c6/0x6b5 [84919.839574] [] __lock_acquire+0x730/0x7b9 [84919.839574] [] lock_acquire+0x85/0xa9 [84919.839574] [] ? ceph_check_caps+0x968/0xc04 [ceph] [84919.839574] [] mutex_lock_nested+0x116/0x2f4 [84919.839574] [] ? ceph_check_caps+0x968/0xc04 [ceph] [84919.839574] [] ? ceph_check_caps+0x968/0xc04 [ceph] [84919.839574] [] ceph_check_caps+0x968/0xc04 [ceph] [84919.839574] [] ? get_lock_stats+0x34/0x5e [84919.839574] [] ceph_inode_set_size+0xe1/0xf4 [ceph] [84919.839574] [] ceph_write_end+0x16f/0x1b4 [ceph] [84919.839574] [] generic_file_buffered_write+0x1a2/0x2d4 [84919.839574] [] __generic_file_aio_write_nolock+0x354/0x3be [84919.839574] [] generic_file_aio_write+0x66/0xc2 [84919.839574] [] ceph_aio_write+0x969/0xb52 [ceph] [84919.839574] [] ? touch_atime+0xee/0x110 [84919.839574] [] ? nfs_file_read+0x107/0x116 [84919.839574] [] do_sync_write+0xe2/0x126 [84919.839574] [] ? __lock_acquire+0x74e/0x7b9 [84919.839574] [] ? put_lock_stats+0xe/0x27 [84919.839574] [] ? autoremove_wake_function+0x0/0x38 [84919.839574] [] ? dnotify_parent+0x6c/0x74 [84919.839574] [] ? security_file_permission+0x11/0x13 [84919.839574] [] vfs_write+0xae/0x137 [84919.839574] [] sys_write+0x47/0x6f [84919.839574] [] system_call_fastpath+0x16/0x1b --- src/kernel/addr.c | 6 +++++- src/kernel/file.c | 6 +++++- src/kernel/inode.c | 14 +++++++------- src/kernel/super.h | 2 +- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/kernel/addr.c b/src/kernel/addr.c index 326125f994651..4ee5d75fceffc 100644 --- a/src/kernel/addr.c +++ b/src/kernel/addr.c @@ -996,6 +996,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, struct inode *inode = file->f_dentry->d_inode; struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; unsigned from = pos & (PAGE_CACHE_SIZE - 1); + int check_cap = 0; dout(10, "write_end file %p inode %p page %p %d~%d (%d)\n", file, inode, page, (int)pos, (int)copied, (int)len); @@ -1012,7 +1013,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, /* did file size increase? */ /* (no need for i_size_read(); we caller holds i_mutex */ if (pos+copied > inode->i_size) - ceph_inode_set_size(inode, pos+copied); + check_cap = ceph_inode_set_size(inode, pos+copied); if (!PageUptodate(page)) SetPageUptodate(page); @@ -1023,6 +1024,9 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, up_read(&mdsc->snap_rwsem); page_cache_release(page); + if (check_cap) + ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); + return copied; } diff --git a/src/kernel/file.c b/src/kernel/file.c index 8d21e3a103190..46a708e549edb 100644 --- a/src/kernel/file.c +++ b/src/kernel/file.c @@ -526,6 +526,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, int written = 0; int flags; int do_sync = 0; + int check_caps = 0; int ret; struct timespec mtime = CURRENT_TIME; @@ -632,7 +633,10 @@ out: ret = written; *offset = pos; if (pos > i_size_read(inode)) - ceph_inode_set_size(inode, pos); + check_caps = ceph_inode_set_size(inode, pos); + if (check_caps) + ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, + NULL); } return ret; } diff --git a/src/kernel/inode.c b/src/kernel/inode.c index 92f226fb8729f..1179f2ead4d87 100644 --- a/src/kernel/inode.c +++ b/src/kernel/inode.c @@ -1143,9 +1143,10 @@ out: return err; } -void ceph_inode_set_size(struct inode *inode, loff_t size) +int ceph_inode_set_size(struct inode *inode, loff_t size) { struct ceph_inode_info *ci = ceph_inode(inode); + int ret = 0; spin_lock(&inode->i_lock); dout(30, "set_size %p %llu -> %llu\n", inode, inode->i_size, size); @@ -1154,12 +1155,11 @@ void ceph_inode_set_size(struct inode *inode, loff_t size) /* tell the MDS if we are approaching max_size */ if ((size << 1) >= ci->i_max_size && - (ci->i_reported_size << 1) < ci->i_max_size) { - spin_unlock(&inode->i_lock); - ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); - } else { - spin_unlock(&inode->i_lock); - } + (ci->i_reported_size << 1) < ci->i_max_size) + ret = 1; + + spin_unlock(&inode->i_lock); + return ret; } /* diff --git a/src/kernel/super.h b/src/kernel/super.h index 8b12b359b88f9..9bd3a7970c2b5 100644 --- a/src/kernel/super.h +++ b/src/kernel/super.h @@ -799,7 +799,7 @@ extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, extern int ceph_inode_holds_cap(struct inode *inode, int mask); -extern void ceph_inode_set_size(struct inode *inode, loff_t size); +extern int ceph_inode_set_size(struct inode *inode, loff_t size); extern void ceph_inode_writeback(struct work_struct *work); extern void ceph_vmtruncate_work(struct work_struct *work); extern void __ceph_do_pending_vmtruncate(struct inode *inode); -- 2.39.5