git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
todo: move bugs to tracker
author Sage Weil <sage@newdream.net>
Fri, 9 Apr 2010 21:03:47 +0000 (14:03 -0700)
committer Sage Weil <sage@newdream.net>
Fri, 9 Apr 2010 21:52:01 +0000 (14:52 -0700)
src/TODO

index 44cd2f5251f1ad9775c78697357ca4d2324251ab..29c3aba84ad03907f8b9ac8e53de50364f92b812 100644 (file)
--- a/src/TODO
+++ b/src/TODO
@@ -61,189 +61,6 @@ v0.21
 - rebuild mds hierarchy
 - kclient: retry alloc on ENOMEM when reading from connection?
 
-bugs
-- mds rejoin: invented dirfrags (MDCache.cc:3469) have version=0; subsequent modification of dentries/inodes predirty a bad (small) version #.
-- rm -r failure (on kernel tree)
-- dbench 1, restart mds (may take a few times), dbench will error out.
-
-- kclient: need to mark dirty pages on snap, so that they get redirtied under new snapc
-
-- kclient lockdep warning
-[ 1615.328733] =======================================================
-[ 1615.331050] [ INFO: possible circular locking dependency detected ]
-[ 1615.331050] 2.6.34-rc2 #22
-[ 1615.331050] -------------------------------------------------------
-[ 1615.331050] fixdep/3263 is trying to acquire lock:
-[ 1615.331050]  (&osdc->request_mutex){+.+...}, at: [<ffffffffa007b66c>] ceph_osdc_start_request+0x4d/0x278 [ceph]
-[ 1615.331050] 
-[ 1615.331050] but task is already holding lock:
-[ 1615.331050]  (&mm->mmap_sem){++++++}, at: [<ffffffff810208c0>] do_page_fault+0x104/0x278
-[ 1615.331050] 
-[ 1615.331050] which lock already depends on the new lock.
-[ 1615.331050] 
-[ 1615.331050] 
-[ 1615.331050] the existing dependency chain (in reverse order) is:
-[ 1615.331050] 
-[ 1615.331050] -> #3 (&mm->mmap_sem){++++++}:
-[ 1615.331050]        [<ffffffff81059fd3>] validate_chain+0xa4d/0xd28
-[ 1615.331050]        [<ffffffff8105aa7f>] __lock_acquire+0x7d1/0x84e
-[ 1615.331050]        [<ffffffff8105ab84>] lock_acquire+0x88/0xa5
-[ 1615.331050]        [<ffffffff81094daf>] might_fault+0x90/0xb3
-[ 1615.331050]        [<ffffffff81390d1e>] memcpy_fromiovecend+0x54/0x8e
-[ 1615.331050]        [<ffffffff813b6ea7>] ip_generic_getfrag+0x2a/0x8f
-[ 1615.331050]        [<ffffffff813b5da2>] ip_append_data+0x5f6/0x971
-[ 1615.331050]        [<ffffffff813d35bf>] udp_sendmsg+0x4e8/0x603
-[ 1615.331050]        [<ffffffff813d91e3>] inet_sendmsg+0x46/0x53
-[ 1615.331050]        [<ffffffff813878c1>] sock_sendmsg+0xd4/0xf5
-[ 1615.331050]        [<ffffffff81387e0f>] sys_sendto+0xdf/0x107
-[ 1615.331050]        [<ffffffff810029eb>] system_call_fastpath+0x16/0x1b
-[ 1615.331050] 
-[ 1615.331050] -> #2 (sk_lock-AF_INET){+.+.+.}:
-[ 1615.331050]        [<ffffffff81059fd3>] validate_chain+0xa4d/0xd28
-[ 1615.331050]        [<ffffffff8105aa7f>] __lock_acquire+0x7d1/0x84e
-[ 1615.331050]        [<ffffffff8105ab84>] lock_acquire+0x88/0xa5
-[ 1615.331050]        [<ffffffff8138a562>] lock_sock_nested+0xeb/0xff
-[ 1615.331050]        [<ffffffff813da29d>] inet_stream_connect+0x2b/0x25c
-[ 1615.331050]        [<ffffffffa006eea6>] try_write+0x26e/0x102c [ceph]
-[ 1615.331050]        [<ffffffffa00705ba>] con_work+0x126/0x6bc [ceph]
-[ 1615.529553]        [<ffffffff8104774e>] worker_thread+0x1e8/0x2fa
-[ 1615.529553]        [<ffffffff8104a4aa>] kthread+0x7d/0x85
-[ 1615.529553]        [<ffffffff81003794>] kernel_thread_helper+0x4/0x10
-[ 1615.529553] 
-[ 1615.529553] -> #1 (&con->mutex){+.+.+.}:
-[ 1615.529553]        [<ffffffff81059fd3>] validate_chain+0xa4d/0xd28
-[ 1615.529553]        [<ffffffff8105aa7f>] __lock_acquire+0x7d1/0x84e
-[ 1615.529553]        [<ffffffff8105ab84>] lock_acquire+0x88/0xa5
-[ 1615.529553]        [<ffffffff81425727>] mutex_lock_nested+0x62/0x32c
-[ 1615.529553]        [<ffffffffa0070cd3>] ceph_con_send+0xb3/0x244 [ceph]
-[ 1615.529553]        [<ffffffffa007b591>] __send_request+0x108/0x196 [ceph]
-[ 1615.529553]        [<ffffffffa007b794>] ceph_osdc_start_request+0x175/0x278 [ceph]
-[ 1615.529553]        [<ffffffffa006029d>] ceph_writepages_start+0xb23/0x112a [ceph]
-[ 1615.529553]        [<ffffffff810849aa>] do_writepages+0x1f/0x28
-[ 1615.529553]        [<ffffffff810ca5e8>] writeback_single_inode+0xb6/0x1f5
-[ 1615.529553]        [<ffffffff810cad9b>] writeback_inodes_wb+0x2d1/0x378
-[ 1615.529553]        [<ffffffff810cafa8>] wb_writeback+0x166/0x1e0
-[ 1615.529553]        [<ffffffff810cb154>] wb_do_writeback+0x83/0x1d3
-[ 1615.529553]        [<ffffffff810cb2d2>] bdi_writeback_task+0x2e/0x9b
-[ 1615.529553]        [<ffffffff8108fd73>] bdi_start_fn+0x71/0xd2
-[ 1615.529553]        [<ffffffff8104a4aa>] kthread+0x7d/0x85
-[ 1615.529553]        [<ffffffff81003794>] kernel_thread_helper+0x4/0x10
-[ 1615.529553] 
-[ 1615.529553] -> #0 (&osdc->request_mutex){+.+...}:
-[ 1615.529553]        [<ffffffff81059cbf>] validate_chain+0x739/0xd28
-[ 1615.529553]        [<ffffffff8105aa7f>] __lock_acquire+0x7d1/0x84e
-[ 1615.529553]        [<ffffffff8105ab84>] lock_acquire+0x88/0xa5
-[ 1615.529553]        [<ffffffff81425727>] mutex_lock_nested+0x62/0x32c
-[ 1615.529553]        [<ffffffffa007b66c>] ceph_osdc_start_request+0x4d/0x278 [ceph]
-[ 1615.529553]        [<ffffffffa007d8b6>] ceph_osdc_readpages+0x123/0x222 [ceph]
-[ 1615.529553]        [<ffffffffa005f4b7>] ceph_readpages+0x193/0x456 [ceph]
-[ 1615.529553]        [<ffffffff81085bd1>] __do_page_cache_readahead+0x17d/0x1f5
-[ 1615.529553]        [<ffffffff81085c65>] ra_submit+0x1c/0x20
-[ 1615.529553]        [<ffffffff81085fab>] ondemand_readahead+0x264/0x277
-[ 1615.529553]        [<ffffffff81086092>] page_cache_sync_readahead+0x33/0x35
-[ 1615.529553]        [<ffffffff8107f0d7>] filemap_fault+0x143/0x31f
-[ 1615.529553]        [<ffffffff810913bf>] __do_fault+0x50/0x415
-[ 1615.529553]        [<ffffffff810934d9>] handle_mm_fault+0x334/0x6a6
-[ 1615.529553]        [<ffffffff810209af>] do_page_fault+0x1f3/0x278
-[ 1615.529553]        [<ffffffff814281ff>] page_fault+0x1f/0x30
-[ 1615.529553] 
-[ 1615.529553] other info that might help us debug this:
-[ 1615.529553] 
-[ 1615.529553] 1 lock held by fixdep/3263:
-[ 1615.529553]  #0:  (&mm->mmap_sem){++++++}, at: [<ffffffff810208c0>] do_page_fault+0x104/0x278
-[ 1615.529553] 
-[ 1615.529553] stack backtrace:
-[ 1615.529553] Pid: 3263, comm: fixdep Not tainted 2.6.34-rc2 #22
-[ 1615.529553] Call Trace:
-[ 1615.529553]  [<ffffffff81058f49>] print_circular_bug+0xb3/0xc1
-[ 1615.529553]  [<ffffffff81059cbf>] validate_chain+0x739/0xd28
-[ 1615.529553]  [<ffffffff810099d7>] ? native_sched_clock+0x37/0x71
-[ 1615.824177]  [<ffffffff8105aa7f>] __lock_acquire+0x7d1/0x84e
-[ 1615.824177]  [<ffffffff8105ab84>] lock_acquire+0x88/0xa5
-[ 1615.824177]  [<ffffffffa007b66c>] ? ceph_osdc_start_request+0x4d/0x278 [ceph]
-[ 1615.824177]  [<ffffffffa007b66c>] ? ceph_osdc_start_request+0x4d/0x278 [ceph]
-[ 1615.824177]  [<ffffffff81425727>] mutex_lock_nested+0x62/0x32c
-[ 1615.824177]  [<ffffffffa007b66c>] ? ceph_osdc_start_request+0x4d/0x278 [ceph]
-[ 1615.824177]  [<ffffffffa007b66c>] ceph_osdc_start_request+0x4d/0x278 [ceph]
-[ 1615.824177]  [<ffffffffa007d8b6>] ceph_osdc_readpages+0x123/0x222 [ceph]
-[ 1615.824177]  [<ffffffffa005f4b7>] ceph_readpages+0x193/0x456 [ceph]
-[ 1615.824177]  [<ffffffff810099d7>] ? native_sched_clock+0x37/0x71
-[ 1615.824177]  [<ffffffff81056580>] ? get_lock_stats+0x19/0x4c
-[ 1615.824177]  [<ffffffff81085bd1>] __do_page_cache_readahead+0x17d/0x1f5
-[ 1615.824177]  [<ffffffff81085ad0>] ? __do_page_cache_readahead+0x7c/0x1f5
-[ 1615.824177]  [<ffffffff8107d848>] ? find_get_page+0xd9/0x12d
-[ 1615.824177]  [<ffffffff81085c65>] ra_submit+0x1c/0x20
-[ 1615.916887]  [<ffffffff81085fab>] ondemand_readahead+0x264/0x277
-[ 1615.916887]  [<ffffffff81086092>] page_cache_sync_readahead+0x33/0x35
-[ 1615.931403]  [<ffffffff8107f0d7>] filemap_fault+0x143/0x31f
-[ 1615.931403]  [<ffffffff810913bf>] __do_fault+0x50/0x415
-[ 1615.931403]  [<ffffffff8105aa99>] ? __lock_acquire+0x7eb/0x84e
-[ 1615.946963]  [<ffffffff810934d9>] handle_mm_fault+0x334/0x6a6
-[ 1615.946963]  [<ffffffff810209af>] do_page_fault+0x1f3/0x278
-[ 1615.946963]  [<ffffffff814281ff>] page_fault+0x1f/0x30
-
-- kclient: dcache bug
-[ 2793.947421] ------------[ cut here ]------------
-[ 2793.950108] kernel BUG at fs/dcache.c:1887!
-[ 2793.950108] invalid opcode: 0000 [#1] PREEMPT SMP 
-[ 2793.950108] last sysfs file: /sys/kernel/uevent_seqnum
-[ 2793.950108] CPU 0 
-[ 2793.950108] Modules linked in: aes_x86_64 aes_generic ceph fan ac battery container ehci_hcd uhci_hcd button thermal processor
-[ 2793.950108] 
-[ 2793.950108] Pid: 2818, comm: ceph-msgr/0 Not tainted 2.6.34-rc2 #23 PDSMi+/PDSMi
-[ 2793.950108] RIP: 0010:[<ffffffff810c0d50>]  [<ffffffff810c0d50>] d_materialise_unique+0x2f3/0x307
-[ 2793.950108] RSP: 0018:ffff88011a2b99a0  EFLAGS: 00010246
-[ 2793.950108] RAX: ffff88011a2b9fd8 RBX: ffff88004c976070 RCX: ffff88004c9760f0
-[ 2793.950108] RDX: 0000000000008f8e RSI: ffffffff81682340 RDI: ffff88011e3488c8
-[ 2793.950108] RBP: ffff88011a2b99d0 R08: 0000000000000002 R09: 0000000000000000
-[ 2793.950108] R10: 0000000000000000 R11: ffff88011cac4280 R12: ffff88004c9c6650
-[ 2793.950108] R13: ffff88001e7444d0 R14: ffff88011e3488c8 R15: ffff88001e7444d0
-[ 2793.950108] FS:  0000000000000000(0000) GS:ffff880002600000(0000) knlGS:0000000000000000
-[ 2793.950108] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
-[ 2793.950108] CR2: 00002b6b6acae004 CR3: 00000000df8c6000 CR4: 00000000000006f0
-[ 2793.950108] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
-[ 2793.950108] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
-[ 2793.950108] Process ceph-msgr/0 (pid: 2818, threadinfo ffff88011a2b8000, task ffff88011cac4280)
-[ 2793.950108] Stack:
-[ 2793.950108]  ffff88004c9c6650 ffff88004c9c6650 ffff88004c9c6650 fffffffffffffffe
-[ 2793.950108] <0> ffff88011e3488c8 ffff88001e7444d0 ffff88011a2b9a40 ffffffffa004ed84
-[ 2793.950108] <0> ffff88011e3488c8 ffff88001e7444d0 ffff88011a2b9a40 ffffffffa004fb3f
-[ 2793.950108] Call Trace:
-[ 2793.950108]  [<ffffffffa004ed84>] splice_dentry+0x89/0x21b [ceph]
-[ 2793.950108]  [<ffffffffa004fb3f>] ? ceph_get_inode+0x2e/0x12f [ceph]
-[ 2793.950108]  [<ffffffffa0050988>] ceph_fill_trace+0x3bd/0x9a0 [ceph]
-[ 2793.950108]  [<ffffffffa006309a>] ? __cleanup_empty_realms+0x61/0x6a [ceph]
-[ 2793.950108]  [<ffffffffa0070c07>] ? dispatch+0xb41/0x1461 [ceph]
-[ 2793.950108]  [<ffffffffa0070c69>] dispatch+0xba3/0x1461 [ceph]
-[ 2793.950108]  [<ffffffff810a754b>] ? poison_obj+0x27/0x32
-[ 2793.950108]  [<ffffffff81057f41>] ? trace_hardirqs_on+0xd/0xf
-[ 2793.950108]  [<ffffffffa0068568>] try_read+0xebf/0x158f [ceph]
-[ 2793.950108]  [<ffffffff81057f41>] ? trace_hardirqs_on+0xd/0xf
-[ 2793.950108]  [<ffffffff81424e17>] ? schedule+0x672/0x698
-[ 2793.950108]  [<ffffffffa006a5ae>] con_work+0x11a/0x6bc [ceph]
-[ 2793.950108]  [<ffffffff8104774e>] worker_thread+0x1e8/0x2fa
-[ 2793.950108]  [<ffffffff810476f5>] ? worker_thread+0x18f/0x2fa
-[ 2793.950108]  [<ffffffffa006a494>] ? con_work+0x0/0x6bc [ceph]
-[ 2793.950108]  [<ffffffff8104a7dc>] ? autoremove_wake_function+0x0/0x38
-[ 2793.950108]  [<ffffffff81047566>] ? worker_thread+0x0/0x2fa
-[ 2793.950108]  [<ffffffff8104a4aa>] kthread+0x7d/0x85
-[ 2793.950108]  [<ffffffff81003794>] kernel_thread_helper+0x4/0x10
-[ 2793.950108]  [<ffffffff81427fc0>] ? restore_args+0x0/0x30
-[ 2793.950108]  [<ffffffff8104a42d>] ? kthread+0x0/0x85
-[ 2793.950108]  [<ffffffff81003790>] ? kernel_thread_helper+0x0/0x10
-[ 2793.950108] Code: 39 eb 75 10 48 8b 75 d0 48 89 df 31 db e8 47 6f 12 00 eb 1b 48 8b 7d d0 e8 c0 06 00 00 eb 10 48 c7 c7 40 23 68 81 e8 61 6f 36 00 <0f> 0b eb fe 41 5e 48 89 d8 5b 41 5c 41 5d 41 5e 41 5f c9 c3 55 
-[ 2793.950108] RIP  [<ffffffff810c0d50>] d_materialise_unique+0x2f3/0x307
-[ 2793.950108]  RSP <ffff88011a2b99a0>
-[ 2794.275464] ---[ end trace e39f19630a44d9a2 ]---
-
-?- bonnie++ -u root -d /mnt/ceph/ -s 0 -n 1
-(03:35:29 PM) Isteriat: Using uid:0, gid:0.
-(03:35:29 PM) Isteriat: Create files in sequential order...done.
-(03:35:29 PM) Isteriat: Stat files in sequential order...Expected 1024 files but only got 0
-(03:35:29 PM) Isteriat: Cleaning up test directory after error.
-
-- osd pg split breaks if not all osds are up...
-
 
 filestore performance notes
 - write ordering options