From 72e286a6587ff91ebd01100802a889a9a3bed60a Mon Sep 17 00:00:00 2001
From: ethanwu
Date: Fri, 17 Feb 2023 16:12:00 +0800
Subject: [PATCH] os/bluestore/bluefs: fix dir_link might add link that already exists in compact log

After commit eac1807cf5f19dd79eb95bcb0cde80c67acb69f8
("os/bluestore/bluefs: Weaken locks in open_for_write"),
there is a race window between open_for_write and log compaction:

Process A                             Process B
open_for_write                        _compact_log_async_LD_LNF_D
                                      log.lock
node.lock                             ...
update nodes.dir_map(add dirlink A)   node.lock(wait for process A)
node.unlock                           ...
log.lock(wait for Process B)          ...
                                      compact log(create log based on nodes.dir_map which has dirlink A)
...                                   ...
...                                   ...
...                                   node.unlock()
...                                   log.unlock
log file create event(dirlink A)
log.unlock

After the above sequence, the bluefs log will contain something like this:

0x0: txn(seq 1 len 0x141ee crc 0x3e1c626f)
0x0: op_init
0x0: op_file_update file(ino 2524749 size 0x246b6 mtime 2023-02-08T03:07:19.950963+0800 allocated 30000 alloc_commit 30000 extents [1:0xa135e0000~30000])
0x0: op_file_update file(ino 2524746 size 0x175af mtime 2023-02-08T03:07:19.771584+0800 allocated 20000 alloc_commit 20000 extents [1:0xa13530000~20000])
...
0x0: op_dir_link db/2524749.sst to 2524751
0x0: op_dir_link db/2524750.sst to 2524752
0x0: op_dir_link db/CURRENT to 2491157
...
0x0: op_jump seq 18414993 offset 0x20000
0x20000: txn(seq 18414994 len 0x65 crc 0xc1f9ec5f)
0x20000: op_file_update file(ino 2524752 size 0x0 mtime 2023-02-08T03:07:20.205074+0800 allocated 0 alloc_commit 0 extents [])
0x20000: op_dir_link db/2524750.sst to 2524752

The entry "dir_link db/2524750.sst to 2524752" exists in both the compacted
log (txn seq 1) and log txn seq 18414994.
If no further log compaction happens afterwards, or an abnormal shutdown
occurs, the next bluefs mount replay will fail at the following assert:

2023-02-10T11:05:09.826+0800 7f1f97b71280 10 bluefs _replay 0x20000: txn(seq 18414994 len 0x65 crc 0xc1f9ec5f)
2023-02-10T11:05:09.826+0800 7f1f97b71280 20 bluefs _replay 0x20000: op_file_update file(ino 2524752 size 0x0 mtime 2023-02-08T03:07:20.205074+0800 allocated 0 alloc_commit 0 extents [])
2023-02-10T11:05:09.826+0800 7f1f97b71280 20 bluefs _replay 0x20000: op_dir_link db/2524750.sst to 2524752
2023-02-10T11:05:09.832+0800 7f1f97b71280 -1 //source/ceph/src/os/bluestore/BlueFS.cc: In function 'int BlueFS::_replay(bool, bool)' thread 7f1f97b71280 time 2023-02-10T11:05:09.827662+0800
//source/ceph/src/os/bluestore/BlueFS.cc: 1419: FAILED ceph_assert(r == q->second->file_map.end())

Following the pattern of other operations that update the node and add a log
entry at the same time (such as rename), fix this by taking the log lock and
the node lock together at the beginning of the function (following the lock
ordering, so the log lock is taken first), i.e.
N_LD -> LND Fixes: https://tracker.ceph.com/issues/56210 Signed-off-by: ethanwu (cherry picked from commit c55f737ba4959bb43e3f57c8247d8b9df67f42f2) --- src/os/bluestore/BlueFS.cc | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 64fbe1e88039f..e466b3c01db0c 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -3925,7 +3925,7 @@ int BlueFS::open_for_write( std::string_view dirname, std::string_view filename, FileWriter **h, - bool overwrite)/*_N_LD*/ + bool overwrite)/*_LND*/ { _maybe_check_vselector_LNF(); FileRef file; @@ -3933,7 +3933,8 @@ int BlueFS::open_for_write( bool truncate = false; mempool::bluefs::vector pending_release_extents; { - std::unique_lock nl(nodes.lock); + std::lock_guard ll(log.lock); + std::lock_guard nl(nodes.lock); dout(10) << __func__ << " " << dirname << "/" << filename << dendl; map::iterator p = nodes.dir_map.find(dirname); DirRef dir; @@ -3991,17 +3992,15 @@ int BlueFS::open_for_write( dout(20) << __func__ << " mapping " << dirname << "/" << filename << " vsel_hint " << file->vselector_hint << dendl; - } - { - std::lock_guard ll(log.lock); - log.t.op_file_update(file->fnode); - if (create) - log.t.op_dir_link(dirname, filename, file->fnode.ino); - std::lock_guard dl(dirty.lock); - for (auto& p : pending_release_extents) { - dirty.pending_release[p.bdev].insert(p.offset, p.length); - } + log.t.op_file_update(file->fnode); + if (create) + log.t.op_dir_link(dirname, filename, file->fnode.ino); + + std::lock_guard dl(dirty.lock); + for (auto& p : pending_release_extents) { + dirty.pending_release[p.bdev].insert(p.offset, p.length); + } } *h = _create_writer(file); -- 2.39.5