From 8bcf327c64f437cc893ddcea4c0c405adb4dc027 Mon Sep 17 00:00:00 2001
From: Kefu Chai
Date: Tue, 1 Jul 2025 14:29:41 +0800
Subject: [PATCH] common/mutex_debug: fix arm64 SIGBUS/segfault due to data
 race

The mutex_debugging_base::is_locked_by_me() member function suffered
random SIGBUS and segfault crashes on ARM64 due to a data race on the
non-atomic locked_by member. The race occurs when:

- Thread A writes to locked_by during lock acquisition
- Thread B reads from locked_by in is_locked_by_me()
- These accesses happen concurrently without synchronization

On ARM64, writes to and reads from std::thread::id (8 bytes when using
libstdc++) are not atomic, so the reader can observe a partially
written or corrupted thread ID value, leading to undefined behavior in
the comparison operator.

Fix this by making locked_by atomic and using proper memory ordering
in is_locked_by_me() to ensure synchronized access.

Fixes: https://tracker.ceph.com/issues/71547
Signed-off-by: Kefu Chai
---
 src/common/mutex_debug.h | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/src/common/mutex_debug.h b/src/common/mutex_debug.h
index d56d0ebee99..bc12fbc0f0e 100644
--- a/src/common/mutex_debug.h
+++ b/src/common/mutex_debug.h
@@ -40,7 +40,7 @@ protected:
   bool backtrace; // gather backtrace on lock acquisition
 
   std::atomic<int> nlock = 0;
-  std::thread::id locked_by = {};
+  std::atomic<std::thread::id> locked_by = {};
 
   bool _enable_lockdep() const {
     return lockdep && g_lockdep;
@@ -58,8 +58,12 @@ public:
     return (nlock > 0);
   }
   bool is_locked_by_me() const {
-    return nlock.load(std::memory_order_acquire) > 0 && locked_by == std::this_thread::get_id();
+    if (nlock.load(std::memory_order_acquire) <= 0) {
+      return false;
+    }
+    return locked_by.load(std::memory_order_relaxed) == std::this_thread::get_id();
   }
+
   operator bool() const {
     return is_locked_by_me();
   }
@@ -150,21 +154,23 @@ public:
   }
 
   void _post_lock() {
-    if (!recursive)
-      ceph_assert(nlock == 0);
-    locked_by = std::this_thread::get_id();
+    if (!recursive) {
+      ceph_assert(nlock.load(std::memory_order_relaxed) == 0);
+    }
+    locked_by.store(std::this_thread::get_id(), std::memory_order_relaxed);
     nlock.fetch_add(1, std::memory_order_release);
   }
 
   void _pre_unlock() {
+    const int current_nlock = nlock.load(std::memory_order_relaxed);
     if (recursive) {
-      ceph_assert(nlock > 0);
+      ceph_assert(current_nlock > 0);
     } else {
-      ceph_assert(nlock == 1);
+      ceph_assert(current_nlock == 1);
    }
-    ceph_assert(locked_by == std::this_thread::get_id());
-    if (nlock == 1)
-      locked_by = std::thread::id();
+    ceph_assert(locked_by.load(std::memory_order_relaxed) == std::this_thread::get_id());
+    if (current_nlock == 1)
+      locked_by.store(std::thread::id(), std::memory_order_relaxed);
     nlock.fetch_sub(1, std::memory_order_release);
   }
 
-- 
2.39.5
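
A note for reviewers, not part of the patch: the standalone sketch below
models the pattern the fix adopts, so the memory-ordering argument can be
checked in isolation. The class name DebugMutexModel and its lock(),
unlock(), and is_locked_by_me() members are hypothetical stand-ins for
mutex_debugging_base, and the sketch assumes C++11 or later with a
libstdc++-style trivially copyable std::thread::id.

// Minimal model of the fixed ownership check. The release store on
// nlock pairs with the acquire load in is_locked_by_me(); a reader
// that observes the owner's increment therefore also observes the
// preceding store to locked_by, so the locked_by load can be relaxed.
// The atomic wrapper is what rules out the torn 8-byte reads seen on
// ARM64 with the plain std::thread::id member.
#include <atomic>
#include <cassert>
#include <thread>

class DebugMutexModel {
  std::atomic<int> nlock{0};
  std::atomic<std::thread::id> locked_by{};

public:
  void lock() {
    // Record the owner before publishing the lock count, mirroring
    // _post_lock() in the patch.
    locked_by.store(std::this_thread::get_id(), std::memory_order_relaxed);
    nlock.fetch_add(1, std::memory_order_release);
  }

  void unlock() {
    // Clear the owner before dropping the count, mirroring _pre_unlock().
    locked_by.store(std::thread::id{}, std::memory_order_relaxed);
    nlock.fetch_sub(1, std::memory_order_release);
  }

  bool is_locked_by_me() const {
    // Acquire here synchronizes with the release in lock(); the atomic
    // locked_by can never be read half-written, unlike the old plain
    // std::thread::id member.
    if (nlock.load(std::memory_order_acquire) <= 0)
      return false;
    return locked_by.load(std::memory_order_relaxed) ==
           std::this_thread::get_id();
  }
};

int main() {
  DebugMutexModel m;
  m.lock();
  assert(m.is_locked_by_me());
  // Another thread sees the lock as held, but not by itself.
  std::thread other([&] { assert(!m.is_locked_by_me()); });
  other.join();
  m.unlock();
  assert(!m.is_locked_by_me());
}

As in the patch, a relaxed load of locked_by suffices only because the
acquire/release pair on nlock orders it; making the check two separate
loads also means a concurrent owner change can still be observed as
"not me", which is acceptable for a debug-only assertion helper.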