]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
common/mutex_debug: fix arm64 SIGBUS/segfault due to data race 64273/head
authorKefu Chai <tchaikov@gmail.com>
Tue, 1 Jul 2025 06:29:41 +0000 (14:29 +0800)
committerKefu Chai <tchaikov@gmail.com>
Thu, 3 Jul 2025 04:22:14 +0000 (12:22 +0800)
The mutex_debugging_base::is_locked_by_me() member function was
triggering random SIGBUS and segfaults on ARM64 due to a data race on
the non-atomic locked_by member.

The race occurs when:
- Thread A writes to locked_by during lock acquisition
- Thread B reads from locked_by in is_locked_by_me()
- These accesses happen concurrently without synchronization

On ARM64, reads and writes of std::thread::id (8 bytes when using
libstdc++) are not atomic, so the reader can observe a partially
written or corrupted thread ID value, leading to undefined behavior
in the comparison operator.

Fix by making locked_by atomic and using proper memory ordering
in is_locked_by_me() to ensure synchronized access.

Fixes: https://tracker.ceph.com/issues/71547
Signed-off-by: Kefu Chai <tchaikov@gmail.com>
src/common/mutex_debug.h

index d56d0ebee9987ef38a69e36fdcb80048bc49c264..bc12fbc0f0ee5089aeba97a76194abfba9793fe5 100644 (file)
@@ -40,7 +40,7 @@ protected:
   bool backtrace; // gather backtrace on lock acquisition
 
   std::atomic<int> nlock = 0;
-  std::thread::id locked_by = {};
+  std::atomic<std::thread::id> locked_by = {};
 
   bool _enable_lockdep() const {
     return lockdep && g_lockdep;
@@ -58,8 +58,12 @@ public:
     return (nlock > 0);
   }
   bool is_locked_by_me() const {
-    return nlock.load(std::memory_order_acquire) > 0 && locked_by == std::this_thread::get_id();
+    if (nlock.load(std::memory_order_acquire) <= 0) {
+      return false;
+    }
+    return locked_by.load(std::memory_order_relaxed) == std::this_thread::get_id();
   }
+
   operator bool() const {
     return is_locked_by_me();
   }
@@ -150,21 +154,23 @@ public:
   }
 
   void _post_lock() {
-    if (!recursive)
-      ceph_assert(nlock == 0);
-    locked_by = std::this_thread::get_id();
+    if (!recursive) {
+      ceph_assert(nlock.load(std::memory_order_relaxed) == 0);
+    }
+    locked_by.store(std::this_thread::get_id(), std::memory_order_relaxed);
     nlock.fetch_add(1, std::memory_order_release);
   }
 
   void _pre_unlock() {
+    const int current_nlock = nlock.load(std::memory_order_relaxed);
     if (recursive) {
-      ceph_assert(nlock > 0);
+      ceph_assert(current_nlock > 0);
     } else {
-      ceph_assert(nlock == 1);
+      ceph_assert(current_nlock == 1);
     }
-    ceph_assert(locked_by == std::this_thread::get_id());
-    if (nlock == 1)
-      locked_by = std::thread::id();
+    ceph_assert(locked_by.load(std::memory_order_relaxed) == std::this_thread::get_id());
+    if (current_nlock == 1)
+      locked_by.store(std::thread::id(), std::memory_order_relaxed);
     nlock.fetch_sub(1, std::memory_order_release);
   }