From cd16f6b2065b92c5f2046166d0491f57d88bde7a Mon Sep 17 00:00:00 2001 From: Radoslaw Zarzynski Date: Tue, 4 May 2021 18:52:49 +0000 Subject: [PATCH] global: fault handlers cope with simultaneous faults now. This fix deals with the problem that arose due to installing the fault handlers with the `SA_RESETHAND` flag which instructs the kernel to restore the default handler for a signal upon entry to its handler. Unfortunately, in a situation when more than one fault happens at the same time (which may happen if e.g. two `tp_osd_tp` threads run into the same, buggy path), the default handler can interrupt-and-exit-the-process while our original one is still executing. The problem can be demonstrated with ``` rados bench -p test-pool 1 write -b 4096 --no-cleanup ``` and the following instrumentation in `ceph-osd`: ```diff diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 626e8ccefbd..cde46776d53 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -6617,6 +6617,7 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector& ops) ++ctx->num_write; result = 0; { // write + *((int*)((int)ceph_gettid() % 0x42)) = 0xdeadbeef; __u32 seq = oi.truncate_seq; tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq); if (op.extent.length != osd_op.indata.length()) { ``` Fixes: https://tracker.ceph.com/issues/50647 Signed-off-by: Radoslaw Zarzynski --- src/global/signal_handler.cc | 40 +++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/src/global/signal_handler.cc b/src/global/signal_handler.cc index 0447f96e1b1..a015ee9f31f 100644 --- a/src/global/signal_handler.cc +++ b/src/global/signal_handler.cc @@ -84,6 +84,7 @@ void sighup_handler(int signum) static void reraise_fatal(int signum) { // Use default handler to dump core + signal(signum, SIG_DFL); int ret = raise(signum); // Normally, we 
won't get here. If we do, something is very weird. @@ -144,11 +145,26 @@ static int parse_from_os_release( return 0; } -static void handle_fatal_signal(int signum) +static void handle_oneshot_fatal_signal(int signum) { - // This code may itself trigger a SIGSEGV if the heap is corrupt. In that - // case, SA_RESETHAND specifies that the default signal handler-- - // presumably dump core-- will handle it. + constexpr static pid_t NULL_TID{0}; + static std::atomic handler_tid{NULL_TID}; + if (auto expected{NULL_TID}; + !handler_tid.compare_exchange_strong(expected, ceph_gettid())) { + if (expected == ceph_gettid()) { + // The handler code may itself trigger a SIGSEGV if the heap is corrupt. + // In that case, SIG_DFL followed by return specifies that the default + // signal handler -- presumably dump core -- will handle it. + signal(signum, SIG_DFL); + } else { + // Huh, another thread got into troubles while we are handling the fault. + // If this is i.e. SIGSEGV handler, returning means retrying the faulty + // instruction one more time, and thus all those extra threads will run + // into a busy-wait basically. + } + return; + } + char buf[1024]; char pthread_name[16] = {0}; //limited by 16B include terminating null byte. 
int r = ceph_pthread_getname(pthread_self(), pthread_name, sizeof(pthread_name)); @@ -335,14 +351,14 @@ static void handle_fatal_signal(int signum) void install_standard_sighandlers(void) { - install_sighandler(SIGSEGV, handle_fatal_signal, SA_RESETHAND | SA_NODEFER); - install_sighandler(SIGABRT, handle_fatal_signal, SA_RESETHAND | SA_NODEFER); - install_sighandler(SIGBUS, handle_fatal_signal, SA_RESETHAND | SA_NODEFER); - install_sighandler(SIGILL, handle_fatal_signal, SA_RESETHAND | SA_NODEFER); - install_sighandler(SIGFPE, handle_fatal_signal, SA_RESETHAND | SA_NODEFER); - install_sighandler(SIGXCPU, handle_fatal_signal, SA_RESETHAND | SA_NODEFER); - install_sighandler(SIGXFSZ, handle_fatal_signal, SA_RESETHAND | SA_NODEFER); - install_sighandler(SIGSYS, handle_fatal_signal, SA_RESETHAND | SA_NODEFER); + install_sighandler(SIGSEGV, handle_oneshot_fatal_signal, SA_NODEFER); + install_sighandler(SIGABRT, handle_oneshot_fatal_signal, SA_NODEFER); + install_sighandler(SIGBUS, handle_oneshot_fatal_signal, SA_NODEFER); + install_sighandler(SIGILL, handle_oneshot_fatal_signal, SA_NODEFER); + install_sighandler(SIGFPE, handle_oneshot_fatal_signal, SA_NODEFER); + install_sighandler(SIGXCPU, handle_oneshot_fatal_signal, SA_NODEFER); + install_sighandler(SIGXFSZ, handle_oneshot_fatal_signal, SA_NODEFER); + install_sighandler(SIGSYS, handle_oneshot_fatal_signal, SA_NODEFER); } -- 2.39.5