From 9b7c026b93946350775ada609db140a8fdccd32f Mon Sep 17 00:00:00 2001 From: Radoslaw Zarzynski Date: Thu, 6 May 2021 17:21:28 +0000 Subject: [PATCH] crimson/alienstore: block SIGHUP to coexist with Seastar's signal handling. In `crimson/osd/main.cc` we instruct Seastar to handle `SIGHUP`. ``` // just ignore SIGHUP, we don't reread settings seastar::engine().handle_signal(SIGHUP, [] {}) ``` This happens using the Seastar's signal handling infrastructure which is incompliant with the alien world. ``` void reactor::signals::handle_signal(int signo, noncopyable_function&& handler) { // ... struct sigaction sa; sa.sa_sigaction = [](int sig, siginfo_t *info, void *p) { engine()._backend->signal_received(sig, info, p); }; // ... } ``` ``` extern __thread reactor* local_engine; extern __thread size_t task_quota; inline reactor& engine() { return *local_engine; } ``` The low-level signal handler above assumes `local_engine._backend` is not null which stays true only for threads from the S*'s world. Unfortunately, as we don't block the `SIGHUP` for alien threads, kernel is perfectly authorized to pick up one them to run the handler leading to weirdly-looking segfaults like this one: ``` INFO 2021-04-23 07:06:57,807 [shard 0] bluestore - stat DEBUG 2021-04-23 07:06:58,753 [shard 0] ms - [osd.1(client) v2:172.21.15.100:6802/30478@51064 >> mgr.4105 v2:172.21.15.109:6800/29891] --> #7 === pg_stats(0 pgs seq 55834574872 v 0) v2 (87) ... INFO 2021-04-23 07:06:58,813 [shard 0] bluestore - stat DEBUG 2021-04-23 07:06:59,753 [shard 0] osd - AdminSocket::handle_client: incoming asok string: {"prefix": "get_command_descriptions"} INFO 2021-04-23 07:06:59,753 [shard 0] osd - asok response length: 2947 INFO 2021-04-23 07:06:59,817 [shard 0] bluestore - stat DEBUG 2021-04-23 07:06:59,865 [shard 0] osd - AdminSocket::handle_client: incoming asok string: {"prefix": "get_command_descriptions"} INFO 2021-04-23 07:06:59,866 [shard 0] osd - asok response length: 2947 DEBUG 2021-04-23 07:07:00,020 [shard 0] osd - AdminSocket::handle_client: incoming asok string: {"prefix": "get_command_descriptions"} INFO 2021-04-23 07:07:00,020 [shard 0] osd - asok response length: 2947 INFO 2021-04-23 07:07:00,820 [shard 0] bluestore - stat ... Backtrace: 0# 0x00005600CD0D6AAF in ceph-osd 1# FatalSignal::signaled(int) in ceph-osd 2# FatalSignal::install_oneshot_signal_handler<11>()::{lambda(int, siginfo_t*, void*)#1}::_FUN(int, siginfo_t*, void*) in ceph-osd 3# 0x00007F5877C7EB20 in /lib64/libpthread.so.0 4# 0x00005600CD830B81 in ceph-osd 5# 0x00007F5877C7EB20 in /lib64/libpthread.so.0 6# pthread_cond_timedwait in /lib64/libpthread.so.0 7# crimson::os::ThreadPool::loop(std::chrono::duration >, unsigned long) in ceph-osd 8# 0x00007F5877999BA3 in /lib64/libstdc++.so.6 9# 0x00007F5877C7414A in /lib64/libpthread.so.0 10# clone in /lib64/libc.so.6 daemon-helper: command crashed with signal 11 ``` Ultimately, it turned out the thread came out from a syscall (`futex`) and started crunching the `SIGHUP` handler's code in which a nullptr dereference happened. This patch blocks `SIGHUP` for all threads spawned by `AlienStore`. Signed-off-by: Radoslaw Zarzynski --- src/crimson/os/alienstore/thread_pool.cc | 14 ++++++++++++++ src/crimson/os/alienstore/thread_pool.h | 1 + src/crimson/osd/main.cc | 3 ++- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/crimson/os/alienstore/thread_pool.cc b/src/crimson/os/alienstore/thread_pool.cc index 4b81dabd802..747d6714e36 100644 --- a/src/crimson/os/alienstore/thread_pool.cc +++ b/src/crimson/os/alienstore/thread_pool.cc @@ -26,6 +26,7 @@ ThreadPool::ThreadPool(size_t n_threads, if (!cpus.empty()) { pin(cpus); } + block_sighup(); (void) pthread_setname_np(pthread_self(), "alien-store-tp"); loop(queue_max_wait, i); }); @@ -51,6 +52,19 @@ void ThreadPool::pin(const std::vector& cpus) ceph_assert(r == 0); } +void ThreadPool::block_sighup() +{ + sigset_t sigs; + sigemptyset(&sigs); + // alien threads must ignore the SIGHUP. It's necessary as in + // `crimson/osd/main.cc` we set a handler using the Seastar's + // signal handling infrastrucute which assumes the `_backend` + // of `seastar::engine()` is not null. Grep `reactor.cc` for + // `sigaction` or just visit `reactor::signals::handle_signal()`. + sigaddset(&sigs, SIGHUP); + pthread_sigmask(SIG_BLOCK, &sigs, nullptr); +} + void ThreadPool::loop(std::chrono::milliseconds queue_max_wait, size_t shard) { auto& pending = pending_queues[shard]; diff --git a/src/crimson/os/alienstore/thread_pool.h b/src/crimson/os/alienstore/thread_pool.h index bbc1430f75a..db44bd02f9e 100644 --- a/src/crimson/os/alienstore/thread_pool.h +++ b/src/crimson/os/alienstore/thread_pool.h @@ -121,6 +121,7 @@ class ThreadPool { return stopping.load(std::memory_order_relaxed); } static void pin(const std::vector& cpus); + static void block_sighup(); seastar::semaphore& local_free_slots() { return submit_queue.local().free_slots; } diff --git a/src/crimson/osd/main.cc b/src/crimson/osd/main.cc index fd80c84ac25..04f8ca7cf33 100644 --- a/src/crimson/osd/main.cc +++ b/src/crimson/osd/main.cc @@ -223,7 +223,8 @@ int main(int argc, char* argv[]) ceph_abort_msg(fmt::format("pidfile_write failed with {} {}", ret, cpp_strerror(-ret))); } - // just ignore SIGHUP, we don't reread settings + // just ignore SIGHUP, we don't reread settings. keep in mind signals + // handled by S* must be blocked for alien threads (see AlienStore). seastar::engine().handle_signal(SIGHUP, [] {}); const int whoami = std::stoi(local_conf()->name.get_id()); const auto nonce = get_nonce(); -- 2.39.5