From ce84a03fd06bce43b650611f02ab4cd62bd644fe Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 4 Sep 2024 13:47:17 +0300 Subject: [PATCH] stall-detector: Try hard not to crash while collecting backtrace Sometimes stall-detector signal comes in the middle of exception handling. If the stall is detected, stack unwiding starts to collect the stalled backtrace. Since exception handling means unwiding the stack as well, those two unwinders need to cooperate carefully, which is not guaranteed (spoiler: they don't cooperate carefully). In unlucky case, segmentation fault happens, the app is killed with SEGV. This patch helps stall detector to bail out in case of SEGV arrival while collecting the backtrace with minimally possible yet detailed enough stall report. Signed-off-by: Pavel Emelyanov --- include/seastar/core/reactor.hh | 1 + src/core/reactor.cc | 29 +++++++++++++++++++++++++++++ tests/unit/stall_detector_test.cc | 5 +++++ 3 files changed, 35 insertions(+) diff --git a/include/seastar/core/reactor.hh b/include/seastar/core/reactor.hh index 862e028c2e5..ebf2b7d3844 100644 --- a/include/seastar/core/reactor.hh +++ b/include/seastar/core/reactor.hh @@ -655,6 +655,7 @@ public: /// resets the supression state. static void set_stall_detector_report_function(std::function report); static std::function get_stall_detector_report_function(); + static void set_stall_detector_crash_collecting_backtrace(); }; }; diff --git a/src/core/reactor.cc b/src/core/reactor.cc index 1ddda9e6ed0..1e8bbe24fc9 100644 --- a/src/core/reactor.cc +++ b/src/core/reactor.cc @@ -32,6 +32,7 @@ module; #include #include #include +#include #include #include @@ -842,7 +843,23 @@ class backtrace_buffer { } }; +static thread_local jmp_buf stall_detector_env; +static thread_local bool in_stall_detector = false; +static thread_local bool crash_collecting_backtrace = false; + +inline void maybe_crash_for_test() noexcept { + if (crash_collecting_backtrace) [[unlikely]] { + *(volatile int *)nullptr = 0; + } +} + static void print_with_backtrace(backtrace_buffer& buf, bool oneline) noexcept { + if (sigsetjmp(stall_detector_env, 0)) { + buf.append(" ¯\\_(ツ)_/¯\n"); + goto out; + } + in_stall_detector = true; + if (local_engine) { buf.append(" on shard "); buf.append_decimal(this_shard_id()); @@ -853,12 +870,17 @@ static void print_with_backtrace(backtrace_buffer& buf, bool oneline) noexcept { if (!oneline) { buf.append(".\nBacktrace:\n"); + maybe_crash_for_test(); buf.append_backtrace(); } else { buf.append(". Backtrace:"); + maybe_crash_for_test(); buf.append_backtrace_oneline(); buf.append("\n"); } + +out: + in_stall_detector = false; buf.flush(); } @@ -1502,6 +1524,10 @@ reactor::test::set_stall_detector_report_function(std::function report) r._cpu_stall_detector->reset_suppression_state(reactor::now()); } +void reactor::test::set_stall_detector_crash_collecting_backtrace() { + crash_collecting_backtrace = true; +} + std::function reactor::test::get_stall_detector_report_function() { return engine()._cpu_stall_detector->get_config().report; @@ -3930,6 +3956,9 @@ void install_oneshot_signal_handler() { struct sigaction sa; sa.sa_sigaction = [](int sig, siginfo_t *info, void *p) { + if (sig == SIGSEGV && in_stall_detector) { + siglongjmp(stall_detector_env, 1); + } std::lock_guard g(lock); if (!handled) { handled = true; diff --git a/tests/unit/stall_detector_test.cc b/tests/unit/stall_detector_test.cc index 289fd0a2437..31c3e5ef64f 100644 --- a/tests/unit/stall_detector_test.cc +++ b/tests/unit/stall_detector_test.cc @@ -175,6 +175,11 @@ SEASTAR_THREAD_TEST_CASE(spin_in_kernel) { test_spin_with_body("kernel", [] { mmap_populate(128 * 1024); }); } +SEASTAR_THREAD_TEST_CASE(crash_collecting_backtrace) { + reactor::test::set_stall_detector_crash_collecting_backtrace(); + engine().update_blocked_reactor_notify_ms(100ms); + spin(500ms); +} #else