// /etc/os-release looks like
//
//   NAME=Fedora
//   VERSION="28 (Server Edition)"
//   ID=fedora
//   VERSION_ID=28
//
// or
//
//   NAME="Ubuntu"
//   VERSION="16.04.3 LTS (Xenial Xerus)"
//   ID=ubuntu
//   ID_LIKE=debian
//
// parse_from_os_release("FOO=bar\nTHIS=\"that\"\n", "FOO=", ...) will
// write "bar\0" to out buffer, which is assumed to be as large as the input
// file.
//
// file: NUL-terminated contents of the os-release file.
// key:  field name including the trailing '=', e.g. "NAME=".
// out:  destination buffer; receives the NUL-terminated value with one pair
//       of surrounding double quotes (if present) stripped.
//
// The key is only matched at the beginning of a line, so "NAME=" does not
// pick up "PRETTY_NAME=" and "ID=" does not pick up "VERSION_ID=".
//
// Returns 0 on success, -1 if the key is not found or its value is empty.
// Uses only the str*/mem* routines on caller-provided buffers (no allocation).
static int parse_from_os_release(
  const char *file, const char *key,
  char *out)
{
  if (!file || !key || !out) {
    return -1;
  }

  // find the first occurrence of key that starts a line
  const char *p = file;
  for (;;) {
    p = strstr(p, key);
    if (!p) {
      return -1;
    }
    if (p == file || *(p - 1) == '\n') {
      break;
    }
    ++p;  // matched in the middle of another field name; keep looking
  }

  const char *start = p + strlen(key);
  const char *end = strchr(start, '\n');
  if (!end) {
    // last line of the file without a trailing newline
    end = start + strlen(start);
  }
  // strip one pair of enclosing double quotes
  if (*start == '"' && *(end - 1) == '"') {
    ++start;
    --end;
  }
  if (start >= end) {
    return -1;
  }
  memcpy(out, start, end - start);
  out[end - start] = 0;
  return 0;
}
In that @@ -119,6 +168,101 @@ static void handle_fatal_signal(int signum) bt.print(oss); dout_emergency(oss.str()); + char base[PATH_MAX] = { 0 }; + if (g_ceph_context && + g_ceph_context->_conf->crash_dir.size()) { + // -- crash dump -- + // id + ostringstream idss; + utime_t now = ceph_clock_now(); + now.gmtime(idss); + uuid_d uuid; + uuid.generate_random(); + idss << "_" << uuid; + string id = idss.str(); + std::replace(id.begin(), id.end(), ' ', '_'); + + snprintf(base, sizeof(base), "%s/%s", + g_ceph_context->_conf->crash_dir.c_str(), + id.c_str()); + int r = ::mkdir(base, 0700); + if (r >= 0) { + char fn[PATH_MAX*2]; + snprintf(fn, sizeof(fn)-1, "%s/meta", base); + int fd = ::open(fn, O_CREAT|O_WRONLY, 0600); + if (fd >= 0) { + JSONFormatter jf(true); + jf.open_object_section("crash"); + jf.dump_string("crash_id", id); + now.gmtime(jf.dump_stream("timestamp")); + jf.dump_string("entity_name", g_ceph_context->_conf->name.to_str()); + jf.dump_string("ceph_version", ceph_version_to_str()); + + struct utsname u; + r = uname(&u); + if (r >= 0) { + jf.dump_string("utsname_hostname", u.nodename); + jf.dump_string("utsname_sysname", u.sysname); + jf.dump_string("utsname_release", u.release); + jf.dump_string("utsname_version", u.version); + jf.dump_string("utsname_machine", u.machine); + } + + // os-releaes + int in = ::open("/etc/os-release", O_RDONLY); + if (in >= 0) { + char buf[4096]; + r = safe_read(in, buf, sizeof(buf)-1); + if (r >= 0) { + buf[r] = 0; + char v[4096]; + if (parse_from_os_release(buf, "NAME=", v) >= 0) { + jf.dump_string("os_name", v); + } + if (parse_from_os_release(buf, "ID=", v) >= 0) { + jf.dump_string("os_id", v); + } + if (parse_from_os_release(buf, "VERSION_ID=", v) >= 0) { + jf.dump_string("os_version_id", v); + } + if (parse_from_os_release(buf, "VERSION=", v) >= 0) { + jf.dump_string("os_version", v); + } + } + ::close(in); + } + + // assert? 
+ if (g_assert_condition) { + jf.dump_string("assert_condition", g_assert_condition); + } + if (g_assert_func) { + jf.dump_string("assert_func", g_assert_func); + } + if (g_assert_file) { + jf.dump_string("assert_file", g_assert_file); + } + if (g_assert_line) { + jf.dump_unsigned("assert_file", g_assert_line); + } + if (g_assert_thread_name[0]) { + jf.dump_string("assert_thread_name", g_assert_thread_name); + } + + // backtrace + bt.dump(&jf); + + jf.close_section(); + ostringstream oss; + jf.flush(oss); + string s = oss.str(); + r = safe_write(fd, s.c_str(), s.size()); + (void)r; + ::close(fd); + } + } + } + // avoid recursion back into logging code if that is where // we got the SEGV. if (g_ceph_context && @@ -133,6 +277,14 @@ static void handle_fatal_signal(int signum) << dendl; g_ceph_context->_log->dump_recent(); + + if (base[0]) { + char fn[PATH_MAX*2]; + snprintf(fn, sizeof(fn)-1, "%s/log", base); + g_ceph_context->_log->set_log_file(fn); + g_ceph_context->_log->reopen_log_file(); + g_ceph_context->_log->dump_recent(); + } } reraise_fatal(signum); -- 2.39.5