From 2551130bcc12c55ddaa879b4f5b77c81890b51b2 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Tue, 25 Oct 2022 11:42:10 -0400 Subject: [PATCH] log: use non-blocking atomic writes to stderr fifos Buggy container runtimes are known to do partial reads on container's stderr and emit torn logs [1]. This is due to write(2) allowing partial data transfer. One way to avoid this when stderr is a FIFO (which is the case for container runtimes) is to change the file flags to set O_NONBLOCK which alters the behavior of write(2) when the write size is less than PIPE_BUF. This makes a write guaranteed to be atomic. The small complexity added is to poll on stderr in this case whenever a write(2) fails with EAGAIN. This does not block the log flush thread in any new way. It would always block when the stderr FIFO was full. [1] https://github.com/containers/conmon/issues/242 Fixes: cef04bc4ed0a3b1c1e9873fbbdd023b3bdf8af03 Fixes: https://tracker.ceph.com/issues/57923 Related: https://bugzilla.redhat.com/show_bug.cgi?id=1996599 Related: https://bugzilla.redhat.com/show_bug.cgi?id=2108228 Signed-off-by: Patrick Donnelly --- src/log/Log.cc | 100 ++++++++++++++++++++++++++++++++++++++++++++++++- src/log/Log.h | 9 +++++ 2 files changed, 108 insertions(+), 1 deletion(-) diff --git a/src/log/Log.cc b/src/log/Log.cc index 78bedf03cfa4e..967bcd5dcb7b8 100644 --- a/src/log/Log.cc +++ b/src/log/Log.cc @@ -18,10 +18,14 @@ #include "LogClock.h" #include "SubsystemMap.h" +#include + #include #include +#include #include +#include #include #include @@ -49,6 +53,7 @@ Log::Log(const SubsystemMap *s) m_recent(DEFAULT_MAX_RECENT) { m_log_buf.reserve(MAX_LOG_BUF); + _configure_stderr(); } Log::~Log() @@ -64,6 +69,36 @@ Log::~Log() } } +void Log::_configure_stderr() +{ + struct stat info; + if (int rc = fstat(m_fd_stderr, &info); rc == -1) { + std::cerr << "failed to stat stderr: " << cpp_strerror(errno) << std::endl; + return; + } + + if (S_ISFIFO(info.st_mode)) { + /* Set O_NONBLOCK on FIFO stderr file. We want to ensure atomic debug log + * writes so they do not get partially read by e.g. buggy container + * runtimes. See also IEEE Std 1003.1-2017 and Log::_log_stderr below. + */ + int flags = fcntl(m_fd_stderr, F_GETFL); + if (flags == -1) { + std::cerr << "failed to get fcntl flags for stderr: " << cpp_strerror(errno) << std::endl; + return; + } + if (!(flags & O_NONBLOCK)) { + flags |= O_NONBLOCK; + flags = fcntl(m_fd_stderr, F_SETFL, flags); + if (flags == -1) { + std::cerr << "failed to set fcntl flags for stderr: " << cpp_strerror(errno) << std::endl; + return; + } + } + do_stderr_poll = true; + } +} + /// void Log::set_coarse_timestamps(bool coarse) { @@ -260,6 +295,69 @@ void Log::_log_safe_write(std::string_view sv) } } +void Log::set_stderr_fd(int fd) +{ + m_fd_stderr = fd; + _configure_stderr(); +} + +void Log::_log_stderr(std::string_view strv) +{ + if (do_stderr_poll) { + auto& prefix = m_log_stderr_prefix; + size_t const len = prefix.size() + strv.size(); + boost::container::small_vector buf; + buf.resize(len+1, '\0'); + memcpy(buf.data(), prefix.c_str(), prefix.size()); + memcpy(buf.data()+prefix.size(), strv.data(), strv.size()); + + char const* const start = buf.data(); + char const* current = start; + while ((size_t)(current-start) < len) { + auto chunk = std::min(PIPE_BUF, len-(ssize_t)(current-start)); + while (1) { + ssize_t rc = write(m_fd_stderr, current, chunk); + if (rc == chunk) { + current += chunk; + break; + } else if (rc > 0) { + /* According to IEEE Std 1003.1-2017, this cannot happen: + * + * Write requests to a pipe or FIFO shall be handled in the same way as a regular file with the following exceptions: + * ... + * If the O_NONBLOCK flag is set ... + * ... + * A write request for {PIPE_BUF} or fewer bytes shall have the + * following effect: if there is sufficient space available in + * the pipe, write() shall transfer all the data and return the + * number of bytes requested. Otherwise, write() shall transfer + * no data and return -1 with errno set to [EAGAIN]. + * + * In any case, handle misbehavior gracefully by incrementing current. + */ + current += rc; + break; + } else if (rc == -1) { + if (errno == EAGAIN) { + struct pollfd pfd[1]; + pfd[0].fd = m_fd_stderr; + pfd[0].events = POLLOUT; + poll(pfd, 1, -1); + /* ignore errors / success, just retry the write */ + } else if (errno == EINTR) { + continue; + } else { + /* some other kind of error, no point logging if stderr writes fail */ + return; + } + } + } + } + } else { + fmt::print(std::cerr, "{}{}", m_log_stderr_prefix, strv); + } +} + void Log::_flush_logbuf() { if (m_log_buf.size()) { @@ -319,7 +417,7 @@ void Log::_flush(EntryVector& t, bool crash) pos[used++] = '\n'; if (do_stderr) { - fmt::print(std::cerr, "{}{}", m_log_stderr_prefix, std::string_view(pos, used)); + _log_stderr(std::string_view(pos, used)); } if (do_fd) { diff --git a/src/log/Log.h b/src/log/Log.h index 51ed58d928966..3a60937af5583 100644 --- a/src/log/Log.h +++ b/src/log/Log.h @@ -18,6 +18,8 @@ #include "log/Entry.h" +#include + struct uuid_d; namespace ceph { @@ -44,6 +46,7 @@ public: void reopen_log_file(); void chown_log_file(uid_t uid, gid_t gid); void set_log_stderr_prefix(std::string_view p); + void set_stderr_fd(int fd); void flush(); @@ -108,6 +111,8 @@ private: uid_t m_uid = 0; gid_t m_gid = 0; + int m_fd_stderr = STDERR_FILENO; + int m_fd_last_error = 0; ///< last error we say writing to fd (if any) int m_syslog_log = -2, m_syslog_crash = -2; @@ -116,6 +121,7 @@ private: int m_journald_log = -3, m_journald_crash = -3; std::string m_log_stderr_prefix; + bool do_stderr_poll = false; std::shared_ptr m_graylog; std::unique_ptr m_journald; @@ -133,6 +139,9 @@ private: void _log_safe_write(std::string_view sv); void _flush_logbuf(); void _log_message(std::string_view s, bool crash); + void _configure_stderr(); + void _log_stderr(std::string_view strv); + }; -- 2.39.5