From: Adam C. Emerson Date: Fri, 6 Mar 2026 00:53:17 +0000 (-0500) Subject: neorados: Go through linger_cancel on `io_context` shutdown X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=f5affe06676bec2f32f856bce85b5d78932d807f;p=ceph.git neorados: Go through linger_cancel on `io_context` shutdown Rather than just dropping the reference, clean up the linger operation properly within Objecter. Also, clear out handlers before relinquishing reference to avoid use-after-free. Fixes: https://tracker.ceph.com/issues/75164 Signed-off-by: Adam C. Emerson --- diff --git a/qa/workunits/rados/test.sh b/qa/workunits/rados/test.sh index 62ae8356497f..857b4f73519d 100755 --- a/qa/workunits/rados/test.sh +++ b/qa/workunits/rados/test.sh @@ -28,6 +28,10 @@ NEORADOS_TESTS=( read_operations snapshots watch_notify write_operations ) +VALGRIND_TESTS=( + neorados_leak_watch_notify +) + # Note on argument ordering: This script processes arguments sequentially, # so the order matters. Arguments must be provided in this specific sequence: # 1. --serial OR --crimson (optional) @@ -102,6 +106,11 @@ if [ $vstart -eq 1 ]; then ninja -j$(nproc) ceph_test_neorados_$f done + for f in "${VALGRIND_TESTS[@]}"; + do + ninja -j$(nproc) ceph_test_$f + done + echo "Setting up a test cluster..." ninja -j$(nproc) vstart ../src/vstart.sh --debug --new -x --localhost --bluestore @@ -133,9 +142,7 @@ do done # Running all tests in ceph_test_neorados -for f in \ - cls cmd handler_error io ec_io list ec_list misc pool read_operations snapshots \ - watch_notify write_operations +for f in "${NEORADOS_TESTS[@]}" do executable="ceph_test_neorados_$f" if [ $vstart -eq 1 ]; then @@ -165,6 +172,31 @@ do fi done +# Running all tests in VALGRIND_TESTS +for f in "${VALGRIND_TESTS[@]}" +do + executable="ceph_test_$f" + if [ $vstart -eq 1 ]; then + executable="./bin/$executable" + fi + if [ $parallel -eq 1 ]; then + r=$(printf '%25s' "$f") + ff=$(echo "$f" | awk '{print $1}') + bash -o pipefail -exc "valgrind $executable --error-exitcode=1 --track-origins=yes --leak-check=yes --log-file=valgrind_$ff.log" & + pid=$! + echo "valgrind test $f on pid $pid" + pids[$f]=$pid + test_type["$f"]="valgrind" # Store test type for later use in parallel mode + else + # If running in serial mode, run the test directly + if ! timeout "$timeout" valgrind "$executable" --track-origins=yes --error-exitcode=1 --leak-check=yes; then + echo "ERROR: Test $f timed out after $timeout seconds" + echo "Check the logs for failures in $f" + ret=1 + fi + fi +done + if [ $parallel -eq 1 ]; then for t in "${!pids[@]}" do diff --git a/src/neorados/RADOS.cc b/src/neorados/RADOS.cc index f14012633508..69d5bc4cd256 100644 --- a/src/neorados/RADOS.cc +++ b/src/neorados/RADOS.cc @@ -1398,12 +1398,23 @@ class Notifier : public async::service_list_base_hook { std::shared_ptr neoref; void service_shutdown() { - if (neoref) { - neoref = nullptr; - } - linger_op.reset(); + // Ensure neorados object and operation live to the end of the + // function. + auto localop = std::move(linger_op); + [[maybe_unused]] auto localneo = std::move(neoref); std::unique_lock l(m); handlers.clear(); + l.unlock(); + if (localop) { + // We are being taken down and will execute no more + // handlers. Call `linger_cancel` to clean up properly in + // Objecter. (It doesn't call out to the OSD or anything.) + // + // Removal and decrement are guarded by `linger_op->canceled` + // so there's no risk of an underflow. + localop->objecter->linger_cancel(localop.get()); + } + // The Notifier object is freed by the `linger_cancel` above. } public: diff --git a/src/test/neorados/CMakeLists.txt b/src/test/neorados/CMakeLists.txt index f5f63e18b1c6..505bc59a8aaf 100644 --- a/src/test/neorados/CMakeLists.txt +++ b/src/test/neorados/CMakeLists.txt @@ -233,3 +233,10 @@ target_link_libraries(ceph_test_neorados_watch_notify install(TARGETS ceph_test_neorados_watch_notify DESTINATION ${CMAKE_INSTALL_BINDIR}) + +add_executable(ceph_test_neorados_leak_watch_notify leak_watch_notify.cc) +target_link_libraries(ceph_test_neorados_leak_watch_notify global libneorados + neoradostest-support ${unittest_libs}) +install(TARGETS + ceph_test_neorados_leak_watch_notify + DESTINATION ${CMAKE_INSTALL_BINDIR}) diff --git a/src/test/neorados/leak_watch_notify.cc b/src/test/neorados/leak_watch_notify.cc new file mode 100644 index 000000000000..407e622b5952 --- /dev/null +++ b/src/test/neorados/leak_watch_notify.cc @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 sts=2 expandtab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2026 Contributors to the Ceph Project + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include +#include + +#include +#include + +#include "include/neorados/RADOS.hpp" + +#include "common_tests.h" + +namespace asio = boost::asio; + +using boost::system::error_code; + +// Intentionally get a watch and don't unwatch it to make sure we call +// `linger_cancel` and don't leak. + +int main() +{ + asio::io_context c; + std::string_view oid = "obj"; + + std::optional rados; + neorados::IOContext pool; + neorados::RADOS::Builder{}.build(c, [&](error_code ec, neorados::RADOS r_) { + rados = std::move(r_); + create_pool(*rados, get_temp_pool_name(), + [&](error_code ec, int64_t poolid) { + pool.set_pool(poolid); + rados->watch(oid, pool, [&](error_code, std::uint64_t cookie) { + std::cout << "Got watch: " << cookie << std::endl; + }); + }); + }); + c.run(); +}