From 97f2434d23836b8032dceeed76d73d3ec77f386d Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Tue, 3 Jun 2025 16:07:33 +0800 Subject: [PATCH] librbd/cache/pwl: fix memory leak in SyncPoint persist context cleanup Previously, SyncPoint allocated two C_Gather instances tracked by raw pointers but failed to properly clean them up when only a single sync point existed, causing memory leaks detected by AddressSanitizer. This change fixes the leak by modifying AbstractWriteLog::shut_down() to check for prior sync points in the chain. When the current sync point is the only one present, we now activate the m_prior_log_entries_persisted context to ensure: - The onfinish callback executes and releases the captured strong reference to the enclosing SyncPoint - The parent m_sync_point_persist context completes and gets properly released This ensures all allocated contexts are cleaned up correctly during shutdown, eliminating the memory leak. The ASan report: ``` Indirect leak of 2064 byte(s) in 1 object(s) allocated from: #0 0x56440919ae2d in operator new(unsigned long) (/home/jenkins-build/build/workspace/ceph-pull-requests/build/bin/unittest_librbd+0x2f3de2d) (BuildId: 6a04677c6ee5235f1a41815df807f97c5b96d4cd) #1 0x56440bd67751 in __gnu_cxx::new_allocator::allocate(unsigned long, void const*) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/ext/new_allocator.h:127:27 #2 0x56440bd676e0 in std::allocator::allocate(unsigned long) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/allocator.h:185:32 #3 0x56440bd676e0 in std::allocator_traits>::allocate(std::allocator&, unsigned long) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/alloc_traits.h:464:20 #4 0x56440bd6730b in std::_Vector_base>::_M_allocate(unsigned long) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/stl_vector.h:346:20 #5 0x7fd33e00e8d1 in std::vector>::reserve(unsigned long) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/vector.tcc:78:22 #6 0x7fd33e00c51c in librbd::cache::pwl::SyncPoint::SyncPoint(unsigned long, ceph::common::CephContext*) /home/jenkins-build/build/workspace/ceph-pull-requests/src/librbd/cache/pwl/SyncPoint.cc:20:27 #7 0x56440bd65f26 in decltype(::new((void*)(0)) librbd::cache::pwl::SyncPoint(std::declval(), std::declval())) std::construct_at(librbd::cache::pwl::SyncPoint*, unsigned long&, ceph::common::CephContext*&) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/stl_construct.h:97:39 #8 0x56440bd65b98 in void std::allocator_traits>::construct(std::allocator&, librbd::cache::pwl::SyncPoint*, unsigned long&, ceph::common::CephContext*&) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/alloc_traits.h:518:4 #9 0x56440bd657d3 in std::_Sp_counted_ptr_inplace, (__gnu_cxx::_Lock_policy)2>::_Sp_counted_ptr_inplace(std::allocator, unsigned long&, ceph::common::CephContext*&) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/shared_ptr_base.h:519:4 #10 0x56440bd65371 in std::__shared_count<(__gnu_cxx::_Lock_policy)2>::__shared_count, unsigned long&, ceph::common::CephContext*&>(librbd::cache::pwl::SyncPoint*&, std::_Sp_alloc_shared_tag>, unsigned long&, ceph::common::CephContext*&) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/shared_ptr_base.h:651:6 #11 0x56440bd65163 in std::__shared_ptr::__shared_ptr, unsigned long&, ceph::common::CephContext*&>(std::_Sp_alloc_shared_tag>, unsigned long&, ceph::common::CephContext*&) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/shared_ptr_base.h:1342:14 #12 0x56440bd650e6 in std::shared_ptr::shared_ptr, unsigned long&, ceph::common::CephContext*&>(std::_Sp_alloc_shared_tag>, unsigned long&, ceph::common::CephContext*&) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/shared_ptr.h:409:4 #13 0x56440bd65057 in std::shared_ptr std::allocate_shared, unsigned long&, ceph::common::CephContext*&>(std::allocator const&, unsigned long&, ceph::common::CephContext*&) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/shared_ptr.h:862:14 #14 0x56440bca97e7 in std::shared_ptr std::make_shared(unsigned long&, ceph::common::CephContext*&) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/shared_ptr.h:878:14 #15 0x56440bd443c8 in librbd::cache::pwl::AbstractWriteLog::new_sync_point(librbd::cache::pwl::DeferredContexts&) /home/jenkins-build/build/workspace/ceph-pull-requests/src/librbd/cache/pwl/AbstractWriteLog.cc:1905:20 #16 0x56440bd42e4c in librbd::cache::pwl::AbstractWriteLog::flush_new_sync_point(librbd::cache::pwl::C_FlushRequest>*, librbd::cache::pwl::DeferredContexts&) /home/jenkins-build/build/workspace/ceph-pull-requests/src/librbd/cache/pwl/AbstractWriteLog.cc:1951:3 #17 0x56440bd9cbf2 in librbd::cache::pwl::AbstractWriteLog::flush_new_sync_point_if_needed(librbd::cache::pwl::C_FlushRequest>*, librbd::cache::pwl::DeferredContexts&) /home/jenkins-build/build/workspace/ceph-pull-requests/src/librbd/cache/pwl/AbstractWriteLog.cc:1990:5 #18 0x56440bd9c636 in librbd::cache::pwl::AbstractWriteLog::internal_flush(bool, Context*)::'lambda'(librbd::cache::pwl::GuardedRequestFunctionContext&)::operator()(librbd::cache::pwl::GuardedRequestFunctionContext&) const /home/jenkins-build/build/workspace/ceph-pull-requests/src/librbd/cache/pwl/AbstractWriteLog.cc:2152:9 #19 0x56440bd9b9b4 in boost::detail::function::void_function_obj_invoker::internal_flush(bool, Context*)::'lambda'(librbd::cache::pwl::GuardedRequestFunctionContext&), void, librbd::cache::pwl::GuardedRequestFunctionContext&>::invoke(boost::detail::function::function_buffer&, librbd::cache::pwl::GuardedRequestFunctionContext&) /opt/ceph/include/boost/function/function_template.hpp:100:11 #20 0x56440bd29321 in boost::function_n::operator()(librbd::cache::pwl::GuardedRequestFunctionContext&) const /opt/ceph/include/boost/function/function_template.hpp:789:14 #21 0x56440bd28d85 in librbd::cache::pwl::GuardedRequestFunctionContext::finish(int) /home/jenkins-build/build/workspace/ceph-pull-requests/src/librbd/cache/pwl/Request.h:335:5 #22 0x5644091e0fe0 in Context::complete(int) /home/jenkins-build/build/workspace/ceph-pull-requests/src/include/Context.h:102:5 #23 0x56440bd9b378 in librbd::cache::pwl::AbstractWriteLog::detain_guarded_request(librbd::cache::pwl::C_BlockIORequest>*, librbd::cache::pwl::GuardedRequestFunctionContext*, bool) /home/jenkins-build/build/workspace/ceph-pull-requests/src/librbd/cache/pwl/AbstractWriteLog.cc:1202:20 #24 0x56440bd96c50 in librbd::cache::pwl::AbstractWriteLog::internal_flush(bool, Context*) /home/jenkins-build/build/workspace/ceph-pull-requests/src/librbd/cache/pwl/AbstractWriteLog.cc:2154:3 #25 0x56440bd1e4b5 in librbd::cache::pwl::AbstractWriteLog::shut_down(Context*) /home/jenkins-build/build/workspace/ceph-pull-requests/src/librbd/cache/pwl/AbstractWriteLog.cc:703:3 #26 0x56440bdb9022 in librbd::cache::pwl::TestMockCacheSSDWriteLog_compare_and_write_compare_matched_Test::TestBody() /home/jenkins-build/build/workspace/ceph-pull-requests/src/test/librbd/cache/pwl/test_mock_SSDWriteLog.cc:403:7 ``` Fixes: https://tracker.ceph.com/issues/71335 Signed-off-by: Kefu Chai (cherry picked from commit 05fd6f90e6e528c628af7fbb106b73e89d57464c) --- src/librbd/cache/pwl/AbstractWriteLog.cc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/librbd/cache/pwl/AbstractWriteLog.cc b/src/librbd/cache/pwl/AbstractWriteLog.cc index b6af9b2ec2d27..9b188df917f20 100644 --- a/src/librbd/cache/pwl/AbstractWriteLog.cc +++ b/src/librbd/cache/pwl/AbstractWriteLog.cc @@ -657,6 +657,20 @@ void AbstractWriteLog::shut_down(Context *on_finish) { periodic_stats(); std::unique_lock locker(m_lock); + + ceph_assert(m_current_sync_point); + if (!m_current_sync_point->earlier_sync_point) { + // This is the only sync point, hence no need to wait for the persistence + // of prior sync points. + m_current_sync_point->prior_persisted_gather_activate(); + // we don't create a new sync point, if there are no writes in current sync point's + // log entry. + ceph_assert(m_current_sync_point->log_entry->writes == 0); + // In that case, we shold not wait for the log entry's persistence of current + // sync point, which is otherwise waited until we flush its prior sync point. + m_current_sync_point->persist_gather_activate(); + } + check_image_cache_state_clean(); m_wake_up_enabled = false; m_log_entries.clear(); -- 2.39.5