From: Prasanna Kumar Kalever Date: Thu, 4 Sep 2025 16:26:15 +0000 (+0530) Subject: librbd/mirror: remove peer UUID of group snapshot before attempting removal X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=28ac649c31bd37cbef86dc2e70d3c18a3063f01a;p=ceph-ci.git librbd/mirror: remove peer UUID of group snapshot before attempting removal Problem: When the number of snapshots exceeds the configured limit (rbd_mirroring_max_mirroring_snapshots), the cleanup process remove the last group snapshot on the primary. If this removal fails partially, the group snapshot with some or empty image snap may still remain. As a result: * The snapshot is incorrectly considered for synchronization to the secondary. * The secondary snapshot can remain forever in the incomplete state, leading to stuck of daemon progress. Solution: Before attempting to remove the group snapshot: * Explicitly remove the peer UUID associated with the snapshot. * This ensures that the snapshot is no longer considered eligible for synchronization. * Even if the actual snapshot deletion fails partially, the snapshot is not tried for mirroring, avoiding daemon stalls or inconsistencies. Signed-off-by: Prasanna Kumar Kalever Resolves: rhbz#2392398 --- diff --git a/src/librbd/mirror/snapshot/GroupUnlinkPeerRequest.cc b/src/librbd/mirror/snapshot/GroupUnlinkPeerRequest.cc index 98f14be5c52..68920f34960 100644 --- a/src/librbd/mirror/snapshot/GroupUnlinkPeerRequest.cc +++ b/src/librbd/mirror/snapshot/GroupUnlinkPeerRequest.cc @@ -41,8 +41,6 @@ template void GroupUnlinkPeerRequest::unlink_peer() { ldout(m_cct, 10) << "rbd_mirroring_max_mirroring_snapshots = " << m_max_snaps << dendl; - uint64_t count = 0; - auto unlink_snap = m_group_snaps.end(); // First pass : cleanup snaps that have no peer_uuids or are incomplete for (auto peer: *m_mirror_peer_uuids){ for (auto it = m_group_snaps.begin(); it != m_group_snaps.end(); it++) { @@ -56,14 +54,15 @@ void GroupUnlinkPeerRequest::unlink_peer() { (ns->mirror_peer_uuids.count(peer) != 0 && ns->is_primary() && it->state == cls::rbd::GROUP_SNAPSHOT_STATE_INCOMPLETE)){ - unlink_snap = it; - process_snapshot(*unlink_snap, peer); + process_snapshot(*it, peer); return; } } } for (auto peer: *m_mirror_peer_uuids){ + uint64_t count = 0; + auto unlink_snap = m_group_snaps.end(); for (auto it = m_group_snaps.begin(); it != m_group_snaps.end(); it++) { auto ns = std::get_if( &it->snapshot_namespace); @@ -135,17 +134,17 @@ void GroupUnlinkPeerRequest::process_snapshot(cls::rbd::GroupSnapshot group_s std::string mirror_peer_uuid) { ldout(m_cct, 10) << "snap id: " << group_snap.id << dendl; bool found = false; - bool has_newer_mirror_snap = false; + m_has_newer_mirror_snap = false; for (auto it = m_group_snaps.begin(); it != m_group_snaps.end(); it++) { if (it->id == group_snap.id) { found = true; } else if (found) { auto ns = std::get_if( - &it->snapshot_namespace); + &it->snapshot_namespace); if (ns != nullptr) { - has_newer_mirror_snap = true; - break; + m_has_newer_mirror_snap = true; + break; } } } @@ -153,11 +152,13 @@ void GroupUnlinkPeerRequest::process_snapshot(cls::rbd::GroupSnapshot group_s if (!found) { ldout(m_cct, 15) << "missing snapshot: snap_id=" << group_snap.id << dendl; finish(-ENOENT); - return; + return; } - if (has_newer_mirror_snap) { - remove_group_snapshot(group_snap); + const auto& ns = std::get( + group_snap.snapshot_namespace); + if (ns.mirror_peer_uuids.empty()) { + remove_group_snapshot(group_snap); } else { remove_peer_uuid(group_snap, mirror_peer_uuid); } @@ -170,9 +171,10 @@ void GroupUnlinkPeerRequest::remove_peer_uuid( std::string mirror_peer_uuid) { ldout(m_cct, 10) << dendl; - auto aio_comp = create_rados_callback< - GroupUnlinkPeerRequest, - &GroupUnlinkPeerRequest::handle_remove_peer_uuid>(this); + auto aio_comp = create_rados_callback( + new LambdaContext([this, group_snap](int r) { + handle_remove_peer_uuid(r, group_snap); + })); auto ns = std::get_if( &group_snap.snapshot_namespace); @@ -188,7 +190,8 @@ void GroupUnlinkPeerRequest::remove_peer_uuid( } template -void GroupUnlinkPeerRequest::handle_remove_peer_uuid(int r) { +void GroupUnlinkPeerRequest::handle_remove_peer_uuid( + int r, cls::rbd::GroupSnapshot group_snap) { ldout(m_cct, 10) << "r=" << r << dendl; if (r < 0) { @@ -197,7 +200,12 @@ void GroupUnlinkPeerRequest::handle_remove_peer_uuid(int r) { finish(r); return; } - list_group_snaps(); + + if (m_has_newer_mirror_snap) { + remove_group_snapshot(group_snap); + } else { + list_group_snaps(); + } } diff --git a/src/librbd/mirror/snapshot/GroupUnlinkPeerRequest.h b/src/librbd/mirror/snapshot/GroupUnlinkPeerRequest.h index c48d04e746b..c624eaf9e01 100644 --- a/src/librbd/mirror/snapshot/GroupUnlinkPeerRequest.h +++ b/src/librbd/mirror/snapshot/GroupUnlinkPeerRequest.h @@ -55,6 +55,7 @@ private: Context *m_on_finish; uint64_t m_max_snaps; + bool m_has_newer_mirror_snap = false; CephContext *m_cct; std::vector m_group_snaps; @@ -71,7 +72,7 @@ private: void remove_peer_uuid(cls::rbd::GroupSnapshot group_snap, std::string mirror_peer_uuid); - void handle_remove_peer_uuid(int r); + void handle_remove_peer_uuid(int r, cls::rbd::GroupSnapshot group_snap); void remove_group_snapshot(cls::rbd::GroupSnapshot group_snap); void handle_remove_group_snapshot(int r);