From: Prasanna Kumar Kalever Date: Fri, 21 Mar 2025 14:00:09 +0000 (+0530) Subject: rbd-mirror: address group_snap_set() failures X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=e7cb8696f7f383304460264797ae19c045bedd83;p=ceph-ci.git rbd-mirror: address group_snap_set() failures There are 5 places where we call group_snap_set() 1. create_mirror_snapshot() 2. create_regular_snapshot() 3. mirror_snapshot_complete() 4. regular_snapshot_complete() and 5. remove_mirror_peer_uuid() For 1 & 2 cases, we can simply delete the so far created group snapshot in the respective callback handler which is basically an empty INCOMPLETE group snapshot and let the state machine recreate it again later. For 3 & 4 cases, we are cannot delete the created snapshot, because the image snapshots whould have synced/syncing by now, deleting the group snapshot will bring additional comlications (if there is a failover at the same time). Hence setting m_retry_validate_snap flag in this case, this would all the rescan even for regular group snapshots, if the snapshot is yet INCOMPLETE on disk the validate_image_snaps_sync_complete() will be called again. For case 5, added logic to retry remove_mirror_peer_uuid() again. Signed-off-by: Prasanna Kumar Kalever --- diff --git a/src/tools/rbd_mirror/group_replayer/Replayer.cc b/src/tools/rbd_mirror/group_replayer/Replayer.cc index c1c59c34e56..407d38f7249 100644 --- a/src/tools/rbd_mirror/group_replayer/Replayer.cc +++ b/src/tools/rbd_mirror/group_replayer/Replayer.cc @@ -288,7 +288,9 @@ void Replayer::load_local_group_snapshots() { validate_image_snaps_sync_complete(local_snap.id); auto snap_type = cls::rbd::get_group_snap_namespace_type( local_snap.snapshot_namespace); - if (snap_type == cls::rbd::GROUP_SNAPSHOT_NAMESPACE_TYPE_MIRROR) { + if (snap_type == cls::rbd::GROUP_SNAPSHOT_NAMESPACE_TYPE_MIRROR + || m_retry_validate_snap) { + m_retry_validate_snap = false; schedule_load_group_snapshots(); return; } @@ -439,13 +441,18 @@ void Replayer::validate_image_snaps_sync_complete( if (itl == m_local_group_snaps.end()) { return; } + + int r; auto snap_type = cls::rbd::get_group_snap_namespace_type( itl->snapshot_namespace); if (snap_type == cls::rbd::GROUP_SNAPSHOT_NAMESPACE_TYPE_USER) { locker.unlock(); C_SaferCond *ctx = new C_SaferCond; regular_snapshot_complete(group_snap_id, ctx); - ctx->wait(); + r = ctx->wait(); + if (r < 0) { + m_retry_validate_snap = true; + } return; } @@ -455,17 +462,19 @@ void Replayer::validate_image_snaps_sync_complete( // 1. Get remote image_id // 2. Translate to local image id // 3. get the image spec - if (itr->snaps.size() == 0) { dout(5) << "Image list is empty!!" << dendl; C_SaferCond *ctx = new C_SaferCond; mirror_snapshot_complete(group_snap_id, nullptr, ctx); - ctx->wait(); + r = ctx->wait(); + if (r < 0) { + m_retry_validate_snap = true; + } return; } std::vector local_images; - int r = local_group_image_list_by_id(&local_images); + r = local_group_image_list_by_id(&local_images); if (r < 0) { derr << "failed group image list: " << cpp_strerror(r) << dendl; return; @@ -537,7 +546,10 @@ void Replayer::validate_image_snaps_sync_complete( for (auto &spec : image_snap_spec) { C_SaferCond *ctx = new C_SaferCond; mirror_snapshot_complete(group_snap_id, &spec, ctx); - ctx->wait(); + r = ctx->wait(); + if (r < 0) { + m_retry_validate_snap = true; + } } locker.unlock(); } @@ -802,7 +814,26 @@ void Replayer::handle_create_mirror_snapshot( int r, const std::string &group_snap_id, Context *on_finish) { dout(10) << group_snap_id << ", r=" << r << dendl; - on_finish->complete(0); + if (r < 0) { // clean the group snapshot here + dout(10) << "cleaning snapshot as failed group_snap_set previously with : " + << group_snap_id << ", this will be attempted to sync later" + << dendl; + r = librbd::cls_client::group_snap_remove(&m_local_io_ctx, + librbd::util::group_header_name(m_local_group_id), group_snap_id); + if (r < 0) { + derr << "failed to remove group snapshot : " + << group_snap_id << " : " << cpp_strerror(r) << dendl; + } + for (auto local_snap = m_local_group_snaps.begin(); + local_snap != m_local_group_snaps.end(); ++local_snap) { + if (local_snap->id != group_snap_id) { + continue; + } + m_local_group_snaps.erase(local_snap); + } + } + + on_finish->complete(r); } template @@ -863,7 +894,7 @@ void Replayer::handle_mirror_snapshot_complete( int r, const std::string &group_snap_id, Context *on_finish) { dout(10) << group_snap_id << ", r=" << r << dendl; - on_finish->complete(0); + on_finish->complete(r); } template @@ -883,8 +914,8 @@ void Replayer::remove_mirror_peer_uuid(const std::string &snap_id) { if (rns != nullptr) { rns->mirror_peer_uuids.clear(); auto comp = create_rados_callback( - new LambdaContext([this](int r) { - dout(10) << "remove_mirror_peer_uuid done for remote group snap" << dendl; + new LambdaContext([this, snap_id](int r) { + handle_remove_mirror_peer_uuid(r, snap_id); })); librados::ObjectWriteOperation op; @@ -896,6 +927,18 @@ void Replayer::remove_mirror_peer_uuid(const std::string &snap_id) { } } +template +void Replayer::handle_remove_mirror_peer_uuid( + int r, const std::string &snap_id) { + dout(10) << snap_id << ", r=" << r << dendl; + + if (r < 0) { + remove_mirror_peer_uuid(snap_id); // retry? + } + + return; +} + template bool Replayer::prune_all_image_snapshots(cls::rbd::GroupSnapshot *local_snap) { bool retain = false; @@ -1021,8 +1064,8 @@ void Replayer::create_regular_snapshot( librbd::cls_client::group_snap_set(&op, group_snap); auto comp = create_rados_callback( - new LambdaContext([this, on_finish](int r) { - handle_create_regular_snapshot(r, on_finish); + new LambdaContext([this, group_snap_id, on_finish](int r) { + handle_create_regular_snapshot(r, group_snap_id, on_finish); })); int r = m_local_io_ctx.aio_operate( librbd::util::group_header_name(m_local_group_id), comp, &op); @@ -1032,14 +1075,29 @@ void Replayer::create_regular_snapshot( template void Replayer::handle_create_regular_snapshot( - int r, Context *on_finish) { - dout(10) << "r=" << r << dendl; + int r, const std::string &group_snap_id, Context *on_finish) { + dout(10) << group_snap_id << ", r=" << r << dendl; - if (r < 0) { - derr << "error creating local non-primary group snapshot: " - << cpp_strerror(r) << dendl; + if (r < 0) { // clean the group snapshot here + dout(10) << "cleaning snapshot as failed group_snap_set previously with : " + << group_snap_id << ", this will be attempted to sync later" + << dendl; + r = librbd::cls_client::group_snap_remove(&m_local_io_ctx, + librbd::util::group_header_name(m_local_group_id), group_snap_id); + if (r < 0) { + derr << "failed to remove group snapshot : " + << group_snap_id << " : " << cpp_strerror(r) << dendl; + } + for (auto local_snap = m_local_group_snaps.begin(); + local_snap != m_local_group_snaps.end(); ++local_snap) { + if (local_snap->id != group_snap_id) { + continue; + } + m_local_group_snaps.erase(local_snap); + } } - on_finish->complete(0); + + on_finish->complete(r); } @@ -1198,8 +1256,8 @@ void Replayer::regular_snapshot_complete( librbd::cls_client::group_snap_set(&op, *itl); auto comp = create_rados_callback( - new LambdaContext([this, on_finish](int r) { - handle_regular_snapshot_complete(r, on_finish); + new LambdaContext([this, group_snap_id, on_finish](int r) { + handle_regular_snapshot_complete(r, group_snap_id, on_finish); })); int r = m_local_io_ctx.aio_operate( librbd::util::group_header_name(m_local_group_id), comp, &op); @@ -1209,9 +1267,9 @@ void Replayer::regular_snapshot_complete( template void Replayer::handle_regular_snapshot_complete( - int r, Context *on_finish) { - dout(10) << "r=" << r << dendl; - on_finish->complete(0); + int r, const std::string &group_snap_id, Context *on_finish) { + dout(10) << group_snap_id << ", r=" << r << dendl; + on_finish->complete(r); } template diff --git a/src/tools/rbd_mirror/group_replayer/Replayer.h b/src/tools/rbd_mirror/group_replayer/Replayer.h index 3e4f2f2e9e8..ef19ae9c556 100644 --- a/src/tools/rbd_mirror/group_replayer/Replayer.h +++ b/src/tools/rbd_mirror/group_replayer/Replayer.h @@ -106,6 +106,7 @@ private: AsyncOpTracker m_in_flight_op_tracker; bool m_stop_requested = false; + bool m_retry_validate_snap = false; bool is_replay_interrupted(); bool is_replay_interrupted(std::unique_lock* locker); @@ -151,6 +152,7 @@ private: int r, const std::string &group_snap_id, Context *on_finish); void remove_mirror_peer_uuid(const std::string &snap_id); + void handle_remove_mirror_peer_uuid(int r, const std::string &snap_id); bool prune_all_image_snapshots(cls::rbd::GroupSnapshot *local_snap); void unlink_group_snapshots(); @@ -159,13 +161,16 @@ private: const std::string &group_snap_id, Context *on_finish); void handle_create_regular_snapshot(int r, Context *on_finish); + void handle_create_regular_snapshot( + int r, const std::string &group_snap_id, Context *on_finish); void set_image_replayer_limits(const std::string &image_id, cls::rbd::GroupSnapshot *remote_snap); void regular_snapshot_complete( const std::string &group_snap_id, Context *on_finish); - void handle_regular_snapshot_complete(int r, Context *on_finish); + void handle_regular_snapshot_complete( + int r, const std::string &group_snap_id, Context *on_finish); }; } // namespace group_replayer