From 00d8c05010a4c50a978f5812bc29adf8f2de8b66 Mon Sep 17 00:00:00 2001
From: "Adam C. Emerson"
Date: Mon, 7 Feb 2022 15:23:57 -0500
Subject: [PATCH] rgw: Retry -ECANCELED in reshard commit and cancel

Retry set_resharding_status, revert_target_layout and
put_bucket_instance_info up to max_retries times while they return
-ECANCELED.

The "got -ECANCELED. Retrying." warning is emitted only when the call
really returned -ECANCELED and another attempt follows, so successful
calls, other errors and the final attempt no longer log a spurious
retry.

Signed-off-by: Adam C. Emerson
---
 src/rgw/rgw_reshard.cc | 57 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 48 insertions(+), 9 deletions(-)

diff --git a/src/rgw/rgw_reshard.cc b/src/rgw/rgw_reshard.cc
index ffc176c591e8f..7a00306424526 100644
--- a/src/rgw/rgw_reshard.cc
+++ b/src/rgw/rgw_reshard.cc
@@ -460,9 +460,21 @@ static int cancel_reshard(rgw::sal::RadosStore* store,
                           const ReshardFaultInjector& fault,
                           const DoutPrefixProvider *dpp)
 {
+  static constexpr auto max_retries = 10;
   // unblock writes to the current index shard objects
-  int ret = set_resharding_status(dpp, static_cast(store), bucket_info,
-                                  cls_rgw_reshard_status::NOT_RESHARDING);
+  int ret = 0;
+  int tries = 0;
+  do {
+    ret = set_resharding_status(dpp, static_cast(store), bucket_info,
+                                cls_rgw_reshard_status::NOT_RESHARDING);
+    ++tries;
+    // warn only when this attempt was cancelled and another one follows
+    if (ret == -ECANCELED && tries < max_retries) {
+      ldpp_dout(dpp, 1) << "WARNING: " << __func__
+                        << " set_resharding_status got -ECANCELED. Retrying."
+                        << dendl;
+    }
+  } while (ret == -ECANCELED && tries < max_retries);
   if (ret < 0) {
     ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to unblock "
         "writes to current index objects: " << cpp_strerror(ret) << dendl;
@@ -470,7 +482,18 @@ static int cancel_reshard(rgw::sal::RadosStore* store,
   }
 
   if (bucket_info.layout.target_index) {
-    return revert_target_layout(store, bucket_info, bucket_attrs, fault, dpp);
+    tries = 0;
+    do {
+      ret = revert_target_layout(store, bucket_info, bucket_attrs, fault, dpp);
+      ++tries;
+      // warn only when this attempt was cancelled and another one follows
+      if (ret == -ECANCELED && tries < max_retries) {
+        ldpp_dout(dpp, 1) << "WARNING: " << __func__
+                          << " revert_target_layout got -ECANCELED. Retrying."
+                          << dendl;
+      }
+    } while (ret == -ECANCELED && tries < max_retries);
+    return ret;
   }
   // there is nothing to revert
   return 0;
@@ -482,6 +505,7 @@ static int commit_reshard(rgw::sal::RadosStore* store,
                           const ReshardFaultInjector& fault,
                           const DoutPrefixProvider *dpp)
 {
+  static constexpr auto max_retries = 10;
   auto& layout = bucket_info.layout;
   auto prev = layout; // make a copy for cleanup
   const auto next_log_gen = layout.logs.empty() ? 1 :
@@ -514,10 +538,14 @@ static int commit_reshard(rgw::sal::RadosStore* store,
 
   int ret = fault.check("commit_target_layout");
   if (ret == 0) { // no fault injected, write the bucket instance metadata
-    ret =
-      store->getRados()->put_bucket_instance_info(bucket_info, false,
-                                                  real_time(),
-                                                  &bucket_attrs, dpp);
+    int tries = 0;
+    do {
+      ret =
+        store->getRados()->put_bucket_instance_info(bucket_info, false,
+                                                    real_time(),
+                                                    &bucket_attrs, dpp);
+      ++tries;
+    } while (ret == -ECANCELED && tries < max_retries);
   }
 
   if (ret < 0) {
@@ -527,8 +555,19 @@ static int commit_reshard(rgw::sal::RadosStore* store,
     bucket_info.layout = std::move(prev); // restore in-memory layout
 
     // unblock writes to the current index shard objects
-    int ret2 = set_resharding_status(dpp, store, bucket_info,
-                                     cls_rgw_reshard_status::NOT_RESHARDING);
+    int tries = 0;
+    int ret2 = 0;
+    do {
+      ret2 = set_resharding_status(dpp, store, bucket_info,
+                                   cls_rgw_reshard_status::NOT_RESHARDING);
+      ++tries;
+      // warn only when this attempt was cancelled and another one follows
+      if (ret2 == -ECANCELED && tries < max_retries) {
+        ldpp_dout(dpp, 1) << "WARNING: " << __func__
+                          << " set_resharding_status got -ECANCELED. Retrying."
+                          << dendl;
+      }
+    } while (ret2 == -ECANCELED && tries < max_retries);
     if (ret2 < 0) {
       ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to unblock "
           "writes to current index objects: " << cpp_strerror(ret2) << dendl;
-- 
2.39.5