From 7d0e68ae51640300b450439f5fcc4a900c93ef0b Mon Sep 17 00:00:00 2001 From: "J. Eric Ivancich" Date: Mon, 1 Apr 2019 10:59:51 -0400 Subject: [PATCH] rgw: fix race b/w bucket reshard and ops waiting on reshard completion A previous commit (f84f70d4) added functionality to clean up old bucket instances and bucket shards once sharding completed successfully. However the existing code that allowed a bucket operation to wait until resharding completed was relying on a field in the old bucket instance to know the bucket instance id of the new bucket instance. This created a race condition as to whether the clean-up or read of the bucket instance id would occur first. This solution rereads the bucket entry point object when resharding completes to determine the bucket instance id of the new bucket thereby avoiding the race. Signed-off-by: J. Eric Ivancich (cherry picked from commit 26b793680805721cac86f64cdd234b273bf33be4) Conflicts: src/rgw/rgw_rados.cc: equivalent changes made to rgw_reshard.cc --- src/rgw/rgw_rados.cc | 5 +++-- src/rgw/rgw_reshard.cc | 42 +++++++++++++++++++++++++++++++++++------- 2 files changed, 38 insertions(+), 9 deletions(-) diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index 3ca45acacd5..8eb1333e251 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -5609,6 +5609,7 @@ int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id) RGWObjectCtx obj_ctx(store); + bucket_info.objv_tracker.clear(); int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr); if (ret < 0) { return ret; @@ -10334,7 +10335,7 @@ int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard **pbs, std::functio return r; } invalidate_bs(); - } + } // for loop if (r < 0) { return r; @@ -11302,7 +11303,7 @@ int RGWRados::guard_reshard(BucketShard *bs, obj = *pobj; obj.bucket.update_bucket_id(new_bucket_id); pobj = &obj; - } + } // for loop if (r < 0) { return r; diff --git a/src/rgw/rgw_reshard.cc b/src/rgw/rgw_reshard.cc index dc86cb47ea9..4d73b18414c 100644 --- a/src/rgw/rgw_reshard.cc +++ b/src/rgw/rgw_reshard.cc @@ -928,20 +928,47 @@ int RGWReshardWait::block_while_resharding(RGWRados::BucketShard *bs, int ret = 0; cls_rgw_bucket_instance_entry entry; - for (int i=0; i < num_retries; i++) { - ret = cls_rgw_get_bucket_resharding(bs->index_ctx, bs->bucket_obj, &entry); + // since we want to run this recovery code from two distinct places, + // let's just put it in a lambda so we can easily re-use; if the + // lambda successfully fetches a new bucket id, it sets + // new_bucket_id and returns 0, otherwise it returns a negative + // error code + auto fetch_new_bucket_id = + [this, bucket_info](RGWRados* store, + const std::string& log_tag, + std::string* new_bucket_id) -> int { + RGWBucketInfo fresh_bucket_info = bucket_info; + int ret = store->try_refresh_bucket_info(fresh_bucket_info, nullptr); if (ret < 0) { + ldout(store->ctx(), 0) << __func__ << + " ERROR: failed to refresh bucket info after reshard at " << + log_tag << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + *new_bucket_id = fresh_bucket_info.bucket.bucket_id; + return 0; + }; + + for (int i = 1; i <= num_retries; i++) { + ret = cls_rgw_get_bucket_resharding(bs->index_ctx, bs->bucket_obj, &entry); + if (ret == -ENOENT) { + return fetch_new_bucket_id(bs->store, + "get_bucket_resharding_failed", + new_bucket_id); + } else if (ret < 0) { ldout(store->ctx(), 0) << __func__ << " ERROR: failed to get bucket resharding :" << cpp_strerror(-ret)<< dendl; return ret; } if (!entry.resharding_in_progress()) { - *new_bucket_id = entry.new_bucket_instance_id; - return 0; + return fetch_new_bucket_id(bs->store, + "get_bucket_resharding_succeeded", + new_bucket_id); } - ldout(store->ctx(), 20) << "NOTICE: reshard still in progress; " << (i < num_retries - 1 ? "retrying" : "too many retries") << dendl; - if (i == num_retries - 1) { + ldout(store->ctx(), 20) << "NOTICE: reshard still in progress; " << (i < num_retries ? "retrying" : "too many retries") << dendl; + + if (i == num_retries) { break; } @@ -987,7 +1014,8 @@ int RGWReshardWait::block_while_resharding(RGWRados::BucketShard *bs, ldout(store->ctx(), 0) << __func__ << " ERROR: bucket is still resharding, please retry" << dendl; return ret; } - } + } // for loop + ldout(store->ctx(), 0) << __func__ << " ERROR: bucket is still resharding, please retry" << dendl; return -ERR_BUSY_RESHARDING; } -- 2.47.3