rgw: fix race b/w bucket reshard and ops waiting on reshard completion

author J. Eric Ivancich <ivancich@redhat.com>

Mon, 1 Apr 2019 14:59:51 +0000 (10:59 -0400)

committer J. Eric Ivancich <ivancich@redhat.com>

Fri, 19 Jul 2019 16:28:50 +0000 (12:28 -0400)
author J. Eric Ivancich <ivancich@redhat.com>
Mon, 1 Apr 2019 14:59:51 +0000 (10:59 -0400)
committer J. Eric Ivancich <ivancich@redhat.com>
Fri, 19 Jul 2019 16:28:50 +0000 (12:28 -0400)
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc

index 3ca45acacd58fd20b5fda85700d58caf1fcdfa62..8eb1333e2516b055e76b94d5256a57ded33c01df 100644 (file)
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -5609,6 +5609,7 @@ int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id)
  
    RGWObjectCtx obj_ctx(store);
  
+  bucket_info.objv_tracker.clear();
    int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr);
    if (ret < 0) {
      return ret;
@@ -10334,7 +10335,7 @@ int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard **pbs, std::functio
        return r;
      }
      invalidate_bs();
-  }
+  } // for loop
  
    if (r < 0) {
      return r;
@@ -11302,7 +11303,7 @@ int RGWRados::guard_reshard(BucketShard *bs,
      obj = *pobj;
      obj.bucket.update_bucket_id(new_bucket_id);
      pobj = &obj;
-  }
+  } // for loop
  
    if (r < 0) {
      return r;
diff --git a/src/rgw/rgw_reshard.cc b/src/rgw/rgw_reshard.cc

index dc86cb47ea9fd295ef1db65696d9ecf343e63d23..4d73b18414c866b51c02e01ab435ac57421430d9 100644 (file)
--- a/src/rgw/rgw_reshard.cc
+++ b/src/rgw/rgw_reshard.cc
@@ -928,20 +928,47 @@ int RGWReshardWait::block_while_resharding(RGWRados::BucketShard *bs,
    int ret = 0;
    cls_rgw_bucket_instance_entry entry;
  
-  for (int i=0; i < num_retries; i++) {
-    ret = cls_rgw_get_bucket_resharding(bs->index_ctx, bs->bucket_obj, &entry);
+  // since we want to run this recovery code from two distinct places,
+  // let's just put it in a lambda so we can easily re-use; if the
+  // lambda successfully fetches a new bucket id, it sets
+  // new_bucket_id and returns 0, otherwise it returns a negative
+  // error code
+  auto fetch_new_bucket_id =
+    [this, bucket_info](RGWRados* store,
+                       const std::string& log_tag,
+                       std::string* new_bucket_id) -> int {
+    RGWBucketInfo fresh_bucket_info = bucket_info;
+    int ret = store->try_refresh_bucket_info(fresh_bucket_info, nullptr);
      if (ret < 0) {
+      ldout(store->ctx(), 0) << __func__ <<
+       " ERROR: failed to refresh bucket info after reshard at " <<
+       log_tag << ": " << cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+    *new_bucket_id = fresh_bucket_info.bucket.bucket_id;
+    return 0;
+  };
+
+  for (int i = 1; i <= num_retries; i++) {
+    ret = cls_rgw_get_bucket_resharding(bs->index_ctx, bs->bucket_obj, &entry);
+    if (ret == -ENOENT) {
+      return fetch_new_bucket_id(bs->store,
+                                "get_bucket_resharding_failed",
+                                new_bucket_id);
+    } else if (ret < 0) {
        ldout(store->ctx(), 0) << __func__ << " ERROR: failed to get bucket resharding :"  <<
         cpp_strerror(-ret)<< dendl;
        return ret;
      }
      if (!entry.resharding_in_progress()) {
-      *new_bucket_id = entry.new_bucket_instance_id;
-      return 0;
+      return fetch_new_bucket_id(bs->store,
+                                "get_bucket_resharding_succeeded",
+                                 new_bucket_id);
      }
-    ldout(store->ctx(), 20) << "NOTICE: reshard still in progress; " << (i < num_retries - 1 ? "retrying" : "too many retries") << dendl;
  
-    if (i == num_retries - 1) {
+    ldout(store->ctx(), 20) << "NOTICE: reshard still in progress; " << (i < num_retries ? "retrying" : "too many retries") << dendl;
+
+    if (i == num_retries) {
        break;
      }
  
@@ -987,7 +1014,8 @@ int RGWReshardWait::block_while_resharding(RGWRados::BucketShard *bs,
        ldout(store->ctx(), 0) << __func__ << " ERROR: bucket is still resharding, please retry" << dendl;
        return ret;
      }
-  }
+  } // for loop
+
    ldout(store->ctx(), 0) << __func__ << " ERROR: bucket is still resharding, please retry" << dendl;
    return -ERR_BUSY_RESHARDING;
  }
author	J. Eric Ivancich <ivancich@redhat.com>
	Mon, 1 Apr 2019 14:59:51 +0000 (10:59 -0400)
committer	J. Eric Ivancich <ivancich@redhat.com>
	Fri, 19 Jul 2019 16:28:50 +0000 (12:28 -0400)
src/rgw/rgw_rados.cc		patch \| blob \| history
src/rgw/rgw_reshard.cc		patch \| blob \| history