From: J. Eric Ivancich Date: Wed, 17 Oct 2018 17:43:24 +0000 (-0400) Subject: rgw: recover from incomplete reshard attempt X-Git-Tag: v12.2.11~115^2~10 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=f4a18a100240b870a3621faf6dc942ec47e34b7b;p=ceph.git rgw: recover from incomplete reshard attempt In case a reshard attempt is left in an incomplete state, i.e., flags still show resharding even though the bucket reshard lock isn't being held, try to recover by taking the bucket reshard lock and clearing flags associated with resharding. This change requires access to an RGWBucketInfo object, so the call stack leading into this function should provide that object to prevent unnecessary work. Changes were made to provide this object. Signed-off-by: J. Eric Ivancich (cherry picked from commit 4891ae59314041802da0f6dc249ccbeb761616dc) --- diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc index 73b998a8ac2..4cde0d751e9 100644 --- a/src/rgw/rgw_admin.cc +++ b/src/rgw/rgw_admin.cc @@ -5652,7 +5652,7 @@ next: for (int i = 0; i < max_shards; i++) { RGWRados::BucketShard bs(store); int shard_id = (bucket_info.num_shards > 0 ? i : -1); - int ret = bs.init(bucket, shard_id); + int ret = bs.init(bucket, shard_id, nullptr /* no RGWBucketInfo */); marker.clear(); if (ret < 0) { @@ -5713,7 +5713,7 @@ next: for (int i = 0; i < max_shards; i++) { RGWRados::BucketShard bs(store); int shard_id = (bucket_info.num_shards > 0 ? 
i : -1); - int ret = bs.init(bucket, shard_id); + int ret = bs.init(bucket, shard_id, nullptr /* no RGWBucketInfo */); if (ret < 0) { cerr << "ERROR: bs.init(bucket=" << bucket << ", shard=" << shard_id << "): " << cpp_strerror(-ret) << std::endl; return -ret; diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index 75c1cdbc231..ff2025c63f2 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -3504,21 +3504,22 @@ int RGWIndexCompletionThread::process() ldout(store->ctx(), 20) << __func__ << "(): handling completion for key=" << c->key << dendl; RGWRados::BucketShard bs(store); + RGWBucketInfo bucket_info; - int r = bs.init(c->obj.bucket, c->obj); + int r = bs.init(c->obj.bucket, c->obj, &bucket_info); if (r < 0) { ldout(cct, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl; /* not much to do */ continue; } - r = store->guard_reshard(&bs, c->obj, [&](RGWRados::BucketShard *bs) -> int { - librados::ObjectWriteOperation o; - cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING); - cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs, - c->log_op, c->bilog_op, &c->zones_trace); - - return bs->index_ctx.operate(bs->bucket_obj, &o); + r = store->guard_reshard(&bs, c->obj, bucket_info, + [&](RGWRados::BucketShard *bs) -> int { + librados::ObjectWriteOperation o; + cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING); + cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs, + c->log_op, c->bilog_op, &c->zones_trace); + return bs->index_ctx.operate(bs->bucket_obj, &o); }); if (r < 0) { ldout(cct, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl; @@ -6688,19 +6689,24 @@ int RGWRados::fix_tail_obj_locator(const RGWBucketInfo& bucket_info, rgw_obj_key return 0; } -int RGWRados::BucketShard::init(const rgw_bucket& _bucket, const rgw_obj& obj) +int 
RGWRados::BucketShard::init(const rgw_bucket& _bucket, + const rgw_obj& obj, + RGWBucketInfo* bucket_info_out) { bucket = _bucket; RGWObjectCtx obj_ctx(store); RGWBucketInfo bucket_info; - int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL); + RGWBucketInfo* bucket_info_p = + bucket_info_out ? bucket_info_out : &bucket_info; + + int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL); if (ret < 0) { return ret; } - ret = store->open_bucket_index_shard(bucket_info, index_ctx, obj.get_hash_object(), &bucket_obj, &shard_id); + ret = store->open_bucket_index_shard(*bucket_info_p, index_ctx, obj.get_hash_object(), &bucket_obj, &shard_id); if (ret < 0) { ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl; return ret; @@ -6710,7 +6716,9 @@ int RGWRados::BucketShard::init(const rgw_bucket& _bucket, const rgw_obj& obj) return 0; } -int RGWRados::BucketShard::init(const rgw_bucket& _bucket, int sid) +int RGWRados::BucketShard::init(const rgw_bucket& _bucket, + int sid, + RGWBucketInfo* bucket_info_out) { bucket = _bucket; shard_id = sid; @@ -6718,12 +6726,14 @@ int RGWRados::BucketShard::init(const rgw_bucket& _bucket, int sid) RGWObjectCtx obj_ctx(store); RGWBucketInfo bucket_info; - int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL); + RGWBucketInfo* bucket_info_p = + bucket_info_out ? 
bucket_info_out : &bucket_info; + int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL); if (ret < 0) { return ret; } - ret = store->open_bucket_index_shard(bucket_info, index_ctx, shard_id, &bucket_obj); + ret = store->open_bucket_index_shard(*bucket_info_p, index_ctx, shard_id, &bucket_obj); if (ret < 0) { ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl; return ret; @@ -8952,7 +8962,7 @@ int RGWRados::bucket_rebuild_index(RGWBucketInfo& bucket_info) return CLSRGWIssueBucketRebuild(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)(); } -int RGWRados::bucket_set_reshard(RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry) +int RGWRados::bucket_set_reshard(const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry) { librados::IoCtx index_ctx; map bucket_objs; @@ -10216,7 +10226,7 @@ int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard **pbs, std::functio } ldout(store->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl; string new_bucket_id; - r = store->block_while_resharding(bs, &new_bucket_id); + r = store->block_while_resharding(bs, &new_bucket_id, target->bucket_info); if (r == -ERR_BUSY_RESHARDING) { continue; } @@ -10268,9 +10278,9 @@ int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op, const string *write_t } } - int r = guard_reshard(nullptr, [&](BucketShard *bs) -> int { - return store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags, zones_trace); - }); + int r = guard_reshard(nullptr, [&](BucketShard *bs) -> int { + return store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags, zones_trace); + }); if (r < 0) { return r; @@ -10369,9 +10379,9 @@ int RGWRados::Bucket::UpdateIndex::cancel() RGWRados *store = target->get_store(); BucketShard *bs; - int ret = guard_reshard(&bs, [&](BucketShard *bs) -> int { - return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, 
zones_trace); - }); + int ret = guard_reshard(&bs, [&](BucketShard *bs) -> int { + return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace); + }); /* * need to update data log anyhow, so that whoever follows needs to update its internal markers @@ -11176,14 +11186,17 @@ int RGWRados::olh_init_modification(const RGWBucketInfo& bucket_info, RGWObjStat return ret; } -int RGWRados::guard_reshard(BucketShard *bs, const rgw_obj& obj_instance, std::function call) +int RGWRados::guard_reshard(BucketShard *bs, + const rgw_obj& obj_instance, + const RGWBucketInfo& bucket_info, + std::function call) { rgw_obj obj; const rgw_obj *pobj = &obj_instance; int r; for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) { - r = bs->init(pobj->bucket, *pobj); + r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */); if (r < 0) { ldout(cct, 5) << "bs.init() returned ret=" << r << dendl; return r; @@ -11194,7 +11207,7 @@ int RGWRados::guard_reshard(BucketShard *bs, const rgw_obj& obj_instance, std::f } ldout(cct, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl; string new_bucket_id; - r = block_while_resharding(bs, &new_bucket_id); + r = block_while_resharding(bs, &new_bucket_id, bucket_info); if (r == -ERR_BUSY_RESHARDING) { continue; } @@ -11216,11 +11229,13 @@ int RGWRados::guard_reshard(BucketShard *bs, const rgw_obj& obj_instance, std::f return 0; } -int RGWRados::block_while_resharding(RGWRados::BucketShard *bs, string *new_bucket_id) +int RGWRados::block_while_resharding(RGWRados::BucketShard *bs, + string *new_bucket_id, + const RGWBucketInfo& bucket_info) { std::shared_ptr waiter = reshard_wait; - return waiter->block_while_resharding(bs, new_bucket_id); + return waiter->block_while_resharding(bs, new_bucket_id, bucket_info); } int RGWRados::bucket_index_link_olh(const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance, @@ -11246,13 +11261,14 @@ int RGWRados::bucket_index_link_olh(const 
RGWBucketInfo& bucket_info, RGWObjStat BucketShard bs(this); cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance); - r = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int { - librados::ObjectWriteOperation op; - cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING); - return cls_rgw_bucket_link_olh(bs->index_ctx, op, - bs->bucket_obj, key, olh_state.olh_tag, delete_marker, op_tag, meta, olh_epoch, - unmod_since, high_precision_time, - get_zone().log_data, zones_trace); + r = guard_reshard(&bs, obj_instance, bucket_info, + [&](BucketShard *bs) -> int { + librados::ObjectWriteOperation op; + cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING); + return cls_rgw_bucket_link_olh(bs->index_ctx, op, + bs->bucket_obj, key, olh_state.olh_tag, delete_marker, op_tag, meta, olh_epoch, + unmod_since, high_precision_time, + get_zone().log_data, zones_trace); }); if (r < 0) { ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl; @@ -11290,11 +11306,12 @@ int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo& bucket_info, con BucketShard bs(this); cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance); - r = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int { - librados::ObjectWriteOperation op; - cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING); - return cls_rgw_bucket_unlink_instance(bs->index_ctx, op, bs->bucket_obj, key, op_tag, - olh_tag, olh_epoch, get_zone().log_data, zones_trace); + r = guard_reshard(&bs, obj_instance, bucket_info, + [&](BucketShard *bs) -> int { + librados::ObjectWriteOperation op; + cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING); + return cls_rgw_bucket_unlink_instance(bs->index_ctx, op, bs->bucket_obj, key, op_tag, + olh_tag, olh_epoch, get_zone().log_data, zones_trace); }); if (r < 0) { ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl; @@ -11316,7 +11333,8 @@ int 
RGWRados::bucket_index_read_olh_log(const RGWBucketInfo& bucket_info, RGWObj } BucketShard bs(this); - int ret = bs.init(obj_instance.bucket, obj_instance); + int ret = + bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */); if (ret < 0) { ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl; return ret; @@ -11326,12 +11344,13 @@ int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo& bucket_info, RGWObj cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string()); - ret = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int { - ObjectReadOperation op; - cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING); - return cls_rgw_get_olh_log(bs->index_ctx, bs->bucket_obj, op, - key, ver_marker, olh_tag, log, is_truncated); - }); + ret = guard_reshard(&bs, obj_instance, bucket_info, + [&](BucketShard *bs) -> int { + ObjectReadOperation op; + cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING); + return cls_rgw_get_olh_log(bs->index_ctx, bs->bucket_obj, op, + key, ver_marker, olh_tag, log, is_truncated); + }); if (ret < 0) { ldout(cct, 20) << "cls_rgw_get_olh_log() returned r=" << r << dendl; return ret; @@ -11349,7 +11368,8 @@ int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObj } BucketShard bs(this); - int ret = bs.init(obj_instance.bucket, obj_instance); + int ret = + bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */); if (ret < 0) { ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl; return ret; @@ -11359,11 +11379,12 @@ int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObj cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string()); - ret = guard_reshard(&bs, obj_instance, [&](BucketShard *pbs) -> int { - ObjectWriteOperation op; - cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING); - cls_rgw_trim_olh_log(op, key, ver, olh_tag); - return pbs->index_ctx.operate(pbs->bucket_obj, &op); + ret = 
guard_reshard(&bs, obj_instance, bucket_info, + [&](BucketShard *pbs) -> int { + ObjectWriteOperation op; + cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING); + cls_rgw_trim_olh_log(op, key, ver, olh_tag); + return pbs->index_ctx.operate(pbs->bucket_obj, &op); }); if (ret < 0) { ldout(cct, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl; @@ -11387,10 +11408,11 @@ int RGWRados::bucket_index_clear_olh(const RGWBucketInfo& bucket_info, RGWObjSta cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string()); - int ret = guard_reshard(&bs, obj_instance, [&](BucketShard *pbs) -> int { - ObjectWriteOperation op; - cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING); - return cls_rgw_clear_olh(pbs->index_ctx, op, pbs->bucket_obj, key, olh_tag); + int ret = guard_reshard(&bs, obj_instance, bucket_info, + [&](BucketShard *pbs) -> int { + ObjectWriteOperation op; + cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING); + return cls_rgw_clear_olh(pbs->index_ctx, op, pbs->bucket_obj, key, olh_tag); }); if (ret < 0) { ldout(cct, 5) << "cls_rgw_clear_olh() returned ret=" << ret << dendl; @@ -12876,7 +12898,7 @@ int RGWRados::bi_get_instance(const RGWBucketInfo& bucket_info, rgw_obj& obj, rg int RGWRados::bi_get(rgw_bucket& bucket, rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry) { BucketShard bs(this); - int ret = bs.init(bucket, obj); + int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */); if (ret < 0) { ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl; return ret; @@ -12908,7 +12930,7 @@ int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry) int RGWRados::bi_put(rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry) { BucketShard bs(this); - int ret = bs.init(bucket, obj); + int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */); if (ret < 0) { ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl; return ret; @@ -12921,7 +12943,7 @@ int RGWRados::bi_list(rgw_bucket& bucket, const 
string& obj_name, const string& { rgw_obj obj(bucket, obj_name); BucketShard bs(this); - int ret = bs.init(bucket, obj); + int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */); if (ret < 0) { ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl; return ret; @@ -12963,7 +12985,7 @@ int RGWRados::bi_remove(BucketShard& bs) int RGWRados::bi_list(rgw_bucket& bucket, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list *entries, bool *is_truncated) { BucketShard bs(this); - int ret = bs.init(bucket, shard_id); + int ret = bs.init(bucket, shard_id, nullptr /* no RGWBucketInfo */); if (ret < 0) { ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl; return ret; diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index cbe498bbb9d..c8c654c5675 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ -2707,8 +2707,8 @@ public: string bucket_obj; explicit BucketShard(RGWRados *_store) : store(_store), shard_id(-1) {} - int init(const rgw_bucket& _bucket, const rgw_obj& obj); - int init(const rgw_bucket& _bucket, int sid); + int init(const rgw_bucket& _bucket, const rgw_obj& obj, RGWBucketInfo* out); + int init(const rgw_bucket& _bucket, int sid, RGWBucketInfo* out); int init(const RGWBucketInfo& bucket_info, int sid); }; @@ -2748,7 +2748,8 @@ public: int get_bucket_shard(BucketShard **pbs) { if (!bs_initialized) { - int r = bs.init(bucket_info.bucket, obj); + int r = + bs.init(bucket_info.bucket, obj, nullptr /* no RGWBucketInfo */); if (r < 0) { return r; } @@ -2944,7 +2945,8 @@ public: rgw_zone_set *zones_trace{nullptr}; int init_bs() { - int r = bs.init(target->get_bucket(), obj); + int r = + bs.init(target->get_bucket(), obj, nullptr /* no RGWBucketInfo */); if (r < 0) { return r; } @@ -3351,8 +3353,13 @@ public: int obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectWriteOperation *op); int obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, 
librados::ObjectReadOperation *op); - int guard_reshard(BucketShard *bs, const rgw_obj& obj_instance, std::function call); - int block_while_resharding(RGWRados::BucketShard *bs, string *new_bucket_id); + int guard_reshard(BucketShard *bs, + const rgw_obj& obj_instance, + const RGWBucketInfo& bucket_info, + std::function call); + int block_while_resharding(RGWRados::BucketShard *bs, + string *new_bucket_id, + const RGWBucketInfo& bucket_info); void bucket_index_guard_olh_op(RGWObjState& olh_state, librados::ObjectOperation& op); int olh_init_modification(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag); @@ -3624,7 +3631,7 @@ public: map *existing_stats, map *calculated_stats); int bucket_rebuild_index(RGWBucketInfo& bucket_info); - int bucket_set_reshard(RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry); + int bucket_set_reshard(const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry); int remove_objs_from_index(RGWBucketInfo& bucket_info, list& oid_list); int move_rados_obj(librados::IoCtx& src_ioctx, const string& src_oid, const string& src_locator, diff --git a/src/rgw/rgw_reshard.cc b/src/rgw/rgw_reshard.cc index 0241ef95afd..4188b20e3de 100644 --- a/src/rgw/rgw_reshard.cc +++ b/src/rgw/rgw_reshard.cc @@ -74,7 +74,7 @@ public: aio_completions(_completions) { num_shard = (bucket_info.num_shards > 0 ? 
_num_shard : -1); - bs.init(bucket_info.bucket, num_shard); + bs.init(bucket_info.bucket, num_shard, nullptr /* no RGWBucketInfo */); } int get_num_shard() { @@ -213,7 +213,7 @@ RGWBucketReshard::RGWBucketReshard(RGWRados *_store, { } int RGWBucketReshard::set_resharding_status(RGWRados* store, - RGWBucketInfo& bucket_info, + const RGWBucketInfo& bucket_info, const string& new_instance_id, int32_t num_shards, cls_rgw_reshard_status status) @@ -236,9 +236,10 @@ int RGWBucketReshard::set_resharding_status(RGWRados* store, } // reshard lock assumes lock is held -int RGWBucketReshard::clear_resharding() +int RGWBucketReshard::clear_resharding(RGWRados* store, + const RGWBucketInfo& bucket_info) { - int ret = clear_index_shard_reshard_status(); + int ret = clear_index_shard_reshard_status(store, bucket_info); if (ret < 0) { ldout(store->ctx(), 0) << "RGWBucketReshard::" << __func__ << " ERROR: error clearing reshard status from index shard " << @@ -259,7 +260,7 @@ int RGWBucketReshard::clear_resharding() } int RGWBucketReshard::clear_index_shard_reshard_status(RGWRados* store, - RGWBucketInfo& bucket_info) + const RGWBucketInfo& bucket_info) { uint32_t num_shards = bucket_info.num_shards; @@ -863,7 +864,9 @@ int RGWReshardWait::do_wait() return 0; } -int RGWReshardWait::block_while_resharding(RGWRados::BucketShard *bs, string *new_bucket_id) +int RGWReshardWait::block_while_resharding(RGWRados::BucketShard *bs, + string *new_bucket_id, + const RGWBucketInfo& bucket_info) { int ret = 0; cls_rgw_bucket_instance_entry entry; @@ -880,12 +883,48 @@ int RGWReshardWait::block_while_resharding(RGWRados::BucketShard *bs, string *ne return 0; } ldout(store->ctx(), 20) << "NOTICE: reshard still in progress; " << (i < num_retries - 1 ? "retrying" : "too many retries") << dendl; - /* needed to unlock as clear resharding uses the same lock */ if (i == num_retries - 1) { break; } + // If bucket is erroneously marked as resharding (e.g., crash or + // other error) then fix it. 
If we can take the bucket reshard + // lock then it means no other resharding should be taking place, + // and we're free to clear the flags. + { + // since we expect to do this rarely, we'll do our work in a + // block and erase our work after each try + + RGWObjectCtx obj_ctx(bs->store); + const rgw_bucket& b = bs->bucket; + std::string bucket_id = b.get_key(); + RGWBucketReshardLock reshard_lock(bs->store, bucket_info, true); + ret = reshard_lock.lock(); + if (ret < 0) { + ldout(store->ctx(), 20) << __func__ << + " INFO: failed to take reshard lock for bucket " << + bucket_id << "; expected if resharding underway" << dendl; + } else { + ldout(store->ctx(), 10) << __func__ << + " INFO: was able to take reshard lock for bucket " << + bucket_id << dendl; + ret = RGWBucketReshard::clear_resharding(bs->store, bucket_info); + if (ret < 0) { + reshard_lock.unlock(); + ldout(store->ctx(), 0) << __func__ << + " ERROR: failed to clear resharding flags for bucket " << + bucket_id << dendl; + } else { + reshard_lock.unlock(); + ldout(store->ctx(), 5) << __func__ << + " INFO: apparently successfully cleared resharding flags for " + "bucket " << bucket_id << dendl; + continue; // if we apparently succeed immediately test again + } // if clear resharding succeeded + } // if taking of lock succeeded + } // block to encapsulate recovery from incomplete reshard + ret = do_wait(); if (ret < 0) { ldout(store->ctx(), 0) << __func__ << " ERROR: bucket is still resharding, please retry" << dendl; diff --git a/src/rgw/rgw_reshard.h b/src/rgw/rgw_reshard.h index a129e8b1b13..be47c86b2c9 100644 --- a/src/rgw/rgw_reshard.h +++ b/src/rgw/rgw_reshard.h @@ -69,8 +69,6 @@ private: RGWBucketReshardLock reshard_lock; RGWBucketReshardLock* outer_reshard_lock; - int clear_resharding(); - int create_new_bucket_instance(int new_num_shards, RGWBucketInfo& new_bucket_info); int do_reshard(int num_shards, @@ -92,12 +90,18 @@ public: RGWReshard *reshard_log = nullptr); int get_status(std::list 
*status); int cancel(); + static int clear_resharding(RGWRados* store, + const RGWBucketInfo& bucket_info); + int clear_resharding() { + return clear_resharding(store, bucket_info); + } static int clear_index_shard_reshard_status(RGWRados* store, - RGWBucketInfo& bucket_info); + const RGWBucketInfo& bucket_info); int clear_index_shard_reshard_status() { return clear_index_shard_reshard_status(store, bucket_info); } - static int set_resharding_status(RGWRados* store, RGWBucketInfo& bucket_info, + static int set_resharding_status(RGWRados* store, + const RGWBucketInfo& bucket_info, const string& new_instance_id, int32_t num_shards, cls_rgw_reshard_status status); @@ -180,7 +184,9 @@ public: ~RGWReshardWait() { assert(going_down); } - int block_while_resharding(RGWRados::BucketShard *bs, string *new_bucket_id); + int block_while_resharding(RGWRados::BucketShard *bs, + string *new_bucket_id, + const RGWBucketInfo& bucket_info); void stop() { Mutex::Locker l(lock);