From 010ab8e0cdeb1a661ba8a65c3c82c0c993d6c9d3 Mon Sep 17 00:00:00 2001 From: "J. Eric Ivancich" Date: Wed, 12 Feb 2020 20:38:44 -0500 Subject: [PATCH] rgw: address 0-length listing results when non-vis entries dominate A change to advance the marker in RGWRados::cls_bucket_list_ordered to the last entry visited rather than the final entry in list to push progress as far as possible. Since non-vis entries tend to cluster on the same shard, such as during incomplete multipart uploads, this can severely limit the number of entries returned by a call to RGWRados::cls_bucket_list_ordered since once that shard has provided all its members, we must stop. This interacts with a recent optimization to reduce the number of entries requested from each shard. To address this the number of attempts is sent as a parameter, so the number of entries requested from each shard can grow with each attempt. Currently the growth is linear but perhaps exponential growth (capped at number of entries requested) should be considered. Previously RGWRados::Bucket::List::list_objects_ordered was capped at 2 attempts, but now we keep attempting to insure we make forward progress and return entries when some exist. If we fail to make forward progress, we log the error condition and stop looping. Additional logging, mostly at level 20, is added to the two key functions involved in ordered bucket listing to make it easier to follow the logic and address potential future issues that might arise. Additionally modify attempt number based on how many results were received. Change the per-shard request number, so it grows exponentially rather than linearly as the attempts go up. Signed-off-by: J. Eric Ivancich (cherry picked from commit 28bd8bada4236060db6d5aed6b1eb345ab507890) --- src/rgw/rgw_admin.cc | 15 ++++-- src/rgw/rgw_bucket.cc | 10 +++- src/rgw/rgw_rados.cc | 110 +++++++++++++++++++++++++++++++++--------- src/rgw/rgw_rados.h | 9 ++-- 4 files changed, 114 insertions(+), 30 deletions(-) diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc index 550e649ef7522..20ede3f2e69ff 100644 --- a/src/rgw/rgw_admin.cc +++ b/src/rgw/rgw_admin.cc @@ -6253,20 +6253,27 @@ next: formatter->open_object_section("result"); formatter->dump_string("bucket", bucket_name); formatter->open_array_section("objects"); + + constexpr uint32_t NUM_ENTRIES = 1000; + uint16_t attempt = 1; while (is_truncated) { map result; int r = store->cls_bucket_list_ordered(bucket_info, RGW_NO_SHARD, marker, - prefix, 1000, true, + prefix, NUM_ENTRIES, true, attempt, result, &is_truncated, &marker, bucket_object_check_filter); - if (r < 0 && r != -ENOENT) { cerr << "ERROR: failed operation r=" << r << std::endl; + } else if (r == -ENOENT) { + break; } - if (r == -ENOENT) - break; + if (result.size() < NUM_ENTRIES / 8) { + ++attempt; + } else if (result.size() > NUM_ENTRIES * 7 / 8 && attempt > 1) { + --attempt; + } map::iterator iter; for (iter = result.begin(); iter != result.end(); ++iter) { diff --git a/src/rgw/rgw_bucket.cc b/src/rgw/rgw_bucket.cc index 80ba1ba354bda..5092278c4f0dd 100644 --- a/src/rgw/rgw_bucket.cc +++ b/src/rgw/rgw_bucket.cc @@ -1164,11 +1164,13 @@ int RGWBucket::check_object_index(RGWBucketAdminOpState& op_state, Formatter *formatter = flusher.get_formatter(); formatter->open_object_section("objects"); + constexpr uint32_t NUM_ENTRIES = 1000; + uint16_t attempt = 1; while (is_truncated) { map result; int r = store->cls_bucket_list_ordered(bucket_info, RGW_NO_SHARD, - marker, prefix, 1000, true, + marker, prefix, NUM_ENTRIES, true, attempt, result, &is_truncated, &marker, bucket_object_check_filter); if (r == -ENOENT) { @@ -1177,6 +1179,12 @@ int RGWBucket::check_object_index(RGWBucketAdminOpState& op_state, set_err_msg(err_msg, "ERROR: failed operation r=" + cpp_strerror(-r)); } + if (result.size() < NUM_ENTRIES / 8) { + ++attempt; + } else if (result.size() > NUM_ENTRIES * 7 / 8 && attempt > 1) { + --attempt; + } + dump_bucket_index(result, formatter); flusher.flush(); } diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index 350825fe90903..79ab2b3186b7b 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -5,6 +5,8 @@ #include #include #include +#include + #include #include @@ -2449,9 +2451,27 @@ int RGWRados::Bucket::List::list_objects_ordered( } } - constexpr int allowed_read_attempts = 2; + rgw_obj_index_key prev_marker; string skip_after_delim; - for (int attempt = 0; attempt < allowed_read_attempts; ++attempt) { + uint16_t attempt = 0; + while (true) { + ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ << + " beginning attempt=" << ++attempt << dendl; + + // this loop is generally expected only to have a single + // iteration; the standard exit is at the bottom of the loop, but + // there's an error condition emergency exit as well + + if (attempt > 1 && !(prev_marker < cur_marker)) { + // we've failed to make forward progress + ldout(cct, 0) << "RGWRados::Bucket::List::" << __func__ << + ": ERROR marker failed to make forward progress; attempt=" << attempt << + ", prev_marker=" << prev_marker << + ", cur_marker=" << cur_marker << dendl; + break; + } + prev_marker = cur_marker; + std::map ent_map; int r = store->cls_bucket_list_ordered(target->get_bucket_info(), shard_id, @@ -2459,6 +2479,7 @@ int RGWRados::Bucket::List::list_objects_ordered( cur_prefix, read_ahead + 1 - count, params.list_versions, + attempt, ent_map, &truncated, &cur_marker); @@ -2545,6 +2566,9 @@ int RGWRados::Bucket::List::list_objects_ordered( goto done; } + ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ << + " adding entry " << entry.key << " to result" << dendl; + result->emplace_back(std::move(entry)); count++; } // eiter for loop @@ -2567,15 +2591,23 @@ int RGWRados::Bucket::List::list_objects_ordered( } } - // if we finished listing, or if we're returning at least half the - // requested entries, that's enough; S3 and swift protocols allow - // returning fewer than max entries - if (!truncated || count >= max / 2) { + ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ << + " INFO end of outer loop, truncated=" << truncated << + ", count=" << count << ", attempt=" << attempt << dendl; + + if (!truncated || count >= (max + 1) / 2) { + // if we finished listing, or if we're returning at least half the + // requested entries, that's enough; S3 and swift protocols allow + // returning fewer than max entries + break; + } else if (attempt > 8 && count >= 1) { + // if we've made at least 8 attempts and we have some, but very + // few, results, return with what we have break; } ldout(cct, 1) << "RGWRados::Bucket::List::" << __func__ << - " INFO ordered bucket listing requires read #" << (2 + attempt) << + " INFO ordered bucket listing requires read #" << (1 + attempt) << dendl; } // read attempt loop @@ -9163,19 +9195,24 @@ uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries, int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info, - int shard_id, - const rgw_obj_index_key& start, + const int shard_id, + const rgw_obj_index_key& start_after, const string& prefix, - uint32_t num_entries, - bool list_versions, + const uint32_t num_entries, + const bool list_versions, + const uint16_t attempt, map& m, bool *is_truncated, rgw_obj_index_key *last_entry, bool (*force_check_filter)(const string& name)) { - ldout(cct, 10) << "cls_bucket_list_ordered " << bucket_info.bucket << - " start " << start.name << "[" << start.instance << "] num_entries " << - num_entries << dendl; + ldout(cct, 10) << "RGWRados::" << __func__ << ": " << bucket_info.bucket << + " start_after=\"" << start_after.name << + "[" << start_after.instance << + "]\", prefix=\"" << prefix << + "\" num_entries=" << num_entries << + ", list_versions=" << list_versions << + ", attempt=" << attempt << dendl; librados::IoCtx index_ctx; // key - oid (for different shards if there is any) @@ -9188,15 +9225,27 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info, } const uint32_t shard_count = oids.size(); - const uint32_t num_entries_per_shard = - calc_ordered_bucket_list_per_shard(num_entries, shard_count); + uint32_t num_entries_per_shard; + if (attempt == 0) { + num_entries_per_shard = + calc_ordered_bucket_list_per_shard(num_entries, shard_count); + } else if (attempt <= 11) { + // we'll max out the exponential multiplication factor at 1024 (2<<10) + num_entries_per_shard = + std::min(num_entries, + (uint32_t(1 << (attempt - 1)) * + calc_ordered_bucket_list_per_shard(num_entries, shard_count))); + } else { + num_entries_per_shard = num_entries; + } - ldout(cct, 10) << __func__ << " request from each of " << shard_count << + ldout(cct, 10) << "RGWRados::" << __func__ << + " request from each of " << shard_count << " shard(s) for " << num_entries_per_shard << " entries to get " << num_entries << " total entries" << dendl; map list_results; - cls_rgw_obj_key start_key(start.name, start.instance); + cls_rgw_obj_key start_key(start_after.name, start_after.instance); r = CLSRGWIssueBucketList(index_ctx, start_key, prefix, num_entries_per_shard, list_versions, oids, list_results, cct->_conf->rgw_bucket_index_max_aio)(); @@ -9237,8 +9286,12 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info, const string& name = vcurrents[pos]->first; struct rgw_bucket_dir_entry& dirent = vcurrents[pos]->second; - bool force_check = force_check_filter && - force_check_filter(dirent.key.name); + ldout(cct, 20) << "RGWRados::" << __func__ << " currently processing " << + dirent.key << " from shard " << pos << dendl; + + bool force_check = + force_check_filter && force_check_filter(dirent.key.name); + if ((!dirent.exists && !dirent.is_delete_marker()) || !dirent.pending_map.empty() || force_check) { @@ -9255,11 +9308,15 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info, } else { r = 0; } + if (r >= 0) { - ldout(cct, 10) << "RGWRados::cls_bucket_list_ordered: got " << + ldout(cct, 10) << "RGWRados::" << __func__ << ": got " << dirent.key.name << "[" << dirent.key.instance << "]" << dendl; m[name] = std::move(dirent); ++count; + } else { + ldout(cct, 10) << "RGWRados::" << __func__ << ": skipping " << + dirent.key.name << "[" << dirent.key.instance << "]" << dendl; } // refresh the candidates map @@ -9275,7 +9332,7 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info, } } // while we haven't provided requested # of result entries - // suggest updates if there is any + // suggest updates if there are any for (auto& miter : updates) { if (miter.second.length()) { ObjectWriteOperation o; @@ -9296,6 +9353,10 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info, } } + ldout(cct, 20) << "RGWRados::" << __func__ << + ": returning, count=" << count << ", is_truncated=" << *is_truncated << + dendl; + if (*is_truncated && count < num_entries) { ldout(cct, 10) << "RGWRados::" << __func__ << ": INFO requested " << num_entries << " entries but returning " << @@ -9304,6 +9365,11 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info, if (pos >= 0) { *last_entry = std::move((--vcurrents[pos])->first); + ldout(cct, 20) << "RGWRados::" << __func__ << + ": returning, last_entry=" << *last_entry << dendl; + } else { + ldout(cct, 20) << "RGWRados::" << __func__ << + ": returning, last_entry NOT SET" << dendl; } return 0; diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index d772ccee67e5a..162a9b15adb17 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ -2205,10 +2205,13 @@ public: ceph::real_time& removed_mtime, list *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr); int cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr); int cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_t timeout); - int cls_bucket_list_ordered(RGWBucketInfo& bucket_info, int shard_id, - const rgw_obj_index_key& start, + int cls_bucket_list_ordered(RGWBucketInfo& bucket_info, + const int shard_id, + const rgw_obj_index_key& start_after, const string& prefix, - uint32_t num_entries, bool list_versions, + const uint32_t num_entries, + const bool list_versions, + const uint16_t attempt, // 0 means ignore map& m, bool *is_truncated, rgw_obj_index_key *last_entry, -- 2.39.5