From: J. Eric Ivancich Date: Thu, 13 Feb 2020 01:38:44 +0000 (-0500) Subject: rgw: address 0-length listing results when non-vis entries dominate X-Git-Tag: v15.1.1~432^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=28bd8bada4236060db6d5aed6b1eb345ab507890;p=ceph.git rgw: address 0-length listing results when non-vis entries dominate A change to advance the marker in RGWRados::cls_bucket_list_ordered to the last entry visited rather than the final entry in list to push progress as far as possible. Since non-vis entries tend to cluster on the same shard, such as during incomplete multipart uploads, this can severely limit the number of entries returned by a call to RGWRados::cls_bucket_list_ordered since once that shard has provided all its members, we must stop. This interacts with a recent optimization to reduce the number of entries requested from each shard. To address this the number of attempts is sent as a parameter, so the number of entries requested from each shard can grow with each attempt. Currently the growth is linear but perhaps exponential growth (capped at number of entries requested) should be considered. Previously RGWRados::Bucket::List::list_objects_ordered was capped at 2 attempts, but now we keep attempting to insure we make forward progress and return entries when some exist. If we fail to make forward progress, we log the error condition and stop looping. Additional logging, mostly at level 20, is added to the two key functions involved in ordered bucket listing to make it easier to follow the logic and address potential future issues that might arise. Additionally modify attempt number based on how many results were received. Change the per-shard request number, so it grows exponentially rather than linearly as the attempts go up. Signed-off-by: J. Eric Ivancich --- diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc index bcfda46bf01e..477f05f3d788 100644 --- a/src/rgw/rgw_admin.cc +++ b/src/rgw/rgw_admin.cc @@ -6749,21 +6749,31 @@ next: formatter->open_object_section("result"); formatter->dump_string("bucket", bucket_name); formatter->open_array_section("objects"); + + constexpr uint32_t NUM_ENTRIES = 1000; + uint16_t attempt = 1; while (is_truncated) { RGWRados::ent_map_t result; - int r = store->getRados()->cls_bucket_list_ordered( - bucket_info, RGW_NO_SHARD, - marker, empty_prefix, empty_delimiter, - 1000, true, - result, &is_truncated, &cls_filtered, &marker, - null_yield, - rgw_bucket_object_check_filter); + int r = + store->getRados()->cls_bucket_list_ordered( + bucket_info, RGW_NO_SHARD, + marker, empty_prefix, empty_delimiter, + NUM_ENTRIES, true, attempt, + result, &is_truncated, &cls_filtered, &marker, + null_yield, + rgw_bucket_object_check_filter); + if (r < 0 && r != -ENOENT) { cerr << "ERROR: failed operation r=" << r << std::endl; + } else if (r == -ENOENT) { + break; } - if (r == -ENOENT) - break; + if (result.size() < NUM_ENTRIES / 8) { + ++attempt; + } else if (result.size() > NUM_ENTRIES * 7 / 8 && attempt > 1) { + --attempt; + } for (auto iter = result.begin(); iter != result.end(); ++iter) { rgw_obj_key key = iter->second.key; diff --git a/src/rgw/rgw_bucket.cc b/src/rgw/rgw_bucket.cc index 258919a4bc0a..c17936a825d9 100644 --- a/src/rgw/rgw_bucket.cc +++ b/src/rgw/rgw_bucket.cc @@ -1015,20 +1015,29 @@ int RGWBucket::check_object_index(RGWBucketAdminOpState& op_state, Formatter *formatter = flusher.get_formatter(); formatter->open_object_section("objects"); + constexpr uint32_t NUM_ENTRIES = 1000; + uint16_t attempt = 1; while (is_truncated) { RGWRados::ent_map_t result; result.reserve(listing_max_entries); int r = store->getRados()->cls_bucket_list_ordered( bucket_info, RGW_NO_SHARD, marker, prefix, empty_delimiter, - listing_max_entries, true, result, &is_truncated, &cls_filtered, - &marker, y, rgw_bucket_object_check_filter); + listing_max_entries, true, attempt, + result, &is_truncated, &cls_filtered, &marker, + y, rgw_bucket_object_check_filter); if (r == -ENOENT) { break; } else if (r < 0 && r != -ENOENT) { set_err_msg(err_msg, "ERROR: failed operation r=" + cpp_strerror(-r)); } + if (result.size() < NUM_ENTRIES / 8) { + ++attempt; + } else if (result.size() > NUM_ENTRIES * 7 / 8 && attempt > 1) { + --attempt; + } + dump_bucket_index(result, formatter); flusher.flush(); } diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index 58b921e44e1d..8b4bef1b9579 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -5,6 +5,8 @@ #include #include #include +#include + #include #include @@ -1774,10 +1776,25 @@ int RGWRados::Bucket::List::list_objects_ordered( } } - constexpr int allowed_read_attempts = 2; - for (int attempt = 0; attempt < allowed_read_attempts; ++attempt) { + rgw_obj_index_key prev_marker; + uint16_t attempt = 0; + while (true) { + ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ << + " beginning attempt=" << ++attempt << dendl; + // this loop is generally expected only to have a single - // iteration; see bottom of loop for early exit + // iteration; the standard exit is at the bottom of the loop, but + // there's an error condition emergency exit as well + + if (attempt > 1 && !(prev_marker < cur_marker)) { + // we've failed to make forward progress + ldout(cct, 0) << "RGWRados::Bucket::List::" << __func__ << + ": ERROR marker failed to make forward progress; attempt=" << attempt << + ", prev_marker=" << prev_marker << + ", cur_marker=" << cur_marker << dendl; + break; + } + prev_marker = cur_marker; ent_map_t ent_map; ent_map.reserve(read_ahead); @@ -1788,6 +1805,7 @@ int RGWRados::Bucket::List::list_objects_ordered( params.delim, read_ahead + 1 - count, params.list_versions, + attempt, ent_map, &truncated, &cls_filtered, @@ -1914,6 +1932,9 @@ int RGWRados::Bucket::List::list_objects_ordered( goto done; } + ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ << + " adding entry " << entry.key << " to result" << dendl; + result->emplace_back(std::move(entry)); count++; } // eiter for loop @@ -1942,15 +1963,23 @@ int RGWRados::Bucket::List::list_objects_ordered( } } // if older osd didn't do delimiter filtering - // if we finished listing, or if we're returning at least half the - // requested entries, that's enough; S3 and swift protocols allow - // returning fewer than max entries - if (!truncated || count >= max / 2) { + ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ << + " INFO end of outer loop, truncated=" << truncated << + ", count=" << count << ", attempt=" << attempt << dendl; + + if (!truncated || count >= (max + 1) / 2) { + // if we finished listing, or if we're returning at least half the + // requested entries, that's enough; S3 and swift protocols allow + // returning fewer than max entries + break; + } else if (attempt > 8 && count >= 1) { + // if we've made at least 8 attempts and we have some, but very + // few, results, return with what we have break; } ldout(cct, 1) << "RGWRados::Bucket::List::" << __func__ << - " INFO ordered bucket listing requires read #" << (2 + attempt) << + " INFO ordered bucket listing requires read #" << (1 + attempt) << dendl; } // read attempt loop @@ -8136,12 +8165,13 @@ uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries, int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info, - int shard_id, + const int shard_id, const rgw_obj_index_key& start_after, const string& prefix, const string& delimiter, - uint32_t num_entries, - bool list_versions, + const uint32_t num_entries, + const bool list_versions, + const uint16_t attempt, ent_map_t& m, bool* is_truncated, bool* cls_filtered, @@ -8149,9 +8179,13 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info, optional_yield y, check_filter_t force_check_filter) { - ldout(cct, 10) << "cls_bucket_list_ordered " << bucket_info.bucket << - " start_after " << start_after.name << "[" << start_after.instance << - "] num_entries " << num_entries << dendl; + ldout(cct, 10) << "RGWRados::" << __func__ << ": " << bucket_info.bucket << + " start_after=\"" << start_after.name << + "[" << start_after.instance << + "]\", prefix=\"" << prefix << + "\" num_entries=" << num_entries << + ", list_versions=" << list_versions << + ", attempt=" << attempt << dendl; RGWSI_RADOS::Pool index_pool; // key - oid (for different shards if there is any) @@ -8165,10 +8199,22 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info, } const uint32_t shard_count = oids.size(); - const uint32_t num_entries_per_shard = - calc_ordered_bucket_list_per_shard(num_entries, shard_count); + uint32_t num_entries_per_shard; + if (attempt == 0) { + num_entries_per_shard = + calc_ordered_bucket_list_per_shard(num_entries, shard_count); + } else if (attempt <= 11) { + // we'll max out the exponential multiplication factor at 1024 (2<<10) + num_entries_per_shard = + std::min(num_entries, + (uint32_t(1 << (attempt - 1)) * + calc_ordered_bucket_list_per_shard(num_entries, shard_count))); + } else { + num_entries_per_shard = num_entries; + } - ldout(cct, 10) << __func__ << " request from each of " << shard_count << + ldout(cct, 10) << "RGWRados::" << __func__ << + " request from each of " << shard_count << " shard(s) for " << num_entries_per_shard << " entries to get " << num_entries << " total entries" << dendl; @@ -8183,6 +8229,21 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info, return r; } + auto result_info = + [](const map& m) -> std::string { + std::stringstream out; + out << "{ size:" << m.size() << ", entries:["; + for (const auto& i : m) { + out << " { " << i.first << ", " << i.second.dir.m.size() << " },"; + } + out << "] }"; + return out.str(); + }; + + ldout(cct, 20) << "RGWRados::" << __func__ << + " CLSRGWIssueBucketList() result=" << + result_info(list_results) << dendl; + // create a list of iterators that are used to iterate each shard vector vcurrents; vector vends; @@ -8226,6 +8287,9 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info, const string& name = vcurrents[pos]->first; struct rgw_bucket_dir_entry& dirent = vcurrents[pos]->second; + ldout(cct, 20) << "RGWRados::" << __func__ << " currently processing " << + dirent.key << " from shard " << pos << dendl; + bool force_check = force_check_filter && force_check_filter(dirent.key.name); @@ -8247,11 +8311,15 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info, } else { r = 0; } + if (r >= 0) { - ldout(cct, 10) << "RGWRados::cls_bucket_list_ordered: got " << + ldout(cct, 10) << "RGWRados::" << __func__ << ": got " << dirent.key.name << "[" << dirent.key.instance << "]" << dendl; m[name] = std::move(dirent); ++count; + } else { + ldout(cct, 10) << "RGWRados::" << __func__ << ": skipping " << + dirent.key.name << "[" << dirent.key.instance << "]" << dendl; } // refresh the candidates map @@ -8267,7 +8335,7 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info, } } // while we haven't provided requested # of result entries - // suggest updates if there is any + // suggest updates if there are any for (auto& miter : updates) { if (miter.second.length()) { ObjectWriteOperation o; @@ -8289,14 +8357,24 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info, } } + ldout(cct, 20) << "RGWRados::" << __func__ << + ": returning, count=" << count << ", is_truncated=" << *is_truncated << + dendl; + if (*is_truncated && count < num_entries) { ldout(cct, 10) << "RGWRados::" << __func__ << ": INFO requested " << num_entries << " entries but returning " << count << ", which is truncated" << dendl; } - if (pos >= 0) + if (pos >= 0) { *last_entry = std::move((--vcurrents[pos])->first); + ldout(cct, 20) << "RGWRados::" << __func__ << + ": returning, last_entry=" << *last_entry << dendl; + } else { + ldout(cct, 20) << "RGWRados::" << __func__ << + ": returning, last_entry NOT SET" << dendl; + } return 0; } diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index c3187b3c9d10..6ad236a70672 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ -1378,12 +1378,13 @@ public: using check_filter_t = bool (*)(const std::string&); int cls_bucket_list_ordered(RGWBucketInfo& bucket_info, - int shard_id, + const int shard_id, const rgw_obj_index_key& start_after, const string& prefix, const string& delimiter, - uint32_t num_entries, - bool list_versions, + const uint32_t num_entries, + const bool list_versions, + const uint16_t attempt, // 0 means ignore ent_map_t& m, bool* is_truncated, bool* cls_filtered,