From: J. Eric Ivancich
Date: Mon, 14 Oct 2019 20:21:35 +0000 (-0400)
Subject: rgw: reduce per-shard entry count during ordered bucket listing
X-Git-Tag: v14.2.10~77^2~2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=d89def43c20f574d46b07836c238f6fbd31a3ab7;p=ceph.git

rgw: reduce per-shard entry count during ordered bucket listing

Currently, if a client requests the 1000 next entries from a bucket,
each bucket index shard will receive a request for the 1000 next
entries. When there are hundreds, thousands, or tens of thousands of
bucket index shards, this results in a huge amplification of the
request, even though only 1000 entries will be returned. These
changes reduce the number of entries requested from each bucket index
shard. They also allow re-requests in edge cases where all of one
shard's returned entries are consumed. Finally, these changes improve
the determination of whether the resulting list is truncated.

Signed-off-by: J. Eric Ivancich
(cherry picked from commit 729c0ba1361b76ecace94793b956354486b3dcd8)
---

diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 023d54097777..87109b088a88 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -31,3 +31,8 @@
   The warning can be silenced with::

     ceph config set global mon_warn_on_pool_no_redundancy false
+
+* RGW: bucket listing performance on sharded bucket indexes has been
+  notably improved by heuristically -- and significantly, in many
+  cases -- reducing the number of entries requested from each bucket
+  index shard.
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index 1e9b7227739c..350825fe9090 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -2449,8 +2449,9 @@ int RGWRados::Bucket::List::list_objects_ordered(
     }
   }

+  constexpr int allowed_read_attempts = 2;
   string skip_after_delim;
-  while (truncated && count <= max) {
+  for (int attempt = 0; attempt < allowed_read_attempts; ++attempt) {
     std::map ent_map;
     int r = store->cls_bucket_list_ordered(target->get_bucket_info(),
                                            shard_id,
@@ -2546,7 +2547,7 @@
       result->emplace_back(std::move(entry));
       count++;
-    }
+    } // eiter for loop

     if (!params.delim.empty()) {
       int marker_delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
@@ -2565,7 +2566,18 @@
         }
       }
     }
-  }
+
+    // if we finished listing, or if we're returning at least half the
+    // requested entries, that's enough; S3 and swift protocols allow
+    // returning fewer than max entries
+    if (!truncated || count >= max / 2) {
+      break;
+    }
+
+    ldout(cct, 1) << "RGWRados::Bucket::List::" << __func__ <<
+      " INFO ordered bucket listing requires read #" << (2 + attempt) <<
+      dendl;
+  } // read attempt loop

 done:
   if (is_truncated)
@@ -9120,6 +9132,36 @@ int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_
 }

+uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries,
+                                                      uint32_t num_shards)
+{
+  // We want to minimize the chances that when num_shards >>
+  // num_entries that we return much fewer than num_entries to the
+  // client. Given all the overhead of making a cls call to the osd,
+  // returning a few entries is not much more work than returning one
+  // entry. This minimum might be better tuned based on future
+  // experiments where num_shards >> num_entries. (Note: ">>" should
+  // be interpreted as "much greater than".)
+  constexpr uint32_t min_read = 8;
+
+  // The following is based on _"Balls into Bins" -- A Simple and
+  // Tight Analysis_ by Raab and Steger. We add 1 as a way to handle
+  // cases when num_shards >> num_entries (it almost serves as a
+  // ceiling calculation). We also assume alpha is 1.0 and extract it
+  // from the calculation. Future work could involve memoizing some of
+  // the transcendental functions to minimize repeatedly re-calling
+  // them with the same parameters, which we expect to be the case the
+  // majority of the time.
+  uint32_t calc_read =
+    1 +
+    static_cast<uint32_t>((num_entries / num_shards) +
+                          sqrt((2 * num_entries) *
+                               log(num_shards) / num_shards));
+
+  return std::max(min_read, calc_read);
+}
+
+
 int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
                                       int shard_id,
                                       const rgw_obj_index_key& start,
@@ -9140,17 +9182,27 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
   // value - list result for the corresponding oid (shard), it is filled by
   //         the AIO callback
   map oids;
-  map list_results;
   int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
-  if (r < 0)
+  if (r < 0) {
     return r;
+  }
+
+  const uint32_t shard_count = oids.size();
+  const uint32_t num_entries_per_shard =
+    calc_ordered_bucket_list_per_shard(num_entries, shard_count);
+
+  ldout(cct, 10) << __func__ << " request from each of " << shard_count <<
+    " shard(s) for " << num_entries_per_shard << " entries to get " <<
+    num_entries << " total entries" << dendl;
+  map list_results;
   cls_rgw_obj_key start_key(start.name, start.instance);
-  r = CLSRGWIssueBucketList(index_ctx, start_key, prefix, num_entries,
+  r = CLSRGWIssueBucketList(index_ctx, start_key, prefix, num_entries_per_shard,
                             list_versions, oids, list_results,
                             cct->_conf->rgw_bucket_index_max_aio)();
-  if (r < 0)
+  if (r < 0) {
     return r;
+  }

   // Create a list of iterators that are used to iterate each shard
   vector::iterator> vcurrents;
@@ -9159,18 +9211,15 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
   vcurrents.reserve(list_results.size());
   vends.reserve(list_results.size());
   vnames.reserve(list_results.size());
-  map::iterator iter = list_results.begin();
-  *is_truncated = false;
-  for (; iter != list_results.end(); ++iter) {
-    vcurrents.push_back(iter->second.dir.m.begin());
-    vends.push_back(iter->second.dir.m.end());
-    vnames.push_back(oids[iter->first]);
-    *is_truncated = (*is_truncated || iter->second.is_truncated);
+  for (auto& iter : list_results) {
+    vcurrents.push_back(iter.second.dir.m.begin());
+    vends.push_back(iter.second.dir.m.end());
+    vnames.push_back(oids[iter.first]);
   }

-  // Create a map to track the next candidate entry from each shard, if the entry
-  // from a specified shard is selected/erased, the next entry from that shard will
-  // be inserted for next round selection
+  // create a map to track the next candidate entry from each shard,
+  // if the entry from a specified shard is selected/erased, the next
+  // entry from that shard will be inserted for next round selection
   map candidates;
   for (size_t i = 0; i < vcurrents.size(); ++i) {
     if (vcurrents[i] != vends[i]) {
@@ -9193,17 +9242,18 @@
     if ((!dirent.exists && !dirent.is_delete_marker()) ||
         !dirent.pending_map.empty() ||
         force_check) {
-      /* there are uncommitted ops. We need to check the current state,
-       * and if the tags are old we need to do cleanup as well. */
+      /* there are uncommitted ops. We need to check the current
+       * state, and if the tags are old we need to do clean-up as
+       * well. */
       librados::IoCtx sub_ctx;
       sub_ctx.dup(index_ctx);
       r = check_disk_state(sub_ctx, bucket_info, dirent, dirent,
                            updates[vnames[pos]]);
       if (r < 0 && r != -ENOENT) {
-          return r;
+        return r;
       }
     } else {
-        r = 0;
+      r = 0;
     }
     if (r >= 0) {
       ldout(cct, 10) << "RGWRados::cls_bucket_list_ordered: got " <<
@@ -9212,37 +9262,49 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
       ++count;
     }

-    // Refresh the candidates map
+    // refresh the candidates map
     candidates.erase(candidates.begin());
-    ++vcurrents[pos];
-    if (vcurrents[pos] != vends[pos]) {
+    if (++vcurrents[pos] != vends[pos]) { // note: pre-increment
      candidates[vcurrents[pos]->first] = pos;
+    } else if (list_results[pos].is_truncated) {
+      // once we exhaust one shard that is truncated, we need to stop,
+      // as we cannot be certain that one of the next entries needs to
+      // come from that shard; S3 and swift protocols allow returning
+      // fewer than what was requested
+      break;
     }
-  }
+  } // while we haven't provided requested # of result entries

-  // Suggest updates if there is any
-  map::iterator miter = updates.begin();
-  for (; miter != updates.end(); ++miter) {
-    if (miter->second.length()) {
+  // suggest updates if there is any
+  for (auto& miter : updates) {
+    if (miter.second.length()) {
       ObjectWriteOperation o;
-      cls_rgw_suggest_changes(o, miter->second);
+      cls_rgw_suggest_changes(o, miter.second);
       // we don't care if we lose suggested updates, send them off blindly
       AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
-      index_ctx.aio_operate(miter->first, c, &o);
+      index_ctx.aio_operate(miter.first, c, &o);
       c->release();
     }
-  }
+  } // updates loop

-  // Check if all the returned entries are consumed or not
+  *is_truncated = false;
+  // check if all the returned entries are consumed or not
   for (size_t i = 0; i < vcurrents.size(); ++i) {
-    if (vcurrents[i] != vends[i]) {
+    if (vcurrents[i] != vends[i] || list_results[i].is_truncated) {
       *is_truncated = true;
       break;
     }
   }
-  if (pos >= 0)
+
+  if (*is_truncated && count < num_entries) {
+    ldout(cct, 10) << "RGWRados::" << __func__ <<
+      ": INFO requested " << num_entries << " entries but returning " <<
+      count << ", which is truncated" << dendl;
+  }
+
+  if (pos >= 0) {
     *last_entry = std::move((--vcurrents[pos])->first);
+  }

   return 0;
 }
diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h
index 69ee29ebf992..d772ccee67e5 100644
--- a/src/rgw/rgw_rados.h
+++ b/src/rgw/rgw_rados.h
@@ -2465,6 +2465,12 @@ public:
                                bool *is_truncated,
                                RGWAccessListFilter *filter);
   uint64_t next_bucket_id();
+
+  /**
+   * This is broken out to facilitate unit testing.
+   */
+  static uint32_t calc_ordered_bucket_list_per_shard(uint32_t num_entries,
+                                                     uint32_t num_shards);
 };

 class RGWStoreManager {
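For illustration, the per-shard heuristic introduced by
calc_ordered_bucket_list_per_shard above can be exercised on its own. The
following standalone C++ sketch is not part of the patch; the file name,
the main() driver, and the sample shard counts are made up for this
example, and it simply mirrors the formula from the diff to show how a
request for 1000 entries shrinks to a much smaller per-shard request as
the shard count grows:

// per_shard_demo.cc -- illustrative only, not part of the Ceph tree.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

// Mirrors the heuristic added in the patch:
//   floor(n/s + sqrt(2n * ln(s) / s)) + 1,
// clamped below by a minimum per-shard read of 8 entries.
static uint32_t calc_ordered_bucket_list_per_shard(uint32_t num_entries,
                                                   uint32_t num_shards)
{
  constexpr uint32_t min_read = 8;
  const uint32_t calc_read =
    1 +
    static_cast<uint32_t>((num_entries / num_shards) +
                          std::sqrt((2.0 * num_entries) *
                                    std::log(num_shards) / num_shards));
  return std::max(min_read, calc_read);
}

int main()
{
  const uint32_t num_entries = 1000;  // what the client asked for
  const uint32_t shard_counts[] = {1, 16, 128, 1024, 8192};

  // print the per-shard request size for each sample shard count
  for (uint32_t shards : shard_counts) {
    std::cout << shards << " shard(s): request "
              << calc_ordered_bucket_list_per_shard(num_entries, shards)
              << " entries per shard" << std::endl;
  }
  return 0;
}

With these inputs the heuristic asks each of 128 shards for only 16
entries rather than 1000, and never less than 8 per shard, which is the
request amplification reduction described in the commit message.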