From: J. Eric Ivancich
Date: Mon, 14 Oct 2019 20:21:35 +0000 (-0400)
Subject: rgw: reduce per-shard entry count during ordered bucket listing
X-Git-Tag: v14.2.10~77^2~2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=d89def43c20f574d46b07836c238f6fbd31a3ab7;p=ceph.git

rgw: reduce per-shard entry count during ordered bucket listing

Currently, if a client requests the 1000 next entries from a bucket,
each bucket index shard will receive a request for the 1000 next
entries. When there are hundreds, thousands, or tens of thousands of
bucket index shards, this results in a huge amplification of the
request, even though only 1000 entries will be returned. These
changes reduce the number of entries requested from each bucket index
shard. They also allow re-requests in edge cases where all of one
shard's returned entries are consumed. Finally, these changes improve
the determination of whether the resulting list is truncated.

Signed-off-by: J. Eric Ivancich
(cherry picked from commit 729c0ba1361b76ecace94793b956354486b3dcd8)
---

diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 023d54097777..87109b088a88 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -31,3 +31,8 @@
   The warning can be silenced with::

     ceph config set global mon_warn_on_pool_no_redundancy false
+
+* RGW: bucket listing performance on sharded bucket indexes has been
+  notably improved by heuristically -- and significantly, in many
+  cases -- reducing the number of entries requested from each bucket
+  index shard.
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index 1e9b7227739c..350825fe9090 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -2449,8 +2449,9 @@ int RGWRados::Bucket::List::list_objects_ordered(
     }
   }

+  constexpr int allowed_read_attempts = 2;
   string skip_after_delim;
-  while (truncated && count <= max) {
+  for (int attempt = 0; attempt < allowed_read_attempts; ++attempt) {
     std::map ent_map;
     int r = store->cls_bucket_list_ordered(target->get_bucket_info(),
                                            shard_id,
@@ -2546,7 +2547,7 @@
       result->emplace_back(std::move(entry));
       count++;
-    }
+    } // eiter for loop

     if (!params.delim.empty()) {
       int marker_delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
@@ -2565,7 +2566,18 @@
         }
       }
     }
-  }
+
+    // if we finished listing, or if we're returning at least half the
+    // requested entries, that's enough; S3 and swift protocols allow
+    // returning fewer than max entries
+    if (!truncated || count >= max / 2) {
+      break;
+    }
+
+    ldout(cct, 1) << "RGWRados::Bucket::List::" << __func__ <<
+      " INFO ordered bucket listing requires read #" << (2 + attempt) <<
+      dendl;
+  } // read attempt loop

 done:
   if (is_truncated)
@@ -9120,6 +9132,36 @@ int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_
 }

+uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries,
+                                                      uint32_t num_shards)
+{
+  // We want to minimize the chances that when num_shards >>
+  // num_entries that we return much fewer than num_entries to the
+  // client. Given all the overhead of making a cls call to the osd,
+  // returning a few entries is not much more work than returning one
+  // entry. This minimum might be better tuned based on future
+  // experiments where num_shards >> num_entries. (Note: ">>" should
+  // be interpreted as "much greater than".)
+  constexpr uint32_t min_read = 8;
+
+  // The following is based on _"Balls into Bins" -- A Simple and
+  // Tight Analysis_ by Raab and Steger. We add 1 as a way to handle
+  // cases when num_shards >> num_entries (it almost serves as a
+  // ceiling calculation). We also assume alpha is 1.0 and extract it
+  // from the calculation. Future work could involve memoizing some of
+  // the transcendental functions to minimize repeatedly re-calling
+  // them with the same parameters, which we expect to be the case the
+  // majority of the time.
+  uint32_t calc_read =
+    1 +
+    static_cast<uint32_t>((num_entries / num_shards) +
+                          sqrt((2 * num_entries) *
+                               log(num_shards) / num_shards));
+
+  return std::max(min_read, calc_read);
+}
+
+
 int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
                                       int shard_id,
                                       const rgw_obj_index_key& start,
@@ -9140,17 +9182,27 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
   // value - list result for the corresponding oid (shard), it is filled by
   //         the AIO callback
   map oids;
-  map list_results;
   int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
-  if (r < 0)
+  if (r < 0) {
     return r;
+  }
+
+  const uint32_t shard_count = oids.size();
+  const uint32_t num_entries_per_shard =
+    calc_ordered_bucket_list_per_shard(num_entries, shard_count);
+
+  ldout(cct, 10) << __func__ << " request from each of " << shard_count <<
+    " shard(s) for " << num_entries_per_shard << " entries to get " <<
+    num_entries << " total entries" << dendl;
+  map list_results;
   cls_rgw_obj_key start_key(start.name, start.instance);
-  r = CLSRGWIssueBucketList(index_ctx, start_key, prefix, num_entries,
+  r = CLSRGWIssueBucketList(index_ctx, start_key, prefix, num_entries_per_shard,
                             list_versions, oids, list_results,
                             cct->_conf->rgw_bucket_index_max_aio)();
-  if (r < 0)
+  if (r < 0) {
     return r;
+  }

   // Create a list of iterators that are used to iterate each shard
   vector::iterator> vcurrents;
@@ -9159,18 +9211,15 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
   vcurrents.reserve(list_results.size());
   vends.reserve(list_results.size());
   vnames.reserve(list_results.size());
-  map::iterator iter = list_results.begin();
-  *is_truncated = false;
-  for (; iter != list_results.end(); ++iter) {
-    vcurrents.push_back(iter->second.dir.m.begin());
-    vends.push_back(iter->second.dir.m.end());
-    vnames.push_back(oids[iter->first]);
-    *is_truncated = (*is_truncated || iter->second.is_truncated);
+  for (auto& iter : list_results) {
+    vcurrents.push_back(iter.second.dir.m.begin());
+    vends.push_back(iter.second.dir.m.end());
+    vnames.push_back(oids[iter.first]);
   }

-  // Create a map to track the next candidate entry from each shard, if the entry
-  // from a specified shard is selected/erased, the next entry from that shard will
-  // be inserted for next round selection
+  // create a map to track the next candidate entry from each shard,
+  // if the entry from a specified shard is selected/erased, the next
+  // entry from that shard will be inserted for next round selection
   map candidates;
   for (size_t i = 0; i < vcurrents.size(); ++i) {
     if (vcurrents[i] != vends[i]) {
@@ -9193,17 +9242,18 @@
     if ((!dirent.exists && !dirent.is_delete_marker()) ||
         !dirent.pending_map.empty() ||
         force_check) {
-      /* there are uncommitted ops. We need to check the current state,
-       * and if the tags are old we need to do cleanup as well. */
+      /* there are uncommitted ops. We need to check the current
+       * state, and if the tags are old we need to do clean-up as
+       * well. */
       librados::IoCtx sub_ctx;
       sub_ctx.dup(index_ctx);
       r = check_disk_state(sub_ctx, bucket_info, dirent, dirent,
                            updates[vnames[pos]]);
       if (r < 0 && r != -ENOENT) {
-          return r;
+        return r;
       }
     } else {
-        r = 0;
+      r = 0;
     }
     if (r >= 0) {
       ldout(cct, 10) << "RGWRados::cls_bucket_list_ordered: got " <<
@@ -9212,37 +9262,49 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
       ++count;
     }

-    // Refresh the candidates map
+    // refresh the candidates map
     candidates.erase(candidates.begin());
-    ++vcurrents[pos];
-    if (vcurrents[pos] != vends[pos]) {
+    if (++vcurrents[pos] != vends[pos]) { // note: pre-increment
      candidates[vcurrents[pos]->first] = pos;
+    } else if (list_results[pos].is_truncated) {
+      // once we exhaust one shard that is truncated, we need to stop,
+      // as we cannot be certain that one of the next entries needs to
+      // come from that shard; S3 and swift protocols allow returning
+      // fewer than what was requested
+      break;
     }
-  }
+  } // while we haven't provided requested # of result entries

-  // Suggest updates if there is any
-  map::iterator miter = updates.begin();
-  for (; miter != updates.end(); ++miter) {
-    if (miter->second.length()) {
+  // suggest updates if there is any
+  for (auto& miter : updates) {
+    if (miter.second.length()) {
       ObjectWriteOperation o;
-      cls_rgw_suggest_changes(o, miter->second);
+      cls_rgw_suggest_changes(o, miter.second);
       // we don't care if we lose suggested updates, send them off blindly
       AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
-      index_ctx.aio_operate(miter->first, c, &o);
+      index_ctx.aio_operate(miter.first, c, &o);
       c->release();
     }
-  }
+  } // updates loop

-  // Check if all the returned entries are consumed or not
+  *is_truncated = false;
+  // check if all the returned entries are consumed or not
   for (size_t i = 0; i < vcurrents.size(); ++i) {
-    if (vcurrents[i] != vends[i]) {
+    if (vcurrents[i] != vends[i] || list_results[i].is_truncated) {
       *is_truncated = true;
       break;
     }
   }
-  if (pos >= 0)
+
+  if (*is_truncated && count < num_entries) {
+    ldout(cct, 10) << "RGWRados::" << __func__ <<
+      ": INFO requested " << num_entries << " entries but returning " <<
+      count << ", which is truncated" << dendl;
+  }
+
+  if (pos >= 0) {
     *last_entry = std::move((--vcurrents[pos])->first);
+  }

   return 0;
 }
diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h
index 69ee29ebf992..d772ccee67e5 100644
--- a/src/rgw/rgw_rados.h
+++ b/src/rgw/rgw_rados.h
@@ -2465,6 +2465,12 @@ public:
                                bool *is_truncated,
                                RGWAccessListFilter *filter);
   uint64_t next_bucket_id();
+
+  /**
+   * This is broken out to facilitate unit testing.
+   */
+  static uint32_t calc_ordered_bucket_list_per_shard(uint32_t num_entries,
+                                                     uint32_t num_shards);
 };

 class RGWStoreManager {
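For illustration, the per-shard heuristic introduced by
calc_ordered_bucket_list_per_shard above can be exercised on its own. The
following standalone C++ sketch is not part of the patch; the file name,
the main() driver, and the sample shard counts are made up for this
example, and it simply mirrors the formula from the diff to show how a
request for 1000 entries shrinks to a much smaller per-shard request as
the shard count grows:

// per_shard_demo.cc -- illustrative only, not part of the Ceph tree.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

// Mirrors the heuristic added in the patch:
//   floor(n/s + sqrt(2n * ln(s) / s)) + 1,
// clamped below by a minimum per-shard read of 8 entries.
static uint32_t calc_ordered_bucket_list_per_shard(uint32_t num_entries,
                                                   uint32_t num_shards)
{
  constexpr uint32_t min_read = 8;
  const uint32_t calc_read =
    1 +
    static_cast<uint32_t>((num_entries / num_shards) +
                          std::sqrt((2.0 * num_entries) *
                                    std::log(num_shards) / num_shards));
  return std::max(min_read, calc_read);
}

int main()
{
  const uint32_t num_entries = 1000;  // what the client asked for
  const uint32_t shard_counts[] = {1, 16, 128, 1024, 8192};

  // print the per-shard request size for each sample shard count
  for (uint32_t shards : shard_counts) {
    std::cout << shards << " shard(s): request "
              << calc_ordered_bucket_list_per_shard(num_entries, shards)
              << " entries per shard" << std::endl;
  }
  return 0;
}

With these inputs the heuristic asks each of 128 shards for only 16
entries rather than 1000, and never less than 8 per shard, which is the
request amplification reduction described in the commit message.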