]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
rgw: address 0-length listing results when non-vis entries dominate
authorJ. Eric Ivancich <ivancich@redhat.com>
Thu, 13 Feb 2020 01:38:44 +0000 (20:38 -0500)
committerJ. Eric Ivancich <ivancich@redhat.com>
Wed, 6 May 2020 15:41:01 +0000 (11:41 -0400)
A change to advance the marker in RGWRados::cls_bucket_list_ordered to
the last entry visited rather than the final entry in list to push
progress as far as possible.

Since non-vis entries tend to cluster on the same shard, such as
during incomplete multipart uploads, this can severely limit the
number of entries returned by a call to
RGWRados::cls_bucket_list_ordered since once that shard has provided
all its members, we must stop. This interacts with a recent
optimization to reduce the number of entries requested from each
shard. To address this the number of attempts is sent as a parameter,
so the number of entries requested from each shard can grow with each
attempt. Currently the growth is linear but perhaps exponential growth
(capped at number of entries requested) should be considered.

Previously RGWRados::Bucket::List::list_objects_ordered was capped at
2 attempts, but now we keep attempting to insure we make forward
progress and return entries when some exist. If we fail to make
forward progress, we log the error condition and stop looping.

Additional logging, mostly at level 20, is added to the two key
functions involved in ordered bucket listing to make it easier to
follow the logic and address potential future issues that might arise.

Additionally modify attempt number based on how many results were
received.

Change the per-shard request number, so it grows exponentially rather
than linearly as the attempts go up.

Signed-off-by: J. Eric Ivancich <ivancich@redhat.com>
(cherry picked from commit 28bd8bada4236060db6d5aed6b1eb345ab507890)

src/rgw/rgw_admin.cc
src/rgw/rgw_bucket.cc
src/rgw/rgw_rados.cc
src/rgw/rgw_rados.h

index 550e649ef7522ba868f20cd56d334e3dc9befb2a..20ede3f2e69ff2192f2d7177eeb513578b76430f 100644 (file)
@@ -6253,20 +6253,27 @@ next:
     formatter->open_object_section("result");
     formatter->dump_string("bucket", bucket_name);
     formatter->open_array_section("objects");
+
+    constexpr uint32_t NUM_ENTRIES = 1000;
+    uint16_t attempt = 1;
     while (is_truncated) {
       map<string, rgw_bucket_dir_entry> result;
       int r =
        store->cls_bucket_list_ordered(bucket_info, RGW_NO_SHARD, marker,
-                                      prefix, 1000, true,
+                                      prefix, NUM_ENTRIES, true, attempt,
                                       result, &is_truncated, &marker,
                                       bucket_object_check_filter);
-
       if (r < 0 && r != -ENOENT) {
         cerr << "ERROR: failed operation r=" << r << std::endl;
+      } else if (r == -ENOENT) {
+        break;
       }
 
-      if (r == -ENOENT)
-        break;
+      if (result.size() < NUM_ENTRIES / 8) {
+       ++attempt;
+      } else if (result.size() > NUM_ENTRIES * 7 / 8 && attempt > 1) {
+       --attempt;
+      }
 
       map<string, rgw_bucket_dir_entry>::iterator iter;
       for (iter = result.begin(); iter != result.end(); ++iter) {
index 80ba1ba354bda799819710fe3152116a9cc86387..5092278c4f0dd4c69f3e4c46d59f3ef94a939271 100644 (file)
@@ -1164,11 +1164,13 @@ int RGWBucket::check_object_index(RGWBucketAdminOpState& op_state,
 
   Formatter *formatter = flusher.get_formatter();
   formatter->open_object_section("objects");
+  constexpr uint32_t NUM_ENTRIES = 1000;
+  uint16_t attempt = 1;
   while (is_truncated) {
     map<string, rgw_bucket_dir_entry> result;
 
     int r = store->cls_bucket_list_ordered(bucket_info, RGW_NO_SHARD,
-                                          marker, prefix, 1000, true,
+                                          marker, prefix, NUM_ENTRIES, true, attempt,
                                           result, &is_truncated, &marker,
                                           bucket_object_check_filter);
     if (r == -ENOENT) {
@@ -1177,6 +1179,12 @@ int RGWBucket::check_object_index(RGWBucketAdminOpState& op_state,
       set_err_msg(err_msg, "ERROR: failed operation r=" + cpp_strerror(-r));
     }
 
+    if (result.size() < NUM_ENTRIES / 8) {
+      ++attempt;
+    } else if (result.size() > NUM_ENTRIES * 7 / 8 && attempt > 1) {
+      --attempt;
+    }
+
     dump_bucket_index(result, formatter);
     flusher.flush();
   }
index 350825fe9090394bd867e3670bf4c0bb58ebf8c0..79ab2b3186b7bdbcc0333a14cd13246ba599c4b1 100644 (file)
@@ -5,6 +5,8 @@
 #include <errno.h>
 #include <stdlib.h>
 #include <sys/types.h>
+#include <sstream>
+
 #include <boost/algorithm/string.hpp>
 #include <string_view>
 
@@ -2449,9 +2451,27 @@ int RGWRados::Bucket::List::list_objects_ordered(
     }
   }
 
-  constexpr int allowed_read_attempts = 2;
+  rgw_obj_index_key prev_marker;
   string skip_after_delim;
-  for (int attempt = 0; attempt < allowed_read_attempts; ++attempt) {
+  uint16_t attempt = 0;
+  while (true) {
+    ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ <<
+      " beginning attempt=" << ++attempt << dendl;
+
+    // this loop is generally expected only to have a single
+    // iteration; the standard exit is at the bottom of the loop, but
+    // there's an error condition emergency exit as well
+
+    if (attempt > 1 && !(prev_marker < cur_marker)) {
+      // we've failed to make forward progress
+      ldout(cct, 0) << "RGWRados::Bucket::List::" << __func__ <<
+       ": ERROR marker failed to make forward progress; attempt=" << attempt <<
+       ", prev_marker=" << prev_marker <<
+       ", cur_marker=" << cur_marker << dendl;
+      break;
+    }
+    prev_marker = cur_marker;
+
     std::map<string, rgw_bucket_dir_entry> ent_map;
     int r = store->cls_bucket_list_ordered(target->get_bucket_info(),
                                           shard_id,
@@ -2459,6 +2479,7 @@ int RGWRados::Bucket::List::list_objects_ordered(
                                           cur_prefix,
                                           read_ahead + 1 - count,
                                           params.list_versions,
+                                          attempt,
                                           ent_map,
                                           &truncated,
                                           &cur_marker);
@@ -2545,6 +2566,9 @@ int RGWRados::Bucket::List::list_objects_ordered(
         goto done;
       }
 
+      ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ <<
+       " adding entry " << entry.key << " to result" << dendl;
+
       result->emplace_back(std::move(entry));
       count++;
     } // eiter for loop
@@ -2567,15 +2591,23 @@ int RGWRados::Bucket::List::list_objects_ordered(
       }
     }
 
-    // if we finished listing, or if we're returning at least half the
-    // requested entries, that's enough; S3 and swift protocols allow
-    // returning fewer than max entries
-    if (!truncated || count >= max / 2) {
+    ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ <<
+      " INFO end of outer loop, truncated=" << truncated <<
+      ", count=" << count << ", attempt=" << attempt << dendl;
+
+    if (!truncated || count >= (max + 1) / 2) {
+      // if we finished listing, or if we're returning at least half the
+      // requested entries, that's enough; S3 and swift protocols allow
+      // returning fewer than max entries
+      break;
+    } else if (attempt > 8 && count >= 1) {
+      // if we've made at least 8 attempts and we have some, but very
+      // few, results, return with what we have
       break;
     }
 
     ldout(cct, 1) << "RGWRados::Bucket::List::" << __func__ <<
-      " INFO ordered bucket listing requires read #" << (2 + attempt) <<
+      " INFO ordered bucket listing requires read #" << (1 + attempt) <<
       dendl;
   } // read attempt loop
 
@@ -9163,19 +9195,24 @@ uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries,
 
 
 int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
-                                     int shard_id,
-                                     const rgw_obj_index_key& start,
+                                     const int shard_id,
+                                     const rgw_obj_index_key& start_after,
                                      const string& prefix,
-                                     uint32_t num_entries,
-                                     bool list_versions,
+                                     const uint32_t num_entries,
+                                     const bool list_versions,
+                                     const uint16_t attempt,
                                      map<string, rgw_bucket_dir_entry>& m,
                                      bool *is_truncated,
                                      rgw_obj_index_key *last_entry,
                                      bool (*force_check_filter)(const string& name))
 {
-  ldout(cct, 10) << "cls_bucket_list_ordered " << bucket_info.bucket <<
-    " start " << start.name << "[" << start.instance << "] num_entries " <<
-    num_entries << dendl;
+  ldout(cct, 10) << "RGWRados::" << __func__ << ": " << bucket_info.bucket <<
+    " start_after=\"" << start_after.name <<
+    "[" << start_after.instance <<
+    "]\", prefix=\"" << prefix <<
+    "\" num_entries=" << num_entries <<
+    ", list_versions=" << list_versions <<
+    ", attempt=" << attempt << dendl;
 
   librados::IoCtx index_ctx;
   // key   - oid (for different shards if there is any)
@@ -9188,15 +9225,27 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
   }
 
   const uint32_t shard_count = oids.size();
-  const uint32_t num_entries_per_shard =
-    calc_ordered_bucket_list_per_shard(num_entries, shard_count);
+  uint32_t num_entries_per_shard;
+  if (attempt == 0) {
+    num_entries_per_shard =
+      calc_ordered_bucket_list_per_shard(num_entries, shard_count);
+  } else if (attempt <= 11) {
+    // we'll max out the exponential multiplication factor at 1024 (2<<10)
+    num_entries_per_shard =
+      std::min(num_entries,
+              (uint32_t(1 << (attempt - 1)) *
+               calc_ordered_bucket_list_per_shard(num_entries, shard_count)));
+  } else {
+    num_entries_per_shard = num_entries;
+  }
 
-  ldout(cct, 10) << __func__ << " request from each of " << shard_count <<
+  ldout(cct, 10) << "RGWRados::" << __func__ <<
+    " request from each of " << shard_count <<
     " shard(s) for " << num_entries_per_shard << " entries to get " <<
     num_entries << " total entries" << dendl;
 
   map<int, struct rgw_cls_list_ret> list_results;
-  cls_rgw_obj_key start_key(start.name, start.instance);
+  cls_rgw_obj_key start_key(start_after.name, start_after.instance);
   r = CLSRGWIssueBucketList(index_ctx, start_key, prefix, num_entries_per_shard,
                            list_versions, oids, list_results,
                            cct->_conf->rgw_bucket_index_max_aio)();
@@ -9237,8 +9286,12 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
     const string& name = vcurrents[pos]->first;
     struct rgw_bucket_dir_entry& dirent = vcurrents[pos]->second;
 
-    bool force_check = force_check_filter &&
-        force_check_filter(dirent.key.name);
+    ldout(cct, 20) << "RGWRados::" << __func__ << " currently processing " <<
+      dirent.key << " from shard " << pos << dendl;
+
+    bool force_check =
+      force_check_filter && force_check_filter(dirent.key.name);
+
     if ((!dirent.exists && !dirent.is_delete_marker()) ||
         !dirent.pending_map.empty() ||
         force_check) {
@@ -9255,11 +9308,15 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
     } else {
       r = 0;
     }
+
     if (r >= 0) {
-      ldout(cct, 10) << "RGWRados::cls_bucket_list_ordered: got " <<
+      ldout(cct, 10) << "RGWRados::" << __func__ << ": got " <<
        dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
       m[name] = std::move(dirent);
       ++count;
+    } else {
+      ldout(cct, 10) << "RGWRados::" << __func__ << ": skipping " <<
+       dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
     }
 
     // refresh the candidates map
@@ -9275,7 +9332,7 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
     }
   } // while we haven't provided requested # of result entries
 
-  // suggest updates if there is any
+  // suggest updates if there are any
   for (auto& miter : updates) {
     if (miter.second.length()) {
       ObjectWriteOperation o;
@@ -9296,6 +9353,10 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
     }
   }
 
+  ldout(cct, 20) << "RGWRados::" << __func__ <<
+    ": returning, count=" << count << ", is_truncated=" << *is_truncated <<
+    dendl;
+
   if (*is_truncated && count < num_entries) {
     ldout(cct, 10) << "RGWRados::" << __func__ <<
       ": INFO requested " << num_entries << " entries but returning " <<
@@ -9304,6 +9365,11 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
 
   if (pos >= 0) {
     *last_entry = std::move((--vcurrents[pos])->first);
+    ldout(cct, 20) << "RGWRados::" << __func__ <<
+      ": returning, last_entry=" << *last_entry << dendl;
+  } else {
+    ldout(cct, 20) << "RGWRados::" << __func__ <<
+      ": returning, last_entry NOT SET" << dendl;
   }
 
   return 0;
index d772ccee67e5a2f38dbc4eeb88f422a304594cf4..162a9b15adb1743076d7d1b178fe972235b9b5f5 100644 (file)
@@ -2205,10 +2205,13 @@ public:
                            ceph::real_time& removed_mtime, list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
   int cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
   int cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_t timeout);
-  int cls_bucket_list_ordered(RGWBucketInfo& bucket_info, int shard_id,
-                             const rgw_obj_index_key& start,
+  int cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
+                             const int shard_id,
+                             const rgw_obj_index_key& start_after,
                              const string& prefix,
-                             uint32_t num_entries, bool list_versions,
+                             const uint32_t num_entries,
+                             const bool list_versions,
+                             const uint16_t attempt, // 0 means ignore
                              map<string, rgw_bucket_dir_entry>& m,
                              bool *is_truncated,
                              rgw_obj_index_key *last_entry,