]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
rgw: address 0-length listing results when non-vis entries dominate
authorJ. Eric Ivancich <ivancich@redhat.com>
Thu, 13 Feb 2020 01:38:44 +0000 (20:38 -0500)
committerJ. Eric Ivancich <ivancich@redhat.com>
Thu, 13 Feb 2020 01:38:44 +0000 (20:38 -0500)
A change to advance the marker in RGWRados::cls_bucket_list_ordered to
the last entry visited rather than the final entry in list to push
progress as far as possible.

Since non-vis entries tend to cluster on the same shard, such as
during incomplete multipart uploads, this can severely limit the
number of entries returned by a call to
RGWRados::cls_bucket_list_ordered since once that shard has provided
all its members, we must stop. This interacts with a recent
optimization to reduce the number of entries requested from each
shard. To address this the number of attempts is sent as a parameter,
so the number of entries requested from each shard can grow with each
attempt. Currently the growth is linear but perhaps exponential growth
(capped at number of entries requested) should be considered.

Previously RGWRados::Bucket::List::list_objects_ordered was capped at
2 attempts, but now we keep attempting to insure we make forward
progress and return entries when some exist. If we fail to make
forward progress, we log the error condition and stop looping.

Additional logging, mostly at level 20, is added to the two key
functions involved in ordered bucket listing to make it easier to
follow the logic and address potential future issues that might arise.

Additionally modify attempt number based on how many results were
received.

Change the per-shard request number, so it grows exponentially rather
than linearly as the attempts go up.

Signed-off-by: J. Eric Ivancich <ivancich@redhat.com>
src/rgw/rgw_admin.cc
src/rgw/rgw_bucket.cc
src/rgw/rgw_rados.cc
src/rgw/rgw_rados.h

index bcfda46bf01ef84587ead137050f58201bb39f40..477f05f3d78859ed9678d2f74becd6d5ff70bd5b 100644 (file)
@@ -6749,21 +6749,31 @@ next:
     formatter->open_object_section("result");
     formatter->dump_string("bucket", bucket_name);
     formatter->open_array_section("objects");
+
+    constexpr uint32_t NUM_ENTRIES = 1000;
+    uint16_t attempt = 1;
     while (is_truncated) {
       RGWRados::ent_map_t result;
-      int r = store->getRados()->cls_bucket_list_ordered(
-       bucket_info, RGW_NO_SHARD,
-       marker, empty_prefix, empty_delimiter,
-       1000, true,
-       result, &is_truncated, &cls_filtered, &marker,
-       null_yield,
-       rgw_bucket_object_check_filter);
+      int r =
+       store->getRados()->cls_bucket_list_ordered(
+         bucket_info, RGW_NO_SHARD,
+         marker, empty_prefix, empty_delimiter,
+         NUM_ENTRIES, true, attempt,
+         result, &is_truncated, &cls_filtered, &marker,
+         null_yield,
+         rgw_bucket_object_check_filter);
+
       if (r < 0 && r != -ENOENT) {
         cerr << "ERROR: failed operation r=" << r << std::endl;
+      } else if (r == -ENOENT) {
+        break;
       }
 
-      if (r == -ENOENT)
-        break;
+      if (result.size() < NUM_ENTRIES / 8) {
+       ++attempt;
+      } else if (result.size() > NUM_ENTRIES * 7 / 8 && attempt > 1) {
+       --attempt;
+      }
 
       for (auto iter = result.begin(); iter != result.end(); ++iter) {
         rgw_obj_key key = iter->second.key;
index 258919a4bc0a19ff57ca4a5d6f267eba39ad59f0..c17936a825d94693a774eeaf2a874b88f659ed86 100644 (file)
@@ -1015,20 +1015,29 @@ int RGWBucket::check_object_index(RGWBucketAdminOpState& op_state,
 
   Formatter *formatter = flusher.get_formatter();
   formatter->open_object_section("objects");
+  constexpr uint32_t NUM_ENTRIES = 1000;
+  uint16_t attempt = 1;
   while (is_truncated) {
     RGWRados::ent_map_t result;
     result.reserve(listing_max_entries);
 
     int r = store->getRados()->cls_bucket_list_ordered(
       bucket_info, RGW_NO_SHARD, marker, prefix, empty_delimiter,
-      listing_max_entries, true, result, &is_truncated, &cls_filtered,
-      &marker, y, rgw_bucket_object_check_filter);
+      listing_max_entries, true, attempt,
+      result, &is_truncated, &cls_filtered, &marker,
+      y, rgw_bucket_object_check_filter);
     if (r == -ENOENT) {
       break;
     } else if (r < 0 && r != -ENOENT) {
       set_err_msg(err_msg, "ERROR: failed operation r=" + cpp_strerror(-r));
     }
 
+    if (result.size() < NUM_ENTRIES / 8) {
+      ++attempt;
+    } else if (result.size() > NUM_ENTRIES * 7 / 8 && attempt > 1) {
+      --attempt;
+    }
+
     dump_bucket_index(result, formatter);
     flusher.flush();
   }
index 58b921e44e1db99d055d82af644d3c42279ec9fd..8b4bef1b9579f563c34bb2af67997e421a757024 100644 (file)
@@ -5,6 +5,8 @@
 #include <errno.h>
 #include <stdlib.h>
 #include <sys/types.h>
+#include <sstream>
+
 #include <boost/algorithm/string.hpp>
 #include <string_view>
 
@@ -1774,10 +1776,25 @@ int RGWRados::Bucket::List::list_objects_ordered(
     }
   }
 
-  constexpr int allowed_read_attempts = 2;
-  for (int attempt = 0; attempt < allowed_read_attempts; ++attempt) {
+  rgw_obj_index_key prev_marker;
+  uint16_t attempt = 0;
+  while (true) {
+    ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ <<
+      " beginning attempt=" << ++attempt << dendl;
+
     // this loop is generally expected only to have a single
-    // iteration; see bottom of loop for early exit
+    // iteration; the standard exit is at the bottom of the loop, but
+    // there's an error condition emergency exit as well
+
+    if (attempt > 1 && !(prev_marker < cur_marker)) {
+      // we've failed to make forward progress
+      ldout(cct, 0) << "RGWRados::Bucket::List::" << __func__ <<
+       ": ERROR marker failed to make forward progress; attempt=" << attempt <<
+       ", prev_marker=" << prev_marker <<
+       ", cur_marker=" << cur_marker << dendl;
+      break;
+    }
+    prev_marker = cur_marker;
 
     ent_map_t ent_map;
     ent_map.reserve(read_ahead);
@@ -1788,6 +1805,7 @@ int RGWRados::Bucket::List::list_objects_ordered(
                                           params.delim,
                                           read_ahead + 1 - count,
                                           params.list_versions,
+                                          attempt,
                                           ent_map,
                                           &truncated,
                                           &cls_filtered,
@@ -1914,6 +1932,9 @@ int RGWRados::Bucket::List::list_objects_ordered(
         goto done;
       }
 
+      ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ <<
+       " adding entry " << entry.key << " to result" << dendl;
+
       result->emplace_back(std::move(entry));
       count++;
     } // eiter for loop
@@ -1942,15 +1963,23 @@ int RGWRados::Bucket::List::list_objects_ordered(
       }
     } // if older osd didn't do delimiter filtering
 
-    // if we finished listing, or if we're returning at least half the
-    // requested entries, that's enough; S3 and swift protocols allow
-    // returning fewer than max entries
-    if (!truncated || count >= max / 2) {
+    ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ <<
+      " INFO end of outer loop, truncated=" << truncated <<
+      ", count=" << count << ", attempt=" << attempt << dendl;
+
+    if (!truncated || count >= (max + 1) / 2) {
+      // if we finished listing, or if we're returning at least half the
+      // requested entries, that's enough; S3 and swift protocols allow
+      // returning fewer than max entries
+      break;
+    } else if (attempt > 8 && count >= 1) {
+      // if we've made at least 8 attempts and we have some, but very
+      // few, results, return with what we have
       break;
     }
 
     ldout(cct, 1) << "RGWRados::Bucket::List::" << __func__ <<
-      " INFO ordered bucket listing requires read #" << (2 + attempt) <<
+      " INFO ordered bucket listing requires read #" << (1 + attempt) <<
       dendl;
   } // read attempt loop
 
@@ -8136,12 +8165,13 @@ uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries,
 
 
 int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
-                                     int shard_id,
+                                     const int shard_id,
                                      const rgw_obj_index_key& start_after,
                                      const string& prefix,
                                      const string& delimiter,
-                                     uint32_t num_entries,
-                                     bool list_versions,
+                                     const uint32_t num_entries,
+                                     const bool list_versions,
+                                     const uint16_t attempt,
                                      ent_map_t& m,
                                      bool* is_truncated,
                                      bool* cls_filtered,
@@ -8149,9 +8179,13 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
                                       optional_yield y,
                                      check_filter_t force_check_filter)
 {
-  ldout(cct, 10) << "cls_bucket_list_ordered " << bucket_info.bucket <<
-    " start_after " << start_after.name << "[" << start_after.instance <<
-    "] num_entries " << num_entries << dendl;
+  ldout(cct, 10) << "RGWRados::" << __func__ << ": " << bucket_info.bucket <<
+    " start_after=\"" << start_after.name <<
+    "[" << start_after.instance <<
+    "]\", prefix=\"" << prefix <<
+    "\" num_entries=" << num_entries <<
+    ", list_versions=" << list_versions <<
+    ", attempt=" << attempt << dendl;
 
   RGWSI_RADOS::Pool index_pool;
   // key   - oid (for different shards if there is any)
@@ -8165,10 +8199,22 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
   }
 
   const uint32_t shard_count = oids.size();
-  const uint32_t num_entries_per_shard =
-    calc_ordered_bucket_list_per_shard(num_entries, shard_count);
+  uint32_t num_entries_per_shard;
+  if (attempt == 0) {
+    num_entries_per_shard =
+      calc_ordered_bucket_list_per_shard(num_entries, shard_count);
+  } else if (attempt <= 11) {
+    // we'll max out the exponential multiplication factor at 1024 (2<<10)
+    num_entries_per_shard =
+      std::min(num_entries,
+              (uint32_t(1 << (attempt - 1)) *
+               calc_ordered_bucket_list_per_shard(num_entries, shard_count)));
+  } else {
+    num_entries_per_shard = num_entries;
+  }
 
-  ldout(cct, 10) << __func__ << " request from each of " << shard_count <<
+  ldout(cct, 10) << "RGWRados::" << __func__ <<
+    " request from each of " << shard_count <<
     " shard(s) for " << num_entries_per_shard << " entries to get " <<
     num_entries << " total entries" << dendl;
 
@@ -8183,6 +8229,21 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
     return r;
   }
 
+  auto result_info =
+    [](const map<int, struct rgw_cls_list_ret>& m) -> std::string {
+      std::stringstream out;
+      out << "{ size:" << m.size() << ", entries:[";
+      for (const auto& i : m) {
+       out << " { " << i.first << ", " << i.second.dir.m.size() << " },";
+      }
+      out << "] }";
+      return out.str();
+    };
+
+  ldout(cct, 20) << "RGWRados::" << __func__ <<
+    " CLSRGWIssueBucketList() result=" <<
+    result_info(list_results) << dendl;
+
   // create a list of iterators that are used to iterate each shard
   vector<RGWRados::ent_map_t::iterator> vcurrents;
   vector<RGWRados::ent_map_t::iterator> vends;
@@ -8226,6 +8287,9 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
     const string& name = vcurrents[pos]->first;
     struct rgw_bucket_dir_entry& dirent = vcurrents[pos]->second;
 
+    ldout(cct, 20) << "RGWRados::" << __func__ << " currently processing " <<
+      dirent.key << " from shard " << pos << dendl;
+
     bool force_check =
       force_check_filter && force_check_filter(dirent.key.name);
 
@@ -8247,11 +8311,15 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
     } else {
       r = 0;
     }
+
     if (r >= 0) {
-      ldout(cct, 10) << "RGWRados::cls_bucket_list_ordered: got " <<
+      ldout(cct, 10) << "RGWRados::" << __func__ << ": got " <<
        dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
       m[name] = std::move(dirent);
       ++count;
+    } else {
+      ldout(cct, 10) << "RGWRados::" << __func__ << ": skipping " <<
+       dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
     }
 
     // refresh the candidates map
@@ -8267,7 +8335,7 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
     }
   } // while we haven't provided requested # of result entries
 
-  // suggest updates if there is any
+  // suggest updates if there are any
   for (auto& miter : updates) {
     if (miter.second.length()) {
       ObjectWriteOperation o;
@@ -8289,14 +8357,24 @@ int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
     }
   }
 
+  ldout(cct, 20) << "RGWRados::" << __func__ <<
+    ": returning, count=" << count << ", is_truncated=" << *is_truncated <<
+    dendl;
+
   if (*is_truncated && count < num_entries) {
     ldout(cct, 10) << "RGWRados::" << __func__ <<
       ": INFO requested " << num_entries << " entries but returning " <<
       count << ", which is truncated" << dendl;
   }
 
-  if (pos >= 0)
+  if (pos >= 0) {
     *last_entry = std::move((--vcurrents[pos])->first);
+    ldout(cct, 20) << "RGWRados::" << __func__ <<
+      ": returning, last_entry=" << *last_entry << dendl;
+  } else {
+    ldout(cct, 20) << "RGWRados::" << __func__ <<
+      ": returning, last_entry NOT SET" << dendl;
+  }
 
   return 0;
 }
index c3187b3c9d10550278b582fd925729f0c54c63b0..6ad236a70672c2dcec5519f2de1ed4900da5e9fb 100644 (file)
@@ -1378,12 +1378,13 @@ public:
   using check_filter_t = bool (*)(const std::string&);
 
   int cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
-                             int shard_id,
+                             const int shard_id,
                              const rgw_obj_index_key& start_after,
                              const string& prefix,
                              const string& delimiter,
-                             uint32_t num_entries,
-                             bool list_versions,
+                             const uint32_t num_entries,
+                             const bool list_versions,
+                             const uint16_t attempt, // 0 means ignore
                              ent_map_t& m,
                              bool* is_truncated,
                              bool* cls_filtered,