rgw: allow ordered bucket listing to work when many filtered out entries

author J. Eric Ivancich <ivancich@redhat.com>

Mon, 19 Jul 2021 18:24:11 +0000 (14:24 -0400)

committer Dan van der Ster <daniel.vanderster@cern.ch>

Tue, 1 Mar 2022 10:31:01 +0000 (11:31 +0100)
author J. Eric Ivancich <ivancich@redhat.com>
Mon, 19 Jul 2021 18:24:11 +0000 (14:24 -0400)
committer Dan van der Ster <daniel.vanderster@cern.ch>
Tue, 1 Mar 2022 10:31:01 +0000 (11:31 +0100)
diff --git a/src/cls/rgw/cls_rgw.cc b/src/cls/rgw/cls_rgw.cc

index 19926d5f5954ed41985ab75df67b9e18d599bfc2..10bc040ad68d8c8832b5e4ad47a8e9da769d0081 100644 (file)
--- a/src/cls/rgw/cls_rgw.cc
+++ b/src/cls/rgw/cls_rgw.cc
@@ -183,10 +183,10 @@ static int log_index_operation(cls_method_context_t hctx, cls_rgw_obj_key& obj_k
   * namespace".
   */
  static int get_obj_vals(cls_method_context_t hctx,
-                       const string& start,
-                       const string& filter_prefix,
+                       const std::string& start,
+                       const std::string& filter_prefix,
                          int num_entries,
-                       map<string, bufferlist> *pkeys,
+                       std::map<std::string, bufferlist> *pkeys,
                         bool *pmore)
  {
    int ret = cls_cxx_map_get_vals(hctx, start, filter_prefix,
@@ -199,7 +199,7 @@ static int get_obj_vals(cls_method_context_t hctx,
      return 0;
    }
  
-  auto last_element = pkeys->rbegin();
+  auto last_element = pkeys->crbegin();
    if ((unsigned char)last_element->first[0] < BI_PREFIX_CHAR) {
      /* if the first character of the last entry is less than the
       * prefix then all entries must preceed the "ugly namespace" and
@@ -208,11 +208,11 @@ static int get_obj_vals(cls_method_context_t hctx,
      return 0;
    }
  
-  auto first_element = pkeys->begin();
+  auto first_element = pkeys->cbegin();
    if ((unsigned char)first_element->first[0] > BI_PREFIX_CHAR) {
-    /* the first character of the last entry is in or after the "ugly
-     * namespace", so if the first character of the first entry
-     * follows the "ugly namespace" then all entries do and we're done
+    /* if the first character of the first entry is after the "ugly
+     * namespace" then all entries must follow the "ugly namespace"
+     * then all entries do and we're done
       */
      return 0;
    }
@@ -223,10 +223,10 @@ static int get_obj_vals(cls_method_context_t hctx,
     * outside the "ugly namespace"
     */
  
-  auto comp = [](const pair<string, bufferlist>& l, const string &r) {
+  auto comp = [](const pair<std::string, bufferlist>& l, const std::string &r) {
                 return l.first < r;
               };
-  string new_start = {static_cast<char>(BI_PREFIX_CHAR + 1)};
+  std::string new_start = {static_cast<char>(BI_PREFIX_CHAR + 1)};
  
    auto lower = pkeys->lower_bound(string{static_cast<char>(BI_PREFIX_CHAR)});
    auto upper = std::lower_bound(lower, pkeys->end(), new_start, comp);
@@ -236,11 +236,11 @@ static int get_obj_vals(cls_method_context_t hctx,
      return 0;
    }
  
-  if (pkeys->size() && new_start < pkeys->rbegin()->first) {
+  if (pkeys->size() && new_start < pkeys->crbegin()->first) {
      new_start = pkeys->rbegin()->first;
    }
  
-  map<string, bufferlist> new_keys;
+  std::map<std::string, bufferlist> new_keys;
  
    /* now get some more keys */
    ret = cls_cxx_map_get_vals(hctx, new_start, filter_prefix,
@@ -494,29 +494,42 @@ int rgw_bucket_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
      return rc;
    }
  
-  string start_after_key;   // key that we can start listing at, one of a)
-                            // sent in by caller, b) last item visited, or
-                            // c) when delimiter present, a key that will
-                            // move past the subdirectory
-  encode_list_index_key(hctx, op.start_obj, &start_after_key);
+  // some calls just want the header and request 0 entries
+  if (op.num_entries <= 0) {
+    ret.is_truncated = false;
+    encode(ret, *out);
+    return 0;
+  }
+
+  // key that we can start listing at, one of a) sent in by caller, b)
+  // last item visited, or c) when delimiter present, a key that will
+  // move past the subdirectory
+  std::string start_after_omap_key;
+  encode_list_index_key(hctx, op.start_obj, &start_after_omap_key);
+
+  // this is set whenenver start_after_omap_key is set to keep them in
+  // sync since this will be the returned marker when a marker is
+  // returned
+  cls_rgw_obj_key start_after_entry_key;
+
+  // last key stored in result, so if we have to call get_obj_vals
+  // multiple times, we do not add the overlap to result
+  std::string prev_omap_key;
  
-  string previous_key; // last key stored in result, so if we have to
-                      // call get_obj_vals multiple times, we do not
-                      // add the overlap to result
-  string previous_prefix_key; // last prefix_key stored in result, so
-                             // we can skip over entries with the
-                             // same prefix_key
+  // last prefix_key stored in result, so we can skip over entries
+  // with the same prefix_key
+  std::string prev_prefix_omap_key;
  
    bool done = false;   // whether we need to keep calling get_obj_vals
    bool more = true;    // output parameter of get_obj_vals
    bool has_delimiter = !op.delimiter.empty();
  
    if (has_delimiter &&
-      start_after_key > op.filter_prefix &&
-      boost::algorithm::ends_with(start_after_key, op.delimiter)) {
+      start_after_omap_key > op.filter_prefix &&
+      boost::algorithm::ends_with(start_after_omap_key, op.delimiter)) {
      // advance past all subdirectory entries if we start after a
      // subdirectory
-    start_after_key = cls_rgw_after_delim(start_after_key);
+    start_after_omap_key = cls_rgw_after_delim(start_after_omap_key);
    }
  
    for (int attempt = 0;
@@ -525,8 +538,12 @@ int rgw_bucket_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
          !done &&
          name_entry_map.size() < op.num_entries;
         ++attempt) {
-    map<string, bufferlist> keys;
-    rc = get_obj_vals(hctx, start_after_key, op.filter_prefix,
+    std::map<std::string, bufferlist> keys;
+
+    // note: get_obj_vals skips past the "ugly namespace" (i.e.,
+    // entries that start with the BI_PREFIX_CHAR), so no need to
+    // check for such entries
+    rc = get_obj_vals(hctx, start_after_omap_key, op.filter_prefix,
                       op.num_entries - name_entry_map.size(),
                       &keys, &more);
      if (rc < 0) {
@@ -538,13 +555,6 @@ int rgw_bucket_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
      done = keys.empty();
  
      for (auto kiter = keys.cbegin(); kiter != keys.cend(); ++kiter) {
-      if (!bi_is_plain_entry(kiter->first)) {
-       // we're done if we walked off the end of the objects area of
-       // the bucket index
-        done = true;
-        break;
-      }
-
        rgw_bucket_dir_entry entry;
        try {
         const bufferlist& entrybl = kiter->second;
@@ -556,7 +566,8 @@ int rgw_bucket_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
          return -EINVAL;
        }
  
-      start_after_key = kiter->first;
+      start_after_omap_key = kiter->first;
+      start_after_entry_key = entry.key;
        CLS_LOG(20, "%s: working on key=%s len=%zu",
               __func__, kiter->first.c_str(), kiter->first.size());
  
@@ -591,10 +602,10 @@ int rgw_bucket_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
            string prefix_key =
             key.name.substr(0, delim_pos + op.delimiter.length());
  
-         if (prefix_key == previous_prefix_key) {
+         if (prefix_key == prev_prefix_omap_key) {
             continue; // we've already added this;
           } else {
-           previous_prefix_key = prefix_key;
+           prev_prefix_omap_key = prefix_key;
           }
  
           if (name_entry_map.size() < op.num_entries) {
@@ -612,11 +623,12 @@ int rgw_bucket_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
           // make sure that if this is the last item added to the
           // result from this call to get_obj_vals, the next call will
           // skip past rest of "subdirectory"
-         start_after_key = cls_rgw_after_delim(prefix_key);
+         start_after_omap_key = cls_rgw_after_delim(prefix_key);
+         start_after_entry_key.set(start_after_omap_key);
  
-         // advance to past this subdirectory, but then back up one,
+         // advance past this subdirectory, but then back up one,
           // so the loop increment will put us in the right place
-         kiter = keys.lower_bound(start_after_key);
+         kiter = keys.lower_bound(start_after_omap_key);
           --kiter;
  
            continue;
@@ -627,9 +639,9 @@ int rgw_bucket_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
        }
  
        if (name_entry_map.size() < op.num_entries &&
-         kiter->first != previous_key) {
+         kiter->first != prev_omap_key) {
          name_entry_map[kiter->first] = entry;
-       previous_key = kiter->first;
+       prev_omap_key = kiter->first;
         CLS_LOG(20, "%s: got object entry %s[%s] num entries=%d\n",
                 __func__, key.name.c_str(), key.instance.c_str(),
                 int(name_entry_map.size()));
@@ -638,12 +650,22 @@ int rgw_bucket_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
    } // for (int attempt...
  
    ret.is_truncated = more && !done;
+  if (ret.is_truncated) {
+    ret.marker = start_after_entry_key;
+  }
    CLS_LOG(20, "%s: normal exit returning %ld entries, is_truncated=%d\n",
           __func__, ret.dir.m.size(), ret.is_truncated);
    encode(ret, *out);
-  return 0;
+
+  if (ret.is_truncated && name_entry_map.size() == 0) {
+    CLS_LOG(5, "%s: returning value RGWBIAdvanceAndRetryError\n", __func__);
+    return RGWBIAdvanceAndRetryError;
+  } else {
+    return 0;
+  }
  } // rgw_bucket_list
  
+
  static int check_index(cls_method_context_t hctx,
                        rgw_bucket_dir_header *existing_header,
                        rgw_bucket_dir_header *calc_header)
diff --git a/src/cls/rgw/cls_rgw_client.cc b/src/cls/rgw/cls_rgw_client.cc

index 2f1a9164f33b3d615b8d4ccb063e500793dea707..da0fd9d0d572ef3f8b2183649ddcaa451411c771 100644 (file)
--- a/src/cls/rgw/cls_rgw_client.cc
+++ b/src/cls/rgw/cls_rgw_client.cc
@@ -348,12 +348,34 @@ static bool issue_bucket_list_op(librados::IoCtx& io_ctx,
  
  int CLSRGWIssueBucketList::issue_op(const int shard_id, const string& oid)
  {
+  // set the marker depending on whether we've already queried this
+  // shard and gotten a RGWBIAdvanceAndRetryError (defined
+  // constant) return value; if we have use the marker in the return
+  // to advance the search, otherwise use the marker passed in by the
+  // caller
+  cls_rgw_obj_key marker;
+  auto iter = result.find(shard_id);
+  if (iter != result.end()) {
+    marker = iter->second.marker;
+  } else {
+    marker = start_obj;
+  }
+
    return issue_bucket_list_op(io_ctx, shard_id, oid,
-                             start_obj, filter_prefix, delimiter,
+                             marker, filter_prefix, delimiter,
                               num_entries, list_versions, &manager,
                               &result[shard_id]);
  }
  
+
+void CLSRGWIssueBucketList::reset_container(std::map<int, std::string>& objs)
+{
+  objs_container.swap(objs);
+  iter = objs_container.begin();
+  objs.clear();
+}
+
+
  void cls_rgw_remove_obj(librados::ObjectWriteOperation& o, list<string>& keep_attr_prefixes)
  {
    bufferlist in;
diff --git a/src/cls/rgw/cls_rgw_client.h b/src/cls/rgw/cls_rgw_client.h

index 7fa8e52e91cca0665d42af0e6529c097974b86bb..78c559d71eee1bc21d4f840c37152ad6dc4f4854 100644 (file)
--- a/src/cls/rgw/cls_rgw_client.h
+++ b/src/cls/rgw/cls_rgw_client.h
@@ -14,6 +14,7 @@
  #include "common/ceph_time.h"
  #include "common/ceph_mutex.h"
  
+
  // Forward declaration
  class BucketIndexAioManager;
  /*
@@ -426,6 +427,7 @@ class CLSRGWIssueBucketList : public CLSRGWConcurrentIO {
  
  protected:
    int issue_op(int shard_id, const std::string& oid) override;
+  void reset_container(std::map<int, std::string>& objs) override;
  
  public:
    CLSRGWIssueBucketList(librados::IoCtx& io_ctx,
diff --git a/src/cls/rgw/cls_rgw_ops.h b/src/cls/rgw/cls_rgw_ops.h

index 7dc3404ff0f1a8ef1e96a5dd2640d72a5c955d8e..44839addde66c3b15fc80276eebe4f59ce8f9513 100644 (file)
--- a/src/cls/rgw/cls_rgw_ops.h
+++ b/src/cls/rgw/cls_rgw_ops.h
@@ -428,6 +428,12 @@ struct rgw_cls_list_ret {
    rgw_bucket_dir dir;
    bool is_truncated;
  
+  // if is_truncated is true, starting marker for next iteration; this
+  // is necessary as it's possible after maximum number of tries we
+  // still might have zero entries to return, in which case we have to
+  // at least move the ball foward
+  cls_rgw_obj_key marker;
+
    // cls_filtered is not transmitted; it is assumed true for versions
    // on/after 3 and false for prior versions; this allows the rgw
    // layer to know when an older osd (cls) does not do the filtering
@@ -439,16 +445,20 @@ struct rgw_cls_list_ret {
    {}
  
    void encode(ceph::buffer::list &bl) const {
-    ENCODE_START(3, 2, bl);
+    ENCODE_START(4, 2, bl);
      encode(dir, bl);
      encode(is_truncated, bl);
+    encode(marker, bl);
      ENCODE_FINISH(bl);
    }
    void decode(ceph::buffer::list::const_iterator &bl) {
-    DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
+    DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
      decode(dir, bl);
      decode(is_truncated, bl);
      cls_filtered = struct_v >= 3;
+    if (struct_v >= 4) {
+      decode(marker, bl);
+    }
      DECODE_FINISH(bl);
    }
    void dump(ceph::Formatter *f) const;
diff --git a/src/cls/rgw/cls_rgw_types.h b/src/cls/rgw/cls_rgw_types.h

index f70c108c8227315b9edc0cad557533beedc6e7de..969c24e251951e5cb700c9d919b2c16e953f1907 100644 (file)
--- a/src/cls/rgw/cls_rgw_types.h
+++ b/src/cls/rgw/cls_rgw_types.h
@@ -335,6 +335,7 @@ struct rgw_bucket_entry_ver {
  };
  WRITE_CLASS_ENCODER(rgw_bucket_entry_ver)
  
+
  struct cls_rgw_obj_key {
    std::string name;
    std::string instance;
@@ -353,6 +354,7 @@ struct cls_rgw_obj_key {
  
    void set(const std::string& _name) {
      name = _name;
+    instance.clear();
    }
  
    bool operator==(const cls_rgw_obj_key& k) const {
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc

index 2ee887272a7f0c959fa6f33b4c8841398041a035..440ef288592533bf6ab6fea20edf316c5ecfa6c9 100644 (file)
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -8493,7 +8493,7 @@ int RGWRados::cls_bucket_list_ordered(const DoutPrefixProvider *dpp,
    // (key=candidate, value=index into results_trackers); as we consume
    // entries from shards, we replace them with the next entries in the
    // shards until we run out
-  map<string, size_t> candidates;
+  std::map<std::string, size_t> candidates;
    size_t tracker_idx = 0;
    for (auto& t : results_trackers) {
      // it's important that the values in the map refer to the index
author	J. Eric Ivancich <ivancich@redhat.com>
	Mon, 19 Jul 2021 18:24:11 +0000 (14:24 -0400)
committer	Dan van der Ster <daniel.vanderster@cern.ch>
	Tue, 1 Mar 2022 10:31:01 +0000 (11:31 +0100)
src/cls/rgw/cls_rgw.cc		patch \| blob \| history
src/cls/rgw/cls_rgw_client.cc		patch \| blob \| history
src/cls/rgw/cls_rgw_client.h		patch \| blob \| history
src/cls/rgw/cls_rgw_ops.h		patch \| blob \| history
src/cls/rgw/cls_rgw_types.h		patch \| blob \| history
src/rgw/rgw_rados.cc		patch \| blob \| history