rgw: fix rgw versioned bucket stat accounting during reshard and check index

author Cory Snyder <csnyder@1111systems.com>

Thu, 7 Sep 2023 17:23:14 +0000 (17:23 +0000)

committer Cory Snyder <csnyder@1111systems.com>

Fri, 15 Sep 2023 18:46:27 +0000 (18:46 +0000)
author Cory Snyder <csnyder@1111systems.com>
Thu, 7 Sep 2023 17:23:14 +0000 (17:23 +0000)
committer Cory Snyder <csnyder@1111systems.com>
Fri, 15 Sep 2023 18:46:27 +0000 (18:46 +0000)
diff --git a/PendingReleaseNotes b/PendingReleaseNotes

index 3d865704778dcc202b36cd4a1eccdba380d37f72..7ee2bed4ab5fa9e5954c8b55bf25fc60a00562f3 100644 (file)
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -17,6 +17,39 @@
  * CephFS: The period specifier ``m`` now implies minutes and the period specifier
    ``M`` now implies months. This has been made consistent with the rest
    of the system.
+* RGW: New tools have been added to radosgw-admin for identifying and
+  correcting issues with versioned bucket indexes. Historical bugs with the
+  versioned bucket index transaction workflow made it possible for the index
+  to accumulate extraneous "book-keeping" olh entries and plain placeholder
+  entries. In some specific scenarios where clients made concurrent requests
+  referencing the same object key, it was likely that a lot of extra index
+  entries would accumulate. When a significant number of these entries are
+  present in a single bucket index shard, they can cause high bucket listing
+  latencies and lifecycle processing failures. To check whether a versioned
+  bucket has unnecessary olh entries, users can now run ``radosgw-admin
+  bucket check olh``. If the ``--fix`` flag is used, the extra entries will
+  be safely removed. A distinct issue from the one described thus far, it is
+  also possible that some versioned buckets are maintaining extra unlinked
+  objects that are not listable from the S3/ Swift APIs. These extra objects
+  are typically a result of PUT requests that exited abnormally, in the middle
+  of a bucket index transaction - so the client would not have received a
+  successful response. Bugs in prior releases made these unlinked objects easy
+  to reproduce with any PUT request that was made on a bucket that was actively
+  resharding. Besides the extra space that these hidden, unlinked objects
+  consume, there can be another side effect in certain scenarios, caused by
+  the nature of the failure mode that produced them, where a client of a bucket
+  that was a victim of this bug may find the object associated with the key to
+  be in an inconsistent state. To check whether a versioned bucket has unlinked
+  entries, users can now run ``radosgw-admin bucket check unlinked``. If the
+  ``--fix`` flag is used, the unlinked objects will be safely removed. Finally,
+  a third issue made it possible for versioned bucket index stats to be
+  accounted inaccurately. The tooling for recalculating versioned bucket stats
+  also had a bug, and was not previously capable of fixing these inaccuracies.
+  This release resolves those issues and users can now expect that the existing
+  ``radosgw-admin bucket check`` command will produce correct results. We
+  recommend that users with versioned buckets, especially those that existed
+  on prior releases, use these new tools to check whether their buckets are
+  affected and to clean them up accordingly.
  
  >=18.0.0
  
diff --git a/src/cls/rgw/cls_rgw.cc b/src/cls/rgw/cls_rgw.cc

index 8cc43020e7288dc36c86a1a89b84e42318e7704c..046c063bbfa3a79782867fa0df873fc9525fa19b 100644 (file)
--- a/src/cls/rgw/cls_rgw.cc
+++ b/src/cls/rgw/cls_rgw.cc
@@ -676,77 +676,6 @@ int rgw_bucket_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
    }
  } // rgw_bucket_list
  
-
-static int check_index(cls_method_context_t hctx,
-                      rgw_bucket_dir_header *existing_header,
-                      rgw_bucket_dir_header *calc_header)
-{
-  int rc = read_bucket_header(hctx, existing_header);
-  if (rc < 0) {
-    CLS_LOG(1, "ERROR: check_index(): failed to read header\n");
-    return rc;
-  }
-
-  calc_header->tag_timeout = existing_header->tag_timeout;
-  calc_header->ver = existing_header->ver;
-  calc_header->syncstopped = existing_header->syncstopped;
-
-  map<string, bufferlist> keys;
-  string start_obj;
-  string filter_prefix;
-
-#define CHECK_CHUNK_SIZE 1000
-  bool done = false;
-  bool more;
-
-  do {
-    rc = get_obj_vals(hctx, start_obj, filter_prefix, CHECK_CHUNK_SIZE, &keys, &more);
-    if (rc < 0)
-      return rc;
-
-    for (auto kiter = keys.begin(); kiter != keys.end(); ++kiter) {
-      if (!bi_is_plain_entry(kiter->first)) {
-        done = true;
-        break;
-      }
-
-      rgw_bucket_dir_entry entry;
-      auto eiter = kiter->second.cbegin();
-      try {
-        decode(entry, eiter);
-      } catch (ceph::buffer::error& err) {
-        CLS_LOG(1, "ERROR: rgw_bucket_list(): failed to decode entry, key=%s", kiter->first.c_str());
-        return -EIO;
-      }
-      if (entry.exists) {
-        rgw_bucket_category_stats& stats = calc_header->stats[entry.meta.category];
-        stats.num_entries++;
-        stats.total_size += entry.meta.accounted_size;
-        stats.total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size);
-        stats.actual_size += entry.meta.size;
-      }
-      
-      start_obj = kiter->first;
-    }
-  } while (keys.size() == CHECK_CHUNK_SIZE && !done);
-
-  return 0;
-}
-
-int rgw_bucket_check_index(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
-{
-  CLS_LOG(10, "entered %s", __func__);
-  rgw_cls_check_index_ret ret;
-
-  int rc = check_index(hctx, &ret.existing_header, &ret.calculated_header);
-  if (rc < 0)
-    return rc;
-
-  encode(ret, *out);
-
-  return 0;
-}
-
  static int write_bucket_header(cls_method_context_t hctx, rgw_bucket_dir_header *header)
  {
    header->ver++;
@@ -757,18 +686,6 @@ static int write_bucket_header(cls_method_context_t hctx, rgw_bucket_dir_header
  }
  
  
-int rgw_bucket_rebuild_index(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
-{
-  CLS_LOG(10, "entered %s", __func__);
-  rgw_bucket_dir_header existing_header;
-  rgw_bucket_dir_header calc_header;
-  int rc = check_index(hctx, &existing_header, &calc_header);
-  if (rc < 0)
-    return rc;
-
-  return write_bucket_header(hctx, &calc_header);
-}
-
  int rgw_bucket_update_stats(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
  {
    CLS_LOG(10, "entered %s", __func__);
@@ -3136,6 +3053,115 @@ static int list_olh_entries(cls_method_context_t hctx,
    return count;
  }
  
+static int check_index(cls_method_context_t hctx,
+                      rgw_bucket_dir_header *existing_header,
+                      rgw_bucket_dir_header *calc_header)
+{
+  int rc = read_bucket_header(hctx, existing_header);
+  if (rc < 0) {
+    CLS_LOG(1, "ERROR: check_index(): failed to read header\n");
+    return rc;
+  }
+
+  calc_header->tag_timeout = existing_header->tag_timeout;
+  calc_header->ver = existing_header->ver;
+  calc_header->syncstopped = existing_header->syncstopped;
+
+  std::list<rgw_cls_bi_entry> entries;
+  string start_obj;
+  string filter_prefix;
+
+#define CHECK_CHUNK_SIZE 1000
+  bool more;
+
+  do {
+    rc = list_plain_entries(hctx, filter_prefix, start_obj, CHECK_CHUNK_SIZE, &entries, &more);
+    if (rc < 0) {
+      return rc;
+    }
+
+    for (const auto & bientry : entries) {
+      rgw_bucket_dir_entry entry;
+      auto diter = bientry.data.cbegin();
+      try {
+        decode(entry, diter);
+      } catch (ceph::buffer::error& err) {
+        CLS_LOG(1, "ERROR:check_index(): failed to decode entry, key=%s", bientry.idx.c_str());
+        return -EIO;
+      }
+
+      if (entry.exists && entry.key.instance.empty()) {
+        rgw_bucket_category_stats& stats = calc_header->stats[entry.meta.category];
+        stats.num_entries++;
+        stats.total_size += entry.meta.accounted_size;
+        stats.total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size);
+        stats.actual_size += entry.meta.size;
+      }
+      start_obj = bientry.idx;
+    }
+    entries.clear();
+  } while (more);
+
+  start_obj = "";
+  do {
+    rc = list_instance_entries(hctx, filter_prefix, start_obj, CHECK_CHUNK_SIZE, &entries, &more);
+    if (rc < 0) {
+      return rc;
+    }
+
+    for (const auto & bientry : entries) {
+      rgw_bucket_dir_entry entry;
+      auto diter = bientry.data.cbegin();
+      try {
+        decode(entry, diter);
+      } catch (ceph::buffer::error& err) {
+        CLS_LOG(1, "ERROR:check_index(): failed to decode entry, key=%s", bientry.idx.c_str());
+        return -EIO;
+      }
+
+      if (entry.exists) {
+        rgw_bucket_category_stats& stats = calc_header->stats[entry.meta.category];
+        stats.num_entries++;
+        stats.total_size += entry.meta.accounted_size;
+        stats.total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size);
+        stats.actual_size += entry.meta.size;
+      }
+      start_obj = bientry.idx;
+    }
+    entries.clear();
+  } while (more);
+
+  return 0;
+}
+
+int rgw_bucket_rebuild_index(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+{
+  CLS_LOG(10, "entered %s", __func__);
+  rgw_bucket_dir_header existing_header;
+  rgw_bucket_dir_header calc_header;
+  int rc = check_index(hctx, &existing_header, &calc_header);
+  if (rc < 0)
+    return rc;
+
+  return write_bucket_header(hctx, &calc_header);
+}
+
+
+int rgw_bucket_check_index(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+{
+  CLS_LOG(10, "entered %s", __func__);
+  rgw_cls_check_index_ret ret;
+
+  int rc = check_index(hctx, &ret.existing_header, &ret.calculated_header);
+  if (rc < 0)
+    return rc;
+
+  encode(ret, *out);
+
+  return 0;
+}
+
+
  /* Lists all the entries that appear in a bucket index listing.
   *
   * It may not be obvious why this function calls three other "segment"
diff --git a/src/cls/rgw/cls_rgw_types.cc b/src/cls/rgw/cls_rgw_types.cc

index 4310f4cd4a4b9cb7900768182dcb3178fb7de2d3..7fb5d97c6d4e68e63a83fb215731e26d6320c35f 100644 (file)
--- a/src/cls/rgw/cls_rgw_types.cc
+++ b/src/cls/rgw/cls_rgw_types.cc
@@ -385,39 +385,31 @@ bool rgw_cls_bi_entry::get_info(cls_rgw_obj_key *key,
                                  RGWObjCategory *category,
                                  rgw_bucket_category_stats *accounted_stats)
  {
-  bool account = false;
-  auto iter = data.cbegin();
    using ceph::decode;
-  switch (type) {
-    case BIIndexType::Plain:
-        account = true;
-        // NO BREAK; falls through to case InstanceIdx:
-    case BIIndexType::Instance:
-      {
-        rgw_bucket_dir_entry entry;
-        decode(entry, iter);
-        account = (account && entry.exists);
-        *key = entry.key;
-        *category = entry.meta.category;
-        accounted_stats->num_entries++;
-        accounted_stats->total_size += entry.meta.accounted_size;
-        accounted_stats->total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size);
-        accounted_stats->actual_size += entry.meta.size;
-      }
-      break;
-    case BIIndexType::OLH:
-      {
-        rgw_bucket_olh_entry entry;
-        decode(entry, iter);
-        *key = entry.key;
-      }
-      break;
-    default:
-      break;
+  auto iter = data.cbegin();
+  if (type == BIIndexType::OLH) {
+    rgw_bucket_olh_entry entry;
+    decode(entry, iter);
+    *key = entry.key;
+    return false;
    }
  
-  return account;
+  rgw_bucket_dir_entry entry;
+  decode(entry, iter);
+  *key = entry.key;
+  *category = entry.meta.category;
+  accounted_stats->num_entries++;
+  accounted_stats->total_size += entry.meta.accounted_size;
+  accounted_stats->total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size);
+  accounted_stats->actual_size += entry.meta.size;
+  if (type == BIIndexType::Plain) {
+    return entry.exists && entry.key.instance.empty();
+  } else if (type == BIIndexType::Instance) {
+    return entry.exists;
+  }
+  return false;
  }
+
  void rgw_cls_bi_entry::generate_test_instances(list<rgw_cls_bi_entry*>& o)
  {
    using ceph::encode;
author	Cory Snyder <csnyder@1111systems.com>
	Thu, 7 Sep 2023 17:23:14 +0000 (17:23 +0000)
committer	Cory Snyder <csnyder@1111systems.com>
	Fri, 15 Sep 2023 18:46:27 +0000 (18:46 +0000)
PendingReleaseNotes		patch \| blob \| history
src/cls/rgw/cls_rgw.cc		patch \| blob \| history
src/cls/rgw/cls_rgw_types.cc		patch \| blob \| history