From 8b1fe1edd9d4b2625d963db6f38687c1b38f868f Mon Sep 17 00:00:00 2001
From: Cory Snyder
Date: Thu, 7 Sep 2023 17:23:14 +0000
Subject: [PATCH] rgw: fix rgw versioned bucket stat accounting during reshard
 and check index

Fixes: https://tracker.ceph.com/issues/62760
Signed-off-by: Cory Snyder
(cherry picked from commit 152aadb71b61c53a4832a1c8cf82fce3d64b68d1)

Conflicts:
	src/cls/rgw/cls_rgw.cc
	src/cls/rgw/cls_rgw_types.cc

Cherry-pick notes:
- cls_rgw_types had an extra generate_test_instances method on main
- cls_rgw.cc had minor changes within check_index on main
---
 PendingReleaseNotes          |  42 ++++++++
 src/cls/rgw/cls_rgw.cc       | 190 ++++++++++++++++++++---------------
 src/cls/rgw/cls_rgw_types.cc |  49 ++++-----
 3 files changed, 171 insertions(+), 110 deletions(-)

diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 50a6089ccd12c..471b57fa23556 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -51,6 +51,48 @@
   For multi-site deployments that make any use of Server-Side Encryption, we
   recommend running this command against every bucket in every zone after all
   zones have upgraded.
+* CEPHFS: MDS evicts clients that are not advancing their request tids, which
+  causes a large buildup of session metadata and results in the MDS going
+  read-only due to the RADOS operation exceeding the size threshold. The
+  `mds_session_metadata_threshold` config option controls the maximum size
+  that the (encoded) session metadata can grow to.
+* CephFS: For clusters with multiple CephFS file systems, all the snap-schedule
+  commands now expect the '--fs' argument.
+* CephFS: The period specifier ``m`` now implies minutes and the period
+  specifier ``M`` now implies months. This has been made consistent with the
+  rest of the system.
+* RGW: New tools have been added to radosgw-admin for identifying and
+  correcting issues with versioned bucket indexes. Historical bugs with the
+  versioned bucket index transaction workflow made it possible for the index
+  to accumulate extraneous "book-keeping" olh entries and plain placeholder
+  entries. In some specific scenarios where clients made concurrent requests
+  referencing the same object key, it was likely that many extra index
+  entries would accumulate. When a significant number of these entries are
+  present in a single bucket index shard, they can cause high bucket listing
+  latencies and lifecycle processing failures. To check whether a versioned
+  bucket has unnecessary olh entries, users can now run ``radosgw-admin
+  bucket check olh``. If the ``--fix`` flag is used, the extra entries will
+  be safely removed. Distinct from the issue described thus far, it is also
+  possible that some versioned buckets are maintaining extra unlinked
+  objects that are not listable from the S3/Swift APIs. These extra objects
+  are typically a result of PUT requests that exited abnormally in the
+  middle of a bucket index transaction, so the client would not have
+  received a successful response. Bugs in prior releases made these unlinked
+  objects easy to reproduce with any PUT request made on a bucket that was
+  actively resharding. Besides the extra space that these hidden, unlinked
+  objects consume, there can be another side effect in certain scenarios,
+  caused by the nature of the failure mode that produced them, where a
+  client of an affected bucket may find the object associated with the key
+  to be in an inconsistent state. To check whether a versioned bucket has
+  unlinked entries, users can now run ``radosgw-admin bucket check
+  unlinked``. If the ``--fix`` flag is used, the unlinked objects will be
+  safely removed. Finally, a third issue made it possible for versioned
+  bucket index stats to be accounted inaccurately. The tooling for
+  recalculating versioned bucket stats also had a bug and was not previously
+  capable of fixing these inaccuracies. This release resolves those issues,
+  and users can now expect the existing ``radosgw-admin bucket check``
+  command to produce correct results. We recommend that users with versioned
+  buckets, especially those that existed on prior releases, use these new
+  tools to check whether their buckets are affected and to clean them up
+  accordingly.

 >=18.0.0

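For orientation, the commands named in the release note above would be run
along the following lines. This is a sketch only: the bucket name is a
placeholder and the exact flag spellings are assumptions to be confirmed
against radosgw-admin's help output, though ``bucket check olh``,
``bucket check unlinked``, ``bucket check``, and ``--fix`` are all named in
the note itself.

    # Report extraneous olh index entries; add --fix to remove them safely.
    radosgw-admin bucket check olh --bucket=mybucket
    radosgw-admin bucket check olh --bucket=mybucket --fix

    # Likewise for unlinked objects left behind by aborted PUTs.
    radosgw-admin bucket check unlinked --bucket=mybucket --fix

    # The existing stats checker, which this patch makes accurate for
    # versioned buckets.
    radosgw-admin bucket check --bucket=mybucket --fix
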
diff --git a/src/cls/rgw/cls_rgw.cc b/src/cls/rgw/cls_rgw.cc
index 67e213894564d..a21e613e62bc4 100644
--- a/src/cls/rgw/cls_rgw.cc
+++ b/src/cls/rgw/cls_rgw.cc
@@ -667,75 +667,6 @@ int rgw_bucket_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
   }
 } // rgw_bucket_list
-
-static int check_index(cls_method_context_t hctx,
-                       rgw_bucket_dir_header *existing_header,
-                       rgw_bucket_dir_header *calc_header)
-{
-  int rc = read_bucket_header(hctx, existing_header);
-  if (rc < 0) {
-    CLS_LOG(1, "ERROR: check_index(): failed to read header\n");
-    return rc;
-  }
-
-  calc_header->tag_timeout = existing_header->tag_timeout;
-  calc_header->ver = existing_header->ver;
-  calc_header->syncstopped = existing_header->syncstopped;
-
-  map<string, bufferlist> keys;
-  string start_obj;
-  string filter_prefix;
-
-#define CHECK_CHUNK_SIZE 1000
-  bool done = false;
-  bool more;
-
-  do {
-    rc = get_obj_vals(hctx, start_obj, filter_prefix, CHECK_CHUNK_SIZE, &keys, &more);
-    if (rc < 0)
-      return rc;
-
-    for (auto kiter = keys.begin(); kiter != keys.end(); ++kiter) {
-      if (!bi_is_plain_entry(kiter->first)) {
-        done = true;
-        break;
-      }
-
-      rgw_bucket_dir_entry entry;
-      auto eiter = kiter->second.cbegin();
-      try {
-        decode(entry, eiter);
-      } catch (ceph::buffer::error& err) {
-        CLS_LOG(1, "ERROR: rgw_bucket_list(): failed to decode entry, key=%s", kiter->first.c_str());
-        return -EIO;
-      }
-      rgw_bucket_category_stats& stats = calc_header->stats[entry.meta.category];
-      stats.num_entries++;
-      stats.total_size += entry.meta.accounted_size;
-      stats.total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size);
-      stats.actual_size += entry.meta.size;
-
-      start_obj = kiter->first;
-    }
-  } while (keys.size() == CHECK_CHUNK_SIZE && !done);
-
-  return 0;
-}
-
-int rgw_bucket_check_index(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
-{
-  CLS_LOG(10, "entered %s", __func__);
-  rgw_cls_check_index_ret ret;
-
-  int rc = check_index(hctx, &ret.existing_header, &ret.calculated_header);
-  if (rc < 0)
-    return rc;
-
-  encode(ret, *out);
-
-  return 0;
-}
-
 static int write_bucket_header(cls_method_context_t hctx, rgw_bucket_dir_header *header)
 {
   header->ver++;
@@ -746,18 +677,6 @@ static int write_bucket_header(cls_method_context_t hctx, rgw_bucket_dir_header
 }
 
-int rgw_bucket_rebuild_index(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
-{
-  CLS_LOG(10, "entered %s", __func__);
-  rgw_bucket_dir_header existing_header;
-  rgw_bucket_dir_header calc_header;
-  int rc = check_index(hctx, &existing_header, &calc_header);
-  if (rc < 0)
-    return rc;
-
-  return write_bucket_header(hctx, &calc_header);
-}
-
 int rgw_bucket_update_stats(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
 {
   CLS_LOG(10, "entered %s", __func__);
@@ -2936,6 +2855,115 @@ static int list_olh_entries(cls_method_context_t hctx,
   return count;
 }
 
+static int check_index(cls_method_context_t hctx,
+                       rgw_bucket_dir_header *existing_header,
+                       rgw_bucket_dir_header *calc_header)
+{
+  int rc = read_bucket_header(hctx, existing_header);
+  if (rc < 0) {
+    CLS_LOG(1, "ERROR: check_index(): failed to read header\n");
+    return rc;
+  }
+
+  calc_header->tag_timeout = existing_header->tag_timeout;
+  calc_header->ver = existing_header->ver;
+  calc_header->syncstopped = existing_header->syncstopped;
+
+  std::list<rgw_cls_bi_entry> entries;
+  string start_obj;
+  string filter_prefix;
+
+#define CHECK_CHUNK_SIZE 1000
+  bool more;
+
+  do {
+    rc = list_plain_entries(hctx, filter_prefix, start_obj, CHECK_CHUNK_SIZE, &entries, &more);
+    if (rc < 0) {
+      return rc;
+    }
+
+    for (const auto & bientry : entries) {
+      rgw_bucket_dir_entry entry;
+      auto diter = bientry.data.cbegin();
+      try {
+        decode(entry, diter);
+      } catch (ceph::buffer::error& err) {
+        CLS_LOG(1, "ERROR: check_index(): failed to decode entry, key=%s", bientry.idx.c_str());
+        return -EIO;
+      }
+
+      if (entry.exists && entry.key.instance.empty()) {
+        rgw_bucket_category_stats& stats = calc_header->stats[entry.meta.category];
+        stats.num_entries++;
+        stats.total_size += entry.meta.accounted_size;
+        stats.total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size);
+        stats.actual_size += entry.meta.size;
+      }
+      start_obj = bientry.idx;
+    }
+    entries.clear();
+  } while (more);
+
+  start_obj = "";
+  do {
+    rc = list_instance_entries(hctx, filter_prefix, start_obj, CHECK_CHUNK_SIZE, &entries, &more);
+    if (rc < 0) {
+      return rc;
+    }
+
+    for (const auto & bientry : entries) {
+      rgw_bucket_dir_entry entry;
+      auto diter = bientry.data.cbegin();
+      try {
+        decode(entry, diter);
+      } catch (ceph::buffer::error& err) {
+        CLS_LOG(1, "ERROR: check_index(): failed to decode entry, key=%s", bientry.idx.c_str());
+        return -EIO;
+      }
+
+      if (entry.exists) {
+        rgw_bucket_category_stats& stats = calc_header->stats[entry.meta.category];
+        stats.num_entries++;
+        stats.total_size += entry.meta.accounted_size;
+        stats.total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size);
+        stats.actual_size += entry.meta.size;
+      }
+      start_obj = bientry.idx;
+    }
+    entries.clear();
+  } while (more);
+
+  return 0;
+}
+
+int rgw_bucket_rebuild_index(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+{
+  CLS_LOG(10, "entered %s", __func__);
+  rgw_bucket_dir_header existing_header;
+  rgw_bucket_dir_header calc_header;
+  int rc = check_index(hctx, &existing_header, &calc_header);
+  if (rc < 0)
+    return rc;
+
+  return write_bucket_header(hctx, &calc_header);
+}
+
+
+int rgw_bucket_check_index(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+{
+  CLS_LOG(10, "entered %s", __func__);
+  rgw_cls_check_index_ret ret;
+
+  int rc = check_index(hctx, &ret.existing_header, &ret.calculated_header);
+  if (rc < 0)
+    return rc;
+
+  encode(ret, *out);
+
+  return 0;
+}
+
+
 /* Lists all the entries that appear in a bucket index listing.
  *
  * It may not be obvious why this function calls three other "segment"
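A note on the accounting rule the rewritten check_index applies: in a
versioned bucket, the current version of an object is represented both by a
plain entry (whose decoded key carries a non-empty instance) and by an
instance entry, so counting both would double-count the object. The
standalone sketch below isolates that rule with simplified stand-in types;
it is illustrative only and does not use the actual cls_rgw headers.

    #include <cstdint>
    #include <map>
    #include <string>

    // Simplified stand-ins for rgw_bucket_dir_entry and
    // rgw_bucket_category_stats.
    struct DirEntry {
      std::string instance;        // empty for a truly non-versioned object
      bool exists = false;
      int category = 0;
      uint64_t accounted_size = 0; // user-visible bytes
      uint64_t size = 0;           // bytes on disk
    };

    struct CategoryStats {
      uint64_t num_entries = 0;
      uint64_t total_size = 0;
      uint64_t actual_size = 0;
    };

    // Mirrors the predicate in the new check_index (and in the rewritten
    // rgw_cls_bi_entry::get_info): plain entries count only when they exist
    // AND carry no version instance; instance entries count whenever they
    // exist; olh entries never count.
    bool counts_toward_stats(const DirEntry& e, bool is_plain) {
      if (!e.exists) return false;
      return is_plain ? e.instance.empty() : true;
    }

    void accumulate(std::map<int, CategoryStats>& stats,
                    const DirEntry& e, bool is_plain) {
      if (!counts_toward_stats(e, is_plain)) return;
      CategoryStats& s = stats[e.category];
      s.num_entries++;
      s.total_size += e.accounted_size;
      s.actual_size += e.size;
    }

    int main() {
      std::map<int, CategoryStats> stats;
      accumulate(stats, {"", true, 0, 4096, 4000}, /*is_plain=*/true);
      accumulate(stats, {"v1", true, 0, 4096, 4000}, /*is_plain=*/true);
      // Only the first entry counts: the second is the plain shadow of a
      // versioned object and will be accounted via its instance entry.
      return stats[0].num_entries == 1 ? 0 : 1;
    }
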
diff --git a/src/cls/rgw/cls_rgw_types.cc b/src/cls/rgw/cls_rgw_types.cc
index 54532c895668e..3ed82a1decb44 100644
--- a/src/cls/rgw/cls_rgw_types.cc
+++ b/src/cls/rgw/cls_rgw_types.cc
@@ -371,38 +371,29 @@ bool rgw_cls_bi_entry::get_info(cls_rgw_obj_key *key,
                                 RGWObjCategory *category,
                                 rgw_bucket_category_stats *accounted_stats)
 {
-  bool account = false;
-  auto iter = data.cbegin();
   using ceph::decode;
-  switch (type) {
-    case BIIndexType::Plain:
-      account = true;
-      // NO BREAK; falls through to case InstanceIdx:
-    case BIIndexType::Instance:
-      {
-        rgw_bucket_dir_entry entry;
-        decode(entry, iter);
-        account = (account && entry.exists);
-        *key = entry.key;
-        *category = entry.meta.category;
-        accounted_stats->num_entries++;
-        accounted_stats->total_size += entry.meta.accounted_size;
-        accounted_stats->total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size);
-        accounted_stats->actual_size += entry.meta.size;
-      }
-      break;
-    case BIIndexType::OLH:
-      {
-        rgw_bucket_olh_entry entry;
-        decode(entry, iter);
-        *key = entry.key;
-      }
-      break;
-    default:
-      break;
+  auto iter = data.cbegin();
+  if (type == BIIndexType::OLH) {
+    rgw_bucket_olh_entry entry;
+    decode(entry, iter);
+    *key = entry.key;
+    return false;
   }
-  return account;
+  rgw_bucket_dir_entry entry;
+  decode(entry, iter);
+  *key = entry.key;
+  *category = entry.meta.category;
+  accounted_stats->num_entries++;
+  accounted_stats->total_size += entry.meta.accounted_size;
+  accounted_stats->total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size);
+  accounted_stats->actual_size += entry.meta.size;
+  if (type == BIIndexType::Plain) {
+    return entry.exists && entry.key.instance.empty();
+  } else if (type == BIIndexType::Instance) {
+    return entry.exists;
+  }
+  return false;
 }
 
 void rgw_bucket_olh_entry::dump(Formatter *f) const
-- 
2.39.5
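A closing illustration of the iteration pattern the new check_index uses:
both of its loops page through the index in CHECK_CHUNK_SIZE batches,
resuming each batch after the last key processed, and terminate on the
listing helper's 'more' flag rather than inferring completion from the
returned count (the old code's `keys.size() == CHECK_CHUNK_SIZE` test). The
compilable toy below mimics that contract with an in-memory key list;
list_page and its behavior are illustrative, not the cls_rgw API.

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <vector>

    // Stand-in for a bucket index shard's ordered key space.
    static const std::vector<std::string> kIndex = {
        "obj1", "obj2", "obj3", "obj4", "obj5"};

    // Return up to 'max' keys strictly greater than 'start'; set 'more'
    // when further keys remain, mirroring the 'more' out-parameter of the
    // cls_rgw listing helpers.
    int list_page(const std::string& start, size_t max,
                  std::vector<std::string>* out, bool* more) {
      out->clear();
      auto it = std::upper_bound(kIndex.begin(), kIndex.end(), start);
      while (it != kIndex.end() && out->size() < max) {
        out->push_back(*it++);
      }
      *more = (it != kIndex.end());
      return 0;
    }

    int main() {
      std::string start_obj;  // empty string: start from the beginning
      std::vector<std::string> page;
      bool more = false;
      do {
        if (list_page(start_obj, 2, &page, &more) < 0) {
          return 1;              // propagate errors, as check_index does
        }
        for (const auto& key : page) {
          std::cout << key << '\n';  // real code decodes each entry and
                                     // accumulates per-category stats
          start_obj = key;           // resume marker: last key processed
        }
      } while (more);
      return 0;
    }
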