For multi-site deployments that make any use of Server-Side Encryption, we
recommend running this command against every bucket in every zone after all
zones have upgraded.
+* CEPHFS: MDS evicts clients which are not advancing their request tids which causes
+ a large buildup of session metadata resulting in the MDS going read-only due to
+ the RADOS operation exceeding the size threshold. `mds_session_metadata_threshold`
+ config controls the maximum size to which the (encoded) session metadata can grow.
+* CephFS: For clusters with multiple CephFS file systems, all the snap-schedule
+ commands now expect the '--fs' argument.
+* CephFS: The period specifier ``m`` now implies minutes and the period specifier
+ ``M`` now implies months. This has been made consistent with the rest
+ of the system.
+* RGW: New tools have been added to radosgw-admin for identifying and
+ correcting issues with versioned bucket indexes. Historical bugs with the
+ versioned bucket index transaction workflow made it possible for the index
+ to accumulate extraneous "book-keeping" olh entries and plain placeholder
+ entries. In some specific scenarios where clients made concurrent requests
+ referencing the same object key, it was likely that a lot of extra index
+ entries would accumulate. When a significant number of these entries are
+ present in a single bucket index shard, they can cause high bucket listing
+ latencies and lifecycle processing failures. To check whether a versioned
+ bucket has unnecessary olh entries, users can now run ``radosgw-admin
+ bucket check olh``. If the ``--fix`` flag is used, the extra entries will
+ be safely removed. As a distinct issue from the one described thus far, it is
+ also possible that some versioned buckets are maintaining extra unlinked
+ objects that are not listable from the S3/Swift APIs. These extra objects
+ are typically a result of PUT requests that exited abnormally, in the middle
+ of a bucket index transaction - so the client would not have received a
+ successful response. Bugs in prior releases made these unlinked objects easy
+ to reproduce with any PUT request that was made on a bucket that was actively
+ resharding. Besides the extra space that these hidden, unlinked objects
+ consume, there can be another side effect in certain scenarios, caused by
+ the nature of the failure mode that produced them, where a client of a bucket
+ that was a victim of this bug may find the object associated with the key to
+ be in an inconsistent state. To check whether a versioned bucket has unlinked
+ entries, users can now run ``radosgw-admin bucket check unlinked``. If the
+ ``--fix`` flag is used, the unlinked objects will be safely removed. Finally,
+ a third issue made it possible for versioned bucket index stats to be
+ accounted inaccurately. The tooling for recalculating versioned bucket stats
+ also had a bug, and was not previously capable of fixing these inaccuracies.
+ This release resolves those issues and users can now expect that the existing
+ ``radosgw-admin bucket check`` command will produce correct results. We
+ recommend that users with versioned buckets, especially those that existed
+ on prior releases, use these new tools to check whether their buckets are
+ affected and to clean them up accordingly.
>=18.0.0
}
} // rgw_bucket_list
-
-// Recompute per-category bucket-index stats by scanning the index.
-// Reads the stored header into *existing_header, then walks plain index
-// entries in CHECK_CHUNK_SIZE batches, accumulating counts/sizes into
-// *calc_header. Stops at the first non-plain key, so instance/OLH
-// (versioned) entries are never accounted here.
-// Returns 0 on success or a negative error code.
-static int check_index(cls_method_context_t hctx,
- rgw_bucket_dir_header *existing_header,
- rgw_bucket_dir_header *calc_header)
-{
- int rc = read_bucket_header(hctx, existing_header);
- if (rc < 0) {
- CLS_LOG(1, "ERROR: check_index(): failed to read header\n");
- return rc;
- }
-
- // Carry over header fields that the recount does not recompute.
- calc_header->tag_timeout = existing_header->tag_timeout;
- calc_header->ver = existing_header->ver;
- calc_header->syncstopped = existing_header->syncstopped;
-
- map<string, bufferlist> keys;
- string start_obj;
- string filter_prefix;
-
-#define CHECK_CHUNK_SIZE 1000
- bool done = false;
- bool more;
-
- do {
- rc = get_obj_vals(hctx, start_obj, filter_prefix, CHECK_CHUNK_SIZE, &keys, &more);
- if (rc < 0)
- return rc;
-
- for (auto kiter = keys.begin(); kiter != keys.end(); ++kiter) {
- // Plain entries sort first in the omap; a non-plain key ends the scan.
- if (!bi_is_plain_entry(kiter->first)) {
- done = true;
- break;
- }
-
- rgw_bucket_dir_entry entry;
- auto eiter = kiter->second.cbegin();
- try {
- decode(entry, eiter);
- } catch (ceph::buffer::error& err) {
- CLS_LOG(1, "ERROR: rgw_bucket_list(): failed to decode entry, key=%s", kiter->first.c_str());
- return -EIO;
- }
- // Accumulate this entry into the stats bucket for its category.
- rgw_bucket_category_stats& stats = calc_header->stats[entry.meta.category];
- stats.num_entries++;
- stats.total_size += entry.meta.accounted_size;
- stats.total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size);
- stats.actual_size += entry.meta.size;
-
- // Resume the next chunked listing after the last key seen.
- start_obj = kiter->first;
- }
- } while (keys.size() == CHECK_CHUNK_SIZE && !done);
-
- return 0;
-}
-
-// cls entrypoint: report stored vs. freshly recomputed bucket-index
-// headers. Both headers are encoded into *out for the caller to compare;
-// nothing is modified on disk.
-int rgw_bucket_check_index(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
-{
- CLS_LOG(10, "entered %s", __func__);
- rgw_cls_check_index_ret ret;
-
- int rc = check_index(hctx, &ret.existing_header, &ret.calculated_header);
- if (rc < 0)
- return rc;
-
- encode(ret, *out);
-
- return 0;
-}
-
static int write_bucket_header(cls_method_context_t hctx, rgw_bucket_dir_header *header)
{
header->ver++;
}
-// cls entrypoint: recompute bucket-index stats and persist the corrected
-// header, replacing the stored one.
-int rgw_bucket_rebuild_index(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
-{
- CLS_LOG(10, "entered %s", __func__);
- rgw_bucket_dir_header existing_header;
- rgw_bucket_dir_header calc_header;
- int rc = check_index(hctx, &existing_header, &calc_header);
- if (rc < 0)
- return rc;
-
- return write_bucket_header(hctx, &calc_header);
-}
-
int rgw_bucket_update_stats(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
{
CLS_LOG(10, "entered %s", __func__);
return count;
}
+// Recompute per-category bucket-index stats. Unlike the replaced
+// implementation, this makes two chunked passes -- one over plain entries
+// and one over instance (versioned) entries -- so versioned objects are
+// accounted as well.
+// Reads the stored header into *existing_header; accumulates recomputed
+// counts/sizes into *calc_header.
+// Returns 0 on success or a negative error code.
+static int check_index(cls_method_context_t hctx,
+ rgw_bucket_dir_header *existing_header,
+ rgw_bucket_dir_header *calc_header)
+{
+ int rc = read_bucket_header(hctx, existing_header);
+ if (rc < 0) {
+ CLS_LOG(1, "ERROR: check_index(): failed to read header\n");
+ return rc;
+ }
+
+ // Carry over header fields that the recount does not recompute.
+ calc_header->tag_timeout = existing_header->tag_timeout;
+ calc_header->ver = existing_header->ver;
+ calc_header->syncstopped = existing_header->syncstopped;
+
+ std::list<rgw_cls_bi_entry> entries;
+ string start_obj;
+ string filter_prefix;
+
+#define CHECK_CHUNK_SIZE 1000
+ bool more;
+
+ // Pass 1: plain entries. Only entries that exist and have an empty
+ // instance are counted here (versioned instances are handled in pass 2).
+ do {
+ rc = list_plain_entries(hctx, filter_prefix, start_obj, CHECK_CHUNK_SIZE, &entries, &more);
+ if (rc < 0) {
+ return rc;
+ }
+
+ for (const auto & bientry : entries) {
+ rgw_bucket_dir_entry entry;
+ auto diter = bientry.data.cbegin();
+ try {
+ decode(entry, diter);
+ } catch (ceph::buffer::error& err) {
+ CLS_LOG(1, "ERROR:check_index(): failed to decode entry, key=%s", bientry.idx.c_str());
+ return -EIO;
+ }
+
+ if (entry.exists && entry.key.instance.empty()) {
+ rgw_bucket_category_stats& stats = calc_header->stats[entry.meta.category];
+ stats.num_entries++;
+ stats.total_size += entry.meta.accounted_size;
+ stats.total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size);
+ stats.actual_size += entry.meta.size;
+ }
+ // Resume the next chunked listing after the last key seen.
+ start_obj = bientry.idx;
+ }
+ entries.clear();
+ } while (more);
+
+ // Pass 2: instance (versioned) entries; restart the cursor from the top.
+ start_obj = "";
+ do {
+ rc = list_instance_entries(hctx, filter_prefix, start_obj, CHECK_CHUNK_SIZE, &entries, &more);
+ if (rc < 0) {
+ return rc;
+ }
+
+ for (const auto & bientry : entries) {
+ rgw_bucket_dir_entry entry;
+ auto diter = bientry.data.cbegin();
+ try {
+ decode(entry, diter);
+ } catch (ceph::buffer::error& err) {
+ CLS_LOG(1, "ERROR:check_index(): failed to decode entry, key=%s", bientry.idx.c_str());
+ return -EIO;
+ }
+
+ // Instance entries count whenever they exist.
+ if (entry.exists) {
+ rgw_bucket_category_stats& stats = calc_header->stats[entry.meta.category];
+ stats.num_entries++;
+ stats.total_size += entry.meta.accounted_size;
+ stats.total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size);
+ stats.actual_size += entry.meta.size;
+ }
+ start_obj = bientry.idx;
+ }
+ entries.clear();
+ } while (more);
+
+ return 0;
+}
+
+// cls entrypoint: recompute bucket-index stats and persist the corrected
+// header, replacing the stored one.
+int rgw_bucket_rebuild_index(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+{
+ CLS_LOG(10, "entered %s", __func__);
+ rgw_bucket_dir_header existing_header;
+ rgw_bucket_dir_header calc_header;
+ int rc = check_index(hctx, &existing_header, &calc_header);
+ if (rc < 0)
+ return rc;
+
+ return write_bucket_header(hctx, &calc_header);
+}
+
+
+// cls entrypoint: report stored vs. freshly recomputed bucket-index
+// headers. Both headers are encoded into *out for the caller to compare;
+// nothing is modified on disk.
+int rgw_bucket_check_index(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+{
+ CLS_LOG(10, "entered %s", __func__);
+ rgw_cls_check_index_ret ret;
+
+ int rc = check_index(hctx, &ret.existing_header, &ret.calculated_header);
+ if (rc < 0)
+ return rc;
+
+ encode(ret, *out);
+
+ return 0;
+}
+
+
/* Lists all the entries that appear in a bucket index listing.
*
* It may not be obvious why this function calls three other "segment"
RGWObjCategory *category,
rgw_bucket_category_stats *accounted_stats)
{
- bool account = false;
- auto iter = data.cbegin();
using ceph::decode;
- switch (type) {
- case BIIndexType::Plain:
- account = true;
- // NO BREAK; falls through to case InstanceIdx:
- case BIIndexType::Instance:
- {
- rgw_bucket_dir_entry entry;
- decode(entry, iter);
- account = (account && entry.exists);
- *key = entry.key;
- *category = entry.meta.category;
- accounted_stats->num_entries++;
- accounted_stats->total_size += entry.meta.accounted_size;
- accounted_stats->total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size);
- accounted_stats->actual_size += entry.meta.size;
- }
- break;
- case BIIndexType::OLH:
- {
- rgw_bucket_olh_entry entry;
- decode(entry, iter);
- *key = entry.key;
- }
- break;
- default:
- break;
+ auto iter = data.cbegin();
+ if (type == BIIndexType::OLH) {
+ rgw_bucket_olh_entry entry;
+ decode(entry, iter);
+ *key = entry.key;
+ return false;
}
- return account;
+ rgw_bucket_dir_entry entry;
+ decode(entry, iter);
+ *key = entry.key;
+ *category = entry.meta.category;
+ accounted_stats->num_entries++;
+ accounted_stats->total_size += entry.meta.accounted_size;
+ accounted_stats->total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size);
+ accounted_stats->actual_size += entry.meta.size;
+ if (type == BIIndexType::Plain) {
+ return entry.exists && entry.key.instance.empty();
+ } else if (type == BIIndexType::Instance) {
+ return entry.exists;
+ }
+ return false;
}
void rgw_bucket_olh_entry::dump(Formatter *f) const