From 8d21ac4b8da5817d2bb43bc9915fff3fdc47ab3f Mon Sep 17 00:00:00 2001 From: "Matthew N. Heler" Date: Wed, 25 Feb 2026 19:03:56 -0600 Subject: [PATCH] rgw: add RestoreStatus support to object listings S3 clients can request restore status in listing responses through the x-amz-optional-object-attributes header, but we had no support for it. This stores the restore state in the bucket index so listings can include it without having to read each object's attrs individually. Signed-off-by: Matthew N. Heler --- PendingReleaseNotes | 5 ++ src/cls/rgw/cls_rgw_types.cc | 8 ++ src/cls/rgw/cls_rgw_types.h | 12 ++- src/rgw/driver/rados/rgw_rados.cc | 132 +++++++++++++++++++++++++----- src/rgw/driver/rados/rgw_rados.h | 4 +- src/rgw/rgw_rest_s3.cc | 46 +++++++++++ src/rgw/rgw_rest_s3.h | 1 + 7 files changed, 186 insertions(+), 22 deletions(-) diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 8a691de69271..e8434cf45068 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -4,6 +4,11 @@ developers must update their method calls to use these new structs, which ensure read/write semantics are correctly applied. +* RGW: S3 ListObjects and ListObjectVersions now support the + ``x-amz-optional-object-attributes: RestoreStatus`` request header to include + restore status in listing responses. Restore status is stored in the bucket + index, so only objects written or restored after this upgrade will populate + the field. Existing objects are unaffected. * ceph-volume: Raw BlueStore OSD preparation now pre-formats NVMe devices and skips the slower BlueStore discard phase,reducing mkfs time on very large namespaces. 
diff --git a/src/cls/rgw/cls_rgw_types.cc b/src/cls/rgw/cls_rgw_types.cc index 036a63dfe030..cc9f9e3ec43a 100644 --- a/src/cls/rgw/cls_rgw_types.cc +++ b/src/cls/rgw/cls_rgw_types.cc @@ -188,6 +188,8 @@ list rgw_bucket_dir_entry_meta::generate_test_instanc m.owner = "owner"; m.owner_display_name = "display name"; m.content_type = "content/type"; + m.restore_status = 2; // CloudRestored + m.restore_expiry_date = ceph::real_time{std::chrono::seconds(1234567890)}; o.push_back(std::move(m)); o.emplace_back(); return o; @@ -209,6 +211,8 @@ void rgw_bucket_dir_entry_meta::dump(Formatter *f) const encode_json("accounted_size", accounted_size, f); encode_json("user_data", user_data, f); encode_json("appendable", appendable, f); + encode_json("restore_status", static_cast(restore_status), f); + encode_json("restore_expiry_date", restore_expiry_date, f); } void rgw_bucket_dir_entry_meta::decode_json(JSONObj *obj) { @@ -227,6 +231,10 @@ void rgw_bucket_dir_entry_meta::decode_json(JSONObj *obj) { JSONDecoder::decode_json("accounted_size", accounted_size, obj); JSONDecoder::decode_json("user_data", user_data, obj); JSONDecoder::decode_json("appendable", appendable, obj); + int rs_val = 0; + JSONDecoder::decode_json("restore_status", rs_val, obj); + restore_status = static_cast(rs_val); + JSONDecoder::decode_json("restore_expiry_date", restore_expiry_date, obj); } list rgw_bucket_dir_entry::generate_test_instances() diff --git a/src/cls/rgw/cls_rgw_types.h b/src/cls/rgw/cls_rgw_types.h index 72cdbf3f872f..caea0407061b 100644 --- a/src/cls/rgw/cls_rgw_types.h +++ b/src/cls/rgw/cls_rgw_types.h @@ -215,9 +215,11 @@ struct rgw_bucket_dir_entry_meta { std::string user_data; std::string storage_class; bool appendable = false; + uint8_t restore_status = 0; // maps to RGWRestoreStatus enum + ceph::real_time restore_expiry_date; // zero when N/A void encode(ceph::buffer::list &bl) const { - ENCODE_START(7, 3, bl); + ENCODE_START(8, 3, bl); encode(category, bl); encode(size, bl); 
encode(mtime, bl); @@ -229,11 +231,13 @@ struct rgw_bucket_dir_entry_meta { encode(user_data, bl); encode(storage_class, bl); encode(appendable, bl); + encode(restore_status, bl); + encode(restore_expiry_date, bl); ENCODE_FINISH(bl); } void decode(ceph::buffer::list::const_iterator &bl) { - DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl); + DECODE_START_LEGACY_COMPAT_LEN(8, 3, 3, bl); decode(category, bl); decode(size, bl); decode(mtime, bl); @@ -252,6 +256,10 @@ struct rgw_bucket_dir_entry_meta { decode(storage_class, bl); if (struct_v >= 7) decode(appendable, bl); + if (struct_v >= 8) { + decode(restore_status, bl); + decode(restore_expiry_date, bl); + } DECODE_FINISH(bl); } void dump(ceph::Formatter *f) const; diff --git a/src/rgw/driver/rados/rgw_rados.cc b/src/rgw/driver/rados/rgw_rados.cc index c495903d9041..8bfe9f60d37f 100644 --- a/src/rgw/driver/rados/rgw_rados.cc +++ b/src/rgw/driver/rados/rgw_rados.cc @@ -154,6 +154,52 @@ static inline void read_attr(std::map& attrs, } } +/** + * Decode restore status and expiry date from an attrs map for the bucket index. + * Tries primary first; if a key is missing and fallback is non-null, tries fallback. 
+ */ +static void decode_restore_index_fields( + const rgw::sal::Attrs& primary, + const rgw::sal::Attrs* fallback, + uint8_t& restore_status, + ceph::real_time& restore_expiry_date) +{ + restore_status = 0; + restore_expiry_date = {}; + + bufferlist rs_bl; + if (auto it = primary.find(RGW_ATTR_RESTORE_STATUS); it != primary.end()) { + rs_bl = it->second; + } else if (fallback) { + if (auto it2 = fallback->find(RGW_ATTR_RESTORE_STATUS); it2 != fallback->end()) { + rs_bl = it2->second; + } + } + if (rs_bl.length()) { + try { + rgw::sal::RGWRestoreStatus rs; + auto bl_iter = rs_bl.cbegin(); + decode(rs, bl_iter); + restore_status = static_cast(rs); + } catch (buffer::error&) {} + } + + bufferlist re_bl; + if (auto it = primary.find(RGW_ATTR_RESTORE_EXPIRY_DATE); it != primary.end()) { + re_bl = it->second; + } else if (fallback) { + if (auto it2 = fallback->find(RGW_ATTR_RESTORE_EXPIRY_DATE); it2 != fallback->end()) { + re_bl = it2->second; + } + } + if (re_bl.length()) { + try { + auto bl_iter = re_bl.cbegin(); + decode(restore_expiry_date, bl_iter); + } catch (buffer::error&) {} + } +} + rgw_raw_obj rgw_obj_select::get_raw_obj(RGWRados* store) const { if (!is_raw) { @@ -3434,6 +3480,12 @@ int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_si } } + // extract restore fields for bucket index + uint8_t idx_restore_status = 0; + ceph::real_time idx_restore_expiry_date; + decode_restore_index_fields(attrs, nullptr, + idx_restore_status, idx_restore_expiry_date); + if (!op.size()) return 0; @@ -3494,7 +3546,8 @@ int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_si meta.set_mtime, etag, content_type, storage_class, meta.owner, meta.category, meta.remove_objs, rctx.y, - meta.user_data, meta.appendable, log_op); + meta.user_data, meta.appendable, log_op, + idx_restore_status, idx_restore_expiry_date); tracepoint(rgw_rados, complete_exit, req_id.c_str()); if (r < 0) goto done_cancel; @@ -4097,6 +4150,12 @@ int 
RGWRados::reindex_obj(rgw::sal::Driver* driver, read_attr(attr_set, RGW_ATTR_OLH_INFO, olh_info_bl, &found_olh_info); read_attr(attr_set, RGW_ATTR_APPEND_PART_NUM, part_num_bl, &appendable); + // extract restore fields for bucket index + uint8_t idx_restore_status = 0; + ceph::real_time idx_restore_expiry_date; + decode_restore_index_fields(attr_set, nullptr, + idx_restore_status, idx_restore_expiry_date); + // check for a pure OLH object and if so exit early if (found_olh_info) { try { @@ -4175,7 +4234,10 @@ int RGWRados::reindex_obj(rgw::sal::Driver* driver, nullptr, // remove_objs list y, nullptr, // user data string - appendable); + appendable, + true, // log_op + idx_restore_status, + idx_restore_expiry_date); if (ret < 0) { ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": update index complete for " << p(head_obj) << " returned: " << @@ -4197,6 +4259,8 @@ int RGWRados::reindex_obj(rgw::sal::Driver* driver, meta.etag = etag; meta.content_type = content_type; meta.appendable = appendable; + meta.restore_status = idx_restore_status; + meta.restore_expiry_date = idx_restore_expiry_date; ret = link_helper(false, meta, "linking version"); } // if bucket is versioned @@ -7615,33 +7679,52 @@ int RGWRados::set_attrs(const DoutPrefixProvider *dpp, RGWObjectCtx* octx, RGWBu } int64_t poolid = ioctx.get_id(); - // Retain Object category as CloudTiered while restore is in - // progress or failed or if its temporarily restored copy + /* + * Retain Object category as CloudTiered while restore is in + * progress or failed or if its temporarily restored copy. + * Check new attrs first, fall back to existing attrs for partial updates. 
+ */ RGWObjCategory category = RGWObjCategory::Main; - auto r_iter = attrs.find(RGW_ATTR_RESTORE_STATUS); - auto t_iter = attrs.find(RGW_ATTR_RESTORE_TYPE); - if (r_iter != attrs.end()) { - rgw::sal::RGWRestoreStatus st = rgw::sal::RGWRestoreStatus::None; - auto iter = r_iter->second.cbegin(); - + bufferlist rs_bl, rt_bl; + if (auto it = attrs.find(RGW_ATTR_RESTORE_STATUS); it != attrs.end()) { + rs_bl = it->second; + } else if (auto it2 = state->attrset.find(RGW_ATTR_RESTORE_STATUS); + it2 != state->attrset.end()) { + rs_bl = it2->second; + } + if (rs_bl.length()) { try { using ceph::decode; - decode(st, iter); + rgw::sal::RGWRestoreStatus st = rgw::sal::RGWRestoreStatus::None; + auto bl_iter = rs_bl.cbegin(); + decode(st, bl_iter); if (st != rgw::sal::RGWRestoreStatus::CloudRestored) { category = RGWObjCategory::CloudTiered; } else { // check if its temporary copy - if (t_iter != attrs.end()) { + if (auto it = attrs.find(RGW_ATTR_RESTORE_TYPE); it != attrs.end()) { + rt_bl = it->second; + } else if (auto it2 = state->attrset.find(RGW_ATTR_RESTORE_TYPE); + it2 != state->attrset.end()) { + rt_bl = it2->second; + } + if (rt_bl.length()) { rgw::sal::RGWRestoreType rt; - decode(rt, t_iter->second); + decode(rt, rt_bl); if (rt == rgw::sal::RGWRestoreType::Temporary) { category = RGWObjCategory::CloudTiered; // temporary restore; set storage-class to cloudtier storage class - auto c_iter = attrs.find(RGW_ATTR_CLOUDTIER_STORAGE_CLASS); - - if (c_iter != attrs.end()) { - storage_class = rgw_bl_str(c_iter->second); + bufferlist sc_bl; + if (auto it = attrs.find(RGW_ATTR_CLOUDTIER_STORAGE_CLASS); + it != attrs.end()) { + sc_bl = it->second; + } else if (auto it2 = state->attrset.find(RGW_ATTR_CLOUDTIER_STORAGE_CLASS); + it2 != state->attrset.end()) { + sc_bl = it2->second; + } + if (sc_bl.length()) { + storage_class = rgw_bl_str(sc_bl); } } } @@ -7649,10 +7732,17 @@ int RGWRados::set_attrs(const DoutPrefixProvider *dpp, RGWObjectCtx* octx, RGWBu } catch (buffer::error& err) 
{ } } + // extract restore fields for index, with partial-update fallback + uint8_t idx_restore_status = 0; + ceph::real_time idx_restore_expiry_date; + decode_restore_index_fields(attrs, &state->attrset, + idx_restore_status, idx_restore_expiry_date); + ldpp_dout(dpp, 20) << "Setting obj category:" << category << ", storage_class:" << storage_class << dendl; r = index_op.complete(dpp, poolid, epoch, state->size, state->accounted_size, mtime, etag, content_type, storage_class, owner, - category, nullptr, y, nullptr, false, log_op); + category, nullptr, y, nullptr, false, log_op, + idx_restore_status, idx_restore_expiry_date); } else { int ret = index_op.cancel(dpp, nullptr, y, log_op); if (ret < 0) { @@ -8086,7 +8176,9 @@ int RGWRados::Bucket::UpdateIndex::complete(const DoutPrefixProvider *dpp, int64 optional_yield y, const string *user_data, bool appendable, - bool log_op) + bool log_op, + uint8_t restore_status, + ceph::real_time restore_expiry_date) { if (blind) { return 0; @@ -8114,6 +8206,8 @@ int RGWRados::Bucket::UpdateIndex::complete(const DoutPrefixProvider *dpp, int64 ent.meta.owner_display_name = owner.display_name; ent.meta.content_type = content_type; ent.meta.appendable = appendable; + ent.meta.restore_status = restore_status; + ent.meta.restore_expiry_date = restore_expiry_date; bool add_log = log_op && store->svc.zone->need_to_log_data(); diff --git a/src/rgw/driver/rados/rgw_rados.h b/src/rgw/driver/rados/rgw_rados.h index d8fe665fcf3e..8109a84f8eb2 100644 --- a/src/rgw/driver/rados/rgw_rados.h +++ b/src/rgw/driver/rados/rgw_rados.h @@ -1021,7 +1021,9 @@ public: optional_yield y, const std::string *user_data = nullptr, bool appendable = false, - bool log_op = true); + bool log_op = true, + uint8_t restore_status = 0, + ceph::real_time restore_expiry_date = {}); int complete_del(const DoutPrefixProvider *dpp, int64_t poolid, uint64_t epoch, ceph::real_time& removed_mtime, /* mtime of removed object */ diff --git a/src/rgw/rgw_rest_s3.cc 
b/src/rgw/rgw_rest_s3.cc index 92821498f8b7..dc052c2e04ae 100644 --- a/src/rgw/rgw_rest_s3.cc +++ b/src/rgw/rgw_rest_s3.cc @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -1883,6 +1884,15 @@ int RGWListBucket_ObjStore_S3::get_common_params() shard_id = s->bucket_instance_shard_id; } } + + // Parse x-amz-optional-object-attributes header. + const char* opt_attrs = s->info.env->get("HTTP_X_AMZ_OPTIONAL_OBJECT_ATTRIBUTES"); + if (opt_attrs) { + auto tokens = ceph::split(opt_attrs, ", "); + fetch_restore_status = + std::find(tokens.begin(), tokens.end(), "RestoreStatus") != tokens.end(); + } + return 0; } @@ -1918,6 +1928,30 @@ if(!continuation_token_exist) { return 0; } +/** + * Emit XML element for a listing entry. + * Only emits for RestoreAlreadyInProgress and CloudRestored states. + */ +static void dump_restore_status(req_state* s, + const rgw_bucket_dir_entry_meta& meta) +{ + using RGWRestoreStatus = rgw::sal::RGWRestoreStatus; + auto status = static_cast(meta.restore_status); + + if (status != RGWRestoreStatus::RestoreAlreadyInProgress && + status != RGWRestoreStatus::CloudRestored) { + return; + } + + bool in_progress = (status == RGWRestoreStatus::RestoreAlreadyInProgress); + s->formatter->open_object_section("RestoreStatus"); + s->formatter->dump_bool("IsRestoreInProgress", in_progress); + if (!in_progress && meta.restore_expiry_date != ceph::real_time{}) { + dump_time(s, "RestoreExpiryDate", meta.restore_expiry_date); + } + s->formatter->close_section(); // RestoreStatus +} + void RGWListBucket_ObjStore_S3::send_common_versioned_response() { if (!s->bucket_tenant.empty()) { @@ -1999,6 +2033,9 @@ void RGWListBucket_ObjStore_S3::send_versioned_response() s->formatter->dump_int("Size", iter->meta.accounted_size); auto& storage_class = rgw_placement_rule::get_canonical_storage_class(iter->meta.storage_class); s->formatter->dump_string("StorageClass", storage_class.c_str()); + if (fetch_restore_status) { + dump_restore_status(s, 
iter->meta); + } } dump_owner(s, iter->meta.owner, iter->meta.owner_display_name); if (iter->meta.appendable) { @@ -2091,6 +2128,9 @@ void RGWListBucket_ObjStore_S3::send_response() s->formatter->dump_int("Size", iter->meta.accounted_size); auto& storage_class = rgw_placement_rule::get_canonical_storage_class(iter->meta.storage_class); s->formatter->dump_string("StorageClass", storage_class.c_str()); + if (fetch_restore_status) { + dump_restore_status(s, iter->meta); + } dump_owner(s, iter->meta.owner, iter->meta.owner_display_name); if (s->system_request) { s->formatter->dump_string("RgwxTag", iter->tag); @@ -2166,6 +2206,9 @@ void RGWListBucket_ObjStore_S3v2::send_versioned_response() s->formatter->dump_int("Size", iter->meta.accounted_size); auto& storage_class = rgw_placement_rule::get_canonical_storage_class(iter->meta.storage_class); s->formatter->dump_string("StorageClass", storage_class.c_str()); + if (fetch_restore_status) { + dump_restore_status(s, iter->meta); + } } if (fetchOwner == true) { dump_owner(s, iter->meta.owner, iter->meta.owner_display_name); @@ -2235,6 +2278,9 @@ void RGWListBucket_ObjStore_S3v2::send_response() s->formatter->dump_int("Size", iter->meta.accounted_size); auto& storage_class = rgw_placement_rule::get_canonical_storage_class(iter->meta.storage_class); s->formatter->dump_string("StorageClass", storage_class.c_str()); + if (fetch_restore_status) { + dump_restore_status(s, iter->meta); + } if (fetchOwner == true) { dump_owner(s, iter->meta.owner, iter->meta.owner_display_name); } diff --git a/src/rgw/rgw_rest_s3.h b/src/rgw/rgw_rest_s3.h index bd46763fd986..591af0554449 100644 --- a/src/rgw/rgw_rest_s3.h +++ b/src/rgw/rgw_rest_s3.h @@ -150,6 +150,7 @@ class RGWListBucket_ObjStore_S3 : public RGWListBucket_ObjStore { protected: bool objs_container; bool encode_key {false}; + bool fetch_restore_status {false}; int get_common_params(); void send_common_response(); void send_common_versioned_response(); -- 2.47.3