From: Kaleb S. KEITHLEY
Date: Sat, 21 Jan 2023 17:37:05 +0000 (-0500)
Subject: rgw: refactor selected files for better above- vs below-the-line
X-Git-Tag: v18.1.0~479^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F49760%2Fhead;p=ceph.git

rgw: refactor selected files for better above- vs below-the-line

Move more files into driver/rados for a better above- vs below-the-line
separation. Use #pragma once everywhere (versus fixing all the include
guards).

Signed-off-by: Kaleb S. KEITHLEY
---
diff --git a/src/rgw/CMakeLists.txt b/src/rgw/CMakeLists.txt index 7b13abfb5773..3f792bf8e069 100644 --- a/src/rgw/CMakeLists.txt +++ b/src/rgw/CMakeLists.txt @@ -84,31 +84,24 @@ set(librgw_common_srcs rgw_multipart_meta_filter.cc rgw_obj_manifest.cc rgw_period.cc - rgw_pubsub.cc rgw_realm.cc rgw_sync.cc rgw_sync_policy.cc - rgw_pubsub_push.cc rgw_notify_event_type.cc rgw_period_history.cc rgw_period_puller.cc - rgw_reshard.cc rgw_coroutine.cc rgw_cr_rest.cc rgw_op.cc rgw_policy_s3.cc rgw_public_access.cc rgw_putobj.cc - rgw_putobj_processor.cc rgw_quota.cc - rgw_rados.cc rgw_resolve.cc rgw_rest.cc - rgw_rest_bucket.cc rgw_rest_client.cc rgw_rest_config.cc rgw_rest_conn.cc - rgw_rest_log.cc rgw_rest_metadata.cc rgw_rest_ratelimit.cc rgw_rest_role.cc @@ -168,6 +161,13 @@ set(librgw_common_srcs driver/rados/rgw_object_expirer_core.cc driver/rados/rgw_otp.cc driver/rados/rgw_period.cc + driver/rados/rgw_pubsub.cc + driver/rados/rgw_pubsub_push.cc + driver/rados/rgw_putobj_processor.cc + driver/rados/rgw_rados.cc + driver/rados/rgw_reshard.cc + driver/rados/rgw_rest_bucket.cc + driver/rados/rgw_rest_log.cc driver/rados/rgw_rest_pubsub.cc driver/rados/rgw_rest_realm.cc driver/rados/rgw_rest_user.cc @@ -345,20 +345,20 @@ set(rgw_a_srcs rgw_process.cc rgw_realm_reloader.cc rgw_realm_watcher.cc - rgw_rest_bucket.cc rgw_rest_config.cc rgw_rest_info.cc - rgw_rest_log.cc rgw_rest_metadata.cc rgw_rest_ratelimit.cc - driver/rados/rgw_rest_realm.cc rgw_rest_sts.cc rgw_rest_swift.cc rgw_rest_usage.cc rgw_signal.cc rgw_swift_auth.cc rgw_usage.cc - rgw_sts.cc) + rgw_sts.cc + driver/rados/rgw_rest_bucket.cc + driver/rados/rgw_rest_log.cc + driver/rados/rgw_rest_realm.cc) gperf_generate(${CMAKE_SOURCE_DIR}/src/rgw/rgw_iam_policy_keywords.gperf rgw_iam_policy_keywords.frag.cc) diff --git a/src/rgw/driver/dbstore/common/dbstore.h b/src/rgw/driver/dbstore/common/dbstore.h index 41aae8d8dfca..b4a7ca057b38 100644 --- a/src/rgw/driver/dbstore/common/dbstore.h +++ b/src/rgw/driver/dbstore/common/dbstore.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab -#ifndef DB_STORE_H -#define DB_STORE_H +#pragma once #include #include @@ -2016,5 +2015,3 @@ struct db_get_obj_data { }; } } // namespace rgw::store - -#endif diff --git a/src/rgw/driver/dbstore/common/dbstore_log.h b/src/rgw/driver/dbstore/common/dbstore_log.h index 8d981d5adc41..416508369ef9 100644 --- a/src/rgw/driver/dbstore/common/dbstore_log.h +++ b/src/rgw/driver/dbstore/common/dbstore_log.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab -#ifndef DB_STORE_LOG_H -#define DB_STORE_LOG_H +#pragma once #include #include @@ -14,5 +13,3 @@ #undef dout_prefix #define dout_prefix *_dout << "rgw dbstore: " - -#endif diff --git a/src/rgw/driver/dbstore/sqlite/sqliteDB.h b/src/rgw/driver/dbstore/sqlite/sqliteDB.h index 4f651448a994..ec0ef2bb2826 100644 --- a/src/rgw/driver/dbstore/sqlite/sqliteDB.h +++ 
b/src/rgw/driver/dbstore/sqlite/sqliteDB.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab -#ifndef SQLITE_DB_H -#define SQLITE_DB_H +#pragma once #include #include @@ -550,5 +549,3 @@ class SQLGetLCHead : public SQLiteDB, public GetLCHeadOp { int Execute(const DoutPrefixProvider *dpp, DBOpParams *params); int Bind(const DoutPrefixProvider *dpp, DBOpParams *params); }; - -#endif diff --git a/src/rgw/driver/rados/cls_fifo_legacy.h b/src/rgw/driver/rados/cls_fifo_legacy.h index 3ea54082d887..b0a68157e619 100644 --- a/src/rgw/driver/rados/cls_fifo_legacy.h +++ b/src/rgw/driver/rados/cls_fifo_legacy.h @@ -13,8 +13,7 @@ * */ -#ifndef CEPH_RGW_CLS_FIFO_LEGACY_H -#define CEPH_RGW_CLS_FIFO_LEGACY_H +#pragma once #include #include @@ -333,5 +332,3 @@ public: }; } - -#endif // CEPH_RGW_CLS_FIFO_LEGACY_H diff --git a/src/rgw/driver/rados/rgw_cr_rados.h b/src/rgw/driver/rados/rgw_cr_rados.h index 3451376ee6ea..f17061b37ea2 100644 --- a/src/rgw/driver/rados/rgw_cr_rados.h +++ b/src/rgw/driver/rados/rgw_cr_rados.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_CR_RADOS_H -#define CEPH_RGW_CR_RADOS_H +#pragma once #include #include "include/ceph_assert.h" @@ -1640,4 +1639,3 @@ public: int operate(const DoutPrefixProvider* dpp) override; }; -#endif diff --git a/src/rgw/driver/rados/rgw_cr_tools.h b/src/rgw/driver/rados/rgw_cr_tools.h index ebdbfeb51b7f..4cd97aa82f51 100644 --- a/src/rgw/driver/rados/rgw_cr_tools.h +++ b/src/rgw/driver/rados/rgw_cr_tools.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_CR_TOOLS_H -#define CEPH_RGW_CR_TOOLS_H +#pragma once #include "rgw_cr_rados.h" #include "rgw_tools.h" @@ -84,4 +83,3 @@ struct rgw_bucket_get_sync_policy_result { using RGWBucketGetSyncPolicyHandlerCR = RGWSimpleAsyncCR; -#endif diff --git a/src/rgw/driver/rados/rgw_d3n_datacache.h b/src/rgw/driver/rados/rgw_d3n_datacache.h index 5d3537f3b148..98e61b63fe30 100644 --- a/src/rgw/driver/rados/rgw_d3n_datacache.h +++ b/src/rgw/driver/rados/rgw_d3n_datacache.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGWD3NDATACACHE_H -#define CEPH_RGWD3NDATACACHE_H +#pragma once #include "rgw_rados.h" #include @@ -258,4 +257,3 @@ int D3nRGWDataCache::get_obj_iterate_cb(const DoutPrefixProvider *dpp, const return 0; } -#endif diff --git a/src/rgw/driver/rados/rgw_data_sync.h b/src/rgw/driver/rados/rgw_data_sync.h index ccaa20884eb5..9059bd14cce7 100644 --- a/src/rgw/driver/rados/rgw_data_sync.h +++ b/src/rgw/driver/rados/rgw_data_sync.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_DATA_SYNC_H -#define CEPH_RGW_DATA_SYNC_H +#pragma once #include #include @@ -867,5 +866,3 @@ public: bool supports_data_export() override { return false; } int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override; }; - -#endif diff --git a/src/rgw/driver/rados/rgw_datalog.h b/src/rgw/driver/rados/rgw_datalog.h index 695485bf2e41..1c9a00c1fffb 100644 --- a/src/rgw/driver/rados/rgw_datalog.h +++ b/src/rgw/driver/rados/rgw_datalog.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 
smarttab ft=cpp -#ifndef CEPH_RGW_DATALOG_H -#define CEPH_RGW_DATALOG_H +#pragma once #include #include @@ -379,6 +378,3 @@ public: // 1 on empty, 0 on non-empty, negative on error. virtual int is_empty(const DoutPrefixProvider *dpp) = 0; }; - - -#endif diff --git a/src/rgw/driver/rados/rgw_etag_verifier.h b/src/rgw/driver/rados/rgw_etag_verifier.h index 56a679ebddd3..18a4f5a3fb63 100644 --- a/src/rgw/driver/rados/rgw_etag_verifier.h +++ b/src/rgw/driver/rados/rgw_etag_verifier.h @@ -12,8 +12,8 @@ * same algorithm used at the source cluster i.e. MD5 sum of the individual ETag * on the MPU parts. */ -#ifndef CEPH_RGW_ETAG_VERIFIER_H -#define CEPH_RGW_ETAG_VERIFIER_H + +#pragma once #include "rgw_putobj.h" #include "rgw_op.h" @@ -88,5 +88,3 @@ int create_etag_verifier(const DoutPrefixProvider *dpp, etag_verifier_ptr& verifier); } // namespace rgw::putobj - -#endif /* CEPH_RGW_ETAG_VERIFIER_H */ diff --git a/src/rgw/driver/rados/rgw_gc.h b/src/rgw/driver/rados/rgw_gc.h index 196f2802c164..f3df64099a1c 100644 --- a/src/rgw/driver/rados/rgw_gc.h +++ b/src/rgw/driver/rados/rgw_gc.h @@ -1,9 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_GC_H -#define CEPH_RGW_GC_H - +#pragma once #include "include/types.h" #include "include/rados/librados.hpp" @@ -82,6 +80,3 @@ public: std::ostream& gen_prefix(std::ostream& out) const; }; - - -#endif diff --git a/src/rgw/driver/rados/rgw_lc_tier.h b/src/rgw/driver/rados/rgw_lc_tier.h index 1b21f262092a..729c4c304cd6 100644 --- a/src/rgw/driver/rados/rgw_lc_tier.h +++ b/src/rgw/driver/rados/rgw_lc_tier.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_LC_TIER_H -#define CEPH_RGW_LC_TIER_H +#pragma once #include "rgw_lc.h" #include "rgw_rest_conn.h" @@ -50,5 +49,3 @@ struct RGWLCCloudTierCtx { /* Transition object to cloud endpoint */ int rgw_cloud_tier_transfer_object(RGWLCCloudTierCtx& tier_ctx, std::set& cloud_targets); - -#endif diff --git a/src/rgw/driver/rados/rgw_log_backing.h b/src/rgw/driver/rados/rgw_log_backing.h index a431574c46fa..3dfdb8ee4ef1 100644 --- a/src/rgw/driver/rados/rgw_log_backing.h +++ b/src/rgw/driver/rados/rgw_log_backing.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_LOGBACKING_H -#define CEPH_RGW_LOGBACKING_H +#pragma once #include #include @@ -393,5 +392,3 @@ public: return 0; } }; - -#endif diff --git a/src/rgw/driver/rados/rgw_metadata.h b/src/rgw/driver/rados/rgw_metadata.h index 72283702e7e1..c83db7c40437 100644 --- a/src/rgw/driver/rados/rgw_metadata.h +++ b/src/rgw/driver/rados/rgw_metadata.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_METADATA_H -#define CEPH_RGW_METADATA_H +#pragma once #include #include @@ -297,4 +296,3 @@ void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::s void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::string& section, const std::string& key, std::string& name); void rgw_shard_name(const std::string& prefix, unsigned shard_id, std::string& name); -#endif diff --git a/src/rgw/driver/rados/rgw_object_expirer_core.h b/src/rgw/driver/rados/rgw_object_expirer_core.h index fccd4199e7ea..be63815c19ed 100644 --- a/src/rgw/driver/rados/rgw_object_expirer_core.h +++ 
b/src/rgw/driver/rados/rgw_object_expirer_core.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_OBJEXP_H -#define CEPH_OBJEXP_H +#pragma once #include #include @@ -145,4 +144,3 @@ public: void start_processor(); void stop_processor(); }; -#endif /* CEPH_OBJEXP_H */ diff --git a/src/rgw/driver/rados/rgw_otp.h b/src/rgw/driver/rados/rgw_otp.h index eacff15314c2..885e8abb8e1d 100644 --- a/src/rgw/driver/rados/rgw_otp.h +++ b/src/rgw/driver/rados/rgw_otp.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_OTP_H -#define CEPH_RGW_OTP_H +#pragma once #include "rgw_sal_fwd.h" #include "cls/otp/cls_otp_types.h" @@ -109,6 +108,3 @@ public: const rgw_user& user, optional_yield y, const RemoveParams& params = {}); }; - -#endif - diff --git a/src/rgw/driver/rados/rgw_pubsub.cc b/src/rgw/driver/rados/rgw_pubsub.cc new file mode 100644 index 000000000000..b9aa54bacd8c --- /dev/null +++ b/src/rgw/driver/rados/rgw_pubsub.cc @@ -0,0 +1,723 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "services/svc_zone.h" +#include "rgw_b64.h" +#include "rgw_sal.h" +#include "rgw_sal_rados.h" +#include "rgw_pubsub.h" +#include "rgw_tools.h" +#include "rgw_xml.h" +#include "rgw_arn.h" +#include "rgw_pubsub_push.h" +#include +#include + +#define dout_subsys ceph_subsys_rgw + +using namespace std; +void set_event_id(std::string& id, const std::string& hash, const utime_t& ts) { + char buf[64]; + const auto len = snprintf(buf, sizeof(buf), "%010ld.%06ld.%s", (long)ts.sec(), (long)ts.usec(), hash.c_str()); + if (len > 0) { + id.assign(buf, len); + } +} + +bool rgw_s3_key_filter::decode_xml(XMLObj* obj) { + XMLObjIter iter = obj->find("FilterRule"); + XMLObj *o; + + const auto throw_if_missing = true; + auto prefix_not_set = true; + auto suffix_not_set = true; + auto regex_not_set = true; + std::string name; + + while ((o = iter.get_next())) { + RGWXMLDecoder::decode_xml("Name", name, o, throw_if_missing); + if (name == "prefix" && prefix_not_set) { + prefix_not_set = false; + RGWXMLDecoder::decode_xml("Value", prefix_rule, o, throw_if_missing); + } else if (name == "suffix" && suffix_not_set) { + suffix_not_set = false; + RGWXMLDecoder::decode_xml("Value", suffix_rule, o, throw_if_missing); + } else if (name == "regex" && regex_not_set) { + regex_not_set = false; + RGWXMLDecoder::decode_xml("Value", regex_rule, o, throw_if_missing); + } else { + throw RGWXMLDecoder::err("invalid/duplicate S3Key filter rule name: '" + name + "'"); + } + } + return true; +} + +void rgw_s3_key_filter::dump_xml(Formatter *f) const { + if (!prefix_rule.empty()) { + f->open_object_section("FilterRule"); + ::encode_xml("Name", "prefix", f); + ::encode_xml("Value", prefix_rule, f); + f->close_section(); + } + if (!suffix_rule.empty()) { + f->open_object_section("FilterRule"); + ::encode_xml("Name", "suffix", f); + ::encode_xml("Value", suffix_rule, f); + f->close_section(); + } + if (!regex_rule.empty()) { + f->open_object_section("FilterRule"); + ::encode_xml("Name", "regex", f); + ::encode_xml("Value", regex_rule, f); + f->close_section(); + } +} + +bool rgw_s3_key_filter::has_content() const { + return !(prefix_rule.empty() && suffix_rule.empty() && regex_rule.empty()); +} + +bool rgw_s3_key_value_filter::decode_xml(XMLObj* obj) { + kv.clear(); + XMLObjIter iter = obj->find("FilterRule"); + XMLObj 
*o; + + const auto throw_if_missing = true; + + std::string key; + std::string value; + + while ((o = iter.get_next())) { + RGWXMLDecoder::decode_xml("Name", key, o, throw_if_missing); + RGWXMLDecoder::decode_xml("Value", value, o, throw_if_missing); + kv.emplace(key, value); + } + return true; +} + +void rgw_s3_key_value_filter::dump_xml(Formatter *f) const { + for (const auto& key_value : kv) { + f->open_object_section("FilterRule"); + ::encode_xml("Name", key_value.first, f); + ::encode_xml("Value", key_value.second, f); + f->close_section(); + } +} + +bool rgw_s3_key_value_filter::has_content() const { + return !kv.empty(); +} + +bool rgw_s3_filter::decode_xml(XMLObj* obj) { + RGWXMLDecoder::decode_xml("S3Key", key_filter, obj); + RGWXMLDecoder::decode_xml("S3Metadata", metadata_filter, obj); + RGWXMLDecoder::decode_xml("S3Tags", tag_filter, obj); + return true; +} + +void rgw_s3_filter::dump_xml(Formatter *f) const { + if (key_filter.has_content()) { + ::encode_xml("S3Key", key_filter, f); + } + if (metadata_filter.has_content()) { + ::encode_xml("S3Metadata", metadata_filter, f); + } + if (tag_filter.has_content()) { + ::encode_xml("S3Tags", tag_filter, f); + } +} + +bool rgw_s3_filter::has_content() const { + return key_filter.has_content() || + metadata_filter.has_content() || + tag_filter.has_content(); +} + +bool match(const rgw_s3_key_filter& filter, const std::string& key) { + const auto key_size = key.size(); + const auto prefix_size = filter.prefix_rule.size(); + if (prefix_size != 0) { + // prefix rule exists + if (prefix_size > key_size) { + // if prefix is longer than key, we fail + return false; + } + if (!std::equal(filter.prefix_rule.begin(), filter.prefix_rule.end(), key.begin())) { + return false; + } + } + const auto suffix_size = filter.suffix_rule.size(); + if (suffix_size != 0) { + // suffix rule exists + if (suffix_size > key_size) { + // if suffix is longer than key, we fail + return false; + } + if (!std::equal(filter.suffix_rule.begin(), filter.suffix_rule.end(), (key.end() - suffix_size))) { + return false; + } + } + if (!filter.regex_rule.empty()) { + // TODO add regex caching in the filter + const std::regex base_regex(filter.regex_rule); + if (!std::regex_match(key, base_regex)) { + return false; + } + } + return true; +} + +bool match(const rgw_s3_key_value_filter& filter, const KeyValueMap& kv) { + // all filter pairs must exist with the same value in the object's metadata/tags + // object metadata/tags may include items not in the filter + return std::includes(kv.begin(), kv.end(), filter.kv.begin(), filter.kv.end()); +} + +bool match(const rgw_s3_key_value_filter& filter, const KeyMultiValueMap& kv) { + // all filter pairs must exist with the same value in the object's metadata/tags + // object metadata/tags may include items not in the filter + for (auto& filter : filter.kv) { + auto result = kv.equal_range(filter.first); + if (std::any_of(result.first, result.second, [&filter](const pair& p) { return p.second == filter.second;})) + continue; + else + return false; + } + return true; +} + +bool match(const rgw::notify::EventTypeList& events, rgw::notify::EventType event) { + // if event list exists, and none of the events in the list matches the event type, filter the message + if (!events.empty() && std::find(events.begin(), events.end(), event) == events.end()) { + return false; + } + return true; +} + +void do_decode_xml_obj(rgw::notify::EventTypeList& l, const string& name, XMLObj *obj) { + l.clear(); + + XMLObjIter iter = obj->find(name); + 
XMLObj *o; + + while ((o = iter.get_next())) { + std::string val; + decode_xml_obj(val, o); + l.push_back(rgw::notify::from_string(val)); + } +} + +bool rgw_pubsub_s3_notification::decode_xml(XMLObj *obj) { + const auto throw_if_missing = true; + RGWXMLDecoder::decode_xml("Id", id, obj, throw_if_missing); + + RGWXMLDecoder::decode_xml("Topic", topic_arn, obj, throw_if_missing); + + RGWXMLDecoder::decode_xml("Filter", filter, obj); + + do_decode_xml_obj(events, "Event", obj); + if (events.empty()) { + // if no events are provided, we assume all events + events.push_back(rgw::notify::ObjectCreated); + events.push_back(rgw::notify::ObjectRemoved); + } + return true; +} + +void rgw_pubsub_s3_notification::dump_xml(Formatter *f) const { + ::encode_xml("Id", id, f); + ::encode_xml("Topic", topic_arn.c_str(), f); + if (filter.has_content()) { + ::encode_xml("Filter", filter, f); + } + for (const auto& event : events) { + ::encode_xml("Event", rgw::notify::to_string(event), f); + } +} + +bool rgw_pubsub_s3_notifications::decode_xml(XMLObj *obj) { + do_decode_xml_obj(list, "TopicConfiguration", obj); + return true; +} + +rgw_pubsub_s3_notification::rgw_pubsub_s3_notification(const rgw_pubsub_topic_filter& topic_filter) : + id(topic_filter.s3_id), events(topic_filter.events), topic_arn(topic_filter.topic.arn), filter(topic_filter.s3_filter) {} + +void rgw_pubsub_s3_notifications::dump_xml(Formatter *f) const { + do_encode_xml("NotificationConfiguration", list, "TopicConfiguration", f); +} + +void rgw_pubsub_s3_event::dump(Formatter *f) const { + encode_json("eventVersion", eventVersion, f); + encode_json("eventSource", eventSource, f); + encode_json("awsRegion", awsRegion, f); + utime_t ut(eventTime); + encode_json("eventTime", ut, f); + encode_json("eventName", eventName, f); + { + Formatter::ObjectSection s(*f, "userIdentity"); + encode_json("principalId", userIdentity, f); + } + { + Formatter::ObjectSection s(*f, "requestParameters"); + encode_json("sourceIPAddress", sourceIPAddress, f); + } + { + Formatter::ObjectSection s(*f, "responseElements"); + encode_json("x-amz-request-id", x_amz_request_id, f); + encode_json("x-amz-id-2", x_amz_id_2, f); + } + { + Formatter::ObjectSection s(*f, "s3"); + encode_json("s3SchemaVersion", s3SchemaVersion, f); + encode_json("configurationId", configurationId, f); + { + Formatter::ObjectSection sub_s(*f, "bucket"); + encode_json("name", bucket_name, f); + { + Formatter::ObjectSection sub_sub_s(*f, "ownerIdentity"); + encode_json("principalId", bucket_ownerIdentity, f); + } + encode_json("arn", bucket_arn, f); + encode_json("id", bucket_id, f); + } + { + Formatter::ObjectSection sub_s(*f, "object"); + encode_json("key", object_key, f); + encode_json("size", object_size, f); + encode_json("eTag", object_etag, f); + encode_json("versionId", object_versionId, f); + encode_json("sequencer", object_sequencer, f); + encode_json("metadata", x_meta_map, f); + encode_json("tags", tags, f); + } + } + encode_json("eventId", id, f); + encode_json("opaqueData", opaque_data, f); +} + +void rgw_pubsub_topic::dump(Formatter *f) const +{ + encode_json("user", user, f); + encode_json("name", name, f); + encode_json("dest", dest, f); + encode_json("arn", arn, f); + encode_json("opaqueData", opaque_data, f); +} + +void rgw_pubsub_topic::dump_xml(Formatter *f) const +{ + encode_xml("User", user, f); + encode_xml("Name", name, f); + encode_xml("EndPoint", dest, f); + encode_xml("TopicArn", arn, f); + encode_xml("OpaqueData", opaque_data, f); +} + +void 
encode_xml_key_value_entry(const std::string& key, const std::string& value, Formatter *f) { + f->open_object_section("entry"); + encode_xml("key", key, f); + encode_xml("value", value, f); + f->close_section(); // entry +} + +void rgw_pubsub_topic::dump_xml_as_attributes(Formatter *f) const +{ + f->open_array_section("Attributes"); + std::string str_user; + user.to_str(str_user); + encode_xml_key_value_entry("User", str_user, f); + encode_xml_key_value_entry("Name", name, f); + encode_xml_key_value_entry("EndPoint", dest.to_json_str(), f); + encode_xml_key_value_entry("TopicArn", arn, f); + encode_xml_key_value_entry("OpaqueData", opaque_data, f); + f->close_section(); // Attributes +} + +void encode_json(const char *name, const rgw::notify::EventTypeList& l, Formatter *f) +{ + f->open_array_section(name); + for (auto iter = l.cbegin(); iter != l.cend(); ++iter) { + f->dump_string("obj", rgw::notify::to_string(*iter)); + } + f->close_section(); +} + +void rgw_pubsub_topic_filter::dump(Formatter *f) const +{ + encode_json("topic", topic, f); + encode_json("events", events, f); +} + +void rgw_pubsub_topic_subs::dump(Formatter *f) const +{ + encode_json("topic", topic, f); + encode_json("subs", subs, f); +} + +void rgw_pubsub_bucket_topics::dump(Formatter *f) const +{ + Formatter::ArraySection s(*f, "topics"); + for (auto& t : topics) { + encode_json(t.first.c_str(), t.second, f); + } +} + +void rgw_pubsub_topics::dump(Formatter *f) const +{ + Formatter::ArraySection s(*f, "topics"); + for (auto& t : topics) { + encode_json(t.first.c_str(), t.second, f); + } +} + +void rgw_pubsub_topics::dump_xml(Formatter *f) const +{ + for (auto& t : topics) { + encode_xml("member", t.second.topic, f); + } +} + +void rgw_pubsub_sub_dest::dump(Formatter *f) const +{ + encode_json("bucket_name", bucket_name, f); + encode_json("oid_prefix", oid_prefix, f); + encode_json("push_endpoint", push_endpoint, f); + encode_json("push_endpoint_args", push_endpoint_args, f); + encode_json("push_endpoint_topic", arn_topic, f); + encode_json("stored_secret", stored_secret, f); + encode_json("persistent", persistent, f); +} + +void rgw_pubsub_sub_dest::dump_xml(Formatter *f) const +{ + // first 2 members are omitted here since they + // don't apply to AWS compliant topics + encode_xml("EndpointAddress", push_endpoint, f); + encode_xml("EndpointArgs", push_endpoint_args, f); + encode_xml("EndpointTopic", arn_topic, f); + encode_xml("HasStoredSecret", stored_secret, f); + encode_xml("Persistent", persistent, f); +} + +std::string rgw_pubsub_sub_dest::to_json_str() const +{ + // first 2 members are omitted here since they + // don't apply to AWS compliant topics + JSONFormatter f; + f.open_object_section(""); + encode_json("EndpointAddress", push_endpoint, &f); + encode_json("EndpointArgs", push_endpoint_args, &f); + encode_json("EndpointTopic", arn_topic, &f); + encode_json("HasStoredSecret", stored_secret, &f); + encode_json("Persistent", persistent, &f); + f.close_section(); + std::stringstream ss; + f.flush(ss); + return ss.str(); +} + +void rgw_pubsub_sub_config::dump(Formatter *f) const +{ + encode_json("user", user, f); + encode_json("name", name, f); + encode_json("topic", topic, f); + encode_json("dest", dest, f); + encode_json("s3_id", s3_id, f); +} + +RGWPubSub::RGWPubSub(rgw::sal::RadosStore* _store, const std::string& _tenant) + : store(_store), tenant(_tenant), svc_sysobj(store->svc()->sysobj) +{ + get_meta_obj(&meta_obj); +} + +int RGWPubSub::remove(const DoutPrefixProvider *dpp, + const rgw_raw_obj& obj, + 
RGWObjVersionTracker *objv_tracker, + optional_yield y) +{ + int ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, obj.pool, obj.oid, objv_tracker, y); + if (ret < 0) { + return ret; + } + + return 0; +} + +int RGWPubSub::read_topics(rgw_pubsub_topics *result, RGWObjVersionTracker *objv_tracker) +{ + int ret = read(meta_obj, result, objv_tracker); + if (ret < 0) { + ldout(store->ctx(), 10) << "WARNING: failed to read topics info: ret=" << ret << dendl; + return ret; + } + return 0; +} + +int RGWPubSub::write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_topics& topics, + RGWObjVersionTracker *objv_tracker, optional_yield y) +{ + int ret = write(dpp, meta_obj, topics, objv_tracker, y); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 1) << "ERROR: failed to write topics info: ret=" << ret << dendl; + return ret; + } + return 0; +} + +int RGWPubSub::get_topics(rgw_pubsub_topics *result) +{ + return read_topics(result, nullptr); +} + +int RGWPubSub::Bucket::read_topics(rgw_pubsub_bucket_topics *result, RGWObjVersionTracker *objv_tracker) +{ + int ret = ps->read(bucket_meta_obj, result, objv_tracker); + if (ret < 0 && ret != -ENOENT) { + ldout(ps->store->ctx(), 1) << "ERROR: failed to read bucket topics info: ret=" << ret << dendl; + return ret; + } + return 0; +} + +int RGWPubSub::Bucket::write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_bucket_topics& topics, + RGWObjVersionTracker *objv_tracker, + optional_yield y) +{ + int ret = ps->write(dpp, bucket_meta_obj, topics, objv_tracker, y); + if (ret < 0) { + ldout(ps->store->ctx(), 1) << "ERROR: failed to write bucket topics info: ret=" << ret << dendl; + return ret; + } + + return 0; +} + +int RGWPubSub::Bucket::get_topics(rgw_pubsub_bucket_topics *result) +{ + return read_topics(result, nullptr); +} + +int RGWPubSub::get_topic(const string& name, rgw_pubsub_topic_subs *result) +{ + rgw_pubsub_topics topics; + int ret = get_topics(&topics); + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to read topics info: ret=" << ret << dendl; + return ret; + } + + auto iter = topics.topics.find(name); + if (iter == topics.topics.end()) { + ldout(store->ctx(), 1) << "ERROR: topic not found" << dendl; + return -ENOENT; + } + + *result = iter->second; + return 0; +} + +int RGWPubSub::get_topic(const string& name, rgw_pubsub_topic *result) +{ + rgw_pubsub_topics topics; + int ret = get_topics(&topics); + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to read topics info: ret=" << ret << dendl; + return ret; + } + + auto iter = topics.topics.find(name); + if (iter == topics.topics.end()) { + ldout(store->ctx(), 1) << "ERROR: topic not found" << dendl; + return -ENOENT; + } + + *result = iter->second.topic; + return 0; +} + +int RGWPubSub::Bucket::create_notification(const DoutPrefixProvider *dpp, const string& topic_name, const rgw::notify::EventTypeList& events, optional_yield y) { + return create_notification(dpp, topic_name, events, std::nullopt, "", y); +} + +int RGWPubSub::Bucket::create_notification(const DoutPrefixProvider *dpp, const string& topic_name,const rgw::notify::EventTypeList& events, OptionalFilter s3_filter, const std::string& notif_name, optional_yield y) { + rgw_pubsub_topic_subs topic_info; + + int ret = ps->get_topic(topic_name, &topic_info); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to read topic '" << topic_name << "' info: ret=" << ret << dendl; + return ret; + } + ldpp_dout(dpp, 20) << "successfully read topic '" << topic_name << "' info" << dendl; + + 
RGWObjVersionTracker objv_tracker; + rgw_pubsub_bucket_topics bucket_topics; + + ret = read_topics(&bucket_topics, &objv_tracker); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to read topics from bucket '" << + bucket.name << "': ret=" << ret << dendl; + return ret; + } + ldpp_dout(dpp, 20) << "successfully read " << bucket_topics.topics.size() << " topics from bucket '" << + bucket.name << "'" << dendl; + + auto& topic_filter = bucket_topics.topics[topic_name]; + topic_filter.topic = topic_info.topic; + topic_filter.events = events; + topic_filter.s3_id = notif_name; + if (s3_filter) { + topic_filter.s3_filter = *s3_filter; + } + + ret = write_topics(dpp, bucket_topics, &objv_tracker, y); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to write topics to bucket '" << bucket.name << "': ret=" << ret << dendl; + return ret; + } + + ldpp_dout(dpp, 20) << "successfully wrote " << bucket_topics.topics.size() << " topics to bucket '" << bucket.name << "'" << dendl; + + return 0; +} + +int RGWPubSub::Bucket::remove_notification(const DoutPrefixProvider *dpp, const string& topic_name, optional_yield y) +{ + rgw_pubsub_topic_subs topic_info; + + int ret = ps->get_topic(topic_name, &topic_info); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to read topic info: ret=" << ret << dendl; + return ret; + } + + RGWObjVersionTracker objv_tracker; + rgw_pubsub_bucket_topics bucket_topics; + + ret = read_topics(&bucket_topics, &objv_tracker); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to read bucket topics info: ret=" << ret << dendl; + return ret; + } + + bucket_topics.topics.erase(topic_name); + + if (bucket_topics.topics.empty()) { + // no more topics - delete the notification object of the bucket + ret = ps->remove(dpp, bucket_meta_obj, &objv_tracker, y); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 1) << "ERROR: failed to remove bucket topics: ret=" << ret << dendl; + return ret; + } + return 0; + } + + // write back the notifications without the deleted one + ret = write_topics(dpp, bucket_topics, &objv_tracker, y); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to write topics info: ret=" << ret << dendl; + return ret; + } + + return 0; +} + +int RGWPubSub::Bucket::remove_notifications(const DoutPrefixProvider *dpp, optional_yield y) +{ + // get all topics on a bucket + rgw_pubsub_bucket_topics bucket_topics; + auto ret = get_topics(&bucket_topics); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 1) << "ERROR: failed to get list of topics from bucket '" << bucket.name << "', ret=" << ret << dendl; + return ret; + } + + // remove all auto-generated topics + for (const auto& topic : bucket_topics.topics) { + const auto& topic_name = topic.first; + ret = ps->remove_topic(dpp, topic_name, y); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 5) << "WARNING: failed to remove auto-generated topic '" << topic_name << "', ret=" << ret << dendl; + } + } + + // delete the notification object of the bucket + ret = ps->remove(dpp, bucket_meta_obj, nullptr, y); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 1) << "ERROR: failed to remove bucket topics: ret=" << ret << dendl; + return ret; + } + + return 0; +} + +int RGWPubSub::create_topic(const DoutPrefixProvider *dpp, const string& name, optional_yield y) { + return create_topic(dpp, name, rgw_pubsub_sub_dest(), "", "", y); +} + +int RGWPubSub::create_topic(const DoutPrefixProvider *dpp, const string& name, const rgw_pubsub_sub_dest& dest, const std::string& arn, const std::string& opaque_data, optional_yield y) { + RGWObjVersionTracker objv_tracker; + rgw_pubsub_topics topics; + + int ret = read_topics(&topics, &objv_tracker); + if (ret < 0 && ret != -ENOENT) { + // it's not an error if no topics exist, we create one + ldpp_dout(dpp, 1) << "ERROR: failed to read topics info: ret=" << ret << dendl; + return ret; + } + + rgw_pubsub_topic_subs& new_topic = topics.topics[name]; + new_topic.topic.user = rgw_user("", tenant); + new_topic.topic.name = name; + new_topic.topic.dest = dest; + new_topic.topic.arn = arn; + new_topic.topic.opaque_data = opaque_data; + + ret = write_topics(dpp, topics, &objv_tracker, y); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to write topics info: ret=" << ret << dendl; + return ret; + } + + return 0; +} + +int RGWPubSub::remove_topic(const DoutPrefixProvider *dpp, const string& name, optional_yield y) +{ + RGWObjVersionTracker objv_tracker; + rgw_pubsub_topics topics; + + int ret = read_topics(&topics, &objv_tracker); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 1) << "ERROR: failed to read topics info: ret=" << ret << dendl; + return ret; + } else if (ret == -ENOENT) { + // it's not an error if no topics exist, just a no-op + ldpp_dout(dpp, 10) << "WARNING: failed to read topics info, deletion is a no-op: ret=" << ret << dendl; + return 0; + } + + topics.topics.erase(name); + + ret = write_topics(dpp, topics, &objv_tracker, y); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to remove topics info: ret=" << ret << dendl; + return ret; + } + + return 0; +} + +void RGWPubSub::get_meta_obj(rgw_raw_obj *obj) const { + *obj = rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, meta_oid()); +} + +void RGWPubSub::get_bucket_meta_obj(const rgw_bucket& bucket, rgw_raw_obj *obj) const { + *obj = rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, bucket_meta_oid(bucket)); +} + +void RGWPubSub::get_sub_meta_obj(const string& name, rgw_raw_obj *obj) const { + *obj = rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sub_meta_oid(name)); +} + diff --git a/src/rgw/driver/rados/rgw_pubsub.h b/src/rgw/driver/rados/rgw_pubsub.h new file mode 100644 index 000000000000..08a329e4c02a --- /dev/null +++ b/src/rgw/driver/rados/rgw_pubsub.h @@ -0,0 +1,713 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "services/svc_sys_obj.h" +#include "rgw_tools.h" +#include "rgw_zone.h" +#include "rgw_notify_event_type.h" +#include + +namespace rgw::sal { class RadosStore; } + +class XMLObj; + +struct rgw_s3_key_filter { + std::string prefix_rule; + std::string suffix_rule; + std::string regex_rule; + + bool has_content() const; + + bool decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(prefix_rule, bl); + encode(suffix_rule, bl); + encode(regex_rule, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(prefix_rule, bl); + decode(suffix_rule, bl); + decode(regex_rule, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(rgw_s3_key_filter) + +using KeyValueMap = boost::container::flat_map; +using KeyMultiValueMap = std::multimap; + +struct rgw_s3_key_value_filter { + KeyValueMap kv; + + bool has_content() const; + + bool decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(kv, bl); + ENCODE_FINISH(bl); + } + void 
decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(kv, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(rgw_s3_key_value_filter) + +struct rgw_s3_filter { + rgw_s3_key_filter key_filter; + rgw_s3_key_value_filter metadata_filter; + rgw_s3_key_value_filter tag_filter; + + bool has_content() const; + + bool decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(key_filter, bl); + encode(metadata_filter, bl); + encode(tag_filter, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(key_filter, bl); + decode(metadata_filter, bl); + if (struct_v >= 2) { + decode(tag_filter, bl); + } + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(rgw_s3_filter) + +using OptionalFilter = std::optional; + +struct rgw_pubsub_topic_filter; +/* S3 notification configuration + * based on: https://docs.aws.amazon.com/AmazonS3/latest/API/RESTBucketPUTnotification.html + + + + + + suffix + jpg + + + + + + + + + + + + + + + + notification1 + arn:aws:sns::: + s3:ObjectCreated:* + s3:ObjectRemoved:* + + +*/ +struct rgw_pubsub_s3_notification { + // notification id + std::string id; + // types of events + rgw::notify::EventTypeList events; + // topic ARN + std::string topic_arn; + // filter rules + rgw_s3_filter filter; + + bool decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; + + rgw_pubsub_s3_notification() = default; + // construct from rgw_pubsub_topic_filter (used by get/list notifications) + explicit rgw_pubsub_s3_notification(const rgw_pubsub_topic_filter& topic_filter); +}; + +// return true if the key matches the prefix/suffix/regex rules of the key filter +bool match(const rgw_s3_key_filter& filter, const std::string& key); + +// return true if the key matches the metadata rules of the metadata filter +bool match(const rgw_s3_key_value_filter& filter, const KeyValueMap& kv); + +// return true if the key matches the tag rules of the tag filter +bool match(const rgw_s3_key_value_filter& filter, const KeyMultiValueMap& kv); + +// return true if the event type matches (equal or contained in) one of the events in the list +bool match(const rgw::notify::EventTypeList& events, rgw::notify::EventType event); + +struct rgw_pubsub_s3_notifications { + std::list list; + bool decode_xml(XMLObj *obj); + void dump_xml(Formatter *f) const; +}; + +/* S3 event records structure + * based on: https://docs.aws.amazon.com/AmazonS3/latest/dev/notification-content-structure.html +{ +"Records":[ + { + "eventVersion":"" + "eventSource":"", + "awsRegion":"", + "eventTime":"", + "eventName":"", + "userIdentity":{ + "principalId":"" + }, + "requestParameters":{ + "sourceIPAddress":"" + }, + "responseElements":{ + "x-amz-request-id":"", + "x-amz-id-2":"" + }, + "s3":{ + "s3SchemaVersion":"1.0", + "configurationId":"", + "bucket":{ + "name":"", + "ownerIdentity":{ + "principalId":"" + }, + "arn":"" + "id": "" + }, + "object":{ + "key":"", + "size": , + "eTag":"", + "versionId":"", + "sequencer": "", + "metadata": "" + "tags": "" + } + }, + "eventId":"", + } +] +}*/ + +struct rgw_pubsub_s3_event { + constexpr static const char* const json_type_plural = "Records"; + std::string eventVersion = "2.2"; + // aws:s3 + std::string eventSource = "ceph:s3"; + // zonegroup + std::string awsRegion; + // time of the request + ceph::real_time eventTime; + // type of the event + std::string eventName; + // user that sent the request + std::string userIdentity; + // IP address 
of source of the request (not implemented) + std::string sourceIPAddress; + // request ID (not implemented) + std::string x_amz_request_id; + // radosgw that received the request + std::string x_amz_id_2; + std::string s3SchemaVersion = "1.0"; + // ID received in the notification request + std::string configurationId; + // bucket name + std::string bucket_name; + // bucket owner + std::string bucket_ownerIdentity; + // bucket ARN + std::string bucket_arn; + // object key + std::string object_key; + // object size + uint64_t object_size = 0; + // object etag + std::string object_etag; + // object version id bucket is versioned + std::string object_versionId; + // hexadecimal value used to determine event order for specific key + std::string object_sequencer; + // this is an rgw extension (not S3 standard) + // used to store a globally unique identifier of the event + // that could be used for acking or any other identification of the event + std::string id; + // this is an rgw extension holding the internal bucket id + std::string bucket_id; + // meta data + KeyValueMap x_meta_map; + // tags + KeyMultiValueMap tags; + // opaque data received from the topic + // could be used to identify the gateway + std::string opaque_data; + + void encode(bufferlist& bl) const { + ENCODE_START(4, 1, bl); + encode(eventVersion, bl); + encode(eventSource, bl); + encode(awsRegion, bl); + encode(eventTime, bl); + encode(eventName, bl); + encode(userIdentity, bl); + encode(sourceIPAddress, bl); + encode(x_amz_request_id, bl); + encode(x_amz_id_2, bl); + encode(s3SchemaVersion, bl); + encode(configurationId, bl); + encode(bucket_name, bl); + encode(bucket_ownerIdentity, bl); + encode(bucket_arn, bl); + encode(object_key, bl); + encode(object_size, bl); + encode(object_etag, bl); + encode(object_versionId, bl); + encode(object_sequencer, bl); + encode(id, bl); + encode(bucket_id, bl); + encode(x_meta_map, bl); + encode(tags, bl); + encode(opaque_data, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(4, bl); + decode(eventVersion, bl); + decode(eventSource, bl); + decode(awsRegion, bl); + decode(eventTime, bl); + decode(eventName, bl); + decode(userIdentity, bl); + decode(sourceIPAddress, bl); + decode(x_amz_request_id, bl); + decode(x_amz_id_2, bl); + decode(s3SchemaVersion, bl); + decode(configurationId, bl); + decode(bucket_name, bl); + decode(bucket_ownerIdentity, bl); + decode(bucket_arn, bl); + decode(object_key, bl); + decode(object_size, bl); + decode(object_etag, bl); + decode(object_versionId, bl); + decode(object_sequencer, bl); + decode(id, bl); + if (struct_v >= 2) { + decode(bucket_id, bl); + decode(x_meta_map, bl); + } + if (struct_v >= 3) { + decode(tags, bl); + } + if (struct_v >= 4) { + decode(opaque_data, bl); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_pubsub_s3_event) + +// setting a unique ID for an event based on object hash and timestamp +void set_event_id(std::string& id, const std::string& hash, const utime_t& ts); + +struct rgw_pubsub_sub_dest { + std::string bucket_name; + std::string oid_prefix; + std::string push_endpoint; + std::string push_endpoint_args; + std::string arn_topic; + bool stored_secret = false; + bool persistent = false; + + void encode(bufferlist& bl) const { + ENCODE_START(5, 1, bl); + encode(bucket_name, bl); + encode(oid_prefix, bl); + encode(push_endpoint, bl); + encode(push_endpoint_args, bl); + encode(arn_topic, bl); + encode(stored_secret, bl); + encode(persistent, 
bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(5, bl); + decode(bucket_name, bl); + decode(oid_prefix, bl); + decode(push_endpoint, bl); + if (struct_v >= 2) { + decode(push_endpoint_args, bl); + } + if (struct_v >= 3) { + decode(arn_topic, bl); + } + if (struct_v >= 4) { + decode(stored_secret, bl); + } + if (struct_v >= 5) { + decode(persistent, bl); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void dump_xml(Formatter *f) const; + std::string to_json_str() const; +}; +WRITE_CLASS_ENCODER(rgw_pubsub_sub_dest) + +struct rgw_pubsub_sub_config { + rgw_user user; + std::string name; + std::string topic; + rgw_pubsub_sub_dest dest; + std::string s3_id; + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(user, bl); + encode(name, bl); + encode(topic, bl); + encode(dest, bl); + encode(s3_id, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(2, bl); + decode(user, bl); + decode(name, bl); + decode(topic, bl); + decode(dest, bl); + if (struct_v >= 2) { + decode(s3_id, bl); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_pubsub_sub_config) + +struct rgw_pubsub_topic { + rgw_user user; + std::string name; + rgw_pubsub_sub_dest dest; + std::string arn; + std::string opaque_data; + + void encode(bufferlist& bl) const { + ENCODE_START(3, 1, bl); + encode(user, bl); + encode(name, bl); + encode(dest, bl); + encode(arn, bl); + encode(opaque_data, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(3, bl); + decode(user, bl); + decode(name, bl); + if (struct_v >= 2) { + decode(dest, bl); + decode(arn, bl); + } + if (struct_v >= 3) { + decode(opaque_data, bl); + } + DECODE_FINISH(bl); + } + + std::string to_str() const { + return user.tenant + "/" + name; + } + + void dump(Formatter *f) const; + void dump_xml(Formatter *f) const; + void dump_xml_as_attributes(Formatter *f) const; + + bool operator<(const rgw_pubsub_topic& t) const { + return to_str().compare(t.to_str()) < 0; + } +}; +WRITE_CLASS_ENCODER(rgw_pubsub_topic) + +struct rgw_pubsub_topic_subs { + rgw_pubsub_topic topic; + std::set subs; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(topic, bl); + encode(subs, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(topic, bl); + decode(subs, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_pubsub_topic_subs) + +struct rgw_pubsub_topic_filter { + rgw_pubsub_topic topic; + rgw::notify::EventTypeList events; + std::string s3_id; + rgw_s3_filter s3_filter; + + void encode(bufferlist& bl) const { + ENCODE_START(3, 1, bl); + encode(topic, bl); + // events are stored as a vector of std::strings + std::vector tmp_events; + std::transform(events.begin(), events.end(), std::back_inserter(tmp_events), rgw::notify::to_string); + encode(tmp_events, bl); + encode(s3_id, bl); + encode(s3_filter, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(3, bl); + decode(topic, bl); + // events are stored as a vector of std::strings + events.clear(); + std::vector tmp_events; + decode(tmp_events, bl); + std::transform(tmp_events.begin(), tmp_events.end(), std::back_inserter(events), rgw::notify::from_string); + if (struct_v >= 2) { + decode(s3_id, bl); + } + if (struct_v >= 3) { + decode(s3_filter, bl); + } + DECODE_FINISH(bl); + } + 
+ void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_pubsub_topic_filter) + +struct rgw_pubsub_bucket_topics { + std::map topics; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(topics, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(topics, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_pubsub_bucket_topics) + +struct rgw_pubsub_topics { + std::map topics; + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(topics, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(topics, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + void dump_xml(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(rgw_pubsub_topics) + +static std::string pubsub_oid_prefix = "pubsub."; + +class RGWPubSub +{ + friend class Bucket; + + rgw::sal::RadosStore* store; + const std::string tenant; + RGWSI_SysObj* svc_sysobj; + + rgw_raw_obj meta_obj; + + std::string meta_oid() const { + return pubsub_oid_prefix + tenant; + } + + std::string bucket_meta_oid(const rgw_bucket& bucket) const { + return pubsub_oid_prefix + tenant + ".bucket." + bucket.name + "/" + bucket.marker; + } + + std::string sub_meta_oid(const std::string& name) const { + return pubsub_oid_prefix + tenant + ".sub." + name; + } + + template + int read(const rgw_raw_obj& obj, T* data, RGWObjVersionTracker* objv_tracker); + + template + int write(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, const T& info, + RGWObjVersionTracker* obj_tracker, optional_yield y); + + int remove(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, RGWObjVersionTracker* objv_tracker, + optional_yield y); + + int read_topics(rgw_pubsub_topics *result, RGWObjVersionTracker* objv_tracker); + int write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_topics& topics, + RGWObjVersionTracker* objv_tracker, optional_yield y); + +public: + RGWPubSub(rgw::sal::RadosStore* _store, const std::string& tenant); + + class Bucket { + friend class RGWPubSub; + RGWPubSub *ps; + rgw_bucket bucket; + rgw_raw_obj bucket_meta_obj; + + // read the list of topics associated with a bucket and populate into result + // use version tracker to enforce atomicity between read/write + // return 0 on success or if no topic was associated with the bucket, error code otherwise + int read_topics(rgw_pubsub_bucket_topics *result, RGWObjVersionTracker* objv_tracker); + // set the list of topics associated with a bucket + // use version tracker to enforce atomicity between read/write + // return 0 on success, error code otherwise + int write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_bucket_topics& topics, + RGWObjVersionTracker* objv_tracker, optional_yield y); + public: + Bucket(RGWPubSub *_ps, const rgw_bucket& _bucket) : ps(_ps), bucket(_bucket) { + ps->get_bucket_meta_obj(bucket, &bucket_meta_obj); + } + + // read the list of topics associated with a bucket and populate into result + // return 0 on success or if no topic was associated with the bucket, error code otherwise + int get_topics(rgw_pubsub_bucket_topics *result); + // adds a topic + filter (event list, and possibly name metadata or tags filters) to a bucket + // assigning a notification name is optional (needed for S3 compatible notifications) + // if the topic already exists on the bucket, the filter event list may be updated + // for S3 compliant notifications the version with: s3_filter and notif_name should be used + // return -ENOENT if the topic does not exist + // return 0 on success, error code otherwise + int create_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, const rgw::notify::EventTypeList& events, optional_yield y); + int create_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, const rgw::notify::EventTypeList& events, OptionalFilter s3_filter, const std::string& notif_name, optional_yield y); + // remove a topic and filter from bucket + // if the topic does not exist on the bucket it is a no-op (considered success) + // return -ENOENT if the topic does not exist + // return 0 on success, error code otherwise + int remove_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, optional_yield y); + // remove all notifications (and autogenerated topics) associated with the bucket + // return 0 on success or if no topic was associated with the bucket, error code otherwise + int remove_notifications(const DoutPrefixProvider *dpp, optional_yield y); + }; + + using BucketRef = std::shared_ptr; + + BucketRef get_bucket(const rgw_bucket& bucket) { + return std::make_shared(this, bucket); + } + + void get_meta_obj(rgw_raw_obj *obj) const; + void get_bucket_meta_obj(const rgw_bucket& bucket, rgw_raw_obj *obj) const; + + void get_sub_meta_obj(const std::string& name, rgw_raw_obj *obj) const; + + // get all topics (per tenant, if used) and populate them into "result" + // return 0 on success or if no topics exist, error code otherwise + int get_topics(rgw_pubsub_topics *result); + // get a topic with its subscriptions by its name and populate it into "result" + // return -ENOENT if the topic does not exist + // return 0 on success, error code otherwise + int get_topic(const std::string& name, rgw_pubsub_topic_subs *result); + // get a topic by its name and populate it into "result" + // return -ENOENT if the topic does not exist + // return 0 on success, error code otherwise + int get_topic(const std::string& name, rgw_pubsub_topic *result); + // create a topic with a name only + // if the topic already exists it is a no-op (considered success) + // return 0 on success, error code otherwise + int create_topic(const DoutPrefixProvider *dpp, const std::string& name, optional_yield y); + // create a topic with push destination information and ARN + // if the topic already exists the destination and ARN values may be updated (considered success) + // return 0 on success, error code otherwise + int create_topic(const DoutPrefixProvider *dpp, const std::string& name, const rgw_pubsub_sub_dest& dest, const std::string& arn, const std::string& opaque_data, optional_yield y); + // remove a topic according to its name + // if the topic does not exist it is a no-op (considered success) + // return 0 on success, error code otherwise + int remove_topic(const DoutPrefixProvider *dpp, const std::string& name, optional_yield y); +}; + + +template +int RGWPubSub::read(const rgw_raw_obj& obj, T* result, RGWObjVersionTracker* objv_tracker) +{ + bufferlist bl; + int ret = rgw_get_system_obj(svc_sysobj, + obj.pool, obj.oid, + bl, + objv_tracker, + nullptr, null_yield, nullptr, nullptr); + if (ret < 0) { + return ret; + } + + auto iter = bl.cbegin(); + try { + decode(*result, iter); + } catch (buffer::error& err) { + return -EIO; + } + + return 0; +} + +template +int RGWPubSub::write(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, const T& info, + RGWObjVersionTracker* objv_tracker, 
optional_yield y) +{ + bufferlist bl; + encode(info, bl); + + return rgw_put_system_obj(dpp, svc_sysobj, obj.pool, obj.oid, + bl, false, objv_tracker, real_time(), y); +} diff --git a/src/rgw/driver/rados/rgw_pubsub_push.cc b/src/rgw/driver/rados/rgw_pubsub_push.cc new file mode 100644 index 000000000000..2f734c21df83 --- /dev/null +++ b/src/rgw/driver/rados/rgw_pubsub_push.cc @@ -0,0 +1,463 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_pubsub_push.h" +#include +#include +#include +#include "include/buffer_fwd.h" +#include "common/Formatter.h" +#include "common/iso_8601.h" +#include "common/async/completion.h" +#include "rgw_common.h" +#include "rgw_data_sync.h" +#include "rgw_pubsub.h" +#include "acconfig.h" +#ifdef WITH_RADOSGW_AMQP_ENDPOINT +#include "rgw_amqp.h" +#endif +#ifdef WITH_RADOSGW_KAFKA_ENDPOINT +#include "rgw_kafka.h" +#endif +#include +#include +#include +#include "rgw_perf_counters.h" + +using namespace rgw; + +template +std::string json_format_pubsub_event(const EventType& event) { + std::stringstream ss; + JSONFormatter f(false); + { + Formatter::ObjectSection s(f, EventType::json_type_plural); + { + Formatter::ArraySection s(f, EventType::json_type_plural); + encode_json("", event, &f); + } + } + f.flush(ss); + return ss.str(); +} + +bool get_bool(const RGWHTTPArgs& args, const std::string& name, bool default_value) { + bool value; + bool exists; + if (args.get_bool(name.c_str(), &value, &exists) == -EINVAL) { + throw RGWPubSubEndpoint::configuration_error("invalid boolean value for " + name); + } + if (!exists) { + return default_value; + } + return value; +} + +class RGWPubSubHTTPEndpoint : public RGWPubSubEndpoint { +private: + const std::string endpoint; + typedef unsigned ack_level_t; + ack_level_t ack_level; // TODO: not used for now + const bool verify_ssl; + const bool cloudevents; + static const ack_level_t ACK_LEVEL_ANY = 0; + static const ack_level_t ACK_LEVEL_NON_ERROR = 1; + +public: + RGWPubSubHTTPEndpoint(const std::string& _endpoint, const RGWHTTPArgs& args) : + endpoint(_endpoint), verify_ssl(get_bool(args, "verify-ssl", true)), cloudevents(get_bool(args, "cloudevents", false)) + { + bool exists; + const auto& str_ack_level = args.get("http-ack-level", &exists); + if (!exists || str_ack_level == "any") { + // "any" is default + ack_level = ACK_LEVEL_ANY; + } else if (str_ack_level == "non-error") { + ack_level = ACK_LEVEL_NON_ERROR; + } else { + ack_level = std::atoi(str_ack_level.c_str()); + if (ack_level < 100 || ack_level >= 600) { + throw configuration_error("HTTP/S: invalid http-ack-level: " + str_ack_level); + } + } + } + + int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override { + bufferlist read_bl; + RGWPostHTTPData request(cct, "POST", endpoint, &read_bl, verify_ssl); + const auto post_data = json_format_pubsub_event(event); + if (cloudevents) { + // following: https://github.com/cloudevents/spec/blob/v1.0.1/http-protocol-binding.md + // using "Binary Content Mode" + request.append_header("ce-specversion", "1.0"); + request.append_header("ce-type", "com.amazonaws." + event.eventName); + request.append_header("ce-time", to_iso_8601(event.eventTime)); + // default output of iso8601 is also RFC3339 compatible + request.append_header("ce-id", event.x_amz_request_id + "." + event.x_amz_id_2); + request.append_header("ce-source", event.eventSource + "." + event.awsRegion + "." 
+ event.bucket_name); + request.append_header("ce-subject", event.object_key); + } + request.set_post_data(post_data); + request.set_send_length(post_data.length()); + request.append_header("Content-Type", "application/json"); + if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_pending); + const auto rc = RGWHTTP::process(&request, y); + if (perfcounter) perfcounter->dec(l_rgw_pubsub_push_pending); + // TODO: use read_bl to process return code and handle according to ack level + return rc; + } + + std::string to_str() const override { + std::string str("HTTP/S Endpoint"); + str += "\nURI: " + endpoint; + str += (verify_ssl ? "\nverify SSL" : "\ndon't verify SSL"); + return str; + } +}; + +#ifdef WITH_RADOSGW_AMQP_ENDPOINT +class RGWPubSubAMQPEndpoint : public RGWPubSubEndpoint { +private: + enum class ack_level_t { + None, + Broker, + Routable + }; + CephContext* const cct; + const std::string endpoint; + const std::string topic; + const std::string exchange; + ack_level_t ack_level; + amqp::connection_ptr_t conn; + + bool get_verify_ssl(const RGWHTTPArgs& args) { + bool exists; + auto str_verify_ssl = args.get("verify-ssl", &exists); + if (!exists) { + // verify server certificate by default + return true; + } + boost::algorithm::to_lower(str_verify_ssl); + if (str_verify_ssl == "true") { + return true; + } + if (str_verify_ssl == "false") { + return false; + } + throw configuration_error("'verify-ssl' must be true/false, not: " + str_verify_ssl); + } + + std::string get_exchange(const RGWHTTPArgs& args) { + bool exists; + const auto exchange = args.get("amqp-exchange", &exists); + if (!exists) { + throw configuration_error("AMQP: missing amqp-exchange"); + } + return exchange; + } + + ack_level_t get_ack_level(const RGWHTTPArgs& args) { + bool exists; + const auto& str_ack_level = args.get("amqp-ack-level", &exists); + if (!exists || str_ack_level == "broker") { + // "broker" is default + return ack_level_t::Broker; + } + if (str_ack_level == "none") { + return ack_level_t::None; + } + if (str_ack_level == "routable") { + return ack_level_t::Routable; + } + throw configuration_error("AMQP: invalid amqp-ack-level: " + str_ack_level); + } + +public: + RGWPubSubAMQPEndpoint(const std::string& _endpoint, + const std::string& _topic, + const RGWHTTPArgs& args, + CephContext* _cct) : + cct(_cct), + endpoint(_endpoint), + topic(_topic), + exchange(get_exchange(args)), + ack_level(get_ack_level(args)), + conn(amqp::connect(endpoint, exchange, (ack_level == ack_level_t::Broker), get_verify_ssl(args), args.get_optional("ca-location"))) { + if (!conn) { + throw configuration_error("AMQP: failed to create connection to: " + endpoint); + } + } + + // this allows waiting until "finish()" is called from a different thread + // waiting could be blocking the waiting thread or yielding, depending + // on compilation flag support and whether the optional_yield is set + class Waiter { + using Signature = void(boost::system::error_code); + using Completion = ceph::async::Completion; + std::unique_ptr completion = nullptr; + int ret; + + mutable std::atomic done = false; + mutable std::mutex lock; + mutable std::condition_variable cond; + + template + auto async_wait(ExecutionContext& ctx, CompletionToken&& token) { + boost::asio::async_completion init(token); + auto& handler = init.completion_handler; + { + std::unique_lock l{lock}; + completion = Completion::create(ctx.get_executor(), std::move(handler)); + } + return init.result.get(); + } + + public: + int wait(optional_yield y) { + if (done) { + 
return ret; + } + if (y) { + auto& io_ctx = y.get_io_context(); + auto& yield_ctx = y.get_yield_context(); + boost::system::error_code ec; + async_wait(io_ctx, yield_ctx[ec]); + return -ec.value(); + } + std::unique_lock l(lock); + cond.wait(l, [this]{return (done==true);}); + return ret; + } + + void finish(int r) { + std::unique_lock l{lock}; + ret = r; + done = true; + if (completion) { + boost::system::error_code ec(-ret, boost::system::system_category()); + Completion::post(std::move(completion), ec); + } else { + cond.notify_all(); + } + } + }; + + int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override { + ceph_assert(conn); + if (ack_level == ack_level_t::None) { + return amqp::publish(conn, topic, json_format_pubsub_event(event)); + } else { + // TODO: currently broker and routable are the same - this will require different flags but the same mechanism + // note: dynamic allocation of Waiter is needed when this is invoked from a beast coroutine + auto w = std::unique_ptr<Waiter>(new Waiter); + const auto rc = amqp::publish_with_confirm(conn, + topic, + json_format_pubsub_event(event), + std::bind(&Waiter::finish, w.get(), std::placeholders::_1)); + if (rc < 0) { + // failed to publish, does not wait for reply + return rc; + } + return w->wait(y); + } + } + + std::string to_str() const override { + std::string str("AMQP(0.9.1) Endpoint"); + str += "\nURI: " + endpoint; + str += "\nTopic: " + topic; + str += "\nExchange: " + exchange; + return str; + } +}; + +static const std::string AMQP_0_9_1("0-9-1"); +static const std::string AMQP_1_0("1-0"); +static const std::string AMQP_SCHEMA("amqp"); +#endif // ifdef WITH_RADOSGW_AMQP_ENDPOINT + +#ifdef WITH_RADOSGW_KAFKA_ENDPOINT +class RGWPubSubKafkaEndpoint : public RGWPubSubEndpoint { +private: + enum class ack_level_t { + None, + Broker, + }; + CephContext* const cct; + const std::string topic; + kafka::connection_ptr_t conn; + const ack_level_t ack_level; + + + ack_level_t get_ack_level(const RGWHTTPArgs& args) { + bool exists; + const auto& str_ack_level = args.get("kafka-ack-level", &exists); + if (!exists || str_ack_level == "broker") { + // "broker" is default + return ack_level_t::Broker; + } + if (str_ack_level == "none") { + return ack_level_t::None; + } + throw configuration_error("Kafka: invalid kafka-ack-level: " + str_ack_level); + } + +public: + RGWPubSubKafkaEndpoint(const std::string& _endpoint, + const std::string& _topic, + const RGWHTTPArgs& args, + CephContext* _cct) : + cct(_cct), + topic(_topic), + conn(kafka::connect(_endpoint, get_bool(args, "use-ssl", false), get_bool(args, "verify-ssl", true), args.get_optional("ca-location"))), + ack_level(get_ack_level(args)) { + if (!conn) { + throw configuration_error("Kafka: failed to create connection to: " + _endpoint); + } + } + + // this allows waiting until "finish()" is called from a different thread + // waiting could block the waiting thread or yield, depending + // on compilation flag support and whether the optional_yield is set + class Waiter { + using Signature = void(boost::system::error_code); + using Completion = ceph::async::Completion<Signature>; + std::unique_ptr<Completion> completion = nullptr; + int ret; + + mutable std::atomic<bool> done = false; + mutable std::mutex lock; + mutable std::condition_variable cond; + + template <typename ExecutionContext, typename CompletionToken> + auto async_wait(ExecutionContext& ctx, CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, Signature> init(token); + auto& handler = init.completion_handler; + { + std::unique_lock l{lock}; + completion =
Completion::create(ctx.get_executor(), std::move(handler)); + } + return init.result.get(); + } + + public: + int wait(optional_yield y) { + if (done) { + return ret; + } + if (y) { + auto& io_ctx = y.get_io_context(); + auto& yield_ctx = y.get_yield_context(); + boost::system::error_code ec; + async_wait(io_ctx, yield_ctx[ec]); + return -ec.value(); + } + std::unique_lock l(lock); + cond.wait(l, [this]{return (done==true);}); + return ret; + } + + void finish(int r) { + std::unique_lock l{lock}; + ret = r; + done = true; + if (completion) { + boost::system::error_code ec(-ret, boost::system::system_category()); + Completion::post(std::move(completion), ec); + } else { + cond.notify_all(); + } + } + }; + + int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override { + ceph_assert(conn); + if (ack_level == ack_level_t::None) { + return kafka::publish(conn, topic, json_format_pubsub_event(event)); + } else { + // note: dynamic allocation of Waiter is needed when this is invoked from a beast coroutine + auto w = std::unique_ptr(new Waiter); + const auto rc = kafka::publish_with_confirm(conn, + topic, + json_format_pubsub_event(event), + std::bind(&Waiter::finish, w.get(), std::placeholders::_1)); + if (rc < 0) { + // failed to publish, does not wait for reply + return rc; + } + return w->wait(y); + } + } + + std::string to_str() const override { + std::string str("Kafka Endpoint"); + str += kafka::to_string(conn); + str += "\nTopic: " + topic; + return str; + } +}; + +static const std::string KAFKA_SCHEMA("kafka"); +#endif // ifdef WITH_RADOSGW_KAFKA_ENDPOINT + +static const std::string WEBHOOK_SCHEMA("webhook"); +static const std::string UNKNOWN_SCHEMA("unknown"); +static const std::string NO_SCHEMA(""); + +const std::string& get_schema(const std::string& endpoint) { + if (endpoint.empty()) { + return NO_SCHEMA; + } + const auto pos = endpoint.find(':'); + if (pos == std::string::npos) { + return UNKNOWN_SCHEMA; + } + const auto& schema = endpoint.substr(0,pos); + if (schema == "http" || schema == "https") { + return WEBHOOK_SCHEMA; +#ifdef WITH_RADOSGW_AMQP_ENDPOINT + } else if (schema == "amqp" || schema == "amqps") { + return AMQP_SCHEMA; +#endif +#ifdef WITH_RADOSGW_KAFKA_ENDPOINT + } else if (schema == "kafka") { + return KAFKA_SCHEMA; +#endif + } + return UNKNOWN_SCHEMA; +} + +RGWPubSubEndpoint::Ptr RGWPubSubEndpoint::create(const std::string& endpoint, + const std::string& topic, + const RGWHTTPArgs& args, + CephContext* cct) { + const auto& schema = get_schema(endpoint); + if (schema == WEBHOOK_SCHEMA) { + return Ptr(new RGWPubSubHTTPEndpoint(endpoint, args)); +#ifdef WITH_RADOSGW_AMQP_ENDPOINT + } else if (schema == AMQP_SCHEMA) { + bool exists; + std::string version = args.get("amqp-version", &exists); + if (!exists) { + version = AMQP_0_9_1; + } + if (version == AMQP_0_9_1) { + return Ptr(new RGWPubSubAMQPEndpoint(endpoint, topic, args, cct)); + } else if (version == AMQP_1_0) { + throw configuration_error("AMQP: v1.0 not supported"); + return nullptr; + } else { + throw configuration_error("AMQP: unknown version: " + version); + return nullptr; + } +#endif +#ifdef WITH_RADOSGW_KAFKA_ENDPOINT + } else if (schema == KAFKA_SCHEMA) { + return Ptr(new RGWPubSubKafkaEndpoint(endpoint, topic, args, cct)); +#endif + } + + throw configuration_error("unknown schema in: " + endpoint); + return nullptr; +} + diff --git a/src/rgw/driver/rados/rgw_pubsub_push.h b/src/rgw/driver/rados/rgw_pubsub_push.h new file mode 100644 index 
000000000000..17905937c035 --- /dev/null +++ b/src/rgw/driver/rados/rgw_pubsub_push.h @@ -0,0 +1,47 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp +#pragma once + +#include <string> +#include <memory> +#include <stdexcept> +#include "include/buffer_fwd.h" +#include "include/common_fwd.h" +#include "common/async/yield_context.h" + +// TODO the env should be used as a template parameter to differentiate the source that triggers the pushes +class RGWDataSyncEnv; +class RGWHTTPArgs; +struct rgw_pubsub_s3_event; + +// endpoint base class; all endpoint types should derive from it +class RGWPubSubEndpoint { +public: + RGWPubSubEndpoint() = default; + // endpoint should not be copied + RGWPubSubEndpoint(const RGWPubSubEndpoint&) = delete; + const RGWPubSubEndpoint& operator=(const RGWPubSubEndpoint&) = delete; + + typedef std::unique_ptr<RGWPubSubEndpoint> Ptr; + + // factory method for the actual notification endpoint + // derived-class-specific arguments are passed in http args format + // may throw a configuration_error if creation fails + static Ptr create(const std::string& endpoint, const std::string& topic, const RGWHTTPArgs& args, CephContext *cct=nullptr); + + // this method is used to send a notification (S3 compliant) and wait for completion + // in an async manner via a coroutine when invoked in the frontend environment + virtual int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) = 0; + + // present as a string + virtual std::string to_str() const { return ""; } + + virtual ~RGWPubSubEndpoint() = default; + + // exception object for configuration error + struct configuration_error : public std::logic_error { + configuration_error(const std::string& what_arg) : + std::logic_error("pubsub endpoint configuration error: " + what_arg) {} + }; +}; + diff --git a/src/rgw/driver/rados/rgw_putobj_processor.cc b/src/rgw/driver/rados/rgw_putobj_processor.cc new file mode 100644 index 000000000000..8a6a157018ef --- /dev/null +++ b/src/rgw/driver/rados/rgw_putobj_processor.cc @@ -0,0 +1,704 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING.
+ * + */ + +#include "rgw_aio.h" +#include "rgw_putobj_processor.h" +#include "rgw_multi.h" +#include "rgw_compression.h" +#include "services/svc_sys_obj.h" +#include "services/svc_zone.h" +#include "rgw_sal_rados.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +namespace rgw::putobj { + +int HeadObjectProcessor::process(bufferlist&& data, uint64_t logical_offset) +{ + const bool flush = (data.length() == 0); + + // capture the first chunk for special handling + if (data_offset < head_chunk_size || data_offset == 0) { + if (flush) { + // flush partial chunk + return process_first_chunk(std::move(head_data), &processor); + } + + auto remaining = head_chunk_size - data_offset; + auto count = std::min(data.length(), remaining); + data.splice(0, count, &head_data); + data_offset += count; + + if (data_offset == head_chunk_size) { + // process the first complete chunk + ceph_assert(head_data.length() == head_chunk_size); + int r = process_first_chunk(std::move(head_data), &processor); + if (r < 0) { + return r; + } + } + if (data.length() == 0) { // avoid flushing stripe processor + return 0; + } + } + ceph_assert(processor); // process_first_chunk() must initialize + + // send everything else through the processor + auto write_offset = data_offset; + data_offset += data.length(); + return processor->process(std::move(data), write_offset); +} + + +static int process_completed(const AioResultList& completed, RawObjSet *written) +{ + std::optional error; + for (auto& r : completed) { + if (r.result >= 0) { + written->insert(r.obj.get_ref().obj); + } else if (!error) { // record first error code + error = r.result; + } + } + return error.value_or(0); +} + +void RadosWriter::add_write_hint(librados::ObjectWriteOperation& op) { + const rgw_obj obj = head_obj->get_obj(); + const RGWObjStateManifest *sm = obj_ctx.get_state(obj); + const bool compressed = sm->state.compressed; + uint32_t alloc_hint_flags = 0; + if (compressed) { + alloc_hint_flags |= librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE; + } + + op.set_alloc_hint2(0, 0, alloc_hint_flags); +} + +int RadosWriter::set_stripe_obj(const rgw_raw_obj& raw_obj) +{ + stripe_obj = store->svc()->rados->obj(raw_obj); + return stripe_obj.open(dpp); +} + +int RadosWriter::process(bufferlist&& bl, uint64_t offset) +{ + bufferlist data = std::move(bl); + const uint64_t cost = data.length(); + if (cost == 0) { // no empty writes, use aio directly for creates + return 0; + } + librados::ObjectWriteOperation op; + add_write_hint(op); + if (offset == 0) { + op.write_full(data); + } else { + op.write(offset, data); + } + constexpr uint64_t id = 0; // unused + auto c = aio->get(stripe_obj, Aio::librados_op(std::move(op), y), cost, id); + return process_completed(c, &written); +} + +int RadosWriter::write_exclusive(const bufferlist& data) +{ + const uint64_t cost = data.length(); + + librados::ObjectWriteOperation op; + op.create(true); // exclusive create + add_write_hint(op); + op.write_full(data); + + constexpr uint64_t id = 0; // unused + auto c = aio->get(stripe_obj, Aio::librados_op(std::move(op), y), cost, id); + auto d = aio->drain(); + c.splice(c.end(), d); + return process_completed(c, &written); +} + +int RadosWriter::drain() +{ + return process_completed(aio->drain(), &written); +} + +RadosWriter::~RadosWriter() +{ + // wait on any outstanding aio completions + process_completed(aio->drain(), &written); + + bool need_to_remove_head = false; + std::optional raw_head; + if (!rgw::sal::Object::empty(head_obj.get())) { + raw_head.emplace(); + 
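+ // resolve the raw object that backs the head, so the cleanup loop
+ // below can recognize and skip it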
rgw::sal::RadosObject* obj = dynamic_cast<rgw::sal::RadosObject*>(head_obj.get()); + obj->get_raw_obj(&*raw_head); + } + + /** + * We should delete the object in the "multipart" namespace to avoid a race condition. + * Such a race condition is caused by the fact that the multipart object is the gatekeeper of a multipart + * upload; when it is deleted, a second upload would start with the same suffix ("2/"), and therefore objects + * written by the second upload may be deleted by the first upload. + * details are described in #11749 + * + * The above comment still stands, but instead of searching for a specific object in the multipart + * namespace, we just make sure that we remove the object that is marked as the head object after + * we remove all the other raw objects. Note that we use a different call to remove the head object, + * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme. + */ + for (const auto& obj : written) { + if (raw_head && obj == *raw_head) { + ldpp_dout(dpp, 5) << "NOTE: we should not process the head object (" << obj << ") here" << dendl; + need_to_remove_head = true; + continue; + } + + int r = store->delete_raw_obj(dpp, obj); + if (r < 0 && r != -ENOENT) { + ldpp_dout(dpp, 0) << "WARNING: failed to remove obj (" << obj << "), leaked" << dendl; + } + } + + if (need_to_remove_head) { + std::string version_id; + ldpp_dout(dpp, 5) << "NOTE: we are going to process the head obj (" << *raw_head << ")" << dendl; + int r = head_obj->delete_object(dpp, null_yield); + if (r < 0 && r != -ENOENT) { + ldpp_dout(dpp, 0) << "WARNING: failed to remove obj (" << *raw_head << "), leaked" << dendl; + } + } +} + + +// advance to the next stripe +int ManifestObjectProcessor::next(uint64_t offset, uint64_t *pstripe_size) +{ + // advance the manifest + int r = manifest_gen.create_next(offset); + if (r < 0) { + return r; + } + + rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store); + + uint64_t chunk_size = 0; + r = store->get_raw_chunk_size(dpp, stripe_obj, &chunk_size); + if (r < 0) { + return r; + } + r = writer.set_stripe_obj(stripe_obj); + if (r < 0) { + return r; + } + + chunk = ChunkProcessor(&writer, chunk_size); + *pstripe_size = manifest_gen.cur_stripe_max_size(); + return 0; +} + + + +int AtomicObjectProcessor::process_first_chunk(bufferlist&& data, + DataProcessor **processor) +{ + first_chunk = std::move(data); + *processor = &stripe; + return 0; +} + +int AtomicObjectProcessor::prepare(optional_yield y) +{ + uint64_t max_head_chunk_size; + uint64_t head_max_size; + uint64_t chunk_size = 0; + uint64_t alignment; + + int r = dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->get_max_chunk_size( + dpp, head_obj->get_bucket()->get_placement_rule(), + &max_head_chunk_size, &alignment); + if (r < 0) { + return r; + } + + bool same_pool = true; + if (head_obj->get_bucket()->get_placement_rule() != tail_placement_rule) { + if (!head_obj->placement_rules_match(head_obj->get_bucket()->get_placement_rule(), tail_placement_rule)) { + same_pool = false; + r = dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->get_max_chunk_size(dpp, tail_placement_rule, &chunk_size); + if (r < 0) { + return r; + } + head_max_size = 0; + } + } + + if (same_pool) { + RGWZonePlacementInfo placement_info; + if (!store->svc()->zone->get_zone_params().get_placement(head_obj->get_bucket()->get_placement_rule().name, &placement_info) || placement_info.inline_data) { + head_max_size = max_head_chunk_size; + } else { + head_max_size = 0; + } + chunk_size = max_head_chunk_size; + } + + uint64_t stripe_size; + const uint64_t default_stripe_size =
store->ctx()->_conf->rgw_obj_stripe_size; + + dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->get_max_aligned_size( + default_stripe_size, alignment, &stripe_size); + + manifest.set_trivial_rule(head_max_size, stripe_size); + + rgw_obj obj = head_obj->get_obj(); + + r = manifest_gen.create_begin(store->ctx(), &manifest, + head_obj->get_bucket()->get_placement_rule(), + &tail_placement_rule, + obj.bucket, obj); + if (r < 0) { + return r; + } + + rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store); + + r = writer.set_stripe_obj(stripe_obj); + if (r < 0) { + return r; + } + + set_head_chunk_size(head_max_size); + // initialize the processors + chunk = ChunkProcessor(&writer, chunk_size); + stripe = StripeProcessor(&chunk, this, head_max_size); + return 0; +} + +int AtomicObjectProcessor::complete(size_t accounted_size, + const std::string& etag, + ceph::real_time *mtime, + ceph::real_time set_mtime, + rgw::sal::Attrs& attrs, + ceph::real_time delete_at, + const char *if_match, + const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, + bool *pcanceled, optional_yield y) +{ + int r = writer.drain(); + if (r < 0) { + return r; + } + const uint64_t actual_size = get_actual_size(); + r = manifest_gen.create_next(actual_size); + if (r < 0) { + return r; + } + + head_obj->set_atomic(); + + RGWRados::Object op_target(store->getRados(), + head_obj->get_bucket(), + obj_ctx, head_obj.get()); + RGWRados::Object::Write obj_op(&op_target); + + /* some object types shouldn't be versioned, e.g., multipart parts */ + op_target.set_versioning_disabled(!head_obj->get_bucket()->versioning_enabled()); + obj_op.meta.data = &first_chunk; + obj_op.meta.manifest = &manifest; + obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */ + obj_op.meta.if_match = if_match; + obj_op.meta.if_nomatch = if_nomatch; + obj_op.meta.mtime = mtime; + obj_op.meta.set_mtime = set_mtime; + obj_op.meta.owner = owner; + obj_op.meta.flags = PUT_OBJ_CREATE; + obj_op.meta.olh_epoch = olh_epoch; + obj_op.meta.delete_at = delete_at; + obj_op.meta.user_data = user_data; + obj_op.meta.zones_trace = zones_trace; + obj_op.meta.modify_tail = true; + + r = obj_op.write_meta(dpp, actual_size, accounted_size, attrs, y); + if (r < 0) { + if (r == -ETIMEDOUT) { + // The head object write may eventually succeed, so clear the set of objects for deletion. If it + // never succeeds, we'll orphan any tail objects as if we'd crashed before that write + writer.clear_written(); + } + return r; + } + if (!obj_op.meta.canceled) { + // on success, clear the set of objects for deletion + writer.clear_written(); + } + if (pcanceled) { + *pcanceled = obj_op.meta.canceled; + } + return 0; +} + + +int MultipartObjectProcessor::process_first_chunk(bufferlist&& data, + DataProcessor **processor) +{ + // write the first chunk of the head object as part of an exclusive create, + // then drain to wait for the result in case of EEXIST + int r = writer.write_exclusive(data); + if (r == -EEXIST) { + // randomize the oid prefix and reprepare the head/manifest + std::string oid_rand = gen_rand_alphanumeric(store->ctx(), 32); + + mp.init(target_obj->get_name(), upload_id, oid_rand); + manifest.set_prefix(target_obj->get_name() + "."
+ oid_rand); + + r = prepare_head(); + if (r < 0) { + return r; + } + // resubmit the write op on the new head object + r = writer.write_exclusive(data); + } + if (r < 0) { + return r; + } + *processor = &stripe; + return 0; +} + +int MultipartObjectProcessor::prepare_head() +{ + const uint64_t default_stripe_size = store->ctx()->_conf->rgw_obj_stripe_size; + uint64_t chunk_size; + uint64_t stripe_size; + uint64_t alignment; + + int r = dynamic_cast(target_obj.get())->get_max_chunk_size(dpp, + tail_placement_rule, &chunk_size, &alignment); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: unexpected: get_max_chunk_size(): placement_rule=" << tail_placement_rule.to_str() << " obj=" << target_obj << " returned r=" << r << dendl; + return r; + } + dynamic_cast(target_obj.get())->get_max_aligned_size( + default_stripe_size, alignment, &stripe_size); + + manifest.set_multipart_part_rule(stripe_size, part_num); + + r = manifest_gen.create_begin(store->ctx(), &manifest, + head_obj->get_bucket()->get_placement_rule(), + &tail_placement_rule, + target_obj->get_bucket()->get_key(), + target_obj->get_obj()); + if (r < 0) { + return r; + } + + rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store); + dynamic_cast(head_obj.get())->raw_obj_to_obj(stripe_obj); + head_obj->set_hash_source(target_obj->get_name()); + + r = writer.set_stripe_obj(stripe_obj); + if (r < 0) { + return r; + } + stripe_size = manifest_gen.cur_stripe_max_size(); + set_head_chunk_size(stripe_size); + + chunk = ChunkProcessor(&writer, chunk_size); + stripe = StripeProcessor(&chunk, this, stripe_size); + return 0; +} + +int MultipartObjectProcessor::prepare(optional_yield y) +{ + manifest.set_prefix(target_obj->get_name() + "." + upload_id); + + return prepare_head(); +} + +int MultipartObjectProcessor::complete(size_t accounted_size, + const std::string& etag, + ceph::real_time *mtime, + ceph::real_time set_mtime, + std::map& attrs, + ceph::real_time delete_at, + const char *if_match, + const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, + bool *pcanceled, optional_yield y) +{ + int r = writer.drain(); + if (r < 0) { + return r; + } + const uint64_t actual_size = get_actual_size(); + r = manifest_gen.create_next(actual_size); + if (r < 0) { + return r; + } + + RGWRados::Object op_target(store->getRados(), + head_obj->get_bucket(), + obj_ctx, head_obj.get()); + RGWRados::Object::Write obj_op(&op_target); + + op_target.set_versioning_disabled(true); + op_target.set_meta_placement_rule(&tail_placement_rule); + obj_op.meta.set_mtime = set_mtime; + obj_op.meta.mtime = mtime; + obj_op.meta.owner = owner; + obj_op.meta.delete_at = delete_at; + obj_op.meta.zones_trace = zones_trace; + obj_op.meta.modify_tail = true; + + r = obj_op.write_meta(dpp, actual_size, accounted_size, attrs, y); + if (r < 0) + return r; + + bufferlist bl; + RGWUploadPartInfo info; + string p = "part."; + bool sorted_omap = is_v2_upload_id(upload_id); + + if (sorted_omap) { + char buf[32]; + snprintf(buf, sizeof(buf), "%08d", part_num); + p.append(buf); + } else { + p.append(part_num_str); + } + info.num = part_num; + info.etag = etag; + info.size = actual_size; + info.accounted_size = accounted_size; + info.modified = real_clock::now(); + info.manifest = manifest; + + bool compressed; + r = rgw_compression_info_from_attrset(attrs, compressed, info.cs_info); + if (r < 0) { + ldpp_dout(dpp, 1) << "cannot get compression info" << dendl; + return r; + } + + encode(info, bl); + + std::unique_ptr meta_obj = + 
head_obj->get_bucket()->get_object(rgw_obj_key(mp.get_meta(), std::string(), RGW_OBJ_NS_MULTIPART)); + meta_obj->set_in_extra_data(true); + + r = meta_obj->omap_set_val_by_key(dpp, p, bl, true, null_yield); + if (r < 0) { + return r == -ENOENT ? -ERR_NO_SUCH_UPLOAD : r; + } + + if (!obj_op.meta.canceled) { + // on success, clear the set of objects for deletion + writer.clear_written(); + } + if (pcanceled) { + *pcanceled = obj_op.meta.canceled; + } + return 0; +} + +int AppendObjectProcessor::process_first_chunk(bufferlist &&data, rgw::sal::DataProcessor **processor) +{ + int r = writer.write_exclusive(data); + if (r < 0) { + return r; + } + *processor = &stripe; + return 0; +} + +int AppendObjectProcessor::prepare(optional_yield y) +{ + RGWObjState *astate; + int r = head_obj->get_obj_state(dpp, &astate, y); + if (r < 0) { + return r; + } + cur_size = astate->size; + *cur_accounted_size = astate->accounted_size; + if (!astate->exists) { + if (position != 0) { + ldpp_dout(dpp, 5) << "ERROR: Append position should be zero" << dendl; + return -ERR_POSITION_NOT_EQUAL_TO_LENGTH; + } else { + cur_part_num = 1; + //set the prefix + char buf[33]; + gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1); + string oid_prefix = head_obj->get_name(); + oid_prefix.append("."); + oid_prefix.append(buf); + oid_prefix.append("_"); + manifest.set_prefix(oid_prefix); + } + } else { + // check whether the object is appendable + map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_APPEND_PART_NUM); + if (iter == astate->attrset.end()) { + ldpp_dout(dpp, 5) << "ERROR: The object is not appendable" << dendl; + return -ERR_OBJECT_NOT_APPENDABLE; + } + if (position != *cur_accounted_size) { + ldpp_dout(dpp, 5) << "ERROR: Append position should be equal to the obj size" << dendl; + return -ERR_POSITION_NOT_EQUAL_TO_LENGTH; + } + try { + using ceph::decode; + decode(cur_part_num, iter->second); + } catch (buffer::error& err) { + ldpp_dout(dpp, 5) << "ERROR: failed to decode part num" << dendl; + return -EIO; + } + cur_part_num++; + //get the current obj etag + iter = astate->attrset.find(RGW_ATTR_ETAG); + if (iter != astate->attrset.end()) { + string s = rgw_string_unquote(iter->second.c_str()); + size_t pos = s.find("-"); + cur_etag = s.substr(0, pos); + } + + iter = astate->attrset.find(RGW_ATTR_STORAGE_CLASS); + if (iter != astate->attrset.end()) { + tail_placement_rule.storage_class = iter->second.to_str(); + } else { + tail_placement_rule.storage_class = RGW_STORAGE_CLASS_STANDARD; + } + cur_manifest = dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->get_manifest(); + manifest.set_prefix(cur_manifest->get_prefix()); + astate->keep_tail = true; + } + manifest.set_multipart_part_rule(store->ctx()->_conf->rgw_obj_stripe_size, cur_part_num); + + rgw_obj obj = head_obj->get_obj(); + + r = manifest_gen.create_begin(store->ctx(), &manifest, head_obj->get_bucket()->get_placement_rule(), &tail_placement_rule, obj.bucket, obj); + if (r < 0) { + return r; + } + rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store); + + uint64_t chunk_size = 0; + r = store->get_raw_chunk_size(dpp, stripe_obj, &chunk_size); + if (r < 0) { + return r; + } + r = writer.set_stripe_obj(std::move(stripe_obj)); + if (r < 0) { + return r; + } + + uint64_t stripe_size = manifest_gen.cur_stripe_max_size(); + + uint64_t max_head_size = std::min(chunk_size, stripe_size); + set_head_chunk_size(max_head_size); + + // initialize the processors + chunk = ChunkProcessor(&writer, chunk_size); + stripe = StripeProcessor(&chunk, this, stripe_size); + + return 0; +} + +int
AppendObjectProcessor::complete(size_t accounted_size, const string &etag, ceph::real_time *mtime, + ceph::real_time set_mtime, rgw::sal::Attrs& attrs, + ceph::real_time delete_at, const char *if_match, const char *if_nomatch, + const string *user_data, rgw_zone_set *zones_trace, bool *pcanceled, + optional_yield y) +{ + int r = writer.drain(); + if (r < 0) + return r; + const uint64_t actual_size = get_actual_size(); + r = manifest_gen.create_next(actual_size); + if (r < 0) { + return r; + } + head_obj->set_atomic(); + RGWRados::Object op_target(store->getRados(), + head_obj->get_bucket(), + obj_ctx, head_obj.get()); + RGWRados::Object::Write obj_op(&op_target); + //For Append obj, disable versioning + op_target.set_versioning_disabled(true); + if (cur_manifest) { + cur_manifest->append(dpp, manifest, store->svc()->zone->get_zonegroup(), store->svc()->zone->get_zone_params()); + obj_op.meta.manifest = cur_manifest; + } else { + obj_op.meta.manifest = &manifest; + } + obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */ + obj_op.meta.mtime = mtime; + obj_op.meta.set_mtime = set_mtime; + obj_op.meta.owner = owner; + obj_op.meta.flags = PUT_OBJ_CREATE; + obj_op.meta.delete_at = delete_at; + obj_op.meta.user_data = user_data; + obj_op.meta.zones_trace = zones_trace; + obj_op.meta.modify_tail = true; + obj_op.meta.appendable = true; + //Add the append part number + bufferlist cur_part_num_bl; + using ceph::encode; + encode(cur_part_num, cur_part_num_bl); + attrs[RGW_ATTR_APPEND_PART_NUM] = cur_part_num_bl; + //calculate the etag + if (!cur_etag.empty()) { + MD5 hash; + // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes + hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + char petag[CEPH_CRYPTO_MD5_DIGESTSIZE]; + char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE]; + char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16]; + hex_to_buf(cur_etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE); + hash.Update((const unsigned char *)petag, sizeof(petag)); + hex_to_buf(etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE); + hash.Update((const unsigned char *)petag, sizeof(petag)); + hash.Final((unsigned char *)final_etag); + buf_to_hex((unsigned char *)final_etag, sizeof(final_etag), final_etag_str); + snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2], sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2, + "-%lld", (long long)cur_part_num); + bufferlist etag_bl; + etag_bl.append(final_etag_str, strlen(final_etag_str) + 1); + attrs[RGW_ATTR_ETAG] = etag_bl; + } + r = obj_op.write_meta(dpp, actual_size + cur_size, + accounted_size + *cur_accounted_size, + attrs, y); + if (r < 0) { + return r; + } + if (!obj_op.meta.canceled) { + // on success, clear the set of objects for deletion + writer.clear_written(); + } + if (pcanceled) { + *pcanceled = obj_op.meta.canceled; + } + *cur_accounted_size += accounted_size; + + return 0; +} + +} // namespace rgw::putobj diff --git a/src/rgw/driver/rados/rgw_putobj_processor.h b/src/rgw/driver/rados/rgw_putobj_processor.h new file mode 100644 index 000000000000..1beb9a724c05 --- /dev/null +++ b/src/rgw/driver/rados/rgw_putobj_processor.h @@ -0,0 +1,281 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat, Inc. 
+ * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include + +#include "rgw_putobj.h" +#include "services/svc_rados.h" +#include "services/svc_tier_rados.h" +#include "rgw_sal.h" +#include "rgw_obj_manifest.h" + +namespace rgw { + +namespace sal { + class RadosStore; +} + +class Aio; + +namespace putobj { + +// an object processor with special handling for the first chunk of the head. +// the virtual process_first_chunk() function returns a processor to handle the +// rest of the object +class HeadObjectProcessor : public rgw::sal::ObjectProcessor { + uint64_t head_chunk_size; + // buffer to capture the first chunk of the head object + bufferlist head_data; + // initialized after process_first_chunk() to process everything else + rgw::sal::DataProcessor *processor = nullptr; + uint64_t data_offset = 0; // maximum offset of data written (ie compressed) + protected: + uint64_t get_actual_size() const { return data_offset; } + + // process the first chunk of data and return a processor for the rest + virtual int process_first_chunk(bufferlist&& data, + rgw::sal::DataProcessor **processor) = 0; + public: + HeadObjectProcessor(uint64_t head_chunk_size) + : head_chunk_size(head_chunk_size) + {} + + void set_head_chunk_size(uint64_t size) { head_chunk_size = size; } + + // cache first chunk for process_first_chunk(), then forward everything else + // to the returned processor + int process(bufferlist&& data, uint64_t logical_offset) final override; +}; + +using RawObjSet = std::set; + +// a data sink that writes to rados objects and deletes them on cancelation +class RadosWriter : public rgw::sal::DataProcessor { + Aio *const aio; + rgw::sal::RadosStore *const store; + RGWObjectCtx& obj_ctx; + std::unique_ptr head_obj; + RGWSI_RADOS::Obj stripe_obj; // current stripe object + RawObjSet written; // set of written objects for deletion + const DoutPrefixProvider *dpp; + optional_yield y; + + public: + RadosWriter(Aio *aio, rgw::sal::RadosStore *store, + RGWObjectCtx& obj_ctx, std::unique_ptr _head_obj, + const DoutPrefixProvider *dpp, optional_yield y) + : aio(aio), store(store), + obj_ctx(obj_ctx), head_obj(std::move(_head_obj)), dpp(dpp), y(y) + {} + RadosWriter(RadosWriter&& r) + : aio(r.aio), store(r.store), + obj_ctx(r.obj_ctx), head_obj(std::move(r.head_obj)), dpp(r.dpp), y(r.y) + {} + + ~RadosWriter(); + + // add alloc hint to osd + void add_write_hint(librados::ObjectWriteOperation& op); + + // change the current stripe object + int set_stripe_obj(const rgw_raw_obj& obj); + + // write the data at the given offset of the current stripe object + int process(bufferlist&& data, uint64_t stripe_offset) override; + + // write the data as an exclusive create and wait for it to complete + int write_exclusive(const bufferlist& data); + + int drain(); + + // when the operation completes successfully, clear the set of written objects + // so they aren't deleted on destruction + void clear_written() { written.clear(); } + +}; + + +// a rados object processor that stripes according to RGWObjManifest +class ManifestObjectProcessor : public HeadObjectProcessor, + public StripeGenerator { + protected: + rgw::sal::RadosStore* const store; + rgw_placement_rule tail_placement_rule; + rgw_user owner; + RGWObjectCtx& obj_ctx; + std::unique_ptr head_obj; + + RadosWriter writer; + RGWObjManifest manifest; + 
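+ // extends 'manifest' in place as successive stripe objects are created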
RGWObjManifest::generator manifest_gen; + ChunkProcessor chunk; + StripeProcessor stripe; + const DoutPrefixProvider *dpp; + + // implements StripeGenerator + int next(uint64_t offset, uint64_t *stripe_size) override; + + public: + ManifestObjectProcessor(Aio *aio, rgw::sal::RadosStore* store, + const rgw_placement_rule *ptail_placement_rule, + const rgw_user& owner, RGWObjectCtx& _obj_ctx, + std::unique_ptr _head_obj, + const DoutPrefixProvider* dpp, optional_yield y) + : HeadObjectProcessor(0), + store(store), + owner(owner), + obj_ctx(_obj_ctx), head_obj(std::move(_head_obj)), + writer(aio, store, obj_ctx, head_obj->clone(), dpp, y), + chunk(&writer, 0), stripe(&chunk, this, 0), dpp(dpp) { + if (ptail_placement_rule) { + tail_placement_rule = *ptail_placement_rule; + } + } + + void set_owner(const rgw_user& _owner) { + owner = _owner; + } + + void set_tail_placement(const rgw_placement_rule& tpr) { + tail_placement_rule = tpr; + } + void set_tail_placement(const rgw_placement_rule&& tpr) { + tail_placement_rule = tpr; + } + +}; + + +// a processor that completes with an atomic write to the head object as part of +// a bucket index transaction +class AtomicObjectProcessor : public ManifestObjectProcessor { + const std::optional olh_epoch; + const std::string unique_tag; + bufferlist first_chunk; // written with the head in complete() + + int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override; + public: + AtomicObjectProcessor(Aio *aio, rgw::sal::RadosStore* store, + const rgw_placement_rule *ptail_placement_rule, + const rgw_user& owner, + RGWObjectCtx& obj_ctx, + std::unique_ptr _head_obj, + std::optional olh_epoch, + const std::string& unique_tag, + const DoutPrefixProvider *dpp, optional_yield y) + : ManifestObjectProcessor(aio, store, ptail_placement_rule, + owner, obj_ctx, std::move(_head_obj), dpp, y), + olh_epoch(olh_epoch), unique_tag(unique_tag) + {} + + // prepare a trivial manifest + int prepare(optional_yield y) override; + // write the head object atomically in a bucket index transaction + int complete(size_t accounted_size, const std::string& etag, + ceph::real_time *mtime, ceph::real_time set_mtime, + std::map& attrs, + ceph::real_time delete_at, + const char *if_match, const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, bool *canceled, + optional_yield y) override; + +}; + + +// a processor for multipart parts, which don't require atomic completion. the +// part's head is written with an exclusive create to detect racing uploads of +// the same part/upload id, which are restarted with a random oid prefix +class MultipartObjectProcessor : public ManifestObjectProcessor { + std::unique_ptr target_obj; // target multipart object + const std::string upload_id; + const int part_num; + const std::string part_num_str; + RGWMPObj mp; + + // write the first chunk and wait on aio->drain() for its completion. 
+ // on EEXIST, retry with random prefix + int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override; + // prepare the head stripe and manifest + int prepare_head(); + public: + MultipartObjectProcessor(Aio *aio, rgw::sal::RadosStore* store, + const rgw_placement_rule *ptail_placement_rule, + const rgw_user& owner, RGWObjectCtx& obj_ctx, + std::unique_ptr _head_obj, + const std::string& upload_id, uint64_t part_num, + const std::string& part_num_str, + const DoutPrefixProvider *dpp, optional_yield y) + : ManifestObjectProcessor(aio, store, ptail_placement_rule, + owner, obj_ctx, std::move(_head_obj), dpp, y), + target_obj(head_obj->clone()), upload_id(upload_id), + part_num(part_num), part_num_str(part_num_str), + mp(head_obj->get_name(), upload_id) + {} + + // prepare a multipart manifest + int prepare(optional_yield y) override; + // write the head object attributes in a bucket index transaction, then + // register the completed part with the multipart meta object + int complete(size_t accounted_size, const std::string& etag, + ceph::real_time *mtime, ceph::real_time set_mtime, + std::map& attrs, + ceph::real_time delete_at, + const char *if_match, const char *if_nomatch, + const std::string *user_data, + rgw_zone_set *zones_trace, bool *canceled, + optional_yield y) override; + +}; + + class AppendObjectProcessor : public ManifestObjectProcessor { + uint64_t cur_part_num; + uint64_t position; + uint64_t cur_size; + uint64_t *cur_accounted_size; + std::string cur_etag; + const std::string unique_tag; + + RGWObjManifest *cur_manifest; + + int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override; + + public: + AppendObjectProcessor(Aio *aio, rgw::sal::RadosStore* store, + const rgw_placement_rule *ptail_placement_rule, + const rgw_user& owner, RGWObjectCtx& obj_ctx, + std::unique_ptr _head_obj, + const std::string& unique_tag, uint64_t position, + uint64_t *cur_accounted_size, + const DoutPrefixProvider *dpp, optional_yield y) + : ManifestObjectProcessor(aio, store, ptail_placement_rule, + owner, obj_ctx, std::move(_head_obj), dpp, y), + position(position), cur_size(0), cur_accounted_size(cur_accounted_size), + unique_tag(unique_tag), cur_manifest(nullptr) + {} + int prepare(optional_yield y) override; + int complete(size_t accounted_size, const std::string& etag, + ceph::real_time *mtime, ceph::real_time set_mtime, + std::map& attrs, ceph::real_time delete_at, + const char *if_match, const char *if_nomatch, const std::string *user_data, + rgw_zone_set *zones_trace, bool *canceled, + optional_yield y) override; + }; + +} // namespace putobj +} // namespace rgw + diff --git a/src/rgw/driver/rados/rgw_rados.cc b/src/rgw/driver/rados/rgw_rados.cc new file mode 100644 index 000000000000..6779e519c466 --- /dev/null +++ b/src/rgw/driver/rados/rgw_rados.cc @@ -0,0 +1,9715 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "include/compat.h" +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include "common/ceph_json.h" + +#include "common/errno.h" +#include "common/Formatter.h" +#include "common/Throttle.h" +#include "common/BackTrace.h" + +#include "rgw_sal.h" +#include "rgw_zone.h" +#include "rgw_cache.h" +#include "rgw_acl.h" +#include "rgw_acl_s3.h" /* for dumping s3policy in debug log */ +#include "rgw_aio_throttle.h" +#include "driver/rados/rgw_bucket.h" +#include "rgw_rest_conn.h" +#include 
"rgw_cr_rados.h" +#include "rgw_cr_rest.h" +#include "rgw_datalog.h" +#include "rgw_putobj_processor.h" + +#include "cls/rgw/cls_rgw_ops.h" +#include "cls/rgw/cls_rgw_client.h" +#include "cls/rgw/cls_rgw_const.h" +#include "cls/refcount/cls_refcount_client.h" +#include "cls/version/cls_version_client.h" +#include "osd/osd_types.h" + +#include "rgw_tools.h" +#include "rgw_coroutine.h" +#include "rgw_compression.h" +#include "rgw_etag_verifier.h" +#include "rgw_worker.h" +#include "rgw_notify.h" +#include "rgw_http_errors.h" + +#undef fork // fails to compile RGWPeriod::fork() below + +#include "common/Clock.h" + +#include +#include +#include +#include +#include +#include +#include "include/random.h" + +#include "rgw_gc.h" +#include "rgw_lc.h" + +#include "rgw_object_expirer_core.h" +#include "rgw_sync.h" +#include "rgw_sync_counters.h" +#include "rgw_sync_trace.h" +#include "rgw_trim_datalog.h" +#include "rgw_trim_mdlog.h" +#include "rgw_data_sync.h" +#include "rgw_realm_watcher.h" +#include "rgw_reshard.h" +#include "rgw_cr_rados.h" + +#include "services/svc_zone.h" +#include "services/svc_zone_utils.h" +#include "services/svc_quota.h" +#include "services/svc_sync_modules.h" +#include "services/svc_sys_obj.h" +#include "services/svc_sys_obj_cache.h" +#include "services/svc_bucket.h" +#include "services/svc_mdlog.h" + +#include "compressor/Compressor.h" + +#include "rgw_d3n_datacache.h" + +#ifdef WITH_LTTNG +#define TRACEPOINT_DEFINE +#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#include "tracing/rgw_rados.h" +#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#undef TRACEPOINT_DEFINE +#else +#define tracepoint(...) +#endif + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_rgw + +using namespace std; +using namespace librados; + +#define ldout_bitx(_bitx, _dpp, _level) if(_bitx) { ldpp_dout(_dpp, 0) << "BITX: " +#define ldout_bitx_c(_bitx, _ctx, _level) if(_bitx) { ldout(_ctx, 0) << "BITX: " +#define dendl_bitx dendl ; } + +static string shadow_ns = "shadow"; +static string default_bucket_index_pool_suffix = "rgw.buckets.index"; +static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec"; + +static RGWObjCategory main_category = RGWObjCategory::Main; +#define RGW_USAGE_OBJ_PREFIX "usage." 
+ +rgw_raw_obj rgw_obj_select::get_raw_obj(rgw::sal::RadosStore* driver) const +{ + if (!is_raw) { + rgw_raw_obj r; + driver->get_raw_obj(placement_rule, obj, &r); + return r; + } + return raw_obj; +} + +void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation* op) +{ + obj_version* check_objv = version_for_check(); + + if (check_objv) { + cls_version_check(*op, *check_objv, VER_COND_EQ); + } + + cls_version_read(*op, &read_version); +} + +void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op) +{ + obj_version* check_objv = version_for_check(); + obj_version* modify_version = version_for_write(); + + if (check_objv) { + cls_version_check(*op, *check_objv, VER_COND_EQ); + } + + if (modify_version) { + cls_version_set(*op, *modify_version); + } else { + cls_version_inc(*op); + } +} + +void RGWObjVersionTracker::apply_write() +{ + const bool checked = (read_version.ver != 0); + const bool incremented = (write_version.ver == 0); + + if (checked && incremented) { + // apply cls_version_inc() so our next operation can recheck it + ++read_version.ver; + } else { + read_version = write_version; + } + write_version = obj_version(); +} + +RGWObjStateManifest *RGWObjectCtx::get_state(const rgw_obj& obj) { + RGWObjStateManifest *result; + typename std::map::iterator iter; + lock.lock_shared(); + assert (!obj.empty()); + iter = objs_state.find(obj); + if (iter != objs_state.end()) { + result = &iter->second; + lock.unlock_shared(); + } else { + lock.unlock_shared(); + lock.lock(); + result = &objs_state[obj]; + lock.unlock(); + } + return result; +} + +void RGWObjectCtx::set_compressed(const rgw_obj& obj) { + std::unique_lock wl{lock}; + assert (!obj.empty()); + objs_state[obj].state.compressed = true; +} + +void RGWObjectCtx::set_atomic(rgw_obj& obj) { + std::unique_lock wl{lock}; + assert (!obj.empty()); + objs_state[obj].state.is_atomic = true; +} +void RGWObjectCtx::set_prefetch_data(const rgw_obj& obj) { + std::unique_lock wl{lock}; + assert (!obj.empty()); + objs_state[obj].state.prefetch_data = true; +} + +void RGWObjectCtx::invalidate(const rgw_obj& obj) { + std::unique_lock wl{lock}; + auto iter = objs_state.find(obj); + if (iter == objs_state.end()) { + return; + } + bool is_atomic = iter->second.state.is_atomic; + bool prefetch_data = iter->second.state.prefetch_data; + bool compressed = iter->second.state.compressed; + + objs_state.erase(iter); + + if (is_atomic || prefetch_data) { + auto& sm = objs_state[obj]; + sm.state.is_atomic = is_atomic; + sm.state.prefetch_data = prefetch_data; + sm.state.compressed = compressed; + } +} + +class RGWMetaNotifierManager : public RGWCoroutinesManager { + RGWRados* store; + RGWHTTPManager http_manager; + +public: + RGWMetaNotifierManager(RGWRados *_driver) : RGWCoroutinesManager(_driver->ctx(), _driver->get_cr_registry()), store(_driver), + http_manager(store->ctx(), completion_mgr) { + http_manager.start(); + } + + int notify_all(const DoutPrefixProvider *dpp, map& conn_map, set& shards) { + rgw_http_param_pair pairs[] = { { "type", "metadata" }, + { "notify", NULL }, + { NULL, NULL } }; + + list stacks; + for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) { + RGWRESTConn *conn = iter->second; + RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this); + stack->call(new RGWPostRESTResourceCR, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL)); + + stacks.push_back(stack); + } + return run(dpp, stacks); + } +}; + +class RGWDataNotifierManager : public 
RGWCoroutinesManager { + RGWRados* store; + RGWHTTPManager http_manager; + +public: + RGWDataNotifierManager(RGWRados *_driver) : RGWCoroutinesManager(_driver->ctx(), _driver->get_cr_registry()), store(_driver), + http_manager(store->ctx(), completion_mgr) { + http_manager.start(); + } + + int notify_all(const DoutPrefixProvider *dpp, map& conn_map, + bc::flat_map >& shards) { + + list stacks; + const char *source_zone = store->svc.zone->get_zone_params().get_id().c_str(); + for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) { + RGWRESTConn *conn = iter->second; + RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this); + stack->call(new RGWDataPostNotifyCR(store, http_manager, shards, source_zone, conn)); + stacks.push_back(stack); + } + + return run(dpp, stacks); + } +}; + +/* class RGWRadosThread */ + +void RGWRadosThread::start() +{ + worker = new Worker(cct, this); + worker->create(thread_name.c_str()); +} + +void RGWRadosThread::stop() +{ + down_flag = true; + stop_process(); + if (worker) { + worker->signal(); + worker->join(); + } + delete worker; + worker = NULL; +} + +void *RGWRadosThread::Worker::entry() { + uint64_t msec = processor->interval_msec(); + auto interval = std::chrono::milliseconds(msec); + + do { + auto start = ceph::real_clock::now(); + int r = processor->process(this); + if (r < 0) { + ldpp_dout(this, 0) << "ERROR: processor->process() returned error r=" << r << dendl; + } + + if (processor->going_down()) + break; + + auto end = ceph::real_clock::now() - start; + + uint64_t cur_msec = processor->interval_msec(); + if (cur_msec != msec) { /* was it reconfigured? */ + msec = cur_msec; + interval = std::chrono::milliseconds(msec); + } + + if (cur_msec > 0) { + if (interval <= end) + continue; // next round + + auto wait_time = interval - end; + wait_interval(wait_time); + } else { + wait(); + } + } while (!processor->going_down()); + + return NULL; +} + +class RGWMetaNotifier : public RGWRadosThread { + RGWMetaNotifierManager notify_mgr; + RGWMetadataLog *const log; + + uint64_t interval_msec() override { + return cct->_conf->rgw_md_notify_interval_msec; + } + void stop_process() override { + notify_mgr.stop(); + } +public: + RGWMetaNotifier(RGWRados *_driver, RGWMetadataLog* log) + : RGWRadosThread(_driver, "meta-notifier"), notify_mgr(_driver), log(log) {} + + int process(const DoutPrefixProvider *dpp) override; +}; + +int RGWMetaNotifier::process(const DoutPrefixProvider *dpp) +{ + set shards; + + log->read_clear_modified(shards); + + if (shards.empty()) { + return 0; + } + + for (set::iterator iter = shards.begin(); iter != shards.end(); ++iter) { + ldpp_dout(dpp, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl; + } + + notify_mgr.notify_all(dpp, store->svc.zone->get_zone_conn_map(), shards); + + return 0; +} + +class RGWDataNotifier : public RGWRadosThread { + RGWDataNotifierManager notify_mgr; + bc::flat_set entry; + + uint64_t interval_msec() override { + return cct->_conf.get_val("rgw_data_notify_interval_msec"); + } + void stop_process() override { + notify_mgr.stop(); + } +public: + RGWDataNotifier(RGWRados *_driver) : RGWRadosThread(_driver, "data-notifier"), notify_mgr(_driver) {} + + int process(const DoutPrefixProvider *dpp) override; +}; + +int RGWDataNotifier::process(const DoutPrefixProvider *dpp) +{ + auto data_log = store->svc.datalog_rados; + if (!data_log) { + return 0; + } + + auto shards = data_log->read_clear_modified(); + + if (shards.empty()) { + return 0; + } + + for (const auto& 
[shard_id, entries] : shards) { + bc::flat_set::iterator it; + for (const auto& entry : entries) { + ldpp_dout(dpp, 20) << __func__ << "(): notifying datalog change, shard_id=" + << shard_id << ":" << entry.gen << ":" << entry.key << dendl; + } + } + + notify_mgr.notify_all(dpp, store->svc.zone->get_zone_data_notify_to_map(), shards); + + return 0; +} + +class RGWSyncProcessorThread : public RGWRadosThread { +public: + RGWSyncProcessorThread(RGWRados *_driver, const string& thread_name = "radosgw") : RGWRadosThread(_driver, thread_name) {} + RGWSyncProcessorThread(RGWRados *_driver) : RGWRadosThread(_driver) {} + ~RGWSyncProcessorThread() override {} + int init(const DoutPrefixProvider *dpp) override = 0 ; + int process(const DoutPrefixProvider *dpp) override = 0; +}; + +class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread +{ + RGWMetaSyncStatusManager sync; + + uint64_t interval_msec() override { + return 0; /* no interval associated, it'll run once until stopped */ + } + void stop_process() override { + sync.stop(); + } +public: + RGWMetaSyncProcessorThread(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados) + : RGWSyncProcessorThread(_driver->getRados(), "meta-sync"), sync(_driver, async_rados) {} + + void wakeup_sync_shards(set& shard_ids) { + for (set::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) { + sync.wakeup(*iter); + } + } + RGWMetaSyncStatusManager* get_manager() { return &sync; } + + int init(const DoutPrefixProvider *dpp) override { + int ret = sync.init(dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: sync.init() returned " << ret << dendl; + return ret; + } + return 0; + } + + int process(const DoutPrefixProvider *dpp) override { + sync.run(dpp, null_yield); + return 0; + } +}; + +class RGWDataSyncProcessorThread : public RGWSyncProcessorThread +{ + PerfCountersRef counters; + RGWDataSyncStatusManager sync; + bool initialized; + + uint64_t interval_msec() override { + if (initialized) { + return 0; /* no interval associated, it'll run once until stopped */ + } else { +#define DATA_SYNC_INIT_WAIT_SEC 20 + return DATA_SYNC_INIT_WAIT_SEC * 1000; + } + } + void stop_process() override { + sync.stop(); + } +public: + RGWDataSyncProcessorThread(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados, + const RGWZone* source_zone) + : RGWSyncProcessorThread(_driver->getRados(), "data-sync"), + counters(sync_counters::build(store->ctx(), std::string("data-sync-from-") + source_zone->name)), + sync(_driver, async_rados, source_zone->id, counters.get()), + initialized(false) {} + + void wakeup_sync_shards(bc::flat_map >& entries) { + for (bc::flat_map >::iterator iter = entries.begin(); iter != entries.end(); ++iter) { + sync.wakeup(iter->first, iter->second); + } + } + + RGWDataSyncStatusManager* get_manager() { return &sync; } + + int init(const DoutPrefixProvider *dpp) override { + return 0; + } + + int process(const DoutPrefixProvider *dpp) override { + while (!initialized) { + if (going_down()) { + return 0; + } + int ret = sync.init(dpp); + if (ret >= 0) { + initialized = true; + break; + } + /* we'll be back! 
*/ + return 0; + } + sync.run(dpp); + return 0; + } +}; + +class RGWSyncLogTrimThread : public RGWSyncProcessorThread, DoutPrefixProvider +{ + RGWCoroutinesManager crs; + rgw::sal::RadosStore* store; + rgw::BucketTrimManager *bucket_trim; + RGWHTTPManager http; + const utime_t trim_interval; + + uint64_t interval_msec() override { return 0; } + void stop_process() override { crs.stop(); } +public: + RGWSyncLogTrimThread(rgw::sal::RadosStore* store, rgw::BucketTrimManager *bucket_trim, + int interval) + : RGWSyncProcessorThread(store->getRados(), "sync-log-trim"), + crs(store->ctx(), store->getRados()->get_cr_registry()), store(store), + bucket_trim(bucket_trim), + http(store->ctx(), crs.get_completion_mgr()), + trim_interval(interval, 0) + {} + + int init(const DoutPrefixProvider *dpp) override { + return http.start(); + } + int process(const DoutPrefixProvider *dpp) override { + list stacks; + auto metatrimcr = create_meta_log_trim_cr(this, static_cast(store), &http, + cct->_conf->rgw_md_log_max_shards, + trim_interval); + if (!metatrimcr) { + ldpp_dout(dpp, -1) << "Bailing out of trim thread!" << dendl; + return -EINVAL; + } + auto meta = new RGWCoroutinesStack(store->ctx(), &crs); + meta->call(metatrimcr); + + stacks.push_back(meta); + + if (store->svc()->zone->sync_module_exports_data()) { + auto data = new RGWCoroutinesStack(store->ctx(), &crs); + data->call(create_data_log_trim_cr(dpp, static_cast(store), &http, + cct->_conf->rgw_data_log_num_shards, + trim_interval)); + stacks.push_back(data); + + auto bucket = new RGWCoroutinesStack(store->ctx(), &crs); + bucket->call(bucket_trim->create_bucket_trim_cr(&http)); + stacks.push_back(bucket); + } + + crs.run(dpp, stacks); + return 0; + } + + // implements DoutPrefixProvider + CephContext *get_cct() const override { return store->ctx(); } + unsigned get_subsys() const override + { + return dout_subsys; + } + + std::ostream& gen_prefix(std::ostream& out) const override + { + return out << "sync log trim: "; + } + +}; + +void RGWRados::wakeup_meta_sync_shards(set& shard_ids) +{ + std::lock_guard l{meta_sync_thread_lock}; + if (meta_sync_processor_thread) { + meta_sync_processor_thread->wakeup_sync_shards(shard_ids); + } +} + +void RGWRados::wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, bc::flat_map >& entries) +{ + ldpp_dout(dpp, 20) << __func__ << ": source_zone=" << source_zone << ", entries=" << entries << dendl; + for (bc::flat_map >::iterator iter = entries.begin(); iter != entries.end(); ++iter) { + ldpp_dout(dpp, 20) << __func__ << "(): updated shard=" << iter->first << dendl; + bc::flat_set& entries = iter->second; + for (const auto& [key, gen] : entries) { + ldpp_dout(dpp, 20) << __func__ << ": source_zone=" << source_zone << ", key=" << key + << ", gen=" << gen << dendl; + } + } + + std::lock_guard l{data_sync_thread_lock}; + auto iter = data_sync_processor_threads.find(source_zone); + if (iter == data_sync_processor_threads.end()) { + ldpp_dout(dpp, 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl; + return; + } + + RGWDataSyncProcessorThread *thread = iter->second; + ceph_assert(thread); + thread->wakeup_sync_shards(entries); +} + +RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager() +{ + std::lock_guard l{meta_sync_thread_lock}; + if (meta_sync_processor_thread) { + return meta_sync_processor_thread->get_manager(); + } + return nullptr; +} + +RGWDataSyncStatusManager* 
RGWRados::get_data_sync_manager(const rgw_zone_id& source_zone) +{ + std::lock_guard l{data_sync_thread_lock}; + auto thread = data_sync_processor_threads.find(source_zone); + if (thread == data_sync_processor_threads.end()) { + return nullptr; + } + return thread->second->get_manager(); +} + +int RGWRados::get_required_alignment(const DoutPrefixProvider *dpp, const rgw_pool& pool, uint64_t *alignment) +{ + IoCtx ioctx; + int r = open_pool_ctx(dpp, pool, ioctx, false); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: open_pool_ctx() returned " << r << dendl; + return r; + } + + bool req; + r = ioctx.pool_requires_alignment2(&req); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: ioctx.pool_requires_alignment2() returned " + << r << dendl; + return r; + } + + if (!req) { + *alignment = 0; + return 0; + } + + uint64_t align; + r = ioctx.pool_required_alignment2(&align); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: ioctx.pool_required_alignment2() returned " + << r << dendl; + return r; + } + if (align != 0) { + ldpp_dout(dpp, 20) << "required alignment=" << align << dendl; + } + *alignment = align; + return 0; +} + +void RGWRados::get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size) +{ + if (alignment == 0) { + *max_size = size; + return; + } + + if (size <= alignment) { + *max_size = alignment; + return; + } + + *max_size = size - (size % alignment); +} + +int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment) +{ + uint64_t alignment; + int r = get_required_alignment(dpp, pool, &alignment); + if (r < 0) { + return r; + } + + if (palignment) { + *palignment = alignment; + } + + uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size; + + get_max_aligned_size(config_chunk_size, alignment, max_chunk_size); + + ldpp_dout(dpp, 20) << "max_chunk_size=" << *max_chunk_size << dendl; + + return 0; +} + +int RGWRados::get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj, + uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment) +{ + rgw_pool pool; + if (!get_obj_data_pool(placement_rule, obj, &pool)) { + ldpp_dout(dpp, 0) << "ERROR: failed to get data pool for object " << obj << dendl; + return -EIO; + } + return get_max_chunk_size(pool, max_chunk_size, dpp, palignment); +} + +void add_datalog_entry(const DoutPrefixProvider* dpp, + RGWDataChangesLog* datalog, + const RGWBucketInfo& bucket_info, + uint32_t shard_id) +{ + const auto& logs = bucket_info.layout.logs; + if (logs.empty()) { + return; + } + int r = datalog->add_entry(dpp, bucket_info, logs.back(), shard_id); + if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed writing data log" << dendl; + } // datalog error is not fatal +} + +class RGWIndexCompletionManager; + +struct complete_op_data { + ceph::mutex lock = ceph::make_mutex("complete_op_data"); + AioCompletion *rados_completion{nullptr}; + int manager_shard_id{-1}; + RGWIndexCompletionManager *manager{nullptr}; + rgw_obj obj; + RGWModifyOp op; + string tag; + rgw_bucket_entry_ver ver; + cls_rgw_obj_key key; + rgw_bucket_dir_entry_meta dir_meta; + list remove_objs; + bool log_op; + uint16_t bilog_op; + rgw_zone_set zones_trace; + + bool stopped{false}; + + void stop() { + std::lock_guard l{lock}; + stopped = true; + } +}; + +class RGWIndexCompletionManager { + RGWRados* const store; + const uint32_t num_shards; + ceph::containers::tiny_vector locks; + std::vector> completions; + std::vector retry_completions; + + std::condition_variable 
cond; + std::mutex retry_completions_lock; + bool _stop{false}; + std::thread retry_thread; + + // used to distribute the completions and the locks they use across + // their respective vectors; it will get incremented and can wrap + // around back to 0 without issue + std::atomic cur_shard {0}; + + void process(); + + void add_completion(complete_op_data *completion); + + void stop() { + if (retry_thread.joinable()) { + _stop = true; + cond.notify_all(); + retry_thread.join(); + } + + for (uint32_t i = 0; i < num_shards; ++i) { + std::lock_guard l{locks[i]}; + for (auto c : completions[i]) { + c->stop(); + } + } + completions.clear(); + } + + uint32_t next_shard() { + return cur_shard++ % num_shards; + } + +public: + RGWIndexCompletionManager(RGWRados *_driver) : + store(_driver), + num_shards(store->ctx()->_conf->rgw_thread_pool_size), + locks{ceph::make_lock_container( + num_shards, + [](const size_t i) { + return ceph::make_mutex("RGWIndexCompletionManager::lock::" + + std::to_string(i)); + })}, + completions(num_shards), + retry_thread(&RGWIndexCompletionManager::process, this) + {} + + ~RGWIndexCompletionManager() { + stop(); + } + + void create_completion(const rgw_obj& obj, + RGWModifyOp op, string& tag, + rgw_bucket_entry_ver& ver, + const cls_rgw_obj_key& key, + rgw_bucket_dir_entry_meta& dir_meta, + list *remove_objs, bool log_op, + uint16_t bilog_op, + rgw_zone_set *zones_trace, + complete_op_data **result); + + bool handle_completion(completion_t cb, complete_op_data *arg); + + CephContext* ctx() { + return store->ctx(); + } +}; + +static void obj_complete_cb(completion_t cb, void *arg) +{ + complete_op_data *completion = reinterpret_cast(arg); + completion->lock.lock(); + if (completion->stopped) { + completion->lock.unlock(); /* can drop lock, no one else is referencing us */ + delete completion; + return; + } + bool need_delete = completion->manager->handle_completion(cb, completion); + completion->lock.unlock(); + if (need_delete) { + delete completion; + } +} + +void RGWIndexCompletionManager::process() +{ + DoutPrefix dpp(store->ctx(), dout_subsys, "rgw index completion thread: "); + while(!_stop) { + std::vector comps; + + { + std::unique_lock l{retry_completions_lock}; + cond.wait(l, [this](){return _stop || !retry_completions.empty();}); + if (_stop) { + return; + } + retry_completions.swap(comps); + } + + for (auto c : comps) { + std::unique_ptr up{c}; + + ldpp_dout(&dpp, 20) << __func__ << "(): handling completion for key=" << c->key << dendl; + + RGWRados::BucketShard bs(store); + RGWBucketInfo bucket_info; + + int r = bs.init(c->obj.bucket, c->obj, &bucket_info, &dpp); + if (r < 0) { + ldpp_dout(&dpp, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl; + /* not much to do */ + continue; + } + + r = store->guard_reshard(&dpp, &bs, c->obj, bucket_info, + [&](RGWRados::BucketShard *bs) -> int { + const bool bitx = ctx()->_conf->rgw_bucket_index_transaction_instrumentation; + ldout_bitx(bitx, &dpp, 10) << + "ENTERING " << __func__ << ": bucket-shard=" << bs << + " obj=" << c->obj << " tag=" << c->tag << + " op=" << c->op << ", remove_objs=" << c->remove_objs << dendl_bitx; + ldout_bitx(bitx, &dpp, 25) << + "BACKTRACE: " << __func__ << ": " << ClibBackTrace(1) << dendl_bitx; + + librados::ObjectWriteOperation o; + o.assert_exists(); + cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING); + cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs, + c->log_op, c->bilog_op, 
&c->zones_trace); + int ret = bs->bucket_obj.operate(&dpp, &o, null_yield); + ldout_bitx(bitx, &dpp, 10) << + "EXITING " << __func__ << ": ret=" << dendl_bitx; + return ret; + }); + if (r < 0) { + ldpp_dout(&dpp, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl; + /* ignoring error, can't do anything about it */ + continue; + } + + add_datalog_entry(&dpp, store->svc.datalog_rados, bucket_info, bs.shard_id); + } + } +} + +void RGWIndexCompletionManager::create_completion(const rgw_obj& obj, + RGWModifyOp op, string& tag, + rgw_bucket_entry_ver& ver, + const cls_rgw_obj_key& key, + rgw_bucket_dir_entry_meta& dir_meta, + list *remove_objs, bool log_op, + uint16_t bilog_op, + rgw_zone_set *zones_trace, + complete_op_data **result) +{ + complete_op_data *entry = new complete_op_data; + + int shard_id = next_shard(); + + entry->manager_shard_id = shard_id; + entry->manager = this; + entry->obj = obj; + entry->op = op; + entry->tag = tag; + entry->ver = ver; + entry->key = key; + entry->dir_meta = dir_meta; + entry->log_op = log_op; + entry->bilog_op = bilog_op; + + if (remove_objs) { + for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) { + entry->remove_objs.push_back(*iter); + } + } + + if (zones_trace) { + entry->zones_trace = *zones_trace; + } else { + entry->zones_trace.insert(store->svc.zone->get_zone().id, obj.bucket.get_key()); + } + + *result = entry; + + entry->rados_completion = librados::Rados::aio_create_completion(entry, obj_complete_cb); + + std::lock_guard l{locks[shard_id]}; + const auto ok = completions[shard_id].insert(entry).second; + ceph_assert(ok); +} + +void RGWIndexCompletionManager::add_completion(complete_op_data *completion) { + { + std::lock_guard l{retry_completions_lock}; + retry_completions.push_back(completion); + } + cond.notify_all(); +} + +bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg) +{ + int shard_id = arg->manager_shard_id; + { + std::lock_guard l{locks[shard_id]}; + + auto& comps = completions[shard_id]; + + auto iter = comps.find(arg); + if (iter == comps.end()) { + ldout(arg->manager->ctx(), 0) << __func__ << "(): cannot find completion for obj=" << arg->key << dendl; + return true; + } + + comps.erase(iter); + } + + int r = rados_aio_get_return_value(cb); + if (r != -ERR_BUSY_RESHARDING) { + ldout(arg->manager->ctx(), 20) << __func__ << "(): completion " << + (r == 0 ? 
"ok" : "failed with " + to_string(r)) << + " for obj=" << arg->key << dendl; + return true; + } + add_completion(arg); + ldout(arg->manager->ctx(), 20) << __func__ << "(): async completion added for obj=" << arg->key << dendl; + return false; +} + +void RGWRados::finalize() +{ + /* Before joining any sync threads, drain outstanding requests & + * mark the async_processor as going_down() */ + if (svc.rados) { + svc.rados->stop_processor(); + } + + if (run_sync_thread) { + std::lock_guard l{meta_sync_thread_lock}; + meta_sync_processor_thread->stop(); + + std::lock_guard dl{data_sync_thread_lock}; + for (auto iter : data_sync_processor_threads) { + RGWDataSyncProcessorThread *thread = iter.second; + thread->stop(); + } + if (sync_log_trimmer) { + sync_log_trimmer->stop(); + } + } + if (run_sync_thread) { + delete meta_sync_processor_thread; + meta_sync_processor_thread = NULL; + std::lock_guard dl{data_sync_thread_lock}; + for (auto iter : data_sync_processor_threads) { + RGWDataSyncProcessorThread *thread = iter.second; + delete thread; + } + data_sync_processor_threads.clear(); + delete sync_log_trimmer; + sync_log_trimmer = nullptr; + bucket_trim = boost::none; + } + if (meta_notifier) { + meta_notifier->stop(); + delete meta_notifier; + } + if (data_notifier) { + data_notifier->stop(); + delete data_notifier; + } + delete sync_tracer; + + delete lc; + lc = NULL; + + delete gc; + gc = NULL; + + delete obj_expirer; + obj_expirer = NULL; + + RGWQuotaHandler::free_handler(quota_handler); + if (cr_registry) { + cr_registry->put(); + } + + svc.shutdown(); + + delete binfo_cache; + delete obj_tombstone_cache; + if (d3n_data_cache) + delete d3n_data_cache; + + if (reshard_wait.get()) { + reshard_wait->stop(); + reshard_wait.reset(); + } + + if (run_reshard_thread) { + reshard->stop_processor(); + } + delete reshard; + delete index_completion_manager; + + rgw::notify::shutdown(); +} + +/** + * Initialize the RADOS instance and prepare to do other ops + * Returns 0 on success, -ERR# on failure. 
+ */ +int RGWRados::init_rados() +{ + int ret = 0; + + ret = rados.init_with_context(cct); + if (ret < 0) { + return ret; + } + ret = rados.connect(); + if (ret < 0) { + return ret; + } + + auto crs = std::unique_ptr{ + new RGWCoroutinesManagerRegistry(cct)}; + ret = crs->hook_to_admin_command("cr dump"); + if (ret < 0) { + return ret; + } + + cr_registry = crs.release(); + + if (use_datacache) { + d3n_data_cache = new D3nDataCache(); + d3n_data_cache->init(cct); + } + + return ret; +} + +int RGWRados::register_to_service_map(const DoutPrefixProvider *dpp, const string& daemon_type, const map& meta) +{ + string name = cct->_conf->name.get_id(); + if (name.compare(0, 4, "rgw.") == 0) { + name = name.substr(4); + } + map metadata = meta; + metadata["num_handles"] = "1"s; + metadata["zonegroup_id"] = svc.zone->get_zonegroup().get_id(); + metadata["zonegroup_name"] = svc.zone->get_zonegroup().get_name(); + metadata["zone_name"] = svc.zone->zone_name(); + metadata["zone_id"] = svc.zone->zone_id().id; + metadata["realm_name"] = svc.zone->get_realm().get_name(); + metadata["realm_id"] = svc.zone->get_realm().get_id(); + metadata["id"] = name; + int ret = rados.service_daemon_register( + daemon_type, + stringify(rados.get_instance_id()), + metadata); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + + return 0; +} + +int RGWRados::update_service_map(const DoutPrefixProvider *dpp, std::map&& status) +{ + int ret = rados.service_daemon_update_status(move(status)); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: service_daemon_update_status() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + + return 0; +} + +/** + * Initialize the RADOS instance and prepare to do other ops + * Returns 0 on success, -ERR# on failure. 
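+ *
+ * Note: the header above mirrors init_rados()'s. init_complete() is the
+ * second stage of startup; a sketch of the expected order (the caller
+ * shape is an assumption, the functions are the ones defined in this
+ * file):
+ *
+ *   int r = store.init_begin(dpp);   // init_svc() + init_ctl() + init_rados()
+ *   if (r == 0) {
+ *     r = store.init_complete(dpp);  // open pools, start worker threads
+ *   }
+ *
+ * Here "store" is a constructed RGWRados instance. init_complete() opens
+ * the root/gc/lc/objexp/reshard/notif pools and starts the GC, LC,
+ * object expirer, sync, trim and reshard workers according to the
+ * use_* and run_* flags.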
+ */ +int RGWRados::init_complete(const DoutPrefixProvider *dpp) +{ + int ret; + + /* + * create sync module instance even if we don't run sync thread, might need it for radosgw-admin + */ + sync_module = svc.sync_modules->get_sync_module(); + + ret = open_root_pool_ctx(dpp); + if (ret < 0) + return ret; + + ret = open_gc_pool_ctx(dpp); + if (ret < 0) + return ret; + + ret = open_lc_pool_ctx(dpp); + if (ret < 0) + return ret; + + ret = open_objexp_pool_ctx(dpp); + if (ret < 0) + return ret; + + ret = open_reshard_pool_ctx(dpp); + if (ret < 0) + return ret; + + ret = open_notif_pool_ctx(dpp); + if (ret < 0) + return ret; + + pools_initialized = true; + + if (use_gc) { + gc = new RGWGC(); + gc->initialize(cct, this); + } else { + ldpp_dout(dpp, 5) << "note: GC not initialized" << dendl; + } + + obj_expirer = new RGWObjectExpirer(this->driver); + + if (use_gc_thread && use_gc) { + gc->start_processor(); + obj_expirer->start_processor(); + } + + auto& current_period = svc.zone->get_current_period(); + auto& zonegroup = svc.zone->get_zonegroup(); + auto& zone_params = svc.zone->get_zone_params(); + auto& zone = svc.zone->get_zone(); + + /* no point of running sync thread if we don't have a master zone configured + or there is no rest_master_conn */ + if (!svc.zone->need_to_sync()) { + run_sync_thread = false; + } + + if (svc.zone->is_meta_master()) { + auto md_log = svc.mdlog->get_log(current_period.get_id()); + meta_notifier = new RGWMetaNotifier(this, md_log); + meta_notifier->start(); + } + + /* init it anyway, might run sync through radosgw-admin explicitly */ + sync_tracer = new RGWSyncTraceManager(cct, cct->_conf->rgw_sync_trace_history_size); + sync_tracer->init(this); + ret = sync_tracer->hook_to_admin_command(); + if (ret < 0) { + return ret; + } + + if (run_sync_thread) { + for (const auto &pt: zonegroup.placement_targets) { + if (zone_params.placement_pools.find(pt.second.name) + == zone_params.placement_pools.end()){ + ldpp_dout(dpp, 0) << "WARNING: This zone does not contain the placement target " + << pt.second.name << " present in zonegroup" << dendl; + } + } + auto async_processor = svc.rados->get_async_processor(); + std::lock_guard l{meta_sync_thread_lock}; + meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this->driver, async_processor); + ret = meta_sync_processor_thread->init(dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to initialize meta sync thread" << dendl; + return ret; + } + meta_sync_processor_thread->start(); + + // configure the bucket trim manager + rgw::BucketTrimConfig config; + rgw::configure_bucket_trim(cct, config); + + bucket_trim.emplace(this->driver, config); + ret = bucket_trim->init(); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to start bucket trim manager" << dendl; + return ret; + } + svc.datalog_rados->set_observer(&*bucket_trim); + + std::lock_guard dl{data_sync_thread_lock}; + for (auto source_zone : svc.zone->get_data_sync_source_zones()) { + ldpp_dout(dpp, 5) << "starting data sync thread for zone " << source_zone->name << dendl; + auto *thread = new RGWDataSyncProcessorThread(this->driver, svc.rados->get_async_processor(), source_zone); + ret = thread->init(dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to initialize data sync thread" << dendl; + return ret; + } + thread->start(); + data_sync_processor_threads[rgw_zone_id(source_zone->id)] = thread; + } + auto interval = cct->_conf->rgw_sync_log_trim_interval; + if (interval > 0) { + sync_log_trimmer = new RGWSyncLogTrimThread(this->driver, 
&*bucket_trim, interval); + ret = sync_log_trimmer->init(dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to initialize sync log trim thread" << dendl; + return ret; + } + sync_log_trimmer->start(); + } + } + if (cct->_conf->rgw_data_notify_interval_msec) { + data_notifier = new RGWDataNotifier(this); + data_notifier->start(); + } + + binfo_cache = new RGWChainedCacheImpl; + binfo_cache->init(svc.cache); + + lc = new RGWLC(); + lc->initialize(cct, this->driver); + + if (use_lc_thread) + lc->start_processor(); + + quota_handler = RGWQuotaHandler::generate_handler(dpp, this->driver, quota_threads); + + bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards : + zone.bucket_index_max_shards); + if (bucket_index_max_shards > get_max_bucket_shards()) { + bucket_index_max_shards = get_max_bucket_shards(); + ldpp_dout(dpp, 1) << __func__ << " bucket index max shards is too large, reset to value: " + << get_max_bucket_shards() << dendl; + } + ldpp_dout(dpp, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl; + + bool need_tombstone_cache = !svc.zone->get_zone_data_notify_to_map().empty(); /* have zones syncing from us */ + + if (need_tombstone_cache) { + obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size); + } + + reshard_wait = std::make_shared(); + + reshard = new RGWReshard(this->driver); + + // disable reshard thread based on zone/zonegroup support + run_reshard_thread = run_reshard_thread && svc.zone->can_reshard(); + + if (run_reshard_thread) { + reshard->start_processor(); + } + + index_completion_manager = new RGWIndexCompletionManager(this); + ret = rgw::notify::init(cct, driver, dpp); + if (ret < 0 ) { + ldpp_dout(dpp, 1) << "ERROR: failed to initialize notification manager" << dendl; + } + + return ret; +} + +int RGWRados::init_svc(bool raw, const DoutPrefixProvider *dpp) +{ + if (raw) { + return svc.init_raw(cct, use_cache, null_yield, dpp); + } + + return svc.init(cct, use_cache, run_sync_thread, null_yield, dpp); +} + +int RGWRados::init_ctl(const DoutPrefixProvider *dpp) +{ + return ctl.init(&svc, driver, dpp); +} + +/** + * Initialize the RADOS instance and prepare to do other ops + * Returns 0 on success, -ERR# on failure. + */ +int RGWRados::init_begin(const DoutPrefixProvider *dpp) +{ + int ret; + + inject_notify_timeout_probability = + cct->_conf.get_val("rgw_inject_notify_timeout_probability"); + max_notify_retries = cct->_conf.get_val("rgw_max_notify_retries"); + + ret = init_svc(false, dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl; + return ret; + } + + ret = init_ctl(dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to init ctls (ret=" << cpp_strerror(-ret) << ")" << dendl; + return ret; + } + + host_id = svc.zone_utils->gen_host_id(); + + return init_rados(); +} + +/** + * Open the pool used as root for this gateway + * Returns: 0 on success, -ERR# otherwise. 
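+ *
+ * All of the open_*_pool_ctx() helpers that follow share one shape; the
+ * two trailing booleans passed to rgw_init_ioctx() are "create" and
+ * "mostly_omap", e.g. for the object-expirer pool:
+ *
+ *   return rgw_init_ioctx(dpp, get_rados_handle(),
+ *                         svc.zone->get_zone_params().log_pool,
+ *                         objexp_pool_ctx, true, true);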
+ */ +int RGWRados::open_root_pool_ctx(const DoutPrefixProvider *dpp) +{ + return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().domain_root, root_pool_ctx, true, true); +} + +int RGWRados::open_gc_pool_ctx(const DoutPrefixProvider *dpp) +{ + return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().gc_pool, gc_pool_ctx, true, true); +} + +int RGWRados::open_lc_pool_ctx(const DoutPrefixProvider *dpp) +{ + return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().lc_pool, lc_pool_ctx, true, true); +} + +int RGWRados::open_objexp_pool_ctx(const DoutPrefixProvider *dpp) +{ + return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, objexp_pool_ctx, true, true); +} + +int RGWRados::open_reshard_pool_ctx(const DoutPrefixProvider *dpp) +{ + return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().reshard_pool, reshard_pool_ctx, true, true); +} + +int RGWRados::open_notif_pool_ctx(const DoutPrefixProvider *dpp) +{ + return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().notif_pool, notif_pool_ctx, true, true); +} + +int RGWRados::open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx& io_ctx, + bool mostly_omap) +{ + constexpr bool create = true; // create the pool if it doesn't exist + return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create, mostly_omap); +} + +/**** logs ****/ + +struct log_list_state { + string prefix; + librados::IoCtx io_ctx; + librados::NObjectIterator obit; +}; + +int RGWRados::log_list_init(const DoutPrefixProvider *dpp, const string& prefix, RGWAccessHandle *handle) +{ + log_list_state *state = new log_list_state; + int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx); + if (r < 0) { + delete state; + return r; + } + state->prefix = prefix; + state->obit = state->io_ctx.nobjects_begin(); + *handle = (RGWAccessHandle)state; + return 0; +} + +int RGWRados::log_list_next(RGWAccessHandle handle, string *name) +{ + log_list_state *state = static_cast(handle); + while (true) { + if (state->obit == state->io_ctx.nobjects_end()) { + delete state; + return -ENOENT; + } + if (state->prefix.length() && + state->obit->get_oid().find(state->prefix) != 0) { + state->obit++; + continue; + } + *name = state->obit->get_oid(); + state->obit++; + break; + } + return 0; +} + +int RGWRados::log_remove(const DoutPrefixProvider *dpp, const string& name) +{ + librados::IoCtx io_ctx; + int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx); + if (r < 0) + return r; + return io_ctx.remove(name); +} + +struct log_show_state { + librados::IoCtx io_ctx; + bufferlist bl; + bufferlist::const_iterator p; + string name; + uint64_t pos; + bool eof; + log_show_state() : pos(0), eof(false) {} +}; + +int RGWRados::log_show_init(const DoutPrefixProvider *dpp, const string& name, RGWAccessHandle *handle) +{ + log_show_state *state = new log_show_state; + int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx); + if (r < 0) { + delete state; + return r; + } + state->name = name; + *handle = (RGWAccessHandle)state; + return 0; +} + +int RGWRados::log_show_next(const DoutPrefixProvider *dpp, RGWAccessHandle handle, rgw_log_entry *entry) +{ + log_show_state *state = static_cast(handle); + off_t off = state->p.get_off(); + + ldpp_dout(dpp, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length() + << " off " << 
off + << " eof " << (int)state->eof + << dendl; + // read some? + unsigned chunk = 1024*1024; + if ((state->bl.length() - off) < chunk/2 && !state->eof) { + bufferlist more; + int r = state->io_ctx.read(state->name, more, chunk, state->pos); + if (r < 0) + return r; + state->pos += r; + bufferlist old; + try { + old.substr_of(state->bl, off, state->bl.length() - off); + } catch (buffer::error& err) { + return -EINVAL; + } + state->bl = std::move(old); + state->bl.claim_append(more); + state->p = state->bl.cbegin(); + if ((unsigned)r < chunk) + state->eof = true; + ldpp_dout(dpp, 10) << " read " << r << dendl; + } + + if (state->p.end()) + return 0; // end of file + try { + decode(*entry, state->p); + } + catch (const buffer::error &e) { + return -EINVAL; + } + return 1; +} + +/** + * usage_log_hash: get usage log key hash, based on name and index + * + * Get the usage object name. Since a user may have more than 1 + * object holding that info (multiple shards), we use index to + * specify that shard number. Once index exceeds max shards it + * wraps. + * If name is not being set, results for all users will be returned + * and index will wrap only after total shards number. + * + * @param cct [in] ceph context + * @param name [in] user name + * @param hash [out] hash value + * @param index [in] shard index number + */ +static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index) +{ + uint32_t val = index; + + if (!name.empty()) { + int max_user_shards = cct->_conf->rgw_usage_max_user_shards; + val %= max_user_shards; + val += ceph_str_hash_linux(name.c_str(), name.size()); + } + char buf[17]; + int max_shards = cct->_conf->rgw_usage_max_shards; + snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards)); + hash = buf; +} + +int RGWRados::log_usage(const DoutPrefixProvider *dpp, map& usage_info) +{ + uint32_t index = 0; + + map log_objs; + + string hash; + string last_user; + + /* restructure usage map, zone by object hash */ + map::iterator iter; + for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) { + const rgw_user_bucket& ub = iter->first; + RGWUsageBatch& info = iter->second; + + if (ub.user.empty()) { + ldpp_dout(dpp, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl; + continue; + } + + if (ub.user != last_user) { + /* index *should* be random, but why waste extra cycles + in most cases max user shards is not going to exceed 1, + so just incrementing it */ + usage_log_hash(cct, ub.user, hash, index++); + } + last_user = ub.user; + vector& v = log_objs[hash].entries; + + for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) { + v.push_back(miter->second); + } + } + + map::iterator liter; + + for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) { + int r = cls_obj_usage_log_add(dpp, liter->first, liter->second); + if (r < 0) + return r; + } + return 0; +} + +int RGWRados::read_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch, + uint32_t max_entries, bool *is_truncated, RGWUsageIter& usage_iter, map& usage) +{ + uint32_t num = max_entries; + string hash, first_hash; + string user_str = user.to_str(); + usage_log_hash(cct, user_str, first_hash, 0); + + if (usage_iter.index) { + usage_log_hash(cct, user_str, hash, usage_iter.index); + } else { + hash = first_hash; + } + + usage.clear(); + + do { + map ret_usage; + map::iterator iter; + + int ret = 
cls_obj_usage_log_read(dpp, hash, user_str, bucket_name, start_epoch, end_epoch, num,
+                                 usage_iter.read_iter, ret_usage, is_truncated);
+    if (ret == -ENOENT)
+      goto next;
+
+    if (ret < 0)
+      return ret;
+
+    num -= ret_usage.size();
+
+    for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
+      usage[iter->first].aggregate(iter->second);
+    }
+
+next:
+    if (!*is_truncated) {
+      usage_iter.read_iter.clear();
+      usage_log_hash(cct, user_str, hash, ++usage_iter.index);
+    }
+  } while (num && !*is_truncated && hash != first_hash);
+  return 0;
+}
+
+int RGWRados::trim_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch)
+{
+  uint32_t index = 0;
+  string hash, first_hash;
+  string user_str = user.to_str();
+  usage_log_hash(cct, user_str, first_hash, index);
+
+  hash = first_hash;
+  do {
+    int ret = cls_obj_usage_log_trim(dpp, hash, user_str, bucket_name, start_epoch, end_epoch);
+
+    if (ret < 0 && ret != -ENOENT)
+      return ret;
+
+    usage_log_hash(cct, user_str, hash, ++index);
+  } while (hash != first_hash);
+
+  return 0;
+}
+
+
+int RGWRados::clear_usage(const DoutPrefixProvider *dpp)
+{
+  auto max_shards = cct->_conf->rgw_usage_max_shards;
+  int ret = 0;
+  for (unsigned i = 0; i < max_shards; i++) {
+    string oid = RGW_USAGE_OBJ_PREFIX + to_string(i);
+    ret = cls_obj_usage_log_clear(dpp, oid);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "usage clear on oid=" << oid << " failed with ret=" << ret << dendl;
+      return ret;
+    }
+  }
+  return ret;
+}
+
+int RGWRados::decode_policy(const DoutPrefixProvider *dpp,
+                            ceph::buffer::list& bl,
+                            ACLOwner *owner)
+{
+  auto i = bl.cbegin();
+  RGWAccessControlPolicy policy(cct);
+  try {
+    policy.decode_owner(i);
+  } catch (buffer::error& err) {
+    ldpp_dout(dpp, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
+    return -EIO;
+  }
+  *owner = policy.get_owner();
+  return 0;
+}
+
+int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id, const DoutPrefixProvider *dpp)
+{
+  rgw_bucket bucket = bucket_info.bucket;
+  bucket.update_bucket_id(new_bucket_id);
+
+  bucket_info.objv_tracker.clear();
+  int ret = store->get_bucket_instance_info(bucket, bucket_info, nullptr, nullptr, null_yield, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+
+  return 0;
+}
+
+
+/**
+ * Get ordered listing of the objects in a bucket.
+ *
+ * max_p: maximum number of results to return
+ * bucket: bucket to list contents of
+ * prefix: only return results that match this prefix
+ * delim: do not include results that match this string.
+ *     Any skipped results will have the matching portion of their name
+ *     inserted in common_prefixes with a "true" mark.
+ * marker: if filled in, begin the listing with this object.
+ * end_marker: if filled in, end the listing with this object.
+ * result: the objects are put in here.
+ * common_prefixes: if delim is filled in, any matching prefixes are
+ *     placed here.
+ * is_truncated: if number of objects in the bucket is bigger than
+ *     max, then truncated.
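+ *
+ * Typical use (a sketch only; it mirrors on_last_entry_in_listing()
+ * later in this file, and the prefix/delimiter/limit values here are
+ * hypothetical):
+ *
+ *   RGWRados::Bucket target(this, bucket_info);
+ *   RGWRados::Bucket::List list_op(&target);
+ *   list_op.params.prefix = "photos/";
+ *   list_op.params.delim = "/";
+ *   std::vector<rgw_bucket_dir_entry> entries;
+ *   bool is_truncated = false;
+ *   int r = list_op.list_objects(dpp, 100, &entries, nullptr,
+ *                                &is_truncated, null_yield);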
+ */ +int RGWRados::Bucket::List::list_objects_ordered( + const DoutPrefixProvider *dpp, + int64_t max_p, + std::vector *result, + std::map *common_prefixes, + bool *is_truncated, + optional_yield y) +{ + RGWRados *store = target->get_store(); + CephContext *cct = store->ctx(); + int shard_id = target->get_shard_id(); + const auto& current_index = target->get_bucket_info().layout.current_index; + + int count = 0; + bool truncated = true; + bool cls_filtered = false; + const int64_t max = // protect against memory issues and negative vals + std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p)); + int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead, max); + + result->clear(); + + // use a local marker; either the marker will have a previous entry + // or it will be empty; either way it's OK to copy + rgw_obj_key marker_obj(params.marker.name, + params.marker.instance, + params.ns.empty() ? params.marker.ns : params.ns); + rgw_obj_index_key cur_marker; + marker_obj.get_index_key(&cur_marker); + + rgw_obj_key end_marker_obj(params.end_marker.name, + params.end_marker.instance, + params.ns.empty() ? params.end_marker.ns : params.ns); + rgw_obj_index_key cur_end_marker; + end_marker_obj.get_index_key(&cur_end_marker); + const bool cur_end_marker_valid = !params.end_marker.empty(); + + rgw_obj_key prefix_obj(params.prefix); + prefix_obj.set_ns(params.ns); + std::string cur_prefix = prefix_obj.get_index_key_name(); + std::string after_delim_s; /* needed in !params.delim.empty() AND later */ + + if (!params.delim.empty()) { + after_delim_s = cls_rgw_after_delim(params.delim); + /* if marker points at a common prefix, fast forward it into its + * upper bound string */ + int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size()); + if (delim_pos >= 0) { + string s = cur_marker.name.substr(0, delim_pos); + s.append(after_delim_s); + cur_marker = s; + } + } + + // we'll stop after this many attempts as long we return at least + // one entry; but we will also go beyond this number of attempts + // until we return at least one entry + constexpr uint16_t SOFT_MAX_ATTEMPTS = 8; + + rgw_obj_index_key prev_marker; + for (uint16_t attempt = 1; /* empty */; ++attempt) { + ldpp_dout(dpp, 20) << __func__ << + ": starting attempt " << attempt << dendl; + + if (attempt > 1 && !(prev_marker < cur_marker)) { + // we've failed to make forward progress + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + " marker failed to make forward progress; attempt=" << attempt << + ", prev_marker=" << prev_marker << + ", cur_marker=" << cur_marker << dendl; + break; + } + prev_marker = cur_marker; + + ent_map_t ent_map; + ent_map.reserve(read_ahead); + int r = store->cls_bucket_list_ordered(dpp, + target->get_bucket_info(), + current_index, + shard_id, + cur_marker, + cur_prefix, + params.delim, + read_ahead + 1 - count, + params.list_versions, + attempt, + ent_map, + &truncated, + &cls_filtered, + &cur_marker, + y, + params.force_check_filter); + if (r < 0) { + return r; + } + + for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) { + rgw_bucket_dir_entry& entry = eiter->second; + rgw_obj_index_key index_key = entry.key; + rgw_obj_key obj(index_key); + + ldpp_dout(dpp, 20) << __func__ << + ": considering entry " << entry.key << dendl; + + /* note that parse_raw_oid() here will not set the correct + * object's instance, as rgw_obj_index_key encodes that + * separately. 
We don't need to set the instance because it's
+       * not needed for the checks here and we end up using the raw
+       * entry for the return vector
+       */
+      bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
+      if (!valid) {
+        ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+          " could not parse object name: " << obj.name << dendl;
+        continue;
+      }
+
+      bool matched_ns = (obj.ns == params.ns);
+      if (!params.list_versions && !entry.is_visible()) {
+        ldpp_dout(dpp, 10) << __func__ <<
+          ": skipping not visible entry \"" << entry.key << "\"" << dendl;
+        continue;
+      }
+
+      if (params.enforce_ns && !matched_ns) {
+        if (!params.ns.empty()) {
+          /* we've iterated past the namespace we're searching -- done now */
+          truncated = false;
+          ldpp_dout(dpp, 10) << __func__ <<
+            ": finished due to getting past requested namespace \"" <<
+            params.ns << "\"" << dendl;
+          goto done;
+        }
+
+        /* we're skipping past namespaced objects */
+        ldpp_dout(dpp, 20) << __func__ <<
+          ": skipping past namespaced objects, including \"" << entry.key <<
+          "\"" << dendl;
+        continue;
+      }
+
+      if (cur_end_marker_valid && cur_end_marker <= index_key) {
+        truncated = false;
+        ldpp_dout(dpp, 10) << __func__ <<
+          ": finished due to hitting end marker of \"" << cur_end_marker <<
+          "\" with \"" << entry.key << "\"" << dendl;
+        goto done;
+      }
+
+      if (count < max) {
+        params.marker = index_key;
+        next_marker = index_key;
+      }
+
+      if (params.access_list_filter &&
+          ! params.access_list_filter->filter(obj.name, index_key.name)) {
+        ldpp_dout(dpp, 20) << __func__ <<
+          ": skipping past filtered out entry \"" << entry.key <<
+          "\"" << dendl;
+        continue;
+      }
+
+      if (params.prefix.size() &&
+          0 != obj.name.compare(0, params.prefix.size(), params.prefix)) {
+        ldpp_dout(dpp, 20) << __func__ <<
+          ": skipping object \"" << entry.key <<
+          "\" that doesn't match prefix \"" << params.prefix << "\"" << dendl;
+        continue;
+      }
+
+      if (!params.delim.empty()) {
+        const int delim_pos = obj.name.find(params.delim, params.prefix.size());
+        if (delim_pos >= 0) {
+          // run either the code where delimiter filtering is done a)
+          // in the OSD/CLS or b) here.
+ if (cls_filtered) { + // NOTE: this condition is for the newer versions of the + // OSD that does filtering on the CLS side should only + // find one delimiter at the end if it finds any after the + // prefix + if (delim_pos != + int(obj.name.length() - params.delim.length())) { + ldpp_dout(dpp, 0) << "WARNING: " << __func__ << + " found delimiter in place other than the end of " + "the prefix; obj.name=" << obj.name << + ", prefix=" << params.prefix << dendl; + } + if (common_prefixes) { + if (count >= max) { + truncated = true; + ldpp_dout(dpp, 10) << __func__ << + ": stopping early with common prefix \"" << entry.key << + "\" because requested number (" << max << + ") reached (cls filtered)" << dendl; + goto done; + } + + (*common_prefixes)[obj.name] = true; + count++; + } + + ldpp_dout(dpp, 20) << __func__ << + ": finished entry with common prefix \"" << entry.key << + "\" so continuing loop (cls filtered)" << dendl; + continue; + } else { + // NOTE: this condition is for older versions of the OSD + // that do not filter on the CLS side, so the following code + // must do the filtering; once we reach version 16 of ceph, + // this code can be removed along with the conditional that + // can lead this way + + /* extract key -with trailing delimiter- for CommonPrefix */ + string prefix_key = + obj.name.substr(0, delim_pos + params.delim.length()); + + if (common_prefixes && + common_prefixes->find(prefix_key) == common_prefixes->end()) { + if (count >= max) { + truncated = true; + ldpp_dout(dpp, 10) << __func__ << + ": stopping early with common prefix \"" << entry.key << + "\" because requested number (" << max << + ") reached (not cls filtered)" << dendl; + goto done; + } + next_marker = prefix_key; + (*common_prefixes)[prefix_key] = true; + + count++; + } + + ldpp_dout(dpp, 20) << __func__ << + ": finished entry with common prefix \"" << entry.key << + "\" so continuing loop (not cls filtered)" << dendl; + continue; + } // if we're running an older OSD version + } // if a delimiter was found after prefix + } // if a delimiter was passed in + + if (count >= max) { + truncated = true; + ldpp_dout(dpp, 10) << __func__ << + ": stopping early with entry \"" << entry.key << + "\" because requested number (" << max << + ") reached" << dendl; + goto done; + } + + ldpp_dout(dpp, 20) << __func__ << + ": adding entry " << entry.key << " to result" << dendl; + + result->emplace_back(std::move(entry)); + count++; + } // eiter for loop + + // NOTE: the following conditional is needed by older versions of + // the OSD that don't do delimiter filtering on the CLS side; once + // we reach version 16 of ceph, the following conditional and the + // code within can be removed + if (!cls_filtered && !params.delim.empty()) { + int marker_delim_pos = + cur_marker.name.find(params.delim, cur_prefix.size()); + if (marker_delim_pos >= 0) { + std::string skip_after_delim = + cur_marker.name.substr(0, marker_delim_pos); + skip_after_delim.append(after_delim_s); + + ldpp_dout(dpp, 20) << __func__ << + ": skip_after_delim=" << skip_after_delim << dendl; + + if (skip_after_delim > cur_marker.name) { + cur_marker = skip_after_delim; + ldpp_dout(dpp, 20) << __func__ << + ": setting cur_marker=" << cur_marker.name << + "[" << cur_marker.instance << "]" << dendl; + } + } + } // if older osd didn't do delimiter filtering + + ldpp_dout(dpp, 10) << __func__ << + ": end of outer loop, truncated=" << truncated << + ", count=" << count << ", attempt=" << attempt << dendl; + + if (!truncated || count >= (max + 1) / 2) { + // 
if we finished listing, or if we're returning at least half the
+      // requested entries, that's enough; S3 and Swift protocols allow
+      // returning fewer than max entries
+      ldpp_dout(dpp, 10) << __func__ <<
+        ": exiting attempt loop because we reached end (" << truncated <<
+        ") or we're returning half the requested entries (" << count <<
+        " of " << max << ")" << dendl;
+      break;
+    } else if (attempt > SOFT_MAX_ATTEMPTS && count >= 1) {
+      // if we've made at least 8 attempts and we have some, but very
+      // few, results, return with what we have
+      ldpp_dout(dpp, 10) << __func__ <<
+        ": exiting attempt loop because we made " << attempt <<
+        " attempts and we're returning " << count << " entries" << dendl;
+      break;
+    }
+  } // for (uint16_t attempt...
+
+done:
+
+  if (is_truncated) {
+    *is_truncated = truncated;
+  }
+
+  return 0;
+} // list_objects_ordered
+
+
+/**
+ * Get listing of the objects in a bucket and allow the results to be out
+ * of order.
+ *
+ * Even though there are key differences with the ordered counterpart,
+ * the parameters are the same to maintain some compatibility.
+ *
+ * max: maximum number of results to return
+ * bucket: bucket to list contents of
+ * prefix: only return results that match this prefix
+ * delim: should not be set; if it is we should have indicated an error
+ * marker: if filled in, begin the listing with this object.
+ * end_marker: if filled in, end the listing with this object.
+ * result: the objects are put in here.
+ * common_prefixes: this is never filled with an unordered list; the param
+ *     is maintained for compatibility
+ * is_truncated: if number of objects in the bucket is bigger than max, then
+ *     truncated.
+ */
+int RGWRados::Bucket::List::list_objects_unordered(const DoutPrefixProvider *dpp,
+                                                   int64_t max_p,
+                                                   std::vector<rgw_bucket_dir_entry>* result,
+                                                   std::map<std::string, bool>* common_prefixes,
+                                                   bool* is_truncated,
+                                                   optional_yield y)
+{
+  RGWRados *store = target->get_store();
+  int shard_id = target->get_shard_id();
+  const auto& current_index = target->get_bucket_info().layout.current_index;
+
+  int count = 0;
+  bool truncated = true;
+
+  const int64_t max = // protect against memory issues and negative vals
+    std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
+
+  // read a few extra in each call to cls_bucket_list_unordered in
+  // case some are filtered out due to namespace matching, versioning,
+  // filtering, etc.
+  const int64_t max_read_ahead = 100;
+  const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead));
+
+  result->clear();
+
+  // use a local marker; either the marker will have a previous entry
+  // or it will be empty; either way it's OK to copy
+  rgw_obj_key marker_obj(params.marker.name,
+                         params.marker.instance,
+                         params.ns.empty() ? params.marker.ns : params.ns);
+  rgw_obj_index_key cur_marker;
+  marker_obj.get_index_key(&cur_marker);
+
+  rgw_obj_key end_marker_obj(params.end_marker.name,
+                             params.end_marker.instance,
+                             params.ns.empty() ?
params.end_marker.ns : params.ns);
+  rgw_obj_index_key cur_end_marker;
+  end_marker_obj.get_index_key(&cur_end_marker);
+  const bool cur_end_marker_valid = !params.end_marker.empty();
+
+  rgw_obj_key prefix_obj(params.prefix);
+  prefix_obj.set_ns(params.ns);
+  std::string cur_prefix = prefix_obj.get_index_key_name();
+
+  while (truncated && count <= max) {
+    std::vector<rgw_bucket_dir_entry> ent_list;
+    ent_list.reserve(read_ahead);
+
+    int r = store->cls_bucket_list_unordered(dpp,
+                                             target->get_bucket_info(),
+                                             current_index,
+                                             shard_id,
+                                             cur_marker,
+                                             cur_prefix,
+                                             read_ahead,
+                                             params.list_versions,
+                                             ent_list,
+                                             &truncated,
+                                             &cur_marker,
+                                             y);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+        " cls_bucket_list_unordered returned " << r << " for " <<
+        target->get_bucket_info().bucket << dendl;
+      return r;
+    }
+
+    // NB: while regions of ent_list will be sorted, we have no
+    // guarantee that all items will be sorted since they can cross
+    // shard boundaries
+
+    for (auto& entry : ent_list) {
+      rgw_obj_index_key index_key = entry.key;
+      rgw_obj_key obj(index_key);
+
+      if (count < max) {
+        params.marker.set(index_key);
+        next_marker.set(index_key);
+      }
+
+      /* note that parse_raw_oid() here will not set the correct
+       * object's instance, as rgw_obj_index_key encodes that
+       * separately. We don't need to set the instance because it's
+       * not needed for the checks here and we end up using the raw
+       * entry for the return vector
+       */
+      bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
+      if (!valid) {
+        ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+          " could not parse object name: " << obj.name << dendl;
+        continue;
+      }
+
+      if (!params.list_versions && !entry.is_visible()) {
+        ldpp_dout(dpp, 20) << __func__ <<
+          ": skipping \"" << index_key <<
+          "\" because not listing versions and entry not visible" << dendl;
+        continue;
+      }
+
+      if (params.enforce_ns && obj.ns != params.ns) {
+        ldpp_dout(dpp, 20) << __func__ <<
+          ": skipping \"" << index_key <<
+          "\" because namespace does not match" << dendl;
+        continue;
+      }
+
+      if (cur_end_marker_valid && cur_end_marker <= index_key) {
+        // we're not guaranteed items will come in order, so we have
+        // to loop through all
+        ldpp_dout(dpp, 20) << __func__ <<
+          ": skipping \"" << index_key <<
+          "\" because after end_marker" << dendl;
+        continue;
+      }
+
+      if (params.access_list_filter &&
+          !params.access_list_filter->filter(obj.name, index_key.name)) {
+        ldpp_dout(dpp, 20) << __func__ <<
+          ": skipping \"" << index_key <<
+          "\" because doesn't match filter" << dendl;
+        continue;
+      }
+
+      if (params.prefix.size() &&
+          (0 != obj.name.compare(0, params.prefix.size(), params.prefix))) {
+        ldpp_dout(dpp, 20) << __func__ <<
+          ": skipping \"" << index_key <<
+          "\" because doesn't match prefix" << dendl;
+        continue;
+      }
+
+      if (count >= max) {
+        truncated = true;
+        goto done;
+      }
+
+      result->emplace_back(std::move(entry));
+      count++;
+    } // for (auto& entry : ent_list)
+  } // while (truncated && count <= max)
+
+done:
+
+  if (is_truncated) {
+    *is_truncated = truncated;
+  }
+
+  return 0;
+} // list_objects_unordered
+
+
+/**
+ * create a rados pool and its associated meta info
+ * returns 0 on success, -ERR# otherwise.
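+ *
+ * A minimal call sketch (the pool name is hypothetical):
+ *
+ *   rgw_pool pool("default.rgw.buckets.data");
+ *   int r = create_pool(dpp, pool);
+ *   if (r < 0) {
+ *     return r;
+ *   }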
+ */ +int RGWRados::create_pool(const DoutPrefixProvider *dpp, const rgw_pool& pool) +{ + librados::IoCtx io_ctx; + constexpr bool create = true; + return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create); +} + +void RGWRados::create_bucket_id(string *bucket_id) +{ + uint64_t iid = instance_id(); + uint64_t bid = next_bucket_id(); + char buf[svc.zone->get_zone_params().get_id().size() + 48]; + snprintf(buf, sizeof(buf), "%s.%" PRIu64 ".%" PRIu64, + svc.zone->get_zone_params().get_id().c_str(), iid, bid); + *bucket_id = buf; +} + +int RGWRados::create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket, + const string& zonegroup_id, + const rgw_placement_rule& placement_rule, + const string& swift_ver_location, + const RGWQuotaInfo * pquota_info, + map& attrs, + RGWBucketInfo& info, + obj_version *pobjv, + obj_version *pep_objv, + real_time creation_time, + rgw_bucket *pmaster_bucket, + uint32_t *pmaster_num_shards, + optional_yield y, + const DoutPrefixProvider *dpp, + bool exclusive) +{ +#define MAX_CREATE_RETRIES 20 /* need to bound retries */ + rgw_placement_rule selected_placement_rule; + RGWZonePlacementInfo rule_info; + + for (int i = 0; i < MAX_CREATE_RETRIES; i++) { + int ret = 0; + ret = svc.zone->select_bucket_placement(dpp, owner, zonegroup_id, placement_rule, + &selected_placement_rule, &rule_info, y); + if (ret < 0) + return ret; + + if (!pmaster_bucket) { + create_bucket_id(&bucket.marker); + bucket.bucket_id = bucket.marker; + } else { + bucket.marker = pmaster_bucket->marker; + bucket.bucket_id = pmaster_bucket->bucket_id; + } + + RGWObjVersionTracker& objv_tracker = info.objv_tracker; + + objv_tracker.read_version.clear(); + + if (pobjv) { + objv_tracker.write_version = *pobjv; + } else { + objv_tracker.generate_new_write_ver(cct); + } + + info.bucket = bucket; + info.owner = owner.user_id; + info.zonegroup = zonegroup_id; + info.placement_rule = selected_placement_rule; + info.swift_ver_location = swift_ver_location; + info.swift_versioning = (!swift_ver_location.empty()); + + init_default_bucket_layout(cct, info.layout, svc.zone->get_zone(), + pmaster_num_shards ? 
+ std::optional{*pmaster_num_shards} : + std::nullopt, + rule_info.index_type); + + info.requester_pays = false; + if (real_clock::is_zero(creation_time)) { + info.creation_time = ceph::real_clock::now(); + } else { + info.creation_time = creation_time; + } + if (pquota_info) { + info.quota = *pquota_info; + } + + int r = svc.bi->init_index(dpp, info, info.layout.current_index); + if (r < 0) { + return r; + } + + ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true, dpp); + if (ret == -ECANCELED) { + ret = -EEXIST; + } + if (ret == -EEXIST) { + /* we need to reread the info and return it, caller will have a use for it */ + RGWBucketInfo orig_info; + r = get_bucket_info(&svc, bucket.tenant, bucket.name, orig_info, NULL, null_yield, NULL); + if (r < 0) { + if (r == -ENOENT) { + continue; + } + ldpp_dout(dpp, 0) << "get_bucket_info returned " << r << dendl; + return r; + } + + /* only remove it if it's a different bucket instance */ + if (orig_info.bucket.bucket_id != bucket.bucket_id) { + int r = svc.bi->clean_index(dpp, info, info.layout.current_index); + if (r < 0) { + ldpp_dout(dpp, 0) << "WARNING: could not remove bucket index (r=" << r << ")" << dendl; + } + r = ctl.bucket->remove_bucket_instance_info(info.bucket, info, null_yield, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): failed to remove bucket instance info: bucket instance=" << info.bucket.get_key() << ": r=" << r << dendl; + /* continue anyway */ + } + } + + info = std::move(orig_info); + /* ret == -EEXIST here */ + } + return ret; + } + + /* this is highly unlikely */ + ldpp_dout(dpp, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl; + return -ENOENT; +} + +bool RGWRados::obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj) +{ + get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc); + + return get_obj_data_pool(placement_rule, obj, &raw_obj->pool); +} + +std::string RGWRados::get_cluster_fsid(const DoutPrefixProvider *dpp, optional_yield y) +{ + return svc.rados->cluster_fsid(); +} + +int RGWRados::get_obj_head_ioctx(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + const rgw_obj& obj, + librados::IoCtx *ioctx) +{ + std::string oid, key; + get_obj_bucket_and_oid_loc(obj, oid, key); + + rgw_pool pool; + if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) { + ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj << + ", probably misconfiguration" << dendl; + return -EIO; + } + + int r = open_pool_ctx(dpp, pool, *ioctx, false); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: unable to open data-pool=" << pool.to_str() << + " for obj=" << obj << " with error-code=" << r << dendl; + return r; + } + + ioctx->locator_set_key(key); + + return 0; +} + +int RGWRados::get_obj_head_ref(const DoutPrefixProvider *dpp, + const rgw_placement_rule& target_placement_rule, + const rgw_obj& obj, + rgw_rados_ref *ref) +{ + get_obj_bucket_and_oid_loc(obj, ref->obj.oid, ref->obj.loc); + + rgw_pool pool; + if (!get_obj_data_pool(target_placement_rule, obj, &pool)) { + ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl; + return -EIO; + } + + ref->pool = svc.rados->pool(pool); + + int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams() + .set_mostly_omap(false)); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed opening data pool (pool=" << pool << "); r=" << r << dendl; + return r; + } + + 
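+  // note: the locator set just below must match the one used everywhere
+  // else for this object; when a locator is present, RADOS hashes it
+  // (rather than the oid) to choose the placement group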
ref->pool.ioctx().locator_set_key(ref->obj.loc); + + return 0; +} + +int RGWRados::get_obj_head_ref(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + const rgw_obj& obj, + rgw_rados_ref *ref) +{ + return get_obj_head_ref(dpp, bucket_info.placement_rule, obj, ref); +} + +int RGWRados::get_raw_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref) +{ + ref->obj = obj; + + if (ref->obj.oid.empty()) { + ref->obj.oid = obj.pool.to_str(); + ref->obj.pool = svc.zone->get_zone_params().domain_root; + } + ref->pool = svc.rados->pool(obj.pool); + int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams() + .set_mostly_omap(false)); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed opening pool (pool=" << obj.pool << "); r=" << r << dendl; + return r; + } + + ref->pool.ioctx().locator_set_key(ref->obj.loc); + + return 0; +} + +int RGWRados::get_system_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref) +{ + return get_raw_obj_ref(dpp, obj, ref); +} + +/* + * fixes an issue where head objects were supposed to have a locator created, but ended + * up without one + */ +int RGWRados::fix_head_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key) +{ + const rgw_bucket& bucket = bucket_info.bucket; + string oid; + string locator; + + rgw_obj obj(bucket, key); + + get_obj_bucket_and_oid_loc(obj, oid, locator); + + if (locator.empty()) { + ldpp_dout(dpp, 20) << "object does not have a locator, nothing to fix" << dendl; + return 0; + } + + librados::IoCtx ioctx; + + int ret = get_obj_head_ioctx(dpp, bucket_info, obj, &ioctx); + if (ret < 0) { + cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl; + return ret; + } + ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */ + + uint64_t size; + bufferlist data; + + struct timespec mtime_ts; + map attrs; + librados::ObjectReadOperation op; + op.getxattrs(&attrs, NULL); + op.stat2(&size, &mtime_ts, NULL); +#define HEAD_SIZE 512 * 1024 + op.read(0, HEAD_SIZE, &data, NULL); + + ret = rgw_rados_operate(dpp, ioctx, oid, &op, &data, null_yield); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: rgw_rados_operate(oid=" << oid << ") returned ret=" << ret << dendl; + return ret; + } + + if (size > HEAD_SIZE) { + ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl; + return -EIO; + } + + if (size != data.length()) { + ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl; + return -EIO; + } + + if (copy_obj) { + librados::ObjectWriteOperation wop; + + wop.mtime2(&mtime_ts); + + map::iterator iter; + for (iter = attrs.begin(); iter != attrs.end(); ++iter) { + wop.setxattr(iter->first.c_str(), iter->second); + } + + wop.write(0, data); + + ioctx.locator_set_key(locator); + rgw_rados_operate(dpp, ioctx, oid, &wop, null_yield); + } + + if (remove_bad) { + ioctx.locator_set_key(string()); + + ret = ioctx.remove(oid); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to remove original bad object" << dendl; + return ret; + } + } + + return 0; +} + +int RGWRados::move_rados_obj(const DoutPrefixProvider *dpp, + librados::IoCtx& src_ioctx, + const string& src_oid, const string& src_locator, + librados::IoCtx& dst_ioctx, + const string& dst_oid, const string& dst_locator) +{ + +#define COPY_BUF_SIZE (4 * 1024 * 1024) + bool done = false; + uint64_t 
chunk_size = COPY_BUF_SIZE; + uint64_t ofs = 0; + int ret = 0; + real_time mtime; + struct timespec mtime_ts; + uint64_t size; + + if (src_oid == dst_oid && src_locator == dst_locator) { + return 0; + } + + src_ioctx.locator_set_key(src_locator); + dst_ioctx.locator_set_key(dst_locator); + + do { + bufferlist data; + ObjectReadOperation rop; + ObjectWriteOperation wop; + + if (ofs == 0) { + rop.stat2(&size, &mtime_ts, NULL); + mtime = real_clock::from_timespec(mtime_ts); + } + rop.read(ofs, chunk_size, &data, NULL); + ret = rgw_rados_operate(dpp, src_ioctx, src_oid, &rop, &data, null_yield); + if (ret < 0) { + goto done_err; + } + + if (data.length() == 0) { + break; + } + + if (ofs == 0) { + wop.create(true); /* make it exclusive */ + wop.mtime2(&mtime_ts); + mtime = real_clock::from_timespec(mtime_ts); + } + wop.write(ofs, data); + ret = rgw_rados_operate(dpp, dst_ioctx, dst_oid, &wop, null_yield); + if (ret < 0) { + goto done_err; + } + ofs += data.length(); + done = data.length() != chunk_size; + } while (!done); + + if (ofs != size) { + ldpp_dout(dpp, -1) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid + << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl; + ret = -EIO; + goto done_err; + } + + src_ioctx.remove(src_oid); + + return 0; + +done_err: + // TODO: clean up dst_oid if we created it + ldpp_dout(dpp, -1) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl; + return ret; +} + +/* + * fixes an issue where head objects were supposed to have a locator created, but ended + * up without one + */ +int RGWRados::fix_tail_obj_locator(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, rgw_obj_key& key, + bool fix, bool *need_fix, optional_yield y) +{ + std::unique_ptr bucket; + driver->get_bucket(nullptr, bucket_info, &bucket); + std::unique_ptr obj = bucket->get_object(key); + + if (need_fix) { + *need_fix = false; + } + + rgw_rados_ref ref; + int r = get_obj_head_ref(dpp, bucket_info, obj->get_obj(), &ref); + if (r < 0) { + return r; + } + + RGWObjState *astate = nullptr; + RGWObjManifest* manifest = nullptr; + RGWObjectCtx rctx(this->driver); + r = get_obj_state(dpp, &rctx, bucket_info, obj.get(), &astate, &manifest, false, y); + if (r < 0) + return r; + + if (manifest) { + RGWObjManifest::obj_iterator miter; + for (miter = manifest->obj_begin(dpp); miter != manifest->obj_end(dpp); ++miter) { + rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(driver); + rgw_obj loc; + string oid; + string locator; + + RGWSI_Tier_RADOS::raw_obj_to_obj(manifest->get_tail_placement().bucket, raw_loc, &loc); + + if (loc.key.ns.empty()) { + /* continue, we're only interested in tail objects */ + continue; + } + + auto& ioctx = ref.pool.ioctx(); + + get_obj_bucket_and_oid_loc(loc, oid, locator); + ref.pool.ioctx().locator_set_key(locator); + + ldpp_dout(dpp, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl; + + r = ioctx.stat(oid, NULL, NULL); + if (r != -ENOENT) { + continue; + } + + string bad_loc; + prepend_bucket_marker(bucket->get_key(), loc.key.name, bad_loc); + + /* create a new ioctx with the bad locator */ + librados::IoCtx src_ioctx; + src_ioctx.dup(ioctx); + src_ioctx.locator_set_key(bad_loc); + + r = src_ioctx.stat(oid, NULL, NULL); + if (r != 0) { + /* cannot find a broken part */ + continue; + } + ldpp_dout(dpp, 20) << __func__ << ": found bad object part: " << loc << dendl; + if (need_fix) { + *need_fix = true; + } + if (fix) { + r = move_rados_obj(dpp, src_ioctx, oid, 
bad_loc, ioctx, oid, locator); + if (r < 0) { + ldpp_dout(dpp, -1) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl; + } + } + } + } + + return 0; +} + +int RGWRados::BucketShard::init(const rgw_bucket& _bucket, + const rgw_obj& obj, + RGWBucketInfo* bucket_info_out, + const DoutPrefixProvider *dpp) +{ + bucket = _bucket; + + RGWBucketInfo bucket_info; + RGWBucketInfo* bucket_info_p = + bucket_info_out ? bucket_info_out : &bucket_info; + + int ret = store->get_bucket_instance_info(bucket, *bucket_info_p, NULL, NULL, null_yield, dpp); + if (ret < 0) { + return ret; + } + + string oid; + + ret = store->svc.bi_rados->open_bucket_index_shard(dpp, *bucket_info_p, obj.get_hash_object(), &bucket_obj, &shard_id); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl; + return ret; + } + ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj.get_raw_obj() << dendl; + + return 0; +} + +int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, + const rgw_obj& obj) +{ + bucket = bucket_info.bucket; + + int ret = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info, + obj.get_hash_object(), + &bucket_obj, + &shard_id); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl; + return ret; + } + ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl; + + return 0; +} + +int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& index, + int sid) +{ + bucket = bucket_info.bucket; + shard_id = sid; + + int ret = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info, shard_id, + num_shards(index), index.gen, + &bucket_obj); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl; + return ret; + } + ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl; + + return 0; +} + + +/* Execute @handler on last item in bucket listing for bucket specified + * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing + * to objects matching these criterias. */ +int RGWRados::on_last_entry_in_listing(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + const std::string& obj_prefix, + const std::string& obj_delim, + std::function handler) +{ + RGWRados::Bucket target(this, bucket_info); + RGWRados::Bucket::List list_op(&target); + + list_op.params.prefix = obj_prefix; + list_op.params.delim = obj_delim; + + ldpp_dout(dpp, 20) << "iterating listing for bucket=" << bucket_info.bucket.name + << ", obj_prefix=" << obj_prefix + << ", obj_delim=" << obj_delim + << dendl; + + bool is_truncated = false; + + boost::optional last_entry; + /* We need to rewind to the last object in a listing. */ + do { + /* List bucket entries in chunks. */ + static constexpr int MAX_LIST_OBJS = 100; + std::vector entries(MAX_LIST_OBJS); + + int ret = list_op.list_objects(dpp, MAX_LIST_OBJS, &entries, nullptr, + &is_truncated, null_yield); + if (ret < 0) { + return ret; + } else if (!entries.empty()) { + last_entry = entries.back(); + } + } while (is_truncated); + + if (last_entry) { + return handler(*last_entry); + } + + /* Empty listing - no items we can run handler on. 
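+   *
+   * For reference, a minimal caller sketch (the prefix/delimiter values
+   * are hypothetical; cf. the matching handler lambda defined in
+   * swift_versioning_restore() below):
+   *
+   *   int r = on_last_entry_in_listing(dpp, bucket_info, "photos/", "/",
+   *       [&](const rgw_bucket_dir_entry& entry) -> int {
+   *         ldpp_dout(dpp, 20) << "last entry: " << entry.key << dendl;
+   *         return 0;
+   *       });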
*/ + return 0; +} + +bool RGWRados::swift_versioning_enabled(rgw::sal::Bucket* bucket) const +{ + return bucket->get_info().has_swift_versioning() && + bucket->get_info().swift_ver_location.size(); +} + +int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx, + const rgw_user& user, + rgw::sal::Bucket* bucket, + rgw::sal::Object* obj, + const DoutPrefixProvider *dpp, + optional_yield y) +{ + if (! swift_versioning_enabled(bucket)) { + return 0; + } + + obj->set_atomic(); + + RGWObjState * state = nullptr; + RGWObjManifest *manifest = nullptr; + int r = get_obj_state(dpp, &obj_ctx, bucket->get_info(), obj, &state, &manifest, false, y); + if (r < 0) { + return r; + } + + if (!state->exists) { + return 0; + } + + const string& src_name = obj->get_oid(); + char buf[src_name.size() + 32]; + struct timespec ts = ceph::real_clock::to_timespec(state->mtime); + snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(), + src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000); + + RGWBucketInfo dest_bucket_info; + + r = get_bucket_info(&svc, bucket->get_tenant(), bucket->get_info().swift_ver_location, dest_bucket_info, NULL, null_yield, NULL); + if (r < 0) { + ldpp_dout(dpp, 10) << "failed to read dest bucket info: r=" << r << dendl; + if (r == -ENOENT) { + return -ERR_PRECONDITION_FAILED; + } + return r; + } + + if (dest_bucket_info.owner != bucket->get_info().owner) { + return -ERR_PRECONDITION_FAILED; + } + + rgw::sal::RadosBucket dest_bucket(driver, dest_bucket_info); + rgw::sal::RadosObject dest_obj(driver, rgw_obj_key(buf), &dest_bucket); + + if (dest_bucket_info.versioning_enabled()){ + dest_obj.gen_rand_obj_instance_name(); + } + + dest_obj.set_atomic(); + + rgw_zone_id no_zone; + + r = copy_obj(obj_ctx, + user, + NULL, /* req_info *info */ + no_zone, + &dest_obj, + obj, + &dest_bucket, + bucket, + bucket->get_placement_rule(), + NULL, /* time_t *src_mtime */ + NULL, /* time_t *mtime */ + NULL, /* const time_t *mod_ptr */ + NULL, /* const time_t *unmod_ptr */ + false, /* bool high_precision_time */ + NULL, /* const char *if_match */ + NULL, /* const char *if_nomatch */ + RGWRados::ATTRSMOD_NONE, + true, /* bool copy_if_newer */ + state->attrset, + RGWObjCategory::Main, + 0, /* uint64_t olh_epoch */ + real_time(), /* time_t delete_at */ + NULL, /* string *version_id */ + NULL, /* string *ptag */ + NULL, /* string *petag */ + NULL, /* void (*progress_cb)(off_t, void *) */ + NULL, /* void *progress_data */ + dpp, + null_yield); + if (r == -ECANCELED || r == -ENOENT) { + /* Has already been overwritten, meaning another rgw process already + * copied it out */ + return 0; + } + + return r; +} + +int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx, + const rgw_user& user, + rgw::sal::Bucket* bucket, + rgw::sal::Object* obj, + bool& restored, /* out */ + const DoutPrefixProvider *dpp) +{ + if (! swift_versioning_enabled(bucket)) { + return 0; + } + + /* Bucket info of the bucket that stores previous versions of our object. */ + RGWBucketInfo archive_binfo; + + int ret = get_bucket_info(&svc, bucket->get_tenant(), + bucket->get_info().swift_ver_location, + archive_binfo, nullptr, null_yield, nullptr); + if (ret < 0) { + return ret; + } + + /* Abort the operation if the bucket storing our archive belongs to someone + * else. This is a limitation in comparison to Swift as we aren't taking ACLs + * into consideration. For we can live with that. + * + * TODO: delegate this check to un upper layer and compare with ACLs. 
*/ + if (bucket->get_info().owner != archive_binfo.owner) { + return -EPERM; + } + + /* This code will be executed on latest version of the object. */ + const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int { + rgw_zone_id no_zone; + + /* We don't support object versioning of Swift API on those buckets that + * are already versioned using the S3 mechanism. This affects also bucket + * storing archived objects. Otherwise the delete operation would create + * a deletion marker. */ + if (archive_binfo.versioned()) { + restored = false; + return -ERR_PRECONDITION_FAILED; + } + + /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly + * irrelevant and may be safely skipped. */ + std::map no_attrs; + + rgw::sal::RadosBucket archive_bucket(driver, archive_binfo); + rgw::sal::RadosObject archive_obj(driver, entry.key, &archive_bucket); + + if (bucket->versioning_enabled()){ + obj->gen_rand_obj_instance_name(); + } + + archive_obj.set_atomic(); + obj->set_atomic(); + + int ret = copy_obj(obj_ctx, + user, + nullptr, /* req_info *info */ + no_zone, + obj, /* dest obj */ + &archive_obj, /* src obj */ + bucket, /* dest bucket info */ + &archive_bucket, /* src bucket info */ + bucket->get_placement_rule(), /* placement_rule */ + nullptr, /* time_t *src_mtime */ + nullptr, /* time_t *mtime */ + nullptr, /* const time_t *mod_ptr */ + nullptr, /* const time_t *unmod_ptr */ + false, /* bool high_precision_time */ + nullptr, /* const char *if_match */ + nullptr, /* const char *if_nomatch */ + RGWRados::ATTRSMOD_NONE, + true, /* bool copy_if_newer */ + no_attrs, + RGWObjCategory::Main, + 0, /* uint64_t olh_epoch */ + real_time(), /* time_t delete_at */ + nullptr, /* string *version_id */ + nullptr, /* string *ptag */ + nullptr, /* string *petag */ + nullptr, /* void (*progress_cb)(off_t, void *) */ + nullptr, /* void *progress_data */ + dpp, + null_yield); + if (ret == -ECANCELED || ret == -ENOENT) { + /* Has already been overwritten, meaning another rgw process already + * copied it out */ + return 0; + } else if (ret < 0) { + return ret; + } else { + restored = true; + } + + /* Need to remove the archived copy. 
*/ + ret = delete_obj(dpp, archive_binfo, &archive_obj, + archive_binfo.versioning_status()); + + return ret; + }; + + const std::string& obj_name = obj->get_oid(); + const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size() + % obj_name); + + return on_last_entry_in_listing(dpp, archive_binfo, prefix, std::string(), + handler); +} + +int RGWRados::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp, + uint64_t size, uint64_t accounted_size, + map& attrs, + bool assume_noent, bool modify_tail, + void *_index_op, optional_yield y) +{ + RGWRados::Bucket::UpdateIndex *index_op = static_cast(_index_op); + RGWRados *store = target->get_store(); + + ObjectWriteOperation op; +#ifdef WITH_LTTNG + const req_state* s = get_req_state(); + string req_id; + if (!s) { + // fake req_id + req_id = store->svc.zone_utils->unique_id(store->driver->get_new_req_id()); + } else { + req_id = s->req_id; + } +#endif + + RGWObjState *state; + RGWObjManifest *manifest = nullptr; + int r = target->get_state(dpp, &state, &manifest, false, y, assume_noent); + if (r < 0) + return r; + + rgw_obj obj = target->get_obj(); + + if (obj.get_oid().empty()) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl; + return -EIO; + } + + rgw_rados_ref ref; + r = store->get_obj_head_ref(dpp, target->get_meta_placement_rule(), obj, &ref); + if (r < 0) + return r; + + bool is_olh = state->is_olh; + + bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0; + + const string *ptag = meta.ptag; + if (!ptag && !index_op->get_optag()->empty()) { + ptag = index_op->get_optag(); + } + r = target->prepare_atomic_modification(dpp, op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail, y); + if (r < 0) + return r; + + if (real_clock::is_zero(meta.set_mtime)) { + meta.set_mtime = real_clock::now(); + } + + if (target->get_bucket_info().obj_lock_enabled() && target->get_bucket_info().obj_lock.has_rule() && meta.flags == PUT_OBJ_CREATE) { + auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION); + if (iter == attrs.end()) { + real_time lock_until_date = target->get_bucket_info().obj_lock.get_lock_until_date(meta.set_mtime); + string mode = target->get_bucket_info().obj_lock.get_mode(); + RGWObjectRetention obj_retention(mode, lock_until_date); + bufferlist bl; + obj_retention.encode(bl); + op.setxattr(RGW_ATTR_OBJECT_RETENTION, bl); + } + } + + if (state->is_olh) { + op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag); + } + + struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime); + op.mtime2(&mtime_ts); + + if (meta.data) { + /* if we want to overwrite the data, we also want to overwrite the + xattrs, so just remove the object */ + op.write_full(*meta.data); + if (state->compressed) { + uint32_t alloc_hint_flags = librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE; + op.set_alloc_hint2(0, 0, alloc_hint_flags); + } + } + + string etag; + string content_type; + bufferlist acl_bl; + string storage_class; + + map::iterator iter; + if (meta.rmattrs) { + for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) { + const string& name = iter->first; + op.rmxattr(name.c_str()); + } + } + + if (meta.manifest) { + storage_class = meta.manifest->get_tail_placement().placement_rule.storage_class; + + /* remove existing manifest attr */ + iter = attrs.find(RGW_ATTR_MANIFEST); + if (iter != attrs.end()) + attrs.erase(iter); + + bufferlist bl; + encode(*meta.manifest, bl); + op.setxattr(RGW_ATTR_MANIFEST, bl); + } + + for (iter = attrs.begin(); iter != attrs.end(); 
++iter) { + const string& name = iter->first; + bufferlist& bl = iter->second; + + if (!bl.length()) + continue; + + op.setxattr(name.c_str(), bl); + + if (name.compare(RGW_ATTR_ETAG) == 0) { + etag = rgw_bl_str(bl); + } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) { + content_type = rgw_bl_str(bl); + } else if (name.compare(RGW_ATTR_ACL) == 0) { + acl_bl = bl; + } + } + if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) { + cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER); + } + + if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) { + bufferlist bl; + encode(store->svc.zone->get_zone_short_id(), bl); + op.setxattr(RGW_ATTR_SOURCE_ZONE, bl); + } + + if (!storage_class.empty()) { + bufferlist bl; + bl.append(storage_class); + op.setxattr(RGW_ATTR_STORAGE_CLASS, bl); + } + + if (!op.size()) + return 0; + + uint64_t epoch; + int64_t poolid; + bool orig_exists; + uint64_t orig_size; + + if (!reset_obj) { //Multipart upload, it has immutable head. + orig_exists = false; + orig_size = 0; + } else { + orig_exists = state->exists; + orig_size = state->accounted_size; + } + + bool versioned_target = (meta.olh_epoch && *meta.olh_epoch > 0) || + !obj.key.instance.empty(); + + bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target); + + if (versioned_op) { + index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP); + } + + if (!index_op->is_prepared()) { + tracepoint(rgw_rados, prepare_enter, req_id.c_str()); + r = index_op->prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y); + tracepoint(rgw_rados, prepare_exit, req_id.c_str()); + if (r < 0) + return r; + } + + auto& ioctx = ref.pool.ioctx(); + + tracepoint(rgw_rados, operate_enter, req_id.c_str()); + r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); + tracepoint(rgw_rados, operate_exit, req_id.c_str()); + if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under, + or -ENOENT if was removed, or -EEXIST if it did not exist + before and now it does */ + if (r == -EEXIST && assume_noent) { + target->invalidate_state(); + return r; + } + goto done_cancel; + } + + epoch = ioctx.get_last_version(); + poolid = ioctx.get_id(); + + r = target->complete_atomic_modification(dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl; + } + + tracepoint(rgw_rados, complete_enter, req_id.c_str()); + r = index_op->complete(dpp, poolid, epoch, size, accounted_size, + meta.set_mtime, etag, content_type, + storage_class, &acl_bl, + meta.category, meta.remove_objs, meta.user_data, meta.appendable); + tracepoint(rgw_rados, complete_exit, req_id.c_str()); + if (r < 0) + goto done_cancel; + + if (meta.mtime) { + *meta.mtime = meta.set_mtime; + } + + /* note that index_op was using state so we couldn't invalidate it earlier */ + target->invalidate_state(); + state = NULL; + + if (versioned_op && meta.olh_epoch) { + r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), target->get_target(), false, NULL, *meta.olh_epoch, real_time(), false, y, meta.zones_trace); + if (r < 0) { + return r; + } + } + + if (!real_clock::is_zero(meta.delete_at)) { + rgw_obj_index_key obj_key; + obj.key.get_index_key(&obj_key); + + r = store->obj_expirer->hint_add(dpp, meta.delete_at, obj.bucket.tenant, obj.bucket.name, + obj.bucket.bucket_id, obj_key); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl; + /* ignoring error, nothing we can do at this point */ + } + } + meta.canceled = 
false; + + /* update quota cache */ + if (meta.completeMultipart){ + store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1), + 0, orig_size); + } + else { + store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1), + accounted_size, orig_size); + } + return 0; + +done_cancel: + int ret = index_op->cancel(dpp, meta.remove_objs); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl; + } + + meta.canceled = true; + + /* we lost in a race. There are a few options: + * - existing object was rewritten (ECANCELED) + * - non existing object was created (EEXIST) + * - object was removed (ENOENT) + * should treat it as a success + */ + if (meta.if_match == NULL && meta.if_nomatch == NULL) { + if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) { + r = 0; + } + } else { + if (meta.if_match != NULL) { + // only overwrite existing object + if (strcmp(meta.if_match, "*") == 0) { + if (r == -ENOENT) { + r = -ERR_PRECONDITION_FAILED; + } else if (r == -ECANCELED) { + r = 0; + } + } + } + + if (meta.if_nomatch != NULL) { + // only create a new object + if (strcmp(meta.if_nomatch, "*") == 0) { + if (r == -EEXIST) { + r = -ERR_PRECONDITION_FAILED; + } else if (r == -ENOENT) { + r = 0; + } + } + } + } + + return r; +} + +int RGWRados::Object::Write::write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size, + map& attrs, optional_yield y) +{ + RGWBucketInfo& bucket_info = target->get_bucket_info(); + + RGWRados::Bucket bop(target->get_store(), bucket_info); + RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj()); + index_op.set_zones_trace(meta.zones_trace); + + bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL); + int r; + if (assume_noent) { + r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y); + if (r == -EEXIST) { + assume_noent = false; + } + } + if (!assume_noent) { + r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y); + } + return r; +} + +class RGWRadosPutObj : public RGWHTTPStreamRWRequest::ReceiveCB +{ + const DoutPrefixProvider *dpp; + CephContext* cct; + rgw_obj obj; + rgw::sal::DataProcessor *filter; + boost::optional& compressor; + bool try_etag_verify; + rgw::putobj::etag_verifier_ptr etag_verifier; + boost::optional buffering; + CompressorRef& plugin; + rgw::sal::ObjectProcessor *processor; + void (*progress_cb)(off_t, void *); + void *progress_data; + bufferlist extra_data_bl, manifest_bl; + std::optional compression_info; + uint64_t extra_data_left{0}; + bool need_to_process_attrs{true}; + uint64_t data_len{0}; + map src_attrs; + uint64_t ofs{0}; + uint64_t lofs{0}; /* logical ofs */ + std::function&)> attrs_handler; + +public: + RGWRadosPutObj(const DoutPrefixProvider *dpp, + CephContext* cct, + CompressorRef& plugin, + boost::optional& compressor, + rgw::sal::ObjectProcessor *p, + void (*_progress_cb)(off_t, void *), + void *_progress_data, + std::function&)> _attrs_handler) : + dpp(dpp), + cct(cct), + filter(p), + compressor(compressor), + try_etag_verify(cct->_conf->rgw_sync_obj_etag_verify), + plugin(plugin), + processor(p), + progress_cb(_progress_cb), + progress_data(_progress_data), + attrs_handler(_attrs_handler) {} + + + int process_attrs(void) { + if (extra_data_bl.length()) { + JSONParser jp; + if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) { + ldpp_dout(dpp, 0) << "failed to parse response extra data. 
len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl; + return -EIO; + } + + JSONDecoder::decode_json("attrs", src_attrs, &jp); + + auto iter = src_attrs.find(RGW_ATTR_COMPRESSION); + if (iter != src_attrs.end()) { + const bufferlist bl = std::move(iter->second); + src_attrs.erase(iter); // don't preserve source compression info + + if (try_etag_verify) { + // if we're trying to verify etags, we need to convert compressed + // ranges in the manifest back into logical multipart part offsets + RGWCompressionInfo info; + bool compressed = false; + int r = rgw_compression_info_from_attr(bl, compressed, info); + if (r < 0) { + ldpp_dout(dpp, 4) << "failed to decode compression info, " + "disabling etag verification" << dendl; + try_etag_verify = false; + } else if (compressed) { + compression_info = std::move(info); + } + } + } + /* We need the manifest to recompute the ETag for verification */ + iter = src_attrs.find(RGW_ATTR_MANIFEST); + if (iter != src_attrs.end()) { + manifest_bl = std::move(iter->second); + src_attrs.erase(iter); + } + + // filter out olh attributes + iter = src_attrs.lower_bound(RGW_ATTR_OLH_PREFIX); + while (iter != src_attrs.end()) { + if (!boost::algorithm::starts_with(iter->first, RGW_ATTR_OLH_PREFIX)) { + break; + } + iter = src_attrs.erase(iter); + } + } + + int ret = attrs_handler(src_attrs); + if (ret < 0) { + return ret; + } + + if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) { + //do not compress if object is encrypted + compressor = boost::in_place(cct, plugin, filter); + // add a filter that buffers data so we don't try to compress tiny blocks. + // libcurl reads in 16k at a time, and we need at least 64k to get a good + // compression ratio + constexpr unsigned buffer_size = 512 * 1024; + buffering = boost::in_place(&*compressor, buffer_size); + filter = &*buffering; + } + + /* + * Presently we don't support ETag based verification if encryption is + * requested. We can enable simultaneous support once we have a mechanism + * to know the sequence in which the filters must be applied. + */ + if (try_etag_verify && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) { + ret = rgw::putobj::create_etag_verifier(dpp, cct, filter, manifest_bl, + compression_info, + etag_verifier); + if (ret < 0) { + ldpp_dout(dpp, 4) << "failed to initial etag verifier, " + "disabling etag verification" << dendl; + } else { + filter = etag_verifier.get(); + } + } + + need_to_process_attrs = false; + + return 0; + } + + int handle_data(bufferlist& bl, bool *pause) override { + if (progress_cb) { + progress_cb(data_len, progress_data); + } + if (extra_data_left) { + uint64_t extra_len = bl.length(); + if (extra_len > extra_data_left) + extra_len = extra_data_left; + + bufferlist extra; + bl.splice(0, extra_len, &extra); + extra_data_bl.append(extra); + + extra_data_left -= extra_len; + if (extra_data_left == 0) { + int res = process_attrs(); + if (res < 0) + return res; + } + ofs += extra_len; + if (bl.length() == 0) { + return 0; + } + } + if (need_to_process_attrs) { + /* need to call process_attrs() even if we don't get any attrs, + * need it to call attrs_handler(). 
+ */ + int res = process_attrs(); + if (res < 0) { + return res; + } + } + + ceph_assert(uint64_t(ofs) >= extra_data_len); + + uint64_t size = bl.length(); + ofs += size; + + const uint64_t lofs = data_len; + data_len += size; + + return filter->process(std::move(bl), lofs); + } + + int flush() { + return filter->process({}, data_len); + } + + bufferlist& get_extra_data() { return extra_data_bl; } + + map& get_attrs() { return src_attrs; } + + void set_extra_data_len(uint64_t len) override { + extra_data_left = len; + RGWHTTPStreamRWRequest::ReceiveCB::set_extra_data_len(len); + } + + uint64_t get_data_len() { + return data_len; + } + + std::string get_verifier_etag() { + if (etag_verifier) { + etag_verifier->calculate_etag(); + return etag_verifier->get_calculated_etag(); + } else { + return ""; + } + } +}; + +/* + * prepare attrset depending on attrs_mod. + */ +static void set_copy_attrs(map& src_attrs, + map& attrs, + RGWRados::AttrsMod attrs_mod) +{ + switch (attrs_mod) { + case RGWRados::ATTRSMOD_NONE: + attrs = src_attrs; + break; + case RGWRados::ATTRSMOD_REPLACE: + if (!attrs[RGW_ATTR_ETAG].length()) { + attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG]; + } + if (!attrs[RGW_ATTR_TAIL_TAG].length()) { + auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG); + if (ttiter != src_attrs.end()) { + attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG]; + } + } + break; + case RGWRados::ATTRSMOD_MERGE: + for (map::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) { + if (attrs.find(it->first) == attrs.end()) { + attrs[it->first] = it->second; + } + } + break; + } +} + +int RGWRados::rewrite_obj(rgw::sal::Object* obj, const DoutPrefixProvider *dpp, optional_yield y) +{ + RGWObjectCtx rctx(this->driver); + rgw::sal::Attrs attrset; + uint64_t obj_size; + ceph::real_time mtime; + RGWRados::Object op_target(this, obj->get_bucket(), rctx, obj); + RGWRados::Object::Read read_op(&op_target); + + read_op.params.attrs = &attrset; + read_op.params.obj_size = &obj_size; + read_op.params.lastmod = &mtime; + + int ret = read_op.prepare(y, dpp); + if (ret < 0) + return ret; + + attrset.erase(RGW_ATTR_ID_TAG); + attrset.erase(RGW_ATTR_TAIL_TAG); + attrset.erase(RGW_ATTR_STORAGE_CLASS); + + return this->copy_obj_data(rctx, obj->get_bucket(), + obj->get_bucket()->get_info().placement_rule, + read_op, obj_size - 1, obj, NULL, mtime, + attrset, 0, real_time(), NULL, dpp, y); +} + +struct obj_time_weight { + real_time mtime; + uint32_t zone_short_id; + uint64_t pg_ver; + bool high_precision; + + obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {} + + bool compare_low_precision(const obj_time_weight& rhs) { + struct timespec l = ceph::real_clock::to_timespec(mtime); + struct timespec r = ceph::real_clock::to_timespec(rhs.mtime); + l.tv_nsec = 0; + r.tv_nsec = 0; + if (l > r) { + return false; + } + if (l < r) { + return true; + } + if (!zone_short_id || !rhs.zone_short_id) { + /* don't compare zone ids, if one wasn't provided */ + return false; + } + if (zone_short_id != rhs.zone_short_id) { + return (zone_short_id < rhs.zone_short_id); + } + return (pg_ver < rhs.pg_ver); + + } + + bool operator<(const obj_time_weight& rhs) { + if (!high_precision || !rhs.high_precision) { + return compare_low_precision(rhs); + } + if (mtime > rhs.mtime) { + return false; + } + if (mtime < rhs.mtime) { + return true; + } + if (!zone_short_id || !rhs.zone_short_id) { + /* don't compare zone ids, if one wasn't provided */ + return false; + } + if (zone_short_id != rhs.zone_short_id) { + return 
(zone_short_id < rhs.zone_short_id); + } + return (pg_ver < rhs.pg_ver); + } + + void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) { + mtime = _mtime; + zone_short_id = _short_id; + pg_ver = _pg_ver; + } + + void init(RGWObjState *state) { + mtime = state->mtime; + zone_short_id = state->zone_short_id; + pg_ver = state->pg_ver; + } +}; + +inline ostream& operator<<(ostream& out, const obj_time_weight &o) { + out << o.mtime; + + if (o.zone_short_id != 0 || o.pg_ver != 0) { + out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]"; + } + + return out; +} + +class RGWGetExtraDataCB : public RGWHTTPStreamRWRequest::ReceiveCB { + bufferlist extra_data; +public: + RGWGetExtraDataCB() {} + int handle_data(bufferlist& bl, bool *pause) override { + int bl_len = (int)bl.length(); + if (extra_data.length() < extra_data_len) { + off_t max = extra_data_len - extra_data.length(); + if (max > bl_len) { + max = bl_len; + } + bl.splice(0, max, &extra_data); + } + return bl_len; + } + + bufferlist& get_extra_data() { + return extra_data; + } +}; + +int RGWRados::stat_remote_obj(const DoutPrefixProvider *dpp, + RGWObjectCtx& obj_ctx, + const rgw_user& user_id, + req_info *info, + const rgw_zone_id& source_zone, + rgw::sal::Object* src_obj, + const RGWBucketInfo *src_bucket_info, + real_time *src_mtime, + uint64_t *psize, + const real_time *mod_ptr, + const real_time *unmod_ptr, + bool high_precision_time, + const char *if_match, + const char *if_nomatch, + map *pattrs, + map *pheaders, + string *version_id, + string *ptag, + string *petag) +{ + /* source is in a different zonegroup, copy from there */ + + RGWRESTStreamRWRequest *in_stream_req; + string tag; + map src_attrs; + append_rand_alpha(cct, tag, tag, 32); + obj_time_weight set_mtime_weight; + set_mtime_weight.high_precision = high_precision_time; + + RGWRESTConn *conn; + if (source_zone.empty()) { + if (!src_bucket_info || src_bucket_info->zonegroup.empty()) { + /* source is in the master zonegroup */ + conn = svc.zone->get_master_conn(); + } else { + auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map(); + map::iterator iter = zonegroup_conn_map.find(src_bucket_info->zonegroup); + if (iter == zonegroup_conn_map.end()) { + ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl; + return -ENOENT; + } + conn = iter->second; + } + } else { + auto& zone_conn_map = svc.zone->get_zone_conn_map(); + auto iter = zone_conn_map.find(source_zone); + if (iter == zone_conn_map.end()) { + ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl; + return -ENOENT; + } + conn = iter->second; + } + + RGWGetExtraDataCB cb; + map req_headers; + real_time set_mtime; + + const real_time *pmod = mod_ptr; + + obj_time_weight dest_mtime_weight; + + constexpr bool prepend_meta = true; + constexpr bool get_op = true; + constexpr bool rgwx_stat = true; + constexpr bool sync_manifest = true; + constexpr bool skip_decrypt = true; + int ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr, + dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver, + prepend_meta, get_op, rgwx_stat, + sync_manifest, skip_decrypt, + true, &cb, &in_stream_req); + if (ret < 0) { + return ret; + } + + ret = conn->complete_request(in_stream_req, nullptr, &set_mtime, psize, + nullptr, pheaders, null_yield); + if (ret < 0) { + return ret; + } + + bufferlist& extra_data_bl = cb.get_extra_data(); + if (extra_data_bl.length()) { + JSONParser jp; + if 
(!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) { + ldpp_dout(dpp, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl; + return -EIO; + } + + JSONDecoder::decode_json("attrs", src_attrs, &jp); + + src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout + } + + if (src_mtime) { + *src_mtime = set_mtime; + } + + if (petag) { + map::iterator iter = src_attrs.find(RGW_ATTR_ETAG); + if (iter != src_attrs.end()) { + bufferlist& etagbl = iter->second; + *petag = etagbl.to_str(); + while (petag->size() > 0 && (*petag)[petag->size() - 1] == '\0') { + *petag = petag->substr(0, petag->size() - 1); + } + } + } + + if (pattrs) { + *pattrs = std::move(src_attrs); + } + + return 0; +} + +int RGWFetchObjFilter_Default::filter(CephContext *cct, + const rgw_obj_key& source_key, + const RGWBucketInfo& dest_bucket_info, + std::optional dest_placement_rule, + const map& obj_attrs, + std::optional *poverride_owner, + const rgw_placement_rule **prule) +{ + const rgw_placement_rule *ptail_rule = (dest_placement_rule ? &(*dest_placement_rule) : nullptr); + if (!ptail_rule) { + auto iter = obj_attrs.find(RGW_ATTR_STORAGE_CLASS); + if (iter != obj_attrs.end()) { + dest_rule.storage_class = iter->second.to_str(); + dest_rule.inherit_from(dest_bucket_info.placement_rule); + ptail_rule = &dest_rule; + } else { + ptail_rule = &dest_bucket_info.placement_rule; + } + } + *prule = ptail_rule; + return 0; +} + +int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx, + const rgw_user& user_id, + req_info *info, + const rgw_zone_id& source_zone, + rgw::sal::Object* dest_obj, + rgw::sal::Object* src_obj, + rgw::sal::Bucket* dest_bucket, + rgw::sal::Bucket* src_bucket, + std::optional dest_placement_rule, + real_time *src_mtime, + real_time *mtime, + const real_time *mod_ptr, + const real_time *unmod_ptr, + bool high_precision_time, + const char *if_match, + const char *if_nomatch, + AttrsMod attrs_mod, + bool copy_if_newer, + rgw::sal::Attrs& attrs, + RGWObjCategory category, + std::optional olh_epoch, + real_time delete_at, + string *ptag, + string *petag, + void (*progress_cb)(off_t, void *), + void *progress_data, + const DoutPrefixProvider *dpp, + RGWFetchObjFilter *filter, + rgw_zone_set *zones_trace, + std::optional* bytes_transferred) +{ + /* source is in a different zonegroup, copy from there */ + + RGWRESTStreamRWRequest *in_stream_req; + string tag; + int i; + append_rand_alpha(cct, tag, tag, 32); + obj_time_weight set_mtime_weight; + set_mtime_weight.high_precision = high_precision_time; + int ret; + + rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size); + using namespace rgw::putobj; + AtomicObjectProcessor processor(&aio, this->driver, nullptr, user_id, + obj_ctx, dest_obj->clone(), olh_epoch, + tag, dpp, null_yield); + RGWRESTConn *conn; + auto& zone_conn_map = svc.zone->get_zone_conn_map(); + auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map(); + if (source_zone.empty()) { + if (!src_bucket || src_bucket->get_info().zonegroup.empty()) { + /* source is in the master zonegroup */ + conn = svc.zone->get_master_conn(); + } else { + map::iterator iter = zonegroup_conn_map.find(src_bucket->get_info().zonegroup); + if (iter == zonegroup_conn_map.end()) { + ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl; + return -ENOENT; + } + conn = iter->second; + } + } else { + auto iter = zone_conn_map.find(source_zone); + if (iter == 
zone_conn_map.end()) { + ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl; + return -ENOENT; + } + conn = iter->second; + } + + boost::optional compressor; + CompressorRef plugin; + + RGWFetchObjFilter_Default source_filter; + if (!filter) { + filter = &source_filter; + } + + std::optional override_owner; + + RGWRadosPutObj cb(dpp, cct, plugin, compressor, &processor, progress_cb, progress_data, + [&](map& obj_attrs) { + const rgw_placement_rule *ptail_rule; + + int ret = filter->filter(cct, + src_obj->get_key(), + dest_bucket->get_info(), + dest_placement_rule, + obj_attrs, + &override_owner, + &ptail_rule); + if (ret < 0) { + ldpp_dout(dpp, 5) << "Aborting fetch: source object filter returned ret=" << ret << dendl; + return ret; + } + + processor.set_tail_placement(*ptail_rule); + + const auto& compression_type = svc.zone->get_zone_params().get_compression_type(*ptail_rule); + if (compression_type != "none") { + plugin = Compressor::create(cct, compression_type); + if (!plugin) { + ldpp_dout(dpp, 1) << "Cannot load plugin for compression type " + << compression_type << dendl; + } + } + + ret = processor.prepare(null_yield); + if (ret < 0) { + return ret; + } + return 0; + }); + + string etag; + real_time set_mtime; + uint64_t expected_size = 0; + + RGWObjState *dest_state = NULL; + RGWObjManifest *manifest = nullptr; + + const real_time *pmod = mod_ptr; + + obj_time_weight dest_mtime_weight; + + if (copy_if_newer) { + /* need to get mtime for destination */ + ret = get_obj_state(dpp, &obj_ctx, dest_bucket->get_info(), dest_obj, &dest_state, &manifest, false, null_yield); + if (ret < 0) + goto set_err_state; + + if (!real_clock::is_zero(dest_state->mtime)) { + dest_mtime_weight.init(dest_state); + pmod = &dest_mtime_weight.mtime; + } + } + + static constexpr bool prepend_meta = true; + static constexpr bool get_op = true; + static constexpr bool rgwx_stat = false; + static constexpr bool sync_manifest = true; + static constexpr bool skip_decrypt = true; + ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr, + dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver, + prepend_meta, get_op, rgwx_stat, + sync_manifest, skip_decrypt, + true, + &cb, &in_stream_req); + if (ret < 0) { + goto set_err_state; + } + + ret = conn->complete_request(in_stream_req, &etag, &set_mtime, + &expected_size, nullptr, nullptr, null_yield); + if (ret < 0) { + goto set_err_state; + } + ret = cb.flush(); + if (ret < 0) { + goto set_err_state; + } + if (cb.get_data_len() != expected_size) { + ret = -EIO; + ldpp_dout(dpp, 0) << "ERROR: object truncated during fetching, expected " + << expected_size << " bytes but received " << cb.get_data_len() << dendl; + goto set_err_state; + } + if (compressor && compressor->is_compressed()) { + bufferlist tmp; + RGWCompressionInfo cs_info; + cs_info.compression_type = plugin->get_type_name(); + cs_info.orig_size = cb.get_data_len(); + cs_info.compressor_message = compressor->get_compressor_message(); + cs_info.blocks = move(compressor->get_compression_blocks()); + encode(cs_info, tmp); + cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp; + } + + if (override_owner) { + processor.set_owner(*override_owner); + + auto& obj_attrs = cb.get_attrs(); + + RGWUserInfo owner_info; + if (ctl.user->get_info_by_uid(dpp, *override_owner, &owner_info, null_yield) < 0) { + ldpp_dout(dpp, 10) << "owner info does not exist" << dendl; + return -EINVAL; + } + + RGWAccessControlPolicy acl; + + auto aiter = obj_attrs.find(RGW_ATTR_ACL); + if (aiter == 
obj_attrs.end()) { + ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): object doesn't have ACL attribute, setting default ACLs" << dendl; + acl.create_default(owner_info.user_id, owner_info.display_name); + } else { + auto iter = aiter->second.cbegin(); + try { + acl.decode(iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): could not decode policy, caught buffer::error" << dendl; + return -EIO; + } + } + + ACLOwner new_owner; + new_owner.set_id(*override_owner); + new_owner.set_name(owner_info.display_name); + + acl.set_owner(new_owner); + + bufferlist bl; + acl.encode(bl); + obj_attrs[RGW_ATTR_ACL] = std::move(bl); + } + + if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */ + cb.get_attrs().erase(RGW_ATTR_DELETE_AT); + } else { + map::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT); + if (iter != cb.get_attrs().end()) { + try { + decode(delete_at, iter->second); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl; + } + } + } + + if (src_mtime) { + *src_mtime = set_mtime; + } + + if (petag) { + const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG); + if (iter != cb.get_attrs().end()) { + *petag = iter->second.to_str(); + } + } + + //erase the append attr + cb.get_attrs().erase(RGW_ATTR_APPEND_PART_NUM); + + { // add x-amz-replication-status=REPLICA + auto& bl = cb.get_attrs()[RGW_ATTR_OBJ_REPLICATION_STATUS]; + bl.clear(); // overwrite source's status + bl.append("REPLICA"); + } + + if (source_zone.empty()) { + set_copy_attrs(cb.get_attrs(), attrs, attrs_mod); + } else { + attrs = cb.get_attrs(); + } + + if (copy_if_newer) { + uint64_t pg_ver = 0; + auto i = attrs.find(RGW_ATTR_PG_VER); + if (i != attrs.end() && i->second.length() > 0) { + auto iter = i->second.cbegin(); + try { + decode(pg_ver, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl; + /* non critical error */ + } + } + set_mtime_weight.init(set_mtime, svc.zone->get_zone_short_id(), pg_ver); + } + + /* Perform ETag verification is we have computed the object's MD5 sum at our end */ + if (const auto& verifier_etag = cb.get_verifier_etag(); + !verifier_etag.empty()) { + string trimmed_etag = etag; + + /* Remove the leading and trailing double quotes from etag */ + trimmed_etag.erase(std::remove(trimmed_etag.begin(), trimmed_etag.end(),'\"'), + trimmed_etag.end()); + + if (verifier_etag != trimmed_etag) { + ret = -EIO; + ldpp_dout(dpp, 0) << "ERROR: source and destination objects don't match. 
Expected etag:" + << trimmed_etag << " Computed etag:" << verifier_etag << dendl; + goto set_err_state; + } + } + +#define MAX_COMPLETE_RETRY 100 + for (i = 0; i < MAX_COMPLETE_RETRY; i++) { + bool canceled = false; + ret = processor.complete(cb.get_data_len(), etag, mtime, set_mtime, + attrs, delete_at, nullptr, nullptr, nullptr, + zones_trace, &canceled, null_yield); + if (ret < 0) { + goto set_err_state; + } + + if (copy_if_newer && canceled) { + ldpp_dout(dpp, 20) << "raced with another write of obj: " << dest_obj << dendl; + obj_ctx.invalidate(dest_obj->get_obj()); /* object was overwritten */ + ret = get_obj_state(dpp, &obj_ctx, dest_bucket->get_info(), dest_obj, &dest_state, &manifest, false, null_yield); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl; + goto set_err_state; + } + dest_mtime_weight.init(dest_state); + dest_mtime_weight.high_precision = high_precision_time; + if (!dest_state->exists || + dest_mtime_weight < set_mtime_weight) { + ldpp_dout(dpp, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl; + continue; + } else { + ldpp_dout(dpp, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl; + } + } + break; + } + + if (i == MAX_COMPLETE_RETRY) { + ldpp_dout(dpp, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl; + ret = -EIO; + goto set_err_state; + } + + if (bytes_transferred) { + *bytes_transferred = cb.get_data_len(); + } + return 0; +set_err_state: + if (copy_if_newer && ret == -ERR_NOT_MODIFIED) { + // we may have already fetched during sync of OP_ADD, but were waiting + // for OP_LINK_OLH to call set_olh() with a real olh_epoch + if (olh_epoch && *olh_epoch > 0) { + constexpr bool log_data_change = true; + ret = set_olh(dpp, obj_ctx, dest_bucket->get_info(), dest_obj, false, nullptr, + *olh_epoch, real_time(), false, null_yield, zones_trace, log_data_change); + } else { + // we already have the latest copy + ret = 0; + } + } + return ret; +} + + +int RGWRados::copy_obj_to_remote_dest(const DoutPrefixProvider *dpp, + RGWObjState *astate, + map& src_attrs, + RGWRados::Object::Read& read_op, + const rgw_user& user_id, + rgw::sal::Object* dest_obj, + real_time *mtime) +{ + string etag; + + RGWRESTStreamS3PutObj *out_stream_req; + + auto rest_master_conn = svc.zone->get_master_conn(); + + int ret = rest_master_conn->put_obj_async_init(dpp, user_id, dest_obj, src_attrs, &out_stream_req); + if (ret < 0) { + return ret; + } + + out_stream_req->set_send_length(astate->size); + + ret = RGWHTTP::send(out_stream_req); + if (ret < 0) { + delete out_stream_req; + return ret; + } + + ret = read_op.iterate(dpp, 0, astate->size - 1, out_stream_req->get_out_cb(), null_yield); + if (ret < 0) { + delete out_stream_req; + return ret; + } + + ret = rest_master_conn->complete_request(out_stream_req, etag, mtime, null_yield); + if (ret < 0) + return ret; + + return 0; +} + +/** + * Copy an object. 
+ * dest_obj: the object to copy into
+ * src_obj: the object to copy from
+ * attrs: usage depends on attrs_mod parameter
+ * attrs_mod: the modification mode of the attrs, may have the following values:
+ *   ATTRSMOD_NONE - the attributes of the source object will be
+ *     copied without modifications, attrs parameter is ignored;
+ *   ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
+ *     parameter, source object attributes are not copied;
+ *   ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
+ *     are overwritten by values contained in attrs parameter
+ *     (a worked example appears in a comment further below).
+ * err: stores any errors resulting from the get of the original object
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
+                       const rgw_user& user_id,
+                       req_info *info,
+                       const rgw_zone_id& source_zone,
+                       rgw::sal::Object* dest_obj,
+                       rgw::sal::Object* src_obj,
+                       rgw::sal::Bucket* dest_bucket,
+                       rgw::sal::Bucket* src_bucket,
+                       const rgw_placement_rule& dest_placement,
+                       real_time *src_mtime,
+                       real_time *mtime,
+                       const real_time *mod_ptr,
+                       const real_time *unmod_ptr,
+                       bool high_precision_time,
+                       const char *if_match,
+                       const char *if_nomatch,
+                       AttrsMod attrs_mod,
+                       bool copy_if_newer,
+                       rgw::sal::Attrs& attrs,
+                       RGWObjCategory category,
+                       uint64_t olh_epoch,
+                       real_time delete_at,
+                       string *version_id,
+                       string *ptag,
+                       string *petag,
+                       void (*progress_cb)(off_t, void *),
+                       void *progress_data,
+                       const DoutPrefixProvider *dpp,
+                       optional_yield y)
+{
+  int ret;
+  uint64_t obj_size;
+  rgw_obj shadow_obj = dest_obj->get_obj();
+  string shadow_oid;
+
+  bool remote_src;
+  bool remote_dest;
+
+  append_rand_alpha(cct, dest_obj->get_oid(), shadow_oid, 32);
+  shadow_obj.init_ns(dest_obj->get_bucket()->get_key(), shadow_oid, shadow_ns);
+
+  auto& zonegroup = svc.zone->get_zonegroup();
+
+  remote_dest = !zonegroup.equals(dest_bucket->get_info().zonegroup);
+  remote_src = !zonegroup.equals(src_bucket->get_info().zonegroup);
+
+  if (remote_src && remote_dest) {
+    ldpp_dout(dpp, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
+    return -EINVAL;
+  }
+
+  ldpp_dout(dpp, 5) << "Copy object " << src_obj->get_bucket() << ":" << src_obj->get_oid() << " => " << dest_obj->get_bucket() << ":" << dest_obj->get_oid() << dendl;
+
+  if (remote_src || !source_zone.empty()) {
+    return fetch_remote_obj(obj_ctx, user_id, info, source_zone,
+               dest_obj, src_obj, dest_bucket, src_bucket,
+               dest_placement, src_mtime, mtime, mod_ptr,
+               unmod_ptr, high_precision_time,
+               if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
+               olh_epoch, delete_at, ptag, petag, progress_cb, progress_data, dpp,
+               nullptr /* filter */);
+  }
+
+  map<string, bufferlist> src_attrs;
+  RGWRados::Object src_op_target(this, src_bucket, obj_ctx, src_obj);
+  RGWRados::Object::Read read_op(&src_op_target);
+
+  read_op.conds.mod_ptr = mod_ptr;
+  read_op.conds.unmod_ptr = unmod_ptr;
+  read_op.conds.high_precision_time = high_precision_time;
+  read_op.conds.if_match = if_match;
+  read_op.conds.if_nomatch = if_nomatch;
+  read_op.params.attrs = &src_attrs;
+  read_op.params.lastmod = src_mtime;
+  read_op.params.obj_size = &obj_size;
+
+  ret = read_op.prepare(y, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+  if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) {
+    // Current implementation does not follow S3 spec and even
+    // may result in data corruption silently when copying
+    // multipart objects across pools. So reject COPY operations
+    // on encrypted objects before it is fully functional.
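+    /* Worked example of the attrs_mod contract documented above
+     * (hypothetical attribute maps, shown only to illustrate what
+     * set_copy_attrs() does with them):
+     *
+     *   src_attrs = { ETAG: "abc", ACL: src_acl }
+     *   attrs     = { ACL: new_acl }
+     *
+     *   ATTRSMOD_NONE    -> dest = { ETAG: "abc", ACL: src_acl }  (attrs ignored)
+     *   ATTRSMOD_REPLACE -> dest = { ETAG: "abc", ACL: new_acl }  (ETag kept if attrs lacks one)
+     *   ATTRSMOD_MERGE   -> dest = { ETAG: "abc", ACL: new_acl }  (attrs wins on conflict)
+     */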
+ ldpp_dout(dpp, 0) << "ERROR: copy op for encrypted object " << src_obj + << " has not been implemented." << dendl; + return -ERR_NOT_IMPLEMENTED; + } + + src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL]; + src_attrs.erase(RGW_ATTR_DELETE_AT); + + src_attrs.erase(RGW_ATTR_OBJECT_RETENTION); + src_attrs.erase(RGW_ATTR_OBJECT_LEGAL_HOLD); + map::iterator rt = attrs.find(RGW_ATTR_OBJECT_RETENTION); + if (rt != attrs.end()) + src_attrs[RGW_ATTR_OBJECT_RETENTION] = rt->second; + map::iterator lh = attrs.find(RGW_ATTR_OBJECT_LEGAL_HOLD); + if (lh != attrs.end()) + src_attrs[RGW_ATTR_OBJECT_LEGAL_HOLD] = lh->second; + + set_copy_attrs(src_attrs, attrs, attrs_mod); + attrs.erase(RGW_ATTR_ID_TAG); + attrs.erase(RGW_ATTR_PG_VER); + attrs.erase(RGW_ATTR_SOURCE_ZONE); + map::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION); + if (cmp != src_attrs.end()) + attrs[RGW_ATTR_COMPRESSION] = cmp->second; + + RGWObjManifest manifest; + RGWObjState *astate = NULL; + RGWObjManifest *amanifest = nullptr; + + ret = get_obj_state(dpp, &obj_ctx, src_bucket->get_info(), src_obj, &astate, &amanifest, y); + if (ret < 0) { + return ret; + } + + vector ref_objs; + + if (remote_dest) { + /* dest is in a different zonegroup, copy it there */ + return copy_obj_to_remote_dest(dpp, astate, attrs, read_op, user_id, dest_obj, mtime); + } + uint64_t max_chunk_size; + + ret = get_max_chunk_size(dest_bucket->get_placement_rule(), dest_obj->get_obj(), &max_chunk_size, dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj->get_bucket() << dendl; + return ret; + } + + rgw_pool src_pool; + rgw_pool dest_pool; + + const rgw_placement_rule *src_rule{nullptr}; + + if (amanifest) { + src_rule = &amanifest->get_tail_placement().placement_rule; + ldpp_dout(dpp, 20) << __func__ << "(): manifest src_rule=" << src_rule->to_str() << dendl; + } + + if (!src_rule || src_rule->empty()) { + src_rule = &src_bucket->get_placement_rule(); + } + + if (!get_obj_data_pool(*src_rule, src_obj->get_obj(), &src_pool)) { + ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl; + return -EIO; + } + + if (!get_obj_data_pool(dest_placement, dest_obj->get_obj(), &dest_pool)) { + ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl; + return -EIO; + } + + ldpp_dout(dpp, 20) << __func__ << "(): src_rule=" << src_rule->to_str() << " src_pool=" << src_pool + << " dest_rule=" << dest_placement.to_str() << " dest_pool=" << dest_pool << dendl; + + bool copy_data = (!amanifest) || + (*src_rule != dest_placement) || + (src_pool != dest_pool); + + bool copy_first = false; + if (amanifest) { + if (!amanifest->has_tail()) { + copy_data = true; + } else { + uint64_t head_size = amanifest->get_head_size(); + + if (head_size > 0) { + if (head_size > max_chunk_size) { + copy_data = true; + } else { + copy_first = true; + } + } + } + } + + if (petag) { + const auto iter = attrs.find(RGW_ATTR_ETAG); + if (iter != attrs.end()) { + *petag = iter->second.to_str(); + } + } + + if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */ + attrs.erase(RGW_ATTR_TAIL_TAG); + return copy_obj_data(obj_ctx, dest_bucket, dest_placement, read_op, obj_size - 1, dest_obj, + mtime, real_time(), attrs, olh_epoch, delete_at, petag, dpp, y); + } + + /* This has been in for 2 years, so we can safely assume amanifest is not NULL */ + RGWObjManifest::obj_iterator miter = amanifest->obj_begin(dpp); + + if (copy_first) { // we need to copy first chunk, not increase 
refcount
+    ++miter;
+  }
+
+  bufferlist first_chunk;
+
+  const bool copy_itself = (dest_obj->get_obj() == src_obj->get_obj());
+  RGWObjManifest *pmanifest;
+  ldpp_dout(dpp, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
+
+  RGWRados::Object dest_op_target(this, dest_bucket, obj_ctx, dest_obj);
+  RGWRados::Object::Write write_op(&dest_op_target);
+
+  string tag;
+
+  if (ptag) {
+    tag = *ptag;
+  }
+
+  if (tag.empty()) {
+    append_rand_alpha(cct, tag, tag, 32);
+  }
+
+  std::unique_ptr<rgw::Aio> aio;
+  rgw::AioResultList all_results;
+  if (!copy_itself) {
+    aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, y);
+    attrs.erase(RGW_ATTR_TAIL_TAG);
+    manifest = *amanifest;
+    const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
+    if (tail_placement.bucket.name.empty()) {
+      manifest.set_tail_placement(tail_placement.placement_rule, src_obj->get_bucket()->get_key());
+    }
+    string ref_tag;
+    for (; miter != amanifest->obj_end(dpp); ++miter) {
+      ObjectWriteOperation op;
+      ref_tag = tag + '\0';
+      cls_refcount_get(op, ref_tag, true);
+
+      auto obj = svc.rados->obj(miter.get_location().get_raw_obj(driver));
+      ret = obj.open(dpp);
+      if (ret < 0) {
+        ldpp_dout(dpp, 0) << "failed to open rados context for " << obj << dendl;
+        goto done_ret;
+      }
+
+      static constexpr uint64_t cost = 1; // 1 throttle unit per request
+      static constexpr uint64_t id = 0; // ids unused
+      rgw::AioResultList completed = aio->get(obj, rgw::Aio::librados_op(std::move(op), y), cost, id);
+      ret = rgw::check_for_errors(completed);
+      all_results.splice(all_results.end(), completed);
+      if (ret < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to copy obj=" << obj << ", the error code = " << ret << dendl;
+        goto done_ret;
+      }
+    }
+
+    rgw::AioResultList completed = aio->drain();
+    ret = rgw::check_for_errors(completed);
+    all_results.splice(all_results.end(), completed);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to drain ios, the error code = " << ret << dendl;
+      goto done_ret;
+    }
+
+    pmanifest = &manifest;
+  } else {
+    pmanifest = amanifest;
+    /* don't send the object's tail for garbage collection */
+    astate->keep_tail = true;
+  }
+
+  if (copy_first) {
+    ret = read_op.read(0, max_chunk_size, first_chunk, y, dpp);
+    if (ret < 0) {
+      goto done_ret;
+    }
+
+    pmanifest->set_head(dest_bucket->get_placement_rule(), dest_obj->get_obj(), first_chunk.length());
+  } else {
+    pmanifest->set_head(dest_bucket->get_placement_rule(), dest_obj->get_obj(), 0);
+  }
+
+  write_op.meta.data = &first_chunk;
+  write_op.meta.manifest = pmanifest;
+  write_op.meta.ptag = &tag;
+  write_op.meta.owner = dest_bucket->get_info().owner;
+  write_op.meta.mtime = mtime;
+  write_op.meta.flags = PUT_OBJ_CREATE;
+  write_op.meta.category = category;
+  write_op.meta.olh_epoch = olh_epoch;
+  write_op.meta.delete_at = delete_at;
+  write_op.meta.modify_tail = !copy_itself;
+
+  ret = write_op.write_meta(dpp, obj_size, astate->accounted_size, attrs, y);
+  if (ret < 0) {
+    goto done_ret;
+  }
+
+  return 0;
+
+done_ret:
+  if (!copy_itself) {
+
+    /* wait all pending op done */
+    rgw::AioResultList completed = aio->drain();
+    all_results.splice(all_results.end(), completed);
+
+    /* rollback reference */
+    string ref_tag = tag + '\0';
+    int ret2 = 0;
+    for (auto& r : all_results) {
+      if (r.result < 0) {
+        continue; // skip errors
+      }
+      ObjectWriteOperation op;
+      cls_refcount_put(op, ref_tag, true);
+
+      static constexpr uint64_t cost = 1; // 1 throttle unit per request
+      static constexpr uint64_t id = 0; // ids unused
+      rgw::AioResultList completed = aio->get(r.obj, rgw::Aio::librados_op(std::move(op), y), cost, id);
+      ret2 = rgw::check_for_errors(completed);
+      if (ret2 < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << r.obj << dendl;
+      }
+    }
+    completed = aio->drain();
+    ret2 = rgw::check_for_errors(completed);
+    if (ret2 < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to drain rollback ios, the error code = " << ret2 << dendl;
+    }
+  }
+
+  return ret;
+}
+
+int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
+                            rgw::sal::Bucket* bucket,
+                            const rgw_placement_rule& dest_placement,
+                            RGWRados::Object::Read& read_op, off_t end,
+                            rgw::sal::Object* dest_obj,
+                            real_time *mtime,
+                            real_time set_mtime,
+                            rgw::sal::Attrs& attrs,
+                            uint64_t olh_epoch,
+                            real_time delete_at,
+                            string *petag,
+                            const DoutPrefixProvider *dpp,
+                            optional_yield y)
+{
+  string tag;
+  append_rand_alpha(cct, tag, tag, 32);
+
+  rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
+  using namespace rgw::putobj;
+  // do not change the null_yield in the initialization of this AtomicObjectProcessor
+  // it causes crashes in the ragweed tests
+  AtomicObjectProcessor processor(&aio, this->driver, &dest_placement,
+                                  bucket->get_info().owner, obj_ctx,
+                                  dest_obj->clone(), olh_epoch, tag,
+                                  dpp, null_yield);
+  int ret = processor.prepare(y);
+  if (ret < 0)
+    return ret;
+
+  off_t ofs = 0;
+
+  do {
+    bufferlist bl;
+    ret = read_op.read(ofs, end, bl, y, dpp);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to read object data, ret = " << ret << dendl;
+      return ret;
+    }
+
+    uint64_t read_len = ret;
+    ret = processor.process(std::move(bl), ofs);
+    if (ret < 0) {
+      return ret;
+    }
+
+    ofs += read_len;
+  } while (ofs <= end);
+
+  // flush
+  ret = processor.process({}, ofs);
+  if (ret < 0) {
+    return ret;
+  }
+
+  string etag;
+  auto iter = attrs.find(RGW_ATTR_ETAG);
+  if (iter != attrs.end()) {
+    bufferlist& bl = iter->second;
+    etag = bl.to_str();
+    if (petag) {
+      *petag = etag;
+    }
+  }
+
+  uint64_t accounted_size;
+  {
+    bool compressed{false};
+    RGWCompressionInfo cs_info;
+    ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to read compression info" << dendl;
+      return ret;
+    }
+    // pass original size if compressed
+    accounted_size = compressed ? cs_info.orig_size : ofs;
+  }
+
+  return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
+                            nullptr, nullptr, nullptr, nullptr, nullptr, y);
+}
+
+int RGWRados::transition_obj(RGWObjectCtx& obj_ctx,
+                             rgw::sal::Bucket* bucket,
+                             rgw::sal::Object& obj,
+                             const rgw_placement_rule& placement_rule,
+                             const real_time& mtime,
+                             uint64_t olh_epoch,
+                             const DoutPrefixProvider *dpp,
+                             optional_yield y)
+{
+  rgw::sal::Attrs attrs;
+  real_time read_mtime;
+  uint64_t obj_size;
+
+  obj.set_atomic();
+  RGWRados::Object op_target(this, bucket, obj_ctx, &obj);
+  RGWRados::Object::Read read_op(&op_target);
+
+  read_op.params.attrs = &attrs;
+  read_op.params.lastmod = &read_mtime;
+  read_op.params.obj_size = &obj_size;
+
+  int ret = read_op.prepare(y, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+
+  if (read_mtime != mtime) {
+    /* raced */
+    return -ECANCELED;
+  }
+
+  attrs.erase(RGW_ATTR_ID_TAG);
+  attrs.erase(RGW_ATTR_TAIL_TAG);
+
+  ret = copy_obj_data(obj_ctx,
+                      bucket,
+                      placement_rule,
+                      read_op,
+                      obj_size - 1,
+                      &obj,
+                      nullptr /* pmtime */,
+                      mtime,
+                      attrs,
+                      olh_epoch,
+                      real_time(),
+                      nullptr /* petag */,
+                      dpp,
+                      y);
+  if (ret < 0) {
+    return ret;
+  }
+
+  return 0;
+}
+
+int RGWRados::check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y)
+{
+  constexpr uint NUM_ENTRIES = 1000u;
+
+  rgw_obj_index_key marker;
+  string prefix;
+  bool is_truncated;
+
+  do {
+    std::vector<rgw_bucket_dir_entry> ent_list;
+    ent_list.reserve(NUM_ENTRIES);
+
+    int r = cls_bucket_list_unordered(dpp,
+                                      bucket_info,
+                                      bucket_info.layout.current_index,
+                                      RGW_NO_SHARD,
+                                      marker,
+                                      prefix,
+                                      NUM_ENTRIES,
+                                      true,
+                                      ent_list,
+                                      &is_truncated,
+                                      &marker,
+                                      y);
+    if (r < 0) {
+      return r;
+    }
+
+    string ns;
+    for (auto const& dirent : ent_list) {
+      rgw_obj_key obj;
+
+      if
(rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns)) { + return -ENOTEMPTY; + } + } + } while (is_truncated); + + return 0; +} + +/** + * Delete a bucket. + * bucket: the name of the bucket to delete + * Returns 0 on success, -ERR# otherwise. + */ +int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp, bool check_empty) +{ + const rgw_bucket& bucket = bucket_info.bucket; + RGWSI_RADOS::Pool index_pool; + map bucket_objs; + int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr); + if (r < 0) + return r; + + if (check_empty) { + r = check_bucket_empty(dpp, bucket_info, y); + if (r < 0) { + return r; + } + } + + bool remove_ep = true; + + if (objv_tracker.read_version.empty()) { + RGWBucketEntryPoint ep; + r = ctl.bucket->read_bucket_entrypoint_info(bucket_info.bucket, + &ep, + null_yield, + dpp, + RGWBucketCtl::Bucket::GetParams() + .set_objv_tracker(&objv_tracker)); + if (r < 0 || + (!bucket_info.bucket.bucket_id.empty() && + ep.bucket.bucket_id != bucket_info.bucket.bucket_id)) { + if (r != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: read_bucket_entrypoint_info() bucket=" << bucket_info.bucket << " returned error: r=" << r << dendl; + /* we have no idea what caused the error, will not try to remove it */ + } + /* + * either failed to read bucket entrypoint, or it points to a different bucket instance than + * requested + */ + remove_ep = false; + } + } + + if (remove_ep) { + r = ctl.bucket->remove_bucket_entrypoint_info(bucket_info.bucket, null_yield, dpp, + RGWBucketCtl::Bucket::RemoveParams() + .set_objv_tracker(&objv_tracker)); + if (r < 0) + return r; + } + + /* if the bucket is not synced we can remove the meta file */ + if (!svc.zone->is_syncing_bucket_meta(bucket)) { + RGWObjVersionTracker objv_tracker; + r = ctl.bucket->remove_bucket_instance_info(bucket, bucket_info, null_yield, dpp); + if (r < 0) { + return r; + } + + /* remove bucket index objects asynchronously by best effort */ + (void) CLSRGWIssueBucketIndexClean(index_pool.ioctx(), + bucket_objs, + cct->_conf->rgw_bucket_index_max_aio)(); + } + + return 0; +} + +int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner, const DoutPrefixProvider *dpp) +{ + RGWBucketInfo info; + map attrs; + int r; + + if (bucket.bucket_id.empty()) { + r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp, &attrs); + } else { + r = get_bucket_instance_info(bucket, info, nullptr, &attrs, null_yield, dpp); + } + if (r < 0) { + ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl; + return r; + } + + info.owner = owner.get_id(); + + r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl; + return r; + } + + return 0; +} + + +int RGWRados::set_buckets_enabled(vector& buckets, bool enabled, const DoutPrefixProvider *dpp) +{ + int ret = 0; + + vector::iterator iter; + + for (iter = buckets.begin(); iter != buckets.end(); ++iter) { + rgw_bucket& bucket = *iter; + if (enabled) { + ldpp_dout(dpp, 20) << "enabling bucket name=" << bucket.name << dendl; + } else { + ldpp_dout(dpp, 20) << "disabling bucket name=" << bucket.name << dendl; + } + + RGWBucketInfo info; + map attrs; + int r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, 
null_yield, dpp, &attrs); + if (r < 0) { + ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl; + ret = r; + continue; + } + if (enabled) { + info.flags &= ~BUCKET_SUSPENDED; + } else { + info.flags |= BUCKET_SUSPENDED; + } + + r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl; + ret = r; + continue; + } + } + return ret; +} + +int RGWRados::bucket_suspended(const DoutPrefixProvider *dpp, rgw_bucket& bucket, bool *suspended) +{ + RGWBucketInfo bucket_info; + int ret = get_bucket_info(&svc, bucket.tenant, bucket.name, bucket_info, NULL, null_yield, dpp); + if (ret < 0) { + return ret; + } + + *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0); + return 0; +} + +int RGWRados::Object::complete_atomic_modification(const DoutPrefixProvider *dpp) +{ + if ((!manifest)|| state->keep_tail) + return 0; + + cls_rgw_obj_chain chain; + store->update_gc_chain(dpp, obj->get_obj(), *manifest, &chain); + + if (chain.empty()) { + return 0; + } + + string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str()); + if (store->gc == nullptr) { + ldpp_dout(dpp, 0) << "deleting objects inline since gc isn't initialized" << dendl; + //Delete objects inline just in case gc hasn't been initialised, prevents crashes + store->delete_objs_inline(dpp, chain, tag); + } else { + auto [ret, leftover_chain] = store->gc->send_split_chain(chain, tag); // do it synchronously + if (ret < 0 && leftover_chain) { + //Delete objects inline if send chain to gc fails + store->delete_objs_inline(dpp, *leftover_chain, tag); + } + } + return 0; +} + +void RGWRados::update_gc_chain(const DoutPrefixProvider *dpp, rgw_obj head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain) +{ + RGWObjManifest::obj_iterator iter; + rgw_raw_obj raw_head; + obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head); + for (iter = manifest.obj_begin(dpp); iter != manifest.obj_end(dpp); ++iter) { + const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(driver); + if (mobj == raw_head) + continue; + cls_rgw_obj_key key(mobj.oid); + chain->push_obj(mobj.pool.to_str(), key, mobj.loc); + } +} + +std::tuple> RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag) +{ + if (chain.empty()) { + return {0, std::nullopt}; + } + + return gc->send_split_chain(chain, tag); +} + +void RGWRados::delete_objs_inline(const DoutPrefixProvider *dpp, cls_rgw_obj_chain& chain, const string& tag) +{ + string last_pool; + std::unique_ptr ctx(new IoCtx); + int ret = 0; + for (auto liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) { + cls_rgw_obj& obj = *liter; + if (obj.pool != last_pool) { + ctx.reset(new IoCtx); + ret = rgw_init_ioctx(dpp, get_rados_handle(), obj.pool, *ctx); + if (ret < 0) { + last_pool = ""; + ldpp_dout(dpp, 0) << "ERROR: failed to create ioctx pool=" << + obj.pool << dendl; + continue; + } + last_pool = obj.pool; + } + ctx->locator_set_key(obj.loc); + const string& oid = obj.key.name; /* just stored raw oid there */ + ldpp_dout(dpp, 5) << "delete_objs_inline: removing " << obj.pool << + ":" << obj.key.name << dendl; + ObjectWriteOperation op; + cls_refcount_put(op, tag, true); + ret = ctx->operate(oid, &op); + if (ret < 0) { + ldpp_dout(dpp, 5) << "delete_objs_inline: refcount put returned error " << ret << dendl; + } + } +} + 
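/* [editor's note] A minimal standalone sketch (not part of this patch) of the
 * "GC first, inline fallback" policy that complete_atomic_modification() and
 * delete_objs_inline() above implement; Gc and Chain are hypothetical
 * stand-ins for RGWGC and cls_rgw_obj_chain:
 *
 *   #include <functional>
 *   #include <optional>
 *   #include <tuple>
 *
 *   struct Chain { int objs = 0; bool empty() const { return objs == 0; } };
 *   struct Gc {
 *     // returns {error code, chain left over for the caller to delete}
 *     std::tuple<int, std::optional<Chain>> send_split_chain(const Chain& c) {
 *       return {0, std::nullopt};  // happy path: queued for async GC
 *     }
 *   };
 *
 *   int cleanup_tail(Gc* gc, Chain& chain,
 *                    const std::function<void(const Chain&)>& delete_inline) {
 *     if (chain.empty())
 *       return 0;                  // nothing to reclaim
 *     if (!gc) {
 *       delete_inline(chain);      // gc not initialized: delete synchronously
 *       return 0;
 *     }
 *     auto [ret, leftover] = gc->send_split_chain(chain);
 *     if (ret < 0 && leftover)
 *       delete_inline(*leftover);  // best-effort fallback, as in the code above
 *     return 0;                    // errors are absorbed, as in the original
 *   }
 */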
+static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
+                                 map<RGWObjCategory, RGWStorageStats>& stats)
+{
+  for (const auto& pair : header.stats) {
+    const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
+    const rgw_bucket_category_stats& header_stats = pair.second;
+
+    RGWStorageStats& s = stats[category];
+
+    s.category = category;
+    s.size += header_stats.total_size;
+    s.size_rounded += header_stats.total_size_rounded;
+    s.size_utilized += header_stats.actual_size;
+    s.num_objects += header_stats.num_entries;
+  }
+}
+
+int RGWRados::bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
+                                 map<RGWObjCategory, RGWStorageStats> *existing_stats,
+                                 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
+{
+  RGWSI_RADOS::Pool index_pool;
+
+  // key - bucket index object id
+  // value - bucket index check OP returned result with the given bucket index object (shard)
+  map<int, string> oids;
+
+  int ret = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &oids, nullptr);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // declare and pre-populate
+  map<int, rgw_cls_check_index_ret> bucket_objs_ret;
+  for (auto& iter : oids) {
+    bucket_objs_ret.emplace(iter.first, rgw_cls_check_index_ret());
+  }
+
+  ret = CLSRGWIssueBucketCheck(index_pool.ioctx(), oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
+  if (ret < 0) {
+    return ret;
+  }
+
+  // aggregate results (from different shards if there are any)
+  for (const auto& iter : bucket_objs_ret) {
+    accumulate_raw_stats(iter.second.existing_header, *existing_stats);
+    accumulate_raw_stats(iter.second.calculated_header, *calculated_stats);
+  }
+
+  return 0;
+}
+
+int RGWRados::bucket_rebuild_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info)
+{
+  RGWSI_RADOS::Pool index_pool;
+  map<int, string> bucket_objs;
+
+  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
+  if (r < 0) {
+    return r;
+  }
+
+  return CLSRGWIssueBucketRebuild(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
+}
+
+int RGWRados::bucket_set_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
+{
+  RGWSI_RADOS::Pool index_pool;
+  map<int, string> bucket_objs;
+
+  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+      ": unable to open bucket index, r=" << r << " (" <<
+      cpp_strerror(-r) << ")" << dendl;
+    return r;
+  }
+
+  r = CLSRGWIssueSetBucketResharding(index_pool.ioctx(), bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+      ": unable to issue set bucket resharding, r=" << r << " (" <<
+      cpp_strerror(-r) << ")" << dendl;
+  }
+  return r;
+}
+
+int RGWRados::defer_gc(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, optional_yield y)
+{
+  std::string oid, key;
+  get_obj_bucket_and_oid_loc(obj->get_obj(), oid, key);
+  if (!rctx)
+    return 0;
+
+  RGWObjState *state = NULL;
+  RGWObjManifest *manifest = nullptr;
+
+  int r = get_obj_state(dpp, rctx, bucket_info, obj, &state, &manifest, false, y);
+  if (r < 0)
+    return r;
+
+  if (!state->is_atomic) {
+    ldpp_dout(dpp, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
+    return -EINVAL;
+  }
+
+  string tag;
+
+  if (state->tail_tag.length() > 0) {
+    tag = state->tail_tag.c_str();
+  } else if
(state->obj_tag.length() > 0) { + tag = state->obj_tag.c_str(); + } else { + ldpp_dout(dpp, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl; + return -EINVAL; + } + + ldpp_dout(dpp, 0) << "defer chain tag=" << tag << dendl; + + cls_rgw_obj_chain chain; + update_gc_chain(dpp, state->obj, *manifest, &chain); + return gc->async_defer_chain(tag, chain); +} + +void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op) +{ + list prefixes; + prefixes.push_back(RGW_ATTR_OLH_PREFIX); + cls_rgw_remove_obj(op, prefixes); +} + +void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist) +{ + cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist); +} + +void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type) +{ + cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type); +} + +struct tombstone_entry { + ceph::real_time mtime; + uint32_t zone_short_id; + uint64_t pg_ver; + + tombstone_entry() = default; + explicit tombstone_entry(const RGWObjState& state) + : mtime(state.mtime), zone_short_id(state.zone_short_id), + pg_ver(state.pg_ver) {} +}; + +/** + * Delete an object. + * bucket: name of the bucket storing the object + * obj: name of the object to delete + * Returns: 0 on success, -ERR# otherwise. + */ +int RGWRados::Object::Delete::delete_obj(optional_yield y, const DoutPrefixProvider *dpp) +{ + RGWRados *store = target->get_store(); + const string& instance = target->get_instance(); + rgw_obj obj = target->get_obj(); + + if (instance == "null") { + obj.key.instance.clear(); + } + + bool explicit_marker_version = (!params.marker_version_id.empty()); + + if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) { + if (instance.empty() || explicit_marker_version) { + std::unique_ptr marker = target->get_target()->clone(); + marker->clear_instance(); + + if (!params.marker_version_id.empty()) { + if (params.marker_version_id != "null") { + marker->set_instance(params.marker_version_id); + } + } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) { + marker->gen_rand_obj_instance_name(); + } + + result.version_id = marker->get_instance(); + if (result.version_id.empty()) + result.version_id = "null"; + result.delete_marker = true; + + struct rgw_bucket_dir_entry_meta meta; + + meta.owner = params.obj_owner.get_id().to_str(); + meta.owner_display_name = params.obj_owner.get_display_name(); + + if (real_clock::is_zero(params.mtime)) { + meta.mtime = real_clock::now(); + } else { + meta.mtime = params.mtime; + } + + int r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), marker.get(), true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, y, params.zones_trace); + if (r < 0) { + return r; + } + } else { + rgw_bucket_dir_entry dirent; + + int r = store->bi_get_instance(dpp, target->get_bucket_info(), obj, &dirent); + if (r < 0) { + return r; + } + result.delete_marker = dirent.is_delete_marker(); + r = store->unlink_obj_instance(dpp, target->get_bucket_info(), target->get_target(), params.olh_epoch, y, params.zones_trace); + if (r < 0) { + return r; + } + result.version_id = instance; + } + + BucketShard *bs = nullptr; + int r = target->get_bucket_shard(&bs, dpp); + if (r < 0) { + ldpp_dout(dpp, 5) << "failed to get BucketShard object: r=" << r << dendl; + return r; + } + + add_datalog_entry(dpp, store->svc.datalog_rados, + target->get_bucket_info(), bs->shard_id); + + return 0; + 
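/* [editor's note, not part of this patch] Summary of the versioned branch that
 * ends here: an empty instance (or an explicit marker version id) writes a
 * delete marker through set_olh(), while a concrete instance is removed via
 * unlink_obj_instance(); both paths record the change in the datalog and
 * return, so the head-object deletion below only runs for the unversioned
 * case. */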
} + + rgw_rados_ref ref; + int r = store->get_obj_head_ref(dpp, target->get_bucket_info(), obj, &ref); + if (r < 0) { + return r; + } + + RGWObjState *state; + RGWObjManifest *manifest = nullptr; + r = target->get_state(dpp, &state, &manifest, false, y); + if (r < 0) + return r; + + ObjectWriteOperation op; + + if (!real_clock::is_zero(params.unmod_since)) { + struct timespec ctime = ceph::real_clock::to_timespec(state->mtime); + struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since); + if (!params.high_precision_time) { + ctime.tv_nsec = 0; + unmod.tv_nsec = 0; + } + + ldpp_dout(dpp, 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl; + if (ctime > unmod) { + return -ERR_PRECONDITION_FAILED; + } + + /* only delete object if mtime is less than or equal to params.unmod_since */ + store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE); + } + uint64_t obj_accounted_size = state->accounted_size; + + if(params.abortmp) { + obj_accounted_size = params.parts_accounted_size; + } + + if (!real_clock::is_zero(params.expiration_time)) { + bufferlist bl; + real_time delete_at; + + if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) { + try { + auto iter = bl.cbegin(); + decode(delete_at, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl; + return -EIO; + } + + if (params.expiration_time != delete_at) { + return -ERR_PRECONDITION_FAILED; + } + } else { + return -ERR_PRECONDITION_FAILED; + } + } + + if (!state->exists) { + target->invalidate_state(); + return -ENOENT; + } + + r = target->prepare_atomic_modification(dpp, op, false, NULL, NULL, NULL, true, false, y); + if (r < 0) + return r; + + RGWBucketInfo& bucket_info = target->get_bucket_info(); + + RGWRados::Bucket bop(store, bucket_info); + RGWRados::Bucket::UpdateIndex index_op(&bop, obj); + + index_op.set_zones_trace(params.zones_trace); + index_op.set_bilog_flags(params.bilog_flags); + + r = index_op.prepare(dpp, CLS_RGW_OP_DEL, &state->write_tag, y); + if (r < 0) + return r; + + store->remove_rgw_head_obj(op); + + auto& ioctx = ref.pool.ioctx(); + r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, y); + + /* raced with another operation, object state is indeterminate */ + const bool need_invalidate = (r == -ECANCELED); + + int64_t poolid = ioctx.get_id(); + if (r >= 0) { + tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache(); + if (obj_tombstone_cache) { + tombstone_entry entry{*state}; + obj_tombstone_cache->add(obj, entry); + } + r = index_op.complete_del(dpp, poolid, ioctx.get_last_version(), state->mtime, params.remove_objs); + + int ret = target->complete_atomic_modification(dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl; + } + /* other than that, no need to propagate error */ + } else { + int ret = index_op.cancel(dpp, params.remove_objs); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl; + } + } + + if (need_invalidate) { + target->invalidate_state(); + } + + if (r < 0) + return r; + + /* update quota cache */ + store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_accounted_size); + + return 0; +} + +int RGWRados::delete_obj(rgw::sal::Driver* store, + const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + const rgw_obj& obj, + int versioning_status, // versioning flags defined in enum 
RGWBucketFlags + uint16_t bilog_flags, + const real_time& expiration_time, + rgw_zone_set *zones_trace) +{ + std::unique_ptr bucket; + store->get_bucket(nullptr, bucket_info, &bucket); + std::unique_ptr object = bucket->get_object(obj.key); + + return delete_obj(dpp, bucket_info, object.get(), versioning_status, + bilog_flags, expiration_time, zones_trace); +} + +int RGWRados::delete_obj(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + rgw::sal::Object* obj, + int versioning_status, // versioning flags defined in enum RGWBucketFlags + uint16_t bilog_flags, + const real_time& expiration_time, + rgw_zone_set *zones_trace) +{ + std::unique_ptr del_op = obj->get_delete_op(); + + del_op->params.bucket_owner = bucket_info.owner; + del_op->params.versioning_status = versioning_status; + del_op->params.bilog_flags = bilog_flags; + del_op->params.expiration_time = expiration_time; + del_op->params.zones_trace = zones_trace; + + return del_op->delete_obj(dpp, null_yield); +} + +int RGWRados::delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj) +{ + rgw_rados_ref ref; + int r = get_raw_obj_ref(dpp, obj, &ref); + if (r < 0) { + return r; + } + + ObjectWriteOperation op; + + op.remove(); + r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); + if (r < 0) + return r; + + return 0; +} + +int RGWRados::delete_obj_index(const rgw_obj& obj, ceph::real_time mtime, const DoutPrefixProvider *dpp) +{ + std::string oid, key; + get_obj_bucket_and_oid_loc(obj, oid, key); + + RGWBucketInfo bucket_info; + int ret = get_bucket_instance_info(obj.bucket, bucket_info, NULL, NULL, null_yield, dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl; + return ret; + } + + RGWRados::Bucket bop(this, bucket_info); + RGWRados::Bucket::UpdateIndex index_op(&bop, obj); + + return index_op.complete_del(dpp, -1 /* pool */, 0, mtime, NULL); +} + +static void generate_fake_tag(const DoutPrefixProvider *dpp, rgw::sal::Driver* store, map& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl) +{ + string tag; + + RGWObjManifest::obj_iterator mi = manifest.obj_begin(dpp); + if (mi != manifest.obj_end(dpp)) { + if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part + ++mi; + rgw::sal::RadosStore* rstore = dynamic_cast(store); + tag = mi.get_location().get_raw_obj(rstore).oid; + tag.append("_"); + } + + unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE]; + char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; + MD5 hash; + // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes + hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + hash.Update((const unsigned char *)manifest_bl.c_str(), manifest_bl.length()); + + map::iterator iter = attrset.find(RGW_ATTR_ETAG); + if (iter != attrset.end()) { + bufferlist& bl = iter->second; + hash.Update((const unsigned char *)bl.c_str(), bl.length()); + } + + hash.Final(md5); + buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str); + tag.append(md5_str); + + ldpp_dout(dpp, 10) << "generate_fake_tag new tag=" << tag << dendl; + + tag_bl.append(tag.c_str(), tag.size() + 1); +} + +static bool is_olh(map& attrs) +{ + map::iterator iter = attrs.find(RGW_ATTR_OLH_INFO); + return (iter != attrs.end()); +} + +static bool has_olh_tag(map& attrs) +{ + map::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG); + return (iter != attrs.end()); +} + +int 
RGWRados::get_olh_target_state(const DoutPrefixProvider *dpp, RGWObjectCtx& + obj_ctx, RGWBucketInfo& bucket_info, + rgw::sal::Object* obj, RGWObjState *olh_state, + RGWObjState **target_state, + RGWObjManifest **target_manifest, optional_yield y) +{ + ceph_assert(olh_state->is_olh); + + rgw_obj target; + int r = RGWRados::follow_olh(dpp, bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */ + if (r < 0) { + return r; + } + + std::unique_ptr bucket; + driver->get_bucket(nullptr, bucket_info, &bucket); + std::unique_ptr target_obj = bucket->get_object(target.key); + + r = get_obj_state(dpp, &obj_ctx, bucket_info, target_obj.get(), target_state, + target_manifest, false, y); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWRados::get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, + RGWBucketInfo& bucket_info, rgw::sal::Object* obj, + RGWObjState **state, RGWObjManifest** manifest, + bool follow_olh, optional_yield y, bool assume_noent) +{ + if (obj->empty()) { + return -EINVAL; + } + + bool need_follow_olh = follow_olh && obj->get_obj().key.instance.empty(); + *manifest = nullptr; + + RGWObjStateManifest *sm = rctx->get_state(obj->get_obj()); + RGWObjState *s = &(sm->state); + ldpp_dout(dpp, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl; + *state = s; + if (sm->manifest) { + *manifest = &(*sm->manifest); + } + if (s->has_attrs) { + if (s->is_olh && need_follow_olh) { + return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, manifest, y); + } + return 0; + } + + s->obj = obj->get_obj(); + + rgw_raw_obj raw_obj; + obj_to_raw(bucket_info.placement_rule, obj->get_obj(), &raw_obj); + + int r = -ENOENT; + + if (!assume_noent) { + r = RGWRados::raw_obj_stat(dpp, raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? 
&s->data : NULL), NULL, y); + } + + if (r == -ENOENT) { + s->exists = false; + s->has_attrs = true; + tombstone_entry entry; + if (obj_tombstone_cache && obj_tombstone_cache->find(obj->get_obj(), entry)) { + s->mtime = entry.mtime; + s->zone_short_id = entry.zone_short_id; + s->pg_ver = entry.pg_ver; + ldpp_dout(dpp, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj + << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl; + } else { + s->mtime = real_time(); + } + return 0; + } + if (r < 0) + return r; + + s->exists = true; + s->has_attrs = true; + s->accounted_size = s->size; + + auto iter = s->attrset.find(RGW_ATTR_ETAG); + if (iter != s->attrset.end()) { + /* get rid of extra null character at the end of the etag, as we used to store it like that */ + bufferlist& bletag = iter->second; + if (bletag.length() > 0 && bletag[bletag.length() - 1] == '\0') { + bufferlist newbl; + bletag.splice(0, bletag.length() - 1, &newbl); + bletag = std::move(newbl); + } + } + + iter = s->attrset.find(RGW_ATTR_COMPRESSION); + const bool compressed = (iter != s->attrset.end()); + if (compressed) { + // use uncompressed size for accounted_size + try { + RGWCompressionInfo info; + auto p = iter->second.cbegin(); + decode(info, p); + s->accounted_size = info.orig_size; + } catch (buffer::error&) { + ldpp_dout(dpp, 0) << "ERROR: could not decode compression info for object: " << obj << dendl; + return -EIO; + } + } + + iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ); + if (iter != s->attrset.end()) { + bufferlist bl = iter->second; + bufferlist::iterator it = bl.begin(); + it.copy(bl.length(), s->shadow_obj); + s->shadow_obj[bl.length()] = '\0'; + } + s->obj_tag = s->attrset[RGW_ATTR_ID_TAG]; + auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG); + if (ttiter != s->attrset.end()) { + s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG]; + } + + bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST]; + if (manifest_bl.length()) { + auto miter = manifest_bl.cbegin(); + try { + sm->manifest.emplace(); + decode(*sm->manifest, miter); + sm->manifest->set_head(bucket_info.placement_rule, obj->get_obj(), s->size); /* patch manifest to reflect the head we just read, some manifests might be + broken due to old bugs */ + s->size = sm->manifest->get_obj_size(); + if (!compressed) + s->accounted_size = s->size; + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: couldn't decode manifest" << dendl; + return -EIO; + } + *manifest = &(*sm->manifest); + ldpp_dout(dpp, 10) << "manifest: total_size = " << sm->manifest->get_obj_size() << dendl; + if (cct->_conf->subsys.should_gather() && \ + sm->manifest->has_explicit_objs()) { + RGWObjManifest::obj_iterator mi; + for (mi = sm->manifest->obj_begin(dpp); mi != sm->manifest->obj_end(dpp); ++mi) { + ldpp_dout(dpp, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(driver) << dendl; + } + } + + if (!s->obj_tag.length()) { + /* + * Uh oh, something's wrong, object with manifest should have tag. 
Let's + * create one out of the manifest, would be unique + */ + generate_fake_tag(dpp, driver, s->attrset, *sm->manifest, manifest_bl, s->obj_tag); + s->fake_tag = true; + } + } + map::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER); + if (aiter != s->attrset.end()) { + bufferlist& pg_ver_bl = aiter->second; + if (pg_ver_bl.length()) { + auto pgbl = pg_ver_bl.cbegin(); + try { + decode(s->pg_ver, pgbl); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl; + } + } + } + aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE); + if (aiter != s->attrset.end()) { + bufferlist& zone_short_id_bl = aiter->second; + if (zone_short_id_bl.length()) { + auto zbl = zone_short_id_bl.cbegin(); + try { + decode(s->zone_short_id, zbl); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl; + } + } + } + if (s->obj_tag.length()) { + ldpp_dout(dpp, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl; + } else { + ldpp_dout(dpp, 20) << "get_obj_state: s->obj_tag was set empty" << dendl; + } + + /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if + * it exist, and not only if is_olh() returns true + */ + iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG); + if (iter != s->attrset.end()) { + s->olh_tag = iter->second; + } + + if (is_olh(s->attrset)) { + s->is_olh = true; + + ldpp_dout(dpp, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl; + + if (need_follow_olh) { + return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, manifest, y); + } else if (obj->get_obj().key.have_null_instance() && !sm->manifest) { + // read null version, and the head object only have olh info + s->exists = false; + return -ENOENT; + } + } + + return 0; +} + +int RGWRados::get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, RGWObjState **state, RGWObjManifest** manifest, + bool follow_olh, optional_yield y, bool assume_noent) +{ + int ret; + + do { + ret = get_obj_state_impl(dpp, rctx, bucket_info, obj, state, manifest, follow_olh, y, assume_noent); + } while (ret == -EAGAIN); + + return ret; +} + +int RGWRados::Object::get_manifest(const DoutPrefixProvider *dpp, RGWObjManifest **pmanifest, optional_yield y) +{ + RGWObjState *astate; + int r = get_state(dpp, &astate, pmanifest, true, y); + if (r < 0) { + return r; + } + + return 0; +} + +int RGWRados::Object::Read::get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest, optional_yield y) +{ + RGWObjState *state; + RGWObjManifest *manifest = nullptr; + int r = source->get_state(dpp, &state, &manifest, true, y); + if (r < 0) + return r; + if (!state->exists) + return -ENOENT; + if (!state->get_attr(name, dest)) + return -ENODATA; + + return 0; +} + +int RGWRados::Object::Stat::stat_async(const DoutPrefixProvider *dpp) +{ + rgw::sal::Object* target = source->get_target(); + rgw_obj obj = target->get_obj(); + RGWRados *store = source->get_store(); + + result.obj = obj; + if (target->has_attrs()) { + state.ret = 0; + result.size = target->get_obj_size(); + result.mtime = ceph::real_clock::to_timespec(target->get_mtime()); + result.attrs = target->get_attrs(); + //result.manifest = sm->manifest; + return 0; + } + + string oid; + string loc; + 
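/* [editor's note] Illustrative call pattern (a sketch, not part of this
 * patch) for the async stat machinery defined here; callers can issue
 * several stats and overlap them before harvesting results:
 *
 *   RGWRados::Object::Stat stat_op(&op_target);
 *   int r = stat_op.stat_async(dpp);   // queues stat2() + getxattrs() via aio
 *   if (r >= 0)
 *     r = stat_op.wait(dpp);           // blocks, then decodes the manifest
 */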
get_obj_bucket_and_oid_loc(obj, oid, loc); + + int r = store->get_obj_head_ioctx(dpp, source->get_bucket_info(), obj, &state.io_ctx); + if (r < 0) { + return r; + } + + librados::ObjectReadOperation op; + op.stat2(&result.size, &result.mtime, NULL); + op.getxattrs(&result.attrs, NULL); + state.completion = librados::Rados::aio_create_completion(nullptr, nullptr); + state.io_ctx.locator_set_key(loc); + r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL); + if (r < 0) { + ldpp_dout(dpp, 5) << __func__ + << ": ERROR: aio_operate() returned ret=" << r + << dendl; + return r; + } + + return 0; +} + + +int RGWRados::Object::Stat::wait(const DoutPrefixProvider *dpp) +{ + if (!state.completion) { + return state.ret; + } + + state.completion->wait_for_complete(); + state.ret = state.completion->get_return_value(); + state.completion->release(); + + if (state.ret != 0) { + return state.ret; + } + + return finish(dpp); +} + +int RGWRados::Object::Stat::finish(const DoutPrefixProvider *dpp) +{ + map::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST); + if (iter != result.attrs.end()) { + bufferlist& bl = iter->second; + auto biter = bl.cbegin(); + try { + result.manifest.emplace(); + decode(*result.manifest, biter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl; + return -EIO; + } + } + + return 0; +} + +int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, rgw::sal::Object* obj, + ObjectOperation& op, RGWObjState **pstate, + RGWObjManifest** pmanifest, optional_yield y) +{ + int r = obj->get_obj_state(dpp, pstate, y, false); + if (r < 0) + return r; + + return append_atomic_test(dpp, *pstate, op); +} + +int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp, + const RGWObjState* state, + librados::ObjectOperation& op) +{ + if (!state->is_atomic) { + ldpp_dout(dpp, 20) << "state for obj=" << state->obj << " is not atomic, not appending atomic test" << dendl; + return 0; + } + + if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility + op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag); + } else { + ldpp_dout(dpp, 20) << "state->obj_tag is empty, not appending atomic test" << dendl; + } + return 0; +} + +int RGWRados::Object::get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, RGWObjManifest **pmanifest, bool follow_olh, optional_yield y, bool assume_noent) +{ + int r = obj->get_obj_state(dpp, pstate, y, follow_olh); + if (r < 0) { + return r; + } + *pmanifest = static_cast(obj)->get_manifest(); + + return r; +} + +void RGWRados::Object::invalidate_state() +{ + obj->invalidate(); +} + +int RGWRados::Object::prepare_atomic_modification(const DoutPrefixProvider *dpp, + ObjectWriteOperation& op, bool reset_obj, const string *ptag, + const char *if_match, const char *if_nomatch, bool removal_op, + bool modify_tail, optional_yield y) +{ + int r = get_state(dpp, &state, &manifest, false, y); + if (r < 0) + return r; + + bool need_guard = ((manifest) || (state->obj_tag.length() != 0) || + if_match != NULL || if_nomatch != NULL) && + (!state->fake_tag); + + if (!state->is_atomic) { + ldpp_dout(dpp, 20) << "prepare_atomic_modification: state is not atomic. 
state=" << (void *)state << dendl; + + if (reset_obj) { + op.create(false); + store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object + } + + return 0; + } + + if (need_guard) { + /* first verify that the object wasn't replaced under */ + if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) { + op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag); + // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion + } + + if (if_match) { + if (strcmp(if_match, "*") == 0) { + // test the object is existing + if (!state->exists) { + return -ERR_PRECONDITION_FAILED; + } + } else { + bufferlist bl; + if (!state->get_attr(RGW_ATTR_ETAG, bl) || + strncmp(if_match, bl.c_str(), bl.length()) != 0) { + return -ERR_PRECONDITION_FAILED; + } + } + } + + if (if_nomatch) { + if (strcmp(if_nomatch, "*") == 0) { + // test the object is NOT existing + if (state->exists) { + return -ERR_PRECONDITION_FAILED; + } + } else { + bufferlist bl; + if (!state->get_attr(RGW_ATTR_ETAG, bl) || + strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) { + return -ERR_PRECONDITION_FAILED; + } + } + } + } + + if (reset_obj) { + if (state->exists) { + op.create(false); + store->remove_rgw_head_obj(op); + } else { + op.create(true); + } + } + + if (removal_op) { + /* the object is being removed, no need to update its tag */ + return 0; + } + + if (ptag) { + state->write_tag = *ptag; + } else { + append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32); + } + bufferlist bl; + bl.append(state->write_tag.c_str(), state->write_tag.size() + 1); + + ldpp_dout(dpp, 10) << "setting object write_tag=" << state->write_tag << dendl; + + op.setxattr(RGW_ATTR_ID_TAG, bl); + if (modify_tail) { + op.setxattr(RGW_ATTR_TAIL_TAG, bl); + } + + return 0; +} + +/** + * Set an attr on an object. + * bucket: name of the bucket holding the object + * obj: name of the object to set the attr on + * name: the attr to set + * bl: the contents of the attr + * Returns: 0 on success, -ERR# otherwise. 
+ */ +int RGWRados::set_attr(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, const char *name, bufferlist& bl) +{ + map attrs; + attrs[name] = bl; + return set_attrs(dpp, rctx, bucket_info, obj, attrs, NULL, null_yield); +} + +int RGWRados::set_attrs(const DoutPrefixProvider *dpp, void *ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* src_obj, + map& attrs, + map* rmattrs, + optional_yield y) +{ + std::unique_ptr obj = src_obj->clone(); + if (obj->get_instance() == "null") { + obj->clear_instance(); + } + + rgw_rados_ref ref; + int r = get_obj_head_ref(dpp, bucket_info, obj->get_obj(), &ref); + if (r < 0) { + return r; + } + + ObjectWriteOperation op; + RGWObjState *state = NULL; + RGWObjManifest *manifest = nullptr; + + r = append_atomic_test(dpp, bucket_info, obj.get(), op, &state, &manifest, y); + if (r < 0) + return r; + + // ensure null version object exist + if (src_obj->get_instance() == "null" && !manifest) { + return -ENOENT; + } + + map::iterator iter; + if (rmattrs) { + for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) { + const string& name = iter->first; + op.rmxattr(name.c_str()); + } + } + + const rgw_bucket& bucket = obj->get_bucket()->get_key(); + + for (iter = attrs.begin(); iter != attrs.end(); ++iter) { + const string& name = iter->first; + bufferlist& bl = iter->second; + + if (!bl.length()) + continue; + + op.setxattr(name.c_str(), bl); + + if (name.compare(RGW_ATTR_DELETE_AT) == 0) { + real_time ts; + try { + decode(ts, bl); + + rgw_obj_index_key obj_key; + obj->get_key().get_index_key(&obj_key); + + obj_expirer->hint_add(dpp, ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl; + } + } + } + + if (!op.size()) + return 0; + + bufferlist bl; + RGWRados::Bucket bop(this, bucket_info); + RGWRados::Bucket::UpdateIndex index_op(&bop, obj->get_obj()); + + if (state) { + string tag; + append_rand_alpha(cct, tag, tag, 32); + state->write_tag = tag; + r = index_op.prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y); + + if (r < 0) + return r; + + bl.append(tag.c_str(), tag.size() + 1); + op.setxattr(RGW_ATTR_ID_TAG, bl); + } + + + real_time mtime = real_clock::now(); + struct timespec mtime_ts = real_clock::to_timespec(mtime); + op.mtime2(&mtime_ts); + auto& ioctx = ref.pool.ioctx(); + r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, null_yield); + if (state) { + if (r >= 0) { + bufferlist acl_bl = attrs[RGW_ATTR_ACL]; + bufferlist etag_bl = attrs[RGW_ATTR_ETAG]; + bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE]; + string etag = rgw_bl_str(etag_bl); + string content_type = rgw_bl_str(content_type_bl); + string storage_class; + auto iter = attrs.find(RGW_ATTR_STORAGE_CLASS); + if (iter != attrs.end()) { + storage_class = rgw_bl_str(iter->second); + } + uint64_t epoch = ioctx.get_last_version(); + int64_t poolid = ioctx.get_id(); + r = index_op.complete(dpp, poolid, epoch, state->size, state->accounted_size, + mtime, etag, content_type, storage_class, &acl_bl, + RGWObjCategory::Main, NULL); + } else { + int ret = index_op.cancel(dpp, nullptr); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl; + } + } + } + if (r < 0) + return r; + + if (state) { + state->obj_tag.swap(bl); + if (rmattrs) { + for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) { + state->attrset.erase(iter->first); + } + } + + for (iter 
= attrs.begin(); iter != attrs.end(); ++iter) { + state->attrset[iter->first] = iter->second; + } + + auto iter = state->attrset.find(RGW_ATTR_ID_TAG); + if (iter != state->attrset.end()) { + iter->second = state->obj_tag; + } + } + + return 0; +} + +int RGWRados::Object::Read::prepare(optional_yield y, const DoutPrefixProvider *dpp) +{ + RGWRados *store = source->get_store(); + CephContext *cct = store->ctx(); + + bufferlist etag; + + map::iterator iter; + + RGWObjState *astate; + RGWObjManifest *manifest = nullptr; + int r = source->get_state(dpp, &astate, &manifest, true, y); + if (r < 0) + return r; + + if (!astate->exists) { + return -ENOENT; + } + + const RGWBucketInfo& bucket_info = source->get_bucket_info(); + + state.obj = astate->obj; + store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj); + + state.cur_pool = state.head_obj.pool; + state.cur_ioctx = &state.io_ctxs[state.cur_pool]; + + r = store->get_obj_head_ioctx(dpp, bucket_info, state.obj, state.cur_ioctx); + if (r < 0) { + return r; + } + if (params.target_obj) { + *params.target_obj = state.obj; + } + if (params.attrs) { + *params.attrs = astate->attrset; + if (cct->_conf->subsys.should_gather()) { + for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) { + ldpp_dout(dpp, 20) << "Read xattr rgw_rados: " << iter->first << dendl; + } + } + } + + /* Convert all times go GMT to make them compatible */ + if (conds.mod_ptr || conds.unmod_ptr) { + obj_time_weight src_weight; + src_weight.init(astate); + src_weight.high_precision = conds.high_precision_time; + + obj_time_weight dest_weight; + dest_weight.high_precision = conds.high_precision_time; + + if (conds.mod_ptr && !conds.if_nomatch) { + dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver); + ldpp_dout(dpp, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl; + if (!(dest_weight < src_weight)) { + return -ERR_NOT_MODIFIED; + } + } + + if (conds.unmod_ptr && !conds.if_match) { + dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver); + ldpp_dout(dpp, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl; + if (dest_weight < src_weight) { + return -ERR_PRECONDITION_FAILED; + } + } + } + if (conds.if_match || conds.if_nomatch) { + r = get_attr(dpp, RGW_ATTR_ETAG, etag, y); + if (r < 0) + return r; + + if (conds.if_match) { + string if_match_str = rgw_string_unquote(conds.if_match); + ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-Match: " << if_match_str << dendl; + if (if_match_str.compare(0, etag.length(), etag.c_str(), etag.length()) != 0) { + return -ERR_PRECONDITION_FAILED; + } + } + + if (conds.if_nomatch) { + string if_nomatch_str = rgw_string_unquote(conds.if_nomatch); + ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-NoMatch: " << if_nomatch_str << dendl; + if (if_nomatch_str.compare(0, etag.length(), etag.c_str(), etag.length()) == 0) { + return -ERR_NOT_MODIFIED; + } + } + } + + if (params.obj_size) + *params.obj_size = astate->size; + if (params.lastmod) + *params.lastmod = astate->mtime; + + return 0; +} + +int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end) +{ + if (ofs < 0) { + ofs += obj_size; + if (ofs < 0) + ofs = 0; + end = obj_size - 1; + } else if (end < 0) { + end = obj_size - 1; + } + + if (obj_size > 0) { + if (ofs >= (off_t)obj_size) { + return -ERANGE; + } + if (end >= (off_t)obj_size) { + end = 
obj_size - 1; + } + } + return 0; +} + +int RGWRados::Bucket::UpdateIndex::guard_reshard(const DoutPrefixProvider *dpp, const rgw_obj& obj_instance, BucketShard **pbs, std::function call) +{ + RGWRados *store = target->get_store(); + BucketShard *bs = nullptr; + int r; + +#define NUM_RESHARD_RETRIES 10 + for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) { + int ret = get_bucket_shard(&bs, dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to get BucketShard object. obj=" << + obj_instance.key << ". ret=" << ret << dendl; + return ret; + } + + r = call(bs); + if (r != -ERR_BUSY_RESHARDING) { + break; + } + + ldpp_dout(dpp, 10) << + "NOTICE: resharding operation on bucket index detected, blocking. obj=" << + obj_instance.key << dendl; + + r = store->block_while_resharding(bs, obj_instance, target->bucket_info, null_yield, dpp); + if (r == -ERR_BUSY_RESHARDING) { + ldpp_dout(dpp, 10) << __func__ << + " NOTICE: block_while_resharding() still busy. obj=" << + obj_instance.key << dendl; + continue; + } else if (r < 0) { + ldpp_dout(dpp, 0) << __func__ << + " ERROR: block_while_resharding() failed. obj=" << + obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl; + return r; + } + + ldpp_dout(dpp, 20) << "reshard completion identified. obj=" << obj_instance.key << dendl; + i = 0; /* resharding is finished, make sure we can retry */ + invalidate_bs(); + } // for loop + + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: bucket shard callback failed. obj=" << + obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl; + return r; + } + + if (pbs) { + *pbs = bs; + } + + return 0; +} + +int RGWRados::Bucket::UpdateIndex::prepare(const DoutPrefixProvider *dpp, RGWModifyOp op, const string *write_tag, optional_yield y) +{ + if (blind) { + return 0; + } + RGWRados *store = target->get_store(); + + if (write_tag && write_tag->length()) { + optag = string(write_tag->c_str(), write_tag->length()); + } else { + if (optag.empty()) { + append_rand_alpha(store->ctx(), optag, optag, 32); + } + } + + int r = guard_reshard(dpp, obj, nullptr, [&](BucketShard *bs) -> int { + return store->cls_obj_prepare_op(dpp, *bs, op, optag, obj, bilog_flags, y, zones_trace); + }); + + if (r < 0) { + return r; + } + prepared = true; + + return 0; +} + +int RGWRados::Bucket::UpdateIndex::complete(const DoutPrefixProvider *dpp, int64_t poolid, uint64_t epoch, + uint64_t size, uint64_t accounted_size, + ceph::real_time& ut, const string& etag, + const string& content_type, const string& storage_class, + bufferlist *acl_bl, + RGWObjCategory category, + list *remove_objs, const string *user_data, + bool appendable) +{ + if (blind) { + return 0; + } + RGWRados *store = target->get_store(); + BucketShard *bs = nullptr; + + int ret = get_bucket_shard(&bs, dpp); + if (ret < 0) { + ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl; + return ret; + } + + rgw_bucket_dir_entry ent; + obj.key.get_index_key(&ent.key); + ent.meta.size = size; + ent.meta.accounted_size = accounted_size; + ent.meta.mtime = ut; + ent.meta.etag = etag; + ent.meta.storage_class = storage_class; + if (user_data) + ent.meta.user_data = *user_data; + + ACLOwner owner; + if (acl_bl && acl_bl->length()) { + int ret = store->decode_policy(dpp, *acl_bl, &owner); + if (ret < 0) { + ldpp_dout(dpp, 0) << "WARNING: could not decode policy ret=" << ret << dendl; + } + } + ent.meta.owner = owner.get_id().to_str(); + ent.meta.owner_display_name = owner.get_display_name(); + ent.meta.content_type = content_type; + ent.meta.appendable = appendable; 
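/* [editor's note, not part of this patch] UpdateIndex runs a two-phase
 * protocol against the bucket index shard, keyed by `optag`:
 *
 *   index_op.prepare(dpp, CLS_RGW_OP_ADD, &write_tag, y);  // log pending op
 *   // ... write (or remove) the head object ...
 *   index_op.complete(...);      // or complete_del(): commit on success
 *   index_op.cancel(...);        // roll back on failure
 *
 * The tag ties the prepare entry to its completion so the index can
 * reconcile operations that die in between. */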
+ + ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace); + + add_datalog_entry(dpp, store->svc.datalog_rados, + target->bucket_info, bs->shard_id); + + return ret; +} + +int RGWRados::Bucket::UpdateIndex::complete_del(const DoutPrefixProvider *dpp, + int64_t poolid, uint64_t epoch, + real_time& removed_mtime, + list *remove_objs) +{ + if (blind) { + return 0; + } + RGWRados *store = target->get_store(); + BucketShard *bs = nullptr; + + int ret = get_bucket_shard(&bs, dpp); + if (ret < 0) { + ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl; + return ret; + } + + ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace); + + add_datalog_entry(dpp, store->svc.datalog_rados, + target->bucket_info, bs->shard_id); + + return ret; +} + + +int RGWRados::Bucket::UpdateIndex::cancel(const DoutPrefixProvider *dpp, + list *remove_objs) +{ + if (blind) { + return 0; + } + RGWRados *store = target->get_store(); + BucketShard *bs; + + int ret = guard_reshard(dpp, obj, &bs, [&](BucketShard *bs) -> int { + return store->cls_obj_complete_cancel(*bs, optag, obj, remove_objs, bilog_flags, zones_trace); + }); + + /* + * need to update data log anyhow, so that whoever follows needs to update its internal markers + * for following the specific bucket shard log. Otherwise they end up staying behind, and users + * have no way to tell that they're all caught up + */ + add_datalog_entry(dpp, store->svc.datalog_rados, + target->bucket_info, bs->shard_id); + + return ret; +} + +/* + * Read up through index `end` inclusive. Number of bytes read is up + * to `end - ofs + 1`. + */ +int RGWRados::Object::Read::read(int64_t ofs, int64_t end, + bufferlist& bl, optional_yield y, + const DoutPrefixProvider *dpp) +{ + RGWRados *store = source->get_store(); + + rgw_raw_obj read_obj; + uint64_t read_ofs = ofs; + uint64_t len, read_len; + bool reading_from_head = true; + ObjectReadOperation op; + + bool merge_bl = false; + bufferlist *pbl = &bl; + bufferlist read_bl; + uint64_t max_chunk_size; + + RGWObjState *astate; + RGWObjManifest *manifest = nullptr; + int r = source->get_state(dpp, &astate, &manifest, true, y); + if (r < 0) + return r; + + if (astate->size == 0) { + end = 0; + } else if (end >= (int64_t)astate->size) { + end = astate->size - 1; + } + + if (end < 0) + len = 0; + else + len = end - ofs + 1; + + if (manifest && manifest->has_tail()) { + /* now get the relevant object part */ + RGWObjManifest::obj_iterator iter = manifest->obj_find(dpp, ofs); + + uint64_t stripe_ofs = iter.get_stripe_ofs(); + read_obj = iter.get_location().get_raw_obj(store->driver); + len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs)); + read_ofs = iter.location_ofs() + (ofs - stripe_ofs); + reading_from_head = (read_obj == state.head_obj); + } else { + read_obj = state.head_obj; + } + + r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size, dpp); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl; + return r; + } + + if (len > max_chunk_size) + len = max_chunk_size; + + + read_len = len; + + if (reading_from_head) { + /* only when reading from the head object do we need to do the atomic test */ + std::unique_ptr obj = source->bucket->get_object(state.obj.key); + r = store->append_atomic_test(dpp, source->get_bucket_info(), obj.get(), op, &astate, &manifest, y); + if (r < 0) + return r; + + if (astate && 
astate->prefetch_data) { + if (!ofs && astate->data.length() >= len) { + bl = astate->data; + return bl.length(); + } + + if (ofs < astate->data.length()) { + unsigned copy_len = std::min((uint64_t)astate->data.length() - ofs, len); + astate->data.begin(ofs).copy(copy_len, bl); + read_len -= copy_len; + read_ofs += copy_len; + if (!read_len) + return bl.length(); + + merge_bl = true; + pbl = &read_bl; + } + } + } + + ldpp_dout(dpp, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl; + op.read(read_ofs, read_len, pbl, NULL); + + if (state.cur_pool != read_obj.pool) { + auto iter = state.io_ctxs.find(read_obj.pool); + if (iter == state.io_ctxs.end()) { + state.cur_ioctx = &state.io_ctxs[read_obj.pool]; + r = store->open_pool_ctx(dpp, read_obj.pool, *state.cur_ioctx, false); + if (r < 0) { + ldpp_dout(dpp, 20) << "ERROR: failed to open pool context for pool=" << read_obj.pool << " r=" << r << dendl; + return r; + } + } else { + state.cur_ioctx = &iter->second; + } + state.cur_pool = read_obj.pool; + } + + state.cur_ioctx->locator_set_key(read_obj.loc); + + r = state.cur_ioctx->operate(read_obj.oid, &op, NULL); + ldpp_dout(dpp, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl; + + if (r < 0) { + return r; + } + + if (merge_bl) { + bl.append(read_bl); + } + + return bl.length(); +} + +int get_obj_data::flush(rgw::AioResultList&& results) { + int r = rgw::check_for_errors(results); + if (r < 0) { + return r; + } + std::list bl_list; + + auto cmp = [](const auto& lhs, const auto& rhs) { return lhs.id < rhs.id; }; + results.sort(cmp); // merge() requires results to be sorted first + completed.merge(results, cmp); // merge results in sorted order + + while (!completed.empty() && completed.front().id == offset) { + auto bl = std::move(completed.front().data); + + bl_list.push_back(bl); + offset += bl.length(); + int r = client_cb->handle_data(bl, 0, bl.length()); + if (r < 0) { + return r; + } + + if (rgwrados->get_use_datacache()) { + const std::lock_guard l(d3n_get_data.d3n_lock); + auto oid = completed.front().obj.get_ref().obj.oid; + if (bl.length() <= g_conf()->rgw_get_obj_max_req_size && !d3n_bypass_cache_write) { + lsubdout(g_ceph_context, rgw_datacache, 10) << "D3nDataCache: " << __func__ << "(): bl.length <= rgw_get_obj_max_req_size (default 4MB) - write to datacache, bl.length=" << bl.length() << dendl; + rgwrados->d3n_data_cache->put(bl, bl.length(), oid); + } else { + lsubdout(g_ceph_context, rgw_datacache, 10) << "D3nDataCache: " << __func__ << "(): not writing to datacache - bl.length > rgw_get_obj_max_req_size (default 4MB), bl.length=" << bl.length() << " or d3n_bypass_cache_write=" << d3n_bypass_cache_write << dendl; + } + } + completed.pop_front_and_dispose(std::default_delete{}); + } + return 0; +} + +static int _get_obj_iterate_cb(const DoutPrefixProvider *dpp, + const rgw_raw_obj& read_obj, off_t obj_ofs, + off_t read_ofs, off_t len, bool is_head_obj, + RGWObjState *astate, void *arg) +{ + struct get_obj_data* d = static_cast(arg); + return d->rgwrados->get_obj_iterate_cb(dpp, read_obj, obj_ofs, read_ofs, len, + is_head_obj, astate, arg); +} + +int RGWRados::get_obj_iterate_cb(const DoutPrefixProvider *dpp, + const rgw_raw_obj& read_obj, off_t obj_ofs, + off_t read_ofs, off_t len, bool is_head_obj, + RGWObjState *astate, void *arg) +{ + ObjectReadOperation op; + struct get_obj_data* d = static_cast(arg); + string oid, key; + + if (is_head_obj) { + /* only when reading from the head object do we need to do 
the atomic test */ + int r = append_atomic_test(dpp, astate, op); + if (r < 0) + return r; + + if (astate && + obj_ofs < astate->data.length()) { + unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len); + + r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len); + if (r < 0) + return r; + + len -= chunk_len; + d->offset += chunk_len; + read_ofs += chunk_len; + obj_ofs += chunk_len; + if (!len) + return 0; + } + } + + auto obj = d->rgwrados->svc.rados->obj(read_obj); + int r = obj.open(dpp); + if (r < 0) { + ldpp_dout(dpp, 4) << "failed to open rados context for " << read_obj << dendl; + return r; + } + + ldpp_dout(dpp, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl; + op.read(read_ofs, len, nullptr, nullptr); + + const uint64_t cost = len; + const uint64_t id = obj_ofs; // use logical object offset for sorting replies + + auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id); + + return d->flush(std::move(completed)); +} + +int RGWRados::Object::Read::iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb, + optional_yield y) +{ + RGWRados *store = source->get_store(); + CephContext *cct = store->ctx(); + const uint64_t chunk_size = cct->_conf->rgw_get_obj_max_req_size; + const uint64_t window_size = cct->_conf->rgw_get_obj_window_size; + + auto aio = rgw::make_throttle(window_size, y); + get_obj_data data(store, cb, &*aio, ofs, y); + + int r = store->iterate_obj(dpp, source->get_ctx(), source->get_bucket_info(), + source->get_target(), + ofs, end, chunk_size, _get_obj_iterate_cb, &data, y); + if (r < 0) { + ldpp_dout(dpp, 0) << "iterate_obj() failed with " << r << dendl; + data.cancel(); // drain completions without writing back to client + return r; + } + + return data.drain(); +} + +int RGWRados::iterate_obj(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, + RGWBucketInfo& bucket_info, rgw::sal::Object* obj, + off_t ofs, off_t end, uint64_t max_chunk_size, + iterate_obj_cb cb, void *arg, optional_yield y) +{ + rgw_raw_obj head_obj; + rgw_raw_obj read_obj; + uint64_t read_ofs = ofs; + uint64_t len; + bool reading_from_head = true; + RGWObjState *astate = NULL; + RGWObjManifest *manifest = nullptr; + + obj_to_raw(bucket_info.placement_rule, obj->get_obj(), &head_obj); + + int r = get_obj_state(dpp, &obj_ctx, bucket_info, obj, &astate, &manifest, false, y); + if (r < 0) { + return r; + } + + if (end < 0) + len = 0; + else + len = end - ofs + 1; + + if (manifest) { + /* now get the relevant object stripe */ + RGWObjManifest::obj_iterator iter = manifest->obj_find(dpp, ofs); + + RGWObjManifest::obj_iterator obj_end = manifest->obj_end(dpp); + + for (; iter != obj_end && ofs <= end; ++iter) { + off_t stripe_ofs = iter.get_stripe_ofs(); + off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size(); + + while (ofs < next_stripe_ofs && ofs <= end) { + read_obj = iter.get_location().get_raw_obj(driver); + uint64_t read_len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs)); + read_ofs = iter.location_ofs() + (ofs - stripe_ofs); + + if (read_len > max_chunk_size) { + read_len = max_chunk_size; + } + + reading_from_head = (read_obj == head_obj); + r = cb(dpp, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg); + if (r < 0) { + return r; + } + + len -= read_len; + ofs += read_len; + } + } + } else { + while (ofs <= end) { + read_obj = head_obj; + uint64_t read_len = 
std::min(len, max_chunk_size); + + r = cb(dpp, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg); + if (r < 0) { + return r; + } + + len -= read_len; + ofs += read_len; + } + } + + return 0; +} + +int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op) +{ + rgw_rados_ref ref; + int r = get_obj_head_ref(dpp, bucket_info, obj, &ref); + if (r < 0) { + return r; + } + + return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, null_yield); +} + +int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op) +{ + rgw_rados_ref ref; + int r = get_obj_head_ref(dpp, bucket_info, obj, &ref); + if (r < 0) { + return r; + } + + bufferlist outbl; + + return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, &outbl, null_yield); +} + +int RGWRados::olh_init_modification_impl(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag) +{ + ObjectWriteOperation op; + + ceph_assert(olh_obj.key.instance.empty()); + + bool has_tag = (state.exists && has_olh_tag(state.attrset)); + + if (!state.exists) { + op.create(true); + } else { + op.assert_exists(); + struct timespec mtime_ts = real_clock::to_timespec(state.mtime); + op.mtime2(&mtime_ts); + } + + /* + * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object. + * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two + * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to + * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh + * log will reflect that. + * + * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag + * is used for object data instance, olh_tag for olh instance. 
+ */ + if (has_tag) { + /* guard against racing writes */ + bucket_index_guard_olh_op(dpp, state, op); + } + + if (!has_tag) { + /* obj tag */ + string obj_tag = gen_rand_alphanumeric_lower(cct, 32); + + bufferlist bl; + bl.append(obj_tag.c_str(), obj_tag.size()); + op.setxattr(RGW_ATTR_ID_TAG, bl); + + state.attrset[RGW_ATTR_ID_TAG] = bl; + state.obj_tag = bl; + + /* olh tag */ + string olh_tag = gen_rand_alphanumeric_lower(cct, 32); + + bufferlist olh_bl; + olh_bl.append(olh_tag.c_str(), olh_tag.size()); + op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl); + + state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl; + state.olh_tag = olh_bl; + state.is_olh = true; + + bufferlist verbl; + op.setxattr(RGW_ATTR_OLH_VER, verbl); + } + + bufferlist bl; + RGWOLHPendingInfo pending_info; + pending_info.time = real_clock::now(); + encode(pending_info, bl); + +#define OLH_PENDING_TAG_LEN 32 + /* tag will start with current time epoch, this so that entries are sorted by time */ + char buf[32]; + utime_t ut(pending_info.time); + snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec()); + *op_tag = buf; + + string s = gen_rand_alphanumeric_lower(cct, OLH_PENDING_TAG_LEN - op_tag->size()); + + op_tag->append(s); + + string attr_name = RGW_ATTR_OLH_PENDING_PREFIX; + attr_name.append(*op_tag); + + op.setxattr(attr_name.c_str(), bl); + + int ret = obj_operate(dpp, bucket_info, olh_obj, &op); + if (ret < 0) { + return ret; + } + + state.exists = true; + state.attrset[attr_name] = bl; + + return 0; +} + +int RGWRados::olh_init_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag) +{ + int ret; + + ret = olh_init_modification_impl(dpp, bucket_info, state, obj, op_tag); + if (ret == -EEXIST) { + ret = -ECANCELED; + } + + return ret; +} + +int RGWRados::guard_reshard(const DoutPrefixProvider *dpp, + BucketShard *bs, + const rgw_obj& obj_instance, + RGWBucketInfo& bucket_info, + std::function call) +{ + rgw_obj obj; + const rgw_obj *pobj = &obj_instance; + int r; + + for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) { + r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */, dpp); + if (r < 0) { + ldpp_dout(dpp, 5) << "bs.init() returned ret=" << r << dendl; + return r; + } + + r = call(bs); + if (r != -ERR_BUSY_RESHARDING) { + break; + } + + ldpp_dout(dpp, 10) << + "NOTICE: resharding operation on bucket index detected, blocking. obj=" << + obj_instance.key << dendl; + + r = block_while_resharding(bs, obj_instance, bucket_info, null_yield, dpp); + if (r == -ERR_BUSY_RESHARDING) { + ldpp_dout(dpp, 10) << __func__ << + " NOTICE: block_while_resharding() still busy. obj=" << + obj_instance.key << dendl; + continue; + } else if (r < 0) { + ldpp_dout(dpp, 0) << __func__ << + " ERROR: block_while_resharding() failed. obj=" << + obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl; + return r; + } + + ldpp_dout(dpp, 20) << "reshard completion identified" << dendl; + i = 0; /* resharding is finished, make sure we can retry */ + } // for loop + + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: bucket shard callback failed. obj=" << + obj_instance.key << ". 
ret=" << cpp_strerror(-r) << dendl; + return r; + } + + return 0; +} + + +int RGWRados::block_while_resharding(RGWRados::BucketShard *bs, + const rgw_obj& obj_instance, + RGWBucketInfo& bucket_info, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + int ret = 0; + cls_rgw_bucket_instance_entry entry; + + // gets loaded by fetch_new_bucket_info; can be used by + // clear_resharding + std::map bucket_attrs; + + // since we want to run this recovery code from two distinct places, + // let's just put it in a lambda so we can easily re-use; if the + // lambda successfully fetches a new bucket id, it sets + // new_bucket_id and returns 0, otherwise it returns a negative + // error code + auto fetch_new_bucket_info = + [this, bs, &obj_instance, &bucket_info, &bucket_attrs, &y, dpp](const std::string& log_tag) -> int { + int ret = get_bucket_info(&svc, bs->bucket.tenant, bs->bucket.name, + bucket_info, nullptr, y, dpp, &bucket_attrs); + if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << + " ERROR: failed to refresh bucket info after reshard at " << + log_tag << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + + ret = bs->init(dpp, bucket_info, obj_instance); + if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << + " ERROR: failed to refresh bucket shard generation after reshard at " << + log_tag << ": " << cpp_strerror(-ret) << dendl; + return ret; + } + + const auto gen = bucket_info.layout.logs.empty() ? -1 : bucket_info.layout.logs.back().gen; + ldpp_dout(dpp, 20) << __func__ << + " INFO: refreshed bucket info after reshard at " << + log_tag << ". new shard_id=" << bs->shard_id << ". gen=" << gen << dendl; + + return 0; + }; // lambda fetch_new_bucket_info + + constexpr int num_retries = 10; + for (int i = 1; i <= num_retries; i++) { // nb: 1-based for loop + auto& ref = bs->bucket_obj.get_ref(); + ret = cls_rgw_get_bucket_resharding(ref.pool.ioctx(), ref.obj.oid, &entry); + if (ret == -ENOENT) { + ret = fetch_new_bucket_info("get_bucket_resharding_failed"); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + " failed to refresh bucket info after reshard when get bucket " + "resharding failed, error: " << cpp_strerror(-ret) << dendl; + return ret; + } + } else if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << + " ERROR: failed to get bucket resharding : " << cpp_strerror(-ret) << + dendl; + return ret; + } + + if (!entry.resharding_in_progress()) { + ret = fetch_new_bucket_info("get_bucket_resharding_succeeded"); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + " failed to refresh bucket info after reshard when get bucket " + "resharding succeeded, error: " << cpp_strerror(-ret) << dendl; + return ret; + } + } + + ldpp_dout(dpp, 20) << __func__ << " NOTICE: reshard still in progress; " << + (i < num_retries ? "retrying" : "too many retries") << dendl; + + if (i == num_retries) { + break; + } + + // If bucket is erroneously marked as resharding (e.g., crash or + // other error) then fix it. If we can take the bucket reshard + // lock then it means no other resharding should be taking place, + // and we're free to clear the flags. 
+ { + // since we expect to do this rarely, we'll do our work in a + // block and erase our work after each try + + RGWObjectCtx obj_ctx(this->driver); + const rgw_bucket& b = bs->bucket; + std::string bucket_id = b.get_key(); + RGWBucketReshardLock reshard_lock(this->driver, bucket_info, true); + ret = reshard_lock.lock(dpp); + if (ret == -ENOENT) { + continue; + } else if (ret < 0) { + ldpp_dout(dpp, 20) << __func__ << + " ERROR: failed to take reshard lock for bucket " << + bucket_id << "; expected if resharding underway" << dendl; + } else { + ldpp_dout(dpp, 10) << __func__ << + " INFO: was able to take reshard lock for bucket " << + bucket_id << dendl; + // the reshard may have finished, so call clear_resharding() + // with its current bucket info; ALSO this will load + // bucket_attrs for call to clear_resharding below + ret = fetch_new_bucket_info("trying_to_clear_resharding"); + if (ret < 0) { + reshard_lock.unlock(); + ldpp_dout(dpp, 0) << __func__ << + " ERROR: failed to update bucket info before clear resharding for bucket " << + bucket_id << dendl; + continue; // try again + } + + ret = RGWBucketReshard::clear_resharding(this->driver, bucket_info, bucket_attrs, dpp); + reshard_lock.unlock(); + if (ret == -ENOENT) { + ldpp_dout(dpp, 5) << __func__ << + " INFO: no need to reset reshard flags; old shards apparently" + " removed after successful resharding of bucket " << + bucket_id << dendl; + continue; // immediately test again + } else if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << + " ERROR: failed to clear resharding flags for bucket " << + bucket_id << ", " << cpp_strerror(-ret) << dendl; + // wait and then test again + } else { + ldpp_dout(dpp, 5) << __func__ << + " INFO: apparently successfully cleared resharding flags for " + "bucket " << bucket_id << dendl; + continue; // if we apparently succeed immediately test again + } // if clear resharding succeeded + } // if taking of lock succeeded + } // block to encapsulate recovery from incomplete reshard + + ret = reshard_wait->wait(y); + if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << + " ERROR: bucket is still resharding, please retry" << dendl; + return ret; + } + } // for loop + + ldpp_dout(dpp, 0) << __func__ << + " ERROR: bucket is still resharding, please retry" << dendl; + return -ERR_BUSY_RESHARDING; +} + +int RGWRados::bucket_index_link_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, + RGWObjState& olh_state, const rgw_obj& obj_instance, + bool delete_marker, const string& op_tag, + struct rgw_bucket_dir_entry_meta *meta, + uint64_t olh_epoch, + real_time unmod_since, bool high_precision_time, + rgw_zone_set *_zones_trace, bool log_data_change) +{ + rgw_rados_ref ref; + int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref); + if (r < 0) { + return r; + } + + rgw_zone_set zones_trace; + if (_zones_trace) { + zones_trace = *_zones_trace; + } + zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key()); + + BucketShard bs(this); + + r = guard_reshard(dpp, &bs, obj_instance, bucket_info, + [&](BucketShard *bs) -> int { + cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance); + auto& ref = bs->bucket_obj.get_ref(); + librados::ObjectWriteOperation op; + op.assert_exists(); // bucket index shard must exist + cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING); + cls_rgw_bucket_link_olh(op, key, olh_state.olh_tag, + delete_marker, op_tag, meta, olh_epoch, + unmod_since, high_precision_time, + svc.zone->get_zone().log_data, zones_trace); + 
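+ // cls_rgw_guard_bucket_resharding() above makes this write fail with -ERR_BUSY_RESHARDING while the shard is mid-reshard; guard_reshard() catches that, waits via block_while_resharding(), and then retries this callback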
return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); + }); + if (r < 0) { + ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_olh() returned r=" << r << dendl; + return r; + } + + add_datalog_entry(dpp, svc.datalog_rados, bucket_info, bs.shard_id); + + return 0; +} + +void RGWRados::bucket_index_guard_olh_op(const DoutPrefixProvider *dpp, RGWObjState& olh_state, ObjectOperation& op) +{ + ldpp_dout(dpp, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl; + op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag); +} + +int RGWRados::bucket_index_unlink_instance(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + const rgw_obj& obj_instance, + const string& op_tag, const string& olh_tag, + uint64_t olh_epoch, rgw_zone_set *_zones_trace) +{ + rgw_rados_ref ref; + int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref); + if (r < 0) { + return r; + } + + rgw_zone_set zones_trace; + if (_zones_trace) { + zones_trace = *_zones_trace; + } + zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key()); + + BucketShard bs(this); + + cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance); + r = guard_reshard(dpp, &bs, obj_instance, bucket_info, + [&](BucketShard *bs) -> int { + auto& ref = bs->bucket_obj.get_ref(); + librados::ObjectWriteOperation op; + op.assert_exists(); // bucket index shard must exist + cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING); + cls_rgw_bucket_unlink_instance(op, key, op_tag, + olh_tag, olh_epoch, svc.zone->get_zone().log_data, zones_trace); + return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); + }); + if (r < 0) { + ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_unlink_instance() returned r=" << r << dendl; + return r; + } + + return 0; +} + +int RGWRados::bucket_index_read_olh_log(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, RGWObjState& state, + const rgw_obj& obj_instance, uint64_t ver_marker, + std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> > *log, + bool *is_truncated) +{ + rgw_rados_ref ref; + int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref); + if (r < 0) { + return r; + } + + BucketShard bs(this); + int ret = + bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp); + if (ret < 0) { + ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl; + return ret; + } + + string olh_tag(state.olh_tag.c_str(), state.olh_tag.length()); + + cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string()); + + auto& shard_ref = bs.bucket_obj.get_ref(); + ObjectReadOperation op; + + rgw_cls_read_olh_log_ret log_ret; + int op_ret = 0; + cls_rgw_get_olh_log(op, key, ver_marker, olh_tag, log_ret, op_ret); + bufferlist outbl; + r = rgw_rados_operate(dpp, shard_ref.pool.ioctx(), shard_ref.obj.oid, &op, &outbl, null_yield); + if (r < 0) { + return r; + } + if (op_ret < 0) { + ldpp_dout(dpp, 20) << "cls_rgw_get_olh_log() returned op_ret=" << op_ret << dendl; + return op_ret; + } + + *log = std::move(log_ret.log); + *is_truncated = log_ret.is_truncated; + + return 0; +} + +// a multisite sync bug resulted in the OLH head attributes being overwritten by + the attributes from another zone, causing link_olh() to fail endlessly due to + olh_tag mismatch. this attempts to detect this case and reconstruct the OLH + attributes from the bucket index. 
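+ (the olh entry kept in the bucket index is treated as the authoritative copy here);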
see http://tracker.ceph.com/issues/37792 +int RGWRados::repair_olh(const DoutPrefixProvider *dpp, RGWObjState* state, const RGWBucketInfo& bucket_info, + const rgw_obj& obj) +{ + // fetch the current olh entry from the bucket index + rgw_bucket_olh_entry olh; + int r = bi_get_olh(dpp, bucket_info, obj, &olh); + if (r < 0) { + ldpp_dout(dpp, 0) << "repair_olh failed to read olh entry for " << obj << dendl; + return r; + } + if (olh.tag == rgw_bl_str(state->olh_tag)) { // mismatch already resolved? + return 0; + } + + ldpp_dout(dpp, 4) << "repair_olh setting olh_tag=" << olh.tag + << " key=" << olh.key << " delete_marker=" << olh.delete_marker << dendl; + + // rewrite OLH_ID_TAG and OLH_INFO from current olh + ObjectWriteOperation op; + // assert this is the same olh tag we think we're fixing + bucket_index_guard_olh_op(dpp, *state, op); + // preserve existing mtime + struct timespec mtime_ts = ceph::real_clock::to_timespec(state->mtime); + op.mtime2(&mtime_ts); + { + bufferlist bl; + bl.append(olh.tag.c_str(), olh.tag.size()); + op.setxattr(RGW_ATTR_OLH_ID_TAG, bl); + } + { + RGWOLHInfo info; + info.target = rgw_obj(bucket_info.bucket, olh.key); + info.removed = olh.delete_marker; + bufferlist bl; + encode(info, bl); + op.setxattr(RGW_ATTR_OLH_INFO, bl); + } + rgw_rados_ref ref; + r = get_obj_head_ref(dpp, bucket_info, obj, &ref); + if (r < 0) { + return r; + } + r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); + if (r < 0) { + ldpp_dout(dpp, 0) << "repair_olh failed to write olh attributes with " + << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +int RGWRados::bucket_index_trim_olh_log(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + RGWObjState& state, + const rgw_obj& obj_instance, uint64_t ver) +{ + rgw_rados_ref ref; + int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref); + if (r < 0) { + return r; + } + + BucketShard bs(this); + int ret = + bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp); + if (ret < 0) { + ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl; + return ret; + } + + string olh_tag(state.olh_tag.c_str(), state.olh_tag.length()); + + cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string()); + + ret = guard_reshard(dpp, &bs, obj_instance, bucket_info, + [&](BucketShard *pbs) -> int { + ObjectWriteOperation op; + op.assert_exists(); // bucket index shard must exist + cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING); + cls_rgw_trim_olh_log(op, key, ver, olh_tag); + return pbs->bucket_obj.operate(dpp, &op, null_yield); + }); + if (ret < 0) { + ldpp_dout(dpp, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl; + return ret; + } + + return 0; +} + +int RGWRados::bucket_index_clear_olh(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + RGWObjState& state, + const rgw_obj& obj_instance) +{ + rgw_rados_ref ref; + int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref); + if (r < 0) { + return r; + } + + BucketShard bs(this); + + string olh_tag(state.olh_tag.c_str(), state.olh_tag.length()); + + cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string()); + + int ret = guard_reshard(dpp, &bs, obj_instance, bucket_info, + [&](BucketShard *pbs) -> int { + ObjectWriteOperation op; + op.assert_exists(); // bucket index shard must exist + auto& ref = pbs->bucket_obj.get_ref(); + cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING); + cls_rgw_clear_olh(op, key, olh_tag); + return rgw_rados_operate(dpp, ref.pool.ioctx(), 
ref.obj.oid, &op, null_yield); + }); + if (ret < 0) { + ldpp_dout(dpp, 5) << "rgw_rados_operate() after cls_rgw_clear_olh() returned ret=" << ret << dendl; + return ret; + } + + return 0; +} + +static int decode_olh_info(const DoutPrefixProvider *dpp, CephContext* cct, const bufferlist& bl, RGWOLHInfo *olh) +{ + try { + auto biter = bl.cbegin(); + decode(*olh, biter); + return 0; + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode olh info" << dendl; + return -EIO; + } +} + +int RGWRados::apply_olh_log(const DoutPrefixProvider *dpp, + RGWObjState& state, + RGWBucketInfo& bucket_info, + const rgw::sal::Object* obj, + bufferlist& olh_tag, + std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> >& log, + uint64_t *plast_ver, + rgw_zone_set* zones_trace) +{ + if (log.empty()) { + return 0; + } + + librados::ObjectWriteOperation op; + + uint64_t last_ver = log.rbegin()->first; + *plast_ver = last_ver; + + map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin(); + + op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag); + op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GTE, last_ver); + + bufferlist ver_bl; + string last_ver_s = to_string(last_ver); + ver_bl.append(last_ver_s.c_str(), last_ver_s.size()); + op.setxattr(RGW_ATTR_OLH_VER, ver_bl); + + struct timespec mtime_ts = real_clock::to_timespec(state.mtime); + op.mtime2(&mtime_ts); + + bool need_to_link = false; + uint64_t link_epoch = 0; + cls_rgw_obj_key key; + bool delete_marker = false; + list<cls_rgw_obj_key> remove_instances; + bool need_to_remove = false; + + // decode current epoch and instance + auto olh_ver = state.attrset.find(RGW_ATTR_OLH_VER); + if (olh_ver != state.attrset.end()) { + std::string str = olh_ver->second.to_str(); + std::string err; + link_epoch = strict_strtoll(str.c_str(), 10, &err); + } + auto olh_info = state.attrset.find(RGW_ATTR_OLH_INFO); + if (olh_info != state.attrset.end()) { + RGWOLHInfo info; + int r = decode_olh_info(dpp, cct, olh_info->second, &info); + if (r < 0) { + return r; + } + info.target.key.get_index_key(&key); + delete_marker = info.removed; + } + + for (iter = log.begin(); iter != log.end(); ++iter) { + vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin(); + for (; viter != iter->second.end(); ++viter) { + rgw_bucket_olh_log_entry& entry = *viter; + + ldpp_dout(dpp, 20) << "olh_log_entry: epoch=" << iter->first << " op=" << (int)entry.op + << " key=" << entry.key.name << "[" << entry.key.instance << "] " + << (entry.delete_marker ? 
"(delete)" : "") << dendl; + switch (entry.op) { + case CLS_RGW_OLH_OP_REMOVE_INSTANCE: + remove_instances.push_back(entry.key); + break; + case CLS_RGW_OLH_OP_LINK_OLH: + // only overwrite a link of the same epoch if its key sorts before + if (link_epoch < iter->first || key.instance.empty() || + key.instance > entry.key.instance) { + ldpp_dout(dpp, 20) << "apply_olh_log applying key=" << entry.key << " epoch=" << iter->first << " delete_marker=" << entry.delete_marker + << " over current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl; + need_to_link = true; + need_to_remove = false; + key = entry.key; + delete_marker = entry.delete_marker; + } else { + ldpp_dout(dpp, 20) << "apply_olh skipping key=" << entry.key<< " epoch=" << iter->first << " delete_marker=" << entry.delete_marker + << " before current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl; + } + break; + case CLS_RGW_OLH_OP_UNLINK_OLH: + need_to_remove = true; + need_to_link = false; + break; + default: + ldpp_dout(dpp, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl; + return -EIO; + } + string attr_name = RGW_ATTR_OLH_PENDING_PREFIX; + attr_name.append(entry.op_tag); + op.rmxattr(attr_name.c_str()); + } + } + + rgw_rados_ref ref; + int r = get_obj_head_ref(dpp, bucket_info, obj->get_obj(), &ref); + if (r < 0) { + return r; + } + + rgw::sal::Bucket* bucket = obj->get_bucket(); + + if (need_to_link) { + rgw_obj target(bucket->get_key(), key); + RGWOLHInfo info; + info.target = target; + info.removed = delete_marker; + bufferlist bl; + encode(info, bl); + op.setxattr(RGW_ATTR_OLH_INFO, bl); + } + + /* first remove object instances */ + for (list::iterator liter = remove_instances.begin(); + liter != remove_instances.end(); ++liter) { + cls_rgw_obj_key& key = *liter; + std::unique_ptr obj_instance = bucket->get_object(key); + int ret = delete_obj(dpp, bucket_info, obj_instance.get(), 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace); + if (ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl; + return ret; + } + } + + /* update olh object */ + r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl; + return r; + } + + r = bucket_index_trim_olh_log(dpp, bucket_info, state, obj->get_obj(), last_ver); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: could not trim olh log, r=" << r << dendl; + return r; + } + + if (need_to_remove) { + ObjectWriteOperation rm_op; + + rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag); + rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_EQ, last_ver); + cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */ + rm_op.remove(); + + r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &rm_op, null_yield); + if (r == -ECANCELED) { + return 0; /* someone else won this race */ + } else { + /* + * only clear if was successful, otherwise we might clobber pending operations on this object + */ + r = bucket_index_clear_olh(dpp, bucket_info, state, obj->get_obj()); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl; + return r; + } + } + } + + return 0; +} + +/* + * read olh log and apply it + */ +int RGWRados::update_olh(const DoutPrefixProvider *dpp, RGWObjState 
*state, RGWBucketInfo& bucket_info, const rgw::sal::Object* obj, rgw_zone_set *zones_trace) +{ + map > log; + bool is_truncated; + uint64_t ver_marker = 0; + + do { + int ret = bucket_index_read_olh_log(dpp, bucket_info, *state, obj->get_obj(), ver_marker, &log, &is_truncated); + if (ret < 0) { + return ret; + } + ret = apply_olh_log(dpp, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace); + if (ret < 0) { + return ret; + } + } while (is_truncated); + + return 0; +} + +int RGWRados::set_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, + RGWBucketInfo& bucket_info, + rgw::sal::Object* target_obj, bool delete_marker, + rgw_bucket_dir_entry_meta *meta, + uint64_t olh_epoch, real_time unmod_since, bool high_precision_time, + optional_yield y, rgw_zone_set *zones_trace, bool log_data_change) +{ + string op_tag; + + std::unique_ptr olh_obj = target_obj->clone(); + olh_obj->clear_instance(); + + RGWObjState *state = NULL; + RGWObjManifest *manifest = nullptr; + + int ret = 0; + int i; + +#define MAX_ECANCELED_RETRY 100 + for (i = 0; i < MAX_ECANCELED_RETRY; i++) { + if (ret == -ECANCELED) { + olh_obj->invalidate(); + } + + ret = get_obj_state(dpp, &obj_ctx, bucket_info, olh_obj.get(), &state, &manifest, false, y); /* don't follow olh */ + if (ret < 0) { + return ret; + } + + ret = olh_init_modification(dpp, bucket_info, *state, olh_obj->get_obj(), &op_tag); + if (ret < 0) { + ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl; + if (ret == -ECANCELED) { + continue; + } + return ret; + } + ret = bucket_index_link_olh(dpp, bucket_info, *state, target_obj->get_obj(), + delete_marker, op_tag, meta, olh_epoch, unmod_since, + high_precision_time, zones_trace, log_data_change); + if (ret < 0) { + ldpp_dout(dpp, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl; + if (ret == -ECANCELED) { + // the bucket index rejected the link_olh() due to olh tag mismatch; + // attempt to reconstruct olh head attributes based on the bucket index + int r2 = repair_olh(dpp, state, bucket_info, olh_obj->get_obj()); + if (r2 < 0 && r2 != -ECANCELED) { + return r2; + } + continue; + } + return ret; + } + break; + } + + if (i == MAX_ECANCELED_RETRY) { + ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl; + return -EIO; + } + + ret = update_olh(dpp, state, bucket_info, olh_obj.get()); + if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */ + ret = 0; + } + if (ret < 0) { + ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl; + return ret; + } + + return 0; +} + +int RGWRados::unlink_obj_instance(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, rgw::sal::Object* target_obj, + uint64_t olh_epoch, optional_yield y, rgw_zone_set *zones_trace) +{ + string op_tag; + + std::unique_ptr olh_obj = target_obj->clone(); + olh_obj->clear_instance(); + + RGWObjState *state = NULL; + + int ret = 0; + int i; + + for (i = 0; i < MAX_ECANCELED_RETRY; i++) { + if (ret == -ECANCELED) { + olh_obj->invalidate(); + } + + ret = olh_obj->get_obj_state(dpp, &state, y, false); /* don't follow olh */ + if (ret < 0) + return ret; + + ret = olh_init_modification(dpp, bucket_info, *state, olh_obj->get_obj(), &op_tag); + if (ret < 0) { + ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " 
returned " << ret << dendl; + if (ret == -ECANCELED) { + continue; + } + return ret; + } + + string olh_tag(state->olh_tag.c_str(), state->olh_tag.length()); + + ret = bucket_index_unlink_instance(dpp, bucket_info, target_obj->get_obj(), op_tag, olh_tag, olh_epoch, zones_trace); + if (ret < 0) { + ldpp_dout(dpp, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl; + if (ret == -ECANCELED) { + continue; + } + return ret; + } + break; + } + + if (i == MAX_ECANCELED_RETRY) { + ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl; + return -EIO; + } + + ret = update_olh(dpp, state, bucket_info, olh_obj.get(), zones_trace); + if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */ + return 0; + } + if (ret < 0) { + ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl; + return ret; + } + + return 0; +} + +void RGWRados::gen_rand_obj_instance_name(rgw_obj_key *target_key) +{ +#define OBJ_INSTANCE_LEN 32 + char buf[OBJ_INSTANCE_LEN + 1]; + + gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped, + no underscore for instance name due to the way we encode the raw keys */ + + target_key->set_instance(buf); +} + +void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj) +{ + gen_rand_obj_instance_name(&target_obj->key); +} + +int RGWRados::get_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh) +{ + map attrset; + + ObjectReadOperation op; + op.getxattrs(&attrset, NULL); + + int r = obj_operate(dpp, bucket_info, obj, &op); + if (r < 0) { + return r; + } + + auto iter = attrset.find(RGW_ATTR_OLH_INFO); + if (iter == attrset.end()) { /* not an olh */ + return -EINVAL; + } + + return decode_olh_info(dpp, cct, iter->second, olh); +} + +void RGWRados::check_pending_olh_entries(const DoutPrefixProvider *dpp, + map& pending_entries, + map *rm_pending_entries) +{ + map::iterator iter = pending_entries.begin(); + + real_time now = real_clock::now(); + + while (iter != pending_entries.end()) { + auto biter = iter->second.cbegin(); + RGWOLHPendingInfo pending_info; + try { + decode(pending_info, biter); + } catch (buffer::error& err) { + /* skipping bad entry, we could remove it but it might hide a bug */ + ldpp_dout(dpp, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl; + ++iter; + continue; + } + + map::iterator cur_iter = iter; + ++iter; + if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) { + (*rm_pending_entries)[cur_iter->first] = cur_iter->second; + pending_entries.erase(cur_iter); + } else { + /* entries names are sorted by time (rounded to a second) */ + break; + } + } +} + +int RGWRados::remove_olh_pending_entries(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map& pending_attrs) +{ + rgw_rados_ref ref; + int r = get_obj_head_ref(dpp, bucket_info, olh_obj, &ref); + if (r < 0) { + return r; + } + + // trim no more than 1000 entries per osd op + constexpr int max_entries = 1000; + + auto i = pending_attrs.begin(); + while (i != pending_attrs.end()) { + ObjectWriteOperation op; + bucket_index_guard_olh_op(dpp, state, op); + + for (int n = 0; n < max_entries && i != pending_attrs.end(); ++n, ++i) { + op.rmxattr(i->first.c_str()); + } + + r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); + if (r == -ENOENT 
|| r == -ECANCELED) { + /* raced with some other change, shouldn't sweat about it */ + return 0; + } + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl; + return r; + } + } + return 0; +} + +int RGWRados::follow_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, rgw::sal::Object* olh_obj, rgw_obj *target) +{ + map pending_entries; + rgw_filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries); + + map rm_pending_entries; + check_pending_olh_entries(dpp, pending_entries, &rm_pending_entries); + + if (!rm_pending_entries.empty()) { + int ret = remove_olh_pending_entries(dpp, bucket_info, *state, olh_obj->get_obj(), rm_pending_entries); + if (ret < 0) { + ldpp_dout(dpp, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl; + return ret; + } + } + if (!pending_entries.empty()) { + ldpp_dout(dpp, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj->get_bucket() << dendl; + + int ret = update_olh(dpp, state, bucket_info, olh_obj); + if (ret < 0) { + if (ret == -ECANCELED) { + // In this context, ECANCELED means that the OLH tag changed in either the bucket index entry or the OLH object. + // If the OLH tag changed, it indicates that a previous OLH entry was removed since this request started. We + // return ENOENT to indicate that the OLH object was removed. + ret = -ENOENT; + } + return ret; + } + } + + auto iter = state->attrset.find(RGW_ATTR_OLH_INFO); + if (iter == state->attrset.end()) { + return -EINVAL; + } + + RGWOLHInfo olh; + int ret = decode_olh_info(dpp, cct, iter->second, &olh); + if (ret < 0) { + return ret; + } + + if (olh.removed) { + return -ENOENT; + } + + *target = olh.target; + + return 0; +} + +int RGWRados::raw_obj_stat(const DoutPrefixProvider *dpp, + rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch, + map *attrs, bufferlist *first_chunk, + RGWObjVersionTracker *objv_tracker, optional_yield y) +{ + rgw_rados_ref ref; + int r = get_raw_obj_ref(dpp, obj, &ref); + if (r < 0) { + return r; + } + + map unfiltered_attrset; + uint64_t size = 0; + struct timespec mtime_ts; + + ObjectReadOperation op; + if (objv_tracker) { + objv_tracker->prepare_op_for_read(&op); + } + if (attrs) { + op.getxattrs(&unfiltered_attrset, NULL); + } + if (psize || pmtime) { + op.stat2(&size, &mtime_ts, NULL); + } + if (first_chunk) { + op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL); + } + bufferlist outbl; + r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, &outbl, y); + + if (epoch) { + *epoch = ref.pool.ioctx().get_last_version(); + } + + if (r < 0) + return r; + + if (psize) + *psize = size; + if (pmtime) + *pmtime = ceph::real_clock::from_timespec(mtime_ts); + if (attrs) { + rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs); + } + + return 0; +} + +int RGWRados::get_bucket_stats(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& idx_layout, + int shard_id, string *bucket_ver, string *master_ver, + map& stats, + string *max_marker, bool *syncstopped) +{ + vector headers; + map bucket_instance_ids; + int r = cls_bucket_head(dpp, bucket_info, idx_layout, shard_id, headers, &bucket_instance_ids); + if (r < 0) { + return r; + } + + ceph_assert(headers.size() == bucket_instance_ids.size()); + + auto iter = headers.begin(); + map::iterator viter = bucket_instance_ids.begin(); + BucketIndexShardsManager ver_mgr; + 
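+ // each BucketIndexShardsManager collects one value per shard and renders them all as a single composite string, letting one bucket_ver/master_ver/max_marker value describe every shard at once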
BucketIndexShardsManager master_ver_mgr; + BucketIndexShardsManager marker_mgr; + char buf[64]; + for(; iter != headers.end(); ++iter, ++viter) { + accumulate_raw_stats(*iter, stats); + snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->ver); + ver_mgr.add(viter->first, string(buf)); + snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->master_ver); + master_ver_mgr.add(viter->first, string(buf)); + if (shard_id >= 0) { + *max_marker = iter->max_marker; + } else { + marker_mgr.add(viter->first, iter->max_marker); + } + if (syncstopped != NULL) + *syncstopped = iter->syncstopped; + } + ver_mgr.to_string(bucket_ver); + master_ver_mgr.to_string(master_ver); + if (shard_id < 0) { + marker_mgr.to_string(max_marker); + } + return 0; +} + +class RGWGetBucketStatsContext : public RGWGetDirHeader_CB { + RGWGetBucketStats_CB *cb; + uint32_t pendings; + map stats; + int ret_code; + bool should_cb; + ceph::mutex lock = ceph::make_mutex("RGWGetBucketStatsContext"); + +public: + RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings) + : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true) + {} + + void handle_response(int r, rgw_bucket_dir_header& header) override { + std::lock_guard l{lock}; + if (should_cb) { + if ( r >= 0) { + accumulate_raw_stats(header, stats); + } else { + ret_code = r; + } + + // Are we all done? + if (--pendings == 0) { + if (!ret_code) { + cb->set_response(&stats); + } + cb->handle_response(ret_code); + cb->put(); + } + } + } + + void unset_cb() { + std::lock_guard l{lock}; + should_cb = false; + } +}; + +int RGWRados::get_bucket_stats_async(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetBucketStats_CB *ctx) +{ + int num_aio = 0; + RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.layout.current_index.layout.normal.num_shards ? 
: 1); + ceph_assert(get_ctx); + int r = cls_bucket_head_async(dpp, bucket_info, idx_layout, shard_id, get_ctx, &num_aio); + if (r < 0) { + ctx->put(); + if (num_aio) { + get_ctx->unset_cb(); + } + } + get_ctx->put(); + return r; +} + +int RGWRados::get_bucket_instance_info(const string& meta_key, + RGWBucketInfo& info, + real_time *pmtime, + map *pattrs, + optional_yield y, + const DoutPrefixProvider *dpp) +{ + rgw_bucket bucket; + rgw_bucket_parse_bucket_key(cct, meta_key, &bucket, nullptr); + + return get_bucket_instance_info(bucket, info, pmtime, pattrs, y, dpp); +} + +int RGWRados::get_bucket_instance_info(const rgw_bucket& bucket, RGWBucketInfo& info, + real_time *pmtime, map *pattrs, optional_yield y, + const DoutPrefixProvider *dpp) +{ + return ctl.bucket->read_bucket_instance_info(bucket, &info, + y, + dpp, + RGWBucketCtl::BucketInstance::GetParams() + .set_mtime(pmtime) + .set_attrs(pattrs)); +} + +int RGWRados::get_bucket_info(RGWServices *svc, + const string& tenant, const string& bucket_name, + RGWBucketInfo& info, + real_time *pmtime, + optional_yield y, + const DoutPrefixProvider *dpp, map *pattrs) +{ + rgw_bucket bucket; + bucket.tenant = tenant; + bucket.name = bucket_name; + return ctl.bucket->read_bucket_info(bucket, &info, y, dpp, + RGWBucketCtl::BucketInstance::GetParams() + .set_mtime(pmtime) + .set_attrs(pattrs)); +} + +int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info, + ceph::real_time *pmtime, + const DoutPrefixProvider *dpp, + map *pattrs) +{ + rgw_bucket bucket = info.bucket; + bucket.bucket_id.clear(); + + auto rv = info.objv_tracker.read_version; + + return ctl.bucket->read_bucket_info(bucket, &info, null_yield, dpp, + RGWBucketCtl::BucketInstance::GetParams() + .set_mtime(pmtime) + .set_attrs(pattrs) + .set_refresh_version(rv)); +} + +int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive, + real_time mtime, map *pattrs, + const DoutPrefixProvider *dpp) +{ + return ctl.bucket->store_bucket_instance_info(info.bucket, info, null_yield, dpp, + RGWBucketCtl::BucketInstance::PutParams() + .set_exclusive(exclusive) + .set_mtime(mtime) + .set_attrs(pattrs)); +} + +int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv, + map *pattrs, bool create_entry_point, + const DoutPrefixProvider *dpp) +{ + bool create_head = !info.has_instance_obj || create_entry_point; + + int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs, dpp); + if (ret < 0) { + return ret; + } + + if (!create_head) + return 0; /* done! 
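(the entry point object only needs a fresh write when the bucket info has no instance object yet or the caller explicitly requested an entry point) 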
*/ + + RGWBucketEntryPoint entry_point; + entry_point.bucket = info.bucket; + entry_point.owner = info.owner; + entry_point.creation_time = info.creation_time; + entry_point.linked = true; + RGWObjVersionTracker ot; + if (pep_objv && !pep_objv->tag.empty()) { + ot.write_version = *pep_objv; + } else { + ot.generate_new_write_ver(cct); + if (pep_objv) { + *pep_objv = ot.write_version; + } + } + ret = ctl.bucket->store_bucket_entrypoint_info(info.bucket, entry_point, null_yield, dpp, RGWBucketCtl::Bucket::PutParams() + .set_exclusive(exclusive) + .set_objv_tracker(&ot) + .set_mtime(mtime)); + if (ret < 0) + return ret; + + return 0; +} + +int RGWRados::update_containers_stats(map& m, const DoutPrefixProvider *dpp) +{ + map::iterator iter; + for (iter = m.begin(); iter != m.end(); ++iter) { + RGWBucketEnt& ent = iter->second; + rgw_bucket& bucket = ent.bucket; + ent.count = 0; + ent.size = 0; + ent.size_rounded = 0; + + vector headers; + + RGWBucketInfo bucket_info; + int ret = get_bucket_instance_info(bucket, bucket_info, NULL, NULL, null_yield, dpp); + if (ret < 0) { + return ret; + } + + int r = cls_bucket_head(dpp, bucket_info, bucket_info.layout.current_index, RGW_NO_SHARD, headers); + if (r < 0) + return r; + + auto hiter = headers.begin(); + for (; hiter != headers.end(); ++hiter) { + RGWObjCategory category = main_category; + auto iter = (hiter->stats).find(category); + if (iter != hiter->stats.end()) { + struct rgw_bucket_category_stats& stats = iter->second; + ent.count += stats.num_entries; + ent.size += stats.total_size; + ent.size_rounded += stats.total_size_rounded; + } + } + + // fill in placement_rule from the bucket instance for use in swift's + // per-storage policy statistics + ent.placement_rule = std::move(bucket_info.placement_rule); + } + + return m.size(); +} + +int RGWRados::append_async(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, size_t size, bufferlist& bl) +{ + rgw_rados_ref ref; + int r = get_raw_obj_ref(dpp, obj, &ref); + if (r < 0) { + return r; + } + librados::Rados *rad = get_rados_handle(); + librados::AioCompletion *completion = rad->aio_create_completion(nullptr, nullptr); + + r = ref.pool.ioctx().aio_append(ref.obj.oid, completion, bl, size); + completion->release(); + return r; +} + +int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, RGWPoolIterCtx& ctx) +{ + librados::IoCtx& io_ctx = ctx.io_ctx; + librados::NObjectIterator& iter = ctx.iter; + + int r = open_pool_ctx(dpp, pool, io_ctx, false); + if (r < 0) + return r; + + iter = io_ctx.nobjects_begin(); + + return 0; +} + +int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx) +{ + librados::IoCtx& io_ctx = ctx.io_ctx; + librados::NObjectIterator& iter = ctx.iter; + + int r = open_pool_ctx(dpp, pool, io_ctx, false); + if (r < 0) + return r; + + librados::ObjectCursor oc; + if (!oc.from_str(cursor)) { + ldpp_dout(dpp, 10) << "failed to parse cursor: " << cursor << dendl; + return -EINVAL; + } + + try { + iter = io_ctx.nobjects_begin(oc); + return 0; + } catch (const std::system_error& e) { + r = -e.code().value(); + ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what() + << ", returning " << r << dendl; + return r; + } catch (const std::exception& e) { + ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what() + << ", returning -5" << dendl; + return -EIO; + } +} + +string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx) +{ + return ctx.iter.get_cursor().to_str(); +} + +static int 
do_pool_iterate(const DoutPrefixProvider *dpp, CephContext* cct, RGWPoolIterCtx& ctx, uint32_t num, + vector& objs, + bool *is_truncated, RGWAccessListFilter *filter) +{ + librados::IoCtx& io_ctx = ctx.io_ctx; + librados::NObjectIterator& iter = ctx.iter; + + if (iter == io_ctx.nobjects_end()) + return -ENOENT; + + uint32_t i; + + for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) { + rgw_bucket_dir_entry e; + + string oid = iter->get_oid(); + ldpp_dout(dpp, 20) << "RGWRados::pool_iterate: got " << oid << dendl; + + // fill it in with initial values; we may correct later + if (filter && !filter->filter(oid, oid)) + continue; + + e.key = oid; + objs.push_back(e); + } + + if (is_truncated) + *is_truncated = (iter != io_ctx.nobjects_end()); + + return objs.size(); +} + +int RGWRados::pool_iterate(const DoutPrefixProvider *dpp, RGWPoolIterCtx& ctx, uint32_t num, vector& objs, + bool *is_truncated, RGWAccessListFilter *filter) +{ + // catch exceptions from NObjectIterator::operator++() + try { + return do_pool_iterate(dpp, cct, ctx, num, objs, is_truncated, filter); + } catch (const std::system_error& e) { + int r = -e.code().value(); + ldpp_dout(dpp, 10) << "NObjectIterator threw exception " << e.what() + << ", returning " << r << dendl; + return r; + } catch (const std::exception& e) { + ldpp_dout(dpp, 10) << "NObjectIterator threw exception " << e.what() + << ", returning -5" << dendl; + return -EIO; + } +} + +int RGWRados::list_raw_objects_init(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx) +{ + if (!ctx->initialized) { + int r = pool_iterate_begin(dpp, pool, marker, ctx->iter_ctx); + if (r < 0) { + ldpp_dout(dpp, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl; + return r; + } + ctx->initialized = true; + } + return 0; +} + +int RGWRados::list_raw_objects_next(const DoutPrefixProvider *dpp, const string& prefix_filter, int max, + RGWListRawObjsCtx& ctx, list& oids, + bool *is_truncated) +{ + if (!ctx.initialized) { + return -EINVAL; + } + RGWAccessListFilterPrefix filter(prefix_filter); + vector objs; + int r = pool_iterate(dpp, ctx.iter_ctx, max, objs, is_truncated, &filter); + if (r < 0) { + if(r != -ENOENT) + ldpp_dout(dpp, 10) << "failed to list objects pool_iterate returned r=" << r << dendl; + return r; + } + + vector::iterator iter; + for (iter = objs.begin(); iter != objs.end(); ++iter) { + oids.push_back(iter->key.name); + } + + return oids.size(); +} + +int RGWRados::list_raw_objects(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& prefix_filter, + int max, RGWListRawObjsCtx& ctx, list& oids, + bool *is_truncated) +{ + if (!ctx.initialized) { + int r = list_raw_objects_init(dpp, pool, string(), &ctx); + if (r < 0) { + return r; + } + } + + return list_raw_objects_next(dpp, prefix_filter, max, ctx, oids, is_truncated); +} + +string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx) +{ + return pool_iterate_get_cursor(ctx.iter_ctx); +} + +int RGWRados::bi_get_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, + rgw_bucket_dir_entry *dirent) +{ + rgw_cls_bi_entry bi_entry; + int r = bi_get(dpp, bucket_info, obj, BIIndexType::Instance, &bi_entry); + if (r < 0 && r != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl; + } + if (r < 0) { + return r; + } + auto iter = bi_entry.data.cbegin(); + try { + decode(*dirent, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed 
to decode bi_entry()" << dendl; + return -EIO; + } + + return 0; +} + +int RGWRados::bi_get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, + rgw_bucket_olh_entry *olh) +{ + rgw_cls_bi_entry bi_entry; + int r = bi_get(dpp, bucket_info, obj, BIIndexType::OLH, &bi_entry); + if (r < 0 && r != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl; + } + if (r < 0) { + return r; + } + auto iter = bi_entry.data.cbegin(); + try { + decode(*olh, iter); + } catch (buffer::error& err) { + ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl; + return -EIO; + } + + return 0; +} + +int RGWRados::bi_get(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, + BIIndexType index_type, rgw_cls_bi_entry *entry) +{ + BucketShard bs(this); + int ret = bs.init(dpp, bucket_info, obj); + if (ret < 0) { + ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl; + return ret; + } + + cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance); + + auto& ref = bs.bucket_obj.get_ref(); + + return cls_rgw_bi_get(ref.pool.ioctx(), ref.obj.oid, index_type, key, entry); +} + +void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry) +{ + auto& ref = bs.bucket_obj.get_ref(); + cls_rgw_bi_put(op, ref.obj.oid, entry); +} + +int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry) +{ + auto& ref = bs.bucket_obj.get_ref(); + int ret = cls_rgw_bi_put(ref.pool.ioctx(), ref.obj.oid, entry); + if (ret < 0) + return ret; + + return 0; +} + +int RGWRados::bi_put(const DoutPrefixProvider *dpp, rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry) +{ + // make sure incomplete multipart uploads are hashed correctly + if (obj.key.ns == RGW_OBJ_NS_MULTIPART) { + RGWMPObj mp; + mp.from_meta(obj.key.name); + obj.index_hash_source = mp.get_key(); + } + BucketShard bs(this); + + int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp); + if (ret < 0) { + ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl; + return ret; + } + + return bi_put(bs, entry); +} + +int RGWRados::bi_list(const DoutPrefixProvider *dpp, rgw_bucket& bucket, + const string& obj_name_filter, const string& marker, uint32_t max, + list *entries, bool *is_truncated) +{ + rgw_obj obj(bucket, obj_name_filter); + BucketShard bs(this); + int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp); + if (ret < 0) { + ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl; + return ret; + } + + auto& ref = bs.bucket_obj.get_ref(); + ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name_filter, marker, max, entries, is_truncated); + if (ret == -ENOENT) { + *is_truncated = false; + } + if (ret < 0) + return ret; + + return 0; +} + +int RGWRados::bi_list(BucketShard& bs, const string& obj_name_filter, const string& marker, uint32_t max, + list *entries, bool *is_truncated) +{ + auto& ref = bs.bucket_obj.get_ref(); + int ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name_filter, marker, max, entries, is_truncated); + if (ret < 0) + return ret; + + return 0; +} + +int RGWRados::bi_list(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, int shard_id, const string& obj_name_filter, const string& marker, uint32_t max, + list *entries, bool *is_truncated) +{ + BucketShard bs(this); + int ret = bs.init(dpp, bucket_info, + bucket_info.layout.current_index, + shard_id); + if (ret < 0) { + ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl; 
+ return ret; + } + + return bi_list(bs, obj_name_filter, marker, max, entries, is_truncated); +} + +int RGWRados::bi_remove(const DoutPrefixProvider *dpp, BucketShard& bs) +{ + auto& ref = bs.bucket_obj.get_ref(); + int ret = ref.pool.ioctx().remove(ref.obj.oid); + if (ret == -ENOENT) { + ret = 0; + } + if (ret < 0) { + ldpp_dout(dpp, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl; + return ret; + } + + return 0; +} + +int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectWriteOperation *op) +{ + return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, null_yield); +} + +int RGWRados::gc_aio_operate(const string& oid, librados::AioCompletion *c, + librados::ObjectWriteOperation *op) +{ + return gc_pool_ctx.aio_operate(oid, c, op); +} + +int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectReadOperation *op, bufferlist *pbl) +{ + return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, pbl, null_yield); +} + +int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list& result, bool *truncated, bool& processing_queue) +{ + return gc->list(index, marker, max, expired_only, result, truncated, processing_queue); +} + +int RGWRados::process_gc(bool expired_only) +{ + return gc->process(expired_only); +} + +int RGWRados::list_lc_progress(string& marker, uint32_t max_entries, + vector>& progress_map, + int& index) +{ + return lc->list_lc_progress(marker, max_entries, progress_map, index); +} + +int RGWRados::process_lc(const std::unique_ptr& optional_bucket) +{ + RGWLC lc; + lc.initialize(cct, this->driver); + RGWLC::LCWorker worker(&lc, cct, &lc, 0); + auto ret = lc.process(&worker, optional_bucket, true /* once */); + lc.stop_processor(); // sets down_flag, but returns immediately + return ret; +} + +bool RGWRados::process_expire_objects(const DoutPrefixProvider *dpp) +{ + return obj_expirer->inspect_all_shards(dpp, utime_t(), ceph_clock_now()); +} + +int RGWRados::cls_obj_prepare_op(const DoutPrefixProvider *dpp, BucketShard& bs, RGWModifyOp op, string& tag, + rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *_zones_trace) +{ + const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation; + ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket-shard=" << bs << " obj=" << obj << " tag=" << tag << " op=" << op << dendl_bitx; + ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx; + + rgw_zone_set zones_trace; + if (_zones_trace) { + zones_trace = *_zones_trace; + } + zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key()); + + ObjectWriteOperation o; + o.assert_exists(); // bucket index shard must exist + + cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance); + cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING); + cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), svc.zone->get_zone().log_data, bilog_flags, zones_trace); + int ret = bs.bucket_obj.operate(dpp, &o, y); + ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << ": ret=" << ret << dendl_bitx; + return ret; +} + +int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag, + int64_t pool, uint64_t epoch, + rgw_bucket_dir_entry& ent, RGWObjCategory category, + list *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace) +{ + const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation; + ldout_bitx_c(bitx, cct, 10) << 
"ENTERING " << __func__ << ": bucket-shard=" << bs << + " obj=" << obj << " tag=" << tag << " op=" << op << + ", remove_objs=" << (remove_objs ? *remove_objs : std::list()) << dendl_bitx; + ldout_bitx_c(bitx, cct, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx; + + ObjectWriteOperation o; + o.assert_exists(); // bucket index shard must exist + + rgw_bucket_dir_entry_meta dir_meta; + dir_meta = ent.meta; + dir_meta.category = category; + + rgw_zone_set zones_trace; + if (_zones_trace) { + zones_trace = *_zones_trace; + } + zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key()); + + rgw_bucket_entry_ver ver; + ver.pool = pool; + ver.epoch = epoch; + cls_rgw_obj_key key(ent.key.name, ent.key.instance); + cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING); + cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs, + svc.zone->get_zone().log_data, bilog_flags, &zones_trace); + complete_op_data *arg; + index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs, + svc.zone->get_zone().log_data, bilog_flags, &zones_trace, &arg); + librados::AioCompletion *completion = arg->rados_completion; + int ret = bs.bucket_obj.aio_operate(arg->rados_completion, &o); + completion->release(); /* can't reference arg here, as it might have already been released */ + + ldout_bitx_c(bitx, cct, 10) << "EXITING " << __func__ << ": ret=" << ret << dendl_bitx; + return ret; +} + +int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag, + int64_t pool, uint64_t epoch, + rgw_bucket_dir_entry& ent, RGWObjCategory category, + list *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace) +{ + return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace); +} + +int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag, + int64_t pool, uint64_t epoch, + rgw_obj& obj, + real_time& removed_mtime, + list *remove_objs, + uint16_t bilog_flags, + rgw_zone_set *zones_trace) +{ + rgw_bucket_dir_entry ent; + ent.meta.mtime = removed_mtime; + obj.key.get_index_key(&ent.key); + return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch, + ent, RGWObjCategory::None, remove_objs, + bilog_flags, zones_trace); +} + +int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, + list *remove_objs, + uint16_t bilog_flags, rgw_zone_set *zones_trace) +{ + rgw_bucket_dir_entry ent; + obj.key.get_index_key(&ent.key); + return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag, + -1 /* pool id */, 0, ent, + RGWObjCategory::None, remove_objs, bilog_flags, + zones_trace); +} + +int RGWRados::cls_obj_set_bucket_tag_timeout(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, uint64_t timeout) +{ + RGWSI_RADOS::Pool index_pool; + map bucket_objs; + int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr); + if (r < 0) + return r; + + return CLSRGWIssueSetTagTimeout(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)(); +} + + +// returns 0 if there is an error in calculation +uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries, + uint32_t num_shards) +{ + if (num_shards == 0) { + // we'll get a floating point exception since we divide by + // num_shards + return 0; + } + + // We want to minimize the chances that when num_shards >> + // num_entries that we return much fewer than num_entries 
to the + // client. Given all the overhead of making a cls call to the osd, + // returning a few entries is not much more work than returning one + // entry. This minimum might be better tuned based on future + // experiments where num_shards >> num_entries. (Note: ">>" should + // be interpreted as "much greater than".) + constexpr uint32_t min_read = 8; + + // The following is based on _"Balls into Bins" -- A Simple and + // Tight Analysis_ by Raab and Steger. We add 1 as a way to handle + // cases when num_shards >> num_entries (it almost serves as a + // ceiling calculation). We also assume alpha is 1.0 and extract it + // from the calculation. Future work could involve memoizing some of + // the transcendental functions to minimize repeatedly re-calling + // them with the same parameters, which we expect to be the case the + // majority of the time. + uint32_t calc_read = + 1 + + static_cast((num_entries / num_shards) + + sqrt((2 * num_entries) * + log(num_shards) / num_shards)); + + return std::max(min_read, calc_read); +} + + +int RGWRados::cls_bucket_list_ordered(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& idx_layout, + const int shard_id, + const rgw_obj_index_key& start_after, + const std::string& prefix, + const std::string& delimiter, + const uint32_t num_entries, + const bool list_versions, + const uint16_t expansion_factor, + ent_map_t& m, + bool* is_truncated, + bool* cls_filtered, + rgw_obj_index_key* last_entry, + optional_yield y, + RGWBucketListNameFilter force_check_filter) +{ + const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation; + + /* expansion_factor allows the number of entries to read to grow + * exponentially; this is used when earlier reads are producing too + * few results, perhaps due to filtering or to a series of + * namespaced entries */ + + ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": " << bucket_info.bucket << + " start_after=\"" << start_after.to_string() << + "\", prefix=\"" << prefix << + ", delimiter=\"" << delimiter << + "\", shard_id=" << shard_id << + "\", num_entries=" << num_entries << + ", shard_id=" << shard_id << + ", list_versions=" << list_versions << + ", expansion_factor=" << expansion_factor << + ", force_check_filter is " << + (force_check_filter ? 
"set" : "unset") << dendl_bitx; + ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx; + + m.clear(); + + RGWSI_RADOS::Pool index_pool; + // key - oid (for different shards if there is any) + // value - list result for the corresponding oid (shard), it is filled by + // the AIO callback + std::map shard_oids; + int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, + &index_pool, &shard_oids, + nullptr); + if (r < 0) { + ldpp_dout(dpp, 0) << __func__ << + ": open_bucket_index for " << bucket_info.bucket << " failed" << dendl; + return r; + } + + const uint32_t shard_count = shard_oids.size(); + if (shard_count == 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + ": the bucket index shard count appears to be 0, " + "which is an illegal value" << dendl; + return -ERR_INVALID_BUCKET_STATE; + } + + uint32_t num_entries_per_shard; + if (expansion_factor == 0) { + num_entries_per_shard = + calc_ordered_bucket_list_per_shard(num_entries, shard_count); + } else if (expansion_factor <= 11) { + // we'll max out the exponential multiplication factor at 1024 (2<<10) + num_entries_per_shard = + std::min(num_entries, + (uint32_t(1 << (expansion_factor - 1)) * + calc_ordered_bucket_list_per_shard(num_entries, shard_count))); + } else { + num_entries_per_shard = num_entries; + } + + if (num_entries_per_shard == 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + ": unable to calculate the number of entries to read from each " + "bucket index shard" << dendl; + return -ERR_INVALID_BUCKET_STATE; + } + + ldpp_dout(dpp, 10) << __func__ << + ": request from each of " << shard_count << + " shard(s) for " << num_entries_per_shard << " entries to get " << + num_entries << " total entries" << dendl; + + auto& ioctx = index_pool.ioctx(); + std::map shard_list_results; + cls_rgw_obj_key start_after_key(start_after.name, start_after.instance); + r = CLSRGWIssueBucketList(ioctx, start_after_key, prefix, delimiter, + num_entries_per_shard, + list_versions, shard_oids, shard_list_results, + cct->_conf->rgw_bucket_index_max_aio)(); + if (r < 0) { + ldpp_dout(dpp, 0) << __func__ << + ": CLSRGWIssueBucketList for " << bucket_info.bucket << + " failed" << dendl; + return r; + } + + // to manage the iterators through each shard's list results + struct ShardTracker { + const size_t shard_idx; + rgw_cls_list_ret& result; + const std::string& oid_name; + RGWRados::ent_map_t::iterator cursor; + RGWRados::ent_map_t::iterator end; + + // manages an iterator through a shard and provides other + // accessors + ShardTracker(size_t _shard_idx, + rgw_cls_list_ret& _result, + const std::string& _oid_name): + shard_idx(_shard_idx), + result(_result), + oid_name(_oid_name), + cursor(_result.dir.m.begin()), + end(_result.dir.m.end()) + {} + + inline const std::string& entry_name() const { + return cursor->first; + } + rgw_bucket_dir_entry& dir_entry() const { + return cursor->second; + } + inline bool is_truncated() const { + return result.is_truncated; + } + inline ShardTracker& advance() { + ++cursor; + // return a self-reference to allow for chaining of calls, such + // as x.advance().at_end() + return *this; + } + inline bool at_end() const { + return cursor == end; + } + }; // ShardTracker + + // add the next unique candidate, or return false if we reach the end + auto next_candidate = [] (CephContext *cct, ShardTracker& t, + std::multimap& candidates, + size_t tracker_idx) { + if (!t.at_end()) { + candidates.emplace(t.entry_name(), tracker_idx); + } + 
+ // one tracker per shard requested (may not be all shards) + std::vector<ShardTracker> results_trackers; + results_trackers.reserve(shard_list_results.size()); + for (auto& r : shard_list_results) { + results_trackers.emplace_back(r.first, r.second, shard_oids[r.first]); + + // if any *one* shard's result is truncated, the entire result is + // truncated + *is_truncated = *is_truncated || r.second.is_truncated; + + // unless *all* shards are cls_filtered, the entire result is + // not filtered + *cls_filtered = *cls_filtered && r.second.cls_filtered; + } + + // create a map to track the next candidate entry from ShardTracker + // (key=candidate, value=index into results_trackers); as we consume + // entries from shards, we replace them with the next entries in the + // shards until we run out + std::multimap<std::string, size_t> candidates; + size_t tracker_idx = 0; + std::vector<size_t> vidx; + vidx.reserve(shard_list_results.size()); + for (auto& t : results_trackers) { + // it's important that the values in the map refer to the index + // into the results_trackers vector, which may not be the same + // as the shard number (i.e., when not all shards are requested) + next_candidate(cct, t, candidates, tracker_idx); + ++tracker_idx; + } + + rgw_bucket_dir_entry* + last_entry_visited = nullptr; // to set last_entry (marker) + std::map<std::string, bufferlist> updates; + uint32_t count = 0; + while (count < num_entries && !candidates.empty()) { + r = 0; + // select the next entry in lexical order (first key in map); + // again tracker_idx is not necessarily shard number, but is index + // into results_trackers vector + tracker_idx = candidates.begin()->second; + auto& tracker = results_trackers.at(tracker_idx); + + const std::string& name = tracker.entry_name(); + rgw_bucket_dir_entry& dirent = tracker.dir_entry(); + + ldpp_dout(dpp, 20) << __func__ << ": currently processing " << + dirent.key << " from shard " << tracker.shard_idx << dendl; + + const bool force_check = + force_check_filter && force_check_filter(dirent.key.name); + + if ((!dirent.exists && + !dirent.is_delete_marker() && + !dirent.is_common_prefix()) || + !dirent.pending_map.empty() || + force_check) { + /* there are uncommitted ops. We need to check the current + * state, and if the tags are old we need to do clean-up as + * well. 
*/ + librados::IoCtx sub_ctx; + sub_ctx.dup(ioctx); + ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ << + " calling check_disk_state bucket=" << bucket_info.bucket << + " entry=" << dirent.key << dendl_bitx; + r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent, + updates[tracker.oid_name], y); + if (r < 0 && r != -ENOENT) { + ldpp_dout(dpp, 0) << __func__ << + ": check_disk_state for \"" << dirent.key << + "\" failed with r=" << r << dendl; + return r; + } + } else { + r = 0; + } + + // at this point either r >= 0 or r == -ENOENT + if (r >= 0) { // i.e., if r != -ENOENT + ldpp_dout(dpp, 10) << __func__ << ": got " << + dirent.key << dendl; + + auto [it, inserted] = m.insert_or_assign(name, std::move(dirent)); + last_entry_visited = &it->second; + if (inserted) { + ++count; + } else { + ldpp_dout(dpp, 0) << "WARNING: " << __func__ << + " reassigned map value at \"" << name << + "\", which should not happen" << dendl; + } + } else { + ldpp_dout(dpp, 10) << __func__ << ": skipping " << + dirent.key.name << "[" << dirent.key.instance << "]" << dendl; + last_entry_visited = &tracker.dir_entry(); + } + + // refresh the candidates map + vidx.clear(); + bool need_to_stop = false; + auto range = candidates.equal_range(name); + for (auto i = range.first; i != range.second; ++i) { + vidx.push_back(i->second); + } + candidates.erase(range.first, range.second); + for (auto idx : vidx) { + auto& tracker_match = results_trackers.at(idx); + tracker_match.advance(); + next_candidate(cct, tracker_match, candidates, idx); + if (tracker_match.at_end() && tracker_match.is_truncated()) { + need_to_stop = true; + break; + } + } + if (need_to_stop) { + // once we exhaust one shard that is truncated, we need to stop, + // as we cannot be certain that one of the next entries needs to + // come from that shard; S3 and swift protocols allow returning + // fewer than what was requested + ldpp_dout(dpp, 10) << __func__ << + ": stopped accumulating results at count=" << count << + ", dirent=\"" << dirent.key << + "\", because its shard is truncated and exhausted" << dendl; + break; + } + } // while we haven't provided requested # of result entries + + // suggest updates if there are any + for (auto& miter : updates) { + if (miter.second.length()) { + ObjectWriteOperation o; + cls_rgw_suggest_changes(o, miter.second); + // we don't care if we lose suggested updates, send them off blindly + AioCompletion *c = + librados::Rados::aio_create_completion(nullptr, nullptr); + + ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << + ": doing dir_suggest on " << miter.first << dendl_bitx; + ioctx.aio_operate(miter.first, c, &o); + c->release(); + } + } // updates loop + + // determine truncation by checking if all the returned entries are + // consumed or not + *is_truncated = false; + for (const auto& t : results_trackers) { + if (!t.at_end() || t.is_truncated()) { + *is_truncated = true; + break; + } + } + + ldpp_dout(dpp, 20) << __func__ << + ": returning, count=" << count << ", is_truncated=" << *is_truncated << + dendl; + + if (*is_truncated && count < num_entries) { + ldpp_dout(dpp, 10) << __func__ << + ": requested " << num_entries << " entries but returning " << + count << ", which is truncated" << dendl; + } + + if (last_entry_visited != nullptr && last_entry) { + *last_entry = last_entry_visited->key; + ldpp_dout(dpp, 20) << __func__ << + ": returning, last_entry=" << *last_entry << dendl; + } else { + ldpp_dout(dpp, 20) << __func__ << + ": returning, last_entry NOT SET" << dendl; + } + + ldout_bitx(bitx, 
dpp, 10) << "EXITING " << __func__ << dendl_bitx; + return 0; +} // RGWRados::cls_bucket_list_ordered + + +// A helper function to retrieve the hash source from an incomplete +// multipart entry by removing everything from the second to last +// period on. +static int parse_index_hash_source(const std::string& oid_wo_ns, std::string *index_hash_source) { + std::size_t found = oid_wo_ns.rfind('.'); + if (found == std::string::npos || found < 1) { + return -EINVAL; + } + found = oid_wo_ns.rfind('.', found - 1); + if (found == std::string::npos || found < 1) { + return -EINVAL; + } + *index_hash_source = oid_wo_ns.substr(0, found); + return 0; +} + + +int RGWRados::cls_bucket_list_unordered(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& idx_layout, + int shard_id, + const rgw_obj_index_key& start_after, + const std::string& prefix, + uint32_t num_entries, + bool list_versions, + std::vector& ent_list, + bool *is_truncated, + rgw_obj_index_key *last_entry, + optional_yield y, + RGWBucketListNameFilter force_check_filter) { + const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation; + + ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": " << bucket_info.bucket << + " start_after=\"" << start_after << + "\", prefix=\"" << prefix << + "\", shard_id=" << shard_id << + "\", num_entries=" << num_entries << + ", list_versions=" << list_versions << + (force_check_filter ? "set" : "unset") << dendl_bitx; + ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx; + + ent_list.clear(); + static MultipartMetaFilter multipart_meta_filter; + + *is_truncated = false; + RGWSI_RADOS::Pool index_pool; + + std::map oids; + int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &oids, nullptr); + if (r < 0) { + return r; + } + + auto& ioctx = index_pool.ioctx(); + + const uint32_t num_shards = oids.size(); + + rgw_obj_index_key marker = start_after; + uint32_t current_shard; + if (shard_id >= 0) { + current_shard = shard_id; + } else if (start_after.empty()) { + current_shard = 0u; + } else { + // at this point we have a marker (start_after) that has something + // in it, so we need to get to the bucket shard index, so we can + // start reading from there + + + // now convert the key (oid) to an rgw_obj_key since that will + // separate out the namespace, name, and instance + rgw_obj_key obj_key; + bool parsed = rgw_obj_key::parse_raw_oid(start_after.name, &obj_key); + if (!parsed) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + " received an invalid start marker: \"" << start_after << "\"" << + dendl; + return -EINVAL; + } else if (obj_key.name.empty()) { + // if the name is empty that means the object name came in with + // a namespace only, and therefore we need to start our scan at + // the first bucket index shard + current_shard = 0u; + } else { + // so now we have the key used to compute the bucket index shard + // and can extract the specific shard from it + if (obj_key.ns == RGW_OBJ_NS_MULTIPART) { + // Use obj_key.ns == RGW_OBJ_NS_MULTIPART instead of + // the implementation relying on MultipartMetaFilter + // because MultipartMetaFilter only checks .meta suffix, which may + // exclude data multiparts but include some regular objects with .meta suffix + // by mistake. 
+int RGWRados::cls_bucket_list_unordered(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& idx_layout, + int shard_id, + const rgw_obj_index_key& start_after, + const std::string& prefix, + uint32_t num_entries, + bool list_versions, + std::vector<rgw_bucket_dir_entry>& ent_list, + bool *is_truncated, + rgw_obj_index_key *last_entry, + optional_yield y, + RGWBucketListNameFilter force_check_filter) { + const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation; + + ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": " << bucket_info.bucket << + " start_after=\"" << start_after << + "\", prefix=\"" << prefix << + "\", shard_id=" << shard_id << + ", num_entries=" << num_entries << + ", list_versions=" << list_versions << + ", force_check_filter is " << (force_check_filter ? "set" : "unset") << dendl_bitx; + ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx; + + ent_list.clear(); + static MultipartMetaFilter multipart_meta_filter; + + *is_truncated = false; + RGWSI_RADOS::Pool index_pool; + + std::map<int, std::string> oids; + int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &oids, nullptr); + if (r < 0) { + return r; + } + + auto& ioctx = index_pool.ioctx(); + + const uint32_t num_shards = oids.size(); + + rgw_obj_index_key marker = start_after; + uint32_t current_shard; + if (shard_id >= 0) { + current_shard = shard_id; + } else if (start_after.empty()) { + current_shard = 0u; + } else { + // at this point we have a non-empty marker (start_after), so we + // need to determine the bucket index shard it maps to in order to + // start reading from there + + // now convert the key (oid) to an rgw_obj_key since that will + // separate out the namespace, name, and instance + rgw_obj_key obj_key; + bool parsed = rgw_obj_key::parse_raw_oid(start_after.name, &obj_key); + if (!parsed) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + " received an invalid start marker: \"" << start_after << "\"" << + dendl; + return -EINVAL; + } else if (obj_key.name.empty()) { + // if the name is empty that means the object name came in with + // a namespace only, and therefore we need to start our scan at + // the first bucket index shard + current_shard = 0u; + } else { + // so now we have the key used to compute the bucket index shard + // and can extract the specific shard from it + if (obj_key.ns == RGW_OBJ_NS_MULTIPART) { + // test obj_key.ns == RGW_OBJ_NS_MULTIPART directly rather than + // relying on MultipartMetaFilter, because MultipartMetaFilter only + // checks for the .meta suffix, which could exclude multipart data + // entries and wrongly include regular objects whose names happen + // to end in .meta. + string index_hash_source; + r = parse_index_hash_source(obj_key.name, &index_hash_source); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + " parse_index_hash_source unable to parse \"" << obj_key.name << + "\", r=" << r << dendl; + return r; + } + current_shard = svc.bi_rados->bucket_shard_index(index_hash_source, num_shards); + } else { + current_shard = svc.bi_rados->bucket_shard_index(obj_key.name, num_shards); + } + } + } + + uint32_t count = 0u; + std::map<std::string, bufferlist> updates; + rgw_obj_index_key last_added_entry; + while (count <= num_entries && + ((shard_id >= 0 && current_shard == uint32_t(shard_id)) || + current_shard < num_shards)) { + const std::string& oid = oids[current_shard]; + rgw_cls_list_ret result; + + librados::ObjectReadOperation op; + const std::string empty_delimiter; + cls_rgw_bucket_list_op(op, marker, prefix, empty_delimiter, + num_entries, + list_versions, &result); + r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, null_yield); + if (r < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + ": error in rgw_rados_operate (bucket list op), r=" << r << dendl; + return r; + } + + for (auto& entry : result.dir.m) { + rgw_bucket_dir_entry& dirent = entry.second; + + bool force_check = force_check_filter && + force_check_filter(dirent.key.name); + if ((!dirent.exists && !dirent.is_delete_marker()) || + !dirent.pending_map.empty() || + force_check) { + /* there are uncommitted ops. We need to check the current state, + * and if the tags are old we need to do cleanup as well. */ + librados::IoCtx sub_ctx; + sub_ctx.dup(ioctx); + ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ << + ": calling check_disk_state bucket=" << bucket_info.bucket << + " entry=" << dirent.key << dendl_bitx; + r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent, updates[oid], y); + if (r < 0 && r != -ENOENT) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << + ": error in check_disk_state, r=" << r << dendl; + return r; + } + } else { + r = 0; + } + + // at this point either r >= 0 or r == -ENOENT + if (r >= 0) { // i.e., if r != -ENOENT + ldpp_dout(dpp, 10) << __func__ << ": got " << + dirent.key << dendl; + + if (count < num_entries) { + marker = last_added_entry = dirent.key; // double assign + ent_list.emplace_back(std::move(dirent)); + ++count; + } else { + last_added_entry = dirent.key; + *is_truncated = true; + ldpp_dout(dpp, 10) << "INFO: " << __func__ << + ": reached max entries (" << num_entries << ") to return at \"" << + dirent.key << "\"" << dendl; + goto check_updates; + } + } else { // r == -ENOENT + // in the case of -ENOENT, make sure we're advancing marker + // for a possible next list call on this shard + marker = dirent.key; + } + } // entry for loop + + if (!result.is_truncated) { + // if we reached the end of this shard, read the next shard + ++current_shard; + marker = rgw_obj_index_key(); + } + } // shard loop + +check_updates: + + // suggest updates if there are any + std::map<std::string, bufferlist>::iterator miter = updates.begin(); + for (; miter != updates.end(); ++miter) { + if (miter->second.length()) { + ObjectWriteOperation o; + cls_rgw_suggest_changes(o, miter->second); + // we don't care if we lose suggested updates, send them off blindly + AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr); + + ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << + " doing dir_suggest on " << miter->first << dendl_bitx; + ioctx.aio_operate(miter->first, c, &o); + c->release(); + } + } + + if (last_entry && !ent_list.empty()) { + *last_entry = 
last_added_entry; + } + + ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx; + return 0; +} // RGWRados::cls_bucket_list_unordered + + +int RGWRados::cls_obj_usage_log_add(const DoutPrefixProvider *dpp, const string& oid, + rgw_usage_log_info& info) +{ + rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid); + + rgw_rados_ref ref; + int r = get_raw_obj_ref(dpp, obj, &ref); + if (r < 0) { + return r; + } + + ObjectWriteOperation op; + cls_rgw_usage_log_add(op, info); + + r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); + return r; +} + +int RGWRados::cls_obj_usage_log_read(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket, + uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries, + string& read_iter, map& usage, + bool *is_truncated) +{ + rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid); + + rgw_rados_ref ref; + int r = get_raw_obj_ref(dpp, obj, &ref); + if (r < 0) { + return r; + } + + *is_truncated = false; + + r = cls_rgw_usage_log_read(ref.pool.ioctx(), ref.obj.oid, user, bucket, start_epoch, end_epoch, + max_entries, read_iter, usage, is_truncated); + + return r; +} + +static int cls_rgw_usage_log_trim_repeat(const DoutPrefixProvider *dpp, rgw_rados_ref ref, const string& user, const string& bucket, uint64_t start_epoch, uint64_t end_epoch) +{ + bool done = false; + do { + librados::ObjectWriteOperation op; + cls_rgw_usage_log_trim(op, user, bucket, start_epoch, end_epoch); + int r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); + if (r == -ENODATA) + done = true; + else if (r < 0) + return r; + } while (!done); + + return 0; +} + +int RGWRados::cls_obj_usage_log_trim(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket, + uint64_t start_epoch, uint64_t end_epoch) +{ + rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid); + + rgw_rados_ref ref; + int r = get_raw_obj_ref(dpp, obj, &ref); + if (r < 0) { + return r; + } + + r = cls_rgw_usage_log_trim_repeat(dpp, ref, user, bucket, start_epoch, end_epoch); + return r; +} + +int RGWRados::cls_obj_usage_log_clear(const DoutPrefixProvider *dpp, string& oid) +{ + rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid); + + rgw_rados_ref ref; + int r = get_raw_obj_ref(dpp, obj, &ref); + if (r < 0) { + return r; + } + librados::ObjectWriteOperation op; + cls_rgw_usage_log_clear(op); + r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); + return r; +} + + +// note: this removes entries from the rados bucket index objects +// without going through CLS; this is known to be called from +// "radosgw-admin unlink" and "radosgw-admin bucket check --fix" +int RGWRados::remove_objs_from_index(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + const std::list& entry_key_list) +{ + const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation; + ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket=" << bucket_info.bucket << + " entry_key_list.size()=" << entry_key_list.size() << dendl_bitx; + ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx; + + const auto& current_index = bucket_info.get_current_index(); + if (is_layout_indexless(current_index)) { + return -EINVAL; + } + const uint32_t num_shards = current_index.layout.normal.num_shards; + + RGWSI_RADOS::Pool index_pool; + std::map index_oids; + int r = svc.bi_rados->open_bucket_index(dpp, 
bucket_info, std::nullopt, + bucket_info.layout.current_index, + &index_pool, &index_oids, nullptr); + if (r < 0) { + ldout_bitx(bitx, dpp, 0) << "ERROR: " << __func__ << + " open_bucket_index returned " << r << dendl_bitx; + return r; + } + + // split up removals by shard + std::map<int, std::set<std::string>> sharded_removals; + for (const auto& entry_key : entry_key_list) { + const rgw_obj_key obj_key(entry_key); + const uint32_t shard = + RGWSI_BucketIndex_RADOS::bucket_shard_index(obj_key, num_shards); + + // entry_key already combines namespace and name, so we first have + // to break that apart before we can then combine with instance + std::string name; + std::string ns; // namespace + rgw_obj_key::parse_index_key(entry_key.name, &name, &ns); + rgw_obj_key full_key(name, entry_key.instance, ns); + std::string combined_key = full_key.get_oid(); + + sharded_removals[shard].insert(combined_key); + + ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ << + ": removal from bucket index, bucket=" << bucket_info.bucket << + " key=" << combined_key << " designated for shard " << shard << + dendl_bitx; + } + + for (const auto& removals : sharded_removals) { + const int shard = removals.first; + const std::string& oid = index_oids[shard]; + + ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << + ": removal from bucket index, bucket=" << bucket_info.bucket << + ", shard=" << shard << ", oid=" << oid << ", num_keys=" << + removals.second.size() << dendl_bitx; + + r = index_pool.ioctx().omap_rm_keys(oid, removals.second); + if (r < 0) { + ldout_bitx(bitx, dpp, 0) << "ERROR: " << __func__ << + ": omap_rm_keys returned ret=" << r << + dendl_bitx; + return r; + } + } + + ldout_bitx(bitx, dpp, 5) << + "EXITING " << __func__ << " and returning " << r << dendl_bitx; + + return r; +} + +int RGWRados::check_disk_state(const DoutPrefixProvider *dpp, + librados::IoCtx io_ctx, + RGWBucketInfo& bucket_info, + rgw_bucket_dir_entry& list_state, + rgw_bucket_dir_entry& object, + bufferlist& suggested_updates, + optional_yield y) +{ + const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation; + ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket=" << + bucket_info.bucket << " dir_entry=" << list_state.key << dendl_bitx; + + std::unique_ptr<rgw::sal::Bucket> bucket; + driver->get_bucket(nullptr, bucket_info, &bucket); + uint8_t suggest_flag = (svc.zone->get_zone().log_data ? 
CEPH_RGW_DIR_SUGGEST_LOG_OP : 0); + + std::string loc; + + std::unique_ptr obj = bucket->get_object(list_state.key); + MultipartMetaFilter multipart_meta_filter; + string temp_key; + if (multipart_meta_filter.filter(list_state.key.name, temp_key)) { + obj->set_in_extra_data(true); + } + + string oid; + get_obj_bucket_and_oid_loc(obj->get_obj(), oid, loc); + + if (loc != list_state.locator) { + ldpp_dout(dpp, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl; + } + + io_ctx.locator_set_key(list_state.locator); + + RGWObjState *astate = NULL; + RGWObjManifest *manifest = nullptr; + RGWObjectCtx rctx(this->driver); + int r = get_obj_state(dpp, &rctx, bucket_info, obj.get(), &astate, &manifest, false, y); + if (r < 0) + return r; + + list_state.pending_map.clear(); // we don't need this and it inflates size + if (!list_state.is_delete_marker() && !astate->exists) { + ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": disk state exists" << dendl_bitx; + /* object doesn't exist right now -- hopefully because it's + * marked as !exists and got deleted */ + if (list_state.exists) { + ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": index list state exists" << dendl_bitx; + /* FIXME: what should happen now? Work out if there are any + * non-bad ways this could happen (there probably are, but annoying + * to handle!) */ + } + + // encode a suggested removal of that key + list_state.ver.epoch = io_ctx.get_last_version(); + list_state.ver.pool = io_ctx.get_id(); + ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": encoding remove of " << list_state.key << " on suggested_updates" << dendl_bitx; + cls_rgw_encode_suggestion(CEPH_RGW_REMOVE | suggest_flag, list_state, suggested_updates); + return -ENOENT; + } + + string etag; + string content_type; + string storage_class; + ACLOwner owner; + bool appendable = false; + + object.meta.size = astate->size; + object.meta.accounted_size = astate->accounted_size; + object.meta.mtime = astate->mtime; + + map::iterator iter = astate->attrset.find(RGW_ATTR_ETAG); + if (iter != astate->attrset.end()) { + etag = rgw_bl_str(iter->second); + } + iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE); + if (iter != astate->attrset.end()) { + content_type = rgw_bl_str(iter->second); + } + iter = astate->attrset.find(RGW_ATTR_STORAGE_CLASS); + if (iter != astate->attrset.end()) { + storage_class = rgw_bl_str(iter->second); + } + iter = astate->attrset.find(RGW_ATTR_ACL); + if (iter != astate->attrset.end()) { + r = decode_policy(dpp, iter->second, &owner); + if (r < 0) { + ldpp_dout(dpp, 0) << "WARNING: could not decode policy for object: " << obj << dendl; + } + } + iter = astate->attrset.find(RGW_ATTR_APPEND_PART_NUM); + if (iter != astate->attrset.end()) { + appendable = true; + } + + if (manifest) { + RGWObjManifest::obj_iterator miter; + for (miter = manifest->obj_begin(dpp); miter != manifest->obj_end(dpp); ++miter) { + const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(driver); + rgw_obj loc; + RGWSI_Tier_RADOS::raw_obj_to_obj(manifest->get_obj().bucket, raw_loc, &loc); + + if (loc.key.ns == RGW_OBJ_NS_MULTIPART) { + ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << " removing manifest part from index loc=" << loc << dendl_bitx; + r = delete_obj_index(loc, astate->mtime, dpp); + if (r < 0) { + ldout_bitx(bitx, dpp, 0) << + "WARNING: " << __func__ << ": delete_obj_index returned r=" << r << dendl_bitx; + } + } + } + } + + object.meta.etag = etag; + 
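
check_disk_state() reports its findings back to the bucket index through these encoded suggestions, which are applied with cls_rgw_suggest_changes() in the listing paths above. A compact sketch of that flow; the helper and its setup are hypothetical, while the cls_rgw_* calls are the same client API this file uses:

#include "cls/rgw/cls_rgw_client.h"
#include "include/rados/librados.hpp"

// queue a removal for a vanished object and an update for an entry
// whose stats drifted, then hand both to the index object class
void suggest_repairs(librados::IoCtx& ioctx, const std::string& index_oid,
                     rgw_bucket_dir_entry& gone, rgw_bucket_dir_entry& drifted,
                     bool log_op) {
  const uint8_t flag = log_op ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0;
  bufferlist updates;
  cls_rgw_encode_suggestion(CEPH_RGW_REMOVE | flag, gone, updates);
  cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | flag, drifted, updates);

  librados::ObjectWriteOperation op;
  cls_rgw_suggest_changes(op, updates);
  // fire-and-forget: losing a suggestion is harmless, since a later
  // listing will regenerate it
  librados::AioCompletion* c =
    librados::Rados::aio_create_completion(nullptr, nullptr);
  ioctx.aio_operate(index_oid, c, &op);
  c->release();
}
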
object.meta.content_type = content_type; + object.meta.storage_class = storage_class; + object.meta.owner = owner.get_id().to_str(); + object.meta.owner_display_name = owner.get_display_name(); + object.meta.appendable = appendable; + + // encode suggested updates + + list_state.meta.size = object.meta.size; + list_state.meta.accounted_size = object.meta.accounted_size; + list_state.meta.mtime = object.meta.mtime; + list_state.meta.category = main_category; + list_state.meta.etag = etag; + list_state.meta.appendable = appendable; + list_state.meta.content_type = content_type; + list_state.meta.storage_class = storage_class; + + librados::IoCtx head_obj_ctx; // initialize to data pool so we can get pool id + r = get_obj_head_ioctx(dpp, bucket_info, obj->get_obj(), &head_obj_ctx); + if (r < 0) { + ldpp_dout(dpp, 0) << __func__ << + " WARNING: unable to find head object data pool for \"" << + obj << "\", not updating version pool/epoch" << dendl; + } else { + list_state.ver.pool = head_obj_ctx.get_id(); + list_state.ver.epoch = astate->epoch; + } + + if (astate->obj_tag.length() > 0) { + list_state.tag = astate->obj_tag.c_str(); + } + + list_state.meta.owner = owner.get_id().to_str(); + list_state.meta.owner_display_name = owner.get_display_name(); + + list_state.exists = true; + + ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << + ": encoding update of " << list_state.key << " on suggested_updates" << dendl_bitx; + cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates); + + ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx; + return 0; +} // RGWRados::check_disk_state + +int RGWRados::cls_bucket_head(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, vector& headers, map *bucket_instance_ids) +{ + RGWSI_RADOS::Pool index_pool; + map oids; + map list_results; + int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &oids, bucket_instance_ids); + if (r < 0) { + ldpp_dout(dpp, 20) << "cls_bucket_head: open_bucket_index() returned " + << r << dendl; + return r; + } + + r = CLSRGWIssueGetDirHeader(index_pool.ioctx(), oids, list_results, cct->_conf->rgw_bucket_index_max_aio)(); + if (r < 0) { + ldpp_dout(dpp, 20) << "cls_bucket_head: CLSRGWIssueGetDirHeader() returned " + << r << dendl; + return r; + } + + map::iterator iter = list_results.begin(); + for(; iter != list_results.end(); ++iter) { + headers.push_back(std::move(iter->second.dir.header)); + } + return 0; +} + +int RGWRados::cls_bucket_head_async(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio) +{ + RGWSI_RADOS::Pool index_pool; + map bucket_objs; + int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &bucket_objs, nullptr); + if (r < 0) + return r; + + map::iterator iter = bucket_objs.begin(); + for (; iter != bucket_objs.end(); ++iter) { + r = cls_rgw_get_dir_header_async(index_pool.ioctx(), iter->second, static_cast(ctx->get())); + if (r < 0) { + ctx->put(); + break; + } else { + (*num_aio)++; + } + } + return r; +} + +int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info, + const rgw_bucket& bucket, + uint64_t num_objs, + const DoutPrefixProvider *dpp) +{ + if (! 
cct->_conf.get_val("rgw_dynamic_resharding")) { + return 0; + } + + bool need_resharding = false; + uint32_t num_source_shards = rgw::current_num_shards(bucket_info.layout); + const uint32_t max_dynamic_shards = + uint32_t(cct->_conf.get_val("rgw_max_dynamic_shards")); + + if (num_source_shards >= max_dynamic_shards) { + return 0; + } + + uint32_t suggested_num_shards = 0; + const uint64_t max_objs_per_shard = + cct->_conf.get_val("rgw_max_objs_per_shard"); + + // TODO: consider per-bucket sync policy here? + const bool is_multisite = svc.zone->get_zone().log_data; + + quota_handler->check_bucket_shards(dpp, max_objs_per_shard, num_source_shards, + num_objs, is_multisite, need_resharding, + &suggested_num_shards); + if (! need_resharding) { + return 0; + } + + const uint32_t final_num_shards = + RGWBucketReshard::get_preferred_shards(suggested_num_shards, + max_dynamic_shards); + // final verification, so we don't reduce number of shards + if (final_num_shards <= num_source_shards) { + return 0; + } + + ldpp_dout(dpp, 1) << "RGWRados::" << __func__ << " bucket " << bucket.name << + " needs resharding; current num shards " << bucket_info.layout.current_index.layout.normal.num_shards << + "; new num shards " << final_num_shards << " (suggested " << + suggested_num_shards << ")" << dendl; + + return add_bucket_to_reshard(dpp, bucket_info, final_num_shards); +} + +int RGWRados::add_bucket_to_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, uint32_t new_num_shards) +{ + RGWReshard reshard(this->driver, dpp); + + uint32_t num_source_shards = rgw::current_num_shards(bucket_info.layout); + + new_num_shards = std::min(new_num_shards, get_max_bucket_shards()); + if (new_num_shards <= num_source_shards) { + ldpp_dout(dpp, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl; + return 0; + } + + cls_rgw_reshard_entry entry; + entry.time = real_clock::now(); + entry.tenant = bucket_info.owner.tenant; + entry.bucket_name = bucket_info.bucket.name; + entry.bucket_id = bucket_info.bucket.bucket_id; + entry.old_num_shards = num_source_shards; + entry.new_num_shards = new_num_shards; + + return reshard.add(dpp, entry); +} + +int RGWRados::check_quota(const DoutPrefixProvider *dpp, const rgw_user& bucket_owner, rgw_bucket& bucket, + RGWQuota& quota, + uint64_t obj_size, optional_yield y, + bool check_size_only) +{ + // if we only check size, then num_objs will set to 0 + if(check_size_only) + return quota_handler->check_quota(dpp, bucket_owner, bucket, quota, 0, obj_size, y); + + return quota_handler->check_quota(dpp, bucket_owner, bucket, quota, 1, obj_size, y); +} + +int RGWRados::get_target_shard_id(const rgw::bucket_index_normal_layout& layout, const string& obj_key, + int *shard_id) +{ + int r = 0; + switch (layout.hash_type) { + case rgw::BucketHashType::Mod: + if (!layout.num_shards) { + if (shard_id) { + *shard_id = -1; + } + } else { + uint32_t sid = svc.bi_rados->bucket_shard_index(obj_key, layout.num_shards); + if (shard_id) { + *shard_id = (int)sid; + } + } + break; + default: + r = -ENOTSUP; + } + return r; +} + +uint64_t RGWRados::instance_id() +{ + return get_rados_handle()->get_instance_id(); +} + +uint64_t RGWRados::next_bucket_id() +{ + std::lock_guard l{bucket_id_lock}; + return ++max_bucket_id; +} + +librados::Rados* RGWRados::get_rados_handle() +{ + return &rados; +} + +int RGWRados::delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, list& 
handles) +{ + rgw_rados_ref ref; + int ret = get_raw_obj_ref(dpp, obj, &ref); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl; + return ret; + } + + ObjectWriteOperation op; + list prefixes; + cls_rgw_remove_obj(op, prefixes); + + AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr); + ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl; + c->release(); + return ret; + } + + handles.push_back(c); + + return 0; +} + +int RGWRados::delete_obj_aio(const DoutPrefixProvider *dpp, const rgw_obj& obj, + RGWBucketInfo& bucket_info, RGWObjState *astate, + list& handles, bool keep_index_consistent, + optional_yield y) +{ + rgw_rados_ref ref; + int ret = get_obj_head_ref(dpp, bucket_info, obj, &ref); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl; + return ret; + } + + if (keep_index_consistent) { + RGWRados::Bucket bop(this, bucket_info); + RGWRados::Bucket::UpdateIndex index_op(&bop, obj); + + ret = index_op.prepare(dpp, CLS_RGW_OP_DEL, &astate->write_tag, y); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to prepare index op with ret=" << ret << dendl; + return ret; + } + } + + ObjectWriteOperation op; + list prefixes; + cls_rgw_remove_obj(op, prefixes); + + AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr); + ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl; + c->release(); + return ret; + } + + handles.push_back(c); + + if (keep_index_consistent) { + ret = delete_obj_index(obj, astate->mtime, dpp); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to delete obj index with ret=" << ret << dendl; + return ret; + } + } + return ret; +} + +void objexp_hint_entry::generate_test_instances(list& o) +{ + auto it = new objexp_hint_entry; + it->tenant = "tenant1"; + it->bucket_name = "bucket1"; + it->bucket_id = "1234"; + it->obj_key = rgw_obj_key("obj"); + o.push_back(it); + o.push_back(new objexp_hint_entry); +} + +void objexp_hint_entry::dump(Formatter *f) const +{ + f->open_object_section("objexp_hint_entry"); + encode_json("tenant", tenant, f); + encode_json("bucket_name", bucket_name, f); + encode_json("bucket_id", bucket_id, f); + encode_json("rgw_obj_key", obj_key, f); + utime_t ut(exp_time); + encode_json("exp_time", ut, f); + f->close_section(); +} + +void RGWOLHInfo::generate_test_instances(list &o) +{ + RGWOLHInfo *olh = new RGWOLHInfo; + olh->removed = false; + o.push_back(olh); + o.push_back(new RGWOLHInfo); +} + +void RGWOLHInfo::dump(Formatter *f) const +{ + encode_json("target", target, f); +} + +void RGWOLHPendingInfo::dump(Formatter *f) const +{ + utime_t ut(time); + encode_json("time", ut, f); +} + diff --git a/src/rgw/driver/rados/rgw_rados.h b/src/rgw/driver/rados/rgw_rados.h new file mode 100644 index 000000000000..a3258ac8b723 --- /dev/null +++ b/src/rgw/driver/rados/rgw_rados.h @@ -0,0 +1,1632 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include + +#include "include/rados/librados.hpp" +#include "include/Context.h" +#include "include/random.h" +#include "common/RefCountedObj.h" +#include "common/ceph_time.h" +#include "common/Timer.h" +#include "rgw_common.h" +#include "cls/rgw/cls_rgw_types.h" +#include 
"cls/version/cls_version_types.h" +#include "cls/log/cls_log_types.h" +#include "cls/timeindex/cls_timeindex_types.h" +#include "cls/otp/cls_otp_types.h" +#include "rgw_quota.h" +#include "rgw_log.h" +#include "rgw_metadata.h" +#include "rgw_meta_sync_status.h" +#include "rgw_period_puller.h" +#include "rgw_obj_manifest.h" +#include "rgw_sync_module.h" +#include "rgw_trim_bilog.h" +#include "rgw_service.h" +#include "rgw_sal.h" +#include "rgw_aio.h" +#include "rgw_d3n_cacherequest.h" + +#include "services/svc_rados.h" +#include "services/svc_bi_rados.h" +#include "common/Throttle.h" +#include "common/ceph_mutex.h" +#include "rgw_cache.h" +#include "rgw_sal_fwd.h" + +struct D3nDataCache; + +class RGWWatcher; +class ACLOwner; +class RGWGC; +class RGWMetaNotifier; +class RGWDataNotifier; +class RGWLC; +class RGWObjectExpirer; +class RGWMetaSyncProcessorThread; +class RGWDataSyncProcessorThread; +class RGWSyncLogTrimThread; +class RGWSyncTraceManager; +struct RGWZoneGroup; +struct RGWZoneParams; +class RGWReshard; +class RGWReshardWait; + +struct get_obj_data; + +/* flags for put_obj_meta() */ +#define PUT_OBJ_CREATE 0x01 +#define PUT_OBJ_EXCL 0x02 +#define PUT_OBJ_CREATE_EXCL (PUT_OBJ_CREATE | PUT_OBJ_EXCL) + +static inline void prepend_bucket_marker(const rgw_bucket& bucket, const std::string& orig_oid, std::string& oid) +{ + if (bucket.marker.empty() || orig_oid.empty()) { + oid = orig_oid; + } else { + oid = bucket.marker; + oid.append("_"); + oid.append(orig_oid); + } +} + +static inline void get_obj_bucket_and_oid_loc(const rgw_obj& obj, std::string& oid, std::string& locator) +{ + const rgw_bucket& bucket = obj.bucket; + prepend_bucket_marker(bucket, obj.get_oid(), oid); + const std::string& loc = obj.key.get_loc(); + if (!loc.empty()) { + prepend_bucket_marker(bucket, loc, locator); + } else { + locator.clear(); + } +} + +struct RGWOLHInfo { + rgw_obj target; + bool removed; + + RGWOLHInfo() : removed(false) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(target, bl); + encode(removed, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(target, bl); + decode(removed, bl); + DECODE_FINISH(bl); + } + static void generate_test_instances(std::list& o); + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(RGWOLHInfo) + +struct RGWOLHPendingInfo { + ceph::real_time time; + + RGWOLHPendingInfo() {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(time, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(time, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; +}; +WRITE_CLASS_ENCODER(RGWOLHPendingInfo) + +struct RGWUsageBatch { + std::map m; + + void insert(ceph::real_time& t, rgw_usage_log_entry& entry, bool *account) { + bool exists = m.find(t) != m.end(); + *account = !exists; + m[t].aggregate(entry); + } +}; + +struct RGWCloneRangeInfo { + rgw_obj src; + off_t src_ofs; + off_t dst_ofs; + uint64_t len; +}; + +class RGWFetchObjFilter { +public: + virtual ~RGWFetchObjFilter() {} + + virtual int filter(CephContext *cct, + const rgw_obj_key& source_key, + const RGWBucketInfo& dest_bucket_info, + std::optional dest_placement_rule, + const std::map& obj_attrs, + std::optional *poverride_owner, + const rgw_placement_rule **prule) = 0; +}; + +class RGWFetchObjFilter_Default : public RGWFetchObjFilter { +protected: + rgw_placement_rule dest_rule; +public: + RGWFetchObjFilter_Default() {} + + int 
filter(CephContext *cct, + const rgw_obj_key& source_key, + const RGWBucketInfo& dest_bucket_info, + std::optional dest_placement_rule, + const std::map& obj_attrs, + std::optional *poverride_owner, + const rgw_placement_rule **prule) override; +}; + +struct RGWObjStateManifest { + RGWObjState state; + std::optional manifest; +}; + +class RGWObjectCtx { + rgw::sal::Driver* driver; + ceph::shared_mutex lock = ceph::make_shared_mutex("RGWObjectCtx"); + + std::map objs_state; +public: + explicit RGWObjectCtx(rgw::sal::Driver* _driver) : driver(_driver) {} + RGWObjectCtx(RGWObjectCtx& _o) { + std::unique_lock wl{lock}; + this->driver = _o.driver; + this->objs_state = _o.objs_state; + } + + rgw::sal::Driver* get_driver() { + return driver; + } + + RGWObjStateManifest *get_state(const rgw_obj& obj); + + void set_compressed(const rgw_obj& obj); + void set_atomic(rgw_obj& obj); + void set_prefetch_data(const rgw_obj& obj); + void invalidate(const rgw_obj& obj); +}; + + +struct RGWRawObjState { + rgw_raw_obj obj; + bool has_attrs{false}; + bool exists{false}; + uint64_t size{0}; + ceph::real_time mtime; + uint64_t epoch{0}; + bufferlist obj_tag; + bool has_data{false}; + bufferlist data; + bool prefetch_data{false}; + uint64_t pg_ver{0}; + + /* important! don't forget to update copy constructor */ + + RGWObjVersionTracker objv_tracker; + + std::map attrset; + RGWRawObjState() {} + RGWRawObjState(const RGWRawObjState& rhs) : obj (rhs.obj) { + has_attrs = rhs.has_attrs; + exists = rhs.exists; + size = rhs.size; + mtime = rhs.mtime; + epoch = rhs.epoch; + if (rhs.obj_tag.length()) { + obj_tag = rhs.obj_tag; + } + has_data = rhs.has_data; + if (rhs.data.length()) { + data = rhs.data; + } + prefetch_data = rhs.prefetch_data; + pg_ver = rhs.pg_ver; + objv_tracker = rhs.objv_tracker; + } +}; + +struct RGWPoolIterCtx { + librados::IoCtx io_ctx; + librados::NObjectIterator iter; +}; + +struct RGWListRawObjsCtx { + bool initialized; + RGWPoolIterCtx iter_ctx; + + RGWListRawObjsCtx() : initialized(false) {} +}; + +struct objexp_hint_entry { + std::string tenant; + std::string bucket_name; + std::string bucket_id; + rgw_obj_key obj_key; + ceph::real_time exp_time; + + void encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + encode(bucket_name, bl); + encode(bucket_id, bl); + encode(obj_key, bl); + encode(exp_time, bl); + encode(tenant, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator& bl) { + // XXX Do we want DECODE_START_LEGACY_COMPAT_LEN(2, 1, 1, bl); ? 
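
The tenant field illustrates the usual versioned-encoding pattern: bump the version in ENCODE_START, append the new field last, and guard its decode on struct_v. Reduced to a minimal hypothetical struct mirroring objexp_hint_entry's v2 tenant field:

#include <string>
#include "include/encoding.h"

struct hint_min { // illustrative stand-in, not objexp_hint_entry itself
  std::string bucket_name; // present since v1
  std::string tenant;      // added in v2

  void encode(bufferlist& bl) const {
    ENCODE_START(2, 1, bl); // current version 2, compat back to 1
    encode(bucket_name, bl);
    encode(tenant, bl);     // the new v2 field is appended last
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(2, bl);    // declares struct_v for the guard below
    decode(bucket_name, bl);
    if (struct_v >= 2) {
      decode(tenant, bl);
    } else {
      tenant.clear();       // sane default when reading a v1 blob
    }
    DECODE_FINISH(bl);
  }
};
WRITE_CLASS_ENCODER(hint_min)
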
+ DECODE_START(2, bl); + decode(bucket_name, bl); + decode(bucket_id, bl); + decode(obj_key, bl); + decode(exp_time, bl); + if (struct_v >= 2) { + decode(tenant, bl); + } else { + tenant.clear(); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + static void generate_test_instances(std::list& o); +}; +WRITE_CLASS_ENCODER(objexp_hint_entry) + +class RGWMetaSyncStatusManager; +class RGWDataSyncStatusManager; +class RGWCoroutinesManagerRegistry; + +class RGWGetDirHeader_CB; +class RGWGetUserHeader_CB; +namespace rgw { namespace sal { + class RadosStore; + class MPRadosSerializer; + class LCRadosSerializer; +} } + +class RGWAsyncRadosProcessor; + +template +class RGWChainedCacheImpl; + +struct bucket_info_entry { + RGWBucketInfo info; + real_time mtime; + std::map attrs; +}; + +struct tombstone_entry; + +template +class lru_map; +using tombstone_cache_t = lru_map; + +class RGWIndexCompletionManager; + +class RGWRados +{ + friend class RGWGC; + friend class RGWMetaNotifier; + friend class RGWDataNotifier; + friend class RGWObjectExpirer; + friend class RGWMetaSyncProcessorThread; + friend class RGWDataSyncProcessorThread; + friend class RGWReshard; + friend class RGWBucketReshard; + friend class RGWBucketReshardLock; + friend class BucketIndexLockGuard; + friend class rgw::sal::MPRadosSerializer; + friend class rgw::sal::LCRadosSerializer; + friend class rgw::sal::RadosStore; + + /** Open the pool used as root for this gateway */ + int open_root_pool_ctx(const DoutPrefixProvider *dpp); + int open_gc_pool_ctx(const DoutPrefixProvider *dpp); + int open_lc_pool_ctx(const DoutPrefixProvider *dpp); + int open_objexp_pool_ctx(const DoutPrefixProvider *dpp); + int open_reshard_pool_ctx(const DoutPrefixProvider *dpp); + int open_notif_pool_ctx(const DoutPrefixProvider *dpp); + + int open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx& io_ctx, + bool mostly_omap); + + + ceph::mutex lock = ceph::make_mutex("rados_timer_lock"); + SafeTimer *timer; + + rgw::sal::RadosStore* driver = nullptr; + RGWGC *gc = nullptr; + RGWLC *lc; + RGWObjectExpirer *obj_expirer; + bool use_gc_thread; + bool use_lc_thread; + bool quota_threads; + bool run_sync_thread; + bool run_reshard_thread; + + RGWMetaNotifier *meta_notifier; + RGWDataNotifier *data_notifier; + RGWMetaSyncProcessorThread *meta_sync_processor_thread; + RGWSyncTraceManager *sync_tracer = nullptr; + std::map data_sync_processor_threads; + + boost::optional bucket_trim; + RGWSyncLogTrimThread *sync_log_trimmer{nullptr}; + + ceph::mutex meta_sync_thread_lock = ceph::make_mutex("meta_sync_thread_lock"); + ceph::mutex data_sync_thread_lock = ceph::make_mutex("data_sync_thread_lock"); + + librados::IoCtx root_pool_ctx; // .rgw + + double inject_notify_timeout_probability = 0; + unsigned max_notify_retries = 0; + + friend class RGWWatcher; + + ceph::mutex bucket_id_lock = ceph::make_mutex("rados_bucket_id"); + + // This field represents the number of bucket index object shards + uint32_t bucket_index_max_shards; + + std::string get_cluster_fsid(const DoutPrefixProvider *dpp, optional_yield y); + + int get_obj_head_ref(const DoutPrefixProvider *dpp, const rgw_placement_rule& target_placement_rule, const rgw_obj& obj, rgw_rados_ref *ref); + int get_obj_head_ref(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref); + int get_system_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref); + uint64_t max_bucket_id; + + int 
get_olh_target_state(const DoutPrefixProvider *dpp, RGWObjectCtx& rctx, + RGWBucketInfo& bucket_info, rgw::sal::Object* obj, + RGWObjState *olh_state, RGWObjState **target_state, + RGWObjManifest **target_manifest, optional_yield y); + int get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, RGWObjState **state, RGWObjManifest** manifest, + bool follow_olh, optional_yield y, bool assume_noent = false); + int append_atomic_test(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, + librados::ObjectOperation& op, RGWObjState **state, + RGWObjManifest** pmanifest, optional_yield y); + + int update_placement_map(); + int store_bucket_info(RGWBucketInfo& info, std::map *pattrs, RGWObjVersionTracker *objv_tracker, bool exclusive); + + void remove_rgw_head_obj(librados::ObjectWriteOperation& op); + void cls_obj_check_prefix_exist(librados::ObjectOperation& op, const std::string& prefix, bool fail_if_exist); + void cls_obj_check_mtime(librados::ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type); +protected: + CephContext *cct; + + librados::Rados rados; + + using RGWChainedCacheImpl_bucket_info_entry = RGWChainedCacheImpl; + RGWChainedCacheImpl_bucket_info_entry *binfo_cache; + + tombstone_cache_t *obj_tombstone_cache; + + librados::IoCtx gc_pool_ctx; // .rgw.gc + librados::IoCtx lc_pool_ctx; // .rgw.lc + librados::IoCtx objexp_pool_ctx; + librados::IoCtx reshard_pool_ctx; + librados::IoCtx notif_pool_ctx; // .rgw.notif + + bool pools_initialized; + + RGWQuotaHandler *quota_handler; + + RGWCoroutinesManagerRegistry *cr_registry; + + RGWSyncModuleInstanceRef sync_module; + bool writeable_zone{false}; + + RGWIndexCompletionManager *index_completion_manager{nullptr}; + + bool use_cache{false}; + bool use_gc{true}; + bool use_datacache{false}; + + int get_obj_head_ioctx(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx); +public: + RGWRados(): timer(NULL), + gc(NULL), lc(NULL), obj_expirer(NULL), use_gc_thread(false), use_lc_thread(false), quota_threads(false), + run_sync_thread(false), run_reshard_thread(false), meta_notifier(NULL), + data_notifier(NULL), meta_sync_processor_thread(NULL), + bucket_index_max_shards(0), + max_bucket_id(0), cct(NULL), + binfo_cache(NULL), obj_tombstone_cache(nullptr), + pools_initialized(false), + quota_handler(NULL), + cr_registry(NULL), + pctl(&ctl), + reshard(NULL) {} + + RGWRados& set_use_cache(bool status) { + use_cache = status; + return *this; + } + + RGWRados& set_use_gc(bool status) { + use_gc = status; + return *this; + } + + RGWRados& set_use_datacache(bool status) { + use_datacache = status; + return *this; + } + + bool get_use_datacache() { + return use_datacache; + } + + RGWLC *get_lc() { + return lc; + } + + RGWGC *get_gc() { + return gc; + } + + RGWRados& set_run_gc_thread(bool _use_gc_thread) { + use_gc_thread = _use_gc_thread; + return *this; + } + + RGWRados& set_run_lc_thread(bool _use_lc_thread) { + use_lc_thread = _use_lc_thread; + return *this; + } + + RGWRados& set_run_quota_threads(bool _run_quota_threads) { + quota_threads = _run_quota_threads; + return *this; + } + + RGWRados& set_run_sync_thread(bool _run_sync_thread) { + run_sync_thread = _run_sync_thread; + return *this; + } + + RGWRados& set_run_reshard_thread(bool _run_reshard_thread) { + run_reshard_thread = _run_reshard_thread; + return *this; + } + + librados::IoCtx* get_lc_pool_ctx() { + 
return &lc_pool_ctx; + } + + librados::IoCtx& get_notif_pool_ctx() { + return notif_pool_ctx; + } + + void set_context(CephContext *_cct) { + cct = _cct; + } + void set_store(rgw::sal::RadosStore* _driver) { + driver = _driver; + } + + RGWServices svc; + RGWCtl ctl; + + RGWCtl *pctl{nullptr}; + + /** + * AmazonS3 errors contain a HostId string, but is an opaque base64 blob; we + * try to be more transparent. This has a wrapper so we can update it when zonegroup/zone are changed. + */ + std::string host_id; + + RGWReshard *reshard; + std::shared_ptr reshard_wait; + + virtual ~RGWRados() = default; + + tombstone_cache_t *get_tombstone_cache() { + return obj_tombstone_cache; + } + const RGWSyncModuleInstanceRef& get_sync_module() { + return sync_module; + } + RGWSyncTraceManager *get_sync_tracer() { + return sync_tracer; + } + + int get_required_alignment(const DoutPrefixProvider *dpp, const rgw_pool& pool, uint64_t *alignment); + void get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size); + int get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment = nullptr); + int get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment = nullptr); + + uint32_t get_max_bucket_shards() { + return RGWSI_BucketIndex_RADOS::shards_max(); + } + + + int get_raw_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref); + + int list_raw_objects_init(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& marker, RGWListRawObjsCtx *ctx); + int list_raw_objects_next(const DoutPrefixProvider *dpp, const std::string& prefix_filter, int max, + RGWListRawObjsCtx& ctx, std::list& oids, + bool *is_truncated); + int list_raw_objects(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& prefix_filter, int max, + RGWListRawObjsCtx& ctx, std::list& oids, + bool *is_truncated); + std::string list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx); + + CephContext *ctx() { return cct; } + /** do all necessary setup of the storage device */ + int init_begin(CephContext *_cct, const DoutPrefixProvider *dpp) { + set_context(_cct); + return init_begin(dpp); + } + /** Initialize the RADOS instance and prepare to do other ops */ + int init_svc(bool raw, const DoutPrefixProvider *dpp); + int init_ctl(const DoutPrefixProvider *dpp); + virtual int init_rados(); + int init_begin(const DoutPrefixProvider *dpp); + int init_complete(const DoutPrefixProvider *dpp); + void finalize(); + + int register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type, const std::map& meta); + int update_service_map(const DoutPrefixProvider *dpp, std::map&& status); + + /// list logs + int log_list_init(const DoutPrefixProvider *dpp, const std::string& prefix, RGWAccessHandle *handle); + int log_list_next(RGWAccessHandle handle, std::string *name); + + /// remove log + int log_remove(const DoutPrefixProvider *dpp, const std::string& name); + + /// show log + int log_show_init(const DoutPrefixProvider *dpp, const std::string& name, RGWAccessHandle *handle); + int log_show_next(const DoutPrefixProvider *dpp, RGWAccessHandle handle, rgw_log_entry *entry); + + // log bandwidth info + int log_usage(const DoutPrefixProvider *dpp, std::map& usage_info); + int read_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const std::string& bucket_name, uint64_t start_epoch, uint64_t end_epoch, + 
uint32_t max_entries, bool *is_truncated, RGWUsageIter& read_iter, std::map<rgw_user_bucket, rgw_usage_log_entry>& usage); + int trim_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const std::string& bucket_name, uint64_t start_epoch, uint64_t end_epoch); + int clear_usage(const DoutPrefixProvider *dpp); + + int create_pool(const DoutPrefixProvider *dpp, const rgw_pool& pool); + + void create_bucket_id(std::string *bucket_id); + + bool get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool); + bool obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj); + + int create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket, + const std::string& zonegroup_id, + const rgw_placement_rule& placement_rule, + const std::string& swift_ver_location, + const RGWQuotaInfo * pquota_info, + std::map<std::string, bufferlist>& attrs, + RGWBucketInfo& bucket_info, + obj_version *pobjv, + obj_version *pep_objv, + ceph::real_time creation_time, + rgw_bucket *master_bucket, + uint32_t *master_num_shards, + optional_yield y, + const DoutPrefixProvider *dpp, + bool exclusive = true); + + RGWCoroutinesManagerRegistry *get_cr_registry() { return cr_registry; } + + struct BucketShard { + RGWRados *store; + rgw_bucket bucket; + int shard_id; + RGWSI_RADOS::Obj bucket_obj; + + explicit BucketShard(RGWRados *_store) : store(_store), shard_id(-1) {} + int init(const rgw_bucket& _bucket, const rgw_obj& obj, + RGWBucketInfo* out, const DoutPrefixProvider *dpp); + int init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj); + int init(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& index, int sid); + + friend std::ostream& operator<<(std::ostream& out, const BucketShard& bs) { + out << "BucketShard:{ bucket=" << bs.bucket << + ", shard_id=" << bs.shard_id << + ", bucket_obj=" << bs.bucket_obj << "}"; + return out; + } + }; + + class Object { + RGWRados *store; + rgw::sal::Bucket* bucket; + RGWObjectCtx& ctx; + rgw::sal::Object* obj; + + BucketShard bs; + + RGWObjState *state; + RGWObjManifest *manifest; + + bool versioning_disabled; + + bool bs_initialized; + + const rgw_placement_rule *pmeta_placement_rule; + + protected: + int get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, RGWObjManifest **pmanifest, bool follow_olh, optional_yield y, bool assume_noent = false); + void invalidate_state(); + + int prepare_atomic_modification(const DoutPrefixProvider *dpp, librados::ObjectWriteOperation& op, bool reset_obj, const std::string *ptag, + const char *ifmatch, const char *ifnomatch, bool removal_op, bool modify_tail, optional_yield y); + int complete_atomic_modification(const DoutPrefixProvider *dpp); + + public: + Object(RGWRados *_store, rgw::sal::Bucket* _bucket, RGWObjectCtx& _ctx, rgw::sal::Object* _obj) : store(_store), bucket(_bucket), + ctx(_ctx), obj(_obj), bs(store), + state(NULL), manifest(nullptr), versioning_disabled(false), + bs_initialized(false), + pmeta_placement_rule(nullptr) {} + + RGWRados *get_store() { return store; } + rgw_obj get_obj() { return obj->get_obj(); } + RGWObjectCtx& get_ctx() { return ctx; } + RGWBucketInfo& get_bucket_info() { return bucket->get_info(); } + const std::string& get_instance() { return obj->get_instance(); } + rgw::sal::Object* get_target() { return obj; } + int get_manifest(const DoutPrefixProvider *dpp, RGWObjManifest **pmanifest, optional_yield y); + + int get_bucket_shard(BucketShard **pbs, const DoutPrefixProvider *dpp) { + if 
(!bs_initialized) { + int r = + bs.init(bucket->get_key(), obj->get_obj(), nullptr /* no RGWBucketInfo */, dpp); + if (r < 0) { + return r; + } + bs_initialized = true; + } + *pbs = &bs; + return 0; + } + + void set_versioning_disabled(bool status) { + versioning_disabled = status; + } + + bool versioning_enabled() { + return (!versioning_disabled && bucket->versioning_enabled()); + } + + void set_meta_placement_rule(const rgw_placement_rule *p) { + pmeta_placement_rule = p; + } + + const rgw_placement_rule& get_meta_placement_rule() { + return pmeta_placement_rule ? *pmeta_placement_rule : bucket->get_placement_rule(); + } + + struct Read { + RGWRados::Object *source; + + struct GetObjState { + std::map io_ctxs; + rgw_pool cur_pool; + librados::IoCtx *cur_ioctx{nullptr}; + rgw_obj obj; + rgw_raw_obj head_obj; + } state; + + struct ConditionParams { + const ceph::real_time *mod_ptr; + const ceph::real_time *unmod_ptr; + bool high_precision_time; + uint32_t mod_zone_id; + uint64_t mod_pg_ver; + const char *if_match; + const char *if_nomatch; + + ConditionParams() : + mod_ptr(NULL), unmod_ptr(NULL), high_precision_time(false), mod_zone_id(0), mod_pg_ver(0), + if_match(NULL), if_nomatch(NULL) {} + } conds; + + struct Params { + ceph::real_time *lastmod; + uint64_t *obj_size; + std::map *attrs; + rgw_obj *target_obj; + + Params() : lastmod(nullptr), obj_size(nullptr), attrs(nullptr), + target_obj(nullptr) {} + } params; + + explicit Read(RGWRados::Object *_source) : source(_source) {} + + int prepare(optional_yield y, const DoutPrefixProvider *dpp); + static int range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end); + int read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y, const DoutPrefixProvider *dpp); + int iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb, optional_yield y); + int get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest, optional_yield y); + }; + + struct Write { + RGWRados::Object *target; + + struct MetaParams { + ceph::real_time *mtime; + std::map* rmattrs; + const bufferlist *data; + RGWObjManifest *manifest; + const std::string *ptag; + std::list *remove_objs; + ceph::real_time set_mtime; + rgw_user owner; + RGWObjCategory category; + int flags; + const char *if_match; + const char *if_nomatch; + std::optional olh_epoch; + ceph::real_time delete_at; + bool canceled; + const std::string *user_data; + rgw_zone_set *zones_trace; + bool modify_tail; + bool completeMultipart; + bool appendable; + + MetaParams() : mtime(NULL), rmattrs(NULL), data(NULL), manifest(NULL), ptag(NULL), + remove_objs(NULL), category(RGWObjCategory::Main), flags(0), + if_match(NULL), if_nomatch(NULL), canceled(false), user_data(nullptr), zones_trace(nullptr), + modify_tail(false), completeMultipart(false), appendable(false) {} + } meta; + + explicit Write(RGWRados::Object *_target) : target(_target) {} + + int _do_write_meta(const DoutPrefixProvider *dpp, + uint64_t size, uint64_t accounted_size, + std::map& attrs, + bool modify_tail, bool assume_noent, + void *index_op, optional_yield y); + int write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size, + std::map& attrs, optional_yield y); + int write_data(const char *data, uint64_t ofs, uint64_t len, bool exclusive); + const req_state* get_req_state() { + return nullptr; /* XXX dang Only used by LTTng, and it handles null anyway */ + } + }; + + struct Delete { + RGWRados::Object *target; + + struct DeleteParams { + rgw_user bucket_owner; + int 
versioning_status; // versioning flags defined in enum RGWBucketFlags + ACLOwner obj_owner; // needed for creation of deletion marker + uint64_t olh_epoch; + std::string marker_version_id; + uint32_t bilog_flags; + std::list *remove_objs; + ceph::real_time expiration_time; + ceph::real_time unmod_since; + ceph::real_time mtime; /* for setting delete marker mtime */ + bool high_precision_time; + rgw_zone_set *zones_trace; + bool abortmp; + uint64_t parts_accounted_size; + + DeleteParams() : versioning_status(0), olh_epoch(0), bilog_flags(0), remove_objs(NULL), high_precision_time(false), zones_trace(nullptr), abortmp(false), parts_accounted_size(0) {} + } params; + + struct DeleteResult { + bool delete_marker; + std::string version_id; + + DeleteResult() : delete_marker(false) {} + } result; + + explicit Delete(RGWRados::Object *_target) : target(_target) {} + + int delete_obj(optional_yield y, const DoutPrefixProvider *dpp); + }; + + struct Stat { + RGWRados::Object *source; + + struct Result { + rgw_obj obj; + std::optional manifest; + uint64_t size{0}; + struct timespec mtime {}; + std::map attrs; + } result; + + struct State { + librados::IoCtx io_ctx; + librados::AioCompletion *completion; + int ret; + + State() : completion(NULL), ret(0) {} + } state; + + + explicit Stat(RGWRados::Object *_source) : source(_source) {} + + int stat_async(const DoutPrefixProvider *dpp); + int wait(const DoutPrefixProvider *dpp); + int stat(); + private: + int finish(const DoutPrefixProvider *dpp); + }; + }; + + class Bucket { + RGWRados *store; + RGWBucketInfo bucket_info; + rgw_bucket& bucket; + int shard_id; + + public: + Bucket(RGWRados *_store, const RGWBucketInfo& _bucket_info) : store(_store), bucket_info(_bucket_info), bucket(bucket_info.bucket), + shard_id(RGW_NO_SHARD) {} + RGWRados *get_store() { return store; } + rgw_bucket& get_bucket() { return bucket; } + RGWBucketInfo& get_bucket_info() { return bucket_info; } + + int update_bucket_id(const std::string& new_bucket_id, const DoutPrefixProvider *dpp); + + int get_shard_id() { return shard_id; } + void set_shard_id(int id) { + shard_id = id; + } + + class UpdateIndex { + RGWRados::Bucket *target; + std::string optag; + rgw_obj obj; + uint16_t bilog_flags{0}; + BucketShard bs; + bool bs_initialized{false}; + bool blind; + bool prepared{false}; + rgw_zone_set *zones_trace{nullptr}; + + int init_bs(const DoutPrefixProvider *dpp) { + int r = + bs.init(target->get_bucket(), obj, &target->bucket_info, dpp); + if (r < 0) { + return r; + } + bs_initialized = true; + return 0; + } + + void invalidate_bs() { + bs_initialized = false; + } + + int guard_reshard(const DoutPrefixProvider *dpp, const rgw_obj& obj_instance, BucketShard **pbs, std::function call); + public: + + UpdateIndex(RGWRados::Bucket *_target, const rgw_obj& _obj) : target(_target), obj(_obj), + bs(target->get_store()) { + blind = (target->get_bucket_info().layout.current_index.layout.type == rgw::BucketIndexType::Indexless); + } + + int get_bucket_shard(BucketShard **pbs, const DoutPrefixProvider *dpp) { + if (!bs_initialized) { + int r = init_bs(dpp); + if (r < 0) { + return r; + } + } + *pbs = &bs; + return 0; + } + + void set_bilog_flags(uint16_t flags) { + bilog_flags = flags; + } + + void set_zones_trace(rgw_zone_set *_zones_trace) { + zones_trace = _zones_trace; + } + + int prepare(const DoutPrefixProvider *dpp, RGWModifyOp, const std::string *write_tag, optional_yield y); + int complete(const DoutPrefixProvider *dpp, int64_t poolid, uint64_t epoch, uint64_t size, + uint64_t 
accounted_size, ceph::real_time& ut, + const std::string& etag, const std::string& content_type, + const std::string& storage_class, + bufferlist *acl_bl, RGWObjCategory category, + std::list *remove_objs, const std::string *user_data = nullptr, bool appendable = false); + int complete_del(const DoutPrefixProvider *dpp, + int64_t poolid, uint64_t epoch, + ceph::real_time& removed_mtime, /* mtime of removed object */ + std::list *remove_objs); + int cancel(const DoutPrefixProvider *dpp, + std::list *remove_objs); + + const std::string *get_optag() { return &optag; } + + bool is_prepared() { return prepared; } + }; // class UpdateIndex + + class List { + protected: + // absolute maximum number of objects that + // list_objects_(un)ordered can return + static constexpr int64_t bucket_list_objects_absolute_max = 25000; + + RGWRados::Bucket *target; + rgw_obj_key next_marker; + + int list_objects_ordered(const DoutPrefixProvider *dpp, + int64_t max, + std::vector *result, + std::map *common_prefixes, + bool *is_truncated, + optional_yield y); + int list_objects_unordered(const DoutPrefixProvider *dpp, + int64_t max, + std::vector *result, + std::map *common_prefixes, + bool *is_truncated, + optional_yield y); + + public: + + struct Params { + std::string prefix; + std::string delim; + rgw_obj_key marker; + rgw_obj_key end_marker; + std::string ns; + bool enforce_ns; + RGWAccessListFilter* access_list_filter; + RGWBucketListNameFilter force_check_filter; + bool list_versions; + bool allow_unordered; + + Params() : + enforce_ns(true), + access_list_filter(nullptr), + list_versions(false), + allow_unordered(false) + {} + } params; + + explicit List(RGWRados::Bucket *_target) : target(_target) {} + + int list_objects(const DoutPrefixProvider *dpp, int64_t max, + std::vector *result, + std::map *common_prefixes, + bool *is_truncated, + optional_yield y) { + if (params.allow_unordered) { + return list_objects_unordered(dpp, max, result, common_prefixes, + is_truncated, y); + } else { + return list_objects_ordered(dpp, max, result, common_prefixes, + is_truncated, y); + } + } + rgw_obj_key& get_next_marker() { + return next_marker; + } + }; // class List + }; // class Bucket + + int on_last_entry_in_listing(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + const std::string& obj_prefix, + const std::string& obj_delim, + std::function handler); + + bool swift_versioning_enabled(rgw::sal::Bucket* bucket) const; + + int swift_versioning_copy(RGWObjectCtx& obj_ctx, /* in/out */ + const rgw_user& user, /* in */ + rgw::sal::Bucket* bucket, /* in */ + rgw::sal::Object* obj, /* in */ + const DoutPrefixProvider *dpp, /* in/out */ + optional_yield y); /* in */ + int swift_versioning_restore(RGWObjectCtx& obj_ctx, /* in/out */ + const rgw_user& user, /* in */ + rgw::sal::Bucket* bucket, /* in */ + rgw::sal::Object* obj, /* in */ + bool& restored, /* out */ + const DoutPrefixProvider *dpp); /* in/out */ + int copy_obj_to_remote_dest(const DoutPrefixProvider *dpp, + RGWObjState *astate, + std::map& src_attrs, + RGWRados::Object::Read& read_op, + const rgw_user& user_id, + rgw::sal::Object* dest_obj, + ceph::real_time *mtime); + + enum AttrsMod { + ATTRSMOD_NONE = 0, + ATTRSMOD_REPLACE = 1, + ATTRSMOD_MERGE = 2 + }; + + D3nDataCache* d3n_data_cache{nullptr}; + + int rewrite_obj(rgw::sal::Object* obj, const DoutPrefixProvider *dpp, optional_yield y); + + int stat_remote_obj(const DoutPrefixProvider *dpp, + RGWObjectCtx& obj_ctx, + const rgw_user& user_id, + req_info *info, + const rgw_zone_id& 
source_zone, + rgw::sal::Object* src_obj, + const RGWBucketInfo *src_bucket_info, + real_time *src_mtime, + uint64_t *psize, + const real_time *mod_ptr, + const real_time *unmod_ptr, + bool high_precision_time, + const char *if_match, + const char *if_nomatch, + std::map *pattrs, + std::map *pheaders, + std::string *version_id, + std::string *ptag, + std::string *petag); + + int fetch_remote_obj(RGWObjectCtx& obj_ctx, + const rgw_user& user_id, + req_info *info, + const rgw_zone_id& source_zone, + rgw::sal::Object* dest_obj, + rgw::sal::Object* src_obj, + rgw::sal::Bucket* dest_bucket, + rgw::sal::Bucket* src_bucket, + std::optional dest_placement, + ceph::real_time *src_mtime, + ceph::real_time *mtime, + const ceph::real_time *mod_ptr, + const ceph::real_time *unmod_ptr, + bool high_precision_time, + const char *if_match, + const char *if_nomatch, + AttrsMod attrs_mod, + bool copy_if_newer, + rgw::sal::Attrs& attrs, + RGWObjCategory category, + std::optional olh_epoch, + ceph::real_time delete_at, + std::string *ptag, + std::string *petag, + void (*progress_cb)(off_t, void *), + void *progress_data, + const DoutPrefixProvider *dpp, + RGWFetchObjFilter *filter, + rgw_zone_set *zones_trace= nullptr, + std::optional* bytes_transferred = 0); + /** + * Copy an object. + * dest_obj: the object to copy into + * src_obj: the object to copy from + * attrs: usage depends on attrs_mod parameter + * attrs_mod: the modification mode of the attrs, may have the following values: + * ATTRSMOD_NONE - the attributes of the source object will be + * copied without modifications, attrs parameter is ignored; + * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs + * parameter, source object attributes are not copied; + * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes + * are overwritten by values contained in attrs parameter. + * Returns: 0 on success, -ERR# otherwise. + */ + int copy_obj(RGWObjectCtx& obj_ctx, + const rgw_user& user_id, + req_info *info, + const rgw_zone_id& source_zone, + rgw::sal::Object* dest_obj, + rgw::sal::Object* src_obj, + rgw::sal::Bucket* dest_bucket, + rgw::sal::Bucket* src_bucket, + const rgw_placement_rule& dest_placement, + ceph::real_time *src_mtime, + ceph::real_time *mtime, + const ceph::real_time *mod_ptr, + const ceph::real_time *unmod_ptr, + bool high_precision_time, + const char *if_match, + const char *if_nomatch, + AttrsMod attrs_mod, + bool copy_if_newer, + std::map& attrs, + RGWObjCategory category, + uint64_t olh_epoch, + ceph::real_time delete_at, + std::string *version_id, + std::string *ptag, + std::string *petag, + void (*progress_cb)(off_t, void *), + void *progress_data, + const DoutPrefixProvider *dpp, + optional_yield y); + + int copy_obj_data(RGWObjectCtx& obj_ctx, + rgw::sal::Bucket* bucket, + const rgw_placement_rule& dest_placement, + RGWRados::Object::Read& read_op, off_t end, + rgw::sal::Object* dest_obj, + ceph::real_time *mtime, + ceph::real_time set_mtime, + std::map& attrs, + uint64_t olh_epoch, + ceph::real_time delete_at, + std::string *petag, + const DoutPrefixProvider *dpp, + optional_yield y); + + int transition_obj(RGWObjectCtx& obj_ctx, + rgw::sal::Bucket* bucket, + rgw::sal::Object& obj, + const rgw_placement_rule& placement_rule, + const real_time& mtime, + uint64_t olh_epoch, + const DoutPrefixProvider *dpp, + optional_yield y); + + int check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y); + + /** + * Delete a bucket. 
+ * bucket: the name of the bucket to delete + * Returns 0 on success, -ERR# otherwise. + */ + int delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp, bool check_empty = true); + + void wakeup_meta_sync_shards(std::set& shard_ids); + + void wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, bc::flat_map >& entries); + + RGWMetaSyncStatusManager* get_meta_sync_manager(); + RGWDataSyncStatusManager* get_data_sync_manager(const rgw_zone_id& source_zone); + + int set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner, const DoutPrefixProvider *dpp); + int set_buckets_enabled(std::vector& buckets, bool enabled, const DoutPrefixProvider *dpp); + int bucket_suspended(const DoutPrefixProvider *dpp, rgw_bucket& bucket, bool *suspended); + + /** Delete an object.*/ + int delete_obj(rgw::sal::Driver* driver, + const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_owner, + const rgw_obj& src_obj, + int versioning_status, // versioning flags defined in enum RGWBucketFlags + uint16_t bilog_flags = 0, + const ceph::real_time& expiration_time = ceph::real_time(), + rgw_zone_set *zones_trace = nullptr); + int delete_obj(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_owner, + rgw::sal::Object* src_obj, + int versioning_status, // versioning flags defined in enum RGWBucketFlags + uint16_t bilog_flags = 0, + const ceph::real_time& expiration_time = ceph::real_time(), + rgw_zone_set *zones_trace = nullptr); + + int delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj); + + /** Remove an object from the bucket index */ + int delete_obj_index(const rgw_obj& obj, ceph::real_time mtime, const DoutPrefixProvider *dpp); + + /** + * Set an attr on an object. + * bucket: name of the bucket holding the object + * obj: name of the object to set the attr on + * name: the attr to set + * bl: the contents of the attr + * Returns: 0 on success, -ERR# otherwise. 
+ */ + int set_attr(const DoutPrefixProvider *dpp, RGWObjectCtx* ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, const char *name, bufferlist& bl); + + int set_attrs(const DoutPrefixProvider *dpp, void *ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, + std::map& attrs, + std::map* rmattrs, + optional_yield y); + + int get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, RGWObjState **state, RGWObjManifest** manifest, + bool follow_olh, optional_yield y, bool assume_noent = false); + int get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, RGWObjState **state, RGWObjManifest** manifest, optional_yield y) { + return get_obj_state(dpp, rctx, bucket_info, obj, state, manifest, true, y); + } + + using iterate_obj_cb = int (*)(const DoutPrefixProvider*, const rgw_raw_obj&, off_t, off_t, + off_t, bool, RGWObjState*, void*); + + int iterate_obj(const DoutPrefixProvider *dpp, RGWObjectCtx& ctx, RGWBucketInfo& bucket_info, + rgw::sal::Object* obj, off_t ofs, off_t end, + uint64_t max_chunk_size, iterate_obj_cb cb, void *arg, + optional_yield y); + + int append_atomic_test(const DoutPrefixProvider *dpp, const RGWObjState* astate, librados::ObjectOperation& op); + + virtual int get_obj_iterate_cb(const DoutPrefixProvider *dpp, + const rgw_raw_obj& read_obj, off_t obj_ofs, + off_t read_ofs, off_t len, bool is_head_obj, + RGWObjState *astate, void *arg); + + /** + * a simple object read without keeping state + */ + + int raw_obj_stat(const DoutPrefixProvider *dpp, + rgw_raw_obj& obj, uint64_t *psize, ceph::real_time *pmtime, uint64_t *epoch, + std::map *attrs, bufferlist *first_chunk, + RGWObjVersionTracker *objv_tracker, optional_yield y); + + int obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectWriteOperation *op); + int obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectReadOperation *op); + + int guard_reshard(const DoutPrefixProvider *dpp, + BucketShard *bs, + const rgw_obj& obj_instance, + RGWBucketInfo& bucket_info, + std::function call); + int block_while_resharding(RGWRados::BucketShard *bs, + const rgw_obj& obj_instance, + RGWBucketInfo& bucket_info, + optional_yield y, + const DoutPrefixProvider *dpp); + + void bucket_index_guard_olh_op(const DoutPrefixProvider *dpp, RGWObjState& olh_state, librados::ObjectOperation& op); + int olh_init_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::string *op_tag); + int olh_init_modification_impl(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::string *op_tag); + int bucket_index_link_olh(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, RGWObjState& olh_state, + const rgw_obj& obj_instance, bool delete_marker, + const std::string& op_tag, struct rgw_bucket_dir_entry_meta *meta, + uint64_t olh_epoch, + ceph::real_time unmod_since, bool high_precision_time, + rgw_zone_set *zones_trace = nullptr, + bool log_data_change = false); + int bucket_index_unlink_instance(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + const rgw_obj& obj_instance, + const std::string& op_tag, const std::string& olh_tag, + uint64_t olh_epoch, rgw_zone_set *zones_trace = nullptr); + int bucket_index_read_olh_log(const DoutPrefixProvider *dpp, + 
RGWBucketInfo& bucket_info, RGWObjState& state, + const rgw_obj& obj_instance, uint64_t ver_marker, + std::map > *log, bool *is_truncated); + int bucket_index_trim_olh_log(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& obj_state, const rgw_obj& obj_instance, uint64_t ver); + int bucket_index_clear_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance); + int apply_olh_log(const DoutPrefixProvider *dpp, RGWObjState& obj_state, RGWBucketInfo& bucket_info, const rgw::sal::Object* obj, + bufferlist& obj_tag, std::map >& log, + uint64_t *plast_ver, rgw_zone_set *zones_trace = nullptr); + int update_olh(const DoutPrefixProvider *dpp, RGWObjState *state, RGWBucketInfo& bucket_info, const rgw::sal::Object* obj, rgw_zone_set *zones_trace = nullptr); + int set_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta, + uint64_t olh_epoch, ceph::real_time unmod_since, bool high_precision_time, + optional_yield y, rgw_zone_set *zones_trace = nullptr, bool log_data_change = false); + int repair_olh(const DoutPrefixProvider *dpp, RGWObjState* state, const RGWBucketInfo& bucket_info, + const rgw_obj& obj); + int unlink_obj_instance(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, rgw::sal::Object* target_obj, + uint64_t olh_epoch, optional_yield y, rgw_zone_set *zones_trace = nullptr); + + void check_pending_olh_entries(const DoutPrefixProvider *dpp, std::map& pending_entries, std::map *rm_pending_entries); + int remove_olh_pending_entries(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::map& pending_attrs); + int follow_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjectCtx& ctx, RGWObjState *state, rgw::sal::Object* olh_obj, rgw_obj *target); + int get_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh); + + void gen_rand_obj_instance_name(rgw_obj_key *target_key); + void gen_rand_obj_instance_name(rgw_obj *target); + + int update_containers_stats(std::map& m, const DoutPrefixProvider *dpp); + int append_async(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, size_t size, bufferlist& bl); + +public: + void set_atomic(void *ctx, rgw_obj& obj) { + RGWObjectCtx *rctx = static_cast(ctx); + rctx->set_atomic(obj); + } + void set_prefetch_data(void *ctx, const rgw_obj& obj) { + RGWObjectCtx *rctx = static_cast(ctx); + rctx->set_prefetch_data(obj); + } + void set_compressed(void *ctx, const rgw_obj& obj) { + RGWObjectCtx *rctx = static_cast(ctx); + rctx->set_compressed(obj); + } + int decode_policy(const DoutPrefixProvider *dpp, bufferlist& bl, ACLOwner *owner); + int get_bucket_stats(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, std::string *bucket_ver, std::string *master_ver, + std::map& stats, std::string *max_marker, bool* syncstopped = NULL); + int get_bucket_stats_async(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetBucketStats_CB *cb); + + int put_bucket_instance_info(RGWBucketInfo& info, bool exclusive, ceph::real_time mtime, std::map *pattrs, const DoutPrefixProvider *dpp); + /* xxx dang obj_ctx -> svc */ + int get_bucket_instance_info(const std::string& meta_key, RGWBucketInfo& info, ceph::real_time *pmtime, 
std::map *pattrs, optional_yield y, const DoutPrefixProvider *dpp); + int get_bucket_instance_info(const rgw_bucket& bucket, RGWBucketInfo& info, ceph::real_time *pmtime, std::map *pattrs, optional_yield y, const DoutPrefixProvider *dpp); + + static void make_bucket_entry_name(const std::string& tenant_name, const std::string& bucket_name, std::string& bucket_entry); + + int get_bucket_info(RGWServices *svc, + const std::string& tenant_name, const std::string& bucket_name, + RGWBucketInfo& info, + ceph::real_time *pmtime, optional_yield y, + const DoutPrefixProvider *dpp, std::map *pattrs = NULL); + + // Returns 0 on successful refresh. Returns error code if there was + // an error or the version stored on the OSD is the same as that + // presented in the BucketInfo structure. + // + int try_refresh_bucket_info(RGWBucketInfo& info, + ceph::real_time *pmtime, + const DoutPrefixProvider *dpp, + std::map *pattrs = nullptr); + + int put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, ceph::real_time mtime, obj_version *pep_objv, + std::map *pattrs, bool create_entry_point, + const DoutPrefixProvider *dpp); + + int cls_obj_prepare_op(const DoutPrefixProvider *dpp, BucketShard& bs, RGWModifyOp op, std::string& tag, rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *zones_trace = nullptr); + int cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, std::string& tag, int64_t pool, uint64_t epoch, + rgw_bucket_dir_entry& ent, RGWObjCategory category, std::list *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr); + int cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, std::string& tag, int64_t pool, uint64_t epoch, rgw_bucket_dir_entry& ent, + RGWObjCategory category, std::list *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr); + int cls_obj_complete_del(BucketShard& bs, std::string& tag, int64_t pool, uint64_t epoch, rgw_obj& obj, + ceph::real_time& removed_mtime, std::list *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr); + int cls_obj_complete_cancel(BucketShard& bs, std::string& tag, rgw_obj& obj, + std::list *remove_objs, + uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr); + int cls_obj_set_bucket_tag_timeout(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, uint64_t timeout); + + using ent_map_t = + boost::container::flat_map; + + int cls_bucket_list_ordered(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& idx_layout, + const int shard_id, + const rgw_obj_index_key& start_after, + const std::string& prefix, + const std::string& delimiter, + const uint32_t num_entries, + const bool list_versions, + const uint16_t exp_factor, // 0 means ignore + ent_map_t& m, + bool* is_truncated, + bool* cls_filtered, + rgw_obj_index_key *last_entry, + optional_yield y, + RGWBucketListNameFilter force_check_filter = {}); + int cls_bucket_list_unordered(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& idx_layout, + int shard_id, + const rgw_obj_index_key& start_after, + const std::string& prefix, + uint32_t num_entries, + bool list_versions, + std::vector& ent_list, + bool *is_truncated, + rgw_obj_index_key *last_entry, + optional_yield y, + RGWBucketListNameFilter force_check_filter = {}); + int cls_bucket_head(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& idx_layout, + int shard_id, std::vector& 
headers, + std::map *bucket_instance_ids = NULL); + int cls_bucket_head_async(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& idx_layout, + int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio); + int bi_get_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_bucket_dir_entry *dirent); + int bi_get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_bucket_olh_entry *olh); + int bi_get(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry); + void bi_put(librados::ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry); + int bi_put(BucketShard& bs, rgw_cls_bi_entry& entry); + int bi_put(const DoutPrefixProvider *dpp, rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry); + int bi_list(const DoutPrefixProvider *dpp, + const RGWBucketInfo& bucket_info, + int shard_id, + const std::string& filter_obj, + const std::string& marker, + uint32_t max, + std::list *entries, + bool *is_truncated); + int bi_list(BucketShard& bs, const std::string& filter_obj, const std::string& marker, uint32_t max, std::list *entries, bool *is_truncated); + int bi_list(const DoutPrefixProvider *dpp, rgw_bucket& bucket, const std::string& obj_name, const std::string& marker, uint32_t max, + std::list *entries, bool *is_truncated); + int bi_remove(const DoutPrefixProvider *dpp, BucketShard& bs); + + int cls_obj_usage_log_add(const DoutPrefixProvider *dpp, const std::string& oid, rgw_usage_log_info& info); + int cls_obj_usage_log_read(const DoutPrefixProvider *dpp, const std::string& oid, const std::string& user, const std::string& bucket, uint64_t start_epoch, + uint64_t end_epoch, uint32_t max_entries, std::string& read_iter, + std::map& usage, bool *is_truncated); + int cls_obj_usage_log_trim(const DoutPrefixProvider *dpp, const std::string& oid, const std::string& user, const std::string& bucket, uint64_t start_epoch, + uint64_t end_epoch); + int cls_obj_usage_log_clear(const DoutPrefixProvider *dpp, std::string& oid); + + int get_target_shard_id(const rgw::bucket_index_normal_layout& layout, const std::string& obj_key, int *shard_id); + + int lock_exclusive(const rgw_pool& pool, const std::string& oid, ceph::timespan& duration, rgw_zone_id& zone_id, std::string& owner_id); + int unlock(const rgw_pool& pool, const std::string& oid, rgw_zone_id& zone_id, std::string& owner_id); + + void update_gc_chain(const DoutPrefixProvider *dpp, rgw_obj head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain); + std::tuple> send_chain_to_gc(cls_rgw_obj_chain& chain, const std::string& tag); + void delete_objs_inline(const DoutPrefixProvider *dpp, cls_rgw_obj_chain& chain, const std::string& tag); + int gc_operate(const DoutPrefixProvider *dpp, std::string& oid, librados::ObjectWriteOperation *op); + int gc_aio_operate(const std::string& oid, librados::AioCompletion *c, + librados::ObjectWriteOperation *op); + int gc_operate(const DoutPrefixProvider *dpp, std::string& oid, librados::ObjectReadOperation *op, bufferlist *pbl); + + int list_gc_objs(int *index, std::string& marker, uint32_t max, bool expired_only, std::list& result, bool *truncated, bool& processing_queue); + int process_gc(bool expired_only); + bool process_expire_objects(const DoutPrefixProvider *dpp); + int defer_gc(const DoutPrefixProvider *dpp, RGWObjectCtx* ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, 
optional_yield y); + + int process_lc(const std::unique_ptr& optional_bucket); + int list_lc_progress(std::string& marker, uint32_t max_entries, + std::vector>& progress_map, + int& index); + + int bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, + std::map *existing_stats, + std::map *calculated_stats); + int bucket_rebuild_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info); + int bucket_set_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry); + int remove_objs_from_index(const DoutPrefixProvider *dpp, + RGWBucketInfo& bucket_info, + const std::list& oid_list); + int move_rados_obj(const DoutPrefixProvider *dpp, + librados::IoCtx& src_ioctx, + const std::string& src_oid, const std::string& src_locator, + librados::IoCtx& dst_ioctx, + const std::string& dst_oid, const std::string& dst_locator); + int fix_head_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key); + int fix_tail_obj_locator(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, + rgw_obj_key& key, bool fix, bool *need_fix, optional_yield y); + + int check_quota(const DoutPrefixProvider *dpp, const rgw_user& bucket_owner, rgw_bucket& bucket, + RGWQuota& quota, uint64_t obj_size, + optional_yield y, bool check_size_only = false); + + int check_bucket_shards(const RGWBucketInfo& bucket_info, const rgw_bucket& bucket, + uint64_t num_objs, const DoutPrefixProvider *dpp); + + int add_bucket_to_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, uint32_t new_num_shards); + + uint64_t instance_id(); + + librados::Rados* get_rados_handle(); + + int delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, std::list& handles); + int delete_obj_aio(const DoutPrefixProvider *dpp, const rgw_obj& obj, RGWBucketInfo& info, RGWObjState *astate, + std::list& handles, bool keep_index_consistent, + optional_yield y); + + private: + /** + * Check the actual on-disk state of the object specified + * by list_state, and fill in the time and size of object. + * Then append any changes to suggested_updates for + * the rgw class' dir_suggest_changes function. + * + * Note that this can maul list_state; don't use it afterwards. Also + * it expects object to already be filled in from list_state; it only + * sets the size and mtime. + * + * Returns 0 on success, -ENOENT if the object doesn't exist on disk, + * and -errno on other failures. (-ENOENT is not a failure, and it + * will encode that info as a suggested update.) + */ + int check_disk_state(const DoutPrefixProvider *dpp, + librados::IoCtx io_ctx, + RGWBucketInfo& bucket_info, + rgw_bucket_dir_entry& list_state, + rgw_bucket_dir_entry& object, + bufferlist& suggested_updates, + optional_yield y); + + /** + * Init pool iteration + * pool: pool to use for the ctx initialization + * ctx: context object to use for the iteration + * Returns: 0 on success, -ERR# otherwise. + */ + int pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, RGWPoolIterCtx& ctx); + + /** + * Init pool iteration + * pool: pool to use + * cursor: position to start iteration + * ctx: context object to use for the iteration + * Returns: 0 on success, -ERR# otherwise. 
+   */
+  int pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& cursor, RGWPoolIterCtx& ctx);
+
+  /**
+   * Get pool iteration position
+   * ctx: context object to use for the iteration
+   * Returns: std::string representation of position
+   */
+  std::string pool_iterate_get_cursor(RGWPoolIterCtx& ctx);
+
+  /**
+   * Iterate over a pool, returning object names; use optional filter
+   * ctx: iteration context, initialized with pool_iterate_begin()
+   * num: max number of objects to return
+   * objs: a vector that the results will append into
+   * is_truncated: if not NULL, will hold true iff the listing was truncated (more objects remain)
+   * filter: if not NULL, will be used to filter returned objects
+   * Returns: 0 on success, -ERR# otherwise.
+   */
+  int pool_iterate(const DoutPrefixProvider *dpp, RGWPoolIterCtx& ctx, uint32_t num,
+                   std::vector<rgw_bucket_dir_entry>& objs,
+                   bool *is_truncated, RGWAccessListFilter *filter);
+
+  uint64_t next_bucket_id();
+
+  /**
+   * This is broken out to facilitate unit testing.
+   */
+  static uint32_t calc_ordered_bucket_list_per_shard(uint32_t num_entries,
+                                                     uint32_t num_shards);
+};
+
+
+struct get_obj_data {
+  RGWRados* rgwrados;
+  RGWGetDataCB* client_cb = nullptr;
+  rgw::Aio* aio;
+  uint64_t offset; // next offset to write to client
+  rgw::AioResultList completed; // completed read results, sorted by offset
+  optional_yield yield;
+
+  get_obj_data(RGWRados* rgwrados, RGWGetDataCB* cb, rgw::Aio* aio,
+               uint64_t offset, optional_yield yield)
+    : rgwrados(rgwrados), client_cb(cb), aio(aio), offset(offset), yield(yield) {}
+  ~get_obj_data() {
+    if (rgwrados->get_use_datacache()) {
+      const std::lock_guard l(d3n_get_data.d3n_lock);
+    }
+  }
+
+  D3nGetObjData d3n_get_data;
+  std::atomic_bool d3n_bypass_cache_write{false};
+
+  int flush(rgw::AioResultList&& results);
+
+  void cancel() {
+    // wait for all completions to drain and ignore the results
+    aio->drain();
+  }
+
+  int drain() {
+    auto c = aio->wait();
+    while (!c.empty()) {
+      int r = flush(std::move(c));
+      if (r < 0) {
+        cancel();
+        return r;
+      }
+      c = aio->wait();
+    }
+    return flush(std::move(c));
+  }
+};
diff --git a/src/rgw/driver/rados/rgw_reshard.cc b/src/rgw/driver/rados/rgw_reshard.cc
new file mode 100644
index 000000000000..b2dec7af1c86
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_reshard.cc
@@ -0,0 +1,1407 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <limits>
+#include <sstream>
+
+#include "rgw_zone.h"
+#include "driver/rados/rgw_bucket.h"
+#include "rgw_reshard.h"
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+#include "cls/rgw/cls_rgw_client.h"
+#include "cls/lock/cls_lock_client.h"
+#include "common/errno.h"
+#include "common/ceph_json.h"
+
+#include "common/dout.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_tier_rados.h"
+#include "services/svc_bilog_rados.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+const string reshard_oid_prefix = "reshard.";
+const string reshard_lock_name = "reshard_process";
+const string bucket_instance_lock_name = "bucket_instance_lock";
+
+/* All primes up to 2000 used to attempt to make dynamic sharding use
+ * a prime number of shards. Note: this list also includes 1 for when
+ * 1 shard is the most appropriate, even though 1 is not prime.
+ */
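The table above lets resharding land on a prime shard count, which spreads a hash-distributed keyspace across shards more evenly than an arbitrary count would. A minimal sketch of how such a table can be consulted, assuming a free-standing helper; pick_prime_shards and its clamping behavior are illustrative only and are not the selection logic this patch ships:

  #include <algorithm>
  #include <cstdint>
  #include <initializer_list>
  #include <iterator>

  // Return the smallest tabulated prime >= want, clamping to the largest
  // entry when the request exceeds the table. Assumes a non-empty,
  // ascending table, like reshard_primes above.
  static uint32_t pick_prime_shards(uint32_t want,
                                    const std::initializer_list<uint32_t>& primes)
  {
    auto it = std::lower_bound(primes.begin(), primes.end(), want);
    if (it == primes.end()) {
      return *std::prev(primes.end()); // no larger prime listed; clamp
    }
    return *it;
  }

Under that sketch, a request for 150 shards would come back as 151, the next prime in the table.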
+const std::initializer_list<uint16_t> RGWBucketReshard::reshard_primes = {
+  1, 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61,
+  67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137,
+  139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211,
+  223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283,
+  293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379,
+  383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461,
+  463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541, 547, 557, 563,
+  569, 571, 577, 587, 593, 599, 601, 607, 613, 617, 619, 631, 641, 643,
+  647, 653, 659, 661, 673, 677, 683, 691, 701, 709, 719, 727, 733, 739,
+  743, 751, 757, 761, 769, 773, 787, 797, 809, 811, 821, 823, 827, 829,
+  839, 853, 857, 859, 863, 877, 881, 883, 887, 907, 911, 919, 929, 937,
+  941, 947, 953, 967, 971, 977, 983, 991, 997, 1009, 1013, 1019, 1021,
+  1031, 1033, 1039, 1049, 1051, 1061, 1063, 1069, 1087, 1091, 1093,
+  1097, 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, 1171, 1181,
+  1187, 1193, 1201, 1213, 1217, 1223, 1229, 1231, 1237, 1249, 1259,
+  1277, 1279, 1283, 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321,
+  1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, 1427, 1429, 1433,
+  1439, 1447, 1451, 1453, 1459, 1471, 1481, 1483, 1487, 1489, 1493,
+  1499, 1511, 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571, 1579,
+  1583, 1597, 1601, 1607, 1609, 1613, 1619, 1621, 1627, 1637, 1657,
+  1663, 1667, 1669, 1693, 1697, 1699, 1709, 1721, 1723, 1733, 1741,
+  1747, 1753, 1759, 1777, 1783, 1787, 1789, 1801, 1811, 1823, 1831,
+  1847, 1861, 1867, 1871, 1873, 1877, 1879, 1889, 1901, 1907, 1913,
+  1931, 1933, 1949, 1951, 1973, 1979, 1987, 1993, 1997, 1999
+};
+
+class BucketReshardShard {
+  rgw::sal::RadosStore* store;
+  const RGWBucketInfo& bucket_info;
+  int shard_id;
+  RGWRados::BucketShard bs;
+  vector<rgw_cls_bi_entry> entries;
+  map<RGWObjCategory, rgw_bucket_category_stats> stats;
+  deque<librados::AioCompletion*>& aio_completions;
+  uint64_t max_aio_completions;
+  uint64_t reshard_shard_batch_size;
+
+  int wait_next_completion() {
+    librados::AioCompletion *c = aio_completions.front();
+    aio_completions.pop_front();
+
+    c->wait_for_complete();
+
+    int ret = c->get_return_value();
+    c->release();
+
+    if (ret < 0) {
+      derr << "ERROR: reshard rados operation failed: " << cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+
+    return 0;
+  }
+
+  int get_completion(librados::AioCompletion **c) {
+    if (aio_completions.size() >= max_aio_completions) {
+      int ret = wait_next_completion();
+      if (ret < 0) {
+        return ret;
+      }
+    }
+
+    *c = librados::Rados::aio_create_completion(nullptr, nullptr);
+    aio_completions.push_back(*c);
+
+    return 0;
+  }
+
+public:
+  BucketReshardShard(const DoutPrefixProvider *dpp,
+                     rgw::sal::RadosStore *_store, const RGWBucketInfo& _bucket_info,
+                     const rgw::bucket_index_layout_generation& index,
+                     int shard_id, deque<librados::AioCompletion*>& _completions) :
+    store(_store), bucket_info(_bucket_info), shard_id(shard_id),
+    bs(store->getRados()), aio_completions(_completions)
+  {
+    bs.init(dpp, bucket_info, index, shard_id);
+
+    max_aio_completions =
+      store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_max_aio");
+    reshard_shard_batch_size =
+      store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_batch_size");
+  }
+
+  int get_shard_id() const {
+    return shard_id;
+  }
+
+  int add_entry(rgw_cls_bi_entry& entry, bool account, RGWObjCategory category,
+                const rgw_bucket_category_stats& entry_stats) {
+    entries.push_back(entry);
+    if (account) {
+      rgw_bucket_category_stats& target = stats[category];
+      target.num_entries += entry_stats.num_entries;
target.total_size += entry_stats.total_size; + target.total_size_rounded += entry_stats.total_size_rounded; + target.actual_size += entry_stats.actual_size; + } + if (entries.size() >= reshard_shard_batch_size) { + int ret = flush(); + if (ret < 0) { + return ret; + } + } + + return 0; + } + + int flush() { + if (entries.size() == 0) { + return 0; + } + + librados::ObjectWriteOperation op; + for (auto& entry : entries) { + store->getRados()->bi_put(op, bs, entry); + } + cls_rgw_bucket_update_stats(op, false, stats); + + librados::AioCompletion *c; + int ret = get_completion(&c); + if (ret < 0) { + return ret; + } + ret = bs.bucket_obj.aio_operate(c, &op); + if (ret < 0) { + derr << "ERROR: failed to store entries in target bucket shard (bs=" << bs.bucket << "/" << bs.shard_id << ") error=" << cpp_strerror(-ret) << dendl; + return ret; + } + entries.clear(); + stats.clear(); + return 0; + } + + int wait_all_aio() { + int ret = 0; + while (!aio_completions.empty()) { + int r = wait_next_completion(); + if (r < 0) { + ret = r; + } + } + return ret; + } +}; // class BucketReshardShard + + +class BucketReshardManager { + rgw::sal::RadosStore *store; + deque completions; + vector target_shards; + +public: + BucketReshardManager(const DoutPrefixProvider *dpp, + rgw::sal::RadosStore *_store, + const RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& target) + : store(_store) + { + const int num_shards = target.layout.normal.num_shards; + target_shards.reserve(num_shards); + for (int i = 0; i < num_shards; ++i) { + target_shards.emplace_back(dpp, store, bucket_info, target, i, completions); + } + } + + ~BucketReshardManager() { + for (auto& shard : target_shards) { + int ret = shard.wait_all_aio(); + if (ret < 0) { + ldout(store->ctx(), 20) << __func__ << + ": shard->wait_all_aio() returned ret=" << ret << dendl; + } + } + } + + int add_entry(int shard_index, + rgw_cls_bi_entry& entry, bool account, RGWObjCategory category, + const rgw_bucket_category_stats& entry_stats) { + int ret = target_shards[shard_index].add_entry(entry, account, category, + entry_stats); + if (ret < 0) { + derr << "ERROR: target_shards.add_entry(" << entry.idx << + ") returned error: " << cpp_strerror(-ret) << dendl; + return ret; + } + + return 0; + } + + int finish() { + int ret = 0; + for (auto& shard : target_shards) { + int r = shard.flush(); + if (r < 0) { + derr << "ERROR: target_shards[" << shard.get_shard_id() << "].flush() returned error: " << cpp_strerror(-r) << dendl; + ret = r; + } + } + for (auto& shard : target_shards) { + int r = shard.wait_all_aio(); + if (r < 0) { + derr << "ERROR: target_shards[" << shard.get_shard_id() << "].wait_all_aio() returned error: " << cpp_strerror(-r) << dendl; + ret = r; + } + } + target_shards.clear(); + return ret; + } +}; // class BucketReshardManager + +RGWBucketReshard::RGWBucketReshard(rgw::sal::RadosStore* _store, + const RGWBucketInfo& _bucket_info, + const std::map& _bucket_attrs, + RGWBucketReshardLock* _outer_reshard_lock) : + store(_store), bucket_info(_bucket_info), bucket_attrs(_bucket_attrs), + reshard_lock(store, bucket_info, true), + outer_reshard_lock(_outer_reshard_lock) +{ } + +// sets reshard status of bucket index shards for the current index layout +static int set_resharding_status(const DoutPrefixProvider *dpp, + rgw::sal::RadosStore* store, + const RGWBucketInfo& bucket_info, + cls_rgw_reshard_status status) +{ + cls_rgw_bucket_instance_entry instance_entry; + instance_entry.set_status(status); + + int ret = 
store->getRados()->bucket_set_reshard(dpp, bucket_info, instance_entry); + if (ret < 0) { + ldpp_dout(dpp, 0) << "RGWReshard::" << __func__ << " ERROR: error setting bucket resharding flag on bucket index: " + << cpp_strerror(-ret) << dendl; + return ret; + } + return 0; +} + +static int remove_old_reshard_instance(rgw::sal::RadosStore* store, + const rgw_bucket& bucket, + const DoutPrefixProvider* dpp) +{ + RGWBucketInfo info; + int r = store->getRados()->get_bucket_instance_info(bucket, info, nullptr, + nullptr, null_yield, dpp); + if (r < 0) { + return r; + } + + // delete its shard objects (ignore errors) + store->svc()->bi->clean_index(dpp, info, info.layout.current_index); + // delete the bucket instance metadata + return store->ctl()->bucket->remove_bucket_instance_info(bucket, info, null_yield, dpp); +} + +// initialize the new bucket index shard objects +static int init_target_index(rgw::sal::RadosStore* store, + RGWBucketInfo& bucket_info, + const rgw::bucket_index_layout_generation& index, + const DoutPrefixProvider* dpp) +{ + int ret = store->svc()->bi->init_index(dpp, bucket_info, index); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to initialize " + "target index shard objects: " << cpp_strerror(ret) << dendl; + return ret; + } + + if (!bucket_info.datasync_flag_enabled()) { + // if bucket sync is disabled, disable it on each of the new shards too + auto log = rgw::log_layout_from_index(0, index); + ret = store->svc()->bilog_rados->log_stop(dpp, bucket_info, log, -1); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to disable " + "bucket sync on the target index shard objects: " + << cpp_strerror(ret) << dendl; + store->svc()->bi->clean_index(dpp, bucket_info, index); + return ret; + } + } + + return ret; +} + +// initialize a target index layout, create its bucket index shard objects, and +// write the target layout to the bucket instance metadata +static int init_target_layout(rgw::sal::RadosStore* store, + RGWBucketInfo& bucket_info, + std::map& bucket_attrs, + ReshardFaultInjector& fault, + uint32_t new_num_shards, + const DoutPrefixProvider* dpp) +{ + auto prev = bucket_info.layout; // make a copy for cleanup + const auto current = prev.current_index; + + // initialize a new normal target index layout generation + rgw::bucket_index_layout_generation target; + target.layout.type = rgw::BucketIndexType::Normal; + target.layout.normal.num_shards = new_num_shards; + target.gen = current.gen + 1; + + if (bucket_info.reshard_status == cls_rgw_reshard_status::IN_PROGRESS) { + // backward-compatible cleanup of old reshards, where the target was in a + // different bucket instance + if (!bucket_info.new_bucket_instance_id.empty()) { + rgw_bucket new_bucket = bucket_info.bucket; + new_bucket.bucket_id = bucket_info.new_bucket_instance_id; + ldout(store->ctx(), 10) << __func__ << " removing target bucket instance " + "from a previous reshard attempt" << dendl; + // ignore errors + remove_old_reshard_instance(store, new_bucket, dpp); + } + bucket_info.reshard_status = cls_rgw_reshard_status::NOT_RESHARDING; + } + + if (bucket_info.layout.target_index) { + // a previous reshard failed or stalled, and its reshard lock dropped + ldpp_dout(dpp, 10) << __func__ << " removing existing target index " + "objects from a previous reshard attempt" << dendl; + // delete its existing shard objects (ignore errors) + store->svc()->bi->clean_index(dpp, bucket_info, *bucket_info.layout.target_index); + // don't reuse this same generation in the 
new target layout, in case + // something is still trying to operate on its shard objects + target.gen = bucket_info.layout.target_index->gen + 1; + } + + // create the index shard objects + int ret = init_target_index(store, bucket_info, target, dpp); + if (ret < 0) { + return ret; + } + + // retry in case of racing writes to the bucket instance metadata + static constexpr auto max_retries = 10; + int tries = 0; + do { + // update resharding state + bucket_info.layout.target_index = target; + bucket_info.layout.resharding = rgw::BucketReshardState::InProgress; + + if (ret = fault.check("set_target_layout"); + ret == 0) { // no fault injected, write the bucket instance metadata + ret = store->getRados()->put_bucket_instance_info(bucket_info, false, + real_time(), &bucket_attrs, dpp); + } else if (ret == -ECANCELED) { + fault.clear(); // clear the fault so a retry can succeed + } + + if (ret == -ECANCELED) { + // racing write detected, read the latest bucket info and try again + int ret2 = store->getRados()->get_bucket_instance_info( + bucket_info.bucket, bucket_info, + nullptr, &bucket_attrs, null_yield, dpp); + if (ret2 < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read " + "bucket info: " << cpp_strerror(ret2) << dendl; + ret = ret2; + break; + } + + // check that we're still in the reshard state we started in + if (bucket_info.layout.resharding != rgw::BucketReshardState::None || + bucket_info.layout.current_index != current) { + ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with " + "another reshard" << dendl; + break; + } + + prev = bucket_info.layout; // update the copy + } + ++tries; + } while (ret == -ECANCELED && tries < max_retries); + + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to write " + "target index layout to bucket info: " << cpp_strerror(ret) << dendl; + + bucket_info.layout = std::move(prev); // restore in-memory layout + + // delete the target shard objects (ignore errors) + store->svc()->bi->clean_index(dpp, bucket_info, target); + return ret; + } + return 0; +} // init_target_layout + +// delete the bucket index shards associated with the target layout and remove +// it from the bucket instance metadata +static int revert_target_layout(rgw::sal::RadosStore* store, + RGWBucketInfo& bucket_info, + std::map& bucket_attrs, + ReshardFaultInjector& fault, + const DoutPrefixProvider* dpp) +{ + auto prev = bucket_info.layout; // make a copy for cleanup + + // remove target index shard objects + int ret = store->svc()->bi->clean_index(dpp, bucket_info, *prev.target_index); + if (ret < 0) { + ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to remove " + "target index with: " << cpp_strerror(ret) << dendl; + ret = 0; // non-fatal error + } + + // retry in case of racing writes to the bucket instance metadata + static constexpr auto max_retries = 10; + int tries = 0; + do { + // clear target_index and resharding state + bucket_info.layout.target_index = std::nullopt; + bucket_info.layout.resharding = rgw::BucketReshardState::None; + + if (ret = fault.check("revert_target_layout"); + ret == 0) { // no fault injected, revert the bucket instance metadata + ret = store->getRados()->put_bucket_instance_info(bucket_info, false, + real_time(), + &bucket_attrs, dpp); + } else if (ret == -ECANCELED) { + fault.clear(); // clear the fault so a retry can succeed + } + + if (ret == -ECANCELED) { + // racing write detected, read the latest bucket info and try again + int ret2 = store->getRados()->get_bucket_instance_info( + 
bucket_info.bucket, bucket_info,
+        nullptr, &bucket_attrs, null_yield, dpp);
+    if (ret2 < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read "
+          "bucket info: " << cpp_strerror(ret2) << dendl;
+      ret = ret2;
+      break;
+    }
+
+    // check that we're still in the reshard state we started in
+    if (bucket_info.layout.resharding == rgw::BucketReshardState::None) {
+      ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
+          "reshard cancel" << dendl;
+      return -ECANCELED;
+    }
+    if (bucket_info.layout.current_index != prev.current_index ||
+        bucket_info.layout.target_index != prev.target_index) {
+      ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
+          "another reshard" << dendl;
+      return -ECANCELED;
+    }
+
+      prev = bucket_info.layout; // update the copy
+    }
+    ++tries;
+  } while (ret == -ECANCELED && tries < max_retries);
+
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to clear "
+        "target index layout in bucket info: " << cpp_strerror(ret) << dendl;
+
+    bucket_info.layout = std::move(prev); // restore in-memory layout
+    return ret;
+  }
+  return 0;
+} // revert_target_layout
+
+static int init_reshard(rgw::sal::RadosStore* store,
+                        RGWBucketInfo& bucket_info,
+                        std::map<std::string, bufferlist>& bucket_attrs,
+                        ReshardFaultInjector& fault,
+                        uint32_t new_num_shards,
+                        const DoutPrefixProvider *dpp)
+{
+  int ret = init_target_layout(store, bucket_info, bucket_attrs, fault, new_num_shards, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+
+  if (ret = fault.check("block_writes");
+      ret == 0) { // no fault injected, block writes to the current index shards
+    ret = set_resharding_status(dpp, store, bucket_info,
+                                cls_rgw_reshard_status::IN_PROGRESS);
+  }
+
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to pause "
+        "writes to the current index: " << cpp_strerror(ret) << dendl;
+    // clean up the target layout (ignore errors)
+    revert_target_layout(store, bucket_info, bucket_attrs, fault, dpp);
+    return ret;
+  }
+  return 0;
+} // init_reshard
+
+static int cancel_reshard(rgw::sal::RadosStore* store,
+                          RGWBucketInfo& bucket_info,
+                          std::map<std::string, bufferlist>& bucket_attrs,
+                          ReshardFaultInjector& fault,
+                          const DoutPrefixProvider *dpp)
+{
+  // unblock writes to the current index shard objects
+  int ret = set_resharding_status(dpp, store, bucket_info,
+                                  cls_rgw_reshard_status::NOT_RESHARDING);
+  if (ret < 0) {
+    ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to unblock "
+        "writes to current index objects: " << cpp_strerror(ret) << dendl;
+    ret = 0; // non-fatal error
+  }
+
+  if (bucket_info.layout.target_index) {
+    return revert_target_layout(store, bucket_info, bucket_attrs, fault, dpp);
+  }
+  // there is nothing to revert
+  return 0;
+} // cancel_reshard
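init_target_layout and revert_target_layout above, and commit_target_layout/commit_reshard below, all repeat one optimistic-concurrency idiom: write the bucket instance metadata, and on -ECANCELED reread the latest copy, re-check the reshard state, and retry up to ten times. A condensed sketch of that idiom, with hypothetical apply_change/reload callables standing in for put_bucket_instance_info and get_bucket_instance_info:

  // Illustrative only: the -ECANCELED read-modify-write retry loop used by
  // the reshard layout helpers in this file.
  template <typename Apply, typename Reload>
  int retry_on_racing_write(Apply apply_change, Reload reload,
                            int max_retries = 10)
  {
    int ret = 0;
    int tries = 0;
    do {
      ret = apply_change();   // -ECANCELED signals a racing metadata write
      if (ret == -ECANCELED) {
        int ret2 = reload();  // fetch the latest bucket instance metadata
        if (ret2 < 0) {
          return ret2;        // failing to reread is fatal; give up
        }
      }
      ++tries;
    } while (ret == -ECANCELED && tries < max_retries);
    return ret;
  }

The state re-checks after each reload (not shown here) are what let a concurrent cancel or a competing reshard win the race cleanly instead of being overwritten.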
+static int commit_target_layout(rgw::sal::RadosStore* store,
+                                RGWBucketInfo& bucket_info,
+                                std::map<std::string, bufferlist>& bucket_attrs,
+                                ReshardFaultInjector& fault,
+                                const DoutPrefixProvider *dpp)
+{
+  auto& layout = bucket_info.layout;
+  const auto next_log_gen = layout.logs.empty() ? 1 :
+    layout.logs.back().gen + 1;
+
+  if (!store->svc()->zone->need_to_log_data()) {
+    // if we're not syncing data, we can drop any existing logs
+    layout.logs.clear();
+  }
+
+  // use the new index layout as current
+  ceph_assert(layout.target_index);
+  layout.current_index = std::move(*layout.target_index);
+  layout.target_index = std::nullopt;
+  layout.resharding = rgw::BucketReshardState::None;
+  // add the in-index log layout
+  layout.logs.push_back(log_layout_from_index(next_log_gen, layout.current_index));
+
+  int ret = fault.check("commit_target_layout");
+  if (ret == 0) { // no fault injected, write the bucket instance metadata
+    ret = store->getRados()->put_bucket_instance_info(
+        bucket_info, false, real_time(), &bucket_attrs, dpp);
+  } else if (ret == -ECANCELED) {
+    fault.clear(); // clear the fault so a retry can succeed
+  }
+  return ret;
+} // commit_target_layout
+
+static int commit_reshard(rgw::sal::RadosStore* store,
+                          RGWBucketInfo& bucket_info,
+                          std::map<std::string, bufferlist>& bucket_attrs,
+                          ReshardFaultInjector& fault,
+                          const DoutPrefixProvider *dpp)
+{
+  auto prev = bucket_info.layout; // make a copy for cleanup
+
+  // retry in case of racing writes to the bucket instance metadata
+  static constexpr auto max_retries = 10;
+  int tries = 0;
+  int ret = 0;
+  do {
+    ret = commit_target_layout(store, bucket_info, bucket_attrs, fault, dpp);
+    if (ret == -ECANCELED) {
+      // racing write detected, read the latest bucket info and try again
+      int ret2 = store->getRados()->get_bucket_instance_info(
+          bucket_info.bucket, bucket_info,
+          nullptr, &bucket_attrs, null_yield, dpp);
+      if (ret2 < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read "
+            "bucket info: " << cpp_strerror(ret2) << dendl;
+        ret = ret2;
+        break;
+      }
+
+      // check that we're still in the reshard state we started in
+      if (bucket_info.layout.resharding != rgw::BucketReshardState::InProgress) {
+        ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
+            "reshard cancel" << dendl;
+        return -ECANCELED; // whatever canceled us already did the cleanup
+      }
+      if (bucket_info.layout.current_index != prev.current_index ||
+          bucket_info.layout.target_index != prev.target_index) {
+        ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
+            "another reshard" << dendl;
+        return -ECANCELED; // whatever canceled us already did the cleanup
+      }
+
+      prev = bucket_info.layout; // update the copy
+    }
+    ++tries;
+  } while (ret == -ECANCELED && tries < max_retries);
+
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to commit "
+        "target index layout: " << cpp_strerror(ret) << dendl;
+
+    bucket_info.layout = std::move(prev); // restore in-memory layout
+
+    // unblock writes to the current index shard objects
+    int ret2 = set_resharding_status(dpp, store, bucket_info,
+                                     cls_rgw_reshard_status::NOT_RESHARDING);
+    if (ret2 < 0) {
+      ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to unblock "
+          "writes to current index objects: " << cpp_strerror(ret2) << dendl;
+      // non-fatal error
+    }
+    return ret;
+  }
+
+  if (store->svc()->zone->need_to_log_data() && !prev.logs.empty() &&
+      prev.current_index.layout.type == rgw::BucketIndexType::Normal) {
+    // write a datalog entry for each shard of the previous index.
triggering + // sync on the old shards will force them to detect the end-of-log for that + // generation, and eventually transition to the next + // TODO: use a log layout to support types other than BucketLogType::InIndex + for (uint32_t shard_id = 0; shard_id < prev.current_index.layout.normal.num_shards; ++shard_id) { + ret = store->svc()->datalog_rados->add_entry(dpp, bucket_info, prev.logs.back(), shard_id); + if (ret < 0) { + ldpp_dout(dpp, 1) << "WARNING: failed writing data log (bucket_info.bucket=" + << bucket_info.bucket << ", shard_id=" << shard_id << "of generation=" + << prev.logs.back().gen << ")" << dendl; + } // datalog error is not fatal + } + } + + // check whether the old index objects are still needed for bilogs + const auto& logs = bucket_info.layout.logs; + auto log = std::find_if(logs.begin(), logs.end(), + [&prev] (const rgw::bucket_log_layout_generation& log) { + return log.layout.type == rgw::BucketLogType::InIndex + && log.layout.in_index.gen == prev.current_index.gen; + }); + if (log == logs.end()) { + // delete the index objects (ignore errors) + store->svc()->bi->clean_index(dpp, bucket_info, prev.current_index); + } + return 0; +} // commit_reshard + +int RGWBucketReshard::clear_resharding(rgw::sal::RadosStore* store, + RGWBucketInfo& bucket_info, + std::map& bucket_attrs, + const DoutPrefixProvider* dpp) +{ + ReshardFaultInjector no_fault; + return cancel_reshard(store, bucket_info, bucket_attrs, no_fault, dpp); +} + +int RGWBucketReshard::cancel(const DoutPrefixProvider* dpp) +{ + int ret = reshard_lock.lock(dpp); + if (ret < 0) { + return ret; + } + + if (bucket_info.layout.resharding != rgw::BucketReshardState::InProgress) { + ldpp_dout(dpp, -1) << "ERROR: bucket is not resharding" << dendl; + ret = -EINVAL; + } else { + ret = clear_resharding(store, bucket_info, bucket_attrs, dpp); + } + + reshard_lock.unlock(); + return ret; +} + +RGWBucketReshardLock::RGWBucketReshardLock(rgw::sal::RadosStore* _store, + const std::string& reshard_lock_oid, + bool _ephemeral) : + store(_store), + lock_oid(reshard_lock_oid), + ephemeral(_ephemeral), + internal_lock(reshard_lock_name) +{ + const int lock_dur_secs = store->ctx()->_conf.get_val( + "rgw_reshard_bucket_lock_duration"); + duration = std::chrono::seconds(lock_dur_secs); + +#define COOKIE_LEN 16 + char cookie_buf[COOKIE_LEN + 1]; + gen_rand_alphanumeric(store->ctx(), cookie_buf, sizeof(cookie_buf) - 1); + cookie_buf[COOKIE_LEN] = '\0'; + + internal_lock.set_cookie(cookie_buf); + internal_lock.set_duration(duration); +} + +int RGWBucketReshardLock::lock(const DoutPrefixProvider *dpp) { + internal_lock.set_must_renew(false); + + int ret; + if (ephemeral) { + ret = internal_lock.lock_exclusive_ephemeral(&store->getRados()->reshard_pool_ctx, + lock_oid); + } else { + ret = internal_lock.lock_exclusive(&store->getRados()->reshard_pool_ctx, lock_oid); + } + + if (ret == -EBUSY) { + ldout(store->ctx(), 0) << "INFO: RGWReshardLock::" << __func__ << + " found lock on " << lock_oid << + " to be held by another RGW process; skipping for now" << dendl; + return ret; + } else if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: RGWReshardLock::" << __func__ << + " failed to acquire lock on " << lock_oid << ": " << + cpp_strerror(-ret) << dendl; + return ret; + } + + reset_time(Clock::now()); + + return 0; +} + +void RGWBucketReshardLock::unlock() { + int ret = internal_lock.unlock(&store->getRados()->reshard_pool_ctx, lock_oid); + if (ret < 0) { + ldout(store->ctx(), 0) << "WARNING: RGWBucketReshardLock::" << __func__ << + " 
failed to drop lock on " << lock_oid << " ret=" << ret << dendl;
+  }
+}
+
+int RGWBucketReshardLock::renew(const Clock::time_point& now) {
+  internal_lock.set_must_renew(true);
+  int ret;
+  if (ephemeral) {
+    ret = internal_lock.lock_exclusive_ephemeral(&store->getRados()->reshard_pool_ctx,
+                                                 lock_oid);
+  } else {
+    ret = internal_lock.lock_exclusive(&store->getRados()->reshard_pool_ctx, lock_oid);
+  }
+  if (ret < 0) { /* expired or already locked by another processor */
+    std::stringstream error_s;
+    if (-ENOENT == ret) {
+      error_s << "ENOENT (lock expired or never initially locked)";
+    } else {
+      error_s << ret << " (" << cpp_strerror(-ret) << ")";
+    }
+    ldout(store->ctx(), 5) << __func__ << "(): failed to renew lock on " <<
+      lock_oid << " with error " << error_s.str() << dendl;
+    return ret;
+  }
+  internal_lock.set_must_renew(false);
+
+  reset_time(now);
+  ldout(store->ctx(), 20) << __func__ << "(): successfully renewed lock on " <<
+    lock_oid << dendl;
+
+  return 0;
+}
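do_reshard below drains every entry from each source shard via bi_list and reinserts it under the target layout, picking the destination shard from the entry's hash source; multipart .meta entries override the hash source with the head object's key so both land on the same shard. A simplified stand-in for that placement, assuming std::hash where the real code goes through RGWRados::get_target_shard_id and Ceph's string hash:

  #include <cstdint>
  #include <functional>
  #include <string>

  // Illustrative only: hash the (possibly overridden) hash source onto one
  // of num_shards target shards.
  static int pick_target_shard(const std::string& hash_source,
                               uint32_t num_shards)
  {
    if (num_shards <= 1) {
      return 0; // a single-shard layout always maps to shard 0
    }
    const std::size_t h = std::hash<std::string>{}(hash_source);
    return static_cast<int>(h % num_shards);
  }

Overriding the hash source for multipart .meta entries mirrors what the loop below does with obj.index_hash_source, keeping an upload's metadata and head object co-located.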
object + obj.index_hash_source = mp.get_key(); + } + ret = store->getRados()->get_target_shard_id(bucket_info.layout.target_index->layout.normal, + obj.get_hash_object(), &target_shard_id); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: get_target_shard_id() returned ret=" << ret << dendl; + return ret; + } + + int shard_index = (target_shard_id > 0 ? target_shard_id : 0); + + ret = target_shards_mgr.add_entry(shard_index, entry, account, + category, stats); + if (ret < 0) { + return ret; + } + + Clock::time_point now = Clock::now(); + if (reshard_lock.should_renew(now)) { + // assume outer locks have timespans at least the size of ours, so + // can call inside conditional + if (outer_reshard_lock) { + ret = outer_reshard_lock->renew(now); + if (ret < 0) { + return ret; + } + } + ret = reshard_lock.renew(now); + if (ret < 0) { + ldpp_dout(dpp, -1) << "Error renewing bucket lock: " << ret << dendl; + return ret; + } + } + if (verbose_json_out) { + formatter->close_section(); + formatter->flush(*out); + } else if (out && !(total_entries % 1000)) { + (*out) << " " << total_entries; + } + } // entries loop + } + } + + if (verbose_json_out) { + formatter->close_section(); + formatter->flush(*out); + } else if (out) { + (*out) << " " << total_entries << std::endl; + } + + int ret = target_shards_mgr.finish(); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to reshard" << dendl; + return -EIO; + } + return 0; +} // RGWBucketReshard::do_reshard + +int RGWBucketReshard::get_status(const DoutPrefixProvider *dpp, list<cls_rgw_bucket_instance_entry> *status) +{ + return store->svc()->bi_rados->get_reshard_status(dpp, bucket_info, status); +} + +int RGWBucketReshard::execute(int num_shards, + ReshardFaultInjector& fault, + int max_op_entries, + const DoutPrefixProvider *dpp, + bool verbose, ostream *out, + Formatter *formatter, + RGWReshard* reshard_log) +{ + // take a reshard lock on the bucket + int ret = reshard_lock.lock(dpp); + if (ret < 0) { + return ret; + } + // unlock when scope exits + auto unlock = make_scope_guard([this] { reshard_lock.unlock(); }); + + if (reshard_log) { + ret = reshard_log->update(dpp, bucket_info); + if (ret < 0) { + return ret; + } + } + + // prepare the target index and add its layout to the bucket info + ret = init_reshard(store, bucket_info, bucket_attrs, fault, num_shards, dpp); + if (ret < 0) { + return ret; + } + + if (ret = fault.check("do_reshard"); + ret == 0) { // no fault injected, do the reshard + ret = do_reshard(bucket_info.layout.current_index, + *bucket_info.layout.target_index, + max_op_entries, verbose, out, formatter, dpp); + } + + if (ret < 0) { + cancel_reshard(store, bucket_info, bucket_attrs, fault, dpp); + + ldpp_dout(dpp, 1) << __func__ << " INFO: reshard of bucket \"" + << bucket_info.bucket.name << "\" canceled due to errors" << dendl; + return ret; + } + + ret = commit_reshard(store, bucket_info, bucket_attrs, fault, dpp); + if (ret < 0) { + return ret; + } + + ldpp_dout(dpp, 1) << __func__ << " INFO: reshard of bucket \"" + << bucket_info.bucket.name << "\" completed successfully" << dendl; + return 0; +} // execute + +bool RGWBucketReshard::can_reshard(const RGWBucketInfo& bucket, + const RGWSI_Zone* zone_svc) +{ + return !zone_svc->need_to_log_data() || + bucket.layout.logs.size() < max_bilog_history; +} + + +RGWReshard::RGWReshard(rgw::sal::RadosStore* _store, bool _verbose, ostream *_out, + Formatter *_formatter) : + store(_store), instance_lock(bucket_instance_lock_name), + verbose(_verbose), out(_out), formatter(_formatter) +{ + num_logshards = 
store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_num_logs"); +} + +string RGWReshard::get_logshard_key(const string& tenant, + const string& bucket_name) +{ + return tenant + ":" + bucket_name; +} + +#define MAX_RESHARD_LOGSHARDS_PRIME 7877 + +void RGWReshard::get_bucket_logshard_oid(const string& tenant, const string& bucket_name, string *oid) +{ + string key = get_logshard_key(tenant, bucket_name); + + uint32_t sid = ceph_str_hash_linux(key.c_str(), key.size()); + uint32_t sid2 = sid ^ ((sid & 0xFF) << 24); + sid = sid2 % MAX_RESHARD_LOGSHARDS_PRIME % num_logshards; + + get_logshard_oid(int(sid), oid); +} + +int RGWReshard::add(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry) +{ + if (!store->svc()->zone->can_reshard()) { + ldpp_dout(dpp, 20) << __func__ << " Resharding is disabled" << dendl; + return 0; + } + + string logshard_oid; + + get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid); + + librados::ObjectWriteOperation op; + cls_rgw_reshard_add(op, entry); + + int ret = rgw_rados_operate(dpp, store->getRados()->reshard_pool_ctx, logshard_oid, &op, null_yield); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to add entry to reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << " bucket=" << entry.bucket_name << dendl; + return ret; + } + return 0; +} + +int RGWReshard::update(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info) +{ + cls_rgw_reshard_entry entry; + entry.bucket_name = bucket_info.bucket.name; + entry.bucket_id = bucket_info.bucket.bucket_id; + entry.tenant = bucket_info.owner.tenant; + + int ret = get(dpp, entry); + if (ret < 0) { + return ret; + } + + ret = add(dpp, entry); + if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << ": Error in updating entry bucket " << entry.bucket_name << ": " << + cpp_strerror(-ret) << dendl; + } + + return ret; +} + + +int RGWReshard::list(const DoutPrefixProvider *dpp, int logshard_num, string& marker, uint32_t max, std::list<cls_rgw_reshard_entry>& entries, bool *is_truncated) +{ + string logshard_oid; + + get_logshard_oid(logshard_num, &logshard_oid); + + int ret = cls_rgw_reshard_list(store->getRados()->reshard_pool_ctx, logshard_oid, marker, max, entries, is_truncated); + + if (ret == -ENOENT) { + // these shard objects aren't created until we actually write something to + // them, so treat ENOENT as a successful empty listing + *is_truncated = false; + ret = 0; + } else if (ret == -EACCES) { + ldpp_dout(dpp, -1) << "ERROR: access denied to pool " << store->svc()->zone->get_zone_params().reshard_pool + << ". 
Fix the pool access permissions of your client" << dendl; + } else if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to list reshard log entries, oid=" + << logshard_oid << " marker=" << marker << " " << cpp_strerror(ret) << dendl; + } + + return ret; +} + +int RGWReshard::get(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry) +{ + string logshard_oid; + + get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid); + + int ret = cls_rgw_reshard_get(store->getRados()->reshard_pool_ctx, logshard_oid, entry); + if (ret < 0) { + if (ret != -ENOENT) { + ldpp_dout(dpp, -1) << "ERROR: failed to get entry from reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << + " bucket=" << entry.bucket_name << dendl; + } + return ret; + } + + return 0; +} + +int RGWReshard::remove(const DoutPrefixProvider *dpp, const cls_rgw_reshard_entry& entry) +{ + string logshard_oid; + + get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid); + + librados::ObjectWriteOperation op; + cls_rgw_reshard_remove(op, entry); + + int ret = rgw_rados_operate(dpp, store->getRados()->reshard_pool_ctx, logshard_oid, &op, null_yield); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to remove entry from reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << " bucket=" << entry.bucket_name << dendl; + return ret; + } + + return ret; +} + +int RGWReshard::clear_bucket_resharding(const DoutPrefixProvider *dpp, const string& bucket_instance_oid, cls_rgw_reshard_entry& entry) +{ + int ret = cls_rgw_clear_bucket_resharding(store->getRados()->reshard_pool_ctx, bucket_instance_oid); + if (ret < 0) { + ldpp_dout(dpp, -1) << "ERROR: failed to clear bucket resharding, bucket_instance_oid=" << bucket_instance_oid << dendl; + return ret; + } + + return 0; +} + +int RGWReshardWait::wait(optional_yield y) +{ + std::unique_lock lock(mutex); + + if (going_down) { + return -ECANCELED; + } + + if (y) { + auto& context = y.get_io_context(); + auto& yield = y.get_yield_context(); + + Waiter waiter(context); + waiters.push_back(waiter); + lock.unlock(); + + waiter.timer.expires_after(duration); + + boost::system::error_code ec; + waiter.timer.async_wait(yield[ec]); + + lock.lock(); + waiters.erase(waiters.iterator_to(waiter)); + return -ec.value(); + } + + cond.wait_for(lock, duration); + + if (going_down) { + return -ECANCELED; + } + + return 0; +} + +void RGWReshardWait::stop() +{ + std::scoped_lock lock(mutex); + going_down = true; + cond.notify_all(); + for (auto& waiter : waiters) { + // unblock any waiters with ECANCELED + waiter.timer.cancel(); + } +} + +int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry, + int max_entries, const DoutPrefixProvider *dpp) +{ + ldpp_dout(dpp, 20) << __func__ << " resharding " << + entry.bucket_name << dendl; + + rgw_bucket bucket; + RGWBucketInfo bucket_info; + std::map bucket_attrs; + + int ret = store->getRados()->get_bucket_info(store->svc(), + entry.tenant, + entry.bucket_name, + bucket_info, nullptr, + null_yield, dpp, + &bucket_attrs); + if (ret < 0 || bucket_info.bucket.bucket_id != entry.bucket_id) { + if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << + ": Error in get_bucket_info for bucket " << entry.bucket_name << + ": " << cpp_strerror(-ret) << dendl; + if (ret != -ENOENT) { + // any error other than ENOENT will abort + return ret; + } + } else { + ldpp_dout(dpp, 0) << __func__ << + ": Bucket: " << entry.bucket_name << + " already resharded by someone, skipping " << dendl; + } + + // we've encountered a reshard 
queue entry for an apparently + // non-existent bucket; let's try to recover by cleaning up + ldpp_dout(dpp, 0) << __func__ << + ": removing reshard queue entry for a resharded or non-existent bucket " << + entry.bucket_name << dendl; + + ret = remove(dpp, entry); + if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << + ": Error removing non-existent bucket " << + entry.bucket_name << " from resharding queue: " << + cpp_strerror(-ret) << dendl; + return ret; + } + + // we cleaned up, move on to the next entry + return 0; + } + + if (!RGWBucketReshard::can_reshard(bucket_info, store->svc()->zone)) { + ldpp_dout(dpp, 1) << "Bucket " << bucket_info.bucket << " is not " + "eligible for resharding until peer zones finish syncing one " + "or more of its old log generations" << dendl; + return remove(dpp, entry); + } + + RGWBucketReshard br(store, bucket_info, bucket_attrs, nullptr); + + ReshardFaultInjector f; // no fault injected + ret = br.execute(entry.new_num_shards, f, max_entries, dpp, + false, nullptr, nullptr, this); + if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << + ": Error during resharding bucket " << entry.bucket_name << ": " << + cpp_strerror(-ret) << dendl; + return ret; + } + + ldpp_dout(dpp, 20) << __func__ << + " removing reshard queue entry for bucket " << entry.bucket_name << + dendl; + + ret = remove(dpp, entry); + if (ret < 0) { + ldpp_dout(dpp, 0) << __func__ << ": Error removing bucket " << + entry.bucket_name << " from resharding queue: " << + cpp_strerror(-ret) << dendl; + return ret; + } + return 0; +} + +int RGWReshard::process_single_logshard(int logshard_num, const DoutPrefixProvider *dpp) +{ + string marker; + bool truncated = true; + + constexpr uint32_t max_entries = 1000; + + string logshard_oid; + get_logshard_oid(logshard_num, &logshard_oid); + + RGWBucketReshardLock logshard_lock(store, logshard_oid, false); + + int ret = logshard_lock.lock(dpp); + if (ret < 0) { + ldpp_dout(dpp, 5) << __func__ << "(): failed to acquire lock on " << + logshard_oid << ", ret = " << ret << dendl; + return ret; + } + + do { + std::list<cls_rgw_reshard_entry> entries; + ret = list(dpp, logshard_num, marker, max_entries, entries, &truncated); + if (ret < 0) { + ldpp_dout(dpp, 10) << "cannot list all reshards in logshard oid=" << + logshard_oid << dendl; + continue; + } + + for(auto& entry: entries) { // logshard entries + ret = process_entry(entry, max_entries, dpp); + if (ret < 0) { + return ret; + } + + Clock::time_point now = Clock::now(); + if (logshard_lock.should_renew(now)) { + ret = logshard_lock.renew(now); + if (ret < 0) { + return ret; + } + } + + entry.get_key(&marker); + } // entry for loop + } while (truncated); + + logshard_lock.unlock(); + return 0; +} + + +void RGWReshard::get_logshard_oid(int shard_num, string *logshard) +{ + char buf[32]; + snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num); + + string objname(reshard_oid_prefix); + *logshard = objname + buf; +} + +int RGWReshard::process_all_logshards(const DoutPrefixProvider *dpp) +{ + int ret = 0; + + for (int i = 0; i < num_logshards; i++) { + string logshard; + get_logshard_oid(i, &logshard); + + ldpp_dout(dpp, 20) << "processing logshard = " << logshard << dendl; + + ret = process_single_logshard(i, dpp); + + ldpp_dout(dpp, 20) << "finished processing logshard = " << logshard << ", ret = " << ret << dendl; + } + + return 0; +} + +bool RGWReshard::going_down() +{ + return down_flag; +} + +void RGWReshard::start_processor() +{ + worker = new ReshardWorker(store->ctx(), this); + worker->create("rgw_reshard"); +} + +void RGWReshard::stop_processor() +{ + down_flag = true; + if 
(worker) { + worker->stop(); + worker->join(); + } + delete worker; + worker = nullptr; +} + +void *RGWReshard::ReshardWorker::entry() { + do { + utime_t start = ceph_clock_now(); + reshard->process_all_logshards(this); + + if (reshard->going_down()) + break; + + utime_t end = ceph_clock_now(); + end -= start; + int secs = cct->_conf.get_val("rgw_reshard_thread_interval"); + + if (secs <= end.sec()) + continue; // next round + + secs -= end.sec(); + + std::unique_lock locker{lock}; + cond.wait_for(locker, std::chrono::seconds(secs)); + } while (!reshard->going_down()); + + return NULL; +} + +void RGWReshard::ReshardWorker::stop() +{ + std::lock_guard l{lock}; + cond.notify_all(); +} + +CephContext *RGWReshard::ReshardWorker::get_cct() const +{ + return cct; +} + +unsigned RGWReshard::ReshardWorker::get_subsys() const +{ + return dout_subsys; +} + +std::ostream& RGWReshard::ReshardWorker::gen_prefix(std::ostream& out) const +{ + return out << "rgw reshard worker thread: "; +} diff --git a/src/rgw/driver/rados/rgw_reshard.h b/src/rgw/driver/rados/rgw_reshard.h new file mode 100644 index 000000000000..59819f3a58d2 --- /dev/null +++ b/src/rgw/driver/rados/rgw_reshard.h @@ -0,0 +1,274 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +#include "include/common_fwd.h" +#include "include/rados/librados.hpp" +#include "common/ceph_time.h" +#include "common/async/yield_context.h" +#include "cls/rgw/cls_rgw_types.h" +#include "cls/lock/cls_lock_client.h" + +#include "rgw_common.h" +#include "common/fault_injector.h" + + +class RGWReshard; +namespace rgw { namespace sal { + class RadosStore; +} } + +using ReshardFaultInjector = FaultInjector; + +class RGWBucketReshardLock { + using Clock = ceph::coarse_mono_clock; + + rgw::sal::RadosStore* store; + const std::string lock_oid; + const bool ephemeral; + rados::cls::lock::Lock internal_lock; + std::chrono::seconds duration; + + Clock::time_point start_time; + Clock::time_point renew_thresh; + + void reset_time(const Clock::time_point& now) { + start_time = now; + renew_thresh = start_time + duration / 2; + } + +public: + RGWBucketReshardLock(rgw::sal::RadosStore* _store, + const std::string& reshard_lock_oid, + bool _ephemeral); + RGWBucketReshardLock(rgw::sal::RadosStore* _store, + const RGWBucketInfo& bucket_info, + bool _ephemeral) : + RGWBucketReshardLock(_store, bucket_info.bucket.get_key(':'), _ephemeral) + {} + + int lock(const DoutPrefixProvider *dpp); + void unlock(); + int renew(const Clock::time_point&); + + bool should_renew(const Clock::time_point& now) const { + return now >= renew_thresh; + } +}; // class RGWBucketReshardLock + +class RGWBucketReshard { + public: + using Clock = ceph::coarse_mono_clock; + + private: + rgw::sal::RadosStore *store; + RGWBucketInfo bucket_info; + std::map bucket_attrs; + + RGWBucketReshardLock reshard_lock; + RGWBucketReshardLock* outer_reshard_lock; + + // using an initializer_list as an array in contiguous memory + // allocated in at once + static const std::initializer_list reshard_primes; + + int do_reshard(const rgw::bucket_index_layout_generation& current, + const rgw::bucket_index_layout_generation& target, + int max_entries, + bool verbose, + std::ostream *os, + Formatter *formatter, + const DoutPrefixProvider *dpp); +public: + + // pass nullptr for the final parameter if no outer reshard lock to + // manage + RGWBucketReshard(rgw::sal::RadosStore* 
_store, + const RGWBucketInfo& _bucket_info, + const std::map& _bucket_attrs, + RGWBucketReshardLock* _outer_reshard_lock); + int execute(int num_shards, ReshardFaultInjector& f, + int max_op_entries, const DoutPrefixProvider *dpp, + bool verbose = false, std::ostream *out = nullptr, + ceph::Formatter *formatter = nullptr, + RGWReshard *reshard_log = nullptr); + int get_status(const DoutPrefixProvider *dpp, std::list *status); + int cancel(const DoutPrefixProvider* dpp); + + static int clear_resharding(rgw::sal::RadosStore* store, + RGWBucketInfo& bucket_info, + std::map& bucket_attrs, + const DoutPrefixProvider* dpp); + + static uint32_t get_max_prime_shards() { + return *std::crbegin(reshard_primes); + } + + // returns the prime in our list less than or equal to the + // parameter; the lowest value that can be returned is 1 + static uint32_t get_prime_shards_less_or_equal(uint32_t requested_shards) { + auto it = std::upper_bound(reshard_primes.begin(), reshard_primes.end(), + requested_shards); + if (it == reshard_primes.begin()) { + return 1; + } else { + return *(--it); + } + } + + // returns the prime in our list greater than or equal to the + // parameter; if we do not have such a prime, 0 is returned + static uint32_t get_prime_shards_greater_or_equal( + uint32_t requested_shards) + { + auto it = std::lower_bound(reshard_primes.begin(), reshard_primes.end(), + requested_shards); + if (it == reshard_primes.end()) { + return 0; + } else { + return *it; + } + } + + // returns a preferred number of shards given a calculated number of + // shards based on max_dynamic_shards and the list of prime values + static uint32_t get_preferred_shards(uint32_t suggested_shards, + uint32_t max_dynamic_shards) { + + // use a prime if max is within our prime range, otherwise use + // specified max + const uint32_t absolute_max = + max_dynamic_shards >= get_max_prime_shards() ? + max_dynamic_shards : + get_prime_shards_less_or_equal(max_dynamic_shards); + + // if we can use a prime number, use it, otherwise use suggested; + // note get_prime_shards_greater_or_equal will return 0 if no prime in + // prime range + const uint32_t prime_ish_num_shards = + std::max(get_prime_shards_greater_or_equal(suggested_shards), + suggested_shards); + + // dynamic sharding cannot reshard more than defined maximum + const uint32_t final_num_shards = + std::min(prime_ish_num_shards, absolute_max); + + return final_num_shards; + } + + const std::map& get_bucket_attrs() const { + return bucket_attrs; + } + + // for multisite, the RGWBucketInfo keeps a history of old log generations + // until all peers are done with them. 
prevent this log history from growing + // too large by refusing to reshard the bucket until the old logs get trimmed + static constexpr size_t max_bilog_history = 4; + + static bool can_reshard(const RGWBucketInfo& bucket, + const RGWSI_Zone* zone_svc); +}; // RGWBucketReshard + + +class RGWReshard { +public: + using Clock = ceph::coarse_mono_clock; + +private: + rgw::sal::RadosStore* store; + std::string lock_name; + rados::cls::lock::Lock instance_lock; + int num_logshards; + + bool verbose; + std::ostream *out; + Formatter *formatter; + + void get_logshard_oid(int shard_num, std::string *shard); +protected: + class ReshardWorker : public Thread, public DoutPrefixProvider { + CephContext *cct; + RGWReshard *reshard; + ceph::mutex lock = ceph::make_mutex("ReshardWorker"); + ceph::condition_variable cond; + + public: + ReshardWorker(CephContext * const _cct, + RGWReshard * const _reshard) + : cct(_cct), + reshard(_reshard) {} + + void *entry() override; + void stop(); + + CephContext *get_cct() const override; + unsigned get_subsys() const override; + std::ostream& gen_prefix(std::ostream& out) const override; + }; + + ReshardWorker *worker = nullptr; + std::atomic down_flag = { false }; + + std::string get_logshard_key(const std::string& tenant, const std::string& bucket_name); + void get_bucket_logshard_oid(const std::string& tenant, const std::string& bucket_name, std::string *oid); + +public: + RGWReshard(rgw::sal::RadosStore* _store, bool _verbose = false, std::ostream *_out = nullptr, Formatter *_formatter = nullptr); + int add(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry); + int update(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info); + int get(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry); + int remove(const DoutPrefixProvider *dpp, const cls_rgw_reshard_entry& entry); + int list(const DoutPrefixProvider *dpp, int logshard_num, std::string& marker, uint32_t max, std::list& entries, bool *is_truncated); + int clear_bucket_resharding(const DoutPrefixProvider *dpp, const std::string& bucket_instance_oid, cls_rgw_reshard_entry& entry); + + /* reshard thread */ + int process_entry(const cls_rgw_reshard_entry& entry, int max_entries, + const DoutPrefixProvider *dpp); + int process_single_logshard(int logshard_num, const DoutPrefixProvider *dpp); + int process_all_logshards(const DoutPrefixProvider *dpp); + bool going_down(); + void start_processor(); + void stop_processor(); +}; + +class RGWReshardWait { + public: + // the blocking wait uses std::condition_variable::wait_for(), which uses the + // std::chrono::steady_clock. 
use that for the async waits as well + using Clock = std::chrono::steady_clock; + private: + const ceph::timespan duration; + ceph::mutex mutex = ceph::make_mutex("RGWReshardWait::lock"); + ceph::condition_variable cond; + + struct Waiter : boost::intrusive::list_base_hook<> { + using Executor = boost::asio::io_context::executor_type; + using Timer = boost::asio::basic_waitable_timer, Executor>; + Timer timer; + explicit Waiter(boost::asio::io_context& ioc) : timer(ioc) {} + }; + boost::intrusive::list waiters; + + bool going_down{false}; + +public: + RGWReshardWait(ceph::timespan duration = std::chrono::seconds(5)) + : duration(duration) {} + ~RGWReshardWait() { + ceph_assert(going_down); + } + int wait(optional_yield y); + // unblock any threads waiting on reshard + void stop(); +}; diff --git a/src/rgw/driver/rados/rgw_rest_bucket.cc b/src/rgw/driver/rados/rgw_rest_bucket.cc new file mode 100644 index 000000000000..ebe4e429cc98 --- /dev/null +++ b/src/rgw/driver/rados/rgw_rest_bucket.cc @@ -0,0 +1,413 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#include "rgw_op.h" +#include "driver/rados/rgw_bucket.h" +#include "rgw_rest_bucket.h" +#include "rgw_sal.h" + +#include "include/str_list.h" + +#include "services/svc_sys_obj.h" +#include "services/svc_zone.h" + +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +class RGWOp_Bucket_Info : public RGWRESTOp { + +public: + RGWOp_Bucket_Info() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_READ); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "get_bucket_info"; } +}; + +void RGWOp_Bucket_Info::execute(optional_yield y) +{ + RGWBucketAdminOpState op_state; + + bool fetch_stats; + + std::string bucket; + + string uid_str; + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + rgw_user uid(uid_str); + + RESTArgs::get_string(s, "bucket", bucket, &bucket); + RESTArgs::get_bool(s, "stats", false, &fetch_stats); + + op_state.set_user_id(uid); + op_state.set_bucket_name(bucket); + op_state.set_fetch_stats(fetch_stats); + + op_ret = RGWBucketAdminOp::info(driver, op_state, flusher, y, this); +} + +class RGWOp_Get_Policy : public RGWRESTOp { + +public: + RGWOp_Get_Policy() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_READ); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "get_policy"; } +}; + +void RGWOp_Get_Policy::execute(optional_yield y) +{ + RGWBucketAdminOpState op_state; + + std::string bucket; + std::string object; + + RESTArgs::get_string(s, "bucket", bucket, &bucket); + RESTArgs::get_string(s, "object", object, &object); + + op_state.set_bucket_name(bucket); + op_state.set_object(object); + + op_ret = RGWBucketAdminOp::get_policy(driver, op_state, flusher, this); +} + +class RGWOp_Check_Bucket_Index : public RGWRESTOp { + +public: + RGWOp_Check_Bucket_Index() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "check_bucket_index"; } +}; + +void RGWOp_Check_Bucket_Index::execute(optional_yield y) +{ + std::string bucket; + + bool fix_index; + bool check_objects; + + RGWBucketAdminOpState op_state; + + RESTArgs::get_string(s, "bucket", bucket, &bucket); + RESTArgs::get_bool(s, "fix", false, &fix_index); + 
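  // Illustrative request for this op (assumed URL shape, following the
  // radosgw admin ops convention; the parameter names are the RESTArgs
  // keys read in this function):
  //   GET /admin/bucket?index&bucket=mybucket&fix=true&check-objects=true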
RESTArgs::get_bool(s, "check-objects", false, &check_objects); + + op_state.set_bucket_name(bucket); + op_state.set_fix_index(fix_index); + op_state.set_check_objects(check_objects); + + op_ret = RGWBucketAdminOp::check_index(driver, op_state, flusher, s->yield, s); +} + +class RGWOp_Bucket_Link : public RGWRESTOp { + +public: + RGWOp_Bucket_Link() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "link_bucket"; } +}; + +void RGWOp_Bucket_Link::execute(optional_yield y) +{ + std::string uid_str; + std::string bucket; + std::string bucket_id; + std::string new_bucket_name; + + RGWBucketAdminOpState op_state; + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + RESTArgs::get_string(s, "bucket", bucket, &bucket); + RESTArgs::get_string(s, "bucket-id", bucket_id, &bucket_id); + RESTArgs::get_string(s, "new-bucket-name", new_bucket_name, &new_bucket_name); + + rgw_user uid(uid_str); + op_state.set_user_id(uid); + op_state.set_bucket_name(bucket); + op_state.set_bucket_id(bucket_id); + op_state.set_new_bucket_name(new_bucket_name); + + bufferlist data; + op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + op_ret = RGWBucketAdminOp::link(driver, op_state, s); +} + +class RGWOp_Bucket_Unlink : public RGWRESTOp { + +public: + RGWOp_Bucket_Unlink() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "unlink_bucket"; } +}; + +void RGWOp_Bucket_Unlink::execute(optional_yield y) +{ + std::string uid_str; + std::string bucket; + + RGWBucketAdminOpState op_state; + + RESTArgs::get_string(s, "uid", uid_str, &uid_str); + rgw_user uid(uid_str); + + RESTArgs::get_string(s, "bucket", bucket, &bucket); + + op_state.set_user_id(uid); + op_state.set_bucket_name(bucket); + + bufferlist data; + op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; + return; + } + op_ret = RGWBucketAdminOp::unlink(driver, op_state, s); +} + +class RGWOp_Bucket_Remove : public RGWRESTOp { + +public: + RGWOp_Bucket_Remove() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "remove_bucket"; } +}; + +void RGWOp_Bucket_Remove::execute(optional_yield y) +{ + std::string bucket_name; + bool delete_children; + std::unique_ptr bucket; + + RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name); + RESTArgs::get_bool(s, "purge-objects", false, &delete_children); + + /* FIXME We're abusing the owner of the bucket to pass the user, so that it can be forwarded to + * the master. This user is actually the OP caller, not the bucket owner. 
*/ + op_ret = driver->get_bucket(s, s->user.get(), string(), bucket_name, &bucket, y); + if (op_ret < 0) { + ldpp_dout(this, 0) << "get_bucket returned ret=" << op_ret << dendl; + if (op_ret == -ENOENT) { + op_ret = -ERR_NO_SUCH_BUCKET; + } + return; + } + + op_ret = bucket->remove_bucket(s, delete_children, true, &s->info, s->yield); +} + +class RGWOp_Set_Bucket_Quota : public RGWRESTOp { + +public: + RGWOp_Set_Bucket_Quota() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "set_bucket_quota"; } +}; + +#define QUOTA_INPUT_MAX_LEN 1024 + +void RGWOp_Set_Bucket_Quota::execute(optional_yield y) +{ + bool uid_arg_existed = false; + std::string uid_str; + RESTArgs::get_string(s, "uid", uid_str, &uid_str, &uid_arg_existed); + if (! uid_arg_existed) { + op_ret = -EINVAL; + return; + } + rgw_user uid(uid_str); + bool bucket_arg_existed = false; + std::string bucket_name; + RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name, &bucket_arg_existed); + if (! bucket_arg_existed) { + op_ret = -EINVAL; + return; + } + + bool use_http_params; + + if (s->content_length > 0) { + use_http_params = false; + } else { + const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING"); + use_http_params = (!encoding || strcmp(encoding, "chunked") != 0); + } + RGWQuotaInfo quota; + if (!use_http_params) { + bool empty; + op_ret = get_json_input(driver->ctx(), s, quota, QUOTA_INPUT_MAX_LEN, &empty); + if (op_ret < 0) { + if (!empty) + return; + /* was probably chunked input, but no content provided, configure via http params */ + use_http_params = true; + } + } + if (use_http_params) { + std::unique_ptr<rgw::sal::Bucket> bucket; + op_ret = driver->get_bucket(s, nullptr, uid.tenant, bucket_name, &bucket, s->yield); + if (op_ret < 0) { + return; + } + RGWQuotaInfo *old_quota = &bucket->get_info().quota; + int64_t old_max_size_kb = rgw_rounded_kb(old_quota->max_size); + int64_t max_size_kb; + bool has_max_size_kb = false; + RESTArgs::get_int64(s, "max-objects", old_quota->max_objects, &quota.max_objects); + RESTArgs::get_int64(s, "max-size", old_quota->max_size, &quota.max_size); + RESTArgs::get_int64(s, "max-size-kb", old_max_size_kb, &max_size_kb, &has_max_size_kb); + if (has_max_size_kb) + quota.max_size = max_size_kb * 1024; + RESTArgs::get_bool(s, "enabled", old_quota->enabled, &quota.enabled); + } + + RGWBucketAdminOpState op_state; + op_state.set_user_id(uid); + op_state.set_bucket_name(bucket_name); + op_state.set_quota(quota); + + op_ret = RGWBucketAdminOp::set_quota(driver, op_state, s); +} + +class RGWOp_Sync_Bucket : public RGWRESTOp { + +public: + RGWOp_Sync_Bucket() {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "sync_bucket"; } +}; + +void RGWOp_Sync_Bucket::execute(optional_yield y) +{ + std::string bucket; + std::string tenant; + bool sync_bucket; + + RGWBucketAdminOpState op_state; + RESTArgs::get_string(s, "bucket", bucket, &bucket); + RESTArgs::get_string(s, "tenant", tenant, &tenant); + RESTArgs::get_bool(s, "sync", true, &sync_bucket); + + op_state.set_bucket_name(bucket); + op_state.set_tenant(tenant); + op_state.set_sync_bucket(sync_bucket); + + op_ret = RGWBucketAdminOp::sync_bucket(driver, op_state, s); +} + +class RGWOp_Object_Remove: public RGWRESTOp { + +public: + RGWOp_Object_Remove() {} + + int 
check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("buckets", RGW_CAP_WRITE); + } + + void execute(optional_yield y) override; + + const char* name() const override { return "remove_object"; } +}; + +void RGWOp_Object_Remove::execute(optional_yield y) +{ + std::string bucket; + std::string object; + + RGWBucketAdminOpState op_state; + + RESTArgs::get_string(s, "bucket", bucket, &bucket); + RESTArgs::get_string(s, "object", object, &object); + + op_state.set_bucket_name(bucket); + op_state.set_object(object); + + op_ret = RGWBucketAdminOp::remove_object(driver, op_state, s); +} + + +RGWOp *RGWHandler_Bucket::op_get() +{ + + if (s->info.args.sub_resource_exists("policy")) + return new RGWOp_Get_Policy; + + if (s->info.args.sub_resource_exists("index")) + return new RGWOp_Check_Bucket_Index; + + return new RGWOp_Bucket_Info; +} + +RGWOp *RGWHandler_Bucket::op_put() +{ + if (s->info.args.sub_resource_exists("quota")) + return new RGWOp_Set_Bucket_Quota; + + if (s->info.args.sub_resource_exists("sync")) + return new RGWOp_Sync_Bucket; + + return new RGWOp_Bucket_Link; +} + +RGWOp *RGWHandler_Bucket::op_post() +{ + return new RGWOp_Bucket_Unlink; +} + +RGWOp *RGWHandler_Bucket::op_delete() +{ + if (s->info.args.sub_resource_exists("object")) + return new RGWOp_Object_Remove; + + return new RGWOp_Bucket_Remove; +} diff --git a/src/rgw/driver/rados/rgw_rest_bucket.h b/src/rgw/driver/rados/rgw_rest_bucket.h new file mode 100644 index 000000000000..00f0b64397a2 --- /dev/null +++ b/src/rgw/driver/rados/rgw_rest_bucket.h @@ -0,0 +1,36 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +#pragma once + +#include "rgw_rest.h" +#include "rgw_rest_s3.h" + + +class RGWHandler_Bucket : public RGWHandler_Auth_S3 { +protected: + RGWOp *op_get() override; + RGWOp *op_put() override; + RGWOp *op_post() override; + RGWOp *op_delete() override; +public: + using RGWHandler_Auth_S3::RGWHandler_Auth_S3; + ~RGWHandler_Bucket() override = default; + + int read_permissions(RGWOp*, optional_yield y) override { + return 0; + } +}; + +class RGWRESTMgr_Bucket : public RGWRESTMgr { +public: + RGWRESTMgr_Bucket() = default; + ~RGWRESTMgr_Bucket() override = default; + + RGWHandler_REST* get_handler(rgw::sal::Driver* driver, + req_state*, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string&) override { + return new RGWHandler_Bucket(auth_registry); + } +}; diff --git a/src/rgw/driver/rados/rgw_rest_log.cc b/src/rgw/driver/rados/rgw_rest_log.cc new file mode 100644 index 000000000000..3563cf051bd7 --- /dev/null +++ b/src/rgw/driver/rados/rgw_rest_log.cc @@ -0,0 +1,1267 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab ft=cpp + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 eNovance SAS + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "common/ceph_json.h" +#include "common/strtol.h" +#include "rgw_rest.h" +#include "rgw_op.h" +#include "rgw_rest_s3.h" +#include "rgw_rest_log.h" +#include "rgw_client_io.h" +#include "rgw_sync.h" +#include "rgw_data_sync.h" +#include "rgw_common.h" +#include "rgw_zone.h" +#include "rgw_mdlog.h" +#include "rgw_datalog_notify.h" +#include "rgw_trim_bilog.h" + +#include "services/svc_zone.h" +#include "services/svc_mdlog.h" +#include "services/svc_bilog_rados.h" + +#include "common/errno.h" +#include "include/ceph_assert.h" + +#define dout_context g_ceph_context +#define LOG_CLASS_LIST_MAX_ENTRIES (1000) +#define dout_subsys ceph_subsys_rgw + +using namespace std; + +void RGWOp_MDLog_List::execute(optional_yield y) { + string period = s->info.args.get("period"); + string shard = s->info.args.get("id"); + string max_entries_str = s->info.args.get("max-entries"); + string marker = s->info.args.get("marker"), + err; + void *handle; + unsigned shard_id, max_entries = LOG_CLASS_LIST_MAX_ENTRIES; + + if (s->info.args.exists("start-time") || + s->info.args.exists("end-time")) { + ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl; + op_ret = -EINVAL; + return; + } + + shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl; + op_ret = -EINVAL; + return; + } + + if (!max_entries_str.empty()) { + max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 5) << "Error parsing max-entries " << max_entries_str << dendl; + op_ret = -EINVAL; + return; + } + if (max_entries > LOG_CLASS_LIST_MAX_ENTRIES) { + max_entries = LOG_CLASS_LIST_MAX_ENTRIES; + } + } + + if (period.empty()) { + ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl; + period = driver->get_zone()->get_current_period_id(); + if (period.empty()) { + ldpp_dout(this, 5) << "Missing period id" << dendl; + op_ret = -EINVAL; + return; + } + } + + RGWMetadataLog meta_log{s->cct, static_cast(driver)->svc()->zone, static_cast(driver)->svc()->cls, period}; + + meta_log.init_list_entries(shard_id, {}, {}, marker, &handle); + + op_ret = meta_log.list_entries(this, handle, max_entries, entries, + &last_marker, &truncated); + + meta_log.complete_list_entries(handle); +} + +void RGWOp_MDLog_List::send_response() { + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); + + if (op_ret < 0) + return; + + s->formatter->open_object_section("log_entries"); + s->formatter->dump_string("marker", last_marker); + s->formatter->dump_bool("truncated", truncated); + { + s->formatter->open_array_section("entries"); + for (list::iterator iter = entries.begin(); + iter != entries.end(); ++iter) { + cls_log_entry& entry = *iter; + static_cast(driver)->ctl()->meta.mgr->dump_log_entry(entry, s->formatter); + flusher.flush(); + } + s->formatter->close_section(); + } + s->formatter->close_section(); + flusher.flush(); +} + +void RGWOp_MDLog_Info::execute(optional_yield y) { + num_objects = s->cct->_conf->rgw_md_log_max_shards; + period = static_cast(driver)->svc()->mdlog->read_oldest_log_period(y, s); + op_ret = period.get_error(); +} + +void RGWOp_MDLog_Info::send_response() { + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); + + s->formatter->open_object_section("mdlog"); + s->formatter->dump_unsigned("num_objects", num_objects); + if (period) { + s->formatter->dump_string("period", period.get_period().get_id()); + 
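  // Illustrative response body (values assumed):
  //   { "num_objects": 64, "period": "<period-id>", "realm_epoch": 3 }
  // "period" and "realm_epoch" appear only when an oldest log period
  // could be read.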
s->formatter->dump_unsigned("realm_epoch", period.get_epoch()); + } + s->formatter->close_section(); + flusher.flush(); +} + +void RGWOp_MDLog_ShardInfo::execute(optional_yield y) { + string period = s->info.args.get("period"); + string shard = s->info.args.get("id"); + string err; + + unsigned shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl; + op_ret = -EINVAL; + return; + } + + if (period.empty()) { + ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl; + period = driver->get_zone()->get_current_period_id(); + + if (period.empty()) { + ldpp_dout(this, 5) << "Missing period id" << dendl; + op_ret = -EINVAL; + return; + } + } + RGWMetadataLog meta_log{s->cct, static_cast(driver)->svc()->zone, static_cast(driver)->svc()->cls, period}; + + op_ret = meta_log.get_info(this, shard_id, &info); +} + +void RGWOp_MDLog_ShardInfo::send_response() { + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); + + encode_json("info", info, s->formatter); + flusher.flush(); +} + +void RGWOp_MDLog_Delete::execute(optional_yield y) { + string marker = s->info.args.get("marker"), + period = s->info.args.get("period"), + shard = s->info.args.get("id"), + err; + unsigned shard_id; + + + if (s->info.args.exists("start-time") || + s->info.args.exists("end-time")) { + ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl; + op_ret = -EINVAL; + } + + if (s->info.args.exists("start-marker")) { + ldpp_dout(this, 5) << "start-marker is no longer accepted" << dendl; + op_ret = -EINVAL; + } + + if (s->info.args.exists("end-marker")) { + if (!s->info.args.exists("marker")) { + marker = s->info.args.get("end-marker"); + } else { + ldpp_dout(this, 5) << "end-marker and marker cannot both be provided" << dendl; + op_ret = -EINVAL; + } + } + + op_ret = 0; + + shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl; + op_ret = -EINVAL; + return; + } + + if (marker.empty()) { /* bounding end */ + op_ret = -EINVAL; + return; + } + + if (period.empty()) { + ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl; + period = driver->get_zone()->get_current_period_id(); + + if (period.empty()) { + ldpp_dout(this, 5) << "Missing period id" << dendl; + op_ret = -EINVAL; + return; + } + } + RGWMetadataLog meta_log{s->cct, static_cast(driver)->svc()->zone, static_cast(driver)->svc()->cls, period}; + + op_ret = meta_log.trim(this, shard_id, {}, {}, {}, marker); +} + +void RGWOp_MDLog_Lock::execute(optional_yield y) { + string period, shard_id_str, duration_str, locker_id, zone_id; + unsigned shard_id; + + op_ret = 0; + + period = s->info.args.get("period"); + shard_id_str = s->info.args.get("id"); + duration_str = s->info.args.get("length"); + locker_id = s->info.args.get("locker-id"); + zone_id = s->info.args.get("zone-id"); + + if (period.empty()) { + ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl; + period = driver->get_zone()->get_current_period_id(); + } + + if (period.empty() || + shard_id_str.empty() || + (duration_str.empty()) || + locker_id.empty() || + zone_id.empty()) { + ldpp_dout(this, 5) << "Error invalid parameter list" << dendl; + op_ret = -EINVAL; + return; + } + + string err; + shard_id = (unsigned)strict_strtol(shard_id_str.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 5) << "Error parsing shard_id param " << 
shard_id_str << dendl; + op_ret = -EINVAL; + return; + } + + RGWMetadataLog meta_log{s->cct, static_cast(driver)->svc()->zone, static_cast(driver)->svc()->cls, period}; + unsigned dur; + dur = (unsigned)strict_strtol(duration_str.c_str(), 10, &err); + if (!err.empty() || dur <= 0) { + ldpp_dout(this, 5) << "invalid length param " << duration_str << dendl; + op_ret = -EINVAL; + return; + } + op_ret = meta_log.lock_exclusive(s, shard_id, make_timespan(dur), zone_id, + locker_id); + if (op_ret == -EBUSY) + op_ret = -ERR_LOCKED; +} + +void RGWOp_MDLog_Unlock::execute(optional_yield y) { + string period, shard_id_str, locker_id, zone_id; + unsigned shard_id; + + op_ret = 0; + + period = s->info.args.get("period"); + shard_id_str = s->info.args.get("id"); + locker_id = s->info.args.get("locker-id"); + zone_id = s->info.args.get("zone-id"); + + if (period.empty()) { + ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl; + period = driver->get_zone()->get_current_period_id(); + } + + if (period.empty() || + shard_id_str.empty() || + locker_id.empty() || + zone_id.empty()) { + ldpp_dout(this, 5) << "Error invalid parameter list" << dendl; + op_ret = -EINVAL; + return; + } + + string err; + shard_id = (unsigned)strict_strtol(shard_id_str.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 5) << "Error parsing shard_id param " << shard_id_str << dendl; + op_ret = -EINVAL; + return; + } + + RGWMetadataLog meta_log{s->cct, static_cast(driver)->svc()->zone, static_cast(driver)->svc()->cls, period}; + op_ret = meta_log.unlock(s, shard_id, zone_id, locker_id); +} + +void RGWOp_MDLog_Notify::execute(optional_yield y) { +#define LARGE_ENOUGH_BUF (128 * 1024) + + int r = 0; + bufferlist data; + std::tie(r, data) = read_all_input(s, LARGE_ENOUGH_BUF); + if (r < 0) { + op_ret = r; + return; + } + + char* buf = data.c_str(); + ldpp_dout(this, 20) << __func__ << "(): read data: " << buf << dendl; + + JSONParser p; + r = p.parse(buf, data.length()); + if (r < 0) { + ldpp_dout(this, 0) << "ERROR: failed to parse JSON" << dendl; + op_ret = r; + return; + } + + set updated_shards; + try { + decode_json_obj(updated_shards, &p); + } catch (JSONDecoder::err& err) { + ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl; + op_ret = -EINVAL; + return; + } + + if (driver->ctx()->_conf->subsys.should_gather()) { + for (set::iterator iter = updated_shards.begin(); iter != updated_shards.end(); ++iter) { + ldpp_dout(this, 20) << __func__ << "(): updated shard=" << *iter << dendl; + } + } + + driver->wakeup_meta_sync_shards(updated_shards); + + op_ret = 0; +} + +void RGWOp_BILog_List::execute(optional_yield y) { + bool gen_specified = false; + string tenant_name = s->info.args.get("tenant"), + bucket_name = s->info.args.get("bucket"), + marker = s->info.args.get("marker"), + max_entries_str = s->info.args.get("max-entries"), + bucket_instance = s->info.args.get("bucket-instance"), + gen_str = s->info.args.get("generation", &gen_specified), + format_version_str = s->info.args.get("format-ver"); + std::unique_ptr bucket; + rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name)); + + unsigned max_entries; + + if (bucket_name.empty() && bucket_instance.empty()) { + ldpp_dout(this, 5) << "ERROR: neither bucket nor bucket instance specified" << dendl; + op_ret = -EINVAL; + return; + } + + string err; + std::optional gen; + if (gen_specified) { + gen = strict_strtoll(gen_str.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(s, 5) << "Error parsing generation param " << gen_str << dendl; + 
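  // Note: "generation" selects one generation from the bucket's bilog
  // layout history; when the parameter is omitted, the listing below
  // falls back to the latest layout (std::prev(logs.end())).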
op_ret = -EINVAL; + return; + } + } + + if (!format_version_str.empty()) { + format_ver = strict_strtoll(format_version_str.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(s, 5) << "Failed to parse format-ver param: " << format_ver << dendl; + op_ret = -EINVAL; + return; + } + } + + int shard_id; + string bn; + op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id); + if (op_ret < 0) { + return; + } + + if (!bucket_instance.empty()) { + b.name = bn; + b.bucket_id = bucket_instance; + } + op_ret = driver->get_bucket(s, nullptr, b, &bucket, y); + if (op_ret < 0) { + ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl; + return; + } + + const auto& logs = bucket->get_info().layout.logs; + if (logs.empty()) { + ldpp_dout(s, 5) << "ERROR: bucket=" << bucket_name << " has no log layouts" << dendl; + op_ret = -ENOENT; + return; + } + + auto log = std::prev(logs.end()); + if (gen) { + log = std::find_if(logs.begin(), logs.end(), rgw::matches_gen(*gen)); + if (log == logs.end()) { + ldpp_dout(s, 5) << "ERROR: no log layout with gen=" << *gen << dendl; + op_ret = -ENOENT; + return; + } + } + if (auto next = std::next(log); next != logs.end()) { + next_log_layout = *next; // get the next log after the current latest + } + auto& log_layout = *log; // current log layout for log listing + + unsigned count = 0; + + + max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err); + if (!err.empty()) + max_entries = LOG_CLASS_LIST_MAX_ENTRIES; + + send_response(); + do { + list entries; + int ret = static_cast(driver)->svc()->bilog_rados->log_list(s, bucket->get_info(), log_layout, shard_id, + marker, max_entries - count, + entries, &truncated); + if (ret < 0) { + ldpp_dout(this, 5) << "ERROR: list_bi_log_entries()" << dendl; + return; + } + + count += entries.size(); + + send_response(entries, marker); + } while (truncated && count < max_entries); + + send_response_end(); +} + +void RGWOp_BILog_List::send_response() { + if (sent_header) + return; + + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); + + sent_header = true; + + if (op_ret < 0) + return; + + if (format_ver >= 2) { + s->formatter->open_object_section("result"); + } + + s->formatter->open_array_section("entries"); +} + +void RGWOp_BILog_List::send_response(list& entries, string& marker) +{ + for (list::iterator iter = entries.begin(); iter != entries.end(); ++iter) { + rgw_bi_log_entry& entry = *iter; + encode_json("entry", entry, s->formatter); + + marker = entry.id; + flusher.flush(); + } +} + +void RGWOp_BILog_List::send_response_end() { + s->formatter->close_section(); + + if (format_ver >= 2) { + encode_json("truncated", truncated, s->formatter); + + if (next_log_layout) { + s->formatter->open_object_section("next_log"); + encode_json("generation", next_log_layout->gen, s->formatter); + encode_json("num_shards", next_log_layout->layout.in_index.layout.num_shards, s->formatter); + s->formatter->close_section(); // next_log + } + + s->formatter->close_section(); // result + } + + flusher.flush(); +} + +void RGWOp_BILog_Info::execute(optional_yield y) { + string tenant_name = s->info.args.get("tenant"), + bucket_name = s->info.args.get("bucket"), + bucket_instance = s->info.args.get("bucket-instance"); + std::unique_ptr bucket; + rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name)); + + if (bucket_name.empty() && bucket_instance.empty()) { + ldpp_dout(this, 5) << "ERROR: neither bucket nor bucket instance specified" << dendl; + op_ret = 
-EINVAL; + return; + } + + int shard_id; + string bn; + op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id); + if (op_ret < 0) { + return; + } + + if (!bucket_instance.empty()) { + b.name = bn; + b.bucket_id = bucket_instance; + } + op_ret = driver->get_bucket(s, nullptr, b, &bucket, y); + if (op_ret < 0) { + ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl; + return; + } + + const auto& logs = bucket->get_info().layout.logs; + if (logs.empty()) { + ldpp_dout(s, 5) << "ERROR: bucket=" << bucket_name << " has no log layouts" << dendl; + op_ret = -ENOENT; + return; + } + + map stats; + const auto& index = log_to_index_layout(logs.back()); + + int ret = bucket->read_stats(s, index, shard_id, &bucket_ver, &master_ver, stats, &max_marker, &syncstopped); + if (ret < 0 && ret != -ENOENT) { + op_ret = ret; + return; + } + + oldest_gen = logs.front().gen; + latest_gen = logs.back().gen; + + for (auto& log : logs) { + uint32_t num_shards = log.layout.in_index.layout.num_shards; + generations.push_back({log.gen, num_shards}); + } +} + +void RGWOp_BILog_Info::send_response() { + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); + + if (op_ret < 0) + return; + + s->formatter->open_object_section("info"); + encode_json("bucket_ver", bucket_ver, s->formatter); + encode_json("master_ver", master_ver, s->formatter); + encode_json("max_marker", max_marker, s->formatter); + encode_json("syncstopped", syncstopped, s->formatter); + encode_json("oldest_gen", oldest_gen, s->formatter); + encode_json("latest_gen", latest_gen, s->formatter); + encode_json("generations", generations, s->formatter); + s->formatter->close_section(); + + flusher.flush(); +} + +void RGWOp_BILog_Delete::execute(optional_yield y) { + bool gen_specified = false; + string tenant_name = s->info.args.get("tenant"), + bucket_name = s->info.args.get("bucket"), + start_marker = s->info.args.get("start-marker"), + end_marker = s->info.args.get("end-marker"), + bucket_instance = s->info.args.get("bucket-instance"), + gen_str = s->info.args.get("generation", &gen_specified); + + std::unique_ptr bucket; + rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name)); + + op_ret = 0; + if ((bucket_name.empty() && bucket_instance.empty()) || + end_marker.empty()) { + ldpp_dout(this, 5) << "ERROR: one of bucket or bucket instance, and also end-marker is mandatory" << dendl; + op_ret = -EINVAL; + return; + } + + string err; + uint64_t gen = 0; + if (gen_specified) { + gen = strict_strtoll(gen_str.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(s, 5) << "Error parsing generation param " << gen_str << dendl; + op_ret = -EINVAL; + return; + } + } + + int shard_id; + string bn; + op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id); + if (op_ret < 0) { + return; + } + + if (!bucket_instance.empty()) { + b.name = bn; + b.bucket_id = bucket_instance; + } + op_ret = driver->get_bucket(s, nullptr, b, &bucket, y); + if (op_ret < 0) { + ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl; + return; + } + + op_ret = bilog_trim(this, static_cast(driver), + bucket->get_info(), gen, shard_id, + start_marker, end_marker); + if (op_ret < 0) { + ldpp_dout(s, 5) << "bilog_trim failed with op_ret=" << op_ret << dendl; + } + + return; +} + +void RGWOp_DATALog_List::execute(optional_yield y) { + string shard = s->info.args.get("id"); + + string max_entries_str = s->info.args.get("max-entries"), + marker = 
s->info.args.get("marker"), + err; + unsigned shard_id, max_entries = LOG_CLASS_LIST_MAX_ENTRIES; + + if (s->info.args.exists("start-time") || + s->info.args.exists("end-time")) { + ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl; + op_ret = -EINVAL; + } + + s->info.args.get_bool("extra-info", &extra_info, false); + + shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl; + op_ret = -EINVAL; + return; + } + + if (!max_entries_str.empty()) { + max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 5) << "Error parsing max-entries " << max_entries_str << dendl; + op_ret = -EINVAL; + return; + } + if (max_entries > LOG_CLASS_LIST_MAX_ENTRIES) { + max_entries = LOG_CLASS_LIST_MAX_ENTRIES; + } + } + + // Note that last_marker is updated to be the marker of the last + // entry listed + op_ret = static_cast(driver)->svc()->datalog_rados->list_entries(this, shard_id, + max_entries, entries, + marker, &last_marker, + &truncated); +} + +void RGWOp_DATALog_List::send_response() { + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); + + if (op_ret < 0) + return; + + s->formatter->open_object_section("log_entries"); + s->formatter->dump_string("marker", last_marker); + s->formatter->dump_bool("truncated", truncated); + { + s->formatter->open_array_section("entries"); + for (const auto& entry : entries) { + if (!extra_info) { + encode_json("entry", entry.entry, s->formatter); + } else { + encode_json("entry", entry, s->formatter); + } + flusher.flush(); + } + s->formatter->close_section(); + } + s->formatter->close_section(); + flusher.flush(); +} + + +void RGWOp_DATALog_Info::execute(optional_yield y) { + num_objects = s->cct->_conf->rgw_data_log_num_shards; + op_ret = 0; +} + +void RGWOp_DATALog_Info::send_response() { + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); + + s->formatter->open_object_section("num_objects"); + s->formatter->dump_unsigned("num_objects", num_objects); + s->formatter->close_section(); + flusher.flush(); +} + +void RGWOp_DATALog_ShardInfo::execute(optional_yield y) { + string shard = s->info.args.get("id"); + string err; + + unsigned shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl; + op_ret = -EINVAL; + return; + } + + op_ret = static_cast(driver)->svc()->datalog_rados->get_info(this, shard_id, &info); +} + +void RGWOp_DATALog_ShardInfo::send_response() { + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s); + + encode_json("info", info, s->formatter); + flusher.flush(); +} + +void RGWOp_DATALog_Notify::execute(optional_yield y) { + string source_zone = s->info.args.get("source-zone"); +#define LARGE_ENOUGH_BUF (128 * 1024) + + int r = 0; + bufferlist data; + std::tie(r, data) = read_all_input(s, LARGE_ENOUGH_BUF); + if (r < 0) { + op_ret = r; + return; + } + + char* buf = data.c_str(); + ldpp_dout(this, 20) << __func__ << "(): read data: " << buf << dendl; + + JSONParser p; + r = p.parse(buf, data.length()); + if (r < 0) { + ldpp_dout(this, 0) << "ERROR: failed to parse JSON" << dendl; + op_ret = r; + return; + } + + bc::flat_map> updated_shards; + try { + auto decoder = rgw_data_notify_v1_decoder{updated_shards}; + decode_json_obj(decoder, &p); + } catch (JSONDecoder::err& err) { + ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl; + 
op_ret = -EINVAL; + return; + } + + if (driver->ctx()->_conf->subsys.should_gather()) { + for (bc::flat_map >::iterator iter = updated_shards.begin(); iter != updated_shards.end(); ++iter) { + ldpp_dout(this, 20) << __func__ << "(): updated shard=" << iter->first << dendl; + bc::flat_set& entries = iter->second; + for (const auto& [key, gen] : entries) { + ldpp_dout(this, 20) << __func__ << "(): modified key=" << key + << " of gen=" << gen << dendl; + } + } + } + + driver->wakeup_data_sync_shards(this, source_zone, updated_shards); + + op_ret = 0; +} + +void RGWOp_DATALog_Notify2::execute(optional_yield y) { + string source_zone = s->info.args.get("source-zone"); +#define LARGE_ENOUGH_BUF (128 * 1024) + + int r = 0; + bufferlist data; + std::tie(r, data) = rgw_rest_read_all_input(s, LARGE_ENOUGH_BUF); + if (r < 0) { + op_ret = r; + return; + } + + char* buf = data.c_str(); + ldout(s->cct, 20) << __func__ << "(): read data: " << buf << dendl; + + JSONParser p; + r = p.parse(buf, data.length()); + if (r < 0) { + ldout(s->cct, 0) << "ERROR: failed to parse JSON" << dendl; + op_ret = r; + return; + } + + bc::flat_map > updated_shards; + try { + decode_json_obj(updated_shards, &p); + } catch (JSONDecoder::err& err) { + ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl; + op_ret = -EINVAL; + return; + } + + if (driver->ctx()->_conf->subsys.should_gather()) { + for (bc::flat_map >::iterator iter = + updated_shards.begin(); iter != updated_shards.end(); ++iter) { + ldpp_dout(this, 20) << __func__ << "(): updated shard=" << iter->first << dendl; + bc::flat_set& entries = iter->second; + for (const auto& [key, gen] : entries) { + ldpp_dout(this, 20) << __func__ << "(): modified key=" << key << + " of generation=" << gen << dendl; + } + } + } + + driver->wakeup_data_sync_shards(this, source_zone, updated_shards); + + op_ret = 0; +} + +void RGWOp_DATALog_Delete::execute(optional_yield y) { + string marker = s->info.args.get("marker"), + shard = s->info.args.get("id"), + err; + unsigned shard_id; + + op_ret = 0; + + if (s->info.args.exists("start-time") || + s->info.args.exists("end-time")) { + ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl; + op_ret = -EINVAL; + } + + if (s->info.args.exists("start-marker")) { + ldpp_dout(this, 5) << "start-marker is no longer accepted" << dendl; + op_ret = -EINVAL; + } + + if (s->info.args.exists("end-marker")) { + if (!s->info.args.exists("marker")) { + marker = s->info.args.get("end-marker"); + } else { + ldpp_dout(this, 5) << "end-marker and marker cannot both be provided" << dendl; + op_ret = -EINVAL; + } + } + + shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err); + if (!err.empty()) { + ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl; + op_ret = -EINVAL; + return; + } + if (marker.empty()) { /* bounding end */ + op_ret = -EINVAL; + return; + } + + op_ret = static_cast(driver)->svc()->datalog_rados->trim_entries(this, shard_id, marker); +} + +// not in header to avoid pulling in rgw_sync.h +class RGWOp_MDLog_Status : public RGWRESTOp { + rgw_meta_sync_status status; +public: + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("mdlog", RGW_CAP_READ); + } + int verify_permission(optional_yield) override { + return check_caps(s->user->get_caps()); + } + void execute(optional_yield y) override; + void send_response() override; + const char* name() const override { return "get_metadata_log_status"; } +}; + +void RGWOp_MDLog_Status::execute(optional_yield y) +{ + auto sync 
+
+// not in header to avoid pulling in rgw_sync.h
+class RGWOp_MDLog_Status : public RGWRESTOp {
+ rgw_meta_sync_status status;
+public:
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("mdlog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override { return "get_metadata_log_status"; }
+};
+
+void RGWOp_MDLog_Status::execute(optional_yield y)
+{
+ auto sync = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->get_meta_sync_manager();
+ if (sync == nullptr) {
+ ldpp_dout(this, 1) << "no sync manager" << dendl;
+ op_ret = -ENOENT;
+ return;
+ }
+ op_ret = sync->read_sync_status(this, &status);
+}
+
+void RGWOp_MDLog_Status::send_response()
+{
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ if (op_ret >= 0) {
+ encode_json("status", status, s->formatter);
+ }
+ flusher.flush();
+}
+
+// not in header to avoid pulling in rgw_data_sync.h
+class RGWOp_BILog_Status : public RGWRESTOp {
+ bilog_status_v2 status;
+ int version = 1;
+public:
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("bilog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override { return "get_bucket_index_log_status"; }
+};
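
RGWOp_BILog_Status reports this zone's bucket-index sync status against a source zone, in two formats selected by the version argument (default 1, per the get_int() call below). An illustrative request, again assuming the usual /admin/log mount point:

  GET /admin/log?type=bucket-index&status&bucket=<bucket>&source-zone=<zone>&version=2

With version=2 the response carries the full bilog_status_v2 (full-sync state plus per-generation incremental status); with version=1 only the incremental per-shard list is encoded, as send_response() below shows.
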
+
+void RGWOp_BILog_Status::execute(optional_yield y)
+{
+ const auto options = s->info.args.get("options");
+ bool merge = (options == "merge");
+ const auto source_zone = s->info.args.get("source-zone");
+ const auto source_key = s->info.args.get("source-bucket");
+ auto key = s->info.args.get("bucket");
+ op_ret = s->info.args.get_int("version", &version, 1);
+
+ if (key.empty()) {
+ key = source_key;
+ }
+ if (key.empty()) {
+ ldpp_dout(this, 4) << "no 'bucket' provided" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ rgw_bucket b;
+ int shard_id{-1}; // unused
+ op_ret = rgw_bucket_parse_bucket_key(s->cct, key, &b, &shard_id);
+ if (op_ret < 0) {
+ ldpp_dout(this, 4) << "invalid 'bucket' provided" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ // read the bucket instance info for num_shards
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 4) << "failed to read bucket info: " << cpp_strerror(op_ret) << dendl;
+ return;
+ }
+
+ rgw_bucket source_bucket;
+
+ if (source_key.empty() ||
+ source_key == key) {
+ source_bucket = bucket->get_key();
+ } else {
+ op_ret = rgw_bucket_parse_bucket_key(s->cct, source_key, &source_bucket, nullptr);
+ if (op_ret < 0) {
+ ldpp_dout(this, 4) << "invalid 'source-bucket' provided (key=" << source_key << ")" << dendl;
+ return;
+ }
+ }
+
+ const auto& local_zone_id = driver->get_zone()->get_id();
+
+ if (!merge) {
+ rgw_sync_bucket_pipe pipe;
+ pipe.source.zone = source_zone;
+ pipe.source.bucket = source_bucket;
+ pipe.dest.zone = local_zone_id;
+ pipe.dest.bucket = bucket->get_key();
+
+ ldpp_dout(this, 20) << "RGWOp_BILog_Status::execute(optional_yield y): getting sync status for pipe=" << pipe << dendl;
+
+ op_ret = rgw_read_bucket_full_sync_status(
+ this,
+ static_cast<rgw::sal::RadosStore*>(driver),
+ pipe,
+ &status.sync_status,
+ s->yield);
+ if (op_ret < 0) {
+ ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_full_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl;
+ return;
+ }
+ status.inc_status.resize(status.sync_status.shards_done_with_gen.size());
+
+ op_ret = rgw_read_bucket_inc_sync_status(
+ this,
+ static_cast<rgw::sal::RadosStore*>(driver),
+ pipe,
+ status.sync_status.incremental_gen,
+ &status.inc_status);
+ if (op_ret < 0) {
+ ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_inc_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl;
+ }
+ return;
+ }
+
+ rgw_zone_id source_zone_id(source_zone);
+
+ RGWBucketSyncPolicyHandlerRef source_handler;
+ op_ret = driver->get_sync_policy_handler(s, source_zone_id, source_bucket, &source_handler, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, -1) << "could not get bucket sync policy handler (r=" << op_ret << ")" << dendl;
+ return;
+ }
+
+ auto local_dests = source_handler->get_all_dests_in_zone(local_zone_id);
+
+ std::vector<rgw_bucket_shard_sync_info> current_status;
+ for (auto& entry : local_dests) {
+ auto pipe = entry.second;
+
+ ldpp_dout(this, 20) << "RGWOp_BILog_Status::execute(optional_yield y): getting sync status for pipe=" << pipe << dendl;
+
+ RGWBucketInfo *pinfo = &bucket->get_info();
+ std::optional<RGWBucketInfo> opt_dest_info;
+
+ if (!pipe.dest.bucket) {
+ /* Uh oh, something went wrong */
+ ldpp_dout(this, 20) << "ERROR: RGWOp_BILog_Status::execute(optional_yield y): BUG: pipe.dest.bucket was not initialized" << pipe << dendl;
+ op_ret = -EIO;
+ return;
+ }
+
+ if (*pipe.dest.bucket != pinfo->bucket) {
+ opt_dest_info.emplace();
+ std::unique_ptr<rgw::sal::Bucket> dest_bucket;
+ op_ret = driver->get_bucket(s, nullptr, *pipe.dest.bucket, &dest_bucket, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 4) << "failed to read target bucket info (bucket=" << *pipe.dest.bucket << "): " << cpp_strerror(op_ret) << dendl;
+ return;
+ }
+
+ *opt_dest_info = dest_bucket->get_info();
+ pinfo = &(*opt_dest_info);
+ pipe.dest.bucket = pinfo->bucket;
+ }
+
+ op_ret = rgw_read_bucket_full_sync_status(
+ this,
+ static_cast<rgw::sal::RadosStore*>(driver),
+ pipe,
+ &status.sync_status,
+ s->yield);
+ if (op_ret < 0) {
+ ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_full_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl;
+ return;
+ }
+
+ current_status.resize(status.sync_status.shards_done_with_gen.size());
+ int r = rgw_read_bucket_inc_sync_status(this, static_cast<rgw::sal::RadosStore*>(driver),
+ pipe, status.sync_status.incremental_gen, &current_status);
+ if (r < 0) {
+ ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_inc_sync_status() on pipe=" << pipe << " returned ret=" << r << dendl;
+ op_ret = r;
+ return;
+ }
+
+ if (status.inc_status.empty()) {
+ status.inc_status = std::move(current_status);
+ } else {
+ if (current_status.size() != status.inc_status.size()) {
+ op_ret = -EINVAL;
+ ldpp_dout(this, -1) << "ERROR: different number of shards for sync status of buckets "
+ "syncing from the same source: status.size()= "
+ << status.inc_status.size()
+ << " current_status.size()="
+ << current_status.size() << dendl;
+ return;
+ }
+ auto m = status.inc_status.begin();
+ for (auto& cur_shard_status : current_status) {
+ auto& result_shard_status = *m++;
+ // always take the first marker, or any later marker that's smaller
+ if (cur_shard_status.inc_marker.position < result_shard_status.inc_marker.position) {
+ result_shard_status = std::move(cur_shard_status);
+ }
+ }
+ }
+ }
+}
+
+void RGWOp_BILog_Status::send_response()
+{
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ if (op_ret >= 0) {
+ if (version < 2) {
+ encode_json("status", status.inc_status, s->formatter);
+ } else {
+ encode_json("status", status, s->formatter);
+ }
+ }
+ flusher.flush();
+}
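
The merge branch above folds the per-pipe shard lists into one result by always keeping the smaller incremental marker for each shard, so the combined status never overstates how far sync has progressed. A minimal standalone sketch of that fold (not part of the patch; ShardStatus and its string marker stand in for rgw_bucket_shard_sync_info and inc_marker.position):

#include <string>
#include <utility>
#include <vector>

struct ShardStatus {
  std::string inc_marker;  // stand-in for inc_marker.position
};

// Fold one pipe's per-shard status into the running result. Callers must
// ensure both vectors have the same shard count (the real op above checks
// this and fails with -EINVAL).
void merge_min_markers(std::vector<ShardStatus>& result,
                       std::vector<ShardStatus>&& current) {
  if (result.empty()) {            // first pipe: take its status wholesale
    result = std::move(current);
    return;
  }
  for (size_t i = 0; i < result.size(); ++i) {
    if (current[i].inc_marker < result[i].inc_marker) {
      result[i] = std::move(current[i]);  // keep the least-advanced marker
    }
  }
}
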
+
+// not in header to avoid pulling in rgw_data_sync.h
+class RGWOp_DATALog_Status : public RGWRESTOp {
+ rgw_data_sync_status status;
+public:
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("datalog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override { return "get_data_changes_log_status"; }
+};
+
+void RGWOp_DATALog_Status::execute(optional_yield y)
+{
+ const auto source_zone = s->info.args.get("source-zone");
+ auto sync = driver->get_data_sync_manager(source_zone);
+ if (sync == nullptr) {
+ ldpp_dout(this, 1) << "no sync manager for source-zone " << source_zone << dendl;
+ op_ret = -ENOENT;
+ return;
+ }
+ op_ret = sync->read_sync_status(this, &status);
+}
+
+void RGWOp_DATALog_Status::send_response()
+{
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ if (op_ret >= 0) {
+ encode_json("status", status, s->formatter);
+ }
+ flusher.flush();
+}
+
+
+RGWOp *RGWHandler_Log::op_get() {
+ bool exists;
+ string type = s->info.args.get("type", &exists);
+
+ if (!exists) {
+ return NULL;
+ }
+
+ if (type.compare("metadata") == 0) {
+ if (s->info.args.exists("id")) {
+ if (s->info.args.exists("info")) {
+ return new RGWOp_MDLog_ShardInfo;
+ } else {
+ return new RGWOp_MDLog_List;
+ }
+ } else if (s->info.args.exists("status")) {
+ return new RGWOp_MDLog_Status;
+ } else {
+ return new RGWOp_MDLog_Info;
+ }
+ } else if (type.compare("bucket-index") == 0) {
+ if (s->info.args.exists("info")) {
+ return new RGWOp_BILog_Info;
+ } else if (s->info.args.exists("status")) {
+ return new RGWOp_BILog_Status;
+ } else {
+ return new RGWOp_BILog_List;
+ }
+ } else if (type.compare("data") == 0) {
+ if (s->info.args.exists("id")) {
+ if (s->info.args.exists("info")) {
+ return new RGWOp_DATALog_ShardInfo;
+ } else {
+ return new RGWOp_DATALog_List;
+ }
+ } else if (s->info.args.exists("status")) {
+ return new RGWOp_DATALog_Status;
+ } else {
+ return new RGWOp_DATALog_Info;
+ }
+ }
+ return NULL;
+}
+
+RGWOp *RGWHandler_Log::op_delete() {
+ bool exists;
+ string type = s->info.args.get("type", &exists);
+
+ if (!exists) {
+ return NULL;
+ }
+
+ if (type.compare("metadata") == 0)
+ return new RGWOp_MDLog_Delete;
+ else if (type.compare("bucket-index") == 0)
+ return new RGWOp_BILog_Delete;
+ else if (type.compare("data") == 0)
+ return new RGWOp_DATALog_Delete;
+ return NULL;
+}
+
+RGWOp *RGWHandler_Log::op_post() {
+ bool exists;
+ string type = s->info.args.get("type", &exists);
+
+ if (!exists) {
+ return NULL;
+ }
+
+ if (type.compare("metadata") == 0) {
+ if (s->info.args.exists("lock"))
+ return new RGWOp_MDLog_Lock;
+ else if (s->info.args.exists("unlock"))
+ return new RGWOp_MDLog_Unlock;
+ else if (s->info.args.exists("notify"))
+ return new RGWOp_MDLog_Notify;
+ } else if (type.compare("data") == 0) {
+ if (s->info.args.exists("notify")) {
+ return new RGWOp_DATALog_Notify;
+ } else if (s->info.args.exists("notify2")) {
+ return new RGWOp_DATALog_Notify2;
+ }
+ }
+ return NULL;
+}
+
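Taken together, the three dispatchers above route the whole admin log API on the type argument plus one discriminating flag. A few illustrative requests (the /admin/log resource path is the conventional mount point for RGWRESTMgr_Log and is assumed here, not introduced by this patch):

  GET    /admin/log?type=metadata&id=5&info                                      -> RGWOp_MDLog_ShardInfo
  GET    /admin/log?type=bucket-index&status&bucket=<bucket>&source-zone=<zone>  -> RGWOp_BILog_Status
  POST   /admin/log?type=data&notify2                                            -> RGWOp_DATALog_Notify2
  DELETE /admin/log?type=data&id=3&marker=<marker>                               -> RGWOp_DATALog_Delete

A request without a type argument resolves to no op in every dispatcher.
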
diff --git a/src/rgw/driver/rados/rgw_rest_log.h b/src/rgw/driver/rados/rgw_rest_log.h
new file mode 100644
index 000000000000..c8a0c4df07be
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_rest_log.h
@@ -0,0 +1,337 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 eNovance SAS <licensing@enovance.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_datalog.h"
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+#include "rgw_metadata.h"
+#include "rgw_mdlog.h"
+#include "rgw_data_sync.h"
+
+class RGWOp_BILog_List : public RGWRESTOp {
+ bool sent_header;
+ uint32_t format_ver{0};
+ bool truncated{false};
+ std::optional<rgw::bucket_log_layout_generation> next_log_layout;
+
+public:
+ RGWOp_BILog_List() : sent_header(false) {}
+ ~RGWOp_BILog_List() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("bilog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void send_response() override;
+ virtual void send_response(std::list<rgw_bi_log_entry>& entries, std::string& marker);
+ virtual void send_response_end();
+ void execute(optional_yield y) override;
+ const char* name() const override {
+ return "list_bucket_index_log";
+ }
+};
+
+class RGWOp_BILog_Info : public RGWRESTOp {
+ std::string bucket_ver;
+ std::string master_ver;
+ std::string max_marker;
+ bool syncstopped;
+ uint64_t oldest_gen = 0;
+ uint64_t latest_gen = 0;
+ std::vector<store_gen_shards> generations;
+
+public:
+ RGWOp_BILog_Info() : bucket_ver(), master_ver(), syncstopped(false) {}
+ ~RGWOp_BILog_Info() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("bilog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void send_response() override;
+ void execute(optional_yield y) override;
+ const char* name() const override {
+ return "bucket_index_log_info";
+ }
+};
+
+class RGWOp_BILog_Delete : public RGWRESTOp {
+public:
+ RGWOp_BILog_Delete() {}
+ ~RGWOp_BILog_Delete() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("bilog", RGW_CAP_WRITE);
+ }
+ void execute(optional_yield y) override;
+ const char* name() const override {
+ return "trim_bucket_index_log";
+ }
+};
+
+class RGWOp_MDLog_List : public RGWRESTOp {
+ std::list<cls_log_entry> entries;
+ std::string last_marker;
+ bool truncated;
+public:
+ RGWOp_MDLog_List() : truncated(false) {}
+ ~RGWOp_MDLog_List() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("mdlog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override {
+ return "list_metadata_log";
+ }
+};
+
+class RGWOp_MDLog_Info : public RGWRESTOp {
+ unsigned num_objects;
+ RGWPeriodHistory::Cursor period;
+public:
+ RGWOp_MDLog_Info() : num_objects(0) {}
+ ~RGWOp_MDLog_Info() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("mdlog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override {
+ return "get_metadata_log_info";
+ }
+};
+
+class RGWOp_MDLog_ShardInfo : public RGWRESTOp {
+ RGWMetadataLogInfo info;
+public:
+ RGWOp_MDLog_ShardInfo() {}
+ ~RGWOp_MDLog_ShardInfo() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("mdlog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override {
+ return "get_metadata_log_shard_info";
+ }
+};
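
Every op declared in this header gates access through a per-log capability (mdlog, bilog or datalog, each read or write), checked in check_caps() before execute() runs. As an illustration (standard radosgw-admin usage, not something this patch introduces), a user meant to drive all three logs could be granted:

  radosgw-admin caps add --uid=<user> --caps="mdlog=read,write; bilog=read,write; datalog=read,write"
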
"get_metadata_log_shard_info"; + } +}; + +class RGWOp_MDLog_Lock : public RGWRESTOp { +public: + RGWOp_MDLog_Lock() {} + ~RGWOp_MDLog_Lock() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("mdlog", RGW_CAP_WRITE); + } + void execute(optional_yield y) override; + const char* name() const override { + return "lock_mdlog_object"; + } +}; + +class RGWOp_MDLog_Unlock : public RGWRESTOp { +public: + RGWOp_MDLog_Unlock() {} + ~RGWOp_MDLog_Unlock() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("mdlog", RGW_CAP_WRITE); + } + void execute(optional_yield y) override; + const char* name() const override { + return "unlock_mdlog_object"; + } +}; + +class RGWOp_MDLog_Notify : public RGWRESTOp { +public: + RGWOp_MDLog_Notify() {} + ~RGWOp_MDLog_Notify() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("mdlog", RGW_CAP_WRITE); + } + void execute(optional_yield y) override; + const char* name() const override { + return "mdlog_notify"; + } + RGWOpType get_type() override { return RGW_OP_SYNC_MDLOG_NOTIFY; } +}; + +class RGWOp_MDLog_Delete : public RGWRESTOp { +public: + RGWOp_MDLog_Delete() {} + ~RGWOp_MDLog_Delete() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("mdlog", RGW_CAP_WRITE); + } + void execute(optional_yield y) override; + const char* name() const override { + return "trim_metadata_log"; + } +}; + +class RGWOp_DATALog_List : public RGWRESTOp { + std::vector entries; + std::string last_marker; + bool truncated; + bool extra_info; +public: + RGWOp_DATALog_List() : truncated(false), extra_info(false) {} + ~RGWOp_DATALog_List() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("datalog", RGW_CAP_READ); + } + int verify_permission(optional_yield y) override { + return check_caps(s->user->get_caps()); + } + void execute(optional_yield y) override; + void send_response() override; + const char* name() const override { + return "list_data_changes_log"; + } +}; + +class RGWOp_DATALog_Info : public RGWRESTOp { + unsigned num_objects; +public: + RGWOp_DATALog_Info() : num_objects(0) {} + ~RGWOp_DATALog_Info() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("datalog", RGW_CAP_READ); + } + int verify_permission(optional_yield y) override { + return check_caps(s->user->get_caps()); + } + void execute(optional_yield y) override; + void send_response() override; + const char* name() const override { + return "get_data_changes_log_info"; + } +}; + +class RGWOp_DATALog_ShardInfo : public RGWRESTOp { + RGWDataChangesLogInfo info; +public: + RGWOp_DATALog_ShardInfo() {} + ~RGWOp_DATALog_ShardInfo() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("datalog", RGW_CAP_READ); + } + int verify_permission(optional_yield y) override { + return check_caps(s->user->get_caps()); + } + void execute(optional_yield y) override; + void send_response() override; + const char* name() const override { + return "get_data_changes_log_shard_info"; + } +}; + +class RGWOp_DATALog_Notify : public RGWRESTOp { +public: + RGWOp_DATALog_Notify() {} + ~RGWOp_DATALog_Notify() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("datalog", RGW_CAP_WRITE); + } + void execute(optional_yield y) override; + const char* name() const override { + return "datalog_notify"; + } + RGWOpType get_type() override { return 
RGW_OP_SYNC_DATALOG_NOTIFY; } +}; + +class RGWOp_DATALog_Notify2 : public RGWRESTOp { + rgw_data_notify_entry data_notify; +public: + RGWOp_DATALog_Notify2() {} + ~RGWOp_DATALog_Notify2() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("datalog", RGW_CAP_WRITE); + } + void execute(optional_yield y) override; + const char* name() const override { + return "datalog_notify2"; + } + RGWOpType get_type() override { return RGW_OP_SYNC_DATALOG_NOTIFY2; } +}; + +class RGWOp_DATALog_Delete : public RGWRESTOp { +public: + RGWOp_DATALog_Delete() {} + ~RGWOp_DATALog_Delete() override {} + + int check_caps(const RGWUserCaps& caps) override { + return caps.check_cap("datalog", RGW_CAP_WRITE); + } + void execute(optional_yield y) override; + const char* name() const override { + return "trim_data_changes_log"; + } +}; + +class RGWHandler_Log : public RGWHandler_Auth_S3 { +protected: + RGWOp *op_get() override; + RGWOp *op_delete() override; + RGWOp *op_post() override; + + int read_permissions(RGWOp*, optional_yield) override { + return 0; + } +public: + using RGWHandler_Auth_S3::RGWHandler_Auth_S3; + ~RGWHandler_Log() override = default; +}; + +class RGWRESTMgr_Log : public RGWRESTMgr { +public: + RGWRESTMgr_Log() = default; + ~RGWRESTMgr_Log() override = default; + + RGWHandler_REST* get_handler(rgw::sal::Driver* driver, + req_state* const, + const rgw::auth::StrategyRegistry& auth_registry, + const std::string& frontend_prefixs) override { + return new RGWHandler_Log(auth_registry); + } +}; diff --git a/src/rgw/driver/rados/rgw_service.h b/src/rgw/driver/rados/rgw_service.h index dc4991388a98..4c0b8d8421f7 100644 --- a/src/rgw/driver/rados/rgw_service.h +++ b/src/rgw/driver/rados/rgw_service.h @@ -1,9 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_SERVICE_H -#define CEPH_RGW_SERVICE_H - +#pragma once #include #include @@ -215,5 +213,3 @@ struct RGWCtl { int init(RGWServices *_svc, rgw::sal::Driver* driver, const DoutPrefixProvider *dpp); }; - -#endif diff --git a/src/rgw/driver/rados/rgw_sync.h b/src/rgw/driver/rados/rgw_sync.h index 8c4e511ae3ee..e6c255cc6014 100644 --- a/src/rgw/driver/rados/rgw_sync.h +++ b/src/rgw/driver/rados/rgw_sync.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_SYNC_H -#define CEPH_RGW_SYNC_H +#pragma once #include @@ -546,4 +545,3 @@ RGWCoroutine* create_list_remote_mdlog_shard_cr(RGWMetaSyncEnv *env, uint32_t max_entries, rgw_mdlog_shard_data *result); -#endif diff --git a/src/rgw/driver/rados/rgw_sync_module.h b/src/rgw/driver/rados/rgw_sync_module.h index 6d974c39a274..494e88608c11 100644 --- a/src/rgw/driver/rados/rgw_sync_module.h +++ b/src/rgw/driver/rados/rgw_sync_module.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_SYNC_MODULE_H -#define CEPH_RGW_SYNC_MODULE_H +#pragma once #include "rgw_common.h" #include "rgw_coroutine.h" @@ -198,5 +197,3 @@ public: }; void rgw_register_sync_modules(RGWSyncModulesManager *modules_manager); - -#endif diff --git a/src/rgw/driver/rados/rgw_sync_module_aws.h b/src/rgw/driver/rados/rgw_sync_module_aws.h index 48f0145fdf92..92532ff00e7f 100644 --- a/src/rgw/driver/rados/rgw_sync_module_aws.h +++ b/src/rgw/driver/rados/rgw_sync_module_aws.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // 
vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_SYNC_MODULE_AWS_H -#define RGW_SYNC_MODULE_AWS_H +#pragma once #include "rgw_sync_module.h" @@ -107,5 +106,3 @@ class RGWAWSSyncModule : public RGWSyncModule { bool supports_data_export() override { return false;} int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override; }; - -#endif /* RGW_SYNC_MODULE_AWS_H */ diff --git a/src/rgw/driver/rados/rgw_sync_module_es.h b/src/rgw/driver/rados/rgw_sync_module_es.h index 6c0c422c39cc..c8c9fcc439c5 100644 --- a/src/rgw/driver/rados/rgw_sync_module_es.h +++ b/src/rgw/driver/rados/rgw_sync_module_es.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_SYNC_MODULE_ES_H -#define CEPH_RGW_SYNC_MODULE_ES_H +#pragma once #include "rgw_sync_module.h" @@ -58,5 +57,3 @@ public: return true; } }; - -#endif diff --git a/src/rgw/driver/rados/rgw_sync_module_log.h b/src/rgw/driver/rados/rgw_sync_module_log.h index ecf3bb78911e..ab475959da37 100644 --- a/src/rgw/driver/rados/rgw_sync_module_log.h +++ b/src/rgw/driver/rados/rgw_sync_module_log.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_SYNC_MODULE_LOG_H -#define CEPH_RGW_SYNC_MODULE_LOG_H +#pragma once #include "rgw_sync_module.h" @@ -14,5 +13,3 @@ public: } int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override; }; - -#endif diff --git a/src/rgw/driver/rados/rgw_sync_trace.h b/src/rgw/driver/rados/rgw_sync_trace.h index 9617dac70dbc..1fcc8bed8301 100644 --- a/src/rgw/driver/rados/rgw_sync_trace.h +++ b/src/rgw/driver/rados/rgw_sync_trace.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_SYNC_LOG_H -#define CEPH_RGW_SYNC_LOG_H +#pragma once #include @@ -140,6 +139,3 @@ public: bufferlist& out) override; std::string get_active_names(); }; - - -#endif diff --git a/src/rgw/driver/rados/rgw_tools.h b/src/rgw/driver/rados/rgw_tools.h index 6aeb9b891005..d96912cb866b 100644 --- a/src/rgw/driver/rados/rgw_tools.h +++ b/src/rgw/driver/rados/rgw_tools.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_TOOLS_H -#define CEPH_RGW_TOOLS_H +#pragma once #include @@ -274,4 +273,3 @@ void rgw_complete_aio_completion(librados::AioCompletion* c, int r); // (Currently providing nullptr will wipe all attributes.) std::map* no_change_attrs(); -#endif diff --git a/src/rgw/driver/rados/rgw_trim_bilog.h b/src/rgw/driver/rados/rgw_trim_bilog.h index 5b9c4cdd7ec1..6a11d24766be 100644 --- a/src/rgw/driver/rados/rgw_trim_bilog.h +++ b/src/rgw/driver/rados/rgw_trim_bilog.h @@ -14,8 +14,7 @@ * Foundation. See file COPYING. 
*/ -#ifndef RGW_SYNC_LOG_TRIM_H -#define RGW_SYNC_LOG_TRIM_H +#pragma once #include #include @@ -120,5 +119,3 @@ WRITE_CLASS_ENCODER(rgw::BucketTrimStatus); int bilog_trim(const DoutPrefixProvider* p, rgw::sal::RadosStore* store, RGWBucketInfo& bucket_info, uint64_t gen, int shard_id, std::string_view start_marker, std::string_view end_marker); - -#endif // RGW_SYNC_LOG_TRIM_H diff --git a/src/rgw/driver/rados/rgw_user.h b/src/rgw/driver/rados/rgw_user.h index 110124cdbc78..83e3720f71bd 100644 --- a/src/rgw/driver/rados/rgw_user.h +++ b/src/rgw/driver/rados/rgw_user.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_USER_H -#define CEPH_RGW_USER_H +#pragma once #include #include @@ -882,6 +881,3 @@ class RGWUserMetaHandlerAllocator { public: static RGWMetadataHandler *alloc(RGWSI_User *user_svc); }; - - -#endif diff --git a/src/rgw/driver/rados/rgw_zone.h b/src/rgw/driver/rados/rgw_zone.h index bede6c7a4481..2d69d5f1c723 100644 --- a/src/rgw/driver/rados/rgw_zone.h +++ b/src/rgw/driver/rados/rgw_zone.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_ZONE_H -#define CEPH_RGW_ZONE_H +#pragma once #include #include "rgw_zone_types.h" @@ -942,5 +941,3 @@ int delete_zone(const DoutPrefixProvider* dpp, optional_yield y, sal::ZoneWriter& writer); } // namespace rgw - -#endif diff --git a/src/rgw/rgw_acl.h b/src/rgw/rgw_acl.h index aa346a63f13f..c520501583b8 100644 --- a/src/rgw/rgw_acl.h +++ b/src/rgw/rgw_acl.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_ACL_H -#define CEPH_RGW_ACL_H +#pragma once #include #include @@ -413,5 +412,3 @@ public: friend bool operator!=(const RGWAccessControlPolicy& lhs, const RGWAccessControlPolicy& rhs); }; WRITE_CLASS_ENCODER(RGWAccessControlPolicy) - -#endif diff --git a/src/rgw/rgw_acl_s3.h b/src/rgw/rgw_acl_s3.h index 9521b9f47371..c234d722b997 100644 --- a/src/rgw/rgw_acl_s3.h +++ b/src/rgw/rgw_acl_s3.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_ACL_S3_H -#define CEPH_RGW_ACL_S3_H +#pragma once #include #include @@ -114,5 +113,3 @@ class RGWACLXMLParser_S3 : public RGWXMLParser public: explicit RGWACLXMLParser_S3(CephContext *_cct) : cct(_cct) {} }; - -#endif diff --git a/src/rgw/rgw_acl_swift.h b/src/rgw/rgw_acl_swift.h index 8d263e854d26..4cb1e4b8f8f5 100644 --- a/src/rgw/rgw_acl_swift.h +++ b/src/rgw/rgw_acl_swift.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_ACL_SWIFT_H -#define CEPH_RGW_ACL_SWIFT_H +#pragma once #include #include @@ -57,4 +56,3 @@ public: const std::string& acl_str); boost::optional to_str() const; }; -#endif diff --git a/src/rgw/rgw_asio_client.h b/src/rgw/rgw_asio_client.h index a595b0351997..f3e92b7e51c1 100644 --- a/src/rgw/rgw_asio_client.h +++ b/src/rgw/rgw_asio_client.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_ASIO_CLIENT_H -#define RGW_ASIO_CLIENT_H +#pragma once #include #include @@ -58,5 +57,3 @@ class ClientIO : public io::RestfulClient, } // namespace asio } // namespace rgw - -#endif // RGW_ASIO_CLIENT_H diff --git a/src/rgw/rgw_asio_frontend.h b/src/rgw/rgw_asio_frontend.h 
index 940b717b5f2b..2de6f337a9fb 100644 --- a/src/rgw/rgw_asio_frontend.h +++ b/src/rgw/rgw_asio_frontend.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_ASIO_FRONTEND_H -#define RGW_ASIO_FRONTEND_H +#pragma once #include #include "rgw_frontend.h" @@ -24,5 +23,3 @@ public: void pause_for_new_config() override; void unpause_with_new_config() override; }; - -#endif // RGW_ASIO_FRONTEND_H diff --git a/src/rgw/rgw_auth.h b/src/rgw/rgw_auth.h index b0beb185bac1..82e0d0c9755a 100644 --- a/src/rgw/rgw_auth.h +++ b/src/rgw/rgw_auth.h @@ -1,9 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp - -#ifndef CEPH_RGW_AUTH_H -#define CEPH_RGW_AUTH_H +#pragma once #include #include @@ -791,5 +789,3 @@ uint32_t rgw_perms_from_aclspec_default_strategy( const rgw_user& uid, const rgw::auth::Identity::aclspec_t& aclspec, const DoutPrefixProvider *dpp); - -#endif /* CEPH_RGW_AUTH_H */ diff --git a/src/rgw/rgw_auth_filters.h b/src/rgw/rgw_auth_filters.h index 08f6d659c90d..9e3818bef071 100644 --- a/src/rgw/rgw_auth_filters.h +++ b/src/rgw/rgw_auth_filters.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_AUTH_FILTERS_H -#define CEPH_RGW_AUTH_FILTERS_H +#pragma once #include @@ -301,5 +300,3 @@ SysReqApplier add_sysreq(CephContext* const cct, } /* namespace auth */ } /* namespace rgw */ - -#endif /* CEPH_RGW_AUTH_FILTERS_H */ diff --git a/src/rgw/rgw_auth_keystone.h b/src/rgw/rgw_auth_keystone.h index 31a4388080a9..f3c9604370b9 100644 --- a/src/rgw/rgw_auth_keystone.h +++ b/src/rgw/rgw_auth_keystone.h @@ -1,9 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp - -#ifndef CEPH_RGW_AUTH_KEYSTONE_H -#define CEPH_RGW_AUTH_KEYSTONE_H +#pragma once #include #include @@ -196,5 +194,3 @@ public: }; /* namespace keystone */ }; /* namespace auth */ }; /* namespace rgw */ - -#endif /* CEPH_RGW_AUTH_KEYSTONE_H */ diff --git a/src/rgw/rgw_auth_registry.h b/src/rgw/rgw_auth_registry.h index 992ee46e81c7..b9d239aecbd1 100644 --- a/src/rgw/rgw_auth_registry.h +++ b/src/rgw/rgw_auth_registry.h @@ -1,9 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp - -#ifndef CEPH_RGW_AUTH_REGISTRY_H -#define CEPH_RGW_AUTH_REGISTRY_H +#pragma once #include #include @@ -97,5 +95,3 @@ public: using rgw_auth_registry_t = rgw::auth::StrategyRegistry; using rgw_auth_registry_ptr_t = std::unique_ptr; - -#endif /* CEPH_RGW_AUTH_REGISTRY_H */ diff --git a/src/rgw/rgw_auth_s3.h b/src/rgw/rgw_auth_s3.h index 2984c051662d..a4471467b860 100644 --- a/src/rgw/rgw_auth_s3.h +++ b/src/rgw/rgw_auth_s3.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_AUTH_S3_H -#define CEPH_RGW_AUTH_S3_H +#pragma once #include #include @@ -645,5 +644,3 @@ get_v2_signature(CephContext*, } /* namespace s3 */ } /* namespace auth */ } /* namespace rgw */ - -#endif diff --git a/src/rgw/rgw_b64.h b/src/rgw/rgw_b64.h index a1699ef61bcc..2948f6f31586 100644 --- a/src/rgw/rgw_b64.h +++ b/src/rgw/rgw_b64.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_B64_H -#define RGW_B64_H +#pragma once #include #include @@ -83,5 +82,3 @@ namespace rgw { return 
outstr; } } /* namespace */ - -#endif /* RGW_B64_H */ diff --git a/src/rgw/rgw_basic_types.h b/src/rgw/rgw_basic_types.h index 168e8bc63b81..1ccd160ba7ea 100644 --- a/src/rgw/rgw_basic_types.h +++ b/src/rgw/rgw_basic_types.h @@ -18,8 +18,7 @@ * radosgw or OSD contexts (e.g., rgw_sal.h, rgw_common.h) */ -#ifndef CEPH_RGW_BASIC_TYPES_H -#define CEPH_RGW_BASIC_TYPES_H +#pragma once #include #include @@ -283,5 +282,3 @@ struct RGWUploadPartInfo { static void generate_test_instances(std::list& o); }; WRITE_CLASS_ENCODER(RGWUploadPartInfo) - -#endif /* CEPH_RGW_BASIC_TYPES_H */ diff --git a/src/rgw/rgw_cache.h b/src/rgw/rgw_cache.h index cffa6573d63e..e70beb064462 100644 --- a/src/rgw/rgw_cache.h +++ b/src/rgw/rgw_cache.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGWCACHE_H -#define CEPH_RGWCACHE_H +#pragma once #include #include @@ -221,5 +220,3 @@ public: void unchain_cache(RGWChainedCache *cache); void invalidate_all(); }; - -#endif diff --git a/src/rgw/rgw_client_io.h b/src/rgw/rgw_client_io.h index 5e47aee0627a..aedfe4500b8d 100644 --- a/src/rgw/rgw_client_io.h +++ b/src/rgw/rgw_client_io.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_CLIENT_IO_H -#define CEPH_RGW_CLIENT_IO_H +#pragma once #include #include @@ -434,5 +433,3 @@ public: std::istream(static_cast(this)) { } }; - -#endif /* CEPH_RGW_CLIENT_IO_H */ diff --git a/src/rgw/rgw_client_io_filters.h b/src/rgw/rgw_client_io_filters.h index 538d7f16723d..55d405e1bb23 100644 --- a/src/rgw/rgw_client_io_filters.h +++ b/src/rgw/rgw_client_io_filters.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_CLIENT_IO_DECOIMPL_H -#define CEPH_RGW_CLIENT_IO_DECOIMPL_H +#pragma once #include @@ -453,4 +452,3 @@ ReorderingFilter add_reordering(T&& t) { } /* namespace io */ } /* namespace rgw */ -#endif /* CEPH_RGW_CLIENT_IO_DECOIMPL_H */ diff --git a/src/rgw/rgw_compression.h b/src/rgw/rgw_compression.h index e558f3bbd8ef..84250bfe43cc 100644 --- a/src/rgw/rgw_compression.h +++ b/src/rgw/rgw_compression.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_COMPRESSION_H -#define CEPH_RGW_COMPRESSION_H +#pragma once #include @@ -61,5 +60,3 @@ public: std::optional get_compressor_message() { return compressor_message; } }; /* RGWPutObj_Compress */ - -#endif /* CEPH_RGW_COMPRESSION_H */ diff --git a/src/rgw/rgw_coroutine.h b/src/rgw/rgw_coroutine.h index d43d0afbf87b..eb3216640c64 100644 --- a/src/rgw/rgw_coroutine.h +++ b/src/rgw/rgw_coroutine.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_COROUTINE_H -#define CEPH_RGW_COROUTINE_H +#pragma once #ifdef _ASSERT_H #define NEED_ASSERT_H @@ -721,5 +720,3 @@ public: virtual int finish() { return 0; } virtual void request_cleanup() {} }; - -#endif diff --git a/src/rgw/rgw_cors.h b/src/rgw/rgw_cors.h index 1620795e8b48..17eaeeb82301 100644 --- a/src/rgw/rgw_cors.h +++ b/src/rgw/rgw_cors.h @@ -13,8 +13,7 @@ * */ -#ifndef CEPH_RGW_CORS_H -#define CEPH_RGW_CORS_H +#pragma once #include #include @@ -133,4 +132,3 @@ static inline int validate_name_string(std::string_view o) { return -1; return 0; } -#endif /*CEPH_RGW_CORS_H*/ diff --git a/src/rgw/rgw_cors_s3.h 
b/src/rgw/rgw_cors_s3.h index f4ec8a1f95bf..8d92a3c5fd34 100644 --- a/src/rgw/rgw_cors_s3.h +++ b/src/rgw/rgw_cors_s3.h @@ -13,8 +13,7 @@ * */ -#ifndef CEPH_RGW_CORS_S3_H -#define CEPH_RGW_CORS_S3_H +#pragma once #include #include @@ -57,4 +56,3 @@ class RGWCORSXMLParser_S3 : public RGWXMLParser public: explicit RGWCORSXMLParser_S3(const DoutPrefixProvider *_dpp, CephContext *_cct) : dpp(_dpp), cct(_cct) {} }; -#endif /*CEPH_RGW_CORS_S3_H*/ diff --git a/src/rgw/rgw_cors_swift.h b/src/rgw/rgw_cors_swift.h index 3eff9bea32be..f5a1b14a0919 100644 --- a/src/rgw/rgw_cors_swift.h +++ b/src/rgw/rgw_cors_swift.h @@ -13,8 +13,7 @@ * */ -#ifndef CEPH_RGW_CORS_SWIFT3_H -#define CEPH_RGW_CORS_SWIFT3_H +#pragma once #include #include @@ -82,4 +81,3 @@ class RGWCORSConfiguration_SWIFT : public RGWCORSConfiguration return 0; } }; -#endif /*CEPH_RGW_CORS_SWIFT3_H*/ diff --git a/src/rgw/rgw_crypt.h b/src/rgw/rgw_crypt.h index 33d6bc3bbc38..6008dd05eaea 100644 --- a/src/rgw/rgw_crypt.h +++ b/src/rgw/rgw_crypt.h @@ -5,8 +5,7 @@ * Crypto filters for Put/Post/Get operations. */ -#ifndef CEPH_RGW_CRYPT_H -#define CEPH_RGW_CRYPT_H +#pragma once #include @@ -171,5 +170,3 @@ static inline std::string get_str_attribute(std::map& a } int rgw_remove_sse_s3_bucket_key(req_state *s); - -#endif diff --git a/src/rgw/rgw_crypt_sanitize.h b/src/rgw/rgw_crypt_sanitize.h index 1f862089cbdf..aa0261fc2518 100644 --- a/src/rgw/rgw_crypt_sanitize.h +++ b/src/rgw/rgw_crypt_sanitize.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_RGW_CRYPT_SANITIZE_H_ -#define RGW_RGW_CRYPT_SANITIZE_H_ +#pragma once #include #include "rgw_common.h" @@ -67,4 +66,3 @@ std::ostream& operator<<(std::ostream& out, const auth& x); std::ostream& operator<<(std::ostream& out, const log_content& x); } } -#endif /* RGW_RGW_CRYPT_SANITIZE_H_ */ diff --git a/src/rgw/rgw_d3n_cacherequest.h b/src/rgw/rgw_d3n_cacherequest.h index ad93a689f9c6..eac8c7610eeb 100644 --- a/src/rgw/rgw_d3n_cacherequest.h +++ b/src/rgw/rgw_d3n_cacherequest.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_CACHEREQUEST_H -#define RGW_CACHEREQUEST_H +#pragma once #include #include @@ -144,5 +143,3 @@ struct D3nL1CacheRequest { } }; - -#endif diff --git a/src/rgw/rgw_dmclock.h b/src/rgw/rgw_dmclock.h index 6805c0050003..6fad9cc1895e 100644 --- a/src/rgw/rgw_dmclock.h +++ b/src/rgw/rgw_dmclock.h @@ -14,8 +14,8 @@ * */ -#ifndef RGW_DMCLOCK_H -#define RGW_DMCLOCK_H +#pragma once + #include "dmclock/src/dmclock_server.h" namespace rgw::dmclock { @@ -50,5 +50,3 @@ inline scheduler_t get_scheduler_t(CephContext* const cct) } } // namespace rgw::dmclock - -#endif /* RGW_DMCLOCK_H */ diff --git a/src/rgw/rgw_dmclock_async_scheduler.h b/src/rgw/rgw_dmclock_async_scheduler.h index ccac09be7b05..7bde75870a5a 100644 --- a/src/rgw/rgw_dmclock_async_scheduler.h +++ b/src/rgw/rgw_dmclock_async_scheduler.h @@ -12,8 +12,7 @@ * */ -#ifndef RGW_DMCLOCK_ASYNC_SCHEDULER_H -#define RGW_DMCLOCK_ASYNC_SCHEDULER_H +#pragma once #include "common/async/completion.h" @@ -216,4 +215,3 @@ private: }; } // namespace rgw::dmclock -#endif /* RGW_DMCLOCK_ASYNC_SCHEDULER_H */ diff --git a/src/rgw/rgw_dmclock_scheduler.h b/src/rgw/rgw_dmclock_scheduler.h index f3dc229db384..655e12bef352 100644 --- a/src/rgw/rgw_dmclock_scheduler.h +++ b/src/rgw/rgw_dmclock_scheduler.h @@ -12,8 +12,7 @@ * */ -#ifndef RGW_DMCLOCK_SCHEDULER_H -#define 
RGW_DMCLOCK_SCHEDULER_H +#pragma once #include "common/ceph_time.h" #include "common/ceph_context.h" @@ -85,5 +84,3 @@ private: }; } // namespace rgw::dmclock - -#endif // RGW_DMCLOCK_SCHEDULER_H diff --git a/src/rgw/rgw_dmclock_scheduler_ctx.h b/src/rgw/rgw_dmclock_scheduler_ctx.h index be3b2cc27941..f27b81c266e5 100644 --- a/src/rgw/rgw_dmclock_scheduler_ctx.h +++ b/src/rgw/rgw_dmclock_scheduler_ctx.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_DMCLOCK_SCHEDULER_CTX_H -#define RGW_DMCLOCK_SCHEDULER_CTX_H +#pragma once #include "common/perf_counters.h" #include "common/ceph_context.h" @@ -118,5 +117,3 @@ private: }; } // namespace rgw::dmclock - -#endif /* RGW_DMCLOCK_SCHEDULER_CTX_H */ diff --git a/src/rgw/rgw_dmclock_sync_scheduler.h b/src/rgw/rgw_dmclock_sync_scheduler.h index 8b3edc3a6138..740234965e93 100644 --- a/src/rgw/rgw_dmclock_sync_scheduler.h +++ b/src/rgw/rgw_dmclock_sync_scheduler.h @@ -12,8 +12,7 @@ * */ -#ifndef RGW_DMCLOCK_SYNC_SCHEDULER_H -#define RGW_DMCLOCK_SYNC_SCHEDULER_H +#pragma once #include "rgw_dmclock_scheduler.h" #include "rgw_dmclock_scheduler_ctx.h" @@ -76,4 +75,3 @@ SyncScheduler::SyncScheduler(CephContext *cct, GetClientCounters&& counters, {} } // namespace rgw::dmclock -#endif /* RGW_DMCLOCK_SYNC_SCHEDULER_H */ diff --git a/src/rgw/rgw_es_query.h b/src/rgw/rgw_es_query.h index 27cc36d75260..f96e06f75057 100644 --- a/src/rgw/rgw_es_query.h +++ b/src/rgw/rgw_es_query.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_ES_QUERY_H -#define CEPH_RGW_ES_QUERY_H +#pragma once #include "rgw_string.h" @@ -163,6 +162,3 @@ public: return (restricted_fields && restricted_fields->find(f) != restricted_fields->end()); } }; - - -#endif diff --git a/src/rgw/rgw_file.h b/src/rgw/rgw_file.h index fbda86ba4cb3..fbbe782b73e0 100644 --- a/src/rgw/rgw_file.h +++ b/src/rgw/rgw_file.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_FILE_H -#define RGW_FILE_H +#pragma once #include "include/rados/rgw_file.h" @@ -2855,5 +2854,3 @@ public: } /* namespace rgw */ - -#endif /* RGW_FILE_H */ diff --git a/src/rgw/rgw_formats.h b/src/rgw/rgw_formats.h index d7e47259d583..12ad224a3dbb 100644 --- a/src/rgw/rgw_formats.h +++ b/src/rgw/rgw_formats.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_FORMATS_H -#define CEPH_RGW_FORMATS_H +#pragma once #include "common/Formatter.h" @@ -132,5 +131,3 @@ protected: public: RGWNullFlusher() : RGWFormatterFlusher(nullptr) {} }; - -#endif diff --git a/src/rgw/rgw_frontend.h b/src/rgw/rgw_frontend.h index 34d3b06586cb..4876fb8f8537 100644 --- a/src/rgw/rgw_frontend.h +++ b/src/rgw/rgw_frontend.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_FRONTEND_H -#define RGW_FRONTEND_H +#pragma once #include #include @@ -210,5 +209,3 @@ class RGWFrontendPauser : public RGWRealmReloader::Pauser { pauser->resume(driver); } }; - -#endif /* RGW_FRONTEND_H */ diff --git a/src/rgw/rgw_http_client.h b/src/rgw/rgw_http_client.h index d20b28cf4f94..dbd705a1880d 100644 --- a/src/rgw/rgw_http_client.h +++ b/src/rgw/rgw_http_client.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 
smarttab ft=cpp -#ifndef CEPH_RGW_HTTP_CLIENT_H -#define CEPH_RGW_HTTP_CLIENT_H +#pragma once #include "common/async/yield_context.h" #include "common/Cond.h" @@ -347,4 +346,3 @@ public: static int send(RGWHTTPClient *req); static int process(RGWHTTPClient *req, optional_yield y); }; -#endif diff --git a/src/rgw/rgw_http_client_curl.h b/src/rgw/rgw_http_client_curl.h index 2a49a2c36c4d..a28826b0d839 100644 --- a/src/rgw/rgw_http_client_curl.h +++ b/src/rgw/rgw_http_client_curl.h @@ -13,8 +13,7 @@ * */ -#ifndef RGW_HTTP_CLIENT_CURL_H -#define RGW_HTTP_CLIENT_CURL_H +#pragma once #include #include @@ -28,5 +27,3 @@ void setup_curl(boost::optional m); void cleanup_curl(); } } - -#endif diff --git a/src/rgw/rgw_http_errors.h b/src/rgw/rgw_http_errors.h index d8674552ab6a..5e052819e052 100644 --- a/src/rgw/rgw_http_errors.h +++ b/src/rgw/rgw_http_errors.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_HTTP_ERRORS_H_ -#define RGW_HTTP_ERRORS_H_ +#pragma once #include "rgw_common.h" @@ -43,6 +42,3 @@ static inline int rgw_http_error_to_errno(int http_err) return 0; /* unreachable */ } - - -#endif diff --git a/src/rgw/rgw_iam_policy.h b/src/rgw/rgw_iam_policy.h index 564ddd530c0b..ff6061c7550e 100644 --- a/src/rgw/rgw_iam_policy.h +++ b/src/rgw/rgw_iam_policy.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_IAM_POLICY_H -#define CEPH_RGW_IAM_POLICY_H +#pragma once #include #include @@ -580,5 +579,3 @@ bool is_public(const Policy& p); } } - -#endif diff --git a/src/rgw/rgw_iam_policy_keywords.h b/src/rgw/rgw_iam_policy_keywords.h index 1c94dfe17103..8130ace456c6 100644 --- a/src/rgw/rgw_iam_policy_keywords.h +++ b/src/rgw/rgw_iam_policy_keywords.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_POLICY_S3V2_KEYWORDS_H -#define CEPH_RGW_POLICY_S3V2_KEYWORDS_H +#pragma once namespace rgw { namespace IAM { @@ -138,5 +137,3 @@ enum class Type { }; } } - -#endif // CEPH_RGW_POLICY_S3V2_KEYWORDS_H diff --git a/src/rgw/rgw_keystone.h b/src/rgw/rgw_keystone.h index 84961e4f9d29..0ba88278268d 100644 --- a/src/rgw/rgw_keystone.h +++ b/src/rgw/rgw_keystone.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_KEYSTONE_H -#define CEPH_RGW_KEYSTONE_H +#pragma once #include #include @@ -332,5 +331,3 @@ public: }; /* namespace keystone */ }; /* namespace rgw */ - -#endif diff --git a/src/rgw/rgw_kmip_client.h b/src/rgw/rgw_kmip_client.h index efc7db325b90..2992921136e5 100644 --- a/src/rgw/rgw_kmip_client.h +++ b/src/rgw/rgw_kmip_client.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_KMIP_CLIENT_H -#define CEPH_RGW_KMIP_CLIENT_H +#pragma once class RGWKMIPManager; @@ -64,4 +63,3 @@ public: void rgw_kmip_client_init(RGWKMIPManager &); void rgw_kmip_client_cleanup(); -#endif diff --git a/src/rgw/rgw_kmip_client_impl.h b/src/rgw/rgw_kmip_client_impl.h index 841df87f4c33..d36903a4b795 100644 --- a/src/rgw/rgw_kmip_client_impl.h +++ b/src/rgw/rgw_kmip_client_impl.h @@ -1,8 +1,8 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_KMIP_CLIENT_IMPL_H -#define CEPH_RGW_KMIP_CLIENT_IMPL_H +#pragma once + 
struct RGWKmipWorker; class RGWKMIPManagerImpl: public RGWKMIPManager { protected: @@ -25,5 +25,3 @@ public: void stop(); friend RGWKmipWorker; }; -#endif - diff --git a/src/rgw/rgw_kms.h b/src/rgw/rgw_kms.h index ba9b436139ed..f8e8655f261c 100644 --- a/src/rgw/rgw_kms.h +++ b/src/rgw/rgw_kms.h @@ -5,8 +5,7 @@ * Server-side encryption integrations with Key Management Systems (SSE-KMS) */ -#ifndef CEPH_RGW_KMS_H -#define CEPH_RGW_KMS_H +#pragma once #include @@ -63,4 +62,3 @@ public: virtual int get_key(const DoutPrefixProvider *dpp, std::string_view key_id, std::string& actual_key) = 0; virtual ~SecretEngine(){}; }; -#endif diff --git a/src/rgw/rgw_lc.h b/src/rgw/rgw_lc.h index e74b67fca6d0..bd8efd9b6d03 100644 --- a/src/rgw/rgw_lc.h +++ b/src/rgw/rgw_lc.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_LC_H -#define CEPH_RGW_LC_H +#pragma once #include #include @@ -639,5 +638,3 @@ bool s3_multipart_abort_header( std::string& rule_id); } // namespace rgw::lc - -#endif diff --git a/src/rgw/rgw_lc_s3.h b/src/rgw/rgw_lc_s3.h index 84ffdc6c8ad8..5486aef35805 100644 --- a/src/rgw/rgw_lc_s3.h +++ b/src/rgw/rgw_lc_s3.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_LC_S3_H -#define CEPH_RGW_LC_S3_H +#pragma once #include #include @@ -99,5 +98,3 @@ public: int rebuild(RGWLifecycleConfiguration& dest); void dump_xml(Formatter *f) const; }; - -#endif diff --git a/src/rgw/rgw_ldap.h b/src/rgw/rgw_ldap.h index 06986e5f59ac..05a48ce19000 100644 --- a/src/rgw/rgw_ldap.h +++ b/src/rgw/rgw_ldap.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_LDAP_H -#define RGW_LDAP_H +#pragma once #include "acconfig.h" @@ -137,5 +136,3 @@ namespace rgw { #include "include/ceph_assert.h" std::string parse_rgw_ldap_bindpw(CephContext* ctx); - -#endif /* RGW_LDAP_H */ diff --git a/src/rgw/rgw_lib.h b/src/rgw/rgw_lib.h index 02317ea8e084..1ad54b49b485 100644 --- a/src/rgw/rgw_lib.h +++ b/src/rgw/rgw_lib.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_LIB_H -#define RGW_LIB_H +#pragma once #include #include "rgw_common.h" @@ -208,5 +207,3 @@ namespace rgw { }; /* RGWLibContinuedReq */ } /* namespace rgw */ - -#endif /* RGW_LIB_H */ diff --git a/src/rgw/rgw_lib_frontend.h b/src/rgw/rgw_lib_frontend.h index 57e58c522e59..1772724d218e 100644 --- a/src/rgw/rgw_lib_frontend.h +++ b/src/rgw/rgw_lib_frontend.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_LIB_FRONTEND_H -#define RGW_LIB_FRONTEND_H +#pragma once #include @@ -112,5 +111,3 @@ namespace rgw { }; /* RGWLibFrontend */ } /* namespace rgw */ - -#endif /* RGW_LIB_FRONTEND_H */ diff --git a/src/rgw/rgw_loadgen.h b/src/rgw/rgw_loadgen.h index 5a0abca57f79..7f3f847c2b10 100644 --- a/src/rgw/rgw_loadgen.h +++ b/src/rgw/rgw_loadgen.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_LOADGEN_H -#define CEPH_RGW_LOADGEN_H +#pragma once #include #include @@ -71,5 +70,3 @@ public: size_t complete_request() override; }; - -#endif diff --git a/src/rgw/rgw_log.h b/src/rgw/rgw_log.h index 0c97b2f8a64b..1dd79273e6a1 100644 --- a/src/rgw/rgw_log.h +++ 
b/src/rgw/rgw_log.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_LOG_H -#define CEPH_RGW_LOG_H +#pragma once #include #include "rgw_common.h" @@ -288,6 +287,3 @@ void rgw_log_usage_init(CephContext* cct, rgw::sal::Driver* driver); void rgw_log_usage_finalize(); void rgw_format_ops_log_entry(struct rgw_log_entry& entry, ceph::Formatter *formatter); - -#endif /* CEPH_RGW_LOG_H */ - diff --git a/src/rgw/rgw_meta_sync_status.h b/src/rgw/rgw_meta_sync_status.h index 6272b471db01..f8a2ae3ee7ba 100644 --- a/src/rgw/rgw_meta_sync_status.h +++ b/src/rgw/rgw_meta_sync_status.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_META_SYNC_STATUS_H -#define RGW_META_SYNC_STATUS_H +#pragma once #include @@ -120,5 +119,3 @@ struct rgw_meta_sync_status { static void generate_test_instances(std::list& ls); }; WRITE_CLASS_ENCODER(rgw_meta_sync_status) - -#endif diff --git a/src/rgw/rgw_multi.h b/src/rgw/rgw_multi.h index bb371e71c5e5..f57c90e74895 100644 --- a/src/rgw/rgw_multi.h +++ b/src/rgw/rgw_multi.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_MULTI_H -#define CEPH_RGW_MULTI_H +#pragma once #include #include "rgw_xml.h" @@ -61,5 +60,3 @@ public: }; extern bool is_v2_upload_id(const std::string& upload_id); - -#endif diff --git a/src/rgw/rgw_multi_del.h b/src/rgw/rgw_multi_del.h index 6187aae37e8d..b060decf420a 100644 --- a/src/rgw/rgw_multi_del.h +++ b/src/rgw/rgw_multi_del.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_MULTI_DELETE_H_ -#define RGW_MULTI_DELETE_H_ +#pragma once #include #include "rgw_xml.h" @@ -61,6 +60,3 @@ public: RGWMultiDelXMLParser() {} ~RGWMultiDelXMLParser() override {} }; - - -#endif diff --git a/src/rgw/rgw_object_lock.h b/src/rgw/rgw_object_lock.h index 997c660e4cd1..27c73feaec92 100644 --- a/src/rgw/rgw_object_lock.h +++ b/src/rgw/rgw_object_lock.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_OBJECT_LOCK_H -#define CEPH_RGW_OBJECT_LOCK_H +#pragma once #include #include "common/ceph_time.h" @@ -221,4 +220,3 @@ public: bool is_enabled() const; }; WRITE_CLASS_ENCODER(RGWObjectLegalHold) -#endif //CEPH_RGW_OBJECT_LOCK_H diff --git a/src/rgw/rgw_oidc_provider.h b/src/rgw/rgw_oidc_provider.h index c3b794df0d62..581ee879a642 100644 --- a/src/rgw/rgw_oidc_provider.h +++ b/src/rgw/rgw_oidc_provider.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_OIDC_PROVIDER_H -#define CEPH_RGW_OIDC_PROVIDER_H +#pragma once #include @@ -120,5 +119,3 @@ public: WRITE_CLASS_ENCODER(RGWOIDCProvider) } } // namespace rgw::sal -#endif /* CEPH_RGW_OIDC_PROVIDER_H */ - diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h index d0ff70b3132d..a0e8b273ce07 100644 --- a/src/rgw/rgw_op.h +++ b/src/rgw/rgw_op.h @@ -10,8 +10,7 @@ * to provide additional virtual methods such as send_response or get_params. 
*/ -#ifndef CEPH_RGW_OP_H -#define CEPH_RGW_OP_H +#pragma once #include @@ -2669,5 +2668,3 @@ int rgw_policy_from_attrset(const DoutPrefixProvider *dpp, CephContext *cct, std::map& attrset, RGWAccessControlPolicy *policy); - -#endif /* CEPH_RGW_OP_H */ diff --git a/src/rgw/rgw_opa.h b/src/rgw/rgw_opa.h index 38824c58aeb2..6fd3b21bdcf0 100644 --- a/src/rgw/rgw_opa.h +++ b/src/rgw/rgw_opa.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_OPA_H -#define RGW_OPA_H +#pragma once #include "rgw_common.h" #include "rgw_op.h" @@ -10,5 +9,3 @@ /* authorize request using OPA */ int rgw_opa_authorize(RGWOp*& op, req_state* s); - -#endif /* RGW_OPA_H */ diff --git a/src/rgw/rgw_os_lib.h b/src/rgw/rgw_os_lib.h index b65c7c22accb..65df0a726659 100644 --- a/src/rgw/rgw_os_lib.h +++ b/src/rgw/rgw_os_lib.h @@ -1,12 +1,9 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_OS_LIB_H -#define RGW_OS_LIB_H +#pragma once #include #include "rgw_common.h" #include "rgw_lib.h" - -#endif /* RGW_OS_LIB_H */ diff --git a/src/rgw/rgw_period_history.h b/src/rgw/rgw_period_history.h index 0d412c76a3be..3d18fbf9e227 100644 --- a/src/rgw/rgw_period_history.h +++ b/src/rgw/rgw_period_history.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_PERIOD_HISTORY_H -#define RGW_PERIOD_HISTORY_H +#pragma once #include #include @@ -113,5 +112,3 @@ class RGWPeriodHistory final { /// the current_history Cursor lookup(epoch_t realm_epoch); }; - -#endif // RGW_PERIOD_HISTORY_H diff --git a/src/rgw/rgw_period_puller.h b/src/rgw/rgw_period_puller.h index 654029dd1c4e..88138d36b8ca 100644 --- a/src/rgw/rgw_period_puller.h +++ b/src/rgw/rgw_period_puller.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_PERIOD_PULLER_H -#define CEPH_RGW_PERIOD_PULLER_H +#pragma once #include "rgw_period_history.h" #include "include/common_fwd.h" @@ -23,5 +22,3 @@ class RGWPeriodPuller : public RGWPeriodHistory::Puller { int pull(const DoutPrefixProvider *dpp, const std::string& period_id, RGWPeriod& period, optional_yield y) override; }; - -#endif // CEPH_RGW_PERIOD_PULLER_H diff --git a/src/rgw/rgw_period_pusher.h b/src/rgw/rgw_period_pusher.h index ae267a11e78e..3ea7bd7ddebe 100644 --- a/src/rgw/rgw_period_pusher.h +++ b/src/rgw/rgw_period_pusher.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_PERIOD_PUSHER_H -#define RGW_PERIOD_PUSHER_H +#pragma once #include #include @@ -53,5 +52,3 @@ class RGWPeriodPusher final : public RGWRealmWatcher::Watcher, class CRThread; //< contains thread, coroutine manager, http manager std::unique_ptr cr_thread; //< thread to run the push coroutines }; - -#endif // RGW_PERIOD_PUSHER_H diff --git a/src/rgw/rgw_policy_s3.h b/src/rgw/rgw_policy_s3.h index 14ad6c4e3af1..2a8a7ab096fd 100644 --- a/src/rgw/rgw_policy_s3.h +++ b/src/rgw/rgw_policy_s3.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_POLICY_H -#define CEPH_RGW_POLICY_H +#pragma once #include @@ -56,4 +55,3 @@ public: int check(RGWPolicyEnv *env, std::string& err_msg); int from_json(bufferlist& bl, std::string& err_msg); }; -#endif diff --git a/src/rgw/rgw_process.h 
b/src/rgw/rgw_process.h index 9d45362248e1..67ebb710a4c2 100644 --- a/src/rgw/rgw_process.h +++ b/src/rgw/rgw_process.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_PROCESS_H -#define RGW_PROCESS_H +#pragma once #include "rgw_common.h" #include "rgw_acl.h" @@ -158,5 +157,3 @@ extern int rgw_process_authenticated(RGWHandler_REST* handler, bool skip_retarget = false); #undef dout_context - -#endif /* RGW_PROCESS_H */ diff --git a/src/rgw/rgw_pubsub.cc b/src/rgw/rgw_pubsub.cc deleted file mode 100644 index b9aa54bacd8c..000000000000 --- a/src/rgw/rgw_pubsub.cc +++ /dev/null @@ -1,723 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab ft=cpp - -#include "services/svc_zone.h" -#include "rgw_b64.h" -#include "rgw_sal.h" -#include "rgw_sal_rados.h" -#include "rgw_pubsub.h" -#include "rgw_tools.h" -#include "rgw_xml.h" -#include "rgw_arn.h" -#include "rgw_pubsub_push.h" -#include -#include - -#define dout_subsys ceph_subsys_rgw - -using namespace std; -void set_event_id(std::string& id, const std::string& hash, const utime_t& ts) { - char buf[64]; - const auto len = snprintf(buf, sizeof(buf), "%010ld.%06ld.%s", (long)ts.sec(), (long)ts.usec(), hash.c_str()); - if (len > 0) { - id.assign(buf, len); - } -} - -bool rgw_s3_key_filter::decode_xml(XMLObj* obj) { - XMLObjIter iter = obj->find("FilterRule"); - XMLObj *o; - - const auto throw_if_missing = true; - auto prefix_not_set = true; - auto suffix_not_set = true; - auto regex_not_set = true; - std::string name; - - while ((o = iter.get_next())) { - RGWXMLDecoder::decode_xml("Name", name, o, throw_if_missing); - if (name == "prefix" && prefix_not_set) { - prefix_not_set = false; - RGWXMLDecoder::decode_xml("Value", prefix_rule, o, throw_if_missing); - } else if (name == "suffix" && suffix_not_set) { - suffix_not_set = false; - RGWXMLDecoder::decode_xml("Value", suffix_rule, o, throw_if_missing); - } else if (name == "regex" && regex_not_set) { - regex_not_set = false; - RGWXMLDecoder::decode_xml("Value", regex_rule, o, throw_if_missing); - } else { - throw RGWXMLDecoder::err("invalid/duplicate S3Key filter rule name: '" + name + "'"); - } - } - return true; -} - -void rgw_s3_key_filter::dump_xml(Formatter *f) const { - if (!prefix_rule.empty()) { - f->open_object_section("FilterRule"); - ::encode_xml("Name", "prefix", f); - ::encode_xml("Value", prefix_rule, f); - f->close_section(); - } - if (!suffix_rule.empty()) { - f->open_object_section("FilterRule"); - ::encode_xml("Name", "suffix", f); - ::encode_xml("Value", suffix_rule, f); - f->close_section(); - } - if (!regex_rule.empty()) { - f->open_object_section("FilterRule"); - ::encode_xml("Name", "regex", f); - ::encode_xml("Value", regex_rule, f); - f->close_section(); - } -} - -bool rgw_s3_key_filter::has_content() const { - return !(prefix_rule.empty() && suffix_rule.empty() && regex_rule.empty()); -} - -bool rgw_s3_key_value_filter::decode_xml(XMLObj* obj) { - kv.clear(); - XMLObjIter iter = obj->find("FilterRule"); - XMLObj *o; - - const auto throw_if_missing = true; - - std::string key; - std::string value; - - while ((o = iter.get_next())) { - RGWXMLDecoder::decode_xml("Name", key, o, throw_if_missing); - RGWXMLDecoder::decode_xml("Value", value, o, throw_if_missing); - kv.emplace(key, value); - } - return true; -} - -void rgw_s3_key_value_filter::dump_xml(Formatter *f) const { - for (const auto& key_value : kv) { - 
f->open_object_section("FilterRule"); - ::encode_xml("Name", key_value.first, f); - ::encode_xml("Value", key_value.second, f); - f->close_section(); - } -} - -bool rgw_s3_key_value_filter::has_content() const { - return !kv.empty(); -} - -bool rgw_s3_filter::decode_xml(XMLObj* obj) { - RGWXMLDecoder::decode_xml("S3Key", key_filter, obj); - RGWXMLDecoder::decode_xml("S3Metadata", metadata_filter, obj); - RGWXMLDecoder::decode_xml("S3Tags", tag_filter, obj); - return true; -} - -void rgw_s3_filter::dump_xml(Formatter *f) const { - if (key_filter.has_content()) { - ::encode_xml("S3Key", key_filter, f); - } - if (metadata_filter.has_content()) { - ::encode_xml("S3Metadata", metadata_filter, f); - } - if (tag_filter.has_content()) { - ::encode_xml("S3Tags", tag_filter, f); - } -} - -bool rgw_s3_filter::has_content() const { - return key_filter.has_content() || - metadata_filter.has_content() || - tag_filter.has_content(); -} - -bool match(const rgw_s3_key_filter& filter, const std::string& key) { - const auto key_size = key.size(); - const auto prefix_size = filter.prefix_rule.size(); - if (prefix_size != 0) { - // prefix rule exists - if (prefix_size > key_size) { - // if prefix is longer than key, we fail - return false; - } - if (!std::equal(filter.prefix_rule.begin(), filter.prefix_rule.end(), key.begin())) { - return false; - } - } - const auto suffix_size = filter.suffix_rule.size(); - if (suffix_size != 0) { - // suffix rule exists - if (suffix_size > key_size) { - // if suffix is longer than key, we fail - return false; - } - if (!std::equal(filter.suffix_rule.begin(), filter.suffix_rule.end(), (key.end() - suffix_size))) { - return false; - } - } - if (!filter.regex_rule.empty()) { - // TODO add regex caching in the filter - const std::regex base_regex(filter.regex_rule); - if (!std::regex_match(key, base_regex)) { - return false; - } - } - return true; -} - -bool match(const rgw_s3_key_value_filter& filter, const KeyValueMap& kv) { - // all filter pairs must exist with the same value in the object's metadata/tags - // object metadata/tags may include items not in the filter - return std::includes(kv.begin(), kv.end(), filter.kv.begin(), filter.kv.end()); -} - -bool match(const rgw_s3_key_value_filter& filter, const KeyMultiValueMap& kv) { - // all filter pairs must exist with the same value in the object's metadata/tags - // object metadata/tags may include items not in the filter - for (auto& filter : filter.kv) { - auto result = kv.equal_range(filter.first); - if (std::any_of(result.first, result.second, [&filter](const pair& p) { return p.second == filter.second;})) - continue; - else - return false; - } - return true; -} - -bool match(const rgw::notify::EventTypeList& events, rgw::notify::EventType event) { - // if event list exists, and none of the events in the list matches the event type, filter the message - if (!events.empty() && std::find(events.begin(), events.end(), event) == events.end()) { - return false; - } - return true; -} - -void do_decode_xml_obj(rgw::notify::EventTypeList& l, const string& name, XMLObj *obj) { - l.clear(); - - XMLObjIter iter = obj->find(name); - XMLObj *o; - - while ((o = iter.get_next())) { - std::string val; - decode_xml_obj(val, o); - l.push_back(rgw::notify::from_string(val)); - } -} - -bool rgw_pubsub_s3_notification::decode_xml(XMLObj *obj) { - const auto throw_if_missing = true; - RGWXMLDecoder::decode_xml("Id", id, obj, throw_if_missing); - - RGWXMLDecoder::decode_xml("Topic", topic_arn, obj, throw_if_missing); - 
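
The match() helpers just removed implement AND semantics: every rule present in the filter must hold, and for the key/value variant std::includes over two sorted ranges is what enforces "all filter pairs appear, with equal values, in the object's metadata/tags". A self-contained sketch of that check (hypothetical data; std::map stands in for the flat_map used above):

    #include <algorithm>
    #include <cassert>
    #include <map>
    #include <string>

    int main() {
      using Map = std::map<std::string, std::string>;
      const Map object_meta = {{"color", "blue"}, {"owner", "alice"}, {"size", "42"}};
      Map filter = {{"color", "blue"}, {"owner", "alice"}};

      // all filter pairs are present in the (sorted) object metadata;
      // extra metadata entries are allowed -- the filter matches
      assert(std::includes(object_meta.begin(), object_meta.end(),
                           filter.begin(), filter.end()));

      // a value mismatch on any filter key makes the match fail
      filter["owner"] = "bob";
      assert(!std::includes(object_meta.begin(), object_meta.end(),
                            filter.begin(), filter.end()));
    }
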
RGWXMLDecoder::decode_xml("Filter", filter, obj); - - do_decode_xml_obj(events, "Event", obj); - if (events.empty()) { - // if no events are provided, we assume all events - events.push_back(rgw::notify::ObjectCreated); - events.push_back(rgw::notify::ObjectRemoved); - } - return true; -} - -void rgw_pubsub_s3_notification::dump_xml(Formatter *f) const { - ::encode_xml("Id", id, f); - ::encode_xml("Topic", topic_arn.c_str(), f); - if (filter.has_content()) { - ::encode_xml("Filter", filter, f); - } - for (const auto& event : events) { - ::encode_xml("Event", rgw::notify::to_string(event), f); - } -} - -bool rgw_pubsub_s3_notifications::decode_xml(XMLObj *obj) { - do_decode_xml_obj(list, "TopicConfiguration", obj); - return true; -} - -rgw_pubsub_s3_notification::rgw_pubsub_s3_notification(const rgw_pubsub_topic_filter& topic_filter) : - id(topic_filter.s3_id), events(topic_filter.events), topic_arn(topic_filter.topic.arn), filter(topic_filter.s3_filter) {} - -void rgw_pubsub_s3_notifications::dump_xml(Formatter *f) const { - do_encode_xml("NotificationConfiguration", list, "TopicConfiguration", f); -} - -void rgw_pubsub_s3_event::dump(Formatter *f) const { - encode_json("eventVersion", eventVersion, f); - encode_json("eventSource", eventSource, f); - encode_json("awsRegion", awsRegion, f); - utime_t ut(eventTime); - encode_json("eventTime", ut, f); - encode_json("eventName", eventName, f); - { - Formatter::ObjectSection s(*f, "userIdentity"); - encode_json("principalId", userIdentity, f); - } - { - Formatter::ObjectSection s(*f, "requestParameters"); - encode_json("sourceIPAddress", sourceIPAddress, f); - } - { - Formatter::ObjectSection s(*f, "responseElements"); - encode_json("x-amz-request-id", x_amz_request_id, f); - encode_json("x-amz-id-2", x_amz_id_2, f); - } - { - Formatter::ObjectSection s(*f, "s3"); - encode_json("s3SchemaVersion", s3SchemaVersion, f); - encode_json("configurationId", configurationId, f); - { - Formatter::ObjectSection sub_s(*f, "bucket"); - encode_json("name", bucket_name, f); - { - Formatter::ObjectSection sub_sub_s(*f, "ownerIdentity"); - encode_json("principalId", bucket_ownerIdentity, f); - } - encode_json("arn", bucket_arn, f); - encode_json("id", bucket_id, f); - } - { - Formatter::ObjectSection sub_s(*f, "object"); - encode_json("key", object_key, f); - encode_json("size", object_size, f); - encode_json("eTag", object_etag, f); - encode_json("versionId", object_versionId, f); - encode_json("sequencer", object_sequencer, f); - encode_json("metadata", x_meta_map, f); - encode_json("tags", tags, f); - } - } - encode_json("eventId", id, f); - encode_json("opaqueData", opaque_data, f); -} - -void rgw_pubsub_topic::dump(Formatter *f) const -{ - encode_json("user", user, f); - encode_json("name", name, f); - encode_json("dest", dest, f); - encode_json("arn", arn, f); - encode_json("opaqueData", opaque_data, f); -} - -void rgw_pubsub_topic::dump_xml(Formatter *f) const -{ - encode_xml("User", user, f); - encode_xml("Name", name, f); - encode_xml("EndPoint", dest, f); - encode_xml("TopicArn", arn, f); - encode_xml("OpaqueData", opaque_data, f); -} - -void encode_xml_key_value_entry(const std::string& key, const std::string& value, Formatter *f) { - f->open_object_section("entry"); - encode_xml("key", key, f); - encode_xml("value", value, f); - f->close_section(); // entry -} - -void rgw_pubsub_topic::dump_xml_as_attributes(Formatter *f) const -{ - f->open_array_section("Attributes"); - std::string str_user; - user.to_str(str_user); - 
encode_xml_key_value_entry("User", str_user, f); - encode_xml_key_value_entry("Name", name, f); - encode_xml_key_value_entry("EndPoint", dest.to_json_str(), f); - encode_xml_key_value_entry("TopicArn", arn, f); - encode_xml_key_value_entry("OpaqueData", opaque_data, f); - f->close_section(); // Attributes -} - -void encode_json(const char *name, const rgw::notify::EventTypeList& l, Formatter *f) -{ - f->open_array_section(name); - for (auto iter = l.cbegin(); iter != l.cend(); ++iter) { - f->dump_string("obj", rgw::notify::to_string(*iter)); - } - f->close_section(); -} - -void rgw_pubsub_topic_filter::dump(Formatter *f) const -{ - encode_json("topic", topic, f); - encode_json("events", events, f); -} - -void rgw_pubsub_topic_subs::dump(Formatter *f) const -{ - encode_json("topic", topic, f); - encode_json("subs", subs, f); -} - -void rgw_pubsub_bucket_topics::dump(Formatter *f) const -{ - Formatter::ArraySection s(*f, "topics"); - for (auto& t : topics) { - encode_json(t.first.c_str(), t.second, f); - } -} - -void rgw_pubsub_topics::dump(Formatter *f) const -{ - Formatter::ArraySection s(*f, "topics"); - for (auto& t : topics) { - encode_json(t.first.c_str(), t.second, f); - } -} - -void rgw_pubsub_topics::dump_xml(Formatter *f) const -{ - for (auto& t : topics) { - encode_xml("member", t.second.topic, f); - } -} - -void rgw_pubsub_sub_dest::dump(Formatter *f) const -{ - encode_json("bucket_name", bucket_name, f); - encode_json("oid_prefix", oid_prefix, f); - encode_json("push_endpoint", push_endpoint, f); - encode_json("push_endpoint_args", push_endpoint_args, f); - encode_json("push_endpoint_topic", arn_topic, f); - encode_json("stored_secret", stored_secret, f); - encode_json("persistent", persistent, f); -} - -void rgw_pubsub_sub_dest::dump_xml(Formatter *f) const -{ - // first 2 members are omitted here since they - // don't apply to AWS compliant topics - encode_xml("EndpointAddress", push_endpoint, f); - encode_xml("EndpointArgs", push_endpoint_args, f); - encode_xml("EndpointTopic", arn_topic, f); - encode_xml("HasStoredSecret", stored_secret, f); - encode_xml("Persistent", persistent, f); -} - -std::string rgw_pubsub_sub_dest::to_json_str() const -{ - // first 2 members are omitted here since they - // don't apply to AWS compliant topics - JSONFormatter f; - f.open_object_section(""); - encode_json("EndpointAddress", push_endpoint, &f); - encode_json("EndpointArgs", push_endpoint_args, &f); - encode_json("EndpointTopic", arn_topic, &f); - encode_json("HasStoredSecret", stored_secret, &f); - encode_json("Persistent", persistent, &f); - f.close_section(); - std::stringstream ss; - f.flush(ss); - return ss.str(); -} - -void rgw_pubsub_sub_config::dump(Formatter *f) const -{ - encode_json("user", user, f); - encode_json("name", name, f); - encode_json("topic", topic, f); - encode_json("dest", dest, f); - encode_json("s3_id", s3_id, f); -} - -RGWPubSub::RGWPubSub(rgw::sal::RadosStore* _store, const std::string& _tenant) - : store(_store), tenant(_tenant), svc_sysobj(store->svc()->sysobj) -{ - get_meta_obj(&meta_obj); -} - -int RGWPubSub::remove(const DoutPrefixProvider *dpp, - const rgw_raw_obj& obj, - RGWObjVersionTracker *objv_tracker, - optional_yield y) -{ - int ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, obj.pool, obj.oid, objv_tracker, y); - if (ret < 0) { - return ret; - } - - return 0; -} - -int RGWPubSub::read_topics(rgw_pubsub_topics *result, RGWObjVersionTracker *objv_tracker) -{ - int ret = read(meta_obj, result, objv_tracker); - if (ret < 0) { - 
ldout(store->ctx(), 10) << "WARNING: failed to read topics info: ret=" << ret << dendl; - return ret; - } - return 0; -} - -int RGWPubSub::write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_topics& topics, - RGWObjVersionTracker *objv_tracker, optional_yield y) -{ - int ret = write(dpp, meta_obj, topics, objv_tracker, y); - if (ret < 0 && ret != -ENOENT) { - ldpp_dout(dpp, 1) << "ERROR: failed to write topics info: ret=" << ret << dendl; - return ret; - } - return 0; -} - -int RGWPubSub::get_topics(rgw_pubsub_topics *result) -{ - return read_topics(result, nullptr); -} - -int RGWPubSub::Bucket::read_topics(rgw_pubsub_bucket_topics *result, RGWObjVersionTracker *objv_tracker) -{ - int ret = ps->read(bucket_meta_obj, result, objv_tracker); - if (ret < 0 && ret != -ENOENT) { - ldout(ps->store->ctx(), 1) << "ERROR: failed to read bucket topics info: ret=" << ret << dendl; - return ret; - } - return 0; -} - -int RGWPubSub::Bucket::write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_bucket_topics& topics, - RGWObjVersionTracker *objv_tracker, - optional_yield y) -{ - int ret = ps->write(dpp, bucket_meta_obj, topics, objv_tracker, y); - if (ret < 0) { - ldout(ps->store->ctx(), 1) << "ERROR: failed to write bucket topics info: ret=" << ret << dendl; - return ret; - } - - return 0; -} - -int RGWPubSub::Bucket::get_topics(rgw_pubsub_bucket_topics *result) -{ - return read_topics(result, nullptr); -} - -int RGWPubSub::get_topic(const string& name, rgw_pubsub_topic_subs *result) -{ - rgw_pubsub_topics topics; - int ret = get_topics(&topics); - if (ret < 0) { - ldout(store->ctx(), 1) << "ERROR: failed to read topics info: ret=" << ret << dendl; - return ret; - } - - auto iter = topics.topics.find(name); - if (iter == topics.topics.end()) { - ldout(store->ctx(), 1) << "ERROR: topic not found" << dendl; - return -ENOENT; - } - - *result = iter->second; - return 0; -} - -int RGWPubSub::get_topic(const string& name, rgw_pubsub_topic *result) -{ - rgw_pubsub_topics topics; - int ret = get_topics(&topics); - if (ret < 0) { - ldout(store->ctx(), 1) << "ERROR: failed to read topics info: ret=" << ret << dendl; - return ret; - } - - auto iter = topics.topics.find(name); - if (iter == topics.topics.end()) { - ldout(store->ctx(), 1) << "ERROR: topic not found" << dendl; - return -ENOENT; - } - - *result = iter->second.topic; - return 0; -} - -int RGWPubSub::Bucket::create_notification(const DoutPrefixProvider *dpp, const string& topic_name, const rgw::notify::EventTypeList& events, optional_yield y) { - return create_notification(dpp, topic_name, events, std::nullopt, "", y); -} - -int RGWPubSub::Bucket::create_notification(const DoutPrefixProvider *dpp, const string& topic_name,const rgw::notify::EventTypeList& events, OptionalFilter s3_filter, const std::string& notif_name, optional_yield y) { - rgw_pubsub_topic_subs topic_info; - - int ret = ps->get_topic(topic_name, &topic_info); - if (ret < 0) { - ldpp_dout(dpp, 1) << "ERROR: failed to read topic '" << topic_name << "' info: ret=" << ret << dendl; - return ret; - } - ldpp_dout(dpp, 20) << "successfully read topic '" << topic_name << "' info" << dendl; - - RGWObjVersionTracker objv_tracker; - rgw_pubsub_bucket_topics bucket_topics; - - ret = read_topics(&bucket_topics, &objv_tracker); - if (ret < 0) { - ldpp_dout(dpp, 1) << "ERROR: failed to read topics from bucket '" << - bucket.name << "': ret=" << ret << dendl; - return ret; - } - ldpp_dout(dpp, 20) << "successfully read " << bucket_topics.topics.size() << " topics from bucket '" << - 
bucket.name << "'" << dendl; - - auto& topic_filter = bucket_topics.topics[topic_name]; - topic_filter.topic = topic_info.topic; - topic_filter.events = events; - topic_filter.s3_id = notif_name; - if (s3_filter) { - topic_filter.s3_filter = *s3_filter; - } - - ret = write_topics(dpp, bucket_topics, &objv_tracker, y); - if (ret < 0) { - ldpp_dout(dpp, 1) << "ERROR: failed to write topics to bucket '" << bucket.name << "': ret=" << ret << dendl; - return ret; - } - - ldpp_dout(dpp, 20) << "successfully wrote " << bucket_topics.topics.size() << " topics to bucket '" << bucket.name << "'" << dendl; - - return 0; -} - -int RGWPubSub::Bucket::remove_notification(const DoutPrefixProvider *dpp, const string& topic_name, optional_yield y) -{ - rgw_pubsub_topic_subs topic_info; - - int ret = ps->get_topic(topic_name, &topic_info); - if (ret < 0) { - ldpp_dout(dpp, 1) << "ERROR: failed to read topic info: ret=" << ret << dendl; - return ret; - } - - RGWObjVersionTracker objv_tracker; - rgw_pubsub_bucket_topics bucket_topics; - - ret = read_topics(&bucket_topics, &objv_tracker); - if (ret < 0) { - ldpp_dout(dpp, 1) << "ERROR: failed to read bucket topics info: ret=" << ret << dendl; - return ret; - } - - bucket_topics.topics.erase(topic_name); - - if (bucket_topics.topics.empty()) { - // no more topics - delete the notification object of the bucket - ret = ps->remove(dpp, bucket_meta_obj, &objv_tracker, y); - if (ret < 0 && ret != -ENOENT) { - ldpp_dout(dpp, 1) << "ERROR: failed to remove bucket topics: ret=" << ret << dendl; - return ret; - } - return 0; - } - - // write back the notifications without the deleted one - ret = write_topics(dpp, bucket_topics, &objv_tracker, y); - if (ret < 0) { - ldpp_dout(dpp, 1) << "ERROR: failed to write topics info: ret=" << ret << dendl; - return ret; - } - - return 0; -} - -int RGWPubSub::Bucket::remove_notifications(const DoutPrefixProvider *dpp, optional_yield y) -{ - // get all topics on a bucket - rgw_pubsub_bucket_topics bucket_topics; - auto ret = get_topics(&bucket_topics); - if (ret < 0 && ret != -ENOENT) { - ldpp_dout(dpp, 1) << "ERROR: failed to get list of topics from bucket '" << bucket.name << "', ret=" << ret << dendl; - return ret ; - } - - // remove all auto-genrated topics - for (const auto& topic : bucket_topics.topics) { - const auto& topic_name = topic.first; - ret = ps->remove_topic(dpp, topic_name, y); - if (ret < 0 && ret != -ENOENT) { - ldpp_dout(dpp, 5) << "WARNING: failed to remove auto-generated topic '" << topic_name << "', ret=" << ret << dendl; - } - } - - // delete the notification object of the bucket - ret = ps->remove(dpp, bucket_meta_obj, nullptr, y); - if (ret < 0 && ret != -ENOENT) { - ldpp_dout(dpp, 1) << "ERROR: failed to remove bucket topics: ret=" << ret << dendl; - return ret; - } - - return 0; -} - -int RGWPubSub::create_topic(const DoutPrefixProvider *dpp, const string& name, optional_yield y) { - return create_topic(dpp, name, rgw_pubsub_sub_dest(), "", "", y); -} - -int RGWPubSub::create_topic(const DoutPrefixProvider *dpp, const string& name, const rgw_pubsub_sub_dest& dest, const std::string& arn, const std::string& opaque_data, optional_yield y) { - RGWObjVersionTracker objv_tracker; - rgw_pubsub_topics topics; - - int ret = read_topics(&topics, &objv_tracker); - if (ret < 0 && ret != -ENOENT) { - // its not an error if not topics exist, we create one - ldpp_dout(dpp, 1) << "ERROR: failed to read topics info: ret=" << ret << dendl; - return ret; - } - - rgw_pubsub_topic_subs& new_topic = topics.topics[name]; - 
new_topic.topic.user = rgw_user("", tenant); - new_topic.topic.name = name; - new_topic.topic.dest = dest; - new_topic.topic.arn = arn; - new_topic.topic.opaque_data = opaque_data; - - ret = write_topics(dpp, topics, &objv_tracker, y); - if (ret < 0) { - ldpp_dout(dpp, 1) << "ERROR: failed to write topics info: ret=" << ret << dendl; - return ret; - } - - return 0; -} - -int RGWPubSub::remove_topic(const DoutPrefixProvider *dpp, const string& name, optional_yield y) -{ - RGWObjVersionTracker objv_tracker; - rgw_pubsub_topics topics; - - int ret = read_topics(&topics, &objv_tracker); - if (ret < 0 && ret != -ENOENT) { - ldpp_dout(dpp, 1) << "ERROR: failed to read topics info: ret=" << ret << dendl; - return ret; - } else if (ret == -ENOENT) { - // its not an error if no topics exist, just a no-op - ldpp_dout(dpp, 10) << "WARNING: failed to read topics info, deletion is a no-op: ret=" << ret << dendl; - return 0; - } - - topics.topics.erase(name); - - ret = write_topics(dpp, topics, &objv_tracker, y); - if (ret < 0) { - ldpp_dout(dpp, 1) << "ERROR: failed to remove topics info: ret=" << ret << dendl; - return ret; - } - - return 0; -} - -void RGWPubSub::get_meta_obj(rgw_raw_obj *obj) const { - *obj = rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, meta_oid()); -} - -void RGWPubSub::get_bucket_meta_obj(const rgw_bucket& bucket, rgw_raw_obj *obj) const { - *obj = rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, bucket_meta_oid(bucket)); -} - -void RGWPubSub::get_sub_meta_obj(const string& name, rgw_raw_obj *obj) const { - *obj = rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sub_meta_oid(name)); -} - diff --git a/src/rgw/rgw_pubsub.h b/src/rgw/rgw_pubsub.h deleted file mode 100644 index c0e9e144bb2f..000000000000 --- a/src/rgw/rgw_pubsub.h +++ /dev/null @@ -1,716 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab ft=cpp - -#ifndef CEPH_RGW_PUBSUB_H -#define CEPH_RGW_PUBSUB_H - -#include "services/svc_sys_obj.h" -#include "rgw_tools.h" -#include "rgw_zone.h" -#include "rgw_notify_event_type.h" -#include - -namespace rgw::sal { class RadosStore; } - -class XMLObj; - -struct rgw_s3_key_filter { - std::string prefix_rule; - std::string suffix_rule; - std::string regex_rule; - - bool has_content() const; - - bool decode_xml(XMLObj *obj); - void dump_xml(Formatter *f) const; - - void encode(bufferlist& bl) const { - ENCODE_START(1, 1, bl); - encode(prefix_rule, bl); - encode(suffix_rule, bl); - encode(regex_rule, bl); - ENCODE_FINISH(bl); - } - - void decode(bufferlist::const_iterator& bl) { - DECODE_START(1, bl); - decode(prefix_rule, bl); - decode(suffix_rule, bl); - decode(regex_rule, bl); - DECODE_FINISH(bl); - } -}; -WRITE_CLASS_ENCODER(rgw_s3_key_filter) - -using KeyValueMap = boost::container::flat_map; -using KeyMultiValueMap = std::multimap; - -struct rgw_s3_key_value_filter { - KeyValueMap kv; - - bool has_content() const; - - bool decode_xml(XMLObj *obj); - void dump_xml(Formatter *f) const; - - void encode(bufferlist& bl) const { - ENCODE_START(1, 1, bl); - encode(kv, bl); - ENCODE_FINISH(bl); - } - void decode(bufferlist::const_iterator& bl) { - DECODE_START(1, bl); - decode(kv, bl); - DECODE_FINISH(bl); - } -}; -WRITE_CLASS_ENCODER(rgw_s3_key_value_filter) - -struct rgw_s3_filter { - rgw_s3_key_filter key_filter; - rgw_s3_key_value_filter metadata_filter; - rgw_s3_key_value_filter tag_filter; - - bool has_content() const; - - bool decode_xml(XMLObj *obj); - void dump_xml(Formatter *f) const; - 
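
create_topic() and remove_topic() above share one read-modify-write shape: read the topics object together with its version into an RGWObjVersionTracker, mutate the map in memory, and write back under that same tracker so a concurrent writer fails the update instead of silently losing topics. A generic sketch of that optimistic pattern (standard C++ only; a mutex plus counter stands in for the version check RADOS performs server-side):

    #include <cstdint>
    #include <map>
    #include <mutex>
    #include <string>

    struct TopicStore {
      std::mutex m;                         // stands in for RADOS atomicity
      uint64_t version = 0;                 // stands in for the object version
      std::map<std::string, std::string> topics;
    };

    // returns false on conflict so the caller can re-read and retry
    bool add_topic(TopicStore& store, const std::string& name) {
      std::map<std::string, std::string> snapshot;
      uint64_t read_version;
      {
        std::lock_guard<std::mutex> l(store.m);   // "read_topics" + version
        snapshot = store.topics;
        read_version = store.version;
      }
      snapshot[name] = "";                        // modify in memory
      std::lock_guard<std::mutex> l(store.m);     // "write_topics"
      if (store.version != read_version) {
        return false;                             // a concurrent writer won
      }
      store.topics = std::move(snapshot);
      ++store.version;
      return true;
    }
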
- void encode(bufferlist& bl) const { - ENCODE_START(2, 1, bl); - encode(key_filter, bl); - encode(metadata_filter, bl); - encode(tag_filter, bl); - ENCODE_FINISH(bl); - } - - void decode(bufferlist::const_iterator& bl) { - DECODE_START(2, bl); - decode(key_filter, bl); - decode(metadata_filter, bl); - if (struct_v >= 2) { - decode(tag_filter, bl); - } - DECODE_FINISH(bl); - } -}; -WRITE_CLASS_ENCODER(rgw_s3_filter) - -using OptionalFilter = std::optional; - -struct rgw_pubsub_topic_filter; -/* S3 notification configuration - * based on: https://docs.aws.amazon.com/AmazonS3/latest/API/RESTBucketPUTnotification.html -<NotificationConfiguration xmlns="http://s3.amazonaws.com/doc/2006-03-01/"> - <TopicConfiguration> - <Filter> - <S3Key> - <FilterRule> - <Name>suffix</Name> - <Value>jpg</Value> - </FilterRule> - </S3Key> - <S3Metadata> - <FilterRule> - <Name></Name> - <Value></Value> - </FilterRule> - </S3Metadata> - <S3Tags> - <FilterRule> - <Name></Name> - <Value></Value> - </FilterRule> - </S3Tags> - </Filter> - <Id>notification1</Id> - <Topic>arn:aws:sns:<region>:<account>:<topic></Topic> - <Event>s3:ObjectCreated:*</Event> - <Event>s3:ObjectRemoved:*</Event> - </TopicConfiguration> -</NotificationConfiguration> -*/ -struct rgw_pubsub_s3_notification { - // notification id - std::string id; - // types of events - rgw::notify::EventTypeList events; - // topic ARN - std::string topic_arn; - // filter rules - rgw_s3_filter filter; - - bool decode_xml(XMLObj *obj); - void dump_xml(Formatter *f) const; - - rgw_pubsub_s3_notification() = default; - // construct from rgw_pubsub_topic_filter (used by get/list notifications) - explicit rgw_pubsub_s3_notification(const rgw_pubsub_topic_filter& topic_filter); -}; - -// return true if the key matches the prefix/suffix/regex rules of the key filter -bool match(const rgw_s3_key_filter& filter, const std::string& key); - -// return true if the key matches the metadata rules of the metadata filter -bool match(const rgw_s3_key_value_filter& filter, const KeyValueMap& kv); - -// return true if the key matches the tag rules of the tag filter -bool match(const rgw_s3_key_value_filter& filter, const KeyMultiValueMap& kv); - -// return true if the event type matches (equal or contained in) one of the events in the list -bool match(const rgw::notify::EventTypeList& events, rgw::notify::EventType event); - -struct rgw_pubsub_s3_notifications { - std::list list; - bool decode_xml(XMLObj *obj); - void dump_xml(Formatter *f) const; -}; - -/* S3 event records structure - * based on: https://docs.aws.amazon.com/AmazonS3/latest/dev/notification-content-structure.html -{ -"Records":[ - { - "eventVersion":"" - "eventSource":"", - "awsRegion":"", - "eventTime":"", - "eventName":"", - "userIdentity":{ - "principalId":"" - }, - "requestParameters":{ - "sourceIPAddress":"" - }, - "responseElements":{ - "x-amz-request-id":"", - "x-amz-id-2":"" - }, - "s3":{ - "s3SchemaVersion":"1.0", - "configurationId":"", - "bucket":{ - "name":"", - "ownerIdentity":{ - "principalId":"" - }, - "arn":"" - "id": "" - }, - "object":{ - "key":"", - "size": , - "eTag":"", - "versionId":"", - "sequencer": "", - "metadata": "" - "tags": "" - } - }, - "eventId":"", - } -] -}*/ - -struct rgw_pubsub_s3_event { - constexpr static const char* const json_type_plural = "Records"; - std::string eventVersion = "2.2"; - // aws:s3 - std::string eventSource = "ceph:s3"; - // zonegroup - std::string awsRegion; - // time of the request - ceph::real_time eventTime; - // type of the event - std::string eventName; - // user that sent the request - std::string userIdentity; - // IP address of source of the request (not implemented) - std::string sourceIPAddress; - // request ID (not implemented) - std::string x_amz_request_id; - // radosgw that received the request - std::string x_amz_id_2; - std::string s3SchemaVersion = "1.0"; - // ID received in the notification request - std::string configurationId; - // bucket name - std::string bucket_name; - // bucket owner - std::string
bucket_ownerIdentity; - // bucket ARN - std::string bucket_arn; - // object key - std::string object_key; - // object size - uint64_t object_size = 0; - // object etag - std::string object_etag; - // object version id bucket is versioned - std::string object_versionId; - // hexadecimal value used to determine event order for specific key - std::string object_sequencer; - // this is an rgw extension (not S3 standard) - // used to store a globally unique identifier of the event - // that could be used for acking or any other identification of the event - std::string id; - // this is an rgw extension holding the internal bucket id - std::string bucket_id; - // meta data - KeyValueMap x_meta_map; - // tags - KeyMultiValueMap tags; - // opaque data received from the topic - // could be used to identify the gateway - std::string opaque_data; - - void encode(bufferlist& bl) const { - ENCODE_START(4, 1, bl); - encode(eventVersion, bl); - encode(eventSource, bl); - encode(awsRegion, bl); - encode(eventTime, bl); - encode(eventName, bl); - encode(userIdentity, bl); - encode(sourceIPAddress, bl); - encode(x_amz_request_id, bl); - encode(x_amz_id_2, bl); - encode(s3SchemaVersion, bl); - encode(configurationId, bl); - encode(bucket_name, bl); - encode(bucket_ownerIdentity, bl); - encode(bucket_arn, bl); - encode(object_key, bl); - encode(object_size, bl); - encode(object_etag, bl); - encode(object_versionId, bl); - encode(object_sequencer, bl); - encode(id, bl); - encode(bucket_id, bl); - encode(x_meta_map, bl); - encode(tags, bl); - encode(opaque_data, bl); - ENCODE_FINISH(bl); - } - - void decode(bufferlist::const_iterator& bl) { - DECODE_START(4, bl); - decode(eventVersion, bl); - decode(eventSource, bl); - decode(awsRegion, bl); - decode(eventTime, bl); - decode(eventName, bl); - decode(userIdentity, bl); - decode(sourceIPAddress, bl); - decode(x_amz_request_id, bl); - decode(x_amz_id_2, bl); - decode(s3SchemaVersion, bl); - decode(configurationId, bl); - decode(bucket_name, bl); - decode(bucket_ownerIdentity, bl); - decode(bucket_arn, bl); - decode(object_key, bl); - decode(object_size, bl); - decode(object_etag, bl); - decode(object_versionId, bl); - decode(object_sequencer, bl); - decode(id, bl); - if (struct_v >= 2) { - decode(bucket_id, bl); - decode(x_meta_map, bl); - } - if (struct_v >= 3) { - decode(tags, bl); - } - if (struct_v >= 4) { - decode(opaque_data, bl); - } - DECODE_FINISH(bl); - } - - void dump(Formatter *f) const; -}; -WRITE_CLASS_ENCODER(rgw_pubsub_s3_event) - -// setting a unique ID for an event based on object hash and timestamp -void set_event_id(std::string& id, const std::string& hash, const utime_t& ts); - -struct rgw_pubsub_sub_dest { - std::string bucket_name; - std::string oid_prefix; - std::string push_endpoint; - std::string push_endpoint_args; - std::string arn_topic; - bool stored_secret = false; - bool persistent = false; - - void encode(bufferlist& bl) const { - ENCODE_START(5, 1, bl); - encode(bucket_name, bl); - encode(oid_prefix, bl); - encode(push_endpoint, bl); - encode(push_endpoint_args, bl); - encode(arn_topic, bl); - encode(stored_secret, bl); - encode(persistent, bl); - ENCODE_FINISH(bl); - } - - void decode(bufferlist::const_iterator& bl) { - DECODE_START(5, bl); - decode(bucket_name, bl); - decode(oid_prefix, bl); - decode(push_endpoint, bl); - if (struct_v >= 2) { - decode(push_endpoint_args, bl); - } - if (struct_v >= 3) { - decode(arn_topic, bl); - } - if (struct_v >= 4) { - decode(stored_secret, bl); - } - if (struct_v >= 5) { - decode(persistent, 
bl); - } - DECODE_FINISH(bl); - } - - void dump(Formatter *f) const; - void dump_xml(Formatter *f) const; - std::string to_json_str() const; -}; -WRITE_CLASS_ENCODER(rgw_pubsub_sub_dest) - -struct rgw_pubsub_sub_config { - rgw_user user; - std::string name; - std::string topic; - rgw_pubsub_sub_dest dest; - std::string s3_id; - - void encode(bufferlist& bl) const { - ENCODE_START(2, 1, bl); - encode(user, bl); - encode(name, bl); - encode(topic, bl); - encode(dest, bl); - encode(s3_id, bl); - ENCODE_FINISH(bl); - } - - void decode(bufferlist::const_iterator& bl) { - DECODE_START(2, bl); - decode(user, bl); - decode(name, bl); - decode(topic, bl); - decode(dest, bl); - if (struct_v >= 2) { - decode(s3_id, bl); - } - DECODE_FINISH(bl); - } - - void dump(Formatter *f) const; -}; -WRITE_CLASS_ENCODER(rgw_pubsub_sub_config) - -struct rgw_pubsub_topic { - rgw_user user; - std::string name; - rgw_pubsub_sub_dest dest; - std::string arn; - std::string opaque_data; - - void encode(bufferlist& bl) const { - ENCODE_START(3, 1, bl); - encode(user, bl); - encode(name, bl); - encode(dest, bl); - encode(arn, bl); - encode(opaque_data, bl); - ENCODE_FINISH(bl); - } - - void decode(bufferlist::const_iterator& bl) { - DECODE_START(3, bl); - decode(user, bl); - decode(name, bl); - if (struct_v >= 2) { - decode(dest, bl); - decode(arn, bl); - } - if (struct_v >= 3) { - decode(opaque_data, bl); - } - DECODE_FINISH(bl); - } - - std::string to_str() const { - return user.tenant + "/" + name; - } - - void dump(Formatter *f) const; - void dump_xml(Formatter *f) const; - void dump_xml_as_attributes(Formatter *f) const; - - bool operator<(const rgw_pubsub_topic& t) const { - return to_str().compare(t.to_str()); - } -}; -WRITE_CLASS_ENCODER(rgw_pubsub_topic) - -struct rgw_pubsub_topic_subs { - rgw_pubsub_topic topic; - std::set subs; - - void encode(bufferlist& bl) const { - ENCODE_START(1, 1, bl); - encode(topic, bl); - encode(subs, bl); - ENCODE_FINISH(bl); - } - - void decode(bufferlist::const_iterator& bl) { - DECODE_START(1, bl); - decode(topic, bl); - decode(subs, bl); - DECODE_FINISH(bl); - } - - void dump(Formatter *f) const; -}; -WRITE_CLASS_ENCODER(rgw_pubsub_topic_subs) - -struct rgw_pubsub_topic_filter { - rgw_pubsub_topic topic; - rgw::notify::EventTypeList events; - std::string s3_id; - rgw_s3_filter s3_filter; - - void encode(bufferlist& bl) const { - ENCODE_START(3, 1, bl); - encode(topic, bl); - // events are stored as a vector of std::strings - std::vector tmp_events; - std::transform(events.begin(), events.end(), std::back_inserter(tmp_events), rgw::notify::to_string); - encode(tmp_events, bl); - encode(s3_id, bl); - encode(s3_filter, bl); - ENCODE_FINISH(bl); - } - - void decode(bufferlist::const_iterator& bl) { - DECODE_START(3, bl); - decode(topic, bl); - // events are stored as a vector of std::strings - events.clear(); - std::vector tmp_events; - decode(tmp_events, bl); - std::transform(tmp_events.begin(), tmp_events.end(), std::back_inserter(events), rgw::notify::from_string); - if (struct_v >= 2) { - decode(s3_id, bl); - } - if (struct_v >= 3) { - decode(s3_filter, bl); - } - DECODE_FINISH(bl); - } - - void dump(Formatter *f) const; -}; -WRITE_CLASS_ENCODER(rgw_pubsub_topic_filter) - -struct rgw_pubsub_bucket_topics { - std::map topics; - - void encode(bufferlist& bl) const { - ENCODE_START(1, 1, bl); - encode(topics, bl); - ENCODE_FINISH(bl); - } - - void decode(bufferlist::const_iterator& bl) { - DECODE_START(1, bl); - decode(topics, bl); - DECODE_FINISH(bl); - } - - void dump(Formatter 
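
Every encoder in this header follows the same compatibility discipline: ENCODE_START(n, compat, bl) bumps n whenever a field is appended, and decode() gates each newer field on struct_v, so blobs written by older daemons still parse. The idea without the Ceph macros, as a toy sketch (hypothetical struct and byte stream):

    #include <cstdint>
    #include <vector>

    struct Stream {                       // toy stand-in for bufferlist
      std::vector<uint8_t> b;
      size_t off = 0;
    };
    inline void put8(Stream& s, uint8_t v) { s.b.push_back(v); }
    inline uint8_t get8(Stream& s) { return s.b[s.off++]; }

    struct Dest {
      uint8_t push_endpoint = 0;          // present since v1
      uint8_t persistent = 0;             // appended in v2

      void encode(Stream& s) const {
        put8(s, 2);                       // struct_v of the writer
        put8(s, push_endpoint);
        put8(s, persistent);              // new fields always go at the end
      }
      void decode(Stream& s) {
        const uint8_t struct_v = get8(s);
        push_endpoint = get8(s);
        if (struct_v >= 2) {              // a v1 blob simply stops earlier
          persistent = get8(s);
        }
      }
    };
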
*f) const; -}; -WRITE_CLASS_ENCODER(rgw_pubsub_bucket_topics) - -struct rgw_pubsub_topics { - std::map topics; - - void encode(bufferlist& bl) const { - ENCODE_START(1, 1, bl); - encode(topics, bl); - ENCODE_FINISH(bl); - } - - void decode(bufferlist::const_iterator& bl) { - DECODE_START(1, bl); - decode(topics, bl); - DECODE_FINISH(bl); - } - - void dump(Formatter *f) const; - void dump_xml(Formatter *f) const; -}; -WRITE_CLASS_ENCODER(rgw_pubsub_topics) - -static std::string pubsub_oid_prefix = "pubsub."; - -class RGWPubSub -{ - friend class Bucket; - - rgw::sal::RadosStore* store; - const std::string tenant; - RGWSI_SysObj* svc_sysobj; - - rgw_raw_obj meta_obj; - - std::string meta_oid() const { - return pubsub_oid_prefix + tenant; - } - - std::string bucket_meta_oid(const rgw_bucket& bucket) const { - return pubsub_oid_prefix + tenant + ".bucket." + bucket.name + "/" + bucket.marker; - } - - std::string sub_meta_oid(const std::string& name) const { - return pubsub_oid_prefix + tenant + ".sub." + name; - } - - template - int read(const rgw_raw_obj& obj, T* data, RGWObjVersionTracker* objv_tracker); - - template - int write(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, const T& info, - RGWObjVersionTracker* obj_tracker, optional_yield y); - - int remove(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, RGWObjVersionTracker* objv_tracker, - optional_yield y); - - int read_topics(rgw_pubsub_topics *result, RGWObjVersionTracker* objv_tracker); - int write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_topics& topics, - RGWObjVersionTracker* objv_tracker, optional_yield y); - -public: - RGWPubSub(rgw::sal::RadosStore* _store, const std::string& tenant); - - class Bucket { - friend class RGWPubSub; - RGWPubSub *ps; - rgw_bucket bucket; - rgw_raw_obj bucket_meta_obj; - - // read the list of topics associated with a bucket and populate into result - // use version tracker to enforce atomicity between read/write - // return 0 on success or if no topic was associated with the bucket, error code otherwise - int read_topics(rgw_pubsub_bucket_topics *result, RGWObjVersionTracker* objv_tracker); - // set the list of topics associated with a bucket - // use version tracker to enforce atomicity between read/write - // return 0 on success, error code otherwise - int write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_bucket_topics& topics, - RGWObjVersionTracker* objv_tracker, optional_yield y); - public: - Bucket(RGWPubSub *_ps, const rgw_bucket& _bucket) : ps(_ps), bucket(_bucket) { - ps->get_bucket_meta_obj(bucket, &bucket_meta_obj); - } - - // read the list of topics associated with a bucket and populate into result - // return 0 on success or if no topic was associated with the bucket, error code otherwise - int get_topics(rgw_pubsub_bucket_topics *result); - // adds a topic + filter (event list, and possibly name metadata or tags filters) to a bucket - // assigning a notification name is optional (needed for S3 compatible notifications) - // if the topic already exists on the bucket, the filter event list may be updated - // for S3 compliant notifications the version with: s3_filter and notif_name should be used - // return -ENOENT if the topic does not exist - // return 0 on success, error code otherwise - int create_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, const rgw::notify::EventTypeList& events, optional_yield y); - int create_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, const
rgw::notify::EventTypeList& events, OptionalFilter s3_filter, const std::string& notif_name, optional_yield y); - // remove a topic and filter from bucket - // if the topic does not exist on the bucket it is a no-op (considered success) - // return -ENOENT if the topic does not exist - // return 0 on success, error code otherwise - int remove_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, optional_yield y); - // remove all notifications (and autogenerated topics) associated with the bucket - // return 0 on success or if no topic was associated with the bucket, error code otherwise - int remove_notifications(const DoutPrefixProvider *dpp, optional_yield y); - }; - - using BucketRef = std::shared_ptr; - - BucketRef get_bucket(const rgw_bucket& bucket) { - return std::make_shared(this, bucket); - } - - void get_meta_obj(rgw_raw_obj *obj) const; - void get_bucket_meta_obj(const rgw_bucket& bucket, rgw_raw_obj *obj) const; - - void get_sub_meta_obj(const std::string& name, rgw_raw_obj *obj) const; - - // get all topics (per tenant, if used) and populate them into "result" - // return 0 on success or if no topics exist, error code otherwise - int get_topics(rgw_pubsub_topics *result); - // get a topic with its subscriptions by its name and populate it into "result" - // return -ENOENT if the topic does not exist - // return 0 on success, error code otherwise - int get_topic(const std::string& name, rgw_pubsub_topic_subs *result); - // get a topic by its name and populate it into "result" - // return -ENOENT if the topic does not exist - // return 0 on success, error code otherwise - int get_topic(const std::string& name, rgw_pubsub_topic *result); - // create a topic with a name only - // if the topic already exists it is a no-op (considered success) - // return 0 on success, error code otherwise - int create_topic(const DoutPrefixProvider *dpp, const std::string& name, optional_yield y); - // create a topic with push destination information and ARN - // if the topic already exists the destination and ARN values may be updated (considered success) - // return 0 on success, error code otherwise - int create_topic(const DoutPrefixProvider *dpp, const std::string& name, const rgw_pubsub_sub_dest& dest, const std::string& arn, const std::string& opaque_data, optional_yield y); - // remove a topic according to its name - // if the topic does not exist it is a no-op (considered success) - // return 0 on success, error code otherwise - int remove_topic(const DoutPrefixProvider *dpp, const std::string& name, optional_yield y); -}; - - -template -int RGWPubSub::read(const rgw_raw_obj& obj, T* result, RGWObjVersionTracker* objv_tracker) -{ - bufferlist bl; - int ret = rgw_get_system_obj(svc_sysobj, - obj.pool, obj.oid, - bl, - objv_tracker, - nullptr, null_yield, nullptr, nullptr); - if (ret < 0) { - return ret; - } - - auto iter = bl.cbegin(); - try { - decode(*result, iter); - } catch (buffer::error& err) { - return -EIO; - } - - return 0; -} - -template -int RGWPubSub::write(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, const T& info, - RGWObjVersionTracker* objv_tracker, optional_yield y) -{ - bufferlist bl; - encode(info, bl); - - return rgw_put_system_obj(dpp, svc_sysobj, obj.pool, obj.oid, - bl, false, objv_tracker, real_time(), y); -} - -#endif diff --git a/src/rgw/rgw_pubsub_push.cc b/src/rgw/rgw_pubsub_push.cc deleted file mode 100644 index 2f734c21df83..000000000000 --- a/src/rgw/rgw_pubsub_push.cc +++ /dev/null @@ -1,463 +0,0 @@ -// -*-
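
RGWPubSub::read() above centralizes the "fetch blob, decode, map a parse failure to -EIO" step for all of these types. The same shape reduced to standard C++ (hypothetical decode_error and T::decode(), since bufferlist and buffer::error live in Ceph):

    #include <cerrno>
    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    struct decode_error : std::runtime_error {   // stand-in for buffer::error
      using std::runtime_error::runtime_error;
    };

    template <typename T>
    int decode_or_eio(const std::vector<uint8_t>& blob, T* out) {
      try {
        out->decode(blob);    // may throw on a truncated or corrupt blob
      } catch (const decode_error&) {
        return -EIO;          // corrupt stored object, not a caller error
      }
      return 0;
    }
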
mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab ft=cpp - -#include "rgw_pubsub_push.h" -#include -#include -#include -#include "include/buffer_fwd.h" -#include "common/Formatter.h" -#include "common/iso_8601.h" -#include "common/async/completion.h" -#include "rgw_common.h" -#include "rgw_data_sync.h" -#include "rgw_pubsub.h" -#include "acconfig.h" -#ifdef WITH_RADOSGW_AMQP_ENDPOINT -#include "rgw_amqp.h" -#endif -#ifdef WITH_RADOSGW_KAFKA_ENDPOINT -#include "rgw_kafka.h" -#endif -#include -#include -#include -#include "rgw_perf_counters.h" - -using namespace rgw; - -template -std::string json_format_pubsub_event(const EventType& event) { - std::stringstream ss; - JSONFormatter f(false); - { - Formatter::ObjectSection s(f, EventType::json_type_plural); - { - Formatter::ArraySection s(f, EventType::json_type_plural); - encode_json("", event, &f); - } - } - f.flush(ss); - return ss.str(); -} - -bool get_bool(const RGWHTTPArgs& args, const std::string& name, bool default_value) { - bool value; - bool exists; - if (args.get_bool(name.c_str(), &value, &exists) == -EINVAL) { - throw RGWPubSubEndpoint::configuration_error("invalid boolean value for " + name); - } - if (!exists) { - return default_value; - } - return value; -} - -class RGWPubSubHTTPEndpoint : public RGWPubSubEndpoint { -private: - const std::string endpoint; - typedef unsigned ack_level_t; - ack_level_t ack_level; // TODO: not used for now - const bool verify_ssl; - const bool cloudevents; - static const ack_level_t ACK_LEVEL_ANY = 0; - static const ack_level_t ACK_LEVEL_NON_ERROR = 1; - -public: - RGWPubSubHTTPEndpoint(const std::string& _endpoint, const RGWHTTPArgs& args) : - endpoint(_endpoint), verify_ssl(get_bool(args, "verify-ssl", true)), cloudevents(get_bool(args, "cloudevents", false)) - { - bool exists; - const auto& str_ack_level = args.get("http-ack-level", &exists); - if (!exists || str_ack_level == "any") { - // "any" is default - ack_level = ACK_LEVEL_ANY; - } else if (str_ack_level == "non-error") { - ack_level = ACK_LEVEL_NON_ERROR; - } else { - ack_level = std::atoi(str_ack_level.c_str()); - if (ack_level < 100 || ack_level >= 600) { - throw configuration_error("HTTP/S: invalid http-ack-level: " + str_ack_level); - } - } - } - - int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override { - bufferlist read_bl; - RGWPostHTTPData request(cct, "POST", endpoint, &read_bl, verify_ssl); - const auto post_data = json_format_pubsub_event(event); - if (cloudevents) { - // following: https://github.com/cloudevents/spec/blob/v1.0.1/http-protocol-binding.md - // using "Binary Content Mode" - request.append_header("ce-specversion", "1.0"); - request.append_header("ce-type", "com.amazonaws." + event.eventName); - request.append_header("ce-time", to_iso_8601(event.eventTime)); - // default output of iso8601 is also RFC3339 compatible - request.append_header("ce-id", event.x_amz_request_id + "." + event.x_amz_id_2); - request.append_header("ce-source", event.eventSource + "." + event.awsRegion + "." 
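
The cloudevents branch just above maps the S3 event onto the CloudEvents HTTP "binary content mode": required attributes travel as ce-* headers while the JSON event stays in the body. The header mapping on its own (hypothetical Header type; attribute values mirror the code above):

    #include <string>
    #include <utility>
    #include <vector>

    using Header = std::pair<std::string, std::string>;

    // fields come from the event; names here are illustrative only
    std::vector<Header> cloudevents_headers(const std::string& event_name,
                                            const std::string& iso8601_time,
                                            const std::string& unique_id,
                                            const std::string& source) {
      return {
        {"ce-specversion", "1.0"},
        {"ce-type", "com.amazonaws." + event_name},
        {"ce-time", iso8601_time},   // ISO 8601 output is RFC 3339 compatible
        {"ce-id", unique_id},        // must be unique per event
        {"ce-source", source},       // eventSource.awsRegion.bucket_name above
      };
    }
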
+ event.bucket_name); - request.append_header("ce-subject", event.object_key); - } - request.set_post_data(post_data); - request.set_send_length(post_data.length()); - request.append_header("Content-Type", "application/json"); - if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_pending); - const auto rc = RGWHTTP::process(&request, y); - if (perfcounter) perfcounter->dec(l_rgw_pubsub_push_pending); - // TODO: use read_bl to process return code and handle according to ack level - return rc; - } - - std::string to_str() const override { - std::string str("HTTP/S Endpoint"); - str += "\nURI: " + endpoint; - str += (verify_ssl ? "\nverify SSL" : "\ndon't verify SSL"); - return str; - } -}; - -#ifdef WITH_RADOSGW_AMQP_ENDPOINT -class RGWPubSubAMQPEndpoint : public RGWPubSubEndpoint { -private: - enum class ack_level_t { - None, - Broker, - Routable - }; - CephContext* const cct; - const std::string endpoint; - const std::string topic; - const std::string exchange; - ack_level_t ack_level; - amqp::connection_ptr_t conn; - - bool get_verify_ssl(const RGWHTTPArgs& args) { - bool exists; - auto str_verify_ssl = args.get("verify-ssl", &exists); - if (!exists) { - // verify server certificate by default - return true; - } - boost::algorithm::to_lower(str_verify_ssl); - if (str_verify_ssl == "true") { - return true; - } - if (str_verify_ssl == "false") { - return false; - } - throw configuration_error("'verify-ssl' must be true/false, not: " + str_verify_ssl); - } - - std::string get_exchange(const RGWHTTPArgs& args) { - bool exists; - const auto exchange = args.get("amqp-exchange", &exists); - if (!exists) { - throw configuration_error("AMQP: missing amqp-exchange"); - } - return exchange; - } - - ack_level_t get_ack_level(const RGWHTTPArgs& args) { - bool exists; - const auto& str_ack_level = args.get("amqp-ack-level", &exists); - if (!exists || str_ack_level == "broker") { - // "broker" is default - return ack_level_t::Broker; - } - if (str_ack_level == "none") { - return ack_level_t::None; - } - if (str_ack_level == "routable") { - return ack_level_t::Routable; - } - throw configuration_error("AMQP: invalid amqp-ack-level: " + str_ack_level); - } - -public: - RGWPubSubAMQPEndpoint(const std::string& _endpoint, - const std::string& _topic, - const RGWHTTPArgs& args, - CephContext* _cct) : - cct(_cct), - endpoint(_endpoint), - topic(_topic), - exchange(get_exchange(args)), - ack_level(get_ack_level(args)), - conn(amqp::connect(endpoint, exchange, (ack_level == ack_level_t::Broker), get_verify_ssl(args), args.get_optional("ca-location"))) { - if (!conn) { - throw configuration_error("AMQP: failed to create connection to: " + endpoint); - } - } - - // this allows waiting until "finish()" is called from a different thread - // waiting could be blocking the waiting thread or yielding, depending - // on compilation flag support and whether the optional_yield is set - class Waiter { - using Signature = void(boost::system::error_code); - using Completion = ceph::async::Completion; - std::unique_ptr completion = nullptr; - int ret; - - mutable std::atomic done = false; - mutable std::mutex lock; - mutable std::condition_variable cond; - - template - auto async_wait(ExecutionContext& ctx, CompletionToken&& token) { - boost::asio::async_completion init(token); - auto& handler = init.completion_handler; - { - std::unique_lock l{lock}; - completion = Completion::create(ctx.get_executor(), std::move(handler)); - } - return init.result.get(); - } - - public: - int wait(optional_yield y) { - if (done) { - 
return ret; - } - if (y) { - auto& io_ctx = y.get_io_context(); - auto& yield_ctx = y.get_yield_context(); - boost::system::error_code ec; - async_wait(io_ctx, yield_ctx[ec]); - return -ec.value(); - } - std::unique_lock l(lock); - cond.wait(l, [this]{return (done==true);}); - return ret; - } - - void finish(int r) { - std::unique_lock l{lock}; - ret = r; - done = true; - if (completion) { - boost::system::error_code ec(-ret, boost::system::system_category()); - Completion::post(std::move(completion), ec); - } else { - cond.notify_all(); - } - } - }; - - int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override { - ceph_assert(conn); - if (ack_level == ack_level_t::None) { - return amqp::publish(conn, topic, json_format_pubsub_event(event)); - } else { - // TODO: currently broker and routable are the same - this will require different flags but the same mechanism - // note: dynamic allocation of Waiter is needed when this is invoked from a beast coroutine - auto w = std::unique_ptr(new Waiter); - const auto rc = amqp::publish_with_confirm(conn, - topic, - json_format_pubsub_event(event), - std::bind(&Waiter::finish, w.get(), std::placeholders::_1)); - if (rc < 0) { - // failed to publish, does not wait for reply - return rc; - } - return w->wait(y); - } - } - - std::string to_str() const override { - std::string str("AMQP(0.9.1) Endpoint"); - str += "\nURI: " + endpoint; - str += "\nTopic: " + topic; - str += "\nExchange: " + exchange; - return str; - } -}; - -static const std::string AMQP_0_9_1("0-9-1"); -static const std::string AMQP_1_0("1-0"); -static const std::string AMQP_SCHEMA("amqp"); -#endif // ifdef WITH_RADOSGW_AMQP_ENDPOINT - -#ifdef WITH_RADOSGW_KAFKA_ENDPOINT -class RGWPubSubKafkaEndpoint : public RGWPubSubEndpoint { -private: - enum class ack_level_t { - None, - Broker, - }; - CephContext* const cct; - const std::string topic; - kafka::connection_ptr_t conn; - const ack_level_t ack_level; - - - ack_level_t get_ack_level(const RGWHTTPArgs& args) { - bool exists; - const auto& str_ack_level = args.get("kafka-ack-level", &exists); - if (!exists || str_ack_level == "broker") { - // "broker" is default - return ack_level_t::Broker; - } - if (str_ack_level == "none") { - return ack_level_t::None; - } - throw configuration_error("Kafka: invalid kafka-ack-level: " + str_ack_level); - } - -public: - RGWPubSubKafkaEndpoint(const std::string& _endpoint, - const std::string& _topic, - const RGWHTTPArgs& args, - CephContext* _cct) : - cct(_cct), - topic(_topic), - conn(kafka::connect(_endpoint, get_bool(args, "use-ssl", false), get_bool(args, "verify-ssl", true), args.get_optional("ca-location"))) , - ack_level(get_ack_level(args)) { - if (!conn) { - throw configuration_error("Kafka: failed to create connection to: " + _endpoint); - } - } - - // this allows waiting until "finish()" is called from a different thread - // waiting could be blocking the waiting thread or yielding, depending - // on compilation flag support and whether the optional_yield is set - class Waiter { - using Signature = void(boost::system::error_code); - using Completion = ceph::async::Completion; - std::unique_ptr completion = nullptr; - int ret; - - mutable std::atomic done = false; - mutable std::mutex lock; - mutable std::condition_variable cond; - - template - auto async_wait(ExecutionContext& ctx, CompletionToken&& token) { - boost::asio::async_completion init(token); - auto& handler = init.completion_handler; - { - std::unique_lock l{lock}; - completion =
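
Both Waiter classes in this file expose one synchronous-looking wait() with two paths behind it: park the coroutine through an asio completion when an optional_yield is present, otherwise block on a condition variable. The blocking half as a standalone sketch:

    #include <condition_variable>
    #include <mutex>

    class BlockingWaiter {
      std::mutex lock;
      std::condition_variable cond;
      bool done = false;
      int ret = 0;

     public:
      int wait() {                      // called by the publishing thread
        std::unique_lock<std::mutex> l(lock);
        cond.wait(l, [this] { return done; });
        return ret;
      }
      void finish(int r) {              // called from the broker ack callback
        {
          std::lock_guard<std::mutex> l(lock);
          ret = r;
          done = true;
        }
        cond.notify_all();
      }
    };
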
Completion::create(ctx.get_executor(), std::move(handler)); - } - return init.result.get(); - } - - public: - int wait(optional_yield y) { - if (done) { - return ret; - } - if (y) { - auto& io_ctx = y.get_io_context(); - auto& yield_ctx = y.get_yield_context(); - boost::system::error_code ec; - async_wait(io_ctx, yield_ctx[ec]); - return -ec.value(); - } - std::unique_lock l(lock); - cond.wait(l, [this]{return (done==true);}); - return ret; - } - - void finish(int r) { - std::unique_lock l{lock}; - ret = r; - done = true; - if (completion) { - boost::system::error_code ec(-ret, boost::system::system_category()); - Completion::post(std::move(completion), ec); - } else { - cond.notify_all(); - } - } - }; - - int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override { - ceph_assert(conn); - if (ack_level == ack_level_t::None) { - return kafka::publish(conn, topic, json_format_pubsub_event(event)); - } else { - // note: dynamic allocation of Waiter is needed when this is invoked from a beast coroutine - auto w = std::unique_ptr(new Waiter); - const auto rc = kafka::publish_with_confirm(conn, - topic, - json_format_pubsub_event(event), - std::bind(&Waiter::finish, w.get(), std::placeholders::_1)); - if (rc < 0) { - // failed to publish, does not wait for reply - return rc; - } - return w->wait(y); - } - } - - std::string to_str() const override { - std::string str("Kafka Endpoint"); - str += kafka::to_string(conn); - str += "\nTopic: " + topic; - return str; - } -}; - -static const std::string KAFKA_SCHEMA("kafka"); -#endif // ifdef WITH_RADOSGW_KAFKA_ENDPOINT - -static const std::string WEBHOOK_SCHEMA("webhook"); -static const std::string UNKNOWN_SCHEMA("unknown"); -static const std::string NO_SCHEMA(""); - -const std::string& get_schema(const std::string& endpoint) { - if (endpoint.empty()) { - return NO_SCHEMA; - } - const auto pos = endpoint.find(':'); - if (pos == std::string::npos) { - return UNKNOWN_SCHEMA; - } - const auto& schema = endpoint.substr(0,pos); - if (schema == "http" || schema == "https") { - return WEBHOOK_SCHEMA; -#ifdef WITH_RADOSGW_AMQP_ENDPOINT - } else if (schema == "amqp" || schema == "amqps") { - return AMQP_SCHEMA; -#endif -#ifdef WITH_RADOSGW_KAFKA_ENDPOINT - } else if (schema == "kafka") { - return KAFKA_SCHEMA; -#endif - } - return UNKNOWN_SCHEMA; -} - -RGWPubSubEndpoint::Ptr RGWPubSubEndpoint::create(const std::string& endpoint, - const std::string& topic, - const RGWHTTPArgs& args, - CephContext* cct) { - const auto& schema = get_schema(endpoint); - if (schema == WEBHOOK_SCHEMA) { - return Ptr(new RGWPubSubHTTPEndpoint(endpoint, args)); -#ifdef WITH_RADOSGW_AMQP_ENDPOINT - } else if (schema == AMQP_SCHEMA) { - bool exists; - std::string version = args.get("amqp-version", &exists); - if (!exists) { - version = AMQP_0_9_1; - } - if (version == AMQP_0_9_1) { - return Ptr(new RGWPubSubAMQPEndpoint(endpoint, topic, args, cct)); - } else if (version == AMQP_1_0) { - throw configuration_error("AMQP: v1.0 not supported"); - return nullptr; - } else { - throw configuration_error("AMQP: unknown version: " + version); - return nullptr; - } -#endif -#ifdef WITH_RADOSGW_KAFKA_ENDPOINT - } else if (schema == KAFKA_SCHEMA) { - return Ptr(new RGWPubSubKafkaEndpoint(endpoint, topic, args, cct)); -#endif - } - - throw configuration_error("unknown schema in: " + endpoint); - return nullptr; -} - diff --git a/src/rgw/rgw_pubsub_push.h b/src/rgw/rgw_pubsub_push.h deleted file mode 100644 index 17905937c035..000000000000 --- 
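
RGWPubSubEndpoint::create() above dispatches purely on the URI scheme, with the AMQP and Kafka branches compiled in conditionally. The scheme extraction itself is just "substring up to the first colon"; a tiny sketch of the same rule:

    #include <cassert>
    #include <string>

    // mirrors get_schema() above: empty -> "", no ':' -> unknown
    std::string scheme_of(const std::string& endpoint) {
      if (endpoint.empty()) return "";
      const auto pos = endpoint.find(':');
      if (pos == std::string::npos) return "unknown";
      return endpoint.substr(0, pos);
    }

    int main() {
      assert(scheme_of("kafka://broker:9092") == "kafka");
      assert(scheme_of("https://host/path") == "https");
      assert(scheme_of("not-a-uri") == "unknown");
    }
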
a/src/rgw/rgw_pubsub_push.h +++ /dev/null @@ -1,47 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab ft=cpp -#pragma once - -#include -#include -#include -#include "include/buffer_fwd.h" -#include "include/common_fwd.h" -#include "common/async/yield_context.h" - -// TODO the env should be used as a template parameter to differentiate the source that triggers the pushes -class RGWDataSyncEnv; -class RGWHTTPArgs; -struct rgw_pubsub_s3_event; - -// endpoint base class all endpoint - types should derive from it -class RGWPubSubEndpoint { -public: - RGWPubSubEndpoint() = default; - // endpoint should not be copied - RGWPubSubEndpoint(const RGWPubSubEndpoint&) = delete; - const RGWPubSubEndpoint& operator=(const RGWPubSubEndpoint&) = delete; - - typedef std::unique_ptr Ptr; - - // factory method for the actual notification endpoint - // derived class specific arguments are passed in http args format - // may throw a configuration_error if creation fails - static Ptr create(const std::string& endpoint, const std::string& topic, const RGWHTTPArgs& args, CephContext *cct=nullptr); - - // this method is used in order to send notification (S3 compliant) and wait for completion - // in async manner via a coroutine when invoked in the frontend environment - virtual int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) = 0; - - // present as string - virtual std::string to_str() const { return ""; } - - virtual ~RGWPubSubEndpoint() = default; - - // exception object for configuration error - struct configuration_error : public std::logic_error { - configuration_error(const std::string& what_arg) : - std::logic_error("pubsub endpoint configuration error: " + what_arg) {} - }; -}; - diff --git a/src/rgw/rgw_putobj_processor.cc b/src/rgw/rgw_putobj_processor.cc deleted file mode 100644 index 8a6a157018ef..000000000000 --- a/src/rgw/rgw_putobj_processor.cc +++ /dev/null @@ -1,704 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab ft=cpp - -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2018 Red Hat, Inc. - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "rgw_aio.h" -#include "rgw_putobj_processor.h" -#include "rgw_multi.h" -#include "rgw_compression.h" -#include "services/svc_sys_obj.h" -#include "services/svc_zone.h" -#include "rgw_sal_rados.h" - -#define dout_subsys ceph_subsys_rgw - -using namespace std; - -namespace rgw::putobj { - -int HeadObjectProcessor::process(bufferlist&& data, uint64_t logical_offset) -{ - const bool flush = (data.length() == 0); - - // capture the first chunk for special handling - if (data_offset < head_chunk_size || data_offset == 0) { - if (flush) { - // flush partial chunk - return process_first_chunk(std::move(head_data), &processor); - } - - auto remaining = head_chunk_size - data_offset; - auto count = std::min(data.length(), remaining); - data.splice(0, count, &head_data); - data_offset += count; - - if (data_offset == head_chunk_size) { - // process the first complete chunk - ceph_assert(head_data.length() == head_chunk_size); - int r = process_first_chunk(std::move(head_data), &processor); - if (r < 0) { - return r; - } - } - if (data.length() == 0) { // avoid flushing stripe processor - return 0; - } - } - ceph_assert(processor); // process_first_chunk() must initialize - - // send everything else through the processor - auto write_offset = data_offset; - data_offset += data.length(); - return processor->process(std::move(data), write_offset); -} - - -static int process_completed(const AioResultList& completed, RawObjSet *written) -{ - std::optional error; - for (auto& r : completed) { - if (r.result >= 0) { - written->insert(r.obj.get_ref().obj); - } else if (!error) { // record first error code - error = r.result; - } - } - return error.value_or(0); -} - -void RadosWriter::add_write_hint(librados::ObjectWriteOperation& op) { - const rgw_obj obj = head_obj->get_obj(); - const RGWObjStateManifest *sm = obj_ctx.get_state(obj); - const bool compressed = sm->state.compressed; - uint32_t alloc_hint_flags = 0; - if (compressed) { - alloc_hint_flags |= librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE; - } - - op.set_alloc_hint2(0, 0, alloc_hint_flags); -} - -int RadosWriter::set_stripe_obj(const rgw_raw_obj& raw_obj) -{ - stripe_obj = store->svc()->rados->obj(raw_obj); - return stripe_obj.open(dpp); -} - -int RadosWriter::process(bufferlist&& bl, uint64_t offset) -{ - bufferlist data = std::move(bl); - const uint64_t cost = data.length(); - if (cost == 0) { // no empty writes, use aio directly for creates - return 0; - } - librados::ObjectWriteOperation op; - add_write_hint(op); - if (offset == 0) { - op.write_full(data); - } else { - op.write(offset, data); - } - constexpr uint64_t id = 0; // unused - auto c = aio->get(stripe_obj, Aio::librados_op(std::move(op), y), cost, id); - return process_completed(c, &written); -} - -int RadosWriter::write_exclusive(const bufferlist& data) -{ - const uint64_t cost = data.length(); - - librados::ObjectWriteOperation op; - op.create(true); // exclusive create - add_write_hint(op); - op.write_full(data); - - constexpr uint64_t id = 0; // unused - auto c = aio->get(stripe_obj, Aio::librados_op(std::move(op), y), cost, id); - auto d = aio->drain(); - c.splice(c.end(), d); - return process_completed(c, &written); -} - -int RadosWriter::drain() -{ - return process_completed(aio->drain(), &written); -} - -RadosWriter::~RadosWriter() -{ - // wait on any outstanding aio completions - process_completed(aio->drain(), &written); - - bool need_to_remove_head = false; - std::optional raw_head; - if (!rgw::sal::Object::empty(head_obj.get())) { - raw_head.emplace(); - 
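
The RadosWriter destructor continuing below encodes an ordering rule worth calling out: on cleanup it deletes every raw stripe object directly, but defers the object marked as head, because the head is the gatekeeper of the multipart upload and must be removed through the bucket-index two-phase delete (see the comment that follows). The skeleton of that split, with hypothetical delete helpers:

    #include <optional>
    #include <set>
    #include <string>

    // stand-ins for store->delete_raw_obj() and head_obj->delete_object()
    void delete_raw(const std::string&) { /* placeholder */ }
    void delete_head_via_index(const std::string&) { /* placeholder */ }

    void cleanup(const std::set<std::string>& written,
                 const std::optional<std::string>& head) {
      bool need_head_delete = false;
      for (const auto& obj : written) {
        if (head && obj == *head) {
          need_head_delete = true;   // defer: head goes via the bucket index
          continue;
        }
        delete_raw(obj);             // stripe objects can be removed directly
      }
      if (need_head_delete) {
        delete_head_via_index(*head);   // prepare/complete 2-phase delete
      }
    }
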
-RadosWriter::~RadosWriter()
-{
-  // wait on any outstanding aio completions
-  process_completed(aio->drain(), &written);
-
-  bool need_to_remove_head = false;
-  std::optional<rgw_raw_obj> raw_head;
-  if (!rgw::sal::Object::empty(head_obj.get())) {
-    raw_head.emplace();
-    rgw::sal::RadosObject* obj = dynamic_cast<rgw::sal::RadosObject*>(head_obj.get());
-    obj->get_raw_obj(&*raw_head);
-  }
-
-  /**
-   * We should delete the object in the "multipart" namespace to avoid a race condition.
-   * Such a race condition arises because the multipart object is the gatekeeper of a multipart
-   * upload: when it is deleted, a second upload would start with the same suffix("2/"), so objects
-   * written by the second upload may be deleted by the first upload.
-   * Details are described in #11749.
-   *
-   * The above comment still stands, but instead of searching for a specific object in the multipart
-   * namespace, we just make sure that we remove the object that is marked as the head object after
-   * we remove all the other raw objects. Note that we use a different call to remove the head object,
-   * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme.
-   */
-  for (const auto& obj : written) {
-    if (raw_head && obj == *raw_head) {
-      ldpp_dout(dpp, 5) << "NOTE: we should not process the head object (" << obj << ") here" << dendl;
-      need_to_remove_head = true;
-      continue;
-    }
-
-    int r = store->delete_raw_obj(dpp, obj);
-    if (r < 0 && r != -ENOENT) {
-      ldpp_dout(dpp, 0) << "WARNING: failed to remove obj (" << obj << "), leaked" << dendl;
-    }
-  }
-
-  if (need_to_remove_head) {
-    std::string version_id;
-    ldpp_dout(dpp, 5) << "NOTE: we are going to process the head obj (" << *raw_head << ")" << dendl;
-    int r = head_obj->delete_object(dpp, null_yield);
-    if (r < 0 && r != -ENOENT) {
-      ldpp_dout(dpp, 0) << "WARNING: failed to remove obj (" << *raw_head << "), leaked" << dendl;
-    }
-  }
-}
-
-
-// advance to the next stripe
-int ManifestObjectProcessor::next(uint64_t offset, uint64_t *pstripe_size)
-{
-  // advance the manifest
-  int r = manifest_gen.create_next(offset);
-  if (r < 0) {
-    return r;
-  }
-
-  rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
-
-  uint64_t chunk_size = 0;
-  r = store->get_raw_chunk_size(dpp, stripe_obj, &chunk_size);
-  if (r < 0) {
-    return r;
-  }
-  r = writer.set_stripe_obj(stripe_obj);
-  if (r < 0) {
-    return r;
-  }
-
-  chunk = ChunkProcessor(&writer, chunk_size);
-  *pstripe_size = manifest_gen.cur_stripe_max_size();
-  return 0;
-}
-
-
-
-int AtomicObjectProcessor::process_first_chunk(bufferlist&& data,
-                                               DataProcessor **processor)
-{
-  first_chunk = std::move(data);
-  *processor = &stripe;
-  return 0;
-}
-
-int AtomicObjectProcessor::prepare(optional_yield y)
-{
-  uint64_t max_head_chunk_size;
-  uint64_t head_max_size;
-  uint64_t chunk_size = 0;
-  uint64_t alignment;
-
-  int r = dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->get_max_chunk_size(
-      dpp, head_obj->get_bucket()->get_placement_rule(),
-      &max_head_chunk_size, &alignment);
-  if (r < 0) {
-    return r;
-  }
-
-  bool same_pool = true;
-  if (head_obj->get_bucket()->get_placement_rule() != tail_placement_rule) {
-    if (!head_obj->placement_rules_match(head_obj->get_bucket()->get_placement_rule(), tail_placement_rule)) {
-      same_pool = false;
-      r = dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->get_max_chunk_size(dpp, tail_placement_rule, &chunk_size);
-      if (r < 0) {
-        return r;
-      }
-      head_max_size = 0;
-    }
-  }
-
-  if (same_pool) {
-    RGWZonePlacementInfo placement_info;
-    if (!store->svc()->zone->get_zone_params().get_placement(head_obj->get_bucket()->get_placement_rule().name, &placement_info) || placement_info.inline_data) {
-      head_max_size = max_head_chunk_size;
-    } else {
-      head_max_size = 0;
-    }
-    chunk_size = max_head_chunk_size;
-  }
-
-  uint64_t stripe_size;
-  const uint64_t default_stripe_size =
store->ctx()->_conf->rgw_obj_stripe_size; - - dynamic_cast(head_obj.get())->get_max_aligned_size( - default_stripe_size, alignment, &stripe_size); - - manifest.set_trivial_rule(head_max_size, stripe_size); - - rgw_obj obj = head_obj->get_obj(); - - r = manifest_gen.create_begin(store->ctx(), &manifest, - head_obj->get_bucket()->get_placement_rule(), - &tail_placement_rule, - obj.bucket, obj); - if (r < 0) { - return r; - } - - rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store); - - r = writer.set_stripe_obj(stripe_obj); - if (r < 0) { - return r; - } - - set_head_chunk_size(head_max_size); - // initialize the processors - chunk = ChunkProcessor(&writer, chunk_size); - stripe = StripeProcessor(&chunk, this, head_max_size); - return 0; -} - -int AtomicObjectProcessor::complete(size_t accounted_size, - const std::string& etag, - ceph::real_time *mtime, - ceph::real_time set_mtime, - rgw::sal::Attrs& attrs, - ceph::real_time delete_at, - const char *if_match, - const char *if_nomatch, - const std::string *user_data, - rgw_zone_set *zones_trace, - bool *pcanceled, optional_yield y) -{ - int r = writer.drain(); - if (r < 0) { - return r; - } - const uint64_t actual_size = get_actual_size(); - r = manifest_gen.create_next(actual_size); - if (r < 0) { - return r; - } - - head_obj->set_atomic(); - - RGWRados::Object op_target(store->getRados(), - head_obj->get_bucket(), - obj_ctx, head_obj.get()); - RGWRados::Object::Write obj_op(&op_target); - - /* some object types shouldn't be versioned, e.g., multipart parts */ - op_target.set_versioning_disabled(!head_obj->get_bucket()->versioning_enabled()); - obj_op.meta.data = &first_chunk; - obj_op.meta.manifest = &manifest; - obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */ - obj_op.meta.if_match = if_match; - obj_op.meta.if_nomatch = if_nomatch; - obj_op.meta.mtime = mtime; - obj_op.meta.set_mtime = set_mtime; - obj_op.meta.owner = owner; - obj_op.meta.flags = PUT_OBJ_CREATE; - obj_op.meta.olh_epoch = olh_epoch; - obj_op.meta.delete_at = delete_at; - obj_op.meta.user_data = user_data; - obj_op.meta.zones_trace = zones_trace; - obj_op.meta.modify_tail = true; - - r = obj_op.write_meta(dpp, actual_size, accounted_size, attrs, y); - if (r < 0) { - if (r == -ETIMEDOUT) { - // The head object write may eventually succeed, clear the set of objects for deletion. if it - // doesn't ever succeed, we'll orphan any tail objects as if we'd crashed before that write - writer.clear_written(); - } - return r; - } - if (!obj_op.meta.canceled) { - // on success, clear the set of objects for deletion - writer.clear_written(); - } - if (pcanceled) { - *pcanceled = obj_op.meta.canceled; - } - return 0; -} - - -int MultipartObjectProcessor::process_first_chunk(bufferlist&& data, - DataProcessor **processor) -{ - // write the first chunk of the head object as part of an exclusive create, - // then drain to wait for the result in case of EEXIST - int r = writer.write_exclusive(data); - if (r == -EEXIST) { - // randomize the oid prefix and reprepare the head/manifest - std::string oid_rand = gen_rand_alphanumeric(store->ctx(), 32); - - mp.init(target_obj->get_name(), upload_id, oid_rand); - manifest.set_prefix(target_obj->get_name() + "." 
+ oid_rand); - - r = prepare_head(); - if (r < 0) { - return r; - } - // resubmit the write op on the new head object - r = writer.write_exclusive(data); - } - if (r < 0) { - return r; - } - *processor = &stripe; - return 0; -} - -int MultipartObjectProcessor::prepare_head() -{ - const uint64_t default_stripe_size = store->ctx()->_conf->rgw_obj_stripe_size; - uint64_t chunk_size; - uint64_t stripe_size; - uint64_t alignment; - - int r = dynamic_cast(target_obj.get())->get_max_chunk_size(dpp, - tail_placement_rule, &chunk_size, &alignment); - if (r < 0) { - ldpp_dout(dpp, 0) << "ERROR: unexpected: get_max_chunk_size(): placement_rule=" << tail_placement_rule.to_str() << " obj=" << target_obj << " returned r=" << r << dendl; - return r; - } - dynamic_cast(target_obj.get())->get_max_aligned_size( - default_stripe_size, alignment, &stripe_size); - - manifest.set_multipart_part_rule(stripe_size, part_num); - - r = manifest_gen.create_begin(store->ctx(), &manifest, - head_obj->get_bucket()->get_placement_rule(), - &tail_placement_rule, - target_obj->get_bucket()->get_key(), - target_obj->get_obj()); - if (r < 0) { - return r; - } - - rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store); - dynamic_cast(head_obj.get())->raw_obj_to_obj(stripe_obj); - head_obj->set_hash_source(target_obj->get_name()); - - r = writer.set_stripe_obj(stripe_obj); - if (r < 0) { - return r; - } - stripe_size = manifest_gen.cur_stripe_max_size(); - set_head_chunk_size(stripe_size); - - chunk = ChunkProcessor(&writer, chunk_size); - stripe = StripeProcessor(&chunk, this, stripe_size); - return 0; -} - -int MultipartObjectProcessor::prepare(optional_yield y) -{ - manifest.set_prefix(target_obj->get_name() + "." + upload_id); - - return prepare_head(); -} - -int MultipartObjectProcessor::complete(size_t accounted_size, - const std::string& etag, - ceph::real_time *mtime, - ceph::real_time set_mtime, - std::map& attrs, - ceph::real_time delete_at, - const char *if_match, - const char *if_nomatch, - const std::string *user_data, - rgw_zone_set *zones_trace, - bool *pcanceled, optional_yield y) -{ - int r = writer.drain(); - if (r < 0) { - return r; - } - const uint64_t actual_size = get_actual_size(); - r = manifest_gen.create_next(actual_size); - if (r < 0) { - return r; - } - - RGWRados::Object op_target(store->getRados(), - head_obj->get_bucket(), - obj_ctx, head_obj.get()); - RGWRados::Object::Write obj_op(&op_target); - - op_target.set_versioning_disabled(true); - op_target.set_meta_placement_rule(&tail_placement_rule); - obj_op.meta.set_mtime = set_mtime; - obj_op.meta.mtime = mtime; - obj_op.meta.owner = owner; - obj_op.meta.delete_at = delete_at; - obj_op.meta.zones_trace = zones_trace; - obj_op.meta.modify_tail = true; - - r = obj_op.write_meta(dpp, actual_size, accounted_size, attrs, y); - if (r < 0) - return r; - - bufferlist bl; - RGWUploadPartInfo info; - string p = "part."; - bool sorted_omap = is_v2_upload_id(upload_id); - - if (sorted_omap) { - char buf[32]; - snprintf(buf, sizeof(buf), "%08d", part_num); - p.append(buf); - } else { - p.append(part_num_str); - } - info.num = part_num; - info.etag = etag; - info.size = actual_size; - info.accounted_size = accounted_size; - info.modified = real_clock::now(); - info.manifest = manifest; - - bool compressed; - r = rgw_compression_info_from_attrset(attrs, compressed, info.cs_info); - if (r < 0) { - ldpp_dout(dpp, 1) << "cannot get compression info" << dendl; - return r; - } - - encode(info, bl); - - std::unique_ptr meta_obj = - 
head_obj->get_bucket()->get_object(rgw_obj_key(mp.get_meta(), std::string(), RGW_OBJ_NS_MULTIPART)); - meta_obj->set_in_extra_data(true); - - r = meta_obj->omap_set_val_by_key(dpp, p, bl, true, null_yield); - if (r < 0) { - return r == -ENOENT ? -ERR_NO_SUCH_UPLOAD : r; - } - - if (!obj_op.meta.canceled) { - // on success, clear the set of objects for deletion - writer.clear_written(); - } - if (pcanceled) { - *pcanceled = obj_op.meta.canceled; - } - return 0; -} - -int AppendObjectProcessor::process_first_chunk(bufferlist &&data, rgw::sal::DataProcessor **processor) -{ - int r = writer.write_exclusive(data); - if (r < 0) { - return r; - } - *processor = &stripe; - return 0; -} - -int AppendObjectProcessor::prepare(optional_yield y) -{ - RGWObjState *astate; - int r = head_obj->get_obj_state(dpp, &astate, y); - if (r < 0) { - return r; - } - cur_size = astate->size; - *cur_accounted_size = astate->accounted_size; - if (!astate->exists) { - if (position != 0) { - ldpp_dout(dpp, 5) << "ERROR: Append position should be zero" << dendl; - return -ERR_POSITION_NOT_EQUAL_TO_LENGTH; - } else { - cur_part_num = 1; - //set the prefix - char buf[33]; - gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1); - string oid_prefix = head_obj->get_name(); - oid_prefix.append("."); - oid_prefix.append(buf); - oid_prefix.append("_"); - manifest.set_prefix(oid_prefix); - } - } else { - // check whether the object appendable - map::iterator iter = astate->attrset.find(RGW_ATTR_APPEND_PART_NUM); - if (iter == astate->attrset.end()) { - ldpp_dout(dpp, 5) << "ERROR: The object is not appendable" << dendl; - return -ERR_OBJECT_NOT_APPENDABLE; - } - if (position != *cur_accounted_size) { - ldpp_dout(dpp, 5) << "ERROR: Append position should be equal to the obj size" << dendl; - return -ERR_POSITION_NOT_EQUAL_TO_LENGTH; - } - try { - using ceph::decode; - decode(cur_part_num, iter->second); - } catch (buffer::error& err) { - ldpp_dout(dpp, 5) << "ERROR: failed to decode part num" << dendl; - return -EIO; - } - cur_part_num++; - //get the current obj etag - iter = astate->attrset.find(RGW_ATTR_ETAG); - if (iter != astate->attrset.end()) { - string s = rgw_string_unquote(iter->second.c_str()); - size_t pos = s.find("-"); - cur_etag = s.substr(0, pos); - } - - iter = astate->attrset.find(RGW_ATTR_STORAGE_CLASS); - if (iter != astate->attrset.end()) { - tail_placement_rule.storage_class = iter->second.to_str(); - } else { - tail_placement_rule.storage_class = RGW_STORAGE_CLASS_STANDARD; - } - cur_manifest = dynamic_cast(head_obj.get())->get_manifest(); - manifest.set_prefix(cur_manifest->get_prefix()); - astate->keep_tail = true; - } - manifest.set_multipart_part_rule(store->ctx()->_conf->rgw_obj_stripe_size, cur_part_num); - - rgw_obj obj = head_obj->get_obj(); - - r = manifest_gen.create_begin(store->ctx(), &manifest, head_obj->get_bucket()->get_placement_rule(), &tail_placement_rule, obj.bucket, obj); - if (r < 0) { - return r; - } - rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store); - - uint64_t chunk_size = 0; - r = store->get_raw_chunk_size(dpp, stripe_obj, &chunk_size); - if (r < 0) { - return r; - } - r = writer.set_stripe_obj(std::move(stripe_obj)); - if (r < 0) { - return r; - } - - uint64_t stripe_size = manifest_gen.cur_stripe_max_size(); - - uint64_t max_head_size = std::min(chunk_size, stripe_size); - set_head_chunk_size(max_head_size); - - // initialize the processors - chunk = ChunkProcessor(&writer, chunk_size); - stripe = StripeProcessor(&chunk, this, stripe_size); - - return 0; -} - -int 
AppendObjectProcessor::complete(size_t accounted_size, const string &etag, ceph::real_time *mtime, - ceph::real_time set_mtime, rgw::sal::Attrs& attrs, - ceph::real_time delete_at, const char *if_match, const char *if_nomatch, - const string *user_data, rgw_zone_set *zones_trace, bool *pcanceled, - optional_yield y) -{ - int r = writer.drain(); - if (r < 0) - return r; - const uint64_t actual_size = get_actual_size(); - r = manifest_gen.create_next(actual_size); - if (r < 0) { - return r; - } - head_obj->set_atomic(); - RGWRados::Object op_target(store->getRados(), - head_obj->get_bucket(), - obj_ctx, head_obj.get()); - RGWRados::Object::Write obj_op(&op_target); - //For Append obj, disable versioning - op_target.set_versioning_disabled(true); - if (cur_manifest) { - cur_manifest->append(dpp, manifest, store->svc()->zone->get_zonegroup(), store->svc()->zone->get_zone_params()); - obj_op.meta.manifest = cur_manifest; - } else { - obj_op.meta.manifest = &manifest; - } - obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */ - obj_op.meta.mtime = mtime; - obj_op.meta.set_mtime = set_mtime; - obj_op.meta.owner = owner; - obj_op.meta.flags = PUT_OBJ_CREATE; - obj_op.meta.delete_at = delete_at; - obj_op.meta.user_data = user_data; - obj_op.meta.zones_trace = zones_trace; - obj_op.meta.modify_tail = true; - obj_op.meta.appendable = true; - //Add the append part number - bufferlist cur_part_num_bl; - using ceph::encode; - encode(cur_part_num, cur_part_num_bl); - attrs[RGW_ATTR_APPEND_PART_NUM] = cur_part_num_bl; - //calculate the etag - if (!cur_etag.empty()) { - MD5 hash; - // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes - hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); - char petag[CEPH_CRYPTO_MD5_DIGESTSIZE]; - char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE]; - char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16]; - hex_to_buf(cur_etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE); - hash.Update((const unsigned char *)petag, sizeof(petag)); - hex_to_buf(etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE); - hash.Update((const unsigned char *)petag, sizeof(petag)); - hash.Final((unsigned char *)final_etag); - buf_to_hex((unsigned char *)final_etag, sizeof(final_etag), final_etag_str); - snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2], sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2, - "-%lld", (long long)cur_part_num); - bufferlist etag_bl; - etag_bl.append(final_etag_str, strlen(final_etag_str) + 1); - attrs[RGW_ATTR_ETAG] = etag_bl; - } - r = obj_op.write_meta(dpp, actual_size + cur_size, - accounted_size + *cur_accounted_size, - attrs, y); - if (r < 0) { - return r; - } - if (!obj_op.meta.canceled) { - // on success, clear the set of objects for deletion - writer.clear_written(); - } - if (pcanceled) { - *pcanceled = obj_op.meta.canceled; - } - *cur_accounted_size += accounted_size; - - return 0; -} - -} // namespace rgw::putobj diff --git a/src/rgw/rgw_putobj_processor.h b/src/rgw/rgw_putobj_processor.h deleted file mode 100644 index 1beb9a724c05..000000000000 --- a/src/rgw/rgw_putobj_processor.h +++ /dev/null @@ -1,281 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab ft=cpp - -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2018 Red Hat, Inc. - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
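
The ETag arithmetic in AppendObjectProcessor::complete() above follows the multipart convention: the stored ETag is md5(binary(prev_etag) || binary(part_etag)) with a "-<part count>" suffix. A standalone sketch of that scheme using OpenSSL's EVP interface; the helper names here are hypothetical, and the code above uses ceph's MD5 wrapper with hex_to_buf/buf_to_hex instead:

    #include <openssl/evp.h>
    #include <cstdio>
    #include <string>
    #include <vector>

    // decode an even-length hex string into raw bytes
    static std::vector<unsigned char> unhex(const std::string& s) {
      std::vector<unsigned char> out(s.size() / 2);
      for (size_t i = 0; i < out.size(); ++i)
        std::sscanf(s.c_str() + 2 * i, "%2hhx", &out[i]);
      return out;
    }

    std::string append_etag(const std::string& prev_etag,  // hex, "-N" stripped
                            const std::string& part_etag,  // hex md5 of new part
                            long long cur_part_num) {
      auto a = unhex(prev_etag), b = unhex(part_etag);
      unsigned char md[EVP_MAX_MD_SIZE];
      unsigned int mdlen = 0;
      EVP_MD_CTX* ctx = EVP_MD_CTX_new();
      EVP_DigestInit_ex(ctx, EVP_md5(), nullptr);
      EVP_DigestUpdate(ctx, a.data(), a.size());   // binary of combined etag
      EVP_DigestUpdate(ctx, b.data(), b.size());   // binary of the new part
      EVP_DigestFinal_ex(ctx, md, &mdlen);
      EVP_MD_CTX_free(ctx);
      char hex[2 * EVP_MAX_MD_SIZE + 1];
      for (unsigned i = 0; i < mdlen; ++i) std::sprintf(hex + 2 * i, "%02x", md[i]);
      hex[2 * mdlen] = '\0';
      return std::string(hex) + "-" + std::to_string(cur_part_num);
    }

So appending a part whose md5 is p to an object whose ETag is e-2 yields md5(unhex(e) || unhex(p)) followed by "-3", which is why prepare() above strips everything after the "-" when it loads cur_etag.
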
- * - */ - -#pragma once - -#include - -#include "rgw_putobj.h" -#include "services/svc_rados.h" -#include "services/svc_tier_rados.h" -#include "rgw_sal.h" -#include "rgw_obj_manifest.h" - -namespace rgw { - -namespace sal { - class RadosStore; -} - -class Aio; - -namespace putobj { - -// an object processor with special handling for the first chunk of the head. -// the virtual process_first_chunk() function returns a processor to handle the -// rest of the object -class HeadObjectProcessor : public rgw::sal::ObjectProcessor { - uint64_t head_chunk_size; - // buffer to capture the first chunk of the head object - bufferlist head_data; - // initialized after process_first_chunk() to process everything else - rgw::sal::DataProcessor *processor = nullptr; - uint64_t data_offset = 0; // maximum offset of data written (ie compressed) - protected: - uint64_t get_actual_size() const { return data_offset; } - - // process the first chunk of data and return a processor for the rest - virtual int process_first_chunk(bufferlist&& data, - rgw::sal::DataProcessor **processor) = 0; - public: - HeadObjectProcessor(uint64_t head_chunk_size) - : head_chunk_size(head_chunk_size) - {} - - void set_head_chunk_size(uint64_t size) { head_chunk_size = size; } - - // cache first chunk for process_first_chunk(), then forward everything else - // to the returned processor - int process(bufferlist&& data, uint64_t logical_offset) final override; -}; - -using RawObjSet = std::set; - -// a data sink that writes to rados objects and deletes them on cancelation -class RadosWriter : public rgw::sal::DataProcessor { - Aio *const aio; - rgw::sal::RadosStore *const store; - RGWObjectCtx& obj_ctx; - std::unique_ptr head_obj; - RGWSI_RADOS::Obj stripe_obj; // current stripe object - RawObjSet written; // set of written objects for deletion - const DoutPrefixProvider *dpp; - optional_yield y; - - public: - RadosWriter(Aio *aio, rgw::sal::RadosStore *store, - RGWObjectCtx& obj_ctx, std::unique_ptr _head_obj, - const DoutPrefixProvider *dpp, optional_yield y) - : aio(aio), store(store), - obj_ctx(obj_ctx), head_obj(std::move(_head_obj)), dpp(dpp), y(y) - {} - RadosWriter(RadosWriter&& r) - : aio(r.aio), store(r.store), - obj_ctx(r.obj_ctx), head_obj(std::move(r.head_obj)), dpp(r.dpp), y(r.y) - {} - - ~RadosWriter(); - - // add alloc hint to osd - void add_write_hint(librados::ObjectWriteOperation& op); - - // change the current stripe object - int set_stripe_obj(const rgw_raw_obj& obj); - - // write the data at the given offset of the current stripe object - int process(bufferlist&& data, uint64_t stripe_offset) override; - - // write the data as an exclusive create and wait for it to complete - int write_exclusive(const bufferlist& data); - - int drain(); - - // when the operation completes successfully, clear the set of written objects - // so they aren't deleted on destruction - void clear_written() { written.clear(); } - -}; - - -// a rados object processor that stripes according to RGWObjManifest -class ManifestObjectProcessor : public HeadObjectProcessor, - public StripeGenerator { - protected: - rgw::sal::RadosStore* const store; - rgw_placement_rule tail_placement_rule; - rgw_user owner; - RGWObjectCtx& obj_ctx; - std::unique_ptr head_obj; - - RadosWriter writer; - RGWObjManifest manifest; - RGWObjManifest::generator manifest_gen; - ChunkProcessor chunk; - StripeProcessor stripe; - const DoutPrefixProvider *dpp; - - // implements StripeGenerator - int next(uint64_t offset, uint64_t *stripe_size) override; - - public: - 
ManifestObjectProcessor(Aio *aio, rgw::sal::RadosStore* store, - const rgw_placement_rule *ptail_placement_rule, - const rgw_user& owner, RGWObjectCtx& _obj_ctx, - std::unique_ptr _head_obj, - const DoutPrefixProvider* dpp, optional_yield y) - : HeadObjectProcessor(0), - store(store), - owner(owner), - obj_ctx(_obj_ctx), head_obj(std::move(_head_obj)), - writer(aio, store, obj_ctx, head_obj->clone(), dpp, y), - chunk(&writer, 0), stripe(&chunk, this, 0), dpp(dpp) { - if (ptail_placement_rule) { - tail_placement_rule = *ptail_placement_rule; - } - } - - void set_owner(const rgw_user& _owner) { - owner = _owner; - } - - void set_tail_placement(const rgw_placement_rule& tpr) { - tail_placement_rule = tpr; - } - void set_tail_placement(const rgw_placement_rule&& tpr) { - tail_placement_rule = tpr; - } - -}; - - -// a processor that completes with an atomic write to the head object as part of -// a bucket index transaction -class AtomicObjectProcessor : public ManifestObjectProcessor { - const std::optional olh_epoch; - const std::string unique_tag; - bufferlist first_chunk; // written with the head in complete() - - int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override; - public: - AtomicObjectProcessor(Aio *aio, rgw::sal::RadosStore* store, - const rgw_placement_rule *ptail_placement_rule, - const rgw_user& owner, - RGWObjectCtx& obj_ctx, - std::unique_ptr _head_obj, - std::optional olh_epoch, - const std::string& unique_tag, - const DoutPrefixProvider *dpp, optional_yield y) - : ManifestObjectProcessor(aio, store, ptail_placement_rule, - owner, obj_ctx, std::move(_head_obj), dpp, y), - olh_epoch(olh_epoch), unique_tag(unique_tag) - {} - - // prepare a trivial manifest - int prepare(optional_yield y) override; - // write the head object atomically in a bucket index transaction - int complete(size_t accounted_size, const std::string& etag, - ceph::real_time *mtime, ceph::real_time set_mtime, - std::map& attrs, - ceph::real_time delete_at, - const char *if_match, const char *if_nomatch, - const std::string *user_data, - rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) override; - -}; - - -// a processor for multipart parts, which don't require atomic completion. the -// part's head is written with an exclusive create to detect racing uploads of -// the same part/upload id, which are restarted with a random oid prefix -class MultipartObjectProcessor : public ManifestObjectProcessor { - std::unique_ptr target_obj; // target multipart object - const std::string upload_id; - const int part_num; - const std::string part_num_str; - RGWMPObj mp; - - // write the first chunk and wait on aio->drain() for its completion. 
- // on EEXIST, retry with random prefix - int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override; - // prepare the head stripe and manifest - int prepare_head(); - public: - MultipartObjectProcessor(Aio *aio, rgw::sal::RadosStore* store, - const rgw_placement_rule *ptail_placement_rule, - const rgw_user& owner, RGWObjectCtx& obj_ctx, - std::unique_ptr _head_obj, - const std::string& upload_id, uint64_t part_num, - const std::string& part_num_str, - const DoutPrefixProvider *dpp, optional_yield y) - : ManifestObjectProcessor(aio, store, ptail_placement_rule, - owner, obj_ctx, std::move(_head_obj), dpp, y), - target_obj(head_obj->clone()), upload_id(upload_id), - part_num(part_num), part_num_str(part_num_str), - mp(head_obj->get_name(), upload_id) - {} - - // prepare a multipart manifest - int prepare(optional_yield y) override; - // write the head object attributes in a bucket index transaction, then - // register the completed part with the multipart meta object - int complete(size_t accounted_size, const std::string& etag, - ceph::real_time *mtime, ceph::real_time set_mtime, - std::map& attrs, - ceph::real_time delete_at, - const char *if_match, const char *if_nomatch, - const std::string *user_data, - rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) override; - -}; - - class AppendObjectProcessor : public ManifestObjectProcessor { - uint64_t cur_part_num; - uint64_t position; - uint64_t cur_size; - uint64_t *cur_accounted_size; - std::string cur_etag; - const std::string unique_tag; - - RGWObjManifest *cur_manifest; - - int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override; - - public: - AppendObjectProcessor(Aio *aio, rgw::sal::RadosStore* store, - const rgw_placement_rule *ptail_placement_rule, - const rgw_user& owner, RGWObjectCtx& obj_ctx, - std::unique_ptr _head_obj, - const std::string& unique_tag, uint64_t position, - uint64_t *cur_accounted_size, - const DoutPrefixProvider *dpp, optional_yield y) - : ManifestObjectProcessor(aio, store, ptail_placement_rule, - owner, obj_ctx, std::move(_head_obj), dpp, y), - position(position), cur_size(0), cur_accounted_size(cur_accounted_size), - unique_tag(unique_tag), cur_manifest(nullptr) - {} - int prepare(optional_yield y) override; - int complete(size_t accounted_size, const std::string& etag, - ceph::real_time *mtime, ceph::real_time set_mtime, - std::map& attrs, ceph::real_time delete_at, - const char *if_match, const char *if_nomatch, const std::string *user_data, - rgw_zone_set *zones_trace, bool *canceled, - optional_yield y) override; - }; - -} // namespace putobj -} // namespace rgw - diff --git a/src/rgw/rgw_quota.h b/src/rgw/rgw_quota.h index b97e83a69b13..632cb48171b2 100644 --- a/src/rgw/rgw_quota.h +++ b/src/rgw/rgw_quota.h @@ -13,8 +13,7 @@ * */ -#ifndef CEPH_RGW_QUOTA_H -#define CEPH_RGW_QUOTA_H +#pragma once #include "include/utime.h" #include "common/config_fwd.h" @@ -48,5 +47,3 @@ public: // apply default quotas from configuration void rgw_apply_default_bucket_quota(RGWQuotaInfo& quota, const ConfigProxy& conf); void rgw_apply_default_user_quota(RGWQuotaInfo& quota, const ConfigProxy& conf); - -#endif diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc deleted file mode 100644 index 6779e519c466..000000000000 --- a/src/rgw/rgw_rados.cc +++ /dev/null @@ -1,9715 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab ft=cpp - -#include "include/compat.h" -#include -#include -#include 
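
The rgw_quota.h hunk just above is the shape of the include-guard cleanup this commit applies throughout: a named guard collapses to a single line. Sketched on a hypothetical header:

    // before: classic include guard, macro must stay unique across the tree
    #ifndef CEPH_RGW_EXAMPLE_H
    #define CEPH_RGW_EXAMPLE_H
    void rgw_example();
    #endif // CEPH_RGW_EXAMPLE_H

    // after: one line, no macro, no trailing #endif to keep in sync
    #pragma once
    void rgw_example();

The two forms behave the same for a header reachable from a single path; #pragma once keys on file identity rather than a macro name, so it cannot collide with another header that happened to pick the same guard macro.
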
-#include - -#include -#include - -#include -#include -#include -#include - -#include "common/ceph_json.h" - -#include "common/errno.h" -#include "common/Formatter.h" -#include "common/Throttle.h" -#include "common/BackTrace.h" - -#include "rgw_sal.h" -#include "rgw_zone.h" -#include "rgw_cache.h" -#include "rgw_acl.h" -#include "rgw_acl_s3.h" /* for dumping s3policy in debug log */ -#include "rgw_aio_throttle.h" -#include "driver/rados/rgw_bucket.h" -#include "rgw_rest_conn.h" -#include "rgw_cr_rados.h" -#include "rgw_cr_rest.h" -#include "rgw_datalog.h" -#include "rgw_putobj_processor.h" - -#include "cls/rgw/cls_rgw_ops.h" -#include "cls/rgw/cls_rgw_client.h" -#include "cls/rgw/cls_rgw_const.h" -#include "cls/refcount/cls_refcount_client.h" -#include "cls/version/cls_version_client.h" -#include "osd/osd_types.h" - -#include "rgw_tools.h" -#include "rgw_coroutine.h" -#include "rgw_compression.h" -#include "rgw_etag_verifier.h" -#include "rgw_worker.h" -#include "rgw_notify.h" -#include "rgw_http_errors.h" - -#undef fork // fails to compile RGWPeriod::fork() below - -#include "common/Clock.h" - -#include -#include -#include -#include -#include -#include -#include "include/random.h" - -#include "rgw_gc.h" -#include "rgw_lc.h" - -#include "rgw_object_expirer_core.h" -#include "rgw_sync.h" -#include "rgw_sync_counters.h" -#include "rgw_sync_trace.h" -#include "rgw_trim_datalog.h" -#include "rgw_trim_mdlog.h" -#include "rgw_data_sync.h" -#include "rgw_realm_watcher.h" -#include "rgw_reshard.h" -#include "rgw_cr_rados.h" - -#include "services/svc_zone.h" -#include "services/svc_zone_utils.h" -#include "services/svc_quota.h" -#include "services/svc_sync_modules.h" -#include "services/svc_sys_obj.h" -#include "services/svc_sys_obj_cache.h" -#include "services/svc_bucket.h" -#include "services/svc_mdlog.h" - -#include "compressor/Compressor.h" - -#include "rgw_d3n_datacache.h" - -#ifdef WITH_LTTNG -#define TRACEPOINT_DEFINE -#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE -#include "tracing/rgw_rados.h" -#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE -#undef TRACEPOINT_DEFINE -#else -#define tracepoint(...) -#endif - -#define dout_context g_ceph_context -#define dout_subsys ceph_subsys_rgw - -using namespace std; -using namespace librados; - -#define ldout_bitx(_bitx, _dpp, _level) if(_bitx) { ldpp_dout(_dpp, 0) << "BITX: " -#define ldout_bitx_c(_bitx, _ctx, _level) if(_bitx) { ldout(_ctx, 0) << "BITX: " -#define dendl_bitx dendl ; } - -static string shadow_ns = "shadow"; -static string default_bucket_index_pool_suffix = "rgw.buckets.index"; -static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec"; - -static RGWObjCategory main_category = RGWObjCategory::Main; -#define RGW_USAGE_OBJ_PREFIX "usage." 
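
The WITH_LTTNG block above uses a trick worth calling out: when tracing is compiled out, a single variadic macro erases every tracepoint() call site, so callers never carry #ifdefs of their own. A standalone sketch, where WITH_MYTRACE and the printf backend are stand-ins for the real LTTng machinery:

    #include <cstdio>

    #ifdef WITH_MYTRACE
      #define tracepoint(provider, event, ...) \
        std::printf(#provider ":" #event "\n")   // stand-in for a real backend
    #else
      #define tracepoint(...)                    // expands to nothing
    #endif

    int example_op(int v) {
      tracepoint(rgw_rados, example_event, v);   // compiles away when disabled
      return v;
    }
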
- -rgw_raw_obj rgw_obj_select::get_raw_obj(rgw::sal::RadosStore* driver) const -{ - if (!is_raw) { - rgw_raw_obj r; - driver->get_raw_obj(placement_rule, obj, &r); - return r; - } - return raw_obj; -} - -void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation* op) -{ - obj_version* check_objv = version_for_check(); - - if (check_objv) { - cls_version_check(*op, *check_objv, VER_COND_EQ); - } - - cls_version_read(*op, &read_version); -} - -void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op) -{ - obj_version* check_objv = version_for_check(); - obj_version* modify_version = version_for_write(); - - if (check_objv) { - cls_version_check(*op, *check_objv, VER_COND_EQ); - } - - if (modify_version) { - cls_version_set(*op, *modify_version); - } else { - cls_version_inc(*op); - } -} - -void RGWObjVersionTracker::apply_write() -{ - const bool checked = (read_version.ver != 0); - const bool incremented = (write_version.ver == 0); - - if (checked && incremented) { - // apply cls_version_inc() so our next operation can recheck it - ++read_version.ver; - } else { - read_version = write_version; - } - write_version = obj_version(); -} - -RGWObjStateManifest *RGWObjectCtx::get_state(const rgw_obj& obj) { - RGWObjStateManifest *result; - typename std::map::iterator iter; - lock.lock_shared(); - assert (!obj.empty()); - iter = objs_state.find(obj); - if (iter != objs_state.end()) { - result = &iter->second; - lock.unlock_shared(); - } else { - lock.unlock_shared(); - lock.lock(); - result = &objs_state[obj]; - lock.unlock(); - } - return result; -} - -void RGWObjectCtx::set_compressed(const rgw_obj& obj) { - std::unique_lock wl{lock}; - assert (!obj.empty()); - objs_state[obj].state.compressed = true; -} - -void RGWObjectCtx::set_atomic(rgw_obj& obj) { - std::unique_lock wl{lock}; - assert (!obj.empty()); - objs_state[obj].state.is_atomic = true; -} -void RGWObjectCtx::set_prefetch_data(const rgw_obj& obj) { - std::unique_lock wl{lock}; - assert (!obj.empty()); - objs_state[obj].state.prefetch_data = true; -} - -void RGWObjectCtx::invalidate(const rgw_obj& obj) { - std::unique_lock wl{lock}; - auto iter = objs_state.find(obj); - if (iter == objs_state.end()) { - return; - } - bool is_atomic = iter->second.state.is_atomic; - bool prefetch_data = iter->second.state.prefetch_data; - bool compressed = iter->second.state.compressed; - - objs_state.erase(iter); - - if (is_atomic || prefetch_data) { - auto& sm = objs_state[obj]; - sm.state.is_atomic = is_atomic; - sm.state.prefetch_data = prefetch_data; - sm.state.compressed = compressed; - } -} - -class RGWMetaNotifierManager : public RGWCoroutinesManager { - RGWRados* store; - RGWHTTPManager http_manager; - -public: - RGWMetaNotifierManager(RGWRados *_driver) : RGWCoroutinesManager(_driver->ctx(), _driver->get_cr_registry()), store(_driver), - http_manager(store->ctx(), completion_mgr) { - http_manager.start(); - } - - int notify_all(const DoutPrefixProvider *dpp, map& conn_map, set& shards) { - rgw_http_param_pair pairs[] = { { "type", "metadata" }, - { "notify", NULL }, - { NULL, NULL } }; - - list stacks; - for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) { - RGWRESTConn *conn = iter->second; - RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this); - stack->call(new RGWPostRESTResourceCR, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL)); - - stacks.push_back(stack); - } - return run(dpp, stacks); - } -}; - -class RGWDataNotifierManager : public 
RGWCoroutinesManager { - RGWRados* store; - RGWHTTPManager http_manager; - -public: - RGWDataNotifierManager(RGWRados *_driver) : RGWCoroutinesManager(_driver->ctx(), _driver->get_cr_registry()), store(_driver), - http_manager(store->ctx(), completion_mgr) { - http_manager.start(); - } - - int notify_all(const DoutPrefixProvider *dpp, map& conn_map, - bc::flat_map >& shards) { - - list stacks; - const char *source_zone = store->svc.zone->get_zone_params().get_id().c_str(); - for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) { - RGWRESTConn *conn = iter->second; - RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this); - stack->call(new RGWDataPostNotifyCR(store, http_manager, shards, source_zone, conn)); - stacks.push_back(stack); - } - - return run(dpp, stacks); - } -}; - -/* class RGWRadosThread */ - -void RGWRadosThread::start() -{ - worker = new Worker(cct, this); - worker->create(thread_name.c_str()); -} - -void RGWRadosThread::stop() -{ - down_flag = true; - stop_process(); - if (worker) { - worker->signal(); - worker->join(); - } - delete worker; - worker = NULL; -} - -void *RGWRadosThread::Worker::entry() { - uint64_t msec = processor->interval_msec(); - auto interval = std::chrono::milliseconds(msec); - - do { - auto start = ceph::real_clock::now(); - int r = processor->process(this); - if (r < 0) { - ldpp_dout(this, 0) << "ERROR: processor->process() returned error r=" << r << dendl; - } - - if (processor->going_down()) - break; - - auto end = ceph::real_clock::now() - start; - - uint64_t cur_msec = processor->interval_msec(); - if (cur_msec != msec) { /* was it reconfigured? */ - msec = cur_msec; - interval = std::chrono::milliseconds(msec); - } - - if (cur_msec > 0) { - if (interval <= end) - continue; // next round - - auto wait_time = interval - end; - wait_interval(wait_time); - } else { - wait(); - } - } while (!processor->going_down()); - - return NULL; -} - -class RGWMetaNotifier : public RGWRadosThread { - RGWMetaNotifierManager notify_mgr; - RGWMetadataLog *const log; - - uint64_t interval_msec() override { - return cct->_conf->rgw_md_notify_interval_msec; - } - void stop_process() override { - notify_mgr.stop(); - } -public: - RGWMetaNotifier(RGWRados *_driver, RGWMetadataLog* log) - : RGWRadosThread(_driver, "meta-notifier"), notify_mgr(_driver), log(log) {} - - int process(const DoutPrefixProvider *dpp) override; -}; - -int RGWMetaNotifier::process(const DoutPrefixProvider *dpp) -{ - set shards; - - log->read_clear_modified(shards); - - if (shards.empty()) { - return 0; - } - - for (set::iterator iter = shards.begin(); iter != shards.end(); ++iter) { - ldpp_dout(dpp, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl; - } - - notify_mgr.notify_all(dpp, store->svc.zone->get_zone_conn_map(), shards); - - return 0; -} - -class RGWDataNotifier : public RGWRadosThread { - RGWDataNotifierManager notify_mgr; - bc::flat_set entry; - - uint64_t interval_msec() override { - return cct->_conf.get_val("rgw_data_notify_interval_msec"); - } - void stop_process() override { - notify_mgr.stop(); - } -public: - RGWDataNotifier(RGWRados *_driver) : RGWRadosThread(_driver, "data-notifier"), notify_mgr(_driver) {} - - int process(const DoutPrefixProvider *dpp) override; -}; - -int RGWDataNotifier::process(const DoutPrefixProvider *dpp) -{ - auto data_log = store->svc.datalog_rados; - if (!data_log) { - return 0; - } - - auto shards = data_log->read_clear_modified(); - - if (shards.empty()) { - return 0; - } - - for (const auto& 
[shard_id, entries] : shards) { - bc::flat_set::iterator it; - for (const auto& entry : entries) { - ldpp_dout(dpp, 20) << __func__ << "(): notifying datalog change, shard_id=" - << shard_id << ":" << entry.gen << ":" << entry.key << dendl; - } - } - - notify_mgr.notify_all(dpp, store->svc.zone->get_zone_data_notify_to_map(), shards); - - return 0; -} - -class RGWSyncProcessorThread : public RGWRadosThread { -public: - RGWSyncProcessorThread(RGWRados *_driver, const string& thread_name = "radosgw") : RGWRadosThread(_driver, thread_name) {} - RGWSyncProcessorThread(RGWRados *_driver) : RGWRadosThread(_driver) {} - ~RGWSyncProcessorThread() override {} - int init(const DoutPrefixProvider *dpp) override = 0 ; - int process(const DoutPrefixProvider *dpp) override = 0; -}; - -class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread -{ - RGWMetaSyncStatusManager sync; - - uint64_t interval_msec() override { - return 0; /* no interval associated, it'll run once until stopped */ - } - void stop_process() override { - sync.stop(); - } -public: - RGWMetaSyncProcessorThread(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados) - : RGWSyncProcessorThread(_driver->getRados(), "meta-sync"), sync(_driver, async_rados) {} - - void wakeup_sync_shards(set& shard_ids) { - for (set::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) { - sync.wakeup(*iter); - } - } - RGWMetaSyncStatusManager* get_manager() { return &sync; } - - int init(const DoutPrefixProvider *dpp) override { - int ret = sync.init(dpp); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: sync.init() returned " << ret << dendl; - return ret; - } - return 0; - } - - int process(const DoutPrefixProvider *dpp) override { - sync.run(dpp, null_yield); - return 0; - } -}; - -class RGWDataSyncProcessorThread : public RGWSyncProcessorThread -{ - PerfCountersRef counters; - RGWDataSyncStatusManager sync; - bool initialized; - - uint64_t interval_msec() override { - if (initialized) { - return 0; /* no interval associated, it'll run once until stopped */ - } else { -#define DATA_SYNC_INIT_WAIT_SEC 20 - return DATA_SYNC_INIT_WAIT_SEC * 1000; - } - } - void stop_process() override { - sync.stop(); - } -public: - RGWDataSyncProcessorThread(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados, - const RGWZone* source_zone) - : RGWSyncProcessorThread(_driver->getRados(), "data-sync"), - counters(sync_counters::build(store->ctx(), std::string("data-sync-from-") + source_zone->name)), - sync(_driver, async_rados, source_zone->id, counters.get()), - initialized(false) {} - - void wakeup_sync_shards(bc::flat_map >& entries) { - for (bc::flat_map >::iterator iter = entries.begin(); iter != entries.end(); ++iter) { - sync.wakeup(iter->first, iter->second); - } - } - - RGWDataSyncStatusManager* get_manager() { return &sync; } - - int init(const DoutPrefixProvider *dpp) override { - return 0; - } - - int process(const DoutPrefixProvider *dpp) override { - while (!initialized) { - if (going_down()) { - return 0; - } - int ret = sync.init(dpp); - if (ret >= 0) { - initialized = true; - break; - } - /* we'll be back! 
*/ - return 0; - } - sync.run(dpp); - return 0; - } -}; - -class RGWSyncLogTrimThread : public RGWSyncProcessorThread, DoutPrefixProvider -{ - RGWCoroutinesManager crs; - rgw::sal::RadosStore* store; - rgw::BucketTrimManager *bucket_trim; - RGWHTTPManager http; - const utime_t trim_interval; - - uint64_t interval_msec() override { return 0; } - void stop_process() override { crs.stop(); } -public: - RGWSyncLogTrimThread(rgw::sal::RadosStore* store, rgw::BucketTrimManager *bucket_trim, - int interval) - : RGWSyncProcessorThread(store->getRados(), "sync-log-trim"), - crs(store->ctx(), store->getRados()->get_cr_registry()), store(store), - bucket_trim(bucket_trim), - http(store->ctx(), crs.get_completion_mgr()), - trim_interval(interval, 0) - {} - - int init(const DoutPrefixProvider *dpp) override { - return http.start(); - } - int process(const DoutPrefixProvider *dpp) override { - list stacks; - auto metatrimcr = create_meta_log_trim_cr(this, static_cast(store), &http, - cct->_conf->rgw_md_log_max_shards, - trim_interval); - if (!metatrimcr) { - ldpp_dout(dpp, -1) << "Bailing out of trim thread!" << dendl; - return -EINVAL; - } - auto meta = new RGWCoroutinesStack(store->ctx(), &crs); - meta->call(metatrimcr); - - stacks.push_back(meta); - - if (store->svc()->zone->sync_module_exports_data()) { - auto data = new RGWCoroutinesStack(store->ctx(), &crs); - data->call(create_data_log_trim_cr(dpp, static_cast(store), &http, - cct->_conf->rgw_data_log_num_shards, - trim_interval)); - stacks.push_back(data); - - auto bucket = new RGWCoroutinesStack(store->ctx(), &crs); - bucket->call(bucket_trim->create_bucket_trim_cr(&http)); - stacks.push_back(bucket); - } - - crs.run(dpp, stacks); - return 0; - } - - // implements DoutPrefixProvider - CephContext *get_cct() const override { return store->ctx(); } - unsigned get_subsys() const override - { - return dout_subsys; - } - - std::ostream& gen_prefix(std::ostream& out) const override - { - return out << "sync log trim: "; - } - -}; - -void RGWRados::wakeup_meta_sync_shards(set& shard_ids) -{ - std::lock_guard l{meta_sync_thread_lock}; - if (meta_sync_processor_thread) { - meta_sync_processor_thread->wakeup_sync_shards(shard_ids); - } -} - -void RGWRados::wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, bc::flat_map >& entries) -{ - ldpp_dout(dpp, 20) << __func__ << ": source_zone=" << source_zone << ", entries=" << entries << dendl; - for (bc::flat_map >::iterator iter = entries.begin(); iter != entries.end(); ++iter) { - ldpp_dout(dpp, 20) << __func__ << "(): updated shard=" << iter->first << dendl; - bc::flat_set& entries = iter->second; - for (const auto& [key, gen] : entries) { - ldpp_dout(dpp, 20) << __func__ << ": source_zone=" << source_zone << ", key=" << key - << ", gen=" << gen << dendl; - } - } - - std::lock_guard l{data_sync_thread_lock}; - auto iter = data_sync_processor_threads.find(source_zone); - if (iter == data_sync_processor_threads.end()) { - ldpp_dout(dpp, 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl; - return; - } - - RGWDataSyncProcessorThread *thread = iter->second; - ceph_assert(thread); - thread->wakeup_sync_shards(entries); -} - -RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager() -{ - std::lock_guard l{meta_sync_thread_lock}; - if (meta_sync_processor_thread) { - return meta_sync_processor_thread->get_manager(); - } - return nullptr; -} - -RGWDataSyncStatusManager* 
RGWRados::get_data_sync_manager(const rgw_zone_id& source_zone) -{ - std::lock_guard l{data_sync_thread_lock}; - auto thread = data_sync_processor_threads.find(source_zone); - if (thread == data_sync_processor_threads.end()) { - return nullptr; - } - return thread->second->get_manager(); -} - -int RGWRados::get_required_alignment(const DoutPrefixProvider *dpp, const rgw_pool& pool, uint64_t *alignment) -{ - IoCtx ioctx; - int r = open_pool_ctx(dpp, pool, ioctx, false); - if (r < 0) { - ldpp_dout(dpp, 0) << "ERROR: open_pool_ctx() returned " << r << dendl; - return r; - } - - bool req; - r = ioctx.pool_requires_alignment2(&req); - if (r < 0) { - ldpp_dout(dpp, 0) << "ERROR: ioctx.pool_requires_alignment2() returned " - << r << dendl; - return r; - } - - if (!req) { - *alignment = 0; - return 0; - } - - uint64_t align; - r = ioctx.pool_required_alignment2(&align); - if (r < 0) { - ldpp_dout(dpp, 0) << "ERROR: ioctx.pool_required_alignment2() returned " - << r << dendl; - return r; - } - if (align != 0) { - ldpp_dout(dpp, 20) << "required alignment=" << align << dendl; - } - *alignment = align; - return 0; -} - -void RGWRados::get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size) -{ - if (alignment == 0) { - *max_size = size; - return; - } - - if (size <= alignment) { - *max_size = alignment; - return; - } - - *max_size = size - (size % alignment); -} - -int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment) -{ - uint64_t alignment; - int r = get_required_alignment(dpp, pool, &alignment); - if (r < 0) { - return r; - } - - if (palignment) { - *palignment = alignment; - } - - uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size; - - get_max_aligned_size(config_chunk_size, alignment, max_chunk_size); - - ldpp_dout(dpp, 20) << "max_chunk_size=" << *max_chunk_size << dendl; - - return 0; -} - -int RGWRados::get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj, - uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment) -{ - rgw_pool pool; - if (!get_obj_data_pool(placement_rule, obj, &pool)) { - ldpp_dout(dpp, 0) << "ERROR: failed to get data pool for object " << obj << dendl; - return -EIO; - } - return get_max_chunk_size(pool, max_chunk_size, dpp, palignment); -} - -void add_datalog_entry(const DoutPrefixProvider* dpp, - RGWDataChangesLog* datalog, - const RGWBucketInfo& bucket_info, - uint32_t shard_id) -{ - const auto& logs = bucket_info.layout.logs; - if (logs.empty()) { - return; - } - int r = datalog->add_entry(dpp, bucket_info, logs.back(), shard_id); - if (r < 0) { - ldpp_dout(dpp, -1) << "ERROR: failed writing data log" << dendl; - } // datalog error is not fatal -} - -class RGWIndexCompletionManager; - -struct complete_op_data { - ceph::mutex lock = ceph::make_mutex("complete_op_data"); - AioCompletion *rados_completion{nullptr}; - int manager_shard_id{-1}; - RGWIndexCompletionManager *manager{nullptr}; - rgw_obj obj; - RGWModifyOp op; - string tag; - rgw_bucket_entry_ver ver; - cls_rgw_obj_key key; - rgw_bucket_dir_entry_meta dir_meta; - list remove_objs; - bool log_op; - uint16_t bilog_op; - rgw_zone_set zones_trace; - - bool stopped{false}; - - void stop() { - std::lock_guard l{lock}; - stopped = true; - } -}; - -class RGWIndexCompletionManager { - RGWRados* const store; - const uint32_t num_shards; - ceph::containers::tiny_vector locks; - std::vector> completions; - std::vector retry_completions; - - std::condition_variable 
cond; - std::mutex retry_completions_lock; - bool _stop{false}; - std::thread retry_thread; - - // used to distribute the completions and the locks they use across - // their respective vectors; it will get incremented and can wrap - // around back to 0 without issue - std::atomic cur_shard {0}; - - void process(); - - void add_completion(complete_op_data *completion); - - void stop() { - if (retry_thread.joinable()) { - _stop = true; - cond.notify_all(); - retry_thread.join(); - } - - for (uint32_t i = 0; i < num_shards; ++i) { - std::lock_guard l{locks[i]}; - for (auto c : completions[i]) { - c->stop(); - } - } - completions.clear(); - } - - uint32_t next_shard() { - return cur_shard++ % num_shards; - } - -public: - RGWIndexCompletionManager(RGWRados *_driver) : - store(_driver), - num_shards(store->ctx()->_conf->rgw_thread_pool_size), - locks{ceph::make_lock_container( - num_shards, - [](const size_t i) { - return ceph::make_mutex("RGWIndexCompletionManager::lock::" + - std::to_string(i)); - })}, - completions(num_shards), - retry_thread(&RGWIndexCompletionManager::process, this) - {} - - ~RGWIndexCompletionManager() { - stop(); - } - - void create_completion(const rgw_obj& obj, - RGWModifyOp op, string& tag, - rgw_bucket_entry_ver& ver, - const cls_rgw_obj_key& key, - rgw_bucket_dir_entry_meta& dir_meta, - list *remove_objs, bool log_op, - uint16_t bilog_op, - rgw_zone_set *zones_trace, - complete_op_data **result); - - bool handle_completion(completion_t cb, complete_op_data *arg); - - CephContext* ctx() { - return store->ctx(); - } -}; - -static void obj_complete_cb(completion_t cb, void *arg) -{ - complete_op_data *completion = reinterpret_cast(arg); - completion->lock.lock(); - if (completion->stopped) { - completion->lock.unlock(); /* can drop lock, no one else is referencing us */ - delete completion; - return; - } - bool need_delete = completion->manager->handle_completion(cb, completion); - completion->lock.unlock(); - if (need_delete) { - delete completion; - } -} - -void RGWIndexCompletionManager::process() -{ - DoutPrefix dpp(store->ctx(), dout_subsys, "rgw index completion thread: "); - while(!_stop) { - std::vector comps; - - { - std::unique_lock l{retry_completions_lock}; - cond.wait(l, [this](){return _stop || !retry_completions.empty();}); - if (_stop) { - return; - } - retry_completions.swap(comps); - } - - for (auto c : comps) { - std::unique_ptr up{c}; - - ldpp_dout(&dpp, 20) << __func__ << "(): handling completion for key=" << c->key << dendl; - - RGWRados::BucketShard bs(store); - RGWBucketInfo bucket_info; - - int r = bs.init(c->obj.bucket, c->obj, &bucket_info, &dpp); - if (r < 0) { - ldpp_dout(&dpp, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl; - /* not much to do */ - continue; - } - - r = store->guard_reshard(&dpp, &bs, c->obj, bucket_info, - [&](RGWRados::BucketShard *bs) -> int { - const bool bitx = ctx()->_conf->rgw_bucket_index_transaction_instrumentation; - ldout_bitx(bitx, &dpp, 10) << - "ENTERING " << __func__ << ": bucket-shard=" << bs << - " obj=" << c->obj << " tag=" << c->tag << - " op=" << c->op << ", remove_objs=" << c->remove_objs << dendl_bitx; - ldout_bitx(bitx, &dpp, 25) << - "BACKTRACE: " << __func__ << ": " << ClibBackTrace(1) << dendl_bitx; - - librados::ObjectWriteOperation o; - o.assert_exists(); - cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING); - cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs, - c->log_op, c->bilog_op, 
&c->zones_trace); - int ret = bs->bucket_obj.operate(&dpp, &o, null_yield); - ldout_bitx(bitx, &dpp, 10) << - "EXITING " << __func__ << ": ret=" << dendl_bitx; - return ret; - }); - if (r < 0) { - ldpp_dout(&dpp, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl; - /* ignoring error, can't do anything about it */ - continue; - } - - add_datalog_entry(&dpp, store->svc.datalog_rados, bucket_info, bs.shard_id); - } - } -} - -void RGWIndexCompletionManager::create_completion(const rgw_obj& obj, - RGWModifyOp op, string& tag, - rgw_bucket_entry_ver& ver, - const cls_rgw_obj_key& key, - rgw_bucket_dir_entry_meta& dir_meta, - list *remove_objs, bool log_op, - uint16_t bilog_op, - rgw_zone_set *zones_trace, - complete_op_data **result) -{ - complete_op_data *entry = new complete_op_data; - - int shard_id = next_shard(); - - entry->manager_shard_id = shard_id; - entry->manager = this; - entry->obj = obj; - entry->op = op; - entry->tag = tag; - entry->ver = ver; - entry->key = key; - entry->dir_meta = dir_meta; - entry->log_op = log_op; - entry->bilog_op = bilog_op; - - if (remove_objs) { - for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) { - entry->remove_objs.push_back(*iter); - } - } - - if (zones_trace) { - entry->zones_trace = *zones_trace; - } else { - entry->zones_trace.insert(store->svc.zone->get_zone().id, obj.bucket.get_key()); - } - - *result = entry; - - entry->rados_completion = librados::Rados::aio_create_completion(entry, obj_complete_cb); - - std::lock_guard l{locks[shard_id]}; - const auto ok = completions[shard_id].insert(entry).second; - ceph_assert(ok); -} - -void RGWIndexCompletionManager::add_completion(complete_op_data *completion) { - { - std::lock_guard l{retry_completions_lock}; - retry_completions.push_back(completion); - } - cond.notify_all(); -} - -bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg) -{ - int shard_id = arg->manager_shard_id; - { - std::lock_guard l{locks[shard_id]}; - - auto& comps = completions[shard_id]; - - auto iter = comps.find(arg); - if (iter == comps.end()) { - ldout(arg->manager->ctx(), 0) << __func__ << "(): cannot find completion for obj=" << arg->key << dendl; - return true; - } - - comps.erase(iter); - } - - int r = rados_aio_get_return_value(cb); - if (r != -ERR_BUSY_RESHARDING) { - ldout(arg->manager->ctx(), 20) << __func__ << "(): completion " << - (r == 0 ? 
"ok" : "failed with " + to_string(r)) << - " for obj=" << arg->key << dendl; - return true; - } - add_completion(arg); - ldout(arg->manager->ctx(), 20) << __func__ << "(): async completion added for obj=" << arg->key << dendl; - return false; -} - -void RGWRados::finalize() -{ - /* Before joining any sync threads, drain outstanding requests & - * mark the async_processor as going_down() */ - if (svc.rados) { - svc.rados->stop_processor(); - } - - if (run_sync_thread) { - std::lock_guard l{meta_sync_thread_lock}; - meta_sync_processor_thread->stop(); - - std::lock_guard dl{data_sync_thread_lock}; - for (auto iter : data_sync_processor_threads) { - RGWDataSyncProcessorThread *thread = iter.second; - thread->stop(); - } - if (sync_log_trimmer) { - sync_log_trimmer->stop(); - } - } - if (run_sync_thread) { - delete meta_sync_processor_thread; - meta_sync_processor_thread = NULL; - std::lock_guard dl{data_sync_thread_lock}; - for (auto iter : data_sync_processor_threads) { - RGWDataSyncProcessorThread *thread = iter.second; - delete thread; - } - data_sync_processor_threads.clear(); - delete sync_log_trimmer; - sync_log_trimmer = nullptr; - bucket_trim = boost::none; - } - if (meta_notifier) { - meta_notifier->stop(); - delete meta_notifier; - } - if (data_notifier) { - data_notifier->stop(); - delete data_notifier; - } - delete sync_tracer; - - delete lc; - lc = NULL; - - delete gc; - gc = NULL; - - delete obj_expirer; - obj_expirer = NULL; - - RGWQuotaHandler::free_handler(quota_handler); - if (cr_registry) { - cr_registry->put(); - } - - svc.shutdown(); - - delete binfo_cache; - delete obj_tombstone_cache; - if (d3n_data_cache) - delete d3n_data_cache; - - if (reshard_wait.get()) { - reshard_wait->stop(); - reshard_wait.reset(); - } - - if (run_reshard_thread) { - reshard->stop_processor(); - } - delete reshard; - delete index_completion_manager; - - rgw::notify::shutdown(); -} - -/** - * Initialize the RADOS instance and prepare to do other ops - * Returns 0 on success, -ERR# on failure. 
- */ -int RGWRados::init_rados() -{ - int ret = 0; - - ret = rados.init_with_context(cct); - if (ret < 0) { - return ret; - } - ret = rados.connect(); - if (ret < 0) { - return ret; - } - - auto crs = std::unique_ptr{ - new RGWCoroutinesManagerRegistry(cct)}; - ret = crs->hook_to_admin_command("cr dump"); - if (ret < 0) { - return ret; - } - - cr_registry = crs.release(); - - if (use_datacache) { - d3n_data_cache = new D3nDataCache(); - d3n_data_cache->init(cct); - } - - return ret; -} - -int RGWRados::register_to_service_map(const DoutPrefixProvider *dpp, const string& daemon_type, const map& meta) -{ - string name = cct->_conf->name.get_id(); - if (name.compare(0, 4, "rgw.") == 0) { - name = name.substr(4); - } - map metadata = meta; - metadata["num_handles"] = "1"s; - metadata["zonegroup_id"] = svc.zone->get_zonegroup().get_id(); - metadata["zonegroup_name"] = svc.zone->get_zonegroup().get_name(); - metadata["zone_name"] = svc.zone->zone_name(); - metadata["zone_id"] = svc.zone->zone_id().id; - metadata["realm_name"] = svc.zone->get_realm().get_name(); - metadata["realm_id"] = svc.zone->get_realm().get_id(); - metadata["id"] = name; - int ret = rados.service_daemon_register( - daemon_type, - stringify(rados.get_instance_id()), - metadata); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl; - return ret; - } - - return 0; -} - -int RGWRados::update_service_map(const DoutPrefixProvider *dpp, std::map&& status) -{ - int ret = rados.service_daemon_update_status(move(status)); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: service_daemon_update_status() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl; - return ret; - } - - return 0; -} - -/** - * Initialize the RADOS instance and prepare to do other ops - * Returns 0 on success, -ERR# on failure. 
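
register_to_service_map() and update_service_map() above are thin wrappers over two librados calls. A minimal standalone sketch; the daemon name and metadata keys here are illustrative, where RGW derives them from its zone services:

    #include <rados/librados.hpp>
    #include <map>
    #include <string>
    #include <utility>

    int register_daemon(librados::Rados& rados) {
      std::map<std::string, std::string> metadata = {
        {"zone_name", "default"},     // RGW fills these from svc.zone
        {"id", "my-gateway"},
      };
      // one-time registration with immutable metadata
      int r = rados.service_daemon_register("rgw", "my-gateway", metadata);
      if (r < 0) return r;
      // later: push mutable status without re-registering
      std::map<std::string, std::string> status = {{"current_sync", "idle"}};
      return rados.service_daemon_update_status(std::move(status));
    }
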
- */ -int RGWRados::init_complete(const DoutPrefixProvider *dpp) -{ - int ret; - - /* - * create sync module instance even if we don't run sync thread, might need it for radosgw-admin - */ - sync_module = svc.sync_modules->get_sync_module(); - - ret = open_root_pool_ctx(dpp); - if (ret < 0) - return ret; - - ret = open_gc_pool_ctx(dpp); - if (ret < 0) - return ret; - - ret = open_lc_pool_ctx(dpp); - if (ret < 0) - return ret; - - ret = open_objexp_pool_ctx(dpp); - if (ret < 0) - return ret; - - ret = open_reshard_pool_ctx(dpp); - if (ret < 0) - return ret; - - ret = open_notif_pool_ctx(dpp); - if (ret < 0) - return ret; - - pools_initialized = true; - - if (use_gc) { - gc = new RGWGC(); - gc->initialize(cct, this); - } else { - ldpp_dout(dpp, 5) << "note: GC not initialized" << dendl; - } - - obj_expirer = new RGWObjectExpirer(this->driver); - - if (use_gc_thread && use_gc) { - gc->start_processor(); - obj_expirer->start_processor(); - } - - auto& current_period = svc.zone->get_current_period(); - auto& zonegroup = svc.zone->get_zonegroup(); - auto& zone_params = svc.zone->get_zone_params(); - auto& zone = svc.zone->get_zone(); - - /* no point of running sync thread if we don't have a master zone configured - or there is no rest_master_conn */ - if (!svc.zone->need_to_sync()) { - run_sync_thread = false; - } - - if (svc.zone->is_meta_master()) { - auto md_log = svc.mdlog->get_log(current_period.get_id()); - meta_notifier = new RGWMetaNotifier(this, md_log); - meta_notifier->start(); - } - - /* init it anyway, might run sync through radosgw-admin explicitly */ - sync_tracer = new RGWSyncTraceManager(cct, cct->_conf->rgw_sync_trace_history_size); - sync_tracer->init(this); - ret = sync_tracer->hook_to_admin_command(); - if (ret < 0) { - return ret; - } - - if (run_sync_thread) { - for (const auto &pt: zonegroup.placement_targets) { - if (zone_params.placement_pools.find(pt.second.name) - == zone_params.placement_pools.end()){ - ldpp_dout(dpp, 0) << "WARNING: This zone does not contain the placement target " - << pt.second.name << " present in zonegroup" << dendl; - } - } - auto async_processor = svc.rados->get_async_processor(); - std::lock_guard l{meta_sync_thread_lock}; - meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this->driver, async_processor); - ret = meta_sync_processor_thread->init(dpp); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: failed to initialize meta sync thread" << dendl; - return ret; - } - meta_sync_processor_thread->start(); - - // configure the bucket trim manager - rgw::BucketTrimConfig config; - rgw::configure_bucket_trim(cct, config); - - bucket_trim.emplace(this->driver, config); - ret = bucket_trim->init(); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: failed to start bucket trim manager" << dendl; - return ret; - } - svc.datalog_rados->set_observer(&*bucket_trim); - - std::lock_guard dl{data_sync_thread_lock}; - for (auto source_zone : svc.zone->get_data_sync_source_zones()) { - ldpp_dout(dpp, 5) << "starting data sync thread for zone " << source_zone->name << dendl; - auto *thread = new RGWDataSyncProcessorThread(this->driver, svc.rados->get_async_processor(), source_zone); - ret = thread->init(dpp); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: failed to initialize data sync thread" << dendl; - return ret; - } - thread->start(); - data_sync_processor_threads[rgw_zone_id(source_zone->id)] = thread; - } - auto interval = cct->_conf->rgw_sync_log_trim_interval; - if (interval > 0) { - sync_log_trimmer = new RGWSyncLogTrimThread(this->driver, 
&*bucket_trim, interval); - ret = sync_log_trimmer->init(dpp); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: failed to initialize sync log trim thread" << dendl; - return ret; - } - sync_log_trimmer->start(); - } - } - if (cct->_conf->rgw_data_notify_interval_msec) { - data_notifier = new RGWDataNotifier(this); - data_notifier->start(); - } - - binfo_cache = new RGWChainedCacheImpl; - binfo_cache->init(svc.cache); - - lc = new RGWLC(); - lc->initialize(cct, this->driver); - - if (use_lc_thread) - lc->start_processor(); - - quota_handler = RGWQuotaHandler::generate_handler(dpp, this->driver, quota_threads); - - bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards : - zone.bucket_index_max_shards); - if (bucket_index_max_shards > get_max_bucket_shards()) { - bucket_index_max_shards = get_max_bucket_shards(); - ldpp_dout(dpp, 1) << __func__ << " bucket index max shards is too large, reset to value: " - << get_max_bucket_shards() << dendl; - } - ldpp_dout(dpp, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl; - - bool need_tombstone_cache = !svc.zone->get_zone_data_notify_to_map().empty(); /* have zones syncing from us */ - - if (need_tombstone_cache) { - obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size); - } - - reshard_wait = std::make_shared(); - - reshard = new RGWReshard(this->driver); - - // disable reshard thread based on zone/zonegroup support - run_reshard_thread = run_reshard_thread && svc.zone->can_reshard(); - - if (run_reshard_thread) { - reshard->start_processor(); - } - - index_completion_manager = new RGWIndexCompletionManager(this); - ret = rgw::notify::init(cct, driver, dpp); - if (ret < 0 ) { - ldpp_dout(dpp, 1) << "ERROR: failed to initialize notification manager" << dendl; - } - - return ret; -} - -int RGWRados::init_svc(bool raw, const DoutPrefixProvider *dpp) -{ - if (raw) { - return svc.init_raw(cct, use_cache, null_yield, dpp); - } - - return svc.init(cct, use_cache, run_sync_thread, null_yield, dpp); -} - -int RGWRados::init_ctl(const DoutPrefixProvider *dpp) -{ - return ctl.init(&svc, driver, dpp); -} - -/** - * Initialize the RADOS instance and prepare to do other ops - * Returns 0 on success, -ERR# on failure. - */ -int RGWRados::init_begin(const DoutPrefixProvider *dpp) -{ - int ret; - - inject_notify_timeout_probability = - cct->_conf.get_val("rgw_inject_notify_timeout_probability"); - max_notify_retries = cct->_conf.get_val("rgw_max_notify_retries"); - - ret = init_svc(false, dpp); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl; - return ret; - } - - ret = init_ctl(dpp); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: failed to init ctls (ret=" << cpp_strerror(-ret) << ")" << dendl; - return ret; - } - - host_id = svc.zone_utils->gen_host_id(); - - return init_rados(); -} - -/** - * Open the pool used as root for this gateway - * Returns: 0 on success, -ERR# otherwise. 
- */ -int RGWRados::open_root_pool_ctx(const DoutPrefixProvider *dpp) -{ - return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().domain_root, root_pool_ctx, true, true); -} - -int RGWRados::open_gc_pool_ctx(const DoutPrefixProvider *dpp) -{ - return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().gc_pool, gc_pool_ctx, true, true); -} - -int RGWRados::open_lc_pool_ctx(const DoutPrefixProvider *dpp) -{ - return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().lc_pool, lc_pool_ctx, true, true); -} - -int RGWRados::open_objexp_pool_ctx(const DoutPrefixProvider *dpp) -{ - return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, objexp_pool_ctx, true, true); -} - -int RGWRados::open_reshard_pool_ctx(const DoutPrefixProvider *dpp) -{ - return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().reshard_pool, reshard_pool_ctx, true, true); -} - -int RGWRados::open_notif_pool_ctx(const DoutPrefixProvider *dpp) -{ - return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().notif_pool, notif_pool_ctx, true, true); -} - -int RGWRados::open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx& io_ctx, - bool mostly_omap) -{ - constexpr bool create = true; // create the pool if it doesn't exist - return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create, mostly_omap); -} - -/**** logs ****/ - -struct log_list_state { - string prefix; - librados::IoCtx io_ctx; - librados::NObjectIterator obit; -}; - -int RGWRados::log_list_init(const DoutPrefixProvider *dpp, const string& prefix, RGWAccessHandle *handle) -{ - log_list_state *state = new log_list_state; - int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx); - if (r < 0) { - delete state; - return r; - } - state->prefix = prefix; - state->obit = state->io_ctx.nobjects_begin(); - *handle = (RGWAccessHandle)state; - return 0; -} - -int RGWRados::log_list_next(RGWAccessHandle handle, string *name) -{ - log_list_state *state = static_cast<log_list_state *>(handle); - while (true) { - if (state->obit == state->io_ctx.nobjects_end()) { - delete state; - return -ENOENT; - } - if (state->prefix.length() && - state->obit->get_oid().find(state->prefix) != 0) { - state->obit++; - continue; - } - *name = state->obit->get_oid(); - state->obit++; - break; - } - return 0; -} - -int RGWRados::log_remove(const DoutPrefixProvider *dpp, const string& name) -{ - librados::IoCtx io_ctx; - int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx); - if (r < 0) - return r; - return io_ctx.remove(name); -} - -struct log_show_state { - librados::IoCtx io_ctx; - bufferlist bl; - bufferlist::const_iterator p; - string name; - uint64_t pos; - bool eof; - log_show_state() : pos(0), eof(false) {} -}; - -int RGWRados::log_show_init(const DoutPrefixProvider *dpp, const string& name, RGWAccessHandle *handle) -{ - log_show_state *state = new log_show_state; - int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx); - if (r < 0) { - delete state; - return r; - } - state->name = name; - *handle = (RGWAccessHandle)state; - return 0; -}
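For context: the log_list_* functions above expose a C-style cursor API behind an opaque RGWAccessHandle; init opens the log pool and positions an object iterator, and each next call yields one oid, tearing the state down when it returns -ENOENT at the end. A minimal caller sketch (not part of this patch; `store` and `dpp` are assumed to be in scope):

    // Hedged sketch: walking all log objects with the handle API above.
    void dump_log_objects(const DoutPrefixProvider *dpp, RGWRados *store) {
      RGWAccessHandle h;
      if (store->log_list_init(dpp, "" /* no prefix filter */, &h) < 0) {
        return;
      }
      std::string name;
      while (store->log_list_next(h, &name) == 0) {
        std::cout << name << "\n";  // one log object oid per iteration
      }
      // log_list_next() freed the state and returned -ENOENT at the end
    }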
off - << " eof " << (int)state->eof - << dendl; - // read some? - unsigned chunk = 1024*1024; - if ((state->bl.length() - off) < chunk/2 && !state->eof) { - bufferlist more; - int r = state->io_ctx.read(state->name, more, chunk, state->pos); - if (r < 0) - return r; - state->pos += r; - bufferlist old; - try { - old.substr_of(state->bl, off, state->bl.length() - off); - } catch (buffer::error& err) { - return -EINVAL; - } - state->bl = std::move(old); - state->bl.claim_append(more); - state->p = state->bl.cbegin(); - if ((unsigned)r < chunk) - state->eof = true; - ldpp_dout(dpp, 10) << " read " << r << dendl; - } - - if (state->p.end()) - return 0; // end of file - try { - decode(*entry, state->p); - } - catch (const buffer::error &e) { - return -EINVAL; - } - return 1; -} - -/** - * usage_log_hash: get usage log key hash, based on name and index - * - * Get the usage object name. Since a user may have more than 1 - * object holding that info (multiple shards), we use index to - * specify that shard number. Once index exceeds max shards it - * wraps. - * If name is not being set, results for all users will be returned - * and index will wrap only after total shards number. - * - * @param cct [in] ceph context - * @param name [in] user name - * @param hash [out] hash value - * @param index [in] shard index number - */ -static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index) -{ - uint32_t val = index; - - if (!name.empty()) { - int max_user_shards = cct->_conf->rgw_usage_max_user_shards; - val %= max_user_shards; - val += ceph_str_hash_linux(name.c_str(), name.size()); - } - char buf[17]; - int max_shards = cct->_conf->rgw_usage_max_shards; - snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards)); - hash = buf; -} - -int RGWRados::log_usage(const DoutPrefixProvider *dpp, map& usage_info) -{ - uint32_t index = 0; - - map log_objs; - - string hash; - string last_user; - - /* restructure usage map, zone by object hash */ - map::iterator iter; - for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) { - const rgw_user_bucket& ub = iter->first; - RGWUsageBatch& info = iter->second; - - if (ub.user.empty()) { - ldpp_dout(dpp, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl; - continue; - } - - if (ub.user != last_user) { - /* index *should* be random, but why waste extra cycles - in most cases max user shards is not going to exceed 1, - so just incrementing it */ - usage_log_hash(cct, ub.user, hash, index++); - } - last_user = ub.user; - vector& v = log_objs[hash].entries; - - for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) { - v.push_back(miter->second); - } - } - - map::iterator liter; - - for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) { - int r = cls_obj_usage_log_add(dpp, liter->first, liter->second); - if (r < 0) - return r; - } - return 0; -} - -int RGWRados::read_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch, - uint32_t max_entries, bool *is_truncated, RGWUsageIter& usage_iter, map& usage) -{ - uint32_t num = max_entries; - string hash, first_hash; - string user_str = user.to_str(); - usage_log_hash(cct, user_str, first_hash, 0); - - if (usage_iter.index) { - usage_log_hash(cct, user_str, hash, usage_iter.index); - } else { - hash = first_hash; - } - - usage.clear(); - - do { - map ret_usage; - map::iterator iter; - - int ret = 
-/** - * usage_log_hash: get usage log key hash, based on name and index - * - * Get the usage object name. Since a user may have more than 1 - * object holding that info (multiple shards), we use index to - * specify that shard number. Once index exceeds max shards it - * wraps. - * If name is not being set, results for all users will be returned - * and index will wrap only after total shards number. - * - * @param cct [in] ceph context - * @param name [in] user name - * @param hash [out] hash value - * @param index [in] shard index number - */ -static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index) -{ - uint32_t val = index; - - if (!name.empty()) { - int max_user_shards = cct->_conf->rgw_usage_max_user_shards; - val %= max_user_shards; - val += ceph_str_hash_linux(name.c_str(), name.size()); - } - char buf[17]; - int max_shards = cct->_conf->rgw_usage_max_shards; - snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards)); - hash = buf; -} - -int RGWRados::log_usage(const DoutPrefixProvider *dpp, map<rgw_user_bucket, RGWUsageBatch>& usage_info) -{ - uint32_t index = 0; - - map<string, rgw_usage_log_info> log_objs; - - string hash; - string last_user; - - /* restructure usage map, group by object hash */ - map<rgw_user_bucket, RGWUsageBatch>::iterator iter; - for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) { - const rgw_user_bucket& ub = iter->first; - RGWUsageBatch& info = iter->second; - - if (ub.user.empty()) { - ldpp_dout(dpp, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl; - continue; - } - - if (ub.user != last_user) { - /* index *should* be random, but why waste extra cycles; - in most cases max user shards is not going to exceed 1, - so just increment it */ - usage_log_hash(cct, ub.user, hash, index++); - } - last_user = ub.user; - vector<rgw_usage_log_entry>& v = log_objs[hash].entries; - - for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) { - v.push_back(miter->second); - } - } - - map<string, rgw_usage_log_info>::iterator liter; - - for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) { - int r = cls_obj_usage_log_add(dpp, liter->first, liter->second); - if (r < 0) - return r; - } - return 0; -}
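usage_log_hash() above pins each named user to a small window of rgw_usage_max_user_shards shard objects (offset by a hash of the name), while the total number of shard objects is capped by rgw_usage_max_shards; log_usage() then groups batches by the resulting oid. A toy rendering of that mapping, with a stand-in hash and hard-coded shard counts (8 and 32 are illustrative stand-ins for the conf values, and "usage." stands in for RGW_USAGE_OBJ_PREFIX):

    #include <cstdint>
    #include <cstdio>
    #include <string>

    static uint32_t str_hash(const std::string& s) {  // stand-in for ceph_str_hash_linux()
      uint32_t h = 0;
      for (unsigned char c : s) h = h * 31 + c;
      return h;
    }

    static std::string usage_shard(const std::string& user, uint32_t index) {
      uint32_t val = index;
      if (!user.empty()) {
        val %= 8;                 // rgw_usage_max_user_shards: per-user wrap
        val += str_hash(user);    // anchor the user's shard window
      }
      char buf[32];
      snprintf(buf, sizeof(buf), "usage.%u", val % 32);  // rgw_usage_max_shards
      return buf;                 // e.g. usage.17, usage.18, ... for one user
    }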
-int RGWRados::read_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch, - uint32_t max_entries, bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage) -{ - uint32_t num = max_entries; - string hash, first_hash; - string user_str = user.to_str(); - usage_log_hash(cct, user_str, first_hash, 0); - - if (usage_iter.index) { - usage_log_hash(cct, user_str, hash, usage_iter.index); - } else { - hash = first_hash; - } - - usage.clear(); - - do { - map<rgw_user_bucket, rgw_usage_log_entry> ret_usage; - map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter; - - int ret = cls_obj_usage_log_read(dpp, hash, user_str, bucket_name, start_epoch, end_epoch, num, - usage_iter.read_iter, ret_usage, is_truncated); - if (ret == -ENOENT) - goto next; - - if (ret < 0) - return ret; - - num -= ret_usage.size(); - - for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) { - usage[iter->first].aggregate(iter->second); - } - -next: - if (!*is_truncated) { - usage_iter.read_iter.clear(); - usage_log_hash(cct, user_str, hash, ++usage_iter.index); - } - } while (num && !*is_truncated && hash != first_hash); - return 0; -} - -int RGWRados::trim_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch) -{ - uint32_t index = 0; - string hash, first_hash; - string user_str = user.to_str(); - usage_log_hash(cct, user_str, first_hash, index); - - hash = first_hash; - do { - int ret = cls_obj_usage_log_trim(dpp, hash, user_str, bucket_name, start_epoch, end_epoch); - - if (ret < 0 && ret != -ENOENT) - return ret; - - usage_log_hash(cct, user_str, hash, ++index); - } while (hash != first_hash); - - return 0; -} - - -int RGWRados::clear_usage(const DoutPrefixProvider *dpp) -{ - auto max_shards = cct->_conf->rgw_usage_max_shards; - int ret = 0; - for (unsigned i = 0; i < max_shards; i++) { - string oid = RGW_USAGE_OBJ_PREFIX + to_string(i); - ret = cls_obj_usage_log_clear(dpp, oid); - if (ret < 0) { - ldpp_dout(dpp, 0) << "usage clear on oid=" << oid << " failed with ret=" << ret << dendl; - return ret; - } - } - return ret; -}
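read_usage() above already walks shards internally, advancing usage_iter.index whenever a shard is exhausted until the hash wraps back to first_hash, so a caller only has to loop on the is_truncated flag and pass the same RGWUsageIter back in. Roughly (a hedged sketch; store, dpp, user, bucket_name and the epoch bounds are assumed to be in scope):

    RGWUsageIter usage_iter;            // carries shard index + read cursor
    bool is_truncated = false;
    do {
      std::map<rgw_user_bucket, rgw_usage_log_entry> chunk;
      int r = store->read_usage(dpp, user, bucket_name, start_epoch, end_epoch,
                                1000 /* max_entries */, &is_truncated,
                                usage_iter, chunk);
      if (r < 0)
        break;
      for (const auto& [ub, entry] : chunk) {
        std::cout << ub.user << "/" << ub.bucket << "\n";  // aggregate here
      }
    } while (is_truncated);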
-int RGWRados::decode_policy(const DoutPrefixProvider *dpp, - ceph::buffer::list& bl, - ACLOwner *owner) -{ - auto i = bl.cbegin(); - RGWAccessControlPolicy policy(cct); - try { - policy.decode_owner(i); - } catch (buffer::error& err) { - ldpp_dout(dpp, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl; - return -EIO; - } - *owner = policy.get_owner(); - return 0; -} - -int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id, const DoutPrefixProvider *dpp) -{ - rgw_bucket bucket = bucket_info.bucket; - bucket.update_bucket_id(new_bucket_id); - - bucket_info.objv_tracker.clear(); - int ret = store->get_bucket_instance_info(bucket, bucket_info, nullptr, nullptr, null_yield, dpp); - if (ret < 0) { - return ret; - } - - return 0; -}
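The two listing methods that follow are the core of bucket enumeration. For orientation, a hedged sketch of how a caller might drive the ordered variant with S3-style prefix/delimiter semantics (store, bucket_info and dpp are assumed to be in scope; 1000 mirrors a typical max-keys):

    RGWRados::Bucket target(store, bucket_info);
    RGWRados::Bucket::List list_op(&target);
    list_op.params.prefix = "photos/";
    list_op.params.delim = "/";

    std::vector<rgw_bucket_dir_entry> objs;
    std::map<std::string, bool> prefixes;   // "directories" under photos/
    bool truncated = false;
    int r = list_op.list_objects_ordered(dpp, 1000, &objs, &prefixes,
                                         &truncated, null_yield);
    // objs now holds keys like photos/a.jpg; prefixes holds common
    // prefixes like photos/2022/, each counted against the 1000 budget.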
-/** - * Get ordered listing of the objects in a bucket. - * - * max_p: maximum number of results to return - * bucket: bucket to list contents of - * prefix: only return results that match this prefix - * delim: do not include results that match this string. - * Any skipped results will have the matching portion of their name - * inserted in common_prefixes with a "true" mark. - * marker: if filled in, begin the listing with this object. - * end_marker: if filled in, end the listing with this object. - * result: the objects are put in here. - * common_prefixes: if delim is filled in, any matching prefixes are - * placed here. - * is_truncated: if number of objects in the bucket is bigger than - * max, then truncated. - */ -int RGWRados::Bucket::List::list_objects_ordered( - const DoutPrefixProvider *dpp, - int64_t max_p, - std::vector<rgw_bucket_dir_entry> *result, - std::map<std::string, bool> *common_prefixes, - bool *is_truncated, - optional_yield y) -{ - RGWRados *store = target->get_store(); - CephContext *cct = store->ctx(); - int shard_id = target->get_shard_id(); - const auto& current_index = target->get_bucket_info().layout.current_index; - - int count = 0; - bool truncated = true; - bool cls_filtered = false; - const int64_t max = // protect against memory issues and negative vals - std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p)); - int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead, max); - - result->clear(); - - // use a local marker; either the marker will have a previous entry - // or it will be empty; either way it's OK to copy - rgw_obj_key marker_obj(params.marker.name, - params.marker.instance, - params.ns.empty() ? params.marker.ns : params.ns); - rgw_obj_index_key cur_marker; - marker_obj.get_index_key(&cur_marker); - - rgw_obj_key end_marker_obj(params.end_marker.name, - params.end_marker.instance, - params.ns.empty() ? params.end_marker.ns : params.ns); - rgw_obj_index_key cur_end_marker; - end_marker_obj.get_index_key(&cur_end_marker); - const bool cur_end_marker_valid = !params.end_marker.empty(); - - rgw_obj_key prefix_obj(params.prefix); - prefix_obj.set_ns(params.ns); - std::string cur_prefix = prefix_obj.get_index_key_name(); - std::string after_delim_s; /* needed in !params.delim.empty() AND later */ - - if (!params.delim.empty()) { - after_delim_s = cls_rgw_after_delim(params.delim); - /* if marker points at a common prefix, fast forward it into its - * upper bound string */ - int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size()); - if (delim_pos >= 0) { - string s = cur_marker.name.substr(0, delim_pos); - s.append(after_delim_s); - cur_marker = s; - } - } - - // we'll stop after this many attempts as long as we return at least - // one entry; but we will also go beyond this number of attempts - // until we return at least one entry - constexpr uint16_t SOFT_MAX_ATTEMPTS = 8; - - rgw_obj_index_key prev_marker; - for (uint16_t attempt = 1; /* empty */; ++attempt) { - ldpp_dout(dpp, 20) << __func__ << - ": starting attempt " << attempt << dendl; - - if (attempt > 1 && !(prev_marker < cur_marker)) { - // we've failed to make forward progress - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << - " marker failed to make forward progress; attempt=" << attempt << - ", prev_marker=" << prev_marker << - ", cur_marker=" << cur_marker << dendl; - break; - } - prev_marker = cur_marker; - - ent_map_t ent_map; - ent_map.reserve(read_ahead); - int r = store->cls_bucket_list_ordered(dpp, - target->get_bucket_info(), - current_index, - shard_id, - cur_marker, - cur_prefix, - params.delim, - read_ahead + 1 - count, - params.list_versions, - attempt, - ent_map, - &truncated, - &cls_filtered, - &cur_marker, - y, - params.force_check_filter); - if (r < 0) { - return r; - }
- - for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) { - rgw_bucket_dir_entry& entry = eiter->second; - rgw_obj_index_key index_key = entry.key; - rgw_obj_key obj(index_key); - - ldpp_dout(dpp, 20) << __func__ << - ": considering entry " << entry.key << dendl; - - /* note that parse_raw_oid() here will not set the correct - * object's instance, as rgw_obj_index_key encodes that - * separately. We don't need to set the instance because it's - * not needed for the checks here and we end up using the raw - * entry for the return vector - */ - bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj); - if (!valid) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << - " could not parse object name: " << obj.name << dendl; - continue; - } - - bool matched_ns = (obj.ns == params.ns); - if (!params.list_versions && !entry.is_visible()) { - ldpp_dout(dpp, 10) << __func__ << - ": skipping not visible entry \"" << entry.key << "\"" << dendl; - continue; - } - - if (params.enforce_ns && !matched_ns) { - if (!params.ns.empty()) { - /* we've iterated past the namespace we're searching -- done now */ - truncated = false; - ldpp_dout(dpp, 10) << __func__ << - ": finished due to getting past requested namespace \"" << - params.ns << "\"" << dendl; - goto done; - } - - /* we're skipping past namespaced objects */ - ldpp_dout(dpp, 20) << __func__ << - ": skipping past namespaced objects, including \"" << entry.key << - "\"" << dendl; - continue; - } - - if (cur_end_marker_valid && cur_end_marker <= index_key) { - truncated = false; - ldpp_dout(dpp, 10) << __func__ << - ": finished due to hitting end marker of \"" << cur_end_marker << - "\" with \"" << entry.key << "\"" << dendl; - goto done; - } - - if (count < max) { - params.marker = index_key; - next_marker = index_key; - } - - if (params.access_list_filter && - ! params.access_list_filter->filter(obj.name, index_key.name)) { - ldpp_dout(dpp, 20) << __func__ << - ": skipping past filtered out entry \"" << entry.key << - "\"" << dendl; - continue; - } - - if (params.prefix.size() && - 0 != obj.name.compare(0, params.prefix.size(), params.prefix)) { - ldpp_dout(dpp, 20) << __func__ << - ": skipping object \"" << entry.key << - "\" that doesn't match prefix \"" << params.prefix << "\"" << dendl; - continue; - } - - if (!params.delim.empty()) { - const int delim_pos = obj.name.find(params.delim, params.prefix.size()); - if (delim_pos >= 0) { - // run either the code where delimiter filtering is done a) - // in the OSD/CLS or b) here. - if (cls_filtered) { - // NOTE: this condition is for the newer versions of the - // OSD that do filtering on the CLS side; such an OSD should - // only find one delimiter, at the end, if it finds any after - // the prefix - if (delim_pos != - int(obj.name.length() - params.delim.length())) { - ldpp_dout(dpp, 0) << "WARNING: " << __func__ << - " found delimiter in place other than the end of " - "the prefix; obj.name=" << obj.name << - ", prefix=" << params.prefix << dendl; - } - if (common_prefixes) { - if (count >= max) { - truncated = true; - ldpp_dout(dpp, 10) << __func__ << - ": stopping early with common prefix \"" << entry.key << - "\" because requested number (" << max << - ") reached (cls filtered)" << dendl; - goto done; - } - - (*common_prefixes)[obj.name] = true; - count++; - } - - ldpp_dout(dpp, 20) << __func__ << - ": finished entry with common prefix \"" << entry.key << - "\" so continuing loop (cls filtered)" << dendl; - continue; - } else { - // NOTE: this condition is for older versions of the OSD - // that do not filter on the CLS side, so the following code - // must do the filtering; once we reach version 16 of ceph, - // this code can be removed along with the conditional that - // can lead this way - - /* extract key -with trailing delimiter- for CommonPrefix */ - string prefix_key = - obj.name.substr(0, delim_pos + params.delim.length()); - - if (common_prefixes && - common_prefixes->find(prefix_key) == common_prefixes->end()) { - if (count >= max) { - truncated = true; - ldpp_dout(dpp, 10) << __func__ << - ": stopping early with common prefix \"" << entry.key << - "\" because requested number (" << max << - ") reached (not cls filtered)" << dendl; - goto done; - } - next_marker = prefix_key; - (*common_prefixes)[prefix_key] = true; - - count++; - } - - ldpp_dout(dpp, 20) << __func__ << - ": finished entry with common prefix \"" << entry.key << - "\" so continuing loop (not cls filtered)" << dendl; - continue; - } // if we're running an older OSD version - } // if a delimiter was found after prefix - } // if a delimiter was passed in - - if (count >= max) { - truncated = true; - ldpp_dout(dpp, 10) << __func__ << - ": stopping early with entry \"" << entry.key << - "\" because requested number (" << max << - ") reached" << dendl; - goto done; - } - - ldpp_dout(dpp, 20) << __func__ << - ": adding entry " << entry.key << " to result" << dendl; - - result->emplace_back(std::move(entry)); - count++; - } // eiter for loop - - // NOTE: the following conditional is needed by older versions of - // the OSD that don't do delimiter filtering on the CLS side; once - // we reach version 16 of ceph, the following conditional and the - // code within can be removed - if (!cls_filtered && !params.delim.empty()) { - int marker_delim_pos = - cur_marker.name.find(params.delim, cur_prefix.size()); - if (marker_delim_pos >= 0) { - std::string skip_after_delim = - cur_marker.name.substr(0, marker_delim_pos); - skip_after_delim.append(after_delim_s); - - ldpp_dout(dpp, 20) << __func__ << - ": skip_after_delim=" << skip_after_delim << dendl; - - if (skip_after_delim > cur_marker.name) { - cur_marker = skip_after_delim; - ldpp_dout(dpp, 20) << __func__ << - ": setting cur_marker=" << cur_marker.name << - "[" << cur_marker.instance << "]" << dendl; - } - } - } // if older osd didn't do delimiter filtering - - ldpp_dout(dpp, 10) << __func__ << - ": end of outer loop, truncated=" << truncated << - ", count=" << count << ", attempt=" << attempt << dendl; - - if (!truncated || count >= (max + 1) / 2) { - // if we finished listing, or if we're returning at least half the - // requested entries, that's enough; S3 and swift protocols allow - // returning fewer than max entries - ldpp_dout(dpp, 10) << __func__ << - ": exiting attempt loop because we reached end (" << truncated << - ") or we're returning half the requested entries (" << count << - " of " << max << ")" << dendl; - break; - } else if (attempt > SOFT_MAX_ATTEMPTS && count >= 1) { - // if we've made at least 8 attempts and we have some, but very - // few, results, return with what we have - ldpp_dout(dpp, 10) << __func__ << - ": exiting attempt loop because we made " << attempt << - " attempts and we're returning " << count << " entries" << dendl; - break; - } - } // for (uint16_t attempt... - -done: - - if (is_truncated) { - *is_truncated = truncated; - } - - return 0; -} // list_objects_ordered
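One subtlety worth calling out in list_objects_ordered() above is the marker fast-forward: when the marker falls inside a common prefix, the code jumps to the prefix's upper bound instead of walking every key beneath it. A simplified stand-in for cls_rgw_after_delim(), assuming a single-character delimiter whose last byte can simply be incremented (an assumption, not the actual cls implementation), shows the idea:

    #include <string>

    static std::string after_delim(const std::string& delim) {
      std::string s = delim;
      s.back()++;              // e.g. "/" (0x2f) becomes "0" (0x30)
      return s;
    }

    // marker "photos/2022/img1.jpg", prefix "photos/", delim "/"
    // -> common prefix is "photos/2022/", so resume at "photos/2022" + "0",
    //    the first possible key sorting after everything under photos/2022/.
    static std::string skip_common_prefix(const std::string& marker,
                                          const std::string& prefix,
                                          const std::string& delim) {
      auto pos = marker.find(delim, prefix.size());
      if (pos == std::string::npos)
        return marker;          // marker is not inside a common prefix
      return marker.substr(0, pos) + after_delim(delim);
    }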
-/** - * Get listing of the objects in a bucket and allow the results to be out - * of order. - * - * Even though there are key differences with the ordered counterpart, - * the parameters are the same to maintain some compatibility. - * - * max: maximum number of results to return - * bucket: bucket to list contents of - * prefix: only return results that match this prefix - * delim: should not be set; if it is we should have indicated an error - * marker: if filled in, begin the listing with this object. - * end_marker: if filled in, end the listing with this object. - * result: the objects are put in here. - * common_prefixes: this is never filled with an unordered list; the param - * is maintained for compatibility - * is_truncated: if number of objects in the bucket is bigger than max, then - * truncated. - */ -int RGWRados::Bucket::List::list_objects_unordered(const DoutPrefixProvider *dpp, - int64_t max_p, - std::vector<rgw_bucket_dir_entry>* result, - std::map<std::string, bool>* common_prefixes, - bool* is_truncated, - optional_yield y) -{ - RGWRados *store = target->get_store(); - int shard_id = target->get_shard_id(); - const auto& current_index = target->get_bucket_info().layout.current_index; - - int count = 0; - bool truncated = true; - - const int64_t max = // protect against memory issues and negative vals - std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p)); - - // read a few extra in each call to cls_bucket_list_unordered in - // case some are filtered out due to namespace matching, versioning, - // filtering, etc. - const int64_t max_read_ahead = 100; - const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead)); - - result->clear(); - - // use a local marker; either the marker will have a previous entry - // or it will be empty; either way it's OK to copy - rgw_obj_key marker_obj(params.marker.name, - params.marker.instance, - params.ns.empty() ? params.marker.ns : params.ns); - rgw_obj_index_key cur_marker; - marker_obj.get_index_key(&cur_marker); - - rgw_obj_key end_marker_obj(params.end_marker.name, - params.end_marker.instance, - params.ns.empty() ? params.end_marker.ns : params.ns); - rgw_obj_index_key cur_end_marker; - end_marker_obj.get_index_key(&cur_end_marker); - const bool cur_end_marker_valid = !params.end_marker.empty(); - - rgw_obj_key prefix_obj(params.prefix); - prefix_obj.set_ns(params.ns); - std::string cur_prefix = prefix_obj.get_index_key_name(); - - while (truncated && count <= max) { - std::vector<rgw_bucket_dir_entry> ent_list; - ent_list.reserve(read_ahead); - - int r = store->cls_bucket_list_unordered(dpp, - target->get_bucket_info(), - current_index, - shard_id, - cur_marker, - cur_prefix, - read_ahead, - params.list_versions, - ent_list, - &truncated, - &cur_marker, - y); - if (r < 0) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << - " cls_bucket_list_unordered returned " << r << " for " << - target->get_bucket_info().bucket << dendl; - return r; - } - - // NB: while regions of ent_list will be sorted, we have no - // guarantee that all items will be sorted since they can cross - // shard boundaries - - for (auto& entry : ent_list) { - rgw_obj_index_key index_key = entry.key; - rgw_obj_key obj(index_key); - - if (count < max) { - params.marker.set(index_key); - next_marker.set(index_key); - } - - /* note that parse_raw_oid() here will not set the correct - * object's instance, as rgw_obj_index_key encodes that - * separately. We don't need to set the instance because it's - * not needed for the checks here and we end up using the raw - * entry for the return vector - */ - bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj); - if (!valid) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << - " could not parse object name: " << obj.name << dendl; - continue; - } - - if (!params.list_versions && !entry.is_visible()) { - ldpp_dout(dpp, 20) << __func__ << - ": skipping \"" << index_key << - "\" because not listing versions and entry not visible" << dendl; - continue; - } - - if (params.enforce_ns && obj.ns != params.ns) { - ldpp_dout(dpp, 20) << __func__ << - ": skipping \"" << index_key << - "\" because namespace does not match" << dendl; - continue; - } - - if (cur_end_marker_valid && cur_end_marker <= index_key) { - // we're not guaranteed items will come in order, so we have - // to loop through all - ldpp_dout(dpp, 20) << __func__ << - ": skipping \"" << index_key << - "\" because after end_marker" << dendl; - continue; - } - - if (params.access_list_filter && - !params.access_list_filter->filter(obj.name, index_key.name)) { - ldpp_dout(dpp, 20) << __func__ << - ": skipping \"" << index_key << - "\" because doesn't match filter" << dendl; - continue; - } - - if (params.prefix.size() && - (0 != obj.name.compare(0, params.prefix.size(), params.prefix))) { - ldpp_dout(dpp, 20) << __func__ << - ": skipping \"" << index_key << - "\" because doesn't match prefix" << dendl; - continue; - } - - if (count >= max) { - truncated = true; - goto done; - } - - result->emplace_back(std::move(entry)); - count++; - } // for (auto& entry : ent_list) - } // while (truncated && count <= max) - -done: - - if (is_truncated) { - *is_truncated = truncated; - } - - return 0; -} // list_objects_unordered - - -/** - * create a rados pool, associated meta info - * returns 0 on success, -ERR# otherwise.
- */ -int RGWRados::create_pool(const DoutPrefixProvider *dpp, const rgw_pool& pool) -{ - librados::IoCtx io_ctx; - constexpr bool create = true; - return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create); -} - -void RGWRados::create_bucket_id(string *bucket_id) -{ - uint64_t iid = instance_id(); - uint64_t bid = next_bucket_id(); - char buf[svc.zone->get_zone_params().get_id().size() + 48]; - snprintf(buf, sizeof(buf), "%s.%" PRIu64 ".%" PRIu64, - svc.zone->get_zone_params().get_id().c_str(), iid, bid); - *bucket_id = buf; -} - -int RGWRados::create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket, - const string& zonegroup_id, - const rgw_placement_rule& placement_rule, - const string& swift_ver_location, - const RGWQuotaInfo * pquota_info, - map& attrs, - RGWBucketInfo& info, - obj_version *pobjv, - obj_version *pep_objv, - real_time creation_time, - rgw_bucket *pmaster_bucket, - uint32_t *pmaster_num_shards, - optional_yield y, - const DoutPrefixProvider *dpp, - bool exclusive) -{ -#define MAX_CREATE_RETRIES 20 /* need to bound retries */ - rgw_placement_rule selected_placement_rule; - RGWZonePlacementInfo rule_info; - - for (int i = 0; i < MAX_CREATE_RETRIES; i++) { - int ret = 0; - ret = svc.zone->select_bucket_placement(dpp, owner, zonegroup_id, placement_rule, - &selected_placement_rule, &rule_info, y); - if (ret < 0) - return ret; - - if (!pmaster_bucket) { - create_bucket_id(&bucket.marker); - bucket.bucket_id = bucket.marker; - } else { - bucket.marker = pmaster_bucket->marker; - bucket.bucket_id = pmaster_bucket->bucket_id; - } - - RGWObjVersionTracker& objv_tracker = info.objv_tracker; - - objv_tracker.read_version.clear(); - - if (pobjv) { - objv_tracker.write_version = *pobjv; - } else { - objv_tracker.generate_new_write_ver(cct); - } - - info.bucket = bucket; - info.owner = owner.user_id; - info.zonegroup = zonegroup_id; - info.placement_rule = selected_placement_rule; - info.swift_ver_location = swift_ver_location; - info.swift_versioning = (!swift_ver_location.empty()); - - init_default_bucket_layout(cct, info.layout, svc.zone->get_zone(), - pmaster_num_shards ? 
- std::optional{*pmaster_num_shards} : - std::nullopt, - rule_info.index_type); - - info.requester_pays = false; - if (real_clock::is_zero(creation_time)) { - info.creation_time = ceph::real_clock::now(); - } else { - info.creation_time = creation_time; - } - if (pquota_info) { - info.quota = *pquota_info; - } - - int r = svc.bi->init_index(dpp, info, info.layout.current_index); - if (r < 0) { - return r; - } - - ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true, dpp); - if (ret == -ECANCELED) { - ret = -EEXIST; - } - if (ret == -EEXIST) { - /* we need to reread the info and return it, caller will have a use for it */ - RGWBucketInfo orig_info; - r = get_bucket_info(&svc, bucket.tenant, bucket.name, orig_info, NULL, null_yield, NULL); - if (r < 0) { - if (r == -ENOENT) { - continue; - } - ldpp_dout(dpp, 0) << "get_bucket_info returned " << r << dendl; - return r; - } - - /* only remove it if it's a different bucket instance */ - if (orig_info.bucket.bucket_id != bucket.bucket_id) { - int r = svc.bi->clean_index(dpp, info, info.layout.current_index); - if (r < 0) { - ldpp_dout(dpp, 0) << "WARNING: could not remove bucket index (r=" << r << ")" << dendl; - } - r = ctl.bucket->remove_bucket_instance_info(info.bucket, info, null_yield, dpp); - if (r < 0) { - ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): failed to remove bucket instance info: bucket instance=" << info.bucket.get_key() << ": r=" << r << dendl; - /* continue anyway */ - } - } - - info = std::move(orig_info); - /* ret == -EEXIST here */ - } - return ret; - } - - /* this is highly unlikely */ - ldpp_dout(dpp, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl; - return -ENOENT; -} - -bool RGWRados::obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj) -{ - get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc); - - return get_obj_data_pool(placement_rule, obj, &raw_obj->pool); -} - -std::string RGWRados::get_cluster_fsid(const DoutPrefixProvider *dpp, optional_yield y) -{ - return svc.rados->cluster_fsid(); -} - -int RGWRados::get_obj_head_ioctx(const DoutPrefixProvider *dpp, - const RGWBucketInfo& bucket_info, - const rgw_obj& obj, - librados::IoCtx *ioctx) -{ - std::string oid, key; - get_obj_bucket_and_oid_loc(obj, oid, key); - - rgw_pool pool; - if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) { - ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj << - ", probably misconfiguration" << dendl; - return -EIO; - } - - int r = open_pool_ctx(dpp, pool, *ioctx, false); - if (r < 0) { - ldpp_dout(dpp, 0) << "ERROR: unable to open data-pool=" << pool.to_str() << - " for obj=" << obj << " with error-code=" << r << dendl; - return r; - } - - ioctx->locator_set_key(key); - - return 0; -} - -int RGWRados::get_obj_head_ref(const DoutPrefixProvider *dpp, - const rgw_placement_rule& target_placement_rule, - const rgw_obj& obj, - rgw_rados_ref *ref) -{ - get_obj_bucket_and_oid_loc(obj, ref->obj.oid, ref->obj.loc); - - rgw_pool pool; - if (!get_obj_data_pool(target_placement_rule, obj, &pool)) { - ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl; - return -EIO; - } - - ref->pool = svc.rados->pool(pool); - - int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams() - .set_mostly_omap(false)); - if (r < 0) { - ldpp_dout(dpp, 0) << "ERROR: failed opening data pool (pool=" << pool << "); r=" << r << dendl; - return r; - } - - 
ref->pool.ioctx().locator_set_key(ref->obj.loc); - - return 0; -} - -int RGWRados::get_obj_head_ref(const DoutPrefixProvider *dpp, - const RGWBucketInfo& bucket_info, - const rgw_obj& obj, - rgw_rados_ref *ref) -{ - return get_obj_head_ref(dpp, bucket_info.placement_rule, obj, ref); -} - -int RGWRados::get_raw_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref) -{ - ref->obj = obj; - - if (ref->obj.oid.empty()) { - ref->obj.oid = obj.pool.to_str(); - ref->obj.pool = svc.zone->get_zone_params().domain_root; - } - ref->pool = svc.rados->pool(obj.pool); - int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams() - .set_mostly_omap(false)); - if (r < 0) { - ldpp_dout(dpp, 0) << "ERROR: failed opening pool (pool=" << obj.pool << "); r=" << r << dendl; - return r; - } - - ref->pool.ioctx().locator_set_key(ref->obj.loc); - - return 0; -} - -int RGWRados::get_system_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref) -{ - return get_raw_obj_ref(dpp, obj, ref); -} - -/* - * fixes an issue where head objects were supposed to have a locator created, but ended - * up without one - */ -int RGWRados::fix_head_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key) -{ - const rgw_bucket& bucket = bucket_info.bucket; - string oid; - string locator; - - rgw_obj obj(bucket, key); - - get_obj_bucket_and_oid_loc(obj, oid, locator); - - if (locator.empty()) { - ldpp_dout(dpp, 20) << "object does not have a locator, nothing to fix" << dendl; - return 0; - } - - librados::IoCtx ioctx; - - int ret = get_obj_head_ioctx(dpp, bucket_info, obj, &ioctx); - if (ret < 0) { - cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl; - return ret; - } - ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */ - - uint64_t size; - bufferlist data; - - struct timespec mtime_ts; - map attrs; - librados::ObjectReadOperation op; - op.getxattrs(&attrs, NULL); - op.stat2(&size, &mtime_ts, NULL); -#define HEAD_SIZE 512 * 1024 - op.read(0, HEAD_SIZE, &data, NULL); - - ret = rgw_rados_operate(dpp, ioctx, oid, &op, &data, null_yield); - if (ret < 0) { - ldpp_dout(dpp, -1) << "ERROR: rgw_rados_operate(oid=" << oid << ") returned ret=" << ret << dendl; - return ret; - } - - if (size > HEAD_SIZE) { - ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl; - return -EIO; - } - - if (size != data.length()) { - ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl; - return -EIO; - } - - if (copy_obj) { - librados::ObjectWriteOperation wop; - - wop.mtime2(&mtime_ts); - - map::iterator iter; - for (iter = attrs.begin(); iter != attrs.end(); ++iter) { - wop.setxattr(iter->first.c_str(), iter->second); - } - - wop.write(0, data); - - ioctx.locator_set_key(locator); - rgw_rados_operate(dpp, ioctx, oid, &wop, null_yield); - } - - if (remove_bad) { - ioctx.locator_set_key(string()); - - ret = ioctx.remove(oid); - if (ret < 0) { - ldpp_dout(dpp, -1) << "ERROR: failed to remove original bad object" << dendl; - return ret; - } - } - - return 0; -} - -int RGWRados::move_rados_obj(const DoutPrefixProvider *dpp, - librados::IoCtx& src_ioctx, - const string& src_oid, const string& src_locator, - librados::IoCtx& dst_ioctx, - const string& dst_oid, const string& dst_locator) -{ - -#define COPY_BUF_SIZE (4 * 1024 * 1024) - bool done = false; - uint64_t 
chunk_size = COPY_BUF_SIZE; - uint64_t ofs = 0; - int ret = 0; - real_time mtime; - struct timespec mtime_ts; - uint64_t size; - - if (src_oid == dst_oid && src_locator == dst_locator) { - return 0; - } - - src_ioctx.locator_set_key(src_locator); - dst_ioctx.locator_set_key(dst_locator); - - do { - bufferlist data; - ObjectReadOperation rop; - ObjectWriteOperation wop; - - if (ofs == 0) { - rop.stat2(&size, &mtime_ts, NULL); - mtime = real_clock::from_timespec(mtime_ts); - } - rop.read(ofs, chunk_size, &data, NULL); - ret = rgw_rados_operate(dpp, src_ioctx, src_oid, &rop, &data, null_yield); - if (ret < 0) { - goto done_err; - } - - if (data.length() == 0) { - break; - } - - if (ofs == 0) { - wop.create(true); /* make it exclusive */ - wop.mtime2(&mtime_ts); - mtime = real_clock::from_timespec(mtime_ts); - } - wop.write(ofs, data); - ret = rgw_rados_operate(dpp, dst_ioctx, dst_oid, &wop, null_yield); - if (ret < 0) { - goto done_err; - } - ofs += data.length(); - done = data.length() != chunk_size; - } while (!done); - - if (ofs != size) { - ldpp_dout(dpp, -1) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid - << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl; - ret = -EIO; - goto done_err; - } - - src_ioctx.remove(src_oid); - - return 0; - -done_err: - // TODO: clean up dst_oid if we created it - ldpp_dout(dpp, -1) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl; - return ret; -} - -/* - * fixes an issue where head objects were supposed to have a locator created, but ended - * up without one - */ -int RGWRados::fix_tail_obj_locator(const DoutPrefixProvider *dpp, - RGWBucketInfo& bucket_info, rgw_obj_key& key, - bool fix, bool *need_fix, optional_yield y) -{ - std::unique_ptr bucket; - driver->get_bucket(nullptr, bucket_info, &bucket); - std::unique_ptr obj = bucket->get_object(key); - - if (need_fix) { - *need_fix = false; - } - - rgw_rados_ref ref; - int r = get_obj_head_ref(dpp, bucket_info, obj->get_obj(), &ref); - if (r < 0) { - return r; - } - - RGWObjState *astate = nullptr; - RGWObjManifest* manifest = nullptr; - RGWObjectCtx rctx(this->driver); - r = get_obj_state(dpp, &rctx, bucket_info, obj.get(), &astate, &manifest, false, y); - if (r < 0) - return r; - - if (manifest) { - RGWObjManifest::obj_iterator miter; - for (miter = manifest->obj_begin(dpp); miter != manifest->obj_end(dpp); ++miter) { - rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(driver); - rgw_obj loc; - string oid; - string locator; - - RGWSI_Tier_RADOS::raw_obj_to_obj(manifest->get_tail_placement().bucket, raw_loc, &loc); - - if (loc.key.ns.empty()) { - /* continue, we're only interested in tail objects */ - continue; - } - - auto& ioctx = ref.pool.ioctx(); - - get_obj_bucket_and_oid_loc(loc, oid, locator); - ref.pool.ioctx().locator_set_key(locator); - - ldpp_dout(dpp, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl; - - r = ioctx.stat(oid, NULL, NULL); - if (r != -ENOENT) { - continue; - } - - string bad_loc; - prepend_bucket_marker(bucket->get_key(), loc.key.name, bad_loc); - - /* create a new ioctx with the bad locator */ - librados::IoCtx src_ioctx; - src_ioctx.dup(ioctx); - src_ioctx.locator_set_key(bad_loc); - - r = src_ioctx.stat(oid, NULL, NULL); - if (r != 0) { - /* cannot find a broken part */ - continue; - } - ldpp_dout(dpp, 20) << __func__ << ": found bad object part: " << loc << dendl; - if (need_fix) { - *need_fix = true; - } - if (fix) { - r = move_rados_obj(dpp, src_ioctx, oid, 
bad_loc, ioctx, oid, locator); - if (r < 0) { - ldpp_dout(dpp, -1) << "ERROR: move_rados_obj() on oid=" << oid << " returned r=" << r << dendl; - } - } - } - } - - return 0; -} - -int RGWRados::BucketShard::init(const rgw_bucket& _bucket, - const rgw_obj& obj, - RGWBucketInfo* bucket_info_out, - const DoutPrefixProvider *dpp) -{ - bucket = _bucket; - - RGWBucketInfo bucket_info; - RGWBucketInfo* bucket_info_p = - bucket_info_out ? bucket_info_out : &bucket_info; - - int ret = store->get_bucket_instance_info(bucket, *bucket_info_p, NULL, NULL, null_yield, dpp); - if (ret < 0) { - return ret; - } - - string oid; - - ret = store->svc.bi_rados->open_bucket_index_shard(dpp, *bucket_info_p, obj.get_hash_object(), &bucket_obj, &shard_id); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl; - return ret; - } - ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj.get_raw_obj() << dendl; - - return 0; -} - -int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, - const rgw_obj& obj) -{ - bucket = bucket_info.bucket; - - int ret = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info, - obj.get_hash_object(), - &bucket_obj, - &shard_id); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl; - return ret; - } - ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl; - - return 0; -} - -int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp, - const RGWBucketInfo& bucket_info, - const rgw::bucket_index_layout_generation& index, - int sid) -{ - bucket = bucket_info.bucket; - shard_id = sid; - - int ret = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info, shard_id, - num_shards(index), index.gen, - &bucket_obj); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl; - return ret; - } - ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl; - - return 0; -} - - -/* Execute @handler on last item in bucket listing for bucket specified - * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing - * to objects matching these criteria. */ -int RGWRados::on_last_entry_in_listing(const DoutPrefixProvider *dpp, - RGWBucketInfo& bucket_info, - const std::string& obj_prefix, - const std::string& obj_delim, - std::function<int(const rgw_bucket_dir_entry&)> handler) -{ - RGWRados::Bucket target(this, bucket_info); - RGWRados::Bucket::List list_op(&target); - - list_op.params.prefix = obj_prefix; - list_op.params.delim = obj_delim; - - ldpp_dout(dpp, 20) << "iterating listing for bucket=" << bucket_info.bucket.name - << ", obj_prefix=" << obj_prefix - << ", obj_delim=" << obj_delim - << dendl; - - bool is_truncated = false; - - boost::optional<rgw_bucket_dir_entry> last_entry; - /* We need to rewind to the last object in a listing. */ - do { - /* List bucket entries in chunks. */ - static constexpr int MAX_LIST_OBJS = 100; - std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS); - - int ret = list_op.list_objects(dpp, MAX_LIST_OBJS, &entries, nullptr, - &is_truncated, null_yield); - if (ret < 0) { - return ret; - } else if (!entries.empty()) { - last_entry = entries.back(); - } - } while (is_truncated); - - if (last_entry) { - return handler(*last_entry); - } - - /* Empty listing - no items we can run handler on. */ - return 0; -}
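on_last_entry_in_listing() above pages through the listing in chunks of 100, keeping only the final entry of the final page, and hands that entry to the callback. A hedged usage sketch (archive_binfo and the prefix are assumed to be in scope, mirroring the Swift-versioning caller below):

    auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
      ldpp_dout(dpp, 20) << "last archived entry: " << entry.key << dendl;
      return 0;   // a real handler would e.g. restore or delete it
    };
    int r = store->on_last_entry_in_listing(dpp, archive_binfo,
                                            "obj-prefix" /* illustrative */,
                                            std::string() /* no delim */,
                                            handler);
    // returns the handler's result, or 0 if the listing was empty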
-bool RGWRados::swift_versioning_enabled(rgw::sal::Bucket* bucket) const -{ - return bucket->get_info().has_swift_versioning() && - bucket->get_info().swift_ver_location.size(); -} - -int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx, - const rgw_user& user, - rgw::sal::Bucket* bucket, - rgw::sal::Object* obj, - const DoutPrefixProvider *dpp, - optional_yield y) -{ - if (! swift_versioning_enabled(bucket)) { - return 0; - } - - obj->set_atomic(); - - RGWObjState * state = nullptr; - RGWObjManifest *manifest = nullptr; - int r = get_obj_state(dpp, &obj_ctx, bucket->get_info(), obj, &state, &manifest, false, y); - if (r < 0) { - return r; - } - - if (!state->exists) { - return 0; - } - - const string& src_name = obj->get_oid(); - char buf[src_name.size() + 32]; - struct timespec ts = ceph::real_clock::to_timespec(state->mtime); - snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(), - src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000); - - RGWBucketInfo dest_bucket_info; - - r = get_bucket_info(&svc, bucket->get_tenant(), bucket->get_info().swift_ver_location, dest_bucket_info, NULL, null_yield, NULL); - if (r < 0) { - ldpp_dout(dpp, 10) << "failed to read dest bucket info: r=" << r << dendl; - if (r == -ENOENT) { - return -ERR_PRECONDITION_FAILED; - } - return r; - } - - if (dest_bucket_info.owner != bucket->get_info().owner) { - return -ERR_PRECONDITION_FAILED; - } - - rgw::sal::RadosBucket dest_bucket(driver, dest_bucket_info); - rgw::sal::RadosObject dest_obj(driver, rgw_obj_key(buf), &dest_bucket); - - if (dest_bucket_info.versioning_enabled()){ - dest_obj.gen_rand_obj_instance_name(); - } - - dest_obj.set_atomic(); - - rgw_zone_id no_zone; - - r = copy_obj(obj_ctx, - user, - NULL, /* req_info *info */ - no_zone, - &dest_obj, - obj, - &dest_bucket, - bucket, - bucket->get_placement_rule(), - NULL, /* time_t *src_mtime */ - NULL, /* time_t *mtime */ - NULL, /* const time_t *mod_ptr */ - NULL, /* const time_t *unmod_ptr */ - false, /* bool high_precision_time */ - NULL, /* const char *if_match */ - NULL, /* const char *if_nomatch */ - RGWRados::ATTRSMOD_NONE, - true, /* bool copy_if_newer */ - state->attrset, - RGWObjCategory::Main, - 0, /* uint64_t olh_epoch */ - real_time(), /* time_t delete_at */ - NULL, /* string *version_id */ - NULL, /* string *ptag */ - NULL, /* string *petag */ - NULL, /* void (*progress_cb)(off_t, void *) */ - NULL, /* void *progress_data */ - dpp, - null_yield); - if (r == -ECANCELED || r == -ENOENT) { - /* Has already been overwritten, meaning another rgw process already - * copied it out */ - return 0; - } - - return r; -} - -int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx, - const rgw_user& user, - rgw::sal::Bucket* bucket, - rgw::sal::Object* obj, - bool& restored, /* out */ - const DoutPrefixProvider *dpp) -{ - if (! swift_versioning_enabled(bucket)) { - return 0; - } - - /* Bucket info of the bucket that stores previous versions of our object. */ - RGWBucketInfo archive_binfo; - - int ret = get_bucket_info(&svc, bucket->get_tenant(), - bucket->get_info().swift_ver_location, - archive_binfo, nullptr, null_yield, nullptr); - if (ret < 0) { - return ret; - } - - /* Abort the operation if the bucket storing our archive belongs to someone - * else. This is a limitation in comparison to Swift as we aren't taking ACLs - * into consideration. For now we can live with that. - * - * TODO: delegate this check to an upper layer and compare with ACLs.
*/ - if (bucket->get_info().owner != archive_binfo.owner) { - return -EPERM; - } - - /* This code will be executed on latest version of the object. */ - const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int { - rgw_zone_id no_zone; - - /* We don't support object versioning of Swift API on those buckets that - * are already versioned using the S3 mechanism. This affects also bucket - * storing archived objects. Otherwise the delete operation would create - * a deletion marker. */ - if (archive_binfo.versioned()) { - restored = false; - return -ERR_PRECONDITION_FAILED; - } - - /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly - * irrelevant and may be safely skipped. */ - std::map no_attrs; - - rgw::sal::RadosBucket archive_bucket(driver, archive_binfo); - rgw::sal::RadosObject archive_obj(driver, entry.key, &archive_bucket); - - if (bucket->versioning_enabled()){ - obj->gen_rand_obj_instance_name(); - } - - archive_obj.set_atomic(); - obj->set_atomic(); - - int ret = copy_obj(obj_ctx, - user, - nullptr, /* req_info *info */ - no_zone, - obj, /* dest obj */ - &archive_obj, /* src obj */ - bucket, /* dest bucket info */ - &archive_bucket, /* src bucket info */ - bucket->get_placement_rule(), /* placement_rule */ - nullptr, /* time_t *src_mtime */ - nullptr, /* time_t *mtime */ - nullptr, /* const time_t *mod_ptr */ - nullptr, /* const time_t *unmod_ptr */ - false, /* bool high_precision_time */ - nullptr, /* const char *if_match */ - nullptr, /* const char *if_nomatch */ - RGWRados::ATTRSMOD_NONE, - true, /* bool copy_if_newer */ - no_attrs, - RGWObjCategory::Main, - 0, /* uint64_t olh_epoch */ - real_time(), /* time_t delete_at */ - nullptr, /* string *version_id */ - nullptr, /* string *ptag */ - nullptr, /* string *petag */ - nullptr, /* void (*progress_cb)(off_t, void *) */ - nullptr, /* void *progress_data */ - dpp, - null_yield); - if (ret == -ECANCELED || ret == -ENOENT) { - /* Has already been overwritten, meaning another rgw process already - * copied it out */ - return 0; - } else if (ret < 0) { - return ret; - } else { - restored = true; - } - - /* Need to remove the archived copy. 
*/ - ret = delete_obj(dpp, archive_binfo, &archive_obj, - archive_binfo.versioning_status()); - - return ret; - }; - - const std::string& obj_name = obj->get_oid(); - const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size() - % obj_name); - - return on_last_entry_in_listing(dpp, archive_binfo, prefix, std::string(), - handler); -} - -int RGWRados::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp, - uint64_t size, uint64_t accounted_size, - map& attrs, - bool assume_noent, bool modify_tail, - void *_index_op, optional_yield y) -{ - RGWRados::Bucket::UpdateIndex *index_op = static_cast(_index_op); - RGWRados *store = target->get_store(); - - ObjectWriteOperation op; -#ifdef WITH_LTTNG - const req_state* s = get_req_state(); - string req_id; - if (!s) { - // fake req_id - req_id = store->svc.zone_utils->unique_id(store->driver->get_new_req_id()); - } else { - req_id = s->req_id; - } -#endif - - RGWObjState *state; - RGWObjManifest *manifest = nullptr; - int r = target->get_state(dpp, &state, &manifest, false, y, assume_noent); - if (r < 0) - return r; - - rgw_obj obj = target->get_obj(); - - if (obj.get_oid().empty()) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl; - return -EIO; - } - - rgw_rados_ref ref; - r = store->get_obj_head_ref(dpp, target->get_meta_placement_rule(), obj, &ref); - if (r < 0) - return r; - - bool is_olh = state->is_olh; - - bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0; - - const string *ptag = meta.ptag; - if (!ptag && !index_op->get_optag()->empty()) { - ptag = index_op->get_optag(); - } - r = target->prepare_atomic_modification(dpp, op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail, y); - if (r < 0) - return r; - - if (real_clock::is_zero(meta.set_mtime)) { - meta.set_mtime = real_clock::now(); - } - - if (target->get_bucket_info().obj_lock_enabled() && target->get_bucket_info().obj_lock.has_rule() && meta.flags == PUT_OBJ_CREATE) { - auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION); - if (iter == attrs.end()) { - real_time lock_until_date = target->get_bucket_info().obj_lock.get_lock_until_date(meta.set_mtime); - string mode = target->get_bucket_info().obj_lock.get_mode(); - RGWObjectRetention obj_retention(mode, lock_until_date); - bufferlist bl; - obj_retention.encode(bl); - op.setxattr(RGW_ATTR_OBJECT_RETENTION, bl); - } - } - - if (state->is_olh) { - op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag); - } - - struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime); - op.mtime2(&mtime_ts); - - if (meta.data) { - /* if we want to overwrite the data, we also want to overwrite the - xattrs, so just remove the object */ - op.write_full(*meta.data); - if (state->compressed) { - uint32_t alloc_hint_flags = librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE; - op.set_alloc_hint2(0, 0, alloc_hint_flags); - } - } - - string etag; - string content_type; - bufferlist acl_bl; - string storage_class; - - map::iterator iter; - if (meta.rmattrs) { - for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) { - const string& name = iter->first; - op.rmxattr(name.c_str()); - } - } - - if (meta.manifest) { - storage_class = meta.manifest->get_tail_placement().placement_rule.storage_class; - - /* remove existing manifest attr */ - iter = attrs.find(RGW_ATTR_MANIFEST); - if (iter != attrs.end()) - attrs.erase(iter); - - bufferlist bl; - encode(*meta.manifest, bl); - op.setxattr(RGW_ATTR_MANIFEST, bl); - } - - for (iter = attrs.begin(); iter != attrs.end(); 
++iter) { - const string& name = iter->first; - bufferlist& bl = iter->second; - - if (!bl.length()) - continue; - - op.setxattr(name.c_str(), bl); - - if (name.compare(RGW_ATTR_ETAG) == 0) { - etag = rgw_bl_str(bl); - } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) { - content_type = rgw_bl_str(bl); - } else if (name.compare(RGW_ATTR_ACL) == 0) { - acl_bl = bl; - } - } - if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) { - cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER); - } - - if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) { - bufferlist bl; - encode(store->svc.zone->get_zone_short_id(), bl); - op.setxattr(RGW_ATTR_SOURCE_ZONE, bl); - } - - if (!storage_class.empty()) { - bufferlist bl; - bl.append(storage_class); - op.setxattr(RGW_ATTR_STORAGE_CLASS, bl); - } - - if (!op.size()) - return 0; - - uint64_t epoch; - int64_t poolid; - bool orig_exists; - uint64_t orig_size; - - if (!reset_obj) { //Multipart upload, it has immutable head. - orig_exists = false; - orig_size = 0; - } else { - orig_exists = state->exists; - orig_size = state->accounted_size; - } - - bool versioned_target = (meta.olh_epoch && *meta.olh_epoch > 0) || - !obj.key.instance.empty(); - - bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target); - - if (versioned_op) { - index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP); - } - - if (!index_op->is_prepared()) { - tracepoint(rgw_rados, prepare_enter, req_id.c_str()); - r = index_op->prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y); - tracepoint(rgw_rados, prepare_exit, req_id.c_str()); - if (r < 0) - return r; - } - - auto& ioctx = ref.pool.ioctx(); - - tracepoint(rgw_rados, operate_enter, req_id.c_str()); - r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); - tracepoint(rgw_rados, operate_exit, req_id.c_str()); - if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under, - or -ENOENT if was removed, or -EEXIST if it did not exist - before and now it does */ - if (r == -EEXIST && assume_noent) { - target->invalidate_state(); - return r; - } - goto done_cancel; - } - - epoch = ioctx.get_last_version(); - poolid = ioctx.get_id(); - - r = target->complete_atomic_modification(dpp); - if (r < 0) { - ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl; - } - - tracepoint(rgw_rados, complete_enter, req_id.c_str()); - r = index_op->complete(dpp, poolid, epoch, size, accounted_size, - meta.set_mtime, etag, content_type, - storage_class, &acl_bl, - meta.category, meta.remove_objs, meta.user_data, meta.appendable); - tracepoint(rgw_rados, complete_exit, req_id.c_str()); - if (r < 0) - goto done_cancel; - - if (meta.mtime) { - *meta.mtime = meta.set_mtime; - } - - /* note that index_op was using state so we couldn't invalidate it earlier */ - target->invalidate_state(); - state = NULL; - - if (versioned_op && meta.olh_epoch) { - r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), target->get_target(), false, NULL, *meta.olh_epoch, real_time(), false, y, meta.zones_trace); - if (r < 0) { - return r; - } - } - - if (!real_clock::is_zero(meta.delete_at)) { - rgw_obj_index_key obj_key; - obj.key.get_index_key(&obj_key); - - r = store->obj_expirer->hint_add(dpp, meta.delete_at, obj.bucket.tenant, obj.bucket.name, - obj.bucket.bucket_id, obj_key); - if (r < 0) { - ldpp_dout(dpp, 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl; - /* ignoring error, nothing we can do at this point */ - } - } - meta.canceled = 
false;
-
-  /* update quota cache */
-  if (meta.completeMultipart){
-    store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
-                                       0, orig_size);
-  }
-  else {
-    store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
-                                       accounted_size, orig_size);
-  }
-  return 0;
-
-done_cancel:
-  int ret = index_op->cancel(dpp, meta.remove_objs);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
-  }
-
-  meta.canceled = true;
-
-  /* we lost in a race. There are a few options:
-   *  - existing object was rewritten (ECANCELED)
-   *  - non existing object was created (EEXIST)
-   *  - object was removed (ENOENT)
-   *  should treat it as a success
-   */
-  if (meta.if_match == NULL && meta.if_nomatch == NULL) {
-    if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
-      r = 0;
-    }
-  } else {
-    if (meta.if_match != NULL) {
-      // only overwrite existing object
-      if (strcmp(meta.if_match, "*") == 0) {
-        if (r == -ENOENT) {
-          r = -ERR_PRECONDITION_FAILED;
-        } else if (r == -ECANCELED) {
-          r = 0;
-        }
-      }
-    }
-
-    if (meta.if_nomatch != NULL) {
-      // only create a new object
-      if (strcmp(meta.if_nomatch, "*") == 0) {
-        if (r == -EEXIST) {
-          r = -ERR_PRECONDITION_FAILED;
-        } else if (r == -ENOENT) {
-          r = 0;
-        }
-      }
-    }
-  }
-
-  return r;
-}
-
-int RGWRados::Object::Write::write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size,
-                                        map<string, bufferlist>& attrs, optional_yield y)
-{
-  RGWBucketInfo& bucket_info = target->get_bucket_info();
-
-  RGWRados::Bucket bop(target->get_store(), bucket_info);
-  RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
-  index_op.set_zones_trace(meta.zones_trace);
-
-  bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
-  int r;
-  if (assume_noent) {
-    r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
-    if (r == -EEXIST) {
-      assume_noent = false;
-    }
-  }
-  if (!assume_noent) {
-    r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
-  }
-  return r;
-}
-
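write_meta() above is an optimistic create: when the request carries no ETag preconditions it first assumes the object does not yet exist, and only on -EEXIST falls back to the read-modify-write path. A minimal standalone sketch of that retry shape, where do_write() is a hypothetical stand-in for _do_write_meta(), not the RGW API:

    #include <cerrno>

    int do_write(bool assume_noent); // hypothetical; returns -EEXIST on conflict

    int write_with_retry(bool has_preconditions)
    {
      bool assume_noent = !has_preconditions; // fast path: expect a fresh object
      int r = 0;
      if (assume_noent) {
        r = do_write(true);
        if (r == -EEXIST) {
          assume_noent = false; // object exists after all; take the slow path
        }
      }
      if (!assume_noent) {
        r = do_write(false); // slow path: read current state, then update
      }
      return r;
    }
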
len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl; - return -EIO; - } - - JSONDecoder::decode_json("attrs", src_attrs, &jp); - - auto iter = src_attrs.find(RGW_ATTR_COMPRESSION); - if (iter != src_attrs.end()) { - const bufferlist bl = std::move(iter->second); - src_attrs.erase(iter); // don't preserve source compression info - - if (try_etag_verify) { - // if we're trying to verify etags, we need to convert compressed - // ranges in the manifest back into logical multipart part offsets - RGWCompressionInfo info; - bool compressed = false; - int r = rgw_compression_info_from_attr(bl, compressed, info); - if (r < 0) { - ldpp_dout(dpp, 4) << "failed to decode compression info, " - "disabling etag verification" << dendl; - try_etag_verify = false; - } else if (compressed) { - compression_info = std::move(info); - } - } - } - /* We need the manifest to recompute the ETag for verification */ - iter = src_attrs.find(RGW_ATTR_MANIFEST); - if (iter != src_attrs.end()) { - manifest_bl = std::move(iter->second); - src_attrs.erase(iter); - } - - // filter out olh attributes - iter = src_attrs.lower_bound(RGW_ATTR_OLH_PREFIX); - while (iter != src_attrs.end()) { - if (!boost::algorithm::starts_with(iter->first, RGW_ATTR_OLH_PREFIX)) { - break; - } - iter = src_attrs.erase(iter); - } - } - - int ret = attrs_handler(src_attrs); - if (ret < 0) { - return ret; - } - - if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) { - //do not compress if object is encrypted - compressor = boost::in_place(cct, plugin, filter); - // add a filter that buffers data so we don't try to compress tiny blocks. - // libcurl reads in 16k at a time, and we need at least 64k to get a good - // compression ratio - constexpr unsigned buffer_size = 512 * 1024; - buffering = boost::in_place(&*compressor, buffer_size); - filter = &*buffering; - } - - /* - * Presently we don't support ETag based verification if encryption is - * requested. We can enable simultaneous support once we have a mechanism - * to know the sequence in which the filters must be applied. - */ - if (try_etag_verify && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) { - ret = rgw::putobj::create_etag_verifier(dpp, cct, filter, manifest_bl, - compression_info, - etag_verifier); - if (ret < 0) { - ldpp_dout(dpp, 4) << "failed to initial etag verifier, " - "disabling etag verification" << dendl; - } else { - filter = etag_verifier.get(); - } - } - - need_to_process_attrs = false; - - return 0; - } - - int handle_data(bufferlist& bl, bool *pause) override { - if (progress_cb) { - progress_cb(data_len, progress_data); - } - if (extra_data_left) { - uint64_t extra_len = bl.length(); - if (extra_len > extra_data_left) - extra_len = extra_data_left; - - bufferlist extra; - bl.splice(0, extra_len, &extra); - extra_data_bl.append(extra); - - extra_data_left -= extra_len; - if (extra_data_left == 0) { - int res = process_attrs(); - if (res < 0) - return res; - } - ofs += extra_len; - if (bl.length() == 0) { - return 0; - } - } - if (need_to_process_attrs) { - /* need to call process_attrs() even if we don't get any attrs, - * need it to call attrs_handler(). 
- */ - int res = process_attrs(); - if (res < 0) { - return res; - } - } - - ceph_assert(uint64_t(ofs) >= extra_data_len); - - uint64_t size = bl.length(); - ofs += size; - - const uint64_t lofs = data_len; - data_len += size; - - return filter->process(std::move(bl), lofs); - } - - int flush() { - return filter->process({}, data_len); - } - - bufferlist& get_extra_data() { return extra_data_bl; } - - map& get_attrs() { return src_attrs; } - - void set_extra_data_len(uint64_t len) override { - extra_data_left = len; - RGWHTTPStreamRWRequest::ReceiveCB::set_extra_data_len(len); - } - - uint64_t get_data_len() { - return data_len; - } - - std::string get_verifier_etag() { - if (etag_verifier) { - etag_verifier->calculate_etag(); - return etag_verifier->get_calculated_etag(); - } else { - return ""; - } - } -}; - -/* - * prepare attrset depending on attrs_mod. - */ -static void set_copy_attrs(map& src_attrs, - map& attrs, - RGWRados::AttrsMod attrs_mod) -{ - switch (attrs_mod) { - case RGWRados::ATTRSMOD_NONE: - attrs = src_attrs; - break; - case RGWRados::ATTRSMOD_REPLACE: - if (!attrs[RGW_ATTR_ETAG].length()) { - attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG]; - } - if (!attrs[RGW_ATTR_TAIL_TAG].length()) { - auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG); - if (ttiter != src_attrs.end()) { - attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG]; - } - } - break; - case RGWRados::ATTRSMOD_MERGE: - for (map::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) { - if (attrs.find(it->first) == attrs.end()) { - attrs[it->first] = it->second; - } - } - break; - } -} - -int RGWRados::rewrite_obj(rgw::sal::Object* obj, const DoutPrefixProvider *dpp, optional_yield y) -{ - RGWObjectCtx rctx(this->driver); - rgw::sal::Attrs attrset; - uint64_t obj_size; - ceph::real_time mtime; - RGWRados::Object op_target(this, obj->get_bucket(), rctx, obj); - RGWRados::Object::Read read_op(&op_target); - - read_op.params.attrs = &attrset; - read_op.params.obj_size = &obj_size; - read_op.params.lastmod = &mtime; - - int ret = read_op.prepare(y, dpp); - if (ret < 0) - return ret; - - attrset.erase(RGW_ATTR_ID_TAG); - attrset.erase(RGW_ATTR_TAIL_TAG); - attrset.erase(RGW_ATTR_STORAGE_CLASS); - - return this->copy_obj_data(rctx, obj->get_bucket(), - obj->get_bucket()->get_info().placement_rule, - read_op, obj_size - 1, obj, NULL, mtime, - attrset, 0, real_time(), NULL, dpp, y); -} - -struct obj_time_weight { - real_time mtime; - uint32_t zone_short_id; - uint64_t pg_ver; - bool high_precision; - - obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {} - - bool compare_low_precision(const obj_time_weight& rhs) { - struct timespec l = ceph::real_clock::to_timespec(mtime); - struct timespec r = ceph::real_clock::to_timespec(rhs.mtime); - l.tv_nsec = 0; - r.tv_nsec = 0; - if (l > r) { - return false; - } - if (l < r) { - return true; - } - if (!zone_short_id || !rhs.zone_short_id) { - /* don't compare zone ids, if one wasn't provided */ - return false; - } - if (zone_short_id != rhs.zone_short_id) { - return (zone_short_id < rhs.zone_short_id); - } - return (pg_ver < rhs.pg_ver); - - } - - bool operator<(const obj_time_weight& rhs) { - if (!high_precision || !rhs.high_precision) { - return compare_low_precision(rhs); - } - if (mtime > rhs.mtime) { - return false; - } - if (mtime < rhs.mtime) { - return true; - } - if (!zone_short_id || !rhs.zone_short_id) { - /* don't compare zone ids, if one wasn't provided */ - return false; - } - if (zone_short_id != rhs.zone_short_id) { - return 
(zone_short_id < rhs.zone_short_id); - } - return (pg_ver < rhs.pg_ver); - } - - void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) { - mtime = _mtime; - zone_short_id = _short_id; - pg_ver = _pg_ver; - } - - void init(RGWObjState *state) { - mtime = state->mtime; - zone_short_id = state->zone_short_id; - pg_ver = state->pg_ver; - } -}; - -inline ostream& operator<<(ostream& out, const obj_time_weight &o) { - out << o.mtime; - - if (o.zone_short_id != 0 || o.pg_ver != 0) { - out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]"; - } - - return out; -} - -class RGWGetExtraDataCB : public RGWHTTPStreamRWRequest::ReceiveCB { - bufferlist extra_data; -public: - RGWGetExtraDataCB() {} - int handle_data(bufferlist& bl, bool *pause) override { - int bl_len = (int)bl.length(); - if (extra_data.length() < extra_data_len) { - off_t max = extra_data_len - extra_data.length(); - if (max > bl_len) { - max = bl_len; - } - bl.splice(0, max, &extra_data); - } - return bl_len; - } - - bufferlist& get_extra_data() { - return extra_data; - } -}; - -int RGWRados::stat_remote_obj(const DoutPrefixProvider *dpp, - RGWObjectCtx& obj_ctx, - const rgw_user& user_id, - req_info *info, - const rgw_zone_id& source_zone, - rgw::sal::Object* src_obj, - const RGWBucketInfo *src_bucket_info, - real_time *src_mtime, - uint64_t *psize, - const real_time *mod_ptr, - const real_time *unmod_ptr, - bool high_precision_time, - const char *if_match, - const char *if_nomatch, - map *pattrs, - map *pheaders, - string *version_id, - string *ptag, - string *petag) -{ - /* source is in a different zonegroup, copy from there */ - - RGWRESTStreamRWRequest *in_stream_req; - string tag; - map src_attrs; - append_rand_alpha(cct, tag, tag, 32); - obj_time_weight set_mtime_weight; - set_mtime_weight.high_precision = high_precision_time; - - RGWRESTConn *conn; - if (source_zone.empty()) { - if (!src_bucket_info || src_bucket_info->zonegroup.empty()) { - /* source is in the master zonegroup */ - conn = svc.zone->get_master_conn(); - } else { - auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map(); - map::iterator iter = zonegroup_conn_map.find(src_bucket_info->zonegroup); - if (iter == zonegroup_conn_map.end()) { - ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl; - return -ENOENT; - } - conn = iter->second; - } - } else { - auto& zone_conn_map = svc.zone->get_zone_conn_map(); - auto iter = zone_conn_map.find(source_zone); - if (iter == zone_conn_map.end()) { - ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl; - return -ENOENT; - } - conn = iter->second; - } - - RGWGetExtraDataCB cb; - map req_headers; - real_time set_mtime; - - const real_time *pmod = mod_ptr; - - obj_time_weight dest_mtime_weight; - - constexpr bool prepend_meta = true; - constexpr bool get_op = true; - constexpr bool rgwx_stat = true; - constexpr bool sync_manifest = true; - constexpr bool skip_decrypt = true; - int ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr, - dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver, - prepend_meta, get_op, rgwx_stat, - sync_manifest, skip_decrypt, - true, &cb, &in_stream_req); - if (ret < 0) { - return ret; - } - - ret = conn->complete_request(in_stream_req, nullptr, &set_mtime, psize, - nullptr, pheaders, null_yield); - if (ret < 0) { - return ret; - } - - bufferlist& extra_data_bl = cb.get_extra_data(); - if (extra_data_bl.length()) { - JSONParser jp; - if 
(!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
-      ldpp_dout(dpp, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
-      return -EIO;
-    }
-
-    JSONDecoder::decode_json("attrs", src_attrs, &jp);
-
-    src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
-  }
-
-  if (src_mtime) {
-    *src_mtime = set_mtime;
-  }
-
-  if (petag) {
-    map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
-    if (iter != src_attrs.end()) {
-      bufferlist& etagbl = iter->second;
-      *petag = etagbl.to_str();
-      while (petag->size() > 0 && (*petag)[petag->size() - 1] == '\0') {
-        *petag = petag->substr(0, petag->size() - 1);
-      }
-    }
-  }
-
-  if (pattrs) {
-    *pattrs = std::move(src_attrs);
-  }
-
-  return 0;
-}
-
-int RGWFetchObjFilter_Default::filter(CephContext *cct,
-                                      const rgw_obj_key& source_key,
-                                      const RGWBucketInfo& dest_bucket_info,
-                                      std::optional<rgw_placement_rule> dest_placement_rule,
-                                      const map<string, bufferlist>& obj_attrs,
-                                      std::optional<rgw_user> *poverride_owner,
-                                      const rgw_placement_rule **prule)
-{
-  const rgw_placement_rule *ptail_rule = (dest_placement_rule ? &(*dest_placement_rule) : nullptr);
-  if (!ptail_rule) {
-    auto iter = obj_attrs.find(RGW_ATTR_STORAGE_CLASS);
-    if (iter != obj_attrs.end()) {
-      dest_rule.storage_class = iter->second.to_str();
-      dest_rule.inherit_from(dest_bucket_info.placement_rule);
-      ptail_rule = &dest_rule;
-    } else {
-      ptail_rule = &dest_bucket_info.placement_rule;
-    }
-  }
-  *prule = ptail_rule;
-  return 0;
-}
-
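The default filter above resolves the destination placement in three steps: an explicitly requested rule wins, otherwise the source object's storage class is inherited into the destination bucket's rule, otherwise the bucket default applies. A simplified sketch of that precedence, with illustrative types and attribute key (not the RGW definitions):

    #include <map>
    #include <optional>
    #include <string>

    struct Rule { std::string name; std::string storage_class; };

    Rule choose_rule(const std::optional<Rule>& requested,
                     const std::map<std::string, std::string>& obj_attrs,
                     const Rule& bucket_default)
    {
      if (requested)
        return *requested;                        // explicit rule wins
      auto it = obj_attrs.find("storage_class");  // stands in for RGW_ATTR_STORAGE_CLASS
      if (it != obj_attrs.end()) {
        Rule r = bucket_default;                  // inherit the bucket's rule...
        r.storage_class = it->second;             // ...but keep the source's class
        return r;
      }
      return bucket_default;
    }
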
-int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
-                               const rgw_user& user_id,
-                               req_info *info,
-                               const rgw_zone_id& source_zone,
-                               rgw::sal::Object* dest_obj,
-                               rgw::sal::Object* src_obj,
-                               rgw::sal::Bucket* dest_bucket,
-                               rgw::sal::Bucket* src_bucket,
-                               std::optional<rgw_placement_rule> dest_placement_rule,
-                               real_time *src_mtime,
-                               real_time *mtime,
-                               const real_time *mod_ptr,
-                               const real_time *unmod_ptr,
-                               bool high_precision_time,
-                               const char *if_match,
-                               const char *if_nomatch,
-                               AttrsMod attrs_mod,
-                               bool copy_if_newer,
-                               rgw::sal::Attrs& attrs,
-                               RGWObjCategory category,
-                               std::optional<uint64_t> olh_epoch,
-                               real_time delete_at,
-                               string *ptag,
-                               string *petag,
-                               void (*progress_cb)(off_t, void *),
-                               void *progress_data,
-                               const DoutPrefixProvider *dpp,
-                               RGWFetchObjFilter *filter,
-                               rgw_zone_set *zones_trace,
-                               std::optional<uint64_t>* bytes_transferred)
-{
-  /* source is in a different zonegroup, copy from there */
-
-  RGWRESTStreamRWRequest *in_stream_req;
-  string tag;
-  int i;
-  append_rand_alpha(cct, tag, tag, 32);
-  obj_time_weight set_mtime_weight;
-  set_mtime_weight.high_precision = high_precision_time;
-  int ret;
-
-  rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
-  using namespace rgw::putobj;
-  AtomicObjectProcessor processor(&aio, this->driver, nullptr, user_id,
-                                  obj_ctx, dest_obj->clone(), olh_epoch,
-                                  tag, dpp, null_yield);
-  RGWRESTConn *conn;
-  auto& zone_conn_map = svc.zone->get_zone_conn_map();
-  auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
-  if (source_zone.empty()) {
-    if (!src_bucket || src_bucket->get_info().zonegroup.empty()) {
-      /* source is in the master zonegroup */
-      conn = svc.zone->get_master_conn();
-    } else {
-      map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket->get_info().zonegroup);
-      if (iter == zonegroup_conn_map.end()) {
-        ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
-        return -ENOENT;
-      }
-      conn = iter->second;
-    }
-  } else {
-    auto iter = zone_conn_map.find(source_zone);
-    if (iter == zone_conn_map.end()) {
-      ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl;
-      return -ENOENT;
-    }
-    conn = iter->second;
-  }
-
-  boost::optional<RGWPutObj_Compress> compressor;
-  CompressorRef plugin;
-
-  RGWFetchObjFilter_Default source_filter;
-  if (!filter) {
-    filter = &source_filter;
-  }
-
-  std::optional<rgw_user> override_owner;
-
-  RGWRadosPutObj cb(dpp, cct, plugin, compressor, &processor, progress_cb, progress_data,
-                    [&](map<string, bufferlist>& obj_attrs) {
-                      const rgw_placement_rule *ptail_rule;
-
-                      int ret = filter->filter(cct,
-                                               src_obj->get_key(),
-                                               dest_bucket->get_info(),
-                                               dest_placement_rule,
-                                               obj_attrs,
-                                               &override_owner,
-                                               &ptail_rule);
-                      if (ret < 0) {
-                        ldpp_dout(dpp, 5) << "Aborting fetch: source object filter returned ret=" << ret << dendl;
-                        return ret;
-                      }
-
-                      processor.set_tail_placement(*ptail_rule);
-
-                      const auto& compression_type = svc.zone->get_zone_params().get_compression_type(*ptail_rule);
-                      if (compression_type != "none") {
-                        plugin = Compressor::create(cct, compression_type);
-                        if (!plugin) {
-                          ldpp_dout(dpp, 1) << "Cannot load plugin for compression type "
-                                            << compression_type << dendl;
-                        }
-                      }
-
-                      ret = processor.prepare(null_yield);
-                      if (ret < 0) {
-                        return ret;
-                      }
-                      return 0;
-                    });
-
-  string etag;
-  real_time set_mtime;
-  uint64_t expected_size = 0;
-
-  RGWObjState *dest_state = NULL;
-  RGWObjManifest *manifest = nullptr;
-
-  const real_time *pmod = mod_ptr;
-
-  obj_time_weight dest_mtime_weight;
-
-  if (copy_if_newer) {
-    /* need to get mtime for destination */
-    ret = get_obj_state(dpp, &obj_ctx, dest_bucket->get_info(), dest_obj, &dest_state, &manifest, false, null_yield);
-    if (ret < 0)
-      goto set_err_state;
-
-    if (!real_clock::is_zero(dest_state->mtime)) {
-      dest_mtime_weight.init(dest_state);
-      pmod = &dest_mtime_weight.mtime;
-    }
-  }
-
-  static constexpr bool prepend_meta = true;
-  static constexpr bool get_op = true;
-  static constexpr bool rgwx_stat = false;
-  static constexpr bool sync_manifest = true;
-  static constexpr bool skip_decrypt = true;
-  ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr,
-                      dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
-                      prepend_meta, get_op, rgwx_stat,
-                      sync_manifest, skip_decrypt,
-                      true,
-                      &cb, &in_stream_req);
-  if (ret < 0) {
-    goto set_err_state;
-  }
-
-  ret = conn->complete_request(in_stream_req, &etag, &set_mtime,
-                               &expected_size, nullptr, nullptr, null_yield);
-  if (ret < 0) {
-    goto set_err_state;
-  }
-  ret = cb.flush();
-  if (ret < 0) {
-    goto set_err_state;
-  }
-  if (cb.get_data_len() != expected_size) {
-    ret = -EIO;
-    ldpp_dout(dpp, 0) << "ERROR: object truncated during fetching, expected "
-        << expected_size << " bytes but received " << cb.get_data_len() << dendl;
-    goto set_err_state;
-  }
-  if (compressor && compressor->is_compressed()) {
-    bufferlist tmp;
-    RGWCompressionInfo cs_info;
-    cs_info.compression_type = plugin->get_type_name();
-    cs_info.orig_size = cb.get_data_len();
-    cs_info.compressor_message = compressor->get_compressor_message();
-    cs_info.blocks = move(compressor->get_compression_blocks());
-    encode(cs_info, tmp);
-    cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
-  }
-
-  if (override_owner) {
-    processor.set_owner(*override_owner);
-
-    auto& obj_attrs = cb.get_attrs();
-
-    RGWUserInfo owner_info;
-    if (ctl.user->get_info_by_uid(dpp, *override_owner, &owner_info, null_yield) < 0) {
-      ldpp_dout(dpp, 10) << "owner info does not exist" << dendl;
-      return -EINVAL;
-    }
-
-    RGWAccessControlPolicy acl;
-
-    auto aiter = obj_attrs.find(RGW_ATTR_ACL);
-    if (aiter ==
obj_attrs.end()) { - ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): object doesn't have ACL attribute, setting default ACLs" << dendl; - acl.create_default(owner_info.user_id, owner_info.display_name); - } else { - auto iter = aiter->second.cbegin(); - try { - acl.decode(iter); - } catch (buffer::error& err) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): could not decode policy, caught buffer::error" << dendl; - return -EIO; - } - } - - ACLOwner new_owner; - new_owner.set_id(*override_owner); - new_owner.set_name(owner_info.display_name); - - acl.set_owner(new_owner); - - bufferlist bl; - acl.encode(bl); - obj_attrs[RGW_ATTR_ACL] = std::move(bl); - } - - if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */ - cb.get_attrs().erase(RGW_ATTR_DELETE_AT); - } else { - map::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT); - if (iter != cb.get_attrs().end()) { - try { - decode(delete_at, iter->second); - } catch (buffer::error& err) { - ldpp_dout(dpp, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl; - } - } - } - - if (src_mtime) { - *src_mtime = set_mtime; - } - - if (petag) { - const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG); - if (iter != cb.get_attrs().end()) { - *petag = iter->second.to_str(); - } - } - - //erase the append attr - cb.get_attrs().erase(RGW_ATTR_APPEND_PART_NUM); - - { // add x-amz-replication-status=REPLICA - auto& bl = cb.get_attrs()[RGW_ATTR_OBJ_REPLICATION_STATUS]; - bl.clear(); // overwrite source's status - bl.append("REPLICA"); - } - - if (source_zone.empty()) { - set_copy_attrs(cb.get_attrs(), attrs, attrs_mod); - } else { - attrs = cb.get_attrs(); - } - - if (copy_if_newer) { - uint64_t pg_ver = 0; - auto i = attrs.find(RGW_ATTR_PG_VER); - if (i != attrs.end() && i->second.length() > 0) { - auto iter = i->second.cbegin(); - try { - decode(pg_ver, iter); - } catch (buffer::error& err) { - ldpp_dout(dpp, 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl; - /* non critical error */ - } - } - set_mtime_weight.init(set_mtime, svc.zone->get_zone_short_id(), pg_ver); - } - - /* Perform ETag verification is we have computed the object's MD5 sum at our end */ - if (const auto& verifier_etag = cb.get_verifier_etag(); - !verifier_etag.empty()) { - string trimmed_etag = etag; - - /* Remove the leading and trailing double quotes from etag */ - trimmed_etag.erase(std::remove(trimmed_etag.begin(), trimmed_etag.end(),'\"'), - trimmed_etag.end()); - - if (verifier_etag != trimmed_etag) { - ret = -EIO; - ldpp_dout(dpp, 0) << "ERROR: source and destination objects don't match. 
Expected etag:" - << trimmed_etag << " Computed etag:" << verifier_etag << dendl; - goto set_err_state; - } - } - -#define MAX_COMPLETE_RETRY 100 - for (i = 0; i < MAX_COMPLETE_RETRY; i++) { - bool canceled = false; - ret = processor.complete(cb.get_data_len(), etag, mtime, set_mtime, - attrs, delete_at, nullptr, nullptr, nullptr, - zones_trace, &canceled, null_yield); - if (ret < 0) { - goto set_err_state; - } - - if (copy_if_newer && canceled) { - ldpp_dout(dpp, 20) << "raced with another write of obj: " << dest_obj << dendl; - obj_ctx.invalidate(dest_obj->get_obj()); /* object was overwritten */ - ret = get_obj_state(dpp, &obj_ctx, dest_bucket->get_info(), dest_obj, &dest_state, &manifest, false, null_yield); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl; - goto set_err_state; - } - dest_mtime_weight.init(dest_state); - dest_mtime_weight.high_precision = high_precision_time; - if (!dest_state->exists || - dest_mtime_weight < set_mtime_weight) { - ldpp_dout(dpp, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl; - continue; - } else { - ldpp_dout(dpp, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl; - } - } - break; - } - - if (i == MAX_COMPLETE_RETRY) { - ldpp_dout(dpp, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl; - ret = -EIO; - goto set_err_state; - } - - if (bytes_transferred) { - *bytes_transferred = cb.get_data_len(); - } - return 0; -set_err_state: - if (copy_if_newer && ret == -ERR_NOT_MODIFIED) { - // we may have already fetched during sync of OP_ADD, but were waiting - // for OP_LINK_OLH to call set_olh() with a real olh_epoch - if (olh_epoch && *olh_epoch > 0) { - constexpr bool log_data_change = true; - ret = set_olh(dpp, obj_ctx, dest_bucket->get_info(), dest_obj, false, nullptr, - *olh_epoch, real_time(), false, null_yield, zones_trace, log_data_change); - } else { - // we already have the latest copy - ret = 0; - } - } - return ret; -} - - -int RGWRados::copy_obj_to_remote_dest(const DoutPrefixProvider *dpp, - RGWObjState *astate, - map& src_attrs, - RGWRados::Object::Read& read_op, - const rgw_user& user_id, - rgw::sal::Object* dest_obj, - real_time *mtime) -{ - string etag; - - RGWRESTStreamS3PutObj *out_stream_req; - - auto rest_master_conn = svc.zone->get_master_conn(); - - int ret = rest_master_conn->put_obj_async_init(dpp, user_id, dest_obj, src_attrs, &out_stream_req); - if (ret < 0) { - return ret; - } - - out_stream_req->set_send_length(astate->size); - - ret = RGWHTTP::send(out_stream_req); - if (ret < 0) { - delete out_stream_req; - return ret; - } - - ret = read_op.iterate(dpp, 0, astate->size - 1, out_stream_req->get_out_cb(), null_yield); - if (ret < 0) { - delete out_stream_req; - return ret; - } - - ret = rest_master_conn->complete_request(out_stream_req, etag, mtime, null_yield); - if (ret < 0) - return ret; - - return 0; -} - -/** - * Copy an object. 
- * dest_obj: the object to copy into - * src_obj: the object to copy from - * attrs: usage depends on attrs_mod parameter - * attrs_mod: the modification mode of the attrs, may have the following values: - * ATTRSMOD_NONE - the attributes of the source object will be - * copied without modifications, attrs parameter is ignored; - * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs - * parameter, source object attributes are not copied; - * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes - * are overwritten by values contained in attrs parameter. - * err: stores any errors resulting from the get of the original object - * Returns: 0 on success, -ERR# otherwise. - */ -int RGWRados::copy_obj(RGWObjectCtx& obj_ctx, - const rgw_user& user_id, - req_info *info, - const rgw_zone_id& source_zone, - rgw::sal::Object* dest_obj, - rgw::sal::Object* src_obj, - rgw::sal::Bucket* dest_bucket, - rgw::sal::Bucket* src_bucket, - const rgw_placement_rule& dest_placement, - real_time *src_mtime, - real_time *mtime, - const real_time *mod_ptr, - const real_time *unmod_ptr, - bool high_precision_time, - const char *if_match, - const char *if_nomatch, - AttrsMod attrs_mod, - bool copy_if_newer, - rgw::sal::Attrs& attrs, - RGWObjCategory category, - uint64_t olh_epoch, - real_time delete_at, - string *version_id, - string *ptag, - string *petag, - void (*progress_cb)(off_t, void *), - void *progress_data, - const DoutPrefixProvider *dpp, - optional_yield y) -{ - int ret; - uint64_t obj_size; - rgw_obj shadow_obj = dest_obj->get_obj(); - string shadow_oid; - - bool remote_src; - bool remote_dest; - - append_rand_alpha(cct, dest_obj->get_oid(), shadow_oid, 32); - shadow_obj.init_ns(dest_obj->get_bucket()->get_key(), shadow_oid, shadow_ns); - - auto& zonegroup = svc.zone->get_zonegroup(); - - remote_dest = !zonegroup.equals(dest_bucket->get_info().zonegroup); - remote_src = !zonegroup.equals(src_bucket->get_info().zonegroup); - - if (remote_src && remote_dest) { - ldpp_dout(dpp, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl; - return -EINVAL; - } - - ldpp_dout(dpp, 5) << "Copy object " << src_obj->get_bucket() << ":" << src_obj->get_oid() << " => " << dest_obj->get_bucket() << ":" << dest_obj->get_oid() << dendl; - - if (remote_src || !source_zone.empty()) { - return fetch_remote_obj(obj_ctx, user_id, info, source_zone, - dest_obj, src_obj, dest_bucket, src_bucket, - dest_placement, src_mtime, mtime, mod_ptr, - unmod_ptr, high_precision_time, - if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category, - olh_epoch, delete_at, ptag, petag, progress_cb, progress_data, dpp, - nullptr /* filter */); - } - - map src_attrs; - RGWRados::Object src_op_target(this, src_bucket, obj_ctx, src_obj); - RGWRados::Object::Read read_op(&src_op_target); - - read_op.conds.mod_ptr = mod_ptr; - read_op.conds.unmod_ptr = unmod_ptr; - read_op.conds.high_precision_time = high_precision_time; - read_op.conds.if_match = if_match; - read_op.conds.if_nomatch = if_nomatch; - read_op.params.attrs = &src_attrs; - read_op.params.lastmod = src_mtime; - read_op.params.obj_size = &obj_size; - - ret = read_op.prepare(y, dpp); - if (ret < 0) { - return ret; - } - if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) { - // Current implementation does not follow S3 spec and even - // may result in data corruption silently when copying - // multipart objects acorss pools. So reject COPY operations - //on encrypted objects before it is fully functional. 
- ldpp_dout(dpp, 0) << "ERROR: copy op for encrypted object " << src_obj - << " has not been implemented." << dendl; - return -ERR_NOT_IMPLEMENTED; - } - - src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL]; - src_attrs.erase(RGW_ATTR_DELETE_AT); - - src_attrs.erase(RGW_ATTR_OBJECT_RETENTION); - src_attrs.erase(RGW_ATTR_OBJECT_LEGAL_HOLD); - map::iterator rt = attrs.find(RGW_ATTR_OBJECT_RETENTION); - if (rt != attrs.end()) - src_attrs[RGW_ATTR_OBJECT_RETENTION] = rt->second; - map::iterator lh = attrs.find(RGW_ATTR_OBJECT_LEGAL_HOLD); - if (lh != attrs.end()) - src_attrs[RGW_ATTR_OBJECT_LEGAL_HOLD] = lh->second; - - set_copy_attrs(src_attrs, attrs, attrs_mod); - attrs.erase(RGW_ATTR_ID_TAG); - attrs.erase(RGW_ATTR_PG_VER); - attrs.erase(RGW_ATTR_SOURCE_ZONE); - map::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION); - if (cmp != src_attrs.end()) - attrs[RGW_ATTR_COMPRESSION] = cmp->second; - - RGWObjManifest manifest; - RGWObjState *astate = NULL; - RGWObjManifest *amanifest = nullptr; - - ret = get_obj_state(dpp, &obj_ctx, src_bucket->get_info(), src_obj, &astate, &amanifest, y); - if (ret < 0) { - return ret; - } - - vector ref_objs; - - if (remote_dest) { - /* dest is in a different zonegroup, copy it there */ - return copy_obj_to_remote_dest(dpp, astate, attrs, read_op, user_id, dest_obj, mtime); - } - uint64_t max_chunk_size; - - ret = get_max_chunk_size(dest_bucket->get_placement_rule(), dest_obj->get_obj(), &max_chunk_size, dpp); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj->get_bucket() << dendl; - return ret; - } - - rgw_pool src_pool; - rgw_pool dest_pool; - - const rgw_placement_rule *src_rule{nullptr}; - - if (amanifest) { - src_rule = &amanifest->get_tail_placement().placement_rule; - ldpp_dout(dpp, 20) << __func__ << "(): manifest src_rule=" << src_rule->to_str() << dendl; - } - - if (!src_rule || src_rule->empty()) { - src_rule = &src_bucket->get_placement_rule(); - } - - if (!get_obj_data_pool(*src_rule, src_obj->get_obj(), &src_pool)) { - ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl; - return -EIO; - } - - if (!get_obj_data_pool(dest_placement, dest_obj->get_obj(), &dest_pool)) { - ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl; - return -EIO; - } - - ldpp_dout(dpp, 20) << __func__ << "(): src_rule=" << src_rule->to_str() << " src_pool=" << src_pool - << " dest_rule=" << dest_placement.to_str() << " dest_pool=" << dest_pool << dendl; - - bool copy_data = (!amanifest) || - (*src_rule != dest_placement) || - (src_pool != dest_pool); - - bool copy_first = false; - if (amanifest) { - if (!amanifest->has_tail()) { - copy_data = true; - } else { - uint64_t head_size = amanifest->get_head_size(); - - if (head_size > 0) { - if (head_size > max_chunk_size) { - copy_data = true; - } else { - copy_first = true; - } - } - } - } - - if (petag) { - const auto iter = attrs.find(RGW_ATTR_ETAG); - if (iter != attrs.end()) { - *petag = iter->second.to_str(); - } - } - - if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */ - attrs.erase(RGW_ATTR_TAIL_TAG); - return copy_obj_data(obj_ctx, dest_bucket, dest_placement, read_op, obj_size - 1, dest_obj, - mtime, real_time(), attrs, olh_epoch, delete_at, petag, dpp, y); - } - - /* This has been in for 2 years, so we can safely assume amanifest is not NULL */ - RGWObjManifest::obj_iterator miter = amanifest->obj_begin(dpp); - - if (copy_first) { // we need to copy first chunk, not increase 
refcount
-    ++miter;
-  }
-
-  bufferlist first_chunk;
-
-  const bool copy_itself = (dest_obj->get_obj() == src_obj->get_obj());
-  RGWObjManifest *pmanifest;
-  ldpp_dout(dpp, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
-
-  RGWRados::Object dest_op_target(this, dest_bucket, obj_ctx, dest_obj);
-  RGWRados::Object::Write write_op(&dest_op_target);
-
-  string tag;
-
-  if (ptag) {
-    tag = *ptag;
-  }
-
-  if (tag.empty()) {
-    append_rand_alpha(cct, tag, tag, 32);
-  }
-
-  std::unique_ptr<rgw::Aio> aio;
-  rgw::AioResultList all_results;
-  if (!copy_itself) {
-    aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, y);
-    attrs.erase(RGW_ATTR_TAIL_TAG);
-    manifest = *amanifest;
-    const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
-    if (tail_placement.bucket.name.empty()) {
-      manifest.set_tail_placement(tail_placement.placement_rule, src_obj->get_bucket()->get_key());
-    }
-    string ref_tag;
-    for (; miter != amanifest->obj_end(dpp); ++miter) {
-      ObjectWriteOperation op;
-      ref_tag = tag + '\0';
-      cls_refcount_get(op, ref_tag, true);
-
-      auto obj = svc.rados->obj(miter.get_location().get_raw_obj(driver));
-      ret = obj.open(dpp);
-      if (ret < 0) {
-        ldpp_dout(dpp, 0) << "failed to open rados context for " << obj << dendl;
-        goto done_ret;
-      }
-
-      static constexpr uint64_t cost = 1; // 1 throttle unit per request
-      static constexpr uint64_t id = 0; // ids unused
-      rgw::AioResultList completed = aio->get(obj, rgw::Aio::librados_op(std::move(op), y), cost, id);
-      ret = rgw::check_for_errors(completed);
-      all_results.splice(all_results.end(), completed);
-      if (ret < 0) {
-        ldpp_dout(dpp, 0) << "ERROR: failed to copy obj=" << obj << ", the error code = " << ret << dendl;
-        goto done_ret;
-      }
-    }
-
-    rgw::AioResultList completed = aio->drain();
-    ret = rgw::check_for_errors(completed);
-    all_results.splice(all_results.end(), completed);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to drain ios, the error code = " << ret << dendl;
-      goto done_ret;
-    }
-
-    pmanifest = &manifest;
-  } else {
-    pmanifest = amanifest;
-    /* don't send the object's tail for garbage collection */
-    astate->keep_tail = true;
-  }
-
-  if (copy_first) {
-    ret = read_op.read(0, max_chunk_size, first_chunk, y, dpp);
-    if (ret < 0) {
-      goto done_ret;
-    }
-
-    pmanifest->set_head(dest_bucket->get_placement_rule(), dest_obj->get_obj(), first_chunk.length());
-  } else {
-    pmanifest->set_head(dest_bucket->get_placement_rule(), dest_obj->get_obj(), 0);
-  }
-
-  write_op.meta.data = &first_chunk;
-  write_op.meta.manifest = pmanifest;
-  write_op.meta.ptag = &tag;
-  write_op.meta.owner = dest_bucket->get_info().owner;
-  write_op.meta.mtime = mtime;
-  write_op.meta.flags = PUT_OBJ_CREATE;
-  write_op.meta.category = category;
-  write_op.meta.olh_epoch = olh_epoch;
-  write_op.meta.delete_at = delete_at;
-  write_op.meta.modify_tail = !copy_itself;
-
-  ret = write_op.write_meta(dpp, obj_size, astate->accounted_size, attrs, y);
-  if (ret < 0) {
-    goto done_ret;
-  }
-
-  return 0;
-
-done_ret:
-  if (!copy_itself) {
-
-    /* wait all pending op done */
-    rgw::AioResultList completed = aio->drain();
-    all_results.splice(all_results.end(), completed);
-
-    /* rollback reference */
-    string ref_tag = tag + '\0';
-    int ret2 = 0;
-    for (auto& r : all_results) {
-      if (r.result < 0) {
-        continue; // skip errors
-      }
-      ObjectWriteOperation op;
-      cls_refcount_put(op, ref_tag, true);
-
-      static constexpr uint64_t cost = 1; // 1 throttle unit per request
-      static constexpr uint64_t id = 0; // ids unused
-      rgw::AioResultList completed = aio->get(r.obj, rgw::Aio::librados_op(std::move(op), y), cost, id);
-      ret2 = rgw::check_for_errors(completed);
-      if (ret2 < 0) {
-        ldpp_dout(dpp, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << r.obj << dendl;
-      }
-    }
-    completed = aio->drain();
-    ret2 = rgw::check_for_errors(completed);
-    if (ret2 < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to drain rollback ios, the error code = " << ret2 << dendl;
-    }
-  }
-
-  return ret;
-}
-
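Rather than duplicating tail data, copy_obj() above shares the source tail by taking a cls_refcount reference on every tail RADOS object, and the done_ret path drops exactly the references that were taken if anything fails. A compact sketch of that acquire-all-or-roll-back pattern (the helpers are hypothetical, not librados calls):

    #include <string>
    #include <vector>

    int refcount_get(const std::string& oid, const std::string& tag); // assumed helper
    int refcount_put(const std::string& oid, const std::string& tag); // assumed helper

    int share_tail(const std::vector<std::string>& tail_oids, const std::string& tag)
    {
      std::vector<std::string> taken;
      for (const auto& oid : tail_oids) {
        int r = refcount_get(oid, tag);
        if (r < 0) {
          // rollback mirrors the done_ret block: only drop refs we took
          for (const auto& t : taken)
            refcount_put(t, tag);
          return r;
        }
        taken.push_back(oid);
      }
      return 0; // destination manifest may now point at the shared tail
    }
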
-int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
-                            rgw::sal::Bucket* bucket,
-                            const rgw_placement_rule& dest_placement,
-                            RGWRados::Object::Read& read_op, off_t end,
-                            rgw::sal::Object* dest_obj,
-                            real_time *mtime,
-                            real_time set_mtime,
-                            rgw::sal::Attrs& attrs,
-                            uint64_t olh_epoch,
-                            real_time delete_at,
-                            string *petag,
-                            const DoutPrefixProvider *dpp,
-                            optional_yield y)
-{
-  string tag;
-  append_rand_alpha(cct, tag, tag, 32);
-
-  rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
-  using namespace rgw::putobj;
-  // do not change the null_yield in the initialization of this AtomicObjectProcessor
-  // it causes crashes in the ragweed tests
-  AtomicObjectProcessor processor(&aio, this->driver, &dest_placement,
-                                  bucket->get_info().owner, obj_ctx,
-                                  dest_obj->clone(), olh_epoch, tag,
-                                  dpp, null_yield);
-  int ret = processor.prepare(y);
-  if (ret < 0)
-    return ret;
-
-  off_t ofs = 0;
-
-  do {
-    bufferlist bl;
-    ret = read_op.read(ofs, end, bl, y, dpp);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: fail to read object data, ret = " << ret << dendl;
-      return ret;
-    }
-
-    uint64_t read_len = ret;
-    ret = processor.process(std::move(bl), ofs);
-    if (ret < 0) {
-      return ret;
-    }
-
-    ofs += read_len;
-  } while (ofs <= end);
-
-  // flush
-  ret = processor.process({}, ofs);
-  if (ret < 0) {
-    return ret;
-  }
-
-  string etag;
-  auto iter = attrs.find(RGW_ATTR_ETAG);
-  if (iter != attrs.end()) {
-    bufferlist& bl = iter->second;
-    etag = bl.to_str();
-    if (petag) {
-      *petag = etag;
-    }
-  }
-
-  uint64_t accounted_size;
-  {
-    bool compressed{false};
-    RGWCompressionInfo cs_info;
-    ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to read compression info" << dendl;
-      return ret;
-    }
-    // pass original size if compressed
-    accounted_size = compressed ? cs_info.orig_size : ofs;
-  }
-
-  return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
-                            nullptr, nullptr, nullptr, nullptr, nullptr, y);
-}
-
-int RGWRados::transition_obj(RGWObjectCtx& obj_ctx,
-                             rgw::sal::Bucket* bucket,
-                             rgw::sal::Object& obj,
-                             const rgw_placement_rule& placement_rule,
-                             const real_time& mtime,
-                             uint64_t olh_epoch,
-                             const DoutPrefixProvider *dpp,
-                             optional_yield y)
-{
-  rgw::sal::Attrs attrs;
-  real_time read_mtime;
-  uint64_t obj_size;
-
-  obj.set_atomic();
-  RGWRados::Object op_target(this, bucket, obj_ctx, &obj);
-  RGWRados::Object::Read read_op(&op_target);
-
-  read_op.params.attrs = &attrs;
-  read_op.params.lastmod = &read_mtime;
-  read_op.params.obj_size = &obj_size;
-
-  int ret = read_op.prepare(y, dpp);
-  if (ret < 0) {
-    return ret;
-  }
-
-  if (read_mtime != mtime) {
-    /* raced */
-    return -ECANCELED;
-  }
-
-  attrs.erase(RGW_ATTR_ID_TAG);
-  attrs.erase(RGW_ATTR_TAIL_TAG);
-
-  ret = copy_obj_data(obj_ctx,
-                      bucket,
-                      placement_rule,
-                      read_op,
-                      obj_size - 1,
-                      &obj,
-                      nullptr /* pmtime */,
-                      mtime,
-                      attrs,
-                      olh_epoch,
-                      real_time(),
-                      nullptr /* petag */,
-                      dpp,
-                      y);
-  if (ret < 0) {
-    return ret;
-  }
-
-  return 0;
-}
-
-int RGWRados::check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y)
-{
-  constexpr uint NUM_ENTRIES = 1000u;
-
-  rgw_obj_index_key marker;
-  string prefix;
-  bool is_truncated;
-
-  do {
-    std::vector<rgw_bucket_dir_entry> ent_list;
-    ent_list.reserve(NUM_ENTRIES);
-
-    int r = cls_bucket_list_unordered(dpp,
-                                      bucket_info,
-                                      bucket_info.layout.current_index,
-                                      RGW_NO_SHARD,
-                                      marker,
-                                      prefix,
-                                      NUM_ENTRIES,
-                                      true,
-                                      ent_list,
-                                      &is_truncated,
-                                      &marker,
-                                      y);
-    if (r < 0) {
-      return r;
-    }
-
-    string ns;
-    for (auto const& dirent : ent_list) {
-      rgw_obj_key obj;
-
-      if
(rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns)) { - return -ENOTEMPTY; - } - } - } while (is_truncated); - - return 0; -} - -/** - * Delete a bucket. - * bucket: the name of the bucket to delete - * Returns 0 on success, -ERR# otherwise. - */ -int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp, bool check_empty) -{ - const rgw_bucket& bucket = bucket_info.bucket; - RGWSI_RADOS::Pool index_pool; - map bucket_objs; - int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr); - if (r < 0) - return r; - - if (check_empty) { - r = check_bucket_empty(dpp, bucket_info, y); - if (r < 0) { - return r; - } - } - - bool remove_ep = true; - - if (objv_tracker.read_version.empty()) { - RGWBucketEntryPoint ep; - r = ctl.bucket->read_bucket_entrypoint_info(bucket_info.bucket, - &ep, - null_yield, - dpp, - RGWBucketCtl::Bucket::GetParams() - .set_objv_tracker(&objv_tracker)); - if (r < 0 || - (!bucket_info.bucket.bucket_id.empty() && - ep.bucket.bucket_id != bucket_info.bucket.bucket_id)) { - if (r != -ENOENT) { - ldpp_dout(dpp, 0) << "ERROR: read_bucket_entrypoint_info() bucket=" << bucket_info.bucket << " returned error: r=" << r << dendl; - /* we have no idea what caused the error, will not try to remove it */ - } - /* - * either failed to read bucket entrypoint, or it points to a different bucket instance than - * requested - */ - remove_ep = false; - } - } - - if (remove_ep) { - r = ctl.bucket->remove_bucket_entrypoint_info(bucket_info.bucket, null_yield, dpp, - RGWBucketCtl::Bucket::RemoveParams() - .set_objv_tracker(&objv_tracker)); - if (r < 0) - return r; - } - - /* if the bucket is not synced we can remove the meta file */ - if (!svc.zone->is_syncing_bucket_meta(bucket)) { - RGWObjVersionTracker objv_tracker; - r = ctl.bucket->remove_bucket_instance_info(bucket, bucket_info, null_yield, dpp); - if (r < 0) { - return r; - } - - /* remove bucket index objects asynchronously by best effort */ - (void) CLSRGWIssueBucketIndexClean(index_pool.ioctx(), - bucket_objs, - cct->_conf->rgw_bucket_index_max_aio)(); - } - - return 0; -} - -int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner, const DoutPrefixProvider *dpp) -{ - RGWBucketInfo info; - map attrs; - int r; - - if (bucket.bucket_id.empty()) { - r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp, &attrs); - } else { - r = get_bucket_instance_info(bucket, info, nullptr, &attrs, null_yield, dpp); - } - if (r < 0) { - ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl; - return r; - } - - info.owner = owner.get_id(); - - r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp); - if (r < 0) { - ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl; - return r; - } - - return 0; -} - - -int RGWRados::set_buckets_enabled(vector& buckets, bool enabled, const DoutPrefixProvider *dpp) -{ - int ret = 0; - - vector::iterator iter; - - for (iter = buckets.begin(); iter != buckets.end(); ++iter) { - rgw_bucket& bucket = *iter; - if (enabled) { - ldpp_dout(dpp, 20) << "enabling bucket name=" << bucket.name << dendl; - } else { - ldpp_dout(dpp, 20) << "disabling bucket name=" << bucket.name << dendl; - } - - RGWBucketInfo info; - map attrs; - int r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, 
null_yield, dpp, &attrs);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
-      ret = r;
-      continue;
-    }
-    if (enabled) {
-      info.flags &= ~BUCKET_SUSPENDED;
-    } else {
-      info.flags |= BUCKET_SUSPENDED;
-    }
-
-    r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
-      ret = r;
-      continue;
-    }
-  }
-  return ret;
-}
-
-int RGWRados::bucket_suspended(const DoutPrefixProvider *dpp, rgw_bucket& bucket, bool *suspended)
-{
-  RGWBucketInfo bucket_info;
-  int ret = get_bucket_info(&svc, bucket.tenant, bucket.name, bucket_info, NULL, null_yield, dpp);
-  if (ret < 0) {
-    return ret;
-  }
-
-  *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
-  return 0;
-}
-
-int RGWRados::Object::complete_atomic_modification(const DoutPrefixProvider *dpp)
-{
-  if ((!manifest) || state->keep_tail)
-    return 0;
-
-  cls_rgw_obj_chain chain;
-  store->update_gc_chain(dpp, obj->get_obj(), *manifest, &chain);
-
-  if (chain.empty()) {
-    return 0;
-  }
-
-  string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
-  if (store->gc == nullptr) {
-    ldpp_dout(dpp, 0) << "deleting objects inline since gc isn't initialized" << dendl;
-    // Delete objects inline just in case gc hasn't been initialised, prevents crashes
-    store->delete_objs_inline(dpp, chain, tag);
-  } else {
-    auto [ret, leftover_chain] = store->gc->send_split_chain(chain, tag); // do it synchronously
-    if (ret < 0 && leftover_chain) {
-      // Delete objects inline if send chain to gc fails
-      store->delete_objs_inline(dpp, *leftover_chain, tag);
-    }
-  }
-  return 0;
-}
-
-void RGWRados::update_gc_chain(const DoutPrefixProvider *dpp, rgw_obj head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
-{
-  RGWObjManifest::obj_iterator iter;
-  rgw_raw_obj raw_head;
-  obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
-  for (iter = manifest.obj_begin(dpp); iter != manifest.obj_end(dpp); ++iter) {
-    const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(driver);
-    if (mobj == raw_head)
-      continue;
-    cls_rgw_obj_key key(mobj.oid);
-    chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
-  }
-}
-
-std::tuple<int, std::optional<cls_rgw_obj_chain>> RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag)
-{
-  if (chain.empty()) {
-    return {0, std::nullopt};
-  }
-
-  return gc->send_split_chain(chain, tag);
-}
-
-void RGWRados::delete_objs_inline(const DoutPrefixProvider *dpp, cls_rgw_obj_chain& chain, const string& tag)
-{
-  string last_pool;
-  std::unique_ptr<IoCtx> ctx(new IoCtx);
-  int ret = 0;
-  for (auto liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) {
-    cls_rgw_obj& obj = *liter;
-    if (obj.pool != last_pool) {
-      ctx.reset(new IoCtx);
-      ret = rgw_init_ioctx(dpp, get_rados_handle(), obj.pool, *ctx);
-      if (ret < 0) {
-        last_pool = "";
-        ldpp_dout(dpp, 0) << "ERROR: failed to create ioctx pool=" <<
-          obj.pool << dendl;
-        continue;
-      }
-      last_pool = obj.pool;
-    }
-    ctx->locator_set_key(obj.loc);
-    const string& oid = obj.key.name; /* just stored raw oid there */
-    ldpp_dout(dpp, 5) << "delete_objs_inline: removing " << obj.pool <<
-      ":" << obj.key.name << dendl;
-    ObjectWriteOperation op;
-    cls_refcount_put(op, tag, true);
-    ret = ctx->operate(oid, &op);
-    if (ret < 0) {
-      ldpp_dout(dpp, 5) << "delete_objs_inline: refcount put returned error " << ret << dendl;
-    }
-  }
-}
-
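complete_atomic_modification() and delete_objs_inline() above encode a fallback policy for tail cleanup: hand the chain to the gc subsystem when it is available, and delete inline whatever gc could not accept (or everything, if gc was never initialized). A minimal sketch of that decision, with hypothetical types standing in for cls_rgw_obj_chain and the RGW gc interface:

    #include <optional>
    #include <utility>

    struct Chain { /* tail objects to reclaim */ };
    void delete_inline(const Chain&);                            // assumed helper
    std::pair<int, std::optional<Chain>> gc_send(const Chain&);  // assumed helper

    void dispose_tail(bool gc_ready, const Chain& chain)
    {
      if (!gc_ready) {
        delete_inline(chain);  // gc not initialized: delete inline, avoid a crash
        return;
      }
      auto [ret, leftover] = gc_send(chain);
      if (ret < 0 && leftover)
        delete_inline(*leftover);  // gc rejected part of the chain: finish inline
    }
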
-static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
-                                 map<RGWObjCategory, RGWStorageStats>& stats)
-{
-  for (const auto& pair : header.stats) {
-    const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
-    const rgw_bucket_category_stats& header_stats = pair.second;
-
-    RGWStorageStats& s = stats[category];
-
-    s.category = category;
-    s.size += header_stats.total_size;
-    s.size_rounded += header_stats.total_size_rounded;
-    s.size_utilized += header_stats.actual_size;
-    s.num_objects += header_stats.num_entries;
-  }
-}
-
-int RGWRados::bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
-                                 map<RGWObjCategory, RGWStorageStats> *existing_stats,
-                                 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
-{
-  RGWSI_RADOS::Pool index_pool;
-
-  // key - bucket index object id
-  // value - bucket index check OP returned result with the given bucket index object (shard)
-  map<int, string> oids;
-
-  int ret = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &oids, nullptr);
-  if (ret < 0) {
-    return ret;
-  }
-
-  // declare and pre-populate
-  map<int, rgw_cls_check_index_ret> bucket_objs_ret;
-  for (auto& iter : oids) {
-    bucket_objs_ret.emplace(iter.first, rgw_cls_check_index_ret());
-  }
-
-  ret = CLSRGWIssueBucketCheck(index_pool.ioctx(), oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
-  if (ret < 0) {
-    return ret;
-  }
-
-  // aggregate results (from different shards if there are any)
-  for (const auto& iter : bucket_objs_ret) {
-    accumulate_raw_stats(iter.second.existing_header, *existing_stats);
-    accumulate_raw_stats(iter.second.calculated_header, *calculated_stats);
-  }
-
-  return 0;
-}
-
-int RGWRados::bucket_rebuild_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info)
-{
-  RGWSI_RADOS::Pool index_pool;
-  map<int, string> bucket_objs;
-
-  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
-  if (r < 0) {
-    return r;
-  }
-
-  return CLSRGWIssueBucketRebuild(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
-}
-
-int RGWRados::bucket_set_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
-{
-  RGWSI_RADOS::Pool index_pool;
-  map<int, string> bucket_objs;
-
-  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
-      ": unable to open bucket index, r=" << r << " (" <<
-      cpp_strerror(-r) << ")" << dendl;
-    return r;
-  }
-
-  r = CLSRGWIssueSetBucketResharding(index_pool.ioctx(), bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
-      ": unable to issue set bucket resharding, r=" << r << " (" <<
-      cpp_strerror(-r) << ")" << dendl;
-  }
-  return r;
-}
-
-int RGWRados::defer_gc(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, optional_yield y)
-{
-  std::string oid, key;
-  get_obj_bucket_and_oid_loc(obj->get_obj(), oid, key);
-  if (!rctx)
-    return 0;
-
-  RGWObjState *state = NULL;
-  RGWObjManifest *manifest = nullptr;
-
-  int r = get_obj_state(dpp, rctx, bucket_info, obj, &state, &manifest, false, y);
-  if (r < 0)
-    return r;
-
-  if (!state->is_atomic) {
-    ldpp_dout(dpp, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
-    return -EINVAL;
-  }
-
-  string tag;
-
-  if (state->tail_tag.length() > 0) {
-    tag = state->tail_tag.c_str();
-  } else if
(state->obj_tag.length() > 0) { - tag = state->obj_tag.c_str(); - } else { - ldpp_dout(dpp, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl; - return -EINVAL; - } - - ldpp_dout(dpp, 0) << "defer chain tag=" << tag << dendl; - - cls_rgw_obj_chain chain; - update_gc_chain(dpp, state->obj, *manifest, &chain); - return gc->async_defer_chain(tag, chain); -} - -void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op) -{ - list prefixes; - prefixes.push_back(RGW_ATTR_OLH_PREFIX); - cls_rgw_remove_obj(op, prefixes); -} - -void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist) -{ - cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist); -} - -void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type) -{ - cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type); -} - -struct tombstone_entry { - ceph::real_time mtime; - uint32_t zone_short_id; - uint64_t pg_ver; - - tombstone_entry() = default; - explicit tombstone_entry(const RGWObjState& state) - : mtime(state.mtime), zone_short_id(state.zone_short_id), - pg_ver(state.pg_ver) {} -}; - -/** - * Delete an object. - * bucket: name of the bucket storing the object - * obj: name of the object to delete - * Returns: 0 on success, -ERR# otherwise. - */ -int RGWRados::Object::Delete::delete_obj(optional_yield y, const DoutPrefixProvider *dpp) -{ - RGWRados *store = target->get_store(); - const string& instance = target->get_instance(); - rgw_obj obj = target->get_obj(); - - if (instance == "null") { - obj.key.instance.clear(); - } - - bool explicit_marker_version = (!params.marker_version_id.empty()); - - if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) { - if (instance.empty() || explicit_marker_version) { - std::unique_ptr marker = target->get_target()->clone(); - marker->clear_instance(); - - if (!params.marker_version_id.empty()) { - if (params.marker_version_id != "null") { - marker->set_instance(params.marker_version_id); - } - } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) { - marker->gen_rand_obj_instance_name(); - } - - result.version_id = marker->get_instance(); - if (result.version_id.empty()) - result.version_id = "null"; - result.delete_marker = true; - - struct rgw_bucket_dir_entry_meta meta; - - meta.owner = params.obj_owner.get_id().to_str(); - meta.owner_display_name = params.obj_owner.get_display_name(); - - if (real_clock::is_zero(params.mtime)) { - meta.mtime = real_clock::now(); - } else { - meta.mtime = params.mtime; - } - - int r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), marker.get(), true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, y, params.zones_trace); - if (r < 0) { - return r; - } - } else { - rgw_bucket_dir_entry dirent; - - int r = store->bi_get_instance(dpp, target->get_bucket_info(), obj, &dirent); - if (r < 0) { - return r; - } - result.delete_marker = dirent.is_delete_marker(); - r = store->unlink_obj_instance(dpp, target->get_bucket_info(), target->get_target(), params.olh_epoch, y, params.zones_trace); - if (r < 0) { - return r; - } - result.version_id = instance; - } - - BucketShard *bs = nullptr; - int r = target->get_bucket_shard(&bs, dpp); - if (r < 0) { - ldpp_dout(dpp, 5) << "failed to get BucketShard object: r=" << r << dendl; - return r; - } - - add_datalog_entry(dpp, store->svc.datalog_rados, - target->get_bucket_info(), bs->shard_id); - - return 0; - 
} - - rgw_rados_ref ref; - int r = store->get_obj_head_ref(dpp, target->get_bucket_info(), obj, &ref); - if (r < 0) { - return r; - } - - RGWObjState *state; - RGWObjManifest *manifest = nullptr; - r = target->get_state(dpp, &state, &manifest, false, y); - if (r < 0) - return r; - - ObjectWriteOperation op; - - if (!real_clock::is_zero(params.unmod_since)) { - struct timespec ctime = ceph::real_clock::to_timespec(state->mtime); - struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since); - if (!params.high_precision_time) { - ctime.tv_nsec = 0; - unmod.tv_nsec = 0; - } - - ldpp_dout(dpp, 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl; - if (ctime > unmod) { - return -ERR_PRECONDITION_FAILED; - } - - /* only delete object if mtime is less than or equal to params.unmod_since */ - store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE); - } - uint64_t obj_accounted_size = state->accounted_size; - - if(params.abortmp) { - obj_accounted_size = params.parts_accounted_size; - } - - if (!real_clock::is_zero(params.expiration_time)) { - bufferlist bl; - real_time delete_at; - - if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) { - try { - auto iter = bl.cbegin(); - decode(delete_at, iter); - } catch (buffer::error& err) { - ldpp_dout(dpp, 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl; - return -EIO; - } - - if (params.expiration_time != delete_at) { - return -ERR_PRECONDITION_FAILED; - } - } else { - return -ERR_PRECONDITION_FAILED; - } - } - - if (!state->exists) { - target->invalidate_state(); - return -ENOENT; - } - - r = target->prepare_atomic_modification(dpp, op, false, NULL, NULL, NULL, true, false, y); - if (r < 0) - return r; - - RGWBucketInfo& bucket_info = target->get_bucket_info(); - - RGWRados::Bucket bop(store, bucket_info); - RGWRados::Bucket::UpdateIndex index_op(&bop, obj); - - index_op.set_zones_trace(params.zones_trace); - index_op.set_bilog_flags(params.bilog_flags); - - r = index_op.prepare(dpp, CLS_RGW_OP_DEL, &state->write_tag, y); - if (r < 0) - return r; - - store->remove_rgw_head_obj(op); - - auto& ioctx = ref.pool.ioctx(); - r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, y); - - /* raced with another operation, object state is indeterminate */ - const bool need_invalidate = (r == -ECANCELED); - - int64_t poolid = ioctx.get_id(); - if (r >= 0) { - tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache(); - if (obj_tombstone_cache) { - tombstone_entry entry{*state}; - obj_tombstone_cache->add(obj, entry); - } - r = index_op.complete_del(dpp, poolid, ioctx.get_last_version(), state->mtime, params.remove_objs); - - int ret = target->complete_atomic_modification(dpp); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl; - } - /* other than that, no need to propagate error */ - } else { - int ret = index_op.cancel(dpp, params.remove_objs); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl; - } - } - - if (need_invalidate) { - target->invalidate_state(); - } - - if (r < 0) - return r; - - /* update quota cache */ - store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_accounted_size); - - return 0; -} - -int RGWRados::delete_obj(rgw::sal::Driver* store, - const DoutPrefixProvider *dpp, - const RGWBucketInfo& bucket_info, - const rgw_obj& obj, - int versioning_status, // versioning flags defined in enum 
RGWBucketFlags - uint16_t bilog_flags, - const real_time& expiration_time, - rgw_zone_set *zones_trace) -{ - std::unique_ptr bucket; - store->get_bucket(nullptr, bucket_info, &bucket); - std::unique_ptr object = bucket->get_object(obj.key); - - return delete_obj(dpp, bucket_info, object.get(), versioning_status, - bilog_flags, expiration_time, zones_trace); -} - -int RGWRados::delete_obj(const DoutPrefixProvider *dpp, - const RGWBucketInfo& bucket_info, - rgw::sal::Object* obj, - int versioning_status, // versioning flags defined in enum RGWBucketFlags - uint16_t bilog_flags, - const real_time& expiration_time, - rgw_zone_set *zones_trace) -{ - std::unique_ptr del_op = obj->get_delete_op(); - - del_op->params.bucket_owner = bucket_info.owner; - del_op->params.versioning_status = versioning_status; - del_op->params.bilog_flags = bilog_flags; - del_op->params.expiration_time = expiration_time; - del_op->params.zones_trace = zones_trace; - - return del_op->delete_obj(dpp, null_yield); -} - -int RGWRados::delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj) -{ - rgw_rados_ref ref; - int r = get_raw_obj_ref(dpp, obj, &ref); - if (r < 0) { - return r; - } - - ObjectWriteOperation op; - - op.remove(); - r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); - if (r < 0) - return r; - - return 0; -} - -int RGWRados::delete_obj_index(const rgw_obj& obj, ceph::real_time mtime, const DoutPrefixProvider *dpp) -{ - std::string oid, key; - get_obj_bucket_and_oid_loc(obj, oid, key); - - RGWBucketInfo bucket_info; - int ret = get_bucket_instance_info(obj.bucket, bucket_info, NULL, NULL, null_yield, dpp); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl; - return ret; - } - - RGWRados::Bucket bop(this, bucket_info); - RGWRados::Bucket::UpdateIndex index_op(&bop, obj); - - return index_op.complete_del(dpp, -1 /* pool */, 0, mtime, NULL); -} - -static void generate_fake_tag(const DoutPrefixProvider *dpp, rgw::sal::Driver* store, map& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl) -{ - string tag; - - RGWObjManifest::obj_iterator mi = manifest.obj_begin(dpp); - if (mi != manifest.obj_end(dpp)) { - if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part - ++mi; - rgw::sal::RadosStore* rstore = dynamic_cast(store); - tag = mi.get_location().get_raw_obj(rstore).oid; - tag.append("_"); - } - - unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE]; - char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1]; - MD5 hash; - // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes - hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); - hash.Update((const unsigned char *)manifest_bl.c_str(), manifest_bl.length()); - - map::iterator iter = attrset.find(RGW_ATTR_ETAG); - if (iter != attrset.end()) { - bufferlist& bl = iter->second; - hash.Update((const unsigned char *)bl.c_str(), bl.length()); - } - - hash.Final(md5); - buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str); - tag.append(md5_str); - - ldpp_dout(dpp, 10) << "generate_fake_tag new tag=" << tag << dendl; - - tag_bl.append(tag.c_str(), tag.size() + 1); -} - -static bool is_olh(map& attrs) -{ - map::iterator iter = attrs.find(RGW_ATTR_OLH_INFO); - return (iter != attrs.end()); -} - -static bool has_olh_tag(map& attrs) -{ - map::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG); - return (iter != attrs.end()); -} - -int 
RGWRados::get_olh_target_state(const DoutPrefixProvider *dpp, RGWObjectCtx& - obj_ctx, RGWBucketInfo& bucket_info, - rgw::sal::Object* obj, RGWObjState *olh_state, - RGWObjState **target_state, - RGWObjManifest **target_manifest, optional_yield y) -{ - ceph_assert(olh_state->is_olh); - - rgw_obj target; - int r = RGWRados::follow_olh(dpp, bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */ - if (r < 0) { - return r; - } - - std::unique_ptr bucket; - driver->get_bucket(nullptr, bucket_info, &bucket); - std::unique_ptr target_obj = bucket->get_object(target.key); - - r = get_obj_state(dpp, &obj_ctx, bucket_info, target_obj.get(), target_state, - target_manifest, false, y); - if (r < 0) { - return r; - } - - return 0; -} - -int RGWRados::get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, - RGWBucketInfo& bucket_info, rgw::sal::Object* obj, - RGWObjState **state, RGWObjManifest** manifest, - bool follow_olh, optional_yield y, bool assume_noent) -{ - if (obj->empty()) { - return -EINVAL; - } - - bool need_follow_olh = follow_olh && obj->get_obj().key.instance.empty(); - *manifest = nullptr; - - RGWObjStateManifest *sm = rctx->get_state(obj->get_obj()); - RGWObjState *s = &(sm->state); - ldpp_dout(dpp, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl; - *state = s; - if (sm->manifest) { - *manifest = &(*sm->manifest); - } - if (s->has_attrs) { - if (s->is_olh && need_follow_olh) { - return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, manifest, y); - } - return 0; - } - - s->obj = obj->get_obj(); - - rgw_raw_obj raw_obj; - obj_to_raw(bucket_info.placement_rule, obj->get_obj(), &raw_obj); - - int r = -ENOENT; - - if (!assume_noent) { - r = RGWRados::raw_obj_stat(dpp, raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? 
&s->data : NULL), NULL, y); - } - - if (r == -ENOENT) { - s->exists = false; - s->has_attrs = true; - tombstone_entry entry; - if (obj_tombstone_cache && obj_tombstone_cache->find(obj->get_obj(), entry)) { - s->mtime = entry.mtime; - s->zone_short_id = entry.zone_short_id; - s->pg_ver = entry.pg_ver; - ldpp_dout(dpp, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj - << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl; - } else { - s->mtime = real_time(); - } - return 0; - } - if (r < 0) - return r; - - s->exists = true; - s->has_attrs = true; - s->accounted_size = s->size; - - auto iter = s->attrset.find(RGW_ATTR_ETAG); - if (iter != s->attrset.end()) { - /* get rid of extra null character at the end of the etag, as we used to store it like that */ - bufferlist& bletag = iter->second; - if (bletag.length() > 0 && bletag[bletag.length() - 1] == '\0') { - bufferlist newbl; - bletag.splice(0, bletag.length() - 1, &newbl); - bletag = std::move(newbl); - } - } - - iter = s->attrset.find(RGW_ATTR_COMPRESSION); - const bool compressed = (iter != s->attrset.end()); - if (compressed) { - // use uncompressed size for accounted_size - try { - RGWCompressionInfo info; - auto p = iter->second.cbegin(); - decode(info, p); - s->accounted_size = info.orig_size; - } catch (buffer::error&) { - ldpp_dout(dpp, 0) << "ERROR: could not decode compression info for object: " << obj << dendl; - return -EIO; - } - } - - iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ); - if (iter != s->attrset.end()) { - bufferlist bl = iter->second; - bufferlist::iterator it = bl.begin(); - it.copy(bl.length(), s->shadow_obj); - s->shadow_obj[bl.length()] = '\0'; - } - s->obj_tag = s->attrset[RGW_ATTR_ID_TAG]; - auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG); - if (ttiter != s->attrset.end()) { - s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG]; - } - - bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST]; - if (manifest_bl.length()) { - auto miter = manifest_bl.cbegin(); - try { - sm->manifest.emplace(); - decode(*sm->manifest, miter); - sm->manifest->set_head(bucket_info.placement_rule, obj->get_obj(), s->size); /* patch manifest to reflect the head we just read, some manifests might be - broken due to old bugs */ - s->size = sm->manifest->get_obj_size(); - if (!compressed) - s->accounted_size = s->size; - } catch (buffer::error& err) { - ldpp_dout(dpp, 0) << "ERROR: couldn't decode manifest" << dendl; - return -EIO; - } - *manifest = &(*sm->manifest); - ldpp_dout(dpp, 10) << "manifest: total_size = " << sm->manifest->get_obj_size() << dendl; - if (cct->_conf->subsys.should_gather() && \ - sm->manifest->has_explicit_objs()) { - RGWObjManifest::obj_iterator mi; - for (mi = sm->manifest->obj_begin(dpp); mi != sm->manifest->obj_end(dpp); ++mi) { - ldpp_dout(dpp, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(driver) << dendl; - } - } - - if (!s->obj_tag.length()) { - /* - * Uh oh, something's wrong, object with manifest should have tag. 
Let's - * create one out of the manifest, would be unique - */ - generate_fake_tag(dpp, driver, s->attrset, *sm->manifest, manifest_bl, s->obj_tag); - s->fake_tag = true; - } - } - map::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER); - if (aiter != s->attrset.end()) { - bufferlist& pg_ver_bl = aiter->second; - if (pg_ver_bl.length()) { - auto pgbl = pg_ver_bl.cbegin(); - try { - decode(s->pg_ver, pgbl); - } catch (buffer::error& err) { - ldpp_dout(dpp, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl; - } - } - } - aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE); - if (aiter != s->attrset.end()) { - bufferlist& zone_short_id_bl = aiter->second; - if (zone_short_id_bl.length()) { - auto zbl = zone_short_id_bl.cbegin(); - try { - decode(s->zone_short_id, zbl); - } catch (buffer::error& err) { - ldpp_dout(dpp, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl; - } - } - } - if (s->obj_tag.length()) { - ldpp_dout(dpp, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl; - } else { - ldpp_dout(dpp, 20) << "get_obj_state: s->obj_tag was set empty" << dendl; - } - - /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if - * it exist, and not only if is_olh() returns true - */ - iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG); - if (iter != s->attrset.end()) { - s->olh_tag = iter->second; - } - - if (is_olh(s->attrset)) { - s->is_olh = true; - - ldpp_dout(dpp, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl; - - if (need_follow_olh) { - return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, manifest, y); - } else if (obj->get_obj().key.have_null_instance() && !sm->manifest) { - // read null version, and the head object only have olh info - s->exists = false; - return -ENOENT; - } - } - - return 0; -} - -int RGWRados::get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, RGWObjState **state, RGWObjManifest** manifest, - bool follow_olh, optional_yield y, bool assume_noent) -{ - int ret; - - do { - ret = get_obj_state_impl(dpp, rctx, bucket_info, obj, state, manifest, follow_olh, y, assume_noent); - } while (ret == -EAGAIN); - - return ret; -} - -int RGWRados::Object::get_manifest(const DoutPrefixProvider *dpp, RGWObjManifest **pmanifest, optional_yield y) -{ - RGWObjState *astate; - int r = get_state(dpp, &astate, pmanifest, true, y); - if (r < 0) { - return r; - } - - return 0; -} - -int RGWRados::Object::Read::get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest, optional_yield y) -{ - RGWObjState *state; - RGWObjManifest *manifest = nullptr; - int r = source->get_state(dpp, &state, &manifest, true, y); - if (r < 0) - return r; - if (!state->exists) - return -ENOENT; - if (!state->get_attr(name, dest)) - return -ENODATA; - - return 0; -} - -int RGWRados::Object::Stat::stat_async(const DoutPrefixProvider *dpp) -{ - rgw::sal::Object* target = source->get_target(); - rgw_obj obj = target->get_obj(); - RGWRados *store = source->get_store(); - - result.obj = obj; - if (target->has_attrs()) { - state.ret = 0; - result.size = target->get_obj_size(); - result.mtime = ceph::real_clock::to_timespec(target->get_mtime()); - result.attrs = target->get_attrs(); - //result.manifest = sm->manifest; - return 0; - } - - string oid; - string loc; - 
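[Reviewer note] Stat::stat_async()/Stat::wait(), assembled starting here, follow the standard librados composite-read pattern: build one ObjectReadOperation, dispatch it with an AioCompletion, then wait and collect the return value. A self-contained sketch of the same pattern against an arbitrary pool and object ("mypool", "myobject", and the "admin" client name are placeholders):

#include <rados/librados.hpp>
#include <iostream>
#include <map>
#include <string>

int main() {
  librados::Rados cluster;
  cluster.init("admin");            // placeholder client name
  cluster.conf_read_file(nullptr);  // default ceph.conf search path
  if (cluster.connect() < 0) return 1;

  librados::IoCtx io;
  if (cluster.ioctx_create("mypool", io) < 0) return 1;

  uint64_t size = 0;
  struct timespec mtime{};
  std::map<std::string, librados::bufferlist> attrs;

  // One round trip for stat + xattrs, as in Stat::stat_async() above.
  librados::ObjectReadOperation op;
  op.stat2(&size, &mtime, nullptr);
  op.getxattrs(&attrs, nullptr);

  librados::AioCompletion* c =
      librados::Rados::aio_create_completion(nullptr, nullptr);
  int r = io.aio_operate("myobject", c, &op, nullptr);
  if (r >= 0) {                     // dispatched; now mirror Stat::wait()
    c->wait_for_complete();
    r = c->get_return_value();
  }
  c->release();
  std::cout << "r=" << r << " size=" << size
            << " xattrs=" << attrs.size() << std::endl;
  return r < 0 ? 1 : 0;
}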
get_obj_bucket_and_oid_loc(obj, oid, loc); - - int r = store->get_obj_head_ioctx(dpp, source->get_bucket_info(), obj, &state.io_ctx); - if (r < 0) { - return r; - } - - librados::ObjectReadOperation op; - op.stat2(&result.size, &result.mtime, NULL); - op.getxattrs(&result.attrs, NULL); - state.completion = librados::Rados::aio_create_completion(nullptr, nullptr); - state.io_ctx.locator_set_key(loc); - r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL); - if (r < 0) { - ldpp_dout(dpp, 5) << __func__ - << ": ERROR: aio_operate() returned ret=" << r - << dendl; - return r; - } - - return 0; -} - - -int RGWRados::Object::Stat::wait(const DoutPrefixProvider *dpp) -{ - if (!state.completion) { - return state.ret; - } - - state.completion->wait_for_complete(); - state.ret = state.completion->get_return_value(); - state.completion->release(); - - if (state.ret != 0) { - return state.ret; - } - - return finish(dpp); -} - -int RGWRados::Object::Stat::finish(const DoutPrefixProvider *dpp) -{ - map::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST); - if (iter != result.attrs.end()) { - bufferlist& bl = iter->second; - auto biter = bl.cbegin(); - try { - result.manifest.emplace(); - decode(*result.manifest, biter); - } catch (buffer::error& err) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl; - return -EIO; - } - } - - return 0; -} - -int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp, - RGWBucketInfo& bucket_info, rgw::sal::Object* obj, - ObjectOperation& op, RGWObjState **pstate, - RGWObjManifest** pmanifest, optional_yield y) -{ - int r = obj->get_obj_state(dpp, pstate, y, false); - if (r < 0) - return r; - - return append_atomic_test(dpp, *pstate, op); -} - -int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp, - const RGWObjState* state, - librados::ObjectOperation& op) -{ - if (!state->is_atomic) { - ldpp_dout(dpp, 20) << "state for obj=" << state->obj << " is not atomic, not appending atomic test" << dendl; - return 0; - } - - if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility - op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag); - } else { - ldpp_dout(dpp, 20) << "state->obj_tag is empty, not appending atomic test" << dendl; - } - return 0; -} - -int RGWRados::Object::get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, RGWObjManifest **pmanifest, bool follow_olh, optional_yield y, bool assume_noent) -{ - int r = obj->get_obj_state(dpp, pstate, y, follow_olh); - if (r < 0) { - return r; - } - *pmanifest = static_cast(obj)->get_manifest(); - - return r; -} - -void RGWRados::Object::invalidate_state() -{ - obj->invalidate(); -} - -int RGWRados::Object::prepare_atomic_modification(const DoutPrefixProvider *dpp, - ObjectWriteOperation& op, bool reset_obj, const string *ptag, - const char *if_match, const char *if_nomatch, bool removal_op, - bool modify_tail, optional_yield y) -{ - int r = get_state(dpp, &state, &manifest, false, y); - if (r < 0) - return r; - - bool need_guard = ((manifest) || (state->obj_tag.length() != 0) || - if_match != NULL || if_nomatch != NULL) && - (!state->fake_tag); - - if (!state->is_atomic) { - ldpp_dout(dpp, 20) << "prepare_atomic_modification: state is not atomic. 
state=" << (void *)state << dendl; - - if (reset_obj) { - op.create(false); - store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object - } - - return 0; - } - - if (need_guard) { - /* first verify that the object wasn't replaced under */ - if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) { - op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag); - // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion - } - - if (if_match) { - if (strcmp(if_match, "*") == 0) { - // test the object is existing - if (!state->exists) { - return -ERR_PRECONDITION_FAILED; - } - } else { - bufferlist bl; - if (!state->get_attr(RGW_ATTR_ETAG, bl) || - strncmp(if_match, bl.c_str(), bl.length()) != 0) { - return -ERR_PRECONDITION_FAILED; - } - } - } - - if (if_nomatch) { - if (strcmp(if_nomatch, "*") == 0) { - // test the object is NOT existing - if (state->exists) { - return -ERR_PRECONDITION_FAILED; - } - } else { - bufferlist bl; - if (!state->get_attr(RGW_ATTR_ETAG, bl) || - strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) { - return -ERR_PRECONDITION_FAILED; - } - } - } - } - - if (reset_obj) { - if (state->exists) { - op.create(false); - store->remove_rgw_head_obj(op); - } else { - op.create(true); - } - } - - if (removal_op) { - /* the object is being removed, no need to update its tag */ - return 0; - } - - if (ptag) { - state->write_tag = *ptag; - } else { - append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32); - } - bufferlist bl; - bl.append(state->write_tag.c_str(), state->write_tag.size() + 1); - - ldpp_dout(dpp, 10) << "setting object write_tag=" << state->write_tag << dendl; - - op.setxattr(RGW_ATTR_ID_TAG, bl); - if (modify_tail) { - op.setxattr(RGW_ATTR_TAIL_TAG, bl); - } - - return 0; -} - -/** - * Set an attr on an object. - * bucket: name of the bucket holding the object - * obj: name of the object to set the attr on - * name: the attr to set - * bl: the contents of the attr - * Returns: 0 on success, -ERR# otherwise. 
- */ -int RGWRados::set_attr(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, const char *name, bufferlist& bl) -{ - map attrs; - attrs[name] = bl; - return set_attrs(dpp, rctx, bucket_info, obj, attrs, NULL, null_yield); -} - -int RGWRados::set_attrs(const DoutPrefixProvider *dpp, void *ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* src_obj, - map& attrs, - map* rmattrs, - optional_yield y) -{ - std::unique_ptr obj = src_obj->clone(); - if (obj->get_instance() == "null") { - obj->clear_instance(); - } - - rgw_rados_ref ref; - int r = get_obj_head_ref(dpp, bucket_info, obj->get_obj(), &ref); - if (r < 0) { - return r; - } - - ObjectWriteOperation op; - RGWObjState *state = NULL; - RGWObjManifest *manifest = nullptr; - - r = append_atomic_test(dpp, bucket_info, obj.get(), op, &state, &manifest, y); - if (r < 0) - return r; - - // ensure null version object exist - if (src_obj->get_instance() == "null" && !manifest) { - return -ENOENT; - } - - map::iterator iter; - if (rmattrs) { - for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) { - const string& name = iter->first; - op.rmxattr(name.c_str()); - } - } - - const rgw_bucket& bucket = obj->get_bucket()->get_key(); - - for (iter = attrs.begin(); iter != attrs.end(); ++iter) { - const string& name = iter->first; - bufferlist& bl = iter->second; - - if (!bl.length()) - continue; - - op.setxattr(name.c_str(), bl); - - if (name.compare(RGW_ATTR_DELETE_AT) == 0) { - real_time ts; - try { - decode(ts, bl); - - rgw_obj_index_key obj_key; - obj->get_key().get_index_key(&obj_key); - - obj_expirer->hint_add(dpp, ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key); - } catch (buffer::error& err) { - ldpp_dout(dpp, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl; - } - } - } - - if (!op.size()) - return 0; - - bufferlist bl; - RGWRados::Bucket bop(this, bucket_info); - RGWRados::Bucket::UpdateIndex index_op(&bop, obj->get_obj()); - - if (state) { - string tag; - append_rand_alpha(cct, tag, tag, 32); - state->write_tag = tag; - r = index_op.prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y); - - if (r < 0) - return r; - - bl.append(tag.c_str(), tag.size() + 1); - op.setxattr(RGW_ATTR_ID_TAG, bl); - } - - - real_time mtime = real_clock::now(); - struct timespec mtime_ts = real_clock::to_timespec(mtime); - op.mtime2(&mtime_ts); - auto& ioctx = ref.pool.ioctx(); - r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, null_yield); - if (state) { - if (r >= 0) { - bufferlist acl_bl = attrs[RGW_ATTR_ACL]; - bufferlist etag_bl = attrs[RGW_ATTR_ETAG]; - bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE]; - string etag = rgw_bl_str(etag_bl); - string content_type = rgw_bl_str(content_type_bl); - string storage_class; - auto iter = attrs.find(RGW_ATTR_STORAGE_CLASS); - if (iter != attrs.end()) { - storage_class = rgw_bl_str(iter->second); - } - uint64_t epoch = ioctx.get_last_version(); - int64_t poolid = ioctx.get_id(); - r = index_op.complete(dpp, poolid, epoch, state->size, state->accounted_size, - mtime, etag, content_type, storage_class, &acl_bl, - RGWObjCategory::Main, NULL); - } else { - int ret = index_op.cancel(dpp, nullptr); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl; - } - } - } - if (r < 0) - return r; - - if (state) { - state->obj_tag.swap(bl); - if (rmattrs) { - for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) { - state->attrset.erase(iter->first); - } - } - - for (iter 
= attrs.begin(); iter != attrs.end(); ++iter) { - state->attrset[iter->first] = iter->second; - } - - auto iter = state->attrset.find(RGW_ATTR_ID_TAG); - if (iter != state->attrset.end()) { - iter->second = state->obj_tag; - } - } - - return 0; -} - -int RGWRados::Object::Read::prepare(optional_yield y, const DoutPrefixProvider *dpp) -{ - RGWRados *store = source->get_store(); - CephContext *cct = store->ctx(); - - bufferlist etag; - - map::iterator iter; - - RGWObjState *astate; - RGWObjManifest *manifest = nullptr; - int r = source->get_state(dpp, &astate, &manifest, true, y); - if (r < 0) - return r; - - if (!astate->exists) { - return -ENOENT; - } - - const RGWBucketInfo& bucket_info = source->get_bucket_info(); - - state.obj = astate->obj; - store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj); - - state.cur_pool = state.head_obj.pool; - state.cur_ioctx = &state.io_ctxs[state.cur_pool]; - - r = store->get_obj_head_ioctx(dpp, bucket_info, state.obj, state.cur_ioctx); - if (r < 0) { - return r; - } - if (params.target_obj) { - *params.target_obj = state.obj; - } - if (params.attrs) { - *params.attrs = astate->attrset; - if (cct->_conf->subsys.should_gather()) { - for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) { - ldpp_dout(dpp, 20) << "Read xattr rgw_rados: " << iter->first << dendl; - } - } - } - - /* Convert all times go GMT to make them compatible */ - if (conds.mod_ptr || conds.unmod_ptr) { - obj_time_weight src_weight; - src_weight.init(astate); - src_weight.high_precision = conds.high_precision_time; - - obj_time_weight dest_weight; - dest_weight.high_precision = conds.high_precision_time; - - if (conds.mod_ptr && !conds.if_nomatch) { - dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver); - ldpp_dout(dpp, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl; - if (!(dest_weight < src_weight)) { - return -ERR_NOT_MODIFIED; - } - } - - if (conds.unmod_ptr && !conds.if_match) { - dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver); - ldpp_dout(dpp, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl; - if (dest_weight < src_weight) { - return -ERR_PRECONDITION_FAILED; - } - } - } - if (conds.if_match || conds.if_nomatch) { - r = get_attr(dpp, RGW_ATTR_ETAG, etag, y); - if (r < 0) - return r; - - if (conds.if_match) { - string if_match_str = rgw_string_unquote(conds.if_match); - ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-Match: " << if_match_str << dendl; - if (if_match_str.compare(0, etag.length(), etag.c_str(), etag.length()) != 0) { - return -ERR_PRECONDITION_FAILED; - } - } - - if (conds.if_nomatch) { - string if_nomatch_str = rgw_string_unquote(conds.if_nomatch); - ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-NoMatch: " << if_nomatch_str << dendl; - if (if_nomatch_str.compare(0, etag.length(), etag.c_str(), etag.length()) == 0) { - return -ERR_NOT_MODIFIED; - } - } - } - - if (params.obj_size) - *params.obj_size = astate->size; - if (params.lastmod) - *params.lastmod = astate->mtime; - - return 0; -} - -int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end) -{ - if (ofs < 0) { - ofs += obj_size; - if (ofs < 0) - ofs = 0; - end = obj_size - 1; - } else if (end < 0) { - end = obj_size - 1; - } - - if (obj_size > 0) { - if (ofs >= (off_t)obj_size) { - return -ERANGE; - } - if (end >= (off_t)obj_size) { - end = 
obj_size - 1; - } - } - return 0; -} - -int RGWRados::Bucket::UpdateIndex::guard_reshard(const DoutPrefixProvider *dpp, const rgw_obj& obj_instance, BucketShard **pbs, std::function call) -{ - RGWRados *store = target->get_store(); - BucketShard *bs = nullptr; - int r; - -#define NUM_RESHARD_RETRIES 10 - for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) { - int ret = get_bucket_shard(&bs, dpp); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: failed to get BucketShard object. obj=" << - obj_instance.key << ". ret=" << ret << dendl; - return ret; - } - - r = call(bs); - if (r != -ERR_BUSY_RESHARDING) { - break; - } - - ldpp_dout(dpp, 10) << - "NOTICE: resharding operation on bucket index detected, blocking. obj=" << - obj_instance.key << dendl; - - r = store->block_while_resharding(bs, obj_instance, target->bucket_info, null_yield, dpp); - if (r == -ERR_BUSY_RESHARDING) { - ldpp_dout(dpp, 10) << __func__ << - " NOTICE: block_while_resharding() still busy. obj=" << - obj_instance.key << dendl; - continue; - } else if (r < 0) { - ldpp_dout(dpp, 0) << __func__ << - " ERROR: block_while_resharding() failed. obj=" << - obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl; - return r; - } - - ldpp_dout(dpp, 20) << "reshard completion identified. obj=" << obj_instance.key << dendl; - i = 0; /* resharding is finished, make sure we can retry */ - invalidate_bs(); - } // for loop - - if (r < 0) { - ldpp_dout(dpp, 0) << "ERROR: bucket shard callback failed. obj=" << - obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl; - return r; - } - - if (pbs) { - *pbs = bs; - } - - return 0; -} - -int RGWRados::Bucket::UpdateIndex::prepare(const DoutPrefixProvider *dpp, RGWModifyOp op, const string *write_tag, optional_yield y) -{ - if (blind) { - return 0; - } - RGWRados *store = target->get_store(); - - if (write_tag && write_tag->length()) { - optag = string(write_tag->c_str(), write_tag->length()); - } else { - if (optag.empty()) { - append_rand_alpha(store->ctx(), optag, optag, 32); - } - } - - int r = guard_reshard(dpp, obj, nullptr, [&](BucketShard *bs) -> int { - return store->cls_obj_prepare_op(dpp, *bs, op, optag, obj, bilog_flags, y, zones_trace); - }); - - if (r < 0) { - return r; - } - prepared = true; - - return 0; -} - -int RGWRados::Bucket::UpdateIndex::complete(const DoutPrefixProvider *dpp, int64_t poolid, uint64_t epoch, - uint64_t size, uint64_t accounted_size, - ceph::real_time& ut, const string& etag, - const string& content_type, const string& storage_class, - bufferlist *acl_bl, - RGWObjCategory category, - list *remove_objs, const string *user_data, - bool appendable) -{ - if (blind) { - return 0; - } - RGWRados *store = target->get_store(); - BucketShard *bs = nullptr; - - int ret = get_bucket_shard(&bs, dpp); - if (ret < 0) { - ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl; - return ret; - } - - rgw_bucket_dir_entry ent; - obj.key.get_index_key(&ent.key); - ent.meta.size = size; - ent.meta.accounted_size = accounted_size; - ent.meta.mtime = ut; - ent.meta.etag = etag; - ent.meta.storage_class = storage_class; - if (user_data) - ent.meta.user_data = *user_data; - - ACLOwner owner; - if (acl_bl && acl_bl->length()) { - int ret = store->decode_policy(dpp, *acl_bl, &owner); - if (ret < 0) { - ldpp_dout(dpp, 0) << "WARNING: could not decode policy ret=" << ret << dendl; - } - } - ent.meta.owner = owner.get_id().to_str(); - ent.meta.owner_display_name = owner.get_display_name(); - ent.meta.content_type = content_type; - ent.meta.appendable = appendable; 
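[Reviewer note] prepare()/complete()/complete_del()/cancel(), moved in this hunk, form a two-phase bucket-index update keyed by a random op tag: the index entry is prepared before the head object is written, then completed on success or cancelled on failure. A simplified standalone model of that protocol (the Index struct is a hypothetical stand-in for the index shard, not the real interface):

#include <functional>
#include <random>
#include <string>

struct Index {
  // Hypothetical shard interface: remember a pending op by tag, then
  // either apply it (complete) or drop it (cancel).
  std::function<int(const std::string&)> prepare, complete, cancel;
};

static std::string random_tag(std::size_t len = 32) {
  static const char alpha[] = "abcdefghijklmnopqrstuvwxyz";
  std::mt19937 gen{std::random_device{}()};
  std::uniform_int_distribution<std::size_t> pick(0, sizeof(alpha) - 2);
  std::string s;
  for (std::size_t i = 0; i < len; ++i) s += alpha[pick(gen)];
  return s;
}

int guarded_write(Index& idx, const std::function<int()>& write_head) {
  const std::string tag = random_tag();  // cf. append_rand_alpha() above
  int r = idx.prepare(tag);
  if (r < 0) return r;
  r = write_head();
  if (r < 0) {
    idx.cancel(tag);  // best effort, as above: cancel errors are logged,
    return r;         // the head-write error is what propagates
  }
  return idx.complete(tag);
}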
- - ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace); - - add_datalog_entry(dpp, store->svc.datalog_rados, - target->bucket_info, bs->shard_id); - - return ret; -} - -int RGWRados::Bucket::UpdateIndex::complete_del(const DoutPrefixProvider *dpp, - int64_t poolid, uint64_t epoch, - real_time& removed_mtime, - list *remove_objs) -{ - if (blind) { - return 0; - } - RGWRados *store = target->get_store(); - BucketShard *bs = nullptr; - - int ret = get_bucket_shard(&bs, dpp); - if (ret < 0) { - ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl; - return ret; - } - - ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace); - - add_datalog_entry(dpp, store->svc.datalog_rados, - target->bucket_info, bs->shard_id); - - return ret; -} - - -int RGWRados::Bucket::UpdateIndex::cancel(const DoutPrefixProvider *dpp, - list *remove_objs) -{ - if (blind) { - return 0; - } - RGWRados *store = target->get_store(); - BucketShard *bs; - - int ret = guard_reshard(dpp, obj, &bs, [&](BucketShard *bs) -> int { - return store->cls_obj_complete_cancel(*bs, optag, obj, remove_objs, bilog_flags, zones_trace); - }); - - /* - * need to update data log anyhow, so that whoever follows needs to update its internal markers - * for following the specific bucket shard log. Otherwise they end up staying behind, and users - * have no way to tell that they're all caught up - */ - add_datalog_entry(dpp, store->svc.datalog_rados, - target->bucket_info, bs->shard_id); - - return ret; -} - -/* - * Read up through index `end` inclusive. Number of bytes read is up - * to `end - ofs + 1`. - */ -int RGWRados::Object::Read::read(int64_t ofs, int64_t end, - bufferlist& bl, optional_yield y, - const DoutPrefixProvider *dpp) -{ - RGWRados *store = source->get_store(); - - rgw_raw_obj read_obj; - uint64_t read_ofs = ofs; - uint64_t len, read_len; - bool reading_from_head = true; - ObjectReadOperation op; - - bool merge_bl = false; - bufferlist *pbl = &bl; - bufferlist read_bl; - uint64_t max_chunk_size; - - RGWObjState *astate; - RGWObjManifest *manifest = nullptr; - int r = source->get_state(dpp, &astate, &manifest, true, y); - if (r < 0) - return r; - - if (astate->size == 0) { - end = 0; - } else if (end >= (int64_t)astate->size) { - end = astate->size - 1; - } - - if (end < 0) - len = 0; - else - len = end - ofs + 1; - - if (manifest && manifest->has_tail()) { - /* now get the relevant object part */ - RGWObjManifest::obj_iterator iter = manifest->obj_find(dpp, ofs); - - uint64_t stripe_ofs = iter.get_stripe_ofs(); - read_obj = iter.get_location().get_raw_obj(store->driver); - len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs)); - read_ofs = iter.location_ofs() + (ofs - stripe_ofs); - reading_from_head = (read_obj == state.head_obj); - } else { - read_obj = state.head_obj; - } - - r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size, dpp); - if (r < 0) { - ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl; - return r; - } - - if (len > max_chunk_size) - len = max_chunk_size; - - - read_len = len; - - if (reading_from_head) { - /* only when reading from the head object do we need to do the atomic test */ - std::unique_ptr obj = source->bucket->get_object(state.obj.key); - r = store->append_atomic_test(dpp, source->get_bucket_info(), obj.get(), op, &astate, &manifest, y); - if (r < 0) - return r; - - if (astate && 
astate->prefetch_data) { - if (!ofs && astate->data.length() >= len) { - bl = astate->data; - return bl.length(); - } - - if (ofs < astate->data.length()) { - unsigned copy_len = std::min((uint64_t)astate->data.length() - ofs, len); - astate->data.begin(ofs).copy(copy_len, bl); - read_len -= copy_len; - read_ofs += copy_len; - if (!read_len) - return bl.length(); - - merge_bl = true; - pbl = &read_bl; - } - } - } - - ldpp_dout(dpp, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl; - op.read(read_ofs, read_len, pbl, NULL); - - if (state.cur_pool != read_obj.pool) { - auto iter = state.io_ctxs.find(read_obj.pool); - if (iter == state.io_ctxs.end()) { - state.cur_ioctx = &state.io_ctxs[read_obj.pool]; - r = store->open_pool_ctx(dpp, read_obj.pool, *state.cur_ioctx, false); - if (r < 0) { - ldpp_dout(dpp, 20) << "ERROR: failed to open pool context for pool=" << read_obj.pool << " r=" << r << dendl; - return r; - } - } else { - state.cur_ioctx = &iter->second; - } - state.cur_pool = read_obj.pool; - } - - state.cur_ioctx->locator_set_key(read_obj.loc); - - r = state.cur_ioctx->operate(read_obj.oid, &op, NULL); - ldpp_dout(dpp, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl; - - if (r < 0) { - return r; - } - - if (merge_bl) { - bl.append(read_bl); - } - - return bl.length(); -} - -int get_obj_data::flush(rgw::AioResultList&& results) { - int r = rgw::check_for_errors(results); - if (r < 0) { - return r; - } - std::list bl_list; - - auto cmp = [](const auto& lhs, const auto& rhs) { return lhs.id < rhs.id; }; - results.sort(cmp); // merge() requires results to be sorted first - completed.merge(results, cmp); // merge results in sorted order - - while (!completed.empty() && completed.front().id == offset) { - auto bl = std::move(completed.front().data); - - bl_list.push_back(bl); - offset += bl.length(); - int r = client_cb->handle_data(bl, 0, bl.length()); - if (r < 0) { - return r; - } - - if (rgwrados->get_use_datacache()) { - const std::lock_guard l(d3n_get_data.d3n_lock); - auto oid = completed.front().obj.get_ref().obj.oid; - if (bl.length() <= g_conf()->rgw_get_obj_max_req_size && !d3n_bypass_cache_write) { - lsubdout(g_ceph_context, rgw_datacache, 10) << "D3nDataCache: " << __func__ << "(): bl.length <= rgw_get_obj_max_req_size (default 4MB) - write to datacache, bl.length=" << bl.length() << dendl; - rgwrados->d3n_data_cache->put(bl, bl.length(), oid); - } else { - lsubdout(g_ceph_context, rgw_datacache, 10) << "D3nDataCache: " << __func__ << "(): not writing to datacache - bl.length > rgw_get_obj_max_req_size (default 4MB), bl.length=" << bl.length() << " or d3n_bypass_cache_write=" << d3n_bypass_cache_write << dendl; - } - } - completed.pop_front_and_dispose(std::default_delete{}); - } - return 0; -} - -static int _get_obj_iterate_cb(const DoutPrefixProvider *dpp, - const rgw_raw_obj& read_obj, off_t obj_ofs, - off_t read_ofs, off_t len, bool is_head_obj, - RGWObjState *astate, void *arg) -{ - struct get_obj_data* d = static_cast(arg); - return d->rgwrados->get_obj_iterate_cb(dpp, read_obj, obj_ofs, read_ofs, len, - is_head_obj, astate, arg); -} - -int RGWRados::get_obj_iterate_cb(const DoutPrefixProvider *dpp, - const rgw_raw_obj& read_obj, off_t obj_ofs, - off_t read_ofs, off_t len, bool is_head_obj, - RGWObjState *astate, void *arg) -{ - ObjectReadOperation op; - struct get_obj_data* d = static_cast(arg); - string oid, key; - - if (is_head_obj) { - /* only when reading from the head object do we need to do 
the atomic test */ - int r = append_atomic_test(dpp, astate, op); - if (r < 0) - return r; - - if (astate && - obj_ofs < astate->data.length()) { - unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len); - - r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len); - if (r < 0) - return r; - - len -= chunk_len; - d->offset += chunk_len; - read_ofs += chunk_len; - obj_ofs += chunk_len; - if (!len) - return 0; - } - } - - auto obj = d->rgwrados->svc.rados->obj(read_obj); - int r = obj.open(dpp); - if (r < 0) { - ldpp_dout(dpp, 4) << "failed to open rados context for " << read_obj << dendl; - return r; - } - - ldpp_dout(dpp, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl; - op.read(read_ofs, len, nullptr, nullptr); - - const uint64_t cost = len; - const uint64_t id = obj_ofs; // use logical object offset for sorting replies - - auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id); - - return d->flush(std::move(completed)); -} - -int RGWRados::Object::Read::iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb, - optional_yield y) -{ - RGWRados *store = source->get_store(); - CephContext *cct = store->ctx(); - const uint64_t chunk_size = cct->_conf->rgw_get_obj_max_req_size; - const uint64_t window_size = cct->_conf->rgw_get_obj_window_size; - - auto aio = rgw::make_throttle(window_size, y); - get_obj_data data(store, cb, &*aio, ofs, y); - - int r = store->iterate_obj(dpp, source->get_ctx(), source->get_bucket_info(), - source->get_target(), - ofs, end, chunk_size, _get_obj_iterate_cb, &data, y); - if (r < 0) { - ldpp_dout(dpp, 0) << "iterate_obj() failed with " << r << dendl; - data.cancel(); // drain completions without writing back to client - return r; - } - - return data.drain(); -} - -int RGWRados::iterate_obj(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, - RGWBucketInfo& bucket_info, rgw::sal::Object* obj, - off_t ofs, off_t end, uint64_t max_chunk_size, - iterate_obj_cb cb, void *arg, optional_yield y) -{ - rgw_raw_obj head_obj; - rgw_raw_obj read_obj; - uint64_t read_ofs = ofs; - uint64_t len; - bool reading_from_head = true; - RGWObjState *astate = NULL; - RGWObjManifest *manifest = nullptr; - - obj_to_raw(bucket_info.placement_rule, obj->get_obj(), &head_obj); - - int r = get_obj_state(dpp, &obj_ctx, bucket_info, obj, &astate, &manifest, false, y); - if (r < 0) { - return r; - } - - if (end < 0) - len = 0; - else - len = end - ofs + 1; - - if (manifest) { - /* now get the relevant object stripe */ - RGWObjManifest::obj_iterator iter = manifest->obj_find(dpp, ofs); - - RGWObjManifest::obj_iterator obj_end = manifest->obj_end(dpp); - - for (; iter != obj_end && ofs <= end; ++iter) { - off_t stripe_ofs = iter.get_stripe_ofs(); - off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size(); - - while (ofs < next_stripe_ofs && ofs <= end) { - read_obj = iter.get_location().get_raw_obj(driver); - uint64_t read_len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs)); - read_ofs = iter.location_ofs() + (ofs - stripe_ofs); - - if (read_len > max_chunk_size) { - read_len = max_chunk_size; - } - - reading_from_head = (read_obj == head_obj); - r = cb(dpp, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg); - if (r < 0) { - return r; - } - - len -= read_len; - ofs += read_len; - } - } - } else { - while (ofs <= end) { - read_obj = head_obj; - uint64_t read_len = 
std::min(len, max_chunk_size); - - r = cb(dpp, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg); - if (r < 0) { - return r; - } - - len -= read_len; - ofs += read_len; - } - } - - return 0; -} - -int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op) -{ - rgw_rados_ref ref; - int r = get_obj_head_ref(dpp, bucket_info, obj, &ref); - if (r < 0) { - return r; - } - - return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, null_yield); -} - -int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op) -{ - rgw_rados_ref ref; - int r = get_obj_head_ref(dpp, bucket_info, obj, &ref); - if (r < 0) { - return r; - } - - bufferlist outbl; - - return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, &outbl, null_yield); -} - -int RGWRados::olh_init_modification_impl(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag) -{ - ObjectWriteOperation op; - - ceph_assert(olh_obj.key.instance.empty()); - - bool has_tag = (state.exists && has_olh_tag(state.attrset)); - - if (!state.exists) { - op.create(true); - } else { - op.assert_exists(); - struct timespec mtime_ts = real_clock::to_timespec(state.mtime); - op.mtime2(&mtime_ts); - } - - /* - * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object. - * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two - * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to - * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh - * log will reflect that. - * - * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag - * is used for object data instance, olh_tag for olh instance. 
- */ - if (has_tag) { - /* guard against racing writes */ - bucket_index_guard_olh_op(dpp, state, op); - } - - if (!has_tag) { - /* obj tag */ - string obj_tag = gen_rand_alphanumeric_lower(cct, 32); - - bufferlist bl; - bl.append(obj_tag.c_str(), obj_tag.size()); - op.setxattr(RGW_ATTR_ID_TAG, bl); - - state.attrset[RGW_ATTR_ID_TAG] = bl; - state.obj_tag = bl; - - /* olh tag */ - string olh_tag = gen_rand_alphanumeric_lower(cct, 32); - - bufferlist olh_bl; - olh_bl.append(olh_tag.c_str(), olh_tag.size()); - op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl); - - state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl; - state.olh_tag = olh_bl; - state.is_olh = true; - - bufferlist verbl; - op.setxattr(RGW_ATTR_OLH_VER, verbl); - } - - bufferlist bl; - RGWOLHPendingInfo pending_info; - pending_info.time = real_clock::now(); - encode(pending_info, bl); - -#define OLH_PENDING_TAG_LEN 32 - /* tag will start with current time epoch, this so that entries are sorted by time */ - char buf[32]; - utime_t ut(pending_info.time); - snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec()); - *op_tag = buf; - - string s = gen_rand_alphanumeric_lower(cct, OLH_PENDING_TAG_LEN - op_tag->size()); - - op_tag->append(s); - - string attr_name = RGW_ATTR_OLH_PENDING_PREFIX; - attr_name.append(*op_tag); - - op.setxattr(attr_name.c_str(), bl); - - int ret = obj_operate(dpp, bucket_info, olh_obj, &op); - if (ret < 0) { - return ret; - } - - state.exists = true; - state.attrset[attr_name] = bl; - - return 0; -} - -int RGWRados::olh_init_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag) -{ - int ret; - - ret = olh_init_modification_impl(dpp, bucket_info, state, obj, op_tag); - if (ret == -EEXIST) { - ret = -ECANCELED; - } - - return ret; -} - -int RGWRados::guard_reshard(const DoutPrefixProvider *dpp, - BucketShard *bs, - const rgw_obj& obj_instance, - RGWBucketInfo& bucket_info, - std::function call) -{ - rgw_obj obj; - const rgw_obj *pobj = &obj_instance; - int r; - - for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) { - r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */, dpp); - if (r < 0) { - ldpp_dout(dpp, 5) << "bs.init() returned ret=" << r << dendl; - return r; - } - - r = call(bs); - if (r != -ERR_BUSY_RESHARDING) { - break; - } - - ldpp_dout(dpp, 10) << - "NOTICE: resharding operation on bucket index detected, blocking. obj=" << - obj_instance.key << dendl; - - r = block_while_resharding(bs, obj_instance, bucket_info, null_yield, dpp); - if (r == -ERR_BUSY_RESHARDING) { - ldpp_dout(dpp, 10) << __func__ << - " NOTICE: block_while_resharding() still busy. obj=" << - obj_instance.key << dendl; - continue; - } else if (r < 0) { - ldpp_dout(dpp, 0) << __func__ << - " ERROR: block_while_resharding() failed. obj=" << - obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl; - return r; - } - - ldpp_dout(dpp, 20) << "reshard completion identified" << dendl; - i = 0; /* resharding is finished, make sure we can retry */ - } // for loop - - if (r < 0) { - ldpp_dout(dpp, 0) << "ERROR: bucket shard callback failed. obj=" << - obj_instance.key << ". 
ret=" << cpp_strerror(-r) << dendl; - return r; - } - - return 0; -} - - -int RGWRados::block_while_resharding(RGWRados::BucketShard *bs, - const rgw_obj& obj_instance, - RGWBucketInfo& bucket_info, - optional_yield y, - const DoutPrefixProvider *dpp) -{ - int ret = 0; - cls_rgw_bucket_instance_entry entry; - - // gets loaded by fetch_new_bucket_info; can be used by - // clear_resharding - std::map bucket_attrs; - - // since we want to run this recovery code from two distinct places, - // let's just put it in a lambda so we can easily re-use; if the - // lambda successfully fetches a new bucket id, it sets - // new_bucket_id and returns 0, otherwise it returns a negative - // error code - auto fetch_new_bucket_info = - [this, bs, &obj_instance, &bucket_info, &bucket_attrs, &y, dpp](const std::string& log_tag) -> int { - int ret = get_bucket_info(&svc, bs->bucket.tenant, bs->bucket.name, - bucket_info, nullptr, y, dpp, &bucket_attrs); - if (ret < 0) { - ldpp_dout(dpp, 0) << __func__ << - " ERROR: failed to refresh bucket info after reshard at " << - log_tag << ": " << cpp_strerror(-ret) << dendl; - return ret; - } - - ret = bs->init(dpp, bucket_info, obj_instance); - if (ret < 0) { - ldpp_dout(dpp, 0) << __func__ << - " ERROR: failed to refresh bucket shard generation after reshard at " << - log_tag << ": " << cpp_strerror(-ret) << dendl; - return ret; - } - - const auto gen = bucket_info.layout.logs.empty() ? -1 : bucket_info.layout.logs.back().gen; - ldpp_dout(dpp, 20) << __func__ << - " INFO: refreshed bucket info after reshard at " << - log_tag << ". new shard_id=" << bs->shard_id << ". gen=" << gen << dendl; - - return 0; - }; // lambda fetch_new_bucket_info - - constexpr int num_retries = 10; - for (int i = 1; i <= num_retries; i++) { // nb: 1-based for loop - auto& ref = bs->bucket_obj.get_ref(); - ret = cls_rgw_get_bucket_resharding(ref.pool.ioctx(), ref.obj.oid, &entry); - if (ret == -ENOENT) { - ret = fetch_new_bucket_info("get_bucket_resharding_failed"); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << - " failed to refresh bucket info after reshard when get bucket " - "resharding failed, error: " << cpp_strerror(-ret) << dendl; - return ret; - } - } else if (ret < 0) { - ldpp_dout(dpp, 0) << __func__ << - " ERROR: failed to get bucket resharding : " << cpp_strerror(-ret) << - dendl; - return ret; - } - - if (!entry.resharding_in_progress()) { - ret = fetch_new_bucket_info("get_bucket_resharding_succeeded"); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << - " failed to refresh bucket info after reshard when get bucket " - "resharding succeeded, error: " << cpp_strerror(-ret) << dendl; - return ret; - } - } - - ldpp_dout(dpp, 20) << __func__ << " NOTICE: reshard still in progress; " << - (i < num_retries ? "retrying" : "too many retries") << dendl; - - if (i == num_retries) { - break; - } - - // If bucket is erroneously marked as resharding (e.g., crash or - // other error) then fix it. If we can take the bucket reshard - // lock then it means no other resharding should be taking place, - // and we're free to clear the flags. 
- { - // since we expect to do this rarely, we'll do our work in a - // block and erase our work after each try - - RGWObjectCtx obj_ctx(this->driver); - const rgw_bucket& b = bs->bucket; - std::string bucket_id = b.get_key(); - RGWBucketReshardLock reshard_lock(this->driver, bucket_info, true); - ret = reshard_lock.lock(dpp); - if (ret == -ENOENT) { - continue; - } else if (ret < 0) { - ldpp_dout(dpp, 20) << __func__ << - " ERROR: failed to take reshard lock for bucket " << - bucket_id << "; expected if resharding underway" << dendl; - } else { - ldpp_dout(dpp, 10) << __func__ << - " INFO: was able to take reshard lock for bucket " << - bucket_id << dendl; - // the reshard may have finished, so call clear_resharding() - // with its current bucket info; ALSO this will load - // bucket_attrs for call to clear_resharding below - ret = fetch_new_bucket_info("trying_to_clear_resharding"); - if (ret < 0) { - reshard_lock.unlock(); - ldpp_dout(dpp, 0) << __func__ << - " ERROR: failed to update bucket info before clear resharding for bucket " << - bucket_id << dendl; - continue; // try again - } - - ret = RGWBucketReshard::clear_resharding(this->driver, bucket_info, bucket_attrs, dpp); - reshard_lock.unlock(); - if (ret == -ENOENT) { - ldpp_dout(dpp, 5) << __func__ << - " INFO: no need to reset reshard flags; old shards apparently" - " removed after successful resharding of bucket " << - bucket_id << dendl; - continue; // immediately test again - } else if (ret < 0) { - ldpp_dout(dpp, 0) << __func__ << - " ERROR: failed to clear resharding flags for bucket " << - bucket_id << ", " << cpp_strerror(-ret) << dendl; - // wait and then test again - } else { - ldpp_dout(dpp, 5) << __func__ << - " INFO: apparently successfully cleared resharding flags for " - "bucket " << bucket_id << dendl; - continue; // if we apparently succeed immediately test again - } // if clear resharding succeeded - } // if taking of lock succeeded - } // block to encapsulate recovery from incomplete reshard - - ret = reshard_wait->wait(y); - if (ret < 0) { - ldpp_dout(dpp, 0) << __func__ << - " ERROR: bucket is still resharding, please retry" << dendl; - return ret; - } - } // for loop - - ldpp_dout(dpp, 0) << __func__ << - " ERROR: bucket is still resharding, please retry" << dendl; - return -ERR_BUSY_RESHARDING; -} - -int RGWRados::bucket_index_link_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, - RGWObjState& olh_state, const rgw_obj& obj_instance, - bool delete_marker, const string& op_tag, - struct rgw_bucket_dir_entry_meta *meta, - uint64_t olh_epoch, - real_time unmod_since, bool high_precision_time, - rgw_zone_set *_zones_trace, bool log_data_change) -{ - rgw_rados_ref ref; - int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref); - if (r < 0) { - return r; - } - - rgw_zone_set zones_trace; - if (_zones_trace) { - zones_trace = *_zones_trace; - } - zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key()); - - BucketShard bs(this); - - r = guard_reshard(dpp, &bs, obj_instance, bucket_info, - [&](BucketShard *bs) -> int { - cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance); - auto& ref = bs->bucket_obj.get_ref(); - librados::ObjectWriteOperation op; - op.assert_exists(); // bucket index shard must exist - cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING); - cls_rgw_bucket_link_olh(op, key, olh_state.olh_tag, - delete_marker, op_tag, meta, olh_epoch, - unmod_since, high_precision_time, - svc.zone->get_zone().log_data, zones_trace); - 
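// [Reviewer note] The write op assembled just above is all-or-nothing by
// construction:
//   op.assert_exists()                 -- the bucket index shard must exist;
//   cls_rgw_guard_bucket_resharding()  -- fail with -ERR_BUSY_RESHARDING if a
//                                         reshard is in flight, which the
//                                         surrounding guard_reshard() retry
//                                         loop turns into a wait-and-retry;
//   cls_rgw_bucket_link_olh()          -- the actual OLH link, applied only
//                                         if both guards pass.
// If any step fails, rgw_rados_operate() below returns the error and the
// shard is left unmodified.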
return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); - }); - if (r < 0) { - ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_olh() returned r=" << r << dendl; - return r; - } - - add_datalog_entry(dpp, svc.datalog_rados, bucket_info, bs.shard_id); - - return 0; -} - -void RGWRados::bucket_index_guard_olh_op(const DoutPrefixProvider *dpp, RGWObjState& olh_state, ObjectOperation& op) -{ - ldpp_dout(dpp, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl; - op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag); -} - -int RGWRados::bucket_index_unlink_instance(const DoutPrefixProvider *dpp, - RGWBucketInfo& bucket_info, - const rgw_obj& obj_instance, - const string& op_tag, const string& olh_tag, - uint64_t olh_epoch, rgw_zone_set *_zones_trace) -{ - rgw_rados_ref ref; - int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref); - if (r < 0) { - return r; - } - - rgw_zone_set zones_trace; - if (_zones_trace) { - zones_trace = *_zones_trace; - } - zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key()); - - BucketShard bs(this); - - cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance); - r = guard_reshard(dpp, &bs, obj_instance, bucket_info, - [&](BucketShard *bs) -> int { - auto& ref = bs->bucket_obj.get_ref(); - librados::ObjectWriteOperation op; - op.assert_exists(); // bucket index shard must exist - cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING); - cls_rgw_bucket_unlink_instance(op, key, op_tag, - olh_tag, olh_epoch, svc.zone->get_zone().log_data, zones_trace); - return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); - }); - if (r < 0) { - ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_instance() returned r=" << r << dendl; - return r; - } - - return 0; -} - -int RGWRados::bucket_index_read_olh_log(const DoutPrefixProvider *dpp, - RGWBucketInfo& bucket_info, RGWObjState& state, - const rgw_obj& obj_instance, uint64_t ver_marker, - std::map > *log, - bool *is_truncated) -{ - rgw_rados_ref ref; - int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref); - if (r < 0) { - return r; - } - - BucketShard bs(this); - int ret = - bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp); - if (ret < 0) { - ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl; - return ret; - } - - string olh_tag(state.olh_tag.c_str(), state.olh_tag.length()); - - cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string()); - - auto& shard_ref = bs.bucket_obj.get_ref(); - ObjectReadOperation op; - - rgw_cls_read_olh_log_ret log_ret; - int op_ret = 0; - cls_rgw_get_olh_log(op, key, ver_marker, olh_tag, log_ret, op_ret); - bufferlist outbl; - r = rgw_rados_operate(dpp, shard_ref.pool.ioctx(), shard_ref.obj.oid, &op, &outbl, null_yield); - if (r < 0) { - return r; - } - if (op_ret < 0) { - ldpp_dout(dpp, 20) << "cls_rgw_get_olh_log() returned op_ret=" << op_ret << dendl; - return op_ret; - } - - *log = std::move(log_ret.log); - *is_truncated = log_ret.is_truncated; - - return 0; -} - -// a multisite sync bug resulted in the OLH head attributes being overwritten by -// the attributes from another zone, causing link_olh() to fail endlessly due to -// olh_tag mismatch. this attempts to detect this case and reconstruct the OLH -// attributes from the bucket index. 
-
-// a multisite sync bug resulted in the OLH head attributes being overwritten by
-// the attributes from another zone, causing link_olh() to fail endlessly due to
-// olh_tag mismatch. this attempts to detect this case and reconstruct the OLH
-// attributes from the bucket index. see http://tracker.ceph.com/issues/37792
-int RGWRados::repair_olh(const DoutPrefixProvider *dpp, RGWObjState* state, const RGWBucketInfo& bucket_info,
-                         const rgw_obj& obj)
-{
-  // fetch the current olh entry from the bucket index
-  rgw_bucket_olh_entry olh;
-  int r = bi_get_olh(dpp, bucket_info, obj, &olh);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "repair_olh failed to read olh entry for " << obj << dendl;
-    return r;
-  }
-  if (olh.tag == rgw_bl_str(state->olh_tag)) { // mismatch already resolved?
-    return 0;
-  }
-
-  ldpp_dout(dpp, 4) << "repair_olh setting olh_tag=" << olh.tag
-      << " key=" << olh.key << " delete_marker=" << olh.delete_marker << dendl;
-
-  // rewrite OLH_ID_TAG and OLH_INFO from current olh
-  ObjectWriteOperation op;
-  // assert this is the same olh tag we think we're fixing
-  bucket_index_guard_olh_op(dpp, *state, op);
-  // preserve existing mtime
-  struct timespec mtime_ts = ceph::real_clock::to_timespec(state->mtime);
-  op.mtime2(&mtime_ts);
-  {
-    bufferlist bl;
-    bl.append(olh.tag.c_str(), olh.tag.size());
-    op.setxattr(RGW_ATTR_OLH_ID_TAG, bl);
-  }
-  {
-    RGWOLHInfo info;
-    info.target = rgw_obj(bucket_info.bucket, olh.key);
-    info.removed = olh.delete_marker;
-    bufferlist bl;
-    encode(info, bl);
-    op.setxattr(RGW_ATTR_OLH_INFO, bl);
-  }
-  rgw_rados_ref ref;
-  r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
-  if (r < 0) {
-    return r;
-  }
-  r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "repair_olh failed to write olh attributes with "
-        << cpp_strerror(r) << dendl;
-    return r;
-  }
-  return 0;
-}
-
-int RGWRados::bucket_index_trim_olh_log(const DoutPrefixProvider *dpp,
-                                        RGWBucketInfo& bucket_info,
-                                        RGWObjState& state,
-                                        const rgw_obj& obj_instance, uint64_t ver)
-{
-  rgw_rados_ref ref;
-  int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
-  if (r < 0) {
-    return r;
-  }
-
-  BucketShard bs(this);
-  int ret =
-    bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp);
-  if (ret < 0) {
-    ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
-    return ret;
-  }
-
-  string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
-
-  cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
-
-  ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
-                      [&](BucketShard *pbs) -> int {
-                        ObjectWriteOperation op;
-                        op.assert_exists(); // bucket index shard must exist
-                        cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
-                        cls_rgw_trim_olh_log(op, key, ver, olh_tag);
-                        return pbs->bucket_obj.operate(dpp, &op, null_yield);
-                      });
-  if (ret < 0) {
-    ldpp_dout(dpp, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
-    return ret;
-  }
-
-  return 0;
-}
-
-int RGWRados::bucket_index_clear_olh(const DoutPrefixProvider *dpp,
-                                     RGWBucketInfo& bucket_info,
-                                     RGWObjState& state,
-                                     const rgw_obj& obj_instance)
-{
-  rgw_rados_ref ref;
-  int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
-  if (r < 0) {
-    return r;
-  }
-
-  BucketShard bs(this);
-
-  string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
-
-  cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
-
-  int ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
-                          [&](BucketShard *pbs) -> int {
-                            ObjectWriteOperation op;
-                            op.assert_exists(); // bucket index shard must exist
-                            auto& ref = pbs->bucket_obj.get_ref();
-                            cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
-                            cls_rgw_clear_olh(op, key, olh_tag);
-                            return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
-                          });
-  if (ret < 0) {
-    ldpp_dout(dpp, 5) << "rgw_rados_operate() after cls_rgw_clear_olh() returned ret=" << ret << dendl;
-    return ret;
-  }
-
-  return 0;
-}
-
-static int decode_olh_info(const DoutPrefixProvider *dpp, CephContext* cct, const bufferlist& bl, RGWOLHInfo *olh)
-{
-  try {
-    auto biter = bl.cbegin();
-    decode(*olh, biter);
-    return 0;
-  } catch (buffer::error& err) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to decode olh info" << dendl;
-    return -EIO;
-  }
-}
-
-int RGWRados::apply_olh_log(const DoutPrefixProvider *dpp,
-                            RGWObjState& state,
-                            RGWBucketInfo& bucket_info,
-                            const rgw::sal::Object* obj,
-                            bufferlist& olh_tag,
-                            std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry>>& log,
-                            uint64_t *plast_ver,
-                            rgw_zone_set* zones_trace)
-{
-  if (log.empty()) {
-    return 0;
-  }
-
-  librados::ObjectWriteOperation op;
-
-  uint64_t last_ver = log.rbegin()->first;
-  *plast_ver = last_ver;
-
-  map<uint64_t, vector<rgw_bucket_olh_log_entry>>::iterator iter = log.begin();
-
-  op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
-  op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GTE, last_ver);
-
-  bufferlist ver_bl;
-  string last_ver_s = to_string(last_ver);
-  ver_bl.append(last_ver_s.c_str(), last_ver_s.size());
-  op.setxattr(RGW_ATTR_OLH_VER, ver_bl);
-
-  struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
-  op.mtime2(&mtime_ts);
-
-  bool need_to_link = false;
-  uint64_t link_epoch = 0;
-  cls_rgw_obj_key key;
-  bool delete_marker = false;
-  list<cls_rgw_obj_key> remove_instances;
-  bool need_to_remove = false;
-
-  // decode current epoch and instance
-  auto olh_ver = state.attrset.find(RGW_ATTR_OLH_VER);
-  if (olh_ver != state.attrset.end()) {
-    std::string str = olh_ver->second.to_str();
-    std::string err;
-    link_epoch = strict_strtoll(str.c_str(), 10, &err);
-  }
-  auto olh_info = state.attrset.find(RGW_ATTR_OLH_INFO);
-  if (olh_info != state.attrset.end()) {
-    RGWOLHInfo info;
-    int r = decode_olh_info(dpp, cct, olh_info->second, &info);
-    if (r < 0) {
-      return r;
-    }
-    info.target.key.get_index_key(&key);
-    delete_marker = info.removed;
-  }
-
-  for (iter = log.begin(); iter != log.end(); ++iter) {
-    vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
-    for (; viter != iter->second.end(); ++viter) {
-      rgw_bucket_olh_log_entry& entry = *viter;
-
-      ldpp_dout(dpp, 20) << "olh_log_entry: epoch=" << iter->first << " op=" << (int)entry.op
-                         << " key=" << entry.key.name << "[" << entry.key.instance << "] "
-                         << (entry.delete_marker ? "(delete)" : "") << dendl;
-      switch (entry.op) {
-      case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
-        remove_instances.push_back(entry.key);
-        break;
-      case CLS_RGW_OLH_OP_LINK_OLH:
-        // only overwrite a link of the same epoch if its key sorts before
-        if (link_epoch < iter->first || key.instance.empty() ||
-            key.instance > entry.key.instance) {
-          ldpp_dout(dpp, 20) << "apply_olh_log applying key=" << entry.key << " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
-                             << " over current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
-          need_to_link = true;
-          need_to_remove = false;
-          key = entry.key;
-          delete_marker = entry.delete_marker;
-        } else {
-          ldpp_dout(dpp, 20) << "apply_olh skipping key=" << entry.key << " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
-                             << " before current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
-        }
-        break;
-      case CLS_RGW_OLH_OP_UNLINK_OLH:
-        need_to_remove = true;
-        need_to_link = false;
-        break;
-      default:
-        ldpp_dout(dpp, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
-        return -EIO;
-      }
-      string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
-      attr_name.append(entry.op_tag);
-      op.rmxattr(attr_name.c_str());
-    }
-  }
-
-  rgw_rados_ref ref;
-  int r = get_obj_head_ref(dpp, bucket_info, obj->get_obj(), &ref);
-  if (r < 0) {
-    return r;
-  }
-
-  rgw::sal::Bucket* bucket = obj->get_bucket();
-
-  if (need_to_link) {
-    rgw_obj target(bucket->get_key(), key);
-    RGWOLHInfo info;
-    info.target = target;
-    info.removed = delete_marker;
-    bufferlist bl;
-    encode(info, bl);
-    op.setxattr(RGW_ATTR_OLH_INFO, bl);
-  }
-
-  /* first remove object instances */
-  for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
-       liter != remove_instances.end(); ++liter) {
-    cls_rgw_obj_key& key = *liter;
-    std::unique_ptr<rgw::sal::Object> obj_instance = bucket->get_object(key);
-    int ret = delete_obj(dpp, bucket_info, obj_instance.get(), 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
-    if (ret < 0 && ret != -ENOENT) {
-      ldpp_dout(dpp, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
-      return ret;
-    }
-  }
-
-  /* update olh object */
-  r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
-    return r;
-  }
-
-  r = bucket_index_trim_olh_log(dpp, bucket_info, state, obj->get_obj(), last_ver);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
-    return r;
-  }
-
-  if (need_to_remove) {
-    ObjectWriteOperation rm_op;
-
-    rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
-    rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_EQ, last_ver);
-    cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
-    rm_op.remove();
-
-    r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &rm_op, null_yield);
-    if (r == -ECANCELED) {
-      return 0; /* someone else won this race */
-    } else {
-      /*
-       * only clear if was successful, otherwise we might clobber pending operations on this object
-       */
-      r = bucket_index_clear_olh(dpp, bucket_info, state, obj->get_obj());
-      if (r < 0) {
-        ldpp_dout(dpp, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
-        return r;
-      }
-    }
-  }
-
-  return 0;
-}
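The heart of apply_olh_log() is the CLS_RGW_OLH_OP_LINK_OLH tie-break above. A compact restatement of that rule as a standalone predicate (a sketch for illustration, not a function that exists in the tree): a link entry replaces the currently-linked target when it carries a newer epoch, when nothing is linked yet, or, within the same epoch, when its instance id sorts before the current one.

  // sketch: does this LINK_OLH log entry win over the current link?
  static bool link_wins(uint64_t cur_epoch, const cls_rgw_obj_key& cur,
                        uint64_t entry_epoch, const cls_rgw_obj_key& entry)
  {
    return cur_epoch < entry_epoch ||      // strictly newer epoch
           cur.instance.empty() ||         // nothing currently linked
           cur.instance > entry.instance;  // same epoch: earlier-sorting instance wins
  }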
"(delete)" : "") << dendl; - switch (entry.op) { - case CLS_RGW_OLH_OP_REMOVE_INSTANCE: - remove_instances.push_back(entry.key); - break; - case CLS_RGW_OLH_OP_LINK_OLH: - // only overwrite a link of the same epoch if its key sorts before - if (link_epoch < iter->first || key.instance.empty() || - key.instance > entry.key.instance) { - ldpp_dout(dpp, 20) << "apply_olh_log applying key=" << entry.key << " epoch=" << iter->first << " delete_marker=" << entry.delete_marker - << " over current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl; - need_to_link = true; - need_to_remove = false; - key = entry.key; - delete_marker = entry.delete_marker; - } else { - ldpp_dout(dpp, 20) << "apply_olh skipping key=" << entry.key<< " epoch=" << iter->first << " delete_marker=" << entry.delete_marker - << " before current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl; - } - break; - case CLS_RGW_OLH_OP_UNLINK_OLH: - need_to_remove = true; - need_to_link = false; - break; - default: - ldpp_dout(dpp, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl; - return -EIO; - } - string attr_name = RGW_ATTR_OLH_PENDING_PREFIX; - attr_name.append(entry.op_tag); - op.rmxattr(attr_name.c_str()); - } - } - - rgw_rados_ref ref; - int r = get_obj_head_ref(dpp, bucket_info, obj->get_obj(), &ref); - if (r < 0) { - return r; - } - - rgw::sal::Bucket* bucket = obj->get_bucket(); - - if (need_to_link) { - rgw_obj target(bucket->get_key(), key); - RGWOLHInfo info; - info.target = target; - info.removed = delete_marker; - bufferlist bl; - encode(info, bl); - op.setxattr(RGW_ATTR_OLH_INFO, bl); - } - - /* first remove object instances */ - for (list::iterator liter = remove_instances.begin(); - liter != remove_instances.end(); ++liter) { - cls_rgw_obj_key& key = *liter; - std::unique_ptr obj_instance = bucket->get_object(key); - int ret = delete_obj(dpp, bucket_info, obj_instance.get(), 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace); - if (ret < 0 && ret != -ENOENT) { - ldpp_dout(dpp, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl; - return ret; - } - } - - /* update olh object */ - r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); - if (r < 0) { - ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl; - return r; - } - - r = bucket_index_trim_olh_log(dpp, bucket_info, state, obj->get_obj(), last_ver); - if (r < 0) { - ldpp_dout(dpp, 0) << "ERROR: could not trim olh log, r=" << r << dendl; - return r; - } - - if (need_to_remove) { - ObjectWriteOperation rm_op; - - rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag); - rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_EQ, last_ver); - cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */ - rm_op.remove(); - - r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &rm_op, null_yield); - if (r == -ECANCELED) { - return 0; /* someone else won this race */ - } else { - /* - * only clear if was successful, otherwise we might clobber pending operations on this object - */ - r = bucket_index_clear_olh(dpp, bucket_info, state, obj->get_obj()); - if (r < 0) { - ldpp_dout(dpp, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl; - return r; - } - } - } - - return 0; -} - -/* - * read olh log and apply it - */ -int RGWRados::update_olh(const DoutPrefixProvider *dpp, RGWObjState 
-
-int RGWRados::set_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx,
-                      RGWBucketInfo& bucket_info,
-                      rgw::sal::Object* target_obj, bool delete_marker,
-                      rgw_bucket_dir_entry_meta *meta,
-                      uint64_t olh_epoch, real_time unmod_since, bool high_precision_time,
-                      optional_yield y, rgw_zone_set *zones_trace, bool log_data_change)
-{
-  string op_tag;
-
-  std::unique_ptr<rgw::sal::Object> olh_obj = target_obj->clone();
-  olh_obj->clear_instance();
-
-  RGWObjState *state = NULL;
-  RGWObjManifest *manifest = nullptr;
-
-  int ret = 0;
-  int i;
-
-#define MAX_ECANCELED_RETRY 100
-  for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
-    if (ret == -ECANCELED) {
-      olh_obj->invalidate();
-    }
-
-    ret = get_obj_state(dpp, &obj_ctx, bucket_info, olh_obj.get(), &state, &manifest, false, y); /* don't follow olh */
-    if (ret < 0) {
-      return ret;
-    }
-
-    ret = olh_init_modification(dpp, bucket_info, *state, olh_obj->get_obj(), &op_tag);
-    if (ret < 0) {
-      ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
-      if (ret == -ECANCELED) {
-        continue;
-      }
-      return ret;
-    }
-    ret = bucket_index_link_olh(dpp, bucket_info, *state, target_obj->get_obj(),
-                                delete_marker, op_tag, meta, olh_epoch, unmod_since,
-                                high_precision_time, zones_trace, log_data_change);
-    if (ret < 0) {
-      ldpp_dout(dpp, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
-      if (ret == -ECANCELED) {
-        // the bucket index rejected the link_olh() due to olh tag mismatch;
-        // attempt to reconstruct olh head attributes based on the bucket index
-        int r2 = repair_olh(dpp, state, bucket_info, olh_obj->get_obj());
-        if (r2 < 0 && r2 != -ECANCELED) {
-          return r2;
-        }
-        continue;
-      }
-      return ret;
-    }
-    break;
-  }
-
-  if (i == MAX_ECANCELED_RETRY) {
-    ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
-    return -EIO;
-  }
-
-  ret = update_olh(dpp, state, bucket_info, olh_obj.get());
-  if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
-    ret = 0;
-  }
-  if (ret < 0) {
-    ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
-    return ret;
-  }
-
-  return 0;
-}
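set_olh() above (and unlink_obj_instance() below) retry up to MAX_ECANCELED_RETRY times because -ECANCELED signals a lost race, not a hard error: the cmpxattr guards on the OLH attributes failed because another writer got there first. Reduced to its essentials, the loop looks like this (a sketch; try_modification() is a hypothetical stand-in for the steps shown above):

  int ret = 0;
  for (int i = 0; i < MAX_ECANCELED_RETRY; ++i) {
    if (ret == -ECANCELED) {
      olh_obj->invalidate();     // drop cached state so it is re-read
    }
    ret = try_modification();    // hypothetical: init + link/unlink as above
    if (ret != -ECANCELED) {
      break;                     // success, or a real error to propagate
    }
  }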
returned " << ret << dendl; - if (ret == -ECANCELED) { - continue; - } - return ret; - } - - string olh_tag(state->olh_tag.c_str(), state->olh_tag.length()); - - ret = bucket_index_unlink_instance(dpp, bucket_info, target_obj->get_obj(), op_tag, olh_tag, olh_epoch, zones_trace); - if (ret < 0) { - ldpp_dout(dpp, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl; - if (ret == -ECANCELED) { - continue; - } - return ret; - } - break; - } - - if (i == MAX_ECANCELED_RETRY) { - ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl; - return -EIO; - } - - ret = update_olh(dpp, state, bucket_info, olh_obj.get(), zones_trace); - if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */ - return 0; - } - if (ret < 0) { - ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl; - return ret; - } - - return 0; -} - -void RGWRados::gen_rand_obj_instance_name(rgw_obj_key *target_key) -{ -#define OBJ_INSTANCE_LEN 32 - char buf[OBJ_INSTANCE_LEN + 1]; - - gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped, - no underscore for instance name due to the way we encode the raw keys */ - - target_key->set_instance(buf); -} - -void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj) -{ - gen_rand_obj_instance_name(&target_obj->key); -} - -int RGWRados::get_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh) -{ - map attrset; - - ObjectReadOperation op; - op.getxattrs(&attrset, NULL); - - int r = obj_operate(dpp, bucket_info, obj, &op); - if (r < 0) { - return r; - } - - auto iter = attrset.find(RGW_ATTR_OLH_INFO); - if (iter == attrset.end()) { /* not an olh */ - return -EINVAL; - } - - return decode_olh_info(dpp, cct, iter->second, olh); -} - -void RGWRados::check_pending_olh_entries(const DoutPrefixProvider *dpp, - map& pending_entries, - map *rm_pending_entries) -{ - map::iterator iter = pending_entries.begin(); - - real_time now = real_clock::now(); - - while (iter != pending_entries.end()) { - auto biter = iter->second.cbegin(); - RGWOLHPendingInfo pending_info; - try { - decode(pending_info, biter); - } catch (buffer::error& err) { - /* skipping bad entry, we could remove it but it might hide a bug */ - ldpp_dout(dpp, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl; - ++iter; - continue; - } - - map::iterator cur_iter = iter; - ++iter; - if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) { - (*rm_pending_entries)[cur_iter->first] = cur_iter->second; - pending_entries.erase(cur_iter); - } else { - /* entries names are sorted by time (rounded to a second) */ - break; - } - } -} - -int RGWRados::remove_olh_pending_entries(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map& pending_attrs) -{ - rgw_rados_ref ref; - int r = get_obj_head_ref(dpp, bucket_info, olh_obj, &ref); - if (r < 0) { - return r; - } - - // trim no more than 1000 entries per osd op - constexpr int max_entries = 1000; - - auto i = pending_attrs.begin(); - while (i != pending_attrs.end()) { - ObjectWriteOperation op; - bucket_index_guard_olh_op(dpp, state, op); - - for (int n = 0; n < max_entries && i != pending_attrs.end(); ++n, ++i) { - op.rmxattr(i->first.c_str()); - } - - r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); - if (r == -ENOENT 
-
-int RGWRados::remove_olh_pending_entries(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
-{
-  rgw_rados_ref ref;
-  int r = get_obj_head_ref(dpp, bucket_info, olh_obj, &ref);
-  if (r < 0) {
-    return r;
-  }
-
-  // trim no more than 1000 entries per osd op
-  constexpr int max_entries = 1000;
-
-  auto i = pending_attrs.begin();
-  while (i != pending_attrs.end()) {
-    ObjectWriteOperation op;
-    bucket_index_guard_olh_op(dpp, state, op);
-
-    for (int n = 0; n < max_entries && i != pending_attrs.end(); ++n, ++i) {
-      op.rmxattr(i->first.c_str());
-    }
-
-    r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
-    if (r == -ENOENT || r == -ECANCELED) {
-      /* raced with some other change, shouldn't sweat about it */
-      return 0;
-    }
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
-      return r;
-    }
-  }
-  return 0;
-}
-
-int RGWRados::follow_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, rgw::sal::Object* olh_obj, rgw_obj *target)
-{
-  map<string, bufferlist> pending_entries;
-  rgw_filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
-
-  map<string, bufferlist> rm_pending_entries;
-  check_pending_olh_entries(dpp, pending_entries, &rm_pending_entries);
-
-  if (!rm_pending_entries.empty()) {
-    int ret = remove_olh_pending_entries(dpp, bucket_info, *state, olh_obj->get_obj(), rm_pending_entries);
-    if (ret < 0) {
-      ldpp_dout(dpp, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
-      return ret;
-    }
-  }
-  if (!pending_entries.empty()) {
-    ldpp_dout(dpp, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj->get_bucket() << dendl;
-
-    int ret = update_olh(dpp, state, bucket_info, olh_obj);
-    if (ret < 0) {
-      if (ret == -ECANCELED) {
-        // In this context, ECANCELED means that the OLH tag changed in either the bucket index entry or the OLH object.
-        // If the OLH tag changed, it indicates that a previous OLH entry was removed since this request started. We
-        // return ENOENT to indicate that the OLH object was removed.
-        ret = -ENOENT;
-      }
-      return ret;
-    }
-  }
-
-  auto iter = state->attrset.find(RGW_ATTR_OLH_INFO);
-  if (iter == state->attrset.end()) {
-    return -EINVAL;
-  }
-
-  RGWOLHInfo olh;
-  int ret = decode_olh_info(dpp, cct, iter->second, &olh);
-  if (ret < 0) {
-    return ret;
-  }
-
-  if (olh.removed) {
-    return -ENOENT;
-  }
-
-  *target = olh.target;
-
-  return 0;
-}
-
-int RGWRados::raw_obj_stat(const DoutPrefixProvider *dpp,
-                           rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
-                           map<string, bufferlist> *attrs, bufferlist *first_chunk,
-                           RGWObjVersionTracker *objv_tracker, optional_yield y)
-{
-  rgw_rados_ref ref;
-  int r = get_raw_obj_ref(dpp, obj, &ref);
-  if (r < 0) {
-    return r;
-  }
-
-  map<string, bufferlist> unfiltered_attrset;
-  uint64_t size = 0;
-  struct timespec mtime_ts;
-
-  ObjectReadOperation op;
-  if (objv_tracker) {
-    objv_tracker->prepare_op_for_read(&op);
-  }
-  if (attrs) {
-    op.getxattrs(&unfiltered_attrset, NULL);
-  }
-  if (psize || pmtime) {
-    op.stat2(&size, &mtime_ts, NULL);
-  }
-  if (first_chunk) {
-    op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
-  }
-  bufferlist outbl;
-  r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, &outbl, y);
-
-  if (epoch) {
-    *epoch = ref.pool.ioctx().get_last_version();
-  }
-
-  if (r < 0)
-    return r;
-
-  if (psize)
-    *psize = size;
-  if (pmtime)
-    *pmtime = ceph::real_clock::from_timespec(mtime_ts);
-  if (attrs) {
-    rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
-  }
-
-  return 0;
-}
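raw_obj_stat() shows the usual librados batching trick: several reads are queued on one ObjectReadOperation so they execute in a single OSD round trip. A self-contained sketch of the same idiom against a plain IoCtx (ioctx and the oid are assumptions for illustration):

  librados::ObjectReadOperation op;
  uint64_t size = 0;
  struct timespec mtime_ts;
  std::map<std::string, ceph::bufferlist> xattrs;
  op.stat2(&size, &mtime_ts, nullptr);  // size + mtime ...
  op.getxattrs(&xattrs, nullptr);       // ... and xattrs in the same round trip
  ceph::bufferlist outbl;
  int r = ioctx.operate("some_oid", &op, &outbl);  // one network hop total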
-
-int RGWRados::get_bucket_stats(const DoutPrefixProvider *dpp,
-                               RGWBucketInfo& bucket_info,
-                               const rgw::bucket_index_layout_generation& idx_layout,
-                               int shard_id, string *bucket_ver, string *master_ver,
-                               map<RGWObjCategory, RGWStorageStats>& stats,
-                               string *max_marker, bool *syncstopped)
-{
-  vector<rgw_bucket_dir_header> headers;
-  map<int, string> bucket_instance_ids;
-  int r = cls_bucket_head(dpp, bucket_info, idx_layout, shard_id, headers, &bucket_instance_ids);
-  if (r < 0) {
-    return r;
-  }
-
-  ceph_assert(headers.size() == bucket_instance_ids.size());
-
-  auto iter = headers.begin();
-  map<int, string>::iterator viter = bucket_instance_ids.begin();
-  BucketIndexShardsManager ver_mgr;
-  BucketIndexShardsManager master_ver_mgr;
-  BucketIndexShardsManager marker_mgr;
-  char buf[64];
-  for(; iter != headers.end(); ++iter, ++viter) {
-    accumulate_raw_stats(*iter, stats);
-    snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->ver);
-    ver_mgr.add(viter->first, string(buf));
-    snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->master_ver);
-    master_ver_mgr.add(viter->first, string(buf));
-    if (shard_id >= 0) {
-      *max_marker = iter->max_marker;
-    } else {
-      marker_mgr.add(viter->first, iter->max_marker);
-    }
-    if (syncstopped != NULL)
-      *syncstopped = iter->syncstopped;
-  }
-  ver_mgr.to_string(bucket_ver);
-  master_ver_mgr.to_string(master_ver);
-  if (shard_id < 0) {
-    marker_mgr.to_string(max_marker);
-  }
-  return 0;
-}
-
-class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
-  RGWGetBucketStats_CB *cb;
-  uint32_t pendings;
-  map<RGWObjCategory, RGWStorageStats> stats;
-  int ret_code;
-  bool should_cb;
-  ceph::mutex lock = ceph::make_mutex("RGWGetBucketStatsContext");
-
-public:
-  RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
-    : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true)
-  {}
-
-  void handle_response(int r, rgw_bucket_dir_header& header) override {
-    std::lock_guard l{lock};
-    if (should_cb) {
-      if (r >= 0) {
-        accumulate_raw_stats(header, stats);
-      } else {
-        ret_code = r;
-      }
-
-      // Are we all done?
-      if (--pendings == 0) {
-        if (!ret_code) {
-          cb->set_response(&stats);
-        }
-        cb->handle_response(ret_code);
-        cb->put();
-      }
-    }
-  }
-
-  void unset_cb() {
-    std::lock_guard l{lock};
-    should_cb = false;
-  }
-};
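RGWGetBucketStatsContext fans one user callback out of N per-shard replies: each handle_response() folds a shard header into the running totals, and only the reply that drops pendings to zero fires the user callback, under the mutex. The countdown pattern in miniature (a sketch reusing the member names above; accumulate() stands in for accumulate_raw_stats()):

  // sketch: called once per shard reply, fires the user callback exactly once
  void on_shard_done(int r, rgw_bucket_dir_header& header) {
    std::lock_guard l{lock};
    if (r >= 0) accumulate(header);   // fold this shard into the totals
    else ret_code = r;                // remember the first/last failure
    if (--pendings == 0) {
      cb->handle_response(ret_code);  // last shard in: report the aggregate
      cb->put();
    }
  }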
-
-int RGWRados::get_bucket_stats_async(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetBucketStats_CB *ctx)
-{
-  int num_aio = 0;
-  RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.layout.current_index.layout.normal.num_shards ? : 1);
-  ceph_assert(get_ctx);
-  int r = cls_bucket_head_async(dpp, bucket_info, idx_layout, shard_id, get_ctx, &num_aio);
-  if (r < 0) {
-    ctx->put();
-    if (num_aio) {
-      get_ctx->unset_cb();
-    }
-  }
-  get_ctx->put();
-  return r;
-}
-
-int RGWRados::get_bucket_instance_info(const string& meta_key,
-                                       RGWBucketInfo& info,
-                                       real_time *pmtime,
-                                       map<string, bufferlist> *pattrs,
-                                       optional_yield y,
-                                       const DoutPrefixProvider *dpp)
-{
-  rgw_bucket bucket;
-  rgw_bucket_parse_bucket_key(cct, meta_key, &bucket, nullptr);
-
-  return get_bucket_instance_info(bucket, info, pmtime, pattrs, y, dpp);
-}
-
-int RGWRados::get_bucket_instance_info(const rgw_bucket& bucket, RGWBucketInfo& info,
-                                       real_time *pmtime, map<string, bufferlist> *pattrs, optional_yield y,
-                                       const DoutPrefixProvider *dpp)
-{
-  return ctl.bucket->read_bucket_instance_info(bucket, &info,
-                                               y,
-                                               dpp,
-                                               RGWBucketCtl::BucketInstance::GetParams()
-                                               .set_mtime(pmtime)
-                                               .set_attrs(pattrs));
-}
-
-int RGWRados::get_bucket_info(RGWServices *svc,
-                              const string& tenant, const string& bucket_name,
-                              RGWBucketInfo& info,
-                              real_time *pmtime,
-                              optional_yield y,
-                              const DoutPrefixProvider *dpp, map<string, bufferlist> *pattrs)
-{
-  rgw_bucket bucket;
-  bucket.tenant = tenant;
-  bucket.name = bucket_name;
-  return ctl.bucket->read_bucket_info(bucket, &info, y, dpp,
-                                      RGWBucketCtl::BucketInstance::GetParams()
-                                      .set_mtime(pmtime)
-                                      .set_attrs(pattrs));
-}
-
-int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info,
-                                      ceph::real_time *pmtime,
-                                      const DoutPrefixProvider *dpp,
-                                      map<string, bufferlist> *pattrs)
-{
-  rgw_bucket bucket = info.bucket;
-  bucket.bucket_id.clear();
-
-  auto rv = info.objv_tracker.read_version;
-
-  return ctl.bucket->read_bucket_info(bucket, &info, null_yield, dpp,
-                                      RGWBucketCtl::BucketInstance::GetParams()
-                                      .set_mtime(pmtime)
-                                      .set_attrs(pattrs)
-                                      .set_refresh_version(rv));
-}
-
-int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
-                                       real_time mtime, map<string, bufferlist> *pattrs,
-                                       const DoutPrefixProvider *dpp)
-{
-  return ctl.bucket->store_bucket_instance_info(info.bucket, info, null_yield, dpp,
-                                                RGWBucketCtl::BucketInstance::PutParams()
-                                                .set_exclusive(exclusive)
-                                                .set_mtime(mtime)
-                                                .set_attrs(pattrs));
-}
-
-int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
-                                     map<string, bufferlist> *pattrs, bool create_entry_point,
-                                     const DoutPrefixProvider *dpp)
-{
-  bool create_head = !info.has_instance_obj || create_entry_point;
-
-  int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs, dpp);
-  if (ret < 0) {
-    return ret;
-  }
-
-  if (!create_head)
-    return 0; /* done! */
-
-  RGWBucketEntryPoint entry_point;
-  entry_point.bucket = info.bucket;
-  entry_point.owner = info.owner;
-  entry_point.creation_time = info.creation_time;
-  entry_point.linked = true;
-  RGWObjVersionTracker ot;
-  if (pep_objv && !pep_objv->tag.empty()) {
-    ot.write_version = *pep_objv;
-  } else {
-    ot.generate_new_write_ver(cct);
-    if (pep_objv) {
-      *pep_objv = ot.write_version;
-    }
-  }
-  ret = ctl.bucket->store_bucket_entrypoint_info(info.bucket, entry_point, null_yield, dpp, RGWBucketCtl::Bucket::PutParams()
-                                                 .set_exclusive(exclusive)
-                                                 .set_objv_tracker(&ot)
-                                                 .set_mtime(mtime));
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-
-int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m, const DoutPrefixProvider *dpp)
-{
-  map<string, RGWBucketEnt>::iterator iter;
-  for (iter = m.begin(); iter != m.end(); ++iter) {
-    RGWBucketEnt& ent = iter->second;
-    rgw_bucket& bucket = ent.bucket;
-    ent.count = 0;
-    ent.size = 0;
-    ent.size_rounded = 0;
-
-    vector<rgw_bucket_dir_header> headers;
-
-    RGWBucketInfo bucket_info;
-    int ret = get_bucket_instance_info(bucket, bucket_info, NULL, NULL, null_yield, dpp);
-    if (ret < 0) {
-      return ret;
-    }
-
-    int r = cls_bucket_head(dpp, bucket_info, bucket_info.layout.current_index, RGW_NO_SHARD, headers);
-    if (r < 0)
-      return r;
-
-    auto hiter = headers.begin();
-    for (; hiter != headers.end(); ++hiter) {
-      RGWObjCategory category = main_category;
-      auto iter = (hiter->stats).find(category);
-      if (iter != hiter->stats.end()) {
-        struct rgw_bucket_category_stats& stats = iter->second;
-        ent.count += stats.num_entries;
-        ent.size += stats.total_size;
-        ent.size_rounded += stats.total_size_rounded;
-      }
-    }
-
-    // fill in placement_rule from the bucket instance for use in swift's
-    // per-storage policy statistics
-    ent.placement_rule = std::move(bucket_info.placement_rule);
-  }
-
-  return m.size();
-}
-
-int RGWRados::append_async(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, size_t size, bufferlist& bl)
-{
-  rgw_rados_ref ref;
-  int r = get_raw_obj_ref(dpp, obj, &ref);
-  if (r < 0) {
-    return r;
-  }
-  librados::Rados *rad = get_rados_handle();
-  librados::AioCompletion *completion = rad->aio_create_completion(nullptr, nullptr);
-
-  r = ref.pool.ioctx().aio_append(ref.obj.oid, completion, bl, size);
-  completion->release();
-  return r;
-}
-
-int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, RGWPoolIterCtx& ctx)
-{
-  librados::IoCtx& io_ctx = ctx.io_ctx;
-  librados::NObjectIterator& iter = ctx.iter;
-
-  int r = open_pool_ctx(dpp, pool, io_ctx, false);
-  if (r < 0)
-    return r;
-
-  iter = io_ctx.nobjects_begin();
-
-  return 0;
-}
-
-int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
-{
-  librados::IoCtx& io_ctx = ctx.io_ctx;
-  librados::NObjectIterator& iter = ctx.iter;
-
-  int r = open_pool_ctx(dpp, pool, io_ctx, false);
-  if (r < 0)
-    return r;
-
-  librados::ObjectCursor oc;
-  if (!oc.from_str(cursor)) {
-    ldpp_dout(dpp, 10) << "failed to parse cursor: " << cursor << dendl;
-    return -EINVAL;
-  }
-
-  try {
-    iter = io_ctx.nobjects_begin(oc);
-    return 0;
-  } catch (const std::system_error& e) {
-    r = -e.code().value();
-    ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
-                       << ", returning " << r << dendl;
-    return r;
-  } catch (const std::exception& e) {
-    ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
-                       << ", returning -5" << dendl;
-    return -EIO;
-  }
-}
-
-string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
-{
-  return ctx.iter.get_cursor().to_str();
-}
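The begin/cursor pair above lets a listing resume where it left off across calls (or processes). A hypothetical caller checkpoints like this (sketch; saved_cursor and its persistence are assumptions):

  RGWPoolIterCtx ctx;
  int r = saved_cursor.empty()
    ? pool_iterate_begin(dpp, pool, ctx)                // fresh listing
    : pool_iterate_begin(dpp, pool, saved_cursor, ctx); // resume from checkpoint
  // ... drain a batch of entries via pool_iterate() ...
  std::string checkpoint = pool_iterate_get_cursor(ctx); // persist for next run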
-
-static int do_pool_iterate(const DoutPrefixProvider *dpp, CephContext* cct, RGWPoolIterCtx& ctx, uint32_t num,
-                           vector<rgw_bucket_dir_entry>& objs,
-                           bool *is_truncated, RGWAccessListFilter *filter)
-{
-  librados::IoCtx& io_ctx = ctx.io_ctx;
-  librados::NObjectIterator& iter = ctx.iter;
-
-  if (iter == io_ctx.nobjects_end())
-    return -ENOENT;
-
-  uint32_t i;
-
-  for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
-    rgw_bucket_dir_entry e;
-
-    string oid = iter->get_oid();
-    ldpp_dout(dpp, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
-
-    // fill it in with initial values; we may correct later
-    if (filter && !filter->filter(oid, oid))
-      continue;
-
-    e.key = oid;
-    objs.push_back(e);
-  }
-
-  if (is_truncated)
-    *is_truncated = (iter != io_ctx.nobjects_end());
-
-  return objs.size();
-}
-
-int RGWRados::pool_iterate(const DoutPrefixProvider *dpp, RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
-                           bool *is_truncated, RGWAccessListFilter *filter)
-{
-  // catch exceptions from NObjectIterator::operator++()
-  try {
-    return do_pool_iterate(dpp, cct, ctx, num, objs, is_truncated, filter);
-  } catch (const std::system_error& e) {
-    int r = -e.code().value();
-    ldpp_dout(dpp, 10) << "NObjectIterator threw exception " << e.what()
-                       << ", returning " << r << dendl;
-    return r;
-  } catch (const std::exception& e) {
-    ldpp_dout(dpp, 10) << "NObjectIterator threw exception " << e.what()
-                       << ", returning -5" << dendl;
-    return -EIO;
-  }
-}
-
-int RGWRados::list_raw_objects_init(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
-{
-  if (!ctx->initialized) {
-    int r = pool_iterate_begin(dpp, pool, marker, ctx->iter_ctx);
-    if (r < 0) {
-      ldpp_dout(dpp, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
-      return r;
-    }
-    ctx->initialized = true;
-  }
-  return 0;
-}
-
-int RGWRados::list_raw_objects_next(const DoutPrefixProvider *dpp, const string& prefix_filter, int max,
-                                    RGWListRawObjsCtx& ctx, list<string>& oids,
-                                    bool *is_truncated)
-{
-  if (!ctx.initialized) {
-    return -EINVAL;
-  }
-  RGWAccessListFilterPrefix filter(prefix_filter);
-  vector<rgw_bucket_dir_entry> objs;
-  int r = pool_iterate(dpp, ctx.iter_ctx, max, objs, is_truncated, &filter);
-  if (r < 0) {
-    if (r != -ENOENT)
-      ldpp_dout(dpp, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
-    return r;
-  }
-
-  vector<rgw_bucket_dir_entry>::iterator iter;
-  for (iter = objs.begin(); iter != objs.end(); ++iter) {
-    oids.push_back(iter->key.name);
-  }
-
-  return oids.size();
-}
-
-int RGWRados::list_raw_objects(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& prefix_filter,
-                               int max, RGWListRawObjsCtx& ctx, list<string>& oids,
-                               bool *is_truncated)
-{
-  if (!ctx.initialized) {
-    int r = list_raw_objects_init(dpp, pool, string(), &ctx);
-    if (r < 0) {
-      return r;
-    }
-  }
-
-  return list_raw_objects_next(dpp, prefix_filter, max, ctx, oids, is_truncated);
-}
-
-string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
-{
-  return pool_iterate_get_cursor(ctx.iter_ctx);
-}
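pool_iterate() above deliberately converts librados iterator exceptions into negative errno values so the rest of RGW can stay exception-free. The shape of that translation, isolated (sketch; do_work() is a hypothetical stand-in):

  try {
    return do_work();             // may throw from NObjectIterator::operator++()
  } catch (const std::system_error& e) {
    return -e.code().value();     // preserve the original errno
  } catch (const std::exception& e) {
    return -EIO;                  // anything else becomes -EIO (-5)
  }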
-
-int RGWRados::bi_get_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
-                              rgw_bucket_dir_entry *dirent)
-{
-  rgw_cls_bi_entry bi_entry;
-  int r = bi_get(dpp, bucket_info, obj, BIIndexType::Instance, &bi_entry);
-  if (r < 0 && r != -ENOENT) {
-    ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl;
-  }
-  if (r < 0) {
-    return r;
-  }
-  auto iter = bi_entry.data.cbegin();
-  try {
-    decode(*dirent, iter);
-  } catch (buffer::error& err) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl;
-    return -EIO;
-  }
-
-  return 0;
-}
-
-int RGWRados::bi_get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
-                         rgw_bucket_olh_entry *olh)
-{
-  rgw_cls_bi_entry bi_entry;
-  int r = bi_get(dpp, bucket_info, obj, BIIndexType::OLH, &bi_entry);
-  if (r < 0 && r != -ENOENT) {
-    ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl;
-  }
-  if (r < 0) {
-    return r;
-  }
-  auto iter = bi_entry.data.cbegin();
-  try {
-    decode(*olh, iter);
-  } catch (buffer::error& err) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl;
-    return -EIO;
-  }
-
-  return 0;
-}
-
-int RGWRados::bi_get(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
-                     BIIndexType index_type, rgw_cls_bi_entry *entry)
-{
-  BucketShard bs(this);
-  int ret = bs.init(dpp, bucket_info, obj);
-  if (ret < 0) {
-    ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
-    return ret;
-  }
-
-  cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
-
-  auto& ref = bs.bucket_obj.get_ref();
-
-  return cls_rgw_bi_get(ref.pool.ioctx(), ref.obj.oid, index_type, key, entry);
-}
-
-void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
-{
-  auto& ref = bs.bucket_obj.get_ref();
-  cls_rgw_bi_put(op, ref.obj.oid, entry);
-}
-
-int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
-{
-  auto& ref = bs.bucket_obj.get_ref();
-  int ret = cls_rgw_bi_put(ref.pool.ioctx(), ref.obj.oid, entry);
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-
-int RGWRados::bi_put(const DoutPrefixProvider *dpp, rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
-{
-  // make sure incomplete multipart uploads are hashed correctly
-  if (obj.key.ns == RGW_OBJ_NS_MULTIPART) {
-    RGWMPObj mp;
-    mp.from_meta(obj.key.name);
-    obj.index_hash_source = mp.get_key();
-  }
-  BucketShard bs(this);
-
-  int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp);
-  if (ret < 0) {
-    ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
-    return ret;
-  }
-
-  return bi_put(bs, entry);
-}
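The bi_put() overload above pins every index entry belonging to one multipart upload onto the same bucket index shard: instead of hashing the raw `.meta` oid, it hashes the upload's base key recovered by RGWMPObj::from_meta(). Roughly (a restatement of the lines above, with the effect spelled out):

  // sketch: shard selection for an in-flight multipart entry
  if (obj.key.ns == RGW_OBJ_NS_MULTIPART) {
    RGWMPObj mp;
    mp.from_meta(obj.key.name);           // parse "<object>.<upload_id>.meta"
    obj.index_hash_source = mp.get_key(); // hash on the upload's base key, so
  }                                       // meta and parts land on one shard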
-
-int RGWRados::bi_list(const DoutPrefixProvider *dpp, rgw_bucket& bucket,
-                      const string& obj_name_filter, const string& marker, uint32_t max,
-                      list<rgw_cls_bi_entry> *entries, bool *is_truncated)
-{
-  rgw_obj obj(bucket, obj_name_filter);
-  BucketShard bs(this);
-  int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp);
-  if (ret < 0) {
-    ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
-    return ret;
-  }
-
-  auto& ref = bs.bucket_obj.get_ref();
-  ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name_filter, marker, max, entries, is_truncated);
-  if (ret == -ENOENT) {
-    *is_truncated = false;
-  }
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-
-int RGWRados::bi_list(BucketShard& bs, const string& obj_name_filter, const string& marker, uint32_t max,
-                      list<rgw_cls_bi_entry> *entries, bool *is_truncated)
-{
-  auto& ref = bs.bucket_obj.get_ref();
-  int ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name_filter, marker, max, entries, is_truncated);
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-
-int RGWRados::bi_list(const DoutPrefixProvider *dpp,
-                      const RGWBucketInfo& bucket_info, int shard_id, const string& obj_name_filter, const string& marker, uint32_t max,
-                      list<rgw_cls_bi_entry> *entries, bool *is_truncated)
-{
-  BucketShard bs(this);
-  int ret = bs.init(dpp, bucket_info,
-                    bucket_info.layout.current_index,
-                    shard_id);
-  if (ret < 0) {
-    ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
-    return ret;
-  }
-
-  return bi_list(bs, obj_name_filter, marker, max, entries, is_truncated);
-}
-
-int RGWRados::bi_remove(const DoutPrefixProvider *dpp, BucketShard& bs)
-{
-  auto& ref = bs.bucket_obj.get_ref();
-  int ret = ref.pool.ioctx().remove(ref.obj.oid);
-  if (ret == -ENOENT) {
-    ret = 0;
-  }
-  if (ret < 0) {
-    ldpp_dout(dpp, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
-    return ret;
-  }
-
-  return 0;
-}
-
-int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectWriteOperation *op)
-{
-  return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, null_yield);
-}
-
-int RGWRados::gc_aio_operate(const string& oid, librados::AioCompletion *c,
-                             librados::ObjectWriteOperation *op)
-{
-  return gc_pool_ctx.aio_operate(oid, c, op);
-}
-
-int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
-{
-  return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, pbl, null_yield);
-}
-
-int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue)
-{
-  return gc->list(index, marker, max, expired_only, result, truncated, processing_queue);
-}
-
-int RGWRados::process_gc(bool expired_only)
-{
-  return gc->process(expired_only);
-}
-
-int RGWRados::list_lc_progress(string& marker, uint32_t max_entries,
-                               vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& progress_map,
-                               int& index)
-{
-  return lc->list_lc_progress(marker, max_entries, progress_map, index);
-}
-
-int RGWRados::process_lc(const std::unique_ptr<rgw::sal::Bucket>& optional_bucket)
-{
-  RGWLC lc;
-  lc.initialize(cct, this->driver);
-  RGWLC::LCWorker worker(&lc, cct, &lc, 0);
-  auto ret = lc.process(&worker, optional_bucket, true /* once */);
-  lc.stop_processor(); // sets down_flag, but returns immediately
-  return ret;
-}
-
-bool RGWRados::process_expire_objects(const DoutPrefixProvider *dpp)
-{
-  return obj_expirer->inspect_all_shards(dpp, utime_t(), ceph_clock_now());
-}
-
-int RGWRados::cls_obj_prepare_op(const DoutPrefixProvider *dpp, BucketShard& bs, RGWModifyOp op, string& tag,
-                                 rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *_zones_trace)
-{
-  const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
-  ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket-shard=" << bs << " obj=" << obj << " tag=" << tag << " op=" << op << dendl_bitx;
-  ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
-
-  rgw_zone_set zones_trace;
-  if (_zones_trace) {
-    zones_trace = *_zones_trace;
-  }
-  zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());
-
-  ObjectWriteOperation o;
-  o.assert_exists(); // bucket index shard must exist
-
-  cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
-  cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
-  cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), svc.zone->get_zone().log_data, bilog_flags, zones_trace);
-  int ret = bs.bucket_obj.operate(dpp, &o, y);
-  ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << ": ret=" << ret << dendl_bitx;
-  return ret;
-}
-
-int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
-                                  int64_t pool, uint64_t epoch,
-                                  rgw_bucket_dir_entry& ent, RGWObjCategory category,
-                                  list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
-{
-  const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
-  ldout_bitx_c(bitx, cct, 10) << "ENTERING " << __func__ << ": bucket-shard=" << bs <<
-    " obj=" << obj << " tag=" << tag << " op=" << op <<
-    ", remove_objs=" << (remove_objs ? *remove_objs : std::list<rgw_obj_index_key>()) << dendl_bitx;
-  ldout_bitx_c(bitx, cct, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
-
-  ObjectWriteOperation o;
-  o.assert_exists(); // bucket index shard must exist
-
-  rgw_bucket_dir_entry_meta dir_meta;
-  dir_meta = ent.meta;
-  dir_meta.category = category;
-
-  rgw_zone_set zones_trace;
-  if (_zones_trace) {
-    zones_trace = *_zones_trace;
-  }
-  zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());
-
-  rgw_bucket_entry_ver ver;
-  ver.pool = pool;
-  ver.epoch = epoch;
-  cls_rgw_obj_key key(ent.key.name, ent.key.instance);
-  cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
-  cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
-                             svc.zone->get_zone().log_data, bilog_flags, &zones_trace);
-  complete_op_data *arg;
-  index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
-                                              svc.zone->get_zone().log_data, bilog_flags, &zones_trace, &arg);
-  librados::AioCompletion *completion = arg->rados_completion;
-  int ret = bs.bucket_obj.aio_operate(arg->rados_completion, &o);
-  completion->release(); /* can't reference arg here, as it might have already been released */
-
-  ldout_bitx_c(bitx, cct, 10) << "EXITING " << __func__ << ": ret=" << ret << dendl_bitx;
-  return ret;
-}
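cls_obj_complete_op() is fire-and-forget: the index update is issued asynchronously and the local AioCompletion reference is dropped immediately, because the completion manager owns the callback-side bookkeeping. The core librados pattern, isolated (sketch; ioctx, oid and op are assumptions):

  librados::AioCompletion* c =
    librados::Rados::aio_create_completion(nullptr, nullptr);
  int r = ioctx.aio_operate(oid, c, &op);  // op: librados::ObjectWriteOperation
  c->release();  // drop our ref; librados holds its own until the op completes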
"ENTERING " << __func__ << ": bucket-shard=" << bs << - " obj=" << obj << " tag=" << tag << " op=" << op << - ", remove_objs=" << (remove_objs ? *remove_objs : std::list()) << dendl_bitx; - ldout_bitx_c(bitx, cct, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx; - - ObjectWriteOperation o; - o.assert_exists(); // bucket index shard must exist - - rgw_bucket_dir_entry_meta dir_meta; - dir_meta = ent.meta; - dir_meta.category = category; - - rgw_zone_set zones_trace; - if (_zones_trace) { - zones_trace = *_zones_trace; - } - zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key()); - - rgw_bucket_entry_ver ver; - ver.pool = pool; - ver.epoch = epoch; - cls_rgw_obj_key key(ent.key.name, ent.key.instance); - cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING); - cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs, - svc.zone->get_zone().log_data, bilog_flags, &zones_trace); - complete_op_data *arg; - index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs, - svc.zone->get_zone().log_data, bilog_flags, &zones_trace, &arg); - librados::AioCompletion *completion = arg->rados_completion; - int ret = bs.bucket_obj.aio_operate(arg->rados_completion, &o); - completion->release(); /* can't reference arg here, as it might have already been released */ - - ldout_bitx_c(bitx, cct, 10) << "EXITING " << __func__ << ": ret=" << ret << dendl_bitx; - return ret; -} - -int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag, - int64_t pool, uint64_t epoch, - rgw_bucket_dir_entry& ent, RGWObjCategory category, - list *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace) -{ - return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace); -} - -int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag, - int64_t pool, uint64_t epoch, - rgw_obj& obj, - real_time& removed_mtime, - list *remove_objs, - uint16_t bilog_flags, - rgw_zone_set *zones_trace) -{ - rgw_bucket_dir_entry ent; - ent.meta.mtime = removed_mtime; - obj.key.get_index_key(&ent.key); - return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch, - ent, RGWObjCategory::None, remove_objs, - bilog_flags, zones_trace); -} - -int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, - list *remove_objs, - uint16_t bilog_flags, rgw_zone_set *zones_trace) -{ - rgw_bucket_dir_entry ent; - obj.key.get_index_key(&ent.key); - return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag, - -1 /* pool id */, 0, ent, - RGWObjCategory::None, remove_objs, bilog_flags, - zones_trace); -} - -int RGWRados::cls_obj_set_bucket_tag_timeout(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, uint64_t timeout) -{ - RGWSI_RADOS::Pool index_pool; - map bucket_objs; - int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr); - if (r < 0) - return r; - - return CLSRGWIssueSetTagTimeout(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)(); -} - - -// returns 0 if there is an error in calculation -uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries, - uint32_t num_shards) -{ - if (num_shards == 0) { - // we'll get a floating point exception since we divide by - // num_shards - return 0; - } - - // We want to minimize the chances that when num_shards >> - // num_entries that we return much fewer than num_entries 
-
-int RGWRados::cls_bucket_list_ordered(const DoutPrefixProvider *dpp,
-                                      RGWBucketInfo& bucket_info,
-                                      const rgw::bucket_index_layout_generation& idx_layout,
-                                      const int shard_id,
-                                      const rgw_obj_index_key& start_after,
-                                      const std::string& prefix,
-                                      const std::string& delimiter,
-                                      const uint32_t num_entries,
-                                      const bool list_versions,
-                                      const uint16_t expansion_factor,
-                                      ent_map_t& m,
-                                      bool* is_truncated,
-                                      bool* cls_filtered,
-                                      rgw_obj_index_key* last_entry,
-                                      optional_yield y,
-                                      RGWBucketListNameFilter force_check_filter)
-{
-  const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
-
-  /* expansion_factor allows the number of entries to read to grow
-   * exponentially; this is used when earlier reads are producing too
-   * few results, perhaps due to filtering or to a series of
-   * namespaced entries */
-
-  ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": " << bucket_info.bucket <<
-    " start_after=\"" << start_after.to_string() <<
-    "\", prefix=\"" << prefix <<
-    "\", delimiter=\"" << delimiter <<
-    "\", shard_id=" << shard_id <<
-    ", num_entries=" << num_entries <<
-    ", list_versions=" << list_versions <<
-    ", expansion_factor=" << expansion_factor <<
-    ", force_check_filter is " <<
-    (force_check_filter ? "set" : "unset") << dendl_bitx;
-  ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
-
-  m.clear();
-
-  RGWSI_RADOS::Pool index_pool;
-  // key - oid (for different shards if there is any)
-  // value - list result for the corresponding oid (shard), it is filled by
-  // the AIO callback
-  std::map<int, std::string> shard_oids;
-  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout,
-                                          &index_pool, &shard_oids,
-                                          nullptr);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << __func__ <<
-      ": open_bucket_index for " << bucket_info.bucket << " failed" << dendl;
-    return r;
-  }
-
-  const uint32_t shard_count = shard_oids.size();
-  if (shard_count == 0) {
-    ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
-      ": the bucket index shard count appears to be 0, "
-      "which is an illegal value" << dendl;
-    return -ERR_INVALID_BUCKET_STATE;
-  }
-
-  uint32_t num_entries_per_shard;
-  if (expansion_factor == 0) {
-    num_entries_per_shard =
-      calc_ordered_bucket_list_per_shard(num_entries, shard_count);
-  } else if (expansion_factor <= 11) {
-    // we'll max out the exponential multiplication factor at 1024 (2<<10)
-    num_entries_per_shard =
-      std::min(num_entries,
-               (uint32_t(1 << (expansion_factor - 1)) *
-                calc_ordered_bucket_list_per_shard(num_entries, shard_count)));
-  } else {
-    num_entries_per_shard = num_entries;
-  }
-
-  if (num_entries_per_shard == 0) {
-    ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
-      ": unable to calculate the number of entries to read from each "
-      "bucket index shard" << dendl;
-    return -ERR_INVALID_BUCKET_STATE;
-  }
-
-  ldpp_dout(dpp, 10) << __func__ <<
-    ": request from each of " << shard_count <<
-    " shard(s) for " << num_entries_per_shard << " entries to get " <<
-    num_entries << " total entries" << dendl;
-
-  auto& ioctx = index_pool.ioctx();
-  std::map<int, rgw_cls_list_ret> shard_list_results;
-  cls_rgw_obj_key start_after_key(start_after.name, start_after.instance);
-  r = CLSRGWIssueBucketList(ioctx, start_after_key, prefix, delimiter,
-                            num_entries_per_shard,
-                            list_versions, shard_oids, shard_list_results,
-                            cct->_conf->rgw_bucket_index_max_aio)();
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << __func__ <<
-      ": CLSRGWIssueBucketList for " << bucket_info.bucket <<
-      " failed" << dendl;
-    return r;
-  }
"set" : "unset") << dendl_bitx; - ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx; - - m.clear(); - - RGWSI_RADOS::Pool index_pool; - // key - oid (for different shards if there is any) - // value - list result for the corresponding oid (shard), it is filled by - // the AIO callback - std::map shard_oids; - int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, - &index_pool, &shard_oids, - nullptr); - if (r < 0) { - ldpp_dout(dpp, 0) << __func__ << - ": open_bucket_index for " << bucket_info.bucket << " failed" << dendl; - return r; - } - - const uint32_t shard_count = shard_oids.size(); - if (shard_count == 0) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << - ": the bucket index shard count appears to be 0, " - "which is an illegal value" << dendl; - return -ERR_INVALID_BUCKET_STATE; - } - - uint32_t num_entries_per_shard; - if (expansion_factor == 0) { - num_entries_per_shard = - calc_ordered_bucket_list_per_shard(num_entries, shard_count); - } else if (expansion_factor <= 11) { - // we'll max out the exponential multiplication factor at 1024 (2<<10) - num_entries_per_shard = - std::min(num_entries, - (uint32_t(1 << (expansion_factor - 1)) * - calc_ordered_bucket_list_per_shard(num_entries, shard_count))); - } else { - num_entries_per_shard = num_entries; - } - - if (num_entries_per_shard == 0) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << - ": unable to calculate the number of entries to read from each " - "bucket index shard" << dendl; - return -ERR_INVALID_BUCKET_STATE; - } - - ldpp_dout(dpp, 10) << __func__ << - ": request from each of " << shard_count << - " shard(s) for " << num_entries_per_shard << " entries to get " << - num_entries << " total entries" << dendl; - - auto& ioctx = index_pool.ioctx(); - std::map shard_list_results; - cls_rgw_obj_key start_after_key(start_after.name, start_after.instance); - r = CLSRGWIssueBucketList(ioctx, start_after_key, prefix, delimiter, - num_entries_per_shard, - list_versions, shard_oids, shard_list_results, - cct->_conf->rgw_bucket_index_max_aio)(); - if (r < 0) { - ldpp_dout(dpp, 0) << __func__ << - ": CLSRGWIssueBucketList for " << bucket_info.bucket << - " failed" << dendl; - return r; - } - - // to manage the iterators through each shard's list results - struct ShardTracker { - const size_t shard_idx; - rgw_cls_list_ret& result; - const std::string& oid_name; - RGWRados::ent_map_t::iterator cursor; - RGWRados::ent_map_t::iterator end; - - // manages an iterator through a shard and provides other - // accessors - ShardTracker(size_t _shard_idx, - rgw_cls_list_ret& _result, - const std::string& _oid_name): - shard_idx(_shard_idx), - result(_result), - oid_name(_oid_name), - cursor(_result.dir.m.begin()), - end(_result.dir.m.end()) - {} - - inline const std::string& entry_name() const { - return cursor->first; - } - rgw_bucket_dir_entry& dir_entry() const { - return cursor->second; - } - inline bool is_truncated() const { - return result.is_truncated; - } - inline ShardTracker& advance() { - ++cursor; - // return a self-reference to allow for chaining of calls, such - // as x.advance().at_end() - return *this; - } - inline bool at_end() const { - return cursor == end; - } - }; // ShardTracker - - // add the next unique candidate, or return false if we reach the end - auto next_candidate = [] (CephContext *cct, ShardTracker& t, - std::multimap& candidates, - size_t tracker_idx) { - if (!t.at_end()) { - candidates.emplace(t.entry_name(), tracker_idx); - } - 
-
-  // one tracker per shard requested (may not be all shards)
-  std::vector<ShardTracker> results_trackers;
-  results_trackers.reserve(shard_list_results.size());
-  for (auto& r : shard_list_results) {
-    results_trackers.emplace_back(r.first, r.second, shard_oids[r.first]);
-
-    // if any *one* shard's result is truncated, the entire result is
-    // truncated
-    *is_truncated = *is_truncated || r.second.is_truncated;
-
-    // unless *all* shards are cls_filtered, the entire result is
-    // not filtered
-    *cls_filtered = *cls_filtered && r.second.cls_filtered;
-  }
-
-  // create a map to track the next candidate entry from ShardTracker
-  // (key=candidate, value=index into results_trackers); as we consume
-  // entries from shards, we replace them with the next entries in the
-  // shards until we run out
-  std::multimap<std::string, size_t> candidates;
-  size_t tracker_idx = 0;
-  std::vector<size_t> vidx;
-  vidx.reserve(shard_list_results.size());
-  for (auto& t : results_trackers) {
-    // it's important that the values in the map refer to the index
-    // into the results_trackers vector, which may not be the same
-    // as the shard number (i.e., when not all shards are requested)
-    next_candidate(cct, t, candidates, tracker_idx);
-    ++tracker_idx;
-  }
-
-  rgw_bucket_dir_entry*
-    last_entry_visited = nullptr; // to set last_entry (marker)
-  std::map<std::string, bufferlist> updates;
-  uint32_t count = 0;
-  while (count < num_entries && !candidates.empty()) {
-    r = 0;
-    // select the next entry in lexical order (first key in map);
-    // again tracker_idx is not necessarily shard number, but is index
-    // into results_trackers vector
-    tracker_idx = candidates.begin()->second;
-    auto& tracker = results_trackers.at(tracker_idx);
-
-    const std::string& name = tracker.entry_name();
-    rgw_bucket_dir_entry& dirent = tracker.dir_entry();
-
-    ldpp_dout(dpp, 20) << __func__ << ": currently processing " <<
-      dirent.key << " from shard " << tracker.shard_idx << dendl;
-
-    const bool force_check =
-      force_check_filter && force_check_filter(dirent.key.name);
-
-    if ((!dirent.exists &&
-         !dirent.is_delete_marker() &&
-         !dirent.is_common_prefix()) ||
-        !dirent.pending_map.empty() ||
-        force_check) {
-      /* there are uncommitted ops. We need to check the current
-       * state, and if the tags are old we need to do clean-up as
-       * well.
       */
-      librados::IoCtx sub_ctx;
-      sub_ctx.dup(ioctx);
-      ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ <<
-        " calling check_disk_state bucket=" << bucket_info.bucket <<
-        " entry=" << dirent.key << dendl_bitx;
-      r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent,
-                           updates[tracker.oid_name], y);
-      if (r < 0 && r != -ENOENT) {
-        ldpp_dout(dpp, 0) << __func__ <<
-          ": check_disk_state for \"" << dirent.key <<
-          "\" failed with r=" << r << dendl;
-        return r;
-      }
-    } else {
-      r = 0;
-    }
-
-    // at this point either r >= 0 or r == -ENOENT
-    if (r >= 0) { // i.e., if r != -ENOENT
-      ldpp_dout(dpp, 10) << __func__ << ": got " <<
-        dirent.key << dendl;
-
-      auto [it, inserted] = m.insert_or_assign(name, std::move(dirent));
-      last_entry_visited = &it->second;
-      if (inserted) {
-        ++count;
-      } else {
-        ldpp_dout(dpp, 0) << "WARNING: " << __func__ <<
-          " reassigned map value at \"" << name <<
-          "\", which should not happen" << dendl;
-      }
-    } else {
-      ldpp_dout(dpp, 10) << __func__ << ": skipping " <<
-        dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
-      last_entry_visited = &tracker.dir_entry();
-    }
-
-    // refresh the candidates map
-    vidx.clear();
-    bool need_to_stop = false;
-    auto range = candidates.equal_range(name);
-    for (auto i = range.first; i != range.second; ++i) {
-      vidx.push_back(i->second);
-    }
-    candidates.erase(range.first, range.second);
-    for (auto idx : vidx) {
-      auto& tracker_match = results_trackers.at(idx);
-      tracker_match.advance();
-      next_candidate(cct, tracker_match, candidates, idx);
-      if (tracker_match.at_end() && tracker_match.is_truncated()) {
-        need_to_stop = true;
-        break;
-      }
-    }
-    if (need_to_stop) {
-      // once we exhaust one shard that is truncated, we need to stop,
-      // as we cannot be certain that one of the next entries needs to
-      // come from that shard; S3 and swift protocols allow returning
-      // fewer than what was requested
-      ldpp_dout(dpp, 10) << __func__ <<
-        ": stopped accumulating results at count=" << count <<
-        ", dirent=\"" << dirent.key <<
-        "\", because its shard is truncated and exhausted" << dendl;
-      break;
-    }
-  } // while we haven't provided requested # of result entries
-
-  // suggest updates if there are any
-  for (auto& miter : updates) {
-    if (miter.second.length()) {
-      ObjectWriteOperation o;
-      cls_rgw_suggest_changes(o, miter.second);
-      // we don't care if we lose suggested updates, send them off blindly
-      AioCompletion *c =
-        librados::Rados::aio_create_completion(nullptr, nullptr);
-
-      ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
-        ": doing dir_suggest on " << miter.first << dendl_bitx;
-      ioctx.aio_operate(miter.first, c, &o);
-      c->release();
-    }
-  } // updates loop
-
-  // determine truncation by checking if all the returned entries are
-  // consumed or not
-  *is_truncated = false;
-  for (const auto& t : results_trackers) {
-    if (!t.at_end() || t.is_truncated()) {
-      *is_truncated = true;
-      break;
-    }
-  }
-
-  ldpp_dout(dpp, 20) << __func__ <<
-    ": returning, count=" << count << ", is_truncated=" << *is_truncated <<
-    dendl;
-
-  if (*is_truncated && count < num_entries) {
-    ldpp_dout(dpp, 10) << __func__ <<
-      ": requested " << num_entries << " entries but returning " <<
-      count << ", which is truncated" << dendl;
-  }
-
-  if (last_entry_visited != nullptr && last_entry) {
-    *last_entry = last_entry_visited->key;
-    ldpp_dout(dpp, 20) << __func__ <<
-      ": returning, last_entry=" << *last_entry << dendl;
-  } else {
-    ldpp_dout(dpp, 20) << __func__ <<
-      ": returning, last_entry NOT SET" << dendl;
-  }
-
-  ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx;
-  return 0;
-} // RGWRados::cls_bucket_list_ordered
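cls_bucket_list_ordered() is, at its core, a k-way merge: each shard returns an already-sorted page, and the multimap keyed by entry name always yields the lexically smallest head among the shard cursors. Stripped of the RGW bookkeeping (duplicate names across shards and the truncated-shard stop condition are handled above but omitted here), the loop reduces to this sketch:

  // sketch: k-way merge over pre-sorted per-shard pages
  std::multimap<std::string, size_t> heads;   // entry name -> tracker index
  for (size_t i = 0; i < trackers.size(); ++i)
    if (!trackers[i].at_end())
      heads.emplace(trackers[i].entry_name(), i);
  while (count < wanted && !heads.empty()) {
    auto [name, idx] = *heads.begin();        // smallest name across shards
    consume(trackers[idx].dir_entry());       // hypothetical consumer
    heads.erase(heads.begin());
    if (!trackers[idx].advance().at_end())
      heads.emplace(trackers[idx].entry_name(), idx);  // refill from that shard
    ++count;
  }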
-
-
-// A helper function to retrieve the hash source from an incomplete
-// multipart entry by removing everything from the second to last
-// period on.
-static int parse_index_hash_source(const std::string& oid_wo_ns, std::string *index_hash_source) {
-  std::size_t found = oid_wo_ns.rfind('.');
-  if (found == std::string::npos || found < 1) {
-    return -EINVAL;
-  }
-  found = oid_wo_ns.rfind('.', found - 1);
-  if (found == std::string::npos || found < 1) {
-    return -EINVAL;
-  }
-  *index_hash_source = oid_wo_ns.substr(0, found);
-  return 0;
-}
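A worked example (my illustration; the upload id is invented): for an incomplete-multipart index key such as

  // "photo.jpg.2~h5Rwp3.meta"
  //            ^second-to-last '.'  ^last '.'

the first rfind('.') lands before "meta", the second before the upload id, and the function returns "photo.jpg" -- the name whose hash originally chose the shard, which lets the unordered listing below resume on the correct shard.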
- string index_hash_source; - r = parse_index_hash_source(obj_key.name, &index_hash_source); - if (r < 0) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << - " parse_index_hash_source unable to parse \"" << obj_key.name << - "\", r=" << r << dendl; - return r; - } - current_shard = svc.bi_rados->bucket_shard_index(index_hash_source, num_shards); - } else { - current_shard = svc.bi_rados->bucket_shard_index(obj_key.name, num_shards); - } - } - } - - uint32_t count = 0u; - std::map updates; - rgw_obj_index_key last_added_entry; - while (count <= num_entries && - ((shard_id >= 0 && current_shard == uint32_t(shard_id)) || - current_shard < num_shards)) { - const std::string& oid = oids[current_shard]; - rgw_cls_list_ret result; - - librados::ObjectReadOperation op; - const std::string empty_delimiter; - cls_rgw_bucket_list_op(op, marker, prefix, empty_delimiter, - num_entries, - list_versions, &result); - r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, null_yield); - if (r < 0) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << - ": error in rgw_rados_operate (bucket list op), r=" << r << dendl; - return r; - } - - for (auto& entry : result.dir.m) { - rgw_bucket_dir_entry& dirent = entry.second; - - bool force_check = force_check_filter && - force_check_filter(dirent.key.name); - if ((!dirent.exists && !dirent.is_delete_marker()) || - !dirent.pending_map.empty() || - force_check) { - /* there are uncommitted ops. We need to check the current state, - * and if the tags are old we need to do cleanup as well. */ - librados::IoCtx sub_ctx; - sub_ctx.dup(ioctx); - ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ << - ": calling check_disk_state bucket=" << bucket_info.bucket << - " entry=" << dirent.key << dendl_bitx; - r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent, updates[oid], y); - if (r < 0 && r != -ENOENT) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << - ": error in check_disk_state, r=" << r << dendl; - return r; - } - } else { - r = 0; - } - - // at this point either r >= 0 or r == -ENOENT - if (r >= 0) { // i.e., if r != -ENOENT - ldpp_dout(dpp, 10) << __func__ << ": got " << - dirent.key << dendl; - - if (count < num_entries) { - marker = last_added_entry = dirent.key; // double assign - ent_list.emplace_back(std::move(dirent)); - ++count; - } else { - last_added_entry = dirent.key; - *is_truncated = true; - ldpp_dout(dpp, 10) << "INFO: " << __func__ << - ": reached max entries (" << num_entries << ") to return at \"" << - dirent.key << "\"" << dendl; - goto check_updates; - } - } else { // r == -ENOENT - // in the case of -ENOENT, make sure we're advancing marker - // for possible next call to CLSRGWIssueBucketList - marker = dirent.key; - } - } // entry for loop - - if (!result.is_truncated) { - // if we reached the end of the shard read next shard - ++current_shard; - marker = rgw_obj_index_key(); - } - } // shard loop - -check_updates: - - // suggest updates if there is any - std::map::iterator miter = updates.begin(); - for (; miter != updates.end(); ++miter) { - if (miter->second.length()) { - ObjectWriteOperation o; - cls_rgw_suggest_changes(o, miter->second); - // we don't care if we lose suggested updates, send them off blindly - AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr); - - ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << - " doing dir_suggest on " << miter->first << dendl_bitx; - ioctx.aio_operate(miter->first, c, &o); - c->release(); - } - } - - if (last_entry && !ent_list.empty()) { - *last_entry = 
last_added_entry; - } - - ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx; - return 0; -} // RGWRados::cls_bucket_list_unordered - - -int RGWRados::cls_obj_usage_log_add(const DoutPrefixProvider *dpp, const string& oid, - rgw_usage_log_info& info) -{ - rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid); - - rgw_rados_ref ref; - int r = get_raw_obj_ref(dpp, obj, &ref); - if (r < 0) { - return r; - } - - ObjectWriteOperation op; - cls_rgw_usage_log_add(op, info); - - r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); - return r; -} - -int RGWRados::cls_obj_usage_log_read(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket, - uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries, - string& read_iter, map& usage, - bool *is_truncated) -{ - rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid); - - rgw_rados_ref ref; - int r = get_raw_obj_ref(dpp, obj, &ref); - if (r < 0) { - return r; - } - - *is_truncated = false; - - r = cls_rgw_usage_log_read(ref.pool.ioctx(), ref.obj.oid, user, bucket, start_epoch, end_epoch, - max_entries, read_iter, usage, is_truncated); - - return r; -} - -static int cls_rgw_usage_log_trim_repeat(const DoutPrefixProvider *dpp, rgw_rados_ref ref, const string& user, const string& bucket, uint64_t start_epoch, uint64_t end_epoch) -{ - bool done = false; - do { - librados::ObjectWriteOperation op; - cls_rgw_usage_log_trim(op, user, bucket, start_epoch, end_epoch); - int r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); - if (r == -ENODATA) - done = true; - else if (r < 0) - return r; - } while (!done); - - return 0; -} - -int RGWRados::cls_obj_usage_log_trim(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket, - uint64_t start_epoch, uint64_t end_epoch) -{ - rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid); - - rgw_rados_ref ref; - int r = get_raw_obj_ref(dpp, obj, &ref); - if (r < 0) { - return r; - } - - r = cls_rgw_usage_log_trim_repeat(dpp, ref, user, bucket, start_epoch, end_epoch); - return r; -} - -int RGWRados::cls_obj_usage_log_clear(const DoutPrefixProvider *dpp, string& oid) -{ - rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid); - - rgw_rados_ref ref; - int r = get_raw_obj_ref(dpp, obj, &ref); - if (r < 0) { - return r; - } - librados::ObjectWriteOperation op; - cls_rgw_usage_log_clear(op); - r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield); - return r; -} - - -// note: this removes entries from the rados bucket index objects -// without going through CLS; this is known to be called from -// "radosgw-admin unlink" and "radosgw-admin bucket check --fix" -int RGWRados::remove_objs_from_index(const DoutPrefixProvider *dpp, - RGWBucketInfo& bucket_info, - const std::list& entry_key_list) -{ - const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation; - ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket=" << bucket_info.bucket << - " entry_key_list.size()=" << entry_key_list.size() << dendl_bitx; - ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx; - - const auto& current_index = bucket_info.get_current_index(); - if (is_layout_indexless(current_index)) { - return -EINVAL; - } - const uint32_t num_shards = current_index.layout.normal.num_shards; - - RGWSI_RADOS::Pool index_pool; - std::map index_oids; - int r = svc.bi_rados->open_bucket_index(dpp, 
bucket_info, std::nullopt, - bucket_info.layout.current_index, - &index_pool, &index_oids, nullptr); - if (r < 0) { - ldout_bitx(bitx, dpp, 0) << "ERROR: " << __func__ << - " open_bucket_index returned " << r << dendl_bitx; - return r; - } - - // split up removals by shard - std::map> sharded_removals; - for (const auto& entry_key : entry_key_list) { - const rgw_obj_key obj_key(entry_key); - const uint32_t shard = - RGWSI_BucketIndex_RADOS::bucket_shard_index(obj_key, num_shards); - - // entry_key already combines namespace and name, so we first have - // to break that apart before we can then combine with instance - std::string name; - std::string ns; // namespace - rgw_obj_key::parse_index_key(entry_key.name, &name, &ns); - rgw_obj_key full_key(name, entry_key.instance, ns); - std::string combined_key = full_key.get_oid(); - - sharded_removals[shard].insert(combined_key); - - ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ << - ": removal from bucket index, bucket=" << bucket_info.bucket << - " key=" << combined_key << " designated for shard " << shard << - dendl_bitx; - } - - for (const auto& removals : sharded_removals) { - const int shard = removals.first; - const std::string& oid = index_oids[shard]; - - ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << - ": removal from bucket index, bucket=" << bucket_info.bucket << - ", shard=" << shard << ", oid=" << oid << ", num_keys=" << - removals.second.size() << dendl_bitx; - - r = index_pool.ioctx().omap_rm_keys(oid, removals.second); - if (r < 0) { - ldout_bitx(bitx, dpp, 0) << "ERROR: " << __func__ << - ": omap_rm_keys returned ret=" << r << - dendl_bitx; - return r; - } - } - - ldout_bitx(bitx, dpp, 5) << - "EXITING " << __func__ << " and returning " << r << dendl_bitx; - - return r; -} - -int RGWRados::check_disk_state(const DoutPrefixProvider *dpp, - librados::IoCtx io_ctx, - RGWBucketInfo& bucket_info, - rgw_bucket_dir_entry& list_state, - rgw_bucket_dir_entry& object, - bufferlist& suggested_updates, - optional_yield y) -{ - const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation; - ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket=" << - bucket_info.bucket << " dir_entry=" << list_state.key << dendl_bitx; - - std::unique_ptr bucket; - driver->get_bucket(nullptr, bucket_info, &bucket); - uint8_t suggest_flag = (svc.zone->get_zone().log_data ? 
CEPH_RGW_DIR_SUGGEST_LOG_OP : 0); - - std::string loc; - - std::unique_ptr obj = bucket->get_object(list_state.key); - MultipartMetaFilter multipart_meta_filter; - string temp_key; - if (multipart_meta_filter.filter(list_state.key.name, temp_key)) { - obj->set_in_extra_data(true); - } - - string oid; - get_obj_bucket_and_oid_loc(obj->get_obj(), oid, loc); - - if (loc != list_state.locator) { - ldpp_dout(dpp, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl; - } - - io_ctx.locator_set_key(list_state.locator); - - RGWObjState *astate = NULL; - RGWObjManifest *manifest = nullptr; - RGWObjectCtx rctx(this->driver); - int r = get_obj_state(dpp, &rctx, bucket_info, obj.get(), &astate, &manifest, false, y); - if (r < 0) - return r; - - list_state.pending_map.clear(); // we don't need this and it inflates size - if (!list_state.is_delete_marker() && !astate->exists) { - ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": disk state exists" << dendl_bitx; - /* object doesn't exist right now -- hopefully because it's - * marked as !exists and got deleted */ - if (list_state.exists) { - ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": index list state exists" << dendl_bitx; - /* FIXME: what should happen now? Work out if there are any - * non-bad ways this could happen (there probably are, but annoying - * to handle!) */ - } - - // encode a suggested removal of that key - list_state.ver.epoch = io_ctx.get_last_version(); - list_state.ver.pool = io_ctx.get_id(); - ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": encoding remove of " << list_state.key << " on suggested_updates" << dendl_bitx; - cls_rgw_encode_suggestion(CEPH_RGW_REMOVE | suggest_flag, list_state, suggested_updates); - return -ENOENT; - } - - string etag; - string content_type; - string storage_class; - ACLOwner owner; - bool appendable = false; - - object.meta.size = astate->size; - object.meta.accounted_size = astate->accounted_size; - object.meta.mtime = astate->mtime; - - map::iterator iter = astate->attrset.find(RGW_ATTR_ETAG); - if (iter != astate->attrset.end()) { - etag = rgw_bl_str(iter->second); - } - iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE); - if (iter != astate->attrset.end()) { - content_type = rgw_bl_str(iter->second); - } - iter = astate->attrset.find(RGW_ATTR_STORAGE_CLASS); - if (iter != astate->attrset.end()) { - storage_class = rgw_bl_str(iter->second); - } - iter = astate->attrset.find(RGW_ATTR_ACL); - if (iter != astate->attrset.end()) { - r = decode_policy(dpp, iter->second, &owner); - if (r < 0) { - ldpp_dout(dpp, 0) << "WARNING: could not decode policy for object: " << obj << dendl; - } - } - iter = astate->attrset.find(RGW_ATTR_APPEND_PART_NUM); - if (iter != astate->attrset.end()) { - appendable = true; - } - - if (manifest) { - RGWObjManifest::obj_iterator miter; - for (miter = manifest->obj_begin(dpp); miter != manifest->obj_end(dpp); ++miter) { - const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(driver); - rgw_obj loc; - RGWSI_Tier_RADOS::raw_obj_to_obj(manifest->get_obj().bucket, raw_loc, &loc); - - if (loc.key.ns == RGW_OBJ_NS_MULTIPART) { - ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << " removing manifest part from index loc=" << loc << dendl_bitx; - r = delete_obj_index(loc, astate->mtime, dpp); - if (r < 0) { - ldout_bitx(bitx, dpp, 0) << - "WARNING: " << __func__ << ": delete_obj_index returned r=" << r << dendl_bitx; - } - } - } - } - - object.meta.etag = etag; - 
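The repairs check_disk_state accumulates in suggested_updates are pushed back to the index shards by the fire-and-forget loop both listing functions share. A sketch of that dispatch pattern, using the same librados and cls_rgw calls as the surrounding code (the wrapper name is illustrative):

    #include <map>
    #include <string>
    #include "include/rados/librados.hpp"
    #include "cls/rgw/cls_rgw_client.h"

    // Send each shard's queued suggestions asynchronously and release the
    // completion immediately; a lost suggestion only postpones the repair
    // until the next listing touches the same entry.
    static void send_dir_suggestions(librados::IoCtx& ioctx,
                                     std::map<std::string, ceph::bufferlist>& updates)
    {
      for (auto& [oid, bl] : updates) {
        if (bl.length() == 0) {
          continue;  // nothing queued for this shard
        }
        librados::ObjectWriteOperation op;
        cls_rgw_suggest_changes(op, bl);
        librados::AioCompletion* c =
          librados::Rados::aio_create_completion(nullptr, nullptr);
        ioctx.aio_operate(oid, c, &op);
        c->release();  // don't wait for the result
      }
    }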
object.meta.content_type = content_type; - object.meta.storage_class = storage_class; - object.meta.owner = owner.get_id().to_str(); - object.meta.owner_display_name = owner.get_display_name(); - object.meta.appendable = appendable; - - // encode suggested updates - - list_state.meta.size = object.meta.size; - list_state.meta.accounted_size = object.meta.accounted_size; - list_state.meta.mtime = object.meta.mtime; - list_state.meta.category = main_category; - list_state.meta.etag = etag; - list_state.meta.appendable = appendable; - list_state.meta.content_type = content_type; - list_state.meta.storage_class = storage_class; - - librados::IoCtx head_obj_ctx; // initialize to data pool so we can get pool id - r = get_obj_head_ioctx(dpp, bucket_info, obj->get_obj(), &head_obj_ctx); - if (r < 0) { - ldpp_dout(dpp, 0) << __func__ << - " WARNING: unable to find head object data pool for \"" << - obj << "\", not updating version pool/epoch" << dendl; - } else { - list_state.ver.pool = head_obj_ctx.get_id(); - list_state.ver.epoch = astate->epoch; - } - - if (astate->obj_tag.length() > 0) { - list_state.tag = astate->obj_tag.c_str(); - } - - list_state.meta.owner = owner.get_id().to_str(); - list_state.meta.owner_display_name = owner.get_display_name(); - - list_state.exists = true; - - ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << - ": encoding update of " << list_state.key << " on suggested_updates" << dendl_bitx; - cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates); - - ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx; - return 0; -} // RGWRados::check_disk_state - -int RGWRados::cls_bucket_head(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, vector& headers, map *bucket_instance_ids) -{ - RGWSI_RADOS::Pool index_pool; - map oids; - map list_results; - int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &oids, bucket_instance_ids); - if (r < 0) { - ldpp_dout(dpp, 20) << "cls_bucket_head: open_bucket_index() returned " - << r << dendl; - return r; - } - - r = CLSRGWIssueGetDirHeader(index_pool.ioctx(), oids, list_results, cct->_conf->rgw_bucket_index_max_aio)(); - if (r < 0) { - ldpp_dout(dpp, 20) << "cls_bucket_head: CLSRGWIssueGetDirHeader() returned " - << r << dendl; - return r; - } - - map::iterator iter = list_results.begin(); - for(; iter != list_results.end(); ++iter) { - headers.push_back(std::move(iter->second.dir.header)); - } - return 0; -} - -int RGWRados::cls_bucket_head_async(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio) -{ - RGWSI_RADOS::Pool index_pool; - map bucket_objs; - int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &bucket_objs, nullptr); - if (r < 0) - return r; - - map::iterator iter = bucket_objs.begin(); - for (; iter != bucket_objs.end(); ++iter) { - r = cls_rgw_get_dir_header_async(index_pool.ioctx(), iter->second, static_cast(ctx->get())); - if (r < 0) { - ctx->put(); - break; - } else { - (*num_aio)++; - } - } - return r; -} - -int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info, - const rgw_bucket& bucket, - uint64_t num_objs, - const DoutPrefixProvider *dpp) -{ - if (! 
cct->_conf.get_val("rgw_dynamic_resharding")) { - return 0; - } - - bool need_resharding = false; - uint32_t num_source_shards = rgw::current_num_shards(bucket_info.layout); - const uint32_t max_dynamic_shards = - uint32_t(cct->_conf.get_val("rgw_max_dynamic_shards")); - - if (num_source_shards >= max_dynamic_shards) { - return 0; - } - - uint32_t suggested_num_shards = 0; - const uint64_t max_objs_per_shard = - cct->_conf.get_val("rgw_max_objs_per_shard"); - - // TODO: consider per-bucket sync policy here? - const bool is_multisite = svc.zone->get_zone().log_data; - - quota_handler->check_bucket_shards(dpp, max_objs_per_shard, num_source_shards, - num_objs, is_multisite, need_resharding, - &suggested_num_shards); - if (! need_resharding) { - return 0; - } - - const uint32_t final_num_shards = - RGWBucketReshard::get_preferred_shards(suggested_num_shards, - max_dynamic_shards); - // final verification, so we don't reduce number of shards - if (final_num_shards <= num_source_shards) { - return 0; - } - - ldpp_dout(dpp, 1) << "RGWRados::" << __func__ << " bucket " << bucket.name << - " needs resharding; current num shards " << bucket_info.layout.current_index.layout.normal.num_shards << - "; new num shards " << final_num_shards << " (suggested " << - suggested_num_shards << ")" << dendl; - - return add_bucket_to_reshard(dpp, bucket_info, final_num_shards); -} - -int RGWRados::add_bucket_to_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, uint32_t new_num_shards) -{ - RGWReshard reshard(this->driver, dpp); - - uint32_t num_source_shards = rgw::current_num_shards(bucket_info.layout); - - new_num_shards = std::min(new_num_shards, get_max_bucket_shards()); - if (new_num_shards <= num_source_shards) { - ldpp_dout(dpp, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl; - return 0; - } - - cls_rgw_reshard_entry entry; - entry.time = real_clock::now(); - entry.tenant = bucket_info.owner.tenant; - entry.bucket_name = bucket_info.bucket.name; - entry.bucket_id = bucket_info.bucket.bucket_id; - entry.old_num_shards = num_source_shards; - entry.new_num_shards = new_num_shards; - - return reshard.add(dpp, entry); -} - -int RGWRados::check_quota(const DoutPrefixProvider *dpp, const rgw_user& bucket_owner, rgw_bucket& bucket, - RGWQuota& quota, - uint64_t obj_size, optional_yield y, - bool check_size_only) -{ - // if we only check size, then num_objs will set to 0 - if(check_size_only) - return quota_handler->check_quota(dpp, bucket_owner, bucket, quota, 0, obj_size, y); - - return quota_handler->check_quota(dpp, bucket_owner, bucket, quota, 1, obj_size, y); -} - -int RGWRados::get_target_shard_id(const rgw::bucket_index_normal_layout& layout, const string& obj_key, - int *shard_id) -{ - int r = 0; - switch (layout.hash_type) { - case rgw::BucketHashType::Mod: - if (!layout.num_shards) { - if (shard_id) { - *shard_id = -1; - } - } else { - uint32_t sid = svc.bi_rados->bucket_shard_index(obj_key, layout.num_shards); - if (shard_id) { - *shard_id = (int)sid; - } - } - break; - default: - r = -ENOTSUP; - } - return r; -} - -uint64_t RGWRados::instance_id() -{ - return get_rados_handle()->get_instance_id(); -} - -uint64_t RGWRados::next_bucket_id() -{ - std::lock_guard l{bucket_id_lock}; - return ++max_bucket_id; -} - -librados::Rados* RGWRados::get_rados_handle() -{ - return &rados; -} - -int RGWRados::delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, list& 
handles) -{ - rgw_rados_ref ref; - int ret = get_raw_obj_ref(dpp, obj, &ref); - if (ret < 0) { - ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl; - return ret; - } - - ObjectWriteOperation op; - list prefixes; - cls_rgw_remove_obj(op, prefixes); - - AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr); - ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op); - if (ret < 0) { - ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl; - c->release(); - return ret; - } - - handles.push_back(c); - - return 0; -} - -int RGWRados::delete_obj_aio(const DoutPrefixProvider *dpp, const rgw_obj& obj, - RGWBucketInfo& bucket_info, RGWObjState *astate, - list& handles, bool keep_index_consistent, - optional_yield y) -{ - rgw_rados_ref ref; - int ret = get_obj_head_ref(dpp, bucket_info, obj, &ref); - if (ret < 0) { - ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl; - return ret; - } - - if (keep_index_consistent) { - RGWRados::Bucket bop(this, bucket_info); - RGWRados::Bucket::UpdateIndex index_op(&bop, obj); - - ret = index_op.prepare(dpp, CLS_RGW_OP_DEL, &astate->write_tag, y); - if (ret < 0) { - ldpp_dout(dpp, -1) << "ERROR: failed to prepare index op with ret=" << ret << dendl; - return ret; - } - } - - ObjectWriteOperation op; - list prefixes; - cls_rgw_remove_obj(op, prefixes); - - AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr); - ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op); - if (ret < 0) { - ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl; - c->release(); - return ret; - } - - handles.push_back(c); - - if (keep_index_consistent) { - ret = delete_obj_index(obj, astate->mtime, dpp); - if (ret < 0) { - ldpp_dout(dpp, -1) << "ERROR: failed to delete obj index with ret=" << ret << dendl; - return ret; - } - } - return ret; -} - -void objexp_hint_entry::generate_test_instances(list& o) -{ - auto it = new objexp_hint_entry; - it->tenant = "tenant1"; - it->bucket_name = "bucket1"; - it->bucket_id = "1234"; - it->obj_key = rgw_obj_key("obj"); - o.push_back(it); - o.push_back(new objexp_hint_entry); -} - -void objexp_hint_entry::dump(Formatter *f) const -{ - f->open_object_section("objexp_hint_entry"); - encode_json("tenant", tenant, f); - encode_json("bucket_name", bucket_name, f); - encode_json("bucket_id", bucket_id, f); - encode_json("rgw_obj_key", obj_key, f); - utime_t ut(exp_time); - encode_json("exp_time", ut, f); - f->close_section(); -} - -void RGWOLHInfo::generate_test_instances(list &o) -{ - RGWOLHInfo *olh = new RGWOLHInfo; - olh->removed = false; - o.push_back(olh); - o.push_back(new RGWOLHInfo); -} - -void RGWOLHInfo::dump(Formatter *f) const -{ - encode_json("target", target, f); -} - -void RGWOLHPendingInfo::dump(Formatter *f) const -{ - utime_t ut(time); - encode_json("time", ut, f); -} - diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h deleted file mode 100644 index 1b5295178727..000000000000 --- a/src/rgw/rgw_rados.h +++ /dev/null @@ -1,1636 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab ft=cpp - -#ifndef CEPH_RGWRADOS_H -#define CEPH_RGWRADOS_H - -#include -#include -#include -#include - -#include "include/rados/librados.hpp" -#include "include/Context.h" -#include "include/random.h" -#include "common/RefCountedObj.h" -#include "common/ceph_time.h" -#include "common/Timer.h" -#include "rgw_common.h" -#include "cls/rgw/cls_rgw_types.h" -#include 
"cls/version/cls_version_types.h" -#include "cls/log/cls_log_types.h" -#include "cls/timeindex/cls_timeindex_types.h" -#include "cls/otp/cls_otp_types.h" -#include "rgw_quota.h" -#include "rgw_log.h" -#include "rgw_metadata.h" -#include "rgw_meta_sync_status.h" -#include "rgw_period_puller.h" -#include "rgw_obj_manifest.h" -#include "rgw_sync_module.h" -#include "rgw_trim_bilog.h" -#include "rgw_service.h" -#include "rgw_sal.h" -#include "rgw_aio.h" -#include "rgw_d3n_cacherequest.h" - -#include "services/svc_rados.h" -#include "services/svc_bi_rados.h" -#include "common/Throttle.h" -#include "common/ceph_mutex.h" -#include "rgw_cache.h" -#include "rgw_sal_fwd.h" - -struct D3nDataCache; - -class RGWWatcher; -class ACLOwner; -class RGWGC; -class RGWMetaNotifier; -class RGWDataNotifier; -class RGWLC; -class RGWObjectExpirer; -class RGWMetaSyncProcessorThread; -class RGWDataSyncProcessorThread; -class RGWSyncLogTrimThread; -class RGWSyncTraceManager; -struct RGWZoneGroup; -struct RGWZoneParams; -class RGWReshard; -class RGWReshardWait; - -struct get_obj_data; - -/* flags for put_obj_meta() */ -#define PUT_OBJ_CREATE 0x01 -#define PUT_OBJ_EXCL 0x02 -#define PUT_OBJ_CREATE_EXCL (PUT_OBJ_CREATE | PUT_OBJ_EXCL) - -static inline void prepend_bucket_marker(const rgw_bucket& bucket, const std::string& orig_oid, std::string& oid) -{ - if (bucket.marker.empty() || orig_oid.empty()) { - oid = orig_oid; - } else { - oid = bucket.marker; - oid.append("_"); - oid.append(orig_oid); - } -} - -static inline void get_obj_bucket_and_oid_loc(const rgw_obj& obj, std::string& oid, std::string& locator) -{ - const rgw_bucket& bucket = obj.bucket; - prepend_bucket_marker(bucket, obj.get_oid(), oid); - const std::string& loc = obj.key.get_loc(); - if (!loc.empty()) { - prepend_bucket_marker(bucket, loc, locator); - } else { - locator.clear(); - } -} - -struct RGWOLHInfo { - rgw_obj target; - bool removed; - - RGWOLHInfo() : removed(false) {} - - void encode(bufferlist& bl) const { - ENCODE_START(1, 1, bl); - encode(target, bl); - encode(removed, bl); - ENCODE_FINISH(bl); - } - - void decode(bufferlist::const_iterator& bl) { - DECODE_START(1, bl); - decode(target, bl); - decode(removed, bl); - DECODE_FINISH(bl); - } - static void generate_test_instances(std::list& o); - void dump(Formatter *f) const; -}; -WRITE_CLASS_ENCODER(RGWOLHInfo) - -struct RGWOLHPendingInfo { - ceph::real_time time; - - RGWOLHPendingInfo() {} - - void encode(bufferlist& bl) const { - ENCODE_START(1, 1, bl); - encode(time, bl); - ENCODE_FINISH(bl); - } - - void decode(bufferlist::const_iterator& bl) { - DECODE_START(1, bl); - decode(time, bl); - DECODE_FINISH(bl); - } - - void dump(Formatter *f) const; -}; -WRITE_CLASS_ENCODER(RGWOLHPendingInfo) - -struct RGWUsageBatch { - std::map m; - - void insert(ceph::real_time& t, rgw_usage_log_entry& entry, bool *account) { - bool exists = m.find(t) != m.end(); - *account = !exists; - m[t].aggregate(entry); - } -}; - -struct RGWCloneRangeInfo { - rgw_obj src; - off_t src_ofs; - off_t dst_ofs; - uint64_t len; -}; - -class RGWFetchObjFilter { -public: - virtual ~RGWFetchObjFilter() {} - - virtual int filter(CephContext *cct, - const rgw_obj_key& source_key, - const RGWBucketInfo& dest_bucket_info, - std::optional dest_placement_rule, - const std::map& obj_attrs, - std::optional *poverride_owner, - const rgw_placement_rule **prule) = 0; -}; - -class RGWFetchObjFilter_Default : public RGWFetchObjFilter { -protected: - rgw_placement_rule dest_rule; -public: - RGWFetchObjFilter_Default() {} - - int 
filter(CephContext *cct, - const rgw_obj_key& source_key, - const RGWBucketInfo& dest_bucket_info, - std::optional dest_placement_rule, - const std::map& obj_attrs, - std::optional *poverride_owner, - const rgw_placement_rule **prule) override; -}; - -struct RGWObjStateManifest { - RGWObjState state; - std::optional manifest; -}; - -class RGWObjectCtx { - rgw::sal::Driver* driver; - ceph::shared_mutex lock = ceph::make_shared_mutex("RGWObjectCtx"); - - std::map objs_state; -public: - explicit RGWObjectCtx(rgw::sal::Driver* _driver) : driver(_driver) {} - RGWObjectCtx(RGWObjectCtx& _o) { - std::unique_lock wl{lock}; - this->driver = _o.driver; - this->objs_state = _o.objs_state; - } - - rgw::sal::Driver* get_driver() { - return driver; - } - - RGWObjStateManifest *get_state(const rgw_obj& obj); - - void set_compressed(const rgw_obj& obj); - void set_atomic(rgw_obj& obj); - void set_prefetch_data(const rgw_obj& obj); - void invalidate(const rgw_obj& obj); -}; - - -struct RGWRawObjState { - rgw_raw_obj obj; - bool has_attrs{false}; - bool exists{false}; - uint64_t size{0}; - ceph::real_time mtime; - uint64_t epoch{0}; - bufferlist obj_tag; - bool has_data{false}; - bufferlist data; - bool prefetch_data{false}; - uint64_t pg_ver{0}; - - /* important! don't forget to update copy constructor */ - - RGWObjVersionTracker objv_tracker; - - std::map attrset; - RGWRawObjState() {} - RGWRawObjState(const RGWRawObjState& rhs) : obj (rhs.obj) { - has_attrs = rhs.has_attrs; - exists = rhs.exists; - size = rhs.size; - mtime = rhs.mtime; - epoch = rhs.epoch; - if (rhs.obj_tag.length()) { - obj_tag = rhs.obj_tag; - } - has_data = rhs.has_data; - if (rhs.data.length()) { - data = rhs.data; - } - prefetch_data = rhs.prefetch_data; - pg_ver = rhs.pg_ver; - objv_tracker = rhs.objv_tracker; - } -}; - -struct RGWPoolIterCtx { - librados::IoCtx io_ctx; - librados::NObjectIterator iter; -}; - -struct RGWListRawObjsCtx { - bool initialized; - RGWPoolIterCtx iter_ctx; - - RGWListRawObjsCtx() : initialized(false) {} -}; - -struct objexp_hint_entry { - std::string tenant; - std::string bucket_name; - std::string bucket_id; - rgw_obj_key obj_key; - ceph::real_time exp_time; - - void encode(bufferlist& bl) const { - ENCODE_START(2, 1, bl); - encode(bucket_name, bl); - encode(bucket_id, bl); - encode(obj_key, bl); - encode(exp_time, bl); - encode(tenant, bl); - ENCODE_FINISH(bl); - } - - void decode(bufferlist::const_iterator& bl) { - // XXX Do we want DECODE_START_LEGACY_COMPAT_LEN(2, 1, 1, bl); ? 
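// (On the XXX above: DECODE_START_LEGACY_COMPAT_LEN is only needed when
// pre-versioned encodings of a struct may still exist on disk; assuming
// every writer of objexp_hint_entry used the versioned ENCODE_START
// envelope, the plain DECODE_START(2, bl) below is sufficient.)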
- DECODE_START(2, bl); - decode(bucket_name, bl); - decode(bucket_id, bl); - decode(obj_key, bl); - decode(exp_time, bl); - if (struct_v >= 2) { - decode(tenant, bl); - } else { - tenant.clear(); - } - DECODE_FINISH(bl); - } - - void dump(Formatter *f) const; - static void generate_test_instances(std::list& o); -}; -WRITE_CLASS_ENCODER(objexp_hint_entry) - -class RGWMetaSyncStatusManager; -class RGWDataSyncStatusManager; -class RGWCoroutinesManagerRegistry; - -class RGWGetDirHeader_CB; -class RGWGetUserHeader_CB; -namespace rgw { namespace sal { - class RadosStore; - class MPRadosSerializer; - class LCRadosSerializer; -} } - -class RGWAsyncRadosProcessor; - -template -class RGWChainedCacheImpl; - -struct bucket_info_entry { - RGWBucketInfo info; - real_time mtime; - std::map attrs; -}; - -struct tombstone_entry; - -template -class lru_map; -using tombstone_cache_t = lru_map; - -class RGWIndexCompletionManager; - -class RGWRados -{ - friend class RGWGC; - friend class RGWMetaNotifier; - friend class RGWDataNotifier; - friend class RGWObjectExpirer; - friend class RGWMetaSyncProcessorThread; - friend class RGWDataSyncProcessorThread; - friend class RGWReshard; - friend class RGWBucketReshard; - friend class RGWBucketReshardLock; - friend class BucketIndexLockGuard; - friend class rgw::sal::MPRadosSerializer; - friend class rgw::sal::LCRadosSerializer; - friend class rgw::sal::RadosStore; - - /** Open the pool used as root for this gateway */ - int open_root_pool_ctx(const DoutPrefixProvider *dpp); - int open_gc_pool_ctx(const DoutPrefixProvider *dpp); - int open_lc_pool_ctx(const DoutPrefixProvider *dpp); - int open_objexp_pool_ctx(const DoutPrefixProvider *dpp); - int open_reshard_pool_ctx(const DoutPrefixProvider *dpp); - int open_notif_pool_ctx(const DoutPrefixProvider *dpp); - - int open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx& io_ctx, - bool mostly_omap); - - - ceph::mutex lock = ceph::make_mutex("rados_timer_lock"); - SafeTimer *timer; - - rgw::sal::RadosStore* driver = nullptr; - RGWGC *gc = nullptr; - RGWLC *lc; - RGWObjectExpirer *obj_expirer; - bool use_gc_thread; - bool use_lc_thread; - bool quota_threads; - bool run_sync_thread; - bool run_reshard_thread; - - RGWMetaNotifier *meta_notifier; - RGWDataNotifier *data_notifier; - RGWMetaSyncProcessorThread *meta_sync_processor_thread; - RGWSyncTraceManager *sync_tracer = nullptr; - std::map data_sync_processor_threads; - - boost::optional bucket_trim; - RGWSyncLogTrimThread *sync_log_trimmer{nullptr}; - - ceph::mutex meta_sync_thread_lock = ceph::make_mutex("meta_sync_thread_lock"); - ceph::mutex data_sync_thread_lock = ceph::make_mutex("data_sync_thread_lock"); - - librados::IoCtx root_pool_ctx; // .rgw - - double inject_notify_timeout_probability = 0; - unsigned max_notify_retries = 0; - - friend class RGWWatcher; - - ceph::mutex bucket_id_lock = ceph::make_mutex("rados_bucket_id"); - - // This field represents the number of bucket index object shards - uint32_t bucket_index_max_shards; - - std::string get_cluster_fsid(const DoutPrefixProvider *dpp, optional_yield y); - - int get_obj_head_ref(const DoutPrefixProvider *dpp, const rgw_placement_rule& target_placement_rule, const rgw_obj& obj, rgw_rados_ref *ref); - int get_obj_head_ref(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref); - int get_system_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref); - uint64_t max_bucket_id; - - int 
get_olh_target_state(const DoutPrefixProvider *dpp, RGWObjectCtx& rctx, - RGWBucketInfo& bucket_info, rgw::sal::Object* obj, - RGWObjState *olh_state, RGWObjState **target_state, - RGWObjManifest **target_manifest, optional_yield y); - int get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, RGWObjState **state, RGWObjManifest** manifest, - bool follow_olh, optional_yield y, bool assume_noent = false); - int append_atomic_test(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, - librados::ObjectOperation& op, RGWObjState **state, - RGWObjManifest** pmanifest, optional_yield y); - - int update_placement_map(); - int store_bucket_info(RGWBucketInfo& info, std::map *pattrs, RGWObjVersionTracker *objv_tracker, bool exclusive); - - void remove_rgw_head_obj(librados::ObjectWriteOperation& op); - void cls_obj_check_prefix_exist(librados::ObjectOperation& op, const std::string& prefix, bool fail_if_exist); - void cls_obj_check_mtime(librados::ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type); -protected: - CephContext *cct; - - librados::Rados rados; - - using RGWChainedCacheImpl_bucket_info_entry = RGWChainedCacheImpl; - RGWChainedCacheImpl_bucket_info_entry *binfo_cache; - - tombstone_cache_t *obj_tombstone_cache; - - librados::IoCtx gc_pool_ctx; // .rgw.gc - librados::IoCtx lc_pool_ctx; // .rgw.lc - librados::IoCtx objexp_pool_ctx; - librados::IoCtx reshard_pool_ctx; - librados::IoCtx notif_pool_ctx; // .rgw.notif - - bool pools_initialized; - - RGWQuotaHandler *quota_handler; - - RGWCoroutinesManagerRegistry *cr_registry; - - RGWSyncModuleInstanceRef sync_module; - bool writeable_zone{false}; - - RGWIndexCompletionManager *index_completion_manager{nullptr}; - - bool use_cache{false}; - bool use_gc{true}; - bool use_datacache{false}; - - int get_obj_head_ioctx(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx); -public: - RGWRados(): timer(NULL), - gc(NULL), lc(NULL), obj_expirer(NULL), use_gc_thread(false), use_lc_thread(false), quota_threads(false), - run_sync_thread(false), run_reshard_thread(false), meta_notifier(NULL), - data_notifier(NULL), meta_sync_processor_thread(NULL), - bucket_index_max_shards(0), - max_bucket_id(0), cct(NULL), - binfo_cache(NULL), obj_tombstone_cache(nullptr), - pools_initialized(false), - quota_handler(NULL), - cr_registry(NULL), - pctl(&ctl), - reshard(NULL) {} - - RGWRados& set_use_cache(bool status) { - use_cache = status; - return *this; - } - - RGWRados& set_use_gc(bool status) { - use_gc = status; - return *this; - } - - RGWRados& set_use_datacache(bool status) { - use_datacache = status; - return *this; - } - - bool get_use_datacache() { - return use_datacache; - } - - RGWLC *get_lc() { - return lc; - } - - RGWGC *get_gc() { - return gc; - } - - RGWRados& set_run_gc_thread(bool _use_gc_thread) { - use_gc_thread = _use_gc_thread; - return *this; - } - - RGWRados& set_run_lc_thread(bool _use_lc_thread) { - use_lc_thread = _use_lc_thread; - return *this; - } - - RGWRados& set_run_quota_threads(bool _run_quota_threads) { - quota_threads = _run_quota_threads; - return *this; - } - - RGWRados& set_run_sync_thread(bool _run_sync_thread) { - run_sync_thread = _run_sync_thread; - return *this; - } - - RGWRados& set_run_reshard_thread(bool _run_reshard_thread) { - run_reshard_thread = _run_reshard_thread; - return *this; - } - - librados::IoCtx* get_lc_pool_ctx() { - 
return &lc_pool_ctx; - } - - librados::IoCtx& get_notif_pool_ctx() { - return notif_pool_ctx; - } - - void set_context(CephContext *_cct) { - cct = _cct; - } - void set_store(rgw::sal::RadosStore* _driver) { - driver = _driver; - } - - RGWServices svc; - RGWCtl ctl; - - RGWCtl *pctl{nullptr}; - - /** - * AmazonS3 errors contain a HostId string, but is an opaque base64 blob; we - * try to be more transparent. This has a wrapper so we can update it when zonegroup/zone are changed. - */ - std::string host_id; - - RGWReshard *reshard; - std::shared_ptr reshard_wait; - - virtual ~RGWRados() = default; - - tombstone_cache_t *get_tombstone_cache() { - return obj_tombstone_cache; - } - const RGWSyncModuleInstanceRef& get_sync_module() { - return sync_module; - } - RGWSyncTraceManager *get_sync_tracer() { - return sync_tracer; - } - - int get_required_alignment(const DoutPrefixProvider *dpp, const rgw_pool& pool, uint64_t *alignment); - void get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size); - int get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment = nullptr); - int get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment = nullptr); - - uint32_t get_max_bucket_shards() { - return RGWSI_BucketIndex_RADOS::shards_max(); - } - - - int get_raw_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref); - - int list_raw_objects_init(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& marker, RGWListRawObjsCtx *ctx); - int list_raw_objects_next(const DoutPrefixProvider *dpp, const std::string& prefix_filter, int max, - RGWListRawObjsCtx& ctx, std::list& oids, - bool *is_truncated); - int list_raw_objects(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& prefix_filter, int max, - RGWListRawObjsCtx& ctx, std::list& oids, - bool *is_truncated); - std::string list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx); - - CephContext *ctx() { return cct; } - /** do all necessary setup of the storage device */ - int init_begin(CephContext *_cct, const DoutPrefixProvider *dpp) { - set_context(_cct); - return init_begin(dpp); - } - /** Initialize the RADOS instance and prepare to do other ops */ - int init_svc(bool raw, const DoutPrefixProvider *dpp); - int init_ctl(const DoutPrefixProvider *dpp); - virtual int init_rados(); - int init_begin(const DoutPrefixProvider *dpp); - int init_complete(const DoutPrefixProvider *dpp); - void finalize(); - - int register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type, const std::map& meta); - int update_service_map(const DoutPrefixProvider *dpp, std::map&& status); - - /// list logs - int log_list_init(const DoutPrefixProvider *dpp, const std::string& prefix, RGWAccessHandle *handle); - int log_list_next(RGWAccessHandle handle, std::string *name); - - /// remove log - int log_remove(const DoutPrefixProvider *dpp, const std::string& name); - - /// show log - int log_show_init(const DoutPrefixProvider *dpp, const std::string& name, RGWAccessHandle *handle); - int log_show_next(const DoutPrefixProvider *dpp, RGWAccessHandle handle, rgw_log_entry *entry); - - // log bandwidth info - int log_usage(const DoutPrefixProvider *dpp, std::map& usage_info); - int read_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const std::string& bucket_name, uint64_t start_epoch, uint64_t end_epoch, - 
uint32_t max_entries, bool *is_truncated, RGWUsageIter& read_iter, std::map& usage); - int trim_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const std::string& bucket_name, uint64_t start_epoch, uint64_t end_epoch); - int clear_usage(const DoutPrefixProvider *dpp); - - int create_pool(const DoutPrefixProvider *dpp, const rgw_pool& pool); - - void create_bucket_id(std::string *bucket_id); - - bool get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool); - bool obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj); - - int create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket, - const std::string& zonegroup_id, - const rgw_placement_rule& placement_rule, - const std::string& swift_ver_location, - const RGWQuotaInfo * pquota_info, - std::map& attrs, - RGWBucketInfo& bucket_info, - obj_version *pobjv, - obj_version *pep_objv, - ceph::real_time creation_time, - rgw_bucket *master_bucket, - uint32_t *master_num_shards, - optional_yield y, - const DoutPrefixProvider *dpp, - bool exclusive = true); - - RGWCoroutinesManagerRegistry *get_cr_registry() { return cr_registry; } - - struct BucketShard { - RGWRados *store; - rgw_bucket bucket; - int shard_id; - RGWSI_RADOS::Obj bucket_obj; - - explicit BucketShard(RGWRados *_store) : store(_store), shard_id(-1) {} - int init(const rgw_bucket& _bucket, const rgw_obj& obj, - RGWBucketInfo* out, const DoutPrefixProvider *dpp); - int init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj); - int init(const DoutPrefixProvider *dpp, - const RGWBucketInfo& bucket_info, - const rgw::bucket_index_layout_generation& index, int sid); - - friend std::ostream& operator<<(std::ostream& out, const BucketShard& bs) { - out << "BucketShard:{ bucket=" << bs.bucket << - ", shard_id=" << bs.shard_id << - ", bucket_ojb=" << bs.bucket_obj << "}"; - return out; - } - }; - - class Object { - RGWRados *store; - rgw::sal::Bucket* bucket; - RGWObjectCtx& ctx; - rgw::sal::Object* obj; - - BucketShard bs; - - RGWObjState *state; - RGWObjManifest *manifest; - - bool versioning_disabled; - - bool bs_initialized; - - const rgw_placement_rule *pmeta_placement_rule; - - protected: - int get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, RGWObjManifest **pmanifest, bool follow_olh, optional_yield y, bool assume_noent = false); - void invalidate_state(); - - int prepare_atomic_modification(const DoutPrefixProvider *dpp, librados::ObjectWriteOperation& op, bool reset_obj, const std::string *ptag, - const char *ifmatch, const char *ifnomatch, bool removal_op, bool modify_tail, optional_yield y); - int complete_atomic_modification(const DoutPrefixProvider *dpp); - - public: - Object(RGWRados *_store, rgw::sal::Bucket* _bucket, RGWObjectCtx& _ctx, rgw::sal::Object* _obj) : store(_store), bucket(_bucket), - ctx(_ctx), obj(_obj), bs(store), - state(NULL), manifest(nullptr), versioning_disabled(false), - bs_initialized(false), - pmeta_placement_rule(nullptr) {} - - RGWRados *get_store() { return store; } - rgw_obj get_obj() { return obj->get_obj(); } - RGWObjectCtx& get_ctx() { return ctx; } - RGWBucketInfo& get_bucket_info() { return bucket->get_info(); } - const std::string& get_instance() { return obj->get_instance(); } - rgw::sal::Object* get_target() { return obj; } - int get_manifest(const DoutPrefixProvider *dpp, RGWObjManifest **pmanifest, optional_yield y); - - int get_bucket_shard(BucketShard **pbs, const DoutPrefixProvider *dpp) { - if 
(!bs_initialized) { - int r = - bs.init(bucket->get_key(), obj->get_obj(), nullptr /* no RGWBucketInfo */, dpp); - if (r < 0) { - return r; - } - bs_initialized = true; - } - *pbs = &bs; - return 0; - } - - void set_versioning_disabled(bool status) { - versioning_disabled = status; - } - - bool versioning_enabled() { - return (!versioning_disabled && bucket->versioning_enabled()); - } - - void set_meta_placement_rule(const rgw_placement_rule *p) { - pmeta_placement_rule = p; - } - - const rgw_placement_rule& get_meta_placement_rule() { - return pmeta_placement_rule ? *pmeta_placement_rule : bucket->get_placement_rule(); - } - - struct Read { - RGWRados::Object *source; - - struct GetObjState { - std::map io_ctxs; - rgw_pool cur_pool; - librados::IoCtx *cur_ioctx{nullptr}; - rgw_obj obj; - rgw_raw_obj head_obj; - } state; - - struct ConditionParams { - const ceph::real_time *mod_ptr; - const ceph::real_time *unmod_ptr; - bool high_precision_time; - uint32_t mod_zone_id; - uint64_t mod_pg_ver; - const char *if_match; - const char *if_nomatch; - - ConditionParams() : - mod_ptr(NULL), unmod_ptr(NULL), high_precision_time(false), mod_zone_id(0), mod_pg_ver(0), - if_match(NULL), if_nomatch(NULL) {} - } conds; - - struct Params { - ceph::real_time *lastmod; - uint64_t *obj_size; - std::map *attrs; - rgw_obj *target_obj; - - Params() : lastmod(nullptr), obj_size(nullptr), attrs(nullptr), - target_obj(nullptr) {} - } params; - - explicit Read(RGWRados::Object *_source) : source(_source) {} - - int prepare(optional_yield y, const DoutPrefixProvider *dpp); - static int range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end); - int read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y, const DoutPrefixProvider *dpp); - int iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb, optional_yield y); - int get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest, optional_yield y); - }; - - struct Write { - RGWRados::Object *target; - - struct MetaParams { - ceph::real_time *mtime; - std::map* rmattrs; - const bufferlist *data; - RGWObjManifest *manifest; - const std::string *ptag; - std::list *remove_objs; - ceph::real_time set_mtime; - rgw_user owner; - RGWObjCategory category; - int flags; - const char *if_match; - const char *if_nomatch; - std::optional olh_epoch; - ceph::real_time delete_at; - bool canceled; - const std::string *user_data; - rgw_zone_set *zones_trace; - bool modify_tail; - bool completeMultipart; - bool appendable; - - MetaParams() : mtime(NULL), rmattrs(NULL), data(NULL), manifest(NULL), ptag(NULL), - remove_objs(NULL), category(RGWObjCategory::Main), flags(0), - if_match(NULL), if_nomatch(NULL), canceled(false), user_data(nullptr), zones_trace(nullptr), - modify_tail(false), completeMultipart(false), appendable(false) {} - } meta; - - explicit Write(RGWRados::Object *_target) : target(_target) {} - - int _do_write_meta(const DoutPrefixProvider *dpp, - uint64_t size, uint64_t accounted_size, - std::map& attrs, - bool modify_tail, bool assume_noent, - void *index_op, optional_yield y); - int write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size, - std::map& attrs, optional_yield y); - int write_data(const char *data, uint64_t ofs, uint64_t len, bool exclusive); - const req_state* get_req_state() { - return nullptr; /* XXX dang Only used by LTTng, and it handles null anyway */ - } - }; - - struct Delete { - RGWRados::Object *target; - - struct DeleteParams { - rgw_user bucket_owner; - int 
versioning_status; // versioning flags defined in enum RGWBucketFlags - ACLOwner obj_owner; // needed for creation of deletion marker - uint64_t olh_epoch; - std::string marker_version_id; - uint32_t bilog_flags; - std::list *remove_objs; - ceph::real_time expiration_time; - ceph::real_time unmod_since; - ceph::real_time mtime; /* for setting delete marker mtime */ - bool high_precision_time; - rgw_zone_set *zones_trace; - bool abortmp; - uint64_t parts_accounted_size; - - DeleteParams() : versioning_status(0), olh_epoch(0), bilog_flags(0), remove_objs(NULL), high_precision_time(false), zones_trace(nullptr), abortmp(false), parts_accounted_size(0) {} - } params; - - struct DeleteResult { - bool delete_marker; - std::string version_id; - - DeleteResult() : delete_marker(false) {} - } result; - - explicit Delete(RGWRados::Object *_target) : target(_target) {} - - int delete_obj(optional_yield y, const DoutPrefixProvider *dpp); - }; - - struct Stat { - RGWRados::Object *source; - - struct Result { - rgw_obj obj; - std::optional manifest; - uint64_t size{0}; - struct timespec mtime {}; - std::map attrs; - } result; - - struct State { - librados::IoCtx io_ctx; - librados::AioCompletion *completion; - int ret; - - State() : completion(NULL), ret(0) {} - } state; - - - explicit Stat(RGWRados::Object *_source) : source(_source) {} - - int stat_async(const DoutPrefixProvider *dpp); - int wait(const DoutPrefixProvider *dpp); - int stat(); - private: - int finish(const DoutPrefixProvider *dpp); - }; - }; - - class Bucket { - RGWRados *store; - RGWBucketInfo bucket_info; - rgw_bucket& bucket; - int shard_id; - - public: - Bucket(RGWRados *_store, const RGWBucketInfo& _bucket_info) : store(_store), bucket_info(_bucket_info), bucket(bucket_info.bucket), - shard_id(RGW_NO_SHARD) {} - RGWRados *get_store() { return store; } - rgw_bucket& get_bucket() { return bucket; } - RGWBucketInfo& get_bucket_info() { return bucket_info; } - - int update_bucket_id(const std::string& new_bucket_id, const DoutPrefixProvider *dpp); - - int get_shard_id() { return shard_id; } - void set_shard_id(int id) { - shard_id = id; - } - - class UpdateIndex { - RGWRados::Bucket *target; - std::string optag; - rgw_obj obj; - uint16_t bilog_flags{0}; - BucketShard bs; - bool bs_initialized{false}; - bool blind; - bool prepared{false}; - rgw_zone_set *zones_trace{nullptr}; - - int init_bs(const DoutPrefixProvider *dpp) { - int r = - bs.init(target->get_bucket(), obj, &target->bucket_info, dpp); - if (r < 0) { - return r; - } - bs_initialized = true; - return 0; - } - - void invalidate_bs() { - bs_initialized = false; - } - - int guard_reshard(const DoutPrefixProvider *dpp, const rgw_obj& obj_instance, BucketShard **pbs, std::function call); - public: - - UpdateIndex(RGWRados::Bucket *_target, const rgw_obj& _obj) : target(_target), obj(_obj), - bs(target->get_store()) { - blind = (target->get_bucket_info().layout.current_index.layout.type == rgw::BucketIndexType::Indexless); - } - - int get_bucket_shard(BucketShard **pbs, const DoutPrefixProvider *dpp) { - if (!bs_initialized) { - int r = init_bs(dpp); - if (r < 0) { - return r; - } - } - *pbs = &bs; - return 0; - } - - void set_bilog_flags(uint16_t flags) { - bilog_flags = flags; - } - - void set_zones_trace(rgw_zone_set *_zones_trace) { - zones_trace = _zones_trace; - } - - int prepare(const DoutPrefixProvider *dpp, RGWModifyOp, const std::string *write_tag, optional_yield y); - int complete(const DoutPrefixProvider *dpp, int64_t poolid, uint64_t epoch, uint64_t size, - uint64_t 
accounted_size, ceph::real_time& ut, - const std::string& etag, const std::string& content_type, - const std::string& storage_class, - bufferlist *acl_bl, RGWObjCategory category, - std::list *remove_objs, const std::string *user_data = nullptr, bool appendable = false); - int complete_del(const DoutPrefixProvider *dpp, - int64_t poolid, uint64_t epoch, - ceph::real_time& removed_mtime, /* mtime of removed object */ - std::list *remove_objs); - int cancel(const DoutPrefixProvider *dpp, - std::list *remove_objs); - - const std::string *get_optag() { return &optag; } - - bool is_prepared() { return prepared; } - }; // class UpdateIndex - - class List { - protected: - // absolute maximum number of objects that - // list_objects_(un)ordered can return - static constexpr int64_t bucket_list_objects_absolute_max = 25000; - - RGWRados::Bucket *target; - rgw_obj_key next_marker; - - int list_objects_ordered(const DoutPrefixProvider *dpp, - int64_t max, - std::vector *result, - std::map *common_prefixes, - bool *is_truncated, - optional_yield y); - int list_objects_unordered(const DoutPrefixProvider *dpp, - int64_t max, - std::vector *result, - std::map *common_prefixes, - bool *is_truncated, - optional_yield y); - - public: - - struct Params { - std::string prefix; - std::string delim; - rgw_obj_key marker; - rgw_obj_key end_marker; - std::string ns; - bool enforce_ns; - RGWAccessListFilter* access_list_filter; - RGWBucketListNameFilter force_check_filter; - bool list_versions; - bool allow_unordered; - - Params() : - enforce_ns(true), - access_list_filter(nullptr), - list_versions(false), - allow_unordered(false) - {} - } params; - - explicit List(RGWRados::Bucket *_target) : target(_target) {} - - int list_objects(const DoutPrefixProvider *dpp, int64_t max, - std::vector *result, - std::map *common_prefixes, - bool *is_truncated, - optional_yield y) { - if (params.allow_unordered) { - return list_objects_unordered(dpp, max, result, common_prefixes, - is_truncated, y); - } else { - return list_objects_ordered(dpp, max, result, common_prefixes, - is_truncated, y); - } - } - rgw_obj_key& get_next_marker() { - return next_marker; - } - }; // class List - }; // class Bucket - - int on_last_entry_in_listing(const DoutPrefixProvider *dpp, - RGWBucketInfo& bucket_info, - const std::string& obj_prefix, - const std::string& obj_delim, - std::function handler); - - bool swift_versioning_enabled(rgw::sal::Bucket* bucket) const; - - int swift_versioning_copy(RGWObjectCtx& obj_ctx, /* in/out */ - const rgw_user& user, /* in */ - rgw::sal::Bucket* bucket, /* in */ - rgw::sal::Object* obj, /* in */ - const DoutPrefixProvider *dpp, /* in/out */ - optional_yield y); /* in */ - int swift_versioning_restore(RGWObjectCtx& obj_ctx, /* in/out */ - const rgw_user& user, /* in */ - rgw::sal::Bucket* bucket, /* in */ - rgw::sal::Object* obj, /* in */ - bool& restored, /* out */ - const DoutPrefixProvider *dpp); /* in/out */ - int copy_obj_to_remote_dest(const DoutPrefixProvider *dpp, - RGWObjState *astate, - std::map& src_attrs, - RGWRados::Object::Read& read_op, - const rgw_user& user_id, - rgw::sal::Object* dest_obj, - ceph::real_time *mtime); - - enum AttrsMod { - ATTRSMOD_NONE = 0, - ATTRSMOD_REPLACE = 1, - ATTRSMOD_MERGE = 2 - }; - - D3nDataCache* d3n_data_cache{nullptr}; - - int rewrite_obj(rgw::sal::Object* obj, const DoutPrefixProvider *dpp, optional_yield y); - - int stat_remote_obj(const DoutPrefixProvider *dpp, - RGWObjectCtx& obj_ctx, - const rgw_user& user_id, - req_info *info, - const rgw_zone_id& 
source_zone, - rgw::sal::Object* src_obj, - const RGWBucketInfo *src_bucket_info, - real_time *src_mtime, - uint64_t *psize, - const real_time *mod_ptr, - const real_time *unmod_ptr, - bool high_precision_time, - const char *if_match, - const char *if_nomatch, - std::map *pattrs, - std::map *pheaders, - std::string *version_id, - std::string *ptag, - std::string *petag); - - int fetch_remote_obj(RGWObjectCtx& obj_ctx, - const rgw_user& user_id, - req_info *info, - const rgw_zone_id& source_zone, - rgw::sal::Object* dest_obj, - rgw::sal::Object* src_obj, - rgw::sal::Bucket* dest_bucket, - rgw::sal::Bucket* src_bucket, - std::optional dest_placement, - ceph::real_time *src_mtime, - ceph::real_time *mtime, - const ceph::real_time *mod_ptr, - const ceph::real_time *unmod_ptr, - bool high_precision_time, - const char *if_match, - const char *if_nomatch, - AttrsMod attrs_mod, - bool copy_if_newer, - rgw::sal::Attrs& attrs, - RGWObjCategory category, - std::optional olh_epoch, - ceph::real_time delete_at, - std::string *ptag, - std::string *petag, - void (*progress_cb)(off_t, void *), - void *progress_data, - const DoutPrefixProvider *dpp, - RGWFetchObjFilter *filter, - rgw_zone_set *zones_trace= nullptr, - std::optional* bytes_transferred = 0); - /** - * Copy an object. - * dest_obj: the object to copy into - * src_obj: the object to copy from - * attrs: usage depends on attrs_mod parameter - * attrs_mod: the modification mode of the attrs, may have the following values: - * ATTRSMOD_NONE - the attributes of the source object will be - * copied without modifications, attrs parameter is ignored; - * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs - * parameter, source object attributes are not copied; - * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes - * are overwritten by values contained in attrs parameter. - * Returns: 0 on success, -ERR# otherwise. - */ - int copy_obj(RGWObjectCtx& obj_ctx, - const rgw_user& user_id, - req_info *info, - const rgw_zone_id& source_zone, - rgw::sal::Object* dest_obj, - rgw::sal::Object* src_obj, - rgw::sal::Bucket* dest_bucket, - rgw::sal::Bucket* src_bucket, - const rgw_placement_rule& dest_placement, - ceph::real_time *src_mtime, - ceph::real_time *mtime, - const ceph::real_time *mod_ptr, - const ceph::real_time *unmod_ptr, - bool high_precision_time, - const char *if_match, - const char *if_nomatch, - AttrsMod attrs_mod, - bool copy_if_newer, - std::map& attrs, - RGWObjCategory category, - uint64_t olh_epoch, - ceph::real_time delete_at, - std::string *version_id, - std::string *ptag, - std::string *petag, - void (*progress_cb)(off_t, void *), - void *progress_data, - const DoutPrefixProvider *dpp, - optional_yield y); - - int copy_obj_data(RGWObjectCtx& obj_ctx, - rgw::sal::Bucket* bucket, - const rgw_placement_rule& dest_placement, - RGWRados::Object::Read& read_op, off_t end, - rgw::sal::Object* dest_obj, - ceph::real_time *mtime, - ceph::real_time set_mtime, - std::map& attrs, - uint64_t olh_epoch, - ceph::real_time delete_at, - std::string *petag, - const DoutPrefixProvider *dpp, - optional_yield y); - - int transition_obj(RGWObjectCtx& obj_ctx, - rgw::sal::Bucket* bucket, - rgw::sal::Object& obj, - const rgw_placement_rule& placement_rule, - const real_time& mtime, - uint64_t olh_epoch, - const DoutPrefixProvider *dpp, - optional_yield y); - - int check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y); - - /** - * Delete a bucket. 
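* (with the default check_empty == true the call is guarded by check_bucket_empty() above, so deleting a non-empty bucket fails; callers pass check_empty = false to skip that check)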
- * bucket: the name of the bucket to delete - * Returns 0 on success, -ERR# otherwise. - */ - int delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp, bool check_empty = true); - - void wakeup_meta_sync_shards(std::set& shard_ids); - - void wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, bc::flat_map >& entries); - - RGWMetaSyncStatusManager* get_meta_sync_manager(); - RGWDataSyncStatusManager* get_data_sync_manager(const rgw_zone_id& source_zone); - - int set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner, const DoutPrefixProvider *dpp); - int set_buckets_enabled(std::vector& buckets, bool enabled, const DoutPrefixProvider *dpp); - int bucket_suspended(const DoutPrefixProvider *dpp, rgw_bucket& bucket, bool *suspended); - - /** Delete an object.*/ - int delete_obj(rgw::sal::Driver* driver, - const DoutPrefixProvider *dpp, - const RGWBucketInfo& bucket_owner, - const rgw_obj& src_obj, - int versioning_status, // versioning flags defined in enum RGWBucketFlags - uint16_t bilog_flags = 0, - const ceph::real_time& expiration_time = ceph::real_time(), - rgw_zone_set *zones_trace = nullptr); - int delete_obj(const DoutPrefixProvider *dpp, - const RGWBucketInfo& bucket_owner, - rgw::sal::Object* src_obj, - int versioning_status, // versioning flags defined in enum RGWBucketFlags - uint16_t bilog_flags = 0, - const ceph::real_time& expiration_time = ceph::real_time(), - rgw_zone_set *zones_trace = nullptr); - - int delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj); - - /** Remove an object from the bucket index */ - int delete_obj_index(const rgw_obj& obj, ceph::real_time mtime, const DoutPrefixProvider *dpp); - - /** - * Set an attr on an object. - * bucket: name of the bucket holding the object - * obj: name of the object to set the attr on - * name: the attr to set - * bl: the contents of the attr - * Returns: 0 on success, -ERR# otherwise. 
- */ - int set_attr(const DoutPrefixProvider *dpp, RGWObjectCtx* ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, const char *name, bufferlist& bl); - - int set_attrs(const DoutPrefixProvider *dpp, void *ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, - std::map& attrs, - std::map* rmattrs, - optional_yield y); - - int get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, RGWObjState **state, RGWObjManifest** manifest, - bool follow_olh, optional_yield y, bool assume_noent = false); - int get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, RGWObjState **state, RGWObjManifest** manifest, optional_yield y) { - return get_obj_state(dpp, rctx, bucket_info, obj, state, manifest, true, y); - } - - using iterate_obj_cb = int (*)(const DoutPrefixProvider*, const rgw_raw_obj&, off_t, off_t, - off_t, bool, RGWObjState*, void*); - - int iterate_obj(const DoutPrefixProvider *dpp, RGWObjectCtx& ctx, RGWBucketInfo& bucket_info, - rgw::sal::Object* obj, off_t ofs, off_t end, - uint64_t max_chunk_size, iterate_obj_cb cb, void *arg, - optional_yield y); - - int append_atomic_test(const DoutPrefixProvider *dpp, const RGWObjState* astate, librados::ObjectOperation& op); - - virtual int get_obj_iterate_cb(const DoutPrefixProvider *dpp, - const rgw_raw_obj& read_obj, off_t obj_ofs, - off_t read_ofs, off_t len, bool is_head_obj, - RGWObjState *astate, void *arg); - - /** - * a simple object read without keeping state - */ - - int raw_obj_stat(const DoutPrefixProvider *dpp, - rgw_raw_obj& obj, uint64_t *psize, ceph::real_time *pmtime, uint64_t *epoch, - std::map *attrs, bufferlist *first_chunk, - RGWObjVersionTracker *objv_tracker, optional_yield y); - - int obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectWriteOperation *op); - int obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectReadOperation *op); - - int guard_reshard(const DoutPrefixProvider *dpp, - BucketShard *bs, - const rgw_obj& obj_instance, - RGWBucketInfo& bucket_info, - std::function call); - int block_while_resharding(RGWRados::BucketShard *bs, - const rgw_obj& obj_instance, - RGWBucketInfo& bucket_info, - optional_yield y, - const DoutPrefixProvider *dpp); - - void bucket_index_guard_olh_op(const DoutPrefixProvider *dpp, RGWObjState& olh_state, librados::ObjectOperation& op); - int olh_init_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::string *op_tag); - int olh_init_modification_impl(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::string *op_tag); - int bucket_index_link_olh(const DoutPrefixProvider *dpp, - RGWBucketInfo& bucket_info, RGWObjState& olh_state, - const rgw_obj& obj_instance, bool delete_marker, - const std::string& op_tag, struct rgw_bucket_dir_entry_meta *meta, - uint64_t olh_epoch, - ceph::real_time unmod_since, bool high_precision_time, - rgw_zone_set *zones_trace = nullptr, - bool log_data_change = false); - int bucket_index_unlink_instance(const DoutPrefixProvider *dpp, - RGWBucketInfo& bucket_info, - const rgw_obj& obj_instance, - const std::string& op_tag, const std::string& olh_tag, - uint64_t olh_epoch, rgw_zone_set *zones_trace = nullptr); - int bucket_index_read_olh_log(const DoutPrefixProvider *dpp, - 
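
iterate_obj() above hands an object range to a callback in pieces no larger than max_chunk_size. The chunking contract reduces to the following sketch, which treats end as inclusive and leaves out the head/tail-object distinction the real callback also receives:

// Illustrative sketch, not part of the patch: split [ofs, end] into
// max_chunk pieces and stop on the first callback error.
#include <algorithm>
#include <cstdint>
#include <cstdio>

using chunk_cb = int (*)(uint64_t ofs, uint64_t len, void* arg);

int iterate_range(uint64_t ofs, uint64_t end, uint64_t max_chunk,
                  chunk_cb cb, void* arg) {
  while (ofs <= end) {
    const uint64_t len = std::min(max_chunk, end - ofs + 1);
    if (int r = cb(ofs, len, arg); r < 0)
      return r;                          // propagate callback error
    ofs += len;
  }
  return 0;
}

int main() {
  return iterate_range(0, 9 * 1024 * 1024 - 1, 4 * 1024 * 1024,
      [](uint64_t o, uint64_t l, void*) {
        std::printf("chunk at %llu, len %llu\n",
                    (unsigned long long)o, (unsigned long long)l);
        return 0;
      },
      nullptr);
}
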
RGWBucketInfo& bucket_info, RGWObjState& state, - const rgw_obj& obj_instance, uint64_t ver_marker, - std::map > *log, bool *is_truncated); - int bucket_index_trim_olh_log(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& obj_state, const rgw_obj& obj_instance, uint64_t ver); - int bucket_index_clear_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance); - int apply_olh_log(const DoutPrefixProvider *dpp, RGWObjState& obj_state, RGWBucketInfo& bucket_info, const rgw::sal::Object* obj, - bufferlist& obj_tag, std::map >& log, - uint64_t *plast_ver, rgw_zone_set *zones_trace = nullptr); - int update_olh(const DoutPrefixProvider *dpp, RGWObjState *state, RGWBucketInfo& bucket_info, const rgw::sal::Object* obj, rgw_zone_set *zones_trace = nullptr); - int set_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta, - uint64_t olh_epoch, ceph::real_time unmod_since, bool high_precision_time, - optional_yield y, rgw_zone_set *zones_trace = nullptr, bool log_data_change = false); - int repair_olh(const DoutPrefixProvider *dpp, RGWObjState* state, const RGWBucketInfo& bucket_info, - const rgw_obj& obj); - int unlink_obj_instance(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, rgw::sal::Object* target_obj, - uint64_t olh_epoch, optional_yield y, rgw_zone_set *zones_trace = nullptr); - - void check_pending_olh_entries(const DoutPrefixProvider *dpp, std::map& pending_entries, std::map *rm_pending_entries); - int remove_olh_pending_entries(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::map& pending_attrs); - int follow_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjectCtx& ctx, RGWObjState *state, rgw::sal::Object* olh_obj, rgw_obj *target); - int get_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh); - - void gen_rand_obj_instance_name(rgw_obj_key *target_key); - void gen_rand_obj_instance_name(rgw_obj *target); - - int update_containers_stats(std::map& m, const DoutPrefixProvider *dpp); - int append_async(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, size_t size, bufferlist& bl); - -public: - void set_atomic(void *ctx, rgw_obj& obj) { - RGWObjectCtx *rctx = static_cast(ctx); - rctx->set_atomic(obj); - } - void set_prefetch_data(void *ctx, const rgw_obj& obj) { - RGWObjectCtx *rctx = static_cast(ctx); - rctx->set_prefetch_data(obj); - } - void set_compressed(void *ctx, const rgw_obj& obj) { - RGWObjectCtx *rctx = static_cast(ctx); - rctx->set_compressed(obj); - } - int decode_policy(const DoutPrefixProvider *dpp, bufferlist& bl, ACLOwner *owner); - int get_bucket_stats(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, std::string *bucket_ver, std::string *master_ver, - std::map& stats, std::string *max_marker, bool* syncstopped = NULL); - int get_bucket_stats_async(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetBucketStats_CB *cb); - - int put_bucket_instance_info(RGWBucketInfo& info, bool exclusive, ceph::real_time mtime, std::map *pattrs, const DoutPrefixProvider *dpp); - /* xxx dang obj_ctx -> svc */ - int get_bucket_instance_info(const std::string& meta_key, RGWBucketInfo& info, ceph::real_time *pmtime, 
std::map *pattrs, optional_yield y, const DoutPrefixProvider *dpp); - int get_bucket_instance_info(const rgw_bucket& bucket, RGWBucketInfo& info, ceph::real_time *pmtime, std::map *pattrs, optional_yield y, const DoutPrefixProvider *dpp); - - static void make_bucket_entry_name(const std::string& tenant_name, const std::string& bucket_name, std::string& bucket_entry); - - int get_bucket_info(RGWServices *svc, - const std::string& tenant_name, const std::string& bucket_name, - RGWBucketInfo& info, - ceph::real_time *pmtime, optional_yield y, - const DoutPrefixProvider *dpp, std::map *pattrs = NULL); - - // Returns 0 on successful refresh. Returns error code if there was - // an error or the version stored on the OSD is the same as that - // presented in the BucketInfo structure. - // - int try_refresh_bucket_info(RGWBucketInfo& info, - ceph::real_time *pmtime, - const DoutPrefixProvider *dpp, - std::map *pattrs = nullptr); - - int put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, ceph::real_time mtime, obj_version *pep_objv, - std::map *pattrs, bool create_entry_point, - const DoutPrefixProvider *dpp); - - int cls_obj_prepare_op(const DoutPrefixProvider *dpp, BucketShard& bs, RGWModifyOp op, std::string& tag, rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *zones_trace = nullptr); - int cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, std::string& tag, int64_t pool, uint64_t epoch, - rgw_bucket_dir_entry& ent, RGWObjCategory category, std::list *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr); - int cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, std::string& tag, int64_t pool, uint64_t epoch, rgw_bucket_dir_entry& ent, - RGWObjCategory category, std::list *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr); - int cls_obj_complete_del(BucketShard& bs, std::string& tag, int64_t pool, uint64_t epoch, rgw_obj& obj, - ceph::real_time& removed_mtime, std::list *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr); - int cls_obj_complete_cancel(BucketShard& bs, std::string& tag, rgw_obj& obj, - std::list *remove_objs, - uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr); - int cls_obj_set_bucket_tag_timeout(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, uint64_t timeout); - - using ent_map_t = - boost::container::flat_map; - - int cls_bucket_list_ordered(const DoutPrefixProvider *dpp, - RGWBucketInfo& bucket_info, - const rgw::bucket_index_layout_generation& idx_layout, - const int shard_id, - const rgw_obj_index_key& start_after, - const std::string& prefix, - const std::string& delimiter, - const uint32_t num_entries, - const bool list_versions, - const uint16_t exp_factor, // 0 means ignore - ent_map_t& m, - bool* is_truncated, - bool* cls_filtered, - rgw_obj_index_key *last_entry, - optional_yield y, - RGWBucketListNameFilter force_check_filter = {}); - int cls_bucket_list_unordered(const DoutPrefixProvider *dpp, - RGWBucketInfo& bucket_info, - const rgw::bucket_index_layout_generation& idx_layout, - int shard_id, - const rgw_obj_index_key& start_after, - const std::string& prefix, - uint32_t num_entries, - bool list_versions, - std::vector& ent_list, - bool *is_truncated, - rgw_obj_index_key *last_entry, - optional_yield y, - RGWBucketListNameFilter force_check_filter = {}); - int cls_bucket_head(const DoutPrefixProvider *dpp, - const RGWBucketInfo& bucket_info, - const rgw::bucket_index_layout_generation& idx_layout, - int shard_id, std::vector& 
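
cls_bucket_list_ordered() above has to produce one globally ordered page out of many independently sorted index shards, which is why exp_factor exists to over-request per shard. The underlying k-way merge, as a toy with in-memory shards:

// Illustrative sketch, not part of the patch: k-way merge of sorted
// per-shard listings into one ordered stream, shard I/O simulated.
#include <iostream>
#include <queue>
#include <string>
#include <utility>
#include <vector>

int main() {
  std::vector<std::vector<std::string>> shards = {
      {"apple", "melon"}, {"banana", "pear"}, {"cherry"}};
  using Item = std::pair<std::string, size_t>;   // key, shard id
  auto cmp = [](const Item& a, const Item& b) { return a.first > b.first; };
  std::priority_queue<Item, std::vector<Item>, decltype(cmp)> heap(cmp);
  std::vector<size_t> pos(shards.size(), 0);
  for (size_t s = 0; s < shards.size(); ++s)
    if (!shards[s].empty())
      heap.push({shards[s][0], s});
  while (!heap.empty()) {
    auto [key, s] = heap.top();       // smallest key across all shards
    heap.pop();
    std::cout << key << '\n';
    if (++pos[s] < shards[s].size())
      heap.push({shards[s][pos[s]], s});
  }
  return 0;
}
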
headers, - std::map *bucket_instance_ids = NULL); - int cls_bucket_head_async(const DoutPrefixProvider *dpp, - const RGWBucketInfo& bucket_info, - const rgw::bucket_index_layout_generation& idx_layout, - int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio); - int bi_get_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_bucket_dir_entry *dirent); - int bi_get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_bucket_olh_entry *olh); - int bi_get(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry); - void bi_put(librados::ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry); - int bi_put(BucketShard& bs, rgw_cls_bi_entry& entry); - int bi_put(const DoutPrefixProvider *dpp, rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry); - int bi_list(const DoutPrefixProvider *dpp, - const RGWBucketInfo& bucket_info, - int shard_id, - const std::string& filter_obj, - const std::string& marker, - uint32_t max, - std::list *entries, - bool *is_truncated); - int bi_list(BucketShard& bs, const std::string& filter_obj, const std::string& marker, uint32_t max, std::list *entries, bool *is_truncated); - int bi_list(const DoutPrefixProvider *dpp, rgw_bucket& bucket, const std::string& obj_name, const std::string& marker, uint32_t max, - std::list *entries, bool *is_truncated); - int bi_remove(const DoutPrefixProvider *dpp, BucketShard& bs); - - int cls_obj_usage_log_add(const DoutPrefixProvider *dpp, const std::string& oid, rgw_usage_log_info& info); - int cls_obj_usage_log_read(const DoutPrefixProvider *dpp, const std::string& oid, const std::string& user, const std::string& bucket, uint64_t start_epoch, - uint64_t end_epoch, uint32_t max_entries, std::string& read_iter, - std::map& usage, bool *is_truncated); - int cls_obj_usage_log_trim(const DoutPrefixProvider *dpp, const std::string& oid, const std::string& user, const std::string& bucket, uint64_t start_epoch, - uint64_t end_epoch); - int cls_obj_usage_log_clear(const DoutPrefixProvider *dpp, std::string& oid); - - int get_target_shard_id(const rgw::bucket_index_normal_layout& layout, const std::string& obj_key, int *shard_id); - - int lock_exclusive(const rgw_pool& pool, const std::string& oid, ceph::timespan& duration, rgw_zone_id& zone_id, std::string& owner_id); - int unlock(const rgw_pool& pool, const std::string& oid, rgw_zone_id& zone_id, std::string& owner_id); - - void update_gc_chain(const DoutPrefixProvider *dpp, rgw_obj head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain); - std::tuple> send_chain_to_gc(cls_rgw_obj_chain& chain, const std::string& tag); - void delete_objs_inline(const DoutPrefixProvider *dpp, cls_rgw_obj_chain& chain, const std::string& tag); - int gc_operate(const DoutPrefixProvider *dpp, std::string& oid, librados::ObjectWriteOperation *op); - int gc_aio_operate(const std::string& oid, librados::AioCompletion *c, - librados::ObjectWriteOperation *op); - int gc_operate(const DoutPrefixProvider *dpp, std::string& oid, librados::ObjectReadOperation *op, bufferlist *pbl); - - int list_gc_objs(int *index, std::string& marker, uint32_t max, bool expired_only, std::list& result, bool *truncated, bool& processing_queue); - int process_gc(bool expired_only); - bool process_expire_objects(const DoutPrefixProvider *dpp); - int defer_gc(const DoutPrefixProvider *dpp, RGWObjectCtx* ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, 
optional_yield y); - - int process_lc(const std::unique_ptr& optional_bucket); - int list_lc_progress(std::string& marker, uint32_t max_entries, - std::vector>& progress_map, - int& index); - - int bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, - std::map *existing_stats, - std::map *calculated_stats); - int bucket_rebuild_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info); - int bucket_set_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry); - int remove_objs_from_index(const DoutPrefixProvider *dpp, - RGWBucketInfo& bucket_info, - const std::list& oid_list); - int move_rados_obj(const DoutPrefixProvider *dpp, - librados::IoCtx& src_ioctx, - const std::string& src_oid, const std::string& src_locator, - librados::IoCtx& dst_ioctx, - const std::string& dst_oid, const std::string& dst_locator); - int fix_head_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key); - int fix_tail_obj_locator(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, - rgw_obj_key& key, bool fix, bool *need_fix, optional_yield y); - - int check_quota(const DoutPrefixProvider *dpp, const rgw_user& bucket_owner, rgw_bucket& bucket, - RGWQuota& quota, uint64_t obj_size, - optional_yield y, bool check_size_only = false); - - int check_bucket_shards(const RGWBucketInfo& bucket_info, const rgw_bucket& bucket, - uint64_t num_objs, const DoutPrefixProvider *dpp); - - int add_bucket_to_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, uint32_t new_num_shards); - - uint64_t instance_id(); - - librados::Rados* get_rados_handle(); - - int delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, std::list& handles); - int delete_obj_aio(const DoutPrefixProvider *dpp, const rgw_obj& obj, RGWBucketInfo& info, RGWObjState *astate, - std::list& handles, bool keep_index_consistent, - optional_yield y); - - private: - /** - * Check the actual on-disk state of the object specified - * by list_state, and fill in the time and size of object. - * Then append any changes to suggested_updates for - * the rgw class' dir_suggest_changes function. - * - * Note that this can maul list_state; don't use it afterwards. Also - * it expects object to already be filled in from list_state; it only - * sets the size and mtime. - * - * Returns 0 on success, -ENOENT if the object doesn't exist on disk, - * and -errno on other failures. (-ENOENT is not a failure, and it - * will encode that info as a suggested update.) - */ - int check_disk_state(const DoutPrefixProvider *dpp, - librados::IoCtx io_ctx, - RGWBucketInfo& bucket_info, - rgw_bucket_dir_entry& list_state, - rgw_bucket_dir_entry& object, - bufferlist& suggested_updates, - optional_yield y); - - /** - * Init pool iteration - * pool: pool to use for the ctx initialization - * ctx: context object to use for the iteration - * Returns: 0 on success, -ERR# otherwise. - */ - int pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, RGWPoolIterCtx& ctx); - - /** - * Init pool iteration - * pool: pool to use - * cursor: position to start iteration - * ctx: context object to use for the iteration - * Returns: 0 on success, -ERR# otherwise. 
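
check_quota() above boils down to comparing current usage plus the incoming object against the configured ceilings, honoring check_size_only. A sketch with illustrative field names and errno; the real RGWQuota carries separate user and bucket limits, and RGW surfaces its own quota error code:

// Illustrative sketch, not part of the patch: the decision inside a
// quota check. Field names and the errno are stand-ins.
#include <cerrno>
#include <cstdint>

struct QuotaInfo {
  int64_t max_size = -1;      // -1 means unlimited
  int64_t max_objects = -1;
};

int check_quota(const QuotaInfo& q, uint64_t cur_size, uint64_t cur_objs,
                uint64_t obj_size, bool check_size_only = false) {
  if (q.max_size >= 0 && cur_size + obj_size > uint64_t(q.max_size))
    return -EDQUOT;           // size budget exceeded
  if (!check_size_only && q.max_objects >= 0 &&
      cur_objs + 1 > uint64_t(q.max_objects))
    return -EDQUOT;           // object-count budget exceeded
  return 0;
}

int main() {
  QuotaInfo q{1024, 10};
  return check_quota(q, 1000, 3, 100) == -EDQUOT ? 0 : 1;  // over by 76 bytes
}
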
- */ - int pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& cursor, RGWPoolIterCtx& ctx); - - /** - * Get pool iteration position - * ctx: context object to use for the iteration - * Returns: std::string representation of position - */ - std::string pool_iterate_get_cursor(RGWPoolIterCtx& ctx); - - /** - * Iterate over pool return object names, use optional filter - * ctx: iteration context, initialized with pool_iterate_begin() - * num: max number of objects to return - * objs: a vector that the results will append into - * is_truncated: if not NULL, will hold true iff iteration is complete - * filter: if not NULL, will be used to filter returned objects - * Returns: 0 on success, -ERR# otherwise. - */ - int pool_iterate(const DoutPrefixProvider *dpp, RGWPoolIterCtx& ctx, uint32_t num, - std::vector& objs, - bool *is_truncated, RGWAccessListFilter *filter); - - uint64_t next_bucket_id(); - - /** - * This is broken out to facilitate unit testing. - */ - static uint32_t calc_ordered_bucket_list_per_shard(uint32_t num_entries, - uint32_t num_shards); -}; - - -struct get_obj_data { - RGWRados* rgwrados; - RGWGetDataCB* client_cb = nullptr; - rgw::Aio* aio; - uint64_t offset; // next offset to write to client - rgw::AioResultList completed; // completed read results, sorted by offset - optional_yield yield; - - get_obj_data(RGWRados* rgwrados, RGWGetDataCB* cb, rgw::Aio* aio, - uint64_t offset, optional_yield yield) - : rgwrados(rgwrados), client_cb(cb), aio(aio), offset(offset), yield(yield) {} - ~get_obj_data() { - if (rgwrados->get_use_datacache()) { - const std::lock_guard l(d3n_get_data.d3n_lock); - } - } - - D3nGetObjData d3n_get_data; - std::atomic_bool d3n_bypass_cache_write{false}; - - int flush(rgw::AioResultList&& results); - - void cancel() { - // wait for all completions to drain and ignore the results - aio->drain(); - } - - int drain() { - auto c = aio->wait(); - while (!c.empty()) { - int r = flush(std::move(c)); - if (r < 0) { - cancel(); - return r; - } - c = aio->wait(); - } - return flush(std::move(c)); - } -}; - - -#endif diff --git a/src/rgw/rgw_realm_reloader.h b/src/rgw/rgw_realm_reloader.h index 0a28cff18603..25082a2e490d 100644 --- a/src/rgw/rgw_realm_reloader.h +++ b/src/rgw/rgw_realm_reloader.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_REALM_RELOADER_H -#define RGW_REALM_RELOADER_H +#pragma once #include "rgw_realm_watcher.h" #include "common/Cond.h" @@ -63,5 +62,3 @@ class RGWRealmReloader : public RGWRealmWatcher::Watcher { ceph::condition_variable cond; //< to signal reload() after an invalid realm config C_Reload* reload_scheduled; //< reload() context if scheduled }; - -#endif // RGW_REALM_RELOADER_H diff --git a/src/rgw/rgw_realm_watcher.h b/src/rgw/rgw_realm_watcher.h index b2e3ac6b9d64..2a0c0d076990 100644 --- a/src/rgw/rgw_realm_watcher.h +++ b/src/rgw/rgw_realm_watcher.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_REALM_WATCHER_H -#define RGW_REALM_WATCHER_H +#pragma once #include "include/rados/librados.hpp" #include "include/ceph_assert.h" @@ -65,5 +64,3 @@ class RGWRealmWatcher : public librados::WatchCtx2 { std::map watchers; }; - -#endif // RGW_REALM_WATCHER_H diff --git a/src/rgw/rgw_request.h b/src/rgw/rgw_request.h index ed54dca595ec..cd05f51c9424 100644 --- a/src/rgw/rgw_request.h +++ b/src/rgw/rgw_request.h @@ -1,8 +1,7 @@ // 
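
drain() above keeps waiting on outstanding reads and flushing completed results until the queue is empty, canceling everything on the first error. The same control flow with std::future standing in for rgw::Aio:

// Illustrative sketch, not part of the patch: drain()-style loop --
// wait for all outstanding async results, fail on the first error.
#include <future>
#include <utility>
#include <vector>

int flush(std::vector<int>&& results) {
  for (int r : results)
    if (r < 0) return r;               // first error wins
  return 0;
}

int drain(std::vector<std::future<int>>& inflight) {
  std::vector<int> done;
  for (auto& f : inflight)             // like aio->wait() until empty
    done.push_back(f.get());
  inflight.clear();
  return flush(std::move(done));       // real code would cancel() on error
}

int main() {
  std::vector<std::future<int>> ops;
  for (int i = 0; i < 4; ++i)
    ops.push_back(std::async(std::launch::async,
                             [i] { return i >= 0 ? 0 : -1; }));
  return drain(ops);
}
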
-*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_REQUEST_H -#define RGW_REQUEST_H +#pragma once #include "rgw_common.h" #include "rgw_acl.h" @@ -39,5 +38,3 @@ RGWLoadGenRequest(uint64_t req_id, const std::string& _m, const std::string& _r, : RGWRequest(req_id), method(_m), resource(_r), content_length(_cl), fail_flag(ff) {} }; - -#endif /* RGW_REQUEST_H */ diff --git a/src/rgw/rgw_reshard.cc b/src/rgw/rgw_reshard.cc deleted file mode 100644 index b2dec7af1c86..000000000000 --- a/src/rgw/rgw_reshard.cc +++ /dev/null @@ -1,1407 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab ft=cpp - -#include -#include - -#include "rgw_zone.h" -#include "driver/rados/rgw_bucket.h" -#include "rgw_reshard.h" -#include "rgw_sal.h" -#include "rgw_sal_rados.h" -#include "cls/rgw/cls_rgw_client.h" -#include "cls/lock/cls_lock_client.h" -#include "common/errno.h" -#include "common/ceph_json.h" - -#include "common/dout.h" - -#include "services/svc_zone.h" -#include "services/svc_sys_obj.h" -#include "services/svc_tier_rados.h" -#include "services/svc_bilog_rados.h" - -#define dout_context g_ceph_context -#define dout_subsys ceph_subsys_rgw - -using namespace std; - -const string reshard_oid_prefix = "reshard."; -const string reshard_lock_name = "reshard_process"; -const string bucket_instance_lock_name = "bucket_instance_lock"; - -/* All primes up to 2000 used to attempt to make dynamic sharding use - * a prime numbers of shards. Note: this list also includes 1 for when - * 1 shard is the most appropriate, even though 1 is not prime. - */ -const std::initializer_list RGWBucketReshard::reshard_primes = { - 1, 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, - 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, - 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, - 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, - 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379, - 383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461, - 463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541, 547, 557, 563, - 569, 571, 577, 587, 593, 599, 601, 607, 613, 617, 619, 631, 641, 643, - 647, 653, 659, 661, 673, 677, 683, 691, 701, 709, 719, 727, 733, 739, - 743, 751, 757, 761, 769, 773, 787, 797, 809, 811, 821, 823, 827, 829, - 839, 853, 857, 859, 863, 877, 881, 883, 887, 907, 911, 919, 929, 937, - 941, 947, 953, 967, 971, 977, 983, 991, 997, 1009, 1013, 1019, 1021, - 1031, 1033, 1039, 1049, 1051, 1061, 1063, 1069, 1087, 1091, 1093, - 1097, 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, 1171, 1181, - 1187, 1193, 1201, 1213, 1217, 1223, 1229, 1231, 1237, 1249, 1259, - 1277, 1279, 1283, 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321, - 1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, 1427, 1429, 1433, - 1439, 1447, 1451, 1453, 1459, 1471, 1481, 1483, 1487, 1489, 1493, - 1499, 1511, 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571, 1579, - 1583, 1597, 1601, 1607, 1609, 1613, 1619, 1621, 1627, 1637, 1657, - 1663, 1667, 1669, 1693, 1697, 1699, 1709, 1721, 1723, 1733, 1741, - 1747, 1753, 1759, 1777, 1783, 1787, 1789, 1801, 1811, 1823, 1831, - 1847, 1861, 1867, 1871, 1873, 1877, 1879, 1889, 1901, 1907, 1913, - 1931, 1933, 1949, 1951, 1973, 1979, 1987, 1993, 1997, 1999 -}; - -class BucketReshardShard { - rgw::sal::RadosStore* store; - const RGWBucketInfo& bucket_info; - int shard_id; - RGWRados::BucketShard bs; - 
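
A plausible use of a prime table like reshard_primes is to snap a computed shard count upward to the nearest listed value, since a prime shard count spreads keys more evenly under modular hashing (1 is included for the single-shard case). A guess at that lookup, with the table truncated:

// Illustrative sketch, not part of the patch: round a desired shard
// count up to the nearest entry in a (truncated) prime table.
#include <algorithm>
#include <cstdint>
#include <initializer_list>
#include <iostream>

static const std::initializer_list<uint32_t> primes = {
    1, 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53};

uint32_t nearest_prime_at_least(uint32_t want) {
  auto it = std::lower_bound(primes.begin(), primes.end(), want);
  return it == primes.end() ? want : *it;   // past the table: keep as-is
}

int main() {
  std::cout << nearest_prime_at_least(24) << '\n';  // prints 29
  return 0;
}
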
vector entries; - map stats; - deque& aio_completions; - uint64_t max_aio_completions; - uint64_t reshard_shard_batch_size; - - int wait_next_completion() { - librados::AioCompletion *c = aio_completions.front(); - aio_completions.pop_front(); - - c->wait_for_complete(); - - int ret = c->get_return_value(); - c->release(); - - if (ret < 0) { - derr << "ERROR: reshard rados operation failed: " << cpp_strerror(-ret) << dendl; - return ret; - } - - return 0; - } - - int get_completion(librados::AioCompletion **c) { - if (aio_completions.size() >= max_aio_completions) { - int ret = wait_next_completion(); - if (ret < 0) { - return ret; - } - } - - *c = librados::Rados::aio_create_completion(nullptr, nullptr); - aio_completions.push_back(*c); - - return 0; - } - -public: - BucketReshardShard(const DoutPrefixProvider *dpp, - rgw::sal::RadosStore *_store, const RGWBucketInfo& _bucket_info, - const rgw::bucket_index_layout_generation& index, - int shard_id, deque& _completions) : - store(_store), bucket_info(_bucket_info), shard_id(shard_id), - bs(store->getRados()), aio_completions(_completions) - { - bs.init(dpp, bucket_info, index, shard_id); - - max_aio_completions = - store->ctx()->_conf.get_val("rgw_reshard_max_aio"); - reshard_shard_batch_size = - store->ctx()->_conf.get_val("rgw_reshard_batch_size"); - } - - int get_shard_id() const { - return shard_id; - } - - int add_entry(rgw_cls_bi_entry& entry, bool account, RGWObjCategory category, - const rgw_bucket_category_stats& entry_stats) { - entries.push_back(entry); - if (account) { - rgw_bucket_category_stats& target = stats[category]; - target.num_entries += entry_stats.num_entries; - target.total_size += entry_stats.total_size; - target.total_size_rounded += entry_stats.total_size_rounded; - target.actual_size += entry_stats.actual_size; - } - if (entries.size() >= reshard_shard_batch_size) { - int ret = flush(); - if (ret < 0) { - return ret; - } - } - - return 0; - } - - int flush() { - if (entries.size() == 0) { - return 0; - } - - librados::ObjectWriteOperation op; - for (auto& entry : entries) { - store->getRados()->bi_put(op, bs, entry); - } - cls_rgw_bucket_update_stats(op, false, stats); - - librados::AioCompletion *c; - int ret = get_completion(&c); - if (ret < 0) { - return ret; - } - ret = bs.bucket_obj.aio_operate(c, &op); - if (ret < 0) { - derr << "ERROR: failed to store entries in target bucket shard (bs=" << bs.bucket << "/" << bs.shard_id << ") error=" << cpp_strerror(-ret) << dendl; - return ret; - } - entries.clear(); - stats.clear(); - return 0; - } - - int wait_all_aio() { - int ret = 0; - while (!aio_completions.empty()) { - int r = wait_next_completion(); - if (r < 0) { - ret = r; - } - } - return ret; - } -}; // class BucketReshardShard - - -class BucketReshardManager { - rgw::sal::RadosStore *store; - deque completions; - vector target_shards; - -public: - BucketReshardManager(const DoutPrefixProvider *dpp, - rgw::sal::RadosStore *_store, - const RGWBucketInfo& bucket_info, - const rgw::bucket_index_layout_generation& target) - : store(_store) - { - const int num_shards = target.layout.normal.num_shards; - target_shards.reserve(num_shards); - for (int i = 0; i < num_shards; ++i) { - target_shards.emplace_back(dpp, store, bucket_info, target, i, completions); - } - } - - ~BucketReshardManager() { - for (auto& shard : target_shards) { - int ret = shard.wait_all_aio(); - if (ret < 0) { - ldout(store->ctx(), 20) << __func__ << - ": shard->wait_all_aio() returned ret=" << ret << dendl; - } - } - } - - int add_entry(int 
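
get_completion() above enforces backpressure: once rgw_reshard_max_aio completions are in flight, a new op must first reap the oldest. The same idiom with std::future standing in for librados::AioCompletion:

// Illustrative sketch, not part of the patch: cap the number of
// in-flight async ops; when full, reap the oldest before issuing a
// new one.
#include <deque>
#include <future>

constexpr size_t max_aio = 4;
std::deque<std::future<int>> inflight;

int wait_next_completion() {
  auto f = std::move(inflight.front());
  inflight.pop_front();
  return f.get();             // like AioCompletion::wait_for_complete()
}

int submit(int value) {
  if (inflight.size() >= max_aio) {
    int r = wait_next_completion();
    if (r < 0)
      return r;
  }
  inflight.push_back(std::async(std::launch::async,
                                [value] { return value >= 0 ? 0 : value; }));
  return 0;
}

int main() {
  for (int i = 0; i < 10; ++i)
    if (submit(i) < 0)
      return 1;
  while (!inflight.empty())   // like wait_all_aio() at teardown
    wait_next_completion();
  return 0;
}
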
shard_index, - rgw_cls_bi_entry& entry, bool account, RGWObjCategory category, - const rgw_bucket_category_stats& entry_stats) { - int ret = target_shards[shard_index].add_entry(entry, account, category, - entry_stats); - if (ret < 0) { - derr << "ERROR: target_shards.add_entry(" << entry.idx << - ") returned error: " << cpp_strerror(-ret) << dendl; - return ret; - } - - return 0; - } - - int finish() { - int ret = 0; - for (auto& shard : target_shards) { - int r = shard.flush(); - if (r < 0) { - derr << "ERROR: target_shards[" << shard.get_shard_id() << "].flush() returned error: " << cpp_strerror(-r) << dendl; - ret = r; - } - } - for (auto& shard : target_shards) { - int r = shard.wait_all_aio(); - if (r < 0) { - derr << "ERROR: target_shards[" << shard.get_shard_id() << "].wait_all_aio() returned error: " << cpp_strerror(-r) << dendl; - ret = r; - } - } - target_shards.clear(); - return ret; - } -}; // class BucketReshardManager - -RGWBucketReshard::RGWBucketReshard(rgw::sal::RadosStore* _store, - const RGWBucketInfo& _bucket_info, - const std::map& _bucket_attrs, - RGWBucketReshardLock* _outer_reshard_lock) : - store(_store), bucket_info(_bucket_info), bucket_attrs(_bucket_attrs), - reshard_lock(store, bucket_info, true), - outer_reshard_lock(_outer_reshard_lock) -{ } - -// sets reshard status of bucket index shards for the current index layout -static int set_resharding_status(const DoutPrefixProvider *dpp, - rgw::sal::RadosStore* store, - const RGWBucketInfo& bucket_info, - cls_rgw_reshard_status status) -{ - cls_rgw_bucket_instance_entry instance_entry; - instance_entry.set_status(status); - - int ret = store->getRados()->bucket_set_reshard(dpp, bucket_info, instance_entry); - if (ret < 0) { - ldpp_dout(dpp, 0) << "RGWReshard::" << __func__ << " ERROR: error setting bucket resharding flag on bucket index: " - << cpp_strerror(-ret) << dendl; - return ret; - } - return 0; -} - -static int remove_old_reshard_instance(rgw::sal::RadosStore* store, - const rgw_bucket& bucket, - const DoutPrefixProvider* dpp) -{ - RGWBucketInfo info; - int r = store->getRados()->get_bucket_instance_info(bucket, info, nullptr, - nullptr, null_yield, dpp); - if (r < 0) { - return r; - } - - // delete its shard objects (ignore errors) - store->svc()->bi->clean_index(dpp, info, info.layout.current_index); - // delete the bucket instance metadata - return store->ctl()->bucket->remove_bucket_instance_info(bucket, info, null_yield, dpp); -} - -// initialize the new bucket index shard objects -static int init_target_index(rgw::sal::RadosStore* store, - RGWBucketInfo& bucket_info, - const rgw::bucket_index_layout_generation& index, - const DoutPrefixProvider* dpp) -{ - int ret = store->svc()->bi->init_index(dpp, bucket_info, index); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to initialize " - "target index shard objects: " << cpp_strerror(ret) << dendl; - return ret; - } - - if (!bucket_info.datasync_flag_enabled()) { - // if bucket sync is disabled, disable it on each of the new shards too - auto log = rgw::log_layout_from_index(0, index); - ret = store->svc()->bilog_rados->log_stop(dpp, bucket_info, log, -1); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to disable " - "bucket sync on the target index shard objects: " - << cpp_strerror(ret) << dendl; - store->svc()->bi->clean_index(dpp, bucket_info, index); - return ret; - } - } - - return ret; -} - -// initialize a target index layout, create its bucket index shard objects, and -// write the target 
layout to the bucket instance metadata -static int init_target_layout(rgw::sal::RadosStore* store, - RGWBucketInfo& bucket_info, - std::map& bucket_attrs, - ReshardFaultInjector& fault, - uint32_t new_num_shards, - const DoutPrefixProvider* dpp) -{ - auto prev = bucket_info.layout; // make a copy for cleanup - const auto current = prev.current_index; - - // initialize a new normal target index layout generation - rgw::bucket_index_layout_generation target; - target.layout.type = rgw::BucketIndexType::Normal; - target.layout.normal.num_shards = new_num_shards; - target.gen = current.gen + 1; - - if (bucket_info.reshard_status == cls_rgw_reshard_status::IN_PROGRESS) { - // backward-compatible cleanup of old reshards, where the target was in a - // different bucket instance - if (!bucket_info.new_bucket_instance_id.empty()) { - rgw_bucket new_bucket = bucket_info.bucket; - new_bucket.bucket_id = bucket_info.new_bucket_instance_id; - ldout(store->ctx(), 10) << __func__ << " removing target bucket instance " - "from a previous reshard attempt" << dendl; - // ignore errors - remove_old_reshard_instance(store, new_bucket, dpp); - } - bucket_info.reshard_status = cls_rgw_reshard_status::NOT_RESHARDING; - } - - if (bucket_info.layout.target_index) { - // a previous reshard failed or stalled, and its reshard lock dropped - ldpp_dout(dpp, 10) << __func__ << " removing existing target index " - "objects from a previous reshard attempt" << dendl; - // delete its existing shard objects (ignore errors) - store->svc()->bi->clean_index(dpp, bucket_info, *bucket_info.layout.target_index); - // don't reuse this same generation in the new target layout, in case - // something is still trying to operate on its shard objects - target.gen = bucket_info.layout.target_index->gen + 1; - } - - // create the index shard objects - int ret = init_target_index(store, bucket_info, target, dpp); - if (ret < 0) { - return ret; - } - - // retry in case of racing writes to the bucket instance metadata - static constexpr auto max_retries = 10; - int tries = 0; - do { - // update resharding state - bucket_info.layout.target_index = target; - bucket_info.layout.resharding = rgw::BucketReshardState::InProgress; - - if (ret = fault.check("set_target_layout"); - ret == 0) { // no fault injected, write the bucket instance metadata - ret = store->getRados()->put_bucket_instance_info(bucket_info, false, - real_time(), &bucket_attrs, dpp); - } else if (ret == -ECANCELED) { - fault.clear(); // clear the fault so a retry can succeed - } - - if (ret == -ECANCELED) { - // racing write detected, read the latest bucket info and try again - int ret2 = store->getRados()->get_bucket_instance_info( - bucket_info.bucket, bucket_info, - nullptr, &bucket_attrs, null_yield, dpp); - if (ret2 < 0) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read " - "bucket info: " << cpp_strerror(ret2) << dendl; - ret = ret2; - break; - } - - // check that we're still in the reshard state we started in - if (bucket_info.layout.resharding != rgw::BucketReshardState::None || - bucket_info.layout.current_index != current) { - ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with " - "another reshard" << dendl; - break; - } - - prev = bucket_info.layout; // update the copy - } - ++tries; - } while (ret == -ECANCELED && tries < max_retries); - - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to write " - "target index layout to bucket info: " << cpp_strerror(ret) << dendl; - - bucket_info.layout = std::move(prev); // 
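
The retry loop in init_target_layout() above (repeated in the revert and commit paths below) is plain optimistic concurrency: a guarded write fails with -ECANCELED when racing with another writer, so the code re-reads the latest bucket info, re-checks that it is still in the expected reshard state, and tries again up to max_retries. Its skeleton, isolated with an atomic as the stand-in for the stored metadata:

// Illustrative sketch, not part of the patch: retry a guarded write
// on -ECANCELED, reloading the latest copy each time. The real loop
// also re-checks the reshard state after reloading and bails out if
// another reshard raced in.
#include <atomic>
#include <cerrno>
#include <iostream>

std::atomic<int> stored_version{0};

int guarded_write(int expected) {
  int e = expected;           // succeeds only if nobody raced us
  return stored_version.compare_exchange_strong(e, e + 1) ? 0 : -ECANCELED;
}

int write_with_retries(int version) {
  static constexpr int max_retries = 10;
  int ret, tries = 0;
  do {
    ret = guarded_write(version);
    if (ret == -ECANCELED)
      version = stored_version.load();  // "re-read the bucket info"
    ++tries;
  } while (ret == -ECANCELED && tries < max_retries);
  return ret;
}

int main() {
  std::cout << write_with_retries(0) << '\n';  // 0: first try wins here
  return 0;
}
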
restore in-memory layout - - // delete the target shard objects (ignore errors) - store->svc()->bi->clean_index(dpp, bucket_info, target); - return ret; - } - return 0; -} // init_target_layout - -// delete the bucket index shards associated with the target layout and remove -// it from the bucket instance metadata -static int revert_target_layout(rgw::sal::RadosStore* store, - RGWBucketInfo& bucket_info, - std::map& bucket_attrs, - ReshardFaultInjector& fault, - const DoutPrefixProvider* dpp) -{ - auto prev = bucket_info.layout; // make a copy for cleanup - - // remove target index shard objects - int ret = store->svc()->bi->clean_index(dpp, bucket_info, *prev.target_index); - if (ret < 0) { - ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to remove " - "target index with: " << cpp_strerror(ret) << dendl; - ret = 0; // non-fatal error - } - - // retry in case of racing writes to the bucket instance metadata - static constexpr auto max_retries = 10; - int tries = 0; - do { - // clear target_index and resharding state - bucket_info.layout.target_index = std::nullopt; - bucket_info.layout.resharding = rgw::BucketReshardState::None; - - if (ret = fault.check("revert_target_layout"); - ret == 0) { // no fault injected, revert the bucket instance metadata - ret = store->getRados()->put_bucket_instance_info(bucket_info, false, - real_time(), - &bucket_attrs, dpp); - } else if (ret == -ECANCELED) { - fault.clear(); // clear the fault so a retry can succeed - } - - if (ret == -ECANCELED) { - // racing write detected, read the latest bucket info and try again - int ret2 = store->getRados()->get_bucket_instance_info( - bucket_info.bucket, bucket_info, - nullptr, &bucket_attrs, null_yield, dpp); - if (ret2 < 0) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read " - "bucket info: " << cpp_strerror(ret2) << dendl; - ret = ret2; - break; - } - - // check that we're still in the reshard state we started in - if (bucket_info.layout.resharding == rgw::BucketReshardState::None) { - ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with " - "reshard cancel" << dendl; - return -ECANCELED; - } - if (bucket_info.layout.current_index != prev.current_index || - bucket_info.layout.target_index != prev.target_index) { - ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with " - "another reshard" << dendl; - return -ECANCELED; - } - - prev = bucket_info.layout; // update the copy - } - ++tries; - } while (ret == -ECANCELED && tries < max_retries); - - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to clear " - "target index layout in bucket info: " << cpp_strerror(ret) << dendl; - - bucket_info.layout = std::move(prev); // restore in-memory layout - return ret; - } - return 0; -} // remove_target_layout - -static int init_reshard(rgw::sal::RadosStore* store, - RGWBucketInfo& bucket_info, - std::map& bucket_attrs, - ReshardFaultInjector& fault, - uint32_t new_num_shards, - const DoutPrefixProvider *dpp) -{ - int ret = init_target_layout(store, bucket_info, bucket_attrs, fault, new_num_shards, dpp); - if (ret < 0) { - return ret; - } - - if (ret = fault.check("block_writes"); - ret == 0) { // no fault injected, block writes to the current index shards - ret = set_resharding_status(dpp, store, bucket_info, - cls_rgw_reshard_status::IN_PROGRESS); - } - - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to pause " - "writes to the current index: " << cpp_strerror(ret) << dendl; - // clean up the target layout (ignore errors) - 
revert_target_layout(store, bucket_info, bucket_attrs, fault, dpp); - return ret; - } - return 0; -} // init_reshard - -static int cancel_reshard(rgw::sal::RadosStore* store, - RGWBucketInfo& bucket_info, - std::map& bucket_attrs, - ReshardFaultInjector& fault, - const DoutPrefixProvider *dpp) -{ - // unblock writes to the current index shard objects - int ret = set_resharding_status(dpp, store, bucket_info, - cls_rgw_reshard_status::NOT_RESHARDING); - if (ret < 0) { - ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to unblock " - "writes to current index objects: " << cpp_strerror(ret) << dendl; - ret = 0; // non-fatal error - } - - if (bucket_info.layout.target_index) { - return revert_target_layout(store, bucket_info, bucket_attrs, fault, dpp); - } - // there is nothing to revert - return 0; -} // cancel_reshard - -static int commit_target_layout(rgw::sal::RadosStore* store, - RGWBucketInfo& bucket_info, - std::map& bucket_attrs, - ReshardFaultInjector& fault, - const DoutPrefixProvider *dpp) -{ - auto& layout = bucket_info.layout; - const auto next_log_gen = layout.logs.empty() ? 1 : - layout.logs.back().gen + 1; - - if (!store->svc()->zone->need_to_log_data()) { - // if we're not syncing data, we can drop any existing logs - layout.logs.clear(); - } - - // use the new index layout as current - ceph_assert(layout.target_index); - layout.current_index = std::move(*layout.target_index); - layout.target_index = std::nullopt; - layout.resharding = rgw::BucketReshardState::None; - // add the in-index log layout - layout.logs.push_back(log_layout_from_index(next_log_gen, layout.current_index)); - - int ret = fault.check("commit_target_layout"); - if (ret == 0) { // no fault injected, write the bucket instance metadata - ret = store->getRados()->put_bucket_instance_info( - bucket_info, false, real_time(), &bucket_attrs, dpp); - } else if (ret == -ECANCELED) { - fault.clear(); // clear the fault so a retry can succeed - } - return ret; -} // commit_target_layout - -static int commit_reshard(rgw::sal::RadosStore* store, - RGWBucketInfo& bucket_info, - std::map& bucket_attrs, - ReshardFaultInjector& fault, - const DoutPrefixProvider *dpp) -{ - auto prev = bucket_info.layout; // make a copy for cleanup - - // retry in case of racing writes to the bucket instance metadata - static constexpr auto max_retries = 10; - int tries = 0; - int ret = 0; - do { - ret = commit_target_layout(store, bucket_info, bucket_attrs, fault, dpp); - if (ret == -ECANCELED) { - // racing write detected, read the latest bucket info and try again - int ret2 = store->getRados()->get_bucket_instance_info( - bucket_info.bucket, bucket_info, - nullptr, &bucket_attrs, null_yield, dpp); - if (ret2 < 0) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read " - "bucket info: " << cpp_strerror(ret2) << dendl; - ret = ret2; - break; - } - - // check that we're still in the reshard state we started in - if (bucket_info.layout.resharding != rgw::BucketReshardState::InProgress) { - ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with " - "reshard cancel" << dendl; - return -ECANCELED; // whatever canceled us already did the cleanup - } - if (bucket_info.layout.current_index != prev.current_index || - bucket_info.layout.target_index != prev.target_index) { - ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with " - "another reshard" << dendl; - return -ECANCELED; // whatever canceled us already did the cleanup - } - - prev = bucket_info.layout; // update the copy - } - ++tries; - } while (ret == 
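
The fault.check(...)/fault.clear() pairs threaded through these paths let tests force an error the first time a named point is reached. A minimal injector honoring that protocol (the real ReshardFaultInjector differs in detail):

// Illustrative sketch, not part of the patch: a named-point fault
// injector matching the check()/clear() usage above.
#include <cassert>
#include <cerrno>
#include <optional>
#include <string>

class FaultInjector {
  std::string point;
  std::optional<int> err;
public:
  void inject(std::string p, int e) { point = std::move(p); err = e; }
  int check(const std::string& p) const {
    return (err && p == point) ? *err : 0;   // 0 == no fault injected
  }
  void clear() { err.reset(); }              // disarm so a retry passes
};

int main() {
  FaultInjector f;
  f.inject("commit_target_layout", -ECANCELED);
  assert(f.check("commit_target_layout") == -ECANCELED);
  assert(f.check("block_writes") == 0);      // other points unaffected
  f.clear();
  assert(f.check("commit_target_layout") == 0);
  return 0;
}
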
-ECANCELED && tries < max_retries); - - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to commit " - "target index layout: " << cpp_strerror(ret) << dendl; - - bucket_info.layout = std::move(prev); // restore in-memory layout - - // unblock writes to the current index shard objects - int ret2 = set_resharding_status(dpp, store, bucket_info, - cls_rgw_reshard_status::NOT_RESHARDING); - if (ret2 < 0) { - ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to unblock " - "writes to current index objects: " << cpp_strerror(ret2) << dendl; - // non-fatal error - } - return ret; - } - - if (store->svc()->zone->need_to_log_data() && !prev.logs.empty() && - prev.current_index.layout.type == rgw::BucketIndexType::Normal) { - // write a datalog entry for each shard of the previous index. triggering - // sync on the old shards will force them to detect the end-of-log for that - // generation, and eventually transition to the next - // TODO: use a log layout to support types other than BucketLogType::InIndex - for (uint32_t shard_id = 0; shard_id < prev.current_index.layout.normal.num_shards; ++shard_id) { - ret = store->svc()->datalog_rados->add_entry(dpp, bucket_info, prev.logs.back(), shard_id); - if (ret < 0) { - ldpp_dout(dpp, 1) << "WARNING: failed writing data log (bucket_info.bucket=" - << bucket_info.bucket << ", shard_id=" << shard_id << "of generation=" - << prev.logs.back().gen << ")" << dendl; - } // datalog error is not fatal - } - } - - // check whether the old index objects are still needed for bilogs - const auto& logs = bucket_info.layout.logs; - auto log = std::find_if(logs.begin(), logs.end(), - [&prev] (const rgw::bucket_log_layout_generation& log) { - return log.layout.type == rgw::BucketLogType::InIndex - && log.layout.in_index.gen == prev.current_index.gen; - }); - if (log == logs.end()) { - // delete the index objects (ignore errors) - store->svc()->bi->clean_index(dpp, bucket_info, prev.current_index); - } - return 0; -} // commit_reshard - -int RGWBucketReshard::clear_resharding(rgw::sal::RadosStore* store, - RGWBucketInfo& bucket_info, - std::map& bucket_attrs, - const DoutPrefixProvider* dpp) -{ - ReshardFaultInjector no_fault; - return cancel_reshard(store, bucket_info, bucket_attrs, no_fault, dpp); -} - -int RGWBucketReshard::cancel(const DoutPrefixProvider* dpp) -{ - int ret = reshard_lock.lock(dpp); - if (ret < 0) { - return ret; - } - - if (bucket_info.layout.resharding != rgw::BucketReshardState::InProgress) { - ldpp_dout(dpp, -1) << "ERROR: bucket is not resharding" << dendl; - ret = -EINVAL; - } else { - ret = clear_resharding(store, bucket_info, bucket_attrs, dpp); - } - - reshard_lock.unlock(); - return ret; -} - -RGWBucketReshardLock::RGWBucketReshardLock(rgw::sal::RadosStore* _store, - const std::string& reshard_lock_oid, - bool _ephemeral) : - store(_store), - lock_oid(reshard_lock_oid), - ephemeral(_ephemeral), - internal_lock(reshard_lock_name) -{ - const int lock_dur_secs = store->ctx()->_conf.get_val( - "rgw_reshard_bucket_lock_duration"); - duration = std::chrono::seconds(lock_dur_secs); - -#define COOKIE_LEN 16 - char cookie_buf[COOKIE_LEN + 1]; - gen_rand_alphanumeric(store->ctx(), cookie_buf, sizeof(cookie_buf) - 1); - cookie_buf[COOKIE_LEN] = '\0'; - - internal_lock.set_cookie(cookie_buf); - internal_lock.set_duration(duration); -} - -int RGWBucketReshardLock::lock(const DoutPrefixProvider *dpp) { - internal_lock.set_must_renew(false); - - int ret; - if (ephemeral) { - ret = 
internal_lock.lock_exclusive_ephemeral(&store->getRados()->reshard_pool_ctx, - lock_oid); - } else { - ret = internal_lock.lock_exclusive(&store->getRados()->reshard_pool_ctx, lock_oid); - } - - if (ret == -EBUSY) { - ldout(store->ctx(), 0) << "INFO: RGWReshardLock::" << __func__ << - " found lock on " << lock_oid << - " to be held by another RGW process; skipping for now" << dendl; - return ret; - } else if (ret < 0) { - ldpp_dout(dpp, -1) << "ERROR: RGWReshardLock::" << __func__ << - " failed to acquire lock on " << lock_oid << ": " << - cpp_strerror(-ret) << dendl; - return ret; - } - - reset_time(Clock::now()); - - return 0; -} - -void RGWBucketReshardLock::unlock() { - int ret = internal_lock.unlock(&store->getRados()->reshard_pool_ctx, lock_oid); - if (ret < 0) { - ldout(store->ctx(), 0) << "WARNING: RGWBucketReshardLock::" << __func__ << - " failed to drop lock on " << lock_oid << " ret=" << ret << dendl; - } -} - -int RGWBucketReshardLock::renew(const Clock::time_point& now) { - internal_lock.set_must_renew(true); - int ret; - if (ephemeral) { - ret = internal_lock.lock_exclusive_ephemeral(&store->getRados()->reshard_pool_ctx, - lock_oid); - } else { - ret = internal_lock.lock_exclusive(&store->getRados()->reshard_pool_ctx, lock_oid); - } - if (ret < 0) { /* expired or already locked by another processor */ - std::stringstream error_s; - if (-ENOENT == ret) { - error_s << "ENOENT (lock expired or never initially locked)"; - } else { - error_s << ret << " (" << cpp_strerror(-ret) << ")"; - } - ldout(store->ctx(), 5) << __func__ << "(): failed to renew lock on " << - lock_oid << " with error " << error_s.str() << dendl; - return ret; - } - internal_lock.set_must_renew(false); - - reset_time(now); - ldout(store->ctx(), 20) << __func__ << "(): successfully renewed lock on " << - lock_oid << dendl; - - return 0; -} - - -int RGWBucketReshard::do_reshard(const rgw::bucket_index_layout_generation& current, - const rgw::bucket_index_layout_generation& target, - int max_entries, - bool verbose, - ostream *out, - Formatter *formatter, - const DoutPrefixProvider *dpp) -{ - if (out) { - (*out) << "tenant: " << bucket_info.bucket.tenant << std::endl; - (*out) << "bucket name: " << bucket_info.bucket.name << std::endl; - } - - /* update bucket info -- in progress*/ - list entries; - - if (max_entries < 0) { - ldpp_dout(dpp, 0) << __func__ << - ": can't reshard, negative max_entries" << dendl; - return -EINVAL; - } - - BucketReshardManager target_shards_mgr(dpp, store, bucket_info, target); - - bool verbose_json_out = verbose && (formatter != nullptr) && (out != nullptr); - - if (verbose_json_out) { - formatter->open_array_section("entries"); - } - - uint64_t total_entries = 0; - - if (!verbose_json_out && out) { - (*out) << "total entries:"; - } - - const int num_source_shards = current.layout.normal.num_shards; - string marker; - for (int i = 0; i < num_source_shards; ++i) { - bool is_truncated = true; - marker.clear(); - const std::string null_object_filter; // empty string since we're not filtering by object - while (is_truncated) { - entries.clear(); - int ret = store->getRados()->bi_list(dpp, bucket_info, i, null_object_filter, marker, max_entries, &entries, &is_truncated); - if (ret < 0 && ret != -ENOENT) { - derr << "ERROR: bi_list(): " << cpp_strerror(-ret) << dendl; - return ret; - } - - for (auto iter = entries.begin(); iter != entries.end(); ++iter) { - rgw_cls_bi_entry& entry = *iter; - if (verbose_json_out) { - formatter->open_object_section("entry"); - - encode_json("shard_id", i, 
formatter); - encode_json("num_entry", total_entries, formatter); - encode_json("entry", entry, formatter); - } - total_entries++; - - marker = entry.idx; - - int target_shard_id; - cls_rgw_obj_key cls_key; - RGWObjCategory category; - rgw_bucket_category_stats stats; - bool account = entry.get_info(&cls_key, &category, &stats); - rgw_obj_key key(cls_key); - if (entry.type == BIIndexType::OLH && key.empty()) { - // bogus entry created by https://tracker.ceph.com/issues/46456 - // to fix, skip so it doesn't get include in the new bucket instance - total_entries--; - ldpp_dout(dpp, 10) << "Dropping entry with empty name, idx=" << marker << dendl; - continue; - } - rgw_obj obj(bucket_info.bucket, key); - RGWMPObj mp; - if (key.ns == RGW_OBJ_NS_MULTIPART && mp.from_meta(key.name)) { - // place the multipart .meta object on the same shard as its head object - obj.index_hash_source = mp.get_key(); - } - ret = store->getRados()->get_target_shard_id(bucket_info.layout.target_index->layout.normal, - obj.get_hash_object(), &target_shard_id); - if (ret < 0) { - ldpp_dout(dpp, -1) << "ERROR: get_target_shard_id() returned ret=" << ret << dendl; - return ret; - } - - int shard_index = (target_shard_id > 0 ? target_shard_id : 0); - - ret = target_shards_mgr.add_entry(shard_index, entry, account, - category, stats); - if (ret < 0) { - return ret; - } - - Clock::time_point now = Clock::now(); - if (reshard_lock.should_renew(now)) { - // assume outer locks have timespans at least the size of ours, so - // can call inside conditional - if (outer_reshard_lock) { - ret = outer_reshard_lock->renew(now); - if (ret < 0) { - return ret; - } - } - ret = reshard_lock.renew(now); - if (ret < 0) { - ldpp_dout(dpp, -1) << "Error renewing bucket lock: " << ret << dendl; - return ret; - } - } - if (verbose_json_out) { - formatter->close_section(); - formatter->flush(*out); - } else if (out && !(total_entries % 1000)) { - (*out) << " " << total_entries; - } - } // entries loop - } - } - - if (verbose_json_out) { - formatter->close_section(); - formatter->flush(*out); - } else if (out) { - (*out) << " " << total_entries << std::endl; - } - - int ret = target_shards_mgr.finish(); - if (ret < 0) { - ldpp_dout(dpp, -1) << "ERROR: failed to reshard" << dendl; - return -EIO; - } - return 0; -} // RGWBucketReshard::do_reshard - -int RGWBucketReshard::get_status(const DoutPrefixProvider *dpp, list *status) -{ - return store->svc()->bi_rados->get_reshard_status(dpp, bucket_info, status); -} - -int RGWBucketReshard::execute(int num_shards, - ReshardFaultInjector& fault, - int max_op_entries, - const DoutPrefixProvider *dpp, - bool verbose, ostream *out, - Formatter *formatter, - RGWReshard* reshard_log) -{ - // take a reshard lock on the bucket - int ret = reshard_lock.lock(dpp); - if (ret < 0) { - return ret; - } - // unlock when scope exits - auto unlock = make_scope_guard([this] { reshard_lock.unlock(); }); - - if (reshard_log) { - ret = reshard_log->update(dpp, bucket_info); - if (ret < 0) { - return ret; - } - } - - // prepare the target index and add its layout the bucket info - ret = init_reshard(store, bucket_info, bucket_attrs, fault, num_shards, dpp); - if (ret < 0) { - return ret; - } - - if (ret = fault.check("do_reshard"); - ret == 0) { // no fault injected, do the reshard - ret = do_reshard(bucket_info.layout.current_index, - *bucket_info.layout.target_index, - max_op_entries, verbose, out, formatter, dpp); - } - - if (ret < 0) { - cancel_reshard(store, bucket_info, bucket_attrs, fault, dpp); - - ldpp_dout(dpp, 1) << 
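
Entry placement in do_reshard() above hashes each key into the target shard count, and overrides the hash source for multipart .meta entries so they land on the same shard as their head object. The rule in isolation, with std::hash standing in for RGW's index hash (so actual shard numbers differ):

// Illustrative sketch, not part of the patch: hash-based target
// shard selection.
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>

uint32_t target_shard(const std::string& hash_source, uint32_t num_shards) {
  return std::hash<std::string>{}(hash_source) % num_shards;
}

int main() {
  // a multipart .meta entry reuses its head object's key as the hash
  // source, so both resolve to the same shard
  const std::string head_key = "movie.mp4";
  std::cout << target_shard(head_key, 131) << ' '
            << target_shard(head_key, 131) << '\n';
  return 0;
}
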
__func__ << " INFO: reshard of bucket \"" - << bucket_info.bucket.name << "\" canceled due to errors" << dendl; - return ret; - } - - ret = commit_reshard(store, bucket_info, bucket_attrs, fault, dpp); - if (ret < 0) { - return ret; - } - - ldpp_dout(dpp, 1) << __func__ << " INFO: reshard of bucket \"" - << bucket_info.bucket.name << "\" completed successfully" << dendl; - return 0; -} // execute - -bool RGWBucketReshard::can_reshard(const RGWBucketInfo& bucket, - const RGWSI_Zone* zone_svc) -{ - return !zone_svc->need_to_log_data() || - bucket.layout.logs.size() < max_bilog_history; -} - - -RGWReshard::RGWReshard(rgw::sal::RadosStore* _store, bool _verbose, ostream *_out, - Formatter *_formatter) : - store(_store), instance_lock(bucket_instance_lock_name), - verbose(_verbose), out(_out), formatter(_formatter) -{ - num_logshards = store->ctx()->_conf.get_val("rgw_reshard_num_logs"); -} - -string RGWReshard::get_logshard_key(const string& tenant, - const string& bucket_name) -{ - return tenant + ":" + bucket_name; -} - -#define MAX_RESHARD_LOGSHARDS_PRIME 7877 - -void RGWReshard::get_bucket_logshard_oid(const string& tenant, const string& bucket_name, string *oid) -{ - string key = get_logshard_key(tenant, bucket_name); - - uint32_t sid = ceph_str_hash_linux(key.c_str(), key.size()); - uint32_t sid2 = sid ^ ((sid & 0xFF) << 24); - sid = sid2 % MAX_RESHARD_LOGSHARDS_PRIME % num_logshards; - - get_logshard_oid(int(sid), oid); -} - -int RGWReshard::add(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry) -{ - if (!store->svc()->zone->can_reshard()) { - ldpp_dout(dpp, 20) << __func__ << " Resharding is disabled" << dendl; - return 0; - } - - string logshard_oid; - - get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid); - - librados::ObjectWriteOperation op; - cls_rgw_reshard_add(op, entry); - - int ret = rgw_rados_operate(dpp, store->getRados()->reshard_pool_ctx, logshard_oid, &op, null_yield); - if (ret < 0) { - ldpp_dout(dpp, -1) << "ERROR: failed to add entry to reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << " bucket=" << entry.bucket_name << dendl; - return ret; - } - return 0; -} - -int RGWReshard::update(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info) -{ - cls_rgw_reshard_entry entry; - entry.bucket_name = bucket_info.bucket.name; - entry.bucket_id = bucket_info.bucket.bucket_id; - entry.tenant = bucket_info.owner.tenant; - - int ret = get(dpp, entry); - if (ret < 0) { - return ret; - } - - ret = add(dpp, entry); - if (ret < 0) { - ldpp_dout(dpp, 0) << __func__ << ":Error in updating entry bucket " << entry.bucket_name << ": " << - cpp_strerror(-ret) << dendl; - } - - return ret; -} - - -int RGWReshard::list(const DoutPrefixProvider *dpp, int logshard_num, string& marker, uint32_t max, std::list& entries, bool *is_truncated) -{ - string logshard_oid; - - get_logshard_oid(logshard_num, &logshard_oid); - - int ret = cls_rgw_reshard_list(store->getRados()->reshard_pool_ctx, logshard_oid, marker, max, entries, is_truncated); - - if (ret == -ENOENT) { - // these shard objects aren't created until we actually write something to - // them, so treat ENOENT as a successful empty listing - *is_truncated = false; - ret = 0; - } else if (ret == -EACCES) { - ldpp_dout(dpp, -1) << "ERROR: access denied to pool " << store->svc()->zone->get_zone_params().reshard_pool - << ". 
Fix the pool access permissions of your client" << dendl; - } else if (ret < 0) { - ldpp_dout(dpp, -1) << "ERROR: failed to list reshard log entries, oid=" - << logshard_oid << " marker=" << marker << " " << cpp_strerror(ret) << dendl; - } - - return ret; -} - -int RGWReshard::get(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry) -{ - string logshard_oid; - - get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid); - - int ret = cls_rgw_reshard_get(store->getRados()->reshard_pool_ctx, logshard_oid, entry); - if (ret < 0) { - if (ret != -ENOENT) { - ldpp_dout(dpp, -1) << "ERROR: failed to get entry from reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << - " bucket=" << entry.bucket_name << dendl; - } - return ret; - } - - return 0; -} - -int RGWReshard::remove(const DoutPrefixProvider *dpp, const cls_rgw_reshard_entry& entry) -{ - string logshard_oid; - - get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid); - - librados::ObjectWriteOperation op; - cls_rgw_reshard_remove(op, entry); - - int ret = rgw_rados_operate(dpp, store->getRados()->reshard_pool_ctx, logshard_oid, &op, null_yield); - if (ret < 0) { - ldpp_dout(dpp, -1) << "ERROR: failed to remove entry from reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << " bucket=" << entry.bucket_name << dendl; - return ret; - } - - return ret; -} - -int RGWReshard::clear_bucket_resharding(const DoutPrefixProvider *dpp, const string& bucket_instance_oid, cls_rgw_reshard_entry& entry) -{ - int ret = cls_rgw_clear_bucket_resharding(store->getRados()->reshard_pool_ctx, bucket_instance_oid); - if (ret < 0) { - ldpp_dout(dpp, -1) << "ERROR: failed to clear bucket resharding, bucket_instance_oid=" << bucket_instance_oid << dendl; - return ret; - } - - return 0; -} - -int RGWReshardWait::wait(optional_yield y) -{ - std::unique_lock lock(mutex); - - if (going_down) { - return -ECANCELED; - } - - if (y) { - auto& context = y.get_io_context(); - auto& yield = y.get_yield_context(); - - Waiter waiter(context); - waiters.push_back(waiter); - lock.unlock(); - - waiter.timer.expires_after(duration); - - boost::system::error_code ec; - waiter.timer.async_wait(yield[ec]); - - lock.lock(); - waiters.erase(waiters.iterator_to(waiter)); - return -ec.value(); - } - - cond.wait_for(lock, duration); - - if (going_down) { - return -ECANCELED; - } - - return 0; -} - -void RGWReshardWait::stop() -{ - std::scoped_lock lock(mutex); - going_down = true; - cond.notify_all(); - for (auto& waiter : waiters) { - // unblock any waiters with ECANCELED - waiter.timer.cancel(); - } -} - -int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry, - int max_entries, const DoutPrefixProvider *dpp) -{ - ldpp_dout(dpp, 20) << __func__ << " resharding " << - entry.bucket_name << dendl; - - rgw_bucket bucket; - RGWBucketInfo bucket_info; - std::map bucket_attrs; - - int ret = store->getRados()->get_bucket_info(store->svc(), - entry.tenant, - entry.bucket_name, - bucket_info, nullptr, - null_yield, dpp, - &bucket_attrs); - if (ret < 0 || bucket_info.bucket.bucket_id != entry.bucket_id) { - if (ret < 0) { - ldpp_dout(dpp, 0) << __func__ << - ": Error in get_bucket_info for bucket " << entry.bucket_name << - ": " << cpp_strerror(-ret) << dendl; - if (ret != -ENOENT) { - // any error other than ENOENT will abort - return ret; - } - } else { - ldpp_dout(dpp, 0) << __func__ << - ": Bucket: " << entry.bucket_name << - " already resharded by someone, skipping " << dendl; - } - - // we've encountered a reshard 
queue entry for an apparently - // non-existent bucket; let's try to recover by cleaning up - ldpp_dout(dpp, 0) << __func__ << - ": removing reshard queue entry for a resharded or non-existent bucket" << - entry.bucket_name << dendl; - - ret = remove(dpp, entry); - if (ret < 0) { - ldpp_dout(dpp, 0) << __func__ << - ": Error removing non-existent bucket " << - entry.bucket_name << " from resharding queue: " << - cpp_strerror(-ret) << dendl; - return ret; - } - - // we cleaned up, move on to the next entry - return 0; - } - - if (!RGWBucketReshard::can_reshard(bucket_info, store->svc()->zone)) { - ldpp_dout(dpp, 1) << "Bucket " << bucket_info.bucket << " is not " - "eligible for resharding until peer zones finish syncing one " - "or more of its old log generations" << dendl; - return remove(dpp, entry); - } - - RGWBucketReshard br(store, bucket_info, bucket_attrs, nullptr); - - ReshardFaultInjector f; // no fault injected - ret = br.execute(entry.new_num_shards, f, max_entries, dpp, - false, nullptr, nullptr, this); - if (ret < 0) { - ldpp_dout(dpp, 0) << __func__ << - ": Error during resharding bucket " << entry.bucket_name << ":" << - cpp_strerror(-ret)<< dendl; - return ret; - } - - ldpp_dout(dpp, 20) << __func__ << - " removing reshard queue entry for bucket " << entry.bucket_name << - dendl; - - ret = remove(dpp, entry); - if (ret < 0) { - ldpp_dout(dpp, 0) << __func__ << ": Error removing bucket " << - entry.bucket_name << " from resharding queue: " << - cpp_strerror(-ret) << dendl; - return ret; - } - return 0; -} - -int RGWReshard::process_single_logshard(int logshard_num, const DoutPrefixProvider *dpp) -{ - string marker; - bool truncated = true; - - constexpr uint32_t max_entries = 1000; - - string logshard_oid; - get_logshard_oid(logshard_num, &logshard_oid); - - RGWBucketReshardLock logshard_lock(store, logshard_oid, false); - - int ret = logshard_lock.lock(dpp); - if (ret < 0) { - ldpp_dout(dpp, 5) << __func__ << "(): failed to acquire lock on " << - logshard_oid << ", ret = " << ret << dendl; - return ret; - } - - do { - std::list<cls_rgw_reshard_entry> entries; - ret = list(dpp, logshard_num, marker, max_entries, entries, &truncated); - if (ret < 0) { - ldpp_dout(dpp, 10) << "cannot list all reshards in logshard oid=" << - logshard_oid << dendl; - continue; - } - - for(auto& entry: entries) { // logshard entries - process_entry(entry, max_entries, dpp); - if (ret < 0) { - return ret; - } - - Clock::time_point now = Clock::now(); - if (logshard_lock.should_renew(now)) { - ret = logshard_lock.renew(now); - if (ret < 0) { - return ret; - } - } - - entry.get_key(&marker); - } // entry for loop - } while (truncated); - - logshard_lock.unlock(); - return 0; -} - - -void RGWReshard::get_logshard_oid(int shard_num, string *logshard) -{ - char buf[32]; - snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num); - - string objname(reshard_oid_prefix); - *logshard = objname + buf; -} - -int RGWReshard::process_all_logshards(const DoutPrefixProvider *dpp) -{ - int ret = 0; - - for (int i = 0; i < num_logshards; i++) { - string logshard; - get_logshard_oid(i, &logshard); - - ldpp_dout(dpp, 20) << "processing logshard = " << logshard << dendl; - - ret = process_single_logshard(i, dpp); - - ldpp_dout(dpp, 20) << "finish processing logshard = " << logshard << " , ret = " << ret << dendl; - } - - return 0; -} - -bool RGWReshard::going_down() -{ - return down_flag; -} - -void RGWReshard::start_processor() -{ - worker = new ReshardWorker(store->ctx(), this); - worker->create("rgw_reshard"); -} - -void RGWReshard::stop_processor() -{ - down_flag = true; - if
(worker) { - worker->stop(); - worker->join(); - } - delete worker; - worker = nullptr; -} - -void *RGWReshard::ReshardWorker::entry() { - do { - utime_t start = ceph_clock_now(); - reshard->process_all_logshards(this); - - if (reshard->going_down()) - break; - - utime_t end = ceph_clock_now(); - end -= start; - int secs = cct->_conf.get_val("rgw_reshard_thread_interval"); - - if (secs <= end.sec()) - continue; // next round - - secs -= end.sec(); - - std::unique_lock locker{lock}; - cond.wait_for(locker, std::chrono::seconds(secs)); - } while (!reshard->going_down()); - - return NULL; -} - -void RGWReshard::ReshardWorker::stop() -{ - std::lock_guard l{lock}; - cond.notify_all(); -} - -CephContext *RGWReshard::ReshardWorker::get_cct() const -{ - return cct; -} - -unsigned RGWReshard::ReshardWorker::get_subsys() const -{ - return dout_subsys; -} - -std::ostream& RGWReshard::ReshardWorker::gen_prefix(std::ostream& out) const -{ - return out << "rgw reshard worker thread: "; -} diff --git a/src/rgw/rgw_reshard.h b/src/rgw/rgw_reshard.h deleted file mode 100644 index d8a8e4991db6..000000000000 --- a/src/rgw/rgw_reshard.h +++ /dev/null @@ -1,277 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab ft=cpp - -#ifndef RGW_RESHARD_H -#define RGW_RESHARD_H - -#include -#include -#include -#include -#include - -#include -#include - -#include "include/common_fwd.h" -#include "include/rados/librados.hpp" -#include "common/ceph_time.h" -#include "common/async/yield_context.h" -#include "cls/rgw/cls_rgw_types.h" -#include "cls/lock/cls_lock_client.h" - -#include "rgw_common.h" -#include "common/fault_injector.h" - - -class RGWReshard; -namespace rgw { namespace sal { - class RadosStore; -} } - -using ReshardFaultInjector = FaultInjector; - -class RGWBucketReshardLock { - using Clock = ceph::coarse_mono_clock; - - rgw::sal::RadosStore* store; - const std::string lock_oid; - const bool ephemeral; - rados::cls::lock::Lock internal_lock; - std::chrono::seconds duration; - - Clock::time_point start_time; - Clock::time_point renew_thresh; - - void reset_time(const Clock::time_point& now) { - start_time = now; - renew_thresh = start_time + duration / 2; - } - -public: - RGWBucketReshardLock(rgw::sal::RadosStore* _store, - const std::string& reshard_lock_oid, - bool _ephemeral); - RGWBucketReshardLock(rgw::sal::RadosStore* _store, - const RGWBucketInfo& bucket_info, - bool _ephemeral) : - RGWBucketReshardLock(_store, bucket_info.bucket.get_key(':'), _ephemeral) - {} - - int lock(const DoutPrefixProvider *dpp); - void unlock(); - int renew(const Clock::time_point&); - - bool should_renew(const Clock::time_point& now) const { - return now >= renew_thresh; - } -}; // class RGWBucketReshardLock - -class RGWBucketReshard { - public: - using Clock = ceph::coarse_mono_clock; - - private: - rgw::sal::RadosStore *store; - RGWBucketInfo bucket_info; - std::map bucket_attrs; - - RGWBucketReshardLock reshard_lock; - RGWBucketReshardLock* outer_reshard_lock; - - // using an initializer_list as an array in contiguous memory - // allocated in at once - static const std::initializer_list reshard_primes; - - int do_reshard(const rgw::bucket_index_layout_generation& current, - const rgw::bucket_index_layout_generation& target, - int max_entries, - bool verbose, - std::ostream *os, - Formatter *formatter, - const DoutPrefixProvider *dpp); -public: - - // pass nullptr for the final parameter if no outer reshard lock to - // manage - RGWBucketReshard(rgw::sal::RadosStore* 
_store, - const RGWBucketInfo& _bucket_info, - const std::map& _bucket_attrs, - RGWBucketReshardLock* _outer_reshard_lock); - int execute(int num_shards, ReshardFaultInjector& f, - int max_op_entries, const DoutPrefixProvider *dpp, - bool verbose = false, std::ostream *out = nullptr, - ceph::Formatter *formatter = nullptr, - RGWReshard *reshard_log = nullptr); - int get_status(const DoutPrefixProvider *dpp, std::list *status); - int cancel(const DoutPrefixProvider* dpp); - - static int clear_resharding(rgw::sal::RadosStore* store, - RGWBucketInfo& bucket_info, - std::map& bucket_attrs, - const DoutPrefixProvider* dpp); - - static uint32_t get_max_prime_shards() { - return *std::crbegin(reshard_primes); - } - - // returns the prime in our list less than or equal to the - // parameter; the lowest value that can be returned is 1 - static uint32_t get_prime_shards_less_or_equal(uint32_t requested_shards) { - auto it = std::upper_bound(reshard_primes.begin(), reshard_primes.end(), - requested_shards); - if (it == reshard_primes.begin()) { - return 1; - } else { - return *(--it); - } - } - - // returns the prime in our list greater than or equal to the - // parameter; if we do not have such a prime, 0 is returned - static uint32_t get_prime_shards_greater_or_equal( - uint32_t requested_shards) - { - auto it = std::lower_bound(reshard_primes.begin(), reshard_primes.end(), - requested_shards); - if (it == reshard_primes.end()) { - return 0; - } else { - return *it; - } - } - - // returns a preferred number of shards given a calculated number of - // shards based on max_dynamic_shards and the list of prime values - static uint32_t get_preferred_shards(uint32_t suggested_shards, - uint32_t max_dynamic_shards) { - - // use a prime if max is within our prime range, otherwise use - // specified max - const uint32_t absolute_max = - max_dynamic_shards >= get_max_prime_shards() ? - max_dynamic_shards : - get_prime_shards_less_or_equal(max_dynamic_shards); - - // if we can use a prime number, use it, otherwise use suggested; - // note get_prime_shards_greater_or_equal will return 0 if no prime in - // prime range - const uint32_t prime_ish_num_shards = - std::max(get_prime_shards_greater_or_equal(suggested_shards), - suggested_shards); - - // dynamic sharding cannot reshard more than defined maximum - const uint32_t final_num_shards = - std::min(prime_ish_num_shards, absolute_max); - - return final_num_shards; - } - - const std::map& get_bucket_attrs() const { - return bucket_attrs; - } - - // for multisite, the RGWBucketInfo keeps a history of old log generations - // until all peers are done with them. 
prevent this log history from growing - // too large by refusing to reshard the bucket until the old logs get trimmed - static constexpr size_t max_bilog_history = 4; - - static bool can_reshard(const RGWBucketInfo& bucket, - const RGWSI_Zone* zone_svc); -}; // RGWBucketReshard - - -class RGWReshard { -public: - using Clock = ceph::coarse_mono_clock; - -private: - rgw::sal::RadosStore* store; - std::string lock_name; - rados::cls::lock::Lock instance_lock; - int num_logshards; - - bool verbose; - std::ostream *out; - Formatter *formatter; - - void get_logshard_oid(int shard_num, std::string *shard); -protected: - class ReshardWorker : public Thread, public DoutPrefixProvider { - CephContext *cct; - RGWReshard *reshard; - ceph::mutex lock = ceph::make_mutex("ReshardWorker"); - ceph::condition_variable cond; - - public: - ReshardWorker(CephContext * const _cct, - RGWReshard * const _reshard) - : cct(_cct), - reshard(_reshard) {} - - void *entry() override; - void stop(); - - CephContext *get_cct() const override; - unsigned get_subsys() const override; - std::ostream& gen_prefix(std::ostream& out) const override; - }; - - ReshardWorker *worker = nullptr; - std::atomic down_flag = { false }; - - std::string get_logshard_key(const std::string& tenant, const std::string& bucket_name); - void get_bucket_logshard_oid(const std::string& tenant, const std::string& bucket_name, std::string *oid); - -public: - RGWReshard(rgw::sal::RadosStore* _store, bool _verbose = false, std::ostream *_out = nullptr, Formatter *_formatter = nullptr); - int add(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry); - int update(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info); - int get(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry); - int remove(const DoutPrefixProvider *dpp, const cls_rgw_reshard_entry& entry); - int list(const DoutPrefixProvider *dpp, int logshard_num, std::string& marker, uint32_t max, std::list& entries, bool *is_truncated); - int clear_bucket_resharding(const DoutPrefixProvider *dpp, const std::string& bucket_instance_oid, cls_rgw_reshard_entry& entry); - - /* reshard thread */ - int process_entry(const cls_rgw_reshard_entry& entry, int max_entries, - const DoutPrefixProvider *dpp); - int process_single_logshard(int logshard_num, const DoutPrefixProvider *dpp); - int process_all_logshards(const DoutPrefixProvider *dpp); - bool going_down(); - void start_processor(); - void stop_processor(); -}; - -class RGWReshardWait { - public: - // the blocking wait uses std::condition_variable::wait_for(), which uses the - // std::chrono::steady_clock. 
use that for the async waits as well - using Clock = std::chrono::steady_clock; - private: - const ceph::timespan duration; - ceph::mutex mutex = ceph::make_mutex("RGWReshardWait::lock"); - ceph::condition_variable cond; - - struct Waiter : boost::intrusive::list_base_hook<> { - using Executor = boost::asio::io_context::executor_type; - using Timer = boost::asio::basic_waitable_timer<Clock, boost::asio::wait_traits<Clock>, Executor>; - Timer timer; - explicit Waiter(boost::asio::io_context& ioc) : timer(ioc) {} - }; - boost::intrusive::list<Waiter> waiters; - - bool going_down{false}; - -public: - RGWReshardWait(ceph::timespan duration = std::chrono::seconds(5)) - : duration(duration) {} - ~RGWReshardWait() { - ceph_assert(going_down); - } - int wait(optional_yield y); - // unblock any threads waiting on reshard - void stop(); -}; - -#endif diff --git a/src/rgw/rgw_resolve.h b/src/rgw/rgw_resolve.h index 92e09220cc6a..0428e0a02fca 100644 --- a/src/rgw/rgw_resolve.h +++ b/src/rgw/rgw_resolve.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_RESOLVE_H -#define CEPH_RGW_RESOLVE_H +#pragma once #include "rgw_common.h" @@ -23,5 +22,3 @@ public: extern void rgw_init_resolver(void); extern void rgw_shutdown_resolver(void); extern RGWResolver *rgw_resolver; - -#endif diff --git a/src/rgw/rgw_rest_bucket.cc b/src/rgw/rgw_rest_bucket.cc deleted file mode 100644 index ebe4e429cc98..000000000000 --- a/src/rgw/rgw_rest_bucket.cc +++ /dev/null @@ -1,413 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab ft=cpp - -#include "rgw_op.h" -#include "driver/rados/rgw_bucket.h" -#include "rgw_rest_bucket.h" -#include "rgw_sal.h" - -#include "include/str_list.h" - -#include "services/svc_sys_obj.h" -#include "services/svc_zone.h" - -#define dout_subsys ceph_subsys_rgw - -using namespace std; - -class RGWOp_Bucket_Info : public RGWRESTOp { - -public: - RGWOp_Bucket_Info() {} - - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("buckets", RGW_CAP_READ); - } - - void execute(optional_yield y) override; - - const char* name() const override { return "get_bucket_info"; } -}; - -void RGWOp_Bucket_Info::execute(optional_yield y) -{ - RGWBucketAdminOpState op_state; - - bool fetch_stats; - - std::string bucket; - - string uid_str; - - RESTArgs::get_string(s, "uid", uid_str, &uid_str); - rgw_user uid(uid_str); - - RESTArgs::get_string(s, "bucket", bucket, &bucket); - RESTArgs::get_bool(s, "stats", false, &fetch_stats); - - op_state.set_user_id(uid); - op_state.set_bucket_name(bucket); - op_state.set_fetch_stats(fetch_stats); - - op_ret = RGWBucketAdminOp::info(driver, op_state, flusher, y, this); -} - -class RGWOp_Get_Policy : public RGWRESTOp { - -public: - RGWOp_Get_Policy() {} - - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("buckets", RGW_CAP_READ); - } - - void execute(optional_yield y) override; - - const char* name() const override { return "get_policy"; } -}; - -void RGWOp_Get_Policy::execute(optional_yield y) -{ - RGWBucketAdminOpState op_state; - - std::string bucket; - std::string object; - - RESTArgs::get_string(s, "bucket", bucket, &bucket); - RESTArgs::get_string(s, "object", object, &object); - - op_state.set_bucket_name(bucket); - op_state.set_object(object); - - op_ret = RGWBucketAdminOp::get_policy(driver, op_state, flusher, this); -} - -class RGWOp_Check_Bucket_Index : public RGWRESTOp { - -public: - RGWOp_Check_Bucket_Index() {} - - int
check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("buckets", RGW_CAP_WRITE); - } - - void execute(optional_yield y) override; - - const char* name() const override { return "check_bucket_index"; } -}; - -void RGWOp_Check_Bucket_Index::execute(optional_yield y) -{ - std::string bucket; - - bool fix_index; - bool check_objects; - - RGWBucketAdminOpState op_state; - - RESTArgs::get_string(s, "bucket", bucket, &bucket); - RESTArgs::get_bool(s, "fix", false, &fix_index); - RESTArgs::get_bool(s, "check-objects", false, &check_objects); - - op_state.set_bucket_name(bucket); - op_state.set_fix_index(fix_index); - op_state.set_check_objects(check_objects); - - op_ret = RGWBucketAdminOp::check_index(driver, op_state, flusher, s->yield, s); -} - -class RGWOp_Bucket_Link : public RGWRESTOp { - -public: - RGWOp_Bucket_Link() {} - - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("buckets", RGW_CAP_WRITE); - } - - void execute(optional_yield y) override; - - const char* name() const override { return "link_bucket"; } -}; - -void RGWOp_Bucket_Link::execute(optional_yield y) -{ - std::string uid_str; - std::string bucket; - std::string bucket_id; - std::string new_bucket_name; - - RGWBucketAdminOpState op_state; - - RESTArgs::get_string(s, "uid", uid_str, &uid_str); - RESTArgs::get_string(s, "bucket", bucket, &bucket); - RESTArgs::get_string(s, "bucket-id", bucket_id, &bucket_id); - RESTArgs::get_string(s, "new-bucket-name", new_bucket_name, &new_bucket_name); - - rgw_user uid(uid_str); - op_state.set_user_id(uid); - op_state.set_bucket_name(bucket); - op_state.set_bucket_id(bucket_id); - op_state.set_new_bucket_name(new_bucket_name); - - bufferlist data; - op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y); - if (op_ret < 0) { - ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; - return; - } - op_ret = RGWBucketAdminOp::link(driver, op_state, s); -} - -class RGWOp_Bucket_Unlink : public RGWRESTOp { - -public: - RGWOp_Bucket_Unlink() {} - - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("buckets", RGW_CAP_WRITE); - } - - void execute(optional_yield y) override; - - const char* name() const override { return "unlink_bucket"; } -}; - -void RGWOp_Bucket_Unlink::execute(optional_yield y) -{ - std::string uid_str; - std::string bucket; - - RGWBucketAdminOpState op_state; - - RESTArgs::get_string(s, "uid", uid_str, &uid_str); - rgw_user uid(uid_str); - - RESTArgs::get_string(s, "bucket", bucket, &bucket); - - op_state.set_user_id(uid); - op_state.set_bucket_name(bucket); - - bufferlist data; - op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y); - if (op_ret < 0) { - ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl; - return; - } - op_ret = RGWBucketAdminOp::unlink(driver, op_state, s); -} - -class RGWOp_Bucket_Remove : public RGWRESTOp { - -public: - RGWOp_Bucket_Remove() {} - - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("buckets", RGW_CAP_WRITE); - } - - void execute(optional_yield y) override; - - const char* name() const override { return "remove_bucket"; } -}; - -void RGWOp_Bucket_Remove::execute(optional_yield y) -{ - std::string bucket_name; - bool delete_children; - std::unique_ptr bucket; - - RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name); - RESTArgs::get_bool(s, "purge-objects", false, &delete_children); - - /* FIXME We're 
abusing the owner of the bucket to pass the user, so that it can be forwarded to - * the master. This user is actually the OP caller, not the bucket owner. */ - op_ret = driver->get_bucket(s, s->user.get(), string(), bucket_name, &bucket, y); - if (op_ret < 0) { - ldpp_dout(this, 0) << "get_bucket returned ret=" << op_ret << dendl; - if (op_ret == -ENOENT) { - op_ret = -ERR_NO_SUCH_BUCKET; - } - return; - } - - op_ret = bucket->remove_bucket(s, delete_children, true, &s->info, s->yield); -} - -class RGWOp_Set_Bucket_Quota : public RGWRESTOp { - -public: - RGWOp_Set_Bucket_Quota() {} - - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("buckets", RGW_CAP_WRITE); - } - - void execute(optional_yield y) override; - - const char* name() const override { return "set_bucket_quota"; } -}; - -#define QUOTA_INPUT_MAX_LEN 1024 - -void RGWOp_Set_Bucket_Quota::execute(optional_yield y) -{ - bool uid_arg_existed = false; - std::string uid_str; - RESTArgs::get_string(s, "uid", uid_str, &uid_str, &uid_arg_existed); - if (! uid_arg_existed) { - op_ret = -EINVAL; - return; - } - rgw_user uid(uid_str); - bool bucket_arg_existed = false; - std::string bucket_name; - RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name, &bucket_arg_existed); - if (! bucket_arg_existed) { - op_ret = -EINVAL; - return; - } - - bool use_http_params; - - if (s->content_length > 0) { - use_http_params = false; - } else { - const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING"); - use_http_params = (!encoding || strcmp(encoding, "chunked") != 0); - } - RGWQuotaInfo quota; - if (!use_http_params) { - bool empty; - op_ret = get_json_input(driver->ctx(), s, quota, QUOTA_INPUT_MAX_LEN, &empty); - if (op_ret < 0) { - if (!empty) - return; - /* was probably chunked input, but no content provided, configure via http params */ - use_http_params = true; - } - } - if (use_http_params) { - std::unique_ptr bucket; - op_ret = driver->get_bucket(s, nullptr, uid.tenant, bucket_name, &bucket, s->yield); - if (op_ret < 0) { - return; - } - RGWQuotaInfo *old_quota = &bucket->get_info().quota; - int64_t old_max_size_kb = rgw_rounded_kb(old_quota->max_size); - int64_t max_size_kb; - bool has_max_size_kb = false; - RESTArgs::get_int64(s, "max-objects", old_quota->max_objects, "a.max_objects); - RESTArgs::get_int64(s, "max-size", old_quota->max_size, "a.max_size); - RESTArgs::get_int64(s, "max-size-kb", old_max_size_kb, &max_size_kb, &has_max_size_kb); - if (has_max_size_kb) - quota.max_size = max_size_kb * 1024; - RESTArgs::get_bool(s, "enabled", old_quota->enabled, "a.enabled); - } - - RGWBucketAdminOpState op_state; - op_state.set_user_id(uid); - op_state.set_bucket_name(bucket_name); - op_state.set_quota(quota); - - op_ret = RGWBucketAdminOp::set_quota(driver, op_state, s); -} - -class RGWOp_Sync_Bucket : public RGWRESTOp { - -public: - RGWOp_Sync_Bucket() {} - - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("buckets", RGW_CAP_WRITE); - } - - void execute(optional_yield y) override; - - const char* name() const override { return "sync_bucket"; } -}; - -void RGWOp_Sync_Bucket::execute(optional_yield y) -{ - std::string bucket; - std::string tenant; - bool sync_bucket; - - RGWBucketAdminOpState op_state; - RESTArgs::get_string(s, "bucket", bucket, &bucket); - RESTArgs::get_string(s, "tenant", tenant, &tenant); - RESTArgs::get_bool(s, "sync", true, &sync_bucket); - - op_state.set_bucket_name(bucket); - op_state.set_tenant(tenant); - op_state.set_sync_bucket(sync_bucket); - - op_ret 
= RGWBucketAdminOp::sync_bucket(driver, op_state, s); -} - -class RGWOp_Object_Remove: public RGWRESTOp { - -public: - RGWOp_Object_Remove() {} - - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("buckets", RGW_CAP_WRITE); - } - - void execute(optional_yield y) override; - - const char* name() const override { return "remove_object"; } -}; - -void RGWOp_Object_Remove::execute(optional_yield y) -{ - std::string bucket; - std::string object; - - RGWBucketAdminOpState op_state; - - RESTArgs::get_string(s, "bucket", bucket, &bucket); - RESTArgs::get_string(s, "object", object, &object); - - op_state.set_bucket_name(bucket); - op_state.set_object(object); - - op_ret = RGWBucketAdminOp::remove_object(driver, op_state, s); -} - - -RGWOp *RGWHandler_Bucket::op_get() -{ - - if (s->info.args.sub_resource_exists("policy")) - return new RGWOp_Get_Policy; - - if (s->info.args.sub_resource_exists("index")) - return new RGWOp_Check_Bucket_Index; - - return new RGWOp_Bucket_Info; -} - -RGWOp *RGWHandler_Bucket::op_put() -{ - if (s->info.args.sub_resource_exists("quota")) - return new RGWOp_Set_Bucket_Quota; - - if (s->info.args.sub_resource_exists("sync")) - return new RGWOp_Sync_Bucket; - - return new RGWOp_Bucket_Link; -} - -RGWOp *RGWHandler_Bucket::op_post() -{ - return new RGWOp_Bucket_Unlink; -} - -RGWOp *RGWHandler_Bucket::op_delete() -{ - if (s->info.args.sub_resource_exists("object")) - return new RGWOp_Object_Remove; - - return new RGWOp_Bucket_Remove; -} diff --git a/src/rgw/rgw_rest_bucket.h b/src/rgw/rgw_rest_bucket.h deleted file mode 100644 index 00f0b64397a2..000000000000 --- a/src/rgw/rgw_rest_bucket.h +++ /dev/null @@ -1,36 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab ft=cpp - -#pragma once - -#include "rgw_rest.h" -#include "rgw_rest_s3.h" - - -class RGWHandler_Bucket : public RGWHandler_Auth_S3 { -protected: - RGWOp *op_get() override; - RGWOp *op_put() override; - RGWOp *op_post() override; - RGWOp *op_delete() override; -public: - using RGWHandler_Auth_S3::RGWHandler_Auth_S3; - ~RGWHandler_Bucket() override = default; - - int read_permissions(RGWOp*, optional_yield y) override { - return 0; - } -}; - -class RGWRESTMgr_Bucket : public RGWRESTMgr { -public: - RGWRESTMgr_Bucket() = default; - ~RGWRESTMgr_Bucket() override = default; - - RGWHandler_REST* get_handler(rgw::sal::Driver* driver, - req_state*, - const rgw::auth::StrategyRegistry& auth_registry, - const std::string&) override { - return new RGWHandler_Bucket(auth_registry); - } -}; diff --git a/src/rgw/rgw_rest_log.cc b/src/rgw/rgw_rest_log.cc deleted file mode 100644 index 3563cf051bd7..000000000000 --- a/src/rgw/rgw_rest_log.cc +++ /dev/null @@ -1,1267 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab ft=cpp - -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2013 eNovance SAS - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "common/ceph_json.h" -#include "common/strtol.h" -#include "rgw_rest.h" -#include "rgw_op.h" -#include "rgw_rest_s3.h" -#include "rgw_rest_log.h" -#include "rgw_client_io.h" -#include "rgw_sync.h" -#include "rgw_data_sync.h" -#include "rgw_common.h" -#include "rgw_zone.h" -#include "rgw_mdlog.h" -#include "rgw_datalog_notify.h" -#include "rgw_trim_bilog.h" - -#include "services/svc_zone.h" -#include "services/svc_mdlog.h" -#include "services/svc_bilog_rados.h" - -#include "common/errno.h" -#include "include/ceph_assert.h" - -#define dout_context g_ceph_context -#define LOG_CLASS_LIST_MAX_ENTRIES (1000) -#define dout_subsys ceph_subsys_rgw - -using namespace std; - -void RGWOp_MDLog_List::execute(optional_yield y) { - string period = s->info.args.get("period"); - string shard = s->info.args.get("id"); - string max_entries_str = s->info.args.get("max-entries"); - string marker = s->info.args.get("marker"), - err; - void *handle; - unsigned shard_id, max_entries = LOG_CLASS_LIST_MAX_ENTRIES; - - if (s->info.args.exists("start-time") || - s->info.args.exists("end-time")) { - ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl; - op_ret = -EINVAL; - return; - } - - shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err); - if (!err.empty()) { - ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl; - op_ret = -EINVAL; - return; - } - - if (!max_entries_str.empty()) { - max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err); - if (!err.empty()) { - ldpp_dout(this, 5) << "Error parsing max-entries " << max_entries_str << dendl; - op_ret = -EINVAL; - return; - } - if (max_entries > LOG_CLASS_LIST_MAX_ENTRIES) { - max_entries = LOG_CLASS_LIST_MAX_ENTRIES; - } - } - - if (period.empty()) { - ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl; - period = driver->get_zone()->get_current_period_id(); - if (period.empty()) { - ldpp_dout(this, 5) << "Missing period id" << dendl; - op_ret = -EINVAL; - return; - } - } - - RGWMetadataLog meta_log{s->cct, static_cast(driver)->svc()->zone, static_cast(driver)->svc()->cls, period}; - - meta_log.init_list_entries(shard_id, {}, {}, marker, &handle); - - op_ret = meta_log.list_entries(this, handle, max_entries, entries, - &last_marker, &truncated); - - meta_log.complete_list_entries(handle); -} - -void RGWOp_MDLog_List::send_response() { - set_req_state_err(s, op_ret); - dump_errno(s); - end_header(s); - - if (op_ret < 0) - return; - - s->formatter->open_object_section("log_entries"); - s->formatter->dump_string("marker", last_marker); - s->formatter->dump_bool("truncated", truncated); - { - s->formatter->open_array_section("entries"); - for (list::iterator iter = entries.begin(); - iter != entries.end(); ++iter) { - cls_log_entry& entry = *iter; - static_cast(driver)->ctl()->meta.mgr->dump_log_entry(entry, s->formatter); - flusher.flush(); - } - s->formatter->close_section(); - } - s->formatter->close_section(); - flusher.flush(); -} - -void RGWOp_MDLog_Info::execute(optional_yield y) { - num_objects = s->cct->_conf->rgw_md_log_max_shards; - period = static_cast(driver)->svc()->mdlog->read_oldest_log_period(y, s); - op_ret = period.get_error(); -} - -void RGWOp_MDLog_Info::send_response() { - set_req_state_err(s, op_ret); - dump_errno(s); - end_header(s); - - s->formatter->open_object_section("mdlog"); - s->formatter->dump_unsigned("num_objects", num_objects); - if (period) { - s->formatter->dump_string("period", period.get_period().get_id()); - 
s->formatter->dump_unsigned("realm_epoch", period.get_epoch()); - } - s->formatter->close_section(); - flusher.flush(); -} - -void RGWOp_MDLog_ShardInfo::execute(optional_yield y) { - string period = s->info.args.get("period"); - string shard = s->info.args.get("id"); - string err; - - unsigned shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err); - if (!err.empty()) { - ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl; - op_ret = -EINVAL; - return; - } - - if (period.empty()) { - ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl; - period = driver->get_zone()->get_current_period_id(); - - if (period.empty()) { - ldpp_dout(this, 5) << "Missing period id" << dendl; - op_ret = -EINVAL; - return; - } - } - RGWMetadataLog meta_log{s->cct, static_cast(driver)->svc()->zone, static_cast(driver)->svc()->cls, period}; - - op_ret = meta_log.get_info(this, shard_id, &info); -} - -void RGWOp_MDLog_ShardInfo::send_response() { - set_req_state_err(s, op_ret); - dump_errno(s); - end_header(s); - - encode_json("info", info, s->formatter); - flusher.flush(); -} - -void RGWOp_MDLog_Delete::execute(optional_yield y) { - string marker = s->info.args.get("marker"), - period = s->info.args.get("period"), - shard = s->info.args.get("id"), - err; - unsigned shard_id; - - - if (s->info.args.exists("start-time") || - s->info.args.exists("end-time")) { - ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl; - op_ret = -EINVAL; - } - - if (s->info.args.exists("start-marker")) { - ldpp_dout(this, 5) << "start-marker is no longer accepted" << dendl; - op_ret = -EINVAL; - } - - if (s->info.args.exists("end-marker")) { - if (!s->info.args.exists("marker")) { - marker = s->info.args.get("end-marker"); - } else { - ldpp_dout(this, 5) << "end-marker and marker cannot both be provided" << dendl; - op_ret = -EINVAL; - } - } - - op_ret = 0; - - shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err); - if (!err.empty()) { - ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl; - op_ret = -EINVAL; - return; - } - - if (marker.empty()) { /* bounding end */ - op_ret = -EINVAL; - return; - } - - if (period.empty()) { - ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl; - period = driver->get_zone()->get_current_period_id(); - - if (period.empty()) { - ldpp_dout(this, 5) << "Missing period id" << dendl; - op_ret = -EINVAL; - return; - } - } - RGWMetadataLog meta_log{s->cct, static_cast(driver)->svc()->zone, static_cast(driver)->svc()->cls, period}; - - op_ret = meta_log.trim(this, shard_id, {}, {}, {}, marker); -} - -void RGWOp_MDLog_Lock::execute(optional_yield y) { - string period, shard_id_str, duration_str, locker_id, zone_id; - unsigned shard_id; - - op_ret = 0; - - period = s->info.args.get("period"); - shard_id_str = s->info.args.get("id"); - duration_str = s->info.args.get("length"); - locker_id = s->info.args.get("locker-id"); - zone_id = s->info.args.get("zone-id"); - - if (period.empty()) { - ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl; - period = driver->get_zone()->get_current_period_id(); - } - - if (period.empty() || - shard_id_str.empty() || - (duration_str.empty()) || - locker_id.empty() || - zone_id.empty()) { - ldpp_dout(this, 5) << "Error invalid parameter list" << dendl; - op_ret = -EINVAL; - return; - } - - string err; - shard_id = (unsigned)strict_strtol(shard_id_str.c_str(), 10, &err); - if (!err.empty()) { - ldpp_dout(this, 5) << "Error parsing shard_id param " << 
shard_id_str << dendl; - op_ret = -EINVAL; - return; - } - - RGWMetadataLog meta_log{s->cct, static_cast(driver)->svc()->zone, static_cast(driver)->svc()->cls, period}; - unsigned dur; - dur = (unsigned)strict_strtol(duration_str.c_str(), 10, &err); - if (!err.empty() || dur <= 0) { - ldpp_dout(this, 5) << "invalid length param " << duration_str << dendl; - op_ret = -EINVAL; - return; - } - op_ret = meta_log.lock_exclusive(s, shard_id, make_timespan(dur), zone_id, - locker_id); - if (op_ret == -EBUSY) - op_ret = -ERR_LOCKED; -} - -void RGWOp_MDLog_Unlock::execute(optional_yield y) { - string period, shard_id_str, locker_id, zone_id; - unsigned shard_id; - - op_ret = 0; - - period = s->info.args.get("period"); - shard_id_str = s->info.args.get("id"); - locker_id = s->info.args.get("locker-id"); - zone_id = s->info.args.get("zone-id"); - - if (period.empty()) { - ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl; - period = driver->get_zone()->get_current_period_id(); - } - - if (period.empty() || - shard_id_str.empty() || - locker_id.empty() || - zone_id.empty()) { - ldpp_dout(this, 5) << "Error invalid parameter list" << dendl; - op_ret = -EINVAL; - return; - } - - string err; - shard_id = (unsigned)strict_strtol(shard_id_str.c_str(), 10, &err); - if (!err.empty()) { - ldpp_dout(this, 5) << "Error parsing shard_id param " << shard_id_str << dendl; - op_ret = -EINVAL; - return; - } - - RGWMetadataLog meta_log{s->cct, static_cast(driver)->svc()->zone, static_cast(driver)->svc()->cls, period}; - op_ret = meta_log.unlock(s, shard_id, zone_id, locker_id); -} - -void RGWOp_MDLog_Notify::execute(optional_yield y) { -#define LARGE_ENOUGH_BUF (128 * 1024) - - int r = 0; - bufferlist data; - std::tie(r, data) = read_all_input(s, LARGE_ENOUGH_BUF); - if (r < 0) { - op_ret = r; - return; - } - - char* buf = data.c_str(); - ldpp_dout(this, 20) << __func__ << "(): read data: " << buf << dendl; - - JSONParser p; - r = p.parse(buf, data.length()); - if (r < 0) { - ldpp_dout(this, 0) << "ERROR: failed to parse JSON" << dendl; - op_ret = r; - return; - } - - set updated_shards; - try { - decode_json_obj(updated_shards, &p); - } catch (JSONDecoder::err& err) { - ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl; - op_ret = -EINVAL; - return; - } - - if (driver->ctx()->_conf->subsys.should_gather()) { - for (set::iterator iter = updated_shards.begin(); iter != updated_shards.end(); ++iter) { - ldpp_dout(this, 20) << __func__ << "(): updated shard=" << *iter << dendl; - } - } - - driver->wakeup_meta_sync_shards(updated_shards); - - op_ret = 0; -} - -void RGWOp_BILog_List::execute(optional_yield y) { - bool gen_specified = false; - string tenant_name = s->info.args.get("tenant"), - bucket_name = s->info.args.get("bucket"), - marker = s->info.args.get("marker"), - max_entries_str = s->info.args.get("max-entries"), - bucket_instance = s->info.args.get("bucket-instance"), - gen_str = s->info.args.get("generation", &gen_specified), - format_version_str = s->info.args.get("format-ver"); - std::unique_ptr bucket; - rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name)); - - unsigned max_entries; - - if (bucket_name.empty() && bucket_instance.empty()) { - ldpp_dout(this, 5) << "ERROR: neither bucket nor bucket instance specified" << dendl; - op_ret = -EINVAL; - return; - } - - string err; - std::optional gen; - if (gen_specified) { - gen = strict_strtoll(gen_str.c_str(), 10, &err); - if (!err.empty()) { - ldpp_dout(s, 5) << "Error parsing generation param " << gen_str << dendl; - 
op_ret = -EINVAL; - return; - } - } - - if (!format_version_str.empty()) { - format_ver = strict_strtoll(format_version_str.c_str(), 10, &err); - if (!err.empty()) { - ldpp_dout(s, 5) << "Failed to parse format-ver param: " << format_ver << dendl; - op_ret = -EINVAL; - return; - } - } - - int shard_id; - string bn; - op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id); - if (op_ret < 0) { - return; - } - - if (!bucket_instance.empty()) { - b.name = bn; - b.bucket_id = bucket_instance; - } - op_ret = driver->get_bucket(s, nullptr, b, &bucket, y); - if (op_ret < 0) { - ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl; - return; - } - - const auto& logs = bucket->get_info().layout.logs; - if (logs.empty()) { - ldpp_dout(s, 5) << "ERROR: bucket=" << bucket_name << " has no log layouts" << dendl; - op_ret = -ENOENT; - return; - } - - auto log = std::prev(logs.end()); - if (gen) { - log = std::find_if(logs.begin(), logs.end(), rgw::matches_gen(*gen)); - if (log == logs.end()) { - ldpp_dout(s, 5) << "ERROR: no log layout with gen=" << *gen << dendl; - op_ret = -ENOENT; - return; - } - } - if (auto next = std::next(log); next != logs.end()) { - next_log_layout = *next; // get the next log after the current latest - } - auto& log_layout = *log; // current log layout for log listing - - unsigned count = 0; - - - max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err); - if (!err.empty()) - max_entries = LOG_CLASS_LIST_MAX_ENTRIES; - - send_response(); - do { - list entries; - int ret = static_cast(driver)->svc()->bilog_rados->log_list(s, bucket->get_info(), log_layout, shard_id, - marker, max_entries - count, - entries, &truncated); - if (ret < 0) { - ldpp_dout(this, 5) << "ERROR: list_bi_log_entries()" << dendl; - return; - } - - count += entries.size(); - - send_response(entries, marker); - } while (truncated && count < max_entries); - - send_response_end(); -} - -void RGWOp_BILog_List::send_response() { - if (sent_header) - return; - - set_req_state_err(s, op_ret); - dump_errno(s); - end_header(s); - - sent_header = true; - - if (op_ret < 0) - return; - - if (format_ver >= 2) { - s->formatter->open_object_section("result"); - } - - s->formatter->open_array_section("entries"); -} - -void RGWOp_BILog_List::send_response(list& entries, string& marker) -{ - for (list::iterator iter = entries.begin(); iter != entries.end(); ++iter) { - rgw_bi_log_entry& entry = *iter; - encode_json("entry", entry, s->formatter); - - marker = entry.id; - flusher.flush(); - } -} - -void RGWOp_BILog_List::send_response_end() { - s->formatter->close_section(); - - if (format_ver >= 2) { - encode_json("truncated", truncated, s->formatter); - - if (next_log_layout) { - s->formatter->open_object_section("next_log"); - encode_json("generation", next_log_layout->gen, s->formatter); - encode_json("num_shards", next_log_layout->layout.in_index.layout.num_shards, s->formatter); - s->formatter->close_section(); // next_log - } - - s->formatter->close_section(); // result - } - - flusher.flush(); -} - -void RGWOp_BILog_Info::execute(optional_yield y) { - string tenant_name = s->info.args.get("tenant"), - bucket_name = s->info.args.get("bucket"), - bucket_instance = s->info.args.get("bucket-instance"); - std::unique_ptr bucket; - rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name)); - - if (bucket_name.empty() && bucket_instance.empty()) { - ldpp_dout(this, 5) << "ERROR: neither bucket nor bucket instance specified" << dendl; - op_ret = 
-EINVAL; - return; - } - - int shard_id; - string bn; - op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id); - if (op_ret < 0) { - return; - } - - if (!bucket_instance.empty()) { - b.name = bn; - b.bucket_id = bucket_instance; - } - op_ret = driver->get_bucket(s, nullptr, b, &bucket, y); - if (op_ret < 0) { - ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl; - return; - } - - const auto& logs = bucket->get_info().layout.logs; - if (logs.empty()) { - ldpp_dout(s, 5) << "ERROR: bucket=" << bucket_name << " has no log layouts" << dendl; - op_ret = -ENOENT; - return; - } - - map stats; - const auto& index = log_to_index_layout(logs.back()); - - int ret = bucket->read_stats(s, index, shard_id, &bucket_ver, &master_ver, stats, &max_marker, &syncstopped); - if (ret < 0 && ret != -ENOENT) { - op_ret = ret; - return; - } - - oldest_gen = logs.front().gen; - latest_gen = logs.back().gen; - - for (auto& log : logs) { - uint32_t num_shards = log.layout.in_index.layout.num_shards; - generations.push_back({log.gen, num_shards}); - } -} - -void RGWOp_BILog_Info::send_response() { - set_req_state_err(s, op_ret); - dump_errno(s); - end_header(s); - - if (op_ret < 0) - return; - - s->formatter->open_object_section("info"); - encode_json("bucket_ver", bucket_ver, s->formatter); - encode_json("master_ver", master_ver, s->formatter); - encode_json("max_marker", max_marker, s->formatter); - encode_json("syncstopped", syncstopped, s->formatter); - encode_json("oldest_gen", oldest_gen, s->formatter); - encode_json("latest_gen", latest_gen, s->formatter); - encode_json("generations", generations, s->formatter); - s->formatter->close_section(); - - flusher.flush(); -} - -void RGWOp_BILog_Delete::execute(optional_yield y) { - bool gen_specified = false; - string tenant_name = s->info.args.get("tenant"), - bucket_name = s->info.args.get("bucket"), - start_marker = s->info.args.get("start-marker"), - end_marker = s->info.args.get("end-marker"), - bucket_instance = s->info.args.get("bucket-instance"), - gen_str = s->info.args.get("generation", &gen_specified); - - std::unique_ptr bucket; - rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name)); - - op_ret = 0; - if ((bucket_name.empty() && bucket_instance.empty()) || - end_marker.empty()) { - ldpp_dout(this, 5) << "ERROR: one of bucket or bucket instance, and also end-marker is mandatory" << dendl; - op_ret = -EINVAL; - return; - } - - string err; - uint64_t gen = 0; - if (gen_specified) { - gen = strict_strtoll(gen_str.c_str(), 10, &err); - if (!err.empty()) { - ldpp_dout(s, 5) << "Error parsing generation param " << gen_str << dendl; - op_ret = -EINVAL; - return; - } - } - - int shard_id; - string bn; - op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id); - if (op_ret < 0) { - return; - } - - if (!bucket_instance.empty()) { - b.name = bn; - b.bucket_id = bucket_instance; - } - op_ret = driver->get_bucket(s, nullptr, b, &bucket, y); - if (op_ret < 0) { - ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl; - return; - } - - op_ret = bilog_trim(this, static_cast(driver), - bucket->get_info(), gen, shard_id, - start_marker, end_marker); - if (op_ret < 0) { - ldpp_dout(s, 5) << "bilog_trim failed with op_ret=" << op_ret << dendl; - } - - return; -} - -void RGWOp_DATALog_List::execute(optional_yield y) { - string shard = s->info.args.get("id"); - - string max_entries_str = s->info.args.get("max-entries"), - marker = 
s->info.args.get("marker"), - err; - unsigned shard_id, max_entries = LOG_CLASS_LIST_MAX_ENTRIES; - - if (s->info.args.exists("start-time") || - s->info.args.exists("end-time")) { - ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl; - op_ret = -EINVAL; - } - - s->info.args.get_bool("extra-info", &extra_info, false); - - shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err); - if (!err.empty()) { - ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl; - op_ret = -EINVAL; - return; - } - - if (!max_entries_str.empty()) { - max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err); - if (!err.empty()) { - ldpp_dout(this, 5) << "Error parsing max-entries " << max_entries_str << dendl; - op_ret = -EINVAL; - return; - } - if (max_entries > LOG_CLASS_LIST_MAX_ENTRIES) { - max_entries = LOG_CLASS_LIST_MAX_ENTRIES; - } - } - - // Note that last_marker is updated to be the marker of the last - // entry listed - op_ret = static_cast(driver)->svc()->datalog_rados->list_entries(this, shard_id, - max_entries, entries, - marker, &last_marker, - &truncated); -} - -void RGWOp_DATALog_List::send_response() { - set_req_state_err(s, op_ret); - dump_errno(s); - end_header(s); - - if (op_ret < 0) - return; - - s->formatter->open_object_section("log_entries"); - s->formatter->dump_string("marker", last_marker); - s->formatter->dump_bool("truncated", truncated); - { - s->formatter->open_array_section("entries"); - for (const auto& entry : entries) { - if (!extra_info) { - encode_json("entry", entry.entry, s->formatter); - } else { - encode_json("entry", entry, s->formatter); - } - flusher.flush(); - } - s->formatter->close_section(); - } - s->formatter->close_section(); - flusher.flush(); -} - - -void RGWOp_DATALog_Info::execute(optional_yield y) { - num_objects = s->cct->_conf->rgw_data_log_num_shards; - op_ret = 0; -} - -void RGWOp_DATALog_Info::send_response() { - set_req_state_err(s, op_ret); - dump_errno(s); - end_header(s); - - s->formatter->open_object_section("num_objects"); - s->formatter->dump_unsigned("num_objects", num_objects); - s->formatter->close_section(); - flusher.flush(); -} - -void RGWOp_DATALog_ShardInfo::execute(optional_yield y) { - string shard = s->info.args.get("id"); - string err; - - unsigned shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err); - if (!err.empty()) { - ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl; - op_ret = -EINVAL; - return; - } - - op_ret = static_cast(driver)->svc()->datalog_rados->get_info(this, shard_id, &info); -} - -void RGWOp_DATALog_ShardInfo::send_response() { - set_req_state_err(s, op_ret); - dump_errno(s); - end_header(s); - - encode_json("info", info, s->formatter); - flusher.flush(); -} - -void RGWOp_DATALog_Notify::execute(optional_yield y) { - string source_zone = s->info.args.get("source-zone"); -#define LARGE_ENOUGH_BUF (128 * 1024) - - int r = 0; - bufferlist data; - std::tie(r, data) = read_all_input(s, LARGE_ENOUGH_BUF); - if (r < 0) { - op_ret = r; - return; - } - - char* buf = data.c_str(); - ldpp_dout(this, 20) << __func__ << "(): read data: " << buf << dendl; - - JSONParser p; - r = p.parse(buf, data.length()); - if (r < 0) { - ldpp_dout(this, 0) << "ERROR: failed to parse JSON" << dendl; - op_ret = r; - return; - } - - bc::flat_map> updated_shards; - try { - auto decoder = rgw_data_notify_v1_decoder{updated_shards}; - decode_json_obj(decoder, &p); - } catch (JSONDecoder::err& err) { - ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl; - 
op_ret = -EINVAL; - return; - } - - if (driver->ctx()->_conf->subsys.should_gather()) { - for (bc::flat_map >::iterator iter = updated_shards.begin(); iter != updated_shards.end(); ++iter) { - ldpp_dout(this, 20) << __func__ << "(): updated shard=" << iter->first << dendl; - bc::flat_set& entries = iter->second; - for (const auto& [key, gen] : entries) { - ldpp_dout(this, 20) << __func__ << "(): modified key=" << key - << " of gen=" << gen << dendl; - } - } - } - - driver->wakeup_data_sync_shards(this, source_zone, updated_shards); - - op_ret = 0; -} - -void RGWOp_DATALog_Notify2::execute(optional_yield y) { - string source_zone = s->info.args.get("source-zone"); -#define LARGE_ENOUGH_BUF (128 * 1024) - - int r = 0; - bufferlist data; - std::tie(r, data) = rgw_rest_read_all_input(s, LARGE_ENOUGH_BUF); - if (r < 0) { - op_ret = r; - return; - } - - char* buf = data.c_str(); - ldout(s->cct, 20) << __func__ << "(): read data: " << buf << dendl; - - JSONParser p; - r = p.parse(buf, data.length()); - if (r < 0) { - ldout(s->cct, 0) << "ERROR: failed to parse JSON" << dendl; - op_ret = r; - return; - } - - bc::flat_map > updated_shards; - try { - decode_json_obj(updated_shards, &p); - } catch (JSONDecoder::err& err) { - ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl; - op_ret = -EINVAL; - return; - } - - if (driver->ctx()->_conf->subsys.should_gather()) { - for (bc::flat_map >::iterator iter = - updated_shards.begin(); iter != updated_shards.end(); ++iter) { - ldpp_dout(this, 20) << __func__ << "(): updated shard=" << iter->first << dendl; - bc::flat_set& entries = iter->second; - for (const auto& [key, gen] : entries) { - ldpp_dout(this, 20) << __func__ << "(): modified key=" << key << - " of generation=" << gen << dendl; - } - } - } - - driver->wakeup_data_sync_shards(this, source_zone, updated_shards); - - op_ret = 0; -} - -void RGWOp_DATALog_Delete::execute(optional_yield y) { - string marker = s->info.args.get("marker"), - shard = s->info.args.get("id"), - err; - unsigned shard_id; - - op_ret = 0; - - if (s->info.args.exists("start-time") || - s->info.args.exists("end-time")) { - ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl; - op_ret = -EINVAL; - } - - if (s->info.args.exists("start-marker")) { - ldpp_dout(this, 5) << "start-marker is no longer accepted" << dendl; - op_ret = -EINVAL; - } - - if (s->info.args.exists("end-marker")) { - if (!s->info.args.exists("marker")) { - marker = s->info.args.get("end-marker"); - } else { - ldpp_dout(this, 5) << "end-marker and marker cannot both be provided" << dendl; - op_ret = -EINVAL; - } - } - - shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err); - if (!err.empty()) { - ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl; - op_ret = -EINVAL; - return; - } - if (marker.empty()) { /* bounding end */ - op_ret = -EINVAL; - return; - } - - op_ret = static_cast(driver)->svc()->datalog_rados->trim_entries(this, shard_id, marker); -} - -// not in header to avoid pulling in rgw_sync.h -class RGWOp_MDLog_Status : public RGWRESTOp { - rgw_meta_sync_status status; -public: - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("mdlog", RGW_CAP_READ); - } - int verify_permission(optional_yield) override { - return check_caps(s->user->get_caps()); - } - void execute(optional_yield y) override; - void send_response() override; - const char* name() const override { return "get_metadata_log_status"; } -}; - -void RGWOp_MDLog_Status::execute(optional_yield y) -{ - auto sync 
= static_cast(driver)->getRados()->get_meta_sync_manager(); - if (sync == nullptr) { - ldpp_dout(this, 1) << "no sync manager" << dendl; - op_ret = -ENOENT; - return; - } - op_ret = sync->read_sync_status(this, &status); -} - -void RGWOp_MDLog_Status::send_response() -{ - set_req_state_err(s, op_ret); - dump_errno(s); - end_header(s); - - if (op_ret >= 0) { - encode_json("status", status, s->formatter); - } - flusher.flush(); -} - -// not in header to avoid pulling in rgw_data_sync.h -class RGWOp_BILog_Status : public RGWRESTOp { - bilog_status_v2 status; - int version = 1; -public: - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("bilog", RGW_CAP_READ); - } - int verify_permission(optional_yield y) override { - return check_caps(s->user->get_caps()); - } - void execute(optional_yield y) override; - void send_response() override; - const char* name() const override { return "get_bucket_index_log_status"; } -}; - -void RGWOp_BILog_Status::execute(optional_yield y) -{ - const auto options = s->info.args.get("options"); - bool merge = (options == "merge"); - const auto source_zone = s->info.args.get("source-zone"); - const auto source_key = s->info.args.get("source-bucket"); - auto key = s->info.args.get("bucket"); - op_ret = s->info.args.get_int("version", &version, 1); - - if (key.empty()) { - key = source_key; - } - if (key.empty()) { - ldpp_dout(this, 4) << "no 'bucket' provided" << dendl; - op_ret = -EINVAL; - return; - } - - rgw_bucket b; - int shard_id{-1}; // unused - op_ret = rgw_bucket_parse_bucket_key(s->cct, key, &b, &shard_id); - if (op_ret < 0) { - ldpp_dout(this, 4) << "invalid 'bucket' provided" << dendl; - op_ret = -EINVAL; - return; - } - - // read the bucket instance info for num_shards - std::unique_ptr bucket; - op_ret = driver->get_bucket(s, nullptr, b, &bucket, y); - if (op_ret < 0) { - ldpp_dout(this, 4) << "failed to read bucket info: " << cpp_strerror(op_ret) << dendl; - return; - } - - rgw_bucket source_bucket; - - if (source_key.empty() || - source_key == key) { - source_bucket = bucket->get_key(); - } else { - op_ret = rgw_bucket_parse_bucket_key(s->cct, source_key, &source_bucket, nullptr); - if (op_ret < 0) { - ldpp_dout(this, 4) << "invalid 'source-bucket' provided (key=" << source_key << ")" << dendl; - return; - } - } - - const auto& local_zone_id = driver->get_zone()->get_id(); - - if (!merge) { - rgw_sync_bucket_pipe pipe; - pipe.source.zone = source_zone; - pipe.source.bucket = source_bucket; - pipe.dest.zone = local_zone_id; - pipe.dest.bucket = bucket->get_key(); - - ldpp_dout(this, 20) << "RGWOp_BILog_Status::execute(optional_yield y): getting sync status for pipe=" << pipe << dendl; - - op_ret = rgw_read_bucket_full_sync_status( - this, - static_cast(driver), - pipe, - &status.sync_status, - s->yield); - if (op_ret < 0) { - ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_full_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl; - return; - } - status.inc_status.resize(status.sync_status.shards_done_with_gen.size()); - - op_ret = rgw_read_bucket_inc_sync_status( - this, - static_cast(driver), - pipe, - status.sync_status.incremental_gen, - &status.inc_status); - if (op_ret < 0) { - ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_inc_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl; - } - return; - } - - rgw_zone_id source_zone_id(source_zone); - - RGWBucketSyncPolicyHandlerRef source_handler; - op_ret = driver->get_sync_policy_handler(s, source_zone_id, source_bucket, &source_handler, 
y); - if (op_ret < 0) { - ldpp_dout(this, -1) << "could not get bucket sync policy handler (r=" << op_ret << ")" << dendl; - return; - } - - auto local_dests = source_handler->get_all_dests_in_zone(local_zone_id); - - std::vector<rgw_bucket_shard_sync_info> current_status; - for (auto& entry : local_dests) { - auto pipe = entry.second; - - ldpp_dout(this, 20) << "RGWOp_BILog_Status::execute(optional_yield y): getting sync status for pipe=" << pipe << dendl; - - RGWBucketInfo *pinfo = &bucket->get_info(); - std::optional<RGWBucketInfo> opt_dest_info; - - if (!pipe.dest.bucket) { - /* Uh oh, something went wrong */ - ldpp_dout(this, 20) << "ERROR: RGWOp_BILog_Status::execute(optional_yield y): BUG: pipe.dest.bucket was not initialized" << pipe << dendl; - op_ret = -EIO; - return; - } - - if (*pipe.dest.bucket != pinfo->bucket) { - opt_dest_info.emplace(); - std::unique_ptr<rgw::sal::Bucket> dest_bucket; - op_ret = driver->get_bucket(s, nullptr, *pipe.dest.bucket, &dest_bucket, y); - if (op_ret < 0) { - ldpp_dout(this, 4) << "failed to read target bucket info: " << cpp_strerror(op_ret) << dendl; - return; - } - - *opt_dest_info = dest_bucket->get_info(); - pinfo = &(*opt_dest_info); - pipe.dest.bucket = pinfo->bucket; - } - - op_ret = rgw_read_bucket_full_sync_status( - this, - static_cast<rgw::sal::RadosStore*>(driver), - pipe, - &status.sync_status, - s->yield); - if (op_ret < 0) { - ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_full_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl; - return; - } - - current_status.resize(status.sync_status.shards_done_with_gen.size()); - int r = rgw_read_bucket_inc_sync_status(this, static_cast<rgw::sal::RadosStore*>(driver), - pipe, status.sync_status.incremental_gen, &current_status); - if (r < 0) { - ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_inc_sync_status() on pipe=" << pipe << " returned ret=" << r << dendl; - op_ret = r; - return; - } - - if (status.inc_status.empty()) { - status.inc_status = std::move(current_status); - } else { - if (current_status.size() != status.inc_status.size()) { - op_ret = -EINVAL; - ldpp_dout(this, -1) << "ERROR: different number of shards for sync status of buckets " - "syncing from the same source: status.size()= " - << status.inc_status.size() - << " current_status.size()=" - << current_status.size() << dendl; - return; - } - auto m = status.inc_status.begin(); - for (auto& cur_shard_status : current_status) { - auto& result_shard_status = *m++; - // always take the first marker, or any later marker that's smaller - if (cur_shard_status.inc_marker.position < result_shard_status.inc_marker.position) { - result_shard_status = std::move(cur_shard_status); - } - } - } - } -} - -void RGWOp_BILog_Status::send_response() -{ - set_req_state_err(s, op_ret); - dump_errno(s); - end_header(s); - - if (op_ret >= 0) { - if (version < 2) { - encode_json("status", status.inc_status, s->formatter); - } else { - encode_json("status", status, s->formatter); - } - } - flusher.flush(); -} - -// not in header to avoid pulling in rgw_data_sync.h -class RGWOp_DATALog_Status : public RGWRESTOp { - rgw_data_sync_status status; -public: - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("datalog", RGW_CAP_READ); - } - int verify_permission(optional_yield y) override { - return check_caps(s->user->get_caps()); - } - void execute(optional_yield y) override; - void send_response() override; - const char* name() const override { return "get_data_changes_log_status"; } -}; - -void RGWOp_DATALog_Status::execute(optional_yield y) -{ - const auto source_zone = s->info.args.get("source-zone"); - auto
sync = driver->get_data_sync_manager(source_zone); - if (sync == nullptr) { - ldpp_dout(this, 1) << "no sync manager for source-zone " << source_zone << dendl; - op_ret = -ENOENT; - return; - } - op_ret = sync->read_sync_status(this, &status); -} - -void RGWOp_DATALog_Status::send_response() -{ - set_req_state_err(s, op_ret); - dump_errno(s); - end_header(s); - - if (op_ret >= 0) { - encode_json("status", status, s->formatter); - } - flusher.flush(); -} - - -RGWOp *RGWHandler_Log::op_get() { - bool exists; - string type = s->info.args.get("type", &exists); - - if (!exists) { - return NULL; - } - - if (type.compare("metadata") == 0) { - if (s->info.args.exists("id")) { - if (s->info.args.exists("info")) { - return new RGWOp_MDLog_ShardInfo; - } else { - return new RGWOp_MDLog_List; - } - } else if (s->info.args.exists("status")) { - return new RGWOp_MDLog_Status; - } else { - return new RGWOp_MDLog_Info; - } - } else if (type.compare("bucket-index") == 0) { - if (s->info.args.exists("info")) { - return new RGWOp_BILog_Info; - } else if (s->info.args.exists("status")) { - return new RGWOp_BILog_Status; - } else { - return new RGWOp_BILog_List; - } - } else if (type.compare("data") == 0) { - if (s->info.args.exists("id")) { - if (s->info.args.exists("info")) { - return new RGWOp_DATALog_ShardInfo; - } else { - return new RGWOp_DATALog_List; - } - } else if (s->info.args.exists("status")) { - return new RGWOp_DATALog_Status; - } else { - return new RGWOp_DATALog_Info; - } - } - return NULL; -} - -RGWOp *RGWHandler_Log::op_delete() { - bool exists; - string type = s->info.args.get("type", &exists); - - if (!exists) { - return NULL; - } - - if (type.compare("metadata") == 0) - return new RGWOp_MDLog_Delete; - else if (type.compare("bucket-index") == 0) - return new RGWOp_BILog_Delete; - else if (type.compare("data") == 0) - return new RGWOp_DATALog_Delete; - return NULL; -} - -RGWOp *RGWHandler_Log::op_post() { - bool exists; - string type = s->info.args.get("type", &exists); - - if (!exists) { - return NULL; - } - - if (type.compare("metadata") == 0) { - if (s->info.args.exists("lock")) - return new RGWOp_MDLog_Lock; - else if (s->info.args.exists("unlock")) - return new RGWOp_MDLog_Unlock; - else if (s->info.args.exists("notify")) - return new RGWOp_MDLog_Notify; - } else if (type.compare("data") == 0) { - if (s->info.args.exists("notify")) { - return new RGWOp_DATALog_Notify; - } else if (s->info.args.exists("notify2")) { - return new RGWOp_DATALog_Notify2; - } - } - return NULL; -} - diff --git a/src/rgw/rgw_rest_log.h b/src/rgw/rgw_rest_log.h deleted file mode 100644 index c8a0c4df07be..000000000000 --- a/src/rgw/rgw_rest_log.h +++ /dev/null @@ -1,337 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab ft=cpp - -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2013 eNovance SAS - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#pragma once - -#include "rgw_datalog.h" -#include "rgw_rest.h" -#include "rgw_rest_s3.h" -#include "rgw_metadata.h" -#include "rgw_mdlog.h" -#include "rgw_data_sync.h" - -class RGWOp_BILog_List : public RGWRESTOp { - bool sent_header; - uint32_t format_ver{0}; - bool truncated{false}; - std::optional<rgw::bucket_log_layout_generation> next_log_layout; - -public: - RGWOp_BILog_List() : sent_header(false) {} - ~RGWOp_BILog_List() override {} - - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("bilog", RGW_CAP_READ); - } - int verify_permission(optional_yield y) override { - return check_caps(s->user->get_caps()); - } - void send_response() override; - virtual void send_response(std::list<rgw_bi_log_entry>& entries, std::string& marker); - virtual void send_response_end(); - void execute(optional_yield y) override; - const char* name() const override { - return "list_bucket_index_log"; - } -}; - -class RGWOp_BILog_Info : public RGWRESTOp { - std::string bucket_ver; - std::string master_ver; - std::string max_marker; - bool syncstopped; - uint64_t oldest_gen = 0; - uint64_t latest_gen = 0; - std::vector<store_gen_shards> generations; - -public: - RGWOp_BILog_Info() : bucket_ver(), master_ver(), syncstopped(false) {} - ~RGWOp_BILog_Info() override {} - - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("bilog", RGW_CAP_READ); - } - int verify_permission(optional_yield y) override { - return check_caps(s->user->get_caps()); - } - void send_response() override; - void execute(optional_yield y) override; - const char* name() const override { - return "bucket_index_log_info"; - } -}; - -class RGWOp_BILog_Delete : public RGWRESTOp { -public: - RGWOp_BILog_Delete() {} - ~RGWOp_BILog_Delete() override {} - - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("bilog", RGW_CAP_WRITE); - } - void execute(optional_yield y) override; - const char* name() const override { - return "trim_bucket_index_log"; - } -}; - -class RGWOp_MDLog_List : public RGWRESTOp { - std::list<cls_log_entry> entries; - std::string last_marker; - bool truncated; -public: - RGWOp_MDLog_List() : truncated(false) {} - ~RGWOp_MDLog_List() override {} - - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("mdlog", RGW_CAP_READ); - } - int verify_permission(optional_yield y) override { - return check_caps(s->user->get_caps()); - } - void execute(optional_yield y) override; - void send_response() override; - const char* name() const override { - return "list_metadata_log"; - } -}; - -class RGWOp_MDLog_Info : public RGWRESTOp { - unsigned num_objects; - RGWPeriodHistory::Cursor period; -public: - RGWOp_MDLog_Info() : num_objects(0) {} - ~RGWOp_MDLog_Info() override {} - - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("mdlog", RGW_CAP_READ); - } - int verify_permission(optional_yield y) override { - return check_caps(s->user->get_caps()); - } - void execute(optional_yield y) override; - void send_response() override; - const char* name() const override { - return "get_metadata_log_info"; - } -}; - -class RGWOp_MDLog_ShardInfo : public RGWRESTOp { - RGWMetadataLogInfo info; -public: - RGWOp_MDLog_ShardInfo() {} - ~RGWOp_MDLog_ShardInfo() override {} - - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("mdlog", RGW_CAP_READ); - } - int verify_permission(optional_yield y) override { - return check_caps(s->user->get_caps()); - } - void execute(optional_yield y) override; - void send_response() override; - const char* name() const override { - return
"get_metadata_log_shard_info"; - } -}; - -class RGWOp_MDLog_Lock : public RGWRESTOp { -public: - RGWOp_MDLog_Lock() {} - ~RGWOp_MDLog_Lock() override {} - - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("mdlog", RGW_CAP_WRITE); - } - void execute(optional_yield y) override; - const char* name() const override { - return "lock_mdlog_object"; - } -}; - -class RGWOp_MDLog_Unlock : public RGWRESTOp { -public: - RGWOp_MDLog_Unlock() {} - ~RGWOp_MDLog_Unlock() override {} - - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("mdlog", RGW_CAP_WRITE); - } - void execute(optional_yield y) override; - const char* name() const override { - return "unlock_mdlog_object"; - } -}; - -class RGWOp_MDLog_Notify : public RGWRESTOp { -public: - RGWOp_MDLog_Notify() {} - ~RGWOp_MDLog_Notify() override {} - - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("mdlog", RGW_CAP_WRITE); - } - void execute(optional_yield y) override; - const char* name() const override { - return "mdlog_notify"; - } - RGWOpType get_type() override { return RGW_OP_SYNC_MDLOG_NOTIFY; } -}; - -class RGWOp_MDLog_Delete : public RGWRESTOp { -public: - RGWOp_MDLog_Delete() {} - ~RGWOp_MDLog_Delete() override {} - - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("mdlog", RGW_CAP_WRITE); - } - void execute(optional_yield y) override; - const char* name() const override { - return "trim_metadata_log"; - } -}; - -class RGWOp_DATALog_List : public RGWRESTOp { - std::vector<rgw_data_change_log_entry> entries; - std::string last_marker; - bool truncated; - bool extra_info; -public: - RGWOp_DATALog_List() : truncated(false), extra_info(false) {} - ~RGWOp_DATALog_List() override {} - - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("datalog", RGW_CAP_READ); - } - int verify_permission(optional_yield y) override { - return check_caps(s->user->get_caps()); - } - void execute(optional_yield y) override; - void send_response() override; - const char* name() const override { - return "list_data_changes_log"; - } -}; - -class RGWOp_DATALog_Info : public RGWRESTOp { - unsigned num_objects; -public: - RGWOp_DATALog_Info() : num_objects(0) {} - ~RGWOp_DATALog_Info() override {} - - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("datalog", RGW_CAP_READ); - } - int verify_permission(optional_yield y) override { - return check_caps(s->user->get_caps()); - } - void execute(optional_yield y) override; - void send_response() override; - const char* name() const override { - return "get_data_changes_log_info"; - } -}; - -class RGWOp_DATALog_ShardInfo : public RGWRESTOp { - RGWDataChangesLogInfo info; -public: - RGWOp_DATALog_ShardInfo() {} - ~RGWOp_DATALog_ShardInfo() override {} - - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("datalog", RGW_CAP_READ); - } - int verify_permission(optional_yield y) override { - return check_caps(s->user->get_caps()); - } - void execute(optional_yield y) override; - void send_response() override; - const char* name() const override { - return "get_data_changes_log_shard_info"; - } -}; - -class RGWOp_DATALog_Notify : public RGWRESTOp { -public: - RGWOp_DATALog_Notify() {} - ~RGWOp_DATALog_Notify() override {} - - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("datalog", RGW_CAP_WRITE); - } - void execute(optional_yield y) override; - const char* name() const override { - return "datalog_notify"; - } - RGWOpType get_type() override { return
RGW_OP_SYNC_DATALOG_NOTIFY; } -}; - -class RGWOp_DATALog_Notify2 : public RGWRESTOp { - rgw_data_notify_entry data_notify; -public: - RGWOp_DATALog_Notify2() {} - ~RGWOp_DATALog_Notify2() override {} - - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("datalog", RGW_CAP_WRITE); - } - void execute(optional_yield y) override; - const char* name() const override { - return "datalog_notify2"; - } - RGWOpType get_type() override { return RGW_OP_SYNC_DATALOG_NOTIFY2; } -}; - -class RGWOp_DATALog_Delete : public RGWRESTOp { -public: - RGWOp_DATALog_Delete() {} - ~RGWOp_DATALog_Delete() override {} - - int check_caps(const RGWUserCaps& caps) override { - return caps.check_cap("datalog", RGW_CAP_WRITE); - } - void execute(optional_yield y) override; - const char* name() const override { - return "trim_data_changes_log"; - } -}; - -class RGWHandler_Log : public RGWHandler_Auth_S3 { -protected: - RGWOp *op_get() override; - RGWOp *op_delete() override; - RGWOp *op_post() override; - - int read_permissions(RGWOp*, optional_yield) override { - return 0; - } -public: - using RGWHandler_Auth_S3::RGWHandler_Auth_S3; - ~RGWHandler_Log() override = default; -}; - -class RGWRESTMgr_Log : public RGWRESTMgr { -public: - RGWRESTMgr_Log() = default; - ~RGWRESTMgr_Log() override = default; - - RGWHandler_REST* get_handler(rgw::sal::Driver* driver, - req_state* const, - const rgw::auth::StrategyRegistry& auth_registry, - const std::string& frontend_prefix) override { - return new RGWHandler_Log(auth_registry); - } -}; diff --git a/src/rgw/rgw_role.h b/src/rgw/rgw_role.h index 868578924f51..9183829d976b 100644 --- a/src/rgw/rgw_role.h +++ b/src/rgw/rgw_role.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_ROLE_H -#define CEPH_RGW_ROLE_H +#pragma once #include @@ -10,7 +9,7 @@ #include "common/ceph_json.h" #include "common/ceph_context.h" -#include "rgw/rgw_rados.h" +#include "rgw_rados.h" #include "rgw_metadata.h" class RGWRados; @@ -208,5 +207,3 @@ private: Driver* driver; }; } } // namespace rgw::sal - -#endif /* CEPH_RGW_ROLE_H */ diff --git a/src/rgw/rgw_s3select.h b/src/rgw/rgw_s3select.h index 00d55cf93ea3..4a506ba4c0a4 100644 --- a/src/rgw/rgw_s3select.h +++ b/src/rgw/rgw_s3select.h @@ -2,6 +2,8 @@ // vim: ts=8 sw=2 smarttab ft=cpp // +#pragma once + namespace rgw::s3select { RGWOp* create_s3select_op(); } diff --git a/src/rgw/rgw_string.h b/src/rgw/rgw_string.h index 90e64f98a258..e58a356f4715 100644 --- a/src/rgw/rgw_string.h +++ b/src/rgw/rgw_string.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_STRING_H -#define CEPH_RGW_STRING_H +#pragma once #include #include @@ -234,5 +233,3 @@ static constexpr uint32_t MATCH_CASE_INSENSITIVE = 0x01; extern bool match_wildcards(std::string_view pattern, std::string_view input, uint32_t flags = 0); - -#endif diff --git a/src/rgw/rgw_sts.h b/src/rgw/rgw_sts.h index f73be3765897..65dbb17477f1 100644 --- a/src/rgw/rgw_sts.h +++ b/src/rgw/rgw_sts.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_STS_H -#define CEPH_RGW_STS_H +#pragma once #include "rgw_role.h" #include "rgw_auth.h" @@ -251,4 +250,3 @@ public: AssumeRoleWithWebIdentityResponse assumeRoleWithWebIdentity(const DoutPrefixProvider *dpp, AssumeRoleWithWebIdentityRequest& req); }; } -#endif /* CEPH_RGW_STS_H */
diff --git a/src/rgw/rgw_swift_auth.h b/src/rgw/rgw_swift_auth.h index 1d5a54dde600..4b71dbc705af 100644 --- a/src/rgw/rgw_swift_auth.h +++ b/src/rgw/rgw_swift_auth.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_SWIFT_AUTH_H -#define CEPH_RGW_SWIFT_AUTH_H +#pragma once #include "rgw_common.h" #include "rgw_user.h" @@ -353,6 +352,3 @@ public: return new RGWHandler_SWIFT_Auth; } }; - - -#endif diff --git a/src/rgw/rgw_tag.h b/src/rgw/rgw_tag.h index 88a4e6652288..15bb25ee832c 100644 --- a/src/rgw/rgw_tag.h +++ b/src/rgw/rgw_tag.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_TAG_H -#define RGW_TAG_H +#pragma once #include #include @@ -48,5 +47,3 @@ protected: tag_map_t& get_tags() {return tag_map;} }; WRITE_CLASS_ENCODER(RGWObjTags) - -#endif /* RGW_TAG_H */ diff --git a/src/rgw/rgw_tag_s3.h b/src/rgw/rgw_tag_s3.h index e07e081fb48a..7cc892f1f669 100644 --- a/src/rgw/rgw_tag_s3.h +++ b/src/rgw/rgw_tag_s3.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef RGW_TAG_S3_H -#define RGW_TAG_S3_H +#pragma once #include #include @@ -48,6 +47,3 @@ public: return tagset.rebuild(dest); } }; - - -#endif /* RGW_TAG_S3_H */ diff --git a/src/rgw/rgw_tar.h b/src/rgw/rgw_tar.h index f51a7fc66407..b06943a3c185 100644 --- a/src/rgw/rgw_tar.h +++ b/src/rgw/rgw_tar.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_TAR_H -#define CEPH_RGW_TAR_H +#pragma once #include #include @@ -152,5 +151,3 @@ interpret_block(const StatusIndicator& status, ceph::bufferlist& bl) { } /* namespace tar */ } /* namespace rgw */ - -#endif /* CEPH_RGW_TAR_H */ diff --git a/src/rgw/rgw_token.h b/src/rgw/rgw_token.h index 9505f970c603..b2476596bec3 100644 --- a/src/rgw/rgw_token.h +++ b/src/rgw/rgw_token.h @@ -13,8 +13,7 @@ * */ -#ifndef RGW_TOKEN_H -#define RGW_TOKEN_H +#pragma once #include #include @@ -169,5 +168,3 @@ namespace rgw { } } /* namespace rgw */ - -#endif /* RGW_TOKEN_H */ diff --git a/src/rgw/rgw_torrent.h b/src/rgw/rgw_torrent.h index 1f62ced35179..704dba28cf86 100644 --- a/src/rgw/rgw_torrent.h +++ b/src/rgw/rgw_torrent.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_TORRENT_H -#define CEPH_RGW_TORRENT_H +#pragma once #include #include @@ -138,4 +137,3 @@ private: void sha1(SHA1 *h, bufferlist &bl, off_t bl_len); int save_torrent_file(optional_yield y); }; -#endif /* CEPH_RGW_TORRENT_H */ diff --git a/src/rgw/rgw_usage.h b/src/rgw/rgw_usage.h index ec596ed75469..b12b57df0d84 100644 --- a/src/rgw/rgw_usage.h +++ b/src/rgw/rgw_usage.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_USAGE_H -#define CEPH_RGW_USAGE_H +#pragma once #include #include @@ -29,6 +28,3 @@ public: static int clear(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver); }; - - -#endif diff --git a/src/rgw/rgw_web_idp.h b/src/rgw/rgw_web_idp.h index 089c4da665dc..a9aa5b82916e 100644 --- a/src/rgw/rgw_web_idp.h +++ b/src/rgw/rgw_web_idp.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_WEB_IDP_H -#define CEPH_RGW_WEB_IDP_H +#pragma 
once namespace rgw { namespace web_idp { @@ -25,5 +24,3 @@ struct WebTokenClaims { }; /* namespace web_idp */ }; /* namespace rgw */ - -#endif /* CEPH_RGW_WEB_IDP_H */ diff --git a/src/rgw/rgw_website.h b/src/rgw/rgw_website.h index fa3c19578fad..2ba22c87a8c7 100644 --- a/src/rgw/rgw_website.h +++ b/src/rgw/rgw_website.h @@ -14,8 +14,7 @@ * */ -#ifndef RGW_WEBSITE_H -#define RGW_WEBSITE_H +#pragma once #include #include @@ -242,5 +241,3 @@ struct RGWBucketWebsiteConf } }; WRITE_CLASS_ENCODER(RGWBucketWebsiteConf) - -#endif diff --git a/src/rgw/rgw_xml.h b/src/rgw/rgw_xml.h index 6a34344fbeb9..ab3f267ade1b 100644 --- a/src/rgw/rgw_xml.h +++ b/src/rgw/rgw_xml.h @@ -1,8 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#ifndef CEPH_RGW_XML_H -#define CEPH_RGW_XML_H +#pragma once #include #include @@ -370,6 +369,3 @@ static void encode_xml(const char *name, const std::optional& o, ceph::Format encode_xml(name, *o, f); } - - -#endif diff --git a/src/rgw/services/svc_bucket_sobj.h b/src/rgw/services/svc_bucket_sobj.h index 93e2063a7096..8e9fe063c1b1 100644 --- a/src/rgw/services/svc_bucket_sobj.h +++ b/src/rgw/services/svc_bucket_sobj.h @@ -22,13 +22,13 @@ #include "svc_meta_be.h" #include "svc_bucket_types.h" #include "svc_bucket.h" +#include "svc_bucket_sync.h" class RGWSI_Zone; class RGWSI_SysObj; class RGWSI_SysObj_Cache; class RGWSI_Meta; class RGWSI_SyncModules; -class RGWSI_Bucket_Sync; struct rgw_cache_entry_info; diff --git a/src/rgw/services/svc_user_rados.h b/src/rgw/services/svc_user_rados.h index ff1fe41989fb..177f720d6b18 100644 --- a/src/rgw/services/svc_user_rados.h +++ b/src/rgw/services/svc_user_rados.h @@ -20,6 +20,7 @@ #include "svc_meta_be.h" #include "svc_user.h" +#include "rgw_bucket.h" class RGWSI_RADOS; class RGWSI_Zone; @@ -31,8 +32,6 @@ class RGWSI_MetaBackend_Handler; struct rgw_cache_entry_info; -class RGWUserBuckets; - class RGWGetUserHeader_CB; class RGWGetUserStats_CB;