rgw_multipart_meta_filter.cc
rgw_obj_manifest.cc
rgw_period.cc
- rgw_pubsub.cc
rgw_realm.cc
rgw_sync.cc
rgw_sync_policy.cc
- rgw_pubsub_push.cc
rgw_notify_event_type.cc
rgw_period_history.cc
rgw_period_puller.cc
- rgw_reshard.cc
rgw_coroutine.cc
rgw_cr_rest.cc
rgw_op.cc
rgw_policy_s3.cc
rgw_public_access.cc
rgw_putobj.cc
- rgw_putobj_processor.cc
rgw_quota.cc
- rgw_rados.cc
rgw_resolve.cc
rgw_rest.cc
- rgw_rest_bucket.cc
rgw_rest_client.cc
rgw_rest_config.cc
rgw_rest_conn.cc
- rgw_rest_log.cc
rgw_rest_metadata.cc
rgw_rest_ratelimit.cc
rgw_rest_role.cc
driver/rados/rgw_object_expirer_core.cc
driver/rados/rgw_otp.cc
driver/rados/rgw_period.cc
+ driver/rados/rgw_pubsub.cc
+ driver/rados/rgw_pubsub_push.cc
+ driver/rados/rgw_putobj_processor.cc
+ driver/rados/rgw_rados.cc
+ driver/rados/rgw_reshard.cc
+ driver/rados/rgw_rest_bucket.cc
+ driver/rados/rgw_rest_log.cc
driver/rados/rgw_rest_pubsub.cc
driver/rados/rgw_rest_realm.cc
driver/rados/rgw_rest_user.cc
rgw_process.cc
rgw_realm_reloader.cc
rgw_realm_watcher.cc
- rgw_rest_bucket.cc
rgw_rest_config.cc
rgw_rest_info.cc
- rgw_rest_log.cc
rgw_rest_metadata.cc
rgw_rest_ratelimit.cc
- driver/rados/rgw_rest_realm.cc
rgw_rest_sts.cc
rgw_rest_swift.cc
rgw_rest_usage.cc
rgw_signal.cc
rgw_swift_auth.cc
rgw_usage.cc
- rgw_sts.cc)
+ rgw_sts.cc
+ driver/rados/rgw_rest_bucket.cc
+ driver/rados/rgw_rest_log.cc
+ driver/rados/rgw_rest_realm.cc)
gperf_generate(${CMAKE_SOURCE_DIR}/src/rgw/rgw_iam_policy_keywords.gperf
rgw_iam_policy_keywords.frag.cc)
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
-#ifndef DB_STORE_H
-#define DB_STORE_H
+#pragma once
#include <errno.h>
#include <stdlib.h>
};
} } // namespace rgw::store
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
-#ifndef DB_STORE_LOG_H
-#define DB_STORE_LOG_H
+#pragma once
#include <cerrno>
#include <cstdlib>
#undef dout_prefix
#define dout_prefix *_dout << "rgw dbstore: "
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
-#ifndef SQLITE_DB_H
-#define SQLITE_DB_H
+#pragma once
#include <errno.h>
#include <stdlib.h>
int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
};
-
-#endif
*
*/
-#ifndef CEPH_RGW_CLS_FIFO_LEGACY_H
-#define CEPH_RGW_CLS_FIFO_LEGACY_H
+#pragma once
#include <cstdint>
#include <deque>
};
}
-
-#endif // CEPH_RGW_CLS_FIFO_LEGACY_H
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_CR_RADOS_H
-#define CEPH_RGW_CR_RADOS_H
+#pragma once
#include <boost/intrusive_ptr.hpp>
#include "include/ceph_assert.h"
int operate(const DoutPrefixProvider* dpp) override;
};
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_CR_TOOLS_H
-#define CEPH_RGW_CR_TOOLS_H
+#pragma once
#include "rgw_cr_rados.h"
#include "rgw_tools.h"
using RGWBucketGetSyncPolicyHandlerCR = RGWSimpleAsyncCR<rgw_bucket_get_sync_policy_params, rgw_bucket_get_sync_policy_result>;
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGWD3NDATACACHE_H
-#define CEPH_RGWD3NDATACACHE_H
+#pragma once
#include "rgw_rados.h"
#include <curl/curl.h>
return 0;
}
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_DATA_SYNC_H
-#define CEPH_RGW_DATA_SYNC_H
+#pragma once
#include <fmt/format.h>
#include <fmt/ostream.h>
bool supports_data_export() override { return false; }
int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
};
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_DATALOG_H
-#define CEPH_RGW_DATALOG_H
+#pragma once
#include <cstdint>
#include <list>
// 1 on empty, 0 on non-empty, negative on error.
virtual int is_empty(const DoutPrefixProvider *dpp) = 0;
};
-
-
-#endif
* same algorithm used at the source cluster i.e. MD5 sum of the individual ETag
* on the MPU parts.
*/
-#ifndef CEPH_RGW_ETAG_VERIFIER_H
-#define CEPH_RGW_ETAG_VERIFIER_H
+
+#pragma once
#include "rgw_putobj.h"
#include "rgw_op.h"
etag_verifier_ptr& verifier);
} // namespace rgw::putobj
-
-#endif /* CEPH_RGW_ETAG_VERIFIER_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_GC_H
-#define CEPH_RGW_GC_H
-
+#pragma once
#include "include/types.h"
#include "include/rados/librados.hpp"
std::ostream& gen_prefix(std::ostream& out) const;
};
-
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_LC_TIER_H
-#define CEPH_RGW_LC_TIER_H
+#pragma once
#include "rgw_lc.h"
#include "rgw_rest_conn.h"
/* Transition object to cloud endpoint */
int rgw_cloud_tier_transfer_object(RGWLCCloudTierCtx& tier_ctx, std::set<std::string>& cloud_targets);
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_LOGBACKING_H
-#define CEPH_RGW_LOGBACKING_H
+#pragma once
#include <optional>
#include <iostream>
return 0;
}
};
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_METADATA_H
-#define CEPH_RGW_METADATA_H
+#pragma once
#include <string>
#include <utility>
void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::string& section, const std::string& key, std::string& name);
void rgw_shard_name(const std::string& prefix, unsigned shard_id, std::string& name);
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_OBJEXP_H
-#define CEPH_OBJEXP_H
+#pragma once
#include <atomic>
#include <string>
void start_processor();
void stop_processor();
};
-#endif /* CEPH_OBJEXP_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_OTP_H
-#define CEPH_RGW_OTP_H
+#pragma once
#include "rgw_sal_fwd.h"
#include "cls/otp/cls_otp_types.h"
const rgw_user& user, optional_yield y,
const RemoveParams& params = {});
};
-
-#endif
-
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "services/svc_zone.h"
+#include "rgw_b64.h"
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+#include "rgw_pubsub.h"
+#include "rgw_tools.h"
+#include "rgw_xml.h"
+#include "rgw_arn.h"
+#include "rgw_pubsub_push.h"
+#include <regex>
+#include <algorithm>
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
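+
+// generate a unique event id of the form "<sec>.<usec>.<hash>"
+// (seconds zero-padded to 10 digits, microseconds to 6),
+// e.g. "1640995200.123456.abcd1234" (illustrative values)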
+void set_event_id(std::string& id, const std::string& hash, const utime_t& ts) {
+ char buf[64];
+ const auto len = snprintf(buf, sizeof(buf), "%010ld.%06ld.%s", (long)ts.sec(), (long)ts.usec(), hash.c_str());
+ if (len > 0) {
+ id.assign(buf, len);
+ }
+}
+
+bool rgw_s3_key_filter::decode_xml(XMLObj* obj) {
+ XMLObjIter iter = obj->find("FilterRule");
+ XMLObj *o;
+
+ const auto throw_if_missing = true;
+ auto prefix_not_set = true;
+ auto suffix_not_set = true;
+ auto regex_not_set = true;
+ std::string name;
+
+ while ((o = iter.get_next())) {
+ RGWXMLDecoder::decode_xml("Name", name, o, throw_if_missing);
+ if (name == "prefix" && prefix_not_set) {
+ prefix_not_set = false;
+ RGWXMLDecoder::decode_xml("Value", prefix_rule, o, throw_if_missing);
+ } else if (name == "suffix" && suffix_not_set) {
+ suffix_not_set = false;
+ RGWXMLDecoder::decode_xml("Value", suffix_rule, o, throw_if_missing);
+ } else if (name == "regex" && regex_not_set) {
+ regex_not_set = false;
+ RGWXMLDecoder::decode_xml("Value", regex_rule, o, throw_if_missing);
+ } else {
+ throw RGWXMLDecoder::err("invalid/duplicate S3Key filter rule name: '" + name + "'");
+ }
+ }
+ return true;
+}
+
+void rgw_s3_key_filter::dump_xml(Formatter *f) const {
+ if (!prefix_rule.empty()) {
+ f->open_object_section("FilterRule");
+ ::encode_xml("Name", "prefix", f);
+ ::encode_xml("Value", prefix_rule, f);
+ f->close_section();
+ }
+ if (!suffix_rule.empty()) {
+ f->open_object_section("FilterRule");
+ ::encode_xml("Name", "suffix", f);
+ ::encode_xml("Value", suffix_rule, f);
+ f->close_section();
+ }
+ if (!regex_rule.empty()) {
+ f->open_object_section("FilterRule");
+ ::encode_xml("Name", "regex", f);
+ ::encode_xml("Value", regex_rule, f);
+ f->close_section();
+ }
+}
+
+bool rgw_s3_key_filter::has_content() const {
+ return !(prefix_rule.empty() && suffix_rule.empty() && regex_rule.empty());
+}
+
+bool rgw_s3_key_value_filter::decode_xml(XMLObj* obj) {
+ kv.clear();
+ XMLObjIter iter = obj->find("FilterRule");
+ XMLObj *o;
+
+ const auto throw_if_missing = true;
+
+ std::string key;
+ std::string value;
+
+ while ((o = iter.get_next())) {
+ RGWXMLDecoder::decode_xml("Name", key, o, throw_if_missing);
+ RGWXMLDecoder::decode_xml("Value", value, o, throw_if_missing);
+ kv.emplace(key, value);
+ }
+ return true;
+}
+
+void rgw_s3_key_value_filter::dump_xml(Formatter *f) const {
+ for (const auto& key_value : kv) {
+ f->open_object_section("FilterRule");
+ ::encode_xml("Name", key_value.first, f);
+ ::encode_xml("Value", key_value.second, f);
+ f->close_section();
+ }
+}
+
+bool rgw_s3_key_value_filter::has_content() const {
+ return !kv.empty();
+}
+
+bool rgw_s3_filter::decode_xml(XMLObj* obj) {
+ RGWXMLDecoder::decode_xml("S3Key", key_filter, obj);
+ RGWXMLDecoder::decode_xml("S3Metadata", metadata_filter, obj);
+ RGWXMLDecoder::decode_xml("S3Tags", tag_filter, obj);
+ return true;
+}
+
+void rgw_s3_filter::dump_xml(Formatter *f) const {
+ if (key_filter.has_content()) {
+ ::encode_xml("S3Key", key_filter, f);
+ }
+ if (metadata_filter.has_content()) {
+ ::encode_xml("S3Metadata", metadata_filter, f);
+ }
+ if (tag_filter.has_content()) {
+ ::encode_xml("S3Tags", tag_filter, f);
+ }
+}
+
+bool rgw_s3_filter::has_content() const {
+ return key_filter.has_content() ||
+ metadata_filter.has_content() ||
+ tag_filter.has_content();
+}
+
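+// return true only if the key satisfies every rule that is set in the filter,
+// e.g. (illustrative) prefix_rule="photos/" and suffix_rule=".jpg" match
+// "photos/cat.jpg" but not "photos/cat.png"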
+bool match(const rgw_s3_key_filter& filter, const std::string& key) {
+ const auto key_size = key.size();
+ const auto prefix_size = filter.prefix_rule.size();
+ if (prefix_size != 0) {
+ // prefix rule exists
+ if (prefix_size > key_size) {
+ // if prefix is longer than key, we fail
+ return false;
+ }
+ if (!std::equal(filter.prefix_rule.begin(), filter.prefix_rule.end(), key.begin())) {
+ return false;
+ }
+ }
+ const auto suffix_size = filter.suffix_rule.size();
+ if (suffix_size != 0) {
+ // suffix rule exists
+ if (suffix_size > key_size) {
+ // if suffix is longer than key, we fail
+ return false;
+ }
+ if (!std::equal(filter.suffix_rule.begin(), filter.suffix_rule.end(), (key.end() - suffix_size))) {
+ return false;
+ }
+ }
+ if (!filter.regex_rule.empty()) {
+    // TODO: add regex caching to the filter
+ const std::regex base_regex(filter.regex_rule);
+ if (!std::regex_match(key, base_regex)) {
+ return false;
+ }
+ }
+ return true;
+}
+
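+// e.g. (illustrative) a filter of {"color": "red"} matches object
+// metadata/tags of {"color": "red", "size": "L"} but not {"color": "blue"}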
+bool match(const rgw_s3_key_value_filter& filter, const KeyValueMap& kv) {
+ // all filter pairs must exist with the same value in the object's metadata/tags
+ // object metadata/tags may include items not in the filter
+ return std::includes(kv.begin(), kv.end(), filter.kv.begin(), filter.kv.end());
+}
+
+bool match(const rgw_s3_key_value_filter& filter, const KeyMultiValueMap& kv) {
+ // all filter pairs must exist with the same value in the object's metadata/tags
+ // object metadata/tags may include items not in the filter
+  for (const auto& filter_entry : filter.kv) {
+    const auto result = kv.equal_range(filter_entry.first);
+    if (std::none_of(result.first, result.second,
+          [&filter_entry](const pair<string,string>& p) { return p.second == filter_entry.second; })) {
+      return false;
+    }
+  }
+ return true;
+}
+
+bool match(const rgw::notify::EventTypeList& events, rgw::notify::EventType event) {
+ // if event list exists, and none of the events in the list matches the event type, filter the message
+ if (!events.empty() && std::find(events.begin(), events.end(), event) == events.end()) {
+ return false;
+ }
+ return true;
+}
+
+void do_decode_xml_obj(rgw::notify::EventTypeList& l, const string& name, XMLObj *obj) {
+ l.clear();
+
+ XMLObjIter iter = obj->find(name);
+ XMLObj *o;
+
+ while ((o = iter.get_next())) {
+ std::string val;
+ decode_xml_obj(val, o);
+ l.push_back(rgw::notify::from_string(val));
+ }
+}
+
+bool rgw_pubsub_s3_notification::decode_xml(XMLObj *obj) {
+ const auto throw_if_missing = true;
+ RGWXMLDecoder::decode_xml("Id", id, obj, throw_if_missing);
+
+ RGWXMLDecoder::decode_xml("Topic", topic_arn, obj, throw_if_missing);
+
+ RGWXMLDecoder::decode_xml("Filter", filter, obj);
+
+ do_decode_xml_obj(events, "Event", obj);
+ if (events.empty()) {
+ // if no events are provided, we assume all events
+ events.push_back(rgw::notify::ObjectCreated);
+ events.push_back(rgw::notify::ObjectRemoved);
+ }
+ return true;
+}
+
+void rgw_pubsub_s3_notification::dump_xml(Formatter *f) const {
+ ::encode_xml("Id", id, f);
+ ::encode_xml("Topic", topic_arn.c_str(), f);
+ if (filter.has_content()) {
+ ::encode_xml("Filter", filter, f);
+ }
+ for (const auto& event : events) {
+ ::encode_xml("Event", rgw::notify::to_string(event), f);
+ }
+}
+
+bool rgw_pubsub_s3_notifications::decode_xml(XMLObj *obj) {
+ do_decode_xml_obj(list, "TopicConfiguration", obj);
+ return true;
+}
+
+rgw_pubsub_s3_notification::rgw_pubsub_s3_notification(const rgw_pubsub_topic_filter& topic_filter) :
+ id(topic_filter.s3_id), events(topic_filter.events), topic_arn(topic_filter.topic.arn), filter(topic_filter.s3_filter) {}
+
+void rgw_pubsub_s3_notifications::dump_xml(Formatter *f) const {
+ do_encode_xml("NotificationConfiguration", list, "TopicConfiguration", f);
+}
+
+void rgw_pubsub_s3_event::dump(Formatter *f) const {
+ encode_json("eventVersion", eventVersion, f);
+ encode_json("eventSource", eventSource, f);
+ encode_json("awsRegion", awsRegion, f);
+ utime_t ut(eventTime);
+ encode_json("eventTime", ut, f);
+ encode_json("eventName", eventName, f);
+ {
+ Formatter::ObjectSection s(*f, "userIdentity");
+ encode_json("principalId", userIdentity, f);
+ }
+ {
+ Formatter::ObjectSection s(*f, "requestParameters");
+ encode_json("sourceIPAddress", sourceIPAddress, f);
+ }
+ {
+ Formatter::ObjectSection s(*f, "responseElements");
+ encode_json("x-amz-request-id", x_amz_request_id, f);
+ encode_json("x-amz-id-2", x_amz_id_2, f);
+ }
+ {
+ Formatter::ObjectSection s(*f, "s3");
+ encode_json("s3SchemaVersion", s3SchemaVersion, f);
+ encode_json("configurationId", configurationId, f);
+ {
+ Formatter::ObjectSection sub_s(*f, "bucket");
+ encode_json("name", bucket_name, f);
+ {
+ Formatter::ObjectSection sub_sub_s(*f, "ownerIdentity");
+ encode_json("principalId", bucket_ownerIdentity, f);
+ }
+ encode_json("arn", bucket_arn, f);
+ encode_json("id", bucket_id, f);
+ }
+ {
+ Formatter::ObjectSection sub_s(*f, "object");
+ encode_json("key", object_key, f);
+ encode_json("size", object_size, f);
+ encode_json("eTag", object_etag, f);
+ encode_json("versionId", object_versionId, f);
+ encode_json("sequencer", object_sequencer, f);
+ encode_json("metadata", x_meta_map, f);
+ encode_json("tags", tags, f);
+ }
+ }
+ encode_json("eventId", id, f);
+ encode_json("opaqueData", opaque_data, f);
+}
+
+void rgw_pubsub_topic::dump(Formatter *f) const
+{
+ encode_json("user", user, f);
+ encode_json("name", name, f);
+ encode_json("dest", dest, f);
+ encode_json("arn", arn, f);
+ encode_json("opaqueData", opaque_data, f);
+}
+
+void rgw_pubsub_topic::dump_xml(Formatter *f) const
+{
+ encode_xml("User", user, f);
+ encode_xml("Name", name, f);
+ encode_xml("EndPoint", dest, f);
+ encode_xml("TopicArn", arn, f);
+ encode_xml("OpaqueData", opaque_data, f);
+}
+
+void encode_xml_key_value_entry(const std::string& key, const std::string& value, Formatter *f) {
+ f->open_object_section("entry");
+ encode_xml("key", key, f);
+ encode_xml("value", value, f);
+ f->close_section(); // entry
+}
+
+void rgw_pubsub_topic::dump_xml_as_attributes(Formatter *f) const
+{
+ f->open_array_section("Attributes");
+ std::string str_user;
+ user.to_str(str_user);
+ encode_xml_key_value_entry("User", str_user, f);
+ encode_xml_key_value_entry("Name", name, f);
+ encode_xml_key_value_entry("EndPoint", dest.to_json_str(), f);
+ encode_xml_key_value_entry("TopicArn", arn, f);
+ encode_xml_key_value_entry("OpaqueData", opaque_data, f);
+ f->close_section(); // Attributes
+}
+
+void encode_json(const char *name, const rgw::notify::EventTypeList& l, Formatter *f)
+{
+ f->open_array_section(name);
+ for (auto iter = l.cbegin(); iter != l.cend(); ++iter) {
+ f->dump_string("obj", rgw::notify::to_string(*iter));
+ }
+ f->close_section();
+}
+
+void rgw_pubsub_topic_filter::dump(Formatter *f) const
+{
+ encode_json("topic", topic, f);
+ encode_json("events", events, f);
+}
+
+void rgw_pubsub_topic_subs::dump(Formatter *f) const
+{
+ encode_json("topic", topic, f);
+ encode_json("subs", subs, f);
+}
+
+void rgw_pubsub_bucket_topics::dump(Formatter *f) const
+{
+ Formatter::ArraySection s(*f, "topics");
+ for (auto& t : topics) {
+ encode_json(t.first.c_str(), t.second, f);
+ }
+}
+
+void rgw_pubsub_topics::dump(Formatter *f) const
+{
+ Formatter::ArraySection s(*f, "topics");
+ for (auto& t : topics) {
+ encode_json(t.first.c_str(), t.second, f);
+ }
+}
+
+void rgw_pubsub_topics::dump_xml(Formatter *f) const
+{
+ for (auto& t : topics) {
+ encode_xml("member", t.second.topic, f);
+ }
+}
+
+void rgw_pubsub_sub_dest::dump(Formatter *f) const
+{
+ encode_json("bucket_name", bucket_name, f);
+ encode_json("oid_prefix", oid_prefix, f);
+ encode_json("push_endpoint", push_endpoint, f);
+ encode_json("push_endpoint_args", push_endpoint_args, f);
+ encode_json("push_endpoint_topic", arn_topic, f);
+ encode_json("stored_secret", stored_secret, f);
+ encode_json("persistent", persistent, f);
+}
+
+void rgw_pubsub_sub_dest::dump_xml(Formatter *f) const
+{
+  // the first two members are omitted here since they
+  // don't apply to AWS-compliant topics
+ encode_xml("EndpointAddress", push_endpoint, f);
+ encode_xml("EndpointArgs", push_endpoint_args, f);
+ encode_xml("EndpointTopic", arn_topic, f);
+ encode_xml("HasStoredSecret", stored_secret, f);
+ encode_xml("Persistent", persistent, f);
+}
+
+std::string rgw_pubsub_sub_dest::to_json_str() const
+{
+  // the first two members are omitted here since they
+  // don't apply to AWS-compliant topics
+ JSONFormatter f;
+ f.open_object_section("");
+ encode_json("EndpointAddress", push_endpoint, &f);
+ encode_json("EndpointArgs", push_endpoint_args, &f);
+ encode_json("EndpointTopic", arn_topic, &f);
+ encode_json("HasStoredSecret", stored_secret, &f);
+ encode_json("Persistent", persistent, &f);
+ f.close_section();
+ std::stringstream ss;
+ f.flush(ss);
+ return ss.str();
+}
+
+void rgw_pubsub_sub_config::dump(Formatter *f) const
+{
+ encode_json("user", user, f);
+ encode_json("name", name, f);
+ encode_json("topic", topic, f);
+ encode_json("dest", dest, f);
+ encode_json("s3_id", s3_id, f);
+}
+
+RGWPubSub::RGWPubSub(rgw::sal::RadosStore* _store, const std::string& _tenant)
+ : store(_store), tenant(_tenant), svc_sysobj(store->svc()->sysobj)
+{
+ get_meta_obj(&meta_obj);
+}
+
+int RGWPubSub::remove(const DoutPrefixProvider *dpp,
+ const rgw_raw_obj& obj,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y)
+{
+ int ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, obj.pool, obj.oid, objv_tracker, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWPubSub::read_topics(rgw_pubsub_topics *result, RGWObjVersionTracker *objv_tracker)
+{
+ int ret = read(meta_obj, result, objv_tracker);
+ if (ret < 0) {
+ ldout(store->ctx(), 10) << "WARNING: failed to read topics info: ret=" << ret << dendl;
+ return ret;
+ }
+ return 0;
+}
+
+int RGWPubSub::write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_topics& topics,
+ RGWObjVersionTracker *objv_tracker, optional_yield y)
+{
+ int ret = write(dpp, meta_obj, topics, objv_tracker, y);
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(dpp, 1) << "ERROR: failed to write topics info: ret=" << ret << dendl;
+ return ret;
+ }
+ return 0;
+}
+
+int RGWPubSub::get_topics(rgw_pubsub_topics *result)
+{
+ return read_topics(result, nullptr);
+}
+
+int RGWPubSub::Bucket::read_topics(rgw_pubsub_bucket_topics *result, RGWObjVersionTracker *objv_tracker)
+{
+ int ret = ps->read(bucket_meta_obj, result, objv_tracker);
+ if (ret < 0 && ret != -ENOENT) {
+ ldout(ps->store->ctx(), 1) << "ERROR: failed to read bucket topics info: ret=" << ret << dendl;
+ return ret;
+ }
+ return 0;
+}
+
+int RGWPubSub::Bucket::write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_bucket_topics& topics,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y)
+{
+ int ret = ps->write(dpp, bucket_meta_obj, topics, objv_tracker, y);
+ if (ret < 0) {
+ ldout(ps->store->ctx(), 1) << "ERROR: failed to write bucket topics info: ret=" << ret << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWPubSub::Bucket::get_topics(rgw_pubsub_bucket_topics *result)
+{
+ return read_topics(result, nullptr);
+}
+
+int RGWPubSub::get_topic(const string& name, rgw_pubsub_topic_subs *result)
+{
+ rgw_pubsub_topics topics;
+ int ret = get_topics(&topics);
+ if (ret < 0) {
+ ldout(store->ctx(), 1) << "ERROR: failed to read topics info: ret=" << ret << dendl;
+ return ret;
+ }
+
+ auto iter = topics.topics.find(name);
+ if (iter == topics.topics.end()) {
+ ldout(store->ctx(), 1) << "ERROR: topic not found" << dendl;
+ return -ENOENT;
+ }
+
+ *result = iter->second;
+ return 0;
+}
+
+int RGWPubSub::get_topic(const string& name, rgw_pubsub_topic *result)
+{
+ rgw_pubsub_topics topics;
+ int ret = get_topics(&topics);
+ if (ret < 0) {
+ ldout(store->ctx(), 1) << "ERROR: failed to read topics info: ret=" << ret << dendl;
+ return ret;
+ }
+
+ auto iter = topics.topics.find(name);
+ if (iter == topics.topics.end()) {
+ ldout(store->ctx(), 1) << "ERROR: topic not found" << dendl;
+ return -ENOENT;
+ }
+
+ *result = iter->second.topic;
+ return 0;
+}
+
+int RGWPubSub::Bucket::create_notification(const DoutPrefixProvider *dpp, const string& topic_name, const rgw::notify::EventTypeList& events, optional_yield y) {
+ return create_notification(dpp, topic_name, events, std::nullopt, "", y);
+}
+
+int RGWPubSub::Bucket::create_notification(const DoutPrefixProvider *dpp, const string& topic_name,const rgw::notify::EventTypeList& events, OptionalFilter s3_filter, const std::string& notif_name, optional_yield y) {
+ rgw_pubsub_topic_subs topic_info;
+
+ int ret = ps->get_topic(topic_name, &topic_info);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "ERROR: failed to read topic '" << topic_name << "' info: ret=" << ret << dendl;
+ return ret;
+ }
+ ldpp_dout(dpp, 20) << "successfully read topic '" << topic_name << "' info" << dendl;
+
+ RGWObjVersionTracker objv_tracker;
+ rgw_pubsub_bucket_topics bucket_topics;
+
+ ret = read_topics(&bucket_topics, &objv_tracker);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "ERROR: failed to read topics from bucket '" <<
+ bucket.name << "': ret=" << ret << dendl;
+ return ret;
+ }
+ ldpp_dout(dpp, 20) << "successfully read " << bucket_topics.topics.size() << " topics from bucket '" <<
+ bucket.name << "'" << dendl;
+
+ auto& topic_filter = bucket_topics.topics[topic_name];
+ topic_filter.topic = topic_info.topic;
+ topic_filter.events = events;
+ topic_filter.s3_id = notif_name;
+ if (s3_filter) {
+ topic_filter.s3_filter = *s3_filter;
+ }
+
+ ret = write_topics(dpp, bucket_topics, &objv_tracker, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "ERROR: failed to write topics to bucket '" << bucket.name << "': ret=" << ret << dendl;
+ return ret;
+ }
+
+ ldpp_dout(dpp, 20) << "successfully wrote " << bucket_topics.topics.size() << " topics to bucket '" << bucket.name << "'" << dendl;
+
+ return 0;
+}
+
+int RGWPubSub::Bucket::remove_notification(const DoutPrefixProvider *dpp, const string& topic_name, optional_yield y)
+{
+ rgw_pubsub_topic_subs topic_info;
+
+ int ret = ps->get_topic(topic_name, &topic_info);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "ERROR: failed to read topic info: ret=" << ret << dendl;
+ return ret;
+ }
+
+ RGWObjVersionTracker objv_tracker;
+ rgw_pubsub_bucket_topics bucket_topics;
+
+ ret = read_topics(&bucket_topics, &objv_tracker);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "ERROR: failed to read bucket topics info: ret=" << ret << dendl;
+ return ret;
+ }
+
+ bucket_topics.topics.erase(topic_name);
+
+ if (bucket_topics.topics.empty()) {
+ // no more topics - delete the notification object of the bucket
+ ret = ps->remove(dpp, bucket_meta_obj, &objv_tracker, y);
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(dpp, 1) << "ERROR: failed to remove bucket topics: ret=" << ret << dendl;
+ return ret;
+ }
+ return 0;
+ }
+
+ // write back the notifications without the deleted one
+ ret = write_topics(dpp, bucket_topics, &objv_tracker, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "ERROR: failed to write topics info: ret=" << ret << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWPubSub::Bucket::remove_notifications(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ // get all topics on a bucket
+ rgw_pubsub_bucket_topics bucket_topics;
+ auto ret = get_topics(&bucket_topics);
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(dpp, 1) << "ERROR: failed to get list of topics from bucket '" << bucket.name << "', ret=" << ret << dendl;
+    return ret;
+ }
+
+  // remove all auto-generated topics
+ for (const auto& topic : bucket_topics.topics) {
+ const auto& topic_name = topic.first;
+ ret = ps->remove_topic(dpp, topic_name, y);
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(dpp, 5) << "WARNING: failed to remove auto-generated topic '" << topic_name << "', ret=" << ret << dendl;
+ }
+ }
+
+ // delete the notification object of the bucket
+ ret = ps->remove(dpp, bucket_meta_obj, nullptr, y);
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(dpp, 1) << "ERROR: failed to remove bucket topics: ret=" << ret << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWPubSub::create_topic(const DoutPrefixProvider *dpp, const string& name, optional_yield y) {
+ return create_topic(dpp, name, rgw_pubsub_sub_dest(), "", "", y);
+}
+
+int RGWPubSub::create_topic(const DoutPrefixProvider *dpp, const string& name, const rgw_pubsub_sub_dest& dest, const std::string& arn, const std::string& opaque_data, optional_yield y) {
+ RGWObjVersionTracker objv_tracker;
+ rgw_pubsub_topics topics;
+
+ int ret = read_topics(&topics, &objv_tracker);
+ if (ret < 0 && ret != -ENOENT) {
+    // it's not an error if no topics exist; we will create one
+ ldpp_dout(dpp, 1) << "ERROR: failed to read topics info: ret=" << ret << dendl;
+ return ret;
+ }
+
+ rgw_pubsub_topic_subs& new_topic = topics.topics[name];
+ new_topic.topic.user = rgw_user("", tenant);
+ new_topic.topic.name = name;
+ new_topic.topic.dest = dest;
+ new_topic.topic.arn = arn;
+ new_topic.topic.opaque_data = opaque_data;
+
+ ret = write_topics(dpp, topics, &objv_tracker, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "ERROR: failed to write topics info: ret=" << ret << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWPubSub::remove_topic(const DoutPrefixProvider *dpp, const string& name, optional_yield y)
+{
+ RGWObjVersionTracker objv_tracker;
+ rgw_pubsub_topics topics;
+
+ int ret = read_topics(&topics, &objv_tracker);
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(dpp, 1) << "ERROR: failed to read topics info: ret=" << ret << dendl;
+ return ret;
+ } else if (ret == -ENOENT) {
+    // it's not an error if no topics exist; deletion is a no-op
+ ldpp_dout(dpp, 10) << "WARNING: failed to read topics info, deletion is a no-op: ret=" << ret << dendl;
+ return 0;
+ }
+
+ topics.topics.erase(name);
+
+ ret = write_topics(dpp, topics, &objv_tracker, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "ERROR: failed to remove topics info: ret=" << ret << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+void RGWPubSub::get_meta_obj(rgw_raw_obj *obj) const {
+ *obj = rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, meta_oid());
+}
+
+void RGWPubSub::get_bucket_meta_obj(const rgw_bucket& bucket, rgw_raw_obj *obj) const {
+ *obj = rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, bucket_meta_oid(bucket));
+}
+
+void RGWPubSub::get_sub_meta_obj(const string& name, rgw_raw_obj *obj) const {
+ *obj = rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sub_meta_oid(name));
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "services/svc_sys_obj.h"
+#include "rgw_tools.h"
+#include "rgw_zone.h"
+#include "rgw_notify_event_type.h"
+#include <boost/container/flat_map.hpp>
+
+namespace rgw::sal { class RadosStore; }
+
+class XMLObj;
+
+struct rgw_s3_key_filter {
+ std::string prefix_rule;
+ std::string suffix_rule;
+ std::string regex_rule;
+
+ bool has_content() const;
+
+ bool decode_xml(XMLObj *obj);
+ void dump_xml(Formatter *f) const;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(prefix_rule, bl);
+ encode(suffix_rule, bl);
+ encode(regex_rule, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(prefix_rule, bl);
+ decode(suffix_rule, bl);
+ decode(regex_rule, bl);
+ DECODE_FINISH(bl);
+ }
+};
+WRITE_CLASS_ENCODER(rgw_s3_key_filter)
+
+using KeyValueMap = boost::container::flat_map<std::string, std::string>;
+using KeyMultiValueMap = std::multimap<std::string, std::string>;
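+// metadata uses unique keys (flat_map); tags are stored in a multimap
+// so a key may carry multiple values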
+
+struct rgw_s3_key_value_filter {
+ KeyValueMap kv;
+
+ bool has_content() const;
+
+ bool decode_xml(XMLObj *obj);
+ void dump_xml(Formatter *f) const;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(kv, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(kv, bl);
+ DECODE_FINISH(bl);
+ }
+};
+WRITE_CLASS_ENCODER(rgw_s3_key_value_filter)
+
+struct rgw_s3_filter {
+ rgw_s3_key_filter key_filter;
+ rgw_s3_key_value_filter metadata_filter;
+ rgw_s3_key_value_filter tag_filter;
+
+ bool has_content() const;
+
+ bool decode_xml(XMLObj *obj);
+ void dump_xml(Formatter *f) const;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(2, 1, bl);
+ encode(key_filter, bl);
+ encode(metadata_filter, bl);
+ encode(tag_filter, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(2, bl);
+ decode(key_filter, bl);
+ decode(metadata_filter, bl);
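+    // tag_filter was added in encoding v2; older blobs do not carry it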
+ if (struct_v >= 2) {
+ decode(tag_filter, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+};
+WRITE_CLASS_ENCODER(rgw_s3_filter)
+
+using OptionalFilter = std::optional<rgw_s3_filter>;
+
+struct rgw_pubsub_topic_filter;
+/* S3 notification configuration
+ * based on: https://docs.aws.amazon.com/AmazonS3/latest/API/RESTBucketPUTnotification.html
+<NotificationConfiguration xmlns="http://s3.amazonaws.com/doc/2006-03-01/">
+ <TopicConfiguration>
+ <Filter>
+ <S3Key>
+ <FilterRule>
+ <Name>suffix</Name>
+ <Value>jpg</Value>
+ </FilterRule>
+ </S3Key>
+ <S3Metadata>
+ <FilterRule>
+ <Name></Name>
+ <Value></Value>
+ </FilterRule>
+ </S3Metadata>
+ <S3Tags>
+ <FilterRule>
+ <Name></Name>
+ <Value></Value>
+ </FilterRule>
+ </S3Tags>
+ </Filter>
+ <Id>notification1</Id>
+ <Topic>arn:aws:sns:<region>:<account>:<topic></Topic>
+ <Event>s3:ObjectCreated:*</Event>
+ <Event>s3:ObjectRemoved:*</Event>
+ </TopicConfiguration>
+</NotificationConfiguration>
+*/
+struct rgw_pubsub_s3_notification {
+ // notification id
+ std::string id;
+ // types of events
+ rgw::notify::EventTypeList events;
+ // topic ARN
+ std::string topic_arn;
+ // filter rules
+ rgw_s3_filter filter;
+
+ bool decode_xml(XMLObj *obj);
+ void dump_xml(Formatter *f) const;
+
+ rgw_pubsub_s3_notification() = default;
+ // construct from rgw_pubsub_topic_filter (used by get/list notifications)
+ explicit rgw_pubsub_s3_notification(const rgw_pubsub_topic_filter& topic_filter);
+};
+
+// return true if the key matches the prefix/suffix/regex rules of the key filter
+bool match(const rgw_s3_key_filter& filter, const std::string& key);
+
+// return true if the key matches the metadata rules of the metadata filter
+bool match(const rgw_s3_key_value_filter& filter, const KeyValueMap& kv);
+
+// return true if the key matches the tag rules of the tag filter
+bool match(const rgw_s3_key_value_filter& filter, const KeyMultiValueMap& kv);
+
+// return true if the event type matches (equal or contained in) one of the events in the list
+bool match(const rgw::notify::EventTypeList& events, rgw::notify::EventType event);
+
+struct rgw_pubsub_s3_notifications {
+ std::list<rgw_pubsub_s3_notification> list;
+ bool decode_xml(XMLObj *obj);
+ void dump_xml(Formatter *f) const;
+};
+
+/* S3 event records structure
+ * based on: https://docs.aws.amazon.com/AmazonS3/latest/dev/notification-content-structure.html
+{
+"Records":[
+ {
+ "eventVersion":""
+ "eventSource":"",
+ "awsRegion":"",
+ "eventTime":"",
+ "eventName":"",
+ "userIdentity":{
+ "principalId":""
+ },
+ "requestParameters":{
+ "sourceIPAddress":""
+ },
+ "responseElements":{
+ "x-amz-request-id":"",
+ "x-amz-id-2":""
+ },
+ "s3":{
+ "s3SchemaVersion":"1.0",
+ "configurationId":"",
+ "bucket":{
+ "name":"",
+ "ownerIdentity":{
+ "principalId":""
+ },
+ "arn":""
+ "id": ""
+ },
+ "object":{
+ "key":"",
+ "size": ,
+ "eTag":"",
+ "versionId":"",
+ "sequencer": "",
+ "metadata": ""
+ "tags": ""
+ }
+ },
+ "eventId":"",
+ }
+]
+}*/
+
+struct rgw_pubsub_s3_event {
+ constexpr static const char* const json_type_plural = "Records";
+ std::string eventVersion = "2.2";
+  // "aws:s3" in AWS; radosgw uses "ceph:s3"
+  std::string eventSource = "ceph:s3";
+ // zonegroup
+ std::string awsRegion;
+ // time of the request
+ ceph::real_time eventTime;
+ // type of the event
+ std::string eventName;
+ // user that sent the request
+ std::string userIdentity;
+ // IP address of source of the request (not implemented)
+ std::string sourceIPAddress;
+ // request ID (not implemented)
+ std::string x_amz_request_id;
+ // radosgw that received the request
+ std::string x_amz_id_2;
+ std::string s3SchemaVersion = "1.0";
+ // ID received in the notification request
+ std::string configurationId;
+ // bucket name
+ std::string bucket_name;
+ // bucket owner
+ std::string bucket_ownerIdentity;
+ // bucket ARN
+ std::string bucket_arn;
+ // object key
+ std::string object_key;
+ // object size
+ uint64_t object_size = 0;
+ // object etag
+ std::string object_etag;
+  // object version id, if the bucket is versioned
+ std::string object_versionId;
+ // hexadecimal value used to determine event order for specific key
+ std::string object_sequencer;
+ // this is an rgw extension (not S3 standard)
+ // used to store a globally unique identifier of the event
+ // that could be used for acking or any other identification of the event
+ std::string id;
+ // this is an rgw extension holding the internal bucket id
+ std::string bucket_id;
+ // meta data
+ KeyValueMap x_meta_map;
+ // tags
+ KeyMultiValueMap tags;
+ // opaque data received from the topic
+ // could be used to identify the gateway
+ std::string opaque_data;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(4, 1, bl);
+ encode(eventVersion, bl);
+ encode(eventSource, bl);
+ encode(awsRegion, bl);
+ encode(eventTime, bl);
+ encode(eventName, bl);
+ encode(userIdentity, bl);
+ encode(sourceIPAddress, bl);
+ encode(x_amz_request_id, bl);
+ encode(x_amz_id_2, bl);
+ encode(s3SchemaVersion, bl);
+ encode(configurationId, bl);
+ encode(bucket_name, bl);
+ encode(bucket_ownerIdentity, bl);
+ encode(bucket_arn, bl);
+ encode(object_key, bl);
+ encode(object_size, bl);
+ encode(object_etag, bl);
+ encode(object_versionId, bl);
+ encode(object_sequencer, bl);
+ encode(id, bl);
+ encode(bucket_id, bl);
+ encode(x_meta_map, bl);
+ encode(tags, bl);
+ encode(opaque_data, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(4, bl);
+ decode(eventVersion, bl);
+ decode(eventSource, bl);
+ decode(awsRegion, bl);
+ decode(eventTime, bl);
+ decode(eventName, bl);
+ decode(userIdentity, bl);
+ decode(sourceIPAddress, bl);
+ decode(x_amz_request_id, bl);
+ decode(x_amz_id_2, bl);
+ decode(s3SchemaVersion, bl);
+ decode(configurationId, bl);
+ decode(bucket_name, bl);
+ decode(bucket_ownerIdentity, bl);
+ decode(bucket_arn, bl);
+ decode(object_key, bl);
+ decode(object_size, bl);
+ decode(object_etag, bl);
+ decode(object_versionId, bl);
+ decode(object_sequencer, bl);
+ decode(id, bl);
+ if (struct_v >= 2) {
+ decode(bucket_id, bl);
+ decode(x_meta_map, bl);
+ }
+ if (struct_v >= 3) {
+ decode(tags, bl);
+ }
+ if (struct_v >= 4) {
+ decode(opaque_data, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(rgw_pubsub_s3_event)
+
+// setting a unique ID for an event based on object hash and timestamp
+void set_event_id(std::string& id, const std::string& hash, const utime_t& ts);
+
+struct rgw_pubsub_sub_dest {
+ std::string bucket_name;
+ std::string oid_prefix;
+ std::string push_endpoint;
+ std::string push_endpoint_args;
+ std::string arn_topic;
+ bool stored_secret = false;
+ bool persistent = false;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(5, 1, bl);
+ encode(bucket_name, bl);
+ encode(oid_prefix, bl);
+ encode(push_endpoint, bl);
+ encode(push_endpoint_args, bl);
+ encode(arn_topic, bl);
+ encode(stored_secret, bl);
+ encode(persistent, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(5, bl);
+ decode(bucket_name, bl);
+ decode(oid_prefix, bl);
+ decode(push_endpoint, bl);
+ if (struct_v >= 2) {
+ decode(push_endpoint_args, bl);
+ }
+ if (struct_v >= 3) {
+ decode(arn_topic, bl);
+ }
+ if (struct_v >= 4) {
+ decode(stored_secret, bl);
+ }
+ if (struct_v >= 5) {
+ decode(persistent, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+ void dump_xml(Formatter *f) const;
+ std::string to_json_str() const;
+};
+WRITE_CLASS_ENCODER(rgw_pubsub_sub_dest)
+
+struct rgw_pubsub_sub_config {
+ rgw_user user;
+ std::string name;
+ std::string topic;
+ rgw_pubsub_sub_dest dest;
+ std::string s3_id;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(2, 1, bl);
+ encode(user, bl);
+ encode(name, bl);
+ encode(topic, bl);
+ encode(dest, bl);
+ encode(s3_id, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(2, bl);
+ decode(user, bl);
+ decode(name, bl);
+ decode(topic, bl);
+ decode(dest, bl);
+ if (struct_v >= 2) {
+ decode(s3_id, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(rgw_pubsub_sub_config)
+
+struct rgw_pubsub_topic {
+ rgw_user user;
+ std::string name;
+ rgw_pubsub_sub_dest dest;
+ std::string arn;
+ std::string opaque_data;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(3, 1, bl);
+ encode(user, bl);
+ encode(name, bl);
+ encode(dest, bl);
+ encode(arn, bl);
+ encode(opaque_data, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(3, bl);
+ decode(user, bl);
+ decode(name, bl);
+ if (struct_v >= 2) {
+ decode(dest, bl);
+ decode(arn, bl);
+ }
+ if (struct_v >= 3) {
+ decode(opaque_data, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+
+ std::string to_str() const {
+ return user.tenant + "/" + name;
+ }
+
+ void dump(Formatter *f) const;
+ void dump_xml(Formatter *f) const;
+ void dump_xml_as_attributes(Formatter *f) const;
+
+  bool operator<(const rgw_pubsub_topic& t) const {
+    return to_str() < t.to_str();
+  }
+};
+WRITE_CLASS_ENCODER(rgw_pubsub_topic)
+
+struct rgw_pubsub_topic_subs {
+ rgw_pubsub_topic topic;
+ std::set<std::string> subs;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(topic, bl);
+ encode(subs, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(topic, bl);
+ decode(subs, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(rgw_pubsub_topic_subs)
+
+struct rgw_pubsub_topic_filter {
+ rgw_pubsub_topic topic;
+ rgw::notify::EventTypeList events;
+ std::string s3_id;
+ rgw_s3_filter s3_filter;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(3, 1, bl);
+ encode(topic, bl);
+ // events are stored as a vector of std::strings
+ std::vector<std::string> tmp_events;
+ std::transform(events.begin(), events.end(), std::back_inserter(tmp_events), rgw::notify::to_string);
+ encode(tmp_events, bl);
+ encode(s3_id, bl);
+ encode(s3_filter, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(3, bl);
+ decode(topic, bl);
+ // events are stored as a vector of std::strings
+ events.clear();
+ std::vector<std::string> tmp_events;
+ decode(tmp_events, bl);
+ std::transform(tmp_events.begin(), tmp_events.end(), std::back_inserter(events), rgw::notify::from_string);
+ if (struct_v >= 2) {
+ decode(s3_id, bl);
+ }
+ if (struct_v >= 3) {
+ decode(s3_filter, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(rgw_pubsub_topic_filter)
+
+struct rgw_pubsub_bucket_topics {
+ std::map<std::string, rgw_pubsub_topic_filter> topics;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(topics, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(topics, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(rgw_pubsub_bucket_topics)
+
+struct rgw_pubsub_topics {
+ std::map<std::string, rgw_pubsub_topic_subs> topics;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(topics, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(topics, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+ void dump_xml(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(rgw_pubsub_topics)
+
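+// all pubsub metadata objects share this prefix; the oids below expand to
+// "pubsub.<tenant>" (topic list), "pubsub.<tenant>.bucket.<name>/<marker>"
+// (bucket notifications) and "pubsub.<tenant>.sub.<name>" (subscriptions)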
+static const std::string pubsub_oid_prefix = "pubsub.";
+
+class RGWPubSub
+{
+ friend class Bucket;
+
+ rgw::sal::RadosStore* store;
+ const std::string tenant;
+ RGWSI_SysObj* svc_sysobj;
+
+ rgw_raw_obj meta_obj;
+
+ std::string meta_oid() const {
+ return pubsub_oid_prefix + tenant;
+ }
+
+ std::string bucket_meta_oid(const rgw_bucket& bucket) const {
+ return pubsub_oid_prefix + tenant + ".bucket." + bucket.name + "/" + bucket.marker;
+ }
+
+ std::string sub_meta_oid(const std::string& name) const {
+ return pubsub_oid_prefix + tenant + ".sub." + name;
+ }
+
+ template <class T>
+ int read(const rgw_raw_obj& obj, T* data, RGWObjVersionTracker* objv_tracker);
+
+ template <class T>
+ int write(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, const T& info,
+ RGWObjVersionTracker* obj_tracker, optional_yield y);
+
+ int remove(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, RGWObjVersionTracker* objv_tracker,
+ optional_yield y);
+
+ int read_topics(rgw_pubsub_topics *result, RGWObjVersionTracker* objv_tracker);
+ int write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_topics& topics,
+ RGWObjVersionTracker* objv_tracker, optional_yield y);
+
+public:
+ RGWPubSub(rgw::sal::RadosStore* _store, const std::string& tenant);
+
+ class Bucket {
+ friend class RGWPubSub;
+ RGWPubSub *ps;
+ rgw_bucket bucket;
+ rgw_raw_obj bucket_meta_obj;
+
+ // read the list of topics associated with a bucket and populate into result
+  // use the version tracker to enforce atomicity between read/write
+ // return 0 on success or if no topic was associated with the bucket, error code otherwise
+ int read_topics(rgw_pubsub_bucket_topics *result, RGWObjVersionTracker* objv_tracker);
+ // set the list of topics associated with a bucket
+  // use the version tracker to enforce atomicity between read/write
+ // return 0 on success, error code otherwise
+ int write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_bucket_topics& topics,
+ RGWObjVersionTracker* objv_tracker, optional_yield y);
+ public:
+ Bucket(RGWPubSub *_ps, const rgw_bucket& _bucket) : ps(_ps), bucket(_bucket) {
+ ps->get_bucket_meta_obj(bucket, &bucket_meta_obj);
+ }
+
+ // read the list of topics associated with a bucket and populate into result
+ // return 0 on success or if no topic was associated with the bucket, error code otherwise
+ int get_topics(rgw_pubsub_bucket_topics *result);
+  // add a topic + filter (event list, and possibly key name, metadata, or tag filters) to a bucket
+  // assigning a notification name is optional (needed for S3-compatible notifications)
+  // if the topic already exists on the bucket, the filter/event list may be updated
+  // for S3-compliant notifications, the overload taking s3_filter and notif_name should be used
+  // return -ENOENT if the topic does not exist
+  // return 0 on success, error code otherwise
+ int create_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, const rgw::notify::EventTypeList& events, optional_yield y);
+ int create_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, const rgw::notify::EventTypeList& events, OptionalFilter s3_filter, const std::string& notif_name, optional_yield y);
+ // remove a topic and filter from bucket
+  // if the topic does not exist on the bucket it is a no-op (considered success)
+  // return -ENOENT if the topic itself does not exist
+ // return 0 on success, error code otherwise
+ int remove_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, optional_yield y);
+ // remove all notifications (and autogenerated topics) associated with the bucket
+ // return 0 on success or if no topic was associated with the bucket, error code otherwise
+ int remove_notifications(const DoutPrefixProvider *dpp, optional_yield y);
+ };
+
+ using BucketRef = std::shared_ptr<Bucket>;
+
+ BucketRef get_bucket(const rgw_bucket& bucket) {
+ return std::make_shared<Bucket>(this, bucket);
+ }
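+
+  // illustrative usage (assuming valid store and bucket objects):
+  //   RGWPubSub ps(store, tenant);
+  //   auto b = ps.get_bucket(bucket_info.bucket);
+  //   rgw_pubsub_bucket_topics topics;
+  //   int ret = b->get_topics(&topics);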
+
+ void get_meta_obj(rgw_raw_obj *obj) const;
+ void get_bucket_meta_obj(const rgw_bucket& bucket, rgw_raw_obj *obj) const;
+
+ void get_sub_meta_obj(const std::string& name, rgw_raw_obj *obj) const;
+
+  // get all topics (per tenant, if used) and populate them into "result"
+ // return 0 on success or if no topics exist, error code otherwise
+ int get_topics(rgw_pubsub_topics *result);
+  // get a topic with its subscriptions by its name and populate it into "result"
+  // return -ENOENT if the topic does not exist
+ // return 0 on success, error code otherwise
+ int get_topic(const std::string& name, rgw_pubsub_topic_subs *result);
+  // get a topic by its name and populate it into "result"
+  // return -ENOENT if the topic does not exist
+ // return 0 on success, error code otherwise
+ int get_topic(const std::string& name, rgw_pubsub_topic *result);
+ // create a topic with a name only
+ // if the topic already exists it is a no-op (considered success)
+ // return 0 on success, error code otherwise
+ int create_topic(const DoutPrefixProvider *dpp, const std::string& name, optional_yield y);
+ // create a topic with push destination information and ARN
+  // if the topic already exists, the destination and ARN values may be updated (considered success)
+ // return 0 on success, error code otherwise
+ int create_topic(const DoutPrefixProvider *dpp, const std::string& name, const rgw_pubsub_sub_dest& dest, const std::string& arn, const std::string& opaque_data, optional_yield y);
+ // remove a topic according to its name
+  // if the topic does not exist it is a no-op (considered success)
+ // return 0 on success, error code otherwise
+ int remove_topic(const DoutPrefixProvider *dpp, const std::string& name, optional_yield y);
+};
+
+
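+// read/write helpers: (de)serialize a T to/from a rados system object,
+// using the supplied version tracker for read-modify-write atomicity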
+template <class T>
+int RGWPubSub::read(const rgw_raw_obj& obj, T* result, RGWObjVersionTracker* objv_tracker)
+{
+ bufferlist bl;
+ int ret = rgw_get_system_obj(svc_sysobj,
+ obj.pool, obj.oid,
+ bl,
+ objv_tracker,
+ nullptr, null_yield, nullptr, nullptr);
+ if (ret < 0) {
+ return ret;
+ }
+
+ auto iter = bl.cbegin();
+ try {
+ decode(*result, iter);
+ } catch (buffer::error& err) {
+ return -EIO;
+ }
+
+ return 0;
+}
+
+template <class T>
+int RGWPubSub::write(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, const T& info,
+ RGWObjVersionTracker* objv_tracker, optional_yield y)
+{
+ bufferlist bl;
+ encode(info, bl);
+
+ return rgw_put_system_obj(dpp, svc_sysobj, obj.pool, obj.oid,
+ bl, false, objv_tracker, real_time(), y);
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_pubsub_push.h"
+#include <string>
+#include <sstream>
+#include <algorithm>
+#include "include/buffer_fwd.h"
+#include "common/Formatter.h"
+#include "common/iso_8601.h"
+#include "common/async/completion.h"
+#include "rgw_common.h"
+#include "rgw_data_sync.h"
+#include "rgw_pubsub.h"
+#include "acconfig.h"
+#ifdef WITH_RADOSGW_AMQP_ENDPOINT
+#include "rgw_amqp.h"
+#endif
+#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
+#include "rgw_kafka.h"
+#endif
+#include <boost/asio/yield.hpp>
+#include <boost/algorithm/string.hpp>
+#include <functional>
+#include "rgw_perf_counters.h"
+
+using namespace rgw;
+
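+// serialize an event into a JSON document with a top-level array keyed by
+// EventType::json_type_plural, e.g. {"Records":[{...}]} for S3 events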
+template<typename EventType>
+std::string json_format_pubsub_event(const EventType& event) {
+ std::stringstream ss;
+ JSONFormatter f(false);
+ {
+ Formatter::ObjectSection s(f, EventType::json_type_plural);
+ {
+ Formatter::ArraySection s(f, EventType::json_type_plural);
+ encode_json("", event, &f);
+ }
+ }
+ f.flush(ss);
+ return ss.str();
+}
+
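+// parse an optional boolean argument: returns the default when absent,
+// throws configuration_error on a malformed value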
+bool get_bool(const RGWHTTPArgs& args, const std::string& name, bool default_value) {
+ bool value;
+ bool exists;
+ if (args.get_bool(name.c_str(), &value, &exists) == -EINVAL) {
+ throw RGWPubSubEndpoint::configuration_error("invalid boolean value for " + name);
+ }
+ if (!exists) {
+ return default_value;
+ }
+ return value;
+}
+
+class RGWPubSubHTTPEndpoint : public RGWPubSubEndpoint {
+private:
+ const std::string endpoint;
+ typedef unsigned ack_level_t;
+ ack_level_t ack_level; // TODO: not used for now
+ const bool verify_ssl;
+ const bool cloudevents;
+ static const ack_level_t ACK_LEVEL_ANY = 0;
+ static const ack_level_t ACK_LEVEL_NON_ERROR = 1;
+
+public:
+ RGWPubSubHTTPEndpoint(const std::string& _endpoint, const RGWHTTPArgs& args) :
+ endpoint(_endpoint), verify_ssl(get_bool(args, "verify-ssl", true)), cloudevents(get_bool(args, "cloudevents", false))
+ {
+ bool exists;
+ const auto& str_ack_level = args.get("http-ack-level", &exists);
+ if (!exists || str_ack_level == "any") {
+ // "any" is default
+ ack_level = ACK_LEVEL_ANY;
+ } else if (str_ack_level == "non-error") {
+ ack_level = ACK_LEVEL_NON_ERROR;
+ } else {
+ ack_level = std::atoi(str_ack_level.c_str());
+ if (ack_level < 100 || ack_level >= 600) {
+ throw configuration_error("HTTP/S: invalid http-ack-level: " + str_ack_level);
+ }
+ }
+ }
+
+ int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override {
+ bufferlist read_bl;
+ RGWPostHTTPData request(cct, "POST", endpoint, &read_bl, verify_ssl);
+ const auto post_data = json_format_pubsub_event(event);
+ if (cloudevents) {
+ // following: https://github.com/cloudevents/spec/blob/v1.0.1/http-protocol-binding.md
+ // using "Binary Content Mode"
+ request.append_header("ce-specversion", "1.0");
+ request.append_header("ce-type", "com.amazonaws." + event.eventName);
+ request.append_header("ce-time", to_iso_8601(event.eventTime));
+ // default output of iso8601 is also RFC3339 compatible
+ request.append_header("ce-id", event.x_amz_request_id + "." + event.x_amz_id_2);
+ request.append_header("ce-source", event.eventSource + "." + event.awsRegion + "." + event.bucket_name);
+ request.append_header("ce-subject", event.object_key);
+ }
+ request.set_post_data(post_data);
+ request.set_send_length(post_data.length());
+ request.append_header("Content-Type", "application/json");
+ if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_pending);
+ const auto rc = RGWHTTP::process(&request, y);
+ if (perfcounter) perfcounter->dec(l_rgw_pubsub_push_pending);
+ // TODO: use read_bl to process return code and handle according to ack level
+ return rc;
+ }
+
+ std::string to_str() const override {
+ std::string str("HTTP/S Endpoint");
+ str += "\nURI: " + endpoint;
+ str += (verify_ssl ? "\nverify SSL" : "\ndon't verify SSL");
+ return str;
+ }
+};
+
+#ifdef WITH_RADOSGW_AMQP_ENDPOINT
+class RGWPubSubAMQPEndpoint : public RGWPubSubEndpoint {
+private:
+ enum class ack_level_t {
+ None,
+ Broker,
+ Routable
+ };
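+  // None: fire-and-forget; Broker: wait for broker confirmation;
+  // Routable: wait until the message is routable (currently handled
+  // the same as Broker, see the TODO in send_to_completion_async())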
+ CephContext* const cct;
+ const std::string endpoint;
+ const std::string topic;
+ const std::string exchange;
+ ack_level_t ack_level;
+ amqp::connection_ptr_t conn;
+
+ bool get_verify_ssl(const RGWHTTPArgs& args) {
+ bool exists;
+ auto str_verify_ssl = args.get("verify-ssl", &exists);
+ if (!exists) {
+ // verify server certificate by default
+ return true;
+ }
+ boost::algorithm::to_lower(str_verify_ssl);
+ if (str_verify_ssl == "true") {
+ return true;
+ }
+ if (str_verify_ssl == "false") {
+ return false;
+ }
+ throw configuration_error("'verify-ssl' must be true/false, not: " + str_verify_ssl);
+ }
+
+ std::string get_exchange(const RGWHTTPArgs& args) {
+ bool exists;
+ const auto exchange = args.get("amqp-exchange", &exists);
+ if (!exists) {
+ throw configuration_error("AMQP: missing amqp-exchange");
+ }
+ return exchange;
+ }
+
+ ack_level_t get_ack_level(const RGWHTTPArgs& args) {
+ bool exists;
+ const auto& str_ack_level = args.get("amqp-ack-level", &exists);
+ if (!exists || str_ack_level == "broker") {
+ // "broker" is default
+ return ack_level_t::Broker;
+ }
+ if (str_ack_level == "none") {
+ return ack_level_t::None;
+ }
+ if (str_ack_level == "routable") {
+ return ack_level_t::Routable;
+ }
+ throw configuration_error("AMQP: invalid amqp-ack-level: " + str_ack_level);
+ }
+
+public:
+ RGWPubSubAMQPEndpoint(const std::string& _endpoint,
+ const std::string& _topic,
+ const RGWHTTPArgs& args,
+ CephContext* _cct) :
+ cct(_cct),
+ endpoint(_endpoint),
+ topic(_topic),
+ exchange(get_exchange(args)),
+ ack_level(get_ack_level(args)),
+ conn(amqp::connect(endpoint, exchange, (ack_level == ack_level_t::Broker), get_verify_ssl(args), args.get_optional("ca-location"))) {
+ if (!conn) {
+ throw configuration_error("AMQP: failed to create connection to: " + endpoint);
+ }
+ }
+
+  // this allows waiting until "finish()" is called from a different thread
+  // waiting could block the waiting thread or yield, depending
+  // on compilation flag support and whether the optional_yield is set
+ class Waiter {
+ using Signature = void(boost::system::error_code);
+ using Completion = ceph::async::Completion<Signature>;
+ std::unique_ptr<Completion> completion = nullptr;
+ int ret;
+
+ mutable std::atomic<bool> done = false;
+ mutable std::mutex lock;
+ mutable std::condition_variable cond;
+
+ template <typename ExecutionContext, typename CompletionToken>
+ auto async_wait(ExecutionContext& ctx, CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, Signature> init(token);
+ auto& handler = init.completion_handler;
+ {
+ std::unique_lock l{lock};
+ completion = Completion::create(ctx.get_executor(), std::move(handler));
+ }
+ return init.result.get();
+ }
+
+ public:
+ int wait(optional_yield y) {
+ if (done) {
+ return ret;
+ }
+ if (y) {
+ auto& io_ctx = y.get_io_context();
+ auto& yield_ctx = y.get_yield_context();
+ boost::system::error_code ec;
+ async_wait(io_ctx, yield_ctx[ec]);
+ return -ec.value();
+ }
+ std::unique_lock l(lock);
+ cond.wait(l, [this]{return (done==true);});
+ return ret;
+ }
+
+ void finish(int r) {
+ std::unique_lock l{lock};
+ ret = r;
+ done = true;
+ if (completion) {
+ boost::system::error_code ec(-ret, boost::system::system_category());
+ Completion::post(std::move(completion), ec);
+ } else {
+ cond.notify_all();
+ }
+ }
+ };
+
+ int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override {
+ ceph_assert(conn);
+ if (ack_level == ack_level_t::None) {
+ return amqp::publish(conn, topic, json_format_pubsub_event(event));
+ } else {
+ // TODO: currently broker and routable are the same - this will require different flags but the same mechanism
+ // note: dynamic allocation of Waiter is needed when this is invoked from a beast coroutine
+ auto w = std::unique_ptr<Waiter>(new Waiter);
+ const auto rc = amqp::publish_with_confirm(conn,
+ topic,
+ json_format_pubsub_event(event),
+ std::bind(&Waiter::finish, w.get(), std::placeholders::_1));
+ if (rc < 0) {
+ // failed to publish, does not wait for reply
+ return rc;
+ }
+ return w->wait(y);
+ }
+ }
+
+ std::string to_str() const override {
+ std::string str("AMQP(0.9.1) Endpoint");
+ str += "\nURI: " + endpoint;
+ str += "\nTopic: " + topic;
+ str += "\nExchange: " + exchange;
+ return str;
+ }
+};
+
+static const std::string AMQP_0_9_1("0-9-1");
+static const std::string AMQP_1_0("1-0");
+static const std::string AMQP_SCHEMA("amqp");
+#endif // ifdef WITH_RADOSGW_AMQP_ENDPOINT
+
+#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
+class RGWPubSubKafkaEndpoint : public RGWPubSubEndpoint {
+private:
+ enum class ack_level_t {
+ None,
+ Broker,
+ };
+ CephContext* const cct;
+ const std::string topic;
+ kafka::connection_ptr_t conn;
+ const ack_level_t ack_level;
+
+
+ ack_level_t get_ack_level(const RGWHTTPArgs& args) {
+ bool exists;
+ const auto& str_ack_level = args.get("kafka-ack-level", &exists);
+ if (!exists || str_ack_level == "broker") {
+ // "broker" is default
+ return ack_level_t::Broker;
+ }
+ if (str_ack_level == "none") {
+ return ack_level_t::None;
+ }
+ throw configuration_error("Kafka: invalid kafka-ack-level: " + str_ack_level);
+ }
+
+public:
+ RGWPubSubKafkaEndpoint(const std::string& _endpoint,
+ const std::string& _topic,
+ const RGWHTTPArgs& args,
+ CephContext* _cct) :
+ cct(_cct),
+ topic(_topic),
+ conn(kafka::connect(_endpoint, get_bool(args, "use-ssl", false), get_bool(args, "verify-ssl", true), args.get_optional("ca-location"))) ,
+ ack_level(get_ack_level(args)) {
+ if (!conn) {
+ throw configuration_error("Kafka: failed to create connection to: " + _endpoint);
+ }
+ }
+
+  // allows waiting until "finish()" is called from a different thread;
+  // waiting may either block the calling thread or yield, depending
+  // on compilation flag support and whether the optional_yield is set
+ class Waiter {
+ using Signature = void(boost::system::error_code);
+ using Completion = ceph::async::Completion<Signature>;
+ std::unique_ptr<Completion> completion = nullptr;
+ int ret;
+
+ mutable std::atomic<bool> done = false;
+ mutable std::mutex lock;
+ mutable std::condition_variable cond;
+
+ template <typename ExecutionContext, typename CompletionToken>
+ auto async_wait(ExecutionContext& ctx, CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, Signature> init(token);
+ auto& handler = init.completion_handler;
+ {
+ std::unique_lock l{lock};
+ completion = Completion::create(ctx.get_executor(), std::move(handler));
+ }
+ return init.result.get();
+ }
+
+ public:
+ int wait(optional_yield y) {
+ if (done) {
+ return ret;
+ }
+ if (y) {
+ auto& io_ctx = y.get_io_context();
+ auto& yield_ctx = y.get_yield_context();
+ boost::system::error_code ec;
+ async_wait(io_ctx, yield_ctx[ec]);
+ return -ec.value();
+ }
+ std::unique_lock l(lock);
+ cond.wait(l, [this]{return (done==true);});
+ return ret;
+ }
+
+ void finish(int r) {
+ std::unique_lock l{lock};
+ ret = r;
+ done = true;
+ if (completion) {
+ boost::system::error_code ec(-ret, boost::system::system_category());
+ Completion::post(std::move(completion), ec);
+ } else {
+ cond.notify_all();
+ }
+ }
+ };
+
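+  // publish the event; with ack-level "none" we return as soon as the message is
+  // handed to the kafka client, otherwise we block (or yield) on a Waiter until the
+  // broker ack callback invokes Waiter::finish() with the delivery status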
+ int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override {
+ ceph_assert(conn);
+ if (ack_level == ack_level_t::None) {
+ return kafka::publish(conn, topic, json_format_pubsub_event(event));
+ } else {
+ // note: dynamic allocation of Waiter is needed when this is invoked from a beast coroutine
+ auto w = std::unique_ptr<Waiter>(new Waiter);
+ const auto rc = kafka::publish_with_confirm(conn,
+ topic,
+ json_format_pubsub_event(event),
+ std::bind(&Waiter::finish, w.get(), std::placeholders::_1));
+ if (rc < 0) {
+      // the publish failed; return without waiting for a reply
+ return rc;
+ }
+ return w->wait(y);
+ }
+ }
+
+ std::string to_str() const override {
+ std::string str("Kafka Endpoint");
+ str += kafka::to_string(conn);
+ str += "\nTopic: " + topic;
+ return str;
+ }
+};
+
+static const std::string KAFKA_SCHEMA("kafka");
+#endif // ifdef WITH_RADOSGW_KAFKA_ENDPOINT
+
+static const std::string WEBHOOK_SCHEMA("webhook");
+static const std::string UNKNOWN_SCHEMA("unknown");
+static const std::string NO_SCHEMA("");
+
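+// map an endpoint URI to one of the schema constants above: "http"/"https" select
+// the webhook endpoint, "amqp"/"amqps" the AMQP endpoint, and "kafka" the Kafka
+// endpoint (the latter two only when built in); anything else is unknown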
+const std::string& get_schema(const std::string& endpoint) {
+ if (endpoint.empty()) {
+ return NO_SCHEMA;
+ }
+ const auto pos = endpoint.find(':');
+ if (pos == std::string::npos) {
+ return UNKNOWN_SCHEMA;
+ }
+ const auto& schema = endpoint.substr(0,pos);
+ if (schema == "http" || schema == "https") {
+ return WEBHOOK_SCHEMA;
+#ifdef WITH_RADOSGW_AMQP_ENDPOINT
+ } else if (schema == "amqp" || schema == "amqps") {
+ return AMQP_SCHEMA;
+#endif
+#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
+ } else if (schema == "kafka") {
+ return KAFKA_SCHEMA;
+#endif
+ }
+ return UNKNOWN_SCHEMA;
+}
+
+RGWPubSubEndpoint::Ptr RGWPubSubEndpoint::create(const std::string& endpoint,
+ const std::string& topic,
+ const RGWHTTPArgs& args,
+ CephContext* cct) {
+ const auto& schema = get_schema(endpoint);
+ if (schema == WEBHOOK_SCHEMA) {
+ return Ptr(new RGWPubSubHTTPEndpoint(endpoint, args));
+#ifdef WITH_RADOSGW_AMQP_ENDPOINT
+ } else if (schema == AMQP_SCHEMA) {
+ bool exists;
+ std::string version = args.get("amqp-version", &exists);
+ if (!exists) {
+ version = AMQP_0_9_1;
+ }
+ if (version == AMQP_0_9_1) {
+ return Ptr(new RGWPubSubAMQPEndpoint(endpoint, topic, args, cct));
+ } else if (version == AMQP_1_0) {
+ throw configuration_error("AMQP: v1.0 not supported");
+ return nullptr;
+ } else {
+ throw configuration_error("AMQP: unknown version: " + version);
+ return nullptr;
+ }
+#endif
+#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
+ } else if (schema == KAFKA_SCHEMA) {
+ return Ptr(new RGWPubSubKafkaEndpoint(endpoint, topic, args, cct));
+#endif
+ }
+
+ throw configuration_error("unknown schema in: " + endpoint);
+ return nullptr;
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+#pragma once
+
+#include <string>
+#include <memory>
+#include <stdexcept>
+#include "include/buffer_fwd.h"
+#include "include/common_fwd.h"
+#include "common/async/yield_context.h"
+
+// TODO the env should be used as a template parameter to differentiate the source that triggers the pushes
+class RGWDataSyncEnv;
+class RGWHTTPArgs;
+struct rgw_pubsub_s3_event;
+
+// base class for all notification endpoints; every endpoint type should derive from it
+class RGWPubSubEndpoint {
+public:
+ RGWPubSubEndpoint() = default;
+ // endpoint should not be copied
+ RGWPubSubEndpoint(const RGWPubSubEndpoint&) = delete;
+ const RGWPubSubEndpoint& operator=(const RGWPubSubEndpoint&) = delete;
+
+ typedef std::unique_ptr<RGWPubSubEndpoint> Ptr;
+
+ // factory method for the actual notification endpoint
+ // derived class specific arguments are passed in http args format
+ // may throw a configuration_error if creation fails
+ static Ptr create(const std::string& endpoint, const std::string& topic, const RGWHTTPArgs& args, CephContext *cct=nullptr);
+
+  // send an S3-compliant notification and wait for its completion; when invoked in
+  // the frontend environment, the wait is performed asynchronously via a coroutine
+ virtual int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) = 0;
+
+  // render the endpoint as a string (e.g. for logging)
+ virtual std::string to_str() const { return ""; }
+
+ virtual ~RGWPubSubEndpoint() = default;
+
+ // exception object for configuration error
+ struct configuration_error : public std::logic_error {
+ configuration_error(const std::string& what_arg) :
+ std::logic_error("pubsub endpoint configuration error: " + what_arg) {}
+ };
+};
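+
+// illustrative usage sketch (the URI, topic, and args here are hypothetical):
+//   auto ep = RGWPubSubEndpoint::create("kafka://broker:9092", "mytopic", args, cct);
+//   int r = ep->send_to_completion_async(cct, event, y); // 0 on success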
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "rgw_aio.h"
+#include "rgw_putobj_processor.h"
+#include "rgw_multi.h"
+#include "rgw_compression.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_zone.h"
+#include "rgw_sal_rados.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+namespace rgw::putobj {
+
+int HeadObjectProcessor::process(bufferlist&& data, uint64_t logical_offset)
+{
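+  // by convention, a zero-length buffer signals the final flush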
+ const bool flush = (data.length() == 0);
+
+ // capture the first chunk for special handling
+ if (data_offset < head_chunk_size || data_offset == 0) {
+ if (flush) {
+ // flush partial chunk
+ return process_first_chunk(std::move(head_data), &processor);
+ }
+
+ auto remaining = head_chunk_size - data_offset;
+ auto count = std::min<uint64_t>(data.length(), remaining);
+ data.splice(0, count, &head_data);
+ data_offset += count;
+
+ if (data_offset == head_chunk_size) {
+ // process the first complete chunk
+ ceph_assert(head_data.length() == head_chunk_size);
+ int r = process_first_chunk(std::move(head_data), &processor);
+ if (r < 0) {
+ return r;
+ }
+ }
+ if (data.length() == 0) { // avoid flushing stripe processor
+ return 0;
+ }
+ }
+ ceph_assert(processor); // process_first_chunk() must initialize
+
+ // send everything else through the processor
+ auto write_offset = data_offset;
+ data_offset += data.length();
+ return processor->process(std::move(data), write_offset);
+}
+
+
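+// collect the results of completed aio writes: record every successfully written
+// raw object (so it can be cleaned up on cancellation) and return the first error
+// code, if any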
+static int process_completed(const AioResultList& completed, RawObjSet *written)
+{
+ std::optional<int> error;
+ for (auto& r : completed) {
+ if (r.result >= 0) {
+ written->insert(r.obj.get_ref().obj);
+ } else if (!error) { // record first error code
+ error = r.result;
+ }
+ }
+ return error.value_or(0);
+}
+
+void RadosWriter::add_write_hint(librados::ObjectWriteOperation& op) {
+ const rgw_obj obj = head_obj->get_obj();
+ const RGWObjStateManifest *sm = obj_ctx.get_state(obj);
+ const bool compressed = sm->state.compressed;
+ uint32_t alloc_hint_flags = 0;
+ if (compressed) {
+ alloc_hint_flags |= librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE;
+ }
+
+ op.set_alloc_hint2(0, 0, alloc_hint_flags);
+}
+
+int RadosWriter::set_stripe_obj(const rgw_raw_obj& raw_obj)
+{
+ stripe_obj = store->svc()->rados->obj(raw_obj);
+ return stripe_obj.open(dpp);
+}
+
+int RadosWriter::process(bufferlist&& bl, uint64_t offset)
+{
+ bufferlist data = std::move(bl);
+ const uint64_t cost = data.length();
+ if (cost == 0) { // no empty writes, use aio directly for creates
+ return 0;
+ }
+ librados::ObjectWriteOperation op;
+ add_write_hint(op);
+ if (offset == 0) {
+ op.write_full(data);
+ } else {
+ op.write(offset, data);
+ }
+ constexpr uint64_t id = 0; // unused
+ auto c = aio->get(stripe_obj, Aio::librados_op(std::move(op), y), cost, id);
+ return process_completed(c, &written);
+}
+
+int RadosWriter::write_exclusive(const bufferlist& data)
+{
+ const uint64_t cost = data.length();
+
+ librados::ObjectWriteOperation op;
+ op.create(true); // exclusive create
+ add_write_hint(op);
+ op.write_full(data);
+
+ constexpr uint64_t id = 0; // unused
+ auto c = aio->get(stripe_obj, Aio::librados_op(std::move(op), y), cost, id);
+ auto d = aio->drain();
+ c.splice(c.end(), d);
+ return process_completed(c, &written);
+}
+
+int RadosWriter::drain()
+{
+ return process_completed(aio->drain(), &written);
+}
+
+RadosWriter::~RadosWriter()
+{
+ // wait on any outstanding aio completions
+ process_completed(aio->drain(), &written);
+
+ bool need_to_remove_head = false;
+ std::optional<rgw_raw_obj> raw_head;
+ if (!rgw::sal::Object::empty(head_obj.get())) {
+ raw_head.emplace();
+ rgw::sal::RadosObject* obj = dynamic_cast<rgw::sal::RadosObject*>(head_obj.get());
+ obj->get_raw_obj(&*raw_head);
+ }
+
+ /**
+   * We should delete the object in the "multipart" namespace to avoid a race condition.
+   * The race arises because the multipart meta object is the gatekeeper of a multipart
+   * upload: once it is deleted, a second upload could start with the same suffix ("2/"),
+   * and objects written by the second upload may then be deleted by the first upload.
+   * Details are described in #11749.
+ *
+ * The above comment still stands, but instead of searching for a specific object in the multipart
+ * namespace, we just make sure that we remove the object that is marked as the head object after
+ * we remove all the other raw objects. Note that we use different call to remove the head object,
+ * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme.
+ */
+ for (const auto& obj : written) {
+ if (raw_head && obj == *raw_head) {
+ ldpp_dout(dpp, 5) << "NOTE: we should not process the head object (" << obj << ") here" << dendl;
+ need_to_remove_head = true;
+ continue;
+ }
+
+ int r = store->delete_raw_obj(dpp, obj);
+ if (r < 0 && r != -ENOENT) {
+ ldpp_dout(dpp, 0) << "WARNING: failed to remove obj (" << obj << "), leaked" << dendl;
+ }
+ }
+
+ if (need_to_remove_head) {
+ std::string version_id;
+ ldpp_dout(dpp, 5) << "NOTE: we are going to process the head obj (" << *raw_head << ")" << dendl;
+ int r = head_obj->delete_object(dpp, null_yield);
+ if (r < 0 && r != -ENOENT) {
+ ldpp_dout(dpp, 0) << "WARNING: failed to remove obj (" << *raw_head << "), leaked" << dendl;
+ }
+ }
+}
+
+
+// advance to the next stripe
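+// implements StripeGenerator::next(): extend the manifest, point the writer at the
+// new stripe object, and reset the chunk processor with that stripe's chunk size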
+int ManifestObjectProcessor::next(uint64_t offset, uint64_t *pstripe_size)
+{
+ // advance the manifest
+ int r = manifest_gen.create_next(offset);
+ if (r < 0) {
+ return r;
+ }
+
+ rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
+
+ uint64_t chunk_size = 0;
+ r = store->get_raw_chunk_size(dpp, stripe_obj, &chunk_size);
+ if (r < 0) {
+ return r;
+ }
+ r = writer.set_stripe_obj(stripe_obj);
+ if (r < 0) {
+ return r;
+ }
+
+ chunk = ChunkProcessor(&writer, chunk_size);
+ *pstripe_size = manifest_gen.cur_stripe_max_size();
+ return 0;
+}
+
+
+
+int AtomicObjectProcessor::process_first_chunk(bufferlist&& data,
+ DataProcessor **processor)
+{
+ first_chunk = std::move(data);
+ *processor = &stripe;
+ return 0;
+}
+
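+// decide how much data may live in the head object: if head and tail placement
+// resolve to different pools the head holds no data; otherwise it may hold up to
+// one chunk, unless the zone's placement target disables inline data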
+int AtomicObjectProcessor::prepare(optional_yield y)
+{
+ uint64_t max_head_chunk_size;
+ uint64_t head_max_size;
+ uint64_t chunk_size = 0;
+ uint64_t alignment;
+
+ int r = dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->get_max_chunk_size(
+ dpp, head_obj->get_bucket()->get_placement_rule(),
+ &max_head_chunk_size, &alignment);
+ if (r < 0) {
+ return r;
+ }
+
+ bool same_pool = true;
+ if (head_obj->get_bucket()->get_placement_rule() != tail_placement_rule) {
+ if (!head_obj->placement_rules_match(head_obj->get_bucket()->get_placement_rule(), tail_placement_rule)) {
+ same_pool = false;
+ r = dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->get_max_chunk_size(dpp, tail_placement_rule, &chunk_size);
+ if (r < 0) {
+ return r;
+ }
+ head_max_size = 0;
+ }
+ }
+
+ if (same_pool) {
+ RGWZonePlacementInfo placement_info;
+ if (!store->svc()->zone->get_zone_params().get_placement(head_obj->get_bucket()->get_placement_rule().name, &placement_info) || placement_info.inline_data) {
+ head_max_size = max_head_chunk_size;
+ } else {
+ head_max_size = 0;
+ }
+ chunk_size = max_head_chunk_size;
+ }
+
+ uint64_t stripe_size;
+ const uint64_t default_stripe_size = store->ctx()->_conf->rgw_obj_stripe_size;
+
+ dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->get_max_aligned_size(
+ default_stripe_size, alignment, &stripe_size);
+
+ manifest.set_trivial_rule(head_max_size, stripe_size);
+
+ rgw_obj obj = head_obj->get_obj();
+
+ r = manifest_gen.create_begin(store->ctx(), &manifest,
+ head_obj->get_bucket()->get_placement_rule(),
+ &tail_placement_rule,
+ obj.bucket, obj);
+ if (r < 0) {
+ return r;
+ }
+
+ rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
+
+ r = writer.set_stripe_obj(stripe_obj);
+ if (r < 0) {
+ return r;
+ }
+
+ set_head_chunk_size(head_max_size);
+ // initialize the processors
+ chunk = ChunkProcessor(&writer, chunk_size);
+ stripe = StripeProcessor(&chunk, this, head_max_size);
+ return 0;
+}
+
+int AtomicObjectProcessor::complete(size_t accounted_size,
+ const std::string& etag,
+ ceph::real_time *mtime,
+ ceph::real_time set_mtime,
+ rgw::sal::Attrs& attrs,
+ ceph::real_time delete_at,
+ const char *if_match,
+ const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace,
+ bool *pcanceled, optional_yield y)
+{
+ int r = writer.drain();
+ if (r < 0) {
+ return r;
+ }
+ const uint64_t actual_size = get_actual_size();
+ r = manifest_gen.create_next(actual_size);
+ if (r < 0) {
+ return r;
+ }
+
+ head_obj->set_atomic();
+
+ RGWRados::Object op_target(store->getRados(),
+ head_obj->get_bucket(),
+ obj_ctx, head_obj.get());
+ RGWRados::Object::Write obj_op(&op_target);
+
+ /* some object types shouldn't be versioned, e.g., multipart parts */
+ op_target.set_versioning_disabled(!head_obj->get_bucket()->versioning_enabled());
+ obj_op.meta.data = &first_chunk;
+ obj_op.meta.manifest = &manifest;
+ obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
+ obj_op.meta.if_match = if_match;
+ obj_op.meta.if_nomatch = if_nomatch;
+ obj_op.meta.mtime = mtime;
+ obj_op.meta.set_mtime = set_mtime;
+ obj_op.meta.owner = owner;
+ obj_op.meta.flags = PUT_OBJ_CREATE;
+ obj_op.meta.olh_epoch = olh_epoch;
+ obj_op.meta.delete_at = delete_at;
+ obj_op.meta.user_data = user_data;
+ obj_op.meta.zones_trace = zones_trace;
+ obj_op.meta.modify_tail = true;
+
+ r = obj_op.write_meta(dpp, actual_size, accounted_size, attrs, y);
+ if (r < 0) {
+ if (r == -ETIMEDOUT) {
+      // The head object write may eventually succeed, so clear the set of objects for
+      // deletion. If it never succeeds, we'll orphan any tail objects, as if we'd
+      // crashed before that write
+ writer.clear_written();
+ }
+ return r;
+ }
+ if (!obj_op.meta.canceled) {
+ // on success, clear the set of objects for deletion
+ writer.clear_written();
+ }
+ if (pcanceled) {
+ *pcanceled = obj_op.meta.canceled;
+ }
+ return 0;
+}
+
+
+int MultipartObjectProcessor::process_first_chunk(bufferlist&& data,
+ DataProcessor **processor)
+{
+ // write the first chunk of the head object as part of an exclusive create,
+ // then drain to wait for the result in case of EEXIST
+ int r = writer.write_exclusive(data);
+ if (r == -EEXIST) {
+ // randomize the oid prefix and reprepare the head/manifest
+ std::string oid_rand = gen_rand_alphanumeric(store->ctx(), 32);
+
+ mp.init(target_obj->get_name(), upload_id, oid_rand);
+ manifest.set_prefix(target_obj->get_name() + "." + oid_rand);
+
+ r = prepare_head();
+ if (r < 0) {
+ return r;
+ }
+ // resubmit the write op on the new head object
+ r = writer.write_exclusive(data);
+ }
+ if (r < 0) {
+ return r;
+ }
+ *processor = &stripe;
+ return 0;
+}
+
+int MultipartObjectProcessor::prepare_head()
+{
+ const uint64_t default_stripe_size = store->ctx()->_conf->rgw_obj_stripe_size;
+ uint64_t chunk_size;
+ uint64_t stripe_size;
+ uint64_t alignment;
+
+ int r = dynamic_cast<rgw::sal::RadosObject*>(target_obj.get())->get_max_chunk_size(dpp,
+ tail_placement_rule, &chunk_size, &alignment);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: unexpected: get_max_chunk_size(): placement_rule=" << tail_placement_rule.to_str() << " obj=" << target_obj << " returned r=" << r << dendl;
+ return r;
+ }
+ dynamic_cast<rgw::sal::RadosObject*>(target_obj.get())->get_max_aligned_size(
+ default_stripe_size, alignment, &stripe_size);
+
+ manifest.set_multipart_part_rule(stripe_size, part_num);
+
+ r = manifest_gen.create_begin(store->ctx(), &manifest,
+ head_obj->get_bucket()->get_placement_rule(),
+ &tail_placement_rule,
+ target_obj->get_bucket()->get_key(),
+ target_obj->get_obj());
+ if (r < 0) {
+ return r;
+ }
+
+ rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
+ dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->raw_obj_to_obj(stripe_obj);
+ head_obj->set_hash_source(target_obj->get_name());
+
+ r = writer.set_stripe_obj(stripe_obj);
+ if (r < 0) {
+ return r;
+ }
+ stripe_size = manifest_gen.cur_stripe_max_size();
+ set_head_chunk_size(stripe_size);
+
+ chunk = ChunkProcessor(&writer, chunk_size);
+ stripe = StripeProcessor(&chunk, this, stripe_size);
+ return 0;
+}
+
+int MultipartObjectProcessor::prepare(optional_yield y)
+{
+ manifest.set_prefix(target_obj->get_name() + "." + upload_id);
+
+ return prepare_head();
+}
+
+int MultipartObjectProcessor::complete(size_t accounted_size,
+ const std::string& etag,
+ ceph::real_time *mtime,
+ ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at,
+ const char *if_match,
+ const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace,
+ bool *pcanceled, optional_yield y)
+{
+ int r = writer.drain();
+ if (r < 0) {
+ return r;
+ }
+ const uint64_t actual_size = get_actual_size();
+ r = manifest_gen.create_next(actual_size);
+ if (r < 0) {
+ return r;
+ }
+
+ RGWRados::Object op_target(store->getRados(),
+ head_obj->get_bucket(),
+ obj_ctx, head_obj.get());
+ RGWRados::Object::Write obj_op(&op_target);
+
+ op_target.set_versioning_disabled(true);
+ op_target.set_meta_placement_rule(&tail_placement_rule);
+ obj_op.meta.set_mtime = set_mtime;
+ obj_op.meta.mtime = mtime;
+ obj_op.meta.owner = owner;
+ obj_op.meta.delete_at = delete_at;
+ obj_op.meta.zones_trace = zones_trace;
+ obj_op.meta.modify_tail = true;
+
+ r = obj_op.write_meta(dpp, actual_size, accounted_size, attrs, y);
+ if (r < 0)
+ return r;
+
+ bufferlist bl;
+ RGWUploadPartInfo info;
+ string p = "part.";
+ bool sorted_omap = is_v2_upload_id(upload_id);
+
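+  // v2 upload ids zero-pad the part number so the omap keys sort numerically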
+ if (sorted_omap) {
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%08d", part_num);
+ p.append(buf);
+ } else {
+ p.append(part_num_str);
+ }
+ info.num = part_num;
+ info.etag = etag;
+ info.size = actual_size;
+ info.accounted_size = accounted_size;
+ info.modified = real_clock::now();
+ info.manifest = manifest;
+
+ bool compressed;
+ r = rgw_compression_info_from_attrset(attrs, compressed, info.cs_info);
+ if (r < 0) {
+ ldpp_dout(dpp, 1) << "cannot get compression info" << dendl;
+ return r;
+ }
+
+ encode(info, bl);
+
+ std::unique_ptr<rgw::sal::Object> meta_obj =
+ head_obj->get_bucket()->get_object(rgw_obj_key(mp.get_meta(), std::string(), RGW_OBJ_NS_MULTIPART));
+ meta_obj->set_in_extra_data(true);
+
+ r = meta_obj->omap_set_val_by_key(dpp, p, bl, true, null_yield);
+ if (r < 0) {
+ return r == -ENOENT ? -ERR_NO_SUCH_UPLOAD : r;
+ }
+
+ if (!obj_op.meta.canceled) {
+ // on success, clear the set of objects for deletion
+ writer.clear_written();
+ }
+ if (pcanceled) {
+ *pcanceled = obj_op.meta.canceled;
+ }
+ return 0;
+}
+
+int AppendObjectProcessor::process_first_chunk(bufferlist &&data, rgw::sal::DataProcessor **processor)
+{
+ int r = writer.write_exclusive(data);
+ if (r < 0) {
+ return r;
+ }
+ *processor = &stripe;
+ return 0;
+}
+
+int AppendObjectProcessor::prepare(optional_yield y)
+{
+ RGWObjState *astate;
+ int r = head_obj->get_obj_state(dpp, &astate, y);
+ if (r < 0) {
+ return r;
+ }
+ cur_size = astate->size;
+ *cur_accounted_size = astate->accounted_size;
+ if (!astate->exists) {
+ if (position != 0) {
+ ldpp_dout(dpp, 5) << "ERROR: Append position should be zero" << dendl;
+ return -ERR_POSITION_NOT_EQUAL_TO_LENGTH;
+ } else {
+ cur_part_num = 1;
+ //set the prefix
+ char buf[33];
+ gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1);
+ string oid_prefix = head_obj->get_name();
+ oid_prefix.append(".");
+ oid_prefix.append(buf);
+ oid_prefix.append("_");
+ manifest.set_prefix(oid_prefix);
+ }
+ } else {
+    // check whether the object is appendable
+ map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_APPEND_PART_NUM);
+ if (iter == astate->attrset.end()) {
+ ldpp_dout(dpp, 5) << "ERROR: The object is not appendable" << dendl;
+ return -ERR_OBJECT_NOT_APPENDABLE;
+ }
+ if (position != *cur_accounted_size) {
+ ldpp_dout(dpp, 5) << "ERROR: Append position should be equal to the obj size" << dendl;
+ return -ERR_POSITION_NOT_EQUAL_TO_LENGTH;
+ }
+ try {
+ using ceph::decode;
+ decode(cur_part_num, iter->second);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 5) << "ERROR: failed to decode part num" << dendl;
+ return -EIO;
+ }
+ cur_part_num++;
+ //get the current obj etag
+ iter = astate->attrset.find(RGW_ATTR_ETAG);
+ if (iter != astate->attrset.end()) {
+ string s = rgw_string_unquote(iter->second.c_str());
+ size_t pos = s.find("-");
+ cur_etag = s.substr(0, pos);
+ }
+
+ iter = astate->attrset.find(RGW_ATTR_STORAGE_CLASS);
+ if (iter != astate->attrset.end()) {
+ tail_placement_rule.storage_class = iter->second.to_str();
+ } else {
+ tail_placement_rule.storage_class = RGW_STORAGE_CLASS_STANDARD;
+ }
+ cur_manifest = dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->get_manifest();
+ manifest.set_prefix(cur_manifest->get_prefix());
+ astate->keep_tail = true;
+ }
+ manifest.set_multipart_part_rule(store->ctx()->_conf->rgw_obj_stripe_size, cur_part_num);
+
+ rgw_obj obj = head_obj->get_obj();
+
+ r = manifest_gen.create_begin(store->ctx(), &manifest, head_obj->get_bucket()->get_placement_rule(), &tail_placement_rule, obj.bucket, obj);
+ if (r < 0) {
+ return r;
+ }
+ rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
+
+ uint64_t chunk_size = 0;
+ r = store->get_raw_chunk_size(dpp, stripe_obj, &chunk_size);
+ if (r < 0) {
+ return r;
+ }
+ r = writer.set_stripe_obj(std::move(stripe_obj));
+ if (r < 0) {
+ return r;
+ }
+
+ uint64_t stripe_size = manifest_gen.cur_stripe_max_size();
+
+ uint64_t max_head_size = std::min(chunk_size, stripe_size);
+ set_head_chunk_size(max_head_size);
+
+ // initialize the processors
+ chunk = ChunkProcessor(&writer, chunk_size);
+ stripe = StripeProcessor(&chunk, this, stripe_size);
+
+ return 0;
+}
+
+int AppendObjectProcessor::complete(size_t accounted_size, const string &etag, ceph::real_time *mtime,
+ ceph::real_time set_mtime, rgw::sal::Attrs& attrs,
+ ceph::real_time delete_at, const char *if_match, const char *if_nomatch,
+ const string *user_data, rgw_zone_set *zones_trace, bool *pcanceled,
+ optional_yield y)
+{
+ int r = writer.drain();
+ if (r < 0)
+ return r;
+ const uint64_t actual_size = get_actual_size();
+ r = manifest_gen.create_next(actual_size);
+ if (r < 0) {
+ return r;
+ }
+ head_obj->set_atomic();
+ RGWRados::Object op_target(store->getRados(),
+ head_obj->get_bucket(),
+ obj_ctx, head_obj.get());
+ RGWRados::Object::Write obj_op(&op_target);
+ //For Append obj, disable versioning
+ op_target.set_versioning_disabled(true);
+ if (cur_manifest) {
+ cur_manifest->append(dpp, manifest, store->svc()->zone->get_zonegroup(), store->svc()->zone->get_zone_params());
+ obj_op.meta.manifest = cur_manifest;
+ } else {
+ obj_op.meta.manifest = &manifest;
+ }
+ obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
+ obj_op.meta.mtime = mtime;
+ obj_op.meta.set_mtime = set_mtime;
+ obj_op.meta.owner = owner;
+ obj_op.meta.flags = PUT_OBJ_CREATE;
+ obj_op.meta.delete_at = delete_at;
+ obj_op.meta.user_data = user_data;
+ obj_op.meta.zones_trace = zones_trace;
+ obj_op.meta.modify_tail = true;
+ obj_op.meta.appendable = true;
+ //Add the append part number
+ bufferlist cur_part_num_bl;
+ using ceph::encode;
+ encode(cur_part_num, cur_part_num_bl);
+ attrs[RGW_ATTR_APPEND_PART_NUM] = cur_part_num_bl;
+ //calculate the etag
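+  // the result follows the multipart etag convention: an MD5 over the binary etags
+  // of the existing object and the appended data, suffixed with "-<part number>"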
+ if (!cur_etag.empty()) {
+ MD5 hash;
+ // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+ hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ char petag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
+ hex_to_buf(cur_etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE);
+ hash.Update((const unsigned char *)petag, sizeof(petag));
+ hex_to_buf(etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE);
+ hash.Update((const unsigned char *)petag, sizeof(petag));
+ hash.Final((unsigned char *)final_etag);
+ buf_to_hex((unsigned char *)final_etag, sizeof(final_etag), final_etag_str);
+ snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2], sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2,
+ "-%lld", (long long)cur_part_num);
+ bufferlist etag_bl;
+ etag_bl.append(final_etag_str, strlen(final_etag_str) + 1);
+ attrs[RGW_ATTR_ETAG] = etag_bl;
+ }
+ r = obj_op.write_meta(dpp, actual_size + cur_size,
+ accounted_size + *cur_accounted_size,
+ attrs, y);
+ if (r < 0) {
+ return r;
+ }
+ if (!obj_op.meta.canceled) {
+ // on success, clear the set of objects for deletion
+ writer.clear_written();
+ }
+ if (pcanceled) {
+ *pcanceled = obj_op.meta.canceled;
+ }
+ *cur_accounted_size += accounted_size;
+
+ return 0;
+}
+
+} // namespace rgw::putobj
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <optional>
+
+#include "rgw_putobj.h"
+#include "services/svc_rados.h"
+#include "services/svc_tier_rados.h"
+#include "rgw_sal.h"
+#include "rgw_obj_manifest.h"
+
+namespace rgw {
+
+namespace sal {
+ class RadosStore;
+}
+
+class Aio;
+
+namespace putobj {
+
+// an object processor with special handling for the first chunk of the head.
+// the virtual process_first_chunk() function returns a processor to handle the
+// rest of the object
+class HeadObjectProcessor : public rgw::sal::ObjectProcessor {
+ uint64_t head_chunk_size;
+ // buffer to capture the first chunk of the head object
+ bufferlist head_data;
+ // initialized after process_first_chunk() to process everything else
+ rgw::sal::DataProcessor *processor = nullptr;
+  uint64_t data_offset = 0; // maximum offset of data written (i.e. compressed)
+ protected:
+ uint64_t get_actual_size() const { return data_offset; }
+
+ // process the first chunk of data and return a processor for the rest
+ virtual int process_first_chunk(bufferlist&& data,
+ rgw::sal::DataProcessor **processor) = 0;
+ public:
+ HeadObjectProcessor(uint64_t head_chunk_size)
+ : head_chunk_size(head_chunk_size)
+ {}
+
+ void set_head_chunk_size(uint64_t size) { head_chunk_size = size; }
+
+ // cache first chunk for process_first_chunk(), then forward everything else
+ // to the returned processor
+ int process(bufferlist&& data, uint64_t logical_offset) final override;
+};
+
+using RawObjSet = std::set<rgw_raw_obj>;
+
+// a data sink that writes to rados objects and deletes them on cancelation
+class RadosWriter : public rgw::sal::DataProcessor {
+ Aio *const aio;
+ rgw::sal::RadosStore *const store;
+ RGWObjectCtx& obj_ctx;
+ std::unique_ptr<rgw::sal::Object> head_obj;
+ RGWSI_RADOS::Obj stripe_obj; // current stripe object
+ RawObjSet written; // set of written objects for deletion
+ const DoutPrefixProvider *dpp;
+ optional_yield y;
+
+ public:
+ RadosWriter(Aio *aio, rgw::sal::RadosStore *store,
+ RGWObjectCtx& obj_ctx, std::unique_ptr<rgw::sal::Object> _head_obj,
+ const DoutPrefixProvider *dpp, optional_yield y)
+ : aio(aio), store(store),
+ obj_ctx(obj_ctx), head_obj(std::move(_head_obj)), dpp(dpp), y(y)
+ {}
+ RadosWriter(RadosWriter&& r)
+ : aio(r.aio), store(r.store),
+ obj_ctx(r.obj_ctx), head_obj(std::move(r.head_obj)), dpp(r.dpp), y(r.y)
+ {}
+
+ ~RadosWriter();
+
+ // add alloc hint to osd
+ void add_write_hint(librados::ObjectWriteOperation& op);
+
+ // change the current stripe object
+ int set_stripe_obj(const rgw_raw_obj& obj);
+
+ // write the data at the given offset of the current stripe object
+ int process(bufferlist&& data, uint64_t stripe_offset) override;
+
+ // write the data as an exclusive create and wait for it to complete
+ int write_exclusive(const bufferlist& data);
+
+ int drain();
+
+ // when the operation completes successfully, clear the set of written objects
+ // so they aren't deleted on destruction
+ void clear_written() { written.clear(); }
+
+};
+
+
+// a rados object processor that stripes according to RGWObjManifest
+class ManifestObjectProcessor : public HeadObjectProcessor,
+ public StripeGenerator {
+ protected:
+ rgw::sal::RadosStore* const store;
+ rgw_placement_rule tail_placement_rule;
+ rgw_user owner;
+ RGWObjectCtx& obj_ctx;
+ std::unique_ptr<rgw::sal::Object> head_obj;
+
+ RadosWriter writer;
+ RGWObjManifest manifest;
+ RGWObjManifest::generator manifest_gen;
+ ChunkProcessor chunk;
+ StripeProcessor stripe;
+ const DoutPrefixProvider *dpp;
+
+ // implements StripeGenerator
+ int next(uint64_t offset, uint64_t *stripe_size) override;
+
+ public:
+ ManifestObjectProcessor(Aio *aio, rgw::sal::RadosStore* store,
+ const rgw_placement_rule *ptail_placement_rule,
+ const rgw_user& owner, RGWObjectCtx& _obj_ctx,
+ std::unique_ptr<rgw::sal::Object> _head_obj,
+ const DoutPrefixProvider* dpp, optional_yield y)
+ : HeadObjectProcessor(0),
+ store(store),
+ owner(owner),
+ obj_ctx(_obj_ctx), head_obj(std::move(_head_obj)),
+ writer(aio, store, obj_ctx, head_obj->clone(), dpp, y),
+ chunk(&writer, 0), stripe(&chunk, this, 0), dpp(dpp) {
+ if (ptail_placement_rule) {
+ tail_placement_rule = *ptail_placement_rule;
+ }
+ }
+
+ void set_owner(const rgw_user& _owner) {
+ owner = _owner;
+ }
+
+ void set_tail_placement(const rgw_placement_rule& tpr) {
+ tail_placement_rule = tpr;
+ }
+ void set_tail_placement(const rgw_placement_rule&& tpr) {
+ tail_placement_rule = tpr;
+ }
+
+};
+
+
+// a processor that completes with an atomic write to the head object as part of
+// a bucket index transaction
+class AtomicObjectProcessor : public ManifestObjectProcessor {
+ const std::optional<uint64_t> olh_epoch;
+ const std::string unique_tag;
+ bufferlist first_chunk; // written with the head in complete()
+
+ int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override;
+ public:
+ AtomicObjectProcessor(Aio *aio, rgw::sal::RadosStore* store,
+ const rgw_placement_rule *ptail_placement_rule,
+ const rgw_user& owner,
+ RGWObjectCtx& obj_ctx,
+ std::unique_ptr<rgw::sal::Object> _head_obj,
+ std::optional<uint64_t> olh_epoch,
+ const std::string& unique_tag,
+ const DoutPrefixProvider *dpp, optional_yield y)
+ : ManifestObjectProcessor(aio, store, ptail_placement_rule,
+ owner, obj_ctx, std::move(_head_obj), dpp, y),
+ olh_epoch(olh_epoch), unique_tag(unique_tag)
+ {}
+
+ // prepare a trivial manifest
+ int prepare(optional_yield y) override;
+ // write the head object atomically in a bucket index transaction
+ int complete(size_t accounted_size, const std::string& etag,
+ ceph::real_time *mtime, ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at,
+ const char *if_match, const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace, bool *canceled,
+ optional_yield y) override;
+
+};
+
+
+// a processor for multipart parts, which don't require atomic completion. the
+// part's head is written with an exclusive create to detect racing uploads of
+// the same part/upload id, which are restarted with a random oid prefix
+class MultipartObjectProcessor : public ManifestObjectProcessor {
+ std::unique_ptr<rgw::sal::Object> target_obj; // target multipart object
+ const std::string upload_id;
+ const int part_num;
+ const std::string part_num_str;
+ RGWMPObj mp;
+
+ // write the first chunk and wait on aio->drain() for its completion.
+ // on EEXIST, retry with random prefix
+ int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override;
+ // prepare the head stripe and manifest
+ int prepare_head();
+ public:
+ MultipartObjectProcessor(Aio *aio, rgw::sal::RadosStore* store,
+ const rgw_placement_rule *ptail_placement_rule,
+ const rgw_user& owner, RGWObjectCtx& obj_ctx,
+ std::unique_ptr<rgw::sal::Object> _head_obj,
+ const std::string& upload_id, uint64_t part_num,
+ const std::string& part_num_str,
+ const DoutPrefixProvider *dpp, optional_yield y)
+ : ManifestObjectProcessor(aio, store, ptail_placement_rule,
+ owner, obj_ctx, std::move(_head_obj), dpp, y),
+ target_obj(head_obj->clone()), upload_id(upload_id),
+ part_num(part_num), part_num_str(part_num_str),
+ mp(head_obj->get_name(), upload_id)
+ {}
+
+ // prepare a multipart manifest
+ int prepare(optional_yield y) override;
+ // write the head object attributes in a bucket index transaction, then
+ // register the completed part with the multipart meta object
+ int complete(size_t accounted_size, const std::string& etag,
+ ceph::real_time *mtime, ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at,
+ const char *if_match, const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace, bool *canceled,
+ optional_yield y) override;
+
+};
+
+ class AppendObjectProcessor : public ManifestObjectProcessor {
+ uint64_t cur_part_num;
+ uint64_t position;
+ uint64_t cur_size;
+ uint64_t *cur_accounted_size;
+ std::string cur_etag;
+ const std::string unique_tag;
+
+ RGWObjManifest *cur_manifest;
+
+ int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override;
+
+ public:
+ AppendObjectProcessor(Aio *aio, rgw::sal::RadosStore* store,
+ const rgw_placement_rule *ptail_placement_rule,
+ const rgw_user& owner, RGWObjectCtx& obj_ctx,
+ std::unique_ptr<rgw::sal::Object> _head_obj,
+ const std::string& unique_tag, uint64_t position,
+ uint64_t *cur_accounted_size,
+ const DoutPrefixProvider *dpp, optional_yield y)
+ : ManifestObjectProcessor(aio, store, ptail_placement_rule,
+ owner, obj_ctx, std::move(_head_obj), dpp, y),
+ position(position), cur_size(0), cur_accounted_size(cur_accounted_size),
+ unique_tag(unique_tag), cur_manifest(nullptr)
+ {}
+ int prepare(optional_yield y) override;
+ int complete(size_t accounted_size, const std::string& etag,
+ ceph::real_time *mtime, ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs, ceph::real_time delete_at,
+ const char *if_match, const char *if_nomatch, const std::string *user_data,
+ rgw_zone_set *zones_trace, bool *canceled,
+ optional_yield y) override;
+ };
+
+} // namespace putobj
+} // namespace rgw
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "include/compat.h"
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sstream>
+
+#include <boost/algorithm/string.hpp>
+#include <string_view>
+
+#include <boost/container/flat_set.hpp>
+#include <boost/format.hpp>
+#include <boost/optional.hpp>
+#include <boost/utility/in_place_factory.hpp>
+
+#include "common/ceph_json.h"
+
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/Throttle.h"
+#include "common/BackTrace.h"
+
+#include "rgw_sal.h"
+#include "rgw_zone.h"
+#include "rgw_cache.h"
+#include "rgw_acl.h"
+#include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
+#include "rgw_aio_throttle.h"
+#include "driver/rados/rgw_bucket.h"
+#include "rgw_rest_conn.h"
+#include "rgw_cr_rados.h"
+#include "rgw_cr_rest.h"
+#include "rgw_datalog.h"
+#include "rgw_putobj_processor.h"
+
+#include "cls/rgw/cls_rgw_ops.h"
+#include "cls/rgw/cls_rgw_client.h"
+#include "cls/rgw/cls_rgw_const.h"
+#include "cls/refcount/cls_refcount_client.h"
+#include "cls/version/cls_version_client.h"
+#include "osd/osd_types.h"
+
+#include "rgw_tools.h"
+#include "rgw_coroutine.h"
+#include "rgw_compression.h"
+#include "rgw_etag_verifier.h"
+#include "rgw_worker.h"
+#include "rgw_notify.h"
+#include "rgw_http_errors.h"
+
+#undef fork // fails to compile RGWPeriod::fork() below
+
+#include "common/Clock.h"
+
+#include <string>
+#include <iostream>
+#include <vector>
+#include <atomic>
+#include <list>
+#include <map>
+#include "include/random.h"
+
+#include "rgw_gc.h"
+#include "rgw_lc.h"
+
+#include "rgw_object_expirer_core.h"
+#include "rgw_sync.h"
+#include "rgw_sync_counters.h"
+#include "rgw_sync_trace.h"
+#include "rgw_trim_datalog.h"
+#include "rgw_trim_mdlog.h"
+#include "rgw_data_sync.h"
+#include "rgw_realm_watcher.h"
+#include "rgw_reshard.h"
+#include "rgw_cr_rados.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_zone_utils.h"
+#include "services/svc_quota.h"
+#include "services/svc_sync_modules.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_sys_obj_cache.h"
+#include "services/svc_bucket.h"
+#include "services/svc_mdlog.h"
+
+#include "compressor/Compressor.h"
+
+#include "rgw_d3n_datacache.h"
+
+#ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/rgw_rados.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+using namespace librados;
+
+#define ldout_bitx(_bitx, _dpp, _level) if(_bitx) { ldpp_dout(_dpp, 0) << "BITX: "
+#define ldout_bitx_c(_bitx, _ctx, _level) if(_bitx) { ldout(_ctx, 0) << "BITX: "
+#define dendl_bitx dendl ; }
+
+static string shadow_ns = "shadow";
+static string default_bucket_index_pool_suffix = "rgw.buckets.index";
+static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
+
+static RGWObjCategory main_category = RGWObjCategory::Main;
+#define RGW_USAGE_OBJ_PREFIX "usage."
+
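+// resolve this selection to a raw rados object: a raw selection passes through
+// unchanged, otherwise the logical object is mapped via its placement rule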
+rgw_raw_obj rgw_obj_select::get_raw_obj(rgw::sal::RadosStore* driver) const
+{
+ if (!is_raw) {
+ rgw_raw_obj r;
+ driver->get_raw_obj(placement_rule, obj, &r);
+ return r;
+ }
+ return raw_obj;
+}
+
+void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation* op)
+{
+ obj_version* check_objv = version_for_check();
+
+ if (check_objv) {
+ cls_version_check(*op, *check_objv, VER_COND_EQ);
+ }
+
+ cls_version_read(*op, &read_version);
+}
+
+void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
+{
+ obj_version* check_objv = version_for_check();
+ obj_version* modify_version = version_for_write();
+
+ if (check_objv) {
+ cls_version_check(*op, *check_objv, VER_COND_EQ);
+ }
+
+ if (modify_version) {
+ cls_version_set(*op, *modify_version);
+ } else {
+ cls_version_inc(*op);
+ }
+}
+
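+// called after a successful write: fold the write into read_version so the next
+// operation can recheck it (mirroring cls_version_inc()/cls_version_set() on the
+// OSD side), then reset write_version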
+void RGWObjVersionTracker::apply_write()
+{
+ const bool checked = (read_version.ver != 0);
+ const bool incremented = (write_version.ver == 0);
+
+ if (checked && incremented) {
+ // apply cls_version_inc() so our next operation can recheck it
+ ++read_version.ver;
+ } else {
+ read_version = write_version;
+ }
+ write_version = obj_version();
+}
+
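+// look up (or create) the cached state for an object: probe first under a shared
+// lock, and on a miss retake the lock exclusively so the entry can be inserted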
+RGWObjStateManifest *RGWObjectCtx::get_state(const rgw_obj& obj) {
+ RGWObjStateManifest *result;
+ typename std::map<rgw_obj, RGWObjStateManifest>::iterator iter;
+ lock.lock_shared();
+ assert (!obj.empty());
+ iter = objs_state.find(obj);
+ if (iter != objs_state.end()) {
+ result = &iter->second;
+ lock.unlock_shared();
+ } else {
+ lock.unlock_shared();
+ lock.lock();
+ result = &objs_state[obj];
+ lock.unlock();
+ }
+ return result;
+}
+
+void RGWObjectCtx::set_compressed(const rgw_obj& obj) {
+ std::unique_lock wl{lock};
+ assert (!obj.empty());
+ objs_state[obj].state.compressed = true;
+}
+
+void RGWObjectCtx::set_atomic(rgw_obj& obj) {
+ std::unique_lock wl{lock};
+ assert (!obj.empty());
+ objs_state[obj].state.is_atomic = true;
+}
+void RGWObjectCtx::set_prefetch_data(const rgw_obj& obj) {
+ std::unique_lock wl{lock};
+ assert (!obj.empty());
+ objs_state[obj].state.prefetch_data = true;
+}
+
+void RGWObjectCtx::invalidate(const rgw_obj& obj) {
+ std::unique_lock wl{lock};
+ auto iter = objs_state.find(obj);
+ if (iter == objs_state.end()) {
+ return;
+ }
+ bool is_atomic = iter->second.state.is_atomic;
+ bool prefetch_data = iter->second.state.prefetch_data;
+ bool compressed = iter->second.state.compressed;
+
+ objs_state.erase(iter);
+
+ if (is_atomic || prefetch_data) {
+ auto& sm = objs_state[obj];
+ sm.state.is_atomic = is_atomic;
+ sm.state.prefetch_data = prefetch_data;
+ sm.state.compressed = compressed;
+ }
+}
+
+class RGWMetaNotifierManager : public RGWCoroutinesManager {
+ RGWRados* store;
+ RGWHTTPManager http_manager;
+
+public:
+ RGWMetaNotifierManager(RGWRados *_driver) : RGWCoroutinesManager(_driver->ctx(), _driver->get_cr_registry()), store(_driver),
+ http_manager(store->ctx(), completion_mgr) {
+ http_manager.start();
+ }
+
+ int notify_all(const DoutPrefixProvider *dpp, map<rgw_zone_id, RGWRESTConn *>& conn_map, set<int>& shards) {
+ rgw_http_param_pair pairs[] = { { "type", "metadata" },
+ { "notify", NULL },
+ { NULL, NULL } };
+
+ list<RGWCoroutinesStack *> stacks;
+ for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
+ RGWRESTConn *conn = iter->second;
+ RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
+ stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
+
+ stacks.push_back(stack);
+ }
+ return run(dpp, stacks);
+ }
+};
+
+class RGWDataNotifierManager : public RGWCoroutinesManager {
+ RGWRados* store;
+ RGWHTTPManager http_manager;
+
+public:
+ RGWDataNotifierManager(RGWRados *_driver) : RGWCoroutinesManager(_driver->ctx(), _driver->get_cr_registry()), store(_driver),
+ http_manager(store->ctx(), completion_mgr) {
+ http_manager.start();
+ }
+
+ int notify_all(const DoutPrefixProvider *dpp, map<rgw_zone_id, RGWRESTConn *>& conn_map,
+ bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& shards) {
+
+ list<RGWCoroutinesStack *> stacks;
+ const char *source_zone = store->svc.zone->get_zone_params().get_id().c_str();
+ for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
+ RGWRESTConn *conn = iter->second;
+ RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
+ stack->call(new RGWDataPostNotifyCR(store, http_manager, shards, source_zone, conn));
+ stacks.push_back(stack);
+ }
+
+ return run(dpp, stacks);
+ }
+};
+
+/* class RGWRadosThread */
+
+void RGWRadosThread::start()
+{
+ worker = new Worker(cct, this);
+ worker->create(thread_name.c_str());
+}
+
+void RGWRadosThread::stop()
+{
+ down_flag = true;
+ stop_process();
+ if (worker) {
+ worker->signal();
+ worker->join();
+ }
+ delete worker;
+ worker = NULL;
+}
+
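+// worker loop: call process() once per interval, re-reading interval_msec() each
+// round so configuration changes take effect; an interval of zero means block
+// until explicitly signaled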
+void *RGWRadosThread::Worker::entry() {
+ uint64_t msec = processor->interval_msec();
+ auto interval = std::chrono::milliseconds(msec);
+
+ do {
+ auto start = ceph::real_clock::now();
+ int r = processor->process(this);
+ if (r < 0) {
+ ldpp_dout(this, 0) << "ERROR: processor->process() returned error r=" << r << dendl;
+ }
+
+ if (processor->going_down())
+ break;
+
+ auto end = ceph::real_clock::now() - start;
+
+ uint64_t cur_msec = processor->interval_msec();
+ if (cur_msec != msec) { /* was it reconfigured? */
+ msec = cur_msec;
+ interval = std::chrono::milliseconds(msec);
+ }
+
+ if (cur_msec > 0) {
+ if (interval <= end)
+ continue; // next round
+
+ auto wait_time = interval - end;
+ wait_interval(wait_time);
+ } else {
+ wait();
+ }
+ } while (!processor->going_down());
+
+ return NULL;
+}
+
+class RGWMetaNotifier : public RGWRadosThread {
+ RGWMetaNotifierManager notify_mgr;
+ RGWMetadataLog *const log;
+
+ uint64_t interval_msec() override {
+ return cct->_conf->rgw_md_notify_interval_msec;
+ }
+ void stop_process() override {
+ notify_mgr.stop();
+ }
+public:
+ RGWMetaNotifier(RGWRados *_driver, RGWMetadataLog* log)
+ : RGWRadosThread(_driver, "meta-notifier"), notify_mgr(_driver), log(log) {}
+
+ int process(const DoutPrefixProvider *dpp) override;
+};
+
+int RGWMetaNotifier::process(const DoutPrefixProvider *dpp)
+{
+ set<int> shards;
+
+ log->read_clear_modified(shards);
+
+ if (shards.empty()) {
+ return 0;
+ }
+
+ for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
+ ldpp_dout(dpp, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
+ }
+
+ notify_mgr.notify_all(dpp, store->svc.zone->get_zone_conn_map(), shards);
+
+ return 0;
+}
+
+class RGWDataNotifier : public RGWRadosThread {
+ RGWDataNotifierManager notify_mgr;
+ bc::flat_set<rgw_data_notify_entry> entry;
+
+ uint64_t interval_msec() override {
+ return cct->_conf.get_val<int64_t>("rgw_data_notify_interval_msec");
+ }
+ void stop_process() override {
+ notify_mgr.stop();
+ }
+public:
+ RGWDataNotifier(RGWRados *_driver) : RGWRadosThread(_driver, "data-notifier"), notify_mgr(_driver) {}
+
+ int process(const DoutPrefixProvider *dpp) override;
+};
+
+int RGWDataNotifier::process(const DoutPrefixProvider *dpp)
+{
+ auto data_log = store->svc.datalog_rados;
+ if (!data_log) {
+ return 0;
+ }
+
+ auto shards = data_log->read_clear_modified();
+
+ if (shards.empty()) {
+ return 0;
+ }
+
+ for (const auto& [shard_id, entries] : shards) {
+ for (const auto& entry : entries) {
+ ldpp_dout(dpp, 20) << __func__ << "(): notifying datalog change, shard_id="
+ << shard_id << ":" << entry.gen << ":" << entry.key << dendl;
+ }
+ }
+
+ notify_mgr.notify_all(dpp, store->svc.zone->get_zone_data_notify_to_map(), shards);
+
+ return 0;
+}
+
+class RGWSyncProcessorThread : public RGWRadosThread {
+public:
+ RGWSyncProcessorThread(RGWRados *_driver, const string& thread_name = "radosgw") : RGWRadosThread(_driver, thread_name) {}
+ RGWSyncProcessorThread(RGWRados *_driver) : RGWRadosThread(_driver) {}
+ ~RGWSyncProcessorThread() override {}
+ int init(const DoutPrefixProvider *dpp) override = 0 ;
+ int process(const DoutPrefixProvider *dpp) override = 0;
+};
+
+class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
+{
+ RGWMetaSyncStatusManager sync;
+
+ uint64_t interval_msec() override {
+ return 0; /* no interval associated, it'll run once until stopped */
+ }
+ void stop_process() override {
+ sync.stop();
+ }
+public:
+ RGWMetaSyncProcessorThread(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados)
+ : RGWSyncProcessorThread(_driver->getRados(), "meta-sync"), sync(_driver, async_rados) {}
+
+ void wakeup_sync_shards(set<int>& shard_ids) {
+ for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
+ sync.wakeup(*iter);
+ }
+ }
+ RGWMetaSyncStatusManager* get_manager() { return &sync; }
+
+ int init(const DoutPrefixProvider *dpp) override {
+ int ret = sync.init(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: sync.init() returned " << ret << dendl;
+ return ret;
+ }
+ return 0;
+ }
+
+ int process(const DoutPrefixProvider *dpp) override {
+ sync.run(dpp, null_yield);
+ return 0;
+ }
+};
+
+class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
+{
+ PerfCountersRef counters;
+ RGWDataSyncStatusManager sync;
+ bool initialized;
+
+ uint64_t interval_msec() override {
+ if (initialized) {
+ return 0; /* no interval associated, it'll run once until stopped */
+ } else {
+#define DATA_SYNC_INIT_WAIT_SEC 20
+ return DATA_SYNC_INIT_WAIT_SEC * 1000;
+ }
+ }
+ void stop_process() override {
+ sync.stop();
+ }
+public:
+ RGWDataSyncProcessorThread(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados,
+ const RGWZone* source_zone)
+ : RGWSyncProcessorThread(_driver->getRados(), "data-sync"),
+ counters(sync_counters::build(store->ctx(), std::string("data-sync-from-") + source_zone->name)),
+ sync(_driver, async_rados, source_zone->id, counters.get()),
+ initialized(false) {}
+
+ void wakeup_sync_shards(bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& entries) {
+ for (bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >::iterator iter = entries.begin(); iter != entries.end(); ++iter) {
+ sync.wakeup(iter->first, iter->second);
+ }
+ }
+
+ RGWDataSyncStatusManager* get_manager() { return &sync; }
+
+ int init(const DoutPrefixProvider *dpp) override {
+ return 0;
+ }
+
+ int process(const DoutPrefixProvider *dpp) override {
+ while (!initialized) {
+ if (going_down()) {
+ return 0;
+ }
+ int ret = sync.init(dpp);
+ if (ret >= 0) {
+ initialized = true;
+ break;
+ }
+ /* we'll be back! */
+ return 0;
+ }
+ sync.run(dpp);
+ return 0;
+ }
+};
+
+class RGWSyncLogTrimThread : public RGWSyncProcessorThread, DoutPrefixProvider
+{
+ RGWCoroutinesManager crs;
+ rgw::sal::RadosStore* store;
+ rgw::BucketTrimManager *bucket_trim;
+ RGWHTTPManager http;
+ const utime_t trim_interval;
+
+ uint64_t interval_msec() override { return 0; }
+ void stop_process() override { crs.stop(); }
+public:
+ RGWSyncLogTrimThread(rgw::sal::RadosStore* store, rgw::BucketTrimManager *bucket_trim,
+ int interval)
+ : RGWSyncProcessorThread(store->getRados(), "sync-log-trim"),
+ crs(store->ctx(), store->getRados()->get_cr_registry()), store(store),
+ bucket_trim(bucket_trim),
+ http(store->ctx(), crs.get_completion_mgr()),
+ trim_interval(interval, 0)
+ {}
+
+ int init(const DoutPrefixProvider *dpp) override {
+ return http.start();
+ }
+ int process(const DoutPrefixProvider *dpp) override {
+ list<RGWCoroutinesStack*> stacks;
+ auto metatrimcr = create_meta_log_trim_cr(this, static_cast<rgw::sal::RadosStore*>(store), &http,
+ cct->_conf->rgw_md_log_max_shards,
+ trim_interval);
+ if (!metatrimcr) {
+ ldpp_dout(dpp, -1) << "Bailing out of trim thread!" << dendl;
+ return -EINVAL;
+ }
+ auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
+ meta->call(metatrimcr);
+
+ stacks.push_back(meta);
+
+ if (store->svc()->zone->sync_module_exports_data()) {
+ auto data = new RGWCoroutinesStack(store->ctx(), &crs);
+ data->call(create_data_log_trim_cr(dpp, static_cast<rgw::sal::RadosStore*>(store), &http,
+ cct->_conf->rgw_data_log_num_shards,
+ trim_interval));
+ stacks.push_back(data);
+
+ auto bucket = new RGWCoroutinesStack(store->ctx(), &crs);
+ bucket->call(bucket_trim->create_bucket_trim_cr(&http));
+ stacks.push_back(bucket);
+ }
+
+ crs.run(dpp, stacks);
+ return 0;
+ }
+
+ // implements DoutPrefixProvider
+ CephContext *get_cct() const override { return store->ctx(); }
+ unsigned get_subsys() const override
+ {
+ return dout_subsys;
+ }
+
+ std::ostream& gen_prefix(std::ostream& out) const override
+ {
+ return out << "sync log trim: ";
+ }
+
+};
+
+void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
+{
+ std::lock_guard l{meta_sync_thread_lock};
+ if (meta_sync_processor_thread) {
+ meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
+ }
+}
+
+void RGWRados::wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& entries)
+{
+ ldpp_dout(dpp, 20) << __func__ << ": source_zone=" << source_zone << ", entries=" << entries << dendl;
+ for (bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >::iterator iter = entries.begin(); iter != entries.end(); ++iter) {
+ ldpp_dout(dpp, 20) << __func__ << "(): updated shard=" << iter->first << dendl;
+ bc::flat_set<rgw_data_notify_entry>& entries = iter->second;
+ for (const auto& [key, gen] : entries) {
+ ldpp_dout(dpp, 20) << __func__ << ": source_zone=" << source_zone << ", key=" << key
+ << ", gen=" << gen << dendl;
+ }
+ }
+
+ std::lock_guard l{data_sync_thread_lock};
+ auto iter = data_sync_processor_threads.find(source_zone);
+ if (iter == data_sync_processor_threads.end()) {
+ ldpp_dout(dpp, 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
+ return;
+ }
+
+ RGWDataSyncProcessorThread *thread = iter->second;
+ ceph_assert(thread);
+ thread->wakeup_sync_shards(entries);
+}
+
+RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
+{
+ std::lock_guard l{meta_sync_thread_lock};
+ if (meta_sync_processor_thread) {
+ return meta_sync_processor_thread->get_manager();
+ }
+ return nullptr;
+}
+
+RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const rgw_zone_id& source_zone)
+{
+ std::lock_guard l{data_sync_thread_lock};
+ auto thread = data_sync_processor_threads.find(source_zone);
+ if (thread == data_sync_processor_threads.end()) {
+ return nullptr;
+ }
+ return thread->second->get_manager();
+}
+
+int RGWRados::get_required_alignment(const DoutPrefixProvider *dpp, const rgw_pool& pool, uint64_t *alignment)
+{
+ IoCtx ioctx;
+ int r = open_pool_ctx(dpp, pool, ioctx, false);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
+ return r;
+ }
+
+ bool req;
+ r = ioctx.pool_requires_alignment2(&req);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
+ << r << dendl;
+ return r;
+ }
+
+ if (!req) {
+ *alignment = 0;
+ return 0;
+ }
+
+ uint64_t align;
+ r = ioctx.pool_required_alignment2(&align);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
+ << r << dendl;
+ return r;
+ }
+ if (align != 0) {
+ ldpp_dout(dpp, 20) << "required alignment=" << align << dendl;
+ }
+ *alignment = align;
+ return 0;
+}
+
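+// round size down to the largest multiple of alignment, but never below a single
+// alignment unit; e.g. size=10 MiB with alignment=4 MiB yields 8 MiB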
+void RGWRados::get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size)
+{
+ if (alignment == 0) {
+ *max_size = size;
+ return;
+ }
+
+ if (size <= alignment) {
+ *max_size = alignment;
+ return;
+ }
+
+ *max_size = size - (size % alignment);
+}
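+
+// a quick, illustrative sketch of the rounding above (values are
+// hypothetical, not defaults): with alignment = 4 MiB, a 10 MiB size is
+// rounded down to 8 MiB (size - size % alignment), while a 3 MiB size,
+// being smaller than one alignment unit, is rounded *up* to 4 MiB:
+//
+//   uint64_t out;
+//   store->get_max_aligned_size(10 << 20, 4 << 20, &out); // out == 8 << 20
+//   store->get_max_aligned_size( 3 << 20, 4 << 20, &out); // out == 4 << 20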
+
+int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment)
+{
+ uint64_t alignment;
+ int r = get_required_alignment(dpp, pool, &alignment);
+ if (r < 0) {
+ return r;
+ }
+
+ if (palignment) {
+ *palignment = alignment;
+ }
+
+ uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
+
+ get_max_aligned_size(config_chunk_size, alignment, max_chunk_size);
+
+ ldpp_dout(dpp, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
+
+ return 0;
+}
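+
+// e.g. (numbers illustrative only): with rgw_max_chunk_size = 4 MiB on a
+// replicated pool (alignment 0) the chunk size stays 4 MiB, while on an
+// EC pool reporting a 3 MiB required alignment it is rounded down to
+// 3 MiB by get_max_aligned_size() above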
+
+int RGWRados::get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj,
+ uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment)
+{
+ rgw_pool pool;
+ if (!get_obj_data_pool(placement_rule, obj, &pool)) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
+ return -EIO;
+ }
+ return get_max_chunk_size(pool, max_chunk_size, dpp, palignment);
+}
+
+void add_datalog_entry(const DoutPrefixProvider* dpp,
+ RGWDataChangesLog* datalog,
+ const RGWBucketInfo& bucket_info,
+ uint32_t shard_id)
+{
+ const auto& logs = bucket_info.layout.logs;
+ if (logs.empty()) {
+ return;
+ }
+ int r = datalog->add_entry(dpp, bucket_info, logs.back(), shard_id);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed writing data log" << dendl;
+ } // datalog error is not fatal
+}
+
+class RGWIndexCompletionManager;
+
+struct complete_op_data {
+ ceph::mutex lock = ceph::make_mutex("complete_op_data");
+ AioCompletion *rados_completion{nullptr};
+ int manager_shard_id{-1};
+ RGWIndexCompletionManager *manager{nullptr};
+ rgw_obj obj;
+ RGWModifyOp op;
+ string tag;
+ rgw_bucket_entry_ver ver;
+ cls_rgw_obj_key key;
+ rgw_bucket_dir_entry_meta dir_meta;
+ list<cls_rgw_obj_key> remove_objs;
+  bool log_op{false};
+  uint16_t bilog_op{0};
+ rgw_zone_set zones_trace;
+
+ bool stopped{false};
+
+ void stop() {
+ std::lock_guard l{lock};
+ stopped = true;
+ }
+};
+
+class RGWIndexCompletionManager {
+ RGWRados* const store;
+ const uint32_t num_shards;
+ ceph::containers::tiny_vector<ceph::mutex> locks;
+ std::vector<set<complete_op_data*>> completions;
+ std::vector<complete_op_data*> retry_completions;
+
+ std::condition_variable cond;
+ std::mutex retry_completions_lock;
+ bool _stop{false};
+ std::thread retry_thread;
+
+ // used to distribute the completions and the locks they use across
+ // their respective vectors; it will get incremented and can wrap
+ // around back to 0 without issue
+ std::atomic<uint32_t> cur_shard {0};
+
+ void process();
+
+ void add_completion(complete_op_data *completion);
+
+  void stop() {
+    if (retry_thread.joinable()) {
+      {
+        // set the flag under the lock so the retry thread cannot check
+        // the predicate and block between our store and the notify
+        std::lock_guard l{retry_completions_lock};
+        _stop = true;
+      }
+      cond.notify_all();
+      retry_thread.join();
+    }
+
+ for (uint32_t i = 0; i < num_shards; ++i) {
+ std::lock_guard l{locks[i]};
+ for (auto c : completions[i]) {
+ c->stop();
+ }
+ }
+ completions.clear();
+ }
+
+ uint32_t next_shard() {
+ return cur_shard++ % num_shards;
+ }
+
+public:
+ RGWIndexCompletionManager(RGWRados *_driver) :
+ store(_driver),
+ num_shards(store->ctx()->_conf->rgw_thread_pool_size),
+ locks{ceph::make_lock_container<ceph::mutex>(
+ num_shards,
+ [](const size_t i) {
+ return ceph::make_mutex("RGWIndexCompletionManager::lock::" +
+ std::to_string(i));
+ })},
+ completions(num_shards),
+ retry_thread(&RGWIndexCompletionManager::process, this)
+ {}
+
+ ~RGWIndexCompletionManager() {
+ stop();
+ }
+
+ void create_completion(const rgw_obj& obj,
+ RGWModifyOp op, string& tag,
+ rgw_bucket_entry_ver& ver,
+ const cls_rgw_obj_key& key,
+ rgw_bucket_dir_entry_meta& dir_meta,
+ list<cls_rgw_obj_key> *remove_objs, bool log_op,
+ uint16_t bilog_op,
+ rgw_zone_set *zones_trace,
+ complete_op_data **result);
+
+ bool handle_completion(completion_t cb, complete_op_data *arg);
+
+ CephContext* ctx() {
+ return store->ctx();
+ }
+};
+
+static void obj_complete_cb(completion_t cb, void *arg)
+{
+ complete_op_data *completion = reinterpret_cast<complete_op_data*>(arg);
+ completion->lock.lock();
+ if (completion->stopped) {
+ completion->lock.unlock(); /* can drop lock, no one else is referencing us */
+ delete completion;
+ return;
+ }
+ bool need_delete = completion->manager->handle_completion(cb, completion);
+ completion->lock.unlock();
+ if (need_delete) {
+ delete completion;
+ }
+}
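+
+// ownership sketch for the callback above: the completion normally
+// deletes itself here, unless handle_completion() re-queued it for a
+// retry (bucket is mid-reshard), in which case the retry thread in
+// RGWIndexCompletionManager::process() takes ownership via unique_ptr
+// and frees it after the retried index operation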
+
+void RGWIndexCompletionManager::process()
+{
+ DoutPrefix dpp(store->ctx(), dout_subsys, "rgw index completion thread: ");
+  while (!_stop) {
+ std::vector<complete_op_data*> comps;
+
+ {
+ std::unique_lock l{retry_completions_lock};
+ cond.wait(l, [this](){return _stop || !retry_completions.empty();});
+ if (_stop) {
+ return;
+ }
+ retry_completions.swap(comps);
+ }
+
+ for (auto c : comps) {
+ std::unique_ptr<complete_op_data> up{c};
+
+ ldpp_dout(&dpp, 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
+
+ RGWRados::BucketShard bs(store);
+ RGWBucketInfo bucket_info;
+
+ int r = bs.init(c->obj.bucket, c->obj, &bucket_info, &dpp);
+ if (r < 0) {
+ ldpp_dout(&dpp, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
+ /* not much to do */
+ continue;
+ }
+
+ r = store->guard_reshard(&dpp, &bs, c->obj, bucket_info,
+ [&](RGWRados::BucketShard *bs) -> int {
+ const bool bitx = ctx()->_conf->rgw_bucket_index_transaction_instrumentation;
+ ldout_bitx(bitx, &dpp, 10) <<
+ "ENTERING " << __func__ << ": bucket-shard=" << bs <<
+ " obj=" << c->obj << " tag=" << c->tag <<
+ " op=" << c->op << ", remove_objs=" << c->remove_objs << dendl_bitx;
+ ldout_bitx(bitx, &dpp, 25) <<
+ "BACKTRACE: " << __func__ << ": " << ClibBackTrace(1) << dendl_bitx;
+
+ librados::ObjectWriteOperation o;
+ o.assert_exists();
+ cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
+ cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
+ c->log_op, c->bilog_op, &c->zones_trace);
+ int ret = bs->bucket_obj.operate(&dpp, &o, null_yield);
+        ldout_bitx(bitx, &dpp, 10) <<
+          "EXITING " << __func__ << ": ret=" << ret << dendl_bitx;
+ return ret;
+ });
+ if (r < 0) {
+ ldpp_dout(&dpp, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
+ /* ignoring error, can't do anything about it */
+ continue;
+ }
+
+ add_datalog_entry(&dpp, store->svc.datalog_rados, bucket_info, bs.shard_id);
+ }
+ }
+}
+
+void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
+ RGWModifyOp op, string& tag,
+ rgw_bucket_entry_ver& ver,
+ const cls_rgw_obj_key& key,
+ rgw_bucket_dir_entry_meta& dir_meta,
+ list<cls_rgw_obj_key> *remove_objs, bool log_op,
+ uint16_t bilog_op,
+ rgw_zone_set *zones_trace,
+ complete_op_data **result)
+{
+ complete_op_data *entry = new complete_op_data;
+
+ int shard_id = next_shard();
+
+ entry->manager_shard_id = shard_id;
+ entry->manager = this;
+ entry->obj = obj;
+ entry->op = op;
+ entry->tag = tag;
+ entry->ver = ver;
+ entry->key = key;
+ entry->dir_meta = dir_meta;
+ entry->log_op = log_op;
+ entry->bilog_op = bilog_op;
+
+ if (remove_objs) {
+ for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
+ entry->remove_objs.push_back(*iter);
+ }
+ }
+
+ if (zones_trace) {
+ entry->zones_trace = *zones_trace;
+ } else {
+ entry->zones_trace.insert(store->svc.zone->get_zone().id, obj.bucket.get_key());
+ }
+
+ *result = entry;
+
+ entry->rados_completion = librados::Rados::aio_create_completion(entry, obj_complete_cb);
+
+ std::lock_guard l{locks[shard_id]};
+ const auto ok = completions[shard_id].insert(entry).second;
+ ceph_assert(ok);
+}
+
+void RGWIndexCompletionManager::add_completion(complete_op_data *completion) {
+ {
+ std::lock_guard l{retry_completions_lock};
+ retry_completions.push_back(completion);
+ }
+ cond.notify_all();
+}
+
+bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
+{
+ int shard_id = arg->manager_shard_id;
+ {
+ std::lock_guard l{locks[shard_id]};
+
+ auto& comps = completions[shard_id];
+
+ auto iter = comps.find(arg);
+ if (iter == comps.end()) {
+ ldout(arg->manager->ctx(), 0) << __func__ << "(): cannot find completion for obj=" << arg->key << dendl;
+ return true;
+ }
+
+ comps.erase(iter);
+ }
+
+ int r = rados_aio_get_return_value(cb);
+ if (r != -ERR_BUSY_RESHARDING) {
+ ldout(arg->manager->ctx(), 20) << __func__ << "(): completion " <<
+ (r == 0 ? "ok" : "failed with " + to_string(r)) <<
+ " for obj=" << arg->key << dendl;
+ return true;
+ }
+ add_completion(arg);
+ ldout(arg->manager->ctx(), 20) << __func__ << "(): async completion added for obj=" << arg->key << dendl;
+ return false;
+}
+
+void RGWRados::finalize()
+{
+ /* Before joining any sync threads, drain outstanding requests &
+ * mark the async_processor as going_down() */
+ if (svc.rados) {
+ svc.rados->stop_processor();
+ }
+
+ if (run_sync_thread) {
+ std::lock_guard l{meta_sync_thread_lock};
+ meta_sync_processor_thread->stop();
+
+ std::lock_guard dl{data_sync_thread_lock};
+    for (const auto& iter : data_sync_processor_threads) {
+ RGWDataSyncProcessorThread *thread = iter.second;
+ thread->stop();
+ }
+ if (sync_log_trimmer) {
+ sync_log_trimmer->stop();
+ }
+ }
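+  // the sync threads were stopped above; now free them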
+ if (run_sync_thread) {
+ delete meta_sync_processor_thread;
+ meta_sync_processor_thread = NULL;
+ std::lock_guard dl{data_sync_thread_lock};
+    for (const auto& iter : data_sync_processor_threads) {
+ RGWDataSyncProcessorThread *thread = iter.second;
+ delete thread;
+ }
+ data_sync_processor_threads.clear();
+ delete sync_log_trimmer;
+ sync_log_trimmer = nullptr;
+ bucket_trim = boost::none;
+ }
+ if (meta_notifier) {
+ meta_notifier->stop();
+ delete meta_notifier;
+ }
+ if (data_notifier) {
+ data_notifier->stop();
+ delete data_notifier;
+ }
+ delete sync_tracer;
+
+ delete lc;
+ lc = NULL;
+
+ delete gc;
+ gc = NULL;
+
+ delete obj_expirer;
+ obj_expirer = NULL;
+
+ RGWQuotaHandler::free_handler(quota_handler);
+ if (cr_registry) {
+ cr_registry->put();
+ }
+
+ svc.shutdown();
+
+ delete binfo_cache;
+ delete obj_tombstone_cache;
+ if (d3n_data_cache)
+ delete d3n_data_cache;
+
+ if (reshard_wait.get()) {
+ reshard_wait->stop();
+ reshard_wait.reset();
+ }
+
+ if (run_reshard_thread) {
+ reshard->stop_processor();
+ }
+ delete reshard;
+ delete index_completion_manager;
+
+ rgw::notify::shutdown();
+}
+
+/**
+ * Initialize the RADOS instance and prepare to do other ops
+ * Returns 0 on success, -ERR# on failure.
+ */
+int RGWRados::init_rados()
+{
+ int ret = 0;
+
+ ret = rados.init_with_context(cct);
+ if (ret < 0) {
+ return ret;
+ }
+ ret = rados.connect();
+ if (ret < 0) {
+ return ret;
+ }
+
+ auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
+ new RGWCoroutinesManagerRegistry(cct)};
+ ret = crs->hook_to_admin_command("cr dump");
+ if (ret < 0) {
+ return ret;
+ }
+
+ cr_registry = crs.release();
+
+ if (use_datacache) {
+ d3n_data_cache = new D3nDataCache();
+ d3n_data_cache->init(cct);
+ }
+
+ return ret;
+}
+
+int RGWRados::register_to_service_map(const DoutPrefixProvider *dpp, const string& daemon_type, const map<string, string>& meta)
+{
+ string name = cct->_conf->name.get_id();
+ if (name.compare(0, 4, "rgw.") == 0) {
+ name = name.substr(4);
+ }
+ map<string,string> metadata = meta;
+ metadata["num_handles"] = "1"s;
+ metadata["zonegroup_id"] = svc.zone->get_zonegroup().get_id();
+ metadata["zonegroup_name"] = svc.zone->get_zonegroup().get_name();
+ metadata["zone_name"] = svc.zone->zone_name();
+ metadata["zone_id"] = svc.zone->zone_id().id;
+ metadata["realm_name"] = svc.zone->get_realm().get_name();
+ metadata["realm_id"] = svc.zone->get_realm().get_id();
+ metadata["id"] = name;
+ int ret = rados.service_daemon_register(
+ daemon_type,
+ stringify(rados.get_instance_id()),
+ metadata);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWRados::update_service_map(const DoutPrefixProvider *dpp, std::map<std::string, std::string>&& status)
+{
+  int ret = rados.service_daemon_update_status(std::move(status));
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: service_daemon_update_status() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+/**
+ * Complete initialization: open the control pools and start the worker
+ * threads (GC, lifecycle, sync, reshard, notifications).
+ * Returns 0 on success, -ERR# on failure.
+ */
+int RGWRados::init_complete(const DoutPrefixProvider *dpp)
+{
+ int ret;
+
+ /*
+ * create sync module instance even if we don't run sync thread, might need it for radosgw-admin
+ */
+ sync_module = svc.sync_modules->get_sync_module();
+
+ ret = open_root_pool_ctx(dpp);
+ if (ret < 0)
+ return ret;
+
+ ret = open_gc_pool_ctx(dpp);
+ if (ret < 0)
+ return ret;
+
+ ret = open_lc_pool_ctx(dpp);
+ if (ret < 0)
+ return ret;
+
+ ret = open_objexp_pool_ctx(dpp);
+ if (ret < 0)
+ return ret;
+
+ ret = open_reshard_pool_ctx(dpp);
+ if (ret < 0)
+ return ret;
+
+ ret = open_notif_pool_ctx(dpp);
+ if (ret < 0)
+ return ret;
+
+ pools_initialized = true;
+
+ if (use_gc) {
+ gc = new RGWGC();
+ gc->initialize(cct, this);
+ } else {
+ ldpp_dout(dpp, 5) << "note: GC not initialized" << dendl;
+ }
+
+ obj_expirer = new RGWObjectExpirer(this->driver);
+
+ if (use_gc_thread && use_gc) {
+ gc->start_processor();
+ obj_expirer->start_processor();
+ }
+
+ auto& current_period = svc.zone->get_current_period();
+ auto& zonegroup = svc.zone->get_zonegroup();
+ auto& zone_params = svc.zone->get_zone_params();
+ auto& zone = svc.zone->get_zone();
+
+ /* no point of running sync thread if we don't have a master zone configured
+ or there is no rest_master_conn */
+ if (!svc.zone->need_to_sync()) {
+ run_sync_thread = false;
+ }
+
+ if (svc.zone->is_meta_master()) {
+ auto md_log = svc.mdlog->get_log(current_period.get_id());
+ meta_notifier = new RGWMetaNotifier(this, md_log);
+ meta_notifier->start();
+ }
+
+ /* init it anyway, might run sync through radosgw-admin explicitly */
+ sync_tracer = new RGWSyncTraceManager(cct, cct->_conf->rgw_sync_trace_history_size);
+ sync_tracer->init(this);
+ ret = sync_tracer->hook_to_admin_command();
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (run_sync_thread) {
+ for (const auto &pt: zonegroup.placement_targets) {
+ if (zone_params.placement_pools.find(pt.second.name)
+ == zone_params.placement_pools.end()){
+ ldpp_dout(dpp, 0) << "WARNING: This zone does not contain the placement target "
+ << pt.second.name << " present in zonegroup" << dendl;
+ }
+ }
+ auto async_processor = svc.rados->get_async_processor();
+ std::lock_guard l{meta_sync_thread_lock};
+ meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this->driver, async_processor);
+ ret = meta_sync_processor_thread->init(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
+ return ret;
+ }
+ meta_sync_processor_thread->start();
+
+ // configure the bucket trim manager
+ rgw::BucketTrimConfig config;
+ rgw::configure_bucket_trim(cct, config);
+
+ bucket_trim.emplace(this->driver, config);
+ ret = bucket_trim->init();
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start bucket trim manager" << dendl;
+ return ret;
+ }
+ svc.datalog_rados->set_observer(&*bucket_trim);
+
+ std::lock_guard dl{data_sync_thread_lock};
+ for (auto source_zone : svc.zone->get_data_sync_source_zones()) {
+ ldpp_dout(dpp, 5) << "starting data sync thread for zone " << source_zone->name << dendl;
+ auto *thread = new RGWDataSyncProcessorThread(this->driver, svc.rados->get_async_processor(), source_zone);
+ ret = thread->init(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to initialize data sync thread" << dendl;
+ return ret;
+ }
+ thread->start();
+ data_sync_processor_threads[rgw_zone_id(source_zone->id)] = thread;
+ }
+ auto interval = cct->_conf->rgw_sync_log_trim_interval;
+ if (interval > 0) {
+ sync_log_trimmer = new RGWSyncLogTrimThread(this->driver, &*bucket_trim, interval);
+ ret = sync_log_trimmer->init(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
+ return ret;
+ }
+ sync_log_trimmer->start();
+ }
+ }
+ if (cct->_conf->rgw_data_notify_interval_msec) {
+ data_notifier = new RGWDataNotifier(this);
+ data_notifier->start();
+ }
+
+ binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
+ binfo_cache->init(svc.cache);
+
+ lc = new RGWLC();
+ lc->initialize(cct, this->driver);
+
+ if (use_lc_thread)
+ lc->start_processor();
+
+ quota_handler = RGWQuotaHandler::generate_handler(dpp, this->driver, quota_threads);
+
+ bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
+ zone.bucket_index_max_shards);
+ if (bucket_index_max_shards > get_max_bucket_shards()) {
+ bucket_index_max_shards = get_max_bucket_shards();
+ ldpp_dout(dpp, 1) << __func__ << " bucket index max shards is too large, reset to value: "
+ << get_max_bucket_shards() << dendl;
+ }
+ ldpp_dout(dpp, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;
+
+ bool need_tombstone_cache = !svc.zone->get_zone_data_notify_to_map().empty(); /* have zones syncing from us */
+
+ if (need_tombstone_cache) {
+ obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
+ }
+
+ reshard_wait = std::make_shared<RGWReshardWait>();
+
+ reshard = new RGWReshard(this->driver);
+
+ // disable reshard thread based on zone/zonegroup support
+ run_reshard_thread = run_reshard_thread && svc.zone->can_reshard();
+
+ if (run_reshard_thread) {
+ reshard->start_processor();
+ }
+
+ index_completion_manager = new RGWIndexCompletionManager(this);
+ ret = rgw::notify::init(cct, driver, dpp);
+  if (ret < 0) {
+ ldpp_dout(dpp, 1) << "ERROR: failed to initialize notification manager" << dendl;
+ }
+
+ return ret;
+}
+
+int RGWRados::init_svc(bool raw, const DoutPrefixProvider *dpp)
+{
+ if (raw) {
+ return svc.init_raw(cct, use_cache, null_yield, dpp);
+ }
+
+ return svc.init(cct, use_cache, run_sync_thread, null_yield, dpp);
+}
+
+int RGWRados::init_ctl(const DoutPrefixProvider *dpp)
+{
+ return ctl.init(&svc, driver, dpp);
+}
+
+/**
+ * Begin initialization: initialize the services and ctls, then connect
+ * to RADOS. Returns 0 on success, -ERR# on failure.
+ */
+int RGWRados::init_begin(const DoutPrefixProvider *dpp)
+{
+ int ret;
+
+ inject_notify_timeout_probability =
+ cct->_conf.get_val<double>("rgw_inject_notify_timeout_probability");
+ max_notify_retries = cct->_conf.get_val<uint64_t>("rgw_max_notify_retries");
+
+ ret = init_svc(false, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl;
+ return ret;
+ }
+
+ ret = init_ctl(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to init ctls (ret=" << cpp_strerror(-ret) << ")" << dendl;
+ return ret;
+ }
+
+ host_id = svc.zone_utils->gen_host_id();
+
+ return init_rados();
+}
+
+/**
+ * Open the pool used as root for this gateway
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+int RGWRados::open_root_pool_ctx(const DoutPrefixProvider *dpp)
+{
+ return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().domain_root, root_pool_ctx, true, true);
+}
+
+int RGWRados::open_gc_pool_ctx(const DoutPrefixProvider *dpp)
+{
+ return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().gc_pool, gc_pool_ctx, true, true);
+}
+
+int RGWRados::open_lc_pool_ctx(const DoutPrefixProvider *dpp)
+{
+ return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().lc_pool, lc_pool_ctx, true, true);
+}
+
+int RGWRados::open_objexp_pool_ctx(const DoutPrefixProvider *dpp)
+{
+ return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, objexp_pool_ctx, true, true);
+}
+
+int RGWRados::open_reshard_pool_ctx(const DoutPrefixProvider *dpp)
+{
+ return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().reshard_pool, reshard_pool_ctx, true, true);
+}
+
+int RGWRados::open_notif_pool_ctx(const DoutPrefixProvider *dpp)
+{
+ return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().notif_pool, notif_pool_ctx, true, true);
+}
+
+int RGWRados::open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx& io_ctx,
+ bool mostly_omap)
+{
+ constexpr bool create = true; // create the pool if it doesn't exist
+ return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create, mostly_omap);
+}
+
+/**** logs ****/
+
+struct log_list_state {
+ string prefix;
+ librados::IoCtx io_ctx;
+ librados::NObjectIterator obit;
+};
+
+int RGWRados::log_list_init(const DoutPrefixProvider *dpp, const string& prefix, RGWAccessHandle *handle)
+{
+ log_list_state *state = new log_list_state;
+ int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
+ if (r < 0) {
+ delete state;
+ return r;
+ }
+ state->prefix = prefix;
+ state->obit = state->io_ctx.nobjects_begin();
+ *handle = (RGWAccessHandle)state;
+ return 0;
+}
+
+int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
+{
+ log_list_state *state = static_cast<log_list_state *>(handle);
+ while (true) {
+ if (state->obit == state->io_ctx.nobjects_end()) {
+ delete state;
+ return -ENOENT;
+ }
+ if (state->prefix.length() &&
+ state->obit->get_oid().find(state->prefix) != 0) {
+ state->obit++;
+ continue;
+ }
+ *name = state->obit->get_oid();
+ state->obit++;
+ break;
+ }
+ return 0;
+}
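+
+// usage sketch for the listing API above (hypothetical caller, error
+// handling elided); note the handle is consumed by iterating until
+// -ENOENT, at which point the internal state has already been freed:
+//
+//   RGWAccessHandle h;
+//   if (store->log_list_init(dpp, prefix, &h) == 0) {
+//     std::string name;
+//     while (store->log_list_next(h, &name) == 0) {
+//       /* process name */
+//     } // -ENOENT freed the state; do not reuse h
+//   }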
+
+int RGWRados::log_remove(const DoutPrefixProvider *dpp, const string& name)
+{
+ librados::IoCtx io_ctx;
+ int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
+ if (r < 0)
+ return r;
+ return io_ctx.remove(name);
+}
+
+struct log_show_state {
+ librados::IoCtx io_ctx;
+ bufferlist bl;
+ bufferlist::const_iterator p;
+ string name;
+ uint64_t pos;
+ bool eof;
+ log_show_state() : pos(0), eof(false) {}
+};
+
+int RGWRados::log_show_init(const DoutPrefixProvider *dpp, const string& name, RGWAccessHandle *handle)
+{
+ log_show_state *state = new log_show_state;
+ int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
+ if (r < 0) {
+ delete state;
+ return r;
+ }
+ state->name = name;
+ *handle = (RGWAccessHandle)state;
+ return 0;
+}
+
+int RGWRados::log_show_next(const DoutPrefixProvider *dpp, RGWAccessHandle handle, rgw_log_entry *entry)
+{
+ log_show_state *state = static_cast<log_show_state *>(handle);
+ off_t off = state->p.get_off();
+
+ ldpp_dout(dpp, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
+ << " off " << off
+ << " eof " << (int)state->eof
+ << dendl;
+ // read some?
+ unsigned chunk = 1024*1024;
+ if ((state->bl.length() - off) < chunk/2 && !state->eof) {
+ bufferlist more;
+ int r = state->io_ctx.read(state->name, more, chunk, state->pos);
+ if (r < 0)
+ return r;
+ state->pos += r;
+ bufferlist old;
+ try {
+ old.substr_of(state->bl, off, state->bl.length() - off);
+ } catch (buffer::error& err) {
+ return -EINVAL;
+ }
+ state->bl = std::move(old);
+ state->bl.claim_append(more);
+ state->p = state->bl.cbegin();
+ if ((unsigned)r < chunk)
+ state->eof = true;
+ ldpp_dout(dpp, 10) << " read " << r << dendl;
+ }
+
+ if (state->p.end())
+ return 0; // end of file
+ try {
+ decode(*entry, state->p);
+ }
+ catch (const buffer::error &e) {
+ return -EINVAL;
+ }
+ return 1;
+}
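+
+// return-value contract for log_show_next(), per the code above:
+// 1 == one entry decoded into *entry, 0 == clean end of log, <0 == error;
+// a hypothetical caller therefore loops while the result is 1:
+//
+//   rgw_log_entry entry;
+//   int r;
+//   while ((r = store->log_show_next(dpp, handle, &entry)) == 1) {
+//     /* process entry */
+//   }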
+
+/**
+ * usage_log_hash: get usage log key hash, based on name and index
+ *
+ * Get the usage object name. Since a user may have more than one
+ * object holding that info (multiple shards), we use the index to
+ * specify the shard number. Once the index exceeds the max number of
+ * shards it wraps.
+ * If name is not set, results for all users will be returned and the
+ * index will wrap only after the total number of shards.
+ *
+ * @param cct [in] ceph context
+ * @param name [in] user name
+ * @param hash [out] hash value
+ * @param index [in] shard index number
+ */
+static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
+{
+ uint32_t val = index;
+
+ if (!name.empty()) {
+ int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
+ val %= max_user_shards;
+ val += ceph_str_hash_linux(name.c_str(), name.size());
+ }
+ char buf[17];
+ int max_shards = cct->_conf->rgw_usage_max_shards;
+ snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
+ hash = buf;
+}
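+
+// a hedged example of the sharding above (32 and 1 are the usual
+// defaults for rgw_usage_max_shards and rgw_usage_max_user_shards, but
+// treat them as assumptions): with max_user_shards = 1 the index
+// collapses to 0 before the name hash is added, so a given user's
+// records always land on the same usage shard; with an empty name the
+// object is simply RGW_USAGE_OBJ_PREFIX plus index % 32, so the index
+// wraps through all 32 shards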
+
+int RGWRados::log_usage(const DoutPrefixProvider *dpp, map<rgw_user_bucket, RGWUsageBatch>& usage_info)
+{
+ uint32_t index = 0;
+
+ map<string, rgw_usage_log_info> log_objs;
+
+ string hash;
+ string last_user;
+
+ /* restructure usage map, zone by object hash */
+ map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
+ for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
+ const rgw_user_bucket& ub = iter->first;
+ RGWUsageBatch& info = iter->second;
+
+ if (ub.user.empty()) {
+ ldpp_dout(dpp, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
+ continue;
+ }
+
+ if (ub.user != last_user) {
+      /* the index *should* be random, but why waste extra cycles?
+         in most cases max user shards will not exceed 1,
+         so just increment it */
+ usage_log_hash(cct, ub.user, hash, index++);
+ }
+ last_user = ub.user;
+ vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
+
+ for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
+ v.push_back(miter->second);
+ }
+ }
+
+ map<string, rgw_usage_log_info>::iterator liter;
+
+ for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
+ int r = cls_obj_usage_log_add(dpp, liter->first, liter->second);
+ if (r < 0)
+ return r;
+ }
+ return 0;
+}
+
+int RGWRados::read_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch,
+ uint32_t max_entries, bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket,
+ rgw_usage_log_entry>& usage)
+{
+ uint32_t num = max_entries;
+ string hash, first_hash;
+ string user_str = user.to_str();
+ usage_log_hash(cct, user_str, first_hash, 0);
+
+ if (usage_iter.index) {
+ usage_log_hash(cct, user_str, hash, usage_iter.index);
+ } else {
+ hash = first_hash;
+ }
+
+ usage.clear();
+
+ do {
+ map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
+ map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
+
+ int ret = cls_obj_usage_log_read(dpp, hash, user_str, bucket_name, start_epoch, end_epoch, num,
+ usage_iter.read_iter, ret_usage, is_truncated);
+ if (ret == -ENOENT)
+ goto next;
+
+ if (ret < 0)
+ return ret;
+
+ num -= ret_usage.size();
+
+ for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
+ usage[iter->first].aggregate(iter->second);
+ }
+
+next:
+ if (!*is_truncated) {
+ usage_iter.read_iter.clear();
+ usage_log_hash(cct, user_str, hash, ++usage_iter.index);
+ }
+ } while (num && !*is_truncated && hash != first_hash);
+ return 0;
+}
+
+int RGWRados::trim_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch)
+{
+ uint32_t index = 0;
+ string hash, first_hash;
+ string user_str = user.to_str();
+ usage_log_hash(cct, user_str, first_hash, index);
+
+ hash = first_hash;
+ do {
+ int ret = cls_obj_usage_log_trim(dpp, hash, user_str, bucket_name, start_epoch, end_epoch);
+
+ if (ret < 0 && ret != -ENOENT)
+ return ret;
+
+ usage_log_hash(cct, user_str, hash, ++index);
+ } while (hash != first_hash);
+
+ return 0;
+}
+
+
+int RGWRados::clear_usage(const DoutPrefixProvider *dpp)
+{
+ auto max_shards = cct->_conf->rgw_usage_max_shards;
+  int ret = 0;
+  for (unsigned i = 0; i < max_shards; i++) {
+    string oid = RGW_USAGE_OBJ_PREFIX + to_string(i);
+    ret = cls_obj_usage_log_clear(dpp, oid);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "usage clear on oid=" << oid << " failed with ret=" << ret << dendl;
+      return ret;
+    }
+ }
+ return ret;
+}
+
+int RGWRados::decode_policy(const DoutPrefixProvider *dpp,
+ ceph::buffer::list& bl,
+ ACLOwner *owner)
+{
+ auto i = bl.cbegin();
+ RGWAccessControlPolicy policy(cct);
+ try {
+ policy.decode_owner(i);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
+ return -EIO;
+ }
+ *owner = policy.get_owner();
+ return 0;
+}
+
+int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id, const DoutPrefixProvider *dpp)
+{
+ rgw_bucket bucket = bucket_info.bucket;
+ bucket.update_bucket_id(new_bucket_id);
+
+ bucket_info.objv_tracker.clear();
+ int ret = store->get_bucket_instance_info(bucket, bucket_info, nullptr, nullptr, null_yield, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return 0;
+}
+
+
+/**
+ * Get ordered listing of the objects in a bucket.
+ *
+ * max_p: maximum number of results to return
+ * bucket: bucket to list contents of
+ * prefix: only return results that match this prefix
+ * delim: delimiter; entries whose names contain it after the prefix are
+ *     not returned individually. Any such skipped results will have the
+ *     matching portion of their name inserted in common_prefixes with a
+ *     "true" mark.
+ * marker: if filled in, begin the listing with this object.
+ * end_marker: if filled in, end the listing with this object.
+ * result: the objects are put in here.
+ * common_prefixes: if delim is filled in, any matching prefixes are
+ * placed here.
+ * is_truncated: if number of objects in the bucket is bigger than
+ * max, then truncated.
+ */
+int RGWRados::Bucket::List::list_objects_ordered(
+ const DoutPrefixProvider *dpp,
+ int64_t max_p,
+ std::vector<rgw_bucket_dir_entry> *result,
+ std::map<std::string, bool> *common_prefixes,
+ bool *is_truncated,
+ optional_yield y)
+{
+ RGWRados *store = target->get_store();
+ CephContext *cct = store->ctx();
+ int shard_id = target->get_shard_id();
+ const auto& current_index = target->get_bucket_info().layout.current_index;
+
+ int count = 0;
+ bool truncated = true;
+ bool cls_filtered = false;
+ const int64_t max = // protect against memory issues and negative vals
+ std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
+ int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead, max);
+
+ result->clear();
+
+ // use a local marker; either the marker will have a previous entry
+ // or it will be empty; either way it's OK to copy
+ rgw_obj_key marker_obj(params.marker.name,
+ params.marker.instance,
+ params.ns.empty() ? params.marker.ns : params.ns);
+ rgw_obj_index_key cur_marker;
+ marker_obj.get_index_key(&cur_marker);
+
+ rgw_obj_key end_marker_obj(params.end_marker.name,
+ params.end_marker.instance,
+ params.ns.empty() ? params.end_marker.ns : params.ns);
+ rgw_obj_index_key cur_end_marker;
+ end_marker_obj.get_index_key(&cur_end_marker);
+ const bool cur_end_marker_valid = !params.end_marker.empty();
+
+ rgw_obj_key prefix_obj(params.prefix);
+ prefix_obj.set_ns(params.ns);
+ std::string cur_prefix = prefix_obj.get_index_key_name();
+ std::string after_delim_s; /* needed in !params.delim.empty() AND later */
+
+ if (!params.delim.empty()) {
+ after_delim_s = cls_rgw_after_delim(params.delim);
+ /* if marker points at a common prefix, fast forward it into its
+ * upper bound string */
+ int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
+ if (delim_pos >= 0) {
+ string s = cur_marker.name.substr(0, delim_pos);
+ s.append(after_delim_s);
+ cur_marker = s;
+ }
+ }
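+  // to illustrate the fast-forward above (names hypothetical): with
+  // delim "/", an empty prefix, and marker "photos/2021/a.jpg", the
+  // portion up to the delimiter ("photos") has already been folded into
+  // a common prefix, so the marker becomes "photos" + after_delim_s,
+  // i.e. just past every key beginning with "photos/"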
+
+  // we'll stop after this many attempts as long as we can return at
+  // least one entry; if we still have nothing to return, we keep going
+  // past this limit until we can return at least one entry
+ constexpr uint16_t SOFT_MAX_ATTEMPTS = 8;
+
+ rgw_obj_index_key prev_marker;
+ for (uint16_t attempt = 1; /* empty */; ++attempt) {
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": starting attempt " << attempt << dendl;
+
+ if (attempt > 1 && !(prev_marker < cur_marker)) {
+ // we've failed to make forward progress
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ " marker failed to make forward progress; attempt=" << attempt <<
+ ", prev_marker=" << prev_marker <<
+ ", cur_marker=" << cur_marker << dendl;
+ break;
+ }
+ prev_marker = cur_marker;
+
+ ent_map_t ent_map;
+ ent_map.reserve(read_ahead);
+ int r = store->cls_bucket_list_ordered(dpp,
+ target->get_bucket_info(),
+ current_index,
+ shard_id,
+ cur_marker,
+ cur_prefix,
+ params.delim,
+ read_ahead + 1 - count,
+ params.list_versions,
+ attempt,
+ ent_map,
+ &truncated,
+ &cls_filtered,
+ &cur_marker,
+ y,
+ params.force_check_filter);
+ if (r < 0) {
+ return r;
+ }
+
+ for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
+ rgw_bucket_dir_entry& entry = eiter->second;
+ rgw_obj_index_key index_key = entry.key;
+ rgw_obj_key obj(index_key);
+
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": considering entry " << entry.key << dendl;
+
+ /* note that parse_raw_oid() here will not set the correct
+ * object's instance, as rgw_obj_index_key encodes that
+ * separately. We don't need to set the instance because it's
+ * not needed for the checks here and we end up using the raw
+ * entry for the return vector
+ */
+ bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
+ if (!valid) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ " could not parse object name: " << obj.name << dendl;
+ continue;
+ }
+
+ bool matched_ns = (obj.ns == params.ns);
+ if (!params.list_versions && !entry.is_visible()) {
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": skipping not visible entry \"" << entry.key << "\"" << dendl;
+ continue;
+ }
+
+ if (params.enforce_ns && !matched_ns) {
+ if (!params.ns.empty()) {
+ /* we've iterated past the namespace we're searching -- done now */
+ truncated = false;
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": finished due to getting past requested namespace \"" <<
+ params.ns << "\"" << dendl;
+ goto done;
+ }
+
+ /* we're skipping past namespaced objects */
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": skipping past namespaced objects, including \"" << entry.key <<
+ "\"" << dendl;
+ continue;
+ }
+
+ if (cur_end_marker_valid && cur_end_marker <= index_key) {
+ truncated = false;
+        ldpp_dout(dpp, 10) << __func__ <<
+          ": finished due to hitting end marker of \"" << cur_end_marker <<
+          "\" with \"" << entry.key << "\"" << dendl;
+ goto done;
+ }
+
+ if (count < max) {
+ params.marker = index_key;
+ next_marker = index_key;
+ }
+
+ if (params.access_list_filter &&
+ ! params.access_list_filter->filter(obj.name, index_key.name)) {
+        ldpp_dout(dpp, 20) << __func__ <<
+          ": skipping entry \"" << entry.key <<
+          "\" because it's filtered out by the access list filter" << dendl;
+ continue;
+ }
+
+ if (params.prefix.size() &&
+ 0 != obj.name.compare(0, params.prefix.size(), params.prefix)) {
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": skipping object \"" << entry.key <<
+ "\" that doesn't match prefix \"" << params.prefix << "\"" << dendl;
+ continue;
+ }
+
+ if (!params.delim.empty()) {
+ const int delim_pos = obj.name.find(params.delim, params.prefix.size());
+ if (delim_pos >= 0) {
+ // run either the code where delimiter filtering is done a)
+ // in the OSD/CLS or b) here.
+ if (cls_filtered) {
+          // NOTE: this branch is for newer versions of the OSD that do
+          // the filtering on the CLS side; in that case, if there is
+          // any delimiter after the prefix, it should only be found
+          // once and only at the end of the name
+ if (delim_pos !=
+ int(obj.name.length() - params.delim.length())) {
+ ldpp_dout(dpp, 0) << "WARNING: " << __func__ <<
+ " found delimiter in place other than the end of "
+ "the prefix; obj.name=" << obj.name <<
+ ", prefix=" << params.prefix << dendl;
+ }
+ if (common_prefixes) {
+ if (count >= max) {
+ truncated = true;
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": stopping early with common prefix \"" << entry.key <<
+ "\" because requested number (" << max <<
+ ") reached (cls filtered)" << dendl;
+ goto done;
+ }
+
+ (*common_prefixes)[obj.name] = true;
+ count++;
+ }
+
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": finished entry with common prefix \"" << entry.key <<
+ "\" so continuing loop (cls filtered)" << dendl;
+ continue;
+ } else {
+ // NOTE: this condition is for older versions of the OSD
+ // that do not filter on the CLS side, so the following code
+ // must do the filtering; once we reach version 16 of ceph,
+ // this code can be removed along with the conditional that
+ // can lead this way
+
+          /* extract key (with trailing delimiter) for CommonPrefix */
+ string prefix_key =
+ obj.name.substr(0, delim_pos + params.delim.length());
+
+ if (common_prefixes &&
+ common_prefixes->find(prefix_key) == common_prefixes->end()) {
+ if (count >= max) {
+ truncated = true;
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": stopping early with common prefix \"" << entry.key <<
+ "\" because requested number (" << max <<
+ ") reached (not cls filtered)" << dendl;
+ goto done;
+ }
+ next_marker = prefix_key;
+ (*common_prefixes)[prefix_key] = true;
+
+ count++;
+ }
+
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": finished entry with common prefix \"" << entry.key <<
+ "\" so continuing loop (not cls filtered)" << dendl;
+ continue;
+ } // if we're running an older OSD version
+ } // if a delimiter was found after prefix
+ } // if a delimiter was passed in
+
+ if (count >= max) {
+ truncated = true;
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": stopping early with entry \"" << entry.key <<
+ "\" because requested number (" << max <<
+ ") reached" << dendl;
+ goto done;
+ }
+
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": adding entry " << entry.key << " to result" << dendl;
+
+ result->emplace_back(std::move(entry));
+ count++;
+ } // eiter for loop
+
+ // NOTE: the following conditional is needed by older versions of
+ // the OSD that don't do delimiter filtering on the CLS side; once
+ // we reach version 16 of ceph, the following conditional and the
+ // code within can be removed
+ if (!cls_filtered && !params.delim.empty()) {
+ int marker_delim_pos =
+ cur_marker.name.find(params.delim, cur_prefix.size());
+ if (marker_delim_pos >= 0) {
+ std::string skip_after_delim =
+ cur_marker.name.substr(0, marker_delim_pos);
+ skip_after_delim.append(after_delim_s);
+
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": skip_after_delim=" << skip_after_delim << dendl;
+
+ if (skip_after_delim > cur_marker.name) {
+ cur_marker = skip_after_delim;
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": setting cur_marker=" << cur_marker.name <<
+ "[" << cur_marker.instance << "]" << dendl;
+ }
+ }
+ } // if older osd didn't do delimiter filtering
+
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": end of outer loop, truncated=" << truncated <<
+ ", count=" << count << ", attempt=" << attempt << dendl;
+
+ if (!truncated || count >= (max + 1) / 2) {
+ // if we finished listing, or if we're returning at least half the
+ // requested entries, that's enough; S3 and swift protocols allow
+ // returning fewer than max entries
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": exiting attempt loop because we reached end (" << truncated <<
+ ") or we're returning half the requested entries (" << count <<
+ " of " << max << ")" << dendl;
+ break;
+    } else if (attempt > SOFT_MAX_ATTEMPTS && count >= 1) {
+      // if we've exceeded the soft maximum number of attempts (8) and
+      // we have some, but very few, results, return with what we have
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": exiting attempt loop because we made " << attempt <<
+ " attempts and we're returning " << count << " entries" << dendl;
+ break;
+ }
+ } // for (uint16_t attempt...
+
+done:
+
+ if (is_truncated) {
+ *is_truncated = truncated;
+ }
+
+ return 0;
+} // list_objects_ordered
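+
+// a minimal usage sketch for the ordered listing above (hypothetical
+// caller, error handling elided):
+//
+//   RGWRados::Bucket target(store, bucket_info);
+//   RGWRados::Bucket::List list_op(&target);
+//   list_op.params.prefix = "photos/";
+//   list_op.params.delim = "/";
+//   std::vector<rgw_bucket_dir_entry> results;
+//   std::map<std::string, bool> prefixes;
+//   bool truncated = false;
+//   int r = list_op.list_objects_ordered(dpp, 1000, &results, &prefixes,
+//                                        &truncated, null_yield);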
+
+
+/**
+ * Get listing of the objects in a bucket and allow the results to be out
+ * of order.
+ *
+ * Even though there are key differences with the ordered counterpart,
+ * the parameters are the same to maintain some compatibility.
+ *
+ * max: maximum number of results to return
+ * bucket: bucket to list contents of
+ * prefix: only return results that match this prefix
+ * delim: should not be set; if it is we should have indicated an error
+ * marker: if filled in, begin the listing with this object.
+ * end_marker: if filled in, end the listing with this object.
+ * result: the objects are put in here.
+ * common_prefixes: this is never filled with an unordered list; the param
+ * is maintained for compatibility
+ * is_truncated: if number of objects in the bucket is bigger than max, then
+ * truncated.
+ */
+int RGWRados::Bucket::List::list_objects_unordered(const DoutPrefixProvider *dpp,
+ int64_t max_p,
+ std::vector<rgw_bucket_dir_entry>* result,
+ std::map<std::string, bool>* common_prefixes,
+ bool* is_truncated,
+ optional_yield y)
+{
+ RGWRados *store = target->get_store();
+ int shard_id = target->get_shard_id();
+ const auto& current_index = target->get_bucket_info().layout.current_index;
+
+ int count = 0;
+ bool truncated = true;
+
+ const int64_t max = // protect against memory issues and negative vals
+ std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
+
+ // read a few extra in each call to cls_bucket_list_unordered in
+ // case some are filtered out due to namespace matching, versioning,
+ // filtering, etc.
+ const int64_t max_read_ahead = 100;
+ const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead));
+
+ result->clear();
+
+ // use a local marker; either the marker will have a previous entry
+ // or it will be empty; either way it's OK to copy
+ rgw_obj_key marker_obj(params.marker.name,
+ params.marker.instance,
+ params.ns.empty() ? params.marker.ns : params.ns);
+ rgw_obj_index_key cur_marker;
+ marker_obj.get_index_key(&cur_marker);
+
+ rgw_obj_key end_marker_obj(params.end_marker.name,
+ params.end_marker.instance,
+ params.ns.empty() ? params.end_marker.ns : params.ns);
+ rgw_obj_index_key cur_end_marker;
+ end_marker_obj.get_index_key(&cur_end_marker);
+ const bool cur_end_marker_valid = !params.end_marker.empty();
+
+ rgw_obj_key prefix_obj(params.prefix);
+ prefix_obj.set_ns(params.ns);
+ std::string cur_prefix = prefix_obj.get_index_key_name();
+
+ while (truncated && count <= max) {
+ std::vector<rgw_bucket_dir_entry> ent_list;
+ ent_list.reserve(read_ahead);
+
+ int r = store->cls_bucket_list_unordered(dpp,
+ target->get_bucket_info(),
+ current_index,
+ shard_id,
+ cur_marker,
+ cur_prefix,
+ read_ahead,
+ params.list_versions,
+ ent_list,
+ &truncated,
+ &cur_marker,
+ y);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ " cls_bucket_list_unordered returned " << r << " for " <<
+ target->get_bucket_info().bucket << dendl;
+ return r;
+ }
+
+ // NB: while regions of ent_list will be sorted, we have no
+ // guarantee that all items will be sorted since they can cross
+ // shard boundaries
+
+ for (auto& entry : ent_list) {
+ rgw_obj_index_key index_key = entry.key;
+ rgw_obj_key obj(index_key);
+
+ if (count < max) {
+ params.marker.set(index_key);
+ next_marker.set(index_key);
+ }
+
+ /* note that parse_raw_oid() here will not set the correct
+ * object's instance, as rgw_obj_index_key encodes that
+ * separately. We don't need to set the instance because it's
+ * not needed for the checks here and we end up using the raw
+ * entry for the return vector
+ */
+ bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
+ if (!valid) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ " could not parse object name: " << obj.name << dendl;
+ continue;
+ }
+
+ if (!params.list_versions && !entry.is_visible()) {
+        ldpp_dout(dpp, 20) << __func__ <<
+          ": skipping \"" << index_key <<
+          "\" because not listing versions and entry not visible" << dendl;
+ continue;
+ }
+
+ if (params.enforce_ns && obj.ns != params.ns) {
+        ldpp_dout(dpp, 20) << __func__ <<
+          ": skipping \"" << index_key <<
+          "\" because namespace does not match" << dendl;
+ continue;
+ }
+
+ if (cur_end_marker_valid && cur_end_marker <= index_key) {
+ // we're not guaranteed items will come in order, so we have
+ // to loop through all
+        ldpp_dout(dpp, 20) << __func__ <<
+          ": skipping \"" << index_key <<
+          "\" because after end_marker" << dendl;
+ continue;
+ }
+
+ if (params.access_list_filter &&
+ !params.access_list_filter->filter(obj.name, index_key.name)) {
+        ldpp_dout(dpp, 20) << __func__ <<
+          ": skipping \"" << index_key <<
+          "\" because doesn't match filter" << dendl;
+ continue;
+ }
+
+ if (params.prefix.size() &&
+ (0 != obj.name.compare(0, params.prefix.size(), params.prefix))) {
+        ldpp_dout(dpp, 20) << __func__ <<
+          ": skipping \"" << index_key <<
+          "\" because doesn't match prefix" << dendl;
+ continue;
+ }
+
+ if (count >= max) {
+ truncated = true;
+ goto done;
+ }
+
+ result->emplace_back(std::move(entry));
+ count++;
+ } // for (auto& entry : ent_list)
+ } // while (truncated && count <= max)
+
+done:
+
+ if (is_truncated) {
+ *is_truncated = truncated;
+ }
+
+ return 0;
+} // list_objects_unordered
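+
+// note on the unordered variant above: params.marker/next_marker are
+// advanced *before* the visibility/namespace/filter checks, so a
+// follow-up call resumes past entries that were skipped, at the cost of
+// next_marker possibly pointing at an entry that was never returned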
+
+
+/**
+ * create a rados pool, associated meta info
+ * returns 0 on success, -ERR# otherwise.
+ */
+int RGWRados::create_pool(const DoutPrefixProvider *dpp, const rgw_pool& pool)
+{
+ librados::IoCtx io_ctx;
+ constexpr bool create = true;
+ return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create);
+}
+
+void RGWRados::create_bucket_id(string *bucket_id)
+{
+ uint64_t iid = instance_id();
+ uint64_t bid = next_bucket_id();
+ char buf[svc.zone->get_zone_params().get_id().size() + 48];
+ snprintf(buf, sizeof(buf), "%s.%" PRIu64 ".%" PRIu64,
+ svc.zone->get_zone_params().get_id().c_str(), iid, bid);
+ *bucket_id = buf;
+}
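+
+// the generated id has the form "<zone-id>.<instance-id>.<counter>",
+// e.g. a hypothetical "default.4133.7" for the 8th bucket created by
+// rados instance 4133 in the zone whose id is "default"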
+
+int RGWRados::create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket,
+ const string& zonegroup_id,
+ const rgw_placement_rule& placement_rule,
+ const string& swift_ver_location,
+ const RGWQuotaInfo * pquota_info,
+ map<std::string, bufferlist>& attrs,
+ RGWBucketInfo& info,
+ obj_version *pobjv,
+ obj_version *pep_objv,
+ real_time creation_time,
+ rgw_bucket *pmaster_bucket,
+ uint32_t *pmaster_num_shards,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ bool exclusive)
+{
+#define MAX_CREATE_RETRIES 20 /* need to bound retries */
+ rgw_placement_rule selected_placement_rule;
+ RGWZonePlacementInfo rule_info;
+
+ for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
+ int ret = 0;
+ ret = svc.zone->select_bucket_placement(dpp, owner, zonegroup_id, placement_rule,
+ &selected_placement_rule, &rule_info, y);
+ if (ret < 0)
+ return ret;
+
+ if (!pmaster_bucket) {
+ create_bucket_id(&bucket.marker);
+ bucket.bucket_id = bucket.marker;
+ } else {
+ bucket.marker = pmaster_bucket->marker;
+ bucket.bucket_id = pmaster_bucket->bucket_id;
+ }
+
+ RGWObjVersionTracker& objv_tracker = info.objv_tracker;
+
+ objv_tracker.read_version.clear();
+
+ if (pobjv) {
+ objv_tracker.write_version = *pobjv;
+ } else {
+ objv_tracker.generate_new_write_ver(cct);
+ }
+
+ info.bucket = bucket;
+ info.owner = owner.user_id;
+ info.zonegroup = zonegroup_id;
+ info.placement_rule = selected_placement_rule;
+ info.swift_ver_location = swift_ver_location;
+ info.swift_versioning = (!swift_ver_location.empty());
+
+ init_default_bucket_layout(cct, info.layout, svc.zone->get_zone(),
+ pmaster_num_shards ?
+ std::optional{*pmaster_num_shards} :
+ std::nullopt,
+ rule_info.index_type);
+
+ info.requester_pays = false;
+ if (real_clock::is_zero(creation_time)) {
+ info.creation_time = ceph::real_clock::now();
+ } else {
+ info.creation_time = creation_time;
+ }
+ if (pquota_info) {
+ info.quota = *pquota_info;
+ }
+
+ int r = svc.bi->init_index(dpp, info, info.layout.current_index);
+ if (r < 0) {
+ return r;
+ }
+
+ ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true, dpp);
+ if (ret == -ECANCELED) {
+ ret = -EEXIST;
+ }
+ if (ret == -EEXIST) {
+ /* we need to reread the info and return it, caller will have a use for it */
+ RGWBucketInfo orig_info;
+ r = get_bucket_info(&svc, bucket.tenant, bucket.name, orig_info, NULL, null_yield, NULL);
+ if (r < 0) {
+ if (r == -ENOENT) {
+ continue;
+ }
+ ldpp_dout(dpp, 0) << "get_bucket_info returned " << r << dendl;
+ return r;
+ }
+
+ /* only remove it if it's a different bucket instance */
+ if (orig_info.bucket.bucket_id != bucket.bucket_id) {
+ int r = svc.bi->clean_index(dpp, info, info.layout.current_index);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: could not remove bucket index (r=" << r << ")" << dendl;
+ }
+ r = ctl.bucket->remove_bucket_instance_info(info.bucket, info, null_yield, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): failed to remove bucket instance info: bucket instance=" << info.bucket.get_key() << ": r=" << r << dendl;
+ /* continue anyway */
+ }
+ }
+
+ info = std::move(orig_info);
+ /* ret == -EEXIST here */
+ }
+ return ret;
+ }
+
+ /* this is highly unlikely */
+ ldpp_dout(dpp, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
+ return -ENOENT;
+}
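+
+// note on the retry loop above: a racing creation surfaces as
+// -ECANCELED, which is mapped to -EEXIST; we then reread the winning
+// bucket instance and return it to the caller, retrying from scratch
+// (up to MAX_CREATE_RETRIES) only if that instance vanished again
+// before the reread (-ENOENT)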
+
+bool RGWRados::obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
+{
+ get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
+
+ return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
+}
+
+std::string RGWRados::get_cluster_fsid(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ return svc.rados->cluster_fsid();
+}
+
+int RGWRados::get_obj_head_ioctx(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ const rgw_obj& obj,
+ librados::IoCtx *ioctx)
+{
+ std::string oid, key;
+ get_obj_bucket_and_oid_loc(obj, oid, key);
+
+ rgw_pool pool;
+ if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
+ ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj <<
+ ", probably misconfiguration" << dendl;
+ return -EIO;
+ }
+
+ int r = open_pool_ctx(dpp, pool, *ioctx, false);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: unable to open data-pool=" << pool.to_str() <<
+ " for obj=" << obj << " with error-code=" << r << dendl;
+ return r;
+ }
+
+ ioctx->locator_set_key(key);
+
+ return 0;
+}
+
+int RGWRados::get_obj_head_ref(const DoutPrefixProvider *dpp,
+ const rgw_placement_rule& target_placement_rule,
+ const rgw_obj& obj,
+ rgw_rados_ref *ref)
+{
+ get_obj_bucket_and_oid_loc(obj, ref->obj.oid, ref->obj.loc);
+
+ rgw_pool pool;
+ if (!get_obj_data_pool(target_placement_rule, obj, &pool)) {
+ ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
+ return -EIO;
+ }
+
+ ref->pool = svc.rados->pool(pool);
+
+ int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams()
+ .set_mostly_omap(false));
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed opening data pool (pool=" << pool << "); r=" << r << dendl;
+ return r;
+ }
+
+ ref->pool.ioctx().locator_set_key(ref->obj.loc);
+
+ return 0;
+}
+
+int RGWRados::get_obj_head_ref(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ const rgw_obj& obj,
+ rgw_rados_ref *ref)
+{
+ return get_obj_head_ref(dpp, bucket_info.placement_rule, obj, ref);
+}
+
+int RGWRados::get_raw_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref)
+{
+ ref->obj = obj;
+
+ if (ref->obj.oid.empty()) {
+ ref->obj.oid = obj.pool.to_str();
+ ref->obj.pool = svc.zone->get_zone_params().domain_root;
+ }
+ ref->pool = svc.rados->pool(obj.pool);
+ int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams()
+ .set_mostly_omap(false));
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed opening pool (pool=" << obj.pool << "); r=" << r << dendl;
+ return r;
+ }
+
+ ref->pool.ioctx().locator_set_key(ref->obj.loc);
+
+ return 0;
+}
+
+int RGWRados::get_system_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref)
+{
+ return get_raw_obj_ref(dpp, obj, ref);
+}
+
+/*
+ * fixes an issue where head objects were supposed to have a locator created, but ended
+ * up without one
+ */
+int RGWRados::fix_head_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
+{
+ const rgw_bucket& bucket = bucket_info.bucket;
+ string oid;
+ string locator;
+
+ rgw_obj obj(bucket, key);
+
+ get_obj_bucket_and_oid_loc(obj, oid, locator);
+
+ if (locator.empty()) {
+ ldpp_dout(dpp, 20) << "object does not have a locator, nothing to fix" << dendl;
+ return 0;
+ }
+
+ librados::IoCtx ioctx;
+
+ int ret = get_obj_head_ioctx(dpp, bucket_info, obj, &ioctx);
+ if (ret < 0) {
+ cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
+ return ret;
+ }
+ ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
+
+ uint64_t size;
+ bufferlist data;
+
+ struct timespec mtime_ts;
+ map<string, bufferlist> attrs;
+ librados::ObjectReadOperation op;
+ op.getxattrs(&attrs, NULL);
+ op.stat2(&size, &mtime_ts, NULL);
+#define HEAD_SIZE (512 * 1024)
+ op.read(0, HEAD_SIZE, &data, NULL);
+
+ ret = rgw_rados_operate(dpp, ioctx, oid, &op, &data, null_yield);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: rgw_rados_operate(oid=" << oid << ") returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ if (size > HEAD_SIZE) {
+ ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
+ return -EIO;
+ }
+
+ if (size != data.length()) {
+ ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
+ return -EIO;
+ }
+
+ if (copy_obj) {
+ librados::ObjectWriteOperation wop;
+
+ wop.mtime2(&mtime_ts);
+
+ map<string, bufferlist>::iterator iter;
+ for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
+ wop.setxattr(iter->first.c_str(), iter->second);
+ }
+
+ wop.write(0, data);
+
+ ioctx.locator_set_key(locator);
+ rgw_rados_operate(dpp, ioctx, oid, &wop, null_yield);
+ }
+
+ if (remove_bad) {
+ ioctx.locator_set_key(string());
+
+ ret = ioctx.remove(oid);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to remove original bad object" << dendl;
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+int RGWRados::move_rados_obj(const DoutPrefixProvider *dpp,
+ librados::IoCtx& src_ioctx,
+ const string& src_oid, const string& src_locator,
+ librados::IoCtx& dst_ioctx,
+ const string& dst_oid, const string& dst_locator)
+{
+
+#define COPY_BUF_SIZE (4 * 1024 * 1024)
+ bool done = false;
+ uint64_t chunk_size = COPY_BUF_SIZE;
+ uint64_t ofs = 0;
+ int ret = 0;
+ real_time mtime;
+ struct timespec mtime_ts;
+ uint64_t size;
+
+ if (src_oid == dst_oid && src_locator == dst_locator) {
+ return 0;
+ }
+
+ src_ioctx.locator_set_key(src_locator);
+ dst_ioctx.locator_set_key(dst_locator);
+
+ do {
+ bufferlist data;
+ ObjectReadOperation rop;
+ ObjectWriteOperation wop;
+
+ if (ofs == 0) {
+ rop.stat2(&size, &mtime_ts, NULL);
+ mtime = real_clock::from_timespec(mtime_ts);
+ }
+ rop.read(ofs, chunk_size, &data, NULL);
+ ret = rgw_rados_operate(dpp, src_ioctx, src_oid, &rop, &data, null_yield);
+ if (ret < 0) {
+ goto done_err;
+ }
+
+ if (data.length() == 0) {
+ break;
+ }
+
+ if (ofs == 0) {
+ wop.create(true); /* make it exclusive */
+ wop.mtime2(&mtime_ts);
+ mtime = real_clock::from_timespec(mtime_ts);
+ }
+ wop.write(ofs, data);
+ ret = rgw_rados_operate(dpp, dst_ioctx, dst_oid, &wop, null_yield);
+ if (ret < 0) {
+ goto done_err;
+ }
+ ofs += data.length();
+ done = data.length() != chunk_size;
+ } while (!done);
+
+ if (ofs != size) {
+ ldpp_dout(dpp, -1) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
+ << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
+ ret = -EIO;
+ goto done_err;
+ }
+
+ src_ioctx.remove(src_oid);
+
+ return 0;
+
+done_err:
+ // TODO: clean up dst_oid if we created it
+ ldpp_dout(dpp, -1) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
+ return ret;
+}
+
+/*
+ * fixes an issue where tail objects were supposed to have a locator created, but
+ * ended up without one
+ */
+int RGWRados::fix_tail_obj_locator(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info, rgw_obj_key& key,
+ bool fix, bool *need_fix, optional_yield y)
+{
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ driver->get_bucket(nullptr, bucket_info, &bucket);
+ std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(key);
+
+ if (need_fix) {
+ *need_fix = false;
+ }
+
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, obj->get_obj(), &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ RGWObjState *astate = nullptr;
+ RGWObjManifest* manifest = nullptr;
+ RGWObjectCtx rctx(this->driver);
+ r = get_obj_state(dpp, &rctx, bucket_info, obj.get(), &astate, &manifest, false, y);
+ if (r < 0)
+ return r;
+
+ if (manifest) {
+ RGWObjManifest::obj_iterator miter;
+ for (miter = manifest->obj_begin(dpp); miter != manifest->obj_end(dpp); ++miter) {
+ rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(driver);
+ rgw_obj loc;
+ string oid;
+ string locator;
+
+ RGWSI_Tier_RADOS::raw_obj_to_obj(manifest->get_tail_placement().bucket, raw_loc, &loc);
+
+ if (loc.key.ns.empty()) {
+ /* continue, we're only interested in tail objects */
+ continue;
+ }
+
+ auto& ioctx = ref.pool.ioctx();
+
+ get_obj_bucket_and_oid_loc(loc, oid, locator);
+ ref.pool.ioctx().locator_set_key(locator);
+
+ ldpp_dout(dpp, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
+
+ r = ioctx.stat(oid, NULL, NULL);
+ if (r != -ENOENT) {
+ continue;
+ }
+
+ string bad_loc;
+ prepend_bucket_marker(bucket->get_key(), loc.key.name, bad_loc);
+
+ /* create a new ioctx with the bad locator */
+ librados::IoCtx src_ioctx;
+ src_ioctx.dup(ioctx);
+ src_ioctx.locator_set_key(bad_loc);
+
+ r = src_ioctx.stat(oid, NULL, NULL);
+ if (r != 0) {
+ /* cannot find a broken part */
+ continue;
+ }
+ ldpp_dout(dpp, 20) << __func__ << ": found bad object part: " << loc << dendl;
+ if (need_fix) {
+ *need_fix = true;
+ }
+ if (fix) {
+ r = move_rados_obj(dpp, src_ioctx, oid, bad_loc, ioctx, oid, locator);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
+ }
+ }
+ }
+ }
+
+ return 0;
+}
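+
+/* Hedged usage sketch (not compiled): run a dry pass first, then fix.
+ * `rados` is assumed to point at an initialized RGWRados instance.
+ *
+ *   bool needs_fix = false;
+ *   // fix=false only reports whether a misplaced tail part exists
+ *   int r = rados->fix_tail_obj_locator(dpp, bucket_info, key, false, &needs_fix, y);
+ *   if (r >= 0 && needs_fix) {
+ *     r = rados->fix_tail_obj_locator(dpp, bucket_info, key, true, nullptr, y);
+ *   }
+ */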
+
+int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
+ const rgw_obj& obj,
+ RGWBucketInfo* bucket_info_out,
+ const DoutPrefixProvider *dpp)
+{
+ bucket = _bucket;
+
+ RGWBucketInfo bucket_info;
+ RGWBucketInfo* bucket_info_p =
+ bucket_info_out ? bucket_info_out : &bucket_info;
+
+ int ret = store->get_bucket_instance_info(bucket, *bucket_info_p, NULL, NULL, null_yield, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ string oid;
+
+ ret = store->svc.bi_rados->open_bucket_index_shard(dpp, *bucket_info_p, obj.get_hash_object(), &bucket_obj, &shard_id);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
+ return ret;
+ }
+ ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj.get_raw_obj() << dendl;
+
+ return 0;
+}
+
+int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info,
+ const rgw_obj& obj)
+{
+ bucket = bucket_info.bucket;
+
+ int ret = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info,
+ obj.get_hash_object(),
+ &bucket_obj,
+ &shard_id);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
+ return ret;
+ }
+ ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl;
+
+ return 0;
+}
+
+int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& index,
+ int sid)
+{
+ bucket = bucket_info.bucket;
+ shard_id = sid;
+
+ int ret = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info, shard_id,
+ num_shards(index), index.gen,
+ &bucket_obj);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
+ return ret;
+ }
+ ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl;
+
+ return 0;
+}
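+
+/* Illustrative sketch (assuming `store` is an initialized RGWRados* and the
+ * BucketShard is constructed from it): mapping an object to its bucket index
+ * shard before an index operation.
+ *
+ *   RGWRados::BucketShard bs(store);
+ *   int r = bs.init(dpp, bucket_info, obj);  // shard chosen from obj's hash
+ *   // on success, bs.bucket_obj names the index shard and bs.shard_id is set
+ */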
+
+
+/* Execute @handler on the last item in the bucket listing for the bucket specified
+ * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
+ * to objects matching these criteria. */
+int RGWRados::on_last_entry_in_listing(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const std::string& obj_prefix,
+ const std::string& obj_delim,
+ std::function<int(const rgw_bucket_dir_entry&)> handler)
+{
+ RGWRados::Bucket target(this, bucket_info);
+ RGWRados::Bucket::List list_op(&target);
+
+ list_op.params.prefix = obj_prefix;
+ list_op.params.delim = obj_delim;
+
+ ldpp_dout(dpp, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
+ << ", obj_prefix=" << obj_prefix
+ << ", obj_delim=" << obj_delim
+ << dendl;
+
+ bool is_truncated = false;
+
+ boost::optional<rgw_bucket_dir_entry> last_entry;
+ /* We need to rewind to the last object in a listing. */
+ do {
+ /* List bucket entries in chunks. */
+ static constexpr int MAX_LIST_OBJS = 100;
+ std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
+
+ int ret = list_op.list_objects(dpp, MAX_LIST_OBJS, &entries, nullptr,
+ &is_truncated, null_yield);
+ if (ret < 0) {
+ return ret;
+ } else if (!entries.empty()) {
+ last_entry = entries.back();
+ }
+ } while (is_truncated);
+
+ if (last_entry) {
+ return handler(*last_entry);
+ }
+
+ /* Empty listing - no items we can run handler on. */
+ return 0;
+}
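+
+/* Sketch of a typical invocation (illustrative; this mirrors the call made
+ * from swift_versioning_restore() further below):
+ *
+ *   int r = on_last_entry_in_listing(dpp, archive_binfo, prefix, "",
+ *       [&](const rgw_bucket_dir_entry& entry) -> int {
+ *         // entry is the lexicographically last match under prefix
+ *         return 0;
+ *       });
+ */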
+
+bool RGWRados::swift_versioning_enabled(rgw::sal::Bucket* bucket) const
+{
+ return bucket->get_info().has_swift_versioning() &&
+ bucket->get_info().swift_ver_location.size();
+}
+
+int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
+ const rgw_user& user,
+ rgw::sal::Bucket* bucket,
+ rgw::sal::Object* obj,
+ const DoutPrefixProvider *dpp,
+ optional_yield y)
+{
+ if (! swift_versioning_enabled(bucket)) {
+ return 0;
+ }
+
+ obj->set_atomic();
+
+ RGWObjState * state = nullptr;
+ RGWObjManifest *manifest = nullptr;
+ int r = get_obj_state(dpp, &obj_ctx, bucket->get_info(), obj, &state, &manifest, false, y);
+ if (r < 0) {
+ return r;
+ }
+
+ if (!state->exists) {
+ return 0;
+ }
+
+ const string& src_name = obj->get_oid();
+ char buf[src_name.size() + 32];
+ struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
+ snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
+ src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);
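+ /* e.g. a src_name of "photo.jpg" (9 bytes) with an mtime of 1700000000.123456s
+ * yields "009photo.jpg/1700000000.123456" (hypothetical values, for illustration) */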
+
+ RGWBucketInfo dest_bucket_info;
+
+ r = get_bucket_info(&svc, bucket->get_tenant(), bucket->get_info().swift_ver_location, dest_bucket_info, NULL, null_yield, NULL);
+ if (r < 0) {
+ ldpp_dout(dpp, 10) << "failed to read dest bucket info: r=" << r << dendl;
+ if (r == -ENOENT) {
+ return -ERR_PRECONDITION_FAILED;
+ }
+ return r;
+ }
+
+ if (dest_bucket_info.owner != bucket->get_info().owner) {
+ return -ERR_PRECONDITION_FAILED;
+ }
+
+ rgw::sal::RadosBucket dest_bucket(driver, dest_bucket_info);
+ rgw::sal::RadosObject dest_obj(driver, rgw_obj_key(buf), &dest_bucket);
+
+ if (dest_bucket_info.versioning_enabled()){
+ dest_obj.gen_rand_obj_instance_name();
+ }
+
+ dest_obj.set_atomic();
+
+ rgw_zone_id no_zone;
+
+ r = copy_obj(obj_ctx,
+ user,
+ NULL, /* req_info *info */
+ no_zone,
+ &dest_obj,
+ obj,
+ &dest_bucket,
+ bucket,
+ bucket->get_placement_rule(),
+ NULL, /* time_t *src_mtime */
+ NULL, /* time_t *mtime */
+ NULL, /* const time_t *mod_ptr */
+ NULL, /* const time_t *unmod_ptr */
+ false, /* bool high_precision_time */
+ NULL, /* const char *if_match */
+ NULL, /* const char *if_nomatch */
+ RGWRados::ATTRSMOD_NONE,
+ true, /* bool copy_if_newer */
+ state->attrset,
+ RGWObjCategory::Main,
+ 0, /* uint64_t olh_epoch */
+ real_time(), /* time_t delete_at */
+ NULL, /* string *version_id */
+ NULL, /* string *ptag */
+ NULL, /* string *petag */
+ NULL, /* void (*progress_cb)(off_t, void *) */
+ NULL, /* void *progress_data */
+ dpp,
+ null_yield);
+ if (r == -ECANCELED || r == -ENOENT) {
+ /* Has already been overwritten, meaning another rgw process already
+ * copied it out */
+ return 0;
+ }
+
+ return r;
+}
+
+int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
+ const rgw_user& user,
+ rgw::sal::Bucket* bucket,
+ rgw::sal::Object* obj,
+ bool& restored, /* out */
+ const DoutPrefixProvider *dpp)
+{
+ if (! swift_versioning_enabled(bucket)) {
+ return 0;
+ }
+
+ /* Bucket info of the bucket that stores previous versions of our object. */
+ RGWBucketInfo archive_binfo;
+
+ int ret = get_bucket_info(&svc, bucket->get_tenant(),
+ bucket->get_info().swift_ver_location,
+ archive_binfo, nullptr, null_yield, nullptr);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Abort the operation if the bucket storing our archive belongs to someone
+ * else. This is a limitation in comparison to Swift as we aren't taking ACLs
+ * into consideration. For now, we can live with that.
+ *
+ * TODO: delegate this check to an upper layer and compare with ACLs. */
+ if (bucket->get_info().owner != archive_binfo.owner) {
+ return -EPERM;
+ }
+
+ /* This code will be executed on the latest version of the object. */
+ const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
+ rgw_zone_id no_zone;
+
+ /* We don't support Swift object versioning on buckets that are already
+ * versioned using the S3 mechanism. This also affects the bucket storing
+ * archived objects. Otherwise the delete operation would create
+ * a deletion marker. */
+ if (archive_binfo.versioned()) {
+ restored = false;
+ return -ERR_PRECONDITION_FAILED;
+ }
+
+ /* We are requesting ATTRSMOD_NONE, so the attrs argument is irrelevant
+ * and may be safely skipped. */
+ std::map<std::string, ceph::bufferlist> no_attrs;
+
+ rgw::sal::RadosBucket archive_bucket(driver, archive_binfo);
+ rgw::sal::RadosObject archive_obj(driver, entry.key, &archive_bucket);
+
+ if (bucket->versioning_enabled()){
+ obj->gen_rand_obj_instance_name();
+ }
+
+ archive_obj.set_atomic();
+ obj->set_atomic();
+
+ int ret = copy_obj(obj_ctx,
+ user,
+ nullptr, /* req_info *info */
+ no_zone,
+ obj, /* dest obj */
+ &archive_obj, /* src obj */
+ bucket, /* dest bucket info */
+ &archive_bucket, /* src bucket info */
+ bucket->get_placement_rule(), /* placement_rule */
+ nullptr, /* time_t *src_mtime */
+ nullptr, /* time_t *mtime */
+ nullptr, /* const time_t *mod_ptr */
+ nullptr, /* const time_t *unmod_ptr */
+ false, /* bool high_precision_time */
+ nullptr, /* const char *if_match */
+ nullptr, /* const char *if_nomatch */
+ RGWRados::ATTRSMOD_NONE,
+ true, /* bool copy_if_newer */
+ no_attrs,
+ RGWObjCategory::Main,
+ 0, /* uint64_t olh_epoch */
+ real_time(), /* time_t delete_at */
+ nullptr, /* string *version_id */
+ nullptr, /* string *ptag */
+ nullptr, /* string *petag */
+ nullptr, /* void (*progress_cb)(off_t, void *) */
+ nullptr, /* void *progress_data */
+ dpp,
+ null_yield);
+ if (ret == -ECANCELED || ret == -ENOENT) {
+ /* Has already been overwritten, meaning another rgw process already
+ * copied it out */
+ return 0;
+ } else if (ret < 0) {
+ return ret;
+ } else {
+ restored = true;
+ }
+
+ /* Need to remove the archived copy. */
+ ret = delete_obj(dpp, archive_binfo, &archive_obj,
+ archive_binfo.versioning_status());
+
+ return ret;
+ };
+
+ const std::string& obj_name = obj->get_oid();
+ const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
+ % obj_name);
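+ /* e.g. an obj_name of "photo.jpg" (9 bytes) gives the prefix "009photo.jpg",
+ * matching every archived copy written by swift_versioning_copy() above */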
+
+ return on_last_entry_in_listing(dpp, archive_binfo, prefix, std::string(),
+ handler);
+}
+
+int RGWRados::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp,
+ uint64_t size, uint64_t accounted_size,
+ map<string, bufferlist>& attrs,
+ bool assume_noent, bool modify_tail,
+ void *_index_op, optional_yield y)
+{
+ RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
+ RGWRados *store = target->get_store();
+
+ ObjectWriteOperation op;
+#ifdef WITH_LTTNG
+ const req_state* s = get_req_state();
+ string req_id;
+ if (!s) {
+ // fake req_id
+ req_id = store->svc.zone_utils->unique_id(store->driver->get_new_req_id());
+ } else {
+ req_id = s->req_id;
+ }
+#endif
+
+ RGWObjState *state;
+ RGWObjManifest *manifest = nullptr;
+ int r = target->get_state(dpp, &state, &manifest, false, y, assume_noent);
+ if (r < 0)
+ return r;
+
+ rgw_obj obj = target->get_obj();
+
+ if (obj.get_oid().empty()) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
+ return -EIO;
+ }
+
+ rgw_rados_ref ref;
+ r = store->get_obj_head_ref(dpp, target->get_meta_placement_rule(), obj, &ref);
+ if (r < 0)
+ return r;
+
+ bool is_olh = state->is_olh;
+
+ bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;
+
+ const string *ptag = meta.ptag;
+ if (!ptag && !index_op->get_optag()->empty()) {
+ ptag = index_op->get_optag();
+ }
+ r = target->prepare_atomic_modification(dpp, op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail, y);
+ if (r < 0)
+ return r;
+
+ if (real_clock::is_zero(meta.set_mtime)) {
+ meta.set_mtime = real_clock::now();
+ }
+
+ if (target->get_bucket_info().obj_lock_enabled() && target->get_bucket_info().obj_lock.has_rule() && meta.flags == PUT_OBJ_CREATE) {
+ auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION);
+ if (iter == attrs.end()) {
+ real_time lock_until_date = target->get_bucket_info().obj_lock.get_lock_until_date(meta.set_mtime);
+ string mode = target->get_bucket_info().obj_lock.get_mode();
+ RGWObjectRetention obj_retention(mode, lock_until_date);
+ bufferlist bl;
+ obj_retention.encode(bl);
+ op.setxattr(RGW_ATTR_OBJECT_RETENTION, bl);
+ }
+ }
+
+ if (state->is_olh) {
+ op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
+ }
+
+ struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
+ op.mtime2(&mtime_ts);
+
+ if (meta.data) {
+ /* if we want to overwrite the data, we also want to overwrite the
+ xattrs, so just replace the whole object */
+ op.write_full(*meta.data);
+ if (state->compressed) {
+ uint32_t alloc_hint_flags = librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE;
+ op.set_alloc_hint2(0, 0, alloc_hint_flags);
+ }
+ }
+
+ string etag;
+ string content_type;
+ bufferlist acl_bl;
+ string storage_class;
+
+ map<string, bufferlist>::iterator iter;
+ if (meta.rmattrs) {
+ for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
+ const string& name = iter->first;
+ op.rmxattr(name.c_str());
+ }
+ }
+
+ if (meta.manifest) {
+ storage_class = meta.manifest->get_tail_placement().placement_rule.storage_class;
+
+ /* remove existing manifest attr */
+ iter = attrs.find(RGW_ATTR_MANIFEST);
+ if (iter != attrs.end())
+ attrs.erase(iter);
+
+ bufferlist bl;
+ encode(*meta.manifest, bl);
+ op.setxattr(RGW_ATTR_MANIFEST, bl);
+ }
+
+ for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
+ const string& name = iter->first;
+ bufferlist& bl = iter->second;
+
+ if (!bl.length())
+ continue;
+
+ op.setxattr(name.c_str(), bl);
+
+ if (name.compare(RGW_ATTR_ETAG) == 0) {
+ etag = rgw_bl_str(bl);
+ } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
+ content_type = rgw_bl_str(bl);
+ } else if (name.compare(RGW_ATTR_ACL) == 0) {
+ acl_bl = bl;
+ }
+ }
+ if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
+ cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
+ }
+
+ if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
+ bufferlist bl;
+ encode(store->svc.zone->get_zone_short_id(), bl);
+ op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
+ }
+
+ if (!storage_class.empty()) {
+ bufferlist bl;
+ bl.append(storage_class);
+ op.setxattr(RGW_ATTR_STORAGE_CLASS, bl);
+ }
+
+ if (!op.size())
+ return 0;
+
+ uint64_t epoch;
+ int64_t poolid;
+ bool orig_exists;
+ uint64_t orig_size;
+
+ if (!reset_obj) { // multipart upload: the head is immutable
+ orig_exists = false;
+ orig_size = 0;
+ } else {
+ orig_exists = state->exists;
+ orig_size = state->accounted_size;
+ }
+
+ bool versioned_target = (meta.olh_epoch && *meta.olh_epoch > 0) ||
+ !obj.key.instance.empty();
+
+ bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);
+
+ if (versioned_op) {
+ index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
+ }
+
+ if (!index_op->is_prepared()) {
+ tracepoint(rgw_rados, prepare_enter, req_id.c_str());
+ r = index_op->prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y);
+ tracepoint(rgw_rados, prepare_exit, req_id.c_str());
+ if (r < 0)
+ return r;
+ }
+
+ auto& ioctx = ref.pool.ioctx();
+
+ tracepoint(rgw_rados, operate_enter, req_id.c_str());
+ r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ tracepoint(rgw_rados, operate_exit, req_id.c_str());
+ if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
+ or -ENOENT if was removed, or -EEXIST if it did not exist
+ before and now it does */
+ if (r == -EEXIST && assume_noent) {
+ target->invalidate_state();
+ return r;
+ }
+ goto done_cancel;
+ }
+
+ epoch = ioctx.get_last_version();
+ poolid = ioctx.get_id();
+
+ r = target->complete_atomic_modification(dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
+ }
+
+ tracepoint(rgw_rados, complete_enter, req_id.c_str());
+ r = index_op->complete(dpp, poolid, epoch, size, accounted_size,
+ meta.set_mtime, etag, content_type,
+ storage_class, &acl_bl,
+ meta.category, meta.remove_objs, meta.user_data, meta.appendable);
+ tracepoint(rgw_rados, complete_exit, req_id.c_str());
+ if (r < 0)
+ goto done_cancel;
+
+ if (meta.mtime) {
+ *meta.mtime = meta.set_mtime;
+ }
+
+ /* note that index_op was using state so we couldn't invalidate it earlier */
+ target->invalidate_state();
+ state = NULL;
+
+ if (versioned_op && meta.olh_epoch) {
+ r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), target->get_target(), false, NULL, *meta.olh_epoch, real_time(), false, y, meta.zones_trace);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ if (!real_clock::is_zero(meta.delete_at)) {
+ rgw_obj_index_key obj_key;
+ obj.key.get_index_key(&obj_key);
+
+ r = store->obj_expirer->hint_add(dpp, meta.delete_at, obj.bucket.tenant, obj.bucket.name,
+ obj.bucket.bucket_id, obj_key);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
+ /* ignoring error, nothing we can do at this point */
+ }
+ }
+ meta.canceled = false;
+
+ /* update quota cache */
+ if (meta.completeMultipart){
+ store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
+ 0, orig_size);
+ }
+ else {
+ store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
+ accounted_size, orig_size);
+ }
+ return 0;
+
+done_cancel:
+ int ret = index_op->cancel(dpp, meta.remove_objs);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
+ }
+
+ meta.canceled = true;
+
+ /* we lost in a race. There are a few options:
+ * - the existing object was rewritten (ECANCELED)
+ * - a non-existing object was created (EEXIST)
+ * - the object was removed (ENOENT)
+ * in these cases we should treat the operation as a success
+ */
+ if (meta.if_match == NULL && meta.if_nomatch == NULL) {
+ if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
+ r = 0;
+ }
+ } else {
+ if (meta.if_match != NULL) {
+ // only overwrite existing object
+ if (strcmp(meta.if_match, "*") == 0) {
+ if (r == -ENOENT) {
+ r = -ERR_PRECONDITION_FAILED;
+ } else if (r == -ECANCELED) {
+ r = 0;
+ }
+ }
+ }
+
+ if (meta.if_nomatch != NULL) {
+ // only create a new object
+ if (strcmp(meta.if_nomatch, "*") == 0) {
+ if (r == -EEXIST) {
+ r = -ERR_PRECONDITION_FAILED;
+ } else if (r == -ENOENT) {
+ r = 0;
+ }
+ }
+ }
+ }
+
+ return r;
+}
+
+int RGWRados::Object::Write::write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size,
+ map<string, bufferlist>& attrs, optional_yield y)
+{
+ RGWBucketInfo& bucket_info = target->get_bucket_info();
+
+ RGWRados::Bucket bop(target->get_store(), bucket_info);
+ RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
+ index_op.set_zones_trace(meta.zones_trace);
+
+ bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
+ int r;
+ if (assume_noent) {
+ r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
+ if (r == -EEXIST) {
+ assume_noent = false;
+ }
+ }
+ if (!assume_noent) {
+ r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
+ }
+ return r;
+}
+
+class RGWRadosPutObj : public RGWHTTPStreamRWRequest::ReceiveCB
+{
+ const DoutPrefixProvider *dpp;
+ CephContext* cct;
+ rgw_obj obj;
+ rgw::sal::DataProcessor *filter;
+ boost::optional<RGWPutObj_Compress>& compressor;
+ bool try_etag_verify;
+ rgw::putobj::etag_verifier_ptr etag_verifier;
+ boost::optional<rgw::putobj::ChunkProcessor> buffering;
+ CompressorRef& plugin;
+ rgw::sal::ObjectProcessor *processor;
+ void (*progress_cb)(off_t, void *);
+ void *progress_data;
+ bufferlist extra_data_bl, manifest_bl;
+ std::optional<RGWCompressionInfo> compression_info;
+ uint64_t extra_data_left{0};
+ bool need_to_process_attrs{true};
+ uint64_t data_len{0};
+ map<string, bufferlist> src_attrs;
+ uint64_t ofs{0};
+ uint64_t lofs{0}; /* logical ofs */
+ std::function<int(map<string, bufferlist>&)> attrs_handler;
+
+public:
+ RGWRadosPutObj(const DoutPrefixProvider *dpp,
+ CephContext* cct,
+ CompressorRef& plugin,
+ boost::optional<RGWPutObj_Compress>& compressor,
+ rgw::sal::ObjectProcessor *p,
+ void (*_progress_cb)(off_t, void *),
+ void *_progress_data,
+ std::function<int(map<string, bufferlist>&)> _attrs_handler) :
+ dpp(dpp),
+ cct(cct),
+ filter(p),
+ compressor(compressor),
+ try_etag_verify(cct->_conf->rgw_sync_obj_etag_verify),
+ plugin(plugin),
+ processor(p),
+ progress_cb(_progress_cb),
+ progress_data(_progress_data),
+ attrs_handler(_attrs_handler) {}
+
+
+ int process_attrs(void) {
+ if (extra_data_bl.length()) {
+ JSONParser jp;
+ if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
+ ldpp_dout(dpp, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
+ return -EIO;
+ }
+
+ JSONDecoder::decode_json("attrs", src_attrs, &jp);
+
+ auto iter = src_attrs.find(RGW_ATTR_COMPRESSION);
+ if (iter != src_attrs.end()) {
+ const bufferlist bl = std::move(iter->second);
+ src_attrs.erase(iter); // don't preserve source compression info
+
+ if (try_etag_verify) {
+ // if we're trying to verify etags, we need to convert compressed
+ // ranges in the manifest back into logical multipart part offsets
+ RGWCompressionInfo info;
+ bool compressed = false;
+ int r = rgw_compression_info_from_attr(bl, compressed, info);
+ if (r < 0) {
+ ldpp_dout(dpp, 4) << "failed to decode compression info, "
+ "disabling etag verification" << dendl;
+ try_etag_verify = false;
+ } else if (compressed) {
+ compression_info = std::move(info);
+ }
+ }
+ }
+ /* We need the manifest to recompute the ETag for verification */
+ iter = src_attrs.find(RGW_ATTR_MANIFEST);
+ if (iter != src_attrs.end()) {
+ manifest_bl = std::move(iter->second);
+ src_attrs.erase(iter);
+ }
+
+ // filter out olh attributes
+ iter = src_attrs.lower_bound(RGW_ATTR_OLH_PREFIX);
+ while (iter != src_attrs.end()) {
+ if (!boost::algorithm::starts_with(iter->first, RGW_ATTR_OLH_PREFIX)) {
+ break;
+ }
+ iter = src_attrs.erase(iter);
+ }
+ }
+
+ int ret = attrs_handler(src_attrs);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
+ // do not compress if object is encrypted
+ compressor = boost::in_place(cct, plugin, filter);
+ // add a filter that buffers data so we don't try to compress tiny blocks.
+ // libcurl reads in 16k at a time, and we need at least 64k to get a good
+ // compression ratio
+ constexpr unsigned buffer_size = 512 * 1024;
+ buffering = boost::in_place(&*compressor, buffer_size);
+ filter = &*buffering;
+ }
+
+ /*
+ * Presently we don't support ETag based verification if encryption is
+ * requested. We can enable simultaneous support once we have a mechanism
+ * to know the sequence in which the filters must be applied.
+ */
+ if (try_etag_verify && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
+ ret = rgw::putobj::create_etag_verifier(dpp, cct, filter, manifest_bl,
+ compression_info,
+ etag_verifier);
+ if (ret < 0) {
+ ldpp_dout(dpp, 4) << "failed to initial etag verifier, "
+ "disabling etag verification" << dendl;
+ } else {
+ filter = etag_verifier.get();
+ }
+ }
+
+ need_to_process_attrs = false;
+
+ return 0;
+ }
+
+ int handle_data(bufferlist& bl, bool *pause) override {
+ if (progress_cb) {
+ progress_cb(data_len, progress_data);
+ }
+ if (extra_data_left) {
+ uint64_t extra_len = bl.length();
+ if (extra_len > extra_data_left)
+ extra_len = extra_data_left;
+
+ bufferlist extra;
+ bl.splice(0, extra_len, &extra);
+ extra_data_bl.append(extra);
+
+ extra_data_left -= extra_len;
+ if (extra_data_left == 0) {
+ int res = process_attrs();
+ if (res < 0)
+ return res;
+ }
+ ofs += extra_len;
+ if (bl.length() == 0) {
+ return 0;
+ }
+ }
+ if (need_to_process_attrs) {
+ /* need to call process_attrs() even if we don't get any attrs,
+ * need it to call attrs_handler().
+ */
+ int res = process_attrs();
+ if (res < 0) {
+ return res;
+ }
+ }
+
+ ceph_assert(uint64_t(ofs) >= extra_data_len);
+
+ uint64_t size = bl.length();
+ ofs += size;
+
+ const uint64_t lofs = data_len;
+ data_len += size;
+
+ return filter->process(std::move(bl), lofs);
+ }
+
+ int flush() {
+ return filter->process({}, data_len);
+ }
+
+ bufferlist& get_extra_data() { return extra_data_bl; }
+
+ map<string, bufferlist>& get_attrs() { return src_attrs; }
+
+ void set_extra_data_len(uint64_t len) override {
+ extra_data_left = len;
+ RGWHTTPStreamRWRequest::ReceiveCB::set_extra_data_len(len);
+ }
+
+ uint64_t get_data_len() {
+ return data_len;
+ }
+
+ std::string get_verifier_etag() {
+ if (etag_verifier) {
+ etag_verifier->calculate_etag();
+ return etag_verifier->get_calculated_etag();
+ } else {
+ return "";
+ }
+ }
+};
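+
+/* Data flow through the filter chain assembled in process_attrs() above (a
+ * sketch; each optional stage is present only when applicable):
+ *
+ *   handle_data(bl)
+ *     -> etag_verifier (optional)  // recomputes the source ETag on the fly
+ *     -> buffering (optional)      // coalesces input into 512k chunks
+ *     -> compressor (optional)     // RGWPutObj_Compress
+ *     -> processor                 // ObjectProcessor that writes to RADOS
+ */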
+
+/*
+ * prepare attrset depending on attrs_mod.
+ */
+static void set_copy_attrs(map<string, bufferlist>& src_attrs,
+ map<string, bufferlist>& attrs,
+ RGWRados::AttrsMod attrs_mod)
+{
+ switch (attrs_mod) {
+ case RGWRados::ATTRSMOD_NONE:
+ attrs = src_attrs;
+ break;
+ case RGWRados::ATTRSMOD_REPLACE:
+ if (!attrs[RGW_ATTR_ETAG].length()) {
+ attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
+ }
+ if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
+ auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
+ if (ttiter != src_attrs.end()) {
+ attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
+ }
+ }
+ break;
+ case RGWRados::ATTRSMOD_MERGE:
+ for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
+ if (attrs.find(it->first) == attrs.end()) {
+ attrs[it->first] = it->second;
+ }
+ }
+ break;
+ }
+}
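+
+/* Behavior sketch for the three modes (illustrative attribute maps):
+ *
+ *   src_attrs = { etag:"abc", meta-a:"1" }, attrs = { meta-b:"2" }
+ *   ATTRSMOD_NONE    -> { etag:"abc", meta-a:"1" }             // attrs ignored
+ *   ATTRSMOD_REPLACE -> { etag:"abc", meta-b:"2" }             // etag backfilled
+ *   ATTRSMOD_MERGE   -> { etag:"abc", meta-a:"1", meta-b:"2" } // src fills gaps
+ */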
+
+int RGWRados::rewrite_obj(rgw::sal::Object* obj, const DoutPrefixProvider *dpp, optional_yield y)
+{
+ RGWObjectCtx rctx(this->driver);
+ rgw::sal::Attrs attrset;
+ uint64_t obj_size;
+ ceph::real_time mtime;
+ RGWRados::Object op_target(this, obj->get_bucket(), rctx, obj);
+ RGWRados::Object::Read read_op(&op_target);
+
+ read_op.params.attrs = &attrset;
+ read_op.params.obj_size = &obj_size;
+ read_op.params.lastmod = &mtime;
+
+ int ret = read_op.prepare(y, dpp);
+ if (ret < 0)
+ return ret;
+
+ attrset.erase(RGW_ATTR_ID_TAG);
+ attrset.erase(RGW_ATTR_TAIL_TAG);
+ attrset.erase(RGW_ATTR_STORAGE_CLASS);
+
+ return this->copy_obj_data(rctx, obj->get_bucket(),
+ obj->get_bucket()->get_info().placement_rule,
+ read_op, obj_size - 1, obj, NULL, mtime,
+ attrset, 0, real_time(), NULL, dpp, y);
+}
+
+struct obj_time_weight {
+ real_time mtime;
+ uint32_t zone_short_id;
+ uint64_t pg_ver;
+ bool high_precision;
+
+ obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
+
+ bool compare_low_precision(const obj_time_weight& rhs) {
+ struct timespec l = ceph::real_clock::to_timespec(mtime);
+ struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
+ l.tv_nsec = 0;
+ r.tv_nsec = 0;
+ if (l > r) {
+ return false;
+ }
+ if (l < r) {
+ return true;
+ }
+ if (!zone_short_id || !rhs.zone_short_id) {
+ /* don't compare zone ids, if one wasn't provided */
+ return false;
+ }
+ if (zone_short_id != rhs.zone_short_id) {
+ return (zone_short_id < rhs.zone_short_id);
+ }
+ return (pg_ver < rhs.pg_ver);
+
+ }
+
+ bool operator<(const obj_time_weight& rhs) {
+ if (!high_precision || !rhs.high_precision) {
+ return compare_low_precision(rhs);
+ }
+ if (mtime > rhs.mtime) {
+ return false;
+ }
+ if (mtime < rhs.mtime) {
+ return true;
+ }
+ if (!zone_short_id || !rhs.zone_short_id) {
+ /* don't compare zone ids, if one wasn't provided */
+ return false;
+ }
+ if (zone_short_id != rhs.zone_short_id) {
+ return (zone_short_id < rhs.zone_short_id);
+ }
+ return (pg_ver < rhs.pg_ver);
+ }
+
+ void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
+ mtime = _mtime;
+ zone_short_id = _short_id;
+ pg_ver = _pg_ver;
+ }
+
+ void init(RGWObjState *state) {
+ mtime = state->mtime;
+ zone_short_id = state->zone_short_id;
+ pg_ver = state->pg_ver;
+ }
+};
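+
+/* Comparison sketch (illustrative; `t` is an assumed real_time value): mtime
+ * decides first, then zone short id, then pg version; sub-second precision is
+ * ignored unless both sides are high precision.
+ *
+ *   obj_time_weight a, b;
+ *   a.init(t, 1, 10);
+ *   b.init(t, 2, 5);
+ *   bool older = a < b;  // equal mtime, so zone_short_id decides: true
+ */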
+
+inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
+ out << o.mtime;
+
+ if (o.zone_short_id != 0 || o.pg_ver != 0) {
+ out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
+ }
+
+ return out;
+}
+
+class RGWGetExtraDataCB : public RGWHTTPStreamRWRequest::ReceiveCB {
+ bufferlist extra_data;
+public:
+ RGWGetExtraDataCB() {}
+ int handle_data(bufferlist& bl, bool *pause) override {
+ int bl_len = (int)bl.length();
+ if (extra_data.length() < extra_data_len) {
+ off_t max = extra_data_len - extra_data.length();
+ if (max > bl_len) {
+ max = bl_len;
+ }
+ bl.splice(0, max, &extra_data);
+ }
+ return bl_len;
+ }
+
+ bufferlist& get_extra_data() {
+ return extra_data;
+ }
+};
+
+int RGWRados::stat_remote_obj(const DoutPrefixProvider *dpp,
+ RGWObjectCtx& obj_ctx,
+ const rgw_user& user_id,
+ req_info *info,
+ const rgw_zone_id& source_zone,
+ rgw::sal::Object* src_obj,
+ const RGWBucketInfo *src_bucket_info,
+ real_time *src_mtime,
+ uint64_t *psize,
+ const real_time *mod_ptr,
+ const real_time *unmod_ptr,
+ bool high_precision_time,
+ const char *if_match,
+ const char *if_nomatch,
+ map<string, bufferlist> *pattrs,
+ map<string, string> *pheaders,
+ string *version_id,
+ string *ptag,
+ string *petag)
+{
+ /* source is in a different zonegroup, stat the object there */
+
+ RGWRESTStreamRWRequest *in_stream_req;
+ string tag;
+ map<string, bufferlist> src_attrs;
+ append_rand_alpha(cct, tag, tag, 32);
+ obj_time_weight set_mtime_weight;
+ set_mtime_weight.high_precision = high_precision_time;
+
+ RGWRESTConn *conn;
+ if (source_zone.empty()) {
+ if (!src_bucket_info || src_bucket_info->zonegroup.empty()) {
+ /* source is in the master zonegroup */
+ conn = svc.zone->get_master_conn();
+ } else {
+ auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
+ map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info->zonegroup);
+ if (iter == zonegroup_conn_map.end()) {
+ ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
+ return -ENOENT;
+ }
+ conn = iter->second;
+ }
+ } else {
+ auto& zone_conn_map = svc.zone->get_zone_conn_map();
+ auto iter = zone_conn_map.find(source_zone);
+ if (iter == zone_conn_map.end()) {
+ ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl;
+ return -ENOENT;
+ }
+ conn = iter->second;
+ }
+
+ RGWGetExtraDataCB cb;
+ map<string, string> req_headers;
+ real_time set_mtime;
+
+ const real_time *pmod = mod_ptr;
+
+ obj_time_weight dest_mtime_weight;
+
+ constexpr bool prepend_meta = true;
+ constexpr bool get_op = true;
+ constexpr bool rgwx_stat = true;
+ constexpr bool sync_manifest = true;
+ constexpr bool skip_decrypt = true;
+ int ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr,
+ dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
+ prepend_meta, get_op, rgwx_stat,
+ sync_manifest, skip_decrypt,
+ true, &cb, &in_stream_req);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = conn->complete_request(in_stream_req, nullptr, &set_mtime, psize,
+ nullptr, pheaders, null_yield);
+ if (ret < 0) {
+ return ret;
+ }
+
+ bufferlist& extra_data_bl = cb.get_extra_data();
+ if (extra_data_bl.length()) {
+ JSONParser jp;
+ if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
+ ldpp_dout(dpp, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
+ return -EIO;
+ }
+
+ JSONDecoder::decode_json("attrs", src_attrs, &jp);
+
+ src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
+ }
+
+ if (src_mtime) {
+ *src_mtime = set_mtime;
+ }
+
+ if (petag) {
+ map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
+ if (iter != src_attrs.end()) {
+ bufferlist& etagbl = iter->second;
+ *petag = etagbl.to_str();
+ while (petag->size() > 0 && (*petag)[petag->size() - 1] == '\0') {
+ *petag = petag->substr(0, petag->size() - 1);
+ }
+ }
+ }
+
+ if (pattrs) {
+ *pattrs = std::move(src_attrs);
+ }
+
+ return 0;
+}
+
+int RGWFetchObjFilter_Default::filter(CephContext *cct,
+ const rgw_obj_key& source_key,
+ const RGWBucketInfo& dest_bucket_info,
+ std::optional<rgw_placement_rule> dest_placement_rule,
+ const map<string, bufferlist>& obj_attrs,
+ std::optional<rgw_user> *poverride_owner,
+ const rgw_placement_rule **prule)
+{
+ const rgw_placement_rule *ptail_rule = (dest_placement_rule ? &(*dest_placement_rule) : nullptr);
+ if (!ptail_rule) {
+ auto iter = obj_attrs.find(RGW_ATTR_STORAGE_CLASS);
+ if (iter != obj_attrs.end()) {
+ dest_rule.storage_class = iter->second.to_str();
+ dest_rule.inherit_from(dest_bucket_info.placement_rule);
+ ptail_rule = &dest_rule;
+ } else {
+ ptail_rule = &dest_bucket_info.placement_rule;
+ }
+ }
+ *prule = ptail_rule;
+ return 0;
+}
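+
+/* A custom filter can remap ownership or placement during a fetch; a
+ * hypothetical sketch (fetch_remote_obj() below consumes override_owner):
+ *
+ *   struct MyFilter : RGWFetchObjFilter_Default {
+ *     int filter(CephContext *cct, const rgw_obj_key& source_key,
+ *                const RGWBucketInfo& dest_bucket_info,
+ *                std::optional<rgw_placement_rule> dest_placement_rule,
+ *                const map<string, bufferlist>& obj_attrs,
+ *                std::optional<rgw_user> *poverride_owner,
+ *                const rgw_placement_rule **prule) override {
+ *       *poverride_owner = remapped_uid;  // hypothetical owner remap
+ *       return RGWFetchObjFilter_Default::filter(cct, source_key,
+ *           dest_bucket_info, dest_placement_rule, obj_attrs,
+ *           poverride_owner, prule);
+ *     }
+ *   };
+ */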
+
+int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
+ const rgw_user& user_id,
+ req_info *info,
+ const rgw_zone_id& source_zone,
+ rgw::sal::Object* dest_obj,
+ rgw::sal::Object* src_obj,
+ rgw::sal::Bucket* dest_bucket,
+ rgw::sal::Bucket* src_bucket,
+ std::optional<rgw_placement_rule> dest_placement_rule,
+ real_time *src_mtime,
+ real_time *mtime,
+ const real_time *mod_ptr,
+ const real_time *unmod_ptr,
+ bool high_precision_time,
+ const char *if_match,
+ const char *if_nomatch,
+ AttrsMod attrs_mod,
+ bool copy_if_newer,
+ rgw::sal::Attrs& attrs,
+ RGWObjCategory category,
+ std::optional<uint64_t> olh_epoch,
+ real_time delete_at,
+ string *ptag,
+ string *petag,
+ void (*progress_cb)(off_t, void *),
+ void *progress_data,
+ const DoutPrefixProvider *dpp,
+ RGWFetchObjFilter *filter,
+ rgw_zone_set *zones_trace,
+ std::optional<uint64_t>* bytes_transferred)
+{
+ /* source is in a different zonegroup, copy from there */
+
+ RGWRESTStreamRWRequest *in_stream_req;
+ string tag;
+ int i;
+ append_rand_alpha(cct, tag, tag, 32);
+ obj_time_weight set_mtime_weight;
+ set_mtime_weight.high_precision = high_precision_time;
+ int ret;
+
+ rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
+ using namespace rgw::putobj;
+ AtomicObjectProcessor processor(&aio, this->driver, nullptr, user_id,
+ obj_ctx, dest_obj->clone(), olh_epoch,
+ tag, dpp, null_yield);
+ RGWRESTConn *conn;
+ auto& zone_conn_map = svc.zone->get_zone_conn_map();
+ auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
+ if (source_zone.empty()) {
+ if (!src_bucket || src_bucket->get_info().zonegroup.empty()) {
+ /* source is in the master zonegroup */
+ conn = svc.zone->get_master_conn();
+ } else {
+ map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket->get_info().zonegroup);
+ if (iter == zonegroup_conn_map.end()) {
+ ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
+ return -ENOENT;
+ }
+ conn = iter->second;
+ }
+ } else {
+ auto iter = zone_conn_map.find(source_zone);
+ if (iter == zone_conn_map.end()) {
+ ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl;
+ return -ENOENT;
+ }
+ conn = iter->second;
+ }
+
+ boost::optional<RGWPutObj_Compress> compressor;
+ CompressorRef plugin;
+
+ RGWFetchObjFilter_Default source_filter;
+ if (!filter) {
+ filter = &source_filter;
+ }
+
+ std::optional<rgw_user> override_owner;
+
+ RGWRadosPutObj cb(dpp, cct, plugin, compressor, &processor, progress_cb, progress_data,
+ [&](map<string, bufferlist>& obj_attrs) {
+ const rgw_placement_rule *ptail_rule;
+
+ int ret = filter->filter(cct,
+ src_obj->get_key(),
+ dest_bucket->get_info(),
+ dest_placement_rule,
+ obj_attrs,
+ &override_owner,
+ &ptail_rule);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "Aborting fetch: source object filter returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ processor.set_tail_placement(*ptail_rule);
+
+ const auto& compression_type = svc.zone->get_zone_params().get_compression_type(*ptail_rule);
+ if (compression_type != "none") {
+ plugin = Compressor::create(cct, compression_type);
+ if (!plugin) {
+ ldpp_dout(dpp, 1) << "Cannot load plugin for compression type "
+ << compression_type << dendl;
+ }
+ }
+
+ ret = processor.prepare(null_yield);
+ if (ret < 0) {
+ return ret;
+ }
+ return 0;
+ });
+
+ string etag;
+ real_time set_mtime;
+ uint64_t expected_size = 0;
+
+ RGWObjState *dest_state = NULL;
+ RGWObjManifest *manifest = nullptr;
+
+ const real_time *pmod = mod_ptr;
+
+ obj_time_weight dest_mtime_weight;
+
+ if (copy_if_newer) {
+ /* need to get mtime for destination */
+ ret = get_obj_state(dpp, &obj_ctx, dest_bucket->get_info(), dest_obj, &dest_state, &manifest, false, null_yield);
+ if (ret < 0)
+ goto set_err_state;
+
+ if (!real_clock::is_zero(dest_state->mtime)) {
+ dest_mtime_weight.init(dest_state);
+ pmod = &dest_mtime_weight.mtime;
+ }
+ }
+
+ static constexpr bool prepend_meta = true;
+ static constexpr bool get_op = true;
+ static constexpr bool rgwx_stat = false;
+ static constexpr bool sync_manifest = true;
+ static constexpr bool skip_decrypt = true;
+ ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr,
+ dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
+ prepend_meta, get_op, rgwx_stat,
+ sync_manifest, skip_decrypt,
+ true,
+ &cb, &in_stream_req);
+ if (ret < 0) {
+ goto set_err_state;
+ }
+
+ ret = conn->complete_request(in_stream_req, &etag, &set_mtime,
+ &expected_size, nullptr, nullptr, null_yield);
+ if (ret < 0) {
+ goto set_err_state;
+ }
+ ret = cb.flush();
+ if (ret < 0) {
+ goto set_err_state;
+ }
+ if (cb.get_data_len() != expected_size) {
+ ret = -EIO;
+ ldpp_dout(dpp, 0) << "ERROR: object truncated during fetching, expected "
+ << expected_size << " bytes but received " << cb.get_data_len() << dendl;
+ goto set_err_state;
+ }
+ if (compressor && compressor->is_compressed()) {
+ bufferlist tmp;
+ RGWCompressionInfo cs_info;
+ cs_info.compression_type = plugin->get_type_name();
+ cs_info.orig_size = cb.get_data_len();
+ cs_info.compressor_message = compressor->get_compressor_message();
+ cs_info.blocks = std::move(compressor->get_compression_blocks());
+ encode(cs_info, tmp);
+ cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
+ }
+
+ if (override_owner) {
+ processor.set_owner(*override_owner);
+
+ auto& obj_attrs = cb.get_attrs();
+
+ RGWUserInfo owner_info;
+ if (ctl.user->get_info_by_uid(dpp, *override_owner, &owner_info, null_yield) < 0) {
+ ldpp_dout(dpp, 10) << "owner info does not exist" << dendl;
+ return -EINVAL;
+ }
+
+ RGWAccessControlPolicy acl;
+
+ auto aiter = obj_attrs.find(RGW_ATTR_ACL);
+ if (aiter == obj_attrs.end()) {
+ ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): object doesn't have ACL attribute, setting default ACLs" << dendl;
+ acl.create_default(owner_info.user_id, owner_info.display_name);
+ } else {
+ auto iter = aiter->second.cbegin();
+ try {
+ acl.decode(iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): could not decode policy, caught buffer::error" << dendl;
+ return -EIO;
+ }
+ }
+
+ ACLOwner new_owner;
+ new_owner.set_id(*override_owner);
+ new_owner.set_name(owner_info.display_name);
+
+ acl.set_owner(new_owner);
+
+ bufferlist bl;
+ acl.encode(bl);
+ obj_attrs[RGW_ATTR_ACL] = std::move(bl);
+ }
+
+ if (source_zone.empty()) { /* intra-zonegroup copy: keep the caller's delete_at, don't inherit the source's */
+ cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
+ } else {
+ map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
+ if (iter != cb.get_attrs().end()) {
+ try {
+ decode(delete_at, iter->second);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
+ }
+ }
+ }
+
+ if (src_mtime) {
+ *src_mtime = set_mtime;
+ }
+
+ if (petag) {
+ const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
+ if (iter != cb.get_attrs().end()) {
+ *petag = iter->second.to_str();
+ }
+ }
+
+ // erase the append attr
+ cb.get_attrs().erase(RGW_ATTR_APPEND_PART_NUM);
+
+ { // add x-amz-replication-status=REPLICA
+ auto& bl = cb.get_attrs()[RGW_ATTR_OBJ_REPLICATION_STATUS];
+ bl.clear(); // overwrite source's status
+ bl.append("REPLICA");
+ }
+
+ if (source_zone.empty()) {
+ set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
+ } else {
+ attrs = cb.get_attrs();
+ }
+
+ if (copy_if_newer) {
+ uint64_t pg_ver = 0;
+ auto i = attrs.find(RGW_ATTR_PG_VER);
+ if (i != attrs.end() && i->second.length() > 0) {
+ auto iter = i->second.cbegin();
+ try {
+ decode(pg_ver, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
+ /* non critical error */
+ }
+ }
+ set_mtime_weight.init(set_mtime, svc.zone->get_zone_short_id(), pg_ver);
+ }
+
+ /* Perform ETag verification if we have computed the object's MD5 sum at our end */
+ if (const auto& verifier_etag = cb.get_verifier_etag();
+ !verifier_etag.empty()) {
+ string trimmed_etag = etag;
+
+ /* Remove the leading and trailing double quotes from etag */
+ trimmed_etag.erase(std::remove(trimmed_etag.begin(), trimmed_etag.end(),'\"'),
+ trimmed_etag.end());
+
+ if (verifier_etag != trimmed_etag) {
+ ret = -EIO;
+ ldpp_dout(dpp, 0) << "ERROR: source and destination objects don't match. Expected etag:"
+ << trimmed_etag << " Computed etag:" << verifier_etag << dendl;
+ goto set_err_state;
+ }
+ }
+
+#define MAX_COMPLETE_RETRY 100
+ for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
+ bool canceled = false;
+ ret = processor.complete(cb.get_data_len(), etag, mtime, set_mtime,
+ attrs, delete_at, nullptr, nullptr, nullptr,
+ zones_trace, &canceled, null_yield);
+ if (ret < 0) {
+ goto set_err_state;
+ }
+
+ if (copy_if_newer && canceled) {
+ ldpp_dout(dpp, 20) << "raced with another write of obj: " << dest_obj << dendl;
+ obj_ctx.invalidate(dest_obj->get_obj()); /* object was overwritten */
+ ret = get_obj_state(dpp, &obj_ctx, dest_bucket->get_info(), dest_obj, &dest_state, &manifest, false, null_yield);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
+ goto set_err_state;
+ }
+ dest_mtime_weight.init(dest_state);
+ dest_mtime_weight.high_precision = high_precision_time;
+ if (!dest_state->exists ||
+ dest_mtime_weight < set_mtime_weight) {
+ ldpp_dout(dpp, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
+ continue;
+ } else {
+ ldpp_dout(dpp, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
+ }
+ }
+ break;
+ }
+
+ if (i == MAX_COMPLETE_RETRY) {
+ ldpp_dout(dpp, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
+ ret = -EIO;
+ goto set_err_state;
+ }
+
+ if (bytes_transferred) {
+ *bytes_transferred = cb.get_data_len();
+ }
+ return 0;
+set_err_state:
+ if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
+ // we may have already fetched during sync of OP_ADD, but were waiting
+ // for OP_LINK_OLH to call set_olh() with a real olh_epoch
+ if (olh_epoch && *olh_epoch > 0) {
+ constexpr bool log_data_change = true;
+ ret = set_olh(dpp, obj_ctx, dest_bucket->get_info(), dest_obj, false, nullptr,
+ *olh_epoch, real_time(), false, null_yield, zones_trace, log_data_change);
+ } else {
+ // we already have the latest copy
+ ret = 0;
+ }
+ }
+ return ret;
+}
+
+
+int RGWRados::copy_obj_to_remote_dest(const DoutPrefixProvider *dpp,
+ RGWObjState *astate,
+ map<string, bufferlist>& src_attrs,
+ RGWRados::Object::Read& read_op,
+ const rgw_user& user_id,
+ rgw::sal::Object* dest_obj,
+ real_time *mtime)
+{
+ string etag;
+
+ RGWRESTStreamS3PutObj *out_stream_req;
+
+ auto rest_master_conn = svc.zone->get_master_conn();
+
+ int ret = rest_master_conn->put_obj_async_init(dpp, user_id, dest_obj, src_attrs, &out_stream_req);
+ if (ret < 0) {
+ return ret;
+ }
+
+ out_stream_req->set_send_length(astate->size);
+
+ ret = RGWHTTP::send(out_stream_req);
+ if (ret < 0) {
+ delete out_stream_req;
+ return ret;
+ }
+
+ ret = read_op.iterate(dpp, 0, astate->size - 1, out_stream_req->get_out_cb(), null_yield);
+ if (ret < 0) {
+ delete out_stream_req;
+ return ret;
+ }
+
+ ret = rest_master_conn->complete_request(out_stream_req, etag, mtime, null_yield);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+/**
+ * Copy an object.
+ * dest_obj: the object to copy into
+ * src_obj: the object to copy from
+ * attrs: usage depends on attrs_mod parameter
+ * attrs_mod: the modification mode of the attrs, may have the following values:
+ * ATTRSMOD_NONE - the attributes of the source object will be
+ * copied without modifications, attrs parameter is ignored;
+ * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
+ * parameter, source object attributes are not copied;
+ * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
+ * are overwritten by values contained in attrs parameter.
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
+ const rgw_user& user_id,
+ req_info *info,
+ const rgw_zone_id& source_zone,
+ rgw::sal::Object* dest_obj,
+ rgw::sal::Object* src_obj,
+ rgw::sal::Bucket* dest_bucket,
+ rgw::sal::Bucket* src_bucket,
+ const rgw_placement_rule& dest_placement,
+ real_time *src_mtime,
+ real_time *mtime,
+ const real_time *mod_ptr,
+ const real_time *unmod_ptr,
+ bool high_precision_time,
+ const char *if_match,
+ const char *if_nomatch,
+ AttrsMod attrs_mod,
+ bool copy_if_newer,
+ rgw::sal::Attrs& attrs,
+ RGWObjCategory category,
+ uint64_t olh_epoch,
+ real_time delete_at,
+ string *version_id,
+ string *ptag,
+ string *petag,
+ void (*progress_cb)(off_t, void *),
+ void *progress_data,
+ const DoutPrefixProvider *dpp,
+ optional_yield y)
+{
+ int ret;
+ uint64_t obj_size;
+ rgw_obj shadow_obj = dest_obj->get_obj();
+ string shadow_oid;
+
+ bool remote_src;
+ bool remote_dest;
+
+ append_rand_alpha(cct, dest_obj->get_oid(), shadow_oid, 32);
+ shadow_obj.init_ns(dest_obj->get_bucket()->get_key(), shadow_oid, shadow_ns);
+
+ auto& zonegroup = svc.zone->get_zonegroup();
+
+ remote_dest = !zonegroup.equals(dest_bucket->get_info().zonegroup);
+ remote_src = !zonegroup.equals(src_bucket->get_info().zonegroup);
+
+ if (remote_src && remote_dest) {
+ ldpp_dout(dpp, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
+ return -EINVAL;
+ }
+
+ ldpp_dout(dpp, 5) << "Copy object " << src_obj->get_bucket() << ":" << src_obj->get_oid() << " => " << dest_obj->get_bucket() << ":" << dest_obj->get_oid() << dendl;
+
+ if (remote_src || !source_zone.empty()) {
+ return fetch_remote_obj(obj_ctx, user_id, info, source_zone,
+ dest_obj, src_obj, dest_bucket, src_bucket,
+ dest_placement, src_mtime, mtime, mod_ptr,
+ unmod_ptr, high_precision_time,
+ if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
+ olh_epoch, delete_at, ptag, petag, progress_cb, progress_data, dpp,
+ nullptr /* filter */);
+ }
+
+ map<string, bufferlist> src_attrs;
+ RGWRados::Object src_op_target(this, src_bucket, obj_ctx, src_obj);
+ RGWRados::Object::Read read_op(&src_op_target);
+
+ read_op.conds.mod_ptr = mod_ptr;
+ read_op.conds.unmod_ptr = unmod_ptr;
+ read_op.conds.high_precision_time = high_precision_time;
+ read_op.conds.if_match = if_match;
+ read_op.conds.if_nomatch = if_nomatch;
+ read_op.params.attrs = &src_attrs;
+ read_op.params.lastmod = src_mtime;
+ read_op.params.obj_size = &obj_size;
+
+ ret = read_op.prepare(y, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+ if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) {
+ // The current implementation does not follow the S3 spec and may even
+ // silently corrupt data when copying multipart objects across pools.
+ // So reject COPY operations on encrypted objects until this is fully
+ // functional.
+ ldpp_dout(dpp, 0) << "ERROR: copy op for encrypted object " << src_obj
+ << " has not been implemented." << dendl;
+ return -ERR_NOT_IMPLEMENTED;
+ }
+
+ src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
+ src_attrs.erase(RGW_ATTR_DELETE_AT);
+
+ src_attrs.erase(RGW_ATTR_OBJECT_RETENTION);
+ src_attrs.erase(RGW_ATTR_OBJECT_LEGAL_HOLD);
+ map<string, bufferlist>::iterator rt = attrs.find(RGW_ATTR_OBJECT_RETENTION);
+ if (rt != attrs.end())
+ src_attrs[RGW_ATTR_OBJECT_RETENTION] = rt->second;
+ map<string, bufferlist>::iterator lh = attrs.find(RGW_ATTR_OBJECT_LEGAL_HOLD);
+ if (lh != attrs.end())
+ src_attrs[RGW_ATTR_OBJECT_LEGAL_HOLD] = lh->second;
+
+ set_copy_attrs(src_attrs, attrs, attrs_mod);
+ attrs.erase(RGW_ATTR_ID_TAG);
+ attrs.erase(RGW_ATTR_PG_VER);
+ attrs.erase(RGW_ATTR_SOURCE_ZONE);
+ map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
+ if (cmp != src_attrs.end())
+ attrs[RGW_ATTR_COMPRESSION] = cmp->second;
+
+ RGWObjManifest manifest;
+ RGWObjState *astate = NULL;
+ RGWObjManifest *amanifest = nullptr;
+
+ ret = get_obj_state(dpp, &obj_ctx, src_bucket->get_info(), src_obj, &astate, &amanifest, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ vector<rgw_raw_obj> ref_objs;
+
+ if (remote_dest) {
+ /* dest is in a different zonegroup, copy it there */
+ return copy_obj_to_remote_dest(dpp, astate, attrs, read_op, user_id, dest_obj, mtime);
+ }
+ uint64_t max_chunk_size;
+
+ ret = get_max_chunk_size(dest_bucket->get_placement_rule(), dest_obj->get_obj(), &max_chunk_size, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj->get_bucket() << dendl;
+ return ret;
+ }
+
+ rgw_pool src_pool;
+ rgw_pool dest_pool;
+
+ const rgw_placement_rule *src_rule{nullptr};
+
+ if (amanifest) {
+ src_rule = &amanifest->get_tail_placement().placement_rule;
+ ldpp_dout(dpp, 20) << __func__ << "(): manifest src_rule=" << src_rule->to_str() << dendl;
+ }
+
+ if (!src_rule || src_rule->empty()) {
+ src_rule = &src_bucket->get_placement_rule();
+ }
+
+ if (!get_obj_data_pool(*src_rule, src_obj->get_obj(), &src_pool)) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
+ return -EIO;
+ }
+
+ if (!get_obj_data_pool(dest_placement, dest_obj->get_obj(), &dest_pool)) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
+ return -EIO;
+ }
+
+ ldpp_dout(dpp, 20) << __func__ << "(): src_rule=" << src_rule->to_str() << " src_pool=" << src_pool
+ << " dest_rule=" << dest_placement.to_str() << " dest_pool=" << dest_pool << dendl;
+
+ bool copy_data = (!amanifest) ||
+ (*src_rule != dest_placement) ||
+ (src_pool != dest_pool);
+
+ bool copy_first = false;
+ if (amanifest) {
+ if (!amanifest->has_tail()) {
+ copy_data = true;
+ } else {
+ uint64_t head_size = amanifest->get_head_size();
+
+ if (head_size > 0) {
+ if (head_size > max_chunk_size) {
+ copy_data = true;
+ } else {
+ copy_first = true;
+ }
+ }
+ }
+ }
+
+ if (petag) {
+ const auto iter = attrs.find(RGW_ATTR_ETAG);
+ if (iter != attrs.end()) {
+ *petag = iter->second.to_str();
+ }
+ }
+
+ if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
+ attrs.erase(RGW_ATTR_TAIL_TAG);
+ return copy_obj_data(obj_ctx, dest_bucket, dest_placement, read_op, obj_size - 1, dest_obj,
+ mtime, real_time(), attrs, olh_epoch, delete_at, petag, dpp, y);
+ }
+
+ /* This has been in for 2 years, so we can safely assume amanifest is not NULL */
+ RGWObjManifest::obj_iterator miter = amanifest->obj_begin(dpp);
+
+ if (copy_first) { // we need to copy first chunk, not increase refcount
+ ++miter;
+ }
+
+ bufferlist first_chunk;
+
+ const bool copy_itself = (dest_obj->get_obj() == src_obj->get_obj());
+ RGWObjManifest *pmanifest;
+ ldpp_dout(dpp, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
+
+ RGWRados::Object dest_op_target(this, dest_bucket, obj_ctx, dest_obj);
+ RGWRados::Object::Write write_op(&dest_op_target);
+
+ string tag;
+
+ if (ptag) {
+ tag = *ptag;
+ }
+
+ if (tag.empty()) {
+ append_rand_alpha(cct, tag, tag, 32);
+ }
+
+ std::unique_ptr<rgw::Aio> aio;
+ rgw::AioResultList all_results;
+ if (!copy_itself) {
+ aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, y);
+ attrs.erase(RGW_ATTR_TAIL_TAG);
+ manifest = *amanifest;
+ const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
+ if (tail_placement.bucket.name.empty()) {
+ manifest.set_tail_placement(tail_placement.placement_rule, src_obj->get_bucket()->get_key());
+ }
+ string ref_tag;
+ for (; miter != amanifest->obj_end(dpp); ++miter) {
+ ObjectWriteOperation op;
+ ref_tag = tag + '\0';
+ cls_refcount_get(op, ref_tag, true);
+
+ auto obj = svc.rados->obj(miter.get_location().get_raw_obj(driver));
+ ret = obj.open(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "failed to open rados context for " << obj << dendl;
+ goto done_ret;
+ }
+
+ static constexpr uint64_t cost = 1; // 1 throttle unit per request
+ static constexpr uint64_t id = 0; // ids unused
+ rgw::AioResultList completed = aio->get(obj, rgw::Aio::librados_op(std::move(op), y), cost, id);
+ ret = rgw::check_for_errors(completed);
+ all_results.splice(all_results.end(), completed);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to copy obj=" << obj << ", the error code = " << ret << dendl;
+ goto done_ret;
+ }
+ }
+
+ rgw::AioResultList completed = aio->drain();
+ ret = rgw::check_for_errors(completed);
+ all_results.splice(all_results.end(), completed);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to drain ios, the error code = " << ret <<dendl;
+ goto done_ret;
+ }
+
+ pmanifest = &manifest;
+ } else {
+ pmanifest = amanifest;
+ /* don't send the object's tail for garbage collection */
+ astate->keep_tail = true;
+ }
+
+ if (copy_first) {
+ ret = read_op.read(0, max_chunk_size, first_chunk, y, dpp);
+ if (ret < 0) {
+ goto done_ret;
+ }
+
+ pmanifest->set_head(dest_bucket->get_placement_rule(), dest_obj->get_obj(), first_chunk.length());
+ } else {
+ pmanifest->set_head(dest_bucket->get_placement_rule(), dest_obj->get_obj(), 0);
+ }
+
+ write_op.meta.data = &first_chunk;
+ write_op.meta.manifest = pmanifest;
+ write_op.meta.ptag = &tag;
+ write_op.meta.owner = dest_bucket->get_info().owner;
+ write_op.meta.mtime = mtime;
+ write_op.meta.flags = PUT_OBJ_CREATE;
+ write_op.meta.category = category;
+ write_op.meta.olh_epoch = olh_epoch;
+ write_op.meta.delete_at = delete_at;
+ write_op.meta.modify_tail = !copy_itself;
+
+ ret = write_op.write_meta(dpp, obj_size, astate->accounted_size, attrs, y);
+ if (ret < 0) {
+ goto done_ret;
+ }
+
+ return 0;
+
+done_ret:
+ if (!copy_itself) {
+
+ /* wait all pending op done */
+ rgw::AioResultList completed = aio->drain();
+ all_results.splice(all_results.end(), completed);
+
+ /* rollback reference */
+ string ref_tag = tag + '\0';
+ int ret2 = 0;
+ for (auto& r : all_results) {
+ if (r.result < 0) {
+ continue; // skip errors
+ }
+ ObjectWriteOperation op;
+ cls_refcount_put(op, ref_tag, true);
+
+ static constexpr uint64_t cost = 1; // 1 throttle unit per request
+ static constexpr uint64_t id = 0; // ids unused
+ rgw::AioResultList completed = aio->get(r.obj, rgw::Aio::librados_op(std::move(op), y), cost, id);
+ ret2 = rgw::check_for_errors(completed);
+ if (ret2 < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << r.obj << dendl;
+ }
+ }
+ completed = aio->drain();
+ ret2 = rgw::check_for_errors(completed);
+ if (ret2 < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to drain rollback ios, the error code = " << ret2 <<dendl;
+ }
+ }
+ return ret;
+}
+
+
+int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
+ rgw::sal::Bucket* bucket,
+ const rgw_placement_rule& dest_placement,
+ RGWRados::Object::Read& read_op, off_t end,
+ rgw::sal::Object* dest_obj,
+ real_time *mtime,
+ real_time set_mtime,
+ rgw::sal::Attrs& attrs,
+ uint64_t olh_epoch,
+ real_time delete_at,
+ string *petag,
+ const DoutPrefixProvider *dpp,
+ optional_yield y)
+{
+ string tag;
+ append_rand_alpha(cct, tag, tag, 32);
+
+ rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
+ using namespace rgw::putobj;
+ // do not change the null_yield in the initialization of this AtomicObjectProcessor
+ // it causes crashes in the ragweed tests
+ AtomicObjectProcessor processor(&aio, this->driver, &dest_placement,
+ bucket->get_info().owner, obj_ctx,
+ dest_obj->clone(), olh_epoch, tag,
+ dpp, null_yield);
+ int ret = processor.prepare(y);
+ if (ret < 0)
+ return ret;
+
+ off_t ofs = 0;
+
+ do {
+ bufferlist bl;
+ ret = read_op.read(ofs, end, bl, y, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: fail to read object data, ret = " << ret << dendl;
+ return ret;
+ }
+
+ uint64_t read_len = ret;
+ ret = processor.process(std::move(bl), ofs);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ofs += read_len;
+ } while (ofs <= end);
+
+ // flush
+ ret = processor.process({}, ofs);
+ if (ret < 0) {
+ return ret;
+ }
+
+ string etag;
+ auto iter = attrs.find(RGW_ATTR_ETAG);
+ if (iter != attrs.end()) {
+ bufferlist& bl = iter->second;
+ etag = bl.to_str();
+ if (petag) {
+ *petag = etag;
+ }
+ }
+
+ uint64_t accounted_size;
+ {
+ bool compressed{false};
+ RGWCompressionInfo cs_info;
+ ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to read compression info" << dendl;
+ return ret;
+ }
+ // pass original size if compressed
+ accounted_size = compressed ? cs_info.orig_size : ofs;
+ }
+
+ return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
+ nullptr, nullptr, nullptr, nullptr, nullptr, y);
+}
+
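+/*
+ * Transition an object to a different placement rule (e.g. another storage
+ * class) by re-writing its data in place. Returns -ECANCELED if the object's
+ * mtime no longer matches, i.e. we raced with another write.
+ */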
+int RGWRados::transition_obj(RGWObjectCtx& obj_ctx,
+ rgw::sal::Bucket* bucket,
+ rgw::sal::Object& obj,
+ const rgw_placement_rule& placement_rule,
+ const real_time& mtime,
+ uint64_t olh_epoch,
+ const DoutPrefixProvider *dpp,
+ optional_yield y)
+{
+ rgw::sal::Attrs attrs;
+ real_time read_mtime;
+ uint64_t obj_size;
+
+ obj.set_atomic();
+ RGWRados::Object op_target(this, bucket, obj_ctx, &obj);
+ RGWRados::Object::Read read_op(&op_target);
+
+ read_op.params.attrs = &attrs;
+ read_op.params.lastmod = &read_mtime;
+ read_op.params.obj_size = &obj_size;
+
+ int ret = read_op.prepare(y, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (read_mtime != mtime) {
+ /* raced */
+ return -ECANCELED;
+ }
+
+ attrs.erase(RGW_ATTR_ID_TAG);
+ attrs.erase(RGW_ATTR_TAIL_TAG);
+
+ ret = copy_obj_data(obj_ctx,
+ bucket,
+ placement_rule,
+ read_op,
+ obj_size - 1,
+ &obj,
+ nullptr /* pmtime */,
+ mtime,
+ attrs,
+ olh_epoch,
+ real_time(),
+ nullptr /* petag */,
+ dpp,
+ y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return 0;
+}
+
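+/*
+ * Scan the bucket index in unordered batches of NUM_ENTRIES and return
+ * -ENOTEMPTY as soon as an entry in the plain (empty) namespace is found.
+ */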
+int RGWRados::check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y)
+{
+ constexpr uint NUM_ENTRIES = 1000u;
+
+ rgw_obj_index_key marker;
+ string prefix;
+ bool is_truncated;
+
+ do {
+ std::vector<rgw_bucket_dir_entry> ent_list;
+ ent_list.reserve(NUM_ENTRIES);
+
+ int r = cls_bucket_list_unordered(dpp,
+ bucket_info,
+ bucket_info.layout.current_index,
+ RGW_NO_SHARD,
+ marker,
+ prefix,
+ NUM_ENTRIES,
+ true,
+ ent_list,
+ &is_truncated,
+ &marker,
+ y);
+ if (r < 0) {
+ return r;
+ }
+
+ string ns;
+ for (auto const& dirent : ent_list) {
+ rgw_obj_key obj;
+
+ if (rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns)) {
+ return -ENOTEMPTY;
+ }
+ }
+ } while (is_truncated);
+
+ return 0;
+}
+
+/**
+ * Delete a bucket.
+ * bucket_info: metadata of the bucket to delete
+ * check_empty: if true, fail with -ENOTEMPTY while the bucket still holds objects
+ * Returns 0 on success, -ERR# otherwise.
+ */
+int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp, bool check_empty)
+{
+ const rgw_bucket& bucket = bucket_info.bucket;
+ RGWSI_RADOS::Pool index_pool;
+ map<int, string> bucket_objs;
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
+ if (r < 0)
+ return r;
+
+ if (check_empty) {
+ r = check_bucket_empty(dpp, bucket_info, y);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ bool remove_ep = true;
+
+ if (objv_tracker.read_version.empty()) {
+ RGWBucketEntryPoint ep;
+ r = ctl.bucket->read_bucket_entrypoint_info(bucket_info.bucket,
+ &ep,
+ null_yield,
+ dpp,
+ RGWBucketCtl::Bucket::GetParams()
+ .set_objv_tracker(&objv_tracker));
+ if (r < 0 ||
+ (!bucket_info.bucket.bucket_id.empty() &&
+ ep.bucket.bucket_id != bucket_info.bucket.bucket_id)) {
+ if (r != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: read_bucket_entrypoint_info() bucket=" << bucket_info.bucket << " returned error: r=" << r << dendl;
+ /* we have no idea what caused the error, will not try to remove it */
+ }
+ /*
+ * either failed to read bucket entrypoint, or it points to a different bucket instance than
+ * requested
+ */
+ remove_ep = false;
+ }
+ }
+
+ if (remove_ep) {
+ r = ctl.bucket->remove_bucket_entrypoint_info(bucket_info.bucket, null_yield, dpp,
+ RGWBucketCtl::Bucket::RemoveParams()
+ .set_objv_tracker(&objv_tracker));
+ if (r < 0)
+ return r;
+ }
+
+ /* if the bucket is not synced we can remove the meta file */
+ if (!svc.zone->is_syncing_bucket_meta(bucket)) {
+ RGWObjVersionTracker objv_tracker;
+ r = ctl.bucket->remove_bucket_instance_info(bucket, bucket_info, null_yield, dpp);
+ if (r < 0) {
+ return r;
+ }
+
+ /* remove bucket index objects asynchronously by best effort */
+ (void) CLSRGWIssueBucketIndexClean(index_pool.ioctx(),
+ bucket_objs,
+ cct->_conf->rgw_bucket_index_max_aio)();
+ }
+
+ return 0;
+}
+
+int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner, const DoutPrefixProvider *dpp)
+{
+ RGWBucketInfo info;
+ map<string, bufferlist> attrs;
+ int r;
+
+ if (bucket.bucket_id.empty()) {
+ r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp, &attrs);
+ } else {
+ r = get_bucket_instance_info(bucket, info, nullptr, &attrs, null_yield, dpp);
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
+ return r;
+ }
+
+ info.owner = owner.get_id();
+
+ r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+
+int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled, const DoutPrefixProvider *dpp)
+{
+ int ret = 0;
+
+ vector<rgw_bucket>::iterator iter;
+
+ for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
+ rgw_bucket& bucket = *iter;
+ if (enabled) {
+ ldpp_dout(dpp, 20) << "enabling bucket name=" << bucket.name << dendl;
+ } else {
+ ldpp_dout(dpp, 20) << "disabling bucket name=" << bucket.name << dendl;
+ }
+
+ RGWBucketInfo info;
+ map<string, bufferlist> attrs;
+ int r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp, &attrs);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
+ ret = r;
+ continue;
+ }
+ if (enabled) {
+ info.flags &= ~BUCKET_SUSPENDED;
+ } else {
+ info.flags |= BUCKET_SUSPENDED;
+ }
+
+ r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
+ ret = r;
+ continue;
+ }
+ }
+ return ret;
+}
+
+int RGWRados::bucket_suspended(const DoutPrefixProvider *dpp, rgw_bucket& bucket, bool *suspended)
+{
+ RGWBucketInfo bucket_info;
+ int ret = get_bucket_info(&svc, bucket.tenant, bucket.name, bucket_info, NULL, null_yield, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
+ return 0;
+}
+
+int RGWRados::Object::complete_atomic_modification(const DoutPrefixProvider *dpp)
+{
+ if (!manifest || state->keep_tail)
+ return 0;
+
+ cls_rgw_obj_chain chain;
+ store->update_gc_chain(dpp, obj->get_obj(), *manifest, &chain);
+
+ if (chain.empty()) {
+ return 0;
+ }
+
+ string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
+ if (store->gc == nullptr) {
+ ldpp_dout(dpp, 0) << "deleting objects inline since gc isn't initialized" << dendl;
+ // delete objects inline if gc hasn't been initialized, to prevent crashes
+ store->delete_objs_inline(dpp, chain, tag);
+ } else {
+ auto [ret, leftover_chain] = store->gc->send_split_chain(chain, tag); // do it synchronously
+ if (ret < 0 && leftover_chain) {
+ // delete objects inline if sending the chain to gc fails
+ store->delete_objs_inline(dpp, *leftover_chain, tag);
+ }
+ }
+ return 0;
+}
+
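+/*
+ * Build the garbage-collection chain for an object: walk the manifest and
+ * record every tail rados object, skipping the head since it is removed
+ * separately.
+ */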
+void RGWRados::update_gc_chain(const DoutPrefixProvider *dpp, rgw_obj head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
+{
+ RGWObjManifest::obj_iterator iter;
+ rgw_raw_obj raw_head;
+ obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
+ for (iter = manifest.obj_begin(dpp); iter != manifest.obj_end(dpp); ++iter) {
+ const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(driver);
+ if (mobj == raw_head)
+ continue;
+ cls_rgw_obj_key key(mobj.oid);
+ chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
+ }
+}
+
+std::tuple<int, std::optional<cls_rgw_obj_chain>> RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag)
+{
+ if (chain.empty()) {
+ return {0, std::nullopt};
+ }
+
+ return gc->send_split_chain(chain, tag);
+}
+
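+/*
+ * Synchronously drop the tag's refcount on every rados object in the chain,
+ * reusing one IoCtx per pool; used when gc is unavailable or enqueueing the
+ * chain to gc failed.
+ */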
+void RGWRados::delete_objs_inline(const DoutPrefixProvider *dpp, cls_rgw_obj_chain& chain, const string& tag)
+{
+ string last_pool;
+ std::unique_ptr<IoCtx> ctx(new IoCtx);
+ int ret = 0;
+ for (auto liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) {
+ cls_rgw_obj& obj = *liter;
+ if (obj.pool != last_pool) {
+ ctx.reset(new IoCtx);
+ ret = rgw_init_ioctx(dpp, get_rados_handle(), obj.pool, *ctx);
+ if (ret < 0) {
+ last_pool = "";
+ ldpp_dout(dpp, 0) << "ERROR: failed to create ioctx pool=" <<
+ obj.pool << dendl;
+ continue;
+ }
+ last_pool = obj.pool;
+ }
+ ctx->locator_set_key(obj.loc);
+ const string& oid = obj.key.name; /* the raw oid was stored in the key name */
+ ldpp_dout(dpp, 5) << "delete_objs_inline: removing " << obj.pool <<
+ ":" << obj.key.name << dendl;
+ ObjectWriteOperation op;
+ cls_refcount_put(op, tag, true);
+ ret = ctx->operate(oid, &op);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "delete_objs_inline: refcount put returned error " << ret << dendl;
+ }
+ }
+}
+
+static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
+ map<RGWObjCategory, RGWStorageStats>& stats)
+{
+ for (const auto& pair : header.stats) {
+ const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
+ const rgw_bucket_category_stats& header_stats = pair.second;
+
+ RGWStorageStats& s = stats[category];
+
+ s.category = category;
+ s.size += header_stats.total_size;
+ s.size_rounded += header_stats.total_size_rounded;
+ s.size_utilized += header_stats.actual_size;
+ s.num_objects += header_stats.num_entries;
+ }
+}
+
+int RGWRados::bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
+ map<RGWObjCategory, RGWStorageStats> *existing_stats,
+ map<RGWObjCategory, RGWStorageStats> *calculated_stats)
+{
+ RGWSI_RADOS::Pool index_pool;
+
+ // key - bucket index shard number
+ // value - oid of the bucket index object for that shard
+ map<int, string> oids;
+
+ int ret = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &oids, nullptr);
+ if (ret < 0) {
+ return ret;
+ }
+
+ // declare and pre-populate
+ map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
+ for (auto& iter : oids) {
+ bucket_objs_ret.emplace(iter.first, rgw_cls_check_index_ret());
+ }
+
+ ret = CLSRGWIssueBucketCheck(index_pool.ioctx(), oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
+ if (ret < 0) {
+ return ret;
+ }
+
+ // aggregate results (from different shards if there are any)
+ for (const auto& iter : bucket_objs_ret) {
+ accumulate_raw_stats(iter.second.existing_header, *existing_stats);
+ accumulate_raw_stats(iter.second.calculated_header, *calculated_stats);
+ }
+
+ return 0;
+}
+
+int RGWRados::bucket_rebuild_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info)
+{
+ RGWSI_RADOS::Pool index_pool;
+ map<int, string> bucket_objs;
+
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
+ if (r < 0) {
+ return r;
+ }
+
+ return CLSRGWIssueBucketRebuild(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
+}
+
+int RGWRados::bucket_set_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
+{
+ RGWSI_RADOS::Pool index_pool;
+ map<int, string> bucket_objs;
+
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ ": unable to open bucket index, r=" << r << " (" <<
+ cpp_strerror(-r) << ")" << dendl;
+ return r;
+ }
+
+ r = CLSRGWIssueSetBucketResharding(index_pool.ioctx(), bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ ": unable to issue set bucket resharding, r=" << r << " (" <<
+ cpp_strerror(-r) << ")" << dendl;
+ }
+ return r;
+}
+
+int RGWRados::defer_gc(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, optional_yield y)
+{
+ std::string oid, key;
+ get_obj_bucket_and_oid_loc(obj->get_obj(), oid, key);
+ if (!rctx)
+ return 0;
+
+ RGWObjState *state = NULL;
+ RGWObjManifest *manifest = nullptr;
+
+ int r = get_obj_state(dpp, rctx, bucket_info, obj, &state, &manifest, false, y);
+ if (r < 0)
+ return r;
+
+ if (!state->is_atomic) {
+ ldpp_dout(dpp, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
+ return -EINVAL;
+ }
+
+ string tag;
+
+ if (state->tail_tag.length() > 0) {
+ tag = state->tail_tag.c_str();
+ } else if (state->obj_tag.length() > 0) {
+ tag = state->obj_tag.c_str();
+ } else {
+ ldpp_dout(dpp, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
+ return -EINVAL;
+ }
+
+ ldpp_dout(dpp, 0) << "defer chain tag=" << tag << dendl;
+
+ cls_rgw_obj_chain chain;
+ update_gc_chain(dpp, state->obj, *manifest, &chain);
+ return gc->async_defer_chain(tag, chain);
+}
+
+void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
+{
+ list<string> prefixes;
+ prefixes.push_back(RGW_ATTR_OLH_PREFIX);
+ cls_rgw_remove_obj(op, prefixes);
+}
+
+void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
+{
+ cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
+}
+
+void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
+{
+ cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
+}
+
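+/* cached metadata of a deleted object, used to answer conditional requests
+ * against objects that no longer exist */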
+struct tombstone_entry {
+ ceph::real_time mtime;
+ uint32_t zone_short_id;
+ uint64_t pg_ver;
+
+ tombstone_entry() = default;
+ explicit tombstone_entry(const RGWObjState& state)
+ : mtime(state.mtime), zone_short_id(state.zone_short_id),
+ pg_ver(state.pg_ver) {}
+};
+
+/**
+ * Delete an object.
+ * The target object and its bucket come from the enclosing RGWRados::Object;
+ * on versioned buckets this sets a delete marker or unlinks an object
+ * instance instead of removing the head object.
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+int RGWRados::Object::Delete::delete_obj(optional_yield y, const DoutPrefixProvider *dpp)
+{
+ RGWRados *store = target->get_store();
+ const string& instance = target->get_instance();
+ rgw_obj obj = target->get_obj();
+
+ if (instance == "null") {
+ obj.key.instance.clear();
+ }
+
+ bool explicit_marker_version = (!params.marker_version_id.empty());
+
+ if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
+ if (instance.empty() || explicit_marker_version) {
+ std::unique_ptr<rgw::sal::Object> marker = target->get_target()->clone();
+ marker->clear_instance();
+
+ if (!params.marker_version_id.empty()) {
+ if (params.marker_version_id != "null") {
+ marker->set_instance(params.marker_version_id);
+ }
+ } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
+ marker->gen_rand_obj_instance_name();
+ }
+
+ result.version_id = marker->get_instance();
+ if (result.version_id.empty())
+ result.version_id = "null";
+ result.delete_marker = true;
+
+ struct rgw_bucket_dir_entry_meta meta;
+
+ meta.owner = params.obj_owner.get_id().to_str();
+ meta.owner_display_name = params.obj_owner.get_display_name();
+
+ if (real_clock::is_zero(params.mtime)) {
+ meta.mtime = real_clock::now();
+ } else {
+ meta.mtime = params.mtime;
+ }
+
+ int r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), marker.get(), true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, y, params.zones_trace);
+ if (r < 0) {
+ return r;
+ }
+ } else {
+ rgw_bucket_dir_entry dirent;
+
+ int r = store->bi_get_instance(dpp, target->get_bucket_info(), obj, &dirent);
+ if (r < 0) {
+ return r;
+ }
+ result.delete_marker = dirent.is_delete_marker();
+ r = store->unlink_obj_instance(dpp, target->get_bucket_info(), target->get_target(), params.olh_epoch, y, params.zones_trace);
+ if (r < 0) {
+ return r;
+ }
+ result.version_id = instance;
+ }
+
+ BucketShard *bs = nullptr;
+ int r = target->get_bucket_shard(&bs, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 5) << "failed to get BucketShard object: r=" << r << dendl;
+ return r;
+ }
+
+ add_datalog_entry(dpp, store->svc.datalog_rados,
+ target->get_bucket_info(), bs->shard_id);
+
+ return 0;
+ }
+
+ rgw_rados_ref ref;
+ int r = store->get_obj_head_ref(dpp, target->get_bucket_info(), obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ RGWObjState *state;
+ RGWObjManifest *manifest = nullptr;
+ r = target->get_state(dpp, &state, &manifest, false, y);
+ if (r < 0)
+ return r;
+
+ ObjectWriteOperation op;
+
+ if (!real_clock::is_zero(params.unmod_since)) {
+ struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
+ struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
+ if (!params.high_precision_time) {
+ ctime.tv_nsec = 0;
+ unmod.tv_nsec = 0;
+ }
+
+ ldpp_dout(dpp, 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
+ if (ctime > unmod) {
+ return -ERR_PRECONDITION_FAILED;
+ }
+
+ /* only delete object if mtime is less than or equal to params.unmod_since */
+ store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
+ }
+ uint64_t obj_accounted_size = state->accounted_size;
+
+ if (params.abortmp) {
+ obj_accounted_size = params.parts_accounted_size;
+ }
+
+ if (!real_clock::is_zero(params.expiration_time)) {
+ bufferlist bl;
+ real_time delete_at;
+
+ if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
+ try {
+ auto iter = bl.cbegin();
+ decode(delete_at, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
+ return -EIO;
+ }
+
+ if (params.expiration_time != delete_at) {
+ return -ERR_PRECONDITION_FAILED;
+ }
+ } else {
+ return -ERR_PRECONDITION_FAILED;
+ }
+ }
+
+ if (!state->exists) {
+ target->invalidate_state();
+ return -ENOENT;
+ }
+
+ r = target->prepare_atomic_modification(dpp, op, false, NULL, NULL, NULL, true, false, y);
+ if (r < 0)
+ return r;
+
+ RGWBucketInfo& bucket_info = target->get_bucket_info();
+
+ RGWRados::Bucket bop(store, bucket_info);
+ RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
+
+ index_op.set_zones_trace(params.zones_trace);
+ index_op.set_bilog_flags(params.bilog_flags);
+
+ r = index_op.prepare(dpp, CLS_RGW_OP_DEL, &state->write_tag, y);
+ if (r < 0)
+ return r;
+
+ store->remove_rgw_head_obj(op);
+
+ auto& ioctx = ref.pool.ioctx();
+ r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, y);
+
+ /* raced with another operation, object state is indeterminate */
+ const bool need_invalidate = (r == -ECANCELED);
+
+ int64_t poolid = ioctx.get_id();
+ if (r >= 0) {
+ tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
+ if (obj_tombstone_cache) {
+ tombstone_entry entry{*state};
+ obj_tombstone_cache->add(obj, entry);
+ }
+ r = index_op.complete_del(dpp, poolid, ioctx.get_last_version(), state->mtime, params.remove_objs);
+
+ int ret = target->complete_atomic_modification(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
+ }
+ /* other than that, no need to propagate error */
+ } else {
+ int ret = index_op.cancel(dpp, params.remove_objs);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
+ }
+ }
+
+ if (need_invalidate) {
+ target->invalidate_state();
+ }
+
+ if (r < 0)
+ return r;
+
+ /* update quota cache */
+ store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_accounted_size);
+
+ return 0;
+}
+
+int RGWRados::delete_obj(rgw::sal::Driver* store,
+ const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ const rgw_obj& obj,
+ int versioning_status, // versioning flags defined in enum RGWBucketFlags
+ uint16_t bilog_flags,
+ const real_time& expiration_time,
+ rgw_zone_set *zones_trace)
+{
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ store->get_bucket(nullptr, bucket_info, &bucket);
+ std::unique_ptr<rgw::sal::Object> object = bucket->get_object(obj.key);
+
+ return delete_obj(dpp, bucket_info, object.get(), versioning_status,
+ bilog_flags, expiration_time, zones_trace);
+}
+
+int RGWRados::delete_obj(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ rgw::sal::Object* obj,
+ int versioning_status, // versioning flags defined in enum RGWBucketFlags
+ uint16_t bilog_flags,
+ const real_time& expiration_time,
+ rgw_zone_set *zones_trace)
+{
+ std::unique_ptr<rgw::sal::Object::DeleteOp> del_op = obj->get_delete_op();
+
+ del_op->params.bucket_owner = bucket_info.owner;
+ del_op->params.versioning_status = versioning_status;
+ del_op->params.bilog_flags = bilog_flags;
+ del_op->params.expiration_time = expiration_time;
+ del_op->params.zones_trace = zones_trace;
+
+ return del_op->delete_obj(dpp, null_yield);
+}
+
+int RGWRados::delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj)
+{
+ rgw_rados_ref ref;
+ int r = get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ ObjectWriteOperation op;
+
+ op.remove();
+ r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+int RGWRados::delete_obj_index(const rgw_obj& obj, ceph::real_time mtime, const DoutPrefixProvider *dpp)
+{
+ std::string oid, key;
+ get_obj_bucket_and_oid_loc(obj, oid, key);
+
+ RGWBucketInfo bucket_info;
+ int ret = get_bucket_instance_info(obj.bucket, bucket_info, NULL, NULL, null_yield, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ RGWRados::Bucket bop(this, bucket_info);
+ RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
+
+ return index_op.complete_del(dpp, -1 /* pool */, 0, mtime, NULL);
+}
+
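+/*
+ * Synthesize an id tag for an object that has a manifest but no tag:
+ * combine a tail object's oid with an MD5 over the manifest (and the etag,
+ * if present) so the result is deterministic yet unique per object.
+ */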
+static void generate_fake_tag(const DoutPrefixProvider *dpp, rgw::sal::Driver* store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
+{
+ string tag;
+
+ RGWObjManifest::obj_iterator mi = manifest.obj_begin(dpp);
+ if (mi != manifest.obj_end(dpp)) {
+ if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
+ ++mi;
+ rgw::sal::RadosStore* rstore = dynamic_cast<rgw::sal::RadosStore*>(store);
+ tag = mi.get_location().get_raw_obj(rstore).oid;
+ tag.append("_");
+ }
+
+ unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
+ MD5 hash;
+ // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+ hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ hash.Update((const unsigned char *)manifest_bl.c_str(), manifest_bl.length());
+
+ map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
+ if (iter != attrset.end()) {
+ bufferlist& bl = iter->second;
+ hash.Update((const unsigned char *)bl.c_str(), bl.length());
+ }
+
+ hash.Final(md5);
+ buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
+ tag.append(md5_str);
+
+ ldpp_dout(dpp, 10) << "generate_fake_tag new tag=" << tag << dendl;
+
+ tag_bl.append(tag.c_str(), tag.size() + 1);
+}
+
+static bool is_olh(map<string, bufferlist>& attrs)
+{
+ map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
+ return (iter != attrs.end());
+}
+
+static bool has_olh_tag(map<string, bufferlist>& attrs)
+{
+ map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
+ return (iter != attrs.end());
+}
+
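+/*
+ * Resolve an object logical head (OLH) to its current target instance and
+ * read that instance's state and manifest. follow_olh() may return -EAGAIN
+ * if the OLH is being updated concurrently.
+ */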
+int RGWRados::get_olh_target_state(const DoutPrefixProvider *dpp, RGWObjectCtx&
+ obj_ctx, RGWBucketInfo& bucket_info,
+ rgw::sal::Object* obj, RGWObjState *olh_state,
+ RGWObjState **target_state,
+ RGWObjManifest **target_manifest, optional_yield y)
+{
+ ceph_assert(olh_state->is_olh);
+
+ rgw_obj target;
+ int r = RGWRados::follow_olh(dpp, bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
+ if (r < 0) {
+ return r;
+ }
+
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ driver->get_bucket(nullptr, bucket_info, &bucket);
+ std::unique_ptr<rgw::sal::Object> target_obj = bucket->get_object(target.key);
+
+ r = get_obj_state(dpp, &obj_ctx, bucket_info, target_obj.get(), target_state,
+ target_manifest, false, y);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
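+/*
+ * Read and cache an object's head state: stat the raw head object, then
+ * decode the etag, compression info, manifest, tags and OLH attributes from
+ * its xattrs. Missing objects are served from the tombstone cache when
+ * possible.
+ */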
+int RGWRados::get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx,
+ RGWBucketInfo& bucket_info, rgw::sal::Object* obj,
+ RGWObjState **state, RGWObjManifest** manifest,
+ bool follow_olh, optional_yield y, bool assume_noent)
+{
+ if (obj->empty()) {
+ return -EINVAL;
+ }
+
+ bool need_follow_olh = follow_olh && obj->get_obj().key.instance.empty();
+ *manifest = nullptr;
+
+ RGWObjStateManifest *sm = rctx->get_state(obj->get_obj());
+ RGWObjState *s = &(sm->state);
+ ldpp_dout(dpp, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
+ *state = s;
+ if (sm->manifest) {
+ *manifest = &(*sm->manifest);
+ }
+ if (s->has_attrs) {
+ if (s->is_olh && need_follow_olh) {
+ return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, manifest, y);
+ }
+ return 0;
+ }
+
+ s->obj = obj->get_obj();
+
+ rgw_raw_obj raw_obj;
+ obj_to_raw(bucket_info.placement_rule, obj->get_obj(), &raw_obj);
+
+ int r = -ENOENT;
+
+ if (!assume_noent) {
+ r = RGWRados::raw_obj_stat(dpp, raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL, y);
+ }
+
+ if (r == -ENOENT) {
+ s->exists = false;
+ s->has_attrs = true;
+ tombstone_entry entry;
+ if (obj_tombstone_cache && obj_tombstone_cache->find(obj->get_obj(), entry)) {
+ s->mtime = entry.mtime;
+ s->zone_short_id = entry.zone_short_id;
+ s->pg_ver = entry.pg_ver;
+ ldpp_dout(dpp, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
+ << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
+ } else {
+ s->mtime = real_time();
+ }
+ return 0;
+ }
+ if (r < 0)
+ return r;
+
+ s->exists = true;
+ s->has_attrs = true;
+ s->accounted_size = s->size;
+
+ auto iter = s->attrset.find(RGW_ATTR_ETAG);
+ if (iter != s->attrset.end()) {
+ /* get rid of extra null character at the end of the etag, as we used to store it like that */
+ bufferlist& bletag = iter->second;
+ if (bletag.length() > 0 && bletag[bletag.length() - 1] == '\0') {
+ bufferlist newbl;
+ bletag.splice(0, bletag.length() - 1, &newbl);
+ bletag = std::move(newbl);
+ }
+ }
+
+ iter = s->attrset.find(RGW_ATTR_COMPRESSION);
+ const bool compressed = (iter != s->attrset.end());
+ if (compressed) {
+ // use uncompressed size for accounted_size
+ try {
+ RGWCompressionInfo info;
+ auto p = iter->second.cbegin();
+ decode(info, p);
+ s->accounted_size = info.orig_size;
+ } catch (buffer::error&) {
+ ldpp_dout(dpp, 0) << "ERROR: could not decode compression info for object: " << obj << dendl;
+ return -EIO;
+ }
+ }
+
+ iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
+ if (iter != s->attrset.end()) {
+ bufferlist bl = iter->second;
+ bufferlist::iterator it = bl.begin();
+ it.copy(bl.length(), s->shadow_obj);
+ s->shadow_obj[bl.length()] = '\0';
+ }
+ s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
+ auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG);
+ if (ttiter != s->attrset.end()) {
+ s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG];
+ }
+
+ bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
+ if (manifest_bl.length()) {
+ auto miter = manifest_bl.cbegin();
+ try {
+ sm->manifest.emplace();
+ decode(*sm->manifest, miter);
+ /* patch the manifest to reflect the head we just read; some manifests
+ * might be broken due to old bugs */
+ sm->manifest->set_head(bucket_info.placement_rule, obj->get_obj(), s->size);
+ s->size = sm->manifest->get_obj_size();
+ if (!compressed)
+ s->accounted_size = s->size;
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: couldn't decode manifest" << dendl;
+ return -EIO;
+ }
+ *manifest = &(*sm->manifest);
+ ldpp_dout(dpp, 10) << "manifest: total_size = " << sm->manifest->get_obj_size() << dendl;
+ if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>() &&
+ sm->manifest->has_explicit_objs()) {
+ RGWObjManifest::obj_iterator mi;
+ for (mi = sm->manifest->obj_begin(dpp); mi != sm->manifest->obj_end(dpp); ++mi) {
+ ldpp_dout(dpp, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(driver) << dendl;
+ }
+ }
+
+ if (!s->obj_tag.length()) {
+ /*
+ * Something's wrong: an object with a manifest should have a tag.
+ * Derive one from the manifest; the result will be unique.
+ */
+ generate_fake_tag(dpp, driver, s->attrset, *sm->manifest, manifest_bl, s->obj_tag);
+ s->fake_tag = true;
+ }
+ }
+ map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
+ if (aiter != s->attrset.end()) {
+ bufferlist& pg_ver_bl = aiter->second;
+ if (pg_ver_bl.length()) {
+ auto pgbl = pg_ver_bl.cbegin();
+ try {
+ decode(s->pg_ver, pgbl);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
+ }
+ }
+ }
+ aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
+ if (aiter != s->attrset.end()) {
+ bufferlist& zone_short_id_bl = aiter->second;
+ if (zone_short_id_bl.length()) {
+ auto zbl = zone_short_id_bl.cbegin();
+ try {
+ decode(s->zone_short_id, zbl);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
+ }
+ }
+ }
+ if (s->obj_tag.length()) {
+ ldpp_dout(dpp, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
+ } else {
+ ldpp_dout(dpp, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
+ }
+
+ /* an object might not be an olh yet but could already carry an olh id tag,
+ * so set it whenever it exists, not only when is_olh() returns true
+ */
+ iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
+ if (iter != s->attrset.end()) {
+ s->olh_tag = iter->second;
+ }
+
+ if (is_olh(s->attrset)) {
+ s->is_olh = true;
+
+ ldpp_dout(dpp, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
+
+ if (need_follow_olh) {
+ return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, manifest, y);
+ } else if (obj->get_obj().key.have_null_instance() && !sm->manifest) {
+ // reading the null version, but the head object only has olh info
+ s->exists = false;
+ return -ENOENT;
+ }
+ }
+
+ return 0;
+}
+
+int RGWRados::get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, RGWObjState **state, RGWObjManifest** manifest,
+ bool follow_olh, optional_yield y, bool assume_noent)
+{
+ int ret;
+
+ do {
+ ret = get_obj_state_impl(dpp, rctx, bucket_info, obj, state, manifest, follow_olh, y, assume_noent);
+ } while (ret == -EAGAIN);
+
+ return ret;
+}
+
+int RGWRados::Object::get_manifest(const DoutPrefixProvider *dpp, RGWObjManifest **pmanifest, optional_yield y)
+{
+ RGWObjState *astate;
+ int r = get_state(dpp, &astate, pmanifest, true, y);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+int RGWRados::Object::Read::get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest, optional_yield y)
+{
+ RGWObjState *state;
+ RGWObjManifest *manifest = nullptr;
+ int r = source->get_state(dpp, &state, &manifest, true, y);
+ if (r < 0)
+ return r;
+ if (!state->exists)
+ return -ENOENT;
+ if (!state->get_attr(name, dest))
+ return -ENODATA;
+
+ return 0;
+}
+
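+/*
+ * Issue an asynchronous stat2+getxattrs on the head object; results are
+ * collected later via wait()/finish(). If the target already has cached
+ * attrs, answer immediately without a rados round trip.
+ */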
+int RGWRados::Object::Stat::stat_async(const DoutPrefixProvider *dpp)
+{
+ rgw::sal::Object* target = source->get_target();
+ rgw_obj obj = target->get_obj();
+ RGWRados *store = source->get_store();
+
+ result.obj = obj;
+ if (target->has_attrs()) {
+ state.ret = 0;
+ result.size = target->get_obj_size();
+ result.mtime = ceph::real_clock::to_timespec(target->get_mtime());
+ result.attrs = target->get_attrs();
+ //result.manifest = sm->manifest;
+ return 0;
+ }
+
+ string oid;
+ string loc;
+ get_obj_bucket_and_oid_loc(obj, oid, loc);
+
+ int r = store->get_obj_head_ioctx(dpp, source->get_bucket_info(), obj, &state.io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::ObjectReadOperation op;
+ op.stat2(&result.size, &result.mtime, NULL);
+ op.getxattrs(&result.attrs, NULL);
+ state.completion = librados::Rados::aio_create_completion(nullptr, nullptr);
+ state.io_ctx.locator_set_key(loc);
+ r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
+ if (r < 0) {
+ ldpp_dout(dpp, 5) << __func__
+ << ": ERROR: aio_operate() returned ret=" << r
+ << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+
+int RGWRados::Object::Stat::wait(const DoutPrefixProvider *dpp)
+{
+ if (!state.completion) {
+ return state.ret;
+ }
+
+ state.completion->wait_for_complete();
+ state.ret = state.completion->get_return_value();
+ state.completion->release();
+
+ if (state.ret != 0) {
+ return state.ret;
+ }
+
+ return finish(dpp);
+}
+
+int RGWRados::Object::Stat::finish(const DoutPrefixProvider *dpp)
+{
+ map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
+ if (iter != result.attrs.end()) {
+ bufferlist& bl = iter->second;
+ auto biter = bl.cbegin();
+ try {
+ result.manifest.emplace();
+ decode(*result.manifest, biter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
+int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info, rgw::sal::Object* obj,
+ ObjectOperation& op, RGWObjState **pstate,
+ RGWObjManifest** pmanifest, optional_yield y)
+{
+ int r = obj->get_obj_state(dpp, pstate, y, false);
+ if (r < 0)
+ return r;
+
+ return append_atomic_test(dpp, *pstate, op);
+}
+
+int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp,
+ const RGWObjState* state,
+ librados::ObjectOperation& op)
+{
+ if (!state->is_atomic) {
+ ldpp_dout(dpp, 20) << "state for obj=" << state->obj << " is not atomic, not appending atomic test" << dendl;
+ return 0;
+ }
+
+ if (state->obj_tag.length() > 0 && !state->fake_tag) { // check for backward compatibility
+ op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
+ } else {
+ ldpp_dout(dpp, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
+ }
+ return 0;
+}
+
+int RGWRados::Object::get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, RGWObjManifest **pmanifest, bool follow_olh, optional_yield y, bool assume_noent)
+{
+ int r = obj->get_obj_state(dpp, pstate, y, follow_olh);
+ if (r < 0) {
+ return r;
+ }
+ *pmanifest = static_cast<rgw::sal::RadosObject*>(obj)->get_manifest();
+
+ return r;
+}
+
+void RGWRados::Object::invalidate_state()
+{
+ obj->invalidate();
+}
+
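+/*
+ * Prepare a guarded write: add a cmpxattr on the current id tag (plus any
+ * if-match/if-nomatch etag checks) so the operation fails if the object
+ * changed underneath us, then stamp the new write tag unless this is a
+ * removal.
+ */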
+int RGWRados::Object::prepare_atomic_modification(const DoutPrefixProvider *dpp,
+ ObjectWriteOperation& op, bool reset_obj, const string *ptag,
+ const char *if_match, const char *if_nomatch, bool removal_op,
+ bool modify_tail, optional_yield y)
+{
+ int r = get_state(dpp, &state, &manifest, false, y);
+ if (r < 0)
+ return r;
+
+ bool need_guard = ((manifest) || (state->obj_tag.length() != 0) ||
+ if_match != NULL || if_nomatch != NULL) &&
+ (!state->fake_tag);
+
+ if (!state->is_atomic) {
+ ldpp_dout(dpp, 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
+
+ if (reset_obj) {
+ op.create(false);
+ store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
+ }
+
+ return 0;
+ }
+
+ if (need_guard) {
+ /* first verify that the object wasn't replaced under us */
+ if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
+ op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
+ // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
+ }
+
+ if (if_match) {
+ if (strcmp(if_match, "*") == 0) {
+ // test the object is existing
+ if (!state->exists) {
+ return -ERR_PRECONDITION_FAILED;
+ }
+ } else {
+ bufferlist bl;
+ if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
+ strncmp(if_match, bl.c_str(), bl.length()) != 0) {
+ return -ERR_PRECONDITION_FAILED;
+ }
+ }
+ }
+
+ if (if_nomatch) {
+ if (strcmp(if_nomatch, "*") == 0) {
+ // test the object is NOT existing
+ if (state->exists) {
+ return -ERR_PRECONDITION_FAILED;
+ }
+ } else {
+ bufferlist bl;
+ if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
+ strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
+ return -ERR_PRECONDITION_FAILED;
+ }
+ }
+ }
+ }
+
+ if (reset_obj) {
+ if (state->exists) {
+ op.create(false);
+ store->remove_rgw_head_obj(op);
+ } else {
+ op.create(true);
+ }
+ }
+
+ if (removal_op) {
+ /* the object is being removed, no need to update its tag */
+ return 0;
+ }
+
+ if (ptag) {
+ state->write_tag = *ptag;
+ } else {
+ append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
+ }
+ bufferlist bl;
+ bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
+
+ ldpp_dout(dpp, 10) << "setting object write_tag=" << state->write_tag << dendl;
+
+ op.setxattr(RGW_ATTR_ID_TAG, bl);
+ if (modify_tail) {
+ op.setxattr(RGW_ATTR_TAIL_TAG, bl);
+ }
+
+ return 0;
+}
+
+/**
+ * Set an attr on an object.
+ * bucket: name of the bucket holding the object
+ * obj: name of the object to set the attr on
+ * name: the attr to set
+ * bl: the contents of the attr
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+int RGWRados::set_attr(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, const char *name, bufferlist& bl)
+{
+ map<string, bufferlist> attrs;
+ attrs[name] = bl;
+ return set_attrs(dpp, rctx, bucket_info, obj, attrs, NULL, null_yield);
+}
+
+int RGWRados::set_attrs(const DoutPrefixProvider *dpp, void *ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* src_obj,
+ map<string, bufferlist>& attrs,
+ map<string, bufferlist>* rmattrs,
+ optional_yield y)
+{
+ std::unique_ptr<rgw::sal::Object> obj = src_obj->clone();
+ if (obj->get_instance() == "null") {
+ obj->clear_instance();
+ }
+
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, obj->get_obj(), &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ ObjectWriteOperation op;
+ RGWObjState *state = NULL;
+ RGWObjManifest *manifest = nullptr;
+
+ r = append_atomic_test(dpp, bucket_info, obj.get(), op, &state, &manifest, y);
+ if (r < 0)
+ return r;
+
+ // ensure the null version object exists
+ if (src_obj->get_instance() == "null" && !manifest) {
+ return -ENOENT;
+ }
+
+ map<string, bufferlist>::iterator iter;
+ if (rmattrs) {
+ for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
+ const string& name = iter->first;
+ op.rmxattr(name.c_str());
+ }
+ }
+
+ const rgw_bucket& bucket = obj->get_bucket()->get_key();
+
+ for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
+ const string& name = iter->first;
+ bufferlist& bl = iter->second;
+
+ if (!bl.length())
+ continue;
+
+ op.setxattr(name.c_str(), bl);
+
+ if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
+ real_time ts;
+ try {
+ decode(ts, bl);
+
+ rgw_obj_index_key obj_key;
+ obj->get_key().get_index_key(&obj_key);
+
+ obj_expirer->hint_add(dpp, ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
+ }
+ }
+ }
+
+ if (!op.size())
+ return 0;
+
+ bufferlist bl;
+ RGWRados::Bucket bop(this, bucket_info);
+ RGWRados::Bucket::UpdateIndex index_op(&bop, obj->get_obj());
+
+ if (state) {
+ string tag;
+ append_rand_alpha(cct, tag, tag, 32);
+ state->write_tag = tag;
+ r = index_op.prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y);
+
+ if (r < 0)
+ return r;
+
+ bl.append(tag.c_str(), tag.size() + 1);
+ op.setxattr(RGW_ATTR_ID_TAG, bl);
+ }
+
+
+ real_time mtime = real_clock::now();
+ struct timespec mtime_ts = real_clock::to_timespec(mtime);
+ op.mtime2(&mtime_ts);
+ auto& ioctx = ref.pool.ioctx();
+ r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, null_yield);
+ if (state) {
+ if (r >= 0) {
+ bufferlist acl_bl = attrs[RGW_ATTR_ACL];
+ bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
+ bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
+ string etag = rgw_bl_str(etag_bl);
+ string content_type = rgw_bl_str(content_type_bl);
+ string storage_class;
+ auto iter = attrs.find(RGW_ATTR_STORAGE_CLASS);
+ if (iter != attrs.end()) {
+ storage_class = rgw_bl_str(iter->second);
+ }
+ uint64_t epoch = ioctx.get_last_version();
+ int64_t poolid = ioctx.get_id();
+ r = index_op.complete(dpp, poolid, epoch, state->size, state->accounted_size,
+ mtime, etag, content_type, storage_class, &acl_bl,
+ RGWObjCategory::Main, NULL);
+ } else {
+ int ret = index_op.cancel(dpp, nullptr);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
+ }
+ }
+ }
+ if (r < 0)
+ return r;
+
+ if (state) {
+ state->obj_tag.swap(bl);
+ if (rmattrs) {
+ for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
+ state->attrset.erase(iter->first);
+ }
+ }
+
+ for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
+ state->attrset[iter->first] = iter->second;
+ }
+
+ auto iter = state->attrset.find(RGW_ATTR_ID_TAG);
+ if (iter != state->attrset.end()) {
+ iter->second = state->obj_tag;
+ }
+ }
+
+ return 0;
+}
+
+int RGWRados::Object::Read::prepare(optional_yield y, const DoutPrefixProvider *dpp)
+{
+ RGWRados *store = source->get_store();
+ CephContext *cct = store->ctx();
+
+ bufferlist etag;
+
+ map<string, bufferlist>::iterator iter;
+
+ RGWObjState *astate;
+ RGWObjManifest *manifest = nullptr;
+ int r = source->get_state(dpp, &astate, &manifest, true, y);
+ if (r < 0)
+ return r;
+
+ if (!astate->exists) {
+ return -ENOENT;
+ }
+
+ const RGWBucketInfo& bucket_info = source->get_bucket_info();
+
+ state.obj = astate->obj;
+ store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
+
+ state.cur_pool = state.head_obj.pool;
+ state.cur_ioctx = &state.io_ctxs[state.cur_pool];
+
+ r = store->get_obj_head_ioctx(dpp, bucket_info, state.obj, state.cur_ioctx);
+ if (r < 0) {
+ return r;
+ }
+ if (params.target_obj) {
+ *params.target_obj = state.obj;
+ }
+ if (params.attrs) {
+ *params.attrs = astate->attrset;
+ if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+ for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
+ ldpp_dout(dpp, 20) << "Read xattr rgw_rados: " << iter->first << dendl;
+ }
+ }
+ }
+
+ /* Convert all times to GMT so they are comparable */
+ if (conds.mod_ptr || conds.unmod_ptr) {
+ obj_time_weight src_weight;
+ src_weight.init(astate);
+ src_weight.high_precision = conds.high_precision_time;
+
+ obj_time_weight dest_weight;
+ dest_weight.high_precision = conds.high_precision_time;
+
+ if (conds.mod_ptr && !conds.if_nomatch) {
+ dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
+ ldpp_dout(dpp, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
+ if (!(dest_weight < src_weight)) {
+ return -ERR_NOT_MODIFIED;
+ }
+ }
+
+ if (conds.unmod_ptr && !conds.if_match) {
+ dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
+ ldpp_dout(dpp, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
+ if (dest_weight < src_weight) {
+ return -ERR_PRECONDITION_FAILED;
+ }
+ }
+ }
+ if (conds.if_match || conds.if_nomatch) {
+ r = get_attr(dpp, RGW_ATTR_ETAG, etag, y);
+ if (r < 0)
+ return r;
+
+ if (conds.if_match) {
+ string if_match_str = rgw_string_unquote(conds.if_match);
+ ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-Match: " << if_match_str << dendl;
+ if (if_match_str.compare(0, etag.length(), etag.c_str(), etag.length()) != 0) {
+ return -ERR_PRECONDITION_FAILED;
+ }
+ }
+
+ if (conds.if_nomatch) {
+ string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
+ ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-NoMatch: " << if_nomatch_str << dendl;
+ if (if_nomatch_str.compare(0, etag.length(), etag.c_str(), etag.length()) == 0) {
+ return -ERR_NOT_MODIFIED;
+ }
+ }
+ }
+
+ if (params.obj_size)
+ *params.obj_size = astate->size;
+ if (params.lastmod)
+ *params.lastmod = astate->mtime;
+
+ return 0;
+}
+
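+/*
+ * Normalize an HTTP-style byte range against the object size: a negative
+ * ofs means a suffix range of the last -ofs bytes, and a negative end means
+ * "to the end of the object". Returns -ERANGE if ofs is past the end.
+ */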
+int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
+{
+ if (ofs < 0) {
+ ofs += obj_size;
+ if (ofs < 0)
+ ofs = 0;
+ end = obj_size - 1;
+ } else if (end < 0) {
+ end = obj_size - 1;
+ }
+
+ if (obj_size > 0) {
+ if (ofs >= (off_t)obj_size) {
+ return -ERANGE;
+ }
+ if (end >= (off_t)obj_size) {
+ end = obj_size - 1;
+ }
+ }
+ return 0;
+}
+
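+/*
+ * Run a bucket-index operation with reshard protection: on
+ * -ERR_BUSY_RESHARDING, block until resharding completes, refresh the
+ * bucket shard and retry (up to NUM_RESHARD_RETRIES attempts per reshard).
+ */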
+int RGWRados::Bucket::UpdateIndex::guard_reshard(const DoutPrefixProvider *dpp, const rgw_obj& obj_instance, BucketShard **pbs, std::function<int(BucketShard *)> call)
+{
+ RGWRados *store = target->get_store();
+ BucketShard *bs = nullptr;
+ int r;
+
+#define NUM_RESHARD_RETRIES 10
+ for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
+ int ret = get_bucket_shard(&bs, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to get BucketShard object. obj=" <<
+ obj_instance.key << ". ret=" << ret << dendl;
+ return ret;
+ }
+
+ r = call(bs);
+ if (r != -ERR_BUSY_RESHARDING) {
+ break;
+ }
+
+ ldpp_dout(dpp, 10) <<
+ "NOTICE: resharding operation on bucket index detected, blocking. obj=" <<
+ obj_instance.key << dendl;
+
+ r = store->block_while_resharding(bs, obj_instance, target->bucket_info, null_yield, dpp);
+ if (r == -ERR_BUSY_RESHARDING) {
+ ldpp_dout(dpp, 10) << __func__ <<
+ " NOTICE: block_while_resharding() still busy. obj=" <<
+ obj_instance.key << dendl;
+ continue;
+ } else if (r < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: block_while_resharding() failed. obj=" <<
+ obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ ldpp_dout(dpp, 20) << "reshard completion identified. obj=" << obj_instance.key << dendl;
+ i = 0; /* resharding is finished, make sure we can retry */
+ invalidate_bs();
+ } // for loop
+
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: bucket shard callback failed. obj=" <<
+ obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ if (pbs) {
+ *pbs = bs;
+ }
+
+ return 0;
+}
+
+int RGWRados::Bucket::UpdateIndex::prepare(const DoutPrefixProvider *dpp, RGWModifyOp op, const string *write_tag, optional_yield y)
+{
+ if (blind) {
+ return 0;
+ }
+ RGWRados *store = target->get_store();
+
+ if (write_tag && write_tag->length()) {
+ optag = string(write_tag->c_str(), write_tag->length());
+ } else {
+ if (optag.empty()) {
+ append_rand_alpha(store->ctx(), optag, optag, 32);
+ }
+ }
+
+ int r = guard_reshard(dpp, obj, nullptr, [&](BucketShard *bs) -> int {
+ return store->cls_obj_prepare_op(dpp, *bs, op, optag, obj, bilog_flags, y, zones_trace);
+ });
+
+ if (r < 0) {
+ return r;
+ }
+ prepared = true;
+
+ return 0;
+}
+
+int RGWRados::Bucket::UpdateIndex::complete(const DoutPrefixProvider *dpp, int64_t poolid, uint64_t epoch,
+ uint64_t size, uint64_t accounted_size,
+ ceph::real_time& ut, const string& etag,
+ const string& content_type, const string& storage_class,
+ bufferlist *acl_bl,
+ RGWObjCategory category,
+ list<rgw_obj_index_key> *remove_objs, const string *user_data,
+ bool appendable)
+{
+ if (blind) {
+ return 0;
+ }
+ RGWRados *store = target->get_store();
+ BucketShard *bs = nullptr;
+
+ int ret = get_bucket_shard(&bs, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
+ return ret;
+ }
+
+ rgw_bucket_dir_entry ent;
+ obj.key.get_index_key(&ent.key);
+ ent.meta.size = size;
+ ent.meta.accounted_size = accounted_size;
+ ent.meta.mtime = ut;
+ ent.meta.etag = etag;
+ ent.meta.storage_class = storage_class;
+ if (user_data)
+ ent.meta.user_data = *user_data;
+
+ ACLOwner owner;
+ if (acl_bl && acl_bl->length()) {
+ int ret = store->decode_policy(dpp, *acl_bl, &owner);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: could not decode policy ret=" << ret << dendl;
+ }
+ }
+ ent.meta.owner = owner.get_id().to_str();
+ ent.meta.owner_display_name = owner.get_display_name();
+ ent.meta.content_type = content_type;
+ ent.meta.appendable = appendable;
+
+ ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
+
+ add_datalog_entry(dpp, store->svc.datalog_rados,
+ target->bucket_info, bs->shard_id);
+
+ return ret;
+}
+
+int RGWRados::Bucket::UpdateIndex::complete_del(const DoutPrefixProvider *dpp,
+ int64_t poolid, uint64_t epoch,
+ real_time& removed_mtime,
+ list<rgw_obj_index_key> *remove_objs)
+{
+ if (blind) {
+ return 0;
+ }
+ RGWRados *store = target->get_store();
+ BucketShard *bs = nullptr;
+
+ int ret = get_bucket_shard(&bs, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
+ return ret;
+ }
+
+ ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
+
+ add_datalog_entry(dpp, store->svc.datalog_rados,
+ target->bucket_info, bs->shard_id);
+
+ return ret;
+}
+
+
+int RGWRados::Bucket::UpdateIndex::cancel(const DoutPrefixProvider *dpp,
+ list<rgw_obj_index_key> *remove_objs)
+{
+ if (blind) {
+ return 0;
+ }
+ RGWRados *store = target->get_store();
+ BucketShard *bs;
+
+ int ret = guard_reshard(dpp, obj, &bs, [&](BucketShard *bs) -> int {
+ return store->cls_obj_complete_cancel(*bs, optag, obj, remove_objs, bilog_flags, zones_trace);
+ });
+
+ /*
+ * we need to update the data log anyhow so that anyone following this
+ * bucket shard's log can advance its internal markers; otherwise they
+ * stay behind with no way to tell they're fully caught up
+ */
+ add_datalog_entry(dpp, store->svc.datalog_rados,
+ target->bucket_info, bs->shard_id);
+
+ return ret;
+}
+
+/*
+ * Read up through index `end` inclusive. Number of bytes read is up
+ * to `end - ofs + 1`.
+ */
+int RGWRados::Object::Read::read(int64_t ofs, int64_t end,
+ bufferlist& bl, optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ RGWRados *store = source->get_store();
+
+ rgw_raw_obj read_obj;
+ uint64_t read_ofs = ofs;
+ uint64_t len, read_len;
+ bool reading_from_head = true;
+ ObjectReadOperation op;
+
+ bool merge_bl = false;
+ bufferlist *pbl = &bl;
+ bufferlist read_bl;
+ uint64_t max_chunk_size;
+
+ RGWObjState *astate;
+ RGWObjManifest *manifest = nullptr;
+ int r = source->get_state(dpp, &astate, &manifest, true, y);
+ if (r < 0)
+ return r;
+
+ if (astate->size == 0) {
+ end = 0;
+ } else if (end >= (int64_t)astate->size) {
+ end = astate->size - 1;
+ }
+
+ if (end < 0)
+ len = 0;
+ else
+ len = end - ofs + 1;
+
+ if (manifest && manifest->has_tail()) {
+ /* now get the relevant object part */
+ RGWObjManifest::obj_iterator iter = manifest->obj_find(dpp, ofs);
+
+ uint64_t stripe_ofs = iter.get_stripe_ofs();
+ read_obj = iter.get_location().get_raw_obj(store->driver);
+ len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
+ read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
+ reading_from_head = (read_obj == state.head_obj);
+ } else {
+ read_obj = state.head_obj;
+ }
+
+ r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
+ return r;
+ }
+
+ if (len > max_chunk_size)
+ len = max_chunk_size;
+
+
+ read_len = len;
+
+ if (reading_from_head) {
+ /* only when reading from the head object do we need to do the atomic test */
+ std::unique_ptr<rgw::sal::Object> obj = source->bucket->get_object(state.obj.key);
+ r = store->append_atomic_test(dpp, source->get_bucket_info(), obj.get(), op, &astate, &manifest, y);
+ if (r < 0)
+ return r;
+
+ if (astate && astate->prefetch_data) {
+ if (!ofs && astate->data.length() >= len) {
+ bl = astate->data;
+ return bl.length();
+ }
+
+ if (ofs < astate->data.length()) {
+ unsigned copy_len = std::min((uint64_t)astate->data.length() - ofs, len);
+ astate->data.begin(ofs).copy(copy_len, bl);
+ read_len -= copy_len;
+ read_ofs += copy_len;
+ if (!read_len)
+ return bl.length();
+
+ merge_bl = true;
+ pbl = &read_bl;
+ }
+ }
+ }
+
+ ldpp_dout(dpp, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
+ op.read(read_ofs, read_len, pbl, NULL);
+
+ if (state.cur_pool != read_obj.pool) {
+ auto iter = state.io_ctxs.find(read_obj.pool);
+ if (iter == state.io_ctxs.end()) {
+ state.cur_ioctx = &state.io_ctxs[read_obj.pool];
+ r = store->open_pool_ctx(dpp, read_obj.pool, *state.cur_ioctx, false);
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << "ERROR: failed to open pool context for pool=" << read_obj.pool << " r=" << r << dendl;
+ return r;
+ }
+ } else {
+ state.cur_ioctx = &iter->second;
+ }
+ state.cur_pool = read_obj.pool;
+ }
+
+ state.cur_ioctx->locator_set_key(read_obj.loc);
+
+ r = state.cur_ioctx->operate(read_obj.oid, &op, NULL);
+ ldpp_dout(dpp, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
+
+ if (r < 0) {
+ return r;
+ }
+
+ if (merge_bl) {
+ bl.append(read_bl);
+ }
+
+ return bl.length();
+}
+
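+/*
+ * Flush completed reads back to the client in offset order: completions are
+ * merged into a list sorted by logical object offset, and only the
+ * contiguous prefix starting at the current offset is delivered (and
+ * optionally written to the D3N data cache).
+ */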
+int get_obj_data::flush(rgw::AioResultList&& results) {
+ int r = rgw::check_for_errors(results);
+ if (r < 0) {
+ return r;
+ }
+ std::list<bufferlist> bl_list;
+
+ auto cmp = [](const auto& lhs, const auto& rhs) { return lhs.id < rhs.id; };
+ results.sort(cmp); // merge() requires results to be sorted first
+ completed.merge(results, cmp); // merge results in sorted order
+
+ while (!completed.empty() && completed.front().id == offset) {
+ auto bl = std::move(completed.front().data);
+
+ bl_list.push_back(bl);
+ offset += bl.length();
+ int r = client_cb->handle_data(bl, 0, bl.length());
+ if (r < 0) {
+ return r;
+ }
+
+ if (rgwrados->get_use_datacache()) {
+ const std::lock_guard l(d3n_get_data.d3n_lock);
+ auto oid = completed.front().obj.get_ref().obj.oid;
+ if (bl.length() <= g_conf()->rgw_get_obj_max_req_size && !d3n_bypass_cache_write) {
+ lsubdout(g_ceph_context, rgw_datacache, 10) << "D3nDataCache: " << __func__ << "(): bl.length <= rgw_get_obj_max_req_size (default 4MB) - write to datacache, bl.length=" << bl.length() << dendl;
+ rgwrados->d3n_data_cache->put(bl, bl.length(), oid);
+ } else {
+ lsubdout(g_ceph_context, rgw_datacache, 10) << "D3nDataCache: " << __func__ << "(): not writing to datacache - bl.length > rgw_get_obj_max_req_size (default 4MB), bl.length=" << bl.length() << " or d3n_bypass_cache_write=" << d3n_bypass_cache_write << dendl;
+ }
+ }
+ completed.pop_front_and_dispose(std::default_delete<rgw::AioResultEntry>{});
+ }
+ return 0;
+}
+
+static int _get_obj_iterate_cb(const DoutPrefixProvider *dpp,
+ const rgw_raw_obj& read_obj, off_t obj_ofs,
+ off_t read_ofs, off_t len, bool is_head_obj,
+ RGWObjState *astate, void *arg)
+{
+ struct get_obj_data* d = static_cast<struct get_obj_data*>(arg);
+ return d->rgwrados->get_obj_iterate_cb(dpp, read_obj, obj_ofs, read_ofs, len,
+ is_head_obj, astate, arg);
+}
+
+int RGWRados::get_obj_iterate_cb(const DoutPrefixProvider *dpp,
+ const rgw_raw_obj& read_obj, off_t obj_ofs,
+ off_t read_ofs, off_t len, bool is_head_obj,
+ RGWObjState *astate, void *arg)
+{
+ ObjectReadOperation op;
+ struct get_obj_data* d = static_cast<struct get_obj_data*>(arg);
+ string oid, key;
+
+ if (is_head_obj) {
+ /* only when reading from the head object do we need to do the atomic test */
+ int r = append_atomic_test(dpp, astate, op);
+ if (r < 0)
+ return r;
+
+ if (astate &&
+ obj_ofs < astate->data.length()) {
+ unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
+
+ r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
+ if (r < 0)
+ return r;
+
+ len -= chunk_len;
+ d->offset += chunk_len;
+ read_ofs += chunk_len;
+ obj_ofs += chunk_len;
+ if (!len)
+ return 0;
+ }
+ }
+
+ auto obj = d->rgwrados->svc.rados->obj(read_obj);
+ int r = obj.open(dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 4) << "failed to open rados context for " << read_obj << dendl;
+ return r;
+ }
+
+ ldpp_dout(dpp, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
+ op.read(read_ofs, len, nullptr, nullptr);
+
+ const uint64_t cost = len;
+ const uint64_t id = obj_ofs; // use logical object offset for sorting replies
+
+ auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);
+
+ return d->flush(std::move(completed));
+}
+
+int RGWRados::Object::Read::iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb,
+ optional_yield y)
+{
+ RGWRados *store = source->get_store();
+ CephContext *cct = store->ctx();
+ const uint64_t chunk_size = cct->_conf->rgw_get_obj_max_req_size;
+ const uint64_t window_size = cct->_conf->rgw_get_obj_window_size;
+
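+ // the throttle bounds the bytes of in-flight reads at window_size;
+ // each rados read issued below costs at most chunk_size of that window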
+ auto aio = rgw::make_throttle(window_size, y);
+ get_obj_data data(store, cb, &*aio, ofs, y);
+
+ int r = store->iterate_obj(dpp, source->get_ctx(), source->get_bucket_info(),
+ source->get_target(),
+ ofs, end, chunk_size, _get_obj_iterate_cb, &data, y);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "iterate_obj() failed with " << r << dendl;
+ data.cancel(); // drain completions without writing back to client
+ return r;
+ }
+
+ return data.drain();
+}
+
+int RGWRados::iterate_obj(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx,
+ RGWBucketInfo& bucket_info, rgw::sal::Object* obj,
+ off_t ofs, off_t end, uint64_t max_chunk_size,
+ iterate_obj_cb cb, void *arg, optional_yield y)
+{
+ rgw_raw_obj head_obj;
+ rgw_raw_obj read_obj;
+ uint64_t read_ofs = ofs;
+ uint64_t len;
+ bool reading_from_head = true;
+ RGWObjState *astate = NULL;
+ RGWObjManifest *manifest = nullptr;
+
+ obj_to_raw(bucket_info.placement_rule, obj->get_obj(), &head_obj);
+
+ int r = get_obj_state(dpp, &obj_ctx, bucket_info, obj, &astate, &manifest, false, y);
+ if (r < 0) {
+ return r;
+ }
+
+ if (end < 0)
+ len = 0;
+ else
+ len = end - ofs + 1;
+
+ if (manifest) {
+ /* now get the relevant object stripe */
+ RGWObjManifest::obj_iterator iter = manifest->obj_find(dpp, ofs);
+
+ RGWObjManifest::obj_iterator obj_end = manifest->obj_end(dpp);
+
+ for (; iter != obj_end && ofs <= end; ++iter) {
+ off_t stripe_ofs = iter.get_stripe_ofs();
+ off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
+
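+ // issue reads of at most max_chunk_size until this stripe is consumed
+ // or the requested range ends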
+ while (ofs < next_stripe_ofs && ofs <= end) {
+ read_obj = iter.get_location().get_raw_obj(driver);
+ uint64_t read_len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
+ read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
+
+ if (read_len > max_chunk_size) {
+ read_len = max_chunk_size;
+ }
+
+ reading_from_head = (read_obj == head_obj);
+ r = cb(dpp, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
+ if (r < 0) {
+ return r;
+ }
+
+ len -= read_len;
+ ofs += read_len;
+ }
+ }
+ } else {
+ while (ofs <= end) {
+ read_obj = head_obj;
+ uint64_t read_len = std::min(len, max_chunk_size);
+
+ r = cb(dpp, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
+ if (r < 0) {
+ return r;
+ }
+
+ len -= read_len;
+ ofs += read_len;
+ }
+ }
+
+ return 0;
+}
+
+int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
+{
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, null_yield);
+}
+
+int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
+{
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ bufferlist outbl;
+
+ return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, &outbl, null_yield);
+}
+
+int RGWRados::olh_init_modification_impl(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
+{
+ ObjectWriteOperation op;
+
+ ceph_assert(olh_obj.key.instance.empty());
+
+ bool has_tag = (state.exists && has_olh_tag(state.attrset));
+
+ if (!state.exists) {
+ op.create(true);
+ } else {
+ op.assert_exists();
+ struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
+ op.mtime2(&mtime_ts);
+ }
+
+ /*
+ * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
+ * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
+ * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
+ * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
+ * log will reflect that.
+ *
+ * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
+ * is used for object data instance, olh_tag for olh instance.
+ */
+ if (has_tag) {
+ /* guard against racing writes */
+ bucket_index_guard_olh_op(dpp, state, op);
+ }
+
+ if (!has_tag) {
+ /* obj tag */
+ string obj_tag = gen_rand_alphanumeric_lower(cct, 32);
+
+ bufferlist bl;
+ bl.append(obj_tag.c_str(), obj_tag.size());
+ op.setxattr(RGW_ATTR_ID_TAG, bl);
+
+ state.attrset[RGW_ATTR_ID_TAG] = bl;
+ state.obj_tag = bl;
+
+ /* olh tag */
+ string olh_tag = gen_rand_alphanumeric_lower(cct, 32);
+
+ bufferlist olh_bl;
+ olh_bl.append(olh_tag.c_str(), olh_tag.size());
+ op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
+
+ state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
+ state.olh_tag = olh_bl;
+ state.is_olh = true;
+
+ bufferlist verbl;
+ op.setxattr(RGW_ATTR_OLH_VER, verbl);
+ }
+
+ bufferlist bl;
+ RGWOLHPendingInfo pending_info;
+ pending_info.time = real_clock::now();
+ encode(pending_info, bl);
+
+#define OLH_PENDING_TAG_LEN 32
+ /* the tag starts with the current epoch time so that entries sort by time */
+ char buf[32];
+ utime_t ut(pending_info.time);
+ snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
+ *op_tag = buf;
+
+ string s = gen_rand_alphanumeric_lower(cct, OLH_PENDING_TAG_LEN - op_tag->size());
+
+ op_tag->append(s);
+
+ string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
+ attr_name.append(*op_tag);
+
+ op.setxattr(attr_name.c_str(), bl);
+
+ int ret = obj_operate(dpp, bucket_info, olh_obj, &op);
+ if (ret < 0) {
+ return ret;
+ }
+
+ state.exists = true;
+ state.attrset[attr_name] = bl;
+
+ return 0;
+}
+
+int RGWRados::olh_init_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
+{
+ int ret;
+
+ ret = olh_init_modification_impl(dpp, bucket_info, state, obj, op_tag);
+ if (ret == -EEXIST) {
+ ret = -ECANCELED;
+ }
+
+ return ret;
+}
+
+int RGWRados::guard_reshard(const DoutPrefixProvider *dpp,
+ BucketShard *bs,
+ const rgw_obj& obj_instance,
+ RGWBucketInfo& bucket_info,
+ std::function<int(BucketShard *)> call)
+{
+ rgw_obj obj;
+ const rgw_obj *pobj = &obj_instance;
+ int r;
+
+ for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
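+ // re-init the bucket shard on every attempt, since a completed
+ // reshard may have changed the index shard layout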
+ r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 5) << "bs.init() returned ret=" << r << dendl;
+ return r;
+ }
+
+ r = call(bs);
+ if (r != -ERR_BUSY_RESHARDING) {
+ break;
+ }
+
+ ldpp_dout(dpp, 10) <<
+ "NOTICE: resharding operation on bucket index detected, blocking. obj=" <<
+ obj_instance.key << dendl;
+
+ r = block_while_resharding(bs, obj_instance, bucket_info, null_yield, dpp);
+ if (r == -ERR_BUSY_RESHARDING) {
+ ldpp_dout(dpp, 10) << __func__ <<
+ " NOTICE: block_while_resharding() still busy. obj=" <<
+ obj_instance.key << dendl;
+ continue;
+ } else if (r < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: block_while_resharding() failed. obj=" <<
+ obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ ldpp_dout(dpp, 20) << "reshard completion identified" << dendl;
+ i = 0; /* resharding is finished, make sure we can retry */
+ } // for loop
+
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: bucket shard callback failed. obj=" <<
+ obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+
+int RGWRados::block_while_resharding(RGWRados::BucketShard *bs,
+ const rgw_obj& obj_instance,
+ RGWBucketInfo& bucket_info,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ int ret = 0;
+ cls_rgw_bucket_instance_entry entry;
+
+ // gets loaded by fetch_new_bucket_info; can be used by
+ // clear_resharding
+ std::map<std::string, bufferlist> bucket_attrs;
+
+ // since we want to run this recovery code from two distinct places,
+ // let's just put it in a lambda so we can easily re-use it; the lambda
+ // refreshes bucket_info and bucket_attrs and re-initializes the bucket
+ // shard, returning 0 on success or a negative error code on failure
+ auto fetch_new_bucket_info =
+ [this, bs, &obj_instance, &bucket_info, &bucket_attrs, &y, dpp](const std::string& log_tag) -> int {
+ int ret = get_bucket_info(&svc, bs->bucket.tenant, bs->bucket.name,
+ bucket_info, nullptr, y, dpp, &bucket_attrs);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: failed to refresh bucket info after reshard at " <<
+ log_tag << ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ ret = bs->init(dpp, bucket_info, obj_instance);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: failed to refresh bucket shard generation after reshard at " <<
+ log_tag << ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ const auto gen = bucket_info.layout.logs.empty() ? -1 : bucket_info.layout.logs.back().gen;
+ ldpp_dout(dpp, 20) << __func__ <<
+ " INFO: refreshed bucket info after reshard at " <<
+ log_tag << ". new shard_id=" << bs->shard_id << ". gen=" << gen << dendl;
+
+ return 0;
+ }; // lambda fetch_new_bucket_info
+
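+ // poll the bucket index shard until its reshard flag clears; if the
+ // flag appears stale, try to take the reshard lock and clear it below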
+ constexpr int num_retries = 10;
+ for (int i = 1; i <= num_retries; i++) { // nb: 1-based for loop
+ auto& ref = bs->bucket_obj.get_ref();
+ ret = cls_rgw_get_bucket_resharding(ref.pool.ioctx(), ref.obj.oid, &entry);
+ if (ret == -ENOENT) {
+ ret = fetch_new_bucket_info("get_bucket_resharding_failed");
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ " failed to refresh bucket info after reshard when get bucket "
+ "resharding failed, error: " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ } else if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: failed to get bucket resharding : " << cpp_strerror(-ret) <<
+ dendl;
+ return ret;
+ }
+
+ if (!entry.resharding_in_progress()) {
+ ret = fetch_new_bucket_info("get_bucket_resharding_succeeded");
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ " failed to refresh bucket info after reshard when get bucket "
+ "resharding succeeded, error: " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ return 0; // resharding has completed and the bucket info is current
+ }
+
+ ldpp_dout(dpp, 20) << __func__ << " NOTICE: reshard still in progress; " <<
+ (i < num_retries ? "retrying" : "too many retries") << dendl;
+
+ if (i == num_retries) {
+ break;
+ }
+
+ // If bucket is erroneously marked as resharding (e.g., crash or
+ // other error) then fix it. If we can take the bucket reshard
+ // lock then it means no other resharding should be taking place,
+ // and we're free to clear the flags.
+ {
+ // since we expect to do this rarely, we'll do our work in a
+ // block and erase our work after each try
+
+ const rgw_bucket& b = bs->bucket;
+ std::string bucket_id = b.get_key();
+ RGWBucketReshardLock reshard_lock(this->driver, bucket_info, true);
+ ret = reshard_lock.lock(dpp);
+ if (ret == -ENOENT) {
+ continue;
+ } else if (ret < 0) {
+ ldpp_dout(dpp, 20) << __func__ <<
+ " ERROR: failed to take reshard lock for bucket " <<
+ bucket_id << "; expected if resharding underway" << dendl;
+ } else {
+ ldpp_dout(dpp, 10) << __func__ <<
+ " INFO: was able to take reshard lock for bucket " <<
+ bucket_id << dendl;
+ // the reshard may have finished, so call clear_resharding()
+ // with its current bucket info; ALSO this will load
+ // bucket_attrs for call to clear_resharding below
+ ret = fetch_new_bucket_info("trying_to_clear_resharding");
+ if (ret < 0) {
+ reshard_lock.unlock();
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: failed to update bucket info before clear resharding for bucket " <<
+ bucket_id << dendl;
+ continue; // try again
+ }
+
+ ret = RGWBucketReshard::clear_resharding(this->driver, bucket_info, bucket_attrs, dpp);
+ reshard_lock.unlock();
+ if (ret == -ENOENT) {
+ ldpp_dout(dpp, 5) << __func__ <<
+ " INFO: no need to reset reshard flags; old shards apparently"
+ " removed after successful resharding of bucket " <<
+ bucket_id << dendl;
+ continue; // immediately test again
+ } else if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: failed to clear resharding flags for bucket " <<
+ bucket_id << ", " << cpp_strerror(-ret) << dendl;
+ // wait and then test again
+ } else {
+ ldpp_dout(dpp, 5) << __func__ <<
+ " INFO: apparently successfully cleared resharding flags for "
+ "bucket " << bucket_id << dendl;
+ continue; // if we apparently succeed immediately test again
+ } // if clear resharding succeeded
+ } // if taking of lock succeeded
+ } // block to encapsulate recovery from incomplete reshard
+
+ ret = reshard_wait->wait(y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: failed while waiting for resharding to complete, ret=" << ret << dendl;
+ return ret;
+ }
+ } // for loop
+
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: bucket is still resharding, please retry" << dendl;
+ return -ERR_BUSY_RESHARDING;
+}
+
+int RGWRados::bucket_index_link_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
+ RGWObjState& olh_state, const rgw_obj& obj_instance,
+ bool delete_marker, const string& op_tag,
+ struct rgw_bucket_dir_entry_meta *meta,
+ uint64_t olh_epoch,
+ real_time unmod_since, bool high_precision_time,
+ rgw_zone_set *_zones_trace, bool log_data_change)
+{
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ rgw_zone_set zones_trace;
+ if (_zones_trace) {
+ zones_trace = *_zones_trace;
+ }
+ zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());
+
+ BucketShard bs(this);
+
+ r = guard_reshard(dpp, &bs, obj_instance, bucket_info,
+ [&](BucketShard *bs) -> int {
+ cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
+ auto& ref = bs->bucket_obj.get_ref();
+ librados::ObjectWriteOperation op;
+ op.assert_exists(); // bucket index shard must exist
+ cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+ cls_rgw_bucket_link_olh(op, key, olh_state.olh_tag,
+ delete_marker, op_tag, meta, olh_epoch,
+ unmod_since, high_precision_time,
+ svc.zone->get_zone().log_data, zones_trace);
+ return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ });
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_olh() returned r=" << r << dendl;
+ return r;
+ }
+
+ add_datalog_entry(dpp, svc.datalog_rados, bucket_info, bs.shard_id);
+
+ return 0;
+}
+
+void RGWRados::bucket_index_guard_olh_op(const DoutPrefixProvider *dpp, RGWObjState& olh_state, ObjectOperation& op)
+{
+ ldpp_dout(dpp, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
+ op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
+}
+
+int RGWRados::bucket_index_unlink_instance(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const rgw_obj& obj_instance,
+ const string& op_tag, const string& olh_tag,
+ uint64_t olh_epoch, rgw_zone_set *_zones_trace)
+{
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ rgw_zone_set zones_trace;
+ if (_zones_trace) {
+ zones_trace = *_zones_trace;
+ }
+ zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());
+
+ BucketShard bs(this);
+
+ cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
+ r = guard_reshard(dpp, &bs, obj_instance, bucket_info,
+ [&](BucketShard *bs) -> int {
+ auto& ref = bs->bucket_obj.get_ref();
+ librados::ObjectWriteOperation op;
+ op.assert_exists(); // bucket index shard must exist
+ cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+ cls_rgw_bucket_unlink_instance(op, key, op_tag,
+ olh_tag, olh_epoch, svc.zone->get_zone().log_data, zones_trace);
+ return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ });
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_instance() returned r=" << r << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+int RGWRados::bucket_index_read_olh_log(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info, RGWObjState& state,
+ const rgw_obj& obj_instance, uint64_t ver_marker,
+ std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> > *log,
+ bool *is_truncated)
+{
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ BucketShard bs(this);
+ int ret =
+ bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
+
+ cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
+
+ auto& shard_ref = bs.bucket_obj.get_ref();
+ ObjectReadOperation op;
+
+ rgw_cls_read_olh_log_ret log_ret;
+ int op_ret = 0;
+ cls_rgw_get_olh_log(op, key, ver_marker, olh_tag, log_ret, op_ret);
+ bufferlist outbl;
+ r = rgw_rados_operate(dpp, shard_ref.pool.ioctx(), shard_ref.obj.oid, &op, &outbl, null_yield);
+ if (r < 0) {
+ return r;
+ }
+ if (op_ret < 0) {
+ ldpp_dout(dpp, 20) << "cls_rgw_get_olh_log() returned op_ret=" << op_ret << dendl;
+ return op_ret;
+ }
+
+ *log = std::move(log_ret.log);
+ *is_truncated = log_ret.is_truncated;
+
+ return 0;
+}
+
+// a multisite sync bug resulted in the OLH head attributes being overwritten by
+// the attributes from another zone, causing link_olh() to fail endlessly due to
+// olh_tag mismatch. this attempts to detect this case and reconstruct the OLH
+// attributes from the bucket index. see http://tracker.ceph.com/issues/37792
+int RGWRados::repair_olh(const DoutPrefixProvider *dpp, RGWObjState* state, const RGWBucketInfo& bucket_info,
+ const rgw_obj& obj)
+{
+ // fetch the current olh entry from the bucket index
+ rgw_bucket_olh_entry olh;
+ int r = bi_get_olh(dpp, bucket_info, obj, &olh);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "repair_olh failed to read olh entry for " << obj << dendl;
+ return r;
+ }
+ if (olh.tag == rgw_bl_str(state->olh_tag)) { // mismatch already resolved?
+ return 0;
+ }
+
+ ldpp_dout(dpp, 4) << "repair_olh setting olh_tag=" << olh.tag
+ << " key=" << olh.key << " delete_marker=" << olh.delete_marker << dendl;
+
+ // rewrite OLH_ID_TAG and OLH_INFO from current olh
+ ObjectWriteOperation op;
+ // assert this is the same olh tag we think we're fixing
+ bucket_index_guard_olh_op(dpp, *state, op);
+ // preserve existing mtime
+ struct timespec mtime_ts = ceph::real_clock::to_timespec(state->mtime);
+ op.mtime2(&mtime_ts);
+ {
+ bufferlist bl;
+ bl.append(olh.tag.c_str(), olh.tag.size());
+ op.setxattr(RGW_ATTR_OLH_ID_TAG, bl);
+ }
+ {
+ RGWOLHInfo info;
+ info.target = rgw_obj(bucket_info.bucket, olh.key);
+ info.removed = olh.delete_marker;
+ bufferlist bl;
+ encode(info, bl);
+ op.setxattr(RGW_ATTR_OLH_INFO, bl);
+ }
+ rgw_rados_ref ref;
+ r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+ r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "repair_olh failed to write olh attributes with "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+}
+
+int RGWRados::bucket_index_trim_olh_log(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ RGWObjState& state,
+ const rgw_obj& obj_instance, uint64_t ver)
+{
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ BucketShard bs(this);
+ int ret =
+ bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
+
+ cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
+
+ ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
+ [&](BucketShard *pbs) -> int {
+ ObjectWriteOperation op;
+ op.assert_exists(); // bucket index shard must exist
+ cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+ cls_rgw_trim_olh_log(op, key, ver, olh_tag);
+ return pbs->bucket_obj.operate(dpp, &op, null_yield);
+ });
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWRados::bucket_index_clear_olh(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ RGWObjState& state,
+ const rgw_obj& obj_instance)
+{
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ BucketShard bs(this);
+
+ string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
+
+ cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
+
+ int ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
+ [&](BucketShard *pbs) -> int {
+ ObjectWriteOperation op;
+ op.assert_exists(); // bucket index shard must exist
+ auto& ref = pbs->bucket_obj.get_ref();
+ cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+ cls_rgw_clear_olh(op, key, olh_tag);
+ return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ });
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "rgw_rados_operate() after cls_rgw_clear_olh() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+static int decode_olh_info(const DoutPrefixProvider *dpp, CephContext* cct, const bufferlist& bl, RGWOLHInfo *olh)
+{
+ try {
+ auto biter = bl.cbegin();
+ decode(*olh, biter);
+ return 0;
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode olh info" << dendl;
+ return -EIO;
+ }
+}
+
+int RGWRados::apply_olh_log(const DoutPrefixProvider *dpp,
+ RGWObjState& state,
+ RGWBucketInfo& bucket_info,
+ const rgw::sal::Object* obj,
+ bufferlist& olh_tag,
+ std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> >& log,
+ uint64_t *plast_ver,
+ rgw_zone_set* zones_trace)
+{
+ if (log.empty()) {
+ return 0;
+ }
+
+ librados::ObjectWriteOperation op;
+
+ uint64_t last_ver = log.rbegin()->first;
+ *plast_ver = last_ver;
+
+ map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
+
+ op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
+ op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GTE, last_ver);
+
+ bufferlist ver_bl;
+ string last_ver_s = to_string(last_ver);
+ ver_bl.append(last_ver_s.c_str(), last_ver_s.size());
+ op.setxattr(RGW_ATTR_OLH_VER, ver_bl);
+
+ struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
+ op.mtime2(&mtime_ts);
+
+ bool need_to_link = false;
+ uint64_t link_epoch = 0;
+ cls_rgw_obj_key key;
+ bool delete_marker = false;
+ list<cls_rgw_obj_key> remove_instances;
+ bool need_to_remove = false;
+
+ // decode current epoch and instance
+ auto olh_ver = state.attrset.find(RGW_ATTR_OLH_VER);
+ if (olh_ver != state.attrset.end()) {
+ std::string str = olh_ver->second.to_str();
+ std::string err;
+ link_epoch = strict_strtoll(str.c_str(), 10, &err);
+ }
+ auto olh_info = state.attrset.find(RGW_ATTR_OLH_INFO);
+ if (olh_info != state.attrset.end()) {
+ RGWOLHInfo info;
+ int r = decode_olh_info(dpp, cct, olh_info->second, &info);
+ if (r < 0) {
+ return r;
+ }
+ info.target.key.get_index_key(&key);
+ delete_marker = info.removed;
+ }
+
+ for (iter = log.begin(); iter != log.end(); ++iter) {
+ vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
+ for (; viter != iter->second.end(); ++viter) {
+ rgw_bucket_olh_log_entry& entry = *viter;
+
+ ldpp_dout(dpp, 20) << "olh_log_entry: epoch=" << iter->first << " op=" << (int)entry.op
+ << " key=" << entry.key.name << "[" << entry.key.instance << "] "
+ << (entry.delete_marker ? "(delete)" : "") << dendl;
+ switch (entry.op) {
+ case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
+ remove_instances.push_back(entry.key);
+ break;
+ case CLS_RGW_OLH_OP_LINK_OLH:
+ // only overwrite a link of the same epoch if its key sorts before
+ if (link_epoch < iter->first || key.instance.empty() ||
+ key.instance > entry.key.instance) {
+ ldpp_dout(dpp, 20) << "apply_olh_log applying key=" << entry.key << " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
+ << " over current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
+ need_to_link = true;
+ need_to_remove = false;
+ key = entry.key;
+ delete_marker = entry.delete_marker;
+ } else {
+ ldpp_dout(dpp, 20) << "apply_olh skipping key=" << entry.key<< " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
+ << " before current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
+ }
+ break;
+ case CLS_RGW_OLH_OP_UNLINK_OLH:
+ need_to_remove = true;
+ need_to_link = false;
+ break;
+ default:
+ ldpp_dout(dpp, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
+ return -EIO;
+ }
+ string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
+ attr_name.append(entry.op_tag);
+ op.rmxattr(attr_name.c_str());
+ }
+ }
+
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, obj->get_obj(), &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ rgw::sal::Bucket* bucket = obj->get_bucket();
+
+ if (need_to_link) {
+ rgw_obj target(bucket->get_key(), key);
+ RGWOLHInfo info;
+ info.target = target;
+ info.removed = delete_marker;
+ bufferlist bl;
+ encode(info, bl);
+ op.setxattr(RGW_ATTR_OLH_INFO, bl);
+ }
+
+ /* first remove object instances */
+ for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
+ liter != remove_instances.end(); ++liter) {
+ cls_rgw_obj_key& key = *liter;
+ std::unique_ptr<rgw::sal::Object> obj_instance = bucket->get_object(key);
+ int ret = delete_obj(dpp, bucket_info, obj_instance.get(), 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
+ return ret;
+ }
+ }
+
+ /* update olh object */
+ r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
+ return r;
+ }
+
+ r = bucket_index_trim_olh_log(dpp, bucket_info, state, obj->get_obj(), last_ver);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
+ return r;
+ }
+
+ if (need_to_remove) {
+ ObjectWriteOperation rm_op;
+
+ rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
+ rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_EQ, last_ver);
+ cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
+ rm_op.remove();
+
+ r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &rm_op, null_yield);
+ if (r == -ECANCELED) {
+ return 0; /* someone else won this race */
+ } else if (r < 0) {
+ return r;
+ }
+ /*
+ * only clear if the removal was successful, otherwise we might clobber pending operations on this object
+ */
+ r = bucket_index_clear_olh(dpp, bucket_info, state, obj->get_obj());
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * read olh log and apply it
+ */
+int RGWRados::update_olh(const DoutPrefixProvider *dpp, RGWObjState *state, RGWBucketInfo& bucket_info, const rgw::sal::Object* obj, rgw_zone_set *zones_trace)
+{
+ map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
+ bool is_truncated;
+ uint64_t ver_marker = 0;
+
+ do {
+ int ret = bucket_index_read_olh_log(dpp, bucket_info, *state, obj->get_obj(), ver_marker, &log, &is_truncated);
+ if (ret < 0) {
+ return ret;
+ }
+ ret = apply_olh_log(dpp, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
+ if (ret < 0) {
+ return ret;
+ }
+ } while (is_truncated);
+
+ return 0;
+}
+
+int RGWRados::set_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx,
+ RGWBucketInfo& bucket_info,
+ rgw::sal::Object* target_obj, bool delete_marker,
+ rgw_bucket_dir_entry_meta *meta,
+ uint64_t olh_epoch, real_time unmod_since, bool high_precision_time,
+ optional_yield y, rgw_zone_set *zones_trace, bool log_data_change)
+{
+ string op_tag;
+
+ std::unique_ptr<rgw::sal::Object> olh_obj = target_obj->clone();
+ olh_obj->clear_instance();
+
+ RGWObjState *state = NULL;
+ RGWObjManifest *manifest = nullptr;
+
+ int ret = 0;
+ int i;
+
+#define MAX_ECANCELED_RETRY 100
+ for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
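+ // -ECANCELED from a previous pass means we raced with another writer;
+ // invalidate the cached obj state and retry from scratch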
+ if (ret == -ECANCELED) {
+ olh_obj->invalidate();
+ }
+
+ ret = get_obj_state(dpp, &obj_ctx, bucket_info, olh_obj.get(), &state, &manifest, false, y); /* don't follow olh */
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = olh_init_modification(dpp, bucket_info, *state, olh_obj->get_obj(), &op_tag);
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
+ if (ret == -ECANCELED) {
+ continue;
+ }
+ return ret;
+ }
+ ret = bucket_index_link_olh(dpp, bucket_info, *state, target_obj->get_obj(),
+ delete_marker, op_tag, meta, olh_epoch, unmod_since,
+ high_precision_time, zones_trace, log_data_change);
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
+ if (ret == -ECANCELED) {
+ // the bucket index rejected the link_olh() due to olh tag mismatch;
+ // attempt to reconstruct olh head attributes based on the bucket index
+ int r2 = repair_olh(dpp, state, bucket_info, olh_obj->get_obj());
+ if (r2 < 0 && r2 != -ECANCELED) {
+ return r2;
+ }
+ continue;
+ }
+ return ret;
+ }
+ break;
+ }
+
+ if (i == MAX_ECANCELED_RETRY) {
+ ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
+ return -EIO;
+ }
+
+ ret = update_olh(dpp, state, bucket_info, olh_obj.get());
+ if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
+ ret = 0;
+ }
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWRados::unlink_obj_instance(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, rgw::sal::Object* target_obj,
+ uint64_t olh_epoch, optional_yield y, rgw_zone_set *zones_trace)
+{
+ string op_tag;
+
+ std::unique_ptr<rgw::sal::Object> olh_obj = target_obj->clone();
+ olh_obj->clear_instance();
+
+ RGWObjState *state = NULL;
+
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
+ if (ret == -ECANCELED) {
+ olh_obj->invalidate();
+ }
+
+ ret = olh_obj->get_obj_state(dpp, &state, y, false); /* don't follow olh */
+ if (ret < 0)
+ return ret;
+
+ ret = olh_init_modification(dpp, bucket_info, *state, olh_obj->get_obj(), &op_tag);
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
+ if (ret == -ECANCELED) {
+ continue;
+ }
+ return ret;
+ }
+
+ string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
+
+ ret = bucket_index_unlink_instance(dpp, bucket_info, target_obj->get_obj(), op_tag, olh_tag, olh_epoch, zones_trace);
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
+ if (ret == -ECANCELED) {
+ continue;
+ }
+ return ret;
+ }
+ break;
+ }
+
+ if (i == MAX_ECANCELED_RETRY) {
+ ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
+ return -EIO;
+ }
+
+ ret = update_olh(dpp, state, bucket_info, olh_obj.get(), zones_trace);
+ if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
+ return 0;
+ }
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+void RGWRados::gen_rand_obj_instance_name(rgw_obj_key *target_key)
+{
+#define OBJ_INSTANCE_LEN 32
+ char buf[OBJ_INSTANCE_LEN + 1];
+
+ gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
+ no underscore for instance name due to the way we encode the raw keys */
+
+ target_key->set_instance(buf);
+}
+
+void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
+{
+ gen_rand_obj_instance_name(&target_obj->key);
+}
+
+int RGWRados::get_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
+{
+ map<string, bufferlist> attrset;
+
+ ObjectReadOperation op;
+ op.getxattrs(&attrset, NULL);
+
+ int r = obj_operate(dpp, bucket_info, obj, &op);
+ if (r < 0) {
+ return r;
+ }
+
+ auto iter = attrset.find(RGW_ATTR_OLH_INFO);
+ if (iter == attrset.end()) { /* not an olh */
+ return -EINVAL;
+ }
+
+ return decode_olh_info(dpp, cct, iter->second, olh);
+}
+
+void RGWRados::check_pending_olh_entries(const DoutPrefixProvider *dpp,
+ map<string, bufferlist>& pending_entries,
+ map<string, bufferlist> *rm_pending_entries)
+{
+ map<string, bufferlist>::iterator iter = pending_entries.begin();
+
+ real_time now = real_clock::now();
+
+ while (iter != pending_entries.end()) {
+ auto biter = iter->second.cbegin();
+ RGWOLHPendingInfo pending_info;
+ try {
+ decode(pending_info, biter);
+ } catch (buffer::error& err) {
+ /* skipping bad entry, we could remove it but it might hide a bug */
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
+ ++iter;
+ continue;
+ }
+
+ map<string, bufferlist>::iterator cur_iter = iter;
+ ++iter;
+ if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
+ (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
+ pending_entries.erase(cur_iter);
+ } else {
+ /* entry names are sorted by time (rounded to a second) */
+ break;
+ }
+ }
+}
+
+int RGWRados::remove_olh_pending_entries(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
+{
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, olh_obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ // trim no more than 1000 entries per osd op
+ constexpr int max_entries = 1000;
+
+ auto i = pending_attrs.begin();
+ while (i != pending_attrs.end()) {
+ ObjectWriteOperation op;
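+ // re-assert the olh tag on each batch so the op fails with -ECANCELED
+ // if another writer has replaced the olh underneath us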
+ bucket_index_guard_olh_op(dpp, state, op);
+
+ for (int n = 0; n < max_entries && i != pending_attrs.end(); ++n, ++i) {
+ op.rmxattr(i->first.c_str());
+ }
+
+ r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ if (r == -ENOENT || r == -ECANCELED) {
+ /* raced with some other change, shouldn't sweat about it */
+ return 0;
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
+ return r;
+ }
+ }
+ return 0;
+}
+
+int RGWRados::follow_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, rgw::sal::Object* olh_obj, rgw_obj *target)
+{
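+ // collect the RGW_ATTR_OLH_PENDING_* xattrs: timed-out entries are
+ // removed outright, while live ones mean the olh log still needs to be
+ // applied before the olh target can be trusted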
+ map<string, bufferlist> pending_entries;
+ rgw_filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
+
+ map<string, bufferlist> rm_pending_entries;
+ check_pending_olh_entries(dpp, pending_entries, &rm_pending_entries);
+
+ if (!rm_pending_entries.empty()) {
+ int ret = remove_olh_pending_entries(dpp, bucket_info, *state, olh_obj->get_obj(), rm_pending_entries);
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
+ return ret;
+ }
+ }
+ if (!pending_entries.empty()) {
+ ldpp_dout(dpp, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj->get_bucket() << dendl;
+
+ int ret = update_olh(dpp, state, bucket_info, olh_obj);
+ if (ret < 0) {
+ if (ret == -ECANCELED) {
+ // In this context, ECANCELED means that the OLH tag changed in either the bucket index entry or the OLH object.
+ // If the OLH tag changed, it indicates that a previous OLH entry was removed since this request started. We
+ // return ENOENT to indicate that the OLH object was removed.
+ ret = -ENOENT;
+ }
+ return ret;
+ }
+ }
+
+ auto iter = state->attrset.find(RGW_ATTR_OLH_INFO);
+ if (iter == state->attrset.end()) {
+ return -EINVAL;
+ }
+
+ RGWOLHInfo olh;
+ int ret = decode_olh_info(dpp, cct, iter->second, &olh);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (olh.removed) {
+ return -ENOENT;
+ }
+
+ *target = olh.target;
+
+ return 0;
+}
+
+int RGWRados::raw_obj_stat(const DoutPrefixProvider *dpp,
+ rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
+ map<string, bufferlist> *attrs, bufferlist *first_chunk,
+ RGWObjVersionTracker *objv_tracker, optional_yield y)
+{
+ rgw_rados_ref ref;
+ int r = get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ map<string, bufferlist> unfiltered_attrset;
+ uint64_t size = 0;
+ struct timespec mtime_ts;
+
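+ // bundle whichever of version check, xattrs, stat and first chunk were
+ // requested into one compound read op, so a single osd round trip
+ // answers them all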
+ ObjectReadOperation op;
+ if (objv_tracker) {
+ objv_tracker->prepare_op_for_read(&op);
+ }
+ if (attrs) {
+ op.getxattrs(&unfiltered_attrset, NULL);
+ }
+ if (psize || pmtime) {
+ op.stat2(&size, &mtime_ts, NULL);
+ }
+ if (first_chunk) {
+ op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
+ }
+ bufferlist outbl;
+ r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, &outbl, y);
+
+ if (epoch) {
+ *epoch = ref.pool.ioctx().get_last_version();
+ }
+
+ if (r < 0)
+ return r;
+
+ if (psize)
+ *psize = size;
+ if (pmtime)
+ *pmtime = ceph::real_clock::from_timespec(mtime_ts);
+ if (attrs) {
+ rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
+ }
+
+ return 0;
+}
+
+int RGWRados::get_bucket_stats(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& idx_layout,
+ int shard_id, string *bucket_ver, string *master_ver,
+ map<RGWObjCategory, RGWStorageStats>& stats,
+ string *max_marker, bool *syncstopped)
+{
+ vector<rgw_bucket_dir_header> headers;
+ map<int, string> bucket_instance_ids;
+ int r = cls_bucket_head(dpp, bucket_info, idx_layout, shard_id, headers, &bucket_instance_ids);
+ if (r < 0) {
+ return r;
+ }
+
+ ceph_assert(headers.size() == bucket_instance_ids.size());
+
+ auto iter = headers.begin();
+ map<int, string>::iterator viter = bucket_instance_ids.begin();
+ BucketIndexShardsManager ver_mgr;
+ BucketIndexShardsManager master_ver_mgr;
+ BucketIndexShardsManager marker_mgr;
+ char buf[64];
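+ // accumulate stats across shard headers, and gather each shard's
+ // version and max marker into composite per-shard strings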
+ for(; iter != headers.end(); ++iter, ++viter) {
+ accumulate_raw_stats(*iter, stats);
+ snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->ver);
+ ver_mgr.add(viter->first, string(buf));
+ snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->master_ver);
+ master_ver_mgr.add(viter->first, string(buf));
+ if (shard_id >= 0) {
+ *max_marker = iter->max_marker;
+ } else {
+ marker_mgr.add(viter->first, iter->max_marker);
+ }
+ if (syncstopped != NULL)
+ *syncstopped = iter->syncstopped;
+ }
+ ver_mgr.to_string(bucket_ver);
+ master_ver_mgr.to_string(master_ver);
+ if (shard_id < 0) {
+ marker_mgr.to_string(max_marker);
+ }
+ return 0;
+}
+
+class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
+ RGWGetBucketStats_CB *cb;
+ uint32_t pendings;
+ map<RGWObjCategory, RGWStorageStats> stats;
+ int ret_code;
+ bool should_cb;
+ ceph::mutex lock = ceph::make_mutex("RGWGetBucketStatsContext");
+
+public:
+ RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
+ : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true)
+ {}
+
+ void handle_response(int r, rgw_bucket_dir_header& header) override {
+ std::lock_guard l{lock};
+ if (should_cb) {
+ if (r >= 0) {
+ accumulate_raw_stats(header, stats);
+ } else {
+ ret_code = r;
+ }
+
+ // Are we all done?
+ if (--pendings == 0) {
+ if (!ret_code) {
+ cb->set_response(&stats);
+ }
+ cb->handle_response(ret_code);
+ cb->put();
+ }
+ }
+ }
+
+ void unset_cb() {
+ std::lock_guard l{lock};
+ should_cb = false;
+ }
+};
+
+int RGWRados::get_bucket_stats_async(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetBucketStats_CB *ctx)
+{
+ int num_aio = 0;
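+ // an unsharded index reports num_shards == 0, so expect at least one
+ // dir header response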
+ RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.layout.current_index.layout.normal.num_shards ? : 1);
+ ceph_assert(get_ctx);
+ int r = cls_bucket_head_async(dpp, bucket_info, idx_layout, shard_id, get_ctx, &num_aio);
+ if (r < 0) {
+ ctx->put();
+ if (num_aio) {
+ get_ctx->unset_cb();
+ }
+ }
+ get_ctx->put();
+ return r;
+}
+
+int RGWRados::get_bucket_instance_info(const string& meta_key,
+ RGWBucketInfo& info,
+ real_time *pmtime,
+ map<string, bufferlist> *pattrs,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ rgw_bucket bucket;
+ rgw_bucket_parse_bucket_key(cct, meta_key, &bucket, nullptr);
+
+ return get_bucket_instance_info(bucket, info, pmtime, pattrs, y, dpp);
+}
+
+int RGWRados::get_bucket_instance_info(const rgw_bucket& bucket, RGWBucketInfo& info,
+ real_time *pmtime, map<string, bufferlist> *pattrs, optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ return ctl.bucket->read_bucket_instance_info(bucket, &info,
+ y,
+ dpp,
+ RGWBucketCtl::BucketInstance::GetParams()
+ .set_mtime(pmtime)
+ .set_attrs(pattrs));
+}
+
+int RGWRados::get_bucket_info(RGWServices *svc,
+ const string& tenant, const string& bucket_name,
+ RGWBucketInfo& info,
+ real_time *pmtime,
+ optional_yield y,
+ const DoutPrefixProvider *dpp, map<string, bufferlist> *pattrs)
+{
+ rgw_bucket bucket;
+ bucket.tenant = tenant;
+ bucket.name = bucket_name;
+ return ctl.bucket->read_bucket_info(bucket, &info, y, dpp,
+ RGWBucketCtl::BucketInstance::GetParams()
+ .set_mtime(pmtime)
+ .set_attrs(pattrs));
+}
+
+int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info,
+ ceph::real_time *pmtime,
+ const DoutPrefixProvider *dpp,
+ map<string, bufferlist> *pattrs)
+{
+ rgw_bucket bucket = info.bucket;
+ bucket.bucket_id.clear();
+
+ auto rv = info.objv_tracker.read_version;
+
+ return ctl.bucket->read_bucket_info(bucket, &info, null_yield, dpp,
+ RGWBucketCtl::BucketInstance::GetParams()
+ .set_mtime(pmtime)
+ .set_attrs(pattrs)
+ .set_refresh_version(rv));
+}
+
+int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
+ real_time mtime, map<string, bufferlist> *pattrs,
+ const DoutPrefixProvider *dpp)
+{
+ return ctl.bucket->store_bucket_instance_info(info.bucket, info, null_yield, dpp,
+ RGWBucketCtl::BucketInstance::PutParams()
+ .set_exclusive(exclusive)
+ .set_mtime(mtime)
+ .set_attrs(pattrs));
+}
+
+int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
+ map<string, bufferlist> *pattrs, bool create_entry_point,
+ const DoutPrefixProvider *dpp)
+{
+ bool create_head = !info.has_instance_obj || create_entry_point;
+
+ int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (!create_head)
+ return 0; /* done! */
+
+ RGWBucketEntryPoint entry_point;
+ entry_point.bucket = info.bucket;
+ entry_point.owner = info.owner;
+ entry_point.creation_time = info.creation_time;
+ entry_point.linked = true;
+ RGWObjVersionTracker ot;
+ if (pep_objv && !pep_objv->tag.empty()) {
+ ot.write_version = *pep_objv;
+ } else {
+ ot.generate_new_write_ver(cct);
+ if (pep_objv) {
+ *pep_objv = ot.write_version;
+ }
+ }
+ ret = ctl.bucket->store_bucket_entrypoint_info(info.bucket, entry_point, null_yield, dpp, RGWBucketCtl::Bucket::PutParams()
+ .set_exclusive(exclusive)
+ .set_objv_tracker(&ot)
+ .set_mtime(mtime));
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m, const DoutPrefixProvider *dpp)
+{
+ map<string, RGWBucketEnt>::iterator iter;
+ for (iter = m.begin(); iter != m.end(); ++iter) {
+ RGWBucketEnt& ent = iter->second;
+ rgw_bucket& bucket = ent.bucket;
+ ent.count = 0;
+ ent.size = 0;
+ ent.size_rounded = 0;
+
+ vector<rgw_bucket_dir_header> headers;
+
+ RGWBucketInfo bucket_info;
+ int ret = get_bucket_instance_info(bucket, bucket_info, NULL, NULL, null_yield, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ int r = cls_bucket_head(dpp, bucket_info, bucket_info.layout.current_index, RGW_NO_SHARD, headers);
+ if (r < 0)
+ return r;
+
+ auto hiter = headers.begin();
+ for (; hiter != headers.end(); ++hiter) {
+ RGWObjCategory category = main_category;
+ auto iter = (hiter->stats).find(category);
+ if (iter != hiter->stats.end()) {
+ struct rgw_bucket_category_stats& stats = iter->second;
+ ent.count += stats.num_entries;
+ ent.size += stats.total_size;
+ ent.size_rounded += stats.total_size_rounded;
+ }
+ }
+
+ // fill in placement_rule from the bucket instance for use in swift's
+ // per-storage policy statistics
+ ent.placement_rule = std::move(bucket_info.placement_rule);
+ }
+
+ return m.size();
+}
+
+int RGWRados::append_async(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, size_t size, bufferlist& bl)
+{
+ rgw_rados_ref ref;
+ int r = get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+ librados::Rados *rad = get_rados_handle();
+ librados::AioCompletion *completion = rad->aio_create_completion(nullptr, nullptr);
+
+ r = ref.pool.ioctx().aio_append(ref.obj.oid, completion, bl, size);
+ completion->release();
+ return r;
+}
+
+int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, RGWPoolIterCtx& ctx)
+{
+ librados::IoCtx& io_ctx = ctx.io_ctx;
+ librados::NObjectIterator& iter = ctx.iter;
+
+ int r = open_pool_ctx(dpp, pool, io_ctx, false);
+ if (r < 0)
+ return r;
+
+ iter = io_ctx.nobjects_begin();
+
+ return 0;
+}
+
+int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
+{
+ librados::IoCtx& io_ctx = ctx.io_ctx;
+ librados::NObjectIterator& iter = ctx.iter;
+
+ int r = open_pool_ctx(dpp, pool, io_ctx, false);
+ if (r < 0)
+ return r;
+
+ librados::ObjectCursor oc;
+ if (!oc.from_str(cursor)) {
+ ldpp_dout(dpp, 10) << "failed to parse cursor: " << cursor << dendl;
+ return -EINVAL;
+ }
+
+ try {
+ iter = io_ctx.nobjects_begin(oc);
+ return 0;
+ } catch (const std::system_error& e) {
+ r = -e.code().value();
+ ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
+ << ", returning " << r << dendl;
+ return r;
+ } catch (const std::exception& e) {
+ ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
+ << ", returning -5" << dendl;
+ return -EIO;
+ }
+}
+
+string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
+{
+ return ctx.iter.get_cursor().to_str();
+}
+
+static int do_pool_iterate(const DoutPrefixProvider *dpp, CephContext* cct, RGWPoolIterCtx& ctx, uint32_t num,
+ vector<rgw_bucket_dir_entry>& objs,
+ bool *is_truncated, RGWAccessListFilter *filter)
+{
+ librados::IoCtx& io_ctx = ctx.io_ctx;
+ librados::NObjectIterator& iter = ctx.iter;
+
+ if (iter == io_ctx.nobjects_end())
+ return -ENOENT;
+
+ uint32_t i;
+
+ for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
+ rgw_bucket_dir_entry e;
+
+ string oid = iter->get_oid();
+ ldpp_dout(dpp, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
+
+ // fill it in with initial values; we may correct later
+ if (filter && !filter->filter(oid, oid))
+ continue;
+
+ e.key = oid;
+ objs.push_back(e);
+ }
+
+ if (is_truncated)
+ *is_truncated = (iter != io_ctx.nobjects_end());
+
+ return objs.size();
+}
+
+int RGWRados::pool_iterate(const DoutPrefixProvider *dpp, RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
+ bool *is_truncated, RGWAccessListFilter *filter)
+{
+ // catch exceptions from NObjectIterator::operator++()
+ try {
+ return do_pool_iterate(dpp, cct, ctx, num, objs, is_truncated, filter);
+ } catch (const std::system_error& e) {
+ int r = -e.code().value();
+ ldpp_dout(dpp, 10) << "NObjectIterator threw exception " << e.what()
+ << ", returning " << r << dendl;
+ return r;
+ } catch (const std::exception& e) {
+ ldpp_dout(dpp, 10) << "NObjectIterator threw exception " << e.what()
+ << ", returning -5" << dendl;
+ return -EIO;
+ }
+}
+
+int RGWRados::list_raw_objects_init(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
+{
+ if (!ctx->initialized) {
+ int r = pool_iterate_begin(dpp, pool, marker, ctx->iter_ctx);
+ if (r < 0) {
+ ldpp_dout(dpp, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
+ return r;
+ }
+ ctx->initialized = true;
+ }
+ return 0;
+}
+
+int RGWRados::list_raw_objects_next(const DoutPrefixProvider *dpp, const string& prefix_filter, int max,
+ RGWListRawObjsCtx& ctx, list<string>& oids,
+ bool *is_truncated)
+{
+ if (!ctx.initialized) {
+ return -EINVAL;
+ }
+ RGWAccessListFilterPrefix filter(prefix_filter);
+ vector<rgw_bucket_dir_entry> objs;
+ int r = pool_iterate(dpp, ctx.iter_ctx, max, objs, is_truncated, &filter);
+ if (r < 0) {
+ if (r != -ENOENT) {
+ ldpp_dout(dpp, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
+ }
+ return r;
+ }
+
+ vector<rgw_bucket_dir_entry>::iterator iter;
+ for (iter = objs.begin(); iter != objs.end(); ++iter) {
+ oids.push_back(iter->key.name);
+ }
+
+ return oids.size();
+}
+
+int RGWRados::list_raw_objects(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& prefix_filter,
+ int max, RGWListRawObjsCtx& ctx, list<string>& oids,
+ bool *is_truncated)
+{
+ if (!ctx.initialized) {
+ int r = list_raw_objects_init(dpp, pool, string(), &ctx);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ return list_raw_objects_next(dpp, prefix_filter, max, ctx, oids, is_truncated);
+}
+
+string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
+{
+ return pool_iterate_get_cursor(ctx.iter_ctx);
+}
+
+int RGWRados::bi_get_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
+ rgw_bucket_dir_entry *dirent)
+{
+ rgw_cls_bi_entry bi_entry;
+ int r = bi_get(dpp, bucket_info, obj, BIIndexType::Instance, &bi_entry);
+ if (r < 0 && r != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl;
+ }
+ if (r < 0) {
+ return r;
+ }
+ auto iter = bi_entry.data.cbegin();
+ try {
+ decode(*dirent, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl;
+ return -EIO;
+ }
+
+ return 0;
+}
+
+int RGWRados::bi_get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
+ rgw_bucket_olh_entry *olh)
+{
+ rgw_cls_bi_entry bi_entry;
+ int r = bi_get(dpp, bucket_info, obj, BIIndexType::OLH, &bi_entry);
+ if (r < 0 && r != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl;
+ }
+ if (r < 0) {
+ return r;
+ }
+ auto iter = bi_entry.data.cbegin();
+ try {
+ decode(*olh, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl;
+ return -EIO;
+ }
+
+ return 0;
+}
+
+int RGWRados::bi_get(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
+ BIIndexType index_type, rgw_cls_bi_entry *entry)
+{
+ BucketShard bs(this);
+ int ret = bs.init(dpp, bucket_info, obj);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
+
+ auto& ref = bs.bucket_obj.get_ref();
+
+ return cls_rgw_bi_get(ref.pool.ioctx(), ref.obj.oid, index_type, key, entry);
+}
+
+void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
+{
+ auto& ref = bs.bucket_obj.get_ref();
+ cls_rgw_bi_put(op, ref.obj.oid, entry);
+}
+
+int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
+{
+ auto& ref = bs.bucket_obj.get_ref();
+ int ret = cls_rgw_bi_put(ref.pool.ioctx(), ref.obj.oid, entry);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWRados::bi_put(const DoutPrefixProvider *dpp, rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
+{
+ // make sure incomplete multipart uploads are hashed correctly
+ if (obj.key.ns == RGW_OBJ_NS_MULTIPART) {
+ RGWMPObj mp;
+ mp.from_meta(obj.key.name);
+ obj.index_hash_source = mp.get_key();
+ }
+ BucketShard bs(this);
+
+ int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ return bi_put(bs, entry);
+}
+
+int RGWRados::bi_list(const DoutPrefixProvider *dpp, rgw_bucket& bucket,
+ const string& obj_name_filter, const string& marker, uint32_t max,
+ list<rgw_cls_bi_entry> *entries, bool *is_truncated)
+{
+ rgw_obj obj(bucket, obj_name_filter);
+ BucketShard bs(this);
+ int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ auto& ref = bs.bucket_obj.get_ref();
+ ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name_filter, marker, max, entries, is_truncated);
+ if (ret == -ENOENT) {
+ *is_truncated = false;
+ }
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWRados::bi_list(BucketShard& bs, const string& obj_name_filter, const string& marker, uint32_t max,
+ list<rgw_cls_bi_entry> *entries, bool *is_truncated)
+{
+ auto& ref = bs.bucket_obj.get_ref();
+ int ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name_filter, marker, max, entries, is_truncated);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWRados::bi_list(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info, int shard_id, const string& obj_name_filter, const string& marker, uint32_t max,
+ list<rgw_cls_bi_entry> *entries, bool *is_truncated)
+{
+ BucketShard bs(this);
+ int ret = bs.init(dpp, bucket_info,
+ bucket_info.layout.current_index,
+ shard_id);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ return bi_list(bs, obj_name_filter, marker, max, entries, is_truncated);
+}
+
+int RGWRados::bi_remove(const DoutPrefixProvider *dpp, BucketShard& bs)
+{
+ auto& ref = bs.bucket_obj.get_ref();
+ int ret = ref.pool.ioctx().remove(ref.obj.oid);
+ if (ret == -ENOENT) {
+ ret = 0;
+ }
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectWriteOperation *op)
+{
+ return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, null_yield);
+}
+
+int RGWRados::gc_aio_operate(const string& oid, librados::AioCompletion *c,
+ librados::ObjectWriteOperation *op)
+{
+ return gc_pool_ctx.aio_operate(oid, c, op);
+}
+
+int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
+{
+ return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, pbl, null_yield);
+}
+
+int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue)
+{
+ return gc->list(index, marker, max, expired_only, result, truncated, processing_queue);
+}
+
+int RGWRados::process_gc(bool expired_only)
+{
+ return gc->process(expired_only);
+}
+
+int RGWRados::list_lc_progress(string& marker, uint32_t max_entries,
+ vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& progress_map,
+ int& index)
+{
+ return lc->list_lc_progress(marker, max_entries, progress_map, index);
+}
+
+int RGWRados::process_lc(const std::unique_ptr<rgw::sal::Bucket>& optional_bucket)
+{
+ RGWLC lc;
+ lc.initialize(cct, this->driver);
+ RGWLC::LCWorker worker(&lc, cct, &lc, 0);
+ auto ret = lc.process(&worker, optional_bucket, true /* once */);
+ lc.stop_processor(); // sets down_flag, but returns immediately
+ return ret;
+}
+
+bool RGWRados::process_expire_objects(const DoutPrefixProvider *dpp)
+{
+ return obj_expirer->inspect_all_shards(dpp, utime_t(), ceph_clock_now());
+}
+
+int RGWRados::cls_obj_prepare_op(const DoutPrefixProvider *dpp, BucketShard& bs, RGWModifyOp op, string& tag,
+ rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *_zones_trace)
+{
+ const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
+ ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket-shard=" << bs << " obj=" << obj << " tag=" << tag << " op=" << op << dendl_bitx;
+ ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
+
+ rgw_zone_set zones_trace;
+ if (_zones_trace) {
+ zones_trace = *_zones_trace;
+ }
+ zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());
+
+ ObjectWriteOperation o;
+ o.assert_exists(); // bucket index shard must exist
+
+ cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
+ cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
+ cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), svc.zone->get_zone().log_data, bilog_flags, zones_trace);
+ int ret = bs.bucket_obj.operate(dpp, &o, y);
+ ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << ": ret=" << ret << dendl_bitx;
+ return ret;
+}
+
+int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
+ int64_t pool, uint64_t epoch,
+ rgw_bucket_dir_entry& ent, RGWObjCategory category,
+ list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
+{
+ const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
+ ldout_bitx_c(bitx, cct, 10) << "ENTERING " << __func__ << ": bucket-shard=" << bs <<
+ " obj=" << obj << " tag=" << tag << " op=" << op <<
+ ", remove_objs=" << (remove_objs ? *remove_objs : std::list<rgw_obj_index_key>()) << dendl_bitx;
+ ldout_bitx_c(bitx, cct, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
+
+ ObjectWriteOperation o;
+ o.assert_exists(); // bucket index shard must exist
+
+ rgw_bucket_dir_entry_meta dir_meta;
+ dir_meta = ent.meta;
+ dir_meta.category = category;
+
+ rgw_zone_set zones_trace;
+ if (_zones_trace) {
+ zones_trace = *_zones_trace;
+ }
+ zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());
+
+ rgw_bucket_entry_ver ver;
+ ver.pool = pool;
+ ver.epoch = epoch;
+ cls_rgw_obj_key key(ent.key.name, ent.key.instance);
+ cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
+ cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
+ svc.zone->get_zone().log_data, bilog_flags, &zones_trace);
+ complete_op_data *arg;
+ index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
+ svc.zone->get_zone().log_data, bilog_flags, &zones_trace, &arg);
+ librados::AioCompletion *completion = arg->rados_completion;
+ int ret = bs.bucket_obj.aio_operate(arg->rados_completion, &o);
+ completion->release(); /* can't reference arg here, as it might have already been released */
+
+ ldout_bitx_c(bitx, cct, 10) << "EXITING " << __func__ << ": ret=" << ret << dendl_bitx;
+ return ret;
+}
+
+int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
+ int64_t pool, uint64_t epoch,
+ rgw_bucket_dir_entry& ent, RGWObjCategory category,
+ list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
+{
+ return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
+}
+
+int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
+ int64_t pool, uint64_t epoch,
+ rgw_obj& obj,
+ real_time& removed_mtime,
+ list<rgw_obj_index_key> *remove_objs,
+ uint16_t bilog_flags,
+ rgw_zone_set *zones_trace)
+{
+ rgw_bucket_dir_entry ent;
+ ent.meta.mtime = removed_mtime;
+ obj.key.get_index_key(&ent.key);
+ return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch,
+ ent, RGWObjCategory::None, remove_objs,
+ bilog_flags, zones_trace);
+}
+
+int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj,
+ list<rgw_obj_index_key> *remove_objs,
+ uint16_t bilog_flags, rgw_zone_set *zones_trace)
+{
+ rgw_bucket_dir_entry ent;
+ obj.key.get_index_key(&ent.key);
+ return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag,
+ -1 /* pool id */, 0, ent,
+ RGWObjCategory::None, remove_objs, bilog_flags,
+ zones_trace);
+}
+
+int RGWRados::cls_obj_set_bucket_tag_timeout(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, uint64_t timeout)
+{
+ RGWSI_RADOS::Pool index_pool;
+ map<int, string> bucket_objs;
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
+ if (r < 0)
+ return r;
+
+ return CLSRGWIssueSetTagTimeout(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
+}
+
+
+// returns 0 if there is an error in calculation
+uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries,
+ uint32_t num_shards)
+{
+ if (num_shards == 0) {
+ // we'll get a floating point exception since we divide by
+ // num_shards
+ return 0;
+ }
+
+ // We want to minimize the chances that when num_shards >>
+ // num_entries that we return much fewer than num_entries to the
+ // client. Given all the overhead of making a cls call to the osd,
+ // returning a few entries is not much more work than returning one
+ // entry. This minimum might be better tuned based on future
+ // experiments where num_shards >> num_entries. (Note: ">>" should
+ // be interpreted as "much greater than".)
+ constexpr uint32_t min_read = 8;
+
+ // The following is based on _"Balls into Bins" -- A Simple and
+ // Tight Analysis_ by Raab and Steger. We add 1 as a way to handle
+ // cases when num_shards >> num_entries (it almost serves as a
+ // ceiling calculation). We also assume alpha is 1.0 and extract it
+ // from the calculation. Future work could involve memoizing some of
+ // the transcendental functions to minimize repeatedly re-calling
+ // them with the same parameters, which we expect to be the case the
+ // majority of the time.
+ uint32_t calc_read =
+ 1 +
+ static_cast<uint32_t>((num_entries / num_shards) +
+ sqrt((2 * num_entries) *
+ log(num_shards) / num_shards));
+
+ return std::max(min_read, calc_read);
+}
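+
+// illustrative check of the formula above (hypothetical numbers, not
+// from the source): with num_entries = 1000 and num_shards = 101,
+// integer math gives 1 + (1000/101) + sqrt(2*1000 * ln(101)/101)
+// ~= 1 + 9 + 9 = 19 entries requested per shard; with num_entries = 8
+// and num_shards = 1024 the formula yields 1, so the min_read floor
+// of 8 governs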
+
+
+int RGWRados::cls_bucket_list_ordered(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& idx_layout,
+ const int shard_id,
+ const rgw_obj_index_key& start_after,
+ const std::string& prefix,
+ const std::string& delimiter,
+ const uint32_t num_entries,
+ const bool list_versions,
+ const uint16_t expansion_factor,
+ ent_map_t& m,
+ bool* is_truncated,
+ bool* cls_filtered,
+ rgw_obj_index_key* last_entry,
+ optional_yield y,
+ RGWBucketListNameFilter force_check_filter)
+{
+ const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
+
+ /* expansion_factor allows the number of entries to read to grow
+ * exponentially; this is used when earlier reads are producing too
+ * few results, perhaps due to filtering or to a series of
+ * namespaced entries */
+
+ ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": " << bucket_info.bucket <<
+ " start_after=\"" << start_after.to_string() <<
+ "\", prefix=\"" << prefix <<
+ ", delimiter=\"" << delimiter <<
+ "\", shard_id=" << shard_id <<
+ "\", num_entries=" << num_entries <<
+ ", shard_id=" << shard_id <<
+ ", list_versions=" << list_versions <<
+ ", expansion_factor=" << expansion_factor <<
+ ", force_check_filter is " <<
+ (force_check_filter ? "set" : "unset") << dendl_bitx;
+ ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
+
+ m.clear();
+
+ RGWSI_RADOS::Pool index_pool;
+ // key - oid (for each shard, if there are any)
+ // value - list result for the corresponding oid (shard); it is filled by
+ // the AIO callback
+ std::map<int, std::string> shard_oids;
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout,
+ &index_pool, &shard_oids,
+ nullptr);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ ": open_bucket_index for " << bucket_info.bucket << " failed" << dendl;
+ return r;
+ }
+
+ const uint32_t shard_count = shard_oids.size();
+ if (shard_count == 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ ": the bucket index shard count appears to be 0, "
+ "which is an illegal value" << dendl;
+ return -ERR_INVALID_BUCKET_STATE;
+ }
+
+ uint32_t num_entries_per_shard;
+ if (expansion_factor == 0) {
+ num_entries_per_shard =
+ calc_ordered_bucket_list_per_shard(num_entries, shard_count);
+ } else if (expansion_factor <= 11) {
+ // we'll max out the exponential multiplication factor at 1024 (1 << 10)
+ num_entries_per_shard =
+ std::min(num_entries,
+ (uint32_t(1 << (expansion_factor - 1)) *
+ calc_ordered_bucket_list_per_shard(num_entries, shard_count)));
+ } else {
+ num_entries_per_shard = num_entries;
+ }
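+ // with this scheme each successive expansion_factor roughly doubles
+ // the per-shard read (1x, 2x, 4x, ... up to 1024x the base
+ // calculation), while never requesting more than num_entries itself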
+
+ if (num_entries_per_shard == 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ ": unable to calculate the number of entries to read from each "
+ "bucket index shard" << dendl;
+ return -ERR_INVALID_BUCKET_STATE;
+ }
+
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": request from each of " << shard_count <<
+ " shard(s) for " << num_entries_per_shard << " entries to get " <<
+ num_entries << " total entries" << dendl;
+
+ auto& ioctx = index_pool.ioctx();
+ std::map<int, rgw_cls_list_ret> shard_list_results;
+ cls_rgw_obj_key start_after_key(start_after.name, start_after.instance);
+ r = CLSRGWIssueBucketList(ioctx, start_after_key, prefix, delimiter,
+ num_entries_per_shard,
+ list_versions, shard_oids, shard_list_results,
+ cct->_conf->rgw_bucket_index_max_aio)();
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ ": CLSRGWIssueBucketList for " << bucket_info.bucket <<
+ " failed" << dendl;
+ return r;
+ }
+
+ // to manage the iterators through each shard's list results
+ struct ShardTracker {
+ const size_t shard_idx;
+ rgw_cls_list_ret& result;
+ const std::string& oid_name;
+ RGWRados::ent_map_t::iterator cursor;
+ RGWRados::ent_map_t::iterator end;
+
+ // manages an iterator through a shard and provides other
+ // accessors
+ ShardTracker(size_t _shard_idx,
+ rgw_cls_list_ret& _result,
+ const std::string& _oid_name):
+ shard_idx(_shard_idx),
+ result(_result),
+ oid_name(_oid_name),
+ cursor(_result.dir.m.begin()),
+ end(_result.dir.m.end())
+ {}
+
+ inline const std::string& entry_name() const {
+ return cursor->first;
+ }
+ rgw_bucket_dir_entry& dir_entry() const {
+ return cursor->second;
+ }
+ inline bool is_truncated() const {
+ return result.is_truncated;
+ }
+ inline ShardTracker& advance() {
+ ++cursor;
+ // return a self-reference to allow for chaining of calls, such
+ // as x.advance().at_end()
+ return *this;
+ }
+ inline bool at_end() const {
+ return cursor == end;
+ }
+ }; // ShardTracker
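+
+ // what follows is effectively a k-way merge across the per-shard
+ // result lists: a multimap keyed on entry name serves as the
+ // priority queue, and equal_range allows the same name surfacing
+ // from multiple shards to be consumed together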
+
+ // add the next candidate from the given shard's results, if any remain
+ auto next_candidate = [] (CephContext *cct, ShardTracker& t,
+ std::multimap<std::string, size_t>& candidates,
+ size_t tracker_idx) {
+ if (!t.at_end()) {
+ candidates.emplace(t.entry_name(), tracker_idx);
+ }
+ return;
+ };
+
+ // one tracker per shard requested (may not be all shards)
+ std::vector<ShardTracker> results_trackers;
+ results_trackers.reserve(shard_list_results.size());
+ for (auto& r : shard_list_results) {
+ results_trackers.emplace_back(r.first, r.second, shard_oids[r.first]);
+
+ // if any *one* shard's result is truncated, the entire result is
+ // truncated
+ *is_truncated = *is_truncated || r.second.is_truncated;
+
+ // unless *all* shards are cls_filtered, the entire result is
+ // not filtered
+ *cls_filtered = *cls_filtered && r.second.cls_filtered;
+ }
+
+ // create a map to track the next candidate entry from ShardTracker
+ // (key=candidate, value=index into results_trackers); as we consume
+ // entries from shards, we replace them with the next entries in the
+ // shards until we run out
+ std::multimap<std::string, size_t> candidates;
+ size_t tracker_idx = 0;
+ std::vector<size_t> vidx;
+ vidx.reserve(shard_list_results.size());
+ for (auto& t : results_trackers) {
+ // it's important that the values in the map refer to the index
+ // into the results_trackers vector, which may not be the same
+ // as the shard number (i.e., when not all shards are requested)
+ next_candidate(cct, t, candidates, tracker_idx);
+ ++tracker_idx;
+ }
+
+ rgw_bucket_dir_entry*
+ last_entry_visited = nullptr; // to set last_entry (marker)
+ std::map<std::string, bufferlist> updates;
+ uint32_t count = 0;
+ while (count < num_entries && !candidates.empty()) {
+ r = 0;
+ // select the next entry in lexical order (first key in map);
+ // again tracker_idx is not necessarily shard number, but is index
+ // into results_trackers vector
+ tracker_idx = candidates.begin()->second;
+ auto& tracker = results_trackers.at(tracker_idx);
+
+ const std::string& name = tracker.entry_name();
+ rgw_bucket_dir_entry& dirent = tracker.dir_entry();
+
+ ldpp_dout(dpp, 20) << __func__ << ": currently processing " <<
+ dirent.key << " from shard " << tracker.shard_idx << dendl;
+
+ const bool force_check =
+ force_check_filter && force_check_filter(dirent.key.name);
+
+ if ((!dirent.exists &&
+ !dirent.is_delete_marker() &&
+ !dirent.is_common_prefix()) ||
+ !dirent.pending_map.empty() ||
+ force_check) {
+ /* there are uncommitted ops. We need to check the current
+ * state, and if the tags are old we need to do clean-up as
+ * well. */
+ librados::IoCtx sub_ctx;
+ sub_ctx.dup(ioctx);
+ ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ <<
+ " calling check_disk_state bucket=" << bucket_info.bucket <<
+ " entry=" << dirent.key << dendl_bitx;
+ r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent,
+ updates[tracker.oid_name], y);
+ if (r < 0 && r != -ENOENT) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ ": check_disk_state for \"" << dirent.key <<
+ "\" failed with r=" << r << dendl;
+ return r;
+ }
+ } else {
+ r = 0;
+ }
+
+ // at this point either r >= 0 or r == -ENOENT
+ if (r >= 0) { // i.e., if r != -ENOENT
+ ldpp_dout(dpp, 10) << __func__ << ": got " <<
+ dirent.key << dendl;
+
+ auto [it, inserted] = m.insert_or_assign(name, std::move(dirent));
+ last_entry_visited = &it->second;
+ if (inserted) {
+ ++count;
+ } else {
+ ldpp_dout(dpp, 0) << "WARNING: " << __func__ <<
+ " reassigned map value at \"" << name <<
+ "\", which should not happen" << dendl;
+ }
+ } else {
+ ldpp_dout(dpp, 10) << __func__ << ": skipping " <<
+ dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
+ last_entry_visited = &tracker.dir_entry();
+ }
+
+ // refresh the candidates map
+ vidx.clear();
+ bool need_to_stop = false;
+ auto range = candidates.equal_range(name);
+ for (auto i = range.first; i != range.second; ++i) {
+ vidx.push_back(i->second);
+ }
+ candidates.erase(range.first, range.second);
+ for (auto idx : vidx) {
+ auto& tracker_match = results_trackers.at(idx);
+ tracker_match.advance();
+ next_candidate(cct, tracker_match, candidates, idx);
+ if (tracker_match.at_end() && tracker_match.is_truncated()) {
+ need_to_stop = true;
+ break;
+ }
+ }
+ if (need_to_stop) {
+ // once we exhaust one shard that is truncated, we need to stop,
+ // as we cannot be certain that one of the next entries needs to
+ // come from that shard; S3 and swift protocols allow returning
+ // fewer than what was requested
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": stopped accumulating results at count=" << count <<
+ ", dirent=\"" << dirent.key <<
+ "\", because its shard is truncated and exhausted" << dendl;
+ break;
+ }
+ } // while we haven't provided requested # of result entries
+
+ // suggest updates if there are any
+ for (auto& miter : updates) {
+ if (miter.second.length()) {
+ ObjectWriteOperation o;
+ cls_rgw_suggest_changes(o, miter.second);
+ // we don't care if we lose suggested updates, send them off blindly
+ AioCompletion *c =
+ librados::Rados::aio_create_completion(nullptr, nullptr);
+
+ ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
+ ": doing dir_suggest on " << miter.first << dendl_bitx;
+ ioctx.aio_operate(miter.first, c, &o);
+ c->release();
+ }
+ } // updates loop
+
+ // determine truncation by checking if all the returned entries are
+ // consumed or not
+ *is_truncated = false;
+ for (const auto& t : results_trackers) {
+ if (!t.at_end() || t.is_truncated()) {
+ *is_truncated = true;
+ break;
+ }
+ }
+
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": returning, count=" << count << ", is_truncated=" << *is_truncated <<
+ dendl;
+
+ if (*is_truncated && count < num_entries) {
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": requested " << num_entries << " entries but returning " <<
+ count << ", which is truncated" << dendl;
+ }
+
+ if (last_entry_visited != nullptr && last_entry) {
+ *last_entry = last_entry_visited->key;
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": returning, last_entry=" << *last_entry << dendl;
+ } else {
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": returning, last_entry NOT SET" << dendl;
+ }
+
+ ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx;
+ return 0;
+} // RGWRados::cls_bucket_list_ordered
+
+
+// A helper function to retrieve the hash source from an incomplete
+// multipart entry by removing everything from the second to last
+// period on.
+static int parse_index_hash_source(const std::string& oid_wo_ns, std::string *index_hash_source) {
+ std::size_t found = oid_wo_ns.rfind('.');
+ if (found == std::string::npos || found < 1) {
+ return -EINVAL;
+ }
+ found = oid_wo_ns.rfind('.', found - 1);
+ if (found == std::string::npos || found < 1) {
+ return -EINVAL;
+ }
+ *index_hash_source = oid_wo_ns.substr(0, found);
+ return 0;
+}
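+
+// e.g. (illustrative oid, not from the source): for a multipart part
+// entry named "photo.jpg.2~J3a8xqQ.7", the trailing upload-id and
+// part-number components are stripped, leaving "photo.jpg" -- the
+// name whose hash determined the entry's shard placement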
+
+
+int RGWRados::cls_bucket_list_unordered(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& idx_layout,
+ int shard_id,
+ const rgw_obj_index_key& start_after,
+ const std::string& prefix,
+ uint32_t num_entries,
+ bool list_versions,
+ std::vector<rgw_bucket_dir_entry>& ent_list,
+ bool *is_truncated,
+ rgw_obj_index_key *last_entry,
+ optional_yield y,
+ RGWBucketListNameFilter force_check_filter) {
+ const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
+
+ ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": " << bucket_info.bucket <<
+ " start_after=\"" << start_after <<
+ "\", prefix=\"" << prefix <<
+ "\", shard_id=" << shard_id <<
+ "\", num_entries=" << num_entries <<
+ ", list_versions=" << list_versions <<
+ (force_check_filter ? "set" : "unset") << dendl_bitx;
+ ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
+
+ ent_list.clear();
+ static MultipartMetaFilter multipart_meta_filter;
+
+ *is_truncated = false;
+ RGWSI_RADOS::Pool index_pool;
+
+ std::map<int, std::string> oids;
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &oids, nullptr);
+ if (r < 0) {
+ return r;
+ }
+
+ auto& ioctx = index_pool.ioctx();
+
+ const uint32_t num_shards = oids.size();
+
+ rgw_obj_index_key marker = start_after;
+ uint32_t current_shard;
+ if (shard_id >= 0) {
+ current_shard = shard_id;
+ } else if (start_after.empty()) {
+ current_shard = 0u;
+ } else {
+ // at this point we have a marker (start_after) that has something
+ // in it, so we need to get to the bucket shard index, so we can
+ // start reading from there
+
+ // now convert the key (oid) to an rgw_obj_key since that will
+ // separate out the namespace, name, and instance
+ rgw_obj_key obj_key;
+ bool parsed = rgw_obj_key::parse_raw_oid(start_after.name, &obj_key);
+ if (!parsed) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ " received an invalid start marker: \"" << start_after << "\"" <<
+ dendl;
+ return -EINVAL;
+ } else if (obj_key.name.empty()) {
+ // if the name is empty that means the object name came in with
+ // a namespace only, and therefore we need to start our scan at
+ // the first bucket index shard
+ current_shard = 0u;
+ } else {
+ // so now we have the key used to compute the bucket index shard
+ // and can extract the specific shard from it
+ if (obj_key.ns == RGW_OBJ_NS_MULTIPART) {
+ // Use obj_key.ns == RGW_OBJ_NS_MULTIPART instead of
+ // the implementation relying on MultipartMetaFilter
+ // because MultipartMetaFilter only checks .meta suffix, which may
+ // exclude data multiparts but include some regular objects with .meta suffix
+ // by mistake.
+ string index_hash_source;
+ r = parse_index_hash_source(obj_key.name, &index_hash_source);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ " parse_index_hash_source unable to parse \"" << obj_key.name <<
+ "\", r=" << r << dendl;
+ return r;
+ }
+ current_shard = svc.bi_rados->bucket_shard_index(index_hash_source, num_shards);
+ } else {
+ current_shard = svc.bi_rados->bucket_shard_index(obj_key.name, num_shards);
+ }
+ }
+ }
+
+ uint32_t count = 0u;
+ std::map<std::string, bufferlist> updates;
+ rgw_obj_index_key last_added_entry;
+ while (count <= num_entries &&
+ ((shard_id >= 0 && current_shard == uint32_t(shard_id)) ||
+ current_shard < num_shards)) {
+ const std::string& oid = oids[current_shard];
+ rgw_cls_list_ret result;
+
+ librados::ObjectReadOperation op;
+ const std::string empty_delimiter;
+ cls_rgw_bucket_list_op(op, marker, prefix, empty_delimiter,
+ num_entries,
+ list_versions, &result);
+ r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, null_yield);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ ": error in rgw_rados_operate (bucket list op), r=" << r << dendl;
+ return r;
+ }
+
+ for (auto& entry : result.dir.m) {
+ rgw_bucket_dir_entry& dirent = entry.second;
+
+ bool force_check = force_check_filter &&
+ force_check_filter(dirent.key.name);
+ if ((!dirent.exists && !dirent.is_delete_marker()) ||
+ !dirent.pending_map.empty() ||
+ force_check) {
+ /* there are uncommitted ops. We need to check the current state,
+ * and if the tags are old we need to do cleanup as well. */
+ librados::IoCtx sub_ctx;
+ sub_ctx.dup(ioctx);
+ ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ <<
+ ": calling check_disk_state bucket=" << bucket_info.bucket <<
+ " entry=" << dirent.key << dendl_bitx;
+ r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent, updates[oid], y);
+ if (r < 0 && r != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ ": error in check_disk_state, r=" << r << dendl;
+ return r;
+ }
+ } else {
+ r = 0;
+ }
+
+ // at this point either r >= 0 or r == -ENOENT
+ if (r >= 0) { // i.e., if r != -ENOENT
+ ldpp_dout(dpp, 10) << __func__ << ": got " <<
+ dirent.key << dendl;
+
+ if (count < num_entries) {
+ marker = last_added_entry = dirent.key; // double assign
+ ent_list.emplace_back(std::move(dirent));
+ ++count;
+ } else {
+ last_added_entry = dirent.key;
+ *is_truncated = true;
+ ldpp_dout(dpp, 10) << "INFO: " << __func__ <<
+ ": reached max entries (" << num_entries << ") to return at \"" <<
+ dirent.key << "\"" << dendl;
+ goto check_updates;
+ }
+ } else { // r == -ENOENT
+ // in the case of -ENOENT, make sure we're advancing marker
+ // for the possible next list operation
+ marker = dirent.key;
+ }
+ } // entry for loop
+
+ if (!result.is_truncated) {
+ // if we reached the end of this shard, move on to the next shard
+ ++current_shard;
+ marker = rgw_obj_index_key();
+ }
+ } // shard loop
+
+check_updates:
+
+ // suggest updates if there are any
+ std::map<std::string, bufferlist>::iterator miter = updates.begin();
+ for (; miter != updates.end(); ++miter) {
+ if (miter->second.length()) {
+ ObjectWriteOperation o;
+ cls_rgw_suggest_changes(o, miter->second);
+ // we don't care if we lose suggested updates, send them off blindly
+ AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
+
+ ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
+ " doing dir_suggest on " << miter->first << dendl_bitx;
+ ioctx.aio_operate(miter->first, c, &o);
+ c->release();
+ }
+ }
+
+ if (last_entry && !ent_list.empty()) {
+ *last_entry = last_added_entry;
+ }
+
+ ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx;
+ return 0;
+} // RGWRados::cls_bucket_list_unordered
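+
+// unlike cls_bucket_list_ordered above, this variant reads one shard
+// at a time and performs no cross-shard merge, so entries are only
+// sorted within each shard rather than across the whole bucket;
+// callers that need a globally ordered listing must use the ordered
+// variant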
+
+
+int RGWRados::cls_obj_usage_log_add(const DoutPrefixProvider *dpp, const string& oid,
+ rgw_usage_log_info& info)
+{
+ rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
+
+ rgw_rados_ref ref;
+ int r = get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ ObjectWriteOperation op;
+ cls_rgw_usage_log_add(op, info);
+
+ r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ return r;
+}
+
+int RGWRados::cls_obj_usage_log_read(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket,
+ uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
+ string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage,
+ bool *is_truncated)
+{
+ rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
+
+ rgw_rados_ref ref;
+ int r = get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ *is_truncated = false;
+
+ r = cls_rgw_usage_log_read(ref.pool.ioctx(), ref.obj.oid, user, bucket, start_epoch, end_epoch,
+ max_entries, read_iter, usage, is_truncated);
+
+ return r;
+}
+
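+// a single cls usage-log trim call removes only a bounded batch of
+// records, so this helper repeats the op until the object class
+// returns -ENODATA, i.e. nothing remains in the requested range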
+static int cls_rgw_usage_log_trim_repeat(const DoutPrefixProvider *dpp, rgw_rados_ref ref, const string& user, const string& bucket, uint64_t start_epoch, uint64_t end_epoch)
+{
+ bool done = false;
+ do {
+ librados::ObjectWriteOperation op;
+ cls_rgw_usage_log_trim(op, user, bucket, start_epoch, end_epoch);
+ int r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ if (r == -ENODATA)
+ done = true;
+ else if (r < 0)
+ return r;
+ } while (!done);
+
+ return 0;
+}
+
+int RGWRados::cls_obj_usage_log_trim(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket,
+ uint64_t start_epoch, uint64_t end_epoch)
+{
+ rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
+
+ rgw_rados_ref ref;
+ int r = get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ r = cls_rgw_usage_log_trim_repeat(dpp, ref, user, bucket, start_epoch, end_epoch);
+ return r;
+}
+
+int RGWRados::cls_obj_usage_log_clear(const DoutPrefixProvider *dpp, string& oid)
+{
+ rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
+
+ rgw_rados_ref ref;
+ int r = get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+ librados::ObjectWriteOperation op;
+ cls_rgw_usage_log_clear(op);
+ r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ return r;
+}
+
+
+// note: this removes entries from the rados bucket index objects
+// without going through CLS; this is known to be called from
+// "radosgw-admin unlink" and "radosgw-admin bucket check --fix"
+int RGWRados::remove_objs_from_index(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const std::list<rgw_obj_index_key>& entry_key_list)
+{
+ const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
+ ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket=" << bucket_info.bucket <<
+ " entry_key_list.size()=" << entry_key_list.size() << dendl_bitx;
+ ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
+
+ const auto& current_index = bucket_info.get_current_index();
+ if (is_layout_indexless(current_index)) {
+ return -EINVAL;
+ }
+ const uint32_t num_shards = current_index.layout.normal.num_shards;
+
+ RGWSI_RADOS::Pool index_pool;
+ std::map<int, std::string> index_oids;
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt,
+ bucket_info.layout.current_index,
+ &index_pool, &index_oids, nullptr);
+ if (r < 0) {
+ ldout_bitx(bitx, dpp, 0) << "ERROR: " << __func__ <<
+ " open_bucket_index returned " << r << dendl_bitx;
+ return r;
+ }
+
+ // split up removals by shard
+ std::map<int, std::set<std::string>> sharded_removals;
+ for (const auto& entry_key : entry_key_list) {
+ const rgw_obj_key obj_key(entry_key);
+ const uint32_t shard =
+ RGWSI_BucketIndex_RADOS::bucket_shard_index(obj_key, num_shards);
+
+ // entry_key already combines namespace and name, so we first have
+ // to break that apart before we can then combine with instance
+ std::string name;
+ std::string ns; // namespace
+ rgw_obj_key::parse_index_key(entry_key.name, &name, &ns);
+ rgw_obj_key full_key(name, entry_key.instance, ns);
+ std::string combined_key = full_key.get_oid();
+
+ sharded_removals[shard].insert(combined_key);
+
+ ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ <<
+ ": removal from bucket index, bucket=" << bucket_info.bucket <<
+ " key=" << combined_key << " designated for shard " << shard <<
+ dendl_bitx;
+ }
+
+ for (const auto& removals : sharded_removals) {
+ const int shard = removals.first;
+ const std::string& oid = index_oids[shard];
+
+ ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
+ ": removal from bucket index, bucket=" << bucket_info.bucket <<
+ ", shard=" << shard << ", oid=" << oid << ", num_keys=" <<
+ removals.second.size() << dendl_bitx;
+
+ r = index_pool.ioctx().omap_rm_keys(oid, removals.second);
+ if (r < 0) {
+ ldout_bitx(bitx, dpp, 0) << "ERROR: " << __func__ <<
+ ": omap_rm_keys returned ret=" << r <<
+ dendl_bitx;
+ return r;
+ }
+ }
+
+ ldout_bitx(bitx, dpp, 5) <<
+ "EXITING " << __func__ << " and returning " << r << dendl_bitx;
+
+ return r;
+}
+
+int RGWRados::check_disk_state(const DoutPrefixProvider *dpp,
+ librados::IoCtx io_ctx,
+ RGWBucketInfo& bucket_info,
+ rgw_bucket_dir_entry& list_state,
+ rgw_bucket_dir_entry& object,
+ bufferlist& suggested_updates,
+ optional_yield y)
+{
+ const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
+ ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket=" <<
+ bucket_info.bucket << " dir_entry=" << list_state.key << dendl_bitx;
+
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ driver->get_bucket(nullptr, bucket_info, &bucket);
+ uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
+
+ std::string loc;
+
+ std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(list_state.key);
+ MultipartMetaFilter multipart_meta_filter;
+ string temp_key;
+ if (multipart_meta_filter.filter(list_state.key.name, temp_key)) {
+ obj->set_in_extra_data(true);
+ }
+
+ string oid;
+ get_obj_bucket_and_oid_loc(obj->get_obj(), oid, loc);
+
+ if (loc != list_state.locator) {
+ ldpp_dout(dpp, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
+ }
+
+ io_ctx.locator_set_key(list_state.locator);
+
+ RGWObjState *astate = NULL;
+ RGWObjManifest *manifest = nullptr;
+ RGWObjectCtx rctx(this->driver);
+ int r = get_obj_state(dpp, &rctx, bucket_info, obj.get(), &astate, &manifest, false, y);
+ if (r < 0)
+ return r;
+
+ list_state.pending_map.clear(); // we don't need this and it inflates size
+ if (!list_state.is_delete_marker() && !astate->exists) {
+ ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": disk state exists" << dendl_bitx;
+ /* object doesn't exist right now -- hopefully because it's
+ * marked as !exists and got deleted */
+ if (list_state.exists) {
+ ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": index list state exists" << dendl_bitx;
+ /* FIXME: what should happen now? Work out if there are any
+ * non-bad ways this could happen (there probably are, but annoying
+ * to handle!) */
+ }
+
+ // encode a suggested removal of that key
+ list_state.ver.epoch = io_ctx.get_last_version();
+ list_state.ver.pool = io_ctx.get_id();
+ ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": encoding remove of " << list_state.key << " on suggested_updates" << dendl_bitx;
+ cls_rgw_encode_suggestion(CEPH_RGW_REMOVE | suggest_flag, list_state, suggested_updates);
+ return -ENOENT;
+ }
+
+ string etag;
+ string content_type;
+ string storage_class;
+ ACLOwner owner;
+ bool appendable = false;
+
+ object.meta.size = astate->size;
+ object.meta.accounted_size = astate->accounted_size;
+ object.meta.mtime = astate->mtime;
+
+ map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
+ if (iter != astate->attrset.end()) {
+ etag = rgw_bl_str(iter->second);
+ }
+ iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
+ if (iter != astate->attrset.end()) {
+ content_type = rgw_bl_str(iter->second);
+ }
+ iter = astate->attrset.find(RGW_ATTR_STORAGE_CLASS);
+ if (iter != astate->attrset.end()) {
+ storage_class = rgw_bl_str(iter->second);
+ }
+ iter = astate->attrset.find(RGW_ATTR_ACL);
+ if (iter != astate->attrset.end()) {
+ r = decode_policy(dpp, iter->second, &owner);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: could not decode policy for object: " << obj << dendl;
+ }
+ }
+ iter = astate->attrset.find(RGW_ATTR_APPEND_PART_NUM);
+ if (iter != astate->attrset.end()) {
+ appendable = true;
+ }
+
+ if (manifest) {
+ RGWObjManifest::obj_iterator miter;
+ for (miter = manifest->obj_begin(dpp); miter != manifest->obj_end(dpp); ++miter) {
+ const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(driver);
+ rgw_obj loc;
+ RGWSI_Tier_RADOS::raw_obj_to_obj(manifest->get_obj().bucket, raw_loc, &loc);
+
+ if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
+ ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << " removing manifest part from index loc=" << loc << dendl_bitx;
+ r = delete_obj_index(loc, astate->mtime, dpp);
+ if (r < 0) {
+ ldout_bitx(bitx, dpp, 0) <<
+ "WARNING: " << __func__ << ": delete_obj_index returned r=" << r << dendl_bitx;
+ }
+ }
+ }
+ }
+
+ object.meta.etag = etag;
+ object.meta.content_type = content_type;
+ object.meta.storage_class = storage_class;
+ object.meta.owner = owner.get_id().to_str();
+ object.meta.owner_display_name = owner.get_display_name();
+ object.meta.appendable = appendable;
+
+ // encode suggested updates
+
+ list_state.meta.size = object.meta.size;
+ list_state.meta.accounted_size = object.meta.accounted_size;
+ list_state.meta.mtime = object.meta.mtime;
+ list_state.meta.category = main_category;
+ list_state.meta.etag = etag;
+ list_state.meta.appendable = appendable;
+ list_state.meta.content_type = content_type;
+ list_state.meta.storage_class = storage_class;
+
+ librados::IoCtx head_obj_ctx; // initialize to data pool so we can get pool id
+ r = get_obj_head_ioctx(dpp, bucket_info, obj->get_obj(), &head_obj_ctx);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " WARNING: unable to find head object data pool for \"" <<
+ obj << "\", not updating version pool/epoch" << dendl;
+ } else {
+ list_state.ver.pool = head_obj_ctx.get_id();
+ list_state.ver.epoch = astate->epoch;
+ }
+
+ if (astate->obj_tag.length() > 0) {
+ list_state.tag = astate->obj_tag.c_str();
+ }
+
+ list_state.meta.owner = owner.get_id().to_str();
+ list_state.meta.owner_display_name = owner.get_display_name();
+
+ list_state.exists = true;
+
+ ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
+ ": encoding update of " << list_state.key << " on suggested_updates" << dendl_bitx;
+ cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
+
+ ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx;
+ return 0;
+} // RGWRados::check_disk_state
+
+int RGWRados::cls_bucket_head(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, vector<rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
+{
+ RGWSI_RADOS::Pool index_pool;
+ map<int, string> oids;
+ map<int, struct rgw_cls_list_ret> list_results;
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &oids, bucket_instance_ids);
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << "cls_bucket_head: open_bucket_index() returned "
+ << r << dendl;
+ return r;
+ }
+
+ r = CLSRGWIssueGetDirHeader(index_pool.ioctx(), oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << "cls_bucket_head: CLSRGWIssueGetDirHeader() returned "
+ << r << dendl;
+ return r;
+ }
+
+ map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
+ for(; iter != list_results.end(); ++iter) {
+ headers.push_back(std::move(iter->second.dir.header));
+ }
+ return 0;
+}
+
+int RGWRados::cls_bucket_head_async(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
+{
+ RGWSI_RADOS::Pool index_pool;
+ map<int, string> bucket_objs;
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &bucket_objs, nullptr);
+ if (r < 0)
+ return r;
+
+ map<int, string>::iterator iter = bucket_objs.begin();
+ for (; iter != bucket_objs.end(); ++iter) {
+ r = cls_rgw_get_dir_header_async(index_pool.ioctx(), iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
+ if (r < 0) {
+ ctx->put();
+ break;
+ } else {
+ (*num_aio)++;
+ }
+ }
+ return r;
+}
+
+int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info,
+ const rgw_bucket& bucket,
+ uint64_t num_objs,
+ const DoutPrefixProvider *dpp)
+{
+ if (! cct->_conf.get_val<bool>("rgw_dynamic_resharding")) {
+ return 0;
+ }
+
+ bool need_resharding = false;
+ uint32_t num_source_shards = rgw::current_num_shards(bucket_info.layout);
+ const uint32_t max_dynamic_shards =
+ uint32_t(cct->_conf.get_val<uint64_t>("rgw_max_dynamic_shards"));
+
+ if (num_source_shards >= max_dynamic_shards) {
+ return 0;
+ }
+
+ uint32_t suggested_num_shards = 0;
+ const uint64_t max_objs_per_shard =
+ cct->_conf.get_val<uint64_t>("rgw_max_objs_per_shard");
+
+ // TODO: consider per-bucket sync policy here?
+ const bool is_multisite = svc.zone->get_zone().log_data;
+
+ quota_handler->check_bucket_shards(dpp, max_objs_per_shard, num_source_shards,
+ num_objs, is_multisite, need_resharding,
+ &suggested_num_shards);
+ if (! need_resharding) {
+ return 0;
+ }
+
+ const uint32_t final_num_shards =
+ RGWBucketReshard::get_preferred_shards(suggested_num_shards,
+ max_dynamic_shards);
+ // final verification, so we don't reduce number of shards
+ if (final_num_shards <= num_source_shards) {
+ return 0;
+ }
+
+ ldpp_dout(dpp, 1) << "RGWRados::" << __func__ << " bucket " << bucket.name <<
+ " needs resharding; current num shards " << bucket_info.layout.current_index.layout.normal.num_shards <<
+ "; new num shards " << final_num_shards << " (suggested " <<
+ suggested_num_shards << ")" << dendl;
+
+ return add_bucket_to_reshard(dpp, bucket_info, final_num_shards);
+}
+
+int RGWRados::add_bucket_to_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
+{
+ RGWReshard reshard(this->driver, dpp);
+
+ uint32_t num_source_shards = rgw::current_num_shards(bucket_info.layout);
+
+ new_num_shards = std::min(new_num_shards, get_max_bucket_shards());
+ if (new_num_shards <= num_source_shards) {
+ ldpp_dout(dpp, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
+ return 0;
+ }
+
+ cls_rgw_reshard_entry entry;
+ entry.time = real_clock::now();
+ entry.tenant = bucket_info.owner.tenant;
+ entry.bucket_name = bucket_info.bucket.name;
+ entry.bucket_id = bucket_info.bucket.bucket_id;
+ entry.old_num_shards = num_source_shards;
+ entry.new_num_shards = new_num_shards;
+
+ return reshard.add(dpp, entry);
+}
+
+int RGWRados::check_quota(const DoutPrefixProvider *dpp, const rgw_user& bucket_owner, rgw_bucket& bucket,
+ RGWQuota& quota,
+ uint64_t obj_size, optional_yield y,
+ bool check_size_only)
+{
+ // if we only check size, then num_objs is passed as 0
+ if(check_size_only)
+ return quota_handler->check_quota(dpp, bucket_owner, bucket, quota, 0, obj_size, y);
+
+ return quota_handler->check_quota(dpp, bucket_owner, bucket, quota, 1, obj_size, y);
+}
+
+int RGWRados::get_target_shard_id(const rgw::bucket_index_normal_layout& layout, const string& obj_key,
+ int *shard_id)
+{
+ int r = 0;
+ switch (layout.hash_type) {
+ case rgw::BucketHashType::Mod:
+ if (!layout.num_shards) {
+ if (shard_id) {
+ *shard_id = -1;
+ }
+ } else {
+ uint32_t sid = svc.bi_rados->bucket_shard_index(obj_key, layout.num_shards);
+ if (shard_id) {
+ *shard_id = (int)sid;
+ }
+ }
+ break;
+ default:
+ r = -ENOTSUP;
+ }
+ return r;
+}
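+
+// illustrative: with BucketHashType::Mod and num_shards = 16, the
+// object key is hashed and reduced modulo the shard count to select
+// one of shards 0..15; a legacy layout with num_shards == 0 reports
+// shard_id = -1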
+
+uint64_t RGWRados::instance_id()
+{
+ return get_rados_handle()->get_instance_id();
+}
+
+uint64_t RGWRados::next_bucket_id()
+{
+ std::lock_guard l{bucket_id_lock};
+ return ++max_bucket_id;
+}
+
+librados::Rados* RGWRados::get_rados_handle()
+{
+ return &rados;
+}
+
+int RGWRados::delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
+{
+ rgw_rados_ref ref;
+ int ret = get_raw_obj_ref(dpp, obj, &ref);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
+ return ret;
+ }
+
+ ObjectWriteOperation op;
+ list<string> prefixes;
+ cls_rgw_remove_obj(op, prefixes);
+
+ AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
+ ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl;
+ c->release();
+ return ret;
+ }
+
+ handles.push_back(c);
+
+ return 0;
+}
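+
+// note: the AioCompletion pushed onto `handles` above is not waited on
+// here; the caller is expected to wait on and then release each handle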
+
+int RGWRados::delete_obj_aio(const DoutPrefixProvider *dpp, const rgw_obj& obj,
+ RGWBucketInfo& bucket_info, RGWObjState *astate,
+ list<librados::AioCompletion *>& handles, bool keep_index_consistent,
+ optional_yield y)
+{
+ rgw_rados_ref ref;
+ int ret = get_obj_head_ref(dpp, bucket_info, obj, &ref);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
+ return ret;
+ }
+
+ if (keep_index_consistent) {
+ RGWRados::Bucket bop(this, bucket_info);
+ RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
+
+ ret = index_op.prepare(dpp, CLS_RGW_OP_DEL, &astate->write_tag, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
+ return ret;
+ }
+ }
+
+ ObjectWriteOperation op;
+ list<string> prefixes;
+ cls_rgw_remove_obj(op, prefixes);
+
+ AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
+ ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl;
+ c->release();
+ return ret;
+ }
+
+ handles.push_back(c);
+
+ if (keep_index_consistent) {
+ ret = delete_obj_index(obj, astate->mtime, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
+ return ret;
+ }
+ }
+ return ret;
+}
+
+void objexp_hint_entry::generate_test_instances(list<objexp_hint_entry*>& o)
+{
+ auto it = new objexp_hint_entry;
+ it->tenant = "tenant1";
+ it->bucket_name = "bucket1";
+ it->bucket_id = "1234";
+ it->obj_key = rgw_obj_key("obj");
+ o.push_back(it);
+ o.push_back(new objexp_hint_entry);
+}
+
+void objexp_hint_entry::dump(Formatter *f) const
+{
+ f->open_object_section("objexp_hint_entry");
+ encode_json("tenant", tenant, f);
+ encode_json("bucket_name", bucket_name, f);
+ encode_json("bucket_id", bucket_id, f);
+ encode_json("rgw_obj_key", obj_key, f);
+ utime_t ut(exp_time);
+ encode_json("exp_time", ut, f);
+ f->close_section();
+}
+
+void RGWOLHInfo::generate_test_instances(list<RGWOLHInfo*> &o)
+{
+ RGWOLHInfo *olh = new RGWOLHInfo;
+ olh->removed = false;
+ o.push_back(olh);
+ o.push_back(new RGWOLHInfo);
+}
+
+void RGWOLHInfo::dump(Formatter *f) const
+{
+ encode_json("target", target, f);
+}
+
+void RGWOLHPendingInfo::dump(Formatter *f) const
+{
+ utime_t ut(time);
+ encode_json("time", ut, f);
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <iostream>
+#include <functional>
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+
+#include "include/rados/librados.hpp"
+#include "include/Context.h"
+#include "include/random.h"
+#include "common/RefCountedObj.h"
+#include "common/ceph_time.h"
+#include "common/Timer.h"
+#include "rgw_common.h"
+#include "cls/rgw/cls_rgw_types.h"
+#include "cls/version/cls_version_types.h"
+#include "cls/log/cls_log_types.h"
+#include "cls/timeindex/cls_timeindex_types.h"
+#include "cls/otp/cls_otp_types.h"
+#include "rgw_quota.h"
+#include "rgw_log.h"
+#include "rgw_metadata.h"
+#include "rgw_meta_sync_status.h"
+#include "rgw_period_puller.h"
+#include "rgw_obj_manifest.h"
+#include "rgw_sync_module.h"
+#include "rgw_trim_bilog.h"
+#include "rgw_service.h"
+#include "rgw_sal.h"
+#include "rgw_aio.h"
+#include "rgw_d3n_cacherequest.h"
+
+#include "services/svc_rados.h"
+#include "services/svc_bi_rados.h"
+#include "common/Throttle.h"
+#include "common/ceph_mutex.h"
+#include "rgw_cache.h"
+#include "rgw_sal_fwd.h"
+
+struct D3nDataCache;
+
+class RGWWatcher;
+class ACLOwner;
+class RGWGC;
+class RGWMetaNotifier;
+class RGWDataNotifier;
+class RGWLC;
+class RGWObjectExpirer;
+class RGWMetaSyncProcessorThread;
+class RGWDataSyncProcessorThread;
+class RGWSyncLogTrimThread;
+class RGWSyncTraceManager;
+struct RGWZoneGroup;
+struct RGWZoneParams;
+class RGWReshard;
+class RGWReshardWait;
+
+struct get_obj_data;
+
+/* flags for put_obj_meta() */
+#define PUT_OBJ_CREATE 0x01
+#define PUT_OBJ_EXCL 0x02
+#define PUT_OBJ_CREATE_EXCL (PUT_OBJ_CREATE | PUT_OBJ_EXCL)
+
+static inline void prepend_bucket_marker(const rgw_bucket& bucket, const std::string& orig_oid, std::string& oid)
+{
+ if (bucket.marker.empty() || orig_oid.empty()) {
+ oid = orig_oid;
+ } else {
+ oid = bucket.marker;
+ oid.append("_");
+ oid.append(orig_oid);
+ }
+}
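+
+// e.g. (illustrative values): with bucket marker "9f1a2b3c" and oid
+// "photo.jpg" the result is "9f1a2b3c_photo.jpg"; an empty marker or
+// empty oid passes orig_oid through unchanged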
+
+static inline void get_obj_bucket_and_oid_loc(const rgw_obj& obj, std::string& oid, std::string& locator)
+{
+ const rgw_bucket& bucket = obj.bucket;
+ prepend_bucket_marker(bucket, obj.get_oid(), oid);
+ const std::string& loc = obj.key.get_loc();
+ if (!loc.empty()) {
+ prepend_bucket_marker(bucket, loc, locator);
+ } else {
+ locator.clear();
+ }
+}
+
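+// OLH ("object logical head") -- the indirection record that a
+// versioned bucket uses to point at the current version of an object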
+struct RGWOLHInfo {
+ rgw_obj target;
+ bool removed;
+
+ RGWOLHInfo() : removed(false) {}
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(target, bl);
+ encode(removed, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(target, bl);
+ decode(removed, bl);
+ DECODE_FINISH(bl);
+ }
+ static void generate_test_instances(std::list<RGWOLHInfo*>& o);
+ void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWOLHInfo)
+
+struct RGWOLHPendingInfo {
+ ceph::real_time time;
+
+ RGWOLHPendingInfo() {}
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(time, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(time, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWOLHPendingInfo)
+
+struct RGWUsageBatch {
+ std::map<ceph::real_time, rgw_usage_log_entry> m;
+
+ void insert(ceph::real_time& t, rgw_usage_log_entry& entry, bool *account) {
+ bool exists = m.find(t) != m.end();
+ *account = !exists;
+ m[t].aggregate(entry);
+ }
+};
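+
+// note: RGWUsageBatch::insert() reports through *account whether the
+// timestamp was seen for the first time, so the caller can account
+// each distinct time slot exactly once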
+
+struct RGWCloneRangeInfo {
+ rgw_obj src;
+ off_t src_ofs;
+ off_t dst_ofs;
+ uint64_t len;
+};
+
+class RGWFetchObjFilter {
+public:
+ virtual ~RGWFetchObjFilter() {}
+
+ virtual int filter(CephContext *cct,
+ const rgw_obj_key& source_key,
+ const RGWBucketInfo& dest_bucket_info,
+ std::optional<rgw_placement_rule> dest_placement_rule,
+ const std::map<std::string, bufferlist>& obj_attrs,
+ std::optional<rgw_user> *poverride_owner,
+ const rgw_placement_rule **prule) = 0;
+};
+
+class RGWFetchObjFilter_Default : public RGWFetchObjFilter {
+protected:
+ rgw_placement_rule dest_rule;
+public:
+ RGWFetchObjFilter_Default() {}
+
+ int filter(CephContext *cct,
+ const rgw_obj_key& source_key,
+ const RGWBucketInfo& dest_bucket_info,
+ std::optional<rgw_placement_rule> dest_placement_rule,
+ const std::map<std::string, bufferlist>& obj_attrs,
+ std::optional<rgw_user> *poverride_owner,
+ const rgw_placement_rule **prule) override;
+};
+
+struct RGWObjStateManifest {
+ RGWObjState state;
+ std::optional<RGWObjManifest> manifest;
+};
+
+class RGWObjectCtx {
+ rgw::sal::Driver* driver;
+ ceph::shared_mutex lock = ceph::make_shared_mutex("RGWObjectCtx");
+
+ std::map<rgw_obj, RGWObjStateManifest> objs_state;
+public:
+ explicit RGWObjectCtx(rgw::sal::Driver* _driver) : driver(_driver) {}
+ RGWObjectCtx(RGWObjectCtx& _o) {
+ std::unique_lock wl{lock};
+ this->driver = _o.driver;
+ this->objs_state = _o.objs_state;
+ }
+
+ rgw::sal::Driver* get_driver() {
+ return driver;
+ }
+
+ RGWObjStateManifest *get_state(const rgw_obj& obj);
+
+ void set_compressed(const rgw_obj& obj);
+ void set_atomic(rgw_obj& obj);
+ void set_prefetch_data(const rgw_obj& obj);
+ void invalidate(const rgw_obj& obj);
+};
+
+
+struct RGWRawObjState {
+ rgw_raw_obj obj;
+ bool has_attrs{false};
+ bool exists{false};
+ uint64_t size{0};
+ ceph::real_time mtime;
+ uint64_t epoch{0};
+ bufferlist obj_tag;
+ bool has_data{false};
+ bufferlist data;
+ bool prefetch_data{false};
+ uint64_t pg_ver{0};
+
+ /* important! don't forget to update copy constructor */
+
+ RGWObjVersionTracker objv_tracker;
+
+ std::map<std::string, bufferlist> attrset;
+ RGWRawObjState() {}
+ RGWRawObjState(const RGWRawObjState& rhs) : obj (rhs.obj) {
+ has_attrs = rhs.has_attrs;
+ exists = rhs.exists;
+ size = rhs.size;
+ mtime = rhs.mtime;
+ epoch = rhs.epoch;
+ if (rhs.obj_tag.length()) {
+ obj_tag = rhs.obj_tag;
+ }
+ has_data = rhs.has_data;
+ if (rhs.data.length()) {
+ data = rhs.data;
+ }
+ prefetch_data = rhs.prefetch_data;
+ pg_ver = rhs.pg_ver;
+ objv_tracker = rhs.objv_tracker;
+ }
+};
+
+struct RGWPoolIterCtx {
+ librados::IoCtx io_ctx;
+ librados::NObjectIterator iter;
+};
+
+struct RGWListRawObjsCtx {
+ bool initialized;
+ RGWPoolIterCtx iter_ctx;
+
+ RGWListRawObjsCtx() : initialized(false) {}
+};
+
+struct objexp_hint_entry {
+ std::string tenant;
+ std::string bucket_name;
+ std::string bucket_id;
+ rgw_obj_key obj_key;
+ ceph::real_time exp_time;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(2, 1, bl);
+ encode(bucket_name, bl);
+ encode(bucket_id, bl);
+ encode(obj_key, bl);
+ encode(exp_time, bl);
+ encode(tenant, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ // XXX Do we want DECODE_START_LEGACY_COMPAT_LEN(2, 1, 1, bl); ?
+ DECODE_START(2, bl);
+ decode(bucket_name, bl);
+ decode(bucket_id, bl);
+ decode(obj_key, bl);
+ decode(exp_time, bl);
+ if (struct_v >= 2) {
+ decode(tenant, bl);
+ } else {
+ tenant.clear();
+ }
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+ static void generate_test_instances(std::list<objexp_hint_entry*>& o);
+};
+WRITE_CLASS_ENCODER(objexp_hint_entry)
+
+class RGWMetaSyncStatusManager;
+class RGWDataSyncStatusManager;
+class RGWCoroutinesManagerRegistry;
+
+class RGWGetDirHeader_CB;
+class RGWGetUserHeader_CB;
+namespace rgw { namespace sal {
+ class RadosStore;
+ class MPRadosSerializer;
+ class LCRadosSerializer;
+} }
+
+class RGWAsyncRadosProcessor;
+
+template <class T>
+class RGWChainedCacheImpl;
+
+struct bucket_info_entry {
+ RGWBucketInfo info;
+ real_time mtime;
+ std::map<std::string, bufferlist> attrs;
+};
+
+struct tombstone_entry;
+
+template <class K, class V>
+class lru_map;
+using tombstone_cache_t = lru_map<rgw_obj, tombstone_entry>;
+
+class RGWIndexCompletionManager;
+
+class RGWRados
+{
+ friend class RGWGC;
+ friend class RGWMetaNotifier;
+ friend class RGWDataNotifier;
+ friend class RGWObjectExpirer;
+ friend class RGWMetaSyncProcessorThread;
+ friend class RGWDataSyncProcessorThread;
+ friend class RGWReshard;
+ friend class RGWBucketReshard;
+ friend class RGWBucketReshardLock;
+ friend class BucketIndexLockGuard;
+ friend class rgw::sal::MPRadosSerializer;
+ friend class rgw::sal::LCRadosSerializer;
+ friend class rgw::sal::RadosStore;
+
+ /** Open the pool used as root for this gateway */
+ int open_root_pool_ctx(const DoutPrefixProvider *dpp);
+ int open_gc_pool_ctx(const DoutPrefixProvider *dpp);
+ int open_lc_pool_ctx(const DoutPrefixProvider *dpp);
+ int open_objexp_pool_ctx(const DoutPrefixProvider *dpp);
+ int open_reshard_pool_ctx(const DoutPrefixProvider *dpp);
+ int open_notif_pool_ctx(const DoutPrefixProvider *dpp);
+
+ int open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx& io_ctx,
+ bool mostly_omap);
+
+
+ ceph::mutex lock = ceph::make_mutex("rados_timer_lock");
+ SafeTimer *timer;
+
+ rgw::sal::RadosStore* driver = nullptr;
+ RGWGC *gc = nullptr;
+ RGWLC *lc;
+ RGWObjectExpirer *obj_expirer;
+ bool use_gc_thread;
+ bool use_lc_thread;
+ bool quota_threads;
+ bool run_sync_thread;
+ bool run_reshard_thread;
+
+ RGWMetaNotifier *meta_notifier;
+ RGWDataNotifier *data_notifier;
+ RGWMetaSyncProcessorThread *meta_sync_processor_thread;
+ RGWSyncTraceManager *sync_tracer = nullptr;
+ std::map<rgw_zone_id, RGWDataSyncProcessorThread *> data_sync_processor_threads;
+
+ boost::optional<rgw::BucketTrimManager> bucket_trim;
+ RGWSyncLogTrimThread *sync_log_trimmer{nullptr};
+
+ ceph::mutex meta_sync_thread_lock = ceph::make_mutex("meta_sync_thread_lock");
+ ceph::mutex data_sync_thread_lock = ceph::make_mutex("data_sync_thread_lock");
+
+ librados::IoCtx root_pool_ctx; // .rgw
+
+ double inject_notify_timeout_probability = 0;
+ unsigned max_notify_retries = 0;
+
+ friend class RGWWatcher;
+
+ ceph::mutex bucket_id_lock = ceph::make_mutex("rados_bucket_id");
+
+ // This field represents the number of bucket index object shards
+ uint32_t bucket_index_max_shards;
+
+ std::string get_cluster_fsid(const DoutPrefixProvider *dpp, optional_yield y);
+
+ int get_obj_head_ref(const DoutPrefixProvider *dpp, const rgw_placement_rule& target_placement_rule, const rgw_obj& obj, rgw_rados_ref *ref);
+ int get_obj_head_ref(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref);
+ int get_system_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref);
+ uint64_t max_bucket_id;
+
+ int get_olh_target_state(const DoutPrefixProvider *dpp, RGWObjectCtx& rctx,
+ RGWBucketInfo& bucket_info, rgw::sal::Object* obj,
+ RGWObjState *olh_state, RGWObjState **target_state,
+ RGWObjManifest **target_manifest, optional_yield y);
+ int get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, RGWObjState **state, RGWObjManifest** manifest,
+ bool follow_olh, optional_yield y, bool assume_noent = false);
+ int append_atomic_test(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, rgw::sal::Object* obj,
+ librados::ObjectOperation& op, RGWObjState **state,
+ RGWObjManifest** pmanifest, optional_yield y);
+
+ int update_placement_map();
+ int store_bucket_info(RGWBucketInfo& info, std::map<std::string, bufferlist> *pattrs, RGWObjVersionTracker *objv_tracker, bool exclusive);
+
+ void remove_rgw_head_obj(librados::ObjectWriteOperation& op);
+ void cls_obj_check_prefix_exist(librados::ObjectOperation& op, const std::string& prefix, bool fail_if_exist);
+ void cls_obj_check_mtime(librados::ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type);
+protected:
+ CephContext *cct;
+
+ librados::Rados rados;
+
+ using RGWChainedCacheImpl_bucket_info_entry = RGWChainedCacheImpl<bucket_info_entry>;
+ RGWChainedCacheImpl_bucket_info_entry *binfo_cache;
+
+ tombstone_cache_t *obj_tombstone_cache;
+
+ librados::IoCtx gc_pool_ctx; // .rgw.gc
+ librados::IoCtx lc_pool_ctx; // .rgw.lc
+ librados::IoCtx objexp_pool_ctx;
+ librados::IoCtx reshard_pool_ctx;
+ librados::IoCtx notif_pool_ctx; // .rgw.notif
+
+ bool pools_initialized;
+
+ RGWQuotaHandler *quota_handler;
+
+ RGWCoroutinesManagerRegistry *cr_registry;
+
+ RGWSyncModuleInstanceRef sync_module;
+ bool writeable_zone{false};
+
+ RGWIndexCompletionManager *index_completion_manager{nullptr};
+
+ bool use_cache{false};
+ bool use_gc{true};
+ bool use_datacache{false};
+
+ int get_obj_head_ioctx(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx);
+public:
+ RGWRados(): timer(NULL),
+ gc(NULL), lc(NULL), obj_expirer(NULL), use_gc_thread(false), use_lc_thread(false), quota_threads(false),
+ run_sync_thread(false), run_reshard_thread(false), meta_notifier(NULL),
+ data_notifier(NULL), meta_sync_processor_thread(NULL),
+ bucket_index_max_shards(0),
+ max_bucket_id(0), cct(NULL),
+ binfo_cache(NULL), obj_tombstone_cache(nullptr),
+ pools_initialized(false),
+ quota_handler(NULL),
+ cr_registry(NULL),
+ pctl(&ctl),
+ reshard(NULL) {}
+
+ RGWRados& set_use_cache(bool status) {
+ use_cache = status;
+ return *this;
+ }
+
+ RGWRados& set_use_gc(bool status) {
+ use_gc = status;
+ return *this;
+ }
+
+ RGWRados& set_use_datacache(bool status) {
+ use_datacache = status;
+ return *this;
+ }
+
+ bool get_use_datacache() {
+ return use_datacache;
+ }
+
+ RGWLC *get_lc() {
+ return lc;
+ }
+
+ RGWGC *get_gc() {
+ return gc;
+ }
+
+ RGWRados& set_run_gc_thread(bool _use_gc_thread) {
+ use_gc_thread = _use_gc_thread;
+ return *this;
+ }
+
+ RGWRados& set_run_lc_thread(bool _use_lc_thread) {
+ use_lc_thread = _use_lc_thread;
+ return *this;
+ }
+
+ RGWRados& set_run_quota_threads(bool _run_quota_threads) {
+ quota_threads = _run_quota_threads;
+ return *this;
+ }
+
+ RGWRados& set_run_sync_thread(bool _run_sync_thread) {
+ run_sync_thread = _run_sync_thread;
+ return *this;
+ }
+
+ RGWRados& set_run_reshard_thread(bool _run_reshard_thread) {
+ run_reshard_thread = _run_reshard_thread;
+ return *this;
+ }
+
+ librados::IoCtx* get_lc_pool_ctx() {
+ return &lc_pool_ctx;
+ }
+
+ librados::IoCtx& get_notif_pool_ctx() {
+ return notif_pool_ctx;
+ }
+
+ void set_context(CephContext *_cct) {
+ cct = _cct;
+ }
+ void set_store(rgw::sal::RadosStore* _driver) {
+ driver = _driver;
+ }
+
+ RGWServices svc;
+ RGWCtl ctl;
+
+ RGWCtl *pctl{nullptr};
+
+ /**
+   * Amazon S3 errors contain a HostId string, but it is an opaque base64
+   * blob; we try to be more transparent. This has a wrapper so we can update
+   * it when the zonegroup/zone changes.
+ */
+ std::string host_id;
+
+ RGWReshard *reshard;
+ std::shared_ptr<RGWReshardWait> reshard_wait;
+
+ virtual ~RGWRados() = default;
+
+ tombstone_cache_t *get_tombstone_cache() {
+ return obj_tombstone_cache;
+ }
+ const RGWSyncModuleInstanceRef& get_sync_module() {
+ return sync_module;
+ }
+ RGWSyncTraceManager *get_sync_tracer() {
+ return sync_tracer;
+ }
+
+ int get_required_alignment(const DoutPrefixProvider *dpp, const rgw_pool& pool, uint64_t *alignment);
+ void get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size);
+ int get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment = nullptr);
+ int get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment = nullptr);
+
+ uint32_t get_max_bucket_shards() {
+ return RGWSI_BucketIndex_RADOS::shards_max();
+ }
+
+
+ int get_raw_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref);
+
+ int list_raw_objects_init(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& marker, RGWListRawObjsCtx *ctx);
+ int list_raw_objects_next(const DoutPrefixProvider *dpp, const std::string& prefix_filter, int max,
+ RGWListRawObjsCtx& ctx, std::list<std::string>& oids,
+ bool *is_truncated);
+ int list_raw_objects(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& prefix_filter, int max,
+ RGWListRawObjsCtx& ctx, std::list<std::string>& oids,
+ bool *is_truncated);
+ std::string list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx);
+
+ CephContext *ctx() { return cct; }
+ /** do all necessary setup of the storage device */
+ int init_begin(CephContext *_cct, const DoutPrefixProvider *dpp) {
+ set_context(_cct);
+ return init_begin(dpp);
+ }
+ /** Initialize the RADOS instance and prepare to do other ops */
+ int init_svc(bool raw, const DoutPrefixProvider *dpp);
+ int init_ctl(const DoutPrefixProvider *dpp);
+ virtual int init_rados();
+ int init_begin(const DoutPrefixProvider *dpp);
+ int init_complete(const DoutPrefixProvider *dpp);
+ void finalize();
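+
+  /* Typical bring-up sequence (a sketch; error handling elided):
+   *
+   *   RGWRados rados;
+   *   int r = rados.init_begin(cct, dpp);       // rados handle + services
+   *   if (r == 0) r = rados.init_complete(dpp); // pools, caches, threads
+   *   ...
+   *   rados.finalize();
+   */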
+
+ int register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type, const std::map<std::string, std::string>& meta);
+ int update_service_map(const DoutPrefixProvider *dpp, std::map<std::string, std::string>&& status);
+
+ /// list logs
+ int log_list_init(const DoutPrefixProvider *dpp, const std::string& prefix, RGWAccessHandle *handle);
+ int log_list_next(RGWAccessHandle handle, std::string *name);
+
+ /// remove log
+ int log_remove(const DoutPrefixProvider *dpp, const std::string& name);
+
+ /// show log
+ int log_show_init(const DoutPrefixProvider *dpp, const std::string& name, RGWAccessHandle *handle);
+ int log_show_next(const DoutPrefixProvider *dpp, RGWAccessHandle handle, rgw_log_entry *entry);
+
+ // log bandwidth info
+ int log_usage(const DoutPrefixProvider *dpp, std::map<rgw_user_bucket, RGWUsageBatch>& usage_info);
+ int read_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const std::string& bucket_name, uint64_t start_epoch, uint64_t end_epoch,
+ uint32_t max_entries, bool *is_truncated, RGWUsageIter& read_iter, std::map<rgw_user_bucket,
+ rgw_usage_log_entry>& usage);
+ int trim_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const std::string& bucket_name, uint64_t start_epoch, uint64_t end_epoch);
+ int clear_usage(const DoutPrefixProvider *dpp);
+
+ int create_pool(const DoutPrefixProvider *dpp, const rgw_pool& pool);
+
+ void create_bucket_id(std::string *bucket_id);
+
+ bool get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool);
+ bool obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj);
+
+ int create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket,
+ const std::string& zonegroup_id,
+ const rgw_placement_rule& placement_rule,
+ const std::string& swift_ver_location,
+ const RGWQuotaInfo * pquota_info,
+ std::map<std::string,bufferlist>& attrs,
+ RGWBucketInfo& bucket_info,
+ obj_version *pobjv,
+ obj_version *pep_objv,
+ ceph::real_time creation_time,
+ rgw_bucket *master_bucket,
+ uint32_t *master_num_shards,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ bool exclusive = true);
+
+ RGWCoroutinesManagerRegistry *get_cr_registry() { return cr_registry; }
+
+ struct BucketShard {
+ RGWRados *store;
+ rgw_bucket bucket;
+ int shard_id;
+ RGWSI_RADOS::Obj bucket_obj;
+
+ explicit BucketShard(RGWRados *_store) : store(_store), shard_id(-1) {}
+ int init(const rgw_bucket& _bucket, const rgw_obj& obj,
+ RGWBucketInfo* out, const DoutPrefixProvider *dpp);
+ int init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj);
+ int init(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& index, int sid);
+
+ friend std::ostream& operator<<(std::ostream& out, const BucketShard& bs) {
+ out << "BucketShard:{ bucket=" << bs.bucket <<
+ ", shard_id=" << bs.shard_id <<
+	", bucket_obj=" << bs.bucket_obj << "}";
+ return out;
+ }
+ };
+
+ class Object {
+ RGWRados *store;
+ rgw::sal::Bucket* bucket;
+ RGWObjectCtx& ctx;
+ rgw::sal::Object* obj;
+
+ BucketShard bs;
+
+ RGWObjState *state;
+ RGWObjManifest *manifest;
+
+ bool versioning_disabled;
+
+ bool bs_initialized;
+
+ const rgw_placement_rule *pmeta_placement_rule;
+
+ protected:
+ int get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, RGWObjManifest **pmanifest, bool follow_olh, optional_yield y, bool assume_noent = false);
+ void invalidate_state();
+
+ int prepare_atomic_modification(const DoutPrefixProvider *dpp, librados::ObjectWriteOperation& op, bool reset_obj, const std::string *ptag,
+ const char *ifmatch, const char *ifnomatch, bool removal_op, bool modify_tail, optional_yield y);
+ int complete_atomic_modification(const DoutPrefixProvider *dpp);
+
+ public:
+ Object(RGWRados *_store, rgw::sal::Bucket* _bucket, RGWObjectCtx& _ctx, rgw::sal::Object* _obj) : store(_store), bucket(_bucket),
+ ctx(_ctx), obj(_obj), bs(store),
+ state(NULL), manifest(nullptr), versioning_disabled(false),
+ bs_initialized(false),
+ pmeta_placement_rule(nullptr) {}
+
+ RGWRados *get_store() { return store; }
+ rgw_obj get_obj() { return obj->get_obj(); }
+ RGWObjectCtx& get_ctx() { return ctx; }
+ RGWBucketInfo& get_bucket_info() { return bucket->get_info(); }
+ const std::string& get_instance() { return obj->get_instance(); }
+ rgw::sal::Object* get_target() { return obj; }
+ int get_manifest(const DoutPrefixProvider *dpp, RGWObjManifest **pmanifest, optional_yield y);
+
+ int get_bucket_shard(BucketShard **pbs, const DoutPrefixProvider *dpp) {
+ if (!bs_initialized) {
+ int r =
+ bs.init(bucket->get_key(), obj->get_obj(), nullptr /* no RGWBucketInfo */, dpp);
+ if (r < 0) {
+ return r;
+ }
+ bs_initialized = true;
+ }
+ *pbs = &bs;
+ return 0;
+ }
+
+ void set_versioning_disabled(bool status) {
+ versioning_disabled = status;
+ }
+
+ bool versioning_enabled() {
+ return (!versioning_disabled && bucket->versioning_enabled());
+ }
+
+ void set_meta_placement_rule(const rgw_placement_rule *p) {
+ pmeta_placement_rule = p;
+ }
+
+ const rgw_placement_rule& get_meta_placement_rule() {
+ return pmeta_placement_rule ? *pmeta_placement_rule : bucket->get_placement_rule();
+ }
+
+ struct Read {
+ RGWRados::Object *source;
+
+ struct GetObjState {
+ std::map<rgw_pool, librados::IoCtx> io_ctxs;
+ rgw_pool cur_pool;
+ librados::IoCtx *cur_ioctx{nullptr};
+ rgw_obj obj;
+ rgw_raw_obj head_obj;
+ } state;
+
+ struct ConditionParams {
+ const ceph::real_time *mod_ptr;
+ const ceph::real_time *unmod_ptr;
+ bool high_precision_time;
+ uint32_t mod_zone_id;
+ uint64_t mod_pg_ver;
+ const char *if_match;
+ const char *if_nomatch;
+
+ ConditionParams() :
+ mod_ptr(NULL), unmod_ptr(NULL), high_precision_time(false), mod_zone_id(0), mod_pg_ver(0),
+ if_match(NULL), if_nomatch(NULL) {}
+ } conds;
+
+ struct Params {
+ ceph::real_time *lastmod;
+ uint64_t *obj_size;
+ std::map<std::string, bufferlist> *attrs;
+ rgw_obj *target_obj;
+
+ Params() : lastmod(nullptr), obj_size(nullptr), attrs(nullptr),
+ target_obj(nullptr) {}
+ } params;
+
+ explicit Read(RGWRados::Object *_source) : source(_source) {}
+
+ int prepare(optional_yield y, const DoutPrefixProvider *dpp);
+ static int range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end);
+ int read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y, const DoutPrefixProvider *dpp);
+ int iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb, optional_yield y);
+ int get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest, optional_yield y);
+ };
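+
+    /* Typical read sequence (a sketch; `rados`, `bucket`, `obj_ctx`, `obj`,
+     * `bl`, `y` and `dpp` are assumed to exist in the caller):
+     *
+     *   RGWRados::Object op_target(rados, bucket, obj_ctx, obj);
+     *   RGWRados::Object::Read read_op(&op_target);
+     *   uint64_t size = 0;
+     *   read_op.params.obj_size = &size;
+     *   int r = read_op.prepare(y, dpp);
+     *   if (r >= 0)
+     *     r = read_op.read(0, size - 1, bl, y, dpp); // end offset inclusive
+     */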
+
+ struct Write {
+ RGWRados::Object *target;
+
+ struct MetaParams {
+ ceph::real_time *mtime;
+ std::map<std::string, bufferlist>* rmattrs;
+ const bufferlist *data;
+ RGWObjManifest *manifest;
+ const std::string *ptag;
+ std::list<rgw_obj_index_key> *remove_objs;
+ ceph::real_time set_mtime;
+ rgw_user owner;
+ RGWObjCategory category;
+ int flags;
+ const char *if_match;
+ const char *if_nomatch;
+ std::optional<uint64_t> olh_epoch;
+ ceph::real_time delete_at;
+ bool canceled;
+ const std::string *user_data;
+ rgw_zone_set *zones_trace;
+ bool modify_tail;
+ bool completeMultipart;
+ bool appendable;
+
+ MetaParams() : mtime(NULL), rmattrs(NULL), data(NULL), manifest(NULL), ptag(NULL),
+ remove_objs(NULL), category(RGWObjCategory::Main), flags(0),
+ if_match(NULL), if_nomatch(NULL), canceled(false), user_data(nullptr), zones_trace(nullptr),
+ modify_tail(false), completeMultipart(false), appendable(false) {}
+ } meta;
+
+ explicit Write(RGWRados::Object *_target) : target(_target) {}
+
+ int _do_write_meta(const DoutPrefixProvider *dpp,
+ uint64_t size, uint64_t accounted_size,
+ std::map<std::string, bufferlist>& attrs,
+ bool modify_tail, bool assume_noent,
+ void *index_op, optional_yield y);
+ int write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size,
+ std::map<std::string, bufferlist>& attrs, optional_yield y);
+ int write_data(const char *data, uint64_t ofs, uint64_t len, bool exclusive);
+ const req_state* get_req_state() {
+ return nullptr; /* XXX dang Only used by LTTng, and it handles null anyway */
+ }
+ };
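+
+    /* Typical metadata write (a sketch; reuses the hypothetical `op_target`
+     * from the Read example above):
+     *
+     *   RGWRados::Object::Write write_op(&op_target);
+     *   write_op.meta.owner = owner;
+     *   write_op.meta.mtime = &mtime;              // out: resulting mtime
+     *   write_op.meta.category = RGWObjCategory::Main;
+     *   int r = write_op.write_meta(dpp, size, accounted_size, attrs, y);
+     */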
+
+ struct Delete {
+ RGWRados::Object *target;
+
+ struct DeleteParams {
+ rgw_user bucket_owner;
+ int versioning_status; // versioning flags defined in enum RGWBucketFlags
+ ACLOwner obj_owner; // needed for creation of deletion marker
+ uint64_t olh_epoch;
+ std::string marker_version_id;
+ uint32_t bilog_flags;
+ std::list<rgw_obj_index_key> *remove_objs;
+ ceph::real_time expiration_time;
+ ceph::real_time unmod_since;
+ ceph::real_time mtime; /* for setting delete marker mtime */
+ bool high_precision_time;
+ rgw_zone_set *zones_trace;
+ bool abortmp;
+ uint64_t parts_accounted_size;
+
+ DeleteParams() : versioning_status(0), olh_epoch(0), bilog_flags(0), remove_objs(NULL), high_precision_time(false), zones_trace(nullptr), abortmp(false), parts_accounted_size(0) {}
+ } params;
+
+ struct DeleteResult {
+ bool delete_marker;
+ std::string version_id;
+
+ DeleteResult() : delete_marker(false) {}
+ } result;
+
+ explicit Delete(RGWRados::Object *_target) : target(_target) {}
+
+ int delete_obj(optional_yield y, const DoutPrefixProvider *dpp);
+ };
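+
+    /* Typical delete (a sketch; names illustrative):
+     *
+     *   RGWRados::Object::Delete del_op(&op_target);
+     *   del_op.params.bucket_owner = bucket_owner;
+     *   del_op.params.versioning_status = versioning_flags;
+     *   int r = del_op.delete_obj(y, dpp);
+     *   // del_op.result.delete_marker reports whether a marker was created
+     */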
+
+ struct Stat {
+ RGWRados::Object *source;
+
+ struct Result {
+ rgw_obj obj;
+ std::optional<RGWObjManifest> manifest;
+ uint64_t size{0};
+ struct timespec mtime {};
+ std::map<std::string, bufferlist> attrs;
+ } result;
+
+ struct State {
+ librados::IoCtx io_ctx;
+ librados::AioCompletion *completion;
+ int ret;
+
+ State() : completion(NULL), ret(0) {}
+ } state;
+
+
+ explicit Stat(RGWRados::Object *_source) : source(_source) {}
+
+ int stat_async(const DoutPrefixProvider *dpp);
+ int wait(const DoutPrefixProvider *dpp);
+ int stat();
+ private:
+ int finish(const DoutPrefixProvider *dpp);
+ };
+ };
+
+ class Bucket {
+ RGWRados *store;
+ RGWBucketInfo bucket_info;
+ rgw_bucket& bucket;
+ int shard_id;
+
+ public:
+ Bucket(RGWRados *_store, const RGWBucketInfo& _bucket_info) : store(_store), bucket_info(_bucket_info), bucket(bucket_info.bucket),
+ shard_id(RGW_NO_SHARD) {}
+ RGWRados *get_store() { return store; }
+ rgw_bucket& get_bucket() { return bucket; }
+ RGWBucketInfo& get_bucket_info() { return bucket_info; }
+
+ int update_bucket_id(const std::string& new_bucket_id, const DoutPrefixProvider *dpp);
+
+ int get_shard_id() { return shard_id; }
+ void set_shard_id(int id) {
+ shard_id = id;
+ }
+
+ class UpdateIndex {
+ RGWRados::Bucket *target;
+ std::string optag;
+ rgw_obj obj;
+ uint16_t bilog_flags{0};
+ BucketShard bs;
+ bool bs_initialized{false};
+ bool blind;
+ bool prepared{false};
+ rgw_zone_set *zones_trace{nullptr};
+
+ int init_bs(const DoutPrefixProvider *dpp) {
+ int r =
+ bs.init(target->get_bucket(), obj, &target->bucket_info, dpp);
+ if (r < 0) {
+ return r;
+ }
+ bs_initialized = true;
+ return 0;
+ }
+
+ void invalidate_bs() {
+ bs_initialized = false;
+ }
+
+ int guard_reshard(const DoutPrefixProvider *dpp, const rgw_obj& obj_instance, BucketShard **pbs, std::function<int(BucketShard *)> call);
+ public:
+
+ UpdateIndex(RGWRados::Bucket *_target, const rgw_obj& _obj) : target(_target), obj(_obj),
+ bs(target->get_store()) {
+ blind = (target->get_bucket_info().layout.current_index.layout.type == rgw::BucketIndexType::Indexless);
+ }
+
+ int get_bucket_shard(BucketShard **pbs, const DoutPrefixProvider *dpp) {
+ if (!bs_initialized) {
+ int r = init_bs(dpp);
+ if (r < 0) {
+ return r;
+ }
+ }
+ *pbs = &bs;
+ return 0;
+ }
+
+ void set_bilog_flags(uint16_t flags) {
+ bilog_flags = flags;
+ }
+
+ void set_zones_trace(rgw_zone_set *_zones_trace) {
+ zones_trace = _zones_trace;
+ }
+
+ int prepare(const DoutPrefixProvider *dpp, RGWModifyOp, const std::string *write_tag, optional_yield y);
+ int complete(const DoutPrefixProvider *dpp, int64_t poolid, uint64_t epoch, uint64_t size,
+ uint64_t accounted_size, ceph::real_time& ut,
+ const std::string& etag, const std::string& content_type,
+ const std::string& storage_class,
+ bufferlist *acl_bl, RGWObjCategory category,
+ std::list<rgw_obj_index_key> *remove_objs, const std::string *user_data = nullptr, bool appendable = false);
+ int complete_del(const DoutPrefixProvider *dpp,
+ int64_t poolid, uint64_t epoch,
+ ceph::real_time& removed_mtime, /* mtime of removed object */
+ std::list<rgw_obj_index_key> *remove_objs);
+ int cancel(const DoutPrefixProvider *dpp,
+ std::list<rgw_obj_index_key> *remove_objs);
+
+ const std::string *get_optag() { return &optag; }
+
+ bool is_prepared() { return prepared; }
+ }; // class UpdateIndex
+
+ class List {
+ protected:
+ // absolute maximum number of objects that
+ // list_objects_(un)ordered can return
+ static constexpr int64_t bucket_list_objects_absolute_max = 25000;
+
+ RGWRados::Bucket *target;
+ rgw_obj_key next_marker;
+
+ int list_objects_ordered(const DoutPrefixProvider *dpp,
+ int64_t max,
+ std::vector<rgw_bucket_dir_entry> *result,
+ std::map<std::string, bool> *common_prefixes,
+ bool *is_truncated,
+ optional_yield y);
+ int list_objects_unordered(const DoutPrefixProvider *dpp,
+ int64_t max,
+ std::vector<rgw_bucket_dir_entry> *result,
+ std::map<std::string, bool> *common_prefixes,
+ bool *is_truncated,
+ optional_yield y);
+
+ public:
+
+ struct Params {
+ std::string prefix;
+ std::string delim;
+ rgw_obj_key marker;
+ rgw_obj_key end_marker;
+ std::string ns;
+ bool enforce_ns;
+ RGWAccessListFilter* access_list_filter;
+ RGWBucketListNameFilter force_check_filter;
+ bool list_versions;
+ bool allow_unordered;
+
+ Params() :
+ enforce_ns(true),
+ access_list_filter(nullptr),
+ list_versions(false),
+ allow_unordered(false)
+ {}
+ } params;
+
+ explicit List(RGWRados::Bucket *_target) : target(_target) {}
+
+ int list_objects(const DoutPrefixProvider *dpp, int64_t max,
+ std::vector<rgw_bucket_dir_entry> *result,
+ std::map<std::string, bool> *common_prefixes,
+ bool *is_truncated,
+ optional_yield y) {
+ if (params.allow_unordered) {
+ return list_objects_unordered(dpp, max, result, common_prefixes,
+ is_truncated, y);
+ } else {
+ return list_objects_ordered(dpp, max, result, common_prefixes,
+ is_truncated, y);
+ }
+ }
+ rgw_obj_key& get_next_marker() {
+ return next_marker;
+ }
+ }; // class List
+ }; // class Bucket
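+
+  /* Typical listing (a sketch; names illustrative):
+   *
+   *   RGWRados::Bucket target(rados, bucket_info);
+   *   RGWRados::Bucket::List list_op(&target);
+   *   list_op.params.prefix = "photos/";
+   *   list_op.params.delim = "/";
+   *   std::vector<rgw_bucket_dir_entry> results;
+   *   std::map<std::string, bool> prefixes;
+   *   bool truncated = false;
+   *   int r = list_op.list_objects(dpp, 1000, &results, &prefixes,
+   *                                &truncated, y);
+   */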
+
+ int on_last_entry_in_listing(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const std::string& obj_prefix,
+ const std::string& obj_delim,
+ std::function<int(const rgw_bucket_dir_entry&)> handler);
+
+ bool swift_versioning_enabled(rgw::sal::Bucket* bucket) const;
+
+ int swift_versioning_copy(RGWObjectCtx& obj_ctx, /* in/out */
+ const rgw_user& user, /* in */
+ rgw::sal::Bucket* bucket, /* in */
+ rgw::sal::Object* obj, /* in */
+ const DoutPrefixProvider *dpp, /* in/out */
+ optional_yield y); /* in */
+ int swift_versioning_restore(RGWObjectCtx& obj_ctx, /* in/out */
+ const rgw_user& user, /* in */
+ rgw::sal::Bucket* bucket, /* in */
+ rgw::sal::Object* obj, /* in */
+ bool& restored, /* out */
+ const DoutPrefixProvider *dpp); /* in/out */
+ int copy_obj_to_remote_dest(const DoutPrefixProvider *dpp,
+ RGWObjState *astate,
+ std::map<std::string, bufferlist>& src_attrs,
+ RGWRados::Object::Read& read_op,
+ const rgw_user& user_id,
+ rgw::sal::Object* dest_obj,
+ ceph::real_time *mtime);
+
+ enum AttrsMod {
+ ATTRSMOD_NONE = 0,
+ ATTRSMOD_REPLACE = 1,
+ ATTRSMOD_MERGE = 2
+ };
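+
+  /* Merge semantics, roughly (a sketch, not the actual implementation):
+   *
+   *   std::map<std::string, bufferlist> out = src_attrs;
+   *   if (mod == ATTRSMOD_REPLACE)      out = attrs;
+   *   else if (mod == ATTRSMOD_MERGE)
+   *     for (const auto& [k, v] : attrs) out[k] = v; // attrs wins on conflict
+   */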
+
+ D3nDataCache* d3n_data_cache{nullptr};
+
+ int rewrite_obj(rgw::sal::Object* obj, const DoutPrefixProvider *dpp, optional_yield y);
+
+ int stat_remote_obj(const DoutPrefixProvider *dpp,
+ RGWObjectCtx& obj_ctx,
+ const rgw_user& user_id,
+ req_info *info,
+ const rgw_zone_id& source_zone,
+ rgw::sal::Object* src_obj,
+ const RGWBucketInfo *src_bucket_info,
+ real_time *src_mtime,
+ uint64_t *psize,
+ const real_time *mod_ptr,
+ const real_time *unmod_ptr,
+ bool high_precision_time,
+ const char *if_match,
+ const char *if_nomatch,
+ std::map<std::string, bufferlist> *pattrs,
+ std::map<std::string, std::string> *pheaders,
+ std::string *version_id,
+ std::string *ptag,
+ std::string *petag);
+
+ int fetch_remote_obj(RGWObjectCtx& obj_ctx,
+ const rgw_user& user_id,
+ req_info *info,
+ const rgw_zone_id& source_zone,
+ rgw::sal::Object* dest_obj,
+ rgw::sal::Object* src_obj,
+ rgw::sal::Bucket* dest_bucket,
+ rgw::sal::Bucket* src_bucket,
+ std::optional<rgw_placement_rule> dest_placement,
+ ceph::real_time *src_mtime,
+ ceph::real_time *mtime,
+ const ceph::real_time *mod_ptr,
+ const ceph::real_time *unmod_ptr,
+ bool high_precision_time,
+ const char *if_match,
+ const char *if_nomatch,
+ AttrsMod attrs_mod,
+ bool copy_if_newer,
+ rgw::sal::Attrs& attrs,
+ RGWObjCategory category,
+ std::optional<uint64_t> olh_epoch,
+ ceph::real_time delete_at,
+ std::string *ptag,
+ std::string *petag,
+ void (*progress_cb)(off_t, void *),
+ void *progress_data,
+ const DoutPrefixProvider *dpp,
+ RGWFetchObjFilter *filter,
+                       rgw_zone_set *zones_trace = nullptr,
+                       std::optional<uint64_t>* bytes_transferred = nullptr);
+ /**
+ * Copy an object.
+ * dest_obj: the object to copy into
+ * src_obj: the object to copy from
+ * attrs: usage depends on attrs_mod parameter
+ * attrs_mod: the modification mode of the attrs, may have the following values:
+   * ATTRSMOD_NONE - the attributes of the source object will be
+   * copied without modification; the attrs parameter is ignored;
+   * ATTRSMOD_REPLACE - the new object will have the attributes provided in
+   * the attrs parameter; source object attributes are not copied;
+   * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
+   * are overwritten by values contained in the attrs parameter.
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+ int copy_obj(RGWObjectCtx& obj_ctx,
+ const rgw_user& user_id,
+ req_info *info,
+ const rgw_zone_id& source_zone,
+ rgw::sal::Object* dest_obj,
+ rgw::sal::Object* src_obj,
+ rgw::sal::Bucket* dest_bucket,
+ rgw::sal::Bucket* src_bucket,
+ const rgw_placement_rule& dest_placement,
+ ceph::real_time *src_mtime,
+ ceph::real_time *mtime,
+ const ceph::real_time *mod_ptr,
+ const ceph::real_time *unmod_ptr,
+ bool high_precision_time,
+ const char *if_match,
+ const char *if_nomatch,
+ AttrsMod attrs_mod,
+ bool copy_if_newer,
+ std::map<std::string, bufferlist>& attrs,
+ RGWObjCategory category,
+ uint64_t olh_epoch,
+ ceph::real_time delete_at,
+ std::string *version_id,
+ std::string *ptag,
+ std::string *petag,
+ void (*progress_cb)(off_t, void *),
+ void *progress_data,
+ const DoutPrefixProvider *dpp,
+ optional_yield y);
+
+ int copy_obj_data(RGWObjectCtx& obj_ctx,
+ rgw::sal::Bucket* bucket,
+ const rgw_placement_rule& dest_placement,
+ RGWRados::Object::Read& read_op, off_t end,
+ rgw::sal::Object* dest_obj,
+ ceph::real_time *mtime,
+ ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ uint64_t olh_epoch,
+ ceph::real_time delete_at,
+ std::string *petag,
+ const DoutPrefixProvider *dpp,
+ optional_yield y);
+
+ int transition_obj(RGWObjectCtx& obj_ctx,
+ rgw::sal::Bucket* bucket,
+ rgw::sal::Object& obj,
+ const rgw_placement_rule& placement_rule,
+ const real_time& mtime,
+ uint64_t olh_epoch,
+ const DoutPrefixProvider *dpp,
+ optional_yield y);
+
+ int check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y);
+
+ /**
+ * Delete a bucket.
+   * bucket_info: metadata of the bucket to delete
+ * Returns 0 on success, -ERR# otherwise.
+ */
+ int delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp, bool check_empty = true);
+
+ void wakeup_meta_sync_shards(std::set<int>& shard_ids);
+
+ void wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& entries);
+
+ RGWMetaSyncStatusManager* get_meta_sync_manager();
+ RGWDataSyncStatusManager* get_data_sync_manager(const rgw_zone_id& source_zone);
+
+ int set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner, const DoutPrefixProvider *dpp);
+ int set_buckets_enabled(std::vector<rgw_bucket>& buckets, bool enabled, const DoutPrefixProvider *dpp);
+ int bucket_suspended(const DoutPrefixProvider *dpp, rgw_bucket& bucket, bool *suspended);
+
+ /** Delete an object.*/
+ int delete_obj(rgw::sal::Driver* driver,
+ const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_owner,
+ const rgw_obj& src_obj,
+ int versioning_status, // versioning flags defined in enum RGWBucketFlags
+ uint16_t bilog_flags = 0,
+ const ceph::real_time& expiration_time = ceph::real_time(),
+ rgw_zone_set *zones_trace = nullptr);
+ int delete_obj(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_owner,
+ rgw::sal::Object* src_obj,
+ int versioning_status, // versioning flags defined in enum RGWBucketFlags
+ uint16_t bilog_flags = 0,
+ const ceph::real_time& expiration_time = ceph::real_time(),
+ rgw_zone_set *zones_trace = nullptr);
+
+ int delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj);
+
+ /** Remove an object from the bucket index */
+ int delete_obj_index(const rgw_obj& obj, ceph::real_time mtime, const DoutPrefixProvider *dpp);
+
+ /**
+ * Set an attr on an object.
+   * bucket_info: bucket holding the object
+   * obj: the object to set the attr on
+ * name: the attr to set
+ * bl: the contents of the attr
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+ int set_attr(const DoutPrefixProvider *dpp, RGWObjectCtx* ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, const char *name, bufferlist& bl);
+
+ int set_attrs(const DoutPrefixProvider *dpp, void *ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj,
+ std::map<std::string, bufferlist>& attrs,
+ std::map<std::string, bufferlist>* rmattrs,
+ optional_yield y);
+
+ int get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, RGWObjState **state, RGWObjManifest** manifest,
+ bool follow_olh, optional_yield y, bool assume_noent = false);
+ int get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, RGWObjState **state, RGWObjManifest** manifest, optional_yield y) {
+ return get_obj_state(dpp, rctx, bucket_info, obj, state, manifest, true, y);
+ }
+
+ using iterate_obj_cb = int (*)(const DoutPrefixProvider*, const rgw_raw_obj&, off_t, off_t,
+ off_t, bool, RGWObjState*, void*);
+
+ int iterate_obj(const DoutPrefixProvider *dpp, RGWObjectCtx& ctx, RGWBucketInfo& bucket_info,
+ rgw::sal::Object* obj, off_t ofs, off_t end,
+ uint64_t max_chunk_size, iterate_obj_cb cb, void *arg,
+ optional_yield y);
+
+ int append_atomic_test(const DoutPrefixProvider *dpp, const RGWObjState* astate, librados::ObjectOperation& op);
+
+ virtual int get_obj_iterate_cb(const DoutPrefixProvider *dpp,
+ const rgw_raw_obj& read_obj, off_t obj_ofs,
+ off_t read_ofs, off_t len, bool is_head_obj,
+ RGWObjState *astate, void *arg);
+
+ /**
+ * a simple object read without keeping state
+ */
+
+ int raw_obj_stat(const DoutPrefixProvider *dpp,
+ rgw_raw_obj& obj, uint64_t *psize, ceph::real_time *pmtime, uint64_t *epoch,
+ std::map<std::string, bufferlist> *attrs, bufferlist *first_chunk,
+ RGWObjVersionTracker *objv_tracker, optional_yield y);
+
+ int obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectWriteOperation *op);
+ int obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectReadOperation *op);
+
+ int guard_reshard(const DoutPrefixProvider *dpp,
+ BucketShard *bs,
+ const rgw_obj& obj_instance,
+ RGWBucketInfo& bucket_info,
+ std::function<int(BucketShard *)> call);
+ int block_while_resharding(RGWRados::BucketShard *bs,
+ const rgw_obj& obj_instance,
+ RGWBucketInfo& bucket_info,
+ optional_yield y,
+ const DoutPrefixProvider *dpp);
+
+ void bucket_index_guard_olh_op(const DoutPrefixProvider *dpp, RGWObjState& olh_state, librados::ObjectOperation& op);
+ int olh_init_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::string *op_tag);
+ int olh_init_modification_impl(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::string *op_tag);
+ int bucket_index_link_olh(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info, RGWObjState& olh_state,
+ const rgw_obj& obj_instance, bool delete_marker,
+ const std::string& op_tag, struct rgw_bucket_dir_entry_meta *meta,
+ uint64_t olh_epoch,
+ ceph::real_time unmod_since, bool high_precision_time,
+ rgw_zone_set *zones_trace = nullptr,
+ bool log_data_change = false);
+ int bucket_index_unlink_instance(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const rgw_obj& obj_instance,
+ const std::string& op_tag, const std::string& olh_tag,
+ uint64_t olh_epoch, rgw_zone_set *zones_trace = nullptr);
+ int bucket_index_read_olh_log(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info, RGWObjState& state,
+ const rgw_obj& obj_instance, uint64_t ver_marker,
+ std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> > *log, bool *is_truncated);
+ int bucket_index_trim_olh_log(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& obj_state, const rgw_obj& obj_instance, uint64_t ver);
+ int bucket_index_clear_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance);
+ int apply_olh_log(const DoutPrefixProvider *dpp, RGWObjState& obj_state, RGWBucketInfo& bucket_info, const rgw::sal::Object* obj,
+ bufferlist& obj_tag, std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> >& log,
+ uint64_t *plast_ver, rgw_zone_set *zones_trace = nullptr);
+ int update_olh(const DoutPrefixProvider *dpp, RGWObjState *state, RGWBucketInfo& bucket_info, const rgw::sal::Object* obj, rgw_zone_set *zones_trace = nullptr);
+ int set_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
+ uint64_t olh_epoch, ceph::real_time unmod_since, bool high_precision_time,
+ optional_yield y, rgw_zone_set *zones_trace = nullptr, bool log_data_change = false);
+ int repair_olh(const DoutPrefixProvider *dpp, RGWObjState* state, const RGWBucketInfo& bucket_info,
+ const rgw_obj& obj);
+ int unlink_obj_instance(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, rgw::sal::Object* target_obj,
+ uint64_t olh_epoch, optional_yield y, rgw_zone_set *zones_trace = nullptr);
+
+ void check_pending_olh_entries(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist>& pending_entries, std::map<std::string, bufferlist> *rm_pending_entries);
+ int remove_olh_pending_entries(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::map<std::string, bufferlist>& pending_attrs);
+ int follow_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjectCtx& ctx, RGWObjState *state, rgw::sal::Object* olh_obj, rgw_obj *target);
+ int get_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh);
+
+ void gen_rand_obj_instance_name(rgw_obj_key *target_key);
+ void gen_rand_obj_instance_name(rgw_obj *target);
+
+ int update_containers_stats(std::map<std::string, RGWBucketEnt>& m, const DoutPrefixProvider *dpp);
+ int append_async(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, size_t size, bufferlist& bl);
+
+public:
+ void set_atomic(void *ctx, rgw_obj& obj) {
+ RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
+ rctx->set_atomic(obj);
+ }
+ void set_prefetch_data(void *ctx, const rgw_obj& obj) {
+ RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
+ rctx->set_prefetch_data(obj);
+ }
+ void set_compressed(void *ctx, const rgw_obj& obj) {
+ RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
+ rctx->set_compressed(obj);
+ }
+ int decode_policy(const DoutPrefixProvider *dpp, bufferlist& bl, ACLOwner *owner);
+ int get_bucket_stats(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, std::string *bucket_ver, std::string *master_ver,
+ std::map<RGWObjCategory, RGWStorageStats>& stats, std::string *max_marker, bool* syncstopped = NULL);
+ int get_bucket_stats_async(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetBucketStats_CB *cb);
+
+ int put_bucket_instance_info(RGWBucketInfo& info, bool exclusive, ceph::real_time mtime, std::map<std::string, bufferlist> *pattrs, const DoutPrefixProvider *dpp);
+ /* xxx dang obj_ctx -> svc */
+ int get_bucket_instance_info(const std::string& meta_key, RGWBucketInfo& info, ceph::real_time *pmtime, std::map<std::string, bufferlist> *pattrs, optional_yield y, const DoutPrefixProvider *dpp);
+ int get_bucket_instance_info(const rgw_bucket& bucket, RGWBucketInfo& info, ceph::real_time *pmtime, std::map<std::string, bufferlist> *pattrs, optional_yield y, const DoutPrefixProvider *dpp);
+
+ static void make_bucket_entry_name(const std::string& tenant_name, const std::string& bucket_name, std::string& bucket_entry);
+
+ int get_bucket_info(RGWServices *svc,
+ const std::string& tenant_name, const std::string& bucket_name,
+ RGWBucketInfo& info,
+ ceph::real_time *pmtime, optional_yield y,
+ const DoutPrefixProvider *dpp, std::map<std::string, bufferlist> *pattrs = NULL);
+
+ // Returns 0 on successful refresh. Returns error code if there was
+ // an error or the version stored on the OSD is the same as that
+ // presented in the BucketInfo structure.
+ //
+ int try_refresh_bucket_info(RGWBucketInfo& info,
+ ceph::real_time *pmtime,
+ const DoutPrefixProvider *dpp,
+ std::map<std::string, bufferlist> *pattrs = nullptr);
+
+ int put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, ceph::real_time mtime, obj_version *pep_objv,
+ std::map<std::string, bufferlist> *pattrs, bool create_entry_point,
+ const DoutPrefixProvider *dpp);
+
+ int cls_obj_prepare_op(const DoutPrefixProvider *dpp, BucketShard& bs, RGWModifyOp op, std::string& tag, rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *zones_trace = nullptr);
+ int cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, std::string& tag, int64_t pool, uint64_t epoch,
+ rgw_bucket_dir_entry& ent, RGWObjCategory category, std::list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
+ int cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, std::string& tag, int64_t pool, uint64_t epoch, rgw_bucket_dir_entry& ent,
+ RGWObjCategory category, std::list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
+ int cls_obj_complete_del(BucketShard& bs, std::string& tag, int64_t pool, uint64_t epoch, rgw_obj& obj,
+ ceph::real_time& removed_mtime, std::list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
+ int cls_obj_complete_cancel(BucketShard& bs, std::string& tag, rgw_obj& obj,
+ std::list<rgw_obj_index_key> *remove_objs,
+ uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
+ int cls_obj_set_bucket_tag_timeout(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, uint64_t timeout);
+
+ using ent_map_t =
+ boost::container::flat_map<std::string, rgw_bucket_dir_entry>;
+
+ int cls_bucket_list_ordered(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& idx_layout,
+ const int shard_id,
+ const rgw_obj_index_key& start_after,
+ const std::string& prefix,
+ const std::string& delimiter,
+ const uint32_t num_entries,
+ const bool list_versions,
+ const uint16_t exp_factor, // 0 means ignore
+ ent_map_t& m,
+ bool* is_truncated,
+ bool* cls_filtered,
+ rgw_obj_index_key *last_entry,
+ optional_yield y,
+ RGWBucketListNameFilter force_check_filter = {});
+ int cls_bucket_list_unordered(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& idx_layout,
+ int shard_id,
+ const rgw_obj_index_key& start_after,
+ const std::string& prefix,
+ uint32_t num_entries,
+ bool list_versions,
+ std::vector<rgw_bucket_dir_entry>& ent_list,
+ bool *is_truncated,
+ rgw_obj_index_key *last_entry,
+ optional_yield y,
+ RGWBucketListNameFilter force_check_filter = {});
+ int cls_bucket_head(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& idx_layout,
+ int shard_id, std::vector<rgw_bucket_dir_header>& headers,
+ std::map<int, std::string> *bucket_instance_ids = NULL);
+ int cls_bucket_head_async(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& idx_layout,
+ int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio);
+ int bi_get_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_bucket_dir_entry *dirent);
+ int bi_get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_bucket_olh_entry *olh);
+ int bi_get(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry);
+ void bi_put(librados::ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry);
+ int bi_put(BucketShard& bs, rgw_cls_bi_entry& entry);
+ int bi_put(const DoutPrefixProvider *dpp, rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry);
+ int bi_list(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ int shard_id,
+ const std::string& filter_obj,
+ const std::string& marker,
+ uint32_t max,
+ std::list<rgw_cls_bi_entry> *entries,
+ bool *is_truncated);
+ int bi_list(BucketShard& bs, const std::string& filter_obj, const std::string& marker, uint32_t max, std::list<rgw_cls_bi_entry> *entries, bool *is_truncated);
+ int bi_list(const DoutPrefixProvider *dpp, rgw_bucket& bucket, const std::string& obj_name, const std::string& marker, uint32_t max,
+ std::list<rgw_cls_bi_entry> *entries, bool *is_truncated);
+ int bi_remove(const DoutPrefixProvider *dpp, BucketShard& bs);
+
+ int cls_obj_usage_log_add(const DoutPrefixProvider *dpp, const std::string& oid, rgw_usage_log_info& info);
+ int cls_obj_usage_log_read(const DoutPrefixProvider *dpp, const std::string& oid, const std::string& user, const std::string& bucket, uint64_t start_epoch,
+ uint64_t end_epoch, uint32_t max_entries, std::string& read_iter,
+ std::map<rgw_user_bucket, rgw_usage_log_entry>& usage, bool *is_truncated);
+ int cls_obj_usage_log_trim(const DoutPrefixProvider *dpp, const std::string& oid, const std::string& user, const std::string& bucket, uint64_t start_epoch,
+ uint64_t end_epoch);
+ int cls_obj_usage_log_clear(const DoutPrefixProvider *dpp, std::string& oid);
+
+ int get_target_shard_id(const rgw::bucket_index_normal_layout& layout, const std::string& obj_key, int *shard_id);
+
+ int lock_exclusive(const rgw_pool& pool, const std::string& oid, ceph::timespan& duration, rgw_zone_id& zone_id, std::string& owner_id);
+ int unlock(const rgw_pool& pool, const std::string& oid, rgw_zone_id& zone_id, std::string& owner_id);
+
+ void update_gc_chain(const DoutPrefixProvider *dpp, rgw_obj head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain);
+ std::tuple<int, std::optional<cls_rgw_obj_chain>> send_chain_to_gc(cls_rgw_obj_chain& chain, const std::string& tag);
+ void delete_objs_inline(const DoutPrefixProvider *dpp, cls_rgw_obj_chain& chain, const std::string& tag);
+ int gc_operate(const DoutPrefixProvider *dpp, std::string& oid, librados::ObjectWriteOperation *op);
+ int gc_aio_operate(const std::string& oid, librados::AioCompletion *c,
+ librados::ObjectWriteOperation *op);
+ int gc_operate(const DoutPrefixProvider *dpp, std::string& oid, librados::ObjectReadOperation *op, bufferlist *pbl);
+
+ int list_gc_objs(int *index, std::string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue);
+ int process_gc(bool expired_only);
+ bool process_expire_objects(const DoutPrefixProvider *dpp);
+ int defer_gc(const DoutPrefixProvider *dpp, RGWObjectCtx* ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, optional_yield y);
+
+ int process_lc(const std::unique_ptr<rgw::sal::Bucket>& optional_bucket);
+ int list_lc_progress(std::string& marker, uint32_t max_entries,
+ std::vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& progress_map,
+ int& index);
+
+ int bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
+ std::map<RGWObjCategory, RGWStorageStats> *existing_stats,
+ std::map<RGWObjCategory, RGWStorageStats> *calculated_stats);
+ int bucket_rebuild_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info);
+ int bucket_set_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry);
+ int remove_objs_from_index(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const std::list<rgw_obj_index_key>& oid_list);
+ int move_rados_obj(const DoutPrefixProvider *dpp,
+ librados::IoCtx& src_ioctx,
+ const std::string& src_oid, const std::string& src_locator,
+ librados::IoCtx& dst_ioctx,
+ const std::string& dst_oid, const std::string& dst_locator);
+ int fix_head_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key);
+ int fix_tail_obj_locator(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
+ rgw_obj_key& key, bool fix, bool *need_fix, optional_yield y);
+
+ int check_quota(const DoutPrefixProvider *dpp, const rgw_user& bucket_owner, rgw_bucket& bucket,
+ RGWQuota& quota, uint64_t obj_size,
+ optional_yield y, bool check_size_only = false);
+
+ int check_bucket_shards(const RGWBucketInfo& bucket_info, const rgw_bucket& bucket,
+ uint64_t num_objs, const DoutPrefixProvider *dpp);
+
+ int add_bucket_to_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, uint32_t new_num_shards);
+
+ uint64_t instance_id();
+
+ librados::Rados* get_rados_handle();
+
+ int delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, std::list<librados::AioCompletion *>& handles);
+ int delete_obj_aio(const DoutPrefixProvider *dpp, const rgw_obj& obj, RGWBucketInfo& info, RGWObjState *astate,
+ std::list<librados::AioCompletion *>& handles, bool keep_index_consistent,
+ optional_yield y);
+
+ private:
+ /**
+ * Check the actual on-disk state of the object specified
+ * by list_state, and fill in the time and size of object.
+ * Then append any changes to suggested_updates for
+ * the rgw class' dir_suggest_changes function.
+ *
+ * Note that this can maul list_state; don't use it afterwards. Also
+ * it expects object to already be filled in from list_state; it only
+ * sets the size and mtime.
+ *
+ * Returns 0 on success, -ENOENT if the object doesn't exist on disk,
+ * and -errno on other failures. (-ENOENT is not a failure, and it
+ * will encode that info as a suggested update.)
+ */
+ int check_disk_state(const DoutPrefixProvider *dpp,
+ librados::IoCtx io_ctx,
+ RGWBucketInfo& bucket_info,
+ rgw_bucket_dir_entry& list_state,
+ rgw_bucket_dir_entry& object,
+ bufferlist& suggested_updates,
+ optional_yield y);
+
+ /**
+ * Init pool iteration
+ * pool: pool to use for the ctx initialization
+ * ctx: context object to use for the iteration
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+ int pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, RGWPoolIterCtx& ctx);
+
+ /**
+ * Init pool iteration
+ * pool: pool to use
+ * cursor: position to start iteration
+ * ctx: context object to use for the iteration
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+ int pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& cursor, RGWPoolIterCtx& ctx);
+
+ /**
+ * Get pool iteration position
+ * ctx: context object to use for the iteration
+ * Returns: std::string representation of position
+ */
+ std::string pool_iterate_get_cursor(RGWPoolIterCtx& ctx);
+
+ /**
+   * Iterate over a pool, returning object names; an optional filter may be applied
+ * ctx: iteration context, initialized with pool_iterate_begin()
+ * num: max number of objects to return
+ * objs: a vector that the results will append into
+   * is_truncated: if not NULL, set to true iff the listing was truncated (more objects remain)
+ * filter: if not NULL, will be used to filter returned objects
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+ int pool_iterate(const DoutPrefixProvider *dpp, RGWPoolIterCtx& ctx, uint32_t num,
+ std::vector<rgw_bucket_dir_entry>& objs,
+ bool *is_truncated, RGWAccessListFilter *filter);
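+
+  /* Iteration protocol, roughly (a sketch; names illustrative):
+   *
+   *   RGWPoolIterCtx ictx;
+   *   int r = pool_iterate_begin(dpp, pool, ictx);
+   *   bool truncated = true;
+   *   while (r == 0 && truncated) {
+   *     std::vector<rgw_bucket_dir_entry> objs;
+   *     r = pool_iterate(dpp, ictx, 1000, objs, &truncated, nullptr);
+   *   }
+   */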
+
+ uint64_t next_bucket_id();
+
+ /**
+ * This is broken out to facilitate unit testing.
+ */
+ static uint32_t calc_ordered_bucket_list_per_shard(uint32_t num_entries,
+ uint32_t num_shards);
+};
+
+
+struct get_obj_data {
+ RGWRados* rgwrados;
+ RGWGetDataCB* client_cb = nullptr;
+ rgw::Aio* aio;
+ uint64_t offset; // next offset to write to client
+ rgw::AioResultList completed; // completed read results, sorted by offset
+ optional_yield yield;
+
+ get_obj_data(RGWRados* rgwrados, RGWGetDataCB* cb, rgw::Aio* aio,
+ uint64_t offset, optional_yield yield)
+ : rgwrados(rgwrados), client_cb(cb), aio(aio), offset(offset), yield(yield) {}
+  ~get_obj_data() {
+    if (rgwrados->get_use_datacache()) {
+      // take and release the d3n lock to serialize with any in-flight
+      // d3n cache operations before this struct is destroyed
+      const std::lock_guard l(d3n_get_data.d3n_lock);
+    }
+  }
+
+ D3nGetObjData d3n_get_data;
+ std::atomic_bool d3n_bypass_cache_write{false};
+
+ int flush(rgw::AioResultList&& results);
+
+ void cancel() {
+ // wait for all completions to drain and ignore the results
+ aio->drain();
+ }
+
+  // wait for completions and flush them in order until all reads have
+  // drained; on a flush error, cancel the remaining in-flight operations
+  int drain() {
+ auto c = aio->wait();
+ while (!c.empty()) {
+ int r = flush(std::move(c));
+ if (r < 0) {
+ cancel();
+ return r;
+ }
+ c = aio->wait();
+ }
+ return flush(std::move(c));
+ }
+};
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <limits>
+#include <sstream>
+
+#include "rgw_zone.h"
+#include "driver/rados/rgw_bucket.h"
+#include "rgw_reshard.h"
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+#include "cls/rgw/cls_rgw_client.h"
+#include "cls/lock/cls_lock_client.h"
+#include "common/errno.h"
+#include "common/ceph_json.h"
+
+#include "common/dout.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_tier_rados.h"
+#include "services/svc_bilog_rados.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+const string reshard_oid_prefix = "reshard.";
+const string reshard_lock_name = "reshard_process";
+const string bucket_instance_lock_name = "bucket_instance_lock";
+
+/* All primes up to 2000, used to attempt to make dynamic sharding use
+ * a prime number of shards. Note: this list also includes 1 for when
+ * 1 shard is the most appropriate, even though 1 is not prime.
+ */
+const std::initializer_list<uint16_t> RGWBucketReshard::reshard_primes = {
+ 1, 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61,
+ 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137,
+ 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211,
+ 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283,
+ 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379,
+ 383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461,
+ 463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541, 547, 557, 563,
+ 569, 571, 577, 587, 593, 599, 601, 607, 613, 617, 619, 631, 641, 643,
+ 647, 653, 659, 661, 673, 677, 683, 691, 701, 709, 719, 727, 733, 739,
+ 743, 751, 757, 761, 769, 773, 787, 797, 809, 811, 821, 823, 827, 829,
+ 839, 853, 857, 859, 863, 877, 881, 883, 887, 907, 911, 919, 929, 937,
+ 941, 947, 953, 967, 971, 977, 983, 991, 997, 1009, 1013, 1019, 1021,
+ 1031, 1033, 1039, 1049, 1051, 1061, 1063, 1069, 1087, 1091, 1093,
+ 1097, 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, 1171, 1181,
+ 1187, 1193, 1201, 1213, 1217, 1223, 1229, 1231, 1237, 1249, 1259,
+ 1277, 1279, 1283, 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321,
+ 1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, 1427, 1429, 1433,
+ 1439, 1447, 1451, 1453, 1459, 1471, 1481, 1483, 1487, 1489, 1493,
+ 1499, 1511, 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571, 1579,
+ 1583, 1597, 1601, 1607, 1609, 1613, 1619, 1621, 1627, 1637, 1657,
+ 1663, 1667, 1669, 1693, 1697, 1699, 1709, 1721, 1723, 1733, 1741,
+ 1747, 1753, 1759, 1777, 1783, 1787, 1789, 1801, 1811, 1823, 1831,
+ 1847, 1861, 1867, 1871, 1873, 1877, 1879, 1889, 1901, 1907, 1913,
+ 1931, 1933, 1949, 1951, 1973, 1979, 1987, 1993, 1997, 1999
+};
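+
+// How a desired shard count might be snapped to this table (a sketch; the
+// actual selection logic lives in the dynamic resharding code):
+//
+//   uint32_t nearest_prime_at_least(uint32_t want) {
+//     auto it = std::lower_bound(RGWBucketReshard::reshard_primes.begin(),
+//                                RGWBucketReshard::reshard_primes.end(), want);
+//     // requests above the largest entry are capped at 1999
+//     return it == RGWBucketReshard::reshard_primes.end() ? 1999 : *it;
+//   }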
+
+class BucketReshardShard {
+ rgw::sal::RadosStore* store;
+ const RGWBucketInfo& bucket_info;
+ int shard_id;
+ RGWRados::BucketShard bs;
+ vector<rgw_cls_bi_entry> entries;
+ map<RGWObjCategory, rgw_bucket_category_stats> stats;
+ deque<librados::AioCompletion *>& aio_completions;
+ uint64_t max_aio_completions;
+ uint64_t reshard_shard_batch_size;
+
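+  // Bounded async window: get_completion() blocks in wait_next_completion()
+  // once rgw_reshard_max_aio operations are in flight.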
+ int wait_next_completion() {
+ librados::AioCompletion *c = aio_completions.front();
+ aio_completions.pop_front();
+
+ c->wait_for_complete();
+
+ int ret = c->get_return_value();
+ c->release();
+
+ if (ret < 0) {
+ derr << "ERROR: reshard rados operation failed: " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ return 0;
+ }
+
+ int get_completion(librados::AioCompletion **c) {
+ if (aio_completions.size() >= max_aio_completions) {
+ int ret = wait_next_completion();
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ *c = librados::Rados::aio_create_completion(nullptr, nullptr);
+ aio_completions.push_back(*c);
+
+ return 0;
+ }
+
+public:
+ BucketReshardShard(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore *_store, const RGWBucketInfo& _bucket_info,
+ const rgw::bucket_index_layout_generation& index,
+ int shard_id, deque<librados::AioCompletion *>& _completions) :
+ store(_store), bucket_info(_bucket_info), shard_id(shard_id),
+ bs(store->getRados()), aio_completions(_completions)
+ {
+ bs.init(dpp, bucket_info, index, shard_id);
+
+ max_aio_completions =
+ store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_max_aio");
+ reshard_shard_batch_size =
+ store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_batch_size");
+ }
+
+ int get_shard_id() const {
+ return shard_id;
+ }
+
+ int add_entry(rgw_cls_bi_entry& entry, bool account, RGWObjCategory category,
+ const rgw_bucket_category_stats& entry_stats) {
+ entries.push_back(entry);
+ if (account) {
+ rgw_bucket_category_stats& target = stats[category];
+ target.num_entries += entry_stats.num_entries;
+ target.total_size += entry_stats.total_size;
+ target.total_size_rounded += entry_stats.total_size_rounded;
+ target.actual_size += entry_stats.actual_size;
+ }
+ if (entries.size() >= reshard_shard_batch_size) {
+ int ret = flush();
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ return 0;
+ }
+
+ int flush() {
+    if (entries.empty()) {
+ return 0;
+ }
+
+ librados::ObjectWriteOperation op;
+ for (auto& entry : entries) {
+ store->getRados()->bi_put(op, bs, entry);
+ }
+ cls_rgw_bucket_update_stats(op, false, stats);
+
+ librados::AioCompletion *c;
+ int ret = get_completion(&c);
+ if (ret < 0) {
+ return ret;
+ }
+ ret = bs.bucket_obj.aio_operate(c, &op);
+ if (ret < 0) {
+ derr << "ERROR: failed to store entries in target bucket shard (bs=" << bs.bucket << "/" << bs.shard_id << ") error=" << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ entries.clear();
+ stats.clear();
+ return 0;
+ }
+
+ int wait_all_aio() {
+ int ret = 0;
+ while (!aio_completions.empty()) {
+ int r = wait_next_completion();
+ if (r < 0) {
+ ret = r;
+ }
+ }
+ return ret;
+ }
+}; // class BucketReshardShard
+
+
+class BucketReshardManager {
+ rgw::sal::RadosStore *store;
+ deque<librados::AioCompletion *> completions;
+ vector<BucketReshardShard> target_shards;
+
+public:
+ BucketReshardManager(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore *_store,
+ const RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& target)
+ : store(_store)
+ {
+ const int num_shards = target.layout.normal.num_shards;
+ target_shards.reserve(num_shards);
+ for (int i = 0; i < num_shards; ++i) {
+ target_shards.emplace_back(dpp, store, bucket_info, target, i, completions);
+ }
+ }
+
+ ~BucketReshardManager() {
+ for (auto& shard : target_shards) {
+ int ret = shard.wait_all_aio();
+ if (ret < 0) {
+ ldout(store->ctx(), 20) << __func__ <<
+	": shard.wait_all_aio() returned ret=" << ret << dendl;
+ }
+ }
+ }
+
+ int add_entry(int shard_index,
+ rgw_cls_bi_entry& entry, bool account, RGWObjCategory category,
+ const rgw_bucket_category_stats& entry_stats) {
+ int ret = target_shards[shard_index].add_entry(entry, account, category,
+ entry_stats);
+ if (ret < 0) {
+ derr << "ERROR: target_shards.add_entry(" << entry.idx <<
+ ") returned error: " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ return 0;
+ }
+
+ int finish() {
+ int ret = 0;
+ for (auto& shard : target_shards) {
+ int r = shard.flush();
+ if (r < 0) {
+ derr << "ERROR: target_shards[" << shard.get_shard_id() << "].flush() returned error: " << cpp_strerror(-r) << dendl;
+ ret = r;
+ }
+ }
+ for (auto& shard : target_shards) {
+ int r = shard.wait_all_aio();
+ if (r < 0) {
+ derr << "ERROR: target_shards[" << shard.get_shard_id() << "].wait_all_aio() returned error: " << cpp_strerror(-r) << dendl;
+ ret = r;
+ }
+ }
+ target_shards.clear();
+ return ret;
+ }
+}; // class BucketReshardManager
+
+RGWBucketReshard::RGWBucketReshard(rgw::sal::RadosStore* _store,
+ const RGWBucketInfo& _bucket_info,
+ const std::map<std::string, bufferlist>& _bucket_attrs,
+ RGWBucketReshardLock* _outer_reshard_lock) :
+ store(_store), bucket_info(_bucket_info), bucket_attrs(_bucket_attrs),
+ reshard_lock(store, bucket_info, true),
+ outer_reshard_lock(_outer_reshard_lock)
+{ }
+
+// sets reshard status of bucket index shards for the current index layout
+static int set_resharding_status(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore* store,
+ const RGWBucketInfo& bucket_info,
+ cls_rgw_reshard_status status)
+{
+ cls_rgw_bucket_instance_entry instance_entry;
+ instance_entry.set_status(status);
+
+ int ret = store->getRados()->bucket_set_reshard(dpp, bucket_info, instance_entry);
+ if (ret < 0) {
+    ldpp_dout(dpp, 0) << "RGWReshard::" << __func__ << " ERROR: failed to set bucket resharding flag on bucket index: "
+ << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ return 0;
+}
+
+static int remove_old_reshard_instance(rgw::sal::RadosStore* store,
+ const rgw_bucket& bucket,
+ const DoutPrefixProvider* dpp)
+{
+ RGWBucketInfo info;
+ int r = store->getRados()->get_bucket_instance_info(bucket, info, nullptr,
+ nullptr, null_yield, dpp);
+ if (r < 0) {
+ return r;
+ }
+
+ // delete its shard objects (ignore errors)
+ store->svc()->bi->clean_index(dpp, info, info.layout.current_index);
+ // delete the bucket instance metadata
+ return store->ctl()->bucket->remove_bucket_instance_info(bucket, info, null_yield, dpp);
+}
+
+// initialize the new bucket index shard objects
+static int init_target_index(rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& index,
+ const DoutPrefixProvider* dpp)
+{
+ int ret = store->svc()->bi->init_index(dpp, bucket_info, index);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to initialize "
+ "target index shard objects: " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ if (!bucket_info.datasync_flag_enabled()) {
+ // if bucket sync is disabled, disable it on each of the new shards too
+ auto log = rgw::log_layout_from_index(0, index);
+ ret = store->svc()->bilog_rados->log_stop(dpp, bucket_info, log, -1);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to disable "
+ "bucket sync on the target index shard objects: "
+ << cpp_strerror(ret) << dendl;
+ store->svc()->bi->clean_index(dpp, bucket_info, index);
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
+// initialize a target index layout, create its bucket index shard objects, and
+// write the target layout to the bucket instance metadata
+static int init_target_layout(rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ std::map<std::string, bufferlist>& bucket_attrs,
+ ReshardFaultInjector& fault,
+ uint32_t new_num_shards,
+ const DoutPrefixProvider* dpp)
+{
+ auto prev = bucket_info.layout; // make a copy for cleanup
+ const auto current = prev.current_index;
+
+ // initialize a new normal target index layout generation
+ rgw::bucket_index_layout_generation target;
+ target.layout.type = rgw::BucketIndexType::Normal;
+ target.layout.normal.num_shards = new_num_shards;
+ target.gen = current.gen + 1;
+
+ if (bucket_info.reshard_status == cls_rgw_reshard_status::IN_PROGRESS) {
+ // backward-compatible cleanup of old reshards, where the target was in a
+ // different bucket instance
+ if (!bucket_info.new_bucket_instance_id.empty()) {
+ rgw_bucket new_bucket = bucket_info.bucket;
+ new_bucket.bucket_id = bucket_info.new_bucket_instance_id;
+ ldout(store->ctx(), 10) << __func__ << " removing target bucket instance "
+ "from a previous reshard attempt" << dendl;
+ // ignore errors
+ remove_old_reshard_instance(store, new_bucket, dpp);
+ }
+ bucket_info.reshard_status = cls_rgw_reshard_status::NOT_RESHARDING;
+ }
+
+ if (bucket_info.layout.target_index) {
+    // a previous reshard failed or stalled, and its reshard lock was dropped
+ ldpp_dout(dpp, 10) << __func__ << " removing existing target index "
+ "objects from a previous reshard attempt" << dendl;
+ // delete its existing shard objects (ignore errors)
+ store->svc()->bi->clean_index(dpp, bucket_info, *bucket_info.layout.target_index);
+ // don't reuse this same generation in the new target layout, in case
+ // something is still trying to operate on its shard objects
+ target.gen = bucket_info.layout.target_index->gen + 1;
+ }
+
+ // create the index shard objects
+ int ret = init_target_index(store, bucket_info, target, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ // retry in case of racing writes to the bucket instance metadata
+ static constexpr auto max_retries = 10;
+ int tries = 0;
+ do {
+ // update resharding state
+ bucket_info.layout.target_index = target;
+ bucket_info.layout.resharding = rgw::BucketReshardState::InProgress;
+
+ if (ret = fault.check("set_target_layout");
+ ret == 0) { // no fault injected, write the bucket instance metadata
+ ret = store->getRados()->put_bucket_instance_info(bucket_info, false,
+ real_time(), &bucket_attrs, dpp);
+ } else if (ret == -ECANCELED) {
+ fault.clear(); // clear the fault so a retry can succeed
+ }
+
+ if (ret == -ECANCELED) {
+ // racing write detected, read the latest bucket info and try again
+ int ret2 = store->getRados()->get_bucket_instance_info(
+ bucket_info.bucket, bucket_info,
+ nullptr, &bucket_attrs, null_yield, dpp);
+ if (ret2 < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read "
+ "bucket info: " << cpp_strerror(ret2) << dendl;
+ ret = ret2;
+ break;
+ }
+
+ // check that we're still in the reshard state we started in
+ if (bucket_info.layout.resharding != rgw::BucketReshardState::None ||
+ bucket_info.layout.current_index != current) {
+ ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
+ "another reshard" << dendl;
+ break;
+ }
+
+ prev = bucket_info.layout; // update the copy
+ }
+ ++tries;
+ } while (ret == -ECANCELED && tries < max_retries);
+
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to write "
+ "target index layout to bucket info: " << cpp_strerror(ret) << dendl;
+
+ bucket_info.layout = std::move(prev); // restore in-memory layout
+
+ // delete the target shard objects (ignore errors)
+ store->svc()->bi->clean_index(dpp, bucket_info, target);
+ return ret;
+ }
+ return 0;
+} // init_target_layout
+
+// delete the bucket index shards associated with the target layout and remove
+// it from the bucket instance metadata
+static int revert_target_layout(rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ std::map<std::string, bufferlist>& bucket_attrs,
+ ReshardFaultInjector& fault,
+ const DoutPrefixProvider* dpp)
+{
+ auto prev = bucket_info.layout; // make a copy for cleanup
+
+ // remove target index shard objects
+ int ret = store->svc()->bi->clean_index(dpp, bucket_info, *prev.target_index);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to remove "
+ "target index with: " << cpp_strerror(ret) << dendl;
+ ret = 0; // non-fatal error
+ }
+
+ // retry in case of racing writes to the bucket instance metadata
+ static constexpr auto max_retries = 10;
+ int tries = 0;
+ do {
+ // clear target_index and resharding state
+ bucket_info.layout.target_index = std::nullopt;
+ bucket_info.layout.resharding = rgw::BucketReshardState::None;
+
+ if (ret = fault.check("revert_target_layout");
+ ret == 0) { // no fault injected, revert the bucket instance metadata
+ ret = store->getRados()->put_bucket_instance_info(bucket_info, false,
+ real_time(),
+ &bucket_attrs, dpp);
+ } else if (ret == -ECANCELED) {
+ fault.clear(); // clear the fault so a retry can succeed
+ }
+
+ if (ret == -ECANCELED) {
+ // racing write detected, read the latest bucket info and try again
+ int ret2 = store->getRados()->get_bucket_instance_info(
+ bucket_info.bucket, bucket_info,
+ nullptr, &bucket_attrs, null_yield, dpp);
+ if (ret2 < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read "
+ "bucket info: " << cpp_strerror(ret2) << dendl;
+ ret = ret2;
+ break;
+ }
+
+ // check that we're still in the reshard state we started in
+ if (bucket_info.layout.resharding == rgw::BucketReshardState::None) {
+ ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
+ "reshard cancel" << dendl;
+ return -ECANCELED;
+ }
+ if (bucket_info.layout.current_index != prev.current_index ||
+ bucket_info.layout.target_index != prev.target_index) {
+ ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
+ "another reshard" << dendl;
+ return -ECANCELED;
+ }
+
+ prev = bucket_info.layout; // update the copy
+ }
+ ++tries;
+ } while (ret == -ECANCELED && tries < max_retries);
+
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to clear "
+ "target index layout in bucket info: " << cpp_strerror(ret) << dendl;
+
+ bucket_info.layout = std::move(prev); // restore in-memory layout
+ return ret;
+ }
+ return 0;
+} // revert_target_layout
+
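+// the overall reshard sequence is init_reshard() -> do_reshard() ->
+// commit_reshard(), with cancel_reshard() rolling everything back if a
+// step fails; see RGWBucketReshard::execute() below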
+static int init_reshard(rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ std::map<std::string, bufferlist>& bucket_attrs,
+ ReshardFaultInjector& fault,
+ uint32_t new_num_shards,
+ const DoutPrefixProvider *dpp)
+{
+ int ret = init_target_layout(store, bucket_info, bucket_attrs, fault, new_num_shards, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (ret = fault.check("block_writes");
+ ret == 0) { // no fault injected, block writes to the current index shards
+ ret = set_resharding_status(dpp, store, bucket_info,
+ cls_rgw_reshard_status::IN_PROGRESS);
+ }
+
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to pause "
+ "writes to the current index: " << cpp_strerror(ret) << dendl;
+ // clean up the target layout (ignore errors)
+ revert_target_layout(store, bucket_info, bucket_attrs, fault, dpp);
+ return ret;
+ }
+ return 0;
+} // init_reshard
+
+static int cancel_reshard(rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ std::map<std::string, bufferlist>& bucket_attrs,
+ ReshardFaultInjector& fault,
+ const DoutPrefixProvider *dpp)
+{
+ // unblock writes to the current index shard objects
+ int ret = set_resharding_status(dpp, store, bucket_info,
+ cls_rgw_reshard_status::NOT_RESHARDING);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to unblock "
+ "writes to current index objects: " << cpp_strerror(ret) << dendl;
+ ret = 0; // non-fatal error
+ }
+
+ if (bucket_info.layout.target_index) {
+ return revert_target_layout(store, bucket_info, bucket_attrs, fault, dpp);
+ }
+ // there is nothing to revert
+ return 0;
+} // cancel_reshard
+
+static int commit_target_layout(rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ std::map<std::string, bufferlist>& bucket_attrs,
+ ReshardFaultInjector& fault,
+ const DoutPrefixProvider *dpp)
+{
+ auto& layout = bucket_info.layout;
+ const auto next_log_gen = layout.logs.empty() ? 1 :
+ layout.logs.back().gen + 1;
+
+ if (!store->svc()->zone->need_to_log_data()) {
+ // if we're not syncing data, we can drop any existing logs
+ layout.logs.clear();
+ }
+
+ // use the new index layout as current
+ ceph_assert(layout.target_index);
+ layout.current_index = std::move(*layout.target_index);
+ layout.target_index = std::nullopt;
+ layout.resharding = rgw::BucketReshardState::None;
+ // add the in-index log layout
+ layout.logs.push_back(log_layout_from_index(next_log_gen, layout.current_index));
+
+ int ret = fault.check("commit_target_layout");
+ if (ret == 0) { // no fault injected, write the bucket instance metadata
+ ret = store->getRados()->put_bucket_instance_info(
+ bucket_info, false, real_time(), &bucket_attrs, dpp);
+ } else if (ret == -ECANCELED) {
+ fault.clear(); // clear the fault so a retry can succeed
+ }
+ return ret;
+} // commit_target_layout
+
+static int commit_reshard(rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ std::map<std::string, bufferlist>& bucket_attrs,
+ ReshardFaultInjector& fault,
+ const DoutPrefixProvider *dpp)
+{
+ auto prev = bucket_info.layout; // make a copy for cleanup
+
+ // retry in case of racing writes to the bucket instance metadata
+ static constexpr auto max_retries = 10;
+ int tries = 0;
+ int ret = 0;
+ do {
+ ret = commit_target_layout(store, bucket_info, bucket_attrs, fault, dpp);
+ if (ret == -ECANCELED) {
+ // racing write detected, read the latest bucket info and try again
+ int ret2 = store->getRados()->get_bucket_instance_info(
+ bucket_info.bucket, bucket_info,
+ nullptr, &bucket_attrs, null_yield, dpp);
+ if (ret2 < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read "
+ "bucket info: " << cpp_strerror(ret2) << dendl;
+ ret = ret2;
+ break;
+ }
+
+ // check that we're still in the reshard state we started in
+ if (bucket_info.layout.resharding != rgw::BucketReshardState::InProgress) {
+ ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
+ "reshard cancel" << dendl;
+ return -ECANCELED; // whatever canceled us already did the cleanup
+ }
+ if (bucket_info.layout.current_index != prev.current_index ||
+ bucket_info.layout.target_index != prev.target_index) {
+ ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
+ "another reshard" << dendl;
+ return -ECANCELED; // whatever canceled us already did the cleanup
+ }
+
+ prev = bucket_info.layout; // update the copy
+ }
+ ++tries;
+ } while (ret == -ECANCELED && tries < max_retries);
+
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to commit "
+ "target index layout: " << cpp_strerror(ret) << dendl;
+
+ bucket_info.layout = std::move(prev); // restore in-memory layout
+
+ // unblock writes to the current index shard objects
+ int ret2 = set_resharding_status(dpp, store, bucket_info,
+ cls_rgw_reshard_status::NOT_RESHARDING);
+ if (ret2 < 0) {
+ ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to unblock "
+ "writes to current index objects: " << cpp_strerror(ret2) << dendl;
+ // non-fatal error
+ }
+ return ret;
+ }
+
+ if (store->svc()->zone->need_to_log_data() && !prev.logs.empty() &&
+ prev.current_index.layout.type == rgw::BucketIndexType::Normal) {
+ // write a datalog entry for each shard of the previous index. triggering
+ // sync on the old shards will force them to detect the end-of-log for that
+ // generation, and eventually transition to the next
+ // TODO: use a log layout to support types other than BucketLogType::InIndex
+ for (uint32_t shard_id = 0; shard_id < prev.current_index.layout.normal.num_shards; ++shard_id) {
+ ret = store->svc()->datalog_rados->add_entry(dpp, bucket_info, prev.logs.back(), shard_id);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "WARNING: failed writing data log (bucket_info.bucket="
+          << bucket_info.bucket << ", shard_id=" << shard_id << " of generation="
+ << prev.logs.back().gen << ")" << dendl;
+ } // datalog error is not fatal
+ }
+ }
+
+ // check whether the old index objects are still needed for bilogs
+ const auto& logs = bucket_info.layout.logs;
+ auto log = std::find_if(logs.begin(), logs.end(),
+ [&prev] (const rgw::bucket_log_layout_generation& log) {
+ return log.layout.type == rgw::BucketLogType::InIndex
+ && log.layout.in_index.gen == prev.current_index.gen;
+ });
+ if (log == logs.end()) {
+ // delete the index objects (ignore errors)
+ store->svc()->bi->clean_index(dpp, bucket_info, prev.current_index);
+ }
+ return 0;
+} // commit_reshard
+
+int RGWBucketReshard::clear_resharding(rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ std::map<std::string, bufferlist>& bucket_attrs,
+ const DoutPrefixProvider* dpp)
+{
+ ReshardFaultInjector no_fault;
+ return cancel_reshard(store, bucket_info, bucket_attrs, no_fault, dpp);
+}
+
+int RGWBucketReshard::cancel(const DoutPrefixProvider* dpp)
+{
+ int ret = reshard_lock.lock(dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (bucket_info.layout.resharding != rgw::BucketReshardState::InProgress) {
+ ldpp_dout(dpp, -1) << "ERROR: bucket is not resharding" << dendl;
+ ret = -EINVAL;
+ } else {
+ ret = clear_resharding(store, bucket_info, bucket_attrs, dpp);
+ }
+
+ reshard_lock.unlock();
+ return ret;
+}
+
+RGWBucketReshardLock::RGWBucketReshardLock(rgw::sal::RadosStore* _store,
+ const std::string& reshard_lock_oid,
+ bool _ephemeral) :
+ store(_store),
+ lock_oid(reshard_lock_oid),
+ ephemeral(_ephemeral),
+ internal_lock(reshard_lock_name)
+{
+ const int lock_dur_secs = store->ctx()->_conf.get_val<uint64_t>(
+ "rgw_reshard_bucket_lock_duration");
+ duration = std::chrono::seconds(lock_dur_secs);
+
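+  // a random cookie identifies this locker instance, so a stale process
+  // cannot renew or release a lock it no longer holds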
+#define COOKIE_LEN 16
+ char cookie_buf[COOKIE_LEN + 1];
+ gen_rand_alphanumeric(store->ctx(), cookie_buf, sizeof(cookie_buf) - 1);
+ cookie_buf[COOKIE_LEN] = '\0';
+
+ internal_lock.set_cookie(cookie_buf);
+ internal_lock.set_duration(duration);
+}
+
+int RGWBucketReshardLock::lock(const DoutPrefixProvider *dpp) {
+ internal_lock.set_must_renew(false);
+
+ int ret;
+ if (ephemeral) {
+ ret = internal_lock.lock_exclusive_ephemeral(&store->getRados()->reshard_pool_ctx,
+ lock_oid);
+ } else {
+ ret = internal_lock.lock_exclusive(&store->getRados()->reshard_pool_ctx, lock_oid);
+ }
+
+ if (ret == -EBUSY) {
+ ldout(store->ctx(), 0) << "INFO: RGWReshardLock::" << __func__ <<
+ " found lock on " << lock_oid <<
+ " to be held by another RGW process; skipping for now" << dendl;
+ return ret;
+ } else if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: RGWReshardLock::" << __func__ <<
+ " failed to acquire lock on " << lock_oid << ": " <<
+ cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ reset_time(Clock::now());
+
+ return 0;
+}
+
+void RGWBucketReshardLock::unlock() {
+ int ret = internal_lock.unlock(&store->getRados()->reshard_pool_ctx, lock_oid);
+ if (ret < 0) {
+ ldout(store->ctx(), 0) << "WARNING: RGWBucketReshardLock::" << __func__ <<
+ " failed to drop lock on " << lock_oid << " ret=" << ret << dendl;
+ }
+}
+
+int RGWBucketReshardLock::renew(const Clock::time_point& now) {
+ internal_lock.set_must_renew(true);
+ int ret;
+ if (ephemeral) {
+ ret = internal_lock.lock_exclusive_ephemeral(&store->getRados()->reshard_pool_ctx,
+ lock_oid);
+ } else {
+ ret = internal_lock.lock_exclusive(&store->getRados()->reshard_pool_ctx, lock_oid);
+ }
+ if (ret < 0) { /* expired or already locked by another processor */
+ std::stringstream error_s;
+ if (-ENOENT == ret) {
+ error_s << "ENOENT (lock expired or never initially locked)";
+ } else {
+ error_s << ret << " (" << cpp_strerror(-ret) << ")";
+ }
+ ldout(store->ctx(), 5) << __func__ << "(): failed to renew lock on " <<
+ lock_oid << " with error " << error_s.str() << dendl;
+ return ret;
+ }
+ internal_lock.set_must_renew(false);
+
+ reset_time(now);
+ ldout(store->ctx(), 20) << __func__ << "(): successfully renewed lock on " <<
+ lock_oid << dendl;
+
+ return 0;
+}
+
+
+int RGWBucketReshard::do_reshard(const rgw::bucket_index_layout_generation& current,
+ const rgw::bucket_index_layout_generation& target,
+ int max_entries,
+ bool verbose,
+ ostream *out,
+ Formatter *formatter,
+ const DoutPrefixProvider *dpp)
+{
+ if (out) {
+ (*out) << "tenant: " << bucket_info.bucket.tenant << std::endl;
+ (*out) << "bucket name: " << bucket_info.bucket.name << std::endl;
+ }
+
+ /* update bucket info -- in progress*/
+ list<rgw_cls_bi_entry> entries;
+
+ if (max_entries < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ ": can't reshard, negative max_entries" << dendl;
+ return -EINVAL;
+ }
+
+ BucketReshardManager target_shards_mgr(dpp, store, bucket_info, target);
+
+ bool verbose_json_out = verbose && (formatter != nullptr) && (out != nullptr);
+
+ if (verbose_json_out) {
+ formatter->open_array_section("entries");
+ }
+
+ uint64_t total_entries = 0;
+
+ if (!verbose_json_out && out) {
+ (*out) << "total entries:";
+ }
+
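+  // walk every shard of the current index, listing entries in batches of
+  // max_entries and re-hashing each key into its target shard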
+ const int num_source_shards = current.layout.normal.num_shards;
+ string marker;
+ for (int i = 0; i < num_source_shards; ++i) {
+ bool is_truncated = true;
+ marker.clear();
+ const std::string null_object_filter; // empty string since we're not filtering by object
+ while (is_truncated) {
+ entries.clear();
+ int ret = store->getRados()->bi_list(dpp, bucket_info, i, null_object_filter, marker, max_entries, &entries, &is_truncated);
+ if (ret < 0 && ret != -ENOENT) {
+ derr << "ERROR: bi_list(): " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ for (auto iter = entries.begin(); iter != entries.end(); ++iter) {
+ rgw_cls_bi_entry& entry = *iter;
+ if (verbose_json_out) {
+ formatter->open_object_section("entry");
+
+ encode_json("shard_id", i, formatter);
+ encode_json("num_entry", total_entries, formatter);
+ encode_json("entry", entry, formatter);
+ }
+ total_entries++;
+
+ marker = entry.idx;
+
+ int target_shard_id;
+ cls_rgw_obj_key cls_key;
+ RGWObjCategory category;
+ rgw_bucket_category_stats stats;
+ bool account = entry.get_info(&cls_key, &category, &stats);
+ rgw_obj_key key(cls_key);
+ if (entry.type == BIIndexType::OLH && key.empty()) {
+ // bogus entry created by https://tracker.ceph.com/issues/46456
+          // to fix, skip it so it doesn't get included in the new bucket instance
+ total_entries--;
+ ldpp_dout(dpp, 10) << "Dropping entry with empty name, idx=" << marker << dendl;
+ continue;
+ }
+ rgw_obj obj(bucket_info.bucket, key);
+ RGWMPObj mp;
+ if (key.ns == RGW_OBJ_NS_MULTIPART && mp.from_meta(key.name)) {
+ // place the multipart .meta object on the same shard as its head object
+ obj.index_hash_source = mp.get_key();
+ }
+ ret = store->getRados()->get_target_shard_id(bucket_info.layout.target_index->layout.normal,
+ obj.get_hash_object(), &target_shard_id);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: get_target_shard_id() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ int shard_index = (target_shard_id > 0 ? target_shard_id : 0);
+
+ ret = target_shards_mgr.add_entry(shard_index, entry, account,
+ category, stats);
+ if (ret < 0) {
+ return ret;
+ }
+
+ Clock::time_point now = Clock::now();
+ if (reshard_lock.should_renew(now)) {
+          // assume outer locks have timespans at least as long as ours, so
+          // we can renew them inside this conditional
+ if (outer_reshard_lock) {
+ ret = outer_reshard_lock->renew(now);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+ ret = reshard_lock.renew(now);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "Error renewing bucket lock: " << ret << dendl;
+ return ret;
+ }
+ }
+ if (verbose_json_out) {
+ formatter->close_section();
+ formatter->flush(*out);
+ } else if (out && !(total_entries % 1000)) {
+ (*out) << " " << total_entries;
+ }
+ } // entries loop
+ }
+ }
+
+ if (verbose_json_out) {
+ formatter->close_section();
+ formatter->flush(*out);
+ } else if (out) {
+ (*out) << " " << total_entries << std::endl;
+ }
+
+ int ret = target_shards_mgr.finish();
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to reshard" << dendl;
+ return -EIO;
+ }
+ return 0;
+} // RGWBucketReshard::do_reshard
+
+int RGWBucketReshard::get_status(const DoutPrefixProvider *dpp, list<cls_rgw_bucket_instance_entry> *status)
+{
+ return store->svc()->bi_rados->get_reshard_status(dpp, bucket_info, status);
+}
+
+int RGWBucketReshard::execute(int num_shards,
+ ReshardFaultInjector& fault,
+ int max_op_entries,
+ const DoutPrefixProvider *dpp,
+ bool verbose, ostream *out,
+ Formatter *formatter,
+ RGWReshard* reshard_log)
+{
+ // take a reshard lock on the bucket
+ int ret = reshard_lock.lock(dpp);
+ if (ret < 0) {
+ return ret;
+ }
+ // unlock when scope exits
+ auto unlock = make_scope_guard([this] { reshard_lock.unlock(); });
+
+ if (reshard_log) {
+ ret = reshard_log->update(dpp, bucket_info);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+  // prepare the target index and add its layout to the bucket info
+ ret = init_reshard(store, bucket_info, bucket_attrs, fault, num_shards, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (ret = fault.check("do_reshard");
+ ret == 0) { // no fault injected, do the reshard
+ ret = do_reshard(bucket_info.layout.current_index,
+ *bucket_info.layout.target_index,
+ max_op_entries, verbose, out, formatter, dpp);
+ }
+
+ if (ret < 0) {
+ cancel_reshard(store, bucket_info, bucket_attrs, fault, dpp);
+
+ ldpp_dout(dpp, 1) << __func__ << " INFO: reshard of bucket \""
+ << bucket_info.bucket.name << "\" canceled due to errors" << dendl;
+ return ret;
+ }
+
+ ret = commit_reshard(store, bucket_info, bucket_attrs, fault, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ldpp_dout(dpp, 1) << __func__ << " INFO: reshard of bucket \""
+ << bucket_info.bucket.name << "\" completed successfully" << dendl;
+ return 0;
+} // execute
+
+bool RGWBucketReshard::can_reshard(const RGWBucketInfo& bucket,
+ const RGWSI_Zone* zone_svc)
+{
+ return !zone_svc->need_to_log_data() ||
+ bucket.layout.logs.size() < max_bilog_history;
+}
+
+
+RGWReshard::RGWReshard(rgw::sal::RadosStore* _store, bool _verbose, ostream *_out,
+ Formatter *_formatter) :
+ store(_store), instance_lock(bucket_instance_lock_name),
+ verbose(_verbose), out(_out), formatter(_formatter)
+{
+ num_logshards = store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_num_logs");
+}
+
+string RGWReshard::get_logshard_key(const string& tenant,
+ const string& bucket_name)
+{
+ return tenant + ":" + bucket_name;
+}
+
+#define MAX_RESHARD_LOGSHARDS_PRIME 7877
+
+void RGWReshard::get_bucket_logshard_oid(const string& tenant, const string& bucket_name, string *oid)
+{
+ string key = get_logshard_key(tenant, bucket_name);
+
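+  // mix the low byte into the high bits before reducing modulo a prime and
+  // then modulo the logshard count, spreading similar keys across logshards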
+ uint32_t sid = ceph_str_hash_linux(key.c_str(), key.size());
+ uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
+ sid = sid2 % MAX_RESHARD_LOGSHARDS_PRIME % num_logshards;
+
+ get_logshard_oid(int(sid), oid);
+}
+
+int RGWReshard::add(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry)
+{
+ if (!store->svc()->zone->can_reshard()) {
+ ldpp_dout(dpp, 20) << __func__ << " Resharding is disabled" << dendl;
+ return 0;
+ }
+
+ string logshard_oid;
+
+ get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid);
+
+ librados::ObjectWriteOperation op;
+ cls_rgw_reshard_add(op, entry);
+
+ int ret = rgw_rados_operate(dpp, store->getRados()->reshard_pool_ctx, logshard_oid, &op, null_yield);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to add entry to reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << " bucket=" << entry.bucket_name << dendl;
+ return ret;
+ }
+ return 0;
+}
+
+int RGWReshard::update(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info)
+{
+ cls_rgw_reshard_entry entry;
+ entry.bucket_name = bucket_info.bucket.name;
+ entry.bucket_id = bucket_info.bucket.bucket_id;
+ entry.tenant = bucket_info.owner.tenant;
+
+ int ret = get(dpp, entry);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = add(dpp, entry);
+ if (ret < 0) {
+    ldpp_dout(dpp, 0) << __func__ << ": Error updating reshard log entry for bucket " << entry.bucket_name << ": " <<
+      cpp_strerror(-ret) << dendl;
+ }
+
+ return ret;
+}
+
+
+int RGWReshard::list(const DoutPrefixProvider *dpp, int logshard_num, string& marker, uint32_t max, std::list<cls_rgw_reshard_entry>& entries, bool *is_truncated)
+{
+ string logshard_oid;
+
+ get_logshard_oid(logshard_num, &logshard_oid);
+
+ int ret = cls_rgw_reshard_list(store->getRados()->reshard_pool_ctx, logshard_oid, marker, max, entries, is_truncated);
+
+ if (ret == -ENOENT) {
+ // these shard objects aren't created until we actually write something to
+ // them, so treat ENOENT as a successful empty listing
+ *is_truncated = false;
+ ret = 0;
+ } else if (ret == -EACCES) {
+ ldpp_dout(dpp, -1) << "ERROR: access denied to pool " << store->svc()->zone->get_zone_params().reshard_pool
+ << ". Fix the pool access permissions of your client" << dendl;
+ } else if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to list reshard log entries, oid="
+ << logshard_oid << " marker=" << marker << " " << cpp_strerror(ret) << dendl;
+ }
+
+ return ret;
+}
+
+int RGWReshard::get(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry)
+{
+ string logshard_oid;
+
+ get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid);
+
+ int ret = cls_rgw_reshard_get(store->getRados()->reshard_pool_ctx, logshard_oid, entry);
+ if (ret < 0) {
+ if (ret != -ENOENT) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to get entry from reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant <<
+ " bucket=" << entry.bucket_name << dendl;
+ }
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWReshard::remove(const DoutPrefixProvider *dpp, const cls_rgw_reshard_entry& entry)
+{
+ string logshard_oid;
+
+ get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid);
+
+ librados::ObjectWriteOperation op;
+ cls_rgw_reshard_remove(op, entry);
+
+ int ret = rgw_rados_operate(dpp, store->getRados()->reshard_pool_ctx, logshard_oid, &op, null_yield);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to remove entry from reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << " bucket=" << entry.bucket_name << dendl;
+ return ret;
+ }
+
+ return ret;
+}
+
+int RGWReshard::clear_bucket_resharding(const DoutPrefixProvider *dpp, const string& bucket_instance_oid, cls_rgw_reshard_entry& entry)
+{
+ int ret = cls_rgw_clear_bucket_resharding(store->getRados()->reshard_pool_ctx, bucket_instance_oid);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to clear bucket resharding, bucket_instance_oid=" << bucket_instance_oid << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWReshardWait::wait(optional_yield y)
+{
+ std::unique_lock lock(mutex);
+
+ if (going_down) {
+ return -ECANCELED;
+ }
+
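+  // with a yield context, wait asynchronously on a timer so the thread can
+  // service other requests; otherwise block on the condition variable below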
+ if (y) {
+ auto& context = y.get_io_context();
+ auto& yield = y.get_yield_context();
+
+ Waiter waiter(context);
+ waiters.push_back(waiter);
+ lock.unlock();
+
+ waiter.timer.expires_after(duration);
+
+ boost::system::error_code ec;
+ waiter.timer.async_wait(yield[ec]);
+
+ lock.lock();
+ waiters.erase(waiters.iterator_to(waiter));
+ return -ec.value();
+ }
+
+ cond.wait_for(lock, duration);
+
+ if (going_down) {
+ return -ECANCELED;
+ }
+
+ return 0;
+}
+
+void RGWReshardWait::stop()
+{
+ std::scoped_lock lock(mutex);
+ going_down = true;
+ cond.notify_all();
+ for (auto& waiter : waiters) {
+ // unblock any waiters with ECANCELED
+ waiter.timer.cancel();
+ }
+}
+
+int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
+ int max_entries, const DoutPrefixProvider *dpp)
+{
+ ldpp_dout(dpp, 20) << __func__ << " resharding " <<
+ entry.bucket_name << dendl;
+
+ rgw_bucket bucket;
+ RGWBucketInfo bucket_info;
+ std::map<std::string, bufferlist> bucket_attrs;
+
+ int ret = store->getRados()->get_bucket_info(store->svc(),
+ entry.tenant,
+ entry.bucket_name,
+ bucket_info, nullptr,
+ null_yield, dpp,
+ &bucket_attrs);
+ if (ret < 0 || bucket_info.bucket.bucket_id != entry.bucket_id) {
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ ": Error in get_bucket_info for bucket " << entry.bucket_name <<
+ ": " << cpp_strerror(-ret) << dendl;
+ if (ret != -ENOENT) {
+ // any error other than ENOENT will abort
+ return ret;
+ }
+ } else {
+      ldpp_dout(dpp, 0) << __func__ <<
+        ": Bucket: " << entry.bucket_name <<
+        " already resharded by another process, skipping" << dendl;
+ }
+
+ // we've encountered a reshard queue entry for an apparently
+ // non-existent bucket; let's try to recover by cleaning up
+ ldpp_dout(dpp, 0) << __func__ <<
+      ": removing reshard queue entry for a resharded or non-existent bucket " <<
+ entry.bucket_name << dendl;
+
+ ret = remove(dpp, entry);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ ": Error removing non-existent bucket " <<
+ entry.bucket_name << " from resharding queue: " <<
+ cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ // we cleaned up, move on to the next entry
+ return 0;
+ }
+
+ if (!RGWBucketReshard::can_reshard(bucket_info, store->svc()->zone)) {
+ ldpp_dout(dpp, 1) << "Bucket " << bucket_info.bucket << " is not "
+ "eligible for resharding until peer zones finish syncing one "
+ "or more of its old log generations" << dendl;
+ return remove(dpp, entry);
+ }
+
+ RGWBucketReshard br(store, bucket_info, bucket_attrs, nullptr);
+
+ ReshardFaultInjector f; // no fault injected
+ ret = br.execute(entry.new_num_shards, f, max_entries, dpp,
+ false, nullptr, nullptr, this);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ ": Error during resharding bucket " << entry.bucket_name << ":" <<
+ cpp_strerror(-ret)<< dendl;
+ return ret;
+ }
+
+ ldpp_dout(dpp, 20) << __func__ <<
+ " removing reshard queue entry for bucket " << entry.bucket_name <<
+ dendl;
+
+ ret = remove(dpp, entry);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ << ": Error removing bucket " <<
+ entry.bucket_name << " from resharding queue: " <<
+ cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ return 0;
+}
+
+int RGWReshard::process_single_logshard(int logshard_num, const DoutPrefixProvider *dpp)
+{
+ string marker;
+ bool truncated = true;
+
+ constexpr uint32_t max_entries = 1000;
+
+ string logshard_oid;
+ get_logshard_oid(logshard_num, &logshard_oid);
+
+ RGWBucketReshardLock logshard_lock(store, logshard_oid, false);
+
+ int ret = logshard_lock.lock(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << __func__ << "(): failed to acquire lock on " <<
+      logshard_oid << ", ret = " << ret << dendl;
+    return ret;
+  }
+  // unlock when scope exits, including on early error returns
+  auto unlock = make_scope_guard([&logshard_lock] { logshard_lock.unlock(); });
+
+ do {
+ std::list<cls_rgw_reshard_entry> entries;
+ ret = list(dpp, logshard_num, marker, max_entries, entries, &truncated);
+ if (ret < 0) {
+ ldpp_dout(dpp, 10) << "cannot list all reshards in logshard oid=" <<
+ logshard_oid << dendl;
+ continue;
+ }
+
+    for (auto& entry : entries) { // logshard entries
+      ret = process_entry(entry, max_entries, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ Clock::time_point now = Clock::now();
+ if (logshard_lock.should_renew(now)) {
+ ret = logshard_lock.renew(now);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ entry.get_key(&marker);
+ } // entry for loop
+ } while (truncated);
+
+  return 0;
+}
+
+
+void RGWReshard::get_logshard_oid(int shard_num, string *logshard)
+{
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num);
+
+ string objname(reshard_oid_prefix);
+ *logshard = objname + buf;
+}
+
+int RGWReshard::process_all_logshards(const DoutPrefixProvider *dpp)
+{
+ int ret = 0;
+
+ for (int i = 0; i < num_logshards; i++) {
+ string logshard;
+ get_logshard_oid(i, &logshard);
+
+ ldpp_dout(dpp, 20) << "processing logshard = " << logshard << dendl;
+
+ ret = process_single_logshard(i, dpp);
+
+ ldpp_dout(dpp, 20) << "finish processing logshard = " << logshard << " , ret = " << ret << dendl;
+ }
+
+ return 0;
+}
+
+bool RGWReshard::going_down()
+{
+ return down_flag;
+}
+
+void RGWReshard::start_processor()
+{
+ worker = new ReshardWorker(store->ctx(), this);
+ worker->create("rgw_reshard");
+}
+
+void RGWReshard::stop_processor()
+{
+ down_flag = true;
+ if (worker) {
+ worker->stop();
+ worker->join();
+ }
+ delete worker;
+ worker = nullptr;
+}
+
+void *RGWReshard::ReshardWorker::entry() {
+ do {
+ utime_t start = ceph_clock_now();
+ reshard->process_all_logshards(this);
+
+ if (reshard->going_down())
+ break;
+
+ utime_t end = ceph_clock_now();
+ end -= start;
+ int secs = cct->_conf.get_val<uint64_t>("rgw_reshard_thread_interval");
+
+ if (secs <= end.sec())
+ continue; // next round
+
+ secs -= end.sec();
+
+ std::unique_lock locker{lock};
+ cond.wait_for(locker, std::chrono::seconds(secs));
+ } while (!reshard->going_down());
+
+ return NULL;
+}
+
+void RGWReshard::ReshardWorker::stop()
+{
+ std::lock_guard l{lock};
+ cond.notify_all();
+}
+
+CephContext *RGWReshard::ReshardWorker::get_cct() const
+{
+ return cct;
+}
+
+unsigned RGWReshard::ReshardWorker::get_subsys() const
+{
+ return dout_subsys;
+}
+
+std::ostream& RGWReshard::ReshardWorker::gen_prefix(std::ostream& out) const
+{
+ return out << "rgw reshard worker thread: ";
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <vector>
+#include <initializer_list>
+#include <functional>
+#include <iterator>
+#include <algorithm>
+
+#include <boost/intrusive/list.hpp>
+#include <boost/asio/basic_waitable_timer.hpp>
+
+#include "include/common_fwd.h"
+#include "include/rados/librados.hpp"
+#include "common/ceph_time.h"
+#include "common/async/yield_context.h"
+#include "cls/rgw/cls_rgw_types.h"
+#include "cls/lock/cls_lock_client.h"
+
+#include "rgw_common.h"
+#include "common/fault_injector.h"
+
+
+class RGWReshard;
+namespace rgw { namespace sal {
+ class RadosStore;
+} }
+
+using ReshardFaultInjector = FaultInjector<std::string_view>;
+
+class RGWBucketReshardLock {
+ using Clock = ceph::coarse_mono_clock;
+
+ rgw::sal::RadosStore* store;
+ const std::string lock_oid;
+ const bool ephemeral;
+ rados::cls::lock::Lock internal_lock;
+ std::chrono::seconds duration;
+
+ Clock::time_point start_time;
+ Clock::time_point renew_thresh;
+
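+  // schedule renewal at the halfway point of the lock duration, leaving the
+  // second half as slack before the lock can expire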
+ void reset_time(const Clock::time_point& now) {
+ start_time = now;
+ renew_thresh = start_time + duration / 2;
+ }
+
+public:
+ RGWBucketReshardLock(rgw::sal::RadosStore* _store,
+ const std::string& reshard_lock_oid,
+ bool _ephemeral);
+ RGWBucketReshardLock(rgw::sal::RadosStore* _store,
+ const RGWBucketInfo& bucket_info,
+ bool _ephemeral) :
+ RGWBucketReshardLock(_store, bucket_info.bucket.get_key(':'), _ephemeral)
+ {}
+
+ int lock(const DoutPrefixProvider *dpp);
+ void unlock();
+ int renew(const Clock::time_point&);
+
+ bool should_renew(const Clock::time_point& now) const {
+ return now >= renew_thresh;
+ }
+}; // class RGWBucketReshardLock
+
+class RGWBucketReshard {
+ public:
+ using Clock = ceph::coarse_mono_clock;
+
+ private:
+ rgw::sal::RadosStore *store;
+ RGWBucketInfo bucket_info;
+ std::map<std::string, bufferlist> bucket_attrs;
+
+ RGWBucketReshardLock reshard_lock;
+ RGWBucketReshardLock* outer_reshard_lock;
+
+  // using an initializer_list as an array in contiguous memory
+  // allocated all at once
+ static const std::initializer_list<uint16_t> reshard_primes;
+
+ int do_reshard(const rgw::bucket_index_layout_generation& current,
+ const rgw::bucket_index_layout_generation& target,
+ int max_entries,
+ bool verbose,
+ std::ostream *os,
+ Formatter *formatter,
+ const DoutPrefixProvider *dpp);
+public:
+
+ // pass nullptr for the final parameter if no outer reshard lock to
+ // manage
+ RGWBucketReshard(rgw::sal::RadosStore* _store,
+ const RGWBucketInfo& _bucket_info,
+ const std::map<std::string, bufferlist>& _bucket_attrs,
+ RGWBucketReshardLock* _outer_reshard_lock);
+ int execute(int num_shards, ReshardFaultInjector& f,
+ int max_op_entries, const DoutPrefixProvider *dpp,
+ bool verbose = false, std::ostream *out = nullptr,
+ ceph::Formatter *formatter = nullptr,
+ RGWReshard *reshard_log = nullptr);
+ int get_status(const DoutPrefixProvider *dpp, std::list<cls_rgw_bucket_instance_entry> *status);
+ int cancel(const DoutPrefixProvider* dpp);
+
+ static int clear_resharding(rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ std::map<std::string, bufferlist>& bucket_attrs,
+ const DoutPrefixProvider* dpp);
+
+ static uint32_t get_max_prime_shards() {
+ return *std::crbegin(reshard_primes);
+ }
+
+ // returns the prime in our list less than or equal to the
+ // parameter; the lowest value that can be returned is 1
+ static uint32_t get_prime_shards_less_or_equal(uint32_t requested_shards) {
+ auto it = std::upper_bound(reshard_primes.begin(), reshard_primes.end(),
+ requested_shards);
+ if (it == reshard_primes.begin()) {
+ return 1;
+ } else {
+ return *(--it);
+ }
+ }
+
+ // returns the prime in our list greater than or equal to the
+ // parameter; if we do not have such a prime, 0 is returned
+ static uint32_t get_prime_shards_greater_or_equal(
+ uint32_t requested_shards)
+ {
+ auto it = std::lower_bound(reshard_primes.begin(), reshard_primes.end(),
+ requested_shards);
+ if (it == reshard_primes.end()) {
+ return 0;
+ } else {
+ return *it;
+ }
+ }
+
+ // returns a preferred number of shards given a calculated number of
+ // shards based on max_dynamic_shards and the list of prime values
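+  // e.g. (hypothetical values, assuming 151 appears in reshard_primes):
+  // suggested_shards=150 with max_dynamic_shards=1999 would yield 151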
+ static uint32_t get_preferred_shards(uint32_t suggested_shards,
+ uint32_t max_dynamic_shards) {
+
+ // use a prime if max is within our prime range, otherwise use
+ // specified max
+ const uint32_t absolute_max =
+ max_dynamic_shards >= get_max_prime_shards() ?
+ max_dynamic_shards :
+ get_prime_shards_less_or_equal(max_dynamic_shards);
+
+ // if we can use a prime number, use it, otherwise use suggested;
+ // note get_prime_shards_greater_or_equal will return 0 if no prime in
+ // prime range
+ const uint32_t prime_ish_num_shards =
+ std::max(get_prime_shards_greater_or_equal(suggested_shards),
+ suggested_shards);
+
+ // dynamic sharding cannot reshard more than defined maximum
+ const uint32_t final_num_shards =
+ std::min(prime_ish_num_shards, absolute_max);
+
+ return final_num_shards;
+ }
+
+ const std::map<std::string, bufferlist>& get_bucket_attrs() const {
+ return bucket_attrs;
+ }
+
+ // for multisite, the RGWBucketInfo keeps a history of old log generations
+ // until all peers are done with them. prevent this log history from growing
+ // too large by refusing to reshard the bucket until the old logs get trimmed
+ static constexpr size_t max_bilog_history = 4;
+
+ static bool can_reshard(const RGWBucketInfo& bucket,
+ const RGWSI_Zone* zone_svc);
+}; // RGWBucketReshard
+
+
+class RGWReshard {
+public:
+ using Clock = ceph::coarse_mono_clock;
+
+private:
+ rgw::sal::RadosStore* store;
+ std::string lock_name;
+ rados::cls::lock::Lock instance_lock;
+ int num_logshards;
+
+ bool verbose;
+ std::ostream *out;
+ Formatter *formatter;
+
+ void get_logshard_oid(int shard_num, std::string *shard);
+protected:
+ class ReshardWorker : public Thread, public DoutPrefixProvider {
+ CephContext *cct;
+ RGWReshard *reshard;
+ ceph::mutex lock = ceph::make_mutex("ReshardWorker");
+ ceph::condition_variable cond;
+
+ public:
+ ReshardWorker(CephContext * const _cct,
+ RGWReshard * const _reshard)
+ : cct(_cct),
+ reshard(_reshard) {}
+
+ void *entry() override;
+ void stop();
+
+ CephContext *get_cct() const override;
+ unsigned get_subsys() const override;
+ std::ostream& gen_prefix(std::ostream& out) const override;
+ };
+
+ ReshardWorker *worker = nullptr;
+ std::atomic<bool> down_flag = { false };
+
+ std::string get_logshard_key(const std::string& tenant, const std::string& bucket_name);
+ void get_bucket_logshard_oid(const std::string& tenant, const std::string& bucket_name, std::string *oid);
+
+public:
+ RGWReshard(rgw::sal::RadosStore* _store, bool _verbose = false, std::ostream *_out = nullptr, Formatter *_formatter = nullptr);
+ int add(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry);
+ int update(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info);
+ int get(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry);
+ int remove(const DoutPrefixProvider *dpp, const cls_rgw_reshard_entry& entry);
+ int list(const DoutPrefixProvider *dpp, int logshard_num, std::string& marker, uint32_t max, std::list<cls_rgw_reshard_entry>& entries, bool *is_truncated);
+ int clear_bucket_resharding(const DoutPrefixProvider *dpp, const std::string& bucket_instance_oid, cls_rgw_reshard_entry& entry);
+
+ /* reshard thread */
+ int process_entry(const cls_rgw_reshard_entry& entry, int max_entries,
+ const DoutPrefixProvider *dpp);
+ int process_single_logshard(int logshard_num, const DoutPrefixProvider *dpp);
+ int process_all_logshards(const DoutPrefixProvider *dpp);
+ bool going_down();
+ void start_processor();
+ void stop_processor();
+};
+
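+// blocks callers that race with an in-progress reshard until it is safe to
+// retry; a minimal usage sketch (hypothetical caller):
+//   RGWReshardWait waiter;
+//   int r = waiter.wait(y);  // sleeps up to 'duration', -ECANCELED on stop()
+//   if (r == 0) { /* re-read the bucket info and retry the index op */ }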
+class RGWReshardWait {
+ public:
+ // the blocking wait uses std::condition_variable::wait_for(), which uses the
+ // std::chrono::steady_clock. use that for the async waits as well
+ using Clock = std::chrono::steady_clock;
+ private:
+ const ceph::timespan duration;
+ ceph::mutex mutex = ceph::make_mutex("RGWReshardWait::lock");
+ ceph::condition_variable cond;
+
+ struct Waiter : boost::intrusive::list_base_hook<> {
+ using Executor = boost::asio::io_context::executor_type;
+ using Timer = boost::asio::basic_waitable_timer<Clock,
+ boost::asio::wait_traits<Clock>, Executor>;
+ Timer timer;
+ explicit Waiter(boost::asio::io_context& ioc) : timer(ioc) {}
+ };
+ boost::intrusive::list<Waiter> waiters;
+
+ bool going_down{false};
+
+public:
+ RGWReshardWait(ceph::timespan duration = std::chrono::seconds(5))
+ : duration(duration) {}
+ ~RGWReshardWait() {
+ ceph_assert(going_down);
+ }
+ int wait(optional_yield y);
+ // unblock any threads waiting on reshard
+ void stop();
+};
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_op.h"
+#include "driver/rados/rgw_bucket.h"
+#include "rgw_rest_bucket.h"
+#include "rgw_sal.h"
+
+#include "include/str_list.h"
+
+#include "services/svc_sys_obj.h"
+#include "services/svc_zone.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+class RGWOp_Bucket_Info : public RGWRESTOp {
+
+public:
+ RGWOp_Bucket_Info() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("buckets", RGW_CAP_READ);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "get_bucket_info"; }
+};
+
+void RGWOp_Bucket_Info::execute(optional_yield y)
+{
+ RGWBucketAdminOpState op_state;
+
+ bool fetch_stats;
+
+ std::string bucket;
+
+ string uid_str;
+
+ RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+ rgw_user uid(uid_str);
+
+ RESTArgs::get_string(s, "bucket", bucket, &bucket);
+ RESTArgs::get_bool(s, "stats", false, &fetch_stats);
+
+ op_state.set_user_id(uid);
+ op_state.set_bucket_name(bucket);
+ op_state.set_fetch_stats(fetch_stats);
+
+ op_ret = RGWBucketAdminOp::info(driver, op_state, flusher, y, this);
+}
+
+class RGWOp_Get_Policy : public RGWRESTOp {
+
+public:
+ RGWOp_Get_Policy() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("buckets", RGW_CAP_READ);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "get_policy"; }
+};
+
+void RGWOp_Get_Policy::execute(optional_yield y)
+{
+ RGWBucketAdminOpState op_state;
+
+ std::string bucket;
+ std::string object;
+
+ RESTArgs::get_string(s, "bucket", bucket, &bucket);
+ RESTArgs::get_string(s, "object", object, &object);
+
+ op_state.set_bucket_name(bucket);
+ op_state.set_object(object);
+
+ op_ret = RGWBucketAdminOp::get_policy(driver, op_state, flusher, this);
+}
+
+class RGWOp_Check_Bucket_Index : public RGWRESTOp {
+
+public:
+ RGWOp_Check_Bucket_Index() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("buckets", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "check_bucket_index"; }
+};
+
+void RGWOp_Check_Bucket_Index::execute(optional_yield y)
+{
+ std::string bucket;
+
+ bool fix_index;
+ bool check_objects;
+
+ RGWBucketAdminOpState op_state;
+
+ RESTArgs::get_string(s, "bucket", bucket, &bucket);
+ RESTArgs::get_bool(s, "fix", false, &fix_index);
+ RESTArgs::get_bool(s, "check-objects", false, &check_objects);
+
+ op_state.set_bucket_name(bucket);
+ op_state.set_fix_index(fix_index);
+ op_state.set_check_objects(check_objects);
+
+ op_ret = RGWBucketAdminOp::check_index(driver, op_state, flusher, s->yield, s);
+}
+
+class RGWOp_Bucket_Link : public RGWRESTOp {
+
+public:
+ RGWOp_Bucket_Link() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("buckets", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "link_bucket"; }
+};
+
+void RGWOp_Bucket_Link::execute(optional_yield y)
+{
+ std::string uid_str;
+ std::string bucket;
+ std::string bucket_id;
+ std::string new_bucket_name;
+
+ RGWBucketAdminOpState op_state;
+
+ RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+ RESTArgs::get_string(s, "bucket", bucket, &bucket);
+ RESTArgs::get_string(s, "bucket-id", bucket_id, &bucket_id);
+ RESTArgs::get_string(s, "new-bucket-name", new_bucket_name, &new_bucket_name);
+
+ rgw_user uid(uid_str);
+ op_state.set_user_id(uid);
+ op_state.set_bucket_name(bucket);
+ op_state.set_bucket_id(bucket_id);
+ op_state.set_new_bucket_name(new_bucket_name);
+
+ bufferlist data;
+ op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+ return;
+ }
+ op_ret = RGWBucketAdminOp::link(driver, op_state, s);
+}
+
+class RGWOp_Bucket_Unlink : public RGWRESTOp {
+
+public:
+ RGWOp_Bucket_Unlink() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("buckets", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "unlink_bucket"; }
+};
+
+void RGWOp_Bucket_Unlink::execute(optional_yield y)
+{
+ std::string uid_str;
+ std::string bucket;
+
+ RGWBucketAdminOpState op_state;
+
+ RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+ rgw_user uid(uid_str);
+
+ RESTArgs::get_string(s, "bucket", bucket, &bucket);
+
+ op_state.set_user_id(uid);
+ op_state.set_bucket_name(bucket);
+
+ bufferlist data;
+ op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+ return;
+ }
+ op_ret = RGWBucketAdminOp::unlink(driver, op_state, s);
+}
+
+class RGWOp_Bucket_Remove : public RGWRESTOp {
+
+public:
+ RGWOp_Bucket_Remove() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("buckets", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "remove_bucket"; }
+};
+
+void RGWOp_Bucket_Remove::execute(optional_yield y)
+{
+ std::string bucket_name;
+ bool delete_children;
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+
+ RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name);
+ RESTArgs::get_bool(s, "purge-objects", false, &delete_children);
+
+ /* FIXME We're abusing the owner of the bucket to pass the user, so that it can be forwarded to
+ * the master. This user is actually the OP caller, not the bucket owner. */
+ op_ret = driver->get_bucket(s, s->user.get(), string(), bucket_name, &bucket, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 0) << "get_bucket returned ret=" << op_ret << dendl;
+ if (op_ret == -ENOENT) {
+ op_ret = -ERR_NO_SUCH_BUCKET;
+ }
+ return;
+ }
+
+ op_ret = bucket->remove_bucket(s, delete_children, true, &s->info, s->yield);
+}
+
+class RGWOp_Set_Bucket_Quota : public RGWRESTOp {
+
+public:
+ RGWOp_Set_Bucket_Quota() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("buckets", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "set_bucket_quota"; }
+};
+
+#define QUOTA_INPUT_MAX_LEN 1024
+
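+// quota parameters may arrive either as a JSON body or as query-string
+// parameters; an empty body (or chunked encoding without content) falls
+// back to the query string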
+void RGWOp_Set_Bucket_Quota::execute(optional_yield y)
+{
+ bool uid_arg_existed = false;
+ std::string uid_str;
+ RESTArgs::get_string(s, "uid", uid_str, &uid_str, &uid_arg_existed);
+ if (! uid_arg_existed) {
+ op_ret = -EINVAL;
+ return;
+ }
+ rgw_user uid(uid_str);
+ bool bucket_arg_existed = false;
+ std::string bucket_name;
+ RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name, &bucket_arg_existed);
+ if (! bucket_arg_existed) {
+ op_ret = -EINVAL;
+ return;
+ }
+
+ bool use_http_params;
+
+ if (s->content_length > 0) {
+ use_http_params = false;
+ } else {
+ const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING");
+ use_http_params = (!encoding || strcmp(encoding, "chunked") != 0);
+ }
+ RGWQuotaInfo quota;
+ if (!use_http_params) {
+ bool empty;
+ op_ret = get_json_input(driver->ctx(), s, quota, QUOTA_INPUT_MAX_LEN, &empty);
+ if (op_ret < 0) {
+ if (!empty)
+ return;
+ /* was probably chunked input, but no content provided, configure via http params */
+ use_http_params = true;
+ }
+ }
+ if (use_http_params) {
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ op_ret = driver->get_bucket(s, nullptr, uid.tenant, bucket_name, &bucket, s->yield);
+ if (op_ret < 0) {
+ return;
+ }
+ RGWQuotaInfo *old_quota = &bucket->get_info().quota;
+ int64_t old_max_size_kb = rgw_rounded_kb(old_quota->max_size);
+ int64_t max_size_kb;
+ bool has_max_size_kb = false;
+ RESTArgs::get_int64(s, "max-objects", old_quota->max_objects, "a.max_objects);
+ RESTArgs::get_int64(s, "max-size", old_quota->max_size, "a.max_size);
+ RESTArgs::get_int64(s, "max-size-kb", old_max_size_kb, &max_size_kb, &has_max_size_kb);
+ if (has_max_size_kb)
+ quota.max_size = max_size_kb * 1024;
+ RESTArgs::get_bool(s, "enabled", old_quota->enabled, "a.enabled);
+ }
+
+ RGWBucketAdminOpState op_state;
+ op_state.set_user_id(uid);
+ op_state.set_bucket_name(bucket_name);
+ op_state.set_quota(quota);
+
+ op_ret = RGWBucketAdminOp::set_quota(driver, op_state, s);
+}
+
+class RGWOp_Sync_Bucket : public RGWRESTOp {
+
+public:
+ RGWOp_Sync_Bucket() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("buckets", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "sync_bucket"; }
+};
+
+void RGWOp_Sync_Bucket::execute(optional_yield y)
+{
+ std::string bucket;
+ std::string tenant;
+ bool sync_bucket;
+
+ RGWBucketAdminOpState op_state;
+ RESTArgs::get_string(s, "bucket", bucket, &bucket);
+ RESTArgs::get_string(s, "tenant", tenant, &tenant);
+ RESTArgs::get_bool(s, "sync", true, &sync_bucket);
+
+ op_state.set_bucket_name(bucket);
+ op_state.set_tenant(tenant);
+ op_state.set_sync_bucket(sync_bucket);
+
+ op_ret = RGWBucketAdminOp::sync_bucket(driver, op_state, s);
+}
+
+class RGWOp_Object_Remove: public RGWRESTOp {
+
+public:
+ RGWOp_Object_Remove() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("buckets", RGW_CAP_WRITE);
+ }
+
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "remove_object"; }
+};
+
+void RGWOp_Object_Remove::execute(optional_yield y)
+{
+ std::string bucket;
+ std::string object;
+
+ RGWBucketAdminOpState op_state;
+
+ RESTArgs::get_string(s, "bucket", bucket, &bucket);
+ RESTArgs::get_string(s, "object", object, &object);
+
+ op_state.set_bucket_name(bucket);
+ op_state.set_object(object);
+
+ op_ret = RGWBucketAdminOp::remove_object(driver, op_state, s);
+}
+
+
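+// dispatch admin bucket requests by HTTP method and subresource, e.g.
+// (assuming the default /admin/bucket entry point):
+//   GET ?policy -> RGWOp_Get_Policy, GET ?index -> RGWOp_Check_Bucket_Index,
+//   PUT ?quota -> RGWOp_Set_Bucket_Quota, DELETE ?object -> RGWOp_Object_Remove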
+RGWOp *RGWHandler_Bucket::op_get()
+{
+
+ if (s->info.args.sub_resource_exists("policy"))
+ return new RGWOp_Get_Policy;
+
+ if (s->info.args.sub_resource_exists("index"))
+ return new RGWOp_Check_Bucket_Index;
+
+ return new RGWOp_Bucket_Info;
+}
+
+RGWOp *RGWHandler_Bucket::op_put()
+{
+ if (s->info.args.sub_resource_exists("quota"))
+ return new RGWOp_Set_Bucket_Quota;
+
+ if (s->info.args.sub_resource_exists("sync"))
+ return new RGWOp_Sync_Bucket;
+
+ return new RGWOp_Bucket_Link;
+}
+
+RGWOp *RGWHandler_Bucket::op_post()
+{
+ return new RGWOp_Bucket_Unlink;
+}
+
+RGWOp *RGWHandler_Bucket::op_delete()
+{
+ if (s->info.args.sub_resource_exists("object"))
+ return new RGWOp_Object_Remove;
+
+ return new RGWOp_Bucket_Remove;
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+
+
+class RGWHandler_Bucket : public RGWHandler_Auth_S3 {
+protected:
+ RGWOp *op_get() override;
+ RGWOp *op_put() override;
+ RGWOp *op_post() override;
+ RGWOp *op_delete() override;
+public:
+ using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+ ~RGWHandler_Bucket() override = default;
+
+ int read_permissions(RGWOp*, optional_yield y) override {
+ return 0;
+ }
+};
+
+class RGWRESTMgr_Bucket : public RGWRESTMgr {
+public:
+ RGWRESTMgr_Bucket() = default;
+ ~RGWRESTMgr_Bucket() override = default;
+
+ RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
+ req_state*,
+ const rgw::auth::StrategyRegistry& auth_registry,
+ const std::string&) override {
+ return new RGWHandler_Bucket(auth_registry);
+ }
+};
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 eNovance SAS <licensing@enovance.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/ceph_json.h"
+#include "common/strtol.h"
+#include "rgw_rest.h"
+#include "rgw_op.h"
+#include "rgw_rest_s3.h"
+#include "rgw_rest_log.h"
+#include "rgw_client_io.h"
+#include "rgw_sync.h"
+#include "rgw_data_sync.h"
+#include "rgw_common.h"
+#include "rgw_zone.h"
+#include "rgw_mdlog.h"
+#include "rgw_datalog_notify.h"
+#include "rgw_trim_bilog.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_mdlog.h"
+#include "services/svc_bilog_rados.h"
+
+#include "common/errno.h"
+#include "include/ceph_assert.h"
+
+#define dout_context g_ceph_context
+#define LOG_CLASS_LIST_MAX_ENTRIES (1000)
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
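+// lists mdlog entries for one shard, e.g. (assuming the default admin entry
+// point) GET /admin/log?type=metadata&id=<shard>[&period=...][&marker=...]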
+void RGWOp_MDLog_List::execute(optional_yield y) {
+ string period = s->info.args.get("period");
+ string shard = s->info.args.get("id");
+ string max_entries_str = s->info.args.get("max-entries");
+ string marker = s->info.args.get("marker"),
+ err;
+ void *handle;
+ unsigned shard_id, max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
+
+ if (s->info.args.exists("start-time") ||
+ s->info.args.exists("end-time")) {
+ ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ if (!max_entries_str.empty()) {
+ max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing max-entries " << max_entries_str << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ if (max_entries > LOG_CLASS_LIST_MAX_ENTRIES) {
+ max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
+ }
+ }
+
+ if (period.empty()) {
+ ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
+ period = driver->get_zone()->get_current_period_id();
+ if (period.empty()) {
+ ldpp_dout(this, 5) << "Missing period id" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ }
+
+ RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
+
+ meta_log.init_list_entries(shard_id, {}, {}, marker, &handle);
+
+ op_ret = meta_log.list_entries(this, handle, max_entries, entries,
+ &last_marker, &truncated);
+
+ meta_log.complete_list_entries(handle);
+}
+
+void RGWOp_MDLog_List::send_response() {
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ if (op_ret < 0)
+ return;
+
+ s->formatter->open_object_section("log_entries");
+ s->formatter->dump_string("marker", last_marker);
+ s->formatter->dump_bool("truncated", truncated);
+ {
+ s->formatter->open_array_section("entries");
+ for (list<cls_log_entry>::iterator iter = entries.begin();
+ iter != entries.end(); ++iter) {
+ cls_log_entry& entry = *iter;
+ static_cast<rgw::sal::RadosStore*>(driver)->ctl()->meta.mgr->dump_log_entry(entry, s->formatter);
+ flusher.flush();
+ }
+ s->formatter->close_section();
+ }
+ s->formatter->close_section();
+ flusher.flush();
+}
+
+void RGWOp_MDLog_Info::execute(optional_yield y) {
+ num_objects = s->cct->_conf->rgw_md_log_max_shards;
+ period = static_cast<rgw::sal::RadosStore*>(driver)->svc()->mdlog->read_oldest_log_period(y, s);
+ op_ret = period.get_error();
+}
+
+void RGWOp_MDLog_Info::send_response() {
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ s->formatter->open_object_section("mdlog");
+ s->formatter->dump_unsigned("num_objects", num_objects);
+ if (period) {
+ s->formatter->dump_string("period", period.get_period().get_id());
+ s->formatter->dump_unsigned("realm_epoch", period.get_epoch());
+ }
+ s->formatter->close_section();
+ flusher.flush();
+}
+
+void RGWOp_MDLog_ShardInfo::execute(optional_yield y) {
+ string period = s->info.args.get("period");
+ string shard = s->info.args.get("id");
+ string err;
+
+ unsigned shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ if (period.empty()) {
+ ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
+ period = driver->get_zone()->get_current_period_id();
+
+ if (period.empty()) {
+ ldpp_dout(this, 5) << "Missing period id" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ }
+ RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
+
+ op_ret = meta_log.get_info(this, shard_id, &info);
+}
+
+void RGWOp_MDLog_ShardInfo::send_response() {
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ encode_json("info", info, s->formatter);
+ flusher.flush();
+}
+
+void RGWOp_MDLog_Delete::execute(optional_yield y) {
+ string marker = s->info.args.get("marker"),
+ period = s->info.args.get("period"),
+ shard = s->info.args.get("id"),
+ err;
+ unsigned shard_id;
+
+  if (s->info.args.exists("start-time") ||
+      s->info.args.exists("end-time")) {
+    ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  if (s->info.args.exists("start-marker")) {
+    ldpp_dout(this, 5) << "start-marker is no longer accepted" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  if (s->info.args.exists("end-marker")) {
+    if (!s->info.args.exists("marker")) {
+      marker = s->info.args.get("end-marker");
+    } else {
+      ldpp_dout(this, 5) << "end-marker and marker cannot both be provided" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+  }
+
+  op_ret = 0;
+
+ shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ if (marker.empty()) { /* bounding end */
+ op_ret = -EINVAL;
+ return;
+ }
+
+ if (period.empty()) {
+ ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
+ period = driver->get_zone()->get_current_period_id();
+
+ if (period.empty()) {
+ ldpp_dout(this, 5) << "Missing period id" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ }
+ RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
+
+ op_ret = meta_log.trim(this, shard_id, {}, {}, {}, marker);
+}
+
+void RGWOp_MDLog_Lock::execute(optional_yield y) {
+ string period, shard_id_str, duration_str, locker_id, zone_id;
+ unsigned shard_id;
+
+ op_ret = 0;
+
+ period = s->info.args.get("period");
+ shard_id_str = s->info.args.get("id");
+ duration_str = s->info.args.get("length");
+ locker_id = s->info.args.get("locker-id");
+ zone_id = s->info.args.get("zone-id");
+
+ if (period.empty()) {
+ ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
+ period = driver->get_zone()->get_current_period_id();
+ }
+
+ if (period.empty() ||
+ shard_id_str.empty() ||
+ (duration_str.empty()) ||
+ locker_id.empty() ||
+ zone_id.empty()) {
+ ldpp_dout(this, 5) << "Error invalid parameter list" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ string err;
+ shard_id = (unsigned)strict_strtol(shard_id_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing shard_id param " << shard_id_str << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
+ unsigned dur;
+ dur = (unsigned)strict_strtol(duration_str.c_str(), 10, &err);
+ if (!err.empty() || dur <= 0) {
+ ldpp_dout(this, 5) << "invalid length param " << duration_str << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ op_ret = meta_log.lock_exclusive(s, shard_id, make_timespan(dur), zone_id,
+ locker_id);
+ if (op_ret == -EBUSY)
+ op_ret = -ERR_LOCKED;
+}
+
+void RGWOp_MDLog_Unlock::execute(optional_yield y) {
+ string period, shard_id_str, locker_id, zone_id;
+ unsigned shard_id;
+
+ op_ret = 0;
+
+ period = s->info.args.get("period");
+ shard_id_str = s->info.args.get("id");
+ locker_id = s->info.args.get("locker-id");
+ zone_id = s->info.args.get("zone-id");
+
+ if (period.empty()) {
+ ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
+ period = driver->get_zone()->get_current_period_id();
+ }
+
+ if (period.empty() ||
+ shard_id_str.empty() ||
+ locker_id.empty() ||
+ zone_id.empty()) {
+ ldpp_dout(this, 5) << "Error invalid parameter list" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ string err;
+ shard_id = (unsigned)strict_strtol(shard_id_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing shard_id param " << shard_id_str << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
+ op_ret = meta_log.unlock(s, shard_id, zone_id, locker_id);
+}
+
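+// The mdlog notify POST body is a JSON array of updated shard ids, e.g.
+// [0, 3, 7]; it is decoded into a set<int> and each listed shard of the
+// metadata sync log is woken up.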
+void RGWOp_MDLog_Notify::execute(optional_yield y) {
+#define LARGE_ENOUGH_BUF (128 * 1024)
+
+ int r = 0;
+ bufferlist data;
+ std::tie(r, data) = read_all_input(s, LARGE_ENOUGH_BUF);
+ if (r < 0) {
+ op_ret = r;
+ return;
+ }
+
+ char* buf = data.c_str();
+ ldpp_dout(this, 20) << __func__ << "(): read data: " << buf << dendl;
+
+ JSONParser p;
+ r = p.parse(buf, data.length());
+ if (r < 0) {
+ ldpp_dout(this, 0) << "ERROR: failed to parse JSON" << dendl;
+ op_ret = r;
+ return;
+ }
+
+ set<int> updated_shards;
+ try {
+ decode_json_obj(updated_shards, &p);
+ } catch (JSONDecoder::err& err) {
+ ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ if (driver->ctx()->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+ for (set<int>::iterator iter = updated_shards.begin(); iter != updated_shards.end(); ++iter) {
+ ldpp_dout(this, 20) << __func__ << "(): updated shard=" << *iter << dendl;
+ }
+ }
+
+ driver->wakeup_meta_sync_shards(updated_shards);
+
+ op_ret = 0;
+}
+
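+// Bucket index log listing streams its output: the first send_response()
+// call emits the HTTP header and opens the entries array, each batch
+// returned by log_list() is flushed through send_response(entries, marker),
+// and send_response_end() closes the sections once the listing is exhausted
+// or max-entries is reached.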
+void RGWOp_BILog_List::execute(optional_yield y) {
+ bool gen_specified = false;
+ string tenant_name = s->info.args.get("tenant"),
+ bucket_name = s->info.args.get("bucket"),
+ marker = s->info.args.get("marker"),
+ max_entries_str = s->info.args.get("max-entries"),
+ bucket_instance = s->info.args.get("bucket-instance"),
+ gen_str = s->info.args.get("generation", &gen_specified),
+ format_version_str = s->info.args.get("format-ver");
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name));
+
+ unsigned max_entries;
+
+ if (bucket_name.empty() && bucket_instance.empty()) {
+ ldpp_dout(this, 5) << "ERROR: neither bucket nor bucket instance specified" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ string err;
+ std::optional<uint64_t> gen;
+ if (gen_specified) {
+ gen = strict_strtoll(gen_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(s, 5) << "Error parsing generation param " << gen_str << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ }
+
+ if (!format_version_str.empty()) {
+ format_ver = strict_strtoll(format_version_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(s, 5) << "Failed to parse format-ver param: " << format_ver << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ }
+
+ int shard_id;
+ string bn;
+ op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id);
+ if (op_ret < 0) {
+ return;
+ }
+
+ if (!bucket_instance.empty()) {
+ b.name = bn;
+ b.bucket_id = bucket_instance;
+ }
+ op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl;
+ return;
+ }
+
+ const auto& logs = bucket->get_info().layout.logs;
+ if (logs.empty()) {
+ ldpp_dout(s, 5) << "ERROR: bucket=" << bucket_name << " has no log layouts" << dendl;
+ op_ret = -ENOENT;
+ return;
+ }
+
+ auto log = std::prev(logs.end());
+ if (gen) {
+ log = std::find_if(logs.begin(), logs.end(), rgw::matches_gen(*gen));
+ if (log == logs.end()) {
+ ldpp_dout(s, 5) << "ERROR: no log layout with gen=" << *gen << dendl;
+ op_ret = -ENOENT;
+ return;
+ }
+ }
+ if (auto next = std::next(log); next != logs.end()) {
+ next_log_layout = *next; // get the next log after the current latest
+ }
+ auto& log_layout = *log; // current log layout for log listing
+
+ unsigned count = 0;
+
+ max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err);
+ if (!err.empty())
+ max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
+
+ send_response();
+ do {
+ list<rgw_bi_log_entry> entries;
+ int ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->bilog_rados->log_list(s, bucket->get_info(), log_layout, shard_id,
+ marker, max_entries - count,
+ entries, &truncated);
+ if (ret < 0) {
+ ldpp_dout(this, 5) << "ERROR: list_bi_log_entries()" << dendl;
+ return;
+ }
+
+ count += entries.size();
+
+ send_response(entries, marker);
+ } while (truncated && count < max_entries);
+
+ send_response_end();
+}
+
+void RGWOp_BILog_List::send_response() {
+ if (sent_header)
+ return;
+
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ sent_header = true;
+
+ if (op_ret < 0)
+ return;
+
+ if (format_ver >= 2) {
+ s->formatter->open_object_section("result");
+ }
+
+ s->formatter->open_array_section("entries");
+}
+
+void RGWOp_BILog_List::send_response(list<rgw_bi_log_entry>& entries, string& marker)
+{
+ for (list<rgw_bi_log_entry>::iterator iter = entries.begin(); iter != entries.end(); ++iter) {
+ rgw_bi_log_entry& entry = *iter;
+ encode_json("entry", entry, s->formatter);
+
+ marker = entry.id;
+ flusher.flush();
+ }
+}
+
+void RGWOp_BILog_List::send_response_end() {
+ s->formatter->close_section();
+
+ if (format_ver >= 2) {
+ encode_json("truncated", truncated, s->formatter);
+
+ if (next_log_layout) {
+ s->formatter->open_object_section("next_log");
+ encode_json("generation", next_log_layout->gen, s->formatter);
+ encode_json("num_shards", next_log_layout->layout.in_index.layout.num_shards, s->formatter);
+ s->formatter->close_section(); // next_log
+ }
+
+ s->formatter->close_section(); // result
+ }
+
+ flusher.flush();
+}
+
+void RGWOp_BILog_Info::execute(optional_yield y) {
+ string tenant_name = s->info.args.get("tenant"),
+ bucket_name = s->info.args.get("bucket"),
+ bucket_instance = s->info.args.get("bucket-instance");
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name));
+
+ if (bucket_name.empty() && bucket_instance.empty()) {
+ ldpp_dout(this, 5) << "ERROR: neither bucket nor bucket instance specified" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ int shard_id;
+ string bn;
+ op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id);
+ if (op_ret < 0) {
+ return;
+ }
+
+ if (!bucket_instance.empty()) {
+ b.name = bn;
+ b.bucket_id = bucket_instance;
+ }
+ op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl;
+ return;
+ }
+
+ const auto& logs = bucket->get_info().layout.logs;
+ if (logs.empty()) {
+ ldpp_dout(s, 5) << "ERROR: bucket=" << bucket_name << " has no log layouts" << dendl;
+ op_ret = -ENOENT;
+ return;
+ }
+
+ map<RGWObjCategory, RGWStorageStats> stats;
+ const auto& index = log_to_index_layout(logs.back());
+
+ int ret = bucket->read_stats(s, index, shard_id, &bucket_ver, &master_ver, stats, &max_marker, &syncstopped);
+ if (ret < 0 && ret != -ENOENT) {
+ op_ret = ret;
+ return;
+ }
+
+ oldest_gen = logs.front().gen;
+ latest_gen = logs.back().gen;
+
+ for (auto& log : logs) {
+ uint32_t num_shards = log.layout.in_index.layout.num_shards;
+ generations.push_back({log.gen, num_shards});
+ }
+}
+
+void RGWOp_BILog_Info::send_response() {
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ if (op_ret < 0)
+ return;
+
+ s->formatter->open_object_section("info");
+ encode_json("bucket_ver", bucket_ver, s->formatter);
+ encode_json("master_ver", master_ver, s->formatter);
+ encode_json("max_marker", max_marker, s->formatter);
+ encode_json("syncstopped", syncstopped, s->formatter);
+ encode_json("oldest_gen", oldest_gen, s->formatter);
+ encode_json("latest_gen", latest_gen, s->formatter);
+ encode_json("generations", generations, s->formatter);
+ s->formatter->close_section();
+
+ flusher.flush();
+}
+
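+// Bucket index log trimming requires an end-marker; when no 'generation'
+// arg is supplied, generation 0 is trimmed.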
+void RGWOp_BILog_Delete::execute(optional_yield y) {
+ bool gen_specified = false;
+ string tenant_name = s->info.args.get("tenant"),
+ bucket_name = s->info.args.get("bucket"),
+ start_marker = s->info.args.get("start-marker"),
+ end_marker = s->info.args.get("end-marker"),
+ bucket_instance = s->info.args.get("bucket-instance"),
+ gen_str = s->info.args.get("generation", &gen_specified);
+
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name));
+
+ op_ret = 0;
+ if ((bucket_name.empty() && bucket_instance.empty()) ||
+ end_marker.empty()) {
+ ldpp_dout(this, 5) << "ERROR: one of bucket or bucket instance, and also end-marker is mandatory" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ string err;
+ uint64_t gen = 0;
+ if (gen_specified) {
+ gen = strict_strtoll(gen_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(s, 5) << "Error parsing generation param " << gen_str << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ }
+
+ int shard_id;
+ string bn;
+ op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id);
+ if (op_ret < 0) {
+ return;
+ }
+
+ if (!bucket_instance.empty()) {
+ b.name = bn;
+ b.bucket_id = bucket_instance;
+ }
+ op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl;
+ return;
+ }
+
+ op_ret = bilog_trim(this, static_cast<rgw::sal::RadosStore*>(driver),
+ bucket->get_info(), gen, shard_id,
+ start_marker, end_marker);
+ if (op_ret < 0) {
+ ldpp_dout(s, 5) << "bilog_trim failed with op_ret=" << op_ret << dendl;
+ }
+
+ return;
+}
+
+void RGWOp_DATALog_List::execute(optional_yield y) {
+ string shard = s->info.args.get("id");
+
+ string max_entries_str = s->info.args.get("max-entries"),
+ marker = s->info.args.get("marker"),
+ err;
+ unsigned shard_id, max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
+
+ if (s->info.args.exists("start-time") ||
+ s->info.args.exists("end-time")) {
+ ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl;
+ op_ret = -EINVAL;
+ }
+
+ s->info.args.get_bool("extra-info", &extra_info, false);
+
+ shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ if (!max_entries_str.empty()) {
+ max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing max-entries " << max_entries_str << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ if (max_entries > LOG_CLASS_LIST_MAX_ENTRIES) {
+ max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
+ }
+ }
+
+ // Note that last_marker is updated to be the marker of the last
+ // entry listed
+ op_ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->datalog_rados->list_entries(this, shard_id,
+ max_entries, entries,
+ marker, &last_marker,
+ &truncated);
+}
+
+void RGWOp_DATALog_List::send_response() {
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ if (op_ret < 0)
+ return;
+
+ s->formatter->open_object_section("log_entries");
+ s->formatter->dump_string("marker", last_marker);
+ s->formatter->dump_bool("truncated", truncated);
+ {
+ s->formatter->open_array_section("entries");
+ for (const auto& entry : entries) {
+ if (!extra_info) {
+ encode_json("entry", entry.entry, s->formatter);
+ } else {
+ encode_json("entry", entry, s->formatter);
+ }
+ flusher.flush();
+ }
+ s->formatter->close_section();
+ }
+ s->formatter->close_section();
+ flusher.flush();
+}
+
+
+void RGWOp_DATALog_Info::execute(optional_yield y) {
+ num_objects = s->cct->_conf->rgw_data_log_num_shards;
+ op_ret = 0;
+}
+
+void RGWOp_DATALog_Info::send_response() {
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ s->formatter->open_object_section("num_objects");
+ s->formatter->dump_unsigned("num_objects", num_objects);
+ s->formatter->close_section();
+ flusher.flush();
+}
+
+void RGWOp_DATALog_ShardInfo::execute(optional_yield y) {
+ string shard = s->info.args.get("id");
+ string err;
+
+ unsigned shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ op_ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->datalog_rados->get_info(this, shard_id, &info);
+}
+
+void RGWOp_DATALog_ShardInfo::send_response() {
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ encode_json("info", info, s->formatter);
+ flusher.flush();
+}
+
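+// Legacy (v1) datalog notify: the POST body is decoded through
+// rgw_data_notify_v1_decoder into a map of updated shard id -> changed
+// bucket-shard entries; the v1 wire format predates per-entry log
+// generations.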
+void RGWOp_DATALog_Notify::execute(optional_yield y) {
+ string source_zone = s->info.args.get("source-zone");
+#define LARGE_ENOUGH_BUF (128 * 1024)
+
+ int r = 0;
+ bufferlist data;
+ std::tie(r, data) = read_all_input(s, LARGE_ENOUGH_BUF);
+ if (r < 0) {
+ op_ret = r;
+ return;
+ }
+
+ char* buf = data.c_str();
+ ldpp_dout(this, 20) << __func__ << "(): read data: " << buf << dendl;
+
+ JSONParser p;
+ r = p.parse(buf, data.length());
+ if (r < 0) {
+ ldpp_dout(this, 0) << "ERROR: failed to parse JSON" << dendl;
+ op_ret = r;
+ return;
+ }
+
+ bc::flat_map<int, bc::flat_set<rgw_data_notify_entry>> updated_shards;
+ try {
+ auto decoder = rgw_data_notify_v1_decoder{updated_shards};
+ decode_json_obj(decoder, &p);
+ } catch (JSONDecoder::err& err) {
+ ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ if (driver->ctx()->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+ for (bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >::iterator iter = updated_shards.begin(); iter != updated_shards.end(); ++iter) {
+ ldpp_dout(this, 20) << __func__ << "(): updated shard=" << iter->first << dendl;
+ bc::flat_set<rgw_data_notify_entry>& entries = iter->second;
+ for (const auto& [key, gen] : entries) {
+ ldpp_dout(this, 20) << __func__ << "(): modified key=" << key
+ << " of gen=" << gen << dendl;
+ }
+ }
+ }
+
+ driver->wakeup_data_sync_shards(this, source_zone, updated_shards);
+
+ op_ret = 0;
+}
+
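+// v2 datalog notify: the body decodes directly into
+// flat_map<int, flat_set<rgw_data_notify_entry>>, so each entry carries the
+// bucket-shard key together with its log generation. An illustrative
+// (assumed) payload: {"0": [{"key": "bucket:instance:0", "gen": 1}]}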
+void RGWOp_DATALog_Notify2::execute(optional_yield y) {
+ string source_zone = s->info.args.get("source-zone");
+#define LARGE_ENOUGH_BUF (128 * 1024)
+
+ int r = 0;
+ bufferlist data;
+ std::tie(r, data) = rgw_rest_read_all_input(s, LARGE_ENOUGH_BUF);
+ if (r < 0) {
+ op_ret = r;
+ return;
+ }
+
+ char* buf = data.c_str();
+  ldpp_dout(this, 20) << __func__ << "(): read data: " << buf << dendl;
+
+ JSONParser p;
+ r = p.parse(buf, data.length());
+ if (r < 0) {
+    ldpp_dout(this, 0) << "ERROR: failed to parse JSON" << dendl;
+ op_ret = r;
+ return;
+ }
+
+ bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> > updated_shards;
+ try {
+ decode_json_obj(updated_shards, &p);
+ } catch (JSONDecoder::err& err) {
+ ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ if (driver->ctx()->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+ for (bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >::iterator iter =
+ updated_shards.begin(); iter != updated_shards.end(); ++iter) {
+ ldpp_dout(this, 20) << __func__ << "(): updated shard=" << iter->first << dendl;
+ bc::flat_set<rgw_data_notify_entry>& entries = iter->second;
+ for (const auto& [key, gen] : entries) {
+ ldpp_dout(this, 20) << __func__ << "(): modified key=" << key <<
+ " of generation=" << gen << dendl;
+ }
+ }
+ }
+
+ driver->wakeup_data_sync_shards(this, source_zone, updated_shards);
+
+ op_ret = 0;
+}
+
+void RGWOp_DATALog_Delete::execute(optional_yield y) {
+ string marker = s->info.args.get("marker"),
+ shard = s->info.args.get("id"),
+ err;
+ unsigned shard_id;
+
+ op_ret = 0;
+
+ if (s->info.args.exists("start-time") ||
+ s->info.args.exists("end-time")) {
+ ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl;
+ op_ret = -EINVAL;
+ }
+
+ if (s->info.args.exists("start-marker")) {
+ ldpp_dout(this, 5) << "start-marker is no longer accepted" << dendl;
+ op_ret = -EINVAL;
+ }
+
+ if (s->info.args.exists("end-marker")) {
+ if (!s->info.args.exists("marker")) {
+ marker = s->info.args.get("end-marker");
+ } else {
+ ldpp_dout(this, 5) << "end-marker and marker cannot both be provided" << dendl;
+ op_ret = -EINVAL;
+ }
+ }
+
+ shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ if (marker.empty()) { /* bounding end */
+ op_ret = -EINVAL;
+ return;
+ }
+
+ op_ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->datalog_rados->trim_entries(this, shard_id, marker);
+}
+
+// not in header to avoid pulling in rgw_sync.h
+class RGWOp_MDLog_Status : public RGWRESTOp {
+ rgw_meta_sync_status status;
+public:
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("mdlog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override { return "get_metadata_log_status"; }
+};
+
+void RGWOp_MDLog_Status::execute(optional_yield y)
+{
+ auto sync = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->get_meta_sync_manager();
+ if (sync == nullptr) {
+ ldpp_dout(this, 1) << "no sync manager" << dendl;
+ op_ret = -ENOENT;
+ return;
+ }
+ op_ret = sync->read_sync_status(this, &status);
+}
+
+void RGWOp_MDLog_Status::send_response()
+{
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ if (op_ret >= 0) {
+ encode_json("status", status, s->formatter);
+ }
+ flusher.flush();
+}
+
+// not in header to avoid pulling in rgw_data_sync.h
+class RGWOp_BILog_Status : public RGWRESTOp {
+ bilog_status_v2 status;
+ int version = 1;
+public:
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("bilog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override { return "get_bucket_index_log_status"; }
+};
+
+void RGWOp_BILog_Status::execute(optional_yield y)
+{
+ const auto options = s->info.args.get("options");
+ bool merge = (options == "merge");
+ const auto source_zone = s->info.args.get("source-zone");
+ const auto source_key = s->info.args.get("source-bucket");
+ auto key = s->info.args.get("bucket");
+ op_ret = s->info.args.get_int("version", &version, 1);
+
+ if (key.empty()) {
+ key = source_key;
+ }
+ if (key.empty()) {
+ ldpp_dout(this, 4) << "no 'bucket' provided" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ rgw_bucket b;
+ int shard_id{-1}; // unused
+ op_ret = rgw_bucket_parse_bucket_key(s->cct, key, &b, &shard_id);
+ if (op_ret < 0) {
+ ldpp_dout(this, 4) << "invalid 'bucket' provided" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ // read the bucket instance info for num_shards
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 4) << "failed to read bucket info: " << cpp_strerror(op_ret) << dendl;
+ return;
+ }
+
+ rgw_bucket source_bucket;
+
+ if (source_key.empty() ||
+ source_key == key) {
+ source_bucket = bucket->get_key();
+ } else {
+ op_ret = rgw_bucket_parse_bucket_key(s->cct, source_key, &source_bucket, nullptr);
+ if (op_ret < 0) {
+ ldpp_dout(this, 4) << "invalid 'source-bucket' provided (key=" << source_key << ")" << dendl;
+ return;
+ }
+ }
+
+ const auto& local_zone_id = driver->get_zone()->get_id();
+
+ if (!merge) {
+ rgw_sync_bucket_pipe pipe;
+ pipe.source.zone = source_zone;
+ pipe.source.bucket = source_bucket;
+ pipe.dest.zone = local_zone_id;
+ pipe.dest.bucket = bucket->get_key();
+
+ ldpp_dout(this, 20) << "RGWOp_BILog_Status::execute(optional_yield y): getting sync status for pipe=" << pipe << dendl;
+
+ op_ret = rgw_read_bucket_full_sync_status(
+ this,
+ static_cast<rgw::sal::RadosStore*>(driver),
+ pipe,
+ &status.sync_status,
+ s->yield);
+ if (op_ret < 0) {
+ ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_full_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl;
+ return;
+ }
+ status.inc_status.resize(status.sync_status.shards_done_with_gen.size());
+
+ op_ret = rgw_read_bucket_inc_sync_status(
+ this,
+ static_cast<rgw::sal::RadosStore*>(driver),
+ pipe,
+ status.sync_status.incremental_gen,
+ &status.inc_status);
+ if (op_ret < 0) {
+ ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_inc_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl;
+ }
+ return;
+ }
+
+ rgw_zone_id source_zone_id(source_zone);
+
+ RGWBucketSyncPolicyHandlerRef source_handler;
+ op_ret = driver->get_sync_policy_handler(s, source_zone_id, source_bucket, &source_handler, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, -1) << "could not get bucket sync policy handler (r=" << op_ret << ")" << dendl;
+ return;
+ }
+
+ auto local_dests = source_handler->get_all_dests_in_zone(local_zone_id);
+
+ std::vector<rgw_bucket_shard_sync_info> current_status;
+ for (auto& entry : local_dests) {
+ auto pipe = entry.second;
+
+ ldpp_dout(this, 20) << "RGWOp_BILog_Status::execute(optional_yield y): getting sync status for pipe=" << pipe << dendl;
+
+ RGWBucketInfo *pinfo = &bucket->get_info();
+ std::optional<RGWBucketInfo> opt_dest_info;
+
+ if (!pipe.dest.bucket) {
+ /* Uh oh, something went wrong */
+ ldpp_dout(this, 20) << "ERROR: RGWOp_BILog_Status::execute(optional_yield y): BUG: pipe.dest.bucket was not initialized" << pipe << dendl;
+ op_ret = -EIO;
+ return;
+ }
+
+ if (*pipe.dest.bucket != pinfo->bucket) {
+ opt_dest_info.emplace();
+ std::unique_ptr<rgw::sal::Bucket> dest_bucket;
+ op_ret = driver->get_bucket(s, nullptr, *pipe.dest.bucket, &dest_bucket, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 4) << "failed to read target bucket info (bucket=: " << cpp_strerror(op_ret) << dendl;
+ return;
+ }
+
+ *opt_dest_info = dest_bucket->get_info();
+ pinfo = &(*opt_dest_info);
+ pipe.dest.bucket = pinfo->bucket;
+ }
+
+ op_ret = rgw_read_bucket_full_sync_status(
+ this,
+ static_cast<rgw::sal::RadosStore*>(driver),
+ pipe,
+ &status.sync_status,
+ s->yield);
+ if (op_ret < 0) {
+ ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_full_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl;
+ return;
+ }
+
+ current_status.resize(status.sync_status.shards_done_with_gen.size());
+ int r = rgw_read_bucket_inc_sync_status(this, static_cast<rgw::sal::RadosStore*>(driver),
+                                            pipe, status.sync_status.incremental_gen, &current_status);
+ if (r < 0) {
+ ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_inc_sync_status() on pipe=" << pipe << " returned ret=" << r << dendl;
+ op_ret = r;
+ return;
+ }
+
+ if (status.inc_status.empty()) {
+ status.inc_status = std::move(current_status);
+ } else {
+ if (current_status.size() != status.inc_status.size()) {
+ op_ret = -EINVAL;
+ ldpp_dout(this, -1) << "ERROR: different number of shards for sync status of buckets "
+ "syncing from the same source: status.size()= "
+ << status.inc_status.size()
+ << " current_status.size()="
+ << current_status.size() << dendl;
+ return;
+ }
+ auto m = status.inc_status.begin();
+ for (auto& cur_shard_status : current_status) {
+ auto& result_shard_status = *m++;
+ // always take the first marker, or any later marker that's smaller
+ if (cur_shard_status.inc_marker.position < result_shard_status.inc_marker.position) {
+ result_shard_status = std::move(cur_shard_status);
+ }
+ }
+ }
+ }
+}
+
+void RGWOp_BILog_Status::send_response()
+{
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ if (op_ret >= 0) {
+ if (version < 2) {
+ encode_json("status", status.inc_status, s->formatter);
+ } else {
+ encode_json("status", status, s->formatter);
+ }
+ }
+ flusher.flush();
+}
+
+// not in header to avoid pulling in rgw_data_sync.h
+class RGWOp_DATALog_Status : public RGWRESTOp {
+ rgw_data_sync_status status;
+public:
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("datalog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+  void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override { return "get_data_changes_log_status"; }
+};
+
+void RGWOp_DATALog_Status::execute(optional_yield y)
+{
+ const auto source_zone = s->info.args.get("source-zone");
+ auto sync = driver->get_data_sync_manager(source_zone);
+ if (sync == nullptr) {
+ ldpp_dout(this, 1) << "no sync manager for source-zone " << source_zone << dendl;
+ op_ret = -ENOENT;
+ return;
+ }
+ op_ret = sync->read_sync_status(this, &status);
+}
+
+void RGWOp_DATALog_Status::send_response()
+{
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ if (op_ret >= 0) {
+ encode_json("status", status, s->formatter);
+ }
+ flusher.flush();
+}
+
+
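+// Request dispatch: the 'type' query arg selects the log (metadata,
+// bucket-index, or data), and the presence of 'id', 'info', 'status',
+// 'lock', 'unlock', 'notify', or 'notify2' selects the concrete op.
+// Illustrative requests (resource path assumed):
+//   GET    /admin/log?type=metadata&id=3&info           -> RGWOp_MDLog_ShardInfo
+//   GET    /admin/log?type=bucket-index&status&bucket=b -> RGWOp_BILog_Status
+//   POST   /admin/log?type=data&notify2                 -> RGWOp_DATALog_Notify2
+//   DELETE /admin/log?type=metadata&id=3&marker=m       -> RGWOp_MDLog_Delete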
+RGWOp *RGWHandler_Log::op_get() {
+ bool exists;
+ string type = s->info.args.get("type", &exists);
+
+ if (!exists) {
+ return NULL;
+ }
+
+ if (type.compare("metadata") == 0) {
+ if (s->info.args.exists("id")) {
+ if (s->info.args.exists("info")) {
+ return new RGWOp_MDLog_ShardInfo;
+ } else {
+ return new RGWOp_MDLog_List;
+ }
+ } else if (s->info.args.exists("status")) {
+ return new RGWOp_MDLog_Status;
+ } else {
+ return new RGWOp_MDLog_Info;
+ }
+ } else if (type.compare("bucket-index") == 0) {
+ if (s->info.args.exists("info")) {
+ return new RGWOp_BILog_Info;
+ } else if (s->info.args.exists("status")) {
+ return new RGWOp_BILog_Status;
+ } else {
+ return new RGWOp_BILog_List;
+ }
+ } else if (type.compare("data") == 0) {
+ if (s->info.args.exists("id")) {
+ if (s->info.args.exists("info")) {
+ return new RGWOp_DATALog_ShardInfo;
+ } else {
+ return new RGWOp_DATALog_List;
+ }
+ } else if (s->info.args.exists("status")) {
+ return new RGWOp_DATALog_Status;
+ } else {
+ return new RGWOp_DATALog_Info;
+ }
+ }
+ return NULL;
+}
+
+RGWOp *RGWHandler_Log::op_delete() {
+ bool exists;
+ string type = s->info.args.get("type", &exists);
+
+ if (!exists) {
+ return NULL;
+ }
+
+ if (type.compare("metadata") == 0)
+ return new RGWOp_MDLog_Delete;
+ else if (type.compare("bucket-index") == 0)
+ return new RGWOp_BILog_Delete;
+ else if (type.compare("data") == 0)
+ return new RGWOp_DATALog_Delete;
+ return NULL;
+}
+
+RGWOp *RGWHandler_Log::op_post() {
+ bool exists;
+ string type = s->info.args.get("type", &exists);
+
+ if (!exists) {
+ return NULL;
+ }
+
+ if (type.compare("metadata") == 0) {
+ if (s->info.args.exists("lock"))
+ return new RGWOp_MDLog_Lock;
+ else if (s->info.args.exists("unlock"))
+ return new RGWOp_MDLog_Unlock;
+ else if (s->info.args.exists("notify"))
+ return new RGWOp_MDLog_Notify;
+ } else if (type.compare("data") == 0) {
+ if (s->info.args.exists("notify")) {
+ return new RGWOp_DATALog_Notify;
+ } else if (s->info.args.exists("notify2")) {
+ return new RGWOp_DATALog_Notify2;
+ }
+ }
+ return NULL;
+}
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 eNovance SAS <licensing@enovance.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_datalog.h"
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+#include "rgw_metadata.h"
+#include "rgw_mdlog.h"
+#include "rgw_data_sync.h"
+
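+// Every op below gates access on a per-log admin capability ('mdlog',
+// 'bilog', or 'datalog'; READ for list/info/status ops, WRITE for
+// lock/unlock/notify/trim ops) through check_caps().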
+class RGWOp_BILog_List : public RGWRESTOp {
+ bool sent_header;
+ uint32_t format_ver{0};
+ bool truncated{false};
+ std::optional<rgw::bucket_log_layout_generation> next_log_layout;
+
+public:
+ RGWOp_BILog_List() : sent_header(false) {}
+ ~RGWOp_BILog_List() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("bilog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void send_response() override;
+ virtual void send_response(std::list<rgw_bi_log_entry>& entries, std::string& marker);
+ virtual void send_response_end();
+ void execute(optional_yield y) override;
+ const char* name() const override {
+ return "list_bucket_index_log";
+ }
+};
+
+class RGWOp_BILog_Info : public RGWRESTOp {
+ std::string bucket_ver;
+ std::string master_ver;
+ std::string max_marker;
+ bool syncstopped;
+ uint64_t oldest_gen = 0;
+ uint64_t latest_gen = 0;
+ std::vector<store_gen_shards> generations;
+
+public:
+ RGWOp_BILog_Info() : bucket_ver(), master_ver(), syncstopped(false) {}
+ ~RGWOp_BILog_Info() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("bilog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void send_response() override;
+ void execute(optional_yield y) override;
+ const char* name() const override {
+ return "bucket_index_log_info";
+ }
+};
+
+class RGWOp_BILog_Delete : public RGWRESTOp {
+public:
+ RGWOp_BILog_Delete() {}
+ ~RGWOp_BILog_Delete() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("bilog", RGW_CAP_WRITE);
+ }
+ void execute(optional_yield y) override;
+ const char* name() const override {
+ return "trim_bucket_index_log";
+ }
+};
+
+class RGWOp_MDLog_List : public RGWRESTOp {
+ std::list<cls_log_entry> entries;
+ std::string last_marker;
+ bool truncated;
+public:
+ RGWOp_MDLog_List() : truncated(false) {}
+ ~RGWOp_MDLog_List() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("mdlog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override {
+ return "list_metadata_log";
+ }
+};
+
+class RGWOp_MDLog_Info : public RGWRESTOp {
+ unsigned num_objects;
+ RGWPeriodHistory::Cursor period;
+public:
+ RGWOp_MDLog_Info() : num_objects(0) {}
+ ~RGWOp_MDLog_Info() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("mdlog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override {
+ return "get_metadata_log_info";
+ }
+};
+
+class RGWOp_MDLog_ShardInfo : public RGWRESTOp {
+ RGWMetadataLogInfo info;
+public:
+ RGWOp_MDLog_ShardInfo() {}
+ ~RGWOp_MDLog_ShardInfo() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("mdlog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override {
+ return "get_metadata_log_shard_info";
+ }
+};
+
+class RGWOp_MDLog_Lock : public RGWRESTOp {
+public:
+ RGWOp_MDLog_Lock() {}
+ ~RGWOp_MDLog_Lock() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("mdlog", RGW_CAP_WRITE);
+ }
+ void execute(optional_yield y) override;
+ const char* name() const override {
+ return "lock_mdlog_object";
+ }
+};
+
+class RGWOp_MDLog_Unlock : public RGWRESTOp {
+public:
+ RGWOp_MDLog_Unlock() {}
+ ~RGWOp_MDLog_Unlock() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("mdlog", RGW_CAP_WRITE);
+ }
+ void execute(optional_yield y) override;
+ const char* name() const override {
+ return "unlock_mdlog_object";
+ }
+};
+
+class RGWOp_MDLog_Notify : public RGWRESTOp {
+public:
+ RGWOp_MDLog_Notify() {}
+ ~RGWOp_MDLog_Notify() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("mdlog", RGW_CAP_WRITE);
+ }
+ void execute(optional_yield y) override;
+ const char* name() const override {
+ return "mdlog_notify";
+ }
+ RGWOpType get_type() override { return RGW_OP_SYNC_MDLOG_NOTIFY; }
+};
+
+class RGWOp_MDLog_Delete : public RGWRESTOp {
+public:
+ RGWOp_MDLog_Delete() {}
+ ~RGWOp_MDLog_Delete() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("mdlog", RGW_CAP_WRITE);
+ }
+ void execute(optional_yield y) override;
+ const char* name() const override {
+ return "trim_metadata_log";
+ }
+};
+
+class RGWOp_DATALog_List : public RGWRESTOp {
+ std::vector<rgw_data_change_log_entry> entries;
+ std::string last_marker;
+ bool truncated;
+ bool extra_info;
+public:
+ RGWOp_DATALog_List() : truncated(false), extra_info(false) {}
+ ~RGWOp_DATALog_List() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("datalog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override {
+ return "list_data_changes_log";
+ }
+};
+
+class RGWOp_DATALog_Info : public RGWRESTOp {
+ unsigned num_objects;
+public:
+ RGWOp_DATALog_Info() : num_objects(0) {}
+ ~RGWOp_DATALog_Info() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("datalog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override {
+ return "get_data_changes_log_info";
+ }
+};
+
+class RGWOp_DATALog_ShardInfo : public RGWRESTOp {
+ RGWDataChangesLogInfo info;
+public:
+ RGWOp_DATALog_ShardInfo() {}
+ ~RGWOp_DATALog_ShardInfo() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("datalog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override {
+ return "get_data_changes_log_shard_info";
+ }
+};
+
+class RGWOp_DATALog_Notify : public RGWRESTOp {
+public:
+ RGWOp_DATALog_Notify() {}
+ ~RGWOp_DATALog_Notify() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("datalog", RGW_CAP_WRITE);
+ }
+ void execute(optional_yield y) override;
+ const char* name() const override {
+ return "datalog_notify";
+ }
+ RGWOpType get_type() override { return RGW_OP_SYNC_DATALOG_NOTIFY; }
+};
+
+class RGWOp_DATALog_Notify2 : public RGWRESTOp {
+ rgw_data_notify_entry data_notify;
+public:
+ RGWOp_DATALog_Notify2() {}
+ ~RGWOp_DATALog_Notify2() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("datalog", RGW_CAP_WRITE);
+ }
+ void execute(optional_yield y) override;
+ const char* name() const override {
+ return "datalog_notify2";
+ }
+ RGWOpType get_type() override { return RGW_OP_SYNC_DATALOG_NOTIFY2; }
+};
+
+class RGWOp_DATALog_Delete : public RGWRESTOp {
+public:
+ RGWOp_DATALog_Delete() {}
+ ~RGWOp_DATALog_Delete() override {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("datalog", RGW_CAP_WRITE);
+ }
+ void execute(optional_yield y) override;
+ const char* name() const override {
+ return "trim_data_changes_log";
+ }
+};
+
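+// read_permissions() returns 0 because authorization is enforced per-op by
+// the capability checks above rather than by bucket or object ACLs.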
+class RGWHandler_Log : public RGWHandler_Auth_S3 {
+protected:
+ RGWOp *op_get() override;
+ RGWOp *op_delete() override;
+ RGWOp *op_post() override;
+
+ int read_permissions(RGWOp*, optional_yield) override {
+ return 0;
+ }
+public:
+ using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+ ~RGWHandler_Log() override = default;
+};
+
+class RGWRESTMgr_Log : public RGWRESTMgr {
+public:
+ RGWRESTMgr_Log() = default;
+ ~RGWRESTMgr_Log() override = default;
+
+ RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
+ req_state* const,
+ const rgw::auth::StrategyRegistry& auth_registry,
+                               const std::string& frontend_prefix) override {
+ return new RGWHandler_Log(auth_registry);
+ }
+};
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_SERVICE_H
-#define CEPH_RGW_SERVICE_H
-
+#pragma once
#include <string>
#include <vector>
int init(RGWServices *_svc, rgw::sal::Driver* driver, const DoutPrefixProvider *dpp);
};
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_SYNC_H
-#define CEPH_RGW_SYNC_H
+#pragma once
#include <atomic>
uint32_t max_entries,
rgw_mdlog_shard_data *result);
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_SYNC_MODULE_H
-#define CEPH_RGW_SYNC_MODULE_H
+#pragma once
#include "rgw_common.h"
#include "rgw_coroutine.h"
};
void rgw_register_sync_modules(RGWSyncModulesManager *modules_manager);
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_SYNC_MODULE_AWS_H
-#define RGW_SYNC_MODULE_AWS_H
+#pragma once
#include "rgw_sync_module.h"
bool supports_data_export() override { return false;}
int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
};
-
-#endif /* RGW_SYNC_MODULE_AWS_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_SYNC_MODULE_ES_H
-#define CEPH_RGW_SYNC_MODULE_ES_H
+#pragma once
#include "rgw_sync_module.h"
return true;
}
};
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_SYNC_MODULE_LOG_H
-#define CEPH_RGW_SYNC_MODULE_LOG_H
+#pragma once
#include "rgw_sync_module.h"
}
int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
};
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_SYNC_LOG_H
-#define CEPH_RGW_SYNC_LOG_H
+#pragma once
#include <atomic>
bufferlist& out) override;
std::string get_active_names();
};
-
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_TOOLS_H
-#define CEPH_RGW_TOOLS_H
+#pragma once
#include <string>
// (Currently providing nullptr will wipe all attributes.)
std::map<std::string, ceph::buffer::list>* no_change_attrs();
-#endif
* Foundation. See file COPYING.
*/
-#ifndef RGW_SYNC_LOG_TRIM_H
-#define RGW_SYNC_LOG_TRIM_H
+#pragma once
#include <memory>
#include <string_view>
int bilog_trim(const DoutPrefixProvider* p, rgw::sal::RadosStore* store,
RGWBucketInfo& bucket_info, uint64_t gen, int shard_id,
std::string_view start_marker, std::string_view end_marker);
-
-#endif // RGW_SYNC_LOG_TRIM_H
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_USER_H
-#define CEPH_RGW_USER_H
+#pragma once
#include <string>
#include <boost/algorithm/string.hpp>
public:
static RGWMetadataHandler *alloc(RGWSI_User *user_svc);
};
-
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_ZONE_H
-#define CEPH_RGW_ZONE_H
+#pragma once
#include <ostream>
#include "rgw_zone_types.h"
sal::ZoneWriter& writer);
} // namespace rgw
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_ACL_H
-#define CEPH_RGW_ACL_H
+#pragma once
#include <map>
#include <string>
friend bool operator!=(const RGWAccessControlPolicy& lhs, const RGWAccessControlPolicy& rhs);
};
WRITE_CLASS_ENCODER(RGWAccessControlPolicy)
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_ACL_S3_H
-#define CEPH_RGW_ACL_S3_H
+#pragma once
#include <map>
#include <string>
public:
explicit RGWACLXMLParser_S3(CephContext *_cct) : cct(_cct) {}
};
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_ACL_SWIFT_H
-#define CEPH_RGW_ACL_SWIFT_H
+#pragma once
#include <map>
#include <vector>
const std::string& acl_str);
boost::optional<std::string> to_str() const;
};
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_ASIO_CLIENT_H
-#define RGW_ASIO_CLIENT_H
+#pragma once
#include <boost/asio/ip/tcp.hpp>
#include <boost/beast/core.hpp>
} // namespace asio
} // namespace rgw
-
-#endif // RGW_ASIO_CLIENT_H
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_ASIO_FRONTEND_H
-#define RGW_ASIO_FRONTEND_H
+#pragma once
#include <memory>
#include "rgw_frontend.h"
void pause_for_new_config() override;
void unpause_with_new_config() override;
};
-
-#endif // RGW_ASIO_FRONTEND_H
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGW_AUTH_H
-#define CEPH_RGW_AUTH_H
+#pragma once
#include <functional>
#include <optional>
const rgw_user& uid,
const rgw::auth::Identity::aclspec_t& aclspec,
const DoutPrefixProvider *dpp);
-
-#endif /* CEPH_RGW_AUTH_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_AUTH_FILTERS_H
-#define CEPH_RGW_AUTH_FILTERS_H
+#pragma once
#include <type_traits>
} /* namespace auth */
} /* namespace rgw */
-
-#endif /* CEPH_RGW_AUTH_FILTERS_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGW_AUTH_KEYSTONE_H
-#define CEPH_RGW_AUTH_KEYSTONE_H
+#pragma once
#include <string_view>
#include <utility>
}; /* namespace keystone */
}; /* namespace auth */
}; /* namespace rgw */
-
-#endif /* CEPH_RGW_AUTH_KEYSTONE_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGW_AUTH_REGISTRY_H
-#define CEPH_RGW_AUTH_REGISTRY_H
+#pragma once
#include <functional>
#include <memory>
using rgw_auth_registry_t = rgw::auth::StrategyRegistry;
using rgw_auth_registry_ptr_t = std::unique_ptr<rgw_auth_registry_t>;
-
-#endif /* CEPH_RGW_AUTH_REGISTRY_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_AUTH_S3_H
-#define CEPH_RGW_AUTH_S3_H
+#pragma once
#include <array>
#include <memory>
} /* namespace s3 */
} /* namespace auth */
} /* namespace rgw */
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_B64_H
-#define RGW_B64_H
+#pragma once
#include <boost/archive/iterators/base64_from_binary.hpp>
#include <boost/archive/iterators/binary_from_base64.hpp>
return outstr;
}
} /* namespace */
-
-#endif /* RGW_B64_H */
* radosgw or OSD contexts (e.g., rgw_sal.h, rgw_common.h)
*/
-#ifndef CEPH_RGW_BASIC_TYPES_H
-#define CEPH_RGW_BASIC_TYPES_H
+#pragma once
#include <string>
#include <fmt/format.h>
static void generate_test_instances(std::list<RGWUploadPartInfo*>& o);
};
WRITE_CLASS_ENCODER(RGWUploadPartInfo)
-
-#endif /* CEPH_RGW_BASIC_TYPES_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGWCACHE_H
-#define CEPH_RGWCACHE_H
+#pragma once
#include <string>
#include <map>
void unchain_cache(RGWChainedCache *cache);
void invalidate_all();
};
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_CLIENT_IO_H
-#define CEPH_RGW_CLIENT_IO_H
+#pragma once
#include <exception>
#include <string>
std::istream(static_cast<RGWClientIOStreamBuf *>(this)) {
}
};
-
-#endif /* CEPH_RGW_CLIENT_IO_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_CLIENT_IO_DECOIMPL_H
-#define CEPH_RGW_CLIENT_IO_DECOIMPL_H
+#pragma once
#include <type_traits>
} /* namespace io */
} /* namespace rgw */
-#endif /* CEPH_RGW_CLIENT_IO_DECOIMPL_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_COMPRESSION_H
-#define CEPH_RGW_COMPRESSION_H
+#pragma once
#include <vector>
std::optional<int32_t> get_compressor_message() { return compressor_message; }
}; /* RGWPutObj_Compress */
-
-#endif /* CEPH_RGW_COMPRESSION_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_COROUTINE_H
-#define CEPH_RGW_COROUTINE_H
+#pragma once
#ifdef _ASSERT_H
#define NEED_ASSERT_H
virtual int finish() { return 0; }
virtual void request_cleanup() {}
};
-
-#endif
*
*/
-#ifndef CEPH_RGW_CORS_H
-#define CEPH_RGW_CORS_H
+#pragma once
#include <map>
#include <string>
return -1;
return 0;
}
-#endif /*CEPH_RGW_CORS_H*/
*
*/
-#ifndef CEPH_RGW_CORS_S3_H
-#define CEPH_RGW_CORS_S3_H
+#pragma once
#include <map>
#include <string>
public:
explicit RGWCORSXMLParser_S3(const DoutPrefixProvider *_dpp, CephContext *_cct) : dpp(_dpp), cct(_cct) {}
};
-#endif /*CEPH_RGW_CORS_S3_H*/
*
*/
-#ifndef CEPH_RGW_CORS_SWIFT3_H
-#define CEPH_RGW_CORS_SWIFT3_H
+#pragma once
#include <map>
#include <string>
return 0;
}
};
-#endif /*CEPH_RGW_CORS_SWIFT3_H*/
* Crypto filters for Put/Post/Get operations.
*/
-#ifndef CEPH_RGW_CRYPT_H
-#define CEPH_RGW_CRYPT_H
+#pragma once
#include <string_view>
}
int rgw_remove_sse_s3_bucket_key(req_state *s);
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_RGW_CRYPT_SANITIZE_H_
-#define RGW_RGW_CRYPT_SANITIZE_H_
+#pragma once
#include <string_view>
#include "rgw_common.h"
std::ostream& operator<<(std::ostream& out, const log_content& x);
}
}
-#endif /* RGW_RGW_CRYPT_SANITIZE_H_ */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_CACHEREQUEST_H
-#define RGW_CACHEREQUEST_H
+#pragma once
#include <fcntl.h>
#include <stdlib.h>
}
};
-
-#endif
*
*/
-#ifndef RGW_DMCLOCK_H
-#define RGW_DMCLOCK_H
+#pragma once
+
#include "dmclock/src/dmclock_server.h"
namespace rgw::dmclock {
}
} // namespace rgw::dmclock
-
-#endif /* RGW_DMCLOCK_H */
*
*/
-#ifndef RGW_DMCLOCK_ASYNC_SCHEDULER_H
-#define RGW_DMCLOCK_ASYNC_SCHEDULER_H
+#pragma once
#include "common/async/completion.h"
};
} // namespace rgw::dmclock
-#endif /* RGW_DMCLOCK_ASYNC_SCHEDULER_H */
*
*/
-#ifndef RGW_DMCLOCK_SCHEDULER_H
-#define RGW_DMCLOCK_SCHEDULER_H
+#pragma once
#include "common/ceph_time.h"
#include "common/ceph_context.h"
};
} // namespace rgw::dmclock
-
-#endif // RGW_DMCLOCK_SCHEDULER_H
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_DMCLOCK_SCHEDULER_CTX_H
-#define RGW_DMCLOCK_SCHEDULER_CTX_H
+#pragma once
#include "common/perf_counters.h"
#include "common/ceph_context.h"
};
} // namespace rgw::dmclock
-
-#endif /* RGW_DMCLOCK_SCHEDULER_CTX_H */
*
*/
-#ifndef RGW_DMCLOCK_SYNC_SCHEDULER_H
-#define RGW_DMCLOCK_SYNC_SCHEDULER_H
+#pragma once
#include "rgw_dmclock_scheduler.h"
#include "rgw_dmclock_scheduler_ctx.h"
{}
} // namespace rgw::dmclock
-#endif /* RGW_DMCLOCK_SYNC_SCHEDULER_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_ES_QUERY_H
-#define CEPH_RGW_ES_QUERY_H
+#pragma once
#include "rgw_string.h"
return (restricted_fields && restricted_fields->find(f) != restricted_fields->end());
}
};
-
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_FILE_H
-#define RGW_FILE_H
+#pragma once
#include "include/rados/rgw_file.h"
} /* namespace rgw */
-
-#endif /* RGW_FILE_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_FORMATS_H
-#define CEPH_RGW_FORMATS_H
+#pragma once
#include "common/Formatter.h"
public:
RGWNullFlusher() : RGWFormatterFlusher(nullptr) {}
};
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_FRONTEND_H
-#define RGW_FRONTEND_H
+#pragma once
#include <map>
#include <string>
pauser->resume(driver);
}
};
-
-#endif /* RGW_FRONTEND_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_HTTP_CLIENT_H
-#define CEPH_RGW_HTTP_CLIENT_H
+#pragma once
#include "common/async/yield_context.h"
#include "common/Cond.h"
static int send(RGWHTTPClient *req);
static int process(RGWHTTPClient *req, optional_yield y);
};
-#endif
*
*/
-#ifndef RGW_HTTP_CLIENT_CURL_H
-#define RGW_HTTP_CLIENT_CURL_H
+#pragma once
#include <map>
#include <boost/optional.hpp>
void cleanup_curl();
}
}
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_HTTP_ERRORS_H_
-#define RGW_HTTP_ERRORS_H_
+#pragma once
#include "rgw_common.h"
return 0; /* unreachable */
}
-
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_IAM_POLICY_H
-#define CEPH_RGW_IAM_POLICY_H
+#pragma once
#include <bitset>
#include <chrono>
}
}
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_POLICY_S3V2_KEYWORDS_H
-#define CEPH_RGW_POLICY_S3V2_KEYWORDS_H
+#pragma once
namespace rgw {
namespace IAM {
};
}
}
-
-#endif // CEPH_RGW_POLICY_S3V2_KEYWORDS_H
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_KEYSTONE_H
-#define CEPH_RGW_KEYSTONE_H
+#pragma once
#include <atomic>
#include <string_view>
}; /* namespace keystone */
}; /* namespace rgw */
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_KMIP_CLIENT_H
-#define CEPH_RGW_KMIP_CLIENT_H
+#pragma once
class RGWKMIPManager;
void rgw_kmip_client_init(RGWKMIPManager &);
void rgw_kmip_client_cleanup();
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_KMIP_CLIENT_IMPL_H
-#define CEPH_RGW_KMIP_CLIENT_IMPL_H
+#pragma once
+
struct RGWKmipWorker;
class RGWKMIPManagerImpl: public RGWKMIPManager {
protected:
void stop();
friend RGWKmipWorker;
};
-#endif
-
* Server-side encryption integrations with Key Management Systems (SSE-KMS)
*/
-#ifndef CEPH_RGW_KMS_H
-#define CEPH_RGW_KMS_H
+#pragma once
#include <string>
virtual int get_key(const DoutPrefixProvider *dpp, std::string_view key_id, std::string& actual_key) = 0;
virtual ~SecretEngine(){};
};
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_LC_H
-#define CEPH_RGW_LC_H
+#pragma once
#include <map>
#include <array>
std::string& rule_id);
} // namespace rgw::lc
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_LC_S3_H
-#define CEPH_RGW_LC_S3_H
+#pragma once
#include <map>
#include <string>
int rebuild(RGWLifecycleConfiguration& dest);
void dump_xml(Formatter *f) const;
};
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_LDAP_H
-#define RGW_LDAP_H
+#pragma once
#include "acconfig.h"
#include "include/ceph_assert.h"
std::string parse_rgw_ldap_bindpw(CephContext* ctx);
-
-#endif /* RGW_LDAP_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_LIB_H
-#define RGW_LIB_H
+#pragma once
#include <mutex>
#include "rgw_common.h"
}; /* RGWLibContinuedReq */
} /* namespace rgw */
-
-#endif /* RGW_LIB_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_LIB_FRONTEND_H
-#define RGW_LIB_FRONTEND_H
+#pragma once
#include <boost/container/flat_map.hpp>
}; /* RGWLibFrontend */
} /* namespace rgw */
-
-#endif /* RGW_LIB_FRONTEND_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_LOADGEN_H
-#define CEPH_RGW_LOADGEN_H
+#pragma once
#include <map>
#include <string>
size_t complete_request() override;
};
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_LOG_H
-#define CEPH_RGW_LOG_H
+#pragma once
#include <boost/container/flat_map.hpp>
#include "rgw_common.h"
void rgw_log_usage_finalize();
void rgw_format_ops_log_entry(struct rgw_log_entry& entry,
ceph::Formatter *formatter);
-
-#endif /* CEPH_RGW_LOG_H */
-
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_META_SYNC_STATUS_H
-#define RGW_META_SYNC_STATUS_H
+#pragma once
#include <string>
static void generate_test_instances(std::list<rgw_meta_sync_status*>& ls);
};
WRITE_CLASS_ENCODER(rgw_meta_sync_status)
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_MULTI_H
-#define CEPH_RGW_MULTI_H
+#pragma once
#include <map>
#include "rgw_xml.h"
};
extern bool is_v2_upload_id(const std::string& upload_id);
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_MULTI_DELETE_H_
-#define RGW_MULTI_DELETE_H_
+#pragma once
#include <vector>
#include "rgw_xml.h"
RGWMultiDelXMLParser() {}
~RGWMultiDelXMLParser() override {}
};
-
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_OBJECT_LOCK_H
-#define CEPH_RGW_OBJECT_LOCK_H
+#pragma once
#include <string>
#include "common/ceph_time.h"
bool is_enabled() const;
};
WRITE_CLASS_ENCODER(RGWObjectLegalHold)
-#endif //CEPH_RGW_OBJECT_LOCK_H
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_OIDC_PROVIDER_H
-#define CEPH_RGW_OIDC_PROVIDER_H
+#pragma once
#include <string>
WRITE_CLASS_ENCODER(RGWOIDCProvider)
} } // namespace rgw::sal
-#endif /* CEPH_RGW_OIDC_PROVIDER_H */
-
* to provide additional virtual methods such as send_response or get_params.
*/
-#ifndef CEPH_RGW_OP_H
-#define CEPH_RGW_OP_H
+#pragma once
#include <limits.h>
CephContext *cct,
std::map<std::string, bufferlist>& attrset,
RGWAccessControlPolicy *policy);
-
-#endif /* CEPH_RGW_OP_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_OPA_H
-#define RGW_OPA_H
+#pragma once
#include "rgw_common.h"
#include "rgw_op.h"
/* authorize request using OPA */
int rgw_opa_authorize(RGWOp*& op,
req_state* s);
-
-#endif /* RGW_OPA_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_OS_LIB_H
-#define RGW_OS_LIB_H
+#pragma once
#include <functional>
#include "rgw_common.h"
#include "rgw_lib.h"
-
-#endif /* RGW_OS_LIB_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_PERIOD_HISTORY_H
-#define RGW_PERIOD_HISTORY_H
+#pragma once
#include <deque>
#include <mutex>
/// the current_history
Cursor lookup(epoch_t realm_epoch);
};
-
-#endif // RGW_PERIOD_HISTORY_H
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_PERIOD_PULLER_H
-#define CEPH_RGW_PERIOD_PULLER_H
+#pragma once
#include "rgw_period_history.h"
#include "include/common_fwd.h"
int pull(const DoutPrefixProvider *dpp, const std::string& period_id, RGWPeriod& period, optional_yield y) override;
};
-
-#endif // CEPH_RGW_PERIOD_PULLER_H
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_PERIOD_PUSHER_H
-#define RGW_PERIOD_PUSHER_H
+#pragma once
#include <memory>
#include <mutex>
class CRThread; //< contains thread, coroutine manager, http manager
std::unique_ptr<CRThread> cr_thread; //< thread to run the push coroutines
};
-
-#endif // RGW_PERIOD_PUSHER_H
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_POLICY_H
-#define CEPH_RGW_POLICY_H
+#pragma once
#include <limits.h>
int check(RGWPolicyEnv *env, std::string& err_msg);
int from_json(bufferlist& bl, std::string& err_msg);
};
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_PROCESS_H
-#define RGW_PROCESS_H
+#pragma once
#include "rgw_common.h"
#include "rgw_acl.h"
bool skip_retarget = false);
#undef dout_context
-
-#endif /* RGW_PROCESS_H */
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "services/svc_zone.h"
-#include "rgw_b64.h"
-#include "rgw_sal.h"
-#include "rgw_sal_rados.h"
-#include "rgw_pubsub.h"
-#include "rgw_tools.h"
-#include "rgw_xml.h"
-#include "rgw_arn.h"
-#include "rgw_pubsub_push.h"
-#include <regex>
-#include <algorithm>
-
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-void set_event_id(std::string& id, const std::string& hash, const utime_t& ts) {
- char buf[64];
- const auto len = snprintf(buf, sizeof(buf), "%010ld.%06ld.%s", (long)ts.sec(), (long)ts.usec(), hash.c_str());
- if (len > 0) {
- // snprintf returns the would-be length; clamp in case the hash overflowed buf
- id.assign(buf, std::min<size_t>(len, sizeof(buf) - 1));
- }
-}
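// A minimal standalone sketch of the id format produced above --
// "<seconds>.<microseconds>.<hash>" -- with plain integers standing in for
// utime_t; make_event_id and its arguments are hypothetical names.
#include <algorithm>
#include <cstdio>
#include <string>

std::string make_event_id(long sec, long usec, const std::string& hash) {
  char buf[64];
  const int len = snprintf(buf, sizeof(buf), "%010ld.%06ld.%s",
                           sec, usec, hash.c_str());
  if (len <= 0) {
    return {};
  }
  // snprintf returns the would-be length; clamp in case the hash was truncated
  return std::string(buf, std::min<size_t>(len, sizeof(buf) - 1));
}
// e.g. make_event_id(1700000000, 42, "8d4f") == "1700000000.000042.8d4f"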
-
-bool rgw_s3_key_filter::decode_xml(XMLObj* obj) {
- XMLObjIter iter = obj->find("FilterRule");
- XMLObj *o;
-
- const auto throw_if_missing = true;
- auto prefix_not_set = true;
- auto suffix_not_set = true;
- auto regex_not_set = true;
- std::string name;
-
- while ((o = iter.get_next())) {
- RGWXMLDecoder::decode_xml("Name", name, o, throw_if_missing);
- if (name == "prefix" && prefix_not_set) {
- prefix_not_set = false;
- RGWXMLDecoder::decode_xml("Value", prefix_rule, o, throw_if_missing);
- } else if (name == "suffix" && suffix_not_set) {
- suffix_not_set = false;
- RGWXMLDecoder::decode_xml("Value", suffix_rule, o, throw_if_missing);
- } else if (name == "regex" && regex_not_set) {
- regex_not_set = false;
- RGWXMLDecoder::decode_xml("Value", regex_rule, o, throw_if_missing);
- } else {
- throw RGWXMLDecoder::err("invalid/duplicate S3Key filter rule name: '" + name + "'");
- }
- }
- return true;
-}
-
-void rgw_s3_key_filter::dump_xml(Formatter *f) const {
- if (!prefix_rule.empty()) {
- f->open_object_section("FilterRule");
- ::encode_xml("Name", "prefix", f);
- ::encode_xml("Value", prefix_rule, f);
- f->close_section();
- }
- if (!suffix_rule.empty()) {
- f->open_object_section("FilterRule");
- ::encode_xml("Name", "suffix", f);
- ::encode_xml("Value", suffix_rule, f);
- f->close_section();
- }
- if (!regex_rule.empty()) {
- f->open_object_section("FilterRule");
- ::encode_xml("Name", "regex", f);
- ::encode_xml("Value", regex_rule, f);
- f->close_section();
- }
-}
-
-bool rgw_s3_key_filter::has_content() const {
- return !(prefix_rule.empty() && suffix_rule.empty() && regex_rule.empty());
-}
-
-bool rgw_s3_key_value_filter::decode_xml(XMLObj* obj) {
- kv.clear();
- XMLObjIter iter = obj->find("FilterRule");
- XMLObj *o;
-
- const auto throw_if_missing = true;
-
- std::string key;
- std::string value;
-
- while ((o = iter.get_next())) {
- RGWXMLDecoder::decode_xml("Name", key, o, throw_if_missing);
- RGWXMLDecoder::decode_xml("Value", value, o, throw_if_missing);
- kv.emplace(key, value);
- }
- return true;
-}
-
-void rgw_s3_key_value_filter::dump_xml(Formatter *f) const {
- for (const auto& key_value : kv) {
- f->open_object_section("FilterRule");
- ::encode_xml("Name", key_value.first, f);
- ::encode_xml("Value", key_value.second, f);
- f->close_section();
- }
-}
-
-bool rgw_s3_key_value_filter::has_content() const {
- return !kv.empty();
-}
-
-bool rgw_s3_filter::decode_xml(XMLObj* obj) {
- RGWXMLDecoder::decode_xml("S3Key", key_filter, obj);
- RGWXMLDecoder::decode_xml("S3Metadata", metadata_filter, obj);
- RGWXMLDecoder::decode_xml("S3Tags", tag_filter, obj);
- return true;
-}
-
-void rgw_s3_filter::dump_xml(Formatter *f) const {
- if (key_filter.has_content()) {
- ::encode_xml("S3Key", key_filter, f);
- }
- if (metadata_filter.has_content()) {
- ::encode_xml("S3Metadata", metadata_filter, f);
- }
- if (tag_filter.has_content()) {
- ::encode_xml("S3Tags", tag_filter, f);
- }
-}
-
-bool rgw_s3_filter::has_content() const {
- return key_filter.has_content() ||
- metadata_filter.has_content() ||
- tag_filter.has_content();
-}
-
-bool match(const rgw_s3_key_filter& filter, const std::string& key) {
- const auto key_size = key.size();
- const auto prefix_size = filter.prefix_rule.size();
- if (prefix_size != 0) {
- // prefix rule exists
- if (prefix_size > key_size) {
- // if prefix is longer than key, we fail
- return false;
- }
- if (!std::equal(filter.prefix_rule.begin(), filter.prefix_rule.end(), key.begin())) {
- return false;
- }
- }
- const auto suffix_size = filter.suffix_rule.size();
- if (suffix_size != 0) {
- // suffix rule exists
- if (suffix_size > key_size) {
- // if suffix is longer than key, we fail
- return false;
- }
- if (!std::equal(filter.suffix_rule.begin(), filter.suffix_rule.end(), (key.end() - suffix_size))) {
- return false;
- }
- }
- if (!filter.regex_rule.empty()) {
- // TODO: add regex caching to the filter
- const std::regex base_regex(filter.regex_rule);
- if (!std::regex_match(key, base_regex)) {
- return false;
- }
- }
- return true;
-}
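// A self-contained illustration of the matching rules above, under
// hypothetical filter values: the prefix and suffix checks use std::equal
// over the relevant ends of the key, and the regex rule is compiled per call,
// as the TODO notes.
#include <algorithm>
#include <regex>
#include <string>

bool key_matches(const std::string& prefix, const std::string& suffix,
                 const std::string& regex, const std::string& key) {
  if (!prefix.empty() &&
      (prefix.size() > key.size() ||
       !std::equal(prefix.begin(), prefix.end(), key.begin()))) {
    return false;
  }
  if (!suffix.empty() &&
      (suffix.size() > key.size() ||
       !std::equal(suffix.begin(), suffix.end(), key.end() - suffix.size()))) {
    return false;
  }
  return regex.empty() || std::regex_match(key, std::regex(regex));
}
// e.g. key_matches("photos/", ".jpg", "", "photos/cat.jpg") is true;
//      key_matches("photos/", ".png", "", "photos/cat.jpg") is false.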
-
-bool match(const rgw_s3_key_value_filter& filter, const KeyValueMap& kv) {
- // all filter pairs must exist with the same value in the object's metadata/tags
- // object metadata/tags may include items not in the filter
- return std::includes(kv.begin(), kv.end(), filter.kv.begin(), filter.kv.end());
-}
-
-bool match(const rgw_s3_key_value_filter& filter, const KeyMultiValueMap& kv) {
- // all filter pairs must exist with the same value in the object's metadata/tags
- // object metadata/tags may include items not in the filter
- for (const auto& rule : filter.kv) {
- // at least one of the object's values for this key must match the rule
- const auto range = kv.equal_range(rule.first);
- if (!std::any_of(range.first, range.second,
- [&rule](const pair<string,string>& p) { return p.second == rule.second; })) {
- return false;
- }
- }
- return true;
-}
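// A small standalone check of the std::includes call relied on above: both
// maps iterate in sorted key order, so the filter matches exactly when every
// (key,value) pair of the filter appears verbatim in the object's map, and
// extra object entries are allowed. Values here are hypothetical.
#include <algorithm>
#include <map>
#include <string>

int main() {
  const std::map<std::string, std::string> metadata = {
      {"color", "blue"}, {"owner", "alice"}, {"project", "x"}};
  const std::map<std::string, std::string> filter = {
      {"color", "blue"}, {"owner", "alice"}};
  const bool matched = std::includes(metadata.begin(), metadata.end(),
                                     filter.begin(), filter.end());
  return matched ? 0 : 1;  // matched: the filter is a subset of the metadata
}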
-
-bool match(const rgw::notify::EventTypeList& events, rgw::notify::EventType event) {
- // if event list exists, and none of the events in the list matches the event type, filter the message
- if (!events.empty() && std::find(events.begin(), events.end(), event) == events.end()) {
- return false;
- }
- return true;
-}
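// A standalone restatement of the rule above with a hypothetical EventType
// enum: an empty subscription list matches every event; otherwise the event
// must appear in the list.
#include <algorithm>
#include <vector>

enum class EventType { ObjectCreated, ObjectRemoved };

bool event_matches(const std::vector<EventType>& subscribed, EventType ev) {
  return subscribed.empty() ||
         std::find(subscribed.begin(), subscribed.end(), ev) != subscribed.end();
}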
-
-void do_decode_xml_obj(rgw::notify::EventTypeList& l, const string& name, XMLObj *obj) {
- l.clear();
-
- XMLObjIter iter = obj->find(name);
- XMLObj *o;
-
- while ((o = iter.get_next())) {
- std::string val;
- decode_xml_obj(val, o);
- l.push_back(rgw::notify::from_string(val));
- }
-}
-
-bool rgw_pubsub_s3_notification::decode_xml(XMLObj *obj) {
- const auto throw_if_missing = true;
- RGWXMLDecoder::decode_xml("Id", id, obj, throw_if_missing);
-
- RGWXMLDecoder::decode_xml("Topic", topic_arn, obj, throw_if_missing);
-
- RGWXMLDecoder::decode_xml("Filter", filter, obj);
-
- do_decode_xml_obj(events, "Event", obj);
- if (events.empty()) {
- // if no events are provided, we assume all events
- events.push_back(rgw::notify::ObjectCreated);
- events.push_back(rgw::notify::ObjectRemoved);
- }
- return true;
-}
-
-void rgw_pubsub_s3_notification::dump_xml(Formatter *f) const {
- ::encode_xml("Id", id, f);
- ::encode_xml("Topic", topic_arn.c_str(), f);
- if (filter.has_content()) {
- ::encode_xml("Filter", filter, f);
- }
- for (const auto& event : events) {
- ::encode_xml("Event", rgw::notify::to_string(event), f);
- }
-}
-
-bool rgw_pubsub_s3_notifications::decode_xml(XMLObj *obj) {
- do_decode_xml_obj(list, "TopicConfiguration", obj);
- return true;
-}
-
-rgw_pubsub_s3_notification::rgw_pubsub_s3_notification(const rgw_pubsub_topic_filter& topic_filter) :
- id(topic_filter.s3_id), events(topic_filter.events), topic_arn(topic_filter.topic.arn), filter(topic_filter.s3_filter) {}
-
-void rgw_pubsub_s3_notifications::dump_xml(Formatter *f) const {
- do_encode_xml("NotificationConfiguration", list, "TopicConfiguration", f);
-}
-
-void rgw_pubsub_s3_event::dump(Formatter *f) const {
- encode_json("eventVersion", eventVersion, f);
- encode_json("eventSource", eventSource, f);
- encode_json("awsRegion", awsRegion, f);
- utime_t ut(eventTime);
- encode_json("eventTime", ut, f);
- encode_json("eventName", eventName, f);
- {
- Formatter::ObjectSection s(*f, "userIdentity");
- encode_json("principalId", userIdentity, f);
- }
- {
- Formatter::ObjectSection s(*f, "requestParameters");
- encode_json("sourceIPAddress", sourceIPAddress, f);
- }
- {
- Formatter::ObjectSection s(*f, "responseElements");
- encode_json("x-amz-request-id", x_amz_request_id, f);
- encode_json("x-amz-id-2", x_amz_id_2, f);
- }
- {
- Formatter::ObjectSection s(*f, "s3");
- encode_json("s3SchemaVersion", s3SchemaVersion, f);
- encode_json("configurationId", configurationId, f);
- {
- Formatter::ObjectSection sub_s(*f, "bucket");
- encode_json("name", bucket_name, f);
- {
- Formatter::ObjectSection sub_sub_s(*f, "ownerIdentity");
- encode_json("principalId", bucket_ownerIdentity, f);
- }
- encode_json("arn", bucket_arn, f);
- encode_json("id", bucket_id, f);
- }
- {
- Formatter::ObjectSection sub_s(*f, "object");
- encode_json("key", object_key, f);
- encode_json("size", object_size, f);
- encode_json("eTag", object_etag, f);
- encode_json("versionId", object_versionId, f);
- encode_json("sequencer", object_sequencer, f);
- encode_json("metadata", x_meta_map, f);
- encode_json("tags", tags, f);
- }
- }
- encode_json("eventId", id, f);
- encode_json("opaqueData", opaque_data, f);
-}
-
-void rgw_pubsub_topic::dump(Formatter *f) const
-{
- encode_json("user", user, f);
- encode_json("name", name, f);
- encode_json("dest", dest, f);
- encode_json("arn", arn, f);
- encode_json("opaqueData", opaque_data, f);
-}
-
-void rgw_pubsub_topic::dump_xml(Formatter *f) const
-{
- encode_xml("User", user, f);
- encode_xml("Name", name, f);
- encode_xml("EndPoint", dest, f);
- encode_xml("TopicArn", arn, f);
- encode_xml("OpaqueData", opaque_data, f);
-}
-
-void encode_xml_key_value_entry(const std::string& key, const std::string& value, Formatter *f) {
- f->open_object_section("entry");
- encode_xml("key", key, f);
- encode_xml("value", value, f);
- f->close_section(); // entry
-}
-
-void rgw_pubsub_topic::dump_xml_as_attributes(Formatter *f) const
-{
- f->open_array_section("Attributes");
- std::string str_user;
- user.to_str(str_user);
- encode_xml_key_value_entry("User", str_user, f);
- encode_xml_key_value_entry("Name", name, f);
- encode_xml_key_value_entry("EndPoint", dest.to_json_str(), f);
- encode_xml_key_value_entry("TopicArn", arn, f);
- encode_xml_key_value_entry("OpaqueData", opaque_data, f);
- f->close_section(); // Attributes
-}
-
-void encode_json(const char *name, const rgw::notify::EventTypeList& l, Formatter *f)
-{
- f->open_array_section(name);
- for (auto iter = l.cbegin(); iter != l.cend(); ++iter) {
- f->dump_string("obj", rgw::notify::to_string(*iter));
- }
- f->close_section();
-}
-
-void rgw_pubsub_topic_filter::dump(Formatter *f) const
-{
- encode_json("topic", topic, f);
- encode_json("events", events, f);
-}
-
-void rgw_pubsub_topic_subs::dump(Formatter *f) const
-{
- encode_json("topic", topic, f);
- encode_json("subs", subs, f);
-}
-
-void rgw_pubsub_bucket_topics::dump(Formatter *f) const
-{
- Formatter::ArraySection s(*f, "topics");
- for (auto& t : topics) {
- encode_json(t.first.c_str(), t.second, f);
- }
-}
-
-void rgw_pubsub_topics::dump(Formatter *f) const
-{
- Formatter::ArraySection s(*f, "topics");
- for (auto& t : topics) {
- encode_json(t.first.c_str(), t.second, f);
- }
-}
-
-void rgw_pubsub_topics::dump_xml(Formatter *f) const
-{
- for (auto& t : topics) {
- encode_xml("member", t.second.topic, f);
- }
-}
-
-void rgw_pubsub_sub_dest::dump(Formatter *f) const
-{
- encode_json("bucket_name", bucket_name, f);
- encode_json("oid_prefix", oid_prefix, f);
- encode_json("push_endpoint", push_endpoint, f);
- encode_json("push_endpoint_args", push_endpoint_args, f);
- encode_json("push_endpoint_topic", arn_topic, f);
- encode_json("stored_secret", stored_secret, f);
- encode_json("persistent", persistent, f);
-}
-
-void rgw_pubsub_sub_dest::dump_xml(Formatter *f) const
-{
- // the first two members are omitted here since they
- // don't apply to AWS-compliant topics
- encode_xml("EndpointAddress", push_endpoint, f);
- encode_xml("EndpointArgs", push_endpoint_args, f);
- encode_xml("EndpointTopic", arn_topic, f);
- encode_xml("HasStoredSecret", stored_secret, f);
- encode_xml("Persistent", persistent, f);
-}
-
-std::string rgw_pubsub_sub_dest::to_json_str() const
-{
- // the first two members are omitted here since they
- // don't apply to AWS-compliant topics
- JSONFormatter f;
- f.open_object_section("");
- encode_json("EndpointAddress", push_endpoint, &f);
- encode_json("EndpointArgs", push_endpoint_args, &f);
- encode_json("EndpointTopic", arn_topic, &f);
- encode_json("HasStoredSecret", stored_secret, &f);
- encode_json("Persistent", persistent, &f);
- f.close_section();
- std::stringstream ss;
- f.flush(ss);
- return ss.str();
-}
-
-void rgw_pubsub_sub_config::dump(Formatter *f) const
-{
- encode_json("user", user, f);
- encode_json("name", name, f);
- encode_json("topic", topic, f);
- encode_json("dest", dest, f);
- encode_json("s3_id", s3_id, f);
-}
-
-RGWPubSub::RGWPubSub(rgw::sal::RadosStore* _store, const std::string& _tenant)
- : store(_store), tenant(_tenant), svc_sysobj(store->svc()->sysobj)
-{
- get_meta_obj(&meta_obj);
-}
-
-int RGWPubSub::remove(const DoutPrefixProvider *dpp,
- const rgw_raw_obj& obj,
- RGWObjVersionTracker *objv_tracker,
- optional_yield y)
-{
- int ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, obj.pool, obj.oid, objv_tracker, y);
- if (ret < 0) {
- return ret;
- }
-
- return 0;
-}
-
-int RGWPubSub::read_topics(rgw_pubsub_topics *result, RGWObjVersionTracker *objv_tracker)
-{
- int ret = read(meta_obj, result, objv_tracker);
- if (ret < 0) {
- ldout(store->ctx(), 10) << "WARNING: failed to read topics info: ret=" << ret << dendl;
- return ret;
- }
- return 0;
-}
-
-int RGWPubSub::write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_topics& topics,
- RGWObjVersionTracker *objv_tracker, optional_yield y)
-{
- int ret = write(dpp, meta_obj, topics, objv_tracker, y);
- if (ret < 0 && ret != -ENOENT) {
- ldpp_dout(dpp, 1) << "ERROR: failed to write topics info: ret=" << ret << dendl;
- return ret;
- }
- return 0;
-}
-
-int RGWPubSub::get_topics(rgw_pubsub_topics *result)
-{
- return read_topics(result, nullptr);
-}
-
-int RGWPubSub::Bucket::read_topics(rgw_pubsub_bucket_topics *result, RGWObjVersionTracker *objv_tracker)
-{
- int ret = ps->read(bucket_meta_obj, result, objv_tracker);
- if (ret < 0 && ret != -ENOENT) {
- ldout(ps->store->ctx(), 1) << "ERROR: failed to read bucket topics info: ret=" << ret << dendl;
- return ret;
- }
- return 0;
-}
-
-int RGWPubSub::Bucket::write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_bucket_topics& topics,
- RGWObjVersionTracker *objv_tracker,
- optional_yield y)
-{
- int ret = ps->write(dpp, bucket_meta_obj, topics, objv_tracker, y);
- if (ret < 0) {
- ldout(ps->store->ctx(), 1) << "ERROR: failed to write bucket topics info: ret=" << ret << dendl;
- return ret;
- }
-
- return 0;
-}
-
-int RGWPubSub::Bucket::get_topics(rgw_pubsub_bucket_topics *result)
-{
- return read_topics(result, nullptr);
-}
-
-int RGWPubSub::get_topic(const string& name, rgw_pubsub_topic_subs *result)
-{
- rgw_pubsub_topics topics;
- int ret = get_topics(&topics);
- if (ret < 0) {
- ldout(store->ctx(), 1) << "ERROR: failed to read topics info: ret=" << ret << dendl;
- return ret;
- }
-
- auto iter = topics.topics.find(name);
- if (iter == topics.topics.end()) {
- ldout(store->ctx(), 1) << "ERROR: topic not found" << dendl;
- return -ENOENT;
- }
-
- *result = iter->second;
- return 0;
-}
-
-int RGWPubSub::get_topic(const string& name, rgw_pubsub_topic *result)
-{
- rgw_pubsub_topics topics;
- int ret = get_topics(&topics);
- if (ret < 0) {
- ldout(store->ctx(), 1) << "ERROR: failed to read topics info: ret=" << ret << dendl;
- return ret;
- }
-
- auto iter = topics.topics.find(name);
- if (iter == topics.topics.end()) {
- ldout(store->ctx(), 1) << "ERROR: topic not found" << dendl;
- return -ENOENT;
- }
-
- *result = iter->second.topic;
- return 0;
-}
-
-int RGWPubSub::Bucket::create_notification(const DoutPrefixProvider *dpp, const string& topic_name, const rgw::notify::EventTypeList& events, optional_yield y) {
- return create_notification(dpp, topic_name, events, std::nullopt, "", y);
-}
-
-int RGWPubSub::Bucket::create_notification(const DoutPrefixProvider *dpp, const string& topic_name,const rgw::notify::EventTypeList& events, OptionalFilter s3_filter, const std::string& notif_name, optional_yield y) {
- rgw_pubsub_topic_subs topic_info;
-
- int ret = ps->get_topic(topic_name, &topic_info);
- if (ret < 0) {
- ldpp_dout(dpp, 1) << "ERROR: failed to read topic '" << topic_name << "' info: ret=" << ret << dendl;
- return ret;
- }
- ldpp_dout(dpp, 20) << "successfully read topic '" << topic_name << "' info" << dendl;
-
- RGWObjVersionTracker objv_tracker;
- rgw_pubsub_bucket_topics bucket_topics;
-
- ret = read_topics(&bucket_topics, &objv_tracker);
- if (ret < 0) {
- ldpp_dout(dpp, 1) << "ERROR: failed to read topics from bucket '" <<
- bucket.name << "': ret=" << ret << dendl;
- return ret;
- }
- ldpp_dout(dpp, 20) << "successfully read " << bucket_topics.topics.size() << " topics from bucket '" <<
- bucket.name << "'" << dendl;
-
- auto& topic_filter = bucket_topics.topics[topic_name];
- topic_filter.topic = topic_info.topic;
- topic_filter.events = events;
- topic_filter.s3_id = notif_name;
- if (s3_filter) {
- topic_filter.s3_filter = *s3_filter;
- }
-
- ret = write_topics(dpp, bucket_topics, &objv_tracker, y);
- if (ret < 0) {
- ldpp_dout(dpp, 1) << "ERROR: failed to write topics to bucket '" << bucket.name << "': ret=" << ret << dendl;
- return ret;
- }
-
- ldpp_dout(dpp, 20) << "successfully wrote " << bucket_topics.topics.size() << " topics to bucket '" << bucket.name << "'" << dendl;
-
- return 0;
-}
-
-int RGWPubSub::Bucket::remove_notification(const DoutPrefixProvider *dpp, const string& topic_name, optional_yield y)
-{
- rgw_pubsub_topic_subs topic_info;
-
- int ret = ps->get_topic(topic_name, &topic_info);
- if (ret < 0) {
- ldpp_dout(dpp, 1) << "ERROR: failed to read topic info: ret=" << ret << dendl;
- return ret;
- }
-
- RGWObjVersionTracker objv_tracker;
- rgw_pubsub_bucket_topics bucket_topics;
-
- ret = read_topics(&bucket_topics, &objv_tracker);
- if (ret < 0) {
- ldpp_dout(dpp, 1) << "ERROR: failed to read bucket topics info: ret=" << ret << dendl;
- return ret;
- }
-
- bucket_topics.topics.erase(topic_name);
-
- if (bucket_topics.topics.empty()) {
- // no more topics - delete the notification object of the bucket
- ret = ps->remove(dpp, bucket_meta_obj, &objv_tracker, y);
- if (ret < 0 && ret != -ENOENT) {
- ldpp_dout(dpp, 1) << "ERROR: failed to remove bucket topics: ret=" << ret << dendl;
- return ret;
- }
- return 0;
- }
-
- // write back the notifications without the deleted one
- ret = write_topics(dpp, bucket_topics, &objv_tracker, y);
- if (ret < 0) {
- ldpp_dout(dpp, 1) << "ERROR: failed to write topics info: ret=" << ret << dendl;
- return ret;
- }
-
- return 0;
-}
-
-int RGWPubSub::Bucket::remove_notifications(const DoutPrefixProvider *dpp, optional_yield y)
-{
- // get all topics on a bucket
- rgw_pubsub_bucket_topics bucket_topics;
- auto ret = get_topics(&bucket_topics);
- if (ret < 0 && ret != -ENOENT) {
- ldpp_dout(dpp, 1) << "ERROR: failed to get list of topics from bucket '" << bucket.name << "', ret=" << ret << dendl;
- return ret;
- }
-
- // remove all auto-generated topics
- for (const auto& topic : bucket_topics.topics) {
- const auto& topic_name = topic.first;
- ret = ps->remove_topic(dpp, topic_name, y);
- if (ret < 0 && ret != -ENOENT) {
- ldpp_dout(dpp, 5) << "WARNING: failed to remove auto-generated topic '" << topic_name << "', ret=" << ret << dendl;
- }
- }
-
- // delete the notification object of the bucket
- ret = ps->remove(dpp, bucket_meta_obj, nullptr, y);
- if (ret < 0 && ret != -ENOENT) {
- ldpp_dout(dpp, 1) << "ERROR: failed to remove bucket topics: ret=" << ret << dendl;
- return ret;
- }
-
- return 0;
-}
-
-int RGWPubSub::create_topic(const DoutPrefixProvider *dpp, const string& name, optional_yield y) {
- return create_topic(dpp, name, rgw_pubsub_sub_dest(), "", "", y);
-}
-
-int RGWPubSub::create_topic(const DoutPrefixProvider *dpp, const string& name, const rgw_pubsub_sub_dest& dest, const std::string& arn, const std::string& opaque_data, optional_yield y) {
- RGWObjVersionTracker objv_tracker;
- rgw_pubsub_topics topics;
-
- int ret = read_topics(&topics, &objv_tracker);
- if (ret < 0 && ret != -ENOENT) {
- // -ENOENT is not an error here; a missing object just means no topics exist yet
- ldpp_dout(dpp, 1) << "ERROR: failed to read topics info: ret=" << ret << dendl;
- return ret;
- }
-
- rgw_pubsub_topic_subs& new_topic = topics.topics[name];
- new_topic.topic.user = rgw_user("", tenant);
- new_topic.topic.name = name;
- new_topic.topic.dest = dest;
- new_topic.topic.arn = arn;
- new_topic.topic.opaque_data = opaque_data;
-
- ret = write_topics(dpp, topics, &objv_tracker, y);
- if (ret < 0) {
- ldpp_dout(dpp, 1) << "ERROR: failed to write topics info: ret=" << ret << dendl;
- return ret;
- }
-
- return 0;
-}
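// read_topics()/write_topics() above form a read-modify-write cycle guarded
// by RGWObjVersionTracker. A standalone sketch of that optimistic-concurrency
// pattern against a hypothetical in-memory store (all names illustrative):
#include <cerrno>
#include <cstdint>
#include <map>
#include <optional>
#include <string>

struct VersionedDoc {
  uint64_t version = 0;
  std::map<std::string, std::string> topics;
};

std::optional<VersionedDoc> g_store;  // stand-in for the sysobj store

int read_doc(VersionedDoc& out) {
  if (!g_store) return -ENOENT;
  out = *g_store;
  return 0;
}

int write_doc(const VersionedDoc& in) {
  if (g_store && g_store->version != in.version) {
    return -ECANCELED;  // another writer raced us; the caller may retry
  }
  g_store = in;
  ++g_store->version;
  return 0;
}

int create_topic_sketch(const std::string& name) {
  VersionedDoc doc;
  const int ret = read_doc(doc);
  if (ret < 0 && ret != -ENOENT) {
    return ret;  // -ENOENT just means this is the first topic
  }
  doc.topics[name] = "";  // insert-or-update, like topics.topics[name] above
  return write_doc(doc);
}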
-
-int RGWPubSub::remove_topic(const DoutPrefixProvider *dpp, const string& name, optional_yield y)
-{
- RGWObjVersionTracker objv_tracker;
- rgw_pubsub_topics topics;
-
- int ret = read_topics(&topics, &objv_tracker);
- if (ret < 0 && ret != -ENOENT) {
- ldpp_dout(dpp, 1) << "ERROR: failed to read topics info: ret=" << ret << dendl;
- return ret;
- } else if (ret == -ENOENT) {
- // it's not an error if no topics exist; deletion is just a no-op
- ldpp_dout(dpp, 10) << "WARNING: failed to read topics info, deletion is a no-op: ret=" << ret << dendl;
- return 0;
- }
-
- topics.topics.erase(name);
-
- ret = write_topics(dpp, topics, &objv_tracker, y);
- if (ret < 0) {
- ldpp_dout(dpp, 1) << "ERROR: failed to remove topics info: ret=" << ret << dendl;
- return ret;
- }
-
- return 0;
-}
-
-void RGWPubSub::get_meta_obj(rgw_raw_obj *obj) const {
- *obj = rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, meta_oid());
-}
-
-void RGWPubSub::get_bucket_meta_obj(const rgw_bucket& bucket, rgw_raw_obj *obj) const {
- *obj = rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, bucket_meta_oid(bucket));
-}
-
-void RGWPubSub::get_sub_meta_obj(const string& name, rgw_raw_obj *obj) const {
- *obj = rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sub_meta_oid(name));
-}
-
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGW_PUBSUB_H
-#define CEPH_RGW_PUBSUB_H
-
-#include "services/svc_sys_obj.h"
-#include "rgw_tools.h"
-#include "rgw_zone.h"
-#include "rgw_notify_event_type.h"
-#include <boost/container/flat_map.hpp>
-
-namespace rgw::sal { class RadosStore; }
-
-class XMLObj;
-
-struct rgw_s3_key_filter {
- std::string prefix_rule;
- std::string suffix_rule;
- std::string regex_rule;
-
- bool has_content() const;
-
- bool decode_xml(XMLObj *obj);
- void dump_xml(Formatter *f) const;
-
- void encode(bufferlist& bl) const {
- ENCODE_START(1, 1, bl);
- encode(prefix_rule, bl);
- encode(suffix_rule, bl);
- encode(regex_rule, bl);
- ENCODE_FINISH(bl);
- }
-
- void decode(bufferlist::const_iterator& bl) {
- DECODE_START(1, bl);
- decode(prefix_rule, bl);
- decode(suffix_rule, bl);
- decode(regex_rule, bl);
- DECODE_FINISH(bl);
- }
-};
-WRITE_CLASS_ENCODER(rgw_s3_key_filter)
-
-using KeyValueMap = boost::container::flat_map<std::string, std::string>;
-using KeyMultiValueMap = std::multimap<std::string, std::string>;
-
-struct rgw_s3_key_value_filter {
- KeyValueMap kv;
-
- bool has_content() const;
-
- bool decode_xml(XMLObj *obj);
- void dump_xml(Formatter *f) const;
-
- void encode(bufferlist& bl) const {
- ENCODE_START(1, 1, bl);
- encode(kv, bl);
- ENCODE_FINISH(bl);
- }
- void decode(bufferlist::const_iterator& bl) {
- DECODE_START(1, bl);
- decode(kv, bl);
- DECODE_FINISH(bl);
- }
-};
-WRITE_CLASS_ENCODER(rgw_s3_key_value_filter)
-
-struct rgw_s3_filter {
- rgw_s3_key_filter key_filter;
- rgw_s3_key_value_filter metadata_filter;
- rgw_s3_key_value_filter tag_filter;
-
- bool has_content() const;
-
- bool decode_xml(XMLObj *obj);
- void dump_xml(Formatter *f) const;
-
- void encode(bufferlist& bl) const {
- ENCODE_START(2, 1, bl);
- encode(key_filter, bl);
- encode(metadata_filter, bl);
- encode(tag_filter, bl);
- ENCODE_FINISH(bl);
- }
-
- void decode(bufferlist::const_iterator& bl) {
- DECODE_START(2, bl);
- decode(key_filter, bl);
- decode(metadata_filter, bl);
- if (struct_v >= 2) {
- decode(tag_filter, bl);
- }
- DECODE_FINISH(bl);
- }
-};
-WRITE_CLASS_ENCODER(rgw_s3_filter)
-
-using OptionalFilter = std::optional<rgw_s3_filter>;
-
-struct rgw_pubsub_topic_filter;
-/* S3 notification configuration
- * based on: https://docs.aws.amazon.com/AmazonS3/latest/API/RESTBucketPUTnotification.html
-<NotificationConfiguration xmlns="http://s3.amazonaws.com/doc/2006-03-01/">
- <TopicConfiguration>
- <Filter>
- <S3Key>
- <FilterRule>
- <Name>suffix</Name>
- <Value>jpg</Value>
- </FilterRule>
- </S3Key>
- <S3Metadata>
- <FilterRule>
- <Name></Name>
- <Value></Value>
- </FilterRule>
- </S3Metadata>
- <S3Tags>
- <FilterRule>
- <Name></Name>
- <Value></Value>
- </FilterRule>
- </S3Tags>
- </Filter>
- <Id>notification1</Id>
- <Topic>arn:aws:sns:<region>:<account>:<topic></Topic>
- <Event>s3:ObjectCreated:*</Event>
- <Event>s3:ObjectRemoved:*</Event>
- </TopicConfiguration>
-</NotificationConfiguration>
-*/
-struct rgw_pubsub_s3_notification {
- // notification id
- std::string id;
- // types of events
- rgw::notify::EventTypeList events;
- // topic ARN
- std::string topic_arn;
- // filter rules
- rgw_s3_filter filter;
-
- bool decode_xml(XMLObj *obj);
- void dump_xml(Formatter *f) const;
-
- rgw_pubsub_s3_notification() = default;
- // construct from rgw_pubsub_topic_filter (used by get/list notifications)
- explicit rgw_pubsub_s3_notification(const rgw_pubsub_topic_filter& topic_filter);
-};
-
-// return true if the key matches the prefix/suffix/regex rules of the key filter
-bool match(const rgw_s3_key_filter& filter, const std::string& key);
-
-// return true if the key matches the metadata rules of the metadata filter
-bool match(const rgw_s3_key_value_filter& filter, const KeyValueMap& kv);
-
-// return true if the key matches the tag rules of the tag filter
-bool match(const rgw_s3_key_value_filter& filter, const KeyMultiValueMap& kv);
-
-// return true if the event type matches (equal or contained in) one of the events in the list
-bool match(const rgw::notify::EventTypeList& events, rgw::notify::EventType event);
-
-struct rgw_pubsub_s3_notifications {
- std::list<rgw_pubsub_s3_notification> list;
- bool decode_xml(XMLObj *obj);
- void dump_xml(Formatter *f) const;
-};
-
-/* S3 event records structure
- * based on: https://docs.aws.amazon.com/AmazonS3/latest/dev/notification-content-structure.html
-{
-"Records":[
- {
- "eventVersion":"",
- "eventSource":"",
- "awsRegion":"",
- "eventTime":"",
- "eventName":"",
- "userIdentity":{
- "principalId":""
- },
- "requestParameters":{
- "sourceIPAddress":""
- },
- "responseElements":{
- "x-amz-request-id":"",
- "x-amz-id-2":""
- },
- "s3":{
- "s3SchemaVersion":"1.0",
- "configurationId":"",
- "bucket":{
- "name":"",
- "ownerIdentity":{
- "principalId":""
- },
- "arn":"",
- "id": ""
- },
- "object":{
- "key":"",
- "size": 0,
- "eTag":"",
- "versionId":"",
- "sequencer": "",
- "metadata": "",
- "tags": ""
- }
- },
- "eventId":""
- }
-]
-}*/
-
-struct rgw_pubsub_s3_event {
- constexpr static const char* const json_type_plural = "Records";
- std::string eventVersion = "2.2";
- // aws:s3
- std::string eventSource = "ceph:s3";
- // zonegroup
- std::string awsRegion;
- // time of the request
- ceph::real_time eventTime;
- // type of the event
- std::string eventName;
- // user that sent the request
- std::string userIdentity;
- // IP address of source of the request (not implemented)
- std::string sourceIPAddress;
- // request ID (not implemented)
- std::string x_amz_request_id;
- // radosgw that received the request
- std::string x_amz_id_2;
- std::string s3SchemaVersion = "1.0";
- // ID received in the notification request
- std::string configurationId;
- // bucket name
- std::string bucket_name;
- // bucket owner
- std::string bucket_ownerIdentity;
- // bucket ARN
- std::string bucket_arn;
- // object key
- std::string object_key;
- // object size
- uint64_t object_size = 0;
- // object etag
- std::string object_etag;
- // object version id, if the bucket is versioned
- std::string object_versionId;
- // hexadecimal value used to determine event order for a specific key
- std::string object_sequencer;
- // this is an rgw extension (not S3 standard)
- // used to store a globally unique identifier of the event
- // that could be used for acking or any other identification of the event
- std::string id;
- // this is an rgw extension holding the internal bucket id
- std::string bucket_id;
- // meta data
- KeyValueMap x_meta_map;
- // tags
- KeyMultiValueMap tags;
- // opaque data received from the topic
- // could be used to identify the gateway
- std::string opaque_data;
-
- void encode(bufferlist& bl) const {
- ENCODE_START(4, 1, bl);
- encode(eventVersion, bl);
- encode(eventSource, bl);
- encode(awsRegion, bl);
- encode(eventTime, bl);
- encode(eventName, bl);
- encode(userIdentity, bl);
- encode(sourceIPAddress, bl);
- encode(x_amz_request_id, bl);
- encode(x_amz_id_2, bl);
- encode(s3SchemaVersion, bl);
- encode(configurationId, bl);
- encode(bucket_name, bl);
- encode(bucket_ownerIdentity, bl);
- encode(bucket_arn, bl);
- encode(object_key, bl);
- encode(object_size, bl);
- encode(object_etag, bl);
- encode(object_versionId, bl);
- encode(object_sequencer, bl);
- encode(id, bl);
- encode(bucket_id, bl);
- encode(x_meta_map, bl);
- encode(tags, bl);
- encode(opaque_data, bl);
- ENCODE_FINISH(bl);
- }
-
- void decode(bufferlist::const_iterator& bl) {
- DECODE_START(4, bl);
- decode(eventVersion, bl);
- decode(eventSource, bl);
- decode(awsRegion, bl);
- decode(eventTime, bl);
- decode(eventName, bl);
- decode(userIdentity, bl);
- decode(sourceIPAddress, bl);
- decode(x_amz_request_id, bl);
- decode(x_amz_id_2, bl);
- decode(s3SchemaVersion, bl);
- decode(configurationId, bl);
- decode(bucket_name, bl);
- decode(bucket_ownerIdentity, bl);
- decode(bucket_arn, bl);
- decode(object_key, bl);
- decode(object_size, bl);
- decode(object_etag, bl);
- decode(object_versionId, bl);
- decode(object_sequencer, bl);
- decode(id, bl);
- if (struct_v >= 2) {
- decode(bucket_id, bl);
- decode(x_meta_map, bl);
- }
- if (struct_v >= 3) {
- decode(tags, bl);
- }
- if (struct_v >= 4) {
- decode(opaque_data, bl);
- }
- DECODE_FINISH(bl);
- }
-
- void dump(Formatter *f) const;
-};
-WRITE_CLASS_ENCODER(rgw_pubsub_s3_event)
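// The ENCODE_START(4, 1, ...) / "if (struct_v >= N)" pattern above is an
// append-only wire-versioning scheme: new fields are encoded at the end and
// decoded only when the peer wrote a high-enough version. A toy standalone
// equivalent with hypothetical fields (one-byte lengths, no error handling):
#include <cstdint>
#include <string>
#include <vector>

struct Blob { std::vector<uint8_t> b; size_t pos = 0; };

void put_str(Blob& bl, const std::string& s) {
  bl.b.push_back(static_cast<uint8_t>(s.size()));  // toy: length < 256
  bl.b.insert(bl.b.end(), s.begin(), s.end());
}

std::string get_str(Blob& bl) {
  const size_t n = bl.b[bl.pos++];
  std::string s(bl.b.begin() + bl.pos, bl.b.begin() + bl.pos + n);
  bl.pos += n;
  return s;
}

struct Event {
  std::string name;    // present since v1
  std::string opaque;  // appended in v2
};

void encode_v2(const Event& e, Blob& bl) {
  bl.b.push_back(2);  // struct_v
  put_str(bl, e.name);
  put_str(bl, e.opaque);
}

void decode_event(Event& e, Blob& bl) {
  const uint8_t struct_v = bl.b[bl.pos++];
  e.name = get_str(bl);
  if (struct_v >= 2) {  // a v1 encoder simply stopped after "name"
    e.opaque = get_str(bl);
  }
}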
-
-// setting a unique ID for an event based on object hash and timestamp
-void set_event_id(std::string& id, const std::string& hash, const utime_t& ts);
-
-struct rgw_pubsub_sub_dest {
- std::string bucket_name;
- std::string oid_prefix;
- std::string push_endpoint;
- std::string push_endpoint_args;
- std::string arn_topic;
- bool stored_secret = false;
- bool persistent = false;
-
- void encode(bufferlist& bl) const {
- ENCODE_START(5, 1, bl);
- encode(bucket_name, bl);
- encode(oid_prefix, bl);
- encode(push_endpoint, bl);
- encode(push_endpoint_args, bl);
- encode(arn_topic, bl);
- encode(stored_secret, bl);
- encode(persistent, bl);
- ENCODE_FINISH(bl);
- }
-
- void decode(bufferlist::const_iterator& bl) {
- DECODE_START(5, bl);
- decode(bucket_name, bl);
- decode(oid_prefix, bl);
- decode(push_endpoint, bl);
- if (struct_v >= 2) {
- decode(push_endpoint_args, bl);
- }
- if (struct_v >= 3) {
- decode(arn_topic, bl);
- }
- if (struct_v >= 4) {
- decode(stored_secret, bl);
- }
- if (struct_v >= 5) {
- decode(persistent, bl);
- }
- DECODE_FINISH(bl);
- }
-
- void dump(Formatter *f) const;
- void dump_xml(Formatter *f) const;
- std::string to_json_str() const;
-};
-WRITE_CLASS_ENCODER(rgw_pubsub_sub_dest)
-
-struct rgw_pubsub_sub_config {
- rgw_user user;
- std::string name;
- std::string topic;
- rgw_pubsub_sub_dest dest;
- std::string s3_id;
-
- void encode(bufferlist& bl) const {
- ENCODE_START(2, 1, bl);
- encode(user, bl);
- encode(name, bl);
- encode(topic, bl);
- encode(dest, bl);
- encode(s3_id, bl);
- ENCODE_FINISH(bl);
- }
-
- void decode(bufferlist::const_iterator& bl) {
- DECODE_START(2, bl);
- decode(user, bl);
- decode(name, bl);
- decode(topic, bl);
- decode(dest, bl);
- if (struct_v >= 2) {
- decode(s3_id, bl);
- }
- DECODE_FINISH(bl);
- }
-
- void dump(Formatter *f) const;
-};
-WRITE_CLASS_ENCODER(rgw_pubsub_sub_config)
-
-struct rgw_pubsub_topic {
- rgw_user user;
- std::string name;
- rgw_pubsub_sub_dest dest;
- std::string arn;
- std::string opaque_data;
-
- void encode(bufferlist& bl) const {
- ENCODE_START(3, 1, bl);
- encode(user, bl);
- encode(name, bl);
- encode(dest, bl);
- encode(arn, bl);
- encode(opaque_data, bl);
- ENCODE_FINISH(bl);
- }
-
- void decode(bufferlist::const_iterator& bl) {
- DECODE_START(3, bl);
- decode(user, bl);
- decode(name, bl);
- if (struct_v >= 2) {
- decode(dest, bl);
- decode(arn, bl);
- }
- if (struct_v >= 3) {
- decode(opaque_data, bl);
- }
- DECODE_FINISH(bl);
- }
-
- std::string to_str() const {
- return user.tenant + "/" + name;
- }
-
- void dump(Formatter *f) const;
- void dump_xml(Formatter *f) const;
- void dump_xml_as_attributes(Formatter *f) const;
-
- bool operator<(const rgw_pubsub_topic& t) const {
- // note: compare() returns an int, which as a bool was true for any inequality
- return to_str() < t.to_str();
- }
-};
-WRITE_CLASS_ENCODER(rgw_pubsub_topic)
-
-struct rgw_pubsub_topic_subs {
- rgw_pubsub_topic topic;
- std::set<std::string> subs;
-
- void encode(bufferlist& bl) const {
- ENCODE_START(1, 1, bl);
- encode(topic, bl);
- encode(subs, bl);
- ENCODE_FINISH(bl);
- }
-
- void decode(bufferlist::const_iterator& bl) {
- DECODE_START(1, bl);
- decode(topic, bl);
- decode(subs, bl);
- DECODE_FINISH(bl);
- }
-
- void dump(Formatter *f) const;
-};
-WRITE_CLASS_ENCODER(rgw_pubsub_topic_subs)
-
-struct rgw_pubsub_topic_filter {
- rgw_pubsub_topic topic;
- rgw::notify::EventTypeList events;
- std::string s3_id;
- rgw_s3_filter s3_filter;
-
- void encode(bufferlist& bl) const {
- ENCODE_START(3, 1, bl);
- encode(topic, bl);
- // events are stored as a vector of std::strings
- std::vector<std::string> tmp_events;
- std::transform(events.begin(), events.end(), std::back_inserter(tmp_events), rgw::notify::to_string);
- encode(tmp_events, bl);
- encode(s3_id, bl);
- encode(s3_filter, bl);
- ENCODE_FINISH(bl);
- }
-
- void decode(bufferlist::const_iterator& bl) {
- DECODE_START(3, bl);
- decode(topic, bl);
- // events are stored as a vector of std::strings
- events.clear();
- std::vector<std::string> tmp_events;
- decode(tmp_events, bl);
- std::transform(tmp_events.begin(), tmp_events.end(), std::back_inserter(events), rgw::notify::from_string);
- if (struct_v >= 2) {
- decode(s3_id, bl);
- }
- if (struct_v >= 3) {
- decode(s3_filter, bl);
- }
- DECODE_FINISH(bl);
- }
-
- void dump(Formatter *f) const;
-};
-WRITE_CLASS_ENCODER(rgw_pubsub_topic_filter)
-
-struct rgw_pubsub_bucket_topics {
- std::map<std::string, rgw_pubsub_topic_filter> topics;
-
- void encode(bufferlist& bl) const {
- ENCODE_START(1, 1, bl);
- encode(topics, bl);
- ENCODE_FINISH(bl);
- }
-
- void decode(bufferlist::const_iterator& bl) {
- DECODE_START(1, bl);
- decode(topics, bl);
- DECODE_FINISH(bl);
- }
-
- void dump(Formatter *f) const;
-};
-WRITE_CLASS_ENCODER(rgw_pubsub_bucket_topics)
-
-struct rgw_pubsub_topics {
- std::map<std::string, rgw_pubsub_topic_subs> topics;
-
- void encode(bufferlist& bl) const {
- ENCODE_START(1, 1, bl);
- encode(topics, bl);
- ENCODE_FINISH(bl);
- }
-
- void decode(bufferlist::const_iterator& bl) {
- DECODE_START(1, bl);
- decode(topics, bl);
- DECODE_FINISH(bl);
- }
-
- void dump(Formatter *f) const;
- void dump_xml(Formatter *f) const;
-};
-WRITE_CLASS_ENCODER(rgw_pubsub_topics)
-
-static const std::string pubsub_oid_prefix = "pubsub.";
-
-class RGWPubSub
-{
- friend class Bucket;
-
- rgw::sal::RadosStore* store;
- const std::string tenant;
- RGWSI_SysObj* svc_sysobj;
-
- rgw_raw_obj meta_obj;
-
- std::string meta_oid() const {
- return pubsub_oid_prefix + tenant;
- }
-
- std::string bucket_meta_oid(const rgw_bucket& bucket) const {
- return pubsub_oid_prefix + tenant + ".bucket." + bucket.name + "/" + bucket.marker;
- }
-
- std::string sub_meta_oid(const std::string& name) const {
- return pubsub_oid_prefix + tenant + ".sub." + name;
- }
-
- template <class T>
- int read(const rgw_raw_obj& obj, T* data, RGWObjVersionTracker* objv_tracker);
-
- template <class T>
- int write(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, const T& info,
- RGWObjVersionTracker* obj_tracker, optional_yield y);
-
- int remove(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, RGWObjVersionTracker* objv_tracker,
- optional_yield y);
-
- int read_topics(rgw_pubsub_topics *result, RGWObjVersionTracker* objv_tracker);
- int write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_topics& topics,
- RGWObjVersionTracker* objv_tracker, optional_yield y);
-
-public:
- RGWPubSub(rgw::sal::RadosStore* _store, const std::string& tenant);
-
- class Bucket {
- friend class RGWPubSub;
- RGWPubSub *ps;
- rgw_bucket bucket;
- rgw_raw_obj bucket_meta_obj;
-
- // read the list of topics associated with a bucket and populate into result
- // use the version tracker to enforce atomicity between read and write
- // return 0 on success or if no topic was associated with the bucket, error code otherwise
- int read_topics(rgw_pubsub_bucket_topics *result, RGWObjVersionTracker* objv_tracker);
- // set the list of topics associated with a bucket
- // use the version tracker to enforce atomicity between read and write
- // return 0 on success, error code otherwise
- int write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_bucket_topics& topics,
- RGWObjVersionTracker* objv_tracker, optional_yield y);
- public:
- Bucket(RGWPubSub *_ps, const rgw_bucket& _bucket) : ps(_ps), bucket(_bucket) {
- ps->get_bucket_meta_obj(bucket, &bucket_meta_obj);
- }
-
- // read the list of topics associated with a bucket and populate into result
- // return 0 on success or if no topic was associated with the bucket, error code otherwise
- int get_topics(rgw_pubsub_bucket_topics *result);
- // add a topic + filter (event list, and possibly metadata or tag filters) to a bucket
- // assigning a notification name is optional (needed for S3-compatible notifications)
- // if the topic already exists on the bucket, its filter and event list may be updated
- // for S3-compliant notifications, use the overload taking s3_filter and notif_name
- // return -ENOENT if the topic does not exist
- // return 0 on success, error code otherwise
- int create_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, const rgw::notify::EventTypeList& events, optional_yield y);
- int create_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, const rgw::notify::EventTypeList& events, OptionalFilter s3_filter, const std::string& notif_name, optional_yield y);
- // remove a topic and filter from the bucket
- // if the topic is not set on the bucket, this is a no-op (considered success)
- // return -ENOENT if the topic itself does not exist
- // return 0 on success, error code otherwise
- int remove_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, optional_yield y);
- // remove all notifications (and autogenerated topics) associated with the bucket
- // return 0 on success or if no topic was associated with the bucket, error code otherwise
- int remove_notifications(const DoutPrefixProvider *dpp, optional_yield y);
- };
-
- using BucketRef = std::shared_ptr<Bucket>;
-
- BucketRef get_bucket(const rgw_bucket& bucket) {
- return std::make_shared<Bucket>(this, bucket);
- }
-
- void get_meta_obj(rgw_raw_obj *obj) const;
- void get_bucket_meta_obj(const rgw_bucket& bucket, rgw_raw_obj *obj) const;
-
- void get_sub_meta_obj(const std::string& name, rgw_raw_obj *obj) const;
-
- // get all topics (per tenant, if used) and populate them into "result"
- // return 0 on success or if no topics exist, error code otherwise
- int get_topics(rgw_pubsub_topics *result);
- // get a topic with its subscriptions by its name and populate it into "result"
- // return -ENOENT if the topic does not exist
- // return 0 on success, error code otherwise
- int get_topic(const std::string& name, rgw_pubsub_topic_subs *result);
- // get a topic by its name and populate it into "result"
- // return -ENOENT if the topic does not exist
- // return 0 on success, error code otherwise
- int get_topic(const std::string& name, rgw_pubsub_topic *result);
- // create a topic with a name only
- // if the topic already exists it is a no-op (considered success)
- // return 0 on success, error code otherwise
- int create_topic(const DoutPrefixProvider *dpp, const std::string& name, optional_yield y);
- // create a topic with push destination information and ARN
- // if the topic already exists, the destination and ARN values may be updated (considered success)
- // return 0 on success, error code otherwise
- int create_topic(const DoutPrefixProvider *dpp, const std::string& name, const rgw_pubsub_sub_dest& dest, const std::string& arn, const std::string& opaque_data, optional_yield y);
- // remove a topic according to its name
- // if the topic does not exist it is a no-op (considered success)
- // return 0 on success, error code otherwise
- int remove_topic(const DoutPrefixProvider *dpp, const std::string& name, optional_yield y);
-};
-
-
-template <class T>
-int RGWPubSub::read(const rgw_raw_obj& obj, T* result, RGWObjVersionTracker* objv_tracker)
-{
- bufferlist bl;
- int ret = rgw_get_system_obj(svc_sysobj,
- obj.pool, obj.oid,
- bl,
- objv_tracker,
- nullptr, null_yield, nullptr, nullptr);
- if (ret < 0) {
- return ret;
- }
-
- auto iter = bl.cbegin();
- try {
- decode(*result, iter);
- } catch (buffer::error& err) {
- return -EIO;
- }
-
- return 0;
-}
-
-template <class T>
-int RGWPubSub::write(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, const T& info,
- RGWObjVersionTracker* objv_tracker, optional_yield y)
-{
- bufferlist bl;
- encode(info, bl);
-
- return rgw_put_system_obj(dpp, svc_sysobj, obj.pool, obj.oid,
- bl, false, objv_tracker, real_time(), y);
-}
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "rgw_pubsub_push.h"
-#include <string>
-#include <sstream>
-#include <algorithm>
-#include "include/buffer_fwd.h"
-#include "common/Formatter.h"
-#include "common/iso_8601.h"
-#include "common/async/completion.h"
-#include "rgw_common.h"
-#include "rgw_data_sync.h"
-#include "rgw_pubsub.h"
-#include "acconfig.h"
-#ifdef WITH_RADOSGW_AMQP_ENDPOINT
-#include "rgw_amqp.h"
-#endif
-#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
-#include "rgw_kafka.h"
-#endif
-#include <boost/asio/yield.hpp>
-#include <boost/algorithm/string.hpp>
-#include <functional>
-#include "rgw_perf_counters.h"
-
-using namespace rgw;
-
-template<typename EventType>
-std::string json_format_pubsub_event(const EventType& event) {
- std::stringstream ss;
- JSONFormatter f(false);
- {
- Formatter::ObjectSection s(f, EventType::json_type_plural);
- {
- Formatter::ArraySection s(f, EventType::json_type_plural);
- encode_json("", event, &f);
- }
- }
- f.flush(ss);
- return ss.str();
-}
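// The nested Object/Array sections above render the S3-style envelope
// {"Records":[{...}]}. A standalone sketch producing the same shape with
// plain iostreams; the payload below is hypothetical.
#include <iostream>
#include <sstream>
#include <string>

std::string wrap_records(const std::string& event_json) {
  std::ostringstream ss;
  ss << "{\"Records\":[" << event_json << "]}";
  return ss.str();
}

int main() {
  // prints: {"Records":[{"eventName":"ObjectCreated:Put"}]}
  std::cout << wrap_records(R"({"eventName":"ObjectCreated:Put"})") << "\n";
}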
-
-bool get_bool(const RGWHTTPArgs& args, const std::string& name, bool default_value) {
- bool value;
- bool exists;
- if (args.get_bool(name.c_str(), &value, &exists) == -EINVAL) {
- throw RGWPubSubEndpoint::configuration_error("invalid boolean value for " + name);
- }
- if (!exists) {
- return default_value;
- }
- return value;
-}
-
-class RGWPubSubHTTPEndpoint : public RGWPubSubEndpoint {
-private:
- const std::string endpoint;
- typedef unsigned ack_level_t;
- ack_level_t ack_level; // TODO: not used for now
- const bool verify_ssl;
- const bool cloudevents;
- static const ack_level_t ACK_LEVEL_ANY = 0;
- static const ack_level_t ACK_LEVEL_NON_ERROR = 1;
-
-public:
- RGWPubSubHTTPEndpoint(const std::string& _endpoint, const RGWHTTPArgs& args) :
- endpoint(_endpoint), verify_ssl(get_bool(args, "verify-ssl", true)), cloudevents(get_bool(args, "cloudevents", false))
- {
- bool exists;
- const auto& str_ack_level = args.get("http-ack-level", &exists);
- if (!exists || str_ack_level == "any") {
- // "any" is default
- ack_level = ACK_LEVEL_ANY;
- } else if (str_ack_level == "non-error") {
- ack_level = ACK_LEVEL_NON_ERROR;
- } else {
- ack_level = std::atoi(str_ack_level.c_str());
- if (ack_level < 100 || ack_level >= 600) {
- throw configuration_error("HTTP/S: invalid http-ack-level: " + str_ack_level);
- }
- }
- }
-
- int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override {
- bufferlist read_bl;
- RGWPostHTTPData request(cct, "POST", endpoint, &read_bl, verify_ssl);
- const auto post_data = json_format_pubsub_event(event);
- if (cloudevents) {
- // following: https://github.com/cloudevents/spec/blob/v1.0.1/http-protocol-binding.md
- // using "Binary Content Mode"
- request.append_header("ce-specversion", "1.0");
- request.append_header("ce-type", "com.amazonaws." + event.eventName);
- request.append_header("ce-time", to_iso_8601(event.eventTime));
- // default output of iso8601 is also RFC3339 compatible
- request.append_header("ce-id", event.x_amz_request_id + "." + event.x_amz_id_2);
- request.append_header("ce-source", event.eventSource + "." + event.awsRegion + "." + event.bucket_name);
- request.append_header("ce-subject", event.object_key);
- }
- request.set_post_data(post_data);
- request.set_send_length(post_data.length());
- request.append_header("Content-Type", "application/json");
- if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_pending);
- const auto rc = RGWHTTP::process(&request, y);
- if (perfcounter) perfcounter->dec(l_rgw_pubsub_push_pending);
- // TODO: use read_bl to process return code and handle according to ack level
- return rc;
- }
-
- std::string to_str() const override {
- std::string str("HTTP/S Endpoint");
- str += "\nURI: " + endpoint;
- str += (verify_ssl ? "\nverify SSL" : "\ndon't verify SSL");
- return str;
- }
-};
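// A standalone sketch of the "http-ack-level" parsing in the constructor
// above: accepted values are "any" (the default), "non-error", or a numeric
// HTTP status in [100, 600). parse_http_ack_level is a hypothetical name.
#include <cstdlib>
#include <stdexcept>
#include <string>

unsigned parse_http_ack_level(const std::string& s) {
  if (s.empty() || s == "any") return 0;  // ACK_LEVEL_ANY
  if (s == "non-error") return 1;         // ACK_LEVEL_NON_ERROR
  const int v = std::atoi(s.c_str());
  if (v < 100 || v >= 600) {
    throw std::invalid_argument("HTTP/S: invalid http-ack-level: " + s);
  }
  return static_cast<unsigned>(v);
}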
-
-#ifdef WITH_RADOSGW_AMQP_ENDPOINT
-class RGWPubSubAMQPEndpoint : public RGWPubSubEndpoint {
-private:
- enum class ack_level_t {
- None,
- Broker,
- Routable
- };
- CephContext* const cct;
- const std::string endpoint;
- const std::string topic;
- const std::string exchange;
- ack_level_t ack_level;
- amqp::connection_ptr_t conn;
-
- bool get_verify_ssl(const RGWHTTPArgs& args) {
- bool exists;
- auto str_verify_ssl = args.get("verify-ssl", &exists);
- if (!exists) {
- // verify server certificate by default
- return true;
- }
- boost::algorithm::to_lower(str_verify_ssl);
- if (str_verify_ssl == "true") {
- return true;
- }
- if (str_verify_ssl == "false") {
- return false;
- }
- throw configuration_error("'verify-ssl' must be true/false, not: " + str_verify_ssl);
- }
-
- std::string get_exchange(const RGWHTTPArgs& args) {
- bool exists;
- const auto exchange = args.get("amqp-exchange", &exists);
- if (!exists) {
- throw configuration_error("AMQP: missing amqp-exchange");
- }
- return exchange;
- }
-
- ack_level_t get_ack_level(const RGWHTTPArgs& args) {
- bool exists;
- const auto& str_ack_level = args.get("amqp-ack-level", &exists);
- if (!exists || str_ack_level == "broker") {
- // "broker" is default
- return ack_level_t::Broker;
- }
- if (str_ack_level == "none") {
- return ack_level_t::None;
- }
- if (str_ack_level == "routable") {
- return ack_level_t::Routable;
- }
- throw configuration_error("AMQP: invalid amqp-ack-level: " + str_ack_level);
- }
-
-public:
- RGWPubSubAMQPEndpoint(const std::string& _endpoint,
- const std::string& _topic,
- const RGWHTTPArgs& args,
- CephContext* _cct) :
- cct(_cct),
- endpoint(_endpoint),
- topic(_topic),
- exchange(get_exchange(args)),
- ack_level(get_ack_level(args)),
- conn(amqp::connect(endpoint, exchange, (ack_level == ack_level_t::Broker), get_verify_ssl(args), args.get_optional("ca-location"))) {
- if (!conn) {
- throw configuration_error("AMQP: failed to create connection to: " + endpoint);
- }
- }
-
- // allows waiting until "finish()" is called from a different thread
- // waiting either blocks the calling thread or yields the coroutine,
- // depending on compilation-flag support and whether the optional_yield is set
- class Waiter {
- using Signature = void(boost::system::error_code);
- using Completion = ceph::async::Completion<Signature>;
- std::unique_ptr<Completion> completion = nullptr;
- int ret;
-
- mutable std::atomic<bool> done = false;
- mutable std::mutex lock;
- mutable std::condition_variable cond;
-
- template <typename ExecutionContext, typename CompletionToken>
- auto async_wait(ExecutionContext& ctx, CompletionToken&& token) {
- boost::asio::async_completion<CompletionToken, Signature> init(token);
- auto& handler = init.completion_handler;
- {
- std::unique_lock l{lock};
- completion = Completion::create(ctx.get_executor(), std::move(handler));
- }
- return init.result.get();
- }
-
- public:
- int wait(optional_yield y) {
- if (done) {
- return ret;
- }
- if (y) {
- auto& io_ctx = y.get_io_context();
- auto& yield_ctx = y.get_yield_context();
- boost::system::error_code ec;
- async_wait(io_ctx, yield_ctx[ec]);
- return -ec.value();
- }
- std::unique_lock l(lock);
- cond.wait(l, [this]{return (done==true);});
- return ret;
- }
-
- void finish(int r) {
- std::unique_lock l{lock};
- ret = r;
- done = true;
- if (completion) {
- boost::system::error_code ec(-ret, boost::system::system_category());
- Completion::post(std::move(completion), ec);
- } else {
- cond.notify_all();
- }
- }
- };
-
- int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override {
- ceph_assert(conn);
- if (ack_level == ack_level_t::None) {
- return amqp::publish(conn, topic, json_format_pubsub_event(event));
- } else {
- // TODO: currently broker and routable are the same - this will require different flags but the same mechanism
- // note: dynamic allocation of Waiter is needed when this is invoked from a beast coroutine
- auto w = std::unique_ptr<Waiter>(new Waiter);
- const auto rc = amqp::publish_with_confirm(conn,
- topic,
- json_format_pubsub_event(event),
- std::bind(&Waiter::finish, w.get(), std::placeholders::_1));
- if (rc < 0) {
- // failed to publish, does not wait for reply
- return rc;
- }
- return w->wait(y);
- }
- }
-
- std::string to_str() const override {
- std::string str("AMQP(0.9.1) Endpoint");
- str += "\nURI: " + endpoint;
- str += "\nTopic: " + topic;
- str += "\nExchange: " + exchange;
- return str;
- }
-};
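// Stripped of the boost::asio yield path, the Waiter above behaves as a
// one-shot latch: the messaging thread calls finish(rc) from the broker
// confirmation callback and wait() blocks until it does. A standalone sketch
// of the blocking half:
#include <condition_variable>
#include <mutex>
#include <thread>

class Latch {
  std::mutex m;
  std::condition_variable cv;
  bool done = false;
  int ret = 0;
public:
  int wait() {
    std::unique_lock l(m);
    cv.wait(l, [this] { return done; });
    return ret;
  }
  void finish(int r) {
    {
      std::lock_guard l(m);
      ret = r;
      done = true;
    }
    cv.notify_all();
  }
};

int main() {
  Latch w;
  std::thread broker([&] { w.finish(0); });  // stands in for the confirm callback
  const int rc = w.wait();
  broker.join();
  return rc;
}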
-
-static const std::string AMQP_0_9_1("0-9-1");
-static const std::string AMQP_1_0("1-0");
-static const std::string AMQP_SCHEMA("amqp");
-#endif // ifdef WITH_RADOSGW_AMQP_ENDPOINT
-
-#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
-class RGWPubSubKafkaEndpoint : public RGWPubSubEndpoint {
-private:
- enum class ack_level_t {
- None,
- Broker,
- };
- CephContext* const cct;
- const std::string topic;
- kafka::connection_ptr_t conn;
- const ack_level_t ack_level;
-
-
- ack_level_t get_ack_level(const RGWHTTPArgs& args) {
- bool exists;
- const auto& str_ack_level = args.get("kafka-ack-level", &exists);
- if (!exists || str_ack_level == "broker") {
- // "broker" is default
- return ack_level_t::Broker;
- }
- if (str_ack_level == "none") {
- return ack_level_t::None;
- }
- throw configuration_error("Kafka: invalid kafka-ack-level: " + str_ack_level);
- }
-
-public:
- RGWPubSubKafkaEndpoint(const std::string& _endpoint,
- const std::string& _topic,
- const RGWHTTPArgs& args,
- CephContext* _cct) :
- cct(_cct),
- topic(_topic),
- conn(kafka::connect(_endpoint, get_bool(args, "use-ssl", false), get_bool(args, "verify-ssl", true), args.get_optional("ca-location"))) ,
- ack_level(get_ack_level(args)) {
- if (!conn) {
- throw configuration_error("Kafka: failed to create connection to: " + _endpoint);
- }
- }
-
-  // this allows waiting until "finish()" is called from a different thread
-  // waiting can either block the calling thread or yield, depending on
-  // compilation flag support and on whether the optional_yield is set
- class Waiter {
- using Signature = void(boost::system::error_code);
- using Completion = ceph::async::Completion<Signature>;
- std::unique_ptr<Completion> completion = nullptr;
- int ret;
-
- mutable std::atomic<bool> done = false;
- mutable std::mutex lock;
- mutable std::condition_variable cond;
-
- template <typename ExecutionContext, typename CompletionToken>
- auto async_wait(ExecutionContext& ctx, CompletionToken&& token) {
- boost::asio::async_completion<CompletionToken, Signature> init(token);
- auto& handler = init.completion_handler;
- {
- std::unique_lock l{lock};
- completion = Completion::create(ctx.get_executor(), std::move(handler));
- }
- return init.result.get();
- }
-
- public:
- int wait(optional_yield y) {
- if (done) {
- return ret;
- }
- if (y) {
- auto& io_ctx = y.get_io_context();
- auto& yield_ctx = y.get_yield_context();
- boost::system::error_code ec;
- async_wait(io_ctx, yield_ctx[ec]);
- return -ec.value();
- }
- std::unique_lock l(lock);
- cond.wait(l, [this]{return (done==true);});
- return ret;
- }
-
- void finish(int r) {
- std::unique_lock l{lock};
- ret = r;
- done = true;
- if (completion) {
- boost::system::error_code ec(-ret, boost::system::system_category());
- Completion::post(std::move(completion), ec);
- } else {
- cond.notify_all();
- }
- }
- };
-
- int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override {
- ceph_assert(conn);
- if (ack_level == ack_level_t::None) {
- return kafka::publish(conn, topic, json_format_pubsub_event(event));
- } else {
- // note: dynamic allocation of Waiter is needed when this is invoked from a beast coroutine
- auto w = std::unique_ptr<Waiter>(new Waiter);
- const auto rc = kafka::publish_with_confirm(conn,
- topic,
- json_format_pubsub_event(event),
- std::bind(&Waiter::finish, w.get(), std::placeholders::_1));
- if (rc < 0) {
-      // failed to publish, no reply to wait for
- return rc;
- }
- return w->wait(y);
- }
- }
-
- std::string to_str() const override {
- std::string str("Kafka Endpoint");
- str += kafka::to_string(conn);
- str += "\nTopic: " + topic;
- return str;
- }
-};
-
-static const std::string KAFKA_SCHEMA("kafka");
-#endif // ifdef WITH_RADOSGW_KAFKA_ENDPOINT
-
-static const std::string WEBHOOK_SCHEMA("webhook");
-static const std::string UNKNOWN_SCHEMA("unknown");
-static const std::string NO_SCHEMA("");
-
-const std::string& get_schema(const std::string& endpoint) {
- if (endpoint.empty()) {
- return NO_SCHEMA;
- }
- const auto pos = endpoint.find(':');
- if (pos == std::string::npos) {
- return UNKNOWN_SCHEMA;
- }
- const auto& schema = endpoint.substr(0,pos);
- if (schema == "http" || schema == "https") {
- return WEBHOOK_SCHEMA;
-#ifdef WITH_RADOSGW_AMQP_ENDPOINT
- } else if (schema == "amqp" || schema == "amqps") {
- return AMQP_SCHEMA;
-#endif
-#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
- } else if (schema == "kafka") {
- return KAFKA_SCHEMA;
-#endif
- }
- return UNKNOWN_SCHEMA;
-}
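// Illustrative results, derived from get_schema() above; the endpoint
// strings are hypothetical:
//   get_schema("")                    -> NO_SCHEMA
//   get_schema("no-colon-here")       -> UNKNOWN_SCHEMA
//   get_schema("https://host:8443/x") -> WEBHOOK_SCHEMA
//   get_schema("amqps://broker:5671") -> AMQP_SCHEMA  (when compiled in)
//   get_schema("kafka://broker:9092") -> KAFKA_SCHEMA (when compiled in)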
-
-RGWPubSubEndpoint::Ptr RGWPubSubEndpoint::create(const std::string& endpoint,
- const std::string& topic,
- const RGWHTTPArgs& args,
- CephContext* cct) {
- const auto& schema = get_schema(endpoint);
- if (schema == WEBHOOK_SCHEMA) {
- return Ptr(new RGWPubSubHTTPEndpoint(endpoint, args));
-#ifdef WITH_RADOSGW_AMQP_ENDPOINT
- } else if (schema == AMQP_SCHEMA) {
- bool exists;
- std::string version = args.get("amqp-version", &exists);
- if (!exists) {
- version = AMQP_0_9_1;
- }
- if (version == AMQP_0_9_1) {
- return Ptr(new RGWPubSubAMQPEndpoint(endpoint, topic, args, cct));
- } else if (version == AMQP_1_0) {
- throw configuration_error("AMQP: v1.0 not supported");
- return nullptr;
- } else {
- throw configuration_error("AMQP: unknown version: " + version);
- return nullptr;
- }
-#endif
-#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
- } else if (schema == KAFKA_SCHEMA) {
- return Ptr(new RGWPubSubKafkaEndpoint(endpoint, topic, args, cct));
-#endif
- }
-
- throw configuration_error("unknown schema in: " + endpoint);
- return nullptr;
-}
-
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-#pragma once
-
-#include <string>
-#include <memory>
-#include <stdexcept>
-#include "include/buffer_fwd.h"
-#include "include/common_fwd.h"
-#include "common/async/yield_context.h"
-
-// TODO the env should be used as a template parameter to differentiate the source that triggers the pushes
-class RGWDataSyncEnv;
-class RGWHTTPArgs;
-struct rgw_pubsub_s3_event;
-
-// endpoint base class; all endpoint types should derive from it
-class RGWPubSubEndpoint {
-public:
- RGWPubSubEndpoint() = default;
- // endpoint should not be copied
- RGWPubSubEndpoint(const RGWPubSubEndpoint&) = delete;
- const RGWPubSubEndpoint& operator=(const RGWPubSubEndpoint&) = delete;
-
- typedef std::unique_ptr<RGWPubSubEndpoint> Ptr;
-
- // factory method for the actual notification endpoint
-  // derived-class-specific arguments are passed in HTTP args format
- // may throw a configuration_error if creation fails
- static Ptr create(const std::string& endpoint, const std::string& topic, const RGWHTTPArgs& args, CephContext *cct=nullptr);
-
-  // this method is used to send an S3-compliant notification and to wait for
-  // completion in an async manner via a coroutine when invoked in the frontend environment
- virtual int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) = 0;
-
- // present as string
- virtual std::string to_str() const { return ""; }
-
- virtual ~RGWPubSubEndpoint() = default;
-
- // exception object for configuration error
- struct configuration_error : public std::logic_error {
- configuration_error(const std::string& what_arg) :
- std::logic_error("pubsub endpoint configuration error: " + what_arg) {}
- };
-};
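// A minimal usage sketch, not part of the original header; "endpoint",
// "topic", "args", "cct", "event" and "y" are assumed to come from the
// notification configuration and request context:
//
//   try {
//     RGWPubSubEndpoint::Ptr ep =
//         RGWPubSubEndpoint::create(endpoint, topic, args, cct);
//     int r = ep->send_to_completion_async(cct, event, y);
//   } catch (const RGWPubSubEndpoint::configuration_error& e) {
//     // creation failed; e.what() carries the reason
//   }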
-
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2018 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "rgw_aio.h"
-#include "rgw_putobj_processor.h"
-#include "rgw_multi.h"
-#include "rgw_compression.h"
-#include "services/svc_sys_obj.h"
-#include "services/svc_zone.h"
-#include "rgw_sal_rados.h"
-
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-namespace rgw::putobj {
-
-int HeadObjectProcessor::process(bufferlist&& data, uint64_t logical_offset)
-{
- const bool flush = (data.length() == 0);
-
- // capture the first chunk for special handling
- if (data_offset < head_chunk_size || data_offset == 0) {
- if (flush) {
- // flush partial chunk
- return process_first_chunk(std::move(head_data), &processor);
- }
-
- auto remaining = head_chunk_size - data_offset;
- auto count = std::min<uint64_t>(data.length(), remaining);
- data.splice(0, count, &head_data);
- data_offset += count;
-
- if (data_offset == head_chunk_size) {
- // process the first complete chunk
- ceph_assert(head_data.length() == head_chunk_size);
- int r = process_first_chunk(std::move(head_data), &processor);
- if (r < 0) {
- return r;
- }
- }
- if (data.length() == 0) { // avoid flushing stripe processor
- return 0;
- }
- }
- ceph_assert(processor); // process_first_chunk() must initialize
-
- // send everything else through the processor
- auto write_offset = data_offset;
- data_offset += data.length();
- return processor->process(std::move(data), write_offset);
-}
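// A worked example of the logic above, with assumed sizes: given
// head_chunk_size = 512 KiB and a first call carrying a 1 MiB buffer at
// logical_offset 0, the first 512 KiB are spliced into head_data,
// process_first_chunk() runs on the now-complete head chunk, and the
// remaining 512 KiB are forwarded to the returned processor at
// write_offset = 512 KiB. A later zero-length call flushes a head chunk
// that never filled up.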
-
-
-static int process_completed(const AioResultList& completed, RawObjSet *written)
-{
- std::optional<int> error;
- for (auto& r : completed) {
- if (r.result >= 0) {
- written->insert(r.obj.get_ref().obj);
- } else if (!error) { // record first error code
- error = r.result;
- }
- }
- return error.value_or(0);
-}
-
-void RadosWriter::add_write_hint(librados::ObjectWriteOperation& op) {
- const rgw_obj obj = head_obj->get_obj();
- const RGWObjStateManifest *sm = obj_ctx.get_state(obj);
- const bool compressed = sm->state.compressed;
- uint32_t alloc_hint_flags = 0;
- if (compressed) {
- alloc_hint_flags |= librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE;
- }
-
- op.set_alloc_hint2(0, 0, alloc_hint_flags);
-}
-
-int RadosWriter::set_stripe_obj(const rgw_raw_obj& raw_obj)
-{
- stripe_obj = store->svc()->rados->obj(raw_obj);
- return stripe_obj.open(dpp);
-}
-
-int RadosWriter::process(bufferlist&& bl, uint64_t offset)
-{
- bufferlist data = std::move(bl);
- const uint64_t cost = data.length();
- if (cost == 0) { // no empty writes, use aio directly for creates
- return 0;
- }
- librados::ObjectWriteOperation op;
- add_write_hint(op);
- if (offset == 0) {
- op.write_full(data);
- } else {
- op.write(offset, data);
- }
- constexpr uint64_t id = 0; // unused
- auto c = aio->get(stripe_obj, Aio::librados_op(std::move(op), y), cost, id);
- return process_completed(c, &written);
-}
-
-int RadosWriter::write_exclusive(const bufferlist& data)
-{
- const uint64_t cost = data.length();
-
- librados::ObjectWriteOperation op;
- op.create(true); // exclusive create
- add_write_hint(op);
- op.write_full(data);
-
- constexpr uint64_t id = 0; // unused
- auto c = aio->get(stripe_obj, Aio::librados_op(std::move(op), y), cost, id);
- auto d = aio->drain();
- c.splice(c.end(), d);
- return process_completed(c, &written);
-}
-
-int RadosWriter::drain()
-{
- return process_completed(aio->drain(), &written);
-}
-
-RadosWriter::~RadosWriter()
-{
- // wait on any outstanding aio completions
- process_completed(aio->drain(), &written);
-
- bool need_to_remove_head = false;
- std::optional<rgw_raw_obj> raw_head;
- if (!rgw::sal::Object::empty(head_obj.get())) {
- raw_head.emplace();
- rgw::sal::RadosObject* obj = dynamic_cast<rgw::sal::RadosObject*>(head_obj.get());
- obj->get_raw_obj(&*raw_head);
- }
-
- /**
-   * We should delete the object in the "multipart" namespace to avoid a race condition.
-   * The race is caused by the fact that the multipart object is the gatekeeper of a multipart
-   * upload: once it is deleted, a second upload can start with the same suffix ("2/"), so objects
-   * written by the second upload may be deleted by the first upload.
-   * Details are described in #11749.
-   *
-   * The above comment still stands, but instead of searching for a specific object in the multipart
-   * namespace, we just make sure that we remove the object that is marked as the head object after
-   * we remove all the other raw objects. Note that we use a different call to remove the head object,
-   * as that one needs to go through the bucket index prepare/complete 2-phase commit scheme.
- */
- for (const auto& obj : written) {
- if (raw_head && obj == *raw_head) {
- ldpp_dout(dpp, 5) << "NOTE: we should not process the head object (" << obj << ") here" << dendl;
- need_to_remove_head = true;
- continue;
- }
-
- int r = store->delete_raw_obj(dpp, obj);
- if (r < 0 && r != -ENOENT) {
- ldpp_dout(dpp, 0) << "WARNING: failed to remove obj (" << obj << "), leaked" << dendl;
- }
- }
-
- if (need_to_remove_head) {
- std::string version_id;
- ldpp_dout(dpp, 5) << "NOTE: we are going to process the head obj (" << *raw_head << ")" << dendl;
- int r = head_obj->delete_object(dpp, null_yield);
- if (r < 0 && r != -ENOENT) {
- ldpp_dout(dpp, 0) << "WARNING: failed to remove obj (" << *raw_head << "), leaked" << dendl;
- }
- }
-}
-
-
-// advance to the next stripe
-int ManifestObjectProcessor::next(uint64_t offset, uint64_t *pstripe_size)
-{
- // advance the manifest
- int r = manifest_gen.create_next(offset);
- if (r < 0) {
- return r;
- }
-
- rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
-
- uint64_t chunk_size = 0;
- r = store->get_raw_chunk_size(dpp, stripe_obj, &chunk_size);
- if (r < 0) {
- return r;
- }
- r = writer.set_stripe_obj(stripe_obj);
- if (r < 0) {
- return r;
- }
-
- chunk = ChunkProcessor(&writer, chunk_size);
- *pstripe_size = manifest_gen.cur_stripe_max_size();
- return 0;
-}
-
-
-
-int AtomicObjectProcessor::process_first_chunk(bufferlist&& data,
- DataProcessor **processor)
-{
- first_chunk = std::move(data);
- *processor = &stripe;
- return 0;
-}
-
-int AtomicObjectProcessor::prepare(optional_yield y)
-{
- uint64_t max_head_chunk_size;
- uint64_t head_max_size;
- uint64_t chunk_size = 0;
- uint64_t alignment;
-
- int r = dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->get_max_chunk_size(
- dpp, head_obj->get_bucket()->get_placement_rule(),
- &max_head_chunk_size, &alignment);
- if (r < 0) {
- return r;
- }
-
- bool same_pool = true;
- if (head_obj->get_bucket()->get_placement_rule() != tail_placement_rule) {
- if (!head_obj->placement_rules_match(head_obj->get_bucket()->get_placement_rule(), tail_placement_rule)) {
- same_pool = false;
- r = dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->get_max_chunk_size(dpp, tail_placement_rule, &chunk_size);
- if (r < 0) {
- return r;
- }
- head_max_size = 0;
- }
- }
-
- if (same_pool) {
- RGWZonePlacementInfo placement_info;
- if (!store->svc()->zone->get_zone_params().get_placement(head_obj->get_bucket()->get_placement_rule().name, &placement_info) || placement_info.inline_data) {
- head_max_size = max_head_chunk_size;
- } else {
- head_max_size = 0;
- }
- chunk_size = max_head_chunk_size;
- }
-
- uint64_t stripe_size;
- const uint64_t default_stripe_size = store->ctx()->_conf->rgw_obj_stripe_size;
-
- dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->get_max_aligned_size(
- default_stripe_size, alignment, &stripe_size);
-
- manifest.set_trivial_rule(head_max_size, stripe_size);
-
- rgw_obj obj = head_obj->get_obj();
-
- r = manifest_gen.create_begin(store->ctx(), &manifest,
- head_obj->get_bucket()->get_placement_rule(),
- &tail_placement_rule,
- obj.bucket, obj);
- if (r < 0) {
- return r;
- }
-
- rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
-
- r = writer.set_stripe_obj(stripe_obj);
- if (r < 0) {
- return r;
- }
-
- set_head_chunk_size(head_max_size);
- // initialize the processors
- chunk = ChunkProcessor(&writer, chunk_size);
- stripe = StripeProcessor(&chunk, this, head_max_size);
- return 0;
-}
-
-int AtomicObjectProcessor::complete(size_t accounted_size,
- const std::string& etag,
- ceph::real_time *mtime,
- ceph::real_time set_mtime,
- rgw::sal::Attrs& attrs,
- ceph::real_time delete_at,
- const char *if_match,
- const char *if_nomatch,
- const std::string *user_data,
- rgw_zone_set *zones_trace,
- bool *pcanceled, optional_yield y)
-{
- int r = writer.drain();
- if (r < 0) {
- return r;
- }
- const uint64_t actual_size = get_actual_size();
- r = manifest_gen.create_next(actual_size);
- if (r < 0) {
- return r;
- }
-
- head_obj->set_atomic();
-
- RGWRados::Object op_target(store->getRados(),
- head_obj->get_bucket(),
- obj_ctx, head_obj.get());
- RGWRados::Object::Write obj_op(&op_target);
-
- /* some object types shouldn't be versioned, e.g., multipart parts */
- op_target.set_versioning_disabled(!head_obj->get_bucket()->versioning_enabled());
- obj_op.meta.data = &first_chunk;
- obj_op.meta.manifest = &manifest;
- obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
- obj_op.meta.if_match = if_match;
- obj_op.meta.if_nomatch = if_nomatch;
- obj_op.meta.mtime = mtime;
- obj_op.meta.set_mtime = set_mtime;
- obj_op.meta.owner = owner;
- obj_op.meta.flags = PUT_OBJ_CREATE;
- obj_op.meta.olh_epoch = olh_epoch;
- obj_op.meta.delete_at = delete_at;
- obj_op.meta.user_data = user_data;
- obj_op.meta.zones_trace = zones_trace;
- obj_op.meta.modify_tail = true;
-
- r = obj_op.write_meta(dpp, actual_size, accounted_size, attrs, y);
- if (r < 0) {
- if (r == -ETIMEDOUT) {
-      // The head object write may eventually succeed, so clear the set of objects for deletion. If it
-      // never succeeds, we'll orphan any tail objects, as if we'd crashed before that write
- writer.clear_written();
- }
- return r;
- }
- if (!obj_op.meta.canceled) {
- // on success, clear the set of objects for deletion
- writer.clear_written();
- }
- if (pcanceled) {
- *pcanceled = obj_op.meta.canceled;
- }
- return 0;
-}
-
-
-int MultipartObjectProcessor::process_first_chunk(bufferlist&& data,
- DataProcessor **processor)
-{
- // write the first chunk of the head object as part of an exclusive create,
- // then drain to wait for the result in case of EEXIST
- int r = writer.write_exclusive(data);
- if (r == -EEXIST) {
- // randomize the oid prefix and reprepare the head/manifest
- std::string oid_rand = gen_rand_alphanumeric(store->ctx(), 32);
-
- mp.init(target_obj->get_name(), upload_id, oid_rand);
- manifest.set_prefix(target_obj->get_name() + "." + oid_rand);
-
- r = prepare_head();
- if (r < 0) {
- return r;
- }
- // resubmit the write op on the new head object
- r = writer.write_exclusive(data);
- }
- if (r < 0) {
- return r;
- }
- *processor = &stripe;
- return 0;
-}
-
-int MultipartObjectProcessor::prepare_head()
-{
- const uint64_t default_stripe_size = store->ctx()->_conf->rgw_obj_stripe_size;
- uint64_t chunk_size;
- uint64_t stripe_size;
- uint64_t alignment;
-
- int r = dynamic_cast<rgw::sal::RadosObject*>(target_obj.get())->get_max_chunk_size(dpp,
- tail_placement_rule, &chunk_size, &alignment);
- if (r < 0) {
- ldpp_dout(dpp, 0) << "ERROR: unexpected: get_max_chunk_size(): placement_rule=" << tail_placement_rule.to_str() << " obj=" << target_obj << " returned r=" << r << dendl;
- return r;
- }
- dynamic_cast<rgw::sal::RadosObject*>(target_obj.get())->get_max_aligned_size(
- default_stripe_size, alignment, &stripe_size);
-
- manifest.set_multipart_part_rule(stripe_size, part_num);
-
- r = manifest_gen.create_begin(store->ctx(), &manifest,
- head_obj->get_bucket()->get_placement_rule(),
- &tail_placement_rule,
- target_obj->get_bucket()->get_key(),
- target_obj->get_obj());
- if (r < 0) {
- return r;
- }
-
- rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
- dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->raw_obj_to_obj(stripe_obj);
- head_obj->set_hash_source(target_obj->get_name());
-
- r = writer.set_stripe_obj(stripe_obj);
- if (r < 0) {
- return r;
- }
- stripe_size = manifest_gen.cur_stripe_max_size();
- set_head_chunk_size(stripe_size);
-
- chunk = ChunkProcessor(&writer, chunk_size);
- stripe = StripeProcessor(&chunk, this, stripe_size);
- return 0;
-}
-
-int MultipartObjectProcessor::prepare(optional_yield y)
-{
- manifest.set_prefix(target_obj->get_name() + "." + upload_id);
-
- return prepare_head();
-}
-
-int MultipartObjectProcessor::complete(size_t accounted_size,
- const std::string& etag,
- ceph::real_time *mtime,
- ceph::real_time set_mtime,
- std::map<std::string, bufferlist>& attrs,
- ceph::real_time delete_at,
- const char *if_match,
- const char *if_nomatch,
- const std::string *user_data,
- rgw_zone_set *zones_trace,
- bool *pcanceled, optional_yield y)
-{
- int r = writer.drain();
- if (r < 0) {
- return r;
- }
- const uint64_t actual_size = get_actual_size();
- r = manifest_gen.create_next(actual_size);
- if (r < 0) {
- return r;
- }
-
- RGWRados::Object op_target(store->getRados(),
- head_obj->get_bucket(),
- obj_ctx, head_obj.get());
- RGWRados::Object::Write obj_op(&op_target);
-
- op_target.set_versioning_disabled(true);
- op_target.set_meta_placement_rule(&tail_placement_rule);
- obj_op.meta.set_mtime = set_mtime;
- obj_op.meta.mtime = mtime;
- obj_op.meta.owner = owner;
- obj_op.meta.delete_at = delete_at;
- obj_op.meta.zones_trace = zones_trace;
- obj_op.meta.modify_tail = true;
-
- r = obj_op.write_meta(dpp, actual_size, accounted_size, attrs, y);
- if (r < 0)
- return r;
-
- bufferlist bl;
- RGWUploadPartInfo info;
- string p = "part.";
- bool sorted_omap = is_v2_upload_id(upload_id);
-
- if (sorted_omap) {
- char buf[32];
- snprintf(buf, sizeof(buf), "%08d", part_num);
- p.append(buf);
- } else {
- p.append(part_num_str);
- }
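    // a v2 upload id yields a zero-padded key (e.g. part 7 -> "part.00000007"),
    // so omap listing returns parts in numeric order; legacy upload ids keep
    // the raw string ("part.7"), which only sorts lexically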
- info.num = part_num;
- info.etag = etag;
- info.size = actual_size;
- info.accounted_size = accounted_size;
- info.modified = real_clock::now();
- info.manifest = manifest;
-
- bool compressed;
- r = rgw_compression_info_from_attrset(attrs, compressed, info.cs_info);
- if (r < 0) {
- ldpp_dout(dpp, 1) << "cannot get compression info" << dendl;
- return r;
- }
-
- encode(info, bl);
-
- std::unique_ptr<rgw::sal::Object> meta_obj =
- head_obj->get_bucket()->get_object(rgw_obj_key(mp.get_meta(), std::string(), RGW_OBJ_NS_MULTIPART));
- meta_obj->set_in_extra_data(true);
-
- r = meta_obj->omap_set_val_by_key(dpp, p, bl, true, null_yield);
- if (r < 0) {
- return r == -ENOENT ? -ERR_NO_SUCH_UPLOAD : r;
- }
-
- if (!obj_op.meta.canceled) {
- // on success, clear the set of objects for deletion
- writer.clear_written();
- }
- if (pcanceled) {
- *pcanceled = obj_op.meta.canceled;
- }
- return 0;
-}
-
-int AppendObjectProcessor::process_first_chunk(bufferlist &&data, rgw::sal::DataProcessor **processor)
-{
- int r = writer.write_exclusive(data);
- if (r < 0) {
- return r;
- }
- *processor = &stripe;
- return 0;
-}
-
-int AppendObjectProcessor::prepare(optional_yield y)
-{
- RGWObjState *astate;
- int r = head_obj->get_obj_state(dpp, &astate, y);
- if (r < 0) {
- return r;
- }
- cur_size = astate->size;
- *cur_accounted_size = astate->accounted_size;
- if (!astate->exists) {
- if (position != 0) {
- ldpp_dout(dpp, 5) << "ERROR: Append position should be zero" << dendl;
- return -ERR_POSITION_NOT_EQUAL_TO_LENGTH;
- } else {
- cur_part_num = 1;
- //set the prefix
- char buf[33];
- gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1);
- string oid_prefix = head_obj->get_name();
- oid_prefix.append(".");
- oid_prefix.append(buf);
- oid_prefix.append("_");
- manifest.set_prefix(oid_prefix);
- }
- } else {
-    // check whether the object is appendable
- map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_APPEND_PART_NUM);
- if (iter == astate->attrset.end()) {
- ldpp_dout(dpp, 5) << "ERROR: The object is not appendable" << dendl;
- return -ERR_OBJECT_NOT_APPENDABLE;
- }
- if (position != *cur_accounted_size) {
- ldpp_dout(dpp, 5) << "ERROR: Append position should be equal to the obj size" << dendl;
- return -ERR_POSITION_NOT_EQUAL_TO_LENGTH;
- }
- try {
- using ceph::decode;
- decode(cur_part_num, iter->second);
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 5) << "ERROR: failed to decode part num" << dendl;
- return -EIO;
- }
- cur_part_num++;
- //get the current obj etag
- iter = astate->attrset.find(RGW_ATTR_ETAG);
- if (iter != astate->attrset.end()) {
- string s = rgw_string_unquote(iter->second.c_str());
- size_t pos = s.find("-");
- cur_etag = s.substr(0, pos);
- }
-
- iter = astate->attrset.find(RGW_ATTR_STORAGE_CLASS);
- if (iter != astate->attrset.end()) {
- tail_placement_rule.storage_class = iter->second.to_str();
- } else {
- tail_placement_rule.storage_class = RGW_STORAGE_CLASS_STANDARD;
- }
- cur_manifest = dynamic_cast<rgw::sal::RadosObject*>(head_obj.get())->get_manifest();
- manifest.set_prefix(cur_manifest->get_prefix());
- astate->keep_tail = true;
- }
- manifest.set_multipart_part_rule(store->ctx()->_conf->rgw_obj_stripe_size, cur_part_num);
-
- rgw_obj obj = head_obj->get_obj();
-
- r = manifest_gen.create_begin(store->ctx(), &manifest, head_obj->get_bucket()->get_placement_rule(), &tail_placement_rule, obj.bucket, obj);
- if (r < 0) {
- return r;
- }
- rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
-
- uint64_t chunk_size = 0;
- r = store->get_raw_chunk_size(dpp, stripe_obj, &chunk_size);
- if (r < 0) {
- return r;
- }
- r = writer.set_stripe_obj(std::move(stripe_obj));
- if (r < 0) {
- return r;
- }
-
- uint64_t stripe_size = manifest_gen.cur_stripe_max_size();
-
- uint64_t max_head_size = std::min(chunk_size, stripe_size);
- set_head_chunk_size(max_head_size);
-
- // initialize the processors
- chunk = ChunkProcessor(&writer, chunk_size);
- stripe = StripeProcessor(&chunk, this, stripe_size);
-
- return 0;
-}
-
-int AppendObjectProcessor::complete(size_t accounted_size, const string &etag, ceph::real_time *mtime,
- ceph::real_time set_mtime, rgw::sal::Attrs& attrs,
- ceph::real_time delete_at, const char *if_match, const char *if_nomatch,
- const string *user_data, rgw_zone_set *zones_trace, bool *pcanceled,
- optional_yield y)
-{
- int r = writer.drain();
- if (r < 0)
- return r;
- const uint64_t actual_size = get_actual_size();
- r = manifest_gen.create_next(actual_size);
- if (r < 0) {
- return r;
- }
- head_obj->set_atomic();
- RGWRados::Object op_target(store->getRados(),
- head_obj->get_bucket(),
- obj_ctx, head_obj.get());
- RGWRados::Object::Write obj_op(&op_target);
- //For Append obj, disable versioning
- op_target.set_versioning_disabled(true);
- if (cur_manifest) {
- cur_manifest->append(dpp, manifest, store->svc()->zone->get_zonegroup(), store->svc()->zone->get_zone_params());
- obj_op.meta.manifest = cur_manifest;
- } else {
- obj_op.meta.manifest = &manifest;
- }
- obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
- obj_op.meta.mtime = mtime;
- obj_op.meta.set_mtime = set_mtime;
- obj_op.meta.owner = owner;
- obj_op.meta.flags = PUT_OBJ_CREATE;
- obj_op.meta.delete_at = delete_at;
- obj_op.meta.user_data = user_data;
- obj_op.meta.zones_trace = zones_trace;
- obj_op.meta.modify_tail = true;
- obj_op.meta.appendable = true;
- //Add the append part number
- bufferlist cur_part_num_bl;
- using ceph::encode;
- encode(cur_part_num, cur_part_num_bl);
- attrs[RGW_ATTR_APPEND_PART_NUM] = cur_part_num_bl;
- //calculate the etag
- if (!cur_etag.empty()) {
- MD5 hash;
- // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
- hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
- char petag[CEPH_CRYPTO_MD5_DIGESTSIZE];
- char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE];
- char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
- hex_to_buf(cur_etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE);
- hash.Update((const unsigned char *)petag, sizeof(petag));
- hex_to_buf(etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE);
- hash.Update((const unsigned char *)petag, sizeof(petag));
- hash.Final((unsigned char *)final_etag);
- buf_to_hex((unsigned char *)final_etag, sizeof(final_etag), final_etag_str);
- snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2], sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2,
- "-%lld", (long long)cur_part_num);
- bufferlist etag_bl;
- etag_bl.append(final_etag_str, strlen(final_etag_str) + 1);
- attrs[RGW_ATTR_ETAG] = etag_bl;
- }
- r = obj_op.write_meta(dpp, actual_size + cur_size,
- accounted_size + *cur_accounted_size,
- attrs, y);
- if (r < 0) {
- return r;
- }
- if (!obj_op.meta.canceled) {
- // on success, clear the set of objects for deletion
- writer.clear_written();
- }
- if (pcanceled) {
- *pcanceled = obj_op.meta.canceled;
- }
- *cur_accounted_size += accounted_size;
-
- return 0;
-}
-
-} // namespace rgw::putobj
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2018 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-
-#include <optional>
-
-#include "rgw_putobj.h"
-#include "services/svc_rados.h"
-#include "services/svc_tier_rados.h"
-#include "rgw_sal.h"
-#include "rgw_obj_manifest.h"
-
-namespace rgw {
-
-namespace sal {
- class RadosStore;
-}
-
-class Aio;
-
-namespace putobj {
-
-// an object processor with special handling for the first chunk of the head.
-// the virtual process_first_chunk() function returns a processor to handle the
-// rest of the object
-class HeadObjectProcessor : public rgw::sal::ObjectProcessor {
- uint64_t head_chunk_size;
- // buffer to capture the first chunk of the head object
- bufferlist head_data;
- // initialized after process_first_chunk() to process everything else
- rgw::sal::DataProcessor *processor = nullptr;
-  uint64_t data_offset = 0; // maximum offset of data written (i.e. compressed)
- protected:
- uint64_t get_actual_size() const { return data_offset; }
-
- // process the first chunk of data and return a processor for the rest
- virtual int process_first_chunk(bufferlist&& data,
- rgw::sal::DataProcessor **processor) = 0;
- public:
- HeadObjectProcessor(uint64_t head_chunk_size)
- : head_chunk_size(head_chunk_size)
- {}
-
- void set_head_chunk_size(uint64_t size) { head_chunk_size = size; }
-
- // cache first chunk for process_first_chunk(), then forward everything else
- // to the returned processor
- int process(bufferlist&& data, uint64_t logical_offset) final override;
-};
-
-using RawObjSet = std::set<rgw_raw_obj>;
-
-// a data sink that writes to rados objects and deletes them on cancelation
-class RadosWriter : public rgw::sal::DataProcessor {
- Aio *const aio;
- rgw::sal::RadosStore *const store;
- RGWObjectCtx& obj_ctx;
- std::unique_ptr<rgw::sal::Object> head_obj;
- RGWSI_RADOS::Obj stripe_obj; // current stripe object
- RawObjSet written; // set of written objects for deletion
- const DoutPrefixProvider *dpp;
- optional_yield y;
-
- public:
- RadosWriter(Aio *aio, rgw::sal::RadosStore *store,
- RGWObjectCtx& obj_ctx, std::unique_ptr<rgw::sal::Object> _head_obj,
- const DoutPrefixProvider *dpp, optional_yield y)
- : aio(aio), store(store),
- obj_ctx(obj_ctx), head_obj(std::move(_head_obj)), dpp(dpp), y(y)
- {}
- RadosWriter(RadosWriter&& r)
- : aio(r.aio), store(r.store),
- obj_ctx(r.obj_ctx), head_obj(std::move(r.head_obj)), dpp(r.dpp), y(r.y)
- {}
-
- ~RadosWriter();
-
- // add alloc hint to osd
- void add_write_hint(librados::ObjectWriteOperation& op);
-
- // change the current stripe object
- int set_stripe_obj(const rgw_raw_obj& obj);
-
- // write the data at the given offset of the current stripe object
- int process(bufferlist&& data, uint64_t stripe_offset) override;
-
- // write the data as an exclusive create and wait for it to complete
- int write_exclusive(const bufferlist& data);
-
- int drain();
-
- // when the operation completes successfully, clear the set of written objects
- // so they aren't deleted on destruction
- void clear_written() { written.clear(); }
-
-};
-
-
-// a rados object processor that stripes according to RGWObjManifest
-class ManifestObjectProcessor : public HeadObjectProcessor,
- public StripeGenerator {
- protected:
- rgw::sal::RadosStore* const store;
- rgw_placement_rule tail_placement_rule;
- rgw_user owner;
- RGWObjectCtx& obj_ctx;
- std::unique_ptr<rgw::sal::Object> head_obj;
-
- RadosWriter writer;
- RGWObjManifest manifest;
- RGWObjManifest::generator manifest_gen;
- ChunkProcessor chunk;
- StripeProcessor stripe;
- const DoutPrefixProvider *dpp;
-
- // implements StripeGenerator
- int next(uint64_t offset, uint64_t *stripe_size) override;
-
- public:
- ManifestObjectProcessor(Aio *aio, rgw::sal::RadosStore* store,
- const rgw_placement_rule *ptail_placement_rule,
- const rgw_user& owner, RGWObjectCtx& _obj_ctx,
- std::unique_ptr<rgw::sal::Object> _head_obj,
- const DoutPrefixProvider* dpp, optional_yield y)
- : HeadObjectProcessor(0),
- store(store),
- owner(owner),
- obj_ctx(_obj_ctx), head_obj(std::move(_head_obj)),
- writer(aio, store, obj_ctx, head_obj->clone(), dpp, y),
- chunk(&writer, 0), stripe(&chunk, this, 0), dpp(dpp) {
- if (ptail_placement_rule) {
- tail_placement_rule = *ptail_placement_rule;
- }
- }
-
- void set_owner(const rgw_user& _owner) {
- owner = _owner;
- }
-
- void set_tail_placement(const rgw_placement_rule& tpr) {
- tail_placement_rule = tpr;
- }
- void set_tail_placement(const rgw_placement_rule&& tpr) {
- tail_placement_rule = tpr;
- }
-
-};
-
-
-// a processor that completes with an atomic write to the head object as part of
-// a bucket index transaction
-class AtomicObjectProcessor : public ManifestObjectProcessor {
- const std::optional<uint64_t> olh_epoch;
- const std::string unique_tag;
- bufferlist first_chunk; // written with the head in complete()
-
- int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override;
- public:
- AtomicObjectProcessor(Aio *aio, rgw::sal::RadosStore* store,
- const rgw_placement_rule *ptail_placement_rule,
- const rgw_user& owner,
- RGWObjectCtx& obj_ctx,
- std::unique_ptr<rgw::sal::Object> _head_obj,
- std::optional<uint64_t> olh_epoch,
- const std::string& unique_tag,
- const DoutPrefixProvider *dpp, optional_yield y)
- : ManifestObjectProcessor(aio, store, ptail_placement_rule,
- owner, obj_ctx, std::move(_head_obj), dpp, y),
- olh_epoch(olh_epoch), unique_tag(unique_tag)
- {}
-
- // prepare a trivial manifest
- int prepare(optional_yield y) override;
- // write the head object atomically in a bucket index transaction
- int complete(size_t accounted_size, const std::string& etag,
- ceph::real_time *mtime, ceph::real_time set_mtime,
- std::map<std::string, bufferlist>& attrs,
- ceph::real_time delete_at,
- const char *if_match, const char *if_nomatch,
- const std::string *user_data,
- rgw_zone_set *zones_trace, bool *canceled,
- optional_yield y) override;
-
-};
-
-
-// a processor for multipart parts, which don't require atomic completion. the
-// part's head is written with an exclusive create to detect racing uploads of
-// the same part/upload id, which are restarted with a random oid prefix
-class MultipartObjectProcessor : public ManifestObjectProcessor {
- std::unique_ptr<rgw::sal::Object> target_obj; // target multipart object
- const std::string upload_id;
- const int part_num;
- const std::string part_num_str;
- RGWMPObj mp;
-
- // write the first chunk and wait on aio->drain() for its completion.
- // on EEXIST, retry with random prefix
- int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override;
- // prepare the head stripe and manifest
- int prepare_head();
- public:
- MultipartObjectProcessor(Aio *aio, rgw::sal::RadosStore* store,
- const rgw_placement_rule *ptail_placement_rule,
- const rgw_user& owner, RGWObjectCtx& obj_ctx,
- std::unique_ptr<rgw::sal::Object> _head_obj,
- const std::string& upload_id, uint64_t part_num,
- const std::string& part_num_str,
- const DoutPrefixProvider *dpp, optional_yield y)
- : ManifestObjectProcessor(aio, store, ptail_placement_rule,
- owner, obj_ctx, std::move(_head_obj), dpp, y),
- target_obj(head_obj->clone()), upload_id(upload_id),
- part_num(part_num), part_num_str(part_num_str),
- mp(head_obj->get_name(), upload_id)
- {}
-
- // prepare a multipart manifest
- int prepare(optional_yield y) override;
- // write the head object attributes in a bucket index transaction, then
- // register the completed part with the multipart meta object
- int complete(size_t accounted_size, const std::string& etag,
- ceph::real_time *mtime, ceph::real_time set_mtime,
- std::map<std::string, bufferlist>& attrs,
- ceph::real_time delete_at,
- const char *if_match, const char *if_nomatch,
- const std::string *user_data,
- rgw_zone_set *zones_trace, bool *canceled,
- optional_yield y) override;
-
-};
-
- class AppendObjectProcessor : public ManifestObjectProcessor {
- uint64_t cur_part_num;
- uint64_t position;
- uint64_t cur_size;
- uint64_t *cur_accounted_size;
- std::string cur_etag;
- const std::string unique_tag;
-
- RGWObjManifest *cur_manifest;
-
- int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override;
-
- public:
- AppendObjectProcessor(Aio *aio, rgw::sal::RadosStore* store,
- const rgw_placement_rule *ptail_placement_rule,
- const rgw_user& owner, RGWObjectCtx& obj_ctx,
- std::unique_ptr<rgw::sal::Object> _head_obj,
- const std::string& unique_tag, uint64_t position,
- uint64_t *cur_accounted_size,
- const DoutPrefixProvider *dpp, optional_yield y)
- : ManifestObjectProcessor(aio, store, ptail_placement_rule,
- owner, obj_ctx, std::move(_head_obj), dpp, y),
- position(position), cur_size(0), cur_accounted_size(cur_accounted_size),
- unique_tag(unique_tag), cur_manifest(nullptr)
- {}
- int prepare(optional_yield y) override;
- int complete(size_t accounted_size, const std::string& etag,
- ceph::real_time *mtime, ceph::real_time set_mtime,
- std::map<std::string, bufferlist>& attrs, ceph::real_time delete_at,
- const char *if_match, const char *if_nomatch, const std::string *user_data,
- rgw_zone_set *zones_trace, bool *canceled,
- optional_yield y) override;
- };
-
-} // namespace putobj
-} // namespace rgw
-
*
*/
-#ifndef CEPH_RGW_QUOTA_H
-#define CEPH_RGW_QUOTA_H
+#pragma once
#include "include/utime.h"
#include "common/config_fwd.h"
// apply default quotas from configuration
void rgw_apply_default_bucket_quota(RGWQuotaInfo& quota, const ConfigProxy& conf);
void rgw_apply_default_user_quota(RGWQuotaInfo& quota, const ConfigProxy& conf);
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "include/compat.h"
-#include <errno.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sstream>
-
-#include <boost/algorithm/string.hpp>
-#include <string_view>
-
-#include <boost/container/flat_set.hpp>
-#include <boost/format.hpp>
-#include <boost/optional.hpp>
-#include <boost/utility/in_place_factory.hpp>
-
-#include "common/ceph_json.h"
-
-#include "common/errno.h"
-#include "common/Formatter.h"
-#include "common/Throttle.h"
-#include "common/BackTrace.h"
-
-#include "rgw_sal.h"
-#include "rgw_zone.h"
-#include "rgw_cache.h"
-#include "rgw_acl.h"
-#include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
-#include "rgw_aio_throttle.h"
-#include "driver/rados/rgw_bucket.h"
-#include "rgw_rest_conn.h"
-#include "rgw_cr_rados.h"
-#include "rgw_cr_rest.h"
-#include "rgw_datalog.h"
-#include "rgw_putobj_processor.h"
-
-#include "cls/rgw/cls_rgw_ops.h"
-#include "cls/rgw/cls_rgw_client.h"
-#include "cls/rgw/cls_rgw_const.h"
-#include "cls/refcount/cls_refcount_client.h"
-#include "cls/version/cls_version_client.h"
-#include "osd/osd_types.h"
-
-#include "rgw_tools.h"
-#include "rgw_coroutine.h"
-#include "rgw_compression.h"
-#include "rgw_etag_verifier.h"
-#include "rgw_worker.h"
-#include "rgw_notify.h"
-#include "rgw_http_errors.h"
-
-#undef fork // fails to compile RGWPeriod::fork() below
-
-#include "common/Clock.h"
-
-#include <string>
-#include <iostream>
-#include <vector>
-#include <atomic>
-#include <list>
-#include <map>
-#include "include/random.h"
-
-#include "rgw_gc.h"
-#include "rgw_lc.h"
-
-#include "rgw_object_expirer_core.h"
-#include "rgw_sync.h"
-#include "rgw_sync_counters.h"
-#include "rgw_sync_trace.h"
-#include "rgw_trim_datalog.h"
-#include "rgw_trim_mdlog.h"
-#include "rgw_data_sync.h"
-#include "rgw_realm_watcher.h"
-#include "rgw_reshard.h"
-#include "rgw_cr_rados.h"
-
-#include "services/svc_zone.h"
-#include "services/svc_zone_utils.h"
-#include "services/svc_quota.h"
-#include "services/svc_sync_modules.h"
-#include "services/svc_sys_obj.h"
-#include "services/svc_sys_obj_cache.h"
-#include "services/svc_bucket.h"
-#include "services/svc_mdlog.h"
-
-#include "compressor/Compressor.h"
-
-#include "rgw_d3n_datacache.h"
-
-#ifdef WITH_LTTNG
-#define TRACEPOINT_DEFINE
-#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
-#include "tracing/rgw_rados.h"
-#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
-#undef TRACEPOINT_DEFINE
-#else
-#define tracepoint(...)
-#endif
-
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-using namespace librados;
-
-#define ldout_bitx(_bitx, _dpp, _level) if(_bitx) { ldpp_dout(_dpp, 0) << "BITX: "
-#define ldout_bitx_c(_bitx, _ctx, _level) if(_bitx) { ldout(_ctx, 0) << "BITX: "
-#define dendl_bitx dendl ; }
-
-static string shadow_ns = "shadow";
-static string default_bucket_index_pool_suffix = "rgw.buckets.index";
-static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
-
-static RGWObjCategory main_category = RGWObjCategory::Main;
-#define RGW_USAGE_OBJ_PREFIX "usage."
-
-rgw_raw_obj rgw_obj_select::get_raw_obj(rgw::sal::RadosStore* driver) const
-{
- if (!is_raw) {
- rgw_raw_obj r;
- driver->get_raw_obj(placement_rule, obj, &r);
- return r;
- }
- return raw_obj;
-}
-
-void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation* op)
-{
- obj_version* check_objv = version_for_check();
-
- if (check_objv) {
- cls_version_check(*op, *check_objv, VER_COND_EQ);
- }
-
- cls_version_read(*op, &read_version);
-}
-
-void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
-{
- obj_version* check_objv = version_for_check();
- obj_version* modify_version = version_for_write();
-
- if (check_objv) {
- cls_version_check(*op, *check_objv, VER_COND_EQ);
- }
-
- if (modify_version) {
- cls_version_set(*op, *modify_version);
- } else {
- cls_version_inc(*op);
- }
-}
-
-void RGWObjVersionTracker::apply_write()
-{
- const bool checked = (read_version.ver != 0);
- const bool incremented = (write_version.ver == 0);
-
- if (checked && incremented) {
- // apply cls_version_inc() so our next operation can recheck it
- ++read_version.ver;
- } else {
- read_version = write_version;
- }
- write_version = obj_version();
-}
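// A worked example of the branches above: an op prepared with
// cls_version_check() against ver=5 plus the implicit cls_version_inc()
// leaves read_version.ver == 5 and write_version.ver == 0, so apply_write()
// bumps read_version to 6, mirroring the increment the OSD applied; an op
// prepared with an explicit cls_version_set() has a nonzero write_version,
// which simply replaces read_version.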
-
-RGWObjStateManifest *RGWObjectCtx::get_state(const rgw_obj& obj) {
- RGWObjStateManifest *result;
- typename std::map<rgw_obj, RGWObjStateManifest>::iterator iter;
- lock.lock_shared();
- assert (!obj.empty());
- iter = objs_state.find(obj);
- if (iter != objs_state.end()) {
- result = &iter->second;
- lock.unlock_shared();
- } else {
- lock.unlock_shared();
- lock.lock();
- result = &objs_state[obj];
- lock.unlock();
- }
- return result;
-}
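// The lookup above takes the shared lock for the common cache-hit path and
// only retakes the lock exclusively when the entry is missing; a racing
// writer may insert the entry between unlock_shared() and lock(), which
// objs_state[obj] tolerates by returning the existing entry.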
-
-void RGWObjectCtx::set_compressed(const rgw_obj& obj) {
- std::unique_lock wl{lock};
- assert (!obj.empty());
- objs_state[obj].state.compressed = true;
-}
-
-void RGWObjectCtx::set_atomic(rgw_obj& obj) {
- std::unique_lock wl{lock};
- assert (!obj.empty());
- objs_state[obj].state.is_atomic = true;
-}
-void RGWObjectCtx::set_prefetch_data(const rgw_obj& obj) {
- std::unique_lock wl{lock};
- assert (!obj.empty());
- objs_state[obj].state.prefetch_data = true;
-}
-
-void RGWObjectCtx::invalidate(const rgw_obj& obj) {
- std::unique_lock wl{lock};
- auto iter = objs_state.find(obj);
- if (iter == objs_state.end()) {
- return;
- }
- bool is_atomic = iter->second.state.is_atomic;
- bool prefetch_data = iter->second.state.prefetch_data;
- bool compressed = iter->second.state.compressed;
-
- objs_state.erase(iter);
-
- if (is_atomic || prefetch_data) {
- auto& sm = objs_state[obj];
- sm.state.is_atomic = is_atomic;
- sm.state.prefetch_data = prefetch_data;
- sm.state.compressed = compressed;
- }
-}
-
-class RGWMetaNotifierManager : public RGWCoroutinesManager {
- RGWRados* store;
- RGWHTTPManager http_manager;
-
-public:
- RGWMetaNotifierManager(RGWRados *_driver) : RGWCoroutinesManager(_driver->ctx(), _driver->get_cr_registry()), store(_driver),
- http_manager(store->ctx(), completion_mgr) {
- http_manager.start();
- }
-
- int notify_all(const DoutPrefixProvider *dpp, map<rgw_zone_id, RGWRESTConn *>& conn_map, set<int>& shards) {
- rgw_http_param_pair pairs[] = { { "type", "metadata" },
- { "notify", NULL },
- { NULL, NULL } };
-
- list<RGWCoroutinesStack *> stacks;
- for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
- RGWRESTConn *conn = iter->second;
- RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
- stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
-
- stacks.push_back(stack);
- }
- return run(dpp, stacks);
- }
-};
-
-class RGWDataNotifierManager : public RGWCoroutinesManager {
- RGWRados* store;
- RGWHTTPManager http_manager;
-
-public:
- RGWDataNotifierManager(RGWRados *_driver) : RGWCoroutinesManager(_driver->ctx(), _driver->get_cr_registry()), store(_driver),
- http_manager(store->ctx(), completion_mgr) {
- http_manager.start();
- }
-
- int notify_all(const DoutPrefixProvider *dpp, map<rgw_zone_id, RGWRESTConn *>& conn_map,
- bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& shards) {
-
- list<RGWCoroutinesStack *> stacks;
- const char *source_zone = store->svc.zone->get_zone_params().get_id().c_str();
- for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
- RGWRESTConn *conn = iter->second;
- RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
- stack->call(new RGWDataPostNotifyCR(store, http_manager, shards, source_zone, conn));
- stacks.push_back(stack);
- }
-
- return run(dpp, stacks);
- }
-};
-
-/* class RGWRadosThread */
-
-void RGWRadosThread::start()
-{
- worker = new Worker(cct, this);
- worker->create(thread_name.c_str());
-}
-
-void RGWRadosThread::stop()
-{
- down_flag = true;
- stop_process();
- if (worker) {
- worker->signal();
- worker->join();
- }
- delete worker;
- worker = NULL;
-}
-
-void *RGWRadosThread::Worker::entry() {
- uint64_t msec = processor->interval_msec();
- auto interval = std::chrono::milliseconds(msec);
-
- do {
- auto start = ceph::real_clock::now();
- int r = processor->process(this);
- if (r < 0) {
- ldpp_dout(this, 0) << "ERROR: processor->process() returned error r=" << r << dendl;
- }
-
- if (processor->going_down())
- break;
-
- auto end = ceph::real_clock::now() - start;
-
- uint64_t cur_msec = processor->interval_msec();
- if (cur_msec != msec) { /* was it reconfigured? */
- msec = cur_msec;
- interval = std::chrono::milliseconds(msec);
- }
-
- if (cur_msec > 0) {
- if (interval <= end)
- continue; // next round
-
- auto wait_time = interval - end;
- wait_interval(wait_time);
- } else {
- wait();
- }
- } while (!processor->going_down());
-
- return NULL;
-}
-
-class RGWMetaNotifier : public RGWRadosThread {
- RGWMetaNotifierManager notify_mgr;
- RGWMetadataLog *const log;
-
- uint64_t interval_msec() override {
- return cct->_conf->rgw_md_notify_interval_msec;
- }
- void stop_process() override {
- notify_mgr.stop();
- }
-public:
- RGWMetaNotifier(RGWRados *_driver, RGWMetadataLog* log)
- : RGWRadosThread(_driver, "meta-notifier"), notify_mgr(_driver), log(log) {}
-
- int process(const DoutPrefixProvider *dpp) override;
-};
-
-int RGWMetaNotifier::process(const DoutPrefixProvider *dpp)
-{
- set<int> shards;
-
- log->read_clear_modified(shards);
-
- if (shards.empty()) {
- return 0;
- }
-
- for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
- ldpp_dout(dpp, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
- }
-
- notify_mgr.notify_all(dpp, store->svc.zone->get_zone_conn_map(), shards);
-
- return 0;
-}
-
-class RGWDataNotifier : public RGWRadosThread {
- RGWDataNotifierManager notify_mgr;
- bc::flat_set<rgw_data_notify_entry> entry;
-
- uint64_t interval_msec() override {
- return cct->_conf.get_val<int64_t>("rgw_data_notify_interval_msec");
- }
- void stop_process() override {
- notify_mgr.stop();
- }
-public:
- RGWDataNotifier(RGWRados *_driver) : RGWRadosThread(_driver, "data-notifier"), notify_mgr(_driver) {}
-
- int process(const DoutPrefixProvider *dpp) override;
-};
-
-int RGWDataNotifier::process(const DoutPrefixProvider *dpp)
-{
- auto data_log = store->svc.datalog_rados;
- if (!data_log) {
- return 0;
- }
-
- auto shards = data_log->read_clear_modified();
-
- if (shards.empty()) {
- return 0;
- }
-
- for (const auto& [shard_id, entries] : shards) {
- bc::flat_set<rgw_data_notify_entry>::iterator it;
- for (const auto& entry : entries) {
- ldpp_dout(dpp, 20) << __func__ << "(): notifying datalog change, shard_id="
- << shard_id << ":" << entry.gen << ":" << entry.key << dendl;
- }
- }
-
- notify_mgr.notify_all(dpp, store->svc.zone->get_zone_data_notify_to_map(), shards);
-
- return 0;
-}
-
-class RGWSyncProcessorThread : public RGWRadosThread {
-public:
- RGWSyncProcessorThread(RGWRados *_driver, const string& thread_name = "radosgw") : RGWRadosThread(_driver, thread_name) {}
- RGWSyncProcessorThread(RGWRados *_driver) : RGWRadosThread(_driver) {}
- ~RGWSyncProcessorThread() override {}
- int init(const DoutPrefixProvider *dpp) override = 0 ;
- int process(const DoutPrefixProvider *dpp) override = 0;
-};
-
-class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
-{
- RGWMetaSyncStatusManager sync;
-
- uint64_t interval_msec() override {
- return 0; /* no interval associated, it'll run once until stopped */
- }
- void stop_process() override {
- sync.stop();
- }
-public:
- RGWMetaSyncProcessorThread(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados)
- : RGWSyncProcessorThread(_driver->getRados(), "meta-sync"), sync(_driver, async_rados) {}
-
- void wakeup_sync_shards(set<int>& shard_ids) {
- for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
- sync.wakeup(*iter);
- }
- }
- RGWMetaSyncStatusManager* get_manager() { return &sync; }
-
- int init(const DoutPrefixProvider *dpp) override {
- int ret = sync.init(dpp);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: sync.init() returned " << ret << dendl;
- return ret;
- }
- return 0;
- }
-
- int process(const DoutPrefixProvider *dpp) override {
- sync.run(dpp, null_yield);
- return 0;
- }
-};
-
-class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
-{
- PerfCountersRef counters;
- RGWDataSyncStatusManager sync;
- bool initialized;
-
- uint64_t interval_msec() override {
- if (initialized) {
- return 0; /* no interval associated, it'll run once until stopped */
- } else {
-#define DATA_SYNC_INIT_WAIT_SEC 20
- return DATA_SYNC_INIT_WAIT_SEC * 1000;
- }
- }
- void stop_process() override {
- sync.stop();
- }
-public:
- RGWDataSyncProcessorThread(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados,
- const RGWZone* source_zone)
- : RGWSyncProcessorThread(_driver->getRados(), "data-sync"),
- counters(sync_counters::build(store->ctx(), std::string("data-sync-from-") + source_zone->name)),
- sync(_driver, async_rados, source_zone->id, counters.get()),
- initialized(false) {}
-
- void wakeup_sync_shards(bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& entries) {
- for (bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >::iterator iter = entries.begin(); iter != entries.end(); ++iter) {
- sync.wakeup(iter->first, iter->second);
- }
- }
-
- RGWDataSyncStatusManager* get_manager() { return &sync; }
-
- int init(const DoutPrefixProvider *dpp) override {
- return 0;
- }
-
- int process(const DoutPrefixProvider *dpp) override {
- while (!initialized) {
- if (going_down()) {
- return 0;
- }
- int ret = sync.init(dpp);
- if (ret >= 0) {
- initialized = true;
- break;
- }
- /* we'll be back! */
- return 0;
- }
- sync.run(dpp);
- return 0;
- }
-};
-
-class RGWSyncLogTrimThread : public RGWSyncProcessorThread, DoutPrefixProvider
-{
- RGWCoroutinesManager crs;
- rgw::sal::RadosStore* store;
- rgw::BucketTrimManager *bucket_trim;
- RGWHTTPManager http;
- const utime_t trim_interval;
-
- uint64_t interval_msec() override { return 0; }
- void stop_process() override { crs.stop(); }
-public:
- RGWSyncLogTrimThread(rgw::sal::RadosStore* store, rgw::BucketTrimManager *bucket_trim,
- int interval)
- : RGWSyncProcessorThread(store->getRados(), "sync-log-trim"),
- crs(store->ctx(), store->getRados()->get_cr_registry()), store(store),
- bucket_trim(bucket_trim),
- http(store->ctx(), crs.get_completion_mgr()),
- trim_interval(interval, 0)
- {}
-
- int init(const DoutPrefixProvider *dpp) override {
- return http.start();
- }
- int process(const DoutPrefixProvider *dpp) override {
- list<RGWCoroutinesStack*> stacks;
- auto metatrimcr = create_meta_log_trim_cr(this, static_cast<rgw::sal::RadosStore*>(store), &http,
- cct->_conf->rgw_md_log_max_shards,
- trim_interval);
- if (!metatrimcr) {
- ldpp_dout(dpp, -1) << "Bailing out of trim thread!" << dendl;
- return -EINVAL;
- }
- auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
- meta->call(metatrimcr);
-
- stacks.push_back(meta);
-
- if (store->svc()->zone->sync_module_exports_data()) {
- auto data = new RGWCoroutinesStack(store->ctx(), &crs);
- data->call(create_data_log_trim_cr(dpp, static_cast<rgw::sal::RadosStore*>(store), &http,
- cct->_conf->rgw_data_log_num_shards,
- trim_interval));
- stacks.push_back(data);
-
- auto bucket = new RGWCoroutinesStack(store->ctx(), &crs);
- bucket->call(bucket_trim->create_bucket_trim_cr(&http));
- stacks.push_back(bucket);
- }
-
- crs.run(dpp, stacks);
- return 0;
- }
-
- // implements DoutPrefixProvider
- CephContext *get_cct() const override { return store->ctx(); }
- unsigned get_subsys() const override
- {
- return dout_subsys;
- }
-
- std::ostream& gen_prefix(std::ostream& out) const override
- {
- return out << "sync log trim: ";
- }
-
-};
-
-void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
-{
- std::lock_guard l{meta_sync_thread_lock};
- if (meta_sync_processor_thread) {
- meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
- }
-}
-
-void RGWRados::wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& entries)
-{
- ldpp_dout(dpp, 20) << __func__ << ": source_zone=" << source_zone << ", entries=" << entries << dendl;
- for (bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >::iterator iter = entries.begin(); iter != entries.end(); ++iter) {
- ldpp_dout(dpp, 20) << __func__ << "(): updated shard=" << iter->first << dendl;
- bc::flat_set<rgw_data_notify_entry>& entries = iter->second;
- for (const auto& [key, gen] : entries) {
- ldpp_dout(dpp, 20) << __func__ << ": source_zone=" << source_zone << ", key=" << key
- << ", gen=" << gen << dendl;
- }
- }
-
- std::lock_guard l{data_sync_thread_lock};
- auto iter = data_sync_processor_threads.find(source_zone);
- if (iter == data_sync_processor_threads.end()) {
- ldpp_dout(dpp, 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
- return;
- }
-
- RGWDataSyncProcessorThread *thread = iter->second;
- ceph_assert(thread);
- thread->wakeup_sync_shards(entries);
-}
-
-RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
-{
- std::lock_guard l{meta_sync_thread_lock};
- if (meta_sync_processor_thread) {
- return meta_sync_processor_thread->get_manager();
- }
- return nullptr;
-}
-
-RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const rgw_zone_id& source_zone)
-{
- std::lock_guard l{data_sync_thread_lock};
- auto thread = data_sync_processor_threads.find(source_zone);
- if (thread == data_sync_processor_threads.end()) {
- return nullptr;
- }
- return thread->second->get_manager();
-}
-
-int RGWRados::get_required_alignment(const DoutPrefixProvider *dpp, const rgw_pool& pool, uint64_t *alignment)
-{
- IoCtx ioctx;
- int r = open_pool_ctx(dpp, pool, ioctx, false);
- if (r < 0) {
- ldpp_dout(dpp, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
- return r;
- }
-
- bool req;
- r = ioctx.pool_requires_alignment2(&req);
- if (r < 0) {
- ldpp_dout(dpp, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
- << r << dendl;
- return r;
- }
-
- if (!req) {
- *alignment = 0;
- return 0;
- }
-
- uint64_t align;
- r = ioctx.pool_required_alignment2(&align);
- if (r < 0) {
- ldpp_dout(dpp, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
- << r << dendl;
- return r;
- }
- if (align != 0) {
- ldpp_dout(dpp, 20) << "required alignment=" << align << dendl;
- }
- *alignment = align;
- return 0;
-}
-
-void RGWRados::get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size)
-{
- if (alignment == 0) {
- *max_size = size;
- return;
- }
-
- if (size <= alignment) {
- *max_size = alignment;
- return;
- }
-
- *max_size = size - (size % alignment);
-}
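// Worked examples of the arithmetic above, with assumed sizes:
//   size = 10 MiB, alignment = 4 MiB -> *max_size = 10 - (10 % 4) = 8 MiB
//   size = 2 MiB,  alignment = 4 MiB -> *max_size = 4 MiB (one alignment unit)
//   alignment = 0                    -> *max_size = size, unchanged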
-
-int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment)
-{
- uint64_t alignment;
- int r = get_required_alignment(dpp, pool, &alignment);
- if (r < 0) {
- return r;
- }
-
- if (palignment) {
- *palignment = alignment;
- }
-
- uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
-
- get_max_aligned_size(config_chunk_size, alignment, max_chunk_size);
-
- ldpp_dout(dpp, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
-
- return 0;
-}
-
-int RGWRados::get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj,
- uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment)
-{
- rgw_pool pool;
- if (!get_obj_data_pool(placement_rule, obj, &pool)) {
- ldpp_dout(dpp, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
- return -EIO;
- }
- return get_max_chunk_size(pool, max_chunk_size, dpp, palignment);
-}
-
-void add_datalog_entry(const DoutPrefixProvider* dpp,
- RGWDataChangesLog* datalog,
- const RGWBucketInfo& bucket_info,
- uint32_t shard_id)
-{
- const auto& logs = bucket_info.layout.logs;
- if (logs.empty()) {
- return;
- }
- int r = datalog->add_entry(dpp, bucket_info, logs.back(), shard_id);
- if (r < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: failed writing data log (r=" << r << ")" << dendl;
- } // datalog error is not fatal
-}
-
-class RGWIndexCompletionManager;
-
-struct complete_op_data {
- ceph::mutex lock = ceph::make_mutex("complete_op_data");
- AioCompletion *rados_completion{nullptr};
- int manager_shard_id{-1};
- RGWIndexCompletionManager *manager{nullptr};
- rgw_obj obj;
- RGWModifyOp op;
- string tag;
- rgw_bucket_entry_ver ver;
- cls_rgw_obj_key key;
- rgw_bucket_dir_entry_meta dir_meta;
- list<cls_rgw_obj_key> remove_objs;
-  bool log_op{false};
-  uint16_t bilog_op{0};
- rgw_zone_set zones_trace;
-
- bool stopped{false};
-
- void stop() {
- std::lock_guard l{lock};
- stopped = true;
- }
-};
-
-class RGWIndexCompletionManager {
- RGWRados* const store;
- const uint32_t num_shards;
- ceph::containers::tiny_vector<ceph::mutex> locks;
- std::vector<set<complete_op_data*>> completions;
- std::vector<complete_op_data*> retry_completions;
-
- std::condition_variable cond;
- std::mutex retry_completions_lock;
- bool _stop{false};
- std::thread retry_thread;
-
- // used to distribute the completions and the locks they use across
- // their respective vectors; it will get incremented and can wrap
- // around back to 0 without issue
- std::atomic<uint32_t> cur_shard {0};
-
- void process();
-
- void add_completion(complete_op_data *completion);
-
- void stop() {
- if (retry_thread.joinable()) {
- _stop = true;
- cond.notify_all();
- retry_thread.join();
- }
-
- for (uint32_t i = 0; i < num_shards; ++i) {
- std::lock_guard l{locks[i]};
- for (auto c : completions[i]) {
- c->stop();
- }
- }
- completions.clear();
- }
-
- uint32_t next_shard() {
- return cur_shard++ % num_shards;
- }
-
-public:
- RGWIndexCompletionManager(RGWRados *_driver) :
- store(_driver),
- num_shards(store->ctx()->_conf->rgw_thread_pool_size),
- locks{ceph::make_lock_container<ceph::mutex>(
- num_shards,
- [](const size_t i) {
- return ceph::make_mutex("RGWIndexCompletionManager::lock::" +
- std::to_string(i));
- })},
- completions(num_shards),
- retry_thread(&RGWIndexCompletionManager::process, this)
- {}
-
- ~RGWIndexCompletionManager() {
- stop();
- }
-
- void create_completion(const rgw_obj& obj,
- RGWModifyOp op, string& tag,
- rgw_bucket_entry_ver& ver,
- const cls_rgw_obj_key& key,
- rgw_bucket_dir_entry_meta& dir_meta,
- list<cls_rgw_obj_key> *remove_objs, bool log_op,
- uint16_t bilog_op,
- rgw_zone_set *zones_trace,
- complete_op_data **result);
-
- bool handle_completion(completion_t cb, complete_op_data *arg);
-
- CephContext* ctx() {
- return store->ctx();
- }
-};
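-
-/* Completion/retry flow (descriptive summary of the code below):
- * create_completion() registers an entry in completions[shard] under
- * that shard's lock and attaches obj_complete_cb() as the librados
- * completion callback. handle_completion() removes the entry and, if
- * the op failed with -ERR_BUSY_RESHARDING, re-queues it through
- * add_completion(); the process() thread then replays the bucket index
- * op under guard_reshard() once resharding has finished and writes the
- * corresponding datalog entry. */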
-
-static void obj_complete_cb(completion_t cb, void *arg)
-{
- complete_op_data *completion = reinterpret_cast<complete_op_data*>(arg);
- completion->lock.lock();
- if (completion->stopped) {
- completion->lock.unlock(); /* can drop lock, no one else is referencing us */
- delete completion;
- return;
- }
- bool need_delete = completion->manager->handle_completion(cb, completion);
- completion->lock.unlock();
- if (need_delete) {
- delete completion;
- }
-}
-
-void RGWIndexCompletionManager::process()
-{
- DoutPrefix dpp(store->ctx(), dout_subsys, "rgw index completion thread: ");
- while(!_stop) {
- std::vector<complete_op_data*> comps;
-
- {
- std::unique_lock l{retry_completions_lock};
- cond.wait(l, [this](){return _stop || !retry_completions.empty();});
- if (_stop) {
- return;
- }
- retry_completions.swap(comps);
- }
-
- for (auto c : comps) {
- std::unique_ptr<complete_op_data> up{c};
-
- ldpp_dout(&dpp, 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
-
- RGWRados::BucketShard bs(store);
- RGWBucketInfo bucket_info;
-
- int r = bs.init(c->obj.bucket, c->obj, &bucket_info, &dpp);
- if (r < 0) {
- ldpp_dout(&dpp, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
- /* not much to do */
- continue;
- }
-
- r = store->guard_reshard(&dpp, &bs, c->obj, bucket_info,
- [&](RGWRados::BucketShard *bs) -> int {
- const bool bitx = ctx()->_conf->rgw_bucket_index_transaction_instrumentation;
- ldout_bitx(bitx, &dpp, 10) <<
- "ENTERING " << __func__ << ": bucket-shard=" << bs <<
- " obj=" << c->obj << " tag=" << c->tag <<
- " op=" << c->op << ", remove_objs=" << c->remove_objs << dendl_bitx;
- ldout_bitx(bitx, &dpp, 25) <<
- "BACKTRACE: " << __func__ << ": " << ClibBackTrace(1) << dendl_bitx;
-
- librados::ObjectWriteOperation o;
- o.assert_exists();
- cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
- cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
- c->log_op, c->bilog_op, &c->zones_trace);
- int ret = bs->bucket_obj.operate(&dpp, &o, null_yield);
- ldout_bitx(bitx, &dpp, 10) <<
-	    "EXITING " << __func__ << ": ret=" << ret << dendl_bitx;
- return ret;
- });
- if (r < 0) {
- ldpp_dout(&dpp, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
- /* ignoring error, can't do anything about it */
- continue;
- }
-
- add_datalog_entry(&dpp, store->svc.datalog_rados, bucket_info, bs.shard_id);
- }
- }
-}
-
-void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
- RGWModifyOp op, string& tag,
- rgw_bucket_entry_ver& ver,
- const cls_rgw_obj_key& key,
- rgw_bucket_dir_entry_meta& dir_meta,
- list<cls_rgw_obj_key> *remove_objs, bool log_op,
- uint16_t bilog_op,
- rgw_zone_set *zones_trace,
- complete_op_data **result)
-{
- complete_op_data *entry = new complete_op_data;
-
- int shard_id = next_shard();
-
- entry->manager_shard_id = shard_id;
- entry->manager = this;
- entry->obj = obj;
- entry->op = op;
- entry->tag = tag;
- entry->ver = ver;
- entry->key = key;
- entry->dir_meta = dir_meta;
- entry->log_op = log_op;
- entry->bilog_op = bilog_op;
-
- if (remove_objs) {
- for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
- entry->remove_objs.push_back(*iter);
- }
- }
-
- if (zones_trace) {
- entry->zones_trace = *zones_trace;
- } else {
- entry->zones_trace.insert(store->svc.zone->get_zone().id, obj.bucket.get_key());
- }
-
- *result = entry;
-
- entry->rados_completion = librados::Rados::aio_create_completion(entry, obj_complete_cb);
-
- std::lock_guard l{locks[shard_id]};
- const auto ok = completions[shard_id].insert(entry).second;
- ceph_assert(ok);
-}
-
-void RGWIndexCompletionManager::add_completion(complete_op_data *completion) {
- {
- std::lock_guard l{retry_completions_lock};
- retry_completions.push_back(completion);
- }
- cond.notify_all();
-}
-
-bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
-{
- int shard_id = arg->manager_shard_id;
- {
- std::lock_guard l{locks[shard_id]};
-
- auto& comps = completions[shard_id];
-
- auto iter = comps.find(arg);
- if (iter == comps.end()) {
- ldout(arg->manager->ctx(), 0) << __func__ << "(): cannot find completion for obj=" << arg->key << dendl;
- return true;
- }
-
- comps.erase(iter);
- }
-
- int r = rados_aio_get_return_value(cb);
- if (r != -ERR_BUSY_RESHARDING) {
- ldout(arg->manager->ctx(), 20) << __func__ << "(): completion " <<
- (r == 0 ? "ok" : "failed with " + to_string(r)) <<
- " for obj=" << arg->key << dendl;
- return true;
- }
- add_completion(arg);
- ldout(arg->manager->ctx(), 20) << __func__ << "(): async completion added for obj=" << arg->key << dendl;
- return false;
-}
-
-void RGWRados::finalize()
-{
- /* Before joining any sync threads, drain outstanding requests &
- * mark the async_processor as going_down() */
- if (svc.rados) {
- svc.rados->stop_processor();
- }
-
- if (run_sync_thread) {
- std::lock_guard l{meta_sync_thread_lock};
- meta_sync_processor_thread->stop();
-
- std::lock_guard dl{data_sync_thread_lock};
- for (auto iter : data_sync_processor_threads) {
- RGWDataSyncProcessorThread *thread = iter.second;
- thread->stop();
- }
- if (sync_log_trimmer) {
- sync_log_trimmer->stop();
- }
- }
- if (run_sync_thread) {
- delete meta_sync_processor_thread;
- meta_sync_processor_thread = NULL;
- std::lock_guard dl{data_sync_thread_lock};
- for (auto iter : data_sync_processor_threads) {
- RGWDataSyncProcessorThread *thread = iter.second;
- delete thread;
- }
- data_sync_processor_threads.clear();
- delete sync_log_trimmer;
- sync_log_trimmer = nullptr;
- bucket_trim = boost::none;
- }
- if (meta_notifier) {
- meta_notifier->stop();
- delete meta_notifier;
- }
- if (data_notifier) {
- data_notifier->stop();
- delete data_notifier;
- }
- delete sync_tracer;
-
- delete lc;
- lc = NULL;
-
- delete gc;
- gc = NULL;
-
- delete obj_expirer;
- obj_expirer = NULL;
-
- RGWQuotaHandler::free_handler(quota_handler);
- if (cr_registry) {
- cr_registry->put();
- }
-
- svc.shutdown();
-
- delete binfo_cache;
- delete obj_tombstone_cache;
-  delete d3n_data_cache; // deleting a null pointer is a no-op
-
- if (reshard_wait.get()) {
- reshard_wait->stop();
- reshard_wait.reset();
- }
-
- if (run_reshard_thread) {
- reshard->stop_processor();
- }
- delete reshard;
- delete index_completion_manager;
-
- rgw::notify::shutdown();
-}
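-
-/* Shutdown ordering (descriptive): the async processor is drained
- * first so no new completions arrive; sync threads are stopped before
- * they are deleted; lc/gc/expirer and the coroutine registry go down
- * before svc.shutdown(); caches, reshard and the index completion
- * manager follow; the notification manager shuts down last. */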
-
-/**
- * Initialize the RADOS instance and prepare to do other ops
- * Returns 0 on success, -ERR# on failure.
- */
-int RGWRados::init_rados()
-{
- int ret = 0;
-
- ret = rados.init_with_context(cct);
- if (ret < 0) {
- return ret;
- }
- ret = rados.connect();
- if (ret < 0) {
- return ret;
- }
-
- auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
- new RGWCoroutinesManagerRegistry(cct)};
- ret = crs->hook_to_admin_command("cr dump");
- if (ret < 0) {
- return ret;
- }
-
- cr_registry = crs.release();
-
- if (use_datacache) {
- d3n_data_cache = new D3nDataCache();
- d3n_data_cache->init(cct);
- }
-
- return ret;
-}
-
-int RGWRados::register_to_service_map(const DoutPrefixProvider *dpp, const string& daemon_type, const map<string, string>& meta)
-{
- string name = cct->_conf->name.get_id();
- if (name.compare(0, 4, "rgw.") == 0) {
- name = name.substr(4);
- }
- map<string,string> metadata = meta;
- metadata["num_handles"] = "1"s;
- metadata["zonegroup_id"] = svc.zone->get_zonegroup().get_id();
- metadata["zonegroup_name"] = svc.zone->get_zonegroup().get_name();
- metadata["zone_name"] = svc.zone->zone_name();
- metadata["zone_id"] = svc.zone->zone_id().id;
- metadata["realm_name"] = svc.zone->get_realm().get_name();
- metadata["realm_id"] = svc.zone->get_realm().get_id();
- metadata["id"] = name;
- int ret = rados.service_daemon_register(
- daemon_type,
- stringify(rados.get_instance_id()),
- metadata);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
- return ret;
- }
-
- return 0;
-}
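-
-/* Illustrative example (values are hypothetical): an instance whose
- * conf name id is "rgw.gateway1" registers in the service map with
- * id="gateway1" plus the zone/zonegroup/realm names and ids taken from
- * svc.zone, so a service dump can attribute the daemon to its
- * multisite topology. */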
-
-int RGWRados::update_service_map(const DoutPrefixProvider *dpp, std::map<std::string, std::string>&& status)
-{
- int ret = rados.service_daemon_update_status(move(status));
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: service_daemon_update_status() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
- return ret;
- }
-
- return 0;
-}
-
-/**
- * Complete initialization: open the control pools and start the
- * background services (GC, lifecycle, sync, reshard, notifications).
- * Returns 0 on success, -ERR# on failure.
- */
-int RGWRados::init_complete(const DoutPrefixProvider *dpp)
-{
- int ret;
-
-  /*
-   * create a sync module instance even if we don't run the sync thread;
-   * radosgw-admin might still need it
-   */
- sync_module = svc.sync_modules->get_sync_module();
-
- ret = open_root_pool_ctx(dpp);
- if (ret < 0)
- return ret;
-
- ret = open_gc_pool_ctx(dpp);
- if (ret < 0)
- return ret;
-
- ret = open_lc_pool_ctx(dpp);
- if (ret < 0)
- return ret;
-
- ret = open_objexp_pool_ctx(dpp);
- if (ret < 0)
- return ret;
-
- ret = open_reshard_pool_ctx(dpp);
- if (ret < 0)
- return ret;
-
- ret = open_notif_pool_ctx(dpp);
- if (ret < 0)
- return ret;
-
- pools_initialized = true;
-
- if (use_gc) {
- gc = new RGWGC();
- gc->initialize(cct, this);
- } else {
- ldpp_dout(dpp, 5) << "note: GC not initialized" << dendl;
- }
-
- obj_expirer = new RGWObjectExpirer(this->driver);
-
- if (use_gc_thread && use_gc) {
- gc->start_processor();
- obj_expirer->start_processor();
- }
-
- auto& current_period = svc.zone->get_current_period();
- auto& zonegroup = svc.zone->get_zonegroup();
- auto& zone_params = svc.zone->get_zone_params();
- auto& zone = svc.zone->get_zone();
-
-  /* no point in running the sync thread if we don't have a master zone
-   * configured or there is no rest_master_conn */
- if (!svc.zone->need_to_sync()) {
- run_sync_thread = false;
- }
-
- if (svc.zone->is_meta_master()) {
- auto md_log = svc.mdlog->get_log(current_period.get_id());
- meta_notifier = new RGWMetaNotifier(this, md_log);
- meta_notifier->start();
- }
-
- /* init it anyway, might run sync through radosgw-admin explicitly */
- sync_tracer = new RGWSyncTraceManager(cct, cct->_conf->rgw_sync_trace_history_size);
- sync_tracer->init(this);
- ret = sync_tracer->hook_to_admin_command();
- if (ret < 0) {
- return ret;
- }
-
- if (run_sync_thread) {
- for (const auto &pt: zonegroup.placement_targets) {
- if (zone_params.placement_pools.find(pt.second.name)
- == zone_params.placement_pools.end()){
- ldpp_dout(dpp, 0) << "WARNING: This zone does not contain the placement target "
- << pt.second.name << " present in zonegroup" << dendl;
- }
- }
- auto async_processor = svc.rados->get_async_processor();
- std::lock_guard l{meta_sync_thread_lock};
- meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this->driver, async_processor);
- ret = meta_sync_processor_thread->init(dpp);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
- return ret;
- }
- meta_sync_processor_thread->start();
-
- // configure the bucket trim manager
- rgw::BucketTrimConfig config;
- rgw::configure_bucket_trim(cct, config);
-
- bucket_trim.emplace(this->driver, config);
- ret = bucket_trim->init();
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: failed to start bucket trim manager" << dendl;
- return ret;
- }
- svc.datalog_rados->set_observer(&*bucket_trim);
-
- std::lock_guard dl{data_sync_thread_lock};
- for (auto source_zone : svc.zone->get_data_sync_source_zones()) {
- ldpp_dout(dpp, 5) << "starting data sync thread for zone " << source_zone->name << dendl;
- auto *thread = new RGWDataSyncProcessorThread(this->driver, svc.rados->get_async_processor(), source_zone);
- ret = thread->init(dpp);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: failed to initialize data sync thread" << dendl;
- return ret;
- }
- thread->start();
- data_sync_processor_threads[rgw_zone_id(source_zone->id)] = thread;
- }
- auto interval = cct->_conf->rgw_sync_log_trim_interval;
- if (interval > 0) {
- sync_log_trimmer = new RGWSyncLogTrimThread(this->driver, &*bucket_trim, interval);
- ret = sync_log_trimmer->init(dpp);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
- return ret;
- }
- sync_log_trimmer->start();
- }
- }
- if (cct->_conf->rgw_data_notify_interval_msec) {
- data_notifier = new RGWDataNotifier(this);
- data_notifier->start();
- }
-
- binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
- binfo_cache->init(svc.cache);
-
- lc = new RGWLC();
- lc->initialize(cct, this->driver);
-
- if (use_lc_thread)
- lc->start_processor();
-
- quota_handler = RGWQuotaHandler::generate_handler(dpp, this->driver, quota_threads);
-
- bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
- zone.bucket_index_max_shards);
- if (bucket_index_max_shards > get_max_bucket_shards()) {
- bucket_index_max_shards = get_max_bucket_shards();
- ldpp_dout(dpp, 1) << __func__ << " bucket index max shards is too large, reset to value: "
- << get_max_bucket_shards() << dendl;
- }
- ldpp_dout(dpp, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;
-
- bool need_tombstone_cache = !svc.zone->get_zone_data_notify_to_map().empty(); /* have zones syncing from us */
-
- if (need_tombstone_cache) {
- obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
- }
-
- reshard_wait = std::make_shared<RGWReshardWait>();
-
- reshard = new RGWReshard(this->driver);
-
- // disable reshard thread based on zone/zonegroup support
- run_reshard_thread = run_reshard_thread && svc.zone->can_reshard();
-
- if (run_reshard_thread) {
- reshard->start_processor();
- }
-
- index_completion_manager = new RGWIndexCompletionManager(this);
- ret = rgw::notify::init(cct, driver, dpp);
- if (ret < 0 ) {
- ldpp_dout(dpp, 1) << "ERROR: failed to initialize notification manager" << dendl;
- }
-
- return ret;
-}
-
-int RGWRados::init_svc(bool raw, const DoutPrefixProvider *dpp)
-{
- if (raw) {
- return svc.init_raw(cct, use_cache, null_yield, dpp);
- }
-
- return svc.init(cct, use_cache, run_sync_thread, null_yield, dpp);
-}
-
-int RGWRados::init_ctl(const DoutPrefixProvider *dpp)
-{
- return ctl.init(&svc, driver, dpp);
-}
-
-/**
- * Begin initialization: set up the services and controllers, then
- * initialize the RADOS handle itself.
- * Returns 0 on success, -ERR# on failure.
- */
-int RGWRados::init_begin(const DoutPrefixProvider *dpp)
-{
- int ret;
-
- inject_notify_timeout_probability =
- cct->_conf.get_val<double>("rgw_inject_notify_timeout_probability");
- max_notify_retries = cct->_conf.get_val<uint64_t>("rgw_max_notify_retries");
-
- ret = init_svc(false, dpp);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl;
- return ret;
- }
-
- ret = init_ctl(dpp);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: failed to init ctls (ret=" << cpp_strerror(-ret) << ")" << dendl;
- return ret;
- }
-
- host_id = svc.zone_utils->gen_host_id();
-
- return init_rados();
-}
-
-/**
- * Open the pool used as root for this gateway
- * Returns: 0 on success, -ERR# otherwise.
- */
-int RGWRados::open_root_pool_ctx(const DoutPrefixProvider *dpp)
-{
- return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().domain_root, root_pool_ctx, true, true);
-}
-
-int RGWRados::open_gc_pool_ctx(const DoutPrefixProvider *dpp)
-{
- return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().gc_pool, gc_pool_ctx, true, true);
-}
-
-int RGWRados::open_lc_pool_ctx(const DoutPrefixProvider *dpp)
-{
- return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().lc_pool, lc_pool_ctx, true, true);
-}
-
-int RGWRados::open_objexp_pool_ctx(const DoutPrefixProvider *dpp)
-{
- return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, objexp_pool_ctx, true, true);
-}
-
-int RGWRados::open_reshard_pool_ctx(const DoutPrefixProvider *dpp)
-{
- return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().reshard_pool, reshard_pool_ctx, true, true);
-}
-
-int RGWRados::open_notif_pool_ctx(const DoutPrefixProvider *dpp)
-{
- return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().notif_pool, notif_pool_ctx, true, true);
-}
-
-int RGWRados::open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx& io_ctx,
- bool mostly_omap)
-{
- constexpr bool create = true; // create the pool if it doesn't exist
- return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create, mostly_omap);
-}
-
-/**** logs ****/
-
-struct log_list_state {
- string prefix;
- librados::IoCtx io_ctx;
- librados::NObjectIterator obit;
-};
-
-int RGWRados::log_list_init(const DoutPrefixProvider *dpp, const string& prefix, RGWAccessHandle *handle)
-{
- log_list_state *state = new log_list_state;
- int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
- if (r < 0) {
- delete state;
- return r;
- }
- state->prefix = prefix;
- state->obit = state->io_ctx.nobjects_begin();
- *handle = (RGWAccessHandle)state;
- return 0;
-}
-
-int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
-{
- log_list_state *state = static_cast<log_list_state *>(handle);
- while (true) {
- if (state->obit == state->io_ctx.nobjects_end()) {
- delete state;
- return -ENOENT;
- }
- if (state->prefix.length() &&
- state->obit->get_oid().find(state->prefix) != 0) {
- state->obit++;
- continue;
- }
- *name = state->obit->get_oid();
- state->obit++;
- break;
- }
- return 0;
-}
-
-int RGWRados::log_remove(const DoutPrefixProvider *dpp, const string& name)
-{
- librados::IoCtx io_ctx;
- int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
- if (r < 0)
- return r;
- return io_ctx.remove(name);
-}
-
-struct log_show_state {
- librados::IoCtx io_ctx;
- bufferlist bl;
- bufferlist::const_iterator p;
- string name;
- uint64_t pos;
- bool eof;
- log_show_state() : pos(0), eof(false) {}
-};
-
-int RGWRados::log_show_init(const DoutPrefixProvider *dpp, const string& name, RGWAccessHandle *handle)
-{
- log_show_state *state = new log_show_state;
- int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
- if (r < 0) {
- delete state;
- return r;
- }
- state->name = name;
- *handle = (RGWAccessHandle)state;
- return 0;
-}
-
-int RGWRados::log_show_next(const DoutPrefixProvider *dpp, RGWAccessHandle handle, rgw_log_entry *entry)
-{
- log_show_state *state = static_cast<log_show_state *>(handle);
- off_t off = state->p.get_off();
-
- ldpp_dout(dpp, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
- << " off " << off
- << " eof " << (int)state->eof
- << dendl;
- // read some?
- unsigned chunk = 1024*1024;
- if ((state->bl.length() - off) < chunk/2 && !state->eof) {
- bufferlist more;
- int r = state->io_ctx.read(state->name, more, chunk, state->pos);
- if (r < 0)
- return r;
- state->pos += r;
- bufferlist old;
- try {
- old.substr_of(state->bl, off, state->bl.length() - off);
- } catch (buffer::error& err) {
- return -EINVAL;
- }
- state->bl = std::move(old);
- state->bl.claim_append(more);
- state->p = state->bl.cbegin();
- if ((unsigned)r < chunk)
- state->eof = true;
- ldpp_dout(dpp, 10) << " read " << r << dendl;
- }
-
- if (state->p.end())
- return 0; // end of file
- try {
- decode(*entry, state->p);
- }
- catch (const buffer::error &e) {
- return -EINVAL;
- }
- return 1;
-}
-
-/**
- * usage_log_hash: get usage log key hash, based on name and index
- *
- * Get the usage object name. Since a user may have more than one
- * object holding that info (multiple shards), we use index to
- * specify the shard number. Once index exceeds the maximum number
- * of shards it wraps.
- * If name is empty, results for all users are returned, and index
- * wraps only after the total number of shards.
- *
- * @param cct [in] ceph context
- * @param name [in] user name
- * @param hash [out] hash value
- * @param index [in] shard index number
- */
-static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
-{
- uint32_t val = index;
-
- if (!name.empty()) {
- int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
- val %= max_user_shards;
- val += ceph_str_hash_linux(name.c_str(), name.size());
- }
- char buf[17];
- int max_shards = cct->_conf->rgw_usage_max_shards;
- snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
- hash = buf;
-}
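-
-/* Worked example (illustrative, assuming rgw_usage_max_user_shards=1
- * and rgw_usage_max_shards=32): for name="alice", val reduces to
- * ceph_str_hash_linux("alice", 5), and the object name becomes
- * RGW_USAGE_OBJ_PREFIX followed by (val % 32); an empty name simply
- * cycles index through all 32 shard objects. */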
-
-int RGWRados::log_usage(const DoutPrefixProvider *dpp, map<rgw_user_bucket, RGWUsageBatch>& usage_info)
-{
- uint32_t index = 0;
-
- map<string, rgw_usage_log_info> log_objs;
-
- string hash;
- string last_user;
-
- /* restructure usage map, zone by object hash */
- map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
- for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
- const rgw_user_bucket& ub = iter->first;
- RGWUsageBatch& info = iter->second;
-
- if (ub.user.empty()) {
- ldpp_dout(dpp, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
- continue;
- }
-
- if (ub.user != last_user) {
-      /* index *should* be random, but in most cases max user shards
-         won't exceed 1, so don't waste the extra cycles and just
-         increment it */
- usage_log_hash(cct, ub.user, hash, index++);
- }
- last_user = ub.user;
- vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
-
- for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
- v.push_back(miter->second);
- }
- }
-
- map<string, rgw_usage_log_info>::iterator liter;
-
- for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
- int r = cls_obj_usage_log_add(dpp, liter->first, liter->second);
- if (r < 0)
- return r;
- }
- return 0;
-}
-
-int RGWRados::read_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch,
- uint32_t max_entries, bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket,
- rgw_usage_log_entry>& usage)
-{
- uint32_t num = max_entries;
- string hash, first_hash;
- string user_str = user.to_str();
- usage_log_hash(cct, user_str, first_hash, 0);
-
- if (usage_iter.index) {
- usage_log_hash(cct, user_str, hash, usage_iter.index);
- } else {
- hash = first_hash;
- }
-
- usage.clear();
-
- do {
- map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
- map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
-
- int ret = cls_obj_usage_log_read(dpp, hash, user_str, bucket_name, start_epoch, end_epoch, num,
- usage_iter.read_iter, ret_usage, is_truncated);
- if (ret == -ENOENT)
- goto next;
-
- if (ret < 0)
- return ret;
-
- num -= ret_usage.size();
-
- for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
- usage[iter->first].aggregate(iter->second);
- }
-
-next:
- if (!*is_truncated) {
- usage_iter.read_iter.clear();
- usage_log_hash(cct, user_str, hash, ++usage_iter.index);
- }
- } while (num && !*is_truncated && hash != first_hash);
- return 0;
-}
-
-int RGWRados::trim_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch)
-{
- uint32_t index = 0;
- string hash, first_hash;
- string user_str = user.to_str();
- usage_log_hash(cct, user_str, first_hash, index);
-
- hash = first_hash;
- do {
- int ret = cls_obj_usage_log_trim(dpp, hash, user_str, bucket_name, start_epoch, end_epoch);
-
- if (ret < 0 && ret != -ENOENT)
- return ret;
-
- usage_log_hash(cct, user_str, hash, ++index);
- } while (hash != first_hash);
-
- return 0;
-}
-
-
-int RGWRados::clear_usage(const DoutPrefixProvider *dpp)
-{
- auto max_shards = cct->_conf->rgw_usage_max_shards;
-  int ret = 0;
-  for (unsigned i = 0; i < max_shards; i++) {
-    string oid = RGW_USAGE_OBJ_PREFIX + to_string(i);
-    ret = cls_obj_usage_log_clear(dpp, oid);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "usage clear on oid=" << oid << " failed with ret=" << ret << dendl;
-      return ret;
-    }
-  }
- return ret;
-}
-
-int RGWRados::decode_policy(const DoutPrefixProvider *dpp,
- ceph::buffer::list& bl,
- ACLOwner *owner)
-{
- auto i = bl.cbegin();
- RGWAccessControlPolicy policy(cct);
- try {
- policy.decode_owner(i);
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
- return -EIO;
- }
- *owner = policy.get_owner();
- return 0;
-}
-
-int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id, const DoutPrefixProvider *dpp)
-{
- rgw_bucket bucket = bucket_info.bucket;
- bucket.update_bucket_id(new_bucket_id);
-
- bucket_info.objv_tracker.clear();
- int ret = store->get_bucket_instance_info(bucket, bucket_info, nullptr, nullptr, null_yield, dpp);
- if (ret < 0) {
- return ret;
- }
-
- return 0;
-}
-
-
-/**
- * Get ordered listing of the objects in a bucket.
- *
- * max_p: maximum number of results to return
- * bucket: bucket to list contents of
- * prefix: only return results that match this prefix
- * delim: delimiter; results whose names contain this string after the
- * prefix are not returned individually. The matching portion of each
- * such name (up to and including the delimiter) is recorded in
- * common_prefixes with a "true" mark.
- * marker: if filled in, begin the listing with this object.
- * end_marker: if filled in, end the listing with this object.
- * result: the objects are put in here.
- * common_prefixes: if delim is filled in, any matching prefixes are
- * placed here.
- * is_truncated: set to true if the bucket contains more matching
- * objects than max.
- */
-int RGWRados::Bucket::List::list_objects_ordered(
- const DoutPrefixProvider *dpp,
- int64_t max_p,
- std::vector<rgw_bucket_dir_entry> *result,
- std::map<std::string, bool> *common_prefixes,
- bool *is_truncated,
- optional_yield y)
-{
- RGWRados *store = target->get_store();
- CephContext *cct = store->ctx();
- int shard_id = target->get_shard_id();
- const auto& current_index = target->get_bucket_info().layout.current_index;
-
- int count = 0;
- bool truncated = true;
- bool cls_filtered = false;
- const int64_t max = // protect against memory issues and negative vals
- std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
- int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead, max);
-
- result->clear();
-
- // use a local marker; either the marker will have a previous entry
- // or it will be empty; either way it's OK to copy
- rgw_obj_key marker_obj(params.marker.name,
- params.marker.instance,
- params.ns.empty() ? params.marker.ns : params.ns);
- rgw_obj_index_key cur_marker;
- marker_obj.get_index_key(&cur_marker);
-
- rgw_obj_key end_marker_obj(params.end_marker.name,
- params.end_marker.instance,
- params.ns.empty() ? params.end_marker.ns : params.ns);
- rgw_obj_index_key cur_end_marker;
- end_marker_obj.get_index_key(&cur_end_marker);
- const bool cur_end_marker_valid = !params.end_marker.empty();
-
- rgw_obj_key prefix_obj(params.prefix);
- prefix_obj.set_ns(params.ns);
- std::string cur_prefix = prefix_obj.get_index_key_name();
- std::string after_delim_s; /* needed in !params.delim.empty() AND later */
-
- if (!params.delim.empty()) {
- after_delim_s = cls_rgw_after_delim(params.delim);
- /* if marker points at a common prefix, fast forward it into its
- * upper bound string */
- int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
- if (delim_pos >= 0) {
- string s = cur_marker.name.substr(0, delim_pos);
- s.append(after_delim_s);
- cur_marker = s;
- }
- }
-
- // we'll stop after this many attempts as long we return at least
- // one entry; but we will also go beyond this number of attempts
- // until we return at least one entry
- constexpr uint16_t SOFT_MAX_ATTEMPTS = 8;
-
- rgw_obj_index_key prev_marker;
- for (uint16_t attempt = 1; /* empty */; ++attempt) {
- ldpp_dout(dpp, 20) << __func__ <<
- ": starting attempt " << attempt << dendl;
-
- if (attempt > 1 && !(prev_marker < cur_marker)) {
- // we've failed to make forward progress
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
- " marker failed to make forward progress; attempt=" << attempt <<
- ", prev_marker=" << prev_marker <<
- ", cur_marker=" << cur_marker << dendl;
- break;
- }
- prev_marker = cur_marker;
-
- ent_map_t ent_map;
- ent_map.reserve(read_ahead);
- int r = store->cls_bucket_list_ordered(dpp,
- target->get_bucket_info(),
- current_index,
- shard_id,
- cur_marker,
- cur_prefix,
- params.delim,
- read_ahead + 1 - count,
- params.list_versions,
- attempt,
- ent_map,
- &truncated,
- &cls_filtered,
- &cur_marker,
- y,
- params.force_check_filter);
- if (r < 0) {
- return r;
- }
-
- for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
- rgw_bucket_dir_entry& entry = eiter->second;
- rgw_obj_index_key index_key = entry.key;
- rgw_obj_key obj(index_key);
-
- ldpp_dout(dpp, 20) << __func__ <<
- ": considering entry " << entry.key << dendl;
-
- /* note that parse_raw_oid() here will not set the correct
- * object's instance, as rgw_obj_index_key encodes that
- * separately. We don't need to set the instance because it's
- * not needed for the checks here and we end up using the raw
- * entry for the return vector
- */
- bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
- if (!valid) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
- " could not parse object name: " << obj.name << dendl;
- continue;
- }
-
- bool matched_ns = (obj.ns == params.ns);
- if (!params.list_versions && !entry.is_visible()) {
- ldpp_dout(dpp, 10) << __func__ <<
-	  ": skipping non-visible entry \"" << entry.key << "\"" << dendl;
- continue;
- }
-
- if (params.enforce_ns && !matched_ns) {
- if (!params.ns.empty()) {
- /* we've iterated past the namespace we're searching -- done now */
- truncated = false;
- ldpp_dout(dpp, 10) << __func__ <<
- ": finished due to getting past requested namespace \"" <<
- params.ns << "\"" << dendl;
- goto done;
- }
-
- /* we're skipping past namespaced objects */
- ldpp_dout(dpp, 20) << __func__ <<
- ": skipping past namespaced objects, including \"" << entry.key <<
- "\"" << dendl;
- continue;
- }
-
- if (cur_end_marker_valid && cur_end_marker <= index_key) {
- truncated = false;
- ldpp_dout(dpp, 10) << __func__ <<
-	  ": finished due to hitting end marker of \"" << cur_end_marker <<
- "\" with \"" << entry.key << "\"" << dendl;
- goto done;
- }
-
- if (count < max) {
- params.marker = index_key;
- next_marker = index_key;
- }
-
- if (params.access_list_filter &&
- ! params.access_list_filter->filter(obj.name, index_key.name)) {
- ldpp_dout(dpp, 20) << __func__ <<
-	  ": skipping entry \"" << entry.key <<
-	  "\" that doesn't pass the access list filter" << dendl;
- continue;
- }
-
- if (params.prefix.size() &&
- 0 != obj.name.compare(0, params.prefix.size(), params.prefix)) {
- ldpp_dout(dpp, 20) << __func__ <<
- ": skipping object \"" << entry.key <<
- "\" that doesn't match prefix \"" << params.prefix << "\"" << dendl;
- continue;
- }
-
- if (!params.delim.empty()) {
- const int delim_pos = obj.name.find(params.delim, params.prefix.size());
- if (delim_pos >= 0) {
- // run either the code where delimiter filtering is done a)
- // in the OSD/CLS or b) here.
- if (cls_filtered) {
-	    // NOTE: this branch is for newer versions of the OSD that
-	    // do the filtering on the CLS side; they should only find
-	    // one delimiter, at the end of the name, if they find any
-	    // after the prefix
- if (delim_pos !=
- int(obj.name.length() - params.delim.length())) {
- ldpp_dout(dpp, 0) << "WARNING: " << __func__ <<
- " found delimiter in place other than the end of "
- "the prefix; obj.name=" << obj.name <<
- ", prefix=" << params.prefix << dendl;
- }
- if (common_prefixes) {
- if (count >= max) {
- truncated = true;
- ldpp_dout(dpp, 10) << __func__ <<
- ": stopping early with common prefix \"" << entry.key <<
- "\" because requested number (" << max <<
- ") reached (cls filtered)" << dendl;
- goto done;
- }
-
- (*common_prefixes)[obj.name] = true;
- count++;
- }
-
- ldpp_dout(dpp, 20) << __func__ <<
- ": finished entry with common prefix \"" << entry.key <<
- "\" so continuing loop (cls filtered)" << dendl;
- continue;
- } else {
- // NOTE: this condition is for older versions of the OSD
- // that do not filter on the CLS side, so the following code
- // must do the filtering; once we reach version 16 of ceph,
- // this code can be removed along with the conditional that
- // can lead this way
-
- /* extract key -with trailing delimiter- for CommonPrefix */
- string prefix_key =
- obj.name.substr(0, delim_pos + params.delim.length());
-
- if (common_prefixes &&
- common_prefixes->find(prefix_key) == common_prefixes->end()) {
- if (count >= max) {
- truncated = true;
- ldpp_dout(dpp, 10) << __func__ <<
- ": stopping early with common prefix \"" << entry.key <<
- "\" because requested number (" << max <<
- ") reached (not cls filtered)" << dendl;
- goto done;
- }
- next_marker = prefix_key;
- (*common_prefixes)[prefix_key] = true;
-
- count++;
- }
-
- ldpp_dout(dpp, 20) << __func__ <<
- ": finished entry with common prefix \"" << entry.key <<
- "\" so continuing loop (not cls filtered)" << dendl;
- continue;
- } // if we're running an older OSD version
- } // if a delimiter was found after prefix
- } // if a delimiter was passed in
-
- if (count >= max) {
- truncated = true;
- ldpp_dout(dpp, 10) << __func__ <<
- ": stopping early with entry \"" << entry.key <<
- "\" because requested number (" << max <<
- ") reached" << dendl;
- goto done;
- }
-
- ldpp_dout(dpp, 20) << __func__ <<
- ": adding entry " << entry.key << " to result" << dendl;
-
- result->emplace_back(std::move(entry));
- count++;
- } // eiter for loop
-
- // NOTE: the following conditional is needed by older versions of
- // the OSD that don't do delimiter filtering on the CLS side; once
- // we reach version 16 of ceph, the following conditional and the
- // code within can be removed
- if (!cls_filtered && !params.delim.empty()) {
- int marker_delim_pos =
- cur_marker.name.find(params.delim, cur_prefix.size());
- if (marker_delim_pos >= 0) {
- std::string skip_after_delim =
- cur_marker.name.substr(0, marker_delim_pos);
- skip_after_delim.append(after_delim_s);
-
- ldpp_dout(dpp, 20) << __func__ <<
- ": skip_after_delim=" << skip_after_delim << dendl;
-
- if (skip_after_delim > cur_marker.name) {
- cur_marker = skip_after_delim;
- ldpp_dout(dpp, 20) << __func__ <<
- ": setting cur_marker=" << cur_marker.name <<
- "[" << cur_marker.instance << "]" << dendl;
- }
- }
- } // if older osd didn't do delimiter filtering
-
- ldpp_dout(dpp, 10) << __func__ <<
- ": end of outer loop, truncated=" << truncated <<
- ", count=" << count << ", attempt=" << attempt << dendl;
-
- if (!truncated || count >= (max + 1) / 2) {
- // if we finished listing, or if we're returning at least half the
- // requested entries, that's enough; S3 and swift protocols allow
- // returning fewer than max entries
- ldpp_dout(dpp, 10) << __func__ <<
- ": exiting attempt loop because we reached end (" << truncated <<
- ") or we're returning half the requested entries (" << count <<
- " of " << max << ")" << dendl;
- break;
- } else if (attempt > SOFT_MAX_ATTEMPTS && count >= 1) {
- // if we've made at least 8 attempts and we have some, but very
- // few, results, return with what we have
- ldpp_dout(dpp, 10) << __func__ <<
- ": exiting attempt loop because we made " << attempt <<
- " attempts and we're returning " << count << " entries" << dendl;
- break;
- }
- } // for (uint16_t attempt...
-
-done:
-
- if (is_truncated) {
- *is_truncated = truncated;
- }
-
- return 0;
-} // list_objects_ordered
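-
-/* Minimal caller sketch (hypothetical; 'store', 'bucket_info' and the
- * listing parameters are illustrative, not from the original code):
- *
- *   RGWRados::Bucket target(store, bucket_info);
- *   RGWRados::Bucket::List list_op(&target);
- *   list_op.params.prefix = "photos/";
- *   list_op.params.delim = "/";
- *   std::vector<rgw_bucket_dir_entry> results;
- *   std::map<std::string, bool> prefixes;
- *   bool truncated = false;
- *   int r = list_op.list_objects_ordered(dpp, 1000, &results, &prefixes,
- *                                        &truncated, null_yield);
- */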
-
-
-/**
- * Get listing of the objects in a bucket and allow the results to be out
- * of order.
- *
- * Even though there are key differences with the ordered counterpart,
- * the parameters are the same to maintain some compatibility.
- *
- * max_p: maximum number of results to return
- * bucket: bucket to list contents of
- * prefix: only return results that match this prefix
- * delim: should not be set; if it is, an error should already have been indicated
- * marker: if filled in, begin the listing with this object.
- * end_marker: if filled in, end the listing with this object.
- * result: the objects are put in here.
- * common_prefixes: this is never filled with an unordered list; the param
- * is maintained for compatibility
- * is_truncated: set to true if the bucket contains more matching objects
- * than max.
- */
-int RGWRados::Bucket::List::list_objects_unordered(const DoutPrefixProvider *dpp,
- int64_t max_p,
- std::vector<rgw_bucket_dir_entry>* result,
- std::map<std::string, bool>* common_prefixes,
- bool* is_truncated,
- optional_yield y)
-{
- RGWRados *store = target->get_store();
- int shard_id = target->get_shard_id();
- const auto& current_index = target->get_bucket_info().layout.current_index;
-
- int count = 0;
- bool truncated = true;
-
- const int64_t max = // protect against memory issues and negative vals
- std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
-
- // read a few extra in each call to cls_bucket_list_unordered in
- // case some are filtered out due to namespace matching, versioning,
- // filtering, etc.
- const int64_t max_read_ahead = 100;
- const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead));
-
- result->clear();
-
- // use a local marker; either the marker will have a previous entry
- // or it will be empty; either way it's OK to copy
- rgw_obj_key marker_obj(params.marker.name,
- params.marker.instance,
- params.ns.empty() ? params.marker.ns : params.ns);
- rgw_obj_index_key cur_marker;
- marker_obj.get_index_key(&cur_marker);
-
- rgw_obj_key end_marker_obj(params.end_marker.name,
- params.end_marker.instance,
- params.ns.empty() ? params.end_marker.ns : params.ns);
- rgw_obj_index_key cur_end_marker;
- end_marker_obj.get_index_key(&cur_end_marker);
- const bool cur_end_marker_valid = !params.end_marker.empty();
-
- rgw_obj_key prefix_obj(params.prefix);
- prefix_obj.set_ns(params.ns);
- std::string cur_prefix = prefix_obj.get_index_key_name();
-
- while (truncated && count <= max) {
- std::vector<rgw_bucket_dir_entry> ent_list;
- ent_list.reserve(read_ahead);
-
- int r = store->cls_bucket_list_unordered(dpp,
- target->get_bucket_info(),
- current_index,
- shard_id,
- cur_marker,
- cur_prefix,
- read_ahead,
- params.list_versions,
- ent_list,
- &truncated,
- &cur_marker,
- y);
- if (r < 0) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
- " cls_bucket_list_unordered returned " << r << " for " <<
- target->get_bucket_info().bucket << dendl;
- return r;
- }
-
- // NB: while regions of ent_list will be sorted, we have no
- // guarantee that all items will be sorted since they can cross
- // shard boundaries
-
- for (auto& entry : ent_list) {
- rgw_obj_index_key index_key = entry.key;
- rgw_obj_key obj(index_key);
-
- if (count < max) {
- params.marker.set(index_key);
- next_marker.set(index_key);
- }
-
- /* note that parse_raw_oid() here will not set the correct
- * object's instance, as rgw_obj_index_key encodes that
- * separately. We don't need to set the instance because it's
- * not needed for the checks here and we end up using the raw
- * entry for the return vector
- */
- bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
- if (!valid) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
- " could not parse object name: " << obj.name << dendl;
- continue;
- }
-
- if (!params.list_versions && !entry.is_visible()) {
- ldpp_dout(dpp, 20) << __func__ <<
-	  ": skipping \"" << index_key <<
-	  "\" because not listing versions and entry not visible" << dendl;
- continue;
- }
-
- if (params.enforce_ns && obj.ns != params.ns) {
- ldpp_dout(dpp, 20) << __func__ <<
-	  ": skipping \"" << index_key <<
- "\" because namespace does not match" << dendl;
- continue;
- }
-
- if (cur_end_marker_valid && cur_end_marker <= index_key) {
- // we're not guaranteed items will come in order, so we have
- // to loop through all
- ldpp_dout(dpp, 20) << __func__ <<
-	  ": skipping \"" << index_key <<
- "\" because after end_marker" << dendl;
- continue;
- }
-
- if (params.access_list_filter &&
- !params.access_list_filter->filter(obj.name, index_key.name)) {
- ldpp_dout(dpp, 20) << __func__ <<
-	  ": skipping \"" << index_key <<
- "\" because doesn't match filter" << dendl;
- continue;
- }
-
- if (params.prefix.size() &&
- (0 != obj.name.compare(0, params.prefix.size(), params.prefix))) {
- ldpp_dout(dpp, 20) << __func__ <<
-	  ": skipping \"" << index_key <<
- "\" because doesn't match prefix" << dendl;
- continue;
- }
-
- if (count >= max) {
- truncated = true;
- goto done;
- }
-
- result->emplace_back(std::move(entry));
- count++;
- } // for (auto& entry : ent_list)
- } // while (truncated && count <= max)
-
-done:
-
- if (is_truncated) {
- *is_truncated = truncated;
- }
-
- return 0;
-} // list_objects_unordered
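-
-/* Design note (descriptive): unordered listing avoids the merge-sort
- * across bucket index shards that the ordered variant performs, so it
- * scales better for very large buckets, but entries can arrive out of
- * order across shard boundaries; that is why end_marker can only skip
- * individual entries here rather than terminate the scan. */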
-
-
-/**
- * create a rados pool, associated meta info
- * returns 0 on success, -ERR# otherwise.
- */
-int RGWRados::create_pool(const DoutPrefixProvider *dpp, const rgw_pool& pool)
-{
- librados::IoCtx io_ctx;
- constexpr bool create = true;
- return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create);
-}
-
-void RGWRados::create_bucket_id(string *bucket_id)
-{
- uint64_t iid = instance_id();
- uint64_t bid = next_bucket_id();
- char buf[svc.zone->get_zone_params().get_id().size() + 48];
- snprintf(buf, sizeof(buf), "%s.%" PRIu64 ".%" PRIu64,
- svc.zone->get_zone_params().get_id().c_str(), iid, bid);
- *bucket_id = buf;
-}
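-
-/* Example of a generated id (values are illustrative): with zone
- * params id "c0ffee", instance id 4213 and bucket counter 17, the
- * resulting marker/bucket_id is "c0ffee.4213.17". */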
-
-int RGWRados::create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket,
- const string& zonegroup_id,
- const rgw_placement_rule& placement_rule,
- const string& swift_ver_location,
- const RGWQuotaInfo * pquota_info,
- map<std::string, bufferlist>& attrs,
- RGWBucketInfo& info,
- obj_version *pobjv,
- obj_version *pep_objv,
- real_time creation_time,
- rgw_bucket *pmaster_bucket,
- uint32_t *pmaster_num_shards,
- optional_yield y,
- const DoutPrefixProvider *dpp,
- bool exclusive)
-{
-#define MAX_CREATE_RETRIES 20 /* need to bound retries */
- rgw_placement_rule selected_placement_rule;
- RGWZonePlacementInfo rule_info;
-
- for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
- int ret = 0;
- ret = svc.zone->select_bucket_placement(dpp, owner, zonegroup_id, placement_rule,
- &selected_placement_rule, &rule_info, y);
- if (ret < 0)
- return ret;
-
- if (!pmaster_bucket) {
- create_bucket_id(&bucket.marker);
- bucket.bucket_id = bucket.marker;
- } else {
- bucket.marker = pmaster_bucket->marker;
- bucket.bucket_id = pmaster_bucket->bucket_id;
- }
-
- RGWObjVersionTracker& objv_tracker = info.objv_tracker;
-
- objv_tracker.read_version.clear();
-
- if (pobjv) {
- objv_tracker.write_version = *pobjv;
- } else {
- objv_tracker.generate_new_write_ver(cct);
- }
-
- info.bucket = bucket;
- info.owner = owner.user_id;
- info.zonegroup = zonegroup_id;
- info.placement_rule = selected_placement_rule;
- info.swift_ver_location = swift_ver_location;
- info.swift_versioning = (!swift_ver_location.empty());
-
- init_default_bucket_layout(cct, info.layout, svc.zone->get_zone(),
- pmaster_num_shards ?
- std::optional{*pmaster_num_shards} :
- std::nullopt,
- rule_info.index_type);
-
- info.requester_pays = false;
- if (real_clock::is_zero(creation_time)) {
- info.creation_time = ceph::real_clock::now();
- } else {
- info.creation_time = creation_time;
- }
- if (pquota_info) {
- info.quota = *pquota_info;
- }
-
- int r = svc.bi->init_index(dpp, info, info.layout.current_index);
- if (r < 0) {
- return r;
- }
-
- ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true, dpp);
- if (ret == -ECANCELED) {
- ret = -EEXIST;
- }
- if (ret == -EEXIST) {
- /* we need to reread the info and return it, caller will have a use for it */
- RGWBucketInfo orig_info;
- r = get_bucket_info(&svc, bucket.tenant, bucket.name, orig_info, NULL, null_yield, NULL);
- if (r < 0) {
- if (r == -ENOENT) {
- continue;
- }
- ldpp_dout(dpp, 0) << "get_bucket_info returned " << r << dendl;
- return r;
- }
-
- /* only remove it if it's a different bucket instance */
- if (orig_info.bucket.bucket_id != bucket.bucket_id) {
- int r = svc.bi->clean_index(dpp, info, info.layout.current_index);
- if (r < 0) {
- ldpp_dout(dpp, 0) << "WARNING: could not remove bucket index (r=" << r << ")" << dendl;
- }
- r = ctl.bucket->remove_bucket_instance_info(info.bucket, info, null_yield, dpp);
- if (r < 0) {
- ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): failed to remove bucket instance info: bucket instance=" << info.bucket.get_key() << ": r=" << r << dendl;
- /* continue anyway */
- }
- }
-
- info = std::move(orig_info);
- /* ret == -EEXIST here */
- }
- return ret;
- }
-
- /* this is highly unlikely */
- ldpp_dout(dpp, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
- return -ENOENT;
-}
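-
-/* Race handling (descriptive): when put_linked_bucket_info() loses a
- * race (-ECANCELED/-EEXIST), the existing instance is re-read; the
- * freshly created index is cleaned up only if the pre-existing
- * instance has a different bucket_id, and the caller receives the
- * surviving bucket info together with -EEXIST so it can decide whether
- * the creation is effectively idempotent. */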
-
-bool RGWRados::obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
-{
- get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
-
- return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
-}
-
-std::string RGWRados::get_cluster_fsid(const DoutPrefixProvider *dpp, optional_yield y)
-{
- return svc.rados->cluster_fsid();
-}
-
-int RGWRados::get_obj_head_ioctx(const DoutPrefixProvider *dpp,
- const RGWBucketInfo& bucket_info,
- const rgw_obj& obj,
- librados::IoCtx *ioctx)
-{
- std::string oid, key;
- get_obj_bucket_and_oid_loc(obj, oid, key);
-
- rgw_pool pool;
- if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
- ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj <<
- ", probably misconfiguration" << dendl;
- return -EIO;
- }
-
- int r = open_pool_ctx(dpp, pool, *ioctx, false);
- if (r < 0) {
- ldpp_dout(dpp, 0) << "ERROR: unable to open data-pool=" << pool.to_str() <<
- " for obj=" << obj << " with error-code=" << r << dendl;
- return r;
- }
-
- ioctx->locator_set_key(key);
-
- return 0;
-}
-
-int RGWRados::get_obj_head_ref(const DoutPrefixProvider *dpp,
- const rgw_placement_rule& target_placement_rule,
- const rgw_obj& obj,
- rgw_rados_ref *ref)
-{
- get_obj_bucket_and_oid_loc(obj, ref->obj.oid, ref->obj.loc);
-
- rgw_pool pool;
- if (!get_obj_data_pool(target_placement_rule, obj, &pool)) {
- ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
- return -EIO;
- }
-
- ref->pool = svc.rados->pool(pool);
-
- int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams()
- .set_mostly_omap(false));
- if (r < 0) {
- ldpp_dout(dpp, 0) << "ERROR: failed opening data pool (pool=" << pool << "); r=" << r << dendl;
- return r;
- }
-
- ref->pool.ioctx().locator_set_key(ref->obj.loc);
-
- return 0;
-}
-
-int RGWRados::get_obj_head_ref(const DoutPrefixProvider *dpp,
- const RGWBucketInfo& bucket_info,
- const rgw_obj& obj,
- rgw_rados_ref *ref)
-{
- return get_obj_head_ref(dpp, bucket_info.placement_rule, obj, ref);
-}
-
-int RGWRados::get_raw_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref)
-{
- ref->obj = obj;
-
- if (ref->obj.oid.empty()) {
- ref->obj.oid = obj.pool.to_str();
- ref->obj.pool = svc.zone->get_zone_params().domain_root;
- }
- ref->pool = svc.rados->pool(obj.pool);
- int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams()
- .set_mostly_omap(false));
- if (r < 0) {
- ldpp_dout(dpp, 0) << "ERROR: failed opening pool (pool=" << obj.pool << "); r=" << r << dendl;
- return r;
- }
-
- ref->pool.ioctx().locator_set_key(ref->obj.loc);
-
- return 0;
-}
-
-int RGWRados::get_system_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref)
-{
- return get_raw_obj_ref(dpp, obj, ref);
-}
-
-/*
- * fixes an issue where head objects were supposed to have a locator created, but ended
- * up without one
- */
-int RGWRados::fix_head_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
-{
- const rgw_bucket& bucket = bucket_info.bucket;
- string oid;
- string locator;
-
- rgw_obj obj(bucket, key);
-
- get_obj_bucket_and_oid_loc(obj, oid, locator);
-
- if (locator.empty()) {
- ldpp_dout(dpp, 20) << "object does not have a locator, nothing to fix" << dendl;
- return 0;
- }
-
- librados::IoCtx ioctx;
-
- int ret = get_obj_head_ioctx(dpp, bucket_info, obj, &ioctx);
- if (ret < 0) {
- cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
- return ret;
- }
- ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
-
- uint64_t size;
- bufferlist data;
-
- struct timespec mtime_ts;
- map<string, bufferlist> attrs;
- librados::ObjectReadOperation op;
- op.getxattrs(&attrs, NULL);
- op.stat2(&size, &mtime_ts, NULL);
-#define HEAD_SIZE (512 * 1024)
- op.read(0, HEAD_SIZE, &data, NULL);
-
- ret = rgw_rados_operate(dpp, ioctx, oid, &op, &data, null_yield);
- if (ret < 0) {
- ldpp_dout(dpp, -1) << "ERROR: rgw_rados_operate(oid=" << oid << ") returned ret=" << ret << dendl;
- return ret;
- }
-
- if (size > HEAD_SIZE) {
- ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
- return -EIO;
- }
-
- if (size != data.length()) {
- ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
- return -EIO;
- }
-
- if (copy_obj) {
- librados::ObjectWriteOperation wop;
-
- wop.mtime2(&mtime_ts);
-
- map<string, bufferlist>::iterator iter;
- for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
- wop.setxattr(iter->first.c_str(), iter->second);
- }
-
- wop.write(0, data);
-
- ioctx.locator_set_key(locator);
- rgw_rados_operate(dpp, ioctx, oid, &wop, null_yield);
- }
-
- if (remove_bad) {
- ioctx.locator_set_key(string());
-
- ret = ioctx.remove(oid);
- if (ret < 0) {
- ldpp_dout(dpp, -1) << "ERROR: failed to remove original bad object" << dendl;
- return ret;
- }
- }
-
- return 0;
-}
-
-int RGWRados::move_rados_obj(const DoutPrefixProvider *dpp,
- librados::IoCtx& src_ioctx,
- const string& src_oid, const string& src_locator,
- librados::IoCtx& dst_ioctx,
- const string& dst_oid, const string& dst_locator)
-{
-
-#define COPY_BUF_SIZE (4 * 1024 * 1024)
- bool done = false;
- uint64_t chunk_size = COPY_BUF_SIZE;
- uint64_t ofs = 0;
- int ret = 0;
- real_time mtime;
- struct timespec mtime_ts;
- uint64_t size;
-
- if (src_oid == dst_oid && src_locator == dst_locator) {
- return 0;
- }
-
- src_ioctx.locator_set_key(src_locator);
- dst_ioctx.locator_set_key(dst_locator);
-
- do {
- bufferlist data;
- ObjectReadOperation rop;
- ObjectWriteOperation wop;
-
- if (ofs == 0) {
- rop.stat2(&size, &mtime_ts, NULL);
- mtime = real_clock::from_timespec(mtime_ts);
- }
- rop.read(ofs, chunk_size, &data, NULL);
- ret = rgw_rados_operate(dpp, src_ioctx, src_oid, &rop, &data, null_yield);
- if (ret < 0) {
- goto done_err;
- }
-
- if (data.length() == 0) {
- break;
- }
-
- if (ofs == 0) {
- wop.create(true); /* make it exclusive */
- wop.mtime2(&mtime_ts);
- mtime = real_clock::from_timespec(mtime_ts);
- }
- wop.write(ofs, data);
- ret = rgw_rados_operate(dpp, dst_ioctx, dst_oid, &wop, null_yield);
- if (ret < 0) {
- goto done_err;
- }
- ofs += data.length();
- done = data.length() != chunk_size;
- } while (!done);
-
- if (ofs != size) {
- ldpp_dout(dpp, -1) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
- << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
- ret = -EIO;
- goto done_err;
- }
-
- src_ioctx.remove(src_oid);
-
- return 0;
-
-done_err:
- // TODO: clean up dst_oid if we created it
- ldpp_dout(dpp, -1) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
- return ret;
-}
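-
-/* Copy-loop sketch (descriptive): with COPY_BUF_SIZE = 4 MiB, a 10 MiB
- * object moves in reads of 4, 4 and 2 MiB; the short final read
- * (data.length() != chunk_size) ends the loop, the byte count is
- * checked against the stat'ed size, and only then is the source object
- * removed. */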
-
-/*
- * fixes an issue where tail objects were supposed to have a locator created, but ended
- * up without one
- */
-int RGWRados::fix_tail_obj_locator(const DoutPrefixProvider *dpp,
- RGWBucketInfo& bucket_info, rgw_obj_key& key,
- bool fix, bool *need_fix, optional_yield y)
-{
- std::unique_ptr<rgw::sal::Bucket> bucket;
- driver->get_bucket(nullptr, bucket_info, &bucket);
- std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(key);
-
- if (need_fix) {
- *need_fix = false;
- }
-
- rgw_rados_ref ref;
- int r = get_obj_head_ref(dpp, bucket_info, obj->get_obj(), &ref);
- if (r < 0) {
- return r;
- }
-
- RGWObjState *astate = nullptr;
- RGWObjManifest* manifest = nullptr;
- RGWObjectCtx rctx(this->driver);
- r = get_obj_state(dpp, &rctx, bucket_info, obj.get(), &astate, &manifest, false, y);
- if (r < 0)
- return r;
-
- if (manifest) {
- RGWObjManifest::obj_iterator miter;
- for (miter = manifest->obj_begin(dpp); miter != manifest->obj_end(dpp); ++miter) {
- rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(driver);
- rgw_obj loc;
- string oid;
- string locator;
-
- RGWSI_Tier_RADOS::raw_obj_to_obj(manifest->get_tail_placement().bucket, raw_loc, &loc);
-
- if (loc.key.ns.empty()) {
- /* continue, we're only interested in tail objects */
- continue;
- }
-
- auto& ioctx = ref.pool.ioctx();
-
- get_obj_bucket_and_oid_loc(loc, oid, locator);
- ref.pool.ioctx().locator_set_key(locator);
-
- ldpp_dout(dpp, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
-
- r = ioctx.stat(oid, NULL, NULL);
- if (r != -ENOENT) {
- continue;
- }
-
- string bad_loc;
- prepend_bucket_marker(bucket->get_key(), loc.key.name, bad_loc);
-
- /* create a new ioctx with the bad locator */
- librados::IoCtx src_ioctx;
- src_ioctx.dup(ioctx);
- src_ioctx.locator_set_key(bad_loc);
-
- r = src_ioctx.stat(oid, NULL, NULL);
- if (r != 0) {
- /* cannot find a broken part */
- continue;
- }
- ldpp_dout(dpp, 20) << __func__ << ": found bad object part: " << loc << dendl;
- if (need_fix) {
- *need_fix = true;
- }
- if (fix) {
- r = move_rados_obj(dpp, src_ioctx, oid, bad_loc, ioctx, oid, locator);
- if (r < 0) {
-        ldpp_dout(dpp, -1) << "ERROR: move_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
- }
- }
- }
- }
-
- return 0;
-}
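
// The locator probe performed by fix_tail_obj_locator(), isolated into one
// helper. A sketch only: it assumes an already-open librados::IoCtx and
// precomputed "good"/"bad" locator strings, and uses only the librados
// calls that appear above (locator_set_key, stat, dup). Return values:
// 0 = healthy, 1 = object found under the bad locator (needs fixing),
// <0 = not found anywhere, or a stat error.
#include <cerrno>
#include <string>
#include <rados/librados.hpp>

inline int probe_locator(librados::IoCtx& ioctx, const std::string& oid,
                         const std::string& good_loc, const std::string& bad_loc)
{
  ioctx.locator_set_key(good_loc);
  int r = ioctx.stat(oid, nullptr, nullptr);
  if (r == 0) return 0;          // present where it should be
  if (r != -ENOENT) return r;    // unexpected error

  librados::IoCtx probe;
  probe.dup(ioctx);              // separate handle, as the code above does
  probe.locator_set_key(bad_loc);
  r = probe.stat(oid, nullptr, nullptr);
  return (r == 0) ? 1 : r;       // 1 == broken part found
}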
-
-int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
- const rgw_obj& obj,
- RGWBucketInfo* bucket_info_out,
- const DoutPrefixProvider *dpp)
-{
- bucket = _bucket;
-
- RGWBucketInfo bucket_info;
- RGWBucketInfo* bucket_info_p =
- bucket_info_out ? bucket_info_out : &bucket_info;
-
- int ret = store->get_bucket_instance_info(bucket, *bucket_info_p, NULL, NULL, null_yield, dpp);
- if (ret < 0) {
- return ret;
- }
-
- string oid;
-
- ret = store->svc.bi_rados->open_bucket_index_shard(dpp, *bucket_info_p, obj.get_hash_object(), &bucket_obj, &shard_id);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
- return ret;
- }
- ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj.get_raw_obj() << dendl;
-
- return 0;
-}
-
-int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info,
- const rgw_obj& obj)
-{
- bucket = bucket_info.bucket;
-
- int ret = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info,
- obj.get_hash_object(),
- &bucket_obj,
- &shard_id);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
- return ret;
- }
- ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl;
-
- return 0;
-}
-
-int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp,
- const RGWBucketInfo& bucket_info,
- const rgw::bucket_index_layout_generation& index,
- int sid)
-{
- bucket = bucket_info.bucket;
- shard_id = sid;
-
- int ret = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info, shard_id,
- num_shards(index), index.gen,
- &bucket_obj);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
- return ret;
- }
- ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl;
-
- return 0;
-}
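
// Illustration of how an object's hash key maps onto a bucket index shard,
// as done inside open_bucket_index_shard(). RGW uses its own string hash
// internally; std::hash stands in here purely to keep the sketch
// self-contained, so the shard numbers it yields are NOT the ones RGW picks.
#include <functional>
#include <string>

inline int pick_shard(const std::string& hash_key, int num_shards)
{
  if (num_shards <= 1) return 0;  // unsharded index: everything in one object
  return static_cast<int>(std::hash<std::string>{}(hash_key)
                          % static_cast<std::size_t>(num_shards));
}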
-
-
-/* Execute @handler on the last item in the bucket listing for the bucket
- * specified in @bucket_info. @obj_prefix and @obj_delim narrow down the
- * listing to objects matching these criteria. */
-int RGWRados::on_last_entry_in_listing(const DoutPrefixProvider *dpp,
- RGWBucketInfo& bucket_info,
- const std::string& obj_prefix,
- const std::string& obj_delim,
- std::function<int(const rgw_bucket_dir_entry&)> handler)
-{
- RGWRados::Bucket target(this, bucket_info);
- RGWRados::Bucket::List list_op(&target);
-
- list_op.params.prefix = obj_prefix;
- list_op.params.delim = obj_delim;
-
- ldpp_dout(dpp, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
- << ", obj_prefix=" << obj_prefix
- << ", obj_delim=" << obj_delim
- << dendl;
-
- bool is_truncated = false;
-
- boost::optional<rgw_bucket_dir_entry> last_entry;
- /* We need to rewind to the last object in a listing. */
- do {
- /* List bucket entries in chunks. */
- static constexpr int MAX_LIST_OBJS = 100;
- std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
-
- int ret = list_op.list_objects(dpp, MAX_LIST_OBJS, &entries, nullptr,
- &is_truncated, null_yield);
- if (ret < 0) {
- return ret;
- } else if (!entries.empty()) {
- last_entry = entries.back();
- }
- } while (is_truncated);
-
- if (last_entry) {
- return handler(*last_entry);
- }
-
- /* Empty listing - no items we can run handler on. */
- return 0;
-}
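
// The rewind-to-last-entry pattern above, reduced to a self-contained
// template. next_page() stands in for list_objects(); std::optional plays
// the role of boost::optional<rgw_bucket_dir_entry>.
#include <functional>
#include <optional>
#include <vector>

template <typename T>
int on_last_entry(std::function<std::vector<T>()> next_page,
                  std::function<int(const T&)> handler)
{
  std::optional<T> last;
  for (auto page = next_page(); !page.empty(); page = next_page()) {
    last = page.back();            // keep overwriting; only the final survives
  }
  return last ? handler(*last) : 0;  // empty listing: nothing to run on
}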
-
-bool RGWRados::swift_versioning_enabled(rgw::sal::Bucket* bucket) const
-{
- return bucket->get_info().has_swift_versioning() &&
- bucket->get_info().swift_ver_location.size();
-}
-
-int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
- const rgw_user& user,
- rgw::sal::Bucket* bucket,
- rgw::sal::Object* obj,
- const DoutPrefixProvider *dpp,
- optional_yield y)
-{
- if (! swift_versioning_enabled(bucket)) {
- return 0;
- }
-
- obj->set_atomic();
-
- RGWObjState * state = nullptr;
- RGWObjManifest *manifest = nullptr;
- int r = get_obj_state(dpp, &obj_ctx, bucket->get_info(), obj, &state, &manifest, false, y);
- if (r < 0) {
- return r;
- }
-
- if (!state->exists) {
- return 0;
- }
-
- const string& src_name = obj->get_oid();
- char buf[src_name.size() + 32];
- struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
- snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
- src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);
-
- RGWBucketInfo dest_bucket_info;
-
- r = get_bucket_info(&svc, bucket->get_tenant(), bucket->get_info().swift_ver_location, dest_bucket_info, NULL, null_yield, NULL);
- if (r < 0) {
- ldpp_dout(dpp, 10) << "failed to read dest bucket info: r=" << r << dendl;
- if (r == -ENOENT) {
- return -ERR_PRECONDITION_FAILED;
- }
- return r;
- }
-
- if (dest_bucket_info.owner != bucket->get_info().owner) {
- return -ERR_PRECONDITION_FAILED;
- }
-
- rgw::sal::RadosBucket dest_bucket(driver, dest_bucket_info);
- rgw::sal::RadosObject dest_obj(driver, rgw_obj_key(buf), &dest_bucket);
-
- if (dest_bucket_info.versioning_enabled()){
- dest_obj.gen_rand_obj_instance_name();
- }
-
- dest_obj.set_atomic();
-
- rgw_zone_id no_zone;
-
- r = copy_obj(obj_ctx,
- user,
- NULL, /* req_info *info */
- no_zone,
- &dest_obj,
- obj,
- &dest_bucket,
- bucket,
- bucket->get_placement_rule(),
- NULL, /* time_t *src_mtime */
- NULL, /* time_t *mtime */
- NULL, /* const time_t *mod_ptr */
- NULL, /* const time_t *unmod_ptr */
- false, /* bool high_precision_time */
- NULL, /* const char *if_match */
- NULL, /* const char *if_nomatch */
- RGWRados::ATTRSMOD_NONE,
- true, /* bool copy_if_newer */
- state->attrset,
- RGWObjCategory::Main,
- 0, /* uint64_t olh_epoch */
- real_time(), /* time_t delete_at */
- NULL, /* string *version_id */
- NULL, /* string *ptag */
- NULL, /* string *petag */
- NULL, /* void (*progress_cb)(off_t, void *) */
- NULL, /* void *progress_data */
- dpp,
- null_yield);
- if (r == -ECANCELED || r == -ENOENT) {
- /* Has already been overwritten, meaning another rgw process already
- * copied it out */
- return 0;
- }
-
- return r;
-}
-
-int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
- const rgw_user& user,
- rgw::sal::Bucket* bucket,
- rgw::sal::Object* obj,
- bool& restored, /* out */
- const DoutPrefixProvider *dpp)
-{
- if (! swift_versioning_enabled(bucket)) {
- return 0;
- }
-
- /* Bucket info of the bucket that stores previous versions of our object. */
- RGWBucketInfo archive_binfo;
-
- int ret = get_bucket_info(&svc, bucket->get_tenant(),
- bucket->get_info().swift_ver_location,
- archive_binfo, nullptr, null_yield, nullptr);
- if (ret < 0) {
- return ret;
- }
-
-  /* Abort the operation if the bucket storing our archive belongs to someone
-   * else. This is a limitation in comparison to Swift as we aren't taking ACLs
-   * into consideration. For now we can live with that.
-   *
-   * TODO: delegate this check to an upper layer and compare with ACLs. */
- if (bucket->get_info().owner != archive_binfo.owner) {
- return -EPERM;
- }
-
- /* This code will be executed on latest version of the object. */
- const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
- rgw_zone_id no_zone;
-
- /* We don't support object versioning of Swift API on those buckets that
- * are already versioned using the S3 mechanism. This affects also bucket
- * storing archived objects. Otherwise the delete operation would create
- * a deletion marker. */
- if (archive_binfo.versioned()) {
- restored = false;
- return -ERR_PRECONDITION_FAILED;
- }
-
- /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
- * irrelevant and may be safely skipped. */
- std::map<std::string, ceph::bufferlist> no_attrs;
-
- rgw::sal::RadosBucket archive_bucket(driver, archive_binfo);
- rgw::sal::RadosObject archive_obj(driver, entry.key, &archive_bucket);
-
- if (bucket->versioning_enabled()){
- obj->gen_rand_obj_instance_name();
- }
-
- archive_obj.set_atomic();
- obj->set_atomic();
-
- int ret = copy_obj(obj_ctx,
- user,
- nullptr, /* req_info *info */
- no_zone,
- obj, /* dest obj */
- &archive_obj, /* src obj */
- bucket, /* dest bucket info */
- &archive_bucket, /* src bucket info */
- bucket->get_placement_rule(), /* placement_rule */
- nullptr, /* time_t *src_mtime */
- nullptr, /* time_t *mtime */
- nullptr, /* const time_t *mod_ptr */
- nullptr, /* const time_t *unmod_ptr */
- false, /* bool high_precision_time */
- nullptr, /* const char *if_match */
- nullptr, /* const char *if_nomatch */
- RGWRados::ATTRSMOD_NONE,
- true, /* bool copy_if_newer */
- no_attrs,
- RGWObjCategory::Main,
- 0, /* uint64_t olh_epoch */
- real_time(), /* time_t delete_at */
- nullptr, /* string *version_id */
- nullptr, /* string *ptag */
- nullptr, /* string *petag */
- nullptr, /* void (*progress_cb)(off_t, void *) */
- nullptr, /* void *progress_data */
- dpp,
- null_yield);
- if (ret == -ECANCELED || ret == -ENOENT) {
- /* Has already been overwritten, meaning another rgw process already
- * copied it out */
- return 0;
- } else if (ret < 0) {
- return ret;
- } else {
- restored = true;
- }
-
- /* Need to remove the archived copy. */
- ret = delete_obj(dpp, archive_binfo, &archive_obj,
- archive_binfo.versioning_status());
-
- return ret;
- };
-
- const std::string& obj_name = obj->get_oid();
- const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
- % obj_name);
-
- return on_last_entry_in_listing(dpp, archive_binfo, prefix, std::string(),
- handler);
-}
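
// Worked example of the Swift archive naming shared by
// swift_versioning_copy() and swift_versioning_restore(): the archived copy
// of "photo.jpg" (9 characters -> "009" in zero-padded hex) becomes
// "009photo.jpg/<sec>.<usec>", and the restore path lists with the prefix
// "009photo.jpg".
#include <cstdio>
#include <string>
#include <vector>

inline std::string archive_name(const std::string& name, long long sec, long usec)
{
  std::vector<char> buf(name.size() + 32);
  std::snprintf(buf.data(), buf.size(), "%03x%s/%lld.%06ld",
                (int)name.size(), name.c_str(), sec, usec);
  return std::string(buf.data());  // e.g. "009photo.jpg/1700000000.000123"
}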
-
-int RGWRados::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp,
- uint64_t size, uint64_t accounted_size,
- map<string, bufferlist>& attrs,
- bool assume_noent, bool modify_tail,
- void *_index_op, optional_yield y)
-{
- RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
- RGWRados *store = target->get_store();
-
- ObjectWriteOperation op;
-#ifdef WITH_LTTNG
- const req_state* s = get_req_state();
- string req_id;
- if (!s) {
- // fake req_id
- req_id = store->svc.zone_utils->unique_id(store->driver->get_new_req_id());
- } else {
- req_id = s->req_id;
- }
-#endif
-
- RGWObjState *state;
- RGWObjManifest *manifest = nullptr;
- int r = target->get_state(dpp, &state, &manifest, false, y, assume_noent);
- if (r < 0)
- return r;
-
- rgw_obj obj = target->get_obj();
-
- if (obj.get_oid().empty()) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
- return -EIO;
- }
-
- rgw_rados_ref ref;
- r = store->get_obj_head_ref(dpp, target->get_meta_placement_rule(), obj, &ref);
- if (r < 0)
- return r;
-
- bool is_olh = state->is_olh;
-
- bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;
-
- const string *ptag = meta.ptag;
- if (!ptag && !index_op->get_optag()->empty()) {
- ptag = index_op->get_optag();
- }
- r = target->prepare_atomic_modification(dpp, op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail, y);
- if (r < 0)
- return r;
-
- if (real_clock::is_zero(meta.set_mtime)) {
- meta.set_mtime = real_clock::now();
- }
-
- if (target->get_bucket_info().obj_lock_enabled() && target->get_bucket_info().obj_lock.has_rule() && meta.flags == PUT_OBJ_CREATE) {
- auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION);
- if (iter == attrs.end()) {
- real_time lock_until_date = target->get_bucket_info().obj_lock.get_lock_until_date(meta.set_mtime);
- string mode = target->get_bucket_info().obj_lock.get_mode();
- RGWObjectRetention obj_retention(mode, lock_until_date);
- bufferlist bl;
- obj_retention.encode(bl);
- op.setxattr(RGW_ATTR_OBJECT_RETENTION, bl);
- }
- }
-
- if (state->is_olh) {
- op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
- }
-
- struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
- op.mtime2(&mtime_ts);
-
- if (meta.data) {
- /* if we want to overwrite the data, we also want to overwrite the
- xattrs, so just remove the object */
- op.write_full(*meta.data);
- if (state->compressed) {
- uint32_t alloc_hint_flags = librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE;
- op.set_alloc_hint2(0, 0, alloc_hint_flags);
- }
- }
-
- string etag;
- string content_type;
- bufferlist acl_bl;
- string storage_class;
-
- map<string, bufferlist>::iterator iter;
- if (meta.rmattrs) {
- for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
- const string& name = iter->first;
- op.rmxattr(name.c_str());
- }
- }
-
- if (meta.manifest) {
- storage_class = meta.manifest->get_tail_placement().placement_rule.storage_class;
-
- /* remove existing manifest attr */
- iter = attrs.find(RGW_ATTR_MANIFEST);
- if (iter != attrs.end())
- attrs.erase(iter);
-
- bufferlist bl;
- encode(*meta.manifest, bl);
- op.setxattr(RGW_ATTR_MANIFEST, bl);
- }
-
- for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
- const string& name = iter->first;
- bufferlist& bl = iter->second;
-
- if (!bl.length())
- continue;
-
- op.setxattr(name.c_str(), bl);
-
- if (name.compare(RGW_ATTR_ETAG) == 0) {
- etag = rgw_bl_str(bl);
- } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
- content_type = rgw_bl_str(bl);
- } else if (name.compare(RGW_ATTR_ACL) == 0) {
- acl_bl = bl;
- }
- }
- if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
- cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
- }
-
- if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
- bufferlist bl;
- encode(store->svc.zone->get_zone_short_id(), bl);
- op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
- }
-
- if (!storage_class.empty()) {
- bufferlist bl;
- bl.append(storage_class);
- op.setxattr(RGW_ATTR_STORAGE_CLASS, bl);
- }
-
- if (!op.size())
- return 0;
-
- uint64_t epoch;
- int64_t poolid;
- bool orig_exists;
- uint64_t orig_size;
-
-  if (!reset_obj) { // multipart upload: the head object is immutable
- orig_exists = false;
- orig_size = 0;
- } else {
- orig_exists = state->exists;
- orig_size = state->accounted_size;
- }
-
- bool versioned_target = (meta.olh_epoch && *meta.olh_epoch > 0) ||
- !obj.key.instance.empty();
-
- bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);
-
- if (versioned_op) {
- index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
- }
-
- if (!index_op->is_prepared()) {
- tracepoint(rgw_rados, prepare_enter, req_id.c_str());
- r = index_op->prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y);
- tracepoint(rgw_rados, prepare_exit, req_id.c_str());
- if (r < 0)
- return r;
- }
-
- auto& ioctx = ref.pool.ioctx();
-
- tracepoint(rgw_rados, operate_enter, req_id.c_str());
- r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
- tracepoint(rgw_rados, operate_exit, req_id.c_str());
-  if (r < 0) { /* we can expect to get -ECANCELED if the object was replaced
-                  under us, -ENOENT if it was removed, or -EEXIST if it did
-                  not exist before and now it does */
- if (r == -EEXIST && assume_noent) {
- target->invalidate_state();
- return r;
- }
- goto done_cancel;
- }
-
- epoch = ioctx.get_last_version();
- poolid = ioctx.get_id();
-
- r = target->complete_atomic_modification(dpp);
- if (r < 0) {
- ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
- }
-
- tracepoint(rgw_rados, complete_enter, req_id.c_str());
- r = index_op->complete(dpp, poolid, epoch, size, accounted_size,
- meta.set_mtime, etag, content_type,
- storage_class, &acl_bl,
- meta.category, meta.remove_objs, meta.user_data, meta.appendable);
- tracepoint(rgw_rados, complete_exit, req_id.c_str());
- if (r < 0)
- goto done_cancel;
-
- if (meta.mtime) {
- *meta.mtime = meta.set_mtime;
- }
-
- /* note that index_op was using state so we couldn't invalidate it earlier */
- target->invalidate_state();
- state = NULL;
-
- if (versioned_op && meta.olh_epoch) {
- r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), target->get_target(), false, NULL, *meta.olh_epoch, real_time(), false, y, meta.zones_trace);
- if (r < 0) {
- return r;
- }
- }
-
- if (!real_clock::is_zero(meta.delete_at)) {
- rgw_obj_index_key obj_key;
- obj.key.get_index_key(&obj_key);
-
- r = store->obj_expirer->hint_add(dpp, meta.delete_at, obj.bucket.tenant, obj.bucket.name,
- obj.bucket.bucket_id, obj_key);
- if (r < 0) {
- ldpp_dout(dpp, 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
- /* ignoring error, nothing we can do at this point */
- }
- }
- meta.canceled = false;
-
- /* update quota cache */
- if (meta.completeMultipart){
- store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
- 0, orig_size);
- }
- else {
- store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
- accounted_size, orig_size);
- }
- return 0;
-
-done_cancel:
- int ret = index_op->cancel(dpp, meta.remove_objs);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
- }
-
- meta.canceled = true;
-
-  /* we lost a race. There are a few possibilities:
-   * - the existing object was rewritten (ECANCELED)
-   * - a non-existing object was created (EEXIST)
-   * - the object was removed (ENOENT)
-   * we should treat any of these as success
-   */
- if (meta.if_match == NULL && meta.if_nomatch == NULL) {
- if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
- r = 0;
- }
- } else {
- if (meta.if_match != NULL) {
- // only overwrite existing object
- if (strcmp(meta.if_match, "*") == 0) {
- if (r == -ENOENT) {
- r = -ERR_PRECONDITION_FAILED;
- } else if (r == -ECANCELED) {
- r = 0;
- }
- }
- }
-
- if (meta.if_nomatch != NULL) {
- // only create a new object
- if (strcmp(meta.if_nomatch, "*") == 0) {
- if (r == -EEXIST) {
- r = -ERR_PRECONDITION_FAILED;
- } else if (r == -ENOENT) {
- r = 0;
- }
- }
- }
- }
-
- return r;
-}
-
-int RGWRados::Object::Write::write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size,
- map<string, bufferlist>& attrs, optional_yield y)
-{
- RGWBucketInfo& bucket_info = target->get_bucket_info();
-
- RGWRados::Bucket bop(target->get_store(), bucket_info);
- RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
- index_op.set_zones_trace(meta.zones_trace);
-
- bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
- int r;
- if (assume_noent) {
- r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
- if (r == -EEXIST) {
- assume_noent = false;
- }
- }
- if (!assume_noent) {
- r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
- }
- return r;
-}
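
// The two-pass pattern of write_meta() in isolation. attempt() stands in
// for _do_write_meta(): optimistically assume the object does not exist
// (the cheap exclusive-create path) and only on EEXIST redo the write
// non-exclusively.
#include <cerrno>
#include <functional>

inline int write_with_fallback(const std::function<int(bool assume_noent)>& attempt)
{
  int r = attempt(true);          // no preconditions -> try exclusive create
  if (r == -EEXIST) {
    r = attempt(false);           // object exists after all; write normally
  }
  return r;
}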
-
-class RGWRadosPutObj : public RGWHTTPStreamRWRequest::ReceiveCB
-{
- const DoutPrefixProvider *dpp;
- CephContext* cct;
- rgw_obj obj;
- rgw::sal::DataProcessor *filter;
- boost::optional<RGWPutObj_Compress>& compressor;
- bool try_etag_verify;
- rgw::putobj::etag_verifier_ptr etag_verifier;
- boost::optional<rgw::putobj::ChunkProcessor> buffering;
- CompressorRef& plugin;
- rgw::sal::ObjectProcessor *processor;
- void (*progress_cb)(off_t, void *);
- void *progress_data;
- bufferlist extra_data_bl, manifest_bl;
- std::optional<RGWCompressionInfo> compression_info;
- uint64_t extra_data_left{0};
- bool need_to_process_attrs{true};
- uint64_t data_len{0};
- map<string, bufferlist> src_attrs;
- uint64_t ofs{0};
- uint64_t lofs{0}; /* logical ofs */
- std::function<int(map<string, bufferlist>&)> attrs_handler;
-
-public:
- RGWRadosPutObj(const DoutPrefixProvider *dpp,
- CephContext* cct,
- CompressorRef& plugin,
- boost::optional<RGWPutObj_Compress>& compressor,
- rgw::sal::ObjectProcessor *p,
- void (*_progress_cb)(off_t, void *),
- void *_progress_data,
- std::function<int(map<string, bufferlist>&)> _attrs_handler) :
- dpp(dpp),
- cct(cct),
- filter(p),
- compressor(compressor),
- try_etag_verify(cct->_conf->rgw_sync_obj_etag_verify),
- plugin(plugin),
- processor(p),
- progress_cb(_progress_cb),
- progress_data(_progress_data),
- attrs_handler(_attrs_handler) {}
-
-
- int process_attrs(void) {
- if (extra_data_bl.length()) {
- JSONParser jp;
- if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
- ldpp_dout(dpp, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
- return -EIO;
- }
-
- JSONDecoder::decode_json("attrs", src_attrs, &jp);
-
- auto iter = src_attrs.find(RGW_ATTR_COMPRESSION);
- if (iter != src_attrs.end()) {
- const bufferlist bl = std::move(iter->second);
- src_attrs.erase(iter); // don't preserve source compression info
-
- if (try_etag_verify) {
- // if we're trying to verify etags, we need to convert compressed
- // ranges in the manifest back into logical multipart part offsets
- RGWCompressionInfo info;
- bool compressed = false;
- int r = rgw_compression_info_from_attr(bl, compressed, info);
- if (r < 0) {
- ldpp_dout(dpp, 4) << "failed to decode compression info, "
- "disabling etag verification" << dendl;
- try_etag_verify = false;
- } else if (compressed) {
- compression_info = std::move(info);
- }
- }
- }
- /* We need the manifest to recompute the ETag for verification */
- iter = src_attrs.find(RGW_ATTR_MANIFEST);
- if (iter != src_attrs.end()) {
- manifest_bl = std::move(iter->second);
- src_attrs.erase(iter);
- }
-
- // filter out olh attributes
- iter = src_attrs.lower_bound(RGW_ATTR_OLH_PREFIX);
- while (iter != src_attrs.end()) {
- if (!boost::algorithm::starts_with(iter->first, RGW_ATTR_OLH_PREFIX)) {
- break;
- }
- iter = src_attrs.erase(iter);
- }
- }
-
- int ret = attrs_handler(src_attrs);
- if (ret < 0) {
- return ret;
- }
-
- if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
-      // do not compress if the object is encrypted
- compressor = boost::in_place(cct, plugin, filter);
- // add a filter that buffers data so we don't try to compress tiny blocks.
- // libcurl reads in 16k at a time, and we need at least 64k to get a good
- // compression ratio
- constexpr unsigned buffer_size = 512 * 1024;
- buffering = boost::in_place(&*compressor, buffer_size);
- filter = &*buffering;
- }
-
- /*
- * Presently we don't support ETag based verification if encryption is
- * requested. We can enable simultaneous support once we have a mechanism
- * to know the sequence in which the filters must be applied.
- */
- if (try_etag_verify && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
- ret = rgw::putobj::create_etag_verifier(dpp, cct, filter, manifest_bl,
- compression_info,
- etag_verifier);
- if (ret < 0) {
-        ldpp_dout(dpp, 4) << "failed to initialize etag verifier, "
-          "disabling etag verification" << dendl;
- } else {
- filter = etag_verifier.get();
- }
- }
-
- need_to_process_attrs = false;
-
- return 0;
- }
-
- int handle_data(bufferlist& bl, bool *pause) override {
- if (progress_cb) {
- progress_cb(data_len, progress_data);
- }
- if (extra_data_left) {
- uint64_t extra_len = bl.length();
- if (extra_len > extra_data_left)
- extra_len = extra_data_left;
-
- bufferlist extra;
- bl.splice(0, extra_len, &extra);
- extra_data_bl.append(extra);
-
- extra_data_left -= extra_len;
- if (extra_data_left == 0) {
- int res = process_attrs();
- if (res < 0)
- return res;
- }
- ofs += extra_len;
- if (bl.length() == 0) {
- return 0;
- }
- }
- if (need_to_process_attrs) {
-      /* we need to call process_attrs() even if we don't get any attrs,
-       * since it is what invokes attrs_handler().
-       */
- int res = process_attrs();
- if (res < 0) {
- return res;
- }
- }
-
- ceph_assert(uint64_t(ofs) >= extra_data_len);
-
- uint64_t size = bl.length();
- ofs += size;
-
- const uint64_t lofs = data_len;
- data_len += size;
-
- return filter->process(std::move(bl), lofs);
- }
-
- int flush() {
- return filter->process({}, data_len);
- }
-
- bufferlist& get_extra_data() { return extra_data_bl; }
-
- map<string, bufferlist>& get_attrs() { return src_attrs; }
-
- void set_extra_data_len(uint64_t len) override {
- extra_data_left = len;
- RGWHTTPStreamRWRequest::ReceiveCB::set_extra_data_len(len);
- }
-
- uint64_t get_data_len() {
- return data_len;
- }
-
- std::string get_verifier_etag() {
- if (etag_verifier) {
- etag_verifier->calculate_etag();
- return etag_verifier->get_calculated_etag();
- } else {
- return "";
- }
- }
-};
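
// A minimal stand-in for the filter chain RGWRadosPutObj assembles
// (buffering -> compressor -> etag verifier -> processor). Each stage gets
// (data, logical_offset) and forwards to the next; an empty chunk signals a
// flush. This sketches the shape of the pipeline only, not the actual
// rgw::sal::DataProcessor interface.
#include <cstdint>
#include <string>

struct Stage {
  Stage* next = nullptr;
  virtual ~Stage() = default;
  virtual int process(std::string data, uint64_t ofs) {
    return next ? next->process(std::move(data), ofs) : 0;
  }
};

struct Counter : Stage {
  uint64_t total = 0;                      // bytes seen so far
  int process(std::string data, uint64_t ofs) override {
    total += data.size();                  // account, then forward unchanged
    return Stage::process(std::move(data), ofs);
  }
};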
-
-/*
- * prepare attrset depending on attrs_mod.
- */
-static void set_copy_attrs(map<string, bufferlist>& src_attrs,
- map<string, bufferlist>& attrs,
- RGWRados::AttrsMod attrs_mod)
-{
- switch (attrs_mod) {
- case RGWRados::ATTRSMOD_NONE:
- attrs = src_attrs;
- break;
- case RGWRados::ATTRSMOD_REPLACE:
- if (!attrs[RGW_ATTR_ETAG].length()) {
- attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
- }
- if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
- auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
- if (ttiter != src_attrs.end()) {
- attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
- }
- }
- break;
- case RGWRados::ATTRSMOD_MERGE:
- for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
- if (attrs.find(it->first) == attrs.end()) {
- attrs[it->first] = it->second;
- }
- }
- break;
- }
-}
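
// The ATTRSMOD_MERGE rule from set_copy_attrs() on a plain std::map, to
// make the precedence easy to eyeball: the caller's values win on conflict
// and everything else is copied from the source. (NONE takes the source
// verbatim; REPLACE keeps only the caller's attrs, backfilling the ETag and
// tail tag from the source.)
#include <map>
#include <string>

using Attrs = std::map<std::string, std::string>;

inline Attrs merge_attrs(const Attrs& src, Attrs dst)
{
  for (const auto& [key, val] : src) {
    dst.try_emplace(key, val);   // inserts only if key is absent in dst
  }
  return dst;
}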
-
-int RGWRados::rewrite_obj(rgw::sal::Object* obj, const DoutPrefixProvider *dpp, optional_yield y)
-{
- RGWObjectCtx rctx(this->driver);
- rgw::sal::Attrs attrset;
- uint64_t obj_size;
- ceph::real_time mtime;
- RGWRados::Object op_target(this, obj->get_bucket(), rctx, obj);
- RGWRados::Object::Read read_op(&op_target);
-
- read_op.params.attrs = &attrset;
- read_op.params.obj_size = &obj_size;
- read_op.params.lastmod = &mtime;
-
- int ret = read_op.prepare(y, dpp);
- if (ret < 0)
- return ret;
-
- attrset.erase(RGW_ATTR_ID_TAG);
- attrset.erase(RGW_ATTR_TAIL_TAG);
- attrset.erase(RGW_ATTR_STORAGE_CLASS);
-
- return this->copy_obj_data(rctx, obj->get_bucket(),
- obj->get_bucket()->get_info().placement_rule,
- read_op, obj_size - 1, obj, NULL, mtime,
- attrset, 0, real_time(), NULL, dpp, y);
-}
-
-struct obj_time_weight {
- real_time mtime;
- uint32_t zone_short_id;
- uint64_t pg_ver;
- bool high_precision;
-
- obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
-
- bool compare_low_precision(const obj_time_weight& rhs) {
- struct timespec l = ceph::real_clock::to_timespec(mtime);
- struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
- l.tv_nsec = 0;
- r.tv_nsec = 0;
- if (l > r) {
- return false;
- }
- if (l < r) {
- return true;
- }
- if (!zone_short_id || !rhs.zone_short_id) {
- /* don't compare zone ids, if one wasn't provided */
- return false;
- }
- if (zone_short_id != rhs.zone_short_id) {
- return (zone_short_id < rhs.zone_short_id);
- }
-    return (pg_ver < rhs.pg_ver);
-  }
-
- bool operator<(const obj_time_weight& rhs) {
- if (!high_precision || !rhs.high_precision) {
- return compare_low_precision(rhs);
- }
- if (mtime > rhs.mtime) {
- return false;
- }
- if (mtime < rhs.mtime) {
- return true;
- }
- if (!zone_short_id || !rhs.zone_short_id) {
- /* don't compare zone ids, if one wasn't provided */
- return false;
- }
- if (zone_short_id != rhs.zone_short_id) {
- return (zone_short_id < rhs.zone_short_id);
- }
- return (pg_ver < rhs.pg_ver);
- }
-
- void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
- mtime = _mtime;
- zone_short_id = _short_id;
- pg_ver = _pg_ver;
- }
-
- void init(RGWObjState *state) {
- mtime = state->mtime;
- zone_short_id = state->zone_short_id;
- pg_ver = state->pg_ver;
- }
-};
-
-inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
- out << o.mtime;
-
- if (o.zone_short_id != 0 || o.pg_ver != 0) {
- out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
- }
-
- return out;
-}
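
// The ordering obj_time_weight encodes, reduced to plain integers: an older
// mtime sorts first; on a tie the zone short id and then the pg version
// break it, but only when both sides actually carry a zone id. A sketch,
// not the ceph::real_time-based original.
#include <cstdint>
#include <tuple>

struct Weight { int64_t mtime; uint32_t zone_short_id; uint64_t pg_ver; };

inline bool older_than(const Weight& l, const Weight& r)
{
  if (l.mtime != r.mtime) return l.mtime < r.mtime;
  if (!l.zone_short_id || !r.zone_short_id) return false; // zone id missing
  return std::tie(l.zone_short_id, l.pg_ver) <
         std::tie(r.zone_short_id, r.pg_ver);
}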
-
-class RGWGetExtraDataCB : public RGWHTTPStreamRWRequest::ReceiveCB {
- bufferlist extra_data;
-public:
- RGWGetExtraDataCB() {}
- int handle_data(bufferlist& bl, bool *pause) override {
- int bl_len = (int)bl.length();
- if (extra_data.length() < extra_data_len) {
- off_t max = extra_data_len - extra_data.length();
- if (max > bl_len) {
- max = bl_len;
- }
- bl.splice(0, max, &extra_data);
- }
- return bl_len;
- }
-
- bufferlist& get_extra_data() {
- return extra_data;
- }
-};
-
-int RGWRados::stat_remote_obj(const DoutPrefixProvider *dpp,
- RGWObjectCtx& obj_ctx,
- const rgw_user& user_id,
- req_info *info,
- const rgw_zone_id& source_zone,
- rgw::sal::Object* src_obj,
- const RGWBucketInfo *src_bucket_info,
- real_time *src_mtime,
- uint64_t *psize,
- const real_time *mod_ptr,
- const real_time *unmod_ptr,
- bool high_precision_time,
- const char *if_match,
- const char *if_nomatch,
- map<string, bufferlist> *pattrs,
- map<string, string> *pheaders,
- string *version_id,
- string *ptag,
- string *petag)
-{
- /* source is in a different zonegroup, copy from there */
-
- RGWRESTStreamRWRequest *in_stream_req;
- string tag;
- map<string, bufferlist> src_attrs;
- append_rand_alpha(cct, tag, tag, 32);
- obj_time_weight set_mtime_weight;
- set_mtime_weight.high_precision = high_precision_time;
-
- RGWRESTConn *conn;
- if (source_zone.empty()) {
- if (!src_bucket_info || src_bucket_info->zonegroup.empty()) {
- /* source is in the master zonegroup */
- conn = svc.zone->get_master_conn();
- } else {
- auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
- map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info->zonegroup);
- if (iter == zonegroup_conn_map.end()) {
- ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
- return -ENOENT;
- }
- conn = iter->second;
- }
- } else {
- auto& zone_conn_map = svc.zone->get_zone_conn_map();
- auto iter = zone_conn_map.find(source_zone);
- if (iter == zone_conn_map.end()) {
- ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl;
- return -ENOENT;
- }
- conn = iter->second;
- }
-
- RGWGetExtraDataCB cb;
- map<string, string> req_headers;
- real_time set_mtime;
-
- const real_time *pmod = mod_ptr;
-
- obj_time_weight dest_mtime_weight;
-
- constexpr bool prepend_meta = true;
- constexpr bool get_op = true;
- constexpr bool rgwx_stat = true;
- constexpr bool sync_manifest = true;
- constexpr bool skip_decrypt = true;
- int ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr,
- dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
- prepend_meta, get_op, rgwx_stat,
- sync_manifest, skip_decrypt,
- true, &cb, &in_stream_req);
- if (ret < 0) {
- return ret;
- }
-
- ret = conn->complete_request(in_stream_req, nullptr, &set_mtime, psize,
- nullptr, pheaders, null_yield);
- if (ret < 0) {
- return ret;
- }
-
- bufferlist& extra_data_bl = cb.get_extra_data();
- if (extra_data_bl.length()) {
- JSONParser jp;
- if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
- ldpp_dout(dpp, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
- return -EIO;
- }
-
- JSONDecoder::decode_json("attrs", src_attrs, &jp);
-
- src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
- }
-
- if (src_mtime) {
- *src_mtime = set_mtime;
- }
-
- if (petag) {
- map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
- if (iter != src_attrs.end()) {
- bufferlist& etagbl = iter->second;
- *petag = etagbl.to_str();
- while (petag->size() > 0 && (*petag)[petag->size() - 1] == '\0') {
- *petag = petag->substr(0, petag->size() - 1);
- }
- }
- }
-
- if (pattrs) {
- *pattrs = std::move(src_attrs);
- }
-
- return 0;
-}
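
// The trailing-NUL trim applied to the remote ETag above, as a reusable
// one-liner; bufferlists written from C strings can carry the terminator
// along.
#include <string>

inline void rtrim_nul(std::string& s)
{
  while (!s.empty() && s.back() == '\0') s.pop_back();
}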
-
-int RGWFetchObjFilter_Default::filter(CephContext *cct,
- const rgw_obj_key& source_key,
- const RGWBucketInfo& dest_bucket_info,
- std::optional<rgw_placement_rule> dest_placement_rule,
- const map<string, bufferlist>& obj_attrs,
- std::optional<rgw_user> *poverride_owner,
- const rgw_placement_rule **prule)
-{
- const rgw_placement_rule *ptail_rule = (dest_placement_rule ? &(*dest_placement_rule) : nullptr);
- if (!ptail_rule) {
- auto iter = obj_attrs.find(RGW_ATTR_STORAGE_CLASS);
- if (iter != obj_attrs.end()) {
- dest_rule.storage_class = iter->second.to_str();
- dest_rule.inherit_from(dest_bucket_info.placement_rule);
- ptail_rule = &dest_rule;
- } else {
- ptail_rule = &dest_bucket_info.placement_rule;
- }
- }
- *prule = ptail_rule;
- return 0;
-}
-
-int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
- const rgw_user& user_id,
- req_info *info,
- const rgw_zone_id& source_zone,
- rgw::sal::Object* dest_obj,
- rgw::sal::Object* src_obj,
- rgw::sal::Bucket* dest_bucket,
- rgw::sal::Bucket* src_bucket,
- std::optional<rgw_placement_rule> dest_placement_rule,
- real_time *src_mtime,
- real_time *mtime,
- const real_time *mod_ptr,
- const real_time *unmod_ptr,
- bool high_precision_time,
- const char *if_match,
- const char *if_nomatch,
- AttrsMod attrs_mod,
- bool copy_if_newer,
- rgw::sal::Attrs& attrs,
- RGWObjCategory category,
- std::optional<uint64_t> olh_epoch,
- real_time delete_at,
- string *ptag,
- string *petag,
- void (*progress_cb)(off_t, void *),
- void *progress_data,
- const DoutPrefixProvider *dpp,
- RGWFetchObjFilter *filter,
- rgw_zone_set *zones_trace,
- std::optional<uint64_t>* bytes_transferred)
-{
- /* source is in a different zonegroup, copy from there */
-
- RGWRESTStreamRWRequest *in_stream_req;
- string tag;
- int i;
- append_rand_alpha(cct, tag, tag, 32);
- obj_time_weight set_mtime_weight;
- set_mtime_weight.high_precision = high_precision_time;
- int ret;
-
- rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
- using namespace rgw::putobj;
- AtomicObjectProcessor processor(&aio, this->driver, nullptr, user_id,
- obj_ctx, dest_obj->clone(), olh_epoch,
- tag, dpp, null_yield);
- RGWRESTConn *conn;
- auto& zone_conn_map = svc.zone->get_zone_conn_map();
- auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
- if (source_zone.empty()) {
- if (!src_bucket || src_bucket->get_info().zonegroup.empty()) {
- /* source is in the master zonegroup */
- conn = svc.zone->get_master_conn();
- } else {
- map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket->get_info().zonegroup);
- if (iter == zonegroup_conn_map.end()) {
- ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
- return -ENOENT;
- }
- conn = iter->second;
- }
- } else {
- auto iter = zone_conn_map.find(source_zone);
- if (iter == zone_conn_map.end()) {
- ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl;
- return -ENOENT;
- }
- conn = iter->second;
- }
-
- boost::optional<RGWPutObj_Compress> compressor;
- CompressorRef plugin;
-
- RGWFetchObjFilter_Default source_filter;
- if (!filter) {
- filter = &source_filter;
- }
-
- std::optional<rgw_user> override_owner;
-
- RGWRadosPutObj cb(dpp, cct, plugin, compressor, &processor, progress_cb, progress_data,
- [&](map<string, bufferlist>& obj_attrs) {
- const rgw_placement_rule *ptail_rule;
-
- int ret = filter->filter(cct,
- src_obj->get_key(),
- dest_bucket->get_info(),
- dest_placement_rule,
- obj_attrs,
- &override_owner,
- &ptail_rule);
- if (ret < 0) {
- ldpp_dout(dpp, 5) << "Aborting fetch: source object filter returned ret=" << ret << dendl;
- return ret;
- }
-
- processor.set_tail_placement(*ptail_rule);
-
- const auto& compression_type = svc.zone->get_zone_params().get_compression_type(*ptail_rule);
- if (compression_type != "none") {
- plugin = Compressor::create(cct, compression_type);
- if (!plugin) {
- ldpp_dout(dpp, 1) << "Cannot load plugin for compression type "
- << compression_type << dendl;
- }
- }
-
- ret = processor.prepare(null_yield);
- if (ret < 0) {
- return ret;
- }
- return 0;
- });
-
- string etag;
- real_time set_mtime;
- uint64_t expected_size = 0;
-
- RGWObjState *dest_state = NULL;
- RGWObjManifest *manifest = nullptr;
-
- const real_time *pmod = mod_ptr;
-
- obj_time_weight dest_mtime_weight;
-
- if (copy_if_newer) {
- /* need to get mtime for destination */
- ret = get_obj_state(dpp, &obj_ctx, dest_bucket->get_info(), dest_obj, &dest_state, &manifest, false, null_yield);
- if (ret < 0)
- goto set_err_state;
-
- if (!real_clock::is_zero(dest_state->mtime)) {
- dest_mtime_weight.init(dest_state);
- pmod = &dest_mtime_weight.mtime;
- }
- }
-
- static constexpr bool prepend_meta = true;
- static constexpr bool get_op = true;
- static constexpr bool rgwx_stat = false;
- static constexpr bool sync_manifest = true;
- static constexpr bool skip_decrypt = true;
- ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr,
- dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
- prepend_meta, get_op, rgwx_stat,
- sync_manifest, skip_decrypt,
- true,
- &cb, &in_stream_req);
- if (ret < 0) {
- goto set_err_state;
- }
-
- ret = conn->complete_request(in_stream_req, &etag, &set_mtime,
- &expected_size, nullptr, nullptr, null_yield);
- if (ret < 0) {
- goto set_err_state;
- }
- ret = cb.flush();
- if (ret < 0) {
- goto set_err_state;
- }
- if (cb.get_data_len() != expected_size) {
- ret = -EIO;
- ldpp_dout(dpp, 0) << "ERROR: object truncated during fetching, expected "
- << expected_size << " bytes but received " << cb.get_data_len() << dendl;
- goto set_err_state;
- }
- if (compressor && compressor->is_compressed()) {
- bufferlist tmp;
- RGWCompressionInfo cs_info;
- cs_info.compression_type = plugin->get_type_name();
- cs_info.orig_size = cb.get_data_len();
- cs_info.compressor_message = compressor->get_compressor_message();
-    cs_info.blocks = std::move(compressor->get_compression_blocks());
- encode(cs_info, tmp);
- cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
- }
-
- if (override_owner) {
- processor.set_owner(*override_owner);
-
- auto& obj_attrs = cb.get_attrs();
-
- RGWUserInfo owner_info;
- if (ctl.user->get_info_by_uid(dpp, *override_owner, &owner_info, null_yield) < 0) {
- ldpp_dout(dpp, 10) << "owner info does not exist" << dendl;
- return -EINVAL;
- }
-
- RGWAccessControlPolicy acl;
-
- auto aiter = obj_attrs.find(RGW_ATTR_ACL);
- if (aiter == obj_attrs.end()) {
- ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): object doesn't have ACL attribute, setting default ACLs" << dendl;
- acl.create_default(owner_info.user_id, owner_info.display_name);
- } else {
- auto iter = aiter->second.cbegin();
- try {
- acl.decode(iter);
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): could not decode policy, caught buffer::error" << dendl;
- return -EIO;
- }
- }
-
- ACLOwner new_owner;
- new_owner.set_id(*override_owner);
- new_owner.set_name(owner_info.display_name);
-
- acl.set_owner(new_owner);
-
- bufferlist bl;
- acl.encode(bl);
- obj_attrs[RGW_ATTR_ACL] = std::move(bl);
- }
-
- if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
- cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
- } else {
- map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
- if (iter != cb.get_attrs().end()) {
- try {
- decode(delete_at, iter->second);
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
- }
- }
- }
-
- if (src_mtime) {
- *src_mtime = set_mtime;
- }
-
- if (petag) {
- const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
- if (iter != cb.get_attrs().end()) {
- *petag = iter->second.to_str();
- }
- }
-
-  // erase the append attr
- cb.get_attrs().erase(RGW_ATTR_APPEND_PART_NUM);
-
- { // add x-amz-replication-status=REPLICA
- auto& bl = cb.get_attrs()[RGW_ATTR_OBJ_REPLICATION_STATUS];
- bl.clear(); // overwrite source's status
- bl.append("REPLICA");
- }
-
- if (source_zone.empty()) {
- set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
- } else {
- attrs = cb.get_attrs();
- }
-
- if (copy_if_newer) {
- uint64_t pg_ver = 0;
- auto i = attrs.find(RGW_ATTR_PG_VER);
- if (i != attrs.end() && i->second.length() > 0) {
- auto iter = i->second.cbegin();
- try {
- decode(pg_ver, iter);
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
- /* non critical error */
- }
- }
- set_mtime_weight.init(set_mtime, svc.zone->get_zone_short_id(), pg_ver);
- }
-
-  /* Perform ETag verification if we have computed the object's MD5 sum at our end */
- if (const auto& verifier_etag = cb.get_verifier_etag();
- !verifier_etag.empty()) {
- string trimmed_etag = etag;
-
- /* Remove the leading and trailing double quotes from etag */
- trimmed_etag.erase(std::remove(trimmed_etag.begin(), trimmed_etag.end(),'\"'),
- trimmed_etag.end());
-
- if (verifier_etag != trimmed_etag) {
- ret = -EIO;
- ldpp_dout(dpp, 0) << "ERROR: source and destination objects don't match. Expected etag:"
- << trimmed_etag << " Computed etag:" << verifier_etag << dendl;
- goto set_err_state;
- }
- }
-
-#define MAX_COMPLETE_RETRY 100
- for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
- bool canceled = false;
- ret = processor.complete(cb.get_data_len(), etag, mtime, set_mtime,
- attrs, delete_at, nullptr, nullptr, nullptr,
- zones_trace, &canceled, null_yield);
- if (ret < 0) {
- goto set_err_state;
- }
-
- if (copy_if_newer && canceled) {
- ldpp_dout(dpp, 20) << "raced with another write of obj: " << dest_obj << dendl;
- obj_ctx.invalidate(dest_obj->get_obj()); /* object was overwritten */
- ret = get_obj_state(dpp, &obj_ctx, dest_bucket->get_info(), dest_obj, &dest_state, &manifest, false, null_yield);
- if (ret < 0) {
-        ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": get_obj_state() returned ret=" << ret << dendl;
- goto set_err_state;
- }
- dest_mtime_weight.init(dest_state);
- dest_mtime_weight.high_precision = high_precision_time;
- if (!dest_state->exists ||
- dest_mtime_weight < set_mtime_weight) {
- ldpp_dout(dpp, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
- continue;
- } else {
- ldpp_dout(dpp, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
- }
- }
- break;
- }
-
- if (i == MAX_COMPLETE_RETRY) {
- ldpp_dout(dpp, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
- ret = -EIO;
- goto set_err_state;
- }
-
- if (bytes_transferred) {
- *bytes_transferred = cb.get_data_len();
- }
- return 0;
-set_err_state:
- if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
- // we may have already fetched during sync of OP_ADD, but were waiting
- // for OP_LINK_OLH to call set_olh() with a real olh_epoch
- if (olh_epoch && *olh_epoch > 0) {
- constexpr bool log_data_change = true;
- ret = set_olh(dpp, obj_ctx, dest_bucket->get_info(), dest_obj, false, nullptr,
- *olh_epoch, real_time(), false, null_yield, zones_trace, log_data_change);
- } else {
- // we already have the latest copy
- ret = 0;
- }
- }
- return ret;
-}
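
// The bounded retry around processor.complete() above, in isolation.
// complete() and ours_is_newer() stand in for the real calls: retry while a
// racing writer cancels us and our candidate still beats what landed; a
// fixed cap (MAX_COMPLETE_RETRY) keeps two stubborn writers from livelocking
// each other.
#include <cerrno>
#include <functional>

inline int complete_with_retry(const std::function<int(bool* canceled)>& complete,
                               const std::function<bool()>& ours_is_newer,
                               int max_retry = 100)
{
  for (int i = 0; i < max_retry; ++i) {
    bool canceled = false;
    int r = complete(&canceled);
    if (r < 0) return r;
    if (!canceled || !ours_is_newer()) return 0;  // landed, or lost fairly
  }
  return -EIO;  // too many races: give up loudly
}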
-
-
-int RGWRados::copy_obj_to_remote_dest(const DoutPrefixProvider *dpp,
- RGWObjState *astate,
- map<string, bufferlist>& src_attrs,
- RGWRados::Object::Read& read_op,
- const rgw_user& user_id,
- rgw::sal::Object* dest_obj,
- real_time *mtime)
-{
- string etag;
-
- RGWRESTStreamS3PutObj *out_stream_req;
-
- auto rest_master_conn = svc.zone->get_master_conn();
-
- int ret = rest_master_conn->put_obj_async_init(dpp, user_id, dest_obj, src_attrs, &out_stream_req);
- if (ret < 0) {
- return ret;
- }
-
- out_stream_req->set_send_length(astate->size);
-
- ret = RGWHTTP::send(out_stream_req);
- if (ret < 0) {
- delete out_stream_req;
- return ret;
- }
-
- ret = read_op.iterate(dpp, 0, astate->size - 1, out_stream_req->get_out_cb(), null_yield);
- if (ret < 0) {
- delete out_stream_req;
- return ret;
- }
-
- ret = rest_master_conn->complete_request(out_stream_req, etag, mtime, null_yield);
- if (ret < 0)
- return ret;
-
- return 0;
-}
-
-/**
- * Copy an object.
- * dest_obj: the object to copy into
- * src_obj: the object to copy from
- * attrs: usage depends on attrs_mod parameter
- * attrs_mod: the modification mode of the attrs, may have the following values:
- * ATTRSMOD_NONE - the attributes of the source object will be
- * copied without modifications, attrs parameter is ignored;
- * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
- * parameter, source object attributes are not copied;
- * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
- * are overwritten by values contained in attrs parameter.
- * err: stores any errors resulting from the get of the original object
- * Returns: 0 on success, -ERR# otherwise.
- */
-int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
- const rgw_user& user_id,
- req_info *info,
- const rgw_zone_id& source_zone,
- rgw::sal::Object* dest_obj,
- rgw::sal::Object* src_obj,
- rgw::sal::Bucket* dest_bucket,
- rgw::sal::Bucket* src_bucket,
- const rgw_placement_rule& dest_placement,
- real_time *src_mtime,
- real_time *mtime,
- const real_time *mod_ptr,
- const real_time *unmod_ptr,
- bool high_precision_time,
- const char *if_match,
- const char *if_nomatch,
- AttrsMod attrs_mod,
- bool copy_if_newer,
- rgw::sal::Attrs& attrs,
- RGWObjCategory category,
- uint64_t olh_epoch,
- real_time delete_at,
- string *version_id,
- string *ptag,
- string *petag,
- void (*progress_cb)(off_t, void *),
- void *progress_data,
- const DoutPrefixProvider *dpp,
- optional_yield y)
-{
- int ret;
- uint64_t obj_size;
- rgw_obj shadow_obj = dest_obj->get_obj();
- string shadow_oid;
-
- bool remote_src;
- bool remote_dest;
-
- append_rand_alpha(cct, dest_obj->get_oid(), shadow_oid, 32);
- shadow_obj.init_ns(dest_obj->get_bucket()->get_key(), shadow_oid, shadow_ns);
-
- auto& zonegroup = svc.zone->get_zonegroup();
-
- remote_dest = !zonegroup.equals(dest_bucket->get_info().zonegroup);
- remote_src = !zonegroup.equals(src_bucket->get_info().zonegroup);
-
- if (remote_src && remote_dest) {
- ldpp_dout(dpp, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
- return -EINVAL;
- }
-
- ldpp_dout(dpp, 5) << "Copy object " << src_obj->get_bucket() << ":" << src_obj->get_oid() << " => " << dest_obj->get_bucket() << ":" << dest_obj->get_oid() << dendl;
-
- if (remote_src || !source_zone.empty()) {
- return fetch_remote_obj(obj_ctx, user_id, info, source_zone,
- dest_obj, src_obj, dest_bucket, src_bucket,
- dest_placement, src_mtime, mtime, mod_ptr,
- unmod_ptr, high_precision_time,
- if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
- olh_epoch, delete_at, ptag, petag, progress_cb, progress_data, dpp,
- nullptr /* filter */);
- }
-
- map<string, bufferlist> src_attrs;
- RGWRados::Object src_op_target(this, src_bucket, obj_ctx, src_obj);
- RGWRados::Object::Read read_op(&src_op_target);
-
- read_op.conds.mod_ptr = mod_ptr;
- read_op.conds.unmod_ptr = unmod_ptr;
- read_op.conds.high_precision_time = high_precision_time;
- read_op.conds.if_match = if_match;
- read_op.conds.if_nomatch = if_nomatch;
- read_op.params.attrs = &src_attrs;
- read_op.params.lastmod = src_mtime;
- read_op.params.obj_size = &obj_size;
-
- ret = read_op.prepare(y, dpp);
- if (ret < 0) {
- return ret;
- }
- if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) {
-    // The current implementation does not follow the S3 spec and may even
-    // silently corrupt data when copying multipart objects across pools,
-    // so reject COPY operations on encrypted objects until this is fully
-    // functional.
- ldpp_dout(dpp, 0) << "ERROR: copy op for encrypted object " << src_obj
- << " has not been implemented." << dendl;
- return -ERR_NOT_IMPLEMENTED;
- }
-
- src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
- src_attrs.erase(RGW_ATTR_DELETE_AT);
-
- src_attrs.erase(RGW_ATTR_OBJECT_RETENTION);
- src_attrs.erase(RGW_ATTR_OBJECT_LEGAL_HOLD);
- map<string, bufferlist>::iterator rt = attrs.find(RGW_ATTR_OBJECT_RETENTION);
- if (rt != attrs.end())
- src_attrs[RGW_ATTR_OBJECT_RETENTION] = rt->second;
- map<string, bufferlist>::iterator lh = attrs.find(RGW_ATTR_OBJECT_LEGAL_HOLD);
- if (lh != attrs.end())
- src_attrs[RGW_ATTR_OBJECT_LEGAL_HOLD] = lh->second;
-
- set_copy_attrs(src_attrs, attrs, attrs_mod);
- attrs.erase(RGW_ATTR_ID_TAG);
- attrs.erase(RGW_ATTR_PG_VER);
- attrs.erase(RGW_ATTR_SOURCE_ZONE);
- map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
- if (cmp != src_attrs.end())
- attrs[RGW_ATTR_COMPRESSION] = cmp->second;
-
- RGWObjManifest manifest;
- RGWObjState *astate = NULL;
- RGWObjManifest *amanifest = nullptr;
-
- ret = get_obj_state(dpp, &obj_ctx, src_bucket->get_info(), src_obj, &astate, &amanifest, y);
- if (ret < 0) {
- return ret;
- }
-
- vector<rgw_raw_obj> ref_objs;
-
- if (remote_dest) {
- /* dest is in a different zonegroup, copy it there */
- return copy_obj_to_remote_dest(dpp, astate, attrs, read_op, user_id, dest_obj, mtime);
- }
- uint64_t max_chunk_size;
-
- ret = get_max_chunk_size(dest_bucket->get_placement_rule(), dest_obj->get_obj(), &max_chunk_size, dpp);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj->get_bucket() << dendl;
- return ret;
- }
-
- rgw_pool src_pool;
- rgw_pool dest_pool;
-
- const rgw_placement_rule *src_rule{nullptr};
-
- if (amanifest) {
- src_rule = &amanifest->get_tail_placement().placement_rule;
- ldpp_dout(dpp, 20) << __func__ << "(): manifest src_rule=" << src_rule->to_str() << dendl;
- }
-
- if (!src_rule || src_rule->empty()) {
- src_rule = &src_bucket->get_placement_rule();
- }
-
- if (!get_obj_data_pool(*src_rule, src_obj->get_obj(), &src_pool)) {
- ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
- return -EIO;
- }
-
- if (!get_obj_data_pool(dest_placement, dest_obj->get_obj(), &dest_pool)) {
- ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
- return -EIO;
- }
-
- ldpp_dout(dpp, 20) << __func__ << "(): src_rule=" << src_rule->to_str() << " src_pool=" << src_pool
- << " dest_rule=" << dest_placement.to_str() << " dest_pool=" << dest_pool << dendl;
-
- bool copy_data = (!amanifest) ||
- (*src_rule != dest_placement) ||
- (src_pool != dest_pool);
-
- bool copy_first = false;
- if (amanifest) {
- if (!amanifest->has_tail()) {
- copy_data = true;
- } else {
- uint64_t head_size = amanifest->get_head_size();
-
- if (head_size > 0) {
- if (head_size > max_chunk_size) {
- copy_data = true;
- } else {
- copy_first = true;
- }
- }
- }
- }
-
- if (petag) {
- const auto iter = attrs.find(RGW_ATTR_ETAG);
- if (iter != attrs.end()) {
- *petag = iter->second.to_str();
- }
- }
-
- if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
- attrs.erase(RGW_ATTR_TAIL_TAG);
- return copy_obj_data(obj_ctx, dest_bucket, dest_placement, read_op, obj_size - 1, dest_obj,
- mtime, real_time(), attrs, olh_epoch, delete_at, petag, dpp, y);
- }
-
-  /* copy_data is true whenever amanifest is null and that path returned
-   * above, so amanifest is guaranteed to be non-null here */
- RGWObjManifest::obj_iterator miter = amanifest->obj_begin(dpp);
-
- if (copy_first) { // we need to copy first chunk, not increase refcount
- ++miter;
- }
-
- bufferlist first_chunk;
-
- const bool copy_itself = (dest_obj->get_obj() == src_obj->get_obj());
- RGWObjManifest *pmanifest;
- ldpp_dout(dpp, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
-
- RGWRados::Object dest_op_target(this, dest_bucket, obj_ctx, dest_obj);
- RGWRados::Object::Write write_op(&dest_op_target);
-
- string tag;
-
- if (ptag) {
- tag = *ptag;
- }
-
- if (tag.empty()) {
- append_rand_alpha(cct, tag, tag, 32);
- }
-
- std::unique_ptr<rgw::Aio> aio;
- rgw::AioResultList all_results;
- if (!copy_itself) {
- aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, y);
- attrs.erase(RGW_ATTR_TAIL_TAG);
- manifest = *amanifest;
- const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
- if (tail_placement.bucket.name.empty()) {
- manifest.set_tail_placement(tail_placement.placement_rule, src_obj->get_bucket()->get_key());
- }
- string ref_tag;
- for (; miter != amanifest->obj_end(dpp); ++miter) {
- ObjectWriteOperation op;
- ref_tag = tag + '\0';
- cls_refcount_get(op, ref_tag, true);
-
- auto obj = svc.rados->obj(miter.get_location().get_raw_obj(driver));
- ret = obj.open(dpp);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "failed to open rados context for " << obj << dendl;
- goto done_ret;
- }
-
- static constexpr uint64_t cost = 1; // 1 throttle unit per request
- static constexpr uint64_t id = 0; // ids unused
- rgw::AioResultList completed = aio->get(obj, rgw::Aio::librados_op(std::move(op), y), cost, id);
- ret = rgw::check_for_errors(completed);
- all_results.splice(all_results.end(), completed);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: failed to copy obj=" << obj << ", the error code = " << ret << dendl;
- goto done_ret;
- }
- }
-
- rgw::AioResultList completed = aio->drain();
- ret = rgw::check_for_errors(completed);
- all_results.splice(all_results.end(), completed);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: failed to drain ios, the error code = " << ret <<dendl;
- goto done_ret;
- }
-
- pmanifest = &manifest;
- } else {
- pmanifest = amanifest;
- /* don't send the object's tail for garbage collection */
- astate->keep_tail = true;
- }
-
- if (copy_first) {
- ret = read_op.read(0, max_chunk_size, first_chunk, y, dpp);
- if (ret < 0) {
- goto done_ret;
- }
-
- pmanifest->set_head(dest_bucket->get_placement_rule(), dest_obj->get_obj(), first_chunk.length());
- } else {
- pmanifest->set_head(dest_bucket->get_placement_rule(), dest_obj->get_obj(), 0);
- }
-
- write_op.meta.data = &first_chunk;
- write_op.meta.manifest = pmanifest;
- write_op.meta.ptag = &tag;
- write_op.meta.owner = dest_bucket->get_info().owner;
- write_op.meta.mtime = mtime;
- write_op.meta.flags = PUT_OBJ_CREATE;
- write_op.meta.category = category;
- write_op.meta.olh_epoch = olh_epoch;
- write_op.meta.delete_at = delete_at;
- write_op.meta.modify_tail = !copy_itself;
-
- ret = write_op.write_meta(dpp, obj_size, astate->accounted_size, attrs, y);
- if (ret < 0) {
- goto done_ret;
- }
-
- return 0;
-
-done_ret:
- if (!copy_itself) {
-
- /* wait all pending op done */
- rgw::AioResultList completed = aio->drain();
- all_results.splice(all_results.end(), completed);
-
- /* rollback reference */
- string ref_tag = tag + '\0';
- int ret2 = 0;
- for (auto& r : all_results) {
- if (r.result < 0) {
- continue; // skip errors
- }
- ObjectWriteOperation op;
- cls_refcount_put(op, ref_tag, true);
-
- static constexpr uint64_t cost = 1; // 1 throttle unit per request
- static constexpr uint64_t id = 0; // ids unused
- rgw::AioResultList completed = aio->get(r.obj, rgw::Aio::librados_op(std::move(op), y), cost, id);
- ret2 = rgw::check_for_errors(completed);
- if (ret2 < 0) {
- ldpp_dout(dpp, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << r.obj << dendl;
- }
- }
- completed = aio->drain();
- ret2 = rgw::check_for_errors(completed);
- if (ret2 < 0) {
- ldpp_dout(dpp, 0) << "ERROR: failed to drain rollback ios, the error code = " << ret2 <<dendl;
- }
- }
- return ret;
-}
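
// The copy-vs-refcount decision copy_obj() just made, pulled out into a
// pure function over its inputs. Data must be copied whenever there is no
// manifest, placement or pool differs, or the manifest has no tail; an
// oversized head also forces a full copy, while a small non-empty head is
// rewritten on its own alongside refcounted tail chunks (copy_first).
#include <cstdint>

inline bool must_copy_data(bool has_manifest, bool same_rule, bool same_pool,
                           bool has_tail, uint64_t head_size,
                           uint64_t max_chunk, bool* copy_first)
{
  *copy_first = false;
  if (!has_manifest || !same_rule || !same_pool) return true;
  if (!has_tail) return true;
  if (head_size > max_chunk) return true;
  *copy_first = head_size > 0;   // clone tail refs, rewrite only the head
  return false;
}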
-
-
-int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
- rgw::sal::Bucket* bucket,
- const rgw_placement_rule& dest_placement,
- RGWRados::Object::Read& read_op, off_t end,
- rgw::sal::Object* dest_obj,
- real_time *mtime,
- real_time set_mtime,
- rgw::sal::Attrs& attrs,
- uint64_t olh_epoch,
- real_time delete_at,
- string *petag,
- const DoutPrefixProvider *dpp,
- optional_yield y)
-{
- string tag;
- append_rand_alpha(cct, tag, tag, 32);
-
- rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
- using namespace rgw::putobj;
- // do not change the null_yield in the initialization of this AtomicObjectProcessor
- // it causes crashes in the ragweed tests
- AtomicObjectProcessor processor(&aio, this->driver, &dest_placement,
- bucket->get_info().owner, obj_ctx,
- dest_obj->clone(), olh_epoch, tag,
- dpp, null_yield);
- int ret = processor.prepare(y);
- if (ret < 0)
- return ret;
-
- off_t ofs = 0;
-
- do {
- bufferlist bl;
- ret = read_op.read(ofs, end, bl, y, dpp);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: fail to read object data, ret = " << ret << dendl;
- return ret;
- }
-
- uint64_t read_len = ret;
- ret = processor.process(std::move(bl), ofs);
- if (ret < 0) {
- return ret;
- }
-
- ofs += read_len;
- } while (ofs <= end);
-
- // flush
- ret = processor.process({}, ofs);
- if (ret < 0) {
- return ret;
- }
-
- string etag;
- auto iter = attrs.find(RGW_ATTR_ETAG);
- if (iter != attrs.end()) {
- bufferlist& bl = iter->second;
- etag = bl.to_str();
- if (petag) {
- *petag = etag;
- }
- }
-
- uint64_t accounted_size;
- {
- bool compressed{false};
- RGWCompressionInfo cs_info;
- ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: failed to read compression info" << dendl;
- return ret;
- }
- // pass original size if compressed
- accounted_size = compressed ? cs_info.orig_size : ofs;
- }
-
- return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
- nullptr, nullptr, nullptr, nullptr, nullptr, y);
-}
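-
-/* Illustrative walk-through of copy_obj_data() above (hypothetical sizes,
- * assuming a 4 MiB max read chunk): for a 10 MiB source with end = 10M - 1,
- * read_op.read() returns 4M, 4M and 2M bytes at ofs 0, 4M and 8M; each chunk
- * is fed to processor.process(bl, ofs), and once ofs passes end the final
- * processor.process({}, 10M) flushes any data still pending in the filter
- * chain before complete() finalizes the object. */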
-
-int RGWRados::transition_obj(RGWObjectCtx& obj_ctx,
- rgw::sal::Bucket* bucket,
- rgw::sal::Object& obj,
- const rgw_placement_rule& placement_rule,
- const real_time& mtime,
- uint64_t olh_epoch,
- const DoutPrefixProvider *dpp,
- optional_yield y)
-{
- rgw::sal::Attrs attrs;
- real_time read_mtime;
- uint64_t obj_size;
-
- obj.set_atomic();
- RGWRados::Object op_target(this, bucket, obj_ctx, &obj);
- RGWRados::Object::Read read_op(&op_target);
-
- read_op.params.attrs = &attrs;
- read_op.params.lastmod = &read_mtime;
- read_op.params.obj_size = &obj_size;
-
- int ret = read_op.prepare(y, dpp);
- if (ret < 0) {
- return ret;
- }
-
- if (read_mtime != mtime) {
- /* raced */
- return -ECANCELED;
- }
-
- attrs.erase(RGW_ATTR_ID_TAG);
- attrs.erase(RGW_ATTR_TAIL_TAG);
-
- ret = copy_obj_data(obj_ctx,
- bucket,
- placement_rule,
- read_op,
- obj_size - 1,
- &obj,
- nullptr /* pmtime */,
- mtime,
- attrs,
- olh_epoch,
- real_time(),
- nullptr /* petag */,
- dpp,
- y);
- if (ret < 0) {
- return ret;
- }
-
- return 0;
-}
-
-int RGWRados::check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y)
-{
- constexpr uint NUM_ENTRIES = 1000u;
-
- rgw_obj_index_key marker;
- string prefix;
- bool is_truncated;
-
- do {
- std::vector<rgw_bucket_dir_entry> ent_list;
- ent_list.reserve(NUM_ENTRIES);
-
- int r = cls_bucket_list_unordered(dpp,
- bucket_info,
- bucket_info.layout.current_index,
- RGW_NO_SHARD,
- marker,
- prefix,
- NUM_ENTRIES,
- true,
- ent_list,
- &is_truncated,
- &marker,
- y);
- if (r < 0) {
- return r;
- }
-
- string ns;
- for (auto const& dirent : ent_list) {
- rgw_obj_key obj;
-
- if (rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns)) {
- return -ENOTEMPTY;
- }
- }
- } while (is_truncated);
-
- return 0;
-}
-
-/**
- * Delete a bucket.
- * bucket_info: metadata of the bucket to delete
- * Returns 0 on success, -ERR# otherwise.
- */
-int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp, bool check_empty)
-{
- const rgw_bucket& bucket = bucket_info.bucket;
- RGWSI_RADOS::Pool index_pool;
- map<int, string> bucket_objs;
- int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
- if (r < 0)
- return r;
-
- if (check_empty) {
- r = check_bucket_empty(dpp, bucket_info, y);
- if (r < 0) {
- return r;
- }
- }
-
- bool remove_ep = true;
-
- if (objv_tracker.read_version.empty()) {
- RGWBucketEntryPoint ep;
- r = ctl.bucket->read_bucket_entrypoint_info(bucket_info.bucket,
- &ep,
- null_yield,
- dpp,
- RGWBucketCtl::Bucket::GetParams()
- .set_objv_tracker(&objv_tracker));
- if (r < 0 ||
- (!bucket_info.bucket.bucket_id.empty() &&
- ep.bucket.bucket_id != bucket_info.bucket.bucket_id)) {
- if (r != -ENOENT) {
- ldpp_dout(dpp, 0) << "ERROR: read_bucket_entrypoint_info() bucket=" << bucket_info.bucket << " returned error: r=" << r << dendl;
- /* we have no idea what caused the error, will not try to remove it */
- }
- /*
- * either failed to read bucket entrypoint, or it points to a different bucket instance than
- * requested
- */
- remove_ep = false;
- }
- }
-
- if (remove_ep) {
- r = ctl.bucket->remove_bucket_entrypoint_info(bucket_info.bucket, null_yield, dpp,
- RGWBucketCtl::Bucket::RemoveParams()
- .set_objv_tracker(&objv_tracker));
- if (r < 0)
- return r;
- }
-
- /* if the bucket is not synced we can remove the meta file */
- if (!svc.zone->is_syncing_bucket_meta(bucket)) {
- RGWObjVersionTracker objv_tracker;
- r = ctl.bucket->remove_bucket_instance_info(bucket, bucket_info, null_yield, dpp);
- if (r < 0) {
- return r;
- }
-
- /* remove bucket index objects asynchronously on a best-effort basis */
- (void) CLSRGWIssueBucketIndexClean(index_pool.ioctx(),
- bucket_objs,
- cct->_conf->rgw_bucket_index_max_aio)();
- }
-
- return 0;
-}
-
-int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner, const DoutPrefixProvider *dpp)
-{
- RGWBucketInfo info;
- map<string, bufferlist> attrs;
- int r;
-
- if (bucket.bucket_id.empty()) {
- r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp, &attrs);
- } else {
- r = get_bucket_instance_info(bucket, info, nullptr, &attrs, null_yield, dpp);
- }
- if (r < 0) {
- ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
- return r;
- }
-
- info.owner = owner.get_id();
-
- r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp);
- if (r < 0) {
- ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
- return r;
- }
-
- return 0;
-}
-
-
-int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled, const DoutPrefixProvider *dpp)
-{
- int ret = 0;
-
- vector<rgw_bucket>::iterator iter;
-
- for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
- rgw_bucket& bucket = *iter;
- if (enabled) {
- ldpp_dout(dpp, 20) << "enabling bucket name=" << bucket.name << dendl;
- } else {
- ldpp_dout(dpp, 20) << "disabling bucket name=" << bucket.name << dendl;
- }
-
- RGWBucketInfo info;
- map<string, bufferlist> attrs;
- int r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp, &attrs);
- if (r < 0) {
- ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
- ret = r;
- continue;
- }
- if (enabled) {
- info.flags &= ~BUCKET_SUSPENDED;
- } else {
- info.flags |= BUCKET_SUSPENDED;
- }
-
- r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp);
- if (r < 0) {
- ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
- ret = r;
- continue;
- }
- }
- return ret;
-}
-
-int RGWRados::bucket_suspended(const DoutPrefixProvider *dpp, rgw_bucket& bucket, bool *suspended)
-{
- RGWBucketInfo bucket_info;
- int ret = get_bucket_info(&svc, bucket.tenant, bucket.name, bucket_info, NULL, null_yield, dpp);
- if (ret < 0) {
- return ret;
- }
-
- *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
- return 0;
-}
-
-int RGWRados::Object::complete_atomic_modification(const DoutPrefixProvider *dpp)
-{
- if (!manifest || state->keep_tail)
- return 0;
-
- cls_rgw_obj_chain chain;
- store->update_gc_chain(dpp, obj->get_obj(), *manifest, &chain);
-
- if (chain.empty()) {
- return 0;
- }
-
- string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
- if (store->gc == nullptr) {
- ldpp_dout(dpp, 0) << "deleting objects inline since gc isn't initialized" << dendl;
- // delete objects inline if gc hasn't been initialized, to prevent crashes
- store->delete_objs_inline(dpp, chain, tag);
- } else {
- auto [ret, leftover_chain] = store->gc->send_split_chain(chain, tag); // do it synchronously
- if (ret < 0 && leftover_chain) {
- // delete objects inline if sending the chain to gc fails
- store->delete_objs_inline(dpp, *leftover_chain, tag);
- }
- }
- return 0;
-}
-
-void RGWRados::update_gc_chain(const DoutPrefixProvider *dpp, rgw_obj head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
-{
- RGWObjManifest::obj_iterator iter;
- rgw_raw_obj raw_head;
- obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
- for (iter = manifest.obj_begin(dpp); iter != manifest.obj_end(dpp); ++iter) {
- const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(driver);
- if (mobj == raw_head)
- continue;
- cls_rgw_obj_key key(mobj.oid);
- chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
- }
-}
-
-std::tuple<int, std::optional<cls_rgw_obj_chain>> RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag)
-{
- if (chain.empty()) {
- return {0, std::nullopt};
- }
-
- return gc->send_split_chain(chain, tag);
-}
-
-void RGWRados::delete_objs_inline(const DoutPrefixProvider *dpp, cls_rgw_obj_chain& chain, const string& tag)
-{
- string last_pool;
- std::unique_ptr<IoCtx> ctx(new IoCtx);
- int ret = 0;
- for (auto liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) {
- cls_rgw_obj& obj = *liter;
- if (obj.pool != last_pool) {
- ctx.reset(new IoCtx);
- ret = rgw_init_ioctx(dpp, get_rados_handle(), obj.pool, *ctx);
- if (ret < 0) {
- last_pool = "";
- ldpp_dout(dpp, 0) << "ERROR: failed to create ioctx pool=" <<
- obj.pool << dendl;
- continue;
- }
- last_pool = obj.pool;
- }
- ctx->locator_set_key(obj.loc);
- const string& oid = obj.key.name; /* just stored raw oid there */
- ldpp_dout(dpp, 5) << "delete_objs_inline: removing " << obj.pool <<
- ":" << obj.key.name << dendl;
- ObjectWriteOperation op;
- cls_refcount_put(op, tag, true);
- ret = ctx->operate(oid, &op);
- if (ret < 0) {
- ldpp_dout(dpp, 5) << "delete_objs_inline: refcount put returned error " << ret << dendl;
- }
- }
-}
-
-static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
- map<RGWObjCategory, RGWStorageStats>& stats)
-{
- for (const auto& pair : header.stats) {
- const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
- const rgw_bucket_category_stats& header_stats = pair.second;
-
- RGWStorageStats& s = stats[category];
-
- s.category = category;
- s.size += header_stats.total_size;
- s.size_rounded += header_stats.total_size_rounded;
- s.size_utilized += header_stats.actual_size;
- s.num_objects += header_stats.num_entries;
- }
-}
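-
-/* Worked example for accumulate_raw_stats() (hypothetical numbers): two shard
- * headers that both carry RGWObjCategory::Main stats, {total_size=4096,
- * num_entries=1} and {total_size=8192, num_entries=2}, accumulate into a
- * single per-category entry {size=12288, num_objects=3}. */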
-
-int RGWRados::bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
- map<RGWObjCategory, RGWStorageStats> *existing_stats,
- map<RGWObjCategory, RGWStorageStats> *calculated_stats)
-{
- RGWSI_RADOS::Pool index_pool;
-
- // key - bucket index shard id
- // value - bucket index object id for that shard
- map<int, string> oids;
-
- int ret = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &oids, nullptr);
- if (ret < 0) {
- return ret;
- }
-
- // declare and pre-populate
- map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
- for (auto& iter : oids) {
- bucket_objs_ret.emplace(iter.first, rgw_cls_check_index_ret());
- }
-
- ret = CLSRGWIssueBucketCheck(index_pool.ioctx(), oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
- if (ret < 0) {
- return ret;
- }
-
- // aggregate results (from different shards if there are any)
- for (const auto& iter : bucket_objs_ret) {
- accumulate_raw_stats(iter.second.existing_header, *existing_stats);
- accumulate_raw_stats(iter.second.calculated_header, *calculated_stats);
- }
-
- return 0;
-}
-
-int RGWRados::bucket_rebuild_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info)
-{
- RGWSI_RADOS::Pool index_pool;
- map<int, string> bucket_objs;
-
- int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
- if (r < 0) {
- return r;
- }
-
- return CLSRGWIssueBucketRebuild(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
-}
-
-int RGWRados::bucket_set_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
-{
- RGWSI_RADOS::Pool index_pool;
- map<int, string> bucket_objs;
-
- int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
- if (r < 0) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
- ": unable to open bucket index, r=" << r << " (" <<
- cpp_strerror(-r) << ")" << dendl;
- return r;
- }
-
- r = CLSRGWIssueSetBucketResharding(index_pool.ioctx(), bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
- if (r < 0) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
- ": unable to issue set bucket resharding, r=" << r << " (" <<
- cpp_strerror(-r) << ")" << dendl;
- }
- return r;
-}
-
-int RGWRados::defer_gc(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, optional_yield y)
-{
- std::string oid, key;
- get_obj_bucket_and_oid_loc(obj->get_obj(), oid, key);
- if (!rctx)
- return 0;
-
- RGWObjState *state = NULL;
- RGWObjManifest *manifest = nullptr;
-
- int r = get_obj_state(dpp, rctx, bucket_info, obj, &state, &manifest, false, y);
- if (r < 0)
- return r;
-
- if (!state->is_atomic) {
- ldpp_dout(dpp, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
- return -EINVAL;
- }
-
- string tag;
-
- if (state->tail_tag.length() > 0) {
- tag = state->tail_tag.c_str();
- } else if (state->obj_tag.length() > 0) {
- tag = state->obj_tag.c_str();
- } else {
- ldpp_dout(dpp, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
- return -EINVAL;
- }
-
- ldpp_dout(dpp, 0) << "defer chain tag=" << tag << dendl;
-
- cls_rgw_obj_chain chain;
- update_gc_chain(dpp, state->obj, *manifest, &chain);
- return gc->async_defer_chain(tag, chain);
-}
-
-void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
-{
- list<string> prefixes;
- prefixes.push_back(RGW_ATTR_OLH_PREFIX);
- cls_rgw_remove_obj(op, prefixes);
-}
-
-void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
-{
- cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
-}
-
-void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
-{
- cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
-}
-
-struct tombstone_entry {
- ceph::real_time mtime;
- uint32_t zone_short_id;
- uint64_t pg_ver;
-
- tombstone_entry() = default;
- explicit tombstone_entry(const RGWObjState& state)
- : mtime(state.mtime), zone_short_id(state.zone_short_id),
- pg_ver(state.pg_ver) {}
-};
-
-/**
- * Delete an object.
- * The target object and deletion parameters are taken from the enclosing
- * RGWRados::Object::Delete operation.
- * Returns: 0 on success, -ERR# otherwise.
- */
-int RGWRados::Object::Delete::delete_obj(optional_yield y, const DoutPrefixProvider *dpp)
-{
- RGWRados *store = target->get_store();
- const string& instance = target->get_instance();
- rgw_obj obj = target->get_obj();
-
- if (instance == "null") {
- obj.key.instance.clear();
- }
-
- bool explicit_marker_version = (!params.marker_version_id.empty());
-
- if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
- if (instance.empty() || explicit_marker_version) {
- std::unique_ptr<rgw::sal::Object> marker = target->get_target()->clone();
- marker->clear_instance();
-
- if (!params.marker_version_id.empty()) {
- if (params.marker_version_id != "null") {
- marker->set_instance(params.marker_version_id);
- }
- } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
- marker->gen_rand_obj_instance_name();
- }
-
- result.version_id = marker->get_instance();
- if (result.version_id.empty())
- result.version_id = "null";
- result.delete_marker = true;
-
- struct rgw_bucket_dir_entry_meta meta;
-
- meta.owner = params.obj_owner.get_id().to_str();
- meta.owner_display_name = params.obj_owner.get_display_name();
-
- if (real_clock::is_zero(params.mtime)) {
- meta.mtime = real_clock::now();
- } else {
- meta.mtime = params.mtime;
- }
-
- int r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), marker.get(), true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, y, params.zones_trace);
- if (r < 0) {
- return r;
- }
- } else {
- rgw_bucket_dir_entry dirent;
-
- int r = store->bi_get_instance(dpp, target->get_bucket_info(), obj, &dirent);
- if (r < 0) {
- return r;
- }
- result.delete_marker = dirent.is_delete_marker();
- r = store->unlink_obj_instance(dpp, target->get_bucket_info(), target->get_target(), params.olh_epoch, y, params.zones_trace);
- if (r < 0) {
- return r;
- }
- result.version_id = instance;
- }
-
- BucketShard *bs = nullptr;
- int r = target->get_bucket_shard(&bs, dpp);
- if (r < 0) {
- ldpp_dout(dpp, 5) << "failed to get BucketShard object: r=" << r << dendl;
- return r;
- }
-
- add_datalog_entry(dpp, store->svc.datalog_rados,
- target->get_bucket_info(), bs->shard_id);
-
- return 0;
- }
-
- rgw_rados_ref ref;
- int r = store->get_obj_head_ref(dpp, target->get_bucket_info(), obj, &ref);
- if (r < 0) {
- return r;
- }
-
- RGWObjState *state;
- RGWObjManifest *manifest = nullptr;
- r = target->get_state(dpp, &state, &manifest, false, y);
- if (r < 0)
- return r;
-
- ObjectWriteOperation op;
-
- if (!real_clock::is_zero(params.unmod_since)) {
- struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
- struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
- if (!params.high_precision_time) {
- ctime.tv_nsec = 0;
- unmod.tv_nsec = 0;
- }
-
- ldpp_dout(dpp, 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
- if (ctime > unmod) {
- return -ERR_PRECONDITION_FAILED;
- }
-
- /* only delete object if mtime is less than or equal to params.unmod_since */
- store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
- }
- uint64_t obj_accounted_size = state->accounted_size;
-
- if (params.abortmp) {
- obj_accounted_size = params.parts_accounted_size;
- }
-
- if (!real_clock::is_zero(params.expiration_time)) {
- bufferlist bl;
- real_time delete_at;
-
- if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
- try {
- auto iter = bl.cbegin();
- decode(delete_at, iter);
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
- return -EIO;
- }
-
- if (params.expiration_time != delete_at) {
- return -ERR_PRECONDITION_FAILED;
- }
- } else {
- return -ERR_PRECONDITION_FAILED;
- }
- }
-
- if (!state->exists) {
- target->invalidate_state();
- return -ENOENT;
- }
-
- r = target->prepare_atomic_modification(dpp, op, false, NULL, NULL, NULL, true, false, y);
- if (r < 0)
- return r;
-
- RGWBucketInfo& bucket_info = target->get_bucket_info();
-
- RGWRados::Bucket bop(store, bucket_info);
- RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
-
- index_op.set_zones_trace(params.zones_trace);
- index_op.set_bilog_flags(params.bilog_flags);
-
- r = index_op.prepare(dpp, CLS_RGW_OP_DEL, &state->write_tag, y);
- if (r < 0)
- return r;
-
- store->remove_rgw_head_obj(op);
-
- auto& ioctx = ref.pool.ioctx();
- r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, y);
-
- /* raced with another operation, object state is indeterminate */
- const bool need_invalidate = (r == -ECANCELED);
-
- int64_t poolid = ioctx.get_id();
- if (r >= 0) {
- tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
- if (obj_tombstone_cache) {
- tombstone_entry entry{*state};
- obj_tombstone_cache->add(obj, entry);
- }
- r = index_op.complete_del(dpp, poolid, ioctx.get_last_version(), state->mtime, params.remove_objs);
-
- int ret = target->complete_atomic_modification(dpp);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
- }
- /* other than that, no need to propagate error */
- } else {
- int ret = index_op.cancel(dpp, params.remove_objs);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
- }
- }
-
- if (need_invalidate) {
- target->invalidate_state();
- }
-
- if (r < 0)
- return r;
-
- /* update quota cache */
- store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_accounted_size);
-
- return 0;
-}
-
-int RGWRados::delete_obj(rgw::sal::Driver* store,
- const DoutPrefixProvider *dpp,
- const RGWBucketInfo& bucket_info,
- const rgw_obj& obj,
- int versioning_status, // versioning flags defined in enum RGWBucketFlags
- uint16_t bilog_flags,
- const real_time& expiration_time,
- rgw_zone_set *zones_trace)
-{
- std::unique_ptr<rgw::sal::Bucket> bucket;
- store->get_bucket(nullptr, bucket_info, &bucket);
- std::unique_ptr<rgw::sal::Object> object = bucket->get_object(obj.key);
-
- return delete_obj(dpp, bucket_info, object.get(), versioning_status,
- bilog_flags, expiration_time, zones_trace);
-}
-
-int RGWRados::delete_obj(const DoutPrefixProvider *dpp,
- const RGWBucketInfo& bucket_info,
- rgw::sal::Object* obj,
- int versioning_status, // versioning flags defined in enum RGWBucketFlags
- uint16_t bilog_flags,
- const real_time& expiration_time,
- rgw_zone_set *zones_trace)
-{
- std::unique_ptr<rgw::sal::Object::DeleteOp> del_op = obj->get_delete_op();
-
- del_op->params.bucket_owner = bucket_info.owner;
- del_op->params.versioning_status = versioning_status;
- del_op->params.bilog_flags = bilog_flags;
- del_op->params.expiration_time = expiration_time;
- del_op->params.zones_trace = zones_trace;
-
- return del_op->delete_obj(dpp, null_yield);
-}
-
-int RGWRados::delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj)
-{
- rgw_rados_ref ref;
- int r = get_raw_obj_ref(dpp, obj, &ref);
- if (r < 0) {
- return r;
- }
-
- ObjectWriteOperation op;
-
- op.remove();
- r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
- if (r < 0)
- return r;
-
- return 0;
-}
-
-int RGWRados::delete_obj_index(const rgw_obj& obj, ceph::real_time mtime, const DoutPrefixProvider *dpp)
-{
- std::string oid, key;
- get_obj_bucket_and_oid_loc(obj, oid, key);
-
- RGWBucketInfo bucket_info;
- int ret = get_bucket_instance_info(obj.bucket, bucket_info, NULL, NULL, null_yield, dpp);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
- return ret;
- }
-
- RGWRados::Bucket bop(this, bucket_info);
- RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
-
- return index_op.complete_del(dpp, -1 /* pool */, 0, mtime, NULL);
-}
-
-static void generate_fake_tag(const DoutPrefixProvider *dpp, rgw::sal::Driver* store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
-{
- string tag;
-
- RGWObjManifest::obj_iterator mi = manifest.obj_begin(dpp);
- if (mi != manifest.obj_end(dpp)) {
- if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
- ++mi;
- rgw::sal::RadosStore* rstore = dynamic_cast<rgw::sal::RadosStore*>(store);
- tag = mi.get_location().get_raw_obj(rstore).oid;
- tag.append("_");
- }
-
- unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
- char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
- MD5 hash;
- // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
- hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
- hash.Update((const unsigned char *)manifest_bl.c_str(), manifest_bl.length());
-
- map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
- if (iter != attrset.end()) {
- bufferlist& bl = iter->second;
- hash.Update((const unsigned char *)bl.c_str(), bl.length());
- }
-
- hash.Final(md5);
- buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
- tag.append(md5_str);
-
- ldpp_dout(dpp, 10) << "generate_fake_tag new tag=" << tag << dendl;
-
- tag_bl.append(tag.c_str(), tag.size() + 1);
-}
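-
-/* Shape of a fake tag (hypothetical oid): for a manifest with a tail stripe
- * whose oid is "<bucket-marker>__shadow_.foo_1", the generated tag is
- * "<bucket-marker>__shadow_.foo_1_<32-hex-md5>", where the md5 covers the raw
- * manifest buffer plus the etag attr when one is present. */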
-
-static bool is_olh(map<string, bufferlist>& attrs)
-{
- map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
- return (iter != attrs.end());
-}
-
-static bool has_olh_tag(map<string, bufferlist>& attrs)
-{
- map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
- return (iter != attrs.end());
-}
-
-int RGWRados::get_olh_target_state(const DoutPrefixProvider *dpp, RGWObjectCtx&
- obj_ctx, RGWBucketInfo& bucket_info,
- rgw::sal::Object* obj, RGWObjState *olh_state,
- RGWObjState **target_state,
- RGWObjManifest **target_manifest, optional_yield y)
-{
- ceph_assert(olh_state->is_olh);
-
- rgw_obj target;
- int r = RGWRados::follow_olh(dpp, bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
- if (r < 0) {
- return r;
- }
-
- std::unique_ptr<rgw::sal::Bucket> bucket;
- driver->get_bucket(nullptr, bucket_info, &bucket);
- std::unique_ptr<rgw::sal::Object> target_obj = bucket->get_object(target.key);
-
- r = get_obj_state(dpp, &obj_ctx, bucket_info, target_obj.get(), target_state,
- target_manifest, false, y);
- if (r < 0) {
- return r;
- }
-
- return 0;
-}
-
-int RGWRados::get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx,
- RGWBucketInfo& bucket_info, rgw::sal::Object* obj,
- RGWObjState **state, RGWObjManifest** manifest,
- bool follow_olh, optional_yield y, bool assume_noent)
-{
- if (obj->empty()) {
- return -EINVAL;
- }
-
- bool need_follow_olh = follow_olh && obj->get_obj().key.instance.empty();
- *manifest = nullptr;
-
- RGWObjStateManifest *sm = rctx->get_state(obj->get_obj());
- RGWObjState *s = &(sm->state);
- ldpp_dout(dpp, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
- *state = s;
- if (sm->manifest) {
- *manifest = &(*sm->manifest);
- }
- if (s->has_attrs) {
- if (s->is_olh && need_follow_olh) {
- return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, manifest, y);
- }
- return 0;
- }
-
- s->obj = obj->get_obj();
-
- rgw_raw_obj raw_obj;
- obj_to_raw(bucket_info.placement_rule, obj->get_obj(), &raw_obj);
-
- int r = -ENOENT;
-
- if (!assume_noent) {
- r = RGWRados::raw_obj_stat(dpp, raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL, y);
- }
-
- if (r == -ENOENT) {
- s->exists = false;
- s->has_attrs = true;
- tombstone_entry entry;
- if (obj_tombstone_cache && obj_tombstone_cache->find(obj->get_obj(), entry)) {
- s->mtime = entry.mtime;
- s->zone_short_id = entry.zone_short_id;
- s->pg_ver = entry.pg_ver;
- ldpp_dout(dpp, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
- << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
- } else {
- s->mtime = real_time();
- }
- return 0;
- }
- if (r < 0)
- return r;
-
- s->exists = true;
- s->has_attrs = true;
- s->accounted_size = s->size;
-
- auto iter = s->attrset.find(RGW_ATTR_ETAG);
- if (iter != s->attrset.end()) {
- /* get rid of extra null character at the end of the etag, as we used to store it like that */
- bufferlist& bletag = iter->second;
- if (bletag.length() > 0 && bletag[bletag.length() - 1] == '\0') {
- bufferlist newbl;
- bletag.splice(0, bletag.length() - 1, &newbl);
- bletag = std::move(newbl);
- }
- }
-
- iter = s->attrset.find(RGW_ATTR_COMPRESSION);
- const bool compressed = (iter != s->attrset.end());
- if (compressed) {
- // use uncompressed size for accounted_size
- try {
- RGWCompressionInfo info;
- auto p = iter->second.cbegin();
- decode(info, p);
- s->accounted_size = info.orig_size;
- } catch (buffer::error&) {
- ldpp_dout(dpp, 0) << "ERROR: could not decode compression info for object: " << obj << dendl;
- return -EIO;
- }
- }
-
- iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
- if (iter != s->attrset.end()) {
- bufferlist bl = iter->second;
- bufferlist::iterator it = bl.begin();
- it.copy(bl.length(), s->shadow_obj);
- s->shadow_obj[bl.length()] = '\0';
- }
- s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
- auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG);
- if (ttiter != s->attrset.end()) {
- s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG];
- }
-
- bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
- if (manifest_bl.length()) {
- auto miter = manifest_bl.cbegin();
- try {
- sm->manifest.emplace();
- decode(*sm->manifest, miter);
- /* patch the manifest to reflect the head we just read; some manifests
- * might be broken due to old bugs */
- sm->manifest->set_head(bucket_info.placement_rule, obj->get_obj(), s->size);
- s->size = sm->manifest->get_obj_size();
- if (!compressed)
- s->accounted_size = s->size;
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 0) << "ERROR: couldn't decode manifest" << dendl;
- return -EIO;
- }
- *manifest = &(*sm->manifest);
- ldpp_dout(dpp, 10) << "manifest: total_size = " << sm->manifest->get_obj_size() << dendl;
- if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>() &&
- sm->manifest->has_explicit_objs()) {
- RGWObjManifest::obj_iterator mi;
- for (mi = sm->manifest->obj_begin(dpp); mi != sm->manifest->obj_end(dpp); ++mi) {
- ldpp_dout(dpp, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(driver) << dendl;
- }
- }
-
- if (!s->obj_tag.length()) {
- /*
- * Something's wrong: an object with a manifest should have a tag.
- * Create one out of the manifest; it will be unique.
- */
- generate_fake_tag(dpp, driver, s->attrset, *sm->manifest, manifest_bl, s->obj_tag);
- s->fake_tag = true;
- }
- }
- map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
- if (aiter != s->attrset.end()) {
- bufferlist& pg_ver_bl = aiter->second;
- if (pg_ver_bl.length()) {
- auto pgbl = pg_ver_bl.cbegin();
- try {
- decode(s->pg_ver, pgbl);
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
- }
- }
- }
- aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
- if (aiter != s->attrset.end()) {
- bufferlist& zone_short_id_bl = aiter->second;
- if (zone_short_id_bl.length()) {
- auto zbl = zone_short_id_bl.cbegin();
- try {
- decode(s->zone_short_id, zbl);
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
- }
- }
- }
- if (s->obj_tag.length()) {
- ldpp_dout(dpp, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
- } else {
- ldpp_dout(dpp, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
- }
-
- /* an object might not be an olh yet, but could have an olh id tag, so we should
- * set it whenever it exists, not only when is_olh() returns true
- */
- iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
- if (iter != s->attrset.end()) {
- s->olh_tag = iter->second;
- }
-
- if (is_olh(s->attrset)) {
- s->is_olh = true;
-
- ldpp_dout(dpp, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
-
- if (need_follow_olh) {
- return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, manifest, y);
- } else if (obj->get_obj().key.have_null_instance() && !sm->manifest) {
- // reading the null version, but the head object only has olh info
- s->exists = false;
- return -ENOENT;
- }
- }
-
- return 0;
-}
-
-int RGWRados::get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, RGWObjState **state, RGWObjManifest** manifest,
- bool follow_olh, optional_yield y, bool assume_noent)
-{
- int ret;
-
- do {
- ret = get_obj_state_impl(dpp, rctx, bucket_info, obj, state, manifest, follow_olh, y, assume_noent);
- } while (ret == -EAGAIN);
-
- return ret;
-}
-
-int RGWRados::Object::get_manifest(const DoutPrefixProvider *dpp, RGWObjManifest **pmanifest, optional_yield y)
-{
- RGWObjState *astate;
- int r = get_state(dpp, &astate, pmanifest, true, y);
- if (r < 0) {
- return r;
- }
-
- return 0;
-}
-
-int RGWRados::Object::Read::get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest, optional_yield y)
-{
- RGWObjState *state;
- RGWObjManifest *manifest = nullptr;
- int r = source->get_state(dpp, &state, &manifest, true, y);
- if (r < 0)
- return r;
- if (!state->exists)
- return -ENOENT;
- if (!state->get_attr(name, dest))
- return -ENODATA;
-
- return 0;
-}
-
-int RGWRados::Object::Stat::stat_async(const DoutPrefixProvider *dpp)
-{
- rgw::sal::Object* target = source->get_target();
- rgw_obj obj = target->get_obj();
- RGWRados *store = source->get_store();
-
- result.obj = obj;
- if (target->has_attrs()) {
- state.ret = 0;
- result.size = target->get_obj_size();
- result.mtime = ceph::real_clock::to_timespec(target->get_mtime());
- result.attrs = target->get_attrs();
- //result.manifest = sm->manifest;
- return 0;
- }
-
- string oid;
- string loc;
- get_obj_bucket_and_oid_loc(obj, oid, loc);
-
- int r = store->get_obj_head_ioctx(dpp, source->get_bucket_info(), obj, &state.io_ctx);
- if (r < 0) {
- return r;
- }
-
- librados::ObjectReadOperation op;
- op.stat2(&result.size, &result.mtime, NULL);
- op.getxattrs(&result.attrs, NULL);
- state.completion = librados::Rados::aio_create_completion(nullptr, nullptr);
- state.io_ctx.locator_set_key(loc);
- r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
- if (r < 0) {
- ldpp_dout(dpp, 5) << __func__
- << ": ERROR: aio_operate() returned ret=" << r
- << dendl;
- return r;
- }
-
- return 0;
-}
-
-
-int RGWRados::Object::Stat::wait(const DoutPrefixProvider *dpp)
-{
- if (!state.completion) {
- return state.ret;
- }
-
- state.completion->wait_for_complete();
- state.ret = state.completion->get_return_value();
- state.completion->release();
-
- if (state.ret != 0) {
- return state.ret;
- }
-
- return finish(dpp);
-}
-
-int RGWRados::Object::Stat::finish(const DoutPrefixProvider *dpp)
-{
- map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
- if (iter != result.attrs.end()) {
- bufferlist& bl = iter->second;
- auto biter = bl.cbegin();
- try {
- result.manifest.emplace();
- decode(*result.manifest, biter);
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
- return -EIO;
- }
- }
-
- return 0;
-}
-
-int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp,
- RGWBucketInfo& bucket_info, rgw::sal::Object* obj,
- ObjectOperation& op, RGWObjState **pstate,
- RGWObjManifest** pmanifest, optional_yield y)
-{
- int r = obj->get_obj_state(dpp, pstate, y, false);
- if (r < 0)
- return r;
-
- return append_atomic_test(dpp, *pstate, op);
-}
-
-int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp,
- const RGWObjState* state,
- librados::ObjectOperation& op)
-{
- if (!state->is_atomic) {
- ldpp_dout(dpp, 20) << "state for obj=" << state->obj << " is not atomic, not appending atomic test" << dendl;
- return 0;
- }
-
- if (state->obj_tag.length() > 0 && !state->fake_tag) { // check for backward compatibility
- op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
- } else {
- ldpp_dout(dpp, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
- }
- return 0;
-}
-
-int RGWRados::Object::get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, RGWObjManifest **pmanifest, bool follow_olh, optional_yield y, bool assume_noent)
-{
- int r = obj->get_obj_state(dpp, pstate, y, follow_olh);
- if (r < 0) {
- return r;
- }
- *pmanifest = static_cast<rgw::sal::RadosObject*>(obj)->get_manifest();
-
- return r;
-}
-
-void RGWRados::Object::invalidate_state()
-{
- obj->invalidate();
-}
-
-int RGWRados::Object::prepare_atomic_modification(const DoutPrefixProvider *dpp,
- ObjectWriteOperation& op, bool reset_obj, const string *ptag,
- const char *if_match, const char *if_nomatch, bool removal_op,
- bool modify_tail, optional_yield y)
-{
- int r = get_state(dpp, &state, &manifest, false, y);
- if (r < 0)
- return r;
-
- bool need_guard = ((manifest) || (state->obj_tag.length() != 0) ||
- if_match != NULL || if_nomatch != NULL) &&
- (!state->fake_tag);
-
- if (!state->is_atomic) {
- ldpp_dout(dpp, 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
-
- if (reset_obj) {
- op.create(false);
- store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
- }
-
- return 0;
- }
-
- if (need_guard) {
- /* first verify that the object wasn't replaced under us */
- if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
- op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
- // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
- }
-
- if (if_match) {
- if (strcmp(if_match, "*") == 0) {
- // check that the object exists
- if (!state->exists) {
- return -ERR_PRECONDITION_FAILED;
- }
- } else {
- bufferlist bl;
- if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
- strncmp(if_match, bl.c_str(), bl.length()) != 0) {
- return -ERR_PRECONDITION_FAILED;
- }
- }
- }
-
- if (if_nomatch) {
- if (strcmp(if_nomatch, "*") == 0) {
- // check that the object does NOT exist
- if (state->exists) {
- return -ERR_PRECONDITION_FAILED;
- }
- } else {
- bufferlist bl;
- if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
- strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
- return -ERR_PRECONDITION_FAILED;
- }
- }
- }
- }
-
- if (reset_obj) {
- if (state->exists) {
- op.create(false);
- store->remove_rgw_head_obj(op);
- } else {
- op.create(true);
- }
- }
-
- if (removal_op) {
- /* the object is being removed, no need to update its tag */
- return 0;
- }
-
- if (ptag) {
- state->write_tag = *ptag;
- } else {
- append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
- }
- bufferlist bl;
- bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
-
- ldpp_dout(dpp, 10) << "setting object write_tag=" << state->write_tag << dendl;
-
- op.setxattr(RGW_ATTR_ID_TAG, bl);
- if (modify_tail) {
- op.setxattr(RGW_ATTR_TAIL_TAG, bl);
- }
-
- return 0;
-}
-
-/**
- * Set an attr on an object.
- * bucket_info: metadata of the bucket holding the object
- * obj: the object to set the attr on
- * name: the attr to set
- * bl: the contents of the attr
- * Returns: 0 on success, -ERR# otherwise.
- */
-int RGWRados::set_attr(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, const char *name, bufferlist& bl)
-{
- map<string, bufferlist> attrs;
- attrs[name] = bl;
- return set_attrs(dpp, rctx, bucket_info, obj, attrs, NULL, null_yield);
-}
-
-int RGWRados::set_attrs(const DoutPrefixProvider *dpp, void *ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* src_obj,
- map<string, bufferlist>& attrs,
- map<string, bufferlist>* rmattrs,
- optional_yield y)
-{
- std::unique_ptr<rgw::sal::Object> obj = src_obj->clone();
- if (obj->get_instance() == "null") {
- obj->clear_instance();
- }
-
- rgw_rados_ref ref;
- int r = get_obj_head_ref(dpp, bucket_info, obj->get_obj(), &ref);
- if (r < 0) {
- return r;
- }
-
- ObjectWriteOperation op;
- RGWObjState *state = NULL;
- RGWObjManifest *manifest = nullptr;
-
- r = append_atomic_test(dpp, bucket_info, obj.get(), op, &state, &manifest, y);
- if (r < 0)
- return r;
-
- // ensure the null version object exists
- if (src_obj->get_instance() == "null" && !manifest) {
- return -ENOENT;
- }
-
- map<string, bufferlist>::iterator iter;
- if (rmattrs) {
- for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
- const string& name = iter->first;
- op.rmxattr(name.c_str());
- }
- }
-
- const rgw_bucket& bucket = obj->get_bucket()->get_key();
-
- for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
- const string& name = iter->first;
- bufferlist& bl = iter->second;
-
- if (!bl.length())
- continue;
-
- op.setxattr(name.c_str(), bl);
-
- if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
- real_time ts;
- try {
- decode(ts, bl);
-
- rgw_obj_index_key obj_key;
- obj->get_key().get_index_key(&obj_key);
-
- obj_expirer->hint_add(dpp, ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
- }
- }
- }
-
- if (!op.size())
- return 0;
-
- bufferlist bl;
- RGWRados::Bucket bop(this, bucket_info);
- RGWRados::Bucket::UpdateIndex index_op(&bop, obj->get_obj());
-
- if (state) {
- string tag;
- append_rand_alpha(cct, tag, tag, 32);
- state->write_tag = tag;
- r = index_op.prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y);
-
- if (r < 0)
- return r;
-
- bl.append(tag.c_str(), tag.size() + 1);
- op.setxattr(RGW_ATTR_ID_TAG, bl);
- }
-
-
- real_time mtime = real_clock::now();
- struct timespec mtime_ts = real_clock::to_timespec(mtime);
- op.mtime2(&mtime_ts);
- auto& ioctx = ref.pool.ioctx();
- r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, null_yield);
- if (state) {
- if (r >= 0) {
- bufferlist acl_bl = attrs[RGW_ATTR_ACL];
- bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
- bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
- string etag = rgw_bl_str(etag_bl);
- string content_type = rgw_bl_str(content_type_bl);
- string storage_class;
- auto iter = attrs.find(RGW_ATTR_STORAGE_CLASS);
- if (iter != attrs.end()) {
- storage_class = rgw_bl_str(iter->second);
- }
- uint64_t epoch = ioctx.get_last_version();
- int64_t poolid = ioctx.get_id();
- r = index_op.complete(dpp, poolid, epoch, state->size, state->accounted_size,
- mtime, etag, content_type, storage_class, &acl_bl,
- RGWObjCategory::Main, NULL);
- } else {
- int ret = index_op.cancel(dpp, nullptr);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
- }
- }
- }
- if (r < 0)
- return r;
-
- if (state) {
- state->obj_tag.swap(bl);
- if (rmattrs) {
- for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
- state->attrset.erase(iter->first);
- }
- }
-
- for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
- state->attrset[iter->first] = iter->second;
- }
-
- auto iter = state->attrset.find(RGW_ATTR_ID_TAG);
- if (iter != state->attrset.end()) {
- iter->second = state->obj_tag;
- }
- }
-
- return 0;
-}
-
-int RGWRados::Object::Read::prepare(optional_yield y, const DoutPrefixProvider *dpp)
-{
- RGWRados *store = source->get_store();
- CephContext *cct = store->ctx();
-
- bufferlist etag;
-
- map<string, bufferlist>::iterator iter;
-
- RGWObjState *astate;
- RGWObjManifest *manifest = nullptr;
- int r = source->get_state(dpp, &astate, &manifest, true, y);
- if (r < 0)
- return r;
-
- if (!astate->exists) {
- return -ENOENT;
- }
-
- const RGWBucketInfo& bucket_info = source->get_bucket_info();
-
- state.obj = astate->obj;
- store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
-
- state.cur_pool = state.head_obj.pool;
- state.cur_ioctx = &state.io_ctxs[state.cur_pool];
-
- r = store->get_obj_head_ioctx(dpp, bucket_info, state.obj, state.cur_ioctx);
- if (r < 0) {
- return r;
- }
- if (params.target_obj) {
- *params.target_obj = state.obj;
- }
- if (params.attrs) {
- *params.attrs = astate->attrset;
- if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
- for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
- ldpp_dout(dpp, 20) << "Read xattr rgw_rados: " << iter->first << dendl;
- }
- }
- }
-
- /* Convert all times to GMT to make them comparable */
- if (conds.mod_ptr || conds.unmod_ptr) {
- obj_time_weight src_weight;
- src_weight.init(astate);
- src_weight.high_precision = conds.high_precision_time;
-
- obj_time_weight dest_weight;
- dest_weight.high_precision = conds.high_precision_time;
-
- if (conds.mod_ptr && !conds.if_nomatch) {
- dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
- ldpp_dout(dpp, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
- if (!(dest_weight < src_weight)) {
- return -ERR_NOT_MODIFIED;
- }
- }
-
- if (conds.unmod_ptr && !conds.if_match) {
- dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
- ldpp_dout(dpp, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
- if (dest_weight < src_weight) {
- return -ERR_PRECONDITION_FAILED;
- }
- }
- }
- if (conds.if_match || conds.if_nomatch) {
- r = get_attr(dpp, RGW_ATTR_ETAG, etag, y);
- if (r < 0)
- return r;
-
- if (conds.if_match) {
- string if_match_str = rgw_string_unquote(conds.if_match);
- ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-Match: " << if_match_str << dendl;
- if (if_match_str.compare(0, etag.length(), etag.c_str(), etag.length()) != 0) {
- return -ERR_PRECONDITION_FAILED;
- }
- }
-
- if (conds.if_nomatch) {
- string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
- ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-NoMatch: " << if_nomatch_str << dendl;
- if (if_nomatch_str.compare(0, etag.length(), etag.c_str(), etag.length()) == 0) {
- return -ERR_NOT_MODIFIED;
- }
- }
- }
-
- if (params.obj_size)
- *params.obj_size = astate->size;
- if (params.lastmod)
- *params.lastmod = astate->mtime;
-
- return 0;
-}
-
-int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
-{
- if (ofs < 0) {
- ofs += obj_size;
- if (ofs < 0)
- ofs = 0;
- end = obj_size - 1;
- } else if (end < 0) {
- end = obj_size - 1;
- }
-
- if (obj_size > 0) {
- if (ofs >= (off_t)obj_size) {
- return -ERANGE;
- }
- if (end >= (off_t)obj_size) {
- end = obj_size - 1;
- }
- }
- return 0;
-}
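-
-/* range_to_ofs() examples with obj_size=100 (values illustrative):
- *   ofs=-10, end=0    ->  ofs=90, end=99   (suffix range: last 10 bytes)
- *   ofs=20,  end=-1   ->  ofs=20, end=99   (open-ended range)
- *   ofs=20,  end=150  ->  ofs=20, end=99   (end clamped to object size)
- *   ofs=120           ->  -ERANGE          (start beyond end of object) */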
-
-int RGWRados::Bucket::UpdateIndex::guard_reshard(const DoutPrefixProvider *dpp, const rgw_obj& obj_instance, BucketShard **pbs, std::function<int(BucketShard *)> call)
-{
- RGWRados *store = target->get_store();
- BucketShard *bs = nullptr;
- int r;
-
-#define NUM_RESHARD_RETRIES 10
- for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
- int ret = get_bucket_shard(&bs, dpp);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: failed to get BucketShard object. obj=" <<
- obj_instance.key << ". ret=" << ret << dendl;
- return ret;
- }
-
- r = call(bs);
- if (r != -ERR_BUSY_RESHARDING) {
- break;
- }
-
- ldpp_dout(dpp, 10) <<
- "NOTICE: resharding operation on bucket index detected, blocking. obj=" <<
- obj_instance.key << dendl;
-
- r = store->block_while_resharding(bs, obj_instance, target->bucket_info, null_yield, dpp);
- if (r == -ERR_BUSY_RESHARDING) {
- ldpp_dout(dpp, 10) << __func__ <<
- " NOTICE: block_while_resharding() still busy. obj=" <<
- obj_instance.key << dendl;
- continue;
- } else if (r < 0) {
- ldpp_dout(dpp, 0) << __func__ <<
- " ERROR: block_while_resharding() failed. obj=" <<
- obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl;
- return r;
- }
-
- ldpp_dout(dpp, 20) << "reshard completion identified. obj=" << obj_instance.key << dendl;
- i = 0; /* resharding is finished, make sure we can retry */
- invalidate_bs();
- } // for loop
-
- if (r < 0) {
- ldpp_dout(dpp, 0) << "ERROR: bucket shard callback failed. obj=" <<
- obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl;
- return r;
- }
-
- if (pbs) {
- *pbs = bs;
- }
-
- return 0;
-}
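-
-/* guard_reshard() retry flow (illustrative): if call(bs) fails with
- * -ERR_BUSY_RESHARDING, block_while_resharding() waits for the reshard to
- * finish; on success the shard handle is invalidated and the retry counter
- * reset, so the callback is re-issued against the new index layout. Any
- * other error, or exhausting NUM_RESHARD_RETRIES, aborts the loop. */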
-
-int RGWRados::Bucket::UpdateIndex::prepare(const DoutPrefixProvider *dpp, RGWModifyOp op, const string *write_tag, optional_yield y)
-{
- if (blind) {
- return 0;
- }
- RGWRados *store = target->get_store();
-
- if (write_tag && write_tag->length()) {
- optag = string(write_tag->c_str(), write_tag->length());
- } else {
- if (optag.empty()) {
- append_rand_alpha(store->ctx(), optag, optag, 32);
- }
- }
-
- int r = guard_reshard(dpp, obj, nullptr, [&](BucketShard *bs) -> int {
- return store->cls_obj_prepare_op(dpp, *bs, op, optag, obj, bilog_flags, y, zones_trace);
- });
-
- if (r < 0) {
- return r;
- }
- prepared = true;
-
- return 0;
-}
-
-int RGWRados::Bucket::UpdateIndex::complete(const DoutPrefixProvider *dpp, int64_t poolid, uint64_t epoch,
- uint64_t size, uint64_t accounted_size,
- ceph::real_time& ut, const string& etag,
- const string& content_type, const string& storage_class,
- bufferlist *acl_bl,
- RGWObjCategory category,
- list<rgw_obj_index_key> *remove_objs, const string *user_data,
- bool appendable)
-{
- if (blind) {
- return 0;
- }
- RGWRados *store = target->get_store();
- BucketShard *bs = nullptr;
-
- int ret = get_bucket_shard(&bs, dpp);
- if (ret < 0) {
- ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
- return ret;
- }
-
- rgw_bucket_dir_entry ent;
- obj.key.get_index_key(&ent.key);
- ent.meta.size = size;
- ent.meta.accounted_size = accounted_size;
- ent.meta.mtime = ut;
- ent.meta.etag = etag;
- ent.meta.storage_class = storage_class;
- if (user_data)
- ent.meta.user_data = *user_data;
-
- ACLOwner owner;
- if (acl_bl && acl_bl->length()) {
- int ret = store->decode_policy(dpp, *acl_bl, &owner);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "WARNING: could not decode policy ret=" << ret << dendl;
- }
- }
- ent.meta.owner = owner.get_id().to_str();
- ent.meta.owner_display_name = owner.get_display_name();
- ent.meta.content_type = content_type;
- ent.meta.appendable = appendable;
-
- ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
-
- add_datalog_entry(dpp, store->svc.datalog_rados,
- target->bucket_info, bs->shard_id);
-
- return ret;
-}
-
-int RGWRados::Bucket::UpdateIndex::complete_del(const DoutPrefixProvider *dpp,
- int64_t poolid, uint64_t epoch,
- real_time& removed_mtime,
- list<rgw_obj_index_key> *remove_objs)
-{
- if (blind) {
- return 0;
- }
- RGWRados *store = target->get_store();
- BucketShard *bs = nullptr;
-
- int ret = get_bucket_shard(&bs, dpp);
- if (ret < 0) {
- ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
- return ret;
- }
-
- ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
-
- add_datalog_entry(dpp, store->svc.datalog_rados,
- target->bucket_info, bs->shard_id);
-
- return ret;
-}
-
-
-int RGWRados::Bucket::UpdateIndex::cancel(const DoutPrefixProvider *dpp,
- list<rgw_obj_index_key> *remove_objs)
-{
- if (blind) {
- return 0;
- }
- RGWRados *store = target->get_store();
- BucketShard *bs;
-
- int ret = guard_reshard(dpp, obj, &bs, [&](BucketShard *bs) -> int {
- return store->cls_obj_complete_cancel(*bs, optag, obj, remove_objs, bilog_flags, zones_trace);
- });
-
- /*
- * The data log must be updated regardless, so that anyone following this
- * bucket shard's log can advance its internal markers. Otherwise followers
- * would fall behind, with no way to tell whether they're caught up.
- */
- add_datalog_entry(dpp, store->svc.datalog_rados,
- target->bucket_info, bs->shard_id);
-
- return ret;
-}
-
-/*
- * Read up through index `end` inclusive. Number of bytes read is up
- * to `end - ofs + 1`.
- */
-int RGWRados::Object::Read::read(int64_t ofs, int64_t end,
- bufferlist& bl, optional_yield y,
- const DoutPrefixProvider *dpp)
-{
- RGWRados *store = source->get_store();
-
- rgw_raw_obj read_obj;
- uint64_t read_ofs = ofs;
- uint64_t len, read_len;
- bool reading_from_head = true;
- ObjectReadOperation op;
-
- bool merge_bl = false;
- bufferlist *pbl = &bl;
- bufferlist read_bl;
- uint64_t max_chunk_size;
-
- RGWObjState *astate;
- RGWObjManifest *manifest = nullptr;
- int r = source->get_state(dpp, &astate, &manifest, true, y);
- if (r < 0)
- return r;
-
- if (astate->size == 0) {
- end = 0;
- } else if (end >= (int64_t)astate->size) {
- end = astate->size - 1;
- }
-
- if (end < 0)
- len = 0;
- else
- len = end - ofs + 1;
-
- if (manifest && manifest->has_tail()) {
- /* now get the relevant object part */
- RGWObjManifest::obj_iterator iter = manifest->obj_find(dpp, ofs);
-
- uint64_t stripe_ofs = iter.get_stripe_ofs();
- read_obj = iter.get_location().get_raw_obj(store->driver);
- len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
- read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
- reading_from_head = (read_obj == state.head_obj);
- } else {
- read_obj = state.head_obj;
- }
-
- r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size, dpp);
- if (r < 0) {
- ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
- return r;
- }
-
- if (len > max_chunk_size)
- len = max_chunk_size;
-
-
- read_len = len;
-
- if (reading_from_head) {
- /* only when reading from the head object do we need to do the atomic test */
- std::unique_ptr<rgw::sal::Object> obj = source->bucket->get_object(state.obj.key);
- r = store->append_atomic_test(dpp, source->get_bucket_info(), obj.get(), op, &astate, &manifest, y);
- if (r < 0)
- return r;
-
- if (astate && astate->prefetch_data) {
- if (!ofs && astate->data.length() >= len) {
- bl = astate->data;
- return bl.length();
- }
-
- if (ofs < astate->data.length()) {
- unsigned copy_len = std::min((uint64_t)astate->data.length() - ofs, len);
- astate->data.begin(ofs).copy(copy_len, bl);
- read_len -= copy_len;
- read_ofs += copy_len;
- if (!read_len)
- return bl.length();
-
- merge_bl = true;
- pbl = &read_bl;
- }
- }
- }
-
- ldpp_dout(dpp, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
- op.read(read_ofs, read_len, pbl, NULL);
-
- if (state.cur_pool != read_obj.pool) {
- auto iter = state.io_ctxs.find(read_obj.pool);
- if (iter == state.io_ctxs.end()) {
- state.cur_ioctx = &state.io_ctxs[read_obj.pool];
- r = store->open_pool_ctx(dpp, read_obj.pool, *state.cur_ioctx, false);
- if (r < 0) {
- ldpp_dout(dpp, 20) << "ERROR: failed to open pool context for pool=" << read_obj.pool << " r=" << r << dendl;
- return r;
- }
- } else {
- state.cur_ioctx = &iter->second;
- }
- state.cur_pool = read_obj.pool;
- }
-
- state.cur_ioctx->locator_set_key(read_obj.loc);
-
- r = state.cur_ioctx->operate(read_obj.oid, &op, NULL);
- ldpp_dout(dpp, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
-
- if (r < 0) {
- return r;
- }
-
- if (merge_bl) {
- bl.append(read_bl);
- }
-
- return bl.length();
-}
-
-int get_obj_data::flush(rgw::AioResultList&& results) {
- int r = rgw::check_for_errors(results);
- if (r < 0) {
- return r;
- }
- std::list<bufferlist> bl_list;
-
- auto cmp = [](const auto& lhs, const auto& rhs) { return lhs.id < rhs.id; };
- results.sort(cmp); // merge() requires results to be sorted first
- completed.merge(results, cmp); // merge results in sorted order
-
- while (!completed.empty() && completed.front().id == offset) {
- auto bl = std::move(completed.front().data);
-
- bl_list.push_back(bl);
- offset += bl.length();
- int r = client_cb->handle_data(bl, 0, bl.length());
- if (r < 0) {
- return r;
- }
-
- if (rgwrados->get_use_datacache()) {
- const std::lock_guard l(d3n_get_data.d3n_lock);
- auto oid = completed.front().obj.get_ref().obj.oid;
- if (bl.length() <= g_conf()->rgw_get_obj_max_req_size && !d3n_bypass_cache_write) {
- lsubdout(g_ceph_context, rgw_datacache, 10) << "D3nDataCache: " << __func__ << "(): bl.length <= rgw_get_obj_max_req_size (default 4MB) - write to datacache, bl.length=" << bl.length() << dendl;
- rgwrados->d3n_data_cache->put(bl, bl.length(), oid);
- } else {
- lsubdout(g_ceph_context, rgw_datacache, 10) << "D3nDataCache: " << __func__ << "(): not writing to datacache - bl.length > rgw_get_obj_max_req_size (default 4MB), bl.length=" << bl.length() << " or d3n_bypass_cache_write=" << d3n_bypass_cache_write << dendl;
- }
- }
- completed.pop_front_and_dispose(std::default_delete<rgw::AioResultEntry>{});
- }
- return 0;
-}
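-
-/* flush() ordering example (illustrative ids/offsets): reads issued at ids
- * 0, 4M and 8M may complete as 4M, 0, 8M. The 4M entry is buffered in
- * `completed` until the id-0 entry arrives; both are then handed to
- * client_cb->handle_data() in offset order, with `offset` advancing by each
- * buffer's length so id 8M flushes only once 0..8M-1 have been delivered. */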
-
-static int _get_obj_iterate_cb(const DoutPrefixProvider *dpp,
- const rgw_raw_obj& read_obj, off_t obj_ofs,
- off_t read_ofs, off_t len, bool is_head_obj,
- RGWObjState *astate, void *arg)
-{
- struct get_obj_data* d = static_cast<struct get_obj_data*>(arg);
- return d->rgwrados->get_obj_iterate_cb(dpp, read_obj, obj_ofs, read_ofs, len,
- is_head_obj, astate, arg);
-}
-
-int RGWRados::get_obj_iterate_cb(const DoutPrefixProvider *dpp,
- const rgw_raw_obj& read_obj, off_t obj_ofs,
- off_t read_ofs, off_t len, bool is_head_obj,
- RGWObjState *astate, void *arg)
-{
- ObjectReadOperation op;
- struct get_obj_data* d = static_cast<struct get_obj_data*>(arg);
- string oid, key;
-
- if (is_head_obj) {
- /* only when reading from the head object do we need to do the atomic test */
- int r = append_atomic_test(dpp, astate, op);
- if (r < 0)
- return r;
-
- if (astate &&
- obj_ofs < astate->data.length()) {
- unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
-
- r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
- if (r < 0)
- return r;
-
- len -= chunk_len;
- d->offset += chunk_len;
- read_ofs += chunk_len;
- obj_ofs += chunk_len;
- if (!len)
- return 0;
- }
- }
-
- auto obj = d->rgwrados->svc.rados->obj(read_obj);
- int r = obj.open(dpp);
- if (r < 0) {
- ldpp_dout(dpp, 4) << "failed to open rados context for " << read_obj << dendl;
- return r;
- }
-
- ldpp_dout(dpp, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
- op.read(read_ofs, len, nullptr, nullptr);
-
- const uint64_t cost = len;
- const uint64_t id = obj_ofs; // use logical object offset for sorting replies
-
- auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);
-
- return d->flush(std::move(completed));
-}
-
-int RGWRados::Object::Read::iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb,
- optional_yield y)
-{
- RGWRados *store = source->get_store();
- CephContext *cct = store->ctx();
- const uint64_t chunk_size = cct->_conf->rgw_get_obj_max_req_size;
- const uint64_t window_size = cct->_conf->rgw_get_obj_window_size;
-
- auto aio = rgw::make_throttle(window_size, y);
- get_obj_data data(store, cb, &*aio, ofs, y);
-
- int r = store->iterate_obj(dpp, source->get_ctx(), source->get_bucket_info(),
- source->get_target(),
- ofs, end, chunk_size, _get_obj_iterate_cb, &data, y);
- if (r < 0) {
- ldpp_dout(dpp, 0) << "iterate_obj() failed with " << r << dendl;
- data.cancel(); // drain completions without writing back to client
- return r;
- }
-
- return data.drain();
-}
-
-int RGWRados::iterate_obj(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx,
- RGWBucketInfo& bucket_info, rgw::sal::Object* obj,
- off_t ofs, off_t end, uint64_t max_chunk_size,
- iterate_obj_cb cb, void *arg, optional_yield y)
-{
- rgw_raw_obj head_obj;
- rgw_raw_obj read_obj;
- uint64_t read_ofs = ofs;
- uint64_t len;
- bool reading_from_head = true;
- RGWObjState *astate = NULL;
- RGWObjManifest *manifest = nullptr;
-
- obj_to_raw(bucket_info.placement_rule, obj->get_obj(), &head_obj);
-
- int r = get_obj_state(dpp, &obj_ctx, bucket_info, obj, &astate, &manifest, false, y);
- if (r < 0) {
- return r;
- }
-
- if (end < 0)
- len = 0;
- else
- len = end - ofs + 1;
-
- if (manifest) {
- /* now get the relevant object stripe */
- RGWObjManifest::obj_iterator iter = manifest->obj_find(dpp, ofs);
-
- RGWObjManifest::obj_iterator obj_end = manifest->obj_end(dpp);
-
- for (; iter != obj_end && ofs <= end; ++iter) {
- off_t stripe_ofs = iter.get_stripe_ofs();
- off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
-
- while (ofs < next_stripe_ofs && ofs <= end) {
- read_obj = iter.get_location().get_raw_obj(driver);
- uint64_t read_len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
- read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
-
- if (read_len > max_chunk_size) {
- read_len = max_chunk_size;
- }
-
- reading_from_head = (read_obj == head_obj);
- r = cb(dpp, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
- if (r < 0) {
- return r;
- }
-
- len -= read_len;
- ofs += read_len;
- }
- }
- } else {
- while (ofs <= end) {
- read_obj = head_obj;
- uint64_t read_len = std::min(len, max_chunk_size);
-
- r = cb(dpp, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
- if (r < 0) {
- return r;
- }
-
- len -= read_len;
- ofs += read_len;
- }
- }
-
- return 0;
-}
-
-int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
-{
- rgw_rados_ref ref;
- int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
- if (r < 0) {
- return r;
- }
-
- return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, null_yield);
-}
-
-int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
-{
- rgw_rados_ref ref;
- int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
- if (r < 0) {
- return r;
- }
-
- bufferlist outbl;
-
- return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, &outbl, null_yield);
-}
-
-int RGWRados::olh_init_modification_impl(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
-{
- ObjectWriteOperation op;
-
- ceph_assert(olh_obj.key.instance.empty());
-
- bool has_tag = (state.exists && has_olh_tag(state.attrset));
-
- if (!state.exists) {
- op.create(true);
- } else {
- op.assert_exists();
- struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
- op.mtime2(&mtime_ts);
- }
-
- /*
- * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
- * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
- * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
- * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
- * log will reflect that.
- *
- * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
- * is used for object data instance, olh_tag for olh instance.
- */
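-  /*
-   * As a rough sketch, the xattrs this call leaves on the olh object are:
-   *   RGW_ATTR_ID_TAG               random tag guarding the data instance
-   *   RGW_ATTR_OLH_ID_TAG           random tag guarding the olh instance
-   *   RGW_ATTR_OLH_VER              olh version (initially empty)
-   *   RGW_ATTR_OLH_PENDING_<tag>    one entry per in-flight modification
-   */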
- if (has_tag) {
- /* guard against racing writes */
- bucket_index_guard_olh_op(dpp, state, op);
- }
-
- if (!has_tag) {
- /* obj tag */
- string obj_tag = gen_rand_alphanumeric_lower(cct, 32);
-
- bufferlist bl;
- bl.append(obj_tag.c_str(), obj_tag.size());
- op.setxattr(RGW_ATTR_ID_TAG, bl);
-
- state.attrset[RGW_ATTR_ID_TAG] = bl;
- state.obj_tag = bl;
-
- /* olh tag */
- string olh_tag = gen_rand_alphanumeric_lower(cct, 32);
-
- bufferlist olh_bl;
- olh_bl.append(olh_tag.c_str(), olh_tag.size());
- op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
-
- state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
- state.olh_tag = olh_bl;
- state.is_olh = true;
-
- bufferlist verbl;
- op.setxattr(RGW_ATTR_OLH_VER, verbl);
- }
-
- bufferlist bl;
- RGWOLHPendingInfo pending_info;
- pending_info.time = real_clock::now();
- encode(pending_info, bl);
-
-#define OLH_PENDING_TAG_LEN 32
-  /* the tag starts with the current time epoch so that entries are sorted by time */

- char buf[32];
- utime_t ut(pending_info.time);
- snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
- *op_tag = buf;
-
- string s = gen_rand_alphanumeric_lower(cct, OLH_PENDING_TAG_LEN - op_tag->size());
-
- op_tag->append(s);
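-  // e.g. "0000000065a1b2c3" + 16 random lowercase alphanumerics = 32 bytes;
-  // the hex-encoded seconds prefix keeps the pending xattrs time-sorted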
-
- string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
- attr_name.append(*op_tag);
-
- op.setxattr(attr_name.c_str(), bl);
-
- int ret = obj_operate(dpp, bucket_info, olh_obj, &op);
- if (ret < 0) {
- return ret;
- }
-
- state.exists = true;
- state.attrset[attr_name] = bl;
-
- return 0;
-}
-
-int RGWRados::olh_init_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
-{
- int ret;
-
- ret = olh_init_modification_impl(dpp, bucket_info, state, obj, op_tag);
- if (ret == -EEXIST) {
- ret = -ECANCELED;
- }
-
- return ret;
-}
-
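-// run a bucket-index operation via `call`, waiting out any in-progress
-// reshard and retrying against the new index layout; the retry counter
-// resets whenever a reshard completes under us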
-int RGWRados::guard_reshard(const DoutPrefixProvider *dpp,
- BucketShard *bs,
- const rgw_obj& obj_instance,
- RGWBucketInfo& bucket_info,
- std::function<int(BucketShard *)> call)
-{
- rgw_obj obj;
- const rgw_obj *pobj = &obj_instance;
- int r;
-
- for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
- r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */, dpp);
- if (r < 0) {
- ldpp_dout(dpp, 5) << "bs.init() returned ret=" << r << dendl;
- return r;
- }
-
- r = call(bs);
- if (r != -ERR_BUSY_RESHARDING) {
- break;
- }
-
- ldpp_dout(dpp, 10) <<
- "NOTICE: resharding operation on bucket index detected, blocking. obj=" <<
- obj_instance.key << dendl;
-
- r = block_while_resharding(bs, obj_instance, bucket_info, null_yield, dpp);
- if (r == -ERR_BUSY_RESHARDING) {
- ldpp_dout(dpp, 10) << __func__ <<
- " NOTICE: block_while_resharding() still busy. obj=" <<
- obj_instance.key << dendl;
- continue;
- } else if (r < 0) {
- ldpp_dout(dpp, 0) << __func__ <<
- " ERROR: block_while_resharding() failed. obj=" <<
- obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl;
- return r;
- }
-
- ldpp_dout(dpp, 20) << "reshard completion identified" << dendl;
- i = 0; /* resharding is finished, make sure we can retry */
- } // for loop
-
- if (r < 0) {
- ldpp_dout(dpp, 0) << "ERROR: bucket shard callback failed. obj=" <<
- obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl;
- return r;
- }
-
- return 0;
-}
-
-
-int RGWRados::block_while_resharding(RGWRados::BucketShard *bs,
- const rgw_obj& obj_instance,
- RGWBucketInfo& bucket_info,
- optional_yield y,
- const DoutPrefixProvider *dpp)
-{
- int ret = 0;
- cls_rgw_bucket_instance_entry entry;
-
- // gets loaded by fetch_new_bucket_info; can be used by
- // clear_resharding
- std::map<std::string, bufferlist> bucket_attrs;
-
-  // since we want to run this recovery code from two distinct places,
-  // let's just put it in a lambda so we can easily re-use it; on
-  // success the lambda refreshes bucket_info and bucket_attrs and
-  // re-initializes the bucket shard, returning 0; otherwise it
-  // returns a negative error code
- auto fetch_new_bucket_info =
- [this, bs, &obj_instance, &bucket_info, &bucket_attrs, &y, dpp](const std::string& log_tag) -> int {
- int ret = get_bucket_info(&svc, bs->bucket.tenant, bs->bucket.name,
- bucket_info, nullptr, y, dpp, &bucket_attrs);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << __func__ <<
- " ERROR: failed to refresh bucket info after reshard at " <<
- log_tag << ": " << cpp_strerror(-ret) << dendl;
- return ret;
- }
-
- ret = bs->init(dpp, bucket_info, obj_instance);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << __func__ <<
- " ERROR: failed to refresh bucket shard generation after reshard at " <<
- log_tag << ": " << cpp_strerror(-ret) << dendl;
- return ret;
- }
-
- const auto gen = bucket_info.layout.logs.empty() ? -1 : bucket_info.layout.logs.back().gen;
- ldpp_dout(dpp, 20) << __func__ <<
- " INFO: refreshed bucket info after reshard at " <<
- log_tag << ". new shard_id=" << bs->shard_id << ". gen=" << gen << dendl;
-
- return 0;
- }; // lambda fetch_new_bucket_info
-
- constexpr int num_retries = 10;
- for (int i = 1; i <= num_retries; i++) { // nb: 1-based for loop
- auto& ref = bs->bucket_obj.get_ref();
- ret = cls_rgw_get_bucket_resharding(ref.pool.ioctx(), ref.obj.oid, &entry);
- if (ret == -ENOENT) {
- ret = fetch_new_bucket_info("get_bucket_resharding_failed");
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
- " failed to refresh bucket info after reshard when get bucket "
- "resharding failed, error: " << cpp_strerror(-ret) << dendl;
- return ret;
- }
- } else if (ret < 0) {
- ldpp_dout(dpp, 0) << __func__ <<
- " ERROR: failed to get bucket resharding : " << cpp_strerror(-ret) <<
- dendl;
- return ret;
- }
-
-    if (!entry.resharding_in_progress()) {
-      ret = fetch_new_bucket_info("get_bucket_resharding_succeeded");
-      if (ret < 0) {
-        ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
-          " failed to refresh bucket info after reshard when get bucket "
-          "resharding succeeded, error: " << cpp_strerror(-ret) << dendl;
-      }
-      // resharding has completed (or was never underway); return the
-      // refresh result so the caller can retry against the new layout
-      return ret;
-    }
-
- ldpp_dout(dpp, 20) << __func__ << " NOTICE: reshard still in progress; " <<
- (i < num_retries ? "retrying" : "too many retries") << dendl;
-
- if (i == num_retries) {
- break;
- }
-
- // If bucket is erroneously marked as resharding (e.g., crash or
- // other error) then fix it. If we can take the bucket reshard
- // lock then it means no other resharding should be taking place,
- // and we're free to clear the flags.
- {
- // since we expect to do this rarely, we'll do our work in a
- // block and erase our work after each try
-
- RGWObjectCtx obj_ctx(this->driver);
- const rgw_bucket& b = bs->bucket;
- std::string bucket_id = b.get_key();
- RGWBucketReshardLock reshard_lock(this->driver, bucket_info, true);
- ret = reshard_lock.lock(dpp);
- if (ret == -ENOENT) {
- continue;
- } else if (ret < 0) {
- ldpp_dout(dpp, 20) << __func__ <<
- " ERROR: failed to take reshard lock for bucket " <<
- bucket_id << "; expected if resharding underway" << dendl;
- } else {
- ldpp_dout(dpp, 10) << __func__ <<
- " INFO: was able to take reshard lock for bucket " <<
- bucket_id << dendl;
- // the reshard may have finished, so call clear_resharding()
- // with its current bucket info; ALSO this will load
- // bucket_attrs for call to clear_resharding below
- ret = fetch_new_bucket_info("trying_to_clear_resharding");
- if (ret < 0) {
- reshard_lock.unlock();
- ldpp_dout(dpp, 0) << __func__ <<
- " ERROR: failed to update bucket info before clear resharding for bucket " <<
- bucket_id << dendl;
- continue; // try again
- }
-
- ret = RGWBucketReshard::clear_resharding(this->driver, bucket_info, bucket_attrs, dpp);
- reshard_lock.unlock();
- if (ret == -ENOENT) {
- ldpp_dout(dpp, 5) << __func__ <<
- " INFO: no need to reset reshard flags; old shards apparently"
- " removed after successful resharding of bucket " <<
- bucket_id << dendl;
- continue; // immediately test again
- } else if (ret < 0) {
- ldpp_dout(dpp, 0) << __func__ <<
- " ERROR: failed to clear resharding flags for bucket " <<
- bucket_id << ", " << cpp_strerror(-ret) << dendl;
- // wait and then test again
- } else {
- ldpp_dout(dpp, 5) << __func__ <<
- " INFO: apparently successfully cleared resharding flags for "
- "bucket " << bucket_id << dendl;
- continue; // if we apparently succeed immediately test again
- } // if clear resharding succeeded
- } // if taking of lock succeeded
- } // block to encapsulate recovery from incomplete reshard
-
- ret = reshard_wait->wait(y);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << __func__ <<
- " ERROR: bucket is still resharding, please retry" << dendl;
- return ret;
- }
- } // for loop
-
- ldpp_dout(dpp, 0) << __func__ <<
- " ERROR: bucket is still resharding, please retry" << dendl;
- return -ERR_BUSY_RESHARDING;
-}
-
-int RGWRados::bucket_index_link_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
- RGWObjState& olh_state, const rgw_obj& obj_instance,
- bool delete_marker, const string& op_tag,
- struct rgw_bucket_dir_entry_meta *meta,
- uint64_t olh_epoch,
- real_time unmod_since, bool high_precision_time,
- rgw_zone_set *_zones_trace, bool log_data_change)
-{
- rgw_rados_ref ref;
- int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
- if (r < 0) {
- return r;
- }
-
- rgw_zone_set zones_trace;
- if (_zones_trace) {
- zones_trace = *_zones_trace;
- }
- zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());
-
- BucketShard bs(this);
-
- r = guard_reshard(dpp, &bs, obj_instance, bucket_info,
- [&](BucketShard *bs) -> int {
- cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
- auto& ref = bs->bucket_obj.get_ref();
- librados::ObjectWriteOperation op;
- op.assert_exists(); // bucket index shard must exist
- cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
- cls_rgw_bucket_link_olh(op, key, olh_state.olh_tag,
- delete_marker, op_tag, meta, olh_epoch,
- unmod_since, high_precision_time,
- svc.zone->get_zone().log_data, zones_trace);
- return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
- });
- if (r < 0) {
- ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_olh() returned r=" << r << dendl;
- return r;
- }
-
- add_datalog_entry(dpp, svc.datalog_rados, bucket_info, bs.shard_id);
-
- return 0;
-}
-
-void RGWRados::bucket_index_guard_olh_op(const DoutPrefixProvider *dpp, RGWObjState& olh_state, ObjectOperation& op)
-{
- ldpp_dout(dpp, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
- op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
-}
-
-int RGWRados::bucket_index_unlink_instance(const DoutPrefixProvider *dpp,
- RGWBucketInfo& bucket_info,
- const rgw_obj& obj_instance,
- const string& op_tag, const string& olh_tag,
- uint64_t olh_epoch, rgw_zone_set *_zones_trace)
-{
- rgw_rados_ref ref;
- int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
- if (r < 0) {
- return r;
- }
-
- rgw_zone_set zones_trace;
- if (_zones_trace) {
- zones_trace = *_zones_trace;
- }
- zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());
-
- BucketShard bs(this);
-
- cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
- r = guard_reshard(dpp, &bs, obj_instance, bucket_info,
- [&](BucketShard *bs) -> int {
- auto& ref = bs->bucket_obj.get_ref();
- librados::ObjectWriteOperation op;
- op.assert_exists(); // bucket index shard must exist
- cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
- cls_rgw_bucket_unlink_instance(op, key, op_tag,
- olh_tag, olh_epoch, svc.zone->get_zone().log_data, zones_trace);
- return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
- });
- if (r < 0) {
- ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_instance() returned r=" << r << dendl;
- return r;
- }
-
- return 0;
-}
-
-int RGWRados::bucket_index_read_olh_log(const DoutPrefixProvider *dpp,
- RGWBucketInfo& bucket_info, RGWObjState& state,
- const rgw_obj& obj_instance, uint64_t ver_marker,
- std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> > *log,
- bool *is_truncated)
-{
- rgw_rados_ref ref;
- int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
- if (r < 0) {
- return r;
- }
-
- BucketShard bs(this);
- int ret =
- bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp);
- if (ret < 0) {
- ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
- return ret;
- }
-
- string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
-
- cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
-
- auto& shard_ref = bs.bucket_obj.get_ref();
- ObjectReadOperation op;
-
- rgw_cls_read_olh_log_ret log_ret;
- int op_ret = 0;
- cls_rgw_get_olh_log(op, key, ver_marker, olh_tag, log_ret, op_ret);
- bufferlist outbl;
- r = rgw_rados_operate(dpp, shard_ref.pool.ioctx(), shard_ref.obj.oid, &op, &outbl, null_yield);
- if (r < 0) {
- return r;
- }
- if (op_ret < 0) {
- ldpp_dout(dpp, 20) << "cls_rgw_get_olh_log() returned op_ret=" << op_ret << dendl;
- return op_ret;
- }
-
- *log = std::move(log_ret.log);
- *is_truncated = log_ret.is_truncated;
-
- return 0;
-}
-
-// a multisite sync bug resulted in the OLH head attributes being overwritten by
-// the attributes from another zone, causing link_olh() to fail endlessly due to
-// olh_tag mismatch. this attempts to detect this case and reconstruct the OLH
-// attributes from the bucket index. see http://tracker.ceph.com/issues/37792
-int RGWRados::repair_olh(const DoutPrefixProvider *dpp, RGWObjState* state, const RGWBucketInfo& bucket_info,
- const rgw_obj& obj)
-{
- // fetch the current olh entry from the bucket index
- rgw_bucket_olh_entry olh;
- int r = bi_get_olh(dpp, bucket_info, obj, &olh);
- if (r < 0) {
- ldpp_dout(dpp, 0) << "repair_olh failed to read olh entry for " << obj << dendl;
- return r;
- }
- if (olh.tag == rgw_bl_str(state->olh_tag)) { // mismatch already resolved?
- return 0;
- }
-
- ldpp_dout(dpp, 4) << "repair_olh setting olh_tag=" << olh.tag
- << " key=" << olh.key << " delete_marker=" << olh.delete_marker << dendl;
-
- // rewrite OLH_ID_TAG and OLH_INFO from current olh
- ObjectWriteOperation op;
- // assert this is the same olh tag we think we're fixing
- bucket_index_guard_olh_op(dpp, *state, op);
- // preserve existing mtime
- struct timespec mtime_ts = ceph::real_clock::to_timespec(state->mtime);
- op.mtime2(&mtime_ts);
- {
- bufferlist bl;
- bl.append(olh.tag.c_str(), olh.tag.size());
- op.setxattr(RGW_ATTR_OLH_ID_TAG, bl);
- }
- {
- RGWOLHInfo info;
- info.target = rgw_obj(bucket_info.bucket, olh.key);
- info.removed = olh.delete_marker;
- bufferlist bl;
- encode(info, bl);
- op.setxattr(RGW_ATTR_OLH_INFO, bl);
- }
- rgw_rados_ref ref;
- r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
- if (r < 0) {
- return r;
- }
- r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
- if (r < 0) {
- ldpp_dout(dpp, 0) << "repair_olh failed to write olh attributes with "
- << cpp_strerror(r) << dendl;
- return r;
- }
- return 0;
-}
-
-int RGWRados::bucket_index_trim_olh_log(const DoutPrefixProvider *dpp,
- RGWBucketInfo& bucket_info,
- RGWObjState& state,
- const rgw_obj& obj_instance, uint64_t ver)
-{
- rgw_rados_ref ref;
- int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
- if (r < 0) {
- return r;
- }
-
- BucketShard bs(this);
- int ret =
- bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp);
- if (ret < 0) {
- ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
- return ret;
- }
-
- string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
-
- cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
-
- ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
- [&](BucketShard *pbs) -> int {
- ObjectWriteOperation op;
- op.assert_exists(); // bucket index shard must exist
- cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
- cls_rgw_trim_olh_log(op, key, ver, olh_tag);
- return pbs->bucket_obj.operate(dpp, &op, null_yield);
- });
- if (ret < 0) {
- ldpp_dout(dpp, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
- return ret;
- }
-
- return 0;
-}
-
-int RGWRados::bucket_index_clear_olh(const DoutPrefixProvider *dpp,
- RGWBucketInfo& bucket_info,
- RGWObjState& state,
- const rgw_obj& obj_instance)
-{
- rgw_rados_ref ref;
- int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
- if (r < 0) {
- return r;
- }
-
- BucketShard bs(this);
-
- string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
-
- cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
-
- int ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
- [&](BucketShard *pbs) -> int {
- ObjectWriteOperation op;
- op.assert_exists(); // bucket index shard must exist
- auto& ref = pbs->bucket_obj.get_ref();
- cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
- cls_rgw_clear_olh(op, key, olh_tag);
- return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
- });
- if (ret < 0) {
- ldpp_dout(dpp, 5) << "rgw_rados_operate() after cls_rgw_clear_olh() returned ret=" << ret << dendl;
- return ret;
- }
-
- return 0;
-}
-
-static int decode_olh_info(const DoutPrefixProvider *dpp, CephContext* cct, const bufferlist& bl, RGWOLHInfo *olh)
-{
- try {
- auto biter = bl.cbegin();
- decode(*olh, biter);
- return 0;
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 0) << "ERROR: failed to decode olh info" << dendl;
- return -EIO;
- }
-}
-
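-// replay accumulated olh log entries onto the olh head object: remove
-// unlinked instances, record the winning key in RGW_ATTR_OLH_INFO, bump
-// RGW_ATTR_OLH_VER, drop the matching pending xattrs, trim the bucket
-// index log, and remove the head object entirely if the final entry
-// unlinked it; all guarded by cmpxattr on the olh tag so racing
-// appliers fail with ECANCELED instead of clobbering each other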
-int RGWRados::apply_olh_log(const DoutPrefixProvider *dpp,
- RGWObjState& state,
- RGWBucketInfo& bucket_info,
- const rgw::sal::Object* obj,
- bufferlist& olh_tag,
- std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> >& log,
- uint64_t *plast_ver,
- rgw_zone_set* zones_trace)
-{
- if (log.empty()) {
- return 0;
- }
-
- librados::ObjectWriteOperation op;
-
- uint64_t last_ver = log.rbegin()->first;
- *plast_ver = last_ver;
-
- map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
-
- op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
- op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GTE, last_ver);
-
- bufferlist ver_bl;
- string last_ver_s = to_string(last_ver);
- ver_bl.append(last_ver_s.c_str(), last_ver_s.size());
- op.setxattr(RGW_ATTR_OLH_VER, ver_bl);
-
- struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
- op.mtime2(&mtime_ts);
-
- bool need_to_link = false;
- uint64_t link_epoch = 0;
- cls_rgw_obj_key key;
- bool delete_marker = false;
- list<cls_rgw_obj_key> remove_instances;
- bool need_to_remove = false;
-
- // decode current epoch and instance
- auto olh_ver = state.attrset.find(RGW_ATTR_OLH_VER);
- if (olh_ver != state.attrset.end()) {
- std::string str = olh_ver->second.to_str();
- std::string err;
- link_epoch = strict_strtoll(str.c_str(), 10, &err);
- }
- auto olh_info = state.attrset.find(RGW_ATTR_OLH_INFO);
- if (olh_info != state.attrset.end()) {
- RGWOLHInfo info;
- int r = decode_olh_info(dpp, cct, olh_info->second, &info);
- if (r < 0) {
- return r;
- }
- info.target.key.get_index_key(&key);
- delete_marker = info.removed;
- }
-
- for (iter = log.begin(); iter != log.end(); ++iter) {
- vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
- for (; viter != iter->second.end(); ++viter) {
- rgw_bucket_olh_log_entry& entry = *viter;
-
- ldpp_dout(dpp, 20) << "olh_log_entry: epoch=" << iter->first << " op=" << (int)entry.op
- << " key=" << entry.key.name << "[" << entry.key.instance << "] "
- << (entry.delete_marker ? "(delete)" : "") << dendl;
- switch (entry.op) {
- case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
- remove_instances.push_back(entry.key);
- break;
- case CLS_RGW_OLH_OP_LINK_OLH:
- // only overwrite a link of the same epoch if its key sorts before
- if (link_epoch < iter->first || key.instance.empty() ||
- key.instance > entry.key.instance) {
- ldpp_dout(dpp, 20) << "apply_olh_log applying key=" << entry.key << " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
- << " over current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
- need_to_link = true;
- need_to_remove = false;
- key = entry.key;
- delete_marker = entry.delete_marker;
- } else {
- ldpp_dout(dpp, 20) << "apply_olh skipping key=" << entry.key<< " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
- << " before current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
- }
- break;
- case CLS_RGW_OLH_OP_UNLINK_OLH:
- need_to_remove = true;
- need_to_link = false;
- break;
- default:
- ldpp_dout(dpp, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
- return -EIO;
- }
- string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
- attr_name.append(entry.op_tag);
- op.rmxattr(attr_name.c_str());
- }
- }
-
- rgw_rados_ref ref;
- int r = get_obj_head_ref(dpp, bucket_info, obj->get_obj(), &ref);
- if (r < 0) {
- return r;
- }
-
- rgw::sal::Bucket* bucket = obj->get_bucket();
-
- if (need_to_link) {
- rgw_obj target(bucket->get_key(), key);
- RGWOLHInfo info;
- info.target = target;
- info.removed = delete_marker;
- bufferlist bl;
- encode(info, bl);
- op.setxattr(RGW_ATTR_OLH_INFO, bl);
- }
-
- /* first remove object instances */
- for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
- liter != remove_instances.end(); ++liter) {
- cls_rgw_obj_key& key = *liter;
- std::unique_ptr<rgw::sal::Object> obj_instance = bucket->get_object(key);
- int ret = delete_obj(dpp, bucket_info, obj_instance.get(), 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
- if (ret < 0 && ret != -ENOENT) {
- ldpp_dout(dpp, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
- return ret;
- }
- }
-
- /* update olh object */
- r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
- if (r < 0) {
- ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
- return r;
- }
-
- r = bucket_index_trim_olh_log(dpp, bucket_info, state, obj->get_obj(), last_ver);
- if (r < 0) {
- ldpp_dout(dpp, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
- return r;
- }
-
- if (need_to_remove) {
- ObjectWriteOperation rm_op;
-
- rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
- rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_EQ, last_ver);
- cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
- rm_op.remove();
-
- r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &rm_op, null_yield);
- if (r == -ECANCELED) {
- return 0; /* someone else won this race */
- } else {
- /*
- * only clear if was successful, otherwise we might clobber pending operations on this object
- */
- r = bucket_index_clear_olh(dpp, bucket_info, state, obj->get_obj());
- if (r < 0) {
- ldpp_dout(dpp, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
- return r;
- }
- }
- }
-
- return 0;
-}
-
-/*
- * read olh log and apply it
- */
-int RGWRados::update_olh(const DoutPrefixProvider *dpp, RGWObjState *state, RGWBucketInfo& bucket_info, const rgw::sal::Object* obj, rgw_zone_set *zones_trace)
-{
- map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
- bool is_truncated;
- uint64_t ver_marker = 0;
-
- do {
- int ret = bucket_index_read_olh_log(dpp, bucket_info, *state, obj->get_obj(), ver_marker, &log, &is_truncated);
- if (ret < 0) {
- return ret;
- }
- ret = apply_olh_log(dpp, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
- if (ret < 0) {
- return ret;
- }
- } while (is_truncated);
-
- return 0;
-}
-
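-// link a specific object instance (or delete marker) as the current
-// version of the olh; races with other writers surface as ECANCELED
-// and are retried up to MAX_ECANCELED_RETRY times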
-int RGWRados::set_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx,
- RGWBucketInfo& bucket_info,
- rgw::sal::Object* target_obj, bool delete_marker,
- rgw_bucket_dir_entry_meta *meta,
- uint64_t olh_epoch, real_time unmod_since, bool high_precision_time,
- optional_yield y, rgw_zone_set *zones_trace, bool log_data_change)
-{
- string op_tag;
-
- std::unique_ptr<rgw::sal::Object> olh_obj = target_obj->clone();
- olh_obj->clear_instance();
-
- RGWObjState *state = NULL;
- RGWObjManifest *manifest = nullptr;
-
- int ret = 0;
- int i;
-
-#define MAX_ECANCELED_RETRY 100
- for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
- if (ret == -ECANCELED) {
- olh_obj->invalidate();
- }
-
- ret = get_obj_state(dpp, &obj_ctx, bucket_info, olh_obj.get(), &state, &manifest, false, y); /* don't follow olh */
- if (ret < 0) {
- return ret;
- }
-
- ret = olh_init_modification(dpp, bucket_info, *state, olh_obj->get_obj(), &op_tag);
- if (ret < 0) {
- ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
- if (ret == -ECANCELED) {
- continue;
- }
- return ret;
- }
- ret = bucket_index_link_olh(dpp, bucket_info, *state, target_obj->get_obj(),
- delete_marker, op_tag, meta, olh_epoch, unmod_since,
- high_precision_time, zones_trace, log_data_change);
- if (ret < 0) {
- ldpp_dout(dpp, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
- if (ret == -ECANCELED) {
- // the bucket index rejected the link_olh() due to olh tag mismatch;
- // attempt to reconstruct olh head attributes based on the bucket index
- int r2 = repair_olh(dpp, state, bucket_info, olh_obj->get_obj());
- if (r2 < 0 && r2 != -ECANCELED) {
- return r2;
- }
- continue;
- }
- return ret;
- }
- break;
- }
-
- if (i == MAX_ECANCELED_RETRY) {
- ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
- return -EIO;
- }
-
- ret = update_olh(dpp, state, bucket_info, olh_obj.get());
- if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
- ret = 0;
- }
- if (ret < 0) {
- ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
- return ret;
- }
-
- return 0;
-}
-
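-// remove one object instance from its olh by writing an UNLINK entry to
-// the olh log, using the same ECANCELED retry dance as set_olh()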
-int RGWRados::unlink_obj_instance(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, rgw::sal::Object* target_obj,
- uint64_t olh_epoch, optional_yield y, rgw_zone_set *zones_trace)
-{
- string op_tag;
-
- std::unique_ptr<rgw::sal::Object> olh_obj = target_obj->clone();
- olh_obj->clear_instance();
-
- RGWObjState *state = NULL;
-
- int ret = 0;
- int i;
-
- for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
- if (ret == -ECANCELED) {
- olh_obj->invalidate();
- }
-
- ret = olh_obj->get_obj_state(dpp, &state, y, false); /* don't follow olh */
- if (ret < 0)
- return ret;
-
- ret = olh_init_modification(dpp, bucket_info, *state, olh_obj->get_obj(), &op_tag);
- if (ret < 0) {
- ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
- if (ret == -ECANCELED) {
- continue;
- }
- return ret;
- }
-
- string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
-
- ret = bucket_index_unlink_instance(dpp, bucket_info, target_obj->get_obj(), op_tag, olh_tag, olh_epoch, zones_trace);
- if (ret < 0) {
- ldpp_dout(dpp, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
- if (ret == -ECANCELED) {
- continue;
- }
- return ret;
- }
- break;
- }
-
- if (i == MAX_ECANCELED_RETRY) {
- ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
- return -EIO;
- }
-
- ret = update_olh(dpp, state, bucket_info, olh_obj.get(), zones_trace);
- if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
- return 0;
- }
- if (ret < 0) {
- ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
- return ret;
- }
-
- return 0;
-}
-
-void RGWRados::gen_rand_obj_instance_name(rgw_obj_key *target_key)
-{
-#define OBJ_INSTANCE_LEN 32
- char buf[OBJ_INSTANCE_LEN + 1];
-
- gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
- no underscore for instance name due to the way we encode the raw keys */
-
- target_key->set_instance(buf);
-}
-
-void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
-{
- gen_rand_obj_instance_name(&target_obj->key);
-}
-
-int RGWRados::get_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
-{
- map<string, bufferlist> attrset;
-
- ObjectReadOperation op;
- op.getxattrs(&attrset, NULL);
-
- int r = obj_operate(dpp, bucket_info, obj, &op);
- if (r < 0) {
- return r;
- }
-
- auto iter = attrset.find(RGW_ATTR_OLH_INFO);
- if (iter == attrset.end()) { /* not an olh */
- return -EINVAL;
- }
-
- return decode_olh_info(dpp, cct, iter->second, olh);
-}
-
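-// split pending olh xattrs into those that have outlived
-// rgw_olh_pending_timeout_sec (moved into *rm_pending_entries) and those
-// still within the window; entry names sort by time, so we can stop at
-// the first young one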
-void RGWRados::check_pending_olh_entries(const DoutPrefixProvider *dpp,
- map<string, bufferlist>& pending_entries,
- map<string, bufferlist> *rm_pending_entries)
-{
- map<string, bufferlist>::iterator iter = pending_entries.begin();
-
- real_time now = real_clock::now();
-
- while (iter != pending_entries.end()) {
- auto biter = iter->second.cbegin();
- RGWOLHPendingInfo pending_info;
- try {
- decode(pending_info, biter);
- } catch (buffer::error& err) {
- /* skipping bad entry, we could remove it but it might hide a bug */
- ldpp_dout(dpp, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
- ++iter;
- continue;
- }
-
- map<string, bufferlist>::iterator cur_iter = iter;
- ++iter;
- if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
- (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
- pending_entries.erase(cur_iter);
- } else {
-      /* entry names are sorted by time (rounded to a second) */
- break;
- }
- }
-}
-
-int RGWRados::remove_olh_pending_entries(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
-{
- rgw_rados_ref ref;
- int r = get_obj_head_ref(dpp, bucket_info, olh_obj, &ref);
- if (r < 0) {
- return r;
- }
-
- // trim no more than 1000 entries per osd op
- constexpr int max_entries = 1000;
-
- auto i = pending_attrs.begin();
- while (i != pending_attrs.end()) {
- ObjectWriteOperation op;
- bucket_index_guard_olh_op(dpp, state, op);
-
- for (int n = 0; n < max_entries && i != pending_attrs.end(); ++n, ++i) {
- op.rmxattr(i->first.c_str());
- }
-
- r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
- if (r == -ENOENT || r == -ECANCELED) {
- /* raced with some other change, shouldn't sweat about it */
- return 0;
- }
- if (r < 0) {
- ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
- return r;
- }
- }
- return 0;
-}
-
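-// resolve an olh to the rgw_obj it currently points at: expired pending
-// xattrs are cleaned up, any remaining pending entries force an olh-log
-// replay first, and -ENOENT is returned when the current version is a
-// delete marker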
-int RGWRados::follow_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, rgw::sal::Object* olh_obj, rgw_obj *target)
-{
- map<string, bufferlist> pending_entries;
- rgw_filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
-
- map<string, bufferlist> rm_pending_entries;
- check_pending_olh_entries(dpp, pending_entries, &rm_pending_entries);
-
- if (!rm_pending_entries.empty()) {
- int ret = remove_olh_pending_entries(dpp, bucket_info, *state, olh_obj->get_obj(), rm_pending_entries);
- if (ret < 0) {
- ldpp_dout(dpp, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
- return ret;
- }
- }
- if (!pending_entries.empty()) {
- ldpp_dout(dpp, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj->get_bucket() << dendl;
-
- int ret = update_olh(dpp, state, bucket_info, olh_obj);
- if (ret < 0) {
- if (ret == -ECANCELED) {
- // In this context, ECANCELED means that the OLH tag changed in either the bucket index entry or the OLH object.
- // If the OLH tag changed, it indicates that a previous OLH entry was removed since this request started. We
- // return ENOENT to indicate that the OLH object was removed.
- ret = -ENOENT;
- }
- return ret;
- }
- }
-
- auto iter = state->attrset.find(RGW_ATTR_OLH_INFO);
- if (iter == state->attrset.end()) {
- return -EINVAL;
- }
-
- RGWOLHInfo olh;
- int ret = decode_olh_info(dpp, cct, iter->second, &olh);
- if (ret < 0) {
- return ret;
- }
-
- if (olh.removed) {
- return -ENOENT;
- }
-
- *target = olh.target;
-
- return 0;
-}
-
-int RGWRados::raw_obj_stat(const DoutPrefixProvider *dpp,
- rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
- map<string, bufferlist> *attrs, bufferlist *first_chunk,
- RGWObjVersionTracker *objv_tracker, optional_yield y)
-{
- rgw_rados_ref ref;
- int r = get_raw_obj_ref(dpp, obj, &ref);
- if (r < 0) {
- return r;
- }
-
- map<string, bufferlist> unfiltered_attrset;
- uint64_t size = 0;
- struct timespec mtime_ts;
-
- ObjectReadOperation op;
- if (objv_tracker) {
- objv_tracker->prepare_op_for_read(&op);
- }
- if (attrs) {
- op.getxattrs(&unfiltered_attrset, NULL);
- }
- if (psize || pmtime) {
- op.stat2(&size, &mtime_ts, NULL);
- }
- if (first_chunk) {
- op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
- }
- bufferlist outbl;
- r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, &outbl, y);
-
- if (epoch) {
- *epoch = ref.pool.ioctx().get_last_version();
- }
-
- if (r < 0)
- return r;
-
- if (psize)
- *psize = size;
- if (pmtime)
- *pmtime = ceph::real_clock::from_timespec(mtime_ts);
- if (attrs) {
- rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
- }
-
- return 0;
-}
-
-int RGWRados::get_bucket_stats(const DoutPrefixProvider *dpp,
- RGWBucketInfo& bucket_info,
- const rgw::bucket_index_layout_generation& idx_layout,
- int shard_id, string *bucket_ver, string *master_ver,
- map<RGWObjCategory, RGWStorageStats>& stats,
- string *max_marker, bool *syncstopped)
-{
- vector<rgw_bucket_dir_header> headers;
- map<int, string> bucket_instance_ids;
- int r = cls_bucket_head(dpp, bucket_info, idx_layout, shard_id, headers, &bucket_instance_ids);
- if (r < 0) {
- return r;
- }
-
- ceph_assert(headers.size() == bucket_instance_ids.size());
-
- auto iter = headers.begin();
- map<int, string>::iterator viter = bucket_instance_ids.begin();
- BucketIndexShardsManager ver_mgr;
- BucketIndexShardsManager master_ver_mgr;
- BucketIndexShardsManager marker_mgr;
- char buf[64];
- for(; iter != headers.end(); ++iter, ++viter) {
- accumulate_raw_stats(*iter, stats);
- snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->ver);
- ver_mgr.add(viter->first, string(buf));
- snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->master_ver);
- master_ver_mgr.add(viter->first, string(buf));
- if (shard_id >= 0) {
- *max_marker = iter->max_marker;
- } else {
- marker_mgr.add(viter->first, iter->max_marker);
- }
- if (syncstopped != NULL)
- *syncstopped = iter->syncstopped;
- }
- ver_mgr.to_string(bucket_ver);
- master_ver_mgr.to_string(master_ver);
- if (shard_id < 0) {
- marker_mgr.to_string(max_marker);
- }
- return 0;
-}
-
-class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
- RGWGetBucketStats_CB *cb;
- uint32_t pendings;
- map<RGWObjCategory, RGWStorageStats> stats;
- int ret_code;
- bool should_cb;
- ceph::mutex lock = ceph::make_mutex("RGWGetBucketStatsContext");
-
-public:
- RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
- : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true)
- {}
-
- void handle_response(int r, rgw_bucket_dir_header& header) override {
- std::lock_guard l{lock};
- if (should_cb) {
- if ( r >= 0) {
- accumulate_raw_stats(header, stats);
- } else {
- ret_code = r;
- }
-
- // Are we all done?
- if (--pendings == 0) {
- if (!ret_code) {
- cb->set_response(&stats);
- }
- cb->handle_response(ret_code);
- cb->put();
- }
- }
- }
-
- void unset_cb() {
- std::lock_guard l{lock};
- should_cb = false;
- }
-};
-
-int RGWRados::get_bucket_stats_async(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetBucketStats_CB *ctx)
-{
- int num_aio = 0;
- RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.layout.current_index.layout.normal.num_shards ? : 1);
- ceph_assert(get_ctx);
- int r = cls_bucket_head_async(dpp, bucket_info, idx_layout, shard_id, get_ctx, &num_aio);
- if (r < 0) {
- ctx->put();
- if (num_aio) {
- get_ctx->unset_cb();
- }
- }
- get_ctx->put();
- return r;
-}
-
-int RGWRados::get_bucket_instance_info(const string& meta_key,
- RGWBucketInfo& info,
- real_time *pmtime,
- map<string, bufferlist> *pattrs,
- optional_yield y,
- const DoutPrefixProvider *dpp)
-{
- rgw_bucket bucket;
- rgw_bucket_parse_bucket_key(cct, meta_key, &bucket, nullptr);
-
- return get_bucket_instance_info(bucket, info, pmtime, pattrs, y, dpp);
-}
-
-int RGWRados::get_bucket_instance_info(const rgw_bucket& bucket, RGWBucketInfo& info,
- real_time *pmtime, map<string, bufferlist> *pattrs, optional_yield y,
- const DoutPrefixProvider *dpp)
-{
- return ctl.bucket->read_bucket_instance_info(bucket, &info,
- y,
- dpp,
- RGWBucketCtl::BucketInstance::GetParams()
- .set_mtime(pmtime)
- .set_attrs(pattrs));
-}
-
-int RGWRados::get_bucket_info(RGWServices *svc,
- const string& tenant, const string& bucket_name,
- RGWBucketInfo& info,
- real_time *pmtime,
- optional_yield y,
- const DoutPrefixProvider *dpp, map<string, bufferlist> *pattrs)
-{
- rgw_bucket bucket;
- bucket.tenant = tenant;
- bucket.name = bucket_name;
- return ctl.bucket->read_bucket_info(bucket, &info, y, dpp,
- RGWBucketCtl::BucketInstance::GetParams()
- .set_mtime(pmtime)
- .set_attrs(pattrs));
-}
-
-int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info,
- ceph::real_time *pmtime,
- const DoutPrefixProvider *dpp,
- map<string, bufferlist> *pattrs)
-{
- rgw_bucket bucket = info.bucket;
- bucket.bucket_id.clear();
-
- auto rv = info.objv_tracker.read_version;
-
- return ctl.bucket->read_bucket_info(bucket, &info, null_yield, dpp,
- RGWBucketCtl::BucketInstance::GetParams()
- .set_mtime(pmtime)
- .set_attrs(pattrs)
- .set_refresh_version(rv));
-}
-
-int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
- real_time mtime, map<string, bufferlist> *pattrs,
- const DoutPrefixProvider *dpp)
-{
- return ctl.bucket->store_bucket_instance_info(info.bucket, info, null_yield, dpp,
- RGWBucketCtl::BucketInstance::PutParams()
- .set_exclusive(exclusive)
- .set_mtime(mtime)
- .set_attrs(pattrs));
-}
-
-int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
- map<string, bufferlist> *pattrs, bool create_entry_point,
- const DoutPrefixProvider *dpp)
-{
- bool create_head = !info.has_instance_obj || create_entry_point;
-
- int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs, dpp);
- if (ret < 0) {
- return ret;
- }
-
- if (!create_head)
- return 0; /* done! */
-
- RGWBucketEntryPoint entry_point;
- entry_point.bucket = info.bucket;
- entry_point.owner = info.owner;
- entry_point.creation_time = info.creation_time;
- entry_point.linked = true;
- RGWObjVersionTracker ot;
- if (pep_objv && !pep_objv->tag.empty()) {
- ot.write_version = *pep_objv;
- } else {
- ot.generate_new_write_ver(cct);
- if (pep_objv) {
- *pep_objv = ot.write_version;
- }
- }
- ret = ctl.bucket->store_bucket_entrypoint_info(info.bucket, entry_point, null_yield, dpp, RGWBucketCtl::Bucket::PutParams()
- .set_exclusive(exclusive)
- .set_objv_tracker(&ot)
- .set_mtime(mtime));
- if (ret < 0)
- return ret;
-
- return 0;
-}
-
-int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m, const DoutPrefixProvider *dpp)
-{
- map<string, RGWBucketEnt>::iterator iter;
- for (iter = m.begin(); iter != m.end(); ++iter) {
- RGWBucketEnt& ent = iter->second;
- rgw_bucket& bucket = ent.bucket;
- ent.count = 0;
- ent.size = 0;
- ent.size_rounded = 0;
-
- vector<rgw_bucket_dir_header> headers;
-
- RGWBucketInfo bucket_info;
- int ret = get_bucket_instance_info(bucket, bucket_info, NULL, NULL, null_yield, dpp);
- if (ret < 0) {
- return ret;
- }
-
- int r = cls_bucket_head(dpp, bucket_info, bucket_info.layout.current_index, RGW_NO_SHARD, headers);
- if (r < 0)
- return r;
-
- auto hiter = headers.begin();
- for (; hiter != headers.end(); ++hiter) {
- RGWObjCategory category = main_category;
- auto iter = (hiter->stats).find(category);
- if (iter != hiter->stats.end()) {
- struct rgw_bucket_category_stats& stats = iter->second;
- ent.count += stats.num_entries;
- ent.size += stats.total_size;
- ent.size_rounded += stats.total_size_rounded;
- }
- }
-
- // fill in placement_rule from the bucket instance for use in swift's
- // per-storage policy statistics
- ent.placement_rule = std::move(bucket_info.placement_rule);
- }
-
- return m.size();
-}
-
-int RGWRados::append_async(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, size_t size, bufferlist& bl)
-{
- rgw_rados_ref ref;
- int r = get_raw_obj_ref(dpp, obj, &ref);
- if (r < 0) {
- return r;
- }
- librados::Rados *rad = get_rados_handle();
- librados::AioCompletion *completion = rad->aio_create_completion(nullptr, nullptr);
-
- r = ref.pool.ioctx().aio_append(ref.obj.oid, completion, bl, size);
- completion->release();
- return r;
-}
-
-int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, RGWPoolIterCtx& ctx)
-{
- librados::IoCtx& io_ctx = ctx.io_ctx;
- librados::NObjectIterator& iter = ctx.iter;
-
- int r = open_pool_ctx(dpp, pool, io_ctx, false);
- if (r < 0)
- return r;
-
- iter = io_ctx.nobjects_begin();
-
- return 0;
-}
-
-int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
-{
- librados::IoCtx& io_ctx = ctx.io_ctx;
- librados::NObjectIterator& iter = ctx.iter;
-
- int r = open_pool_ctx(dpp, pool, io_ctx, false);
- if (r < 0)
- return r;
-
- librados::ObjectCursor oc;
- if (!oc.from_str(cursor)) {
- ldpp_dout(dpp, 10) << "failed to parse cursor: " << cursor << dendl;
- return -EINVAL;
- }
-
- try {
- iter = io_ctx.nobjects_begin(oc);
- return 0;
- } catch (const std::system_error& e) {
- r = -e.code().value();
- ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
- << ", returning " << r << dendl;
- return r;
- } catch (const std::exception& e) {
- ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
- << ", returning -5" << dendl;
- return -EIO;
- }
-}
-
-string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
-{
- return ctx.iter.get_cursor().to_str();
-}
-
-static int do_pool_iterate(const DoutPrefixProvider *dpp, CephContext* cct, RGWPoolIterCtx& ctx, uint32_t num,
- vector<rgw_bucket_dir_entry>& objs,
- bool *is_truncated, RGWAccessListFilter *filter)
-{
- librados::IoCtx& io_ctx = ctx.io_ctx;
- librados::NObjectIterator& iter = ctx.iter;
-
- if (iter == io_ctx.nobjects_end())
- return -ENOENT;
-
- uint32_t i;
-
- for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
- rgw_bucket_dir_entry e;
-
- string oid = iter->get_oid();
- ldpp_dout(dpp, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
-
- // fill it in with initial values; we may correct later
- if (filter && !filter->filter(oid, oid))
- continue;
-
- e.key = oid;
- objs.push_back(e);
- }
-
- if (is_truncated)
- *is_truncated = (iter != io_ctx.nobjects_end());
-
- return objs.size();
-}
-
-int RGWRados::pool_iterate(const DoutPrefixProvider *dpp, RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
- bool *is_truncated, RGWAccessListFilter *filter)
-{
- // catch exceptions from NObjectIterator::operator++()
- try {
- return do_pool_iterate(dpp, cct, ctx, num, objs, is_truncated, filter);
- } catch (const std::system_error& e) {
- int r = -e.code().value();
- ldpp_dout(dpp, 10) << "NObjectIterator threw exception " << e.what()
- << ", returning " << r << dendl;
- return r;
- } catch (const std::exception& e) {
- ldpp_dout(dpp, 10) << "NObjectIterator threw exception " << e.what()
- << ", returning -5" << dendl;
- return -EIO;
- }
-}
-
-int RGWRados::list_raw_objects_init(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
-{
- if (!ctx->initialized) {
- int r = pool_iterate_begin(dpp, pool, marker, ctx->iter_ctx);
- if (r < 0) {
- ldpp_dout(dpp, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
- return r;
- }
- ctx->initialized = true;
- }
- return 0;
-}
-
-int RGWRados::list_raw_objects_next(const DoutPrefixProvider *dpp, const string& prefix_filter, int max,
- RGWListRawObjsCtx& ctx, list<string>& oids,
- bool *is_truncated)
-{
- if (!ctx.initialized) {
- return -EINVAL;
- }
- RGWAccessListFilterPrefix filter(prefix_filter);
- vector<rgw_bucket_dir_entry> objs;
- int r = pool_iterate(dpp, ctx.iter_ctx, max, objs, is_truncated, &filter);
- if (r < 0) {
-    if (r != -ENOENT)
- ldpp_dout(dpp, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
- return r;
- }
-
- vector<rgw_bucket_dir_entry>::iterator iter;
- for (iter = objs.begin(); iter != objs.end(); ++iter) {
- oids.push_back(iter->key.name);
- }
-
- return oids.size();
-}
-
-int RGWRados::list_raw_objects(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& prefix_filter,
- int max, RGWListRawObjsCtx& ctx, list<string>& oids,
- bool *is_truncated)
-{
- if (!ctx.initialized) {
- int r = list_raw_objects_init(dpp, pool, string(), &ctx);
- if (r < 0) {
- return r;
- }
- }
-
- return list_raw_objects_next(dpp, prefix_filter, max, ctx, oids, is_truncated);
-}
-
-string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
-{
- return pool_iterate_get_cursor(ctx.iter_ctx);
-}
-
-int RGWRados::bi_get_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
- rgw_bucket_dir_entry *dirent)
-{
- rgw_cls_bi_entry bi_entry;
- int r = bi_get(dpp, bucket_info, obj, BIIndexType::Instance, &bi_entry);
- if (r < 0 && r != -ENOENT) {
- ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl;
- }
- if (r < 0) {
- return r;
- }
- auto iter = bi_entry.data.cbegin();
- try {
- decode(*dirent, iter);
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl;
- return -EIO;
- }
-
- return 0;
-}
-
-int RGWRados::bi_get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
- rgw_bucket_olh_entry *olh)
-{
- rgw_cls_bi_entry bi_entry;
- int r = bi_get(dpp, bucket_info, obj, BIIndexType::OLH, &bi_entry);
- if (r < 0 && r != -ENOENT) {
- ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl;
- }
- if (r < 0) {
- return r;
- }
- auto iter = bi_entry.data.cbegin();
- try {
- decode(*olh, iter);
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl;
- return -EIO;
- }
-
- return 0;
-}
-
-int RGWRados::bi_get(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
- BIIndexType index_type, rgw_cls_bi_entry *entry)
-{
- BucketShard bs(this);
- int ret = bs.init(dpp, bucket_info, obj);
- if (ret < 0) {
- ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
- return ret;
- }
-
- cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
-
- auto& ref = bs.bucket_obj.get_ref();
-
- return cls_rgw_bi_get(ref.pool.ioctx(), ref.obj.oid, index_type, key, entry);
-}
-
-void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
-{
- auto& ref = bs.bucket_obj.get_ref();
- cls_rgw_bi_put(op, ref.obj.oid, entry);
-}
-
-int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
-{
- auto& ref = bs.bucket_obj.get_ref();
- int ret = cls_rgw_bi_put(ref.pool.ioctx(), ref.obj.oid, entry);
- if (ret < 0)
- return ret;
-
- return 0;
-}
-
-int RGWRados::bi_put(const DoutPrefixProvider *dpp, rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
-{
- // make sure incomplete multipart uploads are hashed correctly
- if (obj.key.ns == RGW_OBJ_NS_MULTIPART) {
- RGWMPObj mp;
- mp.from_meta(obj.key.name);
- obj.index_hash_source = mp.get_key();
- }
- BucketShard bs(this);
-
- int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp);
- if (ret < 0) {
- ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
- return ret;
- }
-
- return bi_put(bs, entry);
-}
-
-int RGWRados::bi_list(const DoutPrefixProvider *dpp, rgw_bucket& bucket,
- const string& obj_name_filter, const string& marker, uint32_t max,
- list<rgw_cls_bi_entry> *entries, bool *is_truncated)
-{
- rgw_obj obj(bucket, obj_name_filter);
- BucketShard bs(this);
- int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp);
- if (ret < 0) {
- ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
- return ret;
- }
-
- auto& ref = bs.bucket_obj.get_ref();
- ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name_filter, marker, max, entries, is_truncated);
- if (ret == -ENOENT) {
- *is_truncated = false;
- }
- if (ret < 0)
- return ret;
-
- return 0;
-}
-
-int RGWRados::bi_list(BucketShard& bs, const string& obj_name_filter, const string& marker, uint32_t max,
- list<rgw_cls_bi_entry> *entries, bool *is_truncated)
-{
- auto& ref = bs.bucket_obj.get_ref();
- int ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name_filter, marker, max, entries, is_truncated);
- if (ret < 0)
- return ret;
-
- return 0;
-}
-
-int RGWRados::bi_list(const DoutPrefixProvider *dpp,
- const RGWBucketInfo& bucket_info, int shard_id, const string& obj_name_filter, const string& marker, uint32_t max,
- list<rgw_cls_bi_entry> *entries, bool *is_truncated)
-{
- BucketShard bs(this);
- int ret = bs.init(dpp, bucket_info,
- bucket_info.layout.current_index,
- shard_id);
- if (ret < 0) {
- ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
- return ret;
- }
-
- return bi_list(bs, obj_name_filter, marker, max, entries, is_truncated);
-}
-
-int RGWRados::bi_remove(const DoutPrefixProvider *dpp, BucketShard& bs)
-{
- auto& ref = bs.bucket_obj.get_ref();
- int ret = ref.pool.ioctx().remove(ref.obj.oid);
- if (ret == -ENOENT) {
- ret = 0;
- }
- if (ret < 0) {
- ldpp_dout(dpp, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
- return ret;
- }
-
- return 0;
-}
-
-int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectWriteOperation *op)
-{
- return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, null_yield);
-}
-
-int RGWRados::gc_aio_operate(const string& oid, librados::AioCompletion *c,
- librados::ObjectWriteOperation *op)
-{
- return gc_pool_ctx.aio_operate(oid, c, op);
-}
-
-int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
-{
- return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, pbl, null_yield);
-}
-
-int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue)
-{
- return gc->list(index, marker, max, expired_only, result, truncated, processing_queue);
-}
-
-int RGWRados::process_gc(bool expired_only)
-{
- return gc->process(expired_only);
-}
-
-int RGWRados::list_lc_progress(string& marker, uint32_t max_entries,
- vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& progress_map,
- int& index)
-{
- return lc->list_lc_progress(marker, max_entries, progress_map, index);
-}
-
-int RGWRados::process_lc(const std::unique_ptr<rgw::sal::Bucket>& optional_bucket)
-{
- RGWLC lc;
- lc.initialize(cct, this->driver);
- RGWLC::LCWorker worker(&lc, cct, &lc, 0);
- auto ret = lc.process(&worker, optional_bucket, true /* once */);
- lc.stop_processor(); // sets down_flag, but returns immediately
- return ret;
-}
-
-bool RGWRados::process_expire_objects(const DoutPrefixProvider *dpp)
-{
- return obj_expirer->inspect_all_shards(dpp, utime_t(), ceph_clock_now());
-}
-
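-// bucket index updates are two-phase: prepare_op marks the index entry
-// pending under `tag` before the head object is written, and a matching
-// complete_op (add/del/cancel) resolves it afterwards. A minimal caller
-// sketch (arguments abbreviated):
-//   r = cls_obj_prepare_op(dpp, bs, CLS_RGW_OP_ADD, tag, obj, flags, y, nullptr);
-//   ... write the head object ...
-//   r = cls_obj_complete_add(bs, obj, tag, pool, epoch, ent, category, nullptr, flags, nullptr);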
-int RGWRados::cls_obj_prepare_op(const DoutPrefixProvider *dpp, BucketShard& bs, RGWModifyOp op, string& tag,
- rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *_zones_trace)
-{
- const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
- ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket-shard=" << bs << " obj=" << obj << " tag=" << tag << " op=" << op << dendl_bitx;
- ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
-
- rgw_zone_set zones_trace;
- if (_zones_trace) {
- zones_trace = *_zones_trace;
- }
- zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());
-
- ObjectWriteOperation o;
- o.assert_exists(); // bucket index shard must exist
-
- cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
- cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
- cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), svc.zone->get_zone().log_data, bilog_flags, zones_trace);
- int ret = bs.bucket_obj.operate(dpp, &o, y);
- ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << ": ret=" << ret << dendl_bitx;
- return ret;
-}
-
-int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
- int64_t pool, uint64_t epoch,
- rgw_bucket_dir_entry& ent, RGWObjCategory category,
- list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
-{
- const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
- ldout_bitx_c(bitx, cct, 10) << "ENTERING " << __func__ << ": bucket-shard=" << bs <<
- " obj=" << obj << " tag=" << tag << " op=" << op <<
- ", remove_objs=" << (remove_objs ? *remove_objs : std::list<rgw_obj_index_key>()) << dendl_bitx;
- ldout_bitx_c(bitx, cct, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
-
- ObjectWriteOperation o;
- o.assert_exists(); // bucket index shard must exist
-
- rgw_bucket_dir_entry_meta dir_meta;
- dir_meta = ent.meta;
- dir_meta.category = category;
-
- rgw_zone_set zones_trace;
- if (_zones_trace) {
- zones_trace = *_zones_trace;
- }
- zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());
-
- rgw_bucket_entry_ver ver;
- ver.pool = pool;
- ver.epoch = epoch;
- cls_rgw_obj_key key(ent.key.name, ent.key.instance);
- cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
- cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
- svc.zone->get_zone().log_data, bilog_flags, &zones_trace);
- complete_op_data *arg;
- index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
- svc.zone->get_zone().log_data, bilog_flags, &zones_trace, &arg);
- librados::AioCompletion *completion = arg->rados_completion;
- int ret = bs.bucket_obj.aio_operate(arg->rados_completion, &o);
- completion->release(); /* can't reference arg here, as it might have already been released */
-
- ldout_bitx_c(bitx, cct, 10) << "EXITING " << __func__ << ": ret=" << ret << dendl_bitx;
- return ret;
-}
-
-int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
- int64_t pool, uint64_t epoch,
- rgw_bucket_dir_entry& ent, RGWObjCategory category,
- list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
-{
- return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
-}
-
-int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
- int64_t pool, uint64_t epoch,
- rgw_obj& obj,
- real_time& removed_mtime,
- list<rgw_obj_index_key> *remove_objs,
- uint16_t bilog_flags,
- rgw_zone_set *zones_trace)
-{
- rgw_bucket_dir_entry ent;
- ent.meta.mtime = removed_mtime;
- obj.key.get_index_key(&ent.key);
- return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch,
- ent, RGWObjCategory::None, remove_objs,
- bilog_flags, zones_trace);
-}
-
-int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj,
- list<rgw_obj_index_key> *remove_objs,
- uint16_t bilog_flags, rgw_zone_set *zones_trace)
-{
- rgw_bucket_dir_entry ent;
- obj.key.get_index_key(&ent.key);
- return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag,
- -1 /* pool id */, 0, ent,
- RGWObjCategory::None, remove_objs, bilog_flags,
- zones_trace);
-}
-
-int RGWRados::cls_obj_set_bucket_tag_timeout(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, uint64_t timeout)
-{
- RGWSI_RADOS::Pool index_pool;
- map<int, string> bucket_objs;
- int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
- if (r < 0)
- return r;
-
- return CLSRGWIssueSetTagTimeout(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
-}
-
-
-// returns 0 if there is an error in calculation
-uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries,
- uint32_t num_shards)
-{
- if (num_shards == 0) {
- // we'd get a floating point exception from the integer division
- // by num_shards below
- return 0;
- }
-
- // We want to minimize the chances that when num_shards >>
- // num_entries that we return much fewer than num_entries to the
- // client. Given all the overhead of making a cls call to the osd,
- // returning a few entries is not much more work than returning one
- // entry. This minimum might be better tuned based on future
- // experiments where num_shards >> num_entries. (Note: ">>" should
- // be interpreted as "much greater than".)
- constexpr uint32_t min_read = 8;
-
- // The following is based on _"Balls into Bins" -- A Simple and
- // Tight Analysis_ by Raab and Steger. We add 1 as a way to handle
- // cases when num_shards >> num_entries (it almost serves as a
- // ceiling calculation). We also assume alpha is 1.0 and extract it
- // from the calculation. Future work could involve memoizing some of
- // the transcendental functions to minimize repeatedly re-calling
- // them with the same parameters, which we expect to be the case the
- // majority of the time.
- uint32_t calc_read =
- 1 +
- static_cast<uint32_t>((num_entries / num_shards) +
- sqrt((2 * num_entries) *
- log(num_shards) / num_shards));
-
- return std::max(min_read, calc_read);
-}
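-
-// Illustrative arithmetic for the formula above (hypothetical values):
-// with num_entries=1000 and num_shards=16, the integer division
-// contributes 1000/16 = 62 and the balls-into-bins term contributes
-// sqrt(2*1000 * ln(16) / 16), about 18.6, so calc_read =
-// 1 + floor(62 + 18.6) = 81 entries per shard -- comfortably above
-// min_read.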
-
-
-int RGWRados::cls_bucket_list_ordered(const DoutPrefixProvider *dpp,
- RGWBucketInfo& bucket_info,
- const rgw::bucket_index_layout_generation& idx_layout,
- const int shard_id,
- const rgw_obj_index_key& start_after,
- const std::string& prefix,
- const std::string& delimiter,
- const uint32_t num_entries,
- const bool list_versions,
- const uint16_t expansion_factor,
- ent_map_t& m,
- bool* is_truncated,
- bool* cls_filtered,
- rgw_obj_index_key* last_entry,
- optional_yield y,
- RGWBucketListNameFilter force_check_filter)
-{
- const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
-
- /* expansion_factor allows the number of entries to read to grow
- * exponentially; this is used when earlier reads are producing too
- * few results, perhaps due to filtering or to a series of
- * namespaced entries */
-
- ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": " << bucket_info.bucket <<
- " start_after=\"" << start_after.to_string() <<
- "\", prefix=\"" << prefix <<
- ", delimiter=\"" << delimiter <<
- "\", shard_id=" << shard_id <<
- "\", num_entries=" << num_entries <<
- ", shard_id=" << shard_id <<
- ", list_versions=" << list_versions <<
- ", expansion_factor=" << expansion_factor <<
- ", force_check_filter is " <<
- (force_check_filter ? "set" : "unset") << dendl_bitx;
- ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
-
- m.clear();
-
- RGWSI_RADOS::Pool index_pool;
- // maps each shard number to the oid of its bucket index object; the
- // per-shard list results are gathered separately below and filled in
- // by the AIO callbacks
- std::map<int, std::string> shard_oids;
- int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout,
- &index_pool, &shard_oids,
- nullptr);
- if (r < 0) {
- ldpp_dout(dpp, 0) << __func__ <<
- ": open_bucket_index for " << bucket_info.bucket << " failed" << dendl;
- return r;
- }
-
- const uint32_t shard_count = shard_oids.size();
- if (shard_count == 0) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
- ": the bucket index shard count appears to be 0, "
- "which is an illegal value" << dendl;
- return -ERR_INVALID_BUCKET_STATE;
- }
-
- uint32_t num_entries_per_shard;
- if (expansion_factor == 0) {
- num_entries_per_shard =
- calc_ordered_bucket_list_per_shard(num_entries, shard_count);
- } else if (expansion_factor <= 11) {
- // we'll max out the exponential multiplication factor at 1024 (1<<10)
- num_entries_per_shard =
- std::min(num_entries,
- (uint32_t(1 << (expansion_factor - 1)) *
- calc_ordered_bucket_list_per_shard(num_entries, shard_count)));
- } else {
- num_entries_per_shard = num_entries;
- }
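-
- // for example (illustrative numbers): with expansion_factor=3 the
- // per-shard request grows by a factor of 1<<2 = 4 and is always
- // capped at num_entries; beyond expansion_factor=11 the code skips
- // the multiplication entirely and simply requests num_entries from
- // every shard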
-
- if (num_entries_per_shard == 0) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
- ": unable to calculate the number of entries to read from each "
- "bucket index shard" << dendl;
- return -ERR_INVALID_BUCKET_STATE;
- }
-
- ldpp_dout(dpp, 10) << __func__ <<
- ": request from each of " << shard_count <<
- " shard(s) for " << num_entries_per_shard << " entries to get " <<
- num_entries << " total entries" << dendl;
-
- auto& ioctx = index_pool.ioctx();
- std::map<int, rgw_cls_list_ret> shard_list_results;
- cls_rgw_obj_key start_after_key(start_after.name, start_after.instance);
- r = CLSRGWIssueBucketList(ioctx, start_after_key, prefix, delimiter,
- num_entries_per_shard,
- list_versions, shard_oids, shard_list_results,
- cct->_conf->rgw_bucket_index_max_aio)();
- if (r < 0) {
- ldpp_dout(dpp, 0) << __func__ <<
- ": CLSRGWIssueBucketList for " << bucket_info.bucket <<
- " failed" << dendl;
- return r;
- }
-
- // to manage the iterators through each shard's list results
- struct ShardTracker {
- const size_t shard_idx;
- rgw_cls_list_ret& result;
- const std::string& oid_name;
- RGWRados::ent_map_t::iterator cursor;
- RGWRados::ent_map_t::iterator end;
-
- // manages an iterator through a shard and provides other
- // accessors
- ShardTracker(size_t _shard_idx,
- rgw_cls_list_ret& _result,
- const std::string& _oid_name):
- shard_idx(_shard_idx),
- result(_result),
- oid_name(_oid_name),
- cursor(_result.dir.m.begin()),
- end(_result.dir.m.end())
- {}
-
- inline const std::string& entry_name() const {
- return cursor->first;
- }
- rgw_bucket_dir_entry& dir_entry() const {
- return cursor->second;
- }
- inline bool is_truncated() const {
- return result.is_truncated;
- }
- inline ShardTracker& advance() {
- ++cursor;
- // return a self-reference to allow for chaining of calls, such
- // as x.advance().at_end()
- return *this;
- }
- inline bool at_end() const {
- return cursor == end;
- }
- }; // ShardTracker
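-
- // what follows is in effect a k-way merge: each ShardTracker is a
- // sorted stream of entries from one shard, and the "candidates"
- // multimap below holds at most one pending entry per tracker, keyed
- // by entry name, so each iteration can consume the lexically
- // smallest entry across all shards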
-
- // add the tracker's next entry to the candidates, unless the tracker
- // has already reached the end of its shard's results
- auto next_candidate = [] (CephContext *cct, ShardTracker& t,
- std::multimap<std::string, size_t>& candidates,
- size_t tracker_idx) {
- if (!t.at_end()) {
- candidates.emplace(t.entry_name(), tracker_idx);
- }
- return;
- };
-
- // one tracker per shard requested (may not be all shards)
- std::vector<ShardTracker> results_trackers;
- results_trackers.reserve(shard_list_results.size());
- for (auto& r : shard_list_results) {
- results_trackers.emplace_back(r.first, r.second, shard_oids[r.first]);
-
- // if any *one* shard's result is truncated, the entire result is
- // truncated
- *is_truncated = *is_truncated || r.second.is_truncated;
-
- // unless *all* shards are cls_filtered, the entire result is
- // not filtered
- *cls_filtered = *cls_filtered && r.second.cls_filtered;
- }
-
- // create a map to track the next candidate entry from ShardTracker
- // (key=candidate, value=index into results_trackers); as we consume
- // entries from shards, we replace them with the next entries in the
- // shards until we run out
- std::multimap<std::string, size_t> candidates;
- size_t tracker_idx = 0;
- std::vector<size_t> vidx;
- vidx.reserve(shard_list_results.size());
- for (auto& t : results_trackers) {
- // it's important that the values in the map refer to the index
- // into the results_trackers vector, which may not be the same
- // as the shard number (i.e., when not all shards are requested)
- next_candidate(cct, t, candidates, tracker_idx);
- ++tracker_idx;
- }
-
- rgw_bucket_dir_entry*
- last_entry_visited = nullptr; // to set last_entry (marker)
- std::map<std::string, bufferlist> updates;
- uint32_t count = 0;
- while (count < num_entries && !candidates.empty()) {
- r = 0;
- // select the next entry in lexical order (first key in map);
- // again tracker_idx is not necessarily shard number, but is index
- // into results_trackers vector
- tracker_idx = candidates.begin()->second;
- auto& tracker = results_trackers.at(tracker_idx);
-
- const std::string& name = tracker.entry_name();
- rgw_bucket_dir_entry& dirent = tracker.dir_entry();
-
- ldpp_dout(dpp, 20) << __func__ << ": currently processing " <<
- dirent.key << " from shard " << tracker.shard_idx << dendl;
-
- const bool force_check =
- force_check_filter && force_check_filter(dirent.key.name);
-
- if ((!dirent.exists &&
- !dirent.is_delete_marker() &&
- !dirent.is_common_prefix()) ||
- !dirent.pending_map.empty() ||
- force_check) {
- /* there are uncommitted ops. We need to check the current
- * state, and if the tags are old we need to do clean-up as
- * well. */
- librados::IoCtx sub_ctx;
- sub_ctx.dup(ioctx);
- ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ <<
- " calling check_disk_state bucket=" << bucket_info.bucket <<
- " entry=" << dirent.key << dendl_bitx;
- r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent,
- updates[tracker.oid_name], y);
- if (r < 0 && r != -ENOENT) {
- ldpp_dout(dpp, 0) << __func__ <<
- ": check_disk_state for \"" << dirent.key <<
- "\" failed with r=" << r << dendl;
- return r;
- }
- } else {
- r = 0;
- }
-
- // at this point either r >= 0 or r == -ENOENT
- if (r >= 0) { // i.e., if r != -ENOENT
- ldpp_dout(dpp, 10) << __func__ << ": got " <<
- dirent.key << dendl;
-
- auto [it, inserted] = m.insert_or_assign(name, std::move(dirent));
- last_entry_visited = &it->second;
- if (inserted) {
- ++count;
- } else {
- ldpp_dout(dpp, 0) << "WARNING: " << __func__ <<
- " reassigned map value at \"" << name <<
- "\", which should not happen" << dendl;
- }
- } else {
- ldpp_dout(dpp, 10) << __func__ << ": skipping " <<
- dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
- last_entry_visited = &tracker.dir_entry();
- }
-
- // refresh the candidates map
- vidx.clear();
- bool need_to_stop = false;
- auto range = candidates.equal_range(name);
- for (auto i = range.first; i != range.second; ++i) {
- vidx.push_back(i->second);
- }
- candidates.erase(range.first, range.second);
- for (auto idx : vidx) {
- auto& tracker_match = results_trackers.at(idx);
- tracker_match.advance();
- next_candidate(cct, tracker_match, candidates, idx);
- if (tracker_match.at_end() && tracker_match.is_truncated()) {
- need_to_stop = true;
- break;
- }
- }
- if (need_to_stop) {
- // once we exhaust one shard that is truncated, we need to stop,
- // as we cannot be certain that one of the next entries needs to
- // come from that shard; the S3 and Swift protocols allow returning
- // fewer entries than requested
- ldpp_dout(dpp, 10) << __func__ <<
- ": stopped accumulating results at count=" << count <<
- ", dirent=\"" << dirent.key <<
- "\", because its shard is truncated and exhausted" << dendl;
- break;
- }
- } // while we haven't provided requested # of result entries
-
- // suggest updates if there are any
- for (auto& miter : updates) {
- if (miter.second.length()) {
- ObjectWriteOperation o;
- cls_rgw_suggest_changes(o, miter.second);
- // we don't care if we lose suggested updates, send them off blindly
- AioCompletion *c =
- librados::Rados::aio_create_completion(nullptr, nullptr);
-
- ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
- ": doing dir_suggest on " << miter.first << dendl_bitx;
- ioctx.aio_operate(miter.first, c, &o);
- c->release();
- }
- } // updates loop
-
- // determine truncation by checking if all the returned entries are
- // consumed or not
- *is_truncated = false;
- for (const auto& t : results_trackers) {
- if (!t.at_end() || t.is_truncated()) {
- *is_truncated = true;
- break;
- }
- }
-
- ldpp_dout(dpp, 20) << __func__ <<
- ": returning, count=" << count << ", is_truncated=" << *is_truncated <<
- dendl;
-
- if (*is_truncated && count < num_entries) {
- ldpp_dout(dpp, 10) << __func__ <<
- ": requested " << num_entries << " entries but returning " <<
- count << ", which is truncated" << dendl;
- }
-
- if (last_entry_visited != nullptr && last_entry) {
- *last_entry = last_entry_visited->key;
- ldpp_dout(dpp, 20) << __func__ <<
- ": returning, last_entry=" << *last_entry << dendl;
- } else {
- ldpp_dout(dpp, 20) << __func__ <<
- ": returning, last_entry NOT SET" << dendl;
- }
-
- ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx;
- return 0;
-} // RGWRados::cls_bucket_list_ordered
-
-
-// A helper function to retrieve the hash source from an incomplete
-// multipart entry by removing everything from the second to last
-// period on.
-static int parse_index_hash_source(const std::string& oid_wo_ns, std::string *index_hash_source) {
- std::size_t found = oid_wo_ns.rfind('.');
- if (found == std::string::npos || found < 1) {
- return -EINVAL;
- }
- found = oid_wo_ns.rfind('.', found - 1);
- if (found == std::string::npos || found < 1) {
- return -EINVAL;
- }
- *index_hash_source = oid_wo_ns.substr(0, found);
- return 0;
-}
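-
-// For example, given a (hypothetical) incomplete multipart entry named
-// "photo.jpg.2~J5x0a3.meta", the two rfind calls locate the periods
-// before "meta" and before the upload id, so the result is
-// "photo.jpg" -- the object name whose hash placed the entry on its
-// shard.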
-
-
-int RGWRados::cls_bucket_list_unordered(const DoutPrefixProvider *dpp,
- RGWBucketInfo& bucket_info,
- const rgw::bucket_index_layout_generation& idx_layout,
- int shard_id,
- const rgw_obj_index_key& start_after,
- const std::string& prefix,
- uint32_t num_entries,
- bool list_versions,
- std::vector<rgw_bucket_dir_entry>& ent_list,
- bool *is_truncated,
- rgw_obj_index_key *last_entry,
- optional_yield y,
- RGWBucketListNameFilter force_check_filter) {
- const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
-
- ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": " << bucket_info.bucket <<
- " start_after=\"" << start_after <<
- "\", prefix=\"" << prefix <<
- "\", shard_id=" << shard_id <<
- "\", num_entries=" << num_entries <<
- ", list_versions=" << list_versions <<
- (force_check_filter ? "set" : "unset") << dendl_bitx;
- ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
-
- ent_list.clear();
- static MultipartMetaFilter multipart_meta_filter;
-
- *is_truncated = false;
- RGWSI_RADOS::Pool index_pool;
-
- std::map<int, std::string> oids;
- int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &oids, nullptr);
- if (r < 0) {
- return r;
- }
-
- auto& ioctx = index_pool.ioctx();
-
- const uint32_t num_shards = oids.size();
-
- rgw_obj_index_key marker = start_after;
- uint32_t current_shard;
- if (shard_id >= 0) {
- current_shard = shard_id;
- } else if (start_after.empty()) {
- current_shard = 0u;
- } else {
- // at this point we have a non-empty marker (start_after), so we
- // need to determine which bucket index shard it falls on and start
- // reading from there
-
- // first convert the key (oid) to an rgw_obj_key, since that
- // separates out the namespace, name, and instance
- rgw_obj_key obj_key;
- bool parsed = rgw_obj_key::parse_raw_oid(start_after.name, &obj_key);
- if (!parsed) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
- " received an invalid start marker: \"" << start_after << "\"" <<
- dendl;
- return -EINVAL;
- } else if (obj_key.name.empty()) {
- // if the name is empty that means the object name came in with
- // a namespace only, and therefore we need to start our scan at
- // the first bucket index shard
- current_shard = 0u;
- } else {
- // so now we have the key used to compute the bucket index shard
- // and can extract the specific shard from it
- if (obj_key.ns == RGW_OBJ_NS_MULTIPART) {
- // test obj_key.ns == RGW_OBJ_NS_MULTIPART rather than relying on
- // MultipartMetaFilter, because that filter only checks for a .meta
- // suffix; it would miss multipart data entries and could mistakenly
- // match regular objects whose names happen to end in .meta
- string index_hash_source;
- r = parse_index_hash_source(obj_key.name, &index_hash_source);
- if (r < 0) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
- " parse_index_hash_source unable to parse \"" << obj_key.name <<
- "\", r=" << r << dendl;
- return r;
- }
- current_shard = svc.bi_rados->bucket_shard_index(index_hash_source, num_shards);
- } else {
- current_shard = svc.bi_rados->bucket_shard_index(obj_key.name, num_shards);
- }
- }
- }
-
- uint32_t count = 0u;
- std::map<std::string, bufferlist> updates;
- rgw_obj_index_key last_added_entry;
- while (count <= num_entries &&
- ((shard_id >= 0 && current_shard == uint32_t(shard_id)) ||
- current_shard < num_shards)) {
- const std::string& oid = oids[current_shard];
- rgw_cls_list_ret result;
-
- librados::ObjectReadOperation op;
- const std::string empty_delimiter;
- cls_rgw_bucket_list_op(op, marker, prefix, empty_delimiter,
- num_entries,
- list_versions, &result);
- r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, null_yield);
- if (r < 0) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
- ": error in rgw_rados_operate (bucket list op), r=" << r << dendl;
- return r;
- }
-
- for (auto& entry : result.dir.m) {
- rgw_bucket_dir_entry& dirent = entry.second;
-
- bool force_check = force_check_filter &&
- force_check_filter(dirent.key.name);
- if ((!dirent.exists && !dirent.is_delete_marker()) ||
- !dirent.pending_map.empty() ||
- force_check) {
- /* there are uncommitted ops. We need to check the current state,
- * and if the tags are old we need to do cleanup as well. */
- librados::IoCtx sub_ctx;
- sub_ctx.dup(ioctx);
- ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ <<
- ": calling check_disk_state bucket=" << bucket_info.bucket <<
- " entry=" << dirent.key << dendl_bitx;
- r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent, updates[oid], y);
- if (r < 0 && r != -ENOENT) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
- ": error in check_disk_state, r=" << r << dendl;
- return r;
- }
- } else {
- r = 0;
- }
-
- // at this point either r >= 0 or r == -ENOENT
- if (r >= 0) { // i.e., if r != -ENOENT
- ldpp_dout(dpp, 10) << __func__ << ": got " <<
- dirent.key << dendl;
-
- if (count < num_entries) {
- marker = last_added_entry = dirent.key; // double assign
- ent_list.emplace_back(std::move(dirent));
- ++count;
- } else {
- last_added_entry = dirent.key;
- *is_truncated = true;
- ldpp_dout(dpp, 10) << "INFO: " << __func__ <<
- ": reached max entries (" << num_entries << ") to return at \"" <<
- dirent.key << "\"" << dendl;
- goto check_updates;
- }
- } else { // r == -ENOENT
- // in the case of -ENOENT, make sure we're advancing marker
- // for the possible next list operation
- marker = dirent.key;
- }
- } // entry for loop
-
- if (!result.is_truncated) {
- // if we reached the end of the shard read next shard
- ++current_shard;
- marker = rgw_obj_index_key();
- }
- } // shard loop
-
-check_updates:
-
- // suggest updates if there are any
- std::map<std::string, bufferlist>::iterator miter = updates.begin();
- for (; miter != updates.end(); ++miter) {
- if (miter->second.length()) {
- ObjectWriteOperation o;
- cls_rgw_suggest_changes(o, miter->second);
- // we don't care if we lose suggested updates, send them off blindly
- AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
-
- ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
- " doing dir_suggest on " << miter->first << dendl_bitx;
- ioctx.aio_operate(miter->first, c, &o);
- c->release();
- }
- }
-
- if (last_entry && !ent_list.empty()) {
- *last_entry = last_added_entry;
- }
-
- ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx;
- return 0;
-} // RGWRados::cls_bucket_list_unordered
-
-
-int RGWRados::cls_obj_usage_log_add(const DoutPrefixProvider *dpp, const string& oid,
- rgw_usage_log_info& info)
-{
- rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
-
- rgw_rados_ref ref;
- int r = get_raw_obj_ref(dpp, obj, &ref);
- if (r < 0) {
- return r;
- }
-
- ObjectWriteOperation op;
- cls_rgw_usage_log_add(op, info);
-
- r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
- return r;
-}
-
-int RGWRados::cls_obj_usage_log_read(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket,
- uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
- string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage,
- bool *is_truncated)
-{
- rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
-
- rgw_rados_ref ref;
- int r = get_raw_obj_ref(dpp, obj, &ref);
- if (r < 0) {
- return r;
- }
-
- *is_truncated = false;
-
- r = cls_rgw_usage_log_read(ref.pool.ioctx(), ref.obj.oid, user, bucket, start_epoch, end_epoch,
- max_entries, read_iter, usage, is_truncated);
-
- return r;
-}
-
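-// A single trim op may cover only part of the requested range, so the
-// helper below repeats the op until the cls backend returns -ENODATA,
-// i.e. there is nothing left to trim.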
-static int cls_rgw_usage_log_trim_repeat(const DoutPrefixProvider *dpp, rgw_rados_ref ref, const string& user, const string& bucket, uint64_t start_epoch, uint64_t end_epoch)
-{
- bool done = false;
- do {
- librados::ObjectWriteOperation op;
- cls_rgw_usage_log_trim(op, user, bucket, start_epoch, end_epoch);
- int r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
- if (r == -ENODATA)
- done = true;
- else if (r < 0)
- return r;
- } while (!done);
-
- return 0;
-}
-
-int RGWRados::cls_obj_usage_log_trim(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket,
- uint64_t start_epoch, uint64_t end_epoch)
-{
- rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
-
- rgw_rados_ref ref;
- int r = get_raw_obj_ref(dpp, obj, &ref);
- if (r < 0) {
- return r;
- }
-
- r = cls_rgw_usage_log_trim_repeat(dpp, ref, user, bucket, start_epoch, end_epoch);
- return r;
-}
-
-int RGWRados::cls_obj_usage_log_clear(const DoutPrefixProvider *dpp, string& oid)
-{
- rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
-
- rgw_rados_ref ref;
- int r = get_raw_obj_ref(dpp, obj, &ref);
- if (r < 0) {
- return r;
- }
- librados::ObjectWriteOperation op;
- cls_rgw_usage_log_clear(op);
- r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
- return r;
-}
-
-
-// note: this removes entries from the rados bucket index objects
-// without going through CLS; this is known to be called from
-// "radosgw-admin unlink" and "radosgw-admin bucket check --fix"
-int RGWRados::remove_objs_from_index(const DoutPrefixProvider *dpp,
- RGWBucketInfo& bucket_info,
- const std::list<rgw_obj_index_key>& entry_key_list)
-{
- const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
- ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket=" << bucket_info.bucket <<
- " entry_key_list.size()=" << entry_key_list.size() << dendl_bitx;
- ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
-
- const auto& current_index = bucket_info.get_current_index();
- if (is_layout_indexless(current_index)) {
- return -EINVAL;
- }
- const uint32_t num_shards = current_index.layout.normal.num_shards;
-
- RGWSI_RADOS::Pool index_pool;
- std::map<int, std::string> index_oids;
- int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt,
- bucket_info.layout.current_index,
- &index_pool, &index_oids, nullptr);
- if (r < 0) {
- ldout_bitx(bitx, dpp, 0) << "ERROR: " << __func__ <<
- " open_bucket_index returned " << r << dendl_bitx;
- return r;
- }
-
- // split up removals by shard
- std::map<int, std::set<std::string>> sharded_removals;
- for (const auto& entry_key : entry_key_list) {
- const rgw_obj_key obj_key(entry_key);
- const uint32_t shard =
- RGWSI_BucketIndex_RADOS::bucket_shard_index(obj_key, num_shards);
-
- // entry_key already combines namespace and name, so we first have
- // to break that apart before we can then combine with instance
- std::string name;
- std::string ns; // namespace
- rgw_obj_key::parse_index_key(entry_key.name, &name, &ns);
- rgw_obj_key full_key(name, entry_key.instance, ns);
- std::string combined_key = full_key.get_oid();
-
- sharded_removals[shard].insert(combined_key);
-
- ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ <<
- ": removal from bucket index, bucket=" << bucket_info.bucket <<
- " key=" << combined_key << " designated for shard " << shard <<
- dendl_bitx;
- }
-
- for (const auto& removals : sharded_removals) {
- const int shard = removals.first;
- const std::string& oid = index_oids[shard];
-
- ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
- ": removal from bucket index, bucket=" << bucket_info.bucket <<
- ", shard=" << shard << ", oid=" << oid << ", num_keys=" <<
- removals.second.size() << dendl_bitx;
-
- r = index_pool.ioctx().omap_rm_keys(oid, removals.second);
- if (r < 0) {
- ldout_bitx(bitx, dpp, 0) << "ERROR: " << __func__ <<
- ": omap_rm_keys returned ret=" << r <<
- dendl_bitx;
- return r;
- }
- }
-
- ldout_bitx(bitx, dpp, 5) <<
- "EXITING " << __func__ << " and returning " << r << dendl_bitx;
-
- return r;
-}
-
-int RGWRados::check_disk_state(const DoutPrefixProvider *dpp,
- librados::IoCtx io_ctx,
- RGWBucketInfo& bucket_info,
- rgw_bucket_dir_entry& list_state,
- rgw_bucket_dir_entry& object,
- bufferlist& suggested_updates,
- optional_yield y)
-{
- const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
- ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket=" <<
- bucket_info.bucket << " dir_entry=" << list_state.key << dendl_bitx;
-
- std::unique_ptr<rgw::sal::Bucket> bucket;
- driver->get_bucket(nullptr, bucket_info, &bucket);
- uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
-
- std::string loc;
-
- std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(list_state.key);
- MultipartMetaFilter multipart_meta_filter;
- string temp_key;
- if (multipart_meta_filter.filter(list_state.key.name, temp_key)) {
- obj->set_in_extra_data(true);
- }
-
- string oid;
- get_obj_bucket_and_oid_loc(obj->get_obj(), oid, loc);
-
- if (loc != list_state.locator) {
- ldpp_dout(dpp, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
- }
-
- io_ctx.locator_set_key(list_state.locator);
-
- RGWObjState *astate = NULL;
- RGWObjManifest *manifest = nullptr;
- RGWObjectCtx rctx(this->driver);
- int r = get_obj_state(dpp, &rctx, bucket_info, obj.get(), &astate, &manifest, false, y);
- if (r < 0)
- return r;
-
- list_state.pending_map.clear(); // we don't need this and it inflates size
- if (!list_state.is_delete_marker() && !astate->exists) {
- ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": disk state exists" << dendl_bitx;
- /* object doesn't exist right now -- hopefully because it's
- * marked as !exists and got deleted */
- if (list_state.exists) {
- ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": index list state exists" << dendl_bitx;
- /* FIXME: what should happen now? Work out if there are any
- * non-bad ways this could happen (there probably are, but annoying
- * to handle!) */
- }
-
- // encode a suggested removal of that key
- list_state.ver.epoch = io_ctx.get_last_version();
- list_state.ver.pool = io_ctx.get_id();
- ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": encoding remove of " << list_state.key << " on suggested_updates" << dendl_bitx;
- cls_rgw_encode_suggestion(CEPH_RGW_REMOVE | suggest_flag, list_state, suggested_updates);
- return -ENOENT;
- }
-
- string etag;
- string content_type;
- string storage_class;
- ACLOwner owner;
- bool appendable = false;
-
- object.meta.size = astate->size;
- object.meta.accounted_size = astate->accounted_size;
- object.meta.mtime = astate->mtime;
-
- map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
- if (iter != astate->attrset.end()) {
- etag = rgw_bl_str(iter->second);
- }
- iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
- if (iter != astate->attrset.end()) {
- content_type = rgw_bl_str(iter->second);
- }
- iter = astate->attrset.find(RGW_ATTR_STORAGE_CLASS);
- if (iter != astate->attrset.end()) {
- storage_class = rgw_bl_str(iter->second);
- }
- iter = astate->attrset.find(RGW_ATTR_ACL);
- if (iter != astate->attrset.end()) {
- r = decode_policy(dpp, iter->second, &owner);
- if (r < 0) {
- ldpp_dout(dpp, 0) << "WARNING: could not decode policy for object: " << obj << dendl;
- }
- }
- iter = astate->attrset.find(RGW_ATTR_APPEND_PART_NUM);
- if (iter != astate->attrset.end()) {
- appendable = true;
- }
-
- if (manifest) {
- RGWObjManifest::obj_iterator miter;
- for (miter = manifest->obj_begin(dpp); miter != manifest->obj_end(dpp); ++miter) {
- const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(driver);
- rgw_obj loc;
- RGWSI_Tier_RADOS::raw_obj_to_obj(manifest->get_obj().bucket, raw_loc, &loc);
-
- if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
- ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << " removing manifest part from index loc=" << loc << dendl_bitx;
- r = delete_obj_index(loc, astate->mtime, dpp);
- if (r < 0) {
- ldout_bitx(bitx, dpp, 0) <<
- "WARNING: " << __func__ << ": delete_obj_index returned r=" << r << dendl_bitx;
- }
- }
- }
- }
-
- object.meta.etag = etag;
- object.meta.content_type = content_type;
- object.meta.storage_class = storage_class;
- object.meta.owner = owner.get_id().to_str();
- object.meta.owner_display_name = owner.get_display_name();
- object.meta.appendable = appendable;
-
- // encode suggested updates
-
- list_state.meta.size = object.meta.size;
- list_state.meta.accounted_size = object.meta.accounted_size;
- list_state.meta.mtime = object.meta.mtime;
- list_state.meta.category = main_category;
- list_state.meta.etag = etag;
- list_state.meta.appendable = appendable;
- list_state.meta.content_type = content_type;
- list_state.meta.storage_class = storage_class;
-
- librados::IoCtx head_obj_ctx; // initialize to data pool so we can get pool id
- r = get_obj_head_ioctx(dpp, bucket_info, obj->get_obj(), &head_obj_ctx);
- if (r < 0) {
- ldpp_dout(dpp, 0) << __func__ <<
- " WARNING: unable to find head object data pool for \"" <<
- obj << "\", not updating version pool/epoch" << dendl;
- } else {
- list_state.ver.pool = head_obj_ctx.get_id();
- list_state.ver.epoch = astate->epoch;
- }
-
- if (astate->obj_tag.length() > 0) {
- list_state.tag = astate->obj_tag.c_str();
- }
-
- list_state.meta.owner = owner.get_id().to_str();
- list_state.meta.owner_display_name = owner.get_display_name();
-
- list_state.exists = true;
-
- ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
- ": encoding update of " << list_state.key << " on suggested_updates" << dendl_bitx;
- cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
-
- ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx;
- return 0;
-} // RGWRados::check_disk_state
-
-int RGWRados::cls_bucket_head(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, vector<rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
-{
- RGWSI_RADOS::Pool index_pool;
- map<int, string> oids;
- map<int, struct rgw_cls_list_ret> list_results;
- int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &oids, bucket_instance_ids);
- if (r < 0) {
- ldpp_dout(dpp, 20) << "cls_bucket_head: open_bucket_index() returned "
- << r << dendl;
- return r;
- }
-
- r = CLSRGWIssueGetDirHeader(index_pool.ioctx(), oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
- if (r < 0) {
- ldpp_dout(dpp, 20) << "cls_bucket_head: CLSRGWIssueGetDirHeader() returned "
- << r << dendl;
- return r;
- }
-
- map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
- for(; iter != list_results.end(); ++iter) {
- headers.push_back(std::move(iter->second.dir.header));
- }
- return 0;
-}
-
-int RGWRados::cls_bucket_head_async(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
-{
- RGWSI_RADOS::Pool index_pool;
- map<int, string> bucket_objs;
- int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &bucket_objs, nullptr);
- if (r < 0)
- return r;
-
- map<int, string>::iterator iter = bucket_objs.begin();
- for (; iter != bucket_objs.end(); ++iter) {
- r = cls_rgw_get_dir_header_async(index_pool.ioctx(), iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
- if (r < 0) {
- ctx->put();
- break;
- } else {
- (*num_aio)++;
- }
- }
- return r;
-}
-
-int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info,
- const rgw_bucket& bucket,
- uint64_t num_objs,
- const DoutPrefixProvider *dpp)
-{
- if (! cct->_conf.get_val<bool>("rgw_dynamic_resharding")) {
- return 0;
- }
-
- bool need_resharding = false;
- uint32_t num_source_shards = rgw::current_num_shards(bucket_info.layout);
- const uint32_t max_dynamic_shards =
- uint32_t(cct->_conf.get_val<uint64_t>("rgw_max_dynamic_shards"));
-
- if (num_source_shards >= max_dynamic_shards) {
- return 0;
- }
-
- uint32_t suggested_num_shards = 0;
- const uint64_t max_objs_per_shard =
- cct->_conf.get_val<uint64_t>("rgw_max_objs_per_shard");
-
- // TODO: consider per-bucket sync policy here?
- const bool is_multisite = svc.zone->get_zone().log_data;
-
- quota_handler->check_bucket_shards(dpp, max_objs_per_shard, num_source_shards,
- num_objs, is_multisite, need_resharding,
- &suggested_num_shards);
- if (! need_resharding) {
- return 0;
- }
-
- const uint32_t final_num_shards =
- RGWBucketReshard::get_preferred_shards(suggested_num_shards,
- max_dynamic_shards);
- // final verification, so we don't reduce number of shards
- if (final_num_shards <= num_source_shards) {
- return 0;
- }
-
- ldpp_dout(dpp, 1) << "RGWRados::" << __func__ << " bucket " << bucket.name <<
- " needs resharding; current num shards " << bucket_info.layout.current_index.layout.normal.num_shards <<
- "; new num shards " << final_num_shards << " (suggested " <<
- suggested_num_shards << ")" << dendl;
-
- return add_bucket_to_reshard(dpp, bucket_info, final_num_shards);
-}
-
-int RGWRados::add_bucket_to_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
-{
- RGWReshard reshard(this->driver, dpp);
-
- uint32_t num_source_shards = rgw::current_num_shards(bucket_info.layout);
-
- new_num_shards = std::min(new_num_shards, get_max_bucket_shards());
- if (new_num_shards <= num_source_shards) {
- ldpp_dout(dpp, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
- return 0;
- }
-
- cls_rgw_reshard_entry entry;
- entry.time = real_clock::now();
- entry.tenant = bucket_info.owner.tenant;
- entry.bucket_name = bucket_info.bucket.name;
- entry.bucket_id = bucket_info.bucket.bucket_id;
- entry.old_num_shards = num_source_shards;
- entry.new_num_shards = new_num_shards;
-
- return reshard.add(dpp, entry);
-}
-
-int RGWRados::check_quota(const DoutPrefixProvider *dpp, const rgw_user& bucket_owner, rgw_bucket& bucket,
- RGWQuota& quota,
- uint64_t obj_size, optional_yield y,
- bool check_size_only)
-{
- // if we only check size, then num_objs is set to 0
- if (check_size_only)
- return quota_handler->check_quota(dpp, bucket_owner, bucket, quota, 0, obj_size, y);
-
- return quota_handler->check_quota(dpp, bucket_owner, bucket, quota, 1, obj_size, y);
-}
-
-int RGWRados::get_target_shard_id(const rgw::bucket_index_normal_layout& layout, const string& obj_key,
- int *shard_id)
-{
- int r = 0;
- switch (layout.hash_type) {
- case rgw::BucketHashType::Mod:
- if (!layout.num_shards) {
- if (shard_id) {
- *shard_id = -1;
- }
- } else {
- uint32_t sid = svc.bi_rados->bucket_shard_index(obj_key, layout.num_shards);
- if (shard_id) {
- *shard_id = (int)sid;
- }
- }
- break;
- default:
- r = -ENOTSUP;
- }
- return r;
-}
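-
-// Under BucketHashType::Mod the target shard is, in effect,
-// hash(obj_key) mod num_shards (computed by bucket_shard_index); a
-// shard_id of -1 denotes an unsharded layout with a single index
-// object.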
-
-uint64_t RGWRados::instance_id()
-{
- return get_rados_handle()->get_instance_id();
-}
-
-uint64_t RGWRados::next_bucket_id()
-{
- std::lock_guard l{bucket_id_lock};
- return ++max_bucket_id;
-}
-
-librados::Rados* RGWRados::get_rados_handle()
-{
- return &rados;
-}
-
-int RGWRados::delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
-{
- rgw_rados_ref ref;
- int ret = get_raw_obj_ref(dpp, obj, &ref);
- if (ret < 0) {
- ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
- return ret;
- }
-
- ObjectWriteOperation op;
- list<string> prefixes;
- cls_rgw_remove_obj(op, prefixes);
-
- AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
- ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
- if (ret < 0) {
- ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl;
- c->release();
- return ret;
- }
-
- handles.push_back(c);
-
- return 0;
-}
-
-int RGWRados::delete_obj_aio(const DoutPrefixProvider *dpp, const rgw_obj& obj,
- RGWBucketInfo& bucket_info, RGWObjState *astate,
- list<librados::AioCompletion *>& handles, bool keep_index_consistent,
- optional_yield y)
-{
- rgw_rados_ref ref;
- int ret = get_obj_head_ref(dpp, bucket_info, obj, &ref);
- if (ret < 0) {
- ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
- return ret;
- }
-
- if (keep_index_consistent) {
- RGWRados::Bucket bop(this, bucket_info);
- RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
-
- ret = index_op.prepare(dpp, CLS_RGW_OP_DEL, &astate->write_tag, y);
- if (ret < 0) {
- ldpp_dout(dpp, -1) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
- return ret;
- }
- }
-
- ObjectWriteOperation op;
- list<string> prefixes;
- cls_rgw_remove_obj(op, prefixes);
-
- AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
- ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
- if (ret < 0) {
- ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl;
- c->release();
- return ret;
- }
-
- handles.push_back(c);
-
- if (keep_index_consistent) {
- ret = delete_obj_index(obj, astate->mtime, dpp);
- if (ret < 0) {
- ldpp_dout(dpp, -1) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
- return ret;
- }
- }
- return ret;
-}
-
-void objexp_hint_entry::generate_test_instances(list<objexp_hint_entry*>& o)
-{
- auto it = new objexp_hint_entry;
- it->tenant = "tenant1";
- it->bucket_name = "bucket1";
- it->bucket_id = "1234";
- it->obj_key = rgw_obj_key("obj");
- o.push_back(it);
- o.push_back(new objexp_hint_entry);
-}
-
-void objexp_hint_entry::dump(Formatter *f) const
-{
- f->open_object_section("objexp_hint_entry");
- encode_json("tenant", tenant, f);
- encode_json("bucket_name", bucket_name, f);
- encode_json("bucket_id", bucket_id, f);
- encode_json("rgw_obj_key", obj_key, f);
- utime_t ut(exp_time);
- encode_json("exp_time", ut, f);
- f->close_section();
-}
-
-void RGWOLHInfo::generate_test_instances(list<RGWOLHInfo*> &o)
-{
- RGWOLHInfo *olh = new RGWOLHInfo;
- olh->removed = false;
- o.push_back(olh);
- o.push_back(new RGWOLHInfo);
-}
-
-void RGWOLHInfo::dump(Formatter *f) const
-{
- encode_json("target", target, f);
-}
-
-void RGWOLHPendingInfo::dump(Formatter *f) const
-{
- utime_t ut(time);
- encode_json("time", ut, f);
-}
-
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGWRADOS_H
-#define CEPH_RGWRADOS_H
-
-#include <iostream>
-#include <functional>
-#include <boost/container/flat_map.hpp>
-#include <boost/container/flat_set.hpp>
-
-#include "include/rados/librados.hpp"
-#include "include/Context.h"
-#include "include/random.h"
-#include "common/RefCountedObj.h"
-#include "common/ceph_time.h"
-#include "common/Timer.h"
-#include "rgw_common.h"
-#include "cls/rgw/cls_rgw_types.h"
-#include "cls/version/cls_version_types.h"
-#include "cls/log/cls_log_types.h"
-#include "cls/timeindex/cls_timeindex_types.h"
-#include "cls/otp/cls_otp_types.h"
-#include "rgw_quota.h"
-#include "rgw_log.h"
-#include "rgw_metadata.h"
-#include "rgw_meta_sync_status.h"
-#include "rgw_period_puller.h"
-#include "rgw_obj_manifest.h"
-#include "rgw_sync_module.h"
-#include "rgw_trim_bilog.h"
-#include "rgw_service.h"
-#include "rgw_sal.h"
-#include "rgw_aio.h"
-#include "rgw_d3n_cacherequest.h"
-
-#include "services/svc_rados.h"
-#include "services/svc_bi_rados.h"
-#include "common/Throttle.h"
-#include "common/ceph_mutex.h"
-#include "rgw_cache.h"
-#include "rgw_sal_fwd.h"
-
-struct D3nDataCache;
-
-class RGWWatcher;
-class ACLOwner;
-class RGWGC;
-class RGWMetaNotifier;
-class RGWDataNotifier;
-class RGWLC;
-class RGWObjectExpirer;
-class RGWMetaSyncProcessorThread;
-class RGWDataSyncProcessorThread;
-class RGWSyncLogTrimThread;
-class RGWSyncTraceManager;
-struct RGWZoneGroup;
-struct RGWZoneParams;
-class RGWReshard;
-class RGWReshardWait;
-
-struct get_obj_data;
-
-/* flags for put_obj_meta() */
-#define PUT_OBJ_CREATE 0x01
-#define PUT_OBJ_EXCL 0x02
-#define PUT_OBJ_CREATE_EXCL (PUT_OBJ_CREATE | PUT_OBJ_EXCL)
-
-static inline void prepend_bucket_marker(const rgw_bucket& bucket, const std::string& orig_oid, std::string& oid)
-{
- if (bucket.marker.empty() || orig_oid.empty()) {
- oid = orig_oid;
- } else {
- oid = bucket.marker;
- oid.append("_");
- oid.append(orig_oid);
- }
-}
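-
-// For example (hypothetical marker): with bucket marker "9f21a.4001"
-// and orig_oid "myobj", the resulting oid is "9f21a.4001_myobj"; if
-// either input is empty, the oid is simply orig_oid.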
-
-static inline void get_obj_bucket_and_oid_loc(const rgw_obj& obj, std::string& oid, std::string& locator)
-{
- const rgw_bucket& bucket = obj.bucket;
- prepend_bucket_marker(bucket, obj.get_oid(), oid);
- const std::string& loc = obj.key.get_loc();
- if (!loc.empty()) {
- prepend_bucket_marker(bucket, loc, locator);
- } else {
- locator.clear();
- }
-}
-
-struct RGWOLHInfo {
- rgw_obj target;
- bool removed;
-
- RGWOLHInfo() : removed(false) {}
-
- void encode(bufferlist& bl) const {
- ENCODE_START(1, 1, bl);
- encode(target, bl);
- encode(removed, bl);
- ENCODE_FINISH(bl);
- }
-
- void decode(bufferlist::const_iterator& bl) {
- DECODE_START(1, bl);
- decode(target, bl);
- decode(removed, bl);
- DECODE_FINISH(bl);
- }
- static void generate_test_instances(std::list<RGWOLHInfo*>& o);
- void dump(Formatter *f) const;
-};
-WRITE_CLASS_ENCODER(RGWOLHInfo)
-
-struct RGWOLHPendingInfo {
- ceph::real_time time;
-
- RGWOLHPendingInfo() {}
-
- void encode(bufferlist& bl) const {
- ENCODE_START(1, 1, bl);
- encode(time, bl);
- ENCODE_FINISH(bl);
- }
-
- void decode(bufferlist::const_iterator& bl) {
- DECODE_START(1, bl);
- decode(time, bl);
- DECODE_FINISH(bl);
- }
-
- void dump(Formatter *f) const;
-};
-WRITE_CLASS_ENCODER(RGWOLHPendingInfo)
-
-struct RGWUsageBatch {
- std::map<ceph::real_time, rgw_usage_log_entry> m;
-
- void insert(ceph::real_time& t, rgw_usage_log_entry& entry, bool *account) {
- bool exists = m.find(t) != m.end();
- *account = !exists;
- m[t].aggregate(entry);
- }
-};
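-
-// Usage note: inserting two entries with the same timestamp aggregates
-// them into a single slot; *account comes back true only for the first
-// one, letting callers count distinct time slots rather than raw
-// insertions.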
-
-struct RGWCloneRangeInfo {
- rgw_obj src;
- off_t src_ofs;
- off_t dst_ofs;
- uint64_t len;
-};
-
-class RGWFetchObjFilter {
-public:
- virtual ~RGWFetchObjFilter() {}
-
- virtual int filter(CephContext *cct,
- const rgw_obj_key& source_key,
- const RGWBucketInfo& dest_bucket_info,
- std::optional<rgw_placement_rule> dest_placement_rule,
- const std::map<std::string, bufferlist>& obj_attrs,
- std::optional<rgw_user> *poverride_owner,
- const rgw_placement_rule **prule) = 0;
-};
-
-class RGWFetchObjFilter_Default : public RGWFetchObjFilter {
-protected:
- rgw_placement_rule dest_rule;
-public:
- RGWFetchObjFilter_Default() {}
-
- int filter(CephContext *cct,
- const rgw_obj_key& source_key,
- const RGWBucketInfo& dest_bucket_info,
- std::optional<rgw_placement_rule> dest_placement_rule,
- const std::map<std::string, bufferlist>& obj_attrs,
- std::optional<rgw_user> *poverride_owner,
- const rgw_placement_rule **prule) override;
-};
-
-struct RGWObjStateManifest {
- RGWObjState state;
- std::optional<RGWObjManifest> manifest;
-};
-
-class RGWObjectCtx {
- rgw::sal::Driver* driver;
- ceph::shared_mutex lock = ceph::make_shared_mutex("RGWObjectCtx");
-
- std::map<rgw_obj, RGWObjStateManifest> objs_state;
-public:
- explicit RGWObjectCtx(rgw::sal::Driver* _driver) : driver(_driver) {}
- RGWObjectCtx(RGWObjectCtx& _o) {
- std::unique_lock wl{lock};
- this->driver = _o.driver;
- this->objs_state = _o.objs_state;
- }
-
- rgw::sal::Driver* get_driver() {
- return driver;
- }
-
- RGWObjStateManifest *get_state(const rgw_obj& obj);
-
- void set_compressed(const rgw_obj& obj);
- void set_atomic(rgw_obj& obj);
- void set_prefetch_data(const rgw_obj& obj);
- void invalidate(const rgw_obj& obj);
-};
-
-
-struct RGWRawObjState {
- rgw_raw_obj obj;
- bool has_attrs{false};
- bool exists{false};
- uint64_t size{0};
- ceph::real_time mtime;
- uint64_t epoch{0};
- bufferlist obj_tag;
- bool has_data{false};
- bufferlist data;
- bool prefetch_data{false};
- uint64_t pg_ver{0};
-
- /* important! don't forget to update copy constructor */
-
- RGWObjVersionTracker objv_tracker;
-
- std::map<std::string, bufferlist> attrset;
- RGWRawObjState() {}
- RGWRawObjState(const RGWRawObjState& rhs) : obj (rhs.obj) {
- has_attrs = rhs.has_attrs;
- exists = rhs.exists;
- size = rhs.size;
- mtime = rhs.mtime;
- epoch = rhs.epoch;
- if (rhs.obj_tag.length()) {
- obj_tag = rhs.obj_tag;
- }
- has_data = rhs.has_data;
- if (rhs.data.length()) {
- data = rhs.data;
- }
- prefetch_data = rhs.prefetch_data;
- pg_ver = rhs.pg_ver;
- objv_tracker = rhs.objv_tracker;
- }
-};
-
-struct RGWPoolIterCtx {
- librados::IoCtx io_ctx;
- librados::NObjectIterator iter;
-};
-
-struct RGWListRawObjsCtx {
- bool initialized;
- RGWPoolIterCtx iter_ctx;
-
- RGWListRawObjsCtx() : initialized(false) {}
-};
-
-struct objexp_hint_entry {
- std::string tenant;
- std::string bucket_name;
- std::string bucket_id;
- rgw_obj_key obj_key;
- ceph::real_time exp_time;
-
- void encode(bufferlist& bl) const {
- ENCODE_START(2, 1, bl);
- encode(bucket_name, bl);
- encode(bucket_id, bl);
- encode(obj_key, bl);
- encode(exp_time, bl);
- encode(tenant, bl);
- ENCODE_FINISH(bl);
- }
-
- void decode(bufferlist::const_iterator& bl) {
- // XXX Do we want DECODE_START_LEGACY_COMPAT_LEN(2, 1, 1, bl); ?
- DECODE_START(2, bl);
- decode(bucket_name, bl);
- decode(bucket_id, bl);
- decode(obj_key, bl);
- decode(exp_time, bl);
- if (struct_v >= 2) {
- decode(tenant, bl);
- } else {
- tenant.clear();
- }
- DECODE_FINISH(bl);
- }
-
- void dump(Formatter *f) const;
- static void generate_test_instances(std::list<objexp_hint_entry*>& o);
-};
-WRITE_CLASS_ENCODER(objexp_hint_entry)
-
-class RGWMetaSyncStatusManager;
-class RGWDataSyncStatusManager;
-class RGWCoroutinesManagerRegistry;
-
-class RGWGetDirHeader_CB;
-class RGWGetUserHeader_CB;
-namespace rgw { namespace sal {
- class RadosStore;
- class MPRadosSerializer;
- class LCRadosSerializer;
-} }
-
-class RGWAsyncRadosProcessor;
-
-template <class T>
-class RGWChainedCacheImpl;
-
-struct bucket_info_entry {
- RGWBucketInfo info;
- real_time mtime;
- std::map<std::string, bufferlist> attrs;
-};
-
-struct tombstone_entry;
-
-template <class K, class V>
-class lru_map;
-using tombstone_cache_t = lru_map<rgw_obj, tombstone_entry>;
-
-class RGWIndexCompletionManager;
-
-class RGWRados
-{
- friend class RGWGC;
- friend class RGWMetaNotifier;
- friend class RGWDataNotifier;
- friend class RGWObjectExpirer;
- friend class RGWMetaSyncProcessorThread;
- friend class RGWDataSyncProcessorThread;
- friend class RGWReshard;
- friend class RGWBucketReshard;
- friend class RGWBucketReshardLock;
- friend class BucketIndexLockGuard;
- friend class rgw::sal::MPRadosSerializer;
- friend class rgw::sal::LCRadosSerializer;
- friend class rgw::sal::RadosStore;
-
- /** Open the pool used as root for this gateway */
- int open_root_pool_ctx(const DoutPrefixProvider *dpp);
- int open_gc_pool_ctx(const DoutPrefixProvider *dpp);
- int open_lc_pool_ctx(const DoutPrefixProvider *dpp);
- int open_objexp_pool_ctx(const DoutPrefixProvider *dpp);
- int open_reshard_pool_ctx(const DoutPrefixProvider *dpp);
- int open_notif_pool_ctx(const DoutPrefixProvider *dpp);
-
- int open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx& io_ctx,
- bool mostly_omap);
-
-
- ceph::mutex lock = ceph::make_mutex("rados_timer_lock");
- SafeTimer *timer;
-
- rgw::sal::RadosStore* driver = nullptr;
- RGWGC *gc = nullptr;
- RGWLC *lc;
- RGWObjectExpirer *obj_expirer;
- bool use_gc_thread;
- bool use_lc_thread;
- bool quota_threads;
- bool run_sync_thread;
- bool run_reshard_thread;
-
- RGWMetaNotifier *meta_notifier;
- RGWDataNotifier *data_notifier;
- RGWMetaSyncProcessorThread *meta_sync_processor_thread;
- RGWSyncTraceManager *sync_tracer = nullptr;
- std::map<rgw_zone_id, RGWDataSyncProcessorThread *> data_sync_processor_threads;
-
- boost::optional<rgw::BucketTrimManager> bucket_trim;
- RGWSyncLogTrimThread *sync_log_trimmer{nullptr};
-
- ceph::mutex meta_sync_thread_lock = ceph::make_mutex("meta_sync_thread_lock");
- ceph::mutex data_sync_thread_lock = ceph::make_mutex("data_sync_thread_lock");
-
- librados::IoCtx root_pool_ctx; // .rgw
-
- double inject_notify_timeout_probability = 0;
- unsigned max_notify_retries = 0;
-
- friend class RGWWatcher;
-
- ceph::mutex bucket_id_lock = ceph::make_mutex("rados_bucket_id");
-
- // This field represents the number of bucket index object shards
- uint32_t bucket_index_max_shards;
-
- std::string get_cluster_fsid(const DoutPrefixProvider *dpp, optional_yield y);
-
- int get_obj_head_ref(const DoutPrefixProvider *dpp, const rgw_placement_rule& target_placement_rule, const rgw_obj& obj, rgw_rados_ref *ref);
- int get_obj_head_ref(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref);
- int get_system_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref);
- uint64_t max_bucket_id;
-
- int get_olh_target_state(const DoutPrefixProvider *dpp, RGWObjectCtx& rctx,
- RGWBucketInfo& bucket_info, rgw::sal::Object* obj,
- RGWObjState *olh_state, RGWObjState **target_state,
- RGWObjManifest **target_manifest, optional_yield y);
- int get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, RGWObjState **state, RGWObjManifest** manifest,
- bool follow_olh, optional_yield y, bool assume_noent = false);
- int append_atomic_test(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, rgw::sal::Object* obj,
- librados::ObjectOperation& op, RGWObjState **state,
- RGWObjManifest** pmanifest, optional_yield y);
-
- int update_placement_map();
- int store_bucket_info(RGWBucketInfo& info, std::map<std::string, bufferlist> *pattrs, RGWObjVersionTracker *objv_tracker, bool exclusive);
-
- void remove_rgw_head_obj(librados::ObjectWriteOperation& op);
- void cls_obj_check_prefix_exist(librados::ObjectOperation& op, const std::string& prefix, bool fail_if_exist);
- void cls_obj_check_mtime(librados::ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type);
-protected:
- CephContext *cct;
-
- librados::Rados rados;
-
- using RGWChainedCacheImpl_bucket_info_entry = RGWChainedCacheImpl<bucket_info_entry>;
- RGWChainedCacheImpl_bucket_info_entry *binfo_cache;
-
- tombstone_cache_t *obj_tombstone_cache;
-
- librados::IoCtx gc_pool_ctx; // .rgw.gc
- librados::IoCtx lc_pool_ctx; // .rgw.lc
- librados::IoCtx objexp_pool_ctx;
- librados::IoCtx reshard_pool_ctx;
- librados::IoCtx notif_pool_ctx; // .rgw.notif
-
- bool pools_initialized;
-
- RGWQuotaHandler *quota_handler;
-
- RGWCoroutinesManagerRegistry *cr_registry;
-
- RGWSyncModuleInstanceRef sync_module;
- bool writeable_zone{false};
-
- RGWIndexCompletionManager *index_completion_manager{nullptr};
-
- bool use_cache{false};
- bool use_gc{true};
- bool use_datacache{false};
-
- int get_obj_head_ioctx(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx);
-public:
- RGWRados(): timer(NULL),
- gc(NULL), lc(NULL), obj_expirer(NULL), use_gc_thread(false), use_lc_thread(false), quota_threads(false),
- run_sync_thread(false), run_reshard_thread(false), meta_notifier(NULL),
- data_notifier(NULL), meta_sync_processor_thread(NULL),
- bucket_index_max_shards(0),
- max_bucket_id(0), cct(NULL),
- binfo_cache(NULL), obj_tombstone_cache(nullptr),
- pools_initialized(false),
- quota_handler(NULL),
- cr_registry(NULL),
- pctl(&ctl),
- reshard(NULL) {}
-
- RGWRados& set_use_cache(bool status) {
- use_cache = status;
- return *this;
- }
-
- RGWRados& set_use_gc(bool status) {
- use_gc = status;
- return *this;
- }
-
- RGWRados& set_use_datacache(bool status) {
- use_datacache = status;
- return *this;
- }
-
- bool get_use_datacache() {
- return use_datacache;
- }
-
- RGWLC *get_lc() {
- return lc;
- }
-
- RGWGC *get_gc() {
- return gc;
- }
-
- RGWRados& set_run_gc_thread(bool _use_gc_thread) {
- use_gc_thread = _use_gc_thread;
- return *this;
- }
-
- RGWRados& set_run_lc_thread(bool _use_lc_thread) {
- use_lc_thread = _use_lc_thread;
- return *this;
- }
-
- RGWRados& set_run_quota_threads(bool _run_quota_threads) {
- quota_threads = _run_quota_threads;
- return *this;
- }
-
- RGWRados& set_run_sync_thread(bool _run_sync_thread) {
- run_sync_thread = _run_sync_thread;
- return *this;
- }
-
- RGWRados& set_run_reshard_thread(bool _run_reshard_thread) {
- run_reshard_thread = _run_reshard_thread;
- return *this;
- }
-
- librados::IoCtx* get_lc_pool_ctx() {
- return &lc_pool_ctx;
- }
-
- librados::IoCtx& get_notif_pool_ctx() {
- return notif_pool_ctx;
- }
-
- void set_context(CephContext *_cct) {
- cct = _cct;
- }
- void set_store(rgw::sal::RadosStore* _driver) {
- driver = _driver;
- }
-
- RGWServices svc;
- RGWCtl ctl;
-
- RGWCtl *pctl{nullptr};
-
- /**
-   * AmazonS3 errors contain a HostId string, but it is an opaque base64 blob;
-   * we try to be more transparent. This has a wrapper so we can update it when
-   * the zonegroup/zone are changed.
- */
- std::string host_id;
-
- RGWReshard *reshard;
- std::shared_ptr<RGWReshardWait> reshard_wait;
-
- virtual ~RGWRados() = default;
-
- tombstone_cache_t *get_tombstone_cache() {
- return obj_tombstone_cache;
- }
- const RGWSyncModuleInstanceRef& get_sync_module() {
- return sync_module;
- }
- RGWSyncTraceManager *get_sync_tracer() {
- return sync_tracer;
- }
-
- int get_required_alignment(const DoutPrefixProvider *dpp, const rgw_pool& pool, uint64_t *alignment);
- void get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size);
- int get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment = nullptr);
- int get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment = nullptr);
-
- uint32_t get_max_bucket_shards() {
- return RGWSI_BucketIndex_RADOS::shards_max();
- }
-
-
- int get_raw_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref);
-
- int list_raw_objects_init(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& marker, RGWListRawObjsCtx *ctx);
- int list_raw_objects_next(const DoutPrefixProvider *dpp, const std::string& prefix_filter, int max,
- RGWListRawObjsCtx& ctx, std::list<std::string>& oids,
- bool *is_truncated);
- int list_raw_objects(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& prefix_filter, int max,
- RGWListRawObjsCtx& ctx, std::list<std::string>& oids,
- bool *is_truncated);
- std::string list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx);
-
- CephContext *ctx() { return cct; }
- /** do all necessary setup of the storage device */
- int init_begin(CephContext *_cct, const DoutPrefixProvider *dpp) {
- set_context(_cct);
- return init_begin(dpp);
- }
- /** Initialize the RADOS instance and prepare to do other ops */
- int init_svc(bool raw, const DoutPrefixProvider *dpp);
- int init_ctl(const DoutPrefixProvider *dpp);
- virtual int init_rados();
- int init_begin(const DoutPrefixProvider *dpp);
- int init_complete(const DoutPrefixProvider *dpp);
- void finalize();
-
- int register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type, const std::map<std::string, std::string>& meta);
- int update_service_map(const DoutPrefixProvider *dpp, std::map<std::string, std::string>&& status);
-
- /// list logs
- int log_list_init(const DoutPrefixProvider *dpp, const std::string& prefix, RGWAccessHandle *handle);
- int log_list_next(RGWAccessHandle handle, std::string *name);
-
- /// remove log
- int log_remove(const DoutPrefixProvider *dpp, const std::string& name);
-
- /// show log
- int log_show_init(const DoutPrefixProvider *dpp, const std::string& name, RGWAccessHandle *handle);
- int log_show_next(const DoutPrefixProvider *dpp, RGWAccessHandle handle, rgw_log_entry *entry);
-
- // log bandwidth info
- int log_usage(const DoutPrefixProvider *dpp, std::map<rgw_user_bucket, RGWUsageBatch>& usage_info);
- int read_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const std::string& bucket_name, uint64_t start_epoch, uint64_t end_epoch,
- uint32_t max_entries, bool *is_truncated, RGWUsageIter& read_iter, std::map<rgw_user_bucket,
- rgw_usage_log_entry>& usage);
- int trim_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const std::string& bucket_name, uint64_t start_epoch, uint64_t end_epoch);
- int clear_usage(const DoutPrefixProvider *dpp);
-
- int create_pool(const DoutPrefixProvider *dpp, const rgw_pool& pool);
-
- void create_bucket_id(std::string *bucket_id);
-
- bool get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool);
- bool obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj);
-
- int create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket,
- const std::string& zonegroup_id,
- const rgw_placement_rule& placement_rule,
- const std::string& swift_ver_location,
- const RGWQuotaInfo * pquota_info,
- std::map<std::string,bufferlist>& attrs,
- RGWBucketInfo& bucket_info,
- obj_version *pobjv,
- obj_version *pep_objv,
- ceph::real_time creation_time,
- rgw_bucket *master_bucket,
- uint32_t *master_num_shards,
- optional_yield y,
- const DoutPrefixProvider *dpp,
- bool exclusive = true);
-
- RGWCoroutinesManagerRegistry *get_cr_registry() { return cr_registry; }
-
- struct BucketShard {
- RGWRados *store;
- rgw_bucket bucket;
- int shard_id;
- RGWSI_RADOS::Obj bucket_obj;
-
- explicit BucketShard(RGWRados *_store) : store(_store), shard_id(-1) {}
- int init(const rgw_bucket& _bucket, const rgw_obj& obj,
- RGWBucketInfo* out, const DoutPrefixProvider *dpp);
- int init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj);
- int init(const DoutPrefixProvider *dpp,
- const RGWBucketInfo& bucket_info,
- const rgw::bucket_index_layout_generation& index, int sid);
-
- friend std::ostream& operator<<(std::ostream& out, const BucketShard& bs) {
- out << "BucketShard:{ bucket=" << bs.bucket <<
- ", shard_id=" << bs.shard_id <<
- ", bucket_ojb=" << bs.bucket_obj << "}";
- return out;
- }
- };
-
- class Object {
- RGWRados *store;
- rgw::sal::Bucket* bucket;
- RGWObjectCtx& ctx;
- rgw::sal::Object* obj;
-
- BucketShard bs;
-
- RGWObjState *state;
- RGWObjManifest *manifest;
-
- bool versioning_disabled;
-
- bool bs_initialized;
-
- const rgw_placement_rule *pmeta_placement_rule;
-
- protected:
- int get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, RGWObjManifest **pmanifest, bool follow_olh, optional_yield y, bool assume_noent = false);
- void invalidate_state();
-
- int prepare_atomic_modification(const DoutPrefixProvider *dpp, librados::ObjectWriteOperation& op, bool reset_obj, const std::string *ptag,
- const char *ifmatch, const char *ifnomatch, bool removal_op, bool modify_tail, optional_yield y);
- int complete_atomic_modification(const DoutPrefixProvider *dpp);
-
- public:
- Object(RGWRados *_store, rgw::sal::Bucket* _bucket, RGWObjectCtx& _ctx, rgw::sal::Object* _obj) : store(_store), bucket(_bucket),
- ctx(_ctx), obj(_obj), bs(store),
- state(NULL), manifest(nullptr), versioning_disabled(false),
- bs_initialized(false),
- pmeta_placement_rule(nullptr) {}
-
- RGWRados *get_store() { return store; }
- rgw_obj get_obj() { return obj->get_obj(); }
- RGWObjectCtx& get_ctx() { return ctx; }
- RGWBucketInfo& get_bucket_info() { return bucket->get_info(); }
- const std::string& get_instance() { return obj->get_instance(); }
- rgw::sal::Object* get_target() { return obj; }
- int get_manifest(const DoutPrefixProvider *dpp, RGWObjManifest **pmanifest, optional_yield y);
-
- int get_bucket_shard(BucketShard **pbs, const DoutPrefixProvider *dpp) {
- if (!bs_initialized) {
- int r =
- bs.init(bucket->get_key(), obj->get_obj(), nullptr /* no RGWBucketInfo */, dpp);
- if (r < 0) {
- return r;
- }
- bs_initialized = true;
- }
- *pbs = &bs;
- return 0;
- }
-
- void set_versioning_disabled(bool status) {
- versioning_disabled = status;
- }
-
- bool versioning_enabled() {
- return (!versioning_disabled && bucket->versioning_enabled());
- }
-
- void set_meta_placement_rule(const rgw_placement_rule *p) {
- pmeta_placement_rule = p;
- }
-
- const rgw_placement_rule& get_meta_placement_rule() {
- return pmeta_placement_rule ? *pmeta_placement_rule : bucket->get_placement_rule();
- }
-
- struct Read {
- RGWRados::Object *source;
-
- struct GetObjState {
- std::map<rgw_pool, librados::IoCtx> io_ctxs;
- rgw_pool cur_pool;
- librados::IoCtx *cur_ioctx{nullptr};
- rgw_obj obj;
- rgw_raw_obj head_obj;
- } state;
-
- struct ConditionParams {
- const ceph::real_time *mod_ptr;
- const ceph::real_time *unmod_ptr;
- bool high_precision_time;
- uint32_t mod_zone_id;
- uint64_t mod_pg_ver;
- const char *if_match;
- const char *if_nomatch;
-
- ConditionParams() :
- mod_ptr(NULL), unmod_ptr(NULL), high_precision_time(false), mod_zone_id(0), mod_pg_ver(0),
- if_match(NULL), if_nomatch(NULL) {}
- } conds;
-
- struct Params {
- ceph::real_time *lastmod;
- uint64_t *obj_size;
- std::map<std::string, bufferlist> *attrs;
- rgw_obj *target_obj;
-
- Params() : lastmod(nullptr), obj_size(nullptr), attrs(nullptr),
- target_obj(nullptr) {}
- } params;
-
- explicit Read(RGWRados::Object *_source) : source(_source) {}
-
- int prepare(optional_yield y, const DoutPrefixProvider *dpp);
- static int range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end);
- int read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y, const DoutPrefixProvider *dpp);
- int iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb, optional_yield y);
- int get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest, optional_yield y);
- };
-
- struct Write {
- RGWRados::Object *target;
-
- struct MetaParams {
- ceph::real_time *mtime;
- std::map<std::string, bufferlist>* rmattrs;
- const bufferlist *data;
- RGWObjManifest *manifest;
- const std::string *ptag;
- std::list<rgw_obj_index_key> *remove_objs;
- ceph::real_time set_mtime;
- rgw_user owner;
- RGWObjCategory category;
- int flags;
- const char *if_match;
- const char *if_nomatch;
- std::optional<uint64_t> olh_epoch;
- ceph::real_time delete_at;
- bool canceled;
- const std::string *user_data;
- rgw_zone_set *zones_trace;
- bool modify_tail;
- bool completeMultipart;
- bool appendable;
-
- MetaParams() : mtime(NULL), rmattrs(NULL), data(NULL), manifest(NULL), ptag(NULL),
- remove_objs(NULL), category(RGWObjCategory::Main), flags(0),
- if_match(NULL), if_nomatch(NULL), canceled(false), user_data(nullptr), zones_trace(nullptr),
- modify_tail(false), completeMultipart(false), appendable(false) {}
- } meta;
-
- explicit Write(RGWRados::Object *_target) : target(_target) {}
-
- int _do_write_meta(const DoutPrefixProvider *dpp,
- uint64_t size, uint64_t accounted_size,
- std::map<std::string, bufferlist>& attrs,
- bool modify_tail, bool assume_noent,
- void *index_op, optional_yield y);
- int write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size,
- std::map<std::string, bufferlist>& attrs, optional_yield y);
- int write_data(const char *data, uint64_t ofs, uint64_t len, bool exclusive);
- const req_state* get_req_state() {
- return nullptr; /* XXX dang Only used by LTTng, and it handles null anyway */
- }
- };
-
- struct Delete {
- RGWRados::Object *target;
-
- struct DeleteParams {
- rgw_user bucket_owner;
- int versioning_status; // versioning flags defined in enum RGWBucketFlags
- ACLOwner obj_owner; // needed for creation of deletion marker
- uint64_t olh_epoch;
- std::string marker_version_id;
- uint32_t bilog_flags;
- std::list<rgw_obj_index_key> *remove_objs;
- ceph::real_time expiration_time;
- ceph::real_time unmod_since;
- ceph::real_time mtime; /* for setting delete marker mtime */
- bool high_precision_time;
- rgw_zone_set *zones_trace;
- bool abortmp;
- uint64_t parts_accounted_size;
-
- DeleteParams() : versioning_status(0), olh_epoch(0), bilog_flags(0), remove_objs(NULL), high_precision_time(false), zones_trace(nullptr), abortmp(false), parts_accounted_size(0) {}
- } params;
-
- struct DeleteResult {
- bool delete_marker;
- std::string version_id;
-
- DeleteResult() : delete_marker(false) {}
- } result;
-
- explicit Delete(RGWRados::Object *_target) : target(_target) {}
-
- int delete_obj(optional_yield y, const DoutPrefixProvider *dpp);
- };
-
- struct Stat {
- RGWRados::Object *source;
-
- struct Result {
- rgw_obj obj;
- std::optional<RGWObjManifest> manifest;
- uint64_t size{0};
- struct timespec mtime {};
- std::map<std::string, bufferlist> attrs;
- } result;
-
- struct State {
- librados::IoCtx io_ctx;
- librados::AioCompletion *completion;
- int ret;
-
- State() : completion(NULL), ret(0) {}
- } state;
-
-
- explicit Stat(RGWRados::Object *_source) : source(_source) {}
-
- int stat_async(const DoutPrefixProvider *dpp);
- int wait(const DoutPrefixProvider *dpp);
- int stat();
- private:
- int finish(const DoutPrefixProvider *dpp);
- };
- };
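// Typical flow through the nested operation classes above (illustrative
// sketch only; store, bucket, obj_ctx, obj, etag, end_ofs, y and dpp are
// assumed to come from the request context, and real call sites fill in
// more params and error handling):
//
//   RGWRados::Object op_target(store, bucket, obj_ctx, obj);
//   RGWRados::Object::Read read_op(&op_target);
//   read_op.conds.if_match = etag.c_str();  // optional conditional read
//   int r = read_op.prepare(y, dpp);
//   if (r >= 0) {
//     bufferlist bl;
//     r = read_op.read(0, end_ofs, bl, y, dpp);
//   }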
-
- class Bucket {
- RGWRados *store;
- RGWBucketInfo bucket_info;
- rgw_bucket& bucket;
- int shard_id;
-
- public:
- Bucket(RGWRados *_store, const RGWBucketInfo& _bucket_info) : store(_store), bucket_info(_bucket_info), bucket(bucket_info.bucket),
- shard_id(RGW_NO_SHARD) {}
- RGWRados *get_store() { return store; }
- rgw_bucket& get_bucket() { return bucket; }
- RGWBucketInfo& get_bucket_info() { return bucket_info; }
-
- int update_bucket_id(const std::string& new_bucket_id, const DoutPrefixProvider *dpp);
-
- int get_shard_id() { return shard_id; }
- void set_shard_id(int id) {
- shard_id = id;
- }
-
- class UpdateIndex {
- RGWRados::Bucket *target;
- std::string optag;
- rgw_obj obj;
- uint16_t bilog_flags{0};
- BucketShard bs;
- bool bs_initialized{false};
- bool blind;
- bool prepared{false};
- rgw_zone_set *zones_trace{nullptr};
-
- int init_bs(const DoutPrefixProvider *dpp) {
- int r =
- bs.init(target->get_bucket(), obj, &target->bucket_info, dpp);
- if (r < 0) {
- return r;
- }
- bs_initialized = true;
- return 0;
- }
-
- void invalidate_bs() {
- bs_initialized = false;
- }
-
- int guard_reshard(const DoutPrefixProvider *dpp, const rgw_obj& obj_instance, BucketShard **pbs, std::function<int(BucketShard *)> call);
- public:
-
- UpdateIndex(RGWRados::Bucket *_target, const rgw_obj& _obj) : target(_target), obj(_obj),
- bs(target->get_store()) {
- blind = (target->get_bucket_info().layout.current_index.layout.type == rgw::BucketIndexType::Indexless);
- }
-
- int get_bucket_shard(BucketShard **pbs, const DoutPrefixProvider *dpp) {
- if (!bs_initialized) {
- int r = init_bs(dpp);
- if (r < 0) {
- return r;
- }
- }
- *pbs = &bs;
- return 0;
- }
-
- void set_bilog_flags(uint16_t flags) {
- bilog_flags = flags;
- }
-
- void set_zones_trace(rgw_zone_set *_zones_trace) {
- zones_trace = _zones_trace;
- }
-
- int prepare(const DoutPrefixProvider *dpp, RGWModifyOp, const std::string *write_tag, optional_yield y);
- int complete(const DoutPrefixProvider *dpp, int64_t poolid, uint64_t epoch, uint64_t size,
- uint64_t accounted_size, ceph::real_time& ut,
- const std::string& etag, const std::string& content_type,
- const std::string& storage_class,
- bufferlist *acl_bl, RGWObjCategory category,
- std::list<rgw_obj_index_key> *remove_objs, const std::string *user_data = nullptr, bool appendable = false);
- int complete_del(const DoutPrefixProvider *dpp,
- int64_t poolid, uint64_t epoch,
- ceph::real_time& removed_mtime, /* mtime of removed object */
- std::list<rgw_obj_index_key> *remove_objs);
- int cancel(const DoutPrefixProvider *dpp,
- std::list<rgw_obj_index_key> *remove_objs);
-
- const std::string *get_optag() { return &optag; }
-
- bool is_prepared() { return prepared; }
- }; // class UpdateIndex
-
- class List {
- protected:
- // absolute maximum number of objects that
- // list_objects_(un)ordered can return
- static constexpr int64_t bucket_list_objects_absolute_max = 25000;
-
- RGWRados::Bucket *target;
- rgw_obj_key next_marker;
-
- int list_objects_ordered(const DoutPrefixProvider *dpp,
- int64_t max,
- std::vector<rgw_bucket_dir_entry> *result,
- std::map<std::string, bool> *common_prefixes,
- bool *is_truncated,
- optional_yield y);
- int list_objects_unordered(const DoutPrefixProvider *dpp,
- int64_t max,
- std::vector<rgw_bucket_dir_entry> *result,
- std::map<std::string, bool> *common_prefixes,
- bool *is_truncated,
- optional_yield y);
-
- public:
-
- struct Params {
- std::string prefix;
- std::string delim;
- rgw_obj_key marker;
- rgw_obj_key end_marker;
- std::string ns;
- bool enforce_ns;
- RGWAccessListFilter* access_list_filter;
- RGWBucketListNameFilter force_check_filter;
- bool list_versions;
- bool allow_unordered;
-
- Params() :
- enforce_ns(true),
- access_list_filter(nullptr),
- list_versions(false),
- allow_unordered(false)
- {}
- } params;
-
- explicit List(RGWRados::Bucket *_target) : target(_target) {}
-
- int list_objects(const DoutPrefixProvider *dpp, int64_t max,
- std::vector<rgw_bucket_dir_entry> *result,
- std::map<std::string, bool> *common_prefixes,
- bool *is_truncated,
- optional_yield y) {
- if (params.allow_unordered) {
- return list_objects_unordered(dpp, max, result, common_prefixes,
- is_truncated, y);
- } else {
- return list_objects_ordered(dpp, max, result, common_prefixes,
- is_truncated, y);
- }
- }
- rgw_obj_key& get_next_marker() {
- return next_marker;
- }
- }; // class List
- }; // class Bucket
-
- int on_last_entry_in_listing(const DoutPrefixProvider *dpp,
- RGWBucketInfo& bucket_info,
- const std::string& obj_prefix,
- const std::string& obj_delim,
- std::function<int(const rgw_bucket_dir_entry&)> handler);
-
- bool swift_versioning_enabled(rgw::sal::Bucket* bucket) const;
-
- int swift_versioning_copy(RGWObjectCtx& obj_ctx, /* in/out */
- const rgw_user& user, /* in */
- rgw::sal::Bucket* bucket, /* in */
- rgw::sal::Object* obj, /* in */
- const DoutPrefixProvider *dpp, /* in/out */
- optional_yield y); /* in */
- int swift_versioning_restore(RGWObjectCtx& obj_ctx, /* in/out */
- const rgw_user& user, /* in */
- rgw::sal::Bucket* bucket, /* in */
- rgw::sal::Object* obj, /* in */
- bool& restored, /* out */
- const DoutPrefixProvider *dpp); /* in/out */
- int copy_obj_to_remote_dest(const DoutPrefixProvider *dpp,
- RGWObjState *astate,
- std::map<std::string, bufferlist>& src_attrs,
- RGWRados::Object::Read& read_op,
- const rgw_user& user_id,
- rgw::sal::Object* dest_obj,
- ceph::real_time *mtime);
-
- enum AttrsMod {
- ATTRSMOD_NONE = 0,
- ATTRSMOD_REPLACE = 1,
- ATTRSMOD_MERGE = 2
- };
-
- D3nDataCache* d3n_data_cache{nullptr};
-
- int rewrite_obj(rgw::sal::Object* obj, const DoutPrefixProvider *dpp, optional_yield y);
-
- int stat_remote_obj(const DoutPrefixProvider *dpp,
- RGWObjectCtx& obj_ctx,
- const rgw_user& user_id,
- req_info *info,
- const rgw_zone_id& source_zone,
- rgw::sal::Object* src_obj,
- const RGWBucketInfo *src_bucket_info,
- real_time *src_mtime,
- uint64_t *psize,
- const real_time *mod_ptr,
- const real_time *unmod_ptr,
- bool high_precision_time,
- const char *if_match,
- const char *if_nomatch,
- std::map<std::string, bufferlist> *pattrs,
- std::map<std::string, std::string> *pheaders,
- std::string *version_id,
- std::string *ptag,
- std::string *petag);
-
- int fetch_remote_obj(RGWObjectCtx& obj_ctx,
- const rgw_user& user_id,
- req_info *info,
- const rgw_zone_id& source_zone,
- rgw::sal::Object* dest_obj,
- rgw::sal::Object* src_obj,
- rgw::sal::Bucket* dest_bucket,
- rgw::sal::Bucket* src_bucket,
- std::optional<rgw_placement_rule> dest_placement,
- ceph::real_time *src_mtime,
- ceph::real_time *mtime,
- const ceph::real_time *mod_ptr,
- const ceph::real_time *unmod_ptr,
- bool high_precision_time,
- const char *if_match,
- const char *if_nomatch,
- AttrsMod attrs_mod,
- bool copy_if_newer,
- rgw::sal::Attrs& attrs,
- RGWObjCategory category,
- std::optional<uint64_t> olh_epoch,
- ceph::real_time delete_at,
- std::string *ptag,
- std::string *petag,
- void (*progress_cb)(off_t, void *),
- void *progress_data,
- const DoutPrefixProvider *dpp,
- RGWFetchObjFilter *filter,
-                       rgw_zone_set *zones_trace = nullptr,
-                       std::optional<uint64_t>* bytes_transferred = nullptr);
- /**
- * Copy an object.
- * dest_obj: the object to copy into
- * src_obj: the object to copy from
- * attrs: usage depends on attrs_mod parameter
- * attrs_mod: the modification mode of the attrs, may have the following values:
- * ATTRSMOD_NONE - the attributes of the source object will be
- * copied without modifications, attrs parameter is ignored;
- * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
- * parameter, source object attributes are not copied;
- * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
- * are overwritten by values contained in attrs parameter.
- * Returns: 0 on success, -ERR# otherwise.
- */
- int copy_obj(RGWObjectCtx& obj_ctx,
- const rgw_user& user_id,
- req_info *info,
- const rgw_zone_id& source_zone,
- rgw::sal::Object* dest_obj,
- rgw::sal::Object* src_obj,
- rgw::sal::Bucket* dest_bucket,
- rgw::sal::Bucket* src_bucket,
- const rgw_placement_rule& dest_placement,
- ceph::real_time *src_mtime,
- ceph::real_time *mtime,
- const ceph::real_time *mod_ptr,
- const ceph::real_time *unmod_ptr,
- bool high_precision_time,
- const char *if_match,
- const char *if_nomatch,
- AttrsMod attrs_mod,
- bool copy_if_newer,
- std::map<std::string, bufferlist>& attrs,
- RGWObjCategory category,
- uint64_t olh_epoch,
- ceph::real_time delete_at,
- std::string *version_id,
- std::string *ptag,
- std::string *petag,
- void (*progress_cb)(off_t, void *),
- void *progress_data,
- const DoutPrefixProvider *dpp,
- optional_yield y);
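// The ATTRSMOD_* semantics documented above can be summarized with a small
// stand-alone helper (hypothetical, not part of RGWRados; attr values are
// bufferlists in the real code):
//
//   using Attrs = std::map<std::string, ceph::bufferlist>;
//   Attrs apply_attrs_mod(RGWRados::AttrsMod mode, Attrs src, const Attrs& in) {
//     switch (mode) {
//     case RGWRados::ATTRSMOD_NONE:    return src;  // 'in' is ignored
//     case RGWRados::ATTRSMOD_REPLACE: return in;   // source attrs dropped
//     case RGWRados::ATTRSMOD_MERGE:                // 'in' wins on conflict
//       for (const auto& [k, v] : in) src[k] = v;
//       return src;
//     }
//     return src;
//   }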
-
- int copy_obj_data(RGWObjectCtx& obj_ctx,
- rgw::sal::Bucket* bucket,
- const rgw_placement_rule& dest_placement,
- RGWRados::Object::Read& read_op, off_t end,
- rgw::sal::Object* dest_obj,
- ceph::real_time *mtime,
- ceph::real_time set_mtime,
- std::map<std::string, bufferlist>& attrs,
- uint64_t olh_epoch,
- ceph::real_time delete_at,
- std::string *petag,
- const DoutPrefixProvider *dpp,
- optional_yield y);
-
- int transition_obj(RGWObjectCtx& obj_ctx,
- rgw::sal::Bucket* bucket,
- rgw::sal::Object& obj,
- const rgw_placement_rule& placement_rule,
- const real_time& mtime,
- uint64_t olh_epoch,
- const DoutPrefixProvider *dpp,
- optional_yield y);
-
- int check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y);
-
- /**
- * Delete a bucket.
- * bucket: the name of the bucket to delete
- * Returns 0 on success, -ERR# otherwise.
- */
- int delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp, bool check_empty = true);
-
- void wakeup_meta_sync_shards(std::set<int>& shard_ids);
-
- void wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& entries);
-
- RGWMetaSyncStatusManager* get_meta_sync_manager();
- RGWDataSyncStatusManager* get_data_sync_manager(const rgw_zone_id& source_zone);
-
- int set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner, const DoutPrefixProvider *dpp);
- int set_buckets_enabled(std::vector<rgw_bucket>& buckets, bool enabled, const DoutPrefixProvider *dpp);
- int bucket_suspended(const DoutPrefixProvider *dpp, rgw_bucket& bucket, bool *suspended);
-
- /** Delete an object.*/
- int delete_obj(rgw::sal::Driver* driver,
- const DoutPrefixProvider *dpp,
- const RGWBucketInfo& bucket_owner,
- const rgw_obj& src_obj,
- int versioning_status, // versioning flags defined in enum RGWBucketFlags
- uint16_t bilog_flags = 0,
- const ceph::real_time& expiration_time = ceph::real_time(),
- rgw_zone_set *zones_trace = nullptr);
- int delete_obj(const DoutPrefixProvider *dpp,
- const RGWBucketInfo& bucket_owner,
- rgw::sal::Object* src_obj,
- int versioning_status, // versioning flags defined in enum RGWBucketFlags
- uint16_t bilog_flags = 0,
- const ceph::real_time& expiration_time = ceph::real_time(),
- rgw_zone_set *zones_trace = nullptr);
-
- int delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj);
-
- /** Remove an object from the bucket index */
- int delete_obj_index(const rgw_obj& obj, ceph::real_time mtime, const DoutPrefixProvider *dpp);
-
- /**
- * Set an attr on an object.
- * bucket: name of the bucket holding the object
- * obj: name of the object to set the attr on
- * name: the attr to set
- * bl: the contents of the attr
- * Returns: 0 on success, -ERR# otherwise.
- */
- int set_attr(const DoutPrefixProvider *dpp, RGWObjectCtx* ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, const char *name, bufferlist& bl);
-
- int set_attrs(const DoutPrefixProvider *dpp, void *ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj,
- std::map<std::string, bufferlist>& attrs,
- std::map<std::string, bufferlist>* rmattrs,
- optional_yield y);
-
- int get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, RGWObjState **state, RGWObjManifest** manifest,
- bool follow_olh, optional_yield y, bool assume_noent = false);
- int get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, RGWObjState **state, RGWObjManifest** manifest, optional_yield y) {
- return get_obj_state(dpp, rctx, bucket_info, obj, state, manifest, true, y);
- }
-
- using iterate_obj_cb = int (*)(const DoutPrefixProvider*, const rgw_raw_obj&, off_t, off_t,
- off_t, bool, RGWObjState*, void*);
-
- int iterate_obj(const DoutPrefixProvider *dpp, RGWObjectCtx& ctx, RGWBucketInfo& bucket_info,
- rgw::sal::Object* obj, off_t ofs, off_t end,
- uint64_t max_chunk_size, iterate_obj_cb cb, void *arg,
- optional_yield y);
-
- int append_atomic_test(const DoutPrefixProvider *dpp, const RGWObjState* astate, librados::ObjectOperation& op);
-
- virtual int get_obj_iterate_cb(const DoutPrefixProvider *dpp,
- const rgw_raw_obj& read_obj, off_t obj_ofs,
- off_t read_ofs, off_t len, bool is_head_obj,
- RGWObjState *astate, void *arg);
-
- /**
- * a simple object read without keeping state
- */
-
- int raw_obj_stat(const DoutPrefixProvider *dpp,
- rgw_raw_obj& obj, uint64_t *psize, ceph::real_time *pmtime, uint64_t *epoch,
- std::map<std::string, bufferlist> *attrs, bufferlist *first_chunk,
- RGWObjVersionTracker *objv_tracker, optional_yield y);
-
- int obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectWriteOperation *op);
- int obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectReadOperation *op);
-
- int guard_reshard(const DoutPrefixProvider *dpp,
- BucketShard *bs,
- const rgw_obj& obj_instance,
- RGWBucketInfo& bucket_info,
- std::function<int(BucketShard *)> call);
- int block_while_resharding(RGWRados::BucketShard *bs,
- const rgw_obj& obj_instance,
- RGWBucketInfo& bucket_info,
- optional_yield y,
- const DoutPrefixProvider *dpp);
-
- void bucket_index_guard_olh_op(const DoutPrefixProvider *dpp, RGWObjState& olh_state, librados::ObjectOperation& op);
- int olh_init_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::string *op_tag);
- int olh_init_modification_impl(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::string *op_tag);
- int bucket_index_link_olh(const DoutPrefixProvider *dpp,
- RGWBucketInfo& bucket_info, RGWObjState& olh_state,
- const rgw_obj& obj_instance, bool delete_marker,
- const std::string& op_tag, struct rgw_bucket_dir_entry_meta *meta,
- uint64_t olh_epoch,
- ceph::real_time unmod_since, bool high_precision_time,
- rgw_zone_set *zones_trace = nullptr,
- bool log_data_change = false);
- int bucket_index_unlink_instance(const DoutPrefixProvider *dpp,
- RGWBucketInfo& bucket_info,
- const rgw_obj& obj_instance,
- const std::string& op_tag, const std::string& olh_tag,
- uint64_t olh_epoch, rgw_zone_set *zones_trace = nullptr);
- int bucket_index_read_olh_log(const DoutPrefixProvider *dpp,
- RGWBucketInfo& bucket_info, RGWObjState& state,
- const rgw_obj& obj_instance, uint64_t ver_marker,
- std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> > *log, bool *is_truncated);
- int bucket_index_trim_olh_log(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& obj_state, const rgw_obj& obj_instance, uint64_t ver);
- int bucket_index_clear_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance);
- int apply_olh_log(const DoutPrefixProvider *dpp, RGWObjState& obj_state, RGWBucketInfo& bucket_info, const rgw::sal::Object* obj,
- bufferlist& obj_tag, std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> >& log,
- uint64_t *plast_ver, rgw_zone_set *zones_trace = nullptr);
- int update_olh(const DoutPrefixProvider *dpp, RGWObjState *state, RGWBucketInfo& bucket_info, const rgw::sal::Object* obj, rgw_zone_set *zones_trace = nullptr);
- int set_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
- uint64_t olh_epoch, ceph::real_time unmod_since, bool high_precision_time,
- optional_yield y, rgw_zone_set *zones_trace = nullptr, bool log_data_change = false);
- int repair_olh(const DoutPrefixProvider *dpp, RGWObjState* state, const RGWBucketInfo& bucket_info,
- const rgw_obj& obj);
- int unlink_obj_instance(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, rgw::sal::Object* target_obj,
- uint64_t olh_epoch, optional_yield y, rgw_zone_set *zones_trace = nullptr);
-
- void check_pending_olh_entries(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist>& pending_entries, std::map<std::string, bufferlist> *rm_pending_entries);
- int remove_olh_pending_entries(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::map<std::string, bufferlist>& pending_attrs);
- int follow_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjectCtx& ctx, RGWObjState *state, rgw::sal::Object* olh_obj, rgw_obj *target);
- int get_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh);
-
- void gen_rand_obj_instance_name(rgw_obj_key *target_key);
- void gen_rand_obj_instance_name(rgw_obj *target);
-
- int update_containers_stats(std::map<std::string, RGWBucketEnt>& m, const DoutPrefixProvider *dpp);
- int append_async(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, size_t size, bufferlist& bl);
-
-public:
- void set_atomic(void *ctx, rgw_obj& obj) {
- RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
- rctx->set_atomic(obj);
- }
- void set_prefetch_data(void *ctx, const rgw_obj& obj) {
- RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
- rctx->set_prefetch_data(obj);
- }
- void set_compressed(void *ctx, const rgw_obj& obj) {
- RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
- rctx->set_compressed(obj);
- }
- int decode_policy(const DoutPrefixProvider *dpp, bufferlist& bl, ACLOwner *owner);
- int get_bucket_stats(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, std::string *bucket_ver, std::string *master_ver,
- std::map<RGWObjCategory, RGWStorageStats>& stats, std::string *max_marker, bool* syncstopped = NULL);
- int get_bucket_stats_async(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetBucketStats_CB *cb);
-
- int put_bucket_instance_info(RGWBucketInfo& info, bool exclusive, ceph::real_time mtime, std::map<std::string, bufferlist> *pattrs, const DoutPrefixProvider *dpp);
- /* xxx dang obj_ctx -> svc */
- int get_bucket_instance_info(const std::string& meta_key, RGWBucketInfo& info, ceph::real_time *pmtime, std::map<std::string, bufferlist> *pattrs, optional_yield y, const DoutPrefixProvider *dpp);
- int get_bucket_instance_info(const rgw_bucket& bucket, RGWBucketInfo& info, ceph::real_time *pmtime, std::map<std::string, bufferlist> *pattrs, optional_yield y, const DoutPrefixProvider *dpp);
-
- static void make_bucket_entry_name(const std::string& tenant_name, const std::string& bucket_name, std::string& bucket_entry);
-
- int get_bucket_info(RGWServices *svc,
- const std::string& tenant_name, const std::string& bucket_name,
- RGWBucketInfo& info,
- ceph::real_time *pmtime, optional_yield y,
- const DoutPrefixProvider *dpp, std::map<std::string, bufferlist> *pattrs = NULL);
-
- // Returns 0 on successful refresh. Returns error code if there was
- // an error or the version stored on the OSD is the same as that
- // presented in the BucketInfo structure.
- //
- int try_refresh_bucket_info(RGWBucketInfo& info,
- ceph::real_time *pmtime,
- const DoutPrefixProvider *dpp,
- std::map<std::string, bufferlist> *pattrs = nullptr);
-
- int put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, ceph::real_time mtime, obj_version *pep_objv,
- std::map<std::string, bufferlist> *pattrs, bool create_entry_point,
- const DoutPrefixProvider *dpp);
-
- int cls_obj_prepare_op(const DoutPrefixProvider *dpp, BucketShard& bs, RGWModifyOp op, std::string& tag, rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *zones_trace = nullptr);
- int cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, std::string& tag, int64_t pool, uint64_t epoch,
- rgw_bucket_dir_entry& ent, RGWObjCategory category, std::list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
- int cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, std::string& tag, int64_t pool, uint64_t epoch, rgw_bucket_dir_entry& ent,
- RGWObjCategory category, std::list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
- int cls_obj_complete_del(BucketShard& bs, std::string& tag, int64_t pool, uint64_t epoch, rgw_obj& obj,
- ceph::real_time& removed_mtime, std::list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
- int cls_obj_complete_cancel(BucketShard& bs, std::string& tag, rgw_obj& obj,
- std::list<rgw_obj_index_key> *remove_objs,
- uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
- int cls_obj_set_bucket_tag_timeout(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, uint64_t timeout);
-
- using ent_map_t =
- boost::container::flat_map<std::string, rgw_bucket_dir_entry>;
-
- int cls_bucket_list_ordered(const DoutPrefixProvider *dpp,
- RGWBucketInfo& bucket_info,
- const rgw::bucket_index_layout_generation& idx_layout,
- const int shard_id,
- const rgw_obj_index_key& start_after,
- const std::string& prefix,
- const std::string& delimiter,
- const uint32_t num_entries,
- const bool list_versions,
- const uint16_t exp_factor, // 0 means ignore
- ent_map_t& m,
- bool* is_truncated,
- bool* cls_filtered,
- rgw_obj_index_key *last_entry,
- optional_yield y,
- RGWBucketListNameFilter force_check_filter = {});
- int cls_bucket_list_unordered(const DoutPrefixProvider *dpp,
- RGWBucketInfo& bucket_info,
- const rgw::bucket_index_layout_generation& idx_layout,
- int shard_id,
- const rgw_obj_index_key& start_after,
- const std::string& prefix,
- uint32_t num_entries,
- bool list_versions,
- std::vector<rgw_bucket_dir_entry>& ent_list,
- bool *is_truncated,
- rgw_obj_index_key *last_entry,
- optional_yield y,
- RGWBucketListNameFilter force_check_filter = {});
- int cls_bucket_head(const DoutPrefixProvider *dpp,
- const RGWBucketInfo& bucket_info,
- const rgw::bucket_index_layout_generation& idx_layout,
- int shard_id, std::vector<rgw_bucket_dir_header>& headers,
- std::map<int, std::string> *bucket_instance_ids = NULL);
- int cls_bucket_head_async(const DoutPrefixProvider *dpp,
- const RGWBucketInfo& bucket_info,
- const rgw::bucket_index_layout_generation& idx_layout,
- int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio);
- int bi_get_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_bucket_dir_entry *dirent);
- int bi_get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_bucket_olh_entry *olh);
- int bi_get(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry);
- void bi_put(librados::ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry);
- int bi_put(BucketShard& bs, rgw_cls_bi_entry& entry);
- int bi_put(const DoutPrefixProvider *dpp, rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry);
- int bi_list(const DoutPrefixProvider *dpp,
- const RGWBucketInfo& bucket_info,
- int shard_id,
- const std::string& filter_obj,
- const std::string& marker,
- uint32_t max,
- std::list<rgw_cls_bi_entry> *entries,
- bool *is_truncated);
- int bi_list(BucketShard& bs, const std::string& filter_obj, const std::string& marker, uint32_t max, std::list<rgw_cls_bi_entry> *entries, bool *is_truncated);
- int bi_list(const DoutPrefixProvider *dpp, rgw_bucket& bucket, const std::string& obj_name, const std::string& marker, uint32_t max,
- std::list<rgw_cls_bi_entry> *entries, bool *is_truncated);
- int bi_remove(const DoutPrefixProvider *dpp, BucketShard& bs);
-
- int cls_obj_usage_log_add(const DoutPrefixProvider *dpp, const std::string& oid, rgw_usage_log_info& info);
- int cls_obj_usage_log_read(const DoutPrefixProvider *dpp, const std::string& oid, const std::string& user, const std::string& bucket, uint64_t start_epoch,
- uint64_t end_epoch, uint32_t max_entries, std::string& read_iter,
- std::map<rgw_user_bucket, rgw_usage_log_entry>& usage, bool *is_truncated);
- int cls_obj_usage_log_trim(const DoutPrefixProvider *dpp, const std::string& oid, const std::string& user, const std::string& bucket, uint64_t start_epoch,
- uint64_t end_epoch);
- int cls_obj_usage_log_clear(const DoutPrefixProvider *dpp, std::string& oid);
-
- int get_target_shard_id(const rgw::bucket_index_normal_layout& layout, const std::string& obj_key, int *shard_id);
-
- int lock_exclusive(const rgw_pool& pool, const std::string& oid, ceph::timespan& duration, rgw_zone_id& zone_id, std::string& owner_id);
- int unlock(const rgw_pool& pool, const std::string& oid, rgw_zone_id& zone_id, std::string& owner_id);
-
- void update_gc_chain(const DoutPrefixProvider *dpp, rgw_obj head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain);
- std::tuple<int, std::optional<cls_rgw_obj_chain>> send_chain_to_gc(cls_rgw_obj_chain& chain, const std::string& tag);
- void delete_objs_inline(const DoutPrefixProvider *dpp, cls_rgw_obj_chain& chain, const std::string& tag);
- int gc_operate(const DoutPrefixProvider *dpp, std::string& oid, librados::ObjectWriteOperation *op);
- int gc_aio_operate(const std::string& oid, librados::AioCompletion *c,
- librados::ObjectWriteOperation *op);
- int gc_operate(const DoutPrefixProvider *dpp, std::string& oid, librados::ObjectReadOperation *op, bufferlist *pbl);
-
- int list_gc_objs(int *index, std::string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue);
- int process_gc(bool expired_only);
- bool process_expire_objects(const DoutPrefixProvider *dpp);
- int defer_gc(const DoutPrefixProvider *dpp, RGWObjectCtx* ctx, RGWBucketInfo& bucket_info, rgw::sal::Object* obj, optional_yield y);
-
- int process_lc(const std::unique_ptr<rgw::sal::Bucket>& optional_bucket);
- int list_lc_progress(std::string& marker, uint32_t max_entries,
- std::vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& progress_map,
- int& index);
-
- int bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
- std::map<RGWObjCategory, RGWStorageStats> *existing_stats,
- std::map<RGWObjCategory, RGWStorageStats> *calculated_stats);
- int bucket_rebuild_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info);
- int bucket_set_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry);
- int remove_objs_from_index(const DoutPrefixProvider *dpp,
- RGWBucketInfo& bucket_info,
- const std::list<rgw_obj_index_key>& oid_list);
- int move_rados_obj(const DoutPrefixProvider *dpp,
- librados::IoCtx& src_ioctx,
- const std::string& src_oid, const std::string& src_locator,
- librados::IoCtx& dst_ioctx,
- const std::string& dst_oid, const std::string& dst_locator);
- int fix_head_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key);
- int fix_tail_obj_locator(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
- rgw_obj_key& key, bool fix, bool *need_fix, optional_yield y);
-
- int check_quota(const DoutPrefixProvider *dpp, const rgw_user& bucket_owner, rgw_bucket& bucket,
- RGWQuota& quota, uint64_t obj_size,
- optional_yield y, bool check_size_only = false);
-
- int check_bucket_shards(const RGWBucketInfo& bucket_info, const rgw_bucket& bucket,
- uint64_t num_objs, const DoutPrefixProvider *dpp);
-
- int add_bucket_to_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, uint32_t new_num_shards);
-
- uint64_t instance_id();
-
- librados::Rados* get_rados_handle();
-
- int delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, std::list<librados::AioCompletion *>& handles);
- int delete_obj_aio(const DoutPrefixProvider *dpp, const rgw_obj& obj, RGWBucketInfo& info, RGWObjState *astate,
- std::list<librados::AioCompletion *>& handles, bool keep_index_consistent,
- optional_yield y);
-
- private:
- /**
- * Check the actual on-disk state of the object specified
- * by list_state, and fill in the time and size of object.
- * Then append any changes to suggested_updates for
- * the rgw class' dir_suggest_changes function.
- *
- * Note that this can maul list_state; don't use it afterwards. Also
- * it expects object to already be filled in from list_state; it only
- * sets the size and mtime.
- *
- * Returns 0 on success, -ENOENT if the object doesn't exist on disk,
- * and -errno on other failures. (-ENOENT is not a failure, and it
- * will encode that info as a suggested update.)
- */
- int check_disk_state(const DoutPrefixProvider *dpp,
- librados::IoCtx io_ctx,
- RGWBucketInfo& bucket_info,
- rgw_bucket_dir_entry& list_state,
- rgw_bucket_dir_entry& object,
- bufferlist& suggested_updates,
- optional_yield y);
-
- /**
- * Init pool iteration
- * pool: pool to use for the ctx initialization
- * ctx: context object to use for the iteration
- * Returns: 0 on success, -ERR# otherwise.
- */
- int pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, RGWPoolIterCtx& ctx);
-
- /**
- * Init pool iteration
- * pool: pool to use
- * cursor: position to start iteration
- * ctx: context object to use for the iteration
- * Returns: 0 on success, -ERR# otherwise.
- */
- int pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& cursor, RGWPoolIterCtx& ctx);
-
- /**
- * Get pool iteration position
- * ctx: context object to use for the iteration
- * Returns: std::string representation of position
- */
- std::string pool_iterate_get_cursor(RGWPoolIterCtx& ctx);
-
- /**
- * Iterate over pool return object names, use optional filter
- * ctx: iteration context, initialized with pool_iterate_begin()
- * num: max number of objects to return
- * objs: a vector that the results will append into
-   * is_truncated: if not NULL, will hold true iff iteration is incomplete
- * filter: if not NULL, will be used to filter returned objects
- * Returns: 0 on success, -ERR# otherwise.
- */
- int pool_iterate(const DoutPrefixProvider *dpp, RGWPoolIterCtx& ctx, uint32_t num,
- std::vector<rgw_bucket_dir_entry>& objs,
- bool *is_truncated, RGWAccessListFilter *filter);
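// Typical driving loop for the pool iteration helpers above (hypothetical
// caller; these members are private, so the real loops live inside RGWRados):
//
//   RGWPoolIterCtx ictx;
//   int r = pool_iterate_begin(dpp, pool, ictx);
//   if (r < 0) return r;
//   bool truncated = true;
//   while (truncated) {
//     std::vector<rgw_bucket_dir_entry> objs;
//     r = pool_iterate(dpp, ictx, 100, objs, &truncated, nullptr /* no filter */);
//     if (r < 0) return r;
//     // ...consume objs...
//   }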
-
- uint64_t next_bucket_id();
-
- /**
- * This is broken out to facilitate unit testing.
- */
- static uint32_t calc_ordered_bucket_list_per_shard(uint32_t num_entries,
- uint32_t num_shards);
-};
-
-
-struct get_obj_data {
- RGWRados* rgwrados;
- RGWGetDataCB* client_cb = nullptr;
- rgw::Aio* aio;
- uint64_t offset; // next offset to write to client
- rgw::AioResultList completed; // completed read results, sorted by offset
- optional_yield yield;
-
- get_obj_data(RGWRados* rgwrados, RGWGetDataCB* cb, rgw::Aio* aio,
- uint64_t offset, optional_yield yield)
- : rgwrados(rgwrados), client_cb(cb), aio(aio), offset(offset), yield(yield) {}
-  ~get_obj_data() {
-    if (rgwrados->get_use_datacache()) {
-      // act as a barrier: wait for any thread still holding d3n_lock to
-      // finish with d3n_get_data before the members are destroyed
-      const std::lock_guard l(d3n_get_data.d3n_lock);
-    }
-  }
-
- D3nGetObjData d3n_get_data;
- std::atomic_bool d3n_bypass_cache_write{false};
-
- int flush(rgw::AioResultList&& results);
-
- void cancel() {
- // wait for all completions to drain and ignore the results
- aio->drain();
- }
-
- int drain() {
- auto c = aio->wait();
- while (!c.empty()) {
- int r = flush(std::move(c));
- if (r < 0) {
- cancel();
- return r;
- }
- c = aio->wait();
- }
- return flush(std::move(c));
- }
-};
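// Design note: flush() hands completed reads to client_cb in offset order;
// drain() alternates aio->wait() and flush() until no completions remain,
// while cancel() drains outstanding aio and discards the results after an
// error.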
-
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_REALM_RELOADER_H
-#define RGW_REALM_RELOADER_H
+#pragma once
#include "rgw_realm_watcher.h"
#include "common/Cond.h"
ceph::condition_variable cond; //< to signal reload() after an invalid realm config
C_Reload* reload_scheduled; //< reload() context if scheduled
};
-
-#endif // RGW_REALM_RELOADER_H
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_REALM_WATCHER_H
-#define RGW_REALM_WATCHER_H
+#pragma once
#include "include/rados/librados.hpp"
#include "include/ceph_assert.h"
std::map<RGWRealmNotify, Watcher&> watchers;
};
-
-#endif // RGW_REALM_WATCHER_H
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_REQUEST_H
-#define RGW_REQUEST_H
+#pragma once
#include "rgw_common.h"
#include "rgw_acl.h"
: RGWRequest(req_id), method(_m), resource(_r), content_length(_cl),
fail_flag(ff) {}
};
-
-#endif /* RGW_REQUEST_H */
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include <limits>
-#include <sstream>
-
-#include "rgw_zone.h"
-#include "driver/rados/rgw_bucket.h"
-#include "rgw_reshard.h"
-#include "rgw_sal.h"
-#include "rgw_sal_rados.h"
-#include "cls/rgw/cls_rgw_client.h"
-#include "cls/lock/cls_lock_client.h"
-#include "common/errno.h"
-#include "common/ceph_json.h"
-
-#include "common/dout.h"
-
-#include "services/svc_zone.h"
-#include "services/svc_sys_obj.h"
-#include "services/svc_tier_rados.h"
-#include "services/svc_bilog_rados.h"
-
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-const string reshard_oid_prefix = "reshard.";
-const string reshard_lock_name = "reshard_process";
-const string bucket_instance_lock_name = "bucket_instance_lock";
-
-/* All primes up to 2000 used to attempt to make dynamic sharding use
- * a prime number of shards. Note: this list also includes 1 for when
- * 1 shard is the most appropriate, even though 1 is not prime.
- */
-const std::initializer_list<uint16_t> RGWBucketReshard::reshard_primes = {
- 1, 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61,
- 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137,
- 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211,
- 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283,
- 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379,
- 383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461,
- 463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541, 547, 557, 563,
- 569, 571, 577, 587, 593, 599, 601, 607, 613, 617, 619, 631, 641, 643,
- 647, 653, 659, 661, 673, 677, 683, 691, 701, 709, 719, 727, 733, 739,
- 743, 751, 757, 761, 769, 773, 787, 797, 809, 811, 821, 823, 827, 829,
- 839, 853, 857, 859, 863, 877, 881, 883, 887, 907, 911, 919, 929, 937,
- 941, 947, 953, 967, 971, 977, 983, 991, 997, 1009, 1013, 1019, 1021,
- 1031, 1033, 1039, 1049, 1051, 1061, 1063, 1069, 1087, 1091, 1093,
- 1097, 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, 1171, 1181,
- 1187, 1193, 1201, 1213, 1217, 1223, 1229, 1231, 1237, 1249, 1259,
- 1277, 1279, 1283, 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321,
- 1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, 1427, 1429, 1433,
- 1439, 1447, 1451, 1453, 1459, 1471, 1481, 1483, 1487, 1489, 1493,
- 1499, 1511, 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571, 1579,
- 1583, 1597, 1601, 1607, 1609, 1613, 1619, 1621, 1627, 1637, 1657,
- 1663, 1667, 1669, 1693, 1697, 1699, 1709, 1721, 1723, 1733, 1741,
- 1747, 1753, 1759, 1777, 1783, 1787, 1789, 1801, 1811, 1823, 1831,
- 1847, 1861, 1867, 1871, 1873, 1877, 1879, 1889, 1901, 1907, 1913,
- 1931, 1933, 1949, 1951, 1973, 1979, 1987, 1993, 1997, 1999
-};
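// How a caller can map a desired shard count onto this table: take the
// smallest listed prime >= the target, clamping to the largest entry past
// 1999 (hypothetical helper shown for illustration; the actual selection
// logic lives elsewhere in the reshard code and this needs <algorithm>):
//
//   uint32_t nearest_reshard_prime(uint32_t want) {
//     const auto& p = RGWBucketReshard::reshard_primes;
//     auto it = std::lower_bound(p.begin(), p.end(), want);
//     return (it == p.end()) ? *std::prev(p.end()) : *it;
//   }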
-
-class BucketReshardShard {
- rgw::sal::RadosStore* store;
- const RGWBucketInfo& bucket_info;
- int shard_id;
- RGWRados::BucketShard bs;
- vector<rgw_cls_bi_entry> entries;
- map<RGWObjCategory, rgw_bucket_category_stats> stats;
- deque<librados::AioCompletion *>& aio_completions;
- uint64_t max_aio_completions;
- uint64_t reshard_shard_batch_size;
-
- int wait_next_completion() {
- librados::AioCompletion *c = aio_completions.front();
- aio_completions.pop_front();
-
- c->wait_for_complete();
-
- int ret = c->get_return_value();
- c->release();
-
- if (ret < 0) {
- derr << "ERROR: reshard rados operation failed: " << cpp_strerror(-ret) << dendl;
- return ret;
- }
-
- return 0;
- }
-
- int get_completion(librados::AioCompletion **c) {
- if (aio_completions.size() >= max_aio_completions) {
- int ret = wait_next_completion();
- if (ret < 0) {
- return ret;
- }
- }
-
- *c = librados::Rados::aio_create_completion(nullptr, nullptr);
- aio_completions.push_back(*c);
-
- return 0;
- }
-
-public:
- BucketReshardShard(const DoutPrefixProvider *dpp,
- rgw::sal::RadosStore *_store, const RGWBucketInfo& _bucket_info,
- const rgw::bucket_index_layout_generation& index,
- int shard_id, deque<librados::AioCompletion *>& _completions) :
- store(_store), bucket_info(_bucket_info), shard_id(shard_id),
- bs(store->getRados()), aio_completions(_completions)
- {
- bs.init(dpp, bucket_info, index, shard_id);
-
- max_aio_completions =
- store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_max_aio");
- reshard_shard_batch_size =
- store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_batch_size");
- }
-
- int get_shard_id() const {
- return shard_id;
- }
-
- int add_entry(rgw_cls_bi_entry& entry, bool account, RGWObjCategory category,
- const rgw_bucket_category_stats& entry_stats) {
- entries.push_back(entry);
- if (account) {
- rgw_bucket_category_stats& target = stats[category];
- target.num_entries += entry_stats.num_entries;
- target.total_size += entry_stats.total_size;
- target.total_size_rounded += entry_stats.total_size_rounded;
- target.actual_size += entry_stats.actual_size;
- }
- if (entries.size() >= reshard_shard_batch_size) {
- int ret = flush();
- if (ret < 0) {
- return ret;
- }
- }
-
- return 0;
- }
-
- int flush() {
- if (entries.size() == 0) {
- return 0;
- }
-
- librados::ObjectWriteOperation op;
- for (auto& entry : entries) {
- store->getRados()->bi_put(op, bs, entry);
- }
- cls_rgw_bucket_update_stats(op, false, stats);
-
- librados::AioCompletion *c;
- int ret = get_completion(&c);
- if (ret < 0) {
- return ret;
- }
- ret = bs.bucket_obj.aio_operate(c, &op);
- if (ret < 0) {
- derr << "ERROR: failed to store entries in target bucket shard (bs=" << bs.bucket << "/" << bs.shard_id << ") error=" << cpp_strerror(-ret) << dendl;
- return ret;
- }
- entries.clear();
- stats.clear();
- return 0;
- }
-
- int wait_all_aio() {
- int ret = 0;
- while (!aio_completions.empty()) {
- int r = wait_next_completion();
- if (r < 0) {
- ret = r;
- }
- }
- return ret;
- }
-}; // class BucketReshardShard
-
-
-class BucketReshardManager {
- rgw::sal::RadosStore *store;
- deque<librados::AioCompletion *> completions;
- vector<BucketReshardShard> target_shards;
-
-public:
- BucketReshardManager(const DoutPrefixProvider *dpp,
- rgw::sal::RadosStore *_store,
- const RGWBucketInfo& bucket_info,
- const rgw::bucket_index_layout_generation& target)
- : store(_store)
- {
- const int num_shards = target.layout.normal.num_shards;
- target_shards.reserve(num_shards);
- for (int i = 0; i < num_shards; ++i) {
- target_shards.emplace_back(dpp, store, bucket_info, target, i, completions);
- }
- }
-
- ~BucketReshardManager() {
- for (auto& shard : target_shards) {
- int ret = shard.wait_all_aio();
- if (ret < 0) {
- ldout(store->ctx(), 20) << __func__ <<
- ": shard->wait_all_aio() returned ret=" << ret << dendl;
- }
- }
- }
-
- int add_entry(int shard_index,
- rgw_cls_bi_entry& entry, bool account, RGWObjCategory category,
- const rgw_bucket_category_stats& entry_stats) {
- int ret = target_shards[shard_index].add_entry(entry, account, category,
- entry_stats);
- if (ret < 0) {
- derr << "ERROR: target_shards.add_entry(" << entry.idx <<
- ") returned error: " << cpp_strerror(-ret) << dendl;
- return ret;
- }
-
- return 0;
- }
-
- int finish() {
- int ret = 0;
- for (auto& shard : target_shards) {
- int r = shard.flush();
- if (r < 0) {
- derr << "ERROR: target_shards[" << shard.get_shard_id() << "].flush() returned error: " << cpp_strerror(-r) << dendl;
- ret = r;
- }
- }
- for (auto& shard : target_shards) {
- int r = shard.wait_all_aio();
- if (r < 0) {
- derr << "ERROR: target_shards[" << shard.get_shard_id() << "].wait_all_aio() returned error: " << cpp_strerror(-r) << dendl;
- ret = r;
- }
- }
- target_shards.clear();
- return ret;
- }
-}; // class BucketReshardManager
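// Illustrative driver for the two classes above (source_entries is a
// hypothetical range of (shard, entry, category, stats) tuples; the real
// reshard logic also remaps each entry's key to its target shard):
//
//   BucketReshardManager mgr(dpp, store, bucket_info, target_layout);
//   for (auto& [shard, entry, category, entry_stats] : source_entries) {
//     int r = mgr.add_entry(shard, entry, true /* account */, category, entry_stats);
//     if (r < 0) return r;
//   }
//   return mgr.finish();  // flush remaining batches, then drain all aio completions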
-
-RGWBucketReshard::RGWBucketReshard(rgw::sal::RadosStore* _store,
- const RGWBucketInfo& _bucket_info,
- const std::map<std::string, bufferlist>& _bucket_attrs,
- RGWBucketReshardLock* _outer_reshard_lock) :
- store(_store), bucket_info(_bucket_info), bucket_attrs(_bucket_attrs),
- reshard_lock(store, bucket_info, true),
- outer_reshard_lock(_outer_reshard_lock)
-{ }
-
-// sets reshard status of bucket index shards for the current index layout
-static int set_resharding_status(const DoutPrefixProvider *dpp,
- rgw::sal::RadosStore* store,
- const RGWBucketInfo& bucket_info,
- cls_rgw_reshard_status status)
-{
- cls_rgw_bucket_instance_entry instance_entry;
- instance_entry.set_status(status);
-
- int ret = store->getRados()->bucket_set_reshard(dpp, bucket_info, instance_entry);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "RGWReshard::" << __func__ << " ERROR: error setting bucket resharding flag on bucket index: "
- << cpp_strerror(-ret) << dendl;
- return ret;
- }
- return 0;
-}
-
-static int remove_old_reshard_instance(rgw::sal::RadosStore* store,
- const rgw_bucket& bucket,
- const DoutPrefixProvider* dpp)
-{
- RGWBucketInfo info;
- int r = store->getRados()->get_bucket_instance_info(bucket, info, nullptr,
- nullptr, null_yield, dpp);
- if (r < 0) {
- return r;
- }
-
- // delete its shard objects (ignore errors)
- store->svc()->bi->clean_index(dpp, info, info.layout.current_index);
- // delete the bucket instance metadata
- return store->ctl()->bucket->remove_bucket_instance_info(bucket, info, null_yield, dpp);
-}
-
-// initialize the new bucket index shard objects
-static int init_target_index(rgw::sal::RadosStore* store,
- RGWBucketInfo& bucket_info,
- const rgw::bucket_index_layout_generation& index,
- const DoutPrefixProvider* dpp)
-{
- int ret = store->svc()->bi->init_index(dpp, bucket_info, index);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to initialize "
- "target index shard objects: " << cpp_strerror(ret) << dendl;
- return ret;
- }
-
- if (!bucket_info.datasync_flag_enabled()) {
- // if bucket sync is disabled, disable it on each of the new shards too
- auto log = rgw::log_layout_from_index(0, index);
- ret = store->svc()->bilog_rados->log_stop(dpp, bucket_info, log, -1);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to disable "
- "bucket sync on the target index shard objects: "
- << cpp_strerror(ret) << dendl;
- store->svc()->bi->clean_index(dpp, bucket_info, index);
- return ret;
- }
- }
-
- return ret;
-}
-
-// initialize a target index layout, create its bucket index shard objects, and
-// write the target layout to the bucket instance metadata
-static int init_target_layout(rgw::sal::RadosStore* store,
- RGWBucketInfo& bucket_info,
- std::map<std::string, bufferlist>& bucket_attrs,
- ReshardFaultInjector& fault,
- uint32_t new_num_shards,
- const DoutPrefixProvider* dpp)
-{
- auto prev = bucket_info.layout; // make a copy for cleanup
- const auto current = prev.current_index;
-
- // initialize a new normal target index layout generation
- rgw::bucket_index_layout_generation target;
- target.layout.type = rgw::BucketIndexType::Normal;
- target.layout.normal.num_shards = new_num_shards;
- target.gen = current.gen + 1;
-
- if (bucket_info.reshard_status == cls_rgw_reshard_status::IN_PROGRESS) {
- // backward-compatible cleanup of old reshards, where the target was in a
- // different bucket instance
- if (!bucket_info.new_bucket_instance_id.empty()) {
- rgw_bucket new_bucket = bucket_info.bucket;
- new_bucket.bucket_id = bucket_info.new_bucket_instance_id;
- ldout(store->ctx(), 10) << __func__ << " removing target bucket instance "
- "from a previous reshard attempt" << dendl;
- // ignore errors
- remove_old_reshard_instance(store, new_bucket, dpp);
- }
- bucket_info.reshard_status = cls_rgw_reshard_status::NOT_RESHARDING;
- }
-
- if (bucket_info.layout.target_index) {
- // a previous reshard failed or stalled, and its reshard lock dropped
- ldpp_dout(dpp, 10) << __func__ << " removing existing target index "
- "objects from a previous reshard attempt" << dendl;
- // delete its existing shard objects (ignore errors)
- store->svc()->bi->clean_index(dpp, bucket_info, *bucket_info.layout.target_index);
- // don't reuse this same generation in the new target layout, in case
- // something is still trying to operate on its shard objects
- target.gen = bucket_info.layout.target_index->gen + 1;
- }
-
- // create the index shard objects
- int ret = init_target_index(store, bucket_info, target, dpp);
- if (ret < 0) {
- return ret;
- }
-
- // retry in case of racing writes to the bucket instance metadata
- static constexpr auto max_retries = 10;
- int tries = 0;
- do {
- // update resharding state
- bucket_info.layout.target_index = target;
- bucket_info.layout.resharding = rgw::BucketReshardState::InProgress;
-
- if (ret = fault.check("set_target_layout");
- ret == 0) { // no fault injected, write the bucket instance metadata
- ret = store->getRados()->put_bucket_instance_info(bucket_info, false,
- real_time(), &bucket_attrs, dpp);
- } else if (ret == -ECANCELED) {
- fault.clear(); // clear the fault so a retry can succeed
- }
-
- if (ret == -ECANCELED) {
- // racing write detected, read the latest bucket info and try again
- int ret2 = store->getRados()->get_bucket_instance_info(
- bucket_info.bucket, bucket_info,
- nullptr, &bucket_attrs, null_yield, dpp);
- if (ret2 < 0) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read "
- "bucket info: " << cpp_strerror(ret2) << dendl;
- ret = ret2;
- break;
- }
-
- // check that we're still in the reshard state we started in
- if (bucket_info.layout.resharding != rgw::BucketReshardState::None ||
- bucket_info.layout.current_index != current) {
- ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
- "another reshard" << dendl;
- break;
- }
-
- prev = bucket_info.layout; // update the copy
- }
- ++tries;
- } while (ret == -ECANCELED && tries < max_retries);
-
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to write "
- "target index layout to bucket info: " << cpp_strerror(ret) << dendl;
-
- bucket_info.layout = std::move(prev); // restore in-memory layout
-
- // delete the target shard objects (ignore errors)
- store->svc()->bi->clean_index(dpp, bucket_info, target);
- return ret;
- }
- return 0;
-} // init_target_layout
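The three metadata writers in this file (init_target_layout, revert_target_layout, commit_target_layout) all share the same optimistic-concurrency shape: mutate in memory, attempt a conditional write, and on -ECANCELED re-read and re-check preconditions. A minimal sketch of that shape, using a hypothetical Meta type and callbacks in place of the bucket instance metadata calls:

#include <cerrno>
#include <functional>

struct Meta { /* stands in for RGWBucketInfo */ };

// -ECANCELED from write_meta() means a racing writer changed the object.
int update_with_retries(Meta& meta,
                        const std::function<void(Meta&)>& mutate,
                        const std::function<bool(const Meta&)>& still_valid,
                        const std::function<int(Meta&)>& read_meta,
                        const std::function<int(const Meta&)>& write_meta)
{
  static constexpr int max_retries = 10;
  for (int tries = 0; tries < max_retries; ++tries) {
    mutate(meta);                  // apply the in-memory change
    int ret = write_meta(meta);    // conditional write
    if (ret != -ECANCELED) {
      return ret;                  // success, or a hard error
    }
    ret = read_meta(meta);         // reload the latest copy before retrying
    if (ret < 0) {
      return ret;
    }
    if (!still_valid(meta)) {
      return -ECANCELED;           // a racing reshard already resolved this
    }
  }
  return -ECANCELED;               // too many races
}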
-
-// delete the bucket index shards associated with the target layout and remove
-// it from the bucket instance metadata
-static int revert_target_layout(rgw::sal::RadosStore* store,
- RGWBucketInfo& bucket_info,
- std::map<std::string, bufferlist>& bucket_attrs,
- ReshardFaultInjector& fault,
- const DoutPrefixProvider* dpp)
-{
- auto prev = bucket_info.layout; // make a copy for cleanup
-
- // remove target index shard objects
- int ret = store->svc()->bi->clean_index(dpp, bucket_info, *prev.target_index);
- if (ret < 0) {
- ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to remove "
- "target index with: " << cpp_strerror(ret) << dendl;
- ret = 0; // non-fatal error
- }
-
- // retry in case of racing writes to the bucket instance metadata
- static constexpr auto max_retries = 10;
- int tries = 0;
- do {
- // clear target_index and resharding state
- bucket_info.layout.target_index = std::nullopt;
- bucket_info.layout.resharding = rgw::BucketReshardState::None;
-
- if (ret = fault.check("revert_target_layout");
- ret == 0) { // no fault injected, revert the bucket instance metadata
- ret = store->getRados()->put_bucket_instance_info(bucket_info, false,
- real_time(),
- &bucket_attrs, dpp);
- } else if (ret == -ECANCELED) {
- fault.clear(); // clear the fault so a retry can succeed
- }
-
- if (ret == -ECANCELED) {
- // racing write detected, read the latest bucket info and try again
- int ret2 = store->getRados()->get_bucket_instance_info(
- bucket_info.bucket, bucket_info,
- nullptr, &bucket_attrs, null_yield, dpp);
- if (ret2 < 0) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read "
- "bucket info: " << cpp_strerror(ret2) << dendl;
- ret = ret2;
- break;
- }
-
- // check that we're still in the reshard state we started in
- if (bucket_info.layout.resharding == rgw::BucketReshardState::None) {
- ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
- "reshard cancel" << dendl;
- return -ECANCELED;
- }
- if (bucket_info.layout.current_index != prev.current_index ||
- bucket_info.layout.target_index != prev.target_index) {
- ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
- "another reshard" << dendl;
- return -ECANCELED;
- }
-
- prev = bucket_info.layout; // update the copy
- }
- ++tries;
- } while (ret == -ECANCELED && tries < max_retries);
-
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to clear "
- "target index layout in bucket info: " << cpp_strerror(ret) << dendl;
-
- bucket_info.layout = std::move(prev); // restore in-memory layout
- return ret;
- }
- return 0;
-} // revert_target_layout
-
-static int init_reshard(rgw::sal::RadosStore* store,
- RGWBucketInfo& bucket_info,
- std::map<std::string, bufferlist>& bucket_attrs,
- ReshardFaultInjector& fault,
- uint32_t new_num_shards,
- const DoutPrefixProvider *dpp)
-{
- int ret = init_target_layout(store, bucket_info, bucket_attrs, fault, new_num_shards, dpp);
- if (ret < 0) {
- return ret;
- }
-
- if (ret = fault.check("block_writes");
- ret == 0) { // no fault injected, block writes to the current index shards
- ret = set_resharding_status(dpp, store, bucket_info,
- cls_rgw_reshard_status::IN_PROGRESS);
- }
-
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to pause "
- "writes to the current index: " << cpp_strerror(ret) << dendl;
- // clean up the target layout (ignore errors)
- revert_target_layout(store, bucket_info, bucket_attrs, fault, dpp);
- return ret;
- }
- return 0;
-} // init_reshard
-
-static int cancel_reshard(rgw::sal::RadosStore* store,
- RGWBucketInfo& bucket_info,
- std::map<std::string, bufferlist>& bucket_attrs,
- ReshardFaultInjector& fault,
- const DoutPrefixProvider *dpp)
-{
- // unblock writes to the current index shard objects
- int ret = set_resharding_status(dpp, store, bucket_info,
- cls_rgw_reshard_status::NOT_RESHARDING);
- if (ret < 0) {
- ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to unblock "
- "writes to current index objects: " << cpp_strerror(ret) << dendl;
- ret = 0; // non-fatal error
- }
-
- if (bucket_info.layout.target_index) {
- return revert_target_layout(store, bucket_info, bucket_attrs, fault, dpp);
- }
- // there is nothing to revert
- return 0;
-} // cancel_reshard
-
-static int commit_target_layout(rgw::sal::RadosStore* store,
- RGWBucketInfo& bucket_info,
- std::map<std::string, bufferlist>& bucket_attrs,
- ReshardFaultInjector& fault,
- const DoutPrefixProvider *dpp)
-{
- auto& layout = bucket_info.layout;
- const auto next_log_gen = layout.logs.empty() ? 1 :
- layout.logs.back().gen + 1;
-
- if (!store->svc()->zone->need_to_log_data()) {
- // if we're not syncing data, we can drop any existing logs
- layout.logs.clear();
- }
-
- // use the new index layout as current
- ceph_assert(layout.target_index);
- layout.current_index = std::move(*layout.target_index);
- layout.target_index = std::nullopt;
- layout.resharding = rgw::BucketReshardState::None;
- // add the in-index log layout
- layout.logs.push_back(log_layout_from_index(next_log_gen, layout.current_index));
-
- int ret = fault.check("commit_target_layout");
- if (ret == 0) { // no fault injected, write the bucket instance metadata
- ret = store->getRados()->put_bucket_instance_info(
- bucket_info, false, real_time(), &bucket_attrs, dpp);
- } else if (ret == -ECANCELED) {
- fault.clear(); // clear the fault so a retry can succeed
- }
- return ret;
-} // commit_target_layout
-
-static int commit_reshard(rgw::sal::RadosStore* store,
- RGWBucketInfo& bucket_info,
- std::map<std::string, bufferlist>& bucket_attrs,
- ReshardFaultInjector& fault,
- const DoutPrefixProvider *dpp)
-{
- auto prev = bucket_info.layout; // make a copy for cleanup
-
- // retry in case of racing writes to the bucket instance metadata
- static constexpr auto max_retries = 10;
- int tries = 0;
- int ret = 0;
- do {
- ret = commit_target_layout(store, bucket_info, bucket_attrs, fault, dpp);
- if (ret == -ECANCELED) {
- // racing write detected, read the latest bucket info and try again
- int ret2 = store->getRados()->get_bucket_instance_info(
- bucket_info.bucket, bucket_info,
- nullptr, &bucket_attrs, null_yield, dpp);
- if (ret2 < 0) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read "
- "bucket info: " << cpp_strerror(ret2) << dendl;
- ret = ret2;
- break;
- }
-
- // check that we're still in the reshard state we started in
- if (bucket_info.layout.resharding != rgw::BucketReshardState::InProgress) {
- ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
- "reshard cancel" << dendl;
- return -ECANCELED; // whatever canceled us already did the cleanup
- }
- if (bucket_info.layout.current_index != prev.current_index ||
- bucket_info.layout.target_index != prev.target_index) {
- ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
- "another reshard" << dendl;
- return -ECANCELED; // whatever canceled us already did the cleanup
- }
-
- prev = bucket_info.layout; // update the copy
- }
- ++tries;
- } while (ret == -ECANCELED && tries < max_retries);
-
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to commit "
- "target index layout: " << cpp_strerror(ret) << dendl;
-
- bucket_info.layout = std::move(prev); // restore in-memory layout
-
- // unblock writes to the current index shard objects
- int ret2 = set_resharding_status(dpp, store, bucket_info,
- cls_rgw_reshard_status::NOT_RESHARDING);
- if (ret2 < 0) {
- ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to unblock "
- "writes to current index objects: " << cpp_strerror(ret2) << dendl;
- // non-fatal error
- }
- return ret;
- }
-
- if (store->svc()->zone->need_to_log_data() && !prev.logs.empty() &&
- prev.current_index.layout.type == rgw::BucketIndexType::Normal) {
- // write a datalog entry for each shard of the previous index. triggering
- // sync on the old shards will force them to detect the end-of-log for that
- // generation, and eventually transition to the next
- // TODO: use a log layout to support types other than BucketLogType::InIndex
- for (uint32_t shard_id = 0; shard_id < prev.current_index.layout.normal.num_shards; ++shard_id) {
- ret = store->svc()->datalog_rados->add_entry(dpp, bucket_info, prev.logs.back(), shard_id);
- if (ret < 0) {
- ldpp_dout(dpp, 1) << "WARNING: failed writing data log (bucket_info.bucket="
-          << bucket_info.bucket << ", shard_id=" << shard_id << " of generation="
- << prev.logs.back().gen << ")" << dendl;
- } // datalog error is not fatal
- }
- }
-
- // check whether the old index objects are still needed for bilogs
- const auto& logs = bucket_info.layout.logs;
- auto log = std::find_if(logs.begin(), logs.end(),
- [&prev] (const rgw::bucket_log_layout_generation& log) {
- return log.layout.type == rgw::BucketLogType::InIndex
- && log.layout.in_index.gen == prev.current_index.gen;
- });
- if (log == logs.end()) {
- // delete the index objects (ignore errors)
- store->svc()->bi->clean_index(dpp, bucket_info, prev.current_index);
- }
- return 0;
-} // commit_reshard
-
-int RGWBucketReshard::clear_resharding(rgw::sal::RadosStore* store,
- RGWBucketInfo& bucket_info,
- std::map<std::string, bufferlist>& bucket_attrs,
- const DoutPrefixProvider* dpp)
-{
- ReshardFaultInjector no_fault;
- return cancel_reshard(store, bucket_info, bucket_attrs, no_fault, dpp);
-}
-
-int RGWBucketReshard::cancel(const DoutPrefixProvider* dpp)
-{
- int ret = reshard_lock.lock(dpp);
- if (ret < 0) {
- return ret;
- }
-
- if (bucket_info.layout.resharding != rgw::BucketReshardState::InProgress) {
- ldpp_dout(dpp, -1) << "ERROR: bucket is not resharding" << dendl;
- ret = -EINVAL;
- } else {
- ret = clear_resharding(store, bucket_info, bucket_attrs, dpp);
- }
-
- reshard_lock.unlock();
- return ret;
-}
-
-RGWBucketReshardLock::RGWBucketReshardLock(rgw::sal::RadosStore* _store,
- const std::string& reshard_lock_oid,
- bool _ephemeral) :
- store(_store),
- lock_oid(reshard_lock_oid),
- ephemeral(_ephemeral),
- internal_lock(reshard_lock_name)
-{
- const int lock_dur_secs = store->ctx()->_conf.get_val<uint64_t>(
- "rgw_reshard_bucket_lock_duration");
- duration = std::chrono::seconds(lock_dur_secs);
-
-#define COOKIE_LEN 16
- char cookie_buf[COOKIE_LEN + 1];
- gen_rand_alphanumeric(store->ctx(), cookie_buf, sizeof(cookie_buf) - 1);
- cookie_buf[COOKIE_LEN] = '\0';
-
- internal_lock.set_cookie(cookie_buf);
- internal_lock.set_duration(duration);
-}
-
-int RGWBucketReshardLock::lock(const DoutPrefixProvider *dpp) {
- internal_lock.set_must_renew(false);
-
- int ret;
- if (ephemeral) {
- ret = internal_lock.lock_exclusive_ephemeral(&store->getRados()->reshard_pool_ctx,
- lock_oid);
- } else {
- ret = internal_lock.lock_exclusive(&store->getRados()->reshard_pool_ctx, lock_oid);
- }
-
- if (ret == -EBUSY) {
-    ldout(store->ctx(), 0) << "INFO: RGWBucketReshardLock::" << __func__ <<
- " found lock on " << lock_oid <<
- " to be held by another RGW process; skipping for now" << dendl;
- return ret;
- } else if (ret < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: RGWBucketReshardLock::" << __func__ <<
- " failed to acquire lock on " << lock_oid << ": " <<
- cpp_strerror(-ret) << dendl;
- return ret;
- }
-
- reset_time(Clock::now());
-
- return 0;
-}
-
-void RGWBucketReshardLock::unlock() {
- int ret = internal_lock.unlock(&store->getRados()->reshard_pool_ctx, lock_oid);
- if (ret < 0) {
- ldout(store->ctx(), 0) << "WARNING: RGWBucketReshardLock::" << __func__ <<
- " failed to drop lock on " << lock_oid << " ret=" << ret << dendl;
- }
-}
-
-int RGWBucketReshardLock::renew(const Clock::time_point& now) {
- internal_lock.set_must_renew(true);
- int ret;
- if (ephemeral) {
- ret = internal_lock.lock_exclusive_ephemeral(&store->getRados()->reshard_pool_ctx,
- lock_oid);
- } else {
- ret = internal_lock.lock_exclusive(&store->getRados()->reshard_pool_ctx, lock_oid);
- }
-  if (ret < 0) { /* expired or already locked by another process */
- std::stringstream error_s;
- if (-ENOENT == ret) {
- error_s << "ENOENT (lock expired or never initially locked)";
- } else {
- error_s << ret << " (" << cpp_strerror(-ret) << ")";
- }
- ldout(store->ctx(), 5) << __func__ << "(): failed to renew lock on " <<
- lock_oid << " with error " << error_s.str() << dendl;
- return ret;
- }
- internal_lock.set_must_renew(false);
-
- reset_time(now);
- ldout(store->ctx(), 20) << __func__ << "(): successfully renewed lock on " <<
- lock_oid << dendl;
-
- return 0;
-}
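reset_time() (defined in the header below) arms renewal at the halfway point of the lock duration, so should_renew() leaves the second half of the lease as slack for the renewal round trip. A self-contained sketch of that cadence, with illustrative names:

#include <chrono>

struct RenewPolicy {
  using Clock = std::chrono::steady_clock;
  std::chrono::seconds duration{120};  // cf. rgw_reshard_bucket_lock_duration
  Clock::time_point renew_thresh;

  void reset(Clock::time_point now) {
    renew_thresh = now + duration / 2; // renew once half the lease is spent
  }
  bool should_renew(Clock::time_point now) const {
    return now >= renew_thresh;
  }
};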
-
-
-int RGWBucketReshard::do_reshard(const rgw::bucket_index_layout_generation& current,
- const rgw::bucket_index_layout_generation& target,
- int max_entries,
- bool verbose,
- ostream *out,
- Formatter *formatter,
- const DoutPrefixProvider *dpp)
-{
- if (out) {
- (*out) << "tenant: " << bucket_info.bucket.tenant << std::endl;
- (*out) << "bucket name: " << bucket_info.bucket.name << std::endl;
- }
-
-  /* update bucket info -- in progress */
- list<rgw_cls_bi_entry> entries;
-
- if (max_entries < 0) {
- ldpp_dout(dpp, 0) << __func__ <<
- ": can't reshard, negative max_entries" << dendl;
- return -EINVAL;
- }
-
- BucketReshardManager target_shards_mgr(dpp, store, bucket_info, target);
-
- bool verbose_json_out = verbose && (formatter != nullptr) && (out != nullptr);
-
- if (verbose_json_out) {
- formatter->open_array_section("entries");
- }
-
- uint64_t total_entries = 0;
-
- if (!verbose_json_out && out) {
- (*out) << "total entries:";
- }
-
- const int num_source_shards = current.layout.normal.num_shards;
- string marker;
- for (int i = 0; i < num_source_shards; ++i) {
- bool is_truncated = true;
- marker.clear();
- const std::string null_object_filter; // empty string since we're not filtering by object
- while (is_truncated) {
- entries.clear();
- int ret = store->getRados()->bi_list(dpp, bucket_info, i, null_object_filter, marker, max_entries, &entries, &is_truncated);
- if (ret < 0 && ret != -ENOENT) {
- derr << "ERROR: bi_list(): " << cpp_strerror(-ret) << dendl;
- return ret;
- }
-
- for (auto iter = entries.begin(); iter != entries.end(); ++iter) {
- rgw_cls_bi_entry& entry = *iter;
- if (verbose_json_out) {
- formatter->open_object_section("entry");
-
- encode_json("shard_id", i, formatter);
- encode_json("num_entry", total_entries, formatter);
- encode_json("entry", entry, formatter);
- }
- total_entries++;
-
- marker = entry.idx;
-
- int target_shard_id;
- cls_rgw_obj_key cls_key;
- RGWObjCategory category;
- rgw_bucket_category_stats stats;
- bool account = entry.get_info(&cls_key, &category, &stats);
- rgw_obj_key key(cls_key);
- if (entry.type == BIIndexType::OLH && key.empty()) {
- // bogus entry created by https://tracker.ceph.com/issues/46456
-        // to fix, skip it so it doesn't get included in the new bucket instance
- total_entries--;
- ldpp_dout(dpp, 10) << "Dropping entry with empty name, idx=" << marker << dendl;
- continue;
- }
- rgw_obj obj(bucket_info.bucket, key);
- RGWMPObj mp;
- if (key.ns == RGW_OBJ_NS_MULTIPART && mp.from_meta(key.name)) {
- // place the multipart .meta object on the same shard as its head object
- obj.index_hash_source = mp.get_key();
- }
- ret = store->getRados()->get_target_shard_id(bucket_info.layout.target_index->layout.normal,
- obj.get_hash_object(), &target_shard_id);
- if (ret < 0) {
- ldpp_dout(dpp, -1) << "ERROR: get_target_shard_id() returned ret=" << ret << dendl;
- return ret;
- }
-
- int shard_index = (target_shard_id > 0 ? target_shard_id : 0);
-
- ret = target_shards_mgr.add_entry(shard_index, entry, account,
- category, stats);
- if (ret < 0) {
- return ret;
- }
-
- Clock::time_point now = Clock::now();
- if (reshard_lock.should_renew(now)) {
-        // assume outer locks have timespans at least the size of ours, so
-        // we can safely renew them inside this conditional
- if (outer_reshard_lock) {
- ret = outer_reshard_lock->renew(now);
- if (ret < 0) {
- return ret;
- }
- }
- ret = reshard_lock.renew(now);
- if (ret < 0) {
- ldpp_dout(dpp, -1) << "Error renewing bucket lock: " << ret << dendl;
- return ret;
- }
- }
- if (verbose_json_out) {
- formatter->close_section();
- formatter->flush(*out);
- } else if (out && !(total_entries % 1000)) {
- (*out) << " " << total_entries;
- }
- } // entries loop
- }
- }
-
- if (verbose_json_out) {
- formatter->close_section();
- formatter->flush(*out);
- } else if (out) {
- (*out) << " " << total_entries << std::endl;
- }
-
- int ret = target_shards_mgr.finish();
- if (ret < 0) {
- ldpp_dout(dpp, -1) << "ERROR: failed to reshard" << dendl;
- return -EIO;
- }
- return 0;
-} // RGWBucketReshard::do_reshard
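One subtlety in the loop above is shard placement for multipart uploads: a .meta entry overrides index_hash_source so it hashes to the same target shard as the upload's head object. A sketch of that rule, with std::hash standing in for the real bucket index hash:

#include <cstdint>
#include <functional>
#include <string>

// placeholder for the actual bucket-index hash function
static uint32_t hash_key(const std::string& s) {
  return static_cast<uint32_t>(std::hash<std::string>{}(s));
}

// multipart_key is empty for ordinary objects; for a multipart .meta entry it
// holds the upload key so the entry lands on the head object's shard
uint32_t target_shard(const std::string& obj_name,
                      const std::string& multipart_key,
                      uint32_t num_shards)
{
  const std::string& hash_source =
    multipart_key.empty() ? obj_name : multipart_key;
  return hash_key(hash_source) % num_shards;
}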
-
-int RGWBucketReshard::get_status(const DoutPrefixProvider *dpp, list<cls_rgw_bucket_instance_entry> *status)
-{
- return store->svc()->bi_rados->get_reshard_status(dpp, bucket_info, status);
-}
-
-int RGWBucketReshard::execute(int num_shards,
- ReshardFaultInjector& fault,
- int max_op_entries,
- const DoutPrefixProvider *dpp,
- bool verbose, ostream *out,
- Formatter *formatter,
- RGWReshard* reshard_log)
-{
- // take a reshard lock on the bucket
- int ret = reshard_lock.lock(dpp);
- if (ret < 0) {
- return ret;
- }
- // unlock when scope exits
- auto unlock = make_scope_guard([this] { reshard_lock.unlock(); });
-
- if (reshard_log) {
- ret = reshard_log->update(dpp, bucket_info);
- if (ret < 0) {
- return ret;
- }
- }
-
-  // prepare the target index and add its layout to the bucket info
- ret = init_reshard(store, bucket_info, bucket_attrs, fault, num_shards, dpp);
- if (ret < 0) {
- return ret;
- }
-
- if (ret = fault.check("do_reshard");
- ret == 0) { // no fault injected, do the reshard
- ret = do_reshard(bucket_info.layout.current_index,
- *bucket_info.layout.target_index,
- max_op_entries, verbose, out, formatter, dpp);
- }
-
- if (ret < 0) {
- cancel_reshard(store, bucket_info, bucket_attrs, fault, dpp);
-
- ldpp_dout(dpp, 1) << __func__ << " INFO: reshard of bucket \""
- << bucket_info.bucket.name << "\" canceled due to errors" << dendl;
- return ret;
- }
-
- ret = commit_reshard(store, bucket_info, bucket_attrs, fault, dpp);
- if (ret < 0) {
- return ret;
- }
-
- ldpp_dout(dpp, 1) << __func__ << " INFO: reshard of bucket \""
- << bucket_info.bucket.name << "\" completed successfully" << dendl;
- return 0;
-} // execute
-
-bool RGWBucketReshard::can_reshard(const RGWBucketInfo& bucket,
- const RGWSI_Zone* zone_svc)
-{
- return !zone_svc->need_to_log_data() ||
- bucket.layout.logs.size() < max_bilog_history;
-}
-
-
-RGWReshard::RGWReshard(rgw::sal::RadosStore* _store, bool _verbose, ostream *_out,
- Formatter *_formatter) :
- store(_store), instance_lock(bucket_instance_lock_name),
- verbose(_verbose), out(_out), formatter(_formatter)
-{
- num_logshards = store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_num_logs");
-}
-
-string RGWReshard::get_logshard_key(const string& tenant,
- const string& bucket_name)
-{
- return tenant + ":" + bucket_name;
-}
-
-#define MAX_RESHARD_LOGSHARDS_PRIME 7877
-
-void RGWReshard::get_bucket_logshard_oid(const string& tenant, const string& bucket_name, string *oid)
-{
- string key = get_logshard_key(tenant, bucket_name);
-
- uint32_t sid = ceph_str_hash_linux(key.c_str(), key.size());
- uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
- sid = sid2 % MAX_RESHARD_LOGSHARDS_PRIME % num_logshards;
-
- get_logshard_oid(int(sid), oid);
-}
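A standalone sketch of the mapping above, assuming 16 logshards and substituting std::hash for ceph_str_hash_linux; only the shape of the computation (hash the "tenant:bucket" key, fold the low byte into the high bits, then a double modulo through a prime) is the point:

#include <cstdint>
#include <cstdio>
#include <functional>
#include <string>

int main() {
  const std::string key = "tenant1:mybucket"; // get_logshard_key() format
  const uint32_t num_logshards = 16;          // cf. rgw_reshard_num_logs

  uint32_t sid = static_cast<uint32_t>(std::hash<std::string>{}(key));
  sid ^= (sid & 0xFF) << 24;                  // fold low byte into high bits
  sid = sid % 7877 % num_logshards;           // MAX_RESHARD_LOGSHARDS_PRIME

  std::printf("logshard id: %010u\n", sid);   // zero-padded like the oid
  return 0;
}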
-
-int RGWReshard::add(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry)
-{
- if (!store->svc()->zone->can_reshard()) {
- ldpp_dout(dpp, 20) << __func__ << " Resharding is disabled" << dendl;
- return 0;
- }
-
- string logshard_oid;
-
- get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid);
-
- librados::ObjectWriteOperation op;
- cls_rgw_reshard_add(op, entry);
-
- int ret = rgw_rados_operate(dpp, store->getRados()->reshard_pool_ctx, logshard_oid, &op, null_yield);
- if (ret < 0) {
- ldpp_dout(dpp, -1) << "ERROR: failed to add entry to reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << " bucket=" << entry.bucket_name << dendl;
- return ret;
- }
- return 0;
-}
-
-int RGWReshard::update(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info)
-{
- cls_rgw_reshard_entry entry;
- entry.bucket_name = bucket_info.bucket.name;
- entry.bucket_id = bucket_info.bucket.bucket_id;
- entry.tenant = bucket_info.owner.tenant;
-
- int ret = get(dpp, entry);
- if (ret < 0) {
- return ret;
- }
-
- ret = add(dpp, entry);
- if (ret < 0) {
-    ldpp_dout(dpp, 0) << __func__ << ": Error updating entry for bucket " << entry.bucket_name << ": " <<
-      cpp_strerror(-ret) << dendl;
- }
-
- return ret;
-}
-
-
-int RGWReshard::list(const DoutPrefixProvider *dpp, int logshard_num, string& marker, uint32_t max, std::list<cls_rgw_reshard_entry>& entries, bool *is_truncated)
-{
- string logshard_oid;
-
- get_logshard_oid(logshard_num, &logshard_oid);
-
- int ret = cls_rgw_reshard_list(store->getRados()->reshard_pool_ctx, logshard_oid, marker, max, entries, is_truncated);
-
- if (ret == -ENOENT) {
- // these shard objects aren't created until we actually write something to
- // them, so treat ENOENT as a successful empty listing
- *is_truncated = false;
- ret = 0;
- } else if (ret == -EACCES) {
- ldpp_dout(dpp, -1) << "ERROR: access denied to pool " << store->svc()->zone->get_zone_params().reshard_pool
- << ". Fix the pool access permissions of your client" << dendl;
- } else if (ret < 0) {
- ldpp_dout(dpp, -1) << "ERROR: failed to list reshard log entries, oid="
- << logshard_oid << " marker=" << marker << " " << cpp_strerror(ret) << dendl;
- }
-
- return ret;
-}
-
-int RGWReshard::get(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry)
-{
- string logshard_oid;
-
- get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid);
-
- int ret = cls_rgw_reshard_get(store->getRados()->reshard_pool_ctx, logshard_oid, entry);
- if (ret < 0) {
- if (ret != -ENOENT) {
- ldpp_dout(dpp, -1) << "ERROR: failed to get entry from reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant <<
- " bucket=" << entry.bucket_name << dendl;
- }
- return ret;
- }
-
- return 0;
-}
-
-int RGWReshard::remove(const DoutPrefixProvider *dpp, const cls_rgw_reshard_entry& entry)
-{
- string logshard_oid;
-
- get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid);
-
- librados::ObjectWriteOperation op;
- cls_rgw_reshard_remove(op, entry);
-
- int ret = rgw_rados_operate(dpp, store->getRados()->reshard_pool_ctx, logshard_oid, &op, null_yield);
- if (ret < 0) {
- ldpp_dout(dpp, -1) << "ERROR: failed to remove entry from reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << " bucket=" << entry.bucket_name << dendl;
- return ret;
- }
-
- return ret;
-}
-
-int RGWReshard::clear_bucket_resharding(const DoutPrefixProvider *dpp, const string& bucket_instance_oid, cls_rgw_reshard_entry& entry)
-{
- int ret = cls_rgw_clear_bucket_resharding(store->getRados()->reshard_pool_ctx, bucket_instance_oid);
- if (ret < 0) {
- ldpp_dout(dpp, -1) << "ERROR: failed to clear bucket resharding, bucket_instance_oid=" << bucket_instance_oid << dendl;
- return ret;
- }
-
- return 0;
-}
-
-int RGWReshardWait::wait(optional_yield y)
-{
- std::unique_lock lock(mutex);
-
- if (going_down) {
- return -ECANCELED;
- }
-
- if (y) {
- auto& context = y.get_io_context();
- auto& yield = y.get_yield_context();
-
- Waiter waiter(context);
- waiters.push_back(waiter);
- lock.unlock();
-
- waiter.timer.expires_after(duration);
-
- boost::system::error_code ec;
- waiter.timer.async_wait(yield[ec]);
-
- lock.lock();
- waiters.erase(waiters.iterator_to(waiter));
- return -ec.value();
- }
-
- cond.wait_for(lock, duration);
-
- if (going_down) {
- return -ECANCELED;
- }
-
- return 0;
-}
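The function above serves two kinds of callers from one API: coroutines get an asio timer they can yield on, while plain threads fall through to the condition variable. A minimal sketch of the blocking half, where stop() wakes every sleeper with -ECANCELED (the coroutine half cancels timers instead):

#include <cerrno>
#include <chrono>
#include <condition_variable>
#include <mutex>

class CancellableSleep {
  std::mutex mutex;
  std::condition_variable cond;
  bool going_down = false;
public:
  int wait(std::chrono::seconds duration) {
    std::unique_lock lock(mutex);
    // returns early if stop() fires, otherwise times out normally
    cond.wait_for(lock, duration, [this] { return going_down; });
    return going_down ? -ECANCELED : 0;
  }
  void stop() {
    std::scoped_lock lock(mutex);
    going_down = true;
    cond.notify_all();
  }
};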
-
-void RGWReshardWait::stop()
-{
- std::scoped_lock lock(mutex);
- going_down = true;
- cond.notify_all();
- for (auto& waiter : waiters) {
- // unblock any waiters with ECANCELED
- waiter.timer.cancel();
- }
-}
-
-int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
- int max_entries, const DoutPrefixProvider *dpp)
-{
- ldpp_dout(dpp, 20) << __func__ << " resharding " <<
- entry.bucket_name << dendl;
-
- rgw_bucket bucket;
- RGWBucketInfo bucket_info;
- std::map<std::string, bufferlist> bucket_attrs;
-
- int ret = store->getRados()->get_bucket_info(store->svc(),
- entry.tenant,
- entry.bucket_name,
- bucket_info, nullptr,
- null_yield, dpp,
- &bucket_attrs);
- if (ret < 0 || bucket_info.bucket.bucket_id != entry.bucket_id) {
- if (ret < 0) {
- ldpp_dout(dpp, 0) << __func__ <<
- ": Error in get_bucket_info for bucket " << entry.bucket_name <<
- ": " << cpp_strerror(-ret) << dendl;
- if (ret != -ENOENT) {
- // any error other than ENOENT will abort
- return ret;
- }
- } else {
- ldpp_dout(dpp, 0) << __func__ <<
- ": Bucket: " << entry.bucket_name <<
- " already resharded by someone, skipping " << dendl;
- }
-
- // we've encountered a reshard queue entry for an apparently
- // non-existent bucket; let's try to recover by cleaning up
- ldpp_dout(dpp, 0) << __func__ <<
-      ": removing reshard queue entry for a resharded or non-existent bucket " <<
- entry.bucket_name << dendl;
-
- ret = remove(dpp, entry);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << __func__ <<
- ": Error removing non-existent bucket " <<
- entry.bucket_name << " from resharding queue: " <<
- cpp_strerror(-ret) << dendl;
- return ret;
- }
-
- // we cleaned up, move on to the next entry
- return 0;
- }
-
- if (!RGWBucketReshard::can_reshard(bucket_info, store->svc()->zone)) {
- ldpp_dout(dpp, 1) << "Bucket " << bucket_info.bucket << " is not "
- "eligible for resharding until peer zones finish syncing one "
- "or more of its old log generations" << dendl;
- return remove(dpp, entry);
- }
-
- RGWBucketReshard br(store, bucket_info, bucket_attrs, nullptr);
-
- ReshardFaultInjector f; // no fault injected
- ret = br.execute(entry.new_num_shards, f, max_entries, dpp,
- false, nullptr, nullptr, this);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << __func__ <<
- ": Error during resharding bucket " << entry.bucket_name << ":" <<
-      cpp_strerror(-ret) << dendl;
- return ret;
- }
-
- ldpp_dout(dpp, 20) << __func__ <<
- " removing reshard queue entry for bucket " << entry.bucket_name <<
- dendl;
-
- ret = remove(dpp, entry);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << __func__ << ": Error removing bucket " <<
- entry.bucket_name << " from resharding queue: " <<
- cpp_strerror(-ret) << dendl;
- return ret;
- }
- return 0;
-}
-
-int RGWReshard::process_single_logshard(int logshard_num, const DoutPrefixProvider *dpp)
-{
- string marker;
- bool truncated = true;
-
- constexpr uint32_t max_entries = 1000;
-
- string logshard_oid;
- get_logshard_oid(logshard_num, &logshard_oid);
-
- RGWBucketReshardLock logshard_lock(store, logshard_oid, false);
-
- int ret = logshard_lock.lock(dpp);
- if (ret < 0) {
- ldpp_dout(dpp, 5) << __func__ << "(): failed to acquire lock on " <<
-      logshard_oid << ", ret = " << ret << dendl;
- return ret;
- }
-
- do {
- std::list<cls_rgw_reshard_entry> entries;
- ret = list(dpp, logshard_num, marker, max_entries, entries, &truncated);
- if (ret < 0) {
- ldpp_dout(dpp, 10) << "cannot list all reshards in logshard oid=" <<
- logshard_oid << dendl;
- continue;
- }
-
- for(auto& entry: entries) { // logshard entries
-      ret = process_entry(entry, max_entries, dpp);
- if (ret < 0) {
- return ret;
- }
-
- Clock::time_point now = Clock::now();
- if (logshard_lock.should_renew(now)) {
- ret = logshard_lock.renew(now);
- if (ret < 0) {
- return ret;
- }
- }
-
- entry.get_key(&marker);
- } // entry for loop
- } while (truncated);
-
- logshard_lock.unlock();
- return 0;
-}
-
-
-void RGWReshard::get_logshard_oid(int shard_num, string *logshard)
-{
- char buf[32];
- snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num);
-
- string objname(reshard_oid_prefix);
- *logshard = objname + buf;
-}
-
-int RGWReshard::process_all_logshards(const DoutPrefixProvider *dpp)
-{
- int ret = 0;
-
- for (int i = 0; i < num_logshards; i++) {
- string logshard;
- get_logshard_oid(i, &logshard);
-
- ldpp_dout(dpp, 20) << "processing logshard = " << logshard << dendl;
-
- ret = process_single_logshard(i, dpp);
-
- ldpp_dout(dpp, 20) << "finish processing logshard = " << logshard << " , ret = " << ret << dendl;
- }
-
- return 0;
-}
-
-bool RGWReshard::going_down()
-{
- return down_flag;
-}
-
-void RGWReshard::start_processor()
-{
- worker = new ReshardWorker(store->ctx(), this);
- worker->create("rgw_reshard");
-}
-
-void RGWReshard::stop_processor()
-{
- down_flag = true;
- if (worker) {
- worker->stop();
- worker->join();
- }
- delete worker;
- worker = nullptr;
-}
-
-void *RGWReshard::ReshardWorker::entry() {
- do {
- utime_t start = ceph_clock_now();
- reshard->process_all_logshards(this);
-
- if (reshard->going_down())
- break;
-
- utime_t end = ceph_clock_now();
- end -= start;
- int secs = cct->_conf.get_val<uint64_t>("rgw_reshard_thread_interval");
-
- if (secs <= end.sec())
- continue; // next round
-
- secs -= end.sec();
-
- std::unique_lock locker{lock};
- cond.wait_for(locker, std::chrono::seconds(secs));
- } while (!reshard->going_down());
-
- return NULL;
-}
-
-void RGWReshard::ReshardWorker::stop()
-{
- std::lock_guard l{lock};
- cond.notify_all();
-}
-
-CephContext *RGWReshard::ReshardWorker::get_cct() const
-{
- return cct;
-}
-
-unsigned RGWReshard::ReshardWorker::get_subsys() const
-{
- return dout_subsys;
-}
-
-std::ostream& RGWReshard::ReshardWorker::gen_prefix(std::ostream& out) const
-{
- return out << "rgw reshard worker thread: ";
-}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef RGW_RESHARD_H
-#define RGW_RESHARD_H
-
-#include <vector>
-#include <initializer_list>
-#include <functional>
-#include <iterator>
-#include <algorithm>
-
-#include <boost/intrusive/list.hpp>
-#include <boost/asio/basic_waitable_timer.hpp>
-
-#include "include/common_fwd.h"
-#include "include/rados/librados.hpp"
-#include "common/ceph_time.h"
-#include "common/async/yield_context.h"
-#include "cls/rgw/cls_rgw_types.h"
-#include "cls/lock/cls_lock_client.h"
-
-#include "rgw_common.h"
-#include "common/fault_injector.h"
-
-
-class RGWReshard;
-namespace rgw { namespace sal {
- class RadosStore;
-} }
-
-using ReshardFaultInjector = FaultInjector<std::string_view>;
-
-class RGWBucketReshardLock {
- using Clock = ceph::coarse_mono_clock;
-
- rgw::sal::RadosStore* store;
- const std::string lock_oid;
- const bool ephemeral;
- rados::cls::lock::Lock internal_lock;
- std::chrono::seconds duration;
-
- Clock::time_point start_time;
- Clock::time_point renew_thresh;
-
- void reset_time(const Clock::time_point& now) {
- start_time = now;
- renew_thresh = start_time + duration / 2;
- }
-
-public:
- RGWBucketReshardLock(rgw::sal::RadosStore* _store,
- const std::string& reshard_lock_oid,
- bool _ephemeral);
- RGWBucketReshardLock(rgw::sal::RadosStore* _store,
- const RGWBucketInfo& bucket_info,
- bool _ephemeral) :
- RGWBucketReshardLock(_store, bucket_info.bucket.get_key(':'), _ephemeral)
- {}
-
- int lock(const DoutPrefixProvider *dpp);
- void unlock();
- int renew(const Clock::time_point&);
-
- bool should_renew(const Clock::time_point& now) const {
- return now >= renew_thresh;
- }
-}; // class RGWBucketReshardLock
-
-class RGWBucketReshard {
- public:
- using Clock = ceph::coarse_mono_clock;
-
- private:
- rgw::sal::RadosStore *store;
- RGWBucketInfo bucket_info;
- std::map<std::string, bufferlist> bucket_attrs;
-
- RGWBucketReshardLock reshard_lock;
- RGWBucketReshardLock* outer_reshard_lock;
-
-  // using an initializer_list as an array in contiguous memory,
-  // allocated all at once
- static const std::initializer_list<uint16_t> reshard_primes;
-
- int do_reshard(const rgw::bucket_index_layout_generation& current,
- const rgw::bucket_index_layout_generation& target,
- int max_entries,
- bool verbose,
- std::ostream *os,
- Formatter *formatter,
- const DoutPrefixProvider *dpp);
-public:
-
- // pass nullptr for the final parameter if no outer reshard lock to
- // manage
- RGWBucketReshard(rgw::sal::RadosStore* _store,
- const RGWBucketInfo& _bucket_info,
- const std::map<std::string, bufferlist>& _bucket_attrs,
- RGWBucketReshardLock* _outer_reshard_lock);
- int execute(int num_shards, ReshardFaultInjector& f,
- int max_op_entries, const DoutPrefixProvider *dpp,
- bool verbose = false, std::ostream *out = nullptr,
- ceph::Formatter *formatter = nullptr,
- RGWReshard *reshard_log = nullptr);
- int get_status(const DoutPrefixProvider *dpp, std::list<cls_rgw_bucket_instance_entry> *status);
- int cancel(const DoutPrefixProvider* dpp);
-
- static int clear_resharding(rgw::sal::RadosStore* store,
- RGWBucketInfo& bucket_info,
- std::map<std::string, bufferlist>& bucket_attrs,
- const DoutPrefixProvider* dpp);
-
- static uint32_t get_max_prime_shards() {
- return *std::crbegin(reshard_primes);
- }
-
- // returns the prime in our list less than or equal to the
- // parameter; the lowest value that can be returned is 1
- static uint32_t get_prime_shards_less_or_equal(uint32_t requested_shards) {
- auto it = std::upper_bound(reshard_primes.begin(), reshard_primes.end(),
- requested_shards);
- if (it == reshard_primes.begin()) {
- return 1;
- } else {
- return *(--it);
- }
- }
-
- // returns the prime in our list greater than or equal to the
- // parameter; if we do not have such a prime, 0 is returned
- static uint32_t get_prime_shards_greater_or_equal(
- uint32_t requested_shards)
- {
- auto it = std::lower_bound(reshard_primes.begin(), reshard_primes.end(),
- requested_shards);
- if (it == reshard_primes.end()) {
- return 0;
- } else {
- return *it;
- }
- }
-
- // returns a preferred number of shards given a calculated number of
- // shards based on max_dynamic_shards and the list of prime values
- static uint32_t get_preferred_shards(uint32_t suggested_shards,
- uint32_t max_dynamic_shards) {
-
- // use a prime if max is within our prime range, otherwise use
- // specified max
- const uint32_t absolute_max =
- max_dynamic_shards >= get_max_prime_shards() ?
- max_dynamic_shards :
- get_prime_shards_less_or_equal(max_dynamic_shards);
-
- // if we can use a prime number, use it, otherwise use suggested;
- // note get_prime_shards_greater_or_equal will return 0 if no prime in
- // prime range
- const uint32_t prime_ish_num_shards =
- std::max(get_prime_shards_greater_or_equal(suggested_shards),
- suggested_shards);
-
- // dynamic sharding cannot reshard more than defined maximum
- const uint32_t final_num_shards =
- std::min(prime_ish_num_shards, absolute_max);
-
- return final_num_shards;
- }
-
- const std::map<std::string, bufferlist>& get_bucket_attrs() const {
- return bucket_attrs;
- }
-
- // for multisite, the RGWBucketInfo keeps a history of old log generations
- // until all peers are done with them. prevent this log history from growing
- // too large by refusing to reshard the bucket until the old logs get trimmed
- static constexpr size_t max_bilog_history = 4;
-
- static bool can_reshard(const RGWBucketInfo& bucket,
- const RGWSI_Zone* zone_svc);
-}; // RGWBucketReshard
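A worked example of the get_preferred_shards() policy above, under the assumption of a short illustrative prime table (the real reshard_primes list is much longer): a suggestion of 30 rounds up to the next prime, 31, unless the dynamic-sharding maximum caps it first.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

static const std::vector<uint32_t> primes = {11, 13, 17, 19, 23, 29, 31};

static uint32_t prime_ge(uint32_t n) {  // 0 if no prime in range
  auto it = std::lower_bound(primes.begin(), primes.end(), n);
  return it == primes.end() ? 0 : *it;
}
static uint32_t prime_le(uint32_t n) {  // 1 if no prime in range
  auto it = std::upper_bound(primes.begin(), primes.end(), n);
  return it == primes.begin() ? 1 : *(--it);
}

int main() {
  // same policy as get_preferred_shards(): prefer a prime >= the suggestion,
  // but never exceed the (prime-capped) dynamic-sharding maximum
  auto preferred = [](uint32_t suggested, uint32_t max_dynamic) {
    const uint32_t absolute_max = max_dynamic >= primes.back()
      ? max_dynamic : prime_le(max_dynamic);
    const uint32_t prime_ish = std::max(prime_ge(suggested), suggested);
    return std::min(prime_ish, absolute_max);
  };
  assert(preferred(30, 100) == 31); // next prime above the suggestion
  assert(preferred(30, 20) == 19);  // capped by the largest prime <= 20
  return 0;
}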
-
-
-class RGWReshard {
-public:
- using Clock = ceph::coarse_mono_clock;
-
-private:
- rgw::sal::RadosStore* store;
- std::string lock_name;
- rados::cls::lock::Lock instance_lock;
- int num_logshards;
-
- bool verbose;
- std::ostream *out;
- Formatter *formatter;
-
- void get_logshard_oid(int shard_num, std::string *shard);
-protected:
- class ReshardWorker : public Thread, public DoutPrefixProvider {
- CephContext *cct;
- RGWReshard *reshard;
- ceph::mutex lock = ceph::make_mutex("ReshardWorker");
- ceph::condition_variable cond;
-
- public:
- ReshardWorker(CephContext * const _cct,
- RGWReshard * const _reshard)
- : cct(_cct),
- reshard(_reshard) {}
-
- void *entry() override;
- void stop();
-
- CephContext *get_cct() const override;
- unsigned get_subsys() const override;
- std::ostream& gen_prefix(std::ostream& out) const override;
- };
-
- ReshardWorker *worker = nullptr;
- std::atomic<bool> down_flag = { false };
-
- std::string get_logshard_key(const std::string& tenant, const std::string& bucket_name);
- void get_bucket_logshard_oid(const std::string& tenant, const std::string& bucket_name, std::string *oid);
-
-public:
- RGWReshard(rgw::sal::RadosStore* _store, bool _verbose = false, std::ostream *_out = nullptr, Formatter *_formatter = nullptr);
- int add(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry);
- int update(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info);
- int get(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry);
- int remove(const DoutPrefixProvider *dpp, const cls_rgw_reshard_entry& entry);
- int list(const DoutPrefixProvider *dpp, int logshard_num, std::string& marker, uint32_t max, std::list<cls_rgw_reshard_entry>& entries, bool *is_truncated);
- int clear_bucket_resharding(const DoutPrefixProvider *dpp, const std::string& bucket_instance_oid, cls_rgw_reshard_entry& entry);
-
- /* reshard thread */
- int process_entry(const cls_rgw_reshard_entry& entry, int max_entries,
- const DoutPrefixProvider *dpp);
- int process_single_logshard(int logshard_num, const DoutPrefixProvider *dpp);
- int process_all_logshards(const DoutPrefixProvider *dpp);
- bool going_down();
- void start_processor();
- void stop_processor();
-};
-
-class RGWReshardWait {
- public:
- // the blocking wait uses std::condition_variable::wait_for(), which uses the
- // std::chrono::steady_clock. use that for the async waits as well
- using Clock = std::chrono::steady_clock;
- private:
- const ceph::timespan duration;
- ceph::mutex mutex = ceph::make_mutex("RGWReshardWait::lock");
- ceph::condition_variable cond;
-
- struct Waiter : boost::intrusive::list_base_hook<> {
- using Executor = boost::asio::io_context::executor_type;
- using Timer = boost::asio::basic_waitable_timer<Clock,
- boost::asio::wait_traits<Clock>, Executor>;
- Timer timer;
- explicit Waiter(boost::asio::io_context& ioc) : timer(ioc) {}
- };
- boost::intrusive::list<Waiter> waiters;
-
- bool going_down{false};
-
-public:
- RGWReshardWait(ceph::timespan duration = std::chrono::seconds(5))
- : duration(duration) {}
- ~RGWReshardWait() {
- ceph_assert(going_down);
- }
- int wait(optional_yield y);
- // unblock any threads waiting on reshard
- void stop();
-};
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_RESOLVE_H
-#define CEPH_RGW_RESOLVE_H
+#pragma once
#include "rgw_common.h"
extern void rgw_init_resolver(void);
extern void rgw_shutdown_resolver(void);
extern RGWResolver *rgw_resolver;
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "rgw_op.h"
-#include "driver/rados/rgw_bucket.h"
-#include "rgw_rest_bucket.h"
-#include "rgw_sal.h"
-
-#include "include/str_list.h"
-
-#include "services/svc_sys_obj.h"
-#include "services/svc_zone.h"
-
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-class RGWOp_Bucket_Info : public RGWRESTOp {
-
-public:
- RGWOp_Bucket_Info() {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("buckets", RGW_CAP_READ);
- }
-
- void execute(optional_yield y) override;
-
- const char* name() const override { return "get_bucket_info"; }
-};
-
-void RGWOp_Bucket_Info::execute(optional_yield y)
-{
- RGWBucketAdminOpState op_state;
-
- bool fetch_stats;
-
- std::string bucket;
-
- string uid_str;
-
- RESTArgs::get_string(s, "uid", uid_str, &uid_str);
- rgw_user uid(uid_str);
-
- RESTArgs::get_string(s, "bucket", bucket, &bucket);
- RESTArgs::get_bool(s, "stats", false, &fetch_stats);
-
- op_state.set_user_id(uid);
- op_state.set_bucket_name(bucket);
- op_state.set_fetch_stats(fetch_stats);
-
- op_ret = RGWBucketAdminOp::info(driver, op_state, flusher, y, this);
-}
-
-class RGWOp_Get_Policy : public RGWRESTOp {
-
-public:
- RGWOp_Get_Policy() {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("buckets", RGW_CAP_READ);
- }
-
- void execute(optional_yield y) override;
-
- const char* name() const override { return "get_policy"; }
-};
-
-void RGWOp_Get_Policy::execute(optional_yield y)
-{
- RGWBucketAdminOpState op_state;
-
- std::string bucket;
- std::string object;
-
- RESTArgs::get_string(s, "bucket", bucket, &bucket);
- RESTArgs::get_string(s, "object", object, &object);
-
- op_state.set_bucket_name(bucket);
- op_state.set_object(object);
-
- op_ret = RGWBucketAdminOp::get_policy(driver, op_state, flusher, this);
-}
-
-class RGWOp_Check_Bucket_Index : public RGWRESTOp {
-
-public:
- RGWOp_Check_Bucket_Index() {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("buckets", RGW_CAP_WRITE);
- }
-
- void execute(optional_yield y) override;
-
- const char* name() const override { return "check_bucket_index"; }
-};
-
-void RGWOp_Check_Bucket_Index::execute(optional_yield y)
-{
- std::string bucket;
-
- bool fix_index;
- bool check_objects;
-
- RGWBucketAdminOpState op_state;
-
- RESTArgs::get_string(s, "bucket", bucket, &bucket);
- RESTArgs::get_bool(s, "fix", false, &fix_index);
- RESTArgs::get_bool(s, "check-objects", false, &check_objects);
-
- op_state.set_bucket_name(bucket);
- op_state.set_fix_index(fix_index);
- op_state.set_check_objects(check_objects);
-
- op_ret = RGWBucketAdminOp::check_index(driver, op_state, flusher, s->yield, s);
-}
-
-class RGWOp_Bucket_Link : public RGWRESTOp {
-
-public:
- RGWOp_Bucket_Link() {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("buckets", RGW_CAP_WRITE);
- }
-
- void execute(optional_yield y) override;
-
- const char* name() const override { return "link_bucket"; }
-};
-
-void RGWOp_Bucket_Link::execute(optional_yield y)
-{
- std::string uid_str;
- std::string bucket;
- std::string bucket_id;
- std::string new_bucket_name;
-
- RGWBucketAdminOpState op_state;
-
- RESTArgs::get_string(s, "uid", uid_str, &uid_str);
- RESTArgs::get_string(s, "bucket", bucket, &bucket);
- RESTArgs::get_string(s, "bucket-id", bucket_id, &bucket_id);
- RESTArgs::get_string(s, "new-bucket-name", new_bucket_name, &new_bucket_name);
-
- rgw_user uid(uid_str);
- op_state.set_user_id(uid);
- op_state.set_bucket_name(bucket);
- op_state.set_bucket_id(bucket_id);
- op_state.set_new_bucket_name(new_bucket_name);
-
- bufferlist data;
- op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
- if (op_ret < 0) {
- ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
- return;
- }
- op_ret = RGWBucketAdminOp::link(driver, op_state, s);
-}
-
-class RGWOp_Bucket_Unlink : public RGWRESTOp {
-
-public:
- RGWOp_Bucket_Unlink() {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("buckets", RGW_CAP_WRITE);
- }
-
- void execute(optional_yield y) override;
-
- const char* name() const override { return "unlink_bucket"; }
-};
-
-void RGWOp_Bucket_Unlink::execute(optional_yield y)
-{
- std::string uid_str;
- std::string bucket;
-
- RGWBucketAdminOpState op_state;
-
- RESTArgs::get_string(s, "uid", uid_str, &uid_str);
- rgw_user uid(uid_str);
-
- RESTArgs::get_string(s, "bucket", bucket, &bucket);
-
- op_state.set_user_id(uid);
- op_state.set_bucket_name(bucket);
-
- bufferlist data;
- op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
- if (op_ret < 0) {
- ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
- return;
- }
- op_ret = RGWBucketAdminOp::unlink(driver, op_state, s);
-}
-
-class RGWOp_Bucket_Remove : public RGWRESTOp {
-
-public:
- RGWOp_Bucket_Remove() {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("buckets", RGW_CAP_WRITE);
- }
-
- void execute(optional_yield y) override;
-
- const char* name() const override { return "remove_bucket"; }
-};
-
-void RGWOp_Bucket_Remove::execute(optional_yield y)
-{
- std::string bucket_name;
- bool delete_children;
- std::unique_ptr<rgw::sal::Bucket> bucket;
-
- RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name);
- RESTArgs::get_bool(s, "purge-objects", false, &delete_children);
-
- /* FIXME We're abusing the owner of the bucket to pass the user, so that it can be forwarded to
- * the master. This user is actually the OP caller, not the bucket owner. */
- op_ret = driver->get_bucket(s, s->user.get(), string(), bucket_name, &bucket, y);
- if (op_ret < 0) {
- ldpp_dout(this, 0) << "get_bucket returned ret=" << op_ret << dendl;
- if (op_ret == -ENOENT) {
- op_ret = -ERR_NO_SUCH_BUCKET;
- }
- return;
- }
-
- op_ret = bucket->remove_bucket(s, delete_children, true, &s->info, s->yield);
-}
-
-class RGWOp_Set_Bucket_Quota : public RGWRESTOp {
-
-public:
- RGWOp_Set_Bucket_Quota() {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("buckets", RGW_CAP_WRITE);
- }
-
- void execute(optional_yield y) override;
-
- const char* name() const override { return "set_bucket_quota"; }
-};
-
-#define QUOTA_INPUT_MAX_LEN 1024
-
-void RGWOp_Set_Bucket_Quota::execute(optional_yield y)
-{
- bool uid_arg_existed = false;
- std::string uid_str;
- RESTArgs::get_string(s, "uid", uid_str, &uid_str, &uid_arg_existed);
- if (! uid_arg_existed) {
- op_ret = -EINVAL;
- return;
- }
- rgw_user uid(uid_str);
- bool bucket_arg_existed = false;
- std::string bucket_name;
- RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name, &bucket_arg_existed);
- if (! bucket_arg_existed) {
- op_ret = -EINVAL;
- return;
- }
-
- bool use_http_params;
-
- if (s->content_length > 0) {
- use_http_params = false;
- } else {
- const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING");
- use_http_params = (!encoding || strcmp(encoding, "chunked") != 0);
- }
- RGWQuotaInfo quota;
- if (!use_http_params) {
- bool empty;
- op_ret = get_json_input(driver->ctx(), s, quota, QUOTA_INPUT_MAX_LEN, &empty);
- if (op_ret < 0) {
- if (!empty)
- return;
-      /* was probably chunked input with no content provided; configure via http params */
- use_http_params = true;
- }
- }
- if (use_http_params) {
- std::unique_ptr<rgw::sal::Bucket> bucket;
- op_ret = driver->get_bucket(s, nullptr, uid.tenant, bucket_name, &bucket, s->yield);
- if (op_ret < 0) {
- return;
- }
- RGWQuotaInfo *old_quota = &bucket->get_info().quota;
- int64_t old_max_size_kb = rgw_rounded_kb(old_quota->max_size);
- int64_t max_size_kb;
- bool has_max_size_kb = false;
- RESTArgs::get_int64(s, "max-objects", old_quota->max_objects, "a.max_objects);
- RESTArgs::get_int64(s, "max-size", old_quota->max_size, "a.max_size);
- RESTArgs::get_int64(s, "max-size-kb", old_max_size_kb, &max_size_kb, &has_max_size_kb);
- if (has_max_size_kb)
- quota.max_size = max_size_kb * 1024;
- RESTArgs::get_bool(s, "enabled", old_quota->enabled, "a.enabled);
- }
-
- RGWBucketAdminOpState op_state;
- op_state.set_user_id(uid);
- op_state.set_bucket_name(bucket_name);
- op_state.set_quota(quota);
-
- op_ret = RGWBucketAdminOp::set_quota(driver, op_state, s);
-}
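The handler above accepts the quota either as a JSON body or, when the body is empty, as query parameters, with "max-size-kb" overriding "max-size" after scaling to bytes. A sketch of just that parameter logic, with a hypothetical QuotaArgs type:

#include <cstdint>
#include <optional>

struct QuotaArgs {                      // hypothetical parsed query args
  std::optional<int64_t> max_size;      // "max-size", in bytes
  std::optional<int64_t> max_size_kb;   // "max-size-kb", in KiB
};

int64_t effective_max_size(const QuotaArgs& args, int64_t current) {
  if (args.max_size_kb) {
    return *args.max_size_kb * 1024;    // kb form wins, scaled to bytes
  }
  return args.max_size.value_or(current); // else keep the old value
}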
-
-class RGWOp_Sync_Bucket : public RGWRESTOp {
-
-public:
- RGWOp_Sync_Bucket() {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("buckets", RGW_CAP_WRITE);
- }
-
- void execute(optional_yield y) override;
-
- const char* name() const override { return "sync_bucket"; }
-};
-
-void RGWOp_Sync_Bucket::execute(optional_yield y)
-{
- std::string bucket;
- std::string tenant;
- bool sync_bucket;
-
- RGWBucketAdminOpState op_state;
- RESTArgs::get_string(s, "bucket", bucket, &bucket);
- RESTArgs::get_string(s, "tenant", tenant, &tenant);
- RESTArgs::get_bool(s, "sync", true, &sync_bucket);
-
- op_state.set_bucket_name(bucket);
- op_state.set_tenant(tenant);
- op_state.set_sync_bucket(sync_bucket);
-
- op_ret = RGWBucketAdminOp::sync_bucket(driver, op_state, s);
-}
-
-class RGWOp_Object_Remove: public RGWRESTOp {
-
-public:
- RGWOp_Object_Remove() {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("buckets", RGW_CAP_WRITE);
- }
-
- void execute(optional_yield y) override;
-
- const char* name() const override { return "remove_object"; }
-};
-
-void RGWOp_Object_Remove::execute(optional_yield y)
-{
- std::string bucket;
- std::string object;
-
- RGWBucketAdminOpState op_state;
-
- RESTArgs::get_string(s, "bucket", bucket, &bucket);
- RESTArgs::get_string(s, "object", object, &object);
-
- op_state.set_bucket_name(bucket);
- op_state.set_object(object);
-
- op_ret = RGWBucketAdminOp::remove_object(driver, op_state, s);
-}
-
-
-RGWOp *RGWHandler_Bucket::op_get()
-{
-
- if (s->info.args.sub_resource_exists("policy"))
- return new RGWOp_Get_Policy;
-
- if (s->info.args.sub_resource_exists("index"))
- return new RGWOp_Check_Bucket_Index;
-
- return new RGWOp_Bucket_Info;
-}
-
-RGWOp *RGWHandler_Bucket::op_put()
-{
- if (s->info.args.sub_resource_exists("quota"))
- return new RGWOp_Set_Bucket_Quota;
-
- if (s->info.args.sub_resource_exists("sync"))
- return new RGWOp_Sync_Bucket;
-
- return new RGWOp_Bucket_Link;
-}
-
-RGWOp *RGWHandler_Bucket::op_post()
-{
- return new RGWOp_Bucket_Unlink;
-}
-
-RGWOp *RGWHandler_Bucket::op_delete()
-{
- if (s->info.args.sub_resource_exists("object"))
- return new RGWOp_Object_Remove;
-
- return new RGWOp_Bucket_Remove;
-}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#pragma once
-
-#include "rgw_rest.h"
-#include "rgw_rest_s3.h"
-
-
-class RGWHandler_Bucket : public RGWHandler_Auth_S3 {
-protected:
- RGWOp *op_get() override;
- RGWOp *op_put() override;
- RGWOp *op_post() override;
- RGWOp *op_delete() override;
-public:
- using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
- ~RGWHandler_Bucket() override = default;
-
- int read_permissions(RGWOp*, optional_yield y) override {
- return 0;
- }
-};
-
-class RGWRESTMgr_Bucket : public RGWRESTMgr {
-public:
- RGWRESTMgr_Bucket() = default;
- ~RGWRESTMgr_Bucket() override = default;
-
- RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
- req_state*,
- const rgw::auth::StrategyRegistry& auth_registry,
- const std::string&) override {
- return new RGWHandler_Bucket(auth_registry);
- }
-};
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2013 eNovance SAS <licensing@enovance.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "common/ceph_json.h"
-#include "common/strtol.h"
-#include "rgw_rest.h"
-#include "rgw_op.h"
-#include "rgw_rest_s3.h"
-#include "rgw_rest_log.h"
-#include "rgw_client_io.h"
-#include "rgw_sync.h"
-#include "rgw_data_sync.h"
-#include "rgw_common.h"
-#include "rgw_zone.h"
-#include "rgw_mdlog.h"
-#include "rgw_datalog_notify.h"
-#include "rgw_trim_bilog.h"
-
-#include "services/svc_zone.h"
-#include "services/svc_mdlog.h"
-#include "services/svc_bilog_rados.h"
-
-#include "common/errno.h"
-#include "include/ceph_assert.h"
-
-#define dout_context g_ceph_context
-#define LOG_CLASS_LIST_MAX_ENTRIES (1000)
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-void RGWOp_MDLog_List::execute(optional_yield y) {
- string period = s->info.args.get("period");
- string shard = s->info.args.get("id");
- string max_entries_str = s->info.args.get("max-entries");
- string marker = s->info.args.get("marker"),
- err;
- void *handle;
- unsigned shard_id, max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
-
- if (s->info.args.exists("start-time") ||
- s->info.args.exists("end-time")) {
- ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl;
- op_ret = -EINVAL;
- return;
- }
-
- shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
- if (!err.empty()) {
- ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
- op_ret = -EINVAL;
- return;
- }
-
- if (!max_entries_str.empty()) {
- max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err);
- if (!err.empty()) {
- ldpp_dout(this, 5) << "Error parsing max-entries " << max_entries_str << dendl;
- op_ret = -EINVAL;
- return;
- }
- if (max_entries > LOG_CLASS_LIST_MAX_ENTRIES) {
- max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
- }
- }
-
- if (period.empty()) {
-    ldpp_dout(this, 5) << "Missing period id; trying to use current" << dendl;
- period = driver->get_zone()->get_current_period_id();
- if (period.empty()) {
- ldpp_dout(this, 5) << "Missing period id" << dendl;
- op_ret = -EINVAL;
- return;
- }
- }
-
- RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
-
- meta_log.init_list_entries(shard_id, {}, {}, marker, &handle);
-
- op_ret = meta_log.list_entries(this, handle, max_entries, entries,
- &last_marker, &truncated);
-
- meta_log.complete_list_entries(handle);
-}
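The numeric-argument handling above follows a consistent pattern: parse with an error out-param, reject the request on parse failure, and silently clamp max-entries to the server cap. A compact sketch of that pattern with standard-library parsing in place of strict_strtol:

#include <cstdlib>
#include <string>

// parse a decimal arg; on bad input set *err and return def; clamp to cap
unsigned parse_clamped(const std::string& s, unsigned def, unsigned cap,
                       std::string* err)
{
  if (s.empty()) return def;
  char* end = nullptr;
  const unsigned long v = std::strtoul(s.c_str(), &end, 10);
  if (end == s.c_str() || *end != '\0') {
    *err = "invalid number: " + s;
    return def;
  }
  return v > cap ? cap : static_cast<unsigned>(v);
}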
-
-void RGWOp_MDLog_List::send_response() {
- set_req_state_err(s, op_ret);
- dump_errno(s);
- end_header(s);
-
- if (op_ret < 0)
- return;
-
- s->formatter->open_object_section("log_entries");
- s->formatter->dump_string("marker", last_marker);
- s->formatter->dump_bool("truncated", truncated);
- {
- s->formatter->open_array_section("entries");
- for (list<cls_log_entry>::iterator iter = entries.begin();
- iter != entries.end(); ++iter) {
- cls_log_entry& entry = *iter;
- static_cast<rgw::sal::RadosStore*>(driver)->ctl()->meta.mgr->dump_log_entry(entry, s->formatter);
- flusher.flush();
- }
- s->formatter->close_section();
- }
- s->formatter->close_section();
- flusher.flush();
-}
-
-void RGWOp_MDLog_Info::execute(optional_yield y) {
- num_objects = s->cct->_conf->rgw_md_log_max_shards;
- period = static_cast<rgw::sal::RadosStore*>(driver)->svc()->mdlog->read_oldest_log_period(y, s);
- op_ret = period.get_error();
-}
-
-void RGWOp_MDLog_Info::send_response() {
- set_req_state_err(s, op_ret);
- dump_errno(s);
- end_header(s);
-
- s->formatter->open_object_section("mdlog");
- s->formatter->dump_unsigned("num_objects", num_objects);
- if (period) {
- s->formatter->dump_string("period", period.get_period().get_id());
- s->formatter->dump_unsigned("realm_epoch", period.get_epoch());
- }
- s->formatter->close_section();
- flusher.flush();
-}
-
-void RGWOp_MDLog_ShardInfo::execute(optional_yield y) {
- string period = s->info.args.get("period");
- string shard = s->info.args.get("id");
- string err;
-
- unsigned shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
- if (!err.empty()) {
- ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
- op_ret = -EINVAL;
- return;
- }
-
- if (period.empty()) {
- ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
- period = driver->get_zone()->get_current_period_id();
-
- if (period.empty()) {
- ldpp_dout(this, 5) << "Missing period id" << dendl;
- op_ret = -EINVAL;
- return;
- }
- }
- RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
-
- op_ret = meta_log.get_info(this, shard_id, &info);
-}
-
-void RGWOp_MDLog_ShardInfo::send_response() {
- set_req_state_err(s, op_ret);
- dump_errno(s);
- end_header(s);
-
- encode_json("info", info, s->formatter);
- flusher.flush();
-}
-
-void RGWOp_MDLog_Delete::execute(optional_yield y) {
- string marker = s->info.args.get("marker"),
- period = s->info.args.get("period"),
- shard = s->info.args.get("id"),
- err;
- unsigned shard_id;
-
-
- if (s->info.args.exists("start-time") ||
- s->info.args.exists("end-time")) {
- ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl;
- op_ret = -EINVAL;
- }
-
- if (s->info.args.exists("start-marker")) {
- ldpp_dout(this, 5) << "start-marker is no longer accepted" << dendl;
- op_ret = -EINVAL;
- }
-
- if (s->info.args.exists("end-marker")) {
- if (!s->info.args.exists("marker")) {
- marker = s->info.args.get("end-marker");
- } else {
- ldpp_dout(this, 5) << "end-marker and marker cannot both be provided" << dendl;
- op_ret = -EINVAL;
- }
- }
-
- op_ret = 0;
-
- shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
- if (!err.empty()) {
- ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
- op_ret = -EINVAL;
- return;
- }
-
- if (marker.empty()) { /* bounding end */
- op_ret = -EINVAL;
- return;
- }
-
- if (period.empty()) {
- ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
- period = driver->get_zone()->get_current_period_id();
-
- if (period.empty()) {
- ldpp_dout(this, 5) << "Missing period id" << dendl;
- op_ret = -EINVAL;
- return;
- }
- }
- RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
-
- op_ret = meta_log.trim(this, shard_id, {}, {}, {}, marker);
-}
-
-void RGWOp_MDLog_Lock::execute(optional_yield y) {
- string period, shard_id_str, duration_str, locker_id, zone_id;
- unsigned shard_id;
-
- op_ret = 0;
-
- period = s->info.args.get("period");
- shard_id_str = s->info.args.get("id");
- duration_str = s->info.args.get("length");
- locker_id = s->info.args.get("locker-id");
- zone_id = s->info.args.get("zone-id");
-
- if (period.empty()) {
- ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
- period = driver->get_zone()->get_current_period_id();
- }
-
- if (period.empty() ||
- shard_id_str.empty() ||
- (duration_str.empty()) ||
- locker_id.empty() ||
- zone_id.empty()) {
- ldpp_dout(this, 5) << "Error invalid parameter list" << dendl;
- op_ret = -EINVAL;
- return;
- }
-
- string err;
- shard_id = (unsigned)strict_strtol(shard_id_str.c_str(), 10, &err);
- if (!err.empty()) {
- ldpp_dout(this, 5) << "Error parsing shard_id param " << shard_id_str << dendl;
- op_ret = -EINVAL;
- return;
- }
-
- RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
- unsigned dur;
- dur = (unsigned)strict_strtol(duration_str.c_str(), 10, &err);
- if (!err.empty() || dur <= 0) {
- ldpp_dout(this, 5) << "invalid length param " << duration_str << dendl;
- op_ret = -EINVAL;
- return;
- }
- op_ret = meta_log.lock_exclusive(s, shard_id, make_timespan(dur), zone_id,
- locker_id);
- if (op_ret == -EBUSY)
- op_ret = -ERR_LOCKED;
-}
-
-void RGWOp_MDLog_Unlock::execute(optional_yield y) {
- string period, shard_id_str, locker_id, zone_id;
- unsigned shard_id;
-
- op_ret = 0;
-
- period = s->info.args.get("period");
- shard_id_str = s->info.args.get("id");
- locker_id = s->info.args.get("locker-id");
- zone_id = s->info.args.get("zone-id");
-
- if (period.empty()) {
- ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
- period = driver->get_zone()->get_current_period_id();
- }
-
- if (period.empty() ||
- shard_id_str.empty() ||
- locker_id.empty() ||
- zone_id.empty()) {
- ldpp_dout(this, 5) << "Error invalid parameter list" << dendl;
- op_ret = -EINVAL;
- return;
- }
-
- string err;
- shard_id = (unsigned)strict_strtol(shard_id_str.c_str(), 10, &err);
- if (!err.empty()) {
- ldpp_dout(this, 5) << "Error parsing shard_id param " << shard_id_str << dendl;
- op_ret = -EINVAL;
- return;
- }
-
- RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
- op_ret = meta_log.unlock(s, shard_id, zone_id, locker_id);
-}
-
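Illustrative request shapes for the lock/unlock ops above, assembled from the parameters they parse; the /admin/log path assumes the usual admin "log" resource mount, and all values are hypothetical.

// POST /admin/log?type=metadata&lock&period=<pid>&id=0&length=30&locker-id=abc&zone-id=z1
// POST /admin/log?type=metadata&unlock&period=<pid>&id=0&locker-id=abc&zone-id=z1
// A lock held by another locker fails with -EBUSY, which the handler maps to
// -ERR_LOCKED before responding.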
-void RGWOp_MDLog_Notify::execute(optional_yield y) {
-#define LARGE_ENOUGH_BUF (128 * 1024)
-
- int r = 0;
- bufferlist data;
- std::tie(r, data) = read_all_input(s, LARGE_ENOUGH_BUF);
- if (r < 0) {
- op_ret = r;
- return;
- }
-
- char* buf = data.c_str();
- ldpp_dout(this, 20) << __func__ << "(): read data: " << buf << dendl;
-
- JSONParser p;
- r = p.parse(buf, data.length());
- if (r < 0) {
- ldpp_dout(this, 0) << "ERROR: failed to parse JSON" << dendl;
- op_ret = r;
- return;
- }
-
- set<int> updated_shards;
- try {
- decode_json_obj(updated_shards, &p);
- } catch (JSONDecoder::err& err) {
- ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl;
- op_ret = -EINVAL;
- return;
- }
-
- if (driver->ctx()->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
- for (set<int>::iterator iter = updated_shards.begin(); iter != updated_shards.end(); ++iter) {
- ldpp_dout(this, 20) << __func__ << "(): updated shard=" << *iter << dendl;
- }
- }
-
- driver->wakeup_meta_sync_shards(updated_shards);
-
- op_ret = 0;
-}
-
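A minimal sketch of the notify payload, assuming decode_json_obj() on a std::set<int> consumes a JSON array of shard ids (values illustrative):

// POST /admin/log?type=metadata&notify
// [0, 5, 12]
// Each listed metadata shard is woken for sync via wakeup_meta_sync_shards().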
-void RGWOp_BILog_List::execute(optional_yield y) {
- bool gen_specified = false;
- string tenant_name = s->info.args.get("tenant"),
- bucket_name = s->info.args.get("bucket"),
- marker = s->info.args.get("marker"),
- max_entries_str = s->info.args.get("max-entries"),
- bucket_instance = s->info.args.get("bucket-instance"),
- gen_str = s->info.args.get("generation", &gen_specified),
- format_version_str = s->info.args.get("format-ver");
- std::unique_ptr<rgw::sal::Bucket> bucket;
- rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name));
-
- unsigned max_entries;
-
- if (bucket_name.empty() && bucket_instance.empty()) {
- ldpp_dout(this, 5) << "ERROR: neither bucket nor bucket instance specified" << dendl;
- op_ret = -EINVAL;
- return;
- }
-
- string err;
- std::optional<uint64_t> gen;
- if (gen_specified) {
- gen = strict_strtoll(gen_str.c_str(), 10, &err);
- if (!err.empty()) {
- ldpp_dout(s, 5) << "Error parsing generation param " << gen_str << dendl;
- op_ret = -EINVAL;
- return;
- }
- }
-
- if (!format_version_str.empty()) {
- format_ver = strict_strtoll(format_version_str.c_str(), 10, &err);
- if (!err.empty()) {
-      ldpp_dout(s, 5) << "Failed to parse format-ver param: " << format_version_str << dendl;
- op_ret = -EINVAL;
- return;
- }
- }
-
- int shard_id;
- string bn;
- op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id);
- if (op_ret < 0) {
- return;
- }
-
- if (!bucket_instance.empty()) {
- b.name = bn;
- b.bucket_id = bucket_instance;
- }
- op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
- if (op_ret < 0) {
- ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl;
- return;
- }
-
- const auto& logs = bucket->get_info().layout.logs;
- if (logs.empty()) {
- ldpp_dout(s, 5) << "ERROR: bucket=" << bucket_name << " has no log layouts" << dendl;
- op_ret = -ENOENT;
- return;
- }
-
- auto log = std::prev(logs.end());
- if (gen) {
- log = std::find_if(logs.begin(), logs.end(), rgw::matches_gen(*gen));
- if (log == logs.end()) {
- ldpp_dout(s, 5) << "ERROR: no log layout with gen=" << *gen << dendl;
- op_ret = -ENOENT;
- return;
- }
- }
- if (auto next = std::next(log); next != logs.end()) {
- next_log_layout = *next; // get the next log after the current latest
- }
- auto& log_layout = *log; // current log layout for log listing
-
- unsigned count = 0;
-
-
- max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err);
- if (!err.empty())
- max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
-
- send_response();
- do {
- list<rgw_bi_log_entry> entries;
- int ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->bilog_rados->log_list(s, bucket->get_info(), log_layout, shard_id,
- marker, max_entries - count,
- entries, &truncated);
- if (ret < 0) {
- ldpp_dout(this, 5) << "ERROR: list_bi_log_entries()" << dendl;
- return;
- }
-
- count += entries.size();
-
- send_response(entries, marker);
- } while (truncated && count < max_entries);
-
- send_response_end();
-}
-
-void RGWOp_BILog_List::send_response() {
- if (sent_header)
- return;
-
- set_req_state_err(s, op_ret);
- dump_errno(s);
- end_header(s);
-
- sent_header = true;
-
- if (op_ret < 0)
- return;
-
- if (format_ver >= 2) {
- s->formatter->open_object_section("result");
- }
-
- s->formatter->open_array_section("entries");
-}
-
-void RGWOp_BILog_List::send_response(list<rgw_bi_log_entry>& entries, string& marker)
-{
- for (list<rgw_bi_log_entry>::iterator iter = entries.begin(); iter != entries.end(); ++iter) {
- rgw_bi_log_entry& entry = *iter;
- encode_json("entry", entry, s->formatter);
-
- marker = entry.id;
- flusher.flush();
- }
-}
-
-void RGWOp_BILog_List::send_response_end() {
- s->formatter->close_section();
-
- if (format_ver >= 2) {
- encode_json("truncated", truncated, s->formatter);
-
- if (next_log_layout) {
- s->formatter->open_object_section("next_log");
- encode_json("generation", next_log_layout->gen, s->formatter);
- encode_json("num_shards", next_log_layout->layout.in_index.layout.num_shards, s->formatter);
- s->formatter->close_section(); // next_log
- }
-
- s->formatter->close_section(); // result
- }
-
- flusher.flush();
-}
-
-void RGWOp_BILog_Info::execute(optional_yield y) {
- string tenant_name = s->info.args.get("tenant"),
- bucket_name = s->info.args.get("bucket"),
- bucket_instance = s->info.args.get("bucket-instance");
- std::unique_ptr<rgw::sal::Bucket> bucket;
- rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name));
-
- if (bucket_name.empty() && bucket_instance.empty()) {
- ldpp_dout(this, 5) << "ERROR: neither bucket nor bucket instance specified" << dendl;
- op_ret = -EINVAL;
- return;
- }
-
- int shard_id;
- string bn;
- op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id);
- if (op_ret < 0) {
- return;
- }
-
- if (!bucket_instance.empty()) {
- b.name = bn;
- b.bucket_id = bucket_instance;
- }
- op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
- if (op_ret < 0) {
- ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl;
- return;
- }
-
- const auto& logs = bucket->get_info().layout.logs;
- if (logs.empty()) {
- ldpp_dout(s, 5) << "ERROR: bucket=" << bucket_name << " has no log layouts" << dendl;
- op_ret = -ENOENT;
- return;
- }
-
- map<RGWObjCategory, RGWStorageStats> stats;
- const auto& index = log_to_index_layout(logs.back());
-
- int ret = bucket->read_stats(s, index, shard_id, &bucket_ver, &master_ver, stats, &max_marker, &syncstopped);
- if (ret < 0 && ret != -ENOENT) {
- op_ret = ret;
- return;
- }
-
- oldest_gen = logs.front().gen;
- latest_gen = logs.back().gen;
-
- for (auto& log : logs) {
- uint32_t num_shards = log.layout.in_index.layout.num_shards;
- generations.push_back({log.gen, num_shards});
- }
-}
-
-void RGWOp_BILog_Info::send_response() {
- set_req_state_err(s, op_ret);
- dump_errno(s);
- end_header(s);
-
- if (op_ret < 0)
- return;
-
- s->formatter->open_object_section("info");
- encode_json("bucket_ver", bucket_ver, s->formatter);
- encode_json("master_ver", master_ver, s->formatter);
- encode_json("max_marker", max_marker, s->formatter);
- encode_json("syncstopped", syncstopped, s->formatter);
- encode_json("oldest_gen", oldest_gen, s->formatter);
- encode_json("latest_gen", latest_gen, s->formatter);
- encode_json("generations", generations, s->formatter);
- s->formatter->close_section();
-
- flusher.flush();
-}
-
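Illustrative response shape for the info op above; the keys mirror the encode_json() calls, and the generation entries assume store_gen_shards dumps its gen/num_shards fields:

// {
//   "bucket_ver": "...", "master_ver": "...", "max_marker": "...",
//   "syncstopped": false, "oldest_gen": 0, "latest_gen": 1,
//   "generations": [ { "gen": 0, "num_shards": 11 }, ... ]
// }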
-void RGWOp_BILog_Delete::execute(optional_yield y) {
- bool gen_specified = false;
- string tenant_name = s->info.args.get("tenant"),
- bucket_name = s->info.args.get("bucket"),
- start_marker = s->info.args.get("start-marker"),
- end_marker = s->info.args.get("end-marker"),
- bucket_instance = s->info.args.get("bucket-instance"),
- gen_str = s->info.args.get("generation", &gen_specified);
-
- std::unique_ptr<rgw::sal::Bucket> bucket;
- rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name));
-
- op_ret = 0;
- if ((bucket_name.empty() && bucket_instance.empty()) ||
- end_marker.empty()) {
-    ldpp_dout(this, 5) << "ERROR: bucket or bucket-instance is required, and end-marker is mandatory" << dendl;
- op_ret = -EINVAL;
- return;
- }
-
- string err;
- uint64_t gen = 0;
- if (gen_specified) {
- gen = strict_strtoll(gen_str.c_str(), 10, &err);
- if (!err.empty()) {
- ldpp_dout(s, 5) << "Error parsing generation param " << gen_str << dendl;
- op_ret = -EINVAL;
- return;
- }
- }
-
- int shard_id;
- string bn;
- op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id);
- if (op_ret < 0) {
- return;
- }
-
- if (!bucket_instance.empty()) {
- b.name = bn;
- b.bucket_id = bucket_instance;
- }
- op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
- if (op_ret < 0) {
- ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl;
- return;
- }
-
- op_ret = bilog_trim(this, static_cast<rgw::sal::RadosStore*>(driver),
- bucket->get_info(), gen, shard_id,
- start_marker, end_marker);
- if (op_ret < 0) {
- ldpp_dout(s, 5) << "bilog_trim failed with op_ret=" << op_ret << dendl;
- }
-
- return;
-}
-
-void RGWOp_DATALog_List::execute(optional_yield y) {
- string shard = s->info.args.get("id");
-
- string max_entries_str = s->info.args.get("max-entries"),
- marker = s->info.args.get("marker"),
- err;
- unsigned shard_id, max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
-
- if (s->info.args.exists("start-time") ||
- s->info.args.exists("end-time")) {
- ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl;
- op_ret = -EINVAL;
- }
-
- s->info.args.get_bool("extra-info", &extra_info, false);
-
- shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
- if (!err.empty()) {
- ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
- op_ret = -EINVAL;
- return;
- }
-
- if (!max_entries_str.empty()) {
- max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err);
- if (!err.empty()) {
- ldpp_dout(this, 5) << "Error parsing max-entries " << max_entries_str << dendl;
- op_ret = -EINVAL;
- return;
- }
- if (max_entries > LOG_CLASS_LIST_MAX_ENTRIES) {
- max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
- }
- }
-
- // Note that last_marker is updated to be the marker of the last
- // entry listed
- op_ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->datalog_rados->list_entries(this, shard_id,
- max_entries, entries,
- marker, &last_marker,
- &truncated);
-}
-
-void RGWOp_DATALog_List::send_response() {
- set_req_state_err(s, op_ret);
- dump_errno(s);
- end_header(s);
-
- if (op_ret < 0)
- return;
-
- s->formatter->open_object_section("log_entries");
- s->formatter->dump_string("marker", last_marker);
- s->formatter->dump_bool("truncated", truncated);
- {
- s->formatter->open_array_section("entries");
- for (const auto& entry : entries) {
- if (!extra_info) {
- encode_json("entry", entry.entry, s->formatter);
- } else {
- encode_json("entry", entry, s->formatter);
- }
- flusher.flush();
- }
- s->formatter->close_section();
- }
- s->formatter->close_section();
- flusher.flush();
-}
-
-
-void RGWOp_DATALog_Info::execute(optional_yield y) {
- num_objects = s->cct->_conf->rgw_data_log_num_shards;
- op_ret = 0;
-}
-
-void RGWOp_DATALog_Info::send_response() {
- set_req_state_err(s, op_ret);
- dump_errno(s);
- end_header(s);
-
- s->formatter->open_object_section("num_objects");
- s->formatter->dump_unsigned("num_objects", num_objects);
- s->formatter->close_section();
- flusher.flush();
-}
-
-void RGWOp_DATALog_ShardInfo::execute(optional_yield y) {
- string shard = s->info.args.get("id");
- string err;
-
- unsigned shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
- if (!err.empty()) {
- ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
- op_ret = -EINVAL;
- return;
- }
-
- op_ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->datalog_rados->get_info(this, shard_id, &info);
-}
-
-void RGWOp_DATALog_ShardInfo::send_response() {
- set_req_state_err(s, op_ret);
- dump_errno(s);
- end_header(s);
-
- encode_json("info", info, s->formatter);
- flusher.flush();
-}
-
-void RGWOp_DATALog_Notify::execute(optional_yield y) {
- string source_zone = s->info.args.get("source-zone");
-#define LARGE_ENOUGH_BUF (128 * 1024)
-
- int r = 0;
- bufferlist data;
- std::tie(r, data) = read_all_input(s, LARGE_ENOUGH_BUF);
- if (r < 0) {
- op_ret = r;
- return;
- }
-
- char* buf = data.c_str();
- ldpp_dout(this, 20) << __func__ << "(): read data: " << buf << dendl;
-
- JSONParser p;
- r = p.parse(buf, data.length());
- if (r < 0) {
- ldpp_dout(this, 0) << "ERROR: failed to parse JSON" << dendl;
- op_ret = r;
- return;
- }
-
- bc::flat_map<int, bc::flat_set<rgw_data_notify_entry>> updated_shards;
- try {
- auto decoder = rgw_data_notify_v1_decoder{updated_shards};
- decode_json_obj(decoder, &p);
- } catch (JSONDecoder::err& err) {
- ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl;
- op_ret = -EINVAL;
- return;
- }
-
- if (driver->ctx()->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
- for (bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >::iterator iter = updated_shards.begin(); iter != updated_shards.end(); ++iter) {
- ldpp_dout(this, 20) << __func__ << "(): updated shard=" << iter->first << dendl;
- bc::flat_set<rgw_data_notify_entry>& entries = iter->second;
- for (const auto& [key, gen] : entries) {
- ldpp_dout(this, 20) << __func__ << "(): modified key=" << key
- << " of gen=" << gen << dendl;
- }
- }
- }
-
- driver->wakeup_data_sync_shards(this, source_zone, updated_shards);
-
- op_ret = 0;
-}
-
-void RGWOp_DATALog_Notify2::execute(optional_yield y) {
- string source_zone = s->info.args.get("source-zone");
-#define LARGE_ENOUGH_BUF (128 * 1024)
-
- int r = 0;
- bufferlist data;
- std::tie(r, data) = rgw_rest_read_all_input(s, LARGE_ENOUGH_BUF);
- if (r < 0) {
- op_ret = r;
- return;
- }
-
- char* buf = data.c_str();
- ldout(s->cct, 20) << __func__ << "(): read data: " << buf << dendl;
-
- JSONParser p;
- r = p.parse(buf, data.length());
- if (r < 0) {
- ldout(s->cct, 0) << "ERROR: failed to parse JSON" << dendl;
- op_ret = r;
- return;
- }
-
- bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> > updated_shards;
- try {
- decode_json_obj(updated_shards, &p);
- } catch (JSONDecoder::err& err) {
- ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl;
- op_ret = -EINVAL;
- return;
- }
-
- if (driver->ctx()->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
- for (bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >::iterator iter =
- updated_shards.begin(); iter != updated_shards.end(); ++iter) {
- ldpp_dout(this, 20) << __func__ << "(): updated shard=" << iter->first << dendl;
- bc::flat_set<rgw_data_notify_entry>& entries = iter->second;
- for (const auto& [key, gen] : entries) {
- ldpp_dout(this, 20) << __func__ << "(): modified key=" << key <<
- " of generation=" << gen << dendl;
- }
- }
- }
-
- driver->wakeup_data_sync_shards(this, source_zone, updated_shards);
-
- op_ret = 0;
-}
-
-void RGWOp_DATALog_Delete::execute(optional_yield y) {
- string marker = s->info.args.get("marker"),
- shard = s->info.args.get("id"),
- err;
- unsigned shard_id;
-
- op_ret = 0;
-
- if (s->info.args.exists("start-time") ||
- s->info.args.exists("end-time")) {
- ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl;
- op_ret = -EINVAL;
- }
-
- if (s->info.args.exists("start-marker")) {
- ldpp_dout(this, 5) << "start-marker is no longer accepted" << dendl;
- op_ret = -EINVAL;
- }
-
- if (s->info.args.exists("end-marker")) {
- if (!s->info.args.exists("marker")) {
- marker = s->info.args.get("end-marker");
- } else {
- ldpp_dout(this, 5) << "end-marker and marker cannot both be provided" << dendl;
- op_ret = -EINVAL;
- }
- }
-
- shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
- if (!err.empty()) {
- ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
- op_ret = -EINVAL;
- return;
- }
- if (marker.empty()) { /* bounding end */
- op_ret = -EINVAL;
- return;
- }
-
- op_ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->datalog_rados->trim_entries(this, shard_id, marker);
-}
-
-// not in header to avoid pulling in rgw_sync.h
-class RGWOp_MDLog_Status : public RGWRESTOp {
- rgw_meta_sync_status status;
-public:
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("mdlog", RGW_CAP_READ);
- }
- int verify_permission(optional_yield) override {
- return check_caps(s->user->get_caps());
- }
- void execute(optional_yield y) override;
- void send_response() override;
- const char* name() const override { return "get_metadata_log_status"; }
-};
-
-void RGWOp_MDLog_Status::execute(optional_yield y)
-{
- auto sync = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->get_meta_sync_manager();
- if (sync == nullptr) {
- ldpp_dout(this, 1) << "no sync manager" << dendl;
- op_ret = -ENOENT;
- return;
- }
- op_ret = sync->read_sync_status(this, &status);
-}
-
-void RGWOp_MDLog_Status::send_response()
-{
- set_req_state_err(s, op_ret);
- dump_errno(s);
- end_header(s);
-
- if (op_ret >= 0) {
- encode_json("status", status, s->formatter);
- }
- flusher.flush();
-}
-
-// not in header to avoid pulling in rgw_data_sync.h
-class RGWOp_BILog_Status : public RGWRESTOp {
- bilog_status_v2 status;
- int version = 1;
-public:
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("bilog", RGW_CAP_READ);
- }
- int verify_permission(optional_yield y) override {
- return check_caps(s->user->get_caps());
- }
- void execute(optional_yield y) override;
- void send_response() override;
- const char* name() const override { return "get_bucket_index_log_status"; }
-};
-
-void RGWOp_BILog_Status::execute(optional_yield y)
-{
- const auto options = s->info.args.get("options");
- bool merge = (options == "merge");
- const auto source_zone = s->info.args.get("source-zone");
- const auto source_key = s->info.args.get("source-bucket");
- auto key = s->info.args.get("bucket");
- op_ret = s->info.args.get_int("version", &version, 1);
-
- if (key.empty()) {
- key = source_key;
- }
- if (key.empty()) {
- ldpp_dout(this, 4) << "no 'bucket' provided" << dendl;
- op_ret = -EINVAL;
- return;
- }
-
- rgw_bucket b;
- int shard_id{-1}; // unused
- op_ret = rgw_bucket_parse_bucket_key(s->cct, key, &b, &shard_id);
- if (op_ret < 0) {
- ldpp_dout(this, 4) << "invalid 'bucket' provided" << dendl;
- op_ret = -EINVAL;
- return;
- }
-
- // read the bucket instance info for num_shards
- std::unique_ptr<rgw::sal::Bucket> bucket;
- op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
- if (op_ret < 0) {
- ldpp_dout(this, 4) << "failed to read bucket info: " << cpp_strerror(op_ret) << dendl;
- return;
- }
-
- rgw_bucket source_bucket;
-
- if (source_key.empty() ||
- source_key == key) {
- source_bucket = bucket->get_key();
- } else {
- op_ret = rgw_bucket_parse_bucket_key(s->cct, source_key, &source_bucket, nullptr);
- if (op_ret < 0) {
- ldpp_dout(this, 4) << "invalid 'source-bucket' provided (key=" << source_key << ")" << dendl;
- return;
- }
- }
-
- const auto& local_zone_id = driver->get_zone()->get_id();
-
- if (!merge) {
- rgw_sync_bucket_pipe pipe;
- pipe.source.zone = source_zone;
- pipe.source.bucket = source_bucket;
- pipe.dest.zone = local_zone_id;
- pipe.dest.bucket = bucket->get_key();
-
-    ldpp_dout(this, 20) << "RGWOp_BILog_Status::execute(): getting sync status for pipe=" << pipe << dendl;
-
- op_ret = rgw_read_bucket_full_sync_status(
- this,
- static_cast<rgw::sal::RadosStore*>(driver),
- pipe,
- &status.sync_status,
- s->yield);
- if (op_ret < 0) {
- ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_full_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl;
- return;
- }
- status.inc_status.resize(status.sync_status.shards_done_with_gen.size());
-
- op_ret = rgw_read_bucket_inc_sync_status(
- this,
- static_cast<rgw::sal::RadosStore*>(driver),
- pipe,
- status.sync_status.incremental_gen,
- &status.inc_status);
- if (op_ret < 0) {
- ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_inc_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl;
- }
- return;
- }
-
- rgw_zone_id source_zone_id(source_zone);
-
- RGWBucketSyncPolicyHandlerRef source_handler;
- op_ret = driver->get_sync_policy_handler(s, source_zone_id, source_bucket, &source_handler, y);
- if (op_ret < 0) {
- ldpp_dout(this, -1) << "could not get bucket sync policy handler (r=" << op_ret << ")" << dendl;
- return;
- }
-
- auto local_dests = source_handler->get_all_dests_in_zone(local_zone_id);
-
- std::vector<rgw_bucket_shard_sync_info> current_status;
- for (auto& entry : local_dests) {
- auto pipe = entry.second;
-
-    ldpp_dout(this, 20) << "RGWOp_BILog_Status::execute(): getting sync status for pipe=" << pipe << dendl;
-
- RGWBucketInfo *pinfo = &bucket->get_info();
- std::optional<RGWBucketInfo> opt_dest_info;
-
- if (!pipe.dest.bucket) {
- /* Uh oh, something went wrong */
-      ldpp_dout(this, 20) << "ERROR: RGWOp_BILog_Status::execute(): BUG: pipe.dest.bucket was not initialized, pipe=" << pipe << dendl;
- op_ret = -EIO;
- return;
- }
-
- if (*pipe.dest.bucket != pinfo->bucket) {
- opt_dest_info.emplace();
- std::unique_ptr<rgw::sal::Bucket> dest_bucket;
- op_ret = driver->get_bucket(s, nullptr, *pipe.dest.bucket, &dest_bucket, y);
- if (op_ret < 0) {
-        ldpp_dout(this, 4) << "failed to read target bucket info (bucket=" << *pipe.dest.bucket << "): " << cpp_strerror(op_ret) << dendl;
- return;
- }
-
- *opt_dest_info = dest_bucket->get_info();
- pinfo = &(*opt_dest_info);
- pipe.dest.bucket = pinfo->bucket;
- }
-
- op_ret = rgw_read_bucket_full_sync_status(
- this,
- static_cast<rgw::sal::RadosStore*>(driver),
- pipe,
- &status.sync_status,
- s->yield);
- if (op_ret < 0) {
- ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_full_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl;
- return;
- }
-
- current_status.resize(status.sync_status.shards_done_with_gen.size());
- int r = rgw_read_bucket_inc_sync_status(this, static_cast<rgw::sal::RadosStore*>(driver),
-                                            pipe, status.sync_status.incremental_gen, &current_status);
- if (r < 0) {
- ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_inc_sync_status() on pipe=" << pipe << " returned ret=" << r << dendl;
- op_ret = r;
- return;
- }
-
- if (status.inc_status.empty()) {
- status.inc_status = std::move(current_status);
- } else {
- if (current_status.size() != status.inc_status.size()) {
- op_ret = -EINVAL;
- ldpp_dout(this, -1) << "ERROR: different number of shards for sync status of buckets "
- "syncing from the same source: status.size()= "
- << status.inc_status.size()
- << " current_status.size()="
- << current_status.size() << dendl;
- return;
- }
- auto m = status.inc_status.begin();
- for (auto& cur_shard_status : current_status) {
- auto& result_shard_status = *m++;
- // always take the first marker, or any later marker that's smaller
- if (cur_shard_status.inc_marker.position < result_shard_status.inc_marker.position) {
- result_shard_status = std::move(cur_shard_status);
- }
- }
- }
- }
-}
-
-void RGWOp_BILog_Status::send_response()
-{
- set_req_state_err(s, op_ret);
- dump_errno(s);
- end_header(s);
-
- if (op_ret >= 0) {
- if (version < 2) {
- encode_json("status", status.inc_status, s->formatter);
- } else {
- encode_json("status", status, s->formatter);
- }
- }
- flusher.flush();
-}
-
-// not in header to avoid pulling in rgw_data_sync.h
-class RGWOp_DATALog_Status : public RGWRESTOp {
- rgw_data_sync_status status;
-public:
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("datalog", RGW_CAP_READ);
- }
- int verify_permission(optional_yield y) override {
- return check_caps(s->user->get_caps());
- }
-  void execute(optional_yield y) override;
- void send_response() override;
- const char* name() const override { return "get_data_changes_log_status"; }
-};
-
-void RGWOp_DATALog_Status::execute(optional_yield y)
-{
- const auto source_zone = s->info.args.get("source-zone");
- auto sync = driver->get_data_sync_manager(source_zone);
- if (sync == nullptr) {
- ldpp_dout(this, 1) << "no sync manager for source-zone " << source_zone << dendl;
- op_ret = -ENOENT;
- return;
- }
- op_ret = sync->read_sync_status(this, &status);
-}
-
-void RGWOp_DATALog_Status::send_response()
-{
- set_req_state_err(s, op_ret);
- dump_errno(s);
- end_header(s);
-
- if (op_ret >= 0) {
- encode_json("status", status, s->formatter);
- }
- flusher.flush();
-}
-
-
-RGWOp *RGWHandler_Log::op_get() {
- bool exists;
- string type = s->info.args.get("type", &exists);
-
- if (!exists) {
- return NULL;
- }
-
- if (type.compare("metadata") == 0) {
- if (s->info.args.exists("id")) {
- if (s->info.args.exists("info")) {
- return new RGWOp_MDLog_ShardInfo;
- } else {
- return new RGWOp_MDLog_List;
- }
- } else if (s->info.args.exists("status")) {
- return new RGWOp_MDLog_Status;
- } else {
- return new RGWOp_MDLog_Info;
- }
- } else if (type.compare("bucket-index") == 0) {
- if (s->info.args.exists("info")) {
- return new RGWOp_BILog_Info;
- } else if (s->info.args.exists("status")) {
- return new RGWOp_BILog_Status;
- } else {
- return new RGWOp_BILog_List;
- }
- } else if (type.compare("data") == 0) {
- if (s->info.args.exists("id")) {
- if (s->info.args.exists("info")) {
- return new RGWOp_DATALog_ShardInfo;
- } else {
- return new RGWOp_DATALog_List;
- }
- } else if (s->info.args.exists("status")) {
- return new RGWOp_DATALog_Status;
- } else {
- return new RGWOp_DATALog_Info;
- }
- }
- return NULL;
-}
-
-RGWOp *RGWHandler_Log::op_delete() {
- bool exists;
- string type = s->info.args.get("type", &exists);
-
- if (!exists) {
- return NULL;
- }
-
- if (type.compare("metadata") == 0)
- return new RGWOp_MDLog_Delete;
- else if (type.compare("bucket-index") == 0)
- return new RGWOp_BILog_Delete;
- else if (type.compare("data") == 0)
- return new RGWOp_DATALog_Delete;
- return NULL;
-}
-
-RGWOp *RGWHandler_Log::op_post() {
- bool exists;
- string type = s->info.args.get("type", &exists);
-
- if (!exists) {
- return NULL;
- }
-
- if (type.compare("metadata") == 0) {
- if (s->info.args.exists("lock"))
- return new RGWOp_MDLog_Lock;
- else if (s->info.args.exists("unlock"))
- return new RGWOp_MDLog_Unlock;
- else if (s->info.args.exists("notify"))
- return new RGWOp_MDLog_Notify;
- } else if (type.compare("data") == 0) {
- if (s->info.args.exists("notify")) {
- return new RGWOp_DATALog_Notify;
- } else if (s->info.args.exists("notify2")) {
- return new RGWOp_DATALog_Notify2;
- }
- }
- return NULL;
-}
-
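A compact summary of the routing implemented by the three hooks above (illustrative; the /admin/log prefix assumes the standard admin mount):

// GET    type=metadata     : id&info -> ShardInfo, id -> List, status -> Status, else Info
// GET    type=bucket-index : info -> BILog_Info, status -> BILog_Status, else BILog_List
// GET    type=data         : id&info -> ShardInfo, id -> List, status -> Status, else Info
// DELETE type=metadata|bucket-index|data -> the matching *_Delete trim op
// POST   type=metadata     : lock | unlock | notify
// POST   type=data         : notify | notify2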
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2013 eNovance SAS <licensing@enovance.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-
-#include "rgw_datalog.h"
-#include "rgw_rest.h"
-#include "rgw_rest_s3.h"
-#include "rgw_metadata.h"
-#include "rgw_mdlog.h"
-#include "rgw_data_sync.h"
-
-class RGWOp_BILog_List : public RGWRESTOp {
- bool sent_header;
- uint32_t format_ver{0};
- bool truncated{false};
- std::optional<rgw::bucket_log_layout_generation> next_log_layout;
-
-public:
- RGWOp_BILog_List() : sent_header(false) {}
- ~RGWOp_BILog_List() override {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("bilog", RGW_CAP_READ);
- }
- int verify_permission(optional_yield y) override {
- return check_caps(s->user->get_caps());
- }
- void send_response() override;
- virtual void send_response(std::list<rgw_bi_log_entry>& entries, std::string& marker);
- virtual void send_response_end();
- void execute(optional_yield y) override;
- const char* name() const override {
- return "list_bucket_index_log";
- }
-};
-
-class RGWOp_BILog_Info : public RGWRESTOp {
- std::string bucket_ver;
- std::string master_ver;
- std::string max_marker;
- bool syncstopped;
- uint64_t oldest_gen = 0;
- uint64_t latest_gen = 0;
- std::vector<store_gen_shards> generations;
-
-public:
- RGWOp_BILog_Info() : bucket_ver(), master_ver(), syncstopped(false) {}
- ~RGWOp_BILog_Info() override {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("bilog", RGW_CAP_READ);
- }
- int verify_permission(optional_yield y) override {
- return check_caps(s->user->get_caps());
- }
- void send_response() override;
- void execute(optional_yield y) override;
- const char* name() const override {
- return "bucket_index_log_info";
- }
-};
-
-class RGWOp_BILog_Delete : public RGWRESTOp {
-public:
- RGWOp_BILog_Delete() {}
- ~RGWOp_BILog_Delete() override {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("bilog", RGW_CAP_WRITE);
- }
- void execute(optional_yield y) override;
- const char* name() const override {
- return "trim_bucket_index_log";
- }
-};
-
-class RGWOp_MDLog_List : public RGWRESTOp {
- std::list<cls_log_entry> entries;
- std::string last_marker;
- bool truncated;
-public:
- RGWOp_MDLog_List() : truncated(false) {}
- ~RGWOp_MDLog_List() override {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("mdlog", RGW_CAP_READ);
- }
- int verify_permission(optional_yield y) override {
- return check_caps(s->user->get_caps());
- }
- void execute(optional_yield y) override;
- void send_response() override;
- const char* name() const override {
- return "list_metadata_log";
- }
-};
-
-class RGWOp_MDLog_Info : public RGWRESTOp {
- unsigned num_objects;
- RGWPeriodHistory::Cursor period;
-public:
- RGWOp_MDLog_Info() : num_objects(0) {}
- ~RGWOp_MDLog_Info() override {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("mdlog", RGW_CAP_READ);
- }
- int verify_permission(optional_yield y) override {
- return check_caps(s->user->get_caps());
- }
- void execute(optional_yield y) override;
- void send_response() override;
- const char* name() const override {
- return "get_metadata_log_info";
- }
-};
-
-class RGWOp_MDLog_ShardInfo : public RGWRESTOp {
- RGWMetadataLogInfo info;
-public:
- RGWOp_MDLog_ShardInfo() {}
- ~RGWOp_MDLog_ShardInfo() override {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("mdlog", RGW_CAP_READ);
- }
- int verify_permission(optional_yield y) override {
- return check_caps(s->user->get_caps());
- }
- void execute(optional_yield y) override;
- void send_response() override;
- const char* name() const override {
- return "get_metadata_log_shard_info";
- }
-};
-
-class RGWOp_MDLog_Lock : public RGWRESTOp {
-public:
- RGWOp_MDLog_Lock() {}
- ~RGWOp_MDLog_Lock() override {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("mdlog", RGW_CAP_WRITE);
- }
- void execute(optional_yield y) override;
- const char* name() const override {
- return "lock_mdlog_object";
- }
-};
-
-class RGWOp_MDLog_Unlock : public RGWRESTOp {
-public:
- RGWOp_MDLog_Unlock() {}
- ~RGWOp_MDLog_Unlock() override {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("mdlog", RGW_CAP_WRITE);
- }
- void execute(optional_yield y) override;
- const char* name() const override {
- return "unlock_mdlog_object";
- }
-};
-
-class RGWOp_MDLog_Notify : public RGWRESTOp {
-public:
- RGWOp_MDLog_Notify() {}
- ~RGWOp_MDLog_Notify() override {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("mdlog", RGW_CAP_WRITE);
- }
- void execute(optional_yield y) override;
- const char* name() const override {
- return "mdlog_notify";
- }
- RGWOpType get_type() override { return RGW_OP_SYNC_MDLOG_NOTIFY; }
-};
-
-class RGWOp_MDLog_Delete : public RGWRESTOp {
-public:
- RGWOp_MDLog_Delete() {}
- ~RGWOp_MDLog_Delete() override {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("mdlog", RGW_CAP_WRITE);
- }
- void execute(optional_yield y) override;
- const char* name() const override {
- return "trim_metadata_log";
- }
-};
-
-class RGWOp_DATALog_List : public RGWRESTOp {
- std::vector<rgw_data_change_log_entry> entries;
- std::string last_marker;
- bool truncated;
- bool extra_info;
-public:
- RGWOp_DATALog_List() : truncated(false), extra_info(false) {}
- ~RGWOp_DATALog_List() override {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("datalog", RGW_CAP_READ);
- }
- int verify_permission(optional_yield y) override {
- return check_caps(s->user->get_caps());
- }
- void execute(optional_yield y) override;
- void send_response() override;
- const char* name() const override {
- return "list_data_changes_log";
- }
-};
-
-class RGWOp_DATALog_Info : public RGWRESTOp {
- unsigned num_objects;
-public:
- RGWOp_DATALog_Info() : num_objects(0) {}
- ~RGWOp_DATALog_Info() override {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("datalog", RGW_CAP_READ);
- }
- int verify_permission(optional_yield y) override {
- return check_caps(s->user->get_caps());
- }
- void execute(optional_yield y) override;
- void send_response() override;
- const char* name() const override {
- return "get_data_changes_log_info";
- }
-};
-
-class RGWOp_DATALog_ShardInfo : public RGWRESTOp {
- RGWDataChangesLogInfo info;
-public:
- RGWOp_DATALog_ShardInfo() {}
- ~RGWOp_DATALog_ShardInfo() override {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("datalog", RGW_CAP_READ);
- }
- int verify_permission(optional_yield y) override {
- return check_caps(s->user->get_caps());
- }
- void execute(optional_yield y) override;
- void send_response() override;
- const char* name() const override {
- return "get_data_changes_log_shard_info";
- }
-};
-
-class RGWOp_DATALog_Notify : public RGWRESTOp {
-public:
- RGWOp_DATALog_Notify() {}
- ~RGWOp_DATALog_Notify() override {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("datalog", RGW_CAP_WRITE);
- }
- void execute(optional_yield y) override;
- const char* name() const override {
- return "datalog_notify";
- }
- RGWOpType get_type() override { return RGW_OP_SYNC_DATALOG_NOTIFY; }
-};
-
-class RGWOp_DATALog_Notify2 : public RGWRESTOp {
- rgw_data_notify_entry data_notify;
-public:
- RGWOp_DATALog_Notify2() {}
- ~RGWOp_DATALog_Notify2() override {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("datalog", RGW_CAP_WRITE);
- }
- void execute(optional_yield y) override;
- const char* name() const override {
- return "datalog_notify2";
- }
- RGWOpType get_type() override { return RGW_OP_SYNC_DATALOG_NOTIFY2; }
-};
-
-class RGWOp_DATALog_Delete : public RGWRESTOp {
-public:
- RGWOp_DATALog_Delete() {}
- ~RGWOp_DATALog_Delete() override {}
-
- int check_caps(const RGWUserCaps& caps) override {
- return caps.check_cap("datalog", RGW_CAP_WRITE);
- }
- void execute(optional_yield y) override;
- const char* name() const override {
- return "trim_data_changes_log";
- }
-};
-
-class RGWHandler_Log : public RGWHandler_Auth_S3 {
-protected:
- RGWOp *op_get() override;
- RGWOp *op_delete() override;
- RGWOp *op_post() override;
-
- int read_permissions(RGWOp*, optional_yield) override {
- return 0;
- }
-public:
- using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
- ~RGWHandler_Log() override = default;
-};
-
-class RGWRESTMgr_Log : public RGWRESTMgr {
-public:
- RGWRESTMgr_Log() = default;
- ~RGWRESTMgr_Log() override = default;
-
- RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
- req_state* const,
- const rgw::auth::StrategyRegistry& auth_registry,
- const std::string& frontend_prefixs) override {
- return new RGWHandler_Log(auth_registry);
- }
-};
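For context, a hedged sketch of how RGWRESTMgr_Log is typically mounted; the RGWRESTMgr_Admin and register_resource() names mirror the admin wiring used elsewhere in RGW and should be treated as illustrative:

// RGWRESTMgr_Admin* admin = new RGWRESTMgr_Admin;
// admin->register_resource("log", new RGWRESTMgr_Log);
// rest.register_resource(g_conf()->rgw_admin_entry, admin);  // usually "admin"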
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_ROLE_H
-#define CEPH_RGW_ROLE_H
+#pragma once
#include <string>
#include "common/ceph_json.h"
#include "common/ceph_context.h"
-#include "rgw/rgw_rados.h"
+#include "rgw_rados.h"
#include "rgw_metadata.h"
class RGWRados;
Driver* driver;
};
} } // namespace rgw::sal
-
-#endif /* CEPH_RGW_ROLE_H */
// vim: ts=8 sw=2 smarttab ft=cpp
//
+#pragma once
+
namespace rgw::s3select {
RGWOp* create_s3select_op();
}
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_STRING_H
-#define CEPH_RGW_STRING_H
+#pragma once
#include <errno.h>
#include <stdlib.h>
extern bool match_wildcards(std::string_view pattern,
std::string_view input,
uint32_t flags = 0);
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_STS_H
-#define CEPH_RGW_STS_H
+#pragma once
#include "rgw_role.h"
#include "rgw_auth.h"
AssumeRoleWithWebIdentityResponse assumeRoleWithWebIdentity(const DoutPrefixProvider *dpp, AssumeRoleWithWebIdentityRequest& req);
};
}
-#endif /* CEPH_RGW_STS_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_SWIFT_AUTH_H
-#define CEPH_RGW_SWIFT_AUTH_H
+#pragma once
#include "rgw_common.h"
#include "rgw_user.h"
return new RGWHandler_SWIFT_Auth;
}
};
-
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_TAG_H
-#define RGW_TAG_H
+#pragma once
#include <string>
#include <include/types.h>
tag_map_t& get_tags() {return tag_map;}
};
WRITE_CLASS_ENCODER(RGWObjTags)
-
-#endif /* RGW_TAG_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef RGW_TAG_S3_H
-#define RGW_TAG_S3_H
+#pragma once
#include <map>
#include <string>
return tagset.rebuild(dest);
}
};
-
-
-#endif /* RGW_TAG_S3_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_TAR_H
-#define CEPH_RGW_TAR_H
+#pragma once
#include <algorithm>
#include <array>
} /* namespace tar */
} /* namespace rgw */
-
-#endif /* CEPH_RGW_TAR_H */
*
*/
-#ifndef RGW_TOKEN_H
-#define RGW_TOKEN_H
+#pragma once
#include <stdint.h>
#include <boost/algorithm/string.hpp>
}
} /* namespace rgw */
-
-#endif /* RGW_TOKEN_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_TORRENT_H
-#define CEPH_RGW_TORRENT_H
+#pragma once
#include <string>
#include <list>
void sha1(SHA1 *h, bufferlist &bl, off_t bl_len);
int save_torrent_file(optional_yield y);
};
-#endif /* CEPH_RGW_TORRENT_H */
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_USAGE_H
-#define CEPH_RGW_USAGE_H
+#pragma once
#include <string>
#include <map>
static int clear(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver);
};
-
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_WEB_IDP_H
-#define CEPH_RGW_WEB_IDP_H
+#pragma once
namespace rgw {
namespace web_idp {
}; /* namespace web_idp */
}; /* namespace rgw */
-
-#endif /* CEPH_RGW_WEB_IDP_H */
*
*/
-#ifndef RGW_WEBSITE_H
-#define RGW_WEBSITE_H
+#pragma once
#include <list>
#include <string>
}
};
WRITE_CLASS_ENCODER(RGWBucketWebsiteConf)
-
-#endif
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#ifndef CEPH_RGW_XML_H
-#define CEPH_RGW_XML_H
+#pragma once
#include <map>
#include <stdexcept>
encode_xml(name, *o, f);
}
-
-
-#endif
#include "svc_meta_be.h"
#include "svc_bucket_types.h"
#include "svc_bucket.h"
+#include "svc_bucket_sync.h"
class RGWSI_Zone;
class RGWSI_SysObj;
class RGWSI_SysObj_Cache;
class RGWSI_Meta;
class RGWSI_SyncModules;
-class RGWSI_Bucket_Sync;
struct rgw_cache_entry_info;
#include "svc_meta_be.h"
#include "svc_user.h"
+#include "rgw_bucket.h"
class RGWSI_RADOS;
class RGWSI_Zone;
struct rgw_cache_entry_info;
-class RGWUserBuckets;
-
class RGWGetUserHeader_CB;
class RGWGetUserStats_CB;
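The last two hunks above swap forward declarations (RGWSI_Bucket_Sync, RGWUserBuckets) for full includes. A generic, self-contained C++ reminder of when that becomes necessary (illustrative, not from this codebase):

class Widget;                  // forward declaration
struct Uses { Widget* w; };    // OK: pointers/references to an incomplete type
// struct Owns { Widget w; };  // error: field has incomplete type 'Widget'
// A full #include is needed once the header stores the type by value,
// derives from it, or calls its members inline.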