From: Oguzhan Ozmen Date: Thu, 31 Jul 2025 22:15:24 +0000 (+0000) Subject: RGW: multi object delete op; skip olh update for all deletes but the last one X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=0e13c8740c20067baaffcdd4d3a3c28f490e7bd9;p=ceph.git RGW: multi object delete op; skip olh update for all deletes but the last one Fixes: https://tracker.ceph.com/issues/72375 Signed-off-by: Oguzhan Ozmen (cherry picked from commit 9bb170104446bfea0ad87b34244f3a3d47962fcc) Conflicts: src/rgw/rgw_op.cc src/rgw/rgw_op.h - RGWDeleteMultiObj kept the vector of objects to be deleted as "rgw_obj_key" rather than "RGWMultiDelObject". - RGWDeleteMultiObj::execute didn't factor out the object deletions into "handle_objects" helper method. - There was no check whether RGWDeleteMultiObj::execute is already running in a coroutine or not before handling objects. --- diff --git a/src/rgw/driver/rados/rgw_rados.cc b/src/rgw/driver/rados/rgw_rados.cc index bfb2dd3d66c2..55fc6d469d9c 100644 --- a/src/rgw/driver/rados/rgw_rados.cc +++ b/src/rgw/driver/rados/rgw_rados.cc @@ -6355,7 +6355,8 @@ struct tombstone_entry { int RGWRados::Object::Delete::delete_obj(optional_yield y, const DoutPrefixProvider* dpp, bool log_op, - const bool force) + const bool force, + const bool skip_olh_obj_update) { RGWRados *store = target->get_store(); const rgw_obj& src_obj = target->get_obj(); @@ -6402,7 +6403,7 @@ int RGWRados::Object::Delete::delete_obj(optional_yield y, int r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, - y, params.zones_trace, add_log); + y, params.zones_trace, add_log, skip_olh_obj_update); if (r < 0) { return r; } @@ -6417,7 +6418,7 @@ int RGWRados::Object::Delete::delete_obj(optional_yield y, r = store->unlink_obj_instance( dpp, target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, y, params.bilog_flags, - params.null_verid, params.zones_trace, add_log, force); + params.null_verid, params.zones_trace, add_log, force, skip_olh_obj_update); if (r < 0) { return r; } @@ -6595,7 +6596,8 @@ int RGWRados::delete_obj(const DoutPrefixProvider *dpp, const real_time& expiration_time, rgw_zone_set *zones_trace, bool log_op, - const bool force) // force removal even if head object is broken + const bool force, // force removal even if head object is broken + const bool skip_olh_obj_update) // true for all deletes (except the last one) initiated by a multi-object delete op { RGWRados::Object del_target(this, bucket_info, obj_ctx, obj); RGWRados::Object::Delete del_op(&del_target); @@ -6607,7 +6609,7 @@ int RGWRados::delete_obj(const DoutPrefixProvider *dpp, del_op.params.zones_trace = zones_trace; del_op.params.null_verid = null_verid; - return del_op.delete_obj(y, dpp, log_op, force); + return del_op.delete_obj(y, dpp, log_op, force, skip_olh_obj_update); } int RGWRados::delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, optional_yield y) @@ -9060,7 +9062,7 @@ int RGWRados::apply_olh_log(const DoutPrefixProvider *dpp, rgw_obj obj_instance(bucket, key); int ret = delete_obj(dpp, obj_ctx, bucket_info, obj_instance, 0, y, null_verid, RGW_BILOG_FLAG_VERSIONED_OP, - ceph::real_time(), zones_trace, log_op, force); + ceph::real_time(), zones_trace, log_op, force, true /* skip_olh_obj_update */); if (ret < 0 && ret != -ENOENT) { ldpp_dout(dpp, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl; return ret; @@ -9254,9 +9256,11 @@ int RGWRados::set_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, // it's possible that the pending xattr from this op prevented the olh // object from being cleaned by another thread that was deleting the last // existing version. We invoke a best-effort update_olh here to handle this case. - int r = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj, y, zones_trace, log_data_change); - if (r < 0 && r != -ECANCELED) { - ldpp_dout(dpp, 20) << "update_olh() target_obj=" << olh_obj << " returned " << r << dendl; + if (! skip_olh_obj_update) { + int r = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj, y, zones_trace, log_data_change); + if (r < 0 && r != -ECANCELED) { + ldpp_dout(dpp, 20) << "update_olh() target_obj=" << olh_obj << " returned " << r << dendl; + } } return ret; } @@ -9270,6 +9274,7 @@ int RGWRados::set_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, // exit early if we're skipping the olh update and just updating the index if (skip_olh_obj_update) { + ldpp_dout(dpp, 20) << "skip update_olh() target_obj=" << olh_obj << dendl; return 0; } @@ -9295,7 +9300,8 @@ int RGWRados::unlink_obj_instance(const DoutPrefixProvider* dpp, bool null_verid, rgw_zone_set* zones_trace, bool log_op, - const bool force) + const bool force, + const bool skip_olh_obj_update) { string op_tag; @@ -9353,10 +9359,12 @@ int RGWRados::unlink_obj_instance(const DoutPrefixProvider* dpp, // it's possible that the pending xattr from this op prevented the olh // object from being cleaned by another thread that was deleting the last // existing version. We invoke a best-effort update_olh here to handle this case. - int r = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj, y, - zones_trace, null_verid, log_op, force); - if (r < 0 && r != -ECANCELED) { - ldpp_dout(dpp, 20) << "update_olh() target_obj=" << olh_obj << " returned " << r << dendl; + if (! skip_olh_obj_update) { + int r = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj, y, + zones_trace, null_verid, log_op, force); + if (r < 0 && r != -ECANCELED) { + ldpp_dout(dpp, 20) << "update_olh() target_obj=" << olh_obj << " returned " << r << dendl; + } } return ret; } // if error in bucket_index_unlink_instance call @@ -9368,6 +9376,11 @@ int RGWRados::unlink_obj_instance(const DoutPrefixProvider* dpp, return -EIO; } + if (skip_olh_obj_update) { + ldpp_dout(dpp, 20) << "skip update_olh() target_obj=" << olh_obj << dendl; + return 0; + } + ret = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj, y, zones_trace, null_verid, log_op, force); if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */ diff --git a/src/rgw/driver/rados/rgw_rados.h b/src/rgw/driver/rados/rgw_rados.h index 01a8b2553954..221738174fa7 100644 --- a/src/rgw/driver/rados/rgw_rados.h +++ b/src/rgw/driver/rados/rgw_rados.h @@ -899,7 +899,9 @@ public: int delete_obj(optional_yield y, const DoutPrefixProvider* dpp, bool log_op, - const bool force); // if head object missing, do a best effort + const bool force, // if head object missing, do a best effort + const bool skip_olh_obj_update // true for all deletes (except the last one) initiated by a multi-object delete op + ); }; // struct RGWRados::Object::Delete struct Stat { @@ -1321,7 +1323,8 @@ int restore_obj_from_cloud(RGWLCCloudTierCtx& tier_ctx, const ceph::real_time& expiration_time = ceph::real_time(), rgw_zone_set *zones_trace = nullptr, bool log_op = true, - const bool force = false); // if head object missing, do a best effort + const bool force = false, // if head object missing, do a best effort + const bool skip_olh_obj_update = false); // true for all deletes (except the last one) initiated by a multi-object delete op int delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, optional_yield y); @@ -1468,7 +1471,7 @@ int restore_obj_from_cloud(RGWLCCloudTierCtx& tier_ctx, uint64_t olh_epoch, optional_yield y, uint16_t bilog_flags, bool null_verid, rgw_zone_set *zones_trace = nullptr, - bool log_op = true, const bool force = false); + bool log_op = true, const bool force = false, const bool skip_olh_obj_update = false); void check_pending_olh_entries(const DoutPrefixProvider *dpp, std::map& pending_entries, std::map *rm_pending_entries); int remove_olh_pending_entries(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::map& pending_attrs, optional_yield y); diff --git a/src/rgw/driver/rados/rgw_sal_rados.cc b/src/rgw/driver/rados/rgw_sal_rados.cc index a8e0a8e22926..21a550083f53 100644 --- a/src/rgw/driver/rados/rgw_sal_rados.cc +++ b/src/rgw/driver/rados/rgw_sal_rados.cc @@ -3590,7 +3590,7 @@ int RadosObject::RadosDeleteOp::delete_obj(const DoutPrefixProvider* dpp, option parent_op.params.check_objv = params.objv_tracker->version_for_check(); } - int ret = parent_op.delete_obj(y, dpp, flags & FLAG_LOG_OP, flags & FLAG_FORCE_OP); + int ret = parent_op.delete_obj(y, dpp, flags & FLAG_LOG_OP, flags & FLAG_FORCE_OP, flags & FLAG_SKIP_UPDATE_OLH); if (ret < 0) { return ret; } @@ -3621,7 +3621,7 @@ int RadosObject::delete_object(const DoutPrefixProvider* dpp, } // convert flags to bool params - return del_op.delete_obj(y, dpp, flags & FLAG_LOG_OP, flags & FLAG_FORCE_OP); + return del_op.delete_obj(y, dpp, flags & FLAG_LOG_OP, flags & FLAG_FORCE_OP, flags & FLAG_SKIP_UPDATE_OLH); } // RadosObject::delete_object int RadosObject::copy_object(const ACLOwner& owner, diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 8587bc273794..b8f74b0acae9 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -7521,7 +7521,7 @@ void RGWDeleteMultiObj::write_ops_log_entry(rgw_log_entry& entry) const { entry.delete_multi_obj_meta.objects = std::move(ops_log_entries); } -void RGWDeleteMultiObj::handle_individual_object(const rgw_obj_key& o, optional_yield y) +void RGWDeleteMultiObj::handle_individual_object(const rgw_obj_key& o, optional_yield y, const bool skip_olh_obj_update) { // add the object key to the dout prefix so we can trace concurrent calls struct ObjectPrefix : public DoutPrefixPipe { @@ -7606,11 +7606,12 @@ void RGWDeleteMultiObj::handle_individual_object(const rgw_obj_key& o, optional_ del_op->params.bucket_owner = s->bucket_owner.id; del_op->params.marker_version_id = version_id; - op_ret = del_op->delete_obj(dpp, y, rgw::sal::FLAG_LOG_OP); + op_ret = del_op->delete_obj(dpp, y, + rgw::sal::FLAG_LOG_OP | (skip_olh_obj_update ? rgw::sal::FLAG_SKIP_UPDATE_OLH : 0)); if (op_ret == -ENOENT) { op_ret = 0; } - + if (auto ret = rgw::bucketlogging::log_record(driver, rgw::bucketlogging::LoggingType::Any, obj.get(), s, canonical_name(), etag, obj_size, this, y, true, false); ret < 0) { // don't reply with an error in case of failed delete logging ldpp_dout(this, 5) << "WARNING: multi DELETE operation ignores bucket logging failure: " << ret << dendl; @@ -7628,6 +7629,70 @@ void RGWDeleteMultiObj::handle_individual_object(const rgw_obj_key& o, optional_ send_partial_response(o, del_op->result.delete_marker, del_op->result.version_id, op_ret); } +void RGWDeleteMultiObj::handle_versioned_objects(const std::vector& objects, + uint32_t max_aio, + boost::asio::yield_context yield) +{ + auto group = ceph::async::spawn_throttle{yield, max_aio}; + std::map> grouped_objects; + + // group objects by their keys + for (const auto& object : objects) { + const std::string& key = object.name; + grouped_objects[key].push_back(object); + } + + // for each group of objects, handle all but the last object and skip update_olh + for (const auto& [_, objects] : grouped_objects) { + for (size_t i = 0; i + 1 < objects.size(); ++i) { // skip the last element + group.spawn([this, &objects, i] (boost::asio::yield_context yield) { + handle_individual_object(objects[i], yield, true /* skip_olh_obj_update */); + }); + + rgw_flush_formatter(s, s->formatter); + } + } + group.wait(); + + // Now handle the last object of each group with update_olh + for (const auto& [_, objects] : grouped_objects) { + const auto& object = objects.back(); + group.spawn([this, &object] (boost::asio::yield_context yield) { + handle_individual_object(object, yield); + }); + + rgw_flush_formatter(s, s->formatter); + } + group.wait(); +} + +void RGWDeleteMultiObj::handle_non_versioned_objects(const std::vector& objects, + uint32_t max_aio, + boost::asio::yield_context yield) +{ + auto group = ceph::async::spawn_throttle{yield, max_aio}; + + for (const auto& object : objects) { + group.spawn([this, &object] (boost::asio::yield_context yield) { + handle_individual_object(object, yield); + }); + + rgw_flush_formatter(s, s->formatter); + } + group.wait(); +} + +void RGWDeleteMultiObj::handle_objects(const std::vector& objects, + uint32_t max_aio, + boost::asio::yield_context yield) +{ + if (bucket->versioned()) { + handle_versioned_objects(objects, max_aio, yield); + } else { + handle_non_versioned_objects(objects, max_aio, yield); + } +} + void RGWDeleteMultiObj::execute(optional_yield y) { const char* buf = data.c_str(); @@ -7695,16 +7760,24 @@ void RGWDeleteMultiObj::execute(optional_yield y) // process up to max_aio object deletes in parallel const uint32_t max_aio = std::max(1, s->cct->_conf->rgw_multi_obj_del_max_aio); - auto group = ceph::async::spawn_throttle{y, max_aio}; - for (const auto& key : multi_delete->objects) { - group.spawn([this, &key] (boost::asio::yield_context yield) { - handle_individual_object(key, yield); - }); - - rgw_flush_formatter(s, s->formatter); + // if we're not already running in a coroutine, spawn one + if (!y) { + auto& objects = multi_delete->objects; + + boost::asio::io_context context; + boost::asio::spawn(context, + [this, &objects, max_aio] (boost::asio::yield_context yield) { + handle_objects(objects, max_aio, yield); + }, + [] (std::exception_ptr eptr) { + if (eptr) std::rethrow_exception(eptr); + }); + context.run(); + } else { + // use the existing coroutine's yield context + handle_objects(multi_delete->objects, max_aio, y.get_yield_context()); } - group.wait(); /* set the return code to zero, errors at this point will be dumped to the response */ diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h index ffddbe4c4e73..59534fd16a16 100644 --- a/src/rgw/rgw_op.h +++ b/src/rgw/rgw_op.h @@ -2113,7 +2113,16 @@ class RGWDeleteMultiObj : public RGWOp { * Handles the deletion of an individual object and uses * set_partial_response to record the outcome. */ - void handle_individual_object(const rgw_obj_key& o, optional_yield y); + void handle_individual_object(const rgw_obj_key& object, + optional_yield y, + const bool skip_olh_obj_update = false); + + void handle_versioned_objects(const std::vector& objects, + uint32_t max_aio, boost::asio::yield_context yield); + void handle_non_versioned_objects(const std::vector& objects, + uint32_t max_aio, boost::asio::yield_context yield); + void handle_objects(const std::vector& objects, + uint32_t max_aio, boost::asio::yield_context yield); protected: std::vector ops_log_entries; diff --git a/src/rgw/rgw_sal.h b/src/rgw/rgw_sal.h index 776c151349b9..9e93936a7812 100644 --- a/src/rgw/rgw_sal.h +++ b/src/rgw/rgw_sal.h @@ -163,6 +163,7 @@ static constexpr uint32_t FLAG_PREVENT_VERSIONING = 0x0002; // if cannot do all elements of op, do as much as possible (e.g., // delete object where head object is missing) static constexpr uint32_t FLAG_FORCE_OP = 0x0004; +static constexpr uint32_t FLAG_SKIP_UPDATE_OLH = 0x0008; enum class RGWRestoreStatus : uint8_t { None = 0,