From 42b1ae572579d45074507814fc02ce8cd7c8dc2c Mon Sep 17 00:00:00 2001
From: Pritha Srivastava
Date: Thu, 31 Mar 2022 15:29:19 +0530
Subject: [PATCH] rgw: splitting gc chains into smaller parts to prevent OSD_WRITETOOBIG error.

fixes: https://tracker.ceph.com/issues/49823

Signed-off-by: Pritha Srivastava
---
Review notes (ignored by git am, not part of the commit message):
 - RGWGC::send_split_chain() no longer decrements the iterator (which was
   undefined on begin() and looped forever when one object alone was over
   rgw_max_chunk_size); an entry now always carries at least one object.
 - On a send_chain() failure the returned leftover chain no longer contains
   the last already-queued object twice.
 - The encoded size is tracked as a running total instead of copying and
   re-measuring the growing chain for every object.
 - RadosMultipartUpload::abort() assigns to the existing function-scope
   'ret' via std::tie instead of shadowing it with a structured binding.
 - Whitespace and template arguments of context lines were reconstructed;
   verify them against the tree before applying.

 src/cls/rgw/cls_rgw_ops.h   |  6 +++++
 src/cls/rgw/cls_rgw_types.h | 36 +++++++++++++++++++++++++
 src/rgw/rgw_gc.cc           | 67 ++++++++++++++++++++++++++++++++++++-
 src/rgw/rgw_gc.h            |  3 ++-
 src/rgw/rgw_rados.cc        | 12 ++++++------
 src/rgw/rgw_rados.h         |  2 +-
 src/rgw/rgw_sal_rados.cc    |  7 ++++---
 7 files changed, 121 insertions(+), 12 deletions(-)

diff --git a/src/cls/rgw/cls_rgw_ops.h b/src/cls/rgw/cls_rgw_ops.h
index 42eacf00cb1..f6015eacea0 100644
--- a/src/cls/rgw/cls_rgw_ops.h
+++ b/src/cls/rgw/cls_rgw_ops.h
@@ -838,6 +838,12 @@ struct cls_rgw_gc_set_entry_op {
 
   void dump(ceph::Formatter *f) const;
   static void generate_test_instances(std::list<cls_rgw_gc_set_entry_op*>& ls);
+
+  size_t estimate_encoded_size() const {
+    constexpr size_t start_overhead = sizeof(__u8) + sizeof(__u8) + sizeof(ceph_le32); // version and length prefix
+    constexpr size_t expr_secs_overhead = sizeof(__u32); // expiration_seconds_overhead
+    return start_overhead + expr_secs_overhead + info.estimate_encoded_size();
+  }
 };
 WRITE_CLASS_ENCODER(cls_rgw_gc_set_entry_op)
 
diff --git a/src/cls/rgw/cls_rgw_types.h b/src/cls/rgw/cls_rgw_types.h
index dac3c51353f..ef76204b394 100644
--- a/src/cls/rgw/cls_rgw_types.h
+++ b/src/cls/rgw/cls_rgw_types.h
@@ -419,6 +419,14 @@ struct cls_rgw_obj_key {
     ls.back()->name = "name";
     ls.back()->instance = "instance";
   }
+
+  size_t estimate_encoded_size() const {
+    constexpr size_t start_overhead = sizeof(__u8) + sizeof(__u8) + sizeof(ceph_le32); // version and length prefix
+    constexpr size_t string_overhead = sizeof(__u32); // strings are encoded with 32-bit length prefix
+    return start_overhead +
+            string_overhead + name.size() +
+            string_overhead + instance.size();
+  }
 };
 WRITE_CLASS_ENCODER(cls_rgw_obj_key)
 
@@ -1148,6 +1156,16 @@ struct cls_rgw_obj {
     ls.back()->key.name = "myoid";
     ls.back()->loc = "mykey";
   }
+
+  size_t estimate_encoded_size() const {
+    constexpr size_t start_overhead = sizeof(__u8) + sizeof(__u8) + sizeof(ceph_le32); // version and length prefix
+    constexpr size_t string_overhead = sizeof(__u32); // strings are encoded with 32-bit length prefix
+    return start_overhead +
+            string_overhead + pool.size() +
+            string_overhead + key.name.size() +
+            string_overhead + loc.size() +
+            key.estimate_encoded_size();
+  }
 };
 WRITE_CLASS_ENCODER(cls_rgw_obj)
 
@@ -1192,6 +1210,16 @@ struct cls_rgw_obj_chain {
   bool empty() {
     return objs.empty();
   }
+
+  size_t estimate_encoded_size() const {
+    constexpr size_t start_overhead = sizeof(__u8) + sizeof(__u8) + sizeof(ceph_le32);
+    constexpr size_t size_overhead = sizeof(__u32); // size of the chain
+    size_t chain_overhead = 0;
+    for (const auto& obj : objs) {
+      chain_overhead += obj.estimate_encoded_size();
+    }
+    return (start_overhead + size_overhead + chain_overhead);
+  }
 };
 WRITE_CLASS_ENCODER(cls_rgw_obj_chain)
 
@@ -1233,6 +1261,14 @@ struct cls_rgw_gc_obj_info
     ceph_timespec ts{ceph_le32(21), ceph_le32(32)};
     ls.back()->time = ceph::real_clock::from_ceph_timespec(ts);
   }
+
+  size_t estimate_encoded_size() const {
+    constexpr size_t start_overhead = sizeof(__u8) + sizeof(__u8) + sizeof(ceph_le32); // version and length prefix
+    constexpr size_t string_overhead = sizeof(__u32); // strings are encoded with 32-bit length prefix
+    constexpr size_t time_overhead = 2 * sizeof(ceph_le32); // time is stored as tv_sec and tv_nsec
+    return start_overhead + string_overhead + tag.size() +
+            time_overhead + chain.estimate_encoded_size();
+  }
 };
 WRITE_CLASS_ENCODER(cls_rgw_gc_obj_info)
 
diff --git a/src/rgw/rgw_gc.cc b/src/rgw/rgw_gc.cc
index d59cc65b4b2..bd16bde1bd5 100644
--- a/src/rgw/rgw_gc.cc
+++ b/src/rgw/rgw_gc.cc
@@ -65,7 +65,72 @@ int RGWGC::tag_index(const string& tag)
   return rgw_shards_mod(XXH64(tag.c_str(), tag.size(), seed), max_objs);
 }
 
-int RGWGC::send_chain(cls_rgw_obj_chain& chain, const string& tag)
+/*
+ * Send a chain to gc, splitting it into several gc entries so that the
+ * estimated encoded size of each entry stays within rgw_max_chunk_size
+ * (a value of 0 disables splitting). An entry always carries at least one
+ * object, even when that object alone is over the limit.
+ *
+ * Returns {0, nullopt} on success. On failure returns the error from
+ * send_chain() together with the objects that were not handed over to gc,
+ * so that the caller can delete them inline.
+ */
+std::tuple<int, std::optional<cls_rgw_obj_chain>> RGWGC::send_split_chain(const cls_rgw_obj_chain& chain, const std::string& tag)
+{
+  ldpp_dout(this, 20) << "RGWGC::send_split_chain - tag is: " << tag << dendl;
+
+  if (!cct->_conf->rgw_max_chunk_size) { // splitting is disabled, send the chain as it is
+    auto ret = send_chain(chain, tag);
+    if (ret < 0) {
+      ldpp_dout(this, 0) << "RGWGC::send_split_chain - send chain returned error: " << ret << dendl;
+      return {ret, {chain}};
+    }
+    return {0, std::nullopt};
+  }
+  ldpp_dout(this, 20) << "RGWGC::send_split_chain - rgw_max_chunk_size is: " << cct->_conf->rgw_max_chunk_size << dendl;
+
+  // Size of a set entry op for this tag that carries an empty chain. Every obj
+  // added to the chain grows the estimate by cls_rgw_obj::estimate_encoded_size(),
+  // so a running total avoids copying and re-measuring the chain for each obj.
+  cls_rgw_gc_set_entry_op op;
+  op.info.tag = tag;
+  const size_t base_encoded_size = op.estimate_encoded_size();
+
+  cls_rgw_obj_chain broken_chain;
+  size_t total_encoded_size = base_encoded_size;
+  for (auto it = chain.objs.begin(); it != chain.objs.end(); ++it) {
+    const size_t obj_encoded_size = it->estimate_encoded_size();
+    // never send an empty chain: an obj that is too big on its own goes out alone
+    if (!broken_chain.objs.empty() &&
+        total_encoded_size + obj_encoded_size > cct->_conf->rgw_max_chunk_size) {
+      ldpp_dout(this, 20) << "RGWGC::send_split_chain - more than, dont add to broken chain and send chain" << dendl;
+      auto ret = send_chain(broken_chain, tag);
+      if (ret < 0) {
+        // *it has not been added yet, so [it, end) is exactly the part not in broken_chain
+        broken_chain.objs.insert(broken_chain.objs.end(), it, chain.objs.end()); // add all the remainder objs to the list to be deleted inline
+        ldpp_dout(this, 0) << "RGWGC::send_split_chain - send chain returned error: " << ret << dendl;
+        return {ret, {broken_chain}};
+      }
+      broken_chain.objs.clear();
+      total_encoded_size = base_encoded_size;
+    }
+    ldpp_dout(this, 20) << "RGWGC::send_split_chain - adding obj with name: " << it->key << dendl;
+    broken_chain.objs.emplace_back(*it);
+    total_encoded_size += obj_encoded_size;
+    ldpp_dout(this, 20) << "RGWGC::send_split_chain - total_encoded_size is: " << total_encoded_size << dendl;
+  }
+  if (!broken_chain.objs.empty()) { // whatever is left after the last split
+    ldpp_dout(this, 20) << "RGWGC::send_split_chain - sending leftover objects" << dendl;
+    auto ret = send_chain(broken_chain, tag);
+    if (ret < 0) {
+      ldpp_dout(this, 0) << "RGWGC::send_split_chain - send chain returned error: " << ret << dendl;
+      return {ret, {broken_chain}};
+    }
+  }
+  return {0, std::nullopt};
+}
+
+int RGWGC::send_chain(const cls_rgw_obj_chain& chain, const string& tag)
 {
   ObjectWriteOperation op;
   cls_rgw_gc_obj_info info;
diff --git a/src/rgw/rgw_gc.h b/src/rgw/rgw_gc.h
index f66b0be89c8..196f2802c16 100644
--- a/src/rgw/rgw_gc.h
+++ b/src/rgw/rgw_gc.h
@@ -29,6 +29,7 @@ class RGWGC : public DoutPrefixProvider {
   static constexpr uint64_t seed = 8675309;
 
   int tag_index(const std::string& tag);
+  int send_chain(const cls_rgw_obj_chain& chain, const std::string& tag);
 
   class GCWorker : public Thread {
     const DoutPrefixProvider *dpp;
@@ -51,7 +52,7 @@ public:
     finalize();
   }
   std::vector<bool> transitioned_objects_cache;
-  int send_chain(cls_rgw_obj_chain& chain, const std::string& tag);
+  std::tuple<int, std::optional<cls_rgw_obj_chain>> send_split_chain(const cls_rgw_obj_chain& chain, const std::string& tag);
 
   // asynchronously defer garbage collection on an object that's still being read
   int async_defer_chain(const std::string& tag, const cls_rgw_obj_chain& info);
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index e51cba0d4d2..b57cbc188e8 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -4927,10 +4927,10 @@ int RGWRados::Object::complete_atomic_modification(const DoutPrefixProvider *dpp
     //Delete objects inline just in case gc hasn't been initialised, prevents crashes
     store->delete_objs_inline(dpp, chain, tag);
   } else {
-    auto ret = store->gc->send_chain(chain, tag); // do it synchronously
-    if (ret < 0) {
+    auto [ret, leftover_chain] = store->gc->send_split_chain(chain, tag); // do it synchronously
+    if (ret < 0 && leftover_chain) {
       //Delete objects inline if send chain to gc fails
-      store->delete_objs_inline(dpp, chain, tag);
+      store->delete_objs_inline(dpp, *leftover_chain, tag);
     }
   }
   return 0;
@@ -4950,13 +4950,13 @@ void RGWRados::update_gc_chain(const DoutPrefixProvider *dpp, rgw_obj head_obj,
   }
 }
 
-int RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag)
+std::tuple<int, std::optional<cls_rgw_obj_chain>> RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag)
 {
   if (chain.empty()) {
-    return 0;
+    return {0, std::nullopt};
   }
 
-  return gc->send_chain(chain, tag);
+  return gc->send_split_chain(chain, tag);
 }
 
 void RGWRados::delete_objs_inline(const DoutPrefixProvider *dpp, cls_rgw_obj_chain& chain, const string& tag)
diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h
index 0bac6c607b1..243963711b8 100644
--- a/src/rgw/rgw_rados.h
+++ b/src/rgw/rgw_rados.h
@@ -1469,7 +1469,7 @@ public:
   int unlock(const rgw_pool& pool, const std::string& oid, rgw_zone_id& zone_id, std::string& owner_id);
 
   void update_gc_chain(const DoutPrefixProvider *dpp, rgw_obj head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain);
-  int send_chain_to_gc(cls_rgw_obj_chain& chain, const std::string& tag);
+  std::tuple<int, std::optional<cls_rgw_obj_chain>> send_chain_to_gc(cls_rgw_obj_chain& chain, const std::string& tag);
   void delete_objs_inline(const DoutPrefixProvider *dpp, cls_rgw_obj_chain& chain, const std::string& tag);
   int gc_operate(const DoutPrefixProvider *dpp, std::string& oid, librados::ObjectWriteOperation *op);
   int gc_aio_operate(const std::string& oid, librados::AioCompletion *c,
diff --git a/src/rgw/rgw_sal_rados.cc b/src/rgw/rgw_sal_rados.cc
index be2ecf90481..6ca6703c484 100644
--- a/src/rgw/rgw_sal_rados.cc
+++ b/src/rgw/rgw_sal_rados.cc
@@ -2003,14 +2003,15 @@ int RadosMultipartUpload::abort(const DoutPrefixProvider *dpp, CephContext *cct)
       store->getRados()->delete_objs_inline(dpp, chain, mp_obj.get_upload_id());
     } else {
       /* use upload id as tag and do it synchronously */
-      ret = store->getRados()->send_chain_to_gc(chain, mp_obj.get_upload_id());
-      if (ret < 0) {
+      std::optional<cls_rgw_obj_chain> leftover_chain;
+      std::tie(ret, leftover_chain) = store->getRados()->send_chain_to_gc(chain, mp_obj.get_upload_id());
+      if (ret < 0 && leftover_chain) {
         ldpp_dout(dpp, 5) << __func__ << ": gc->send_chain() returned " << ret << dendl;
         if (ret == -ENOENT) {
           return -ERR_NO_SUCH_UPLOAD;
         }
         //Delete objects inline if send chain to gc fails
-        store->getRados()->delete_objs_inline(dpp, chain, mp_obj.get_upload_id());
+        store->getRados()->delete_objs_inline(dpp, *leftover_chain, mp_obj.get_upload_id());
       }
     }
 
-- 
2.39.5