From 5ad1aaf9c67086e8ad694e923c7dee3466f27d70 Mon Sep 17 00:00:00 2001 From: Bill Scales <156200352+bill-scales@users.noreply.github.com> Date: Thu, 6 Mar 2025 09:46:05 +0000 Subject: [PATCH] osd: EC optimizations: changes to rollback to support partial writes EC Pools create an ObjectModDesc entry in each log entry that describes how to undo the modifcation. During peering if it is determined that only some of the shards completed an update then these shards are instructed to rollback the change. This ensures that each update is either applied to all or none of the shards. For EC optimized pools rollback becomes a bit more complicated. Firstly because not all shards may have been updated the rollback needs to be more selective in what is undone. Secondly optimized pools do not pad objects to be a multiple of the stripe width which means shards can be different sizes. There is a single ObjectModDesc entry that contains a set of operations applied by every shard, individial operations need to include enough information to work out what has to be undone on each shard. Signed-off-by: Bill Scales --- src/osd/ECBackend.h | 6 +- src/osd/ECBackendL.h | 3 + src/osd/ECSwitch.h | 2 +- src/osd/PGBackend.cc | 178 +++++++++++++++++++++++++++++++++++-------- src/osd/PGBackend.h | 9 ++- src/osd/osd_types.cc | 27 ++++++- src/osd/osd_types.h | 56 +++++++++++--- 7 files changed, 227 insertions(+), 54 deletions(-) diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h index 71d14a1e9fc..8b50cc22e48 100644 --- a/src/osd/ECBackend.h +++ b/src/osd/ECBackend.h @@ -388,7 +388,11 @@ public: int get_ec_stripe_chunk_size() const { return sinfo.get_chunk_size(); } - uint64_t object_size_to_shard_size(const uint64_t size, int shard) const { + uint64_t object_size_to_shard_size(const uint64_t size, + shard_id_t shard) const { + if (size == std::numeric_limits::max()) { + return size; + } return sinfo.logical_to_next_chunk_offset(size); } /** diff --git a/src/osd/ECBackendL.h b/src/osd/ECBackendL.h index f4f0c3afd50..6a578057acd 100644 --- a/src/osd/ECBackendL.h +++ b/src/osd/ECBackendL.h @@ -385,6 +385,9 @@ END_IGNORE_DEPRECATED return sinfo.get_chunk_size(); } uint64_t object_size_to_shard_size(const uint64_t size) const { + if (size == std::numeric_limits::max()) { + return size; + } return sinfo.logical_to_next_chunk_offset(size); } /** diff --git a/src/osd/ECSwitch.h b/src/osd/ECSwitch.h index 893869793ef..a5e81e8cbb9 100644 --- a/src/osd/ECSwitch.h +++ b/src/osd/ECSwitch.h @@ -347,7 +347,7 @@ public: } uint64_t - object_size_to_shard_size(const uint64_t size, int shard) const override + object_size_to_shard_size(const uint64_t size, shard_id_t shard) const override { if (is_optimized()) { return optimized.object_size_to_shard_size(size, shard); diff --git a/src/osd/PGBackend.cc b/src/osd/PGBackend.cc index 9bf461f2528..e91614192f5 100644 --- a/src/osd/PGBackend.cc +++ b/src/osd/PGBackend.cc @@ -204,23 +204,56 @@ void PGBackend::rollback( struct RollbackVisitor : public ObjectModDesc::Visitor { const hobject_t &hoid; PGBackend *pg; + const pg_log_entry_t &entry; ObjectStore::Transaction t; RollbackVisitor( const hobject_t &hoid, - PGBackend *pg) : hoid(hoid), pg(pg) {} + PGBackend *pg, + const pg_log_entry_t &entry) : hoid(hoid), pg(pg), entry(entry) {} void append(uint64_t old_size) override { ObjectStore::Transaction temp; - int s = static_cast(pg->get_parent()->whoami_shard().shard); - const uint64_t shard_size = pg->object_size_to_shard_size(old_size, s); + auto dpp = pg->get_parent()->get_dpp(); + const uint64_t shard_size = pg->object_size_to_shard_size(old_size, + pg->get_parent()->whoami_shard().shard); + ldpp_dout(dpp, 20) << " entry " << entry.version + << " rollback append object_size " << old_size + << " shard_size " << shard_size << dendl; pg->rollback_append(hoid, shard_size, &temp); temp.append(t); temp.swap(t); } void setattrs(map > &attrs) override { - ObjectStore::Transaction temp; - pg->rollback_setattrs(hoid, attrs, &temp); - temp.append(t); - temp.swap(t); + auto dpp = pg->get_parent()->get_dpp(); + const pg_pool_t &pool = pg->get_parent()->get_pool(); + if (pool.is_nonprimary_shard(pg->get_parent()->whoami_shard().shard)) { + if (entry.is_written_shard(pg->get_parent()->whoami_shard().shard)) { + // Written shard - only rollback OI attr + ldpp_dout(dpp, 20) << " entry " << entry.version + << " written shard OI attr rollback " + << pg->get_parent()->whoami_shard().shard + << dendl; + ObjectStore::Transaction temp; + pg->rollback_setattrs(hoid, attrs, &temp, true); + temp.append(t); + temp.swap(t); + } else { + // Unwritten shard - nothing to rollback + ldpp_dout(dpp, 20) << " entry " << entry.version + << " unwritten shard skipping attr rollback " + << pg->get_parent()->whoami_shard().shard + << dendl; + } + } else { + // Primary shard - rollback all attrs + ldpp_dout(dpp, 20) << " entry " << entry.version + << " primary_shard attr rollback " + << pg->get_parent()->whoami_shard().shard + << dendl; + ObjectStore::Transaction temp; + pg->rollback_setattrs(hoid, attrs, &temp, false); + temp.append(t); + temp.swap(t); + } } void rmobject(version_t old_version) override { ObjectStore::Transaction temp; @@ -247,17 +280,58 @@ void PGBackend::rollback( temp.swap(t); } void rollback_extents( - version_t gen, - const vector > &extents) override { + const version_t gen, + const std::vector> &extents, + const uint64_t object_size, + const std::vector &shards) override { ObjectStore::Transaction temp; - pg->rollback_extents(gen, extents, hoid, &temp); - temp.append(t); - temp.swap(t); + const pg_pool_t& pool = pg->get_parent()->get_pool(); + ceph_assert(entry.written_shards.empty() || + pool.allows_ecoptimizations()); + auto dpp = pg->get_parent()->get_dpp(); + bool donework = false; + ceph_assert(shards.empty() || shards.size() == extents.size()); + for (unsigned int i = 0; i < extents.size(); i++) { + if (shards.empty() || + shards[i].empty() || + shards[i].contains(pg->get_parent()->whoami_shard().shard)) { + // Written shard - rollback extents + const uint64_t shard_size = pg->object_size_to_shard_size( + object_size, + pg->get_parent()->whoami_shard().shard); + ldpp_dout(dpp, 20) << " entry " << entry.version + << " written shard rollback_extents " + << entry.written_shards + << " shards " + << (shards.empty() ? shard_id_set() : shards[i]) + << " " << pg->get_parent()->whoami_shard().shard + << " " << object_size + << " " << shard_size + << dendl; + pg->rollback_extents(gen, extents[i].first, extents[i].second, + hoid, shard_size, &temp); + donework = true; + } else { + // Unwritten shard - nothing to rollback + ldpp_dout(dpp, 20) << " entry " << entry.version + << " unwritten shard skipping rollback_extents " + << entry.written_shards + << " " << pg->get_parent()->whoami_shard().shard + << dendl; + } + } + if (donework) { + t.remove( + pg->coll, + ghobject_t(hoid, gen, pg->get_parent()->whoami_shard().shard)); + temp.append(t); + temp.swap(t); + } } }; ceph_assert(entry.mod_desc.can_rollback()); - RollbackVisitor vis(entry.soid, this); + RollbackVisitor vis(entry.soid, this, entry); entry.mod_desc.visit(&vis); t->append(vis.t); } @@ -279,12 +353,28 @@ struct Trimmer : public ObjectModDesc::Visitor { } // try_rmobject defaults to rmobject void rollback_extents( - version_t gen, - const vector > &extents) override { - pg->trim_rollback_object( - soid, - gen, - t); + const version_t gen, + const std::vector> &extents, + const uint64_t object_size, + const std::vector &shards) override { + auto dpp = pg->get_parent()->get_dpp(); + ceph_assert(shards.empty() || shards.size() == extents.size()); + for (unsigned int i = 0; i < extents.size(); i++) { + if (shards.empty() || + shards[i].empty() || + shards[i].contains(pg->get_parent()->whoami_shard().shard)) { + ldpp_dout(dpp, 30) << __func__ << " trim " << shards << " " + << pg->get_parent()->whoami_shard().shard << dendl; + pg->trim_rollback_object( + soid, + gen, + t); + break; + } else { + ldpp_dout(dpp, 20) << __func__ << " skipping trim " << shards << " " + << pg->get_parent()->whoami_shard().shard << dendl; + } + } } }; @@ -481,7 +571,8 @@ int PGBackend::objects_get_attrs( void PGBackend::rollback_setattrs( const hobject_t &hoid, map > &old_attrs, - ObjectStore::Transaction *t) { + ObjectStore::Transaction *t, + bool only_oi) { map> to_set; ceph_assert(!hoid.is_temp()); for (map >::iterator i = old_attrs.begin(); @@ -489,17 +580,25 @@ void PGBackend::rollback_setattrs( ++i) { if (i->second) { to_set[i->first] = *(i->second); - } else { + } else if (!only_oi) { t->rmattr( coll, ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), i->first); } } - t->setattrs( - coll, - ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), - to_set); + if (only_oi) { + t->setattr( + coll, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + OI_ATTR, + to_set[OI_ATTR]); + } else { + t->setattrs( + coll, + ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), + to_set); + } } void PGBackend::rollback_append( @@ -533,6 +632,7 @@ void PGBackend::rollback_try_stash( version_t old_version, ObjectStore::Transaction *t) { ceph_assert(!hoid.is_temp()); + dout(20) << __func__ << " " << hoid << " " << old_version << dendl; t->remove( coll, ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); @@ -544,22 +644,33 @@ void PGBackend::rollback_try_stash( void PGBackend::rollback_extents( version_t gen, - const vector > &extents, + const uint64_t offset, + uint64_t length, const hobject_t &hoid, + const uint64_t shard_size, ObjectStore::Transaction *t) { auto shard = get_parent()->whoami_shard().shard; - for (auto &&extent: extents) { + if (offset >= shard_size) { + // extent on this shard is beyond the end of the object - nothing to do + dout(20) << __func__ << " " << hoid << " " + << offset << "~" << length << " is out of range " + << shard_size << dendl; + } else { + if (offset + length > shard_size) { + dout(20) << __func__ << " " << length << " is being truncated" << dendl; + // extent on this shard goes beyond end of the object - truncate length + length = shard_size - offset; + } + dout(20) << __func__ << " " << hoid << " " << offset << "~" << length + << dendl; t->clone_range( coll, ghobject_t(hoid, gen, shard), ghobject_t(hoid, ghobject_t::NO_GEN, shard), - extent.first, - extent.second, - extent.first); + offset, + length, + offset); } - t->remove( - coll, - ghobject_t(hoid, gen, shard)); } void PGBackend::trim_rollback_object( @@ -567,6 +678,7 @@ void PGBackend::trim_rollback_object( version_t old_version, ObjectStore::Transaction *t) { ceph_assert(!hoid.is_temp()); + dout(20) << __func__ << " trim " << hoid << " " << old_version << dendl; t->remove( coll, ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard)); } diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h index 9103e7e57d9..32ca5f17820 100644 --- a/src/osd/PGBackend.h +++ b/src/osd/PGBackend.h @@ -426,7 +426,7 @@ typedef std::shared_ptr OSDMapRef; virtual IsPGReadablePredicate *get_is_readable_predicate() const = 0; virtual unsigned int get_ec_data_chunk_count() const { return 0; }; virtual int get_ec_stripe_chunk_size() const { return 0; }; - virtual uint64_t object_size_to_shard_size(const uint64_t size, int shard) const { return size; }; + virtual uint64_t object_size_to_shard_size(const uint64_t size, shard_id_t shard) const { return size; }; virtual void dump_recovery_info(ceph::Formatter *f) const = 0; private: @@ -504,7 +504,8 @@ typedef std::shared_ptr OSDMapRef; void rollback_setattrs( const hobject_t &hoid, std::map > &old_attrs, - ObjectStore::Transaction *t); + ObjectStore::Transaction *t, + bool only_oi); /// Truncate object to rollback append void rollback_append( @@ -534,8 +535,10 @@ typedef std::shared_ptr OSDMapRef; /// Clone the extents back into place void rollback_extents( version_t gen, - const std::vector > &extents, + const uint64_t offset, + uint64_t length, const hobject_t &hoid, + const uint64_t shard_size, ObjectStore::Transaction *t); public: diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 6068e6a1b1f..4a4e74758fb 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -4668,11 +4668,25 @@ void ObjectModDesc::visit(Visitor *visitor) const break; } case ROLLBACK_EXTENTS: { - vector > extents; + vector> extents; version_t gen; + uint64_t object_size; + vector shards; decode(gen, bp); decode(extents, bp); - visitor->rollback_extents(gen,extents); + if (struct_v < 3) { + // Object size is used by optimized EC pools that do not pad objects to a + // multiple of the strip size. Rollback clone operations for each shard + // need to be truncated to not exceed the object size. Legacy EC pools + // do not store the object_size, but because objects are padded do not + // need to truncate the clones. Setting object_size to max avoids + // truncation. + object_size = std::numeric_limits::max(); + } else { + decode(object_size, bp); + decode(shards, bp); + } + visitor->rollback_extents(gen, extents, object_size, shards); break; } default: @@ -4728,11 +4742,16 @@ struct DumpVisitor : public ObjectModDesc::Visitor { f->close_section(); } void rollback_extents( - version_t gen, - const vector > &extents) override { + const version_t gen, + const vector> &extents, + const uint64_t object_size, + const vector &shards) override { f->open_object_section("op"); f->dump_string("code", "ROLLBACK_EXTENTS"); f->dump_unsigned("gen", gen); + f->dump_unsigned("object_size", object_size); + f->dump_stream("extents") << extents; + f->dump_stream("shards") << shards; f->dump_stream("snaps") << extents; f->close_section(); } diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 1cbfef7e15e..be34ad0f717 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -4052,8 +4052,10 @@ public: virtual void create() {} virtual void update_snaps(const std::set &old_snaps) {} virtual void rollback_extents( - version_t gen, - const std::vector > &extents) {} + const version_t gen, + const std::vector> &extents, + const uint64_t object_size, + const std::vector &shards) {} virtual ~Visitor() {} }; void visit(Visitor *visitor) const; @@ -4076,8 +4078,9 @@ public: rollback_info_completed = other.rollback_info_completed; } void claim_append(ObjectModDesc &other) { - if (!can_local_rollback || rollback_info_completed) + if (!can_local_rollback || rollback_info_completed) { return; + } if (!other.can_local_rollback) { mark_unrollbackable(); return; @@ -4099,24 +4102,27 @@ public: encode(_id, bl); } void append(uint64_t old_size) { - if (!can_local_rollback || rollback_info_completed) + if (!can_local_rollback || rollback_info_completed) { return; + } ENCODE_START(1, 1, bl); append_id(APPEND); encode(old_size, bl); ENCODE_FINISH(bl); } void setattrs(std::map> &old_attrs) { - if (!can_local_rollback || rollback_info_completed) + if (!can_local_rollback || rollback_info_completed) { return; + } ENCODE_START(1, 1, bl); append_id(SETATTRS); encode(old_attrs, bl); ENCODE_FINISH(bl); } bool rmobject(version_t deletion_version) { - if (!can_local_rollback || rollback_info_completed) + if (!can_local_rollback || rollback_info_completed) { return false; + } ENCODE_START(1, 1, bl); append_id(DELETE); encode(deletion_version, bl); @@ -4125,8 +4131,9 @@ public: return true; } bool try_rmobject(version_t deletion_version) { - if (!can_local_rollback || rollback_info_completed) + if (!can_local_rollback || rollback_info_completed) { return false; + } ENCODE_START(1, 1, bl); append_id(TRY_DELETE); encode(deletion_version, bl); @@ -4135,27 +4142,51 @@ public: return true; } void create() { - if (!can_local_rollback || rollback_info_completed) + if (!can_local_rollback || rollback_info_completed) { return; + } rollback_info_completed = true; ENCODE_START(1, 1, bl); append_id(CREATE); ENCODE_FINISH(bl); } void update_snaps(const std::set &old_snaps) { - if (!can_local_rollback || rollback_info_completed) + if (!can_local_rollback || rollback_info_completed) { return; + } ENCODE_START(1, 1, bl); append_id(UPDATE_SNAPS); encode(old_snaps, bl); ENCODE_FINISH(bl); } void rollback_extents( - version_t gen, const std::vector > &extents) { + const version_t gen, + const std::vector> &extents, + const uint64_t object_size, + const std::vector &shards) { + ceph_assert(can_local_rollback); + ceph_assert(!rollback_info_completed); + if (max_required_version < 2) { + max_required_version = 2; + } + ENCODE_START(3, 2, bl); + append_id(ROLLBACK_EXTENTS); + encode(gen, bl); + encode(extents, bl); + encode(object_size, bl); + encode(shards, bl); + ENCODE_FINISH(bl); + } + + // Version for legacy EC (can be deleted when EC*L.cc is deleted. + void rollback_extents( + const version_t gen, + const std::vector> &extents) { ceph_assert(can_local_rollback); ceph_assert(!rollback_info_completed); - if (max_required_version < 2) + if (max_required_version < 2) { max_required_version = 2; + } ENCODE_START(2, 2, bl); append_id(ROLLBACK_EXTENTS); encode(gen, bl); @@ -4185,8 +4216,9 @@ public: * message buffer */ void trim_bl() const { - if (bl.length() > 0) + if (bl.length() > 0) { bl.rebuild(); + } } void encode(ceph::buffer::list &bl) const; void decode(ceph::buffer::list::const_iterator &bl); -- 2.39.5