From: Alex Ainscow Date: Tue, 29 Apr 2025 11:02:07 +0000 (+0100) Subject: osd: Refactor partial_write to address multiple issues. X-Git-Tag: v20.1.0~68^2~32 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=f51f60f3b27e4339972645cdeb34369a164e6ae2;p=ceph.git osd: Refactor partial_write to address multiple issues. We fix a number of issues with partial_write here. Fix an issue where it is unclear whether the empty PWLC state is newer or older than a populated PWLC on another shard by always updating the pwlc with an empty range, rather than blank. This is an unfortunate small increase in metadata, so we should come back to this in a later commit (or possibly later PR). Normally a PG log consists of a set of log entries with each log entry have a version number one greater than the previous entry. When a PG splits the PG log is split so that each of the new PGs only has log entries for objects in that PG, which means there can be gaps between version numbers. PGBackend::partial_write is trying to keep track of adjacent log updates than do not update a particular shard storing these as a range in partial_writes_last_complete. To do this it must compare with the version number of the previous log entry rather than testing for a version number increment of one. Also simplify partial_writes to make it more readable. Signed-off-by: Bill Scales Signed-off-by: Alex Ainscow (cherry picked from commit 2467406f3e22c0746ce20cd04b838dccedadf055) --- diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index 6384f93f72d..dd35b7f0ecd 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -478,9 +478,13 @@ public: void trim(const pg_log_entry_t &entry) override { // TODO } - void partial_write(pg_info_t *info, const pg_log_entry_t &entry) override { + void partial_write(pg_info_t *info, + eversion_t previous_version, + const pg_log_entry_t &entry + ) override { // TODO - ceph_assert(entry.written_shards.empty() && info->partial_writes_last_complete.empty()); + ceph_assert(entry.written_shards.empty() && + info->partial_writes_last_complete.empty()); } }; PGLog::LogEntryHandlerRef get_log_handler( diff --git a/src/osd/PG.h b/src/osd/PG.h index 87c52f5bb72..f3e760c1b8f 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -1144,8 +1144,10 @@ protected: void trim(const pg_log_entry_t &entry) override { pg->get_pgbackend()->trim(entry, t); } - void partial_write(pg_info_t *info, const pg_log_entry_t &entry) override { - pg->get_pgbackend()->partial_write(info, entry); + void partial_write(pg_info_t *info, eversion_t previous_version, + const pg_log_entry_t &entry + ) override { + pg->get_pgbackend()->partial_write(info, previous_version, entry); } }; diff --git a/src/osd/PGBackend.cc b/src/osd/PGBackend.cc index ba492a69974..b199c9248dc 100644 --- a/src/osd/PGBackend.cc +++ b/src/osd/PGBackend.cc @@ -413,51 +413,53 @@ void PGBackend::try_stash( void PGBackend::partial_write( pg_info_t *info, + eversion_t previous_version, const pg_log_entry_t &entry) { ceph_assert(info != nullptr); + if (entry.written_shards.empty() && info->partial_writes_last_complete.empty()) { + return; + } auto dpp = get_parent()->get_dpp(); - if (!entry.written_shards.empty()) { - ldpp_dout(dpp, 20) << __func__ << " version=" << entry.version - << " written_shards=" << entry.written_shards - << " present_shards=" << entry.present_shards - << " pwlc=" << info->partial_writes_last_complete - << dendl; - const pg_pool_t &pool = get_parent()->get_pool(); - for (unsigned int shard = 0; - shard < get_parent()->get_pool().size; - shard++) { - if (pool.is_nonprimary_shard(shard_id_t(shard))) { - if (!entry.is_written_shard(shard_id_t(shard))) { - if (!info->partial_writes_last_complete.contains(shard_id_t(shard))) { - // 1st partial write since all logs were updated - info->partial_writes_last_complete[shard_id_t(shard)] = - std::pair(entry.prior_version, entry.version); - } else if (info->partial_writes_last_complete[shard_id_t(shard)] - .second.version + 1 == entry.version.version) { - // Subsequent partial write, version is sequential - info->partial_writes_last_complete[shard_id_t(shard)].second = - entry.version; - } else { - // Subsequent partial write, discontiguous versions - ldpp_dout(dpp, 20) << __func__ << " cannot update shard " << shard - << dendl; - } - } else { - // Log updated or shard absent, partial write entry not required - info->partial_writes_last_complete.erase(shard_id_t(shard)); - } + ldpp_dout(dpp, 20) << __func__ << " version=" << entry.version + << " written_shards=" << entry.written_shards + << " present_shards=" << entry.present_shards + << " pwlc=" << info->partial_writes_last_complete + << " previous_version=" << previous_version + << dendl; + const pg_pool_t &pool = get_parent()->get_pool(); + for (shard_id_t shard : pool.nonprimary_shards) { + auto pwlc_iter = info->partial_writes_last_complete.find(shard); + if (!entry.is_written_shard(shard)) { + if (pwlc_iter == info->partial_writes_last_complete.end()) { + // 1st partial write since all logs were updated + info->partial_writes_last_complete[shard] = + std::pair(previous_version, entry.version); + + continue; } + auto &&[old_v, new_v] = pwlc_iter->second; + if (old_v == new_v) { + old_v = previous_version; + new_v = entry.version; + } else if (new_v == previous_version) { + // Subsequent partial write, contiguous versions + new_v = entry.version; + } else { + // Subsequent partial write, discontiguous versions + ldpp_dout(dpp, 20) << __func__ << " cannot update shard " << shard + << dendl; + } + } else if (pwlc_iter != info->partial_writes_last_complete.end()) { + auto &&[old_v, new_v] = pwlc_iter->second; + // Log updated or shard absent, partial write entry is a no-op + // FIXME: In a later commit (or later PR) we should look at other ways of + // actually clearing the PWLC once all shards have seen the update. + old_v = new_v = entry.version; } - ldpp_dout(dpp, 20) << __func__ << " after pwlc=" - << info->partial_writes_last_complete << dendl; - } else { - // All shard updated - clear partial write data - if (!info->partial_writes_last_complete.empty()) { - ldpp_dout(dpp, 20) << __func__ << " clear pwlc" << dendl; - } - info->partial_writes_last_complete.clear(); } + ldpp_dout(dpp, 20) << __func__ << " after pwlc=" + << info->partial_writes_last_complete << dendl; } void PGBackend::remove( diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h index efe2f173cf5..acbff0745b2 100644 --- a/src/osd/PGBackend.h +++ b/src/osd/PGBackend.h @@ -504,6 +504,7 @@ typedef std::shared_ptr OSDMapRef; void partial_write( pg_info_t *info, + eversion_t previous_version, const pg_log_entry_t &entry); void remove( diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h index 329a968509a..be7e97381f2 100644 --- a/src/osd/PGLog.h +++ b/src/osd/PGLog.h @@ -152,8 +152,10 @@ struct PGLog : DoutPrefixProvider { const hobject_t &hoid, version_t v) = 0; virtual void partial_write( - pg_info_t *info, - const pg_log_entry_t &entry) = 0; + pg_info_t *info, + eversion_t previous_version, + const pg_log_entry_t &entry + ) = 0; virtual ~LogEntryHandler() {} }; using LogEntryHandlerRef = std::unique_ptr; @@ -200,13 +202,20 @@ public: rollback_info_trimmed_to = to; } + eversion_t previous_version; + if (rollback_info_trimmed_to_riter == log.rend()) { + previous_version = tail; + } else { + previous_version = rollback_info_trimmed_to_riter->version; + } while (rollback_info_trimmed_to_riter != log.rbegin()) { --rollback_info_trimmed_to_riter; if (rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) { ++rollback_info_trimmed_to_riter; break; } - f(*rollback_info_trimmed_to_riter); + f(*rollback_info_trimmed_to_riter, previous_version); + previous_version = rollback_info_trimmed_to_riter->version; } return dirty_log; @@ -260,30 +269,31 @@ public: void trim_rollback_info_to(eversion_t to, pg_info_t *info, LogEntryHandler *h) { advance_can_rollback_to( to, - [&](pg_log_entry_t &entry) { + [&](pg_log_entry_t &entry, eversion_t previous_version) { h->trim(entry); - h->partial_write(info, entry); + h->partial_write(info, previous_version, entry); }); } bool roll_forward_to(eversion_t to, pg_info_t *info, LogEntryHandler *h) { return advance_can_rollback_to( to, - [&](pg_log_entry_t &entry) { + [&](pg_log_entry_t &entry, eversion_t previous_version) { h->rollforward(entry); - h->partial_write(info, entry); + h->partial_write(info, previous_version, entry); }); } void skip_can_rollback_to_to_head(pg_info_t *info, LogEntryHandler *h) { advance_can_rollback_to( head, - [&](pg_log_entry_t &entry) { - h->partial_write(info, entry); + [&](pg_log_entry_t &entry, eversion_t previous_version) { + h->partial_write(info, previous_version, entry); }); } void skip_can_rollback_to_to_head() { - advance_can_rollback_to(head, [&](const pg_log_entry_t &entry) {}); + advance_can_rollback_to(head, [&](const pg_log_entry_t &entry, + eversion_t previous_version) {}); } mempool::osd_pglog::list rewind_from_head(eversion_t newhead) { diff --git a/src/test/osd/TestPGLog.cc b/src/test/osd/TestPGLog.cc index ef6b8758dd3..ff275b92bc6 100644 --- a/src/test/osd/TestPGLog.cc +++ b/src/test/osd/TestPGLog.cc @@ -234,8 +234,10 @@ public: void trim( const pg_log_entry_t &entry) override {} void partial_write( - pg_info_t *info, - const pg_log_entry_t &entry) override {} + pg_info_t *info, + eversion_t previous_version, + const pg_log_entry_t &entry + ) override {} }; template @@ -360,8 +362,10 @@ struct TestHandler : public PGLog::LogEntryHandler { void trim( const pg_log_entry_t &entry) override {} void partial_write( - pg_info_t *info, - const pg_log_entry_t &entry) override {} + pg_info_t *info, + eversion_t previous_version, + const pg_log_entry_t &entry + ) override {} }; TEST_F(PGLogTest, rewind_divergent_log) {