From 63f20ebdd63e72d03ff0741f900bf1fb0a1591bb Mon Sep 17 00:00:00 2001 From: Bill Scales Date: Wed, 26 Mar 2025 10:46:07 +0000 Subject: [PATCH] osd: EC Optimizations: Relax reset_complete_to for partial writes EC Optimized pools can have shards missing log entries because of partial writes. This means it is possible to have a missing entry with a newer version than the log. Relax an assert in reset_complete_to to avoid this. reset_complete_to also resets last_complete to 0 when the oldest missing object is before the first log entry. This is too aggressive for partial writes and needs to be relaxed. Signed-off-by: Bill Scales --- src/osd/PGLog.h | 26 ++++++++++++++++++++------ src/osd/PeeringState.cc | 13 ++++++++----- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h index 4d899f2356b38..a6434098d2d34 100644 --- a/src/osd/PGLog.h +++ b/src/osd/PGLog.h @@ -929,7 +929,7 @@ public: ceph_assert(log.get_can_rollback_to() >= v); } - void reset_complete_to(pg_info_t *info) { + void reset_complete_to(pg_info_t *info, bool ec_optimizations_enabled) { if (log.log.empty()) // caller is split_into() return; log.complete_to = log.log.begin(); @@ -938,13 +938,26 @@ public: if (oldest_need != eversion_t()) { while (log.complete_to->version < oldest_need) { ++log.complete_to; + // partial writes allow a shard which did not participate in a write to + // have a missing version that is newer than the most recent log entry + if (ec_optimizations_enabled && (log.complete_to == log.log.end())) { + break; + } ceph_assert(log.complete_to != log.log.end()); } } if (!info) return; if (log.complete_to == log.log.begin()) { - info->last_complete = eversion_t(); + // partial writes use last complete to track shards that did not + // participate in a write - do not reset it unnecessarily + if (!ec_optimizations_enabled) { + info->last_complete = eversion_t(); + } else if ((oldest_need != eversion_t()) && + info->last_complete >= oldest_need) { + 
info->last_complete = eversion_t(oldest_need.epoch, + oldest_need.version - 1); + } } else { --log.complete_to; info->last_complete = log.complete_to->version; @@ -952,8 +965,8 @@ public: } } - void activate_not_complete(pg_info_t &info) { - reset_complete_to(&info); + void activate_not_complete(pg_info_t &info, bool ec_optimizations_enabled) { + reset_complete_to(&info, ec_optimizations_enabled); log.last_requested = 0; } @@ -1322,7 +1335,8 @@ public: bool append_new_log_entries( const hobject_t &last_backfill, const mempool::osd_pglog::list &entries, - LogEntryHandler *rollbacker) { + LogEntryHandler *rollbacker, + bool ec_optimizations_enabled) { bool invalidate_stats = append_log_entries_update_missing( last_backfill, entries, @@ -1342,7 +1356,7 @@ public: // always in a std::list of solely lost_delete entries, so it is // sufficient to check whether the first entry is a // lost_delete - reset_complete_to(nullptr); + reset_complete_to(nullptr, ec_optimizations_enabled); } } return invalidate_stats; diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc index 3d0a74dcf6f8f..89f6037406ef7 100644 --- a/src/osd/PeeringState.cc +++ b/src/osd/PeeringState.cc @@ -2837,7 +2837,7 @@ void PeeringState::activate( } else { psdout(10) << "activate - not complete, " << missing << dendl; info.stats.stats.sum.num_objects_missing = missing.num_missing(); - pg_log.activate_not_complete(info); + pg_log.activate_not_complete(info, pool.info.allows_ecoptimizations()); } log_weirdness(); @@ -3389,6 +3389,8 @@ void PeeringState::try_mark_clean() void PeeringState::split_into( pg_t child_pgid, PeeringState *child, unsigned split_bits) { + bool ec_optimizations_enabled = pool.info.allows_ecoptimizations(); + child->update_osdmap_ref(get_osdmap()); child->pool = pool; @@ -3407,8 +3409,8 @@ void PeeringState::split_into( child->info.log_tail = child->pg_log.get_tail(); // reset last_complete, we might have modified pg_log & missing above - pg_log.reset_complete_to(&info); - 
child->pg_log.reset_complete_to(&child->info); + pg_log.reset_complete_to(&info, ec_optimizations_enabled); + child->pg_log.reset_complete_to(&child->info, ec_optimizations_enabled); // Info child->info.history = info.history; @@ -4227,7 +4229,8 @@ bool PeeringState::append_log_entries_update_missing( pg_log.append_new_log_entries( info.last_backfill, entries, - rollbacker.get()); + rollbacker.get(), + pool.info.allows_ecoptimizations()); if (pg_committed_to && entries.rbegin()->soid > info.last_backfill) { pg_log.roll_forward(&info, rollbacker.get()); @@ -4502,7 +4505,7 @@ void PeeringState::force_object_missing( peer_missing[peer].add(soid, version, eversion_t(), false); } else { pg_log.missing_add(soid, version, eversion_t()); - pg_log.reset_complete_to(&info); + pg_log.reset_complete_to(&info, pool.info.allows_ecoptimizations()); pg_log.set_last_requested(0); } } -- 2.39.5