From 8ca209e33709b1915858a4cd9747d6c580797a4c Mon Sep 17 00:00:00 2001 From: Bill Scales Date: Fri, 6 Jun 2025 13:28:14 +0100 Subject: [PATCH] osd: EC optimizations fix bug when recovering only partial write objects PGLog::reset_complete_to does not handle the scenario where all the missing objects have a partial write that excludes updating the shard being recovered as their most recent update. In this scenario the oldest need is newer than the newest log entry. Setting last_complete to the head of the log confuses the code and makes it think that recovery has completed. The fix is to hold last_complete one entry behind the head of the log until all missing objects have been recovered. PGLog::recover_got already does this when an object is recovered and the remaining objects to recover match this scenario, so this fix just makes reset_complete_to behave the same way as recover_got. Signed-off-by: Bill Scales --- src/osd/PGLog.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h index c829f05348983..53ca8e92eb2ab 100644 --- a/src/osd/PGLog.h +++ b/src/osd/PGLog.h @@ -1010,6 +1010,9 @@ public: // partial writes allow a shard which did not participate in a write to // have a missing version that is newer that the most recent log entry if (ec_optimizations_enabled && (log.complete_to == log.log.end())) { + // keep complete_to one entry behind the end of the log to stop + // code incorrectly using it to deduce that recovery has completed + --log.complete_to; break; } ceph_assert(log.complete_to != log.log.end()); -- 2.39.5