From: Alex Ainscow Date: Mon, 12 Jan 2026 09:36:35 +0000 (+0000) Subject: osd: Change rmissing map key from version_t to eversion_t X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=ea27f511bf287592db2ea7dde675b58c6fe8eff4;p=ceph-ci.git osd: Change rmissing map key from version_t to eversion_t Changed the rmissing reverse index map from std::map<version_t, hobject_t> to std::map<eversion_t, hobject_t> to properly track both epoch and version information for missing objects. Motivation: This fixes a critical bug where out-of-order divergent logs across epoch boundaries could corrupt the missing list and cause assertions in pg_missing_set::get_oldest_need(). The root cause was that rmissing used only version_t as the key, which could lead to version number collisions when the same version number appears in different epochs. When merging logs from epoch 1 and epoch 2, objects with the same version number (e.g., version 1 in epoch 1 vs version 1 in epoch 2) would collide in the map, causing the reverse index to become inconsistent with the forward missing map. During out-of-order recovery, this inconsistency would trigger the assertion: FAILED ceph_assert(it != missing.end()) in get_oldest_need() when trying to look up an object that should exist in missing but whose entry in rmissing was overwritten by a collision. 
Fix: By using the full eversion_t (epoch + version) as the key, the map now: Properly orders missing objects by epoch first, then version. Prevents key collisions when version numbers overlap across epochs. Ensures correct recovery ordering during epoch transitions. Maintains consistency between missing and rmissing maps. Changes: Updated rmissing type declaration and all accessor methods in pg_missing_set Updated all map operations to use eversion_t keys throughout the codebase Added safety assertions to detect duplicate keys and ensure consistency between missing and rmissing maps Fixed iterator types and conversions in recovery code paths Upgrade Compatibility: This change requires no upgrade or versioning code because rmissing is never serialized to disk or transmitted over the network. The encode() method (line 5319) only encodes the missing map, not rmissing. During decode() (line 5325), rmissing is completely reconstructed from the decoded missing map (lines 5354-5363). This means: Old OSDs writing data will only serialize missing New OSDs reading data will reconstruct rmissing with the new key type No on-disk format changes are required No network protocol changes are required The change is transparent to upgrade/downgrade scenarios Testing: All existing tests pass (unittest_pglog: 53 tests, unittest_osd_types: 70 tests) Safety assertions successfully detected and prevented a test bug where duplicate eversion_t keys were being used Renamed test cases for clarity: merge_log_epoch_change_basic: Tests fundamental invariant merge_log_epoch_change_out_of_order_recovery: Tests recovery ordering Note: AI assistance was used to generate the unit test cases that validate the epoch change behavior and out-of-order recovery scenarios. 
Fixes: https://tracker.ceph.com/issues/74306 Signed-off-by: Alex Ainscow --- diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc index 48d12d6c92e..69456cd5fe2 100644 --- a/src/crimson/osd/pg_recovery.cc +++ b/src/crimson/osd/pg_recovery.cc @@ -123,12 +123,12 @@ size_t PGRecovery::start_primary_recovery_ops( unsigned started = 0; int skipped = 0; - map<version_t, hobject_t>::const_iterator p = - missing.get_rmissing().lower_bound(pg->get_peering_state().get_pg_log().get_log().last_requested); + map<eversion_t, hobject_t>::const_iterator p = + missing.get_rmissing().lower_bound(eversion_t(0, pg->get_peering_state().get_pg_log().get_log().last_requested)); while (started < max_to_start && p != missing.get_rmissing().end()) { // TODO: chain futures here to enable yielding to scheduler? hobject_t soid; - version_t v = p->first; + eversion_t v = p->first; auto it_objects = pg->get_peering_state().get_pg_log().get_log().objects.find(p->second); if (it_objects != pg->get_peering_state().get_pg_log().get_log().objects.end()) { @@ -190,7 +190,7 @@ size_t PGRecovery::start_primary_recovery_ops( } if (!skipped) - pg->get_peering_state().set_last_requested(v); + pg->get_peering_state().set_last_requested(v.version); } DEBUGDPP("started {} skipped {}", pg->get_dpp(), started, skipped); diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 883a32a77e6..b4c66df78b0 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -821,7 +821,7 @@ void PrimaryLogPG::maybe_force_recovery() return; // find the oldest missing object - version_t min_version = recovery_state.get_pg_log().get_log().head.version; + eversion_t min_version = recovery_state.get_pg_log().get_log().head; hobject_t soid; if (!recovery_state.get_pg_log().get_missing().get_rmissing().empty()) { min_version = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->first; @@ -13532,12 +13532,12 @@ uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handl int skipped = 0; 
PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op(); - map<version_t, hobject_t>::const_iterator p = - missing.get_rmissing().lower_bound(recovery_state.get_pg_log().get_log().last_requested); + map<eversion_t, hobject_t>::const_iterator p = + missing.get_rmissing().lower_bound(eversion_t(0, recovery_state.get_pg_log().get_log().last_requested)); while (p != missing.get_rmissing().end()) { handle.reset_tp_timeout(); hobject_t soid; - version_t v = p->first; + eversion_t v = p->first; auto it_objects = recovery_state.get_pg_log().get_log().objects.find(p->second); if (it_objects != recovery_state.get_pg_log().get_log().objects.end()) { @@ -13671,7 +13671,7 @@ uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handl // only advance last_requested if we haven't skipped anything if (!skipped) - recovery_state.set_last_requested(v); + recovery_state.set_last_requested(v.version); } pgbackend->run_recovery_op(h, recovery_state.get_recovery_op_priority()); @@ -13842,7 +13842,7 @@ uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &hand // oldest first! 
const pg_missing_t &m(pm->second); - for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin(); + for (map<eversion_t, hobject_t>::const_iterator p = m.get_rmissing().begin(); p != m.get_rmissing().end() && started < max; ++p) { handle.reset_tp_timeout(); diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 2a08d51bb90..d16b77883d1 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -5013,7 +5013,7 @@ class pg_missing_const_i { public: virtual const std::map<hobject_t, pg_missing_item> & get_items() const = 0; - virtual const std::map<version_t, hobject_t> &get_rmissing() const = 0; + virtual const std::map<eversion_t, hobject_t> &get_rmissing() const = 0; virtual bool get_may_include_deletes() const = 0; virtual unsigned int num_missing() const = 0; virtual bool have_missing() const = 0; @@ -5059,7 +5059,20 @@ template <bool TrackChanges> class pg_missing_set : public pg_missing_const_i { using item = pg_missing_item; std::map<hobject_t, item> missing; // oid -> (need v, have v) - std::map<version_t, hobject_t> rmissing; // v -> oid + /** + * rmissing + * + * Reverse mapping from pg_missing_item::need -> object + * + * This mapping uses eversion_t as a key because although the log + * and missing sets may not contain two entries with the same version, + * this doesn't hold *during* the log merge process because + * the order of divergent entries may not match the order of the + * corresponding entries in the authoritative log. + * + * See https://tracker.ceph.com/issues/74306 + */ + std::map<eversion_t, hobject_t> rmissing; // v -> oid ChangeTracker tracker; public: @@ -5079,7 +5092,7 @@ public: const std::map<hobject_t, item> &get_items() const override { return missing; } - const std::map<version_t, hobject_t> &get_rmissing() const override { + const std::map<eversion_t, hobject_t> &get_rmissing() const override { return rmissing; } bool get_may_include_deletes() const override { @@ -5147,7 +5160,7 @@ public: if (e.prior_version == eversion_t() || e.is_clone()) { // new object. 
if (is_missing_divergent_item) { // use iterator - rmissing.erase(missing_it->second.need.version); + rmissing.erase(missing_it->second.need); // .have = nil missing_it->second = item(e.version, eversion_t(), e.is_delete()); missing_it->second.clean_regions.mark_fully_dirty(); @@ -5162,7 +5175,7 @@ public: } } else if (is_missing_divergent_item) { // already missing (prior). - rmissing.erase((missing_it->second).need.version); + rmissing.erase((missing_it->second).need); missing_it->second.need = e.version; // leave .have unchanged. missing_it->second.set_delete(e.is_delete()); if (e.is_lost_revert()) @@ -5182,7 +5195,7 @@ public: missing[e.soid].clean_regions = e.clean_regions; } if (!skipped) { - rmissing[e.version.version] = e.soid; + rmissing[e.version] = e.soid; tracker.changed(e.soid); } } @@ -5190,7 +5203,7 @@ public: void revise_need(hobject_t oid, eversion_t need, bool is_delete) { auto p = missing.find(oid); if (p != missing.end()) { - rmissing.erase((p->second).need.version); + rmissing.erase((p->second).need); p->second.need = need; // do not adjust .have p->second.set_delete(is_delete); p->second.clean_regions.mark_fully_dirty(); @@ -5198,7 +5211,7 @@ public: missing[oid] = item(need, eversion_t(), is_delete); missing[oid].clean_regions.mark_fully_dirty(); } - rmissing[need.version] = oid; + rmissing[need] = oid; tracker.changed(oid); } @@ -5222,12 +5235,12 @@ public: void add(const hobject_t& oid, eversion_t need, eversion_t have, bool is_delete) { missing[oid] = item(need, have, is_delete, true); - rmissing[need.version] = oid; + rmissing[need] = oid; tracker.changed(oid); } void add(const hobject_t& oid, pg_missing_item&& item) { - rmissing[item.need.version] = oid; + rmissing[item.need] = oid; missing.insert({oid, std::move(item)}); tracker.changed(oid); } @@ -5240,7 +5253,7 @@ public: void rm(std::map<hobject_t, item>::const_iterator m) { tracker.changed(m->first); - rmissing.erase(m->second.need.version); + rmissing.erase(m->second.need); missing.erase(m); } @@ 
-5253,7 +5266,7 @@ public: void got(std::map<hobject_t, item>::const_iterator m) { tracker.changed(m->first); - rmissing.erase(m->second.need.version); + rmissing.erase(m->second.need); missing.erase(m); } @@ -5322,7 +5335,7 @@ public: missing.begin(); it != missing.end(); ++it) - rmissing[it->second.need.version] = it->first; + rmissing[it->second.need] = it->first; for (auto const &i: missing) tracker.changed(i.first); } diff --git a/src/test/osd/TestPGLog.cc b/src/test/osd/TestPGLog.cc index 8e7993550e5..3ffe972bce1 100644 --- a/src/test/osd/TestPGLog.cc +++ b/src/test/osd/TestPGLog.cc @@ -3255,6 +3255,164 @@ TEST_F(PGLogTrimTest, TestCopyAfter2) { EXPECT_EQ(7u, copy.dups.size()) << copy; } -// Local Variables: -// compile-command: "cd ../.. ; make unittest_pglog ; ./unittest_pglog --log-to-stderr=true --debug-osd=20 # --gtest_filter=*.* " -// End: +// Test for merge_log with existing missing list and divergent log +// This reproduces the issue where missing and rmissing lists become inconsistent +// Test that merge_log correctly handles epoch changes with out-of-order recovery. +// When objects are recovered out of order after an epoch change, the missing and +// rmissing maps must remain consistent. 
+TEST_F(PGLogTest, merge_log_epoch_change_out_of_order_recovery) { + clear(); + + // Create objects matching the log scenario + hobject_t obj_3, obj_4, obj_0, obj_5, obj_1, obj_2; + + obj_0.set_hash(0); + obj_1.set_hash(1); + obj_2.set_hash(2); + obj_3.set_hash(3); + obj_4.set_hash(4); + obj_5.set_hash(5); + + // Set up the initial missing list with all 6 objects at epoch 1 versions + missing.add(obj_3, eversion_t(1, 1), eversion_t(), false); + missing.add(obj_1, eversion_t(1, 2), eversion_t(), false); + missing.add(obj_4, eversion_t(1, 3), eversion_t(), false); + missing.add(obj_0, eversion_t(1, 4), eversion_t(), false); + missing.add(obj_5, eversion_t(1, 5), eversion_t(), false); + missing.add(obj_2, eversion_t(1, 6), eversion_t(), false); + + missing.flush(); + + // Build the divergent log (what we have locally) - epoch 1 + // This represents one branch of history + log.tail = eversion_t(1, 0); + // Divergent entries matching the actual log scenario + log.log.push_back(mk_ple_mod(obj_3, eversion_t(1, 1), eversion_t())); + log.log.push_back(mk_ple_mod(obj_1, eversion_t(1, 2), eversion_t())); + log.log.push_back(mk_ple_mod(obj_4, eversion_t(1, 3), eversion_t())); + log.log.push_back(mk_ple_mod(obj_0, eversion_t(1, 4), eversion_t())); + log.log.push_back(mk_ple_mod(obj_5, eversion_t(1, 5), eversion_t())); + log.log.push_back(mk_ple_mod(obj_2, eversion_t(1, 6), eversion_t())); + + log.head = eversion_t(1, 14); + log.index(); + + pg_info_t info; + info.last_update = eversion_t(1, 6); + info.last_complete = eversion_t(); + info.log_tail = eversion_t(1, 0); + info.last_backfill = hobject_t::get_max(); + + // Build the authoritative log (olog) - epoch 2 (NEWER epoch!) + // This represents the authoritative branch that extends the head + IndexedLog olog; + olog.tail = eversion_t(1, 0); + // Authoritative entries with NEWER epoch - this is the key difference! 
+ olog.log.push_back(mk_ple_mod(obj_3, eversion_t(2, 1), eversion_t())); + olog.log.push_back(mk_ple_mod(obj_4, eversion_t(2, 2), eversion_t())); + olog.log.push_back(mk_ple_mod(obj_0, eversion_t(2, 3), eversion_t())); + olog.log.push_back(mk_ple_mod(obj_5, eversion_t(2, 4), eversion_t())); + olog.log.push_back(mk_ple_mod(obj_1, eversion_t(2, 5), eversion_t())); + olog.log.push_back(mk_ple_mod(obj_2, eversion_t(2, 6), eversion_t())); + + olog.head = eversion_t(2, 6); + olog.index(); + + pg_info_t oinfo; + oinfo.last_update = eversion_t(2, 6); + oinfo.last_complete = eversion_t(); + oinfo.log_tail = eversion_t(2, 0); + oinfo.last_backfill = hobject_t::get_max(); + + // Perform merge_log - this should handle the existing missing list correctly + LogHandler h; + bool dirty_info = false; + bool dirty_big_info = false; + + merge_log( + oinfo, std::move(olog), pg_shard_t(1, shard_id_t(0)), info, + pg_pool_t(), pg_shard_t(), &h, dirty_info, dirty_big_info, false); + + // Recover objects in the order shown in the logs + pg_info_t recovery_info; + recovery_info.last_update = eversion_t(2, 2); + recovery_info.last_complete = eversion_t(); + + // Recover out of order. + recover_got(obj_3, eversion_t(2, 1), recovery_info); + // Skip v2 + recover_got(obj_0, eversion_t(2, 3), recovery_info); + recover_got(obj_5, eversion_t(2, 4), recovery_info); + recover_got(obj_1, eversion_t(2, 5), recovery_info); + recover_got(obj_2, eversion_t(2, 6), recovery_info); + // Now to v2. + + recover_got(obj_4, eversion_t(2, 2), recovery_info); + + ASSERT_FALSE(missing.have_missing()); + ASSERT_EQ(0, missing.get_rmissing().size()); +} + +// Test basic invariant: after merge_log with epoch change, missing and rmissing +// sizes must match. This is the fundamental consistency check. 
+TEST_F(PGLogTest, merge_log_epoch_change_basic) { + clear(); + + // Create objects matching the log scenario + hobject_t obj_0, obj_1; + + obj_0.set_hash(0); + obj_1.set_hash(1); + + // Set up the initial missing list with both objects at epoch 1 versions + missing.add(obj_0, eversion_t(1, 1), eversion_t(), false); + missing.add(obj_1, eversion_t(1, 2), eversion_t(), false); + + missing.flush(); + + // Build the divergent log (what we have locally) - epoch 1 + // This represents one branch of history + log.tail = eversion_t(1, 0); + // Divergent entries matching the actual log scenario + log.log.push_back(mk_ple_mod(obj_0, eversion_t(1, 1), eversion_t())); + log.log.push_back(mk_ple_mod(obj_1, eversion_t(1, 2), eversion_t())); + + log.head = eversion_t(1, 2); + log.index(); + + pg_info_t info; + info.last_update = eversion_t(1, 2); + info.last_complete = eversion_t(); + info.log_tail = eversion_t(1, 0); + info.last_backfill = hobject_t::get_max(); + + // Build the authoritative log (olog) - epoch 2 (NEWER epoch!) + // This represents the authoritative branch that extends the head + IndexedLog olog; + olog.tail = eversion_t(1, 0); + // Authoritative entries with NEWER epoch - this is the key difference! + olog.log.push_back(mk_ple_mod(obj_1, eversion_t(2, 1), eversion_t())); + olog.log.push_back(mk_ple_mod(obj_0, eversion_t(2, 2), eversion_t())); + + olog.head = eversion_t(2, 2); + olog.index(); + + pg_info_t oinfo; + oinfo.last_update = eversion_t(2, 2); + oinfo.last_complete = eversion_t(2, 2); + oinfo.log_tail = eversion_t(1, 0); + oinfo.last_backfill = hobject_t::get_max(); + + // Perform merge_log - this should handle the existing missing list correctly + LogHandler h; + bool dirty_info = false; + bool dirty_big_info = false; + + merge_log( + oinfo, std::move(olog), pg_shard_t(1, shard_id_t(0)), info, + pg_pool_t(), pg_shard_t(), &h, dirty_info, dirty_big_info, false); + + // Reverse missing should be same length as missing! 
+ ASSERT_EQ(2, missing.num_missing()); + ASSERT_EQ(2, missing.get_rmissing().size()); +} diff --git a/src/test/osd/types.cc b/src/test/osd/types.cc index 3c9f088fb06..62a3020bd1c 100644 --- a/src/test/osd/types.cc +++ b/src/test/osd/types.cc @@ -1153,7 +1153,7 @@ TEST(pg_missing_t, add_next_event) missing.add_next_event(e, pg_pool_t(), shard_id_t()); EXPECT_TRUE(missing.is_missing(oid)); EXPECT_EQ(eversion_t(), missing.get_items().at(oid).have); - EXPECT_EQ(oid, missing.get_rmissing().at(e.version.version)); + EXPECT_EQ(oid, missing.get_rmissing().at(e.version)); EXPECT_EQ(1U, missing.num_missing()); EXPECT_EQ(1U, missing.get_rmissing().size()); @@ -1178,7 +1178,7 @@ TEST(pg_missing_t, add_next_event) missing.add_next_event(e, pg_pool_t(), shard_id_t()); EXPECT_TRUE(missing.is_missing(oid)); EXPECT_EQ(eversion_t(), missing.get_items().at(oid).have); - EXPECT_EQ(oid, missing.get_rmissing().at(e.version.version)); + EXPECT_EQ(oid, missing.get_rmissing().at(e.version)); EXPECT_EQ(1U, missing.num_missing()); EXPECT_EQ(1U, missing.get_rmissing().size()); @@ -1203,7 +1203,7 @@ TEST(pg_missing_t, add_next_event) missing.add_next_event(e, pg_pool_t(), shard_id_t()); EXPECT_TRUE(missing.is_missing(oid)); EXPECT_EQ(eversion_t(), missing.get_items().at(oid).have); - EXPECT_EQ(oid, missing.get_rmissing().at(e.version.version)); + EXPECT_EQ(oid, missing.get_rmissing().at(e.version)); EXPECT_EQ(1U, missing.num_missing()); EXPECT_EQ(1U, missing.get_rmissing().size()); @@ -1230,7 +1230,7 @@ TEST(pg_missing_t, add_next_event) EXPECT_TRUE(missing.is_missing(oid)); EXPECT_EQ(prior_version, missing.get_items().at(oid).have); EXPECT_EQ(version, missing.get_items().at(oid).need); - EXPECT_EQ(oid, missing.get_rmissing().at(e.version.version)); + EXPECT_EQ(oid, missing.get_rmissing().at(e.version)); EXPECT_EQ(1U, missing.num_missing()); EXPECT_EQ(1U, missing.get_rmissing().size()); } @@ -1255,7 +1255,7 @@ TEST(pg_missing_t, add_next_event) 
EXPECT_TRUE(missing.get_items().at(oid).is_delete()); EXPECT_EQ(prior_version, missing.get_items().at(oid).have); EXPECT_EQ(version, missing.get_items().at(oid).need); - EXPECT_EQ(oid, missing.get_rmissing().at(e.version.version)); + EXPECT_EQ(oid, missing.get_rmissing().at(e.version)); EXPECT_EQ(1U, missing.num_missing()); EXPECT_EQ(1U, missing.get_rmissing().size()); } @@ -1282,7 +1282,7 @@ TEST(pg_missing_t, add_next_event) EXPECT_TRUE(missing.get_items().at(oid).is_delete()); EXPECT_EQ(prior_version, missing.get_items().at(oid).have); EXPECT_EQ(e.version, missing.get_items().at(oid).need); - EXPECT_EQ(oid, missing.get_rmissing().at(e.version.version)); + EXPECT_EQ(oid, missing.get_rmissing().at(e.version)); EXPECT_EQ(1U, missing.num_missing()); EXPECT_EQ(1U, missing.get_rmissing().size()); } @@ -1418,8 +1418,8 @@ TEST(pg_missing_t, split_into) uint32_t hash2 = 2; hobject_t oid2(object_t("objname"), "key2", 123, hash2, 0, ""); pg_missing_t missing; - missing.add(oid1, eversion_t(), eversion_t(), false); - missing.add(oid2, eversion_t(), eversion_t(), false); + missing.add(oid1, eversion_t(1, 1), eversion_t(), false); + missing.add(oid2, eversion_t(1, 2), eversion_t(), false); pg_t child_pgid; child_pgid.m_seed = 1; pg_missing_t child;