unsigned started = 0;
int skipped = 0;
- map<version_t, hobject_t>::const_iterator p =
- missing.get_rmissing().lower_bound(pg->get_peering_state().get_pg_log().get_log().last_requested);
+ map<eversion_t, hobject_t>::const_iterator p =
+ missing.get_rmissing().lower_bound(eversion_t(0, pg->get_peering_state().get_pg_log().get_log().last_requested));
while (started < max_to_start && p != missing.get_rmissing().end()) {
// TODO: chain futures here to enable yielding to scheduler?
hobject_t soid;
- version_t v = p->first;
+ eversion_t v = p->first;
auto it_objects = pg->get_peering_state().get_pg_log().get_log().objects.find(p->second);
if (it_objects != pg->get_peering_state().get_pg_log().get_log().objects.end()) {
}
if (!skipped)
- pg->get_peering_state().set_last_requested(v);
+ pg->get_peering_state().set_last_requested(v.version);
}
DEBUGDPP("started {} skipped {}", pg->get_dpp(), started, skipped);
return;
// find the oldest missing object
- version_t min_version = recovery_state.get_pg_log().get_log().head.version;
+ eversion_t min_version = recovery_state.get_pg_log().get_log().head;
hobject_t soid;
if (!recovery_state.get_pg_log().get_missing().get_rmissing().empty()) {
min_version = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->first;
int skipped = 0;
PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
- map<version_t, hobject_t>::const_iterator p =
- missing.get_rmissing().lower_bound(recovery_state.get_pg_log().get_log().last_requested);
+ map<eversion_t, hobject_t>::const_iterator p =
+ missing.get_rmissing().lower_bound(eversion_t(0, recovery_state.get_pg_log().get_log().last_requested));
while (p != missing.get_rmissing().end()) {
handle.reset_tp_timeout();
hobject_t soid;
- version_t v = p->first;
+ eversion_t v = p->first;
auto it_objects = recovery_state.get_pg_log().get_log().objects.find(p->second);
if (it_objects != recovery_state.get_pg_log().get_log().objects.end()) {
// only advance last_requested if we haven't skipped anything
if (!skipped)
- recovery_state.set_last_requested(v);
+ recovery_state.set_last_requested(v.version);
}
pgbackend->run_recovery_op(h, recovery_state.get_recovery_op_priority());
// oldest first!
const pg_missing_t &m(pm->second);
- for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
+ for (map<eversion_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
p != m.get_rmissing().end() && started < max;
++p) {
handle.reset_tp_timeout();
public:
virtual const std::map<hobject_t, pg_missing_item> &
get_items() const = 0;
- virtual const std::map<version_t, hobject_t> &get_rmissing() const = 0;
+ virtual const std::map<eversion_t, hobject_t> &get_rmissing() const = 0;
virtual bool get_may_include_deletes() const = 0;
virtual unsigned int num_missing() const = 0;
virtual bool have_missing() const = 0;
class pg_missing_set : public pg_missing_const_i {
using item = pg_missing_item;
std::map<hobject_t, item> missing; // oid -> (need v, have v)
- std::map<version_t, hobject_t> rmissing; // v -> oid
+ /**
+ * rmissing
+ *
+ * Reverse mapping from pg_missing_item::need -> object
+ *
+ * This mapping uses eversion_t as a key because although the log
+ * and missing sets may not contain two entries with the same version,
+ * this doesn't hold *during* the log merge process because
+ * the order of divergent entries may not match the order of the
+ * corresponding entries in the authoritiative log.
+ *
+ * See https://tracker.ceph.com/issues/74306
+ */
+ std::map<eversion_t, hobject_t> rmissing; // v -> oid
ChangeTracker<TrackChanges> tracker;
public:
const std::map<hobject_t, item> &get_items() const override {
return missing;
}
- const std::map<version_t, hobject_t> &get_rmissing() const override {
+ const std::map<eversion_t, hobject_t> &get_rmissing() const override {
return rmissing;
}
bool get_may_include_deletes() const override {
if (e.prior_version == eversion_t() || e.is_clone()) {
// new object.
if (is_missing_divergent_item) { // use iterator
- rmissing.erase(missing_it->second.need.version);
+ rmissing.erase(missing_it->second.need);
// .have = nil
missing_it->second = item(e.version, eversion_t(), e.is_delete());
missing_it->second.clean_regions.mark_fully_dirty();
}
} else if (is_missing_divergent_item) {
// already missing (prior).
- rmissing.erase((missing_it->second).need.version);
+ rmissing.erase((missing_it->second).need);
missing_it->second.need = e.version; // leave .have unchanged.
missing_it->second.set_delete(e.is_delete());
if (e.is_lost_revert())
missing[e.soid].clean_regions = e.clean_regions;
}
if (!skipped) {
- rmissing[e.version.version] = e.soid;
+ rmissing[e.version] = e.soid;
tracker.changed(e.soid);
}
}
void revise_need(hobject_t oid, eversion_t need, bool is_delete) {
auto p = missing.find(oid);
if (p != missing.end()) {
- rmissing.erase((p->second).need.version);
+ rmissing.erase((p->second).need);
p->second.need = need; // do not adjust .have
p->second.set_delete(is_delete);
p->second.clean_regions.mark_fully_dirty();
missing[oid] = item(need, eversion_t(), is_delete);
missing[oid].clean_regions.mark_fully_dirty();
}
- rmissing[need.version] = oid;
+ rmissing[need] = oid;
tracker.changed(oid);
}
void add(const hobject_t& oid, eversion_t need, eversion_t have,
bool is_delete) {
missing[oid] = item(need, have, is_delete, true);
- rmissing[need.version] = oid;
+ rmissing[need] = oid;
tracker.changed(oid);
}
void add(const hobject_t& oid, pg_missing_item&& item) {
- rmissing[item.need.version] = oid;
+ rmissing[item.need] = oid;
missing.insert({oid, std::move(item)});
tracker.changed(oid);
}
void rm(std::map<hobject_t, item>::const_iterator m) {
tracker.changed(m->first);
- rmissing.erase(m->second.need.version);
+ rmissing.erase(m->second.need);
missing.erase(m);
}
void got(std::map<hobject_t, item>::const_iterator m) {
tracker.changed(m->first);
- rmissing.erase(m->second.need.version);
+ rmissing.erase(m->second.need);
missing.erase(m);
}
missing.begin();
it != missing.end();
++it)
- rmissing[it->second.need.version] = it->first;
+ rmissing[it->second.need] = it->first;
for (auto const &i: missing)
tracker.changed(i.first);
}
EXPECT_EQ(7u, copy.dups.size()) << copy;
}
-// Local Variables:
-// compile-command: "cd ../.. ; make unittest_pglog ; ./unittest_pglog --log-to-stderr=true --debug-osd=20 # --gtest_filter=*.* "
-// End:
+// Test for merge_log with existing missing list and divergent log
+// This reproduces the issue where missing and rmissing lists become inconsistent
+// Test that merge_log correctly handles epoch changes with out-of-order recovery.
+// When objects are recovered out of order after an epoch change, the missing and
+// rmissing maps must remain consistent.
+TEST_F(PGLogTest, merge_log_epoch_change_out_of_order_recovery) {
+ clear();
+
+ // Create objects matching the log scenario
+ hobject_t obj_3, obj_4, obj_0, obj_5, obj_1, obj_2;
+
+ obj_0.set_hash(0);
+ obj_1.set_hash(1);
+ obj_2.set_hash(2);
+ obj_3.set_hash(3);
+ obj_4.set_hash(4);
+ obj_5.set_hash(5);
+
+ // Set up the initial missing list with ALL 14 objects at epoch 1 versions
+ missing.add(obj_3, eversion_t(1, 1), eversion_t(), false);
+ missing.add(obj_1, eversion_t(1, 2), eversion_t(), false);
+ missing.add(obj_4, eversion_t(1, 3), eversion_t(), false);
+ missing.add(obj_0, eversion_t(1, 4), eversion_t(), false);
+ missing.add(obj_5, eversion_t(1, 5), eversion_t(), false);
+ missing.add(obj_2, eversion_t(1, 6), eversion_t(), false);
+
+ missing.flush();
+
+ // Build the divergent log (what we have locally) - epoch 1
+ // This represents one branch of history
+ log.tail = eversion_t(1, 0);
+ // Divergent entries matching the actual log scenario
+ log.log.push_back(mk_ple_mod(obj_3, eversion_t(1, 1), eversion_t()));
+ log.log.push_back(mk_ple_mod(obj_1, eversion_t(1, 2), eversion_t()));
+ log.log.push_back(mk_ple_mod(obj_4, eversion_t(1, 3), eversion_t()));
+ log.log.push_back(mk_ple_mod(obj_0, eversion_t(1, 4), eversion_t()));
+ log.log.push_back(mk_ple_mod(obj_5, eversion_t(1, 5), eversion_t()));
+ log.log.push_back(mk_ple_mod(obj_2, eversion_t(1, 6), eversion_t()));
+
+ log.head = eversion_t(1, 14);
+ log.index();
+
+ pg_info_t info;
+ info.last_update = eversion_t(1, 6);
+ info.last_complete = eversion_t();
+ info.log_tail = eversion_t(1, 0);
+ info.last_backfill = hobject_t::get_max();
+
+ // Build the authoritative log (olog) - epoch 2 (NEWER epoch!)
+ // This represents the authoritative branch that extends the head
+ IndexedLog olog;
+ olog.tail = eversion_t(1, 0);
+ // Authoritative entries with NEWER epoch - this is the key difference!
+ olog.log.push_back(mk_ple_mod(obj_3, eversion_t(2, 1), eversion_t()));
+ olog.log.push_back(mk_ple_mod(obj_4, eversion_t(2, 2), eversion_t()));
+ olog.log.push_back(mk_ple_mod(obj_0, eversion_t(2, 3), eversion_t()));
+ olog.log.push_back(mk_ple_mod(obj_5, eversion_t(2, 4), eversion_t()));
+ olog.log.push_back(mk_ple_mod(obj_1, eversion_t(2, 5), eversion_t()));
+ olog.log.push_back(mk_ple_mod(obj_2, eversion_t(2, 6), eversion_t()));
+
+ olog.head = eversion_t(2, 6);
+ olog.index();
+
+ pg_info_t oinfo;
+ oinfo.last_update = eversion_t(2, 6);
+ oinfo.last_complete = eversion_t();
+ oinfo.log_tail = eversion_t(2, 0);
+ oinfo.last_backfill = hobject_t::get_max();
+
+ // Perform merge_log - this should handle the existing missing list correctly
+ LogHandler h;
+ bool dirty_info = false;
+ bool dirty_big_info = false;
+
+ merge_log(
+ oinfo, std::move(olog), pg_shard_t(1, shard_id_t(0)), info,
+ pg_pool_t(), pg_shard_t(), &h, dirty_info, dirty_big_info, false);
+
+ // Recover objects in the order shown in the logs
+ pg_info_t recovery_info;
+ recovery_info.last_update = eversion_t(2, 2);
+ recovery_info.last_complete = eversion_t();
+
+ // Recover out of order.
+ recover_got(obj_3, eversion_t(2, 1), recovery_info);
+ // Skip v2
+ recover_got(obj_0, eversion_t(2, 3), recovery_info);
+ recover_got(obj_5, eversion_t(2, 4), recovery_info);
+ recover_got(obj_1, eversion_t(2, 5), recovery_info);
+ recover_got(obj_2, eversion_t(2, 6), recovery_info);
+ // Now to v2.
+
+ recover_got(obj_4, eversion_t(2, 2), recovery_info);
+
+ ASSERT_FALSE(missing.have_missing());
+ ASSERT_EQ(0, missing.get_rmissing().size());
+}
+
+// Test basic invariant: after merge_log with epoch change, missing and rmissing
+// sizes must match. This is the fundamental consistency check.
+TEST_F(PGLogTest, merge_log_epoch_change_basic) {
+ clear();
+
+ // Create objects matching the log scenario
+ hobject_t obj_0, obj_1;
+
+ obj_0.set_hash(0);
+ obj_1.set_hash(1);
+
+ // Set up the initial missing list with ALL 14 objects at epoch 1 versions
+ missing.add(obj_0, eversion_t(1, 1), eversion_t(), false);
+ missing.add(obj_1, eversion_t(1, 2), eversion_t(), false);
+
+ missing.flush();
+
+ // Build the divergent log (what we have locally) - epoch 1
+ // This represents one branch of history
+ log.tail = eversion_t(1, 0);
+ // Divergent entries matching the actual log scenario
+ log.log.push_back(mk_ple_mod(obj_0, eversion_t(1, 1), eversion_t()));
+ log.log.push_back(mk_ple_mod(obj_1, eversion_t(1, 2), eversion_t()));
+
+ log.head = eversion_t(1, 2);
+ log.index();
+
+ pg_info_t info;
+ info.last_update = eversion_t(1, 2);
+ info.last_complete = eversion_t();
+ info.log_tail = eversion_t(1, 0);
+ info.last_backfill = hobject_t::get_max();
+
+ // Build the authoritative log (olog) - epoch 2 (NEWER epoch!)
+ // This represents the authoritative branch that extends the head
+ IndexedLog olog;
+ olog.tail = eversion_t(1, 0);
+ // Authoritative entries with NEWER epoch - this is the key difference!
+ olog.log.push_back(mk_ple_mod(obj_1, eversion_t(2, 1), eversion_t()));
+ olog.log.push_back(mk_ple_mod(obj_0, eversion_t(2, 2), eversion_t()));
+
+ olog.head = eversion_t(2, 2);
+ olog.index();
+
+ pg_info_t oinfo;
+ oinfo.last_update = eversion_t(2, 2);
+ oinfo.last_complete = eversion_t(2, 2);
+ oinfo.log_tail = eversion_t(1, 0);
+ oinfo.last_backfill = hobject_t::get_max();
+
+ // Perform merge_log - this should handle the existing missing list correctly
+ LogHandler h;
+ bool dirty_info = false;
+ bool dirty_big_info = false;
+
+ merge_log(
+ oinfo, std::move(olog), pg_shard_t(1, shard_id_t(0)), info,
+ pg_pool_t(), pg_shard_t(), &h, dirty_info, dirty_big_info, false);
+
+ // Revers missing should be same length as missing!
+ ASSERT_EQ(2, missing.num_missing());
+ ASSERT_EQ(2, missing.get_rmissing().size());
+}
missing.add_next_event(e, pg_pool_t(), shard_id_t());
EXPECT_TRUE(missing.is_missing(oid));
EXPECT_EQ(eversion_t(), missing.get_items().at(oid).have);
- EXPECT_EQ(oid, missing.get_rmissing().at(e.version.version));
+ EXPECT_EQ(oid, missing.get_rmissing().at(e.version));
EXPECT_EQ(1U, missing.num_missing());
EXPECT_EQ(1U, missing.get_rmissing().size());
missing.add_next_event(e, pg_pool_t(), shard_id_t());
EXPECT_TRUE(missing.is_missing(oid));
EXPECT_EQ(eversion_t(), missing.get_items().at(oid).have);
- EXPECT_EQ(oid, missing.get_rmissing().at(e.version.version));
+ EXPECT_EQ(oid, missing.get_rmissing().at(e.version));
EXPECT_EQ(1U, missing.num_missing());
EXPECT_EQ(1U, missing.get_rmissing().size());
missing.add_next_event(e, pg_pool_t(), shard_id_t());
EXPECT_TRUE(missing.is_missing(oid));
EXPECT_EQ(eversion_t(), missing.get_items().at(oid).have);
- EXPECT_EQ(oid, missing.get_rmissing().at(e.version.version));
+ EXPECT_EQ(oid, missing.get_rmissing().at(e.version));
EXPECT_EQ(1U, missing.num_missing());
EXPECT_EQ(1U, missing.get_rmissing().size());
EXPECT_TRUE(missing.is_missing(oid));
EXPECT_EQ(prior_version, missing.get_items().at(oid).have);
EXPECT_EQ(version, missing.get_items().at(oid).need);
- EXPECT_EQ(oid, missing.get_rmissing().at(e.version.version));
+ EXPECT_EQ(oid, missing.get_rmissing().at(e.version));
EXPECT_EQ(1U, missing.num_missing());
EXPECT_EQ(1U, missing.get_rmissing().size());
}
EXPECT_TRUE(missing.get_items().at(oid).is_delete());
EXPECT_EQ(prior_version, missing.get_items().at(oid).have);
EXPECT_EQ(version, missing.get_items().at(oid).need);
- EXPECT_EQ(oid, missing.get_rmissing().at(e.version.version));
+ EXPECT_EQ(oid, missing.get_rmissing().at(e.version));
EXPECT_EQ(1U, missing.num_missing());
EXPECT_EQ(1U, missing.get_rmissing().size());
}
EXPECT_TRUE(missing.get_items().at(oid).is_delete());
EXPECT_EQ(prior_version, missing.get_items().at(oid).have);
EXPECT_EQ(e.version, missing.get_items().at(oid).need);
- EXPECT_EQ(oid, missing.get_rmissing().at(e.version.version));
+ EXPECT_EQ(oid, missing.get_rmissing().at(e.version));
EXPECT_EQ(1U, missing.num_missing());
EXPECT_EQ(1U, missing.get_rmissing().size());
}
uint32_t hash2 = 2;
hobject_t oid2(object_t("objname"), "key2", 123, hash2, 0, "");
pg_missing_t missing;
- missing.add(oid1, eversion_t(), eversion_t(), false);
- missing.add(oid2, eversion_t(), eversion_t(), false);
+ missing.add(oid1, eversion_t(1, 1), eversion_t(), false);
+ missing.add(oid2, eversion_t(1, 2), eversion_t(), false);
pg_t child_pgid;
child_pgid.m_seed = 1;
pg_missing_t child;